@fugood/llama.node 0.0.1-alpha.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (204)
  1. package/CMakeLists.txt +85 -0
  2. package/README.md +56 -0
  3. package/bin/darwin/arm64/llama-node.node +0 -0
  4. package/bin/darwin/x64/llama-node.node +0 -0
  5. package/bin/linux/arm64/llama-node.node +0 -0
  6. package/bin/linux/x64/llama-node.node +0 -0
  7. package/bin/win32/arm64/llama-node.node +0 -0
  8. package/bin/win32/arm64/node.lib +0 -0
  9. package/bin/win32/x64/llama-node.node +0 -0
  10. package/bin/win32/x64/node.lib +0 -0
  11. package/lib/binding.js +13 -0
  12. package/lib/binding.ts +57 -0
  13. package/lib/index.js +24 -0
  14. package/lib/index.ts +13 -0
  15. package/package.json +65 -0
  16. package/src/addons.cpp +506 -0
  17. package/src/llama.cpp/CMakeLists.txt +1320 -0
  18. package/src/llama.cpp/build.zig +172 -0
  19. package/src/llama.cpp/cmake/FindSIMD.cmake +100 -0
  20. package/src/llama.cpp/common/CMakeLists.txt +87 -0
  21. package/src/llama.cpp/common/base64.hpp +392 -0
  22. package/src/llama.cpp/common/common.cpp +2949 -0
  23. package/src/llama.cpp/common/common.h +324 -0
  24. package/src/llama.cpp/common/console.cpp +501 -0
  25. package/src/llama.cpp/common/console.h +19 -0
  26. package/src/llama.cpp/common/grammar-parser.cpp +440 -0
  27. package/src/llama.cpp/common/grammar-parser.h +29 -0
  28. package/src/llama.cpp/common/json-schema-to-grammar.cpp +764 -0
  29. package/src/llama.cpp/common/json-schema-to-grammar.h +4 -0
  30. package/src/llama.cpp/common/json.hpp +24766 -0
  31. package/src/llama.cpp/common/log.h +724 -0
  32. package/src/llama.cpp/common/ngram-cache.cpp +282 -0
  33. package/src/llama.cpp/common/ngram-cache.h +94 -0
  34. package/src/llama.cpp/common/sampling.cpp +353 -0
  35. package/src/llama.cpp/common/sampling.h +147 -0
  36. package/src/llama.cpp/common/stb_image.h +8396 -0
  37. package/src/llama.cpp/common/train.cpp +1513 -0
  38. package/src/llama.cpp/common/train.h +233 -0
  39. package/src/llama.cpp/examples/CMakeLists.txt +52 -0
  40. package/src/llama.cpp/examples/baby-llama/CMakeLists.txt +5 -0
  41. package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +1640 -0
  42. package/src/llama.cpp/examples/batched/CMakeLists.txt +5 -0
  43. package/src/llama.cpp/examples/batched/batched.cpp +262 -0
  44. package/src/llama.cpp/examples/batched-bench/CMakeLists.txt +5 -0
  45. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +261 -0
  46. package/src/llama.cpp/examples/beam-search/CMakeLists.txt +5 -0
  47. package/src/llama.cpp/examples/beam-search/beam-search.cpp +188 -0
  48. package/src/llama.cpp/examples/benchmark/CMakeLists.txt +6 -0
  49. package/src/llama.cpp/examples/benchmark/benchmark-matmult.cpp +275 -0
  50. package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +5 -0
  51. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +936 -0
  52. package/src/llama.cpp/examples/embedding/CMakeLists.txt +5 -0
  53. package/src/llama.cpp/examples/embedding/embedding.cpp +211 -0
  54. package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +9 -0
  55. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +195 -0
  56. package/src/llama.cpp/examples/export-lora/CMakeLists.txt +5 -0
  57. package/src/llama.cpp/examples/export-lora/export-lora.cpp +462 -0
  58. package/src/llama.cpp/examples/finetune/CMakeLists.txt +5 -0
  59. package/src/llama.cpp/examples/finetune/finetune.cpp +1861 -0
  60. package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +5 -0
  61. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +132 -0
  62. package/src/llama.cpp/examples/gguf/CMakeLists.txt +5 -0
  63. package/src/llama.cpp/examples/gguf/gguf.cpp +256 -0
  64. package/src/llama.cpp/examples/gguf-split/CMakeLists.txt +5 -0
  65. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +553 -0
  66. package/src/llama.cpp/examples/gritlm/CMakeLists.txt +5 -0
  67. package/src/llama.cpp/examples/gritlm/gritlm.cpp +215 -0
  68. package/src/llama.cpp/examples/imatrix/CMakeLists.txt +5 -0
  69. package/src/llama.cpp/examples/imatrix/imatrix.cpp +655 -0
  70. package/src/llama.cpp/examples/infill/CMakeLists.txt +5 -0
  71. package/src/llama.cpp/examples/infill/infill.cpp +767 -0
  72. package/src/llama.cpp/examples/jeopardy/questions.txt +100 -0
  73. package/src/llama.cpp/examples/llama-bench/CMakeLists.txt +5 -0
  74. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +1286 -0
  75. package/src/llama.cpp/examples/llama.android/app/src/main/cpp/CMakeLists.txt +50 -0
  76. package/src/llama.cpp/examples/llama.android/app/src/main/cpp/llama-android.cpp +443 -0
  77. package/src/llama.cpp/examples/llava/CMakeLists.txt +37 -0
  78. package/src/llama.cpp/examples/llava/clip.cpp +2027 -0
  79. package/src/llama.cpp/examples/llava/clip.h +85 -0
  80. package/src/llama.cpp/examples/llava/llava-cli.cpp +309 -0
  81. package/src/llama.cpp/examples/llava/llava.cpp +426 -0
  82. package/src/llama.cpp/examples/llava/llava.h +50 -0
  83. package/src/llama.cpp/examples/llava/requirements.txt +3 -0
  84. package/src/llama.cpp/examples/lookahead/CMakeLists.txt +5 -0
  85. package/src/llama.cpp/examples/lookahead/lookahead.cpp +485 -0
  86. package/src/llama.cpp/examples/lookup/CMakeLists.txt +23 -0
  87. package/src/llama.cpp/examples/lookup/lookup-create.cpp +41 -0
  88. package/src/llama.cpp/examples/lookup/lookup-merge.cpp +47 -0
  89. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +160 -0
  90. package/src/llama.cpp/examples/lookup/lookup.cpp +258 -0
  91. package/src/llama.cpp/examples/main/CMakeLists.txt +5 -0
  92. package/src/llama.cpp/examples/main/main.cpp +957 -0
  93. package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +33 -0
  94. package/src/llama.cpp/examples/parallel/CMakeLists.txt +5 -0
  95. package/src/llama.cpp/examples/parallel/parallel.cpp +427 -0
  96. package/src/llama.cpp/examples/passkey/CMakeLists.txt +5 -0
  97. package/src/llama.cpp/examples/passkey/passkey.cpp +302 -0
  98. package/src/llama.cpp/examples/perplexity/CMakeLists.txt +5 -0
  99. package/src/llama.cpp/examples/perplexity/perplexity.cpp +1943 -0
  100. package/src/llama.cpp/examples/quantize/CMakeLists.txt +6 -0
  101. package/src/llama.cpp/examples/quantize/quantize.cpp +423 -0
  102. package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +6 -0
  103. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +424 -0
  104. package/src/llama.cpp/examples/retrieval/CMakeLists.txt +5 -0
  105. package/src/llama.cpp/examples/retrieval/retrieval.cpp +350 -0
  106. package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +5 -0
  107. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +246 -0
  108. package/src/llama.cpp/examples/server/CMakeLists.txt +40 -0
  109. package/src/llama.cpp/examples/server/bench/requirements.txt +2 -0
  110. package/src/llama.cpp/examples/server/httplib.h +9465 -0
  111. package/src/llama.cpp/examples/server/server.cpp +3826 -0
  112. package/src/llama.cpp/examples/server/tests/requirements.txt +6 -0
  113. package/src/llama.cpp/examples/server/utils.hpp +653 -0
  114. package/src/llama.cpp/examples/simple/CMakeLists.txt +5 -0
  115. package/src/llama.cpp/examples/simple/simple.cpp +183 -0
  116. package/src/llama.cpp/examples/speculative/CMakeLists.txt +5 -0
  117. package/src/llama.cpp/examples/speculative/speculative.cpp +614 -0
  118. package/src/llama.cpp/examples/sycl/CMakeLists.txt +9 -0
  119. package/src/llama.cpp/examples/sycl/ls-sycl-device.cpp +13 -0
  120. package/src/llama.cpp/examples/tokenize/CMakeLists.txt +5 -0
  121. package/src/llama.cpp/examples/tokenize/tokenize.cpp +42 -0
  122. package/src/llama.cpp/examples/train-text-from-scratch/CMakeLists.txt +5 -0
  123. package/src/llama.cpp/examples/train-text-from-scratch/train-text-from-scratch.cpp +1252 -0
  124. package/src/llama.cpp/ggml-alloc.c +985 -0
  125. package/src/llama.cpp/ggml-alloc.h +76 -0
  126. package/src/llama.cpp/ggml-backend-impl.h +141 -0
  127. package/src/llama.cpp/ggml-backend.c +2099 -0
  128. package/src/llama.cpp/ggml-backend.h +233 -0
  129. package/src/llama.cpp/ggml-common.h +1853 -0
  130. package/src/llama.cpp/ggml-cuda.h +43 -0
  131. package/src/llama.cpp/ggml-impl.h +265 -0
  132. package/src/llama.cpp/ggml-kompute.cpp +2006 -0
  133. package/src/llama.cpp/ggml-kompute.h +46 -0
  134. package/src/llama.cpp/ggml-metal.h +66 -0
  135. package/src/llama.cpp/ggml-mpi.c +216 -0
  136. package/src/llama.cpp/ggml-mpi.h +39 -0
  137. package/src/llama.cpp/ggml-opencl.cpp +2301 -0
  138. package/src/llama.cpp/ggml-opencl.h +36 -0
  139. package/src/llama.cpp/ggml-quants.c +12678 -0
  140. package/src/llama.cpp/ggml-quants.h +133 -0
  141. package/src/llama.cpp/ggml-sycl.cpp +17882 -0
  142. package/src/llama.cpp/ggml-sycl.h +49 -0
  143. package/src/llama.cpp/ggml-vulkan-shaders.hpp +69849 -0
  144. package/src/llama.cpp/ggml-vulkan.cpp +6442 -0
  145. package/src/llama.cpp/ggml-vulkan.h +29 -0
  146. package/src/llama.cpp/ggml.c +21819 -0
  147. package/src/llama.cpp/ggml.h +2403 -0
  148. package/src/llama.cpp/llama.cpp +17468 -0
  149. package/src/llama.cpp/llama.h +1117 -0
  150. package/src/llama.cpp/pocs/CMakeLists.txt +12 -0
  151. package/src/llama.cpp/pocs/vdot/CMakeLists.txt +9 -0
  152. package/src/llama.cpp/pocs/vdot/q8dot.cpp +172 -0
  153. package/src/llama.cpp/pocs/vdot/vdot.cpp +310 -0
  154. package/src/llama.cpp/prompts/LLM-questions.txt +49 -0
  155. package/src/llama.cpp/prompts/alpaca.txt +1 -0
  156. package/src/llama.cpp/prompts/assistant.txt +31 -0
  157. package/src/llama.cpp/prompts/chat-with-baichuan.txt +4 -0
  158. package/src/llama.cpp/prompts/chat-with-bob.txt +7 -0
  159. package/src/llama.cpp/prompts/chat-with-qwen.txt +1 -0
  160. package/src/llama.cpp/prompts/chat-with-vicuna-v0.txt +7 -0
  161. package/src/llama.cpp/prompts/chat-with-vicuna-v1.txt +7 -0
  162. package/src/llama.cpp/prompts/chat.txt +28 -0
  163. package/src/llama.cpp/prompts/dan-modified.txt +1 -0
  164. package/src/llama.cpp/prompts/dan.txt +1 -0
  165. package/src/llama.cpp/prompts/mnemonics.txt +93 -0
  166. package/src/llama.cpp/prompts/parallel-questions.txt +43 -0
  167. package/src/llama.cpp/prompts/reason-act.txt +18 -0
  168. package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf.txt +3 -0
  169. package/src/llama.cpp/requirements/requirements-convert-llama-ggml-to-gguf.txt +1 -0
  170. package/src/llama.cpp/requirements/requirements-convert-lora-to-ggml.txt +2 -0
  171. package/src/llama.cpp/requirements/requirements-convert-persimmon-to-gguf.txt +2 -0
  172. package/src/llama.cpp/requirements/requirements-convert.txt +5 -0
  173. package/src/llama.cpp/requirements.txt +12 -0
  174. package/src/llama.cpp/scripts/gen-build-info-cpp.cmake +24 -0
  175. package/src/llama.cpp/scripts/xxd.cmake +16 -0
  176. package/src/llama.cpp/sgemm.cpp +999 -0
  177. package/src/llama.cpp/sgemm.h +12 -0
  178. package/src/llama.cpp/tests/CMakeLists.txt +78 -0
  179. package/src/llama.cpp/tests/get-model.cpp +21 -0
  180. package/src/llama.cpp/tests/get-model.h +2 -0
  181. package/src/llama.cpp/tests/test-autorelease.cpp +24 -0
  182. package/src/llama.cpp/tests/test-backend-ops.cpp +2266 -0
  183. package/src/llama.cpp/tests/test-c.c +7 -0
  184. package/src/llama.cpp/tests/test-chat-template.cpp +107 -0
  185. package/src/llama.cpp/tests/test-double-float.cpp +57 -0
  186. package/src/llama.cpp/tests/test-grad0.cpp +1606 -0
  187. package/src/llama.cpp/tests/test-grammar-integration.cpp +243 -0
  188. package/src/llama.cpp/tests/test-grammar-parser.cpp +250 -0
  189. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +899 -0
  190. package/src/llama.cpp/tests/test-llama-grammar.cpp +402 -0
  191. package/src/llama.cpp/tests/test-model-load-cancel.cpp +27 -0
  192. package/src/llama.cpp/tests/test-opt.cpp +181 -0
  193. package/src/llama.cpp/tests/test-quantize-fns.cpp +185 -0
  194. package/src/llama.cpp/tests/test-quantize-perf.cpp +363 -0
  195. package/src/llama.cpp/tests/test-rope.cpp +221 -0
  196. package/src/llama.cpp/tests/test-sampling.cpp +301 -0
  197. package/src/llama.cpp/tests/test-tokenizer-0-falcon.cpp +187 -0
  198. package/src/llama.cpp/tests/test-tokenizer-0-llama.cpp +190 -0
  199. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +123 -0
  200. package/src/llama.cpp/tests/test-tokenizer-1-llama.cpp +111 -0
  201. package/src/llama.cpp/unicode-data.cpp +1651 -0
  202. package/src/llama.cpp/unicode-data.h +16 -0
  203. package/src/llama.cpp/unicode.cpp +277 -0
  204. package/src/llama.cpp/unicode.h +28 -0
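
The listing above shows prebuilt native addons shipped per platform and architecture under `package/bin/` (entries 3–10), alongside a small `lib/binding.js` loader (entry 11). As a minimal sketch only — this is not the package's actual `binding.js`, and the helper name is hypothetical — a loader for that `bin/<platform>/<arch>/llama-node.node` layout typically resolves the addon path from `process.platform` and `process.arch`:

```ts
// Illustrative sketch, assuming the bin/<platform>/<arch>/llama-node.node
// layout shown in the file listing; the package's real lib/binding.js may differ.
import * as path from 'path';

// resolveBindingPath is a hypothetical helper, not part of this package's API.
function resolveBindingPath(packageRoot: string): string {
  // process.platform is 'darwin' | 'linux' | 'win32' on the platforms listed above,
  // process.arch is 'x64' or 'arm64' for the shipped binaries.
  return path.join(packageRoot, 'bin', process.platform, process.arch, 'llama-node.node');
}

// Usage sketch: the resolved .node file would be loaded with require(),
// e.g. const binding = require(resolveBindingPath(path.join(__dirname, '..')));
```

The diff below covers entry 78, `package/src/llama.cpp/examples/llava/clip.cpp` (+2027 lines, new file).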
package/src/llama.cpp/examples/llava/clip.cpp
@@ -0,0 +1,2027 @@
+ // NOTE: This is modified from clip.cpp only for LLaVA,
+ // so there might be still unnecessary artifacts hanging around
+ // I'll gradually clean and extend it
+ // Note: Even when using identical normalized image inputs (see normalize_image_u8_to_f32()) we have a significant difference in resulting embeddings compared to pytorch
+ #include "clip.h"
+ #include "log.h"
+ #include "ggml.h"
+ #include "ggml-alloc.h"
+ #include "ggml-backend.h"
+
+ #ifdef GGML_USE_CUDA
+ #include "ggml-cuda.h"
+ #endif
+
+ #ifdef GGML_USE_METAL
+ #include "ggml-metal.h"
+ #endif
+
+ #define STB_IMAGE_IMPLEMENTATION
+ #include "stb_image.h"
+
+ #include <cassert>
+ #include <cmath>
+ #include <cstdlib>
+ #include <cstring>
+ #include <fstream>
+ #include <map>
+ #include <regex>
+ #include <stdexcept>
+ #include <vector>
+ #include <sstream>
+ #include <cinttypes>
+ #include <limits>
+
+ //#define CLIP_DEBUG_FUNCTIONS
+
+ // RGB uint8 image
+ struct clip_image_u8 {
+ int nx;
+ int ny;
+
+ std::vector<uint8_t> buf;
+ };
+
+ // RGB float32 image (NHWC)
+ // Memory layout: RGBRGBRGB...
+ struct clip_image_f32 {
+ int nx;
+ int ny;
+
+ std::vector<float> buf;
+ };
+
+ static std::string format(const char * fmt, ...) {
+ va_list ap;
+ va_list ap2;
+ va_start(ap, fmt);
+ va_copy(ap2, ap);
+ int size = vsnprintf(NULL, 0, fmt, ap);
+ GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT
+ std::vector<char> buf(size + 1);
+ int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
+ GGML_ASSERT(size2 == size);
+ va_end(ap2);
+ va_end(ap);
+ return std::string(buf.data(), buf.size());
+ }
+
+ //
+ // key constants
+ //
+
+ #define KEY_FTYPE "general.file_type"
+ #define KEY_NAME "general.name"
+ #define KEY_DESCRIPTION "general.description"
+ #define KEY_HAS_TEXT_ENC "clip.has_text_encoder"
+ #define KEY_HAS_VIS_ENC "clip.has_vision_encoder"
+ #define KEY_HAS_LLAVA_PROJ "clip.has_llava_projector"
+ #define KEY_USE_GELU "clip.use_gelu"
+ #define KEY_N_EMBD "clip.%s.embedding_length"
+ #define KEY_N_FF "clip.%s.feed_forward_length"
+ #define KEY_N_BLOCK "clip.%s.block_count"
+ #define KEY_N_HEAD "clip.%s.attention.head_count"
+ #define KEY_LAYER_NORM_EPS "clip.%s.attention.layer_norm_epsilon"
+ #define KEY_PROJ_DIM "clip.%s.projection_dim"
+ #define KEY_TOKENS "tokenizer.ggml.tokens"
+ #define KEY_N_POSITIONS "clip.text.context_length"
+ #define KEY_IMAGE_SIZE "clip.vision.image_size"
+ #define KEY_PATCH_SIZE "clip.vision.patch_size"
+ #define KEY_IMAGE_MEAN "clip.vision.image_mean"
+ #define KEY_IMAGE_STD "clip.vision.image_std"
+ #define KEY_PROJ_TYPE "clip.projector_type"
+
+ #define KEY_MM_PATCH_MERGE_TYPE "clip.vision.mm_patch_merge_type"
+ #define KEY_IMAGE_GRID_PINPOINTS "clip.vision.image_grid_pinpoints"
+ #define KEY_IMAGE_CROP_RESOLUTION "clip.vision.image_crop_resolution"
+
+
+ //
+ // tensor name constants
+ //
+
+ #define TN_TOKEN_EMBD "%s.token_embd.weight"
+ #define TN_POS_EMBD "%s.position_embd.weight"
+ #define TN_CLASS_EMBD "v.class_embd"
+ #define TN_PATCH_EMBD "v.patch_embd.weight"
+ #define TN_ATTN_K "%s.blk.%d.attn_k.%s"
+ #define TN_ATTN_Q "%s.blk.%d.attn_q.%s"
+ #define TN_ATTN_V "%s.blk.%d.attn_v.%s"
+ #define TN_ATTN_OUTPUT "%s.blk.%d.attn_out.%s"
+ #define TN_FFN_DOWN "%s.blk.%d.ffn_down.%s"
+ #define TN_FFN_UP "%s.blk.%d.ffn_up.%s"
+ #define TN_LN_1 "%s.blk.%d.ln1.%s"
+ #define TN_LN_2 "%s.blk.%d.ln2.%s"
+ #define TN_LN_PRE "%s.pre_ln.%s"
+ #define TN_LN_POST "%s.post_ln.%s"
+ #define TN_TEXT_PROJ "text_projection.weight"
+ #define TN_VIS_PROJ "visual_projection.weight"
+ #define TN_LLAVA_PROJ "mm.%d.%s"
+ #define TN_MVLM_PROJ_MLP "mm.model.mlp.%d.%s"
+ #define TN_MVLM_PROJ_BLOCK "mm.model.mb_block.%d.block.%d.%s"
+ #define TN_MVLM_PROJ_PEG "mm.model.peg.%d.%s"
+ #define TN_IMAGE_NEWLINE "model.image_newline"
+
+
+ enum projector_type {
+ PROJECTOR_TYPE_MLP,
+ PROJECTOR_TYPE_MLP_NORM,
+ PROJECTOR_TYPE_LDP,
+ PROJECTOR_TYPE_LDPV2,
+ PROJECTOR_TYPE_UNKNOWN,
+ };
+
+ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
+ { PROJECTOR_TYPE_MLP, "mlp" },
+ { PROJECTOR_TYPE_LDP, "ldp" },
+ { PROJECTOR_TYPE_LDPV2, "ldpv2"},
+ };
+
+
+ //
+ // utilities to get data from a gguf file
+ //
+
+ static int get_key_idx(const gguf_context * ctx, const char * key) {
+ int i = gguf_find_key(ctx, key);
+ if (i == -1) {
+ LOG_TEE("key %s not found in file\n", key);
+ throw std::runtime_error(format("Missing required key: %s", key));
+ }
+
+ return i;
+ }
+
+ static uint32_t get_u32(const gguf_context * ctx, const std::string & key) {
+ const int i = get_key_idx(ctx, key.c_str());
+
+ return gguf_get_val_u32(ctx, i);
+ }
+
+ static float get_f32(const gguf_context * ctx, const std::string & key) {
+ const int i = get_key_idx(ctx, key.c_str());
+
+ return gguf_get_val_f32(ctx, i);
+ }
+
+ static struct ggml_tensor * get_tensor(struct ggml_context * ctx, const std::string & name) {
+ struct ggml_tensor * cur = ggml_get_tensor(ctx, name.c_str());
+ if (!cur) {
+ throw std::runtime_error(format("%s: unable to find tensor %s\n", __func__, name.c_str()));
+ }
+
+ return cur;
+ }
+
+ static std::string get_ftype(int ftype) {
+ return ggml_type_name(static_cast<ggml_type>(ftype));
+ }
+
+ static std::string gguf_data_to_str(enum gguf_type type, const void * data, int i) {
+ switch (type) {
+ case GGUF_TYPE_UINT8: return std::to_string(((const uint8_t *)data)[i]);
+ case GGUF_TYPE_INT8: return std::to_string(((const int8_t *)data)[i]);
+ case GGUF_TYPE_UINT16: return std::to_string(((const uint16_t *)data)[i]);
+ case GGUF_TYPE_INT16: return std::to_string(((const int16_t *)data)[i]);
+ case GGUF_TYPE_UINT32: return std::to_string(((const uint32_t *)data)[i]);
+ case GGUF_TYPE_INT32: return std::to_string(((const int32_t *)data)[i]);
+ case GGUF_TYPE_UINT64: return std::to_string(((const uint64_t *)data)[i]);
+ case GGUF_TYPE_INT64: return std::to_string(((const int64_t *)data)[i]);
+ case GGUF_TYPE_FLOAT32: return std::to_string(((const float *)data)[i]);
+ case GGUF_TYPE_FLOAT64: return std::to_string(((const double *)data)[i]);
+ case GGUF_TYPE_BOOL: return ((const bool *)data)[i] ? "true" : "false";
+ default: return format("unknown type %d", type);
+ }
+ }
+
+ static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
+ std::string result;
+ for (size_t pos = 0; ; pos += search.length()) {
+ auto new_pos = s.find(search, pos);
+ if (new_pos == std::string::npos) {
+ result += s.substr(pos, s.size() - pos);
+ break;
+ }
+ result += s.substr(pos, new_pos - pos) + replace;
+ pos = new_pos;
+ }
+ s = std::move(result);
+ }
+
+ static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) {
+ const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i);
+
+ switch (type) {
+ case GGUF_TYPE_STRING:
+ return gguf_get_val_str(ctx_gguf, i);
+ case GGUF_TYPE_ARRAY:
+ {
+ const enum gguf_type arr_type = gguf_get_arr_type(ctx_gguf, i);
+ int arr_n = gguf_get_arr_n(ctx_gguf, i);
+ const void * data = gguf_get_arr_data(ctx_gguf, i);
+ std::stringstream ss;
+ ss << "[";
+ for (int j = 0; j < arr_n; j++) {
+ if (arr_type == GGUF_TYPE_STRING) {
+ std::string val = gguf_get_arr_str(ctx_gguf, i, j);
+ // escape quotes
+ replace_all(val, "\\", "\\\\");
+ replace_all(val, "\"", "\\\"");
+ ss << '"' << val << '"';
+ } else if (arr_type == GGUF_TYPE_ARRAY) {
+ ss << "???";
+ } else {
+ ss << gguf_data_to_str(arr_type, data, j);
+ }
+ if (j < arr_n - 1) {
+ ss << ", ";
+ }
+ }
+ ss << "]";
+ return ss.str();
+ }
+ default:
+ return gguf_data_to_str(type, gguf_get_val_data(ctx_gguf, i), 0);
+ }
+ }
+
+ static void print_tensor_info(const ggml_tensor * tensor, const char * prefix = "") {
+ size_t tensor_size = ggml_nbytes(tensor);
+ LOG_TEE("%s: n_dims = %d, name = %s, tensor_size=%zu, shape:[%" PRId64 ", %" PRId64 ", %" PRId64 ", %" PRId64 "], type = %s\n",
+ prefix, ggml_n_dims(tensor), tensor->name, tensor_size,
+ tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], ggml_type_name(tensor->type));
+ }
+
+ static projector_type clip_projector_type_from_string(const std::string & name) {
+ for (const auto & kv : PROJECTOR_TYPE_NAMES) { // NOLINT
+ if (kv.second == name) {
+ return kv.first;
+ }
+ }
+ return PROJECTOR_TYPE_UNKNOWN;
+ }
+
+ #ifdef CLIP_DEBUG_FUNCTIONS
+ static void clip_image_write_image_to_ppm(const clip_image_u8& img, const std::string& filename) {
+ std::ofstream file(filename, std::ios::binary);
+ if (!file.is_open()) {
+ LOG_TEE("Failed to open file for writing: %s\n", filename.c_str());
+ return;
+ }
+
+ // PPM header: P6 format, width, height, and max color value
+ file << "P6\n" << img.nx << " " << img.ny << "\n255\n";
+
+ // Write pixel data
+ for (size_t i = 0; i < img.buf.size(); i += 3) {
+ // PPM expects binary data in RGB format, which matches our image buffer
+ file.write(reinterpret_cast<const char*>(&img.buf[i]), 3);
+ }
+
+ file.close();
+ }
+
+ static void clip_image_save_to_bmp(const clip_image_u8& img, const std::string& filename) {
+ std::ofstream file(filename, std::ios::binary);
+ if (!file.is_open()) {
+ LOG_TEE("Failed to open file for writing: %s\n", filename.c_str());
+ return;
+ }
+
+ int fileSize = 54 + 3 * img.nx * img.ny; // File header + info header + pixel data
+ int bytesPerPixel = 3;
+ int widthInBytes = img.nx * bytesPerPixel;
+ int paddingAmount = (4 - (widthInBytes % 4)) % 4;
+ int stride = widthInBytes + paddingAmount;
+
+ // Bitmap file header
+ unsigned char fileHeader[14] = {
+ 'B','M', // Signature
+ 0,0,0,0, // Image file size in bytes
+ 0,0,0,0, // Reserved
+ 54,0,0,0 // Start of pixel array
+ };
+
+ // Total file size
+ fileSize = 54 + (stride * img.ny);
+ fileHeader[2] = (unsigned char)(fileSize);
+ fileHeader[3] = (unsigned char)(fileSize >> 8);
+ fileHeader[4] = (unsigned char)(fileSize >> 16);
+ fileHeader[5] = (unsigned char)(fileSize >> 24);
+
+ // Bitmap information header (BITMAPINFOHEADER)
+ unsigned char infoHeader[40] = {
+ 40,0,0,0, // Size of this header (40 bytes)
+ 0,0,0,0, // Image width
+ 0,0,0,0, // Image height
+ 1,0, // Number of color planes
+ 24,0, // Bits per pixel
+ 0,0,0,0, // No compression
+ 0,0,0,0, // Image size (can be 0 for no compression)
+ 0,0,0,0, // X pixels per meter (not specified)
+ 0,0,0,0, // Y pixels per meter (not specified)
+ 0,0,0,0, // Total colors (color table not used)
+ 0,0,0,0 // Important colors (all are important)
+ };
+
+ // Width and height in the information header
+ infoHeader[4] = (unsigned char)(img.nx);
+ infoHeader[5] = (unsigned char)(img.nx >> 8);
+ infoHeader[6] = (unsigned char)(img.nx >> 16);
+ infoHeader[7] = (unsigned char)(img.nx >> 24);
+ infoHeader[8] = (unsigned char)(img.ny);
+ infoHeader[9] = (unsigned char)(img.ny >> 8);
+ infoHeader[10] = (unsigned char)(img.ny >> 16);
+ infoHeader[11] = (unsigned char)(img.ny >> 24);
+
+ // Write file headers
+ file.write(reinterpret_cast<char*>(fileHeader), sizeof(fileHeader));
+ file.write(reinterpret_cast<char*>(infoHeader), sizeof(infoHeader));
+
+ // Pixel data
+ std::vector<unsigned char> padding(3, 0); // Max padding size to be added to each row
+ for (int y = img.ny - 1; y >= 0; --y) { // BMP files are stored bottom-to-top
+ for (int x = 0; x < img.nx; ++x) {
+ // Each pixel
+ size_t pixelIndex = (y * img.nx + x) * 3;
+ unsigned char pixel[3] = {
+ img.buf[pixelIndex + 2], // BMP stores pixels in BGR format
+ img.buf[pixelIndex + 1],
+ img.buf[pixelIndex]
+ };
+ file.write(reinterpret_cast<char*>(pixel), 3);
+ }
+ // Write padding for the row
+ file.write(reinterpret_cast<char*>(padding.data()), paddingAmount);
+ }
+
+ file.close();
+ }
+
+ // debug function to convert f32 to u8
+ static void clip_image_convert_f32_to_u8(const clip_image_f32& src, clip_image_u8& dst) {
+ dst.nx = src.nx;
+ dst.ny = src.ny;
+ dst.buf.resize(3 * src.nx * src.ny);
+ for (size_t i = 0; i < src.buf.size(); ++i) {
+ dst.buf[i] = static_cast<uint8_t>(std::min(std::max(int(src.buf[i] * 255.0f), 0), 255));
+ }
+ }
+ #endif
+
+
+ //
+ // clip layers
+ //
+
+ struct clip_hparams {
+ int32_t image_size;
+ int32_t patch_size;
+ int32_t hidden_size;
+ int32_t n_intermediate;
+ int32_t projection_dim;
+ int32_t n_head;
+ int32_t n_layer;
+
+ float eps;
+
+ char mm_patch_merge_type[32] = "flat"; // spatial_unpad or flat (default)
+
+ int32_t image_grid_pinpoints[32];
+ int32_t image_crop_resolution;
+ };
+
+ struct clip_layer {
+ // attention
+ struct ggml_tensor * k_w;
+ struct ggml_tensor * k_b;
+ struct ggml_tensor * q_w;
+ struct ggml_tensor * q_b;
+ struct ggml_tensor * v_w;
+ struct ggml_tensor * v_b;
+
+ struct ggml_tensor * o_w;
+ struct ggml_tensor * o_b;
+
+ // layernorm 1
+ struct ggml_tensor * ln_1_w;
+ struct ggml_tensor * ln_1_b;
+
+ // ff
+ struct ggml_tensor * ff_i_w;
+ struct ggml_tensor * ff_i_b;
+
+ struct ggml_tensor * ff_o_w;
+ struct ggml_tensor * ff_o_b;
+
+ // layernorm 2
+ struct ggml_tensor * ln_2_w;
+ struct ggml_tensor * ln_2_b;
+ };
+
+ struct clip_vision_model {
+ struct clip_hparams hparams;
+
+ // embeddings
+ struct ggml_tensor * class_embedding;
+ struct ggml_tensor * patch_embeddings;
+ struct ggml_tensor * position_embeddings;
+
+ struct ggml_tensor * pre_ln_w;
+ struct ggml_tensor * pre_ln_b;
+
+ std::vector<clip_layer> layers;
+
+ struct ggml_tensor * post_ln_w;
+ struct ggml_tensor * post_ln_b;
+
+ struct ggml_tensor * projection;
+
+ // LLaVA projection
+ struct ggml_tensor * mm_0_w = NULL;
+ struct ggml_tensor * mm_0_b = NULL;
+ struct ggml_tensor * mm_2_w = NULL;
+ struct ggml_tensor * mm_2_b = NULL;
+
+ struct ggml_tensor * image_newline = NULL;
+
+ // Yi type models with mlp+normalization projection
+ struct ggml_tensor * mm_1_w = NULL; // Yi type models have 0, 1, 3, 4
+ struct ggml_tensor * mm_1_b = NULL;
+ struct ggml_tensor * mm_3_w = NULL;
+ struct ggml_tensor * mm_3_b = NULL;
+ struct ggml_tensor * mm_4_w = NULL;
+ struct ggml_tensor * mm_4_b = NULL;
+
+ // MobileVLM projection
+ struct ggml_tensor * mm_model_mlp_1_w;
+ struct ggml_tensor * mm_model_mlp_1_b;
+ struct ggml_tensor * mm_model_mlp_3_w;
+ struct ggml_tensor * mm_model_mlp_3_b;
+ struct ggml_tensor * mm_model_block_1_block_0_0_w;
+ struct ggml_tensor * mm_model_block_1_block_0_1_w;
+ struct ggml_tensor * mm_model_block_1_block_0_1_b;
+ struct ggml_tensor * mm_model_block_1_block_1_fc1_w;
+ struct ggml_tensor * mm_model_block_1_block_1_fc1_b;
+ struct ggml_tensor * mm_model_block_1_block_1_fc2_w;
+ struct ggml_tensor * mm_model_block_1_block_1_fc2_b;
+ struct ggml_tensor * mm_model_block_1_block_2_0_w;
+ struct ggml_tensor * mm_model_block_1_block_2_1_w;
+ struct ggml_tensor * mm_model_block_1_block_2_1_b;
+ struct ggml_tensor * mm_model_block_2_block_0_0_w;
+ struct ggml_tensor * mm_model_block_2_block_0_1_w;
+ struct ggml_tensor * mm_model_block_2_block_0_1_b;
+ struct ggml_tensor * mm_model_block_2_block_1_fc1_w;
+ struct ggml_tensor * mm_model_block_2_block_1_fc1_b;
+ struct ggml_tensor * mm_model_block_2_block_1_fc2_w;
+ struct ggml_tensor * mm_model_block_2_block_1_fc2_b;
+ struct ggml_tensor * mm_model_block_2_block_2_0_w;
+ struct ggml_tensor * mm_model_block_2_block_2_1_w;
+ struct ggml_tensor * mm_model_block_2_block_2_1_b;
+
+ // MobileVLM_V2 projection
+ struct ggml_tensor * mm_model_mlp_0_w;
+ struct ggml_tensor * mm_model_mlp_0_b;
+ struct ggml_tensor * mm_model_mlp_2_w;
+ struct ggml_tensor * mm_model_mlp_2_b;
+ struct ggml_tensor * mm_model_peg_0_w;
+ struct ggml_tensor * mm_model_peg_0_b;
+ };
+
+ struct clip_ctx {
+ bool has_text_encoder = false;
+ bool has_vision_encoder = false;
+ bool has_llava_projector = false;
+
+ struct clip_vision_model vision_model;
+ projector_type proj_type = PROJECTOR_TYPE_MLP;
+
+ float image_mean[3];
+ float image_std[3];
+ bool use_gelu = false;
+ int32_t ftype = 1;
+
+ struct gguf_context * ctx_gguf;
+ struct ggml_context * ctx_data;
+
+ std::vector<uint8_t> buf_compute_meta;
+
+ // memory buffers to evaluate the model
+ ggml_backend_buffer_t params_buffer = NULL;
+
+ ggml_backend_t backend = NULL;
+ ggml_gallocr_t compute_alloc = NULL;
+ };
+
+ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs) {
+ if (!ctx->has_vision_encoder) {
+ LOG_TEE("This gguf file seems to have no vision encoder\n");
+ return nullptr;
+ }
+
+ const auto & model = ctx->vision_model;
+ const auto & hparams = model.hparams;
+
+ const int image_size = hparams.image_size;
+ const int patch_size = hparams.patch_size;
+ const int num_patches = ((image_size / patch_size) * (image_size / patch_size));
+ const int num_patches_per_side = image_size / patch_size; GGML_UNUSED(num_patches_per_side);
+ const int num_positions = num_patches + 1;
+ const int hidden_size = hparams.hidden_size;
+ const int n_head = hparams.n_head;
+ const int d_head = hidden_size / n_head;
+ const int n_layer = hparams.n_layer;
+ const float eps = hparams.eps;
+
+ const int batch_size = imgs->size;
+
+ if (ctx->has_llava_projector) {
+ GGML_ASSERT(batch_size == 1);
+ }
+
+ struct ggml_init_params params = {
+ /*.mem_size =*/ ctx->buf_compute_meta.size(),
+ /*.mem_buffer =*/ ctx->buf_compute_meta.data(),
+ /*.no_alloc =*/ true,
+ };
+
+ struct ggml_context * ctx0 = ggml_init(params);
+ struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+
+ struct ggml_tensor * inp_raw = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, image_size, image_size, 3, batch_size);
+ ggml_set_name(inp_raw, "inp_raw");
+ ggml_set_input(inp_raw);
+
+ struct ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
+
+ inp = ggml_reshape_3d(ctx0, inp, num_patches, hidden_size, batch_size);
+ inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3));
+
+ // concat class_embeddings and patch_embeddings
+ struct ggml_tensor * embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size);
+ ggml_set_name(embeddings, "embeddings");
+ ggml_set_input(embeddings);
+
+ embeddings = ggml_acc(ctx0, embeddings, model.class_embedding,
+ embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0);
+
+ embeddings = ggml_acc(ctx0, embeddings, inp,
+ embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);
+
+ struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions);
+ ggml_set_name(positions, "positions");
+ ggml_set_input(positions);
+
+ embeddings =
+ ggml_add(ctx0, embeddings, ggml_get_rows(ctx0, model.position_embeddings, positions));
+
+ // pre-layernorm
+ {
+ embeddings = ggml_norm(ctx0, embeddings, eps);
+ ggml_set_name(embeddings, "pre_ln");
+
+ embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.pre_ln_w), model.pre_ln_b);
+ }
+
+ // loop over layers
+ for (int il = 0; il < n_layer - 1; il++) {
+ struct ggml_tensor * cur = embeddings; // embeddings = residual, cur = hidden_states
+
+ //const size_t nb_q_w = model.layers[il].q_w->nb[0];
+
+ // layernorm1
+ {
+ cur = ggml_norm(ctx0, cur, eps);
+
+ cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ln_1_w),
+ model.layers[il].ln_1_b);
+ }
+
+ // self-attention
+ {
+
+ struct ggml_tensor * Q =
+ ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].q_w, cur), model.layers[il].q_b);
+
+ Q = ggml_scale_inplace(ctx0, Q, 1.0f / sqrt((float)d_head));
+ Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, num_positions, batch_size);
+ Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3));
+ Q = ggml_reshape_3d(ctx0, Q, d_head, num_positions, n_head * batch_size);
+
+ struct ggml_tensor * K =
+ ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].k_w, cur), model.layers[il].k_b);
+
+ K = ggml_reshape_4d(ctx0, K, d_head, n_head, num_positions, batch_size);
+ K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3));
+ K = ggml_reshape_3d(ctx0, K, d_head, num_positions, n_head * batch_size);
+
+ struct ggml_tensor * V =
+ ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].v_w, cur), model.layers[il].v_b);
+
+ V = ggml_reshape_4d(ctx0, V, d_head, n_head, num_positions, batch_size);
+ V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3));
+ V = ggml_reshape_3d(ctx0, V, num_positions, d_head, n_head * batch_size);
+
+ struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
+ KQ = ggml_soft_max_inplace(ctx0, KQ);
+ struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ);
+ KQV = ggml_reshape_4d(ctx0, KQV, d_head, num_positions, n_head, batch_size);
+ KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
+
+ cur = ggml_cont_3d(ctx0, KQV, hidden_size, num_positions, batch_size);
+ }
+
+ // attention output
+ cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].o_w, cur), model.layers[il].o_b);
+
+ // re-add the layer input, e.g., residual
+ cur = ggml_add(ctx0, cur, embeddings);
+
+ embeddings = cur; // embeddings = residual, cur = hidden_states
+
+ // layernorm2
+ {
+ cur = ggml_norm(ctx0, cur, eps);
+
+ cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ln_2_w), model.layers[il].ln_2_b);
+ }
+
+ cur = ggml_mul_mat(ctx0, model.layers[il].ff_i_w, cur);
+ cur = ggml_add(ctx0, cur, model.layers[il].ff_i_b);
+
+ if (ctx->use_gelu) {
+ cur = ggml_gelu_inplace(ctx0, cur);
+ } else {
+ cur = ggml_gelu_quick_inplace(ctx0, cur);
+ }
+
+ cur = ggml_mul_mat(ctx0, model.layers[il].ff_o_w, cur);
+ cur = ggml_add(ctx0, cur, model.layers[il].ff_o_b);
+
+ // residual 2
+ cur = ggml_add(ctx0, embeddings, cur);
+
+ embeddings = cur;
+ }
+
+ // llava projector
+ {
+ embeddings = ggml_reshape_2d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1]);
+
+ struct ggml_tensor * patches = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_patches);
+ ggml_set_name(patches, "patches");
+ ggml_set_input(patches);
+
+ // shape [1, 576, 1024]
+ // ne is whcn, ne = [1024, 576, 1, 1]
+ embeddings = ggml_get_rows(ctx0, embeddings, patches);
+
+ // print_tensor_info(embeddings, "embeddings");
+
+ // llava projector
+ if (ctx->proj_type == PROJECTOR_TYPE_MLP) {
+ embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
+ embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
+
+ embeddings = ggml_gelu(ctx0, embeddings);
+ embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings);
+ embeddings = ggml_add(ctx0, embeddings, model.mm_2_b);
+
+ } else if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) {
+ embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
+ embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
+ // ggml_tensor_printf(embeddings, "mm_0_w",0,true,false);
+ // First LayerNorm
+ embeddings = ggml_norm(ctx0, embeddings, eps);
+ embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_1_w),
+ model.mm_1_b);
+
+ // GELU activation
+ embeddings = ggml_gelu(ctx0, embeddings);
+
+ // Second linear layer
+ embeddings = ggml_mul_mat(ctx0, model.mm_3_w, embeddings);
+ embeddings = ggml_add(ctx0, embeddings, model.mm_3_b);
+
+ // Second LayerNorm
+ embeddings = ggml_norm(ctx0, embeddings, eps);
+ embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_4_w),
+ model.mm_4_b);
+ }
+ else if (ctx->proj_type == PROJECTOR_TYPE_LDP) {
+ // MobileVLM projector
+ int n_patch = 24;
+ struct ggml_tensor * mlp_1 = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w, embeddings);
+ mlp_1 = ggml_add(ctx0, mlp_1, model.mm_model_mlp_1_b);
+ mlp_1 = ggml_gelu(ctx0, mlp_1);
+ struct ggml_tensor * mlp_3 = ggml_mul_mat(ctx0, model.mm_model_mlp_3_w, mlp_1);
+ mlp_3 = ggml_add(ctx0, mlp_3, model.mm_model_mlp_3_b);
+ // mlp_3 shape = [1, 576, 2048], ne = [2048, 576, 1, 1]
+
+ // block 1
+ struct ggml_tensor * block_1 = nullptr;
+ {
+ // transpose from [1, 576, 2048] --> [1, 2048, 576] --> [1, 2048, 24, 24]
+ mlp_3 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_3, 1, 0, 2, 3));
+ mlp_3 = ggml_reshape_4d(ctx0, mlp_3, n_patch, n_patch, mlp_3->ne[1], mlp_3->ne[2]);
+ // stride = 1, padding = 1, bias is nullptr
+ block_1 = ggml_conv_depthwise_2d(ctx0, model.mm_model_block_1_block_0_0_w, mlp_3, 1, 1, 1, 1, 1, 1);
+
+ // layer norm
+ // // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
+ block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 2, 0, 3));
+ // block_1 shape = [1, 24, 24, 2048], ne = [2048, 24, 24, 1]
+ block_1 = ggml_norm(ctx0, block_1, eps);
+ block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_1_block_0_1_w), model.mm_model_block_1_block_0_1_b);
+ block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3));
+
+ // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
+ // hardswish
+ struct ggml_tensor * block_1_hw = ggml_hardswish(ctx0, block_1);
+
+ block_1 = ggml_pool_2d(ctx0, block_1_hw, GGML_OP_POOL_AVG, block_1_hw->ne[0], block_1_hw->ne[1], block_1_hw->ne[0], block_1_hw->ne[1], 0, 0);
+ // block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1]
+ // pointwise conv
+ block_1 = ggml_reshape_2d(ctx0, block_1, block_1->ne[0]*block_1->ne[1]*block_1->ne[2], block_1->ne[3]);
+ block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_1_fc1_w, block_1);
+ block_1 = ggml_add(ctx0, block_1, model.mm_model_block_1_block_1_fc1_b);
+ block_1 = ggml_relu(ctx0, block_1);
+ block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_1_fc2_w, block_1);
+ block_1 = ggml_add(ctx0, block_1, model.mm_model_block_1_block_1_fc2_b);
+ block_1 = ggml_hardsigmoid(ctx0, block_1);
+ // block_1_hw shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1], block_1 shape = [1, 2048], ne = [2048, 1, 1, 1]
+ block_1 = ggml_reshape_4d(ctx0, block_1, 1, 1, block_1->ne[0], block_1->ne[1]);
+ block_1 = ggml_mul(ctx0, block_1_hw, block_1);
+
+ int w = block_1->ne[0], h = block_1->ne[1];
+ block_1 = ggml_reshape_3d(ctx0, block_1, w*h, block_1->ne[2], block_1->ne[3]);
+ block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 0, 2, 3));
+
+ // block_1 shape = [1, 24*24, 2048], ne = [24*24, 2048, 1]
+ block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_2_0_w, block_1);
+ block_1 = ggml_reshape_4d(ctx0, block_1, block_1->ne[0], w, h, block_1->ne[3]);
+
+ // block_1 shape = [1, 24, 24, 2048], ne = [2048, 24, 24, 1]
+ block_1 = ggml_norm(ctx0, block_1, eps);
+ block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_1_block_2_1_w), model.mm_model_block_1_block_2_1_b);
+ block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3));
+ // block1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
+ // residual
+ block_1 = ggml_add(ctx0, mlp_3, block_1);
+ }
+
+ // block_2
+ {
+ // stride = 2
+ block_1 = ggml_conv_depthwise_2d(ctx0, model.mm_model_block_2_block_0_0_w, block_1, 2, 2, 1, 1, 1, 1);
+
+ // block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1]
+ // layer norm
+ block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 2, 0, 3));
+ // block_1 shape = [1, 12, 12, 2048], ne = [2048, 12, 12, 1]
+ block_1 = ggml_norm(ctx0, block_1, eps);
+ block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_2_block_0_1_w), model.mm_model_block_2_block_0_1_b);
+ block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3));
+ // block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1]
+ // hardswish
+ struct ggml_tensor * block_1_hw = ggml_hardswish(ctx0, block_1);
+
+ // not sure the parameters is right for globalAvgPooling
+ block_1 = ggml_pool_2d(ctx0, block_1_hw, GGML_OP_POOL_AVG, block_1_hw->ne[0], block_1_hw->ne[1], block_1_hw->ne[0], block_1_hw->ne[1], 0, 0);
+ // block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1]
+ // pointwise conv
+ block_1 = ggml_reshape_2d(ctx0, block_1, block_1->ne[0]*block_1->ne[1]*block_1->ne[2], block_1->ne[3]);
+ block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_1_fc1_w, block_1);
+ block_1 = ggml_add(ctx0, block_1, model.mm_model_block_2_block_1_fc1_b);
+ block_1 = ggml_relu(ctx0, block_1);
+ block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_1_fc2_w, block_1);
+ block_1 = ggml_add(ctx0, block_1, model.mm_model_block_2_block_1_fc2_b);
+ block_1 = ggml_hardsigmoid(ctx0, block_1);
+
+ // block_1_hw shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1], block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1]
+ block_1 = ggml_reshape_4d(ctx0, block_1, 1, 1, block_1->ne[0], block_1->ne[1]);
+ block_1 = ggml_mul(ctx0, block_1_hw, block_1);
+
+ int w = block_1->ne[0], h = block_1->ne[1];
+ block_1 = ggml_reshape_3d(ctx0, block_1, w*h, block_1->ne[2], block_1->ne[3]);
+ block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 0, 2, 3));
+ // block_1 shape = [1, 24*24, 2048], ne = [24*24, 2048, 1]
+ block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_2_0_w, block_1);
+ block_1 = ggml_reshape_4d(ctx0, block_1, block_1->ne[0], w, h, block_1->ne[3]);
+
+
+ // block_1 shape = [1, 12, 12, 2048], ne = [2048, 12, 12, 1]
+ block_1 = ggml_norm(ctx0, block_1, eps);
+ block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_2_block_2_1_w), model.mm_model_block_2_block_2_1_b);
+ block_1 = ggml_reshape_3d(ctx0, block_1, block_1->ne[0], block_1->ne[1] * block_1->ne[2], block_1->ne[3]);
+ // block_1 shape = [1, 144, 2048], ne = [2048, 144, 1]
+ }
+ embeddings = block_1;
+ }
+ else if (ctx->proj_type == PROJECTOR_TYPE_LDPV2)
+ {
+ int n_patch = 24;
+ struct ggml_tensor * mlp_0 = ggml_mul_mat(ctx0, model.mm_model_mlp_0_w, embeddings);
+ mlp_0 = ggml_add(ctx0, mlp_0, model.mm_model_mlp_0_b);
+ mlp_0 = ggml_gelu(ctx0, mlp_0);
+ struct ggml_tensor * mlp_2 = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, mlp_0);
+ mlp_2 = ggml_add(ctx0, mlp_2, model.mm_model_mlp_2_b);
+ // mlp_2 ne = [2048, 576, 1, 1]
+ // // AVG Pool Layer 2*2, strides = 2
+ mlp_2 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_2, 1, 0, 2, 3));
+ // mlp_2 ne = [576, 2048, 1, 1]
+ mlp_2 = ggml_reshape_4d(ctx0, mlp_2, n_patch, n_patch, mlp_2->ne[1], mlp_2->ne[2]);
+ // mlp_2 ne [24, 24, 2048, 1]
+ mlp_2 = ggml_pool_2d(ctx0, mlp_2, GGML_OP_POOL_AVG, 2, 2, 2, 2, 0, 0);
+ // weight ne = [3, 3, 2048, 1]
+ struct ggml_tensor * peg_0 = ggml_conv_depthwise_2d(ctx0, model.mm_model_peg_0_w, mlp_2, 1, 1, 1, 1, 1, 1);
+ peg_0 = ggml_cont(ctx0, ggml_permute(ctx0, peg_0, 1, 2, 0, 3));
+ peg_0 = ggml_add(ctx0, peg_0, model.mm_model_peg_0_b);
+ mlp_2 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_2, 1, 2, 0, 3));
+ peg_0 = ggml_add(ctx0, peg_0, mlp_2);
+ peg_0 = ggml_reshape_3d(ctx0, peg_0, peg_0->ne[0], peg_0->ne[1] * peg_0->ne[2], peg_0->ne[3]);
+ embeddings = peg_0;
+ }
+ else {
+ GGML_ASSERT(false);
+ }
+ }
+
+ // build the graph
+ ggml_build_forward_expand(gf, embeddings);
+
+ ggml_free(ctx0);
+
+ return gf;
+ }
+
+ // read and create ggml_context containing the tensors and their data
+ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
+ struct ggml_context * meta = NULL;
+
+ struct gguf_init_params params = {
+ /*.no_alloc = */ true,
+ /*.ctx = */ &meta,
+ };
+
+ struct gguf_context * ctx = gguf_init_from_file(fname, params);
+ if (!ctx) {
+ throw std::runtime_error(format("%s: failed to load CLIP model from %s. Does this file exist?\n", __func__, fname));
+ }
+
+ if (verbosity >= 1) {
+ const int n_tensors = gguf_get_n_tensors(ctx);
+ const int n_kv = gguf_get_n_kv(ctx);
+ const int ftype = get_u32(ctx, KEY_FTYPE);
+ const std::string ftype_str = get_ftype(ftype);
+ const int idx_desc = get_key_idx(ctx, KEY_DESCRIPTION);
+ const std::string description = gguf_get_val_str(ctx, idx_desc);
+ const int idx_name = gguf_find_key(ctx, KEY_NAME);
+ if (idx_name != -1) { // make name optional temporarily as some of the uploaded models missing it due to a bug
+ const std::string name = gguf_get_val_str(ctx, idx_name);
+ LOG_TEE("%s: model name: %s\n", __func__, name.c_str());
+ }
+ LOG_TEE("%s: description: %s\n", __func__, description.c_str());
+ LOG_TEE("%s: GGUF version: %d\n", __func__, gguf_get_version(ctx));
+ LOG_TEE("%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx));
+ LOG_TEE("%s: n_tensors: %d\n", __func__, n_tensors);
+ LOG_TEE("%s: n_kv: %d\n", __func__, n_kv);
+ LOG_TEE("%s: ftype: %s\n", __func__, ftype_str.c_str());
+ LOG_TEE("\n");
+ }
+ const int n_tensors = gguf_get_n_tensors(ctx);
+
+ // kv
+ const int n_kv = gguf_get_n_kv(ctx);
+ LOG_TEE("%s: loaded meta data with %d key-value pairs and %d tensors from %s\n",
+ __func__, n_kv, n_tensors, fname);
+ {
+ std::map<enum ggml_type, uint32_t> n_type;
+
+ for (int i = 0; i < n_tensors; i++) {
+ enum ggml_type type = gguf_get_tensor_type(ctx, i);
+
+ n_type[type]++;
+ }
+
+ LOG_TEE("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__);
+ for (int i = 0; i < n_kv; i++) {
+ const char * name = gguf_get_key(ctx, i);
+ const enum gguf_type type = gguf_get_kv_type(ctx, i);
+ const std::string type_name =
+ type == GGUF_TYPE_ARRAY
+ ? format("%s[%s,%d]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(ctx, i)), gguf_get_arr_n(ctx, i))
+ : gguf_type_name(type);
+
+ std::string value = gguf_kv_to_str(ctx, i);
+ const size_t MAX_VALUE_LEN = 40;
+ if (value.size() > MAX_VALUE_LEN) {
+ value = format("%s...", value.substr(0, MAX_VALUE_LEN - 3).c_str());
+ }
+ replace_all(value, "\n", "\\n");
+
+ LOG_TEE("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str());
+ }
+
+ // print type counts
+ for (auto & kv : n_type) {
+ if (kv.second == 0) {
+ continue;
+ }
+
+ LOG_TEE("%s: - type %4s: %4d tensors\n", __func__, ggml_type_name(kv.first), kv.second);
+ }
+ }
+
+ // data
+ size_t model_size = 0;
+ {
+ for (int i = 0; i < n_tensors; ++i) {
+ const char * name = gguf_get_tensor_name(ctx, i);
+ const size_t offset = gguf_get_tensor_offset(ctx, i);
+ enum ggml_type type = gguf_get_tensor_type(ctx, i);
+ struct ggml_tensor * cur = ggml_get_tensor(meta, name);
+ size_t tensor_size = ggml_nbytes(cur);
+ model_size += tensor_size;
+ if (verbosity >= 3) {
+ LOG_TEE("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu, shape:[%" PRIu64 ", %" PRIu64 ", %" PRIu64 ", %" PRIu64 "], type = %s\n",
+ __func__, i, ggml_n_dims(cur), cur->name, tensor_size, offset, cur->ne[0], cur->ne[1], cur->ne[2], cur->ne[3], ggml_type_name(type));
+ }
+ }
+ }
+
+ clip_ctx * new_clip = new clip_ctx;
+
+ // update projector type
+ {
+ int idx = gguf_find_key(ctx, KEY_PROJ_TYPE);
+ if (idx != -1) {
+ const std::string proj_type = gguf_get_val_str(ctx, idx);
+ new_clip->proj_type = clip_projector_type_from_string(proj_type);
+ } else {
+ new_clip->proj_type = PROJECTOR_TYPE_MLP;
+ }
+
+ if (new_clip->proj_type == PROJECTOR_TYPE_MLP) {
+ if (gguf_find_tensor(ctx, format(TN_LLAVA_PROJ, 3, "weight").c_str()) != -1) {
+ new_clip->proj_type = PROJECTOR_TYPE_MLP_NORM;
+ }
+ }
+ }
+
+ #ifdef GGML_USE_CUDA
+ new_clip->backend = ggml_backend_cuda_init(0);
+ LOG_TEE("%s: CLIP using CUDA backend\n", __func__);
+ #endif
+
+ #ifdef GGML_USE_METAL
+ new_clip->backend = ggml_backend_metal_init();
+ LOG_TEE("%s: CLIP using Metal backend\n", __func__);
+ #endif
+
+
+ if (!new_clip->backend) {
+ new_clip->backend = ggml_backend_cpu_init();
+ LOG_TEE("%s: CLIP using CPU backend\n", __func__);
+ }
+
+ // model size and capabilities
+ {
+ int idx = get_key_idx(ctx, KEY_HAS_TEXT_ENC);
+ new_clip->has_text_encoder = gguf_get_val_bool(ctx, idx);
+
+ idx = get_key_idx(ctx, KEY_HAS_VIS_ENC);
+ new_clip->has_vision_encoder = gguf_get_val_bool(ctx, idx);
+
+ idx = gguf_find_key(ctx, KEY_HAS_LLAVA_PROJ);
+ if (idx != -1) {
+ new_clip->has_llava_projector = gguf_get_val_bool(ctx, idx);
+ }
+
+ GGML_ASSERT(new_clip->has_llava_projector); // see monatis/clip.cpp for image and/or text encoding for semantic search
+ GGML_ASSERT(new_clip->has_vision_encoder);
+ GGML_ASSERT(!new_clip->has_text_encoder);
+
+ idx = get_key_idx(ctx, KEY_USE_GELU);
+ new_clip->use_gelu = gguf_get_val_bool(ctx, idx);
+
+ if (verbosity >= 1) {
+ LOG_TEE("%s: text_encoder: %d\n", __func__, new_clip->has_text_encoder);
+ LOG_TEE("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder);
+ LOG_TEE("%s: llava_projector: %d\n", __func__, new_clip->has_llava_projector);
+ LOG_TEE("%s: model size: %.2f MB\n", __func__, model_size / 1024.0 / 1024.0);
+ LOG_TEE("%s: metadata size: %.2f MB\n", __func__, ggml_get_mem_size(meta) / 1024.0 / 1024.0);
+ }
+ }
+
+ LOG_TEE("%s: params backend buffer size = % 6.2f MB (%i tensors)\n", __func__, model_size / (1024.0 * 1024.0), n_tensors);
+
+ // load tensors
+ {
+ std::vector<uint8_t> read_buf;
+ struct ggml_init_params params = {
+ /*.mem_size =*/ (n_tensors + 1) * ggml_tensor_overhead(),
+ /*.mem_buffer =*/ NULL,
+ /*.no_alloc =*/ true,
+ };
+
+ new_clip->ctx_data = ggml_init(params);
+ if (!new_clip->ctx_data) {
+ LOG_TEE("%s: ggml_init() failed\n", __func__);
+ clip_free(new_clip);
+ gguf_free(ctx);
+ return nullptr;
+ }
+
+ auto fin = std::ifstream(fname, std::ios::binary);
+ if (!fin) {
+ LOG_TEE("cannot open model file for loading tensors\n");
+ clip_free(new_clip);
+ gguf_free(ctx);
+ return nullptr;
+ }
+
+ // add tensors to context
+ for (int i = 0; i < n_tensors; ++i) {
+ const char * name = gguf_get_tensor_name(ctx, i);
+ struct ggml_tensor * t = ggml_get_tensor(meta, name);
+ struct ggml_tensor * cur = ggml_dup_tensor(new_clip->ctx_data, t);
+ ggml_set_name(cur, name);
+ }
+
+ // alloc memory and offload data
+ new_clip->params_buffer = ggml_backend_alloc_ctx_tensors(new_clip->ctx_data, new_clip->backend);
+ for (int i = 0; i < n_tensors; ++i) {
+ const char * name = gguf_get_tensor_name(ctx, i);
+ struct ggml_tensor * cur = ggml_get_tensor(new_clip->ctx_data, name);
+ const size_t offset = gguf_get_data_offset(ctx) + gguf_get_tensor_offset(ctx, i);
+ fin.seekg(offset, std::ios::beg);
+ if (!fin) {
+ LOG_TEE("%s: failed to seek for tensor %s\n", __func__, name);
+ clip_free(new_clip);
+ gguf_free(ctx);
+ return nullptr;
+ }
+ int num_bytes = ggml_nbytes(cur);
+ if (ggml_backend_buffer_is_host(new_clip->params_buffer)) {
+ // for the CPU and Metal backend, we can read directly into the tensor
+ fin.read(reinterpret_cast<char *>(cur->data), num_bytes);
+ } else {
+ // read into a temporary buffer first, then copy to device memory
+ read_buf.resize(num_bytes);
+ fin.read(reinterpret_cast<char *>(read_buf.data()), num_bytes);
+ ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes);
+ }
+ }
+ fin.close();
+ }
+
+ // vision model
+ if (new_clip->has_vision_encoder) {
+ // load vision model
+ auto & vision_model = new_clip->vision_model;
+ auto & hparams = vision_model.hparams;
+ hparams.hidden_size = get_u32(ctx, format(KEY_N_EMBD, "vision"));
+ hparams.n_head = get_u32(ctx, format(KEY_N_HEAD, "vision"));
+ hparams.n_intermediate = get_u32(ctx, format(KEY_N_FF, "vision"));
+ hparams.n_layer = get_u32(ctx, format(KEY_N_BLOCK, "vision"));
+ hparams.image_size = get_u32(ctx, KEY_IMAGE_SIZE);
+ hparams.patch_size = get_u32(ctx, KEY_PATCH_SIZE);
+ hparams.projection_dim = get_u32(ctx, format(KEY_PROJ_DIM, "vision"));
+ hparams.eps = get_f32(ctx, format(KEY_LAYER_NORM_EPS, "vision"));
+
+ try {
+ int idx = get_key_idx(ctx, KEY_IMAGE_GRID_PINPOINTS);
+ int n = gguf_get_arr_n(ctx, idx);
+ const int32_t * pinpoints = (const int32_t *)gguf_get_arr_data(ctx, idx);
+ for (int i = 0; i < 32 && i < n && pinpoints[i] != 0; ++i) {
+ hparams.image_grid_pinpoints[i] = pinpoints[i];
+ }
+ if (n < 32)
+ hparams.image_grid_pinpoints[n] = 0;
+ } catch (std::runtime_error & e) {
+ hparams.image_grid_pinpoints[0]=0;
+ }
+
+ try {
+ int idx = get_key_idx(ctx, KEY_MM_PATCH_MERGE_TYPE);
+ strcpy(hparams.mm_patch_merge_type, gguf_get_val_str(ctx, idx));
+ } catch (std::runtime_error & e) {
+ strcpy(hparams.mm_patch_merge_type, "flat");
+ }
+
+ try {
+ hparams.image_crop_resolution = get_u32(ctx, KEY_IMAGE_CROP_RESOLUTION); // llava-1.6
+ } catch(const std::exception& e) {
+ hparams.image_crop_resolution = hparams.image_size;
+ }
+
+ int idx_mean = get_key_idx(ctx, KEY_IMAGE_MEAN);
+ int idx_std = get_key_idx(ctx, KEY_IMAGE_STD);
+
+ const float * mean_data = (const float *)gguf_get_arr_data(ctx, idx_mean);
+ const float * std_data = (const float *)gguf_get_arr_data(ctx, idx_std);
+
+ for (int i = 0; i < 3; ++i) {
+ new_clip->image_mean[i] = mean_data[i];
+ new_clip->image_std[i] = std_data[i];
+ }
+
+ if (verbosity >= 2) {
+ LOG_TEE("\n%s: vision model hparams\n", __func__);
+ LOG_TEE("image_size %d\n", hparams.image_size);
+ LOG_TEE("patch_size %d\n", hparams.patch_size);
+ LOG_TEE("v_hidden_size %d\n", hparams.hidden_size);
+ LOG_TEE("v_n_intermediate %d\n", hparams.n_intermediate);
+ LOG_TEE("v_projection_dim %d\n", hparams.projection_dim);
+ LOG_TEE("v_n_head %d\n", hparams.n_head);
+ LOG_TEE("v_n_layer %d\n", hparams.n_layer);
+ LOG_TEE("v_eps %f\n", hparams.eps);
+ LOG_TEE("v_image_mean %f %f %f\n", new_clip->image_mean[0], new_clip->image_mean[1], new_clip->image_mean[2]);
+ LOG_TEE("v_image_std %f %f %f\n", new_clip->image_std[0], new_clip->image_std[1], new_clip->image_std[2]);
+ LOG_TEE("v_image_grid_pinpoints: ");
+ for (int i = 0; i < 32 && (hparams.image_grid_pinpoints[i] != 0); ++i) {
+ LOG_TEE("%d ", hparams.image_grid_pinpoints[i]);
+ }
+ LOG_TEE("\n");
+ LOG_TEE("v_mm_patch_merge_type: %s\n", hparams.mm_patch_merge_type);
+
+ }
+
+ try {
+ vision_model.patch_embeddings = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD);
+ vision_model.class_embedding = get_tensor(new_clip->ctx_data, TN_CLASS_EMBD);
+ vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v"));
+ vision_model.pre_ln_w = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "weight"));
+ vision_model.pre_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "bias"));
+ } catch(const std::exception& e) {
+ LOG_TEE("%s: failed to load vision model tensors\n", __func__);
+ }
+
+ // LLaVA projection
+ if (new_clip->proj_type == PROJECTOR_TYPE_MLP || new_clip->proj_type == PROJECTOR_TYPE_MLP_NORM) {
+ vision_model.mm_0_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "weight"));
+ vision_model.mm_0_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "bias"));
+ try {
+ // Yi-type llava
+ vision_model.mm_1_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 1, "weight"));
+ vision_model.mm_1_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 1, "bias"));
+ } catch (std::runtime_error & e) { }
+ try {
+ // missing in Yi-type llava
+ vision_model.mm_2_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "weight"));
+ vision_model.mm_2_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "bias"));
+ } catch (std::runtime_error & e) { }
+ try {
+ // Yi-type llava
+ vision_model.mm_3_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 3, "weight"));
+ vision_model.mm_3_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 3, "bias"));
+ } catch (std::runtime_error & e) { }
+ try {
+ // Yi-type llava
+ vision_model.mm_4_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 4, "weight"));
+ vision_model.mm_4_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 4, "bias"));
+ } catch (std::runtime_error & e) { }
+ try {
+ vision_model.image_newline = get_tensor(new_clip->ctx_data, TN_IMAGE_NEWLINE);
1187
+ // LOG_TEE("%s: image_newline tensor (llava-1.6) found\n", __func__);
1188
+ } catch (std::runtime_error & e) { }
1189
+ } else if (new_clip->proj_type == PROJECTOR_TYPE_LDP) {
1190
+ // MobileVLM projection
1191
+ vision_model.mm_model_mlp_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 1, "weight"));
1192
+ vision_model.mm_model_mlp_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 1, "bias"));
1193
+ vision_model.mm_model_mlp_3_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 3, "weight"));
1194
+ vision_model.mm_model_mlp_3_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 3, "bias"));
1195
+ vision_model.mm_model_block_1_block_0_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 0, "0.weight"));
1196
+ vision_model.mm_model_block_1_block_0_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 0, "1.weight"));
1197
+ vision_model.mm_model_block_1_block_0_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 0, "1.bias"));
1198
+ vision_model.mm_model_block_1_block_1_fc1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc1.weight"));
1199
+ vision_model.mm_model_block_1_block_1_fc1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc1.bias"));
1200
+ vision_model.mm_model_block_1_block_1_fc2_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc2.weight"));
1201
+ vision_model.mm_model_block_1_block_1_fc2_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc2.bias"));
1202
+ vision_model.mm_model_block_1_block_2_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 2, "0.weight"));
1203
+ vision_model.mm_model_block_1_block_2_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 2, "1.weight"));
1204
+ vision_model.mm_model_block_1_block_2_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 2, "1.bias"));
1205
+ vision_model.mm_model_block_2_block_0_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 0, "0.weight"));
1206
+ vision_model.mm_model_block_2_block_0_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 0, "1.weight"));
1207
+ vision_model.mm_model_block_2_block_0_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 0, "1.bias"));
1208
+ vision_model.mm_model_block_2_block_1_fc1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc1.weight"));
1209
+ vision_model.mm_model_block_2_block_1_fc1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc1.bias"));
1210
+ vision_model.mm_model_block_2_block_1_fc2_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc2.weight"));
1211
+ vision_model.mm_model_block_2_block_1_fc2_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc2.bias"));
1212
+ vision_model.mm_model_block_2_block_2_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "0.weight"));
1213
+ vision_model.mm_model_block_2_block_2_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.weight"));
1214
+ vision_model.mm_model_block_2_block_2_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.bias"));
1215
+ }
1216
+ else if (new_clip->proj_type == PROJECTOR_TYPE_LDPV2)
1217
+ {
1218
+ // MobileVLM_V2 projection
1219
+ vision_model.mm_model_mlp_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 0, "weight"));
1220
+ vision_model.mm_model_mlp_0_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 0, "bias"));
1221
+ vision_model.mm_model_mlp_2_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 2, "weight"));
1222
+ vision_model.mm_model_mlp_2_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 2, "bias"));
1223
+ vision_model.mm_model_peg_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_PEG, 0, "weight"));
1224
+ vision_model.mm_model_peg_0_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_PEG, 0, "bias"));
1225
+ }
1226
+ else {
1227
+ std::string proj_type = PROJECTOR_TYPE_NAMES[new_clip->proj_type];
1228
+ throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str()));
1229
+ }
1230
+
1231
+ vision_model.layers.resize(hparams.n_layer);
1232
+
1233
+ for (int il = 0; il < hparams.n_layer; ++il) {
1234
+ auto & layer = vision_model.layers[il];
1235
+ layer.k_w = get_tensor(new_clip->ctx_data, format(TN_ATTN_K, "v", il, "weight"));
1236
+ layer.q_w = get_tensor(new_clip->ctx_data, format(TN_ATTN_Q, "v", il, "weight"));
1237
+ layer.v_w = get_tensor(new_clip->ctx_data, format(TN_ATTN_V, "v", il, "weight"));
1238
+ layer.o_w = get_tensor(new_clip->ctx_data, format(TN_ATTN_OUTPUT, "v", il, "weight"));
1239
+ layer.ln_1_w = get_tensor(new_clip->ctx_data, format(TN_LN_1, "v", il, "weight"));
1240
+ layer.ln_2_w = get_tensor(new_clip->ctx_data, format(TN_LN_2, "v", il, "weight"));
1241
+ layer.ff_i_w = get_tensor(new_clip->ctx_data, format(TN_FFN_DOWN, "v", il, "weight"));
1242
+ layer.ff_o_w = get_tensor(new_clip->ctx_data, format(TN_FFN_UP, "v", il, "weight"));
1243
+ layer.k_b = get_tensor(new_clip->ctx_data, format(TN_ATTN_K, "v", il, "bias"));
1244
+ layer.q_b = get_tensor(new_clip->ctx_data, format(TN_ATTN_Q, "v", il, "bias"));
1245
+ layer.v_b = get_tensor(new_clip->ctx_data, format(TN_ATTN_V, "v", il, "bias"));
1246
+ layer.o_b = get_tensor(new_clip->ctx_data, format(TN_ATTN_OUTPUT, "v", il, "bias"));
1247
+ layer.ln_1_b = get_tensor(new_clip->ctx_data, format(TN_LN_1, "v", il, "bias"));
1248
+ layer.ln_2_b = get_tensor(new_clip->ctx_data, format(TN_LN_2, "v", il, "bias"));
1249
+ layer.ff_i_b = get_tensor(new_clip->ctx_data, format(TN_FFN_DOWN, "v", il, "bias"));
1250
+ layer.ff_o_b = get_tensor(new_clip->ctx_data, format(TN_FFN_UP, "v", il, "bias"));
1251
+ }
1252
+ }
1253
+
1254
+ ggml_free(meta);
1255
+
1256
+ new_clip->ctx_gguf = ctx;
1257
+
1258
+ // measure mem requirement and allocate
1259
+ {
1260
+ new_clip->buf_compute_meta.resize(GGML_DEFAULT_GRAPH_SIZE * ggml_tensor_overhead() + ggml_graph_overhead());
1261
+ new_clip->compute_alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(new_clip->backend));
1262
+ clip_image_f32_batch batch;
1263
+ batch.size = 1;
1264
+ ggml_cgraph * gf = clip_image_build_graph(new_clip, &batch);
1265
+ ggml_gallocr_reserve(new_clip->compute_alloc, gf);
1266
+ size_t compute_memory_buffer_size = ggml_gallocr_get_buffer_size(new_clip->compute_alloc, 0);
1267
+ LOG_TEE("%s: compute allocated memory: %.2f MB\n", __func__, compute_memory_buffer_size /1024.0/1024.0);
1268
+ }
1269
+
1270
+ return new_clip;
1271
+ }
1272
+
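A minimal usage sketch for the loader above, assuming the clip.h declarations shipped elsewhere in this package; the model path is hypothetical:

    // load a LLaVA mmproj GGUF, use it, then release everything it owns
    struct clip_ctx * ctx_clip = clip_model_load("mmproj-model-f16.gguf", /*verbosity=*/1);
    if (ctx_clip == nullptr) {
        fprintf(stderr, "failed to load CLIP model\n");
        return 1;
    }
    // ... preprocess and encode images here ...
    clip_free(ctx_clip);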
1273
+ struct clip_image_u8 * clip_image_u8_init() {
1274
+ return new clip_image_u8();
1275
+ }
1276
+
1277
+ struct clip_image_f32 * clip_image_f32_init() {
1278
+ return new clip_image_f32();
1279
+ }
1280
+
1281
+ void clip_image_u8_free(struct clip_image_u8 * img) { delete img; }
1282
+ void clip_image_f32_free(struct clip_image_f32 * img) { delete img; }
1283
+ void clip_image_u8_batch_free(struct clip_image_u8_batch * batch) {
1284
+ if (batch->size > 0) {
1285
+ delete[] batch->data;
1286
+ batch->size = 0;
1287
+ }
1288
+ }
1289
+ void clip_image_f32_batch_free(struct clip_image_f32_batch * batch) {
1290
+ if (batch->size > 0) {
1291
+ delete[] batch->data;
1292
+ batch->size = 0;
1293
+ }
1294
+ }
1295
+
1296
+ static void build_clip_img_from_data(const stbi_uc * data, int nx, int ny, clip_image_u8 * img) {
1297
+ img->nx = nx;
1298
+ img->ny = ny;
1299
+ img->buf.resize(3 * nx * ny);
1300
+ memcpy(img->buf.data(), data, img->buf.size());
1301
+ }
1302
+
1303
+ bool clip_image_load_from_file(const char * fname, clip_image_u8 * img) {
1304
+ int nx, ny, nc;
1305
+ auto * data = stbi_load(fname, &nx, &ny, &nc, 3);
1306
+ if (!data) {
1307
+ LOG_TEE("%s: failed to load image '%s'\n", __func__, fname);
1308
+ return false;
1309
+ }
1310
+ build_clip_img_from_data(data, nx, ny, img);
1311
+ stbi_image_free(data);
1312
+ return true;
1313
+ }
1314
+
1315
+ bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img) {
1316
+ int nx, ny, nc;
1317
+ auto * data = stbi_load_from_memory(bytes, (int)bytes_length, &nx, &ny, &nc, 3);
1318
+ if (!data) {
1319
+ LOG_TEE("%s: failed to decode image bytes\n", __func__);
1320
+ return false;
1321
+ }
1322
+ build_clip_img_from_data(data, nx, ny, img);
1323
+ stbi_image_free(data);
1324
+ return true;
1325
+ }
1326
+
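A short sketch of the image-loading helpers above; the file name is hypothetical:

    clip_image_u8 * img = clip_image_u8_init();
    if (!clip_image_load_from_file("example.jpg", img)) {
        clip_image_u8_free(img);
        return false;
    }
    // img->buf now holds nx * ny RGB triplets, forced to 3 channels by stb_image
    clip_image_u8_free(img);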
1327
+ // Linear interpolation between two points
1328
+ inline float lerp(float s, float e, float t) {
1329
+ return s + (e - s) * t;
1330
+ }
1331
+ // Bilinear resize function
1332
+ static void bilinear_resize(const clip_image_u8& src, clip_image_u8& dst, int target_width, int target_height) {
1333
+ dst.nx = target_width;
1334
+ dst.ny = target_height;
1335
+ dst.buf.resize(3 * target_width * target_height);
1336
+
1337
+ float x_ratio = static_cast<float>(src.nx - 1) / target_width;
1338
+ float y_ratio = static_cast<float>(src.ny - 1) / target_height;
1339
+
1340
+ for (int y = 0; y < target_height; y++) {
1341
+ for (int x = 0; x < target_width; x++) {
1342
+ float px = x_ratio * x;
1343
+ float py = y_ratio * y;
1344
+ int x_floor = static_cast<int>(px);
1345
+ int y_floor = static_cast<int>(py);
1346
+ float x_lerp = px - x_floor;
1347
+ float y_lerp = py - y_floor;
1348
+
1349
+ for (int c = 0; c < 3; c++) {
1350
+ float top = lerp(
1351
+ static_cast<float>(src.buf[3 * (y_floor * src.nx + x_floor) + c]),
1352
+ static_cast<float>(src.buf[3 * (y_floor * src.nx + (x_floor + 1)) + c]),
1353
+ x_lerp
1354
+ );
1355
+ float bottom = lerp(
1356
+ static_cast<float>(src.buf[3 * ((y_floor + 1) * src.nx + x_floor) + c]),
1357
+ static_cast<float>(src.buf[3 * ((y_floor + 1) * src.nx + (x_floor + 1)) + c]),
1358
+ x_lerp
1359
+ );
1360
+ dst.buf[3 * (y * target_width + x) + c] = static_cast<uint8_t>(lerp(top, bottom, y_lerp));
1361
+ }
1362
+ }
1363
+ }
1364
+ }
1365
+
1366
+ // Normalize image to float32 - careful with PyTorch's .to(model.device, dtype=torch.float16): the round trip sometimes reduces precision (32->16->32), sometimes not
1367
+ static void normalize_image_u8_to_f32(const clip_image_u8* src, clip_image_f32* dst, const float mean[3], const float std[3]) {
1368
+ dst->nx = src->nx;
1369
+ dst->ny = src->ny;
1370
+ dst->buf.resize(src->buf.size());
1371
+
1372
+ for (size_t i = 0; i < src->buf.size(); ++i) {
1373
+ int c = i % 3; // rgb
1374
+ dst->buf[i] = (static_cast<float>(src->buf[i]) / 255.0f - mean[c]) / std[c];
1375
+ }
1376
+ }
1377
+
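For example, with the CLIP defaults referenced further down (mean 0.48145466, std 0.26862954 for the red channel), a fully saturated red byte maps to (255/255 - 0.48145466) / 0.26862954 ≈ 1.93, and a zero byte to (0 - 0.48145466) / 0.26862954 ≈ -1.79.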
1378
+ inline float clip(float x, float lower, float upper) {
1379
+ return std::max(lower, std::min(x, upper));
1380
+ }
1381
+
1382
+ static bool bicubic_resize(const clip_image_u8 &img, clip_image_u8 &dst, int target_width, int target_height) {
1383
+ const int nx = img.nx;
1384
+ const int ny = img.ny;
1385
+
1386
+ dst.nx = target_width;
1387
+ dst.ny = target_height;
1388
+ dst.buf.resize(3 * target_width * target_height);
1389
+
1390
+ float Cc;
1391
+ float C[5];
1392
+ float d0, d2, d3, a0, a1, a2, a3;
1393
+ int i, j, k, jj;
1394
+ int x, y;
1395
+ float dx, dy;
1396
+ float tx, ty;
1397
+
1398
+ tx = (float)nx / (float)target_width;
1399
+ ty = (float)ny / (float)target_height;
1400
+
1401
+ // Bicubic interpolation; adapted from ViT.cpp, inspired by:
1402
+ // -> https://github.com/yglukhov/bicubic-interpolation-image-processing/blob/master/libimage.c#L36
1403
+ // -> https://en.wikipedia.org/wiki/Bicubic_interpolation
1404
+
1405
+ for (i = 0; i < target_height; i++) {
1406
+ for (j = 0; j < target_width; j++) {
1407
+ x = (int)(tx * j);
1408
+ y = (int)(ty * i);
1409
+
1410
+ dx = tx * j - x;
1411
+ dy = ty * i - y;
1412
+
1413
+ for (k = 0; k < 3; k++) {
1414
+ for (jj = 0; jj <= 3; jj++) {
1415
+ d0 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x - 1, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k];
1416
+ d2 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x + 1, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k];
1417
+ d3 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x + 2, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k];
1418
+ a0 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k];
1419
+
1420
+ a1 = -1.0 / 3 * d0 + d2 - 1.0 / 6 * d3;
1421
+ a2 = 1.0 / 2 * d0 + 1.0 / 2 * d2;
1422
+ a3 = -1.0 / 6 * d0 - 1.0 / 2 * d2 + 1.0 / 6 * d3;
1423
+
1424
+ C[jj] = a0 + a1 * dx + a2 * dx * dx + a3 * dx * dx * dx;
1425
+
1426
+ d0 = C[0] - C[1];
1427
+ d2 = C[2] - C[1];
1428
+ d3 = C[3] - C[1];
1429
+ a0 = C[1];
1430
+ a1 = -1.0 / 3 * d0 + d2 - 1.0 / 6 * d3;
1431
+ a2 = 1.0 / 2 * d0 + 1.0 / 2 * d2;
1432
+ a3 = -1.0 / 6 * d0 - 1.0 / 2 * d2 + 1.0 / 6 * d3;
1433
+ Cc = a0 + a1 * dy + a2 * dy * dy + a3 * dy * dy * dy;
1434
+
1435
+ const uint8_t Cc2 = std::min(std::max(std::round(Cc), 0.0f), 255.0f);
1436
+ dst.buf[(i * target_width + j) * 3 + k] = Cc2;
1437
+ }
1438
+ }
1439
+ }
1440
+ }
1441
+
1442
+ return true;
1443
+ }
1444
+
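The coefficients above implement a cubic fit through four neighboring samples: with d0 = p(-1) - p(0), d2 = p(1) - p(0), d3 = p(2) - p(0), the interpolated value at fractional offset t in [0, 1) is C(t) = p(0) + a1·t + a2·t² + a3·t³. The same formula is applied first along x for four neighboring rows (filling C[0..3]) and then along y across those four row results.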
1445
+ // llava-1.6 type of resize_and_pad (black)
1446
+ static void resize_and_pad_image(const clip_image_u8& image, clip_image_u8 &image_output, const std::pair<int, int>& target_resolution) {
1447
+ int target_width = target_resolution.first;
1448
+ int target_height = target_resolution.second;
1449
+
1450
+ float scale_w = static_cast<float>(target_width) / image.nx;
1451
+ float scale_h = static_cast<float>(target_height) / image.ny;
1452
+
1453
+ int new_width, new_height;
1454
+
1455
+ if (scale_w < scale_h) {
1456
+ new_width = target_width;
1457
+ new_height = std::min(static_cast<int>(std::ceil(image.ny * scale_w)), target_height);
1458
+ } else {
1459
+ new_height = target_height;
1460
+ new_width = std::min(static_cast<int>(std::ceil(image.nx * scale_h)), target_width);
1461
+ }
1462
+
1463
+ clip_image_u8 resized_image;
1464
+ // bilinear_resize(image, resized_image, new_width, new_height);
1465
+ bicubic_resize(image, resized_image, new_width, new_height);
1466
+
1467
+ clip_image_u8 padded_image;
1468
+ padded_image.nx = target_width;
1469
+ padded_image.ny = target_height;
1470
+ padded_image.buf.resize(3 * target_width * target_height, 0); // Initialize with black
1471
+
1472
+ // Calculate padding offsets
1473
+ int pad_x = (target_width - new_width) / 2;
1474
+ int pad_y = (target_height - new_height) / 2;
1475
+
1476
+ // Copy the resized image into the center of the padded buffer
1477
+ for (int y = 0; y < new_height; ++y) {
1478
+ for (int x = 0; x < new_width; ++x) {
1479
+ for (int c = 0; c < 3; ++c) {
1480
+ padded_image.buf[3 * ((y + pad_y) * target_width + (x + pad_x)) + c] = resized_image.buf[3 * (y * new_width + x) + c];
1481
+ }
1482
+ }
1483
+ }
1484
+ image_output = std::move(padded_image);
1485
+ }
1486
+
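For instance, fitting an 800x600 image into a 672x672 target gives scale_w = 0.84 and scale_h = 1.12; the smaller scale wins, so the image is resized to 672x504 and centered with pad_y = (672 - 504) / 2 = 84 black rows above and below.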
1487
+ /**
1488
+ * Selects the best resolution from a list of possible resolutions based on the original size.
1489
+ *
1490
+ * @param original_size The original size of the image in the format (width, height).
1491
+ * @param possible_resolutions A list of possible resolutions in the format [(width1, height1), (width2, height2), ...].
1492
+ * @return The best fit resolution in the format (width, height).
1493
+ */
1494
+ static std::pair<int, int> select_best_resolution(const std::pair<int, int> & original_size, const std::vector<std::pair<int, int>> & possible_resolutions) {
1495
+ int original_width = original_size.first;
1496
+ int original_height = original_size.second;
1497
+ std::pair<int, int> best_fit;
1498
+ int max_effective_resolution = 0;
1499
+ int min_wasted_resolution = std::numeric_limits<int>::max();
1500
+
1501
+ for (const auto& resolution : possible_resolutions) {
1502
+ int width = resolution.first;
1503
+ int height = resolution.second;
1504
+ float scale = std::min(static_cast<float>(width) / original_width, static_cast<float>(height) / original_height);
1505
+ int downscaled_width = static_cast<int>(original_width * scale);
1506
+ int downscaled_height = static_cast<int>(original_height * scale);
1507
+ int effective_resolution = std::min(downscaled_width * downscaled_height, original_width * original_height);
1508
+ int wasted_resolution = (width * height) - effective_resolution;
1509
+ // LOG_TEE("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution);
1510
+ if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_resolution < min_wasted_resolution)) {
1511
+ max_effective_resolution = effective_resolution;
1512
+ min_wasted_resolution = wasted_resolution;
1513
+ best_fit = resolution;
1514
+ }
1515
+ }
1516
+
1517
+ return best_fit;
1518
+ }
1519
+
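As a worked example with a hypothetical 800x600 input and candidate resolutions {336x672, 672x336, 672x672}: 672x672 scales the image by min(672/800, 672/600) = 0.84 to 672x504, for an effective resolution of 338688 pixels with 112896 wasted; the other two candidates cap the scale at 0.42 and 0.56 and reach at most 150528 effective pixels, so 672x672 is selected.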
1520
+ static std::vector<clip_image_u8*> divide_to_patches_u8(const clip_image_u8 & image, int patch_size) {
1521
+ std::vector<clip_image_u8*> patches;
1522
+ int width = image.nx;
1523
+ int height = image.ny;
1524
+ for (int i = 0; i < height; i += patch_size) {
1525
+ for (int j = 0; j < width; j += patch_size) {
1526
+ clip_image_u8 *patch = clip_image_u8_init();
1527
+ patch->nx = std::min(patch_size, width - j);
1528
+ patch->ny = std::min(patch_size, height - i);
1529
+ patch->buf.resize(3 * patch->nx * patch->ny);
1530
+ for (int y = 0; y < patch->ny; ++y) {
1531
+ for (int x = 0; x < patch->nx; ++x) {
1532
+ for (int c = 0; c < 3; ++c) {
1533
+ patch->buf[3 * (y * patch->nx + x) + c] = image.buf[3 * ((i + y) * width + (j + x)) + c];
1534
+ }
1535
+ }
1536
+ }
1537
+ patches.push_back(patch);
1538
+ }
1539
+ }
1540
+ return patches;
1541
+ }
1542
+
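With the llava-1.6 defaults, a 672x672 padded image and patch_size = image_size = 336 yield a 2x2 grid of four patches here; clip_image_preprocess below prepends one bicubic overview resize, for five f32 images per input.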
1543
+ // returns the normalized float tensor for llava-1.5; for spatial_unpad with anyres processing (llava-1.6) it returns the normalized image patch tensors as a vector
1544
+ // res_imgs memory is allocated here; previous allocations are freed if found
1545
+ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch * res_imgs) {
1546
+ bool pad_to_square = true;
1547
+ if (!ctx->has_vision_encoder) {
1548
+ LOG_TEE("This gguf file seems to have no vision encoder\n");
1549
+ return false;
1550
+ }
1551
+ auto & params = ctx->vision_model.hparams;
1552
+ // The model config actually contains all we need to decide on how to preprocess; here we automatically switch to the new llava-1.6 preprocessing
1553
+ if (strcmp(params.mm_patch_merge_type, "spatial_unpad") == 0) {
1554
+ pad_to_square = false;
1555
+ }
1556
+ // free the previous res_imgs, if any were set
1557
+ if (res_imgs->size > 0) {
1558
+ clip_image_f32_batch_free(res_imgs);
1559
+ }
1560
+ res_imgs->data = nullptr;
1561
+ res_imgs->size = 0;
1562
+
1563
+ // the logic below is to pad the shorter side to the longer side with a background color: rgb(122, 116, 104)
1564
+ // see https://github.com/haotian-liu/LLaVA/blob/e854a2bf85118c504f6f16bf5c3c7c92f8fa8c6b/llava/conversation.py#L113-L156
1565
+
1566
+ clip_image_u8 * temp = clip_image_u8_init(); // we will keep the input image data here temporarily
1567
+ if (pad_to_square && img->nx != img->ny) {
1568
+ int longer_side = std::max(img->nx, img->ny);
1569
+ temp->nx = longer_side;
1570
+ temp->ny = longer_side;
1571
+ temp->buf.resize(3 * longer_side * longer_side);
1572
+ const uint8_t bc[3] = {122, 116, 104}; // background color in RGB from LLaVA (this is the mean rgb color * 255)
1573
+
1574
+ // fill with background color
1575
+ for (size_t i = 0; i < temp->buf.size(); i++) {
1576
+ temp->buf[i] = bc[i % 3];
1577
+ }
1578
+
1579
+ // copy from the input image
1580
+ for (int y = 0; y < img->ny; y++) {
1581
+ for (int x = 0; x < img->nx; x++) {
1582
+ const int i = 3 * (y * img->nx + x);
1583
+ const int j = 3 * (y * temp->nx + x);
1584
+ temp->buf[j] = img->buf[i];
1585
+ temp->buf[j+1] = img->buf[i+1];
1586
+ temp->buf[j+2] = img->buf[i+2];
1587
+ }
1588
+ }
1589
+ } else {
1590
+ if (params.image_grid_pinpoints[0] != 0) {
1591
+ // "spatial_unpad" with "anyres" processing for llava-1.6
1592
+ std::vector<std::pair<int, int>> possible_resolutions;
1593
+ for (int i = 0; i < 32 && params.image_grid_pinpoints[i] != 0; i+=2) {
1594
+ possible_resolutions.push_back({params.image_grid_pinpoints[i], params.image_grid_pinpoints[i+1]});
1595
+ }
1596
+ std::pair<int, int> best_resolution = select_best_resolution({img->nx, img->ny}, possible_resolutions);
1597
+ // clip_image_save_to_bmp(*img, "input.bmp");
1598
+ resize_and_pad_image(*img, *temp, best_resolution); // we do not pad with mean-bg color anymore in llava-1.6
1599
+ // clip_image_save_to_bmp(*temp, "resized.bmp");
1600
+ // visually verify normalized image:
1601
+ // normalize_image_u8_to_f32(*temp, *res, ctx->image_mean, ctx->image_std);
1602
+ // {
1603
+ // clip_image_u8 * temp2 = clip_image_u8_init();
1604
+ // clip_image_convert_f32_to_u8(*res, *temp2);
1605
+ // clip_image_save_to_bmp(*temp2, "resized_normalized_f32.bmp");
1606
+ // clip_image_u8_free(temp2);
1607
+ // }
1608
+
1609
+ std::vector<clip_image_u8 *> patches = divide_to_patches_u8(*temp, params.image_size); // prepare spatial sorted main patches of image_size each (336 in llava-1.6)
1610
+
1611
+ clip_image_u8 *image_original_resize = clip_image_u8_init();
1612
+ // bilinear_resize(*img, *image_original_resize, params.image_size, params.image_size); // in python this is "shortest_edge", but all CLIP models are square
1613
+ bicubic_resize(*img, *image_original_resize, params.image_size, params.image_size); // in python this is "shortest_edge", but all CLIP models are square
1614
+ patches.insert(patches.begin(), image_original_resize);
1615
+ // clip_image_f32_batch_init(patches.size());
1616
+ res_imgs->size = patches.size();
1617
+ res_imgs->data = new clip_image_f32[res_imgs->size];
1618
+ int num = 0;
1619
+ for (auto& patch : patches) {
1620
+ normalize_image_u8_to_f32(patch, &res_imgs->data[num], ctx->image_mean, ctx->image_std);
1621
+ num++;
1622
+ }
1623
+
1624
+ for (size_t i = 0; i < patches.size(); i++) {
1625
+ // LOG_TEE("patch %d: %d %d\n", i, patches[i]->nx, patches[i]->ny);
1626
+ clip_image_u8_free(patches[i]);
1627
+ }
1628
+
1629
+ clip_image_u8_free(temp);
1630
+
1631
+ return true;
1632
+ } else {
1633
+ temp->nx = img->nx;
1634
+ temp->ny = img->ny;
1635
+ temp->buf.resize(img->buf.size());
1636
+ memcpy(temp->buf.data(), img->buf.data(), temp->buf.size());
1637
+ }
1638
+ }
1639
+
1640
+ const int nx = temp->nx;
1641
+ const int ny = temp->ny;
1642
+ // clip_image_save_to_bmp(*temp, "resized_vanilla.bmp");
1643
+
1644
+ const int nx2 = ctx->vision_model.hparams.image_size;
1645
+ const int ny2 = ctx->vision_model.hparams.image_size;
1646
+ clip_image_f32 * res = clip_image_f32_init();
1647
+ res->nx = nx2;
1648
+ res->ny = ny2;
1649
+ res->buf.resize(3 * nx2 * ny2);
1650
+
1651
+ const float scale = std::max(nx, ny) / (float)ctx->vision_model.hparams.image_size;
1652
+
1653
+ const int nx3 = int(nx / scale + 0.5f);
1654
+ const int ny3 = int(ny / scale + 0.5f);
1655
+
1656
+ const auto & m3 = ctx->image_mean; // {0.48145466f, 0.4578275f, 0.40821073f};
1657
+ const auto & s3 = ctx->image_std; // {0.26862954f, 0.26130258f, 0.27577711f};
1658
+
1659
+ for (int y = 0; y < ny3; y++) {
1660
+ for (int x = 0; x < nx3; x++) {
1661
+ for (int c = 0; c < 3; c++) {
1662
+ // linear interpolation
1663
+ const float sx = (x + 0.5f) * scale - 0.5f;
1664
+ const float sy = (y + 0.5f) * scale - 0.5f;
1665
+
1666
+ const int x0 = std::max(0, (int)std::floor(sx));
1667
+ const int y0 = std::max(0, (int)std::floor(sy));
1668
+
1669
+ const int x1 = std::min(x0 + 1, nx - 1);
1670
+ const int y1 = std::min(y0 + 1, ny - 1);
1671
+
1672
+ const float dx = sx - x0;
1673
+ const float dy = sy - y0;
1674
+
1675
+ const int j00 = 3 * (y0 * nx + x0) + c;
1676
+ const int j01 = 3 * (y0 * nx + x1) + c;
1677
+ const int j10 = 3 * (y1 * nx + x0) + c;
1678
+ const int j11 = 3 * (y1 * nx + x1) + c;
1679
+
1680
+ const float v00 = temp->buf[j00];
1681
+ const float v01 = temp->buf[j01];
1682
+ const float v10 = temp->buf[j10];
1683
+ const float v11 = temp->buf[j11];
1684
+
1685
+ const float v0 = v00 * (1.0f - dx) + v01 * dx;
1686
+ const float v1 = v10 * (1.0f - dx) + v11 * dx;
1687
+
1688
+ const float v = v0 * (1.0f - dy) + v1 * dy;
1689
+
1690
+ const uint8_t v2 = std::min(std::max(std::round(v), 0.0f), 255.0f);
1691
+
1692
+ const int i = 3 * (y * nx3 + x) + c;
1693
+
1694
+ res->buf[i] = ((float(v2) / 255.0f) - m3[c]) / s3[c];
1695
+ }
1696
+ }
1697
+ }
1698
+ clip_image_u8_free(temp);
1699
+
1700
+ // {
1701
+ // clip_image_u8 * temp2 = clip_image_u8_init();
1702
+ // clip_image_convert_f32_to_u8(*res, *temp2);
1703
+ // clip_image_save_to_bmp(*temp2, "resized_normalized_f32_vanilla.bmp");
1704
+ // clip_image_u8_free(temp2);
1705
+ // }
1706
+ // res_imgs.push_back(res);
1707
+
1708
+ res_imgs->size = 1;
1709
+ res_imgs->data = new clip_image_f32[res_imgs->size];
1710
+ res_imgs->data[0] = *res;
1711
+ clip_image_f32_free(res);
1712
+
1713
+ return true;
1714
+ }
1715
+
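A usage sketch for the preprocessing entry point, assuming the clip_image_f32_batch definition from clip.h in this package; note that the batch must start zeroed, because the function frees res_imgs->data whenever res_imgs->size > 0:

    clip_image_f32_batch batch = {}; // size = 0, data = nullptr
    if (!clip_image_preprocess(ctx_clip, img, &batch)) {
        return false;
    }
    // batch.size is 1 for llava-1.5, or 1 + number of grid patches for llava-1.6
    clip_image_f32_batch_free(&batch);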
1716
+ ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx) {
1717
+ return ctx->vision_model.image_newline;
1718
+ }
1719
+
1720
+ void clip_free(clip_ctx * ctx) {
1721
+ ggml_free(ctx->ctx_data);
1722
+ gguf_free(ctx->ctx_gguf);
1723
+
1724
+ ggml_backend_buffer_free(ctx->params_buffer);
1725
+ ggml_backend_free(ctx->backend);
1726
+ ggml_gallocr_free(ctx->compute_alloc);
1727
+ delete ctx;
1728
+ }
1729
+
1730
+ size_t clip_embd_nbytes(const struct clip_ctx * ctx) {
1731
+ return clip_n_patches(ctx) * clip_n_mmproj_embd(ctx) * sizeof(float);
1732
+ }
1733
+
1734
+ int32_t clip_image_size(const struct clip_ctx * ctx) {
1735
+ return ctx->vision_model.hparams.image_size;
1736
+ }
1737
+
1738
+ int32_t clip_patch_size(const struct clip_ctx * ctx) {
1739
+ return ctx->vision_model.hparams.patch_size;
1740
+ }
1741
+
1742
+ int32_t clip_hidden_size(const struct clip_ctx * ctx) {
1743
+ return ctx->vision_model.hparams.hidden_size;
1744
+ }
1745
+
1746
+ const char * clip_patch_merge_type(const struct clip_ctx * ctx) {
1747
+ return ctx->vision_model.hparams.mm_patch_merge_type;
1748
+ }
1749
+
1750
+ const int32_t * clip_image_grid(const struct clip_ctx * ctx) {
1751
+ return ctx->vision_model.hparams.image_grid_pinpoints;
1752
+ }
1753
+
1754
+ int clip_n_patches(const struct clip_ctx * ctx) {
1755
+ const auto & params = ctx->vision_model.hparams;
1756
+
1757
+ int n_patches = (params.image_size / params.patch_size) * (params.image_size / params.patch_size);
1758
+
1759
+ if (ctx->proj_type == PROJECTOR_TYPE_LDP || ctx->proj_type == PROJECTOR_TYPE_LDPV2) {
1760
+ n_patches /= 4;
1761
+ }
1762
+
1763
+ return n_patches;
1764
+ }
1765
+
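With the common ViT-L/14-336 setup (image_size 336, patch_size 14) this gives (336 / 14)² = 576 patches, reduced to 576 / 4 = 144 for the LDP and LDPV2 projectors.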
1766
+ bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec) {
1767
+ if (!ctx->has_vision_encoder) {
1768
+ LOG_TEE("This gguf file seems to have no vision encoder\n");
1769
+ return false;
1770
+ }
1771
+
1772
+ clip_image_f32_batch imgs{};
1773
+ imgs.size = 1;
1774
+ imgs.data = img;
1775
+ return clip_image_batch_encode(ctx, n_threads, &imgs, vec);
1776
+ }
1777
+
1778
+ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs, float * vec) {
1779
+ if (!ctx->has_vision_encoder) {
1780
+ LOG_TEE("This gguf file seems to have no vision encoder\n");
1781
+ return false;
1782
+ }
1783
+
1784
+ int batch_size = imgs->size;
1785
+ if (ctx->has_llava_projector) {
1786
+ GGML_ASSERT(batch_size == 1); // TODO: support multiple images
1787
+ }
1788
+
1789
+ // build the inference graph
1790
+ ggml_cgraph * gf = clip_image_build_graph(ctx, imgs);
1791
+ ggml_gallocr_alloc_graph(ctx->compute_alloc, gf);
1792
+
1793
+ // set inputs
1794
+ const auto & model = ctx->vision_model;
1795
+ const auto & hparams = model.hparams;
1796
+
1797
+ const int image_size = hparams.image_size;
1798
+ const int patch_size = hparams.patch_size;
1799
+ const int num_patches = ((image_size / patch_size) * (image_size / patch_size));
1800
+ const int num_positions = num_patches + 1;
1801
+
1802
+ {
1803
+ struct ggml_tensor * inp_raw = ggml_graph_get_tensor(gf, "inp_raw");
1804
+ float * data = (float *)malloc(ggml_nbytes(inp_raw));
1805
+
1806
+ for (size_t i = 0; i < imgs->size; i++) {
1807
+ const int nx = imgs->data[i].nx;
1808
+ const int ny = imgs->data[i].ny;
1809
+ GGML_ASSERT(nx == image_size && ny == image_size);
1810
+
1811
+ const int n = nx * ny;
1812
+
1813
+ for (int b = 0; b < batch_size; b++) {
1814
+ for (int k = 0; k < 3; k++) {
1815
+ for (int y = 0; y < ny; y++) {
1816
+ for (int x = 0; x < nx; x++) {
1817
+ data[(b * 3 * n) + k * n + y * nx + x] = imgs->data[b].buf[3 * (y * nx + x) + k];
1818
+ }
1819
+ }
1820
+ }
1821
+ }
1822
+ }
1823
+ ggml_backend_tensor_set(inp_raw, data, 0, ggml_nbytes(inp_raw));
1824
+ free(data);
1825
+ }
1826
+
1827
+ {
1828
+ struct ggml_tensor * embeddings = ggml_graph_get_tensor(gf, "embeddings");
1829
+
1830
+ void* zero_mem = malloc(ggml_nbytes(embeddings));
1831
+ memset(zero_mem, 0, ggml_nbytes(embeddings));
1832
+ ggml_backend_tensor_set(embeddings, zero_mem, 0, ggml_nbytes(embeddings));
1833
+ free(zero_mem);
1834
+ }
1835
+
1836
+ {
1837
+ struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions");
1838
+
1839
+ int * positions_data = (int *)malloc(ggml_nbytes(positions));
1840
+ for (int i = 0; i < num_positions; i++) {
1841
+ positions_data[i] = i;
1842
+ }
1843
+ ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions));
1844
+ free(positions_data);
1845
+ }
1846
+
1847
+ {
1848
+ struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches");
1849
+ int * patches_data = (int *)malloc(ggml_nbytes(patches));
1850
+ for (int i = 0; i < num_patches; i++) {
1851
+ patches_data[i] = i + 1;
1852
+ }
1853
+ ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches));
1854
+ free(patches_data);
1855
+ }
1856
+
1857
+ if (ggml_backend_is_cpu(ctx->backend)) {
1858
+ ggml_backend_cpu_set_n_threads(ctx->backend, n_threads);
1859
+ }
1860
+
1861
+ #ifdef GGML_USE_METAL
1862
+ if (ggml_backend_is_metal(ctx->backend)) {
1863
+ ggml_backend_metal_set_n_cb(ctx->backend, n_threads);
1864
+ }
1865
+ #endif
1866
+
1867
+ ggml_backend_graph_compute(ctx->backend, gf);
1868
+
1869
+ // the last node is the embedding tensor
1870
+ struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 1];
1871
+
1872
+ // copy the embeddings to the location passed by the user
1873
+ ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings));
1874
+
1875
+ return true;
1876
+ }
1877
+
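A sketch of running the encoder on one preprocessed image from the batch in the preprocessing sketch above, sized with clip_embd_nbytes defined further below:

    std::vector<float> embd(clip_embd_nbytes(ctx_clip) / sizeof(float));
    if (!clip_image_encode(ctx_clip, /*n_threads=*/4, &batch.data[0], embd.data())) {
        return false;
    }
    // embd now holds clip_n_patches(ctx_clip) rows of clip_n_mmproj_embd(ctx_clip) floats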
1878
+ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const int itype) {
1879
+ ggml_type type = GGML_TYPE_Q4_1;
1880
+
1881
+ assert(itype < GGML_TYPE_COUNT);
1882
+ type = static_cast<ggml_type>(itype);
1883
+
1884
+ auto * ctx_clip = clip_model_load(fname_inp, 2);
1885
+
1886
+ const auto & ctx_src = ctx_clip->ctx_gguf;
1887
+ const auto & ctx_data = ctx_clip->ctx_data;
1888
+
1889
+ auto * ctx_out = gguf_init_empty();
1890
+ gguf_set_kv(ctx_out, ctx_src);
1891
+ gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION);
1892
+ gguf_set_val_u32(ctx_out, "general.file_type", itype);
1893
+
1894
+ auto fout = std::ofstream(fname_out, std::ios::binary);
1895
+
1896
+ const int n_tensors = gguf_get_n_tensors(ctx_src);
1897
+
1898
+ for (int i = 0; i < n_tensors; ++i) {
1899
+ const char * name = gguf_get_tensor_name(ctx_src, i);
1900
+ struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name);
1901
+ gguf_add_tensor(ctx_out, cur);
1902
+ }
1903
+
1904
+ const size_t meta_size = gguf_get_meta_size(ctx_out);
1905
+ for (size_t i = 0; i < meta_size; ++i) {
1906
+ fout.put(0);
1907
+ }
1908
+
1909
+ // regexes of tensor names to be quantized
1910
+ const std::vector<std::string> k_names = {
1911
+ ".*weight",
1912
+ };
1913
+
1914
+ std::vector<uint8_t> work(512);
1915
+ std::vector<float> conv_buf(512);
1916
+ size_t total_size_org = 0;
1917
+ size_t total_size_new = 0;
1918
+
1919
+ for (int i = 0; i < n_tensors; ++i) {
1920
+ const std::string name = gguf_get_tensor_name(ctx_src, i);
1921
+ struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name.c_str());
1922
+
1923
+ enum ggml_type new_type;
1924
+ void * new_data;
1925
+ size_t new_size;
1926
+
1927
+ bool quantize = false;
1928
+ for (const auto & s : k_names) {
1929
+ if (std::regex_match(name, std::regex(s))) {
1930
+ quantize = true;
1931
+ break;
1932
+ }
1933
+ }
1934
+
1935
+ // quantize only 2D tensors
1936
+ quantize &= (ggml_n_dims(cur) == 2);
1937
+
1938
+ if (quantize) {
1939
+ new_type = type;
1940
+ if (new_type >= GGML_TYPE_Q2_K && name.find("embd") != std::string::npos) {
1941
+ new_type = GGML_TYPE_Q8_0; // ggml_get_rows needs non K type
1942
+ // LOG_TEE("%s: quantizing %s to %s\n", __func__, name.c_str(), ggml_type_name(new_type));
1943
+ }
1944
+ const size_t n_elms = ggml_nelements(cur);
1945
+ float * f32_data;
1946
+
1947
+ switch (cur->type) {
1948
+ case GGML_TYPE_F32:
1949
+ f32_data = (float *)cur->data;
1950
+ break;
1951
+ case GGML_TYPE_F16:
1952
+ if (conv_buf.size() < n_elms) {
1953
+ conv_buf.resize(n_elms);
1954
+ }
1955
+ for (size_t j = 0; j < n_elms; ++j) {
1956
+ conv_buf[j] = ggml_fp16_to_fp32(((ggml_fp16_t *)cur->data)[j]);
1957
+ }
1958
+ f32_data = (float *)conv_buf.data();
1959
+ break;
1960
+ default:
1961
+ LOG_TEE("Please use an input file in f32 or f16\n");
1962
+ gguf_free(ctx_out);
1963
+ return false;
1964
+ }
1965
+
1966
+ if (work.size() < n_elms * 4) {
1967
+ work.resize(n_elms * 4);
1968
+ }
1969
+ new_data = work.data();
1970
+
1971
+ new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, n_elms/cur->ne[0], cur->ne[0], nullptr);
1972
+ } else {
1973
+ new_type = cur->type;
1974
+ new_data = cur->data;
1975
+ new_size = ggml_nbytes(cur);
1976
+ }
1977
+ const size_t orig_size = ggml_nbytes(cur);
1978
+ total_size_org += orig_size;
1979
+ total_size_new += new_size;
1980
+ gguf_set_tensor_type(ctx_out, name.c_str(), new_type);
1981
+ gguf_set_tensor_data(ctx_out, name.c_str(), new_data, new_size);
1982
+ fout.write((const char *)new_data, new_size);
1983
+ size_t pad = GGML_PAD(new_size, gguf_get_alignment(ctx_out)) - new_size;
1984
+ for (size_t j = 0; j < pad; ++j) {
1985
+ fout.put(0);
1986
+ }
1987
+
1988
+ LOG_TEE("%s: n_dims = %d | quantize=%d | size = %f MB -> %f MB\n", name.c_str(), ggml_n_dims(cur), quantize,
1989
+ orig_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
1990
+ }
1991
+
1992
+ // go back to beginning of file and write the updated metadata
1993
+ fout.seekp(0, std::ios::beg);
1994
+ std::vector<uint8_t> meta(meta_size);
1995
+ gguf_get_meta_data(ctx_out, meta.data());
1996
+ fout.write((const char *)meta.data(), meta_size);
1997
+
1998
+ fout.close();
1999
+
2000
+ clip_free(ctx_clip);
2001
+ gguf_free(ctx_out);
2002
+
2003
+ {
2004
+ LOG_TEE("%s: original size = %8.2f MB\n", __func__, total_size_org / 1024.0 / 1024.0);
2005
+ LOG_TEE("%s: quantized size = %8.2f MB\n", __func__, total_size_new / 1024.0 / 1024.0);
2006
+ }
2007
+
2008
+ return true;
2009
+ }
2010
+
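A sketch of the quantizer above; the file names are hypothetical, and itype is a ggml_type value such as GGML_TYPE_Q4_1:

    if (!clip_model_quantize("mmproj-model-f16.gguf", "mmproj-model-q4_1.gguf", GGML_TYPE_Q4_1)) {
        fprintf(stderr, "quantization failed\n");
    }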
2011
+ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
2012
+ if (ctx->proj_type == PROJECTOR_TYPE_LDP) {
2013
+ return ctx->vision_model.mm_model_block_1_block_2_1_b->ne[0];
2014
+ }
2015
+ if (ctx->proj_type == PROJECTOR_TYPE_LDPV2) {
2016
+ return ctx->vision_model.mm_model_peg_0_b->ne[0];
2017
+ }
2018
+ if (ctx->proj_type == PROJECTOR_TYPE_MLP) {
2019
+ return ctx->vision_model.mm_2_b->ne[0];
2020
+ }
2021
+ if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) {
2022
+ return ctx->vision_model.mm_3_b->ne[0];
2023
+ }
2024
+
2025
+ std::string proj_type = PROJECTOR_TYPE_NAMES[ctx->proj_type];
2026
+ throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str()));
2027
+ }
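Tying the accessors together: clip_embd_nbytes is clip_n_patches * clip_n_mmproj_embd * sizeof(float), so an MLP projector feeding a hypothetical 4096-dim language model with 576 patches needs 576 * 4096 * 4 ≈ 9 MiB per encoded image.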