@fugood/llama.node 0.3.17 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +3 -1
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-cuda/arm64/llama-node.node +0 -0
- package/bin/linux-cuda/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +39 -2
- package/lib/index.js +132 -1
- package/lib/index.ts +203 -3
- package/package.json +2 -1
- package/src/EmbeddingWorker.cpp +1 -1
- package/src/LlamaCompletionWorker.cpp +366 -19
- package/src/LlamaCompletionWorker.h +30 -10
- package/src/LlamaContext.cpp +213 -5
- package/src/LlamaContext.h +12 -0
- package/src/common.hpp +15 -0
- package/src/llama.cpp/.github/workflows/build-linux-cross.yml +133 -24
- package/src/llama.cpp/.github/workflows/build.yml +41 -762
- package/src/llama.cpp/.github/workflows/docker.yml +5 -2
- package/src/llama.cpp/.github/workflows/release.yml +716 -0
- package/src/llama.cpp/.github/workflows/server.yml +12 -12
- package/src/llama.cpp/CMakeLists.txt +5 -17
- package/src/llama.cpp/cmake/build-info.cmake +8 -2
- package/src/llama.cpp/cmake/x64-windows-llvm.cmake +0 -6
- package/src/llama.cpp/common/CMakeLists.txt +31 -3
- package/src/llama.cpp/common/arg.cpp +48 -29
- package/src/llama.cpp/common/chat.cpp +128 -106
- package/src/llama.cpp/common/chat.h +2 -0
- package/src/llama.cpp/common/common.cpp +37 -1
- package/src/llama.cpp/common/common.h +18 -9
- package/src/llama.cpp/common/llguidance.cpp +1 -0
- package/src/llama.cpp/common/minja/chat-template.hpp +9 -5
- package/src/llama.cpp/common/minja/minja.hpp +69 -36
- package/src/llama.cpp/common/regex-partial.cpp +204 -0
- package/src/llama.cpp/common/regex-partial.h +56 -0
- package/src/llama.cpp/common/sampling.cpp +57 -50
- package/src/llama.cpp/examples/CMakeLists.txt +2 -23
- package/src/llama.cpp/examples/embedding/embedding.cpp +2 -11
- package/src/llama.cpp/examples/parallel/parallel.cpp +86 -14
- package/src/llama.cpp/examples/training/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/training/finetune.cpp +96 -0
- package/src/llama.cpp/ggml/CMakeLists.txt +27 -0
- package/src/llama.cpp/ggml/include/ggml-backend.h +4 -4
- package/src/llama.cpp/ggml/include/ggml-cpp.h +1 -1
- package/src/llama.cpp/ggml/include/ggml-opt.h +47 -28
- package/src/llama.cpp/ggml/include/ggml.h +10 -7
- package/src/llama.cpp/ggml/src/CMakeLists.txt +1 -1
- package/src/llama.cpp/ggml/src/ggml-alloc.c +4 -1
- package/src/llama.cpp/ggml/src/ggml-backend.cpp +9 -5
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +20 -13
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +0 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +306 -6
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +4 -13
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +29 -16
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +88 -5
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +47 -12
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +264 -69
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +501 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +0 -13
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +0 -6
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +23 -4
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +36 -11
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +0 -2
- package/src/llama.cpp/ggml/src/ggml-opt.cpp +368 -190
- package/src/llama.cpp/ggml/src/ggml-quants.c +0 -6
- package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +41 -27
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +29 -23
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +9 -8
- package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +121 -232
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +7 -15
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +72 -25
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +14 -7
- package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +59 -21
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +7 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +0 -23
- package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +37 -8
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +338 -166
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +185 -89
- package/src/llama.cpp/ggml/src/ggml-sycl/quants.hpp +83 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +128 -53
- package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +81 -70
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +657 -193
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +20 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +123 -29
- package/src/llama.cpp/ggml/src/ggml.c +29 -20
- package/src/llama.cpp/ggml/src/gguf.cpp +33 -33
- package/src/llama.cpp/include/llama.h +52 -11
- package/src/llama.cpp/requirements/requirements-all.txt +3 -3
- package/src/llama.cpp/scripts/xxd.cmake +1 -1
- package/src/llama.cpp/src/CMakeLists.txt +1 -0
- package/src/llama.cpp/src/llama-adapter.cpp +6 -0
- package/src/llama.cpp/src/llama-arch.cpp +3 -0
- package/src/llama.cpp/src/llama-batch.cpp +5 -1
- package/src/llama.cpp/src/llama-batch.h +2 -1
- package/src/llama.cpp/src/llama-chat.cpp +17 -7
- package/src/llama.cpp/src/llama-chat.h +1 -0
- package/src/llama.cpp/src/llama-context.cpp +389 -501
- package/src/llama.cpp/src/llama-context.h +44 -32
- package/src/llama.cpp/src/llama-cparams.h +1 -0
- package/src/llama.cpp/src/llama-graph.cpp +20 -38
- package/src/llama.cpp/src/llama-graph.h +12 -8
- package/src/llama.cpp/src/llama-kv-cache.cpp +1503 -389
- package/src/llama.cpp/src/llama-kv-cache.h +271 -85
- package/src/llama.cpp/src/llama-memory.h +11 -1
- package/src/llama.cpp/src/llama-model-loader.cpp +24 -15
- package/src/llama.cpp/src/llama-model-saver.cpp +281 -0
- package/src/llama.cpp/src/llama-model-saver.h +37 -0
- package/src/llama.cpp/src/llama-model.cpp +316 -69
- package/src/llama.cpp/src/llama-model.h +8 -1
- package/src/llama.cpp/src/llama-quant.cpp +15 -13
- package/src/llama.cpp/src/llama-sampling.cpp +18 -6
- package/src/llama.cpp/src/llama-vocab.cpp +42 -4
- package/src/llama.cpp/src/llama-vocab.h +6 -0
- package/src/llama.cpp/src/llama.cpp +14 -0
- package/src/llama.cpp/tests/CMakeLists.txt +10 -2
- package/src/llama.cpp/tests/test-backend-ops.cpp +107 -47
- package/src/llama.cpp/tests/test-chat-template.cpp +10 -11
- package/src/llama.cpp/tests/test-chat.cpp +3 -1
- package/src/llama.cpp/tests/test-mtmd-c-api.c +63 -0
- package/src/llama.cpp/tests/test-opt.cpp +33 -21
- package/src/llama.cpp/tests/test-regex-partial.cpp +288 -0
- package/src/llama.cpp/tests/test-sampling.cpp +1 -1
- package/src/llama.cpp/tools/CMakeLists.txt +39 -0
- package/src/llama.cpp/{examples → tools}/batched-bench/batched-bench.cpp +2 -2
- package/src/llama.cpp/{examples → tools}/imatrix/imatrix.cpp +11 -9
- package/src/llama.cpp/{examples → tools}/llama-bench/llama-bench.cpp +495 -348
- package/src/llama.cpp/{examples → tools}/main/main.cpp +6 -9
- package/src/llama.cpp/{examples/llava → tools/mtmd}/CMakeLists.txt +1 -35
- package/src/llama.cpp/{examples/llava → tools/mtmd}/clip-impl.h +25 -5
- package/src/llama.cpp/{examples/llava → tools/mtmd}/clip.cpp +1440 -1349
- package/src/llama.cpp/tools/mtmd/clip.h +99 -0
- package/src/llama.cpp/{examples/llava → tools/mtmd}/mtmd-cli.cpp +70 -44
- package/src/llama.cpp/tools/mtmd/mtmd-helper.cpp +310 -0
- package/src/llama.cpp/{examples/llava → tools/mtmd}/mtmd.cpp +251 -281
- package/src/llama.cpp/tools/mtmd/mtmd.h +331 -0
- package/src/llama.cpp/{examples → tools}/perplexity/perplexity.cpp +4 -2
- package/src/llama.cpp/{examples → tools}/quantize/quantize.cpp +13 -76
- package/src/llama.cpp/{examples → tools}/rpc/rpc-server.cpp +70 -74
- package/src/llama.cpp/{examples → tools}/run/run.cpp +18 -4
- package/src/llama.cpp/{examples → tools}/server/CMakeLists.txt +2 -1
- package/src/llama.cpp/{examples → tools}/server/server.cpp +291 -76
- package/src/llama.cpp/{examples → tools}/server/utils.hpp +377 -5
- package/src/llama.cpp/cmake/arm64-windows-msvc.cmake +0 -6
- package/src/llama.cpp/examples/infill/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/infill/infill.cpp +0 -590
- package/src/llama.cpp/examples/llava/android/build_64.sh +0 -8
- package/src/llama.cpp/examples/llava/clip-quantize-cli.cpp +0 -59
- package/src/llama.cpp/examples/llava/clip.h +0 -135
- package/src/llama.cpp/examples/llava/llava.cpp +0 -586
- package/src/llama.cpp/examples/llava/llava.h +0 -49
- package/src/llama.cpp/examples/llava/mtmd.h +0 -168
- package/src/llama.cpp/examples/llava/qwen2vl-test.cpp +0 -636
- /package/src/llama.cpp/{examples → tools}/batched-bench/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/completions.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/cvector-generator.cpp +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/mean.hpp +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/negative.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/pca.hpp +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/positive.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/export-lora/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/export-lora/export-lora.cpp +0 -0
- /package/src/llama.cpp/{examples → tools}/gguf-split/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/gguf-split/gguf-split.cpp +0 -0
- /package/src/llama.cpp/{examples → tools}/imatrix/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/llama-bench/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/main/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples/llava → tools/mtmd}/deprecation-warning.cpp +0 -0
- /package/src/llama.cpp/{examples/llava → tools/mtmd}/requirements.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/perplexity/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/quantize/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/rpc/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/run/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.cpp +0 -0
- /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.h +0 -0
- /package/src/llama.cpp/{examples → tools}/server/bench/requirements.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/server/httplib.h +0 -0
- /package/src/llama.cpp/{examples → tools}/server/tests/requirements.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/tokenize/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/tokenize/tokenize.cpp +0 -0
- /package/src/llama.cpp/{examples → tools}/tts/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/tts/tts.cpp +0 -0
package/src/llama.cpp/{examples/llava → tools/mtmd}/clip.cpp

@@ -29,9 +29,21 @@
 #include <limits>
 #include <array>
 #include <numeric>
+#include <functional>
 
 struct clip_logger_state g_logger_state = {GGML_LOG_LEVEL_CONT, clip_log_callback_default, NULL};
 
+enum ffn_op_type {
+    FFN_GELU,
+    FFN_SILU,
+    FFN_GELU_QUICK,
+};
+
+enum norm_type {
+    NORM_TYPE_NORMAL,
+    NORM_TYPE_RMS,
+};
+
 //#define CLIP_DEBUG_FUNCTIONS
 
 #ifdef CLIP_DEBUG_FUNCTIONS
@@ -155,13 +167,19 @@ enum patch_merge_type {
 struct clip_hparams {
     int32_t image_size;
     int32_t patch_size;
-    int32_t …
-    int32_t …
+    int32_t n_embd;
+    int32_t n_ff;
     int32_t projection_dim;
     int32_t n_head;
     int32_t n_layer;
     int32_t proj_scale_factor = 0; // idefics3
 
+    // for models using dynamic image size, we need to have a smaller image size to warmup
+    // otherwise, user will get OOM everytime they load the model
+    int32_t warmup_image_size = 0;
+
+    ffn_op_type ffn_op = FFN_GELU;
+
     patch_merge_type mm_patch_merge_type = PATCH_MERGE_FLAT;
 
     float eps = 1e-6;
@@ -172,145 +190,148 @@ struct clip_hparams {
     std::unordered_set<int32_t> vision_feature_layer;
     int32_t attn_window_size = 0;
     int32_t n_wa_pattern = 0;
+    int32_t spatial_merge_size = 0;
 };
 
 struct clip_layer {
     // attention
-    …
+    ggml_tensor * k_w = nullptr;
+    ggml_tensor * k_b = nullptr;
+    ggml_tensor * q_w = nullptr;
+    ggml_tensor * q_b = nullptr;
+    ggml_tensor * v_w = nullptr;
+    ggml_tensor * v_b = nullptr;
 
-    …
+    ggml_tensor * o_w = nullptr;
+    ggml_tensor * o_b = nullptr;
 
-    …
-    struct ggml_tensor * ln_1_b = nullptr;
-    …
-    // ff
-    struct ggml_tensor * ff_i_w = nullptr; // legacy naming
-    struct ggml_tensor * ff_i_b = nullptr; // legacy naming
-    struct ggml_tensor * ff_o_w = nullptr; // legacy naming
-    struct ggml_tensor * ff_o_b = nullptr; // legacy naming
+    ggml_tensor * k_norm = nullptr;
+    ggml_tensor * q_norm = nullptr;
 
-    …
-    struct ggml_tensor * ff_gate_b = nullptr;
-    struct ggml_tensor * ff_down_w = nullptr;
-    struct ggml_tensor * ff_down_b = nullptr;
+    // layernorm 1
+    ggml_tensor * ln_1_w = nullptr;
+    ggml_tensor * ln_1_b = nullptr;
 
-    …
+    ggml_tensor * ff_up_w = nullptr;
+    ggml_tensor * ff_up_b = nullptr;
+    ggml_tensor * ff_gate_w = nullptr;
+    ggml_tensor * ff_gate_b = nullptr;
+    ggml_tensor * ff_down_w = nullptr;
+    ggml_tensor * ff_down_b = nullptr;
 
     // layernorm 2
-    …
+    ggml_tensor * ln_2_w = nullptr;
+    ggml_tensor * ln_2_b = nullptr;
+
+    // layer scale (no bias)
+    ggml_tensor * ls_1_w = nullptr;
+    ggml_tensor * ls_2_w = nullptr;
 };
 
 struct clip_vision_model {
     struct clip_hparams hparams;
 
     // embeddings
-    …
+    ggml_tensor * class_embedding = nullptr;
+    ggml_tensor * patch_embeddings_0 = nullptr;
+    ggml_tensor * patch_embeddings_1 = nullptr; // second Conv2D kernel when we decouple Conv3D along temproal dimension (Qwen2VL)
+    ggml_tensor * patch_bias = nullptr;
+    ggml_tensor * position_embeddings = nullptr;
 
-    …
+    ggml_tensor * pre_ln_w = nullptr;
+    ggml_tensor * pre_ln_b = nullptr;
 
     std::vector<clip_layer> layers;
 
-    …
+    ggml_tensor * post_ln_w;
+    ggml_tensor * post_ln_b;
 
-    …
+    ggml_tensor * projection;
 
     // LLaVA projection
-    …
+    ggml_tensor * mm_input_norm_w = nullptr;
+    ggml_tensor * mm_0_w = nullptr;
+    ggml_tensor * mm_0_b = nullptr;
+    ggml_tensor * mm_2_w = nullptr;
+    ggml_tensor * mm_2_b = nullptr;
 
-    …
+    ggml_tensor * image_newline = nullptr;
 
     // Yi type models with mlp+normalization projection
-    …
-    //GLMV-Edge projection
-    …
+    ggml_tensor * mm_1_w = nullptr; // Yi type models have 0, 1, 3, 4
+    ggml_tensor * mm_1_b = nullptr;
+    ggml_tensor * mm_3_w = nullptr;
+    ggml_tensor * mm_3_b = nullptr;
+    ggml_tensor * mm_4_w = nullptr;
+    ggml_tensor * mm_4_b = nullptr;
+
+    // GLMV-Edge projection
+    ggml_tensor * mm_model_adapter_conv_w = nullptr;
+    ggml_tensor * mm_model_adapter_conv_b = nullptr;
+    ggml_tensor * mm_glm_tok_boi = nullptr;
+    ggml_tensor * mm_glm_tok_eoi = nullptr;
 
     // MobileVLM projection
-    …
+    ggml_tensor * mm_model_mlp_1_w = nullptr;
+    ggml_tensor * mm_model_mlp_1_b = nullptr;
+    ggml_tensor * mm_model_mlp_3_w = nullptr;
+    ggml_tensor * mm_model_mlp_3_b = nullptr;
+    ggml_tensor * mm_model_block_1_block_0_0_w = nullptr;
+    ggml_tensor * mm_model_block_1_block_0_1_w = nullptr;
+    ggml_tensor * mm_model_block_1_block_0_1_b = nullptr;
+    ggml_tensor * mm_model_block_1_block_1_fc1_w = nullptr;
+    ggml_tensor * mm_model_block_1_block_1_fc1_b = nullptr;
+    ggml_tensor * mm_model_block_1_block_1_fc2_w = nullptr;
+    ggml_tensor * mm_model_block_1_block_1_fc2_b = nullptr;
+    ggml_tensor * mm_model_block_1_block_2_0_w = nullptr;
+    ggml_tensor * mm_model_block_1_block_2_1_w = nullptr;
+    ggml_tensor * mm_model_block_1_block_2_1_b = nullptr;
+    ggml_tensor * mm_model_block_2_block_0_0_w = nullptr;
+    ggml_tensor * mm_model_block_2_block_0_1_w = nullptr;
+    ggml_tensor * mm_model_block_2_block_0_1_b = nullptr;
+    ggml_tensor * mm_model_block_2_block_1_fc1_w = nullptr;
+    ggml_tensor * mm_model_block_2_block_1_fc1_b = nullptr;
+    ggml_tensor * mm_model_block_2_block_1_fc2_w = nullptr;
+    ggml_tensor * mm_model_block_2_block_1_fc2_b = nullptr;
+    ggml_tensor * mm_model_block_2_block_2_0_w = nullptr;
+    ggml_tensor * mm_model_block_2_block_2_1_w = nullptr;
+    ggml_tensor * mm_model_block_2_block_2_1_b = nullptr;
 
     // MobileVLM_V2 projection
-    …
+    ggml_tensor * mm_model_mlp_0_w = nullptr;
+    ggml_tensor * mm_model_mlp_0_b = nullptr;
+    ggml_tensor * mm_model_mlp_2_w = nullptr;
+    ggml_tensor * mm_model_mlp_2_b = nullptr;
+    ggml_tensor * mm_model_peg_0_w = nullptr;
+    ggml_tensor * mm_model_peg_0_b = nullptr;
 
     // MINICPMV projection
-    …
+    ggml_tensor * mm_model_pos_embed_k = nullptr;
+    ggml_tensor * mm_model_query = nullptr;
+    ggml_tensor * mm_model_proj = nullptr;
+    ggml_tensor * mm_model_kv_proj = nullptr;
+    ggml_tensor * mm_model_attn_q_w = nullptr;
+    ggml_tensor * mm_model_attn_q_b = nullptr;
+    ggml_tensor * mm_model_attn_k_w = nullptr;
+    ggml_tensor * mm_model_attn_k_b = nullptr;
+    ggml_tensor * mm_model_attn_v_w = nullptr;
+    ggml_tensor * mm_model_attn_v_b = nullptr;
+    ggml_tensor * mm_model_attn_o_w = nullptr;
+    ggml_tensor * mm_model_attn_o_b = nullptr;
+    ggml_tensor * mm_model_ln_q_w = nullptr;
+    ggml_tensor * mm_model_ln_q_b = nullptr;
+    ggml_tensor * mm_model_ln_kv_w = nullptr;
+    ggml_tensor * mm_model_ln_kv_b = nullptr;
+    ggml_tensor * mm_model_ln_post_w = nullptr;
+    ggml_tensor * mm_model_ln_post_b = nullptr;
 
     // gemma3
-    …
+    ggml_tensor * mm_input_proj_w = nullptr;
+    ggml_tensor * mm_soft_emb_norm_w = nullptr;
 
     // pixtral
-    …
+    ggml_tensor * token_embd_img_break = nullptr;
+    ggml_tensor * mm_patch_merger_w = nullptr;
 };
 
 struct clip_ctx {
@@ -320,11 +341,8 @@ struct clip_ctx {
     struct clip_vision_model vision_model;
     projector_type proj_type = PROJECTOR_TYPE_MLP;
 
-    int32_t max_feature_layer; // unused in newer models like gemma3
     float image_mean[3];
     float image_std[3];
-    bool use_gelu = false;
-    bool use_silu = false;
 
     gguf_context_ptr ctx_gguf;
     ggml_context_ptr ctx_data;
@@ -345,9 +363,12 @@ struct clip_ctx {
 
     clip_ctx(clip_context_params & ctx_params) {
         backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
-        …
+        if (!backend_cpu) {
+            throw std::runtime_error("failed to initialize CPU backend");
+        }
+        backend = ctx_params.use_gpu
+                    ? ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_GPU, nullptr)
+                    : nullptr;
 
         if (backend) {
             LOG_INF("%s: CLIP using %s backend\n", __func__, ggml_backend_name(backend));
@@ -362,7 +383,7 @@ struct clip_ctx {
         backend_buft.push_back(ggml_backend_get_default_buffer_type(backend_cpu));
 
         sched.reset(
-            ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), 8192, false)
+            ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), 8192, false, true)
         );
     }
 
@@ -374,1194 +395,1337 @@ struct clip_ctx {
|
|
|
374
395
|
}
|
|
375
396
|
};
|
|
376
397
|
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
const
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
const int
|
|
386
|
-
const int
|
|
387
|
-
const int
|
|
388
|
-
const int
|
|
389
|
-
const int
|
|
390
|
-
const
|
|
391
|
-
|
|
392
|
-
|
|
393
|
-
|
|
394
|
-
|
|
395
|
-
|
|
396
|
-
|
|
397
|
-
|
|
398
|
-
|
|
399
|
-
|
|
400
|
-
|
|
401
|
-
|
|
402
|
-
|
|
403
|
-
|
|
404
|
-
|
|
405
|
-
|
|
406
|
-
|
|
407
|
-
|
|
408
|
-
|
|
409
|
-
|
|
410
|
-
|
|
411
|
-
|
|
412
|
-
|
|
413
|
-
|
|
414
|
-
|
|
415
|
-
|
|
416
|
-
|
|
417
|
-
|
|
418
|
-
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
|
|
398
|
+
struct clip_graph {
|
|
399
|
+
clip_ctx * ctx;
|
|
400
|
+
const clip_vision_model & model;
|
|
401
|
+
const clip_hparams & hparams;
|
|
402
|
+
|
|
403
|
+
// we only support single image per batch
|
|
404
|
+
const clip_image_f32 & img;
|
|
405
|
+
|
|
406
|
+
const int patch_size;
|
|
407
|
+
const int n_patches_x;
|
|
408
|
+
const int n_patches_y;
|
|
409
|
+
const int n_patches;
|
|
410
|
+
const int n_embd;
|
|
411
|
+
const int n_head;
|
|
412
|
+
const int d_head;
|
|
413
|
+
const int n_layer;
|
|
414
|
+
const float eps;
|
|
415
|
+
const float kq_scale;
|
|
416
|
+
|
|
417
|
+
ggml_context_ptr ctx0_ptr;
|
|
418
|
+
ggml_context * ctx0;
|
|
419
|
+
ggml_cgraph * gf;
|
|
420
|
+
|
|
421
|
+
clip_graph(clip_ctx * ctx, const clip_image_f32 & img) :
|
|
422
|
+
ctx(ctx),
|
|
423
|
+
model(ctx->vision_model),
|
|
424
|
+
hparams(model.hparams),
|
|
425
|
+
img(img),
|
|
426
|
+
patch_size(hparams.patch_size),
|
|
427
|
+
n_patches_x(img.nx / patch_size),
|
|
428
|
+
n_patches_y(img.ny / patch_size),
|
|
429
|
+
n_patches(n_patches_x * n_patches_y),
|
|
430
|
+
n_embd(hparams.n_embd),
|
|
431
|
+
n_head(hparams.n_head),
|
|
432
|
+
d_head(n_embd / n_head),
|
|
433
|
+
n_layer(hparams.n_layer),
|
|
434
|
+
eps(hparams.eps),
|
|
435
|
+
kq_scale(1.0f / sqrtf((float)d_head)) {
|
|
436
|
+
struct ggml_init_params params = {
|
|
437
|
+
/*.mem_size =*/ ctx->buf_compute_meta.size(),
|
|
438
|
+
/*.mem_buffer =*/ ctx->buf_compute_meta.data(),
|
|
439
|
+
/*.no_alloc =*/ true,
|
|
440
|
+
};
|
|
441
|
+
ctx0_ptr.reset(ggml_init(params));
|
|
442
|
+
ctx0 = ctx0_ptr.get();
|
|
443
|
+
gf = ggml_new_graph(ctx0);
|
|
444
|
+
}
|
|
445
|
+
|
|
446
|
+
ggml_cgraph * build_siglip() {
|
|
447
|
+
ggml_tensor * inp = build_inp();
|
|
448
|
+
ggml_tensor * cur = build_vit(
|
|
449
|
+
inp, n_patches,
|
|
450
|
+
NORM_TYPE_NORMAL,
|
|
451
|
+
hparams.ffn_op,
|
|
452
|
+
model.position_embeddings,
|
|
453
|
+
nullptr);
|
|
454
|
+
|
|
455
|
+
if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) {
|
|
456
|
+
const int batch_size = 1;
|
|
457
|
+
GGML_ASSERT(n_patches_x == n_patches_y);
|
|
458
|
+
const int patches_per_image = n_patches_x;
|
|
459
|
+
const int kernel_size = hparams.proj_scale_factor;
|
|
460
|
+
|
|
461
|
+
cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
|
|
462
|
+
cur = ggml_reshape_4d(ctx0, cur, patches_per_image, patches_per_image, n_embd, batch_size);
|
|
463
|
+
|
|
464
|
+
// doing a pool2d to reduce the number of output tokens
|
|
465
|
+
cur = ggml_pool_2d(ctx0, cur, GGML_OP_POOL_AVG, kernel_size, kernel_size, kernel_size, kernel_size, 0, 0);
|
|
466
|
+
cur = ggml_reshape_3d(ctx0, cur, cur->ne[0] * cur->ne[0], n_embd, batch_size);
|
|
467
|
+
cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
|
|
468
|
+
|
|
469
|
+
// apply norm before projection
|
|
470
|
+
cur = ggml_rms_norm(ctx0, cur, eps);
|
|
471
|
+
cur = ggml_mul(ctx0, cur, model.mm_soft_emb_norm_w);
|
|
472
|
+
|
|
473
|
+
// apply projection
|
|
474
|
+
cur = ggml_mul_mat(ctx0,
|
|
475
|
+
ggml_cont(ctx0, ggml_transpose(ctx0, model.mm_input_proj_w)),
|
|
476
|
+
cur);
|
|
477
|
+
|
|
478
|
+
} else if (ctx->proj_type == PROJECTOR_TYPE_IDEFICS3) {
|
|
479
|
+
// https://github.com/huggingface/transformers/blob/0a950e0bbe1ed58d5401a6b547af19f15f0c195e/src/transformers/models/idefics3/modeling_idefics3.py#L578
|
|
480
|
+
|
|
481
|
+
const int scale_factor = model.hparams.proj_scale_factor;
|
|
482
|
+
const int n_embd = cur->ne[0];
|
|
483
|
+
const int seq = cur->ne[1];
|
|
484
|
+
const int bsz = 1; // batch size, always 1 for now since we don't support batching
|
|
485
|
+
const int height = std::sqrt(seq);
|
|
486
|
+
const int width = std::sqrt(seq);
|
|
487
|
+
GGML_ASSERT(scale_factor != 0);
|
|
488
|
+
cur = ggml_reshape_4d(ctx0, cur, n_embd * scale_factor, width / scale_factor, height, bsz);
|
|
489
|
+
cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
|
|
490
|
+
cur = ggml_reshape_4d(ctx0, ggml_cont(ctx0, cur),
|
|
491
|
+
n_embd * scale_factor * scale_factor,
|
|
492
|
+
height / scale_factor,
|
|
493
|
+
width / scale_factor,
|
|
494
|
+
bsz);
|
|
495
|
+
cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
|
|
496
|
+
cur = ggml_reshape_3d(ctx0, ggml_cont(ctx0, cur),
|
|
497
|
+
n_embd * scale_factor * scale_factor,
|
|
498
|
+
seq / (scale_factor * scale_factor),
|
|
499
|
+
bsz);
|
|
500
|
+
|
|
501
|
+
cur = ggml_mul_mat(ctx0, model.projection, cur);
|
|
502
|
+
} else {
|
|
503
|
+
GGML_ABORT("SigLIP: Unsupported projector type");
|
|
424
504
|
}
|
|
425
505
|
|
|
426
|
-
//
|
|
427
|
-
|
|
428
|
-
|
|
429
|
-
struct ggml_tensor * Q =
|
|
430
|
-
ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].q_w, cur), model.layers[il].q_b);
|
|
506
|
+
// build the graph
|
|
507
|
+
ggml_build_forward_expand(gf, cur);
|
|
431
508
|
|
|
432
|
-
|
|
433
|
-
|
|
509
|
+
return gf;
|
|
510
|
+
}
|
|
434
511
|
|
|
435
|
-
|
|
436
|
-
|
|
512
|
+
ggml_cgraph * build_pixtral() {
|
|
513
|
+
const int n_merge = hparams.spatial_merge_size;
|
|
437
514
|
|
|
438
|
-
|
|
439
|
-
|
|
515
|
+
// 2D input positions
|
|
516
|
+
ggml_tensor * pos_h = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches);
|
|
517
|
+
ggml_set_name(pos_h, "pos_h");
|
|
518
|
+
ggml_set_input(pos_h);
|
|
440
519
|
|
|
441
|
-
|
|
442
|
-
|
|
520
|
+
ggml_tensor * pos_w = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches);
|
|
521
|
+
ggml_set_name(pos_w, "pos_w");
|
|
522
|
+
ggml_set_input(pos_w);
|
|
443
523
|
|
|
444
|
-
|
|
445
|
-
|
|
524
|
+
auto add_pos = [&](ggml_tensor * cur, const clip_layer &) {
|
|
525
|
+
return build_rope_2d(ctx0, cur, pos_h, pos_w, hparams.rope_theta);
|
|
526
|
+
};
|
|
446
527
|
|
|
447
|
-
|
|
448
|
-
|
|
528
|
+
ggml_tensor * inp = build_inp();
|
|
529
|
+
ggml_tensor * cur = build_vit(
|
|
530
|
+
inp, n_patches,
|
|
531
|
+
NORM_TYPE_RMS,
|
|
532
|
+
hparams.ffn_op,
|
|
533
|
+
nullptr, // no learned pos embd
|
|
534
|
+
add_pos);
|
|
449
535
|
|
|
450
|
-
|
|
451
|
-
|
|
452
|
-
|
|
536
|
+
// mistral small 3.1 patch merger
|
|
537
|
+
// ref: https://github.com/huggingface/transformers/blob/7a3e208892c06a5e278144eaf38c8599a42f53e7/src/transformers/models/mistral3/modeling_mistral3.py#L67
|
|
538
|
+
if (model.mm_patch_merger_w) {
|
|
539
|
+
GGML_ASSERT(hparams.spatial_merge_size > 0);
|
|
453
540
|
|
|
454
|
-
cur =
|
|
455
|
-
}
|
|
541
|
+
cur = ggml_mul(ctx0, ggml_rms_norm(ctx0, cur, eps), model.mm_input_norm_w);
|
|
456
542
|
|
|
457
|
-
|
|
458
|
-
|
|
543
|
+
// reshape image tokens to 2D grid
|
|
544
|
+
cur = ggml_reshape_3d(ctx0, cur, n_embd, n_patches_x, n_patches_y);
|
|
545
|
+
cur = ggml_permute(ctx0, cur, 2, 0, 1, 3); // [x, y, n_embd]
|
|
546
|
+
cur = ggml_cont(ctx0, cur);
|
|
459
547
|
|
|
460
|
-
|
|
461
|
-
|
|
548
|
+
// torch.nn.functional.unfold is just an im2col under the hood
|
|
549
|
+
// we just need a dummy kernel to make it work
|
|
550
|
+
ggml_tensor * kernel = ggml_view_3d(ctx0, cur, n_merge, n_merge, cur->ne[2], 0, 0, 0);
|
|
551
|
+
cur = ggml_im2col(ctx0, kernel, cur, n_merge, n_merge, 0, 0, 1, 1, true, inp->type);
|
|
462
552
|
|
|
463
|
-
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
{
|
|
467
|
-
cur = ggml_norm(ctx0, cur, eps);
|
|
468
|
-
cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ln_2_w), model.layers[il].ln_2_b);
|
|
553
|
+
// project to n_embd
|
|
554
|
+
cur = ggml_reshape_2d(ctx0, cur, cur->ne[0], cur->ne[1] * cur->ne[2]);
|
|
555
|
+
cur = ggml_mul_mat(ctx0, model.mm_patch_merger_w, cur);
|
|
469
556
|
}
|
|
470
557
|
|
|
471
|
-
|
|
472
|
-
|
|
473
|
-
|
|
474
|
-
|
|
475
|
-
|
|
476
|
-
|
|
477
|
-
cur = ggml_mul_mat(ctx0, model.layers[il].ff_o_w, cur);
|
|
478
|
-
cur = ggml_add(ctx0, cur, model.layers[il].ff_o_b);
|
|
479
|
-
|
|
480
|
-
// residual 2
|
|
481
|
-
cur = ggml_add(ctx0, embeddings, cur);
|
|
558
|
+
// LlavaMultiModalProjector (always using GELU activation)
|
|
559
|
+
{
|
|
560
|
+
cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);
|
|
561
|
+
if (model.mm_1_b) {
|
|
562
|
+
cur = ggml_add(ctx0, cur, model.mm_1_b);
|
|
563
|
+
}
|
|
482
564
|
|
|
483
|
-
|
|
484
|
-
|
|
565
|
+
cur = ggml_gelu(ctx0, cur);
|
|
566
|
+
cur = ggml_mul_mat(ctx0, model.mm_2_w, cur);
|
|
567
|
+
if (model.mm_2_b) {
|
|
568
|
+
cur = ggml_add(ctx0, cur, model.mm_2_b);
|
|
569
|
+
}
|
|
570
|
+
}
|
|
485
571
|
|
|
486
|
-
|
|
487
|
-
|
|
488
|
-
|
|
489
|
-
|
|
572
|
+
// arrangement of the [IMG_BREAK] token
|
|
573
|
+
{
|
|
574
|
+
// not efficient, but works
|
|
575
|
+
// the trick is to view the embeddings as a 3D tensor with shape [n_embd, n_patches_per_row, n_rows]
|
|
576
|
+
// and then concatenate the [IMG_BREAK] token to the end of each row, aka n_patches_per_row dimension
|
|
577
|
+
// after the concatenation, we have a tensor with shape [n_embd, n_patches_per_row + 1, n_rows]
|
|
490
578
|
|
|
491
|
-
|
|
492
|
-
|
|
579
|
+
const int p_y = n_merge > 0 ? n_patches_y / n_merge : n_patches_y;
|
|
580
|
+
const int p_x = n_merge > 0 ? n_patches_x / n_merge : n_patches_x;
|
|
581
|
+
const int p_total = p_x * p_y;
|
|
582
|
+
const int n_embd_text = cur->ne[0];
|
|
583
|
+
const int n_tokens_output = p_total + p_y - 1; // one [IMG_BREAK] per row, except the last row
|
|
493
584
|
|
|
494
|
-
|
|
495
|
-
|
|
496
|
-
|
|
497
|
-
|
|
498
|
-
|
|
499
|
-
|
|
500
|
-
|
|
501
|
-
|
|
502
|
-
|
|
503
|
-
|
|
504
|
-
// doing a pool2d to reduce the number of output tokens to 256
|
|
505
|
-
embeddings = ggml_pool_2d(ctx0, embeddings, GGML_OP_POOL_AVG, kernel_size, kernel_size, kernel_size, kernel_size, 0, 0);
|
|
506
|
-
embeddings = ggml_reshape_3d(ctx0, embeddings, embeddings->ne[0] * embeddings->ne[0], hidden_size, batch_size);
|
|
507
|
-
embeddings = ggml_cont(ctx0, ggml_transpose(ctx0, embeddings));
|
|
508
|
-
|
|
509
|
-
// apply norm before projection
|
|
510
|
-
embeddings = ggml_rms_norm(ctx0, embeddings, eps);
|
|
511
|
-
embeddings = ggml_mul(ctx0, embeddings, model.mm_soft_emb_norm_w);
|
|
512
|
-
|
|
513
|
-
// apply projection
|
|
514
|
-
embeddings = ggml_mul_mat(ctx0,
|
|
515
|
-
ggml_cont(ctx0, ggml_transpose(ctx0, model.mm_input_proj_w)),
|
|
516
|
-
embeddings);
|
|
517
|
-
|
|
518
|
-
} else if (ctx->proj_type == PROJECTOR_TYPE_IDEFICS3) {
|
|
519
|
-
// https://github.com/huggingface/transformers/blob/0a950e0bbe1ed58d5401a6b547af19f15f0c195e/src/transformers/models/idefics3/modeling_idefics3.py#L578
|
|
520
|
-
|
|
521
|
-
ggml_tensor * cur = embeddings;
|
|
522
|
-
const int scale_factor = model.hparams.proj_scale_factor;
|
|
523
|
-
const int n_embd = cur->ne[0];
|
|
524
|
-
const int seq = cur->ne[1];
|
|
525
|
-
const int bsz = 1; // batch size, always 1 for now since we don't support batching
|
|
526
|
-
const int height = std::sqrt(seq);
|
|
527
|
-
const int width = std::sqrt(seq);
|
|
528
|
-
GGML_ASSERT(scale_factor != 0);
|
|
529
|
-
cur = ggml_reshape_4d(ctx0, cur, n_embd * scale_factor, width / scale_factor, height, bsz);
|
|
530
|
-
cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
|
|
531
|
-
cur = ggml_reshape_4d(ctx0, ggml_cont(ctx0, cur),
|
|
532
|
-
n_embd * scale_factor * scale_factor,
|
|
533
|
-
height / scale_factor,
|
|
534
|
-
width / scale_factor,
|
|
535
|
-
bsz);
|
|
536
|
-
cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
|
|
537
|
-
cur = ggml_reshape_3d(ctx0, ggml_cont(ctx0, cur),
|
|
538
|
-
n_embd * scale_factor * scale_factor,
|
|
539
|
-
seq / (scale_factor * scale_factor),
|
|
540
|
-
bsz);
|
|
541
|
-
|
|
542
|
-
cur = ggml_mul_mat(ctx0, model.projection, cur);
|
|
543
|
-
embeddings = cur;
|
|
544
|
-
} else {
|
|
545
|
-
GGML_ABORT("SigLIP: Unsupported projector type");
|
|
546
|
-
}
|
|
547
|
-
|
|
548
|
-
// build the graph
|
|
549
|
-
ggml_build_forward_expand(gf, embeddings);
|
|
550
|
-
|
|
551
|
-
return gf;
|
|
552
|
-
}
|
|
585
|
+
ggml_tensor * tmp = ggml_reshape_3d(ctx0, cur, n_embd_text, p_x, p_y);
|
|
586
|
+
ggml_tensor * tok = ggml_new_tensor_3d(ctx0, tmp->type, n_embd_text, 1, p_y);
|
|
587
|
+
tok = ggml_scale(ctx0, tok, 0.0); // clear the tensor
|
|
588
|
+
tok = ggml_add(ctx0, tok, model.token_embd_img_break);
|
|
589
|
+
tmp = ggml_concat(ctx0, tmp, tok, 1);
|
|
590
|
+
cur = ggml_view_2d(ctx0, tmp,
|
|
591
|
+
n_embd_text, n_tokens_output,
|
|
592
|
+
ggml_row_size(tmp->type, n_embd_text), 0);
|
|
593
|
+
}
|
|
553
594
|
|
|
554
|
-
//
|
|
555
|
-
|
|
556
|
-
// TODO: there was a more efficient which relies on ggml_view and ggml_rope_ext_inplace, but the rope inplace does not work well with non-contiguous tensors ; we should fix that and revert back to the original implementation in https://github.com/ggml-org/llama.cpp/pull/13065
|
|
557
|
-
static ggml_tensor * build_rope_2d(
|
|
558
|
-
ggml_context * ctx0,
|
|
559
|
-
ggml_tensor * cur,
|
|
560
|
-
ggml_tensor * pos_h,
|
|
561
|
-
ggml_tensor * pos_w,
|
|
562
|
-
const float freq_base
|
|
563
|
-
) {
|
|
564
|
-
const int64_t n_dim = cur->ne[0];
|
|
565
|
-
const int64_t n_head = cur->ne[1];
|
|
566
|
-
const int64_t n_pos = cur->ne[2];
|
|
567
|
-
|
|
568
|
-
// for example, if we have cur tensor of shape (n_dim=8, n_head, n_pos)
|
|
569
|
-
// we will have a list of 4 inv_freq: 1e-0, 1e-1, 1e-2, 1e-3
|
|
570
|
-
// first half of cur will use 1e-0, 1e-2 (even)
|
|
571
|
-
// second half of cur will use 1e-1, 1e-3 (odd)
|
|
572
|
-
// the trick here is to rotate just half of n_dim, so inv_freq will automatically be even
|
|
573
|
-
// ^ don't ask me why, it's math! -2(2i) / n_dim == -2i / (n_dim/2)
|
|
574
|
-
// then for the second half, we use freq_scale to shift the inv_freq
|
|
575
|
-
// ^ why? replace (2i) with (2i+1) in the above equation
|
|
576
|
-
const float freq_scale_odd = std::pow(freq_base, (float)-2/n_dim);
|
|
577
|
-
|
|
578
|
-
// first half
|
|
579
|
-
ggml_tensor * first;
|
|
580
|
-
{
|
|
581
|
-
first = ggml_view_3d(ctx0, cur,
|
|
582
|
-
n_dim/2, n_head, n_pos,
|
|
583
|
-
ggml_row_size(cur->type, n_dim),
|
|
584
|
-
ggml_row_size(cur->type, n_dim*n_head),
|
|
585
|
-
0);
|
|
586
|
-
first = ggml_rope_ext(
|
|
587
|
-
ctx0,
|
|
588
|
-
first,
|
|
589
|
-
pos_h, // positions
|
|
590
|
-
nullptr, // freq factors
|
|
591
|
-
n_dim/2, // n_dims
|
|
592
|
-
0, 0, freq_base,
|
|
593
|
-
1.0f, 0.0f, 1.0f, 0.0f, 0.0f
|
|
594
|
-
);
|
|
595
|
-
}
|
|
595
|
+
// build the graph
|
|
596
|
+
ggml_build_forward_expand(gf, cur);
|
|
596
597
|
|
|
597
|
-
|
|
598
|
-
ggml_tensor * second;
|
|
599
|
-
{
|
|
600
|
-
second = ggml_view_3d(ctx0, cur,
|
|
601
|
-
n_dim/2, n_head, n_pos,
|
|
602
|
-
ggml_row_size(cur->type, n_dim),
|
|
603
|
-
ggml_row_size(cur->type, n_dim*n_head),
|
|
604
|
-
n_dim/2 * ggml_element_size(cur));
|
|
605
|
-
second = ggml_cont(ctx0, second); // copy, because ggml_rope don't play well with non-contiguous tensors
|
|
606
|
-
second = ggml_rope_ext(
|
|
607
|
-
ctx0,
|
|
608
|
-
second,
|
|
609
|
-
pos_w, // positions
|
|
610
|
-
nullptr, // freq factors
|
|
611
|
-
n_dim/2, // n_dims
|
|
612
|
-
0, 0, freq_base,
|
|
613
|
-
freq_scale_odd,
|
|
614
|
-
0.0f, 1.0f, 0.0f, 0.0f
|
|
615
|
-
);
|
|
598
|
+
return gf;
|
|
616
599
|
}
|
|
617
600
|
|
|
618
|
-
|
|
619
|
-
|
|
620
|
-
|
|
601
|
+
// Qwen2VL and Qwen2.5VL use M-RoPE
|
|
602
|
+
ggml_cgraph * build_qwen2vl() {
|
|
603
|
+
GGML_ASSERT(model.patch_bias == nullptr);
|
|
604
|
+
GGML_ASSERT(model.class_embedding == nullptr);
|
|
621
605
|
|
|
622
|
-
|
|
623
|
-
|
|
624
|
-
|
|
625
|
-
|
|
626
|
-
|
|
627
|
-
|
|
628
|
-
int image_size_width = img.nx;
|
|
629
|
-
int image_size_height = img.ny;
|
|
630
|
-
|
|
631
|
-
const int patch_size = hparams.patch_size;
|
|
632
|
-
const int n_patches_x = image_size_width / patch_size;
|
|
633
|
-
const int n_patches_y = image_size_height / patch_size;
|
|
634
|
-
const int num_patches = n_patches_x * n_patches_y;
|
|
635
|
-
const int hidden_size = hparams.hidden_size;
|
|
636
|
-
const int n_head = hparams.n_head;
|
|
637
|
-
const int d_head = hidden_size / n_head;
|
|
638
|
-
const int n_layer = hparams.n_layer;
|
|
639
|
-
const float eps = hparams.eps;
|
|
640
|
-
|
|
641
|
-
struct ggml_init_params params = {
|
|
642
|
-
/*.mem_size =*/ ctx->buf_compute_meta.size(),
|
|
643
|
-
/*.mem_buffer =*/ ctx->buf_compute_meta.data(),
|
|
644
|
-
/*.no_alloc =*/ true,
|
|
645
|
-
};
|
|
606
|
+
const int batch_size = 1;
|
|
607
|
+
const bool use_window_attn = hparams.n_wa_pattern > 0;
|
|
608
|
+
const int n_wa_pattern = hparams.n_wa_pattern;
|
|
609
|
+
const int n_pos = n_patches;
|
|
610
|
+
const int num_position_ids = n_pos * 4; // m-rope requires 4 dim per position
|
|
646
611
|
|
|
647
|
-
|
|
648
|
-
|
|
612
|
+
norm_type norm_t = ctx->proj_type == PROJECTOR_TYPE_QWEN25VL
|
|
613
|
+
? NORM_TYPE_RMS // qwen 2.5 vl
|
|
614
|
+
: NORM_TYPE_NORMAL; // qwen 2 vl
|
|
649
615
|
|
|
650
|
-
|
|
616
|
+
int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4};
|
|
651
617
|
|
|
652
|
-
|
|
653
|
-
|
|
654
|
-
ggml_set_name(inp_raw, "inp_raw");
|
|
655
|
-
ggml_set_input(inp_raw);
|
|
618
|
+
ggml_tensor * inp_raw = build_inp_raw();
|
|
619
|
+
ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
|
|
656
620
|
|
|
657
|
-
|
|
658
|
-
|
|
659
|
-
ggml_set_name(pos_h, "pos_h");
|
|
660
|
-
ggml_set_input(pos_h);
|
|
661
|
-
struct ggml_tensor * pos_w = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_patches);
|
|
662
|
-
ggml_set_name(pos_w, "pos_w");
|
|
663
|
-
ggml_set_input(pos_w);
|
|
621
|
+
GGML_ASSERT(img.nx % (patch_size * 2) == 0);
|
|
622
|
+
GGML_ASSERT(img.ny % (patch_size * 2) == 0);
|
|
664
623
|
|
|
665
|
-
|
|
666
|
-
|
|
667
|
-
|
|
624
|
+
// second conv dimension
|
|
625
|
+
{
|
|
626
|
+
auto inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
|
|
627
|
+
inp = ggml_add(ctx0, inp, inp_1);
|
|
628
|
+
|
|
629
|
+
inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 2, 0, 3)); // [w, h, c, b] -> [c, w, h, b]
|
|
630
|
+
inp = ggml_reshape_4d(
|
|
631
|
+
ctx0, inp,
|
|
632
|
+
n_embd * 2, n_patches_x / 2, n_patches_y, batch_size);
|
|
633
|
+
inp = ggml_reshape_4d(
|
|
634
|
+
ctx0, inp,
|
|
635
|
+
n_embd * 2, n_patches_x / 2, 2, batch_size * (n_patches_y / 2));
|
|
636
|
+
inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 0, 2, 1, 3));
|
|
637
|
+
inp = ggml_reshape_3d(
|
|
638
|
+
ctx0, inp,
|
|
639
|
+
n_embd, n_patches_x * n_patches_y, batch_size);
|
|
640
|
+
}
|
|
641
|
+
|
|
642
|
+
ggml_tensor * inpL = inp;
|
|
643
|
+
ggml_tensor * window_mask = nullptr;
|
|
644
|
+
ggml_tensor * window_idx = nullptr;
|
|
645
|
+
ggml_tensor * inv_window_idx = nullptr;
|
|
646
|
+
|
|
647
|
+
ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_position_ids);
|
|
648
|
+
ggml_set_name(positions, "positions");
|
|
649
|
+
ggml_set_input(positions);
|
|
650
|
+
|
|
651
|
+
// pre-layernorm
|
|
652
|
+
if (model.pre_ln_w) {
|
|
653
|
+
inpL = build_norm(inpL, model.pre_ln_w, model.pre_ln_b, norm_t, eps, -1);
|
|
654
|
+
}
|
|
655
|
+
|
|
656
|
+
if (use_window_attn) {
|
|
657
|
+
// handle window attention inputs
|
|
658
|
+
inv_window_idx = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos / 4);
|
|
659
|
+
ggml_set_name(inv_window_idx, "inv_window_idx");
|
|
660
|
+
ggml_set_input(inv_window_idx);
|
|
661
|
+
// mask for window attention
|
|
662
|
+
window_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_pos, n_pos);
|
|
663
|
+
ggml_set_name(window_mask, "window_mask");
|
|
664
|
+
ggml_set_input(window_mask);
|
|
665
|
+
|
|
666
|
+
// inpL shape: [n_embd, n_patches_x * n_patches_y, batch_size]
|
|
667
|
+
GGML_ASSERT(batch_size == 1);
|
|
668
|
+
inpL = ggml_reshape_2d(ctx0, inpL, n_embd * 4, n_patches_x * n_patches_y * batch_size / 4);
|
|
669
|
+
inpL = ggml_get_rows(ctx0, inpL, inv_window_idx);
|
|
670
|
+
inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_patches_x * n_patches_y, batch_size);
|
|
671
|
+
}
|
|
672
|
+
|
|
673
|
+
// loop over layers
|
|
674
|
+
for (int il = 0; il < n_layer; il++) {
|
|
675
|
+
auto & layer = model.layers[il];
|
|
676
|
+
const bool full_attn = use_window_attn ? (il + 1) % n_wa_pattern == 0 : true;
|
|
668
677
|
|
|
669
|
-
|
|
678
|
+
ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states
|
|
670
679
|
|
|
671
|
-
|
|
672
|
-
|
|
680
|
+
// layernorm1
|
|
681
|
+
cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, norm_t, eps, il);
|
|
682
|
+
cb(cur, "ln1", il);
|
|
673
683
|
|
|
674
|
-
|
|
675
|
-
|
|
676
|
-
|
|
684
|
+
// self-attention
|
|
685
|
+
{
|
|
686
|
+
ggml_tensor * Qcur = ggml_add(ctx0,
|
|
687
|
+
ggml_mul_mat(ctx0, layer.q_w, cur), layer.q_b);
|
|
688
|
+
ggml_tensor * Kcur = ggml_add(ctx0,
|
|
689
|
+
ggml_mul_mat(ctx0, layer.k_w, cur), layer.k_b);
|
|
690
|
+
ggml_tensor * Vcur = ggml_add(ctx0,
|
|
691
|
+
ggml_mul_mat(ctx0, layer.v_w, cur), layer.v_b);
|
|
692
|
+
|
|
693
|
+
Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_patches);
|
|
694
|
+
Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_patches);
|
|
695
|
+
Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_patches);
|
|
696
|
+
|
|
697
|
+
cb(Qcur, "Qcur", il);
|
|
698
|
+
cb(Kcur, "Kcur", il);
|
|
699
|
+
cb(Vcur, "Vcur", il);
|
|
700
|
+
|
|
701
|
+
// apply M-RoPE
|
|
702
|
+
Qcur = ggml_rope_multi(
|
|
703
|
+
ctx0, Qcur, positions, nullptr,
|
|
704
|
+
d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);
|
|
705
|
+
Kcur = ggml_rope_multi(
|
|
706
|
+
ctx0, Kcur, positions, nullptr,
|
|
707
|
+
d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);
|
|
677
708
|
|
|
678
|
-
|
|
679
|
-
|
|
709
|
+
cb(Qcur, "Qcur_rope", il);
|
|
710
|
+
cb(Kcur, "Kcur_rope", il);
|
|
680
711
|
|
|
681
|
-
|
|
682
|
-
{
|
|
683
|
-
struct ggml_tensor * Q = ggml_mul_mat(ctx0, model.layers[il].q_w, cur);
|
|
712
|
+
ggml_tensor * attn_mask = full_attn ? nullptr : window_mask;
|
|
684
713
|
|
|
685
|
-
|
|
686
|
-
|
|
687
|
-
|
|
714
|
+
cur = build_attn(layer.o_w, layer.o_b,
|
|
715
|
+
Qcur, Kcur, Vcur, attn_mask, kq_scale, il);
|
|
716
|
+
cb(cur, "attn_out", il);
|
|
717
|
+
}
|
|
688
718
|
|
|
689
|
-
|
|
719
|
+
// re-add the layer input, e.g., residual
|
|
720
|
+
cur = ggml_add(ctx0, cur, inpL);
|
|
690
721
|
|
|
691
|
-
|
|
692
|
-
K = build_rope_2d(ctx0, K, pos_h, pos_w, hparams.rope_theta);
|
|
693
|
-
K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3));
|
|
722
|
+
inpL = cur; // inpL = residual, cur = hidden_states
|
|
694
723
|
|
|
695
|
-
|
|
724
|
+
cb(cur, "ffn_inp", il);
|
|
696
725
|
|
|
697
|
-
|
|
698
|
-
|
|
726
|
+
// layernorm2
|
|
727
|
+
cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, norm_t, eps, il);
|
|
728
|
+
cb(cur, "ffn_inp_normed", il);
|
|
699
729
|
|
|
700
|
-
|
|
701
|
-
|
|
730
|
+
// ffn
|
|
731
|
+
cur = build_ffn(cur,
|
|
732
|
+
layer.ff_up_w, layer.ff_up_b,
|
|
733
|
+
layer.ff_gate_w, layer.ff_gate_b,
|
|
734
|
+
layer.ff_down_w, layer.ff_down_b,
|
|
735
|
+
hparams.ffn_op, il);
|
|
702
736
|
|
|
703
|
-
|
|
704
|
-
KQV = ggml_reshape_3d(ctx0, KQV, d_head, num_patches, n_head);
|
|
705
|
-
KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
|
|
737
|
+
cb(cur, "ffn_out", il);
|
|
706
738
|
|
|
707
|
-
|
|
739
|
+
// residual 2
|
|
740
|
+
cur = ggml_add(ctx0, inpL, cur);
|
|
741
|
+
cb(cur, "layer_out", il);
|
|
708
742
|
|
|
709
|
-
|
|
743
|
+
inpL = cur;
|
|
710
744
|
}
|
|
711
745
|
|
|
712
|
-
//
|
|
713
|
-
|
|
714
|
-
|
|
715
|
-
embeddings = cur; // embeddings = residual, cur = hidden_states
|
|
716
|
-
|
|
717
|
-
// pre-ffn norm
|
|
718
|
-
cur = ggml_mul(ctx0, ggml_rms_norm(ctx0, cur, eps), model.layers[il].ln_2_w);
|
|
719
|
-
|
|
720
|
-
// feed-forward
|
|
721
|
-
{
|
|
722
|
-
ggml_tensor * gate_proj = ggml_mul_mat(ctx0, model.layers[il].ff_gate_w, cur);
|
|
723
|
-
ggml_tensor * up_proj = ggml_mul_mat(ctx0, model.layers[il].ff_up_w, cur);
|
|
724
|
-
gate_proj = ggml_silu(ctx0, gate_proj); // pixtral uses silu
|
|
725
|
-
cur = ggml_mul(ctx0, up_proj, gate_proj);
|
|
726
|
-
cur = ggml_mul_mat(ctx0, model.layers[il].ff_down_w, cur);
|
|
746
|
+
// post-layernorm
|
|
747
|
+
if (model.post_ln_w) {
|
|
748
|
+
inpL = build_norm(inpL, model.post_ln_w, model.post_ln_b, norm_t, eps, n_layer);
|
|
727
749
|
}
|
|
728
750
|
|
|
729
|
-
//
|
|
730
|
-
|
|
751
|
+
// multimodal projection
|
|
752
|
+
ggml_tensor * embeddings = inpL;
|
|
753
|
+
embeddings = ggml_reshape_3d(ctx0, embeddings, n_embd * 4, n_pos / 4, batch_size);
|
|
731
754
|
|
|
732
|
-
embeddings =
|
|
733
|
-
|
|
755
|
+
embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
|
|
756
|
+
embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
|
|
734
757
|
|
|
735
|
-
|
|
736
|
-
|
|
758
|
+
// GELU activation
|
|
759
|
+
embeddings = ggml_gelu(ctx0, embeddings);
|
|
760
|
+
|
|
761
|
+
// Second linear layer
|
|
737
762
|
embeddings = ggml_mul_mat(ctx0, model.mm_1_w, embeddings);
|
|
738
763
|
embeddings = ggml_add(ctx0, embeddings, model.mm_1_b);
|
|
739
764
|
|
|
740
|
-
|
|
741
|
-
|
|
742
|
-
|
|
743
|
-
|
|
765
|
+
if (use_window_attn) {
|
|
766
|
+
window_idx = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos / 4);
|
|
767
|
+
ggml_set_name(window_idx, "window_idx");
|
|
768
|
+
ggml_set_input(window_idx);
|
|
744
769
|
|
|
745
|
-
|
|
746
|
-
|
|
747
|
-
|
|
748
|
-
|
|
749
|
-
|
|
750
|
-
|
|
770
|
+
// embeddings shape: [n_embd, n_patches_x * n_patches_y, batch_size]
|
|
771
|
+
GGML_ASSERT(batch_size == 1);
|
|
772
|
+
embeddings = ggml_reshape_2d(ctx0, embeddings, hparams.projection_dim, n_patches_x * n_patches_y / 4);
|
|
773
|
+
embeddings = ggml_get_rows(ctx0, embeddings, window_idx);
|
|
774
|
+
embeddings = ggml_reshape_3d(ctx0, embeddings, hparams.projection_dim, n_patches_x * n_patches_y / 4, batch_size);
|
|
775
|
+
}
|
|
751
776
|
|
|
752
|
-
|
|
753
|
-
|
|
777
|
+
// build the graph
|
|
778
|
+
ggml_build_forward_expand(gf, embeddings);
|
|
754
779
|
|
|
755
|
-
|
|
756
|
-
ggml_tensor * tok = ggml_new_tensor_3d(ctx0, embeddings->type, n_embd_text, 1, n_patches_y);
|
|
757
|
-
tok = ggml_scale(ctx0, tok, 0.0); // clear the tensor
|
|
758
|
-
tok = ggml_add(ctx0, tok, model.token_embd_img_break);
|
|
759
|
-
cur = ggml_concat(ctx0, cur, tok, 1);
|
|
760
|
-
embeddings = ggml_view_2d(ctx0, cur,
|
|
761
|
-
n_embd_text, n_tokens_output,
|
|
762
|
-
ggml_row_size(cur->type, n_embd_text), 0);
|
|
780
|
+
return gf;
|
|
763
781
|
}
|
|
764
782
|
|
|
765
|
-
|
|
766
|
-
|
|
783
|
+
ggml_cgraph * build_minicpmv() {
|
|
784
|
+
const int batch_size = 1;
|
|
767
785
|
|
|
768
|
-
|
|
769
|
-
|
|
786
|
+
GGML_ASSERT(model.class_embedding == nullptr);
|
|
787
|
+
const int n_pos = n_patches;
|
|
770
788
|
|
|
771
|
-
|
|
772
|
-
|
|
773
|
-
|
|
789
|
+
// position embeddings for the projector (not for ViT)
|
|
790
|
+
int n_output_dim = clip_n_mmproj_embd(ctx);
|
|
791
|
+
ggml_tensor * pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_output_dim, n_pos, batch_size);
|
|
792
|
+
ggml_set_name(pos_embed, "pos_embed");
|
|
793
|
+
ggml_set_input(pos_embed);
|
|
774
794
|
|
|
775
|
-
|
|
776
|
-
|
|
795
|
+
// for selecting learned pos embd, used by ViT
|
|
796
|
+
struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos);
|
|
797
|
+
ggml_set_name(positions, "positions");
|
|
798
|
+
ggml_set_input(positions);
|
|
777
799
|
|
|
778
|
-
|
|
779
|
-
|
|
780
|
-
|
|
781
|
-
|
|
782
|
-
|
|
783
|
-
|
|
784
|
-
|
|
785
|
-
|
|
786
|
-
|
|
787
|
-
const int hidden_size = hparams.hidden_size;
|
|
788
|
-
const int n_head = hparams.n_head;
|
|
789
|
-
const int d_head = hidden_size / n_head;
|
|
790
|
-
const int n_layer = hparams.n_layer;
|
|
791
|
-
const float eps = hparams.eps;
|
|
792
|
-
|
|
793
|
-
int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4};
|
|
794
|
-
|
|
795
|
-
const int batch_size = imgs.entries.size();
|
|
796
|
-
GGML_ASSERT(batch_size == 1);
|
|
797
|
-
|
|
798
|
-
struct ggml_init_params params = {
|
|
799
|
-
/*.mem_size =*/ ctx->buf_compute_meta.size(),
|
|
800
|
-
/*.mem_buffer =*/ ctx->buf_compute_meta.data(),
|
|
801
|
-
/*.no_alloc =*/ true,
|
|
802
|
-
};
|
|
800
|
+
ggml_tensor * learned_pos_embd = ggml_get_rows(ctx0, model.position_embeddings, positions);
|
|
801
|
+
|
|
802
|
+
ggml_tensor * inp = build_inp();
|
|
803
|
+
ggml_tensor * embeddings = build_vit(
|
|
804
|
+
inp, n_patches,
|
|
805
|
+
NORM_TYPE_NORMAL,
|
|
806
|
+
hparams.ffn_op,
|
|
807
|
+
learned_pos_embd,
|
|
808
|
+
nullptr);
|
|
803
809
|
|
|
804
|
-
|
|
805
|
-
auto ctx0 = ctx0_ptr.get();
|
|
810
|
+
// resampler projector (it is just another transformer)
|
|
806
811
|
|
|
807
|
-
|
|
812
|
+
ggml_tensor * q = model.mm_model_query;
|
|
813
|
+
ggml_tensor * v = ggml_mul_mat(ctx0, model.mm_model_kv_proj, embeddings);
|
|
808
814
|
|
|
809
|
-
|
|
810
|
-
|
|
811
|
-
|
|
815
|
+
// norm
|
|
816
|
+
q = build_norm(q, model.mm_model_ln_q_w, model.mm_model_ln_q_b, NORM_TYPE_NORMAL, eps, -1);
|
|
817
|
+
v = build_norm(v, model.mm_model_ln_kv_w, model.mm_model_ln_kv_b, NORM_TYPE_NORMAL, eps, -1);
|
|
812
818
|
|
|
813
|
-
|
|
819
|
+
// k = v + pos_embed
|
|
820
|
+
ggml_tensor * k = ggml_add(ctx0, v, pos_embed);
|
|
814
821
|
|
|
815
|
-
|
|
816
|
-
|
|
822
|
+
// attention
|
|
823
|
+
{
|
|
824
|
+
int n_embd = clip_n_mmproj_embd(ctx);
|
|
825
|
+
const int d_head = 128;
|
|
826
|
+
int n_head = n_embd/d_head;
|
|
827
|
+
int num_query = 96;
|
|
828
|
+
if (ctx->minicpmv_version == 2) {
|
|
829
|
+
num_query = 96;
|
|
830
|
+
} else if (ctx->minicpmv_version == 3) {
|
|
831
|
+
num_query = 64;
|
|
832
|
+
} else if (ctx->minicpmv_version == 4) {
|
|
833
|
+
num_query = 64;
|
|
834
|
+
}
|
|
817
835
|
|
|
818
|
-
|
|
819
|
-
|
|
836
|
+
ggml_tensor * Q = ggml_add(ctx0,
|
|
837
|
+
ggml_mul_mat(ctx0, model.mm_model_attn_q_w, q),
|
|
838
|
+
model.mm_model_attn_q_b);
|
|
839
|
+
ggml_tensor * K = ggml_add(ctx0,
|
|
840
|
+
ggml_mul_mat(ctx0, model.mm_model_attn_k_w, k),
|
|
841
|
+
model.mm_model_attn_k_b);
|
|
842
|
+
ggml_tensor * V = ggml_add(ctx0,
|
|
843
|
+
ggml_mul_mat(ctx0, model.mm_model_attn_v_w, v),
|
|
844
|
+
model.mm_model_attn_v_b);
|
|
845
|
+
|
|
846
|
+
Q = ggml_reshape_3d(ctx0, Q, d_head, n_head, num_query);
|
|
847
|
+
K = ggml_reshape_3d(ctx0, K, d_head, n_head, n_pos);
|
|
848
|
+
V = ggml_reshape_3d(ctx0, V, d_head, n_head, n_pos);
|
|
849
|
+
|
|
850
|
+
cb(Q, "resampler_Q", -1);
|
|
851
|
+
cb(K, "resampler_K", -1);
|
|
852
|
+
cb(V, "resampler_V", -1);
|
|
853
|
+
|
|
854
|
+
embeddings = build_attn(
|
|
855
|
+
model.mm_model_attn_o_w,
|
|
856
|
+
model.mm_model_attn_o_b,
|
|
857
|
+
Q, K, V, nullptr, kq_scale, -1);
|
|
858
|
+
cb(embeddings, "resampler_attn_out", -1);
|
|
859
|
+
}
|
|
860
|
+
// layernorm
|
|
861
|
+
embeddings = build_norm(embeddings, model.mm_model_ln_post_w, model.mm_model_ln_post_b, NORM_TYPE_NORMAL, eps, -1);
|
|
862
|
+
|
|
863
|
+
// projection
|
|
864
|
+
embeddings = ggml_mul_mat(ctx0, model.mm_model_proj, embeddings);
|
|

-
-
-            ctx0, inp,
-            hidden_size * 2, patches_w / 2, patches_h, batch_size);
-        inp = ggml_reshape_4d(
-            ctx0, inp,
-            hidden_size * 2, patches_w / 2, 2, batch_size * (patches_h / 2));
-        inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 0, 2, 1, 3));
-        inp = ggml_reshape_3d(
-            ctx0, inp,
-            hidden_size, patches_w * patches_h, batch_size);
+        // build the graph
+        ggml_build_forward_expand(gf, embeddings);

-
-        // inp = ggml_add(ctx0, inp, ggml_repeat(ctx0, model.patch_bias, inp));
-        inp = ggml_add(ctx0, inp, model.patch_bias);
+        return gf;
    }
-    struct ggml_tensor * embeddings = inp;
-    struct ggml_tensor * window_mask = nullptr;
-    struct ggml_tensor * window_idx = nullptr;
-    struct ggml_tensor * inv_window_idx = nullptr;

-
-
-
+    ggml_cgraph * build_internvl() {
+        GGML_ASSERT(model.class_embedding != nullptr);
+        GGML_ASSERT(model.position_embeddings != nullptr);

-
-
-        embeddings = ggml_rms_norm(ctx0, embeddings, eps);
-        ggml_set_name(embeddings, "pre_ln");
+        const int n_pos = n_patches + 1;
+        ggml_tensor * inp = build_inp();

-
-
-
-    if (use_window_attn) {
-        // handle window attention inputs
-        inv_window_idx = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions / 4);
-        ggml_set_name(inv_window_idx, "inv_window_idx");
-        ggml_set_input(inv_window_idx);
-        // mask for window attention
-        window_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, num_positions, num_positions);
-        ggml_set_name(window_mask, "window_mask");
-        ggml_set_input(window_mask);
+        // add CLS token
+        inp = ggml_concat(ctx0, inp, model.class_embedding, 1);

-        //
-
-
-
-
-    }
+        // The larger models use a different ViT, which uses RMS norm instead of layer norm
+        // ref: https://github.com/ggml-org/llama.cpp/pull/13443#issuecomment-2869786188
+        norm_type norm_t = (hparams.n_embd == 3200 && hparams.n_layer == 45)
+            ? NORM_TYPE_RMS // 6B ViT (Used by InternVL 2.5/3 - 26B, 38B, 78B)
+            : NORM_TYPE_NORMAL; // 300M ViT (Used by all smaller InternVL models)

-
-
-
+        ggml_tensor * cur = build_vit(
+                                inp, n_pos,
+                                norm_t,
+                                hparams.ffn_op,
+                                model.position_embeddings,
+                                nullptr);

-        //
-    cur =
-
+        // remove CLS token
+        cur = ggml_view_2d(ctx0, cur,
+            n_embd, n_patches,
+            ggml_row_size(cur->type, n_embd), 0);

-        //
+        // pixel shuffle
        {
+            const int scale_factor = model.hparams.proj_scale_factor;
+            const int bsz = 1; // batch size, always 1 for now since we don't support batching
+            const int height = n_patches_y;
+            const int width = n_patches_x;
+            GGML_ASSERT(scale_factor > 0);
+            cur = ggml_reshape_4d(ctx0, cur, n_embd * scale_factor, height / scale_factor, width, bsz);
+            cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
+            cur = ggml_reshape_4d(ctx0, ggml_cont(ctx0, cur),
+                n_embd * scale_factor * scale_factor,
+                height / scale_factor,
+                width / scale_factor,
+                bsz);
+            cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
+            // flatten to 2D
+            cur = ggml_reshape_2d(ctx0, ggml_cont(ctx0, cur),
+                n_embd * scale_factor * scale_factor,
+                cur->ne[1] * cur->ne[2]);
+        }
+
+        // projector (always using GELU activation)
+        {
+            // projector LayerNorm uses pytorch's default eps = 1e-5
+            // ref: https://huggingface.co/OpenGVLab/InternVL3-8B-Instruct/blob/a34d3e4e129a5856abfd6aa6de79776484caa14e/modeling_internvl_chat.py#L79
+            cur = build_norm(cur, model.mm_0_w, model.mm_0_b, NORM_TYPE_NORMAL, 1e-5, -1);
+            cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);
+            cur = ggml_add(ctx0, cur, model.mm_1_b);
+            cur = ggml_gelu(ctx0, cur);
+            cur = ggml_mul_mat(ctx0, model.mm_3_w, cur);
+            cur = ggml_add(ctx0, cur, model.mm_3_b);
+        }

-
-
+        // build the graph
+        ggml_build_forward_expand(gf, cur);

-
-
-            ctx0, Q, positions, nullptr,
-            d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);
-        Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3));
-        Q = ggml_reshape_3d(ctx0, Q, d_head, num_positions, n_head * batch_size);
+        return gf;
+    }
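
The new build_internvl above ends with a pixel-shuffle step: each scale_factor x scale_factor neighborhood of patch embeddings is folded into the channel dimension, so the token count shrinks by scale_factor squared before the projector. A minimal sketch of the same rearrangement on a plain row-major grid (illustrative only; the real code does it with ggml reshape/permute nodes):

#include <cstdio>
#include <vector>

// Pixel shuffle (space-to-depth) on an H x W x C grid stored row-major:
// output is (H/s) x (W/s) x (C*s*s); the token count drops by s*s.
static std::vector<float> pixel_shuffle(const std::vector<float> & in,
                                        int H, int W, int C, int s) {
    std::vector<float> out((size_t)H * W * C);
    const int Ho = H / s, Wo = W / s, Co = C * s * s;
    for (int y = 0; y < Ho; y++)
        for (int x = 0; x < Wo; x++)
            for (int dy = 0; dy < s; dy++)
                for (int dx = 0; dx < s; dx++)
                    for (int c = 0; c < C; c++) {
                        const size_t src = (((size_t)(y*s + dy) * W) + (x*s + dx)) * C + c;
                        const size_t dst = (((size_t)y * Wo) + x) * Co + ((size_t)(dy*s + dx)) * C + c;
                        out[dst] = in[src];
                    }
    return out;
}

int main() {
    const int H = 4, W = 4, C = 2, s = 2;
    std::vector<float> in((size_t)H * W * C);
    for (size_t i = 0; i < in.size(); i++) in[i] = (float)i;
    auto out = pixel_shuffle(in, H, W, C, s);
    printf("%d tokens -> %d tokens, %d -> %d channels\n", H*W, (H/s)*(W/s), C, C*s*s);
    return 0;
}

No information is lost; the projector simply sees fewer, wider tokens, which is cheaper for the language model downstream.
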

-
-
+    // this graph is used by llava, granite and glm
+    // due to having embedding_stack (used by granite), we cannot reuse build_vit
+    ggml_cgraph * build_llava() {
+        const int batch_size = 1;
+        const int n_pos = n_patches + (model.class_embedding ? 1 : 0);

-
-        K = ggml_rope_multi(
-            ctx0, K, positions, nullptr,
-            d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);
-        K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3));
-        K = ggml_reshape_3d(ctx0, K, d_head, num_positions, n_head * batch_size);
+        GGML_ASSERT(n_patches_x == n_patches_y && "only square images supported");

-
-
+        // Calculate the deepest feature layer based on hparams and projector type
+        int max_feature_layer = n_layer;
+        {
+            // Get the index of the second to last layer; this is the default for models that have a llava projector
+            int il_last = hparams.n_layer - 1;
+            int deepest_feature_layer = -1;

-
-
-
+            if (ctx->proj_type == PROJECTOR_TYPE_MINICPMV || ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE) {
+                il_last += 1;
+            }

-
-
-
-
-
-
+            // If we set explicit vision feature layers, only go up to the deepest one
+            // NOTE: only used by granite-vision models for now
+            for (const auto & feature_layer : hparams.vision_feature_layer) {
+                if (feature_layer > deepest_feature_layer) {
+                    deepest_feature_layer = feature_layer;
+                }
            }
+            max_feature_layer = deepest_feature_layer < 0 ? il_last : deepest_feature_layer;
+        }

-
-        KQV = ggml_reshape_4d(ctx0, KQV, d_head, num_positions, n_head, batch_size);
-        KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
+        ggml_tensor * inp = build_inp();

-
+        // concat class_embeddings and patch_embeddings
+        if (model.class_embedding) {
+            inp = ggml_concat(ctx0, inp, model.class_embedding, 1);
        }

-
-
+        ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos);
+        ggml_set_name(positions, "positions");
+        ggml_set_input(positions);

-
-        cur = ggml_add(ctx0, cur, embeddings);
+        inp = ggml_add(ctx0, inp, ggml_get_rows(ctx0, model.position_embeddings, positions));

-
+        ggml_tensor * inpL = inp;

-        //
-
-
+        // pre-layernorm
+        if (model.pre_ln_w) {
+            inpL = build_norm(inpL, model.pre_ln_w, model.pre_ln_b, NORM_TYPE_NORMAL, eps, -1);
+            cb(inpL, "pre_ln", -1);
+        }

-
-
-        auto cur_up = ggml_mul_mat(ctx0, model.layers[il].ff_o_w, cur);
-        cur_up = ggml_add(ctx0, cur_up, model.layers[il].ff_o_b);
+        std::vector<ggml_tensor *> embedding_stack;
+        const auto & vision_feature_layer = hparams.vision_feature_layer;

-
-
-
-
-            cur_gate = ggml_gelu_inplace(ctx0, cur_gate);
-        } else if (ctx->use_silu) {
-            cur_gate = ggml_silu_inplace(ctx0, cur_gate);
-        } else {
-            cur_gate = ggml_gelu_quick_inplace(ctx0, cur_gate);
-        }
-        cur = ggml_mul(ctx0, cur_gate, cur_up);
+        // loop over layers
+        for (int il = 0; il < max_feature_layer; il++) {
+            auto & layer = model.layers[il];
+            ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states

-
-
-
+            // If this is an embedding feature layer, save the output.
+            // NOTE: 0 index here refers to the input to the encoder.
+            if (vision_feature_layer.find(il) != vision_feature_layer.end()) {
+                embedding_stack.push_back(cur);
+            }

-
-
+            // layernorm1
+            cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, eps, il);
+            cb(cur, "layer_inp_normed", il);

-
-
+            // self-attention
+            {
+                ggml_tensor * Qcur = ggml_mul_mat(ctx0, layer.q_w, cur);
+                if (layer.q_b) {
+                    Qcur = ggml_add(ctx0, Qcur, layer.q_b);
+                }

-
-
-
-
+                ggml_tensor * Kcur = ggml_mul_mat(ctx0, layer.k_w, cur);
+                if (layer.k_b) {
+                    Kcur = ggml_add(ctx0, Kcur, layer.k_b);
+                }

-
-
+                ggml_tensor * Vcur = ggml_mul_mat(ctx0, layer.v_w, cur);
+                if (layer.v_b) {
+                    Vcur = ggml_add(ctx0, Vcur, layer.v_b);
+                }

-
+                Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_pos);
+                Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_pos);
+                Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_pos);

-
-
+                cb(Qcur, "Qcur", il);
+                cb(Kcur, "Kcur", il);
+                cb(Vcur, "Vcur", il);

-
-
+                cur = build_attn(layer.o_w, layer.o_b,
+                    Qcur, Kcur, Vcur, nullptr, kq_scale, il);
+                cb(cur, "attn_out", il);
+            }

-
-
-            embeddings = ggml_add(ctx0, embeddings, model.mm_1_b);
+            // re-add the layer input, e.g., residual
+            cur = ggml_add(ctx0, cur, inpL);

-
-        window_idx = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions / 4);
-        ggml_set_name(window_idx, "window_idx");
-        ggml_set_input(window_idx);
+            inpL = cur; // inpL = residual, cur = hidden_states

-
-        GGML_ASSERT(batch_size == 1);
-        embeddings = ggml_reshape_2d(ctx0, embeddings, hparams.projection_dim, patches_w * patches_h / 4);
-        embeddings = ggml_get_rows(ctx0, embeddings, window_idx);
-        embeddings = ggml_reshape_3d(ctx0, embeddings, hparams.projection_dim, patches_w * patches_h / 4, batch_size);
-    }
+            cb(cur, "ffn_inp", il);

-
-
+            // layernorm2
+            cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, NORM_TYPE_NORMAL, eps, il);
+            cb(cur, "ffn_inp_normed", il);

-
-
+            // ffn
+            cur = build_ffn(cur,
+                layer.ff_up_w, layer.ff_up_b,
+                layer.ff_gate_w, layer.ff_gate_b,
+                layer.ff_down_w, layer.ff_down_b,
+                hparams.ffn_op, il);

-
-    const auto & model = ctx->vision_model;
-    const auto & hparams = model.hparams;
+            cb(cur, "ffn_out", il);

-
-
-
+            // residual 2
+            cur = ggml_add(ctx0, inpL, cur);
+            cb(cur, "layer_out", il);

-
-    LOG_DBG("%s: %d %d\n", __func__, load_image_size.width, load_image_size.height);
-    image_size_width = load_image_size.width;
-    image_size_height = load_image_size.height;
-    if (is_inf) {
-        image_size_width = imgs.entries[0]->nx;
-        image_size_height = imgs.entries[0]->ny;
+            inpL = cur;
        }
-    }

-
-
-
-    // if (imgs->data->nx && imgs->data->ny) {
-    image_size_width = imgs.entries[0]->nx;
-    image_size_height = imgs.entries[0]->ny;
+        // post-layernorm
+        if (model.post_ln_w) {
+            inpL = build_norm(inpL, model.post_ln_w, model.post_ln_b, NORM_TYPE_NORMAL, eps, -1);
        }
-    }
-
-    const int patch_size = hparams.patch_size;
-    const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
-    const int patches_w = image_size_width / patch_size;
-    const int patches_h = image_size_height / patch_size;
-    const int num_positions = num_patches + (model.class_embedding ? 1 : 0);
-    const int num_position_ids = ctx->proj_type == PROJECTOR_TYPE_QWEN2VL ? num_positions * 4 : num_positions;
-    const int hidden_size = hparams.hidden_size;
-    const int n_head = hparams.n_head;
-    const int d_head = hidden_size / n_head;
-    const float eps = hparams.eps;
-    int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4};
-
-    const int batch_size = imgs.entries.size();
-
-    if (ctx->has_llava_projector
-            || ctx->proj_type == PROJECTOR_TYPE_MINICPMV
-            || ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE) {
-        GGML_ASSERT(batch_size == 1);
-    }

-
-        /*.mem_size =*/ ctx->buf_compute_meta.size(),
-        /*.mem_buffer =*/ ctx->buf_compute_meta.data(),
-        /*.no_alloc =*/ true,
-    };
+        ggml_tensor * embeddings = inpL;

-
-
+        // process vision feature layers (used by granite)
+        {
+            // final layer is a vision feature layer
+            if (vision_feature_layer.find(max_feature_layer) != vision_feature_layer.end()) {
+                embedding_stack.push_back(inpL);
+            }

-
+            // If feature layers are explicitly set, stack them (if we have multiple)
+            if (!embedding_stack.empty()) {
+                embeddings = embedding_stack[0];
+                for (size_t i = 1; i < embedding_stack.size(); i++) {
+                    embeddings = ggml_concat(ctx0, embeddings, embedding_stack[i], 0);
+                }
+            }
+        }

-
-
-
+        // llava projector (also used by granite)
+        if (ctx->has_llava_projector) {
+            embeddings = ggml_reshape_2d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1]);

-
+            ggml_tensor * patches = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches);
+            ggml_set_name(patches, "patches");
+            ggml_set_input(patches);

-
-
-
+            // shape [1, 576, 1024]
+            // ne is whcn, ne = [1024, 576, 1, 1]
+            embeddings = ggml_get_rows(ctx0, embeddings, patches);

-
-        inp = ggml_add(ctx0, inp, inp_1);
-        inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 2, 0, 3)); // [w, h, c, b] -> [c, w, h, b]
-        inp = ggml_reshape_4d(
-            ctx0, inp,
-            hidden_size * 2, patches_w / 2, patches_h, batch_size);
-        inp = ggml_reshape_4d(
-            ctx0, inp,
-            hidden_size * 2, patches_w / 2, 2, batch_size * (patches_h / 2));
-        inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 0, 2, 1, 3));
-        inp = ggml_reshape_3d(
-            ctx0, inp,
-            hidden_size, patches_w * patches_h, batch_size);
-    }
-    else {
-        inp = ggml_reshape_3d(ctx0, inp, num_patches, hidden_size, batch_size);
-        inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3));
-    }
+            // print_tensor_info(embeddings, "embeddings");

-
-
-
-
-    struct ggml_tensor * embeddings = inp;
-    struct ggml_tensor * pos_embed = nullptr;
+            // llava projector
+            if (ctx->proj_type == PROJECTOR_TYPE_MLP) {
+                embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
+                embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);

-
-
-
-
-
-
-
-        embeddings
-
+                embeddings = ggml_gelu(ctx0, embeddings);
+                if (model.mm_2_w) {
+                    embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings);
+                    embeddings = ggml_add(ctx0, embeddings, model.mm_2_b);
+                }
+            }
+            else if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) {
+                embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
+                embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
+                // ggml_tensor_printf(embeddings, "mm_0_w",0,true,false);
+                // First LayerNorm
+                embeddings = ggml_norm(ctx0, embeddings, eps);
+                embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_1_w),
+                    model.mm_1_b);
+
+                // GELU activation
+                embeddings = ggml_gelu(ctx0, embeddings);
+
+                // Second linear layer
+                embeddings = ggml_mul_mat(ctx0, model.mm_3_w, embeddings);
+                embeddings = ggml_add(ctx0, embeddings, model.mm_3_b);
+
+                // Second LayerNorm
+                embeddings = ggml_norm(ctx0, embeddings, eps);
+                embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_4_w),
+                    model.mm_4_b);
+            }
+            else if (ctx->proj_type == PROJECTOR_TYPE_LDP) {
+                // MobileVLM projector
+                int n_patch = 24;
+                ggml_tensor * mlp_1 = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w, embeddings);
+                mlp_1 = ggml_add(ctx0, mlp_1, model.mm_model_mlp_1_b);
+                mlp_1 = ggml_gelu(ctx0, mlp_1);
+                ggml_tensor * mlp_3 = ggml_mul_mat(ctx0, model.mm_model_mlp_3_w, mlp_1);
+                mlp_3 = ggml_add(ctx0, mlp_3, model.mm_model_mlp_3_b);
+                // mlp_3 shape = [1, 576, 2048], ne = [2048, 576, 1, 1]
+
+                // block 1
+                ggml_tensor * block_1 = nullptr;
+                {
+                    // transpose from [1, 576, 2048] --> [1, 2048, 576] --> [1, 2048, 24, 24]
+                    mlp_3 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_3, 1, 0, 2, 3));
+                    mlp_3 = ggml_reshape_4d(ctx0, mlp_3, n_patch, n_patch, mlp_3->ne[1], mlp_3->ne[2]);
+                    // stride = 1, padding = 1, bias is nullptr
+                    block_1 = ggml_conv_2d_dw(ctx0, model.mm_model_block_1_block_0_0_w, mlp_3, 1, 1, 1, 1, 1, 1);
+
+                    // layer norm
+                    // // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
+                    block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 2, 0, 3));
+                    // block_1 shape = [1, 24, 24, 2048], ne = [2048, 24, 24, 1]
+                    block_1 = ggml_norm(ctx0, block_1, eps);
+                    block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_1_block_0_1_w), model.mm_model_block_1_block_0_1_b);
+                    block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3));
+
+                    // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
+                    // hardswish
+                    ggml_tensor * block_1_hw = ggml_hardswish(ctx0, block_1);
+
+                    block_1 = ggml_pool_2d(ctx0, block_1_hw, GGML_OP_POOL_AVG, block_1_hw->ne[0], block_1_hw->ne[1], block_1_hw->ne[0], block_1_hw->ne[1], 0, 0);
+                    // block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1]
+                    // pointwise conv
+                    block_1 = ggml_reshape_2d(ctx0, block_1, block_1->ne[0]*block_1->ne[1]*block_1->ne[2], block_1->ne[3]);
+                    block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_1_fc1_w, block_1);
+                    block_1 = ggml_add(ctx0, block_1, model.mm_model_block_1_block_1_fc1_b);
+                    block_1 = ggml_relu(ctx0, block_1);
+                    block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_1_fc2_w, block_1);
+                    block_1 = ggml_add(ctx0, block_1, model.mm_model_block_1_block_1_fc2_b);
+                    block_1 = ggml_hardsigmoid(ctx0, block_1);
+                    // block_1_hw shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1], block_1 shape = [1, 2048], ne = [2048, 1, 1, 1]
+                    block_1 = ggml_reshape_4d(ctx0, block_1, 1, 1, block_1->ne[0], block_1->ne[1]);
+                    block_1 = ggml_mul(ctx0, block_1_hw, block_1);
+
+                    int w = block_1->ne[0], h = block_1->ne[1];
+                    block_1 = ggml_reshape_3d(ctx0, block_1, w*h, block_1->ne[2], block_1->ne[3]);
+                    block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 0, 2, 3));
+
+                    // block_1 shape = [1, 24*24, 2048], ne = [24*24, 2048, 1]
+                    block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_2_0_w, block_1);
+                    block_1 = ggml_reshape_4d(ctx0, block_1, block_1->ne[0], w, h, block_1->ne[3]);
+
+                    // block_1 shape = [1, 24, 24, 2048], ne = [2048, 24, 24, 1]
+                    block_1 = ggml_norm(ctx0, block_1, eps);
+                    block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_1_block_2_1_w), model.mm_model_block_1_block_2_1_b);
+                    block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3));
+                    // block1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
+                    // residual
+                    block_1 = ggml_add(ctx0, mlp_3, block_1);
+                }

-
-
-
+                // block_2
+                {
+                    // stride = 2
+                    block_1 = ggml_conv_2d_dw(ctx0, model.mm_model_block_2_block_0_0_w, block_1, 2, 2, 1, 1, 1, 1);
+
+                    // block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1]
+                    // layer norm
+                    block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 2, 0, 3));
+                    // block_1 shape = [1, 12, 12, 2048], ne = [2048, 12, 12, 1]
+                    block_1 = ggml_norm(ctx0, block_1, eps);
+                    block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_2_block_0_1_w), model.mm_model_block_2_block_0_1_b);
+                    block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3));
+                    // block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1]
+                    // hardswish
+                    ggml_tensor * block_1_hw = ggml_hardswish(ctx0, block_1);
+
+                    // not sure the parameters is right for globalAvgPooling
+                    block_1 = ggml_pool_2d(ctx0, block_1_hw, GGML_OP_POOL_AVG, block_1_hw->ne[0], block_1_hw->ne[1], block_1_hw->ne[0], block_1_hw->ne[1], 0, 0);
+                    // block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1]
+                    // pointwise conv
+                    block_1 = ggml_reshape_2d(ctx0, block_1, block_1->ne[0]*block_1->ne[1]*block_1->ne[2], block_1->ne[3]);
+                    block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_1_fc1_w, block_1);
+                    block_1 = ggml_add(ctx0, block_1, model.mm_model_block_2_block_1_fc1_b);
+                    block_1 = ggml_relu(ctx0, block_1);
+                    block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_1_fc2_w, block_1);
+                    block_1 = ggml_add(ctx0, block_1, model.mm_model_block_2_block_1_fc2_b);
+                    block_1 = ggml_hardsigmoid(ctx0, block_1);
+
+                    // block_1_hw shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1], block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1]
+                    block_1 = ggml_reshape_4d(ctx0, block_1, 1, 1, block_1->ne[0], block_1->ne[1]);
+                    block_1 = ggml_mul(ctx0, block_1_hw, block_1);
+
+                    int w = block_1->ne[0], h = block_1->ne[1];
+                    block_1 = ggml_reshape_3d(ctx0, block_1, w*h, block_1->ne[2], block_1->ne[3]);
+                    block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 0, 2, 3));
+                    // block_1 shape = [1, 24*24, 2048], ne = [24*24, 2048, 1]
+                    block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_2_0_w, block_1);
+                    block_1 = ggml_reshape_4d(ctx0, block_1, block_1->ne[0], w, h, block_1->ne[3]);
+
+
+                    // block_1 shape = [1, 12, 12, 2048], ne = [2048, 12, 12, 1]
+                    block_1 = ggml_norm(ctx0, block_1, eps);
+                    block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_2_block_2_1_w), model.mm_model_block_2_block_2_1_b);
+                    block_1 = ggml_reshape_3d(ctx0, block_1, block_1->ne[0], block_1->ne[1] * block_1->ne[2], block_1->ne[3]);
+                    // block_1 shape = [1, 144, 2048], ne = [2048, 144, 1]
+                }
+                embeddings = block_1;
+            }
+            else if (ctx->proj_type == PROJECTOR_TYPE_LDPV2)
+            {
+                int n_patch = 24;
+                ggml_tensor * mlp_0 = ggml_mul_mat(ctx0, model.mm_model_mlp_0_w, embeddings);
+                mlp_0 = ggml_add(ctx0, mlp_0, model.mm_model_mlp_0_b);
+                mlp_0 = ggml_gelu(ctx0, mlp_0);
+                ggml_tensor * mlp_2 = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, mlp_0);
+                mlp_2 = ggml_add(ctx0, mlp_2, model.mm_model_mlp_2_b);
+                // mlp_2 ne = [2048, 576, 1, 1]
+                // // AVG Pool Layer 2*2, strides = 2
+                mlp_2 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_2, 1, 0, 2, 3));
+                // mlp_2 ne = [576, 2048, 1, 1]
+                mlp_2 = ggml_reshape_4d(ctx0, mlp_2, n_patch, n_patch, mlp_2->ne[1], mlp_2->ne[2]);
+                // mlp_2 ne [24, 24, 2048, 1]
+                mlp_2 = ggml_pool_2d(ctx0, mlp_2, GGML_OP_POOL_AVG, 2, 2, 2, 2, 0, 0);
+                // weight ne = [3, 3, 2048, 1]
+                ggml_tensor * peg_0 = ggml_conv_2d_dw(ctx0, model.mm_model_peg_0_w, mlp_2, 1, 1, 1, 1, 1, 1);
+                peg_0 = ggml_cont(ctx0, ggml_permute(ctx0, peg_0, 1, 2, 0, 3));
+                peg_0 = ggml_add(ctx0, peg_0, model.mm_model_peg_0_b);
+                mlp_2 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_2, 1, 2, 0, 3));
+                peg_0 = ggml_add(ctx0, peg_0, mlp_2);
+                peg_0 = ggml_reshape_3d(ctx0, peg_0, peg_0->ne[0], peg_0->ne[1] * peg_0->ne[2], peg_0->ne[3]);
+                embeddings = peg_0;
+            }
+            else {
+                GGML_ABORT("fatal error");
+            }
+        }

-
-
-
-
+        // glm projector
+        else if (ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE) {
+            size_t gridsz = (size_t)sqrt(embeddings->ne[1]);
+            embeddings = ggml_cont(ctx0, ggml_permute(ctx0,embeddings,1,0,2,3));
+            embeddings = ggml_reshape_3d(ctx0, embeddings, gridsz, gridsz, embeddings->ne[1]);
+            embeddings = ggml_conv_2d(ctx0, model.mm_model_adapter_conv_w, embeddings, 2, 2, 0, 0, 1, 1);
+            embeddings = ggml_reshape_3d(ctx0, embeddings,embeddings->ne[0]*embeddings->ne[1] , embeddings->ne[2], batch_size);
+            embeddings = ggml_cont(ctx0, ggml_permute(ctx0,embeddings, 1, 0, 2, 3));
+            embeddings = ggml_add(ctx0, embeddings, model.mm_model_adapter_conv_b);
+            // GLU
+            {
+                embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_0_w, embeddings);
+                embeddings = ggml_norm(ctx0, embeddings, eps);
+                embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_model_ln_q_w), model.mm_model_ln_q_b);
+                embeddings = ggml_gelu_inplace(ctx0, embeddings);
+                ggml_tensor * x = embeddings;
+                embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, embeddings);
+                x = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w,x);
+                embeddings = ggml_silu_inplace(ctx0, embeddings);
+                embeddings = ggml_mul(ctx0, embeddings,x);
+                embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_3_w, embeddings);
+            }
+            // arrangement of BOI/EOI token embeddings
+            // note: these embeddings are not present in text model, hence we cannot process them as text tokens
+            // see: https://huggingface.co/THUDM/glm-edge-v-2b/blob/main/siglip.py#L53
+            {
+                embeddings = ggml_concat(ctx0, model.mm_glm_tok_boi, embeddings, 1); // BOI
+                embeddings = ggml_concat(ctx0, embeddings, model.mm_glm_tok_eoi, 1); // EOI
+            }
+        }

-
-
-
-        int n_output_dim = clip_n_mmproj_embd(ctx);
-        pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_output_dim, pos_w * pos_h, 1);
-        ggml_set_name(pos_embed, "pos_embed");
-        ggml_set_input(pos_embed);
-    }
+        else {
+            GGML_ABORT("llava: unknown projector type");
+        }

-
-
-        embeddings = ggml_norm(ctx0, embeddings, eps);
-        ggml_set_name(embeddings, "pre_ln");
+        // build the graph
+        ggml_build_forward_expand(gf, embeddings);

-
+        return gf;
    }
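
A detail worth noting in the new build_llava above: layer outputs listed in hparams.vision_feature_layer are pushed onto embedding_stack during the encoder loop and later concatenated along the feature axis, which is how granite-vision fuses multiple encoder depths. A hypothetical, self-contained sketch of that stacking pattern (names and sizes are illustrative, not the real clip.cpp types):

#include <cstdio>
#include <set>
#include <vector>

// Collect hidden states from selected encoder layers and concatenate
// them feature-wise, mirroring embedding_stack in build_llava.
int main() {
    const int n_layer = 6, n_tokens = 3, n_embd = 4;
    const std::set<int> vision_feature_layer = {2, 5};

    std::vector<std::vector<float>> stack;
    std::vector<float> h((size_t)n_tokens * n_embd, 0.0f); // fake hidden state

    for (int il = 0; il < n_layer; il++) {
        if (vision_feature_layer.count(il)) stack.push_back(h); // save this depth
        for (auto & v : h) v += 1.0f; // stand-in for the real transformer layer
    }

    // concat along the feature dimension: [n_tokens, n_embd * stack.size()]
    const int n_out = n_embd * (int)stack.size();
    std::vector<float> out((size_t)n_tokens * n_out);
    for (int t = 0; t < n_tokens; t++)
        for (size_t s = 0; s < stack.size(); s++)
            for (int c = 0; c < n_embd; c++)
                out[(size_t)t * n_out + s * n_embd + c] = stack[s][(size_t)t * n_embd + c];

    printf("stacked %zu layers -> %d features per token\n", stack.size(), n_out);
    return 0;
}

This is also why the loop only runs to max_feature_layer: layers deeper than the deepest requested feature would be wasted work.
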

-
-
+private:
+    //
+    // utility functions
+    //
+
+    void cb(ggml_tensor * cur, const char * name, int il) const {
+        // TODO: implement this
+        GGML_UNUSED(cur);
+        GGML_UNUSED(name);
+        GGML_UNUSED(il);
+    }
+
+    // build vision transformer (ViT) cgraph
+    // this function should cover most of the models
+    // if your model has specific features, you should probably duplicate this function
+    ggml_tensor * build_vit(
+            ggml_tensor * inp,
+            int64_t n_pos,
+            norm_type norm_t,
+            ffn_op_type ffn_t,
+            ggml_tensor * learned_pos_embd,
+            std::function<ggml_tensor *(ggml_tensor *, const clip_layer &)> add_pos
+        ) {
+        if (learned_pos_embd) {
+            inp = ggml_add(ctx0, inp, learned_pos_embd);
+            cb(inp, "pos_embed", -1);
+        }
+
+        ggml_tensor * inpL = inp;
+
+        // pre-layernorm
+        if (model.pre_ln_w) {
+            inpL = build_norm(inpL, model.pre_ln_w, model.pre_ln_b, norm_t, eps, -1);
+            cb(inpL, "pre_ln", -1);
+        }
+
+        // loop over layers
+        for (int il = 0; il < n_layer; il++) {
+            auto & layer = model.layers[il];
+            ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states
+
+            // layernorm1
+            cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, norm_t, eps, il);
+            cb(cur, "layer_inp_normed", il);
+
+            // self-attention
+            {
+                ggml_tensor * Qcur = ggml_mul_mat(ctx0, layer.q_w, cur);
+                if (layer.q_b) {
+                    Qcur = ggml_add(ctx0, Qcur, layer.q_b);
+                }

-
-
-
+                ggml_tensor * Kcur = ggml_mul_mat(ctx0, layer.k_w, cur);
+                if (layer.k_b) {
+                    Kcur = ggml_add(ctx0, Kcur, layer.k_b);
+                }

-
-
-
-
-    }
+                ggml_tensor * Vcur = ggml_mul_mat(ctx0, layer.v_w, cur);
+                if (layer.v_b) {
+                    Vcur = ggml_add(ctx0, Vcur, layer.v_b);
+                }

-
+                if (layer.q_norm) {
+                    Qcur = build_norm(Qcur, layer.q_norm, NULL, norm_t, eps, il);
+                    cb(Qcur, "Qcur_norm", il);
+                }

-
-
-
+                if (layer.k_norm) {
+                    Kcur = build_norm(Kcur, layer.k_norm, NULL, norm_t, eps, il);
+                    cb(Kcur, "Kcur_norm", il);
+                }

-
-
-
+                Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_pos);
+                Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_pos);
+                Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_pos);

-
-
+                cb(Qcur, "Qcur", il);
+                cb(Kcur, "Kcur", il);
+                cb(Vcur, "Vcur", il);

-
-
+                if (add_pos) {
+                    Qcur = add_pos(Qcur, layer);
+                    Kcur = add_pos(Kcur, layer);
+                    cb(Qcur, "Qcur_pos", il);
+                    cb(Kcur, "Kcur_pos", il);
+                }

-
-
-
-            ctx0, Q, positions, nullptr,
-            d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);
+                cur = build_attn(layer.o_w, layer.o_b,
+                    Qcur, Kcur, Vcur, nullptr, kq_scale, il);
+                cb(cur, "attn_out", il);
            }
-        Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3));
-        Q = ggml_reshape_3d(ctx0, Q, d_head, num_positions, n_head * batch_size);
-
-        struct ggml_tensor * K =
-            ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].k_w, cur), model.layers[il].k_b);

-
-
-
-            ctx0, K, positions, nullptr,
-            d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);
+            if (layer.ls_1_w) {
+                cur = ggml_mul(ctx0, cur, layer.ls_1_w);
+                cb(cur, "attn_out_scaled", il);
            }
-        K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3));
-        K = ggml_reshape_3d(ctx0, K, d_head, num_positions, n_head * batch_size);

-
-
+            // re-add the layer input, e.g., residual
+            cur = ggml_add(ctx0, cur, inpL);

-
-        V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3));
-        V = ggml_reshape_3d(ctx0, V, num_positions, d_head, n_head * batch_size);
+            inpL = cur; // inpL = residual, cur = hidden_states

-
-        KQ = ggml_soft_max_ext(ctx0, KQ, nullptr, 1.0f / sqrtf((float)d_head), 0.0f);
-        struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ);
-        KQV = ggml_reshape_4d(ctx0, KQV, d_head, num_positions, n_head, batch_size);
-        KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
+            cb(cur, "ffn_inp", il);

-
-
+            // layernorm2
+            cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, norm_t, eps, il);
+            cb(cur, "ffn_inp_normed", il);

-
-
+            // ffn
+            cur = build_ffn(cur,
+                layer.ff_up_w, layer.ff_up_b,
+                layer.ff_gate_w, layer.ff_gate_b,
+                layer.ff_down_w, layer.ff_down_b,
+                ffn_t, il);

-
-        cur = ggml_add(ctx0, cur, embeddings);
+            cb(cur, "ffn_out", il);

-
+            if (layer.ls_2_w) {
+                cur = ggml_mul(ctx0, cur, layer.ls_2_w);
+                cb(cur, "ffn_out_scaled", il);
+            }

-
-
-    cur
+            // residual 2
+            cur = ggml_add(ctx0, inpL, cur);
+            cb(cur, "layer_out", il);

-
+            inpL = cur;
        }

-
-
-
-    if (ctx->use_gelu) {
-        cur = ggml_gelu_inplace(ctx0, cur);
-    } else if (ctx->use_silu) {
-        cur = ggml_silu_inplace(ctx0, cur);
-    } else {
-        cur = ggml_gelu_quick_inplace(ctx0, cur);
+        // post-layernorm
+        if (model.post_ln_w) {
+            inpL = build_norm(inpL, model.post_ln_w, model.post_ln_b, norm_t, eps, -1);
        }
+        return inpL;
+    }
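
The new build_vit above factors the shared encoder loop out of the per-model graph builders: the norm type, FFN activation, learned positional embedding, and an optional per-layer add_pos callback are all parameters, so model-specific RoPE variants plug in without duplicating the loop. A trimmed-down sketch of that hook pattern with std::function (illustrative types only, not the real clip.cpp ones):

#include <cstdio>
#include <functional>
#include <vector>

struct Layer { int id; };

// The caller may pass a per-layer callback that post-processes the
// hidden state (standing in for applying RoPE to Q/K in build_vit).
static std::vector<float> run_encoder(
        std::vector<float> x,
        const std::vector<Layer> & layers,
        std::function<void(std::vector<float> &, const Layer &)> add_pos) {
    for (const auto & layer : layers) {
        if (add_pos) add_pos(x, layer); // optional positional hook
        for (auto & v : x) v *= 1.01f;  // stand-in for attention + ffn
    }
    return x;
}

int main() {
    std::vector<Layer> layers = {{0}, {1}, {2}};
    std::vector<float> x(4, 1.0f);

    // no hook: e.g. learned absolute embeddings already added up front
    auto a = run_encoder(x, layers, nullptr);
    // with a hook: e.g. a model-specific rotary embedding per layer
    auto b = run_encoder(x, layers, [](std::vector<float> & t, const Layer & l) {
        for (auto & v : t) v += 0.001f * (float)l.id;
    });

    printf("a[0]=%f b[0]=%f\n", a[0], b[0]);
    return 0;
}

Passing an empty std::function (as build_internvl does with nullptr) simply skips the hook, which keeps the common path free of model-specific branches.
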

-
-
-
-
-
+    // build the input after conv2d (inp_raw --> patches)
+    // returns tensor with shape [n_embd, n_patches]
+    ggml_tensor * build_inp() {
+        ggml_tensor * inp_raw = build_inp_raw();
+        ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
+        inp = ggml_reshape_2d(ctx0, inp, n_patches, n_embd);
+        inp = ggml_cont(ctx0, ggml_transpose(ctx0, inp));
+        if (model.patch_bias) {
+            inp = ggml_add(ctx0, inp, model.patch_bias);
+            cb(inp, "patch_bias", -1);
+        }
+        return inp;
+    }

-
+    ggml_tensor * build_inp_raw() {
+        ggml_tensor * inp_raw = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, img.nx, img.ny, 3);
+        ggml_set_name(inp_raw, "inp_raw");
+        ggml_set_input(inp_raw);
+        return inp_raw;
    }

-
-
-
-
+    ggml_tensor * build_norm(
+            ggml_tensor * cur,
+            ggml_tensor * mw,
+            ggml_tensor * mb,
+            norm_type type,
+            float norm_eps,
+            int il) const {

-
-
+        cur = type == NORM_TYPE_RMS
+            ? ggml_rms_norm(ctx0, cur, norm_eps)
+            : ggml_norm(ctx0, cur, norm_eps);

-
-
-
-    }
+        if (mw || mb) {
+            cb(cur, "norm", il);
+        }

-
-
-
-
-
+        if (mw) {
+            cur = ggml_mul(ctx0, cur, mw);
+            if (mb) {
+                cb(cur, "norm_w", il);
+            }
        }
-    }

-
-
-
+        if (mb) {
+            cur = ggml_add(ctx0, cur, mb);
+        }
+
+        return cur;
+    }
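
build_norm above collapses the two normalization variants into one helper: LayerNorm subtracts the mean before scaling, RMSNorm does not, and the elementwise weight (mw) and bias (mb) are both optional. What it computes, on a plain vector (a minimal sketch, not the ggml implementation):

#include <cmath>
#include <cstdio>
#include <vector>

// LayerNorm (rms=false) or RMSNorm (rms=true), then optional
// elementwise weight mw and bias mb, mirroring build_norm's options.
static void norm_inplace(std::vector<float> & x, bool rms, float eps,
                         const std::vector<float> * mw,
                         const std::vector<float> * mb) {
    float mean = 0.0f;
    if (!rms) {
        for (float v : x) mean += v;
        mean /= (float)x.size();
    }
    float ss = 0.0f;
    for (float v : x) ss += (v - mean) * (v - mean);
    const float inv = 1.0f / std::sqrt(ss / (float)x.size() + eps);
    for (size_t i = 0; i < x.size(); i++) {
        x[i] = (x[i] - mean) * inv;
        if (mw) x[i] *= (*mw)[i];
        if (mb) x[i] += (*mb)[i];
    }
}

int main() {
    std::vector<float> a = {1, 2, 3, 4}, b = a;
    std::vector<float> w(4, 2.0f);
    norm_inplace(a, false, 1e-6f, &w, nullptr); // LayerNorm-style
    norm_inplace(b, true,  1e-6f, &w, nullptr); // RMSNorm-style
    printf("ln: %f  rms: %f\n", a[0], b[0]);
    return 0;
}

Keeping the weight and bias optional is what lets the same helper serve both the per-layer norms (weight + bias) and the bias-free q/k norms seen in build_vit.
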

-
-
-
+    ggml_tensor * build_ffn(
+            ggml_tensor * cur,
+            ggml_tensor * up,
+            ggml_tensor * up_b,
+            ggml_tensor * gate,
+            ggml_tensor * gate_b,
+            ggml_tensor * down,
+            ggml_tensor * down_b,
+            ffn_op_type type_op,
+            int il) const {

-
-
-        embeddings = ggml_get_rows(ctx0, embeddings, patches);
+        ggml_tensor * tmp = up ? ggml_mul_mat(ctx0, up, cur) : cur;
+        cb(tmp, "ffn_up", il);

-
+        if (up_b) {
+            tmp = ggml_add(ctx0, tmp, up_b);
+            cb(tmp, "ffn_up_b", il);
+        }

-
-
-
-        embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
+        if (gate) {
+            cur = ggml_mul_mat(ctx0, gate, cur);
+            cb(cur, "ffn_gate", il);

-
-
-
-        embeddings = ggml_add(ctx0, embeddings, model.mm_2_b);
+            if (gate_b) {
+                cur = ggml_add(ctx0, cur, gate_b);
+                cb(cur, "ffn_gate_b", il);
            }
+        } else {
+            cur = tmp;
        }
-    else if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) {
-        embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
-        embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
-        // ggml_tensor_printf(embeddings, "mm_0_w",0,true,false);
-        // First LayerNorm
-        embeddings = ggml_norm(ctx0, embeddings, eps);
-        embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_1_w),
-            model.mm_1_b);
-
-        // GELU activation
-        embeddings = ggml_gelu(ctx0, embeddings);
-
-        // Second linear layer
-        embeddings = ggml_mul_mat(ctx0, model.mm_3_w, embeddings);
-        embeddings = ggml_add(ctx0, embeddings, model.mm_3_b);
-
-        // Second LayerNorm
-        embeddings = ggml_norm(ctx0, embeddings, eps);
-        embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_4_w),
-            model.mm_4_b);
-    }
-    else if (ctx->proj_type == PROJECTOR_TYPE_LDP) {
-        // MobileVLM projector
-        int n_patch = 24;
-        struct ggml_tensor * mlp_1 = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w, embeddings);
-        mlp_1 = ggml_add(ctx0, mlp_1, model.mm_model_mlp_1_b);
-        mlp_1 = ggml_gelu(ctx0, mlp_1);
-        struct ggml_tensor * mlp_3 = ggml_mul_mat(ctx0, model.mm_model_mlp_3_w, mlp_1);
-        mlp_3 = ggml_add(ctx0, mlp_3, model.mm_model_mlp_3_b);
-        // mlp_3 shape = [1, 576, 2048], ne = [2048, 576, 1, 1]
-
-        // block 1
-        struct ggml_tensor * block_1 = nullptr;
-        {
-            // transpose from [1, 576, 2048] --> [1, 2048, 576] --> [1, 2048, 24, 24]
-            mlp_3 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_3, 1, 0, 2, 3));
-            mlp_3 = ggml_reshape_4d(ctx0, mlp_3, n_patch, n_patch, mlp_3->ne[1], mlp_3->ne[2]);
-            // stride = 1, padding = 1, bias is nullptr
-            block_1 = ggml_conv_2d_dw(ctx0, model.mm_model_block_1_block_0_0_w, mlp_3, 1, 1, 1, 1, 1, 1);
-
-            // layer norm
-            // // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
-            block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 2, 0, 3));
-            // block_1 shape = [1, 24, 24, 2048], ne = [2048, 24, 24, 1]
-            block_1 = ggml_norm(ctx0, block_1, eps);
-            block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_1_block_0_1_w), model.mm_model_block_1_block_0_1_b);
-            block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3));
-
-            // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
-            // hardswish
-            struct ggml_tensor * block_1_hw = ggml_hardswish(ctx0, block_1);
-
-            block_1 = ggml_pool_2d(ctx0, block_1_hw, GGML_OP_POOL_AVG, block_1_hw->ne[0], block_1_hw->ne[1], block_1_hw->ne[0], block_1_hw->ne[1], 0, 0);
-            // block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1]
-            // pointwise conv
-            block_1 = ggml_reshape_2d(ctx0, block_1, block_1->ne[0]*block_1->ne[1]*block_1->ne[2], block_1->ne[3]);
-            block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_1_fc1_w, block_1);
-            block_1 = ggml_add(ctx0, block_1, model.mm_model_block_1_block_1_fc1_b);
-            block_1 = ggml_relu(ctx0, block_1);
-            block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_1_fc2_w, block_1);
-            block_1 = ggml_add(ctx0, block_1, model.mm_model_block_1_block_1_fc2_b);
-            block_1 = ggml_hardsigmoid(ctx0, block_1);
-            // block_1_hw shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1], block_1 shape = [1, 2048], ne = [2048, 1, 1, 1]
-            block_1 = ggml_reshape_4d(ctx0, block_1, 1, 1, block_1->ne[0], block_1->ne[1]);
-            block_1 = ggml_mul(ctx0, block_1_hw, block_1);
-
-            int w = block_1->ne[0], h = block_1->ne[1];
-            block_1 = ggml_reshape_3d(ctx0, block_1, w*h, block_1->ne[2], block_1->ne[3]);
-            block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 0, 2, 3));
-
-            // block_1 shape = [1, 24*24, 2048], ne = [24*24, 2048, 1]
-            block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_2_0_w, block_1);
-            block_1 = ggml_reshape_4d(ctx0, block_1, block_1->ne[0], w, h, block_1->ne[3]);
-
-            // block_1 shape = [1, 24, 24, 2048], ne = [2048, 24, 24, 1]
-            block_1 = ggml_norm(ctx0, block_1, eps);
-            block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_1_block_2_1_w), model.mm_model_block_1_block_2_1_b);
-            block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3));
-            // block1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
-            // residual
-            block_1 = ggml_add(ctx0, mlp_3, block_1);
-        }

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            // not sure the parameters is right for globalAvgPooling
-            block_1 = ggml_pool_2d(ctx0, block_1_hw, GGML_OP_POOL_AVG, block_1_hw->ne[0], block_1_hw->ne[1], block_1_hw->ne[0], block_1_hw->ne[1], 0, 0);
-            // block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1]
-            // pointwise conv
-            block_1 = ggml_reshape_2d(ctx0, block_1, block_1->ne[0]*block_1->ne[1]*block_1->ne[2], block_1->ne[3]);
-            block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_1_fc1_w, block_1);
-            block_1 = ggml_add(ctx0, block_1, model.mm_model_block_2_block_1_fc1_b);
-            block_1 = ggml_relu(ctx0, block_1);
-            block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_1_fc2_w, block_1);
-            block_1 = ggml_add(ctx0, block_1, model.mm_model_block_2_block_1_fc2_b);
-            block_1 = ggml_hardsigmoid(ctx0, block_1);
-
-            // block_1_hw shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1], block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1]
-            block_1 = ggml_reshape_4d(ctx0, block_1, 1, 1, block_1->ne[0], block_1->ne[1]);
-            block_1 = ggml_mul(ctx0, block_1_hw, block_1);
-
-            int w = block_1->ne[0], h = block_1->ne[1];
-            block_1 = ggml_reshape_3d(ctx0, block_1, w*h, block_1->ne[2], block_1->ne[3]);
-            block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 0, 2, 3));
-            // block_1 shape = [1, 24*24, 2048], ne = [24*24, 2048, 1]
-            block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_2_0_w, block_1);
-            block_1 = ggml_reshape_4d(ctx0, block_1, block_1->ne[0], w, h, block_1->ne[3]);
-
-
-            // block_1 shape = [1, 12, 12, 2048], ne = [2048, 12, 12, 1]
-            block_1 = ggml_norm(ctx0, block_1, eps);
-            block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_2_block_2_1_w), model.mm_model_block_2_block_2_1_b);
-            block_1 = ggml_reshape_3d(ctx0, block_1, block_1->ne[0], block_1->ne[1] * block_1->ne[2], block_1->ne[3]);
-            // block_1 shape = [1, 144, 2048], ne = [2048, 144, 1]
-        }
-        embeddings = block_1;
+        switch (type_op) {
+            case FFN_SILU:
+                {
+                    cur = ggml_silu(ctx0, cur);
+                    cb(cur, "ffn_silu", il);
+                } break;
+            case FFN_GELU:
+                {
+                    cur = ggml_gelu(ctx0, cur);
+                    cb(cur, "ffn_gelu", il);
+                } break;
+            case FFN_GELU_QUICK:
+                {
+                    cur = ggml_gelu_quick(ctx0, cur);
+                    cb(cur, "ffn_relu", il);
+                } break;
        }
-
-
-
-
-
-        mlp_0 = ggml_gelu(ctx0, mlp_0);
-        struct ggml_tensor * mlp_2 = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, mlp_0);
-        mlp_2 = ggml_add(ctx0, mlp_2, model.mm_model_mlp_2_b);
-        // mlp_2 ne = [2048, 576, 1, 1]
-        // // AVG Pool Layer 2*2, strides = 2
-        mlp_2 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_2, 1, 0, 2, 3));
-        // mlp_2 ne = [576, 2048, 1, 1]
-        mlp_2 = ggml_reshape_4d(ctx0, mlp_2, n_patch, n_patch, mlp_2->ne[1], mlp_2->ne[2]);
-        // mlp_2 ne [24, 24, 2048, 1]
-        mlp_2 = ggml_pool_2d(ctx0, mlp_2, GGML_OP_POOL_AVG, 2, 2, 2, 2, 0, 0);
-        // weight ne = [3, 3, 2048, 1]
-        struct ggml_tensor * peg_0 = ggml_conv_2d_dw(ctx0, model.mm_model_peg_0_w, mlp_2, 1, 1, 1, 1, 1, 1);
-        peg_0 = ggml_cont(ctx0, ggml_permute(ctx0, peg_0, 1, 2, 0, 3));
-        peg_0 = ggml_add(ctx0, peg_0, model.mm_model_peg_0_b);
-        mlp_2 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_2, 1, 2, 0, 3));
-        peg_0 = ggml_add(ctx0, peg_0, mlp_2);
-        peg_0 = ggml_reshape_3d(ctx0, peg_0, peg_0->ne[0], peg_0->ne[1] * peg_0->ne[2], peg_0->ne[3]);
-        embeddings = peg_0;
+
+        // we only support parallel ffn for now
+        if (gate) {
+            cur = ggml_mul(ctx0, cur, tmp);
+            cb(cur, "ffn_gate_par", il);
        }
-
-
+
+        if (down) {
+            cur = ggml_mul_mat(ctx0, down, cur);
        }
-
-
-
-        struct ggml_tensor * q = model.mm_model_query;
-        { // layernorm
-            q = ggml_norm(ctx0, q, eps);
-            q = ggml_add(ctx0, ggml_mul(ctx0, q, model.mm_model_ln_q_w), model.mm_model_ln_q_b);
+
+        if (down_b) {
+            cb(cur, "ffn_down", il);
        }
-
-
-
-            v = ggml_add(ctx0, ggml_mul(ctx0, v, model.mm_model_ln_kv_w), model.mm_model_ln_kv_b);
+
+        if (down_b) {
+            cur = ggml_add(ctx0, cur, down_b);
        }
-
-
-
-
+
+        return cur;
+    }
+
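
The build_ffn helper added above supports the "parallel" gated FFN shape — out = down(act(gate(x)) * up(x)) — while degrading gracefully to a plain two-layer MLP when no gate weight is present. A minimal sketch of that dataflow with scalar "weights" and SiLU, purely to illustrate the ordering (real weights are ggml mat-muls):

#include <cmath>
#include <cstdio>
#include <vector>

static float silu(float v) { return v / (1.0f + std::exp(-v)); }

int main() {
    const std::vector<float> x = {0.5f, -1.0f, 2.0f};
    const float w_up = 1.5f, w_gate = 0.8f, w_down = 0.6f;

    std::vector<float> out(x.size());
    for (size_t i = 0; i < x.size(); i++) {
        const float up   = w_up   * x[i];        // "ffn_up"
        const float gate = silu(w_gate * x[i]);  // "ffn_gate" + activation
        out[i] = w_down * (gate * up);           // "ffn_gate_par" + "ffn_down"
    }
    printf("out[0]=%f\n", out[0]);
    return 0;
}

Note the gate and up branches both read the same input, which is why the code computes tmp (up) first and only then overwrites cur with the gate branch.
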
1570
|
+
ggml_tensor * build_attn(
|
|
1571
|
+
ggml_tensor * wo,
|
|
1572
|
+
ggml_tensor * wo_b,
|
|
1573
|
+
ggml_tensor * q_cur,
|
|
1574
|
+
ggml_tensor * k_cur,
|
|
1575
|
+
ggml_tensor * v_cur,
|
|
1576
|
+
ggml_tensor * kq_mask,
|
|
1577
|
+
float kq_scale,
|
|
1578
|
+
int il) const {
|
|
1579
|
+
// these nodes are added to the graph together so that they are not reordered
|
|
1580
|
+
// by doing so, the number of splits in the graph is reduced
|
|
1581
|
+
ggml_build_forward_expand(gf, q_cur);
|
|
1582
|
+
ggml_build_forward_expand(gf, k_cur);
|
|
1583
|
+
ggml_build_forward_expand(gf, v_cur);
|
|
1584
|
+
|
|
1585
|
+
ggml_tensor * q = ggml_permute(ctx0, q_cur, 0, 2, 1, 3);
|
|
1586
|
+
//cb(q, "q", il);
|
|
1587
|
+
|
|
1588
|
+
ggml_tensor * k = ggml_permute(ctx0, k_cur, 0, 2, 1, 3);
|
|
1589
|
+
//cb(k, "k", il);
|
|
1590
|
+
|
|
1591
|
+
ggml_tensor * v = ggml_permute(ctx0, v_cur, 1, 2, 0, 3);
|
|
1592
|
+
v = ggml_cont(ctx0, v);
|
|
1593
|
+
//cb(k, "v", il);
|
|
1594
|
+
|
|
1595
|
+
ggml_tensor * cur;
|
|
1596
|
+
|
|
1597
|
+
// TODO @ngxson : support flash attention
|
|
1598
|
+
{
|
|
1599
|
+
const auto n_tokens = q->ne[1];
|
|
1600
|
+
const auto n_head = q->ne[2];
|
|
1601
|
+
// const auto n_kv = k->ne[1]; // for flash attention
|
|
1602
|
+
|
|
1603
|
+
ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
|
|
1604
|
+
// F32 may not needed for vision encoders?
|
|
1605
|
+
// ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
|
|
1606
|
+
|
|
1607
|
+
kq = ggml_soft_max_ext(ctx0, kq, kq_mask, kq_scale, 0.0f);
|
|
1608
|
+
|
|
1609
|
+
ggml_tensor * kqv = ggml_mul_mat(ctx0, v, kq);
|
|
1610
|
+
cur = ggml_permute(ctx0, kqv, 0, 2, 1, 3);
|
|
1611
|
+
cur = ggml_cont_2d(ctx0, cur, cur->ne[0]*n_head, n_tokens);
|
|
1453
1612
|
}
|
|
1454
1613
|
|
|
1455
|
-
|
|
1456
|
-
int hidden_size = clip_n_mmproj_embd(ctx);
|
|
1457
|
-
const int d_head = 128;
|
|
1458
|
-
int n_head = hidden_size/d_head;
|
|
1459
|
-
int num_query = 96;
|
|
1460
|
-
if (ctx->minicpmv_version == 2) {
|
|
1461
|
-
num_query = 96;
|
|
1462
|
-
}
|
|
1463
|
-
else if (ctx->minicpmv_version == 3) {
|
|
1464
|
-
num_query = 64;
|
|
1465
|
-
}
|
|
1466
|
-
else if (ctx->minicpmv_version == 4) {
|
|
1467
|
-
num_query = 64;
|
|
1468
|
-
}
|
|
1614
|
+
cb(cur, "kqv_out", il);
|
|
1469
1615
|
|
|
1470
|
-
|
|
1471
|
-
|
|
1472
|
-
struct ggml_tensor * V = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_v_w, v), model.mm_model_attn_v_b);
|
|
1473
|
-
// permute
|
|
1474
|
-
Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, num_query, batch_size);
|
|
1475
|
-
Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3));
|
|
1476
|
-
Q = ggml_reshape_3d(ctx0, Q, d_head, num_query, n_head * batch_size);
|
|
1477
|
-
K = ggml_reshape_4d(ctx0, K, d_head, n_head, num_positions, batch_size);
|
|
1478
|
-
K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3));
|
|
1479
|
-
K = ggml_reshape_3d(ctx0, K, d_head, num_positions, n_head * batch_size);
|
|
1480
|
-
V = ggml_reshape_4d(ctx0, V, d_head, n_head, num_positions, batch_size);
|
|
1481
|
-
V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3));
|
|
1482
|
-
V = ggml_reshape_3d(ctx0, V, num_positions, d_head, n_head * batch_size);
|
|
1483
|
-
struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
|
|
1484
|
-
KQ = ggml_soft_max_ext(ctx0, KQ, nullptr, 1.0f / sqrtf((float)d_head), 0.0f);
|
|
1485
|
-
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ);
|
|
1486
|
-
KQV = ggml_reshape_4d(ctx0, KQV, d_head, num_query, n_head, batch_size);
|
|
1487
|
-
KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
|
|
1488
|
-
KQV = ggml_cont_3d(ctx0, KQV, hidden_size, num_query, batch_size);
|
|
1489
|
-
|
|
1490
|
-
embeddings = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_o_w, KQV), model.mm_model_attn_o_b);
|
|
1491
|
-
}
|
|
1492
|
-
{ // layernorm
|
|
1493
|
-
embeddings = ggml_norm(ctx0, embeddings, eps);
|
|
1494
|
-
embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_model_ln_post_w), model.mm_model_ln_post_b);
|
|
1616
|
+
if (wo) {
|
|
1617
|
+
cur = ggml_mul_mat(ctx0, wo, cur);
|
|
1495
1618
|
}
|
|
1496
|
-
embeddings = ggml_mul_mat(ctx0, model.mm_model_proj, embeddings);
|
|
1497
|
-
}
|
|
1498
1619
|
|
|
1499
|
-
|
|
1500
|
-
|
|
1501
|
-
size_t gridsz = (size_t)sqrt(embeddings->ne[1]);
|
|
1502
|
-
embeddings = ggml_cont(ctx0, ggml_permute(ctx0,embeddings,1,0,2,3));
|
|
1503
|
-
embeddings = ggml_reshape_3d(ctx0, embeddings, gridsz, gridsz, embeddings->ne[1]);
|
|
1504
|
-
embeddings = ggml_conv_2d(ctx0, model.mm_model_adapter_conv_w, embeddings, 2, 2, 0, 0, 1, 1);
|
|
1505
|
-
embeddings = ggml_reshape_3d(ctx0, embeddings,embeddings->ne[0]*embeddings->ne[1] , embeddings->ne[2], batch_size);
|
|
1506
|
-
embeddings = ggml_cont(ctx0, ggml_permute(ctx0,embeddings, 1, 0, 2, 3));
|
|
1507
|
-
embeddings = ggml_add(ctx0, embeddings, model.mm_model_adapter_conv_b);
|
|
1508
|
-
// GLU
|
|
1509
|
-
{
|
|
1510
|
-
embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_0_w, embeddings);
|
|
1511
|
-
embeddings = ggml_norm(ctx0, embeddings, eps);
|
|
1512
|
-
embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_model_ln_q_w), model.mm_model_ln_q_b);
|
|
1513
|
-
embeddings = ggml_gelu_inplace(ctx0, embeddings);
|
|
1514
|
-
struct ggml_tensor * x = embeddings;
|
|
1515
|
-
embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, embeddings);
|
|
1516
|
-
x = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w,x);
|
|
1517
|
-
embeddings = ggml_silu_inplace(ctx0, embeddings);
|
|
1518
|
-
embeddings = ggml_mul(ctx0, embeddings,x);
|
|
1519
|
-
embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_3_w, embeddings);
|
|
1620
|
+
if (wo_b) {
|
|
1621
|
+
cur = ggml_add(ctx0, cur, wo_b);
|
|
1520
1622
|
}
|
|
1623
|
+
|
|
1624
|
+
return cur;
|
|
1521
1625
|
}
|
|
1522
1626
|
|
|
-
-
+    // implementation of the 2D RoPE without adding a new op in ggml
+    // this is not efficient (use double the memory), but works on all backends
+    // TODO: there was a more efficient which relies on ggml_view and ggml_rope_ext_inplace, but the rope inplace does not work well with non-contiguous tensors ; we should fix that and revert back to the original implementation in https://github.com/ggml-org/llama.cpp/pull/13065
+    static ggml_tensor * build_rope_2d(
+        ggml_context * ctx0,
+        ggml_tensor * cur,
+        ggml_tensor * pos_h,
+        ggml_tensor * pos_w,
+        const float freq_base
+    ) {
+        const int64_t n_dim  = cur->ne[0];
+        const int64_t n_head = cur->ne[1];
+        const int64_t n_pos  = cur->ne[2];
 
-
-
+        // for example, if we have cur tensor of shape (n_dim=8, n_head, n_pos)
+        // we will have a list of 4 inv_freq: 1e-0, 1e-1, 1e-2, 1e-3
+        // first half of cur will use 1e-0, 1e-2 (even)
+        // second half of cur will use 1e-1, 1e-3 (odd)
+        // the trick here is to rotate just half of n_dim, so inv_freq will automatically be even
+        //  ^ don't ask me why, it's math! -2(2i) / n_dim == -2i / (n_dim/2)
+        // then for the second half, we use freq_scale to shift the inv_freq
+        //  ^ why? replace (2i) with (2i+1) in the above equation
+        const float freq_scale_odd = std::pow(freq_base, (float)-2/n_dim);
 
-    //
-
-
-
-
-
+        // first half
+        ggml_tensor * first;
+        {
+            first = ggml_view_3d(ctx0, cur,
+                n_dim/2, n_head, n_pos,
+                ggml_row_size(cur->type, n_dim),
+                ggml_row_size(cur->type, n_dim*n_head),
+                0);
+            first = ggml_rope_ext(
+                ctx0,
+                first,
+                pos_h,      // positions
+                nullptr,    // freq factors
+                n_dim/2,    // n_dims
+                0, 0, freq_base,
+                1.0f, 0.0f, 1.0f, 0.0f, 0.0f
+            );
+        }
+
+        // second half
+        ggml_tensor * second;
+        {
+            second = ggml_view_3d(ctx0, cur,
+                n_dim/2, n_head, n_pos,
+                ggml_row_size(cur->type, n_dim),
+                ggml_row_size(cur->type, n_dim*n_head),
+                n_dim/2 * ggml_element_size(cur));
+            second = ggml_cont(ctx0, second); // copy, because ggml_rope don't play well with non-contiguous tensors
+            second = ggml_rope_ext(
+                ctx0,
+                second,
+                pos_w,      // positions
+                nullptr,    // freq factors
+                n_dim/2,    // n_dims
+                0, 0, freq_base,
+                freq_scale_odd,
+                0.0f, 1.0f, 0.0f, 0.0f
+            );
+        }
+
+        cur = ggml_concat(ctx0, first, second, 0);
+        return cur;
     }
 
-
-    ggml_build_forward_expand(gf, embeddings);
+};
 
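The `freq_scale_odd` trick above is easier to see as algebra on the standard RoPE frequency ladder. A short derivation, not part of the diff, with `b` standing for `freq_base` and `d` for `n_dim`:

```latex
% standard RoPE inverse frequencies for a head of width d:
\theta_i = b^{-2i/d}, \qquad i = 0, \dots, \tfrac{d}{2} - 1
% even indices i = 2j reproduce exactly the ladder of a RoPE of width d/2:
\theta_{2j} = b^{-2(2j)/d} = b^{-2j/(d/2)}
% odd indices i = 2j+1 are that same ladder times one constant factor:
\theta_{2j+1} = b^{-2(2j+1)/d} = b^{-2/d} \cdot b^{-2j/(d/2)}
```

That constant `b^{-2/d}` is precisely `std::pow(freq_base, (float)-2/n_dim)`, so rotating each half with `n_dims = n_dim/2` and scaling the second half's frequencies by `freq_scale_odd` reproduces the full even/odd interleaving without a new ggml op.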
-
-
+static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch & imgs) {
+    GGML_ASSERT(imgs.entries.size() == 1 && "n_batch > 1 is not supported");
+    clip_graph graph(ctx, *imgs.entries[0]);
 
-static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch & imgs, struct clip_image_size load_image_size, bool is_inf = false) {
     ggml_cgraph * res;
+
     switch (ctx->proj_type) {
         case PROJECTOR_TYPE_GEMMA3:
         case PROJECTOR_TYPE_IDEFICS3:
             {
-
-                res = clip_image_build_graph_siglip(ctx, *imgs.entries[0]);
+                res = graph.build_siglip();
             } break;
         case PROJECTOR_TYPE_PIXTRAL:
             {
-
-                res = clip_image_build_graph_pixtral(ctx, *imgs.entries[0]);
+                res = graph.build_pixtral();
             } break;
+        case PROJECTOR_TYPE_QWEN2VL:
         case PROJECTOR_TYPE_QWEN25VL:
             {
-                res = …
+                res = graph.build_qwen2vl();
+            } break;
+        case PROJECTOR_TYPE_MINICPMV:
+            {
+                res = graph.build_minicpmv();
+            } break;
+        case PROJECTOR_TYPE_INTERNVL:
+            {
+                res = graph.build_internvl();
             } break;
         default:
             {
-
-                res = clip_image_build_graph_legacy(ctx, imgs, load_image_size, is_inf);
+                res = graph.build_llava();
             } break;
     }
     return res;
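The dispatcher now routes every projector through a single `clip_graph` builder object instead of per-model free functions. A rough sketch of the shape this implies; only the method names above come from the diff, the members and constructor are assumptions:

```cpp
struct clip_graph {
    clip_ctx             * ctx; // weights + hparams (assumed member)
    const clip_image_f32 & img; // single image; batch size 1 is asserted by the caller

    clip_graph(clip_ctx * ctx, const clip_image_f32 & img) : ctx(ctx), img(img) {}

    // one builder per projector family, all sharing the same state:
    ggml_cgraph * build_siglip();   // GEMMA3, IDEFICS3
    ggml_cgraph * build_pixtral();  // PIXTRAL
    ggml_cgraph * build_qwen2vl();  // QWEN2VL, QWEN25VL
    ggml_cgraph * build_minicpmv(); // MINICPMV
    ggml_cgraph * build_internvl(); // INTERNVL
    ggml_cgraph * build_llava();    // legacy fallback for everything else
};
```

This also explains why the old `load_image_size` and `is_inf` parameters disappear from the signature: state the per-model builders need can live on the object instead of being threaded through every call.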
@@ -1615,7 +1779,7 @@ struct clip_model_loader {
             const char * name = gguf_get_tensor_name(ctx_gguf.get(), i);
             const size_t offset = gguf_get_tensor_offset(ctx_gguf.get(), i);
             enum ggml_type type = gguf_get_tensor_type(ctx_gguf.get(), i);
-
+            ggml_tensor * cur = ggml_get_tensor(meta, name);
             size_t tensor_size = ggml_nbytes(cur);
             model_size += tensor_size;
             LOG_DBG("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu, shape:[%" PRIu64 ", %" PRIu64 ", %" PRIu64 ", %" PRIu64 "], type = %s\n",
@@ -1626,6 +1790,7 @@ struct clip_model_loader {
 
     void load_hparams() {
         auto & hparams = ctx_clip.vision_model.hparams;
+        std::string log_ffn_op; // for logging
 
         // projector type
         std::string proj_type;
@@ -1641,14 +1806,11 @@ struct clip_model_loader {
 
         // other hparams
         {
-            get_i32(KEY_MINICPMV_VERSION, ctx_clip.minicpmv_version, false);
+            get_i32(KEY_MINICPMV_VERSION, ctx_clip.minicpmv_version, false); // legacy
 
-
-            get_bool(KEY_USE_SILU, ctx_clip.use_silu, false);
-
-            get_u32(KEY_N_EMBD, hparams.hidden_size);
+            get_u32(KEY_N_EMBD, hparams.n_embd);
             get_u32(KEY_N_HEAD, hparams.n_head);
-            get_u32(KEY_N_FF, hparams. …
+            get_u32(KEY_N_FF, hparams.n_ff);
             get_u32(KEY_N_BLOCK, hparams.n_layer);
             get_u32(KEY_PROJ_DIM, hparams.projection_dim);
             get_f32(KEY_LAYER_NORM_EPS, hparams.eps);
@@ -1657,11 +1819,34 @@ struct clip_model_loader {
             get_u32(KEY_IMAGE_CROP_RESOLUTION, hparams.image_crop_resolution, false);
             get_arr_int(KEY_IMAGE_GRID_PINPOINTS, hparams.image_grid_pinpoints, false);
 
+            // default warmup value
+            hparams.warmup_image_size = hparams.image_size;
+
             ctx_clip.has_llava_projector = ctx_clip.proj_type == PROJECTOR_TYPE_MLP
                                         || ctx_clip.proj_type == PROJECTOR_TYPE_MLP_NORM
                                         || ctx_clip.proj_type == PROJECTOR_TYPE_LDP
                                         || ctx_clip.proj_type == PROJECTOR_TYPE_LDPV2;
 
+            {
+                bool use_gelu = false;
+                bool use_silu = false;
+                get_bool(KEY_USE_GELU, use_gelu, false);
+                get_bool(KEY_USE_SILU, use_silu, false);
+                if (use_gelu && use_silu) {
+                    throw std::runtime_error(string_format("%s: both use_gelu and use_silu are set to true\n", __func__));
+                }
+                if (use_gelu) {
+                    hparams.ffn_op = FFN_GELU;
+                    log_ffn_op = "gelu";
+                } else if (use_silu) {
+                    hparams.ffn_op = FFN_SILU;
+                    log_ffn_op = "silu";
+                } else {
+                    hparams.ffn_op = FFN_GELU_QUICK;
+                    log_ffn_op = "gelu_quick";
+                }
+            }
+
             {
                 std::string mm_patch_merge_type;
                 get_string(KEY_MM_PATCH_MERGE_TYPE, mm_patch_merge_type, false);
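The three-way default (`gelu_quick` when neither flag is set) replaces the old single `use_silu` bool on the context. A minimal sketch of how a graph builder can consume `hparams.ffn_op`; the enum values mirror the ones introduced in this diff, the `apply_ffn_op` helper itself is hypothetical, and the three ggml activation ops are real:

```cpp
#include "ggml.h"

// mirrors the enum this diff introduces (definition assumed here)
enum ffn_op_type { FFN_GELU, FFN_SILU, FFN_GELU_QUICK };

static ggml_tensor * apply_ffn_op(ggml_context * ctx0, ggml_tensor * cur, ffn_op_type ffn_op) {
    switch (ffn_op) {
        case FFN_GELU:       return ggml_gelu(ctx0, cur);        // use_gelu == true
        case FFN_SILU:       return ggml_silu(ctx0, cur);        // use_silu == true
        case FFN_GELU_QUICK: return ggml_gelu_quick(ctx0, cur);  // neither flag set
    }
    GGML_ABORT("unknown ffn_op");
}
```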
@@ -1695,30 +1880,6 @@ struct clip_model_loader {
                     hparams.vision_feature_layer.insert(layer);
                 }
 
-            // Calculate the deepest feature layer based on hparams and projector type
-            // NOTE: This is only used by build_graph_legacy()
-            {
-                // Get the index of the second to last layer; this is the default for models that have a llava projector
-                int n_layer = hparams.n_layer - 1;
-                int deepest_feature_layer = -1;
-
-                if (ctx_clip.proj_type == PROJECTOR_TYPE_MINICPMV
-                    || ctx_clip.proj_type == PROJECTOR_TYPE_GLM_EDGE
-                    || ctx_clip.proj_type == PROJECTOR_TYPE_QWEN2VL
-                    || ctx_clip.proj_type == PROJECTOR_TYPE_QWEN25VL) {
-                    n_layer += 1;
-                }
-
-                // If we set explicit vision feature layers, only go up to the deepest one
-                // NOTE: only used by granite-vision models for now
-                for (const auto & feature_layer : hparams.vision_feature_layer) {
-                    if (feature_layer > deepest_feature_layer) {
-                        deepest_feature_layer = feature_layer;
-                    }
-                }
-                ctx_clip.max_feature_layer = deepest_feature_layer < 0 ? n_layer : deepest_feature_layer;
-            }
-
             // model-specific params
             switch (ctx_clip.proj_type) {
                 case PROJECTOR_TYPE_MINICPMV:
@@ -1728,15 +1889,41 @@ struct clip_model_loader {
                     }
                 } break;
             case PROJECTOR_TYPE_IDEFICS3:
+            case PROJECTOR_TYPE_INTERNVL:
                 {
                     get_u32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor, false);
                 } break;
             case PROJECTOR_TYPE_PIXTRAL:
                 {
                     hparams.rope_theta = 10000.0f;
+                    hparams.warmup_image_size = hparams.patch_size * 8;
+                    get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.spatial_merge_size, false);
+                } break;
+            case PROJECTOR_TYPE_GEMMA3:
+                {
+                    // default value (used by all model sizes in gemma 3 family)
+                    // number of patches for each **side** is reduced by a factor of 4
+                    hparams.proj_scale_factor = 4;
+                    // test model (tinygemma3) has a different value, we optionally read it
+                    get_u32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor, false);
+                } break;
+            case PROJECTOR_TYPE_QWEN2VL:
+                {
+                    // max image size = sqrt(max_pixels) = 3584
+                    // ref: https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct/blob/main/preprocessor_config.json
+                    // however, the model use unreasonable memory past 1024 size, we force it to 1024 otherwise it's unusable
+                    // ref: https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct/discussions/10
+                    hparams.image_size = 1024;
+                    hparams.warmup_image_size = hparams.patch_size * 8;
                 } break;
             case PROJECTOR_TYPE_QWEN25VL:
                 {
+                    // max image size = sqrt(max_pixels)
+                    // https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct/blob/main/preprocessor_config.json
+                    // however, the model use unreasonable memory past 1024 size, we force it to 1024 otherwise it's unusable
+                    // ref: https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct/discussions/10
+                    hparams.image_size = 1024;
+                    hparams.warmup_image_size = hparams.patch_size * 8;
                     get_u32(KEY_WIN_ATTN_PATTERN, hparams.n_wa_pattern);
                 } break;
             default:
@@ -1744,18 +1931,26 @@ struct clip_model_loader {
         }
 
         LOG_INF("%s: projector: %s\n", __func__, proj_type.c_str());
+        LOG_INF("%s: n_embd: %d\n", __func__, hparams.n_embd);
+        LOG_INF("%s: n_head: %d\n", __func__, hparams.n_head);
+        LOG_INF("%s: n_ff: %d\n", __func__, hparams.n_ff);
+        LOG_INF("%s: n_layer: %d\n", __func__, hparams.n_layer);
+        LOG_INF("%s: projection_dim: %d\n", __func__, hparams.projection_dim);
+        LOG_INF("%s: image_size: %d\n", __func__, hparams.image_size);
+        LOG_INF("%s: patch_size: %d\n", __func__, hparams.patch_size);
+        LOG_INF("\n");
         LOG_INF("%s: has_llava_proj: %d\n", __func__, ctx_clip.has_llava_projector);
         LOG_INF("%s: minicpmv_version: %d\n", __func__, ctx_clip.minicpmv_version);
         LOG_INF("%s: proj_scale_factor: %d\n", __func__, hparams.proj_scale_factor);
         LOG_INF("%s: n_wa_pattern: %d\n", __func__, hparams.n_wa_pattern);
-        LOG_INF("%s: …
-        LOG_INF("%s: use_gelu: %d\n", __func__, ctx_clip.use_gelu);
+        LOG_INF("%s: ffn_op: %s\n", __func__, log_ffn_op.c_str());
         LOG_INF("%s: model size: %.2f MiB\n", __func__, model_size / 1024.0 / 1024.0);
         LOG_INF("%s: metadata size: %.2f MiB\n", __func__, ggml_get_mem_size(ctx_meta.get()) / 1024.0 / 1024.0);
         }
     }
 
     void load_tensors() {
+        auto & hparams = ctx_clip.vision_model.hparams;
         std::map<std::string, size_t> tensor_offset;
         std::vector<ggml_tensor *> tensors_to_load;
 
@@ -1778,14 +1973,14 @@ struct clip_model_loader {
 
         // helper function
         auto get_tensor = [&](const std::string & name, bool required = true) {
-
+            ggml_tensor * cur = ggml_get_tensor(ctx_meta.get(), name.c_str());
             if (!cur && required) {
                 throw std::runtime_error(string_format("%s: unable to find tensor %s\n", __func__, name.c_str()));
             }
             if (cur) {
                 tensors_to_load.push_back(cur);
                 // add tensors to context
-
+                ggml_tensor * data_tensor = ggml_dup_tensor(ctx_clip.ctx_data.get(), cur);
                 ggml_set_name(data_tensor, cur->name);
                 cur = data_tensor;
             }
@@ -1809,15 +2004,20 @@ struct clip_model_loader {
         vision_model.position_embeddings = get_tensor(string_format(TN_POS_EMBD, "v"), false);
 
         // layers
-        vision_model.layers.resize( …
-        for (int il = 0; il < …
+        vision_model.layers.resize(hparams.n_layer);
+        for (int il = 0; il < hparams.n_layer; ++il) {
             auto & layer = vision_model.layers[il];
             layer.k_w = get_tensor(string_format(TN_ATTN_K, "v", il, "weight"));
             layer.q_w = get_tensor(string_format(TN_ATTN_Q, "v", il, "weight"));
             layer.v_w = get_tensor(string_format(TN_ATTN_V, "v", il, "weight"));
             layer.o_w = get_tensor(string_format(TN_ATTN_OUTPUT, "v", il, "weight"));
+            layer.k_norm = get_tensor(string_format(TN_ATTN_K_NORM, "v", il, "weight"), false);
+            layer.q_norm = get_tensor(string_format(TN_ATTN_Q_NORM, "v", il, "weight"), false);
             layer.ln_1_w = get_tensor(string_format(TN_LN_1, "v", il, "weight"), false);
             layer.ln_2_w = get_tensor(string_format(TN_LN_2, "v", il, "weight"), false);
+            layer.ls_1_w = get_tensor(string_format(TN_LS_1, "v", il, "weight"), false); // no bias
+            layer.ls_2_w = get_tensor(string_format(TN_LS_2, "v", il, "weight"), false); // no bias
+
             layer.k_b = get_tensor(string_format(TN_ATTN_K, "v", il, "bias"), false);
             layer.q_b = get_tensor(string_format(TN_ATTN_Q, "v", il, "bias"), false);
             layer.v_b = get_tensor(string_format(TN_ATTN_V, "v", il, "bias"), false);
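The new optional per-layer tensors cover InternVL-style blocks: `k_norm`/`q_norm` are normalization weights applied to the attention heads, and `ls_1_w`/`ls_2_w` are LayerScale vectors multiplied onto the two residual branches. A hedged sketch of where they would act in a graph builder; the surrounding code is not part of this hunk, and the exact placement is an assumption:

```cpp
// inside an attention block, after the Q/K projections (sketch):
if (layer.q_norm) { Qcur = ggml_mul(ctx0, ggml_norm(ctx0, Qcur, eps), layer.q_norm); }
if (layer.k_norm) { Kcur = ggml_mul(ctx0, ggml_norm(ctx0, Kcur, eps), layer.k_norm); }
// LayerScale: element-wise scaling of each residual branch's output
if (layer.ls_1_w) { cur = ggml_mul(ctx0, cur, layer.ls_1_w); } // after attention
if (layer.ls_2_w) { cur = ggml_mul(ctx0, cur, layer.ls_2_w); } // after the FFN
```

Because every one of these is loaded with `required = false`, models without them (the common case) load unchanged.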
@@ -1825,7 +2025,7 @@ struct clip_model_loader {
             layer.ln_1_b = get_tensor(string_format(TN_LN_1, "v", il, "bias"), false);
             layer.ln_2_b = get_tensor(string_format(TN_LN_2, "v", il, "bias"), false);
 
-            // …
+            // ffn
             layer.ff_up_w = get_tensor(string_format(TN_FFN_UP, "v", il, "weight"));
             layer.ff_up_b = get_tensor(string_format(TN_FFN_UP, "v", il, "bias"), false);
             layer.ff_gate_w = get_tensor(string_format(TN_FFN_GATE, "v", il, "weight"), false);
@@ -1833,13 +2033,18 @@ struct clip_model_loader {
             layer.ff_down_w = get_tensor(string_format(TN_FFN_DOWN, "v", il, "weight"));
             layer.ff_down_b = get_tensor(string_format(TN_FFN_DOWN, "v", il, "bias"), false);
 
-            // legacy
-
-            layer. …
-
-
-
-
+            // some models already exported with legacy (incorrect) naming which is quite messy, let's fix it here
+            // note: Qwen model converted from the old surgery script has n_ff = 0, so we cannot use n_ff to check!
+            if (layer.ff_up_w && layer.ff_down_w && layer.ff_down_w->ne[0] == hparams.n_embd) {
+                // swap up and down weights
+                ggml_tensor * tmp = layer.ff_up_w;
+                layer.ff_up_w = layer.ff_down_w;
+                layer.ff_down_w = tmp;
+                // swap up and down biases
+                tmp = layer.ff_up_b;
+                layer.ff_up_b = layer.ff_down_b;
+                layer.ff_down_b = tmp;
+            }
         }
 
         switch (ctx_clip.proj_type) {
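The swap test relies on ggml's matmul convention: a weight tensor's `ne[0]` is the input width it consumes. A correct `ff_down` maps `n_ff -> n_embd`, so its `ne[0]` should equal `n_ff`; finding `n_embd` there means the exporter wrote the up and down projections into each other's slots. A standalone illustration with hypothetical sizes:

```cpp
#include <cstdint>
#include <cstdio>

int main() {
    const int64_t n_embd = 1024, n_ff = 4096; // hypothetical hparams
    // correct shapes, as ggml stores them: ne[0] = input dim, ne[1] = output dim
    //   ff_up:   n_embd -> n_ff   => ne = { n_embd, n_ff }
    //   ff_down: n_ff   -> n_embd => ne = { n_ff,   n_embd }
    int64_t ff_down_ne[2] = { n_embd, n_ff }; // mis-exported: this is really ff_up
    // the loader's check: a genuine ff_down must consume n_ff, not n_embd
    bool swapped = (ff_down_ne[0] == n_embd);
    std::printf("swapped naming detected: %s\n", swapped ? "yes" : "no");
    return 0;
}
```

This shape-based check works even for the Qwen conversions the comment mentions, where `n_ff` in the metadata is 0 and therefore useless for validation.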
@@ -1930,12 +2135,14 @@ struct clip_model_loader {
                 {
                     vision_model.mm_model_adapter_conv_w = get_tensor(string_format(TN_GLM_ADAPER_CONV, "weight"));
                     vision_model.mm_model_adapter_conv_b = get_tensor(string_format(TN_GLM_ADAPER_CONV, "bias"));
-                    vision_model.mm_model_mlp_0_w = get_tensor(string_format(TN_GLM_ADAPTER_LINEAR,"weight"));
-                    vision_model.mm_model_ln_q_w = get_tensor(string_format(TN_GLM_ADAPTER_NORM_1,"weight"));
-                    vision_model.mm_model_ln_q_b = get_tensor(string_format(TN_GLM_ADAPTER_NORM_1,"bias"));
-                    vision_model.mm_model_mlp_1_w = get_tensor(string_format(TN_GLM_ADAPTER_D_H_2_4H,"weight"));
-                    vision_model.mm_model_mlp_2_w = get_tensor(string_format(TN_GLM_ADAPTER_GATE,"weight"));
-                    vision_model.mm_model_mlp_3_w = get_tensor(string_format(TN_GLM_ADAPTER_D_4H_2_H,"weight"));
+                    vision_model.mm_model_mlp_0_w = get_tensor(string_format(TN_GLM_ADAPTER_LINEAR, "weight"));
+                    vision_model.mm_model_ln_q_w = get_tensor(string_format(TN_GLM_ADAPTER_NORM_1, "weight"));
+                    vision_model.mm_model_ln_q_b = get_tensor(string_format(TN_GLM_ADAPTER_NORM_1, "bias"));
+                    vision_model.mm_model_mlp_1_w = get_tensor(string_format(TN_GLM_ADAPTER_D_H_2_4H, "weight"));
+                    vision_model.mm_model_mlp_2_w = get_tensor(string_format(TN_GLM_ADAPTER_GATE, "weight"));
+                    vision_model.mm_model_mlp_3_w = get_tensor(string_format(TN_GLM_ADAPTER_D_4H_2_H, "weight"));
+                    vision_model.mm_glm_tok_boi = get_tensor(string_format(TN_TOK_GLM_BOI, "weight"));
+                    vision_model.mm_glm_tok_eoi = get_tensor(string_format(TN_TOK_GLM_EOI, "weight"));
                 } break;
             case PROJECTOR_TYPE_QWEN2VL:
             case PROJECTOR_TYPE_QWEN25VL:
@@ -1957,11 +2164,23 @@ struct clip_model_loader {
             case PROJECTOR_TYPE_PIXTRAL:
                 {
                     vision_model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight"));
-                    vision_model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"));
+                    vision_model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"), false);
                     vision_model.mm_2_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
-                    vision_model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
+                    vision_model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"), false);
                     // [IMG_BREAK] token embedding
                     vision_model.token_embd_img_break = get_tensor(TN_TOK_IMG_BREAK);
+                    // for mistral small 3.1
+                    vision_model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM, false);
+                    vision_model.mm_patch_merger_w = get_tensor(TN_MM_PATCH_MERGER, false);
+                } break;
+            case PROJECTOR_TYPE_INTERNVL:
+                {
+                    vision_model.mm_0_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 0, "weight"));
+                    vision_model.mm_0_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 0, "bias"));
+                    vision_model.mm_1_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 1, "weight"));
+                    vision_model.mm_1_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 1, "bias"));
+                    vision_model.mm_3_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 3, "weight"));
+                    vision_model.mm_3_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 3, "bias"));
                 } break;
             default:
                 GGML_ASSERT(false && "unknown projector type");
@@ -1981,7 +2200,7 @@ struct clip_model_loader {
         ctx_clip.buf.reset(ggml_backend_alloc_ctx_tensors_from_buft(ctx_clip.ctx_data.get(), buft));
         ggml_backend_buffer_set_usage(ctx_clip.buf.get(), GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
         for (auto & t : tensors_to_load) {
-
+            ggml_tensor * cur = ggml_get_tensor(ctx_clip.ctx_data.get(), t->name);
             const size_t offset = tensor_offset[t->name];
             fin.seekg(offset, std::ios::beg);
             if (!fin) {
@@ -2010,16 +2229,14 @@ struct clip_model_loader {
         // create a fake batch
         clip_image_f32_batch batch;
         clip_image_f32_ptr img(clip_image_f32_init());
-
-
-
-        img->nx = image_size.width;
-        img->ny = image_size.height;
-        img->buf.resize(image_size.width * image_size.height * 3);
+        img->nx = ctx_clip.vision_model.hparams.warmup_image_size;
+        img->ny = ctx_clip.vision_model.hparams.warmup_image_size;
+        img->buf.resize(img->nx * img->ny * 3);
         batch.entries.push_back(std::move(img));
 
-        ggml_cgraph * gf = clip_image_build_graph(&ctx_clip, batch …
+        ggml_cgraph * gf = clip_image_build_graph(&ctx_clip, batch);
         ggml_backend_sched_reserve(ctx_clip.sched.get(), gf);
+
         for (size_t i = 0; i < ctx_clip.backend_ptrs.size(); ++i) {
             ggml_backend_t backend = ctx_clip.backend_ptrs[i];
             ggml_backend_buffer_type_t buft = ctx_clip.backend_buft[i];
@@ -2092,19 +2309,12 @@ struct clip_model_loader {
     }
 };
 
-// read and create ggml_context containing the tensors and their data
-struct clip_ctx * clip_model_load(const char * fname, const int verbosity) {
-    return clip_init(fname, clip_context_params{
-        /* use_gpu */   true,
-        /* verbosity */ static_cast<ggml_log_level>(verbosity),
-    });
-}
-
 struct clip_ctx * clip_init(const char * fname, struct clip_context_params ctx_params) {
     g_logger_state.verbosity_thold = ctx_params.verbosity;
-    clip_ctx * ctx_clip = …
+    clip_ctx * ctx_clip = nullptr;
 
     try {
+        ctx_clip = new clip_ctx(ctx_params);
         clip_model_loader loader(fname, *ctx_clip);
         loader.load_hparams();
         loader.load_tensors();
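With the `clip_model_load()` wrapper removed, callers build the params struct themselves. A minimal caller-side sketch based only on the fields visible in the removed wrapper; the model path is illustrative, and nullptr-on-failure is an assumption (the catch block sits outside this hunk):

```cpp
clip_context_params params {
    /* use_gpu   */ true,
    /* verbosity */ GGML_LOG_LEVEL_INFO,
};
clip_ctx * ctx = clip_init("mmproj.gguf", params); // path is illustrative
if (ctx == nullptr) {
    // loader threw; clip_init presumably returns nullptr after cleanup
}
```

Moving the `new clip_ctx(ctx_params)` inside the `try` also means a constructor failure is caught by the same error path as loader failures.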
@@ -2417,8 +2627,8 @@ struct image_manipulation {
         float target_width_f = static_cast<float>(inp_size.width) * scale;
         float target_height_f = static_cast<float>(inp_size.height) * scale;
 
-        int aligned_width = …
-        int aligned_height = …
+        int aligned_width = CLIP_ALIGN((int)target_width_f, align_size);
+        int aligned_height = CLIP_ALIGN((int)target_height_f, align_size);
 
         return {aligned_width, aligned_height};
     }
@@ -2516,7 +2726,7 @@ struct llava_uhd {
 
         // no pinpoints, dynamically calculate the grid size (e.g. minicpmv)
 
-        auto best_size = get_best_resize(original_size, slice_size, patch_size, has_slices);
+        auto best_size = get_best_resize(original_size, slice_size, patch_size, !has_slices);
         res.overview_size = best_size;
 
         if (!has_slices) {
@@ -2737,10 +2947,9 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
     }
     else if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type == PROJECTOR_TYPE_QWEN25VL) {
         clip_image_u8 resized;
-        auto patch_size = …
-
-
-        image_manipulation::bicubic_resize(*img, resized, nx, ny);
+        auto patch_size = params.patch_size * 2;
+        auto new_size = image_manipulation::calc_size_preserved_ratio(original_size, patch_size, params.image_size);
+        image_manipulation::bicubic_resize(*img, resized, new_size.width, new_size.height);
 
         clip_image_f32_ptr img_f32(clip_image_f32_init());
         // clip_image_f32_ptr res(clip_image_f32_init());
@@ -2751,7 +2960,9 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
     }
     else if (ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE
             || ctx->proj_type == PROJECTOR_TYPE_GEMMA3
-            || ctx->proj_type == PROJECTOR_TYPE_IDEFICS3 …
+            || ctx->proj_type == PROJECTOR_TYPE_IDEFICS3
+            || ctx->proj_type == PROJECTOR_TYPE_INTERNVL // TODO @ngxson : support dynamic resolution
+            ) {
         clip_image_u8 resized_image;
         int sz = params.image_size;
         image_manipulation::resize_and_pad_image(*img, resized_image, {sz, sz});
@@ -2848,7 +3059,7 @@ int32_t clip_get_patch_size(const struct clip_ctx * ctx) {
 }
 
 int32_t clip_get_hidden_size(const struct clip_ctx * ctx) {
-    return ctx->vision_model.hparams. …
+    return ctx->vision_model.hparams.n_embd;
 }
 
 const char * clip_patch_merge_type(const struct clip_ctx * ctx) {
@@ -2866,19 +3077,6 @@ size_t get_clip_image_grid_size(const struct clip_ctx * ctx) {
     return ctx->vision_model.hparams.image_grid_pinpoints.size();
 }
 
-// deprecated
-int clip_n_patches(const struct clip_ctx * ctx) {
-    clip_image_f32 img;
-    img.nx = ctx->vision_model.hparams.image_size;
-    img.ny = ctx->vision_model.hparams.image_size;
-    return clip_n_output_tokens(ctx, &img);
-}
-
-// deprecated
-int clip_n_patches_by_img(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
-    return clip_n_output_tokens(ctx, img);
-}
-
 int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
     const auto & params = ctx->vision_model.hparams;
     const int n_total = clip_n_output_tokens(ctx, img);
@@ -2901,8 +3099,13 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
 
     int n_patches = (params.image_size / params.patch_size) * (params.image_size / params.patch_size);
 
-    if (ctx->proj_type == PROJECTOR_TYPE_LDP …
+    if (ctx->proj_type == PROJECTOR_TYPE_LDP
+            || ctx->proj_type == PROJECTOR_TYPE_LDPV2
+            || ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE) {
         n_patches /= 4;
+        if (ctx->vision_model.mm_glm_tok_boi) {
+            n_patches += 2; // for BOI and EOI token embeddings
+        }
     } else if (ctx->proj_type == PROJECTOR_TYPE_MINICPMV) {
         if (ctx->minicpmv_version == 2) {
             n_patches = 96;
@@ -2922,12 +3125,16 @@
         int y_patch = img->ny / patch_size + (int)(img->ny % patch_size > 0);
         n_patches = x_patch * y_patch;
     } else if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) {
-
-
-        n_patches …
+        int n_per_side = params.image_size / params.patch_size;
+        int n_per_side_2d_pool = n_per_side / params.proj_scale_factor;
+        n_patches = n_per_side_2d_pool * n_per_side_2d_pool;
+    } else if (ctx->proj_type == PROJECTOR_TYPE_IDEFICS3 || ctx->proj_type == PROJECTOR_TYPE_INTERNVL) {
+        // both W and H are divided by proj_scale_factor
+        n_patches /= (params.proj_scale_factor * params.proj_scale_factor);
     } else if (ctx->proj_type == PROJECTOR_TYPE_PIXTRAL) {
-        int …
-        int …
+        int n_merge = params.spatial_merge_size;
+        int n_patches_x = img->nx / params.patch_size / (n_merge > 0 ? n_merge : 1);
+        int n_patches_y = img->ny / params.patch_size / (n_merge > 0 ? n_merge : 1);
         n_patches = n_patches_y*n_patches_x + n_patches_y - 1; // + one [IMG_BREAK] per row, except the last row
     }
 
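A quick numeric check of the new per-projector token math, with concrete but illustrative sizes (none of these numbers are read from a model file here):

```cpp
#include <cstdio>

int main() {
    // Gemma-3 style: image_size 896, patch_size 14, proj_scale_factor 4
    int n_per_side        = 896 / 14;                               // 64 patches per side
    int n_per_side_pooled = n_per_side / 4;                         // 16 after 2D pooling
    int gemma3_tokens     = n_per_side_pooled * n_per_side_pooled;  // 256

    // Pixtral style: 512x512 input, patch_size 16, spatial_merge_size 2
    int n_patches_x = 512 / 16 / 2;                                 // 16
    int n_patches_y = 512 / 16 / 2;                                 // 16
    int pixtral_tokens = n_patches_y * n_patches_x + n_patches_y - 1; // 271, incl. per-row [IMG_BREAK]s

    std::printf("gemma3: %d tokens, pixtral: %d tokens\n", gemma3_tokens, pixtral_tokens);
    return 0;
}
```

Note that only Pixtral's count depends on the actual image dimensions; the Gemma-3 path uses the fixed `image_size` from hparams, so every image yields the same token count after the pad-and-resize preprocessing above.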
@@ -3033,15 +3240,15 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
     const clip_image_f32_batch & imgs = *imgs_c_ptr;
     int batch_size = imgs.entries.size();
 
-
-
-
-
+    // TODO @ngxson : implement batch size > 1 as a loop
+    // we don't need true batching support because the cgraph will gonna be big anyway
+    if (batch_size != 1) {
+        return false; // only support batch size of 1
     }
 
     // build the inference graph
     ggml_backend_sched_reset(ctx->sched.get());
-    ggml_cgraph * gf = clip_image_build_graph(ctx, imgs …
+    ggml_cgraph * gf = clip_image_build_graph(ctx, imgs);
     ggml_backend_sched_alloc_graph(ctx->sched.get(), gf);
 
     // set inputs
@@ -3053,14 +3260,14 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
 
     const int patch_size = hparams.patch_size;
     const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
-    const int …
+    const int n_pos = num_patches + (model.class_embedding ? 1 : 0);
     const int pos_w = ctx->load_image_size.width / patch_size;
     const int pos_h = ctx->load_image_size.height / patch_size;
 
     const bool use_window_attn = hparams.n_wa_pattern > 0; // for qwen2.5vl
 
     auto get_inp_tensor = [&gf](const char * name) {
-
+        ggml_tensor * inp = ggml_graph_get_tensor(gf, name);
         if (inp == nullptr) {
             GGML_ABORT("Failed to get tensor %s", name);
         }
@@ -3169,7 +3376,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
         const int merge_ratio = 2;
         const int pw = image_size_width / patch_size;
         const int ph = image_size_height / patch_size;
-        std::vector<int> positions( …
+        std::vector<int> positions(n_pos * 4);
         int ptr = 0;
         for (int y = 0; y < ph; y += merge_ratio) {
             for (int x = 0; x < pw; x += merge_ratio) {
@@ -3246,7 +3453,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
         }
 
         const int mpow = merge_ratio * merge_ratio;
-        std::vector<int> positions( …
+        std::vector<int> positions(n_pos * 4);
 
         int ptr = 0;
         for (int y = 0; y < iph; y += merge_ratio) {
@@ -3272,14 +3479,14 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
             {
                 // set the 2D positions
                 int n_patches_per_col = image_size_width / patch_size;
-                std::vector<int> pos_data( …
+                std::vector<int> pos_data(n_pos);
                 // dimension H
-                for (int i = 0; i < …
+                for (int i = 0; i < n_pos; i++) {
                     pos_data[i] = i / n_patches_per_col;
                 }
                 set_input_i32("pos_h", pos_data);
                 // dimension W
-                for (int i = 0; i < …
+                for (int i = 0; i < n_pos; i++) {
                     pos_data[i] = i % n_patches_per_col;
                 }
                 set_input_i32("pos_w", pos_data);
@@ -3287,8 +3494,8 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
         case PROJECTOR_TYPE_GLM_EDGE:
             {
                 // llava and other models
-                std::vector<int32_t> positions( …
-                for (int i = 0; i < …
+                std::vector<int32_t> positions(n_pos);
+                for (int i = 0; i < n_pos; i++) {
                     positions[i] = i;
                 }
                 set_input_i32("positions", positions);
@@ -3299,8 +3506,8 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
         case PROJECTOR_TYPE_LDPV2:
             {
                 // llava and other models
-                std::vector<int32_t> positions( …
-                for (int i = 0; i < …
+                std::vector<int32_t> positions(n_pos);
+                for (int i = 0; i < n_pos; i++) {
                     positions[i] = i;
                 }
                 set_input_i32("positions", positions);
@@ -3317,6 +3524,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
             } break;
         case PROJECTOR_TYPE_GEMMA3:
         case PROJECTOR_TYPE_IDEFICS3:
+        case PROJECTOR_TYPE_INTERNVL:
             {
                 // do nothing
             } break;
@@ -3324,7 +3532,15 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
             GGML_ABORT("Unknown projector type");
     }
 
-    ggml_backend_cpu_set_n_threads(ctx->backend_cpu, n_threads);
+    // ggml_backend_cpu_set_n_threads(ctx->backend_cpu, n_threads);
+    ggml_backend_dev_t dev = ggml_backend_get_device(ctx->backend_cpu);
+    ggml_backend_reg_t reg = dev ? ggml_backend_dev_backend_reg(dev) : nullptr;
+    if (reg) {
+        auto ggml_backend_set_n_threads_fn = (ggml_backend_set_n_threads_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads");
+        if (ggml_backend_set_n_threads_fn) {
+            ggml_backend_set_n_threads_fn(ctx->backend_cpu, n_threads);
+        }
+    }
 
     auto status = ggml_backend_sched_graph_compute(ctx->sched.get(), gf);
     if (status != GGML_STATUS_SUCCESS) {
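The direct `ggml_backend_cpu_set_n_threads()` call only exists for the CPU backend proper, while the registry lookup degrades gracefully for any backend that exports the optional `"ggml_backend_set_n_threads"` entry point. The same probe works for any optional backend capability; a hedged sketch of the generic pattern, factored into a helper (the helper itself is hypothetical, the ggml-backend calls are the ones used in the diff):

```cpp
#include "ggml-backend.h"

// probe an optional capability instead of linking a backend-specific symbol
static void try_set_n_threads(ggml_backend_t backend, int n_threads) {
    ggml_backend_dev_t dev = ggml_backend_get_device(backend);
    ggml_backend_reg_t reg = dev ? ggml_backend_dev_backend_reg(dev) : nullptr;
    if (!reg) {
        return; // backend has no registry entry; nothing to do
    }
    auto fn = (ggml_backend_set_n_threads_t)
        ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads");
    if (fn) {
        fn(backend, n_threads); // backend supports the optional entry point
    }
}
```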
@@ -3333,145 +3549,18 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
     }
 
     // the last node is the embedding tensor
-
+    ggml_tensor * embeddings = ggml_graph_node(gf, -1);
 
-    // …
-
-
-
-
-
-bool clip_model_quantize(const char * fname_inp, const char * fname_out, const int itype) {
-    assert(itype < GGML_TYPE_COUNT);
-    ggml_type type = static_cast<ggml_type>(itype);
-
-    auto * ctx_clip = clip_init(fname_inp, clip_context_params{
-        /* use_gpu */   false,
-        /* verbosity */ GGML_LOG_LEVEL_ERROR,
-    });
-
-    const auto & ctx_src = ctx_clip->ctx_gguf.get();
-    const auto & ctx_data = ctx_clip->ctx_data.get();
-
-    auto * ctx_out = gguf_init_empty();
-    gguf_set_kv(ctx_out, ctx_src);
-    gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION);
-    gguf_set_val_u32(ctx_out, "general.file_type", itype);
-
-    auto fout = std::ofstream(fname_out, std::ios::binary);
-
-    const int n_tensors = gguf_get_n_tensors(ctx_src);
-
-    for (int i = 0; i < n_tensors; ++i) {
-        const char * name = gguf_get_tensor_name(ctx_src, i);
-        struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name);
-        gguf_add_tensor(ctx_out, cur);
+    // sanity check (only support batch size of 1 for now)
+    const int n_tokens_out = embeddings->ne[1];
+    const int expected_n_tokens_out = clip_n_output_tokens(ctx, imgs.entries[0].get());
+    if (n_tokens_out != expected_n_tokens_out) {
+        LOG_ERR("%s: expected %d tokens, got %d\n", __func__, expected_n_tokens_out, n_tokens_out);
+        GGML_ABORT("Invalid number of output tokens");
     }
 
-
-
-        fout.put(0);
-    }
-
-    // regexes of tensor names to be quantized
-    const std::vector<std::string> k_names = {
-        ".*weight",
-    };
-
-    std::vector<uint8_t> work(512);
-    std::vector<float> conv_buf(512);
-    size_t total_size_org = 0;
-    size_t total_size_new = 0;
-
-    for (int i = 0; i < n_tensors; ++i) {
-        const std::string name = gguf_get_tensor_name(ctx_src, i);
-        struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name.c_str());
-
-        enum ggml_type new_type;
-        void * new_data;
-        size_t new_size;
-
-        bool quantize = false;
-        for (const auto & s : k_names) {
-            if (std::regex_match(name, std::regex(s))) {
-                quantize = true;
-                break;
-            }
-        }
-
-        // quantize only 2D tensors and bigger than block size
-        quantize &= (ggml_n_dims(cur) == 2) && cur->ne[0] > ggml_blck_size(type);
-
-        if (quantize) {
-            new_type = type;
-            if (new_type >= GGML_TYPE_Q2_K && name.find("embd") != std::string::npos) {
-                new_type = GGML_TYPE_Q8_0; // ggml_get_rows needs non K type
-                // LOG_ERR("%s: quantizing %s to %s\n", __func__, name.c_str(), ggml_type_name(new_type));
-            }
-            const size_t n_elms = ggml_nelements(cur);
-            float * f32_data;
-
-            switch (cur->type) {
-                case GGML_TYPE_F32:
-                    f32_data = (float *)cur->data;
-                    break;
-                case GGML_TYPE_F16:
-                    if (conv_buf.size() < n_elms) {
-                        conv_buf.resize(n_elms);
-                    }
-                    for (size_t j = 0; j < n_elms; ++j) {
-                        conv_buf[j] = ggml_fp16_to_fp32(((ggml_fp16_t *)cur->data)[j]);
-                    }
-                    f32_data = (float *)conv_buf.data();
-                    break;
-                default:
-                    LOG_ERR("%s: Please use an input file in f32 or f16\n", __func__);
-                    gguf_free(ctx_out);
-                    return false;
-            }
-
-            if (work.size() < n_elms * 4) {
-                work.resize(n_elms * 4);
-            }
-            new_data = work.data();
-
-            new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, n_elms/cur->ne[0], cur->ne[0], nullptr);
-        } else {
-            new_type = cur->type;
-            new_data = cur->data;
-            new_size = ggml_nbytes(cur);
-        }
-        const size_t orig_size = ggml_nbytes(cur);
-        total_size_org += orig_size;
-        total_size_new += new_size;
-        gguf_set_tensor_type(ctx_out, name.c_str(), new_type);
-        GGML_ASSERT(gguf_get_tensor_size(ctx_out, gguf_find_tensor(ctx_out, name.c_str())) == new_size);
-        gguf_set_tensor_data(ctx_out, name.c_str(), new_data);
-        fout.write((const char *)new_data, new_size);
-        size_t pad = GGML_PAD(new_size, gguf_get_alignment(ctx_out)) - new_size;
-        for (size_t j = 0; j < pad; ++j) {
-            fout.put(0);
-        }
-
-        LOG_INF("%s: n_dims = %d | quantize=%d | size = %f MB -> %f MB\n", name.c_str(), ggml_n_dims(cur), quantize,
-                orig_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
-    }
-
-    // go back to beginning of file and write the updated metadata
-    fout.seekp(0, std::ios::beg);
-    std::vector<uint8_t> meta(meta_size);
-    gguf_get_meta_data(ctx_out, meta.data());
-    fout.write((const char *)meta.data(), meta_size);
-
-    fout.close();
-
-    clip_free(ctx_clip);
-    gguf_free(ctx_out);
-
-    {
-        LOG_INF("%s: original size  = %8.2f MB\n", __func__, total_size_org / 1024.0 / 1024.0);
-        LOG_INF("%s: quantized size = %8.2f MB\n", __func__, total_size_new / 1024.0 / 1024.0);
-    }
+    // copy the embeddings to the location passed by the user
+    ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings));
 
     return true;
 }
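The new sanity check makes the output contract explicit: the `vec` buffer handed to `clip_image_batch_encode` must hold exactly `clip_n_output_tokens(...) * clip_n_mmproj_embd(...)` floats. A hedged caller-side sketch; `batch`/`img` setup is elided and the thread count is arbitrary:

```cpp
#include <vector>

// size the output buffer from the two query functions before encoding
const int n_tokens = clip_n_output_tokens(ctx, img); // img: preprocessed clip_image_f32 *
const int n_embd   = clip_n_mmproj_embd(ctx);
std::vector<float> vec((size_t) n_tokens * n_embd);

if (!clip_image_batch_encode(ctx, /*n_threads=*/4, &batch, vec.data())) {
    // either batch_size != 1 (now rejected up front) or graph compute failed
}
```

Under-allocating was previously a silent buffer overrun in the final `ggml_backend_tensor_get`; with the token-count check, a mismatch between the graph output and `clip_n_output_tokens` now aborts loudly instead.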
@@ -3484,7 +3573,7 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
             return ctx->vision_model.mm_model_peg_0_b->ne[0];
         case PROJECTOR_TYPE_MLP:
         case PROJECTOR_TYPE_PIXTRAL:
-            return ctx->vision_model. …
+            return ctx->vision_model.mm_2_w->ne[1];
         case PROJECTOR_TYPE_MLP_NORM:
             return ctx->vision_model.mm_3_b->ne[0];
         case PROJECTOR_TYPE_MINICPMV:
@@ -3505,6 +3594,8 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
             return ctx->vision_model.mm_input_proj_w->ne[0];
         case PROJECTOR_TYPE_IDEFICS3:
             return ctx->vision_model.projection->ne[1];
+        case PROJECTOR_TYPE_INTERNVL:
+            return ctx->vision_model.mm_3_w->ne[1];
         default:
             GGML_ABORT("Unknown projector type");
     }