@fugood/llama.node 0.3.16 → 0.3.17
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +3 -0
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-cuda/arm64/llama-node.node +0 -0
- package/bin/linux-cuda/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +5 -0
- package/package.json +1 -1
- package/src/LlamaCompletionWorker.cpp +8 -0
- package/src/LlamaCompletionWorker.h +1 -0
- package/src/LlamaContext.cpp +3 -2
- package/src/llama.cpp/.github/workflows/build-linux-cross.yml +124 -0
- package/src/llama.cpp/.github/workflows/build.yml +70 -27
- package/src/llama.cpp/.github/workflows/docker.yml +6 -6
- package/src/llama.cpp/.github/workflows/server.yml +7 -11
- package/src/llama.cpp/CMakeLists.txt +23 -1
- package/src/llama.cpp/common/CMakeLists.txt +6 -3
- package/src/llama.cpp/common/arg.cpp +809 -105
- package/src/llama.cpp/common/arg.h +9 -0
- package/src/llama.cpp/common/chat.cpp +1 -1
- package/src/llama.cpp/common/common.cpp +31 -521
- package/src/llama.cpp/common/common.h +17 -36
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +3 -0
- package/src/llama.cpp/common/llguidance.cpp +30 -47
- package/src/llama.cpp/common/minja/chat-template.hpp +15 -7
- package/src/llama.cpp/common/minja/minja.hpp +119 -93
- package/src/llama.cpp/common/sampling.cpp +3 -0
- package/src/llama.cpp/docs/build.md +122 -7
- package/src/llama.cpp/examples/CMakeLists.txt +0 -9
- package/src/llama.cpp/examples/batched/batched.cpp +1 -1
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +1 -1
- package/src/llama.cpp/examples/embedding/embedding.cpp +7 -1
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +1 -1
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +15 -16
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +1 -1
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +210 -8
- package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
- package/src/llama.cpp/examples/llava/CMakeLists.txt +39 -24
- package/src/llama.cpp/examples/llava/clip-impl.h +345 -0
- package/src/llama.cpp/examples/llava/clip.cpp +2152 -1803
- package/src/llama.cpp/examples/llava/clip.h +39 -22
- package/src/llama.cpp/examples/llava/deprecation-warning.cpp +22 -0
- package/src/llama.cpp/examples/llava/llava.cpp +64 -52
- package/src/llama.cpp/examples/llava/mtmd-cli.cpp +344 -0
- package/src/llama.cpp/examples/llava/mtmd.cpp +708 -0
- package/src/llama.cpp/examples/llava/mtmd.h +168 -0
- package/src/llama.cpp/examples/llava/{qwen2vl-cli.cpp → qwen2vl-test.cpp} +83 -31
- package/src/llama.cpp/examples/main/main.cpp +16 -5
- package/src/llama.cpp/examples/parallel/parallel.cpp +3 -1
- package/src/llama.cpp/examples/passkey/passkey.cpp +1 -1
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +17 -3
- package/src/llama.cpp/examples/quantize/quantize.cpp +115 -2
- package/src/llama.cpp/examples/rpc/CMakeLists.txt +4 -2
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +163 -8
- package/src/llama.cpp/examples/run/CMakeLists.txt +12 -1
- package/src/llama.cpp/examples/run/run.cpp +14 -28
- package/src/llama.cpp/examples/server/httplib.h +313 -247
- package/src/llama.cpp/examples/server/server.cpp +238 -139
- package/src/llama.cpp/examples/server/utils.hpp +51 -2
- package/src/llama.cpp/examples/speculative/speculative.cpp +1 -1
- package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +1 -1
- package/src/llama.cpp/examples/sycl/build.sh +2 -2
- package/src/llama.cpp/examples/sycl/win-build-sycl.bat +2 -2
- package/src/llama.cpp/examples/tts/tts.cpp +6 -9
- package/src/llama.cpp/ggml/CMakeLists.txt +8 -2
- package/src/llama.cpp/ggml/cmake/GitVars.cmake +22 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +5 -0
- package/src/llama.cpp/ggml/include/ggml-rpc.h +6 -1
- package/src/llama.cpp/ggml/include/ggml.h +66 -99
- package/src/llama.cpp/ggml/src/CMakeLists.txt +10 -7
- package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +0 -2
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +8 -4
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +5 -5
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +692 -1534
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +613 -122
- package/src/llama.cpp/ggml/src/ggml-cann/common.h +135 -1
- package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +507 -137
- package/src/llama.cpp/ggml/src/ggml-common.h +12 -6
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +48 -22
- package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.cpp +158 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.h +16 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/common.h +72 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +896 -192
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +2 -21
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +754 -404
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1003 -13519
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +2 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +2 -7
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +0 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +3 -4
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +533 -88
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +8809 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +110 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +892 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +186 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +28 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +258 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +802 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +7 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +1 -0
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +0 -4
- package/src/llama.cpp/ggml/src/ggml-impl.h +52 -18
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +70 -3
- package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +67 -119
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1023 -260
- package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +293 -40
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +96 -22
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +350 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.hpp +39 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +0 -35
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +2 -292
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +79 -90
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +967 -438
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +22 -23
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +24 -20
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +1 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +204 -280
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +84 -74
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +1 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +37 -49
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +7 -22
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +4 -14
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +204 -118
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +1 -3
- package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +23 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +646 -114
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +12 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +17 -8
- package/src/llama.cpp/ggml/src/ggml.c +141 -245
- package/src/llama.cpp/ggml/src/gguf.cpp +1 -0
- package/src/llama.cpp/include/llama.h +30 -11
- package/src/llama.cpp/models/ggml-vocab-llama4.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-llama4.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.out +46 -0
- package/src/llama.cpp/requirements/requirements-all.txt +2 -0
- package/src/llama.cpp/requirements/requirements-gguf_editor_gui.txt +3 -0
- package/src/llama.cpp/src/CMakeLists.txt +3 -2
- package/src/llama.cpp/src/llama-adapter.cpp +37 -1
- package/src/llama.cpp/src/llama-arch.cpp +160 -17
- package/src/llama.cpp/src/llama-arch.h +16 -0
- package/src/llama.cpp/src/llama-chat.cpp +82 -17
- package/src/llama.cpp/src/llama-chat.h +6 -2
- package/src/llama.cpp/src/llama-context.cpp +108 -92
- package/src/llama.cpp/src/llama-context.h +1 -2
- package/src/llama.cpp/src/llama-graph.cpp +189 -119
- package/src/llama.cpp/src/llama-graph.h +26 -6
- package/src/llama.cpp/src/llama-hparams.h +13 -0
- package/src/llama.cpp/src/llama-kv-cache.cpp +70 -123
- package/src/llama.cpp/src/llama-kv-cache.h +41 -115
- package/src/llama.cpp/src/llama-memory.h +1 -1
- package/src/llama.cpp/src/llama-mmap.cpp +1 -1
- package/src/llama.cpp/src/llama-model-loader.cpp +10 -5
- package/src/llama.cpp/src/llama-model-loader.h +5 -3
- package/src/llama.cpp/src/llama-model.cpp +1760 -534
- package/src/llama.cpp/src/llama-model.h +13 -1
- package/src/llama.cpp/src/llama-quant.cpp +29 -8
- package/src/llama.cpp/src/llama-sampling.cpp +7 -1
- package/src/llama.cpp/src/llama-vocab.cpp +44 -6
- package/src/llama.cpp/src/llama.cpp +1 -1
- package/src/llama.cpp/tests/CMakeLists.txt +43 -30
- package/src/llama.cpp/tests/test-arg-parser.cpp +51 -4
- package/src/llama.cpp/tests/test-backend-ops.cpp +82 -43
- package/src/llama.cpp/tests/test-chat-template.cpp +34 -13
- package/src/llama.cpp/tests/test-chat.cpp +12 -2
- package/src/llama.cpp/{examples/gbnf-validator/gbnf-validator.cpp → tests/test-gbnf-validator.cpp} +2 -2
- package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -2
- package/src/llama.cpp/tests/test-grammar-llguidance.cpp +63 -2
- package/src/llama.cpp/tests/test-grammar-parser.cpp +3 -1
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -1
- package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -1
- package/src/llama.cpp/{examples/quantize-stats/quantize-stats.cpp → tests/test-quantize-stats.cpp} +3 -1
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +2 -1
- package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +2 -1
- package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/llava/gemma3-cli.cpp +0 -341
- package/src/llama.cpp/examples/llava/llava-cli.cpp +0 -332
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +0 -354
- package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +0 -6
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +0 -30
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +0 -19
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +0 -234
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +0 -197
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +0 -190
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -204
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -191
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -218
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -216
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -295
|
@@ -3,6 +3,7 @@
|
|
|
3
3
|
// I'll gradually clean and extend it
|
|
4
4
|
// Note: Even when using identical normalized image inputs (see normalize_image_u8_to_f32()) we have a significant difference in resulting embeddings compared to pytorch
|
|
5
5
|
#include "clip.h"
|
|
6
|
+
#include "clip-impl.h"
|
|
6
7
|
#include "ggml.h"
|
|
7
8
|
#include "ggml-cpp.h"
|
|
8
9
|
#include "ggml-cpu.h"
|
|
@@ -26,285 +27,13 @@
|
|
|
26
27
|
#include <sstream>
|
|
27
28
|
#include <cinttypes>
|
|
28
29
|
#include <limits>
|
|
30
|
+
#include <array>
|
|
31
|
+
#include <numeric>
|
|
29
32
|
|
|
30
|
-
|
|
31
|
-
# define LOG_INF(...)
|
|
32
|
-
# define LOG_WRN(...)
|
|
33
|
-
# define LOG_ERR(...)
|
|
34
|
-
# define LOG_DBG(...)
|
|
35
|
-
#else // defined(LLAVA_LOG_OFF)
|
|
36
|
-
# define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
|
|
37
|
-
# define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
|
|
38
|
-
# define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
|
|
39
|
-
# define LOG_DBG(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
|
|
40
|
-
#endif // defined(LLAVA_LOG_OFF)
|
|
33
|
+
struct clip_logger_state g_logger_state = {GGML_LOG_LEVEL_CONT, clip_log_callback_default, NULL};
|
|
41
34
|
|
|
42
35
|
//#define CLIP_DEBUG_FUNCTIONS
|
|
43
36
|
|
|
44
|
-
// RGB uint8 image
|
|
45
|
-
struct clip_image_u8 {
|
|
46
|
-
int nx;
|
|
47
|
-
int ny;
|
|
48
|
-
|
|
49
|
-
std::vector<uint8_t> buf;
|
|
50
|
-
};
|
|
51
|
-
|
|
52
|
-
// RGB float32 image (NHWC)
|
|
53
|
-
// Memory layout: RGBRGBRGB...
|
|
54
|
-
struct clip_image_f32 {
|
|
55
|
-
int nx;
|
|
56
|
-
int ny;
|
|
57
|
-
|
|
58
|
-
std::vector<float> buf;
|
|
59
|
-
};
|
|
60
|
-
|
|
61
|
-
static std::string format(const char * fmt, ...) {
|
|
62
|
-
va_list ap;
|
|
63
|
-
va_list ap2;
|
|
64
|
-
va_start(ap, fmt);
|
|
65
|
-
va_copy(ap2, ap);
|
|
66
|
-
int size = vsnprintf(NULL, 0, fmt, ap);
|
|
67
|
-
GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT
|
|
68
|
-
std::vector<char> buf(size + 1);
|
|
69
|
-
int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
|
|
70
|
-
GGML_ASSERT(size2 == size);
|
|
71
|
-
va_end(ap2);
|
|
72
|
-
va_end(ap);
|
|
73
|
-
return std::string(buf.data(), buf.size());
|
|
74
|
-
}
|
|
75
|
-
|
|
76
|
-
//
|
|
77
|
-
// key constants
|
|
78
|
-
//
|
|
79
|
-
|
|
80
|
-
#define KEY_FTYPE "general.file_type"
|
|
81
|
-
#define KEY_NAME "general.name"
|
|
82
|
-
#define KEY_DESCRIPTION "general.description"
|
|
83
|
-
#define KEY_HAS_TEXT_ENC "clip.has_text_encoder"
|
|
84
|
-
#define KEY_HAS_VIS_ENC "clip.has_vision_encoder"
|
|
85
|
-
#define KEY_HAS_LLAVA_PROJ "clip.has_llava_projector"
|
|
86
|
-
#define KEY_HAS_MINICPMV_PROJ "clip.has_minicpmv_projector"
|
|
87
|
-
#define KEY_HAS_GLM_PROJ "clip.has_glm_projector"
|
|
88
|
-
#define KEY_MINICPMV_VERSION "clip.minicpmv_version"
|
|
89
|
-
#define KEY_HAS_QWEN2VL_MERGER "clip.has_qwen2vl_merger"
|
|
90
|
-
#define KEY_USE_GELU "clip.use_gelu"
|
|
91
|
-
#define KEY_USE_SILU "clip.use_silu"
|
|
92
|
-
#define KEY_N_EMBD "clip.%s.embedding_length"
|
|
93
|
-
#define KEY_N_FF "clip.%s.feed_forward_length"
|
|
94
|
-
#define KEY_N_BLOCK "clip.%s.block_count"
|
|
95
|
-
#define KEY_N_HEAD "clip.%s.attention.head_count"
|
|
96
|
-
#define KEY_LAYER_NORM_EPS "clip.%s.attention.layer_norm_epsilon"
|
|
97
|
-
#define KEY_PROJ_DIM "clip.%s.projection_dim"
|
|
98
|
-
#define KEY_TOKENS "tokenizer.ggml.tokens"
|
|
99
|
-
#define KEY_N_POSITIONS "clip.text.context_length"
|
|
100
|
-
#define KEY_IMAGE_SIZE "clip.vision.image_size"
|
|
101
|
-
#define KEY_PATCH_SIZE "clip.vision.patch_size"
|
|
102
|
-
#define KEY_IMAGE_MEAN "clip.vision.image_mean"
|
|
103
|
-
#define KEY_IMAGE_STD "clip.vision.image_std"
|
|
104
|
-
#define KEY_PROJ_TYPE "clip.projector_type"
|
|
105
|
-
#define KEY_FEATURE_LAYER "clip.vision.feature_layer"
|
|
106
|
-
|
|
107
|
-
#define KEY_MM_PATCH_MERGE_TYPE "clip.vision.mm_patch_merge_type"
|
|
108
|
-
#define KEY_IMAGE_GRID_PINPOINTS "clip.vision.image_grid_pinpoints"
|
|
109
|
-
#define KEY_IMAGE_CROP_RESOLUTION "clip.vision.image_crop_resolution"
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
//
|
|
113
|
-
// tensor name constants
|
|
114
|
-
//
|
|
115
|
-
|
|
116
|
-
#define TN_TOKEN_EMBD "%s.token_embd.weight"
|
|
117
|
-
#define TN_POS_EMBD "%s.position_embd.weight"
|
|
118
|
-
#define TN_CLASS_EMBD "v.class_embd"
|
|
119
|
-
#define TN_PATCH_EMBD "v.patch_embd.weight" // not rename tensor with ".0" postfix for backwrad compat
|
|
120
|
-
#define TN_PATCH_EMBD_1 "v.patch_embd.weight.1"
|
|
121
|
-
#define TN_PATCH_BIAS "v.patch_embd.bias"
|
|
122
|
-
#define TN_ATTN_K "%s.blk.%d.attn_k.%s"
|
|
123
|
-
#define TN_ATTN_Q "%s.blk.%d.attn_q.%s"
|
|
124
|
-
#define TN_ATTN_V "%s.blk.%d.attn_v.%s"
|
|
125
|
-
#define TN_ATTN_OUTPUT "%s.blk.%d.attn_out.%s"
|
|
126
|
-
#define TN_FFN_DOWN "%s.blk.%d.ffn_down.%s"
|
|
127
|
-
#define TN_FFN_UP "%s.blk.%d.ffn_up.%s"
|
|
128
|
-
#define TN_LN_1 "%s.blk.%d.ln1.%s"
|
|
129
|
-
#define TN_LN_2 "%s.blk.%d.ln2.%s"
|
|
130
|
-
#define TN_LN_PRE "%s.pre_ln.%s"
|
|
131
|
-
#define TN_LN_POST "%s.post_ln.%s"
|
|
132
|
-
#define TN_TEXT_PROJ "text_projection.weight"
|
|
133
|
-
#define TN_VIS_PROJ "visual_projection.weight"
|
|
134
|
-
#define TN_LLAVA_PROJ "mm.%d.%s"
|
|
135
|
-
#define TN_MVLM_PROJ_MLP "mm.model.mlp.%d.%s"
|
|
136
|
-
#define TN_MVLM_PROJ_BLOCK "mm.model.mb_block.%d.block.%d.%s"
|
|
137
|
-
#define TN_MVLM_PROJ_PEG "mm.model.peg.%d.%s"
|
|
138
|
-
#define TN_IMAGE_NEWLINE "model.image_newline"
|
|
139
|
-
#define TN_MM_INP_PROJ "mm.input_projection.weight" // gemma3
|
|
140
|
-
#define TN_MM_SOFT_EMB_N "mm.soft_emb_norm.weight" // gemma3
|
|
141
|
-
|
|
142
|
-
#define TN_MINICPMV_POS_EMBD_K "resampler.pos_embed_k"
|
|
143
|
-
#define TN_MINICPMV_QUERY "resampler.query"
|
|
144
|
-
#define TN_MINICPMV_PROJ "resampler.proj.weight"
|
|
145
|
-
#define TN_MINICPMV_KV_PROJ "resampler.kv.weight"
|
|
146
|
-
#define TN_MINICPMV_ATTN "resampler.attn.%s.%s"
|
|
147
|
-
#define TN_MINICPMV_LN "resampler.ln_%s.%s"
|
|
148
|
-
|
|
149
|
-
#define TN_GLM_ADAPER_CONV "adapter.conv.%s"
|
|
150
|
-
#define TN_GLM_ADAPTER_LINEAR "adapter.linear.linear.%s"
|
|
151
|
-
#define TN_GLM_ADAPTER_NORM_1 "adapter.linear.norm1.%s"
|
|
152
|
-
#define TN_GLM_ADAPTER_D_H_2_4H "adapter.linear.dense_h_to_4h.%s"
|
|
153
|
-
#define TN_GLM_ADAPTER_GATE "adapter.linear.gate.%s"
|
|
154
|
-
#define TN_GLM_ADAPTER_D_4H_2_H "adapter.linear.dense_4h_to_h.%s"
|
|
155
|
-
#define TN_GLM_BOI_W "adapter.boi"
|
|
156
|
-
#define TN_GLM_EOI_W "adapter.eoi"
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
enum projector_type {
|
|
160
|
-
PROJECTOR_TYPE_MLP,
|
|
161
|
-
PROJECTOR_TYPE_MLP_NORM,
|
|
162
|
-
PROJECTOR_TYPE_LDP,
|
|
163
|
-
PROJECTOR_TYPE_LDPV2,
|
|
164
|
-
PROJECTOR_TYPE_RESAMPLER,
|
|
165
|
-
PROJECTOR_TYPE_GLM_EDGE,
|
|
166
|
-
PROJECTOR_TYPE_MERGER,
|
|
167
|
-
PROJECTOR_TYPE_GEMMA3,
|
|
168
|
-
PROJECTOR_TYPE_UNKNOWN,
|
|
169
|
-
};
|
|
170
|
-
|
|
171
|
-
static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
|
|
172
|
-
{ PROJECTOR_TYPE_MLP, "mlp" },
|
|
173
|
-
{ PROJECTOR_TYPE_LDP, "ldp" },
|
|
174
|
-
{ PROJECTOR_TYPE_LDPV2, "ldpv2"},
|
|
175
|
-
{ PROJECTOR_TYPE_RESAMPLER, "resampler"},
|
|
176
|
-
{ PROJECTOR_TYPE_GLM_EDGE, "adapter"},
|
|
177
|
-
{ PROJECTOR_TYPE_MERGER, "qwen2vl_merger"},
|
|
178
|
-
{ PROJECTOR_TYPE_GEMMA3, "gemma3"},
|
|
179
|
-
};
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
//
|
|
183
|
-
// utilities to get data from a gguf file
|
|
184
|
-
//
|
|
185
|
-
|
|
186
|
-
static int get_key_idx(const gguf_context * ctx, const char * key) {
|
|
187
|
-
int i = gguf_find_key(ctx, key);
|
|
188
|
-
if (i == -1) {
|
|
189
|
-
LOG_ERR("key %s not found in file\n", key);
|
|
190
|
-
throw std::runtime_error(format("Missing required key: %s", key));
|
|
191
|
-
}
|
|
192
|
-
|
|
193
|
-
return i;
|
|
194
|
-
}
|
|
195
|
-
|
|
196
|
-
static uint32_t get_u32(const gguf_context * ctx, const std::string & key) {
|
|
197
|
-
const int i = get_key_idx(ctx, key.c_str());
|
|
198
|
-
|
|
199
|
-
return gguf_get_val_u32(ctx, i);
|
|
200
|
-
}
|
|
201
|
-
|
|
202
|
-
static float get_f32(const gguf_context * ctx, const std::string & key) {
|
|
203
|
-
const int i = get_key_idx(ctx, key.c_str());
|
|
204
|
-
|
|
205
|
-
return gguf_get_val_f32(ctx, i);
|
|
206
|
-
}
|
|
207
|
-
|
|
208
|
-
static struct ggml_tensor * get_tensor(struct ggml_context * ctx, const std::string & name) {
|
|
209
|
-
struct ggml_tensor * cur = ggml_get_tensor(ctx, name.c_str());
|
|
210
|
-
if (!cur) {
|
|
211
|
-
throw std::runtime_error(format("%s: unable to find tensor %s\n", __func__, name.c_str()));
|
|
212
|
-
}
|
|
213
|
-
|
|
214
|
-
return cur;
|
|
215
|
-
}
|
|
216
|
-
|
|
217
|
-
static std::string get_ftype(int ftype) {
|
|
218
|
-
return ggml_type_name(static_cast<ggml_type>(ftype));
|
|
219
|
-
}
|
|
220
|
-
|
|
221
|
-
static std::string gguf_data_to_str(enum gguf_type type, const void * data, int i) {
|
|
222
|
-
switch (type) {
|
|
223
|
-
case GGUF_TYPE_UINT8: return std::to_string(((const uint8_t *)data)[i]);
|
|
224
|
-
case GGUF_TYPE_INT8: return std::to_string(((const int8_t *)data)[i]);
|
|
225
|
-
case GGUF_TYPE_UINT16: return std::to_string(((const uint16_t *)data)[i]);
|
|
226
|
-
case GGUF_TYPE_INT16: return std::to_string(((const int16_t *)data)[i]);
|
|
227
|
-
case GGUF_TYPE_UINT32: return std::to_string(((const uint32_t *)data)[i]);
|
|
228
|
-
case GGUF_TYPE_INT32: return std::to_string(((const int32_t *)data)[i]);
|
|
229
|
-
case GGUF_TYPE_UINT64: return std::to_string(((const uint64_t *)data)[i]);
|
|
230
|
-
case GGUF_TYPE_INT64: return std::to_string(((const int64_t *)data)[i]);
|
|
231
|
-
case GGUF_TYPE_FLOAT32: return std::to_string(((const float *)data)[i]);
|
|
232
|
-
case GGUF_TYPE_FLOAT64: return std::to_string(((const double *)data)[i]);
|
|
233
|
-
case GGUF_TYPE_BOOL: return ((const bool *)data)[i] ? "true" : "false";
|
|
234
|
-
default: return format("unknown type %d", type);
|
|
235
|
-
}
|
|
236
|
-
}
|
|
237
|
-
|
|
238
|
-
static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
|
|
239
|
-
if (search.empty()) {
|
|
240
|
-
return;
|
|
241
|
-
}
|
|
242
|
-
std::string builder;
|
|
243
|
-
builder.reserve(s.length());
|
|
244
|
-
size_t pos = 0;
|
|
245
|
-
size_t last_pos = 0;
|
|
246
|
-
while ((pos = s.find(search, last_pos)) != std::string::npos) {
|
|
247
|
-
builder.append(s, last_pos, pos - last_pos);
|
|
248
|
-
builder.append(replace);
|
|
249
|
-
last_pos = pos + search.length();
|
|
250
|
-
}
|
|
251
|
-
builder.append(s, last_pos, std::string::npos);
|
|
252
|
-
s = std::move(builder);
|
|
253
|
-
}
|
|
254
|
-
|
|
255
|
-
static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) {
|
|
256
|
-
const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i);
|
|
257
|
-
|
|
258
|
-
switch (type) {
|
|
259
|
-
case GGUF_TYPE_STRING:
|
|
260
|
-
return gguf_get_val_str(ctx_gguf, i);
|
|
261
|
-
case GGUF_TYPE_ARRAY:
|
|
262
|
-
{
|
|
263
|
-
const enum gguf_type arr_type = gguf_get_arr_type(ctx_gguf, i);
|
|
264
|
-
int arr_n = gguf_get_arr_n(ctx_gguf, i);
|
|
265
|
-
const void * data = arr_type == GGUF_TYPE_STRING ? nullptr : gguf_get_arr_data(ctx_gguf, i);
|
|
266
|
-
std::stringstream ss;
|
|
267
|
-
ss << "[";
|
|
268
|
-
for (int j = 0; j < arr_n; j++) {
|
|
269
|
-
if (arr_type == GGUF_TYPE_STRING) {
|
|
270
|
-
std::string val = gguf_get_arr_str(ctx_gguf, i, j);
|
|
271
|
-
// escape quotes
|
|
272
|
-
replace_all(val, "\\", "\\\\");
|
|
273
|
-
replace_all(val, "\"", "\\\"");
|
|
274
|
-
ss << '"' << val << '"';
|
|
275
|
-
} else if (arr_type == GGUF_TYPE_ARRAY) {
|
|
276
|
-
ss << "???";
|
|
277
|
-
} else {
|
|
278
|
-
ss << gguf_data_to_str(arr_type, data, j);
|
|
279
|
-
}
|
|
280
|
-
if (j < arr_n - 1) {
|
|
281
|
-
ss << ", ";
|
|
282
|
-
}
|
|
283
|
-
}
|
|
284
|
-
ss << "]";
|
|
285
|
-
return ss.str();
|
|
286
|
-
}
|
|
287
|
-
default:
|
|
288
|
-
return gguf_data_to_str(type, gguf_get_val_data(ctx_gguf, i), 0);
|
|
289
|
-
}
|
|
290
|
-
}
|
|
291
|
-
|
|
292
|
-
static void print_tensor_info(const ggml_tensor * tensor, const char * prefix = "") {
|
|
293
|
-
size_t tensor_size = ggml_nbytes(tensor);
|
|
294
|
-
LOG_INF("%s: n_dims = %d, name = %s, tensor_size=%zu, shape:[%" PRId64 ", %" PRId64 ", %" PRId64 ", %" PRId64 "], type = %s\n",
|
|
295
|
-
prefix, ggml_n_dims(tensor), tensor->name, tensor_size,
|
|
296
|
-
tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], ggml_type_name(tensor->type));
|
|
297
|
-
}
|
|
298
|
-
|
|
299
|
-
static projector_type clip_projector_type_from_string(const std::string & name) {
|
|
300
|
-
for (const auto & kv : PROJECTOR_TYPE_NAMES) { // NOLINT
|
|
301
|
-
if (kv.second == name) {
|
|
302
|
-
return kv.first;
|
|
303
|
-
}
|
|
304
|
-
}
|
|
305
|
-
throw std::runtime_error(format("Unknown projector type: %s", name.c_str()));
|
|
306
|
-
}
|
|
307
|
-
|
|
308
37
|
#ifdef CLIP_DEBUG_FUNCTIONS
|
|
309
38
|
static void clip_image_write_image_to_ppm(const clip_image_u8& img, const std::string& filename) {
|
|
310
39
|
std::ofstream file(filename, std::ios::binary);
|
|
@@ -418,6 +147,11 @@ static void clip_image_convert_f32_to_u8(const clip_image_f32& src, clip_image_u
|
|
|
418
147
|
// clip layers
|
|
419
148
|
//
|
|
420
149
|
|
|
150
|
+
enum patch_merge_type {
|
|
151
|
+
PATCH_MERGE_FLAT,
|
|
152
|
+
PATCH_MERGE_SPATIAL_UNPAD,
|
|
153
|
+
};
|
|
154
|
+
|
|
421
155
|
struct clip_hparams {
|
|
422
156
|
int32_t image_size;
|
|
423
157
|
int32_t patch_size;
|
|
@@ -426,56 +160,69 @@ struct clip_hparams {
|
|
|
426
160
|
int32_t projection_dim;
|
|
427
161
|
int32_t n_head;
|
|
428
162
|
int32_t n_layer;
|
|
163
|
+
int32_t proj_scale_factor = 0; // idefics3
|
|
429
164
|
|
|
430
|
-
|
|
165
|
+
patch_merge_type mm_patch_merge_type = PATCH_MERGE_FLAT;
|
|
431
166
|
|
|
432
|
-
|
|
167
|
+
float eps = 1e-6;
|
|
168
|
+
float rope_theta = 0.0;
|
|
433
169
|
|
|
434
170
|
std::vector<int32_t> image_grid_pinpoints;
|
|
435
171
|
int32_t image_crop_resolution;
|
|
436
172
|
std::unordered_set<int32_t> vision_feature_layer;
|
|
173
|
+
int32_t attn_window_size = 0;
|
|
174
|
+
int32_t n_wa_pattern = 0;
|
|
437
175
|
};
|
|
438
176
|
|
|
439
177
|
struct clip_layer {
|
|
440
178
|
// attention
|
|
441
|
-
struct ggml_tensor * k_w;
|
|
442
|
-
struct ggml_tensor * k_b;
|
|
443
|
-
struct ggml_tensor * q_w;
|
|
444
|
-
struct ggml_tensor * q_b;
|
|
445
|
-
struct ggml_tensor * v_w;
|
|
446
|
-
struct ggml_tensor * v_b;
|
|
179
|
+
struct ggml_tensor * k_w = nullptr;
|
|
180
|
+
struct ggml_tensor * k_b = nullptr;
|
|
181
|
+
struct ggml_tensor * q_w = nullptr;
|
|
182
|
+
struct ggml_tensor * q_b = nullptr;
|
|
183
|
+
struct ggml_tensor * v_w = nullptr;
|
|
184
|
+
struct ggml_tensor * v_b = nullptr;
|
|
447
185
|
|
|
448
|
-
struct ggml_tensor * o_w;
|
|
449
|
-
struct ggml_tensor * o_b;
|
|
186
|
+
struct ggml_tensor * o_w = nullptr;
|
|
187
|
+
struct ggml_tensor * o_b = nullptr;
|
|
450
188
|
|
|
451
189
|
// layernorm 1
|
|
452
|
-
struct ggml_tensor * ln_1_w;
|
|
453
|
-
struct ggml_tensor * ln_1_b;
|
|
190
|
+
struct ggml_tensor * ln_1_w = nullptr;
|
|
191
|
+
struct ggml_tensor * ln_1_b = nullptr;
|
|
454
192
|
|
|
455
193
|
// ff
|
|
456
|
-
struct ggml_tensor * ff_i_w;
|
|
457
|
-
struct ggml_tensor * ff_i_b;
|
|
194
|
+
struct ggml_tensor * ff_i_w = nullptr; // legacy naming
|
|
195
|
+
struct ggml_tensor * ff_i_b = nullptr; // legacy naming
|
|
196
|
+
struct ggml_tensor * ff_o_w = nullptr; // legacy naming
|
|
197
|
+
struct ggml_tensor * ff_o_b = nullptr; // legacy naming
|
|
458
198
|
|
|
459
|
-
struct ggml_tensor *
|
|
460
|
-
struct ggml_tensor *
|
|
199
|
+
struct ggml_tensor * ff_up_w = nullptr;
|
|
200
|
+
struct ggml_tensor * ff_up_b = nullptr;
|
|
201
|
+
struct ggml_tensor * ff_gate_w = nullptr;
|
|
202
|
+
struct ggml_tensor * ff_gate_b = nullptr;
|
|
203
|
+
struct ggml_tensor * ff_down_w = nullptr;
|
|
204
|
+
struct ggml_tensor * ff_down_b = nullptr;
|
|
205
|
+
|
|
206
|
+
struct ggml_tensor * ff_g_w = NULL;
|
|
207
|
+
struct ggml_tensor * ff_g_b = NULL;
|
|
461
208
|
|
|
462
209
|
// layernorm 2
|
|
463
|
-
struct ggml_tensor * ln_2_w;
|
|
464
|
-
struct ggml_tensor * ln_2_b;
|
|
210
|
+
struct ggml_tensor * ln_2_w = nullptr;
|
|
211
|
+
struct ggml_tensor * ln_2_b = nullptr;
|
|
465
212
|
};
|
|
466
213
|
|
|
467
214
|
struct clip_vision_model {
|
|
468
215
|
struct clip_hparams hparams;
|
|
469
216
|
|
|
470
217
|
// embeddings
|
|
471
|
-
struct ggml_tensor * class_embedding;
|
|
472
|
-
struct ggml_tensor * patch_embeddings_0;
|
|
473
|
-
struct ggml_tensor * patch_embeddings_1; // second Conv2D kernel when we decouple Conv3D along temproal dimension (Qwen2VL)
|
|
474
|
-
struct ggml_tensor * patch_bias;
|
|
475
|
-
struct ggml_tensor * position_embeddings;
|
|
218
|
+
struct ggml_tensor * class_embedding = nullptr;
|
|
219
|
+
struct ggml_tensor * patch_embeddings_0 = nullptr;
|
|
220
|
+
struct ggml_tensor * patch_embeddings_1 = nullptr; // second Conv2D kernel when we decouple Conv3D along temproal dimension (Qwen2VL)
|
|
221
|
+
struct ggml_tensor * patch_bias = nullptr;
|
|
222
|
+
struct ggml_tensor * position_embeddings = nullptr;
|
|
476
223
|
|
|
477
|
-
struct ggml_tensor * pre_ln_w;
|
|
478
|
-
struct ggml_tensor * pre_ln_b;
|
|
224
|
+
struct ggml_tensor * pre_ln_w = nullptr;
|
|
225
|
+
struct ggml_tensor * pre_ln_b = nullptr;
|
|
479
226
|
|
|
480
227
|
std::vector<clip_layer> layers;
|
|
481
228
|
|
|
@@ -485,94 +232,90 @@ struct clip_vision_model {
|
|
|
485
232
|
struct ggml_tensor * projection;
|
|
486
233
|
|
|
487
234
|
// LLaVA projection
|
|
488
|
-
struct ggml_tensor * mm_0_w =
|
|
489
|
-
struct ggml_tensor * mm_0_b =
|
|
490
|
-
struct ggml_tensor * mm_2_w =
|
|
491
|
-
struct ggml_tensor * mm_2_b =
|
|
235
|
+
struct ggml_tensor * mm_0_w = nullptr;
|
|
236
|
+
struct ggml_tensor * mm_0_b = nullptr;
|
|
237
|
+
struct ggml_tensor * mm_2_w = nullptr;
|
|
238
|
+
struct ggml_tensor * mm_2_b = nullptr;
|
|
492
239
|
|
|
493
|
-
struct ggml_tensor * image_newline =
|
|
240
|
+
struct ggml_tensor * image_newline = nullptr;
|
|
494
241
|
|
|
495
242
|
// Yi type models with mlp+normalization projection
|
|
496
|
-
struct ggml_tensor * mm_1_w =
|
|
497
|
-
struct ggml_tensor * mm_1_b =
|
|
498
|
-
struct ggml_tensor * mm_3_w =
|
|
499
|
-
struct ggml_tensor * mm_3_b =
|
|
500
|
-
struct ggml_tensor * mm_4_w =
|
|
501
|
-
struct ggml_tensor * mm_4_b =
|
|
243
|
+
struct ggml_tensor * mm_1_w = nullptr; // Yi type models have 0, 1, 3, 4
|
|
244
|
+
struct ggml_tensor * mm_1_b = nullptr;
|
|
245
|
+
struct ggml_tensor * mm_3_w = nullptr;
|
|
246
|
+
struct ggml_tensor * mm_3_b = nullptr;
|
|
247
|
+
struct ggml_tensor * mm_4_w = nullptr;
|
|
248
|
+
struct ggml_tensor * mm_4_b = nullptr;
|
|
502
249
|
|
|
503
250
|
//GLMV-Edge projection
|
|
504
|
-
struct ggml_tensor * mm_model_adapter_conv_w;
|
|
505
|
-
struct ggml_tensor * mm_model_adapter_conv_b;
|
|
506
|
-
struct ggml_tensor * boi_w;
|
|
507
|
-
struct ggml_tensor * eoi_w;
|
|
251
|
+
struct ggml_tensor * mm_model_adapter_conv_w = nullptr;
|
|
252
|
+
struct ggml_tensor * mm_model_adapter_conv_b = nullptr;
|
|
508
253
|
|
|
509
254
|
// MobileVLM projection
|
|
510
|
-
struct ggml_tensor * mm_model_mlp_1_w;
|
|
511
|
-
struct ggml_tensor * mm_model_mlp_1_b;
|
|
512
|
-
struct ggml_tensor * mm_model_mlp_3_w;
|
|
513
|
-
struct ggml_tensor * mm_model_mlp_3_b;
|
|
514
|
-
struct ggml_tensor * mm_model_block_1_block_0_0_w;
|
|
515
|
-
struct ggml_tensor * mm_model_block_1_block_0_1_w;
|
|
516
|
-
struct ggml_tensor * mm_model_block_1_block_0_1_b;
|
|
517
|
-
struct ggml_tensor * mm_model_block_1_block_1_fc1_w;
|
|
518
|
-
struct ggml_tensor * mm_model_block_1_block_1_fc1_b;
|
|
519
|
-
struct ggml_tensor * mm_model_block_1_block_1_fc2_w;
|
|
520
|
-
struct ggml_tensor * mm_model_block_1_block_1_fc2_b;
|
|
521
|
-
struct ggml_tensor * mm_model_block_1_block_2_0_w;
|
|
522
|
-
struct ggml_tensor * mm_model_block_1_block_2_1_w;
|
|
523
|
-
struct ggml_tensor * mm_model_block_1_block_2_1_b;
|
|
524
|
-
struct ggml_tensor * mm_model_block_2_block_0_0_w;
|
|
525
|
-
struct ggml_tensor * mm_model_block_2_block_0_1_w;
|
|
526
|
-
struct ggml_tensor * mm_model_block_2_block_0_1_b;
|
|
527
|
-
struct ggml_tensor * mm_model_block_2_block_1_fc1_w;
|
|
528
|
-
struct ggml_tensor * mm_model_block_2_block_1_fc1_b;
|
|
529
|
-
struct ggml_tensor * mm_model_block_2_block_1_fc2_w;
|
|
530
|
-
struct ggml_tensor * mm_model_block_2_block_1_fc2_b;
|
|
531
|
-
struct ggml_tensor * mm_model_block_2_block_2_0_w;
|
|
532
|
-
struct ggml_tensor * mm_model_block_2_block_2_1_w;
|
|
533
|
-
struct ggml_tensor * mm_model_block_2_block_2_1_b;
|
|
255
|
+
struct ggml_tensor * mm_model_mlp_1_w = nullptr;
|
|
256
|
+
struct ggml_tensor * mm_model_mlp_1_b = nullptr;
|
|
257
|
+
struct ggml_tensor * mm_model_mlp_3_w = nullptr;
|
|
258
|
+
struct ggml_tensor * mm_model_mlp_3_b = nullptr;
|
|
259
|
+
struct ggml_tensor * mm_model_block_1_block_0_0_w = nullptr;
|
|
260
|
+
struct ggml_tensor * mm_model_block_1_block_0_1_w = nullptr;
|
|
261
|
+
struct ggml_tensor * mm_model_block_1_block_0_1_b = nullptr;
|
|
262
|
+
struct ggml_tensor * mm_model_block_1_block_1_fc1_w = nullptr;
|
|
263
|
+
struct ggml_tensor * mm_model_block_1_block_1_fc1_b = nullptr;
|
|
264
|
+
struct ggml_tensor * mm_model_block_1_block_1_fc2_w = nullptr;
|
|
265
|
+
struct ggml_tensor * mm_model_block_1_block_1_fc2_b = nullptr;
|
|
266
|
+
struct ggml_tensor * mm_model_block_1_block_2_0_w = nullptr;
|
|
267
|
+
struct ggml_tensor * mm_model_block_1_block_2_1_w = nullptr;
|
|
268
|
+
struct ggml_tensor * mm_model_block_1_block_2_1_b = nullptr;
|
|
269
|
+
struct ggml_tensor * mm_model_block_2_block_0_0_w = nullptr;
|
|
270
|
+
struct ggml_tensor * mm_model_block_2_block_0_1_w = nullptr;
|
|
271
|
+
struct ggml_tensor * mm_model_block_2_block_0_1_b = nullptr;
|
|
272
|
+
struct ggml_tensor * mm_model_block_2_block_1_fc1_w = nullptr;
|
|
273
|
+
struct ggml_tensor * mm_model_block_2_block_1_fc1_b = nullptr;
|
|
274
|
+
struct ggml_tensor * mm_model_block_2_block_1_fc2_w = nullptr;
|
|
275
|
+
struct ggml_tensor * mm_model_block_2_block_1_fc2_b = nullptr;
|
|
276
|
+
struct ggml_tensor * mm_model_block_2_block_2_0_w = nullptr;
|
|
277
|
+
struct ggml_tensor * mm_model_block_2_block_2_1_w = nullptr;
|
|
278
|
+
struct ggml_tensor * mm_model_block_2_block_2_1_b = nullptr;
|
|
534
279
|
|
|
535
280
|
// MobileVLM_V2 projection
|
|
536
|
-
struct ggml_tensor * mm_model_mlp_0_w;
|
|
537
|
-
struct ggml_tensor * mm_model_mlp_0_b;
|
|
538
|
-
struct ggml_tensor * mm_model_mlp_2_w;
|
|
539
|
-
struct ggml_tensor * mm_model_mlp_2_b;
|
|
540
|
-
struct ggml_tensor * mm_model_peg_0_w;
|
|
541
|
-
struct ggml_tensor * mm_model_peg_0_b;
|
|
281
|
+
struct ggml_tensor * mm_model_mlp_0_w = nullptr;
|
|
282
|
+
struct ggml_tensor * mm_model_mlp_0_b = nullptr;
|
|
283
|
+
struct ggml_tensor * mm_model_mlp_2_w = nullptr;
|
|
284
|
+
struct ggml_tensor * mm_model_mlp_2_b = nullptr;
|
|
285
|
+
struct ggml_tensor * mm_model_peg_0_w = nullptr;
|
|
286
|
+
struct ggml_tensor * mm_model_peg_0_b = nullptr;
|
|
542
287
|
|
|
543
288
|
// MINICPMV projection
|
|
544
|
-
struct ggml_tensor * mm_model_pos_embed_k;
|
|
545
|
-
struct ggml_tensor * mm_model_query;
|
|
546
|
-
struct ggml_tensor * mm_model_proj;
|
|
547
|
-
struct ggml_tensor * mm_model_kv_proj;
|
|
548
|
-
struct ggml_tensor * mm_model_attn_q_w;
|
|
549
|
-
struct ggml_tensor * mm_model_attn_q_b;
|
|
550
|
-
struct ggml_tensor * mm_model_attn_k_w;
|
|
551
|
-
struct ggml_tensor * mm_model_attn_k_b;
|
|
552
|
-
struct ggml_tensor * mm_model_attn_v_w;
|
|
553
|
-
struct ggml_tensor * mm_model_attn_v_b;
|
|
554
|
-
struct ggml_tensor * mm_model_attn_o_w;
|
|
555
|
-
struct ggml_tensor * mm_model_attn_o_b;
|
|
556
|
-
struct ggml_tensor * mm_model_ln_q_w;
|
|
557
|
-
struct ggml_tensor * mm_model_ln_q_b;
|
|
558
|
-
struct ggml_tensor * mm_model_ln_kv_w;
|
|
559
|
-
struct ggml_tensor * mm_model_ln_kv_b;
|
|
560
|
-
struct ggml_tensor * mm_model_ln_post_w;
|
|
561
|
-
struct ggml_tensor * mm_model_ln_post_b;
|
|
289
|
+
struct ggml_tensor * mm_model_pos_embed_k = nullptr;
|
|
290
|
+
struct ggml_tensor * mm_model_query = nullptr;
|
|
291
|
+
struct ggml_tensor * mm_model_proj = nullptr;
|
|
292
|
+
struct ggml_tensor * mm_model_kv_proj = nullptr;
|
|
293
|
+
struct ggml_tensor * mm_model_attn_q_w = nullptr;
|
|
294
|
+
struct ggml_tensor * mm_model_attn_q_b = nullptr;
|
|
295
|
+
struct ggml_tensor * mm_model_attn_k_w = nullptr;
|
|
296
|
+
struct ggml_tensor * mm_model_attn_k_b = nullptr;
|
|
297
|
+
struct ggml_tensor * mm_model_attn_v_w = nullptr;
|
|
298
|
+
struct ggml_tensor * mm_model_attn_v_b = nullptr;
|
|
299
|
+
struct ggml_tensor * mm_model_attn_o_w = nullptr;
|
|
300
|
+
struct ggml_tensor * mm_model_attn_o_b = nullptr;
|
|
301
|
+
struct ggml_tensor * mm_model_ln_q_w = nullptr;
|
|
302
|
+
struct ggml_tensor * mm_model_ln_q_b = nullptr;
|
|
303
|
+
struct ggml_tensor * mm_model_ln_kv_w = nullptr;
|
|
304
|
+
struct ggml_tensor * mm_model_ln_kv_b = nullptr;
|
|
305
|
+
struct ggml_tensor * mm_model_ln_post_w = nullptr;
|
|
306
|
+
struct ggml_tensor * mm_model_ln_post_b = nullptr;
|
|
562
307
|
|
|
563
308
|
// gemma3
|
|
564
|
-
struct ggml_tensor * mm_input_proj_w;
|
|
565
|
-
struct ggml_tensor * mm_soft_emb_norm_w;
|
|
309
|
+
struct ggml_tensor * mm_input_proj_w = nullptr;
|
|
310
|
+
struct ggml_tensor * mm_soft_emb_norm_w = nullptr;
|
|
311
|
+
|
|
312
|
+
// pixtral
|
|
313
|
+
struct ggml_tensor * token_embd_img_break = nullptr;
|
|
566
314
|
};
|
|
567
315
|
|
|
568
316
|
struct clip_ctx {
|
|
569
|
-
bool has_text_encoder = false;
|
|
570
|
-
bool has_vision_encoder = false;
|
|
571
317
|
bool has_llava_projector = false;
|
|
572
|
-
|
|
573
|
-
bool has_glm_projector = false;
|
|
574
|
-
bool has_qwen2vl_merger = false;
|
|
575
|
-
int minicpmv_version = 2;
|
|
318
|
+
int minicpmv_version = 0;
|
|
576
319
|
|
|
577
320
|
struct clip_vision_model vision_model;
|
|
578
321
|
projector_type proj_type = PROJECTOR_TYPE_MLP;
|
|
@@ -582,28 +325,23 @@ struct clip_ctx {
|
|
|
582
325
|
float image_std[3];
|
|
583
326
|
bool use_gelu = false;
|
|
584
327
|
bool use_silu = false;
|
|
585
|
-
int32_t ftype = 1;
|
|
586
328
|
|
|
587
|
-
|
|
588
|
-
|
|
589
|
-
bool has_post_norm = false;
|
|
590
|
-
bool has_patch_bias = false;
|
|
591
|
-
|
|
592
|
-
struct gguf_context * ctx_gguf = nullptr;
|
|
593
|
-
struct ggml_context * ctx_data = nullptr;
|
|
329
|
+
gguf_context_ptr ctx_gguf;
|
|
330
|
+
ggml_context_ptr ctx_data;
|
|
594
331
|
|
|
595
332
|
std::vector<uint8_t> buf_compute_meta;
|
|
596
333
|
|
|
597
334
|
std::vector<ggml_backend_t> backend_ptrs;
|
|
598
335
|
std::vector<ggml_backend_buffer_type_t> backend_buft;
|
|
599
336
|
|
|
600
|
-
ggml_backend_t backend
|
|
601
|
-
ggml_backend_t backend_cpu
|
|
602
|
-
|
|
337
|
+
ggml_backend_t backend;
|
|
338
|
+
ggml_backend_t backend_cpu;
|
|
339
|
+
ggml_backend_buffer_ptr buf;
|
|
603
340
|
|
|
341
|
+
int max_nodes = 8192;
|
|
604
342
|
ggml_backend_sched_ptr sched;
|
|
605
343
|
|
|
606
|
-
|
|
344
|
+
clip_image_size load_image_size;
|
|
607
345
|
|
|
608
346
|
clip_ctx(clip_context_params & ctx_params) {
|
|
609
347
|
backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
|
|
@@ -629,33 +367,27 @@ struct clip_ctx {
|
|
|
629
367
|
}
|
|
630
368
|
|
|
631
369
|
~clip_ctx() {
|
|
632
|
-
ggml_free(ctx_data);
|
|
633
|
-
gguf_free(ctx_gguf);
|
|
634
|
-
ggml_backend_buffer_free(buf);
|
|
635
370
|
ggml_backend_free(backend);
|
|
636
|
-
if (
|
|
371
|
+
if (backend != backend_cpu) {
|
|
637
372
|
ggml_backend_free(backend_cpu);
|
|
638
373
|
}
|
|
639
374
|
}
|
|
640
375
|
};
|
|
641
376
|
|
|
642
|
-
static ggml_cgraph * clip_image_build_graph_siglip(clip_ctx * ctx, const
|
|
377
|
+
static ggml_cgraph * clip_image_build_graph_siglip(clip_ctx * ctx, const clip_image_f32 & img) {
|
|
643
378
|
const auto & model = ctx->vision_model;
|
|
644
379
|
const auto & hparams = model.hparams;
|
|
645
380
|
|
|
646
|
-
|
|
647
|
-
int
|
|
648
|
-
int image_size_height = image_size;
|
|
649
|
-
|
|
650
|
-
const int patch_size = hparams.patch_size;
|
|
651
|
-
const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
|
|
652
|
-
const int hidden_size = hparams.hidden_size;
|
|
653
|
-
const int n_head = hparams.n_head;
|
|
654
|
-
const int d_head = hidden_size / n_head;
|
|
655
|
-
const int n_layer = hparams.n_layer;
|
|
656
|
-
const float eps = hparams.eps;
|
|
381
|
+
int image_size_width = img.nx;
|
|
382
|
+
int image_size_height = img.ny;
|
|
657
383
|
|
|
658
|
-
|
|
384
|
+
const int patch_size = hparams.patch_size;
|
|
385
|
+
const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
|
|
386
|
+
const int hidden_size = hparams.hidden_size;
|
|
387
|
+
const int n_head = hparams.n_head;
|
|
388
|
+
const int d_head = hidden_size / n_head;
|
|
389
|
+
const int n_layer = hparams.n_layer;
|
|
390
|
+
const float eps = hparams.eps;
|
|
659
391
|
|
|
660
392
|
struct ggml_init_params params = {
|
|
661
393
|
/*.mem_size =*/ ctx->buf_compute_meta.size(),
|
|
@@ -663,7 +395,9 @@ static ggml_cgraph * clip_image_build_graph_siglip(clip_ctx * ctx, const clip_im
|
|
|
663
395
|
/*.no_alloc =*/ true,
|
|
664
396
|
};
|
|
665
397
|
|
|
666
|
-
|
|
398
|
+
ggml_context_ptr ctx0_ptr(ggml_init(params));
|
|
399
|
+
auto ctx0 = ctx0_ptr.get();
|
|
400
|
+
|
|
667
401
|
struct ggml_cgraph * gf = ggml_new_graph(ctx0);
|
|
668
402
|
|
|
669
403
|
// input raw
|
|
@@ -711,8 +445,7 @@ static ggml_cgraph * clip_image_build_graph_siglip(clip_ctx * ctx, const clip_im
|
|
|
711
445
|
V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3));
|
|
712
446
|
|
|
713
447
|
struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
|
|
714
|
-
KQ =
|
|
715
|
-
KQ = ggml_soft_max_inplace(ctx0, KQ);
|
|
448
|
+
KQ = ggml_soft_max_ext(ctx0, KQ, nullptr, 1.0f / sqrtf((float)d_head), 0.0f);
|
|
716
449
|
|
|
717
450
|
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ);
|
|
718
451
|
KQV = ggml_reshape_3d(ctx0, KQV, d_head, num_patches, n_head);
|
|
@@ -751,7 +484,7 @@ static ggml_cgraph * clip_image_build_graph_siglip(clip_ctx * ctx, const clip_im
|
|
|
751
484
|
}
|
|
752
485
|
|
|
753
486
|
// post-layernorm
|
|
754
|
-
if (
|
|
487
|
+
if (model.post_ln_w) {
|
|
755
488
|
embeddings = ggml_norm(ctx0, embeddings, eps);
|
|
756
489
|
ggml_set_name(embeddings, "post_ln");
|
|
757
490
|
|
|
@@ -781,63 +514,534 @@ static ggml_cgraph * clip_image_build_graph_siglip(clip_ctx * ctx, const clip_im
|
|
|
781
514
|
embeddings = ggml_mul_mat(ctx0,
|
|
782
515
|
ggml_cont(ctx0, ggml_transpose(ctx0, model.mm_input_proj_w)),
|
|
783
516
|
embeddings);
|
|
517
|
+
|
|
518
|
+
} else if (ctx->proj_type == PROJECTOR_TYPE_IDEFICS3) {
|
|
519
|
+
// https://github.com/huggingface/transformers/blob/0a950e0bbe1ed58d5401a6b547af19f15f0c195e/src/transformers/models/idefics3/modeling_idefics3.py#L578
|
|
520
|
+
|
|
521
|
+
ggml_tensor * cur = embeddings;
|
|
522
|
+
const int scale_factor = model.hparams.proj_scale_factor;
|
|
523
|
+
const int n_embd = cur->ne[0];
|
|
524
|
+
const int seq = cur->ne[1];
|
|
525
|
+
const int bsz = 1; // batch size, always 1 for now since we don't support batching
|
|
526
|
+
const int height = std::sqrt(seq);
|
|
527
|
+
const int width = std::sqrt(seq);
|
|
528
|
+
GGML_ASSERT(scale_factor != 0);
|
|
529
|
+
cur = ggml_reshape_4d(ctx0, cur, n_embd * scale_factor, width / scale_factor, height, bsz);
|
|
530
|
+
cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
|
|
531
|
+
cur = ggml_reshape_4d(ctx0, ggml_cont(ctx0, cur),
|
|
532
|
+
n_embd * scale_factor * scale_factor,
|
|
533
|
+
height / scale_factor,
|
|
534
|
+
width / scale_factor,
|
|
535
|
+
bsz);
|
|
536
|
+
cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
|
|
537
|
+
cur = ggml_reshape_3d(ctx0, ggml_cont(ctx0, cur),
|
|
538
|
+
n_embd * scale_factor * scale_factor,
|
|
539
|
+
seq / (scale_factor * scale_factor),
|
|
540
|
+
bsz);
|
|
541
|
+
|
|
542
|
+
cur = ggml_mul_mat(ctx0, model.projection, cur);
|
|
543
|
+
embeddings = cur;
|
|
544
|
+
} else {
|
|
545
|
+
GGML_ABORT("SigLIP: Unsupported projector type");
|
|
784
546
|
}
|
|
785
547
|
|
|
786
548
|
// build the graph
|
|
787
549
|
ggml_build_forward_expand(gf, embeddings);
|
|
788
550
|
|
|
789
|
-
|
|
551
|
+
return gf;
|
|
552
|
+
}
|
|
553
|
+
|
|
554
|
+
// implementation of the 2D RoPE without adding a new op in ggml
|
|
555
|
+
// this is not efficient (use double the memory), but works on all backends
|
|
556
|
+
// TODO: there was a more efficient which relies on ggml_view and ggml_rope_ext_inplace, but the rope inplace does not work well with non-contiguous tensors ; we should fix that and revert back to the original implementation in https://github.com/ggml-org/llama.cpp/pull/13065
|
|
557
|
+
static ggml_tensor * build_rope_2d(
|
|
558
|
+
ggml_context * ctx0,
|
|
559
|
+
ggml_tensor * cur,
|
|
560
|
+
ggml_tensor * pos_h,
|
|
561
|
+
ggml_tensor * pos_w,
|
|
562
|
+
const float freq_base
|
|
563
|
+
) {
|
|
564
|
+
const int64_t n_dim = cur->ne[0];
|
|
565
|
+
const int64_t n_head = cur->ne[1];
|
|
566
|
+
const int64_t n_pos = cur->ne[2];
|
|
567
|
+
|
|
568
|
+
// for example, if we have cur tensor of shape (n_dim=8, n_head, n_pos)
|
|
569
|
+
// we will have a list of 4 inv_freq: 1e-0, 1e-1, 1e-2, 1e-3
|
|
570
|
+
// first half of cur will use 1e-0, 1e-2 (even)
|
|
571
|
+
// second half of cur will use 1e-1, 1e-3 (odd)
|
|
572
|
+
// the trick here is to rotate just half of n_dim, so inv_freq will automatically be even
|
|
573
|
+
// ^ don't ask me why, it's math! -2(2i) / n_dim == -2i / (n_dim/2)
|
|
574
|
+
// then for the second half, we use freq_scale to shift the inv_freq
|
|
575
|
+
// ^ why? replace (2i) with (2i+1) in the above equation
|
|
576
|
+
const float freq_scale_odd = std::pow(freq_base, (float)-2/n_dim);
|
|
577
|
+
|
|
578
|
+
// first half
|
|
579
|
+
ggml_tensor * first;
|
|
580
|
+
{
|
|
581
|
+
first = ggml_view_3d(ctx0, cur,
|
|
582
|
+
n_dim/2, n_head, n_pos,
|
|
583
|
+
ggml_row_size(cur->type, n_dim),
|
|
584
|
+
ggml_row_size(cur->type, n_dim*n_head),
|
|
585
|
+
0);
|
|
586
|
+
first = ggml_rope_ext(
|
|
587
|
+
ctx0,
|
|
588
|
+
first,
|
|
589
|
+
pos_h, // positions
|
|
590
|
+
nullptr, // freq factors
|
|
591
|
+
n_dim/2, // n_dims
|
|
592
|
+
0, 0, freq_base,
|
|
593
|
+
1.0f, 0.0f, 1.0f, 0.0f, 0.0f
|
|
594
|
+
);
|
|
595
|
+
}
|
|
596
|
+
|
|
597
|
+
// second half
|
|
598
|
+
ggml_tensor * second;
|
|
599
|
+
{
|
|
600
|
+
second = ggml_view_3d(ctx0, cur,
|
|
601
|
+
n_dim/2, n_head, n_pos,
|
|
602
|
+
ggml_row_size(cur->type, n_dim),
|
|
603
|
+
ggml_row_size(cur->type, n_dim*n_head),
|
|
604
|
+
n_dim/2 * ggml_element_size(cur));
|
|
605
|
+
second = ggml_cont(ctx0, second); // copy, because ggml_rope don't play well with non-contiguous tensors
|
|
606
|
+
second = ggml_rope_ext(
|
|
607
|
+
ctx0,
|
|
608
|
+
second,
|
|
609
|
+
pos_w, // positions
|
|
610
|
+
nullptr, // freq factors
|
|
611
|
+
n_dim/2, // n_dims
|
|
612
|
+
0, 0, freq_base,
|
|
613
|
+
freq_scale_odd,
|
|
614
|
+
0.0f, 1.0f, 0.0f, 0.0f
|
|
615
|
+
);
|
|
616
|
+
}
|
|
617
|
+
|
|
618
|
+
cur = ggml_concat(ctx0, first, second, 0);
|
|
619
|
+
return cur;
|
|
620
|
+
}
|
|
621
|
+
|
|
622
|
+
static ggml_cgraph * clip_image_build_graph_pixtral(clip_ctx * ctx, const clip_image_f32 & img) {
|
|
623
|
+
const auto & model = ctx->vision_model;
|
|
624
|
+
const auto & hparams = model.hparams;
|
|
625
|
+
|
|
626
|
+
GGML_ASSERT(ctx->proj_type == PROJECTOR_TYPE_PIXTRAL);
|
|
627
|
+
|
|
628
|
+
int image_size_width = img.nx;
|
|
629
|
+
int image_size_height = img.ny;
|
|
630
|
+
|
|
631
|
+
const int patch_size = hparams.patch_size;
|
|
632
|
+
const int n_patches_x = image_size_width / patch_size;
|
|
633
|
+
const int n_patches_y = image_size_height / patch_size;
|
|
634
|
+
const int num_patches = n_patches_x * n_patches_y;
|
|
635
|
+
const int hidden_size = hparams.hidden_size;
|
|
636
|
+
const int n_head = hparams.n_head;
|
|
637
|
+
const int d_head = hidden_size / n_head;
|
|
638
|
+
const int n_layer = hparams.n_layer;
|
|
639
|
+
const float eps = hparams.eps;
|
|
640
|
+
|
|
641
|
+
struct ggml_init_params params = {
|
|
642
|
+
/*.mem_size =*/ ctx->buf_compute_meta.size(),
|
|
643
|
+
/*.mem_buffer =*/ ctx->buf_compute_meta.data(),
|
|
644
|
+
/*.no_alloc =*/ true,
|
|
645
|
+
};
|
|
646
|
+
|
|
647
|
+
ggml_context_ptr ctx0_ptr(ggml_init(params));
|
|
648
|
+
auto ctx0 = ctx0_ptr.get();
|
|
649
|
+
|
|
650
|
+
struct ggml_cgraph * gf = ggml_new_graph(ctx0);
|
|
651
|
+
|
|
652
|
+
// input raw
|
|
653
|
+
struct ggml_tensor * inp_raw = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, image_size_width, image_size_height, 3);
|
|
654
|
+
ggml_set_name(inp_raw, "inp_raw");
|
|
655
|
+
ggml_set_input(inp_raw);
|
|
656
|
+
|
|
657
|
+
// 2D input positions
|
|
658
|
+
struct ggml_tensor * pos_h = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_patches);
|
|
659
|
+
ggml_set_name(pos_h, "pos_h");
|
|
660
|
+
ggml_set_input(pos_h);
|
|
661
|
+
struct ggml_tensor * pos_w = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_patches);
|
|
662
|
+
ggml_set_name(pos_w, "pos_w");
|
|
663
|
+
ggml_set_input(pos_w);
|
|
664
|
+
|
|
665
|
+
struct ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
|
|
666
|
+
inp = ggml_reshape_2d(ctx0, inp, num_patches, hidden_size);
|
|
667
|
+
inp = ggml_cont(ctx0, ggml_transpose(ctx0, inp));
|
|
668
|
+
|
|
669
|
+
struct ggml_tensor * embeddings = inp;
|
|
670
|
+
|
|
671
|
+
// pre-layer norm
|
|
672
|
+
embeddings = ggml_mul(ctx0, ggml_rms_norm(ctx0, embeddings, eps), model.pre_ln_w);
|
|
673
|
+
|
|
674
|
+
// loop over layers
|
|
675
|
+
for (int il = 0; il < n_layer; il++) {
|
|
676
|
+
struct ggml_tensor * cur = embeddings;
|
|
677
|
+
|
|
678
|
+
// pre-attention norm
|
|
679
|
+
cur = ggml_mul(ctx0, ggml_rms_norm(ctx0, cur, eps), model.layers[il].ln_1_w);
|
|
680
|
+
|
|
681
|
+
// self-attention
|
|
682
|
+
{
|
|
683
|
+
struct ggml_tensor * Q = ggml_mul_mat(ctx0, model.layers[il].q_w, cur);
|
|
684
|
+
|
|
685
|
+
Q = ggml_reshape_3d(ctx0, Q, d_head, n_head, num_patches);
|
|
686
|
+
Q = build_rope_2d(ctx0, Q, pos_h, pos_w, hparams.rope_theta);
|
|
687
|
+
Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3));
|
|
688
|
+
|
|
689
|
+
struct ggml_tensor * K = ggml_mul_mat(ctx0, model.layers[il].k_w, cur);
|
|
690
|
+
|
|
691
|
+
K = ggml_reshape_3d(ctx0, K, d_head, n_head, num_patches);
|
|
692
|
+
K = build_rope_2d(ctx0, K, pos_h, pos_w, hparams.rope_theta);
|
|
693
|
+
K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3));
|
|
694
|
+
|
|
695
|
+
struct ggml_tensor * V = ggml_mul_mat(ctx0, model.layers[il].v_w, cur);
|
|
696
|
+
|
|
697
|
+
V = ggml_reshape_3d(ctx0, V, d_head, n_head, num_patches);
|
|
698
|
+
V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3));
|
|
699
|
+
|
|
700
|
+
struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
|
|
701
|
+
KQ = ggml_soft_max_ext(ctx0, KQ, nullptr, 1.0f / sqrtf((float)d_head), 0.0f);
|
|
702
|
+
|
|
703
|
+
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ);
|
|
704
|
+
KQV = ggml_reshape_3d(ctx0, KQV, d_head, num_patches, n_head);
|
|
705
|
+
KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
|
|
706
|
+
|
|
707
|
+
cur = ggml_cont_2d(ctx0, KQV, hidden_size, num_patches);
|
|
708
|
+
|
|
709
|
+
cur = ggml_mul_mat(ctx0, model.layers[il].o_w, cur);
|
|
710
|
+
}
|
|
711
|
+
|
|
712
|
+
// re-add the layer input, e.g., residual
|
|
713
|
+
cur = ggml_add(ctx0, cur, embeddings);
|
|
714
|
+
|
|
715
|
+
embeddings = cur; // embeddings = residual, cur = hidden_states
|
|
716
|
+
|
|
717
|
+
// pre-ffn norm
|
|
718
|
+
cur = ggml_mul(ctx0, ggml_rms_norm(ctx0, cur, eps), model.layers[il].ln_2_w);
|
|
719
|
+
|
|
720
|
+
// feed-forward
|
|
721
|
+
{
|
|
722
|
+
ggml_tensor * gate_proj = ggml_mul_mat(ctx0, model.layers[il].ff_gate_w, cur);
|
|
723
|
+
ggml_tensor * up_proj = ggml_mul_mat(ctx0, model.layers[il].ff_up_w, cur);
|
|
724
|
+
gate_proj = ggml_silu(ctx0, gate_proj); // pixtral uses silu
|
|
725
|
+
cur = ggml_mul(ctx0, up_proj, gate_proj);
|
|
726
|
+
cur = ggml_mul_mat(ctx0, model.layers[il].ff_down_w, cur);
|
|
727
|
+
}
|
|
728
|
+
|
|
729
|
+
// residual 2
|
|
730
|
+
cur = ggml_add(ctx0, embeddings, cur);
|
|
731
|
+
|
|
732
|
+
embeddings = cur;
|
|
733
|
+
}
|
|
734
|
+
|
|
735
|
+
// LlavaMultiModalProjector (with GELU activation)
|
|
736
|
+
{
|
|
737
|
+
embeddings = ggml_mul_mat(ctx0, model.mm_1_w, embeddings);
|
|
738
|
+
embeddings = ggml_add(ctx0, embeddings, model.mm_1_b);
|
|
739
|
+
|
|
740
|
+
embeddings = ggml_gelu(ctx0, embeddings);
|
|
741
|
+
embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings);
|
|
742
|
+
embeddings = ggml_add(ctx0, embeddings, model.mm_2_b);
|
|
743
|
+
}
|
|
744
|
+
|
|
745
|
+
// arrangement of the [IMG_BREAK] token
|
|
746
|
+
{
|
|
747
|
+
// not efficient, but works
|
|
748
|
+
// the trick is to view the embeddings as a 3D tensor with shape [hidden_size, n_patches_per_row, n_rows]
|
|
749
|
+
// and then concatenate the [IMG_BREAK] token to the end of each row, aka n_patches_per_row dimension
|
|
750
|
+
// after the concatenation, we have a tensor with shape [hidden_size, n_patches_per_row + 1, n_rows]
|
|
751
|
+
|
|
752
|
+
const int n_embd_text = embeddings->ne[0];
|
|
753
|
+
const int n_tokens_output = num_patches + n_patches_y - 1; // one [IMG_BREAK] per row, except the last row
|
|
754
|
+
|
|
755
|
+
ggml_tensor * cur = ggml_reshape_3d(ctx0, embeddings, n_embd_text, n_patches_x, n_patches_y);
|
|
756
|
+
ggml_tensor * tok = ggml_new_tensor_3d(ctx0, embeddings->type, n_embd_text, 1, n_patches_y);
|
|
757
|
+
tok = ggml_scale(ctx0, tok, 0.0); // clear the tensor
|
|
758
|
+
tok = ggml_add(ctx0, tok, model.token_embd_img_break);
|
|
759
|
+
cur = ggml_concat(ctx0, cur, tok, 1);
|
|
760
|
+
embeddings = ggml_view_2d(ctx0, cur,
|
|
761
|
+
n_embd_text, n_tokens_output,
|
|
762
|
+
ggml_row_size(cur->type, n_embd_text), 0);
|
|
763
|
+
}
|
|
764
|
+
|
|
765
|
+
// build the graph
|
|
766
|
+
ggml_build_forward_expand(gf, embeddings);
|
|
790
767
|
|
|
791
768
|
return gf;
|
|
792
769
|
}
|
|
793
770
|
|
|
794
|
-
static ggml_cgraph *
|
|
795
|
-
|
|
796
|
-
|
|
797
|
-
|
|
771
|
+
static ggml_cgraph * clip_image_build_graph_qwen25vl(clip_ctx * ctx, const clip_image_f32_batch & imgs) {
|
|
772
|
+
const auto & model = ctx->vision_model;
|
|
773
|
+
const auto & hparams = model.hparams;
|
|
774
|
+
|
|
775
|
+
const int image_size_width = imgs.entries[0]->nx;
|
|
776
|
+
const int image_size_height = imgs.entries[0]->ny;
|
|
777
|
+
|
|
778
|
+
const bool use_window_attn = hparams.n_wa_pattern > 0;
|
|
779
|
+
|
|
780
|
+
const int n_wa_pattern = hparams.n_wa_pattern;
|
|
781
|
+
const int patch_size = hparams.patch_size;
|
|
782
|
+
const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
|
|
783
|
+
const int patches_w = image_size_width / patch_size;
|
|
784
|
+
const int patches_h = image_size_height / patch_size;
|
|
785
|
+
const int num_positions = num_patches + (model.class_embedding ? 1 : 0);
|
|
786
|
+
const int num_position_ids = num_positions * 4; // m-rope requires 4 dim per position
|
|
787
|
+
const int hidden_size = hparams.hidden_size;
|
|
788
|
+
const int n_head = hparams.n_head;
|
|
789
|
+
const int d_head = hidden_size / n_head;
|
|
790
|
+
const int n_layer = hparams.n_layer;
|
|
791
|
+
const float eps = hparams.eps;
|
|
792
|
+
|
|
793
|
+
int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4};
|
|
794
|
+
|
|
795
|
+
const int batch_size = imgs.entries.size();
|
|
796
|
+
GGML_ASSERT(batch_size == 1);
+
+    struct ggml_init_params params = {
+        /*.mem_size =*/ ctx->buf_compute_meta.size(),
+        /*.mem_buffer =*/ ctx->buf_compute_meta.data(),
+        /*.no_alloc =*/ true,
+    };
+
+    ggml_context_ptr ctx0_ptr(ggml_init(params));
+    auto ctx0 = ctx0_ptr.get();
+
+    struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+
+    struct ggml_tensor * inp_raw = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, image_size_width, image_size_height, 3, batch_size);
+    ggml_set_name(inp_raw, "inp_raw");
+    ggml_set_input(inp_raw);
+
+    struct ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
+
+    GGML_ASSERT(image_size_width % (patch_size * 2) == 0);
+    GGML_ASSERT(image_size_height % (patch_size * 2) == 0);
+
+    auto inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
+    inp = ggml_add(ctx0, inp, inp_1);
+
+    inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 2, 0, 3)); // [w, h, c, b] -> [c, w, h, b]
+    inp = ggml_reshape_4d(
+        ctx0, inp,
+        hidden_size * 2, patches_w / 2, patches_h, batch_size);
+    inp = ggml_reshape_4d(
+        ctx0, inp,
+        hidden_size * 2, patches_w / 2, 2, batch_size * (patches_h / 2));
+    inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 0, 2, 1, 3));
+    inp = ggml_reshape_3d(
+        ctx0, inp,
+        hidden_size, patches_w * patches_h, batch_size);
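The reshape/permute chain above folds pairs of neighbouring patches together and then flattens back to one token per patch. A quick sanity check (editorial addition, toy numbers) that every step conserves elements and ends at `[hidden_size, patches_w * patches_h, batch_size]`:

```cpp
// Editorial sketch: element-count bookkeeping for the reshapes above.
#include <cassert>

int main() {
    const long hidden = 8, pw = 6, ph = 4, b = 1;                 // pw and ph must be even
    const long n0 = hidden * pw * ph * b;                         // after the [c, w, h, b] permute
    const long n1 = (hidden * 2) * (pw / 2) * ph * b;             // first reshape_4d
    const long n2 = (hidden * 2) * (pw / 2) * 2 * (b * ph / 2);   // second reshape_4d
    const long n3 = hidden * (pw * ph) * b;                       // final reshape_3d
    assert(n0 == n1 && n1 == n2 && n2 == n3);
    return 0;
}
```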
+
+    if (model.patch_bias) {
+        // inp = ggml_add(ctx0, inp, ggml_repeat(ctx0, model.patch_bias, inp));
+        inp = ggml_add(ctx0, inp, model.patch_bias);
+    }
+    struct ggml_tensor * embeddings = inp;
+    struct ggml_tensor * window_mask = nullptr;
+    struct ggml_tensor * window_idx = nullptr;
+    struct ggml_tensor * inv_window_idx = nullptr;
+
+    struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_position_ids);
+    ggml_set_name(positions, "positions");
+    ggml_set_input(positions);
+
+    // pre-layernorm
+    if (model.pre_ln_w) {
+        embeddings = ggml_rms_norm(ctx0, embeddings, eps);
+        ggml_set_name(embeddings, "pre_ln");
+
+        embeddings = ggml_mul(ctx0, embeddings, model.pre_ln_w);
+    }
+
+    if (use_window_attn) {
+        // handle window attention inputs
+        inv_window_idx = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions / 4);
+        ggml_set_name(inv_window_idx, "inv_window_idx");
+        ggml_set_input(inv_window_idx);
+        // mask for window attention
+        window_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, num_positions, num_positions);
+        ggml_set_name(window_mask, "window_mask");
+        ggml_set_input(window_mask);
+
+        // embeddings shape: [hidden_size, patches_w * patches_h, batch_size]
+        GGML_ASSERT(batch_size == 1);
+        embeddings = ggml_reshape_2d(ctx0, embeddings, hidden_size * 4, patches_w * patches_h * batch_size / 4);
+        embeddings = ggml_get_rows(ctx0, embeddings, inv_window_idx);
+        embeddings = ggml_reshape_3d(ctx0, embeddings, hidden_size, patches_w * patches_h, batch_size);
+    }
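The `ggml_get_rows` call with `inv_window_idx` is a row gather: after the `hidden_size * 4` reshape, each row is a quad of merged patches, and the index permutes those quads so tokens of the same attention window become contiguous. The same operation in plain C++ (editorial sketch):

```cpp
// Editorial sketch: the row gather performed by ggml_get_rows above.
#include <vector>

std::vector<float> gather_rows(const std::vector<float> & src,
                               const std::vector<int>   & inv_window_idx,
                               int row_size /* hidden_size * 4 */) {
    std::vector<float> dst(src.size());
    for (size_t r = 0; r < inv_window_idx.size(); ++r) {
        const int s = inv_window_idx[r]; // source row for destination row r
        for (int c = 0; c < row_size; ++c) {
            dst[r * row_size + c] = src[(size_t)s * row_size + c];
        }
    }
    return dst;
}
```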
+
+    // loop over layers
+    for (int il = 0; il < n_layer; il++) {
+        struct ggml_tensor * cur = embeddings; // embeddings = residual, cur = hidden_states
+
+        // rmsnorm1
+        cur = ggml_rms_norm(ctx0, cur, eps);
+        cur = ggml_mul(ctx0, cur, model.layers[il].ln_1_w);
+
+        // self-attention
+        {
+
+            struct ggml_tensor * Q =
+                ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].q_w, cur), model.layers[il].q_b);
+
+            Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, num_positions, batch_size);
+            Q = ggml_rope_multi(
+                ctx0, Q, positions, nullptr,
+                d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);
+            Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3));
+            Q = ggml_reshape_3d(ctx0, Q, d_head, num_positions, n_head * batch_size);
+
+            struct ggml_tensor * K =
+                ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].k_w, cur), model.layers[il].k_b);
+
+            K = ggml_reshape_4d(ctx0, K, d_head, n_head, num_positions, batch_size);
+            K = ggml_rope_multi(
+                ctx0, K, positions, nullptr,
+                d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);
+            K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3));
+            K = ggml_reshape_3d(ctx0, K, d_head, num_positions, n_head * batch_size);
+
+            struct ggml_tensor * V =
+                ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].v_w, cur), model.layers[il].v_b);
+
+            V = ggml_reshape_4d(ctx0, V, d_head, n_head, num_positions, batch_size);
+            V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3));
+            V = ggml_reshape_3d(ctx0, V, num_positions, d_head, n_head * batch_size);
+
+            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
+            const bool full_attn = use_window_attn ? (il + 1) % n_wa_pattern == 0 : true;
+            if (full_attn) {
+                KQ = ggml_soft_max_ext(ctx0, KQ, nullptr, 1.0f / sqrtf((float)d_head), 0.0f);
+            } else {
+                KQ = ggml_soft_max_ext(ctx0, KQ, window_mask, 1.0f / sqrtf((float)d_head), 0.0f);
+            }
+
+            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ);
+            KQV = ggml_reshape_4d(ctx0, KQV, d_head, num_positions, n_head, batch_size);
+            KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
+
+            cur = ggml_cont_3d(ctx0, KQV, hidden_size, num_positions, batch_size);
+        }
+
+        // attention output
+        cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].o_w, cur), model.layers[il].o_b);
+
+        // re-add the layer input, e.g., residual
+        cur = ggml_add(ctx0, cur, embeddings);
+
+        embeddings = cur; // embeddings = residual, cur = hidden_states
+
+        // rms norm2
+        cur = ggml_rms_norm(ctx0, cur, eps);
+        cur = ggml_mul(ctx0, cur, model.layers[il].ln_2_w);
+
+        // mlp
+        // ffn_up
+        auto cur_up = ggml_mul_mat(ctx0, model.layers[il].ff_o_w, cur);
+        cur_up = ggml_add(ctx0, cur_up, model.layers[il].ff_o_b);
+
+        auto cur_gate = ggml_mul_mat(ctx0, model.layers[il].ff_g_w, cur);
+        cur_gate = ggml_add(ctx0, cur_gate, model.layers[il].ff_g_b);
+        // TODO : only 2 of these 3 are actually used, should we remove one of them?
+        if (ctx->use_gelu) {
+            cur_gate = ggml_gelu_inplace(ctx0, cur_gate);
+        } else if (ctx->use_silu) {
+            cur_gate = ggml_silu_inplace(ctx0, cur_gate);
+        } else {
+            cur_gate = ggml_gelu_quick_inplace(ctx0, cur_gate);
+        }
+        cur = ggml_mul(ctx0, cur_gate, cur_up);
+
+        // ffn_down
+        cur = ggml_mul_mat(ctx0, model.layers[il].ff_i_w, cur);
+        cur = ggml_add(ctx0, cur, model.layers[il].ff_i_b);
+
+        // residual 2
+        cur = ggml_add(ctx0, embeddings, cur);
+
+        embeddings = cur;
+    }
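The `(il + 1) % n_wa_pattern == 0` test above decides which layers skip the window mask. A worked example (editorial addition) for a typical 32-layer encoder:

```cpp
// Editorial sketch: with n_wa_pattern == 8, layers 7, 15, 23 and 31 (0-based)
// run full attention; every other layer is masked to its local window.
#include <cstdio>

int main() {
    const int n_layer = 32, n_wa_pattern = 8;
    for (int il = 0; il < n_layer; ++il) {
        if ((il + 1) % n_wa_pattern == 0) {
            std::printf("layer %d: full attention\n", il);
        }
    }
    return 0;
}
```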
+
+    // post-layernorm
+    if (model.post_ln_w) {
+        embeddings = ggml_rms_norm(ctx0, embeddings, eps);
+        ggml_set_name(embeddings, "post_ln");
+
+        embeddings = ggml_mul(ctx0, embeddings, model.post_ln_w);
+    }
+
+    embeddings = ggml_reshape_3d(ctx0, embeddings, hidden_size * 4, num_positions / 4, batch_size);
+
+    embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
+    embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
+
+    // GELU activation
+    embeddings = ggml_gelu(ctx0, embeddings);
+
+    // Second linear layer
+    embeddings = ggml_mul_mat(ctx0, model.mm_1_w, embeddings);
+    embeddings = ggml_add(ctx0, embeddings, model.mm_1_b);
+
+    if (use_window_attn) {
+        window_idx = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions / 4);
+        ggml_set_name(window_idx, "window_idx");
+        ggml_set_input(window_idx);
+
+        // embeddings shape: [hidden_size, patches_w * patches_h, batch_size]
+        GGML_ASSERT(batch_size == 1);
+        embeddings = ggml_reshape_2d(ctx0, embeddings, hparams.projection_dim, patches_w * patches_h / 4);
+        embeddings = ggml_get_rows(ctx0, embeddings, window_idx);
+        embeddings = ggml_reshape_3d(ctx0, embeddings, hparams.projection_dim, patches_w * patches_h / 4, batch_size);
     }
 
+    // build the graph
+    ggml_build_forward_expand(gf, embeddings);
+
+    return gf;
+}
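`inv_window_idx` (applied before the layers) and `window_idx` (applied after the merger) only make sense as mutually inverse permutations, so the final token order matches the original patch order. That contract is an inference from the two gather calls, not something this hunk states; a self-contained check of the property (editorial):

```cpp
// Editorial sketch: gathering by a permutation and then by its inverse
// restores the original order.
#include <cassert>
#include <vector>

int main() {
    const std::vector<int> window_idx = {2, 0, 3, 1}; // toy permutation
    std::vector<int> inv(window_idx.size());
    for (size_t i = 0; i < window_idx.size(); ++i) {
        inv[window_idx[i]] = (int)i;                  // build the inverse
    }
    for (size_t i = 0; i < window_idx.size(); ++i) {
        assert((size_t)window_idx[inv[i]] == i);      // round trip is identity
    }
    return 0;
}
```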
+
+static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_image_f32_batch & imgs, struct clip_image_size load_image_size, bool is_inf = false) {
     const auto & model = ctx->vision_model;
     const auto & hparams = model.hparams;
 
     const int image_size = hparams.image_size;
     int image_size_width  = image_size;
     int image_size_height = image_size;
-
-
-
-
-
-        image_size_width  = load_image_size->width;
-        image_size_height = load_image_size->height;
+
+    if (ctx->proj_type == PROJECTOR_TYPE_MINICPMV) {
+        LOG_DBG("%s: %d %d\n", __func__, load_image_size.width, load_image_size.height);
+        image_size_width  = load_image_size.width;
+        image_size_height = load_image_size.height;
         if (is_inf) {
-            image_size_width  = imgs->data->nx;
-            image_size_height = imgs->data->ny;
+            image_size_width  = imgs.entries[0]->nx;
+            image_size_height = imgs.entries[0]->ny;
         }
     }
-
+
+    else if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL) {
         // use the image's native resolution when image is avaible
         if (is_inf) {
             // if (imgs->data->nx && imgs->data->ny) {
-            image_size_width  = imgs->data->nx;
-            image_size_height = imgs->data->ny;
+            image_size_width  = imgs.entries[0]->nx;
+            image_size_height = imgs.entries[0]->ny;
         }
     }
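The signature change above moves the batch to a reference and the image size to a value, removing two nullable pointers from the call contract. A reduced illustration of that intent (editorial; the types and names below are hypothetical stand-ins, only the `entries`/`nx`/`width` shapes are taken from the calls in this hunk):

```cpp
// Editorial sketch: pointer-free call contract, with toy stand-in types.
#include <memory>
#include <vector>

struct image_sk  { int nx = 0, ny = 0; };
struct batch_sk  { std::vector<std::unique_ptr<image_sk>> entries; };
struct size2d_sk { int width = 0, height = 0; };

static int graph_width(const batch_sk & b, size2d_sk load_size, bool is_inf) {
    int w = load_size.width;            // passed by value: no null check
    if (is_inf && !b.entries.empty()) {
        w = b.entries[0]->nx;           // batch is a reference: always present
    }
    return w;
}

int main() {
    batch_sk b;
    b.entries.push_back(std::make_unique<image_sk>());
    b.entries[0]->nx = 448;
    return graph_width(b, {336, 336}, true) == 448 ? 0 : 1;
}
```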
+
     const int patch_size = hparams.patch_size;
     const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
     const int patches_w = image_size_width / patch_size;
     const int patches_h = image_size_height / patch_size;
-    const int num_positions = num_patches + (
-    const int num_position_ids = ctx->
+    const int num_positions = num_patches + (model.class_embedding ? 1 : 0);
+    const int num_position_ids = ctx->proj_type == PROJECTOR_TYPE_QWEN2VL ? num_positions * 4 : num_positions;
     const int hidden_size = hparams.hidden_size;
     const int n_head = hparams.n_head;
     const int d_head = hidden_size / n_head;
     const float eps = hparams.eps;
     int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4};
 
-    const int batch_size = imgs
+    const int batch_size = imgs.entries.size();
 
-    if (ctx->has_llava_projector
+    if (ctx->has_llava_projector
+            || ctx->proj_type == PROJECTOR_TYPE_MINICPMV
+            || ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE) {
         GGML_ASSERT(batch_size == 1);
     }
 
@@ -847,7 +1051,9 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im
         /*.no_alloc =*/ true,
     };
 
-
+    ggml_context_ptr ctx0_ptr(ggml_init(params));
+    auto ctx0 = ctx0_ptr.get();
+
     struct ggml_cgraph * gf = ggml_new_graph(ctx0);
 
     struct ggml_tensor * inp_raw = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, image_size_width, image_size_height, 3, batch_size);
 
@@ -856,8 +1062,8 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im
 
     struct ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
 
-    if (ctx->
-    GGML_ASSERT(image_size_width
+    if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL) {
+        GGML_ASSERT(image_size_width % (patch_size * 2) == 0);
         GGML_ASSERT(image_size_height % (patch_size * 2) == 0);
 
         auto inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
 
@@ -879,53 +1085,43 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im
         inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3));
     }
 
-    if (
+    if (model.patch_bias) {
         // inp = ggml_add(ctx0, inp, ggml_repeat(ctx0, model.patch_bias, inp));
         inp = ggml_add(ctx0, inp, model.patch_bias);
     }
     struct ggml_tensor * embeddings = inp;
     struct ggml_tensor * pos_embed = nullptr;
 
-
-
-
-
-
-
-
-
-        embeddings = ggml_acc(ctx0, embeddings, inp,
-            embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);
-    }
+    // concat class_embeddings and patch_embeddings
+    if (model.class_embedding) {
+        embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size);
+        embeddings = ggml_scale(ctx0, embeddings, 0.0f); // set to all zeros
+        embeddings = ggml_acc(ctx0, embeddings, model.class_embedding,
+            embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0);
+        embeddings = ggml_acc(ctx0, embeddings, inp,
+            embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);
     }
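The two `ggml_acc` calls implement a concat along the token dimension: the class embedding lands at byte offset 0 and the patch grid starts one row later (offset `nb[1]`). The same effect in plain C++ (editorial sketch):

```cpp
// Editorial sketch: prepending a class token by writing at row offsets.
#include <algorithm>
#include <vector>

std::vector<float> prepend_class_token(const std::vector<float> & cls,     // [h]
                                       const std::vector<float> & patches, // [h * n]
                                       int h) {
    std::vector<float> out(cls.size() + patches.size(), 0.0f); // zeroed buffer
    std::copy(cls.begin(), cls.end(), out.begin());             // acc at offset 0
    std::copy(patches.begin(), patches.end(), out.begin() + h); // acc one row in
    return out;
}
```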
 
     struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_position_ids);
     ggml_set_name(positions, "positions");
     ggml_set_input(positions);
 
-    if (
+    if (ctx->proj_type != PROJECTOR_TYPE_QWEN2VL) { // qwen2vl does NOT use learned position embeddings
         embeddings =
             ggml_add(ctx0, embeddings, ggml_get_rows(ctx0, model.position_embeddings, positions));
     }
 
-    if (ctx->
+    if (ctx->proj_type == PROJECTOR_TYPE_MINICPMV) {
         int pos_w = image_size_width/patch_size;
         int pos_h = image_size_height/patch_size;
-
-
-        }
-        else if (ctx->minicpmv_version == 3) {
-            pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 3584, pos_w * pos_h, 1);
-        }
-        else if (ctx->minicpmv_version == 4) {
-            pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 3584, pos_w * pos_h, 1);
-        }
+        int n_output_dim = clip_n_mmproj_embd(ctx);
+        pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_output_dim, pos_w * pos_h, 1);
         ggml_set_name(pos_embed, "pos_embed");
         ggml_set_input(pos_embed);
     }
 
     // pre-layernorm
-    if (
+    if (model.pre_ln_w) {
         embeddings = ggml_norm(ctx0, embeddings, eps);
         ggml_set_name(embeddings, "pre_ln");
 
@@ -962,12 +1158,11 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im
             ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].q_w, cur), model.layers[il].q_b);
 
         Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, num_positions, batch_size);
-        if (ctx->
+        if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL) {
             Q = ggml_rope_multi(
                 ctx0, Q, positions, nullptr,
                 d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);
         }
-        Q = ggml_scale_inplace(ctx0, Q, 1.0f / sqrt((float)d_head));
         Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3));
         Q = ggml_reshape_3d(ctx0, Q, d_head, num_positions, n_head * batch_size);
 
@@ -975,7 +1170,7 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im
             ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].k_w, cur), model.layers[il].k_b);
 
         K = ggml_reshape_4d(ctx0, K, d_head, n_head, num_positions, batch_size);
-        if (ctx->
+        if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL) {
             K = ggml_rope_multi(
                 ctx0, K, positions, nullptr,
                 d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);
 
@@ -991,7 +1186,7 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im
         V = ggml_reshape_3d(ctx0, V, num_positions, d_head, n_head * batch_size);
 
         struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
-        KQ =
+        KQ = ggml_soft_max_ext(ctx0, KQ, nullptr, 1.0f / sqrtf((float)d_head), 0.0f);
         struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ);
         KQV = ggml_reshape_4d(ctx0, KQV, d_head, num_positions, n_head, batch_size);
         KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
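The legacy path previously pre-scaled Q by `1/sqrt(d_head)` and ran a plain softmax; the diff folds that scale into `ggml_soft_max_ext`, which computes `softmax(scale * KQ + mask)` in one op (the trailing `0.0f` is, to my understanding, the ALiBi max-bias, unused here). The equivalent math in plain C++ (editorial sketch):

```cpp
// Editorial sketch: softmax(scale * x + mask) over one row, numerically stable.
#include <algorithm>
#include <cmath>
#include <vector>

std::vector<float> soft_max_ext_row(std::vector<float> row, const float * mask, float scale) {
    float max_v = -INFINITY;
    for (size_t i = 0; i < row.size(); ++i) {
        row[i] = scale * row[i] + (mask ? mask[i] : 0.0f);
        max_v = std::max(max_v, row[i]);
    }
    float sum = 0.0f;
    for (float & v : row) { v = std::exp(v - max_v); sum += v; }
    for (float & v : row) { v /= sum; }
    return row;
}
```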
 
@@ -1035,7 +1230,7 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im
     }
 
     // post-layernorm
-    if (
+    if (model.post_ln_w) {
         embeddings = ggml_norm(ctx0, embeddings, eps);
         ggml_set_name(embeddings, "post_ln");
 
@@ -1075,8 +1270,10 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im
         embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
 
         embeddings = ggml_gelu(ctx0, embeddings);
-
-
+        if (model.mm_2_w) {
+            embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings);
+            embeddings = ggml_add(ctx0, embeddings, model.mm_2_b);
+        }
     }
     else if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) {
         embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
 
@@ -1238,107 +1435,92 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im
     }
     }
     // minicpmv projector
-    else if (ctx->
-
-
-
-
-
+    else if (ctx->proj_type == PROJECTOR_TYPE_MINICPMV) {
+        struct ggml_tensor * q = model.mm_model_query;
+        { // layernorm
+            q = ggml_norm(ctx0, q, eps);
+            q = ggml_add(ctx0, ggml_mul(ctx0, q, model.mm_model_ln_q_w), model.mm_model_ln_q_b);
+        }
+        struct ggml_tensor * v = ggml_mul_mat(ctx0, model.mm_model_kv_proj, embeddings);
+        { // layernorm
+            v = ggml_norm(ctx0, v, eps);
+            v = ggml_add(ctx0, ggml_mul(ctx0, v, model.mm_model_ln_kv_w), model.mm_model_ln_kv_b);
+        }
+        struct ggml_tensor * k;
+        { // position
+            // q = ggml_add(ctx0, q, model.mm_model_pos_embed);
+            k = ggml_add(ctx0, v, pos_embed);
+        }
+
+        { // attention
+            int hidden_size = clip_n_mmproj_embd(ctx);
+            const int d_head = 128;
+            int n_head = hidden_size/d_head;
+            int num_query = 96;
+            if (ctx->minicpmv_version == 2) {
+                num_query = 96;
             }
-
-
-            v = ggml_norm(ctx0, v, eps);
-            v = ggml_add(ctx0, ggml_mul(ctx0, v, model.mm_model_ln_kv_w), model.mm_model_ln_kv_b);
+            else if (ctx->minicpmv_version == 3) {
+                num_query = 64;
             }
-
-
-            // q = ggml_add(ctx0, q, model.mm_model_pos_embed);
-            k = ggml_add(ctx0, v, pos_embed);
+            else if (ctx->minicpmv_version == 4) {
+                num_query = 64;
             }
 
-    [... removed lines not captured in this rendering ...]
-        }
+            struct ggml_tensor * Q = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_q_w, q), model.mm_model_attn_q_b);
+            struct ggml_tensor * K = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_k_w, k), model.mm_model_attn_k_b);
+            struct ggml_tensor * V = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_v_w, v), model.mm_model_attn_v_b);
+            // permute
+            Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, num_query, batch_size);
+            Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3));
+            Q = ggml_reshape_3d(ctx0, Q, d_head, num_query, n_head * batch_size);
+            K = ggml_reshape_4d(ctx0, K, d_head, n_head, num_positions, batch_size);
+            K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3));
+            K = ggml_reshape_3d(ctx0, K, d_head, num_positions, n_head * batch_size);
+            V = ggml_reshape_4d(ctx0, V, d_head, n_head, num_positions, batch_size);
+            V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3));
+            V = ggml_reshape_3d(ctx0, V, num_positions, d_head, n_head * batch_size);
+            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
+            KQ = ggml_soft_max_ext(ctx0, KQ, nullptr, 1.0f / sqrtf((float)d_head), 0.0f);
+            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ);
+            KQV = ggml_reshape_4d(ctx0, KQV, d_head, num_query, n_head, batch_size);
+            KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
+            KQV = ggml_cont_3d(ctx0, KQV, hidden_size, num_query, batch_size);
 
-
-        Q = ggml_scale_inplace(ctx0, Q, 1.0f / sqrt((float)d_head));
-        struct ggml_tensor * K = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_k_w, k), model.mm_model_attn_k_b);
-        struct ggml_tensor * V = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_v_w, v), model.mm_model_attn_v_b);
-        // permute
-        Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, num_query, batch_size);
-        Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3));
-        Q = ggml_reshape_3d(ctx0, Q, d_head, num_query, n_head * batch_size);
-        K = ggml_reshape_4d(ctx0, K, d_head, n_head, num_positions, batch_size);
-        K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3));
-        K = ggml_reshape_3d(ctx0, K, d_head, num_positions, n_head * batch_size);
-        V = ggml_reshape_4d(ctx0, V, d_head, n_head, num_positions, batch_size);
-        V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3));
-        V = ggml_reshape_3d(ctx0, V, num_positions, d_head, n_head * batch_size);
-        struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
-        KQ = ggml_soft_max_inplace(ctx0, KQ);
-        struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ);
-        KQV = ggml_reshape_4d(ctx0, KQV, d_head, num_query, n_head, batch_size);
-        KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
-        KQV = ggml_cont_3d(ctx0, KQV, hidden_size, num_query, batch_size);
-
-        embeddings = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_o_w, KQV), model.mm_model_attn_o_b);
-    }
-    { // layernorm
-        embeddings = ggml_norm(ctx0, embeddings, eps);
-        embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_model_ln_post_w), model.mm_model_ln_post_b);
-    }
-    embeddings = ggml_mul_mat(ctx0, model.mm_model_proj, embeddings);
+            embeddings = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_o_w, KQV), model.mm_model_attn_o_b);
         }
-
-
+        { // layernorm
+            embeddings = ggml_norm(ctx0, embeddings, eps);
+            embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_model_ln_post_w), model.mm_model_ln_post_b);
         }
+        embeddings = ggml_mul_mat(ctx0, model.mm_model_proj, embeddings);
     }
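The rewritten MiniCPM-V block is a resampler: a fixed set of learned queries cross-attends over however many image patches arrive, so the projector always emits a constant number of tokens. Token-count bookkeeping for the three versions handled above (editorial sketch):

```cpp
// Editorial sketch: output token counts of the minicpmv resampler.
#include <cstdio>

int main() {
    const int versions[] = {2, 3, 4};
    for (int v : versions) {
        const int num_query = (v == 2) ? 96 : 64;
        std::printf("minicpmv v%d: any [n_patches] input -> %d output tokens\n", v, num_query);
    }
    return 0;
}
```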
+
     // glm projector
-    else if (ctx->
-    [... removed lines not captured in this rendering ...]
-        embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_3_w, embeddings);
-        }
-    } else {
-        GGML_ABORT("fatel error");
+    else if (ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE) {
+        size_t gridsz = (size_t)sqrt(embeddings->ne[1]);
+        embeddings = ggml_cont(ctx0, ggml_permute(ctx0,embeddings,1,0,2,3));
+        embeddings = ggml_reshape_3d(ctx0, embeddings, gridsz, gridsz, embeddings->ne[1]);
+        embeddings = ggml_conv_2d(ctx0, model.mm_model_adapter_conv_w, embeddings, 2, 2, 0, 0, 1, 1);
+        embeddings = ggml_reshape_3d(ctx0, embeddings,embeddings->ne[0]*embeddings->ne[1] , embeddings->ne[2], batch_size);
+        embeddings = ggml_cont(ctx0, ggml_permute(ctx0,embeddings, 1, 0, 2, 3));
+        embeddings = ggml_add(ctx0, embeddings, model.mm_model_adapter_conv_b);
+        // GLU
+        {
+            embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_0_w, embeddings);
+            embeddings = ggml_norm(ctx0, embeddings, eps);
+            embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_model_ln_q_w), model.mm_model_ln_q_b);
+            embeddings = ggml_gelu_inplace(ctx0, embeddings);
+            struct ggml_tensor * x = embeddings;
+            embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, embeddings);
+            x = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w,x);
+            embeddings = ggml_silu_inplace(ctx0, embeddings);
+            embeddings = ggml_mul(ctx0, embeddings,x);
+            embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_3_w, embeddings);
         }
     }
-
+
+    else if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL) {
         embeddings = ggml_reshape_3d(ctx0, embeddings, hidden_size * 4, num_positions / 4, batch_size);
 
         embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
 
@@ -1355,561 +1537,493 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im
     // build the graph
     ggml_build_forward_expand(gf, embeddings);
 
-    ggml_free(ctx0);
-
     return gf;
 }
 
-static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch
-
-
-
-
-
+static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch & imgs, struct clip_image_size load_image_size, bool is_inf = false) {
+    ggml_cgraph * res;
+    switch (ctx->proj_type) {
+        case PROJECTOR_TYPE_GEMMA3:
+        case PROJECTOR_TYPE_IDEFICS3:
+            {
+                GGML_ASSERT(imgs.entries.size() == 1);
+                res = clip_image_build_graph_siglip(ctx, *imgs.entries[0]);
+            } break;
+        case PROJECTOR_TYPE_PIXTRAL:
+            {
+                GGML_ASSERT(imgs.entries.size() == 1);
+                res = clip_image_build_graph_pixtral(ctx, *imgs.entries[0]);
+            } break;
+        case PROJECTOR_TYPE_QWEN25VL:
+            {
+                res = clip_image_build_graph_qwen25vl(ctx, imgs);
+            } break;
+        default:
+            {
+                // TODO: we should have one build_* function per model
+                res = clip_image_build_graph_legacy(ctx, imgs, load_image_size, is_inf);
+            } break;
     }
+    return res;
 }
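Note that the explicit `ggml_free(ctx0)` call disappears: the builders now hold their context in `ggml_context_ptr`, a unique_ptr-style RAII wrapper. A hand-rolled equivalent (editorial sketch, assuming only that `ggml_free` releases a `ggml_context`):

```cpp
// Editorial sketch: an RAII wrapper equivalent to ggml_context_ptr.
#include <memory>
#include "ggml.h"

struct ggml_ctx_deleter {
    void operator()(ggml_context * ctx) const { ggml_free(ctx); }
};
using ggml_ctx_ptr = std::unique_ptr<ggml_context, ggml_ctx_deleter>;
// the context is released automatically when the builder returns, even on
// early exit, which is why the manual ggml_free(ctx0) could be dropped
```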
 
-
-
-
-            /* use_gpu */ true,
-            /* verbosity */ verbosity,
-        });
-    }
+struct clip_model_loader {
+    ggml_context_ptr ctx_meta;
+    gguf_context_ptr ctx_gguf;
 
-
-
-    struct ggml_context * meta = NULL;
+    clip_ctx & ctx_clip;
+    std::string fname;
 
-
-        /*.no_alloc = */ true,
-        /*.ctx = */ &meta,
-    };
+    size_t model_size = 0; // in bytes
 
-
-
-
-    }
-
-    if (verbosity >= 1) {
-        const int n_tensors = gguf_get_n_tensors(ctx);
-        const int n_kv = gguf_get_n_kv(ctx);
-        const int ftype = get_u32(ctx, KEY_FTYPE);
-        const std::string ftype_str = get_ftype(ftype);
-        const int idx_desc = get_key_idx(ctx, KEY_DESCRIPTION);
-        const std::string description = gguf_get_val_str(ctx, idx_desc);
-        const int idx_name = gguf_find_key(ctx, KEY_NAME);
-        if (idx_name != -1) { // make name optional temporarily as some of the uploaded models missing it due to a bug
-            const std::string name = gguf_get_val_str(ctx, idx_name);
-            LOG_INF("%s: model name: %s\n", __func__, name.c_str());
-        }
-        LOG_INF("%s: description: %s\n", __func__, description.c_str());
-        LOG_INF("%s: GGUF version: %d\n", __func__, gguf_get_version(ctx));
-        LOG_INF("%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx));
-        LOG_INF("%s: n_tensors: %d\n", __func__, n_tensors);
-        LOG_INF("%s: n_kv: %d\n", __func__, n_kv);
-        LOG_INF("%s: ftype: %s\n", __func__, ftype_str.c_str());
-        LOG_INF("\n");
-    }
-    const int n_tensors = gguf_get_n_tensors(ctx);
-
-    // kv
-    const int n_kv = gguf_get_n_kv(ctx);
-    LOG_INF("%s: loaded meta data with %d key-value pairs and %d tensors from %s\n",
-        __func__, n_kv, n_tensors, fname);
-    {
-        std::map<enum ggml_type, uint32_t> n_type;
+    // TODO @ngxson : we should not pass clip_ctx here, it should be clip_vision_model
+    clip_model_loader(const char * fname, clip_ctx & ctx_clip) : ctx_clip(ctx_clip), fname(fname) {
+        struct ggml_context * meta = nullptr;
 
-
+        struct gguf_init_params params = {
+            /*.no_alloc = */ true,
+            /*.ctx = */ &meta,
+        };
 
-
+        ctx_gguf = gguf_context_ptr(gguf_init_from_file(fname, params));
+        if (!ctx_gguf.get()) {
+            throw std::runtime_error(string_format("%s: failed to load CLIP model from %s. Does this file exist?\n", __func__, fname));
         }
 
-
-        for (int i = 0; i < n_kv; i++) {
-            const char * name = gguf_get_key(ctx, i);
-            const enum gguf_type type = gguf_get_kv_type(ctx, i);
-            const std::string type_name =
-                type == GGUF_TYPE_ARRAY
-                ? format("%s[%s,%d]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(ctx, i)), gguf_get_arr_n(ctx, i))
-                : gguf_type_name(type);
+        ctx_meta.reset(meta);
 
-
-            const size_t MAX_VALUE_LEN = 40;
-            if (value.size() > MAX_VALUE_LEN) {
-                value = format("%s...", value.substr(0, MAX_VALUE_LEN - 3).c_str());
-            }
-            replace_all(value, "\n", "\\n");
-
-            LOG_INF("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str());
-        }
-
-        // print type counts
-        for (auto & kv : n_type) {
-            if (kv.second == 0) {
-                continue;
-            }
+        const int n_tensors = gguf_get_n_tensors(ctx_gguf.get());
 
-
+        // print gguf info
+        {
+            std::string name;
+            get_string(KEY_NAME, name, false);
+            std::string description;
+            get_string(KEY_DESCRIPTION, description, false);
+            LOG_INF("%s: model name: %s\n", __func__, name.c_str());
+            LOG_INF("%s: description: %s\n", __func__, description.c_str());
+            LOG_INF("%s: GGUF version: %d\n", __func__, gguf_get_version(ctx_gguf.get()));
+            LOG_INF("%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx_gguf.get()));
+            LOG_INF("%s: n_tensors: %d\n", __func__, n_tensors);
+            LOG_INF("%s: n_kv: %d\n", __func__, (int)gguf_get_n_kv(ctx_gguf.get()));
+            LOG_INF("\n");
         }
-    }
 
-    [... removed lines not captured in this rendering ...]
-        LOG_INF("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu, shape:[%" PRIu64 ", %" PRIu64 ", %" PRIu64 ", %" PRIu64 "], type = %s\n",
-            __func__, i, ggml_n_dims(cur), cur->name, tensor_size, offset, cur->ne[0], cur->ne[1], cur->ne[2], cur->ne[3], ggml_type_name(type));
+        // tensors
+        {
+            for (int i = 0; i < n_tensors; ++i) {
+                const char * name = gguf_get_tensor_name(ctx_gguf.get(), i);
+                const size_t offset = gguf_get_tensor_offset(ctx_gguf.get(), i);
+                enum ggml_type type = gguf_get_tensor_type(ctx_gguf.get(), i);
+                struct ggml_tensor * cur = ggml_get_tensor(meta, name);
+                size_t tensor_size = ggml_nbytes(cur);
+                model_size += tensor_size;
+                LOG_DBG("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu, shape:[%" PRIu64 ", %" PRIu64 ", %" PRIu64 ", %" PRIu64 "], type = %s\n",
+                    __func__, i, ggml_n_dims(cur), cur->name, tensor_size, offset, cur->ne[0], cur->ne[1], cur->ne[2], cur->ne[3], ggml_type_name(type));
            }
        }
    }
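The loader now reports failure by throwing `std::runtime_error` (see the `gguf_init_from_file` check above) instead of returning `nullptr` as the old code did. A C-style entry point would therefore wrap it roughly like this (editorial sketch; the default-constructible `clip_ctx` and the wrapper name are assumptions, `load_hparams`/`load_tensors` are the methods shown in this diff):

```cpp
// Editorial sketch: translating loader exceptions back to a nullable pointer.
#include <cstdio>
#include <stdexcept>

clip_ctx * clip_load_sketch(const char * fname) {
    try {
        clip_ctx * ctx = new clip_ctx();          // hypothetical default ctor
        clip_model_loader loader(fname, *ctx);
        loader.load_hparams();
        loader.load_tensors();
        return ctx;
    } catch (const std::exception & e) {
        std::fprintf(stderr, "failed to load model: %s\n", e.what());
        return nullptr;
    }
}
```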
 
-
-
-    // update projector type
-    {
-        int idx = gguf_find_key(ctx, KEY_PROJ_TYPE);
-        if (idx != -1) {
-            const std::string proj_type = gguf_get_val_str(ctx, idx);
-            new_clip->proj_type = clip_projector_type_from_string(proj_type);
-        } else {
-            new_clip->proj_type = PROJECTOR_TYPE_MLP;
-        }
+    void load_hparams() {
+        auto & hparams = ctx_clip.vision_model.hparams;
 
-
-
-
+        // projector type
+        std::string proj_type;
+        {
+            get_string(KEY_PROJ_TYPE, proj_type, false);
+            if (!proj_type.empty()) {
+                ctx_clip.proj_type = clip_projector_type_from_string(proj_type);
+            }
+            if (ctx_clip.proj_type == PROJECTOR_TYPE_UNKNOWN) {
+                throw std::runtime_error(string_format("%s: unknown projector type: %s\n", __func__, proj_type.c_str()));
            }
        }
-    }
-
-    // model size and capabilities
-    {
-        int idx = get_key_idx(ctx, KEY_HAS_TEXT_ENC);
-        new_clip->has_text_encoder = gguf_get_val_bool(ctx, idx);
 
-
-
+        // other hparams
+        {
+            get_i32(KEY_MINICPMV_VERSION, ctx_clip.minicpmv_version, false);
+
+            get_bool(KEY_USE_GELU, ctx_clip.use_gelu, false);
+            get_bool(KEY_USE_SILU, ctx_clip.use_silu, false);
+
+            get_u32(KEY_N_EMBD, hparams.hidden_size);
+            get_u32(KEY_N_HEAD, hparams.n_head);
+            get_u32(KEY_N_FF, hparams.n_intermediate);
+            get_u32(KEY_N_BLOCK, hparams.n_layer);
+            get_u32(KEY_PROJ_DIM, hparams.projection_dim);
+            get_f32(KEY_LAYER_NORM_EPS, hparams.eps);
+            get_u32(KEY_IMAGE_SIZE, hparams.image_size);
+            get_u32(KEY_PATCH_SIZE, hparams.patch_size);
+            get_u32(KEY_IMAGE_CROP_RESOLUTION, hparams.image_crop_resolution, false);
+            get_arr_int(KEY_IMAGE_GRID_PINPOINTS, hparams.image_grid_pinpoints, false);
+
+            ctx_clip.has_llava_projector = ctx_clip.proj_type == PROJECTOR_TYPE_MLP
+                                        || ctx_clip.proj_type == PROJECTOR_TYPE_MLP_NORM
+                                        || ctx_clip.proj_type == PROJECTOR_TYPE_LDP
+                                        || ctx_clip.proj_type == PROJECTOR_TYPE_LDPV2;
 
-
-
-
-
+            {
+                std::string mm_patch_merge_type;
+                get_string(KEY_MM_PATCH_MERGE_TYPE, mm_patch_merge_type, false);
+                if (mm_patch_merge_type == "spatial_unpad") {
+                    hparams.mm_patch_merge_type = PATCH_MERGE_SPATIAL_UNPAD;
+                }
+            }
 
-
-
-
-
+            {
+                int idx_mean = gguf_find_key(ctx_gguf.get(), KEY_IMAGE_MEAN);
+                int idx_std = gguf_find_key(ctx_gguf.get(), KEY_IMAGE_STD);
+                GGML_ASSERT(idx_mean >= 0 && "image_mean not found");
+                GGML_ASSERT(idx_std >= 0 && "image_std not found");
+                const float * mean_data = (const float *) gguf_get_arr_data(ctx_gguf.get(), idx_mean);
+                const float * std_data = (const float *) gguf_get_arr_data(ctx_gguf.get(), idx_std);
+                for (int i = 0; i < 3; ++i) {
+                    ctx_clip.image_mean[i] = mean_data[i];
+                    ctx_clip.image_std[i] = std_data[i];
+                }
+            }
 
-
-
-
-
+            // Load the vision feature layer indices if they are explicitly provided;
+            // if multiple vision feature layers are present, the values will be concatenated
+            // to form the final visual features.
+            // NOTE: gguf conversions should standardize the values of the vision feature layer to
+            // be non-negative, since we use -1 to mark values as unset here.
+            std::vector<int> vision_feature_layer;
+            get_arr_int(KEY_FEATURE_LAYER, vision_feature_layer, false);
+            // convert std::vector to std::unordered_set
+            for (auto & layer : vision_feature_layer) {
+                hparams.vision_feature_layer.insert(layer);
+            }
 
-
-
-
-
+            // Calculate the deepest feature layer based on hparams and projector type
+            // NOTE: This is only used by build_graph_legacy()
+            {
+                // Get the index of the second to last layer; this is the default for models that have a llava projector
+                int n_layer = hparams.n_layer - 1;
+                int deepest_feature_layer = -1;
+
+                if (ctx_clip.proj_type == PROJECTOR_TYPE_MINICPMV
+                        || ctx_clip.proj_type == PROJECTOR_TYPE_GLM_EDGE
+                        || ctx_clip.proj_type == PROJECTOR_TYPE_QWEN2VL
+                        || ctx_clip.proj_type == PROJECTOR_TYPE_QWEN25VL) {
+                    n_layer += 1;
+                }
 
-
-
-
-
-
+                // If we set explicit vision feature layers, only go up to the deepest one
+                // NOTE: only used by granite-vision models for now
+                for (const auto & feature_layer : hparams.vision_feature_layer) {
+                    if (feature_layer > deepest_feature_layer) {
+                        deepest_feature_layer = feature_layer;
+                    }
+                }
+                ctx_clip.max_feature_layer = deepest_feature_layer < 0 ? n_layer : deepest_feature_layer;
+            }
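A worked example of the `max_feature_layer` arithmetic above (editorial sketch; the function below mirrors the diff's logic with toy inputs):

```cpp
// Editorial sketch: default vs explicit feature-layer selection.
#include <cassert>
#include <unordered_set>

int max_feature_layer(int n_layer_hparam, bool counts_last_layer,
                      const std::unordered_set<int> & feature_layers) {
    int n_layer = n_layer_hparam - 1 + (counts_last_layer ? 1 : 0);
    int deepest = -1;
    for (int l : feature_layers) deepest = l > deepest ? l : deepest;
    return deepest < 0 ? n_layer : deepest;
}

int main() {
    assert(max_feature_layer(24, false, {}) == 23);             // llava default
    assert(max_feature_layer(24, false, {3, 7, 15, 20}) == 20); // explicit layers
    return 0;
}
```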
 
-
-
+        // model-specific params
+        switch (ctx_clip.proj_type) {
+            case PROJECTOR_TYPE_MINICPMV:
+                {
+                    if (ctx_clip.minicpmv_version == 0) {
+                        ctx_clip.minicpmv_version = 2; // default to 2 if not set
+                    }
+                } break;
+            case PROJECTOR_TYPE_IDEFICS3:
+                {
+                    get_u32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor, false);
+                } break;
+            case PROJECTOR_TYPE_PIXTRAL:
+                {
+                    hparams.rope_theta = 10000.0f;
+                } break;
+            case PROJECTOR_TYPE_QWEN25VL:
+                {
+                    get_u32(KEY_WIN_ATTN_PATTERN, hparams.n_wa_pattern);
+                } break;
+            default:
+                break;
+        }
 
-
-
-
-
-
+        LOG_INF("%s: projector: %s\n", __func__, proj_type.c_str());
+        LOG_INF("%s: has_llava_proj: %d\n", __func__, ctx_clip.has_llava_projector);
+        LOG_INF("%s: minicpmv_version: %d\n", __func__, ctx_clip.minicpmv_version);
+        LOG_INF("%s: proj_scale_factor: %d\n", __func__, hparams.proj_scale_factor);
+        LOG_INF("%s: n_wa_pattern: %d\n", __func__, hparams.n_wa_pattern);
+        LOG_INF("%s: use_silu: %d\n", __func__, ctx_clip.use_silu);
+        LOG_INF("%s: use_gelu: %d\n", __func__, ctx_clip.use_gelu);
+        LOG_INF("%s: model size: %.2f MiB\n", __func__, model_size / 1024.0 / 1024.0);
+        LOG_INF("%s: metadata size: %.2f MiB\n", __func__, ggml_get_mem_size(ctx_meta.get()) / 1024.0 / 1024.0);
     }
+    }
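The `get_u32`/`get_string`/`get_bool`/`get_i32`/`get_f32`/`get_arr_int` readers used throughout `load_hparams` are defined elsewhere in clip.cpp and are not shown in this hunk. A plausible shape, inferred only from their call sites (key, out-parameter, optional `required` flag defaulting to true):

```cpp
// Editorial sketch: assumed shape of the key readers used above.
#include <stdexcept>
#include <string>
#include "gguf.h" // gguf C API: gguf_find_key, gguf_get_val_u32

void get_u32_sketch(gguf_context * ctx_gguf, const std::string & key, int & output, bool required = true) {
    const int i = gguf_find_key(ctx_gguf, key.c_str());
    if (i < 0) {
        if (required) {
            throw std::runtime_error("key not found: " + key);
        }
        return; // optional key: leave output unchanged
    }
    output = (int) gguf_get_val_u32(ctx_gguf, i);
}
```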
 
-
-
-
-    } catch (std::runtime_error & /*e*/) {
-        new_clip->use_silu = false;
-    }
+    void load_tensors() {
+        std::map<std::string, size_t> tensor_offset;
+        std::vector<ggml_tensor *> tensors_to_load;
 
-
-
-
-
-    LOG_INF("%s: minicpmv_projector: %d\n", __func__, new_clip->has_minicpmv_projector);
-    LOG_INF("%s: minicpmv_version: %d\n", __func__, new_clip->minicpmv_version);
-    LOG_INF("%s: glm_projector: %d\n", __func__, new_clip->has_glm_projector);
-    LOG_INF("%s: model size: %.2f MB\n", __func__, model_size / 1024.0 / 1024.0);
-    LOG_INF("%s: metadata size: %.2f MB\n", __func__, ggml_get_mem_size(meta) / 1024.0 / 1024.0);
+        // get offsets
+        for (int64_t i = 0; i < gguf_get_n_tensors(ctx_gguf.get()); ++i) {
+            const char * name = gguf_get_tensor_name(ctx_gguf.get(), i);
+            tensor_offset[name] = gguf_get_data_offset(ctx_gguf.get()) + gguf_get_tensor_offset(ctx_gguf.get(), i);
         }
-    }
-
-    LOG_INF("%s: params backend buffer size = % 6.2f MB (%i tensors)\n", __func__, model_size / (1024.0 * 1024.0), n_tensors);
 
-
-    {
-        std::vector<uint8_t> read_buf;
+        // create data context
         struct ggml_init_params params = {
-            /*.mem_size =*/ (
+            /*.mem_size =*/ (gguf_get_n_tensors(ctx_gguf.get()) + 1) * ggml_tensor_overhead(),
             /*.mem_buffer =*/ NULL,
             /*.no_alloc =*/ true,
         };
+        ctx_clip.ctx_data.reset(ggml_init(params));
+        if (!ctx_clip.ctx_data) {
+            throw std::runtime_error(string_format("%s: failed to init ggml context\n", __func__));
+        }
 
-    [... removed lines not captured in this rendering ...]
-        return nullptr;
-    }
-
-    auto fin = std::ifstream(fname, std::ios::binary);
-    if (!fin) {
-        LOG_ERR("cannot open model file for loading tensors\n");
-        clip_free(new_clip);
-        gguf_free(ctx);
-        return nullptr;
-    }
-
-    // add tensors to context
-    for (int i = 0; i < n_tensors; ++i) {
-        const char * name = gguf_get_tensor_name(ctx, i);
-        struct ggml_tensor * t = ggml_get_tensor(meta, name);
-        struct ggml_tensor * cur = ggml_dup_tensor(new_clip->ctx_data, t);
-        ggml_set_name(cur, name);
-    }
-
-    // alloc memory and offload data
-    ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(new_clip->backend);
-    new_clip->buf = ggml_backend_alloc_ctx_tensors_from_buft(new_clip->ctx_data, buft);
-    ggml_backend_buffer_set_usage(new_clip->buf, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
-    for (int i = 0; i < n_tensors; ++i) {
-        const char * name = gguf_get_tensor_name(ctx, i);
-        struct ggml_tensor * cur = ggml_get_tensor(new_clip->ctx_data, name);
-        const size_t offset = gguf_get_data_offset(ctx) + gguf_get_tensor_offset(ctx, i);
-        fin.seekg(offset, std::ios::beg);
-        if (!fin) {
-            LOG_ERR("%s: failed to seek for tensor %s\n", __func__, name);
-            clip_free(new_clip);
-            gguf_free(ctx);
-            return nullptr;
-        }
-        int num_bytes = ggml_nbytes(cur);
-        if (ggml_backend_buft_is_host(buft)) {
-            // for the CPU and Metal backend, we can read directly into the tensor
-            fin.read(reinterpret_cast<char *>(cur->data), num_bytes);
-        } else {
-            // read into a temporary buffer first, then copy to device memory
-            read_buf.resize(num_bytes);
-            fin.read(reinterpret_cast<char *>(read_buf.data()), num_bytes);
-            ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes);
+        // helper function
+        auto get_tensor = [&](const std::string & name, bool required = true) {
+            struct ggml_tensor * cur = ggml_get_tensor(ctx_meta.get(), name.c_str());
+            if (!cur && required) {
+                throw std::runtime_error(string_format("%s: unable to find tensor %s\n", __func__, name.c_str()));
             }
-
-    [... removed lines not captured in this rendering ...]
-    // load vision model
-    auto & vision_model = new_clip->vision_model;
-    auto & hparams = vision_model.hparams;
-    hparams.hidden_size = get_u32(ctx, format(KEY_N_EMBD, "vision"));
-    hparams.n_head = get_u32(ctx, format(KEY_N_HEAD, "vision"));
-    hparams.n_intermediate = get_u32(ctx, format(KEY_N_FF, "vision"));
-    hparams.n_layer = get_u32(ctx, format(KEY_N_BLOCK, "vision"));
-    hparams.image_size = get_u32(ctx, KEY_IMAGE_SIZE);
-    hparams.patch_size = get_u32(ctx, KEY_PATCH_SIZE);
-    hparams.projection_dim = get_u32(ctx, format(KEY_PROJ_DIM, "vision"));
-    hparams.eps = get_f32(ctx, format(KEY_LAYER_NORM_EPS, "vision"));
-
-    try {
-        int idx = get_key_idx(ctx, KEY_IMAGE_GRID_PINPOINTS);
-        int n = gguf_get_arr_n(ctx, idx);
-        const int32_t * pinpoints = (const int32_t *)gguf_get_arr_data(ctx, idx);
-        for (int i = 0; i < n; ++i) {
-            hparams.image_grid_pinpoints.push_back(pinpoints[i]);
+            if (cur) {
+                tensors_to_load.push_back(cur);
+                // add tensors to context
+                struct ggml_tensor * data_tensor = ggml_dup_tensor(ctx_clip.ctx_data.get(), cur);
+                ggml_set_name(data_tensor, cur->name);
+                cur = data_tensor;
             }
-
+            return cur;
+        };
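The `get_tensor` lambda above implements a required/optional split: missing required tensors throw, while optional ones (the `false` arguments in the assignments that follow) come back as `nullptr` and gate features such as the class embedding. The same pattern, reduced to a self-contained analogue (editorial sketch):

```cpp
// Editorial sketch: the optional-tensor lookup pattern of get_tensor.
#include <map>
#include <stdexcept>
#include <string>

struct tensor_sketch { std::string name; };

tensor_sketch * get_tensor_sketch(std::map<std::string, tensor_sketch> & store,
                                  const std::string & name, bool required = true) {
    auto it = store.find(name);
    if (it == store.end()) {
        if (required) {
            throw std::runtime_error("unable to find tensor " + name);
        }
        return nullptr; // optional tensor: caller checks for nullptr
    }
    return &it->second;
}
```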
 
-
-    // if multiple vision feature layers are present, the values will be concatenated
-    // to form the final visual features.
-    // NOTE: gguf conversions should standardize the values of the vision feature layer to
-    // be non-negative, since we use -1 to mark values as unset here.
-    try {
-        int idx = get_key_idx(ctx, KEY_FEATURE_LAYER);
-        int n = gguf_get_arr_n(ctx, idx);
+        auto & vision_model = ctx_clip.vision_model;
 
-
+        vision_model.class_embedding = get_tensor(TN_CLASS_EMBD, false);
 
-    [... removed lines not captured in this rendering ...]
+        vision_model.pre_ln_w = get_tensor(string_format(TN_LN_PRE, "v", "weight"), false);
+        vision_model.pre_ln_b = get_tensor(string_format(TN_LN_PRE, "v", "bias"), false);
+
+        vision_model.post_ln_w = get_tensor(string_format(TN_LN_POST, "v", "weight"), false);
+        vision_model.post_ln_b = get_tensor(string_format(TN_LN_POST, "v", "bias"), false);
+
+        vision_model.patch_bias = get_tensor(TN_PATCH_BIAS, false);
+        vision_model.patch_embeddings_0 = get_tensor(TN_PATCH_EMBD, false);
+        vision_model.patch_embeddings_1 = get_tensor(TN_PATCH_EMBD_1, false);
+
+        vision_model.position_embeddings = get_tensor(string_format(TN_POS_EMBD, "v"), false);
+
+        // layers
+        vision_model.layers.resize(vision_model.hparams.n_layer);
+        for (int il = 0; il < vision_model.hparams.n_layer; ++il) {
+            auto & layer = vision_model.layers[il];
+            layer.k_w = get_tensor(string_format(TN_ATTN_K, "v", il, "weight"));
+            layer.q_w = get_tensor(string_format(TN_ATTN_Q, "v", il, "weight"));
+            layer.v_w = get_tensor(string_format(TN_ATTN_V, "v", il, "weight"));
+            layer.o_w = get_tensor(string_format(TN_ATTN_OUTPUT, "v", il, "weight"));
+            layer.ln_1_w = get_tensor(string_format(TN_LN_1, "v", il, "weight"), false);
+            layer.ln_2_w = get_tensor(string_format(TN_LN_2, "v", il, "weight"), false);
+            layer.k_b = get_tensor(string_format(TN_ATTN_K, "v", il, "bias"), false);
+            layer.q_b = get_tensor(string_format(TN_ATTN_Q, "v", il, "bias"), false);
+            layer.v_b = get_tensor(string_format(TN_ATTN_V, "v", il, "bias"), false);
+            layer.o_b = get_tensor(string_format(TN_ATTN_OUTPUT, "v", il, "bias"), false);
+            layer.ln_1_b = get_tensor(string_format(TN_LN_1, "v", il, "bias"), false);
+            layer.ln_2_b = get_tensor(string_format(TN_LN_2, "v", il, "bias"), false);
+
+            // new naming
+            layer.ff_up_w = get_tensor(string_format(TN_FFN_UP, "v", il, "weight"));
+            layer.ff_up_b = get_tensor(string_format(TN_FFN_UP, "v", il, "bias"), false);
+            layer.ff_gate_w = get_tensor(string_format(TN_FFN_GATE, "v", il, "weight"), false);
+            layer.ff_gate_b = get_tensor(string_format(TN_FFN_GATE, "v", il, "bias"), false);
+            layer.ff_down_w = get_tensor(string_format(TN_FFN_DOWN, "v", il, "weight"));
+            layer.ff_down_b = get_tensor(string_format(TN_FFN_DOWN, "v", il, "bias"), false);
+
+            // legacy naming (the in and out is reversed! don't ask me why)
+            layer.ff_i_w = layer.ff_down_w;
+            layer.ff_o_w = layer.ff_up_w;
+            layer.ff_g_w = layer.ff_gate_w;
+            layer.ff_i_b = layer.ff_down_b;
+            layer.ff_o_b = layer.ff_up_b;
+            layer.ff_g_b = layer.ff_gate_b;
+        }
+
+        switch (ctx_clip.proj_type) {
+            case PROJECTOR_TYPE_MLP:
+            case PROJECTOR_TYPE_MLP_NORM:
+                {
+                    // LLaVA projection
+                    vision_model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight"), false);
+                    vision_model.mm_0_b = get_tensor(string_format(TN_LLAVA_PROJ, 0, "bias"), false);
+                    // Yi-type llava
+                    vision_model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight"), false);
+                    vision_model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"), false);
+                    // missing in Yi-type llava
+                    vision_model.mm_2_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"), false);
+                    vision_model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"), false);
+                    // Yi-type llava
+                    vision_model.mm_3_w = get_tensor(string_format(TN_LLAVA_PROJ, 3, "weight"), false);
+                    vision_model.mm_3_b = get_tensor(string_format(TN_LLAVA_PROJ, 3, "bias"), false);
+                    vision_model.mm_4_w = get_tensor(string_format(TN_LLAVA_PROJ, 4, "weight"), false);
+                    vision_model.mm_4_b = get_tensor(string_format(TN_LLAVA_PROJ, 4, "bias"), false);
+                    if (vision_model.mm_3_w) {
+                        // TODO: this is a hack to support Yi-type llava
+                        ctx_clip.proj_type = PROJECTOR_TYPE_MLP_NORM;
+                    }
+                    vision_model.image_newline = get_tensor(TN_IMAGE_NEWLINE, false);
+                } break;
+            case PROJECTOR_TYPE_LDP:
+                {
+                    // MobileVLM projection
+                    vision_model.mm_model_mlp_1_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 1, "weight"));
+                    vision_model.mm_model_mlp_1_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 1, "bias"));
+                    vision_model.mm_model_mlp_3_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 3, "weight"));
+                    vision_model.mm_model_mlp_3_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 3, "bias"));
+                    vision_model.mm_model_block_1_block_0_0_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 0, "0.weight"));
+                    vision_model.mm_model_block_1_block_0_1_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 0, "1.weight"));
+                    vision_model.mm_model_block_1_block_0_1_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 0, "1.bias"));
+                    vision_model.mm_model_block_1_block_1_fc1_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc1.weight"));
+                    vision_model.mm_model_block_1_block_1_fc1_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc1.bias"));
+                    vision_model.mm_model_block_1_block_1_fc2_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc2.weight"));
+                    vision_model.mm_model_block_1_block_1_fc2_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc2.bias"));
+                    vision_model.mm_model_block_1_block_2_0_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 2, "0.weight"));
+                    vision_model.mm_model_block_1_block_2_1_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 2, "1.weight"));
+                    vision_model.mm_model_block_1_block_2_1_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 2, "1.bias"));
+                    vision_model.mm_model_block_2_block_0_0_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 0, "0.weight"));
+                    vision_model.mm_model_block_2_block_0_1_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 0, "1.weight"));
+                    vision_model.mm_model_block_2_block_0_1_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 0, "1.bias"));
+                    vision_model.mm_model_block_2_block_1_fc1_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc1.weight"));
+                    vision_model.mm_model_block_2_block_1_fc1_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc1.bias"));
+                    vision_model.mm_model_block_2_block_1_fc2_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc2.weight"));
+                    vision_model.mm_model_block_2_block_1_fc2_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc2.bias"));
+                    vision_model.mm_model_block_2_block_2_0_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 2, "0.weight"));
+                    vision_model.mm_model_block_2_block_2_1_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.weight"));
+                    vision_model.mm_model_block_2_block_2_1_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.bias"));
+                } break;
+            case PROJECTOR_TYPE_LDPV2:
+                {
+                    // MobilVLM_V2 projection
+                    vision_model.mm_model_mlp_0_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 0, "weight"));
+                    vision_model.mm_model_mlp_0_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 0, "bias"));
+                    vision_model.mm_model_mlp_2_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 2, "weight"));
+                    vision_model.mm_model_mlp_2_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 2, "bias"));
+                    vision_model.mm_model_peg_0_w = get_tensor(string_format(TN_MVLM_PROJ_PEG, 0, "weight"));
+                    vision_model.mm_model_peg_0_b = get_tensor(string_format(TN_MVLM_PROJ_PEG, 0, "bias"));
+                } break;
+            case PROJECTOR_TYPE_MINICPMV:
+                {
+                    // vision_model.mm_model_pos_embed = get_tensor(new_clip->ctx_data, TN_MINICPMV_POS_EMBD);
+                    vision_model.mm_model_pos_embed_k = get_tensor(TN_MINICPMV_POS_EMBD_K);
+                    vision_model.mm_model_query = get_tensor(TN_MINICPMV_QUERY);
+                    vision_model.mm_model_proj = get_tensor(TN_MINICPMV_PROJ);
+                    vision_model.mm_model_kv_proj = get_tensor(TN_MINICPMV_KV_PROJ);
+                    vision_model.mm_model_attn_q_w = get_tensor(string_format(TN_MINICPMV_ATTN, "q", "weight"));
+                    vision_model.mm_model_attn_k_w = get_tensor(string_format(TN_MINICPMV_ATTN, "k", "weight"));
+                    vision_model.mm_model_attn_v_w = get_tensor(string_format(TN_MINICPMV_ATTN, "v", "weight"));
+                    vision_model.mm_model_attn_q_b = get_tensor(string_format(TN_MINICPMV_ATTN, "q", "bias"));
+                    vision_model.mm_model_attn_k_b = get_tensor(string_format(TN_MINICPMV_ATTN, "k", "bias"));
+                    vision_model.mm_model_attn_v_b = get_tensor(string_format(TN_MINICPMV_ATTN, "v", "bias"));
+                    vision_model.mm_model_attn_o_w = get_tensor(string_format(TN_MINICPMV_ATTN, "out", "weight"));
+                    vision_model.mm_model_attn_o_b = get_tensor(string_format(TN_MINICPMV_ATTN, "out", "bias"));
+                    vision_model.mm_model_ln_q_w = get_tensor(string_format(TN_MINICPMV_LN, "q", "weight"));
+                    vision_model.mm_model_ln_q_b = get_tensor(string_format(TN_MINICPMV_LN, "q", "bias"));
+                    vision_model.mm_model_ln_kv_w = get_tensor(string_format(TN_MINICPMV_LN, "kv", "weight"));
+                    vision_model.mm_model_ln_kv_b = get_tensor(string_format(TN_MINICPMV_LN, "kv", "bias"));
+                    vision_model.mm_model_ln_post_w = get_tensor(string_format(TN_MINICPMV_LN, "post", "weight"));
+                    vision_model.mm_model_ln_post_b = get_tensor(string_format(TN_MINICPMV_LN, "post", "bias"));
+                } break;
+            case PROJECTOR_TYPE_GLM_EDGE:
+                {
+                    vision_model.mm_model_adapter_conv_w = get_tensor(string_format(TN_GLM_ADAPER_CONV, "weight"));
|
|
1932
|
+
vision_model.mm_model_adapter_conv_b = get_tensor(string_format(TN_GLM_ADAPER_CONV, "bias"));
|
|
1933
|
+
vision_model.mm_model_mlp_0_w = get_tensor(string_format(TN_GLM_ADAPTER_LINEAR,"weight"));
|
|
1934
|
+
vision_model.mm_model_ln_q_w = get_tensor(string_format(TN_GLM_ADAPTER_NORM_1,"weight"));
|
|
1935
|
+
vision_model.mm_model_ln_q_b = get_tensor(string_format(TN_GLM_ADAPTER_NORM_1,"bias"));
|
|
1936
|
+
vision_model.mm_model_mlp_1_w = get_tensor(string_format(TN_GLM_ADAPTER_D_H_2_4H,"weight"));
|
|
1937
|
+
vision_model.mm_model_mlp_2_w = get_tensor(string_format(TN_GLM_ADAPTER_GATE,"weight"));
|
|
1938
|
+
vision_model.mm_model_mlp_3_w = get_tensor(string_format(TN_GLM_ADAPTER_D_4H_2_H,"weight"));
|
|
1939
|
+
} break;
|
|
1940
|
+
case PROJECTOR_TYPE_QWEN2VL:
|
|
1941
|
+
case PROJECTOR_TYPE_QWEN25VL:
|
|
1942
|
+
{
|
|
1943
|
+
vision_model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight"));
|
|
1944
|
+
vision_model.mm_0_b = get_tensor(string_format(TN_LLAVA_PROJ, 0, "bias"));
|
|
1945
|
+
vision_model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
|
|
1946
|
+
vision_model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
|
|
1947
|
+
} break;
|
|
1948
|
+
case PROJECTOR_TYPE_GEMMA3:
|
|
1949
|
+
{
|
|
1950
|
+
vision_model.mm_input_proj_w = get_tensor(TN_MM_INP_PROJ);
|
|
1951
|
+
vision_model.mm_soft_emb_norm_w = get_tensor(TN_MM_SOFT_EMB_N);
|
|
1952
|
+
} break;
|
|
1953
|
+
case PROJECTOR_TYPE_IDEFICS3:
|
|
1954
|
+
{
|
|
1955
|
+
vision_model.projection = get_tensor(TN_MM_PROJECTOR);
|
|
1956
|
+
} break;
|
|
1957
|
+
case PROJECTOR_TYPE_PIXTRAL:
|
|
1958
|
+
{
|
|
1959
|
+
vision_model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight"));
|
|
1960
|
+
vision_model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"));
|
|
1961
|
+
vision_model.mm_2_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
|
|
1962
|
+
vision_model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
|
|
1963
|
+
// [IMG_BREAK] token embedding
|
|
1964
|
+
vision_model.token_embd_img_break = get_tensor(TN_TOK_IMG_BREAK);
|
|
1965
|
+
} break;
|
|
1966
|
+
default:
|
|
1967
|
+
GGML_ASSERT(false && "unknown projector type");
|
|
1968
|
+
}
|
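
The get_tensor(name, false) calls above make individual projector tensors optional: a missing tensor comes back as a null pointer instead of aborting the load, and the loader then branches on which tensors are present (here, mm_3_w selecting PROJECTOR_TYPE_MLP_NORM). A minimal standalone sketch of that lookup pattern, with a plain std::map standing in for the GGUF tensor table (all names below are hypothetical, for illustration only):

    #include <map>
    #include <stdexcept>
    #include <string>

    struct tensor { /* placeholder for ggml_tensor */ };

    static std::map<std::string, tensor> table = { {"mm.0.weight", {}}, {"mm.3.weight", {}} };

    // look up a tensor by name; if required and absent, fail loudly,
    // otherwise return nullptr so the caller can branch on presence
    tensor * get_tensor(const std::string & name, bool required = true) {
        auto it = table.find(name);
        if (it == table.end()) {
            if (required) throw std::runtime_error("missing tensor: " + name);
            return nullptr;
        }
        return &it->second;
    }

    int main() {
        tensor * mm_3_w = get_tensor("mm.3.weight", false); // optional probe
        bool is_mlp_norm = (mm_3_w != nullptr);             // variant selected by presence
        return is_mlp_norm ? 0 : 1;
    }
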
+
+        // load data
+        {
+            std::vector<uint8_t> read_buf;
+
+            auto fin = std::ifstream(fname, std::ios::binary);
+            if (!fin) {
+                throw std::runtime_error(string_format("%s: failed to open %s\n", __func__, fname.c_str()));
             }
-
-
-
-
+
+            // alloc memory and offload data
+            ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(ctx_clip.backend);
+            ctx_clip.buf.reset(ggml_backend_alloc_ctx_tensors_from_buft(ctx_clip.ctx_data.get(), buft));
+            ggml_backend_buffer_set_usage(ctx_clip.buf.get(), GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
+            for (auto & t : tensors_to_load) {
+                struct ggml_tensor * cur = ggml_get_tensor(ctx_clip.ctx_data.get(), t->name);
+                const size_t offset = tensor_offset[t->name];
+                fin.seekg(offset, std::ios::beg);
+                if (!fin) {
+                    throw std::runtime_error(string_format("%s: failed to seek for tensor %s\n", __func__, t->name));
+                }
+                size_t num_bytes = ggml_nbytes(cur);
+                if (ggml_backend_buft_is_host(buft)) {
+                    // for the CPU and Metal backend, we can read directly into the tensor
+                    fin.read(reinterpret_cast<char *>(cur->data), num_bytes);
+                } else {
+                    // read into a temporary buffer first, then copy to device memory
+                    read_buf.resize(num_bytes);
+                    fin.read(reinterpret_cast<char *>(read_buf.data()), num_bytes);
+                    ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes);
+                }
             }
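
The load loop above distinguishes host-visible buffers (read the file bytes straight into the tensor) from device buffers (stage through a reusable temporary vector, then upload). A rough standalone sketch of the same two paths, with a boolean standing in for ggml_backend_buft_is_host and a memcpy standing in for the device upload (this is an illustration of the shape, not the real ggml calls):

    #include <cstring>
    #include <fstream>
    #include <vector>

    // read num_bytes at offset either directly into dst (host memory)
    // or via a reusable staging buffer followed by an "upload" (device memory)
    bool read_tensor_data(std::ifstream & fin, size_t offset, size_t num_bytes,
                          void * dst, bool dst_is_host, std::vector<char> & staging) {
        fin.seekg(offset, std::ios::beg);
        if (!fin) return false;
        if (dst_is_host) {
            fin.read(static_cast<char *>(dst), num_bytes);
        } else {
            staging.resize(num_bytes);                   // reused across tensors
            fin.read(staging.data(), num_bytes);
            std::memcpy(dst, staging.data(), num_bytes); // stand-in for the device copy
        }
        return bool(fin);
    }
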
-
-
-
-        }
-
-        try {
-            vision_model.class_embedding = get_tensor(new_clip->ctx_data, TN_CLASS_EMBD);
-            new_clip->has_class_embedding = true;
-        } catch (const std::exception& /*e*/) {
-            new_clip->has_class_embedding = false;
-        }
-
-        try {
-            vision_model.pre_ln_w = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "weight"));
-            vision_model.pre_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "bias"));
-            new_clip->has_pre_norm = true;
-        } catch (std::exception & /*e*/) {
-            new_clip->has_pre_norm = false;
-        }
-
-        try {
-            vision_model.post_ln_w = get_tensor(new_clip->ctx_data, format(TN_LN_POST, "v", "weight"));
-            vision_model.post_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_POST, "v", "bias"));
-            new_clip->has_post_norm = true;
-        } catch (std::exception & /*e*/) {
-            new_clip->has_post_norm = false;
-        }
-
-        try {
-            vision_model.patch_bias = get_tensor(new_clip->ctx_data, TN_PATCH_BIAS);
-            new_clip->has_patch_bias = true;
-        } catch (std::exception & /*e*/) {
-            new_clip->has_patch_bias = false;
-        }
-
-        try {
-            vision_model.patch_embeddings_0 = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD);
-        } catch(const std::exception& /*e*/) {
-            vision_model.patch_embeddings_0 = nullptr;
-        }
-
-        try {
-            vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v"));
-        } catch(const std::exception& /*e*/) {
-            vision_model.position_embeddings = nullptr;
-        }
-
-        try {
-            vision_model.patch_embeddings_1 = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD_1);
-        } catch(const std::exception& /*e*/) {
-            new_clip->has_qwen2vl_merger = false;
-        }
-
-        // LLaVA projection
-        if (new_clip->proj_type == PROJECTOR_TYPE_MLP || new_clip->proj_type == PROJECTOR_TYPE_MLP_NORM) {
-            vision_model.mm_0_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "weight"));
-            vision_model.mm_0_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "bias"));
-            try {
-                // Yi-type llava
-                vision_model.mm_1_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 1, "weight"));
-                vision_model.mm_1_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 1, "bias"));
-            } catch (std::runtime_error & /*e*/) { }
-            try {
-                // missing in Yi-type llava
-                vision_model.mm_2_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "weight"));
-                vision_model.mm_2_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "bias"));
-            } catch (std::runtime_error & /*e*/) { }
-            try {
-                // Yi-type llava
-                vision_model.mm_3_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 3, "weight"));
-                vision_model.mm_3_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 3, "bias"));
-            } catch (std::runtime_error & /*e*/) { }
-            try {
-                // Yi-type llava
-                vision_model.mm_4_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 4, "weight"));
-                vision_model.mm_4_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 4, "bias"));
-            } catch (std::runtime_error & /*e*/) { }
-            try {
-                vision_model.image_newline = get_tensor(new_clip->ctx_data, TN_IMAGE_NEWLINE);
-                // LOG_INF("%s: image_newline tensor (llava-1.6) found\n", __func__);
-            } catch (std::runtime_error & /*e*/) { }
-        } else if (new_clip->proj_type == PROJECTOR_TYPE_LDP) {
-            // MobileVLM projection
-            vision_model.mm_model_mlp_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 1, "weight"));
-            vision_model.mm_model_mlp_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 1, "bias"));
-            vision_model.mm_model_mlp_3_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 3, "weight"));
-            vision_model.mm_model_mlp_3_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 3, "bias"));
-            vision_model.mm_model_block_1_block_0_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 0, "0.weight"));
-            vision_model.mm_model_block_1_block_0_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 0, "1.weight"));
-            vision_model.mm_model_block_1_block_0_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 0, "1.bias"));
-            vision_model.mm_model_block_1_block_1_fc1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc1.weight"));
-            vision_model.mm_model_block_1_block_1_fc1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc1.bias"));
-            vision_model.mm_model_block_1_block_1_fc2_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc2.weight"));
-            vision_model.mm_model_block_1_block_1_fc2_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc2.bias"));
-            vision_model.mm_model_block_1_block_2_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 2, "0.weight"));
-            vision_model.mm_model_block_1_block_2_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 2, "1.weight"));
-            vision_model.mm_model_block_1_block_2_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 2, "1.bias"));
-            vision_model.mm_model_block_2_block_0_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 0, "0.weight"));
-            vision_model.mm_model_block_2_block_0_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 0, "1.weight"));
-            vision_model.mm_model_block_2_block_0_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 0, "1.bias"));
-            vision_model.mm_model_block_2_block_1_fc1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc1.weight"));
-            vision_model.mm_model_block_2_block_1_fc1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc1.bias"));
-            vision_model.mm_model_block_2_block_1_fc2_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc2.weight"));
-            vision_model.mm_model_block_2_block_1_fc2_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc2.bias"));
-            vision_model.mm_model_block_2_block_2_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "0.weight"));
-            vision_model.mm_model_block_2_block_2_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.weight"));
-            vision_model.mm_model_block_2_block_2_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.bias"));
-        }
-        else if (new_clip->proj_type == PROJECTOR_TYPE_LDPV2)
-        {
-            // MobileVLM_V2 projection
-            vision_model.mm_model_mlp_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 0, "weight"));
-            vision_model.mm_model_mlp_0_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 0, "bias"));
-            vision_model.mm_model_mlp_2_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 2, "weight"));
-            vision_model.mm_model_mlp_2_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 2, "bias"));
-            vision_model.mm_model_peg_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_PEG, 0, "weight"));
-            vision_model.mm_model_peg_0_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_PEG, 0, "bias"));
-        }
-        else if (new_clip->proj_type == PROJECTOR_TYPE_RESAMPLER) {
-            // vision_model.mm_model_pos_embed = get_tensor(new_clip->ctx_data, TN_MINICPMV_POS_EMBD);
-            vision_model.mm_model_pos_embed_k = get_tensor(new_clip->ctx_data, TN_MINICPMV_POS_EMBD_K);
-            vision_model.mm_model_query = get_tensor(new_clip->ctx_data, TN_MINICPMV_QUERY);
-            vision_model.mm_model_proj = get_tensor(new_clip->ctx_data, TN_MINICPMV_PROJ);
-            vision_model.mm_model_kv_proj = get_tensor(new_clip->ctx_data, TN_MINICPMV_KV_PROJ);
-            vision_model.mm_model_attn_q_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "q", "weight"));
-            vision_model.mm_model_attn_k_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "k", "weight"));
-            vision_model.mm_model_attn_v_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "v", "weight"));
-            vision_model.mm_model_attn_q_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "q", "bias"));
-            vision_model.mm_model_attn_k_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "k", "bias"));
-            vision_model.mm_model_attn_v_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "v", "bias"));
-            vision_model.mm_model_attn_o_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "out", "weight"));
-            vision_model.mm_model_attn_o_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "out", "bias"));
-            vision_model.mm_model_ln_q_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "q", "weight"));
-            vision_model.mm_model_ln_q_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "q", "bias"));
-            vision_model.mm_model_ln_kv_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "kv", "weight"));
-            vision_model.mm_model_ln_kv_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "kv", "bias"));
-            vision_model.mm_model_ln_post_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "post", "weight"));
-            vision_model.mm_model_ln_post_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "post", "bias"));
-        }
-        else if (new_clip->proj_type == PROJECTOR_TYPE_GLM_EDGE) {
-            vision_model.mm_model_adapter_conv_w = get_tensor(new_clip->ctx_data, format(TN_GLM_ADAPER_CONV, "weight"));
-            vision_model.mm_model_adapter_conv_b = get_tensor(new_clip->ctx_data, format(TN_GLM_ADAPER_CONV, "bias"));
-            vision_model.mm_model_mlp_0_w = get_tensor(new_clip->ctx_data, format(TN_GLM_ADAPTER_LINEAR,"weight"));
-            vision_model.mm_model_ln_q_w = get_tensor(new_clip->ctx_data, format(TN_GLM_ADAPTER_NORM_1,"weight"));
-            vision_model.mm_model_ln_q_b = get_tensor(new_clip->ctx_data, format(TN_GLM_ADAPTER_NORM_1,"bias"));
-            vision_model.mm_model_mlp_1_w = get_tensor(new_clip->ctx_data, format(TN_GLM_ADAPTER_D_H_2_4H,"weight"));
-            vision_model.mm_model_mlp_2_w = get_tensor(new_clip->ctx_data, format(TN_GLM_ADAPTER_GATE,"weight"));
-            vision_model.mm_model_mlp_3_w = get_tensor(new_clip->ctx_data, format(TN_GLM_ADAPTER_D_4H_2_H,"weight"));
-            vision_model.boi_w = get_tensor(new_clip->ctx_data, TN_GLM_BOI_W);
-            vision_model.eoi_w = get_tensor(new_clip->ctx_data, TN_GLM_EOI_W);
-        }
-        else if (new_clip->proj_type == PROJECTOR_TYPE_MERGER) {
-            vision_model.mm_0_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "weight"));
-            vision_model.mm_0_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "bias"));
-            vision_model.mm_1_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "weight"));
-            vision_model.mm_1_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "bias"));
-        }
-        else if (new_clip->proj_type == PROJECTOR_TYPE_GEMMA3) {
-            vision_model.mm_input_proj_w = get_tensor(new_clip->ctx_data, TN_MM_INP_PROJ);
-            vision_model.mm_soft_emb_norm_w = get_tensor(new_clip->ctx_data, TN_MM_SOFT_EMB_N);
-        }
-        else {
-            std::string proj_type = PROJECTOR_TYPE_NAMES[new_clip->proj_type];
-            throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str()));
+            fin.close();
+
+            LOG_DBG("%s: loaded %zu tensors from %s\n", __func__, tensors_to_load.size(), fname.c_str());
         }
+    }
 
-
+    void alloc_compute_meta() {
+        ctx_clip.buf_compute_meta.resize(ctx_clip.max_nodes * ggml_tensor_overhead() + ggml_graph_overhead());
 
-
-        auto & layer = vision_model.layers[il];
-        layer.k_w    = get_tensor(new_clip->ctx_data, format(TN_ATTN_K, "v", il, "weight"));
-        layer.q_w    = get_tensor(new_clip->ctx_data, format(TN_ATTN_Q, "v", il, "weight"));
-        layer.v_w    = get_tensor(new_clip->ctx_data, format(TN_ATTN_V, "v", il, "weight"));
-        layer.o_w    = get_tensor(new_clip->ctx_data, format(TN_ATTN_OUTPUT, "v", il, "weight"));
-        layer.ln_1_w = get_tensor(new_clip->ctx_data, format(TN_LN_1, "v", il, "weight"));
-        layer.ln_2_w = get_tensor(new_clip->ctx_data, format(TN_LN_2, "v", il, "weight"));
-        layer.ff_i_w = get_tensor(new_clip->ctx_data, format(TN_FFN_DOWN, "v", il, "weight"));
-        layer.ff_o_w = get_tensor(new_clip->ctx_data, format(TN_FFN_UP, "v", il, "weight"));
-        layer.k_b    = get_tensor(new_clip->ctx_data, format(TN_ATTN_K, "v", il, "bias"));
-        layer.q_b    = get_tensor(new_clip->ctx_data, format(TN_ATTN_Q, "v", il, "bias"));
-        layer.v_b    = get_tensor(new_clip->ctx_data, format(TN_ATTN_V, "v", il, "bias"));
-        layer.o_b    = get_tensor(new_clip->ctx_data, format(TN_ATTN_OUTPUT, "v", il, "bias"));
-        layer.ln_1_b = get_tensor(new_clip->ctx_data, format(TN_LN_1, "v", il, "bias"));
-        layer.ln_2_b = get_tensor(new_clip->ctx_data, format(TN_LN_2, "v", il, "bias"));
-        layer.ff_i_b = get_tensor(new_clip->ctx_data, format(TN_FFN_DOWN, "v", il, "bias"));
-        layer.ff_o_b = get_tensor(new_clip->ctx_data, format(TN_FFN_UP, "v", il, "bias"));
-    }
-}
-
-    ggml_free(meta);
-
-    new_clip->ctx_gguf = ctx;
-
-    // measure mem requirement and allocate
-    {
-        new_clip->buf_compute_meta.resize(GGML_DEFAULT_GRAPH_SIZE * ggml_tensor_overhead() + ggml_graph_overhead());
+        // create a fake batch
         clip_image_f32_batch batch;
-
-
-
-
-
-
-
-
+        clip_image_f32_ptr img(clip_image_f32_init());
+        clip_image_size image_size;
+        image_size.width  = ctx_clip.vision_model.hparams.image_size;
+        image_size.height = ctx_clip.vision_model.hparams.image_size;
+        img->nx = image_size.width;
+        img->ny = image_size.height;
+        img->buf.resize(image_size.width * image_size.height * 3);
+        batch.entries.push_back(std::move(img));
+
+        ggml_cgraph * gf = clip_image_build_graph(&ctx_clip, batch, image_size, false);
+        ggml_backend_sched_reserve(ctx_clip.sched.get(), gf);
+        for (size_t i = 0; i < ctx_clip.backend_ptrs.size(); ++i) {
+            ggml_backend_t backend = ctx_clip.backend_ptrs[i];
+            ggml_backend_buffer_type_t buft = ctx_clip.backend_buft[i];
+            size_t size = ggml_backend_sched_get_buffer_size(ctx_clip.sched.get(), backend);
             if (size > 1) {
                 LOG_INF("%s: %10s compute buffer size = %8.2f MiB\n", __func__,
                         ggml_backend_buft_name(buft),
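
alloc_compute_meta() sizes the compute buffers once, up front, by running the scheduler over a graph built from a fake full-size image, instead of allocating on every call. A sketch of that reserve-at-worst-case idea without the ggml types (Graph and build_graph are made-up names, for illustration only):

    #include <cstdio>
    #include <vector>

    struct Graph { size_t work_bytes; };

    // pretend graph builder: memory needed grows with the input size
    Graph build_graph(int w, int h) { return Graph{ size_t(w) * h * 16 }; }

    int main() {
        const int max_side = 448;                        // worst-case input we will ever see
        Graph worst = build_graph(max_side, max_side);
        std::vector<char> compute_buf(worst.work_bytes); // reserved once
        std::printf("reserved %zu bytes\n", compute_buf.size());
        // later calls with any image <= max_side reuse compute_buf without reallocating
    }
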
@@ -1918,15 +2032,98 @@ struct clip_ctx * clip_init(const char * fname, struct clip_context_params ctx_p
         }
     }
 
-
+    void get_bool(const std::string & key, bool & output, bool required = true) {
+        const int i = gguf_find_key(ctx_gguf.get(), key.c_str());
+        if (i < 0) {
+            if (required) throw std::runtime_error("Key not found: " + key);
+            return;
+        }
+        output = gguf_get_val_bool(ctx_gguf.get(), i);
+    }
+
+    void get_i32(const std::string & key, int & output, bool required = true) {
+        const int i = gguf_find_key(ctx_gguf.get(), key.c_str());
+        if (i < 0) {
+            if (required) throw std::runtime_error("Key not found: " + key);
+            return;
+        }
+        output = gguf_get_val_i32(ctx_gguf.get(), i);
+    }
+
+    void get_u32(const std::string & key, int & output, bool required = true) {
+        const int i = gguf_find_key(ctx_gguf.get(), key.c_str());
+        if (i < 0) {
+            if (required) throw std::runtime_error("Key not found: " + key);
+            return;
+        }
+        output = gguf_get_val_u32(ctx_gguf.get(), i);
+    }
+
+    void get_f32(const std::string & key, float & output, bool required = true) {
+        const int i = gguf_find_key(ctx_gguf.get(), key.c_str());
+        if (i < 0) {
+            if (required) throw std::runtime_error("Key not found: " + key);
+            return;
+        }
+        output = gguf_get_val_f32(ctx_gguf.get(), i);
+    }
+
+    void get_string(const std::string & key, std::string & output, bool required = true) {
+        const int i = gguf_find_key(ctx_gguf.get(), key.c_str());
+        if (i < 0) {
+            if (required) throw std::runtime_error("Key not found: " + key);
+            return;
+        }
+        output = std::string(gguf_get_val_str(ctx_gguf.get(), i));
+    }
+
+    void get_arr_int(const std::string & key, std::vector<int> & output, bool required = true) {
+        const int i = gguf_find_key(ctx_gguf.get(), key.c_str());
+        if (i < 0) {
+            if (required) throw std::runtime_error("Key not found: " + key);
+            return;
+        }
+        int n = gguf_get_arr_n(ctx_gguf.get(), i);
+        output.resize(n);
+        const int32_t * values = (const int32_t *)gguf_get_arr_data(ctx_gguf.get(), i);
+        for (int i = 0; i < n; ++i) {
+            output[i] = values[i];
+        }
+    }
+};
+
+// read and create ggml_context containing the tensors and their data
+struct clip_ctx * clip_model_load(const char * fname, const int verbosity) {
+    return clip_init(fname, clip_context_params{
+        /* use_gpu */   true,
+        /* verbosity */ static_cast<ggml_log_level>(verbosity),
+    });
+}
+
+struct clip_ctx * clip_init(const char * fname, struct clip_context_params ctx_params) {
+    g_logger_state.verbosity_thold = ctx_params.verbosity;
+    clip_ctx * ctx_clip = new clip_ctx(ctx_params);
+
+    try {
+        clip_model_loader loader(fname, *ctx_clip);
+        loader.load_hparams();
+        loader.load_tensors();
+        loader.alloc_compute_meta();
+    } catch (const std::exception & e) {
+        LOG_ERR("%s: failed to load model '%s': %s\n", __func__, fname, e.what());
+        delete ctx_clip;
+        return nullptr;
+    }
+
+    return ctx_clip;
 }
 
 void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size) {
-    ctx_clip->load_image_size = load_image_size;
+    ctx_clip->load_image_size = *load_image_size; // copy
 }
 
 struct clip_image_size * clip_get_load_image_size(struct clip_ctx * ctx_clip) {
-    return ctx_clip->load_image_size;
+    return &ctx_clip->load_image_size;
 }
 
 struct clip_image_size * clip_image_size_init() {
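
Each get_* helper above wraps the same three-step dance: find the key index, decide between throwing and silently returning based on the required flag, then write through the output reference so a caller-supplied default survives an optional miss. A generic standalone sketch of that shape, with a std::map in place of the GGUF context (key names are illustrative):

    #include <map>
    #include <stdexcept>
    #include <string>

    static std::map<std::string, int> kv = { {"clip.vision.image_size", 336} };

    // required=true  -> throw if the key is missing
    // required=false -> leave out untouched (caller keeps its default)
    void get_i32(const std::string & key, int & out, bool required = true) {
        auto it = kv.find(key);
        if (it == kv.end()) {
            if (required) throw std::runtime_error("Key not found: " + key);
            return;
        }
        out = it->second;
    }

    int main() {
        int image_size = 224;                          // default
        get_i32("clip.vision.image_size", image_size); // found -> 336
        int n_layer = 24;                              // default kept on optional miss
        get_i32("clip.vision.n_layer", n_layer, false);
        return image_size == 336 && n_layer == 24 ? 0 : 1;
    }
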
@@ -1944,19 +2141,53 @@ struct clip_image_f32 * clip_image_f32_init() {
     return new clip_image_f32();
 }
 
-
-
-
-
-
-
+struct clip_image_f32_batch * clip_image_f32_batch_init() {
+    return new clip_image_f32_batch();
+}
+
+unsigned char * clip_image_u8_get_data(struct clip_image_u8 * img, uint32_t * nx, uint32_t * ny) {
+    if (nx) *nx = img->nx;
+    if (ny) *ny = img->ny;
+    return img->buf.data();
+}
+
+void clip_image_size_free(struct clip_image_size * load_image_size) {
+    if (load_image_size == nullptr) {
+        return;
+    }
+    delete load_image_size;
+}
+void clip_image_u8_free(struct clip_image_u8 * img) { if (img) delete img; }
+void clip_image_f32_free(struct clip_image_f32 * img) { if (img) delete img; }
+void clip_image_u8_batch_free(struct clip_image_u8_batch * batch) { if (batch) delete batch; }
+void clip_image_f32_batch_free(struct clip_image_f32_batch * batch) { if (batch) delete batch; }
+
+size_t clip_image_f32_batch_n_images(const struct clip_image_f32_batch * batch) {
+    return batch->entries.size();
+}
+
+size_t clip_image_f32_batch_nx(const struct clip_image_f32_batch * batch, int idx) {
+    if (idx < 0 || idx >= (int)batch->entries.size()) {
+        LOG_ERR("%s: invalid index %d\n", __func__, idx);
+        return 0;
+    }
+    return batch->entries[idx]->nx;
+}
+
+size_t clip_image_f32_batch_ny(const struct clip_image_f32_batch * batch, int idx) {
+    if (idx < 0 || idx >= (int)batch->entries.size()) {
+        LOG_ERR("%s: invalid index %d\n", __func__, idx);
+        return 0;
     }
+    return batch->entries[idx]->ny;
 }
-
-
-
-
+
+clip_image_f32 * clip_image_f32_get_img(const struct clip_image_f32_batch * batch, int idx) {
+    if (idx < 0 || idx >= (int)batch->entries.size()) {
+        LOG_ERR("%s: invalid index %d\n", __func__, idx);
+        return nullptr;
     }
+    return batch->entries[idx].get();
 }
 
 void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny, clip_image_u8 * img) {
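
The batch accessors above follow one convention: validate the index, log and return a harmless default (0 or nullptr) on misuse, and only then dereference, so nothing throws across the C API boundary. A compressed sketch of that bounds-checked accessor idiom over a vector of owned images (types and names here are illustrative):

    #include <cstdio>
    #include <memory>
    #include <vector>

    struct image { int nx = 0, ny = 0; };
    struct batch { std::vector<std::unique_ptr<image>> entries; };

    // C-style accessor: never throws, returns a safe default and logs on a bad index
    image * batch_get_img(const batch * b, int idx) {
        if (idx < 0 || idx >= (int)b->entries.size()) {
            std::fprintf(stderr, "%s: invalid index %d\n", __func__, idx);
            return nullptr;
        }
        return b->entries[idx].get();
    }

    int main() {
        batch b;
        b.entries.push_back(std::make_unique<image>());
        return batch_get_img(&b, 0) != nullptr && batch_get_img(&b, 5) == nullptr ? 0 : 1;
    }
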
@@ -1990,605 +2221,597 @@ bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length
     return true;
 }
 
-//
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-float
-
-
-
-
-
+// Normalize image to float32 - careful with pytorch .to(model.device, dtype=torch.float16) - this sometimes reduces precision (32>16>32), sometimes not
+static void normalize_image_u8_to_f32(const clip_image_u8 & src, clip_image_f32 & dst, const float mean[3], const float std[3]) {
+    dst.nx = src.nx;
+    dst.ny = src.ny;
+    dst.buf.resize(src.buf.size());
+
+    // TODO @ngxson : seems like this could be done more efficiently on cgraph
+    for (size_t i = 0; i < src.buf.size(); ++i) {
+        int c = i % 3; // rgb
+        dst.buf[i] = (static_cast<float>(src.buf[i]) / 255.0f - mean[c]) / std[c];
+    }
+}
+
+// set of tools to manipulate images
+// in the future, we can have HW acceleration by allowing this struct to access 3rd party lib like imagick or opencv
+struct image_manipulation {
+    // Bilinear resize function
+    static void bilinear_resize(const clip_image_u8& src, clip_image_u8& dst, int target_width, int target_height) {
+        dst.nx = target_width;
+        dst.ny = target_height;
+        dst.buf.resize(3 * target_width * target_height);
+
+        float x_ratio = static_cast<float>(src.nx - 1) / target_width;
+        float y_ratio = static_cast<float>(src.ny - 1) / target_height;
+
+        for (int y = 0; y < target_height; y++) {
+            for (int x = 0; x < target_width; x++) {
+                float px = x_ratio * x;
+                float py = y_ratio * y;
+                int x_floor = static_cast<int>(px);
+                int y_floor = static_cast<int>(py);
+                float x_lerp = px - x_floor;
+                float y_lerp = py - y_floor;
+
+                for (int c = 0; c < 3; c++) {
+                    float top = lerp(
+                        static_cast<float>(src.buf[3 * (y_floor * src.nx + x_floor) + c]),
+                        static_cast<float>(src.buf[3 * (y_floor * src.nx + (x_floor + 1)) + c]),
+                        x_lerp
+                    );
+                    float bottom = lerp(
+                        static_cast<float>(src.buf[3 * ((y_floor + 1) * src.nx + x_floor) + c]),
+                        static_cast<float>(src.buf[3 * ((y_floor + 1) * src.nx + (x_floor + 1)) + c]),
+                        x_lerp
+                    );
+                    dst.buf[3 * (y * target_width + x) + c] = static_cast<uint8_t>(lerp(top, bottom, y_lerp));
+                }
             }
         }
     }
-}
 
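
bilinear_resize samples each destination pixel at a fractional source coordinate: two horizontal lerps (top and bottom neighbor rows) followed by one vertical lerp between them. The arithmetic in isolation, on a single channel (self-contained; the values are illustrative):

    #include <cstdio>

    static float lerp(float s, float e, float t) { return s + (e - s) * t; }

    int main() {
        // 2x2 single-channel source patch
        float p00 = 10, p10 = 20;            // top row
        float p01 = 30, p11 = 40;            // bottom row
        float x_lerp = 0.25f, y_lerp = 0.5f; // fractional sample position

        float top    = lerp(p00, p10, x_lerp);    // 12.5
        float bottom = lerp(p01, p11, x_lerp);    // 32.5
        float sample = lerp(top, bottom, y_lerp); // 22.5
        std::printf("%.1f\n", sample);
    }
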
-//
-
-dst
-
-
+    // Bicubic resize function
+    // part of image will be cropped if the aspect ratio is different
+    static bool bicubic_resize(const clip_image_u8 & img, clip_image_u8 & dst, int target_width, int target_height) {
+        const int nx = img.nx;
+        const int ny = img.ny;
+
+        dst.nx = target_width;
+        dst.ny = target_height;
+        dst.buf.resize(3 * target_width * target_height);
+
+        float Cc;
+        float C[5];
+        float d0, d2, d3, a0, a1, a2, a3;
+        int i, j, k, jj;
+        int x, y;
+        float dx, dy;
+        float tx, ty;
+
+        tx = (float)nx / (float)target_width;
+        ty = (float)ny / (float)target_height;
+
+        // Bicubic interpolation; adapted from ViT.cpp, inspired from :
+        //  -> https://github.com/yglukhov/bicubic-interpolation-image-processing/blob/master/libimage.c#L36
+        //  -> https://en.wikipedia.org/wiki/Bicubic_interpolation
+
+        for (i = 0; i < target_height; i++) {
+            for (j = 0; j < target_width; j++) {
+                x = (int)(tx * j);
+                y = (int)(ty * i);
+
+                dx = tx * j - x;
+                dy = ty * i - y;
+
+                for (k = 0; k < 3; k++) {
+                    for (jj = 0; jj <= 3; jj++) {
+                        d0 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x - 1, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k];
+                        d2 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x + 1, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k];
+                        d3 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x + 2, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k];
+                        a0 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k];
+
+                        a1 = -1.0 / 3 * d0 + d2 - 1.0 / 6 * d3;
+                        a2 =  1.0 / 2 * d0 + 1.0 / 2 * d2;
+                        a3 = -1.0 / 6 * d0 - 1.0 / 2 * d2 + 1.0 / 6 * d3;
+
+                        C[jj] = a0 + a1 * dx + a2 * dx * dx + a3 * dx * dx * dx;
+
+                        d0 = C[0] - C[1];
+                        d2 = C[2] - C[1];
+                        d3 = C[3] - C[1];
+                        a0 = C[1];
+                        a1 = -1.0 / 3 * d0 + d2 - 1.0 / 6 * d3;
+                        a2 =  1.0 / 2 * d0 + 1.0 / 2 * d2;
+                        a3 = -1.0 / 6 * d0 - 1.0 / 2 * d2 + 1.0 / 6 * d3;
+                        Cc = a0 + a1 * dy + a2 * dy * dy + a3 * dy * dy * dy;
+
+                        const uint8_t Cc2 = std::min(std::max(std::round(Cc), 0.0f), 255.0f);
+                        dst.buf[(i * target_width + j) * 3 + k] = float(Cc2);
+                    }
+                }
+            }
+        }
 
-
-        int c = i % 3; // rgb
-        dst->buf[i] = (static_cast<float>(src->buf[i]) / 255.0f - mean[c]) / std[c];
+        return true;
     }
-}
 
-
-
-
+    // llava-1.6 type of resize_and_pad
+    // if the ratio is not 1:1, padding with pad_color will be applied
+    // pad_color is single channel, default is 0 (black)
+    static void resize_and_pad_image(const clip_image_u8 & image, clip_image_u8 & dst, const clip_image_size & target_resolution, std::array<uint8_t, 3> pad_color = {0, 0, 0}) {
+        int target_width  = target_resolution.width;
+        int target_height = target_resolution.height;
+
+        float scale_w = static_cast<float>(target_width) / image.nx;
+        float scale_h = static_cast<float>(target_height) / image.ny;
+
+        int new_width, new_height;
+
+        if (scale_w < scale_h) {
+            new_width  = target_width;
+            new_height = std::min(static_cast<int>(std::ceil(image.ny * scale_w)), target_height);
+        } else {
+            new_height = target_height;
+            new_width  = std::min(static_cast<int>(std::ceil(image.nx * scale_h)), target_width);
+        }
+
+        clip_image_u8 resized_image;
+        bicubic_resize(image, resized_image, new_width, new_height);
+
+        clip_image_u8 padded_image;
+        padded_image.nx = target_width;
+        padded_image.ny = target_height;
+        padded_image.buf.resize(3 * target_width * target_height);
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        tx = (float)nx / (float)target_width;
-        ty = (float)ny / (float)target_height;
-
-        // Bicubic interpolation; adapted from ViT.cpp, inspired from :
-        //  -> https://github.com/yglukhov/bicubic-interpolation-image-processing/blob/master/libimage.c#L36
-        //  -> https://en.wikipedia.org/wiki/Bicubic_interpolation
-
-        for (i = 0; i < target_height; i++) {
-            for (j = 0; j < target_width; j++) {
-                x = (int)(tx * j);
-                y = (int)(ty * i);
-
-                dx = tx * j - x;
-                dy = ty * i - y;
-
-                for (k = 0; k < 3; k++) {
-                    for (jj = 0; jj <= 3; jj++) {
-                        d0 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x - 1, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k];
-                        d2 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x + 1, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k];
-                        d3 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x + 2, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k];
-                        a0 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k];
-
-                        a1 = -1.0 / 3 * d0 + d2 - 1.0 / 6 * d3;
-                        a2 =  1.0 / 2 * d0 + 1.0 / 2 * d2;
-                        a3 = -1.0 / 6 * d0 - 1.0 / 2 * d2 + 1.0 / 6 * d3;
-
-                        C[jj] = a0 + a1 * dx + a2 * dx * dx + a3 * dx * dx * dx;
-
-                        d0 = C[0] - C[1];
-                        d2 = C[2] - C[1];
-                        d3 = C[3] - C[1];
-                        a0 = C[1];
-                        a1 = -1.0 / 3 * d0 + d2 - 1.0 / 6 * d3;
-                        a2 =  1.0 / 2 * d0 + 1.0 / 2 * d2;
-                        a3 = -1.0 / 6 * d0 - 1.0 / 2 * d2 + 1.0 / 6 * d3;
-                        Cc = a0 + a1 * dy + a2 * dy * dy + a3 * dy * dy * dy;
-
-                        const uint8_t Cc2 = std::min(std::max(std::round(Cc), 0.0f), 255.0f);
-                        dst.buf[(i * target_width + j) * 3 + k] = float(Cc2);
+        // Fill the padded image with the fill color
+        for (size_t i = 0; i < padded_image.buf.size(); i += 3) {
+            padded_image.buf[i]     = pad_color[0];
+            padded_image.buf[i + 1] = pad_color[1];
+            padded_image.buf[i + 2] = pad_color[2];
+        }
+
+        // Calculate padding offsets
+        int pad_x = (target_width  - new_width)  / 2;
+        int pad_y = (target_height - new_height) / 2;
+
+        // Copy the resized image into the center of the padded buffer
+        for (int y = 0; y < new_height; ++y) {
+            for (int x = 0; x < new_width; ++x) {
+                for (int c = 0; c < 3; ++c) {
+                    padded_image.buf[3 * ((y + pad_y) * target_width + (x + pad_x)) + c] = resized_image.buf[3 * (y * new_width + x) + c];
                 }
             }
         }
+        dst = std::move(padded_image);
     }
 
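
resize_and_pad_image scales by the smaller of the two axis ratios so the whole image fits, then centers it on a pad_color canvas; the leftover margin is split evenly between the two sides of the shorter axis. The geometry on its own (sizes are illustrative):

    #include <algorithm>
    #include <cmath>
    #include <cstdio>

    int main() {
        int src_w = 640, src_h = 480;
        int target_w = 336, target_h = 336;

        float scale_w = float(target_w) / src_w;   // 0.525
        float scale_h = float(target_h) / src_h;   // 0.7
        int new_w, new_h;
        if (scale_w < scale_h) {                   // width is the limiting axis
            new_w = target_w;
            new_h = std::min(int(std::ceil(src_h * scale_w)), target_h); // 252
        } else {
            new_h = target_h;
            new_w = std::min(int(std::ceil(src_w * scale_h)), target_w);
        }
        int pad_x = (target_w - new_w) / 2;        // 0
        int pad_y = (target_h - new_h) / 2;        // 42 rows of padding top and bottom
        std::printf("resized %dx%d, offset (%d,%d)\n", new_w, new_h, pad_x, pad_y);
    }
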
-
-
+    static void crop_image(const clip_image_u8 & image, clip_image_u8 & dst, int x, int y, int w, int h) {
+        dst.nx = w;
+        dst.ny = h;
+        dst.buf.resize(3 * w * h);
 
-
-
-
-
+        for (int i = 0; i < h; ++i) {
+            for (int j = 0; j < w; ++j) {
+                int src_idx = 3 * ((y + i)*image.nx + (x + j));
+                int dst_idx = 3 * (i*w + j);
+                dst.buf[dst_idx]     = image.buf[src_idx];
+                dst.buf[dst_idx + 1] = image.buf[src_idx + 1];
+                dst.buf[dst_idx + 2] = image.buf[src_idx + 2];
+            }
+        }
+    }
 
-
-
+    // calculate the size of the **resized** image, while preserving the aspect ratio
+    // the calculated size will be aligned to the nearest multiple of align_size
+    // if H or W size is larger than max_dimension, it will be resized to max_dimension
+    static clip_image_size calc_size_preserved_ratio(const clip_image_size & inp_size, const int align_size, const int max_dimension) {
+        if (inp_size.width <= 0 || inp_size.height <= 0 || align_size <= 0 || max_dimension <= 0) {
+            return {0, 0};
+        }
 
-
+        float scale = std::min(1.0f, std::min(static_cast<float>(max_dimension) / inp_size.width,
+                                              static_cast<float>(max_dimension) / inp_size.height));
 
-
-
-            new_height = std::min(static_cast<int>(std::ceil(image.ny * scale_w)), target_height);
-        } else {
-            new_height = target_height;
-            new_width = std::min(static_cast<int>(std::ceil(image.nx * scale_h)), target_width);
-        }
+        float target_width_f  = static_cast<float>(inp_size.width)  * scale;
+        float target_height_f = static_cast<float>(inp_size.height) * scale;
 
-
-
-        bicubic_resize(image, resized_image, new_width, new_height);
+        int aligned_width  = GGML_PAD((int)target_width_f,  align_size);
+        int aligned_height = GGML_PAD((int)target_height_f, align_size);
 
-
-
-        padded_image.ny = target_height;
-        padded_image.buf.resize(3 * target_width * target_height, 0); // Initialize with black
+        return {aligned_width, aligned_height};
+    }
 
-
-
-
+private:
+    static inline int clip(int x, int lower, int upper) {
+        return std::max(lower, std::min(x, upper));
+    }
 
-        //
-
-
-        for (int c = 0; c < 3; ++c) {
-            padded_image.buf[3 * ((y + pad_y) * target_width + (x + pad_x)) + c] = resized_image.buf[3 * (y * new_width + x) + c];
-        }
-    }
+    // Linear interpolation between two points
+    static inline float lerp(float s, float e, float t) {
+        return s + (e - s) * t;
     }
-
-}
+};
 
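
calc_size_preserved_ratio clamps the scale to at most 1 (it never upscales), then rounds each scaled dimension up to the next multiple of align_size, which is what GGML_PAD performs. A standalone version of that computation with the pad expressed directly (the numbers are illustrative):

    #include <algorithm>
    #include <cstdio>

    // round x up to the next multiple of n (what GGML_PAD does)
    static int pad_to(int x, int n) { return ((x + n - 1) / n) * n; }

    int main() {
        int w = 800, h = 600, align = 14, max_dim = 448;

        float scale = std::min(1.0f, std::min(float(max_dim) / w, float(max_dim) / h)); // 0.56
        int aligned_w = pad_to(int(w * scale), align); // 448
        int aligned_h = pad_to(int(h * scale), align); // 336
        std::printf("%dx%d\n", aligned_w, aligned_h);
    }
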
 /**
- *
+ * implementation of LLaVA-UHD:
+ *  - https://arxiv.org/pdf/2403.11703
+ *  - https://github.com/thunlp/LLaVA-UHD
+ *  - https://github.com/thunlp/LLaVA-UHD/blob/302301bc2175f7e717fb8548516188e89f649753/llava_uhd/train/llava-uhd/slice_logic.py#L118
+ *
+ * overview:
+ *   - an image always has a single overview (downscaled image)
+ *   - an image can have 0 or multiple slices, depending on the image size
+ *   - each slice can then be considered as a separate image
 *
- *
- *
- *
+ * for example:
+ *
+ * [overview] --> [slice 1] --> [slice 2]
+ *           |                |
+ *           +--> [slice 3] --> [slice 4]
 */
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            int wasted_resolution = (width * height) - effective_resolution;
-            // LOG_INF("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution);
-            if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_resolution < min_wasted_resolution)) {
-                max_effective_resolution = effective_resolution;
-                min_wasted_resolution = wasted_resolution;
-                best_fit = resolution;
-            }
-        }
-
-        return best_fit;
-    }
+struct llava_uhd {
+    struct slice_coordinates {
+        int x;
+        int y;
+        clip_image_size size;
+    };
+
+    struct slice_instructions {
+        clip_image_size overview_size; // size of downscaled image
+        clip_image_size refined_size;  // size of image right before slicing (must be multiple of slice size)
+        clip_image_size grid_size;     // grid_size.width * grid_size.height = number of slices
+        std::vector<slice_coordinates> slices;
+        bool padding_refined = false;  // if true, refine image will be padded to the grid size (e.g. llava-1.6)
+    };
 
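
Per the comment block above, every image yields one downscaled overview plus zero or more slices, and the slice count is driven by how many slice-sized tiles the image's area covers. That area heuristic in isolation (a sketch; the cap of 9 mirrors the MiniCPM-V limit quoted in the code below):

    #include <algorithm>
    #include <cmath>
    #include <cstdio>

    int main() {
        int w = 1344, h = 896, slice_size = 448, max_slices = 9;

        float ratio  = float(w) * h / (slice_size * slice_size);    // ~6 tiles worth of area
        int multiple = std::min((int)std::ceil(ratio), max_slices); // 6
        bool has_slices = multiple > 1;                             // true -> overview + slices
        std::printf("ratio=%.1f multiple=%d has_slices=%d\n", ratio, multiple, (int)has_slices);
    }
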
-    static
-
-
-
-
-
-
-
-
-
-
-
-
-
+    static int get_max_slices(struct clip_ctx * ctx) {
+        if (clip_is_minicpmv(ctx)) {
+            return 9;
+        }
+        return 0;
+    }
+
+    static slice_instructions get_slice_instructions(struct clip_ctx * ctx, const clip_image_size & original_size) {
+        slice_instructions res;
+        const int patch_size      = clip_get_patch_size(ctx);
+        const int slice_size      = clip_get_image_size(ctx);
+        const int max_slice_nums  = get_max_slices(ctx);
+        const int original_width  = original_size.width;
+        const int original_height = original_size.height;
+        const float log_ratio = log((float)original_width / original_height);
+        const float ratio     = (float)original_width * original_height / (slice_size * slice_size);
+        const int multiple    = fmin(ceil(ratio), max_slice_nums);
+        const bool has_slices    = (multiple > 1);
+        const bool has_pinpoints = !ctx->vision_model.hparams.image_grid_pinpoints.empty();
+
+        if (has_pinpoints) {
+            // has pinpoints, use them to calculate the grid size (e.g. llava-1.6)
+            auto refine_size = llava_uhd::select_best_resolution(
+                ctx->vision_model.hparams.image_grid_pinpoints,
+                original_size);
+            res.overview_size   = clip_image_size{slice_size, slice_size};
+            res.refined_size    = refine_size;
+            res.grid_size       = clip_image_size{0, 0};
+            res.padding_refined = true;
+
+            for (int y = 0; y < refine_size.height; y += slice_size) {
+                for (int x = 0; x < refine_size.width; x += slice_size) {
+                    slice_coordinates slice;
+                    slice.x = x;
+                    slice.y = y;
+                    slice.size.width  = std::min(slice_size, refine_size.width  - x);
+                    slice.size.height = std::min(slice_size, refine_size.height - y);
+                    res.slices.push_back(slice);
+                    if (x == 0) {
+                        res.grid_size.width++;
                     }
                 }
+                res.grid_size.height++;
             }
-                    patches.push_back(patch);
-                }
-            }
-            return patches;
-        }
 
-
-
-        }
+            return res;
+        }
 
-
-        int width = original_size.first;
-        int height = original_size.second;
-        if ((width * height > scale_resolution * scale_resolution) || allow_upscale) {
-            float r = static_cast<float>(width) / height;
-            height = static_cast<int>(scale_resolution / std::sqrt(r));
-            width = static_cast<int>(height * r);
-        }
-        int best_width = ensure_divide(width, patch_size);
-        int best_height = ensure_divide(height, patch_size);
-        return std::make_pair(best_width, best_height);
-    }
+        // no pinpoints, dynamically calculate the grid size (e.g. minicpmv)
 
-
-
-        std::tie(width, height) = original_size;
-        int grid_x, grid_y;
-        std::tie(grid_x, grid_y) = grid;
+        auto best_size    = get_best_resize(original_size, slice_size, patch_size, has_slices);
+        res.overview_size = best_size;
 
-
-
+        if (!has_slices) {
+            // skip slicing logic
+            res.refined_size = clip_image_size{0, 0};
+            res.grid_size    = clip_image_size{0, 0};
 
-
-
+        } else {
+            auto best_grid   = get_best_grid(max_slice_nums, multiple, log_ratio);
+            auto refine_size = get_refine_size(original_size, best_grid, slice_size, patch_size, true);
+            res.grid_size    = best_grid;
+            res.refined_size = refine_size;
+
+            int width  = refine_size.width;
+            int height = refine_size.height;
+            int grid_x = int(width  / best_grid.width);
+            int grid_y = int(height / best_grid.height);
+            for (int patches_y = 0, ic = 0;
+                    patches_y < refine_size.height && ic < best_grid.height;
+                    patches_y += grid_y, ic += 1) {
+                for (int patches_x = 0, jc = 0;
+                        patches_x < refine_size.width && jc < best_grid.width;
+                        patches_x += grid_x, jc += 1) {
+                    slice_coordinates slice;
+                    slice.x = patches_x;
+                    slice.y = patches_y;
+                    slice.size.width  = grid_x;
+                    slice.size.height = grid_y;
+                    res.slices.push_back(slice);
+                    // LOG_INF("slice %d: %d %d %d %d\n", ic, patches_i, patches_j, grid_x, grid_y);
+                }
+            }
+        }
 
-
-
-        int best_grid_width, best_grid_height;
-        std::tie(best_grid_width, best_grid_height) = best_grid_size;
+        return res;
+    }
 
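
In the pinpoint branch above, the refined image is walked in slice_size steps and the grid dimensions are counted as a side effect of the walk. The same walk in a standalone form (in this sketch the width is counted on the first row and the height once per row; sizes are illustrative):

    #include <algorithm>
    #include <cstdio>

    int main() {
        int refined_w = 1008, refined_h = 672, slice_size = 336;
        int grid_w = 0, grid_h = 0, n_slices = 0;

        for (int y = 0; y < refined_h; y += slice_size) {
            for (int x = 0; x < refined_w; x += slice_size) {
                // edge slices may be narrower/shorter than slice_size
                int sw = std::min(slice_size, refined_w - x);
                int sh = std::min(slice_size, refined_h - y);
                (void)sw; (void)sh;
                ++n_slices;
                if (y == 0) ++grid_w; // count columns on the first row
            }
            ++grid_h;                 // one increment per row
        }
        std::printf("grid %dx%d, %d slices\n", grid_w, grid_h, n_slices); // 3x2, 6
    }
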
-
-
-        return refine_size;
-    }
+    static std::vector<clip_image_u8_ptr> slice_image(const clip_image_u8 * img, const slice_instructions & inst) {
+        std::vector<clip_image_u8_ptr> output;
 
-
-
-
-
-
+        // resize to overview size
+        clip_image_u8_ptr resized_img(clip_image_u8_init());
+        image_manipulation::bicubic_resize(*img, *resized_img, inst.overview_size.width, inst.overview_size.height);
+        output.push_back(std::move(resized_img));
+        if (inst.slices.empty()) {
+            // no slices, just return the resized image
+            return output;
         }
-            candidate_split_grids_nums.push_back(i);
-        }
 
-
-
-
-
-
-
+        // resize to refined size
+        clip_image_u8_ptr refined_img(clip_image_u8_init());
+        if (inst.padding_refined) {
+            image_manipulation::resize_and_pad_image(*img, *refined_img, inst.refined_size);
+        } else {
+            image_manipulation::bilinear_resize(*img, *refined_img, inst.refined_size.width, inst.refined_size.height);
+        }
+
+        // create slices
+        for (const auto & slice : inst.slices) {
+            int x = slice.x;
+            int y = slice.y;
+            int w = slice.size.width;
+            int h = slice.size.height;
+
+            clip_image_u8_ptr img_slice(clip_image_u8_init());
+            image_manipulation::crop_image(*refined_img, *img_slice, x, y, w, h);
+            output.push_back(std::move(img_slice));
+        }
+
+        return output;
+    }
+
+private:
+    static clip_image_size get_best_resize(const clip_image_size & original_size, int scale_resolution, int patch_size, bool allow_upscale = false) {
+        int width  = original_size.width;
+        int height = original_size.height;
+        if ((width * height > scale_resolution * scale_resolution) || allow_upscale) {
+            float r = static_cast<float>(width) / height;
+            height  = static_cast<int>(scale_resolution / std::sqrt(r));
+            width   = static_cast<int>(height * r);
+        }
+        clip_image_size res;
+        res.width  = ensure_divide(width,  patch_size);
+        res.height = ensure_divide(height, patch_size);
+        return res;
+    }
+
+    /**
+     * Selects the best resolution from a list of possible resolutions based on the original size.
+     *
+     * @param original_size The original size of the image
+     * @param possible_resolutions A list of possible resolutions
+     * @return The best fit resolution
+     */
+    static clip_image_size select_best_resolution(const clip_image_size & original_size, const std::vector<clip_image_size> & possible_resolutions) {
+        int original_width  = original_size.width;
+        int original_height = original_size.height;
+        clip_image_size best_fit;
+        int max_effective_resolution = 0;
+        int min_wasted_resolution    = std::numeric_limits<int>::max();
+
+        for (const auto & resolution : possible_resolutions) {
+            int width  = resolution.width;
+            int height = resolution.height;
+            float scale = std::min(static_cast<float>(width) / original_width, static_cast<float>(height) / original_height);
+            int downscaled_width  = static_cast<int>(original_width  * scale);
+            int downscaled_height = static_cast<int>(original_height * scale);
+            int effective_resolution = std::min(downscaled_width * downscaled_height, original_width * original_height);
+            int wasted_resolution = (width * height) - effective_resolution;
+            // LOG_INF("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution);
+            if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_resolution < min_wasted_resolution)) {
+                max_effective_resolution = effective_resolution;
+                min_wasted_resolution    = wasted_resolution;
+                best_fit = resolution;
             }
-            ++m;
         }
+
+        return best_fit;
     }
 
-
-
-
-
-
-        best_grid = grid;
-        min_error = error;
+    // used by llava 1.6 with custom list of pinpoints
+    static clip_image_size select_best_resolution(const std::vector<int32_t> & pinpoints, const clip_image_size & original_size) {
+        std::vector<clip_image_size> possible_resolutions;
+        for (size_t i = 0; i < pinpoints.size(); i += 2) {
+            possible_resolutions.push_back(clip_image_size{pinpoints[i], pinpoints[i+1]});
         }
+        return select_best_resolution(original_size, possible_resolutions);
     }
-    return best_grid;
-}
 
-
-
-
-
-    static
-
-
-
-
-
-
-
-
-
-
-
-
-    auto
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        int height = refine_image->ny;
-        int grid_x = int(width / best_grid.first);
-        int grid_y = int(height / best_grid.second);
-        for (int patches_i = 0, ic = 0; patches_i < height && ic < best_grid.second; patches_i += grid_y, ic += 1){
-            images.push_back(std::vector<clip_image_u8 *>());
-            for(int patches_j = 0, jc = 0; patches_j < width && jc < best_grid.first; patches_j += grid_x, jc += 1){
-                clip_image_u8 * patch = clip_image_u8_init();
-                patch->nx = grid_x;
-                patch->ny = grid_y;
-                patch->buf.resize(3 * patch->nx * patch->ny);
-                for (int y = patches_i; y < patches_i + grid_y; ++y) {
-                    for (int x = patches_j; x < patches_j + grid_x; ++x) {
-                        const int i = 3 * (y * refine_image->nx + x);
-                        const int j = 3 * ((y-patches_i) * patch->nx + (x-patches_j));
-                        patch->buf[j]   = refine_image->buf[i];
-                        patch->buf[j+1] = refine_image->buf[i+1];
-                        patch->buf[j+2] = refine_image->buf[i+2];
-                    }
+    static int ensure_divide(int length, int patch_size) {
+        return std::max(static_cast<int>(std::round(static_cast<float>(length) / patch_size) * patch_size), patch_size);
+    }
+
+    static clip_image_size get_refine_size(const clip_image_size & original_size, const clip_image_size & grid, int scale_resolution, int patch_size, bool allow_upscale = false) {
+        int width  = original_size.width;
+        int height = original_size.height;
+
int grid_x = grid.width;
|
|
2657
|
+
int grid_y = grid.height;
|
|
2658
|
+
|
|
2659
|
+
int refine_width = ensure_divide(width, grid_x);
|
|
2660
|
+
int refine_height = ensure_divide(height, grid_y);
|
|
2661
|
+
|
|
2662
|
+
clip_image_size grid_size;
|
|
2663
|
+
grid_size.width = refine_width / grid_x;
|
|
2664
|
+
grid_size.height = refine_height / grid_y;
|
|
2665
|
+
|
|
2666
|
+
auto best_grid_size = get_best_resize(grid_size, scale_resolution, patch_size, allow_upscale);
|
|
2667
|
+
int best_grid_width = best_grid_size.width;
|
|
2668
|
+
int best_grid_height = best_grid_size.height;
|
|
2669
|
+
|
|
2670
|
+
clip_image_size refine_size;
|
|
2671
|
+
refine_size.width = best_grid_width * grid_x;
|
|
2672
|
+
refine_size.height = best_grid_height * grid_y;
|
|
2673
|
+
return refine_size;
|
|
2674
|
+
}
|
|
2675
|
+
|
|
2676
|
+
static clip_image_size get_best_grid(const int max_slice_nums, const int multiple, const float log_ratio) {
|
|
2677
|
+
std::vector<int> candidate_split_grids_nums;
|
|
2678
|
+
for (int i : {multiple - 1, multiple, multiple + 1}) {
|
|
2679
|
+
if (i == 1 || i > max_slice_nums) {
|
|
2680
|
+
continue;
|
|
2681
|
+
}
|
|
2682
|
+
candidate_split_grids_nums.push_back(i);
|
|
2683
|
+
}
|
|
2684
|
+
|
|
2685
|
+
std::vector<clip_image_size> candidate_grids;
|
|
2686
|
+
for (int split_grids_nums : candidate_split_grids_nums) {
|
|
2687
|
+
int m = 1;
|
|
2688
|
+
while (m <= split_grids_nums) {
|
|
2689
|
+
if (split_grids_nums % m == 0) {
|
|
2690
|
+
candidate_grids.push_back(clip_image_size{m, split_grids_nums / m});
|
|
2340
2691
|
}
|
|
2341
|
-
|
|
2692
|
+
++m;
|
|
2342
2693
|
}
|
|
2343
2694
|
}
|
|
2344
|
-
|
|
2695
|
+
|
|
2696
|
+
clip_image_size best_grid{1, 1};
|
|
2697
|
+
float min_error = std::numeric_limits<float>::infinity();
|
|
2698
|
+
for (const auto& grid : candidate_grids) {
|
|
2699
|
+
float error = std::abs(log_ratio - std::log(1.0 * grid.width / grid.height));
|
|
2700
|
+
if (error < min_error) {
|
|
2701
|
+
best_grid = grid;
|
|
2702
|
+
min_error = error;
|
|
2703
|
+
}
|
|
2704
|
+
}
|
|
2705
|
+
return best_grid;
|
|
2345
2706
|
}
|
|
2346
|
-
|
|
2347
|
-
}
|
|
2707
|
+
};
|
|
2348
2708
|
|
|
2709
|
+
// TODO @ngxson : decprecate the load_image_size singleton pattern
|
|
2349
2710
|
int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip) {
|
|
2350
|
-
const
|
|
2351
|
-
|
|
2352
|
-
const int original_width = ctx_clip->load_image_size->width;
|
|
2353
|
-
const int original_height = ctx_clip->load_image_size->height;
|
|
2354
|
-
const float log_ratio = log(1.0*original_width/original_height);
|
|
2355
|
-
const float ratio = 1.0 * original_width * original_height/ (scale_resolution * scale_resolution);
|
|
2356
|
-
const int multiple = fmin(ceil(ratio), max_slice_nums);
|
|
2357
|
-
std::pair<int, int> best_grid = uhd_best_grid(max_slice_nums, multiple, log_ratio);
|
|
2358
|
-
return best_grid.first;
|
|
2711
|
+
const auto inst = llava_uhd::get_slice_instructions(ctx_clip, ctx_clip->load_image_size);
|
|
2712
|
+
return inst.grid_size.width;
|
|
2359
2713
|
}
|
|
2360
2714
|
|
|
2361
2715
|
// returns the normalized float tensor for llava-1.5, for spatial_unpad with anyres processing for llava-1.6 it returns the normalized image patch tensors as a vector
|
|
2362
2716
|
// res_imgs memory is being allocated here, previous allocations will be freed if found
|
|
2363
|
-
bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch * res_imgs) {
|
|
2717
|
+
bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, struct clip_image_f32_batch * res_imgs) {
|
|
2718
|
+
clip_image_size original_size{img->nx, img->ny};
|
|
2719
|
+
bool pad_to_square = true;
|
|
2720
|
+
auto & params = ctx->vision_model.hparams;
|
|
2721
|
+
// The model config actually contains all we need to decide on how to preprocess, here we automatically switch to the new llava-1.6 preprocessing
|
|
2722
|
+
if (params.mm_patch_merge_type == PATCH_MERGE_SPATIAL_UNPAD) {
|
|
2723
|
+
pad_to_square = false;
|
|
2724
|
+
}
|
|
2725
|
+
|
|
2726
|
+
if (clip_is_minicpmv(ctx)) {
|
|
2727
|
+
auto const inst = llava_uhd::get_slice_instructions(ctx, original_size);
|
|
2728
|
+
std::vector<clip_image_u8_ptr> imgs = llava_uhd::slice_image(img, inst);
|
|
2364
2729
|
|
|
2365
|
-
if(clip_is_minicpmv(ctx)){
|
|
2366
|
-
int max_slice_nums = 9;
|
|
2367
|
-
std::vector<std::vector<clip_image_u8 *>> imgs = uhd_slice_image(img, max_slice_nums);
|
|
2368
|
-
res_imgs->size = 0;
|
|
2369
|
-
for (size_t i = 0; i < imgs.size(); ++i){
|
|
2370
|
-
res_imgs->size += imgs[i].size();
|
|
2371
|
-
}
|
|
2372
|
-
res_imgs->data = new clip_image_f32[res_imgs->size];
|
|
2373
|
-
int idx = 0;
|
|
2374
|
-
for (size_t i = 0; i < imgs.size(); ++i) {
|
|
2375
|
-
for (size_t j = 0; j < imgs[i].size(); ++j) {
|
|
2376
|
-
LOG_DBG("%s: %d %d\n", __func__,imgs[i][j]->nx,imgs[i][j]->ny);
|
|
2377
|
-
clip_image_f32 * res = clip_image_f32_init();
|
|
2378
|
-
normalize_image_u8_to_f32(imgs[i][j], res, ctx->image_mean, ctx->image_std);
|
|
2379
|
-
res_imgs->data[idx++] = *res;
|
|
2380
|
-
clip_image_f32_free(res);
|
|
2381
|
-
}
|
|
2382
|
-
}
|
|
2383
2730
|
for (size_t i = 0; i < imgs.size(); ++i) {
|
|
2384
|
-
|
|
2385
|
-
|
|
2386
|
-
|
|
2387
|
-
|
|
2388
|
-
}
|
|
2731
|
+
// clip_image_save_to_bmp(*imgs[i], "slice_" + std::to_string(i) + ".bmp");
|
|
2732
|
+
clip_image_f32_ptr res(clip_image_f32_init());
|
|
2733
|
+
normalize_image_u8_to_f32(*imgs[i], *res, ctx->image_mean, ctx->image_std);
|
|
2734
|
+
res_imgs->entries.push_back(std::move(res));
|
|
2389
2735
|
}
|
|
2390
2736
|
return true;
|
|
2391
2737
|
}
|
|
2392
|
-
else if (ctx->
|
|
2393
|
-
clip_image_u8
|
|
2394
|
-
auto patch_size =
|
|
2738
|
+
else if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type == PROJECTOR_TYPE_QWEN25VL) {
|
|
2739
|
+
clip_image_u8 resized;
|
|
2740
|
+
auto patch_size = clip_get_patch_size(ctx) * 2;
|
|
2395
2741
|
int nx = ceil((float)img->nx / patch_size) * patch_size;
|
|
2396
2742
|
int ny = ceil((float)img->ny / patch_size) * patch_size;
|
|
2397
|
-
bicubic_resize(*img,
|
|
2743
|
+
image_manipulation::bicubic_resize(*img, resized, nx, ny);
|
|
2398
2744
|
|
|
2399
|
-
|
|
2400
|
-
//
|
|
2401
|
-
normalize_image_u8_to_f32(resized,
|
|
2745
|
+
clip_image_f32_ptr img_f32(clip_image_f32_init());
|
|
2746
|
+
// clip_image_f32_ptr res(clip_image_f32_init());
|
|
2747
|
+
normalize_image_u8_to_f32(resized, *img_f32, ctx->image_mean, ctx->image_std);
|
|
2402
2748
|
// res_imgs->data[0] = *res;
|
|
2403
|
-
res_imgs->
|
|
2404
|
-
|
|
2405
|
-
// clip_image_f32_free(res);
|
|
2406
|
-
clip_image_u8_free(resized);
|
|
2749
|
+
res_imgs->entries.push_back(std::move(img_f32));
|
|
2407
2750
|
return true;
|
|
2408
2751
|
}
|
|
2409
|
-
|
|
2410
|
-
|
|
2411
|
-
|
|
2412
|
-
res_imgs->data = new clip_image_f32[res_imgs->size];
|
|
2752
|
+
else if (ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE
|
|
2753
|
+
|| ctx->proj_type == PROJECTOR_TYPE_GEMMA3
|
|
2754
|
+
|| ctx->proj_type == PROJECTOR_TYPE_IDEFICS3) {
|
|
2413
2755
|
clip_image_u8 resized_image;
|
|
2414
|
-
|
|
2415
|
-
|
|
2416
|
-
|
|
2756
|
+
int sz = params.image_size;
|
|
2757
|
+
image_manipulation::resize_and_pad_image(*img, resized_image, {sz, sz});
|
|
2758
|
+
clip_image_f32_ptr img_f32(clip_image_f32_init());
|
|
2417
2759
|
//clip_image_save_to_bmp(resized_image, "resized.bmp");
|
|
2418
|
-
normalize_image_u8_to_f32(
|
|
2419
|
-
res_imgs->
|
|
2420
|
-
clip_image_f32_free(res);
|
|
2760
|
+
normalize_image_u8_to_f32(resized_image, *img_f32, ctx->image_mean, ctx->image_std);
|
|
2761
|
+
res_imgs->entries.push_back(std::move(img_f32));
|
|
2421
2762
|
return true;
|
|
2422
2763
|
}
|
|
2423
|
-
|
|
2424
|
-
|
|
2425
|
-
|
|
2426
|
-
|
|
2427
|
-
|
|
2428
|
-
|
|
2429
|
-
|
|
2430
|
-
|
|
2431
|
-
if (strcmp(params.mm_patch_merge_type, "spatial_unpad") == 0) {
|
|
2432
|
-
pad_to_square = false;
|
|
2433
|
-
}
|
|
2434
|
-
// free the previous res_imgs if any set
|
|
2435
|
-
if (res_imgs->size > 0) {
|
|
2436
|
-
clip_image_f32_batch_free(res_imgs);
|
|
2764
|
+
else if (ctx->proj_type == PROJECTOR_TYPE_PIXTRAL) {
|
|
2765
|
+
clip_image_u8 resized_image;
|
|
2766
|
+
auto new_size = image_manipulation::calc_size_preserved_ratio(original_size, params.patch_size, params.image_size);
|
|
2767
|
+
image_manipulation::bilinear_resize(*img, resized_image, new_size.width, new_size.height);
|
|
2768
|
+
clip_image_f32_ptr img_f32(clip_image_f32_init());
|
|
2769
|
+
normalize_image_u8_to_f32(resized_image, *img_f32, ctx->image_mean, ctx->image_std);
|
|
2770
|
+
res_imgs->entries.push_back(std::move(img_f32));
|
|
2771
|
+
return true;
|
|
2437
2772
|
}
|
|
2438
|
-
res_imgs->data = nullptr;
|
|
2439
|
-
res_imgs->size = 0;
|
|
2440
2773
|
|
|
2441
2774
|
// the logic below is to pad the shorter side to the longer side with a background color: rgb(122, 116, 104)
|
|
2442
2775
|
// see https://github.com/haotian-liu/LLaVA/blob/e854a2bf85118c504f6f16bf5c3c7c92f8fa8c6b/llava/conversation.py#L113-L156
|
|
2443
2776
|
|
|
2444
|
-
|
|
2445
|
-
|
|
2446
|
-
|
|
2777
|
+
clip_image_u8_ptr temp(clip_image_u8_init()); // we will keep the input image data here temporarily
|
|
2778
|
+
|
|
2779
|
+
if (pad_to_square) {
|
|
2780
|
+
// for llava-1.5, we resize image to a square, and pad the shorter side with a background color
|
|
2781
|
+
// see https://github.com/haotian-liu/LLaVA/blob/e854a2bf85118c504f6f16bf5c3c7c92f8fa8c6b/llava/conversation.py#L113-L156
|
|
2782
|
+
const int longer_side = std::max(img->nx, img->ny);
|
|
2447
2783
|
temp->nx = longer_side;
|
|
2448
2784
|
temp->ny = longer_side;
|
|
2449
2785
|
temp->buf.resize(3 * longer_side * longer_side);
|
|
2450
|
-
const uint8_t bc[3] = {122, 116, 104}; // background color in RGB from LLaVA (this is the mean rgb color * 255)
|
|
2451
2786
|
|
|
2452
|
-
//
|
|
2453
|
-
|
|
2454
|
-
temp->buf[i] = bc[i % 3];
|
|
2455
|
-
}
|
|
2787
|
+
// background color in RGB from LLaVA (this is the mean rgb color * 255)
|
|
2788
|
+
const std::array<uint8_t, 3> pad_color = {122, 116, 104};
|
|
2456
2789
|
|
|
2457
|
-
//
|
|
2458
|
-
|
|
2459
|
-
for (int x = 0; x < img->nx; x++) {
|
|
2460
|
-
const int i = 3 * (y * img->nx + x);
|
|
2461
|
-
const int j = 3 * (y * temp->nx + x);
|
|
2462
|
-
temp->buf[j] = img->buf[i];
|
|
2463
|
-
temp->buf[j+1] = img->buf[i+1];
|
|
2464
|
-
temp->buf[j+2] = img->buf[i+2];
|
|
2465
|
-
}
|
|
2466
|
-
}
|
|
2467
|
-
} else {
|
|
2468
|
-
if (!params.image_grid_pinpoints.empty()) {
|
|
2469
|
-
// "spatial_unpad" with "anyres" processing for llava-1.6
|
|
2470
|
-
std::vector<std::pair<int, int>> possible_resolutions;
|
|
2471
|
-
for (size_t i = 0; i < params.image_grid_pinpoints.size(); i+=2) {
|
|
2472
|
-
possible_resolutions.push_back({params.image_grid_pinpoints[i], params.image_grid_pinpoints[i+1]});
|
|
2473
|
-
}
|
|
2474
|
-
std::pair<int, int> best_resolution = select_best_resolution({img->nx, img->ny}, possible_resolutions);
|
|
2475
|
-
// clip_image_save_to_bmp(*img, "input.bmp");
|
|
2476
|
-
resize_and_pad_image(*img, *temp, best_resolution); // we do not pad with mean-bg color anymore in llava-1.6
|
|
2477
|
-
// clip_image_save_to_bmp(*temp, "resized.bmp");
|
|
2478
|
-
// visually verify normalized image:
|
|
2479
|
-
// normalize_image_u8_to_f32(*temp, *res, ctx->image_mean, ctx->image_std);
|
|
2480
|
-
// {
|
|
2481
|
-
// clip_image_u8 * temp2 = clip_image_u8_init();
|
|
2482
|
-
// clip_image_convert_f32_to_u8(*res, *temp2);
|
|
2483
|
-
// clip_image_save_to_bmp(*temp2, "resized_normalized_f32.bmp");
|
|
2484
|
-
// clip_image_u8_free(temp2);
|
|
2485
|
-
// }
|
|
2486
|
-
|
|
2487
|
-
std::vector<clip_image_u8 *> patches = divide_to_patches_u8(*temp, params.image_size); // prepare spatial sorted main patches of image_size each (336 in llava-1.6)
|
|
2488
|
-
|
|
2489
|
-
clip_image_u8 *image_original_resize = clip_image_u8_init();
|
|
2490
|
-
// bilinear_resize(*img, *image_original_resize, params.image_size, params.image_size); // in python this is "shortest_edge", but all CLIP are square
|
|
2491
|
-
bicubic_resize(*img, *image_original_resize, params.image_size, params.image_size); // in python this is "shortest_edge", but all CLIP are square
|
|
2492
|
-
patches.insert(patches.begin(), image_original_resize);
|
|
2493
|
-
// clip_image_f32_batch_init(patches.size());
|
|
2494
|
-
res_imgs->size = patches.size();
|
|
2495
|
-
res_imgs->data = new clip_image_f32[res_imgs->size];
|
|
2496
|
-
int num=0;
|
|
2497
|
-
for (auto& patch : patches) {
|
|
2498
|
-
normalize_image_u8_to_f32(patch, &res_imgs->data[num], ctx->image_mean, ctx->image_std);
|
|
2499
|
-
num++;
|
|
2500
|
-
}
|
|
2790
|
+
// resize the image to the target_size
|
|
2791
|
+
image_manipulation::resize_and_pad_image(*img, *temp, clip_image_size{params.image_size, params.image_size}, pad_color);
|
|
2501
2792
|
|
|
2502
|
-
|
|
2503
|
-
|
|
2504
|
-
|
|
2505
|
-
|
|
2793
|
+
clip_image_f32_ptr res(clip_image_f32_init());
|
|
2794
|
+
normalize_image_u8_to_f32(*temp, *res, ctx->image_mean, ctx->image_std);
|
|
2795
|
+
res_imgs->entries.push_back(std::move(res));
|
|
2796
|
+
return true;
|
|
2506
2797
|
|
|
2507
|
-
|
|
2798
|
+
} else if (!params.image_grid_pinpoints.empty()) {
|
|
2799
|
+
// "spatial_unpad" with "anyres" processing for llava-1.6
|
|
2800
|
+
auto const inst = llava_uhd::get_slice_instructions(ctx, original_size);
|
|
2801
|
+
std::vector<clip_image_u8_ptr> imgs = llava_uhd::slice_image(img, inst);
|
|
2508
2802
|
|
|
2509
|
-
|
|
2510
|
-
|
|
2511
|
-
|
|
2512
|
-
|
|
2513
|
-
|
|
2514
|
-
memcpy(temp->buf.data(), img->buf.data(), temp->buf.size());
|
|
2803
|
+
for (size_t i = 0; i < imgs.size(); ++i) {
|
|
2804
|
+
// clip_image_save_to_bmp(*imgs[i], "slice_" + std::to_string(i) + ".bmp");
|
|
2805
|
+
clip_image_f32_ptr res(clip_image_f32_init());
|
|
2806
|
+
normalize_image_u8_to_f32(*imgs[i], *res, ctx->image_mean, ctx->image_std);
|
|
2807
|
+
res_imgs->entries.push_back(std::move(res));
|
|
2515
2808
|
}
|
|
2516
|
-
}
|
|
2517
|
-
|
|
2518
|
-
const int nx = temp->nx;
|
|
2519
|
-
const int ny = temp->ny;
|
|
2520
|
-
// clip_image_save_to_bmp(*temp, "resized_vanilla.bmp");
|
|
2521
|
-
|
|
2522
|
-
const int nx2 = ctx->vision_model.hparams.image_size;
|
|
2523
|
-
const int ny2 = ctx->vision_model.hparams.image_size;
|
|
2524
|
-
clip_image_f32 * res = clip_image_f32_init();
|
|
2525
|
-
res->nx = nx2;
|
|
2526
|
-
res->ny = ny2;
|
|
2527
|
-
res->buf.resize(3 * nx2 * ny2);
|
|
2528
|
-
|
|
2529
|
-
const float scale = std::max(nx, ny) / (float)ctx->vision_model.hparams.image_size;
|
|
2530
|
-
|
|
2531
|
-
const int nx3 = int(nx / scale + 0.5f);
|
|
2532
|
-
const int ny3 = int(ny / scale + 0.5f);
|
|
2533
|
-
|
|
2534
|
-
const auto & m3 = ctx->image_mean; // {0.48145466f, 0.4578275f, 0.40821073f};
|
|
2535
|
-
const auto & s3 = ctx->image_std; // {0.26862954f, 0.26130258f, 0.27577711f};
|
|
2536
|
-
|
|
2537
|
-
for (int y = 0; y < ny3; y++) {
|
|
2538
|
-
for (int x = 0; x < nx3; x++) {
|
|
2539
|
-
for (int c = 0; c < 3; c++) {
|
|
2540
|
-
// linear interpolation
|
|
2541
|
-
const float sx = (x + 0.5f) * scale - 0.5f;
|
|
2542
|
-
const float sy = (y + 0.5f) * scale - 0.5f;
|
|
2543
|
-
|
|
2544
|
-
const int x0 = std::max(0, (int)std::floor(sx));
|
|
2545
|
-
const int y0 = std::max(0, (int)std::floor(sy));
|
|
2546
2809
|
|
|
2547
|
-
|
|
2548
|
-
const int y1 = std::min(y0 + 1, ny - 1);
|
|
2549
|
-
|
|
2550
|
-
const float dx = sx - x0;
|
|
2551
|
-
const float dy = sy - y0;
|
|
2552
|
-
|
|
2553
|
-
const int j00 = 3 * (y0 * nx + x0) + c;
|
|
2554
|
-
const int j01 = 3 * (y0 * nx + x1) + c;
|
|
2555
|
-
const int j10 = 3 * (y1 * nx + x0) + c;
|
|
2556
|
-
const int j11 = 3 * (y1 * nx + x1) + c;
|
|
2557
|
-
|
|
2558
|
-
const float v00 = temp->buf[j00];
|
|
2559
|
-
const float v01 = temp->buf[j01];
|
|
2560
|
-
const float v10 = temp->buf[j10];
|
|
2561
|
-
const float v11 = temp->buf[j11];
|
|
2562
|
-
|
|
2563
|
-
const float v0 = v00 * (1.0f - dx) + v01 * dx;
|
|
2564
|
-
const float v1 = v10 * (1.0f - dx) + v11 * dx;
|
|
2565
|
-
|
|
2566
|
-
const float v = v0 * (1.0f - dy) + v1 * dy;
|
|
2567
|
-
|
|
2568
|
-
const uint8_t v2 = std::min(std::max(std::round(v), 0.0f), 255.0f);
|
|
2569
|
-
|
|
2570
|
-
const int i = 3 * (y * nx3 + x) + c;
|
|
2810
|
+
return true;
|
|
2571
2811
|
|
|
2572
|
-
res->buf[i] = ((float(v2) / 255.0f) - m3[c]) / s3[c];
|
|
2573
|
-
}
|
|
2574
|
-
}
|
|
2575
2812
|
}
|
|
2576
|
-
clip_image_u8_free(temp);
|
|
2577
|
-
|
|
2578
|
-
// {
|
|
2579
|
-
// clip_image_u8 * temp2 = clip_image_u8_init();
|
|
2580
|
-
// clip_image_convert_f32_to_u8(*res, *temp2);
|
|
2581
|
-
// clip_image_save_to_bmp(*temp2, "resized_normalized_f32_vanilla.bmp");
|
|
2582
|
-
// clip_image_u8_free(temp2);
|
|
2583
|
-
// }
|
|
2584
|
-
// res_imgs.push_back(res);
|
|
2585
|
-
|
|
2586
|
-
res_imgs->size = 1;
|
|
2587
|
-
res_imgs->data = new clip_image_f32[res_imgs->size];
|
|
2588
|
-
res_imgs->data[0] = *res;
|
|
2589
|
-
clip_image_f32_free(res);
|
|
2590
2813
|
|
|
2591
|
-
|
|
2814
|
+
GGML_ASSERT(false && "Unknown image preprocessing type");
|
|
2592
2815
|
}
|
|
2593
2816
|
|
|
2594
2817
|
ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx) {
|
|
@@ -2596,35 +2819,40 @@ ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx) {
|
|
|
2596
2819
|
}
|
|
2597
2820
|
|
|
2598
2821
|
void clip_free(clip_ctx * ctx) {
|
|
2822
|
+
if (ctx == nullptr) {
|
|
2823
|
+
return;
|
|
2824
|
+
}
|
|
2599
2825
|
delete ctx;
|
|
2600
2826
|
}
|
|
2601
2827
|
|
|
2828
|
+
// deprecated
|
|
2602
2829
|
size_t clip_embd_nbytes(const struct clip_ctx * ctx) {
|
|
2603
|
-
|
|
2604
|
-
|
|
2830
|
+
const int32_t nx = ctx->vision_model.hparams.image_size;
|
|
2831
|
+
const int32_t ny = ctx->vision_model.hparams.image_size;
|
|
2832
|
+
return clip_embd_nbytes_by_img(ctx, nx, ny);
|
|
2605
2833
|
}
|
|
2606
2834
|
|
|
2607
|
-
size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int
|
|
2835
|
+
size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_w, int img_h) {
|
|
2608
2836
|
clip_image_f32 img;
|
|
2609
2837
|
img.nx = img_w;
|
|
2610
2838
|
img.ny = img_h;
|
|
2611
|
-
return
|
|
2839
|
+
return clip_n_output_tokens(ctx, &img) * clip_n_mmproj_embd(ctx) * sizeof(float);
|
|
2612
2840
|
}
|
|
2613
2841
|
|
|
2614
|
-
int32_t
|
|
2842
|
+
int32_t clip_get_image_size(const struct clip_ctx * ctx) {
|
|
2615
2843
|
return ctx->vision_model.hparams.image_size;
|
|
2616
2844
|
}
|
|
2617
2845
|
|
|
2618
|
-
int32_t
|
|
2846
|
+
int32_t clip_get_patch_size(const struct clip_ctx * ctx) {
|
|
2619
2847
|
return ctx->vision_model.hparams.patch_size;
|
|
2620
2848
|
}
|
|
2621
2849
|
|
|
2622
|
-
int32_t
|
|
2850
|
+
int32_t clip_get_hidden_size(const struct clip_ctx * ctx) {
|
|
2623
2851
|
return ctx->vision_model.hparams.hidden_size;
|
|
2624
2852
|
}
|
|
2625
2853
|
|
|
2626
2854
|
const char * clip_patch_merge_type(const struct clip_ctx * ctx) {
|
|
2627
|
-
return ctx->vision_model.hparams.mm_patch_merge_type;
|
|
2855
|
+
return ctx->vision_model.hparams.mm_patch_merge_type == PATCH_MERGE_SPATIAL_UNPAD ? "spatial_unpad" : "flat";
|
|
2628
2856
|
}
|
|
2629
2857
|
|
|
2630
2858
|
const int32_t * clip_image_grid(const struct clip_ctx * ctx) {
|
|
@@ -2638,21 +2866,44 @@ size_t get_clip_image_grid_size(const struct clip_ctx * ctx) {
|
|
|
2638
2866
|
return ctx->vision_model.hparams.image_grid_pinpoints.size();
|
|
2639
2867
|
}
|
|
2640
2868
|
|
|
2869
|
+
// deprecated
|
|
2641
2870
|
int clip_n_patches(const struct clip_ctx * ctx) {
|
|
2642
2871
|
clip_image_f32 img;
|
|
2643
2872
|
img.nx = ctx->vision_model.hparams.image_size;
|
|
2644
2873
|
img.ny = ctx->vision_model.hparams.image_size;
|
|
2645
|
-
return
|
|
2874
|
+
return clip_n_output_tokens(ctx, &img);
|
|
2646
2875
|
}
|
|
2647
2876
|
|
|
2877
|
+
// deprecated
|
|
2648
2878
|
int clip_n_patches_by_img(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
|
|
2879
|
+
return clip_n_output_tokens(ctx, img);
|
|
2880
|
+
}
|
|
2881
|
+
|
|
2882
|
+
int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
|
|
2883
|
+
const auto & params = ctx->vision_model.hparams;
|
|
2884
|
+
const int n_total = clip_n_output_tokens(ctx, img);
|
|
2885
|
+
if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type == PROJECTOR_TYPE_QWEN25VL) {
|
|
2886
|
+
return img->nx / (params.patch_size * 2) + (int)(img->nx % params.patch_size > 0);
|
|
2887
|
+
}
|
|
2888
|
+
return n_total;
|
|
2889
|
+
}
|
|
2890
|
+
|
|
2891
|
+
int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
|
|
2892
|
+
const auto & params = ctx->vision_model.hparams;
|
|
2893
|
+
if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type == PROJECTOR_TYPE_QWEN25VL) {
|
|
2894
|
+
return img->ny / (params.patch_size * 2) + (int)(img->ny % params.patch_size > 0);
|
|
2895
|
+
}
|
|
2896
|
+
return 1;
|
|
2897
|
+
}
|
|
2898
|
+
|
|
2899
|
+
int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
|
|
2649
2900
|
const auto & params = ctx->vision_model.hparams;
|
|
2650
2901
|
|
|
2651
2902
|
int n_patches = (params.image_size / params.patch_size) * (params.image_size / params.patch_size);
|
|
2652
2903
|
|
|
2653
2904
|
if (ctx->proj_type == PROJECTOR_TYPE_LDP || ctx->proj_type == PROJECTOR_TYPE_LDPV2 || ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE) {
|
|
2654
2905
|
n_patches /= 4;
|
|
2655
|
-
} else if (ctx->proj_type ==
|
|
2906
|
+
} else if (ctx->proj_type == PROJECTOR_TYPE_MINICPMV) {
|
|
2656
2907
|
if (ctx->minicpmv_version == 2) {
|
|
2657
2908
|
n_patches = 96;
|
|
2658
2909
|
}
|
|
@@ -2662,11 +2913,22 @@ int clip_n_patches_by_img(const struct clip_ctx * ctx, struct clip_image_f32 * i
|
|
|
2662
2913
|
else if (ctx->minicpmv_version == 4) {
|
|
2663
2914
|
n_patches = 64;
|
|
2664
2915
|
}
|
|
2665
|
-
|
|
2916
|
+
else {
|
|
2917
|
+
GGML_ABORT("Unknown minicpmv version");
|
|
2918
|
+
}
|
|
2919
|
+
} else if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type == PROJECTOR_TYPE_QWEN25VL) {
|
|
2666
2920
|
int patch_size = params.patch_size * 2;
|
|
2667
2921
|
int x_patch = img->nx / patch_size + (int)(img->nx % patch_size > 0);
|
|
2668
2922
|
int y_patch = img->ny / patch_size + (int)(img->ny % patch_size > 0);
|
|
2669
2923
|
n_patches = x_patch * y_patch;
|
|
2924
|
+
} else if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) {
|
|
2925
|
+
n_patches = 256;
|
|
2926
|
+
} else if (ctx->proj_type == PROJECTOR_TYPE_IDEFICS3) {
|
|
2927
|
+
n_patches /= ctx->vision_model.hparams.proj_scale_factor;
|
|
2928
|
+
} else if (ctx->proj_type == PROJECTOR_TYPE_PIXTRAL) {
|
|
2929
|
+
int n_patches_x = img->nx / params.patch_size;
|
|
2930
|
+
int n_patches_y = img->ny / params.patch_size;
|
|
2931
|
+
n_patches = n_patches_y*n_patches_x + n_patches_y - 1; // + one [IMG_BREAK] per row, except the last row
|
|
2670
2932
|
}
|
|
2671
2933
|
|
|
2672
2934
|
return n_patches;
|
|
@@ -2759,35 +3021,22 @@ static std::vector<std::vector<float>> get_2d_sincos_pos_embed(int embed_dim, co
|
|
|
2759
3021
|
}
|
|
2760
3022
|
|
|
2761
3023
|
bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec) {
|
|
2762
|
-
|
|
2763
|
-
|
|
2764
|
-
|
|
2765
|
-
|
|
3024
|
+
clip_image_f32_batch imgs;
|
|
3025
|
+
clip_image_f32_ptr img_copy(clip_image_f32_init());
|
|
3026
|
+
*img_copy = *img;
|
|
3027
|
+
imgs.entries.push_back(std::move(img_copy));
|
|
2766
3028
|
|
|
2767
|
-
clip_image_f32_batch imgs{};
|
|
2768
|
-
imgs.size = 1;
|
|
2769
|
-
imgs.data = img;
|
|
2770
3029
|
return clip_image_batch_encode(ctx, n_threads, &imgs, vec);
|
|
2771
3030
|
}
|
|
2772
3031
|
|
|
2773
|
-
bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch *
|
|
2774
|
-
|
|
2775
|
-
|
|
2776
|
-
return false;
|
|
2777
|
-
}
|
|
3032
|
+
bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs_c_ptr, float * vec) {
|
|
3033
|
+
const clip_image_f32_batch & imgs = *imgs_c_ptr;
|
|
3034
|
+
int batch_size = imgs.entries.size();
|
|
2778
3035
|
|
|
2779
|
-
|
|
2780
|
-
|
|
2781
|
-
|
|
2782
|
-
}
|
|
2783
|
-
if (ctx->has_minicpmv_projector) {
|
|
2784
|
-
GGML_ASSERT(batch_size == 1);
|
|
2785
|
-
}
|
|
2786
|
-
if (ctx->has_glm_projector) {
|
|
3036
|
+
if (ctx->has_llava_projector
|
|
3037
|
+
|| ctx->proj_type == PROJECTOR_TYPE_MINICPMV
|
|
3038
|
+
|| ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE) {
|
|
2787
3039
|
GGML_ASSERT(batch_size == 1);
|
|
2788
|
-
ggml_tensor * boi = ctx->vision_model.boi_w;
|
|
2789
|
-
ggml_backend_tensor_get(boi,vec,0,ggml_nbytes(boi));
|
|
2790
|
-
vec = (float*)(vec+ggml_nelements(boi)); //offset for boi
|
|
2791
3040
|
}
|
|
2792
3041
|
|
|
2793
3042
|
// build the inference graph
|
|
@@ -2796,169 +3045,283 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
|
|
|
2796
3045
|
ggml_backend_sched_alloc_graph(ctx->sched.get(), gf);
|
|
2797
3046
|
|
|
2798
3047
|
// set inputs
|
|
2799
|
-
const auto & model
|
|
3048
|
+
const auto & model = ctx->vision_model;
|
|
2800
3049
|
const auto & hparams = model.hparams;
|
|
2801
3050
|
|
|
2802
|
-
const int
|
|
2803
|
-
int
|
|
2804
|
-
|
|
2805
|
-
if (ctx->has_minicpmv_projector | ctx->has_qwen2vl_merger) {
|
|
2806
|
-
image_size_width = imgs->data[0].nx;
|
|
2807
|
-
image_size_height = imgs->data[0].ny;
|
|
2808
|
-
}
|
|
3051
|
+
const int image_size_width = imgs.entries[0]->nx;
|
|
3052
|
+
const int image_size_height = imgs.entries[0]->ny;
|
|
3053
|
+
|
|
2809
3054
|
const int patch_size = hparams.patch_size;
|
|
2810
3055
|
const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
|
|
2811
|
-
const int num_positions = num_patches + (
|
|
2812
|
-
|
|
2813
|
-
|
|
2814
|
-
}
|
|
2815
|
-
const int pos_w = ctx->load_image_size->width/patch_size;
|
|
2816
|
-
const int pos_h = ctx->load_image_size->height/patch_size;
|
|
3056
|
+
const int num_positions = num_patches + (model.class_embedding ? 1 : 0);
|
|
3057
|
+
const int pos_w = ctx->load_image_size.width / patch_size;
|
|
3058
|
+
const int pos_h = ctx->load_image_size.height / patch_size;
|
|
2817
3059
|
|
|
2818
|
-
|
|
2819
|
-
|
|
2820
|
-
|
|
2821
|
-
|
|
2822
|
-
|
|
2823
|
-
|
|
2824
|
-
|
|
2825
|
-
|
|
2826
|
-
|
|
2827
|
-
|
|
3060
|
+
const bool use_window_attn = hparams.n_wa_pattern > 0; // for qwen2.5vl
|
|
3061
|
+
|
|
3062
|
+
auto get_inp_tensor = [&gf](const char * name) {
|
|
3063
|
+
struct ggml_tensor * inp = ggml_graph_get_tensor(gf, name);
|
|
3064
|
+
if (inp == nullptr) {
|
|
3065
|
+
GGML_ABORT("Failed to get tensor %s", name);
|
|
3066
|
+
}
|
|
3067
|
+
if (!(inp->flags & GGML_TENSOR_FLAG_INPUT)) {
|
|
3068
|
+
GGML_ABORT("Tensor %s is not an input tensor", name);
|
|
3069
|
+
}
|
|
3070
|
+
return inp;
|
|
3071
|
+
};
|
|
2828
3072
|
|
|
3073
|
+
auto set_input_f32 = [&get_inp_tensor](const char * name, std::vector<float> & values) {
|
|
3074
|
+
ggml_tensor * cur = get_inp_tensor(name);
|
|
3075
|
+
GGML_ASSERT(cur->type == GGML_TYPE_F32);
|
|
3076
|
+
GGML_ASSERT(ggml_nelements(cur) == (int64_t)values.size());
|
|
3077
|
+
ggml_backend_tensor_set(cur, values.data(), 0, ggml_nbytes(cur));
|
|
3078
|
+
};
|
|
3079
|
+
|
|
3080
|
+
auto set_input_i32 = [&get_inp_tensor](const char * name, std::vector<int32_t> & values) {
|
|
3081
|
+
ggml_tensor * cur = get_inp_tensor(name);
|
|
3082
|
+
GGML_ASSERT(cur->type == GGML_TYPE_I32);
|
|
3083
|
+
GGML_ASSERT(ggml_nelements(cur) == (int64_t)values.size());
|
|
3084
|
+
ggml_backend_tensor_set(cur, values.data(), 0, ggml_nbytes(cur));
|
|
3085
|
+
};
|
|
3086
|
+
|
|
3087
|
+
// set input pixel values
|
|
3088
|
+
{
|
|
3089
|
+
size_t nelem = 0;
|
|
3090
|
+
for (const auto & img : imgs.entries) {
|
|
3091
|
+
nelem += img->nx * img->ny * 3;
|
|
3092
|
+
}
|
|
3093
|
+
std::vector<float> inp_raw(nelem);
|
|
3094
|
+
|
|
3095
|
+
// layout of data (note: the channel dim is unrolled to better visualize the layout):
|
|
3096
|
+
//
|
|
3097
|
+
// ┌──W──┐
|
|
3098
|
+
// │ H │ channel = R
|
|
3099
|
+
// ├─────┤ │
|
|
3100
|
+
// │ H │ channel = G
|
|
3101
|
+
// ├─────┤ │
|
|
3102
|
+
// │ H │ channel = B
|
|
3103
|
+
// └─────┘ │
|
|
3104
|
+
// ──────┘ x B
|
|
3105
|
+
|
|
3106
|
+
for (size_t i = 0; i < imgs.entries.size(); i++) {
|
|
3107
|
+
const int nx = imgs.entries[i]->nx;
|
|
3108
|
+
const int ny = imgs.entries[i]->ny;
|
|
2829
3109
|
const int n = nx * ny;
|
|
2830
3110
|
|
|
2831
3111
|
for (int b = 0; b < batch_size; b++) {
|
|
2832
|
-
|
|
2833
|
-
|
|
2834
|
-
|
|
2835
|
-
|
|
2836
|
-
|
|
3112
|
+
float * batch_entry = inp_raw.data() + b * (3*n);
|
|
3113
|
+
for (int y = 0; y < ny; y++) {
|
|
3114
|
+
for (int x = 0; x < nx; x++) {
|
|
3115
|
+
size_t base_src = 3*(y * nx + x); // idx of the first channel
|
|
3116
|
+
size_t base_dst = y * nx + x; // idx of the first channel
|
|
3117
|
+
batch_entry[ base_dst] = imgs.entries[b]->buf[base_src ];
|
|
3118
|
+
batch_entry[1*n + base_dst] = imgs.entries[b]->buf[base_src + 1];
|
|
3119
|
+
batch_entry[2*n + base_dst] = imgs.entries[b]->buf[base_src + 2];
|
|
2837
3120
|
}
|
|
2838
3121
|
}
|
|
2839
3122
|
}
|
|
2840
3123
|
}
|
|
2841
|
-
|
|
2842
|
-
free(data);
|
|
3124
|
+
set_input_f32("inp_raw", inp_raw);
|
|
2843
3125
|
}
|
|
2844
|
-
if (ctx->has_minicpmv_projector) {
|
|
2845
|
-
{
|
|
2846
|
-
// inspired from siglip:
|
|
2847
|
-
// -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit
|
|
2848
|
-
// -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit/blob/d66538faeba44480d0bfaa42145eef26f9423199/modeling_siglip.py#L316
|
|
2849
|
-
struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions");
|
|
2850
|
-
int* positions_data = (int*)malloc(ggml_nbytes(positions));
|
|
2851
|
-
int bucket_coords_h[1024];
|
|
2852
|
-
int bucket_coords_w[1024];
|
|
2853
|
-
for (int i = 0; i < pos_h; i++){
|
|
2854
|
-
bucket_coords_h[i] = std::floor(70.0*i/pos_h);
|
|
2855
|
-
}
|
|
2856
|
-
for (int i = 0; i < pos_w; i++){
|
|
2857
|
-
bucket_coords_w[i] = std::floor(70.0*i/pos_w);
|
|
2858
|
-
}
|
|
2859
|
-
for (int i = 0, id = 0; i < pos_h; i++){
|
|
2860
|
-
for (int j = 0; j < pos_w; j++){
|
|
2861
|
-
positions_data[id++] = bucket_coords_h[i]*70 + bucket_coords_w[j];
|
|
2862
|
-
}
|
|
2863
|
-
}
|
|
2864
|
-
ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions));
|
|
2865
|
-
free(positions_data);
|
|
2866
|
-
}
|
|
2867
|
-
|
|
2868
|
-
{
|
|
2869
|
-
// inspired from resampler of Qwen-VL:
|
|
2870
|
-
// -> https://huggingface.co/Qwen/Qwen-VL/tree/main
|
|
2871
|
-
// -> https://huggingface.co/Qwen/Qwen-VL/blob/0547ed36a86561e2e42fecec8fd0c4f6953e33c4/visual.py#L23
|
|
2872
|
-
struct ggml_tensor * pos_embed = ggml_graph_get_tensor(gf, "pos_embed");
|
|
2873
|
-
int embed_dim = 4096;
|
|
2874
|
-
if (ctx->minicpmv_version == 2) {
|
|
2875
|
-
embed_dim = 4096;
|
|
2876
|
-
}
|
|
2877
|
-
else if (ctx->minicpmv_version == 3) {
|
|
2878
|
-
embed_dim = 3584;
|
|
2879
|
-
}
|
|
2880
|
-
else if (ctx->minicpmv_version == 4) {
|
|
2881
|
-
embed_dim = 3584;
|
|
2882
|
-
}
|
|
2883
|
-
auto pos_embed_t = get_2d_sincos_pos_embed(embed_dim, std::make_pair(pos_w, pos_h));
|
|
2884
3126
|
|
|
2885
|
-
|
|
2886
|
-
|
|
2887
|
-
|
|
2888
|
-
|
|
3127
|
+
// set input per projector
|
|
3128
|
+
switch (ctx->proj_type) {
|
|
3129
|
+
case PROJECTOR_TYPE_MINICPMV:
|
|
3130
|
+
{
|
|
3131
|
+
// inspired from siglip:
|
|
3132
|
+
// -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit
|
|
3133
|
+
// -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit/blob/d66538faeba44480d0bfaa42145eef26f9423199/modeling_siglip.py#L316
|
|
3134
|
+
std::vector<int32_t> positions(pos_h * pos_w);
|
|
3135
|
+
int bucket_coords_h[1024];
|
|
3136
|
+
int bucket_coords_w[1024];
|
|
3137
|
+
for (int i = 0; i < pos_h; i++){
|
|
3138
|
+
bucket_coords_h[i] = std::floor(70.0*i/pos_h);
|
|
2889
3139
|
}
|
|
2890
|
-
|
|
3140
|
+
for (int i = 0; i < pos_w; i++){
|
|
3141
|
+
bucket_coords_w[i] = std::floor(70.0*i/pos_w);
|
|
3142
|
+
}
|
|
3143
|
+
for (int i = 0, id = 0; i < pos_h; i++){
|
|
3144
|
+
for (int j = 0; j < pos_w; j++){
|
|
3145
|
+
positions[id++] = bucket_coords_h[i]*70 + bucket_coords_w[j];
|
|
3146
|
+
}
|
|
3147
|
+
}
|
|
3148
|
+
set_input_i32("positions", positions);
|
|
2891
3149
|
|
|
2892
|
-
|
|
2893
|
-
|
|
2894
|
-
|
|
2895
|
-
|
|
2896
|
-
else{
|
|
2897
|
-
{
|
|
2898
|
-
if (ctx->has_class_embedding) {
|
|
2899
|
-
struct ggml_tensor * embeddings = ggml_graph_get_tensor(gf, "embeddings");
|
|
3150
|
+
// inspired from resampler of Qwen-VL:
|
|
3151
|
+
// -> https://huggingface.co/Qwen/Qwen-VL/tree/main
|
|
3152
|
+
// -> https://huggingface.co/Qwen/Qwen-VL/blob/0547ed36a86561e2e42fecec8fd0c4f6953e33c4/visual.py#L23
|
|
3153
|
+
int embed_dim = clip_n_mmproj_embd(ctx);
|
|
2900
3154
|
|
|
2901
|
-
|
|
2902
|
-
|
|
2903
|
-
ggml_backend_tensor_set(embeddings, zero_mem, 0, ggml_nbytes(embeddings));
|
|
2904
|
-
free(zero_mem);
|
|
2905
|
-
}
|
|
2906
|
-
}
|
|
3155
|
+
// TODO @ngxson : this is very inefficient, can we do this using ggml_sin and ggml_cos?
|
|
3156
|
+
auto pos_embed_t = get_2d_sincos_pos_embed(embed_dim, std::make_pair(pos_w, pos_h));
|
|
2907
3157
|
|
|
2908
|
-
|
|
2909
|
-
|
|
3158
|
+
std::vector<float> pos_embed(embed_dim * pos_w * pos_h);
|
|
3159
|
+
for(int i = 0; i < pos_w * pos_h; ++i){
|
|
3160
|
+
for(int j = 0; j < embed_dim; ++j){
|
|
3161
|
+
pos_embed[i * embed_dim + j] = pos_embed_t[i][j];
|
|
3162
|
+
}
|
|
3163
|
+
}
|
|
2910
3164
|
|
|
2911
|
-
|
|
2912
|
-
|
|
2913
|
-
|
|
3165
|
+
set_input_f32("pos_embed", pos_embed);
|
|
3166
|
+
} break;
|
|
3167
|
+
case PROJECTOR_TYPE_QWEN2VL:
|
|
3168
|
+
{
|
|
3169
|
+
const int merge_ratio = 2;
|
|
3170
|
+
const int pw = image_size_width / patch_size;
|
|
3171
|
+
const int ph = image_size_height / patch_size;
|
|
3172
|
+
std::vector<int> positions(num_positions * 4);
|
|
3173
|
+
int ptr = 0;
|
|
3174
|
+
for (int y = 0; y < ph; y += merge_ratio) {
|
|
3175
|
+
for (int x = 0; x < pw; x += merge_ratio) {
|
|
3176
|
+
for (int dy = 0; dy < 2; dy++) {
|
|
3177
|
+
for (int dx = 0; dx < 2; dx++) {
|
|
3178
|
+
positions[ ptr] = y + dy;
|
|
3179
|
+
positions[ num_patches + ptr] = x + dx;
|
|
3180
|
+
positions[2 * num_patches + ptr] = y + dy;
|
|
3181
|
+
positions[3 * num_patches + ptr] = x + dx;
|
|
3182
|
+
ptr++;
|
|
3183
|
+
}
|
|
3184
|
+
}
|
|
3185
|
+
}
|
|
3186
|
+
}
|
|
2914
3187
|
|
|
2915
|
-
|
|
2916
|
-
|
|
3188
|
+
set_input_i32("positions", positions);
|
|
3189
|
+
} break;
|
|
3190
|
+
case PROJECTOR_TYPE_QWEN25VL:
|
|
2917
3191
|
{
|
|
2918
|
-
|
|
2919
|
-
|
|
2920
|
-
|
|
2921
|
-
|
|
2922
|
-
|
|
2923
|
-
|
|
2924
|
-
|
|
2925
|
-
|
|
2926
|
-
|
|
3192
|
+
// pw * ph = number of tokens output by ViT after apply patch merger
|
|
3193
|
+
// ipw * ipw = number of vision token been processed inside ViT
|
|
3194
|
+
const int merge_ratio = 2;
|
|
3195
|
+
const int pw = image_size_width / patch_size / merge_ratio;
|
|
3196
|
+
const int ph = image_size_height / patch_size / merge_ratio;
|
|
3197
|
+
const int ipw = image_size_width / patch_size;
|
|
3198
|
+
const int iph = image_size_height / patch_size;
|
|
3199
|
+
|
|
3200
|
+
std::vector<int> idx (ph * pw);
|
|
3201
|
+
std::vector<int> inv_idx(ph * pw);
|
|
3202
|
+
|
|
3203
|
+
if (use_window_attn) {
|
|
3204
|
+
const int attn_window_size = 112;
|
|
3205
|
+
const int grid_window = attn_window_size / patch_size / merge_ratio;
|
|
3206
|
+
int dst = 0;
|
|
3207
|
+
// [num_vision_tokens, num_vision_tokens] attention mask tensor
|
|
3208
|
+
std::vector<float> mask(pow(ipw * iph, 2), std::numeric_limits<float>::lowest());
|
|
3209
|
+
int mask_row = 0;
|
|
3210
|
+
|
|
3211
|
+
for (int y = 0; y < ph; y += grid_window) {
|
|
3212
|
+
for (int x = 0; x < pw; x += grid_window) {
|
|
3213
|
+
const int win_h = std::min(grid_window, ph - y);
|
|
3214
|
+
const int win_w = std::min(grid_window, pw - x);
|
|
3215
|
+
const int dst_0 = dst;
|
|
3216
|
+
// group all tokens belong to the same window togather (to a continue range)
|
|
3217
|
+
for (int dy = 0; dy < win_h; dy++) {
|
|
3218
|
+
for (int dx = 0; dx < win_w; dx++) {
|
|
3219
|
+
const int src = (y + dy) * pw + (x + dx);
|
|
3220
|
+
GGML_ASSERT(src < (int)idx.size());
|
|
3221
|
+
GGML_ASSERT(dst < (int)inv_idx.size());
|
|
3222
|
+
idx [src] = dst;
|
|
3223
|
+
inv_idx[dst] = src;
|
|
3224
|
+
dst++;
|
|
3225
|
+
}
|
|
3226
|
+
}
|
|
3227
|
+
|
|
3228
|
+
for (int r=0; r < win_h * win_w * merge_ratio * merge_ratio; r++) {
|
|
3229
|
+
int row_offset = mask_row * (ipw * iph);
|
|
3230
|
+
std::fill(
|
|
3231
|
+
mask.begin() + row_offset + (dst_0 * merge_ratio * merge_ratio),
|
|
3232
|
+
mask.begin() + row_offset + (dst * merge_ratio * merge_ratio),
|
|
3233
|
+
0.0);
|
|
3234
|
+
mask_row++;
|
|
3235
|
+
}
|
|
2927
3236
|
}
|
|
2928
3237
|
}
|
|
3238
|
+
|
|
3239
|
+
set_input_i32("window_idx", idx);
|
|
3240
|
+
set_input_i32("inv_window_idx", inv_idx);
|
|
3241
|
+
set_input_f32("window_mask", mask);
|
|
3242
|
+
} else {
|
|
3243
|
+
for (int i = 0; i < ph * pw; i++) {
|
|
3244
|
+
idx[i] = i;
|
|
3245
|
+
}
|
|
2929
3246
|
}
|
|
2930
|
-
}
|
|
2931
3247
|
|
|
2932
|
-
|
|
2933
|
-
|
|
2934
|
-
|
|
2935
|
-
|
|
2936
|
-
|
|
2937
|
-
|
|
2938
|
-
|
|
2939
|
-
|
|
3248
|
+
const int mpow = merge_ratio * merge_ratio;
|
|
3249
|
+
std::vector<int> positions(num_positions * 4);
|
|
3250
|
+
|
|
3251
|
+
int ptr = 0;
|
|
3252
|
+
for (int y = 0; y < iph; y += merge_ratio) {
|
|
3253
|
+
for (int x = 0; x < ipw; x += merge_ratio) {
|
|
3254
|
+
for (int dy = 0; dy < 2; dy++) {
|
|
3255
|
+
for (int dx = 0; dx < 2; dx++) {
|
|
3256
|
+
auto remap = idx[ptr / mpow];
|
|
3257
|
+
remap = (remap * mpow) + (ptr % mpow);
|
|
3258
|
+
|
|
3259
|
+
positions[ remap] = y + dy;
|
|
3260
|
+
positions[ num_patches + remap] = x + dx;
|
|
3261
|
+
positions[2 * num_patches + remap] = y + dy;
|
|
3262
|
+
positions[3 * num_patches + remap] = x + dx;
|
|
3263
|
+
ptr++;
|
|
3264
|
+
}
|
|
3265
|
+
}
|
|
3266
|
+
}
|
|
3267
|
+
}
|
|
2940
3268
|
|
|
2941
|
-
|
|
3269
|
+
set_input_i32("positions", positions);
|
|
3270
|
+
} break;
|
|
3271
|
+
case PROJECTOR_TYPE_PIXTRAL:
|
|
3272
|
+
{
|
|
3273
|
+
// set the 2D positions
|
|
3274
|
+
int n_patches_per_col = image_size_width / patch_size;
|
|
3275
|
+
std::vector<int> pos_data(num_positions);
|
|
3276
|
+
// dimension H
|
|
3277
|
+
for (int i = 0; i < num_positions; i++) {
|
|
3278
|
+
pos_data[i] = i / n_patches_per_col;
|
|
3279
|
+
}
|
|
3280
|
+
set_input_i32("pos_h", pos_data);
|
|
3281
|
+
// dimension W
|
|
3282
|
+
for (int i = 0; i < num_positions; i++) {
|
|
3283
|
+
pos_data[i] = i % n_patches_per_col;
|
|
3284
|
+
}
|
|
3285
|
+
set_input_i32("pos_w", pos_data);
|
|
3286
|
+
} break;
|
|
3287
|
+
case PROJECTOR_TYPE_GLM_EDGE:
|
|
3288
|
+
{
|
|
3289
|
+
// llava and other models
|
|
3290
|
+
std::vector<int32_t> positions(num_positions);
|
|
2942
3291
|
for (int i = 0; i < num_positions; i++) {
|
|
2943
|
-
|
|
3292
|
+
positions[i] = i;
|
|
2944
3293
|
}
|
|
2945
|
-
|
|
2946
|
-
|
|
3294
|
+
set_input_i32("positions", positions);
|
|
3295
|
+
} break;
|
|
3296
|
+
case PROJECTOR_TYPE_MLP:
|
|
3297
|
+
case PROJECTOR_TYPE_MLP_NORM:
|
|
3298
|
+
case PROJECTOR_TYPE_LDP:
|
|
3299
|
+
case PROJECTOR_TYPE_LDPV2:
|
|
3300
|
+
{
|
|
3301
|
+
// llava and other models
|
|
3302
|
+
std::vector<int32_t> positions(num_positions);
|
|
3303
|
+
for (int i = 0; i < num_positions; i++) {
|
|
3304
|
+
positions[i] = i;
|
|
3305
|
+
}
|
|
3306
|
+
set_input_i32("positions", positions);
|
|
2947
3307
|
|
|
2948
|
-
if (!ctx->has_glm_projector) {
|
|
2949
|
-
struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches");
|
|
2950
3308
|
// The patches vector is used to get rows to index into the embeds with;
|
|
2951
3309
|
// we should skip dim 0 only if we have CLS to avoid going out of bounds
|
|
2952
3310
|
// when retrieving the rows.
|
|
2953
|
-
int patch_offset =
|
|
2954
|
-
|
|
3311
|
+
int patch_offset = model.class_embedding ? 1 : 0;
|
|
3312
|
+
std::vector<int32_t> patches(num_patches);
|
|
2955
3313
|
for (int i = 0; i < num_patches; i++) {
|
|
2956
|
-
|
|
3314
|
+
patches[i] = i + patch_offset;
|
|
2957
3315
|
}
|
|
2958
|
-
|
|
2959
|
-
|
|
2960
|
-
|
|
2961
|
-
|
|
3316
|
+
set_input_i32("patches", patches);
|
|
3317
|
+
} break;
|
|
3318
|
+
case PROJECTOR_TYPE_GEMMA3:
|
|
3319
|
+
case PROJECTOR_TYPE_IDEFICS3:
|
|
3320
|
+
{
|
|
3321
|
+
// do nothing
|
|
3322
|
+
} break;
|
|
3323
|
+
default:
|
|
3324
|
+
GGML_ABORT("Unknown projector type");
|
|
2962
3325
|
}
|
|
2963
3326
|
|
|
2964
3327
|
ggml_backend_cpu_set_n_threads(ctx->backend_cpu, n_threads);
|
|
@@ -2975,13 +3338,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
|
|
|
2975
3338
|
// copy the embeddings to the location passed by the user
|
|
2976
3339
|
ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings));
|
|
2977
3340
|
|
|
2978
|
-
if (ctx->has_glm_projector) {
|
|
2979
|
-
//eoi
|
|
2980
|
-
ggml_tensor * eoi = ctx->vision_model.eoi_w;
|
|
2981
|
-
int offset = ggml_nelements(embeddings);
|
|
2982
|
-
ggml_backend_tensor_get(eoi, vec+offset, 0, ggml_nbytes(eoi));
|
|
2983
|
-
}
|
|
2984
|
-
|
|
2985
3341
|
return true;
|
|
2986
3342
|
}
|
|
2987
3343
|
|
|
@@ -2989,10 +3345,13 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
|
|
|
2989
3345
|
assert(itype < GGML_TYPE_COUNT);
|
|
2990
3346
|
ggml_type type = static_cast<ggml_type>(itype);
|
|
2991
3347
|
|
|
2992
|
-
auto * ctx_clip =
|
|
3348
|
+
auto * ctx_clip = clip_init(fname_inp, clip_context_params{
|
|
3349
|
+
/* use_gpu */ false,
|
|
3350
|
+
/* verbosity */ GGML_LOG_LEVEL_ERROR,
|
|
3351
|
+
});
|
|
2993
3352
|
|
|
2994
|
-
const auto & ctx_src = ctx_clip->ctx_gguf;
|
|
2995
|
-
const auto & ctx_data = ctx_clip->ctx_data;
|
|
3353
|
+
const auto & ctx_src = ctx_clip->ctx_gguf.get();
|
|
3354
|
+
const auto & ctx_data = ctx_clip->ctx_data.get();
|
|
2996
3355
|
|
|
2997
3356
|
auto * ctx_out = gguf_init_empty();
|
|
2998
3357
|
gguf_set_kv(ctx_out, ctx_src);
|
|
@@ -3066,7 +3425,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
|
|
|
3066
3425
|
f32_data = (float *)conv_buf.data();
|
|
3067
3426
|
break;
|
|
3068
3427
|
default:
|
|
3069
|
-
LOG_ERR("Please use an input file in f32 or f16\n");
|
|
3428
|
+
LOG_ERR("%s: Please use an input file in f32 or f16\n", __func__);
|
|
3070
3429
|
gguf_free(ctx_out);
|
|
3071
3430
|
return false;
|
|
3072
3431
|
}
|
|
@@ -3118,78 +3477,60 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
|
|
|
3118
3477
|
}
|
|
3119
3478
|
|
|
3120
3479
|
int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
|
|
3121
|
-
|
|
3122
|
-
|
|
3123
|
-
|
|
3124
|
-
|
|
3125
|
-
|
|
3126
|
-
|
|
3127
|
-
|
|
3128
|
-
|
|
3129
|
-
|
|
3130
|
-
|
|
3131
|
-
|
|
3132
|
-
|
|
3133
|
-
|
|
3134
|
-
|
|
3135
|
-
|
|
3136
|
-
|
|
3137
|
-
|
|
3138
|
-
|
|
3139
|
-
|
|
3140
|
-
|
|
3141
|
-
return
|
|
3142
|
-
|
|
3143
|
-
|
|
3144
|
-
|
|
3145
|
-
|
|
3146
|
-
|
|
3147
|
-
|
|
3148
|
-
|
|
3149
|
-
|
|
3150
|
-
|
|
3151
|
-
return ctx->vision_model.mm_input_proj_w->ne[0];
|
|
3480
|
+
switch (ctx->proj_type) {
|
|
3481
|
+
case PROJECTOR_TYPE_LDP:
|
|
3482
|
+
return ctx->vision_model.mm_model_block_1_block_2_1_b->ne[0];
|
|
3483
|
+
case PROJECTOR_TYPE_LDPV2:
|
|
3484
|
+
return ctx->vision_model.mm_model_peg_0_b->ne[0];
|
|
3485
|
+
case PROJECTOR_TYPE_MLP:
|
|
3486
|
+
case PROJECTOR_TYPE_PIXTRAL:
|
|
3487
|
+
return ctx->vision_model.mm_2_b->ne[0];
|
|
3488
|
+
case PROJECTOR_TYPE_MLP_NORM:
|
|
3489
|
+
return ctx->vision_model.mm_3_b->ne[0];
|
|
3490
|
+
case PROJECTOR_TYPE_MINICPMV:
|
|
3491
|
+
if (ctx->minicpmv_version == 2) {
|
|
3492
|
+
return 4096;
|
|
3493
|
+
} else if (ctx->minicpmv_version == 3) {
|
|
3494
|
+
return 3584;
|
|
3495
|
+
} else if (ctx->minicpmv_version == 4) {
|
|
3496
|
+
return 3584;
|
|
3497
|
+
}
|
|
3498
|
+
GGML_ABORT("Unknown minicpmv version");
|
|
3499
|
+
case PROJECTOR_TYPE_GLM_EDGE:
|
|
3500
|
+
return ctx->vision_model.mm_model_mlp_3_w->ne[1];
|
|
3501
|
+
case PROJECTOR_TYPE_QWEN2VL:
|
|
3502
|
+
case PROJECTOR_TYPE_QWEN25VL:
|
|
3503
|
+
return ctx->vision_model.mm_1_b->ne[0];
|
|
3504
|
+
case PROJECTOR_TYPE_GEMMA3:
|
|
3505
|
+
return ctx->vision_model.mm_input_proj_w->ne[0];
|
|
3506
|
+
case PROJECTOR_TYPE_IDEFICS3:
|
|
3507
|
+
return ctx->vision_model.projection->ne[1];
|
|
3508
|
+
default:
|
|
3509
|
+
GGML_ABORT("Unknown projector type");
|
|
3152
3510
|
}
|
|
3153
|
-
|
|
3154
|
-
std::string proj_type = PROJECTOR_TYPE_NAMES[ctx->proj_type];
|
|
3155
|
-
throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str()));
|
|
3156
3511
|
}
|
|
3157
3512
|
|
|
3158
3513
|
int clip_is_minicpmv(const struct clip_ctx * ctx) {
|
|
3159
|
-
if (ctx->
|
|
3514
|
+
if (ctx->proj_type == PROJECTOR_TYPE_MINICPMV) {
|
|
3160
3515
|
return ctx->minicpmv_version;
|
|
3161
3516
|
}
|
|
3162
3517
|
return 0;
|
|
3163
3518
|
}
|
|
3164
3519
|
|
|
3165
3520
|
bool clip_is_glm(const struct clip_ctx * ctx) {
|
|
3166
|
-
return ctx->
|
|
3521
|
+
return ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE;
|
|
3167
3522
|
}
|
|
3523
|
+
|
|
3168
3524
|
bool clip_is_qwen2vl(const struct clip_ctx * ctx) {
|
|
3169
|
-
return ctx->
|
|
3525
|
+
return ctx->proj_type == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type == PROJECTOR_TYPE_QWEN25VL;
|
|
3170
3526
|
}
|
|
3171
3527
|
|
|
3172
|
-
|
|
3173
|
-
|
|
3174
|
-
|
|
3175
|
-
// default for models that have a llava projector
|
|
3176
|
-
const auto & hparams = ctx->vision_model.hparams;
|
|
3177
|
-
int n_layer = hparams.n_layer - 1;
|
|
3178
|
-
int deepest_feature_layer = -1;
|
|
3179
|
-
|
|
3180
|
-
// Handle other projectors; incrementing here indicates that we
|
|
3181
|
-
// should use the last encoder layer for the vision features.
|
|
3182
|
-
if (ctx->has_minicpmv_projector || ctx->has_glm_projector || ctx->has_qwen2vl_merger) {
|
|
3183
|
-
n_layer += 1;
|
|
3184
|
-
}
|
|
3528
|
+
bool clip_is_llava(const struct clip_ctx * ctx) {
|
|
3529
|
+
return ctx->has_llava_projector;
|
|
3530
|
+
}
|
|
3185
3531
|
|
|
3186
|
-
|
|
3187
|
-
|
|
3188
|
-
if (feature_layer > deepest_feature_layer) {
|
|
3189
|
-
deepest_feature_layer = feature_layer;
|
|
3190
|
-
}
|
|
3191
|
-
}
|
|
3192
|
-
return deepest_feature_layer < 0 ? n_layer : deepest_feature_layer;
|
|
3532
|
+
bool clip_is_gemma3(const struct clip_ctx * ctx) {
|
|
3533
|
+
return ctx->proj_type == PROJECTOR_TYPE_GEMMA3;
|
|
3193
3534
|
}
|
|
3194
3535
|
|
|
3195
3536
|
bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec) {
|
|
@@ -3204,3 +3545,11 @@ bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img,
|
|
|
3204
3545
|
clip_image_encode(ctx, n_threads, &clip_img, vec);
|
|
3205
3546
|
return true;
|
|
3206
3547
|
}
|
|
3548
|
+
|
|
3549
|
+
//
|
|
3550
|
+
// API used internally with mtmd
|
|
3551
|
+
//
|
|
3552
|
+
|
|
3553
|
+
projector_type clip_get_projector_type(const struct clip_ctx * ctx) {
|
|
3554
|
+
return ctx->proj_type;
|
|
3555
|
+
}
|