@fugood/llama.node 0.3.0 → 0.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +1 -10
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/package.json +6 -4
- package/src/LlamaCompletionWorker.cpp +6 -6
- package/src/LlamaContext.cpp +7 -9
- package/src/common.hpp +2 -1
- package/src/llama.cpp/.github/workflows/build.yml +98 -24
- package/src/llama.cpp/.github/workflows/close-issue.yml +5 -0
- package/src/llama.cpp/.github/workflows/docker.yml +43 -34
- package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +7 -0
- package/src/llama.cpp/.github/workflows/nix-ci.yml +7 -0
- package/src/llama.cpp/.github/workflows/python-check-requirements.yml +2 -4
- package/src/llama.cpp/.github/workflows/python-type-check.yml +3 -1
- package/src/llama.cpp/.github/workflows/server.yml +7 -0
- package/src/llama.cpp/CMakeLists.txt +20 -8
- package/src/llama.cpp/common/CMakeLists.txt +12 -10
- package/src/llama.cpp/common/arg.cpp +2006 -0
- package/src/llama.cpp/common/arg.h +77 -0
- package/src/llama.cpp/common/common.cpp +496 -1632
- package/src/llama.cpp/common/common.h +161 -63
- package/src/llama.cpp/common/console.cpp +3 -0
- package/src/llama.cpp/common/log.cpp +401 -0
- package/src/llama.cpp/common/log.h +66 -698
- package/src/llama.cpp/common/ngram-cache.cpp +3 -0
- package/src/llama.cpp/common/sampling.cpp +348 -350
- package/src/llama.cpp/common/sampling.h +62 -139
- package/src/llama.cpp/common/stb_image.h +5990 -6398
- package/src/llama.cpp/common/train.cpp +2 -0
- package/src/llama.cpp/docs/build.md +36 -1
- package/src/llama.cpp/examples/CMakeLists.txt +0 -1
- package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +1 -2
- package/src/llama.cpp/examples/batched/batched.cpp +39 -55
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +34 -44
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +55 -52
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +15 -15
- package/src/llama.cpp/examples/cvector-generator/pca.hpp +3 -13
- package/src/llama.cpp/examples/embedding/embedding.cpp +143 -87
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +33 -33
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +36 -35
- package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +14 -39
- package/src/llama.cpp/examples/gen-docs/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +83 -0
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +58 -39
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +34 -27
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +59 -62
- package/src/llama.cpp/examples/infill/infill.cpp +117 -132
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +265 -58
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +29 -22
- package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
- package/src/llama.cpp/examples/llava/clip.cpp +685 -150
- package/src/llama.cpp/examples/llava/clip.h +11 -2
- package/src/llama.cpp/examples/llava/llava-cli.cpp +47 -58
- package/src/llama.cpp/examples/llava/llava.cpp +110 -24
- package/src/llama.cpp/examples/llava/llava.h +2 -3
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +323 -0
- package/src/llama.cpp/examples/llava/requirements.txt +1 -0
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +42 -43
- package/src/llama.cpp/examples/lookup/lookup-create.cpp +10 -8
- package/src/llama.cpp/examples/lookup/lookup-stats.cpp +23 -22
- package/src/llama.cpp/examples/lookup/lookup.cpp +40 -43
- package/src/llama.cpp/examples/main/main.cpp +210 -262
- package/src/llama.cpp/examples/parallel/parallel.cpp +49 -49
- package/src/llama.cpp/examples/passkey/passkey.cpp +42 -50
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +187 -200
- package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize/quantize.cpp +27 -9
- package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +2 -3
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +49 -44
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +24 -1
- package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +32 -35
- package/src/llama.cpp/examples/server/CMakeLists.txt +3 -5
- package/src/llama.cpp/examples/server/server.cpp +1027 -1073
- package/src/llama.cpp/examples/server/tests/requirements.txt +2 -1
- package/src/llama.cpp/examples/server/utils.hpp +107 -105
- package/src/llama.cpp/examples/simple/simple.cpp +35 -41
- package/src/llama.cpp/examples/speculative/speculative.cpp +129 -103
- package/src/llama.cpp/examples/sycl/run-llama2.sh +10 -19
- package/src/llama.cpp/examples/sycl/win-run-llama2.bat +1 -1
- package/src/llama.cpp/examples/tokenize/tokenize.cpp +25 -27
- package/src/llama.cpp/ggml/CMakeLists.txt +14 -3
- package/src/llama.cpp/ggml/include/ggml-alloc.h +3 -3
- package/src/llama.cpp/ggml/include/ggml-backend.h +145 -60
- package/src/llama.cpp/ggml/include/ggml-blas.h +3 -3
- package/src/llama.cpp/ggml/include/ggml-cann.h +15 -19
- package/src/llama.cpp/ggml/include/ggml-cuda.h +16 -16
- package/src/llama.cpp/ggml/include/ggml-metal.h +5 -8
- package/src/llama.cpp/ggml/include/ggml-rpc.h +5 -5
- package/src/llama.cpp/ggml/include/ggml-sycl.h +8 -8
- package/src/llama.cpp/ggml/include/ggml-vulkan.h +7 -7
- package/src/llama.cpp/ggml/include/ggml.h +293 -186
- package/src/llama.cpp/ggml/src/CMakeLists.txt +86 -44
- package/src/llama.cpp/ggml/src/ggml-aarch64.c +2135 -1119
- package/src/llama.cpp/ggml/src/ggml-alloc.c +6 -0
- package/src/llama.cpp/ggml/src/ggml-backend-impl.h +152 -70
- package/src/llama.cpp/ggml/src/{ggml-backend.c → ggml-backend.cpp} +606 -286
- package/src/llama.cpp/ggml/src/ggml-blas.cpp +9 -10
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +4 -27
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +32 -4
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +179 -41
- package/src/llama.cpp/ggml/src/ggml-cann/common.h +1 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -1
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +2 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +278 -0
- package/src/llama.cpp/ggml/src/ggml-cann.cpp +215 -216
- package/src/llama.cpp/ggml/src/ggml-common.h +20 -0
- package/src/llama.cpp/ggml/src/ggml-cpu-impl.h +614 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +14 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +178 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +134 -0
- package/src/llama.cpp/ggml/src/ggml-impl.h +49 -603
- package/src/llama.cpp/ggml/src/ggml-kompute.cpp +4 -24
- package/src/llama.cpp/ggml/src/ggml-quants.c +972 -92
- package/src/llama.cpp/ggml/src/ggml-quants.h +15 -0
- package/src/llama.cpp/ggml/src/ggml-rpc.cpp +116 -66
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +3 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +52 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +99 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +21 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +57 -57
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +106 -106
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +4 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +16 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +101 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +125 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +23 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +6 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +2 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +71 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +21 -0
- package/src/llama.cpp/ggml/src/ggml-sycl.cpp +97 -169
- package/src/llama.cpp/ggml/src/ggml-vulkan.cpp +1508 -1124
- package/src/llama.cpp/ggml/src/ggml.c +3001 -1647
- package/src/llama.cpp/ggml/src/llamafile/sgemm.cpp +192 -0
- package/src/llama.cpp/ggml/src/vulkan-shaders/CMakeLists.txt +2 -0
- package/src/llama.cpp/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +88 -40
- package/src/llama.cpp/include/llama.h +241 -264
- package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.out +46 -0
- package/src/llama.cpp/requirements/requirements-convert_legacy_llama.txt +1 -1
- package/src/llama.cpp/src/llama-grammar.cpp +721 -122
- package/src/llama.cpp/src/llama-grammar.h +120 -15
- package/src/llama.cpp/src/llama-impl.h +156 -1
- package/src/llama.cpp/src/llama-sampling.cpp +1375 -303
- package/src/llama.cpp/src/llama-sampling.h +20 -47
- package/src/llama.cpp/src/llama-vocab.cpp +343 -120
- package/src/llama.cpp/src/llama-vocab.h +33 -17
- package/src/llama.cpp/src/llama.cpp +4247 -1525
- package/src/llama.cpp/src/unicode-data.cpp +6 -4
- package/src/llama.cpp/src/unicode-data.h +4 -4
- package/src/llama.cpp/src/unicode.cpp +15 -7
- package/src/llama.cpp/tests/CMakeLists.txt +3 -0
- package/src/llama.cpp/tests/test-arg-parser.cpp +131 -0
- package/src/llama.cpp/tests/test-backend-ops.cpp +1592 -289
- package/src/llama.cpp/tests/test-barrier.cpp +93 -0
- package/src/llama.cpp/tests/test-grad0.cpp +187 -70
- package/src/llama.cpp/tests/test-grammar-integration.cpp +23 -38
- package/src/llama.cpp/tests/test-grammar-parser.cpp +6 -4
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +6 -4
- package/src/llama.cpp/tests/test-llama-grammar.cpp +9 -8
- package/src/llama.cpp/tests/test-log.cpp +39 -0
- package/src/llama.cpp/tests/test-quantize-fns.cpp +6 -0
- package/src/llama.cpp/tests/test-rope.cpp +1 -1
- package/src/llama.cpp/tests/test-sampling.cpp +157 -98
- package/src/llama.cpp/tests/test-tokenizer-0.cpp +55 -35
- package/patches/llama.patch +0 -22
- package/src/llama.cpp/.github/workflows/bench.yml +0 -310
- package/src/llama.cpp/common/grammar-parser.cpp +0 -536
- package/src/llama.cpp/common/grammar-parser.h +0 -29
- package/src/llama.cpp/examples/benchmark/CMakeLists.txt +0 -6
- package/src/llama.cpp/examples/benchmark/benchmark-matmult.cpp +0 -275
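
The detailed hunks below are the `package/src/llama.cpp/examples/llava/clip.cpp` portion of this diff (+685 −150), reconstructed as a unified diff; removed lines that the diff viewer elided appear as bare `-` markers. The headline changes: the shared `log.h` dependency is replaced by local `LOG_*` macros, a Vulkan backend path is added, and MiniCPM-V support lands (a `resampler` projector plus LLaVA-UHD-style image slicing).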
```diff
@@ -3,7 +3,6 @@
 // I'll gradually clean and extend it
 // Note: Even when using identical normalized image inputs (see normalize_image_u8_to_f32()) we have a significant difference in resulting embeddings compared to pytorch
 #include "clip.h"
-#include "log.h"
 #include "ggml.h"
 #include "ggml-alloc.h"
 #include "ggml-backend.h"
@@ -20,6 +19,10 @@
 #include "ggml-cann.h"
 #endif

+#ifdef GGML_USE_VULKAN
+#include "ggml-vulkan.h"
+#endif
+
 #define STB_IMAGE_IMPLEMENTATION
 #include "stb_image.h"

@@ -36,6 +39,11 @@
 #include <cinttypes>
 #include <limits>

+#define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
+#define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
+#define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
+#define LOG_DBG(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
+
 //#define CLIP_DEBUG_FUNCTIONS

 // RGB uint8 image
@@ -74,26 +82,28 @@ static std::string format(const char * fmt, ...) {
 // key constants
 //

-#define KEY_FTYPE
-#define KEY_NAME
-#define KEY_DESCRIPTION
-#define KEY_HAS_TEXT_ENC
-#define KEY_HAS_VIS_ENC
-#define KEY_HAS_LLAVA_PROJ
-#define
-#define
-#define
-#define
-#define
-#define
-#define
-#define
-#define
-#define
-#define
-#define
-#define
-#define
+#define KEY_FTYPE "general.file_type"
+#define KEY_NAME "general.name"
+#define KEY_DESCRIPTION "general.description"
+#define KEY_HAS_TEXT_ENC "clip.has_text_encoder"
+#define KEY_HAS_VIS_ENC "clip.has_vision_encoder"
+#define KEY_HAS_LLAVA_PROJ "clip.has_llava_projector"
+#define KEY_HAS_MINICPMV_PROJ "clip.has_minicpmv_projector"
+#define KEY_MINICPMV_VERSION "clip.minicpmv_version"
+#define KEY_USE_GELU "clip.use_gelu"
+#define KEY_N_EMBD "clip.%s.embedding_length"
+#define KEY_N_FF "clip.%s.feed_forward_length"
+#define KEY_N_BLOCK "clip.%s.block_count"
+#define KEY_N_HEAD "clip.%s.attention.head_count"
+#define KEY_LAYER_NORM_EPS "clip.%s.attention.layer_norm_epsilon"
+#define KEY_PROJ_DIM "clip.%s.projection_dim"
+#define KEY_TOKENS "tokenizer.ggml.tokens"
+#define KEY_N_POSITIONS "clip.text.context_length"
+#define KEY_IMAGE_SIZE "clip.vision.image_size"
+#define KEY_PATCH_SIZE "clip.vision.patch_size"
+#define KEY_IMAGE_MEAN "clip.vision.image_mean"
+#define KEY_IMAGE_STD "clip.vision.image_std"
+#define KEY_PROJ_TYPE "clip.projector_type"

 #define KEY_MM_PATCH_MERGE_TYPE "clip.vision.mm_patch_merge_type"
 #define KEY_IMAGE_GRID_PINPOINTS "clip.vision.image_grid_pinpoints"
@@ -127,12 +137,20 @@ static std::string format(const char * fmt, ...) {
 #define TN_MVLM_PROJ_PEG "mm.model.peg.%d.%s"
 #define TN_IMAGE_NEWLINE "model.image_newline"

+#define TN_MINICPMV_POS_EMBD_K "resampler.pos_embed_k"
+#define TN_MINICPMV_QUERY "resampler.query"
+#define TN_MINICPMV_PROJ "resampler.proj.weight"
+#define TN_MINICPMV_KV_PROJ "resampler.kv.weight"
+#define TN_MINICPMV_ATTN "resampler.attn.%s.%s"
+#define TN_MINICPMV_LN "resampler.ln_%s.%s"
+

 enum projector_type {
     PROJECTOR_TYPE_MLP,
     PROJECTOR_TYPE_MLP_NORM,
     PROJECTOR_TYPE_LDP,
     PROJECTOR_TYPE_LDPV2,
+    PROJECTOR_TYPE_RESAMPLER,
     PROJECTOR_TYPE_UNKNOWN,
 };

@@ -140,6 +158,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
     { PROJECTOR_TYPE_MLP, "mlp" },
     { PROJECTOR_TYPE_LDP, "ldp" },
     { PROJECTOR_TYPE_LDPV2, "ldpv2"},
+    { PROJECTOR_TYPE_RESAMPLER, "resampler"},
 };


@@ -150,7 +169,7 @@
 static int get_key_idx(const gguf_context * ctx, const char * key) {
     int i = gguf_find_key(ctx, key);
     if (i == -1) {
-
+        LOG_ERR("key %s not found in file\n", key);
         throw std::runtime_error(format("Missing required key: %s", key));
     }

@@ -200,17 +219,20 @@ static std::string gguf_data_to_str(enum gguf_type type, const void * data, int
 }

 static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
-
-
-        auto new_pos = s.find(search, pos);
-        if (new_pos == std::string::npos) {
-            result += s.substr(pos, s.size() - pos);
-            break;
-        }
-        result += s.substr(pos, new_pos - pos) + replace;
-        pos = new_pos;
+    if (search.empty()) {
+        return;
     }
-
+    std::string builder;
+    builder.reserve(s.length());
+    size_t pos = 0;
+    size_t last_pos = 0;
+    while ((pos = s.find(search, last_pos)) != std::string::npos) {
+        builder.append(s, last_pos, pos - last_pos);
+        builder.append(replace);
+        last_pos = pos + search.length();
+    }
+    builder.append(s, last_pos, std::string::npos);
+    s = std::move(builder);
 }

 static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) {
```
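
The rewritten `replace_all` above avoids repeated `substr` concatenation by appending into a reserved scratch buffer in a single forward scan, and it now no-ops on an empty `search` (with an empty pattern, `find` returns `pos` unchanged, so the old loop could never advance). A standalone copy of the new logic for illustration only — the function itself is file-local to clip.cpp:

```cpp
#include <cassert>
#include <string>

// Copy of the replace_all logic from the hunk above, for illustration.
static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
    if (search.empty()) {
        return;
    }
    std::string builder;
    builder.reserve(s.length());
    size_t pos = 0;
    size_t last_pos = 0;
    while ((pos = s.find(search, last_pos)) != std::string::npos) {
        builder.append(s, last_pos, pos - last_pos); // copy the unmatched run
        builder.append(replace);                     // then the replacement
        last_pos = pos + search.length();
    }
    builder.append(s, last_pos, std::string::npos);  // copy the tail
    s = std::move(builder);
}

int main() {
    std::string s = "a\nb\nc";
    replace_all(s, "\n", "\\n"); // how clip_model_load escapes metadata values
    assert(s == "a\\nb\\nc");
    return 0;
}
```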
```diff
@@ -252,7 +274,7 @@ static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) {

 static void print_tensor_info(const ggml_tensor * tensor, const char * prefix = "") {
     size_t tensor_size = ggml_nbytes(tensor);
-
+    LOG_INF("%s: n_dims = %d, name = %s, tensor_size=%zu, shape:[%" PRId64 ", %" PRId64 ", %" PRId64 ", %" PRId64 "], type = %s\n",
            prefix, ggml_n_dims(tensor), tensor->name, tensor_size,
            tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], ggml_type_name(tensor->type));
 }
@@ -270,7 +292,7 @@ static projector_type clip_projector_type_from_string(const std::string & name)
 static void clip_image_write_image_to_ppm(const clip_image_u8& img, const std::string& filename) {
     std::ofstream file(filename, std::ios::binary);
     if (!file.is_open()) {
-
+        LOG_ERR("Failed to open file for writing: %s\n", filename.c_str());
         return;
     }

@@ -289,7 +311,7 @@ static void clip_image_write_image_to_ppm(const clip_image_u8& img, const std::s
 static void clip_image_save_to_bmp(const clip_image_u8& img, const std::string& filename) {
     std::ofstream file(filename, std::ios::binary);
     if (!file.is_open()) {
-
+        LOG_ERR("Failed to open file for writing: %s\n", filename.c_str());
         return;
     }

@@ -492,12 +514,34 @@ struct clip_vision_model {
     struct ggml_tensor * mm_model_mlp_2_b;
     struct ggml_tensor * mm_model_peg_0_w;
     struct ggml_tensor * mm_model_peg_0_b;
+
+    // MINICPMV projection
+    struct ggml_tensor * mm_model_pos_embed_k;
+    struct ggml_tensor * mm_model_query;
+    struct ggml_tensor * mm_model_proj;
+    struct ggml_tensor * mm_model_kv_proj;
+    struct ggml_tensor * mm_model_attn_q_w;
+    struct ggml_tensor * mm_model_attn_q_b;
+    struct ggml_tensor * mm_model_attn_k_w;
+    struct ggml_tensor * mm_model_attn_k_b;
+    struct ggml_tensor * mm_model_attn_v_w;
+    struct ggml_tensor * mm_model_attn_v_b;
+    struct ggml_tensor * mm_model_attn_o_w;
+    struct ggml_tensor * mm_model_attn_o_b;
+    struct ggml_tensor * mm_model_ln_q_w;
+    struct ggml_tensor * mm_model_ln_q_b;
+    struct ggml_tensor * mm_model_ln_kv_w;
+    struct ggml_tensor * mm_model_ln_kv_b;
+    struct ggml_tensor * mm_model_ln_post_w;
+    struct ggml_tensor * mm_model_ln_post_b;
 };

 struct clip_ctx {
     bool has_text_encoder = false;
     bool has_vision_encoder = false;
     bool has_llava_projector = false;
+    bool has_minicpmv_projector = false;
+    int minicpmv_version = 2;

     struct clip_vision_model vision_model;
     projector_type proj_type = PROJECTOR_TYPE_MLP;
@@ -522,31 +566,46 @@ struct clip_ctx {

     ggml_backend_t backend = NULL;
     ggml_gallocr_t compute_alloc = NULL;
+
+    struct clip_image_size * load_image_size;
 };

-static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs) {
+static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs, struct clip_image_size * load_image_size, bool is_inf = false) {
     if (!ctx->has_vision_encoder) {
-
+        LOG_ERR("This gguf file seems to have no vision encoder\n");
         return nullptr;
     }

     const auto & model = ctx->vision_model;
     const auto & hparams = model.hparams;

-    const int image_size
+    const int image_size = hparams.image_size;
+    int image_size_width  = image_size;
+    int image_size_height = image_size;
+    if (ctx->has_minicpmv_projector) {
+        if (load_image_size == nullptr) {
+            load_image_size = clip_image_size_init();
+        }
+        LOG_DBG("%s: %d %d\n", __func__, load_image_size->width, load_image_size->height);
+        image_size_width  = load_image_size->width;
+        image_size_height = load_image_size->height;
+        if (is_inf) {
+            image_size_width  = imgs->data->nx;
+            image_size_height = imgs->data->ny;
+        }
+    }
     const int patch_size = hparams.patch_size;
-    const int num_patches = ((
-    const int num_patches_per_side = image_size / patch_size; GGML_UNUSED(num_patches_per_side);
+    const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
     const int num_positions = num_patches + (ctx->has_class_embedding ? 1 : 0);
     const int hidden_size = hparams.hidden_size;
     const int n_head = hparams.n_head;
     const int d_head = hidden_size / n_head;
-
+    int n_layer = hparams.n_layer;
     const float eps = hparams.eps;

     const int batch_size = imgs->size;

-    if (ctx->has_llava_projector) {
+    if (ctx->has_llava_projector || ctx->has_minicpmv_projector) {
         GGML_ASSERT(batch_size == 1);
     }

@@ -559,7 +618,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
     struct ggml_context * ctx0 = ggml_init(params);
     struct ggml_cgraph * gf = ggml_new_graph(ctx0);

-    struct ggml_tensor * inp_raw = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32,
+    struct ggml_tensor * inp_raw = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, image_size_width, image_size_height, 3, batch_size);
     ggml_set_name(inp_raw, "inp_raw");
     ggml_set_input(inp_raw);

@@ -572,19 +631,21 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
         // inp = ggml_add(ctx0, inp, ggml_repeat(ctx0, model.patch_bias, inp));
         inp = ggml_add(ctx0, inp, model.patch_bias);
     }
-
-    // concat class_embeddings and patch_embeddings
     struct ggml_tensor * embeddings = inp;
-
-        embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size);
-        ggml_set_name(embeddings, "embeddings");
-        ggml_set_input(embeddings);
-        embeddings = ggml_acc(ctx0, embeddings, model.class_embedding,
-                embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0);
-        embeddings = ggml_acc(ctx0, embeddings, inp,
-                embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);
-    }
+    struct ggml_tensor * pos_embed = nullptr;

+    if (ctx->has_llava_projector) {
+        // concat class_embeddings and patch_embeddings
+        if (ctx->has_class_embedding) {
+            embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size);
+            ggml_set_name(embeddings, "embeddings");
+            ggml_set_input(embeddings);
+            embeddings = ggml_acc(ctx0, embeddings, model.class_embedding,
+                    embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0);
+            embeddings = ggml_acc(ctx0, embeddings, inp,
+                    embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);
+        }
+    }

     struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions);
     ggml_set_name(positions, "positions");
@@ -593,6 +654,19 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
     embeddings =
         ggml_add(ctx0, embeddings, ggml_get_rows(ctx0, model.position_embeddings, positions));

+    if (ctx->has_minicpmv_projector) {
+        int pos_w = image_size_width/patch_size;
+        int pos_h = image_size_height/patch_size;
+        if (ctx->minicpmv_version == 2) {
+            pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 4096, pos_w * pos_h, 1);
+        }
+        else if (ctx->minicpmv_version == 3) {
+            pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 3584, pos_w * pos_h, 1);
+        }
+        ggml_set_name(pos_embed, "pos_embed");
+        ggml_set_input(pos_embed);
+    }
+
     // pre-layernorm
     if (ctx->has_pre_norm) {
         embeddings = ggml_norm(ctx0, embeddings, eps);
@@ -602,6 +676,9 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
     }

     // loop over layers
+    if (ctx->has_minicpmv_projector) {
+        n_layer += 1;
+    }
     for (int il = 0; il < n_layer - 1; il++) {
         struct ggml_tensor * cur = embeddings; // embeddings = residual, cur = hidden_states

@@ -691,7 +768,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
     }

     // llava projector
-    {
+    if (ctx->has_llava_projector) {
         embeddings = ggml_reshape_2d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1]);

         struct ggml_tensor * patches = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_patches);
@@ -712,8 +789,8 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
             embeddings = ggml_gelu(ctx0, embeddings);
             embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings);
             embeddings = ggml_add(ctx0, embeddings, model.mm_2_b);
-
-
+        }
+        else if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) {
             embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
             embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
             // ggml_tensor_printf(embeddings, "mm_0_w",0,true,false);
@@ -872,6 +949,75 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
             GGML_ABORT("fatal error");
         }
     }
+    // minicpmv projector
+    else if (ctx->has_minicpmv_projector)
+    {
+        if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) {
+            struct ggml_tensor * q = model.mm_model_query;
+            { // layernorm
+                q = ggml_norm(ctx0, q, eps);
+                q = ggml_add(ctx0, ggml_mul(ctx0, q, model.mm_model_ln_q_w), model.mm_model_ln_q_b);
+            }
+            struct ggml_tensor * v = ggml_mul_mat(ctx0, model.mm_model_kv_proj, embeddings);
+            { // layernorm
+                v = ggml_norm(ctx0, v, eps);
+                v = ggml_add(ctx0, ggml_mul(ctx0, v, model.mm_model_ln_kv_w), model.mm_model_ln_kv_b);
+            }
+            struct ggml_tensor * k;
+            { // position
+                // q = ggml_add(ctx0, q, model.mm_model_pos_embed);
+                k = ggml_add(ctx0, v, pos_embed);
+            }
+
+            { // attention
+                int hidden_size = 4096;
+                const int d_head = 128;
+                int n_head = hidden_size/d_head;
+                int num_query = 96;
+                if (ctx->minicpmv_version == 2) {
+                    hidden_size = 4096;
+                    n_head = hidden_size/d_head;
+                    num_query = 96;
+                }
+                else if (ctx->minicpmv_version == 3) {
+                    hidden_size = 3584;
+                    n_head = hidden_size/d_head;
+                    num_query = 64;
+                }
+
+                struct ggml_tensor * Q = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_q_w, q), model.mm_model_attn_q_b);
+                Q = ggml_scale_inplace(ctx0, Q, 1.0f / sqrt((float)d_head));
+                struct ggml_tensor * K = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_k_w, k), model.mm_model_attn_k_b);
+                struct ggml_tensor * V = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_v_w, v), model.mm_model_attn_v_b);
+                // permute
+                Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, num_query, batch_size);
+                Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3));
+                Q = ggml_reshape_3d(ctx0, Q, d_head, num_query, n_head * batch_size);
+                K = ggml_reshape_4d(ctx0, K, d_head, n_head, num_positions, batch_size);
+                K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3));
+                K = ggml_reshape_3d(ctx0, K, d_head, num_positions, n_head * batch_size);
+                V = ggml_reshape_4d(ctx0, V, d_head, n_head, num_positions, batch_size);
+                V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3));
+                V = ggml_reshape_3d(ctx0, V, num_positions, d_head, n_head * batch_size);
+                struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
+                KQ = ggml_soft_max_inplace(ctx0, KQ);
+                struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ);
+                KQV = ggml_reshape_4d(ctx0, KQV, d_head, num_query, n_head, batch_size);
+                KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
+                KQV = ggml_cont_3d(ctx0, KQV, hidden_size, num_query, batch_size);
+
+                embeddings = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_o_w, KQV), model.mm_model_attn_o_b);
+            }
+            { // layernorm
+                embeddings = ggml_norm(ctx0, embeddings, eps);
+                embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_model_ln_post_w), model.mm_model_ln_post_b);
+            }
+            embeddings = ggml_mul_mat(ctx0, model.mm_model_proj, embeddings);
+        }
+        else {
+            GGML_ASSERT(false);
+        }
+    }

     // build the graph
     ggml_build_forward_expand(gf, embeddings);
```
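
The resampler wired in above is a single Perceiver-style cross-attention block: a fixed set of learned queries attends over the position-embedded image features, so the language model always receives the same number of image tokens no matter how many patches a slice produces. Schematically, reading the shapes off the hunk (per batch):

$$\text{out} = W_{\text{proj}}\,\mathrm{LN}\!\Big(W_o\,\mathrm{softmax}\!\Big(\tfrac{(W_q q)(W_k k)^\top}{\sqrt{d_h}}\Big)(W_v v) + b_o\Big), \qquad k = v + p,$$

where $q$ is the learned query (96 tokens at hidden size 4096 for `minicpmv_version == 2`, 64 tokens at 3584 for version 3), $v$ is the layer-normed `kv_proj` of the encoder output, $p$ is the `pos_embed` input tensor, and $d_h = 128$ per attention head.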
@@ -905,21 +1051,21 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
|
|
905
1051
|
const int idx_name = gguf_find_key(ctx, KEY_NAME);
|
|
906
1052
|
if (idx_name != -1) { // make name optional temporarily as some of the uploaded models missing it due to a bug
|
|
907
1053
|
const std::string name = gguf_get_val_str(ctx, idx_name);
|
|
908
|
-
|
|
1054
|
+
LOG_INF("%s: model name: %s\n", __func__, name.c_str());
|
|
909
1055
|
}
|
|
910
|
-
|
|
911
|
-
|
|
912
|
-
|
|
913
|
-
|
|
914
|
-
|
|
915
|
-
|
|
916
|
-
|
|
1056
|
+
LOG_INF("%s: description: %s\n", __func__, description.c_str());
|
|
1057
|
+
LOG_INF("%s: GGUF version: %d\n", __func__, gguf_get_version(ctx));
|
|
1058
|
+
LOG_INF("%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx));
|
|
1059
|
+
LOG_INF("%s: n_tensors: %d\n", __func__, n_tensors);
|
|
1060
|
+
LOG_INF("%s: n_kv: %d\n", __func__, n_kv);
|
|
1061
|
+
LOG_INF("%s: ftype: %s\n", __func__, ftype_str.c_str());
|
|
1062
|
+
LOG_INF("\n");
|
|
917
1063
|
}
|
|
918
1064
|
const int n_tensors = gguf_get_n_tensors(ctx);
|
|
919
1065
|
|
|
920
1066
|
// kv
|
|
921
1067
|
const int n_kv = gguf_get_n_kv(ctx);
|
|
922
|
-
|
|
1068
|
+
LOG_INF("%s: loaded meta data with %d key-value pairs and %d tensors from %s\n",
|
|
923
1069
|
__func__, n_kv, n_tensors, fname);
|
|
924
1070
|
{
|
|
925
1071
|
std::map<enum ggml_type, uint32_t> n_type;
|
|
@@ -930,7 +1076,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
|
|
930
1076
|
n_type[type]++;
|
|
931
1077
|
}
|
|
932
1078
|
|
|
933
|
-
|
|
1079
|
+
LOG_INF("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__);
|
|
934
1080
|
for (int i = 0; i < n_kv; i++) {
|
|
935
1081
|
const char * name = gguf_get_key(ctx, i);
|
|
936
1082
|
const enum gguf_type type = gguf_get_kv_type(ctx, i);
|
|
@@ -946,7 +1092,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
|
|
946
1092
|
}
|
|
947
1093
|
replace_all(value, "\n", "\\n");
|
|
948
1094
|
|
|
949
|
-
|
|
1095
|
+
LOG_INF("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str());
|
|
950
1096
|
}
|
|
951
1097
|
|
|
952
1098
|
// print type counts
|
|
@@ -955,7 +1101,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
|
|
955
1101
|
continue;
|
|
956
1102
|
}
|
|
957
1103
|
|
|
958
|
-
|
|
1104
|
+
LOG_INF("%s: - type %4s: %4d tensors\n", __func__, ggml_type_name(kv.first), kv.second);
|
|
959
1105
|
}
|
|
960
1106
|
}
|
|
961
1107
|
|
|
@@ -970,13 +1116,13 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
|
|
970
1116
|
size_t tensor_size = ggml_nbytes(cur);
|
|
971
1117
|
model_size += tensor_size;
|
|
972
1118
|
if (verbosity >= 3) {
|
|
973
|
-
|
|
1119
|
+
LOG_INF("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu, shape:[%" PRIu64 ", %" PRIu64 ", %" PRIu64 ", %" PRIu64 "], type = %s\n",
|
|
974
1120
|
__func__, i, ggml_n_dims(cur), cur->name, tensor_size, offset, cur->ne[0], cur->ne[1], cur->ne[2], cur->ne[3], ggml_type_name(type));
|
|
975
1121
|
}
|
|
976
1122
|
}
|
|
977
1123
|
}
|
|
978
1124
|
|
|
979
|
-
clip_ctx * new_clip = new clip_ctx;
|
|
1125
|
+
clip_ctx * new_clip = new clip_ctx{};
|
|
980
1126
|
|
|
981
1127
|
// update projector type
|
|
982
1128
|
{
|
|
@@ -997,23 +1143,27 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
|
|
997
1143
|
|
|
998
1144
|
#ifdef GGML_USE_CUDA
|
|
999
1145
|
new_clip->backend = ggml_backend_cuda_init(0);
|
|
1000
|
-
|
|
1146
|
+
LOG_INF("%s: CLIP using CUDA backend\n", __func__);
|
|
1001
1147
|
#endif
|
|
1002
1148
|
|
|
1003
1149
|
#ifdef GGML_USE_METAL
|
|
1004
1150
|
new_clip->backend = ggml_backend_metal_init();
|
|
1005
|
-
|
|
1151
|
+
LOG_INF("%s: CLIP using Metal backend\n", __func__);
|
|
1006
1152
|
#endif
|
|
1007
1153
|
|
|
1008
1154
|
#ifdef GGML_USE_CANN
|
|
1009
1155
|
new_clip->backend = ggml_backend_cann_init(0);
|
|
1010
|
-
|
|
1156
|
+
LOG_INF("%s: CLIP using CANN backend\n", __func__);
|
|
1011
1157
|
#endif
|
|
1012
1158
|
|
|
1159
|
+
#ifdef GGML_USE_VULKAN
|
|
1160
|
+
new_clip->backend = ggml_backend_vk_init(0);
|
|
1161
|
+
LOG_INF("%s: CLIP using Vulkan backend\n", __func__);
|
|
1162
|
+
#endif
|
|
1013
1163
|
|
|
1014
1164
|
if (!new_clip->backend) {
|
|
1015
1165
|
new_clip->backend = ggml_backend_cpu_init();
|
|
1016
|
-
|
|
1166
|
+
LOG_INF("%s: CLIP using CPU backend\n", __func__);
|
|
1017
1167
|
}
|
|
1018
1168
|
|
|
1019
1169
|
// model size and capabilities
|
|
@@ -1029,7 +1179,18 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
|
|
1029
1179
|
new_clip->has_llava_projector = gguf_get_val_bool(ctx, idx);
|
|
1030
1180
|
}
|
|
1031
1181
|
|
|
1032
|
-
|
|
1182
|
+
idx = gguf_find_key(ctx, KEY_HAS_MINICPMV_PROJ);
|
|
1183
|
+
if (idx != -1) {
|
|
1184
|
+
new_clip->has_minicpmv_projector = gguf_get_val_bool(ctx, idx);
|
|
1185
|
+
}
|
|
1186
|
+
|
|
1187
|
+
idx = gguf_find_key(ctx, KEY_MINICPMV_VERSION);
|
|
1188
|
+
if (idx != -1) {
|
|
1189
|
+
new_clip->minicpmv_version = gguf_get_val_i32(ctx, idx);
|
|
1190
|
+
}
|
|
1191
|
+
|
|
1192
|
+
// GGML_ASSERT(new_clip->has_llava_projector); // see monatis/clip.cpp for image and/or text encoding for semantic search
|
|
1193
|
+
|
|
1033
1194
|
GGML_ASSERT(new_clip->has_vision_encoder);
|
|
1034
1195
|
GGML_ASSERT(!new_clip->has_text_encoder);
|
|
1035
1196
|
|
|
@@ -1037,15 +1198,16 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
|
|
1037
1198
|
new_clip->use_gelu = gguf_get_val_bool(ctx, idx);
|
|
1038
1199
|
|
|
1039
1200
|
if (verbosity >= 1) {
|
|
1040
|
-
|
|
1041
|
-
|
|
1042
|
-
|
|
1043
|
-
|
|
1044
|
-
|
|
1201
|
+
LOG_INF("%s: text_encoder: %d\n", __func__, new_clip->has_text_encoder);
|
|
1202
|
+
LOG_INF("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder);
|
|
1203
|
+
LOG_INF("%s: llava_projector: %d\n", __func__, new_clip->has_llava_projector);
|
|
1204
|
+
LOG_INF("%s: minicpmv_projector: %d\n", __func__, new_clip->has_minicpmv_projector);
|
|
1205
|
+
LOG_INF("%s: model size: %.2f MB\n", __func__, model_size / 1024.0 / 1024.0);
|
|
1206
|
+
LOG_INF("%s: metadata size: %.2f MB\n", __func__, ggml_get_mem_size(meta) / 1024.0 / 1024.0);
|
|
1045
1207
|
}
|
|
1046
1208
|
}
|
|
1047
1209
|
|
|
1048
|
-
|
|
1210
|
+
LOG_INF("%s: params backend buffer size = % 6.2f MB (%i tensors)\n", __func__, model_size / (1024.0 * 1024.0), n_tensors);
|
|
1049
1211
|
|
|
1050
1212
|
// load tensors
|
|
1051
1213
|
{
|
|
@@ -1058,7 +1220,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
|
|
1058
1220
|
|
|
1059
1221
|
new_clip->ctx_data = ggml_init(params);
|
|
1060
1222
|
if (!new_clip->ctx_data) {
|
|
1061
|
-
|
|
1223
|
+
LOG_ERR("%s: ggml_init() failed\n", __func__);
|
|
1062
1224
|
clip_free(new_clip);
|
|
1063
1225
|
gguf_free(ctx);
|
|
1064
1226
|
return nullptr;
|
|
@@ -1066,7 +1228,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
|
|
1066
1228
|
|
|
1067
1229
|
auto fin = std::ifstream(fname, std::ios::binary);
|
|
1068
1230
|
if (!fin) {
|
|
1069
|
-
|
|
1231
|
+
LOG_ERR("cannot open model file for loading tensors\n");
|
|
1070
1232
|
clip_free(new_clip);
|
|
1071
1233
|
gguf_free(ctx);
|
|
1072
1234
|
return nullptr;
|
|
@@ -1088,7 +1250,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
|
|
1088
1250
|
const size_t offset = gguf_get_data_offset(ctx) + gguf_get_tensor_offset(ctx, i);
|
|
1089
1251
|
fin.seekg(offset, std::ios::beg);
|
|
1090
1252
|
if (!fin) {
|
|
1091
|
-
|
|
1253
|
+
LOG_ERR("%s: failed to seek for tensor %s\n", __func__, name);
|
|
1092
1254
|
clip_free(new_clip);
|
|
1093
1255
|
gguf_free(ctx);
|
|
1094
1256
|
return nullptr;
|
|
@@ -1159,23 +1321,23 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
|
|
1159
1321
|
}
|
|
1160
1322
|
|
|
1161
1323
|
if (verbosity >= 2) {
|
|
1162
|
-
|
|
1163
|
-
|
|
1164
|
-
|
|
1165
|
-
|
|
1166
|
-
|
|
1167
|
-
|
|
1168
|
-
|
|
1169
|
-
|
|
1170
|
-
|
|
1171
|
-
|
|
1172
|
-
|
|
1173
|
-
|
|
1324
|
+
LOG_INF("\n%s: vision model hparams\n", __func__);
|
|
1325
|
+
LOG_INF("image_size %d\n", hparams.image_size);
|
|
1326
|
+
LOG_INF("patch_size %d\n", hparams.patch_size);
|
|
1327
|
+
LOG_INF("v_hidden_size %d\n", hparams.hidden_size);
|
|
1328
|
+
LOG_INF("v_n_intermediate %d\n", hparams.n_intermediate);
|
|
1329
|
+
LOG_INF("v_projection_dim %d\n", hparams.projection_dim);
|
|
1330
|
+
LOG_INF("v_n_head %d\n", hparams.n_head);
|
|
1331
|
+
LOG_INF("v_n_layer %d\n", hparams.n_layer);
|
|
1332
|
+
LOG_INF("v_eps %f\n", hparams.eps);
|
|
1333
|
+
LOG_INF("v_image_mean %f %f %f\n", new_clip->image_mean[0], new_clip->image_mean[1], new_clip->image_mean[2]);
|
|
1334
|
+
LOG_INF("v_image_std %f %f %f\n", new_clip->image_std[0], new_clip->image_std[1], new_clip->image_std[2]);
|
|
1335
|
+
LOG_INF("v_image_grid_pinpoints: ");
|
|
1174
1336
|
for (int i = 0; i < 32 && (hparams.image_grid_pinpoints[i] != 0); ++i) {
|
|
1175
|
-
|
|
1337
|
+
LOG_INF("%d ", hparams.image_grid_pinpoints[i]);
|
|
1176
1338
|
}
|
|
1177
|
-
|
|
1178
|
-
|
|
1339
|
+
LOG_INF("\n");
|
|
1340
|
+
LOG_INF("v_mm_patch_merge_type: %s\n", hparams.mm_patch_merge_type);
|
|
1179
1341
|
|
|
1180
1342
|
}
|
|
1181
1343
|
|
|
@@ -1213,7 +1375,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
|
|
1213
1375
|
vision_model.patch_embeddings = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD);
|
|
1214
1376
|
vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v"));
|
|
1215
1377
|
} catch(const std::exception& /*e*/) {
|
|
1216
|
-
|
|
1378
|
+
LOG_ERR("%s: failed to load vision model tensors\n", __func__);
|
|
1217
1379
|
}
|
|
1218
1380
|
|
|
1219
1381
|
// LLaVA projection
|
|
@@ -1242,7 +1404,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
|
|
1242
1404
|
} catch (std::runtime_error & /*e*/) { }
|
|
1243
1405
|
try {
|
|
1244
1406
|
vision_model.image_newline = get_tensor(new_clip->ctx_data, TN_IMAGE_NEWLINE);
|
|
1245
|
-
//
|
|
1407
|
+
// LOG_INF("%s: image_newline tensor (llava-1.6) found\n", __func__);
|
|
1246
1408
|
} catch (std::runtime_error & /*e*/) { }
|
|
1247
1409
|
} else if (new_clip->proj_type == PROJECTOR_TYPE_LDP) {
|
|
1248
1410
|
// MobileVLM projection
|
|
@@ -1281,6 +1443,27 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
|
|
1281
1443
|
vision_model.mm_model_peg_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_PEG, 0, "weight"));
|
|
1282
1444
|
vision_model.mm_model_peg_0_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_PEG, 0, "bias"));
|
|
1283
1445
|
}
|
|
1446
|
+
else if (new_clip->proj_type == PROJECTOR_TYPE_RESAMPLER) {
|
|
1447
|
+
// vision_model.mm_model_pos_embed = get_tensor(new_clip->ctx_data, TN_MINICPMV_POS_EMBD);
|
|
1448
|
+
vision_model.mm_model_pos_embed_k = get_tensor(new_clip->ctx_data, TN_MINICPMV_POS_EMBD_K);
|
|
1449
|
+
vision_model.mm_model_query = get_tensor(new_clip->ctx_data, TN_MINICPMV_QUERY);
|
|
1450
|
+
vision_model.mm_model_proj = get_tensor(new_clip->ctx_data, TN_MINICPMV_PROJ);
|
|
1451
|
+
vision_model.mm_model_kv_proj = get_tensor(new_clip->ctx_data, TN_MINICPMV_KV_PROJ);
|
|
1452
|
+
vision_model.mm_model_attn_q_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "q", "weight"));
|
|
1453
|
+
vision_model.mm_model_attn_k_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "k", "weight"));
|
|
1454
|
+
vision_model.mm_model_attn_v_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "v", "weight"));
|
|
1455
|
+
vision_model.mm_model_attn_q_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "q", "bias"));
|
|
1456
|
+
vision_model.mm_model_attn_k_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "k", "bias"));
|
|
1457
|
+
vision_model.mm_model_attn_v_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "v", "bias"));
|
|
1458
|
+
vision_model.mm_model_attn_o_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "out", "weight"));
|
|
1459
|
+
vision_model.mm_model_attn_o_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "out", "bias"));
|
|
1460
|
+
vision_model.mm_model_ln_q_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "q", "weight"));
|
|
1461
|
+
vision_model.mm_model_ln_q_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "q", "bias"));
|
|
1462
|
+
vision_model.mm_model_ln_kv_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "kv", "weight"));
|
|
1463
|
+
vision_model.mm_model_ln_kv_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "kv", "bias"));
|
|
1464
|
+
vision_model.mm_model_ln_post_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "post", "weight"));
|
|
1465
|
+
vision_model.mm_model_ln_post_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "post", "bias"));
|
|
1466
|
+
}
|
|
1284
1467
|
else {
|
|
1285
1468
|
std::string proj_type = PROJECTOR_TYPE_NAMES[new_clip->proj_type];
|
|
1286
1469
|
throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str()));
|
|
@@ -1319,15 +1502,26 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
|
|
1319
1502
|
new_clip->compute_alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(new_clip->backend));
|
|
1320
1503
|
clip_image_f32_batch batch;
|
|
1321
1504
|
batch.size = 1;
|
|
1322
|
-
ggml_cgraph * gf = clip_image_build_graph(new_clip, &batch);
|
|
1505
|
+
ggml_cgraph * gf = clip_image_build_graph(new_clip, &batch, nullptr, false);
|
|
1323
1506
|
ggml_gallocr_reserve(new_clip->compute_alloc, gf);
|
|
1324
1507
|
size_t compute_memory_buffer_size = ggml_gallocr_get_buffer_size(new_clip->compute_alloc, 0);
|
|
1325
|
-
|
|
1508
|
+
LOG_INF("%s: compute allocated memory: %.2f MB\n", __func__, compute_memory_buffer_size /1024.0/1024.0);
|
|
1326
1509
|
}
|
|
1327
1510
|
|
|
1328
1511
|
return new_clip;
|
|
1329
1512
|
}
|
|
1330
1513
|
|
|
1514
|
+
void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size) {
|
|
1515
|
+
ctx_clip->load_image_size = load_image_size;
|
|
1516
|
+
}
|
|
1517
|
+
|
|
1518
|
+
struct clip_image_size * clip_image_size_init() {
|
|
1519
|
+
struct clip_image_size * load_image_size = new struct clip_image_size();
|
|
1520
|
+
load_image_size->width = 448;
|
|
1521
|
+
load_image_size->height = 448;
|
|
1522
|
+
return load_image_size;
|
|
1523
|
+
}
|
|
1524
|
+
|
|
1331
1525
|
struct clip_image_u8 * clip_image_u8_init() {
|
|
1332
1526
|
return new clip_image_u8();
|
|
1333
1527
|
}
|
|
@@ -1362,7 +1556,7 @@ bool clip_image_load_from_file(const char * fname, clip_image_u8 * img) {
|
|
|
1362
1556
|
int nx, ny, nc;
|
|
1363
1557
|
auto * data = stbi_load(fname, &nx, &ny, &nc, 3);
|
|
1364
1558
|
if (!data) {
|
|
1365
|
-
|
|
1559
|
+
LOG_ERR("%s: failed to load image '%s'\n", __func__, fname);
|
|
1366
1560
|
return false;
|
|
1367
1561
|
}
|
|
1368
1562
|
build_clip_img_from_data(data, nx, ny, img);
|
|
@@ -1374,7 +1568,7 @@ bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length
|
|
|
1374
1568
|
int nx, ny, nc;
|
|
1375
1569
|
auto * data = stbi_load_from_memory(bytes, bytes_length, &nx, &ny, &nc, 3);
|
|
1376
1570
|
if (!data) {
|
|
1377
|
-
|
|
1571
|
+
LOG_ERR("%s: failed to decode image bytes\n", __func__);
|
|
1378
1572
|
return false;
|
|
1379
1573
|
}
|
|
1380
1574
|
build_clip_img_from_data(data, nx, ny, img);
|
|
@@ -1433,7 +1627,7 @@ static void normalize_image_u8_to_f32(const clip_image_u8* src, clip_image_f32*
|
|
|
1433
1627
|
}
|
|
1434
1628
|
}
|
|
1435
1629
|
|
|
1436
|
-
inline
|
|
1630
|
+
inline int clip(int x, int lower, int upper) {
|
|
1437
1631
|
return std::max(lower, std::min(x, upper));
|
|
1438
1632
|
}
|
|
1439
1633
|
|
|
@@ -1564,7 +1758,7 @@ static std::pair<int, int> select_best_resolution(const std::pair<int, int> & or
|
|
|
1564
1758
|
int downscaled_height = static_cast<int>(original_height * scale);
|
|
1565
1759
|
int effective_resolution = std::min(downscaled_width * downscaled_height, original_width * original_height);
|
|
1566
1760
|
int wasted_resolution = (width * height) - effective_resolution;
|
|
1567
|
-
//
|
|
1761
|
+
// LOG_INF("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution);
|
|
1568
1762
|
if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_resolution < min_wasted_resolution)) {
|
|
1569
1763
|
max_effective_resolution = effective_resolution;
|
|
1570
1764
|
min_wasted_resolution = wasted_resolution;
|
|
@@ -1598,12 +1792,185 @@ static std::vector<clip_image_u8*> divide_to_patches_u8(const clip_image_u8 & im
|
|
|
1598
1792
|
return patches;
|
|
1599
1793
|
}
|
|
1600
1794
|
|
|
1795
|
+
static int ensure_divide(int length, int patch_size) {
|
|
1796
|
+
return std::max(static_cast<int>(std::round(static_cast<float>(length) / patch_size) * patch_size), patch_size);
|
|
1797
|
+
}
|
|
1798
|
+
|
|
1799
|
+
static std::pair<int, int> uhd_find_best_resize(std::pair<int, int> original_size, int scale_resolution, int patch_size, bool allow_upscale = false) {
|
|
1800
|
+
int width = original_size.first;
|
|
1801
|
+
int height = original_size.second;
|
|
1802
|
+
if ((width * height > scale_resolution * scale_resolution) || allow_upscale) {
|
|
1803
|
+
float r = static_cast<float>(width) / height;
|
|
1804
|
+
height = static_cast<int>(scale_resolution / std::sqrt(r));
|
|
1805
|
+
width = static_cast<int>(height * r);
|
|
1806
|
+
}
|
|
1807
|
+
int best_width = ensure_divide(width, patch_size);
|
|
1808
|
+
int best_height = ensure_divide(height, patch_size);
|
|
1809
|
+
return std::make_pair(best_width, best_height);
|
|
1810
|
+
}
|
|
1811
|
+
|
|
1812
|
+
static std::pair<int, int> uhd_get_refine_size(std::pair<int, int> original_size, std::pair<int, int> grid, int scale_resolution, int patch_size, bool allow_upscale = false) {
|
|
1813
|
+
int width, height;
|
|
1814
|
+
std::tie(width, height) = original_size;
|
|
1815
|
+
int grid_x, grid_y;
|
|
1816
|
+
std::tie(grid_x, grid_y) = grid;
|
|
1817
|
+
|
|
1818
|
+
int refine_width = ensure_divide(width, grid_x);
|
|
1819
|
+
int refine_height = ensure_divide(height, grid_y);
|
|
1820
|
+
|
|
1821
|
+
int grid_width = refine_width / grid_x;
|
|
1822
|
+
int grid_height = refine_height / grid_y;
|
|
1823
|
+
|
|
1824
|
+
// auto best_grid_size = find_best_resize(std::make_tuple(grid_width, grid_height), scale_resolution, patch_size, allow_upscale); (old line)
|
|
1825
|
+
auto best_grid_size = uhd_find_best_resize(std::make_pair(grid_width, grid_height), scale_resolution, patch_size, allow_upscale); // (new line) => fixes conversion for make_tuple to make_pair
|
|
1826
|
+
int best_grid_width, best_grid_height;
|
|
1827
|
+
std::tie(best_grid_width, best_grid_height) = best_grid_size;
|
|
1828
|
+
|
|
1829
|
+
// std::pair<int, int> refine_size = std::make_tuple(best_grid_width * grid_x, best_grid_height * grid_y); (old line)
|
|
1830
|
+
std::pair<int, int> refine_size = std::make_pair(best_grid_width * grid_x, best_grid_height * grid_y); // (new line)
|
|
1831
|
+
return refine_size;
|
|
1832
|
+
}
|
|
1833
|
+
|
|
1834
|
+
static std::pair<int, int> uhd_best_grid(const int max_slice_nums, const int multiple, const float log_ratio) {
|
|
1835
|
+
std::vector<int> candidate_split_grids_nums;
|
|
1836
|
+
for (int i : {multiple - 1, multiple, multiple + 1}) {
|
|
1837
|
+
if (i == 1 || i > max_slice_nums) {
|
|
1838
|
+
continue;
|
|
1839
|
+
}
|
|
1840
|
+
candidate_split_grids_nums.push_back(i);
|
|
1841
|
+
}
|
|
1842
|
+
|
|
1843
|
+
std::vector<std::pair<int, int>> candidate_grids;
|
|
1844
|
+
for (int split_grids_nums : candidate_split_grids_nums) {
|
|
1845
|
+
int m = 1;
|
|
1846
|
+
while (m <= split_grids_nums) {
|
|
1847
|
+
if (split_grids_nums % m == 0) {
|
|
1848
|
+
candidate_grids.emplace_back(m, split_grids_nums / m);
|
|
1849
|
+
}
|
|
1850
|
+
++m;
|
|
1851
|
+
}
|
|
1852
|
+
}
|
|
1853
|
+
|
|
1854
|
+
std::pair<int, int> best_grid{1, 1};
|
|
1855
|
+
float min_error = std::numeric_limits<float>::infinity();
|
|
1856
|
+
for (const auto& grid : candidate_grids) {
|
|
1857
|
+
float error = std::abs(log_ratio - std::log(1.0 * grid.first / grid.second));
|
|
1858
|
+
if (error < min_error) {
|
|
1859
|
+
best_grid = grid;
|
|
1860
|
+
min_error = error;
|
|
1861
|
+
}
|
|
1862
|
+
}
|
|
1863
|
+
return best_grid;
|
|
1864
|
+
}
|
|
1865
|
+
|
|
1866
|
+
// inspired from LLaVA-UHD:
|
|
1867
|
+
// -> https://arxiv.org/pdf/2403.11703
|
|
1868
|
+
// -> https://github.com/thunlp/LLaVA-UHD
|
|
1869
|
+
// -> https://github.com/thunlp/LLaVA-UHD/blob/302301bc2175f7e717fb8548516188e89f649753/llava_uhd/train/llava-uhd/slice_logic.py#L118
|
|
1870
|
+
static std::vector<std::vector<clip_image_u8 *>> uhd_slice_image(const clip_image_u8 * img, const int max_slice_nums=9, const int scale_resolution=448, const int patch_size=14) {
|
|
1871
|
+
const std::pair<int, int> original_size={img->nx,img->ny};
|
|
1872
|
+
const int original_width = img->nx;
|
|
1873
|
+
const int original_height = img->ny;
|
+    const float log_ratio = log(1.0*original_width/original_height);
+    const float ratio = 1.0 * original_width * original_height/ (scale_resolution * scale_resolution);
+    const int multiple = fmin(ceil(ratio), max_slice_nums);
+
+    std::vector<std::vector<clip_image_u8 *>> images;
+    LOG_INF("%s: multiple %d\n", __func__, multiple);
+    images.push_back(std::vector<clip_image_u8 *>());
+
+    if (multiple <= 1) {
+        auto best_size = uhd_find_best_resize(original_size, scale_resolution, patch_size, true);
+        clip_image_u8 * source_image = clip_image_u8_init();
+        bicubic_resize(*img, *source_image, best_size.first, best_size.second);
+        // source_image = image.resize(best_size, Image.Resampling.BICUBIC)
+        images[images.size()-1].push_back(source_image);
+    }
+    else if (multiple > 1) {
+        auto best_size = uhd_find_best_resize(original_size, scale_resolution, patch_size);
+        clip_image_u8 * source_image = clip_image_u8_init();
+        bicubic_resize(*img, *source_image, best_size.first, best_size.second);
+        // source_image = image.copy().resize(best_resize, Image.Resampling.BICUBIC)
+        LOG_INF("%s: image_size: %d %d; source_image size: %d %d\n", __func__, img->nx, img->ny, best_size.first, best_size.second);
+        images[images.size()-1].push_back(source_image);
+
+        std::pair<int, int> best_grid = uhd_best_grid(max_slice_nums, multiple, log_ratio);
+        LOG_INF("%s: image_size: %d %d; best_grid: %d %d\n", __func__, img->nx, img->ny, best_grid.first, best_grid.second);
+
+        auto refine_size = uhd_get_refine_size(original_size, best_grid, scale_resolution, patch_size, true);
+        clip_image_u8 * refine_image = clip_image_u8_init();
+        bicubic_resize(*img, *refine_image, refine_size.first, refine_size.second);
+
+        LOG_INF("%s: refine_image_size: %d %d; refine_size: %d %d\n", __func__, refine_image->nx, refine_image->ny, refine_size.first, refine_size.second);
+
+        // split_to_patches
+        int width = refine_image->nx;
+        int height = refine_image->ny;
+        int grid_x = int(width / best_grid.first);
+        int grid_y = int(height / best_grid.second);
+        for (int patches_i = 0, ic = 0; patches_i < height && ic < best_grid.second; patches_i += grid_y, ic += 1){
+            images.push_back(std::vector<clip_image_u8 *>());
+            for(int patches_j = 0, jc = 0; patches_j < width && jc < best_grid.first; patches_j += grid_x, jc += 1){
+                clip_image_u8 * patch = clip_image_u8_init();
+                patch->nx = grid_x;
+                patch->ny = grid_y;
+                patch->buf.resize(3 * patch->nx * patch->ny);
+                for (int y = patches_i; y < patches_i + grid_y; ++y) {
+                    for (int x = patches_j; x < patches_j + grid_x; ++x) {
+                        const int i = 3 * (y * refine_image->nx + x);
+                        const int j = 3 * ((y-patches_i) * patch->nx + (x-patches_j));
+                        patch->buf[j] = refine_image->buf[i];
+                        patch->buf[j+1] = refine_image->buf[i+1];
+                        patch->buf[j+2] = refine_image->buf[i+2];
+                    }
+                }
+                images[images.size()-1].push_back(patch);
+            }
+        }
+    }
+    return images;
+}
+
+int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip) {
+    const int max_slice_nums=9;
+    const int scale_resolution=448;
+    const int original_width = ctx_clip->load_image_size->width;
+    const int original_height = ctx_clip->load_image_size->height;
+    const float log_ratio = log(1.0*original_width/original_height);
+    const float ratio = 1.0 * original_width * original_height/ (scale_resolution * scale_resolution);
+    const int multiple = fmin(ceil(ratio), max_slice_nums);
+    std::pair<int, int> best_grid = uhd_best_grid(max_slice_nums, multiple, log_ratio);
+    return best_grid.first;
+}
+
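The slicing policy above reduces to one area ratio: multiple = min(ceil(W*H / 448^2), 9) decides whether the image stays whole or gets a refined grid of slices. A standalone sketch of that arithmetic, using a hypothetical 1344x896 input (the sizes are illustrative, not from the package):

```cpp
// Sketch of the slice-count arithmetic from uhd_slice_image above.
// The 1344x896 input size is a hypothetical example.
#include <cmath>
#include <cstdio>

int main() {
    const int max_slice_nums   = 9;
    const int scale_resolution = 448;
    const int w = 1344, h = 896;

    const double ratio    = 1.0 * w * h / (scale_resolution * scale_resolution);
    const int    multiple = (int) std::fmin(std::ceil(ratio), (double) max_slice_nums);

    // multiple <= 1 keeps one bicubic-resized image; multiple > 1 adds a
    // refined image cut into a best_grid.first x best_grid.second patch grid
    std::printf("ratio = %.2f -> multiple = %d\n", ratio, multiple); // 6.00 -> 6
    return 0;
}
```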
 // returns the normalized float tensor for llava-1.5, for spatial_unpad with anyres processing for llava-1.6 it returns the normalized image patch tensors as a vector
 // res_imgs memory is being allocated here, previous allocations will be freed if found
 bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch * res_imgs) {
+
+    if(clip_is_minicpmv(ctx)){
+        int max_slice_nums = 9;
+        std::vector<std::vector<clip_image_u8 *>> imgs = uhd_slice_image(img, max_slice_nums);
+        res_imgs->size = 0;
+        for (size_t i = 0; i < imgs.size(); ++i){
+            res_imgs->size += imgs[i].size();
+        }
+        res_imgs->data = new clip_image_f32[res_imgs->size];
+        int idx = 0;
+        for (size_t i = 0; i < imgs.size(); ++i) {
+            for (size_t j = 0; j < imgs[i].size(); ++j) {
+                LOG_DBG("%s: %d %d\n", __func__,imgs[i][j]->nx,imgs[i][j]->ny);
+                clip_image_f32 * res = clip_image_f32_init();
+                normalize_image_u8_to_f32(imgs[i][j], res, ctx->image_mean, ctx->image_std);
+                res_imgs->data[idx++] = *res;
+                clip_image_f32_free(res);
+            }
+        }
+        return true;
+    }
+
     bool pad_to_square = true;
     if (!ctx->has_vision_encoder) {
-        LOG_TEE("This gguf file seems to have no vision encoder\n");
+        LOG_ERR("This gguf file seems to have no vision encoder\n");
         return false;
     }
     auto & params = ctx->vision_model.hparams;
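The MiniCPM-V preprocessing branch above flattens a ragged vector of slice rows into the linear res_imgs batch in two passes: size accumulation, then copy. A minimal sketch of the same pattern with simplified stand-in types (not the package's structs):

```cpp
// Sketch of the two-pass flatten used in the MiniCPM-V preprocess branch:
// first sum the row sizes, then copy row by row into one linear batch.
#include <cstdio>
#include <vector>

int main() {
    // stand-ins for the per-row slice vectors produced by uhd_slice_image
    std::vector<std::vector<int>> imgs = {{0}, {1, 2, 3}, {4, 5, 6}};

    size_t total = 0;
    for (size_t i = 0; i < imgs.size(); ++i) {
        total += imgs[i].size(); // mirrors the res_imgs->size accumulation
    }

    std::vector<int> batch(total); // plays the role of res_imgs->data
    size_t idx = 0;
    for (size_t i = 0; i < imgs.size(); ++i) {
        for (size_t j = 0; j < imgs[i].size(); ++j) {
            batch[idx++] = imgs[i][j];
        }
    }
    std::printf("flattened %zu slices\n", batch.size());
    return 0;
}
```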
@@ -1680,7 +2047,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch * res_imgs) {
     }
 
     for (size_t i = 0; i < patches.size(); i++) {
-        //
+        // LOG_DBG("patch %d: %d %d\n", i, patches[i]->nx, patches[i]->ny);
         clip_image_u8_free(patches[i]);
     }
 
@@ -1816,14 +2183,107 @@ int clip_n_patches(const struct clip_ctx * ctx) {
 
     if (ctx->proj_type == PROJECTOR_TYPE_LDP || ctx->proj_type == PROJECTOR_TYPE_LDPV2) {
         n_patches /= 4;
+    } else if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) {
+        if (ctx->minicpmv_version == 2) {
+            n_patches = 96;
+        }
+        else if (ctx->minicpmv_version == 3) {
+            n_patches = 64;
+        }
     }
 
     return n_patches;
 }
 
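The resampler constants here pair with the embedding widths added to clip_n_mmproj_embd near the end of this file: version 2 yields 96 tokens of width 4096, version 3 yields 64 tokens of width 3584. A hypothetical lookup, not an API of this package, that ties the two hunks together:

```cpp
// Hypothetical helper tying together the version-dependent constants in this
// diff: token count from clip_n_patches, width from clip_n_mmproj_embd.
struct minicpmv_dims {
    int n_patches; // resampler query tokens per image slice
    int n_embd;    // width of each token (the LLM's hidden size)
};

static minicpmv_dims resampler_dims(int minicpmv_version) {
    if (minicpmv_version == 2) return {96, 4096};
    if (minicpmv_version == 3) return {64, 3584};
    return {0, 0}; // not a MiniCPM-V model
}
```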
+static std::vector<std::vector<std::vector<float>>> get_1d_sincos_pos_embed_from_grid_new(int embed_dim, const std::vector<std::vector<float>> & pos) {
+    assert(embed_dim % 2 == 0);
+    int H = pos.size();
+    int W = pos[0].size();
+
+    std::vector<float> omega(embed_dim / 2);
+    for (int i = 0; i < embed_dim / 2; ++i) {
+        omega[i] = 1.0 / pow(10000.0, static_cast<float>(i) / (embed_dim / 2));
+    }
+
+    std::vector<std::vector<std::vector<float>>> emb(H, std::vector<std::vector<float>>(W, std::vector<float>(embed_dim)));
+    for (int h = 0; h < H; ++h) {
+        for (int w = 0; w < W; ++w) {
+            for (int d = 0; d < embed_dim / 2; ++d) {
+                float out_value = pos[h][w] * omega[d];
+                emb[h][w][d] = sin(out_value);
+                emb[h][w][d + embed_dim / 2] = cos(out_value);
+            }
+        }
+    }
+
+    return emb;
+}
+
+static std::vector<std::vector<std::vector<float>>> get_2d_sincos_pos_embed_from_grid(int embed_dim, const std::vector<std::vector<std::vector<float>>> & grid) {
+    assert(embed_dim % 2 == 0);
+    std::vector<std::vector<std::vector<float>>> emb_h = get_1d_sincos_pos_embed_from_grid_new(embed_dim / 2, grid[0]); // (H, W, D/2)
+    std::vector<std::vector<std::vector<float>>> emb_w = get_1d_sincos_pos_embed_from_grid_new(embed_dim / 2, grid[1]); // (H, W, D/2)
+
+    int H = emb_h.size();
+    int W = emb_h[0].size();
+    std::vector<std::vector<std::vector<float>>> emb(H, std::vector<std::vector<float>>(W, std::vector<float>(embed_dim)));
+
+    for (int h = 0; h < H; ++h) {
+        for (int w = 0; w < W; ++w) {
+            for (int d = 0; d < embed_dim / 2; ++d) {
+                emb[h][w][d] = emb_h[h][w][d];
+                emb[h][w][d + embed_dim / 2] = emb_w[h][w][d];
+            }
+        }
+    }
+    return emb;
+}
+
+static std::vector<std::vector<float>> get_2d_sincos_pos_embed(int embed_dim, const std::pair<int, int> image_size) {
+    int grid_h_size = image_size.first;
+    int grid_w_size = image_size.second;
+
+    std::vector<float> grid_h(grid_h_size);
+    std::vector<float> grid_w(grid_w_size);
+
+    for (int i = 0; i < grid_h_size; ++i) {
+        grid_h[i] = static_cast<float>(i);
+    }
+    for (int i = 0; i < grid_w_size; ++i) {
+        grid_w[i] = static_cast<float>(i);
+    }
+
+    std::vector<std::vector<float>> grid(grid_h_size, std::vector<float>(grid_w_size));
+    for (int h = 0; h < grid_h_size; ++h) {
+        for (int w = 0; w < grid_w_size; ++w) {
+            grid[h][w] = grid_w[w];
+        }
+    }
+    std::vector<std::vector<std::vector<float>>> grid_2d = {grid, grid};
+    for (int h = 0; h < grid_h_size; ++h) {
+        for (int w = 0; w < grid_w_size; ++w) {
+            grid_2d[0][h][w] = grid_h[h];
+            grid_2d[1][h][w] = grid_w[w];
+        }
+    }
+
+    std::vector<std::vector<std::vector<float>>> pos_embed_3d = get_2d_sincos_pos_embed_from_grid(embed_dim, grid_2d);
+
+    int H = image_size.first;
+    int W = image_size.second;
+    std::vector<std::vector<float>> pos_embed_2d(H * W, std::vector<float>(embed_dim));
+    for (int h = 0; h < H; ++h) {
+        for (int w = 0; w < W; ++w) {
+            pos_embed_2d[w * H + h] = pos_embed_3d[h][w];
+        }
+    }
+
+    return pos_embed_2d;
+}
+
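Per grid cell, the functions above implement the standard transformer sincos recipe: the first half of each vector holds sin(pos * omega_d) and the second half cos(pos * omega_d), with omega_d = 1 / 10000^(d / (D/2)). A standalone check of that inner loop (embed_dim = 8 is an illustrative size):

```cpp
// Standalone check of the per-cell sincos recipe used by
// get_1d_sincos_pos_embed_from_grid_new above (embed_dim = 8 is illustrative).
#include <cmath>
#include <cstdio>

int main() {
    const int    embed_dim = 8;    // must be even: halves split into sin / cos
    const double pos       = 3.0;  // one grid coordinate
    for (int d = 0; d < embed_dim / 2; ++d) {
        const double omega = 1.0 / std::pow(10000.0, (double) d / (embed_dim / 2));
        std::printf("d=%d  sin=% .4f  cos=% .4f\n",
                    d, std::sin(pos * omega), std::cos(pos * omega));
    }
    return 0;
}
```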
 bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec) {
     if (!ctx->has_vision_encoder) {
-        LOG_TEE("This gguf file seems to have no vision encoder\n");
+        LOG_ERR("This gguf file seems to have no vision encoder\n");
         return false;
     }
 
@@ -1835,7 +2295,7 @@ bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec) {
 
 bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs, float * vec) {
     if (!ctx->has_vision_encoder) {
-        LOG_TEE("This gguf file seems to have no vision encoder\n");
+        LOG_ERR("This gguf file seems to have no vision encoder\n");
         return false;
     }
 
@@ -1843,19 +2303,33 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs, float * vec) {
     if (ctx->has_llava_projector) {
         GGML_ASSERT(batch_size == 1); // TODO: support multiple images
     }
+    if (ctx->has_minicpmv_projector) {
+        GGML_ASSERT(batch_size == 1);
+    }
 
     // build the inference graph
-    ggml_cgraph * gf = clip_image_build_graph(ctx, imgs);
+    ggml_cgraph * gf = clip_image_build_graph(ctx, imgs, ctx->load_image_size, true);
     ggml_gallocr_alloc_graph(ctx->compute_alloc, gf);
 
     // set inputs
     const auto & model = ctx->vision_model;
     const auto & hparams = model.hparams;
 
-    const int image_size = hparams.image_size;
+    const int image_size = hparams.image_size;
+    int image_size_width = image_size;
+    int image_size_height = image_size;
+    if (ctx->has_minicpmv_projector) {
+        image_size_width = imgs->data[0].nx;
+        image_size_height = imgs->data[0].ny;
+    }
     const int patch_size = hparams.patch_size;
-    const int num_patches = ((image_size / patch_size) * (image_size / patch_size));
+    const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
     const int num_positions = num_patches + (ctx->has_class_embedding ? 1 : 0);
+    if(ctx->load_image_size==nullptr){
+        ctx->load_image_size= clip_image_size_init();
+    }
+    const int pos_w = ctx->load_image_size->width/patch_size;
+    const int pos_h = ctx->load_image_size->height/patch_size;
 
     {
         struct ggml_tensor * inp_raw = ggml_graph_get_tensor(gf, "inp_raw");
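With the MiniCPM-V projector the per-slice width and height come from the batch rather than the fixed hparams size, so num_patches and the position grid vary per slice. A quick arithmetic check with a hypothetical 448x448 slice and the usual 14-pixel patches:

```cpp
// Arithmetic check for the patch/position grid computed above.
// 448x448 and patch_size 14 are hypothetical example values.
#include <cstdio>

int main() {
    const int patch_size        = 14;
    const int image_size_width  = 448;
    const int image_size_height = 448;

    const int num_patches = (image_size_width / patch_size) * (image_size_height / patch_size);
    const int pos_w = image_size_width / patch_size;
    const int pos_h = image_size_height / patch_size;

    std::printf("num_patches = %d (pos grid %d x %d)\n", num_patches, pos_w, pos_h); // 1024 (32 x 32)
    return 0;
}
```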
@@ -1864,7 +2338,9 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs, float * vec) {
         for (size_t i = 0; i < imgs->size; i++) {
             const int nx = imgs->data[i].nx;
             const int ny = imgs->data[i].ny;
-            GGML_ASSERT(nx == image_size && ny == image_size);
+            if (!ctx->has_minicpmv_projector) {
+                GGML_ASSERT(nx == image_size && ny == image_size);
+            }
 
             const int n = nx * ny;
 
@@ -1881,53 +2357,97 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs, float * vec) {
         ggml_backend_tensor_set(inp_raw, data, 0, ggml_nbytes(inp_raw));
         free(data);
     }
+    if (ctx->has_minicpmv_projector) {
+        {
+            // inspired from siglip:
+            // -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit
+            // -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit/blob/d66538faeba44480d0bfaa42145eef26f9423199/modeling_siglip.py#L316
+            struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions");
+            int* positions_data = (int*)malloc(ggml_nbytes(positions));
+            int bucket_coords_h[70];
+            int bucket_coords_w[70];
+            for (int i = 0; i < pos_h; i++){
+                bucket_coords_h[i] = std::floor(70.0*i/pos_h);
+            }
+            for (int i = 0; i < pos_w; i++){
+                bucket_coords_w[i] = std::floor(70.0*i/pos_w);
+            }
+            for (int i = 0, id = 0; i < pos_h; i++){
+                for (int j = 0; j < pos_w; j++){
+                    positions_data[id++] = bucket_coords_h[i]*70 + bucket_coords_w[j];
+                }
+            }
+            ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions));
+            free(positions_data);
+        }
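The block above maps an arbitrary pos_h x pos_w patch grid onto a fixed 70x70 table of position buckets, flattening each pair as h_bucket * 70 + w_bucket. A small sketch of that mapping with a hypothetical 32x32 grid:

```cpp
// Sketch of the position-bucket mapping used for the MiniCPM-V "positions"
// tensor above; the 32x32 grid is a hypothetical example.
#include <cmath>
#include <cstdio>

int main() {
    const int pos_w = 32, pos_h = 32;
    for (int i = 0; i < pos_h; i += 8) {
        for (int j = 0; j < pos_w; j += 8) {
            const int bh = (int) std::floor(70.0 * i / pos_h); // row bucket
            const int bw = (int) std::floor(70.0 * j / pos_w); // column bucket
            std::printf("patch (%2d,%2d) -> bucket id %4d\n", i, j, bh * 70 + bw);
        }
    }
    return 0;
}
```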
 
-    {
-        if (ctx->has_class_embedding) {
-            struct ggml_tensor * embeddings = ggml_graph_get_tensor(gf, "embeddings");
+        {
+            // inspired from resampler of Qwen-VL:
+            // -> https://huggingface.co/Qwen/Qwen-VL/tree/main
+            // -> https://huggingface.co/Qwen/Qwen-VL/blob/0547ed36a86561e2e42fecec8fd0c4f6953e33c4/visual.py#L23
+            struct ggml_tensor * pos_embed = ggml_graph_get_tensor(gf, "pos_embed");
+            int embed_dim = 4096;
+            if (ctx->minicpmv_version == 2) {
+                embed_dim = 4096;
+            }
+            else if (ctx->minicpmv_version == 3) {
+                embed_dim = 3584;
+            }
+            auto pos_embed_t = get_2d_sincos_pos_embed(embed_dim, std::make_pair(pos_w, pos_h));
 
-            void* zero_mem = malloc(ggml_nbytes(embeddings));
-            memset(zero_mem, 0, ggml_nbytes(embeddings));
-            ggml_backend_tensor_set(embeddings, zero_mem, 0, ggml_nbytes(embeddings));
-            free(zero_mem);
+            float * pos_embed_data = (float *)malloc(ggml_nbytes(pos_embed));
+            for(int i=0;i<pos_w * pos_h;++i){
+                for(int j=0;j<embed_dim;++j){
+                    pos_embed_data[i*embed_dim+j]=pos_embed_t[i][j];
+                }
+            }
+
+            ggml_backend_tensor_set(pos_embed, pos_embed_data, 0, ggml_nbytes(pos_embed));
+            free(pos_embed_data);
         }
     }
+    else{
+        {
+            if (ctx->has_class_embedding) {
+                struct ggml_tensor * embeddings = ggml_graph_get_tensor(gf, "embeddings");
 
-    {
-        struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions");
+                void* zero_mem = malloc(ggml_nbytes(embeddings));
+                memset(zero_mem, 0, ggml_nbytes(embeddings));
+                ggml_backend_tensor_set(embeddings, zero_mem, 0, ggml_nbytes(embeddings));
+                free(zero_mem);
+            }
+        }
+
+        {
+            struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions");
 
-        int* positions_data = (int*)malloc(ggml_nbytes(positions));
-        for (int i = 0; i < num_positions; i++) {
-            positions_data[i] = i;
+            int* positions_data = (int*)malloc(ggml_nbytes(positions));
+            for (int i = 0; i < num_positions; i++) {
+                positions_data[i] = i;
+            }
+            ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions));
+            free(positions_data);
         }
-        ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions));
-        free(positions_data);
-    }
 
-    {
-        struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches");
-        int* patches_data = (int*)malloc(ggml_nbytes(patches));
-        for (int i = 0; i < num_patches; i++) {
-            patches_data[i] = i + 1;
+        {
+            struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches");
+            int* patches_data = (int*)malloc(ggml_nbytes(patches));
+            for (int i = 0; i < num_patches; i++) {
+                patches_data[i] = i + 1;
+            }
+            ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches));
+            free(patches_data);
         }
-        ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches));
-        free(patches_data);
     }
 
     if (ggml_backend_is_cpu(ctx->backend)) {
         ggml_backend_cpu_set_n_threads(ctx->backend, n_threads);
     }
 
-#ifdef GGML_USE_METAL
-    if (ggml_backend_is_metal(ctx->backend)) {
-        ggml_backend_metal_set_n_cb(ctx->backend, n_threads);
-    }
-#endif
-
     ggml_backend_graph_compute(ctx->backend, gf);
 
     // the last node is the embedding tensor
-    struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 1];
+    struct ggml_tensor * embeddings = ggml_graph_node(gf, -1);
 
     // copy the embeddings to the location passed by the user
     ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings));
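The last change in this function swaps direct cgraph field access for the ggml_graph_node() accessor, where a negative index counts from the end of the graph. A minimal sketch, assuming the ggml.h bundled with this package:

```cpp
// Sketch, assuming ggml.h from this package: ggml_graph_node() accepts
// negative indices counting from the end, so -1 is the last node.
#include "ggml.h"

static struct ggml_tensor * last_node(struct ggml_cgraph * gf) {
    return ggml_graph_node(gf, -1); // replaces gf->nodes[gf->n_nodes - 1]
}
```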
@@ -1999,7 +2519,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const int itype) {
             new_type = type;
             if (new_type >= GGML_TYPE_Q2_K && name.find("embd") != std::string::npos) {
                 new_type = GGML_TYPE_Q8_0; // ggml_get_rows needs non K type
-                //
+                // LOG_ERR("%s: quantizing %s to %s\n", __func__, name.c_str(), ggml_type_name(new_type));
             }
             const size_t n_elms = ggml_nelements(cur);
             float * f32_data;
@@ -2018,7 +2538,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const int itype) {
                     f32_data = (float *)conv_buf.data();
                     break;
                 default:
-                    LOG_TEE("Please use an input file in f32 or f16\n");
+                    LOG_ERR("Please use an input file in f32 or f16\n");
                     gguf_free(ctx_out);
                     return false;
             }
@@ -2045,7 +2565,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const int itype) {
                 fout.put(0);
             }
 
-            LOG_TEE("%s: n_dims = %d | quantize=%d | size = %f MB -> %f MB\n", name.c_str(), ggml_n_dims(cur), quantize,
+            LOG_INF("%s: n_dims = %d | quantize=%d | size = %f MB -> %f MB\n", name.c_str(), ggml_n_dims(cur), quantize,
                    orig_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
         }
 
@@ -2061,8 +2581,8 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const int itype) {
     gguf_free(ctx_out);
 
     {
-        LOG_TEE("%s: original size = %8.2f MB\n", __func__, total_size_org / 1024.0 / 1024.0);
-        LOG_TEE("%s: quantized size = %8.2f MB\n", __func__, total_size_new / 1024.0 / 1024.0);
+        LOG_INF("%s: original size = %8.2f MB\n", __func__, total_size_org / 1024.0 / 1024.0);
+        LOG_INF("%s: quantized size = %8.2f MB\n", __func__, total_size_new / 1024.0 / 1024.0);
     }
 
     return true;
@@ -2081,7 +2601,22 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
     if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) {
         return ctx->vision_model.mm_3_b->ne[0];
     }
+    if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) {
+        if (ctx->minicpmv_version == 2) {
+            return 4096;
+        }
+        else if (ctx->minicpmv_version == 3) {
+            return 3584;
+        }
+    }
 
     std::string proj_type = PROJECTOR_TYPE_NAMES[ctx->proj_type];
     throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str()));
 }
+
+int clip_is_minicpmv(const struct clip_ctx * ctx) {
+    if (ctx->has_minicpmv_projector) {
+        return ctx->minicpmv_version;
+    }
+    return 0;
+}
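clip_is_minicpmv() doubles as a feature test and a version query: 0 means the model is not MiniCPM-V, otherwise the version number is returned. A hypothetical caller-side sketch, assuming this and clip_uhd_num_image_embeds_col are exported via the package's clip.h:

```cpp
// Hypothetical caller-side sketch; assumes clip_is_minicpmv and
// clip_uhd_num_image_embeds_col are declared in this package's clip.h.
#include <cstdio>
#include "clip.h"

static void describe_projector(struct clip_ctx * ctx) {
    const int v = clip_is_minicpmv(ctx); // 0 = not MiniCPM-V, else version
    if (v == 0) {
        std::printf("standard LLaVA-style projector\n");
    } else {
        std::printf("MiniCPM-V v%d: %d image embeds per column\n",
                    v, clip_uhd_num_image_embeds_col(ctx));
    }
}
```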