@fugood/llama.node 0.3.1 → 0.3.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +1 -8
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/package.json +4 -2
- package/src/DetokenizeWorker.cpp +1 -1
- package/src/EmbeddingWorker.cpp +2 -2
- package/src/LlamaCompletionWorker.cpp +10 -10
- package/src/LlamaCompletionWorker.h +2 -2
- package/src/LlamaContext.cpp +14 -17
- package/src/TokenizeWorker.cpp +1 -1
- package/src/common.hpp +5 -4
- package/src/llama.cpp/.github/workflows/build.yml +137 -29
- package/src/llama.cpp/.github/workflows/close-issue.yml +5 -0
- package/src/llama.cpp/.github/workflows/docker.yml +46 -34
- package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +7 -0
- package/src/llama.cpp/.github/workflows/nix-ci.yml +7 -0
- package/src/llama.cpp/.github/workflows/python-check-requirements.yml +2 -4
- package/src/llama.cpp/.github/workflows/python-type-check.yml +3 -1
- package/src/llama.cpp/.github/workflows/server.yml +7 -0
- package/src/llama.cpp/CMakeLists.txt +26 -11
- package/src/llama.cpp/cmake/arm64-apple-clang.cmake +16 -0
- package/src/llama.cpp/common/CMakeLists.txt +10 -10
- package/src/llama.cpp/common/arg.cpp +2041 -0
- package/src/llama.cpp/common/arg.h +77 -0
- package/src/llama.cpp/common/common.cpp +523 -1861
- package/src/llama.cpp/common/common.h +234 -106
- package/src/llama.cpp/common/console.cpp +3 -0
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +1 -1
- package/src/llama.cpp/common/log.cpp +401 -0
- package/src/llama.cpp/common/log.h +66 -698
- package/src/llama.cpp/common/ngram-cache.cpp +39 -36
- package/src/llama.cpp/common/ngram-cache.h +19 -19
- package/src/llama.cpp/common/sampling.cpp +356 -350
- package/src/llama.cpp/common/sampling.h +62 -139
- package/src/llama.cpp/common/stb_image.h +5990 -6398
- package/src/llama.cpp/docs/build.md +72 -17
- package/src/llama.cpp/examples/CMakeLists.txt +1 -2
- package/src/llama.cpp/examples/batched/batched.cpp +49 -65
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +42 -53
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +55 -52
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +22 -22
- package/src/llama.cpp/examples/cvector-generator/pca.hpp +3 -13
- package/src/llama.cpp/examples/embedding/embedding.cpp +147 -91
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +37 -37
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +39 -38
- package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +14 -39
- package/src/llama.cpp/examples/{baby-llama → gen-docs}/CMakeLists.txt +2 -2
- package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +83 -0
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +58 -39
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +46 -39
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +75 -69
- package/src/llama.cpp/examples/infill/infill.cpp +131 -192
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +276 -178
- package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +40 -36
- package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
- package/src/llama.cpp/examples/llava/clip.cpp +686 -150
- package/src/llama.cpp/examples/llava/clip.h +11 -2
- package/src/llama.cpp/examples/llava/llava-cli.cpp +60 -71
- package/src/llama.cpp/examples/llava/llava.cpp +146 -26
- package/src/llama.cpp/examples/llava/llava.h +2 -3
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +323 -0
- package/src/llama.cpp/examples/llava/requirements.txt +1 -0
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +55 -56
- package/src/llama.cpp/examples/lookup/lookup-create.cpp +15 -13
- package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
- package/src/llama.cpp/examples/lookup/lookup-stats.cpp +34 -33
- package/src/llama.cpp/examples/lookup/lookup.cpp +60 -63
- package/src/llama.cpp/examples/main/main.cpp +216 -313
- package/src/llama.cpp/examples/parallel/parallel.cpp +58 -59
- package/src/llama.cpp/examples/passkey/passkey.cpp +53 -61
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +277 -311
- package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize/quantize.cpp +27 -9
- package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +12 -12
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +57 -52
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +27 -2
- package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +60 -46
- package/src/llama.cpp/examples/server/CMakeLists.txt +7 -18
- package/src/llama.cpp/examples/server/server.cpp +1347 -1531
- package/src/llama.cpp/examples/server/tests/requirements.txt +2 -1
- package/src/llama.cpp/examples/server/utils.hpp +396 -107
- package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/simple/simple.cpp +132 -106
- package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +197 -0
- package/src/llama.cpp/examples/speculative/speculative.cpp +153 -124
- package/src/llama.cpp/examples/sycl/run-llama2.sh +10 -19
- package/src/llama.cpp/examples/sycl/win-run-llama2.bat +1 -1
- package/src/llama.cpp/examples/tokenize/tokenize.cpp +27 -29
- package/src/llama.cpp/ggml/CMakeLists.txt +29 -12
- package/src/llama.cpp/ggml/include/ggml-alloc.h +3 -3
- package/src/llama.cpp/ggml/include/ggml-amx.h +25 -0
- package/src/llama.cpp/ggml/include/ggml-backend.h +166 -68
- package/src/llama.cpp/ggml/include/ggml-blas.h +5 -3
- package/src/llama.cpp/ggml/include/ggml-cann.h +17 -19
- package/src/llama.cpp/ggml/include/ggml-cpp.h +38 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +177 -0
- package/src/llama.cpp/ggml/include/ggml-cuda.h +17 -17
- package/src/llama.cpp/ggml/include/ggml-kompute.h +7 -3
- package/src/llama.cpp/ggml/include/ggml-metal.h +13 -12
- package/src/llama.cpp/ggml/include/ggml-opt.h +216 -0
- package/src/llama.cpp/ggml/include/ggml-rpc.h +9 -5
- package/src/llama.cpp/ggml/include/ggml-sycl.h +18 -11
- package/src/llama.cpp/ggml/include/ggml-vulkan.h +10 -8
- package/src/llama.cpp/ggml/include/ggml.h +272 -505
- package/src/llama.cpp/ggml/src/CMakeLists.txt +69 -1110
- package/src/llama.cpp/ggml/src/ggml-aarch64.c +52 -2116
- package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -20
- package/src/llama.cpp/ggml/src/ggml-alloc.c +29 -27
- package/src/llama.cpp/ggml/src/ggml-amx/CMakeLists.txt +107 -0
- package/src/llama.cpp/ggml/src/ggml-amx/common.h +94 -0
- package/src/llama.cpp/ggml/src/ggml-amx/ggml-amx.cpp +446 -0
- package/src/llama.cpp/ggml/src/ggml-amx/mmq.cpp +2510 -0
- package/src/llama.cpp/ggml/src/ggml-amx/mmq.h +17 -0
- package/src/llama.cpp/ggml/src/ggml-backend-impl.h +144 -81
- package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +195 -0
- package/src/llama.cpp/ggml/src/{ggml-backend.c → ggml-backend.cpp} +394 -635
- package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +91 -0
- package/src/llama.cpp/ggml/src/{ggml-blas.cpp → ggml-blas/ggml-blas.cpp} +217 -70
- package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +46 -0
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +4 -27
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +32 -4
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +179 -41
- package/src/llama.cpp/ggml/src/ggml-cann/common.h +1 -0
- package/src/llama.cpp/ggml/src/{ggml-cann.cpp → ggml-cann/ggml-cann.cpp} +458 -353
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -1
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +2 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +278 -0
- package/src/llama.cpp/ggml/src/ggml-common.h +20 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +261 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.c +3560 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +30 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +371 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +10822 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +13970 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +663 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1885 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +155 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +14 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +178 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +134 -0
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +106 -0
- package/src/llama.cpp/ggml/src/ggml-impl.h +380 -584
- package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +162 -0
- package/src/llama.cpp/ggml/src/{ggml-kompute.cpp → ggml-kompute/ggml-kompute.cpp} +233 -87
- package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +108 -0
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +249 -0
- package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +100 -0
- package/src/llama.cpp/ggml/src/ggml-opt.cpp +867 -0
- package/src/llama.cpp/ggml/src/ggml-quants.c +369 -9994
- package/src/llama.cpp/ggml/src/ggml-quants.h +78 -110
- package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +11 -0
- package/src/llama.cpp/ggml/src/{ggml-rpc.cpp → ggml-rpc/ggml-rpc.cpp} +560 -335
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +81 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +6 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +51 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +310 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +99 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +21 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +57 -57
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +106 -106
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +4 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +18 -25
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +1011 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +76 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +101 -0
- package/src/llama.cpp/ggml/src/{ggml-sycl.cpp → ggml-sycl/ggml-sycl.cpp} +3350 -3980
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +125 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +23 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +70 -68
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +9 -6
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +56 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +8 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +71 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +21 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +4 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +138 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +10 -0
- package/src/llama.cpp/ggml/src/ggml-threading.cpp +12 -0
- package/src/llama.cpp/ggml/src/ggml-threading.h +12 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +78 -0
- package/src/llama.cpp/ggml/src/{ggml-vulkan.cpp → ggml-vulkan/ggml-vulkan.cpp} +2034 -1718
- package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/CMakeLists.txt +2 -0
- package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/vulkan-shaders-gen.cpp +152 -185
- package/src/llama.cpp/ggml/src/ggml.c +2075 -16579
- package/src/llama.cpp/include/llama.h +296 -285
- package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.out +46 -0
- package/src/llama.cpp/pocs/vdot/q8dot.cpp +4 -3
- package/src/llama.cpp/pocs/vdot/vdot.cpp +8 -7
- package/src/llama.cpp/requirements/requirements-convert_legacy_llama.txt +1 -1
- package/src/llama.cpp/src/CMakeLists.txt +2 -1
- package/src/llama.cpp/src/llama-grammar.cpp +721 -122
- package/src/llama.cpp/src/llama-grammar.h +120 -15
- package/src/llama.cpp/src/llama-impl.h +156 -1
- package/src/llama.cpp/src/llama-sampling.cpp +2058 -346
- package/src/llama.cpp/src/llama-sampling.h +39 -47
- package/src/llama.cpp/src/llama-vocab.cpp +390 -127
- package/src/llama.cpp/src/llama-vocab.h +60 -20
- package/src/llama.cpp/src/llama.cpp +6215 -3263
- package/src/llama.cpp/src/unicode-data.cpp +6 -4
- package/src/llama.cpp/src/unicode-data.h +4 -4
- package/src/llama.cpp/src/unicode.cpp +15 -7
- package/src/llama.cpp/tests/CMakeLists.txt +4 -2
- package/src/llama.cpp/tests/test-arg-parser.cpp +131 -0
- package/src/llama.cpp/tests/test-backend-ops.cpp +1725 -297
- package/src/llama.cpp/tests/test-barrier.cpp +94 -0
- package/src/llama.cpp/tests/test-chat-template.cpp +9 -5
- package/src/llama.cpp/tests/test-grammar-integration.cpp +23 -38
- package/src/llama.cpp/tests/test-grammar-parser.cpp +6 -4
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +23 -8
- package/src/llama.cpp/tests/test-llama-grammar.cpp +9 -8
- package/src/llama.cpp/tests/test-log.cpp +39 -0
- package/src/llama.cpp/tests/test-opt.cpp +853 -142
- package/src/llama.cpp/tests/test-quantize-fns.cpp +28 -19
- package/src/llama.cpp/tests/test-quantize-perf.cpp +16 -14
- package/src/llama.cpp/tests/test-rope.cpp +2 -1
- package/src/llama.cpp/tests/test-sampling.cpp +226 -142
- package/src/llama.cpp/tests/test-tokenizer-0.cpp +56 -36
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +5 -5
- package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +5 -5
- package/patches/llama.patch +0 -22
- package/src/llama.cpp/.github/workflows/bench.yml +0 -310
- package/src/llama.cpp/common/grammar-parser.cpp +0 -536
- package/src/llama.cpp/common/grammar-parser.h +0 -29
- package/src/llama.cpp/common/train.cpp +0 -1513
- package/src/llama.cpp/common/train.h +0 -233
- package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +0 -1640
- package/src/llama.cpp/examples/benchmark/CMakeLists.txt +0 -6
- package/src/llama.cpp/examples/benchmark/benchmark-matmult.cpp +0 -275
- package/src/llama.cpp/ggml/src/llamafile/sgemm.cpp +0 -1027
- package/src/llama.cpp/tests/test-grad0.cpp +0 -1566
- /package/src/llama.cpp/ggml/{cmake → src/ggml-cpu/cmake}/FindSIMD.cmake +0 -0
- /package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.h +0 -0
|
@@ -3,8 +3,8 @@
|
|
|
3
3
|
// I'll gradually clean and extend it
|
|
4
4
|
// Note: Even when using identical normalized image inputs (see normalize_image_u8_to_f32()) we have a significant difference in resulting embeddings compared to pytorch
|
|
5
5
|
#include "clip.h"
|
|
6
|
-
#include "log.h"
|
|
7
6
|
#include "ggml.h"
|
|
7
|
+
#include "ggml-cpu.h"
|
|
8
8
|
#include "ggml-alloc.h"
|
|
9
9
|
#include "ggml-backend.h"
|
|
10
10
|
|
|
@@ -20,6 +20,10 @@
|
|
|
20
20
|
#include "ggml-cann.h"
|
|
21
21
|
#endif
|
|
22
22
|
|
|
23
|
+
#ifdef GGML_USE_VULKAN
|
|
24
|
+
#include "ggml-vulkan.h"
|
|
25
|
+
#endif
|
|
26
|
+
|
|
23
27
|
#define STB_IMAGE_IMPLEMENTATION
|
|
24
28
|
#include "stb_image.h"
|
|
25
29
|
|
|
@@ -36,6 +40,11 @@
|
|
|
36
40
|
#include <cinttypes>
|
|
37
41
|
#include <limits>
|
|
38
42
|
|
|
43
|
+
#define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
|
|
44
|
+
#define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
|
|
45
|
+
#define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
|
|
46
|
+
#define LOG_DBG(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
|
|
47
|
+
|
|
39
48
|
//#define CLIP_DEBUG_FUNCTIONS
|
|
40
49
|
|
|
41
50
|
// RGB uint8 image
|
|
@@ -74,26 +83,28 @@ static std::string format(const char * fmt, ...) {
|
|
|
74
83
|
// key constants
|
|
75
84
|
//
|
|
76
85
|
|
|
77
|
-
#define KEY_FTYPE
|
|
78
|
-
#define KEY_NAME
|
|
79
|
-
#define KEY_DESCRIPTION
|
|
80
|
-
#define KEY_HAS_TEXT_ENC
|
|
81
|
-
#define KEY_HAS_VIS_ENC
|
|
82
|
-
#define KEY_HAS_LLAVA_PROJ
|
|
83
|
-
#define
|
|
84
|
-
#define
|
|
85
|
-
#define
|
|
86
|
-
#define
|
|
87
|
-
#define
|
|
88
|
-
#define
|
|
89
|
-
#define
|
|
90
|
-
#define
|
|
91
|
-
#define
|
|
92
|
-
#define
|
|
93
|
-
#define
|
|
94
|
-
#define
|
|
95
|
-
#define
|
|
96
|
-
#define
|
|
86
|
+
#define KEY_FTYPE "general.file_type"
|
|
87
|
+
#define KEY_NAME "general.name"
|
|
88
|
+
#define KEY_DESCRIPTION "general.description"
|
|
89
|
+
#define KEY_HAS_TEXT_ENC "clip.has_text_encoder"
|
|
90
|
+
#define KEY_HAS_VIS_ENC "clip.has_vision_encoder"
|
|
91
|
+
#define KEY_HAS_LLAVA_PROJ "clip.has_llava_projector"
|
|
92
|
+
#define KEY_HAS_MINICPMV_PROJ "clip.has_minicpmv_projector"
|
|
93
|
+
#define KEY_MINICPMV_VERSION "clip.minicpmv_version"
|
|
94
|
+
#define KEY_USE_GELU "clip.use_gelu"
|
|
95
|
+
#define KEY_N_EMBD "clip.%s.embedding_length"
|
|
96
|
+
#define KEY_N_FF "clip.%s.feed_forward_length"
|
|
97
|
+
#define KEY_N_BLOCK "clip.%s.block_count"
|
|
98
|
+
#define KEY_N_HEAD "clip.%s.attention.head_count"
|
|
99
|
+
#define KEY_LAYER_NORM_EPS "clip.%s.attention.layer_norm_epsilon"
|
|
100
|
+
#define KEY_PROJ_DIM "clip.%s.projection_dim"
|
|
101
|
+
#define KEY_TOKENS "tokenizer.ggml.tokens"
|
|
102
|
+
#define KEY_N_POSITIONS "clip.text.context_length"
|
|
103
|
+
#define KEY_IMAGE_SIZE "clip.vision.image_size"
|
|
104
|
+
#define KEY_PATCH_SIZE "clip.vision.patch_size"
|
|
105
|
+
#define KEY_IMAGE_MEAN "clip.vision.image_mean"
|
|
106
|
+
#define KEY_IMAGE_STD "clip.vision.image_std"
|
|
107
|
+
#define KEY_PROJ_TYPE "clip.projector_type"
|
|
97
108
|
|
|
98
109
|
#define KEY_MM_PATCH_MERGE_TYPE "clip.vision.mm_patch_merge_type"
|
|
99
110
|
#define KEY_IMAGE_GRID_PINPOINTS "clip.vision.image_grid_pinpoints"
|
|
@@ -127,12 +138,20 @@ static std::string format(const char * fmt, ...) {
|
|
|
127
138
|
#define TN_MVLM_PROJ_PEG "mm.model.peg.%d.%s"
|
|
128
139
|
#define TN_IMAGE_NEWLINE "model.image_newline"
|
|
129
140
|
|
|
141
|
+
#define TN_MINICPMV_POS_EMBD_K "resampler.pos_embed_k"
|
|
142
|
+
#define TN_MINICPMV_QUERY "resampler.query"
|
|
143
|
+
#define TN_MINICPMV_PROJ "resampler.proj.weight"
|
|
144
|
+
#define TN_MINICPMV_KV_PROJ "resampler.kv.weight"
|
|
145
|
+
#define TN_MINICPMV_ATTN "resampler.attn.%s.%s"
|
|
146
|
+
#define TN_MINICPMV_LN "resampler.ln_%s.%s"
|
|
147
|
+
|
|
130
148
|
|
|
131
149
|
enum projector_type {
|
|
132
150
|
PROJECTOR_TYPE_MLP,
|
|
133
151
|
PROJECTOR_TYPE_MLP_NORM,
|
|
134
152
|
PROJECTOR_TYPE_LDP,
|
|
135
153
|
PROJECTOR_TYPE_LDPV2,
|
|
154
|
+
PROJECTOR_TYPE_RESAMPLER,
|
|
136
155
|
PROJECTOR_TYPE_UNKNOWN,
|
|
137
156
|
};
|
|
138
157
|
|
|
@@ -140,6 +159,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
|
|
|
140
159
|
{ PROJECTOR_TYPE_MLP, "mlp" },
|
|
141
160
|
{ PROJECTOR_TYPE_LDP, "ldp" },
|
|
142
161
|
{ PROJECTOR_TYPE_LDPV2, "ldpv2"},
|
|
162
|
+
{ PROJECTOR_TYPE_RESAMPLER, "resampler"},
|
|
143
163
|
};
|
|
144
164
|
|
|
145
165
|
|
|
@@ -150,7 +170,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
|
|
|
150
170
|
static int get_key_idx(const gguf_context * ctx, const char * key) {
|
|
151
171
|
int i = gguf_find_key(ctx, key);
|
|
152
172
|
if (i == -1) {
|
|
153
|
-
|
|
173
|
+
LOG_ERR("key %s not found in file\n", key);
|
|
154
174
|
throw std::runtime_error(format("Missing required key: %s", key));
|
|
155
175
|
}
|
|
156
176
|
|
|
@@ -200,17 +220,20 @@ static std::string gguf_data_to_str(enum gguf_type type, const void * data, int
|
|
|
200
220
|
}
|
|
201
221
|
|
|
202
222
|
static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
auto new_pos = s.find(search, pos);
|
|
206
|
-
if (new_pos == std::string::npos) {
|
|
207
|
-
result += s.substr(pos, s.size() - pos);
|
|
208
|
-
break;
|
|
209
|
-
}
|
|
210
|
-
result += s.substr(pos, new_pos - pos) + replace;
|
|
211
|
-
pos = new_pos;
|
|
223
|
+
if (search.empty()) {
|
|
224
|
+
return;
|
|
212
225
|
}
|
|
213
|
-
|
|
226
|
+
std::string builder;
|
|
227
|
+
builder.reserve(s.length());
|
|
228
|
+
size_t pos = 0;
|
|
229
|
+
size_t last_pos = 0;
|
|
230
|
+
while ((pos = s.find(search, last_pos)) != std::string::npos) {
|
|
231
|
+
builder.append(s, last_pos, pos - last_pos);
|
|
232
|
+
builder.append(replace);
|
|
233
|
+
last_pos = pos + search.length();
|
|
234
|
+
}
|
|
235
|
+
builder.append(s, last_pos, std::string::npos);
|
|
236
|
+
s = std::move(builder);
|
|
214
237
|
}
|
|
215
238
|
|
|
216
239
|
static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) {
|
|
@@ -252,7 +275,7 @@ static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) {
|
|
|
252
275
|
|
|
253
276
|
static void print_tensor_info(const ggml_tensor * tensor, const char * prefix = "") {
|
|
254
277
|
size_t tensor_size = ggml_nbytes(tensor);
|
|
255
|
-
|
|
278
|
+
LOG_INF("%s: n_dims = %d, name = %s, tensor_size=%zu, shape:[%" PRId64 ", %" PRId64 ", %" PRId64 ", %" PRId64 "], type = %s\n",
|
|
256
279
|
prefix, ggml_n_dims(tensor), tensor->name, tensor_size,
|
|
257
280
|
tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], ggml_type_name(tensor->type));
|
|
258
281
|
}
|
|
@@ -270,7 +293,7 @@ static projector_type clip_projector_type_from_string(const std::string & name)
|
|
|
270
293
|
static void clip_image_write_image_to_ppm(const clip_image_u8& img, const std::string& filename) {
|
|
271
294
|
std::ofstream file(filename, std::ios::binary);
|
|
272
295
|
if (!file.is_open()) {
|
|
273
|
-
|
|
296
|
+
LOG_ERR("Failed to open file for writing: %s\n", filename.c_str());
|
|
274
297
|
return;
|
|
275
298
|
}
|
|
276
299
|
|
|
@@ -289,7 +312,7 @@ static void clip_image_write_image_to_ppm(const clip_image_u8& img, const std::s
|
|
|
289
312
|
static void clip_image_save_to_bmp(const clip_image_u8& img, const std::string& filename) {
|
|
290
313
|
std::ofstream file(filename, std::ios::binary);
|
|
291
314
|
if (!file.is_open()) {
|
|
292
|
-
|
|
315
|
+
LOG_ERR("Failed to open file for writing: %s\n", filename.c_str());
|
|
293
316
|
return;
|
|
294
317
|
}
|
|
295
318
|
|
|
@@ -492,12 +515,34 @@ struct clip_vision_model {
|
|
|
492
515
|
struct ggml_tensor * mm_model_mlp_2_b;
|
|
493
516
|
struct ggml_tensor * mm_model_peg_0_w;
|
|
494
517
|
struct ggml_tensor * mm_model_peg_0_b;
|
|
518
|
+
|
|
519
|
+
// MINICPMV projection
|
|
520
|
+
struct ggml_tensor * mm_model_pos_embed_k;
|
|
521
|
+
struct ggml_tensor * mm_model_query;
|
|
522
|
+
struct ggml_tensor * mm_model_proj;
|
|
523
|
+
struct ggml_tensor * mm_model_kv_proj;
|
|
524
|
+
struct ggml_tensor * mm_model_attn_q_w;
|
|
525
|
+
struct ggml_tensor * mm_model_attn_q_b;
|
|
526
|
+
struct ggml_tensor * mm_model_attn_k_w;
|
|
527
|
+
struct ggml_tensor * mm_model_attn_k_b;
|
|
528
|
+
struct ggml_tensor * mm_model_attn_v_w;
|
|
529
|
+
struct ggml_tensor * mm_model_attn_v_b;
|
|
530
|
+
struct ggml_tensor * mm_model_attn_o_w;
|
|
531
|
+
struct ggml_tensor * mm_model_attn_o_b;
|
|
532
|
+
struct ggml_tensor * mm_model_ln_q_w;
|
|
533
|
+
struct ggml_tensor * mm_model_ln_q_b;
|
|
534
|
+
struct ggml_tensor * mm_model_ln_kv_w;
|
|
535
|
+
struct ggml_tensor * mm_model_ln_kv_b;
|
|
536
|
+
struct ggml_tensor * mm_model_ln_post_w;
|
|
537
|
+
struct ggml_tensor * mm_model_ln_post_b;
|
|
495
538
|
};
|
|
496
539
|
|
|
497
540
|
struct clip_ctx {
|
|
498
541
|
bool has_text_encoder = false;
|
|
499
542
|
bool has_vision_encoder = false;
|
|
500
543
|
bool has_llava_projector = false;
|
|
544
|
+
bool has_minicpmv_projector = false;
|
|
545
|
+
int minicpmv_version = 2;
|
|
501
546
|
|
|
502
547
|
struct clip_vision_model vision_model;
|
|
503
548
|
projector_type proj_type = PROJECTOR_TYPE_MLP;
|
|
@@ -522,31 +567,46 @@ struct clip_ctx {
|
|
|
522
567
|
|
|
523
568
|
ggml_backend_t backend = NULL;
|
|
524
569
|
ggml_gallocr_t compute_alloc = NULL;
|
|
570
|
+
|
|
571
|
+
struct clip_image_size * load_image_size;
|
|
525
572
|
};
|
|
526
573
|
|
|
527
|
-
static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs) {
|
|
574
|
+
static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs, struct clip_image_size * load_image_size, bool is_inf = false) {
|
|
528
575
|
if (!ctx->has_vision_encoder) {
|
|
529
|
-
|
|
576
|
+
LOG_ERR("This gguf file seems to have no vision encoder\n");
|
|
530
577
|
return nullptr;
|
|
531
578
|
}
|
|
532
579
|
|
|
533
580
|
const auto & model = ctx->vision_model;
|
|
534
581
|
const auto & hparams = model.hparams;
|
|
535
582
|
|
|
536
|
-
const int image_size
|
|
583
|
+
const int image_size = hparams.image_size;
|
|
584
|
+
int image_size_width = image_size;
|
|
585
|
+
int image_size_height = image_size;
|
|
586
|
+
if (ctx->has_minicpmv_projector) {
|
|
587
|
+
if (load_image_size == nullptr) {
|
|
588
|
+
load_image_size = clip_image_size_init();
|
|
589
|
+
}
|
|
590
|
+
LOG_DBG("%s: %d %d\n", __func__, load_image_size->width, load_image_size->height);
|
|
591
|
+
image_size_width = load_image_size->width;
|
|
592
|
+
image_size_height = load_image_size->height;
|
|
593
|
+
if (is_inf) {
|
|
594
|
+
image_size_width = imgs->data->nx;
|
|
595
|
+
image_size_height = imgs->data->ny;
|
|
596
|
+
}
|
|
597
|
+
}
|
|
537
598
|
const int patch_size = hparams.patch_size;
|
|
538
|
-
const int num_patches = ((
|
|
539
|
-
const int num_patches_per_side = image_size / patch_size; GGML_UNUSED(num_patches_per_side);
|
|
599
|
+
const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
|
|
540
600
|
const int num_positions = num_patches + (ctx->has_class_embedding ? 1 : 0);
|
|
541
601
|
const int hidden_size = hparams.hidden_size;
|
|
542
602
|
const int n_head = hparams.n_head;
|
|
543
603
|
const int d_head = hidden_size / n_head;
|
|
544
|
-
|
|
604
|
+
int n_layer = hparams.n_layer;
|
|
545
605
|
const float eps = hparams.eps;
|
|
546
606
|
|
|
547
607
|
const int batch_size = imgs->size;
|
|
548
608
|
|
|
549
|
-
if (ctx->has_llava_projector) {
|
|
609
|
+
if (ctx->has_llava_projector || ctx->has_minicpmv_projector) {
|
|
550
610
|
GGML_ASSERT(batch_size == 1);
|
|
551
611
|
}
|
|
552
612
|
|
|
@@ -559,7 +619,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
|
|
559
619
|
struct ggml_context * ctx0 = ggml_init(params);
|
|
560
620
|
struct ggml_cgraph * gf = ggml_new_graph(ctx0);
|
|
561
621
|
|
|
562
|
-
struct ggml_tensor * inp_raw = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32,
|
|
622
|
+
struct ggml_tensor * inp_raw = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, image_size_width, image_size_height, 3, batch_size);
|
|
563
623
|
ggml_set_name(inp_raw, "inp_raw");
|
|
564
624
|
ggml_set_input(inp_raw);
|
|
565
625
|
|
|
@@ -572,19 +632,21 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
|
|
572
632
|
// inp = ggml_add(ctx0, inp, ggml_repeat(ctx0, model.patch_bias, inp));
|
|
573
633
|
inp = ggml_add(ctx0, inp, model.patch_bias);
|
|
574
634
|
}
|
|
575
|
-
|
|
576
|
-
// concat class_embeddings and patch_embeddings
|
|
577
635
|
struct ggml_tensor * embeddings = inp;
|
|
578
|
-
|
|
579
|
-
embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size);
|
|
580
|
-
ggml_set_name(embeddings, "embeddings");
|
|
581
|
-
ggml_set_input(embeddings);
|
|
582
|
-
embeddings = ggml_acc(ctx0, embeddings, model.class_embedding,
|
|
583
|
-
embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0);
|
|
584
|
-
embeddings = ggml_acc(ctx0, embeddings, inp,
|
|
585
|
-
embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);
|
|
586
|
-
}
|
|
636
|
+
struct ggml_tensor * pos_embed = nullptr;
|
|
587
637
|
|
|
638
|
+
if (ctx->has_llava_projector) {
|
|
639
|
+
// concat class_embeddings and patch_embeddings
|
|
640
|
+
if (ctx->has_class_embedding) {
|
|
641
|
+
embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size);
|
|
642
|
+
ggml_set_name(embeddings, "embeddings");
|
|
643
|
+
ggml_set_input(embeddings);
|
|
644
|
+
embeddings = ggml_acc(ctx0, embeddings, model.class_embedding,
|
|
645
|
+
embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0);
|
|
646
|
+
embeddings = ggml_acc(ctx0, embeddings, inp,
|
|
647
|
+
embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);
|
|
648
|
+
}
|
|
649
|
+
}
|
|
588
650
|
|
|
589
651
|
struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions);
|
|
590
652
|
ggml_set_name(positions, "positions");
|
|
@@ -593,6 +655,19 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
|
|
593
655
|
embeddings =
|
|
594
656
|
ggml_add(ctx0, embeddings, ggml_get_rows(ctx0, model.position_embeddings, positions));
|
|
595
657
|
|
|
658
|
+
if (ctx->has_minicpmv_projector) {
|
|
659
|
+
int pos_w = image_size_width/patch_size;
|
|
660
|
+
int pos_h = image_size_height/patch_size;
|
|
661
|
+
if (ctx->minicpmv_version == 2) {
|
|
662
|
+
pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 4096, pos_w * pos_h, 1);
|
|
663
|
+
}
|
|
664
|
+
else if (ctx->minicpmv_version == 3) {
|
|
665
|
+
pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 3584, pos_w * pos_h, 1);
|
|
666
|
+
}
|
|
667
|
+
ggml_set_name(pos_embed, "pos_embed");
|
|
668
|
+
ggml_set_input(pos_embed);
|
|
669
|
+
}
|
|
670
|
+
|
|
596
671
|
// pre-layernorm
|
|
597
672
|
if (ctx->has_pre_norm) {
|
|
598
673
|
embeddings = ggml_norm(ctx0, embeddings, eps);
|
|
@@ -602,6 +677,9 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
|
|
602
677
|
}
|
|
603
678
|
|
|
604
679
|
// loop over layers
|
|
680
|
+
if (ctx->has_minicpmv_projector) {
|
|
681
|
+
n_layer += 1;
|
|
682
|
+
}
|
|
605
683
|
for (int il = 0; il < n_layer - 1; il++) {
|
|
606
684
|
struct ggml_tensor * cur = embeddings; // embeddings = residual, cur = hidden_states
|
|
607
685
|
|
|
@@ -691,7 +769,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
|
|
691
769
|
}
|
|
692
770
|
|
|
693
771
|
// llava projector
|
|
694
|
-
{
|
|
772
|
+
if (ctx->has_llava_projector) {
|
|
695
773
|
embeddings = ggml_reshape_2d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1]);
|
|
696
774
|
|
|
697
775
|
struct ggml_tensor * patches = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_patches);
|
|
@@ -712,8 +790,8 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
|
|
712
790
|
embeddings = ggml_gelu(ctx0, embeddings);
|
|
713
791
|
embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings);
|
|
714
792
|
embeddings = ggml_add(ctx0, embeddings, model.mm_2_b);
|
|
715
|
-
|
|
716
|
-
|
|
793
|
+
}
|
|
794
|
+
else if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) {
|
|
717
795
|
embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
|
|
718
796
|
embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
|
|
719
797
|
// ggml_tensor_printf(embeddings, "mm_0_w",0,true,false);
|
|
@@ -872,6 +950,75 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
|
|
872
950
|
GGML_ABORT("fatal error");
|
|
873
951
|
}
|
|
874
952
|
}
|
|
953
|
+
// minicpmv projector
|
|
954
|
+
else if (ctx->has_minicpmv_projector)
|
|
955
|
+
{
|
|
956
|
+
if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) {
|
|
957
|
+
struct ggml_tensor * q = model.mm_model_query;
|
|
958
|
+
{ // layernorm
|
|
959
|
+
q = ggml_norm(ctx0, q, eps);
|
|
960
|
+
q = ggml_add(ctx0, ggml_mul(ctx0, q, model.mm_model_ln_q_w), model.mm_model_ln_q_b);
|
|
961
|
+
}
|
|
962
|
+
struct ggml_tensor * v = ggml_mul_mat(ctx0, model.mm_model_kv_proj, embeddings);
|
|
963
|
+
{ // layernorm
|
|
964
|
+
v = ggml_norm(ctx0, v, eps);
|
|
965
|
+
v = ggml_add(ctx0, ggml_mul(ctx0, v, model.mm_model_ln_kv_w), model.mm_model_ln_kv_b);
|
|
966
|
+
}
|
|
967
|
+
struct ggml_tensor * k;
|
|
968
|
+
{ // position
|
|
969
|
+
// q = ggml_add(ctx0, q, model.mm_model_pos_embed);
|
|
970
|
+
k = ggml_add(ctx0, v, pos_embed);
|
|
971
|
+
}
|
|
972
|
+
|
|
973
|
+
{ // attention
|
|
974
|
+
int hidden_size = 4096;
|
|
975
|
+
const int d_head = 128;
|
|
976
|
+
int n_head = hidden_size/d_head;
|
|
977
|
+
int num_query = 96;
|
|
978
|
+
if (ctx->minicpmv_version == 2) {
|
|
979
|
+
hidden_size = 4096;
|
|
980
|
+
n_head = hidden_size/d_head;
|
|
981
|
+
num_query = 96;
|
|
982
|
+
}
|
|
983
|
+
else if (ctx->minicpmv_version == 3) {
|
|
984
|
+
hidden_size = 3584;
|
|
985
|
+
n_head = hidden_size/d_head;
|
|
986
|
+
num_query = 64;
|
|
987
|
+
}
|
|
988
|
+
|
|
989
|
+
struct ggml_tensor * Q = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_q_w, q), model.mm_model_attn_q_b);
|
|
990
|
+
Q = ggml_scale_inplace(ctx0, Q, 1.0f / sqrt((float)d_head));
|
|
991
|
+
struct ggml_tensor * K = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_k_w, k), model.mm_model_attn_k_b);
|
|
992
|
+
struct ggml_tensor * V = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_v_w, v), model.mm_model_attn_v_b);
|
|
993
|
+
// permute
|
|
994
|
+
Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, num_query, batch_size);
|
|
995
|
+
Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3));
|
|
996
|
+
Q = ggml_reshape_3d(ctx0, Q, d_head, num_query, n_head * batch_size);
|
|
997
|
+
K = ggml_reshape_4d(ctx0, K, d_head, n_head, num_positions, batch_size);
|
|
998
|
+
K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3));
|
|
999
|
+
K = ggml_reshape_3d(ctx0, K, d_head, num_positions, n_head * batch_size);
|
|
1000
|
+
V = ggml_reshape_4d(ctx0, V, d_head, n_head, num_positions, batch_size);
|
|
1001
|
+
V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3));
|
|
1002
|
+
V = ggml_reshape_3d(ctx0, V, num_positions, d_head, n_head * batch_size);
|
|
1003
|
+
struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
|
|
1004
|
+
KQ = ggml_soft_max_inplace(ctx0, KQ);
|
|
1005
|
+
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ);
|
|
1006
|
+
KQV = ggml_reshape_4d(ctx0, KQV, d_head, num_query, n_head, batch_size);
|
|
1007
|
+
KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
|
|
1008
|
+
KQV = ggml_cont_3d(ctx0, KQV, hidden_size, num_query, batch_size);
|
|
1009
|
+
|
|
1010
|
+
embeddings = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_o_w, KQV), model.mm_model_attn_o_b);
|
|
1011
|
+
}
|
|
1012
|
+
{ // layernorm
|
|
1013
|
+
embeddings = ggml_norm(ctx0, embeddings, eps);
|
|
1014
|
+
embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_model_ln_post_w), model.mm_model_ln_post_b);
|
|
1015
|
+
}
|
|
1016
|
+
embeddings = ggml_mul_mat(ctx0, model.mm_model_proj, embeddings);
|
|
1017
|
+
}
|
|
1018
|
+
else {
|
|
1019
|
+
GGML_ASSERT(false);
|
|
1020
|
+
}
|
|
1021
|
+
}
|
|
875
1022
|
|
|
876
1023
|
// build the graph
|
|
877
1024
|
ggml_build_forward_expand(gf, embeddings);
|
|
@@ -905,21 +1052,21 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
|
|
905
1052
|
const int idx_name = gguf_find_key(ctx, KEY_NAME);
|
|
906
1053
|
if (idx_name != -1) { // make name optional temporarily as some of the uploaded models missing it due to a bug
|
|
907
1054
|
const std::string name = gguf_get_val_str(ctx, idx_name);
|
|
908
|
-
|
|
1055
|
+
LOG_INF("%s: model name: %s\n", __func__, name.c_str());
|
|
909
1056
|
}
|
|
910
|
-
|
|
911
|
-
|
|
912
|
-
|
|
913
|
-
|
|
914
|
-
|
|
915
|
-
|
|
916
|
-
|
|
1057
|
+
LOG_INF("%s: description: %s\n", __func__, description.c_str());
|
|
1058
|
+
LOG_INF("%s: GGUF version: %d\n", __func__, gguf_get_version(ctx));
|
|
1059
|
+
LOG_INF("%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx));
|
|
1060
|
+
LOG_INF("%s: n_tensors: %d\n", __func__, n_tensors);
|
|
1061
|
+
LOG_INF("%s: n_kv: %d\n", __func__, n_kv);
|
|
1062
|
+
LOG_INF("%s: ftype: %s\n", __func__, ftype_str.c_str());
|
|
1063
|
+
LOG_INF("\n");
|
|
917
1064
|
}
|
|
918
1065
|
const int n_tensors = gguf_get_n_tensors(ctx);
|
|
919
1066
|
|
|
920
1067
|
// kv
|
|
921
1068
|
const int n_kv = gguf_get_n_kv(ctx);
|
|
922
|
-
|
|
1069
|
+
LOG_INF("%s: loaded meta data with %d key-value pairs and %d tensors from %s\n",
|
|
923
1070
|
__func__, n_kv, n_tensors, fname);
|
|
924
1071
|
{
|
|
925
1072
|
std::map<enum ggml_type, uint32_t> n_type;
|
|
@@ -930,7 +1077,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
|
|
930
1077
|
n_type[type]++;
|
|
931
1078
|
}
|
|
932
1079
|
|
|
933
|
-
|
|
1080
|
+
LOG_INF("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__);
|
|
934
1081
|
for (int i = 0; i < n_kv; i++) {
|
|
935
1082
|
const char * name = gguf_get_key(ctx, i);
|
|
936
1083
|
const enum gguf_type type = gguf_get_kv_type(ctx, i);
|
|
@@ -946,7 +1093,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
|
|
946
1093
|
}
|
|
947
1094
|
replace_all(value, "\n", "\\n");
|
|
948
1095
|
|
|
949
|
-
|
|
1096
|
+
LOG_INF("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str());
|
|
950
1097
|
}
|
|
951
1098
|
|
|
952
1099
|
// print type counts
|
|
@@ -955,7 +1102,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
|
|
955
1102
|
continue;
|
|
956
1103
|
}
|
|
957
1104
|
|
|
958
|
-
|
|
1105
|
+
LOG_INF("%s: - type %4s: %4d tensors\n", __func__, ggml_type_name(kv.first), kv.second);
|
|
959
1106
|
}
|
|
960
1107
|
}
|
|
961
1108
|
|
|
@@ -970,13 +1117,13 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
|
|
970
1117
|
size_t tensor_size = ggml_nbytes(cur);
|
|
971
1118
|
model_size += tensor_size;
|
|
972
1119
|
if (verbosity >= 3) {
|
|
973
|
-
|
|
1120
|
+
LOG_INF("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu, shape:[%" PRIu64 ", %" PRIu64 ", %" PRIu64 ", %" PRIu64 "], type = %s\n",
|
|
974
1121
|
__func__, i, ggml_n_dims(cur), cur->name, tensor_size, offset, cur->ne[0], cur->ne[1], cur->ne[2], cur->ne[3], ggml_type_name(type));
|
|
975
1122
|
}
|
|
976
1123
|
}
|
|
977
1124
|
}
|
|
978
1125
|
|
|
979
|
-
clip_ctx * new_clip = new clip_ctx;
|
|
1126
|
+
clip_ctx * new_clip = new clip_ctx{};
|
|
980
1127
|
|
|
981
1128
|
// update projector type
|
|
982
1129
|
{
|
|
@@ -997,23 +1144,27 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
|
|
997
1144
|
|
|
998
1145
|
#ifdef GGML_USE_CUDA
|
|
999
1146
|
new_clip->backend = ggml_backend_cuda_init(0);
|
|
1000
|
-
|
|
1147
|
+
LOG_INF("%s: CLIP using CUDA backend\n", __func__);
|
|
1001
1148
|
#endif
|
|
1002
1149
|
|
|
1003
1150
|
#ifdef GGML_USE_METAL
|
|
1004
1151
|
new_clip->backend = ggml_backend_metal_init();
|
|
1005
|
-
|
|
1152
|
+
LOG_INF("%s: CLIP using Metal backend\n", __func__);
|
|
1006
1153
|
#endif
|
|
1007
1154
|
|
|
1008
1155
|
#ifdef GGML_USE_CANN
|
|
1009
1156
|
new_clip->backend = ggml_backend_cann_init(0);
|
|
1010
|
-
|
|
1157
|
+
LOG_INF("%s: CLIP using CANN backend\n", __func__);
|
|
1011
1158
|
#endif
|
|
1012
1159
|
|
|
1160
|
+
#ifdef GGML_USE_VULKAN
|
|
1161
|
+
new_clip->backend = ggml_backend_vk_init(0);
|
|
1162
|
+
LOG_INF("%s: CLIP using Vulkan backend\n", __func__);
|
|
1163
|
+
#endif
|
|
1013
1164
|
|
|
1014
1165
|
if (!new_clip->backend) {
|
|
1015
1166
|
new_clip->backend = ggml_backend_cpu_init();
|
|
1016
|
-
|
|
1167
|
+
LOG_INF("%s: CLIP using CPU backend\n", __func__);
|
|
1017
1168
|
}
|
|
1018
1169
|
|
|
1019
1170
|
// model size and capabilities
|
|
@@ -1029,7 +1180,18 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
|
|
1029
1180
|
new_clip->has_llava_projector = gguf_get_val_bool(ctx, idx);
|
|
1030
1181
|
}
|
|
1031
1182
|
|
|
1032
|
-
|
|
1183
|
+
idx = gguf_find_key(ctx, KEY_HAS_MINICPMV_PROJ);
|
|
1184
|
+
if (idx != -1) {
|
|
1185
|
+
new_clip->has_minicpmv_projector = gguf_get_val_bool(ctx, idx);
|
|
1186
|
+
}
|
|
1187
|
+
|
|
1188
|
+
idx = gguf_find_key(ctx, KEY_MINICPMV_VERSION);
|
|
1189
|
+
if (idx != -1) {
|
|
1190
|
+
new_clip->minicpmv_version = gguf_get_val_i32(ctx, idx);
|
|
1191
|
+
}
|
|
1192
|
+
|
|
1193
|
+
// GGML_ASSERT(new_clip->has_llava_projector); // see monatis/clip.cpp for image and/or text encoding for semantic search
|
|
1194
|
+
|
|
1033
1195
|
GGML_ASSERT(new_clip->has_vision_encoder);
|
|
1034
1196
|
GGML_ASSERT(!new_clip->has_text_encoder);
|
|
1035
1197
|
|
|
@@ -1037,15 +1199,16 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
|
|
1037
1199
|
new_clip->use_gelu = gguf_get_val_bool(ctx, idx);
|
|
1038
1200
|
|
|
1039
1201
|
if (verbosity >= 1) {
|
|
1040
|
-
|
|
1041
|
-
|
|
1042
|
-
|
|
1043
|
-
|
|
1044
|
-
|
|
1202
|
+
LOG_INF("%s: text_encoder: %d\n", __func__, new_clip->has_text_encoder);
|
|
1203
|
+
LOG_INF("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder);
|
|
1204
|
+
LOG_INF("%s: llava_projector: %d\n", __func__, new_clip->has_llava_projector);
|
|
1205
|
+
LOG_INF("%s: minicpmv_projector: %d\n", __func__, new_clip->has_minicpmv_projector);
|
|
1206
|
+
LOG_INF("%s: model size: %.2f MB\n", __func__, model_size / 1024.0 / 1024.0);
|
|
1207
|
+
LOG_INF("%s: metadata size: %.2f MB\n", __func__, ggml_get_mem_size(meta) / 1024.0 / 1024.0);
|
|
1045
1208
|
}
|
|
1046
1209
|
}
|
|
1047
1210
|
|
|
1048
|
-
|
|
1211
|
+
LOG_INF("%s: params backend buffer size = % 6.2f MB (%i tensors)\n", __func__, model_size / (1024.0 * 1024.0), n_tensors);
|
|
1049
1212
|
|
|
1050
1213
|
// load tensors
|
|
1051
1214
|
{
|
|
@@ -1058,7 +1221,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
|
|
1058
1221
|
|
|
1059
1222
|
new_clip->ctx_data = ggml_init(params);
|
|
1060
1223
|
if (!new_clip->ctx_data) {
|
|
1061
|
-
|
|
1224
|
+
LOG_ERR("%s: ggml_init() failed\n", __func__);
|
|
1062
1225
|
clip_free(new_clip);
|
|
1063
1226
|
gguf_free(ctx);
|
|
1064
1227
|
return nullptr;
|
|
@@ -1066,7 +1229,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
|
|
1066
1229
|
|
|
1067
1230
|
auto fin = std::ifstream(fname, std::ios::binary);
|
|
1068
1231
|
if (!fin) {
|
|
1069
|
-
|
|
1232
|
+
LOG_ERR("cannot open model file for loading tensors\n");
|
|
1070
1233
|
clip_free(new_clip);
|
|
1071
1234
|
gguf_free(ctx);
|
|
1072
1235
|
return nullptr;
|
|
@@ -1088,7 +1251,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
|
|
1088
1251
|
const size_t offset = gguf_get_data_offset(ctx) + gguf_get_tensor_offset(ctx, i);
|
|
1089
1252
|
fin.seekg(offset, std::ios::beg);
|
|
1090
1253
|
if (!fin) {
|
|
1091
|
-
|
|
1254
|
+
LOG_ERR("%s: failed to seek for tensor %s\n", __func__, name);
|
|
1092
1255
|
clip_free(new_clip);
|
|
1093
1256
|
gguf_free(ctx);
|
|
1094
1257
|
return nullptr;
|
|
@@ -1159,23 +1322,23 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
|
|
1159
1322
|
}
|
|
1160
1323
|
|
|
1161
1324
|
if (verbosity >= 2) {
|
|
1162
|
-
|
|
1163
|
-
|
|
1164
|
-
|
|
1165
|
-
|
|
1166
|
-
|
|
1167
|
-
|
|
1168
|
-
|
|
1169
|
-
|
|
1170
|
-
|
|
1171
|
-
|
|
1172
|
-
|
|
1173
|
-
|
|
1325
|
+
LOG_INF("\n%s: vision model hparams\n", __func__);
|
|
1326
|
+
LOG_INF("image_size %d\n", hparams.image_size);
|
|
1327
|
+
LOG_INF("patch_size %d\n", hparams.patch_size);
|
|
1328
|
+
LOG_INF("v_hidden_size %d\n", hparams.hidden_size);
|
|
1329
|
+
LOG_INF("v_n_intermediate %d\n", hparams.n_intermediate);
|
|
1330
|
+
LOG_INF("v_projection_dim %d\n", hparams.projection_dim);
|
|
1331
|
+
LOG_INF("v_n_head %d\n", hparams.n_head);
|
|
1332
|
+
LOG_INF("v_n_layer %d\n", hparams.n_layer);
|
|
1333
|
+
LOG_INF("v_eps %f\n", hparams.eps);
|
|
1334
|
+
LOG_INF("v_image_mean %f %f %f\n", new_clip->image_mean[0], new_clip->image_mean[1], new_clip->image_mean[2]);
|
|
1335
|
+
LOG_INF("v_image_std %f %f %f\n", new_clip->image_std[0], new_clip->image_std[1], new_clip->image_std[2]);
|
|
1336
|
+
LOG_INF("v_image_grid_pinpoints: ");
|
|
1174
1337
|
for (int i = 0; i < 32 && (hparams.image_grid_pinpoints[i] != 0); ++i) {
|
|
1175
|
-
|
|
1338
|
+
LOG_INF("%d ", hparams.image_grid_pinpoints[i]);
|
|
1176
1339
|
}
|
|
1177
|
-
|
|
1178
|
-
|
|
1340
|
+
LOG_INF("\n");
|
|
1341
|
+
LOG_INF("v_mm_patch_merge_type: %s\n", hparams.mm_patch_merge_type);
|
|
1179
1342
|
|
|
1180
1343
|
}
|
|
1181
1344
|
|
|
@@ -1213,7 +1376,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
|
|
1213
1376
|
vision_model.patch_embeddings = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD);
|
|
1214
1377
|
vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v"));
|
|
1215
1378
|
} catch(const std::exception& /*e*/) {
|
|
1216
|
-
|
|
1379
|
+
LOG_ERR("%s: failed to load vision model tensors\n", __func__);
|
|
1217
1380
|
}
|
|
1218
1381
|
|
|
1219
1382
|
// LLaVA projection
|
|
@@ -1242,7 +1405,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
|
|
1242
1405
|
} catch (std::runtime_error & /*e*/) { }
|
|
1243
1406
|
try {
|
|
1244
1407
|
vision_model.image_newline = get_tensor(new_clip->ctx_data, TN_IMAGE_NEWLINE);
|
|
1245
|
-
//
|
|
1408
|
+
// LOG_INF("%s: image_newline tensor (llava-1.6) found\n", __func__);
|
|
1246
1409
|
} catch (std::runtime_error & /*e*/) { }
|
|
1247
1410
|
} else if (new_clip->proj_type == PROJECTOR_TYPE_LDP) {
|
|
1248
1411
|
// MobileVLM projection
|
|
@@ -1281,6 +1444,27 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
|
|
1281
1444
|
vision_model.mm_model_peg_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_PEG, 0, "weight"));
|
|
1282
1445
|
vision_model.mm_model_peg_0_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_PEG, 0, "bias"));
|
|
1283
1446
|
}
|
|
1447
|
+
else if (new_clip->proj_type == PROJECTOR_TYPE_RESAMPLER) {
|
|
1448
|
+
// vision_model.mm_model_pos_embed = get_tensor(new_clip->ctx_data, TN_MINICPMV_POS_EMBD);
|
|
1449
|
+
vision_model.mm_model_pos_embed_k = get_tensor(new_clip->ctx_data, TN_MINICPMV_POS_EMBD_K);
|
|
1450
|
+
vision_model.mm_model_query = get_tensor(new_clip->ctx_data, TN_MINICPMV_QUERY);
|
|
1451
|
+
vision_model.mm_model_proj = get_tensor(new_clip->ctx_data, TN_MINICPMV_PROJ);
|
|
1452
|
+
vision_model.mm_model_kv_proj = get_tensor(new_clip->ctx_data, TN_MINICPMV_KV_PROJ);
|
|
1453
|
+
vision_model.mm_model_attn_q_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "q", "weight"));
|
|
1454
|
+
vision_model.mm_model_attn_k_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "k", "weight"));
|
|
1455
|
+
vision_model.mm_model_attn_v_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "v", "weight"));
|
|
1456
|
+
vision_model.mm_model_attn_q_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "q", "bias"));
|
|
1457
|
+
vision_model.mm_model_attn_k_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "k", "bias"));
|
|
1458
|
+
vision_model.mm_model_attn_v_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "v", "bias"));
|
|
1459
|
+
vision_model.mm_model_attn_o_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "out", "weight"));
|
|
1460
|
+
vision_model.mm_model_attn_o_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "out", "bias"));
|
|
1461
|
+
vision_model.mm_model_ln_q_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "q", "weight"));
|
|
1462
|
+
vision_model.mm_model_ln_q_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "q", "bias"));
|
|
1463
|
+
vision_model.mm_model_ln_kv_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "kv", "weight"));
|
|
1464
|
+
vision_model.mm_model_ln_kv_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "kv", "bias"));
|
|
1465
|
+
vision_model.mm_model_ln_post_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "post", "weight"));
|
|
1466
|
+
vision_model.mm_model_ln_post_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "post", "bias"));
|
|
1467
|
+
}
|
|
1284
1468
|
else {
|
|
1285
1469
|
std::string proj_type = PROJECTOR_TYPE_NAMES[new_clip->proj_type];
|
|
1286
1470
|
throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str()));
|
|
@@ -1319,15 +1503,26 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
|
|
1319
1503
|
new_clip->compute_alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(new_clip->backend));
|
|
1320
1504
|
clip_image_f32_batch batch;
|
|
1321
1505
|
batch.size = 1;
|
|
1322
|
-
ggml_cgraph * gf = clip_image_build_graph(new_clip, &batch);
|
|
1506
|
+
ggml_cgraph * gf = clip_image_build_graph(new_clip, &batch, nullptr, false);
|
|
1323
1507
|
ggml_gallocr_reserve(new_clip->compute_alloc, gf);
|
|
1324
1508
|
size_t compute_memory_buffer_size = ggml_gallocr_get_buffer_size(new_clip->compute_alloc, 0);
|
|
1325
|
-
|
|
1509
|
+
LOG_INF("%s: compute allocated memory: %.2f MB\n", __func__, compute_memory_buffer_size /1024.0/1024.0);
|
|
1326
1510
|
}
|
|
1327
1511
|
|
|
1328
1512
|
return new_clip;
|
|
1329
1513
|
}
|
|
1330
1514
|
|
|
1515
|
+
// Attach a caller-provided image-size record to the CLIP context.
// Only the pointer is stored: nothing is copied, and this function does not
// free any record that was previously attached.
void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size) {
    ctx_clip->load_image_size = load_image_size;
}
|
|
1518
|
+
|
|
1519
|
+
struct clip_image_size * clip_image_size_init() {
|
|
1520
|
+
struct clip_image_size * load_image_size = new struct clip_image_size();
|
|
1521
|
+
load_image_size->width = 448;
|
|
1522
|
+
load_image_size->height = 448;
|
|
1523
|
+
return load_image_size;
|
|
1524
|
+
}
|
|
1525
|
+
|
|
1331
1526
|
struct clip_image_u8 * clip_image_u8_init() {
|
|
1332
1527
|
return new clip_image_u8();
|
|
1333
1528
|
}
|
|
@@ -1362,7 +1557,7 @@ bool clip_image_load_from_file(const char * fname, clip_image_u8 * img) {
|
|
|
1362
1557
|
int nx, ny, nc;
|
|
1363
1558
|
auto * data = stbi_load(fname, &nx, &ny, &nc, 3);
|
|
1364
1559
|
if (!data) {
|
|
1365
|
-
|
|
1560
|
+
LOG_ERR("%s: failed to load image '%s'\n", __func__, fname);
|
|
1366
1561
|
return false;
|
|
1367
1562
|
}
|
|
1368
1563
|
build_clip_img_from_data(data, nx, ny, img);
|
|
@@ -1374,7 +1569,7 @@ bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length
|
|
|
1374
1569
|
int nx, ny, nc;
|
|
1375
1570
|
auto * data = stbi_load_from_memory(bytes, bytes_length, &nx, &ny, &nc, 3);
|
|
1376
1571
|
if (!data) {
|
|
1377
|
-
|
|
1572
|
+
LOG_ERR("%s: failed to decode image bytes\n", __func__);
|
|
1378
1573
|
return false;
|
|
1379
1574
|
}
|
|
1380
1575
|
build_clip_img_from_data(data, nx, ny, img);
|
|
@@ -1433,7 +1628,7 @@ static void normalize_image_u8_to_f32(const clip_image_u8* src, clip_image_f32*
|
|
|
1433
1628
|
}
|
|
1434
1629
|
}
|
|
1435
1630
|
|
|
1436
|
-
inline
|
|
1631
|
+
// Limit x to the range [lower, upper].
// The upper bound is applied first and the lower bound second, so when
// lower > upper the result is always lower.
inline int clip(int x, int lower, int upper) {
    const int capped = (upper < x) ? upper : x;
    return (capped < lower) ? lower : capped;
}
|
|
1439
1634
|
|
|
@@ -1564,7 +1759,7 @@ static std::pair<int, int> select_best_resolution(const std::pair<int, int> & or
|
|
|
1564
1759
|
int downscaled_height = static_cast<int>(original_height * scale);
|
|
1565
1760
|
int effective_resolution = std::min(downscaled_width * downscaled_height, original_width * original_height);
|
|
1566
1761
|
int wasted_resolution = (width * height) - effective_resolution;
|
|
1567
|
-
//
|
|
1762
|
+
// LOG_INF("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution);
|
|
1568
1763
|
if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_resolution < min_wasted_resolution)) {
|
|
1569
1764
|
max_effective_resolution = effective_resolution;
|
|
1570
1765
|
min_wasted_resolution = wasted_resolution;
|
|
@@ -1598,12 +1793,185 @@ static std::vector<clip_image_u8*> divide_to_patches_u8(const clip_image_u8 & im
|
|
|
1598
1793
|
return patches;
|
|
1599
1794
|
}
|
|
1600
1795
|
|
|
1796
|
+
// Snap `length` to the nearest multiple of `patch_size` (halves round away
// from zero, as std::round does), but never return less than one patch.
static int ensure_divide(int length, int patch_size) {
    const float n_patches = std::round(static_cast<float>(length) / patch_size);
    const int   snapped   = static_cast<int>(n_patches * patch_size);
    return (snapped < patch_size) ? patch_size : snapped;
}
|
|
1799
|
+
|
|
1800
|
+
static std::pair<int, int> uhd_find_best_resize(std::pair<int, int> original_size, int scale_resolution, int patch_size, bool allow_upscale = false) {
|
|
1801
|
+
int width = original_size.first;
|
|
1802
|
+
int height = original_size.second;
|
|
1803
|
+
if ((width * height > scale_resolution * scale_resolution) || allow_upscale) {
|
|
1804
|
+
float r = static_cast<float>(width) / height;
|
|
1805
|
+
height = static_cast<int>(scale_resolution / std::sqrt(r));
|
|
1806
|
+
width = static_cast<int>(height * r);
|
|
1807
|
+
}
|
|
1808
|
+
int best_width = ensure_divide(width, patch_size);
|
|
1809
|
+
int best_height = ensure_divide(height, patch_size);
|
|
1810
|
+
return std::make_pair(best_width, best_height);
|
|
1811
|
+
}
|
|
1812
|
+
|
|
1813
|
+
static std::pair<int, int> uhd_get_refine_size(std::pair<int, int> original_size, std::pair<int, int> grid, int scale_resolution, int patch_size, bool allow_upscale = false) {
|
|
1814
|
+
int width, height;
|
|
1815
|
+
std::tie(width, height) = original_size;
|
|
1816
|
+
int grid_x, grid_y;
|
|
1817
|
+
std::tie(grid_x, grid_y) = grid;
|
|
1818
|
+
|
|
1819
|
+
int refine_width = ensure_divide(width, grid_x);
|
|
1820
|
+
int refine_height = ensure_divide(height, grid_y);
|
|
1821
|
+
|
|
1822
|
+
int grid_width = refine_width / grid_x;
|
|
1823
|
+
int grid_height = refine_height / grid_y;
|
|
1824
|
+
|
|
1825
|
+
// auto best_grid_size = find_best_resize(std::make_tuple(grid_width, grid_height), scale_resolution, patch_size, allow_upscale); (old line)
|
|
1826
|
+
auto best_grid_size = uhd_find_best_resize(std::make_pair(grid_width, grid_height), scale_resolution, patch_size, allow_upscale); // (new line) => fixes conversion for make_tuple to make_pair
|
|
1827
|
+
int best_grid_width, best_grid_height;
|
|
1828
|
+
std::tie(best_grid_width, best_grid_height) = best_grid_size;
|
|
1829
|
+
|
|
1830
|
+
// std::pair<int, int> refine_size = std::make_tuple(best_grid_width * grid_x, best_grid_height * grid_y); (old line)
|
|
1831
|
+
std::pair<int, int> refine_size = std::make_pair(best_grid_width * grid_x, best_grid_height * grid_y); // (new line)
|
|
1832
|
+
return refine_size;
|
|
1833
|
+
}
|
|
1834
|
+
|
|
1835
|
+
// Choose the slice grid whose aspect ratio best matches the image.
//
// Candidate slice counts are multiple-1, multiple and multiple+1; a count of
// exactly 1 or one above `max_slice_nums` is rejected. Every factorisation
// first * second of a remaining count is scored by
// |log_ratio - log(first / second)| and the first grid with the strictly
// smallest score wins.
//
// Returns {1, 1} when no candidate grid exists.
static std::pair<int, int> uhd_best_grid(const int max_slice_nums, const int multiple, const float log_ratio) {
    std::pair<int, int> winner{1, 1};
    float lowest_error = std::numeric_limits<float>::infinity();

    for (int n_slices = multiple - 1; n_slices <= multiple + 1; ++n_slices) {
        if (n_slices == 1 || n_slices > max_slice_nums) {
            continue;
        }
        // enumerate every divisor pair of n_slices in ascending order of `first`
        for (int first = 1; first <= n_slices; ++first) {
            if (n_slices % first != 0) {
                continue;
            }
            const int second = n_slices / first;
            const float error = std::abs(log_ratio - std::log(1.0 * first / second));
            if (error < lowest_error) {
                winner = std::make_pair(first, second);
                lowest_error = error;
            }
        }
    }
    return winner;
}
|
|
1866
|
+
|
|
1867
|
+
// inspired from LLaVA-UHD:
|
|
1868
|
+
// -> https://arxiv.org/pdf/2403.11703
|
|
1869
|
+
// -> https://github.com/thunlp/LLaVA-UHD
|
|
1870
|
+
// -> https://github.com/thunlp/LLaVA-UHD/blob/302301bc2175f7e717fb8548516188e89f649753/llava_uhd/train/llava-uhd/slice_logic.py#L118
|
|
1871
|
+
static std::vector<std::vector<clip_image_u8 *>> uhd_slice_image(const clip_image_u8 * img, const int max_slice_nums=9, const int scale_resolution=448, const int patch_size=14) {
|
|
1872
|
+
const std::pair<int, int> original_size={img->nx,img->ny};
|
|
1873
|
+
const int original_width = img->nx;
|
|
1874
|
+
const int original_height = img->ny;
|
|
1875
|
+
const float log_ratio = log(1.0*original_width/original_height);
|
|
1876
|
+
const float ratio = 1.0 * original_width * original_height/ (scale_resolution * scale_resolution);
|
|
1877
|
+
const int multiple = fmin(ceil(ratio), max_slice_nums);
|
|
1878
|
+
|
|
1879
|
+
std::vector<std::vector<clip_image_u8 *>> images;
|
|
1880
|
+
LOG_INF("%s: multiple %d\n", __func__, multiple);
|
|
1881
|
+
images.push_back(std::vector<clip_image_u8 *>());
|
|
1882
|
+
|
|
1883
|
+
if (multiple <= 1) {
|
|
1884
|
+
auto best_size = uhd_find_best_resize(original_size, scale_resolution, patch_size, true);
|
|
1885
|
+
clip_image_u8 * source_image = clip_image_u8_init();
|
|
1886
|
+
bicubic_resize(*img, *source_image, best_size.first, best_size.second);
|
|
1887
|
+
// source_image = image.resize(best_size, Image.Resampling.BICUBIC)
|
|
1888
|
+
images[images.size()-1].push_back(source_image);
|
|
1889
|
+
}
|
|
1890
|
+
else if (multiple > 1) {
|
|
1891
|
+
auto best_size = uhd_find_best_resize(original_size, scale_resolution, patch_size);
|
|
1892
|
+
clip_image_u8 * source_image = clip_image_u8_init();
|
|
1893
|
+
bicubic_resize(*img, *source_image, best_size.first, best_size.second);
|
|
1894
|
+
// source_image = image.copy().resize(best_resize, Image.Resampling.BICUBIC)
|
|
1895
|
+
LOG_INF("%s: image_size: %d %d; source_image size: %d %d\n", __func__, img->nx, img->ny, best_size.first, best_size.second);
|
|
1896
|
+
images[images.size()-1].push_back(source_image);
|
|
1897
|
+
|
|
1898
|
+
std::pair<int, int> best_grid = uhd_best_grid(max_slice_nums, multiple, log_ratio);
|
|
1899
|
+
LOG_INF("%s: image_size: %d %d; best_grid: %d %d\n", __func__, img->nx, img->ny, best_grid.first, best_grid.second);
|
|
1900
|
+
|
|
1901
|
+
auto refine_size = uhd_get_refine_size(original_size, best_grid, scale_resolution, patch_size, true);
|
|
1902
|
+
clip_image_u8 * refine_image = clip_image_u8_init();
|
|
1903
|
+
bicubic_resize(*img, *refine_image, refine_size.first, refine_size.second);
|
|
1904
|
+
|
|
1905
|
+
LOG_INF("%s: refine_image_size: %d %d; refine_size: %d %d\n", __func__, refine_image->nx, refine_image->ny, refine_size.first, refine_size.second);
|
|
1906
|
+
|
|
1907
|
+
// split_to_patches
|
|
1908
|
+
int width = refine_image->nx;
|
|
1909
|
+
int height = refine_image->ny;
|
|
1910
|
+
int grid_x = int(width / best_grid.first);
|
|
1911
|
+
int grid_y = int(height / best_grid.second);
|
|
1912
|
+
for (int patches_i = 0, ic = 0; patches_i < height && ic < best_grid.second; patches_i += grid_y, ic += 1){
|
|
1913
|
+
images.push_back(std::vector<clip_image_u8 *>());
|
|
1914
|
+
for(int patches_j = 0, jc = 0; patches_j < width && jc < best_grid.first; patches_j += grid_x, jc += 1){
|
|
1915
|
+
clip_image_u8 * patch = clip_image_u8_init();
|
|
1916
|
+
patch->nx = grid_x;
|
|
1917
|
+
patch->ny = grid_y;
|
|
1918
|
+
patch->buf.resize(3 * patch->nx * patch->ny);
|
|
1919
|
+
for (int y = patches_i; y < patches_i + grid_y; ++y) {
|
|
1920
|
+
for (int x = patches_j; x < patches_j + grid_x; ++x) {
|
|
1921
|
+
const int i = 3 * (y * refine_image->nx + x);
|
|
1922
|
+
const int j = 3 * ((y-patches_i) * patch->nx + (x-patches_j));
|
|
1923
|
+
patch->buf[j] = refine_image->buf[i];
|
|
1924
|
+
patch->buf[j+1] = refine_image->buf[i+1];
|
|
1925
|
+
patch->buf[j+2] = refine_image->buf[i+2];
|
|
1926
|
+
}
|
|
1927
|
+
}
|
|
1928
|
+
images[images.size()-1].push_back(patch);
|
|
1929
|
+
}
|
|
1930
|
+
}
|
|
1931
|
+
}
|
|
1932
|
+
return images;
|
|
1933
|
+
}
|
|
1934
|
+
|
|
1935
|
+
int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip) {
|
|
1936
|
+
const int max_slice_nums=9;
|
|
1937
|
+
const int scale_resolution=448;
|
|
1938
|
+
const int original_width = ctx_clip->load_image_size->width;
|
|
1939
|
+
const int original_height = ctx_clip->load_image_size->height;
|
|
1940
|
+
const float log_ratio = log(1.0*original_width/original_height);
|
|
1941
|
+
const float ratio = 1.0 * original_width * original_height/ (scale_resolution * scale_resolution);
|
|
1942
|
+
const int multiple = fmin(ceil(ratio), max_slice_nums);
|
|
1943
|
+
std::pair<int, int> best_grid = uhd_best_grid(max_slice_nums, multiple, log_ratio);
|
|
1944
|
+
return best_grid.first;
|
|
1945
|
+
}
|
|
1946
|
+
|
|
1601
1947
|
// returns the normalized float tensor for llava-1.5, for spatial_unpad with anyres processing for llava-1.6 it returns the normalized image patch tensors as a vector
|
|
1602
1948
|
// res_imgs memory is being allocated here, previous allocations will be freed if found
|
|
1603
1949
|
bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch * res_imgs) {
|
|
1950
|
+
|
|
1951
|
+
if(clip_is_minicpmv(ctx)){
|
|
1952
|
+
int max_slice_nums = 9;
|
|
1953
|
+
std::vector<std::vector<clip_image_u8 *>> imgs = uhd_slice_image(img, max_slice_nums);
|
|
1954
|
+
res_imgs->size = 0;
|
|
1955
|
+
for (size_t i = 0; i < imgs.size(); ++i){
|
|
1956
|
+
res_imgs->size += imgs[i].size();
|
|
1957
|
+
}
|
|
1958
|
+
res_imgs->data = new clip_image_f32[res_imgs->size];
|
|
1959
|
+
int idx = 0;
|
|
1960
|
+
for (size_t i = 0; i < imgs.size(); ++i) {
|
|
1961
|
+
for (size_t j = 0; j < imgs[i].size(); ++j) {
|
|
1962
|
+
LOG_DBG("%s: %d %d\n", __func__,imgs[i][j]->nx,imgs[i][j]->ny);
|
|
1963
|
+
clip_image_f32 * res = clip_image_f32_init();
|
|
1964
|
+
normalize_image_u8_to_f32(imgs[i][j], res, ctx->image_mean, ctx->image_std);
|
|
1965
|
+
res_imgs->data[idx++] = *res;
|
|
1966
|
+
clip_image_f32_free(res);
|
|
1967
|
+
}
|
|
1968
|
+
}
|
|
1969
|
+
return true;
|
|
1970
|
+
}
|
|
1971
|
+
|
|
1604
1972
|
bool pad_to_square = true;
|
|
1605
1973
|
if (!ctx->has_vision_encoder) {
|
|
1606
|
-
|
|
1974
|
+
LOG_ERR("This gguf file seems to have no vision encoder\n");
|
|
1607
1975
|
return false;
|
|
1608
1976
|
}
|
|
1609
1977
|
auto & params = ctx->vision_model.hparams;
|
|
@@ -1680,7 +2048,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
|
|
|
1680
2048
|
}
|
|
1681
2049
|
|
|
1682
2050
|
for (size_t i = 0; i < patches.size(); i++) {
|
|
1683
|
-
//
|
|
2051
|
+
// LOG_DBG("patch %d: %d %d\n", i, patches[i]->nx, patches[i]->ny);
|
|
1684
2052
|
clip_image_u8_free(patches[i]);
|
|
1685
2053
|
}
|
|
1686
2054
|
|
|
@@ -1816,14 +2184,107 @@ int clip_n_patches(const struct clip_ctx * ctx) {
|
|
|
1816
2184
|
|
|
1817
2185
|
if (ctx->proj_type == PROJECTOR_TYPE_LDP || ctx->proj_type == PROJECTOR_TYPE_LDPV2) {
|
|
1818
2186
|
n_patches /= 4;
|
|
2187
|
+
} else if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) {
|
|
2188
|
+
if (ctx->minicpmv_version == 2) {
|
|
2189
|
+
n_patches = 96;
|
|
2190
|
+
}
|
|
2191
|
+
else if (ctx->minicpmv_version == 3) {
|
|
2192
|
+
n_patches = 64;
|
|
2193
|
+
}
|
|
1819
2194
|
}
|
|
1820
2195
|
|
|
1821
2196
|
return n_patches;
|
|
1822
2197
|
}
|
|
1823
2198
|
|
|
2199
|
+
// Builds a 1-D sine/cosine positional embedding for every cell of a 2-D grid of positions.
//
// embed_dim : length of each produced embedding; expected to be even (asserted).
// pos       : H x W grid of scalar positions. W is taken from the first row, so every
//             row is expected to hold at least that many entries.
// returns   : H x W x embed_dim values where, for d in [0, embed_dim / 2):
//               out[h][w][d]                 = sin(pos[h][w] * omega[d])
//               out[h][w][d + embed_dim / 2] = cos(pos[h][w] * omega[d])
//             with omega[d] = 1 / 10000^(d / (embed_dim / 2)).
//             An empty `pos` yields an empty result.
static std::vector<std::vector<std::vector<float>>> get_1d_sincos_pos_embed_from_grid_new(int embed_dim, const std::vector<std::vector<float>> & pos) {
    assert(embed_dim % 2 == 0);

    // The width is read from the first row, so an empty grid must be handled
    // before touching pos[0].
    if (pos.empty()) {
        return {};
    }

    const int H        = pos.size();
    const int W        = pos[0].size();
    const int half_dim = embed_dim / 2;

    // Per-channel frequencies, shared by every grid cell.
    // NOTE: pow/sin/cos are intentionally left unqualified, exactly as before,
    // so the overloads selected (and therefore the rounding) do not change.
    std::vector<float> omega(half_dim);
    for (int i = 0; i < half_dim; ++i) {
        omega[i] = 1.0 / pow(10000.0, static_cast<float>(i) / half_dim);
    }

    std::vector<std::vector<std::vector<float>>> emb(H, std::vector<std::vector<float>>(W, std::vector<float>(embed_dim)));
    for (int h = 0; h < H; ++h) {
        for (int w = 0; w < W; ++w) {
            for (int d = 0; d < half_dim; ++d) {
                const float out_value = pos[h][w] * omega[d];
                emb[h][w][d]            = sin(out_value); // first half: sine terms
                emb[h][w][d + half_dim] = cos(out_value); // second half: cosine terms
            }
        }
    }

    return emb;
}
|
|
2222
|
+
|
|
2223
|
+
static std::vector<std::vector<std::vector<float>>> get_2d_sincos_pos_embed_from_grid(int embed_dim, const std::vector<std::vector<std::vector<float>>> & grid) {
|
|
2224
|
+
assert(embed_dim % 2 == 0);
|
|
2225
|
+
std::vector<std::vector<std::vector<float>>> emb_h = get_1d_sincos_pos_embed_from_grid_new(embed_dim / 2, grid[0]); // (H, W, D/2)
|
|
2226
|
+
std::vector<std::vector<std::vector<float>>> emb_w = get_1d_sincos_pos_embed_from_grid_new(embed_dim / 2, grid[1]); // (H, W, D/2)
|
|
2227
|
+
|
|
2228
|
+
int H = emb_h.size();
|
|
2229
|
+
int W = emb_h[0].size();
|
|
2230
|
+
std::vector<std::vector<std::vector<float>>> emb(H, std::vector<std::vector<float>>(W, std::vector<float>(embed_dim)));
|
|
2231
|
+
|
|
2232
|
+
for (int h = 0; h < H; ++h) {
|
|
2233
|
+
for (int w = 0; w < W; ++w) {
|
|
2234
|
+
for (int d = 0; d < embed_dim / 2; ++d) {
|
|
2235
|
+
emb[h][w][d] = emb_h[h][w][d];
|
|
2236
|
+
emb[h][w][d + embed_dim / 2] = emb_w[h][w][d];
|
|
2237
|
+
}
|
|
2238
|
+
}
|
|
2239
|
+
}
|
|
2240
|
+
return emb;
|
|
2241
|
+
}
|
|
2242
|
+
|
|
2243
|
+
static std::vector<std::vector<float>> get_2d_sincos_pos_embed(int embed_dim, const std::pair<int, int> image_size) {
|
|
2244
|
+
int grid_h_size = image_size.first;
|
|
2245
|
+
int grid_w_size = image_size.second;
|
|
2246
|
+
|
|
2247
|
+
std::vector<float> grid_h(grid_h_size);
|
|
2248
|
+
std::vector<float> grid_w(grid_w_size);
|
|
2249
|
+
|
|
2250
|
+
for (int i = 0; i < grid_h_size; ++i) {
|
|
2251
|
+
grid_h[i] = static_cast<float>(i);
|
|
2252
|
+
}
|
|
2253
|
+
for (int i = 0; i < grid_w_size; ++i) {
|
|
2254
|
+
grid_w[i] = static_cast<float>(i);
|
|
2255
|
+
}
|
|
2256
|
+
|
|
2257
|
+
std::vector<std::vector<float>> grid(grid_h_size, std::vector<float>(grid_w_size));
|
|
2258
|
+
for (int h = 0; h < grid_h_size; ++h) {
|
|
2259
|
+
for (int w = 0; w < grid_w_size; ++w) {
|
|
2260
|
+
grid[h][w] = grid_w[w];
|
|
2261
|
+
}
|
|
2262
|
+
}
|
|
2263
|
+
std::vector<std::vector<std::vector<float>>> grid_2d = {grid, grid};
|
|
2264
|
+
for (int h = 0; h < grid_h_size; ++h) {
|
|
2265
|
+
for (int w = 0; w < grid_w_size; ++w) {
|
|
2266
|
+
grid_2d[0][h][w] = grid_h[h];
|
|
2267
|
+
grid_2d[1][h][w] = grid_w[w];
|
|
2268
|
+
}
|
|
2269
|
+
}
|
|
2270
|
+
|
|
2271
|
+
std::vector<std::vector<std::vector<float>>> pos_embed_3d = get_2d_sincos_pos_embed_from_grid(embed_dim, grid_2d);
|
|
2272
|
+
|
|
2273
|
+
int H = image_size.first;
|
|
2274
|
+
int W = image_size.second;
|
|
2275
|
+
std::vector<std::vector<float>> pos_embed_2d(H * W, std::vector<float>(embed_dim));
|
|
2276
|
+
for (int h = 0; h < H; ++h) {
|
|
2277
|
+
for (int w = 0; w < W; ++w) {
|
|
2278
|
+
pos_embed_2d[w * H + h] = pos_embed_3d[h][w];
|
|
2279
|
+
}
|
|
2280
|
+
}
|
|
2281
|
+
|
|
2282
|
+
return pos_embed_2d;
|
|
2283
|
+
}
|
|
2284
|
+
|
|
1824
2285
|
bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec) {
|
|
1825
2286
|
if (!ctx->has_vision_encoder) {
|
|
1826
|
-
|
|
2287
|
+
LOG_ERR("This gguf file seems to have no vision encoder\n");
|
|
1827
2288
|
return false;
|
|
1828
2289
|
}
|
|
1829
2290
|
|
|
@@ -1835,7 +2296,7 @@ bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f3
|
|
|
1835
2296
|
|
|
1836
2297
|
bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs, float * vec) {
|
|
1837
2298
|
if (!ctx->has_vision_encoder) {
|
|
1838
|
-
|
|
2299
|
+
LOG_ERR("This gguf file seems to have no vision encoder\n");
|
|
1839
2300
|
return false;
|
|
1840
2301
|
}
|
|
1841
2302
|
|
|
@@ -1843,19 +2304,33 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
|
|
|
1843
2304
|
if (ctx->has_llava_projector) {
|
|
1844
2305
|
GGML_ASSERT(batch_size == 1); // TODO: support multiple images
|
|
1845
2306
|
}
|
|
2307
|
+
if (ctx->has_minicpmv_projector) {
|
|
2308
|
+
GGML_ASSERT(batch_size == 1);
|
|
2309
|
+
}
|
|
1846
2310
|
|
|
1847
2311
|
// build the inference graph
|
|
1848
|
-
ggml_cgraph * gf = clip_image_build_graph(ctx, imgs);
|
|
2312
|
+
ggml_cgraph * gf = clip_image_build_graph(ctx, imgs, ctx->load_image_size, true);
|
|
1849
2313
|
ggml_gallocr_alloc_graph(ctx->compute_alloc, gf);
|
|
1850
2314
|
|
|
1851
2315
|
// set inputs
|
|
1852
2316
|
const auto & model = ctx->vision_model;
|
|
1853
2317
|
const auto & hparams = model.hparams;
|
|
1854
2318
|
|
|
1855
|
-
const int image_size
|
|
2319
|
+
const int image_size = hparams.image_size;
|
|
2320
|
+
int image_size_width = image_size;
|
|
2321
|
+
int image_size_height = image_size;
|
|
2322
|
+
if (ctx->has_minicpmv_projector) {
|
|
2323
|
+
image_size_width = imgs->data[0].nx;
|
|
2324
|
+
image_size_height = imgs->data[0].ny;
|
|
2325
|
+
}
|
|
1856
2326
|
const int patch_size = hparams.patch_size;
|
|
1857
|
-
const int num_patches = ((
|
|
2327
|
+
const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
|
|
1858
2328
|
const int num_positions = num_patches + (ctx->has_class_embedding ? 1 : 0);
|
|
2329
|
+
if(ctx->load_image_size==nullptr){
|
|
2330
|
+
ctx->load_image_size= clip_image_size_init();
|
|
2331
|
+
}
|
|
2332
|
+
const int pos_w = ctx->load_image_size->width/patch_size;
|
|
2333
|
+
const int pos_h = ctx->load_image_size->height/patch_size;
|
|
1859
2334
|
|
|
1860
2335
|
{
|
|
1861
2336
|
struct ggml_tensor * inp_raw = ggml_graph_get_tensor(gf, "inp_raw");
|
|
@@ -1864,7 +2339,9 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
|
|
|
1864
2339
|
for (size_t i = 0; i < imgs->size; i++) {
|
|
1865
2340
|
const int nx = imgs->data[i].nx;
|
|
1866
2341
|
const int ny = imgs->data[i].ny;
|
|
1867
|
-
|
|
2342
|
+
if (!ctx->has_minicpmv_projector) {
|
|
2343
|
+
GGML_ASSERT(nx == image_size && ny == image_size);
|
|
2344
|
+
}
|
|
1868
2345
|
|
|
1869
2346
|
const int n = nx * ny;
|
|
1870
2347
|
|
|
@@ -1881,53 +2358,97 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
|
|
|
1881
2358
|
ggml_backend_tensor_set(inp_raw, data, 0, ggml_nbytes(inp_raw));
|
|
1882
2359
|
free(data);
|
|
1883
2360
|
}
|
|
2361
|
+
if (ctx->has_minicpmv_projector) {
|
|
2362
|
+
{
|
|
2363
|
+
// inspired from siglip:
|
|
2364
|
+
// -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit
|
|
2365
|
+
// -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit/blob/d66538faeba44480d0bfaa42145eef26f9423199/modeling_siglip.py#L316
|
|
2366
|
+
struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions");
|
|
2367
|
+
int* positions_data = (int*)malloc(ggml_nbytes(positions));
|
|
2368
|
+
int bucket_coords_h[70];
|
|
2369
|
+
int bucket_coords_w[70];
|
|
2370
|
+
for (int i = 0; i < pos_h; i++){
|
|
2371
|
+
bucket_coords_h[i] = std::floor(70.0*i/pos_h);
|
|
2372
|
+
}
|
|
2373
|
+
for (int i = 0; i < pos_w; i++){
|
|
2374
|
+
bucket_coords_w[i] = std::floor(70.0*i/pos_w);
|
|
2375
|
+
}
|
|
2376
|
+
for (int i = 0, id = 0; i < pos_h; i++){
|
|
2377
|
+
for (int j = 0; j < pos_w; j++){
|
|
2378
|
+
positions_data[id++] = bucket_coords_h[i]*70 + bucket_coords_w[j];
|
|
2379
|
+
}
|
|
2380
|
+
}
|
|
2381
|
+
ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions));
|
|
2382
|
+
free(positions_data);
|
|
2383
|
+
}
|
|
1884
2384
|
|
|
1885
|
-
|
|
1886
|
-
|
|
1887
|
-
|
|
2385
|
+
{
|
|
2386
|
+
// inspired from resampler of Qwen-VL:
|
|
2387
|
+
// -> https://huggingface.co/Qwen/Qwen-VL/tree/main
|
|
2388
|
+
// -> https://huggingface.co/Qwen/Qwen-VL/blob/0547ed36a86561e2e42fecec8fd0c4f6953e33c4/visual.py#L23
|
|
2389
|
+
struct ggml_tensor * pos_embed = ggml_graph_get_tensor(gf, "pos_embed");
|
|
2390
|
+
int embed_dim = 4096;
|
|
2391
|
+
if (ctx->minicpmv_version == 2) {
|
|
2392
|
+
embed_dim = 4096;
|
|
2393
|
+
}
|
|
2394
|
+
else if (ctx->minicpmv_version == 3) {
|
|
2395
|
+
embed_dim = 3584;
|
|
2396
|
+
}
|
|
2397
|
+
auto pos_embed_t = get_2d_sincos_pos_embed(embed_dim, std::make_pair(pos_w, pos_h));
|
|
1888
2398
|
|
|
1889
|
-
|
|
1890
|
-
|
|
1891
|
-
|
|
1892
|
-
|
|
2399
|
+
float * pos_embed_data = (float *)malloc(ggml_nbytes(pos_embed));
|
|
2400
|
+
for(int i=0;i<pos_w * pos_h;++i){
|
|
2401
|
+
for(int j=0;j<embed_dim;++j){
|
|
2402
|
+
pos_embed_data[i*embed_dim+j]=pos_embed_t[i][j];
|
|
2403
|
+
}
|
|
2404
|
+
}
|
|
2405
|
+
|
|
2406
|
+
ggml_backend_tensor_set(pos_embed, pos_embed_data, 0, ggml_nbytes(pos_embed));
|
|
2407
|
+
free(pos_embed_data);
|
|
1893
2408
|
}
|
|
1894
2409
|
}
|
|
2410
|
+
else{
|
|
2411
|
+
{
|
|
2412
|
+
if (ctx->has_class_embedding) {
|
|
2413
|
+
struct ggml_tensor * embeddings = ggml_graph_get_tensor(gf, "embeddings");
|
|
1895
2414
|
|
|
1896
|
-
|
|
1897
|
-
|
|
2415
|
+
void* zero_mem = malloc(ggml_nbytes(embeddings));
|
|
2416
|
+
memset(zero_mem, 0, ggml_nbytes(embeddings));
|
|
2417
|
+
ggml_backend_tensor_set(embeddings, zero_mem, 0, ggml_nbytes(embeddings));
|
|
2418
|
+
free(zero_mem);
|
|
2419
|
+
}
|
|
2420
|
+
}
|
|
2421
|
+
|
|
2422
|
+
{
|
|
2423
|
+
struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions");
|
|
1898
2424
|
|
|
1899
|
-
|
|
1900
|
-
|
|
1901
|
-
|
|
2425
|
+
int* positions_data = (int*)malloc(ggml_nbytes(positions));
|
|
2426
|
+
for (int i = 0; i < num_positions; i++) {
|
|
2427
|
+
positions_data[i] = i;
|
|
2428
|
+
}
|
|
2429
|
+
ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions));
|
|
2430
|
+
free(positions_data);
|
|
1902
2431
|
}
|
|
1903
|
-
ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions));
|
|
1904
|
-
free(positions_data);
|
|
1905
|
-
}
|
|
1906
2432
|
|
|
1907
|
-
|
|
1908
|
-
|
|
1909
|
-
|
|
1910
|
-
|
|
1911
|
-
|
|
2433
|
+
{
|
|
2434
|
+
struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches");
|
|
2435
|
+
int* patches_data = (int*)malloc(ggml_nbytes(patches));
|
|
2436
|
+
for (int i = 0; i < num_patches; i++) {
|
|
2437
|
+
patches_data[i] = i + 1;
|
|
2438
|
+
}
|
|
2439
|
+
ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches));
|
|
2440
|
+
free(patches_data);
|
|
1912
2441
|
}
|
|
1913
|
-
ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches));
|
|
1914
|
-
free(patches_data);
|
|
1915
2442
|
}
|
|
1916
2443
|
|
|
1917
2444
|
if (ggml_backend_is_cpu(ctx->backend)) {
|
|
1918
2445
|
ggml_backend_cpu_set_n_threads(ctx->backend, n_threads);
|
|
1919
2446
|
}
|
|
1920
2447
|
|
|
1921
|
-
#ifdef GGML_USE_METAL
|
|
1922
|
-
if (ggml_backend_is_metal(ctx->backend)) {
|
|
1923
|
-
ggml_backend_metal_set_n_cb(ctx->backend, n_threads);
|
|
1924
|
-
}
|
|
1925
|
-
#endif
|
|
1926
|
-
|
|
1927
2448
|
ggml_backend_graph_compute(ctx->backend, gf);
|
|
1928
2449
|
|
|
1929
2450
|
// the last node is the embedding tensor
|
|
1930
|
-
struct ggml_tensor * embeddings = gf
|
|
2451
|
+
struct ggml_tensor * embeddings = ggml_graph_node(gf, -1);
|
|
1931
2452
|
|
|
1932
2453
|
// copy the embeddings to the location passed by the user
|
|
1933
2454
|
ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings));
|
|
@@ -1999,7 +2520,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
|
|
|
1999
2520
|
new_type = type;
|
|
2000
2521
|
if (new_type >= GGML_TYPE_Q2_K && name.find("embd") != std::string::npos) {
|
|
2001
2522
|
new_type = GGML_TYPE_Q8_0; // ggml_get_rows needs non K type
|
|
2002
|
-
//
|
|
2523
|
+
// LOG_ERR("%s: quantizing %s to %s\n", __func__, name.c_str(), ggml_type_name(new_type));
|
|
2003
2524
|
}
|
|
2004
2525
|
const size_t n_elms = ggml_nelements(cur);
|
|
2005
2526
|
float * f32_data;
|
|
@@ -2018,7 +2539,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
|
|
|
2018
2539
|
f32_data = (float *)conv_buf.data();
|
|
2019
2540
|
break;
|
|
2020
2541
|
default:
|
|
2021
|
-
|
|
2542
|
+
LOG_ERR("Please use an input file in f32 or f16\n");
|
|
2022
2543
|
gguf_free(ctx_out);
|
|
2023
2544
|
return false;
|
|
2024
2545
|
}
|
|
@@ -2045,7 +2566,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
|
|
|
2045
2566
|
fout.put(0);
|
|
2046
2567
|
}
|
|
2047
2568
|
|
|
2048
|
-
|
|
2569
|
+
LOG_INF("%s: n_dims = %d | quantize=%d | size = %f MB -> %f MB\n", name.c_str(), ggml_n_dims(cur), quantize,
|
|
2049
2570
|
orig_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
|
|
2050
2571
|
}
|
|
2051
2572
|
|
|
@@ -2061,8 +2582,8 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
|
|
|
2061
2582
|
gguf_free(ctx_out);
|
|
2062
2583
|
|
|
2063
2584
|
{
|
|
2064
|
-
|
|
2065
|
-
|
|
2585
|
+
LOG_INF("%s: original size = %8.2f MB\n", __func__, total_size_org / 1024.0 / 1024.0);
|
|
2586
|
+
LOG_INF("%s: quantized size = %8.2f MB\n", __func__, total_size_new / 1024.0 / 1024.0);
|
|
2066
2587
|
}
|
|
2067
2588
|
|
|
2068
2589
|
return true;
|
|
@@ -2081,7 +2602,22 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
|
|
|
2081
2602
|
if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) {
|
|
2082
2603
|
return ctx->vision_model.mm_3_b->ne[0];
|
|
2083
2604
|
}
|
|
2605
|
+
if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) {
|
|
2606
|
+
if (ctx->minicpmv_version == 2) {
|
|
2607
|
+
return 4096;
|
|
2608
|
+
}
|
|
2609
|
+
else if (ctx->minicpmv_version == 3) {
|
|
2610
|
+
return 3584;
|
|
2611
|
+
}
|
|
2612
|
+
}
|
|
2084
2613
|
|
|
2085
2614
|
std::string proj_type = PROJECTOR_TYPE_NAMES[ctx->proj_type];
|
|
2086
2615
|
throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str()));
|
|
2087
2616
|
}
|
|
2617
|
+
|
|
2618
|
+
int clip_is_minicpmv(const struct clip_ctx * ctx) {
|
|
2619
|
+
if (ctx->has_minicpmv_projector) {
|
|
2620
|
+
return ctx->minicpmv_version;
|
|
2621
|
+
}
|
|
2622
|
+
return 0;
|
|
2623
|
+
}
|