@fugood/llama.node 0.3.3 → 0.3.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +5 -0
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +18 -1
- package/package.json +1 -1
- package/src/EmbeddingWorker.cpp +15 -5
- package/src/EmbeddingWorker.h +2 -1
- package/src/LlamaCompletionWorker.cpp +1 -1
- package/src/LlamaContext.cpp +81 -18
- package/src/LlamaContext.h +2 -0
- package/src/llama.cpp/.github/workflows/build.yml +197 -159
- package/src/llama.cpp/.github/workflows/docker.yml +5 -8
- package/src/llama.cpp/.github/workflows/python-lint.yml +8 -1
- package/src/llama.cpp/.github/workflows/server.yml +21 -14
- package/src/llama.cpp/CMakeLists.txt +11 -6
- package/src/llama.cpp/Sources/llama/llama.h +4 -0
- package/src/llama.cpp/cmake/common.cmake +33 -0
- package/src/llama.cpp/cmake/x64-windows-llvm.cmake +11 -0
- package/src/llama.cpp/common/CMakeLists.txt +6 -2
- package/src/llama.cpp/common/arg.cpp +426 -245
- package/src/llama.cpp/common/common.cpp +143 -80
- package/src/llama.cpp/common/common.h +81 -24
- package/src/llama.cpp/common/sampling.cpp +53 -19
- package/src/llama.cpp/common/sampling.h +22 -1
- package/src/llama.cpp/common/speculative.cpp +274 -0
- package/src/llama.cpp/common/speculative.h +28 -0
- package/src/llama.cpp/docs/build.md +101 -148
- package/src/llama.cpp/examples/CMakeLists.txt +32 -13
- package/src/llama.cpp/examples/batched/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/batched/batched.cpp +5 -4
- package/src/llama.cpp/examples/batched-bench/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/cvector-generator/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/deprecation-warning/deprecation-warning.cpp +1 -1
- package/src/llama.cpp/examples/embedding/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +3 -2
- package/src/llama.cpp/examples/export-lora/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +4 -7
- package/src/llama.cpp/examples/gen-docs/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gguf/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gguf-hash/CMakeLists.txt +8 -1
- package/src/llama.cpp/examples/gguf-split/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +2 -2
- package/src/llama.cpp/examples/gritlm/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +1 -1
- package/src/llama.cpp/examples/imatrix/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +11 -2
- package/src/llama.cpp/examples/infill/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/infill/infill.cpp +1 -1
- package/src/llama.cpp/examples/llama-bench/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +405 -316
- package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
- package/src/llama.cpp/examples/llava/CMakeLists.txt +10 -3
- package/src/llama.cpp/examples/llava/clip.cpp +262 -66
- package/src/llama.cpp/examples/llava/clip.h +8 -2
- package/src/llama.cpp/examples/llava/llava-cli.cpp +1 -1
- package/src/llama.cpp/examples/llava/llava.cpp +46 -19
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +1 -1
- package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp +581 -0
- package/src/llama.cpp/examples/lookahead/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +1 -1
- package/src/llama.cpp/examples/lookup/CMakeLists.txt +4 -4
- package/src/llama.cpp/examples/lookup/lookup-stats.cpp +2 -1
- package/src/llama.cpp/examples/lookup/lookup.cpp +2 -2
- package/src/llama.cpp/examples/main/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/main/main.cpp +9 -5
- package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/parallel/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/parallel/parallel.cpp +1 -1
- package/src/llama.cpp/examples/passkey/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/perplexity/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize/quantize.cpp +0 -3
- package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/retrieval/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +4 -4
- package/src/llama.cpp/examples/run/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/run/run.cpp +911 -0
- package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +4 -4
- package/src/llama.cpp/examples/server/CMakeLists.txt +3 -7
- package/src/llama.cpp/examples/server/server.cpp +1758 -886
- package/src/llama.cpp/examples/server/tests/requirements.txt +2 -2
- package/src/llama.cpp/examples/server/utils.hpp +94 -304
- package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/simple/simple.cpp +4 -0
- package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +3 -0
- package/src/llama.cpp/examples/speculative/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/speculative/speculative.cpp +16 -15
- package/src/llama.cpp/examples/speculative-simple/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +265 -0
- package/src/llama.cpp/examples/tokenize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/tokenize/tokenize.cpp +1 -1
- package/src/llama.cpp/examples/tts/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/tts/tts.cpp +932 -0
- package/src/llama.cpp/ggml/CMakeLists.txt +46 -34
- package/src/llama.cpp/ggml/include/ggml-backend.h +16 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +7 -49
- package/src/llama.cpp/ggml/include/ggml-opencl.h +26 -0
- package/src/llama.cpp/ggml/include/ggml.h +106 -24
- package/src/llama.cpp/ggml/src/CMakeLists.txt +73 -24
- package/src/llama.cpp/ggml/src/ggml-alloc.c +0 -1
- package/src/llama.cpp/ggml/src/ggml-backend-impl.h +51 -11
- package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +379 -22
- package/src/llama.cpp/ggml/src/ggml-backend.cpp +4 -4
- package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +3 -7
- package/src/llama.cpp/ggml/src/ggml-blas/ggml-blas.cpp +5 -2
- package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +33 -3
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +456 -111
- package/src/llama.cpp/ggml/src/ggml-cann/common.h +6 -3
- package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +95 -35
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -5
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +22 -9
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +24 -13
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +23 -13
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +10 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +10 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +17 -0
- package/src/llama.cpp/ggml/src/ggml-common.h +42 -42
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +288 -213
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +220 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.h +8 -0
- package/src/llama.cpp/ggml/src/{ggml-amx → ggml-cpu/amx}/common.h +19 -22
- package/src/llama.cpp/ggml/src/{ggml-amx → ggml-cpu/amx}/mmq.cpp +93 -92
- package/src/llama.cpp/ggml/src/{ggml-amx → ggml-cpu/amx}/mmq.h +2 -9
- package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +323 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-aarch64.c → ggml-cpu-aarch64.cpp} +892 -190
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +2 -24
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-hbm.cpp +55 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-hbm.h +8 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +15 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +38 -25
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-traits.cpp +36 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-traits.h +38 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +552 -399
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +101 -136
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +2 -2
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +7 -10
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +8 -0
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +4 -6
- package/src/llama.cpp/ggml/src/ggml-impl.h +32 -11
- package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +13 -9
- package/src/llama.cpp/ggml/src/ggml-kompute/ggml-kompute.cpp +131 -64
- package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +3 -6
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +39 -0
- package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +14 -7
- package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +147 -0
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +4004 -0
- package/src/llama.cpp/ggml/src/ggml-opt.cpp +67 -80
- package/src/llama.cpp/ggml/src/ggml-quants.c +0 -9
- package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +3 -5
- package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +5 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +13 -10
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +2 -11
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +2 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +5 -5
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +32 -13
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +80 -61
- package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +4 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +159 -114
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +3 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +6 -6
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +6 -20
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +4 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +8 -8
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +4 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +7 -7
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +4 -1
- package/src/llama.cpp/ggml/src/ggml-threading.h +4 -2
- package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +21 -7
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +1718 -399
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +3 -1
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +105 -31
- package/src/llama.cpp/ggml/src/ggml.c +367 -207
- package/src/llama.cpp/include/llama-cpp.h +25 -0
- package/src/llama.cpp/include/llama.h +26 -19
- package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.out +46 -0
- package/src/llama.cpp/pocs/CMakeLists.txt +3 -1
- package/src/llama.cpp/pocs/vdot/CMakeLists.txt +2 -2
- package/src/llama.cpp/src/CMakeLists.txt +2 -7
- package/src/llama.cpp/src/llama-grammar.cpp +15 -15
- package/src/llama.cpp/src/llama-grammar.h +2 -5
- package/src/llama.cpp/src/llama-sampling.cpp +35 -90
- package/src/llama.cpp/src/llama-vocab.cpp +6 -1
- package/src/llama.cpp/src/llama.cpp +1748 -640
- package/src/llama.cpp/src/unicode.cpp +62 -51
- package/src/llama.cpp/src/unicode.h +9 -10
- package/src/llama.cpp/tests/CMakeLists.txt +48 -37
- package/src/llama.cpp/tests/test-arg-parser.cpp +2 -2
- package/src/llama.cpp/tests/test-backend-ops.cpp +140 -21
- package/src/llama.cpp/tests/test-chat-template.cpp +50 -4
- package/src/llama.cpp/tests/test-gguf.cpp +1303 -0
- package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -6
- package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -4
- package/src/llama.cpp/tests/test-quantize-fns.cpp +3 -3
- package/src/llama.cpp/tests/test-rope.cpp +61 -20
- package/src/llama.cpp/tests/test-sampling.cpp +2 -2
- package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +0 -72
- package/src/llama.cpp/.github/workflows/nix-ci.yml +0 -79
- package/src/llama.cpp/.github/workflows/nix-flake-update.yml +0 -22
- package/src/llama.cpp/.github/workflows/nix-publish-flake.yml +0 -36
- package/src/llama.cpp/ggml/include/ggml-amx.h +0 -25
- package/src/llama.cpp/ggml/src/ggml-aarch64.c +0 -129
- package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -19
- package/src/llama.cpp/ggml/src/ggml-amx/CMakeLists.txt +0 -107
- package/src/llama.cpp/ggml/src/ggml-amx/ggml-amx.cpp +0 -446

The detailed diff below is available for three of the changed files: the llava example's CMakeLists.txt, clip.cpp, and clip.h.

package/src/llama.cpp/examples/llava/CMakeLists.txt

```diff
@@ -11,7 +11,7 @@ target_include_directories(llava PUBLIC .)
 target_include_directories(llava PUBLIC ../..)
 target_include_directories(llava PUBLIC ../../common)
 
-target_compile_features(llava PRIVATE …
+target_compile_features(llava PRIVATE cxx_std_17)
 
 add_library(llava_static STATIC $<TARGET_OBJECTS:llava>)
 if (BUILD_SHARED_LIBS)
```
```diff
@@ -35,11 +35,18 @@ add_executable(${TARGET} llava-cli.cpp)
 set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-llava-cli)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE …
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
 
 set(TARGET llama-minicpmv-cli)
 add_executable(${TARGET} minicpmv-cli.cpp)
 set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-minicpmv-cli)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE …
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
+
+set(TARGET llama-qwen2vl-cli)
+add_executable(${TARGET} qwen2vl-cli.cpp)
+set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-qwen2vl-cli)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
```
package/src/llama.cpp/examples/llava/clip.cpp

```diff
@@ -8,21 +8,25 @@
 #include "ggml-alloc.h"
 #include "ggml-backend.h"
 
-… (removed lines collapsed in the original diff view)
+//#ifdef GGML_USE_CUDA
+//#include "ggml-cuda.h"
+//#endif
+//
+//#ifdef GGML_USE_SYCL
+//#include "ggml-sycl.h"
+//#endif
+//
+//#ifdef GGML_USE_METAL
+//#include "ggml-metal.h"
+//#endif
+//
+//#ifdef GGML_USE_CANN
+//#include "ggml-cann.h"
+//#endif
+//
+//#ifdef GGML_USE_VULKAN
+//#include "ggml-vulkan.h"
+//#endif
 
 #define STB_IMAGE_IMPLEMENTATION
 #include "stb_image.h"
@@ -40,10 +44,17 @@
 #include <cinttypes>
 #include <limits>
 
-#…
-#define …
-#define …
-#define …
+#if defined(LLAVA_LOG_OFF)
+# define LOG_INF(...)
+# define LOG_WRN(...)
+# define LOG_ERR(...)
+# define LOG_DBG(...)
+#else // defined(LLAVA_LOG_OFF)
+# define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
+# define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
+# define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
+# define LOG_DBG(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
+#endif // defined(LLAVA_LOG_OFF)
 
 //#define CLIP_DEBUG_FUNCTIONS
 
@@ -91,7 +102,9 @@ static std::string format(const char * fmt, ...) {
 #define KEY_HAS_LLAVA_PROJ "clip.has_llava_projector"
 #define KEY_HAS_MINICPMV_PROJ "clip.has_minicpmv_projector"
 #define KEY_MINICPMV_VERSION "clip.minicpmv_version"
+#define KEY_HAS_QWEN2VL_MERGER "clip.has_qwen2vl_merger"
 #define KEY_USE_GELU "clip.use_gelu"
+#define KEY_USE_SILU "clip.use_silu"
 #define KEY_N_EMBD "clip.%s.embedding_length"
 #define KEY_N_FF "clip.%s.feed_forward_length"
 #define KEY_N_BLOCK "clip.%s.block_count"
@@ -118,7 +131,8 @@ static std::string format(const char * fmt, ...) {
 #define TN_TOKEN_EMBD "%s.token_embd.weight"
 #define TN_POS_EMBD "%s.position_embd.weight"
 #define TN_CLASS_EMBD "v.class_embd"
-#define TN_PATCH_EMBD "v.patch_embd.weight"
+#define TN_PATCH_EMBD "v.patch_embd.weight" // not rename tensor with ".0" postfix for backwrad compat
+#define TN_PATCH_EMBD_1 "v.patch_embd.weight.1"
 #define TN_PATCH_BIAS "v.patch_embd.bias"
 #define TN_ATTN_K "%s.blk.%d.attn_k.%s"
 #define TN_ATTN_Q "%s.blk.%d.attn_q.%s"
@@ -152,6 +166,7 @@ enum projector_type {
     PROJECTOR_TYPE_LDP,
     PROJECTOR_TYPE_LDPV2,
     PROJECTOR_TYPE_RESAMPLER,
+    PROJECTOR_TYPE_MERGER,
    PROJECTOR_TYPE_UNKNOWN,
 };
 
@@ -160,6 +175,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
     { PROJECTOR_TYPE_LDP, "ldp" },
     { PROJECTOR_TYPE_LDPV2, "ldpv2"},
     { PROJECTOR_TYPE_RESAMPLER, "resampler"},
+    { PROJECTOR_TYPE_MERGER, "qwen2vl_merger"},
 };
 
 
```
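The logging macros introduced above are local to clip.cpp and become no-ops when LLAVA_LOG_OFF is defined. A minimal standalone sketch of the same pattern (illustration only, not part of the package):

```cpp
// Illustration of the LLAVA_LOG_OFF pattern from the hunk above (standalone sketch, not package code).
#include <cstdio>

#if defined(LLAVA_LOG_OFF)
#    define LOG_INF(...)
#    define LOG_ERR(...)
#else
#    define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
#    define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
#endif

int main() {
    LOG_INF("%s: loading vision encoder\n", __func__);                                  // stdout
    LOG_ERR("%s: failed to load tensor '%s'\n", __func__, "v.patch_embd.weight.1");     // stderr
    return 0;
}
```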
```diff
@@ -452,7 +468,8 @@ struct clip_vision_model {
 
     // embeddings
     struct ggml_tensor * class_embedding;
-    struct ggml_tensor * …
+    struct ggml_tensor * patch_embeddings_0;
+    struct ggml_tensor * patch_embeddings_1;  // second Conv2D kernel when we decouple Conv3D along temproal dimension (Qwen2VL)
     struct ggml_tensor * patch_bias;
     struct ggml_tensor * position_embeddings;
 
@@ -542,6 +559,7 @@ struct clip_ctx {
     bool has_vision_encoder = false;
     bool has_llava_projector = false;
     bool has_minicpmv_projector = false;
+    bool has_qwen2vl_merger = false;
     int minicpmv_version = 2;
 
     struct clip_vision_model vision_model;
@@ -550,6 +568,7 @@ struct clip_ctx {
     float image_mean[3];
     float image_std[3];
     bool use_gelu = false;
+    bool use_silu = false;
     int32_t ftype = 1;
 
     bool has_class_embedding = true;
@@ -595,14 +614,26 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
             image_size_height = imgs->data->ny;
         }
     }
+    else if (ctx->has_qwen2vl_merger) {
+        // use the image's native resolution when image is avaible
+        if (is_inf) {
+            // if (imgs->data->nx && imgs->data->ny) {
+            image_size_width  = imgs->data->nx;
+            image_size_height = imgs->data->ny;
+        }
+    }
     const int patch_size = hparams.patch_size;
     const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
+    const int patches_w = image_size_width / patch_size;
+    const int patches_h = image_size_height / patch_size;
     const int num_positions = num_patches + (ctx->has_class_embedding ? 1 : 0);
+    const int num_position_ids = ctx->has_qwen2vl_merger ? num_positions * 4 : num_positions;
     const int hidden_size = hparams.hidden_size;
     const int n_head = hparams.n_head;
     const int d_head = hidden_size / n_head;
     int n_layer = hparams.n_layer;
     const float eps = hparams.eps;
+    int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4};
 
     const int batch_size = imgs->size;
 
```
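For orientation, the new bookkeeping above can be checked with a small arithmetic sketch. The concrete numbers below (patch size 14, a 336x476 input, hidden size 1280, 16 heads) are assumptions for illustration, not values taken from the diff:

```cpp
// Illustrative check of the Qwen2-VL patch/position arithmetic added above (assumed sizes, not package code).
#include <cstdio>

int main() {
    const int patch_size        = 14;    // assumed ViT patch size
    const int image_size_width  = 336;
    const int image_size_height = 476;
    const int hidden_size       = 1280;  // assumption
    const int n_head            = 16;    // assumption

    const int patches_w        = image_size_width  / patch_size;   // 24
    const int patches_h        = image_size_height / patch_size;   // 34
    const int num_patches      = patches_w * patches_h;            // 816
    const int num_positions    = num_patches;                      // assumption: no class token on this path
    const int num_position_ids = num_positions * 4;                // 3264: four position ids per patch (M-RoPE)
    const int d_head           = hidden_size / n_head;             // 80
    const int mrope_section    = d_head / 4;                       // 20 rotary dims per M-RoPE section

    printf("patches_w=%d patches_h=%d num_patches=%d\n", patches_w, patches_h, num_patches);
    printf("num_position_ids=%d d_head=%d mrope_section=%d\n", num_position_ids, d_head, mrope_section);
    return 0;
}
```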
```diff
@@ -623,10 +654,30 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
     ggml_set_name(inp_raw, "inp_raw");
     ggml_set_input(inp_raw);
 
-    struct ggml_tensor * inp = ggml_conv_2d(ctx0, model.…
+    struct ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
 
-…
+    if (ctx->has_qwen2vl_merger) {
+        GGML_ASSERT(image_size_width % (patch_size * 2) == 0);
+        GGML_ASSERT(image_size_height % (patch_size * 2) == 0);
+
+        auto inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
+        inp = ggml_add(ctx0, inp, inp_1);
+        inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 2, 0, 3));  // [w, h, c, b] -> [c, w, h, b]
+        inp = ggml_reshape_4d(
+            ctx0, inp,
+            hidden_size * 2, patches_w / 2, patches_h, batch_size);
+        inp = ggml_reshape_4d(
+            ctx0, inp,
+            hidden_size * 2, patches_w / 2, 2, batch_size * (patches_h / 2));
+        inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 0, 2, 1, 3));
+        inp = ggml_reshape_3d(
+            ctx0, inp,
+            hidden_size, patches_w * patches_h, batch_size);
+    }
+    else {
+        inp = ggml_reshape_3d(ctx0, inp, num_patches, hidden_size, batch_size);
+        inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3));
+    }
 
     if (ctx->has_patch_bias) {
         // inp = ggml_add(ctx0, inp, ggml_repeat(ctx0, model.patch_bias, inp));
@@ -648,12 +699,14 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
         }
     }
 
-    struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, …
+    struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_position_ids);
     ggml_set_name(positions, "positions");
     ggml_set_input(positions);
 
-…
+    if (!ctx->has_qwen2vl_merger) { // qwen2vl use rope position embedding
+        embeddings =
+            ggml_add(ctx0, embeddings, ggml_get_rows(ctx0, model.position_embeddings, positions));
+    }
 
     if (ctx->has_minicpmv_projector) {
         int pos_w = image_size_width/patch_size;
@@ -677,7 +730,8 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
     }
 
     // loop over layers
-    if (ctx->has_minicpmv_projector) {
+    if (ctx->has_minicpmv_projector || ctx->has_qwen2vl_merger) {
+        // TODO: figure out why we doing thing in this way ???
         n_layer += 1;
     }
     for (int il = 0; il < n_layer - 1; il++) {
@@ -699,8 +753,13 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
         struct ggml_tensor * Q =
             ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].q_w, cur), model.layers[il].q_b);
 
-        Q = ggml_scale_inplace(ctx0, Q, 1.0f / sqrt((float)d_head));
         Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, num_positions, batch_size);
+        if (ctx->has_qwen2vl_merger) {
+            Q = ggml_rope_multi(
+                ctx0, Q, positions, nullptr,
+                d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);
+        }
+        Q = ggml_scale_inplace(ctx0, Q, 1.0f / sqrt((float)d_head));
         Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3));
         Q = ggml_reshape_3d(ctx0, Q, d_head, num_positions, n_head * batch_size);
 
@@ -708,6 +767,11 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
             ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].k_w, cur), model.layers[il].k_b);
 
         K = ggml_reshape_4d(ctx0, K, d_head, n_head, num_positions, batch_size);
+        if (ctx->has_qwen2vl_merger) {
+            K = ggml_rope_multi(
+                ctx0, K, positions, nullptr,
+                d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);
+        }
         K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3));
         K = ggml_reshape_3d(ctx0, K, d_head, num_positions, n_head * batch_size);
 
@@ -747,6 +811,8 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
 
         if (ctx->use_gelu) {
            cur = ggml_gelu_inplace(ctx0, cur);
+        } else if (ctx->use_silu) {
+            cur = ggml_silu_inplace(ctx0, cur);
         } else {
             cur = ggml_gelu_quick_inplace(ctx0, cur);
         }
@@ -758,6 +824,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
         cur = ggml_add(ctx0, embeddings, cur);
 
         embeddings = cur;
+
     }
 
     // post-layernorm
```
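The reshape sequence above interleaves every 2x2 group of patches so that a later reshape can hand them to the merger as one token. A toy trace of the tensor shapes, using the same formulas with assumed sizes (hidden size 1280, a 24x34 patch grid, batch 1):

```cpp
// Toy trace of the Qwen2-VL patch-merge reshapes above (assumed sizes; illustration, not package code).
#include <cstdio>

int main() {
    const int hidden_size = 1280, patches_w = 24, patches_h = 34, batch_size = 1;

    // after the two conv2d kernels are summed and permuted: [c, w, h, b]
    printf("conv out      : [%d, %d, %d, %d]\n", hidden_size, patches_w, patches_h, batch_size);
    // pair up horizontally adjacent patches
    printf("reshape_4d #1 : [%d, %d, %d, %d]\n", hidden_size * 2, patches_w / 2, patches_h, batch_size);
    // split rows into pairs as well, so each 2x2 block becomes contiguous
    printf("reshape_4d #2 : [%d, %d, %d, %d]\n", hidden_size * 2, patches_w / 2, 2, batch_size * (patches_h / 2));
    // back to one token per original patch; the 2x2 grouping is now encoded in the token order
    printf("reshape_3d    : [%d, %d, %d]\n", hidden_size, patches_w * patches_h, batch_size);
    // the merger projector later views this as [hidden*4, num_patches/4]
    printf("merger view   : [%d, %d]\n", hidden_size * 4, (patches_w * patches_h) / 4);
    return 0;
}
```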
```diff
@@ -829,7 +896,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
         mlp_3 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_3, 1, 0, 2, 3));
         mlp_3 = ggml_reshape_4d(ctx0, mlp_3, n_patch, n_patch, mlp_3->ne[1], mlp_3->ne[2]);
         // stride = 1, padding = 1, bias is nullptr
-        block_1 = …
+        block_1 = ggml_conv_2d_dw(ctx0, model.mm_model_block_1_block_0_0_w, mlp_3, 1, 1, 1, 1, 1, 1);
 
         // layer norm
         // // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
@@ -877,7 +944,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
         // block_2
         {
             // stride = 2
-            block_1 = …
+            block_1 = ggml_conv_2d_dw(ctx0, model.mm_model_block_2_block_0_0_w, block_1, 2, 2, 1, 1, 1, 1);
 
             // block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1]
             // layer norm
@@ -938,7 +1005,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
             // mlp_2 ne [24, 24, 2048, 1]
             mlp_2 = ggml_pool_2d(ctx0, mlp_2, GGML_OP_POOL_AVG, 2, 2, 2, 2, 0, 0);
             // weight ne = [3, 3, 2048, 1]
-            struct ggml_tensor * peg_0 = …
+            struct ggml_tensor * peg_0 = ggml_conv_2d_dw(ctx0, model.mm_model_peg_0_w, mlp_2, 1, 1, 1, 1, 1, 1);
             peg_0 = ggml_cont(ctx0, ggml_permute(ctx0, peg_0, 1, 2, 0, 3));
             peg_0 = ggml_add(ctx0, peg_0, model.mm_model_peg_0_b);
             mlp_2 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_2, 1, 2, 0, 3));
@@ -1019,6 +1086,19 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
             GGML_ASSERT(false);
         }
     }
+    else if (ctx->proj_type == PROJECTOR_TYPE_MERGER) {
+        embeddings = ggml_reshape_3d(ctx0, embeddings, hidden_size * 4, num_positions / 4, batch_size);
+
+        embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
+        embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
+
+        // GELU activation
+        embeddings = ggml_gelu(ctx0, embeddings);
+
+        // Second linear layer
+        embeddings = ggml_mul_mat(ctx0, model.mm_1_w, embeddings);
+        embeddings = ggml_add(ctx0, embeddings, model.mm_1_b);
+    }
 
     // build the graph
     ggml_build_forward_expand(gf, embeddings);
```
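The merger projector above is a two-layer MLP applied to each merged 2x2 patch group: linear, GELU, linear. A plain-array reference sketch of the same computation (dimensions are assumptions; the first layer is assumed square, as in the Qwen2-VL merger; this is an illustration, not the package's ggml implementation):

```cpp
// Reference sketch of the qwen2vl_merger projector: linear -> GELU -> linear per merged token.
#include <cmath>
#include <vector>

// y[n_out] = W[n_out][n_in] * x[n_in] + b[n_out], W stored row-major
static std::vector<float> linear(const std::vector<float> & W, const std::vector<float> & b,
                                 const std::vector<float> & x, int n_out, int n_in) {
    std::vector<float> y(n_out);
    for (int o = 0; o < n_out; ++o) {
        float acc = b[o];
        for (int i = 0; i < n_in; ++i) acc += W[o * n_in + i] * x[i];
        y[o] = acc;
    }
    return y;
}

// tanh approximation of GELU
static float gelu(float v) { return 0.5f * v * (1.0f + std::tanh(0.7978845608f * (v + 0.044715f * v * v * v))); }

// One merged token: 4 patch embeddings of size `hidden` concatenated -> projected to the LLM width `n_embd`.
static std::vector<float> merger_forward(const std::vector<float> & x /* hidden*4 */,
                                         const std::vector<float> & w0, const std::vector<float> & b0,
                                         const std::vector<float> & w1, const std::vector<float> & b1,
                                         int hidden, int n_embd) {
    std::vector<float> h = linear(w0, b0, x, hidden * 4, hidden * 4); // mm_0: (hidden*4) -> (hidden*4), assumed square
    for (float & v : h) v = gelu(v);
    return linear(w1, b1, h, n_embd, hidden * 4);                     // mm_1: (hidden*4) -> n_embd
}

int main() {
    const int hidden = 4, n_embd = 6;  // toy sizes
    std::vector<float> x(hidden * 4, 0.1f);
    std::vector<float> w0(hidden * 4 * hidden * 4, 0.01f), b0(hidden * 4, 0.0f);
    std::vector<float> w1(n_embd * hidden * 4, 0.02f),     b1(n_embd, 0.0f);
    std::vector<float> y = merger_forward(x, w0, b0, w1, b1, hidden, n_embd);
    return (int)y.size() == n_embd ? 0 : 1;
}
```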
```diff
@@ -1142,25 +1222,30 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
         }
     }
 
-…
-        new_clip->backend = ggml_backend_cuda_init(0);
-        LOG_INF("%s: CLIP using CUDA backend\n", __func__);
-…
-        new_clip->backend = ggml_backend_metal_init();
-        LOG_INF("%s: CLIP using Metal backend\n", __func__);
-…
-        new_clip->backend = ggml_backend_cann_init(0);
-        LOG_INF("%s: CLIP using CANN backend\n", __func__);
-…
-        new_clip->backend = ggml_backend_vk_init(0);
-        LOG_INF("%s: CLIP using Vulkan backend\n", __func__);
-…
+//#ifdef GGML_USE_CUDA
+//    new_clip->backend = ggml_backend_cuda_init(0);
+//    LOG_INF("%s: CLIP using CUDA backend\n", __func__);
+//#endif
+//
+//#ifdef GGML_USE_METAL
+//    new_clip->backend = ggml_backend_metal_init();
+//    LOG_INF("%s: CLIP using Metal backend\n", __func__);
+//#endif
+//
+//#ifdef GGML_USE_CANN
+//    new_clip->backend = ggml_backend_cann_init(0);
+//    LOG_INF("%s: CLIP using CANN backend\n", __func__);
+//#endif
+//
+//#ifdef GGML_USE_VULKAN
+//    new_clip->backend = ggml_backend_vk_init(0);
+//    LOG_INF("%s: CLIP using Vulkan backend\n", __func__);
+//#endif
+//
+//#ifdef GGML_USE_SYCL
+//    new_clip->backend = ggml_backend_sycl_init(0);
+//    LOG_INF("%s: CLIP using SYCL backend\n", __func__);
+//#endif
 
     if (!new_clip->backend) {
         new_clip->backend = ggml_backend_cpu_init();
@@ -1190,6 +1275,10 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
         new_clip->minicpmv_version = gguf_get_val_i32(ctx, idx);
     }
 
+    idx = gguf_find_key(ctx, KEY_HAS_QWEN2VL_MERGER);
+    if (idx != -1) {
+        new_clip->has_qwen2vl_merger = gguf_get_val_bool(ctx, idx);
+    }
     // GGML_ASSERT(new_clip->has_llava_projector); // see monatis/clip.cpp for image and/or text encoding for semantic search
 
     GGML_ASSERT(new_clip->has_vision_encoder);
@@ -1198,6 +1287,13 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
     idx = get_key_idx(ctx, KEY_USE_GELU);
     new_clip->use_gelu = gguf_get_val_bool(ctx, idx);
 
+    try {
+        idx = get_key_idx(ctx, KEY_USE_SILU);
+        new_clip->use_silu = gguf_get_val_bool(ctx, idx);
+    } catch (std::runtime_error & /*e*/) {
+        new_clip->use_silu = false;
+    }
+
     if (verbosity >= 1) {
         LOG_INF("%s: text_encoder:   %d\n", __func__, new_clip->has_text_encoder);
         LOG_INF("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder);
@@ -1373,11 +1469,16 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
     }
 
     try {
-        vision_model.…
+        vision_model.patch_embeddings_0 = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD);
         vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v"));
     } catch(const std::exception& /*e*/) {
         LOG_ERR("%s: failed to load vision model tensors\n", __func__);
     }
+    try {
+        vision_model.patch_embeddings_1 = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD_1);
+    } catch(const std::exception& /*e*/) {
+        new_clip->has_qwen2vl_merger = false;
+    }
 
     // LLaVA projection
     if (new_clip->proj_type == PROJECTOR_TYPE_MLP || new_clip->proj_type == PROJECTOR_TYPE_MLP_NORM) {
@@ -1465,6 +1566,12 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
         vision_model.mm_model_ln_post_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "post", "weight"));
         vision_model.mm_model_ln_post_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "post", "bias"));
     }
+    else if (new_clip->proj_type == PROJECTOR_TYPE_MERGER) {
+        vision_model.mm_0_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "weight"));
+        vision_model.mm_0_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "bias"));
+        vision_model.mm_1_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "weight"));
+        vision_model.mm_1_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "bias"));
+    }
     else {
         std::string proj_type = PROJECTOR_TYPE_NAMES[new_clip->proj_type];
         throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str()));
@@ -1503,6 +1610,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
     new_clip->compute_alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(new_clip->backend));
     clip_image_f32_batch batch;
     batch.size = 1;
+    batch.data = nullptr;
     ggml_cgraph * gf = clip_image_build_graph(new_clip, &batch, nullptr, false);
     ggml_gallocr_reserve(new_clip->compute_alloc, gf);
     size_t compute_memory_buffer_size = ggml_gallocr_get_buffer_size(new_clip->compute_alloc, 0);
@@ -1516,6 +1624,10 @@ void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size
     ctx_clip->load_image_size = load_image_size;
 }
 
+struct clip_image_size * clip_get_load_image_size(struct clip_ctx * ctx_clip) {
+    return ctx_clip->load_image_size;
+}
+
 struct clip_image_size * clip_image_size_init() {
     struct clip_image_size * load_image_size = new struct clip_image_size();
     load_image_size->width = 448;
```
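Two patterns appear above for optional GGUF keys: gguf_find_key returns -1 when a key is absent, while the local get_key_idx helper throws, so the SiLU flag is probed inside a try/catch. A condensed sketch of the non-throwing variant, reusing only the gguf calls visible in the hunk (the key name in the usage comment is just an example):

```cpp
// Sketch of probing an optional GGUF key, mirroring the pattern above (illustrative, not package code).
// Assumes the gguf API declared by ggml at this vendored version (gguf_find_key / gguf_get_val_bool).
#include "ggml.h"

static bool read_optional_bool(const struct gguf_context * ctx, const char * key, bool fallback) {
    const int idx = gguf_find_key(ctx, key);   // -1 when the key is missing
    return idx == -1 ? fallback : gguf_get_val_bool(ctx, idx);
}

// usage, e.g.:
// bool use_silu = read_optional_bool(ctx, "clip.use_silu", /*fallback=*/false);
```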
```diff
@@ -1968,6 +2080,23 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
         }
         return true;
     }
+    else if (ctx->has_qwen2vl_merger) {
+        clip_image_u8 * resized = clip_image_u8_init();
+        auto patch_size = clip_patch_size(ctx) * 2;
+        int nx = ceil((float)img->nx / patch_size) * patch_size;
+        int ny = ceil((float)img->ny / patch_size) * patch_size;
+        bicubic_resize(*img, *resized, nx, ny);
+
+        res_imgs->data = new clip_image_f32[1];
+        // clip_image_f32 * res = clip_image_f32_init();
+        normalize_image_u8_to_f32(resized, res_imgs->data, ctx->image_mean, ctx->image_std);
+        // res_imgs->data[0] = *res;
+        res_imgs->size = 1;
+
+        // clip_image_f32_free(res);
+        clip_image_u8_free(resized);
+        return true;
+    }
 
     bool pad_to_square = true;
     if (!ctx->has_vision_encoder) {
@@ -2157,6 +2286,13 @@ size_t clip_embd_nbytes(const struct clip_ctx * ctx) {
     return clip_n_patches(ctx) * clip_n_mmproj_embd(ctx) * sizeof(float);
 }
 
+size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_h, int img_w) {
+    clip_image_f32 img;
+    img.nx = img_w;
+    img.ny = img_h;
+    return clip_n_patches_by_img(ctx, &img) * clip_n_mmproj_embd(ctx) * sizeof(float);
+}
+
 int32_t clip_image_size(const struct clip_ctx * ctx) {
     return ctx->vision_model.hparams.image_size;
 }
@@ -2178,6 +2314,13 @@ const int32_t * clip_image_grid(const struct clip_ctx * ctx) {
 }
 
 int clip_n_patches(const struct clip_ctx * ctx) {
+    clip_image_f32 img;
+    img.nx = ctx->vision_model.hparams.image_size;
+    img.ny = ctx->vision_model.hparams.image_size;
+    return clip_n_patches_by_img(ctx, &img);
+}
+
+int clip_n_patches_by_img(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
     const auto & params = ctx->vision_model.hparams;
 
     int n_patches = (params.image_size / params.patch_size) * (params.image_size / params.patch_size);
@@ -2191,6 +2334,11 @@ int clip_n_patches(const struct clip_ctx * ctx) {
         else if (ctx->minicpmv_version == 3) {
             n_patches = 64;
         }
+    } else if (ctx->proj_type == PROJECTOR_TYPE_MERGER) {
+        int patch_size = params.patch_size * 2;
+        int x_patch = img->nx / patch_size + (int)(img->nx % patch_size > 0);
+        int y_patch = img->ny / patch_size + (int)(img->ny % patch_size > 0);
+        n_patches = x_patch * y_patch;
     }
 
     return n_patches;
```
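clip_image_preprocess above rounds both image dimensions up to a multiple of twice the patch size, and clip_n_patches_by_img then counts merged patches on that padded grid. A small arithmetic check with assumed numbers (patch size 14, an 800x600 input), for illustration only:

```cpp
// Worked example of the Qwen2-VL preprocessing/patch-count arithmetic above (assumed sizes, not package code).
#include <cmath>
#include <cstdio>

int main() {
    const int patch_size = 14;             // assumed ViT patch size
    const int merged     = patch_size * 2; // images are padded to multiples of 2*patch_size

    const int nx = 800, ny = 600;
    const int padded_nx = (int)std::ceil((float)nx / merged) * merged;  // 812
    const int padded_ny = (int)std::ceil((float)ny / merged) * merged;  // 616

    // clip_n_patches_by_img counts patches of size 2*patch_size on the padded image
    const int x_patch = padded_nx / merged + (int)(padded_nx % merged > 0);  // 29
    const int y_patch = padded_ny / merged + (int)(padded_ny % merged > 0);  // 22
    printf("padded %dx%d -> %d x %d = %d merged patches\n",
           padded_nx, padded_ny, x_patch, y_patch, x_patch * y_patch);       // 638
    return 0;
}
```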
```diff
@@ -2319,7 +2467,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
     const int image_size = hparams.image_size;
     int image_size_width  = image_size;
     int image_size_height = image_size;
-    if (ctx->has_minicpmv_projector) {
+    if (ctx->has_minicpmv_projector | ctx->has_qwen2vl_merger) {
         image_size_width  = imgs->data[0].nx;
         image_size_height = imgs->data[0].ny;
     }
@@ -2339,7 +2487,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
     for (size_t i = 0; i < imgs->size; i++) {
         const int nx = imgs->data[i].nx;
         const int ny = imgs->data[i].ny;
-        if (!ctx->has_minicpmv_projector) {
+        if (!(ctx->has_minicpmv_projector | ctx->has_qwen2vl_merger)) {
             GGML_ASSERT(nx == image_size && ny == image_size);
         }
 
@@ -2397,9 +2545,9 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
         auto pos_embed_t = get_2d_sincos_pos_embed(embed_dim, std::make_pair(pos_w, pos_h));
 
         float * pos_embed_data = (float *)malloc(ggml_nbytes(pos_embed));
-        for(int i=0;i<pos_w * pos_h…
-        for(int j=0;j<embed_dim…
-        pos_embed_data[i*embed_dim+j]=pos_embed_t[i][j];
+        for(int i=0;i < pos_w * pos_h; ++i){
+            for(int j=0; j < embed_dim; ++j){
+                pos_embed_data[i * embed_dim + j] = pos_embed_t[i][j];
             }
         }
 
@@ -2419,7 +2567,34 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
         }
     }
 
-    {
+    if (ctx->has_qwen2vl_merger) {
+        struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions");
+
+        const int pw = image_size_width / patch_size;
+        const int ph = image_size_height / patch_size;
+        int* positions_data = (int*)malloc(ggml_nbytes(positions));
+
+        int ptr = 0;
+        for (int y = 0; y < ph; y+=2)
+        {
+            for (int x = 0; x < pw; x+=2)
+            {
+                for (int dy = 0; dy < 2; dy++) {
+                    for (int dx = 0; dx < 2; dx++) {
+                        positions_data[ptr] = y + dy;
+                        positions_data[num_patches + ptr] = x + dx;
+                        positions_data[num_patches * 2 + ptr] = y + dy;
+                        positions_data[num_patches * 3 + ptr] = x + dx;
+                        ptr++;
+                    }
+                }
+            }
+        }
+
+        ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions));
+        free(positions_data);
+    }
+    else {
         struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions");
 
         int* positions_data = (int*)malloc(ggml_nbytes(positions));
@@ -2428,16 +2603,16 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
         }
         ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions));
         free(positions_data);
-    }
 
-…
+        {
+            struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches");
+            int* patches_data = (int*)malloc(ggml_nbytes(patches));
+            for (int i = 0; i < num_patches; i++) {
+                patches_data[i] = i + 1;
+            }
+            ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches));
+            free(patches_data);
         }
-        ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches));
-        free(patches_data);
     }
 }
 
```
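The nested loops above fill four planes of position ids (one per M-RoPE section), visiting patches in 2x2 blocks so that merged tokens stay adjacent. A tiny standalone version that prints the resulting layout for an assumed 4x4 patch grid:

```cpp
// Standalone illustration of the M-RoPE position-id layout built above (assumed 4x4 grid, not package code).
#include <cstdio>
#include <vector>

int main() {
    const int pw = 4, ph = 4;                 // patch grid (assumption)
    const int num_patches = pw * ph;
    std::vector<int> positions(num_patches * 4);

    int ptr = 0;
    for (int y = 0; y < ph; y += 2) {
        for (int x = 0; x < pw; x += 2) {
            for (int dy = 0; dy < 2; dy++) {
                for (int dx = 0; dx < 2; dx++) {
                    positions[ptr]                   = y + dy;  // section 0: row
                    positions[num_patches     + ptr] = x + dx;  // section 1: column
                    positions[num_patches * 2 + ptr] = y + dy;  // section 2: row again
                    positions[num_patches * 3 + ptr] = x + dx;  // section 3: column again
                    ptr++;
                }
            }
        }
    }

    for (int s = 0; s < 4; s++) {
        printf("section %d:", s);
        for (int i = 0; i < num_patches; i++) printf(" %d", positions[s * num_patches + i]);
        printf("\n");
    }
    return 0;
}
```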
```diff
@@ -2610,6 +2785,9 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
             return 3584;
         }
     }
+    if (ctx->proj_type == PROJECTOR_TYPE_MERGER) {
+        return ctx->vision_model.mm_1_b->ne[0];
+    }
 
     std::string proj_type = PROJECTOR_TYPE_NAMES[ctx->proj_type];
     throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str()));
@@ -2621,3 +2799,21 @@ int clip_is_minicpmv(const struct clip_ctx * ctx) {
     }
     return 0;
 }
+
+bool clip_is_qwen2vl(const struct clip_ctx * ctx) {
+    return ctx->has_qwen2vl_merger;
+}
+
+
+bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec) {
+    clip_image_f32 clip_img;
+    clip_img.buf.resize(h * w * 3);
+    for (int i = 0; i < h*w*3; i++)
+    {
+        clip_img.buf[i] = img[i];
+    }
+    clip_img.nx = w;
+    clip_img.ny = h;
+    clip_image_encode(ctx, n_threads, &clip_img, vec);
+    return true;
+}
```
package/src/llama.cpp/examples/llava/clip.h

```diff
@@ -45,6 +45,7 @@ CLIP_API struct clip_ctx * clip_model_load_cpu(const char * fname, int verbosity
 CLIP_API void clip_free(struct clip_ctx * ctx);
 
 CLIP_API size_t clip_embd_nbytes(const struct clip_ctx * ctx);
+CLIP_API size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_h, int img_w);
 
 CLIP_API int32_t clip_image_size (const struct clip_ctx * ctx);
 CLIP_API int32_t clip_patch_size (const struct clip_ctx * ctx);
@@ -55,11 +56,13 @@ CLIP_API const char * clip_patch_merge_type(const struct clip_ctx * ctx);
 
 CLIP_API const int32_t * clip_image_grid(const struct clip_ctx * ctx);
 
-CLIP_API int clip_n_patches …
-CLIP_API int …
+CLIP_API int clip_n_patches        (const struct clip_ctx * ctx);
+CLIP_API int clip_n_patches_by_img (const struct clip_ctx * ctx, struct clip_image_f32 * img);
+CLIP_API int clip_n_mmproj_embd    (const struct clip_ctx * ctx);
 
 CLIP_API int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip);
 CLIP_API void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size);
+CLIP_API struct clip_image_size * clip_get_load_image_size(struct clip_ctx * ctx_clip);
 
 CLIP_API struct clip_image_size * clip_image_size_init();
 CLIP_API struct clip_image_u8  * clip_image_u8_init ();
@@ -86,6 +89,9 @@ CLIP_API bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, cons
 CLIP_API bool clip_model_quantize(const char * fname_inp, const char * fname_out, int itype);
 
 CLIP_API int clip_is_minicpmv(const struct clip_ctx * ctx);
+CLIP_API bool clip_is_qwen2vl(const struct clip_ctx * ctx);
+
+CLIP_API bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec);
 
 #ifdef __cplusplus
 }
```
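Taken together, the new clip.h entry points let a caller size and fill an embedding buffer for an arbitrary-resolution image. A hedged usage sketch restricted to functions visible in this diff; it assumes the example's clip.cpp is compiled in, and the mmproj path, thread count, and image size are placeholders:

```cpp
// Usage sketch for the new clip.h API above; uses only functions shown in this diff (illustration only).
#include "clip.h"
#include <cstdio>
#include <vector>

int main() {
    struct clip_ctx * ctx = clip_model_load("mmproj-model.gguf", /*verbosity=*/1);  // placeholder path
    if (!ctx || !clip_is_qwen2vl(ctx)) {
        fprintf(stderr, "not a Qwen2-VL mmproj model\n");
        return 1;
    }

    const int w = 448, h = 336;                    // placeholder image size
    std::vector<float> rgb(w * h * 3, 0.5f);       // normalized float RGB image (dummy data)

    // size the output buffer for this resolution, then encode
    std::vector<float> embd(clip_embd_nbytes_by_img(ctx, h, w) / sizeof(float));
    clip_encode_float_image(ctx, /*n_threads=*/4, rgb.data(), h, w, embd.data());

    printf("got %zu floats of image embedding\n", embd.size());
    clip_free(ctx);
    return 0;
}
```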