@fugood/llama.node 0.3.3 → 0.3.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (225)
  1. package/CMakeLists.txt +5 -0
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  8. package/bin/win32/arm64/llama-node.node +0 -0
  9. package/bin/win32/arm64/node.lib +0 -0
  10. package/bin/win32/x64/llama-node.node +0 -0
  11. package/bin/win32/x64/node.lib +0 -0
  12. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  13. package/bin/win32-vulkan/arm64/node.lib +0 -0
  14. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/x64/node.lib +0 -0
  16. package/lib/binding.ts +18 -1
  17. package/package.json +1 -1
  18. package/src/EmbeddingWorker.cpp +15 -5
  19. package/src/EmbeddingWorker.h +2 -1
  20. package/src/LlamaCompletionWorker.cpp +1 -1
  21. package/src/LlamaContext.cpp +81 -18
  22. package/src/LlamaContext.h +2 -0
  23. package/src/llama.cpp/.github/workflows/build.yml +197 -159
  24. package/src/llama.cpp/.github/workflows/docker.yml +5 -8
  25. package/src/llama.cpp/.github/workflows/python-lint.yml +8 -1
  26. package/src/llama.cpp/.github/workflows/server.yml +21 -14
  27. package/src/llama.cpp/CMakeLists.txt +11 -6
  28. package/src/llama.cpp/Sources/llama/llama.h +4 -0
  29. package/src/llama.cpp/cmake/common.cmake +33 -0
  30. package/src/llama.cpp/cmake/x64-windows-llvm.cmake +11 -0
  31. package/src/llama.cpp/common/CMakeLists.txt +6 -2
  32. package/src/llama.cpp/common/arg.cpp +426 -245
  33. package/src/llama.cpp/common/common.cpp +143 -80
  34. package/src/llama.cpp/common/common.h +81 -24
  35. package/src/llama.cpp/common/sampling.cpp +53 -19
  36. package/src/llama.cpp/common/sampling.h +22 -1
  37. package/src/llama.cpp/common/speculative.cpp +274 -0
  38. package/src/llama.cpp/common/speculative.h +28 -0
  39. package/src/llama.cpp/docs/build.md +101 -148
  40. package/src/llama.cpp/examples/CMakeLists.txt +32 -13
  41. package/src/llama.cpp/examples/batched/CMakeLists.txt +1 -1
  42. package/src/llama.cpp/examples/batched/batched.cpp +5 -4
  43. package/src/llama.cpp/examples/batched-bench/CMakeLists.txt +1 -1
  44. package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +1 -1
  45. package/src/llama.cpp/examples/cvector-generator/CMakeLists.txt +1 -1
  46. package/src/llama.cpp/examples/deprecation-warning/deprecation-warning.cpp +1 -1
  47. package/src/llama.cpp/examples/embedding/CMakeLists.txt +1 -1
  48. package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +3 -2
  49. package/src/llama.cpp/examples/export-lora/CMakeLists.txt +1 -1
  50. package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +1 -1
  51. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +4 -7
  52. package/src/llama.cpp/examples/gen-docs/CMakeLists.txt +1 -1
  53. package/src/llama.cpp/examples/gguf/CMakeLists.txt +1 -1
  54. package/src/llama.cpp/examples/gguf-hash/CMakeLists.txt +8 -1
  55. package/src/llama.cpp/examples/gguf-split/CMakeLists.txt +1 -1
  56. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +2 -2
  57. package/src/llama.cpp/examples/gritlm/CMakeLists.txt +1 -1
  58. package/src/llama.cpp/examples/gritlm/gritlm.cpp +1 -1
  59. package/src/llama.cpp/examples/imatrix/CMakeLists.txt +1 -1
  60. package/src/llama.cpp/examples/imatrix/imatrix.cpp +11 -2
  61. package/src/llama.cpp/examples/infill/CMakeLists.txt +1 -1
  62. package/src/llama.cpp/examples/infill/infill.cpp +1 -1
  63. package/src/llama.cpp/examples/llama-bench/CMakeLists.txt +1 -1
  64. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +405 -316
  65. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
  66. package/src/llama.cpp/examples/llava/CMakeLists.txt +10 -3
  67. package/src/llama.cpp/examples/llava/clip.cpp +262 -66
  68. package/src/llama.cpp/examples/llava/clip.h +8 -2
  69. package/src/llama.cpp/examples/llava/llava-cli.cpp +1 -1
  70. package/src/llama.cpp/examples/llava/llava.cpp +46 -19
  71. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +1 -1
  72. package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp +581 -0
  73. package/src/llama.cpp/examples/lookahead/CMakeLists.txt +1 -1
  74. package/src/llama.cpp/examples/lookahead/lookahead.cpp +1 -1
  75. package/src/llama.cpp/examples/lookup/CMakeLists.txt +4 -4
  76. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +2 -1
  77. package/src/llama.cpp/examples/lookup/lookup.cpp +2 -2
  78. package/src/llama.cpp/examples/main/CMakeLists.txt +1 -1
  79. package/src/llama.cpp/examples/main/main.cpp +9 -5
  80. package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +1 -1
  81. package/src/llama.cpp/examples/parallel/CMakeLists.txt +1 -1
  82. package/src/llama.cpp/examples/parallel/parallel.cpp +1 -1
  83. package/src/llama.cpp/examples/passkey/CMakeLists.txt +1 -1
  84. package/src/llama.cpp/examples/perplexity/CMakeLists.txt +1 -1
  85. package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
  86. package/src/llama.cpp/examples/quantize/quantize.cpp +0 -3
  87. package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +1 -1
  88. package/src/llama.cpp/examples/retrieval/CMakeLists.txt +1 -1
  89. package/src/llama.cpp/examples/retrieval/retrieval.cpp +4 -4
  90. package/src/llama.cpp/examples/run/CMakeLists.txt +5 -0
  91. package/src/llama.cpp/examples/run/run.cpp +911 -0
  92. package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +1 -1
  93. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +4 -4
  94. package/src/llama.cpp/examples/server/CMakeLists.txt +3 -7
  95. package/src/llama.cpp/examples/server/server.cpp +1758 -886
  96. package/src/llama.cpp/examples/server/tests/requirements.txt +2 -2
  97. package/src/llama.cpp/examples/server/utils.hpp +94 -304
  98. package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
  99. package/src/llama.cpp/examples/simple/simple.cpp +4 -0
  100. package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +1 -1
  101. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +3 -0
  102. package/src/llama.cpp/examples/speculative/CMakeLists.txt +1 -1
  103. package/src/llama.cpp/examples/speculative/speculative.cpp +16 -15
  104. package/src/llama.cpp/examples/speculative-simple/CMakeLists.txt +5 -0
  105. package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +265 -0
  106. package/src/llama.cpp/examples/tokenize/CMakeLists.txt +1 -1
  107. package/src/llama.cpp/examples/tokenize/tokenize.cpp +1 -1
  108. package/src/llama.cpp/examples/tts/CMakeLists.txt +5 -0
  109. package/src/llama.cpp/examples/tts/tts.cpp +932 -0
  110. package/src/llama.cpp/ggml/CMakeLists.txt +46 -34
  111. package/src/llama.cpp/ggml/include/ggml-backend.h +16 -0
  112. package/src/llama.cpp/ggml/include/ggml-cpu.h +7 -49
  113. package/src/llama.cpp/ggml/include/ggml-opencl.h +26 -0
  114. package/src/llama.cpp/ggml/include/ggml.h +106 -24
  115. package/src/llama.cpp/ggml/src/CMakeLists.txt +73 -24
  116. package/src/llama.cpp/ggml/src/ggml-alloc.c +0 -1
  117. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +51 -11
  118. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +379 -22
  119. package/src/llama.cpp/ggml/src/ggml-backend.cpp +4 -4
  120. package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +3 -7
  121. package/src/llama.cpp/ggml/src/ggml-blas/ggml-blas.cpp +5 -2
  122. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +33 -3
  123. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +456 -111
  124. package/src/llama.cpp/ggml/src/ggml-cann/common.h +6 -3
  125. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +95 -35
  126. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -5
  127. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +22 -9
  128. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +24 -13
  129. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +23 -13
  130. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +11 -0
  131. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +10 -0
  132. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +10 -0
  133. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +17 -0
  134. package/src/llama.cpp/ggml/src/ggml-common.h +42 -42
  135. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +288 -213
  136. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +220 -0
  137. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.h +8 -0
  138. package/src/llama.cpp/ggml/src/{ggml-amx → ggml-cpu/amx}/common.h +19 -22
  139. package/src/llama.cpp/ggml/src/{ggml-amx → ggml-cpu/amx}/mmq.cpp +93 -92
  140. package/src/llama.cpp/ggml/src/{ggml-amx → ggml-cpu/amx}/mmq.h +2 -9
  141. package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +323 -0
  142. package/src/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-aarch64.c → ggml-cpu-aarch64.cpp} +892 -190
  143. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +2 -24
  144. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-hbm.cpp +55 -0
  145. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-hbm.h +8 -0
  146. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +15 -0
  147. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +38 -25
  148. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-traits.cpp +36 -0
  149. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-traits.h +38 -0
  150. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +552 -399
  151. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +101 -136
  152. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +2 -2
  153. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +7 -10
  154. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +8 -0
  155. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +4 -6
  156. package/src/llama.cpp/ggml/src/ggml-impl.h +32 -11
  157. package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +13 -9
  158. package/src/llama.cpp/ggml/src/ggml-kompute/ggml-kompute.cpp +131 -64
  159. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +3 -6
  160. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +39 -0
  161. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +14 -7
  162. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +147 -0
  163. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +4004 -0
  164. package/src/llama.cpp/ggml/src/ggml-opt.cpp +67 -80
  165. package/src/llama.cpp/ggml/src/ggml-quants.c +0 -9
  166. package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +3 -5
  167. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +5 -2
  168. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +13 -10
  169. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +2 -11
  170. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +1 -0
  171. package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +2 -2
  172. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +1 -1
  173. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +5 -5
  174. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +32 -13
  175. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +80 -61
  176. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +4 -4
  177. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +159 -114
  178. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +3 -2
  179. package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +6 -6
  180. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +6 -20
  181. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +4 -3
  182. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +8 -8
  183. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +4 -3
  184. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +7 -7
  185. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +1 -0
  186. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +4 -1
  187. package/src/llama.cpp/ggml/src/ggml-threading.h +4 -2
  188. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +21 -7
  189. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +1718 -399
  190. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +3 -1
  191. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +105 -31
  192. package/src/llama.cpp/ggml/src/ggml.c +367 -207
  193. package/src/llama.cpp/include/llama-cpp.h +25 -0
  194. package/src/llama.cpp/include/llama.h +26 -19
  195. package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.inp +112 -0
  196. package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.out +46 -0
  197. package/src/llama.cpp/pocs/CMakeLists.txt +3 -1
  198. package/src/llama.cpp/pocs/vdot/CMakeLists.txt +2 -2
  199. package/src/llama.cpp/src/CMakeLists.txt +2 -7
  200. package/src/llama.cpp/src/llama-grammar.cpp +15 -15
  201. package/src/llama.cpp/src/llama-grammar.h +2 -5
  202. package/src/llama.cpp/src/llama-sampling.cpp +35 -90
  203. package/src/llama.cpp/src/llama-vocab.cpp +6 -1
  204. package/src/llama.cpp/src/llama.cpp +1748 -640
  205. package/src/llama.cpp/src/unicode.cpp +62 -51
  206. package/src/llama.cpp/src/unicode.h +9 -10
  207. package/src/llama.cpp/tests/CMakeLists.txt +48 -37
  208. package/src/llama.cpp/tests/test-arg-parser.cpp +2 -2
  209. package/src/llama.cpp/tests/test-backend-ops.cpp +140 -21
  210. package/src/llama.cpp/tests/test-chat-template.cpp +50 -4
  211. package/src/llama.cpp/tests/test-gguf.cpp +1303 -0
  212. package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -6
  213. package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -4
  214. package/src/llama.cpp/tests/test-quantize-fns.cpp +3 -3
  215. package/src/llama.cpp/tests/test-rope.cpp +61 -20
  216. package/src/llama.cpp/tests/test-sampling.cpp +2 -2
  217. package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +0 -72
  218. package/src/llama.cpp/.github/workflows/nix-ci.yml +0 -79
  219. package/src/llama.cpp/.github/workflows/nix-flake-update.yml +0 -22
  220. package/src/llama.cpp/.github/workflows/nix-publish-flake.yml +0 -36
  221. package/src/llama.cpp/ggml/include/ggml-amx.h +0 -25
  222. package/src/llama.cpp/ggml/src/ggml-aarch64.c +0 -129
  223. package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -19
  224. package/src/llama.cpp/ggml/src/ggml-amx/CMakeLists.txt +0 -107
  225. package/src/llama.cpp/ggml/src/ggml-amx/ggml-amx.cpp +0 -446
package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts
@@ -19,6 +19,7 @@ android {
  externalNativeBuild {
  cmake {
  arguments += "-DLLAMA_BUILD_COMMON=ON"
+ arguments += "-DGGML_LLAMAFILE=OFF"
  arguments += "-DCMAKE_BUILD_TYPE=Release"
  cppFlags += listOf()
  arguments += listOf()
package/src/llama.cpp/examples/llava/CMakeLists.txt
@@ -11,7 +11,7 @@ target_include_directories(llava PUBLIC .)
  target_include_directories(llava PUBLIC ../..)
  target_include_directories(llava PUBLIC ../../common)

- target_compile_features(llava PRIVATE cxx_std_11)
+ target_compile_features(llava PRIVATE cxx_std_17)

  add_library(llava_static STATIC $<TARGET_OBJECTS:llava>)
  if (BUILD_SHARED_LIBS)
@@ -35,11 +35,18 @@ add_executable(${TARGET} llava-cli.cpp)
  set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-llava-cli)
  install(TARGETS ${TARGET} RUNTIME)
  target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
- target_compile_features(${TARGET} PRIVATE cxx_std_11)
+ target_compile_features(${TARGET} PRIVATE cxx_std_17)

  set(TARGET llama-minicpmv-cli)
  add_executable(${TARGET} minicpmv-cli.cpp)
  set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-minicpmv-cli)
  install(TARGETS ${TARGET} RUNTIME)
  target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
- target_compile_features(${TARGET} PRIVATE cxx_std_11)
+ target_compile_features(${TARGET} PRIVATE cxx_std_17)
+
+ set(TARGET llama-qwen2vl-cli)
+ add_executable(${TARGET} qwen2vl-cli.cpp)
+ set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-qwen2vl-cli)
+ install(TARGETS ${TARGET} RUNTIME)
+ target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
+ target_compile_features(${TARGET} PRIVATE cxx_std_17)
package/src/llama.cpp/examples/llava/clip.cpp
@@ -8,21 +8,25 @@
  #include "ggml-alloc.h"
  #include "ggml-backend.h"

- #ifdef GGML_USE_CUDA
- #include "ggml-cuda.h"
- #endif
-
- #ifdef GGML_USE_METAL
- #include "ggml-metal.h"
- #endif
-
- #ifdef GGML_USE_CANN
- #include "ggml-cann.h"
- #endif
-
- #ifdef GGML_USE_VULKAN
- #include "ggml-vulkan.h"
- #endif
+ //#ifdef GGML_USE_CUDA
+ //#include "ggml-cuda.h"
+ //#endif
+ //
+ //#ifdef GGML_USE_SYCL
+ //#include "ggml-sycl.h"
+ //#endif
+ //
+ //#ifdef GGML_USE_METAL
+ //#include "ggml-metal.h"
+ //#endif
+ //
+ //#ifdef GGML_USE_CANN
+ //#include "ggml-cann.h"
+ //#endif
+ //
+ //#ifdef GGML_USE_VULKAN
+ //#include "ggml-vulkan.h"
+ //#endif

  #define STB_IMAGE_IMPLEMENTATION
  #include "stb_image.h"
@@ -40,10 +44,17 @@
  #include <cinttypes>
  #include <limits>

- #define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
- #define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
- #define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
- #define LOG_DBG(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
+ #if defined(LLAVA_LOG_OFF)
+ # define LOG_INF(...)
+ # define LOG_WRN(...)
+ # define LOG_ERR(...)
+ # define LOG_DBG(...)
+ #else // defined(LLAVA_LOG_OFF)
+ # define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
+ # define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
+ # define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
+ # define LOG_DBG(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
+ #endif // defined(LLAVA_LOG_OFF)

  //#define CLIP_DEBUG_FUNCTIONS

@@ -91,7 +102,9 @@ static std::string format(const char * fmt, ...) {
  #define KEY_HAS_LLAVA_PROJ "clip.has_llava_projector"
  #define KEY_HAS_MINICPMV_PROJ "clip.has_minicpmv_projector"
  #define KEY_MINICPMV_VERSION "clip.minicpmv_version"
+ #define KEY_HAS_QWEN2VL_MERGER "clip.has_qwen2vl_merger"
  #define KEY_USE_GELU "clip.use_gelu"
+ #define KEY_USE_SILU "clip.use_silu"
  #define KEY_N_EMBD "clip.%s.embedding_length"
  #define KEY_N_FF "clip.%s.feed_forward_length"
  #define KEY_N_BLOCK "clip.%s.block_count"
@@ -118,7 +131,8 @@ static std::string format(const char * fmt, ...) {
  #define TN_TOKEN_EMBD "%s.token_embd.weight"
  #define TN_POS_EMBD "%s.position_embd.weight"
  #define TN_CLASS_EMBD "v.class_embd"
- #define TN_PATCH_EMBD "v.patch_embd.weight"
+ #define TN_PATCH_EMBD "v.patch_embd.weight" // not rename tensor with ".0" postfix for backwrad compat
+ #define TN_PATCH_EMBD_1 "v.patch_embd.weight.1"
  #define TN_PATCH_BIAS "v.patch_embd.bias"
  #define TN_ATTN_K "%s.blk.%d.attn_k.%s"
  #define TN_ATTN_Q "%s.blk.%d.attn_q.%s"
@@ -152,6 +166,7 @@ enum projector_type {
  PROJECTOR_TYPE_LDP,
  PROJECTOR_TYPE_LDPV2,
  PROJECTOR_TYPE_RESAMPLER,
+ PROJECTOR_TYPE_MERGER,
  PROJECTOR_TYPE_UNKNOWN,
  };

@@ -160,6 +175,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
  { PROJECTOR_TYPE_LDP, "ldp" },
  { PROJECTOR_TYPE_LDPV2, "ldpv2"},
  { PROJECTOR_TYPE_RESAMPLER, "resampler"},
+ { PROJECTOR_TYPE_MERGER, "qwen2vl_merger"},
  };


@@ -452,7 +468,8 @@ struct clip_vision_model {

  // embeddings
  struct ggml_tensor * class_embedding;
- struct ggml_tensor * patch_embeddings;
+ struct ggml_tensor * patch_embeddings_0;
+ struct ggml_tensor * patch_embeddings_1; // second Conv2D kernel when we decouple Conv3D along temproal dimension (Qwen2VL)
  struct ggml_tensor * patch_bias;
  struct ggml_tensor * position_embeddings;

@@ -542,6 +559,7 @@ struct clip_ctx {
  bool has_vision_encoder = false;
  bool has_llava_projector = false;
  bool has_minicpmv_projector = false;
+ bool has_qwen2vl_merger = false;
  int minicpmv_version = 2;

  struct clip_vision_model vision_model;
@@ -550,6 +568,7 @@ struct clip_ctx {
  float image_mean[3];
  float image_std[3];
  bool use_gelu = false;
+ bool use_silu = false;
  int32_t ftype = 1;

  bool has_class_embedding = true;
@@ -595,14 +614,26 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
  image_size_height = imgs->data->ny;
  }
  }
+ else if (ctx->has_qwen2vl_merger) {
+ // use the image's native resolution when image is avaible
+ if (is_inf) {
+ // if (imgs->data->nx && imgs->data->ny) {
+ image_size_width = imgs->data->nx;
+ image_size_height = imgs->data->ny;
+ }
+ }
  const int patch_size = hparams.patch_size;
  const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
+ const int patches_w = image_size_width / patch_size;
+ const int patches_h = image_size_height / patch_size;
  const int num_positions = num_patches + (ctx->has_class_embedding ? 1 : 0);
+ const int num_position_ids = ctx->has_qwen2vl_merger ? num_positions * 4 : num_positions;
  const int hidden_size = hparams.hidden_size;
  const int n_head = hparams.n_head;
  const int d_head = hidden_size / n_head;
  int n_layer = hparams.n_layer;
  const float eps = hparams.eps;
+ int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4};

  const int batch_size = imgs->size;

@@ -623,10 +654,30 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
  ggml_set_name(inp_raw, "inp_raw");
  ggml_set_input(inp_raw);

- struct ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
+ struct ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1);

- inp = ggml_reshape_3d(ctx0, inp, num_patches, hidden_size, batch_size);
- inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3));
+ if (ctx->has_qwen2vl_merger) {
+ GGML_ASSERT(image_size_width % (patch_size * 2) == 0);
+ GGML_ASSERT(image_size_height % (patch_size * 2) == 0);
+
+ auto inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
+ inp = ggml_add(ctx0, inp, inp_1);
+ inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 2, 0, 3)); // [w, h, c, b] -> [c, w, h, b]
+ inp = ggml_reshape_4d(
+ ctx0, inp,
+ hidden_size * 2, patches_w / 2, patches_h, batch_size);
+ inp = ggml_reshape_4d(
+ ctx0, inp,
+ hidden_size * 2, patches_w / 2, 2, batch_size * (patches_h / 2));
+ inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 0, 2, 1, 3));
+ inp = ggml_reshape_3d(
+ ctx0, inp,
+ hidden_size, patches_w * patches_h, batch_size);
+ }
+ else {
+ inp = ggml_reshape_3d(ctx0, inp, num_patches, hidden_size, batch_size);
+ inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3));
+ }

  if (ctx->has_patch_bias) {
  // inp = ggml_add(ctx0, inp, ggml_repeat(ctx0, model.patch_bias, inp));
@@ -648,12 +699,14 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
  }
  }

- struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions);
+ struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_position_ids);
  ggml_set_name(positions, "positions");
  ggml_set_input(positions);

- embeddings =
- ggml_add(ctx0, embeddings, ggml_get_rows(ctx0, model.position_embeddings, positions));
+ if (!ctx->has_qwen2vl_merger) { // qwen2vl use rope position embedding
+ embeddings =
+ ggml_add(ctx0, embeddings, ggml_get_rows(ctx0, model.position_embeddings, positions));
+ }

  if (ctx->has_minicpmv_projector) {
  int pos_w = image_size_width/patch_size;
@@ -677,7 +730,8 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
  }

  // loop over layers
- if (ctx->has_minicpmv_projector) {
+ if (ctx->has_minicpmv_projector || ctx->has_qwen2vl_merger) {
+ // TODO: figure out why we doing thing in this way ???
  n_layer += 1;
  }
  for (int il = 0; il < n_layer - 1; il++) {
@@ -699,8 +753,13 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
  struct ggml_tensor * Q =
  ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].q_w, cur), model.layers[il].q_b);

- Q = ggml_scale_inplace(ctx0, Q, 1.0f / sqrt((float)d_head));
  Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, num_positions, batch_size);
+ if (ctx->has_qwen2vl_merger) {
+ Q = ggml_rope_multi(
+ ctx0, Q, positions, nullptr,
+ d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);
+ }
+ Q = ggml_scale_inplace(ctx0, Q, 1.0f / sqrt((float)d_head));
  Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3));
  Q = ggml_reshape_3d(ctx0, Q, d_head, num_positions, n_head * batch_size);

@@ -708,6 +767,11 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
  ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].k_w, cur), model.layers[il].k_b);

  K = ggml_reshape_4d(ctx0, K, d_head, n_head, num_positions, batch_size);
+ if (ctx->has_qwen2vl_merger) {
+ K = ggml_rope_multi(
+ ctx0, K, positions, nullptr,
+ d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);
+ }
  K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3));
  K = ggml_reshape_3d(ctx0, K, d_head, num_positions, n_head * batch_size);

@@ -747,6 +811,8 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32

  if (ctx->use_gelu) {
  cur = ggml_gelu_inplace(ctx0, cur);
+ } else if (ctx->use_silu) {
+ cur = ggml_silu_inplace(ctx0, cur);
  } else {
  cur = ggml_gelu_quick_inplace(ctx0, cur);
  }
@@ -758,6 +824,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
  cur = ggml_add(ctx0, embeddings, cur);

  embeddings = cur;
+
  }

  // post-layernorm
@@ -829,7 +896,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
  mlp_3 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_3, 1, 0, 2, 3));
  mlp_3 = ggml_reshape_4d(ctx0, mlp_3, n_patch, n_patch, mlp_3->ne[1], mlp_3->ne[2]);
  // stride = 1, padding = 1, bias is nullptr
- block_1 = ggml_conv_depthwise_2d(ctx0, model.mm_model_block_1_block_0_0_w, mlp_3, 1, 1, 1, 1, 1, 1);
+ block_1 = ggml_conv_2d_dw(ctx0, model.mm_model_block_1_block_0_0_w, mlp_3, 1, 1, 1, 1, 1, 1);

  // layer norm
  // // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
@@ -877,7 +944,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
  // block_2
  {
  // stride = 2
- block_1 = ggml_conv_depthwise_2d(ctx0, model.mm_model_block_2_block_0_0_w, block_1, 2, 2, 1, 1, 1, 1);
+ block_1 = ggml_conv_2d_dw(ctx0, model.mm_model_block_2_block_0_0_w, block_1, 2, 2, 1, 1, 1, 1);

  // block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1]
  // layer norm
@@ -938,7 +1005,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
  // mlp_2 ne [24, 24, 2048, 1]
  mlp_2 = ggml_pool_2d(ctx0, mlp_2, GGML_OP_POOL_AVG, 2, 2, 2, 2, 0, 0);
  // weight ne = [3, 3, 2048, 1]
- struct ggml_tensor * peg_0 = ggml_conv_depthwise_2d(ctx0, model.mm_model_peg_0_w, mlp_2, 1, 1, 1, 1, 1, 1);
+ struct ggml_tensor * peg_0 = ggml_conv_2d_dw(ctx0, model.mm_model_peg_0_w, mlp_2, 1, 1, 1, 1, 1, 1);
  peg_0 = ggml_cont(ctx0, ggml_permute(ctx0, peg_0, 1, 2, 0, 3));
  peg_0 = ggml_add(ctx0, peg_0, model.mm_model_peg_0_b);
  mlp_2 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_2, 1, 2, 0, 3));
@@ -1019,6 +1086,19 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
  GGML_ASSERT(false);
  }
  }
+ else if (ctx->proj_type == PROJECTOR_TYPE_MERGER) {
+ embeddings = ggml_reshape_3d(ctx0, embeddings, hidden_size * 4, num_positions / 4, batch_size);
+
+ embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
+ embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
+
+ // GELU activation
+ embeddings = ggml_gelu(ctx0, embeddings);
+
+ // Second linear layer
+ embeddings = ggml_mul_mat(ctx0, model.mm_1_w, embeddings);
+ embeddings = ggml_add(ctx0, embeddings, model.mm_1_b);
+ }

  // build the graph
  ggml_build_forward_expand(gf, embeddings);
@@ -1142,25 +1222,30 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
  }
  }

- #ifdef GGML_USE_CUDA
- new_clip->backend = ggml_backend_cuda_init(0);
- LOG_INF("%s: CLIP using CUDA backend\n", __func__);
- #endif
-
- #ifdef GGML_USE_METAL
- new_clip->backend = ggml_backend_metal_init();
- LOG_INF("%s: CLIP using Metal backend\n", __func__);
- #endif
-
- #ifdef GGML_USE_CANN
- new_clip->backend = ggml_backend_cann_init(0);
- LOG_INF("%s: CLIP using CANN backend\n", __func__);
- #endif
-
- #ifdef GGML_USE_VULKAN
- new_clip->backend = ggml_backend_vk_init(0);
- LOG_INF("%s: CLIP using Vulkan backend\n", __func__);
- #endif
+ //#ifdef GGML_USE_CUDA
+ // new_clip->backend = ggml_backend_cuda_init(0);
+ // LOG_INF("%s: CLIP using CUDA backend\n", __func__);
+ //#endif
+ //
+ //#ifdef GGML_USE_METAL
+ // new_clip->backend = ggml_backend_metal_init();
+ // LOG_INF("%s: CLIP using Metal backend\n", __func__);
+ //#endif
+ //
+ //#ifdef GGML_USE_CANN
+ // new_clip->backend = ggml_backend_cann_init(0);
+ // LOG_INF("%s: CLIP using CANN backend\n", __func__);
+ //#endif
+ //
+ //#ifdef GGML_USE_VULKAN
+ // new_clip->backend = ggml_backend_vk_init(0);
+ // LOG_INF("%s: CLIP using Vulkan backend\n", __func__);
+ //#endif
+ //
+ //#ifdef GGML_USE_SYCL
+ // new_clip->backend = ggml_backend_sycl_init(0);
+ // LOG_INF("%s: CLIP using SYCL backend\n", __func__);
+ //#endif

  if (!new_clip->backend) {
  new_clip->backend = ggml_backend_cpu_init();
@@ -1190,6 +1275,10 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
  new_clip->minicpmv_version = gguf_get_val_i32(ctx, idx);
  }

+ idx = gguf_find_key(ctx, KEY_HAS_QWEN2VL_MERGER);
+ if (idx != -1) {
+ new_clip->has_qwen2vl_merger = gguf_get_val_bool(ctx, idx);
+ }
  // GGML_ASSERT(new_clip->has_llava_projector); // see monatis/clip.cpp for image and/or text encoding for semantic search

  GGML_ASSERT(new_clip->has_vision_encoder);
@@ -1198,6 +1287,13 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
  idx = get_key_idx(ctx, KEY_USE_GELU);
  new_clip->use_gelu = gguf_get_val_bool(ctx, idx);

+ try {
+ idx = get_key_idx(ctx, KEY_USE_SILU);
+ new_clip->use_silu = gguf_get_val_bool(ctx, idx);
+ } catch (std::runtime_error & /*e*/) {
+ new_clip->use_silu = false;
+ }
+
  if (verbosity >= 1) {
  LOG_INF("%s: text_encoder: %d\n", __func__, new_clip->has_text_encoder);
  LOG_INF("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder);
@@ -1373,11 +1469,16 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
  }

  try {
- vision_model.patch_embeddings = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD);
+ vision_model.patch_embeddings_0 = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD);
  vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v"));
  } catch(const std::exception& /*e*/) {
  LOG_ERR("%s: failed to load vision model tensors\n", __func__);
  }
+ try {
+ vision_model.patch_embeddings_1 = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD_1);
+ } catch(const std::exception& /*e*/) {
+ new_clip->has_qwen2vl_merger = false;
+ }

  // LLaVA projection
  if (new_clip->proj_type == PROJECTOR_TYPE_MLP || new_clip->proj_type == PROJECTOR_TYPE_MLP_NORM) {
@@ -1465,6 +1566,12 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
  vision_model.mm_model_ln_post_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "post", "weight"));
  vision_model.mm_model_ln_post_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "post", "bias"));
  }
+ else if (new_clip->proj_type == PROJECTOR_TYPE_MERGER) {
+ vision_model.mm_0_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "weight"));
+ vision_model.mm_0_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "bias"));
+ vision_model.mm_1_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "weight"));
+ vision_model.mm_1_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "bias"));
+ }
  else {
  std::string proj_type = PROJECTOR_TYPE_NAMES[new_clip->proj_type];
  throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str()));
@@ -1503,6 +1610,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
  new_clip->compute_alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(new_clip->backend));
  clip_image_f32_batch batch;
  batch.size = 1;
+ batch.data = nullptr;
  ggml_cgraph * gf = clip_image_build_graph(new_clip, &batch, nullptr, false);
  ggml_gallocr_reserve(new_clip->compute_alloc, gf);
  size_t compute_memory_buffer_size = ggml_gallocr_get_buffer_size(new_clip->compute_alloc, 0);
@@ -1516,6 +1624,10 @@ void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size
  ctx_clip->load_image_size = load_image_size;
  }

+ struct clip_image_size * clip_get_load_image_size(struct clip_ctx * ctx_clip) {
+ return ctx_clip->load_image_size;
+ }
+
  struct clip_image_size * clip_image_size_init() {
  struct clip_image_size * load_image_size = new struct clip_image_size();
  load_image_size->width = 448;
@@ -1968,6 +2080,23 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
  }
  return true;
  }
+ else if (ctx->has_qwen2vl_merger) {
+ clip_image_u8 * resized = clip_image_u8_init();
+ auto patch_size = clip_patch_size(ctx) * 2;
+ int nx = ceil((float)img->nx / patch_size) * patch_size;
+ int ny = ceil((float)img->ny / patch_size) * patch_size;
+ bicubic_resize(*img, *resized, nx, ny);
+
+ res_imgs->data = new clip_image_f32[1];
+ // clip_image_f32 * res = clip_image_f32_init();
+ normalize_image_u8_to_f32(resized, res_imgs->data, ctx->image_mean, ctx->image_std);
+ // res_imgs->data[0] = *res;
+ res_imgs->size = 1;
+
+ // clip_image_f32_free(res);
+ clip_image_u8_free(resized);
+ return true;
+ }

  bool pad_to_square = true;
  if (!ctx->has_vision_encoder) {
@@ -2157,6 +2286,13 @@ size_t clip_embd_nbytes(const struct clip_ctx * ctx) {
  return clip_n_patches(ctx) * clip_n_mmproj_embd(ctx) * sizeof(float);
  }

+ size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_h, int img_w) {
+ clip_image_f32 img;
+ img.nx = img_w;
+ img.ny = img_h;
+ return clip_n_patches_by_img(ctx, &img) * clip_n_mmproj_embd(ctx) * sizeof(float);
+ }
+
  int32_t clip_image_size(const struct clip_ctx * ctx) {
  return ctx->vision_model.hparams.image_size;
  }
@@ -2178,6 +2314,13 @@ const int32_t * clip_image_grid(const struct clip_ctx * ctx) {
  }

  int clip_n_patches(const struct clip_ctx * ctx) {
+ clip_image_f32 img;
+ img.nx = ctx->vision_model.hparams.image_size;
+ img.ny = ctx->vision_model.hparams.image_size;
+ return clip_n_patches_by_img(ctx, &img);
+ }
+
+ int clip_n_patches_by_img(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
  const auto & params = ctx->vision_model.hparams;

  int n_patches = (params.image_size / params.patch_size) * (params.image_size / params.patch_size);
@@ -2191,6 +2334,11 @@ int clip_n_patches(const struct clip_ctx * ctx) {
  else if (ctx->minicpmv_version == 3) {
  n_patches = 64;
  }
+ } else if (ctx->proj_type == PROJECTOR_TYPE_MERGER) {
+ int patch_size = params.patch_size * 2;
+ int x_patch = img->nx / patch_size + (int)(img->nx % patch_size > 0);
+ int y_patch = img->ny / patch_size + (int)(img->ny % patch_size > 0);
+ n_patches = x_patch * y_patch;
  }

  return n_patches;
@@ -2319,7 +2467,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
  const int image_size = hparams.image_size;
  int image_size_width = image_size;
  int image_size_height = image_size;
- if (ctx->has_minicpmv_projector) {
+ if (ctx->has_minicpmv_projector | ctx->has_qwen2vl_merger) {
  image_size_width = imgs->data[0].nx;
  image_size_height = imgs->data[0].ny;
  }
@@ -2339,7 +2487,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
  for (size_t i = 0; i < imgs->size; i++) {
  const int nx = imgs->data[i].nx;
  const int ny = imgs->data[i].ny;
- if (!ctx->has_minicpmv_projector) {
+ if (!(ctx->has_minicpmv_projector | ctx->has_qwen2vl_merger)) {
  GGML_ASSERT(nx == image_size && ny == image_size);
  }

@@ -2397,9 +2545,9 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
  auto pos_embed_t = get_2d_sincos_pos_embed(embed_dim, std::make_pair(pos_w, pos_h));

  float * pos_embed_data = (float *)malloc(ggml_nbytes(pos_embed));
- for(int i=0;i<pos_w * pos_h;++i){
- for(int j=0;j<embed_dim;++j){
- pos_embed_data[i*embed_dim+j]=pos_embed_t[i][j];
+ for(int i=0;i < pos_w * pos_h; ++i){
+ for(int j=0; j < embed_dim; ++j){
+ pos_embed_data[i * embed_dim + j] = pos_embed_t[i][j];
  }
  }

@@ -2419,7 +2567,34 @@
  }
  }

- {
+ if (ctx->has_qwen2vl_merger) {
+ struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions");
+
+ const int pw = image_size_width / patch_size;
+ const int ph = image_size_height / patch_size;
+ int* positions_data = (int*)malloc(ggml_nbytes(positions));
+
+ int ptr = 0;
+ for (int y = 0; y < ph; y+=2)
+ {
+ for (int x = 0; x < pw; x+=2)
+ {
+ for (int dy = 0; dy < 2; dy++) {
+ for (int dx = 0; dx < 2; dx++) {
+ positions_data[ptr] = y + dy;
+ positions_data[num_patches + ptr] = x + dx;
+ positions_data[num_patches * 2 + ptr] = y + dy;
+ positions_data[num_patches * 3 + ptr] = x + dx;
+ ptr++;
+ }
+ }
+ }
+ }
+
+ ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions));
+ free(positions_data);
+ }
+ else {
  struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions");

  int* positions_data = (int*)malloc(ggml_nbytes(positions));
@@ -2428,16 +2603,16 @@
  }
  ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions));
  free(positions_data);
- }

- {
- struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches");
- int* patches_data = (int*)malloc(ggml_nbytes(patches));
- for (int i = 0; i < num_patches; i++) {
- patches_data[i] = i + 1;
+ {
+ struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches");
+ int* patches_data = (int*)malloc(ggml_nbytes(patches));
+ for (int i = 0; i < num_patches; i++) {
+ patches_data[i] = i + 1;
+ }
+ ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches));
+ free(patches_data);
  }
- ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches));
- free(patches_data);
  }
  }

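Note: the qwen2vl branch in the two hunks above fills the "positions" tensor with four sections of per-patch ids (row, column, row, column), walking the grid in 2x2 blocks so that the four patches merged by the qwen2vl_merger stay adjacent. The following is a minimal standalone C++ sketch of that layout, not code from the package, assuming a hypothetical 4x4 patch grid:

    // Standalone sketch (assumed 4x4 patch grid) of the M-RoPE position-id layout:
    // four sections of num_patches ids, [y | x | y | x], filled in 2x2 patch blocks.
    #include <cstdio>
    #include <vector>

    int main() {
        const int pw = 4, ph = 4;            // hypothetical patch grid size
        const int num_patches = pw * ph;
        std::vector<int> pos(num_patches * 4);

        int ptr = 0;
        for (int y = 0; y < ph; y += 2) {
            for (int x = 0; x < pw; x += 2) {
                for (int dy = 0; dy < 2; dy++) {
                    for (int dx = 0; dx < 2; dx++) {
                        pos[ptr]                   = y + dy;  // section 0: row
                        pos[num_patches     + ptr] = x + dx;  // section 1: column
                        pos[num_patches * 2 + ptr] = y + dy;  // section 2: row again
                        pos[num_patches * 3 + ptr] = x + dx;  // section 3: column again
                        ptr++;
                    }
                }
            }
        }

        // print each of the four sections on its own line
        for (int s = 0; s < 4; s++) {
            for (int i = 0; i < num_patches; i++) {
                printf("%d ", pos[s * num_patches + i]);
            }
            printf("\n");
        }
        return 0;
    }
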
@@ -2610,6 +2785,9 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
  return 3584;
  }
  }
+ if (ctx->proj_type == PROJECTOR_TYPE_MERGER) {
+ return ctx->vision_model.mm_1_b->ne[0];
+ }

  std::string proj_type = PROJECTOR_TYPE_NAMES[ctx->proj_type];
  throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str()));
@@ -2621,3 +2799,21 @@ int clip_is_minicpmv(const struct clip_ctx * ctx) {
  }
  return 0;
  }
+
+ bool clip_is_qwen2vl(const struct clip_ctx * ctx) {
+ return ctx->has_qwen2vl_merger;
+ }
+
+
+ bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec) {
+ clip_image_f32 clip_img;
+ clip_img.buf.resize(h * w * 3);
+ for (int i = 0; i < h*w*3; i++)
+ {
+ clip_img.buf[i] = img[i];
+ }
+ clip_img.nx = w;
+ clip_img.ny = h;
+ clip_image_encode(ctx, n_threads, &clip_img, vec);
+ return true;
+ }
package/src/llama.cpp/examples/llava/clip.h
@@ -45,6 +45,7 @@ CLIP_API struct clip_ctx * clip_model_load_cpu(const char * fname, int verbosity
  CLIP_API void clip_free(struct clip_ctx * ctx);

  CLIP_API size_t clip_embd_nbytes(const struct clip_ctx * ctx);
+ CLIP_API size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_h, int img_w);

  CLIP_API int32_t clip_image_size (const struct clip_ctx * ctx);
  CLIP_API int32_t clip_patch_size (const struct clip_ctx * ctx);
@@ -55,11 +56,13 @@ CLIP_API const char * clip_patch_merge_type(const struct clip_ctx * ctx);

  CLIP_API const int32_t * clip_image_grid(const struct clip_ctx * ctx);

- CLIP_API int clip_n_patches (const struct clip_ctx * ctx);
- CLIP_API int clip_n_mmproj_embd(const struct clip_ctx * ctx);
+ CLIP_API int clip_n_patches (const struct clip_ctx * ctx);
+ CLIP_API int clip_n_patches_by_img (const struct clip_ctx * ctx, struct clip_image_f32 * img);
+ CLIP_API int clip_n_mmproj_embd (const struct clip_ctx * ctx);

  CLIP_API int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip);
  CLIP_API void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size);
+ CLIP_API struct clip_image_size * clip_get_load_image_size(struct clip_ctx * ctx_clip);

  CLIP_API struct clip_image_size * clip_image_size_init();
  CLIP_API struct clip_image_u8 * clip_image_u8_init ();
@@ -86,6 +89,9 @@ CLIP_API bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, cons
  CLIP_API bool clip_model_quantize(const char * fname_inp, const char * fname_out, int itype);

  CLIP_API int clip_is_minicpmv(const struct clip_ctx * ctx);
+ CLIP_API bool clip_is_qwen2vl(const struct clip_ctx * ctx);
+
+ CLIP_API bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec);

  #ifdef __cplusplus
  }
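
The declarations added to clip.h above (clip_is_qwen2vl, clip_embd_nbytes_by_img, clip_encode_float_image) can be exercised from a small caller. A minimal sketch, not code from the package, assuming a locally available mmproj GGUF file, a dummy pre-normalized 224x224 RGB buffer, and 4 threads; build and link details are omitted:

    // Minimal usage sketch of the new clip.h entry points; the model path,
    // image size, and thread count below are placeholder assumptions.
    #include <cstdio>
    #include <vector>
    #include "clip.h"   // examples/llava/clip.h from this package

    int main() {
        struct clip_ctx * ctx = clip_model_load("mmproj-model.gguf", /*verbosity=*/1);
        if (!ctx) {
            fprintf(stderr, "failed to load CLIP/mmproj model\n");
            return 1;
        }
        printf("qwen2vl merger: %s\n", clip_is_qwen2vl(ctx) ? "yes" : "no");

        const int w = 224, h = 224;                    // assumed image size
        std::vector<float> img(w * h * 3, 0.5f);       // dummy pre-normalized RGB
        std::vector<float> embd(clip_embd_nbytes_by_img(ctx, h, w) / sizeof(float));

        if (clip_encode_float_image(ctx, /*n_threads=*/4, img.data(), h, w, embd.data())) {
            printf("encoded %d patches x %d dims\n",
                   clip_n_patches(ctx), clip_n_mmproj_embd(ctx));
        }

        clip_free(ctx);
        return 0;
    }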