@fugood/llama.node 0.3.2 → 0.3.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (286)
  1. package/CMakeLists.txt +7 -0
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  8. package/bin/win32/arm64/llama-node.node +0 -0
  9. package/bin/win32/arm64/node.lib +0 -0
  10. package/bin/win32/x64/llama-node.node +0 -0
  11. package/bin/win32/x64/node.lib +0 -0
  12. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  13. package/bin/win32-vulkan/arm64/node.lib +0 -0
  14. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/x64/node.lib +0 -0
  16. package/lib/binding.ts +18 -1
  17. package/package.json +1 -1
  18. package/src/DetokenizeWorker.cpp +1 -1
  19. package/src/EmbeddingWorker.cpp +17 -7
  20. package/src/EmbeddingWorker.h +2 -1
  21. package/src/LlamaCompletionWorker.cpp +8 -8
  22. package/src/LlamaCompletionWorker.h +2 -2
  23. package/src/LlamaContext.cpp +89 -27
  24. package/src/LlamaContext.h +2 -0
  25. package/src/TokenizeWorker.cpp +1 -1
  26. package/src/common.hpp +4 -4
  27. package/src/llama.cpp/.github/workflows/build.yml +240 -168
  28. package/src/llama.cpp/.github/workflows/docker.yml +8 -8
  29. package/src/llama.cpp/.github/workflows/python-lint.yml +8 -1
  30. package/src/llama.cpp/.github/workflows/server.yml +21 -14
  31. package/src/llama.cpp/CMakeLists.txt +14 -6
  32. package/src/llama.cpp/Sources/llama/llama.h +4 -0
  33. package/src/llama.cpp/cmake/arm64-apple-clang.cmake +16 -0
  34. package/src/llama.cpp/cmake/common.cmake +33 -0
  35. package/src/llama.cpp/cmake/x64-windows-llvm.cmake +11 -0
  36. package/src/llama.cpp/common/CMakeLists.txt +6 -4
  37. package/src/llama.cpp/common/arg.cpp +986 -770
  38. package/src/llama.cpp/common/arg.h +22 -22
  39. package/src/llama.cpp/common/common.cpp +212 -351
  40. package/src/llama.cpp/common/common.h +204 -117
  41. package/src/llama.cpp/common/json-schema-to-grammar.cpp +1 -1
  42. package/src/llama.cpp/common/log.cpp +50 -50
  43. package/src/llama.cpp/common/log.h +18 -18
  44. package/src/llama.cpp/common/ngram-cache.cpp +36 -36
  45. package/src/llama.cpp/common/ngram-cache.h +19 -19
  46. package/src/llama.cpp/common/sampling.cpp +163 -121
  47. package/src/llama.cpp/common/sampling.h +41 -20
  48. package/src/llama.cpp/common/speculative.cpp +274 -0
  49. package/src/llama.cpp/common/speculative.h +28 -0
  50. package/src/llama.cpp/docs/build.md +134 -161
  51. package/src/llama.cpp/examples/CMakeLists.txt +33 -14
  52. package/src/llama.cpp/examples/batched/CMakeLists.txt +1 -1
  53. package/src/llama.cpp/examples/batched/batched.cpp +19 -18
  54. package/src/llama.cpp/examples/batched-bench/CMakeLists.txt +1 -1
  55. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +10 -11
  56. package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +1 -1
  57. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +1 -1
  58. package/src/llama.cpp/examples/cvector-generator/CMakeLists.txt +1 -1
  59. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +9 -9
  60. package/src/llama.cpp/examples/deprecation-warning/deprecation-warning.cpp +1 -1
  61. package/src/llama.cpp/examples/embedding/CMakeLists.txt +1 -1
  62. package/src/llama.cpp/examples/embedding/embedding.cpp +12 -12
  63. package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +3 -2
  64. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +8 -8
  65. package/src/llama.cpp/examples/export-lora/CMakeLists.txt +1 -1
  66. package/src/llama.cpp/examples/export-lora/export-lora.cpp +5 -5
  67. package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +1 -1
  68. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +4 -7
  69. package/src/llama.cpp/examples/gen-docs/CMakeLists.txt +1 -1
  70. package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +7 -7
  71. package/src/llama.cpp/examples/gguf/CMakeLists.txt +1 -1
  72. package/src/llama.cpp/examples/gguf-hash/CMakeLists.txt +8 -1
  73. package/src/llama.cpp/examples/gguf-split/CMakeLists.txt +1 -1
  74. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +2 -2
  75. package/src/llama.cpp/examples/gritlm/CMakeLists.txt +1 -1
  76. package/src/llama.cpp/examples/gritlm/gritlm.cpp +18 -18
  77. package/src/llama.cpp/examples/imatrix/CMakeLists.txt +1 -1
  78. package/src/llama.cpp/examples/imatrix/imatrix.cpp +31 -13
  79. package/src/llama.cpp/examples/infill/CMakeLists.txt +1 -1
  80. package/src/llama.cpp/examples/infill/infill.cpp +41 -87
  81. package/src/llama.cpp/examples/llama-bench/CMakeLists.txt +1 -1
  82. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +439 -459
  83. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +2 -0
  84. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +11 -14
  85. package/src/llama.cpp/examples/llava/CMakeLists.txt +10 -3
  86. package/src/llama.cpp/examples/llava/clip.cpp +263 -66
  87. package/src/llama.cpp/examples/llava/clip.h +8 -2
  88. package/src/llama.cpp/examples/llava/llava-cli.cpp +23 -23
  89. package/src/llama.cpp/examples/llava/llava.cpp +83 -22
  90. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +21 -21
  91. package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp +581 -0
  92. package/src/llama.cpp/examples/lookahead/CMakeLists.txt +1 -1
  93. package/src/llama.cpp/examples/lookahead/lookahead.cpp +26 -26
  94. package/src/llama.cpp/examples/lookup/CMakeLists.txt +4 -4
  95. package/src/llama.cpp/examples/lookup/lookup-create.cpp +7 -7
  96. package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
  97. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +16 -15
  98. package/src/llama.cpp/examples/lookup/lookup.cpp +30 -30
  99. package/src/llama.cpp/examples/main/CMakeLists.txt +1 -1
  100. package/src/llama.cpp/examples/main/main.cpp +73 -114
  101. package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +1 -1
  102. package/src/llama.cpp/examples/parallel/CMakeLists.txt +1 -1
  103. package/src/llama.cpp/examples/parallel/parallel.cpp +18 -19
  104. package/src/llama.cpp/examples/passkey/CMakeLists.txt +1 -1
  105. package/src/llama.cpp/examples/passkey/passkey.cpp +14 -14
  106. package/src/llama.cpp/examples/perplexity/CMakeLists.txt +1 -1
  107. package/src/llama.cpp/examples/perplexity/perplexity.cpp +99 -120
  108. package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
  109. package/src/llama.cpp/examples/quantize/quantize.cpp +0 -3
  110. package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +1 -1
  111. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +10 -9
  112. package/src/llama.cpp/examples/retrieval/CMakeLists.txt +1 -1
  113. package/src/llama.cpp/examples/retrieval/retrieval.cpp +16 -16
  114. package/src/llama.cpp/examples/rpc/rpc-server.cpp +3 -1
  115. package/src/llama.cpp/examples/run/CMakeLists.txt +5 -0
  116. package/src/llama.cpp/examples/run/run.cpp +911 -0
  117. package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +1 -1
  118. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +38 -21
  119. package/src/llama.cpp/examples/server/CMakeLists.txt +3 -16
  120. package/src/llama.cpp/examples/server/server.cpp +2073 -1339
  121. package/src/llama.cpp/examples/server/tests/requirements.txt +2 -2
  122. package/src/llama.cpp/examples/server/utils.hpp +354 -277
  123. package/src/llama.cpp/examples/simple/CMakeLists.txt +2 -2
  124. package/src/llama.cpp/examples/simple/simple.cpp +130 -94
  125. package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +5 -0
  126. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +200 -0
  127. package/src/llama.cpp/examples/speculative/CMakeLists.txt +1 -1
  128. package/src/llama.cpp/examples/speculative/speculative.cpp +68 -64
  129. package/src/llama.cpp/examples/speculative-simple/CMakeLists.txt +5 -0
  130. package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +265 -0
  131. package/src/llama.cpp/examples/tokenize/CMakeLists.txt +1 -1
  132. package/src/llama.cpp/examples/tokenize/tokenize.cpp +3 -3
  133. package/src/llama.cpp/examples/tts/CMakeLists.txt +5 -0
  134. package/src/llama.cpp/examples/tts/tts.cpp +932 -0
  135. package/src/llama.cpp/ggml/CMakeLists.txt +54 -36
  136. package/src/llama.cpp/ggml/include/ggml-backend.h +63 -34
  137. package/src/llama.cpp/ggml/include/ggml-blas.h +5 -3
  138. package/src/llama.cpp/ggml/include/ggml-cann.h +9 -7
  139. package/src/llama.cpp/ggml/include/ggml-cpp.h +38 -0
  140. package/src/llama.cpp/ggml/include/ggml-cpu.h +135 -0
  141. package/src/llama.cpp/ggml/include/ggml-cuda.h +12 -12
  142. package/src/llama.cpp/ggml/include/ggml-kompute.h +7 -3
  143. package/src/llama.cpp/ggml/include/ggml-metal.h +11 -7
  144. package/src/llama.cpp/ggml/include/ggml-opencl.h +26 -0
  145. package/src/llama.cpp/ggml/include/ggml-opt.h +216 -0
  146. package/src/llama.cpp/ggml/include/ggml-rpc.h +9 -5
  147. package/src/llama.cpp/ggml/include/ggml-sycl.h +18 -11
  148. package/src/llama.cpp/ggml/include/ggml-vulkan.h +10 -8
  149. package/src/llama.cpp/ggml/include/ggml.h +159 -417
  150. package/src/llama.cpp/ggml/src/CMakeLists.txt +121 -1155
  151. package/src/llama.cpp/ggml/src/ggml-alloc.c +23 -28
  152. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +57 -36
  153. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +552 -0
  154. package/src/llama.cpp/ggml/src/ggml-backend.cpp +306 -867
  155. package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +87 -0
  156. package/src/llama.cpp/ggml/src/{ggml-blas.cpp → ggml-blas/ggml-blas.cpp} +216 -65
  157. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +76 -0
  158. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +456 -111
  159. package/src/llama.cpp/ggml/src/ggml-cann/common.h +6 -3
  160. package/src/llama.cpp/ggml/src/{ggml-cann.cpp → ggml-cann/ggml-cann.cpp} +343 -177
  161. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -5
  162. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +22 -9
  163. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +24 -13
  164. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +23 -13
  165. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +11 -0
  166. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +10 -0
  167. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +10 -0
  168. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +17 -0
  169. package/src/llama.cpp/ggml/src/ggml-common.h +42 -42
  170. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +336 -0
  171. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +220 -0
  172. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.h +8 -0
  173. package/src/llama.cpp/ggml/src/ggml-cpu/amx/common.h +91 -0
  174. package/src/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +2511 -0
  175. package/src/llama.cpp/ggml/src/ggml-cpu/amx/mmq.h +10 -0
  176. package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +323 -0
  177. package/src/llama.cpp/ggml/src/{ggml-aarch64.c → ggml-cpu/ggml-cpu-aarch64.cpp} +1299 -246
  178. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +8 -0
  179. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-hbm.cpp +55 -0
  180. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-hbm.h +8 -0
  181. package/src/llama.cpp/ggml/src/{ggml-cpu-impl.h → ggml-cpu/ggml-cpu-impl.h} +14 -242
  182. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +10835 -0
  183. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
  184. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-traits.cpp +36 -0
  185. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-traits.h +38 -0
  186. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +14123 -0
  187. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +628 -0
  188. package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.cpp +666 -0
  189. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +152 -0
  190. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +8 -0
  191. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +104 -0
  192. package/src/llama.cpp/ggml/src/ggml-impl.h +393 -22
  193. package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +166 -0
  194. package/src/llama.cpp/ggml/src/{ggml-kompute.cpp → ggml-kompute/ggml-kompute.cpp} +360 -127
  195. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +105 -0
  196. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +288 -0
  197. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +107 -0
  198. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +147 -0
  199. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +4004 -0
  200. package/src/llama.cpp/ggml/src/ggml-opt.cpp +854 -0
  201. package/src/llama.cpp/ggml/src/ggml-quants.c +188 -10702
  202. package/src/llama.cpp/ggml/src/ggml-quants.h +78 -125
  203. package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +9 -0
  204. package/src/llama.cpp/ggml/src/{ggml-rpc.cpp → ggml-rpc/ggml-rpc.cpp} +478 -300
  205. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +84 -0
  206. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +3 -0
  207. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +36 -5
  208. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +259 -0
  209. package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +3 -2
  210. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +1 -1
  211. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +5 -5
  212. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +34 -35
  213. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +1030 -0
  214. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +76 -0
  215. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +4 -4
  216. package/src/llama.cpp/ggml/src/{ggml-sycl.cpp → ggml-sycl/ggml-sycl.cpp} +3638 -4151
  217. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +3 -2
  218. package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +6 -6
  219. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +75 -87
  220. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +7 -6
  221. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +56 -0
  222. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +11 -0
  223. package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +6 -0
  224. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +4 -3
  225. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +7 -7
  226. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +1 -0
  227. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +4 -4
  228. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +141 -0
  229. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +10 -0
  230. package/src/llama.cpp/ggml/src/ggml-threading.cpp +12 -0
  231. package/src/llama.cpp/ggml/src/ggml-threading.h +14 -0
  232. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +92 -0
  233. package/src/llama.cpp/ggml/src/{ggml-vulkan.cpp → ggml-vulkan/ggml-vulkan.cpp} +2138 -887
  234. package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/CMakeLists.txt +3 -1
  235. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +593 -0
  236. package/src/llama.cpp/ggml/src/ggml.c +4427 -20125
  237. package/src/llama.cpp/include/llama-cpp.h +25 -0
  238. package/src/llama.cpp/include/llama.h +93 -52
  239. package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.inp +112 -0
  240. package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.out +46 -0
  241. package/src/llama.cpp/pocs/CMakeLists.txt +3 -1
  242. package/src/llama.cpp/pocs/vdot/CMakeLists.txt +2 -2
  243. package/src/llama.cpp/pocs/vdot/q8dot.cpp +4 -3
  244. package/src/llama.cpp/pocs/vdot/vdot.cpp +8 -7
  245. package/src/llama.cpp/src/CMakeLists.txt +4 -8
  246. package/src/llama.cpp/src/llama-grammar.cpp +15 -15
  247. package/src/llama.cpp/src/llama-grammar.h +2 -5
  248. package/src/llama.cpp/src/llama-sampling.cpp +779 -194
  249. package/src/llama.cpp/src/llama-sampling.h +21 -2
  250. package/src/llama.cpp/src/llama-vocab.cpp +55 -10
  251. package/src/llama.cpp/src/llama-vocab.h +35 -11
  252. package/src/llama.cpp/src/llama.cpp +4317 -2979
  253. package/src/llama.cpp/src/unicode-data.cpp +2 -2
  254. package/src/llama.cpp/src/unicode.cpp +62 -51
  255. package/src/llama.cpp/src/unicode.h +9 -10
  256. package/src/llama.cpp/tests/CMakeLists.txt +48 -38
  257. package/src/llama.cpp/tests/test-arg-parser.cpp +15 -15
  258. package/src/llama.cpp/tests/test-backend-ops.cpp +324 -80
  259. package/src/llama.cpp/tests/test-barrier.cpp +1 -0
  260. package/src/llama.cpp/tests/test-chat-template.cpp +59 -9
  261. package/src/llama.cpp/tests/test-gguf.cpp +1303 -0
  262. package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -6
  263. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -4
  264. package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -4
  265. package/src/llama.cpp/tests/test-log.cpp +2 -2
  266. package/src/llama.cpp/tests/test-opt.cpp +853 -142
  267. package/src/llama.cpp/tests/test-quantize-fns.cpp +24 -21
  268. package/src/llama.cpp/tests/test-quantize-perf.cpp +16 -14
  269. package/src/llama.cpp/tests/test-rope.cpp +62 -20
  270. package/src/llama.cpp/tests/test-sampling.cpp +163 -138
  271. package/src/llama.cpp/tests/test-tokenizer-0.cpp +7 -7
  272. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +5 -5
  273. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +5 -5
  274. package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +0 -72
  275. package/src/llama.cpp/.github/workflows/nix-ci.yml +0 -79
  276. package/src/llama.cpp/.github/workflows/nix-flake-update.yml +0 -22
  277. package/src/llama.cpp/.github/workflows/nix-publish-flake.yml +0 -36
  278. package/src/llama.cpp/common/train.cpp +0 -1515
  279. package/src/llama.cpp/common/train.h +0 -233
  280. package/src/llama.cpp/examples/baby-llama/CMakeLists.txt +0 -5
  281. package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +0 -1639
  282. package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -39
  283. package/src/llama.cpp/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +0 -600
  284. package/src/llama.cpp/tests/test-grad0.cpp +0 -1683
  285. /package/src/llama.cpp/ggml/{cmake → src/ggml-cpu/cmake}/FindSIMD.cmake +0 -0
  286. /package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.h +0 -0
package/src/llama.cpp/examples/llava/clip.cpp

@@ -4,24 +4,29 @@
  // Note: Even when using identical normalized image inputs (see normalize_image_u8_to_f32()) we have a significant difference in resulting embeddings compared to pytorch
  #include "clip.h"
  #include "ggml.h"
+ #include "ggml-cpu.h"
  #include "ggml-alloc.h"
  #include "ggml-backend.h"

- #ifdef GGML_USE_CUDA
- #include "ggml-cuda.h"
- #endif
-
- #ifdef GGML_USE_METAL
- #include "ggml-metal.h"
- #endif
-
- #ifdef GGML_USE_CANN
- #include "ggml-cann.h"
- #endif
-
- #ifdef GGML_USE_VULKAN
- #include "ggml-vulkan.h"
- #endif
+ //#ifdef GGML_USE_CUDA
+ //#include "ggml-cuda.h"
+ //#endif
+ //
+ //#ifdef GGML_USE_SYCL
+ //#include "ggml-sycl.h"
+ //#endif
+ //
+ //#ifdef GGML_USE_METAL
+ //#include "ggml-metal.h"
+ //#endif
+ //
+ //#ifdef GGML_USE_CANN
+ //#include "ggml-cann.h"
+ //#endif
+ //
+ //#ifdef GGML_USE_VULKAN
+ //#include "ggml-vulkan.h"
+ //#endif

  #define STB_IMAGE_IMPLEMENTATION
  #include "stb_image.h"
@@ -39,10 +44,17 @@
  #include <cinttypes>
  #include <limits>

- #define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
- #define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
- #define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
- #define LOG_DBG(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
+ #if defined(LLAVA_LOG_OFF)
+ # define LOG_INF(...)
+ # define LOG_WRN(...)
+ # define LOG_ERR(...)
+ # define LOG_DBG(...)
+ #else // defined(LLAVA_LOG_OFF)
+ # define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
+ # define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
+ # define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
+ # define LOG_DBG(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
+ #endif // defined(LLAVA_LOG_OFF)

  //#define CLIP_DEBUG_FUNCTIONS

@@ -90,7 +102,9 @@ static std::string format(const char * fmt, ...) {
  #define KEY_HAS_LLAVA_PROJ "clip.has_llava_projector"
  #define KEY_HAS_MINICPMV_PROJ "clip.has_minicpmv_projector"
  #define KEY_MINICPMV_VERSION "clip.minicpmv_version"
+ #define KEY_HAS_QWEN2VL_MERGER "clip.has_qwen2vl_merger"
  #define KEY_USE_GELU "clip.use_gelu"
+ #define KEY_USE_SILU "clip.use_silu"
  #define KEY_N_EMBD "clip.%s.embedding_length"
  #define KEY_N_FF "clip.%s.feed_forward_length"
  #define KEY_N_BLOCK "clip.%s.block_count"
@@ -117,7 +131,8 @@ static std::string format(const char * fmt, ...) {
  #define TN_TOKEN_EMBD "%s.token_embd.weight"
  #define TN_POS_EMBD "%s.position_embd.weight"
  #define TN_CLASS_EMBD "v.class_embd"
- #define TN_PATCH_EMBD "v.patch_embd.weight"
+ #define TN_PATCH_EMBD "v.patch_embd.weight" // not rename tensor with ".0" postfix for backwrad compat
+ #define TN_PATCH_EMBD_1 "v.patch_embd.weight.1"
  #define TN_PATCH_BIAS "v.patch_embd.bias"
  #define TN_ATTN_K "%s.blk.%d.attn_k.%s"
  #define TN_ATTN_Q "%s.blk.%d.attn_q.%s"
@@ -151,6 +166,7 @@ enum projector_type {
  PROJECTOR_TYPE_LDP,
  PROJECTOR_TYPE_LDPV2,
  PROJECTOR_TYPE_RESAMPLER,
+ PROJECTOR_TYPE_MERGER,
  PROJECTOR_TYPE_UNKNOWN,
  };

@@ -159,6 +175,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
  { PROJECTOR_TYPE_LDP, "ldp" },
  { PROJECTOR_TYPE_LDPV2, "ldpv2"},
  { PROJECTOR_TYPE_RESAMPLER, "resampler"},
+ { PROJECTOR_TYPE_MERGER, "qwen2vl_merger"},
  };


@@ -451,7 +468,8 @@ struct clip_vision_model {

  // embeddings
  struct ggml_tensor * class_embedding;
- struct ggml_tensor * patch_embeddings;
+ struct ggml_tensor * patch_embeddings_0;
+ struct ggml_tensor * patch_embeddings_1; // second Conv2D kernel when we decouple Conv3D along temproal dimension (Qwen2VL)
  struct ggml_tensor * patch_bias;
  struct ggml_tensor * position_embeddings;

@@ -541,6 +559,7 @@ struct clip_ctx {
  bool has_vision_encoder = false;
  bool has_llava_projector = false;
  bool has_minicpmv_projector = false;
+ bool has_qwen2vl_merger = false;
  int minicpmv_version = 2;

  struct clip_vision_model vision_model;
@@ -549,6 +568,7 @@ struct clip_ctx {
  float image_mean[3];
  float image_std[3];
  bool use_gelu = false;
+ bool use_silu = false;
  int32_t ftype = 1;

  bool has_class_embedding = true;
@@ -594,14 +614,26 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
  image_size_height = imgs->data->ny;
  }
  }
+ else if (ctx->has_qwen2vl_merger) {
+ // use the image's native resolution when image is avaible
+ if (is_inf) {
+ // if (imgs->data->nx && imgs->data->ny) {
+ image_size_width = imgs->data->nx;
+ image_size_height = imgs->data->ny;
+ }
+ }
  const int patch_size = hparams.patch_size;
  const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
+ const int patches_w = image_size_width / patch_size;
+ const int patches_h = image_size_height / patch_size;
  const int num_positions = num_patches + (ctx->has_class_embedding ? 1 : 0);
+ const int num_position_ids = ctx->has_qwen2vl_merger ? num_positions * 4 : num_positions;
  const int hidden_size = hparams.hidden_size;
  const int n_head = hparams.n_head;
  const int d_head = hidden_size / n_head;
  int n_layer = hparams.n_layer;
  const float eps = hparams.eps;
+ int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4};

  const int batch_size = imgs->size;

@@ -622,10 +654,30 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
  ggml_set_name(inp_raw, "inp_raw");
  ggml_set_input(inp_raw);

- struct ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
+ struct ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1);

- inp = ggml_reshape_3d(ctx0, inp, num_patches, hidden_size, batch_size);
- inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3));
+ if (ctx->has_qwen2vl_merger) {
+ GGML_ASSERT(image_size_width % (patch_size * 2) == 0);
+ GGML_ASSERT(image_size_height % (patch_size * 2) == 0);
+
+ auto inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
+ inp = ggml_add(ctx0, inp, inp_1);
+ inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 2, 0, 3)); // [w, h, c, b] -> [c, w, h, b]
+ inp = ggml_reshape_4d(
+ ctx0, inp,
+ hidden_size * 2, patches_w / 2, patches_h, batch_size);
+ inp = ggml_reshape_4d(
+ ctx0, inp,
+ hidden_size * 2, patches_w / 2, 2, batch_size * (patches_h / 2));
+ inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 0, 2, 1, 3));
+ inp = ggml_reshape_3d(
+ ctx0, inp,
+ hidden_size, patches_w * patches_h, batch_size);
+ }
+ else {
+ inp = ggml_reshape_3d(ctx0, inp, num_patches, hidden_size, batch_size);
+ inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3));
+ }

  if (ctx->has_patch_bias) {
  // inp = ggml_add(ctx0, inp, ggml_repeat(ctx0, model.patch_bias, inp));
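To make the qwen2vl merger reshape above easier to follow, here is a small standalone sketch (illustrative only, not package code) that traces the tensor extents for one assumed configuration; patch_size = 14, hidden_size = 1280 and the 56x56 input are made-up values, not read from any model file:

```cpp
// Trace the shapes produced by the qwen2vl merger branch for an assumed 56x56 input.
#include <cstdio>

int main() {
    const int patch_size  = 14;                    // hparams.patch_size (assumed)
    const int hidden_size = 1280;                  // hparams.hidden_size (assumed)
    const int image_w = 56, image_h = 56;          // must be multiples of patch_size * 2
    const int patches_w = image_w / patch_size;    // 4
    const int patches_h = image_h / patch_size;    // 4
    const int num_patches = patches_w * patches_h; // 16 patch tokens before merging

    // after summing the two conv2d kernels, the branch reshapes to
    // (hidden_size * 2, patches_w / 2, patches_h, batch) and finally back to
    // (hidden_size, patches_w * patches_h, batch), pairing neighbouring patch columns
    printf("intermediate ne: [%d, %d, %d, 1]\n", hidden_size * 2, patches_w / 2, patches_h);
    printf("final ne:        [%d, %d, 1]\n", hidden_size, num_patches);
    return 0;
}
```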
@@ -647,12 +699,14 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
  }
  }

- struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions);
+ struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_position_ids);
  ggml_set_name(positions, "positions");
  ggml_set_input(positions);

- embeddings =
- ggml_add(ctx0, embeddings, ggml_get_rows(ctx0, model.position_embeddings, positions));
+ if (!ctx->has_qwen2vl_merger) { // qwen2vl use rope position embedding
+ embeddings =
+ ggml_add(ctx0, embeddings, ggml_get_rows(ctx0, model.position_embeddings, positions));
+ }

  if (ctx->has_minicpmv_projector) {
  int pos_w = image_size_width/patch_size;
@@ -676,7 +730,8 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
  }

  // loop over layers
- if (ctx->has_minicpmv_projector) {
+ if (ctx->has_minicpmv_projector || ctx->has_qwen2vl_merger) {
+ // TODO: figure out why we doing thing in this way ???
  n_layer += 1;
  }
  for (int il = 0; il < n_layer - 1; il++) {
@@ -698,8 +753,13 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
  struct ggml_tensor * Q =
  ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].q_w, cur), model.layers[il].q_b);

- Q = ggml_scale_inplace(ctx0, Q, 1.0f / sqrt((float)d_head));
  Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, num_positions, batch_size);
+ if (ctx->has_qwen2vl_merger) {
+ Q = ggml_rope_multi(
+ ctx0, Q, positions, nullptr,
+ d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);
+ }
+ Q = ggml_scale_inplace(ctx0, Q, 1.0f / sqrt((float)d_head));
  Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3));
  Q = ggml_reshape_3d(ctx0, Q, d_head, num_positions, n_head * batch_size);

@@ -707,6 +767,11 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
  ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].k_w, cur), model.layers[il].k_b);

  K = ggml_reshape_4d(ctx0, K, d_head, n_head, num_positions, batch_size);
+ if (ctx->has_qwen2vl_merger) {
+ K = ggml_rope_multi(
+ ctx0, K, positions, nullptr,
+ d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);
+ }
  K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3));
  K = ggml_reshape_3d(ctx0, K, d_head, num_positions, n_head * batch_size);

@@ -746,6 +811,8 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32

  if (ctx->use_gelu) {
  cur = ggml_gelu_inplace(ctx0, cur);
+ } else if (ctx->use_silu) {
+ cur = ggml_silu_inplace(ctx0, cur);
  } else {
  cur = ggml_gelu_quick_inplace(ctx0, cur);
  }
@@ -757,6 +824,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
  cur = ggml_add(ctx0, embeddings, cur);

  embeddings = cur;
+
  }

  // post-layernorm
@@ -828,7 +896,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
  mlp_3 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_3, 1, 0, 2, 3));
  mlp_3 = ggml_reshape_4d(ctx0, mlp_3, n_patch, n_patch, mlp_3->ne[1], mlp_3->ne[2]);
  // stride = 1, padding = 1, bias is nullptr
- block_1 = ggml_conv_depthwise_2d(ctx0, model.mm_model_block_1_block_0_0_w, mlp_3, 1, 1, 1, 1, 1, 1);
+ block_1 = ggml_conv_2d_dw(ctx0, model.mm_model_block_1_block_0_0_w, mlp_3, 1, 1, 1, 1, 1, 1);

  // layer norm
  // // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
@@ -876,7 +944,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
  // block_2
  {
  // stride = 2
- block_1 = ggml_conv_depthwise_2d(ctx0, model.mm_model_block_2_block_0_0_w, block_1, 2, 2, 1, 1, 1, 1);
+ block_1 = ggml_conv_2d_dw(ctx0, model.mm_model_block_2_block_0_0_w, block_1, 2, 2, 1, 1, 1, 1);

  // block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1]
  // layer norm
@@ -937,7 +1005,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
  // mlp_2 ne [24, 24, 2048, 1]
  mlp_2 = ggml_pool_2d(ctx0, mlp_2, GGML_OP_POOL_AVG, 2, 2, 2, 2, 0, 0);
  // weight ne = [3, 3, 2048, 1]
- struct ggml_tensor * peg_0 = ggml_conv_depthwise_2d(ctx0, model.mm_model_peg_0_w, mlp_2, 1, 1, 1, 1, 1, 1);
+ struct ggml_tensor * peg_0 = ggml_conv_2d_dw(ctx0, model.mm_model_peg_0_w, mlp_2, 1, 1, 1, 1, 1, 1);
  peg_0 = ggml_cont(ctx0, ggml_permute(ctx0, peg_0, 1, 2, 0, 3));
  peg_0 = ggml_add(ctx0, peg_0, model.mm_model_peg_0_b);
  mlp_2 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_2, 1, 2, 0, 3));
@@ -1018,6 +1086,19 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
  GGML_ASSERT(false);
  }
  }
+ else if (ctx->proj_type == PROJECTOR_TYPE_MERGER) {
+ embeddings = ggml_reshape_3d(ctx0, embeddings, hidden_size * 4, num_positions / 4, batch_size);
+
+ embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
+ embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
+
+ // GELU activation
+ embeddings = ggml_gelu(ctx0, embeddings);
+
+ // Second linear layer
+ embeddings = ggml_mul_mat(ctx0, model.mm_1_w, embeddings);
+ embeddings = ggml_add(ctx0, embeddings, model.mm_1_b);
+ }

  // build the graph
  ggml_build_forward_expand(gf, embeddings);
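The PROJECTOR_TYPE_MERGER branch above concatenates groups of four neighbouring patch embeddings and runs them through a two-layer MLP (mm_0, GELU, mm_1). A rough shape sketch with assumed sizes (hidden_size, num_positions and the 3584-dim output are illustrative values, not taken from a specific GGUF):

```cpp
// Shape walk-through of the MERGER projector; every number here is an assumption.
#include <cstdio>

int main() {
    const int hidden_size   = 1280;   // vision hidden size (assumed)
    const int num_positions = 16;     // patch tokens entering the projector (assumed)
    const int n_embd_out    = 3584;   // model.mm_1_b->ne[0], i.e. clip_n_mmproj_embd() (assumed)

    // ggml_reshape_3d(..., hidden_size * 4, num_positions / 4, batch_size):
    printf("merger input : [%d, %d, 1]\n", hidden_size * 4, num_positions / 4); // [5120, 4, 1]
    // mm_0 (linear) -> GELU -> mm_1 (linear):
    printf("merger output: [%d, %d, 1]\n", n_embd_out, num_positions / 4);      // [3584, 4, 1]
    return 0;
}
```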
@@ -1141,25 +1222,30 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
  }
  }

- #ifdef GGML_USE_CUDA
- new_clip->backend = ggml_backend_cuda_init(0);
- LOG_INF("%s: CLIP using CUDA backend\n", __func__);
- #endif
-
- #ifdef GGML_USE_METAL
- new_clip->backend = ggml_backend_metal_init();
- LOG_INF("%s: CLIP using Metal backend\n", __func__);
- #endif
-
- #ifdef GGML_USE_CANN
- new_clip->backend = ggml_backend_cann_init(0);
- LOG_INF("%s: CLIP using CANN backend\n", __func__);
- #endif
-
- #ifdef GGML_USE_VULKAN
- new_clip->backend = ggml_backend_vk_init(0);
- LOG_INF("%s: CLIP using Vulkan backend\n", __func__);
- #endif
+ //#ifdef GGML_USE_CUDA
+ // new_clip->backend = ggml_backend_cuda_init(0);
+ // LOG_INF("%s: CLIP using CUDA backend\n", __func__);
+ //#endif
+ //
+ //#ifdef GGML_USE_METAL
+ // new_clip->backend = ggml_backend_metal_init();
+ // LOG_INF("%s: CLIP using Metal backend\n", __func__);
+ //#endif
+ //
+ //#ifdef GGML_USE_CANN
+ // new_clip->backend = ggml_backend_cann_init(0);
+ // LOG_INF("%s: CLIP using CANN backend\n", __func__);
+ //#endif
+ //
+ //#ifdef GGML_USE_VULKAN
+ // new_clip->backend = ggml_backend_vk_init(0);
+ // LOG_INF("%s: CLIP using Vulkan backend\n", __func__);
+ //#endif
+ //
+ //#ifdef GGML_USE_SYCL
+ // new_clip->backend = ggml_backend_sycl_init(0);
+ // LOG_INF("%s: CLIP using SYCL backend\n", __func__);
+ //#endif

  if (!new_clip->backend) {
  new_clip->backend = ggml_backend_cpu_init();
@@ -1189,6 +1275,10 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
  new_clip->minicpmv_version = gguf_get_val_i32(ctx, idx);
  }

+ idx = gguf_find_key(ctx, KEY_HAS_QWEN2VL_MERGER);
+ if (idx != -1) {
+ new_clip->has_qwen2vl_merger = gguf_get_val_bool(ctx, idx);
+ }
  // GGML_ASSERT(new_clip->has_llava_projector); // see monatis/clip.cpp for image and/or text encoding for semantic search

  GGML_ASSERT(new_clip->has_vision_encoder);
@@ -1197,6 +1287,13 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
  idx = get_key_idx(ctx, KEY_USE_GELU);
  new_clip->use_gelu = gguf_get_val_bool(ctx, idx);

+ try {
+ idx = get_key_idx(ctx, KEY_USE_SILU);
+ new_clip->use_silu = gguf_get_val_bool(ctx, idx);
+ } catch (std::runtime_error & /*e*/) {
+ new_clip->use_silu = false;
+ }
+
  if (verbosity >= 1) {
  LOG_INF("%s: text_encoder: %d\n", __func__, new_clip->has_text_encoder);
  LOG_INF("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder);
@@ -1372,11 +1469,16 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
  }

  try {
- vision_model.patch_embeddings = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD);
+ vision_model.patch_embeddings_0 = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD);
  vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v"));
  } catch(const std::exception& /*e*/) {
  LOG_ERR("%s: failed to load vision model tensors\n", __func__);
  }
+ try {
+ vision_model.patch_embeddings_1 = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD_1);
+ } catch(const std::exception& /*e*/) {
+ new_clip->has_qwen2vl_merger = false;
+ }

  // LLaVA projection
  if (new_clip->proj_type == PROJECTOR_TYPE_MLP || new_clip->proj_type == PROJECTOR_TYPE_MLP_NORM) {
@@ -1464,6 +1566,12 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
  vision_model.mm_model_ln_post_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "post", "weight"));
  vision_model.mm_model_ln_post_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "post", "bias"));
  }
+ else if (new_clip->proj_type == PROJECTOR_TYPE_MERGER) {
+ vision_model.mm_0_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "weight"));
+ vision_model.mm_0_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "bias"));
+ vision_model.mm_1_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "weight"));
+ vision_model.mm_1_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "bias"));
+ }
  else {
  std::string proj_type = PROJECTOR_TYPE_NAMES[new_clip->proj_type];
  throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str()));
@@ -1502,6 +1610,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
  new_clip->compute_alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(new_clip->backend));
  clip_image_f32_batch batch;
  batch.size = 1;
+ batch.data = nullptr;
  ggml_cgraph * gf = clip_image_build_graph(new_clip, &batch, nullptr, false);
  ggml_gallocr_reserve(new_clip->compute_alloc, gf);
  size_t compute_memory_buffer_size = ggml_gallocr_get_buffer_size(new_clip->compute_alloc, 0);
@@ -1515,6 +1624,10 @@ void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size
  ctx_clip->load_image_size = load_image_size;
  }

+ struct clip_image_size * clip_get_load_image_size(struct clip_ctx * ctx_clip) {
+ return ctx_clip->load_image_size;
+ }
+
  struct clip_image_size * clip_image_size_init() {
  struct clip_image_size * load_image_size = new struct clip_image_size();
  load_image_size->width = 448;
@@ -1967,6 +2080,23 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
  }
  return true;
  }
+ else if (ctx->has_qwen2vl_merger) {
+ clip_image_u8 * resized = clip_image_u8_init();
+ auto patch_size = clip_patch_size(ctx) * 2;
+ int nx = ceil((float)img->nx / patch_size) * patch_size;
+ int ny = ceil((float)img->ny / patch_size) * patch_size;
+ bicubic_resize(*img, *resized, nx, ny);
+
+ res_imgs->data = new clip_image_f32[1];
+ // clip_image_f32 * res = clip_image_f32_init();
+ normalize_image_u8_to_f32(resized, res_imgs->data, ctx->image_mean, ctx->image_std);
+ // res_imgs->data[0] = *res;
+ res_imgs->size = 1;
+
+ // clip_image_f32_free(res);
+ clip_image_u8_free(resized);
+ return true;
+ }

  bool pad_to_square = true;
  if (!ctx->has_vision_encoder) {
@@ -2156,6 +2286,13 @@ size_t clip_embd_nbytes(const struct clip_ctx * ctx) {
  return clip_n_patches(ctx) * clip_n_mmproj_embd(ctx) * sizeof(float);
  }

+ size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_h, int img_w) {
+ clip_image_f32 img;
+ img.nx = img_w;
+ img.ny = img_h;
+ return clip_n_patches_by_img(ctx, &img) * clip_n_mmproj_embd(ctx) * sizeof(float);
+ }
+
  int32_t clip_image_size(const struct clip_ctx * ctx) {
  return ctx->vision_model.hparams.image_size;
  }
@@ -2177,6 +2314,13 @@ const int32_t * clip_image_grid(const struct clip_ctx * ctx) {
  }

  int clip_n_patches(const struct clip_ctx * ctx) {
+ clip_image_f32 img;
+ img.nx = ctx->vision_model.hparams.image_size;
+ img.ny = ctx->vision_model.hparams.image_size;
+ return clip_n_patches_by_img(ctx, &img);
+ }
+
+ int clip_n_patches_by_img(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
  const auto & params = ctx->vision_model.hparams;

  int n_patches = (params.image_size / params.patch_size) * (params.image_size / params.patch_size);
@@ -2190,6 +2334,11 @@ int clip_n_patches(const struct clip_ctx * ctx) {
  else if (ctx->minicpmv_version == 3) {
  n_patches = 64;
  }
+ } else if (ctx->proj_type == PROJECTOR_TYPE_MERGER) {
+ int patch_size = params.patch_size * 2;
+ int x_patch = img->nx / patch_size + (int)(img->nx % patch_size > 0);
+ int y_patch = img->ny / patch_size + (int)(img->ny % patch_size > 0);
+ n_patches = x_patch * y_patch;
  }

  return n_patches;
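The new clip_n_patches_by_img() path for PROJECTOR_TYPE_MERGER doubles the effective patch size and rounds each axis up. A standalone mirror of that arithmetic (the 640x480 image and patch size 14 are example values, not from the package):

```cpp
// Mirror of the MERGER patch-count math above, with made-up input sizes.
#include <cstdio>

static int n_patches_merger(int nx, int ny, int hparam_patch_size) {
    const int patch_size = hparam_patch_size * 2;                     // merged 2x2 cells
    const int x_patch = nx / patch_size + (int)(nx % patch_size > 0); // ceil(nx / patch_size)
    const int y_patch = ny / patch_size + (int)(ny % patch_size > 0); // ceil(ny / patch_size)
    return x_patch * y_patch;
}

int main() {
    // e.g. a 640x480 image with hparams.patch_size = 14 -> 28-pixel cells -> 23 * 18 = 414
    printf("%d\n", n_patches_merger(640, 480, 14));
    return 0;
}
```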
@@ -2318,7 +2467,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
  const int image_size = hparams.image_size;
  int image_size_width = image_size;
  int image_size_height = image_size;
- if (ctx->has_minicpmv_projector) {
+ if (ctx->has_minicpmv_projector | ctx->has_qwen2vl_merger) {
  image_size_width = imgs->data[0].nx;
  image_size_height = imgs->data[0].ny;
  }
@@ -2338,7 +2487,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
  for (size_t i = 0; i < imgs->size; i++) {
  const int nx = imgs->data[i].nx;
  const int ny = imgs->data[i].ny;
- if (!ctx->has_minicpmv_projector) {
+ if (!(ctx->has_minicpmv_projector | ctx->has_qwen2vl_merger)) {
  GGML_ASSERT(nx == image_size && ny == image_size);
  }

@@ -2396,9 +2545,9 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
  auto pos_embed_t = get_2d_sincos_pos_embed(embed_dim, std::make_pair(pos_w, pos_h));

  float * pos_embed_data = (float *)malloc(ggml_nbytes(pos_embed));
- for(int i=0;i<pos_w * pos_h;++i){
- for(int j=0;j<embed_dim;++j){
- pos_embed_data[i*embed_dim+j]=pos_embed_t[i][j];
+ for(int i=0;i < pos_w * pos_h; ++i){
+ for(int j=0; j < embed_dim; ++j){
+ pos_embed_data[i * embed_dim + j] = pos_embed_t[i][j];
  }
  }

@@ -2418,7 +2567,34 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
  }
  }

- {
+ if (ctx->has_qwen2vl_merger) {
+ struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions");
+
+ const int pw = image_size_width / patch_size;
+ const int ph = image_size_height / patch_size;
+ int* positions_data = (int*)malloc(ggml_nbytes(positions));
+
+ int ptr = 0;
+ for (int y = 0; y < ph; y+=2)
+ {
+ for (int x = 0; x < pw; x+=2)
+ {
+ for (int dy = 0; dy < 2; dy++) {
+ for (int dx = 0; dx < 2; dx++) {
+ positions_data[ptr] = y + dy;
+ positions_data[num_patches + ptr] = x + dx;
+ positions_data[num_patches * 2 + ptr] = y + dy;
+ positions_data[num_patches * 3 + ptr] = x + dx;
+ ptr++;
+ }
+ }
+ }
+ }
+
+ ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions));
+ free(positions_data);
+ }
+ else {
  struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions");

  int* positions_data = (int*)malloc(ggml_nbytes(positions));
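The qwen2vl branch above fills the "positions" tensor with four sections of length num_patches (row, column, row, column), enumerating patches in 2x2 blocks to match the merger. A standalone sketch that reproduces the layout for an assumed 4x4 patch grid:

```cpp
// Reproduce the qwen2vl position-id layout for a hypothetical 4x4 patch grid.
#include <cstdio>
#include <vector>

int main() {
    const int pw = 4, ph = 4;              // assumed patch grid (image size / patch size)
    const int num_patches = pw * ph;
    std::vector<int> pos(num_patches * 4);

    int ptr = 0;
    for (int y = 0; y < ph; y += 2)
        for (int x = 0; x < pw; x += 2)
            for (int dy = 0; dy < 2; dy++)
                for (int dx = 0; dx < 2; dx++) {
                    pos[ptr]                   = y + dy;  // section 0: row id
                    pos[num_patches + ptr]     = x + dx;  // section 1: column id
                    pos[num_patches * 2 + ptr] = y + dy;  // section 2: row id (repeated)
                    pos[num_patches * 3 + ptr] = x + dx;  // section 3: column id (repeated)
                    ptr++;
                }

    // section 0 comes out as: 0 0 1 1 0 0 1 1 2 2 3 3 2 2 3 3
    for (int i = 0; i < num_patches; i++) printf("%d ", pos[i]);
    printf("\n");
    return 0;
}
```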
@@ -2427,16 +2603,16 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
  }
  ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions));
  free(positions_data);
- }

- {
- struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches");
- int* patches_data = (int*)malloc(ggml_nbytes(patches));
- for (int i = 0; i < num_patches; i++) {
- patches_data[i] = i + 1;
+ {
+ struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches");
+ int* patches_data = (int*)malloc(ggml_nbytes(patches));
+ for (int i = 0; i < num_patches; i++) {
+ patches_data[i] = i + 1;
+ }
+ ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches));
+ free(patches_data);
  }
- ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches));
- free(patches_data);
  }
  }

@@ -2609,6 +2785,9 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
  return 3584;
  }
  }
+ if (ctx->proj_type == PROJECTOR_TYPE_MERGER) {
+ return ctx->vision_model.mm_1_b->ne[0];
+ }

  std::string proj_type = PROJECTOR_TYPE_NAMES[ctx->proj_type];
  throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str()));
@@ -2620,3 +2799,21 @@ int clip_is_minicpmv(const struct clip_ctx * ctx) {
  }
  return 0;
  }
+
+ bool clip_is_qwen2vl(const struct clip_ctx * ctx) {
+ return ctx->has_qwen2vl_merger;
+ }
+
+
+ bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec) {
+ clip_image_f32 clip_img;
+ clip_img.buf.resize(h * w * 3);
+ for (int i = 0; i < h*w*3; i++)
+ {
+ clip_img.buf[i] = img[i];
+ }
+ clip_img.nx = w;
+ clip_img.ny = h;
+ clip_image_encode(ctx, n_threads, &clip_img, vec);
+ return true;
+ }
package/src/llama.cpp/examples/llava/clip.h

@@ -45,6 +45,7 @@ CLIP_API struct clip_ctx * clip_model_load_cpu(const char * fname, int verbosity
  CLIP_API void clip_free(struct clip_ctx * ctx);

  CLIP_API size_t clip_embd_nbytes(const struct clip_ctx * ctx);
+ CLIP_API size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_h, int img_w);

  CLIP_API int32_t clip_image_size (const struct clip_ctx * ctx);
  CLIP_API int32_t clip_patch_size (const struct clip_ctx * ctx);
@@ -55,11 +56,13 @@ CLIP_API const char * clip_patch_merge_type(const struct clip_ctx * ctx);

  CLIP_API const int32_t * clip_image_grid(const struct clip_ctx * ctx);

- CLIP_API int clip_n_patches (const struct clip_ctx * ctx);
- CLIP_API int clip_n_mmproj_embd(const struct clip_ctx * ctx);
+ CLIP_API int clip_n_patches (const struct clip_ctx * ctx);
+ CLIP_API int clip_n_patches_by_img (const struct clip_ctx * ctx, struct clip_image_f32 * img);
+ CLIP_API int clip_n_mmproj_embd (const struct clip_ctx * ctx);

  CLIP_API int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip);
  CLIP_API void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size);
+ CLIP_API struct clip_image_size * clip_get_load_image_size(struct clip_ctx * ctx_clip);

  CLIP_API struct clip_image_size * clip_image_size_init();
  CLIP_API struct clip_image_u8 * clip_image_u8_init ();
@@ -86,6 +89,9 @@ CLIP_API bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, cons
  CLIP_API bool clip_model_quantize(const char * fname_inp, const char * fname_out, int itype);

  CLIP_API int clip_is_minicpmv(const struct clip_ctx * ctx);
+ CLIP_API bool clip_is_qwen2vl(const struct clip_ctx * ctx);
+
+ CLIP_API bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec);

  #ifdef __cplusplus
  }
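Taken together, the clip.h additions give callers an image-size-aware path. A hypothetical caller (sketch only; the model file name, thread count and resolution are assumptions, and clip_model_load()/clip_free() come from the unchanged part of clip.h):

```cpp
#include "clip.h"
#include <cstdio>
#include <vector>

int main() {
    struct clip_ctx * ctx = clip_model_load("mmproj-qwen2vl.gguf", /*verbosity=*/1);
    if (!ctx) return 1;

    if (clip_is_qwen2vl(ctx)) {
        const int h = 336, w = 448;  // assumed input resolution
        // size the output buffer for this particular image instead of the fixed hparam size
        std::vector<float> embd(clip_embd_nbytes_by_img(ctx, h, w) / sizeof(float));
        std::vector<float> img(h * w * 3, 0.5f);  // already-normalized RGB floats
        clip_encode_float_image(ctx, /*n_threads=*/4, img.data(), h, w, embd.data());
        printf("encoded %zu floats\n", embd.size());
    }

    clip_free(ctx);
    return 0;
}
```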