@fugood/llama.node 0.3.1 → 0.3.3

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
Files changed (252)
  1. package/CMakeLists.txt +1 -8
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  8. package/bin/win32/arm64/llama-node.node +0 -0
  9. package/bin/win32/arm64/node.lib +0 -0
  10. package/bin/win32/x64/llama-node.node +0 -0
  11. package/bin/win32/x64/node.lib +0 -0
  12. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  13. package/bin/win32-vulkan/arm64/node.lib +0 -0
  14. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/x64/node.lib +0 -0
  16. package/package.json +4 -2
  17. package/src/DetokenizeWorker.cpp +1 -1
  18. package/src/EmbeddingWorker.cpp +2 -2
  19. package/src/LlamaCompletionWorker.cpp +10 -10
  20. package/src/LlamaCompletionWorker.h +2 -2
  21. package/src/LlamaContext.cpp +14 -17
  22. package/src/TokenizeWorker.cpp +1 -1
  23. package/src/common.hpp +5 -4
  24. package/src/llama.cpp/.github/workflows/build.yml +137 -29
  25. package/src/llama.cpp/.github/workflows/close-issue.yml +5 -0
  26. package/src/llama.cpp/.github/workflows/docker.yml +46 -34
  27. package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +7 -0
  28. package/src/llama.cpp/.github/workflows/nix-ci.yml +7 -0
  29. package/src/llama.cpp/.github/workflows/python-check-requirements.yml +2 -4
  30. package/src/llama.cpp/.github/workflows/python-type-check.yml +3 -1
  31. package/src/llama.cpp/.github/workflows/server.yml +7 -0
  32. package/src/llama.cpp/CMakeLists.txt +26 -11
  33. package/src/llama.cpp/cmake/arm64-apple-clang.cmake +16 -0
  34. package/src/llama.cpp/common/CMakeLists.txt +10 -10
  35. package/src/llama.cpp/common/arg.cpp +2041 -0
  36. package/src/llama.cpp/common/arg.h +77 -0
  37. package/src/llama.cpp/common/common.cpp +523 -1861
  38. package/src/llama.cpp/common/common.h +234 -106
  39. package/src/llama.cpp/common/console.cpp +3 -0
  40. package/src/llama.cpp/common/json-schema-to-grammar.cpp +1 -1
  41. package/src/llama.cpp/common/log.cpp +401 -0
  42. package/src/llama.cpp/common/log.h +66 -698
  43. package/src/llama.cpp/common/ngram-cache.cpp +39 -36
  44. package/src/llama.cpp/common/ngram-cache.h +19 -19
  45. package/src/llama.cpp/common/sampling.cpp +356 -350
  46. package/src/llama.cpp/common/sampling.h +62 -139
  47. package/src/llama.cpp/common/stb_image.h +5990 -6398
  48. package/src/llama.cpp/docs/build.md +72 -17
  49. package/src/llama.cpp/examples/CMakeLists.txt +1 -2
  50. package/src/llama.cpp/examples/batched/batched.cpp +49 -65
  51. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +42 -53
  52. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +55 -52
  53. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +22 -22
  54. package/src/llama.cpp/examples/cvector-generator/pca.hpp +3 -13
  55. package/src/llama.cpp/examples/embedding/embedding.cpp +147 -91
  56. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +37 -37
  57. package/src/llama.cpp/examples/export-lora/export-lora.cpp +39 -38
  58. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +14 -39
  59. package/src/llama.cpp/examples/{baby-llama → gen-docs}/CMakeLists.txt +2 -2
  60. package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +83 -0
  61. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +58 -39
  62. package/src/llama.cpp/examples/gritlm/gritlm.cpp +46 -39
  63. package/src/llama.cpp/examples/imatrix/imatrix.cpp +75 -69
  64. package/src/llama.cpp/examples/infill/infill.cpp +131 -192
  65. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +276 -178
  66. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
  67. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +40 -36
  68. package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
  69. package/src/llama.cpp/examples/llava/clip.cpp +686 -150
  70. package/src/llama.cpp/examples/llava/clip.h +11 -2
  71. package/src/llama.cpp/examples/llava/llava-cli.cpp +60 -71
  72. package/src/llama.cpp/examples/llava/llava.cpp +146 -26
  73. package/src/llama.cpp/examples/llava/llava.h +2 -3
  74. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +323 -0
  75. package/src/llama.cpp/examples/llava/requirements.txt +1 -0
  76. package/src/llama.cpp/examples/lookahead/lookahead.cpp +55 -56
  77. package/src/llama.cpp/examples/lookup/lookup-create.cpp +15 -13
  78. package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
  79. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +34 -33
  80. package/src/llama.cpp/examples/lookup/lookup.cpp +60 -63
  81. package/src/llama.cpp/examples/main/main.cpp +216 -313
  82. package/src/llama.cpp/examples/parallel/parallel.cpp +58 -59
  83. package/src/llama.cpp/examples/passkey/passkey.cpp +53 -61
  84. package/src/llama.cpp/examples/perplexity/perplexity.cpp +277 -311
  85. package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
  86. package/src/llama.cpp/examples/quantize/quantize.cpp +27 -9
  87. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +12 -12
  88. package/src/llama.cpp/examples/retrieval/retrieval.cpp +57 -52
  89. package/src/llama.cpp/examples/rpc/rpc-server.cpp +27 -2
  90. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +60 -46
  91. package/src/llama.cpp/examples/server/CMakeLists.txt +7 -18
  92. package/src/llama.cpp/examples/server/server.cpp +1347 -1531
  93. package/src/llama.cpp/examples/server/tests/requirements.txt +2 -1
  94. package/src/llama.cpp/examples/server/utils.hpp +396 -107
  95. package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
  96. package/src/llama.cpp/examples/simple/simple.cpp +132 -106
  97. package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +5 -0
  98. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +197 -0
  99. package/src/llama.cpp/examples/speculative/speculative.cpp +153 -124
  100. package/src/llama.cpp/examples/sycl/run-llama2.sh +10 -19
  101. package/src/llama.cpp/examples/sycl/win-run-llama2.bat +1 -1
  102. package/src/llama.cpp/examples/tokenize/tokenize.cpp +27 -29
  103. package/src/llama.cpp/ggml/CMakeLists.txt +29 -12
  104. package/src/llama.cpp/ggml/include/ggml-alloc.h +3 -3
  105. package/src/llama.cpp/ggml/include/ggml-amx.h +25 -0
  106. package/src/llama.cpp/ggml/include/ggml-backend.h +166 -68
  107. package/src/llama.cpp/ggml/include/ggml-blas.h +5 -3
  108. package/src/llama.cpp/ggml/include/ggml-cann.h +17 -19
  109. package/src/llama.cpp/ggml/include/ggml-cpp.h +38 -0
  110. package/src/llama.cpp/ggml/include/ggml-cpu.h +177 -0
  111. package/src/llama.cpp/ggml/include/ggml-cuda.h +17 -17
  112. package/src/llama.cpp/ggml/include/ggml-kompute.h +7 -3
  113. package/src/llama.cpp/ggml/include/ggml-metal.h +13 -12
  114. package/src/llama.cpp/ggml/include/ggml-opt.h +216 -0
  115. package/src/llama.cpp/ggml/include/ggml-rpc.h +9 -5
  116. package/src/llama.cpp/ggml/include/ggml-sycl.h +18 -11
  117. package/src/llama.cpp/ggml/include/ggml-vulkan.h +10 -8
  118. package/src/llama.cpp/ggml/include/ggml.h +272 -505
  119. package/src/llama.cpp/ggml/src/CMakeLists.txt +69 -1110
  120. package/src/llama.cpp/ggml/src/ggml-aarch64.c +52 -2116
  121. package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -20
  122. package/src/llama.cpp/ggml/src/ggml-alloc.c +29 -27
  123. package/src/llama.cpp/ggml/src/ggml-amx/CMakeLists.txt +107 -0
  124. package/src/llama.cpp/ggml/src/ggml-amx/common.h +94 -0
  125. package/src/llama.cpp/ggml/src/ggml-amx/ggml-amx.cpp +446 -0
  126. package/src/llama.cpp/ggml/src/ggml-amx/mmq.cpp +2510 -0
  127. package/src/llama.cpp/ggml/src/ggml-amx/mmq.h +17 -0
  128. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +144 -81
  129. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +195 -0
  130. package/src/llama.cpp/ggml/src/{ggml-backend.c → ggml-backend.cpp} +394 -635
  131. package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +91 -0
  132. package/src/llama.cpp/ggml/src/{ggml-blas.cpp → ggml-blas/ggml-blas.cpp} +217 -70
  133. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +46 -0
  134. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +4 -27
  135. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +32 -4
  136. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +179 -41
  137. package/src/llama.cpp/ggml/src/ggml-cann/common.h +1 -0
  138. package/src/llama.cpp/ggml/src/{ggml-cann.cpp → ggml-cann/ggml-cann.cpp} +458 -353
  139. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -1
  140. package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +2 -0
  141. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +278 -0
  142. package/src/llama.cpp/ggml/src/ggml-common.h +20 -0
  143. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +261 -0
  144. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.c +3560 -0
  145. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +30 -0
  146. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +371 -0
  147. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +10822 -0
  148. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
  149. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +13970 -0
  150. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +663 -0
  151. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1885 -0
  152. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +155 -0
  153. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +14 -0
  154. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +178 -0
  155. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +134 -0
  156. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +106 -0
  157. package/src/llama.cpp/ggml/src/ggml-impl.h +380 -584
  158. package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +162 -0
  159. package/src/llama.cpp/ggml/src/{ggml-kompute.cpp → ggml-kompute/ggml-kompute.cpp} +233 -87
  160. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +108 -0
  161. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +249 -0
  162. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +100 -0
  163. package/src/llama.cpp/ggml/src/ggml-opt.cpp +867 -0
  164. package/src/llama.cpp/ggml/src/ggml-quants.c +369 -9994
  165. package/src/llama.cpp/ggml/src/ggml-quants.h +78 -110
  166. package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +11 -0
  167. package/src/llama.cpp/ggml/src/{ggml-rpc.cpp → ggml-rpc/ggml-rpc.cpp} +560 -335
  168. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +81 -0
  169. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +6 -0
  170. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +51 -0
  171. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +310 -0
  172. package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +1 -0
  173. package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +99 -0
  174. package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +21 -0
  175. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +57 -57
  176. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +1 -1
  177. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +106 -106
  178. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +4 -4
  179. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +18 -25
  180. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +1011 -0
  181. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +76 -0
  182. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +101 -0
  183. package/src/llama.cpp/ggml/src/{ggml-sycl.cpp → ggml-sycl/ggml-sycl.cpp} +3350 -3980
  184. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +125 -0
  185. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +23 -0
  186. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +70 -68
  187. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +9 -6
  188. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +56 -0
  189. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +11 -0
  190. package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +8 -0
  191. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +1 -1
  192. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +71 -0
  193. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +21 -0
  194. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +4 -4
  195. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +138 -0
  196. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +10 -0
  197. package/src/llama.cpp/ggml/src/ggml-threading.cpp +12 -0
  198. package/src/llama.cpp/ggml/src/ggml-threading.h +12 -0
  199. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +78 -0
  200. package/src/llama.cpp/ggml/src/{ggml-vulkan.cpp → ggml-vulkan/ggml-vulkan.cpp} +2034 -1718
  201. package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/CMakeLists.txt +2 -0
  202. package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/vulkan-shaders-gen.cpp +152 -185
  203. package/src/llama.cpp/ggml/src/ggml.c +2075 -16579
  204. package/src/llama.cpp/include/llama.h +296 -285
  205. package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.inp +112 -0
  206. package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.out +46 -0
  207. package/src/llama.cpp/pocs/vdot/q8dot.cpp +4 -3
  208. package/src/llama.cpp/pocs/vdot/vdot.cpp +8 -7
  209. package/src/llama.cpp/requirements/requirements-convert_legacy_llama.txt +1 -1
  210. package/src/llama.cpp/src/CMakeLists.txt +2 -1
  211. package/src/llama.cpp/src/llama-grammar.cpp +721 -122
  212. package/src/llama.cpp/src/llama-grammar.h +120 -15
  213. package/src/llama.cpp/src/llama-impl.h +156 -1
  214. package/src/llama.cpp/src/llama-sampling.cpp +2058 -346
  215. package/src/llama.cpp/src/llama-sampling.h +39 -47
  216. package/src/llama.cpp/src/llama-vocab.cpp +390 -127
  217. package/src/llama.cpp/src/llama-vocab.h +60 -20
  218. package/src/llama.cpp/src/llama.cpp +6215 -3263
  219. package/src/llama.cpp/src/unicode-data.cpp +6 -4
  220. package/src/llama.cpp/src/unicode-data.h +4 -4
  221. package/src/llama.cpp/src/unicode.cpp +15 -7
  222. package/src/llama.cpp/tests/CMakeLists.txt +4 -2
  223. package/src/llama.cpp/tests/test-arg-parser.cpp +131 -0
  224. package/src/llama.cpp/tests/test-backend-ops.cpp +1725 -297
  225. package/src/llama.cpp/tests/test-barrier.cpp +94 -0
  226. package/src/llama.cpp/tests/test-chat-template.cpp +9 -5
  227. package/src/llama.cpp/tests/test-grammar-integration.cpp +23 -38
  228. package/src/llama.cpp/tests/test-grammar-parser.cpp +6 -4
  229. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +23 -8
  230. package/src/llama.cpp/tests/test-llama-grammar.cpp +9 -8
  231. package/src/llama.cpp/tests/test-log.cpp +39 -0
  232. package/src/llama.cpp/tests/test-opt.cpp +853 -142
  233. package/src/llama.cpp/tests/test-quantize-fns.cpp +28 -19
  234. package/src/llama.cpp/tests/test-quantize-perf.cpp +16 -14
  235. package/src/llama.cpp/tests/test-rope.cpp +2 -1
  236. package/src/llama.cpp/tests/test-sampling.cpp +226 -142
  237. package/src/llama.cpp/tests/test-tokenizer-0.cpp +56 -36
  238. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +5 -5
  239. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +5 -5
  240. package/patches/llama.patch +0 -22
  241. package/src/llama.cpp/.github/workflows/bench.yml +0 -310
  242. package/src/llama.cpp/common/grammar-parser.cpp +0 -536
  243. package/src/llama.cpp/common/grammar-parser.h +0 -29
  244. package/src/llama.cpp/common/train.cpp +0 -1513
  245. package/src/llama.cpp/common/train.h +0 -233
  246. package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +0 -1640
  247. package/src/llama.cpp/examples/benchmark/CMakeLists.txt +0 -6
  248. package/src/llama.cpp/examples/benchmark/benchmark-matmult.cpp +0 -275
  249. package/src/llama.cpp/ggml/src/llamafile/sgemm.cpp +0 -1027
  250. package/src/llama.cpp/tests/test-grad0.cpp +0 -1566
  251. /package/src/llama.cpp/ggml/{cmake → src/ggml-cpu/cmake}/FindSIMD.cmake +0 -0
  252. /package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.h +0 -0
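The headline change in this range is MiniCPM-V support in the llava example (entries 68-75 above), layered on the clip_ctx / llava_image_embed API that the diffs below modify. As a minimal hedged sketch of how that API fits together (assuming only the signatures visible in the llava.h and minicpmv-cli.cpp diffs further down; the file names, thread count, and batch size are illustrative placeholders, not values shipped by this package):

// Hedged usage sketch (not shipped in this package): drives the llava
// embedding API as the diffs below declare it. Paths and counts are placeholders.
#include "clip.h"
#include "llava.h"
#include "llama.h"

int main() {
    llama_backend_init();

    llama_model_params mparams = llama_model_default_params();
    llama_model * model = llama_load_model_from_file("model.gguf", mparams);

    llama_context_params cparams = llama_context_default_params();
    cparams.n_ctx = 2048; // minicpmv-cli warns that image processing needs at least 2048 context
    llama_context * ctx = llama_new_context_with_model(model, cparams);

    // load the multimodal projector, then embed one image into the context
    clip_ctx * ctx_clip = clip_model_load("mmproj.gguf", /*verbosity=*/ 1);
    llava_image_embed * embed = llava_image_embed_make_with_filename(ctx_clip, /*n_threads=*/ 4, "image.jpg");

    int n_past = 0;
    if (embed) {
        llava_eval_image_embed(ctx, embed, /*n_batch=*/ 512, &n_past); // n_past now points past the image tokens
        llava_image_embed_free(embed);
    }

    clip_free(ctx_clip);
    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();
    return 0;
}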
package/src/llama.cpp/examples/llava/llava.cpp

@@ -1,13 +1,23 @@
 #include "clip.h"
-#include "common.h"
-#include "llama.h"
 #include "llava.h"
-#include "base64.hpp"

+#include "llama.h"
+
+#include <algorithm>
+#include <cerrno>
 #include <cstdio>
 #include <cstdlib>
+#include <cstring>
+#include <limits>
 #include <vector>
-#include <numeric>
+
+#define die(msg) do { fputs("error: " msg "\n", stderr); exit(1); } while (0)
+#define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0)
+
+#define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
+#define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
+#define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
+#define LOG_DBG(...) do { fprintf(stdout, __VA_ARGS__); } while (0)

 // RGB uint8 image
 struct clip_image_u8 {
@@ -54,7 +64,7 @@ static std::pair<int, int> select_best_resolution(const std::pair<int, int>& ori
         int downscaled_height = static_cast<int>(original_height * scale);
         int effective_resolution = std::min(downscaled_width * downscaled_height, original_width * original_height);
         int wasted_resolution = (width * height) - effective_resolution;
-        // LOG_TEE("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution);
+        // LOG_DBG("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution);
         if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_resolution < min_wasted_resolution)) {
             max_effective_resolution = effective_resolution;
             min_wasted_resolution = wasted_resolution;
@@ -184,7 +194,7 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>
     // ggml_tensor_printf(flatten,"flatten",__LINE__,false,false);
     ggml_build_forward_expand(gf, flatten);
     ggml_graph_compute_with_ctx(model.ctx, gf, 1);
-    struct ggml_tensor* result = gf->nodes[gf->n_nodes - 1];
+    struct ggml_tensor* result = ggml_graph_node(gf, -1);

     memcpy(image_embd_out, image_embd_v[0], clip_embd_nbytes(ctx_clip)); // main image as global context
     // append without newline tokens (default behavior in llava_arch when not using unpad ):
@@ -202,6 +212,33 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>
     return true;
 }

+static clip_image_f32 * only_v2_5_reshape_by_patch(clip_image_f32 * image, int patch_size) {
+    int width = image->nx;
+    int height = image->ny;
+    int num_patches = (height / patch_size) * (width / patch_size);
+    clip_image_f32 * patch = clip_image_f32_init();
+    patch->nx = patch_size * num_patches;
+    patch->ny = patch_size;
+    patch->buf.resize(3 * patch->nx * patch->ny);
+
+    int patch_index = 0;
+
+    for (int i = 0; i < height; i += patch_size) {
+        for (int j = 0; j < width; j += patch_size) {
+            for (int pi = 0; pi < patch_size; ++pi) {
+                for (int pj = 0; pj < patch_size; ++pj) {
+                    int input_index = ((i + pi) * width + (j + pj)) * 3;
+                    int output_index = (pi * patch_size * num_patches + patch_index * patch_size + pj) * 3;
+                    patch->buf[output_index] = image->buf[input_index];
+                    patch->buf[output_index+1] = image->buf[input_index+1];
+                    patch->buf[output_index+2] = image->buf[input_index+2];
+                }
+            }
+            patch_index++;
+        }
+    }
+    return patch;
+}

 static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float * image_embd, int * n_img_pos) {
     // std::vector<clip_image_f32*> img_res_v; // format VectN x H x W x RGB (N x 336 x 336 x 3), so interleaved RGB - different to the python implementation which is N x 3 x 336 x 336
@@ -209,7 +246,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
     img_res_v.size = 0;
     img_res_v.data = nullptr;
     if (!clip_image_preprocess(ctx_clip, img, &img_res_v)) {
-        LOG_TEE("%s: unable to preprocess image\n", __func__);
+        LOG_ERR("%s: unable to preprocess image\n", __func__);
         delete[] img_res_v.data;
         return false;
     }
@@ -218,17 +255,62 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli

     const char * mm_patch_merge_type = clip_patch_merge_type(ctx_clip);

-    if (strcmp(mm_patch_merge_type, "spatial_unpad") != 0) {
+    if (clip_is_minicpmv(ctx_clip)) {
+        std::vector<float *> image_embd_v;
+        image_embd_v.resize(img_res_v.size);
+        struct clip_image_size * load_image_size = clip_image_size_init();
+        for (size_t i = 0; i < img_res_v.size; i++) {
+            const int64_t t_img_enc_step_start_us = ggml_time_us();
+            image_embd_v[i] = (float *)malloc(clip_embd_nbytes(ctx_clip));
+            int patch_size=14;
+            load_image_size->width = img_res_v.data[i].nx;
+            load_image_size->height = img_res_v.data[i].ny;
+            clip_add_load_image_size(ctx_clip, load_image_size);
+            bool encoded = false;
+            int has_minicpmv_projector = clip_is_minicpmv(ctx_clip);
+            if (has_minicpmv_projector == 2) {
+                encoded = clip_image_encode(ctx_clip, n_threads, only_v2_5_reshape_by_patch(&img_res_v.data[i], patch_size), image_embd_v[i]);
+            }
+            else if (has_minicpmv_projector == 3) {
+                encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]);
+            }
+            if (!encoded) {
+                LOG_ERR("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size);
+                return false;
+            }
+            const int64_t t_img_enc_steop_batch_us = ggml_time_us();
+            LOG_INF("%s: step %d of %d encoded in %8.2f ms\n", __func__, (int)i+1, (int)img_res_v.size, (t_img_enc_steop_batch_us - t_img_enc_step_start_us) / 1000.0);
+        }
+        const int64_t t_img_enc_batch_us = ggml_time_us();
+        LOG_INF("%s: all %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);
+
+        int n_img_pos_out = 0;
+        for (size_t i = 0; i < image_embd_v.size(); i++) {
+            std::memcpy(image_embd + n_img_pos_out * clip_n_mmproj_embd(ctx_clip), image_embd_v[i], clip_embd_nbytes(ctx_clip));
+            n_img_pos_out += clip_n_patches(ctx_clip);
+        }
+        *n_img_pos = n_img_pos_out;
+        for (size_t i = 0; i < image_embd_v.size(); i++) {
+            free(image_embd_v[i]);
+        }
+        image_embd_v.clear();
+        load_image_size->width = img->nx;
+        load_image_size->height = img->ny;
+        clip_add_load_image_size(ctx_clip, load_image_size);
+        LOG_INF("%s: load_image_size %d %d\n", __func__, load_image_size->width, load_image_size->height);
+    }
+    else if (strcmp(mm_patch_merge_type, "spatial_unpad") != 0) {
         // flat / default llava-1.5 type embedding
         *n_img_pos = clip_n_patches(ctx_clip);
         bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[0], image_embd); // image_embd shape is 576 x 4096
         delete[] img_res_v.data;
         if (!encoded) {
-            LOG_TEE("Unable to encode image\n");
+            LOG_ERR("Unable to encode image\n");

             return false;
         }
-    } else {
+    }
+    else {
         // spatial_unpad llava-1.6 type embedding
         // TODO: CLIP needs batching support - in HF the llm projection is separate after encoding, which might be a solution to quickly get batching working
         std::vector<float *> image_embd_v;
@@ -237,12 +319,12 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
             image_embd_v[i] = (float *)malloc(clip_embd_nbytes(ctx_clip)); // 576 patches * 4096 embeddings * 4 bytes = 9437184
             const bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]); // image data is in 3x336x336 format and will be converted to 336x336x3 inside
             if (!encoded) {
-                LOG_TEE("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size);
+                LOG_ERR("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size);
                 return false;
             }
         }
         const int64_t t_img_enc_batch_us = ggml_time_us();
-        LOG_TEE("%s: %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);
+        LOG_INF("%s: %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);

         const int32_t * image_grid = clip_image_grid(ctx_clip);

@@ -275,12 +357,12 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
         // clip_image_save_to_bmp(*tmp, "image_feature.bmp");
     }

-    LOG_TEE("%s: image embedding created: %d tokens\n", __func__, *n_img_pos);
+    LOG_INF("%s: image embedding created: %d tokens\n", __func__, *n_img_pos);

     const int64_t t_img_enc_end_us = ggml_time_us();
     float t_img_enc_ms = (t_img_enc_end_us - t_img_enc_start_us) / 1000.0;

-    LOG_TEE("\n%s: image encoded in %8.2f ms by CLIP (%8.2f ms per image patch)\n", __func__, t_img_enc_ms, t_img_enc_ms / *n_img_pos);
+    LOG_INF("\n%s: image encoded in %8.2f ms by CLIP (%8.2f ms per image patch)\n", __func__, t_img_enc_ms, t_img_enc_ms / *n_img_pos);

     return true;
 }
@@ -290,22 +372,26 @@ bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx *
     int n_llama_embd = llama_n_embd(llama_get_model(ctx_llama));
     auto n_image_embd = clip_n_mmproj_embd(ctx_clip);
     if (n_image_embd != n_llama_embd) {
-        LOG_TEE("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_image_embd, n_llama_embd);
+        LOG_ERR("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_image_embd, n_llama_embd);
         return false;
     }
     return true;
 }

 bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out) {
-    float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*6); // TODO: base on gridsize/llava model
+    int num_max_patches = 6;
+    if (clip_is_minicpmv(ctx_clip)) {
+        num_max_patches = 10;
+    }
+    float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*num_max_patches); // TODO: base on gridsize/llava model
     if (!image_embd) {
-        LOG_TEE("Unable to allocate memory for image embeddings\n");
+        LOG_ERR("Unable to allocate memory for image embeddings\n");
         return false;
     }

     int n_img_pos;
     if (!encode_image_with_clip(ctx_clip, n_threads, img, image_embd, &n_img_pos)) {
-        LOG_TEE("%s: cannot encode image, aborting\n", __func__);
+        LOG_ERR("%s: cannot encode image, aborting\n", __func__);
         free(image_embd);
         return false;
     }
@@ -315,6 +401,39 @@ bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, co
     return true;
 }

+struct llava_embd_batch {
+    std::vector<llama_pos> pos;
+    std::vector<int32_t> n_seq_id;
+    std::vector<llama_seq_id> seq_id_0;
+    std::vector<llama_seq_id *> seq_ids;
+    std::vector<int8_t> logits;
+    llama_batch batch;
+    llava_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
+        pos .resize(n_tokens);
+        n_seq_id.resize(n_tokens);
+        seq_ids .resize(n_tokens + 1);
+        logits .resize(n_tokens);
+        seq_id_0.resize(1);
+        seq_id_0[0] = seq_id;
+        seq_ids [n_tokens] = nullptr;
+        batch = {
+            /*n_tokens =*/ n_tokens,
+            /*tokens =*/ nullptr,
+            /*embd =*/ embd,
+            /*pos =*/ pos.data(),
+            /*n_seq_id =*/ n_seq_id.data(),
+            /*seq_id =*/ seq_ids.data(),
+            /*logits =*/ logits.data(),
+        };
+        for (int i = 0; i < n_tokens; i++) {
+            batch.pos [i] = pos_0 + i;
+            batch.n_seq_id[i] = 1;
+            batch.seq_id [i] = seq_id_0.data();
+            batch.logits [i] = false;
+        }
+    }
+};
+
 bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_embed * image_embed, int n_batch, int * n_past) {
     int n_embd = llama_n_embd(llama_get_model(ctx_llama));

@@ -323,9 +442,10 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_
         if (n_eval > n_batch) {
             n_eval = n_batch;
         }
-        llama_batch batch = {int32_t(n_eval), nullptr, (image_embed->embed+i*n_embd), nullptr, nullptr, nullptr, nullptr, *n_past, 1, 0, };
-        if (llama_decode(ctx_llama, batch)) {
-            LOG_TEE("%s : failed to eval\n", __func__);
+        float * embd = image_embed->embed+i*n_embd;
+        llava_embd_batch llava_batch = llava_embd_batch(embd, n_eval, *n_past, 0);
+        if (llama_decode(ctx_llama, llava_batch.batch)) {
+            LOG_ERR("%s : failed to eval\n", __func__);
             return false;
         }
         *n_past += n_eval;
@@ -337,7 +457,7 @@ struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * c
     clip_image_u8 * img = clip_image_u8_init();
     if (!clip_image_load_from_bytes(image_bytes, image_bytes_length, img)) {
         clip_image_u8_free(img);
-        LOG_TEE("%s: can't load image from bytes, is it a valid image?", __func__);
+        LOG_ERR("%s: can't load image from bytes, is it a valid image?", __func__);
         return NULL;
     }

@@ -346,7 +466,7 @@ struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * c
     bool image_embed_result = llava_image_embed_make_with_clip_img(ctx_clip, n_threads, img, &image_embed, &n_image_pos);
     if (!image_embed_result) {
         clip_image_u8_free(img);
-        LOG_TEE("%s: coulnd't embed the image\n", __func__);
+        LOG_ERR("%s: couldn't embed the image\n", __func__);
         return NULL;
     }

@@ -360,7 +480,7 @@ struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * c
 static bool load_file_to_bytes(const char* path, unsigned char** bytesOut, long *sizeOut) {
     auto file = fopen(path, "rb");
     if (file == NULL) {
-        LOG_TEE("%s: can't read file %s\n", __func__, path);
+        LOG_ERR("%s: can't read file %s\n", __func__, path);
         return false;
     }

@@ -370,7 +490,7 @@ static bool load_file_to_bytes(const char* path, unsigned char** bytesOut, long

     auto buffer = (unsigned char *)malloc(fileSize); // Allocate memory to hold the file data
     if (buffer == NULL) {
-        LOG_TEE("%s: failed to alloc %ld bytes for file %s\n", __func__, fileSize, path);
+        LOG_ERR("%s: failed to alloc %ld bytes for file %s\n", __func__, fileSize, path);
         perror("Memory allocation error");
         fclose(file);
         return false;
@@ -395,7 +515,7 @@ struct llava_image_embed * llava_image_embed_make_with_filename(struct clip_ctx
     long image_bytes_length;
     auto loaded = load_file_to_bytes(image_path, &image_bytes, &image_bytes_length);
     if (!loaded) {
-        LOG_TEE("%s: failed to load %s\n", __func__, image_path);
+        LOG_ERR("%s: failed to load %s\n", __func__, image_path);
         return NULL;
     }

package/src/llama.cpp/examples/llava/llava.h

@@ -17,12 +17,11 @@
 # define LLAVA_API
 #endif

-struct clip_ctx;
-
 #ifdef __cplusplus
 extern "C" {
 #endif

+struct clip_ctx;
 struct llava_image_embed {
     float * embed;
     int n_image_pos;
@@ -37,8 +36,8 @@ LLAVA_API bool llava_image_embed_make_with_clip_img(struct clip_ctx * ctx_clip,
 LLAVA_API struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length);
 /** build an image embed from a path to an image filename */
 LLAVA_API struct llava_image_embed * llava_image_embed_make_with_filename(struct clip_ctx * ctx_clip, int n_threads, const char * image_path);
-LLAVA_API void llava_image_embed_free(struct llava_image_embed * embed);
 /** free an embedding made with llava_image_embed_make_* */
+LLAVA_API void llava_image_embed_free(struct llava_image_embed * embed);

 /** write the image represented by embed into the llama context with batch size n_batch, starting at context pos n_past. on completion, n_past points to the next position in the context after the image embed. */
 LLAVA_API bool llava_eval_image_embed(struct llama_context * ctx_llama, const struct llava_image_embed * embed, int n_batch, int * n_past);
package/src/llama.cpp/examples/llava/minicpmv-cli.cpp (new file)

@@ -0,0 +1,323 @@
+#include "arg.h"
+#include "log.h"
+#include "common.h"
+#include "sampling.h"
+#include "clip.h"
+#include "llava.h"
+#include "llama.h"
+#include "ggml.h"
+
+#include <algorithm>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <vector>
+#include <iostream> // TODO: remove me
+
+struct llava_context {
+    struct clip_ctx * ctx_clip = NULL;
+    struct llama_context * ctx_llama = NULL;
+    struct llama_model * model = NULL;
+};
+
+static void show_additional_info(int /*argc*/, char ** argv) {
+    LOG("\nexample usage:\n\n%s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
+    LOG("\nnote: a lower temperature value like 0.1 is recommended for better quality.\n");
+}
+
+static struct llama_model * llava_init(common_params * params) {
+    llama_backend_init();
+    llama_numa_init(params->numa);
+
+    llama_model_params model_params = common_model_params_to_llama(*params);
+
+    llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params);
+    if (model == NULL) {
+        LOG_ERR("%s: unable to load model\n" , __func__);
+        return NULL;
+    }
+    return model;
+}
+
+static struct llava_context * llava_init_context(common_params * params, llama_model * model) {
+    auto prompt = params->prompt;
+    if (prompt.empty()) {
+        prompt = "describe the image in detail.";
+    }
+
+    llama_context_params ctx_params = common_context_params_to_llama(*params);
+    if (params->n_ctx < 2048) {
+        // warn user here, "Image processing requires at least 2048 context, setting context to 2048"
+        LOG_WRN("%s: Image processing requires at least 2048 context, setting context to 2048\n" , __func__);
+        ctx_params.n_ctx = 2048;
+    } else {
+        ctx_params.n_ctx = params->n_ctx;
+    }
+
+    llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params);
+
+    if (ctx_llama == NULL) {
+        LOG_ERR("%s: failed to create the llama_context\n" , __func__);
+        return NULL;
+    }
+
+    auto * ctx_llava = (struct llava_context *)malloc(sizeof(llava_context));
+
+    ctx_llava->ctx_llama = ctx_llama;
+    ctx_llava->model = model;
+    return ctx_llava;
+}
+
+static void llava_free(struct llava_context * ctx_llava) {
+    if (ctx_llava->ctx_clip) {
+        clip_free(ctx_llava->ctx_clip);
+        ctx_llava->ctx_clip = NULL;
+    }
+
+    llama_free(ctx_llava->ctx_llama);
+    llama_free_model(ctx_llava->model);
+    llama_backend_free();
+}
+
+static struct clip_ctx * clip_init_context(common_params * params) {
+    const char * clip_path = params->mmproj.c_str();
+
+    auto prompt = params->prompt;
+    if (prompt.empty()) {
+        prompt = "describe the image in detail.";
+    }
+    auto * ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1);
+    return ctx_clip;
+}
+
+static bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_token> tokens, int n_batch, int * n_past) {
+    int N = (int) tokens.size();
+    for (int i = 0; i < N; i += n_batch) {
+        int n_eval = (int) tokens.size() - i;
+        if (n_eval > n_batch) {
+            n_eval = n_batch;
+        }
+        if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval))) {
+            LOG_ERR("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
+            return false;
+        }
+        *n_past += n_eval;
+    }
+    return true;
+}
+
+static bool eval_id(struct llama_context * ctx_llama, int id, int * n_past) {
+    std::vector<llama_token> tokens;
+    tokens.push_back(id);
+    return eval_tokens(ctx_llama, tokens, 1, n_past);
+}
+
+static bool eval_string(struct llama_context * ctx_llama, const char* str, int n_batch, int * n_past, bool add_bos){
+    std::string str2 = str;
+    std::vector<llama_token> embd_inp = common_tokenize(ctx_llama, str2, add_bos, true);
+    return eval_tokens(ctx_llama, embd_inp, n_batch, n_past);
+}
+
+static void process_eval_image_embed(struct llava_context * ctx_llava, const struct llava_image_embed * embeds, int n_batch, int * n_past, int idx) {
+    float * image_embed = (float *)malloc(clip_embd_nbytes(ctx_llava->ctx_clip));
+    std::memcpy(image_embed, embeds->embed + idx * clip_n_patches(ctx_llava->ctx_clip) * clip_n_mmproj_embd(ctx_llava->ctx_clip), clip_embd_nbytes(ctx_llava->ctx_clip));
+
+    auto * slice_embed = (llava_image_embed*)malloc(sizeof(llava_image_embed));
+    slice_embed->embed = image_embed;
+    slice_embed->n_image_pos = clip_n_patches(ctx_llava->ctx_clip);
+    llava_eval_image_embed(ctx_llava->ctx_llama, slice_embed, n_batch, n_past);
+    llava_image_embed_free(slice_embed);
+}
+
+static void process_image(struct llava_context * ctx_llava, struct llava_image_embed * embeds, common_params * params, int &n_past) {
+    std::string system_prompt;
+    int idx = 0;
+    int num_image_embeds = embeds->n_image_pos / clip_n_patches(ctx_llava->ctx_clip);
+    int has_minicpmv_projector = clip_is_minicpmv(ctx_llava->ctx_clip);
+    if (has_minicpmv_projector == 2) {
+        system_prompt = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n";
+    }
+    else if (has_minicpmv_projector == 3) {
+        system_prompt = "<|im_start|>user\n";
+    }
+    LOG_INF("%s: image token past: %d\n", __func__, n_past);
+    eval_string(ctx_llava->ctx_llama, (system_prompt+"<image>").c_str(), params->n_batch, &n_past, false);
+    process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++);
+    eval_string(ctx_llava->ctx_llama, std::string("</image>").c_str(), params->n_batch, &n_past, false);
+    if (num_image_embeds > 1) {
+        size_t num_image_embeds_col = clip_uhd_num_image_embeds_col(ctx_llava->ctx_clip);
+        eval_string(ctx_llava->ctx_llama, std::string("<slice>").c_str(), params->n_batch, &n_past, false);
+        for (size_t i = 0; i < (num_image_embeds-1)/num_image_embeds_col; ++i) {
+            for (size_t j = 0; j < num_image_embeds_col; ++j) {
+                eval_string(ctx_llava->ctx_llama, std::string("<image>").c_str(), params->n_batch, &n_past, false);
+                process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++);
+                eval_string(ctx_llava->ctx_llama, std::string("</image>").c_str(), params->n_batch, &n_past, false);
+                if (j == num_image_embeds_col - 1) {
+                    eval_string(ctx_llava->ctx_llama, std::string("\n").c_str(), params->n_batch, &n_past, false);
+                }
+            }
+        }
+        eval_string(ctx_llava->ctx_llama, std::string("</slice>").c_str(), params->n_batch, &n_past, false);
+    }
+    LOG_INF("%s: image token past: %d\n", __func__, n_past);
+}
+
+static const char * sample(struct common_sampler * smpl,
+                           struct llama_context * ctx_llama,
+                           int * n_past) {
+    const llama_token id = common_sampler_sample(smpl, ctx_llama, -1);
+    common_sampler_accept(smpl, id, true);
+    static std::string ret;
+    if (llama_token_is_eog(llama_get_model(ctx_llama), id)) {
+        ret = "</s>";
+    } else {
+        ret = common_token_to_piece(ctx_llama, id);
+    }
+    eval_id(ctx_llama, id, n_past);
+    return ret.c_str();
+}
+
+static struct llava_context * minicpmv_init(common_params * params, const std::string & fname, int &n_past){
+    auto * ctx_clip = clip_init_context(params);
+    auto * embeds = llava_image_embed_make_with_filename(ctx_clip, params->cpuparams.n_threads, fname.c_str());
+    if (!embeds) {
+        LOG_ERR("failed to load image %s. Terminating\n\n", fname.c_str());
+        return NULL;
+    }
+
+    // process the prompt
+    if (params->prompt.empty() && params->interactive == false) {
+        LOG_ERR("prompt should be given or interactive mode should be on");
+        return NULL;
+    }
+
+    auto * model = llava_init(params);
+    if (model == NULL) {
+        fprintf(stderr, "%s: error: failed to init minicpmv model\n", __func__);
+        return NULL;
+    }
+    const int64_t t_llava_init_start_us = ggml_time_us();
+    auto * ctx_llava = llava_init_context(params, model);
+    ctx_llava->ctx_clip = ctx_clip;
+    const int64_t t_llava_init_end_us = ggml_time_us();
+    float t_llava_init_ms = (t_llava_init_end_us - t_llava_init_start_us) / 1000.0;
+    LOG_INF("%s: llava init in %8.2f ms.\n", __func__, t_llava_init_ms);
+
+    const int64_t t_process_image_start_us = ggml_time_us();
+    process_image(ctx_llava, embeds, params, n_past);
+    const int64_t t_process_image_end_us = ggml_time_us();
+    float t_process_image_ms = (t_process_image_end_us - t_process_image_start_us) / 1000.0;
+    LOG_INF("%s: llama process image in %8.2f ms.\n", __func__, t_process_image_ms);
+
+    llava_image_embed_free(embeds);
+    return ctx_llava;
+}
+
+static struct common_sampler * llama_init(struct llava_context * ctx_llava, common_params * params, const std::string & prompt, int & n_past, bool is_first = false){
+    std::string user_prompt = prompt;
+    int has_minicpmv_projector = clip_is_minicpmv(ctx_llava->ctx_clip);
+    if (!is_first) {
+        if (has_minicpmv_projector == 2) {
+            user_prompt = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n" + prompt;
+        }
+        else if (has_minicpmv_projector == 3) {
+            user_prompt = "<|im_start|>user\n" + prompt;
+        }
+    }
+
+    eval_string(ctx_llava->ctx_llama, user_prompt.c_str(), params->n_batch, &n_past, false);
+    if (has_minicpmv_projector == 2) {
+        eval_string(ctx_llava->ctx_llama, "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", params->n_batch, &n_past, false);
+    }
+    else if (has_minicpmv_projector == 3) {
+        eval_string(ctx_llava->ctx_llama, "<|im_end|><|im_start|>assistant\n", params->n_batch, &n_past, false);
+    }
+
+    // generate the response
+
+    LOG_INF("\n");
+
+    struct common_sampler * smpl = common_sampler_init(ctx_llava->model, params->sparams);
+    return smpl;
+}
+
+static const char * llama_loop(struct llava_context * ctx_llava,struct common_sampler * smpl, int &n_past){
+
+    const char * tmp = sample(smpl, ctx_llava->ctx_llama, &n_past);
+    return tmp;
+}
+
+int main(int argc, char ** argv) {
+    ggml_time_init();
+
+    common_params params;
+
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LLAVA, show_additional_info)) {
+        return 1;
+    }
+
+    common_init();
+
+    if (params.mmproj.empty() || (params.image.empty())) {
+        show_additional_info(argc, argv);
+        return 1;
+    }
+
+    for (auto & image : params.image) {
+        int n_past = 0;
+        auto * ctx_llava = minicpmv_init(&params, image, n_past);
+
+        if (!params.prompt.empty()) {
+            LOG("<user>%s\n", params.prompt.c_str());
+            LOG("<assistant>");
+            auto * smpl = llama_init(ctx_llava, &params, params.prompt, n_past, true);
+            const int max_tgt_len = params.n_predict < 0 ? 256 : params.n_predict;
+            std::string response;
+            bool have_tmp = false;
+            for (int i = 0; i < max_tgt_len; i++) {
+                const auto * tmp = llama_loop(ctx_llava, smpl, n_past);
+                response += tmp;
+                if (strcmp(tmp, "</s>") == 0){
+                    if (!have_tmp) {
+                        continue;
+                    }
+                    break;
+                }
+                if (strstr(tmp, "###")) break; // Yi-VL behavior
+                have_tmp = true;
+                printf("%s", tmp);
+                if (strstr(response.c_str(), "<user>")) break; // minicpm-v
+
+                fflush(stdout);
+            }
+            common_sampler_free(smpl);
+        }else {
+            while (true) {
+                LOG("<user>");
+                std::string prompt;
+                std::getline(std::cin, prompt);
+                LOG("<assistant>");
+                auto * smpl = llama_init(ctx_llava, &params, prompt, n_past, true);
+                const int max_tgt_len = params.n_predict < 0 ? 256 : params.n_predict;
+                std::string response;
+                for (int i = 0; i < max_tgt_len; i++) {
+                    const auto * tmp = llama_loop(ctx_llava, smpl, n_past);
+                    response += tmp;
+                    if (strcmp(tmp, "</s>") == 0) break;
+                    if (strstr(tmp, "###")) break; // Yi-VL behavior
+                    printf("%s", tmp);// mistral llava-1.6
+                    if (strstr(response.c_str(), "<user>")) break; // minicpm-v
+                    fflush(stdout);
+                }
+                common_sampler_free(smpl);
+            }
+        }
+        printf("\n");
+        llama_perf_context_print(ctx_llava->ctx_llama);
+
+        ctx_llava->model = NULL;
+        llava_free(ctx_llava);
+    }
+
+    return 0;
+}
package/src/llama.cpp/examples/llava/requirements.txt

@@ -2,3 +2,4 @@
 --extra-index-url https://download.pytorch.org/whl/cpu
 pillow~=10.2.0
 torch~=2.2.1
+torchvision~=0.17.1