@fugood/llama.node 0.3.16 → 0.3.17

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (202)
  1. package/CMakeLists.txt +3 -0
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  7. package/bin/linux-cuda/x64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  9. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  10. package/bin/win32/arm64/llama-node.node +0 -0
  11. package/bin/win32/arm64/node.lib +0 -0
  12. package/bin/win32/x64/llama-node.node +0 -0
  13. package/bin/win32/x64/node.lib +0 -0
  14. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/arm64/node.lib +0 -0
  16. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  17. package/bin/win32-vulkan/x64/node.lib +0 -0
  18. package/lib/binding.ts +5 -0
  19. package/package.json +1 -1
  20. package/src/LlamaCompletionWorker.cpp +8 -0
  21. package/src/LlamaCompletionWorker.h +1 -0
  22. package/src/LlamaContext.cpp +3 -2
  23. package/src/llama.cpp/.github/workflows/build-linux-cross.yml +124 -0
  24. package/src/llama.cpp/.github/workflows/build.yml +70 -27
  25. package/src/llama.cpp/.github/workflows/docker.yml +6 -6
  26. package/src/llama.cpp/.github/workflows/server.yml +7 -11
  27. package/src/llama.cpp/CMakeLists.txt +23 -1
  28. package/src/llama.cpp/common/CMakeLists.txt +6 -3
  29. package/src/llama.cpp/common/arg.cpp +809 -105
  30. package/src/llama.cpp/common/arg.h +9 -0
  31. package/src/llama.cpp/common/chat.cpp +1 -1
  32. package/src/llama.cpp/common/common.cpp +31 -521
  33. package/src/llama.cpp/common/common.h +17 -36
  34. package/src/llama.cpp/common/json-schema-to-grammar.cpp +3 -0
  35. package/src/llama.cpp/common/llguidance.cpp +30 -47
  36. package/src/llama.cpp/common/minja/chat-template.hpp +15 -7
  37. package/src/llama.cpp/common/minja/minja.hpp +119 -93
  38. package/src/llama.cpp/common/sampling.cpp +3 -0
  39. package/src/llama.cpp/docs/build.md +122 -7
  40. package/src/llama.cpp/examples/CMakeLists.txt +0 -9
  41. package/src/llama.cpp/examples/batched/batched.cpp +1 -1
  42. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +1 -1
  43. package/src/llama.cpp/examples/embedding/embedding.cpp +7 -1
  44. package/src/llama.cpp/examples/export-lora/export-lora.cpp +1 -1
  45. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +15 -16
  46. package/src/llama.cpp/examples/gritlm/gritlm.cpp +1 -1
  47. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +210 -8
  48. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
  49. package/src/llama.cpp/examples/llava/CMakeLists.txt +39 -24
  50. package/src/llama.cpp/examples/llava/clip-impl.h +345 -0
  51. package/src/llama.cpp/examples/llava/clip.cpp +2152 -1803
  52. package/src/llama.cpp/examples/llava/clip.h +39 -22
  53. package/src/llama.cpp/examples/llava/deprecation-warning.cpp +22 -0
  54. package/src/llama.cpp/examples/llava/llava.cpp +64 -52
  55. package/src/llama.cpp/examples/llava/mtmd-cli.cpp +344 -0
  56. package/src/llama.cpp/examples/llava/mtmd.cpp +708 -0
  57. package/src/llama.cpp/examples/llava/mtmd.h +168 -0
  58. package/src/llama.cpp/examples/llava/{qwen2vl-cli.cpp → qwen2vl-test.cpp} +83 -31
  59. package/src/llama.cpp/examples/main/main.cpp +16 -5
  60. package/src/llama.cpp/examples/parallel/parallel.cpp +3 -1
  61. package/src/llama.cpp/examples/passkey/passkey.cpp +1 -1
  62. package/src/llama.cpp/examples/perplexity/perplexity.cpp +17 -3
  63. package/src/llama.cpp/examples/quantize/quantize.cpp +115 -2
  64. package/src/llama.cpp/examples/rpc/CMakeLists.txt +4 -2
  65. package/src/llama.cpp/examples/rpc/rpc-server.cpp +163 -8
  66. package/src/llama.cpp/examples/run/CMakeLists.txt +12 -1
  67. package/src/llama.cpp/examples/run/run.cpp +14 -28
  68. package/src/llama.cpp/examples/server/httplib.h +313 -247
  69. package/src/llama.cpp/examples/server/server.cpp +238 -139
  70. package/src/llama.cpp/examples/server/utils.hpp +51 -2
  71. package/src/llama.cpp/examples/speculative/speculative.cpp +1 -1
  72. package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +1 -1
  73. package/src/llama.cpp/examples/sycl/build.sh +2 -2
  74. package/src/llama.cpp/examples/sycl/win-build-sycl.bat +2 -2
  75. package/src/llama.cpp/examples/tts/tts.cpp +6 -9
  76. package/src/llama.cpp/ggml/CMakeLists.txt +8 -2
  77. package/src/llama.cpp/ggml/cmake/GitVars.cmake +22 -0
  78. package/src/llama.cpp/ggml/include/ggml-cpu.h +5 -0
  79. package/src/llama.cpp/ggml/include/ggml-rpc.h +6 -1
  80. package/src/llama.cpp/ggml/include/ggml.h +66 -99
  81. package/src/llama.cpp/ggml/src/CMakeLists.txt +10 -7
  82. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +0 -2
  83. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +8 -4
  84. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +5 -5
  85. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +692 -1534
  86. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +613 -122
  87. package/src/llama.cpp/ggml/src/ggml-cann/common.h +135 -1
  88. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +507 -137
  89. package/src/llama.cpp/ggml/src/ggml-common.h +12 -6
  90. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +48 -22
  91. package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.cpp +158 -0
  92. package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.h +16 -0
  93. package/src/llama.cpp/ggml/src/ggml-cpu/common.h +72 -0
  94. package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +1 -1
  95. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +896 -192
  96. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +2 -21
  97. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +754 -404
  98. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1003 -13519
  99. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +2 -0
  100. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +2 -7
  101. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +0 -1
  102. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +3 -4
  103. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +533 -88
  104. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +8809 -0
  105. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +110 -0
  106. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +892 -0
  107. package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +186 -0
  108. package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +28 -0
  109. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +258 -0
  110. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +802 -0
  111. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +7 -0
  112. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +1 -0
  113. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +0 -4
  114. package/src/llama.cpp/ggml/src/ggml-impl.h +52 -18
  115. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +70 -3
  116. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +67 -119
  117. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1023 -260
  118. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +293 -40
  119. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +96 -22
  120. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
  121. package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +350 -0
  122. package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.hpp +39 -0
  123. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +0 -35
  124. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +2 -292
  125. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +79 -90
  126. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +967 -438
  127. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +22 -23
  128. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +24 -20
  129. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +1 -4
  130. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +204 -280
  131. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +84 -74
  132. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +1 -3
  133. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +37 -49
  134. package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +7 -22
  135. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +4 -14
  136. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +204 -118
  137. package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +1 -3
  138. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +23 -0
  139. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +646 -114
  140. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +12 -0
  141. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +17 -8
  142. package/src/llama.cpp/ggml/src/ggml.c +141 -245
  143. package/src/llama.cpp/ggml/src/gguf.cpp +1 -0
  144. package/src/llama.cpp/include/llama.h +30 -11
  145. package/src/llama.cpp/models/ggml-vocab-llama4.gguf.inp +112 -0
  146. package/src/llama.cpp/models/ggml-vocab-llama4.gguf.out +46 -0
  147. package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.inp +112 -0
  148. package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.out +46 -0
  149. package/src/llama.cpp/requirements/requirements-all.txt +2 -0
  150. package/src/llama.cpp/requirements/requirements-gguf_editor_gui.txt +3 -0
  151. package/src/llama.cpp/src/CMakeLists.txt +3 -2
  152. package/src/llama.cpp/src/llama-adapter.cpp +37 -1
  153. package/src/llama.cpp/src/llama-arch.cpp +160 -17
  154. package/src/llama.cpp/src/llama-arch.h +16 -0
  155. package/src/llama.cpp/src/llama-chat.cpp +82 -17
  156. package/src/llama.cpp/src/llama-chat.h +6 -2
  157. package/src/llama.cpp/src/llama-context.cpp +108 -92
  158. package/src/llama.cpp/src/llama-context.h +1 -2
  159. package/src/llama.cpp/src/llama-graph.cpp +189 -119
  160. package/src/llama.cpp/src/llama-graph.h +26 -6
  161. package/src/llama.cpp/src/llama-hparams.h +13 -0
  162. package/src/llama.cpp/src/llama-kv-cache.cpp +70 -123
  163. package/src/llama.cpp/src/llama-kv-cache.h +41 -115
  164. package/src/llama.cpp/src/llama-memory.h +1 -1
  165. package/src/llama.cpp/src/llama-mmap.cpp +1 -1
  166. package/src/llama.cpp/src/llama-model-loader.cpp +10 -5
  167. package/src/llama.cpp/src/llama-model-loader.h +5 -3
  168. package/src/llama.cpp/src/llama-model.cpp +1760 -534
  169. package/src/llama.cpp/src/llama-model.h +13 -1
  170. package/src/llama.cpp/src/llama-quant.cpp +29 -8
  171. package/src/llama.cpp/src/llama-sampling.cpp +7 -1
  172. package/src/llama.cpp/src/llama-vocab.cpp +44 -6
  173. package/src/llama.cpp/src/llama.cpp +1 -1
  174. package/src/llama.cpp/tests/CMakeLists.txt +43 -30
  175. package/src/llama.cpp/tests/test-arg-parser.cpp +51 -4
  176. package/src/llama.cpp/tests/test-backend-ops.cpp +82 -43
  177. package/src/llama.cpp/tests/test-chat-template.cpp +34 -13
  178. package/src/llama.cpp/tests/test-chat.cpp +12 -2
  179. package/src/llama.cpp/{examples/gbnf-validator/gbnf-validator.cpp → tests/test-gbnf-validator.cpp} +2 -2
  180. package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -2
  181. package/src/llama.cpp/tests/test-grammar-llguidance.cpp +63 -2
  182. package/src/llama.cpp/tests/test-grammar-parser.cpp +3 -1
  183. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -1
  184. package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -1
  185. package/src/llama.cpp/{examples/quantize-stats/quantize-stats.cpp → tests/test-quantize-stats.cpp} +3 -1
  186. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +2 -1
  187. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +2 -1
  188. package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +0 -5
  189. package/src/llama.cpp/examples/llava/gemma3-cli.cpp +0 -341
  190. package/src/llama.cpp/examples/llava/llava-cli.cpp +0 -332
  191. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +0 -354
  192. package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +0 -6
  193. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +0 -30
  194. package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +0 -19
  195. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +0 -234
  196. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +0 -197
  197. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +0 -190
  198. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -204
  199. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -191
  200. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -218
  201. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -216
  202. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -295
@@ -0,0 +1,708 @@
1
+ #include "clip.h"
2
+ #include "clip-impl.h"
3
+ #include "mtmd.h"
4
+
5
+ #include "llama.h"
6
+
7
+ #include <algorithm>
8
+ #include <cerrno>
9
+ #include <cstdio>
10
+ #include <cstdlib>
11
+ #include <cstring>
12
+ #include <limits>
13
+ #include <vector>
14
+
15
+ // slice template, used by some llava-uhd models to correctly place the special tokens around image embeddings
16
+ // models not having it (llava-1.6) will process embeddings without any special tokens in-between
17
+ enum mtmd_slice_tmpl {
18
+ MTMD_SLICE_TMPL_NONE,
19
+ MTMD_SLICE_TMPL_MINICPMV_2_5,
20
+ MTMD_SLICE_TMPL_MINICPMV_2_6,
21
+ // TODO @ngxson : add support for idefics (SmolVLM)
22
+ };
23
+
24
+ struct mtmd_context {
25
+ struct clip_ctx * ctx_clip;
26
+ const struct llama_model * text_model;
27
+ std::vector<float> image_embd_v; // image embedding vector
28
+
29
+ bool print_timings;
30
+ int n_threads;
31
+ std::string image_marker;
32
+
33
+ // for minicpmv, we need special tokens in-between slices
34
+ mtmd_slice_tmpl slice_tmpl = MTMD_SLICE_TMPL_NONE;
35
+ llama_token tok_ov_img_start = LLAMA_TOKEN_NULL; // overview image
36
+ llama_token tok_ov_img_end = LLAMA_TOKEN_NULL; // overview image
37
+ llama_token tok_slices_start = LLAMA_TOKEN_NULL; // start of all slices
38
+ llama_token tok_slices_end = LLAMA_TOKEN_NULL; // end of all slices
39
+ llama_token tok_sli_img_start = LLAMA_TOKEN_NULL; // single slice
40
+ llama_token tok_sli_img_end = LLAMA_TOKEN_NULL; // single slice
41
+ llama_token tok_row_end = LLAMA_TOKEN_NULL; // end of row
42
+
43
+ bool use_mrope = false; // for Qwen2VL, we need to use M-RoPE
44
+
45
+ // TODO @ngxson : add timings
46
+
47
+ mtmd_context(const char * mmproj_fname,
48
+ const llama_model * text_model,
49
+ const mtmd_context_params & ctx_params) :
50
+ text_model (text_model),
51
+ print_timings(ctx_params.print_timings),
52
+ n_threads (ctx_params.n_threads),
53
+ image_marker (ctx_params.image_marker)
54
+ {
55
+ clip_context_params ctx_clip_params;
56
+ ctx_clip_params.use_gpu = ctx_params.use_gpu;
57
+ ctx_clip_params.verbosity = ctx_params.verbosity;
58
+ ctx_clip = clip_init(mmproj_fname, ctx_clip_params);
59
+ if (!ctx_clip) {
60
+ throw std::runtime_error(string_format("Failed to load CLIP model from %s\n", mmproj_fname));
61
+ }
62
+
63
+ use_mrope = clip_is_qwen2vl(ctx_clip);
64
+
65
+ int minicpmv_version = clip_is_minicpmv(ctx_clip);
66
+ if (minicpmv_version == 2) {
67
+ // minicpmv 2.5 format:
68
+ // <image> (overview) </image><slice><image> (slice) </image><image> (slice) </image>\n ... </slice>
69
+ slice_tmpl = MTMD_SLICE_TMPL_MINICPMV_2_5;
70
+ tok_ov_img_start = lookup_token("<image>");
71
+ tok_ov_img_end = lookup_token("</image>");
72
+ tok_slices_start = lookup_token("<slice>");
73
+ tok_slices_end = lookup_token("</slice>");
74
+ tok_sli_img_start = tok_ov_img_start;
75
+ tok_sli_img_end = tok_ov_img_end;
76
+ tok_row_end = lookup_token("\n");
77
+
78
+ } else if (minicpmv_version == 3 || minicpmv_version == 4) {
79
+ // minicpmv 2.6 format:
80
+ // <image> (overview) </image><slice> (slice) </slice><slice> (slice) </slice>\n ...
81
+ slice_tmpl = MTMD_SLICE_TMPL_MINICPMV_2_6;
82
+ tok_ov_img_start = lookup_token("<image>");
83
+ tok_ov_img_end = lookup_token("</image>");
84
+ tok_sli_img_start = lookup_token("<slice>");
85
+ tok_sli_img_end = lookup_token("</slice>");
86
+ tok_row_end = lookup_token("\n");
87
+
88
+ } else if (minicpmv_version != 0) {
89
+ GGML_ASSERT(false && "unsupported minicpmv version");
90
+ }
91
+ }
92
+
93
+ ~mtmd_context() {
94
+ clip_free(ctx_clip);
95
+ }
96
+
97
+ private:
98
+ llama_token lookup_token(const std::string & token_text) {
99
+ const llama_vocab * vocab = llama_model_get_vocab(text_model);
100
+ const int n_vocab = llama_vocab_n_tokens(vocab);
101
+ for (int i = 0; i < n_vocab; i++) {
102
+ if (token_to_piece(vocab, i, true) == token_text) {
103
+ return i;
104
+ }
105
+ }
106
+ return LLAMA_TOKEN_NULL;
107
+ }
108
+
109
+ std::string token_to_piece(const llama_vocab * vocab, llama_token token, bool special) {
110
+ std::string piece;
111
+ piece.resize(piece.capacity()); // using string internal cache, 15 bytes + '\n'
112
+ const int n_chars = llama_token_to_piece(vocab, token, &piece[0], piece.size(), 0, special);
113
+ if (n_chars < 0) {
114
+ piece.resize(-n_chars);
115
+ int check = llama_token_to_piece(vocab, token, &piece[0], piece.size(), 0, special);
116
+ GGML_ASSERT(check == -n_chars);
117
+ } else {
118
+ piece.resize(n_chars);
119
+ }
120
+ return piece;
121
+ }
122
+ };
123
+
124
+ struct mtmd_image_tokens_data {
125
+ clip_image_f32_batch batch_f32; // preprocessed image patches
126
+ };
127
+
128
+ struct mtmd_image_tokens {
129
+ uint32_t nx; // number of tokens in x direction
130
+ uint32_t ny; // number of tokens in y direction
131
+ bool use_mrope_pos = false; // use M-RoPE position counting (the whole image is 1 temporal position)
132
+ uint32_t n_tokens() const { return nx * ny; }
133
+ clip_image_f32_batch batch_f32; // preprocessed image patches
134
+ std::string id; // optional user-defined ID, useful for KV cache tracking
135
+ };
136
+
137
+ mtmd_context * mtmd_init_from_file(const char * mmproj_fname,
138
+ const struct llama_model * text_model,
139
+ const struct mtmd_context_params ctx_params) {
140
+ try {
141
+ return new mtmd_context(mmproj_fname, text_model, ctx_params);
142
+ } catch (const std::exception & e) {
143
+ LOG_ERR("%s: error: %s\n", __func__, e.what());
144
+ return nullptr;
145
+ }
146
+ }
147
+
148
+ void mtmd_free(mtmd_context * ctx) {
149
+ if (ctx) {
150
+ delete ctx;
151
+ }
152
+ }
153
+
154
+ // copied from common_tokenize
155
+ static std::vector<llama_token> mtmd_tokenize_text_internal(
156
+ const struct llama_vocab * vocab,
157
+ const std::string & text,
158
+ bool add_special,
159
+ bool parse_special) {
160
+ // upper limit for the number of tokens
161
+ int n_tokens = text.length() + 2 * add_special;
162
+ std::vector<llama_token> result(n_tokens);
163
+ n_tokens = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
164
+ if (n_tokens < 0) {
165
+ result.resize(-n_tokens);
166
+ int check = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
167
+ GGML_ASSERT(check == -n_tokens);
168
+ } else {
169
+ result.resize(n_tokens);
170
+ }
171
+ return result;
172
+ }
173
+
174
+ int32_t mtmd_tokenize(mtmd_context * ctx,
175
+ std::vector<mtmd_input_chunk> & output,
176
+ const mtmd_input_text & text,
177
+ const std::vector<mtmd_bitmap> & bitmaps) {
178
+ auto vocab = llama_model_get_vocab(ctx->text_model);
179
+
180
+ std::string prompt_modified(text.text);
181
+ std::string marker_modified(ctx->image_marker);
182
+ projector_type proj_type = clip_get_projector_type(ctx->ctx_clip);
183
+
184
+ // a bit hacky here, but works for now
185
+ // for some models, we need to add prefix and suffix to the image embeddings
186
+ if (clip_is_gemma3(ctx->ctx_clip)) {
187
+ // gemma 3
188
+ // <start_of_image> ... (image embeddings) ... <end_of_image>
189
+ marker_modified = "<start_of_image>" + ctx->image_marker + "<end_of_image>";
190
+ string_replace_all(prompt_modified, ctx->image_marker, marker_modified);
191
+
192
+ } else if (proj_type == PROJECTOR_TYPE_GLM_EDGE) {
193
+ // <|begin_of_image|> ... (image embeddings) ... <|end_of_image|>
194
+ marker_modified = "<|begin_of_image|>" + ctx->image_marker + "<|end_of_image|>";
195
+ string_replace_all(prompt_modified, ctx->image_marker, marker_modified);
196
+
197
+ } else if (proj_type == PROJECTOR_TYPE_IDEFICS3) {
198
+ // https://github.com/huggingface/transformers/blob/a42ba80fa520c784c8f11a973ca9034e5f859b79/src/transformers/models/idefics3/processing_idefics3.py#L192-L215
199
+ marker_modified = "<fake_token_around_image><global-img>" + ctx->image_marker + "<fake_token_around_image>";
200
+ string_replace_all(prompt_modified, ctx->image_marker, marker_modified);
201
+
202
+ } else if (proj_type == PROJECTOR_TYPE_PIXTRAL) {
203
+ // https://github.com/huggingface/transformers/blob/1cd110c6cb6a6237614130c470e9a902dbc1a4bd/docs/source/en/model_doc/pixtral.md
204
+ marker_modified = ctx->image_marker + "[IMG_END]";
205
+ string_replace_all(prompt_modified, ctx->image_marker, marker_modified);
206
+ }
207
+
208
+ else if (proj_type == PROJECTOR_TYPE_QWEN2VL || proj_type == PROJECTOR_TYPE_QWEN25VL) {
209
+ // <|vision_start|> ... (image embeddings) ... <|vision_end|>
210
+ marker_modified = "<|vision_start|>" + ctx->image_marker + "<|vision_end|>";
211
+ string_replace_all(prompt_modified, ctx->image_marker, marker_modified);
212
+
213
+ }
214
+
215
+ // llava-1.5, llava-1.6, Yi-VL, Yi-34B, granite: don't need to add prefix and suffix
216
+
217
+ std::vector<std::string> parts = string_split_str(prompt_modified, ctx->image_marker);
218
+ output.clear();
219
+ output.reserve(parts.size());
220
+
221
+ size_t i_img = 0;
222
+
223
+ // utility for adding raw tokens
224
+ auto add_text_chunk = [&output](std::vector<llama_token> && tokens) {
225
+ mtmd_input_chunk chunk{
226
+ MTMD_INPUT_CHUNK_TYPE_TEXT,
227
+ std::move(tokens),
228
+ {},
229
+ };
230
+ output.emplace_back(std::move(chunk));
231
+ };
232
+
233
+ // utility for splitting batch of multiple images into chunks of batch having single images
234
+ auto split_batch_to_chunk = [&ctx](clip_image_f32_batch && batch_f32, const std::string & id) {
235
+ std::vector<mtmd_input_chunk> chunks;
236
+
237
+ for (auto & entry : batch_f32.entries) {
238
+ mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens);
239
+ image_tokens->nx = clip_n_output_tokens(ctx->ctx_clip, entry.get());
240
+ image_tokens->ny = 1;
241
+ image_tokens->batch_f32.entries.push_back(std::move(entry));
242
+ image_tokens->id = id;
243
+
244
+ mtmd_input_chunk chunk{
245
+ MTMD_INPUT_CHUNK_TYPE_IMAGE,
246
+ {},
247
+ std::move(image_tokens),
248
+ };
249
+ chunks.emplace_back(std::move(chunk));
250
+ }
251
+
252
+ return chunks;
253
+ };
254
+
255
+ for (const auto & part : parts) {
256
+ // printf("tokenizing part: %s\n", part.c_str());
257
+ bool add_bos = &parts.front() == &part;
258
+ auto tokens = mtmd_tokenize_text_internal(vocab, part, text.add_special && add_bos, text.parse_special);
259
+ if (tokens.empty()) {
260
+ continue;
261
+ }
262
+ mtmd_input_chunk chunk{
263
+ MTMD_INPUT_CHUNK_TYPE_TEXT,
264
+ std::move(tokens),
265
+ {},
266
+ };
267
+ output.emplace_back(std::move(chunk));
268
+
269
+ if (&parts.back() != &part) {
270
+ // add image token to middle of 2 parts
271
+
272
+ if (i_img >= bitmaps.size()) {
273
+ LOG_ERR("%s: error: not enough images for %d parts\n", __func__, (int)parts.size());
274
+ return 1;
275
+ }
276
+
277
+ // convert mtmd_bitmap to clip_image_u8
278
+ clip_image_u8_ptr img_u8(clip_image_u8_init());
279
+ img_u8->nx = bitmaps[i_img].nx;
280
+ img_u8->ny = bitmaps[i_img].ny;
281
+ img_u8->buf.resize(bitmaps[i_img].data.size());
282
+ std::memcpy(img_u8->buf.data(), bitmaps[i_img].data.data(), img_u8->nx * img_u8->ny * 3);
283
+ clip_image_size img_u8_size{img_u8->nx, img_u8->ny};
284
+
285
+ // preprocess image
286
+ clip_image_f32_batch batch_f32;
287
+ bool ok = clip_image_preprocess(ctx->ctx_clip, img_u8.get(), &batch_f32);
288
+ if (!ok) {
289
+ LOG_ERR("Unable to preprocess image\n");
290
+ return 2;
291
+ }
292
+
293
+ if (ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_5 || ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_6) {
294
+ // split batch into chunks of single images
295
+ auto chunks = split_batch_to_chunk(std::move(batch_f32), bitmaps[i_img].id);
296
+ GGML_ASSERT(chunks.size() > 0);
297
+
298
+ // add overview image
299
+ add_text_chunk({ctx->tok_ov_img_start});
300
+ output.emplace_back(std::move(chunks.front()));
301
+ chunks.erase(chunks.begin());
302
+ add_text_chunk({ctx->tok_ov_img_end});
303
+
304
+ // add slices
305
+ if (!chunks.empty()) {
306
+ clip_add_load_image_size(ctx->ctx_clip, &img_u8_size);
307
+ int n_col = clip_uhd_num_image_embeds_col(ctx->ctx_clip);
308
+ int n_row = (int)chunks.size() / n_col;
309
+ GGML_ASSERT(n_row * n_col == (int)chunks.size());
310
+ if (ctx->tok_slices_start != LLAMA_TOKEN_NULL) {
311
+ add_text_chunk({ctx->tok_slices_start});
312
+ }
313
+ for (int y = 0; y < n_row; y++) {
314
+ for (int x = 0; x < n_col; x++) {
315
+ if (ctx->tok_sli_img_start != LLAMA_TOKEN_NULL) {
316
+ add_text_chunk({ctx->tok_sli_img_start});
317
+ }
318
+ output.emplace_back(std::move(chunks[y * n_col + x]));
319
+ if (ctx->tok_sli_img_end != LLAMA_TOKEN_NULL) {
320
+ add_text_chunk({ctx->tok_sli_img_end});
321
+ }
322
+ }
323
+ if (ctx->tok_row_end != LLAMA_TOKEN_NULL && y != n_row - 1) {
324
+ add_text_chunk({ctx->tok_row_end});
325
+ }
326
+ }
327
+ if (ctx->tok_slices_end != LLAMA_TOKEN_NULL) {
328
+ add_text_chunk({ctx->tok_slices_end});
329
+ }
330
+ }
331
+
332
+ } else {
333
+ size_t n_tokens = 0;
334
+ for (const auto & entry : batch_f32.entries) {
335
+ n_tokens += clip_n_output_tokens(ctx->ctx_clip, entry.get());
336
+ }
337
+
338
+ mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens);
339
+ if (ctx->use_mrope) {
340
+ // for Qwen2VL, we need this information for M-RoPE decoding positions
341
+ image_tokens->nx = clip_n_output_tokens_x(ctx->ctx_clip, batch_f32.entries[0].get());
342
+ image_tokens->ny = clip_n_output_tokens_y(ctx->ctx_clip, batch_f32.entries[0].get());
343
+ image_tokens->use_mrope_pos = true;
344
+ } else {
345
+ // other models, we only need the total number of tokens
346
+ image_tokens->nx = n_tokens;
347
+ image_tokens->ny = 1;
348
+ }
349
+ image_tokens->batch_f32 = std::move(batch_f32);
350
+ image_tokens->id = bitmaps[i_img].id; // optional
351
+
352
+ LOG_DBG("image_tokens->nx = %d\n", image_tokens->nx);
353
+ LOG_DBG("image_tokens->ny = %d\n", image_tokens->ny);
354
+ LOG_DBG("batch_f32 size = %d\n", (int)image_tokens->batch_f32.entries.size());
355
+
356
+ mtmd_input_chunk chunk{
357
+ MTMD_INPUT_CHUNK_TYPE_IMAGE,
358
+ {},
359
+ std::move(image_tokens),
360
+ };
361
+ output.emplace_back(std::move(chunk));
362
+ }
363
+
364
+ i_img++; // move to next image
365
+ }
366
+ }
367
+
368
+ return 0;
369
+ }
370
+
371
+ void mtmd_image_tokens_free(mtmd_image_tokens * image_tokens) {
372
+ if (image_tokens) {
373
+ delete image_tokens;
374
+ }
375
+ }
376
+
377
+ size_t mtmd_image_tokens_get_n_tokens(const mtmd_image_tokens * image_tokens) {
378
+ return image_tokens->n_tokens();
379
+ }
380
+
381
+ size_t mtmd_image_tokens_get_nx(const mtmd_image_tokens * image_tokens) {
382
+ return image_tokens->nx;
383
+ }
384
+
385
+ size_t mtmd_image_tokens_get_ny(const mtmd_image_tokens * image_tokens) {
386
+ return image_tokens->ny;
387
+ }
388
+
389
+ std::string mtmd_image_tokens_get_id(const mtmd_image_tokens * image_tokens) {
390
+ return image_tokens->id;
391
+ }
392
+
393
+ llama_pos mtmd_image_tokens_get_n_pos(const mtmd_image_tokens * image_tokens) {
394
+ if (image_tokens->use_mrope_pos) {
395
+ return 1; // for M-RoPE, the whole image is 1 in temporal dimension
396
+ }
397
+ return image_tokens->n_tokens();
398
+ }
399
+
400
+ int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens) {
401
+ int n_mmproj_embd = clip_n_mmproj_embd(ctx->ctx_clip);
402
+ ctx->image_embd_v.resize(image_tokens->n_tokens() * n_mmproj_embd);
403
+ bool ok = false;
404
+
405
+ // only effective for minicpmv and qwen2vl, other models will ignore load_image_size
406
+ {
407
+ clip_image_size slice_size{
408
+ image_tokens->batch_f32.entries[0]->nx,
409
+ image_tokens->batch_f32.entries[0]->ny};
410
+ clip_add_load_image_size(ctx->ctx_clip, &slice_size);
411
+ }
412
+
413
+ if (clip_is_llava(ctx->ctx_clip) || clip_is_minicpmv(ctx->ctx_clip) || clip_is_glm(ctx->ctx_clip)) {
414
+ // TODO @ngxson : llava does not support batched encoding ; this should be fixed inside clip_image_batch_encode()
415
+ const auto & entries = image_tokens->batch_f32.entries;
416
+ for (size_t i = 0; i < entries.size(); i++) {
417
+ int n_tokens_per_image = clip_n_output_tokens(ctx->ctx_clip, entries[i].get());
418
+ ok = clip_image_encode(
419
+ ctx->ctx_clip,
420
+ ctx->n_threads,
421
+ entries[i].get(),
422
+ ctx->image_embd_v.data() + i*n_mmproj_embd*n_tokens_per_image);
423
+ }
424
+ } else {
425
+ ok = clip_image_batch_encode(
426
+ ctx->ctx_clip,
427
+ ctx->n_threads,
428
+ &image_tokens->batch_f32,
429
+ ctx->image_embd_v.data());
430
+ }
431
+
432
+ return ok ? 0 : 1;
433
+ }
434
+
435
+ float * mtmd_get_output_embd(mtmd_context * ctx) {
436
+ return ctx->image_embd_v.data();
437
+ }
438
+
439
+ size_t mtmd_helper_get_n_tokens(mtmd_input_chunks & chunks) {
440
+ size_t n_tokens = 0;
441
+ for (auto & chunk : chunks) {
442
+ if (chunk.type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
443
+ n_tokens += chunk.tokens_text.size();
444
+ } else if (chunk.type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
445
+ n_tokens += mtmd_image_tokens_get_n_tokens(chunk.tokens_image.get());
446
+ } else {
447
+ GGML_ASSERT(false && "chunk type not supported");
448
+ }
449
+ }
450
+ return n_tokens;
451
+ }
452
+
453
+ llama_pos mtmd_helper_get_n_pos(mtmd_input_chunks & chunks) {
454
+ llama_pos n_pos = 0;
455
+ for (auto & chunk : chunks) {
456
+ if (chunk.type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
457
+ n_pos += chunk.tokens_text.size();
458
+ } else if (chunk.type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
459
+ n_pos += mtmd_image_tokens_get_n_pos(chunk.tokens_image.get());
460
+ } else {
461
+ GGML_ASSERT(false && "chunk type not supported");
462
+ }
463
+ }
464
+ return n_pos;
465
+ }
466
+
467
+ // helper struct to make working with embd batch easier
468
+ // note: this will be removed after llama_batch_ext refactoring
469
+ struct decode_embd_batch {
470
+ int n_pos_per_embd;
471
+ int n_mmproj_embd;
472
+ std::vector<llama_pos> pos;
473
+ std::vector<llama_pos> pos_view; // used by mrope
474
+ std::vector<int32_t> n_seq_id;
475
+ std::vector<llama_seq_id> seq_id_0;
476
+ std::vector<llama_seq_id *> seq_ids;
477
+ std::vector<int8_t> logits;
478
+ llama_batch batch;
479
+ decode_embd_batch(float * embd, int32_t n_tokens, int n_pos_per_embd, int n_mmproj_embd) : n_pos_per_embd(n_pos_per_embd), n_mmproj_embd(n_mmproj_embd) {
480
+ pos .resize(n_tokens * n_pos_per_embd);
481
+ n_seq_id.resize(n_tokens);
482
+ seq_ids .resize(n_tokens + 1);
483
+ logits .resize(n_tokens);
484
+ seq_id_0.resize(1);
485
+ seq_ids [n_tokens] = nullptr;
486
+ batch = {
487
+ /*n_tokens =*/ n_tokens,
488
+ /*tokens =*/ nullptr,
489
+ /*embd =*/ embd,
490
+ /*pos =*/ pos.data(),
491
+ /*n_seq_id =*/ n_seq_id.data(),
492
+ /*seq_id =*/ seq_ids.data(),
493
+ /*logits =*/ logits.data(),
494
+ };
495
+ }
496
+
497
+ void set_position_normal(llama_pos pos_0, llama_seq_id seq_id) {
498
+ seq_id_0[0] = seq_id;
499
+ for (int i = 0; i < batch.n_tokens; i++) {
500
+ batch.pos [i] = pos_0 + i;
501
+ batch.n_seq_id[i] = 1;
502
+ batch.seq_id [i] = seq_id_0.data();
503
+ batch.logits [i] = false;
504
+ }
505
+ }
506
+
507
+ void set_position_mrope(llama_pos pos_0, int nx, int ny, llama_seq_id seq_id) {
508
+ GGML_ASSERT(n_pos_per_embd == 4);
509
+ seq_id_0[0] = seq_id;
510
+ for (int y = 0; y < ny; y++) {
511
+ for (int x = 0; x < nx; x++) {
512
+ int i = y * nx + x;
513
+ pos[i ] = pos_0;
514
+ pos[i + batch.n_tokens ] = pos_0 + y;
515
+ pos[i + batch.n_tokens * 2] = pos_0 + x;
516
+ pos[i + batch.n_tokens * 3] = 0; // last pos dim is unused
517
+ }
518
+ }
519
+ for (int i = 0; i < batch.n_tokens; i++) {
520
+ batch.n_seq_id[i] = 1;
521
+ batch.seq_id [i] = seq_id_0.data();
522
+ batch.logits [i] = false;
523
+ }
524
+ }
525
+
526
+ llama_batch get_view(int offset, int n_tokens) {
527
+ llama_pos * pos_ptr;
528
+ pos_view.clear();
529
+ pos_view.resize(n_tokens * n_pos_per_embd);
530
+ if (n_pos_per_embd > 1) {
531
+ // mrope
532
+ // for example, with layout of src: 1234...1234...1234...1234...
533
+ // offset 2 will give us dst: 34...34...34...34...
534
+ for (int i = 0; i < n_pos_per_embd; i++) {
535
+ auto src = pos.begin() + i * batch.n_tokens + offset;
536
+ pos_view.insert(pos_view.end(), src, src + n_tokens);
537
+ }
538
+ pos_ptr = pos_view.data();
539
+ } else {
540
+ // normal
541
+ pos_ptr = pos.data() + offset;
542
+ }
543
+ return {
544
+ /*n_tokens =*/ n_tokens,
545
+ /*tokens =*/ nullptr,
546
+ /*embd =*/ batch.embd + offset * n_mmproj_embd,
547
+ /*pos =*/ pos_ptr,
548
+ /*n_seq_id =*/ batch.n_seq_id + offset,
549
+ /*seq_id =*/ batch.seq_id + offset,
550
+ /*logits =*/ batch.logits + offset,
551
+ };
552
+ }
553
+ };
554
+
555
+ int32_t mtmd_helper_eval(mtmd_context * ctx,
556
+ llama_context * lctx,
557
+ mtmd_input_chunks & chunks,
558
+ llama_pos pos0,
559
+ llama_seq_id seq_id,
560
+ int32_t n_batch) {
561
+ int32_t ret;
562
+ llama_pos n_past = pos0;
563
+ llama_batch text_batch = llama_batch_init(n_batch, 0, 1);
564
+ int n_mmproj_embd = clip_n_mmproj_embd(ctx->ctx_clip);
565
+ int n_pos_per_embd = mtmd_decode_use_mrope(ctx) ? 4 : 1;
566
+
567
+ for (auto & chunk : chunks) {
568
+ bool is_last = &chunk == &chunks.back();
569
+ if (chunk.type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
570
+ text_batch.n_tokens = chunk.tokens_text.size();
571
+ size_t i = 0;
572
+ while (i < chunk.tokens_text.size()) { // split into batches
573
+ for (; i < chunk.tokens_text.size() && text_batch.n_tokens < n_batch; i++) {
574
+ text_batch.token [i] = chunk.tokens_text[i];
575
+ text_batch.pos [i] = n_past++;
576
+ text_batch.n_seq_id[i] = 1;
577
+ text_batch.seq_id [i][0] = seq_id;
578
+ text_batch.logits [i] = false;
579
+ }
580
+ if (is_last) {
581
+ // always get logits for last input chunk
582
+ text_batch.logits[text_batch.n_tokens - 1] = true;
583
+ }
584
+ ret = llama_decode(lctx, text_batch);
585
+ if (ret != 0) {
586
+ LOG_ERR("failed to decode text\n");
587
+ llama_batch_free(text_batch);
588
+ return ret;
589
+ }
590
+ }
591
+
592
+ } else if (chunk.type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
593
+ GGML_ASSERT(!is_last && "logits for last image chunk is not yet support");
594
+ GGML_ASSERT(chunk.tokens_image != nullptr);
595
+ int64_t t0 = ggml_time_ms();
596
+ if (ctx->print_timings) {
597
+ LOG_INF("encoding image or slice...\n");
598
+ }
599
+ ret = mtmd_encode(ctx, chunk.tokens_image.get());
600
+ if (ret != 0) {
601
+ LOG_ERR("failed to encode image\n");
602
+ llama_batch_free(text_batch);
603
+ return ret;
604
+ }
605
+ if (ctx->print_timings) {
606
+ LOG_INF("image/slice encoded in %" PRId64 " ms\n", ggml_time_ms() - t0);
607
+ }
608
+
609
+ int32_t n_tokens = mtmd_image_tokens_get_n_tokens(chunk.tokens_image.get());
610
+ int32_t i_batch = 0;
611
+ int32_t n_img_batches = GGML_PAD(n_tokens, n_batch) / n_batch;
612
+ float * embd = mtmd_get_output_embd(ctx);
613
+ decode_embd_batch batch_embd(embd, n_tokens, n_pos_per_embd, n_mmproj_embd);
614
+
615
+ const int nx = mtmd_image_tokens_get_nx(chunk.tokens_image.get());
616
+ const int ny = mtmd_image_tokens_get_ny(chunk.tokens_image.get());
617
+
618
+ if (mtmd_decode_use_mrope(ctx)) {
619
+ batch_embd.set_position_mrope(n_past, nx, ny, seq_id);
620
+ } else {
621
+ batch_embd.set_position_normal(n_past, seq_id);
622
+ }
623
+
624
+ if (mtmd_decode_use_non_causal(ctx)) {
625
+ llama_set_causal_attn(lctx, false);
626
+ // TODO @ngxson : need to make sure only one image is processed at a time, and n_ubatch must be enough to hold the image
627
+ }
628
+
629
+ while (i_batch < n_img_batches) { // split into batches
630
+ int pos_offset = i_batch*n_batch;
631
+ int n_tokens_batch = std::min(n_batch, n_tokens - pos_offset);
632
+ llama_batch batch_embd_view = batch_embd.get_view(pos_offset, n_tokens_batch);
633
+
634
+ LOG_INF("decoding image batch %d/%d, n_tokens_batch = %d\n", i_batch+1, n_img_batches, n_tokens_batch);
635
+
636
+ int64_t t1 = ggml_time_ms();
637
+ ret = llama_decode(lctx, batch_embd_view);
638
+ if (ret != 0) {
639
+ LOG_ERR("failed to decode image\n");
640
+ llama_set_causal_attn(lctx, true); // restore causal attn
641
+ llama_batch_free(text_batch);
642
+ return ret;
643
+ }
644
+
645
+ if (ctx->print_timings) {
646
+ LOG_INF("image decoded (batch %d/%d) in %" PRId64 " ms\n", i_batch+1, n_img_batches, ggml_time_ms() - t1);
647
+ }
648
+
649
+ i_batch++;
650
+ }
651
+
652
+ // for mrope, one image is one single **temporal** position
653
+ n_past += mtmd_decode_use_mrope(ctx) ? 1 : n_tokens;
654
+
655
+ if (mtmd_decode_use_non_causal(ctx)) {
656
+ llama_set_causal_attn(lctx, true);
657
+ }
658
+
659
+ } else {
660
+ GGML_ASSERT(false && "chunk type not supported");
661
+ }
662
+ }
663
+
664
+ llama_batch_free(text_batch);
665
+ return 0;
666
+ }
667
+
668
+ int32_t mtmd_helper_bitmap_init_from_buf(const unsigned char * buf, size_t len, mtmd_bitmap & output) {
669
+ clip_image_u8_ptr img_u8(clip_image_u8_init());
670
+ bool ok = clip_image_load_from_bytes(buf, len, img_u8.get());
671
+ if (!ok) {
672
+ LOG_ERR("Unable to load image from buffer\n");
673
+ return 1;
674
+ }
675
+ unsigned char * data = clip_image_u8_get_data(img_u8.get(), &output.nx, &output.ny);
676
+ output.data.resize(output.nx * output.ny * 3);
677
+ std::memcpy(output.data.data(), data, output.nx * output.ny * 3);
678
+ return 0;
679
+ }
680
+
681
+ int32_t mtmd_helper_bitmap_init_from_file(const char * fname, mtmd_bitmap & output) {
682
+ clip_image_u8_ptr img_u8(clip_image_u8_init());
683
+ bool ok = clip_image_load_from_file(fname, img_u8.get());
684
+ if (!ok) {
685
+ LOG_ERR("Unable to load image %s\n", fname);
686
+ return 1;
687
+ }
688
+ unsigned char * data = clip_image_u8_get_data(img_u8.get(), &output.nx, &output.ny);
689
+ output.data.resize(output.nx * output.ny * 3);
690
+ std::memcpy(output.data.data(), data, output.nx * output.ny * 3);
691
+ return 0;
692
+ }
693
+
694
+ bool mtmd_decode_use_non_causal(mtmd_context * ctx) {
695
+ projector_type proj_type = clip_get_projector_type(ctx->ctx_clip);
696
+ if (proj_type == PROJECTOR_TYPE_GEMMA3) {
697
+ return true;
698
+ }
699
+ return false;
700
+ }
701
+
702
// Whether the loaded model uses M-RoPE positions (4 position values per
// embedding); simply exposes the flag stored on the context.
bool mtmd_decode_use_mrope(mtmd_context * ctx) {
    return ctx->use_mrope;
}
705
+
706
// Deleter used by the mtmd_image_tokens smart pointer: forwards to the
// C-style free function so ownership can cross the C API boundary.
void mtmd_image_tokens_deleter::operator()(mtmd_image_tokens * val) {
    mtmd_image_tokens_free(val);
}