@fugood/llama.node 0.3.17 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (193)
  1. package/CMakeLists.txt +3 -1
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  7. package/bin/linux-cuda/x64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  9. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  10. package/bin/win32/arm64/llama-node.node +0 -0
  11. package/bin/win32/arm64/node.lib +0 -0
  12. package/bin/win32/x64/llama-node.node +0 -0
  13. package/bin/win32/x64/node.lib +0 -0
  14. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/arm64/node.lib +0 -0
  16. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  17. package/bin/win32-vulkan/x64/node.lib +0 -0
  18. package/lib/binding.ts +39 -2
  19. package/lib/index.js +132 -1
  20. package/lib/index.ts +203 -3
  21. package/package.json +2 -1
  22. package/src/EmbeddingWorker.cpp +1 -1
  23. package/src/LlamaCompletionWorker.cpp +366 -19
  24. package/src/LlamaCompletionWorker.h +30 -10
  25. package/src/LlamaContext.cpp +213 -5
  26. package/src/LlamaContext.h +12 -0
  27. package/src/common.hpp +15 -0
  28. package/src/llama.cpp/.github/workflows/build-linux-cross.yml +133 -24
  29. package/src/llama.cpp/.github/workflows/build.yml +41 -762
  30. package/src/llama.cpp/.github/workflows/docker.yml +5 -2
  31. package/src/llama.cpp/.github/workflows/release.yml +716 -0
  32. package/src/llama.cpp/.github/workflows/server.yml +12 -12
  33. package/src/llama.cpp/CMakeLists.txt +5 -17
  34. package/src/llama.cpp/cmake/build-info.cmake +8 -2
  35. package/src/llama.cpp/cmake/x64-windows-llvm.cmake +0 -6
  36. package/src/llama.cpp/common/CMakeLists.txt +31 -3
  37. package/src/llama.cpp/common/arg.cpp +48 -29
  38. package/src/llama.cpp/common/chat.cpp +128 -106
  39. package/src/llama.cpp/common/chat.h +2 -0
  40. package/src/llama.cpp/common/common.cpp +37 -1
  41. package/src/llama.cpp/common/common.h +18 -9
  42. package/src/llama.cpp/common/llguidance.cpp +1 -0
  43. package/src/llama.cpp/common/minja/chat-template.hpp +9 -5
  44. package/src/llama.cpp/common/minja/minja.hpp +69 -36
  45. package/src/llama.cpp/common/regex-partial.cpp +204 -0
  46. package/src/llama.cpp/common/regex-partial.h +56 -0
  47. package/src/llama.cpp/common/sampling.cpp +57 -50
  48. package/src/llama.cpp/examples/CMakeLists.txt +2 -23
  49. package/src/llama.cpp/examples/embedding/embedding.cpp +2 -11
  50. package/src/llama.cpp/examples/parallel/parallel.cpp +86 -14
  51. package/src/llama.cpp/examples/training/CMakeLists.txt +5 -0
  52. package/src/llama.cpp/examples/training/finetune.cpp +96 -0
  53. package/src/llama.cpp/ggml/CMakeLists.txt +27 -0
  54. package/src/llama.cpp/ggml/include/ggml-backend.h +4 -4
  55. package/src/llama.cpp/ggml/include/ggml-cpp.h +1 -1
  56. package/src/llama.cpp/ggml/include/ggml-opt.h +47 -28
  57. package/src/llama.cpp/ggml/include/ggml.h +10 -7
  58. package/src/llama.cpp/ggml/src/CMakeLists.txt +1 -1
  59. package/src/llama.cpp/ggml/src/ggml-alloc.c +4 -1
  60. package/src/llama.cpp/ggml/src/ggml-backend.cpp +9 -5
  61. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +20 -13
  62. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +0 -2
  63. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +306 -6
  64. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +4 -13
  65. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +29 -16
  66. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +88 -5
  67. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +47 -12
  68. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +264 -69
  69. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +501 -0
  70. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +0 -13
  71. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +0 -6
  72. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +23 -4
  73. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +36 -11
  74. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +0 -2
  75. package/src/llama.cpp/ggml/src/ggml-opt.cpp +368 -190
  76. package/src/llama.cpp/ggml/src/ggml-quants.c +0 -6
  77. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +41 -27
  78. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +29 -23
  79. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +9 -8
  80. package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +121 -232
  81. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +7 -15
  82. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +72 -25
  83. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +14 -7
  84. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +59 -21
  85. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +7 -1
  86. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +0 -23
  87. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +37 -8
  88. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +338 -166
  89. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +185 -89
  90. package/src/llama.cpp/ggml/src/ggml-sycl/quants.hpp +83 -0
  91. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +128 -53
  92. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +81 -70
  93. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +657 -193
  94. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +20 -0
  95. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +123 -29
  96. package/src/llama.cpp/ggml/src/ggml.c +29 -20
  97. package/src/llama.cpp/ggml/src/gguf.cpp +33 -33
  98. package/src/llama.cpp/include/llama.h +52 -11
  99. package/src/llama.cpp/requirements/requirements-all.txt +3 -3
  100. package/src/llama.cpp/scripts/xxd.cmake +1 -1
  101. package/src/llama.cpp/src/CMakeLists.txt +1 -0
  102. package/src/llama.cpp/src/llama-adapter.cpp +6 -0
  103. package/src/llama.cpp/src/llama-arch.cpp +3 -0
  104. package/src/llama.cpp/src/llama-batch.cpp +5 -1
  105. package/src/llama.cpp/src/llama-batch.h +2 -1
  106. package/src/llama.cpp/src/llama-chat.cpp +17 -7
  107. package/src/llama.cpp/src/llama-chat.h +1 -0
  108. package/src/llama.cpp/src/llama-context.cpp +389 -501
  109. package/src/llama.cpp/src/llama-context.h +44 -32
  110. package/src/llama.cpp/src/llama-cparams.h +1 -0
  111. package/src/llama.cpp/src/llama-graph.cpp +20 -38
  112. package/src/llama.cpp/src/llama-graph.h +12 -8
  113. package/src/llama.cpp/src/llama-kv-cache.cpp +1503 -389
  114. package/src/llama.cpp/src/llama-kv-cache.h +271 -85
  115. package/src/llama.cpp/src/llama-memory.h +11 -1
  116. package/src/llama.cpp/src/llama-model-loader.cpp +24 -15
  117. package/src/llama.cpp/src/llama-model-saver.cpp +281 -0
  118. package/src/llama.cpp/src/llama-model-saver.h +37 -0
  119. package/src/llama.cpp/src/llama-model.cpp +316 -69
  120. package/src/llama.cpp/src/llama-model.h +8 -1
  121. package/src/llama.cpp/src/llama-quant.cpp +15 -13
  122. package/src/llama.cpp/src/llama-sampling.cpp +18 -6
  123. package/src/llama.cpp/src/llama-vocab.cpp +42 -4
  124. package/src/llama.cpp/src/llama-vocab.h +6 -0
  125. package/src/llama.cpp/src/llama.cpp +14 -0
  126. package/src/llama.cpp/tests/CMakeLists.txt +10 -2
  127. package/src/llama.cpp/tests/test-backend-ops.cpp +107 -47
  128. package/src/llama.cpp/tests/test-chat-template.cpp +10 -11
  129. package/src/llama.cpp/tests/test-chat.cpp +3 -1
  130. package/src/llama.cpp/tests/test-mtmd-c-api.c +63 -0
  131. package/src/llama.cpp/tests/test-opt.cpp +33 -21
  132. package/src/llama.cpp/tests/test-regex-partial.cpp +288 -0
  133. package/src/llama.cpp/tests/test-sampling.cpp +1 -1
  134. package/src/llama.cpp/tools/CMakeLists.txt +39 -0
  135. package/src/llama.cpp/{examples → tools}/batched-bench/batched-bench.cpp +2 -2
  136. package/src/llama.cpp/{examples → tools}/imatrix/imatrix.cpp +11 -9
  137. package/src/llama.cpp/{examples → tools}/llama-bench/llama-bench.cpp +495 -348
  138. package/src/llama.cpp/{examples → tools}/main/main.cpp +6 -9
  139. package/src/llama.cpp/{examples/llava → tools/mtmd}/CMakeLists.txt +1 -35
  140. package/src/llama.cpp/{examples/llava → tools/mtmd}/clip-impl.h +25 -5
  141. package/src/llama.cpp/{examples/llava → tools/mtmd}/clip.cpp +1440 -1349
  142. package/src/llama.cpp/tools/mtmd/clip.h +99 -0
  143. package/src/llama.cpp/{examples/llava → tools/mtmd}/mtmd-cli.cpp +70 -44
  144. package/src/llama.cpp/tools/mtmd/mtmd-helper.cpp +310 -0
  145. package/src/llama.cpp/{examples/llava → tools/mtmd}/mtmd.cpp +251 -281
  146. package/src/llama.cpp/tools/mtmd/mtmd.h +331 -0
  147. package/src/llama.cpp/{examples → tools}/perplexity/perplexity.cpp +4 -2
  148. package/src/llama.cpp/{examples → tools}/quantize/quantize.cpp +13 -76
  149. package/src/llama.cpp/{examples → tools}/rpc/rpc-server.cpp +70 -74
  150. package/src/llama.cpp/{examples → tools}/run/run.cpp +18 -4
  151. package/src/llama.cpp/{examples → tools}/server/CMakeLists.txt +2 -1
  152. package/src/llama.cpp/{examples → tools}/server/server.cpp +291 -76
  153. package/src/llama.cpp/{examples → tools}/server/utils.hpp +377 -5
  154. package/src/llama.cpp/cmake/arm64-windows-msvc.cmake +0 -6
  155. package/src/llama.cpp/examples/infill/CMakeLists.txt +0 -5
  156. package/src/llama.cpp/examples/infill/infill.cpp +0 -590
  157. package/src/llama.cpp/examples/llava/android/build_64.sh +0 -8
  158. package/src/llama.cpp/examples/llava/clip-quantize-cli.cpp +0 -59
  159. package/src/llama.cpp/examples/llava/clip.h +0 -135
  160. package/src/llama.cpp/examples/llava/llava.cpp +0 -586
  161. package/src/llama.cpp/examples/llava/llava.h +0 -49
  162. package/src/llama.cpp/examples/llava/mtmd.h +0 -168
  163. package/src/llama.cpp/examples/llava/qwen2vl-test.cpp +0 -636
  164. /package/src/llama.cpp/{examples → tools}/batched-bench/CMakeLists.txt +0 -0
  165. /package/src/llama.cpp/{examples → tools}/cvector-generator/CMakeLists.txt +0 -0
  166. /package/src/llama.cpp/{examples → tools}/cvector-generator/completions.txt +0 -0
  167. /package/src/llama.cpp/{examples → tools}/cvector-generator/cvector-generator.cpp +0 -0
  168. /package/src/llama.cpp/{examples → tools}/cvector-generator/mean.hpp +0 -0
  169. /package/src/llama.cpp/{examples → tools}/cvector-generator/negative.txt +0 -0
  170. /package/src/llama.cpp/{examples → tools}/cvector-generator/pca.hpp +0 -0
  171. /package/src/llama.cpp/{examples → tools}/cvector-generator/positive.txt +0 -0
  172. /package/src/llama.cpp/{examples → tools}/export-lora/CMakeLists.txt +0 -0
  173. /package/src/llama.cpp/{examples → tools}/export-lora/export-lora.cpp +0 -0
  174. /package/src/llama.cpp/{examples → tools}/gguf-split/CMakeLists.txt +0 -0
  175. /package/src/llama.cpp/{examples → tools}/gguf-split/gguf-split.cpp +0 -0
  176. /package/src/llama.cpp/{examples → tools}/imatrix/CMakeLists.txt +0 -0
  177. /package/src/llama.cpp/{examples → tools}/llama-bench/CMakeLists.txt +0 -0
  178. /package/src/llama.cpp/{examples → tools}/main/CMakeLists.txt +0 -0
  179. /package/src/llama.cpp/{examples/llava → tools/mtmd}/deprecation-warning.cpp +0 -0
  180. /package/src/llama.cpp/{examples/llava → tools/mtmd}/requirements.txt +0 -0
  181. /package/src/llama.cpp/{examples → tools}/perplexity/CMakeLists.txt +0 -0
  182. /package/src/llama.cpp/{examples → tools}/quantize/CMakeLists.txt +0 -0
  183. /package/src/llama.cpp/{examples → tools}/rpc/CMakeLists.txt +0 -0
  184. /package/src/llama.cpp/{examples → tools}/run/CMakeLists.txt +0 -0
  185. /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.cpp +0 -0
  186. /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.h +0 -0
  187. /package/src/llama.cpp/{examples → tools}/server/bench/requirements.txt +0 -0
  188. /package/src/llama.cpp/{examples → tools}/server/httplib.h +0 -0
  189. /package/src/llama.cpp/{examples → tools}/server/tests/requirements.txt +0 -0
  190. /package/src/llama.cpp/{examples → tools}/tokenize/CMakeLists.txt +0 -0
  191. /package/src/llama.cpp/{examples → tools}/tokenize/tokenize.cpp +0 -0
  192. /package/src/llama.cpp/{examples → tools}/tts/CMakeLists.txt +0 -0
  193. /package/src/llama.cpp/{examples → tools}/tts/tts.cpp +0 -0
@@ -12,6 +12,30 @@
 #include <limits>
 #include <vector>
 
+// represents raw image data, layout is RGBRGBRGB...
+// length of data must be nx * ny * 3
+struct mtmd_bitmap {
+    uint32_t nx;
+    uint32_t ny;
+    std::vector<unsigned char> data;
+    std::string id; // optional user-defined id, for ex: can be set to image hash, useful for KV cache tracking
+};
+
+struct mtmd_image_tokens_deleter {
+    void operator()(mtmd_image_tokens * val); // forward declaration
+};
+using mtmd_image_tokens_ptr = std::unique_ptr<mtmd_image_tokens, mtmd_image_tokens_deleter>;
+
+struct mtmd_input_chunk {
+    mtmd_input_chunk_type type;
+    std::vector<llama_token> tokens_text;
+    mtmd_image_tokens_ptr tokens_image;
+};
+
+struct mtmd_input_chunks {
+    std::vector<mtmd_input_chunk> entries;
+};
+
 // slice template, used by some llava-uhd models to correctly place the special tokens around image embeddings
 // models not having it (llava-1.6) will process embeddings without any special tokens in-between
 enum mtmd_slice_tmpl {
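These structs back the opaque bitmap and chunk objects that the C-style accessors added later in this diff operate on. As a caller-side illustration (not part of the diff), a bitmap would be built from raw RGB data roughly as in the sketch below; mtmd_bitmap_init(), mtmd_bitmap_set_id() and mtmd_bitmap_free() are the functions introduced further down in this file, while the pixel buffer and its dimensions are placeholders.

    // sketch only: `rgb` stands in for nx*ny*3 bytes of RGBRGB... data from any image decoder
    const uint32_t nx = 640, ny = 480;
    std::vector<unsigned char> rgb(nx * ny * 3);
    mtmd_bitmap * bmp = mtmd_bitmap_init(nx, ny, rgb.data());
    mtmd_bitmap_set_id(bmp, "image_1"); // optional, e.g. an image hash for KV cache tracking
    // ... pass `bmp` to mtmd_tokenize(), then release it ...
    mtmd_bitmap_free(bmp);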
@@ -21,6 +45,16 @@ enum mtmd_slice_tmpl {
     // TODO @ngxson : add support for idefics (SmolVLM)
 };
 
+mtmd_context_params mtmd_context_params_default() {
+    mtmd_context_params params;
+    params.use_gpu = true;
+    params.print_timings = true;
+    params.n_threads = 4;
+    params.verbosity = GGML_LOG_LEVEL_INFO;
+    params.image_marker = MTMD_DEFAULT_IMAGE_MARKER;
+    return params;
+}
+
 struct mtmd_context {
     struct clip_ctx * ctx_clip;
     const struct llama_model * text_model;
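The new mtmd_context_params_default() gives callers a starting point instead of zero-initializing the struct by hand. A hedged usage sketch (not part of the diff); only the first parameter of mtmd_init_from_file() is visible in this diff, so the trailing arguments (text model, params) are an assumption about the rest of its signature.

    mtmd_context_params mparams = mtmd_context_params_default();
    mparams.n_threads = 8; // override only what you need
    mtmd_context * ctx_mtmd = mtmd_init_from_file("mmproj.gguf", text_model, mparams);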
@@ -132,6 +166,16 @@ struct mtmd_image_tokens {
     uint32_t n_tokens() const { return nx * ny; }
     clip_image_f32_batch batch_f32; // preprocessed image patches
     std::string id; // optional user-defined ID, useful for KV cache tracking
+
+    mtmd_image_tokens clone() {
+        return mtmd_image_tokens{
+            nx,
+            ny,
+            use_mrope_pos,
+            batch_f32.clone(),
+            id
+        };
+    }
 };
 
 mtmd_context * mtmd_init_from_file(const char * mmproj_fname,
@@ -172,12 +216,13 @@ static std::vector<llama_token> mtmd_tokenize_text_internal(
 }
 
 int32_t mtmd_tokenize(mtmd_context * ctx,
-                      std::vector<mtmd_input_chunk> & output,
-                      const mtmd_input_text & text,
-                      const std::vector<mtmd_bitmap> & bitmaps) {
+                      mtmd_input_chunks * output,
+                      const mtmd_input_text * text,
+                      const mtmd_bitmap ** bitmaps,
+                      size_t n_bitmaps) {
     auto vocab = llama_model_get_vocab(ctx->text_model);
 
-    std::string prompt_modified(text.text);
+    std::string prompt_modified(text->text);
     std::string marker_modified(ctx->image_marker);
     projector_type proj_type = clip_get_projector_type(ctx->ctx_clip);
 
@@ -189,11 +234,6 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
         marker_modified = "<start_of_image>" + ctx->image_marker + "<end_of_image>";
         string_replace_all(prompt_modified, ctx->image_marker, marker_modified);
 
-    } else if (proj_type == PROJECTOR_TYPE_GLM_EDGE) {
-        // <|begin_of_image|> ... (image embeddings) ... <|end_of_image|>
-        marker_modified = "<|begin_of_image|>" + ctx->image_marker + "<|end_of_image|>";
-        string_replace_all(prompt_modified, ctx->image_marker, marker_modified);
-
     } else if (proj_type == PROJECTOR_TYPE_IDEFICS3) {
         // https://github.com/huggingface/transformers/blob/a42ba80fa520c784c8f11a973ca9034e5f859b79/src/transformers/models/idefics3/processing_idefics3.py#L192-L215
         marker_modified = "<fake_token_around_image><global-img>" + ctx->image_marker + "<fake_token_around_image>";
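The signature change above turns mtmd_tokenize() into a C-style call: an output chunk list, a pointer to the input text, and an array of bitmap pointers with an explicit count. A rough caller-side sketch (not part of the diff); ctx_mtmd and bmp are assumed to come from mtmd_init_from_file() and the bitmap helpers, prompt is a placeholder string, and the exact field layout of mtmd_input_text is not shown in this hunk.

    mtmd_input_text text;
    text.text          = prompt.c_str(); // prompt must contain the configured image marker
    text.add_special   = true;
    text.parse_special = true;

    mtmd_input_chunks * chunks = mtmd_input_chunks_init();
    const mtmd_bitmap * bitmaps[] = { bmp };
    int32_t res = mtmd_tokenize(ctx_mtmd, chunks, &text, bitmaps, /*n_bitmaps =*/ 1);
    if (res != 0) {
        // tokenization failed (e.g. fewer bitmaps than image markers)
    }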
@@ -212,11 +252,19 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
 
     }
 
+    else if (proj_type == PROJECTOR_TYPE_INTERNVL) {
+        // <img> ... (image embeddings) ... </img>
+        marker_modified = "<img>" + ctx->image_marker + "</img>";
+        string_replace_all(prompt_modified, ctx->image_marker, marker_modified);
+
+    }
+
     // llava-1.5, llava-1.6, Yi-VL, Yi-34B, granite: don't need to add prefix and suffix
+    // for glm-edge, BOI and EOI token's embeddings are not present in the text model
 
     std::vector<std::string> parts = string_split_str(prompt_modified, ctx->image_marker);
-    output.clear();
-    output.reserve(parts.size());
+    output->entries.clear();
+    output->entries.reserve(parts.size());
 
     size_t i_img = 0;
 
@@ -227,7 +275,7 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
             std::move(tokens),
             {},
         };
-        output.emplace_back(std::move(chunk));
+        output->entries.emplace_back(std::move(chunk));
     };
 
     // utility for splitting batch of multiple images into chunks of batch having single images
@@ -255,7 +303,7 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
     for (const auto & part : parts) {
         // printf("tokenizing part: %s\n", part.c_str());
         bool add_bos = &parts.front() == &part;
-        auto tokens = mtmd_tokenize_text_internal(vocab, part, text.add_special && add_bos, text.parse_special);
+        auto tokens = mtmd_tokenize_text_internal(vocab, part, text->add_special && add_bos, text->parse_special);
         if (tokens.empty()) {
             continue;
         }
@@ -264,22 +312,22 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
             std::move(tokens),
             {},
         };
-        output.emplace_back(std::move(chunk));
+        output->entries.emplace_back(std::move(chunk));
 
         if (&parts.back() != &part) {
             // add image token to middle of 2 parts
 
-            if (i_img >= bitmaps.size()) {
+            if (i_img >= n_bitmaps) {
                 LOG_ERR("%s: error: not enough images for %d parts\n", __func__, (int)parts.size());
                 return 1;
             }
 
             // convert mtmd_bitmap to clip_image_u8
             clip_image_u8_ptr img_u8(clip_image_u8_init());
-            img_u8->nx = bitmaps[i_img].nx;
-            img_u8->ny = bitmaps[i_img].ny;
-            img_u8->buf.resize(bitmaps[i_img].data.size());
-            std::memcpy(img_u8->buf.data(), bitmaps[i_img].data.data(), img_u8->nx * img_u8->ny * 3);
+            img_u8->nx = bitmaps[i_img]->nx;
+            img_u8->ny = bitmaps[i_img]->ny;
+            img_u8->buf.resize(bitmaps[i_img]->data.size());
+            std::memcpy(img_u8->buf.data(), bitmaps[i_img]->data.data(), img_u8->nx * img_u8->ny * 3);
             clip_image_size img_u8_size{img_u8->nx, img_u8->ny};
 
             // preprocess image
@@ -292,12 +340,12 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
 
             if (ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_5 || ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_6) {
                 // split batch into chunks of single images
-                auto chunks = split_batch_to_chunk(std::move(batch_f32), bitmaps[i_img].id);
+                auto chunks = split_batch_to_chunk(std::move(batch_f32), bitmaps[i_img]->id);
                 GGML_ASSERT(chunks.size() > 0);
 
                 // add overview image
                 add_text_chunk({ctx->tok_ov_img_start});
-                output.emplace_back(std::move(chunks.front()));
+                output->entries.emplace_back(std::move(chunks.front()));
                 chunks.erase(chunks.begin());
                 add_text_chunk({ctx->tok_ov_img_end});
 
@@ -315,7 +363,7 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
                     if (ctx->tok_sli_img_start != LLAMA_TOKEN_NULL) {
                         add_text_chunk({ctx->tok_sli_img_start});
                     }
-                    output.emplace_back(std::move(chunks[y * n_col + x]));
+                    output->entries.emplace_back(std::move(chunks[y * n_col + x]));
                     if (ctx->tok_sli_img_end != LLAMA_TOKEN_NULL) {
                         add_text_chunk({ctx->tok_sli_img_end});
                     }
@@ -347,7 +395,7 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
                 image_tokens->ny = 1;
             }
             image_tokens->batch_f32 = std::move(batch_f32);
-            image_tokens->id = bitmaps[i_img].id; // optional
+            image_tokens->id = bitmaps[i_img]->id; // optional
 
             LOG_DBG("image_tokens->nx = %d\n", image_tokens->nx);
             LOG_DBG("image_tokens->ny = %d\n", image_tokens->ny);
@@ -358,7 +406,7 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
                 {},
                 std::move(image_tokens),
             };
-            output.emplace_back(std::move(chunk));
+            output->entries.emplace_back(std::move(chunk));
         }
 
         i_img++; // move to next image
@@ -368,35 +416,12 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
     return 0;
 }
 
-void mtmd_image_tokens_free(mtmd_image_tokens * image_tokens) {
+static void mtmd_image_tokens_free(mtmd_image_tokens * image_tokens) {
     if (image_tokens) {
         delete image_tokens;
     }
 }
 
-size_t mtmd_image_tokens_get_n_tokens(const mtmd_image_tokens * image_tokens) {
-    return image_tokens->n_tokens();
-}
-
-size_t mtmd_image_tokens_get_nx(const mtmd_image_tokens * image_tokens) {
-    return image_tokens->nx;
-}
-
-size_t mtmd_image_tokens_get_ny(const mtmd_image_tokens * image_tokens) {
-    return image_tokens->ny;
-}
-
-std::string mtmd_image_tokens_get_id(const mtmd_image_tokens * image_tokens) {
-    return image_tokens->id;
-}
-
-llama_pos mtmd_image_tokens_get_n_pos(const mtmd_image_tokens * image_tokens) {
-    if (image_tokens->use_mrope_pos) {
-        return 1; // for M-RoPE, the whole image is 1 in temporal dimension
-    }
-    return image_tokens->n_tokens();
-}
-
 int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens) {
     int n_mmproj_embd = clip_n_mmproj_embd(ctx->ctx_clip);
     ctx->image_embd_v.resize(image_tokens->n_tokens() * n_mmproj_embd);
@@ -436,273 +461,218 @@ float * mtmd_get_output_embd(mtmd_context * ctx) {
     return ctx->image_embd_v.data();
 }
 
-size_t mtmd_helper_get_n_tokens(mtmd_input_chunks & chunks) {
-    size_t n_tokens = 0;
-    for (auto & chunk : chunks) {
-        if (chunk.type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
-            n_tokens += chunk.tokens_text.size();
-        } else if (chunk.type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
-            n_tokens += mtmd_image_tokens_get_n_tokens(chunk.tokens_image.get());
-        } else {
-            GGML_ASSERT(false && "chunk type not supported");
-        }
+bool mtmd_decode_use_non_causal(mtmd_context * ctx) {
+    projector_type proj_type = clip_get_projector_type(ctx->ctx_clip);
+    if (proj_type == PROJECTOR_TYPE_GEMMA3) {
+        return true;
     }
-    return n_tokens;
+    return false;
 }
 
-llama_pos mtmd_helper_get_n_pos(mtmd_input_chunks & chunks) {
-    llama_pos n_pos = 0;
-    for (auto & chunk : chunks) {
-        if (chunk.type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
-            n_pos += chunk.tokens_text.size();
-        } else if (chunk.type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
-            n_pos += mtmd_image_tokens_get_n_pos(chunk.tokens_image.get());
-        } else {
-            GGML_ASSERT(false && "chunk type not supported");
-        }
-    }
-    return n_pos;
-}
-
-// helper struct to make working with embd batch easier
-// note: this will be removed after llama_batch_ext refactoring
-struct decode_embd_batch {
-    int n_pos_per_embd;
-    int n_mmproj_embd;
-    std::vector<llama_pos>      pos;
-    std::vector<llama_pos>      pos_view; // used by mrope
-    std::vector<int32_t>        n_seq_id;
-    std::vector<llama_seq_id>   seq_id_0;
-    std::vector<llama_seq_id *> seq_ids;
-    std::vector<int8_t>         logits;
-    llama_batch batch;
-    decode_embd_batch(float * embd, int32_t n_tokens, int n_pos_per_embd, int n_mmproj_embd) : n_pos_per_embd(n_pos_per_embd), n_mmproj_embd(n_mmproj_embd) {
-        pos     .resize(n_tokens * n_pos_per_embd);
-        n_seq_id.resize(n_tokens);
-        seq_ids .resize(n_tokens + 1);
-        logits  .resize(n_tokens);
-        seq_id_0.resize(1);
-        seq_ids [n_tokens] = nullptr;
-        batch = {
-            /*n_tokens =*/ n_tokens,
-            /*tokens   =*/ nullptr,
-            /*embd     =*/ embd,
-            /*pos      =*/ pos.data(),
-            /*n_seq_id =*/ n_seq_id.data(),
-            /*seq_id   =*/ seq_ids.data(),
-            /*logits   =*/ logits.data(),
-        };
-    }
+bool mtmd_decode_use_mrope(mtmd_context * ctx) {
+    return ctx->use_mrope;
+}
 
-    void set_position_normal(llama_pos pos_0, llama_seq_id seq_id) {
-        seq_id_0[0] = seq_id;
-        for (int i = 0; i < batch.n_tokens; i++) {
-            batch.pos     [i] = pos_0 + i;
-            batch.n_seq_id[i] = 1;
-            batch.seq_id  [i] = seq_id_0.data();
-            batch.logits  [i] = false;
-        }
-    }
+void mtmd_image_tokens_deleter::operator()(mtmd_image_tokens * val) {
+    mtmd_image_tokens_free(val);
+}
 
-    void set_position_mrope(llama_pos pos_0, int nx, int ny, llama_seq_id seq_id) {
-        GGML_ASSERT(n_pos_per_embd == 4);
-        seq_id_0[0] = seq_id;
-        for (int y = 0; y < ny; y++) {
-            for (int x = 0; x < nx; x++) {
-                int i = y * nx + x;
-                pos[i                     ] = pos_0;
-                pos[i + batch.n_tokens    ] = pos_0 + y;
-                pos[i + batch.n_tokens * 2] = pos_0 + x;
-                pos[i + batch.n_tokens * 3] = 0; // last pos dim is unused
-            }
-        }
-        for (int i = 0; i < batch.n_tokens; i++) {
-            batch.n_seq_id[i] = 1;
-            batch.seq_id  [i] = seq_id_0.data();
-            batch.logits  [i] = false;
-        }
+// these 2 helpers below use internal clip_image_u8_ptr,
+// so unfortunately they cannot moved to mtmd-helper.h
+// however, in theory, user can decode image file to bitmap using
+// whichever library they want, and then use mtmd_bitmap_init() to create bitmap
+
+mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(const unsigned char * buf, size_t len) {
+    clip_image_u8_ptr img_u8(clip_image_u8_init());
+    bool ok = clip_image_load_from_bytes(buf, len, img_u8.get());
+    if (!ok) {
+        LOG_ERR("Unable to load image from buffer\n");
+        return nullptr;
     }
+    uint32_t nx, ny;
+    unsigned char * data = clip_image_u8_get_data(img_u8.get(), &nx, &ny);
+    return mtmd_bitmap_init(nx, ny, data);
+}
 
-    llama_batch get_view(int offset, int n_tokens) {
-        llama_pos * pos_ptr;
-        pos_view.clear();
-        pos_view.resize(n_tokens * n_pos_per_embd);
-        if (n_pos_per_embd > 1) {
-            // mrope
-            // for example, with layout of src: 1234...1234...1234...1234...
-            // offset 2 will give us dst: 34...34...34...34...
-            for (int i = 0; i < n_pos_per_embd; i++) {
-                auto src = pos.begin() + i * batch.n_tokens + offset;
-                pos_view.insert(pos_view.end(), src, src + n_tokens);
-            }
-            pos_ptr = pos_view.data();
-        } else {
-            // normal
-            pos_ptr = pos.data() + offset;
-        }
-        return {
-            /*n_tokens =*/ n_tokens,
-            /*tokens   =*/ nullptr,
-            /*embd     =*/ batch.embd + offset * n_mmproj_embd,
-            /*pos      =*/ pos_ptr,
-            /*n_seq_id =*/ batch.n_seq_id + offset,
-            /*seq_id   =*/ batch.seq_id + offset,
-            /*logits   =*/ batch.logits + offset,
-        };
+mtmd_bitmap * mtmd_helper_bitmap_init_from_file(const char * fname) {
+    clip_image_u8_ptr img_u8(clip_image_u8_init());
+    bool ok = clip_image_load_from_file(fname, img_u8.get());
+    if (!ok) {
+        LOG_ERR("Unable to load image %s\n", fname);
+        return nullptr;
     }
-};
+    uint32_t nx, ny;
+    unsigned char * data = clip_image_u8_get_data(img_u8.get(), &nx, &ny);
+    return mtmd_bitmap_init(nx, ny, data);
+}
 
-int32_t mtmd_helper_eval(mtmd_context * ctx,
-                         llama_context * lctx,
-                         mtmd_input_chunks & chunks,
-                         llama_pos pos0,
-                         llama_seq_id seq_id,
-                         int32_t n_batch) {
-    int32_t ret;
-    llama_pos n_past = pos0;
-    llama_batch text_batch = llama_batch_init(n_batch, 0, 1);
-    int n_mmproj_embd = clip_n_mmproj_embd(ctx->ctx_clip);
-    int n_pos_per_embd = mtmd_decode_use_mrope(ctx) ? 4 : 1;
-
-    for (auto & chunk : chunks) {
-        bool is_last = &chunk == &chunks.back();
-        if (chunk.type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
-            text_batch.n_tokens = chunk.tokens_text.size();
-            size_t i = 0;
-            while (i < chunk.tokens_text.size()) { // split into batches
-                for (; i < chunk.tokens_text.size() && text_batch.n_tokens < n_batch; i++) {
-                    text_batch.token   [i]    = chunk.tokens_text[i];
-                    text_batch.pos     [i]    = n_past++;
-                    text_batch.n_seq_id[i]    = 1;
-                    text_batch.seq_id  [i][0] = seq_id;
-                    text_batch.logits  [i]    = false;
-                }
-                if (is_last) {
-                    // always get logits for last input chunk
-                    text_batch.logits[text_batch.n_tokens - 1] = true;
-                }
-                ret = llama_decode(lctx, text_batch);
-                if (ret != 0) {
-                    LOG_ERR("failed to decode text\n");
-                    llama_batch_free(text_batch);
-                    return ret;
-                }
-            }
+//
+// public API functions
+//
+
+// mtmd_bitmap
+
+mtmd_bitmap * mtmd_bitmap_init(uint32_t nx,
+                               uint32_t ny,
+                               const unsigned char * data) {
+    mtmd_bitmap * bitmap = new mtmd_bitmap;
+    bitmap->nx = nx;
+    bitmap->ny = ny;
+    size_t data_size = (size_t)nx * ny * 3;
+    bitmap->data.resize(data_size);
+    std::memcpy(bitmap->data.data(), data, data_size);
+    return bitmap;
+}
 
-        } else if (chunk.type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
-            GGML_ASSERT(!is_last && "logits for last image chunk is not yet support");
-            GGML_ASSERT(chunk.tokens_image != nullptr);
-            int64_t t0 = ggml_time_ms();
-            if (ctx->print_timings) {
-                LOG_INF("encoding image or slice...\n");
-            }
-            ret = mtmd_encode(ctx, chunk.tokens_image.get());
-            if (ret != 0) {
-                LOG_ERR("failed to encode image\n");
-                llama_batch_free(text_batch);
-                return ret;
-            }
-            if (ctx->print_timings) {
-                LOG_INF("image/slice encoded in %" PRId64 " ms\n", ggml_time_ms() - t0);
-            }
+uint32_t mtmd_bitmap_get_nx(const mtmd_bitmap * bitmap) {
+    return bitmap->nx;
+}
 
-            int32_t n_tokens = mtmd_image_tokens_get_n_tokens(chunk.tokens_image.get());
-            int32_t i_batch = 0;
-            int32_t n_img_batches = GGML_PAD(n_tokens, n_batch) / n_batch;
-            float * embd = mtmd_get_output_embd(ctx);
-            decode_embd_batch batch_embd(embd, n_tokens, n_pos_per_embd, n_mmproj_embd);
+uint32_t mtmd_bitmap_get_ny(const mtmd_bitmap * bitmap) {
+    return bitmap->ny;
+}
 
-            const int nx = mtmd_image_tokens_get_nx(chunk.tokens_image.get());
-            const int ny = mtmd_image_tokens_get_ny(chunk.tokens_image.get());
+const unsigned char * mtmd_bitmap_get_data(const mtmd_bitmap * bitmap) {
+    return bitmap->data.data();
+}
 
-            if (mtmd_decode_use_mrope(ctx)) {
-                batch_embd.set_position_mrope(n_past, nx, ny, seq_id);
-            } else {
-                batch_embd.set_position_normal(n_past, seq_id);
-            }
+const char * mtmd_bitmap_get_id(const mtmd_bitmap * bitmap) {
+    return bitmap->id.c_str();
+}
 
-            if (mtmd_decode_use_non_causal(ctx)) {
-                llama_set_causal_attn(lctx, false);
-                // TODO @ngxson : need to make sure only one image is processed at a time, and n_ubatch must be enough to hold the image
-            }
+void mtmd_bitmap_set_id(mtmd_bitmap * bitmap, const char * id) {
+    if (id) {
+        bitmap->id = std::string(id);
+    } else {
+        bitmap->id.clear();
+    }
+}
 
-            while (i_batch < n_img_batches) { // split into batches
-                int pos_offset = i_batch*n_batch;
-                int n_tokens_batch = std::min(n_batch, n_tokens - pos_offset);
-                llama_batch batch_embd_view = batch_embd.get_view(pos_offset, n_tokens_batch);
+void mtmd_bitmap_free(mtmd_bitmap * bitmap) {
+    if (bitmap) {
+        delete bitmap;
+    }
+}
 
-                LOG_INF("decoding image batch %d/%d, n_tokens_batch = %d\n", i_batch+1, n_img_batches, n_tokens_batch);
+// mtmd_input_chunks
 
-                int64_t t1 = ggml_time_ms();
-                ret = llama_decode(lctx, batch_embd_view);
-                if (ret != 0) {
-                    LOG_ERR("failed to decode image\n");
-                    llama_set_causal_attn(lctx, true); // restore causal attn
-                    llama_batch_free(text_batch);
-                    return ret;
-                }
+mtmd_input_chunks * mtmd_input_chunks_init() {
+    return new mtmd_input_chunks;
+}
 
-                if (ctx->print_timings) {
-                    LOG_INF("image decoded (batch %d/%d) in %" PRId64 " ms\n", i_batch+1, n_img_batches, ggml_time_ms() - t1);
-                }
+size_t mtmd_input_chunks_size(const mtmd_input_chunks * chunks) {
+    return chunks->entries.size();
+}
 
-                i_batch++;
-            }
+const mtmd_input_chunk * mtmd_input_chunks_get(const mtmd_input_chunks * chunks, size_t idx) {
+    if (idx >= chunks->entries.size()) {
+        return nullptr;
+    }
+    return &chunks->entries[idx];
+}
 
-            // for mrope, one image is one single **temporal** position
-            n_past += mtmd_decode_use_mrope(ctx) ? 1 : n_tokens;
+void mtmd_input_chunks_free(mtmd_input_chunks * chunks) {
+    if (chunks) {
+        delete chunks;
+    }
+}
 
-            if (mtmd_decode_use_non_causal(ctx)) {
-                llama_set_causal_attn(lctx, true);
-            }
+// mtmd_input_chunk
 
-        } else {
-            GGML_ASSERT(false && "chunk type not supported");
-        }
-    }
+enum mtmd_input_chunk_type mtmd_input_chunk_get_type(const mtmd_input_chunk * chunk) {
+    return chunk->type;
+}
 
-    llama_batch_free(text_batch);
-    return 0;
+const llama_token * mtmd_input_chunk_get_tokens_text(const mtmd_input_chunk * chunk, size_t * n_tokens_output) {
+    if (chunk->type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
+        *n_tokens_output = chunk->tokens_text.size();
+        return chunk->tokens_text.data();
+    }
+    *n_tokens_output = 0;
+    return nullptr;
 }
 
-int32_t mtmd_helper_bitmap_init_from_buf(const unsigned char * buf, size_t len, mtmd_bitmap & output) {
-    clip_image_u8_ptr img_u8(clip_image_u8_init());
-    bool ok = clip_image_load_from_bytes(buf, len, img_u8.get());
-    if (!ok) {
-        LOG_ERR("Unable to load image from buffer\n");
-        return 1;
+const mtmd_image_tokens * mtmd_input_chunk_get_tokens_image(const mtmd_input_chunk * chunk) {
+    if (chunk->type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
+        return chunk->tokens_image.get();
     }
-    unsigned char * data = clip_image_u8_get_data(img_u8.get(), &output.nx, &output.ny);
-    output.data.resize(output.nx * output.ny * 3);
-    std::memcpy(output.data.data(), data, output.nx * output.ny * 3);
-    return 0;
+    return nullptr;
 }
 
-int32_t mtmd_helper_bitmap_init_from_file(const char * fname, mtmd_bitmap & output) {
-    clip_image_u8_ptr img_u8(clip_image_u8_init());
-    bool ok = clip_image_load_from_file(fname, img_u8.get());
-    if (!ok) {
-        LOG_ERR("Unable to load image %s\n", fname);
-        return 1;
+mtmd_input_chunk * mtmd_input_chunk_copy(const mtmd_input_chunk * chunk) {
+    mtmd_input_chunk * copy = new mtmd_input_chunk{
+        chunk->type,
+        chunk->tokens_text,
+        mtmd_image_tokens_ptr(),
+    };
+    if (chunk->tokens_image) {
+        // copy the image tokens
+        copy->tokens_image = mtmd_image_tokens_ptr(new mtmd_image_tokens());
+        *copy->tokens_image = chunk->tokens_image->clone();
     }
-    unsigned char * data = clip_image_u8_get_data(img_u8.get(), &output.nx, &output.ny);
-    output.data.resize(output.nx * output.ny * 3);
-    std::memcpy(output.data.data(), data, output.nx * output.ny * 3);
-    return 0;
+    return copy;
 }
 
-bool mtmd_decode_use_non_causal(mtmd_context * ctx) {
-    projector_type proj_type = clip_get_projector_type(ctx->ctx_clip);
-    if (proj_type == PROJECTOR_TYPE_GEMMA3) {
-        return true;
+void mtmd_input_chunk_free(mtmd_input_chunk * chunk) {
+    if (chunk) {
+        delete chunk;
     }
-    return false;
 }
 
-bool mtmd_decode_use_mrope(mtmd_context * ctx) {
-    return ctx->use_mrope;
+// mtmd_image_tokens
+
+size_t mtmd_image_tokens_get_n_tokens(const mtmd_image_tokens * image_tokens) {
+    return image_tokens->n_tokens();
 }
 
-void mtmd_image_tokens_deleter::operator()(mtmd_image_tokens * val) {
-    mtmd_image_tokens_free(val);
+size_t mtmd_image_tokens_get_nx(const mtmd_image_tokens * image_tokens) {
+    return image_tokens->nx;
+}
+
+size_t mtmd_image_tokens_get_ny(const mtmd_image_tokens * image_tokens) {
+    return image_tokens->ny;
+}
+
+const char * mtmd_image_tokens_get_id(const mtmd_image_tokens * image_tokens) {
+    return image_tokens->id.c_str();
+}
+
+llama_pos mtmd_image_tokens_get_n_pos(const mtmd_image_tokens * image_tokens) {
+    if (image_tokens->use_mrope_pos) {
+        return 1; // for M-RoPE, the whole image is 1 in temporal dimension
+    }
+    return image_tokens->n_tokens();
+}
+
+// test function
+
+mtmd_input_chunks * mtmd_test_create_input_chunks() {
+    mtmd_input_chunks * chunks = mtmd_input_chunks_init();
+    if (!chunks) {
+        return nullptr;
+    }
+
+    // create a text chunk
+    std::vector<llama_token> tokens_text = { 1, 2, 3, 4, 5 };
+    mtmd_input_chunk chunk_text{
+        MTMD_INPUT_CHUNK_TYPE_TEXT,
+        std::move(tokens_text),
+        {},
+    };
+    chunks->entries.emplace_back(std::move(chunk_text));
+
+    // create an image chunk
+    mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens);
+    image_tokens->nx = 4;
+    image_tokens->ny = 4;
+    image_tokens->batch_f32.entries.resize(16);
+    image_tokens->id = "image_1";
+    mtmd_input_chunk chunk_image{
+        MTMD_INPUT_CHUNK_TYPE_IMAGE,
+        {},
+        std::move(image_tokens),
+    };
+    chunks->entries.emplace_back(std::move(chunk_image));
+
+    return chunks;
 }
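With the container-based helpers (mtmd_helper_get_n_tokens, mtmd_helper_eval) gone from this file (the files-changed list above shows a new tools/mtmd/mtmd-helper.cpp), callers walk the chunk list through the accessor functions added here. A sketch (not part of the diff) of recomputing the total token count over the chunks returned by mtmd_tokenize(), using only functions visible in this hunk:

    size_t n_tokens = 0;
    for (size_t i = 0; i < mtmd_input_chunks_size(chunks); i++) {
        const mtmd_input_chunk * chunk = mtmd_input_chunks_get(chunks, i);
        if (mtmd_input_chunk_get_type(chunk) == MTMD_INPUT_CHUNK_TYPE_TEXT) {
            size_t n_text = 0;
            mtmd_input_chunk_get_tokens_text(chunk, &n_text);
            n_tokens += n_text;
        } else {
            const mtmd_image_tokens * img = mtmd_input_chunk_get_tokens_image(chunk);
            n_tokens += mtmd_image_tokens_get_n_tokens(img);
        }
    }
    mtmd_input_chunks_free(chunks); // caller owns the chunk list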