@fugood/llama.node 0.4.7 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (98)
  1. package/CMakeLists.txt +4 -0
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  7. package/bin/linux-cuda/x64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  9. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  10. package/lib/binding.ts +66 -6
  11. package/lib/index.js +59 -17
  12. package/lib/index.ts +74 -23
  13. package/package.json +1 -1
  14. package/src/DecodeAudioTokenWorker.cpp +40 -0
  15. package/src/DecodeAudioTokenWorker.h +22 -0
  16. package/src/EmbeddingWorker.cpp +7 -5
  17. package/src/LlamaCompletionWorker.cpp +68 -54
  18. package/src/LlamaCompletionWorker.h +7 -8
  19. package/src/LlamaContext.cpp +551 -235
  20. package/src/LlamaContext.h +26 -4
  21. package/src/LoadSessionWorker.cpp +4 -2
  22. package/src/SaveSessionWorker.cpp +10 -6
  23. package/src/TokenizeWorker.cpp +23 -14
  24. package/src/TokenizeWorker.h +2 -2
  25. package/src/addons.cc +8 -11
  26. package/src/common.hpp +129 -126
  27. package/src/llama.cpp/.github/workflows/build.yml +2 -2
  28. package/src/llama.cpp/.github/workflows/release.yml +152 -129
  29. package/src/llama.cpp/.github/workflows/winget.yml +42 -0
  30. package/src/llama.cpp/common/arg.cpp +14 -13
  31. package/src/llama.cpp/common/common.cpp +4 -75
  32. package/src/llama.cpp/common/common.h +7 -12
  33. package/src/llama.cpp/examples/lookahead/lookahead.cpp +0 -13
  34. package/src/llama.cpp/examples/lookup/lookup.cpp +0 -11
  35. package/src/llama.cpp/examples/parallel/parallel.cpp +0 -9
  36. package/src/llama.cpp/examples/retrieval/retrieval.cpp +6 -6
  37. package/src/llama.cpp/examples/simple/simple.cpp +1 -1
  38. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +2 -2
  39. package/src/llama.cpp/examples/sycl/run-llama2.sh +4 -4
  40. package/src/llama.cpp/examples/sycl/run-llama3.sh +28 -0
  41. package/src/llama.cpp/examples/sycl/win-run-llama2.bat +1 -1
  42. package/src/llama.cpp/examples/sycl/win-run-llama3.bat +9 -0
  43. package/src/llama.cpp/ggml/include/ggml-opt.h +2 -0
  44. package/src/llama.cpp/ggml/include/ggml.h +11 -0
  45. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +274 -0
  46. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +27 -0
  47. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +18 -2
  48. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1 -0
  49. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +107 -0
  50. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +16 -0
  51. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +8 -2
  52. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +315 -155
  53. package/src/llama.cpp/ggml/src/ggml-opt.cpp +5 -0
  54. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +43 -12
  55. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +171 -112
  56. package/src/llama.cpp/ggml/src/ggml.c +64 -18
  57. package/src/llama.cpp/include/llama.h +24 -124
  58. package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +5 -1
  59. package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf_update.txt +5 -1
  60. package/src/llama.cpp/requirements/requirements-convert_lora_to_gguf.txt +2 -0
  61. package/src/llama.cpp/src/llama-batch.cpp +3 -1
  62. package/src/llama.cpp/src/llama-context.cpp +60 -110
  63. package/src/llama.cpp/src/llama-graph.cpp +137 -233
  64. package/src/llama.cpp/src/llama-graph.h +49 -7
  65. package/src/llama.cpp/src/llama-hparams.cpp +17 -1
  66. package/src/llama.cpp/src/llama-hparams.h +34 -5
  67. package/src/llama.cpp/src/llama-kv-cache.cpp +654 -321
  68. package/src/llama.cpp/src/llama-kv-cache.h +201 -85
  69. package/src/llama.cpp/src/llama-memory.h +3 -2
  70. package/src/llama.cpp/src/llama-model.cpp +273 -94
  71. package/src/llama.cpp/src/llama-model.h +4 -1
  72. package/src/llama.cpp/tests/test-arg-parser.cpp +1 -1
  73. package/src/llama.cpp/tools/llama-bench/llama-bench.cpp +1 -0
  74. package/src/llama.cpp/tools/mtmd/CMakeLists.txt +13 -2
  75. package/src/llama.cpp/tools/mtmd/clip-impl.h +108 -11
  76. package/src/llama.cpp/tools/mtmd/clip.cpp +466 -88
  77. package/src/llama.cpp/tools/mtmd/clip.h +6 -4
  78. package/src/llama.cpp/tools/mtmd/miniaudio.h +93468 -0
  79. package/src/llama.cpp/tools/mtmd/mtmd-audio.cpp +855 -0
  80. package/src/llama.cpp/tools/mtmd/mtmd-audio.h +62 -0
  81. package/src/llama.cpp/tools/mtmd/mtmd-cli.cpp +21 -14
  82. package/src/llama.cpp/tools/mtmd/mtmd-helper.cpp +36 -49
  83. package/src/llama.cpp/tools/mtmd/mtmd.cpp +362 -98
  84. package/src/llama.cpp/tools/mtmd/mtmd.h +52 -21
  85. package/src/llama.cpp/tools/run/run.cpp +2 -2
  86. package/src/llama.cpp/tools/server/server.cpp +158 -47
  87. package/src/llama.cpp/tools/server/utils.hpp +71 -43
  88. package/src/llama.cpp/tools/tts/tts.cpp +4 -2
  89. package/src/tts_utils.cpp +342 -0
  90. package/src/tts_utils.h +62 -0
  91. package/bin/win32/arm64/llama-node.node +0 -0
  92. package/bin/win32/arm64/node.lib +0 -0
  93. package/bin/win32/x64/llama-node.node +0 -0
  94. package/bin/win32/x64/node.lib +0 -0
  95. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  96. package/bin/win32-vulkan/arm64/node.lib +0 -0
  97. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  98. package/bin/win32-vulkan/x64/node.lib +0 -0
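
The diff below corresponds to package/src/llama.cpp/tools/mtmd/mtmd.cpp (entry 83 above, +362 -98), the multimodal (mtmd) support code that this release extends from image-only input to image and audio input: the image marker is generalized to a media marker ("<__media__>"), mtmd_bitmap can now carry raw PCM float samples, and new entry points such as mtmd_bitmap_init_from_audio, mtmd_encode_chunk, mtmd_support_vision and mtmd_support_audio are exported. As a rough, hypothetical caller-side sketch (not code from this package), the new surface could be exercised as follows; only functions visible in the hunks below are called directly, and mtmd_bitmap_free is an assumption about the surrounding mtmd.h API.

// Hypothetical sketch of the audio-aware helpers added in this diff (not part of the package).
#include <cstdio>
#include <string>
#include "mtmd.h"

static bool describe_media(const unsigned char * file_buf, size_t file_len) {
    // the helper now sniffs the buffer: WAV audio is decoded into an "audio bitmap"
    // of PCM float samples, anything else is treated as an image
    mtmd_bitmap * bmp = mtmd_helper_bitmap_init_from_buf(file_buf, file_len);
    if (bmp == nullptr) {
        return false;
    }
    if (mtmd_bitmap_is_audio(bmp)) {
        // for audio bitmaps, nx holds the sample count and ny is 1
        std::printf("audio: %u samples, %zu bytes\n", mtmd_bitmap_get_nx(bmp), mtmd_bitmap_get_n_bytes(bmp));
    } else {
        std::printf("image: %u px wide, %zu bytes\n", mtmd_bitmap_get_nx(bmp), mtmd_bitmap_get_n_bytes(bmp));
    }
    // the prompt marks where the media goes with the new unified marker;
    // mtmd_tokenize() then emits text/image/audio chunks and mtmd_encode_chunk()
    // encodes the non-text ones (see the hunks below)
    std::string prompt = std::string("Describe this: ") + mtmd_default_marker();
    (void)prompt;
    // release with mtmd_bitmap_free() (declared in mtmd.h, not shown in this diff)
    return true;
}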
@@ -1,6 +1,7 @@
  #include "clip.h"
  #include "clip-impl.h"
  #include "mtmd.h"
+ #include "mtmd-audio.h"

  #include "llama.h"

@@ -19,17 +20,49 @@ struct mtmd_bitmap {
  uint32_t ny;
  std::vector<unsigned char> data;
  std::string id; // optional user-defined id, for ex: can be set to image hash, useful for KV cache tracking
+ bool is_audio = false; // true if the bitmap is audio
  };

- struct mtmd_image_tokens_deleter {
- void operator()(mtmd_image_tokens * val); // forward declaration
+ struct mtmd_image_tokens {
+ uint32_t nx; // number of tokens in x direction
+ uint32_t ny; // number of tokens in y direction
+ bool use_mrope_pos = false; // use M-RoPE position counting (the whole image is 1 temporal position)
+ uint32_t n_tokens() const { return nx * ny; }
+ clip_image_f32_batch batch_f32; // preprocessed image patches
+ std::string id; // optional user-defined ID, useful for KV cache tracking
+
+ mtmd_image_tokens clone() {
+ return mtmd_image_tokens{
+ nx,
+ ny,
+ use_mrope_pos,
+ batch_f32.clone(),
+ id
+ };
+ }
  };
- using mtmd_image_tokens_ptr = std::unique_ptr<mtmd_image_tokens, mtmd_image_tokens_deleter>;
+ using mtmd_image_tokens_ptr = std::unique_ptr<mtmd_image_tokens>;
+
+ struct mtmd_audio_tokens {
+ uint32_t n_tokens; // number of tokens
+ clip_image_f32_batch batch_f32; // preprocessed image patches
+ std::string id; // optional user-defined ID, useful for KV cache tracking
+
+ mtmd_audio_tokens clone() {
+ return mtmd_audio_tokens{
+ n_tokens,
+ batch_f32.clone(),
+ id
+ };
+ }
+ };
+ using mtmd_audio_tokens_ptr = std::unique_ptr<mtmd_audio_tokens>;

  struct mtmd_input_chunk {
  mtmd_input_chunk_type type;
  std::vector<llama_token> tokens_text;
  mtmd_image_tokens_ptr tokens_image;
+ mtmd_audio_tokens_ptr tokens_audio;
  };

  struct mtmd_input_chunks {
@@ -42,9 +75,14 @@ enum mtmd_slice_tmpl {
  MTMD_SLICE_TMPL_NONE,
  MTMD_SLICE_TMPL_MINICPMV_2_5,
  MTMD_SLICE_TMPL_MINICPMV_2_6,
+ MTMD_SLICE_TMPL_LLAMA4,
  // TODO @ngxson : add support for idefics (SmolVLM)
  };

+ const char * mtmd_default_marker() {
+ return "<__media__>";
+ }
+
  mtmd_context_params mtmd_context_params_default() {
  mtmd_context_params params;
  params.use_gpu = true;
@@ -52,6 +90,7 @@ mtmd_context_params mtmd_context_params_default() {
  params.n_threads = 4;
  params.verbosity = GGML_LOG_LEVEL_INFO;
  params.image_marker = MTMD_DEFAULT_IMAGE_MARKER;
+ params.media_marker = mtmd_default_marker();
  return params;
  }

@@ -62,20 +101,29 @@ struct mtmd_context {

  bool print_timings;
  int n_threads;
- std::string image_marker;
+ std::string media_marker;
+ bool has_vision;
+ bool has_audio;

- // for minicpmv, we need special tokens in-between slices
+ // for llava-uhd style models, we need special tokens in-between slices
+ // minicpmv calls them "slices", llama 4 calls them "tiles"
  mtmd_slice_tmpl slice_tmpl = MTMD_SLICE_TMPL_NONE;
  llama_token tok_ov_img_start = LLAMA_TOKEN_NULL; // overview image
  llama_token tok_ov_img_end = LLAMA_TOKEN_NULL; // overview image
  llama_token tok_slices_start = LLAMA_TOKEN_NULL; // start of all slices
  llama_token tok_slices_end = LLAMA_TOKEN_NULL; // end of all slices
- llama_token tok_sli_img_start = LLAMA_TOKEN_NULL; // single slice
- llama_token tok_sli_img_end = LLAMA_TOKEN_NULL; // single slice
+ llama_token tok_sli_img_start = LLAMA_TOKEN_NULL; // single slice start
+ llama_token tok_sli_img_end = LLAMA_TOKEN_NULL; // single slice end
+ llama_token tok_sli_img_mid = LLAMA_TOKEN_NULL; // between 2 slices
  llama_token tok_row_end = LLAMA_TOKEN_NULL; // end of row
+ bool tok_row_end_trail = false;
+ bool ov_img_first = false;

  bool use_mrope = false; // for Qwen2VL, we need to use M-RoPE

+ // for whisper, we pre-calculate the mel filter bank
+ whisper_preprocessor::whisper_filters w_filters;
+
  // TODO @ngxson : add timings

  mtmd_context(const char * mmproj_fname,
@@ -84,8 +132,12 @@
  text_model (text_model),
  print_timings(ctx_params.print_timings),
  n_threads (ctx_params.n_threads),
- image_marker (ctx_params.image_marker)
+ media_marker (ctx_params.media_marker)
  {
+ if (std::string(ctx_params.image_marker) != MTMD_DEFAULT_IMAGE_MARKER) {
+ throw std::runtime_error("custom image_marker is not supported anymore, use media_marker instead");
+ }
+
  clip_context_params ctx_clip_params;
  ctx_clip_params.use_gpu = ctx_params.use_gpu;
  ctx_clip_params.verbosity = ctx_params.verbosity;
@@ -94,8 +146,11 @@
  throw std::runtime_error(string_format("Failed to load CLIP model from %s\n", mmproj_fname));
  }

- use_mrope = clip_is_qwen2vl(ctx_clip);
+ has_vision = clip_has_vision_encoder(ctx_clip);
+ has_audio = clip_has_audio_encoder(ctx_clip);
+ use_mrope = clip_is_qwen2vl(ctx_clip);

+ projector_type proj = clip_get_projector_type(ctx_clip);
  int minicpmv_version = clip_is_minicpmv(ctx_clip);
  if (minicpmv_version == 2) {
  // minicpmv 2.5 format:
@@ -108,6 +163,8 @@
  tok_sli_img_start = tok_ov_img_start;
  tok_sli_img_end = tok_ov_img_end;
  tok_row_end = lookup_token("\n");
+ tok_row_end_trail = false; // no trailing end-of-row token
+ ov_img_first = true;

  } else if (minicpmv_version == 3 || minicpmv_version == 4) {
  // minicpmv 2.6 format:
@@ -118,9 +175,40 @@
  tok_sli_img_start = lookup_token("<slice>");
  tok_sli_img_end = lookup_token("</slice>");
  tok_row_end = lookup_token("\n");
+ tok_row_end_trail = false; // no trailing end-of-row token
+ ov_img_first = true;

  } else if (minicpmv_version != 0) {
  GGML_ASSERT(false && "unsupported minicpmv version");
+ } else if (proj == PROJECTOR_TYPE_LLAMA4) {
+ // llama 4 format:
+ // <|image_start|>
+ // (slice) <|tile_x_separator|> (slice) <|tile_x_separator|> ... <|tile_y_separator|>
+ // (slice) <|tile_x_separator|> (slice) <|tile_x_separator|> ... <|tile_y_separator|>
+ // ... <|tile_y_separator|> <-- trailing end-of-row token
+ // <|image|> (overview) <-- overview image is last
+ // <|image_end|>
+ slice_tmpl = MTMD_SLICE_TMPL_LLAMA4;
+ tok_ov_img_start = lookup_token("<|image|>");
+ tok_sli_img_mid = lookup_token("<|tile_x_separator|>");
+ tok_row_end = lookup_token("<|tile_y_separator|>");
+ tok_row_end_trail = true; // add trailing end-of-row token
+ ov_img_first = false; // overview image is last
+ }
+
+ if (proj == PROJECTOR_TYPE_ULTRAVOX) {
+ // TODO @ngxson : check if model n_mel is 128 or 80
+ w_filters = whisper_precalc_filters::get_128_bins();
+ }
+
+ // warning messages
+ if (proj == PROJECTOR_TYPE_LLAMA4) {
+ LOG_WRN("%s: llama 4 vision is known to have degraded quality:\n"
+ " https://github.com/ggml-org/llama.cpp/pull/13282\n", __func__);
+ }
+ if (has_audio) {
+ LOG_WRN("%s: audio input is in experimental stage and may have reduced quality:\n"
+ " https://github.com/ggml-org/llama.cpp/pull/13623\n", __func__);
  }
  }

@@ -155,29 +243,6 @@ private:
  }
  };

- struct mtmd_image_tokens_data {
- clip_image_f32_batch batch_f32; // preprocessed image patches
- };
-
- struct mtmd_image_tokens {
- uint32_t nx; // number of tokens in x direction
- uint32_t ny; // number of tokens in y direction
- bool use_mrope_pos = false; // use M-RoPE position counting (the whole image is 1 temporal position)
- uint32_t n_tokens() const { return nx * ny; }
- clip_image_f32_batch batch_f32; // preprocessed image patches
- std::string id; // optional user-defined ID, useful for KV cache tracking
-
- mtmd_image_tokens clone() {
- return mtmd_image_tokens{
- nx,
- ny,
- use_mrope_pos,
- batch_f32.clone(),
- id
- };
- }
- };
-
  mtmd_context * mtmd_init_from_file(const char * mmproj_fname,
  const struct llama_model * text_model,
  const struct mtmd_context_params ctx_params) {
@@ -223,57 +288,63 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
  auto vocab = llama_model_get_vocab(ctx->text_model);

  std::string prompt_modified(text->text);
- std::string marker_modified(ctx->image_marker);
+ std::string marker_modified(ctx->media_marker);
  projector_type proj_type = clip_get_projector_type(ctx->ctx_clip);

+ // for compatibility, we convert image marker to media marker
+ string_replace_all(prompt_modified, MTMD_DEFAULT_IMAGE_MARKER, ctx->media_marker);
+
  // a bit hacky here, but works for now
  // for some models, we need to add prefix and suffix to the image embeddings
  if (clip_is_gemma3(ctx->ctx_clip)) {
  // gemma 3
  // <start_of_image> ... (image embeddings) ... <end_of_image>
- marker_modified = "<start_of_image>" + ctx->image_marker + "<end_of_image>";
- string_replace_all(prompt_modified, ctx->image_marker, marker_modified);
+ marker_modified = "<start_of_image>" + ctx->media_marker + "<end_of_image>";
+ string_replace_all(prompt_modified, ctx->media_marker, marker_modified);

  } else if (proj_type == PROJECTOR_TYPE_IDEFICS3) {
  // https://github.com/huggingface/transformers/blob/a42ba80fa520c784c8f11a973ca9034e5f859b79/src/transformers/models/idefics3/processing_idefics3.py#L192-L215
- marker_modified = "<fake_token_around_image><global-img>" + ctx->image_marker + "<fake_token_around_image>";
- string_replace_all(prompt_modified, ctx->image_marker, marker_modified);
+ marker_modified = "<fake_token_around_image><global-img>" + ctx->media_marker + "<fake_token_around_image>";
+ string_replace_all(prompt_modified, ctx->media_marker, marker_modified);

  } else if (proj_type == PROJECTOR_TYPE_PIXTRAL) {
  // https://github.com/huggingface/transformers/blob/1cd110c6cb6a6237614130c470e9a902dbc1a4bd/docs/source/en/model_doc/pixtral.md
- marker_modified = ctx->image_marker + "[IMG_END]";
- string_replace_all(prompt_modified, ctx->image_marker, marker_modified);
- }
+ marker_modified = ctx->media_marker + "[IMG_END]";
+ string_replace_all(prompt_modified, ctx->media_marker, marker_modified);

- else if (proj_type == PROJECTOR_TYPE_QWEN2VL || proj_type == PROJECTOR_TYPE_QWEN25VL) {
+ } else if (proj_type == PROJECTOR_TYPE_QWEN2VL || proj_type == PROJECTOR_TYPE_QWEN25VL) {
  // <|vision_start|> ... (image embeddings) ... <|vision_end|>
- marker_modified = "<|vision_start|>" + ctx->image_marker + "<|vision_end|>";
- string_replace_all(prompt_modified, ctx->image_marker, marker_modified);
+ marker_modified = "<|vision_start|>" + ctx->media_marker + "<|vision_end|>";
+ string_replace_all(prompt_modified, ctx->media_marker, marker_modified);

- }
+ } else if (proj_type == PROJECTOR_TYPE_LLAMA4) {
+ // (more details in mtmd_context constructor)
+ marker_modified = "<|image_start|>" + ctx->media_marker + "<|image_end|>";
+ string_replace_all(prompt_modified, ctx->media_marker, marker_modified);

- else if (proj_type == PROJECTOR_TYPE_INTERNVL) {
+ } else if (proj_type == PROJECTOR_TYPE_INTERNVL) {
  // <img> ... (image embeddings) ... </img>
- marker_modified = "<img>" + ctx->image_marker + "</img>";
- string_replace_all(prompt_modified, ctx->image_marker, marker_modified);
+ marker_modified = "<img>" + ctx->media_marker + "</img>";
+ string_replace_all(prompt_modified, ctx->media_marker, marker_modified);

  }

  // llava-1.5, llava-1.6, Yi-VL, Yi-34B, granite: don't need to add prefix and suffix
  // for glm-edge, BOI and EOI token's embeddings are not present in the text model

- std::vector<std::string> parts = string_split_str(prompt_modified, ctx->image_marker);
+ std::vector<std::string> parts = string_split_str(prompt_modified, ctx->media_marker);
  output->entries.clear();
  output->entries.reserve(parts.size());

- size_t i_img = 0;
+ size_t i_bm = 0;

  // utility for adding raw tokens
  auto add_text_chunk = [&output](std::vector<llama_token> && tokens) {
  mtmd_input_chunk chunk{
  MTMD_INPUT_CHUNK_TYPE_TEXT,
  std::move(tokens),
- {},
+ nullptr, // image tokens
+ nullptr, // audio tokens
  };
  output->entries.emplace_back(std::move(chunk));
  };
@@ -291,8 +362,9 @@ int32_t mtmd_tokenize(mtmd_context * ctx,

  mtmd_input_chunk chunk{
  MTMD_INPUT_CHUNK_TYPE_IMAGE,
- {},
+ {}, // text tokens
  std::move(image_tokens),
+ nullptr, // audio tokens
  };
  chunks.emplace_back(std::move(chunk));
  }
@@ -310,25 +382,36 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
  mtmd_input_chunk chunk{
  MTMD_INPUT_CHUNK_TYPE_TEXT,
  std::move(tokens),
- {},
+ nullptr, // image tokens
+ nullptr, // audio tokens
  };
  output->entries.emplace_back(std::move(chunk));

- if (&parts.back() != &part) {
- // add image token to middle of 2 parts
+ // only add image/audio tokens to middle of 2 parts
+ // therefore, we skip handling image/audio if this is the last part
+ if (&parts.back() == &part) {
+ continue;
+ }
+
+ if (!bitmaps[i_bm]->is_audio) {
+ // handle image

- if (i_img >= n_bitmaps) {
+ if (i_bm >= n_bitmaps) {
  LOG_ERR("%s: error: not enough images for %d parts\n", __func__, (int)parts.size());
  return 1;
  }

+ if (!ctx->has_vision) {
+ LOG_ERR("%s: error: model does not support vision input\n", __func__);
+ return 2;
+ }
+
  // convert mtmd_bitmap to clip_image_u8
  clip_image_u8_ptr img_u8(clip_image_u8_init());
- img_u8->nx = bitmaps[i_img]->nx;
- img_u8->ny = bitmaps[i_img]->ny;
- img_u8->buf.resize(bitmaps[i_img]->data.size());
- std::memcpy(img_u8->buf.data(), bitmaps[i_img]->data.data(), img_u8->nx * img_u8->ny * 3);
- clip_image_size img_u8_size{img_u8->nx, img_u8->ny};
+ img_u8->nx = bitmaps[i_bm]->nx;
+ img_u8->ny = bitmaps[i_bm]->ny;
+ img_u8->buf.resize(bitmaps[i_bm]->data.size());
+ std::memcpy(img_u8->buf.data(), bitmaps[i_bm]->data.data(), img_u8->nx * img_u8->ny * 3);

  // preprocess image
  clip_image_f32_batch batch_f32;
@@ -338,28 +421,40 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
  return 2;
  }

- if (ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_5 || ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_6) {
+ // handle llava-uhd style preprocessing
+ if (
+ ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_5
+ || ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_6
+ || ctx->slice_tmpl == MTMD_SLICE_TMPL_LLAMA4
+ ) {
  // split batch into chunks of single images
- auto chunks = split_batch_to_chunk(std::move(batch_f32), bitmaps[i_img]->id);
+ auto chunks = split_batch_to_chunk(std::move(batch_f32), bitmaps[i_bm]->id);
  GGML_ASSERT(chunks.size() > 0);

- // add overview image
- add_text_chunk({ctx->tok_ov_img_start});
- output->entries.emplace_back(std::move(chunks.front()));
+ auto ov_chunk = std::move(chunks.front());
  chunks.erase(chunks.begin());
- add_text_chunk({ctx->tok_ov_img_end});

- // add slices
+ // add overview image (first)
+ if (ctx->ov_img_first) {
+ if (ctx->tok_ov_img_start != LLAMA_TOKEN_NULL) {
+ add_text_chunk({ctx->tok_ov_img_start});
+ }
+ output->entries.emplace_back(std::move(ov_chunk));
+ if (ctx->tok_ov_img_end != LLAMA_TOKEN_NULL) {
+ add_text_chunk({ctx->tok_ov_img_end});
+ }
+ }
+
+ // add slices (or tiles)
  if (!chunks.empty()) {
- clip_add_load_image_size(ctx->ctx_clip, &img_u8_size);
- int n_col = clip_uhd_num_image_embeds_col(ctx->ctx_clip);
- int n_row = (int)chunks.size() / n_col;
- GGML_ASSERT(n_row * n_col == (int)chunks.size());
+ const int n_col = batch_f32.grid_x;
+ const int n_row = batch_f32.grid_y;
  if (ctx->tok_slices_start != LLAMA_TOKEN_NULL) {
  add_text_chunk({ctx->tok_slices_start});
  }
  for (int y = 0; y < n_row; y++) {
  for (int x = 0; x < n_col; x++) {
+ const bool is_last_in_row = (x == n_col - 1);
  if (ctx->tok_sli_img_start != LLAMA_TOKEN_NULL) {
  add_text_chunk({ctx->tok_sli_img_start});
  }
@@ -367,8 +462,11 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
  if (ctx->tok_sli_img_end != LLAMA_TOKEN_NULL) {
  add_text_chunk({ctx->tok_sli_img_end});
  }
+ if (!is_last_in_row && ctx->tok_sli_img_mid != LLAMA_TOKEN_NULL) {
+ add_text_chunk({ctx->tok_sli_img_mid});
+ }
  }
- if (ctx->tok_row_end != LLAMA_TOKEN_NULL && y != n_row - 1) {
+ if ((y != n_row - 1 || ctx->tok_row_end_trail) && ctx->tok_row_end != LLAMA_TOKEN_NULL) {
  add_text_chunk({ctx->tok_row_end});
  }
  }
@@ -377,6 +475,17 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
  }
  }

+ // add overview image (last)
+ if (!ctx->ov_img_first) {
+ if (ctx->tok_ov_img_start != LLAMA_TOKEN_NULL) {
+ add_text_chunk({ctx->tok_ov_img_start});
+ }
+ output->entries.emplace_back(std::move(ov_chunk));
+ if (ctx->tok_ov_img_end != LLAMA_TOKEN_NULL) {
+ add_text_chunk({ctx->tok_ov_img_end});
+ }
+ }
+
  } else {
  size_t n_tokens = 0;
  for (const auto & entry : batch_f32.entries) {
@@ -395,7 +504,7 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
  image_tokens->ny = 1;
  }
  image_tokens->batch_f32 = std::move(batch_f32);
- image_tokens->id = bitmaps[i_img]->id; // optional
+ image_tokens->id = bitmaps[i_bm]->id; // optional

  LOG_DBG("image_tokens->nx = %d\n", image_tokens->nx);
  LOG_DBG("image_tokens->ny = %d\n", image_tokens->ny);
@@ -403,23 +512,101 @@ int32_t mtmd_tokenize(mtmd_context * ctx,

  mtmd_input_chunk chunk{
  MTMD_INPUT_CHUNK_TYPE_IMAGE,
- {},
+ {}, // text tokens
  std::move(image_tokens),
+ nullptr, // audio tokens
  };
  output->entries.emplace_back(std::move(chunk));
  }

- i_img++; // move to next image
+ i_bm++; // move to next image
+ continue;
+
+ } else {
+ // handle audio
+
+ if (i_bm >= n_bitmaps) {
+ LOG_ERR("%s: error: not enough images for %d parts\n", __func__, (int)parts.size());
+ return 1;
+ }
+
+ if (!ctx->has_audio) {
+ LOG_ERR("%s: error: model does not support audio input\n", __func__);
+ return 2;
+ }
+
+ if (bitmaps[i_bm]->data.size() == 0) {
+ LOG_ERR("%s: error: empty audio data\n", __func__);
+ return 2;
+ }
+
+ // preprocess audio
+ GGML_ASSERT(ctx->w_filters.n_mel); // make sure we have filter preloaded
+ std::vector<whisper_preprocessor::whisper_mel> mel_spec_chunks;
+ const float * samples = (const float *)bitmaps[i_bm]->data.data();
+ size_t n_samples = bitmaps[i_bm]->data.size() / sizeof(float);
+ bool ok = whisper_preprocessor::preprocess_audio(samples, n_samples, ctx->w_filters, mel_spec_chunks);
+ if (!ok) {
+ LOG_ERR("Unable to preprocess audio\n");
+ return 2;
+ }
+
+ // consider each mel_spec as a separate audio chunk
+ // TODO: maybe support batching, but this may come with memory cost
+ for (auto & mel_spec : mel_spec_chunks) {
+ clip_image_f32_ptr mel_f32(clip_image_f32_init());
+ mel_f32->nx = mel_spec.n_len;
+ mel_f32->ny = mel_spec.n_mel;
+ mel_f32->buf = std::move(mel_spec.data);
+ size_t n_tokens = clip_n_output_tokens(ctx->ctx_clip, mel_f32.get());
+
+ clip_image_f32_batch batch_f32;
+ batch_f32.is_audio = true;
+ batch_f32.entries.push_back(std::move(mel_f32));
+
+ mtmd_audio_tokens_ptr audio_tokens(new mtmd_audio_tokens);
+ audio_tokens->n_tokens = n_tokens;
+ audio_tokens->batch_f32 = std::move(batch_f32);
+ audio_tokens->id = bitmaps[i_bm]->id; // optional
+
+ LOG_DBG("audio_tokens->n_tokens = %d\n", audio_tokens->n_tokens);
+
+ mtmd_input_chunk chunk{
+ MTMD_INPUT_CHUNK_TYPE_AUDIO,
+ {}, // text tokens
+ nullptr, // image tokens
+ std::move(audio_tokens),
+ };
+ output->entries.emplace_back(std::move(chunk));
+ }
+
+ i_bm++;
+ continue;
  }
  }

  return 0;
  }

- static void mtmd_image_tokens_free(mtmd_image_tokens * image_tokens) {
- if (image_tokens) {
- delete image_tokens;
+ int32_t mtmd_encode_chunk(mtmd_context * ctx, const mtmd_input_chunk * chunk) {
+ if (chunk->type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
+ LOG_WRN("mtmd_encode_chunk has no effect for text chunks\n");
+ return 0;
+ } else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
+ return mtmd_encode(ctx, chunk->tokens_image.get());
+ } else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
+ int n_mmproj_embd = clip_n_mmproj_embd(ctx->ctx_clip);
+ ctx->image_embd_v.resize(chunk->tokens_audio->n_tokens * n_mmproj_embd);
+ bool ok = clip_image_batch_encode(
+ ctx->ctx_clip,
+ ctx->n_threads,
+ &chunk->tokens_audio->batch_f32,
+ ctx->image_embd_v.data());
+ return ok ? 0 : 1;
  }
+
+ LOG_ERR("mtmd_encode_chunk: unknown chunk type %d\n", (int)chunk->type);
+ return 1;
  }

  int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens) {
@@ -427,14 +614,6 @@ int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens)
  ctx->image_embd_v.resize(image_tokens->n_tokens() * n_mmproj_embd);
  bool ok = false;

- // only effective for minicpmv and qwen2vl, other models will ignore load_image_size
- {
- clip_image_size slice_size{
- image_tokens->batch_f32.entries[0]->nx,
- image_tokens->batch_f32.entries[0]->ny};
- clip_add_load_image_size(ctx->ctx_clip, &slice_size);
- }
-
  if (clip_is_llava(ctx->ctx_clip) || clip_is_minicpmv(ctx->ctx_clip) || clip_is_glm(ctx->ctx_clip)) {
  // TODO @ngxson : llava does not support batched encoding ; this should be fixed inside clip_image_batch_encode()
  const auto & entries = image_tokens->batch_f32.entries;
@@ -473,8 +652,12 @@ bool mtmd_decode_use_mrope(mtmd_context * ctx) {
  return ctx->use_mrope;
  }

- void mtmd_image_tokens_deleter::operator()(mtmd_image_tokens * val) {
- mtmd_image_tokens_free(val);
+ bool mtmd_support_vision(mtmd_context * ctx) {
+ return ctx->has_vision;
+ }
+
+ bool mtmd_support_audio(mtmd_context * ctx) {
+ return ctx->has_audio;
  }

  // these 2 helpers below use internal clip_image_u8_ptr,
@@ -483,6 +666,15 @@ void mtmd_image_tokens_deleter::operator()(mtmd_image_tokens * val) {
  // whichever library they want, and then use mtmd_bitmap_init() to create bitmap

  mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(const unsigned char * buf, size_t len) {
+ if (audio_helpers::is_audio_file((const char *)buf, len)) {
+ std::vector<float> pcmf32;
+ if (!audio_helpers::decode_audio_from_buf(buf, len, COMMON_SAMPLE_RATE, pcmf32)) {
+ LOG_ERR("Unable to read WAV audio file from buffer\n");
+ return nullptr;
+ }
+ return mtmd_bitmap_init_from_audio(pcmf32.size(), pcmf32.data());
+ }
+
  clip_image_u8_ptr img_u8(clip_image_u8_init());
  bool ok = clip_image_load_from_bytes(buf, len, img_u8.get());
  if (!ok) {
@@ -495,15 +687,26 @@ mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(const unsigned char * buf, size_t
  }

  mtmd_bitmap * mtmd_helper_bitmap_init_from_file(const char * fname) {
- clip_image_u8_ptr img_u8(clip_image_u8_init());
- bool ok = clip_image_load_from_file(fname, img_u8.get());
- if (!ok) {
- LOG_ERR("Unable to load image %s\n", fname);
+ std::vector<unsigned char> buf;
+ FILE * f = fopen(fname, "rb");
+ if (!f) {
+ LOG_ERR("Unable to open file %s: %s\n", fname, strerror(errno));
  return nullptr;
  }
- uint32_t nx, ny;
- unsigned char * data = clip_image_u8_get_data(img_u8.get(), &nx, &ny);
- return mtmd_bitmap_init(nx, ny, data);
+
+ fseek(f, 0, SEEK_END);
+ long file_size = ftell(f);
+ fseek(f, 0, SEEK_SET);
+ buf.resize(file_size);
+
+ size_t n_read = fread(buf.data(), 1, file_size, f);
+ fclose(f);
+ if (n_read != (size_t)file_size) {
+ LOG_ERR("Failed to read entire file %s", fname);
+ return nullptr;
+ }
+
+ return mtmd_helper_bitmap_init_from_buf(buf.data(), buf.size());
  }

  //
@@ -524,6 +727,18 @@ mtmd_bitmap * mtmd_bitmap_init(uint32_t nx,
  return bitmap;
  }

+ mtmd_bitmap * mtmd_bitmap_init_from_audio(size_t n_samples,
+ const float * data) {
+ mtmd_bitmap * bitmap = new mtmd_bitmap;
+ bitmap->nx = n_samples;
+ bitmap->ny = 1;
+ bitmap->is_audio = true;
+ size_t data_size = n_samples * sizeof(float);
+ bitmap->data.resize(data_size);
+ std::memcpy(bitmap->data.data(), data, data_size);
+ return bitmap;
+ }
+
  uint32_t mtmd_bitmap_get_nx(const mtmd_bitmap * bitmap) {
  return bitmap->nx;
  }
@@ -536,6 +751,14 @@ const unsigned char * mtmd_bitmap_get_data(const mtmd_bitmap * bitmap) {
  return bitmap->data.data();
  }

+ size_t mtmd_bitmap_get_n_bytes(const mtmd_bitmap * bitmap) {
+ return bitmap->data.size();
+ }
+
+ bool mtmd_bitmap_is_audio(const mtmd_bitmap * bitmap) {
+ return bitmap->is_audio;
+ }
+
  const char * mtmd_bitmap_get_id(const mtmd_bitmap * bitmap) {
  return bitmap->id.c_str();
  }
@@ -599,17 +822,56 @@ const mtmd_image_tokens * mtmd_input_chunk_get_tokens_image(const mtmd_input_chu
  return nullptr;
  }

+ size_t mtmd_input_chunk_get_n_tokens(const mtmd_input_chunk * chunk) {
+ if (chunk->type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
+ return chunk->tokens_text.size();
+ } else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
+ return mtmd_image_tokens_get_n_tokens(chunk->tokens_image.get());
+ } else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
+ return chunk->tokens_audio->n_tokens;
+ } else {
+ GGML_ABORT("invalid chunk type");
+ }
+ }
+
+ llama_pos mtmd_input_chunk_get_n_pos(const mtmd_input_chunk * chunk) {
+ if (chunk->type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
+ return chunk->tokens_text.size();
+ } else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
+ return mtmd_image_tokens_get_n_pos(chunk->tokens_image.get());
+ } else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
+ return chunk->tokens_audio->n_tokens;
+ } else {
+ GGML_ABORT("invalid chunk type");
+ }
+ }
+
+ const char * mtmd_input_chunk_get_id(const mtmd_input_chunk * chunk) {
+ if (chunk->type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
+ return chunk->tokens_image->id.c_str();
+ } else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
+ return chunk->tokens_audio->id.c_str();
+ }
+ return nullptr;
+ }
+
  mtmd_input_chunk * mtmd_input_chunk_copy(const mtmd_input_chunk * chunk) {
  mtmd_input_chunk * copy = new mtmd_input_chunk{
  chunk->type,
  chunk->tokens_text,
- mtmd_image_tokens_ptr(),
+ nullptr,
+ nullptr,
  };
  if (chunk->tokens_image) {
  // copy the image tokens
  copy->tokens_image = mtmd_image_tokens_ptr(new mtmd_image_tokens());
  *copy->tokens_image = chunk->tokens_image->clone();
  }
+ if (chunk->tokens_audio) {
+ // copy the audio tokens
+ copy->tokens_audio = mtmd_audio_tokens_ptr(new mtmd_audio_tokens());
+ *copy->tokens_audio = chunk->tokens_audio->clone();
+ }
  return copy;
  }

@@ -657,7 +919,8 @@ mtmd_input_chunks * mtmd_test_create_input_chunks() {
  mtmd_input_chunk chunk_text{
  MTMD_INPUT_CHUNK_TYPE_TEXT,
  std::move(tokens_text),
- {},
+ nullptr, // image tokens
+ nullptr, // audio tokens
  };
  chunks->entries.emplace_back(std::move(chunk_text));

@@ -669,8 +932,9 @@ mtmd_input_chunks * mtmd_test_create_input_chunks() {
  image_tokens->id = "image_1";
  mtmd_input_chunk chunk_image{
  MTMD_INPUT_CHUNK_TYPE_IMAGE,
- {},
+ {}, // text tokens
  std::move(image_tokens),
+ nullptr, // audio tokens
  };
  chunks->entries.emplace_back(std::move(chunk_image));