@fugood/llama.node 0.4.7 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-cuda/arm64/llama-node.node +0 -0
- package/bin/linux-cuda/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +20 -6
- package/lib/index.js +41 -17
- package/lib/index.ts +50 -23
- package/package.json +1 -1
- package/src/LlamaCompletionWorker.cpp +9 -9
- package/src/LlamaCompletionWorker.h +2 -2
- package/src/LlamaContext.cpp +37 -18
- package/src/LlamaContext.h +1 -0
- package/src/TokenizeWorker.cpp +16 -12
- package/src/TokenizeWorker.h +2 -2
- package/src/common.hpp +54 -50
- package/src/llama.cpp/.github/workflows/build.yml +2 -2
- package/src/llama.cpp/.github/workflows/release.yml +152 -129
- package/src/llama.cpp/.github/workflows/winget.yml +42 -0
- package/src/llama.cpp/common/arg.cpp +14 -13
- package/src/llama.cpp/common/common.cpp +4 -75
- package/src/llama.cpp/common/common.h +7 -12
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +0 -13
- package/src/llama.cpp/examples/lookup/lookup.cpp +0 -11
- package/src/llama.cpp/examples/parallel/parallel.cpp +0 -9
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +6 -6
- package/src/llama.cpp/examples/simple/simple.cpp +1 -1
- package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +2 -2
- package/src/llama.cpp/examples/sycl/run-llama2.sh +4 -4
- package/src/llama.cpp/examples/sycl/run-llama3.sh +28 -0
- package/src/llama.cpp/examples/sycl/win-run-llama2.bat +1 -1
- package/src/llama.cpp/examples/sycl/win-run-llama3.bat +9 -0
- package/src/llama.cpp/ggml/include/ggml-opt.h +2 -0
- package/src/llama.cpp/ggml/include/ggml.h +11 -0
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +274 -0
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +27 -0
- package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +18 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +107 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +16 -0
- package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +8 -2
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +315 -155
- package/src/llama.cpp/ggml/src/ggml-opt.cpp +5 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +43 -12
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +171 -112
- package/src/llama.cpp/ggml/src/ggml.c +64 -18
- package/src/llama.cpp/include/llama.h +24 -124
- package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +5 -1
- package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf_update.txt +5 -1
- package/src/llama.cpp/requirements/requirements-convert_lora_to_gguf.txt +2 -0
- package/src/llama.cpp/src/llama-batch.cpp +3 -1
- package/src/llama.cpp/src/llama-context.cpp +60 -110
- package/src/llama.cpp/src/llama-graph.cpp +137 -233
- package/src/llama.cpp/src/llama-graph.h +49 -7
- package/src/llama.cpp/src/llama-hparams.cpp +17 -1
- package/src/llama.cpp/src/llama-hparams.h +34 -5
- package/src/llama.cpp/src/llama-kv-cache.cpp +654 -321
- package/src/llama.cpp/src/llama-kv-cache.h +201 -85
- package/src/llama.cpp/src/llama-memory.h +3 -2
- package/src/llama.cpp/src/llama-model.cpp +273 -94
- package/src/llama.cpp/src/llama-model.h +4 -1
- package/src/llama.cpp/tests/test-arg-parser.cpp +1 -1
- package/src/llama.cpp/tools/llama-bench/llama-bench.cpp +1 -0
- package/src/llama.cpp/tools/mtmd/CMakeLists.txt +13 -2
- package/src/llama.cpp/tools/mtmd/clip-impl.h +108 -11
- package/src/llama.cpp/tools/mtmd/clip.cpp +466 -88
- package/src/llama.cpp/tools/mtmd/clip.h +6 -4
- package/src/llama.cpp/tools/mtmd/miniaudio.h +93468 -0
- package/src/llama.cpp/tools/mtmd/mtmd-audio.cpp +855 -0
- package/src/llama.cpp/tools/mtmd/mtmd-audio.h +62 -0
- package/src/llama.cpp/tools/mtmd/mtmd-cli.cpp +21 -14
- package/src/llama.cpp/tools/mtmd/mtmd-helper.cpp +36 -49
- package/src/llama.cpp/tools/mtmd/mtmd.cpp +362 -98
- package/src/llama.cpp/tools/mtmd/mtmd.h +52 -21
- package/src/llama.cpp/tools/run/run.cpp +2 -2
- package/src/llama.cpp/tools/server/server.cpp +158 -47
- package/src/llama.cpp/tools/server/utils.hpp +71 -43
- package/src/llama.cpp/tools/tts/tts.cpp +4 -2
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
#include "clip.h"
|
|
2
2
|
#include "clip-impl.h"
|
|
3
3
|
#include "mtmd.h"
|
|
4
|
+
#include "mtmd-audio.h"
|
|
4
5
|
|
|
5
6
|
#include "llama.h"
|
|
6
7
|
|
|
@@ -19,17 +20,49 @@ struct mtmd_bitmap {
|
|
|
19
20
|
uint32_t ny;
|
|
20
21
|
std::vector<unsigned char> data;
|
|
21
22
|
std::string id; // optional user-defined id, for ex: can be set to image hash, useful for KV cache tracking
|
|
23
|
+
bool is_audio = false; // true if the bitmap is audio
|
|
22
24
|
};
|
|
23
25
|
|
|
24
|
-
struct
|
|
25
|
-
|
|
26
|
+
struct mtmd_image_tokens {
|
|
27
|
+
uint32_t nx; // number of tokens in x direction
|
|
28
|
+
uint32_t ny; // number of tokens in y direction
|
|
29
|
+
bool use_mrope_pos = false; // use M-RoPE position counting (the whole image is 1 temporal position)
|
|
30
|
+
uint32_t n_tokens() const { return nx * ny; }
|
|
31
|
+
clip_image_f32_batch batch_f32; // preprocessed image patches
|
|
32
|
+
std::string id; // optional user-defined ID, useful for KV cache tracking
|
|
33
|
+
|
|
34
|
+
mtmd_image_tokens clone() {
|
|
35
|
+
return mtmd_image_tokens{
|
|
36
|
+
nx,
|
|
37
|
+
ny,
|
|
38
|
+
use_mrope_pos,
|
|
39
|
+
batch_f32.clone(),
|
|
40
|
+
id
|
|
41
|
+
};
|
|
42
|
+
}
|
|
26
43
|
};
|
|
27
|
-
using mtmd_image_tokens_ptr = std::unique_ptr<mtmd_image_tokens
|
|
44
|
+
using mtmd_image_tokens_ptr = std::unique_ptr<mtmd_image_tokens>;
|
|
45
|
+
|
|
46
|
+
struct mtmd_audio_tokens {
|
|
47
|
+
uint32_t n_tokens; // number of tokens
|
|
48
|
+
clip_image_f32_batch batch_f32; // preprocessed image patches
|
|
49
|
+
std::string id; // optional user-defined ID, useful for KV cache tracking
|
|
50
|
+
|
|
51
|
+
mtmd_audio_tokens clone() {
|
|
52
|
+
return mtmd_audio_tokens{
|
|
53
|
+
n_tokens,
|
|
54
|
+
batch_f32.clone(),
|
|
55
|
+
id
|
|
56
|
+
};
|
|
57
|
+
}
|
|
58
|
+
};
|
|
59
|
+
using mtmd_audio_tokens_ptr = std::unique_ptr<mtmd_audio_tokens>;
|
|
28
60
|
|
|
29
61
|
struct mtmd_input_chunk {
|
|
30
62
|
mtmd_input_chunk_type type;
|
|
31
63
|
std::vector<llama_token> tokens_text;
|
|
32
64
|
mtmd_image_tokens_ptr tokens_image;
|
|
65
|
+
mtmd_audio_tokens_ptr tokens_audio;
|
|
33
66
|
};
|
|
34
67
|
|
|
35
68
|
struct mtmd_input_chunks {
|
|
@@ -42,9 +75,14 @@ enum mtmd_slice_tmpl {
|
|
|
42
75
|
MTMD_SLICE_TMPL_NONE,
|
|
43
76
|
MTMD_SLICE_TMPL_MINICPMV_2_5,
|
|
44
77
|
MTMD_SLICE_TMPL_MINICPMV_2_6,
|
|
78
|
+
MTMD_SLICE_TMPL_LLAMA4,
|
|
45
79
|
// TODO @ngxson : add support for idefics (SmolVLM)
|
|
46
80
|
};
|
|
47
81
|
|
|
82
|
+
const char * mtmd_default_marker() {
|
|
83
|
+
return "<__media__>";
|
|
84
|
+
}
|
|
85
|
+
|
|
48
86
|
mtmd_context_params mtmd_context_params_default() {
|
|
49
87
|
mtmd_context_params params;
|
|
50
88
|
params.use_gpu = true;
|
|
@@ -52,6 +90,7 @@ mtmd_context_params mtmd_context_params_default() {
|
|
|
52
90
|
params.n_threads = 4;
|
|
53
91
|
params.verbosity = GGML_LOG_LEVEL_INFO;
|
|
54
92
|
params.image_marker = MTMD_DEFAULT_IMAGE_MARKER;
|
|
93
|
+
params.media_marker = mtmd_default_marker();
|
|
55
94
|
return params;
|
|
56
95
|
}
|
|
57
96
|
|
|
@@ -62,20 +101,29 @@ struct mtmd_context {
|
|
|
62
101
|
|
|
63
102
|
bool print_timings;
|
|
64
103
|
int n_threads;
|
|
65
|
-
std::string
|
|
104
|
+
std::string media_marker;
|
|
105
|
+
bool has_vision;
|
|
106
|
+
bool has_audio;
|
|
66
107
|
|
|
67
|
-
// for
|
|
108
|
+
// for llava-uhd style models, we need special tokens in-between slices
|
|
109
|
+
// minicpmv calls them "slices", llama 4 calls them "tiles"
|
|
68
110
|
mtmd_slice_tmpl slice_tmpl = MTMD_SLICE_TMPL_NONE;
|
|
69
111
|
llama_token tok_ov_img_start = LLAMA_TOKEN_NULL; // overview image
|
|
70
112
|
llama_token tok_ov_img_end = LLAMA_TOKEN_NULL; // overview image
|
|
71
113
|
llama_token tok_slices_start = LLAMA_TOKEN_NULL; // start of all slices
|
|
72
114
|
llama_token tok_slices_end = LLAMA_TOKEN_NULL; // end of all slices
|
|
73
|
-
llama_token tok_sli_img_start = LLAMA_TOKEN_NULL; // single slice
|
|
74
|
-
llama_token tok_sli_img_end = LLAMA_TOKEN_NULL; // single slice
|
|
115
|
+
llama_token tok_sli_img_start = LLAMA_TOKEN_NULL; // single slice start
|
|
116
|
+
llama_token tok_sli_img_end = LLAMA_TOKEN_NULL; // single slice end
|
|
117
|
+
llama_token tok_sli_img_mid = LLAMA_TOKEN_NULL; // between 2 slices
|
|
75
118
|
llama_token tok_row_end = LLAMA_TOKEN_NULL; // end of row
|
|
119
|
+
bool tok_row_end_trail = false;
|
|
120
|
+
bool ov_img_first = false;
|
|
76
121
|
|
|
77
122
|
bool use_mrope = false; // for Qwen2VL, we need to use M-RoPE
|
|
78
123
|
|
|
124
|
+
// for whisper, we pre-calculate the mel filter bank
|
|
125
|
+
whisper_preprocessor::whisper_filters w_filters;
|
|
126
|
+
|
|
79
127
|
// TODO @ngxson : add timings
|
|
80
128
|
|
|
81
129
|
mtmd_context(const char * mmproj_fname,
|
|
@@ -84,8 +132,12 @@ struct mtmd_context {
|
|
|
84
132
|
text_model (text_model),
|
|
85
133
|
print_timings(ctx_params.print_timings),
|
|
86
134
|
n_threads (ctx_params.n_threads),
|
|
87
|
-
|
|
135
|
+
media_marker (ctx_params.media_marker)
|
|
88
136
|
{
|
|
137
|
+
if (std::string(ctx_params.image_marker) != MTMD_DEFAULT_IMAGE_MARKER) {
|
|
138
|
+
throw std::runtime_error("custom image_marker is not supported anymore, use media_marker instead");
|
|
139
|
+
}
|
|
140
|
+
|
|
89
141
|
clip_context_params ctx_clip_params;
|
|
90
142
|
ctx_clip_params.use_gpu = ctx_params.use_gpu;
|
|
91
143
|
ctx_clip_params.verbosity = ctx_params.verbosity;
|
|
@@ -94,8 +146,11 @@ struct mtmd_context {
|
|
|
94
146
|
throw std::runtime_error(string_format("Failed to load CLIP model from %s\n", mmproj_fname));
|
|
95
147
|
}
|
|
96
148
|
|
|
97
|
-
|
|
149
|
+
has_vision = clip_has_vision_encoder(ctx_clip);
|
|
150
|
+
has_audio = clip_has_audio_encoder(ctx_clip);
|
|
151
|
+
use_mrope = clip_is_qwen2vl(ctx_clip);
|
|
98
152
|
|
|
153
|
+
projector_type proj = clip_get_projector_type(ctx_clip);
|
|
99
154
|
int minicpmv_version = clip_is_minicpmv(ctx_clip);
|
|
100
155
|
if (minicpmv_version == 2) {
|
|
101
156
|
// minicpmv 2.5 format:
|
|
@@ -108,6 +163,8 @@ struct mtmd_context {
|
|
|
108
163
|
tok_sli_img_start = tok_ov_img_start;
|
|
109
164
|
tok_sli_img_end = tok_ov_img_end;
|
|
110
165
|
tok_row_end = lookup_token("\n");
|
|
166
|
+
tok_row_end_trail = false; // no trailing end-of-row token
|
|
167
|
+
ov_img_first = true;
|
|
111
168
|
|
|
112
169
|
} else if (minicpmv_version == 3 || minicpmv_version == 4) {
|
|
113
170
|
// minicpmv 2.6 format:
|
|
@@ -118,9 +175,40 @@ struct mtmd_context {
|
|
|
118
175
|
tok_sli_img_start = lookup_token("<slice>");
|
|
119
176
|
tok_sli_img_end = lookup_token("</slice>");
|
|
120
177
|
tok_row_end = lookup_token("\n");
|
|
178
|
+
tok_row_end_trail = false; // no trailing end-of-row token
|
|
179
|
+
ov_img_first = true;
|
|
121
180
|
|
|
122
181
|
} else if (minicpmv_version != 0) {
|
|
123
182
|
GGML_ASSERT(false && "unsupported minicpmv version");
|
|
183
|
+
} else if (proj == PROJECTOR_TYPE_LLAMA4) {
|
|
184
|
+
// llama 4 format:
|
|
185
|
+
// <|image_start|>
|
|
186
|
+
// (slice) <|tile_x_separator|> (slice) <|tile_x_separator|> ... <|tile_y_separator|>
|
|
187
|
+
// (slice) <|tile_x_separator|> (slice) <|tile_x_separator|> ... <|tile_y_separator|>
|
|
188
|
+
// ... <|tile_y_separator|> <-- trailing end-of-row token
|
|
189
|
+
// <|image|> (overview) <-- overview image is last
|
|
190
|
+
// <|image_end|>
|
|
191
|
+
slice_tmpl = MTMD_SLICE_TMPL_LLAMA4;
|
|
192
|
+
tok_ov_img_start = lookup_token("<|image|>");
|
|
193
|
+
tok_sli_img_mid = lookup_token("<|tile_x_separator|>");
|
|
194
|
+
tok_row_end = lookup_token("<|tile_y_separator|>");
|
|
195
|
+
tok_row_end_trail = true; // add trailing end-of-row token
|
|
196
|
+
ov_img_first = false; // overview image is last
|
|
197
|
+
}
|
|
198
|
+
|
|
199
|
+
if (proj == PROJECTOR_TYPE_ULTRAVOX) {
|
|
200
|
+
// TODO @ngxson : check if model n_mel is 128 or 80
|
|
201
|
+
w_filters = whisper_precalc_filters::get_128_bins();
|
|
202
|
+
}
|
|
203
|
+
|
|
204
|
+
// warning messages
|
|
205
|
+
if (proj == PROJECTOR_TYPE_LLAMA4) {
|
|
206
|
+
LOG_WRN("%s: llama 4 vision is known to have degraded quality:\n"
|
|
207
|
+
" https://github.com/ggml-org/llama.cpp/pull/13282\n", __func__);
|
|
208
|
+
}
|
|
209
|
+
if (has_audio) {
|
|
210
|
+
LOG_WRN("%s: audio input is in experimental stage and may have reduced quality:\n"
|
|
211
|
+
" https://github.com/ggml-org/llama.cpp/pull/13623\n", __func__);
|
|
124
212
|
}
|
|
125
213
|
}
|
|
126
214
|
|
|
@@ -155,29 +243,6 @@ private:
|
|
|
155
243
|
}
|
|
156
244
|
};
|
|
157
245
|
|
|
158
|
-
struct mtmd_image_tokens_data {
|
|
159
|
-
clip_image_f32_batch batch_f32; // preprocessed image patches
|
|
160
|
-
};
|
|
161
|
-
|
|
162
|
-
struct mtmd_image_tokens {
|
|
163
|
-
uint32_t nx; // number of tokens in x direction
|
|
164
|
-
uint32_t ny; // number of tokens in y direction
|
|
165
|
-
bool use_mrope_pos = false; // use M-RoPE position counting (the whole image is 1 temporal position)
|
|
166
|
-
uint32_t n_tokens() const { return nx * ny; }
|
|
167
|
-
clip_image_f32_batch batch_f32; // preprocessed image patches
|
|
168
|
-
std::string id; // optional user-defined ID, useful for KV cache tracking
|
|
169
|
-
|
|
170
|
-
mtmd_image_tokens clone() {
|
|
171
|
-
return mtmd_image_tokens{
|
|
172
|
-
nx,
|
|
173
|
-
ny,
|
|
174
|
-
use_mrope_pos,
|
|
175
|
-
batch_f32.clone(),
|
|
176
|
-
id
|
|
177
|
-
};
|
|
178
|
-
}
|
|
179
|
-
};
|
|
180
|
-
|
|
181
246
|
mtmd_context * mtmd_init_from_file(const char * mmproj_fname,
|
|
182
247
|
const struct llama_model * text_model,
|
|
183
248
|
const struct mtmd_context_params ctx_params) {
|
|
@@ -223,57 +288,63 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
|
|
|
223
288
|
auto vocab = llama_model_get_vocab(ctx->text_model);
|
|
224
289
|
|
|
225
290
|
std::string prompt_modified(text->text);
|
|
226
|
-
std::string marker_modified(ctx->
|
|
291
|
+
std::string marker_modified(ctx->media_marker);
|
|
227
292
|
projector_type proj_type = clip_get_projector_type(ctx->ctx_clip);
|
|
228
293
|
|
|
294
|
+
// for compatibility, we convert image marker to media marker
|
|
295
|
+
string_replace_all(prompt_modified, MTMD_DEFAULT_IMAGE_MARKER, ctx->media_marker);
|
|
296
|
+
|
|
229
297
|
// a bit hacky here, but works for now
|
|
230
298
|
// for some models, we need to add prefix and suffix to the image embeddings
|
|
231
299
|
if (clip_is_gemma3(ctx->ctx_clip)) {
|
|
232
300
|
// gemma 3
|
|
233
301
|
// <start_of_image> ... (image embeddings) ... <end_of_image>
|
|
234
|
-
marker_modified = "<start_of_image>" + ctx->
|
|
235
|
-
string_replace_all(prompt_modified, ctx->
|
|
302
|
+
marker_modified = "<start_of_image>" + ctx->media_marker + "<end_of_image>";
|
|
303
|
+
string_replace_all(prompt_modified, ctx->media_marker, marker_modified);
|
|
236
304
|
|
|
237
305
|
} else if (proj_type == PROJECTOR_TYPE_IDEFICS3) {
|
|
238
306
|
// https://github.com/huggingface/transformers/blob/a42ba80fa520c784c8f11a973ca9034e5f859b79/src/transformers/models/idefics3/processing_idefics3.py#L192-L215
|
|
239
|
-
marker_modified = "<fake_token_around_image><global-img>" + ctx->
|
|
240
|
-
string_replace_all(prompt_modified, ctx->
|
|
307
|
+
marker_modified = "<fake_token_around_image><global-img>" + ctx->media_marker + "<fake_token_around_image>";
|
|
308
|
+
string_replace_all(prompt_modified, ctx->media_marker, marker_modified);
|
|
241
309
|
|
|
242
310
|
} else if (proj_type == PROJECTOR_TYPE_PIXTRAL) {
|
|
243
311
|
// https://github.com/huggingface/transformers/blob/1cd110c6cb6a6237614130c470e9a902dbc1a4bd/docs/source/en/model_doc/pixtral.md
|
|
244
|
-
marker_modified = ctx->
|
|
245
|
-
string_replace_all(prompt_modified, ctx->
|
|
246
|
-
}
|
|
312
|
+
marker_modified = ctx->media_marker + "[IMG_END]";
|
|
313
|
+
string_replace_all(prompt_modified, ctx->media_marker, marker_modified);
|
|
247
314
|
|
|
248
|
-
else if (proj_type == PROJECTOR_TYPE_QWEN2VL || proj_type == PROJECTOR_TYPE_QWEN25VL) {
|
|
315
|
+
} else if (proj_type == PROJECTOR_TYPE_QWEN2VL || proj_type == PROJECTOR_TYPE_QWEN25VL) {
|
|
249
316
|
// <|vision_start|> ... (image embeddings) ... <|vision_end|>
|
|
250
|
-
marker_modified = "<|vision_start|>" + ctx->
|
|
251
|
-
string_replace_all(prompt_modified, ctx->
|
|
317
|
+
marker_modified = "<|vision_start|>" + ctx->media_marker + "<|vision_end|>";
|
|
318
|
+
string_replace_all(prompt_modified, ctx->media_marker, marker_modified);
|
|
252
319
|
|
|
253
|
-
}
|
|
320
|
+
} else if (proj_type == PROJECTOR_TYPE_LLAMA4) {
|
|
321
|
+
// (more details in mtmd_context constructor)
|
|
322
|
+
marker_modified = "<|image_start|>" + ctx->media_marker + "<|image_end|>";
|
|
323
|
+
string_replace_all(prompt_modified, ctx->media_marker, marker_modified);
|
|
254
324
|
|
|
255
|
-
else if (proj_type == PROJECTOR_TYPE_INTERNVL) {
|
|
325
|
+
} else if (proj_type == PROJECTOR_TYPE_INTERNVL) {
|
|
256
326
|
// <img> ... (image embeddings) ... </img>
|
|
257
|
-
marker_modified = "<img>" + ctx->
|
|
258
|
-
string_replace_all(prompt_modified, ctx->
|
|
327
|
+
marker_modified = "<img>" + ctx->media_marker + "</img>";
|
|
328
|
+
string_replace_all(prompt_modified, ctx->media_marker, marker_modified);
|
|
259
329
|
|
|
260
330
|
}
|
|
261
331
|
|
|
262
332
|
// llava-1.5, llava-1.6, Yi-VL, Yi-34B, granite: don't need to add prefix and suffix
|
|
263
333
|
// for glm-edge, BOI and EOI token's embeddings are not present in the text model
|
|
264
334
|
|
|
265
|
-
std::vector<std::string> parts = string_split_str(prompt_modified, ctx->
|
|
335
|
+
std::vector<std::string> parts = string_split_str(prompt_modified, ctx->media_marker);
|
|
266
336
|
output->entries.clear();
|
|
267
337
|
output->entries.reserve(parts.size());
|
|
268
338
|
|
|
269
|
-
size_t
|
|
339
|
+
size_t i_bm = 0;
|
|
270
340
|
|
|
271
341
|
// utility for adding raw tokens
|
|
272
342
|
auto add_text_chunk = [&output](std::vector<llama_token> && tokens) {
|
|
273
343
|
mtmd_input_chunk chunk{
|
|
274
344
|
MTMD_INPUT_CHUNK_TYPE_TEXT,
|
|
275
345
|
std::move(tokens),
|
|
276
|
-
|
|
346
|
+
nullptr, // image tokens
|
|
347
|
+
nullptr, // audio tokens
|
|
277
348
|
};
|
|
278
349
|
output->entries.emplace_back(std::move(chunk));
|
|
279
350
|
};
|
|
@@ -291,8 +362,9 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
|
|
|
291
362
|
|
|
292
363
|
mtmd_input_chunk chunk{
|
|
293
364
|
MTMD_INPUT_CHUNK_TYPE_IMAGE,
|
|
294
|
-
{},
|
|
365
|
+
{}, // text tokens
|
|
295
366
|
std::move(image_tokens),
|
|
367
|
+
nullptr, // audio tokens
|
|
296
368
|
};
|
|
297
369
|
chunks.emplace_back(std::move(chunk));
|
|
298
370
|
}
|
|
@@ -310,25 +382,36 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
|
|
|
310
382
|
mtmd_input_chunk chunk{
|
|
311
383
|
MTMD_INPUT_CHUNK_TYPE_TEXT,
|
|
312
384
|
std::move(tokens),
|
|
313
|
-
|
|
385
|
+
nullptr, // image tokens
|
|
386
|
+
nullptr, // audio tokens
|
|
314
387
|
};
|
|
315
388
|
output->entries.emplace_back(std::move(chunk));
|
|
316
389
|
|
|
317
|
-
|
|
318
|
-
|
|
390
|
+
// only add image/audio tokens to middle of 2 parts
|
|
391
|
+
// therefore, we skip handling image/audio if this is the last part
|
|
392
|
+
if (&parts.back() == &part) {
|
|
393
|
+
continue;
|
|
394
|
+
}
|
|
395
|
+
|
|
396
|
+
if (!bitmaps[i_bm]->is_audio) {
|
|
397
|
+
// handle image
|
|
319
398
|
|
|
320
|
-
if (
|
|
399
|
+
if (i_bm >= n_bitmaps) {
|
|
321
400
|
LOG_ERR("%s: error: not enough images for %d parts\n", __func__, (int)parts.size());
|
|
322
401
|
return 1;
|
|
323
402
|
}
|
|
324
403
|
|
|
404
|
+
if (!ctx->has_vision) {
|
|
405
|
+
LOG_ERR("%s: error: model does not support vision input\n", __func__);
|
|
406
|
+
return 2;
|
|
407
|
+
}
|
|
408
|
+
|
|
325
409
|
// convert mtmd_bitmap to clip_image_u8
|
|
326
410
|
clip_image_u8_ptr img_u8(clip_image_u8_init());
|
|
327
|
-
img_u8->nx = bitmaps[
|
|
328
|
-
img_u8->ny = bitmaps[
|
|
329
|
-
img_u8->buf.resize(bitmaps[
|
|
330
|
-
std::memcpy(img_u8->buf.data(), bitmaps[
|
|
331
|
-
clip_image_size img_u8_size{img_u8->nx, img_u8->ny};
|
|
411
|
+
img_u8->nx = bitmaps[i_bm]->nx;
|
|
412
|
+
img_u8->ny = bitmaps[i_bm]->ny;
|
|
413
|
+
img_u8->buf.resize(bitmaps[i_bm]->data.size());
|
|
414
|
+
std::memcpy(img_u8->buf.data(), bitmaps[i_bm]->data.data(), img_u8->nx * img_u8->ny * 3);
|
|
332
415
|
|
|
333
416
|
// preprocess image
|
|
334
417
|
clip_image_f32_batch batch_f32;
|
|
@@ -338,28 +421,40 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
|
|
|
338
421
|
return 2;
|
|
339
422
|
}
|
|
340
423
|
|
|
341
|
-
|
|
424
|
+
// handle llava-uhd style preprocessing
|
|
425
|
+
if (
|
|
426
|
+
ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_5
|
|
427
|
+
|| ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_6
|
|
428
|
+
|| ctx->slice_tmpl == MTMD_SLICE_TMPL_LLAMA4
|
|
429
|
+
) {
|
|
342
430
|
// split batch into chunks of single images
|
|
343
|
-
auto chunks = split_batch_to_chunk(std::move(batch_f32), bitmaps[
|
|
431
|
+
auto chunks = split_batch_to_chunk(std::move(batch_f32), bitmaps[i_bm]->id);
|
|
344
432
|
GGML_ASSERT(chunks.size() > 0);
|
|
345
433
|
|
|
346
|
-
|
|
347
|
-
add_text_chunk({ctx->tok_ov_img_start});
|
|
348
|
-
output->entries.emplace_back(std::move(chunks.front()));
|
|
434
|
+
auto ov_chunk = std::move(chunks.front());
|
|
349
435
|
chunks.erase(chunks.begin());
|
|
350
|
-
add_text_chunk({ctx->tok_ov_img_end});
|
|
351
436
|
|
|
352
|
-
// add
|
|
437
|
+
// add overview image (first)
|
|
438
|
+
if (ctx->ov_img_first) {
|
|
439
|
+
if (ctx->tok_ov_img_start != LLAMA_TOKEN_NULL) {
|
|
440
|
+
add_text_chunk({ctx->tok_ov_img_start});
|
|
441
|
+
}
|
|
442
|
+
output->entries.emplace_back(std::move(ov_chunk));
|
|
443
|
+
if (ctx->tok_ov_img_end != LLAMA_TOKEN_NULL) {
|
|
444
|
+
add_text_chunk({ctx->tok_ov_img_end});
|
|
445
|
+
}
|
|
446
|
+
}
|
|
447
|
+
|
|
448
|
+
// add slices (or tiles)
|
|
353
449
|
if (!chunks.empty()) {
|
|
354
|
-
|
|
355
|
-
int
|
|
356
|
-
int n_row = (int)chunks.size() / n_col;
|
|
357
|
-
GGML_ASSERT(n_row * n_col == (int)chunks.size());
|
|
450
|
+
const int n_col = batch_f32.grid_x;
|
|
451
|
+
const int n_row = batch_f32.grid_y;
|
|
358
452
|
if (ctx->tok_slices_start != LLAMA_TOKEN_NULL) {
|
|
359
453
|
add_text_chunk({ctx->tok_slices_start});
|
|
360
454
|
}
|
|
361
455
|
for (int y = 0; y < n_row; y++) {
|
|
362
456
|
for (int x = 0; x < n_col; x++) {
|
|
457
|
+
const bool is_last_in_row = (x == n_col - 1);
|
|
363
458
|
if (ctx->tok_sli_img_start != LLAMA_TOKEN_NULL) {
|
|
364
459
|
add_text_chunk({ctx->tok_sli_img_start});
|
|
365
460
|
}
|
|
@@ -367,8 +462,11 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
|
|
|
367
462
|
if (ctx->tok_sli_img_end != LLAMA_TOKEN_NULL) {
|
|
368
463
|
add_text_chunk({ctx->tok_sli_img_end});
|
|
369
464
|
}
|
|
465
|
+
if (!is_last_in_row && ctx->tok_sli_img_mid != LLAMA_TOKEN_NULL) {
|
|
466
|
+
add_text_chunk({ctx->tok_sli_img_mid});
|
|
467
|
+
}
|
|
370
468
|
}
|
|
371
|
-
if (
|
|
469
|
+
if ((y != n_row - 1 || ctx->tok_row_end_trail) && ctx->tok_row_end != LLAMA_TOKEN_NULL) {
|
|
372
470
|
add_text_chunk({ctx->tok_row_end});
|
|
373
471
|
}
|
|
374
472
|
}
|
|
@@ -377,6 +475,17 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
|
|
|
377
475
|
}
|
|
378
476
|
}
|
|
379
477
|
|
|
478
|
+
// add overview image (last)
|
|
479
|
+
if (!ctx->ov_img_first) {
|
|
480
|
+
if (ctx->tok_ov_img_start != LLAMA_TOKEN_NULL) {
|
|
481
|
+
add_text_chunk({ctx->tok_ov_img_start});
|
|
482
|
+
}
|
|
483
|
+
output->entries.emplace_back(std::move(ov_chunk));
|
|
484
|
+
if (ctx->tok_ov_img_end != LLAMA_TOKEN_NULL) {
|
|
485
|
+
add_text_chunk({ctx->tok_ov_img_end});
|
|
486
|
+
}
|
|
487
|
+
}
|
|
488
|
+
|
|
380
489
|
} else {
|
|
381
490
|
size_t n_tokens = 0;
|
|
382
491
|
for (const auto & entry : batch_f32.entries) {
|
|
@@ -395,7 +504,7 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
|
|
|
395
504
|
image_tokens->ny = 1;
|
|
396
505
|
}
|
|
397
506
|
image_tokens->batch_f32 = std::move(batch_f32);
|
|
398
|
-
image_tokens->id = bitmaps[
|
|
507
|
+
image_tokens->id = bitmaps[i_bm]->id; // optional
|
|
399
508
|
|
|
400
509
|
LOG_DBG("image_tokens->nx = %d\n", image_tokens->nx);
|
|
401
510
|
LOG_DBG("image_tokens->ny = %d\n", image_tokens->ny);
|
|
@@ -403,23 +512,101 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
|
|
|
403
512
|
|
|
404
513
|
mtmd_input_chunk chunk{
|
|
405
514
|
MTMD_INPUT_CHUNK_TYPE_IMAGE,
|
|
406
|
-
{},
|
|
515
|
+
{}, // text tokens
|
|
407
516
|
std::move(image_tokens),
|
|
517
|
+
nullptr, // audio tokens
|
|
408
518
|
};
|
|
409
519
|
output->entries.emplace_back(std::move(chunk));
|
|
410
520
|
}
|
|
411
521
|
|
|
412
|
-
|
|
522
|
+
i_bm++; // move to next image
|
|
523
|
+
continue;
|
|
524
|
+
|
|
525
|
+
} else {
|
|
526
|
+
// handle audio
|
|
527
|
+
|
|
528
|
+
if (i_bm >= n_bitmaps) {
|
|
529
|
+
LOG_ERR("%s: error: not enough images for %d parts\n", __func__, (int)parts.size());
|
|
530
|
+
return 1;
|
|
531
|
+
}
|
|
532
|
+
|
|
533
|
+
if (!ctx->has_audio) {
|
|
534
|
+
LOG_ERR("%s: error: model does not support audio input\n", __func__);
|
|
535
|
+
return 2;
|
|
536
|
+
}
|
|
537
|
+
|
|
538
|
+
if (bitmaps[i_bm]->data.size() == 0) {
|
|
539
|
+
LOG_ERR("%s: error: empty audio data\n", __func__);
|
|
540
|
+
return 2;
|
|
541
|
+
}
|
|
542
|
+
|
|
543
|
+
// preprocess audio
|
|
544
|
+
GGML_ASSERT(ctx->w_filters.n_mel); // make sure we have filter preloaded
|
|
545
|
+
std::vector<whisper_preprocessor::whisper_mel> mel_spec_chunks;
|
|
546
|
+
const float * samples = (const float *)bitmaps[i_bm]->data.data();
|
|
547
|
+
size_t n_samples = bitmaps[i_bm]->data.size() / sizeof(float);
|
|
548
|
+
bool ok = whisper_preprocessor::preprocess_audio(samples, n_samples, ctx->w_filters, mel_spec_chunks);
|
|
549
|
+
if (!ok) {
|
|
550
|
+
LOG_ERR("Unable to preprocess audio\n");
|
|
551
|
+
return 2;
|
|
552
|
+
}
|
|
553
|
+
|
|
554
|
+
// consider each mel_spec as a separate audio chunk
|
|
555
|
+
// TODO: maybe support batching, but this may come with memory cost
|
|
556
|
+
for (auto & mel_spec : mel_spec_chunks) {
|
|
557
|
+
clip_image_f32_ptr mel_f32(clip_image_f32_init());
|
|
558
|
+
mel_f32->nx = mel_spec.n_len;
|
|
559
|
+
mel_f32->ny = mel_spec.n_mel;
|
|
560
|
+
mel_f32->buf = std::move(mel_spec.data);
|
|
561
|
+
size_t n_tokens = clip_n_output_tokens(ctx->ctx_clip, mel_f32.get());
|
|
562
|
+
|
|
563
|
+
clip_image_f32_batch batch_f32;
|
|
564
|
+
batch_f32.is_audio = true;
|
|
565
|
+
batch_f32.entries.push_back(std::move(mel_f32));
|
|
566
|
+
|
|
567
|
+
mtmd_audio_tokens_ptr audio_tokens(new mtmd_audio_tokens);
|
|
568
|
+
audio_tokens->n_tokens = n_tokens;
|
|
569
|
+
audio_tokens->batch_f32 = std::move(batch_f32);
|
|
570
|
+
audio_tokens->id = bitmaps[i_bm]->id; // optional
|
|
571
|
+
|
|
572
|
+
LOG_DBG("audio_tokens->n_tokens = %d\n", audio_tokens->n_tokens);
|
|
573
|
+
|
|
574
|
+
mtmd_input_chunk chunk{
|
|
575
|
+
MTMD_INPUT_CHUNK_TYPE_AUDIO,
|
|
576
|
+
{}, // text tokens
|
|
577
|
+
nullptr, // image tokens
|
|
578
|
+
std::move(audio_tokens),
|
|
579
|
+
};
|
|
580
|
+
output->entries.emplace_back(std::move(chunk));
|
|
581
|
+
}
|
|
582
|
+
|
|
583
|
+
i_bm++;
|
|
584
|
+
continue;
|
|
413
585
|
}
|
|
414
586
|
}
|
|
415
587
|
|
|
416
588
|
return 0;
|
|
417
589
|
}
|
|
418
590
|
|
|
419
|
-
|
|
420
|
-
if (
|
|
421
|
-
|
|
591
|
+
int32_t mtmd_encode_chunk(mtmd_context * ctx, const mtmd_input_chunk * chunk) {
|
|
592
|
+
if (chunk->type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
|
|
593
|
+
LOG_WRN("mtmd_encode_chunk has no effect for text chunks\n");
|
|
594
|
+
return 0;
|
|
595
|
+
} else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
|
|
596
|
+
return mtmd_encode(ctx, chunk->tokens_image.get());
|
|
597
|
+
} else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
|
|
598
|
+
int n_mmproj_embd = clip_n_mmproj_embd(ctx->ctx_clip);
|
|
599
|
+
ctx->image_embd_v.resize(chunk->tokens_audio->n_tokens * n_mmproj_embd);
|
|
600
|
+
bool ok = clip_image_batch_encode(
|
|
601
|
+
ctx->ctx_clip,
|
|
602
|
+
ctx->n_threads,
|
|
603
|
+
&chunk->tokens_audio->batch_f32,
|
|
604
|
+
ctx->image_embd_v.data());
|
|
605
|
+
return ok ? 0 : 1;
|
|
422
606
|
}
|
|
607
|
+
|
|
608
|
+
LOG_ERR("mtmd_encode_chunk: unknown chunk type %d\n", (int)chunk->type);
|
|
609
|
+
return 1;
|
|
423
610
|
}
|
|
424
611
|
|
|
425
612
|
int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens) {
|
|
@@ -427,14 +614,6 @@ int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens)
|
|
|
427
614
|
ctx->image_embd_v.resize(image_tokens->n_tokens() * n_mmproj_embd);
|
|
428
615
|
bool ok = false;
|
|
429
616
|
|
|
430
|
-
// only effective for minicpmv and qwen2vl, other models will ignore load_image_size
|
|
431
|
-
{
|
|
432
|
-
clip_image_size slice_size{
|
|
433
|
-
image_tokens->batch_f32.entries[0]->nx,
|
|
434
|
-
image_tokens->batch_f32.entries[0]->ny};
|
|
435
|
-
clip_add_load_image_size(ctx->ctx_clip, &slice_size);
|
|
436
|
-
}
|
|
437
|
-
|
|
438
617
|
if (clip_is_llava(ctx->ctx_clip) || clip_is_minicpmv(ctx->ctx_clip) || clip_is_glm(ctx->ctx_clip)) {
|
|
439
618
|
// TODO @ngxson : llava does not support batched encoding ; this should be fixed inside clip_image_batch_encode()
|
|
440
619
|
const auto & entries = image_tokens->batch_f32.entries;
|
|
@@ -473,8 +652,12 @@ bool mtmd_decode_use_mrope(mtmd_context * ctx) {
|
|
|
473
652
|
return ctx->use_mrope;
|
|
474
653
|
}
|
|
475
654
|
|
|
476
|
-
|
|
477
|
-
|
|
655
|
+
bool mtmd_support_vision(mtmd_context * ctx) {
|
|
656
|
+
return ctx->has_vision;
|
|
657
|
+
}
|
|
658
|
+
|
|
659
|
+
bool mtmd_support_audio(mtmd_context * ctx) {
|
|
660
|
+
return ctx->has_audio;
|
|
478
661
|
}
|
|
479
662
|
|
|
480
663
|
// these 2 helpers below use internal clip_image_u8_ptr,
|
|
@@ -483,6 +666,15 @@ void mtmd_image_tokens_deleter::operator()(mtmd_image_tokens * val) {
|
|
|
483
666
|
// whichever library they want, and then use mtmd_bitmap_init() to create bitmap
|
|
484
667
|
|
|
485
668
|
mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(const unsigned char * buf, size_t len) {
|
|
669
|
+
if (audio_helpers::is_audio_file((const char *)buf, len)) {
|
|
670
|
+
std::vector<float> pcmf32;
|
|
671
|
+
if (!audio_helpers::decode_audio_from_buf(buf, len, COMMON_SAMPLE_RATE, pcmf32)) {
|
|
672
|
+
LOG_ERR("Unable to read WAV audio file from buffer\n");
|
|
673
|
+
return nullptr;
|
|
674
|
+
}
|
|
675
|
+
return mtmd_bitmap_init_from_audio(pcmf32.size(), pcmf32.data());
|
|
676
|
+
}
|
|
677
|
+
|
|
486
678
|
clip_image_u8_ptr img_u8(clip_image_u8_init());
|
|
487
679
|
bool ok = clip_image_load_from_bytes(buf, len, img_u8.get());
|
|
488
680
|
if (!ok) {
|
|
@@ -495,15 +687,26 @@ mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(const unsigned char * buf, size_t
|
|
|
495
687
|
}
|
|
496
688
|
|
|
497
689
|
mtmd_bitmap * mtmd_helper_bitmap_init_from_file(const char * fname) {
|
|
498
|
-
|
|
499
|
-
|
|
500
|
-
if (!
|
|
501
|
-
LOG_ERR("Unable to
|
|
690
|
+
std::vector<unsigned char> buf;
|
|
691
|
+
FILE * f = fopen(fname, "rb");
|
|
692
|
+
if (!f) {
|
|
693
|
+
LOG_ERR("Unable to open file %s: %s\n", fname, strerror(errno));
|
|
502
694
|
return nullptr;
|
|
503
695
|
}
|
|
504
|
-
|
|
505
|
-
|
|
506
|
-
|
|
696
|
+
|
|
697
|
+
fseek(f, 0, SEEK_END);
|
|
698
|
+
long file_size = ftell(f);
|
|
699
|
+
fseek(f, 0, SEEK_SET);
|
|
700
|
+
buf.resize(file_size);
|
|
701
|
+
|
|
702
|
+
size_t n_read = fread(buf.data(), 1, file_size, f);
|
|
703
|
+
fclose(f);
|
|
704
|
+
if (n_read != (size_t)file_size) {
|
|
705
|
+
LOG_ERR("Failed to read entire file %s", fname);
|
|
706
|
+
return nullptr;
|
|
707
|
+
}
|
|
708
|
+
|
|
709
|
+
return mtmd_helper_bitmap_init_from_buf(buf.data(), buf.size());
|
|
507
710
|
}
|
|
508
711
|
|
|
509
712
|
//
|
|
@@ -524,6 +727,18 @@ mtmd_bitmap * mtmd_bitmap_init(uint32_t nx,
|
|
|
524
727
|
return bitmap;
|
|
525
728
|
}
|
|
526
729
|
|
|
730
|
+
mtmd_bitmap * mtmd_bitmap_init_from_audio(size_t n_samples,
|
|
731
|
+
const float * data) {
|
|
732
|
+
mtmd_bitmap * bitmap = new mtmd_bitmap;
|
|
733
|
+
bitmap->nx = n_samples;
|
|
734
|
+
bitmap->ny = 1;
|
|
735
|
+
bitmap->is_audio = true;
|
|
736
|
+
size_t data_size = n_samples * sizeof(float);
|
|
737
|
+
bitmap->data.resize(data_size);
|
|
738
|
+
std::memcpy(bitmap->data.data(), data, data_size);
|
|
739
|
+
return bitmap;
|
|
740
|
+
}
|
|
741
|
+
|
|
527
742
|
uint32_t mtmd_bitmap_get_nx(const mtmd_bitmap * bitmap) {
|
|
528
743
|
return bitmap->nx;
|
|
529
744
|
}
|
|
@@ -536,6 +751,14 @@ const unsigned char * mtmd_bitmap_get_data(const mtmd_bitmap * bitmap) {
|
|
|
536
751
|
return bitmap->data.data();
|
|
537
752
|
}
|
|
538
753
|
|
|
754
|
+
size_t mtmd_bitmap_get_n_bytes(const mtmd_bitmap * bitmap) {
|
|
755
|
+
return bitmap->data.size();
|
|
756
|
+
}
|
|
757
|
+
|
|
758
|
+
bool mtmd_bitmap_is_audio(const mtmd_bitmap * bitmap) {
|
|
759
|
+
return bitmap->is_audio;
|
|
760
|
+
}
|
|
761
|
+
|
|
539
762
|
const char * mtmd_bitmap_get_id(const mtmd_bitmap * bitmap) {
|
|
540
763
|
return bitmap->id.c_str();
|
|
541
764
|
}
|
|
@@ -599,17 +822,56 @@ const mtmd_image_tokens * mtmd_input_chunk_get_tokens_image(const mtmd_input_chu
|
|
|
599
822
|
return nullptr;
|
|
600
823
|
}
|
|
601
824
|
|
|
825
|
+
size_t mtmd_input_chunk_get_n_tokens(const mtmd_input_chunk * chunk) {
|
|
826
|
+
if (chunk->type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
|
|
827
|
+
return chunk->tokens_text.size();
|
|
828
|
+
} else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
|
|
829
|
+
return mtmd_image_tokens_get_n_tokens(chunk->tokens_image.get());
|
|
830
|
+
} else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
|
|
831
|
+
return chunk->tokens_audio->n_tokens;
|
|
832
|
+
} else {
|
|
833
|
+
GGML_ABORT("invalid chunk type");
|
|
834
|
+
}
|
|
835
|
+
}
|
|
836
|
+
|
|
837
|
+
llama_pos mtmd_input_chunk_get_n_pos(const mtmd_input_chunk * chunk) {
|
|
838
|
+
if (chunk->type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
|
|
839
|
+
return chunk->tokens_text.size();
|
|
840
|
+
} else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
|
|
841
|
+
return mtmd_image_tokens_get_n_pos(chunk->tokens_image.get());
|
|
842
|
+
} else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
|
|
843
|
+
return chunk->tokens_audio->n_tokens;
|
|
844
|
+
} else {
|
|
845
|
+
GGML_ABORT("invalid chunk type");
|
|
846
|
+
}
|
|
847
|
+
}
|
|
848
|
+
|
|
849
|
+
const char * mtmd_input_chunk_get_id(const mtmd_input_chunk * chunk) {
|
|
850
|
+
if (chunk->type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
|
|
851
|
+
return chunk->tokens_image->id.c_str();
|
|
852
|
+
} else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
|
|
853
|
+
return chunk->tokens_audio->id.c_str();
|
|
854
|
+
}
|
|
855
|
+
return nullptr;
|
|
856
|
+
}
|
|
857
|
+
|
|
602
858
|
mtmd_input_chunk * mtmd_input_chunk_copy(const mtmd_input_chunk * chunk) {
|
|
603
859
|
mtmd_input_chunk * copy = new mtmd_input_chunk{
|
|
604
860
|
chunk->type,
|
|
605
861
|
chunk->tokens_text,
|
|
606
|
-
|
|
862
|
+
nullptr,
|
|
863
|
+
nullptr,
|
|
607
864
|
};
|
|
608
865
|
if (chunk->tokens_image) {
|
|
609
866
|
// copy the image tokens
|
|
610
867
|
copy->tokens_image = mtmd_image_tokens_ptr(new mtmd_image_tokens());
|
|
611
868
|
*copy->tokens_image = chunk->tokens_image->clone();
|
|
612
869
|
}
|
|
870
|
+
if (chunk->tokens_audio) {
|
|
871
|
+
// copy the audio tokens
|
|
872
|
+
copy->tokens_audio = mtmd_audio_tokens_ptr(new mtmd_audio_tokens());
|
|
873
|
+
*copy->tokens_audio = chunk->tokens_audio->clone();
|
|
874
|
+
}
|
|
613
875
|
return copy;
|
|
614
876
|
}
|
|
615
877
|
|
|
@@ -657,7 +919,8 @@ mtmd_input_chunks * mtmd_test_create_input_chunks() {
|
|
|
657
919
|
mtmd_input_chunk chunk_text{
|
|
658
920
|
MTMD_INPUT_CHUNK_TYPE_TEXT,
|
|
659
921
|
std::move(tokens_text),
|
|
660
|
-
|
|
922
|
+
nullptr, // image tokens
|
|
923
|
+
nullptr, // audio tokens
|
|
661
924
|
};
|
|
662
925
|
chunks->entries.emplace_back(std::move(chunk_text));
|
|
663
926
|
|
|
@@ -669,8 +932,9 @@ mtmd_input_chunks * mtmd_test_create_input_chunks() {
|
|
|
669
932
|
image_tokens->id = "image_1";
|
|
670
933
|
mtmd_input_chunk chunk_image{
|
|
671
934
|
MTMD_INPUT_CHUNK_TYPE_IMAGE,
|
|
672
|
-
{},
|
|
935
|
+
{}, // text tokens
|
|
673
936
|
std::move(image_tokens),
|
|
937
|
+
nullptr, // audio tokens
|
|
674
938
|
};
|
|
675
939
|
chunks->entries.emplace_back(std::move(chunk_image));
|
|
676
940
|
|