@fugood/llama.node 0.3.17 → 0.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +3 -1
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-cuda/arm64/llama-node.node +0 -0
- package/bin/linux-cuda/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +39 -2
- package/lib/index.js +132 -1
- package/lib/index.ts +203 -3
- package/package.json +2 -1
- package/src/EmbeddingWorker.cpp +1 -1
- package/src/LlamaCompletionWorker.cpp +366 -19
- package/src/LlamaCompletionWorker.h +30 -10
- package/src/LlamaContext.cpp +213 -5
- package/src/LlamaContext.h +12 -0
- package/src/common.hpp +15 -0
- package/src/llama.cpp/.github/workflows/build-linux-cross.yml +133 -24
- package/src/llama.cpp/.github/workflows/build.yml +41 -762
- package/src/llama.cpp/.github/workflows/docker.yml +5 -2
- package/src/llama.cpp/.github/workflows/release.yml +716 -0
- package/src/llama.cpp/.github/workflows/server.yml +12 -12
- package/src/llama.cpp/CMakeLists.txt +5 -17
- package/src/llama.cpp/cmake/build-info.cmake +8 -2
- package/src/llama.cpp/cmake/x64-windows-llvm.cmake +0 -6
- package/src/llama.cpp/common/CMakeLists.txt +31 -3
- package/src/llama.cpp/common/arg.cpp +48 -29
- package/src/llama.cpp/common/chat.cpp +128 -106
- package/src/llama.cpp/common/chat.h +2 -0
- package/src/llama.cpp/common/common.cpp +37 -1
- package/src/llama.cpp/common/common.h +18 -9
- package/src/llama.cpp/common/llguidance.cpp +1 -0
- package/src/llama.cpp/common/minja/chat-template.hpp +9 -5
- package/src/llama.cpp/common/minja/minja.hpp +69 -36
- package/src/llama.cpp/common/regex-partial.cpp +204 -0
- package/src/llama.cpp/common/regex-partial.h +56 -0
- package/src/llama.cpp/common/sampling.cpp +57 -50
- package/src/llama.cpp/examples/CMakeLists.txt +2 -23
- package/src/llama.cpp/examples/embedding/embedding.cpp +2 -11
- package/src/llama.cpp/examples/parallel/parallel.cpp +86 -14
- package/src/llama.cpp/examples/training/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/training/finetune.cpp +96 -0
- package/src/llama.cpp/ggml/CMakeLists.txt +27 -0
- package/src/llama.cpp/ggml/include/ggml-backend.h +4 -4
- package/src/llama.cpp/ggml/include/ggml-cpp.h +1 -1
- package/src/llama.cpp/ggml/include/ggml-opt.h +47 -28
- package/src/llama.cpp/ggml/include/ggml.h +10 -7
- package/src/llama.cpp/ggml/src/CMakeLists.txt +1 -1
- package/src/llama.cpp/ggml/src/ggml-alloc.c +4 -1
- package/src/llama.cpp/ggml/src/ggml-backend.cpp +9 -5
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +20 -13
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +0 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +306 -6
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +4 -13
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +29 -16
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +88 -5
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +47 -12
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +264 -69
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +501 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +0 -13
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +0 -6
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +23 -4
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +36 -11
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +0 -2
- package/src/llama.cpp/ggml/src/ggml-opt.cpp +368 -190
- package/src/llama.cpp/ggml/src/ggml-quants.c +0 -6
- package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +41 -27
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +29 -23
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +9 -8
- package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +121 -232
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +7 -15
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +72 -25
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +14 -7
- package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +59 -21
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +7 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +0 -23
- package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +37 -8
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +338 -166
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +185 -89
- package/src/llama.cpp/ggml/src/ggml-sycl/quants.hpp +83 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +128 -53
- package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +81 -70
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +657 -193
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +20 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +123 -29
- package/src/llama.cpp/ggml/src/ggml.c +29 -20
- package/src/llama.cpp/ggml/src/gguf.cpp +33 -33
- package/src/llama.cpp/include/llama.h +52 -11
- package/src/llama.cpp/requirements/requirements-all.txt +3 -3
- package/src/llama.cpp/scripts/xxd.cmake +1 -1
- package/src/llama.cpp/src/CMakeLists.txt +1 -0
- package/src/llama.cpp/src/llama-adapter.cpp +6 -0
- package/src/llama.cpp/src/llama-arch.cpp +3 -0
- package/src/llama.cpp/src/llama-batch.cpp +5 -1
- package/src/llama.cpp/src/llama-batch.h +2 -1
- package/src/llama.cpp/src/llama-chat.cpp +17 -7
- package/src/llama.cpp/src/llama-chat.h +1 -0
- package/src/llama.cpp/src/llama-context.cpp +389 -501
- package/src/llama.cpp/src/llama-context.h +44 -32
- package/src/llama.cpp/src/llama-cparams.h +1 -0
- package/src/llama.cpp/src/llama-graph.cpp +20 -38
- package/src/llama.cpp/src/llama-graph.h +12 -8
- package/src/llama.cpp/src/llama-kv-cache.cpp +1503 -389
- package/src/llama.cpp/src/llama-kv-cache.h +271 -85
- package/src/llama.cpp/src/llama-memory.h +11 -1
- package/src/llama.cpp/src/llama-model-loader.cpp +24 -15
- package/src/llama.cpp/src/llama-model-saver.cpp +281 -0
- package/src/llama.cpp/src/llama-model-saver.h +37 -0
- package/src/llama.cpp/src/llama-model.cpp +316 -69
- package/src/llama.cpp/src/llama-model.h +8 -1
- package/src/llama.cpp/src/llama-quant.cpp +15 -13
- package/src/llama.cpp/src/llama-sampling.cpp +18 -6
- package/src/llama.cpp/src/llama-vocab.cpp +42 -4
- package/src/llama.cpp/src/llama-vocab.h +6 -0
- package/src/llama.cpp/src/llama.cpp +14 -0
- package/src/llama.cpp/tests/CMakeLists.txt +10 -2
- package/src/llama.cpp/tests/test-backend-ops.cpp +107 -47
- package/src/llama.cpp/tests/test-chat-template.cpp +10 -11
- package/src/llama.cpp/tests/test-chat.cpp +3 -1
- package/src/llama.cpp/tests/test-mtmd-c-api.c +63 -0
- package/src/llama.cpp/tests/test-opt.cpp +33 -21
- package/src/llama.cpp/tests/test-regex-partial.cpp +288 -0
- package/src/llama.cpp/tests/test-sampling.cpp +1 -1
- package/src/llama.cpp/tools/CMakeLists.txt +39 -0
- package/src/llama.cpp/{examples → tools}/batched-bench/batched-bench.cpp +2 -2
- package/src/llama.cpp/{examples → tools}/imatrix/imatrix.cpp +11 -9
- package/src/llama.cpp/{examples → tools}/llama-bench/llama-bench.cpp +495 -348
- package/src/llama.cpp/{examples → tools}/main/main.cpp +6 -9
- package/src/llama.cpp/{examples/llava → tools/mtmd}/CMakeLists.txt +1 -35
- package/src/llama.cpp/{examples/llava → tools/mtmd}/clip-impl.h +25 -5
- package/src/llama.cpp/{examples/llava → tools/mtmd}/clip.cpp +1440 -1349
- package/src/llama.cpp/tools/mtmd/clip.h +99 -0
- package/src/llama.cpp/{examples/llava → tools/mtmd}/mtmd-cli.cpp +70 -44
- package/src/llama.cpp/tools/mtmd/mtmd-helper.cpp +310 -0
- package/src/llama.cpp/{examples/llava → tools/mtmd}/mtmd.cpp +251 -281
- package/src/llama.cpp/tools/mtmd/mtmd.h +331 -0
- package/src/llama.cpp/{examples → tools}/perplexity/perplexity.cpp +4 -2
- package/src/llama.cpp/{examples → tools}/quantize/quantize.cpp +13 -76
- package/src/llama.cpp/{examples → tools}/rpc/rpc-server.cpp +70 -74
- package/src/llama.cpp/{examples → tools}/run/run.cpp +18 -4
- package/src/llama.cpp/{examples → tools}/server/CMakeLists.txt +2 -1
- package/src/llama.cpp/{examples → tools}/server/server.cpp +291 -76
- package/src/llama.cpp/{examples → tools}/server/utils.hpp +377 -5
- package/src/llama.cpp/cmake/arm64-windows-msvc.cmake +0 -6
- package/src/llama.cpp/examples/infill/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/infill/infill.cpp +0 -590
- package/src/llama.cpp/examples/llava/android/build_64.sh +0 -8
- package/src/llama.cpp/examples/llava/clip-quantize-cli.cpp +0 -59
- package/src/llama.cpp/examples/llava/clip.h +0 -135
- package/src/llama.cpp/examples/llava/llava.cpp +0 -586
- package/src/llama.cpp/examples/llava/llava.h +0 -49
- package/src/llama.cpp/examples/llava/mtmd.h +0 -168
- package/src/llama.cpp/examples/llava/qwen2vl-test.cpp +0 -636
- /package/src/llama.cpp/{examples → tools}/batched-bench/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/completions.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/cvector-generator.cpp +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/mean.hpp +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/negative.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/pca.hpp +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/positive.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/export-lora/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/export-lora/export-lora.cpp +0 -0
- /package/src/llama.cpp/{examples → tools}/gguf-split/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/gguf-split/gguf-split.cpp +0 -0
- /package/src/llama.cpp/{examples → tools}/imatrix/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/llama-bench/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/main/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples/llava → tools/mtmd}/deprecation-warning.cpp +0 -0
- /package/src/llama.cpp/{examples/llava → tools/mtmd}/requirements.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/perplexity/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/quantize/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/rpc/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/run/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.cpp +0 -0
- /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.h +0 -0
- /package/src/llama.cpp/{examples → tools}/server/bench/requirements.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/server/httplib.h +0 -0
- /package/src/llama.cpp/{examples → tools}/server/tests/requirements.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/tokenize/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/tokenize/tokenize.cpp +0 -0
- /package/src/llama.cpp/{examples → tools}/tts/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/tts/tts.cpp +0 -0
|
@@ -12,6 +12,30 @@
|
|
|
12
12
|
#include <limits>
|
|
13
13
|
#include <vector>
|
|
14
14
|
|
|
15
|
+
// represents raw image data, layout is RGBRGBRGB...
|
|
16
|
+
// length of data must be nx * ny * 3
|
|
17
|
+
struct mtmd_bitmap {
|
|
18
|
+
uint32_t nx;
|
|
19
|
+
uint32_t ny;
|
|
20
|
+
std::vector<unsigned char> data;
|
|
21
|
+
std::string id; // optional user-defined id, for ex: can be set to image hash, useful for KV cache tracking
|
|
22
|
+
};
|
|
23
|
+
|
|
24
|
+
struct mtmd_image_tokens_deleter {
|
|
25
|
+
void operator()(mtmd_image_tokens * val); // forward declaration
|
|
26
|
+
};
|
|
27
|
+
using mtmd_image_tokens_ptr = std::unique_ptr<mtmd_image_tokens, mtmd_image_tokens_deleter>;
|
|
28
|
+
|
|
29
|
+
struct mtmd_input_chunk {
|
|
30
|
+
mtmd_input_chunk_type type;
|
|
31
|
+
std::vector<llama_token> tokens_text;
|
|
32
|
+
mtmd_image_tokens_ptr tokens_image;
|
|
33
|
+
};
|
|
34
|
+
|
|
35
|
+
struct mtmd_input_chunks {
|
|
36
|
+
std::vector<mtmd_input_chunk> entries;
|
|
37
|
+
};
|
|
38
|
+
|
|
15
39
|
// slice template, used by some llava-uhd models to correctly place the special tokens around image embeddings
|
|
16
40
|
// models not having it (llava-1.6) will process embeddings without any special tokens in-between
|
|
17
41
|
enum mtmd_slice_tmpl {
|
|
@@ -21,6 +45,16 @@ enum mtmd_slice_tmpl {
|
|
|
21
45
|
// TODO @ngxson : add support for idefics (SmolVLM)
|
|
22
46
|
};
|
|
23
47
|
|
|
48
|
+
mtmd_context_params mtmd_context_params_default() {
|
|
49
|
+
mtmd_context_params params;
|
|
50
|
+
params.use_gpu = true;
|
|
51
|
+
params.print_timings = true;
|
|
52
|
+
params.n_threads = 4;
|
|
53
|
+
params.verbosity = GGML_LOG_LEVEL_INFO;
|
|
54
|
+
params.image_marker = MTMD_DEFAULT_IMAGE_MARKER;
|
|
55
|
+
return params;
|
|
56
|
+
}
|
|
57
|
+
|
|
24
58
|
struct mtmd_context {
|
|
25
59
|
struct clip_ctx * ctx_clip;
|
|
26
60
|
const struct llama_model * text_model;
|
|
@@ -132,6 +166,16 @@ struct mtmd_image_tokens {
|
|
|
132
166
|
uint32_t n_tokens() const { return nx * ny; }
|
|
133
167
|
clip_image_f32_batch batch_f32; // preprocessed image patches
|
|
134
168
|
std::string id; // optional user-defined ID, useful for KV cache tracking
|
|
169
|
+
|
|
170
|
+
mtmd_image_tokens clone() {
|
|
171
|
+
return mtmd_image_tokens{
|
|
172
|
+
nx,
|
|
173
|
+
ny,
|
|
174
|
+
use_mrope_pos,
|
|
175
|
+
batch_f32.clone(),
|
|
176
|
+
id
|
|
177
|
+
};
|
|
178
|
+
}
|
|
135
179
|
};
|
|
136
180
|
|
|
137
181
|
mtmd_context * mtmd_init_from_file(const char * mmproj_fname,
|
|
@@ -172,12 +216,13 @@ static std::vector<llama_token> mtmd_tokenize_text_internal(
|
|
|
172
216
|
}
|
|
173
217
|
|
|
174
218
|
int32_t mtmd_tokenize(mtmd_context * ctx,
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
219
|
+
mtmd_input_chunks * output,
|
|
220
|
+
const mtmd_input_text * text,
|
|
221
|
+
const mtmd_bitmap ** bitmaps,
|
|
222
|
+
size_t n_bitmaps) {
|
|
178
223
|
auto vocab = llama_model_get_vocab(ctx->text_model);
|
|
179
224
|
|
|
180
|
-
std::string prompt_modified(text.text);
|
|
225
|
+
std::string prompt_modified(text->text);
|
|
181
226
|
std::string marker_modified(ctx->image_marker);
|
|
182
227
|
projector_type proj_type = clip_get_projector_type(ctx->ctx_clip);
|
|
183
228
|
|
|
@@ -189,11 +234,6 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
|
|
|
189
234
|
marker_modified = "<start_of_image>" + ctx->image_marker + "<end_of_image>";
|
|
190
235
|
string_replace_all(prompt_modified, ctx->image_marker, marker_modified);
|
|
191
236
|
|
|
192
|
-
} else if (proj_type == PROJECTOR_TYPE_GLM_EDGE) {
|
|
193
|
-
// <|begin_of_image|> ... (image embeddings) ... <|end_of_image|>
|
|
194
|
-
marker_modified = "<|begin_of_image|>" + ctx->image_marker + "<|end_of_image|>";
|
|
195
|
-
string_replace_all(prompt_modified, ctx->image_marker, marker_modified);
|
|
196
|
-
|
|
197
237
|
} else if (proj_type == PROJECTOR_TYPE_IDEFICS3) {
|
|
198
238
|
// https://github.com/huggingface/transformers/blob/a42ba80fa520c784c8f11a973ca9034e5f859b79/src/transformers/models/idefics3/processing_idefics3.py#L192-L215
|
|
199
239
|
marker_modified = "<fake_token_around_image><global-img>" + ctx->image_marker + "<fake_token_around_image>";
|
|
@@ -212,11 +252,19 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
|
|
|
212
252
|
|
|
213
253
|
}
|
|
214
254
|
|
|
255
|
+
else if (proj_type == PROJECTOR_TYPE_INTERNVL) {
|
|
256
|
+
// <img> ... (image embeddings) ... </img>
|
|
257
|
+
marker_modified = "<img>" + ctx->image_marker + "</img>";
|
|
258
|
+
string_replace_all(prompt_modified, ctx->image_marker, marker_modified);
|
|
259
|
+
|
|
260
|
+
}
|
|
261
|
+
|
|
215
262
|
// llava-1.5, llava-1.6, Yi-VL, Yi-34B, granite: don't need to add prefix and suffix
|
|
263
|
+
// for glm-edge, BOI and EOI token's embeddings are not present in the text model
|
|
216
264
|
|
|
217
265
|
std::vector<std::string> parts = string_split_str(prompt_modified, ctx->image_marker);
|
|
218
|
-
output.clear();
|
|
219
|
-
output.reserve(parts.size());
|
|
266
|
+
output->entries.clear();
|
|
267
|
+
output->entries.reserve(parts.size());
|
|
220
268
|
|
|
221
269
|
size_t i_img = 0;
|
|
222
270
|
|
|
@@ -227,7 +275,7 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
|
|
|
227
275
|
std::move(tokens),
|
|
228
276
|
{},
|
|
229
277
|
};
|
|
230
|
-
output.emplace_back(std::move(chunk));
|
|
278
|
+
output->entries.emplace_back(std::move(chunk));
|
|
231
279
|
};
|
|
232
280
|
|
|
233
281
|
// utility for splitting batch of multiple images into chunks of batch having single images
|
|
@@ -255,7 +303,7 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
|
|
|
255
303
|
for (const auto & part : parts) {
|
|
256
304
|
// printf("tokenizing part: %s\n", part.c_str());
|
|
257
305
|
bool add_bos = &parts.front() == &part;
|
|
258
|
-
auto tokens = mtmd_tokenize_text_internal(vocab, part, text.add_special && add_bos, text.parse_special);
|
|
306
|
+
auto tokens = mtmd_tokenize_text_internal(vocab, part, text->add_special && add_bos, text->parse_special);
|
|
259
307
|
if (tokens.empty()) {
|
|
260
308
|
continue;
|
|
261
309
|
}
|
|
@@ -264,22 +312,22 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
|
|
|
264
312
|
std::move(tokens),
|
|
265
313
|
{},
|
|
266
314
|
};
|
|
267
|
-
output.emplace_back(std::move(chunk));
|
|
315
|
+
output->entries.emplace_back(std::move(chunk));
|
|
268
316
|
|
|
269
317
|
if (&parts.back() != &part) {
|
|
270
318
|
// add image token to middle of 2 parts
|
|
271
319
|
|
|
272
|
-
if (i_img >= bitmaps.size()) {
|
|
320
|
+
if (i_img >= n_bitmaps) {
|
|
273
321
|
LOG_ERR("%s: error: not enough images for %d parts\n", __func__, (int)parts.size());
|
|
274
322
|
return 1;
|
|
275
323
|
}
|
|
276
324
|
|
|
277
325
|
// convert mtmd_bitmap to clip_image_u8
|
|
278
326
|
clip_image_u8_ptr img_u8(clip_image_u8_init());
|
|
279
|
-
img_u8->nx = bitmaps[i_img].nx;
|
|
280
|
-
img_u8->ny = bitmaps[i_img].ny;
|
|
281
|
-
img_u8->buf.resize(bitmaps[i_img].data.size());
|
|
282
|
-
std::memcpy(img_u8->buf.data(), bitmaps[i_img].data.data(), img_u8->nx * img_u8->ny * 3);
|
|
327
|
+
img_u8->nx = bitmaps[i_img]->nx;
|
|
328
|
+
img_u8->ny = bitmaps[i_img]->ny;
|
|
329
|
+
img_u8->buf.resize(bitmaps[i_img]->data.size());
|
|
330
|
+
std::memcpy(img_u8->buf.data(), bitmaps[i_img]->data.data(), img_u8->nx * img_u8->ny * 3);
|
|
283
331
|
clip_image_size img_u8_size{img_u8->nx, img_u8->ny};
|
|
284
332
|
|
|
285
333
|
// preprocess image
|
|
@@ -292,12 +340,12 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
|
|
|
292
340
|
|
|
293
341
|
if (ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_5 || ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_6) {
|
|
294
342
|
// split batch into chunks of single images
|
|
295
|
-
auto chunks = split_batch_to_chunk(std::move(batch_f32), bitmaps[i_img].id);
|
|
343
|
+
auto chunks = split_batch_to_chunk(std::move(batch_f32), bitmaps[i_img]->id);
|
|
296
344
|
GGML_ASSERT(chunks.size() > 0);
|
|
297
345
|
|
|
298
346
|
// add overview image
|
|
299
347
|
add_text_chunk({ctx->tok_ov_img_start});
|
|
300
|
-
output.emplace_back(std::move(chunks.front()));
|
|
348
|
+
output->entries.emplace_back(std::move(chunks.front()));
|
|
301
349
|
chunks.erase(chunks.begin());
|
|
302
350
|
add_text_chunk({ctx->tok_ov_img_end});
|
|
303
351
|
|
|
@@ -315,7 +363,7 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
|
|
|
315
363
|
if (ctx->tok_sli_img_start != LLAMA_TOKEN_NULL) {
|
|
316
364
|
add_text_chunk({ctx->tok_sli_img_start});
|
|
317
365
|
}
|
|
318
|
-
output.emplace_back(std::move(chunks[y * n_col + x]));
|
|
366
|
+
output->entries.emplace_back(std::move(chunks[y * n_col + x]));
|
|
319
367
|
if (ctx->tok_sli_img_end != LLAMA_TOKEN_NULL) {
|
|
320
368
|
add_text_chunk({ctx->tok_sli_img_end});
|
|
321
369
|
}
|
|
@@ -347,7 +395,7 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
|
|
|
347
395
|
image_tokens->ny = 1;
|
|
348
396
|
}
|
|
349
397
|
image_tokens->batch_f32 = std::move(batch_f32);
|
|
350
|
-
image_tokens->id = bitmaps[i_img].id; // optional
|
|
398
|
+
image_tokens->id = bitmaps[i_img]->id; // optional
|
|
351
399
|
|
|
352
400
|
LOG_DBG("image_tokens->nx = %d\n", image_tokens->nx);
|
|
353
401
|
LOG_DBG("image_tokens->ny = %d\n", image_tokens->ny);
|
|
@@ -358,7 +406,7 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
|
|
|
358
406
|
{},
|
|
359
407
|
std::move(image_tokens),
|
|
360
408
|
};
|
|
361
|
-
output.emplace_back(std::move(chunk));
|
|
409
|
+
output->entries.emplace_back(std::move(chunk));
|
|
362
410
|
}
|
|
363
411
|
|
|
364
412
|
i_img++; // move to next image
|
|
@@ -368,35 +416,12 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
|
|
|
368
416
|
return 0;
|
|
369
417
|
}
|
|
370
418
|
|
|
371
|
-
void mtmd_image_tokens_free(mtmd_image_tokens * image_tokens) {
|
|
419
|
+
static void mtmd_image_tokens_free(mtmd_image_tokens * image_tokens) {
|
|
372
420
|
if (image_tokens) {
|
|
373
421
|
delete image_tokens;
|
|
374
422
|
}
|
|
375
423
|
}
|
|
376
424
|
|
|
377
|
-
size_t mtmd_image_tokens_get_n_tokens(const mtmd_image_tokens * image_tokens) {
|
|
378
|
-
return image_tokens->n_tokens();
|
|
379
|
-
}
|
|
380
|
-
|
|
381
|
-
size_t mtmd_image_tokens_get_nx(const mtmd_image_tokens * image_tokens) {
|
|
382
|
-
return image_tokens->nx;
|
|
383
|
-
}
|
|
384
|
-
|
|
385
|
-
size_t mtmd_image_tokens_get_ny(const mtmd_image_tokens * image_tokens) {
|
|
386
|
-
return image_tokens->ny;
|
|
387
|
-
}
|
|
388
|
-
|
|
389
|
-
std::string mtmd_image_tokens_get_id(const mtmd_image_tokens * image_tokens) {
|
|
390
|
-
return image_tokens->id;
|
|
391
|
-
}
|
|
392
|
-
|
|
393
|
-
llama_pos mtmd_image_tokens_get_n_pos(const mtmd_image_tokens * image_tokens) {
|
|
394
|
-
if (image_tokens->use_mrope_pos) {
|
|
395
|
-
return 1; // for M-RoPE, the whole image is 1 in temporal dimension
|
|
396
|
-
}
|
|
397
|
-
return image_tokens->n_tokens();
|
|
398
|
-
}
|
|
399
|
-
|
|
400
425
|
int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens) {
|
|
401
426
|
int n_mmproj_embd = clip_n_mmproj_embd(ctx->ctx_clip);
|
|
402
427
|
ctx->image_embd_v.resize(image_tokens->n_tokens() * n_mmproj_embd);
|
|
@@ -436,273 +461,218 @@ float * mtmd_get_output_embd(mtmd_context * ctx) {
|
|
|
436
461
|
return ctx->image_embd_v.data();
|
|
437
462
|
}
|
|
438
463
|
|
|
439
|
-
|
|
440
|
-
|
|
441
|
-
|
|
442
|
-
|
|
443
|
-
n_tokens += chunk.tokens_text.size();
|
|
444
|
-
} else if (chunk.type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
|
|
445
|
-
n_tokens += mtmd_image_tokens_get_n_tokens(chunk.tokens_image.get());
|
|
446
|
-
} else {
|
|
447
|
-
GGML_ASSERT(false && "chunk type not supported");
|
|
448
|
-
}
|
|
464
|
+
bool mtmd_decode_use_non_causal(mtmd_context * ctx) {
|
|
465
|
+
projector_type proj_type = clip_get_projector_type(ctx->ctx_clip);
|
|
466
|
+
if (proj_type == PROJECTOR_TYPE_GEMMA3) {
|
|
467
|
+
return true;
|
|
449
468
|
}
|
|
450
|
-
return n_tokens;
|
|
469
|
+
return false;
|
|
451
470
|
}
|
|
452
471
|
|
|
453
|
-
|
|
454
|
-
|
|
455
|
-
|
|
456
|
-
if (chunk.type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
|
|
457
|
-
n_pos += chunk.tokens_text.size();
|
|
458
|
-
} else if (chunk.type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
|
|
459
|
-
n_pos += mtmd_image_tokens_get_n_pos(chunk.tokens_image.get());
|
|
460
|
-
} else {
|
|
461
|
-
GGML_ASSERT(false && "chunk type not supported");
|
|
462
|
-
}
|
|
463
|
-
}
|
|
464
|
-
return n_pos;
|
|
465
|
-
}
|
|
466
|
-
|
|
467
|
-
// helper struct to make working with embd batch easier
|
|
468
|
-
// note: this will be removed after llama_batch_ext refactoring
|
|
469
|
-
struct decode_embd_batch {
|
|
470
|
-
int n_pos_per_embd;
|
|
471
|
-
int n_mmproj_embd;
|
|
472
|
-
std::vector<llama_pos> pos;
|
|
473
|
-
std::vector<llama_pos> pos_view; // used by mrope
|
|
474
|
-
std::vector<int32_t> n_seq_id;
|
|
475
|
-
std::vector<llama_seq_id> seq_id_0;
|
|
476
|
-
std::vector<llama_seq_id *> seq_ids;
|
|
477
|
-
std::vector<int8_t> logits;
|
|
478
|
-
llama_batch batch;
|
|
479
|
-
decode_embd_batch(float * embd, int32_t n_tokens, int n_pos_per_embd, int n_mmproj_embd) : n_pos_per_embd(n_pos_per_embd), n_mmproj_embd(n_mmproj_embd) {
|
|
480
|
-
pos .resize(n_tokens * n_pos_per_embd);
|
|
481
|
-
n_seq_id.resize(n_tokens);
|
|
482
|
-
seq_ids .resize(n_tokens + 1);
|
|
483
|
-
logits .resize(n_tokens);
|
|
484
|
-
seq_id_0.resize(1);
|
|
485
|
-
seq_ids [n_tokens] = nullptr;
|
|
486
|
-
batch = {
|
|
487
|
-
/*n_tokens =*/ n_tokens,
|
|
488
|
-
/*tokens =*/ nullptr,
|
|
489
|
-
/*embd =*/ embd,
|
|
490
|
-
/*pos =*/ pos.data(),
|
|
491
|
-
/*n_seq_id =*/ n_seq_id.data(),
|
|
492
|
-
/*seq_id =*/ seq_ids.data(),
|
|
493
|
-
/*logits =*/ logits.data(),
|
|
494
|
-
};
|
|
495
|
-
}
|
|
472
|
+
bool mtmd_decode_use_mrope(mtmd_context * ctx) {
|
|
473
|
+
return ctx->use_mrope;
|
|
474
|
+
}
|
|
496
475
|
|
|
497
|
-
|
|
498
|
-
|
|
499
|
-
|
|
500
|
-
batch.pos [i] = pos_0 + i;
|
|
501
|
-
batch.n_seq_id[i] = 1;
|
|
502
|
-
batch.seq_id [i] = seq_id_0.data();
|
|
503
|
-
batch.logits [i] = false;
|
|
504
|
-
}
|
|
505
|
-
}
|
|
476
|
+
void mtmd_image_tokens_deleter::operator()(mtmd_image_tokens * val) {
|
|
477
|
+
mtmd_image_tokens_free(val);
|
|
478
|
+
}
|
|
506
479
|
|
|
507
|
-
|
|
508
|
-
|
|
509
|
-
|
|
510
|
-
|
|
511
|
-
|
|
512
|
-
|
|
513
|
-
|
|
514
|
-
|
|
515
|
-
|
|
516
|
-
|
|
517
|
-
|
|
518
|
-
}
|
|
519
|
-
for (int i = 0; i < batch.n_tokens; i++) {
|
|
520
|
-
batch.n_seq_id[i] = 1;
|
|
521
|
-
batch.seq_id [i] = seq_id_0.data();
|
|
522
|
-
batch.logits [i] = false;
|
|
523
|
-
}
|
|
480
|
+
// these 2 helpers below use internal clip_image_u8_ptr,
|
|
481
|
+
// so unfortunately they cannot moved to mtmd-helper.h
|
|
482
|
+
// however, in theory, user can decode image file to bitmap using
|
|
483
|
+
// whichever library they want, and then use mtmd_bitmap_init() to create bitmap
|
|
484
|
+
|
|
485
|
+
mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(const unsigned char * buf, size_t len) {
|
|
486
|
+
clip_image_u8_ptr img_u8(clip_image_u8_init());
|
|
487
|
+
bool ok = clip_image_load_from_bytes(buf, len, img_u8.get());
|
|
488
|
+
if (!ok) {
|
|
489
|
+
LOG_ERR("Unable to load image from buffer\n");
|
|
490
|
+
return nullptr;
|
|
524
491
|
}
|
|
492
|
+
uint32_t nx, ny;
|
|
493
|
+
unsigned char * data = clip_image_u8_get_data(img_u8.get(), &nx, &ny);
|
|
494
|
+
return mtmd_bitmap_init(nx, ny, data);
|
|
495
|
+
}
|
|
525
496
|
|
|
526
|
-
|
|
527
|
-
|
|
528
|
-
|
|
529
|
-
|
|
530
|
-
|
|
531
|
-
|
|
532
|
-
// for example, with layout of src: 1234...1234...1234...1234...
|
|
533
|
-
// offset 2 will give us dst: 34...34...34...34...
|
|
534
|
-
for (int i = 0; i < n_pos_per_embd; i++) {
|
|
535
|
-
auto src = pos.begin() + i * batch.n_tokens + offset;
|
|
536
|
-
pos_view.insert(pos_view.end(), src, src + n_tokens);
|
|
537
|
-
}
|
|
538
|
-
pos_ptr = pos_view.data();
|
|
539
|
-
} else {
|
|
540
|
-
// normal
|
|
541
|
-
pos_ptr = pos.data() + offset;
|
|
542
|
-
}
|
|
543
|
-
return {
|
|
544
|
-
/*n_tokens =*/ n_tokens,
|
|
545
|
-
/*tokens =*/ nullptr,
|
|
546
|
-
/*embd =*/ batch.embd + offset * n_mmproj_embd,
|
|
547
|
-
/*pos =*/ pos_ptr,
|
|
548
|
-
/*n_seq_id =*/ batch.n_seq_id + offset,
|
|
549
|
-
/*seq_id =*/ batch.seq_id + offset,
|
|
550
|
-
/*logits =*/ batch.logits + offset,
|
|
551
|
-
};
|
|
497
|
+
mtmd_bitmap * mtmd_helper_bitmap_init_from_file(const char * fname) {
|
|
498
|
+
clip_image_u8_ptr img_u8(clip_image_u8_init());
|
|
499
|
+
bool ok = clip_image_load_from_file(fname, img_u8.get());
|
|
500
|
+
if (!ok) {
|
|
501
|
+
LOG_ERR("Unable to load image %s\n", fname);
|
|
502
|
+
return nullptr;
|
|
552
503
|
}
|
|
553
|
-
|
|
504
|
+
uint32_t nx, ny;
|
|
505
|
+
unsigned char * data = clip_image_u8_get_data(img_u8.get(), &nx, &ny);
|
|
506
|
+
return mtmd_bitmap_init(nx, ny, data);
|
|
507
|
+
}
|
|
554
508
|
|
|
555
|
-
|
|
556
|
-
|
|
557
|
-
|
|
558
|
-
|
|
559
|
-
|
|
560
|
-
|
|
561
|
-
|
|
562
|
-
|
|
563
|
-
|
|
564
|
-
|
|
565
|
-
|
|
566
|
-
|
|
567
|
-
|
|
568
|
-
|
|
569
|
-
|
|
570
|
-
|
|
571
|
-
|
|
572
|
-
while (i < chunk.tokens_text.size()) { // split into batches
|
|
573
|
-
for (; i < chunk.tokens_text.size() && text_batch.n_tokens < n_batch; i++) {
|
|
574
|
-
text_batch.token [i] = chunk.tokens_text[i];
|
|
575
|
-
text_batch.pos [i] = n_past++;
|
|
576
|
-
text_batch.n_seq_id[i] = 1;
|
|
577
|
-
text_batch.seq_id [i][0] = seq_id;
|
|
578
|
-
text_batch.logits [i] = false;
|
|
579
|
-
}
|
|
580
|
-
if (is_last) {
|
|
581
|
-
// always get logits for last input chunk
|
|
582
|
-
text_batch.logits[text_batch.n_tokens - 1] = true;
|
|
583
|
-
}
|
|
584
|
-
ret = llama_decode(lctx, text_batch);
|
|
585
|
-
if (ret != 0) {
|
|
586
|
-
LOG_ERR("failed to decode text\n");
|
|
587
|
-
llama_batch_free(text_batch);
|
|
588
|
-
return ret;
|
|
589
|
-
}
|
|
590
|
-
}
|
|
509
|
+
//
|
|
510
|
+
// public API functions
|
|
511
|
+
//
|
|
512
|
+
|
|
513
|
+
// mtmd_bitmap
|
|
514
|
+
|
|
515
|
+
mtmd_bitmap * mtmd_bitmap_init(uint32_t nx,
|
|
516
|
+
uint32_t ny,
|
|
517
|
+
const unsigned char * data) {
|
|
518
|
+
mtmd_bitmap * bitmap = new mtmd_bitmap;
|
|
519
|
+
bitmap->nx = nx;
|
|
520
|
+
bitmap->ny = ny;
|
|
521
|
+
size_t data_size = (size_t)nx * ny * 3;
|
|
522
|
+
bitmap->data.resize(data_size);
|
|
523
|
+
std::memcpy(bitmap->data.data(), data, data_size);
|
|
524
|
+
return bitmap;
|
|
525
|
+
}
|
|
591
526
|
|
|
592
|
-
|
|
593
|
-
|
|
594
|
-
|
|
595
|
-
int64_t t0 = ggml_time_ms();
|
|
596
|
-
if (ctx->print_timings) {
|
|
597
|
-
LOG_INF("encoding image or slice...\n");
|
|
598
|
-
}
|
|
599
|
-
ret = mtmd_encode(ctx, chunk.tokens_image.get());
|
|
600
|
-
if (ret != 0) {
|
|
601
|
-
LOG_ERR("failed to encode image\n");
|
|
602
|
-
llama_batch_free(text_batch);
|
|
603
|
-
return ret;
|
|
604
|
-
}
|
|
605
|
-
if (ctx->print_timings) {
|
|
606
|
-
LOG_INF("image/slice encoded in %" PRId64 " ms\n", ggml_time_ms() - t0);
|
|
607
|
-
}
|
|
527
|
+
uint32_t mtmd_bitmap_get_nx(const mtmd_bitmap * bitmap) {
|
|
528
|
+
return bitmap->nx;
|
|
529
|
+
}
|
|
608
530
|
|
|
609
|
-
|
|
610
|
-
|
|
611
|
-
|
|
612
|
-
float * embd = mtmd_get_output_embd(ctx);
|
|
613
|
-
decode_embd_batch batch_embd(embd, n_tokens, n_pos_per_embd, n_mmproj_embd);
|
|
531
|
+
uint32_t mtmd_bitmap_get_ny(const mtmd_bitmap * bitmap) {
|
|
532
|
+
return bitmap->ny;
|
|
533
|
+
}
|
|
614
534
|
|
|
615
|
-
|
|
616
|
-
|
|
535
|
+
const unsigned char * mtmd_bitmap_get_data(const mtmd_bitmap * bitmap) {
|
|
536
|
+
return bitmap->data.data();
|
|
537
|
+
}
|
|
617
538
|
|
|
618
|
-
|
|
619
|
-
|
|
620
|
-
|
|
621
|
-
batch_embd.set_position_normal(n_past, seq_id);
|
|
622
|
-
}
|
|
539
|
+
const char * mtmd_bitmap_get_id(const mtmd_bitmap * bitmap) {
|
|
540
|
+
return bitmap->id.c_str();
|
|
541
|
+
}
|
|
623
542
|
|
|
624
|
-
|
|
625
|
-
|
|
626
|
-
|
|
627
|
-
|
|
543
|
+
void mtmd_bitmap_set_id(mtmd_bitmap * bitmap, const char * id) {
|
|
544
|
+
if (id) {
|
|
545
|
+
bitmap->id = std::string(id);
|
|
546
|
+
} else {
|
|
547
|
+
bitmap->id.clear();
|
|
548
|
+
}
|
|
549
|
+
}
|
|
628
550
|
|
|
629
|
-
|
|
630
|
-
|
|
631
|
-
|
|
632
|
-
|
|
551
|
+
void mtmd_bitmap_free(mtmd_bitmap * bitmap) {
|
|
552
|
+
if (bitmap) {
|
|
553
|
+
delete bitmap;
|
|
554
|
+
}
|
|
555
|
+
}
|
|
633
556
|
|
|
634
|
-
|
|
557
|
+
// mtmd_input_chunks
|
|
635
558
|
|
|
636
|
-
|
|
637
|
-
|
|
638
|
-
|
|
639
|
-
LOG_ERR("failed to decode image\n");
|
|
640
|
-
llama_set_causal_attn(lctx, true); // restore causal attn
|
|
641
|
-
llama_batch_free(text_batch);
|
|
642
|
-
return ret;
|
|
643
|
-
}
|
|
559
|
+
mtmd_input_chunks * mtmd_input_chunks_init() {
|
|
560
|
+
return new mtmd_input_chunks;
|
|
561
|
+
}
|
|
644
562
|
|
|
645
|
-
|
|
646
|
-
|
|
647
|
-
|
|
563
|
+
size_t mtmd_input_chunks_size(const mtmd_input_chunks * chunks) {
|
|
564
|
+
return chunks->entries.size();
|
|
565
|
+
}
|
|
648
566
|
|
|
649
|
-
|
|
650
|
-
|
|
567
|
+
const mtmd_input_chunk * mtmd_input_chunks_get(const mtmd_input_chunks * chunks, size_t idx) {
|
|
568
|
+
if (idx >= chunks->entries.size()) {
|
|
569
|
+
return nullptr;
|
|
570
|
+
}
|
|
571
|
+
return &chunks->entries[idx];
|
|
572
|
+
}
|
|
651
573
|
|
|
652
|
-
|
|
653
|
-
|
|
574
|
+
void mtmd_input_chunks_free(mtmd_input_chunks * chunks) {
|
|
575
|
+
if (chunks) {
|
|
576
|
+
delete chunks;
|
|
577
|
+
}
|
|
578
|
+
}
|
|
654
579
|
|
|
655
|
-
|
|
656
|
-
llama_set_causal_attn(lctx, true);
|
|
657
|
-
}
|
|
580
|
+
// mtmd_input_chunk
|
|
658
581
|
|
|
659
|
-
|
|
660
|
-
|
|
661
|
-
|
|
662
|
-
}
|
|
582
|
+
enum mtmd_input_chunk_type mtmd_input_chunk_get_type(const mtmd_input_chunk * chunk) {
|
|
583
|
+
return chunk->type;
|
|
584
|
+
}
|
|
663
585
|
|
|
664
|
-
|
|
665
|
-
|
|
586
|
+
const llama_token * mtmd_input_chunk_get_tokens_text(const mtmd_input_chunk * chunk, size_t * n_tokens_output) {
|
|
587
|
+
if (chunk->type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
|
|
588
|
+
*n_tokens_output = chunk->tokens_text.size();
|
|
589
|
+
return chunk->tokens_text.data();
|
|
590
|
+
}
|
|
591
|
+
*n_tokens_output = 0;
|
|
592
|
+
return nullptr;
|
|
666
593
|
}
|
|
667
594
|
|
|
668
|
-
|
|
669
|
-
|
|
670
|
-
|
|
671
|
-
if (!ok) {
|
|
672
|
-
LOG_ERR("Unable to load image from buffer\n");
|
|
673
|
-
return 1;
|
|
595
|
+
const mtmd_image_tokens * mtmd_input_chunk_get_tokens_image(const mtmd_input_chunk * chunk) {
|
|
596
|
+
if (chunk->type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
|
|
597
|
+
return chunk->tokens_image.get();
|
|
674
598
|
}
|
|
675
|
-
|
|
676
|
-
output.data.resize(output.nx * output.ny * 3);
|
|
677
|
-
std::memcpy(output.data.data(), data, output.nx * output.ny * 3);
|
|
678
|
-
return 0;
|
|
599
|
+
return nullptr;
|
|
679
600
|
}
|
|
680
601
|
|
|
681
|
-
|
|
682
|
-
|
|
683
|
-
|
|
684
|
-
|
|
685
|
-
|
|
686
|
-
|
|
602
|
+
mtmd_input_chunk * mtmd_input_chunk_copy(const mtmd_input_chunk * chunk) {
|
|
603
|
+
mtmd_input_chunk * copy = new mtmd_input_chunk{
|
|
604
|
+
chunk->type,
|
|
605
|
+
chunk->tokens_text,
|
|
606
|
+
mtmd_image_tokens_ptr(),
|
|
607
|
+
};
|
|
608
|
+
if (chunk->tokens_image) {
|
|
609
|
+
// copy the image tokens
|
|
610
|
+
copy->tokens_image = mtmd_image_tokens_ptr(new mtmd_image_tokens());
|
|
611
|
+
*copy->tokens_image = chunk->tokens_image->clone();
|
|
687
612
|
}
|
|
688
|
-
|
|
689
|
-
output.data.resize(output.nx * output.ny * 3);
|
|
690
|
-
std::memcpy(output.data.data(), data, output.nx * output.ny * 3);
|
|
691
|
-
return 0;
|
|
613
|
+
return copy;
|
|
692
614
|
}
|
|
693
615
|
|
|
694
|
-
|
|
695
|
-
|
|
696
|
-
|
|
697
|
-
return true;
|
|
616
|
+
void mtmd_input_chunk_free(mtmd_input_chunk * chunk) {
|
|
617
|
+
if (chunk) {
|
|
618
|
+
delete chunk;
|
|
698
619
|
}
|
|
699
|
-
return false;
|
|
700
620
|
}
|
|
701
621
|
|
|
702
|
-
|
|
703
|
-
|
|
622
|
+
// mtmd_image_tokens
|
|
623
|
+
|
|
624
|
+
size_t mtmd_image_tokens_get_n_tokens(const mtmd_image_tokens * image_tokens) {
|
|
625
|
+
return image_tokens->n_tokens();
|
|
704
626
|
}
|
|
705
627
|
|
|
706
|
-
|
|
707
|
-
|
|
628
|
+
size_t mtmd_image_tokens_get_nx(const mtmd_image_tokens * image_tokens) {
|
|
629
|
+
return image_tokens->nx;
|
|
630
|
+
}
|
|
631
|
+
|
|
632
|
+
size_t mtmd_image_tokens_get_ny(const mtmd_image_tokens * image_tokens) {
|
|
633
|
+
return image_tokens->ny;
|
|
634
|
+
}
|
|
635
|
+
|
|
636
|
+
const char * mtmd_image_tokens_get_id(const mtmd_image_tokens * image_tokens) {
|
|
637
|
+
return image_tokens->id.c_str();
|
|
638
|
+
}
|
|
639
|
+
|
|
640
|
+
llama_pos mtmd_image_tokens_get_n_pos(const mtmd_image_tokens * image_tokens) {
|
|
641
|
+
if (image_tokens->use_mrope_pos) {
|
|
642
|
+
return 1; // for M-RoPE, the whole image is 1 in temporal dimension
|
|
643
|
+
}
|
|
644
|
+
return image_tokens->n_tokens();
|
|
645
|
+
}
|
|
646
|
+
|
|
647
|
+
// test function
|
|
648
|
+
|
|
649
|
+
mtmd_input_chunks * mtmd_test_create_input_chunks() {
|
|
650
|
+
mtmd_input_chunks * chunks = mtmd_input_chunks_init();
|
|
651
|
+
if (!chunks) {
|
|
652
|
+
return nullptr;
|
|
653
|
+
}
|
|
654
|
+
|
|
655
|
+
// create a text chunk
|
|
656
|
+
std::vector<llama_token> tokens_text = { 1, 2, 3, 4, 5 };
|
|
657
|
+
mtmd_input_chunk chunk_text{
|
|
658
|
+
MTMD_INPUT_CHUNK_TYPE_TEXT,
|
|
659
|
+
std::move(tokens_text),
|
|
660
|
+
{},
|
|
661
|
+
};
|
|
662
|
+
chunks->entries.emplace_back(std::move(chunk_text));
|
|
663
|
+
|
|
664
|
+
// create an image chunk
|
|
665
|
+
mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens);
|
|
666
|
+
image_tokens->nx = 4;
|
|
667
|
+
image_tokens->ny = 4;
|
|
668
|
+
image_tokens->batch_f32.entries.resize(16);
|
|
669
|
+
image_tokens->id = "image_1";
|
|
670
|
+
mtmd_input_chunk chunk_image{
|
|
671
|
+
MTMD_INPUT_CHUNK_TYPE_IMAGE,
|
|
672
|
+
{},
|
|
673
|
+
std::move(image_tokens),
|
|
674
|
+
};
|
|
675
|
+
chunks->entries.emplace_back(std::move(chunk_image));
|
|
676
|
+
|
|
677
|
+
return chunks;
|
|
708
678
|
}
|