@fugood/llama.node 0.3.16 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +6 -1
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-cuda/arm64/llama-node.node +0 -0
- package/bin/linux-cuda/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +44 -2
- package/lib/index.js +132 -1
- package/lib/index.ts +203 -3
- package/package.json +2 -1
- package/src/EmbeddingWorker.cpp +1 -1
- package/src/LlamaCompletionWorker.cpp +374 -19
- package/src/LlamaCompletionWorker.h +31 -10
- package/src/LlamaContext.cpp +216 -7
- package/src/LlamaContext.h +12 -0
- package/src/common.hpp +15 -0
- package/src/llama.cpp/.github/workflows/build-linux-cross.yml +233 -0
- package/src/llama.cpp/.github/workflows/build.yml +89 -767
- package/src/llama.cpp/.github/workflows/docker.yml +9 -6
- package/src/llama.cpp/.github/workflows/release.yml +716 -0
- package/src/llama.cpp/.github/workflows/server.yml +19 -23
- package/src/llama.cpp/CMakeLists.txt +11 -1
- package/src/llama.cpp/cmake/build-info.cmake +8 -2
- package/src/llama.cpp/cmake/x64-windows-llvm.cmake +0 -6
- package/src/llama.cpp/common/CMakeLists.txt +35 -4
- package/src/llama.cpp/common/arg.cpp +844 -121
- package/src/llama.cpp/common/arg.h +9 -0
- package/src/llama.cpp/common/chat.cpp +129 -107
- package/src/llama.cpp/common/chat.h +2 -0
- package/src/llama.cpp/common/common.cpp +64 -518
- package/src/llama.cpp/common/common.h +35 -45
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +3 -0
- package/src/llama.cpp/common/llguidance.cpp +31 -47
- package/src/llama.cpp/common/minja/chat-template.hpp +23 -11
- package/src/llama.cpp/common/minja/minja.hpp +186 -127
- package/src/llama.cpp/common/regex-partial.cpp +204 -0
- package/src/llama.cpp/common/regex-partial.h +56 -0
- package/src/llama.cpp/common/sampling.cpp +60 -50
- package/src/llama.cpp/docs/build.md +122 -7
- package/src/llama.cpp/examples/CMakeLists.txt +2 -32
- package/src/llama.cpp/examples/batched/batched.cpp +1 -1
- package/src/llama.cpp/examples/embedding/embedding.cpp +9 -12
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +1 -1
- package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
- package/src/llama.cpp/examples/parallel/parallel.cpp +89 -15
- package/src/llama.cpp/examples/passkey/passkey.cpp +1 -1
- package/src/llama.cpp/examples/speculative/speculative.cpp +1 -1
- package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +1 -1
- package/src/llama.cpp/examples/sycl/build.sh +2 -2
- package/src/llama.cpp/examples/sycl/win-build-sycl.bat +2 -2
- package/src/llama.cpp/examples/training/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/training/finetune.cpp +96 -0
- package/src/llama.cpp/ggml/CMakeLists.txt +35 -2
- package/src/llama.cpp/ggml/cmake/GitVars.cmake +22 -0
- package/src/llama.cpp/ggml/include/ggml-backend.h +4 -4
- package/src/llama.cpp/ggml/include/ggml-cpp.h +1 -1
- package/src/llama.cpp/ggml/include/ggml-cpu.h +5 -0
- package/src/llama.cpp/ggml/include/ggml-opt.h +47 -28
- package/src/llama.cpp/ggml/include/ggml-rpc.h +6 -1
- package/src/llama.cpp/ggml/include/ggml.h +76 -106
- package/src/llama.cpp/ggml/src/CMakeLists.txt +11 -8
- package/src/llama.cpp/ggml/src/ggml-alloc.c +4 -1
- package/src/llama.cpp/ggml/src/ggml-backend.cpp +9 -5
- package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +0 -2
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +8 -4
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +5 -5
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +692 -1534
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +613 -122
- package/src/llama.cpp/ggml/src/ggml-cann/common.h +135 -1
- package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +507 -137
- package/src/llama.cpp/ggml/src/ggml-common.h +12 -6
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +66 -33
- package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.cpp +158 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.h +16 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/common.h +72 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +896 -194
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +2 -21
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +1060 -410
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1008 -13533
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +31 -16
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +90 -12
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +47 -13
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +266 -72
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1034 -88
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +8796 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +110 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +892 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +186 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +28 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +252 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +802 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +23 -4
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +7 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +1 -0
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +0 -4
- package/src/llama.cpp/ggml/src/ggml-impl.h +52 -18
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +106 -14
- package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +67 -119
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1023 -262
- package/src/llama.cpp/ggml/src/ggml-opt.cpp +368 -190
- package/src/llama.cpp/ggml/src/ggml-quants.c +0 -6
- package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +307 -40
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +125 -45
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +10 -8
- package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +239 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.hpp +39 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +0 -35
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +9 -307
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +72 -25
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +14 -7
- package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +59 -21
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +7 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +79 -90
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +944 -438
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +22 -23
- package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +37 -8
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +24 -20
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +1 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +507 -411
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +84 -74
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +1 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +185 -89
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +37 -49
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +7 -22
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +4 -14
- package/src/llama.cpp/ggml/src/ggml-sycl/quants.hpp +83 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +204 -118
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +1 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +128 -53
- package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +83 -49
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +1278 -282
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +32 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +133 -30
- package/src/llama.cpp/ggml/src/ggml.c +170 -265
- package/src/llama.cpp/ggml/src/gguf.cpp +34 -33
- package/src/llama.cpp/include/llama.h +82 -22
- package/src/llama.cpp/models/ggml-vocab-llama4.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-llama4.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.out +46 -0
- package/src/llama.cpp/requirements/requirements-all.txt +5 -3
- package/src/llama.cpp/requirements/requirements-gguf_editor_gui.txt +3 -0
- package/src/llama.cpp/scripts/xxd.cmake +1 -1
- package/src/llama.cpp/src/CMakeLists.txt +4 -2
- package/src/llama.cpp/src/llama-adapter.cpp +43 -1
- package/src/llama.cpp/src/llama-arch.cpp +163 -17
- package/src/llama.cpp/src/llama-arch.h +16 -0
- package/src/llama.cpp/src/llama-batch.cpp +5 -1
- package/src/llama.cpp/src/llama-batch.h +2 -1
- package/src/llama.cpp/src/llama-chat.cpp +91 -16
- package/src/llama.cpp/src/llama-chat.h +7 -2
- package/src/llama.cpp/src/llama-context.cpp +479 -575
- package/src/llama.cpp/src/llama-context.h +44 -33
- package/src/llama.cpp/src/llama-cparams.h +1 -0
- package/src/llama.cpp/src/llama-graph.cpp +209 -157
- package/src/llama.cpp/src/llama-graph.h +38 -14
- package/src/llama.cpp/src/llama-hparams.h +13 -0
- package/src/llama.cpp/src/llama-kv-cache.cpp +1604 -543
- package/src/llama.cpp/src/llama-kv-cache.h +283 -171
- package/src/llama.cpp/src/llama-memory.h +12 -2
- package/src/llama.cpp/src/llama-mmap.cpp +1 -1
- package/src/llama.cpp/src/llama-model-loader.cpp +34 -20
- package/src/llama.cpp/src/llama-model-loader.h +5 -3
- package/src/llama.cpp/src/llama-model-saver.cpp +281 -0
- package/src/llama.cpp/src/llama-model-saver.h +37 -0
- package/src/llama.cpp/src/llama-model.cpp +1803 -330
- package/src/llama.cpp/src/llama-model.h +21 -2
- package/src/llama.cpp/src/llama-quant.cpp +33 -10
- package/src/llama.cpp/src/llama-sampling.cpp +25 -7
- package/src/llama.cpp/src/llama-vocab.cpp +86 -10
- package/src/llama.cpp/src/llama-vocab.h +6 -0
- package/src/llama.cpp/src/llama.cpp +15 -1
- package/src/llama.cpp/tests/CMakeLists.txt +52 -31
- package/src/llama.cpp/tests/test-arg-parser.cpp +51 -4
- package/src/llama.cpp/tests/test-backend-ops.cpp +189 -90
- package/src/llama.cpp/tests/test-chat-template.cpp +26 -6
- package/src/llama.cpp/tests/test-chat.cpp +15 -3
- package/src/llama.cpp/{examples/gbnf-validator/gbnf-validator.cpp → tests/test-gbnf-validator.cpp} +2 -2
- package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -2
- package/src/llama.cpp/tests/test-grammar-llguidance.cpp +63 -2
- package/src/llama.cpp/tests/test-grammar-parser.cpp +3 -1
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -1
- package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -1
- package/src/llama.cpp/tests/test-mtmd-c-api.c +63 -0
- package/src/llama.cpp/tests/test-opt.cpp +33 -21
- package/src/llama.cpp/{examples/quantize-stats/quantize-stats.cpp → tests/test-quantize-stats.cpp} +3 -1
- package/src/llama.cpp/tests/test-regex-partial.cpp +288 -0
- package/src/llama.cpp/tests/test-sampling.cpp +1 -1
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +2 -1
- package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +2 -1
- package/src/llama.cpp/tools/CMakeLists.txt +39 -0
- package/src/llama.cpp/{examples → tools}/batched-bench/batched-bench.cpp +3 -3
- package/src/llama.cpp/{examples → tools}/export-lora/export-lora.cpp +1 -1
- package/src/llama.cpp/{examples → tools}/gguf-split/gguf-split.cpp +15 -16
- package/src/llama.cpp/{examples → tools}/imatrix/imatrix.cpp +11 -9
- package/src/llama.cpp/{examples → tools}/llama-bench/llama-bench.cpp +623 -274
- package/src/llama.cpp/{examples → tools}/main/main.cpp +22 -14
- package/src/llama.cpp/tools/mtmd/CMakeLists.txt +47 -0
- package/src/llama.cpp/tools/mtmd/clip-impl.h +365 -0
- package/src/llama.cpp/tools/mtmd/clip.cpp +3646 -0
- package/src/llama.cpp/tools/mtmd/clip.h +99 -0
- package/src/llama.cpp/tools/mtmd/deprecation-warning.cpp +22 -0
- package/src/llama.cpp/tools/mtmd/mtmd-cli.cpp +370 -0
- package/src/llama.cpp/tools/mtmd/mtmd-helper.cpp +310 -0
- package/src/llama.cpp/tools/mtmd/mtmd.cpp +678 -0
- package/src/llama.cpp/tools/mtmd/mtmd.h +331 -0
- package/src/llama.cpp/{examples → tools}/perplexity/perplexity.cpp +21 -5
- package/src/llama.cpp/{examples → tools}/quantize/quantize.cpp +53 -3
- package/src/llama.cpp/tools/rpc/CMakeLists.txt +4 -0
- package/src/llama.cpp/tools/rpc/rpc-server.cpp +322 -0
- package/src/llama.cpp/tools/run/CMakeLists.txt +16 -0
- package/src/llama.cpp/{examples → tools}/run/run.cpp +30 -30
- package/src/llama.cpp/{examples → tools}/server/CMakeLists.txt +2 -1
- package/src/llama.cpp/{examples → tools}/server/httplib.h +313 -247
- package/src/llama.cpp/{examples → tools}/server/server.cpp +529 -215
- package/src/llama.cpp/{examples → tools}/server/utils.hpp +427 -6
- package/src/llama.cpp/{examples → tools}/tts/tts.cpp +6 -9
- package/src/llama.cpp/cmake/arm64-windows-msvc.cmake +0 -6
- package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/infill/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/infill/infill.cpp +0 -590
- package/src/llama.cpp/examples/llava/CMakeLists.txt +0 -66
- package/src/llama.cpp/examples/llava/android/build_64.sh +0 -8
- package/src/llama.cpp/examples/llava/clip-quantize-cli.cpp +0 -59
- package/src/llama.cpp/examples/llava/clip.cpp +0 -3206
- package/src/llama.cpp/examples/llava/clip.h +0 -118
- package/src/llama.cpp/examples/llava/gemma3-cli.cpp +0 -341
- package/src/llama.cpp/examples/llava/llava-cli.cpp +0 -332
- package/src/llama.cpp/examples/llava/llava.cpp +0 -574
- package/src/llama.cpp/examples/llava/llava.h +0 -49
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +0 -354
- package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp +0 -584
- package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +0 -6
- package/src/llama.cpp/examples/rpc/CMakeLists.txt +0 -2
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +0 -171
- package/src/llama.cpp/examples/run/CMakeLists.txt +0 -5
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +0 -30
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +0 -19
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +0 -234
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +0 -197
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +0 -190
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -204
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -191
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -218
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -216
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -295
- /package/src/llama.cpp/{examples → tools}/batched-bench/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/completions.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/cvector-generator.cpp +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/mean.hpp +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/negative.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/pca.hpp +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/positive.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/export-lora/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/gguf-split/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/imatrix/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/llama-bench/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/main/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples/llava → tools/mtmd}/requirements.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/perplexity/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/quantize/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.cpp +0 -0
- /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.h +0 -0
- /package/src/llama.cpp/{examples → tools}/server/bench/requirements.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/server/tests/requirements.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/tokenize/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/tokenize/tokenize.cpp +0 -0
- /package/src/llama.cpp/{examples → tools}/tts/CMakeLists.txt +0 -0
|
@@ -0,0 +1,678 @@
|
|
|
1
|
+
#include "clip.h"
|
|
2
|
+
#include "clip-impl.h"
|
|
3
|
+
#include "mtmd.h"
|
|
4
|
+
|
|
5
|
+
#include "llama.h"
|
|
6
|
+
|
|
7
|
+
#include <algorithm>
|
|
8
|
+
#include <cerrno>
|
|
9
|
+
#include <cstdio>
|
|
10
|
+
#include <cstdlib>
|
|
11
|
+
#include <cstring>
|
|
12
|
+
#include <limits>
|
|
13
|
+
#include <vector>
|
|
14
|
+
|
|
15
|
+
// Raw image data handed to the tokenizer.
// Pixel layout is interleaved RGBRGBRGB..., so `data` must hold exactly
// nx * ny * 3 bytes.
struct mtmd_bitmap {
    // Dimensions default to 0 so that a default-constructed bitmap is a
    // well-defined empty image instead of carrying indeterminate sizes.
    uint32_t nx = 0; // width in pixels
    uint32_t ny = 0; // height in pixels
    std::vector<unsigned char> data; // nx * ny * 3 bytes of RGB data
    std::string id; // optional user-defined id, for ex: can be set to image hash, useful for KV cache tracking
};
|
|
23
|
+
|
|
24
|
+
struct mtmd_image_tokens_deleter {
|
|
25
|
+
void operator()(mtmd_image_tokens * val); // forward declaration
|
|
26
|
+
};
|
|
27
|
+
using mtmd_image_tokens_ptr = std::unique_ptr<mtmd_image_tokens, mtmd_image_tokens_deleter>;
|
|
28
|
+
|
|
29
|
+
struct mtmd_input_chunk {
|
|
30
|
+
mtmd_input_chunk_type type;
|
|
31
|
+
std::vector<llama_token> tokens_text;
|
|
32
|
+
mtmd_image_tokens_ptr tokens_image;
|
|
33
|
+
};
|
|
34
|
+
|
|
35
|
+
struct mtmd_input_chunks {
|
|
36
|
+
std::vector<mtmd_input_chunk> entries;
|
|
37
|
+
};
|
|
38
|
+
|
|
39
|
+
// Slice template: tells how special tokens are placed around the image
// embeddings of llava-uhd style models that cut an image into slices.
// Models without a template (llava-1.6) get their embeddings emitted with no
// special tokens in-between.
enum mtmd_slice_tmpl {
    MTMD_SLICE_TMPL_NONE         = 0, // no special tokens between slices
    MTMD_SLICE_TMPL_MINICPMV_2_5 = 1,
    MTMD_SLICE_TMPL_MINICPMV_2_6 = 2,
    // TODO @ngxson : add support for idefics (SmolVLM)
};
|
|
47
|
+
|
|
48
|
+
// Returns the default parameter set for creating an mtmd_context:
// GPU enabled, timings printed, 4 threads, INFO log level and the default
// image marker.
mtmd_context_params mtmd_context_params_default() {
    mtmd_context_params defaults;

    defaults.use_gpu       = true;
    defaults.print_timings = true;
    defaults.n_threads     = 4;
    defaults.verbosity     = GGML_LOG_LEVEL_INFO;
    defaults.image_marker  = MTMD_DEFAULT_IMAGE_MARKER;

    return defaults;
}
|
|
57
|
+
|
|
58
|
+
struct mtmd_context {
|
|
59
|
+
struct clip_ctx * ctx_clip;
|
|
60
|
+
const struct llama_model * text_model;
|
|
61
|
+
std::vector<float> image_embd_v; // image embedding vector
|
|
62
|
+
|
|
63
|
+
bool print_timings;
|
|
64
|
+
int n_threads;
|
|
65
|
+
std::string image_marker;
|
|
66
|
+
|
|
67
|
+
// for minicpmv, we need special tokens in-between slices
|
|
68
|
+
mtmd_slice_tmpl slice_tmpl = MTMD_SLICE_TMPL_NONE;
|
|
69
|
+
llama_token tok_ov_img_start = LLAMA_TOKEN_NULL; // overview image
|
|
70
|
+
llama_token tok_ov_img_end = LLAMA_TOKEN_NULL; // overview image
|
|
71
|
+
llama_token tok_slices_start = LLAMA_TOKEN_NULL; // start of all slices
|
|
72
|
+
llama_token tok_slices_end = LLAMA_TOKEN_NULL; // end of all slices
|
|
73
|
+
llama_token tok_sli_img_start = LLAMA_TOKEN_NULL; // single slice
|
|
74
|
+
llama_token tok_sli_img_end = LLAMA_TOKEN_NULL; // single slice
|
|
75
|
+
llama_token tok_row_end = LLAMA_TOKEN_NULL; // end of row
|
|
76
|
+
|
|
77
|
+
bool use_mrope = false; // for Qwen2VL, we need to use M-RoPE
|
|
78
|
+
|
|
79
|
+
// TODO @ngxson : add timings
|
|
80
|
+
|
|
81
|
+
mtmd_context(const char * mmproj_fname,
|
|
82
|
+
const llama_model * text_model,
|
|
83
|
+
const mtmd_context_params & ctx_params) :
|
|
84
|
+
text_model (text_model),
|
|
85
|
+
print_timings(ctx_params.print_timings),
|
|
86
|
+
n_threads (ctx_params.n_threads),
|
|
87
|
+
image_marker (ctx_params.image_marker)
|
|
88
|
+
{
|
|
89
|
+
clip_context_params ctx_clip_params;
|
|
90
|
+
ctx_clip_params.use_gpu = ctx_params.use_gpu;
|
|
91
|
+
ctx_clip_params.verbosity = ctx_params.verbosity;
|
|
92
|
+
ctx_clip = clip_init(mmproj_fname, ctx_clip_params);
|
|
93
|
+
if (!ctx_clip) {
|
|
94
|
+
throw std::runtime_error(string_format("Failed to load CLIP model from %s\n", mmproj_fname));
|
|
95
|
+
}
|
|
96
|
+
|
|
97
|
+
use_mrope = clip_is_qwen2vl(ctx_clip);
|
|
98
|
+
|
|
99
|
+
int minicpmv_version = clip_is_minicpmv(ctx_clip);
|
|
100
|
+
if (minicpmv_version == 2) {
|
|
101
|
+
// minicpmv 2.5 format:
|
|
102
|
+
// <image> (overview) </image><slice><image> (slice) </image><image> (slice) </image>\n ... </slice>
|
|
103
|
+
slice_tmpl = MTMD_SLICE_TMPL_MINICPMV_2_5;
|
|
104
|
+
tok_ov_img_start = lookup_token("<image>");
|
|
105
|
+
tok_ov_img_end = lookup_token("</image>");
|
|
106
|
+
tok_slices_start = lookup_token("<slice>");
|
|
107
|
+
tok_slices_end = lookup_token("</slice>");
|
|
108
|
+
tok_sli_img_start = tok_ov_img_start;
|
|
109
|
+
tok_sli_img_end = tok_ov_img_end;
|
|
110
|
+
tok_row_end = lookup_token("\n");
|
|
111
|
+
|
|
112
|
+
} else if (minicpmv_version == 3 || minicpmv_version == 4) {
|
|
113
|
+
// minicpmv 2.6 format:
|
|
114
|
+
// <image> (overview) </image><slice> (slice) </slice><slice> (slice) </slice>\n ...
|
|
115
|
+
slice_tmpl = MTMD_SLICE_TMPL_MINICPMV_2_6;
|
|
116
|
+
tok_ov_img_start = lookup_token("<image>");
|
|
117
|
+
tok_ov_img_end = lookup_token("</image>");
|
|
118
|
+
tok_sli_img_start = lookup_token("<slice>");
|
|
119
|
+
tok_sli_img_end = lookup_token("</slice>");
|
|
120
|
+
tok_row_end = lookup_token("\n");
|
|
121
|
+
|
|
122
|
+
} else if (minicpmv_version != 0) {
|
|
123
|
+
GGML_ASSERT(false && "unsupported minicpmv version");
|
|
124
|
+
}
|
|
125
|
+
}
|
|
126
|
+
|
|
127
|
+
~mtmd_context() {
|
|
128
|
+
clip_free(ctx_clip);
|
|
129
|
+
}
|
|
130
|
+
|
|
131
|
+
private:
|
|
132
|
+
llama_token lookup_token(const std::string & token_text) {
|
|
133
|
+
const llama_vocab * vocab = llama_model_get_vocab(text_model);
|
|
134
|
+
const int n_vocab = llama_vocab_n_tokens(vocab);
|
|
135
|
+
for (int i = 0; i < n_vocab; i++) {
|
|
136
|
+
if (token_to_piece(vocab, i, true) == token_text) {
|
|
137
|
+
return i;
|
|
138
|
+
}
|
|
139
|
+
}
|
|
140
|
+
return LLAMA_TOKEN_NULL;
|
|
141
|
+
}
|
|
142
|
+
|
|
143
|
+
std::string token_to_piece(const llama_vocab * vocab, llama_token token, bool special) {
|
|
144
|
+
std::string piece;
|
|
145
|
+
piece.resize(piece.capacity()); // using string internal cache, 15 bytes + '\n'
|
|
146
|
+
const int n_chars = llama_token_to_piece(vocab, token, &piece[0], piece.size(), 0, special);
|
|
147
|
+
if (n_chars < 0) {
|
|
148
|
+
piece.resize(-n_chars);
|
|
149
|
+
int check = llama_token_to_piece(vocab, token, &piece[0], piece.size(), 0, special);
|
|
150
|
+
GGML_ASSERT(check == -n_chars);
|
|
151
|
+
} else {
|
|
152
|
+
piece.resize(n_chars);
|
|
153
|
+
}
|
|
154
|
+
return piece;
|
|
155
|
+
}
|
|
156
|
+
};
|
|
157
|
+
|
|
158
|
+
struct mtmd_image_tokens_data {
|
|
159
|
+
clip_image_f32_batch batch_f32; // preprocessed image patches
|
|
160
|
+
};
|
|
161
|
+
|
|
162
|
+
struct mtmd_image_tokens {
|
|
163
|
+
uint32_t nx; // number of tokens in x direction
|
|
164
|
+
uint32_t ny; // number of tokens in y direction
|
|
165
|
+
bool use_mrope_pos = false; // use M-RoPE position counting (the whole image is 1 temporal position)
|
|
166
|
+
uint32_t n_tokens() const { return nx * ny; }
|
|
167
|
+
clip_image_f32_batch batch_f32; // preprocessed image patches
|
|
168
|
+
std::string id; // optional user-defined ID, useful for KV cache tracking
|
|
169
|
+
|
|
170
|
+
mtmd_image_tokens clone() {
|
|
171
|
+
return mtmd_image_tokens{
|
|
172
|
+
nx,
|
|
173
|
+
ny,
|
|
174
|
+
use_mrope_pos,
|
|
175
|
+
batch_f32.clone(),
|
|
176
|
+
id
|
|
177
|
+
};
|
|
178
|
+
}
|
|
179
|
+
};
|
|
180
|
+
|
|
181
|
+
// Creates a multimodal context from an mmproj file.
// Returns a heap-allocated context that must be released with mtmd_free(),
// or nullptr when construction throws (the error is logged).
mtmd_context * mtmd_init_from_file(const char * mmproj_fname,
        const struct llama_model * text_model,
        const struct mtmd_context_params ctx_params) {
    mtmd_context * created = nullptr;
    try {
        created = new mtmd_context(mmproj_fname, text_model, ctx_params);
    } catch (const std::exception & e) {
        // exceptions are not propagated to the caller: report and fall
        // through with `created` still null
        LOG_ERR("%s: error: %s\n", __func__, e.what());
    }
    return created;
}
|
|
191
|
+
|
|
192
|
+
// Destroys a context created by mtmd_init_from_file(). Passing nullptr is allowed.
void mtmd_free(mtmd_context * ctx) {
    delete ctx; // deleting a null pointer is a no-op, so no explicit check is needed
}
|
|
197
|
+
|
|
198
|
+
// copied from common_tokenize
|
|
199
|
+
static std::vector<llama_token> mtmd_tokenize_text_internal(
|
|
200
|
+
const struct llama_vocab * vocab,
|
|
201
|
+
const std::string & text,
|
|
202
|
+
bool add_special,
|
|
203
|
+
bool parse_special) {
|
|
204
|
+
// upper limit for the number of tokens
|
|
205
|
+
int n_tokens = text.length() + 2 * add_special;
|
|
206
|
+
std::vector<llama_token> result(n_tokens);
|
|
207
|
+
n_tokens = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
|
|
208
|
+
if (n_tokens < 0) {
|
|
209
|
+
result.resize(-n_tokens);
|
|
210
|
+
int check = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
|
|
211
|
+
GGML_ASSERT(check == -n_tokens);
|
|
212
|
+
} else {
|
|
213
|
+
result.resize(n_tokens);
|
|
214
|
+
}
|
|
215
|
+
return result;
|
|
216
|
+
}
|
|
217
|
+
|
|
218
|
+
int32_t mtmd_tokenize(mtmd_context * ctx,
|
|
219
|
+
mtmd_input_chunks * output,
|
|
220
|
+
const mtmd_input_text * text,
|
|
221
|
+
const mtmd_bitmap ** bitmaps,
|
|
222
|
+
size_t n_bitmaps) {
|
|
223
|
+
auto vocab = llama_model_get_vocab(ctx->text_model);
|
|
224
|
+
|
|
225
|
+
std::string prompt_modified(text->text);
|
|
226
|
+
std::string marker_modified(ctx->image_marker);
|
|
227
|
+
projector_type proj_type = clip_get_projector_type(ctx->ctx_clip);
|
|
228
|
+
|
|
229
|
+
// a bit hacky here, but works for now
|
|
230
|
+
// for some models, we need to add prefix and suffix to the image embeddings
|
|
231
|
+
if (clip_is_gemma3(ctx->ctx_clip)) {
|
|
232
|
+
// gemma 3
|
|
233
|
+
// <start_of_image> ... (image embeddings) ... <end_of_image>
|
|
234
|
+
marker_modified = "<start_of_image>" + ctx->image_marker + "<end_of_image>";
|
|
235
|
+
string_replace_all(prompt_modified, ctx->image_marker, marker_modified);
|
|
236
|
+
|
|
237
|
+
} else if (proj_type == PROJECTOR_TYPE_IDEFICS3) {
|
|
238
|
+
// https://github.com/huggingface/transformers/blob/a42ba80fa520c784c8f11a973ca9034e5f859b79/src/transformers/models/idefics3/processing_idefics3.py#L192-L215
|
|
239
|
+
marker_modified = "<fake_token_around_image><global-img>" + ctx->image_marker + "<fake_token_around_image>";
|
|
240
|
+
string_replace_all(prompt_modified, ctx->image_marker, marker_modified);
|
|
241
|
+
|
|
242
|
+
} else if (proj_type == PROJECTOR_TYPE_PIXTRAL) {
|
|
243
|
+
// https://github.com/huggingface/transformers/blob/1cd110c6cb6a6237614130c470e9a902dbc1a4bd/docs/source/en/model_doc/pixtral.md
|
|
244
|
+
marker_modified = ctx->image_marker + "[IMG_END]";
|
|
245
|
+
string_replace_all(prompt_modified, ctx->image_marker, marker_modified);
|
|
246
|
+
}
|
|
247
|
+
|
|
248
|
+
else if (proj_type == PROJECTOR_TYPE_QWEN2VL || proj_type == PROJECTOR_TYPE_QWEN25VL) {
|
|
249
|
+
// <|vision_start|> ... (image embeddings) ... <|vision_end|>
|
|
250
|
+
marker_modified = "<|vision_start|>" + ctx->image_marker + "<|vision_end|>";
|
|
251
|
+
string_replace_all(prompt_modified, ctx->image_marker, marker_modified);
|
|
252
|
+
|
|
253
|
+
}
|
|
254
|
+
|
|
255
|
+
else if (proj_type == PROJECTOR_TYPE_INTERNVL) {
|
|
256
|
+
// <img> ... (image embeddings) ... </img>
|
|
257
|
+
marker_modified = "<img>" + ctx->image_marker + "</img>";
|
|
258
|
+
string_replace_all(prompt_modified, ctx->image_marker, marker_modified);
|
|
259
|
+
|
|
260
|
+
}
|
|
261
|
+
|
|
262
|
+
// llava-1.5, llava-1.6, Yi-VL, Yi-34B, granite: don't need to add prefix and suffix
|
|
263
|
+
// for glm-edge, BOI and EOI token's embeddings are not present in the text model
|
|
264
|
+
|
|
265
|
+
std::vector<std::string> parts = string_split_str(prompt_modified, ctx->image_marker);
|
|
266
|
+
output->entries.clear();
|
|
267
|
+
output->entries.reserve(parts.size());
|
|
268
|
+
|
|
269
|
+
size_t i_img = 0;
|
|
270
|
+
|
|
271
|
+
// utility for adding raw tokens
|
|
272
|
+
auto add_text_chunk = [&output](std::vector<llama_token> && tokens) {
|
|
273
|
+
mtmd_input_chunk chunk{
|
|
274
|
+
MTMD_INPUT_CHUNK_TYPE_TEXT,
|
|
275
|
+
std::move(tokens),
|
|
276
|
+
{},
|
|
277
|
+
};
|
|
278
|
+
output->entries.emplace_back(std::move(chunk));
|
|
279
|
+
};
|
|
280
|
+
|
|
281
|
+
// utility for splitting batch of multiple images into chunks of batch having single images
|
|
282
|
+
auto split_batch_to_chunk = [&ctx](clip_image_f32_batch && batch_f32, const std::string & id) {
|
|
283
|
+
std::vector<mtmd_input_chunk> chunks;
|
|
284
|
+
|
|
285
|
+
for (auto & entry : batch_f32.entries) {
|
|
286
|
+
mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens);
|
|
287
|
+
image_tokens->nx = clip_n_output_tokens(ctx->ctx_clip, entry.get());
|
|
288
|
+
image_tokens->ny = 1;
|
|
289
|
+
image_tokens->batch_f32.entries.push_back(std::move(entry));
|
|
290
|
+
image_tokens->id = id;
|
|
291
|
+
|
|
292
|
+
mtmd_input_chunk chunk{
|
|
293
|
+
MTMD_INPUT_CHUNK_TYPE_IMAGE,
|
|
294
|
+
{},
|
|
295
|
+
std::move(image_tokens),
|
|
296
|
+
};
|
|
297
|
+
chunks.emplace_back(std::move(chunk));
|
|
298
|
+
}
|
|
299
|
+
|
|
300
|
+
return chunks;
|
|
301
|
+
};
|
|
302
|
+
|
|
303
|
+
for (const auto & part : parts) {
|
|
304
|
+
// printf("tokenizing part: %s\n", part.c_str());
|
|
305
|
+
bool add_bos = &parts.front() == &part;
|
|
306
|
+
auto tokens = mtmd_tokenize_text_internal(vocab, part, text->add_special && add_bos, text->parse_special);
|
|
307
|
+
if (tokens.empty()) {
|
|
308
|
+
continue;
|
|
309
|
+
}
|
|
310
|
+
mtmd_input_chunk chunk{
|
|
311
|
+
MTMD_INPUT_CHUNK_TYPE_TEXT,
|
|
312
|
+
std::move(tokens),
|
|
313
|
+
{},
|
|
314
|
+
};
|
|
315
|
+
output->entries.emplace_back(std::move(chunk));
|
|
316
|
+
|
|
317
|
+
if (&parts.back() != &part) {
|
|
318
|
+
// add image token to middle of 2 parts
|
|
319
|
+
|
|
320
|
+
if (i_img >= n_bitmaps) {
|
|
321
|
+
LOG_ERR("%s: error: not enough images for %d parts\n", __func__, (int)parts.size());
|
|
322
|
+
return 1;
|
|
323
|
+
}
|
|
324
|
+
|
|
325
|
+
// convert mtmd_bitmap to clip_image_u8
|
|
326
|
+
clip_image_u8_ptr img_u8(clip_image_u8_init());
|
|
327
|
+
img_u8->nx = bitmaps[i_img]->nx;
|
|
328
|
+
img_u8->ny = bitmaps[i_img]->ny;
|
|
329
|
+
img_u8->buf.resize(bitmaps[i_img]->data.size());
|
|
330
|
+
std::memcpy(img_u8->buf.data(), bitmaps[i_img]->data.data(), img_u8->nx * img_u8->ny * 3);
|
|
331
|
+
clip_image_size img_u8_size{img_u8->nx, img_u8->ny};
|
|
332
|
+
|
|
333
|
+
// preprocess image
|
|
334
|
+
clip_image_f32_batch batch_f32;
|
|
335
|
+
bool ok = clip_image_preprocess(ctx->ctx_clip, img_u8.get(), &batch_f32);
|
|
336
|
+
if (!ok) {
|
|
337
|
+
LOG_ERR("Unable to preprocess image\n");
|
|
338
|
+
return 2;
|
|
339
|
+
}
|
|
340
|
+
|
|
341
|
+
if (ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_5 || ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_6) {
|
|
342
|
+
// split batch into chunks of single images
|
|
343
|
+
auto chunks = split_batch_to_chunk(std::move(batch_f32), bitmaps[i_img]->id);
|
|
344
|
+
GGML_ASSERT(chunks.size() > 0);
|
|
345
|
+
|
|
346
|
+
// add overview image
|
|
347
|
+
add_text_chunk({ctx->tok_ov_img_start});
|
|
348
|
+
output->entries.emplace_back(std::move(chunks.front()));
|
|
349
|
+
chunks.erase(chunks.begin());
|
|
350
|
+
add_text_chunk({ctx->tok_ov_img_end});
|
|
351
|
+
|
|
352
|
+
// add slices
|
|
353
|
+
if (!chunks.empty()) {
|
|
354
|
+
clip_add_load_image_size(ctx->ctx_clip, &img_u8_size);
|
|
355
|
+
int n_col = clip_uhd_num_image_embeds_col(ctx->ctx_clip);
|
|
356
|
+
int n_row = (int)chunks.size() / n_col;
|
|
357
|
+
GGML_ASSERT(n_row * n_col == (int)chunks.size());
|
|
358
|
+
if (ctx->tok_slices_start != LLAMA_TOKEN_NULL) {
|
|
359
|
+
add_text_chunk({ctx->tok_slices_start});
|
|
360
|
+
}
|
|
361
|
+
for (int y = 0; y < n_row; y++) {
|
|
362
|
+
for (int x = 0; x < n_col; x++) {
|
|
363
|
+
if (ctx->tok_sli_img_start != LLAMA_TOKEN_NULL) {
|
|
364
|
+
add_text_chunk({ctx->tok_sli_img_start});
|
|
365
|
+
}
|
|
366
|
+
output->entries.emplace_back(std::move(chunks[y * n_col + x]));
|
|
367
|
+
if (ctx->tok_sli_img_end != LLAMA_TOKEN_NULL) {
|
|
368
|
+
add_text_chunk({ctx->tok_sli_img_end});
|
|
369
|
+
}
|
|
370
|
+
}
|
|
371
|
+
if (ctx->tok_row_end != LLAMA_TOKEN_NULL && y != n_row - 1) {
|
|
372
|
+
add_text_chunk({ctx->tok_row_end});
|
|
373
|
+
}
|
|
374
|
+
}
|
|
375
|
+
if (ctx->tok_slices_end != LLAMA_TOKEN_NULL) {
|
|
376
|
+
add_text_chunk({ctx->tok_slices_end});
|
|
377
|
+
}
|
|
378
|
+
}
|
|
379
|
+
|
|
380
|
+
} else {
|
|
381
|
+
size_t n_tokens = 0;
|
|
382
|
+
for (const auto & entry : batch_f32.entries) {
|
|
383
|
+
n_tokens += clip_n_output_tokens(ctx->ctx_clip, entry.get());
|
|
384
|
+
}
|
|
385
|
+
|
|
386
|
+
mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens);
|
|
387
|
+
if (ctx->use_mrope) {
|
|
388
|
+
// for Qwen2VL, we need this information for M-RoPE decoding positions
|
|
389
|
+
image_tokens->nx = clip_n_output_tokens_x(ctx->ctx_clip, batch_f32.entries[0].get());
|
|
390
|
+
image_tokens->ny = clip_n_output_tokens_y(ctx->ctx_clip, batch_f32.entries[0].get());
|
|
391
|
+
image_tokens->use_mrope_pos = true;
|
|
392
|
+
} else {
|
|
393
|
+
// other models, we only need the total number of tokens
|
|
394
|
+
image_tokens->nx = n_tokens;
|
|
395
|
+
image_tokens->ny = 1;
|
|
396
|
+
}
|
|
397
|
+
image_tokens->batch_f32 = std::move(batch_f32);
|
|
398
|
+
image_tokens->id = bitmaps[i_img]->id; // optional
|
|
399
|
+
|
|
400
|
+
LOG_DBG("image_tokens->nx = %d\n", image_tokens->nx);
|
|
401
|
+
LOG_DBG("image_tokens->ny = %d\n", image_tokens->ny);
|
|
402
|
+
LOG_DBG("batch_f32 size = %d\n", (int)image_tokens->batch_f32.entries.size());
|
|
403
|
+
|
|
404
|
+
mtmd_input_chunk chunk{
|
|
405
|
+
MTMD_INPUT_CHUNK_TYPE_IMAGE,
|
|
406
|
+
{},
|
|
407
|
+
std::move(image_tokens),
|
|
408
|
+
};
|
|
409
|
+
output->entries.emplace_back(std::move(chunk));
|
|
410
|
+
}
|
|
411
|
+
|
|
412
|
+
i_img++; // move to next image
|
|
413
|
+
}
|
|
414
|
+
}
|
|
415
|
+
|
|
416
|
+
return 0;
|
|
417
|
+
}
|
|
418
|
+
|
|
419
|
+
// Destroys an mtmd_image_tokens object. A null pointer is accepted and ignored.
static void mtmd_image_tokens_free(mtmd_image_tokens * image_tokens) {
    if (image_tokens == nullptr) {
        return;
    }
    delete image_tokens;
}
|
|
424
|
+
|
|
425
|
+
// Encodes every image slice stored in `image_tokens` and writes the resulting
// embeddings into ctx->image_embd_v (retrievable via mtmd_get_output_embd()).
//
// @param ctx           context holding the clip model, thread count and output buffer
// @param image_tokens  image chunk whose batch_f32 entries are to be encoded
// @return 0 on success, 1 if the batch is empty or any encode call fails
int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens) {
    const auto & entries = image_tokens->batch_f32.entries;
    if (entries.empty()) {
        // entries[0] is read below; an empty batch cannot be encoded
        LOG_ERR("%s: error: image batch is empty\n", __func__);
        return 1;
    }

    int n_mmproj_embd = clip_n_mmproj_embd(ctx->ctx_clip);
    ctx->image_embd_v.resize(image_tokens->n_tokens() * n_mmproj_embd);
    bool ok = false;

    // only effective for minicpmv and qwen2vl, other models will ignore load_image_size
    {
        clip_image_size slice_size{
            entries[0]->nx,
            entries[0]->ny};
        clip_add_load_image_size(ctx->ctx_clip, &slice_size);
    }

    if (clip_is_llava(ctx->ctx_clip) || clip_is_minicpmv(ctx->ctx_clip) || clip_is_glm(ctx->ctx_clip)) {
        // TODO @ngxson : llava does not support batched encoding ; this should be fixed inside clip_image_batch_encode()
        // Running write position (in floats). Advancing it by each entry's own
        // token count keeps the outputs contiguous even when entries differ in size.
        size_t offset = 0;
        for (const auto & entry : entries) {
            const int n_tokens_per_image = clip_n_output_tokens(ctx->ctx_clip, entry.get());
            ok = clip_image_encode(
                ctx->ctx_clip,
                ctx->n_threads,
                entry.get(),
                ctx->image_embd_v.data() + offset);
            if (!ok) {
                // stop at the first failure so a later success cannot mask it
                break;
            }
            offset += (size_t)n_mmproj_embd * (size_t)n_tokens_per_image;
        }
    } else {
        ok = clip_image_batch_encode(
            ctx->ctx_clip,
            ctx->n_threads,
            &image_tokens->batch_f32,
            ctx->image_embd_v.data());
    }

    return ok ? 0 : 1;
}
|
|
459
|
+
|
|
460
|
+
// Returns a pointer to the start of the context's embedding buffer (ctx->image_embd_v).
float * mtmd_get_output_embd(mtmd_context * ctx) {
    auto & embd = ctx->image_embd_v;
    return embd.data();
}
|
|
463
|
+
|
|
464
|
+
// Reports whether decoding should be non-causal: true only when the clip
// projector type is PROJECTOR_TYPE_GEMMA3, false for every other projector.
bool mtmd_decode_use_non_causal(mtmd_context * ctx) {
    return clip_get_projector_type(ctx->ctx_clip) == PROJECTOR_TYPE_GEMMA3;
}
|
|
471
|
+
|
|
472
|
+
// Returns the context's use_mrope flag.
bool mtmd_decode_use_mrope(mtmd_context * ctx) {
    const bool mrope_enabled = ctx->use_mrope;
    return mrope_enabled;
}
|
|
475
|
+
|
|
476
|
+
// Deleter functor used by mtmd_image_tokens_ptr.
void mtmd_image_tokens_deleter::operator()(mtmd_image_tokens * val) {
    // forward to the file-local free helper so destruction lives in one place
    mtmd_image_tokens_free(val);
}
|
|
479
|
+
|
|
480
|
+
// these 2 helpers below use internal clip_image_u8_ptr,
|
|
481
|
+
// so unfortunately they cannot be moved to mtmd-helper.h
|
|
482
|
+
// however, in theory, user can decode image file to bitmap using
|
|
483
|
+
// whichever library they want, and then use mtmd_bitmap_init() to create bitmap
|
|
484
|
+
|
|
485
|
+
// Decodes an image held in memory and wraps the pixels in a newly allocated mtmd_bitmap.
//
// @param buf  pointer to the encoded image bytes
// @param len  number of bytes available at `buf`
// @return a new bitmap (release with mtmd_bitmap_free()), or nullptr if decoding fails
mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(const unsigned char * buf, size_t len) {
    clip_image_u8_ptr decoded(clip_image_u8_init());
    if (!clip_image_load_from_bytes(buf, len, decoded.get())) {
        LOG_ERR("Unable to load image from buffer\n");
        return nullptr;
    }

    uint32_t width  = 0;
    uint32_t height = 0;
    unsigned char * pixels = clip_image_u8_get_data(decoded.get(), &width, &height);
    // mtmd_bitmap_init copies the pixels, so `decoded` may be destroyed on return
    return mtmd_bitmap_init(width, height, pixels);
}
|
|
496
|
+
|
|
497
|
+
// Loads an image file and wraps the decoded pixels in a newly allocated mtmd_bitmap.
//
// @param fname  path of the image file to load
// @return a new bitmap (release with mtmd_bitmap_free()), or nullptr if loading fails
mtmd_bitmap * mtmd_helper_bitmap_init_from_file(const char * fname) {
    clip_image_u8_ptr decoded(clip_image_u8_init());
    if (!clip_image_load_from_file(fname, decoded.get())) {
        LOG_ERR("Unable to load image %s\n", fname);
        return nullptr;
    }

    uint32_t width  = 0;
    uint32_t height = 0;
    unsigned char * pixels = clip_image_u8_get_data(decoded.get(), &width, &height);
    // mtmd_bitmap_init copies the pixels, so `decoded` may be destroyed on return
    return mtmd_bitmap_init(width, height, pixels);
}
|
|
508
|
+
|
|
509
|
+
//
|
|
510
|
+
// public API functions
|
|
511
|
+
//
|
|
512
|
+
|
|
513
|
+
// mtmd_bitmap
|
|
514
|
+
|
|
515
|
+
// Allocates a bitmap and copies nx * ny * 3 bytes of pixel data into it
// (3 bytes per pixel; presumably RGB - confirm against the header).
//
// @param nx    image width in pixels
// @param ny    image height in pixels
// @param data  source buffer; must hold at least nx * ny * 3 bytes
// @return a newly allocated bitmap; release with mtmd_bitmap_free()
mtmd_bitmap * mtmd_bitmap_init(uint32_t nx,
                               uint32_t ny,
                               const unsigned char * data) {
    // widen before multiplying so the byte count is computed in size_t
    const size_t n_bytes = (size_t)nx * ny * 3;

    mtmd_bitmap * result = new mtmd_bitmap;
    result->nx = nx;
    result->ny = ny;
    result->data.resize(n_bytes);
    std::memcpy(result->data.data(), data, n_bytes);
    return result;
}
|
|
526
|
+
|
|
527
|
+
uint32_t mtmd_bitmap_get_nx(const mtmd_bitmap * bitmap) {
|
|
528
|
+
return bitmap->nx;
|
|
529
|
+
}
|
|
530
|
+
|
|
531
|
+
uint32_t mtmd_bitmap_get_ny(const mtmd_bitmap * bitmap) {
|
|
532
|
+
return bitmap->ny;
|
|
533
|
+
}
|
|
534
|
+
|
|
535
|
+
const unsigned char * mtmd_bitmap_get_data(const mtmd_bitmap * bitmap) {
|
|
536
|
+
return bitmap->data.data();
|
|
537
|
+
}
|
|
538
|
+
|
|
539
|
+
const char * mtmd_bitmap_get_id(const mtmd_bitmap * bitmap) {
|
|
540
|
+
return bitmap->id.c_str();
|
|
541
|
+
}
|
|
542
|
+
|
|
543
|
+
// Sets the bitmap's id. Passing a null `id` clears the stored id instead.
void mtmd_bitmap_set_id(mtmd_bitmap * bitmap, const char * id) {
    if (id == nullptr) {
        bitmap->id.clear();
        return;
    }
    bitmap->id = std::string(id);
}
|
|
550
|
+
|
|
551
|
+
// Destroys a bitmap created by mtmd_bitmap_init(). A null pointer is ignored.
void mtmd_bitmap_free(mtmd_bitmap * bitmap) {
    if (bitmap == nullptr) {
        return;
    }
    delete bitmap;
}
|
|
556
|
+
|
|
557
|
+
// mtmd_input_chunks
|
|
558
|
+
|
|
559
|
+
// Allocates an mtmd_input_chunks container; release it with mtmd_input_chunks_free().
mtmd_input_chunks * mtmd_input_chunks_init() {
    mtmd_input_chunks * created = new mtmd_input_chunks;
    return created;
}
|
|
562
|
+
|
|
563
|
+
size_t mtmd_input_chunks_size(const mtmd_input_chunks * chunks) {
|
|
564
|
+
return chunks->entries.size();
|
|
565
|
+
}
|
|
566
|
+
|
|
567
|
+
// Returns a pointer to the chunk at position `idx`, or nullptr when `idx` is out of range.
// The returned pointer refers to an element owned by `chunks`.
const mtmd_input_chunk * mtmd_input_chunks_get(const mtmd_input_chunks * chunks, size_t idx) {
    const auto & entries = chunks->entries;
    return idx < entries.size() ? &entries[idx] : nullptr;
}
|
|
573
|
+
|
|
574
|
+
// Destroys a chunk container. A null pointer is ignored.
void mtmd_input_chunks_free(mtmd_input_chunks * chunks) {
    if (chunks == nullptr) {
        return;
    }
    delete chunks;
}
|
|
579
|
+
|
|
580
|
+
// mtmd_input_chunk
|
|
581
|
+
|
|
582
|
+
enum mtmd_input_chunk_type mtmd_input_chunk_get_type(const mtmd_input_chunk * chunk) {
|
|
583
|
+
return chunk->type;
|
|
584
|
+
}
|
|
585
|
+
|
|
586
|
+
// Gives access to the token array of a text chunk.
//
// @param chunk            chunk to inspect
// @param n_tokens_output  receives the token count (0 when the chunk is not a text chunk)
// @return pointer to the chunk's tokens, or nullptr when the chunk is not a text chunk
const llama_token * mtmd_input_chunk_get_tokens_text(const mtmd_input_chunk * chunk, size_t * n_tokens_output) {
    if (chunk->type != MTMD_INPUT_CHUNK_TYPE_TEXT) {
        *n_tokens_output = 0;
        return nullptr;
    }
    const auto & text_tokens = chunk->tokens_text;
    *n_tokens_output = text_tokens.size();
    return text_tokens.data();
}
|
|
594
|
+
|
|
595
|
+
// Returns the image tokens of an image chunk, or nullptr for any other chunk type.
// The returned pointer refers to an object owned by `chunk`.
const mtmd_image_tokens * mtmd_input_chunk_get_tokens_image(const mtmd_input_chunk * chunk) {
    const bool is_image = chunk->type == MTMD_INPUT_CHUNK_TYPE_IMAGE;
    return is_image ? chunk->tokens_image.get() : nullptr;
}
|
|
601
|
+
|
|
602
|
+
// Creates a heap-allocated copy of `chunk`: the type and text tokens are copied,
// and the image tokens (when present) are duplicated via clone().
// The result must be released with mtmd_input_chunk_free().
mtmd_input_chunk * mtmd_input_chunk_copy(const mtmd_input_chunk * chunk) {
    // start with an empty image-token pointer; it is filled in below if needed
    mtmd_input_chunk * duplicate = new mtmd_input_chunk{
        chunk->type,
        chunk->tokens_text,
        mtmd_image_tokens_ptr(),
    };

    const bool has_image = static_cast<bool>(chunk->tokens_image);
    if (has_image) {
        duplicate->tokens_image = mtmd_image_tokens_ptr(new mtmd_image_tokens());
        *duplicate->tokens_image = chunk->tokens_image->clone();
    }
    return duplicate;
}
|
|
615
|
+
|
|
616
|
+
// Destroys a chunk obtained from mtmd_input_chunk_copy(). A null pointer is ignored.
void mtmd_input_chunk_free(mtmd_input_chunk * chunk) {
    if (chunk == nullptr) {
        return;
    }
    delete chunk;
}
|
|
621
|
+
|
|
622
|
+
// mtmd_image_tokens
|
|
623
|
+
|
|
624
|
+
size_t mtmd_image_tokens_get_n_tokens(const mtmd_image_tokens * image_tokens) {
|
|
625
|
+
return image_tokens->n_tokens();
|
|
626
|
+
}
|
|
627
|
+
|
|
628
|
+
size_t mtmd_image_tokens_get_nx(const mtmd_image_tokens * image_tokens) {
|
|
629
|
+
return image_tokens->nx;
|
|
630
|
+
}
|
|
631
|
+
|
|
632
|
+
size_t mtmd_image_tokens_get_ny(const mtmd_image_tokens * image_tokens) {
|
|
633
|
+
return image_tokens->ny;
|
|
634
|
+
}
|
|
635
|
+
|
|
636
|
+
const char * mtmd_image_tokens_get_id(const mtmd_image_tokens * image_tokens) {
|
|
637
|
+
return image_tokens->id.c_str();
|
|
638
|
+
}
|
|
639
|
+
|
|
640
|
+
// Returns the number of positions the image occupies:
// 1 when M-RoPE positions are in use, otherwise the image's token count.
llama_pos mtmd_image_tokens_get_n_pos(const mtmd_image_tokens * image_tokens) {
    if (!image_tokens->use_mrope_pos) {
        return image_tokens->n_tokens();
    }
    // for M-RoPE, the whole image is 1 in temporal dimension
    return 1;
}
|
|
646
|
+
|
|
647
|
+
// test function
|
|
648
|
+
|
|
649
|
+
// Test helper: builds a chunk list with one text chunk (tokens 1..5) followed by
// one image chunk (nx = 4, ny = 4, 16 default-constructed batch entries, id "image_1").
// The result must be released with mtmd_input_chunks_free().
mtmd_input_chunks * mtmd_test_create_input_chunks() {
    mtmd_input_chunks * result = mtmd_input_chunks_init();
    if (result == nullptr) {
        return nullptr;
    }

    // text chunk
    {
        std::vector<llama_token> text_ids = { 1, 2, 3, 4, 5 };
        mtmd_input_chunk text_chunk{
            MTMD_INPUT_CHUNK_TYPE_TEXT,
            std::move(text_ids),
            {},
        };
        result->entries.emplace_back(std::move(text_chunk));
    }

    // image chunk
    {
        mtmd_image_tokens_ptr img(new mtmd_image_tokens);
        img->nx = 4;
        img->ny = 4;
        img->batch_f32.entries.resize(16);
        img->id = "image_1";
        mtmd_input_chunk image_chunk{
            MTMD_INPUT_CHUNK_TYPE_IMAGE,
            {},
            std::move(img),
        };
        result->entries.emplace_back(std::move(image_chunk));
    }

    return result;
}
|