local-llm-rn 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/cpp/CMakeLists.txt +285 -0
- package/cpp/common/CMakeLists.txt +149 -0
- package/cpp/common/arg.cpp +3799 -0
- package/cpp/common/arg.h +131 -0
- package/cpp/common/base64.hpp +392 -0
- package/cpp/common/build-info.cpp.in +4 -0
- package/cpp/common/chat-parser-xml-toolcall.cpp +879 -0
- package/cpp/common/chat-parser-xml-toolcall.h +45 -0
- package/cpp/common/chat-parser.cpp +1649 -0
- package/cpp/common/chat-parser.h +133 -0
- package/cpp/common/chat-peg-parser.cpp +124 -0
- package/cpp/common/chat-peg-parser.h +105 -0
- package/cpp/common/chat.cpp +3355 -0
- package/cpp/common/chat.h +252 -0
- package/cpp/common/common.cpp +1824 -0
- package/cpp/common/common.h +930 -0
- package/cpp/common/console.cpp +1137 -0
- package/cpp/common/console.h +41 -0
- package/cpp/common/debug.cpp +167 -0
- package/cpp/common/debug.h +43 -0
- package/cpp/common/download.cpp +792 -0
- package/cpp/common/download.h +84 -0
- package/cpp/common/http.h +84 -0
- package/cpp/common/jinja/README.md +88 -0
- package/cpp/common/jinja/caps.cpp +285 -0
- package/cpp/common/jinja/caps.h +30 -0
- package/cpp/common/jinja/lexer.cpp +341 -0
- package/cpp/common/jinja/lexer.h +157 -0
- package/cpp/common/jinja/parser.cpp +591 -0
- package/cpp/common/jinja/parser.h +21 -0
- package/cpp/common/jinja/runtime.cpp +867 -0
- package/cpp/common/jinja/runtime.h +638 -0
- package/cpp/common/jinja/string.cpp +213 -0
- package/cpp/common/jinja/string.h +61 -0
- package/cpp/common/jinja/utils.h +149 -0
- package/cpp/common/jinja/value.cpp +1393 -0
- package/cpp/common/jinja/value.h +756 -0
- package/cpp/common/json-partial.cpp +324 -0
- package/cpp/common/json-partial.h +39 -0
- package/cpp/common/json-schema-to-grammar.cpp +1153 -0
- package/cpp/common/json-schema-to-grammar.h +43 -0
- package/cpp/common/llguidance.cpp +258 -0
- package/cpp/common/log.cpp +446 -0
- package/cpp/common/log.h +119 -0
- package/cpp/common/ngram-cache.cpp +285 -0
- package/cpp/common/ngram-cache.h +101 -0
- package/cpp/common/ngram-map.cpp +530 -0
- package/cpp/common/ngram-map.h +115 -0
- package/cpp/common/ngram-mod.cpp +60 -0
- package/cpp/common/ngram-mod.h +38 -0
- package/cpp/common/peg-parser.cpp +1712 -0
- package/cpp/common/peg-parser.h +459 -0
- package/cpp/common/preset.cpp +483 -0
- package/cpp/common/preset.h +83 -0
- package/cpp/common/regex-partial.cpp +204 -0
- package/cpp/common/regex-partial.h +56 -0
- package/cpp/common/sampling.cpp +745 -0
- package/cpp/common/sampling.h +119 -0
- package/cpp/common/speculative.cpp +1074 -0
- package/cpp/common/speculative.h +41 -0
- package/cpp/common/unicode.cpp +64 -0
- package/cpp/common/unicode.h +22 -0
- package/cpp/ggml/CMakeLists.txt +494 -0
- package/cpp/ggml/cmake/GitVars.cmake +22 -0
- package/cpp/ggml/cmake/common.cmake +50 -0
- package/cpp/ggml/cmake/ggml-config.cmake.in +191 -0
- package/cpp/ggml/include/ggml-alloc.h +85 -0
- package/cpp/ggml/include/ggml-backend.h +373 -0
- package/cpp/ggml/include/ggml-blas.h +25 -0
- package/cpp/ggml/include/ggml-cann.h +123 -0
- package/cpp/ggml/include/ggml-cpp.h +39 -0
- package/cpp/ggml/include/ggml-cpu.h +151 -0
- package/cpp/ggml/include/ggml-cuda.h +47 -0
- package/cpp/ggml/include/ggml-hexagon.h +19 -0
- package/cpp/ggml/include/ggml-metal.h +61 -0
- package/cpp/ggml/include/ggml-opencl.h +26 -0
- package/cpp/ggml/include/ggml-opt.h +256 -0
- package/cpp/ggml/include/ggml-rpc.h +30 -0
- package/cpp/ggml/include/ggml-sycl.h +49 -0
- package/cpp/ggml/include/ggml-virtgpu.h +14 -0
- package/cpp/ggml/include/ggml-vulkan.h +29 -0
- package/cpp/ggml/include/ggml-webgpu.h +19 -0
- package/cpp/ggml/include/ggml-zdnn.h +17 -0
- package/cpp/ggml/include/ggml-zendnn.h +22 -0
- package/cpp/ggml/include/ggml.h +2753 -0
- package/cpp/ggml/include/gguf.h +204 -0
- package/cpp/ggml/src/CMakeLists.txt +492 -0
- package/cpp/ggml/src/ggml-alloc.c +1244 -0
- package/cpp/ggml/src/ggml-backend-dl.cpp +48 -0
- package/cpp/ggml/src/ggml-backend-dl.h +45 -0
- package/cpp/ggml/src/ggml-backend-impl.h +255 -0
- package/cpp/ggml/src/ggml-backend-reg.cpp +566 -0
- package/cpp/ggml/src/ggml-backend.cpp +2270 -0
- package/cpp/ggml/src/ggml-blas/CMakeLists.txt +101 -0
- package/cpp/ggml/src/ggml-blas/ggml-blas.cpp +518 -0
- package/cpp/ggml/src/ggml-common.h +1878 -0
- package/cpp/ggml/src/ggml-cpu/CMakeLists.txt +691 -0
- package/cpp/ggml/src/ggml-cpu/amx/amx.cpp +247 -0
- package/cpp/ggml/src/ggml-cpu/amx/amx.h +8 -0
- package/cpp/ggml/src/ggml-cpu/amx/common.h +91 -0
- package/cpp/ggml/src/ggml-cpu/amx/mmq.cpp +2512 -0
- package/cpp/ggml/src/ggml-cpu/amx/mmq.h +10 -0
- package/cpp/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +98 -0
- package/cpp/ggml/src/ggml-cpu/arch/arm/quants.c +4052 -0
- package/cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +4935 -0
- package/cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +2159 -0
- package/cpp/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
- package/cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +2305 -0
- package/cpp/ggml/src/ggml-cpu/arch/riscv/cpu-feats.cpp +38 -0
- package/cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +2726 -0
- package/cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +342 -0
- package/cpp/ggml/src/ggml-cpu/arch/s390/cpu-feats.cpp +50 -0
- package/cpp/ggml/src/ggml-cpu/arch/s390/quants.c +1468 -0
- package/cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +1221 -0
- package/cpp/ggml/src/ggml-cpu/arch/x86/cpu-feats.cpp +327 -0
- package/cpp/ggml/src/ggml-cpu/arch/x86/quants.c +3820 -0
- package/cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +6307 -0
- package/cpp/ggml/src/ggml-cpu/arch-fallback.h +313 -0
- package/cpp/ggml/src/ggml-cpu/binary-ops.cpp +154 -0
- package/cpp/ggml/src/ggml-cpu/binary-ops.h +16 -0
- package/cpp/ggml/src/ggml-cpu/cmake/FindSIMD.cmake +100 -0
- package/cpp/ggml/src/ggml-cpu/common.h +95 -0
- package/cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +529 -0
- package/cpp/ggml/src/ggml-cpu/ggml-cpu.c +3734 -0
- package/cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +701 -0
- package/cpp/ggml/src/ggml-cpu/hbm.cpp +55 -0
- package/cpp/ggml/src/ggml-cpu/hbm.h +8 -0
- package/cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +938 -0
- package/cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +90 -0
- package/cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +798 -0
- package/cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.h +17 -0
- package/cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +4033 -0
- package/cpp/ggml/src/ggml-cpu/llamafile/sgemm.h +25 -0
- package/cpp/ggml/src/ggml-cpu/ops.cpp +10978 -0
- package/cpp/ggml/src/ggml-cpu/ops.h +116 -0
- package/cpp/ggml/src/ggml-cpu/quants.c +1193 -0
- package/cpp/ggml/src/ggml-cpu/quants.h +97 -0
- package/cpp/ggml/src/ggml-cpu/repack.cpp +3316 -0
- package/cpp/ggml/src/ggml-cpu/repack.h +173 -0
- package/cpp/ggml/src/ggml-cpu/simd-gemm.h +136 -0
- package/cpp/ggml/src/ggml-cpu/simd-mappings.h +1279 -0
- package/cpp/ggml/src/ggml-cpu/spacemit/ime.cpp +1025 -0
- package/cpp/ggml/src/ggml-cpu/spacemit/ime.h +13 -0
- package/cpp/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp +3196 -0
- package/cpp/ggml/src/ggml-cpu/spacemit/ime_kernels.h +26 -0
- package/cpp/ggml/src/ggml-cpu/traits.cpp +36 -0
- package/cpp/ggml/src/ggml-cpu/traits.h +38 -0
- package/cpp/ggml/src/ggml-cpu/unary-ops.cpp +337 -0
- package/cpp/ggml/src/ggml-cpu/unary-ops.h +35 -0
- package/cpp/ggml/src/ggml-cpu/vec.cpp +629 -0
- package/cpp/ggml/src/ggml-cpu/vec.h +1585 -0
- package/cpp/ggml/src/ggml-hexagon/CMakeLists.txt +117 -0
- package/cpp/ggml/src/ggml-hexagon/ggml-hexagon.cpp +3232 -0
- package/cpp/ggml/src/ggml-hexagon/htp/CMakeLists.txt +45 -0
- package/cpp/ggml/src/ggml-hexagon/htp/act-ops.c +815 -0
- package/cpp/ggml/src/ggml-hexagon/htp/argsort-ops.c +281 -0
- package/cpp/ggml/src/ggml-hexagon/htp/binary-ops.c +827 -0
- package/cpp/ggml/src/ggml-hexagon/htp/cmake-toolchain.cmake +157 -0
- package/cpp/ggml/src/ggml-hexagon/htp/cpy-ops.c +251 -0
- package/cpp/ggml/src/ggml-hexagon/htp/flash-attn-ops.c +666 -0
- package/cpp/ggml/src/ggml-hexagon/htp/get-rows-ops.c +111 -0
- package/cpp/ggml/src/ggml-hexagon/htp/hex-dma.c +63 -0
- package/cpp/ggml/src/ggml-hexagon/htp/hex-dma.h +182 -0
- package/cpp/ggml/src/ggml-hexagon/htp/hex-dump.h +77 -0
- package/cpp/ggml/src/ggml-hexagon/htp/hex-fastdiv.h +37 -0
- package/cpp/ggml/src/ggml-hexagon/htp/hex-utils.h +51 -0
- package/cpp/ggml/src/ggml-hexagon/htp/htp-ctx.h +35 -0
- package/cpp/ggml/src/ggml-hexagon/htp/htp-msg.h +154 -0
- package/cpp/ggml/src/ggml-hexagon/htp/htp-ops.h +65 -0
- package/cpp/ggml/src/ggml-hexagon/htp/htp_iface.idl +16 -0
- package/cpp/ggml/src/ggml-hexagon/htp/hvx-arith.h +470 -0
- package/cpp/ggml/src/ggml-hexagon/htp/hvx-base.h +173 -0
- package/cpp/ggml/src/ggml-hexagon/htp/hvx-copy.h +245 -0
- package/cpp/ggml/src/ggml-hexagon/htp/hvx-div.h +116 -0
- package/cpp/ggml/src/ggml-hexagon/htp/hvx-dump.h +129 -0
- package/cpp/ggml/src/ggml-hexagon/htp/hvx-exp.h +215 -0
- package/cpp/ggml/src/ggml-hexagon/htp/hvx-floor.h +100 -0
- package/cpp/ggml/src/ggml-hexagon/htp/hvx-inverse.h +176 -0
- package/cpp/ggml/src/ggml-hexagon/htp/hvx-reduce.h +266 -0
- package/cpp/ggml/src/ggml-hexagon/htp/hvx-scale.h +133 -0
- package/cpp/ggml/src/ggml-hexagon/htp/hvx-sigmoid.h +141 -0
- package/cpp/ggml/src/ggml-hexagon/htp/hvx-sqrt.h +126 -0
- package/cpp/ggml/src/ggml-hexagon/htp/hvx-types.h +36 -0
- package/cpp/ggml/src/ggml-hexagon/htp/hvx-utils.h +18 -0
- package/cpp/ggml/src/ggml-hexagon/htp/main.c +1150 -0
- package/cpp/ggml/src/ggml-hexagon/htp/matmul-ops.c +2595 -0
- package/cpp/ggml/src/ggml-hexagon/htp/rope-ops.c +498 -0
- package/cpp/ggml/src/ggml-hexagon/htp/set-rows-ops.c +167 -0
- package/cpp/ggml/src/ggml-hexagon/htp/softmax-ops.c +421 -0
- package/cpp/ggml/src/ggml-hexagon/htp/sum-rows-ops.c +130 -0
- package/cpp/ggml/src/ggml-hexagon/htp/unary-ops.c +384 -0
- package/cpp/ggml/src/ggml-hexagon/htp/worker-pool.c +293 -0
- package/cpp/ggml/src/ggml-hexagon/htp/worker-pool.h +57 -0
- package/cpp/ggml/src/ggml-hexagon/htp-drv.cpp +418 -0
- package/cpp/ggml/src/ggml-hexagon/htp-drv.h +121 -0
- package/cpp/ggml/src/ggml-hexagon/libdl.h +79 -0
- package/cpp/ggml/src/ggml-hexagon/libggml-htp.inf +38 -0
- package/cpp/ggml/src/ggml-hexagon/op-desc.h +153 -0
- package/cpp/ggml/src/ggml-impl.h +724 -0
- package/cpp/ggml/src/ggml-metal/CMakeLists.txt +124 -0
- package/cpp/ggml/src/ggml-metal/ggml-metal-common.cpp +457 -0
- package/cpp/ggml/src/ggml-metal/ggml-metal-common.h +52 -0
- package/cpp/ggml/src/ggml-metal/ggml-metal-context.h +41 -0
- package/cpp/ggml/src/ggml-metal/ggml-metal-context.m +702 -0
- package/cpp/ggml/src/ggml-metal/ggml-metal-device.cpp +1890 -0
- package/cpp/ggml/src/ggml-metal/ggml-metal-device.h +290 -0
- package/cpp/ggml/src/ggml-metal/ggml-metal-device.m +1749 -0
- package/cpp/ggml/src/ggml-metal/ggml-metal-impl.h +1054 -0
- package/cpp/ggml/src/ggml-metal/ggml-metal-ops.cpp +4370 -0
- package/cpp/ggml/src/ggml-metal/ggml-metal-ops.h +94 -0
- package/cpp/ggml/src/ggml-metal/ggml-metal.cpp +937 -0
- package/cpp/ggml/src/ggml-metal/ggml-metal.metal +9819 -0
- package/cpp/ggml/src/ggml-musa/CMakeLists.txt +125 -0
- package/cpp/ggml/src/ggml-musa/mudnn.cu +112 -0
- package/cpp/ggml/src/ggml-musa/mudnn.cuh +12 -0
- package/cpp/ggml/src/ggml-opencl/CMakeLists.txt +150 -0
- package/cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +11553 -0
- package/cpp/ggml/src/ggml-opencl/kernels/add.cl +190 -0
- package/cpp/ggml/src/ggml-opencl/kernels/add_id.cl +42 -0
- package/cpp/ggml/src/ggml-opencl/kernels/argsort.cl +86 -0
- package/cpp/ggml/src/ggml-opencl/kernels/clamp.cl +20 -0
- package/cpp/ggml/src/ggml-opencl/kernels/concat.cl +51 -0
- package/cpp/ggml/src/ggml-opencl/kernels/conv2d.cl +185 -0
- package/cpp/ggml/src/ggml-opencl/kernels/conv2d_f16_f32.cl +176 -0
- package/cpp/ggml/src/ggml-opencl/kernels/cpy.cl +184 -0
- package/cpp/ggml/src/ggml-opencl/kernels/cvt.cl +417 -0
- package/cpp/ggml/src/ggml-opencl/kernels/diag_mask_inf.cl +58 -0
- package/cpp/ggml/src/ggml-opencl/kernels/div.cl +138 -0
- package/cpp/ggml/src/ggml-opencl/kernels/embed_kernel.py +26 -0
- package/cpp/ggml/src/ggml-opencl/kernels/expm1.cl +113 -0
- package/cpp/ggml/src/ggml-opencl/kernels/fill.cl +17 -0
- package/cpp/ggml/src/ggml-opencl/kernels/flash_attn_f16.cl +370 -0
- package/cpp/ggml/src/ggml-opencl/kernels/flash_attn_f32.cl +371 -0
- package/cpp/ggml/src/ggml-opencl/kernels/flash_attn_f32_f16.cl +373 -0
- package/cpp/ggml/src/ggml-opencl/kernels/gelu.cl +89 -0
- package/cpp/ggml/src/ggml-opencl/kernels/gemm_moe_mxfp4_f32.cl +162 -0
- package/cpp/ggml/src/ggml-opencl/kernels/gemv_moe_mxfp4_f32.cl +156 -0
- package/cpp/ggml/src/ggml-opencl/kernels/gemv_noshuffle.cl +268 -0
- package/cpp/ggml/src/ggml-opencl/kernels/gemv_noshuffle_general.cl +274 -0
- package/cpp/ggml/src/ggml-opencl/kernels/gemv_noshuffle_general_q8_0_f32.cl +195 -0
- package/cpp/ggml/src/ggml-opencl/kernels/get_rows.cl +187 -0
- package/cpp/ggml/src/ggml-opencl/kernels/glu.cl +378 -0
- package/cpp/ggml/src/ggml-opencl/kernels/group_norm.cl +121 -0
- package/cpp/ggml/src/ggml-opencl/kernels/im2col_f16.cl +57 -0
- package/cpp/ggml/src/ggml-opencl/kernels/im2col_f32.cl +57 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mean.cl +140 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul.cl +152 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mat_Ab_Bi_8x4.cl +139 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mat_f16_f32.cl +130 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_kq_kqv.cl +273 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_l4_lm.cl +146 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_f32_f32_l4_lm.cl +147 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_q4_0_f32_l4_lm.cl +163 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_q4_1_f32_l4_lm.cl +165 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_q6_k_f32_l4_lm.cl +158 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_8x4.cl +129 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_l4_lm.cl +154 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_f16_f16.cl +118 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32.cl +118 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32_1row.cl +94 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32_l4.cl +84 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_f32_f32.cl +118 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32.cl +189 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32_flat.cl +176 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl +283 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32.cl +140 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32_flat.cl +222 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32.cl +144 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32_flat.cl +167 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32.cl +192 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_1d_16x_flat.cl +307 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_1d_8x_flat.cl +265 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_8x_flat.cl +272 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_v.cl +254 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32.cl +219 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32_flat.cl +229 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_k_f32.cl +180 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32.cl +194 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32_flat.cl +194 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32.cl +125 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32_flat.cl +202 -0
- package/cpp/ggml/src/ggml-opencl/kernels/norm.cl +161 -0
- package/cpp/ggml/src/ggml-opencl/kernels/pad.cl +39 -0
- package/cpp/ggml/src/ggml-opencl/kernels/relu.cl +16 -0
- package/cpp/ggml/src/ggml-opencl/kernels/repeat.cl +38 -0
- package/cpp/ggml/src/ggml-opencl/kernels/rms_norm.cl +190 -0
- package/cpp/ggml/src/ggml-opencl/kernels/rope.cl +747 -0
- package/cpp/ggml/src/ggml-opencl/kernels/scale.cl +27 -0
- package/cpp/ggml/src/ggml-opencl/kernels/set_rows.cl +208 -0
- package/cpp/ggml/src/ggml-opencl/kernels/sigmoid.cl +29 -0
- package/cpp/ggml/src/ggml-opencl/kernels/silu.cl +30 -0
- package/cpp/ggml/src/ggml-opencl/kernels/softmax_4_f16.cl +108 -0
- package/cpp/ggml/src/ggml-opencl/kernels/softmax_4_f32.cl +108 -0
- package/cpp/ggml/src/ggml-opencl/kernels/softmax_f16.cl +107 -0
- package/cpp/ggml/src/ggml-opencl/kernels/softmax_f32.cl +107 -0
- package/cpp/ggml/src/ggml-opencl/kernels/softplus.cl +116 -0
- package/cpp/ggml/src/ggml-opencl/kernels/solve_tri.cl +51 -0
- package/cpp/ggml/src/ggml-opencl/kernels/sqr.cl +53 -0
- package/cpp/ggml/src/ggml-opencl/kernels/sqrt.cl +53 -0
- package/cpp/ggml/src/ggml-opencl/kernels/ssm_conv.cl +77 -0
- package/cpp/ggml/src/ggml-opencl/kernels/sub.cl +138 -0
- package/cpp/ggml/src/ggml-opencl/kernels/sum_rows.cl +140 -0
- package/cpp/ggml/src/ggml-opencl/kernels/tanh.cl +109 -0
- package/cpp/ggml/src/ggml-opencl/kernels/transpose.cl +117 -0
- package/cpp/ggml/src/ggml-opencl/kernels/tri.cl +32 -0
- package/cpp/ggml/src/ggml-opencl/kernels/tsembd.cl +48 -0
- package/cpp/ggml/src/ggml-opencl/kernels/upscale.cl +120 -0
- package/cpp/ggml/src/ggml-opt.cpp +1093 -0
- package/cpp/ggml/src/ggml-quants.c +5325 -0
- package/cpp/ggml/src/ggml-quants.h +106 -0
- package/cpp/ggml/src/ggml-rpc/CMakeLists.txt +9 -0
- package/cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +2118 -0
- package/cpp/ggml/src/ggml-threading.cpp +12 -0
- package/cpp/ggml/src/ggml-threading.h +14 -0
- package/cpp/ggml/src/ggml-virtgpu/CMakeLists.txt +70 -0
- package/cpp/ggml/src/ggml-virtgpu/apir_cs_ggml-rpc-front.cpp +87 -0
- package/cpp/ggml/src/ggml-virtgpu/backend/CMakeLists.txt +21 -0
- package/cpp/ggml/src/ggml-virtgpu/backend/apir_cs_ggml-rpc-back.cpp +115 -0
- package/cpp/ggml/src/ggml-virtgpu/backend/backend-convert.h +13 -0
- package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched-backend.cpp +102 -0
- package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer-type.cpp +105 -0
- package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer.cpp +179 -0
- package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched-device.cpp +148 -0
- package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched.cpp +51 -0
- package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched.gen.h +73 -0
- package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched.h +27 -0
- package/cpp/ggml/src/ggml-virtgpu/backend/backend-virgl-apir.h +32 -0
- package/cpp/ggml/src/ggml-virtgpu/backend/backend.cpp +144 -0
- package/cpp/ggml/src/ggml-virtgpu/backend/shared/api_remoting.h +95 -0
- package/cpp/ggml/src/ggml-virtgpu/backend/shared/apir_backend.gen.h +94 -0
- package/cpp/ggml/src/ggml-virtgpu/backend/shared/apir_backend.h +50 -0
- package/cpp/ggml/src/ggml-virtgpu/backend/shared/apir_cs.h +378 -0
- package/cpp/ggml/src/ggml-virtgpu/backend/shared/apir_cs_ggml.h +232 -0
- package/cpp/ggml/src/ggml-virtgpu/backend/shared/apir_cs_rpc.h +58 -0
- package/cpp/ggml/src/ggml-virtgpu/ggml-backend-buffer-type.cpp +81 -0
- package/cpp/ggml/src/ggml-virtgpu/ggml-backend-buffer.cpp +119 -0
- package/cpp/ggml/src/ggml-virtgpu/ggml-backend-device.cpp +158 -0
- package/cpp/ggml/src/ggml-virtgpu/ggml-backend-reg.cpp +213 -0
- package/cpp/ggml/src/ggml-virtgpu/ggml-backend.cpp +69 -0
- package/cpp/ggml/src/ggml-virtgpu/ggml-remoting.h +71 -0
- package/cpp/ggml/src/ggml-virtgpu/ggmlremoting_functions.yaml +166 -0
- package/cpp/ggml/src/ggml-virtgpu/include/apir_hw.h +9 -0
- package/cpp/ggml/src/ggml-virtgpu/regenerate_remoting.py +333 -0
- package/cpp/ggml/src/ggml-virtgpu/virtgpu-apir.h +15 -0
- package/cpp/ggml/src/ggml-virtgpu/virtgpu-forward-backend.cpp +58 -0
- package/cpp/ggml/src/ggml-virtgpu/virtgpu-forward-buffer-type.cpp +110 -0
- package/cpp/ggml/src/ggml-virtgpu/virtgpu-forward-buffer.cpp +173 -0
- package/cpp/ggml/src/ggml-virtgpu/virtgpu-forward-device.cpp +192 -0
- package/cpp/ggml/src/ggml-virtgpu/virtgpu-forward-impl.h +36 -0
- package/cpp/ggml/src/ggml-virtgpu/virtgpu-forward.gen.h +53 -0
- package/cpp/ggml/src/ggml-virtgpu/virtgpu-shm.cpp +98 -0
- package/cpp/ggml/src/ggml-virtgpu/virtgpu-shm.h +23 -0
- package/cpp/ggml/src/ggml-virtgpu/virtgpu-utils.cpp +179 -0
- package/cpp/ggml/src/ggml-virtgpu/virtgpu-utils.h +86 -0
- package/cpp/ggml/src/ggml-virtgpu/virtgpu.cpp +544 -0
- package/cpp/ggml/src/ggml-virtgpu/virtgpu.h +117 -0
- package/cpp/ggml/src/ggml-webgpu/CMakeLists.txt +80 -0
- package/cpp/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp +1231 -0
- package/cpp/ggml/src/ggml-webgpu/ggml-webgpu.cpp +3150 -0
- package/cpp/ggml/src/ggml-webgpu/pre_wgsl.hpp +778 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/argmax.wgsl +72 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/argsort.wgsl +106 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/argsort_merge.wgsl +134 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/binary.wgsl +107 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +923 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/cpy.tmpl.wgsl +107 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/cumsum.wgsl +66 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +182 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn.wgsl +636 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/get_rows.wgsl +668 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/glu.tmpl.wgsl +323 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl +40 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl +713 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl +103 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.wgsl +138 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_subgroup_matrix.wgsl +188 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.wgsl +194 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/pad.wgsl +86 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm.wgsl +123 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/rope.tmpl.wgsl +295 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/scale.wgsl +63 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +109 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/soft_max.tmpl.wgsl +345 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/sum_rows.wgsl +55 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/unary.wgsl +193 -0
- package/cpp/ggml/src/ggml-zdnn/CMakeLists.txt +36 -0
- package/cpp/ggml/src/ggml-zdnn/common.hpp +59 -0
- package/cpp/ggml/src/ggml-zdnn/ggml-zdnn.cpp +633 -0
- package/cpp/ggml/src/ggml-zdnn/mmf.cpp +80 -0
- package/cpp/ggml/src/ggml-zdnn/mmf.hpp +12 -0
- package/cpp/ggml/src/ggml-zdnn/utils.cpp +79 -0
- package/cpp/ggml/src/ggml-zdnn/utils.hpp +19 -0
- package/cpp/ggml/src/ggml-zendnn/CMakeLists.txt +92 -0
- package/cpp/ggml/src/ggml-zendnn/ggml-zendnn.cpp +469 -0
- package/cpp/ggml/src/ggml.c +7669 -0
- package/cpp/ggml/src/ggml.cpp +26 -0
- package/cpp/ggml/src/gguf.cpp +1699 -0
- package/cpp/include/llama-cpp.h +32 -0
- package/cpp/include/llama.h +1568 -0
- package/cpp/mtmd/CMakeLists.txt +98 -0
- package/cpp/mtmd/README.md +63 -0
- package/cpp/mtmd/clip-graph.h +117 -0
- package/cpp/mtmd/clip-impl.h +586 -0
- package/cpp/mtmd/clip-model.h +390 -0
- package/cpp/mtmd/clip.cpp +4154 -0
- package/cpp/mtmd/clip.h +121 -0
- package/cpp/mtmd/deprecation-warning.cpp +22 -0
- package/cpp/mtmd/legacy-models/convert_image_encoder_to_gguf.py +412 -0
- package/cpp/mtmd/legacy-models/glmedge-convert-image-encoder-to-gguf.py +280 -0
- package/cpp/mtmd/legacy-models/glmedge-surgery.py +33 -0
- package/cpp/mtmd/legacy-models/llava_surgery.py +38 -0
- package/cpp/mtmd/legacy-models/llava_surgery_v2.py +180 -0
- package/cpp/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py +892 -0
- package/cpp/mtmd/legacy-models/minicpmv-surgery.py +47 -0
- package/cpp/mtmd/models/cogvlm.cpp +98 -0
- package/cpp/mtmd/models/conformer.cpp +216 -0
- package/cpp/mtmd/models/glm4v.cpp +122 -0
- package/cpp/mtmd/models/internvl.cpp +69 -0
- package/cpp/mtmd/models/kimik25.cpp +101 -0
- package/cpp/mtmd/models/kimivl.cpp +63 -0
- package/cpp/mtmd/models/llama4.cpp +96 -0
- package/cpp/mtmd/models/llava.cpp +374 -0
- package/cpp/mtmd/models/minicpmv.cpp +114 -0
- package/cpp/mtmd/models/mobilenetv5.cpp +451 -0
- package/cpp/mtmd/models/models.h +128 -0
- package/cpp/mtmd/models/nemotron-v2-vl.cpp +35 -0
- package/cpp/mtmd/models/paddleocr.cpp +52 -0
- package/cpp/mtmd/models/pixtral.cpp +86 -0
- package/cpp/mtmd/models/qwen2vl.cpp +183 -0
- package/cpp/mtmd/models/qwen3vl.cpp +193 -0
- package/cpp/mtmd/models/siglip.cpp +86 -0
- package/cpp/mtmd/models/whisper-enc.cpp +115 -0
- package/cpp/mtmd/models/youtuvl.cpp +179 -0
- package/cpp/mtmd/mtmd-audio.cpp +730 -0
- package/cpp/mtmd/mtmd-audio.h +113 -0
- package/cpp/mtmd/mtmd-cli.cpp +437 -0
- package/cpp/mtmd/mtmd-helper.cpp +521 -0
- package/cpp/mtmd/mtmd-helper.h +96 -0
- package/cpp/mtmd/mtmd.cpp +1156 -0
- package/cpp/mtmd/mtmd.h +319 -0
- package/cpp/mtmd/requirements.txt +5 -0
- package/cpp/mtmd/test-1.jpeg +0 -0
- package/cpp/mtmd/test-2.mp3 +0 -0
- package/cpp/mtmd/tests.sh +192 -0
- package/cpp/src/CMakeLists.txt +169 -0
- package/cpp/src/llama-adapter.cpp +488 -0
- package/cpp/src/llama-adapter.h +89 -0
- package/cpp/src/llama-arch.cpp +2855 -0
- package/cpp/src/llama-arch.h +619 -0
- package/cpp/src/llama-batch.cpp +917 -0
- package/cpp/src/llama-batch.h +173 -0
- package/cpp/src/llama-chat.cpp +896 -0
- package/cpp/src/llama-chat.h +71 -0
- package/cpp/src/llama-context.cpp +3512 -0
- package/cpp/src/llama-context.h +359 -0
- package/cpp/src/llama-cparams.cpp +5 -0
- package/cpp/src/llama-cparams.h +44 -0
- package/cpp/src/llama-grammar.cpp +1464 -0
- package/cpp/src/llama-grammar.h +194 -0
- package/cpp/src/llama-graph.cpp +2685 -0
- package/cpp/src/llama-graph.h +1026 -0
- package/cpp/src/llama-hparams.cpp +234 -0
- package/cpp/src/llama-hparams.h +339 -0
- package/cpp/src/llama-impl.cpp +171 -0
- package/cpp/src/llama-impl.h +73 -0
- package/cpp/src/llama-io.cpp +15 -0
- package/cpp/src/llama-io.h +35 -0
- package/cpp/src/llama-kv-cache-iswa.cpp +330 -0
- package/cpp/src/llama-kv-cache-iswa.h +137 -0
- package/cpp/src/llama-kv-cache.cpp +2271 -0
- package/cpp/src/llama-kv-cache.h +388 -0
- package/cpp/src/llama-kv-cells.h +533 -0
- package/cpp/src/llama-memory-hybrid-iswa.cpp +275 -0
- package/cpp/src/llama-memory-hybrid-iswa.h +140 -0
- package/cpp/src/llama-memory-hybrid.cpp +268 -0
- package/cpp/src/llama-memory-hybrid.h +139 -0
- package/cpp/src/llama-memory-recurrent.cpp +1165 -0
- package/cpp/src/llama-memory-recurrent.h +182 -0
- package/cpp/src/llama-memory.cpp +59 -0
- package/cpp/src/llama-memory.h +122 -0
- package/cpp/src/llama-mmap.cpp +785 -0
- package/cpp/src/llama-mmap.h +92 -0
- package/cpp/src/llama-model-loader.cpp +1414 -0
- package/cpp/src/llama-model-loader.h +203 -0
- package/cpp/src/llama-model-saver.cpp +286 -0
- package/cpp/src/llama-model-saver.h +37 -0
- package/cpp/src/llama-model.cpp +9253 -0
- package/cpp/src/llama-model.h +576 -0
- package/cpp/src/llama-quant.cpp +1119 -0
- package/cpp/src/llama-quant.h +1 -0
- package/cpp/src/llama-sampler.cpp +3885 -0
- package/cpp/src/llama-sampler.h +42 -0
- package/cpp/src/llama-vocab.cpp +3970 -0
- package/cpp/src/llama-vocab.h +187 -0
- package/cpp/src/llama.cpp +1313 -0
- package/cpp/src/models/afmoe.cpp +191 -0
- package/cpp/src/models/apertus.cpp +125 -0
- package/cpp/src/models/arcee.cpp +135 -0
- package/cpp/src/models/arctic.cpp +138 -0
- package/cpp/src/models/arwkv7.cpp +86 -0
- package/cpp/src/models/baichuan.cpp +122 -0
- package/cpp/src/models/bailingmoe.cpp +144 -0
- package/cpp/src/models/bailingmoe2.cpp +135 -0
- package/cpp/src/models/bert.cpp +178 -0
- package/cpp/src/models/bitnet.cpp +160 -0
- package/cpp/src/models/bloom.cpp +101 -0
- package/cpp/src/models/chameleon.cpp +178 -0
- package/cpp/src/models/chatglm.cpp +132 -0
- package/cpp/src/models/codeshell.cpp +111 -0
- package/cpp/src/models/cogvlm.cpp +102 -0
- package/cpp/src/models/cohere2-iswa.cpp +134 -0
- package/cpp/src/models/command-r.cpp +122 -0
- package/cpp/src/models/dbrx.cpp +123 -0
- package/cpp/src/models/deci.cpp +135 -0
- package/cpp/src/models/deepseek.cpp +144 -0
- package/cpp/src/models/deepseek2.cpp +262 -0
- package/cpp/src/models/delta-net-base.cpp +376 -0
- package/cpp/src/models/dots1.cpp +134 -0
- package/cpp/src/models/dream.cpp +105 -0
- package/cpp/src/models/ernie4-5-moe.cpp +150 -0
- package/cpp/src/models/ernie4-5.cpp +110 -0
- package/cpp/src/models/eurobert.cpp +97 -0
- package/cpp/src/models/exaone-moe.cpp +146 -0
- package/cpp/src/models/exaone.cpp +114 -0
- package/cpp/src/models/exaone4.cpp +123 -0
- package/cpp/src/models/falcon-h1.cpp +111 -0
- package/cpp/src/models/falcon.cpp +120 -0
- package/cpp/src/models/gemma-embedding.cpp +116 -0
- package/cpp/src/models/gemma.cpp +112 -0
- package/cpp/src/models/gemma2-iswa.cpp +128 -0
- package/cpp/src/models/gemma3.cpp +155 -0
- package/cpp/src/models/gemma3n-iswa.cpp +384 -0
- package/cpp/src/models/glm4-moe.cpp +170 -0
- package/cpp/src/models/glm4.cpp +157 -0
- package/cpp/src/models/gpt2.cpp +105 -0
- package/cpp/src/models/gptneox.cpp +144 -0
- package/cpp/src/models/granite-hybrid.cpp +196 -0
- package/cpp/src/models/granite.cpp +211 -0
- package/cpp/src/models/grok.cpp +159 -0
- package/cpp/src/models/grovemoe.cpp +141 -0
- package/cpp/src/models/hunyuan-dense.cpp +132 -0
- package/cpp/src/models/hunyuan-moe.cpp +154 -0
- package/cpp/src/models/internlm2.cpp +120 -0
- package/cpp/src/models/jais.cpp +86 -0
- package/cpp/src/models/jais2.cpp +123 -0
- package/cpp/src/models/jamba.cpp +106 -0
- package/cpp/src/models/kimi-linear.cpp +392 -0
- package/cpp/src/models/lfm2.cpp +190 -0
- package/cpp/src/models/llada-moe.cpp +122 -0
- package/cpp/src/models/llada.cpp +99 -0
- package/cpp/src/models/llama-iswa.cpp +178 -0
- package/cpp/src/models/llama.cpp +168 -0
- package/cpp/src/models/maincoder.cpp +117 -0
- package/cpp/src/models/mamba-base.cpp +285 -0
- package/cpp/src/models/mamba.cpp +54 -0
- package/cpp/src/models/mimo2-iswa.cpp +123 -0
- package/cpp/src/models/minicpm3.cpp +200 -0
- package/cpp/src/models/minimax-m2.cpp +124 -0
- package/cpp/src/models/mistral3.cpp +160 -0
- package/cpp/src/models/models.h +684 -0
- package/cpp/src/models/modern-bert.cpp +109 -0
- package/cpp/src/models/mpt.cpp +126 -0
- package/cpp/src/models/nemotron-h.cpp +148 -0
- package/cpp/src/models/nemotron.cpp +122 -0
- package/cpp/src/models/neo-bert.cpp +104 -0
- package/cpp/src/models/olmo.cpp +121 -0
- package/cpp/src/models/olmo2.cpp +150 -0
- package/cpp/src/models/olmoe.cpp +124 -0
- package/cpp/src/models/openai-moe-iswa.cpp +127 -0
- package/cpp/src/models/openelm.cpp +124 -0
- package/cpp/src/models/orion.cpp +123 -0
- package/cpp/src/models/paddleocr.cpp +122 -0
- package/cpp/src/models/pangu-embedded.cpp +121 -0
- package/cpp/src/models/phi2.cpp +121 -0
- package/cpp/src/models/phi3.cpp +152 -0
- package/cpp/src/models/plamo.cpp +110 -0
- package/cpp/src/models/plamo2.cpp +318 -0
- package/cpp/src/models/plamo3.cpp +128 -0
- package/cpp/src/models/plm.cpp +169 -0
- package/cpp/src/models/qwen.cpp +108 -0
- package/cpp/src/models/qwen2.cpp +126 -0
- package/cpp/src/models/qwen2moe.cpp +151 -0
- package/cpp/src/models/qwen2vl.cpp +117 -0
- package/cpp/src/models/qwen3.cpp +117 -0
- package/cpp/src/models/qwen35.cpp +386 -0
- package/cpp/src/models/qwen35moe.cpp +420 -0
- package/cpp/src/models/qwen3moe.cpp +124 -0
- package/cpp/src/models/qwen3next.cpp +525 -0
- package/cpp/src/models/qwen3vl-moe.cpp +140 -0
- package/cpp/src/models/qwen3vl.cpp +132 -0
- package/cpp/src/models/refact.cpp +94 -0
- package/cpp/src/models/rnd1.cpp +126 -0
- package/cpp/src/models/rwkv6-base.cpp +164 -0
- package/cpp/src/models/rwkv6.cpp +94 -0
- package/cpp/src/models/rwkv6qwen2.cpp +86 -0
- package/cpp/src/models/rwkv7-base.cpp +137 -0
- package/cpp/src/models/rwkv7.cpp +90 -0
- package/cpp/src/models/seed-oss.cpp +124 -0
- package/cpp/src/models/smallthinker.cpp +126 -0
- package/cpp/src/models/smollm3.cpp +128 -0
- package/cpp/src/models/stablelm.cpp +146 -0
- package/cpp/src/models/starcoder.cpp +100 -0
- package/cpp/src/models/starcoder2.cpp +121 -0
- package/cpp/src/models/step35-iswa.cpp +168 -0
- package/cpp/src/models/t5-dec.cpp +166 -0
- package/cpp/src/models/t5-enc.cpp +96 -0
- package/cpp/src/models/wavtokenizer-dec.cpp +149 -0
- package/cpp/src/models/xverse.cpp +108 -0
- package/cpp/src/unicode-data.cpp +7034 -0
- package/cpp/src/unicode-data.h +20 -0
- package/cpp/src/unicode.cpp +1103 -0
- package/cpp/src/unicode.h +111 -0
- package/cpp/vendor/nlohmann/json.hpp +25526 -0
- package/cpp/vendor/nlohmann/json_fwd.hpp +187 -0
- package/cpp/vendor/stb/stb_image.h +7988 -0
- package/ios/LocalLLM-Bridging-Header.h +2 -0
- package/ios/LocalLLM.h +5 -0
- package/ios/LocalLLM.mm +1267 -0
- package/local-llm-rn.podspec +60 -0
- package/package.json +35 -0
- package/src/NativeLocalLLM.ts +73 -0
- package/src/device.ts +50 -0
- package/src/download-adapter.ts +17 -0
- package/src/index.ts +21 -0
- package/src/native-bridge.ts +142 -0
- package/src/rn-downloader.ts +37 -0
|
@@ -0,0 +1,4154 @@
|
|
|
1
|
+
#include "clip.h"
|
|
2
|
+
#include "clip-impl.h"
|
|
3
|
+
#include "clip-model.h"
|
|
4
|
+
#include "clip-graph.h"
|
|
5
|
+
#include "models/models.h"
|
|
6
|
+
|
|
7
|
+
#include "ggml.h"
|
|
8
|
+
#include "ggml-cpp.h"
|
|
9
|
+
#include "ggml-alloc.h"
|
|
10
|
+
#include "ggml-backend.h"
|
|
11
|
+
#include "gguf.h"
|
|
12
|
+
|
|
13
|
+
#include <algorithm>
|
|
14
|
+
#include <cassert>
|
|
15
|
+
#include <cmath>
|
|
16
|
+
#include <cstdlib>
|
|
17
|
+
#include <cstring>
|
|
18
|
+
#include <fstream>
|
|
19
|
+
#include <map>
|
|
20
|
+
#include <stdexcept>
|
|
21
|
+
#include <unordered_set>
|
|
22
|
+
#include <vector>
|
|
23
|
+
#include <cinttypes>
|
|
24
|
+
#include <limits>
|
|
25
|
+
#include <array>
|
|
26
|
+
#include <functional>
|
|
27
|
+
|
|
28
|
+
// Process-wide logger state: starts out with the default log callback and a null second
// field (presumably the callback's user-data pointer — confirm against clip-impl.h).
struct clip_logger_state g_logger_state = { clip_log_callback_default, nullptr };
|
|
29
|
+
|
|
30
|
+
//#define CLIP_DEBUG_FUNCTIONS
|
|
31
|
+
|
|
32
|
+
#ifdef CLIP_DEBUG_FUNCTIONS
|
|
33
|
+
static void clip_image_write_image_to_ppm(const clip_image_u8& img, const std::string& filename) {
|
|
34
|
+
std::ofstream file(filename, std::ios::binary);
|
|
35
|
+
if (!file.is_open()) {
|
|
36
|
+
LOG_ERR("Failed to open file for writing: %s\n", filename.c_str());
|
|
37
|
+
return;
|
|
38
|
+
}
|
|
39
|
+
|
|
40
|
+
// PPM header: P6 format, width, height, and max color value
|
|
41
|
+
file << "P6\n" << img.nx << " " << img.ny << "\n255\n";
|
|
42
|
+
|
|
43
|
+
// Write pixel data
|
|
44
|
+
for (size_t i = 0; i < img.buf.size(); i += 3) {
|
|
45
|
+
// PPM expects binary data in RGB format, which matches our image buffer
|
|
46
|
+
file.write(reinterpret_cast<const char*>(&img.buf[i]), 3);
|
|
47
|
+
}
|
|
48
|
+
|
|
49
|
+
file.close();
|
|
50
|
+
}
|
|
51
|
+
|
|
52
|
+
static void clip_image_save_to_bmp(const clip_image_u8& img, const std::string& filename) {
|
|
53
|
+
std::ofstream file(filename, std::ios::binary);
|
|
54
|
+
if (!file.is_open()) {
|
|
55
|
+
LOG_ERR("Failed to open file for writing: %s\n", filename.c_str());
|
|
56
|
+
return;
|
|
57
|
+
}
|
|
58
|
+
|
|
59
|
+
int fileSize = 54 + 3 * img.nx * img.ny; // File header + info header + pixel data
|
|
60
|
+
int bytesPerPixel = 3;
|
|
61
|
+
int widthInBytes = img.nx * bytesPerPixel;
|
|
62
|
+
int paddingAmount = (4 - (widthInBytes % 4)) % 4;
|
|
63
|
+
int stride = widthInBytes + paddingAmount;
|
|
64
|
+
|
|
65
|
+
// Bitmap file header
|
|
66
|
+
unsigned char fileHeader[14] = {
|
|
67
|
+
'B','M', // Signature
|
|
68
|
+
0,0,0,0, // Image file size in bytes
|
|
69
|
+
0,0,0,0, // Reserved
|
|
70
|
+
54,0,0,0 // Start of pixel array
|
|
71
|
+
};
|
|
72
|
+
|
|
73
|
+
// Total file size
|
|
74
|
+
fileSize = 54 + (stride * img.ny);
|
|
75
|
+
fileHeader[2] = (unsigned char)(fileSize);
|
|
76
|
+
fileHeader[3] = (unsigned char)(fileSize >> 8);
|
|
77
|
+
fileHeader[4] = (unsigned char)(fileSize >> 16);
|
|
78
|
+
fileHeader[5] = (unsigned char)(fileSize >> 24);
|
|
79
|
+
|
|
80
|
+
// Bitmap information header (BITMAPINFOHEADER)
|
|
81
|
+
unsigned char infoHeader[40] = {
|
|
82
|
+
40,0,0,0, // Size of this header (40 bytes)
|
|
83
|
+
0,0,0,0, // Image width
|
|
84
|
+
0,0,0,0, // Image height
|
|
85
|
+
1,0, // Number of color planes
|
|
86
|
+
24,0, // Bits per pixel
|
|
87
|
+
0,0,0,0, // No compression
|
|
88
|
+
0,0,0,0, // Image size (can be 0 for no compression)
|
|
89
|
+
0,0,0,0, // X pixels per meter (not specified)
|
|
90
|
+
0,0,0,0, // Y pixels per meter (not specified)
|
|
91
|
+
0,0,0,0, // Total colors (color table not used)
|
|
92
|
+
0,0,0,0 // Important colors (all are important)
|
|
93
|
+
};
|
|
94
|
+
|
|
95
|
+
// Width and height in the information header
|
|
96
|
+
infoHeader[4] = (unsigned char)(img.nx);
|
|
97
|
+
infoHeader[5] = (unsigned char)(img.nx >> 8);
|
|
98
|
+
infoHeader[6] = (unsigned char)(img.nx >> 16);
|
|
99
|
+
infoHeader[7] = (unsigned char)(img.nx >> 24);
|
|
100
|
+
infoHeader[8] = (unsigned char)(img.ny);
|
|
101
|
+
infoHeader[9] = (unsigned char)(img.ny >> 8);
|
|
102
|
+
infoHeader[10] = (unsigned char)(img.ny >> 16);
|
|
103
|
+
infoHeader[11] = (unsigned char)(img.ny >> 24);
|
|
104
|
+
|
|
105
|
+
// Write file headers
|
|
106
|
+
file.write(reinterpret_cast<char*>(fileHeader), sizeof(fileHeader));
|
|
107
|
+
file.write(reinterpret_cast<char*>(infoHeader), sizeof(infoHeader));
|
|
108
|
+
|
|
109
|
+
// Pixel data
|
|
110
|
+
std::vector<unsigned char> padding(3, 0); // Max padding size to be added to each row
|
|
111
|
+
for (int y = img.ny - 1; y >= 0; --y) { // BMP files are stored bottom-to-top
|
|
112
|
+
for (int x = 0; x < img.nx; ++x) {
|
|
113
|
+
// Each pixel
|
|
114
|
+
size_t pixelIndex = (y * img.nx + x) * 3;
|
|
115
|
+
unsigned char pixel[3] = {
|
|
116
|
+
img.buf[pixelIndex + 2], // BMP stores pixels in BGR format
|
|
117
|
+
img.buf[pixelIndex + 1],
|
|
118
|
+
img.buf[pixelIndex]
|
|
119
|
+
};
|
|
120
|
+
file.write(reinterpret_cast<char*>(pixel), 3);
|
|
121
|
+
}
|
|
122
|
+
// Write padding for the row
|
|
123
|
+
file.write(reinterpret_cast<char*>(padding.data()), paddingAmount);
|
|
124
|
+
}
|
|
125
|
+
|
|
126
|
+
file.close();
|
|
127
|
+
}
|
|
128
|
+
|
|
129
|
+
// debug function to convert f32 to u8
|
|
130
|
+
// Debug helper: convert a float image to 8-bit by scaling every value by 255 and clamping
// the result to [0, 255].
//
// src - source image; its nx / ny are copied to dst
// dst - destination image; dst.buf is resized to 3 * nx * ny entries
//
// Only as many values as fit in both buffers are converted, so a source buffer whose size
// disagrees with its nx / ny cannot cause an out-of-bounds write.
static void clip_image_convert_f32_to_u8(const clip_image_f32& src, clip_image_u8& dst) {
    dst.nx = src.nx;
    dst.ny = src.ny;
    dst.buf.resize(3 * src.nx * src.ny);

    const size_t n_values = std::min(src.buf.size(), dst.buf.size());
    for (size_t i = 0; i < n_values; ++i) {
        // Clamp while still in float: converting an out-of-range or NaN float to int is
        // undefined. With this argument order a NaN input maps to 0.
        const float scaled  = src.buf[i] * 255.0f;
        const float clamped = std::min(255.0f, std::max(0.0f, scaled));
        dst.buf[i] = static_cast<uint8_t>(static_cast<int>(clamped));
    }
}
|
|
138
|
+
#endif
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
struct clip_ctx {
|
|
142
|
+
clip_model model;
|
|
143
|
+
|
|
144
|
+
gguf_context_ptr ctx_gguf;
|
|
145
|
+
ggml_context_ptr ctx_data;
|
|
146
|
+
|
|
147
|
+
std::vector<uint8_t> buf_compute_meta;
|
|
148
|
+
|
|
149
|
+
std::vector<ggml_backend_t> backend_ptrs;
|
|
150
|
+
std::vector<ggml_backend_buffer_type_t> backend_buft;
|
|
151
|
+
|
|
152
|
+
ggml_backend_t backend = nullptr;
|
|
153
|
+
ggml_backend_t backend_cpu = nullptr;
|
|
154
|
+
ggml_backend_buffer_ptr buf;
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
int max_nodes = 8192;
|
|
158
|
+
ggml_backend_sched_ptr sched;
|
|
159
|
+
clip_flash_attn_type flash_attn_type = CLIP_FLASH_ATTN_TYPE_AUTO;
|
|
160
|
+
bool is_allocated = false;
|
|
161
|
+
|
|
162
|
+
clip_ctx(clip_context_params & ctx_params) {
|
|
163
|
+
flash_attn_type = ctx_params.flash_attn_type;
|
|
164
|
+
backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
|
|
165
|
+
if (!backend_cpu) {
|
|
166
|
+
throw std::runtime_error("failed to initialize CPU backend");
|
|
167
|
+
}
|
|
168
|
+
if (ctx_params.use_gpu) {
|
|
169
|
+
auto backend_name = std::getenv("MTMD_BACKEND_DEVICE");
|
|
170
|
+
if (backend_name != nullptr) {
|
|
171
|
+
backend = ggml_backend_init_by_name(backend_name, nullptr);
|
|
172
|
+
if (!backend) {
|
|
173
|
+
LOG_WRN("%s: Warning: Failed to initialize \"%s\" backend, falling back to default GPU backend\n", __func__, backend_name);
|
|
174
|
+
}
|
|
175
|
+
}
|
|
176
|
+
if (!backend) {
|
|
177
|
+
backend = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_GPU, nullptr);
|
|
178
|
+
backend = backend ? backend : ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_IGPU, nullptr);
|
|
179
|
+
}
|
|
180
|
+
}
|
|
181
|
+
|
|
182
|
+
if (backend) {
|
|
183
|
+
LOG_INF("%s: CLIP using %s backend\n", __func__, ggml_backend_name(backend));
|
|
184
|
+
backend_ptrs.push_back(backend);
|
|
185
|
+
backend_buft.push_back(ggml_backend_get_default_buffer_type(backend));
|
|
186
|
+
} else {
|
|
187
|
+
backend = backend_cpu;
|
|
188
|
+
LOG_INF("%s: CLIP using CPU backend\n", __func__);
|
|
189
|
+
}
|
|
190
|
+
|
|
191
|
+
if (ctx_params.image_min_tokens > 0) {
|
|
192
|
+
model.hparams.custom_image_min_tokens = ctx_params.image_min_tokens;
|
|
193
|
+
}
|
|
194
|
+
if (ctx_params.image_max_tokens > 0) {
|
|
195
|
+
model.hparams.custom_image_max_tokens = ctx_params.image_max_tokens;
|
|
196
|
+
}
|
|
197
|
+
|
|
198
|
+
backend_ptrs.push_back(backend_cpu);
|
|
199
|
+
backend_buft.push_back(ggml_backend_get_default_buffer_type(backend_cpu));
|
|
200
|
+
|
|
201
|
+
sched.reset(
|
|
202
|
+
ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), 8192, false, true)
|
|
203
|
+
);
|
|
204
|
+
|
|
205
|
+
if (ctx_params.cb_eval != nullptr) {
|
|
206
|
+
ggml_backend_sched_set_eval_callback(sched.get(), ctx_params.cb_eval, ctx_params.cb_eval_user_data);
|
|
207
|
+
}
|
|
208
|
+
}
|
|
209
|
+
|
|
210
|
+
~clip_ctx() {
|
|
211
|
+
ggml_backend_free(backend);
|
|
212
|
+
if (backend != backend_cpu) {
|
|
213
|
+
ggml_backend_free(backend_cpu);
|
|
214
|
+
}
|
|
215
|
+
}
|
|
216
|
+
|
|
217
|
+
// this function is added so that we don't change too much of the existing code
|
|
218
|
+
projector_type proj_type() const {
|
|
219
|
+
return model.proj_type;
|
|
220
|
+
}
|
|
221
|
+
};
|
|
222
|
+
|
|
223
|
+
//
|
|
224
|
+
// clip_graph
|
|
225
|
+
//
|
|
226
|
+
|
|
227
|
+
// Captures the model, hyper-parameters and image geometry used while building a graph for
// `img`, then creates the metadata-only ggml context (no_alloc) and an empty graph in it.
clip_graph::clip_graph(clip_ctx * ctx, const clip_image_f32 & img) :
        model(ctx->model),
        hparams(model.hparams),
        proj_type(ctx->proj_type()),
        img(img),
        patch_size(hparams.patch_size),
        n_patches_x(img.nx / patch_size),
        n_patches_y(img.ny / patch_size),
        n_patches(n_patches_x * n_patches_y),
        n_embd(hparams.n_embd),
        n_head(hparams.n_head),
        d_head(n_embd / n_head),
        n_layer(hparams.n_layer),
        n_mmproj_embd(clip_n_mmproj_embd(ctx)),
        eps(hparams.eps),
        kq_scale(1.0f / sqrtf((float)d_head)),
        flash_attn_type(ctx->flash_attn_type) {
    // the context lives inside the buffer owned by clip_ctx and only stores tensor metadata
    struct ggml_init_params init_params = {
        /*.mem_size   =*/ ctx->buf_compute_meta.size(),
        /*.mem_buffer =*/ ctx->buf_compute_meta.data(),
        /*.no_alloc   =*/ true,
    };

    ggml_context * new_ctx = ggml_init(init_params);
    ctx0_ptr.reset(new_ctx);
    ctx0 = ctx0_ptr.get();

    gf = ggml_new_graph_custom(ctx0, ctx->max_nodes, false);
}
|
|
253
|
+
|
|
254
|
+
// Gives `cur` a debug name: "<name>-<il>" for a non-negative layer index, plain `name` otherwise.
void clip_graph::cb(ggml_tensor * cur, const char * name, int il) const {
    if (il < 0) {
        // not tied to a layer
        ggml_set_name(cur, name);
        return;
    }
    ggml_format_name(cur, "%s-%d", name, il);
}
|
|
261
|
+
|
|
262
|
+
// siglip2 naflex
|
|
263
|
+
// Resize the model's learned position embeddings to the patch grid of the current image.
//
// interpolation_mode - mode value forwarded unchanged to ggml_interpolate()
//
// Returns the original tensor when the patch grid already matches the square grid the
// embeddings were stored for; otherwise returns a new (n_embd, width * height) tensor.
ggml_tensor * clip_graph::resize_position_embeddings(uint32_t interpolation_mode) {
    ggml_tensor * pos_embd = model.position_embeddings;
    // Validate before the first dereference: pos_embd->ne is read just below.
    GGML_ASSERT(pos_embd);

    const int height = img.ny / patch_size;
    const int width  = img.nx / patch_size;
    const uint32_t mode = interpolation_mode;
    // the stored embeddings are treated as a square n_per_side x n_per_side grid
    const int n_per_side = (int)std::sqrt(pos_embd->ne[1]);

    if (height == n_per_side && width == n_per_side) {
        return pos_embd;
    }

    pos_embd = ggml_reshape_3d(ctx0, pos_embd, n_embd, n_per_side, n_per_side);  // -> (n_embd, n_per_side, n_per_side)
    pos_embd = ggml_permute(ctx0, pos_embd, 2, 0, 1, 3);                         // -> (n_per_side, n_per_side, n_embd)
    pos_embd = ggml_interpolate(ctx0, pos_embd, width, height, n_embd, 1, mode); // -> (width, height, n_embd)
    pos_embd = ggml_permute(ctx0, pos_embd, 1, 2, 0, 3);                         // -> (n_embd, width, height)
    pos_embd = ggml_cont_2d(ctx0, pos_embd, n_embd, width * height);             // -> (n_embd, width * height)

    return pos_embd;
}
|
|
284
|
+
|
|
285
|
+
// build vision transformer (ViT) cgraph
|
|
286
|
+
// this function should cover most of the models
|
|
287
|
+
// if your model has specific features, you should probably duplicate this function
|
|
288
|
+
// Builds the transformer encoder stack on top of `inp`.
//
// inp              - input embeddings
// n_pos            - number of positions, used as the last dimension of the Q/K/V tensors
// norm_t           - normalization type used for every norm in the stack
// ffn_t            - activation type of the feed-forward blocks
// learned_pos_embd - optional tensor added to the input before the first layer
// add_pos          - optional callback applied to Q and K of every layer
//
// Returns the output of the last layer after the optional pooling and post-layernorm.
ggml_tensor * clip_graph::build_vit(
        ggml_tensor * inp,
        int64_t n_pos,
        norm_type norm_t,
        ffn_op_type ffn_t,
        ggml_tensor * learned_pos_embd,
        std::function<ggml_tensor *(ggml_tensor *, const clip_layer &)> add_pos
    ) {
    if (learned_pos_embd != nullptr) {
        inp = ggml_add(ctx0, inp, learned_pos_embd);
        cb(inp, "pos_embed", -1);
    }

    // `residual` carries the residual stream from layer to layer
    ggml_tensor * residual = inp;

    // pre-layernorm
    if (model.pre_ln_w != nullptr) {
        residual = build_norm(residual, model.pre_ln_w, model.pre_ln_b, norm_t, eps, -1);
        cb(residual, "pre_ln", -1);
    }

    for (int il = 0; il < n_layer; il++) {
        auto & layer = model.layers[il];

        // layernorm1
        ggml_tensor * hidden = build_norm(residual, layer.ln_1_w, layer.ln_1_b, norm_t, eps, il);
        cb(hidden, "layer_inp_normed", il);

        // self-attention
        {
            ggml_tensor * q_proj = nullptr;
            ggml_tensor * k_proj = nullptr;
            ggml_tensor * v_proj = nullptr;

            if (layer.qkv_w != nullptr) {
                // fused qkv: one projection, then three strided views into its result
                ggml_tensor * qkv = ggml_mul_mat(ctx0, layer.qkv_w, hidden);
                if (layer.qkv_b != nullptr) {
                    qkv = ggml_add(ctx0, qkv, layer.qkv_b);
                }

                auto head_view = [&](size_t byte_offset) {
                    return ggml_view_3d(ctx0, qkv, d_head, n_head, n_pos,
                        /* nb1    */ ggml_row_size(qkv->type, d_head),
                        /* nb2    */ qkv->nb[1],
                        /* offset */ byte_offset);
                };

                q_proj = head_view(0);
                k_proj = head_view(ggml_row_size(qkv->type, n_embd));
                v_proj = head_view(ggml_row_size(qkv->type, 2 * n_embd));

                if (layer.q_norm) {
                    GGML_ASSERT(layer.q_norm->ne[0] == q_proj->ne[0]);
                    q_proj = build_norm(q_proj, layer.q_norm, NULL, norm_t, eps, il);
                    cb(q_proj, "Qcur_norm", il);
                }

                if (layer.k_norm) {
                    GGML_ASSERT(layer.k_norm->ne[0] == k_proj->ne[0]);
                    k_proj = build_norm(k_proj, layer.k_norm, NULL, norm_t, eps, il);
                    cb(k_proj, "Kcur_norm", il);
                }
            } else {
                // separate q, k, v projections, each with an optional bias
                q_proj = ggml_mul_mat(ctx0, layer.q_w, hidden);
                if (layer.q_b) {
                    q_proj = ggml_add(ctx0, q_proj, layer.q_b);
                }

                k_proj = ggml_mul_mat(ctx0, layer.k_w, hidden);
                if (layer.k_b) {
                    k_proj = ggml_add(ctx0, k_proj, layer.k_b);
                }

                v_proj = ggml_mul_mat(ctx0, layer.v_w, hidden);
                if (layer.v_b) {
                    v_proj = ggml_add(ctx0, v_proj, layer.v_b);
                }

                if (layer.q_norm) {
                    q_proj = build_norm(q_proj, layer.q_norm, NULL, norm_t, eps, il);
                    cb(q_proj, "Qcur_norm", il);
                }

                if (layer.k_norm) {
                    k_proj = build_norm(k_proj, layer.k_norm, NULL, norm_t, eps, il);
                    cb(k_proj, "Kcur_norm", il);
                }

                // split the projections into heads
                q_proj = ggml_reshape_3d(ctx0, q_proj, d_head, n_head, n_pos);
                k_proj = ggml_reshape_3d(ctx0, k_proj, d_head, n_head, n_pos);
                v_proj = ggml_reshape_3d(ctx0, v_proj, d_head, n_head, n_pos);
            }

            cb(q_proj, "Qcur", il);
            cb(k_proj, "Kcur", il);
            cb(v_proj, "Vcur", il);

            if (add_pos) {
                q_proj = add_pos(q_proj, layer);
                k_proj = add_pos(k_proj, layer);
                cb(q_proj, "Qcur_pos", il);
                cb(k_proj, "Kcur_pos", il);
            }

            hidden = build_attn(layer.o_w, layer.o_b,
                q_proj, k_proj, v_proj, nullptr, kq_scale, il);
            cb(hidden, "attn_out", il);
        }

        if (layer.ls_1_w) {
            hidden = ggml_mul(ctx0, hidden, layer.ls_1_w);
            cb(hidden, "attn_out_scaled", il);
        }

        // first residual connection: re-add the layer input
        residual = ggml_add(ctx0, hidden, residual);
        cb(residual, "ffn_inp", il);

        // layernorm2
        hidden = build_norm(residual, layer.ln_2_w, layer.ln_2_b, norm_t, eps, il);
        cb(hidden, "ffn_inp_normed", il);

        // ffn
        hidden = build_ffn(hidden,
            layer.ff_up_w,   layer.ff_up_b,
            layer.ff_gate_w, layer.ff_gate_b,
            layer.ff_down_w, layer.ff_down_b,
            ffn_t, il);
        cb(hidden, "ffn_out", il);

        if (layer.ls_2_w) {
            hidden = ggml_mul(ctx0, hidden, layer.ls_2_w);
            cb(hidden, "ffn_out_scaled", il);
        }

        // second residual connection
        residual = ggml_add(ctx0, residual, hidden);
        cb(residual, "layer_out", il);
    }

    if (model.audio_has_avgpool()) {
        // average-pool with kernel 2 / stride 2 along the transposed axis, then transpose back
        ggml_tensor * pooled = ggml_transpose(ctx0, residual);
        pooled = ggml_cont(ctx0, pooled);
        pooled = ggml_pool_1d(ctx0, pooled, GGML_OP_POOL_AVG, 2, 2, 0);
        pooled = ggml_transpose(ctx0, pooled);
        pooled = ggml_cont(ctx0, pooled);
        residual = pooled;
    }

    // post-layernorm
    if (model.post_ln_w != nullptr) {
        residual = build_norm(residual, model.post_ln_w, model.post_ln_b, norm_t, eps, -1);
    }
    return residual;
}
|
|
458
|
+
|
|
459
|
+
// build the input after conv2d (inp_raw --> patches)
|
|
460
|
+
// returns tensor with shape [n_embd, n_patches]
|
|
461
|
+
// Runs the patch-embedding convolution (kernel and stride of patch_size) over the raw input,
// reshapes the result to (n_patches, n_embd), transposes it and adds the optional patch bias.
ggml_tensor * clip_graph::build_inp() {
    ggml_tensor * raw = build_inp_raw();

    ggml_tensor * patches = ggml_conv_2d(ctx0, model.patch_embeddings_0, raw, patch_size, patch_size, 0, 0, 1, 1);
    patches = ggml_reshape_2d(ctx0, patches, n_patches, n_embd);
    patches = ggml_cont(ctx0, ggml_transpose(ctx0, patches));

    if (!model.patch_bias) {
        return patches;
    }

    patches = ggml_add(ctx0, patches, model.patch_bias);
    cb(patches, "patch_bias", -1);
    return patches;
}
|
|
472
|
+
|
|
473
|
+
// Creates the F32 graph input tensor "inp_raw" with dimensions (img.nx, img.ny, channels).
ggml_tensor * clip_graph::build_inp_raw(int channels) {
    ggml_tensor * const raw_input = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, img.nx, img.ny, channels);

    ggml_set_name(raw_input, "inp_raw");
    ggml_set_input(raw_input);

    return raw_input;
}
|
|
479
|
+
|
|
480
|
+
// Normalizes `cur` (RMS norm for NORM_TYPE_RMS, ggml_norm for any other type) and then
// applies the optional scale `mw` and the optional shift `mb`.
ggml_tensor * clip_graph::build_norm(
        ggml_tensor * cur,
        ggml_tensor * mw,
        ggml_tensor * mb,
        norm_type type,
        float norm_eps,
        int il) const {
    ggml_tensor * normed = nullptr;
    if (type == NORM_TYPE_RMS) {
        normed = ggml_rms_norm(ctx0, cur, norm_eps);
    } else {
        normed = ggml_norm(ctx0, cur, norm_eps);
    }

    if (mw != nullptr) {
        normed = ggml_mul(ctx0, normed, mw);
        cb(normed, "norm_w", il);
    }

    if (mb != nullptr) {
        normed = ggml_add(ctx0, normed, mb);
        cb(normed, "norm_b", il);
    }

    return normed;
}
|
|
504
|
+
|
|
505
|
+
// Builds a feed-forward block: optional up projection, optional parallel gate projection,
// activation, optional down projection. Every weight and bias tensor may be null, in which
// case the corresponding step is skipped.
//
// cur            - input tensor
// up / up_b      - up projection weight and bias
// gate / gate_b  - gate projection weight and bias; when present, the gated variant of the
//                  activation combines the gate and up branches
// down / down_b  - down projection weight and bias
// type_op        - activation type
// il             - layer index used for the debug names
//
// Returns the output tensor of the block.
ggml_tensor * clip_graph::build_ffn(
        ggml_tensor * cur,
        ggml_tensor * up,
        ggml_tensor * up_b,
        ggml_tensor * gate,
        ggml_tensor * gate_b,
        ggml_tensor * down,
        ggml_tensor * down_b,
        ffn_op_type type_op,
        int il) const {

    ggml_tensor * tmp = up ? ggml_mul_mat(ctx0, up, cur) : cur;
    cb(tmp, "ffn_up", il);

    if (up_b) {
        tmp = ggml_add(ctx0, tmp, up_b);
        cb(tmp, "ffn_up_b", il);
    }

    if (gate) {
        cur = ggml_mul_mat(ctx0, gate, cur);
        cb(cur, "ffn_gate", il);

        if (gate_b) {
            cur = ggml_add(ctx0, cur, gate_b);
            cb(cur, "ffn_gate_b", il);
        }
    } else {
        cur = tmp;
    }

    // we only support parallel ffn for now
    switch (type_op) {
        case FFN_SILU:
            if (gate) {
                cur = ggml_swiglu_split(ctx0, cur, tmp);
                cb(cur, "ffn_swiglu", il);
            } else {
                cur = ggml_silu(ctx0, cur);
                cb(cur, "ffn_silu", il);
            } break;
        case FFN_GELU:
            if (gate) {
                cur = ggml_geglu_split(ctx0, cur, tmp);
                cb(cur, "ffn_geglu", il);
            } else {
                cur = ggml_gelu(ctx0, cur);
                cb(cur, "ffn_gelu", il);
            } break;
        case FFN_GELU_ERF:
            if (gate) {
                cur = ggml_geglu_erf_split(ctx0, cur, tmp);
                cb(cur, "ffn_geglu_erf", il);
            } else {
                cur = ggml_gelu_erf(ctx0, cur);
                cb(cur, "ffn_gelu_erf", il);
            } break;
        case FFN_GELU_QUICK:
            if (gate) {
                cur = ggml_geglu_quick_split(ctx0, cur, tmp);
                cb(cur, "ffn_geglu_quick", il);
            } else {
                cur = ggml_gelu_quick(ctx0, cur);
                cb(cur, "ffn_gelu_quick", il);
            } break;
        case FFN_RELU_SQR:
            {
                // NOTE(review): no gated variant here — with a gate present the up branch
                // (`tmp`) is not used by this case; confirm that this is intended.
                cur = ggml_relu(ctx0, cur);
                cur = ggml_sqr(ctx0, cur);
                cb(cur, "ffn_relu_sqr", il);
            } break;
    }

    if (down) {
        cur = ggml_mul_mat(ctx0, down, cur);
        // Name the projection itself. Guarding this on `down_b` left the projection unnamed
        // when there was no bias and mislabelled the activation when there was no projection.
        cb(cur, "ffn_down", il);
    }

    if (down_b) {
        cur = ggml_add(ctx0, cur, down_b);
    }

    return cur;
}
|
|
592
|
+
|
|
593
|
+
// Builds scaled attention over q_cur / k_cur / v_cur, using flash attention when it is
// enabled and the mul_mat + soft_max path otherwise, followed by the optional output
// projection `wo` and bias `wo_b`.
//
// kq_mask  - mask forwarded to the attention op (callers may pass nullptr)
// kq_scale - scale forwarded to the attention op
// il       - layer index used for the debug name of the attention output
ggml_tensor * clip_graph::build_attn(
        ggml_tensor * wo,
        ggml_tensor * wo_b,
        ggml_tensor * q_cur,
        ggml_tensor * k_cur,
        ggml_tensor * v_cur,
        ggml_tensor * kq_mask,
        float kq_scale,
        int il) const {
    // these nodes are added to the graph together so that they are not reordered
    // by doing so, the number of splits in the graph is reduced
    for (ggml_tensor * node : { q_cur, k_cur, v_cur }) {
        ggml_build_forward_expand(gf, node);
    }

    ggml_tensor * q = ggml_permute(ctx0, q_cur, 0, 2, 1, 3);
    ggml_tensor * k = ggml_permute(ctx0, k_cur, 0, 2, 1, 3);

    ggml_tensor * attn_out = nullptr;

    if (flash_attn_type != CLIP_FLASH_ATTN_TYPE_ENABLED) {
        ggml_tensor * v = ggml_cont(ctx0, ggml_permute(ctx0, v_cur, 1, 2, 0, 3));

        // F32 precision may not be needed for vision encoders, so it is not requested here
        // (ggml_mul_mat_set_prec(kq, GGML_PREC_F32) would do that)
        ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
        kq = ggml_soft_max_ext(ctx0, kq, kq_mask, kq_scale, 0.0f);

        ggml_tensor * kqv = ggml_mul_mat(ctx0, v, kq);
        ggml_tensor * merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3);
        attn_out = ggml_cont_2d(ctx0, merged, merged->ne[0] * merged->ne[1], merged->ne[2] * merged->ne[3]);
    } else {
        ggml_tensor * v = ggml_permute(ctx0, v_cur, 0, 2, 1, 3);

        // K and V are cast to F16 for the flash-attention op
        k = ggml_cast(ctx0, k, GGML_TYPE_F16);
        v = ggml_cast(ctx0, v, GGML_TYPE_F16);

        ggml_tensor * fa = ggml_flash_attn_ext(ctx0, q, k, v, kq_mask, kq_scale, 0.0f, 0.0f);
        ggml_flash_attn_ext_set_prec(fa, GGML_PREC_F32);

        attn_out = ggml_reshape_2d(ctx0, fa, fa->ne[0] * fa->ne[1], fa->ne[2] * fa->ne[3]);
    }

    cb(attn_out, "kqv_out", il);

    if (wo != nullptr) {
        attn_out = ggml_mul_mat(ctx0, wo, attn_out);
    }

    if (wo_b != nullptr) {
        attn_out = ggml_add(ctx0, attn_out, wo_b);
    }

    return attn_out;
}
|
|
654
|
+
|
|
655
|
+
// implementation of the 2D RoPE without adding a new op in ggml
|
|
656
|
+
// this is not efficient (use double the memory), but works on all backends
|
|
657
|
+
// TODO: there was a more efficient implementation that relied on ggml_view and ggml_rope_ext_inplace, but the in-place rope does not work well with non-contiguous tensors; we should fix that and revert to the original implementation in https://github.com/ggml-org/llama.cpp/pull/13065
|
|
658
|
+
// Applies RoPE separately to the two halves of dimension 0 of `cur` and concatenates the
// results: the first half is rotated with positions `pos_a`, the second with `pos_b`.
//
// cur             - tensor whose ne[0..2] are read as (n_dim, n_head, n_pos)
// freq_base       - RoPE frequency base
// interleave_freq - when true, the second half uses a frequency scale of
//                   freq_base^(-2/n_dim) instead of 1
ggml_tensor * clip_graph::build_rope_2d(
    ggml_context * ctx0,
    ggml_tensor * cur,
    ggml_tensor * pos_a, // first half
    ggml_tensor * pos_b, // second half
    const float freq_base,
    const bool interleave_freq
) {
    const int64_t n_dim  = cur->ne[0];
    const int64_t n_head = cur->ne[1];
    const int64_t n_pos  = cur->ne[2];

    // Example: for n_dim = 8 there are 4 inverse frequencies (1e-0, 1e-1, 1e-2, 1e-3).
    // The first half should use the even ones (1e-0, 1e-2), the second half the odd ones
    // (1e-1, 1e-3). Rotating only n_dim/2 dimensions yields the even ones automatically,
    // because -2(2i) / n_dim == -2i / (n_dim/2). For the odd ones, substitute (2i+1) for
    // (2i): the extra factor is constant and is applied through freq_scale.
    float freq_scale_odd = 1.0f;
    if (interleave_freq) {
        freq_scale_odd = std::pow(freq_base, -2.0f / n_dim);
    }

    // view one half of dimension 0 starting at `byte_offset` and rotate it
    auto rope_half = [&](size_t byte_offset, ggml_tensor * positions, float freq_scale) {
        ggml_tensor * half = ggml_view_3d(ctx0, cur,
            n_dim/2, n_head, n_pos,
            cur->nb[1],
            cur->nb[2],
            byte_offset);
        return ggml_rope_ext(
            ctx0,
            half,
            positions,
            nullptr,  // freq factors
            n_dim/2,  // n_dims
            0, 0, freq_base,
            freq_scale,
            0.0f, 1.0f, 0.0f, 0.0f
        );
    };

    // built one after the other so that the order of node creation stays fixed
    ggml_tensor * first  = rope_half(0, pos_a, 1.0f);
    ggml_tensor * second = rope_half(n_dim/2 * ggml_element_size(cur), pos_b, freq_scale_odd);

    return ggml_concat(ctx0, first, second, 0);
}
|
|
724
|
+
|
|
725
|
+
// Generic function to stack frames for audio processing
|
|
726
|
+
// Abstracts out the StackAudioFrames logic used by ultravox
|
|
727
|
+
// Views `cur` as rows of n_embed * stack_factor elements, zero-padding the flattened tensor
// first when its element count is not a multiple of that row length.
// A stack_factor of 1 or less returns `cur` unchanged.
ggml_tensor * clip_graph::build_stack(ggml_tensor * cur, int32_t stack_factor, int32_t n_embed) {
    if (stack_factor <= 1) {
        return cur;
    }

    const int64_t n_elems    = ggml_nelements(cur);
    const int64_t row_len    = n_embed * stack_factor;
    const int64_t padded_len = GGML_PAD(n_elems, row_len);
    const int64_t n_pad      = padded_len - n_elems;

    if (n_pad > 0) {
        // flatten, then append the padding so that the length is divisible by row_len
        ggml_tensor * flat = ggml_view_1d(ctx0, cur, n_elems, 0);
        cur = ggml_pad(ctx0, flat, n_pad, 0, 0, 0);
    }

    // -> [row_len, padded_len / row_len]
    const int64_t n_rows = padded_len / row_len;
    return ggml_view_2d(ctx0, cur, row_len, n_rows,
                        ggml_row_size(cur->type, row_len), 0);
}
|
|
750
|
+
|
|
751
|
+
// aka pixel_shuffle / pixel_unshuffle / patch_merger (Kimi-VL)
|
|
752
|
+
// support dynamic resolution
|
|
753
|
+
// Merges scale_factor x scale_factor neighbouring patches into one position by moving them
// into the embedding dimension. The patch grid is padded up to a multiple of scale_factor first.
// Requires scale_factor > 1.
ggml_tensor * clip_graph::build_patch_merge_permute(ggml_tensor * cur, int scale_factor) {
    GGML_ASSERT(scale_factor > 1);

    const int embd_dim = cur->ne[0];
    int grid_w = img.nx / patch_size;
    int grid_h = img.ny / patch_size;

    // amount of padding needed to make the grid divisible by the factor
    const int64_t extra_w = CLIP_ALIGN(grid_w, scale_factor) - grid_w;
    const int64_t extra_h = CLIP_ALIGN(grid_h, scale_factor) - grid_h;

    cur = ggml_reshape_3d(ctx0, cur, embd_dim, grid_w, grid_h);
    if (extra_w || extra_h) {
        cur = ggml_pad(ctx0, cur, 0, extra_w, extra_h, 0);
        grid_w += extra_w;
        grid_h += extra_h;
    }

    // unshuffle h
    cur = ggml_reshape_3d(ctx0, cur, embd_dim * scale_factor, grid_w / scale_factor, grid_h);
    cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);

    // unshuffle w
    cur = ggml_cont_3d(ctx0, cur, embd_dim * scale_factor * scale_factor, grid_h / scale_factor, grid_w / scale_factor);
    cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);

    // collapse the two grid dimensions into one
    ggml_tensor * merged = ggml_cont_2d(ctx0, cur, cur->ne[0], cur->ne[1] * cur->ne[2]);
    cb(merged, "pixel_shuffle", -1);

    return merged;
}
|
|
783
|
+
|
|
784
|
+
// Build the compute graph for a single input entry.
//
// Picks the clip_graph subclass that matches ctx->proj_type(), constructs it
// for the (only) entry of the batch and returns the result of its build().
// Aborts if the batch does not contain exactly one entry or if the projector
// type has no graph builder registered below.
static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch & imgs) {
    GGML_ASSERT(imgs.entries.size() == 1 && "n_batch > 1 is not supported");

    const clip_image_f32 & img = *imgs.entries[0];
    std::unique_ptr<clip_graph> graph_builder;

    switch (ctx->proj_type()) {
        // --- siglip-based vision encoders ---
        case PROJECTOR_TYPE_GEMMA3:
        case PROJECTOR_TYPE_IDEFICS3:
        case PROJECTOR_TYPE_LFM2:
        case PROJECTOR_TYPE_JANUS_PRO:
            graph_builder = std::make_unique<clip_graph_siglip>(ctx, img);
            break;

        case PROJECTOR_TYPE_GEMMA3NV:
            graph_builder = std::make_unique<clip_graph_mobilenetv5>(ctx, img);
            break;

        case PROJECTOR_TYPE_PIXTRAL:
        case PROJECTOR_TYPE_LIGHTONOCR:
            graph_builder = std::make_unique<clip_graph_pixtral>(ctx, img);
            break;

        case PROJECTOR_TYPE_QWEN2VL:
        case PROJECTOR_TYPE_QWEN25VL:
            graph_builder = std::make_unique<clip_graph_qwen2vl>(ctx, img);
            break;

        case PROJECTOR_TYPE_QWEN3VL:
            graph_builder = std::make_unique<clip_graph_qwen3vl>(ctx, img);
            break;

        case PROJECTOR_TYPE_MINICPMV:
            graph_builder = std::make_unique<clip_graph_minicpmv>(ctx, img);
            break;

        case PROJECTOR_TYPE_INTERNVL:
            graph_builder = std::make_unique<clip_graph_internvl>(ctx, img);
            break;

        case PROJECTOR_TYPE_NEMOTRON_V2_VL:
            graph_builder = std::make_unique<clip_graph_nemotron_v2_vl>(ctx, img);
            break;

        case PROJECTOR_TYPE_LLAMA4:
            graph_builder = std::make_unique<clip_graph_llama4>(ctx, img);
            break;

        // --- audio encoders sharing the whisper encoder graph ---
        case PROJECTOR_TYPE_ULTRAVOX:
        case PROJECTOR_TYPE_VOXTRAL:
        case PROJECTOR_TYPE_QWEN2A:
        case PROJECTOR_TYPE_GLMA:
        case PROJECTOR_TYPE_MUSIC_FLAMINGO:
            graph_builder = std::make_unique<clip_graph_whisper_enc>(ctx, img);
            break;

        case PROJECTOR_TYPE_KIMIVL:
            graph_builder = std::make_unique<clip_graph_kimivl>(ctx, img);
            break;

        case PROJECTOR_TYPE_PADDLEOCR:
            graph_builder = std::make_unique<clip_graph_paddleocr>(ctx, img);
            break;

        case PROJECTOR_TYPE_KIMIK25:
            graph_builder = std::make_unique<clip_graph_kimik25>(ctx, img);
            break;

        case PROJECTOR_TYPE_COGVLM:
            graph_builder = std::make_unique<clip_graph_cogvlm>(ctx, img);
            break;

        // --- llava-style projectors ---
        case PROJECTOR_TYPE_MLP:
        case PROJECTOR_TYPE_MLP_NORM:
        case PROJECTOR_TYPE_LDP:
        case PROJECTOR_TYPE_LDPV2:
        case PROJECTOR_TYPE_GLM_EDGE:
            graph_builder = std::make_unique<clip_graph_llava>(ctx, img);
            break;

        case PROJECTOR_TYPE_LFM2A:
            graph_builder = std::make_unique<clip_graph_conformer>(ctx, img);
            break;

        case PROJECTOR_TYPE_GLM4V:
            graph_builder = std::make_unique<clip_graph_glm4v>(ctx, img);
            break;

        case PROJECTOR_TYPE_YOUTUVL:
            graph_builder = std::make_unique<clip_graph_youtuvl>(ctx, img);
            break;

        default:
            GGML_ABORT("missing cgraph builder");
    }

    return graph_builder->build();
}
|
|
882
|
+
|
|
883
|
+
//
|
|
884
|
+
// clip_model_loader
|
|
885
|
+
//
|
|
886
|
+
|
|
887
|
+
struct clip_model_loader {
|
|
888
|
+
ggml_context_ptr ctx_meta;
|
|
889
|
+
gguf_context_ptr ctx_gguf;
|
|
890
|
+
|
|
891
|
+
std::string fname;
|
|
892
|
+
|
|
893
|
+
size_t model_size = 0; // in bytes
|
|
894
|
+
|
|
895
|
+
bool has_vision = false;
|
|
896
|
+
bool has_audio = false;
|
|
897
|
+
|
|
898
|
+
// TODO @ngxson : we should not pass clip_ctx here, it should be clip_model
|
|
899
|
+
clip_model_loader(const char * fname) : fname(fname) {
|
|
900
|
+
struct ggml_context * meta = nullptr;
|
|
901
|
+
|
|
902
|
+
struct gguf_init_params params = {
|
|
903
|
+
/*.no_alloc = */ true,
|
|
904
|
+
/*.ctx = */ &meta,
|
|
905
|
+
};
|
|
906
|
+
|
|
907
|
+
ctx_gguf = gguf_context_ptr(gguf_init_from_file(fname, params));
|
|
908
|
+
if (!ctx_gguf.get()) {
|
|
909
|
+
throw std::runtime_error(string_format("%s: failed to load CLIP model from %s. Does this file exist?\n", __func__, fname));
|
|
910
|
+
}
|
|
911
|
+
|
|
912
|
+
ctx_meta.reset(meta);
|
|
913
|
+
|
|
914
|
+
const int n_tensors = gguf_get_n_tensors(ctx_gguf.get());
|
|
915
|
+
|
|
916
|
+
// print gguf info
|
|
917
|
+
{
|
|
918
|
+
std::string name;
|
|
919
|
+
get_string(KEY_NAME, name, false);
|
|
920
|
+
std::string description;
|
|
921
|
+
get_string(KEY_DESCRIPTION, description, false);
|
|
922
|
+
LOG_INF("%s: model name: %s\n", __func__, name.c_str());
|
|
923
|
+
LOG_INF("%s: description: %s\n", __func__, description.c_str());
|
|
924
|
+
LOG_INF("%s: GGUF version: %d\n", __func__, gguf_get_version(ctx_gguf.get()));
|
|
925
|
+
LOG_INF("%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx_gguf.get()));
|
|
926
|
+
LOG_INF("%s: n_tensors: %d\n", __func__, n_tensors);
|
|
927
|
+
LOG_INF("%s: n_kv: %d\n", __func__, (int)gguf_get_n_kv(ctx_gguf.get()));
|
|
928
|
+
LOG_INF("\n");
|
|
929
|
+
}
|
|
930
|
+
|
|
931
|
+
// modalities
|
|
932
|
+
{
|
|
933
|
+
get_bool(KEY_HAS_VISION_ENC, has_vision, false);
|
|
934
|
+
get_bool(KEY_HAS_AUDIO_ENC, has_audio, false);
|
|
935
|
+
|
|
936
|
+
if (has_vision) {
|
|
937
|
+
LOG_INF("%s: has vision encoder\n", __func__);
|
|
938
|
+
}
|
|
939
|
+
if (has_audio) {
|
|
940
|
+
LOG_INF("%s: has audio encoder\n", __func__);
|
|
941
|
+
}
|
|
942
|
+
}
|
|
943
|
+
|
|
944
|
+
// tensors
|
|
945
|
+
{
|
|
946
|
+
for (int i = 0; i < n_tensors; ++i) {
|
|
947
|
+
const char * name = gguf_get_tensor_name(ctx_gguf.get(), i);
|
|
948
|
+
const size_t offset = gguf_get_tensor_offset(ctx_gguf.get(), i);
|
|
949
|
+
enum ggml_type type = gguf_get_tensor_type(ctx_gguf.get(), i);
|
|
950
|
+
ggml_tensor * cur = ggml_get_tensor(meta, name);
|
|
951
|
+
size_t tensor_size = ggml_nbytes(cur);
|
|
952
|
+
model_size += tensor_size;
|
|
953
|
+
LOG_DBG("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu, shape:[%" PRIu64 ", %" PRIu64 ", %" PRIu64 ", %" PRIu64 "], type = %s\n",
|
|
954
|
+
__func__, i, ggml_n_dims(cur), cur->name, tensor_size, offset, cur->ne[0], cur->ne[1], cur->ne[2], cur->ne[3], ggml_type_name(type));
|
|
955
|
+
}
|
|
956
|
+
}
|
|
957
|
+
}
|
|
958
|
+
|
|
959
|
+
// Read the hyper-parameters for one modality from the GGUF metadata into
// model.hparams, and set model.modality / model.proj_type.
//
// model:    destination; hparams, modality and proj_type are written
// modality: which encoder to load; the matching has_vision / has_audio flag
//           must be set (asserted)
//
// Throws std::runtime_error on an unknown projector type, on conflicting
// use_gelu/use_silu flags, on malformed image_mean/image_std arrays and when
// image_max_pixels ends up smaller than image_min_pixels. Other failures are
// reported through GGML_ASSERT / GGML_ABORT.
void load_hparams(clip_model & model, clip_modality modality) {
    auto & hparams = model.hparams;
    std::string log_ffn_op; // for logging

    // sanity check
    if (modality == CLIP_MODALITY_VISION) {
        GGML_ASSERT(has_vision);
    } else if (modality == CLIP_MODALITY_AUDIO) {
        GGML_ASSERT(has_audio);
    }
    model.modality = modality;


    // projector type
    std::string proj_type;
    {
        // default key
        get_string(KEY_PROJ_TYPE, proj_type, false);

        // for models with mixed modalities
        if (proj_type.empty()) {
            if (modality == CLIP_MODALITY_VISION) {
                get_string(KEY_VISION_PROJ_TYPE, proj_type, false);
            } else if (modality == CLIP_MODALITY_AUDIO) {
                get_string(KEY_AUDIO_PROJ_TYPE, proj_type, false);
            } else {
                GGML_ABORT("unknown modality");
            }
        }

        model.proj_type = clip_projector_type_from_string(proj_type);

        if (model.proj_type == PROJECTOR_TYPE_UNKNOWN) {
            throw std::runtime_error(string_format("%s: unknown projector type: %s\n", __func__, proj_type.c_str()));
        }

        // correct arch for multimodal models (legacy method)
        if (model.proj_type == PROJECTOR_TYPE_QWEN25O) {
            model.proj_type = modality == CLIP_MODALITY_VISION
                                ? PROJECTOR_TYPE_QWEN25VL
                                : PROJECTOR_TYPE_QWEN2A;
        }
    }

    const bool is_vision = model.modality == CLIP_MODALITY_VISION;
    const bool is_audio  = model.modality == CLIP_MODALITY_AUDIO;

    // other hparams
    {
        const char * prefix = is_vision ? "vision" : "audio";
        get_u32(string_format(KEY_N_EMBD,         prefix), hparams.n_embd);
        get_u32(string_format(KEY_N_HEAD,         prefix), hparams.n_head);
        get_u32(string_format(KEY_N_FF,           prefix), hparams.n_ff);
        get_u32(string_format(KEY_N_BLOCK,        prefix), hparams.n_layer);
        get_u32(string_format(KEY_PROJ_DIM,       prefix), hparams.projection_dim);
        get_f32(string_format(KEY_LAYER_NORM_EPS, prefix), hparams.eps);

        if (is_vision) {
            get_u32(KEY_IMAGE_SIZE, hparams.image_size);
            get_u32(KEY_PATCH_SIZE, hparams.patch_size);
            get_u32(KEY_IMAGE_CROP_RESOLUTION, hparams.image_crop_resolution, false);
            get_i32(KEY_MINICPMV_VERSION, hparams.minicpmv_version, false); // legacy
            get_u32(KEY_MINICPMV_QUERY_NUM, hparams.minicpmv_query_num, false);
            if (hparams.minicpmv_query_num == 0) {
                // Fallback to hardcoded values for legacy models
                switch (hparams.minicpmv_version) {
                    case 3:
                    case 4:
                    case 5:
                    case 6:
                    case 100045:
                        hparams.minicpmv_query_num = 64;
                        break;
                    default:
                        hparams.minicpmv_query_num = 96;
                        break;
                }
            }
        } else if (is_audio) {
            get_u32(KEY_A_NUM_MEL_BINS, hparams.n_mel_bins);
            // some hparams are unused, but still need to set to avoid issues
            hparams.image_size = 0;
            hparams.patch_size = 1;

        } else {
            GGML_ASSERT(false && "unknown modality");
        }

        // for pinpoints, we need to convert it into a list of resolution candidates
        {
            std::vector<int> pinpoints;
            get_arr_int(KEY_IMAGE_GRID_PINPOINTS, pinpoints, false);
            if (pinpoints.size() % 2 != 0) {
                // values come in pairs; an unpaired trailing value cannot form
                // a candidate and must not be read past the end of the vector
                LOG_WRN("%s: image grid pinpoints has an odd number of values (%zu), ignoring the last one\n", __func__, pinpoints.size());
            }
            for (size_t i = 0; i + 1 < pinpoints.size(); i += 2) {
                hparams.image_res_candidates.push_back({
                    pinpoints[i],
                    pinpoints[i+1],
                });
            }
        }

        // default warmup value
        hparams.warmup_image_size = hparams.image_size;

        hparams.has_llava_projector = model.proj_type == PROJECTOR_TYPE_MLP
                                   || model.proj_type == PROJECTOR_TYPE_MLP_NORM
                                   || model.proj_type == PROJECTOR_TYPE_LDP
                                   || model.proj_type == PROJECTOR_TYPE_LDPV2;

        {
            bool use_gelu = false;
            bool use_silu = false;
            get_bool(KEY_USE_GELU, use_gelu, false);
            get_bool(KEY_USE_SILU, use_silu, false);
            if (use_gelu && use_silu) {
                throw std::runtime_error(string_format("%s: both use_gelu and use_silu are set to true\n", __func__));
            }
            if (use_gelu) {
                hparams.ffn_op = FFN_GELU;
                log_ffn_op = "gelu";
            } else if (use_silu) {
                hparams.ffn_op = FFN_SILU;
                log_ffn_op = "silu";
            } else {
                hparams.ffn_op = FFN_GELU_QUICK;
                log_ffn_op = "gelu_quick";
            }
        }

        {
            std::string mm_patch_merge_type;
            get_string(KEY_MM_PATCH_MERGE_TYPE, mm_patch_merge_type, false);
            if (mm_patch_merge_type == "spatial_unpad") {
                hparams.mm_patch_merge_type = PATCH_MERGE_SPATIAL_UNPAD;
            }
        }

        if (is_vision) {
            int idx_mean = gguf_find_key(ctx_gguf.get(), KEY_IMAGE_MEAN);
            int idx_std  = gguf_find_key(ctx_gguf.get(), KEY_IMAGE_STD);
            GGML_ASSERT(idx_mean >= 0 && "image_mean not found");
            GGML_ASSERT(idx_std  >= 0 && "image_std not found");

            // the raw array data is read below as 3 floats, so make sure the
            // file really stores a float32 array with at least 3 elements
            const int    rgb_idx[2]  = { idx_mean, idx_std };
            const char * rgb_name[2] = { "image_mean", "image_std" };
            for (int k = 0; k < 2; ++k) {
                const bool is_f32_arr = gguf_get_kv_type(ctx_gguf.get(), rgb_idx[k]) == GGUF_TYPE_ARRAY
                                     && gguf_get_arr_type(ctx_gguf.get(), rgb_idx[k]) == GGUF_TYPE_FLOAT32;
                if (!is_f32_arr || gguf_get_arr_n(ctx_gguf.get(), rgb_idx[k]) < 3) {
                    throw std::runtime_error(string_format("%s: %s must be a float32 array with at least 3 elements\n", __func__, rgb_name[k]));
                }
            }

            const float * mean_data = (const float *) gguf_get_arr_data(ctx_gguf.get(), idx_mean);
            const float * std_data  = (const float *) gguf_get_arr_data(ctx_gguf.get(), idx_std);
            for (int i = 0; i < 3; ++i) {
                hparams.image_mean[i] = mean_data[i];
                hparams.image_std[i]  = std_data[i];
            }
        }

        // Load the vision feature layer indices if they are explicitly provided;
        // if multiple vision feature layers are present, the values will be concatenated
        // to form the final visual features.
        // NOTE: gguf conversions should standardize the values of the vision feature layer to
        // be non-negative, since we use -1 to mark values as unset here.
        std::vector<int> vision_feature_layer;
        get_arr_int(KEY_FEATURE_LAYER, vision_feature_layer, false);
        // convert std::vector to std::unordered_set
        for (auto & layer : vision_feature_layer) {
            hparams.vision_feature_layer.insert(layer);
        }

        // model-specific params
        switch (model.proj_type) {
            case PROJECTOR_TYPE_MINICPMV:
                {
                    if (hparams.minicpmv_version == 0) {
                        hparams.minicpmv_version = 2; // default to 2 if not set
                    }
                } break;
            case PROJECTOR_TYPE_INTERNVL:
            case PROJECTOR_TYPE_NEMOTRON_V2_VL:
                {
                    get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);
                } break;
            case PROJECTOR_TYPE_IDEFICS3:
                {
                    get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);
                    get_u32(KEY_PREPROC_IMAGE_SIZE, hparams.image_longest_edge, false);
                } break;
            case PROJECTOR_TYPE_LFM2:
                {
                    get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);
                    // ref: https://huggingface.co/LiquidAI/LFM2.5-VL-1.6B/blob/main/processor_config.json
                    hparams.set_limit_image_tokens(64, 256);
                } break;
            case PROJECTOR_TYPE_PIXTRAL:
            case PROJECTOR_TYPE_LIGHTONOCR:
                {
                    // ref: https://huggingface.co/mistral-community/pixtral-12b/blob/main/preprocessor_config.json
                    // TODO: verify the image_min_tokens
                    hparams.n_merge = 1; // the original pixtral does not use patch merging
                    hparams.rope_theta = 10000.0f;
                    get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false);
                    hparams.set_limit_image_tokens(8, 1024);
                    hparams.set_warmup_n_tokens(256); // avoid OOM on warmup
                } break;
            case PROJECTOR_TYPE_KIMIVL:
                {
                    hparams.rope_theta = 10000.0f;
                    get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);
                    // TODO: check kimivl preprocessor for exact values
                    hparams.set_limit_image_tokens(8, 1024);
                    hparams.set_warmup_n_tokens(256); // avoid OOM on warmup
                } break;
            case PROJECTOR_TYPE_KIMIK25:
                {
                    hparams.rope_theta = 10000.0f;
                    get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);

                    int min_pixels = 0, max_pixels = 0;
                    get_u32(KEY_IMAGE_MIN_PIXELS, min_pixels, false);
                    get_u32(KEY_IMAGE_MAX_PIXELS, max_pixels, false);
                    if (min_pixels > 0 && max_pixels > 0) {
                        hparams.image_min_pixels = min_pixels;
                        hparams.image_max_pixels = max_pixels;
                        hparams.warmup_image_size = static_cast<int>(std::sqrt(max_pixels));
                    } else {
                        hparams.set_limit_image_tokens(2, 4096);
                    }
                } break;
            case PROJECTOR_TYPE_GEMMA3:
                {
                    // default value (used by all model sizes in gemma 3 family)
                    // number of patches for each **side** is reduced by a factor of 4
                    hparams.n_merge = 4;
                    // test model (tinygemma3) has a different value, we optionally read it
                    get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);
                } break;

            case PROJECTOR_TYPE_GEMMA3NV:
                {
                    // Gemma3n uses MobileNetV5 which produces 256 tokens (16x16)
                    // Similar configuration to Gemma3
                    hparams.n_merge = 1; // MobileNetV5 handles resizing internally
                    get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);
                } break;
            case PROJECTOR_TYPE_QWEN2VL:
            case PROJECTOR_TYPE_QWEN25VL:
            case PROJECTOR_TYPE_QWEN3VL:
                {
                    hparams.n_merge = 2; // default value for Qwen 2 and 2.5
                    get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false);
                    get_u32(KEY_WIN_ATTN_PATTERN, hparams.n_wa_pattern, model.proj_type == PROJECTOR_TYPE_QWEN25VL); // only 2.5 requires it
                    // ref: https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct/blob/main/preprocessor_config.json
                    hparams.set_limit_image_tokens(8, 4096);
                    hparams.set_warmup_n_tokens(46*46); // avoid OOM on warmup
                    const int warn_min_pixels = 1024 * hparams.n_merge * hparams.n_merge * hparams.patch_size * hparams.patch_size;
                    if (hparams.image_min_pixels < warn_min_pixels) {
                        LOG_WRN("%s: Qwen-VL models require at minimum 1024 image tokens to function correctly on grounding tasks\n", __func__);
                        LOG_WRN("%s: if you encounter problems with accuracy, try adding --image-min-tokens 1024\n", __func__);
                        LOG_WRN("%s: more info: https://github.com/ggml-org/llama.cpp/issues/16842\n\n", __func__);
                    }
                } break;
            case PROJECTOR_TYPE_YOUTUVL:
                {
                    hparams.n_merge = 2;
                    get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false);
                    get_u32(KEY_ATTN_WINDOW_SIZE, hparams.attn_window_size, true);
                    std::vector<int> wa_layer_indexes_vec;
                    get_arr_int(KEY_WIN_ATTN_LAYER_INDEXES, wa_layer_indexes_vec, true);
                    for (auto & layer : wa_layer_indexes_vec) {
                        hparams.wa_layer_indexes.insert(layer);
                    }
                    // support max_height * max_width = 8000 * 8000. 8000/16/2 = 250 image tokens
                    hparams.set_limit_image_tokens(1, 62500);
                    hparams.set_warmup_n_tokens(16*16); // avoid OOM on warmup
                } break;
            case PROJECTOR_TYPE_GLM4V:
                {
                    hparams.rope_theta = 10000.0f;
                    hparams.n_merge = 2; // default value for GLM4-V
                    get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false);
                    hparams.set_limit_image_tokens(8, 4096);
                    hparams.set_warmup_n_tokens(46*46); // avoid OOM on warmup
                } break;
            case PROJECTOR_TYPE_LLAMA4:
                {
                    hparams.rope_theta = 10000.0f;
                    get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);
                    set_llava_uhd_res_candidates(model, 3);
                } break;
            case PROJECTOR_TYPE_ULTRAVOX:
            case PROJECTOR_TYPE_QWEN2A:
            case PROJECTOR_TYPE_GLMA:
            case PROJECTOR_TYPE_VOXTRAL:
            case PROJECTOR_TYPE_MUSIC_FLAMINGO:
                {
                    bool require_stack = model.proj_type == PROJECTOR_TYPE_ULTRAVOX ||
                                         model.proj_type == PROJECTOR_TYPE_VOXTRAL ||
                                         model.proj_type == PROJECTOR_TYPE_GLMA;
                    get_u32(KEY_A_PROJ_STACK_FACTOR, hparams.proj_stack_factor, require_stack);
                    hparams.ffn_op = FFN_GELU_ERF;
                    log_ffn_op = "gelu_erf"; // temporary solution for logging

                    // audio preprocessing params
                    hparams.audio_chunk_len   = 30; // in seconds
                    hparams.audio_sample_rate = 16000;
                    hparams.audio_n_fft       = 400;
                    hparams.audio_window_len  = 400;
                    hparams.audio_hop_len     = 160;
                } break;
            case PROJECTOR_TYPE_PADDLEOCR:
                {
                    hparams.n_merge = 2;
                    get_u32(KEY_IMAGE_MIN_PIXELS, hparams.image_min_pixels);
                    get_u32(KEY_IMAGE_MAX_PIXELS, hparams.image_max_pixels);

                    hparams.set_warmup_n_tokens(28*28); // avoid OOM on warmup
                } break;
            case PROJECTOR_TYPE_LFM2A:
                {
                    // audio preprocessing params
                    hparams.audio_chunk_len   = 1; // in seconds
                    hparams.audio_sample_rate = 16000;
                    hparams.audio_n_fft       = 512;
                    hparams.audio_window_len  = 400;
                    hparams.audio_hop_len     = 160;
                } break;
            default:
                break;
        }

        // sanity check
        {
            if (hparams.image_max_pixels < hparams.image_min_pixels) {
                throw std::runtime_error(string_format("%s: image_max_pixels (%d) is less than image_min_pixels (%d)\n", __func__, hparams.image_max_pixels, hparams.image_min_pixels));
            }
        }

        LOG_INF("%s: projector: %s\n", __func__, proj_type.c_str());
        LOG_INF("%s: n_embd: %d\n", __func__, hparams.n_embd);
        LOG_INF("%s: n_head: %d\n", __func__, hparams.n_head);
        LOG_INF("%s: n_ff: %d\n", __func__, hparams.n_ff);
        LOG_INF("%s: n_layer: %d\n", __func__, hparams.n_layer);
        LOG_INF("%s: ffn_op: %s\n", __func__, log_ffn_op.c_str());
        LOG_INF("%s: projection_dim: %d\n", __func__, hparams.projection_dim);
        if (is_vision) {
            LOG_INF("\n--- vision hparams ---\n");
            LOG_INF("%s: image_size: %d\n", __func__, hparams.image_size);
            LOG_INF("%s: patch_size: %d\n", __func__, hparams.patch_size);
            LOG_INF("%s: has_llava_proj: %d\n", __func__, hparams.has_llava_projector);
            LOG_INF("%s: minicpmv_version: %d\n", __func__, hparams.minicpmv_version);
            LOG_INF("%s: n_merge: %d\n", __func__, hparams.n_merge);
            LOG_INF("%s: n_wa_pattern: %d\n", __func__, hparams.n_wa_pattern);
            if (!hparams.wa_layer_indexes.empty()) {
                LOG_INF("%s: wa_layer_indexes: ", __func__);
                for (auto & layer : hparams.wa_layer_indexes) {
                    LOG_INF("%d ", layer);
                }
                LOG_INF("\n");
            }
            if (hparams.image_min_pixels > 0) {
                LOG_INF("%s: image_min_pixels: %d%s\n", __func__, hparams.image_min_pixels, hparams.custom_image_min_tokens > 0 ? " (custom value)" : "");
            }
            if (hparams.image_max_pixels > 0) {
                LOG_INF("%s: image_max_pixels: %d%s\n", __func__, hparams.image_max_pixels, hparams.custom_image_max_tokens > 0 ? " (custom value)" : "");
            }
        } else if (is_audio) {
            LOG_INF("\n--- audio hparams ---\n");
            LOG_INF("%s: n_mel_bins: %d\n", __func__, hparams.n_mel_bins);
            LOG_INF("%s: proj_stack_factor: %d\n", __func__, hparams.proj_stack_factor);
            LOG_INF("%s: audio_chunk_len: %d\n", __func__, hparams.audio_chunk_len);
            LOG_INF("%s: audio_sample_rate: %d\n", __func__, hparams.audio_sample_rate);
            LOG_INF("%s: audio_n_fft: %d\n", __func__, hparams.audio_n_fft);
            LOG_INF("%s: audio_window_len: %d\n", __func__, hparams.audio_window_len);
            LOG_INF("%s: audio_hop_len: %d\n", __func__, hparams.audio_hop_len);
        }
        LOG_INF("\n");
        LOG_INF("%s: model size: %.2f MiB\n", __func__, model_size / 1024.0 / 1024.0);
        LOG_INF("%s: metadata size: %.2f MiB\n", __func__, ggml_get_mem_size(ctx_meta.get()) / 1024.0 / 1024.0);
    }
}
|
|
1334
|
+
|
|
1335
|
+
void load_tensors(clip_ctx & ctx_clip) {
|
|
1336
|
+
auto & model = ctx_clip.model;
|
|
1337
|
+
auto & hparams = model.hparams;
|
|
1338
|
+
std::map<std::string, size_t> tensor_offset;
|
|
1339
|
+
std::vector<ggml_tensor *> tensors_to_load;
|
|
1340
|
+
|
|
1341
|
+
// TODO @ngxson : support both audio and video in the future
|
|
1342
|
+
const char * prefix = model.modality == CLIP_MODALITY_AUDIO ? "a" : "v";
|
|
1343
|
+
|
|
1344
|
+
// get offsets
|
|
1345
|
+
for (int64_t i = 0; i < gguf_get_n_tensors(ctx_gguf.get()); ++i) {
|
|
1346
|
+
const char * name = gguf_get_tensor_name(ctx_gguf.get(), i);
|
|
1347
|
+
tensor_offset[name] = gguf_get_data_offset(ctx_gguf.get()) + gguf_get_tensor_offset(ctx_gguf.get(), i);
|
|
1348
|
+
}
|
|
1349
|
+
|
|
1350
|
+
// create data context
|
|
1351
|
+
struct ggml_init_params params = {
|
|
1352
|
+
/*.mem_size =*/ static_cast<size_t>(gguf_get_n_tensors(ctx_gguf.get()) + 1) * ggml_tensor_overhead(),
|
|
1353
|
+
/*.mem_buffer =*/ NULL,
|
|
1354
|
+
/*.no_alloc =*/ true,
|
|
1355
|
+
};
|
|
1356
|
+
ctx_clip.ctx_data.reset(ggml_init(params));
|
|
1357
|
+
if (!ctx_clip.ctx_data) {
|
|
1358
|
+
throw std::runtime_error(string_format("%s: failed to init ggml context\n", __func__));
|
|
1359
|
+
}
|
|
1360
|
+
|
|
1361
|
+
// helper function
|
|
1362
|
+
auto get_tensor = [&](const std::string & name, bool required = true) {
|
|
1363
|
+
ggml_tensor * cur = ggml_get_tensor(ctx_meta.get(), name.c_str());
|
|
1364
|
+
if (!cur && required) {
|
|
1365
|
+
throw std::runtime_error(string_format("%s: unable to find tensor %s\n", __func__, name.c_str()));
|
|
1366
|
+
}
|
|
1367
|
+
if (cur) {
|
|
1368
|
+
tensors_to_load.push_back(cur);
|
|
1369
|
+
// add tensors to context
|
|
1370
|
+
ggml_tensor * data_tensor = ggml_dup_tensor(ctx_clip.ctx_data.get(), cur);
|
|
1371
|
+
ggml_set_name(data_tensor, cur->name);
|
|
1372
|
+
cur = data_tensor;
|
|
1373
|
+
}
|
|
1374
|
+
return cur;
|
|
1375
|
+
};
|
|
1376
|
+
|
|
1377
|
+
model.class_embedding = get_tensor(TN_CLASS_EMBD, false);
|
|
1378
|
+
|
|
1379
|
+
model.pre_ln_w = get_tensor(string_format(TN_LN_PRE, prefix, "weight"), false);
|
|
1380
|
+
model.pre_ln_b = get_tensor(string_format(TN_LN_PRE, prefix, "bias"), false);
|
|
1381
|
+
|
|
1382
|
+
model.post_ln_w = get_tensor(string_format(TN_LN_POST, prefix, "weight"), false);
|
|
1383
|
+
model.post_ln_b = get_tensor(string_format(TN_LN_POST, prefix, "bias"), false);
|
|
1384
|
+
|
|
1385
|
+
model.patch_bias = get_tensor(TN_PATCH_BIAS, false);
|
|
1386
|
+
model.patch_embeddings_0 = get_tensor(TN_PATCH_EMBD, false);
|
|
1387
|
+
model.patch_embeddings_1 = get_tensor(TN_PATCH_EMBD_1, false);
|
|
1388
|
+
|
|
1389
|
+
model.norm_embd_w = get_tensor(string_format(TN_NORM_EMBD, "weight"), false);
|
|
1390
|
+
model.norm_embd_b = get_tensor(string_format(TN_NORM_EMBD, "bias"), false);
|
|
1391
|
+
|
|
1392
|
+
model.position_embeddings = get_tensor(string_format(TN_POS_EMBD, prefix), false);
|
|
1393
|
+
|
|
1394
|
+
if (model.proj_type == PROJECTOR_TYPE_GEMMA3NV) {
|
|
1395
|
+
hparams.n_layer = 0; // gemma3n does not use normal layer structure
|
|
1396
|
+
}
|
|
1397
|
+
|
|
1398
|
+
// layers
|
|
1399
|
+
model.layers.resize(hparams.n_layer);
|
|
1400
|
+
for (int il = 0; il < hparams.n_layer; ++il) {
|
|
1401
|
+
auto & layer = model.layers[il];
|
|
1402
|
+
layer.k_w = get_tensor(string_format(TN_ATTN_K, prefix, il, "weight"), false);
|
|
1403
|
+
layer.q_w = get_tensor(string_format(TN_ATTN_Q, prefix, il, "weight"), false);
|
|
1404
|
+
layer.v_w = get_tensor(string_format(TN_ATTN_V, prefix, il, "weight"), false);
|
|
1405
|
+
layer.o_w = get_tensor(string_format(TN_ATTN_OUTPUT, prefix, il, "weight"));
|
|
1406
|
+
layer.qkv_w = get_tensor(string_format(TN_ATTN_QKV, prefix, il, "weight"), false);
|
|
1407
|
+
layer.k_norm = get_tensor(string_format(TN_ATTN_K_NORM, prefix, il, "weight"), false);
|
|
1408
|
+
layer.q_norm = get_tensor(string_format(TN_ATTN_Q_NORM, prefix, il, "weight"), false);
|
|
1409
|
+
layer.ln_1_w = get_tensor(string_format(TN_LN_1, prefix, il, "weight"), false);
|
|
1410
|
+
layer.ln_2_w = get_tensor(string_format(TN_LN_2, prefix, il, "weight"), false);
|
|
1411
|
+
layer.ls_1_w = get_tensor(string_format(TN_LS_1, prefix, il, "weight"), false); // no bias
|
|
1412
|
+
layer.ls_2_w = get_tensor(string_format(TN_LS_2, prefix, il, "weight"), false); // no bias
|
|
1413
|
+
|
|
1414
|
+
layer.k_b = get_tensor(string_format(TN_ATTN_K, prefix, il, "bias"), false);
|
|
1415
|
+
layer.q_b = get_tensor(string_format(TN_ATTN_Q, prefix, il, "bias"), false);
|
|
1416
|
+
layer.v_b = get_tensor(string_format(TN_ATTN_V, prefix, il, "bias"), false);
|
|
1417
|
+
layer.o_b = get_tensor(string_format(TN_ATTN_OUTPUT, prefix, il, "bias"), false);
|
|
1418
|
+
layer.qkv_b = get_tensor(string_format(TN_ATTN_QKV, prefix, il, "bias"), false);
|
|
1419
|
+
layer.ln_1_b = get_tensor(string_format(TN_LN_1, prefix, il, "bias"), false);
|
|
1420
|
+
layer.ln_2_b = get_tensor(string_format(TN_LN_2, prefix, il, "bias"), false);
|
|
1421
|
+
|
|
1422
|
+
// ffn
|
|
1423
|
+
layer.ff_up_w = get_tensor(string_format(TN_FFN_UP, prefix, il, "weight"));
|
|
1424
|
+
layer.ff_up_b = get_tensor(string_format(TN_FFN_UP, prefix, il, "bias"), false);
|
|
1425
|
+
layer.ff_gate_w = get_tensor(string_format(TN_FFN_GATE, prefix, il, "weight"), false);
|
|
1426
|
+
layer.ff_gate_b = get_tensor(string_format(TN_FFN_GATE, prefix, il, "bias"), false);
|
|
1427
|
+
layer.ff_down_w = get_tensor(string_format(TN_FFN_DOWN, prefix, il, "weight"));
|
|
1428
|
+
layer.ff_down_b = get_tensor(string_format(TN_FFN_DOWN, prefix, il, "bias"), false);
|
|
1429
|
+
|
|
1430
|
+
|
|
1431
|
+
// qwen3vl deepstack layer
|
|
1432
|
+
layer.deepstack_norm_w = get_tensor(string_format(TN_DEEPSTACK_NORM, il, "weight"), false);
|
|
1433
|
+
layer.deepstack_norm_b = get_tensor(string_format(TN_DEEPSTACK_NORM, il, "bias"), false);
|
|
1434
|
+
layer.deepstack_fc1_w = get_tensor(string_format(TN_DEEPSTACK_FC1, il, "weight"), false);
|
|
1435
|
+
layer.deepstack_fc1_b = get_tensor(string_format(TN_DEEPSTACK_FC1, il, "bias"), false);
|
|
1436
|
+
layer.deepstack_fc2_w = get_tensor(string_format(TN_DEEPSTACK_FC2, il, "weight"), false);
|
|
1437
|
+
layer.deepstack_fc2_b = get_tensor(string_format(TN_DEEPSTACK_FC2, il, "bias"), false);
|
|
1438
|
+
if (layer.has_deepstack()) {
|
|
1439
|
+
model.n_deepstack_layers++;
|
|
1440
|
+
}
|
|
1441
|
+
|
|
1442
|
+
// some models already exported with legacy (incorrect) naming which is quite messy, let's fix it here
|
|
1443
|
+
// note: Qwen model converted from the old surgery script has n_ff = 0, so we cannot use n_ff to check!
|
|
1444
|
+
bool is_ffn_swapped = (
|
|
1445
|
+
// only old models need this fix
|
|
1446
|
+
model.proj_type == PROJECTOR_TYPE_MLP
|
|
1447
|
+
|| model.proj_type == PROJECTOR_TYPE_MLP_NORM
|
|
1448
|
+
|| model.proj_type == PROJECTOR_TYPE_LDP
|
|
1449
|
+
|| model.proj_type == PROJECTOR_TYPE_LDPV2
|
|
1450
|
+
|| model.proj_type == PROJECTOR_TYPE_QWEN2VL
|
|
1451
|
+
|| model.proj_type == PROJECTOR_TYPE_QWEN25VL
|
|
1452
|
+
|| model.proj_type == PROJECTOR_TYPE_GLM_EDGE
|
|
1453
|
+
|| model.proj_type == PROJECTOR_TYPE_GEMMA3
|
|
1454
|
+
|| model.proj_type == PROJECTOR_TYPE_IDEFICS3
|
|
1455
|
+
|| model.proj_type == PROJECTOR_TYPE_MINICPMV
|
|
1456
|
+
) && layer.ff_up_w && layer.ff_down_w && layer.ff_down_w->ne[0] == hparams.n_embd;
|
|
1457
|
+
if (is_ffn_swapped) {
|
|
1458
|
+
// swap up and down weights
|
|
1459
|
+
ggml_tensor * tmp = layer.ff_up_w;
|
|
1460
|
+
layer.ff_up_w = layer.ff_down_w;
|
|
1461
|
+
layer.ff_down_w = tmp;
|
|
1462
|
+
// swap up and down biases
|
|
1463
|
+
tmp = layer.ff_up_b;
|
|
1464
|
+
layer.ff_up_b = layer.ff_down_b;
|
|
1465
|
+
layer.ff_down_b = tmp;
|
|
1466
|
+
if (il == 0) {
|
|
1467
|
+
LOG_WRN("%s: ffn up/down are swapped\n", __func__);
|
|
1468
|
+
}
|
|
1469
|
+
}
|
|
1470
|
+
}
|
|
1471
|
+
|
|
1472
|
+
|
|
1473
|
+
switch (model.proj_type) {
|
|
1474
|
+
case PROJECTOR_TYPE_MLP:
|
|
1475
|
+
case PROJECTOR_TYPE_MLP_NORM:
|
|
1476
|
+
{
|
|
1477
|
+
// LLaVA projection
|
|
1478
|
+
model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight"), false);
|
|
1479
|
+
model.mm_0_b = get_tensor(string_format(TN_LLAVA_PROJ, 0, "bias"), false);
|
|
1480
|
+
// Yi-type llava
|
|
1481
|
+
model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight"), false);
|
|
1482
|
+
model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"), false);
|
|
1483
|
+
// missing in Yi-type llava
|
|
1484
|
+
model.mm_2_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"), false);
|
|
1485
|
+
model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"), false);
|
|
1486
|
+
// Yi-type llava
|
|
1487
|
+
model.mm_3_w = get_tensor(string_format(TN_LLAVA_PROJ, 3, "weight"), false);
|
|
1488
|
+
model.mm_3_b = get_tensor(string_format(TN_LLAVA_PROJ, 3, "bias"), false);
|
|
1489
|
+
model.mm_4_w = get_tensor(string_format(TN_LLAVA_PROJ, 4, "weight"), false);
|
|
1490
|
+
model.mm_4_b = get_tensor(string_format(TN_LLAVA_PROJ, 4, "bias"), false);
|
|
1491
|
+
if (model.mm_3_w) {
|
|
1492
|
+
// TODO: this is a hack to support Yi-type llava
|
|
1493
|
+
model.proj_type = PROJECTOR_TYPE_MLP_NORM;
|
|
1494
|
+
}
|
|
1495
|
+
model.image_newline = get_tensor(TN_IMAGE_NEWLINE, false);
|
|
1496
|
+
} break;
|
|
1497
|
+
case PROJECTOR_TYPE_LDP:
|
|
1498
|
+
{
|
|
1499
|
+
// MobileVLM projection
|
|
1500
|
+
model.mm_model_mlp_1_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 1, "weight"));
|
|
1501
|
+
model.mm_model_mlp_1_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 1, "bias"));
|
|
1502
|
+
model.mm_model_mlp_3_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 3, "weight"));
|
|
1503
|
+
model.mm_model_mlp_3_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 3, "bias"));
|
|
1504
|
+
model.mm_model_block_1_block_0_0_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 0, "0.weight"));
|
|
1505
|
+
model.mm_model_block_1_block_0_1_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 0, "1.weight"));
|
|
1506
|
+
model.mm_model_block_1_block_0_1_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 0, "1.bias"));
|
|
1507
|
+
model.mm_model_block_1_block_1_fc1_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc1.weight"));
|
|
1508
|
+
model.mm_model_block_1_block_1_fc1_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc1.bias"));
|
|
1509
|
+
model.mm_model_block_1_block_1_fc2_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc2.weight"));
|
|
1510
|
+
model.mm_model_block_1_block_1_fc2_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc2.bias"));
|
|
1511
|
+
model.mm_model_block_1_block_2_0_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 2, "0.weight"));
|
|
1512
|
+
model.mm_model_block_1_block_2_1_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 2, "1.weight"));
|
|
1513
|
+
model.mm_model_block_1_block_2_1_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 2, "1.bias"));
|
|
1514
|
+
model.mm_model_block_2_block_0_0_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 0, "0.weight"));
|
|
1515
|
+
model.mm_model_block_2_block_0_1_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 0, "1.weight"));
|
|
1516
|
+
model.mm_model_block_2_block_0_1_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 0, "1.bias"));
|
|
1517
|
+
model.mm_model_block_2_block_1_fc1_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc1.weight"));
|
|
1518
|
+
model.mm_model_block_2_block_1_fc1_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc1.bias"));
|
|
1519
|
+
model.mm_model_block_2_block_1_fc2_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc2.weight"));
|
|
1520
|
+
model.mm_model_block_2_block_1_fc2_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc2.bias"));
|
|
1521
|
+
model.mm_model_block_2_block_2_0_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 2, "0.weight"));
|
|
1522
|
+
model.mm_model_block_2_block_2_1_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.weight"));
|
|
1523
|
+
model.mm_model_block_2_block_2_1_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.bias"));
|
|
1524
|
+
} break;
|
|
1525
|
+
case PROJECTOR_TYPE_LDPV2:
|
|
1526
|
+
{
|
|
1527
|
+
// MobileVLM_V2 projection
|
|
1528
|
+
model.mm_model_mlp_0_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 0, "weight"));
|
|
1529
|
+
model.mm_model_mlp_0_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 0, "bias"));
|
|
1530
|
+
model.mm_model_mlp_2_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 2, "weight"));
|
|
1531
|
+
model.mm_model_mlp_2_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 2, "bias"));
|
|
1532
|
+
model.mm_model_peg_0_w = get_tensor(string_format(TN_MVLM_PROJ_PEG, 0, "weight"));
|
|
1533
|
+
model.mm_model_peg_0_b = get_tensor(string_format(TN_MVLM_PROJ_PEG, 0, "bias"));
|
|
1534
|
+
} break;
|
|
1535
|
+
case PROJECTOR_TYPE_MINICPMV:
|
|
1536
|
+
{
|
|
1537
|
+
// model.mm_model_pos_embed = get_tensor(new_clip->ctx_data, TN_MINICPMV_POS_EMBD);
|
|
1538
|
+
model.mm_model_pos_embed_k = get_tensor(TN_MINICPMV_POS_EMBD_K);
|
|
1539
|
+
model.mm_model_query = get_tensor(TN_MINICPMV_QUERY);
|
|
1540
|
+
model.mm_model_proj = get_tensor(TN_MINICPMV_PROJ);
|
|
1541
|
+
model.mm_model_kv_proj = get_tensor(TN_MINICPMV_KV_PROJ);
|
|
1542
|
+
model.mm_model_attn_q_w = get_tensor(string_format(TN_MINICPMV_ATTN, "q", "weight"));
|
|
1543
|
+
model.mm_model_attn_k_w = get_tensor(string_format(TN_MINICPMV_ATTN, "k", "weight"));
|
|
1544
|
+
model.mm_model_attn_v_w = get_tensor(string_format(TN_MINICPMV_ATTN, "v", "weight"));
|
|
1545
|
+
model.mm_model_attn_q_b = get_tensor(string_format(TN_MINICPMV_ATTN, "q", "bias"));
|
|
1546
|
+
model.mm_model_attn_k_b = get_tensor(string_format(TN_MINICPMV_ATTN, "k", "bias"));
|
|
1547
|
+
model.mm_model_attn_v_b = get_tensor(string_format(TN_MINICPMV_ATTN, "v", "bias"));
|
|
1548
|
+
model.mm_model_attn_o_w = get_tensor(string_format(TN_MINICPMV_ATTN, "out", "weight"));
|
|
1549
|
+
model.mm_model_attn_o_b = get_tensor(string_format(TN_MINICPMV_ATTN, "out", "bias"));
|
|
1550
|
+
model.mm_model_ln_q_w = get_tensor(string_format(TN_MINICPMV_LN, "q", "weight"));
|
|
1551
|
+
model.mm_model_ln_q_b = get_tensor(string_format(TN_MINICPMV_LN, "q", "bias"));
|
|
1552
|
+
model.mm_model_ln_kv_w = get_tensor(string_format(TN_MINICPMV_LN, "kv", "weight"));
|
|
1553
|
+
model.mm_model_ln_kv_b = get_tensor(string_format(TN_MINICPMV_LN, "kv", "bias"));
|
|
1554
|
+
model.mm_model_ln_post_w = get_tensor(string_format(TN_MINICPMV_LN, "post", "weight"));
|
|
1555
|
+
model.mm_model_ln_post_b = get_tensor(string_format(TN_MINICPMV_LN, "post", "bias"));
|
|
1556
|
+
} break;
|
|
1557
|
+
case PROJECTOR_TYPE_GLM_EDGE:
|
|
1558
|
+
{
|
|
1559
|
+
model.mm_model_adapter_conv_w = get_tensor(string_format(TN_GLM_ADAPER_CONV, "weight"));
|
|
1560
|
+
model.mm_model_adapter_conv_b = get_tensor(string_format(TN_GLM_ADAPER_CONV, "bias"));
|
|
1561
|
+
model.mm_model_mlp_0_w = get_tensor(string_format(TN_GLM_ADAPTER_LINEAR, "weight"));
|
|
1562
|
+
model.mm_model_ln_q_w = get_tensor(string_format(TN_GLM_ADAPTER_NORM_1, "weight"));
|
|
1563
|
+
model.mm_model_ln_q_b = get_tensor(string_format(TN_GLM_ADAPTER_NORM_1, "bias"));
|
|
1564
|
+
model.mm_model_mlp_1_w = get_tensor(string_format(TN_GLM_ADAPTER_D_H_2_4H, "weight"));
|
|
1565
|
+
model.mm_model_mlp_2_w = get_tensor(string_format(TN_GLM_ADAPTER_GATE, "weight"));
|
|
1566
|
+
model.mm_model_mlp_3_w = get_tensor(string_format(TN_GLM_ADAPTER_D_4H_2_H, "weight"));
|
|
1567
|
+
model.mm_boi = get_tensor(string_format(TN_TOK_GLM_BOI));
|
|
1568
|
+
model.mm_eoi = get_tensor(string_format(TN_TOK_GLM_EOI));
|
|
1569
|
+
} break;
|
|
1570
|
+
case PROJECTOR_TYPE_QWEN2VL:
|
|
1571
|
+
case PROJECTOR_TYPE_QWEN25VL:
|
|
1572
|
+
{
|
|
1573
|
+
model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight"));
|
|
1574
|
+
model.mm_0_b = get_tensor(string_format(TN_LLAVA_PROJ, 0, "bias"));
|
|
1575
|
+
model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
|
|
1576
|
+
model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
|
|
1577
|
+
} break;
|
|
1578
|
+
case PROJECTOR_TYPE_QWEN3VL:
|
|
1579
|
+
{
|
|
1580
|
+
model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight"));
|
|
1581
|
+
model.mm_0_b = get_tensor(string_format(TN_LLAVA_PROJ, 0, "bias"));
|
|
1582
|
+
model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
|
|
1583
|
+
model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
|
|
1584
|
+
} break;
|
|
1585
|
+
case PROJECTOR_TYPE_YOUTUVL:
|
|
1586
|
+
{
|
|
1587
|
+
model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM); // merger.ln_q (RMS norm)
|
|
1588
|
+
model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight")); // merger.mlp.0
|
|
1589
|
+
model.mm_0_b = get_tensor(string_format(TN_LLAVA_PROJ, 0, "bias"));
|
|
1590
|
+
model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight")); // merger.mlp.2
|
|
1591
|
+
model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
|
|
1592
|
+
} break;
|
|
1593
|
+
case PROJECTOR_TYPE_GLM4V:
|
|
1594
|
+
{
|
|
1595
|
+
model.projection = get_tensor(TN_MM_PROJECTOR);
|
|
1596
|
+
model.mm_ffn_up_w = get_tensor(string_format(TN_MM_UP, "weight"));
|
|
1597
|
+
model.mm_ffn_up_b = get_tensor(string_format(TN_MM_UP, "bias"), false);
|
|
1598
|
+
model.mm_ffn_gate_w = get_tensor(string_format(TN_MM_GATE, "weight"));
|
|
1599
|
+
model.mm_ffn_gate_b = get_tensor(string_format(TN_MM_GATE, "bias"), false);
|
|
1600
|
+
model.mm_ffn_down_w = get_tensor(string_format(TN_MM_DOWN, "weight"));
|
|
1601
|
+
model.mm_ffn_down_b = get_tensor(string_format(TN_MM_DOWN, "bias"), false);
|
|
1602
|
+
model.mm_post_norm_w = get_tensor(string_format(TN_MM_POST_NORM, "weight"));
|
|
1603
|
+
model.mm_post_norm_b = get_tensor(string_format(TN_MM_POST_NORM, "bias"), false);
|
|
1604
|
+
model.mm_patch_merger_w = get_tensor(string_format(TN_MM_PATCH_MERGER, "weight"));
|
|
1605
|
+
model.mm_patch_merger_b = get_tensor(string_format(TN_MM_PATCH_MERGER, "bias"));
|
|
1606
|
+
} break;
|
|
1607
|
+
case PROJECTOR_TYPE_GEMMA3:
|
|
1608
|
+
{
|
|
1609
|
+
model.mm_input_proj_w = get_tensor(TN_MM_INP_PROJ);
|
|
1610
|
+
model.mm_soft_emb_norm_w = get_tensor(TN_MM_SOFT_EMB_N);
|
|
1611
|
+
} break;
|
|
1612
|
+
case PROJECTOR_TYPE_GEMMA3NV:
|
|
1613
|
+
{
|
|
1614
|
+
model.mobilenet_stem_conv_w = get_tensor(TN_MNV5_STEM_CONV, false);
|
|
1615
|
+
model.mobilenet_stem_conv_b = get_tensor(TN_MNV5_STEM_BIAS, false);
|
|
1616
|
+
model.mobilenet_stem_norm_w = get_tensor(TN_MNV5_STEM_BN, false);
|
|
1617
|
+
|
|
1618
|
+
model.msfa_ffn_expand_w = get_tensor(TN_MNV5_MSFA_FFN_EXP_W, false);
|
|
1619
|
+
model.msfa_ffn_expand_bn = get_tensor(TN_MNV5_MSFA_FFN_EXP_BN, false); // Consume BN if present but likely folded
|
|
1620
|
+
model.msfa_ffn_project_w = get_tensor(TN_MNV5_MSFA_FFN_PROJ_W, false);
|
|
1621
|
+
model.msfa_ffn_project_bn = get_tensor(TN_MNV5_MSFA_FFN_PROJ_BN, false);
|
|
1622
|
+
|
|
1623
|
+
model.msfa_concat_norm_w = get_tensor(TN_MNV5_MSFA_NORM, false);
|
|
1624
|
+
|
|
1625
|
+
// Dynamically load blocks stage by stage
|
|
1626
|
+
for (int stage = 0; stage < 4; ++stage) {
|
|
1627
|
+
int blocks_found_in_stage = 0;
|
|
1628
|
+
|
|
1629
|
+
for (int blk_idx = 0; ; ++blk_idx) {
|
|
1630
|
+
bool found_block = false;
|
|
1631
|
+
mobilenetv5_block block;
|
|
1632
|
+
|
|
1633
|
+
// 1. Check for Edge Residual (S0)
|
|
1634
|
+
block.s0_conv_exp_w = get_tensor(string_format(TN_MNV5_BLK_S0_EXP_W, stage, blk_idx), false);
|
|
1635
|
+
if (block.s0_conv_exp_w) {
|
|
1636
|
+
found_block = true;
|
|
1637
|
+
block.s0_bn1_w = get_tensor(string_format(TN_MNV5_BLK_S0_BN1_W, stage, blk_idx), false);
|
|
1638
|
+
block.s0_conv_pwl_w = get_tensor(string_format(TN_MNV5_BLK_S0_PWL_W, stage, blk_idx), false);
|
|
1639
|
+
block.s0_bn2_w = get_tensor(string_format(TN_MNV5_BLK_S0_BN2_W, stage, blk_idx), false);
|
|
1640
|
+
}
|
|
1641
|
+
// 2. Check for UIR (Universal Inverted Residual)
|
|
1642
|
+
else {
|
|
1643
|
+
// Check for dw_start OR pw_exp (some UIR blocks skip dw_start)
|
|
1644
|
+
block.dw_start_w = get_tensor(string_format(TN_MNV5_BLK_DW_START_W, stage, blk_idx), false);
|
|
1645
|
+
block.pw_exp_w = get_tensor(string_format(TN_MNV5_BLK_PW_EXP_W, stage, blk_idx), false);
|
|
1646
|
+
|
|
1647
|
+
if (block.dw_start_w || block.pw_exp_w) {
|
|
1648
|
+
found_block = true;
|
|
1649
|
+
if (block.dw_start_w) {
|
|
1650
|
+
block.dw_start_bn_w = get_tensor(string_format(TN_MNV5_BLK_DW_START_BN, stage, blk_idx), false);
|
|
1651
|
+
}
|
|
1652
|
+
if (block.pw_exp_w) {
|
|
1653
|
+
block.pw_exp_bn_w = get_tensor(string_format(TN_MNV5_BLK_PW_EXP_BN, stage, blk_idx), false);
|
|
1654
|
+
}
|
|
1655
|
+
block.dw_mid_w = get_tensor(string_format(TN_MNV5_BLK_DW_MID_W, stage, blk_idx), false);
|
|
1656
|
+
if (block.dw_mid_w) {
|
|
1657
|
+
block.dw_mid_bn_w = get_tensor(string_format(TN_MNV5_BLK_DW_MID_BN, stage, blk_idx), false);
|
|
1658
|
+
}
|
|
1659
|
+
block.pw_proj_w = get_tensor(string_format(TN_MNV5_BLK_PW_PROJ_W, stage, blk_idx), false);
|
|
1660
|
+
if (block.pw_proj_w) {
|
|
1661
|
+
block.pw_proj_bn_w = get_tensor(string_format(TN_MNV5_BLK_PW_PROJ_BN, stage, blk_idx), false);
|
|
1662
|
+
}
|
|
1663
|
+
block.layer_scale_w = get_tensor(string_format(TN_MNV5_BLK_LAYER_SCALE, stage, blk_idx), false);
|
|
1664
|
+
}
|
|
1665
|
+
}
|
|
1666
|
+
|
|
1667
|
+
// 3. Check for Attention (MQA)
|
|
1668
|
+
// Even if UIR/Edge check failed, this might be a pure attention block
|
|
1669
|
+
ggml_tensor* attn_q_check = get_tensor(string_format(TN_MNV5_ATTN_Q_W, stage, blk_idx), false);
|
|
1670
|
+
if (attn_q_check) {
|
|
1671
|
+
found_block = true;
|
|
1672
|
+
block.attn_q_w = attn_q_check;
|
|
1673
|
+
block.attn_k_w = get_tensor(string_format(TN_MNV5_ATTN_K_W, stage, blk_idx), false);
|
|
1674
|
+
block.attn_v_w = get_tensor(string_format(TN_MNV5_ATTN_V_W, stage, blk_idx), false);
|
|
1675
|
+
block.attn_o_w = get_tensor(string_format(TN_MNV5_ATTN_O_W, stage, blk_idx), false);
|
|
1676
|
+
block.attn_k_dw_w = get_tensor(string_format(TN_MNV5_ATTN_K_DW, stage, blk_idx), false);
|
|
1677
|
+
block.attn_k_norm_w = get_tensor(string_format(TN_MNV5_ATTN_K_NORM, stage, blk_idx), false);
|
|
1678
|
+
block.attn_v_dw_w = get_tensor(string_format(TN_MNV5_ATTN_V_DW, stage, blk_idx), false);
|
|
1679
|
+
block.attn_v_norm_w = get_tensor(string_format(TN_MNV5_ATTN_V_NORM, stage, blk_idx), false);
|
|
1680
|
+
block.attn_norm_w = get_tensor(string_format(TN_MNV5_ATTN_NORM, stage, blk_idx), false);
|
|
1681
|
+
// Note: Attention blocks also have layer_scale, load it if not already loaded by UIR check
|
|
1682
|
+
if (!block.layer_scale_w) {
|
|
1683
|
+
block.layer_scale_w = get_tensor(string_format(TN_MNV5_BLK_LAYER_SCALE, stage, blk_idx), false);
|
|
1684
|
+
}
|
|
1685
|
+
}
|
|
1686
|
+
|
|
1687
|
+
if (found_block) {
|
|
1688
|
+
model.mobilenet_blocks.push_back(block);
|
|
1689
|
+
blocks_found_in_stage++;
|
|
1690
|
+
} else {
|
|
1691
|
+
// End of blocks for this stage
|
|
1692
|
+
break;
|
|
1693
|
+
}
|
|
1694
|
+
}
|
|
1695
|
+
|
|
1696
|
+
// Track where this stage ends in the flat vector
|
|
1697
|
+
if (blocks_found_in_stage > 0) {
|
|
1698
|
+
model.mobilenet_stage_ends.push_back(model.mobilenet_blocks.size() - 1);
|
|
1699
|
+
LOG_INF("%s: Stage %d ended at global block index %zu\n", __func__, stage, model.mobilenet_blocks.size() - 1);
|
|
1700
|
+
}
|
|
1701
|
+
}
|
|
1702
|
+
model.mm_input_proj_w = get_tensor(TN_MM_INP_PROJ);
|
|
1703
|
+
model.mm_soft_emb_norm_w = get_tensor(TN_MM_SOFT_EMB_N);
|
|
1704
|
+
} break;
|
|
1705
|
+
case PROJECTOR_TYPE_IDEFICS3:
|
|
1706
|
+
{
|
|
1707
|
+
model.projection = get_tensor(TN_MM_PROJECTOR);
|
|
1708
|
+
} break;
|
|
1709
|
+
case PROJECTOR_TYPE_LFM2:
|
|
1710
|
+
{
|
|
1711
|
+
model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM, false);
|
|
1712
|
+
model.mm_input_norm_b = get_tensor(TN_MM_INP_NORM_B, false);
|
|
1713
|
+
model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight"));
|
|
1714
|
+
model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"));
|
|
1715
|
+
model.mm_2_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
|
|
1716
|
+
model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
|
|
1717
|
+
} break;
|
|
1718
|
+
case PROJECTOR_TYPE_KIMIVL:
|
|
1719
|
+
case PROJECTOR_TYPE_PADDLEOCR:
|
|
1720
|
+
case PROJECTOR_TYPE_KIMIK25:
|
|
1721
|
+
{
|
|
1722
|
+
model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM);
|
|
1723
|
+
model.mm_input_norm_b = get_tensor(TN_MM_INP_NORM_B);
|
|
1724
|
+
model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight"));
|
|
1725
|
+
model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"));
|
|
1726
|
+
model.mm_2_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
|
|
1727
|
+
model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
|
|
1728
|
+
} break;
|
|
1729
|
+
case PROJECTOR_TYPE_PIXTRAL:
|
|
1730
|
+
{
|
|
1731
|
+
model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight"));
|
|
1732
|
+
model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"), false);
|
|
1733
|
+
model.mm_2_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
|
|
1734
|
+
model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"), false);
|
|
1735
|
+
// [IMG_BREAK] token embedding
|
|
1736
|
+
model.token_embd_img_break = get_tensor(TN_TOK_IMG_BREAK);
|
|
1737
|
+
// for mistral small 3.1
|
|
1738
|
+
model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM, false);
|
|
1739
|
+
model.mm_patch_merger_w = get_tensor(string_format(TN_MM_PATCH_MERGER, "weight"), false);
|
|
1740
|
+
} break;
|
|
1741
|
+
case PROJECTOR_TYPE_LIGHTONOCR:
|
|
1742
|
+
{
|
|
1743
|
+
model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight"));
|
|
1744
|
+
model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"), false);
|
|
1745
|
+
model.mm_2_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
|
|
1746
|
+
model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"), false);
|
|
1747
|
+
model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM, false);
|
|
1748
|
+
model.mm_patch_merger_w = get_tensor(string_format(TN_MM_PATCH_MERGER, "weight"), false);
|
|
1749
|
+
} break;
|
|
1750
|
+
case PROJECTOR_TYPE_ULTRAVOX:
|
|
1751
|
+
{
|
|
1752
|
+
model.conv1d_1_w = get_tensor(string_format(TN_CONV1D, 1, "weight"));
|
|
1753
|
+
model.conv1d_1_b = get_tensor(string_format(TN_CONV1D, 1, "bias"));
|
|
1754
|
+
model.conv1d_2_w = get_tensor(string_format(TN_CONV1D, 2, "weight"));
|
|
1755
|
+
model.conv1d_2_b = get_tensor(string_format(TN_CONV1D, 2, "bias"));
|
|
1756
|
+
model.mm_1_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "weight"));
|
|
1757
|
+
model.mm_2_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "weight"));
|
|
1758
|
+
model.mm_norm_pre_w = get_tensor(string_format(TN_MM_NORM_PRE, "weight"));
|
|
1759
|
+
model.mm_norm_mid_w = get_tensor(string_format(TN_MM_NORM_MID, "weight"));
|
|
1760
|
+
} break;
|
|
1761
|
+
case PROJECTOR_TYPE_QWEN2A:
|
|
1762
|
+
{
|
|
1763
|
+
model.conv1d_1_w = get_tensor(string_format(TN_CONV1D, 1, "weight"));
|
|
1764
|
+
model.conv1d_1_b = get_tensor(string_format(TN_CONV1D, 1, "bias"));
|
|
1765
|
+
model.conv1d_2_w = get_tensor(string_format(TN_CONV1D, 2, "weight"));
|
|
1766
|
+
model.conv1d_2_b = get_tensor(string_format(TN_CONV1D, 2, "bias"));
|
|
1767
|
+
model.mm_fc_w = get_tensor(string_format(TN_MM_AUDIO_FC, "weight"));
|
|
1768
|
+
model.mm_fc_b = get_tensor(string_format(TN_MM_AUDIO_FC, "bias"));
|
|
1769
|
+
} break;
|
|
1770
|
+
case PROJECTOR_TYPE_VOXTRAL:
|
|
1771
|
+
{
|
|
1772
|
+
model.conv1d_1_w = get_tensor(string_format(TN_CONV1D, 1, "weight"));
|
|
1773
|
+
model.conv1d_1_b = get_tensor(string_format(TN_CONV1D, 1, "bias"));
|
|
1774
|
+
model.conv1d_2_w = get_tensor(string_format(TN_CONV1D, 2, "weight"));
|
|
1775
|
+
model.conv1d_2_b = get_tensor(string_format(TN_CONV1D, 2, "bias"));
|
|
1776
|
+
model.mm_1_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "weight"));
|
|
1777
|
+
model.mm_2_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "weight"));
|
|
1778
|
+
} break;
|
|
1779
|
+
case PROJECTOR_TYPE_MUSIC_FLAMINGO:
|
|
1780
|
+
{
|
|
1781
|
+
model.conv1d_1_w = get_tensor(string_format(TN_CONV1D, 1, "weight"));
|
|
1782
|
+
model.conv1d_1_b = get_tensor(string_format(TN_CONV1D, 1, "bias"));
|
|
1783
|
+
model.conv1d_2_w = get_tensor(string_format(TN_CONV1D, 2, "weight"));
|
|
1784
|
+
model.conv1d_2_b = get_tensor(string_format(TN_CONV1D, 2, "bias"));
|
|
1785
|
+
model.mm_1_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "weight"));
|
|
1786
|
+
model.mm_1_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "bias"));
|
|
1787
|
+
model.mm_2_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "weight"));
|
|
1788
|
+
model.mm_2_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "bias"));
|
|
1789
|
+
} break;
|
|
1790
|
+
case PROJECTOR_TYPE_INTERNVL:
|
|
1791
|
+
{
|
|
1792
|
+
model.mm_0_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 0, "weight"));
|
|
1793
|
+
model.mm_0_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 0, "bias"));
|
|
1794
|
+
model.mm_1_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 1, "weight"));
|
|
1795
|
+
model.mm_1_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 1, "bias"));
|
|
1796
|
+
model.mm_3_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 3, "weight"));
|
|
1797
|
+
model.mm_3_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 3, "bias"));
|
|
1798
|
+
} break;
|
|
1799
|
+
case PROJECTOR_TYPE_NEMOTRON_V2_VL:
|
|
1800
|
+
{
|
|
1801
|
+
model.mm_0_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 0, "weight"));
|
|
1802
|
+
model.mm_1_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 1, "weight"));
|
|
1803
|
+
model.mm_3_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 3, "weight"));
|
|
1804
|
+
} break;
|
|
1805
|
+
case PROJECTOR_TYPE_GLMA:
|
|
1806
|
+
{
|
|
1807
|
+
model.conv1d_1_w = get_tensor(string_format(TN_CONV1D, 1, "weight"));
|
|
1808
|
+
model.conv1d_1_b = get_tensor(string_format(TN_CONV1D, 1, "bias"));
|
|
1809
|
+
model.conv1d_2_w = get_tensor(string_format(TN_CONV1D, 2, "weight"));
|
|
1810
|
+
model.conv1d_2_b = get_tensor(string_format(TN_CONV1D, 2, "bias"));
|
|
1811
|
+
model.mm_1_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "weight"));
|
|
1812
|
+
model.mm_1_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "bias"));
|
|
1813
|
+
model.mm_2_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "weight"));
|
|
1814
|
+
model.mm_2_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "bias"));
|
|
1815
|
+
model.mm_norm_pre_w = get_tensor(string_format(TN_MM_NORM_PRE, "weight"));
|
|
1816
|
+
model.mm_norm_pre_b = get_tensor(string_format(TN_MM_NORM_PRE, "bias"));
|
|
1817
|
+
model.mm_boi = get_tensor(string_format(TN_TOK_BOI));
|
|
1818
|
+
model.mm_eoi = get_tensor(string_format(TN_TOK_EOI));
|
|
1819
|
+
} break;
|
|
1820
|
+
case PROJECTOR_TYPE_LLAMA4:
|
|
1821
|
+
{
|
|
1822
|
+
model.mm_model_proj = get_tensor(TN_MM_PROJECTOR);
|
|
1823
|
+
model.mm_model_mlp_1_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 1, "weight"));
|
|
1824
|
+
model.mm_model_mlp_2_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 2, "weight"));
|
|
1825
|
+
} break;
|
|
1826
|
+
case PROJECTOR_TYPE_COGVLM:
|
|
1827
|
+
{
|
|
1828
|
+
model.mm_model_proj = get_tensor(TN_MM_PROJECTOR);
|
|
1829
|
+
model.mm_post_fc_norm_w = get_tensor(string_format(TN_MM_POST_FC_NORM, "weight"));
|
|
1830
|
+
model.mm_post_fc_norm_b = get_tensor(string_format(TN_MM_POST_FC_NORM, "bias"));
|
|
1831
|
+
model.mm_h_to_4h_w = get_tensor(string_format(TN_MM_H_TO_4H, "weight"));
|
|
1832
|
+
model.mm_gate_w = get_tensor(string_format(TN_MM_GATE, "weight"));
|
|
1833
|
+
model.mm_4h_to_h_w = get_tensor(string_format(TN_MM_4H_TO_H, "weight"));
|
|
1834
|
+
model.mm_boi = get_tensor(TN_TOK_BOI);
|
|
1835
|
+
model.mm_eoi = get_tensor(TN_TOK_EOI);
|
|
1836
|
+
} break;
|
|
1837
|
+
case PROJECTOR_TYPE_JANUS_PRO:
|
|
1838
|
+
{
|
|
1839
|
+
model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight"));
|
|
1840
|
+
model.mm_0_b = get_tensor(string_format(TN_LLAVA_PROJ, 0, "bias"));
|
|
1841
|
+
model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight"));
|
|
1842
|
+
model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"));
|
|
1843
|
+
} break;
|
|
1844
|
+
case PROJECTOR_TYPE_LFM2A:
|
|
1845
|
+
{
|
|
1846
|
+
for (int i : {0, 2, 3, 5, 6}) {
|
|
1847
|
+
model.pre_encode_conv_X_w[i] = get_tensor(string_format(TN_CONV1D, i, "weight"));
|
|
1848
|
+
model.pre_encode_conv_X_b[i] = get_tensor(string_format(TN_CONV1D, i, "bias"));
|
|
1849
|
+
}
|
|
1850
|
+
model.pre_encode_out_w = get_tensor(string_format(TN_PRE_ENCODE_OUT, "weight"));
|
|
1851
|
+
model.pre_encode_out_b = get_tensor(string_format(TN_PRE_ENCODE_OUT, "bias"));
|
|
1852
|
+
|
|
1853
|
+
model.mm_0_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 0, "weight"));
|
|
1854
|
+
model.mm_0_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 0, "bias"));
|
|
1855
|
+
model.mm_1_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "weight"));
|
|
1856
|
+
model.mm_1_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "bias"));
|
|
1857
|
+
model.mm_3_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 3, "weight"));
|
|
1858
|
+
model.mm_3_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 3, "bias"));
|
|
1859
|
+
|
|
1860
|
+
for (int il = 0; il < hparams.n_layer; ++il) {
|
|
1861
|
+
auto & layer = model.layers[il];
|
|
1862
|
+
|
|
1863
|
+
layer.ff_norm_w = get_tensor(string_format(TN_FFN_NORM, prefix, il, "weight"));
|
|
1864
|
+
layer.ff_norm_b = get_tensor(string_format(TN_FFN_NORM, prefix, il, "bias"));
|
|
1865
|
+
layer.ff_norm_1_w = get_tensor(string_format(TN_FFN_NORM_1, prefix, il, "weight"));
|
|
1866
|
+
layer.ff_norm_1_b = get_tensor(string_format(TN_FFN_NORM_1, prefix, il, "bias"));
|
|
1867
|
+
layer.ff_up_1_w = get_tensor(string_format(TN_FFN_UP_1, prefix, il, "weight"));
|
|
1868
|
+
layer.ff_up_1_b = get_tensor(string_format(TN_FFN_UP_1, prefix, il, "bias"));
|
|
1869
|
+
layer.ff_down_1_w = get_tensor(string_format(TN_FFN_DOWN_1, prefix, il, "weight"));
|
|
1870
|
+
layer.ff_down_1_b = get_tensor(string_format(TN_FFN_DOWN_1, prefix, il, "bias"));
|
|
1871
|
+
|
|
1872
|
+
layer.pos_bias_u = get_tensor(string_format(TN_POS_BIAS_U, prefix, il));
|
|
1873
|
+
layer.pos_bias_v = get_tensor(string_format(TN_POS_BIAS_V, prefix, il));
|
|
1874
|
+
|
|
1875
|
+
layer.norm_conv_w = get_tensor(string_format(TN_NORM_CONV, prefix, il, "weight"));
|
|
1876
|
+
layer.norm_conv_b = get_tensor(string_format(TN_NORM_CONV, prefix, il, "bias"));
|
|
1877
|
+
|
|
1878
|
+
layer.linear_pos_w = get_tensor(string_format(TN_LINEAR_POS, prefix, il, "weight"));
|
|
1879
|
+
|
|
1880
|
+
layer.conv_norm_w = get_tensor(string_format(TN_CONV_NORM, prefix, il, "weight"));
|
|
1881
|
+
layer.conv_norm_b = get_tensor(string_format(TN_CONV_NORM, prefix, il, "bias"));
|
|
1882
|
+
layer.conv_dw_w = get_tensor(string_format(TN_CONV_DW, prefix, il, "weight"));
|
|
1883
|
+
layer.conv_dw_b = get_tensor(string_format(TN_CONV_DW, prefix, il, "bias"));
|
|
1884
|
+
layer.conv_pw1_w = get_tensor(string_format(TN_CONV_PW1, prefix, il, "weight"));
|
|
1885
|
+
layer.conv_pw1_b = get_tensor(string_format(TN_CONV_PW1, prefix, il, "bias"));
|
|
1886
|
+
layer.conv_pw2_w = get_tensor(string_format(TN_CONV_PW2, prefix, il, "weight"));
|
|
1887
|
+
layer.conv_pw2_b = get_tensor(string_format(TN_CONV_PW2, prefix, il, "bias"));
|
|
1888
|
+
}
|
|
1889
|
+
} break;
|
|
1890
|
+
default:
|
|
1891
|
+
GGML_ASSERT(false && "unknown projector type");
|
|
1892
|
+
}
|
|
1893
|
+
|
|
1894
|
+
// load data
|
|
1895
|
+
{
|
|
1896
|
+
std::vector<uint8_t> read_buf;
|
|
1897
|
+
|
|
1898
|
+
auto fin = std::ifstream(fname, std::ios::binary);
|
|
1899
|
+
if (!fin) {
|
|
1900
|
+
throw std::runtime_error(string_format("%s: failed to open %s\n", __func__, fname.c_str()));
|
|
1901
|
+
}
|
|
1902
|
+
|
|
1903
|
+
// alloc memory and offload data
|
|
1904
|
+
ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(ctx_clip.backend);
|
|
1905
|
+
ctx_clip.buf.reset(ggml_backend_alloc_ctx_tensors_from_buft(ctx_clip.ctx_data.get(), buft));
|
|
1906
|
+
ggml_backend_buffer_set_usage(ctx_clip.buf.get(), GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
|
|
1907
|
+
for (auto & t : tensors_to_load) {
|
|
1908
|
+
ggml_tensor * cur = ggml_get_tensor(ctx_clip.ctx_data.get(), t->name);
|
|
1909
|
+
const size_t offset = tensor_offset[t->name];
|
|
1910
|
+
fin.seekg(offset, std::ios::beg);
|
|
1911
|
+
if (!fin) {
|
|
1912
|
+
throw std::runtime_error(string_format("%s: failed to seek for tensor %s\n", __func__, t->name));
|
|
1913
|
+
}
|
|
1914
|
+
size_t num_bytes = ggml_nbytes(cur);
|
|
1915
|
+
if (ggml_backend_buft_is_host(buft)) {
|
|
1916
|
+
// for the CPU and Metal backend, we can read directly into the tensor
|
|
1917
|
+
fin.read(reinterpret_cast<char *>(cur->data), num_bytes);
|
|
1918
|
+
} else {
|
|
1919
|
+
// read into a temporary buffer first, then copy to device memory
|
|
1920
|
+
read_buf.resize(num_bytes);
|
|
1921
|
+
fin.read(reinterpret_cast<char *>(read_buf.data()), num_bytes);
|
|
1922
|
+
ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes);
|
|
1923
|
+
}
|
|
1924
|
+
}
|
|
1925
|
+
fin.close();
|
|
1926
|
+
|
|
1927
|
+
LOG_DBG("%s: loaded %zu tensors from %s\n", __func__, tensors_to_load.size(), fname.c_str());
|
|
1928
|
+
}
|
|
1929
|
+
}
|
|
1930
|
+
|
|
1931
|
+
struct support_info_op {
|
|
1932
|
+
ggml_tensor * op;
|
|
1933
|
+
|
|
1934
|
+
// true if the op runs on the accelerated ctx_clip.backend
|
|
1935
|
+
bool is_accel = true;
|
|
1936
|
+
};
|
|
1937
|
+
|
|
1938
|
+
struct support_info_graph {
|
|
1939
|
+
// whether the clip_ctx.backend supports flash attention
|
|
1940
|
+
bool fattn = true;
|
|
1941
|
+
ggml_tensor * fattn_op = nullptr; // for debugging
|
|
1942
|
+
|
|
1943
|
+
std::vector<support_info_op> ops;
|
|
1944
|
+
};
|
|
1945
|
+
|
|
1946
|
+
// Warm up the context with a synthetic single-entry batch.
//
// The placeholder entry only has its dimensions set: for the vision modality
// both sides come from hparams.warmup_image_size; otherwise the width is
// hparams.warmup_audio_size and the height is hparams.n_mel_bins. The batch is
// then forwarded to the (ctx, batch) overload of warmup().
static void warmup(clip_ctx & ctx_clip) {
    const auto & hp = ctx_clip.model.hparams;

    // placeholder input; only nx/ny are filled in here
    clip_image_f32_ptr dummy(clip_image_f32_init());

    if (ctx_clip.model.modality != CLIP_MODALITY_VISION) {
        // non-vision path: sized from the audio warmup length and mel bin count
        dummy->nx = hp.warmup_audio_size;
        dummy->ny = hp.n_mel_bins;
        LOG_INF("%s: warmup with audio size = %d\n", __func__, dummy->nx);
    } else {
        // vision path: square input of the configured warmup size
        dummy->nx = hp.warmup_image_size;
        dummy->ny = hp.warmup_image_size;
        LOG_INF("%s: warmup with image size = %d x %d\n", __func__, dummy->nx, dummy->ny);
    }

    clip_image_f32_batch fake_batch;
    fake_batch.entries.emplace_back(std::move(dummy));

    warmup(ctx_clip, fake_batch);
}
|
|
1963
|
+
|
|
1964
|
+
// Reserve compute buffers for `batch`, resolve the flash attention mode and
// report graph ops that the accelerated backend cannot run.
//
// - CLIP_FLASH_ATTN_TYPE_AUTO: flash attention is tentatively enabled; if the
//   backend rejects the flash attention op, it is disabled and the buffers are
//   reserved again for the non-flash graph.
// - any other mode: buffers are reserved once and a warning is printed if flash
//   attention was explicitly enabled but is not supported.
//
// On return ctx_clip.is_allocated is set to true.
static void warmup(clip_ctx & ctx_clip, const clip_image_f32_batch & batch) {
    support_info_graph info;

    if (ctx_clip.flash_attn_type == CLIP_FLASH_ATTN_TYPE_AUTO) {
        // try to enable flash attention to see if it's supported
        ctx_clip.flash_attn_type = CLIP_FLASH_ATTN_TYPE_ENABLED;
        info = alloc_compute_meta(ctx_clip, batch);
        if (!info.fattn && info.fattn_op) {
            auto op = info.fattn_op;
            LOG_WRN("%s: *****************************************************************\n", __func__);
            LOG_WRN("%s: WARNING: flash attention not supported by %s, memory usage will increase\n", __func__, ggml_backend_name(ctx_clip.backend));
            LOG_WRN("%s: op params: \n", __func__);
            // ggml_tensor::ne is int64_t and ggml_tensor::nb is size_t, so they must not be
            // passed to a "%d" conversion; cast explicitly and use matching specifiers
            static auto print_shape = [](const char * fn, const char * name, ggml_tensor * t) {
                LOG_WRN("%s: %s: type = %s, ne = [%lld %lld %lld %lld], nb = [%zu %zu %zu %zu]\n", fn,
                        name, ggml_type_name(t->type),
                        (long long) t->ne[0], (long long) t->ne[1], (long long) t->ne[2], (long long) t->ne[3],
                        (size_t) t->nb[0], (size_t) t->nb[1], (size_t) t->nb[2], (size_t) t->nb[3]);
            };
            print_shape(__func__, " dst", op);
            print_shape(__func__, "src0", op->src[0]);
            print_shape(__func__, "src1", op->src[1]);
            print_shape(__func__, "src2", op->src[2]);
            LOG_WRN("%s: please report this on github as an issue\n", __func__);
            LOG_WRN("%s: *****************************************************************\n", __func__);
            ctx_clip.flash_attn_type = CLIP_FLASH_ATTN_TYPE_DISABLED;
            // keep the support info of the graph that is actually going to be used;
            // the previous `info` describes the abandoned flash attention graph
            info = alloc_compute_meta(ctx_clip, batch);
        }
    } else {
        info = alloc_compute_meta(ctx_clip, batch);
        if (!info.fattn && ctx_clip.flash_attn_type == CLIP_FLASH_ATTN_TYPE_ENABLED) {
            LOG_WRN("%s: flash attention is not supported by the current backend; falling back to CPU (performance will be degraded)\n", __func__);
        }
    }

    ctx_clip.is_allocated = true; // mark buffers as allocated

    LOG_INF("%s: flash attention is %s\n", __func__,
            (ctx_clip.flash_attn_type == CLIP_FLASH_ATTN_TYPE_ENABLED) ? "enabled" : "disabled");

    // print ops that are not supported by the GPU backend (if there is one)
    if (ctx_clip.backend && ctx_clip.backend != ctx_clip.backend_cpu) {
        std::vector<support_info_op> unsupported_ops;
        for (const auto & op : info.ops) {
            if (!op.is_accel) {
                unsupported_ops.push_back(op);
            }
        }
        if (!unsupported_ops.empty()) {
            LOG_WRN("%s: *****************************************************************\n", __func__);
            LOG_WRN("%s: WARNING: the CLIP graph uses unsupported operators by the backend\n", __func__);
            LOG_WRN("%s: the performance will be suboptimal \n", __func__);
            LOG_WRN("%s: list of unsupported ops (backend=%s):\n", __func__, ggml_backend_name(ctx_clip.backend));
            for (const auto & op : unsupported_ops) {
                // ne[] is int64_t: cast so the argument type matches the conversion
                LOG_WRN("%s: %16s: type = %s, ne = [%lld %lld %lld %lld]\n", __func__,
                        ggml_op_name(op.op->op),
                        ggml_type_name(op.op->type),
                        (long long) op.op->ne[0], (long long) op.op->ne[1], (long long) op.op->ne[2], (long long) op.op->ne[3]);
            }
            LOG_WRN("%s: flash attention is %s\n", __func__,
                    (ctx_clip.flash_attn_type == CLIP_FLASH_ATTN_TYPE_ENABLED) ? "enabled" : "disabled");
            LOG_WRN("%s: please report this on github as an issue\n", __func__);
            LOG_WRN("%s: ref: https://github.com/ggml-org/llama.cpp/pull/16837#issuecomment-3461676118\n", __func__);
            LOG_WRN("%s: *****************************************************************\n", __func__);
        }
    }
}
|
|
2030
|
+
|
|
2031
|
+
// Build the graph for `batch`, reserve the scheduler buffers for it and
// return which of its nodes ctx_clip.backend is able to run.
static support_info_graph alloc_compute_meta(clip_ctx & ctx_clip, const clip_image_f32_batch & batch) {
    ctx_clip.buf_compute_meta.resize(ctx_clip.max_nodes * ggml_tensor_overhead() + ggml_graph_overhead());

    ggml_cgraph * graph = clip_image_build_graph(&ctx_clip, batch);
    ggml_backend_sched_reserve(ctx_clip.sched.get(), graph);

    // report the compute buffer size of every backend that actually got one
    for (size_t ib = 0; ib < ctx_clip.backend_ptrs.size(); ++ib) {
        const size_t buf_size = ggml_backend_sched_get_buffer_size(ctx_clip.sched.get(), ctx_clip.backend_ptrs[ib]);
        if (buf_size <= 1) {
            continue;
        }
        LOG_INF("%s: %10s compute buffer size = %8.2f MiB\n", __func__,
                ggml_backend_buft_name(ctx_clip.backend_buft[ib]),
                buf_size / 1024.0 / 1024.0);
    }

    const int n_splits = ggml_backend_sched_get_n_splits(ctx_clip.sched.get());
    const int n_nodes  = ggml_graph_n_nodes(graph);

    LOG_INF("%s: graph splits = %d, nodes = %d\n", __func__, n_splits, n_nodes);

    support_info_graph res {
        /*.fattn    = */ true,
        /*.fattn_op = */ nullptr,
        /*.ops      = */ {},
    };

    // check op support
    for (int in = 0; in < ggml_graph_n_nodes(graph); in++) {
        ggml_tensor * node      = ggml_graph_node(graph, in);
        const bool    supported = ggml_backend_supports_op(ctx_clip.backend, node);

        res.ops.push_back({node, supported});

        // remember the flash attention node if the backend rejects it
        if (!supported && node->op == GGML_OP_FLASH_ATTN_EXT) {
            res.fattn    = false;
            res.fattn_op = node;
        }
    }

    return res;
}
|
|
2074
|
+
|
|
2075
|
+
void get_bool(const std::string & key, bool & output, bool required = true) const {
|
|
2076
|
+
const int i = gguf_find_key(ctx_gguf.get(), key.c_str());
|
|
2077
|
+
if (i < 0) {
|
|
2078
|
+
if (required) {
|
|
2079
|
+
throw std::runtime_error("Key not found: " + key);
|
|
2080
|
+
}
|
|
2081
|
+
return;
|
|
2082
|
+
}
|
|
2083
|
+
output = gguf_get_val_bool(ctx_gguf.get(), i);
|
|
2084
|
+
}
|
|
2085
|
+
|
|
2086
|
+
void get_i32(const std::string & key, int & output, bool required = true) const {
|
|
2087
|
+
const int i = gguf_find_key(ctx_gguf.get(), key.c_str());
|
|
2088
|
+
if (i < 0) {
|
|
2089
|
+
if (required) {
|
|
2090
|
+
throw std::runtime_error("Key not found: " + key);
|
|
2091
|
+
}
|
|
2092
|
+
return;
|
|
2093
|
+
}
|
|
2094
|
+
output = gguf_get_val_i32(ctx_gguf.get(), i);
|
|
2095
|
+
}
|
|
2096
|
+
|
|
2097
|
+
void get_u32(const std::string & key, int & output, bool required = true) const {
|
|
2098
|
+
const int i = gguf_find_key(ctx_gguf.get(), key.c_str());
|
|
2099
|
+
if (i < 0) {
|
|
2100
|
+
if (required) {
|
|
2101
|
+
throw std::runtime_error("Key not found: " + key);
|
|
2102
|
+
}
|
|
2103
|
+
return;
|
|
2104
|
+
}
|
|
2105
|
+
output = gguf_get_val_u32(ctx_gguf.get(), i);
|
|
2106
|
+
}
|
|
2107
|
+
|
|
2108
|
+
void get_f32(const std::string & key, float & output, bool required = true) const {
|
|
2109
|
+
const int i = gguf_find_key(ctx_gguf.get(), key.c_str());
|
|
2110
|
+
if (i < 0) {
|
|
2111
|
+
if (required) {
|
|
2112
|
+
throw std::runtime_error("Key not found: " + key);
|
|
2113
|
+
}
|
|
2114
|
+
return;
|
|
2115
|
+
}
|
|
2116
|
+
output = gguf_get_val_f32(ctx_gguf.get(), i);
|
|
2117
|
+
}
|
|
2118
|
+
|
|
2119
|
+
void get_string(const std::string & key, std::string & output, bool required = true) const {
|
|
2120
|
+
const int i = gguf_find_key(ctx_gguf.get(), key.c_str());
|
|
2121
|
+
if (i < 0) {
|
|
2122
|
+
if (required) {
|
|
2123
|
+
throw std::runtime_error("Key not found: " + key);
|
|
2124
|
+
}
|
|
2125
|
+
return;
|
|
2126
|
+
}
|
|
2127
|
+
output = std::string(gguf_get_val_str(ctx_gguf.get(), i));
|
|
2128
|
+
}
|
|
2129
|
+
|
|
2130
|
+
void get_arr_int(const std::string & key, std::vector<int> & output, bool required = true) const {
|
|
2131
|
+
const int i = gguf_find_key(ctx_gguf.get(), key.c_str());
|
|
2132
|
+
if (i < 0) {
|
|
2133
|
+
if (required) {
|
|
2134
|
+
throw std::runtime_error("Key not found: " + key);
|
|
2135
|
+
}
|
|
2136
|
+
return;
|
|
2137
|
+
}
|
|
2138
|
+
int n = gguf_get_arr_n(ctx_gguf.get(), i);
|
|
2139
|
+
output.resize(n);
|
|
2140
|
+
const int32_t * values = (const int32_t *)gguf_get_arr_data(ctx_gguf.get(), i);
|
|
2141
|
+
for (int i = 0; i < n; ++i) {
|
|
2142
|
+
output[i] = values[i];
|
|
2143
|
+
}
|
|
2144
|
+
}
|
|
2145
|
+
|
|
2146
|
+
// Append every (x, y) multiple of hparams.image_size to image_res_candidates,
// for 1 <= x, y <= max_patches_per_side, except the 1x1 combination.
// Candidates are appended with x as the outer loop and y as the inner loop.
static void set_llava_uhd_res_candidates(clip_model & model, const int max_patches_per_side) {
    auto & hparams = model.hparams;
    for (int nx = 1; nx <= max_patches_per_side; nx++) {
        for (int ny = 1; ny <= max_patches_per_side; ny++) {
            const bool is_single_tile = (nx == 1) && (ny == 1);
            if (!is_single_tile) {
                hparams.image_res_candidates.push_back(clip_image_size{
                    nx*hparams.image_size,
                    ny*hparams.image_size,
                });
            }
        }
    }
}
|
|
2160
|
+
};
|
|
2161
|
+
|
|
2162
|
+
struct clip_init_result clip_init(const char * fname, struct clip_context_params ctx_params) {
|
|
2163
|
+
clip_ctx * ctx_vision = nullptr;
|
|
2164
|
+
clip_ctx * ctx_audio = nullptr;
|
|
2165
|
+
|
|
2166
|
+
try {
|
|
2167
|
+
clip_model_loader loader(fname);
|
|
2168
|
+
bool skip_audio = false;
|
|
2169
|
+
|
|
2170
|
+
if (loader.has_vision) {
|
|
2171
|
+
ctx_vision = new clip_ctx(ctx_params);
|
|
2172
|
+
loader.load_hparams(ctx_vision->model, CLIP_MODALITY_VISION);
|
|
2173
|
+
loader.load_tensors(*ctx_vision);
|
|
2174
|
+
if (ctx_params.warmup) {
|
|
2175
|
+
loader.warmup(*ctx_vision);
|
|
2176
|
+
}
|
|
2177
|
+
|
|
2178
|
+
// TODO: we don't support audio for Gemma 3N, but GGUF contains audio tensors
|
|
2179
|
+
// we can remove this check when we implement audio support for Gemma 3N
|
|
2180
|
+
skip_audio = ctx_vision->model.proj_type == PROJECTOR_TYPE_GEMMA3NV;
|
|
2181
|
+
|
|
2182
|
+
// clip_debug_encode(ctx_vision, 24*14, 24*14, 0.5f);
|
|
2183
|
+
}
|
|
2184
|
+
|
|
2185
|
+
if (loader.has_audio && !skip_audio) {
|
|
2186
|
+
ctx_audio = new clip_ctx(ctx_params);
|
|
2187
|
+
loader.load_hparams(ctx_audio->model, CLIP_MODALITY_AUDIO);
|
|
2188
|
+
loader.load_tensors(*ctx_audio);
|
|
2189
|
+
if (ctx_params.warmup) {
|
|
2190
|
+
loader.warmup(*ctx_audio);
|
|
2191
|
+
}
|
|
2192
|
+
}
|
|
2193
|
+
|
|
2194
|
+
} catch (const std::exception & e) {
|
|
2195
|
+
LOG_ERR("%s: failed to load model '%s': %s\n", __func__, fname, e.what());
|
|
2196
|
+
|
|
2197
|
+
delete ctx_vision;
|
|
2198
|
+
delete ctx_audio;
|
|
2199
|
+
|
|
2200
|
+
return {nullptr, nullptr};
|
|
2201
|
+
}
|
|
2202
|
+
|
|
2203
|
+
return {ctx_vision, ctx_audio};
|
|
2204
|
+
}
|
|
2205
|
+
|
|
2206
|
+
struct clip_image_size * clip_image_size_init() {
|
|
2207
|
+
struct clip_image_size * load_image_size = new struct clip_image_size();
|
|
2208
|
+
load_image_size->width = 448;
|
|
2209
|
+
load_image_size->height = 448;
|
|
2210
|
+
return load_image_size;
|
|
2211
|
+
}
|
|
2212
|
+
|
|
2213
|
+
struct clip_image_u8 * clip_image_u8_init() {
|
|
2214
|
+
return new clip_image_u8();
|
|
2215
|
+
}
|
|
2216
|
+
|
|
2217
|
+
struct clip_image_f32 * clip_image_f32_init() {
|
|
2218
|
+
return new clip_image_f32();
|
|
2219
|
+
}
|
|
2220
|
+
|
|
2221
|
+
struct clip_image_f32_batch * clip_image_f32_batch_init() {
|
|
2222
|
+
return new clip_image_f32_batch();
|
|
2223
|
+
}
|
|
2224
|
+
|
|
2225
|
+
unsigned char * clip_image_u8_get_data(struct clip_image_u8 * img, uint32_t * nx, uint32_t * ny) {
|
|
2226
|
+
if (nx) *nx = img->nx;
|
|
2227
|
+
if (ny) *ny = img->ny;
|
|
2228
|
+
return img->buf.data();
|
|
2229
|
+
}
|
|
2230
|
+
|
|
2231
|
+
// Release a clip_image_size; passing nullptr is a no-op.
void clip_image_size_free(struct clip_image_size * load_image_size) {
    if (load_image_size != nullptr) {
        delete load_image_size;
    }
}
|
|
2237
|
+
// Release a clip_image_u8 with `delete`.
void clip_image_u8_free(struct clip_image_u8 * img) {
    delete img;
}
|
|
2238
|
+
// Release a clip_image_f32 with `delete`.
void clip_image_f32_free(struct clip_image_f32 * img) {
    delete img;
}
|
|
2239
|
+
// Release a clip_image_u8_batch with `delete`.
void clip_image_u8_batch_free(struct clip_image_u8_batch * batch) {
    delete batch;
}
|
|
2240
|
+
// Release a clip_image_f32_batch with `delete`.
void clip_image_f32_batch_free(struct clip_image_f32_batch * batch) {
    delete batch;
}
|
|
2241
|
+
|
|
2242
|
+
size_t clip_image_f32_batch_n_images(const struct clip_image_f32_batch * batch) {
|
|
2243
|
+
return batch->entries.size();
|
|
2244
|
+
}
|
|
2245
|
+
|
|
2246
|
+
size_t clip_image_f32_batch_nx(const struct clip_image_f32_batch * batch, int idx) {
|
|
2247
|
+
if (idx < 0 || idx >= (int)batch->entries.size()) {
|
|
2248
|
+
LOG_ERR("%s: invalid index %d\n", __func__, idx);
|
|
2249
|
+
return 0;
|
|
2250
|
+
}
|
|
2251
|
+
return batch->entries[idx]->nx;
|
|
2252
|
+
}
|
|
2253
|
+
|
|
2254
|
+
size_t clip_image_f32_batch_ny(const struct clip_image_f32_batch * batch, int idx) {
|
|
2255
|
+
if (idx < 0 || idx >= (int)batch->entries.size()) {
|
|
2256
|
+
LOG_ERR("%s: invalid index %d\n", __func__, idx);
|
|
2257
|
+
return 0;
|
|
2258
|
+
}
|
|
2259
|
+
return batch->entries[idx]->ny;
|
|
2260
|
+
}
|
|
2261
|
+
|
|
2262
|
+
// Non-owning pointer to the batch entry at `idx`.
// Logs an error and returns nullptr when `idx` is out of range.
clip_image_f32 * clip_image_f32_get_img(const struct clip_image_f32_batch * batch, int idx) {
    const int  n_entries = (int)batch->entries.size();
    const bool in_range  = idx >= 0 && idx < n_entries;
    if (!in_range) {
        LOG_ERR("%s: invalid index %d\n", __func__, idx);
        return nullptr;
    }
    return batch->entries[idx].get();
}
|
|
2269
|
+
|
|
2270
|
+
// Fill `img` from a tightly packed RGB buffer.
//   rgb_pixels : source pixels, 3 bytes per pixel, nx * ny pixels
//   nx, ny     : image dimensions, stored into img->nx / img->ny
//   img        : destination; its buffer is resized to 3 * nx * ny bytes
void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny, clip_image_u8 * img) {
    img->nx = nx;
    img->ny = ny;

    // do the multiplication in size_t: `3 * nx * ny` overflows int for very large images
    const size_t n_bytes = 3 * static_cast<size_t>(nx) * static_cast<size_t>(ny);
    img->buf.resize(n_bytes);

    // nothing to copy for an empty image (also avoids calling memcpy with a possibly null source)
    if (n_bytes > 0) {
        memcpy(img->buf.data(), rgb_pixels, n_bytes);
    }
}
|
|
2276
|
+
|
|
2277
|
+
// Normalize image to float32 - careful with pytorch .to(model.device, dtype=torch.float16) - this sometimes reduces precision (32>16>32), sometimes not
|
|
2278
|
+
// Convert an 8-bit image to float32: each byte is scaled to [0, 1] and then
// normalized with the mean / std of its channel (byte index modulo 3).
// `dst` receives the same dimensions and buffer length as `src`.
static void normalize_image_u8_to_f32(const clip_image_u8 & src, clip_image_f32 & dst, const float mean[3], const float std[3]) {
    const size_t n_values = src.buf.size();

    dst.nx = src.nx;
    dst.ny = src.ny;
    dst.buf.resize(n_values);

    // TODO @ngxson : seems like this could be done more efficiently on cgraph
    size_t channel = 0; // cycles 0, 1, 2 (rgb)
    for (size_t idx = 0; idx < n_values; ++idx) {
        const float scaled = static_cast<float>(src.buf[idx]) / 255.0f;
        dst.buf[idx] = (scaled - mean[channel]) / std[channel];
        channel = (channel == 2) ? 0 : channel + 1;
    }
}
|
|
2289
|
+
|
|
2290
|
+
// set of tools to manipulate images
|
|
2291
|
+
// in the future, we can have HW acceleration by allowing this struct to access 3rd party lib like imagick or opencv
|
|
2292
|
+
struct img_tool {
|
|
2293
|
+
enum resize_algo {
|
|
2294
|
+
RESIZE_ALGO_BILINEAR,
|
|
2295
|
+
RESIZE_ALGO_BICUBIC,
|
|
2296
|
+
// RESIZE_ALGO_LANCZOS, // TODO
|
|
2297
|
+
};
|
|
2298
|
+
|
|
2299
|
+
static void resize(
|
|
2300
|
+
const clip_image_u8 & src,
|
|
2301
|
+
clip_image_u8 & dst,
|
|
2302
|
+
const clip_image_size & target_resolution,
|
|
2303
|
+
resize_algo algo,
|
|
2304
|
+
bool add_padding = true, // TODO: define the behavior for add_padding = false
|
|
2305
|
+
std::array<uint8_t, 3> pad_color = {0, 0, 0}) {
|
|
2306
|
+
dst.nx = target_resolution.width;
|
|
2307
|
+
dst.ny = target_resolution.height;
|
|
2308
|
+
dst.buf.resize(3 * dst.nx * dst.ny);
|
|
2309
|
+
|
|
2310
|
+
if (dst.nx == src.nx && dst.ny == src.ny) {
|
|
2311
|
+
// no resize needed, simple copy
|
|
2312
|
+
dst.buf = src.buf;
|
|
2313
|
+
return;
|
|
2314
|
+
}
|
|
2315
|
+
|
|
2316
|
+
if (!add_padding) {
|
|
2317
|
+
// direct resize
|
|
2318
|
+
switch (algo) {
|
|
2319
|
+
case RESIZE_ALGO_BILINEAR:
|
|
2320
|
+
resize_bilinear(src, dst, target_resolution.width, target_resolution.height);
|
|
2321
|
+
break;
|
|
2322
|
+
case RESIZE_ALGO_BICUBIC:
|
|
2323
|
+
resize_bicubic(src, dst, target_resolution.width, target_resolution.height);
|
|
2324
|
+
break;
|
|
2325
|
+
default:
|
|
2326
|
+
throw std::runtime_error("Unsupported resize algorithm");
|
|
2327
|
+
}
|
|
2328
|
+
} else {
|
|
2329
|
+
// resize with padding
|
|
2330
|
+
clip_image_u8 resized_image;
|
|
2331
|
+
float scale_w = static_cast<float>(target_resolution.width) / src.nx;
|
|
2332
|
+
float scale_h = static_cast<float>(target_resolution.height) / src.ny;
|
|
2333
|
+
float scale = std::min(scale_w, scale_h);
|
|
2334
|
+
int new_width = std::min(static_cast<int>(std::ceil(src.nx * scale)), target_resolution.width);
|
|
2335
|
+
int new_height = std::min(static_cast<int>(std::ceil(src.ny * scale)), target_resolution.height);
|
|
2336
|
+
|
|
2337
|
+
switch (algo) {
|
|
2338
|
+
case RESIZE_ALGO_BILINEAR:
|
|
2339
|
+
resize_bilinear(src, resized_image, new_width, new_height);
|
|
2340
|
+
break;
|
|
2341
|
+
case RESIZE_ALGO_BICUBIC:
|
|
2342
|
+
resize_bicubic(src, resized_image, new_width, new_height);
|
|
2343
|
+
break;
|
|
2344
|
+
default:
|
|
2345
|
+
throw std::runtime_error("Unsupported resize algorithm");
|
|
2346
|
+
}
|
|
2347
|
+
|
|
2348
|
+
// fill dst with pad_color
|
|
2349
|
+
fill(dst, pad_color);
|
|
2350
|
+
|
|
2351
|
+
int offset_x = (target_resolution.width - new_width) / 2;
|
|
2352
|
+
int offset_y = (target_resolution.height - new_height) / 2;
|
|
2353
|
+
|
|
2354
|
+
composite(dst, resized_image, offset_x, offset_y);
|
|
2355
|
+
}
|
|
2356
|
+
}
|
|
2357
|
+
|
|
2358
|
+
static void crop(const clip_image_u8 & image, clip_image_u8 & dst, int x, int y, int w, int h) {
|
|
2359
|
+
dst.nx = w;
|
|
2360
|
+
dst.ny = h;
|
|
2361
|
+
dst.buf.resize(3 * w * h);
|
|
2362
|
+
|
|
2363
|
+
for (int i = 0; i < h; ++i) {
|
|
2364
|
+
for (int j = 0; j < w; ++j) {
|
|
2365
|
+
int src_idx = 3 * ((y + i)*image.nx + (x + j));
|
|
2366
|
+
int dst_idx = 3 * (i*w + j);
|
|
2367
|
+
dst.buf[dst_idx] = image.buf[src_idx];
|
|
2368
|
+
dst.buf[dst_idx + 1] = image.buf[src_idx + 1];
|
|
2369
|
+
dst.buf[dst_idx + 2] = image.buf[src_idx + 2];
|
|
2370
|
+
}
|
|
2371
|
+
}
|
|
2372
|
+
}
|
|
2373
|
+
|
|
2374
|
+
// calculate the size of the **resized** image, while preserving the aspect ratio
|
|
2375
|
+
// the calculated size will be aligned to the nearest multiple of align_size
|
|
2376
|
+
// if H or W size is larger than longest_edge, it will be resized to longest_edge
|
|
2377
|
+
static clip_image_size calc_size_preserved_ratio(const clip_image_size & inp_size, const int align_size, const int longest_edge) {
|
|
2378
|
+
GGML_ASSERT(align_size > 0);
|
|
2379
|
+
if (inp_size.width <= 0 || inp_size.height <= 0 || longest_edge <= 0) {
|
|
2380
|
+
return {0, 0};
|
|
2381
|
+
}
|
|
2382
|
+
|
|
2383
|
+
float scale = std::min(static_cast<float>(longest_edge) / inp_size.width,
|
|
2384
|
+
static_cast<float>(longest_edge) / inp_size.height);
|
|
2385
|
+
|
|
2386
|
+
float target_width_f = static_cast<float>(inp_size.width) * scale;
|
|
2387
|
+
float target_height_f = static_cast<float>(inp_size.height) * scale;
|
|
2388
|
+
|
|
2389
|
+
auto ceil_by_factor = [f = align_size](float x) { return static_cast<int>(std::ceil(x / static_cast<float>(f))) * f; };
|
|
2390
|
+
int aligned_width = ceil_by_factor(target_width_f);
|
|
2391
|
+
int aligned_height = ceil_by_factor(target_height_f);
|
|
2392
|
+
|
|
2393
|
+
return {aligned_width, aligned_height};
|
|
2394
|
+
}
|
|
2395
|
+
|
|
2396
|
+
// calculate the size of the **resized** image, while preserving the aspect ratio
|
|
2397
|
+
// the calculated size will have min_pixels <= W*H <= max_pixels
|
|
2398
|
+
// this is referred as "smart_resize" in transformers code
|
|
2399
|
+
static clip_image_size calc_size_preserved_ratio(const clip_image_size & inp_size, const int align_size, const int min_pixels, const int max_pixels) {
|
|
2400
|
+
GGML_ASSERT(align_size > 0);
|
|
2401
|
+
const int width = inp_size.width;
|
|
2402
|
+
const int height = inp_size.height;
|
|
2403
|
+
|
|
2404
|
+
auto round_by_factor = [f = align_size](float x) { return static_cast<int>(std::round(x / static_cast<float>(f))) * f; };
|
|
2405
|
+
auto ceil_by_factor = [f = align_size](float x) { return static_cast<int>(std::ceil(x / static_cast<float>(f))) * f; };
|
|
2406
|
+
auto floor_by_factor = [f = align_size](float x) { return static_cast<int>(std::floor(x / static_cast<float>(f))) * f; };
|
|
2407
|
+
|
|
2408
|
+
// always align up first
|
|
2409
|
+
int h_bar = std::max(align_size, round_by_factor(height));
|
|
2410
|
+
int w_bar = std::max(align_size, round_by_factor(width));
|
|
2411
|
+
|
|
2412
|
+
if (h_bar * w_bar > max_pixels) {
|
|
2413
|
+
const auto beta = std::sqrt(static_cast<float>(height * width) / max_pixels);
|
|
2414
|
+
h_bar = std::max(align_size, floor_by_factor(height / beta));
|
|
2415
|
+
w_bar = std::max(align_size, floor_by_factor(width / beta));
|
|
2416
|
+
} else if (h_bar * w_bar < min_pixels) {
|
|
2417
|
+
const auto beta = std::sqrt(static_cast<float>(min_pixels) / (height * width));
|
|
2418
|
+
h_bar = ceil_by_factor(height * beta);
|
|
2419
|
+
w_bar = ceil_by_factor(width * beta);
|
|
2420
|
+
}
|
|
2421
|
+
|
|
2422
|
+
return {w_bar, h_bar};
|
|
2423
|
+
}
|
|
2424
|
+
|
|
2425
|
+
// draw src image into dst image at offset (offset_x, offset_y)
|
|
2426
|
+
static void composite(clip_image_u8 & dst, const clip_image_u8 & src, int offset_x, int offset_y) {
|
|
2427
|
+
for (int y = 0; y < src.ny; ++y) {
|
|
2428
|
+
for (int x = 0; x < src.nx; ++x) {
|
|
2429
|
+
int dx = x + offset_x;
|
|
2430
|
+
int dy = y + offset_y;
|
|
2431
|
+
// skip pixels that would be out of bounds in the destination
|
|
2432
|
+
if (dx < 0 || dy < 0 || dx >= dst.nx || dy >= dst.ny) {
|
|
2433
|
+
continue;
|
|
2434
|
+
}
|
|
2435
|
+
size_t dst_idx = 3 * (static_cast<size_t>(dy) * dst.nx + static_cast<size_t>(dx));
|
|
2436
|
+
size_t src_idx = 3 * (static_cast<size_t>(y) * src.nx + static_cast<size_t>(x));
|
|
2437
|
+
dst.buf[dst_idx + 0] = src.buf[src_idx + 0];
|
|
2438
|
+
dst.buf[dst_idx + 1] = src.buf[src_idx + 1];
|
|
2439
|
+
dst.buf[dst_idx + 2] = src.buf[src_idx + 2];
|
|
2440
|
+
}
|
|
2441
|
+
}
|
|
2442
|
+
}
|
|
2443
|
+
|
|
2444
|
+
// fill the image with a solid color
|
|
2445
|
+
static void fill(clip_image_u8 & img, const std::array<uint8_t, 3> & color) {
|
|
2446
|
+
for (size_t i = 0; i < img.buf.size(); i += 3) {
|
|
2447
|
+
img.buf[i] = color[0];
|
|
2448
|
+
img.buf[i + 1] = color[1];
|
|
2449
|
+
img.buf[i + 2] = color[2];
|
|
2450
|
+
}
|
|
2451
|
+
}
|
|
2452
|
+
|
|
2453
|
+
private:
|
|
2454
|
+
// Bilinear resize function
|
|
2455
|
+
static void resize_bilinear(const clip_image_u8 & src, clip_image_u8 & dst, int target_width, int target_height) {
|
|
2456
|
+
dst.nx = target_width;
|
|
2457
|
+
dst.ny = target_height;
|
|
2458
|
+
dst.buf.resize(3 * target_width * target_height);
|
|
2459
|
+
|
|
2460
|
+
float x_ratio = static_cast<float>(src.nx - 1) / target_width;
|
|
2461
|
+
float y_ratio = static_cast<float>(src.ny - 1) / target_height;
|
|
2462
|
+
|
|
2463
|
+
for (int y = 0; y < target_height; y++) {
|
|
2464
|
+
for (int x = 0; x < target_width; x++) {
|
|
2465
|
+
float px = x_ratio * x;
|
|
2466
|
+
float py = y_ratio * y;
|
|
2467
|
+
int x_floor = static_cast<int>(px);
|
|
2468
|
+
int y_floor = static_cast<int>(py);
|
|
2469
|
+
float x_lerp = px - x_floor;
|
|
2470
|
+
float y_lerp = py - y_floor;
|
|
2471
|
+
|
|
2472
|
+
for (int c = 0; c < 3; c++) {
|
|
2473
|
+
float top = lerp(
|
|
2474
|
+
static_cast<float>(src.buf[3 * (y_floor * src.nx + x_floor) + c]),
|
|
2475
|
+
static_cast<float>(src.buf[3 * (y_floor * src.nx + (x_floor + 1)) + c]),
|
|
2476
|
+
x_lerp
|
|
2477
|
+
);
|
|
2478
|
+
float bottom = lerp(
|
|
2479
|
+
static_cast<float>(src.buf[3 * ((y_floor + 1) * src.nx + x_floor) + c]),
|
|
2480
|
+
static_cast<float>(src.buf[3 * ((y_floor + 1) * src.nx + (x_floor + 1)) + c]),
|
|
2481
|
+
x_lerp
|
|
2482
|
+
);
|
|
2483
|
+
dst.buf[3 * (y * target_width + x) + c] = static_cast<uint8_t>(lerp(top, bottom, y_lerp));
|
|
2484
|
+
}
|
|
2485
|
+
}
|
|
2486
|
+
}
|
|
2487
|
+
}
|
|
2488
|
+
|
|
2489
|
+
// Bicubic resize function
|
|
2490
|
+
// part of image will be cropped if the aspect ratio is different
|
|
2491
|
+
static bool resize_bicubic(const clip_image_u8 & img, clip_image_u8 & dst, int target_width, int target_height) {
|
|
2492
|
+
const int nx = img.nx;
|
|
2493
|
+
const int ny = img.ny;
|
|
2494
|
+
|
|
2495
|
+
dst.nx = target_width;
|
|
2496
|
+
dst.ny = target_height;
|
|
2497
|
+
dst.buf.resize(3 * target_width * target_height);
|
|
2498
|
+
|
|
2499
|
+
float Cc;
|
|
2500
|
+
float C[5] = {};
|
|
2501
|
+
float d0, d2, d3, a0, a1, a2, a3;
|
|
2502
|
+
int i, j, k, jj;
|
|
2503
|
+
int x, y;
|
|
2504
|
+
float dx, dy;
|
|
2505
|
+
float tx, ty;
|
|
2506
|
+
|
|
2507
|
+
tx = (float)nx / (float)target_width;
|
|
2508
|
+
ty = (float)ny / (float)target_height;
|
|
2509
|
+
|
|
2510
|
+
// Bicubic interpolation; adapted from ViT.cpp, inspired from :
|
|
2511
|
+
// -> https://github.com/yglukhov/bicubic-interpolation-image-processing/blob/master/libimage.c#L36
|
|
2512
|
+
// -> https://en.wikipedia.org/wiki/Bicubic_interpolation
|
|
2513
|
+
|
|
2514
|
+
for (i = 0; i < target_height; i++) {
|
|
2515
|
+
for (j = 0; j < target_width; j++) {
|
|
2516
|
+
x = (int)(tx * j);
|
|
2517
|
+
y = (int)(ty * i);
|
|
2518
|
+
|
|
2519
|
+
dx = tx * j - x;
|
|
2520
|
+
dy = ty * i - y;
|
|
2521
|
+
|
|
2522
|
+
for (k = 0; k < 3; k++) {
|
|
2523
|
+
for (jj = 0; jj <= 3; jj++) {
|
|
2524
|
+
d0 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x - 1, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k];
|
|
2525
|
+
d2 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x + 1, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k];
|
|
2526
|
+
d3 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x + 2, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k];
|
|
2527
|
+
a0 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k];
|
|
2528
|
+
|
|
2529
|
+
a1 = -1.0 / 3 * d0 + d2 - 1.0 / 6 * d3;
|
|
2530
|
+
a2 = 1.0 / 2 * d0 + 1.0 / 2 * d2;
|
|
2531
|
+
a3 = -1.0 / 6 * d0 - 1.0 / 2 * d2 + 1.0 / 6 * d3;
|
|
2532
|
+
|
|
2533
|
+
C[jj] = a0 + a1 * dx + a2 * dx * dx + a3 * dx * dx * dx;
|
|
2534
|
+
|
|
2535
|
+
d0 = C[0] - C[1];
|
|
2536
|
+
d2 = C[2] - C[1];
|
|
2537
|
+
d3 = C[3] - C[1];
|
|
2538
|
+
a0 = C[1];
|
|
2539
|
+
a1 = -1.0 / 3 * d0 + d2 - 1.0 / 6 * d3;
|
|
2540
|
+
a2 = 1.0 / 2 * d0 + 1.0 / 2 * d2;
|
|
2541
|
+
a3 = -1.0 / 6 * d0 - 1.0 / 2 * d2 + 1.0 / 6 * d3;
|
|
2542
|
+
Cc = a0 + a1 * dy + a2 * dy * dy + a3 * dy * dy * dy;
|
|
2543
|
+
|
|
2544
|
+
const uint8_t Cc2 = std::min(std::max(std::round(Cc), 0.0f), 255.0f);
|
|
2545
|
+
dst.buf[(i * target_width + j) * 3 + k] = float(Cc2);
|
|
2546
|
+
}
|
|
2547
|
+
}
|
|
2548
|
+
}
|
|
2549
|
+
}
|
|
2550
|
+
|
|
2551
|
+
return true;
|
|
2552
|
+
}
|
|
2553
|
+
|
|
2554
|
+
static inline int clip(int x, int lower, int upper) {
|
|
2555
|
+
return std::max(lower, std::min(x, upper));
|
|
2556
|
+
}
|
|
2557
|
+
|
|
2558
|
+
// Linear interpolation between two points
|
|
2559
|
+
static inline float lerp(float s, float e, float t) {
|
|
2560
|
+
return s + (e - s) * t;
|
|
2561
|
+
}
|
|
2562
|
+
};
|
|
2563
|
+
|
|
2564
|
+
/**
|
|
2565
|
+
* implementation of LLaVA-UHD:
|
|
2566
|
+
* - https://arxiv.org/pdf/2403.11703
|
|
2567
|
+
* - https://github.com/thunlp/LLaVA-UHD
|
|
2568
|
+
* - https://github.com/thunlp/LLaVA-UHD/blob/302301bc2175f7e717fb8548516188e89f649753/llava_uhd/train/llava-uhd/slice_logic.py#L118
|
|
2569
|
+
*
|
|
2570
|
+
* overview:
|
|
2571
|
+
* - an image always has a single overview (downscaled image)
|
|
2572
|
+
* - an image can have 0 or multiple slices, depending on the image size
|
|
2573
|
+
* - each slice can then be considered as a separate image
|
|
2574
|
+
*
|
|
2575
|
+
* for example:
|
|
2576
|
+
*
|
|
2577
|
+
* [overview] --> [slice 1] --> [slice 2]
|
|
2578
|
+
* | |
|
|
2579
|
+
* +--> [slice 3] --> [slice 4]
|
|
2580
|
+
*/
|
|
2581
|
+
struct llava_uhd {
|
|
2582
|
+
struct slice_coordinates {
|
|
2583
|
+
int x;
|
|
2584
|
+
int y;
|
|
2585
|
+
clip_image_size size;
|
|
2586
|
+
};
|
|
2587
|
+
|
|
2588
|
+
struct slice_instructions {
|
|
2589
|
+
clip_image_size overview_size; // size of downscaled image
|
|
2590
|
+
clip_image_size refined_size; // size of image right before slicing (must be multiple of slice size)
|
|
2591
|
+
clip_image_size grid_size; // grid_size.width * grid_size.height = number of slices
|
|
2592
|
+
std::vector<slice_coordinates> slices;
|
|
2593
|
+
|
|
2594
|
+
img_tool::resize_algo interpolation_overview = img_tool::RESIZE_ALGO_BILINEAR;
|
|
2595
|
+
bool padding_overview = false; // if true, refine image will be padded to the grid size (e.g. llava-1.6)
|
|
2596
|
+
std::array<uint8_t, 3> pad_color_overview = {0, 0, 0};
|
|
2597
|
+
|
|
2598
|
+
img_tool::resize_algo interpolation_refined = img_tool::RESIZE_ALGO_BICUBIC;
|
|
2599
|
+
bool padding_refined = false; // if true, refine image will be padded to the grid size (e.g. llava-1.6)
|
|
2600
|
+
std::array<uint8_t, 3> pad_color_refined = {0, 0, 0};
|
|
2601
|
+
};
|
|
2602
|
+
|
|
2603
|
+
static slice_instructions get_slice_instructions(struct clip_ctx * ctx, const clip_image_size & original_size) {
|
|
2604
|
+
slice_instructions res;
|
|
2605
|
+
const int patch_size = clip_get_patch_size(ctx);
|
|
2606
|
+
const int slice_size = clip_get_image_size(ctx);
|
|
2607
|
+
const int original_width = original_size.width;
|
|
2608
|
+
const int original_height = original_size.height;
|
|
2609
|
+
|
|
2610
|
+
const bool has_slices = original_size.width > slice_size || original_size.height > slice_size;
|
|
2611
|
+
const bool has_pinpoints = !ctx->model.hparams.image_res_candidates.empty();
|
|
2612
|
+
|
|
2613
|
+
if (!has_slices) {
|
|
2614
|
+
// skip slicing logic
|
|
2615
|
+
res.overview_size = clip_image_size{slice_size, slice_size};
|
|
2616
|
+
res.refined_size = clip_image_size{0, 0};
|
|
2617
|
+
res.grid_size = clip_image_size{0, 0};
|
|
2618
|
+
|
|
2619
|
+
return res;
|
|
2620
|
+
}
|
|
2621
|
+
|
|
2622
|
+
if (has_pinpoints) {
|
|
2623
|
+
// has pinpoints, use them to calculate the grid size (e.g. llava-1.6)
|
|
2624
|
+
auto refine_size = llava_uhd::select_best_resolution(
|
|
2625
|
+
original_size,
|
|
2626
|
+
ctx->model.hparams.image_res_candidates);
|
|
2627
|
+
res.overview_size = clip_image_size{slice_size, slice_size};
|
|
2628
|
+
res.refined_size = refine_size;
|
|
2629
|
+
res.grid_size = clip_image_size{0, 0};
|
|
2630
|
+
res.padding_refined = true;
|
|
2631
|
+
res.interpolation_refined = img_tool::RESIZE_ALGO_BILINEAR; // preserve old behavior when padding
|
|
2632
|
+
|
|
2633
|
+
LOG_DBG("%s: using pinpoints for slicing\n", __func__);
|
|
2634
|
+
LOG_DBG("%s: original size: %d x %d, overview size: %d x %d, refined size: %d x %d\n",
|
|
2635
|
+
__func__, original_width, original_height,
|
|
2636
|
+
res.overview_size.width, res.overview_size.height,
|
|
2637
|
+
res.refined_size.width, res.refined_size.height);
|
|
2638
|
+
|
|
2639
|
+
for (int y = 0; y < refine_size.height; y += slice_size) {
|
|
2640
|
+
for (int x = 0; x < refine_size.width; x += slice_size) {
|
|
2641
|
+
slice_coordinates slice;
|
|
2642
|
+
slice.x = x;
|
|
2643
|
+
slice.y = y;
|
|
2644
|
+
slice.size.width = std::min(slice_size, refine_size.width - x);
|
|
2645
|
+
slice.size.height = std::min(slice_size, refine_size.height - y);
|
|
2646
|
+
res.slices.push_back(slice);
|
|
2647
|
+
LOG_DBG("%s: slice %d: x=%d, y=%d, size=%dx%d\n",
|
|
2648
|
+
__func__, (int)res.slices.size() - 1,
|
|
2649
|
+
slice.x, slice.y, slice.size.width, slice.size.height);
|
|
2650
|
+
}
|
|
2651
|
+
}
|
|
2652
|
+
|
|
2653
|
+
res.grid_size.height = refine_size.height / slice_size;
|
|
2654
|
+
res.grid_size.width = refine_size.width / slice_size;
|
|
2655
|
+
LOG_DBG("%s: grid size: %d x %d\n", __func__, res.grid_size.width, res.grid_size.height);
|
|
2656
|
+
|
|
2657
|
+
return res;
|
|
2658
|
+
}
|
|
2659
|
+
|
|
2660
|
+
// no pinpoints, dynamically calculate the grid size (e.g. minicpmv)
|
|
2661
|
+
|
|
2662
|
+
auto best_size = get_best_resize(original_size, slice_size, patch_size, !has_slices);
|
|
2663
|
+
res.overview_size = best_size;
|
|
2664
|
+
|
|
2665
|
+
{
|
|
2666
|
+
const int max_slice_nums = 9; // TODO: this is only used by minicpmv, maybe remove it
|
|
2667
|
+
const float log_ratio = log((float)original_width / original_height);
|
|
2668
|
+
const float ratio = (float)original_width * original_height / (slice_size * slice_size);
|
|
2669
|
+
const int multiple = fmin(ceil(ratio), max_slice_nums);
|
|
2670
|
+
|
|
2671
|
+
auto best_grid = get_best_grid(max_slice_nums, multiple, log_ratio);
|
|
2672
|
+
auto refine_size = get_refine_size(original_size, best_grid, slice_size, patch_size, true);
|
|
2673
|
+
res.grid_size = best_grid;
|
|
2674
|
+
res.refined_size = refine_size;
|
|
2675
|
+
|
|
2676
|
+
LOG_DBG("%s: original size: %d x %d, overview size: %d x %d, refined size: %d x %d, grid size: %d x %d\n",
|
|
2677
|
+
__func__, original_width, original_height,
|
|
2678
|
+
res.overview_size.width, res.overview_size.height,
|
|
2679
|
+
res.refined_size.width, res.refined_size.height,
|
|
2680
|
+
res.grid_size.width, res.grid_size.height);
|
|
2681
|
+
|
|
2682
|
+
int width = refine_size.width;
|
|
2683
|
+
int height = refine_size.height;
|
|
2684
|
+
int grid_x = int(width / best_grid.width);
|
|
2685
|
+
int grid_y = int(height / best_grid.height);
|
|
2686
|
+
for (int patches_y = 0, ic = 0;
|
|
2687
|
+
patches_y < refine_size.height && ic < best_grid.height;
|
|
2688
|
+
patches_y += grid_y, ic += 1) {
|
|
2689
|
+
for (int patches_x = 0, jc = 0;
|
|
2690
|
+
patches_x < refine_size.width && jc < best_grid.width;
|
|
2691
|
+
patches_x += grid_x, jc += 1) {
|
|
2692
|
+
slice_coordinates slice;
|
|
2693
|
+
slice.x = patches_x;
|
|
2694
|
+
slice.y = patches_y;
|
|
2695
|
+
slice.size.width = grid_x;
|
|
2696
|
+
slice.size.height = grid_y;
|
|
2697
|
+
res.slices.push_back(slice);
|
|
2698
|
+
LOG_DBG("%s: slice %d: x=%d, y=%d, size=%dx%d\n",
|
|
2699
|
+
__func__, (int)res.slices.size() - 1,
|
|
2700
|
+
slice.x, slice.y, slice.size.width, slice.size.height);
|
|
2701
|
+
}
|
|
2702
|
+
}
|
|
2703
|
+
}
|
|
2704
|
+
|
|
2705
|
+
return res;
|
|
2706
|
+
}
|
|
2707
|
+
|
|
2708
|
+
static std::vector<clip_image_u8_ptr> slice_image(const clip_image_u8 * img, const slice_instructions & inst) {
|
|
2709
|
+
std::vector<clip_image_u8_ptr> output;
|
|
2710
|
+
|
|
2711
|
+
// resize to overview size
|
|
2712
|
+
clip_image_u8_ptr resized_img(clip_image_u8_init());
|
|
2713
|
+
img_tool::resize(*img, *resized_img, inst.overview_size, inst.interpolation_overview,
|
|
2714
|
+
inst.padding_overview, inst.pad_color_overview);
|
|
2715
|
+
output.push_back(std::move(resized_img));
|
|
2716
|
+
|
|
2717
|
+
if (inst.slices.empty()) {
|
|
2718
|
+
// no slices, just return the resized image
|
|
2719
|
+
return output;
|
|
2720
|
+
}
|
|
2721
|
+
|
|
2722
|
+
// resize to refined size
|
|
2723
|
+
clip_image_u8_ptr refined_img(clip_image_u8_init());
|
|
2724
|
+
img_tool::resize(*img, *refined_img, inst.refined_size, inst.interpolation_refined,
|
|
2725
|
+
inst.padding_refined, inst.pad_color_refined);
|
|
2726
|
+
|
|
2727
|
+
// create slices
|
|
2728
|
+
for (const auto & slice : inst.slices) {
|
|
2729
|
+
int x = slice.x;
|
|
2730
|
+
int y = slice.y;
|
|
2731
|
+
int w = slice.size.width;
|
|
2732
|
+
int h = slice.size.height;
|
|
2733
|
+
|
|
2734
|
+
clip_image_u8_ptr img_slice(clip_image_u8_init());
|
|
2735
|
+
img_tool::crop(*refined_img, *img_slice, x, y, w, h);
|
|
2736
|
+
output.push_back(std::move(img_slice));
|
|
2737
|
+
}
|
|
2738
|
+
|
|
2739
|
+
return output;
|
|
2740
|
+
}
|
|
2741
|
+
|
|
2742
|
+
private:
|
|
2743
|
+
// Computes a size for `original_size` whose sides are multiples of `patch_size`.
// When the image area exceeds scale_resolution^2 (or allow_upscale is set) the image is first
// rescaled, keeping its aspect ratio, so that its area is roughly scale_resolution^2.
// Each side is then snapped with ensure_divide(), which never returns less than patch_size.
static clip_image_size get_best_resize(const clip_image_size & original_size, int scale_resolution, int patch_size, bool allow_upscale = false) {
    int width  = original_size.width;
    int height = original_size.height;

    // compare areas in 64-bit: the int products overflow (undefined behavior) for very large images
    const long long area       = static_cast<long long>(width) * height;
    const long long area_limit = static_cast<long long>(scale_resolution) * scale_resolution;

    if (area > area_limit || allow_upscale) {
        const float r = static_cast<float>(width) / height;
        height = static_cast<int>(scale_resolution / std::sqrt(r));
        width  = static_cast<int>(height * r);
    }

    clip_image_size res;
    res.width  = ensure_divide(width,  patch_size);
    res.height = ensure_divide(height, patch_size);
    return res;
}
|
|
2756
|
+
|
|
2757
|
+
// Scales `orig` by a single factor so that it fits inside `target_max`, keeping the aspect ratio.
// The factor is the smaller of the per-axis ratios; the scaled sides are truncated to int.
static clip_image_size resize_maintain_aspect_ratio(const clip_image_size & orig, const clip_image_size & target_max) {
    const float ratio_w = static_cast<float>(target_max.width)  / orig.width;
    const float ratio_h = static_cast<float>(target_max.height) / orig.height;
    const float factor  = std::min(ratio_w, ratio_h);

    clip_image_size fitted{
        static_cast<int>(orig.width  * factor),
        static_cast<int>(orig.height * factor),
    };
    return fitted;
}
|
|
2766
|
+
|
|
2767
|
+
/**
|
|
2768
|
+
* Selects the best resolution from a list of possible resolutions based on the original size.
|
|
2769
|
+
*
|
|
2770
|
+
* For example, when given a list of resolutions:
|
|
2771
|
+
* - 100x100
|
|
2772
|
+
* - 200x100
|
|
2773
|
+
* - 100x200
|
|
2774
|
+
* - 200x200
|
|
2775
|
+
*
|
|
2776
|
+
* And an input image of size 111x200, then 100x200 is the best fit (least wasted resolution).
|
|
2777
|
+
*
|
|
2778
|
+
* @param original_size The original size of the image
|
|
2779
|
+
* @param possible_resolutions A list of possible resolutions
|
|
2780
|
+
* @return The best fit resolution
|
|
2781
|
+
*/
|
|
2782
|
+
static clip_image_size select_best_resolution(const clip_image_size & original_size, const std::vector<clip_image_size> & possible_resolutions) {
|
|
2783
|
+
clip_image_size best_fit;
|
|
2784
|
+
int min_wasted_area = std::numeric_limits<int>::max();
|
|
2785
|
+
int max_effective_resolution = 0;
|
|
2786
|
+
|
|
2787
|
+
for (const clip_image_size & candidate : possible_resolutions) {
|
|
2788
|
+
auto target_size = resize_maintain_aspect_ratio(original_size, candidate);
|
|
2789
|
+
int effective_resolution = std::min(
|
|
2790
|
+
target_size.width * target_size.height,
|
|
2791
|
+
original_size.width * original_size.height);
|
|
2792
|
+
int wasted_area = (candidate.width * candidate.height) - effective_resolution;
|
|
2793
|
+
|
|
2794
|
+
if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_area < min_wasted_area)) {
|
|
2795
|
+
max_effective_resolution = effective_resolution;
|
|
2796
|
+
min_wasted_area = wasted_area;
|
|
2797
|
+
best_fit = candidate;
|
|
2798
|
+
}
|
|
2799
|
+
|
|
2800
|
+
LOG_DBG("%s: candidate: %d x %d, target: %d x %d, wasted: %d, effective: %d\n", __func__, candidate.width, candidate.height, target_size.width, target_size.height, wasted_area, effective_resolution);
|
|
2801
|
+
}
|
|
2802
|
+
|
|
2803
|
+
return best_fit;
|
|
2804
|
+
}
|
|
2805
|
+
|
|
2806
|
+
// Rounds `length` to the nearest multiple of `patch_size` (computed in float), but never
// returns less than `patch_size` itself.
static int ensure_divide(int length, int patch_size) {
    const float n_units = std::round(static_cast<float>(length) / patch_size);
    const int   snapped = static_cast<int>(n_units * patch_size);
    return std::max(snapped, patch_size);
}
|
|
2809
|
+
|
|
2810
|
+
// Computes the size of the refined image for a grid of grid.width x grid.height cells.
// The original sides are snapped to multiples of the grid dimensions, the size of one cell is
// passed through get_best_resize(), and the result is multiplied back by the grid dimensions.
static clip_image_size get_refine_size(const clip_image_size & original_size, const clip_image_size & grid, int scale_resolution, int patch_size, bool allow_upscale = false) {
    const int n_cols = grid.width;
    const int n_rows = grid.height;

    // size of a single cell once the image sides are divisible by the grid
    clip_image_size cell;
    cell.width  = ensure_divide(original_size.width,  n_cols) / n_cols;
    cell.height = ensure_divide(original_size.height, n_rows) / n_rows;

    const clip_image_size best_cell = get_best_resize(cell, scale_resolution, patch_size, allow_upscale);

    clip_image_size refined;
    refined.width  = best_cell.width  * n_cols;
    refined.height = best_cell.height * n_rows;
    return refined;
}
|
|
2832
|
+
|
|
2833
|
+
// Picks the grid layout (width x height, in cells) whose aspect ratio is closest to the image's.
// Candidate cell counts are multiple - 1, multiple and multiple + 1, skipping a count of 1 and
// any count above max_slice_nums. Every factorization cols x (count / cols) of a candidate count
// is scored by |log_ratio - log(cols / rows)|; the first layout with the smallest score wins.
// Returns {1, 1} when no layout is considered.
static clip_image_size get_best_grid(const int max_slice_nums, const int multiple, const float log_ratio) {
    clip_image_size best_grid{1, 1};
    float min_error = std::numeric_limits<float>::infinity();

    for (int delta = -1; delta <= 1; ++delta) {
        const int n_cells = multiple + delta;
        if (n_cells == 1 || n_cells > max_slice_nums) {
            continue;
        }

        for (int cols = 1; cols <= n_cells; ++cols) {
            if (n_cells % cols != 0) {
                continue;
            }
            const int rows = n_cells / cols;

            const float error = std::abs(log_ratio - std::log(1.0 * cols / rows));
            if (error < min_error) {
                best_grid = clip_image_size{cols, rows};
                min_error = error;
            }
        }
    }

    return best_grid;
}
|
|
2864
|
+
};
|
|
2865
|
+
|
|
2866
|
+
// ref: https://github.com/huggingface/transformers/blob/v5.1.0/src/transformers/models/lfm2_vl/image_processing_lfm2_vl_fast.py
|
|
2867
|
+
// some of the logic is similar to llava_uhd, but with different hyperparameters and some logic is unique (e.g. grid layout)
|
|
2868
|
+
struct lfm2_vl_image_processor {
|
|
2869
|
+
// ref: https://huggingface.co/LiquidAI/LFM2.5-VL-1.6B/blob/main/processor_config.json
|
|
2870
|
+
static constexpr int min_tiles = 2;
|
|
2871
|
+
static constexpr int max_tiles = 10;
|
|
2872
|
+
static constexpr float max_pixels_tolerance = 2.0f;
|
|
2873
|
+
static constexpr int tile_size = 512;
|
|
2874
|
+
|
|
2875
|
+
static llava_uhd::slice_instructions get_slice_instructions(struct clip_ctx * ctx, const clip_image_size & original_size) {
|
|
2876
|
+
llava_uhd::slice_instructions inst;
|
|
2877
|
+
const auto & params = ctx->model.hparams;
|
|
2878
|
+
const int align_size = params.patch_size * params.n_merge;
|
|
2879
|
+
|
|
2880
|
+
inst.interpolation_overview = img_tool::RESIZE_ALGO_BILINEAR;
|
|
2881
|
+
inst.interpolation_refined = img_tool::RESIZE_ALGO_BILINEAR;
|
|
2882
|
+
inst.overview_size = img_tool::calc_size_preserved_ratio(original_size, align_size, params.image_min_pixels, params.image_max_pixels);
|
|
2883
|
+
|
|
2884
|
+
// tile if either dimension exceeds tile_size with tolerance
|
|
2885
|
+
const bool needs_tiling = original_size.width > tile_size * max_pixels_tolerance || original_size.height > tile_size * max_pixels_tolerance;
|
|
2886
|
+
|
|
2887
|
+
if (!needs_tiling) {
|
|
2888
|
+
inst.refined_size = clip_image_size{0, 0};
|
|
2889
|
+
inst.grid_size = clip_image_size{0, 0};
|
|
2890
|
+
return inst;
|
|
2891
|
+
}
|
|
2892
|
+
|
|
2893
|
+
const clip_image_size grid = get_grid_layout(original_size.height, original_size.width);
|
|
2894
|
+
|
|
2895
|
+
inst.grid_size = grid;
|
|
2896
|
+
inst.refined_size = clip_image_size{tile_size * grid.width, tile_size * grid.height};
|
|
2897
|
+
|
|
2898
|
+
LOG_DBG("%s: original size: %d x %d, overview size: %d x %d, refined size: %d x %d, grid size: %d x %d\n",
|
|
2899
|
+
__func__,
|
|
2900
|
+
original_size.width, original_size.height,
|
|
2901
|
+
inst.overview_size.width, inst.overview_size.height,
|
|
2902
|
+
inst.refined_size.width, inst.refined_size.height,
|
|
2903
|
+
grid.width, grid.height);
|
|
2904
|
+
|
|
2905
|
+
for (int row = 0; row < grid.height; row++) {
|
|
2906
|
+
for (int col = 0; col < grid.width; col++) {
|
|
2907
|
+
llava_uhd::slice_coordinates slice;
|
|
2908
|
+
slice.x = col * tile_size;
|
|
2909
|
+
slice.y = row * tile_size;
|
|
2910
|
+
slice.size = clip_image_size{tile_size, tile_size};
|
|
2911
|
+
inst.slices.push_back(slice);
|
|
2912
|
+
LOG_DBG("%s: slice %d: x=%d, y=%d, size=%d x %d\n",
|
|
2913
|
+
__func__, (int)inst.slices.size() - 1,
|
|
2914
|
+
slice.x, slice.y, slice.size.width, slice.size.height);
|
|
2915
|
+
}
|
|
2916
|
+
}
|
|
2917
|
+
|
|
2918
|
+
return inst;
|
|
2919
|
+
}
|
|
2920
|
+
|
|
2921
|
+
private:
|
|
2922
|
+
static clip_image_size find_closest_aspect_ratio(
|
|
2923
|
+
float aspect_ratio,
|
|
2924
|
+
const std::vector<clip_image_size> & target_ratios,
|
|
2925
|
+
int width, int height) {
|
|
2926
|
+
float best_ratio_diff = std::numeric_limits<float>::max();
|
|
2927
|
+
clip_image_size best_ratio = {1, 1};
|
|
2928
|
+
const float area = static_cast<float>(width * height);
|
|
2929
|
+
|
|
2930
|
+
for (const auto & ratio : target_ratios) {
|
|
2931
|
+
const float target_aspect_ratio = static_cast<float>(ratio.width) / ratio.height;
|
|
2932
|
+
const float ratio_diff = std::abs(aspect_ratio - target_aspect_ratio);
|
|
2933
|
+
if (ratio_diff < best_ratio_diff) {
|
|
2934
|
+
best_ratio_diff = ratio_diff;
|
|
2935
|
+
best_ratio = ratio;
|
|
2936
|
+
} else if (ratio_diff == best_ratio_diff) {
|
|
2937
|
+
const float target_area = static_cast<float>(tile_size * tile_size * ratio.width * ratio.height);
|
|
2938
|
+
if (area > 0.5f * target_area) {
|
|
2939
|
+
best_ratio = ratio;
|
|
2940
|
+
}
|
|
2941
|
+
}
|
|
2942
|
+
}
|
|
2943
|
+
return best_ratio;
|
|
2944
|
+
}
|
|
2945
|
+
|
|
2946
|
+
static std::vector<clip_image_size> get_target_ratios() {
|
|
2947
|
+
std::vector<clip_image_size> ratios;
|
|
2948
|
+
for (int n = min_tiles; n <= max_tiles; n++) {
|
|
2949
|
+
for (int w = 1; w <= n; w++) {
|
|
2950
|
+
for (int h = 1; h <= n; h++) {
|
|
2951
|
+
if (w * h >= min_tiles && w * h <= max_tiles) {
|
|
2952
|
+
bool found = false;
|
|
2953
|
+
for (const auto & r : ratios) {
|
|
2954
|
+
if (r.width == w && r.height == h) {
|
|
2955
|
+
found = true;
|
|
2956
|
+
break;
|
|
2957
|
+
}
|
|
2958
|
+
}
|
|
2959
|
+
if (!found) {
|
|
2960
|
+
ratios.push_back({w, h});
|
|
2961
|
+
}
|
|
2962
|
+
}
|
|
2963
|
+
}
|
|
2964
|
+
}
|
|
2965
|
+
}
|
|
2966
|
+
std::sort(ratios.begin(), ratios.end(), [](const clip_image_size & a, const clip_image_size & b) {
|
|
2967
|
+
return a.width * a.height < b.width * b.height;
|
|
2968
|
+
});
|
|
2969
|
+
return ratios;
|
|
2970
|
+
}
|
|
2971
|
+
|
|
2972
|
+
static clip_image_size get_grid_layout(int height, int width) {
|
|
2973
|
+
const float aspect_ratio = static_cast<float>(width) / height;
|
|
2974
|
+
const auto ratios = get_target_ratios();
|
|
2975
|
+
return find_closest_aspect_ratio(aspect_ratio, ratios, width, height);
|
|
2976
|
+
}
|
|
2977
|
+
};
|
|
2978
|
+
|
|
2979
|
+
// returns the normalized float tensor for llava-1.5, for spatial_unpad with anyres processing for llava-1.6 it returns the normalized image patch tensors as a vector
|
|
2980
|
+
// res_imgs memory is being allocated here, previous allocations will be freed if found
|
|
2981
|
+
bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, struct clip_image_f32_batch * res_imgs) {
|
|
2982
|
+
clip_image_size original_size{img->nx, img->ny};
|
|
2983
|
+
auto & params = ctx->model.hparams;
|
|
2984
|
+
|
|
2985
|
+
switch (ctx->proj_type()) {
|
|
2986
|
+
case PROJECTOR_TYPE_MINICPMV:
|
|
2987
|
+
{
|
|
2988
|
+
auto const inst = llava_uhd::get_slice_instructions(ctx, original_size);
|
|
2989
|
+
std::vector<clip_image_u8_ptr> imgs = llava_uhd::slice_image(img, inst);
|
|
2990
|
+
|
|
2991
|
+
for (size_t i = 0; i < imgs.size(); ++i) {
|
|
2992
|
+
// clip_image_save_to_bmp(*imgs[i], "slice_" + std::to_string(i) + ".bmp");
|
|
2993
|
+
clip_image_f32_ptr res(clip_image_f32_init());
|
|
2994
|
+
normalize_image_u8_to_f32(*imgs[i], *res, params.image_mean, params.image_std);
|
|
2995
|
+
res_imgs->entries.push_back(std::move(res));
|
|
2996
|
+
}
|
|
2997
|
+
|
|
2998
|
+
res_imgs->grid_x = inst.grid_size.width;
|
|
2999
|
+
res_imgs->grid_y = inst.grid_size.height;
|
|
3000
|
+
} break;
|
|
3001
|
+
|
|
3002
|
+
case PROJECTOR_TYPE_QWEN2VL:
|
|
3003
|
+
case PROJECTOR_TYPE_QWEN25VL:
|
|
3004
|
+
case PROJECTOR_TYPE_QWEN3VL:
|
|
3005
|
+
case PROJECTOR_TYPE_GLM4V:
|
|
3006
|
+
case PROJECTOR_TYPE_PADDLEOCR:
|
|
3007
|
+
{
|
|
3008
|
+
GGML_ASSERT(params.image_min_pixels > 0 && params.image_max_pixels > 0);
|
|
3009
|
+
clip_image_u8 resized;
|
|
3010
|
+
const clip_image_size new_size = img_tool::calc_size_preserved_ratio(
|
|
3011
|
+
original_size,
|
|
3012
|
+
params.patch_size * 2,
|
|
3013
|
+
params.image_min_pixels,
|
|
3014
|
+
params.image_max_pixels);
|
|
3015
|
+
img_tool::resize(*img, resized, new_size, img_tool::RESIZE_ALGO_BILINEAR, false);
|
|
3016
|
+
// clip_image_save_to_bmp(resized, "preproc.bmp");
|
|
3017
|
+
clip_image_f32_ptr img_f32(clip_image_f32_init());
|
|
3018
|
+
// clip_image_f32_ptr res(clip_image_f32_init());
|
|
3019
|
+
normalize_image_u8_to_f32(resized, *img_f32, params.image_mean, params.image_std);
|
|
3020
|
+
// res_imgs->data[0] = *res;
|
|
3021
|
+
res_imgs->entries.push_back(std::move(img_f32));
|
|
3022
|
+
} break;
|
|
3023
|
+
case PROJECTOR_TYPE_YOUTUVL:
|
|
3024
|
+
{
|
|
3025
|
+
const int patch_size = params.patch_size; // typically 16
|
|
3026
|
+
const int merge_size = params.n_merge; // typically 2
|
|
3027
|
+
const int align_size = patch_size * merge_size; // 32
|
|
3028
|
+
|
|
3029
|
+
const int max_num_patches = params.image_max_pixels > 0 ?
|
|
3030
|
+
params.image_max_pixels / (patch_size * patch_size) : 256;
|
|
3031
|
+
|
|
3032
|
+
// Linear search for optimal scale to fit within max_num_patches
|
|
3033
|
+
float scale = 1.0f;
|
|
3034
|
+
int target_height = original_size.height;
|
|
3035
|
+
int target_width = original_size.width;
|
|
3036
|
+
|
|
3037
|
+
auto get_scaled_image_size = [align_size](float scale, int size) -> int {
|
|
3038
|
+
float scaled_size = size * scale;
|
|
3039
|
+
// Round up to nearest multiple of align_size
|
|
3040
|
+
int aligned = static_cast<int>(std::ceil(scaled_size / align_size)) * align_size;
|
|
3041
|
+
// Ensure at least one patch
|
|
3042
|
+
return std::max(align_size, aligned);
|
|
3043
|
+
};
|
|
3044
|
+
|
|
3045
|
+
// Linear search with 0.02 step size
|
|
3046
|
+
while (scale > 0.0f) {
|
|
3047
|
+
target_height = get_scaled_image_size(scale, original_size.height);
|
|
3048
|
+
target_width = get_scaled_image_size(scale, original_size.width);
|
|
3049
|
+
|
|
3050
|
+
int num_patches_h = target_height / patch_size;
|
|
3051
|
+
int num_patches_w = target_width / patch_size;
|
|
3052
|
+
int num_patches = num_patches_h * num_patches_w;
|
|
3053
|
+
|
|
3054
|
+
if (num_patches > max_num_patches) {
|
|
3055
|
+
scale -= 0.02f;
|
|
3056
|
+
} else {
|
|
3057
|
+
break;
|
|
3058
|
+
}
|
|
3059
|
+
}
|
|
3060
|
+
|
|
3061
|
+
clip_image_size new_size = {target_width, target_height};
|
|
3062
|
+
|
|
3063
|
+
// Resize the image
|
|
3064
|
+
clip_image_u8 resized;
|
|
3065
|
+
img_tool::resize(*img, resized, new_size, img_tool::RESIZE_ALGO_BILINEAR, false);
|
|
3066
|
+
|
|
3067
|
+
// Normalize to float32
|
|
3068
|
+
clip_image_f32_ptr img_f32(clip_image_f32_init());
|
|
3069
|
+
normalize_image_u8_to_f32(resized, *img_f32, params.image_mean, params.image_std);
|
|
3070
|
+
|
|
3071
|
+
// Add to results
|
|
3072
|
+
res_imgs->entries.push_back(std::move(img_f32));
|
|
3073
|
+
} break;
|
|
3074
|
+
|
|
3075
|
+
case PROJECTOR_TYPE_IDEFICS3:
|
|
3076
|
+
{
|
|
3077
|
+
// The refined size has two steps:
|
|
3078
|
+
// 1. Resize w/ aspect-ratio preserving such that the longer side is
|
|
3079
|
+
// the preprocessor longest size
|
|
3080
|
+
// 2. Resize w/out preserving aspect ratio such that both sides are
|
|
3081
|
+
// multiples of image_size (always rounding up)
|
|
3082
|
+
//
|
|
3083
|
+
// CITE: https://github.com/huggingface/transformers/blob/main/src/transformers/models/idefics3/image_processing_idefics3.py#L737
|
|
3084
|
+
const clip_image_size refined_size = img_tool::calc_size_preserved_ratio(
|
|
3085
|
+
original_size, params.image_size, params.image_longest_edge);
|
|
3086
|
+
// LOG_INF("%s: original size: %d x %d, refined size: %d x %d\n",
|
|
3087
|
+
// __func__, original_size.width, original_size.height,
|
|
3088
|
+
// refined_size.width, refined_size.height);
|
|
3089
|
+
|
|
3090
|
+
llava_uhd::slice_instructions instructions;
|
|
3091
|
+
instructions.overview_size = clip_image_size{params.image_size, params.image_size};
|
|
3092
|
+
instructions.refined_size = refined_size;
|
|
3093
|
+
instructions.grid_size = clip_image_size{
|
|
3094
|
+
static_cast<int>(std::ceil(static_cast<float>(refined_size.width) / params.image_size)),
|
|
3095
|
+
static_cast<int>(std::ceil(static_cast<float>(refined_size.height) / params.image_size)),
|
|
3096
|
+
};
|
|
3097
|
+
for (int y = 0; y < refined_size.height; y += params.image_size) {
|
|
3098
|
+
for (int x = 0; x < refined_size.width; x += params.image_size) {
|
|
3099
|
+
// LOG_INF("%s: adding slice at x=%d, y=%d\n", __func__, x, y);
|
|
3100
|
+
instructions.slices.push_back(llava_uhd::slice_coordinates{
|
|
3101
|
+
/* x */x,
|
|
3102
|
+
/* y */y,
|
|
3103
|
+
/* size */clip_image_size{
|
|
3104
|
+
std::min(params.image_size, refined_size.width - x),
|
|
3105
|
+
std::min(params.image_size, refined_size.height - y)
|
|
3106
|
+
}
|
|
3107
|
+
});
|
|
3108
|
+
}
|
|
3109
|
+
}
|
|
3110
|
+
auto imgs = llava_uhd::slice_image(img, instructions);
|
|
3111
|
+
|
|
3112
|
+
// cast and normalize to f32
|
|
3113
|
+
for (size_t i = 0; i < imgs.size(); ++i) {
|
|
3114
|
+
// clip_image_save_to_bmp(*imgs[i], "slice_" + std::to_string(i) + ".bmp");
|
|
3115
|
+
clip_image_f32_ptr res(clip_image_f32_init());
|
|
3116
|
+
normalize_image_u8_to_f32(*imgs[i], *res, params.image_mean, params.image_std);
|
|
3117
|
+
res_imgs->entries.push_back(std::move(res));
|
|
3118
|
+
}
|
|
3119
|
+
|
|
3120
|
+
res_imgs->grid_x = instructions.grid_size.width;
|
|
3121
|
+
res_imgs->grid_y = instructions.grid_size.height;
|
|
3122
|
+
} break;
|
|
3123
|
+
|
|
3124
|
+
case PROJECTOR_TYPE_GLM_EDGE:
|
|
3125
|
+
case PROJECTOR_TYPE_GEMMA3:
|
|
3126
|
+
case PROJECTOR_TYPE_INTERNVL: // TODO @ngxson : support dynamic resolution
|
|
3127
|
+
case PROJECTOR_TYPE_NEMOTRON_V2_VL:
|
|
3128
|
+
{
|
|
3129
|
+
clip_image_u8 resized_image;
|
|
3130
|
+
int sz = params.image_size;
|
|
3131
|
+
img_tool::resize(*img, resized_image, {sz, sz}, img_tool::RESIZE_ALGO_BILINEAR);
|
|
3132
|
+
clip_image_f32_ptr img_f32(clip_image_f32_init());
|
|
3133
|
+
//clip_image_save_to_bmp(resized_image, "resized.bmp");
|
|
3134
|
+
normalize_image_u8_to_f32(resized_image, *img_f32, params.image_mean, params.image_std);
|
|
3135
|
+
res_imgs->entries.push_back(std::move(img_f32));
|
|
3136
|
+
} break;
|
|
3137
|
+
|
|
3138
|
+
case PROJECTOR_TYPE_GEMMA3NV:
|
|
3139
|
+
{
|
|
3140
|
+
clip_image_u8 resized_image;
|
|
3141
|
+
int sz = params.image_size;
|
|
3142
|
+
img_tool::resize(*img, resized_image, {sz, sz}, img_tool::RESIZE_ALGO_BILINEAR, false);
|
|
3143
|
+
clip_image_f32_ptr img_f32(clip_image_f32_init());
|
|
3144
|
+
normalize_image_u8_to_f32(resized_image, *img_f32, params.image_mean, params.image_std);
|
|
3145
|
+
res_imgs->entries.push_back(std::move(img_f32));
|
|
3146
|
+
} break;
|
|
3147
|
+
|
|
3148
|
+
case PROJECTOR_TYPE_JANUS_PRO:
|
|
3149
|
+
{
|
|
3150
|
+
// Janus Pro preprocessing: pad to square with gray(127), resize to 384x384
|
|
3151
|
+
const std::array<uint8_t, 3> pad_color = {127, 127, 127};
|
|
3152
|
+
clip_image_u8 resized_image;
|
|
3153
|
+
int sz = params.image_size;
|
|
3154
|
+
img_tool::resize(*img, resized_image, {sz, sz}, img_tool::RESIZE_ALGO_BILINEAR, true, pad_color);
|
|
3155
|
+
clip_image_f32_ptr img_f32(clip_image_f32_init());
|
|
3156
|
+
normalize_image_u8_to_f32(resized_image, *img_f32, params.image_mean, params.image_std);
|
|
3157
|
+
res_imgs->entries.push_back(std::move(img_f32));
|
|
3158
|
+
} break;
|
|
3159
|
+
|
|
3160
|
+
case PROJECTOR_TYPE_PIXTRAL:
|
|
3161
|
+
case PROJECTOR_TYPE_LIGHTONOCR:
|
|
3162
|
+
{
|
|
3163
|
+
GGML_ASSERT(params.image_min_pixels > 0 && params.image_max_pixels > 0);
|
|
3164
|
+
clip_image_u8 resized_image;
|
|
3165
|
+
// the original pixtral model doesn't have n_merge
|
|
3166
|
+
const int cur_merge = params.n_merge == 0 ? 1 : params.n_merge;
|
|
3167
|
+
const clip_image_size target_size = img_tool::calc_size_preserved_ratio(
|
|
3168
|
+
original_size,
|
|
3169
|
+
params.patch_size * cur_merge,
|
|
3170
|
+
params.image_min_pixels,
|
|
3171
|
+
params.image_max_pixels);
|
|
3172
|
+
img_tool::resize(*img, resized_image, target_size, img_tool::RESIZE_ALGO_BILINEAR);
|
|
3173
|
+
clip_image_f32_ptr img_f32(clip_image_f32_init());
|
|
3174
|
+
normalize_image_u8_to_f32(resized_image, *img_f32, params.image_mean, params.image_std);
|
|
3175
|
+
res_imgs->entries.push_back(std::move(img_f32));
|
|
3176
|
+
} break;
|
|
3177
|
+
|
|
3178
|
+
case PROJECTOR_TYPE_LLAMA4:
|
|
3179
|
+
{
|
|
3180
|
+
GGML_ASSERT(!params.image_res_candidates.empty());
|
|
3181
|
+
auto const inst = llava_uhd::get_slice_instructions(ctx, original_size);
|
|
3182
|
+
std::vector<clip_image_u8_ptr> imgs = llava_uhd::slice_image(img, inst);
|
|
3183
|
+
|
|
3184
|
+
for (size_t i = 0; i < imgs.size(); ++i) {
|
|
3185
|
+
clip_image_f32_ptr res(clip_image_f32_init());
|
|
3186
|
+
normalize_image_u8_to_f32(*imgs[i], *res, params.image_mean, params.image_std);
|
|
3187
|
+
res_imgs->entries.push_back(std::move(res));
|
|
3188
|
+
}
|
|
3189
|
+
|
|
3190
|
+
res_imgs->grid_x = inst.grid_size.width;
|
|
3191
|
+
res_imgs->grid_y = inst.grid_size.height;
|
|
3192
|
+
} break;
|
|
3193
|
+
|
|
3194
|
+
case PROJECTOR_TYPE_LFM2:
|
|
3195
|
+
{
|
|
3196
|
+
auto const inst = lfm2_vl_image_processor::get_slice_instructions(ctx, original_size);
|
|
3197
|
+
std::vector<clip_image_u8_ptr> imgs = llava_uhd::slice_image(img, inst);
|
|
3198
|
+
|
|
3199
|
+
for (size_t i = 0; i < imgs.size(); ++i) {
|
|
3200
|
+
clip_image_f32_ptr res(clip_image_f32_init());
|
|
3201
|
+
normalize_image_u8_to_f32(*imgs[i], *res, params.image_mean, params.image_std);
|
|
3202
|
+
res_imgs->entries.push_back(std::move(res));
|
|
3203
|
+
}
|
|
3204
|
+
|
|
3205
|
+
res_imgs->grid_x = inst.grid_size.width;
|
|
3206
|
+
res_imgs->grid_y = inst.grid_size.height;
|
|
3207
|
+
} break;
|
|
3208
|
+
|
|
3209
|
+
case PROJECTOR_TYPE_KIMIVL:
|
|
3210
|
+
{
|
|
3211
|
+
GGML_ASSERT(params.image_min_pixels > 0 && params.image_max_pixels > 0);
|
|
3212
|
+
const clip_image_size target_size = img_tool::calc_size_preserved_ratio(
|
|
3213
|
+
original_size,
|
|
3214
|
+
params.patch_size * params.n_merge,
|
|
3215
|
+
params.image_min_pixels,
|
|
3216
|
+
params.image_max_pixels);
|
|
3217
|
+
const std::array<uint8_t, 3> pad_color = {122, 116, 104};
|
|
3218
|
+
|
|
3219
|
+
clip_image_u8 resized_img;
|
|
3220
|
+
img_tool::resize(*img, resized_img, target_size, img_tool::RESIZE_ALGO_BILINEAR, true, pad_color);
|
|
3221
|
+
clip_image_f32_ptr res(clip_image_f32_init());
|
|
3222
|
+
normalize_image_u8_to_f32(resized_img, *res, params.image_mean, params.image_std);
|
|
3223
|
+
res_imgs->entries.push_back(std::move(res));
|
|
3224
|
+
} break;
|
|
3225
|
+
|
|
3226
|
+
case PROJECTOR_TYPE_KIMIK25:
|
|
3227
|
+
{
|
|
3228
|
+
GGML_ASSERT(params.image_min_pixels > 0 && params.image_max_pixels > 0);
|
|
3229
|
+
const clip_image_size target_size = img_tool::calc_size_preserved_ratio(
|
|
3230
|
+
original_size,
|
|
3231
|
+
params.patch_size * params.n_merge,
|
|
3232
|
+
params.image_min_pixels,
|
|
3233
|
+
params.image_max_pixels);
|
|
3234
|
+
const std::array<uint8_t, 3> pad_color = {0, 0, 0};
|
|
3235
|
+
|
|
3236
|
+
clip_image_u8 resized_img;
|
|
3237
|
+
img_tool::resize(*img, resized_img, target_size, img_tool::RESIZE_ALGO_BICUBIC, true, pad_color);
|
|
3238
|
+
clip_image_f32_ptr res(clip_image_f32_init());
|
|
3239
|
+
normalize_image_u8_to_f32(resized_img, *res, params.image_mean, params.image_std);
|
|
3240
|
+
res_imgs->entries.push_back(std::move(res));
|
|
3241
|
+
} break;
|
|
3242
|
+
|
|
3243
|
+
case PROJECTOR_TYPE_MLP:
|
|
3244
|
+
case PROJECTOR_TYPE_MLP_NORM:
|
|
3245
|
+
case PROJECTOR_TYPE_LDP:
|
|
3246
|
+
case PROJECTOR_TYPE_LDPV2:
|
|
3247
|
+
case PROJECTOR_TYPE_COGVLM: // TODO @ngxson : is this correct for cogvlm?
|
|
3248
|
+
{
|
|
3249
|
+
// TODO @ngxson : refactor the code below to avoid duplicated logic
|
|
3250
|
+
|
|
3251
|
+
// the logic below is to pad the shorter side to the longer side with a background color: rgb(122, 116, 104)
|
|
3252
|
+
// see https://github.com/haotian-liu/LLaVA/blob/e854a2bf85118c504f6f16bf5c3c7c92f8fa8c6b/llava/conversation.py#L113-L156
|
|
3253
|
+
|
|
3254
|
+
clip_image_u8_ptr temp(clip_image_u8_init()); // we will keep the input image data here temporarily
|
|
3255
|
+
|
|
3256
|
+
// The model config actually contains all we need to decide on how to preprocess, here we automatically switch to the new llava-1.6 preprocessing
|
|
3257
|
+
if (params.image_res_candidates.empty()) { // pad_to_square
|
|
3258
|
+
// for llava-1.5, we resize image to a square, and pad the shorter side with a background color
|
|
3259
|
+
// see https://github.com/haotian-liu/LLaVA/blob/e854a2bf85118c504f6f16bf5c3c7c92f8fa8c6b/llava/conversation.py#L113-L156
|
|
3260
|
+
const int longer_side = std::max(img->nx, img->ny);
|
|
3261
|
+
temp->nx = longer_side;
|
|
3262
|
+
temp->ny = longer_side;
|
|
3263
|
+
temp->buf.resize(3 * longer_side * longer_side);
|
|
3264
|
+
|
|
3265
|
+
// background color in RGB from LLaVA (this is the mean rgb color * 255)
|
|
3266
|
+
const std::array<uint8_t, 3> pad_color = {122, 116, 104};
|
|
3267
|
+
|
|
3268
|
+
// resize the image to the target_size
|
|
3269
|
+
img_tool::resize(*img, *temp, clip_image_size{params.image_size, params.image_size}, img_tool::RESIZE_ALGO_BILINEAR, true, pad_color);
|
|
3270
|
+
|
|
3271
|
+
clip_image_f32_ptr res(clip_image_f32_init());
|
|
3272
|
+
normalize_image_u8_to_f32(*temp, *res, params.image_mean, params.image_std);
|
|
3273
|
+
res_imgs->entries.push_back(std::move(res));
|
|
3274
|
+
|
|
3275
|
+
} else {
|
|
3276
|
+
// "spatial_unpad" with "anyres" processing for llava-1.6
|
|
3277
|
+
auto const inst = llava_uhd::get_slice_instructions(ctx, original_size);
|
|
3278
|
+
std::vector<clip_image_u8_ptr> imgs = llava_uhd::slice_image(img, inst);
|
|
3279
|
+
|
|
3280
|
+
for (size_t i = 0; i < imgs.size(); ++i) {
|
|
3281
|
+
// clip_image_save_to_bmp(*imgs[i], "slice_" + std::to_string(i) + ".bmp");
|
|
3282
|
+
clip_image_f32_ptr res(clip_image_f32_init());
|
|
3283
|
+
normalize_image_u8_to_f32(*imgs[i], *res, params.image_mean, params.image_std);
|
|
3284
|
+
res_imgs->entries.push_back(std::move(res));
|
|
3285
|
+
}
|
|
3286
|
+
}
|
|
3287
|
+
} break;
|
|
3288
|
+
|
|
3289
|
+
default:
|
|
3290
|
+
LOG_ERR("%s: unsupported projector type %d\n", __func__, ctx->proj_type());
|
|
3291
|
+
return false;
|
|
3292
|
+
}
|
|
3293
|
+
|
|
3294
|
+
return true;
|
|
3295
|
+
}
|
|
3296
|
+
|
|
3297
|
+
// Returns the model's image_newline tensor pointer as stored in the context.
ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx) {
    const auto & model = ctx->model;
    return model.image_newline;
}
|
|
3300
|
+
|
|
3301
|
+
// Destroys a context created for this module. Passing nullptr is allowed and does nothing.
void clip_free(clip_ctx * ctx) {
    if (ctx != nullptr) {
        delete ctx;
    }
}
|
|
3307
|
+
|
|
3308
|
+
// deprecated
|
|
3309
|
+
size_t clip_embd_nbytes(const struct clip_ctx * ctx) {
|
|
3310
|
+
const int32_t nx = ctx->model.hparams.image_size;
|
|
3311
|
+
const int32_t ny = ctx->model.hparams.image_size;
|
|
3312
|
+
return clip_embd_nbytes_by_img(ctx, nx, ny);
|
|
3313
|
+
}
|
|
3314
|
+
|
|
3315
|
+
size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_w, int img_h) {
|
|
3316
|
+
clip_image_f32 img;
|
|
3317
|
+
img.nx = img_w;
|
|
3318
|
+
img.ny = img_h;
|
|
3319
|
+
return clip_n_output_tokens(ctx, &img) * clip_n_mmproj_embd(ctx) * sizeof(float);
|
|
3320
|
+
}
|
|
3321
|
+
|
|
3322
|
+
int32_t clip_get_image_size(const struct clip_ctx * ctx) {
|
|
3323
|
+
return ctx->model.hparams.image_size;
|
|
3324
|
+
}
|
|
3325
|
+
|
|
3326
|
+
int32_t clip_get_patch_size(const struct clip_ctx * ctx) {
|
|
3327
|
+
return ctx->model.hparams.patch_size;
|
|
3328
|
+
}
|
|
3329
|
+
|
|
3330
|
+
int32_t clip_get_hidden_size(const struct clip_ctx * ctx) {
|
|
3331
|
+
return ctx->model.hparams.n_embd;
|
|
3332
|
+
}
|
|
3333
|
+
|
|
3334
|
+
const char * clip_patch_merge_type(const struct clip_ctx * ctx) {
|
|
3335
|
+
return ctx->model.hparams.mm_patch_merge_type == PATCH_MERGE_SPATIAL_UNPAD ? "spatial_unpad" : "flat";
|
|
3336
|
+
}
|
|
3337
|
+
|
|
3338
|
+
int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
|
|
3339
|
+
const auto & params = ctx->model.hparams;
|
|
3340
|
+
const int n_total = clip_n_output_tokens(ctx, img);
|
|
3341
|
+
const auto & proj = ctx->proj_type();
|
|
3342
|
+
switch (proj) {
|
|
3343
|
+
case PROJECTOR_TYPE_QWEN2VL:
|
|
3344
|
+
case PROJECTOR_TYPE_QWEN25VL:
|
|
3345
|
+
case PROJECTOR_TYPE_QWEN3VL:
|
|
3346
|
+
case PROJECTOR_TYPE_GLM4V:
|
|
3347
|
+
case PROJECTOR_TYPE_PADDLEOCR:
|
|
3348
|
+
case PROJECTOR_TYPE_YOUTUVL:
|
|
3349
|
+
return (img->nx / params.patch_size) / 2;
|
|
3350
|
+
default:
|
|
3351
|
+
break;
|
|
3352
|
+
}
|
|
3353
|
+
return n_total;
|
|
3354
|
+
}
|
|
3355
|
+
|
|
3356
|
+
int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
|
|
3357
|
+
const auto & params = ctx->model.hparams;
|
|
3358
|
+
const auto & proj = ctx->proj_type();
|
|
3359
|
+
switch (proj) {
|
|
3360
|
+
case PROJECTOR_TYPE_QWEN2VL:
|
|
3361
|
+
case PROJECTOR_TYPE_QWEN25VL:
|
|
3362
|
+
case PROJECTOR_TYPE_QWEN3VL:
|
|
3363
|
+
case PROJECTOR_TYPE_GLM4V:
|
|
3364
|
+
case PROJECTOR_TYPE_PADDLEOCR:
|
|
3365
|
+
case PROJECTOR_TYPE_YOUTUVL:
|
|
3366
|
+
return (img->ny / params.patch_size) / 2;
|
|
3367
|
+
default:
|
|
3368
|
+
break;
|
|
3369
|
+
}
|
|
3370
|
+
return 1;
|
|
3371
|
+
}
|
|
3372
|
+
|
|
3373
|
+
int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
|
|
3374
|
+
const auto & params = ctx->model.hparams;
|
|
3375
|
+
|
|
3376
|
+
// for models with fixed size image, the input image is already pre-processed and resized to square
|
|
3377
|
+
int patch_size = params.patch_size;
|
|
3378
|
+
int n_patches = (img->nx / patch_size) * (img->ny / patch_size);
|
|
3379
|
+
|
|
3380
|
+
projector_type proj = ctx->proj_type();
|
|
3381
|
+
|
|
3382
|
+
switch (proj) {
|
|
3383
|
+
case PROJECTOR_TYPE_MLP:
|
|
3384
|
+
case PROJECTOR_TYPE_MLP_NORM:
|
|
3385
|
+
case PROJECTOR_TYPE_JANUS_PRO:
|
|
3386
|
+
{
|
|
3387
|
+
// do nothing
|
|
3388
|
+
} break;
|
|
3389
|
+
case PROJECTOR_TYPE_LDP:
|
|
3390
|
+
case PROJECTOR_TYPE_LDPV2:
|
|
3391
|
+
case PROJECTOR_TYPE_GLM_EDGE:
|
|
3392
|
+
{
|
|
3393
|
+
n_patches /= 4;
|
|
3394
|
+
if (ctx->model.mm_boi) {
|
|
3395
|
+
n_patches += 2; // for BOI and EOI token embeddings
|
|
3396
|
+
}
|
|
3397
|
+
} break;
|
|
3398
|
+
case PROJECTOR_TYPE_MINICPMV:
|
|
3399
|
+
{
|
|
3400
|
+
// Use actual config value if available, otherwise fall back to hardcoded values
|
|
3401
|
+
if (params.minicpmv_query_num > 0) {
|
|
3402
|
+
n_patches = params.minicpmv_query_num;
|
|
3403
|
+
} else {
|
|
3404
|
+
// Fallback to hardcoded values for legacy models
|
|
3405
|
+
if (params.minicpmv_version == 2) {
|
|
3406
|
+
n_patches = 96;
|
|
3407
|
+
} else if (params.minicpmv_version == 3) {
|
|
3408
|
+
n_patches = 64;
|
|
3409
|
+
} else if (params.minicpmv_version == 4) {
|
|
3410
|
+
n_patches = 64;
|
|
3411
|
+
} else if (params.minicpmv_version == 5) {
|
|
3412
|
+
// MiniCPM-V 4.0
|
|
3413
|
+
n_patches = 64;
|
|
3414
|
+
} else if (params.minicpmv_version == 6) {
|
|
3415
|
+
// MiniCPM-V 4.5
|
|
3416
|
+
n_patches = 64;
|
|
3417
|
+
} else if (params.minicpmv_version == 100045) {
|
|
3418
|
+
// MiniCPM-o 4.5
|
|
3419
|
+
n_patches = 64;
|
|
3420
|
+
} else {
|
|
3421
|
+
GGML_ABORT("Unknown minicpmv version");
|
|
3422
|
+
}
|
|
3423
|
+
}
|
|
3424
|
+
} break;
|
|
3425
|
+
case PROJECTOR_TYPE_QWEN2VL:
|
|
3426
|
+
case PROJECTOR_TYPE_QWEN25VL:
|
|
3427
|
+
case PROJECTOR_TYPE_QWEN3VL:
|
|
3428
|
+
case PROJECTOR_TYPE_GLM4V:
|
|
3429
|
+
case PROJECTOR_TYPE_YOUTUVL:
|
|
3430
|
+
{
|
|
3431
|
+
// dynamic size (2 conv, so double patch size)
|
|
3432
|
+
int x_patch = img->nx / (params.patch_size * 2);
|
|
3433
|
+
int y_patch = img->ny / (params.patch_size * 2);
|
|
3434
|
+
n_patches = x_patch * y_patch;
|
|
3435
|
+
} break;
|
|
3436
|
+
case PROJECTOR_TYPE_GEMMA3:
|
|
3437
|
+
case PROJECTOR_TYPE_IDEFICS3:
|
|
3438
|
+
case PROJECTOR_TYPE_INTERNVL:
|
|
3439
|
+
case PROJECTOR_TYPE_NEMOTRON_V2_VL:
|
|
3440
|
+
case PROJECTOR_TYPE_LLAMA4:
|
|
3441
|
+
{
|
|
3442
|
+
// both X and Y are downscaled by the scale factor
|
|
3443
|
+
int scale_factor = ctx->model.hparams.n_merge;
|
|
3444
|
+
n_patches /= (scale_factor * scale_factor);
|
|
3445
|
+
} break;
|
|
3446
|
+
case PROJECTOR_TYPE_GEMMA3NV:
|
|
3447
|
+
{
|
|
3448
|
+
// MobileNetV5 MSFA adapter always outputs fixed 16x16 resolution
|
|
3449
|
+
// regardless of input size (see architecture description)
|
|
3450
|
+
n_patches = ctx->model.hparams.image_size / ctx->model.hparams.patch_size;
|
|
3451
|
+
} break;
|
|
3452
|
+
case PROJECTOR_TYPE_LFM2:
|
|
3453
|
+
case PROJECTOR_TYPE_KIMIVL:
|
|
3454
|
+
case PROJECTOR_TYPE_KIMIK25:
|
|
3455
|
+
{
|
|
3456
|
+
// dynamic size
|
|
3457
|
+
int out_patch_size = params.patch_size * ctx->model.hparams.n_merge;
|
|
3458
|
+
int x_patch = CLIP_ALIGN(img->nx, out_patch_size) / out_patch_size;
|
|
3459
|
+
int y_patch = CLIP_ALIGN(img->ny, out_patch_size) / out_patch_size;
|
|
3460
|
+
n_patches = x_patch * y_patch;
|
|
3461
|
+
} break;
|
|
3462
|
+
case PROJECTOR_TYPE_PADDLEOCR:
|
|
3463
|
+
{
|
|
3464
|
+
// dynamic size
|
|
3465
|
+
int n_merge = ctx->model.hparams.n_merge;
|
|
3466
|
+
int stride = n_merge * n_merge;
|
|
3467
|
+
n_patches = CLIP_ALIGN(n_patches, stride) / stride;
|
|
3468
|
+
} break;
|
|
3469
|
+
case PROJECTOR_TYPE_PIXTRAL:
|
|
3470
|
+
case PROJECTOR_TYPE_LIGHTONOCR:
|
|
3471
|
+
{
|
|
3472
|
+
// dynamic size
|
|
3473
|
+
int n_merge = ctx->model.hparams.n_merge;
|
|
3474
|
+
int n_patches_x = img->nx / patch_size / (n_merge > 0 ? n_merge : 1);
|
|
3475
|
+
int n_patches_y = img->ny / patch_size / (n_merge > 0 ? n_merge : 1);
|
|
3476
|
+
if (ctx->model.token_embd_img_break) {
|
|
3477
|
+
n_patches = n_patches_y * n_patches_x + n_patches_y - 1; // + one [IMG_BREAK] per row, except the last row
|
|
3478
|
+
} else {
|
|
3479
|
+
n_patches = n_patches_y * n_patches_x;
|
|
3480
|
+
}
|
|
3481
|
+
} break;
|
|
3482
|
+
case PROJECTOR_TYPE_VOXTRAL:
|
|
3483
|
+
case PROJECTOR_TYPE_ULTRAVOX:
|
|
3484
|
+
case PROJECTOR_TYPE_QWEN2A:
|
|
3485
|
+
case PROJECTOR_TYPE_MUSIC_FLAMINGO:
|
|
3486
|
+
{
|
|
3487
|
+
n_patches = img->nx;
|
|
3488
|
+
|
|
3489
|
+
const int proj_stack_factor = ctx->model.hparams.proj_stack_factor;
|
|
3490
|
+
if (ctx->model.audio_has_stack_frames()) {
|
|
3491
|
+
GGML_ASSERT(proj_stack_factor > 0);
|
|
3492
|
+
const int n_len = CLIP_ALIGN(n_patches, proj_stack_factor);
|
|
3493
|
+
n_patches = n_len / proj_stack_factor;
|
|
3494
|
+
}
|
|
3495
|
+
|
|
3496
|
+
// whisper downscales input token by half after conv1d
|
|
3497
|
+
n_patches /= 2;
|
|
3498
|
+
|
|
3499
|
+
if (ctx->model.audio_has_avgpool()) {
|
|
3500
|
+
// divide by 2 because of nn.AvgPool1d(2, stride=2)
|
|
3501
|
+
n_patches /= 2;
|
|
3502
|
+
}
|
|
3503
|
+
} break;
|
|
3504
|
+
case PROJECTOR_TYPE_GLMA:
|
|
3505
|
+
{
|
|
3506
|
+
n_patches = img->nx;
|
|
3507
|
+
// whisper downscales input token by half after conv1d
|
|
3508
|
+
n_patches /= 2;
|
|
3509
|
+
// reshape by merge_factor
|
|
3510
|
+
n_patches /= ctx->model.hparams.proj_stack_factor;
|
|
3511
|
+
// for BOI and EOI token embeddings
|
|
3512
|
+
n_patches += 2;
|
|
3513
|
+
} break;
|
|
3514
|
+
case PROJECTOR_TYPE_COGVLM:
|
|
3515
|
+
{
|
|
3516
|
+
n_patches += 2; // for BOI and EOI token embeddings
|
|
3517
|
+
} break;
|
|
3518
|
+
case PROJECTOR_TYPE_LFM2A:
|
|
3519
|
+
{
|
|
3520
|
+
n_patches = ((((img->nx + 1) / 2) + 1) / 2 + 1) / 2;
|
|
3521
|
+
} break;
|
|
3522
|
+
default:
|
|
3523
|
+
GGML_ABORT("unsupported projector type");
|
|
3524
|
+
}
|
|
3525
|
+
|
|
3526
|
+
return n_patches;
|
|
3527
|
+
}
|
|
3528
|
+
|
|
3529
|
+
bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec) {
|
|
3530
|
+
clip_image_f32_batch imgs;
|
|
3531
|
+
clip_image_f32_ptr img_copy(clip_image_f32_init());
|
|
3532
|
+
*img_copy = *img;
|
|
3533
|
+
imgs.entries.push_back(std::move(img_copy));
|
|
3534
|
+
|
|
3535
|
+
return clip_image_batch_encode(ctx, n_threads, &imgs, vec);
|
|
3536
|
+
}
|
|
3537
|
+
|
|
3538
|
+
// Runs the encoder graph on one pre-processed input and optionally copies the
// resulting embeddings out.
//
//   ctx        - clip context; a warmup run is triggered first if its buffers are not allocated yet
//   n_threads  - forwarded to the CPU backend's "ggml_backend_set_n_threads" hook when it is available
//   imgs_c_ptr - batch to encode; must contain exactly one entry (image, or audio when is_audio is set)
//   vec        - destination for the embeddings (ggml_nbytes(embeddings) bytes are written);
//                may be nullptr, in which case nothing is copied out
//
// Returns false when the batch size is not 1 or when graph compute fails, true otherwise.
// Aborts (GGML_ABORT) on a missing/non-input graph tensor, an unknown projector type, or when
// the number of output tokens differs from clip_n_output_tokens().
bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs_c_ptr, float * vec) {
    const clip_image_f32_batch & imgs = *imgs_c_ptr;
    int batch_size = imgs.entries.size();

    // TODO @ngxson : implement batch size > 1 as a loop
    //                we don't need true batching support because the cgraph will gonna be big anyway
    if (batch_size != 1) {
        return false; // only support batch size of 1
    }

    // if buffers are not allocated, we need to do a warmup run to allocate them
    if (!ctx->is_allocated) {
        clip_model_loader::warmup(*ctx, *imgs_c_ptr);
    }

    // build the inference graph
    ggml_backend_sched_reset(ctx->sched.get());
    ggml_cgraph * gf = clip_image_build_graph(ctx, imgs);
    ggml_backend_sched_alloc_graph(ctx->sched.get(), gf);

    // set inputs
    const auto & model   = ctx->model;
    const auto & hparams = model.hparams;

    const int image_size_width  = imgs.entries[0]->nx;
    const int image_size_height = imgs.entries[0]->ny;

    const int patch_size  = hparams.patch_size;
    const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
    const int n_pos       = num_patches + (model.class_embedding ? 1 : 0);
    const int pos_w       = image_size_width  / patch_size;
    const int pos_h       = image_size_height / patch_size;

    // looks up a graph tensor by name and insists that it is flagged as an input
    auto get_inp_tensor = [&gf](const char * name) {
        ggml_tensor * inp = ggml_graph_get_tensor(gf, name);
        if (inp == nullptr) {
            GGML_ABORT("Failed to get tensor %s", name);
        }
        if (!(inp->flags & GGML_TENSOR_FLAG_INPUT)) {
            GGML_ABORT("Tensor %s is not an input tensor", name);
        }
        return inp;
    };

    // uploads `values` into the named F32 input tensor; element counts must match exactly
    auto set_input_f32 = [&get_inp_tensor](const char * name, std::vector<float> & values) {
        ggml_tensor * cur = get_inp_tensor(name);
        GGML_ASSERT(cur->type == GGML_TYPE_F32);
        GGML_ASSERT(ggml_nelements(cur) == (int64_t)values.size());
        ggml_backend_tensor_set(cur, values.data(), 0, ggml_nbytes(cur));
    };

    // uploads `values` into the named I32 input tensor; element counts must match exactly
    auto set_input_i32 = [&get_inp_tensor](const char * name, std::vector<int32_t> & values) {
        ggml_tensor * cur = get_inp_tensor(name);
        GGML_ASSERT(cur->type == GGML_TYPE_I32);
        GGML_ASSERT(ggml_nelements(cur) == (int64_t)values.size());
        ggml_backend_tensor_set(cur, values.data(), 0, ggml_nbytes(cur));
    };

    // set input pixel values
    if (!imgs.is_audio) {
        size_t nelem = 0;
        for (const auto & img : imgs.entries) {
            nelem += img->nx * img->ny * 3;
        }
        std::vector<float> inp_raw(nelem);

        // layout of data (note: the channel dim is unrolled to better visualize the layout):
        //
        // ┌──W──┐
        // │  H  │  channel = R
        // ├─────┤  │
        // │  H  │  channel = G
        // ├─────┤  │
        // │  H  │  channel = B
        // └─────┘  │
        //    ──────┘ x B

        // each entry is converted from interleaved RGB to planar R|G|B and written
        // right after the previous entry, using that entry's own dimensions
        float * batch_entry = inp_raw.data();
        for (const auto & img : imgs.entries) {
            const int nx = img->nx;
            const int ny = img->ny;
            const int n  = nx * ny;

            for (int y = 0; y < ny; y++) {
                for (int x = 0; x < nx; x++) {
                    size_t base_src = 3*(y * nx + x); // idx of the first channel
                    size_t base_dst =    y * nx + x;  // idx of the first channel
                    batch_entry[      base_dst] = img->buf[base_src    ];
                    batch_entry[1*n + base_dst] = img->buf[base_src + 1];
                    batch_entry[2*n + base_dst] = img->buf[base_src + 2];
                }
            }
            batch_entry += 3*n;
        }
        set_input_f32("inp_raw", inp_raw);

    } else {
        // audio input
        GGML_ASSERT(imgs.entries.size() == 1);
        const auto & mel_inp = imgs.entries[0];
        const int n_step = mel_inp->nx;
        const int n_mel  = mel_inp->ny;
        std::vector<float> inp_raw(n_step * n_mel);
        std::memcpy(inp_raw.data(), mel_inp->buf.data(), n_step * n_mel * sizeof(float));
        set_input_f32("inp_raw", inp_raw);
    }

    // set input per projector
    switch (ctx->model.proj_type) {
        case PROJECTOR_TYPE_MINICPMV:
            {
                // inspired from siglip:
                //    -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit
                //    -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit/blob/d66538faeba44480d0bfaa42145eef26f9423199/modeling_siglip.py#L316
                std::vector<int32_t> positions(pos_h * pos_w);
                // sized from the actual patch grid so that large inputs cannot overrun a fixed-size buffer
                std::vector<int> bucket_coords_h(pos_h);
                std::vector<int> bucket_coords_w(pos_w);
                for (int i = 0; i < pos_h; i++){
                    bucket_coords_h[i] = std::floor(70.0*i/pos_h);
                }
                for (int i = 0; i < pos_w; i++){
                    bucket_coords_w[i] = std::floor(70.0*i/pos_w);
                }
                for (int i = 0, id = 0; i < pos_h; i++){
                    for (int j = 0; j < pos_w; j++){
                        positions[id++] = bucket_coords_h[i]*70 + bucket_coords_w[j];
                    }
                }
                set_input_i32("positions", positions);

                // inputs for resampler projector
                // set the 2D positions (using float for sinusoidal embedding)
                int n_patches_per_col = image_size_width / patch_size;
                std::vector<float> pos_data(n_pos);
                // dimension H
                for (int i = 0; i < n_pos; i++) {
                    pos_data[i] = static_cast<float>(i / n_patches_per_col);
                }
                set_input_f32("pos_h", pos_data);
                // dimension W
                for (int i = 0; i < n_pos; i++) {
                    pos_data[i] = static_cast<float>(i % n_patches_per_col);
                }
                set_input_f32("pos_w", pos_data);
                // base frequency omega
                const float base_freq   = 10000.0f;
                const int   n_embd_proj = clip_n_mmproj_embd(ctx);
                std::vector<float> omega(n_embd_proj / 4);
                for (int i = 0; i < n_embd_proj / 4; ++i) {
                    omega[i] = 1.0f / std::pow(base_freq, static_cast<float>(i) / (n_embd_proj / 4));
                }
                set_input_f32("omega", omega);
            } break;
        case PROJECTOR_TYPE_QWEN2VL:
        case PROJECTOR_TYPE_QWEN3VL:
        case PROJECTOR_TYPE_GLM4V:
            {
                const int merge_ratio = hparams.n_merge;
                const int pw = image_size_width  / patch_size;
                const int ph = image_size_height / patch_size;
                std::vector<int> positions(n_pos * 4);
                int ptr = 0;
                for (int y = 0; y < ph; y += merge_ratio) {
                    for (int x = 0; x < pw; x += merge_ratio) {
                        for (int dy = 0; dy < 2; dy++) {
                            for (int dx = 0; dx < 2; dx++) {
                                positions[                  ptr] = y + dy;
                                positions[    num_patches + ptr] = x + dx;
                                positions[2 * num_patches + ptr] = y + dy;
                                positions[3 * num_patches + ptr] = x + dx;
                                ptr++;
                            }
                        }
                    }
                }

                set_input_i32("positions", positions);
            } break;
        case PROJECTOR_TYPE_PADDLEOCR:
            {
                const int merge_ratio = hparams.n_merge;
                const int pw = image_size_width  / patch_size;
                const int ph = image_size_height / patch_size;
                std::vector<int> positions(n_pos * 4);
                int ptr = 0;
                // NOTE: same as Qwen-VL, but x and y are swapped
                for (int y = 0; y < ph; y += merge_ratio) {
                    for (int dy = 0; dy < 2; dy++) {
                        for (int x = 0; x < pw; x += merge_ratio) {
                            for (int dx = 0; dx < 2; dx++) {
                                positions[                  ptr] = y + dy;
                                positions[    num_patches + ptr] = x + dx;
                                positions[2 * num_patches + ptr] = y + dy;
                                positions[3 * num_patches + ptr] = x + dx;
                                ptr++;
                            }
                        }
                    }
                }

                set_input_i32("positions", positions);
            } break;
        case PROJECTOR_TYPE_QWEN25VL:
        case PROJECTOR_TYPE_YOUTUVL:
            {
                // pw * ph = number of tokens output by ViT after apply patch merger
                // ipw * iph = number of vision tokens processed inside the ViT
                const bool use_window_attn = ctx->model.proj_type == PROJECTOR_TYPE_QWEN25VL ? hparams.n_wa_pattern > 0 : !hparams.wa_layer_indexes.empty();
                const int merge_ratio = 2;
                const int pw  = image_size_width  / patch_size / merge_ratio;
                const int ph  = image_size_height / patch_size / merge_ratio;
                const int ipw = image_size_width  / patch_size;
                const int iph = image_size_height / patch_size;

                std::vector<int> idx    (ph * pw);
                std::vector<int> inv_idx(ph * pw);

                if (use_window_attn) {
                    const int attn_window_size = hparams.attn_window_size > 0 ? hparams.attn_window_size : 112;
                    const int grid_window = attn_window_size / patch_size / merge_ratio;
                    int dst = 0;
                    // [num_vision_tokens, num_vision_tokens] attention mask tensor
                    // (sized with integer arithmetic so the element count is exact)
                    const int64_t n_vision_tokens = (int64_t) ipw * iph;
                    std::vector<float> mask((size_t) (n_vision_tokens * n_vision_tokens), std::numeric_limits<float>::lowest());
                    int mask_row = 0;

                    for (int y = 0; y < ph; y += grid_window) {
                        for (int x = 0; x < pw; x += grid_window) {
                            const int win_h = std::min(grid_window, ph - y);
                            const int win_w = std::min(grid_window, pw - x);
                            const int dst_0 = dst;
                            // group all tokens that belong to the same window together (into a contiguous range)
                            for (int dy = 0; dy < win_h; dy++) {
                                for (int dx = 0; dx < win_w; dx++) {
                                    const int src = (y + dy) * pw + (x + dx);
                                    GGML_ASSERT(src < (int)idx.size());
                                    GGML_ASSERT(dst < (int)inv_idx.size());
                                    idx    [src] = dst;
                                    inv_idx[dst] = src;
                                    dst++;
                                }
                            }

                            // un-mask (set to 0) the columns of this window for every row belonging to it
                            for (int r = 0; r < win_h * win_w * merge_ratio * merge_ratio; r++) {
                                const int64_t row_offset = (int64_t) mask_row * n_vision_tokens;
                                std::fill(
                                    mask.begin() + row_offset + (dst_0 * merge_ratio * merge_ratio),
                                    mask.begin() + row_offset + (dst   * merge_ratio * merge_ratio),
                                    0.0f);
                                mask_row++;
                            }
                        }
                    }

                    set_input_i32("window_idx",     idx);
                    set_input_i32("inv_window_idx", inv_idx);
                    set_input_f32("window_mask",    mask);
                } else {
                    for (int i = 0; i < ph * pw; i++) {
                        idx[i] = i;
                    }
                }

                const int mpow = merge_ratio * merge_ratio;
                std::vector<int> positions(n_pos * 4);

                int ptr = 0;
                for (int y = 0; y < iph; y += merge_ratio) {
                    for (int x = 0; x < ipw; x += merge_ratio) {
                        for (int dy = 0; dy < 2; dy++) {
                            for (int dx = 0; dx < 2; dx++) {
                                // move the position to wherever its merged token landed after window grouping
                                auto remap = idx[ptr / mpow];
                                remap = (remap * mpow) + (ptr % mpow);

                                positions[                  remap] = y + dy;
                                positions[    num_patches + remap] = x + dx;
                                positions[2 * num_patches + remap] = y + dy;
                                positions[3 * num_patches + remap] = x + dx;
                                ptr++;
                            }
                        }
                    }
                }

                set_input_i32("positions", positions);
            } break;
        case PROJECTOR_TYPE_PIXTRAL:
        case PROJECTOR_TYPE_KIMIVL:
        case PROJECTOR_TYPE_KIMIK25:
        case PROJECTOR_TYPE_LIGHTONOCR:
            {
                // set the 2D positions
                int n_patches_per_col = image_size_width / patch_size;
                std::vector<int> pos_data(n_pos);
                // dimension H
                for (int i = 0; i < n_pos; i++) {
                    pos_data[i] = i / n_patches_per_col;
                }
                set_input_i32("pos_h", pos_data);
                // dimension W
                for (int i = 0; i < n_pos; i++) {
                    pos_data[i] = i % n_patches_per_col;
                }
                set_input_i32("pos_w", pos_data);
            } break;
        case PROJECTOR_TYPE_GLM_EDGE:
            {
                // llava and other models
                std::vector<int32_t> positions(n_pos);
                for (int i = 0; i < n_pos; i++) {
                    positions[i] = i;
                }
                set_input_i32("positions", positions);
            } break;
        case PROJECTOR_TYPE_MLP:
        case PROJECTOR_TYPE_MLP_NORM:
        case PROJECTOR_TYPE_LDP:
        case PROJECTOR_TYPE_LDPV2:
            {
                // llava and other models
                std::vector<int32_t> positions(n_pos);
                for (int i = 0; i < n_pos; i++) {
                    positions[i] = i;
                }
                set_input_i32("positions", positions);

                // The patches vector is used to get rows to index into the embeds with;
                // we should skip dim 0 only if we have CLS to avoid going out of bounds
                // when retrieving the rows.
                int patch_offset = model.class_embedding ? 1 : 0;
                std::vector<int32_t> patches(num_patches);
                for (int i = 0; i < num_patches; i++) {
                    patches[i] = i + patch_offset;
                }
                set_input_i32("patches", patches);
            } break;
        case PROJECTOR_TYPE_GEMMA3:
        case PROJECTOR_TYPE_GEMMA3NV:
        case PROJECTOR_TYPE_IDEFICS3:
        case PROJECTOR_TYPE_INTERNVL:
        case PROJECTOR_TYPE_NEMOTRON_V2_VL:
        case PROJECTOR_TYPE_QWEN2A:
        case PROJECTOR_TYPE_GLMA:
        case PROJECTOR_TYPE_ULTRAVOX:
        case PROJECTOR_TYPE_LFM2:
        case PROJECTOR_TYPE_VOXTRAL:
        case PROJECTOR_TYPE_MUSIC_FLAMINGO:
        case PROJECTOR_TYPE_JANUS_PRO:
        case PROJECTOR_TYPE_COGVLM:
            {
                // do nothing
            } break;
        case PROJECTOR_TYPE_LLAMA4:
            {
                // set the 2D positions
                int n_patches_per_col = image_size_width / patch_size;
                std::vector<int> pos_data(num_patches + 1, 0); // +1 for the [CLS] token
                // last pos is always kept 0, it's for CLS
                // dimension H
                for (int i = 0; i < num_patches; i++) {
                    pos_data[i] = (i / n_patches_per_col) + 1;
                }
                set_input_i32("pos_h", pos_data);
                // dimension W
                for (int i = 0; i < num_patches; i++) {
                    pos_data[i] = (i % n_patches_per_col) + 1;
                }
                set_input_i32("pos_w", pos_data);
            } break;
        case PROJECTOR_TYPE_LFM2A:
            {
                GGML_ASSERT(imgs.entries.size() == 1);
                const auto n_frames = clip_n_output_tokens(ctx, imgs.entries.front().get());

                auto d_model = 512;
                auto seq_len = n_frames * 2 - 1;
                std::vector<float> pos_emb(d_model*seq_len);
                std::vector<double> inv_freq(d_model / 2);
                for (size_t i = 0; i < inv_freq.size(); ++i) {
                    inv_freq[i] = std::exp(-(std::log(10000.0) / (float)d_model) * (2.0f * (float)(i)));
                }
                // relative positions run from (n_frames - 1) down to -(n_frames - 1)
                for (int64_t pos = 0; pos < seq_len; ++pos) {
                    for (size_t i = 0; i < inv_freq.size(); ++i) {
                        const float ang = (n_frames - pos - 1) * inv_freq[i];
                        pos_emb[pos*d_model + 2*i + 0] = sinf(ang);  // even
                        pos_emb[pos*d_model + 2*i + 1] = cosf(ang);  // odd
                    }
                }
                set_input_f32("pos_emb", pos_emb);
            } break;
        default:
            GGML_ABORT("Unknown projector type");
    }

    // ggml_backend_cpu_set_n_threads(ctx->backend_cpu, n_threads);
    // the thread-count setter is resolved at runtime through the backend registry; it is skipped if unavailable
    ggml_backend_dev_t dev = ggml_backend_get_device(ctx->backend_cpu);
    ggml_backend_reg_t reg = dev ? ggml_backend_dev_backend_reg(dev) : nullptr;
    if (reg) {
        auto ggml_backend_set_n_threads_fn = (ggml_backend_set_n_threads_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads");
        if (ggml_backend_set_n_threads_fn) {
            ggml_backend_set_n_threads_fn(ctx->backend_cpu, n_threads);
        }
    }

    auto status = ggml_backend_sched_graph_compute(ctx->sched.get(), gf);
    if (status != GGML_STATUS_SUCCESS) {
        LOG_ERR("%s: ggml_backend_sched_graph_compute failed with error %d\n", __func__, status);
        return false;
    }

    // the last node is the embedding tensor
    ggml_tensor * embeddings = ggml_graph_node(gf, -1);

    // sanity check (only support batch size of 1 for now)
    const int n_tokens_out = embeddings->ne[1];
    const int expected_n_tokens_out = clip_n_output_tokens(ctx, imgs.entries[0].get());
    if (n_tokens_out != expected_n_tokens_out) {
        LOG_ERR("%s: expected output %d tokens, got %d\n", __func__, expected_n_tokens_out, n_tokens_out);
        GGML_ABORT("Invalid number of output tokens");
    }

    // copy the embeddings to the location passed by the user
    if (vec != nullptr) {
        ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings));
    }

    // Debug: dump final embeddings if MTMD_DEBUG_EMBEDDINGS is set
    if (std::getenv("MTMD_DEBUG_EMBEDDINGS") != nullptr) {
        const int64_t n_embd   = embeddings->ne[0];
        const int64_t n_tokens = embeddings->ne[1];
        std::vector<float> emb_data(n_embd * n_tokens);
        ggml_backend_tensor_get(embeddings, emb_data.data(), 0, ggml_nbytes(embeddings));

        LOG_INF("\n=== MTMD_DEBUG_EMBEDDINGS ===\n");
        LOG_INF("Shape: [%lld, %lld]\n", (long long)n_embd, (long long)n_tokens);

        // Print first few values of first token
        LOG_INF("Token 0 (first 16 values): ");
        for (int i = 0; i < std::min((int64_t)16, n_embd); i++) {
            LOG_INF("%.6f ", emb_data[i]);
        }
        LOG_INF("\n");

        // Print last few values of first token
        if (n_embd > 16) {
            LOG_INF("Token 0 (last 16 values): ");
            for (int64_t i = n_embd - 16; i < n_embd; i++) {
                LOG_INF("%.6f ", emb_data[i]);
            }
            LOG_INF("\n");
        }

        // Compute and print statistics
        float sum = 0.0f, sum_sq = 0.0f, min_val = emb_data[0], max_val = emb_data[0];
        for (size_t i = 0; i < emb_data.size(); i++) {
            sum    += emb_data[i];
            sum_sq += emb_data[i] * emb_data[i];
            min_val = std::min(min_val, emb_data[i]);
            max_val = std::max(max_val, emb_data[i]);
        }
        float mean     = sum / emb_data.size();
        float variance = (sum_sq / emb_data.size()) - (mean * mean);
        LOG_INF("Stats: mean=%.6f, std=%.6f, min=%.6f, max=%.6f, sum=%.6f\n",
                mean, sqrtf(variance), min_val, max_val, sum);
        LOG_INF("=== END MTMD_DEBUG_EMBEDDINGS ===\n\n");
    }

    return true;
}
|
|
4008
|
+
|
|
4009
|
+
int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
|
|
4010
|
+
switch (ctx->model.proj_type) {
|
|
4011
|
+
case PROJECTOR_TYPE_LDP:
|
|
4012
|
+
return ctx->model.mm_model_block_1_block_2_1_b->ne[0];
|
|
4013
|
+
case PROJECTOR_TYPE_LDPV2:
|
|
4014
|
+
return ctx->model.mm_model_peg_0_b->ne[0];
|
|
4015
|
+
case PROJECTOR_TYPE_MLP:
|
|
4016
|
+
case PROJECTOR_TYPE_PIXTRAL:
|
|
4017
|
+
case PROJECTOR_TYPE_LIGHTONOCR:
|
|
4018
|
+
return ctx->model.mm_2_w->ne[1];
|
|
4019
|
+
case PROJECTOR_TYPE_MLP_NORM:
|
|
4020
|
+
return ctx->model.mm_3_b->ne[0];
|
|
4021
|
+
case PROJECTOR_TYPE_MINICPMV:
|
|
4022
|
+
return ctx->model.mm_model_proj->ne[0];
|
|
4023
|
+
case PROJECTOR_TYPE_GLM_EDGE:
|
|
4024
|
+
return ctx->model.mm_model_mlp_3_w->ne[1];
|
|
4025
|
+
case PROJECTOR_TYPE_QWEN2VL:
|
|
4026
|
+
case PROJECTOR_TYPE_QWEN25VL:
|
|
4027
|
+
case PROJECTOR_TYPE_JANUS_PRO:
|
|
4028
|
+
case PROJECTOR_TYPE_YOUTUVL:
|
|
4029
|
+
return ctx->model.mm_1_b->ne[0];
|
|
4030
|
+
case PROJECTOR_TYPE_QWEN3VL:
|
|
4031
|
+
// main path + deepstack paths
|
|
4032
|
+
return ctx->model.mm_1_b->ne[0] * (1 + ctx->model.n_deepstack_layers);
|
|
4033
|
+
case PROJECTOR_TYPE_GEMMA3:
|
|
4034
|
+
case PROJECTOR_TYPE_GEMMA3NV:
|
|
4035
|
+
return ctx->model.mm_input_proj_w->ne[0];
|
|
4036
|
+
case PROJECTOR_TYPE_IDEFICS3:
|
|
4037
|
+
return ctx->model.projection->ne[1];
|
|
4038
|
+
case PROJECTOR_TYPE_ULTRAVOX:
|
|
4039
|
+
case PROJECTOR_TYPE_VOXTRAL:
|
|
4040
|
+
case PROJECTOR_TYPE_MUSIC_FLAMINGO:
|
|
4041
|
+
return ctx->model.mm_2_w->ne[1];
|
|
4042
|
+
case PROJECTOR_TYPE_INTERNVL:
|
|
4043
|
+
case PROJECTOR_TYPE_NEMOTRON_V2_VL:
|
|
4044
|
+
return ctx->model.mm_3_w->ne[1];
|
|
4045
|
+
case PROJECTOR_TYPE_LLAMA4:
|
|
4046
|
+
return ctx->model.mm_model_proj->ne[1];
|
|
4047
|
+
case PROJECTOR_TYPE_QWEN2A:
|
|
4048
|
+
return ctx->model.mm_fc_w->ne[1];
|
|
4049
|
+
case PROJECTOR_TYPE_GLMA:
|
|
4050
|
+
return ctx->model.mm_2_w->ne[1];
|
|
4051
|
+
case PROJECTOR_TYPE_LFM2:
|
|
4052
|
+
case PROJECTOR_TYPE_KIMIVL:
|
|
4053
|
+
case PROJECTOR_TYPE_PADDLEOCR:
|
|
4054
|
+
case PROJECTOR_TYPE_KIMIK25:
|
|
4055
|
+
return ctx->model.mm_2_w->ne[1];
|
|
4056
|
+
case PROJECTOR_TYPE_COGVLM:
|
|
4057
|
+
return ctx->model.mm_4h_to_h_w->ne[1];
|
|
4058
|
+
case PROJECTOR_TYPE_LFM2A:
|
|
4059
|
+
return ctx->model.position_embeddings->ne[0];
|
|
4060
|
+
case PROJECTOR_TYPE_GLM4V:
|
|
4061
|
+
return ctx->model.mm_ffn_down_w->ne[1];
|
|
4062
|
+
default:
|
|
4063
|
+
GGML_ABORT("Unknown projector type");
|
|
4064
|
+
}
|
|
4065
|
+
}
|
|
4066
|
+
|
|
4067
|
+
int clip_is_minicpmv(const struct clip_ctx * ctx) {
|
|
4068
|
+
// TODO: remove this function
|
|
4069
|
+
if (ctx->proj_type() == PROJECTOR_TYPE_MINICPMV) {
|
|
4070
|
+
return ctx->model.hparams.minicpmv_version;
|
|
4071
|
+
}
|
|
4072
|
+
return 0;
|
|
4073
|
+
}
|
|
4074
|
+
|
|
4075
|
+
bool clip_is_glm(const struct clip_ctx * ctx) {
|
|
4076
|
+
// TODO: remove this function
|
|
4077
|
+
return ctx->proj_type() == PROJECTOR_TYPE_GLM_EDGE;
|
|
4078
|
+
}
|
|
4079
|
+
|
|
4080
|
+
bool clip_is_llava(const struct clip_ctx * ctx) {
|
|
4081
|
+
return ctx->model.hparams.has_llava_projector;
|
|
4082
|
+
}
|
|
4083
|
+
|
|
4084
|
+
bool clip_has_vision_encoder(const struct clip_ctx * ctx) {
|
|
4085
|
+
return ctx->model.modality == CLIP_MODALITY_VISION;
|
|
4086
|
+
}
|
|
4087
|
+
|
|
4088
|
+
bool clip_has_audio_encoder(const struct clip_ctx * ctx) {
|
|
4089
|
+
return ctx->model.modality == CLIP_MODALITY_AUDIO;
|
|
4090
|
+
}
|
|
4091
|
+
|
|
4092
|
+
bool clip_has_whisper_encoder(const struct clip_ctx * ctx) {
|
|
4093
|
+
switch (ctx->proj_type()) {
|
|
4094
|
+
case PROJECTOR_TYPE_ULTRAVOX:
|
|
4095
|
+
case PROJECTOR_TYPE_QWEN2A:
|
|
4096
|
+
case PROJECTOR_TYPE_GLMA:
|
|
4097
|
+
case PROJECTOR_TYPE_VOXTRAL:
|
|
4098
|
+
case PROJECTOR_TYPE_MUSIC_FLAMINGO:
|
|
4099
|
+
return true;
|
|
4100
|
+
default:
|
|
4101
|
+
return false;
|
|
4102
|
+
}
|
|
4103
|
+
}
|
|
4104
|
+
|
|
4105
|
+
// Copies a caller-owned float image into a temporary clip_image_f32 and runs
// clip_image_encode on it.
//
//   ctx       - encoder context, forwarded to clip_image_encode
//   n_threads - thread count, forwarded to clip_image_encode
//   img       - source buffer holding h * w * 3 floats (presumably interleaved
//               RGB -- confirm against callers)
//   h, w      - image height and width; stored as ny and nx respectively
//   vec       - output buffer, forwarded to clip_image_encode
//
// Returns false for a null image or non-positive dimensions; otherwise returns
// the result of clip_image_encode instead of unconditionally reporting success.
bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec) {
    if (img == nullptr || h <= 0 || w <= 0) {
        return false;
    }

    // Do the size arithmetic in size_t: h * w * 3 can overflow int for large images.
    const size_t n_values = static_cast<size_t>(h) * static_cast<size_t>(w) * 3;

    clip_image_f32 clip_img;
    clip_img.nx = w;
    clip_img.ny = h;
    clip_img.buf.assign(img, img + n_values);

    // NOTE(review): relies on clip_image_encode returning bool (declared outside this view).
    return clip_image_encode(ctx, n_threads, &clip_img, vec);
}
|
|
4117
|
+
|
|
4118
|
+
//
|
|
4119
|
+
// API used internally with mtmd
|
|
4120
|
+
//
|
|
4121
|
+
|
|
4122
|
+
// Accessor: returns whatever ctx->proj_type() reports for this context.
projector_type clip_get_projector_type(const struct clip_ctx * ctx) {
    const projector_type type = ctx->proj_type();
    return type;
}
|
|
4125
|
+
|
|
4126
|
+
// Appends one audio entry to `batch` and marks the batch as audio.
//
//   batch    - batch to append to; its is_audio flag is set to true
//   n_mel    - stored as the entry's ny
//   n_frames - stored as the entry's nx
//   mel      - source buffer of n_frames * n_mel floats, copied into the entry
//
// The new entry is held by the smart pointer from the moment it is allocated,
// so it is released if copying the samples or growing the batch throws.
void clip_image_f32_batch_add_mel(struct clip_image_f32_batch * batch, int n_mel, int n_frames, float * mel) {
    // Size arithmetic in size_t: n_frames * n_mel (and the byte count derived
    // from it) could overflow int.
    const size_t n_values = static_cast<size_t>(n_frames) * static_cast<size_t>(n_mel);

    clip_image_f32_ptr audio(new clip_image_f32);
    audio->nx = n_frames;
    audio->ny = n_mel;
    audio->buf.assign(mel, mel + n_values);

    batch->entries.push_back(std::move(audio));
    batch->is_audio = true;
}
|
|
4136
|
+
|
|
4137
|
+
// Returns a pointer to the hyper-parameters stored inside the context's model.
// The pointer refers to a member of *ctx, not to a copy.
const clip_hparams * clip_get_hparams(const struct clip_ctx * ctx) {
    const clip_hparams & hparams = ctx->model.hparams;
    return &hparams;
}
|
|
4140
|
+
|
|
4141
|
+
//
|
|
4142
|
+
// API for debugging
|
|
4143
|
+
//
|
|
4144
|
+
// Debug helper: builds an h x w image with h * w * 3 values all equal to
// fill_value and passes it to clip_image_encode with one thread and no output
// buffer. The trailing assertion's message ("expected, always stop here")
// suggests execution is meant to end here -- presumably the encode call, or
// failing that the assertion, terminates the process; confirm before relying
// on it.
void clip_debug_encode(clip_ctx * ctx, int h, int w, float fill_value) {
    const int n_values = h * w * 3;

    clip_image_f32 img;
    img.nx = w;
    img.ny = h;
    img.buf.resize(n_values);
    for (auto & value : img.buf) {
        value = fill_value;
    }

    clip_image_encode(ctx, 1, &img, nullptr);
    GGML_ASSERT(img.buf.empty() && "expected, always stop here");
}
|