cui-llama.rn 1.7.4 → 1.7.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +217 -17
- package/android/src/main/CMakeLists.txt +34 -15
- package/android/src/main/java/com/rnllama/LlamaContext.java +79 -5
- package/android/src/main/java/com/rnllama/RNLlama.java +237 -0
- package/android/src/main/jni.cpp +213 -14
- package/android/src/main/jniLibs/arm64-v8a/librnllama.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod_i8mm.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_i8mm.so +0 -0
- package/android/src/main/jniLibs/x86_64/librnllama.so +0 -0
- package/android/src/main/jniLibs/x86_64/librnllama_x86_64.so +0 -0
- package/android/src/newarch/java/com/rnllama/RNLlamaModule.java +35 -0
- package/android/src/oldarch/java/com/rnllama/RNLlamaModule.java +34 -0
- package/cpp/README.md +1 -1
- package/cpp/chat-parser.cpp +385 -0
- package/cpp/chat-parser.h +120 -0
- package/cpp/chat.cpp +726 -596
- package/cpp/chat.h +71 -6
- package/cpp/common.cpp +56 -38
- package/cpp/common.h +9 -3
- package/cpp/ggml-backend-reg.cpp +5 -0
- package/cpp/ggml-backend.cpp +10 -2
- package/cpp/ggml-common.h +4 -0
- package/cpp/ggml-cpu/amx/amx.cpp +1 -1
- package/cpp/ggml-cpu/amx/mmq.cpp +11 -10
- package/cpp/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
- package/cpp/ggml-cpu/arch/arm/quants.c +4114 -0
- package/cpp/ggml-cpu/arch/arm/repack.cpp +2163 -0
- package/cpp/ggml-cpu/arch/x86/cpu-feats.cpp +327 -0
- package/cpp/ggml-cpu/arch/x86/quants.c +4311 -0
- package/cpp/ggml-cpu/{ggml-cpu-aarch64.cpp → arch/x86/repack.cpp} +79 -3225
- package/cpp/ggml-cpu/arch-fallback.h +184 -0
- package/cpp/ggml-cpu/common.h +4 -3
- package/cpp/ggml-cpu/ggml-cpu-impl.h +21 -16
- package/cpp/ggml-cpu/ggml-cpu.c +123 -104
- package/cpp/ggml-cpu/ggml-cpu.cpp +11 -8
- package/cpp/ggml-cpu/ops.cpp +330 -148
- package/cpp/ggml-cpu/ops.h +1 -0
- package/cpp/ggml-cpu/quants.c +1158 -0
- package/cpp/ggml-cpu/{ggml-cpu-quants.h → quants.h} +26 -0
- package/cpp/ggml-cpu/repack.cpp +1571 -0
- package/cpp/ggml-cpu/repack.h +98 -0
- package/cpp/ggml-cpu/simd-mappings.h +330 -38
- package/cpp/ggml-cpu/{ggml-cpu-traits.cpp → traits.cpp} +1 -1
- package/cpp/ggml-cpu/vec.cpp +87 -18
- package/cpp/ggml-cpu/vec.h +249 -94
- package/cpp/ggml-cpu.h +1 -0
- package/cpp/ggml-impl.h +63 -183
- package/cpp/ggml-llama-sim.metallib +0 -0
- package/cpp/ggml-llama.metallib +0 -0
- package/cpp/ggml-metal.m +152 -45
- package/cpp/ggml-quants.c +0 -2
- package/cpp/ggml.c +61 -21
- package/cpp/ggml.h +22 -3
- package/cpp/gguf.cpp +24 -3
- package/cpp/json-partial.cpp +256 -0
- package/cpp/json-partial.h +38 -0
- package/cpp/json-schema-to-grammar.cpp +5 -47
- package/cpp/json-schema-to-grammar.h +4 -4
- package/cpp/llama-arch.cpp +153 -3
- package/cpp/llama-arch.h +27 -1
- package/cpp/llama-batch.cpp +741 -272
- package/cpp/llama-batch.h +112 -54
- package/cpp/llama-chat.cpp +30 -8
- package/cpp/llama-chat.h +1 -0
- package/cpp/llama-context.cpp +524 -339
- package/cpp/llama-context.h +38 -17
- package/cpp/llama-cparams.cpp +4 -0
- package/cpp/llama-cparams.h +2 -0
- package/cpp/llama-grammar.cpp +12 -2
- package/cpp/llama-graph.cpp +431 -356
- package/cpp/llama-graph.h +126 -58
- package/cpp/llama-hparams.cpp +10 -2
- package/cpp/llama-hparams.h +19 -2
- package/cpp/llama-kv-cache-unified-iswa.cpp +279 -0
- package/cpp/llama-kv-cache-unified-iswa.h +128 -0
- package/cpp/llama-kv-cache-unified.cpp +1841 -0
- package/cpp/llama-kv-cache-unified.h +303 -0
- package/cpp/llama-kv-cells.h +439 -0
- package/cpp/llama-memory-hybrid.cpp +246 -0
- package/cpp/llama-memory-hybrid.h +138 -0
- package/cpp/llama-memory-recurrent.cpp +1112 -0
- package/cpp/llama-memory-recurrent.h +183 -0
- package/cpp/llama-memory.cpp +41 -0
- package/cpp/llama-memory.h +86 -5
- package/cpp/llama-mmap.cpp +1 -1
- package/cpp/llama-model-loader.cpp +42 -17
- package/cpp/llama-model-saver.cpp +1 -0
- package/cpp/llama-model.cpp +1639 -513
- package/cpp/llama-model.h +26 -0
- package/cpp/llama-sampling.cpp +2 -2
- package/cpp/llama-vocab.cpp +65 -28
- package/cpp/llama-vocab.h +1 -0
- package/cpp/llama.cpp +11 -7
- package/cpp/llama.h +150 -42
- package/cpp/minja/chat-template.hpp +1 -1
- package/cpp/minja/minja.hpp +1 -1
- package/cpp/{json.hpp → nlohmann/json.hpp} +3027 -2267
- package/cpp/nlohmann/json_fwd.hpp +187 -0
- package/cpp/regex-partial.cpp +204 -0
- package/cpp/regex-partial.h +56 -0
- package/cpp/rn-llama.cpp +646 -35
- package/cpp/rn-llama.h +32 -1
- package/cpp/rn-tts.h +39 -0
- package/cpp/sampling.cpp +7 -8
- package/cpp/tools/mtmd/clip-impl.h +5 -0
- package/cpp/tools/mtmd/clip.cpp +572 -436
- package/cpp/tools/mtmd/clip.h +14 -4
- package/cpp/tools/mtmd/mtmd-audio.cpp +0 -86
- package/cpp/tools/mtmd/mtmd-audio.h +2 -17
- package/cpp/tools/mtmd/mtmd-helper.cpp +175 -12
- package/cpp/tools/mtmd/mtmd-helper.h +91 -0
- package/cpp/tools/mtmd/mtmd.cpp +368 -248
- package/cpp/tools/mtmd/mtmd.h +6 -70
- package/cpp/unicode.cpp +5 -0
- package/ios/CMakeLists.txt +26 -6
- package/ios/RNLlama.h +1 -1
- package/ios/RNLlama.mm +153 -3
- package/ios/RNLlamaContext.h +9 -1
- package/ios/RNLlamaContext.mm +112 -9
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/chat-parser.h +120 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/chat.h +71 -6
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/common.h +9 -3
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-common.h +4 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu.h +1 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-impl.h +63 -183
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml.h +22 -3
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/json-partial.h +38 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/json-schema-to-grammar.h +4 -4
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-arch.h +27 -1
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-batch.h +112 -54
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-chat.h +1 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-context.h +38 -17
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-cparams.h +2 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-graph.h +126 -58
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-hparams.h +19 -2
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cache-unified-iswa.h +128 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cache-unified.h +303 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cells.h +439 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-memory-hybrid.h +138 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-memory-recurrent.h +183 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-memory.h +86 -5
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-model.h +26 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-vocab.h +1 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama.h +150 -42
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/minja/chat-template.hpp +1 -1
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/minja/minja.hpp +1 -1
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/{json.hpp → nlohmann/json.hpp} +3027 -2267
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/nlohmann/json_fwd.hpp +187 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/regex-partial.h +56 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/rn-llama.h +32 -1
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/rn-tts.h +39 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/ggml-llama.metallib +0 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/rnllama +0 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/chat-parser.h +120 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/chat.h +71 -6
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/common.h +9 -3
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-common.h +4 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu.h +1 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-impl.h +63 -183
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml.h +22 -3
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/json-partial.h +38 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/json-schema-to-grammar.h +4 -4
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-arch.h +27 -1
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-batch.h +112 -54
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-chat.h +1 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-context.h +38 -17
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-cparams.h +2 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-graph.h +126 -58
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-hparams.h +19 -2
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache-unified-iswa.h +128 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache-unified.h +303 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cells.h +439 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory-hybrid.h +138 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory-recurrent.h +183 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory.h +86 -5
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model.h +26 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-vocab.h +1 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama.h +150 -42
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/minja/chat-template.hpp +1 -1
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/minja/minja.hpp +1 -1
- package/ios/rnllama.xcframework/{tvos-arm64/rnllama.framework/Headers → ios-arm64_x86_64-simulator/rnllama.framework/Headers/nlohmann}/json.hpp +3027 -2267
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/nlohmann/json_fwd.hpp +187 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/regex-partial.h +56 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/rn-llama.h +32 -1
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/rn-tts.h +39 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/ggml-llama-sim.metallib +0 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/rnllama +0 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/chat-parser.h +120 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/chat.h +71 -6
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/common.h +9 -3
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-common.h +4 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu.h +1 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-impl.h +63 -183
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml.h +22 -3
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/json-partial.h +38 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/json-schema-to-grammar.h +4 -4
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-arch.h +27 -1
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-batch.h +112 -54
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-chat.h +1 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-context.h +38 -17
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-cparams.h +2 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-graph.h +126 -58
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-hparams.h +19 -2
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cache-unified-iswa.h +128 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cache-unified.h +303 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cells.h +439 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-memory-hybrid.h +138 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-memory-recurrent.h +183 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-memory.h +86 -5
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-model.h +26 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-vocab.h +1 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama.h +150 -42
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/minja/chat-template.hpp +1 -1
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/minja/minja.hpp +1 -1
- package/ios/rnllama.xcframework/{ios-arm64_x86_64-simulator/rnllama.framework/Headers → tvos-arm64/rnllama.framework/Headers/nlohmann}/json.hpp +3027 -2267
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/nlohmann/json_fwd.hpp +187 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/regex-partial.h +56 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/rn-llama.h +32 -1
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/rn-tts.h +39 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/ggml-llama.metallib +0 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/rnllama +0 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/chat-parser.h +120 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/chat.h +71 -6
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/common.h +9 -3
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-common.h +4 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu.h +1 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-impl.h +63 -183
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml.h +22 -3
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/json-partial.h +38 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/json-schema-to-grammar.h +4 -4
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-arch.h +27 -1
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-batch.h +112 -54
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-chat.h +1 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-context.h +38 -17
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-cparams.h +2 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-graph.h +126 -58
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-hparams.h +19 -2
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache-unified-iswa.h +128 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache-unified.h +303 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cells.h +439 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory-hybrid.h +138 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory-recurrent.h +183 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory.h +86 -5
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model.h +26 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-vocab.h +1 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama.h +150 -42
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/minja/chat-template.hpp +1 -1
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/minja/minja.hpp +1 -1
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/nlohmann/json.hpp +25526 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/nlohmann/json_fwd.hpp +187 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/regex-partial.h +56 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/rn-llama.h +32 -1
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/rn-tts.h +39 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/ggml-llama-sim.metallib +0 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/rnllama +0 -0
- package/jest/mock.js +24 -0
- package/package.json +1 -1
- package/src/NativeRNLlama.ts +46 -2
- package/src/index.ts +105 -1
- package/cpp/ggml-cpu/ggml-cpu-aarch64.h +0 -8
- package/cpp/ggml-cpu/ggml-cpu-quants.c +0 -13326
- package/cpp/ggml-cpu/sgemm.cpp +0 -3544
- package/cpp/ggml-cpu/sgemm.h +0 -14
- package/cpp/llama-kv-cache.cpp +0 -2827
- package/cpp/llama-kv-cache.h +0 -515
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cache.h +0 -515
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache.h +0 -515
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cache.h +0 -515
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/json.hpp +0 -24766
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache.h +0 -515
- /package/cpp/ggml-cpu/{ggml-cpu-traits.h → traits.h} +0 -0
- /package/cpp/tools/mtmd/{miniaudio.h → miniaudio/miniaudio.h} +0 -0
- /package/cpp/tools/mtmd/{stb_image.h → stb/stb_image.h} +0 -0
package/cpp/tools/mtmd/clip.cpp
CHANGED
@@ -11,9 +11,6 @@
|
|
11
11
|
#include "ggml-backend.h"
|
12
12
|
#include "gguf.h"
|
13
13
|
|
14
|
-
#define STB_IMAGE_IMPLEMENTATION
|
15
|
-
#include "stb_image.h"
|
16
|
-
|
17
14
|
#include <cassert>
|
18
15
|
#include <cmath>
|
19
16
|
#include <cstdlib>
|
@@ -172,9 +169,6 @@ enum patch_merge_type {
|
|
172
169
|
};
|
173
170
|
|
174
171
|
struct clip_hparams {
|
175
|
-
bool has_vision = false;
|
176
|
-
bool has_audio = false;
|
177
|
-
|
178
172
|
int32_t image_size;
|
179
173
|
int32_t patch_size;
|
180
174
|
int32_t n_embd;
|
@@ -184,9 +178,13 @@ struct clip_hparams {
|
|
184
178
|
int32_t n_layer;
|
185
179
|
int32_t proj_scale_factor = 0; // idefics3
|
186
180
|
|
181
|
+
float image_mean[3];
|
182
|
+
float image_std[3];
|
183
|
+
|
187
184
|
// for models using dynamic image size, we need to have a smaller image size to warmup
|
188
185
|
// otherwise, user will get OOM everytime they load the model
|
189
186
|
int32_t warmup_image_size = 0;
|
187
|
+
int32_t warmup_audio_size = 3000;
|
190
188
|
|
191
189
|
ffn_op_type ffn_op = FFN_GELU;
|
192
190
|
|
@@ -195,7 +193,7 @@ struct clip_hparams {
|
|
195
193
|
float eps = 1e-6;
|
196
194
|
float rope_theta = 0.0;
|
197
195
|
|
198
|
-
std::vector<
|
196
|
+
std::vector<clip_image_size> image_res_candidates; // for llava-uhd style models
|
199
197
|
int32_t image_crop_resolution;
|
200
198
|
std::unordered_set<int32_t> vision_feature_layer;
|
201
199
|
int32_t attn_window_size = 0;
|
@@ -205,6 +203,10 @@ struct clip_hparams {
|
|
205
203
|
// audio
|
206
204
|
int32_t n_mel_bins = 0; // whisper preprocessor
|
207
205
|
int32_t proj_stack_factor = 0; // ultravox
|
206
|
+
|
207
|
+
// legacy
|
208
|
+
bool has_llava_projector = false;
|
209
|
+
int minicpmv_version = 0;
|
208
210
|
};
|
209
211
|
|
210
212
|
struct clip_layer {
|
@@ -242,8 +244,10 @@ struct clip_layer {
|
|
242
244
|
lm_ggml_tensor * ls_2_w = nullptr;
|
243
245
|
};
|
244
246
|
|
245
|
-
struct
|
246
|
-
|
247
|
+
struct clip_model {
|
248
|
+
clip_modality modality = CLIP_MODALITY_VISION;
|
249
|
+
projector_type proj_type = PROJECTOR_TYPE_MLP;
|
250
|
+
clip_hparams hparams;
|
247
251
|
|
248
252
|
// embeddings
|
249
253
|
lm_ggml_tensor * class_embedding = nullptr;
|
@@ -260,7 +264,9 @@ struct clip_vision_model {
|
|
260
264
|
lm_ggml_tensor * post_ln_w;
|
261
265
|
lm_ggml_tensor * post_ln_b;
|
262
266
|
|
263
|
-
lm_ggml_tensor * projection;
|
267
|
+
lm_ggml_tensor * projection; // TODO: rename it to fc (fully connected layer)
|
268
|
+
lm_ggml_tensor * mm_fc_w;
|
269
|
+
lm_ggml_tensor * mm_fc_b;
|
264
270
|
|
265
271
|
// LLaVA projection
|
266
272
|
lm_ggml_tensor * mm_input_norm_w = nullptr;
|
@@ -357,14 +363,7 @@ struct clip_vision_model {
|
|
357
363
|
};
|
358
364
|
|
359
365
|
struct clip_ctx {
|
360
|
-
|
361
|
-
int minicpmv_version = 0;
|
362
|
-
|
363
|
-
struct clip_vision_model vision_model;
|
364
|
-
projector_type proj_type = PROJECTOR_TYPE_MLP;
|
365
|
-
|
366
|
-
float image_mean[3];
|
367
|
-
float image_std[3];
|
366
|
+
clip_model model;
|
368
367
|
|
369
368
|
lm_gguf_context_ptr ctx_gguf;
|
370
369
|
lm_ggml_context_ptr ctx_data;
|
@@ -418,11 +417,16 @@ struct clip_ctx {
|
|
418
417
|
lm_ggml_backend_free(backend_cpu);
|
419
418
|
}
|
420
419
|
}
|
420
|
+
|
421
|
+
// this function is added so that we don't change too much of the existing code
|
422
|
+
projector_type proj_type() const {
|
423
|
+
return model.proj_type;
|
424
|
+
}
|
421
425
|
};
|
422
426
|
|
423
427
|
struct clip_graph {
|
424
428
|
clip_ctx * ctx;
|
425
|
-
const
|
429
|
+
const clip_model & model;
|
426
430
|
const clip_hparams & hparams;
|
427
431
|
|
428
432
|
// we only support single image per batch
|
@@ -445,7 +449,7 @@ struct clip_graph {
|
|
445
449
|
|
446
450
|
clip_graph(clip_ctx * ctx, const clip_image_f32 & img) :
|
447
451
|
ctx(ctx),
|
448
|
-
model(ctx->
|
452
|
+
model(ctx->model),
|
449
453
|
hparams(model.hparams),
|
450
454
|
img(img),
|
451
455
|
patch_size(hparams.patch_size),
|
@@ -477,7 +481,7 @@ struct clip_graph {
|
|
477
481
|
model.position_embeddings,
|
478
482
|
nullptr);
|
479
483
|
|
480
|
-
if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) {
|
484
|
+
if (ctx->proj_type() == PROJECTOR_TYPE_GEMMA3) {
|
481
485
|
const int batch_size = 1;
|
482
486
|
LM_GGML_ASSERT(n_patches_x == n_patches_y);
|
483
487
|
const int patches_per_image = n_patches_x;
|
@@ -500,7 +504,7 @@ struct clip_graph {
|
|
500
504
|
lm_ggml_cont(ctx0, lm_ggml_transpose(ctx0, model.mm_input_proj_w)),
|
501
505
|
cur);
|
502
506
|
|
503
|
-
} else if (ctx->proj_type == PROJECTOR_TYPE_IDEFICS3) {
|
507
|
+
} else if (ctx->proj_type() == PROJECTOR_TYPE_IDEFICS3) {
|
504
508
|
// https://github.com/huggingface/transformers/blob/0a950e0bbe1ed58d5401a6b547af19f15f0c195e/src/transformers/models/idefics3/modeling_idefics3.py#L578
|
505
509
|
|
506
510
|
const int scale_factor = model.hparams.proj_scale_factor;
|
@@ -634,7 +638,7 @@ struct clip_graph {
|
|
634
638
|
const int n_pos = n_patches;
|
635
639
|
const int num_position_ids = n_pos * 4; // m-rope requires 4 dim per position
|
636
640
|
|
637
|
-
norm_type norm_t = ctx->proj_type == PROJECTOR_TYPE_QWEN25VL
|
641
|
+
norm_type norm_t = ctx->proj_type() == PROJECTOR_TYPE_QWEN25VL
|
638
642
|
? NORM_TYPE_RMS // qwen 2.5 vl
|
639
643
|
: NORM_TYPE_NORMAL; // qwen 2 vl
|
640
644
|
|
@@ -850,11 +854,11 @@ struct clip_graph {
|
|
850
854
|
const int d_head = 128;
|
851
855
|
int n_head = n_embd/d_head;
|
852
856
|
int num_query = 96;
|
853
|
-
if (ctx->minicpmv_version == 2) {
|
857
|
+
if (ctx->model.hparams.minicpmv_version == 2) {
|
854
858
|
num_query = 96;
|
855
|
-
} else if (ctx->minicpmv_version == 3) {
|
859
|
+
} else if (ctx->model.hparams.minicpmv_version == 3) {
|
856
860
|
num_query = 64;
|
857
|
-
} else if (ctx->minicpmv_version == 4) {
|
861
|
+
} else if (ctx->model.hparams.minicpmv_version == 4) {
|
858
862
|
num_query = 64;
|
859
863
|
}
|
860
864
|
|
@@ -1071,7 +1075,7 @@ struct clip_graph {
|
|
1071
1075
|
int il_last = hparams.n_layer - 1;
|
1072
1076
|
int deepest_feature_layer = -1;
|
1073
1077
|
|
1074
|
-
if (ctx->proj_type == PROJECTOR_TYPE_MINICPMV || ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE) {
|
1078
|
+
if (ctx->proj_type() == PROJECTOR_TYPE_MINICPMV || ctx->proj_type() == PROJECTOR_TYPE_GLM_EDGE) {
|
1075
1079
|
il_last += 1;
|
1076
1080
|
}
|
1077
1081
|
|
@@ -1205,7 +1209,7 @@ struct clip_graph {
|
|
1205
1209
|
}
|
1206
1210
|
|
1207
1211
|
// llava projector (also used by granite)
|
1208
|
-
if (ctx->has_llava_projector) {
|
1212
|
+
if (ctx->model.hparams.has_llava_projector) {
|
1209
1213
|
embeddings = lm_ggml_reshape_2d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1]);
|
1210
1214
|
|
1211
1215
|
lm_ggml_tensor * patches = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_I32, n_patches);
|
@@ -1219,7 +1223,7 @@ struct clip_graph {
|
|
1219
1223
|
// print_tensor_info(embeddings, "embeddings");
|
1220
1224
|
|
1221
1225
|
// llava projector
|
1222
|
-
if (ctx->proj_type == PROJECTOR_TYPE_MLP) {
|
1226
|
+
if (ctx->proj_type() == PROJECTOR_TYPE_MLP) {
|
1223
1227
|
embeddings = lm_ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
|
1224
1228
|
embeddings = lm_ggml_add(ctx0, embeddings, model.mm_0_b);
|
1225
1229
|
|
@@ -1229,7 +1233,7 @@ struct clip_graph {
|
|
1229
1233
|
embeddings = lm_ggml_add(ctx0, embeddings, model.mm_2_b);
|
1230
1234
|
}
|
1231
1235
|
}
|
1232
|
-
else if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) {
|
1236
|
+
else if (ctx->proj_type() == PROJECTOR_TYPE_MLP_NORM) {
|
1233
1237
|
embeddings = lm_ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
|
1234
1238
|
embeddings = lm_ggml_add(ctx0, embeddings, model.mm_0_b);
|
1235
1239
|
// lm_ggml_tensor_printf(embeddings, "mm_0_w",0,true,false);
|
@@ -1250,7 +1254,7 @@ struct clip_graph {
|
|
1250
1254
|
embeddings = lm_ggml_add(ctx0, lm_ggml_mul(ctx0, embeddings, model.mm_4_w),
|
1251
1255
|
model.mm_4_b);
|
1252
1256
|
}
|
1253
|
-
else if (ctx->proj_type == PROJECTOR_TYPE_LDP) {
|
1257
|
+
else if (ctx->proj_type() == PROJECTOR_TYPE_LDP) {
|
1254
1258
|
// MobileVLM projector
|
1255
1259
|
int n_patch = 24;
|
1256
1260
|
lm_ggml_tensor * mlp_1 = lm_ggml_mul_mat(ctx0, model.mm_model_mlp_1_w, embeddings);
|
@@ -1360,7 +1364,7 @@ struct clip_graph {
|
|
1360
1364
|
}
|
1361
1365
|
embeddings = block_1;
|
1362
1366
|
}
|
1363
|
-
else if (ctx->proj_type == PROJECTOR_TYPE_LDPV2)
|
1367
|
+
else if (ctx->proj_type() == PROJECTOR_TYPE_LDPV2)
|
1364
1368
|
{
|
1365
1369
|
int n_patch = 24;
|
1366
1370
|
lm_ggml_tensor * mlp_0 = lm_ggml_mul_mat(ctx0, model.mm_model_mlp_0_w, embeddings);
|
@@ -1390,7 +1394,7 @@ struct clip_graph {
|
|
1390
1394
|
}
|
1391
1395
|
|
1392
1396
|
// glm projector
|
1393
|
-
else if (ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE) {
|
1397
|
+
else if (ctx->proj_type() == PROJECTOR_TYPE_GLM_EDGE) {
|
1394
1398
|
size_t gridsz = (size_t)sqrt(embeddings->ne[1]);
|
1395
1399
|
embeddings = lm_ggml_cont(ctx0, lm_ggml_permute(ctx0,embeddings,1,0,2,3));
|
1396
1400
|
embeddings = lm_ggml_reshape_3d(ctx0, embeddings, gridsz, gridsz, embeddings->ne[1]);
|
@@ -1477,48 +1481,58 @@ struct clip_graph {
|
|
1477
1481
|
|
1478
1482
|
cb(cur, "after_transformer", -1);
|
1479
1483
|
|
1480
|
-
|
1481
|
-
|
1482
|
-
|
1483
|
-
|
1484
|
-
|
1485
|
-
|
1486
|
-
|
1487
|
-
|
1488
|
-
|
1484
|
+
if (ctx->proj_type() == PROJECTOR_TYPE_ULTRAVOX) {
|
1485
|
+
// StackAudioFrames
|
1486
|
+
// https://huggingface.co/fixie-ai/ultravox-v0_5-llama-3_2-1b/blob/main/ultravox_model.py
|
1487
|
+
{
|
1488
|
+
int64_t stride = n_embd * hparams.proj_stack_factor;
|
1489
|
+
int64_t padded_len = LM_GGML_PAD(lm_ggml_nelements(cur), stride);
|
1490
|
+
int64_t pad = padded_len - lm_ggml_nelements(cur);
|
1491
|
+
if (pad > 0) {
|
1492
|
+
cur = lm_ggml_view_1d(ctx0, cur, lm_ggml_nelements(cur), 0);
|
1493
|
+
cur = lm_ggml_pad(ctx0, cur, pad, 0, 0, 0);
|
1494
|
+
}
|
1495
|
+
cur = lm_ggml_view_2d(ctx0, cur, stride, padded_len / stride,
|
1496
|
+
lm_ggml_row_size(cur->type, stride), 0);
|
1489
1497
|
}
|
1490
|
-
cur = lm_ggml_view_2d(ctx0, cur, stride, padded_len / stride,
|
1491
|
-
lm_ggml_row_size(cur->type, stride), 0);
|
1492
|
-
}
|
1493
1498
|
|
1494
|
-
|
1499
|
+
cb(cur, "after_stacked", -1);
|
1495
1500
|
|
1496
|
-
|
1497
|
-
|
1498
|
-
|
1499
|
-
|
1500
|
-
|
1501
|
+
// UltravoxProjector
|
1502
|
+
{
|
1503
|
+
// pre-norm
|
1504
|
+
cur = lm_ggml_rms_norm(ctx0, cur, 1e-6);
|
1505
|
+
cur = lm_ggml_mul(ctx0, cur, model.mm_norm_pre_w);
|
1501
1506
|
|
1502
|
-
|
1503
|
-
|
1507
|
+
// ffn in
|
1508
|
+
cur = lm_ggml_mul_mat(ctx0, model.mm_1_w, cur);
|
1504
1509
|
|
1505
|
-
|
1506
|
-
|
1507
|
-
|
1508
|
-
|
1509
|
-
|
1510
|
+
// swiglu
|
1511
|
+
{
|
1512
|
+
int64_t split_point = cur->ne[0] / 2;
|
1513
|
+
lm_ggml_tensor * x0 = lm_ggml_cont(ctx0, lm_ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], 0));
|
1514
|
+
lm_ggml_tensor * x1 = lm_ggml_cont(ctx0, lm_ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], split_point * lm_ggml_element_size(cur)));
|
1510
1515
|
|
1511
|
-
|
1512
|
-
|
1513
|
-
|
1516
|
+
// see SwiGLU in ultravox_model.py, the second half passed through is silu, not the first half
|
1517
|
+
x1 = lm_ggml_silu(ctx0, x1);
|
1518
|
+
cur = lm_ggml_mul(ctx0, x0, x1);
|
1519
|
+
}
|
1520
|
+
|
1521
|
+
// mid-norm
|
1522
|
+
cur = lm_ggml_rms_norm(ctx0, cur, 1e-6);
|
1523
|
+
cur = lm_ggml_mul(ctx0, cur, model.mm_norm_mid_w);
|
1524
|
+
|
1525
|
+
// ffn out
|
1526
|
+
cur = lm_ggml_mul_mat(ctx0, model.mm_2_w, cur);
|
1514
1527
|
}
|
1515
1528
|
|
1516
|
-
|
1517
|
-
|
1518
|
-
cur =
|
1529
|
+
} else if (ctx->proj_type() == PROJECTOR_TYPE_QWEN2A) {
|
1530
|
+
// projector
|
1531
|
+
cur = lm_ggml_mul_mat(ctx0, model.mm_fc_w, cur);
|
1532
|
+
cur = lm_ggml_add(ctx0, cur, model.mm_fc_b);
|
1519
1533
|
|
1520
|
-
|
1521
|
-
|
1534
|
+
} else {
|
1535
|
+
LM_GGML_ABORT("%s: unknown projector type", __func__);
|
1522
1536
|
}
|
1523
1537
|
|
1524
1538
|
cb(cur, "projected", -1);
|
@@ -1661,6 +1675,17 @@ private:
|
|
1661
1675
|
inpL = cur;
|
1662
1676
|
}
|
1663
1677
|
|
1678
|
+
// TODO @ngxson : find a way to move this outside
|
1679
|
+
if (ctx->proj_type() == PROJECTOR_TYPE_QWEN2A) {
|
1680
|
+
lm_ggml_tensor * cur = inpL;
|
1681
|
+
cur = lm_ggml_transpose(ctx0, cur);
|
1682
|
+
cur = lm_ggml_cont(ctx0, cur);
|
1683
|
+
cur = lm_ggml_pool_1d(ctx0, cur, LM_GGML_OP_POOL_AVG, 2, 2, 0);
|
1684
|
+
cur = lm_ggml_transpose(ctx0, cur);
|
1685
|
+
cur = lm_ggml_cont(ctx0, cur);
|
1686
|
+
inpL = cur;
|
1687
|
+
}
|
1688
|
+
|
1664
1689
|
// post-layernorm
|
1665
1690
|
if (model.post_ln_w) {
|
1666
1691
|
inpL = build_norm(inpL, model.post_ln_w, model.post_ln_b, norm_t, eps, -1);
|
@@ -1930,7 +1955,7 @@ static lm_ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_
|
|
1930
1955
|
|
1931
1956
|
lm_ggml_cgraph * res;
|
1932
1957
|
|
1933
|
-
switch (ctx->proj_type) {
|
1958
|
+
switch (ctx->proj_type()) {
|
1934
1959
|
case PROJECTOR_TYPE_GEMMA3:
|
1935
1960
|
case PROJECTOR_TYPE_IDEFICS3:
|
1936
1961
|
{
|
@@ -1958,6 +1983,7 @@ static lm_ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_
|
|
1958
1983
|
res = graph.build_llama4();
|
1959
1984
|
} break;
|
1960
1985
|
case PROJECTOR_TYPE_ULTRAVOX:
|
1986
|
+
case PROJECTOR_TYPE_QWEN2A:
|
1961
1987
|
{
|
1962
1988
|
res = graph.build_whisper_enc();
|
1963
1989
|
} break;
|
@@ -1973,13 +1999,15 @@ struct clip_model_loader {
|
|
1973
1999
|
lm_ggml_context_ptr ctx_meta;
|
1974
2000
|
lm_gguf_context_ptr ctx_gguf;
|
1975
2001
|
|
1976
|
-
clip_ctx & ctx_clip;
|
1977
2002
|
std::string fname;
|
1978
2003
|
|
1979
2004
|
size_t model_size = 0; // in bytes
|
1980
2005
|
|
1981
|
-
|
1982
|
-
|
2006
|
+
bool has_vision = false;
|
2007
|
+
bool has_audio = false;
|
2008
|
+
|
2009
|
+
// TODO @ngxson : we should not pass clip_ctx here, it should be clip_model
|
2010
|
+
clip_model_loader(const char * fname) : fname(fname) {
|
1983
2011
|
struct lm_ggml_context * meta = nullptr;
|
1984
2012
|
|
1985
2013
|
struct lm_gguf_init_params params = {
|
@@ -2011,6 +2039,19 @@ struct clip_model_loader {
|
|
2011
2039
|
LOG_INF("\n");
|
2012
2040
|
}
|
2013
2041
|
|
2042
|
+
// modalities
|
2043
|
+
{
|
2044
|
+
get_bool(KEY_HAS_VISION_ENC, has_vision, false);
|
2045
|
+
get_bool(KEY_HAS_AUDIO_ENC, has_audio, false);
|
2046
|
+
|
2047
|
+
if (has_vision) {
|
2048
|
+
LOG_INF("%s: has vision encoder\n", __func__);
|
2049
|
+
}
|
2050
|
+
if (has_audio) {
|
2051
|
+
LOG_INF("%s: has audio encoder\n", __func__);
|
2052
|
+
}
|
2053
|
+
}
|
2054
|
+
|
2014
2055
|
// tensors
|
2015
2056
|
{
|
2016
2057
|
for (int i = 0; i < n_tensors; ++i) {
|
@@ -2026,28 +2067,44 @@ struct clip_model_loader {
|
|
2026
2067
|
}
|
2027
2068
|
}
|
2028
2069
|
|
2029
|
-
void load_hparams() {
|
2030
|
-
auto & hparams =
|
2070
|
+
void load_hparams(clip_model & model, clip_modality modality) {
|
2071
|
+
auto & hparams = model.hparams;
|
2031
2072
|
std::string log_ffn_op; // for logging
|
2032
2073
|
|
2074
|
+
// sanity check
|
2075
|
+
if (modality == CLIP_MODALITY_VISION) {
|
2076
|
+
LM_GGML_ASSERT(has_vision);
|
2077
|
+
} else if (modality == CLIP_MODALITY_AUDIO) {
|
2078
|
+
LM_GGML_ASSERT(has_audio);
|
2079
|
+
}
|
2080
|
+
model.modality = modality;
|
2081
|
+
|
2082
|
+
|
2033
2083
|
// projector type
|
2034
2084
|
std::string proj_type;
|
2035
2085
|
{
|
2036
2086
|
get_string(KEY_PROJ_TYPE, proj_type, false);
|
2037
2087
|
if (!proj_type.empty()) {
|
2038
|
-
|
2088
|
+
model.proj_type = clip_projector_type_from_string(proj_type);
|
2039
2089
|
}
|
2040
|
-
if (
|
2090
|
+
if (model.proj_type == PROJECTOR_TYPE_UNKNOWN) {
|
2041
2091
|
throw std::runtime_error(string_format("%s: unknown projector type: %s\n", __func__, proj_type.c_str()));
|
2042
2092
|
}
|
2093
|
+
|
2094
|
+
// correct arch for multimodal models
|
2095
|
+
if (model.proj_type == PROJECTOR_TYPE_QWEN25O) {
|
2096
|
+
model.proj_type = modality == CLIP_MODALITY_VISION
|
2097
|
+
? PROJECTOR_TYPE_QWEN25VL
|
2098
|
+
: PROJECTOR_TYPE_QWEN2A;
|
2099
|
+
}
|
2043
2100
|
}
|
2044
2101
|
|
2102
|
+
const bool is_vision = model.modality == CLIP_MODALITY_VISION;
|
2103
|
+
const bool is_audio = model.modality == CLIP_MODALITY_AUDIO;
|
2104
|
+
|
2045
2105
|
// other hparams
|
2046
2106
|
{
|
2047
|
-
|
2048
|
-
get_bool(KEY_HAS_VISION_ENC, hparams.has_vision, false);
|
2049
|
-
|
2050
|
-
const char * prefix = hparams.has_vision ? "vision" : "audio";
|
2107
|
+
const char * prefix = is_vision ? "vision" : "audio";
|
2051
2108
|
get_u32(string_format(KEY_N_EMBD, prefix), hparams.n_embd);
|
2052
2109
|
get_u32(string_format(KEY_N_HEAD, prefix), hparams.n_head);
|
2053
2110
|
get_u32(string_format(KEY_N_FF, prefix), hparams.n_ff);
|
@@ -2055,27 +2112,40 @@ struct clip_model_loader {
|
|
2055
2112
|
get_u32(string_format(KEY_PROJ_DIM, prefix), hparams.projection_dim);
|
2056
2113
|
get_f32(string_format(KEY_LAYER_NORM_EPS, prefix), hparams.eps);
|
2057
2114
|
|
2058
|
-
if (
|
2115
|
+
if (is_vision) {
|
2059
2116
|
get_u32(KEY_IMAGE_SIZE, hparams.image_size);
|
2060
2117
|
get_u32(KEY_PATCH_SIZE, hparams.patch_size);
|
2061
|
-
get_u32(KEY_IMAGE_CROP_RESOLUTION,
|
2062
|
-
|
2063
|
-
get_i32(KEY_MINICPMV_VERSION, ctx_clip.minicpmv_version, false); // legacy
|
2118
|
+
get_u32(KEY_IMAGE_CROP_RESOLUTION, hparams.image_crop_resolution, false);
|
2119
|
+
get_i32(KEY_MINICPMV_VERSION, hparams.minicpmv_version, false); // legacy
|
2064
2120
|
|
2065
|
-
} else if (
|
2121
|
+
} else if (is_audio) {
|
2066
2122
|
get_u32(KEY_A_NUM_MEL_BINS, hparams.n_mel_bins);
|
2067
2123
|
|
2068
2124
|
} else {
|
2069
|
-
|
2125
|
+
LM_GGML_ASSERT(false && "unknown modality");
|
2126
|
+
}
|
2127
|
+
|
2128
|
+
// for pinpoints, we need to convert it into a list of resolution candidates
|
2129
|
+
{
|
2130
|
+
std::vector<int> pinpoints;
|
2131
|
+
get_arr_int(KEY_IMAGE_GRID_PINPOINTS, pinpoints, false);
|
2132
|
+
if (!pinpoints.empty()) {
|
2133
|
+
for (size_t i = 0; i < pinpoints.size(); i += 2) {
|
2134
|
+
hparams.image_res_candidates.push_back({
|
2135
|
+
pinpoints[i],
|
2136
|
+
pinpoints[i+1],
|
2137
|
+
});
|
2138
|
+
}
|
2139
|
+
}
|
2070
2140
|
}
|
2071
2141
|
|
2072
2142
|
// default warmup value
|
2073
2143
|
hparams.warmup_image_size = hparams.image_size;
|
2074
2144
|
|
2075
|
-
|
2076
|
-
|
2077
|
-
|
2078
|
-
|
2145
|
+
hparams.has_llava_projector = model.proj_type == PROJECTOR_TYPE_MLP
|
2146
|
+
|| model.proj_type == PROJECTOR_TYPE_MLP_NORM
|
2147
|
+
|| model.proj_type == PROJECTOR_TYPE_LDP
|
2148
|
+
|| model.proj_type == PROJECTOR_TYPE_LDPV2;
|
2079
2149
|
|
2080
2150
|
{
|
2081
2151
|
bool use_gelu = false;
|
@@ -2105,7 +2175,7 @@ struct clip_model_loader {
|
|
2105
2175
|
}
|
2106
2176
|
}
|
2107
2177
|
|
2108
|
-
if (
|
2178
|
+
if (is_vision) {
|
2109
2179
|
int idx_mean = lm_gguf_find_key(ctx_gguf.get(), KEY_IMAGE_MEAN);
|
2110
2180
|
int idx_std = lm_gguf_find_key(ctx_gguf.get(), KEY_IMAGE_STD);
|
2111
2181
|
LM_GGML_ASSERT(idx_mean >= 0 && "image_mean not found");
|
@@ -2113,8 +2183,8 @@ struct clip_model_loader {
|
|
2113
2183
|
const float * mean_data = (const float *) lm_gguf_get_arr_data(ctx_gguf.get(), idx_mean);
|
2114
2184
|
const float * std_data = (const float *) lm_gguf_get_arr_data(ctx_gguf.get(), idx_std);
|
2115
2185
|
for (int i = 0; i < 3; ++i) {
|
2116
|
-
|
2117
|
-
|
2186
|
+
hparams.image_mean[i] = mean_data[i];
|
2187
|
+
hparams.image_std[i] = std_data[i];
|
2118
2188
|
}
|
2119
2189
|
}
|
2120
2190
|
|
@@ -2131,11 +2201,11 @@ struct clip_model_loader {
|
|
2131
2201
|
}
|
2132
2202
|
|
2133
2203
|
// model-specific params
|
2134
|
-
switch (
|
2204
|
+
switch (model.proj_type) {
|
2135
2205
|
case PROJECTOR_TYPE_MINICPMV:
|
2136
2206
|
{
|
2137
|
-
if (
|
2138
|
-
|
2207
|
+
if (hparams.minicpmv_version == 0) {
|
2208
|
+
hparams.minicpmv_version = 2; // default to 2 if not set
|
2139
2209
|
}
|
2140
2210
|
} break;
|
2141
2211
|
case PROJECTOR_TYPE_IDEFICS3:
|
@@ -2147,6 +2217,9 @@ struct clip_model_loader {
|
|
2147
2217
|
{
|
2148
2218
|
hparams.rope_theta = 10000.0f;
|
2149
2219
|
hparams.warmup_image_size = hparams.patch_size * 8;
|
2220
|
+
// Mistral Small 2506 needs 1024x1024 image size cap to prevent OOM
|
2221
|
+
// ref: https://github.com/ggml-org/llama.cpp/issues/14310
|
2222
|
+
hparams.image_size = 1024;
|
2150
2223
|
get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.spatial_merge_size, false);
|
2151
2224
|
} break;
|
2152
2225
|
case PROJECTOR_TYPE_GEMMA3:
|
@@ -2180,20 +2253,13 @@ struct clip_model_loader {
|
|
2180
2253
|
{
|
2181
2254
|
hparams.rope_theta = 10000.0f;
|
2182
2255
|
get_u32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor);
|
2183
|
-
|
2184
|
-
// borrowed from llava-1.6
|
2185
|
-
const int isize = hparams.image_size;
|
2186
|
-
hparams.image_grid_pinpoints = {
|
2187
|
-
isize, isize*2, // 336, 672
|
2188
|
-
isize*2, isize, // 672, 336
|
2189
|
-
isize*2, isize*2, // 672, 672
|
2190
|
-
isize*3, isize, // 1008, 336
|
2191
|
-
isize, isize*3, // 336, 1008
|
2192
|
-
};
|
2256
|
+
set_llava_uhd_res_candidates(model, 3);
|
2193
2257
|
} break;
|
2194
2258
|
case PROJECTOR_TYPE_ULTRAVOX:
|
2259
|
+
case PROJECTOR_TYPE_QWEN2A:
|
2195
2260
|
{
|
2196
|
-
|
2261
|
+
bool require_stack = model.proj_type == PROJECTOR_TYPE_ULTRAVOX;
|
2262
|
+
get_u32(KEY_A_PROJ_STACK_FACTOR, hparams.proj_stack_factor, require_stack);
|
2197
2263
|
if (hparams.n_mel_bins != 128) {
|
2198
2264
|
throw std::runtime_error(string_format("%s: only 128 mel bins are supported for ultravox\n", __func__));
|
2199
2265
|
}
|
@@ -2205,23 +2271,22 @@ struct clip_model_loader {
|
|
2205
2271
|
}
|
2206
2272
|
|
2207
2273
|
LOG_INF("%s: projector: %s\n", __func__, proj_type.c_str());
|
2208
|
-
LOG_INF("%s: has_vision_encoder: %d\n", __func__, hparams.has_vision);
|
2209
|
-
LOG_INF("%s: has_audio_encoder: %d\n", __func__, hparams.has_audio);
|
2210
2274
|
LOG_INF("%s: n_embd: %d\n", __func__, hparams.n_embd);
|
2211
2275
|
LOG_INF("%s: n_head: %d\n", __func__, hparams.n_head);
|
2212
2276
|
LOG_INF("%s: n_ff: %d\n", __func__, hparams.n_ff);
|
2213
2277
|
LOG_INF("%s: n_layer: %d\n", __func__, hparams.n_layer);
|
2214
2278
|
LOG_INF("%s: ffn_op: %s\n", __func__, log_ffn_op.c_str());
|
2215
2279
|
LOG_INF("%s: projection_dim: %d\n", __func__, hparams.projection_dim);
|
2216
|
-
|
2217
|
-
|
2280
|
+
if (is_vision) {
|
2281
|
+
LOG_INF("\n--- vision hparams ---\n");
|
2218
2282
|
LOG_INF("%s: image_size: %d\n", __func__, hparams.image_size);
|
2219
2283
|
LOG_INF("%s: patch_size: %d\n", __func__, hparams.patch_size);
|
2220
|
-
LOG_INF("%s: has_llava_proj: %d\n", __func__,
|
2221
|
-
LOG_INF("%s: minicpmv_version: %d\n", __func__,
|
2284
|
+
LOG_INF("%s: has_llava_proj: %d\n", __func__, hparams.has_llava_projector);
|
2285
|
+
LOG_INF("%s: minicpmv_version: %d\n", __func__, hparams.minicpmv_version);
|
2222
2286
|
LOG_INF("%s: proj_scale_factor: %d\n", __func__, hparams.proj_scale_factor);
|
2223
2287
|
LOG_INF("%s: n_wa_pattern: %d\n", __func__, hparams.n_wa_pattern);
|
2224
|
-
} else if (
|
2288
|
+
} else if (is_audio) {
|
2289
|
+
LOG_INF("\n--- audio hparams ---\n");
|
2225
2290
|
LOG_INF("%s: n_mel_bins: %d\n", __func__, hparams.n_mel_bins);
|
2226
2291
|
LOG_INF("%s: proj_stack_factor: %d\n", __func__, hparams.proj_stack_factor);
|
2227
2292
|
}
|
@@ -2231,13 +2296,14 @@ struct clip_model_loader {
|
|
2231
2296
|
}
|
2232
2297
|
}
|
2233
2298
|
|
2234
|
-
void load_tensors() {
|
2235
|
-
auto &
|
2299
|
+
void load_tensors(clip_ctx & ctx_clip) {
|
2300
|
+
auto & model = ctx_clip.model;
|
2301
|
+
auto & hparams = model.hparams;
|
2236
2302
|
std::map<std::string, size_t> tensor_offset;
|
2237
2303
|
std::vector<lm_ggml_tensor *> tensors_to_load;
|
2238
2304
|
|
2239
2305
|
// TODO @ngxson : support both audio and video in the future
|
2240
|
-
const char * prefix =
|
2306
|
+
const char * prefix = model.modality == CLIP_MODALITY_AUDIO ? "a" : "v";
|
2241
2307
|
|
2242
2308
|
// get offsets
|
2243
2309
|
for (int64_t i = 0; i < lm_gguf_get_n_tensors(ctx_gguf.get()); ++i) {
|
@@ -2272,26 +2338,24 @@ struct clip_model_loader {
|
|
2272
2338
|
return cur;
|
2273
2339
|
};
|
2274
2340
|
|
2275
|
-
|
2341
|
+
model.class_embedding = get_tensor(TN_CLASS_EMBD, false);
|
2276
2342
|
|
2277
|
-
|
2343
|
+
model.pre_ln_w = get_tensor(string_format(TN_LN_PRE, prefix, "weight"), false);
|
2344
|
+
model.pre_ln_b = get_tensor(string_format(TN_LN_PRE, prefix, "bias"), false);
|
2278
2345
|
|
2279
|
-
|
2280
|
-
|
2346
|
+
model.post_ln_w = get_tensor(string_format(TN_LN_POST, prefix, "weight"), false);
|
2347
|
+
model.post_ln_b = get_tensor(string_format(TN_LN_POST, prefix, "bias"), false);
|
2281
2348
|
|
2282
|
-
|
2283
|
-
|
2349
|
+
model.patch_bias = get_tensor(TN_PATCH_BIAS, false);
|
2350
|
+
model.patch_embeddings_0 = get_tensor(TN_PATCH_EMBD, false);
|
2351
|
+
model.patch_embeddings_1 = get_tensor(TN_PATCH_EMBD_1, false);
|
2284
2352
|
|
2285
|
-
|
2286
|
-
vision_model.patch_embeddings_0 = get_tensor(TN_PATCH_EMBD, false);
|
2287
|
-
vision_model.patch_embeddings_1 = get_tensor(TN_PATCH_EMBD_1, false);
|
2288
|
-
|
2289
|
-
vision_model.position_embeddings = get_tensor(string_format(TN_POS_EMBD, prefix), false);
|
2353
|
+
model.position_embeddings = get_tensor(string_format(TN_POS_EMBD, prefix), false);
|
2290
2354
|
|
2291
2355
|
// layers
|
2292
|
-
|
2356
|
+
model.layers.resize(hparams.n_layer);
|
2293
2357
|
for (int il = 0; il < hparams.n_layer; ++il) {
|
2294
|
-
auto & layer =
|
2358
|
+
auto & layer = model.layers[il];
|
2295
2359
|
layer.k_w = get_tensor(string_format(TN_ATTN_K, prefix, il, "weight"));
|
2296
2360
|
layer.q_w = get_tensor(string_format(TN_ATTN_Q, prefix, il, "weight"));
|
2297
2361
|
layer.v_w = get_tensor(string_format(TN_ATTN_V, prefix, il, "weight"));
|
@@ -2332,157 +2396,166 @@ struct clip_model_loader {
|
|
2332
2396
|
}
|
2333
2397
|
}
|
2334
2398
|
|
2335
|
-
switch (
|
2399
|
+
switch (model.proj_type) {
|
2336
2400
|
case PROJECTOR_TYPE_MLP:
|
2337
2401
|
case PROJECTOR_TYPE_MLP_NORM:
|
2338
2402
|
{
|
2339
2403
|
// LLaVA projection
|
2340
|
-
|
2341
|
-
|
2404
|
+
model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight"), false);
|
2405
|
+
model.mm_0_b = get_tensor(string_format(TN_LLAVA_PROJ, 0, "bias"), false);
|
2342
2406
|
// Yi-type llava
|
2343
|
-
|
2344
|
-
|
2407
|
+
model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight"), false);
|
2408
|
+
model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"), false);
|
2345
2409
|
// missing in Yi-type llava
|
2346
|
-
|
2347
|
-
|
2410
|
+
model.mm_2_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"), false);
|
2411
|
+
model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"), false);
|
2348
2412
|
// Yi-type llava
|
2349
|
-
|
2350
|
-
|
2351
|
-
|
2352
|
-
|
2353
|
-
if (
|
2413
|
+
model.mm_3_w = get_tensor(string_format(TN_LLAVA_PROJ, 3, "weight"), false);
|
2414
|
+
model.mm_3_b = get_tensor(string_format(TN_LLAVA_PROJ, 3, "bias"), false);
|
2415
|
+
model.mm_4_w = get_tensor(string_format(TN_LLAVA_PROJ, 4, "weight"), false);
|
2416
|
+
model.mm_4_b = get_tensor(string_format(TN_LLAVA_PROJ, 4, "bias"), false);
|
2417
|
+
if (model.mm_3_w) {
|
2354
2418
|
// TODO: this is a hack to support Yi-type llava
|
2355
|
-
|
2419
|
+
model.proj_type = PROJECTOR_TYPE_MLP_NORM;
|
2356
2420
|
}
|
2357
|
-
|
2421
|
+
model.image_newline = get_tensor(TN_IMAGE_NEWLINE, false);
|
2358
2422
|
} break;
|
2359
2423
|
case PROJECTOR_TYPE_LDP:
|
2360
2424
|
{
|
2361
2425
|
// MobileVLM projection
|
2362
|
-
|
2363
|
-
|
2364
|
-
|
2365
|
-
|
2366
|
-
|
2367
|
-
|
2368
|
-
|
2369
|
-
|
2370
|
-
|
2371
|
-
|
2372
|
-
|
2373
|
-
|
2374
|
-
|
2375
|
-
|
2376
|
-
|
2377
|
-
|
2378
|
-
|
2379
|
-
|
2380
|
-
|
2381
|
-
|
2382
|
-
|
2383
|
-
|
2384
|
-
|
2385
|
-
|
2426
|
+
model.mm_model_mlp_1_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 1, "weight"));
|
2427
|
+
model.mm_model_mlp_1_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 1, "bias"));
|
2428
|
+
model.mm_model_mlp_3_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 3, "weight"));
|
2429
|
+
model.mm_model_mlp_3_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 3, "bias"));
|
2430
|
+
model.mm_model_block_1_block_0_0_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 0, "0.weight"));
|
2431
|
+
model.mm_model_block_1_block_0_1_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 0, "1.weight"));
|
2432
|
+
model.mm_model_block_1_block_0_1_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 0, "1.bias"));
|
2433
|
+
model.mm_model_block_1_block_1_fc1_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc1.weight"));
|
2434
|
+
model.mm_model_block_1_block_1_fc1_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc1.bias"));
|
2435
|
+
model.mm_model_block_1_block_1_fc2_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc2.weight"));
|
2436
|
+
model.mm_model_block_1_block_1_fc2_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc2.bias"));
|
2437
|
+
model.mm_model_block_1_block_2_0_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 2, "0.weight"));
|
2438
|
+
model.mm_model_block_1_block_2_1_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 2, "1.weight"));
|
2439
|
+
model.mm_model_block_1_block_2_1_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 2, "1.bias"));
|
2440
|
+
model.mm_model_block_2_block_0_0_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 0, "0.weight"));
|
2441
|
+
model.mm_model_block_2_block_0_1_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 0, "1.weight"));
|
2442
|
+
model.mm_model_block_2_block_0_1_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 0, "1.bias"));
|
2443
|
+
model.mm_model_block_2_block_1_fc1_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc1.weight"));
|
2444
|
+
model.mm_model_block_2_block_1_fc1_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc1.bias"));
|
2445
|
+
model.mm_model_block_2_block_1_fc2_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc2.weight"));
|
2446
|
+
model.mm_model_block_2_block_1_fc2_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc2.bias"));
|
2447
|
+
model.mm_model_block_2_block_2_0_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 2, "0.weight"));
|
2448
|
+
model.mm_model_block_2_block_2_1_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.weight"));
|
2449
|
+
model.mm_model_block_2_block_2_1_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.bias"));
|
2386
2450
|
} break;
|
2387
2451
|
case PROJECTOR_TYPE_LDPV2:
|
2388
2452
|
{
|
2389
2453
|
// MobilVLM_V2 projection
|
2390
|
-
|
2391
|
-
|
2392
|
-
|
2393
|
-
|
2394
|
-
|
2395
|
-
|
2454
|
+
model.mm_model_mlp_0_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 0, "weight"));
|
2455
|
+
model.mm_model_mlp_0_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 0, "bias"));
|
2456
|
+
model.mm_model_mlp_2_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 2, "weight"));
|
2457
|
+
model.mm_model_mlp_2_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 2, "bias"));
|
2458
|
+
model.mm_model_peg_0_w = get_tensor(string_format(TN_MVLM_PROJ_PEG, 0, "weight"));
|
2459
|
+
model.mm_model_peg_0_b = get_tensor(string_format(TN_MVLM_PROJ_PEG, 0, "bias"));
|
2396
2460
|
} break;
|
2397
2461
|
case PROJECTOR_TYPE_MINICPMV:
|
2398
2462
|
{
|
2399
|
-
//
|
2400
|
-
|
2401
|
-
|
2402
|
-
|
2403
|
-
|
2404
|
-
|
2405
|
-
|
2406
|
-
|
2407
|
-
|
2408
|
-
|
2409
|
-
|
2410
|
-
|
2411
|
-
|
2412
|
-
|
2413
|
-
|
2414
|
-
|
2415
|
-
|
2416
|
-
|
2417
|
-
|
2463
|
+
// model.mm_model_pos_embed = get_tensor(new_clip->ctx_data, TN_MINICPMV_POS_EMBD);
|
2464
|
+
model.mm_model_pos_embed_k = get_tensor(TN_MINICPMV_POS_EMBD_K);
|
2465
|
+
model.mm_model_query = get_tensor(TN_MINICPMV_QUERY);
|
2466
|
+
model.mm_model_proj = get_tensor(TN_MINICPMV_PROJ);
|
2467
|
+
model.mm_model_kv_proj = get_tensor(TN_MINICPMV_KV_PROJ);
|
2468
|
+
model.mm_model_attn_q_w = get_tensor(string_format(TN_MINICPMV_ATTN, "q", "weight"));
|
2469
|
+
model.mm_model_attn_k_w = get_tensor(string_format(TN_MINICPMV_ATTN, "k", "weight"));
|
2470
|
+
model.mm_model_attn_v_w = get_tensor(string_format(TN_MINICPMV_ATTN, "v", "weight"));
|
2471
|
+
model.mm_model_attn_q_b = get_tensor(string_format(TN_MINICPMV_ATTN, "q", "bias"));
|
2472
|
+
model.mm_model_attn_k_b = get_tensor(string_format(TN_MINICPMV_ATTN, "k", "bias"));
|
2473
|
+
model.mm_model_attn_v_b = get_tensor(string_format(TN_MINICPMV_ATTN, "v", "bias"));
|
2474
|
+
model.mm_model_attn_o_w = get_tensor(string_format(TN_MINICPMV_ATTN, "out", "weight"));
|
2475
|
+
+                model.mm_model_attn_o_b = get_tensor(string_format(TN_MINICPMV_ATTN, "out", "bias"));
+                model.mm_model_ln_q_w = get_tensor(string_format(TN_MINICPMV_LN, "q", "weight"));
+                model.mm_model_ln_q_b = get_tensor(string_format(TN_MINICPMV_LN, "q", "bias"));
+                model.mm_model_ln_kv_w = get_tensor(string_format(TN_MINICPMV_LN, "kv", "weight"));
+                model.mm_model_ln_kv_b = get_tensor(string_format(TN_MINICPMV_LN, "kv", "bias"));
+                model.mm_model_ln_post_w = get_tensor(string_format(TN_MINICPMV_LN, "post", "weight"));
+                model.mm_model_ln_post_b = get_tensor(string_format(TN_MINICPMV_LN, "post", "bias"));
             } break;
         case PROJECTOR_TYPE_GLM_EDGE:
             {
-
-
-
-
-
-
-
-
-
-
+                model.mm_model_adapter_conv_w = get_tensor(string_format(TN_GLM_ADAPER_CONV, "weight"));
+                model.mm_model_adapter_conv_b = get_tensor(string_format(TN_GLM_ADAPER_CONV, "bias"));
+                model.mm_model_mlp_0_w = get_tensor(string_format(TN_GLM_ADAPTER_LINEAR, "weight"));
+                model.mm_model_ln_q_w = get_tensor(string_format(TN_GLM_ADAPTER_NORM_1, "weight"));
+                model.mm_model_ln_q_b = get_tensor(string_format(TN_GLM_ADAPTER_NORM_1, "bias"));
+                model.mm_model_mlp_1_w = get_tensor(string_format(TN_GLM_ADAPTER_D_H_2_4H, "weight"));
+                model.mm_model_mlp_2_w = get_tensor(string_format(TN_GLM_ADAPTER_GATE, "weight"));
+                model.mm_model_mlp_3_w = get_tensor(string_format(TN_GLM_ADAPTER_D_4H_2_H, "weight"));
+                model.mm_glm_tok_boi = get_tensor(string_format(TN_TOK_GLM_BOI, "weight"));
+                model.mm_glm_tok_eoi = get_tensor(string_format(TN_TOK_GLM_EOI, "weight"));
             } break;
         case PROJECTOR_TYPE_QWEN2VL:
         case PROJECTOR_TYPE_QWEN25VL:
             {
-
-
-
-
+                model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight"));
+                model.mm_0_b = get_tensor(string_format(TN_LLAVA_PROJ, 0, "bias"));
+                model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
+                model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
             } break;
         case PROJECTOR_TYPE_GEMMA3:
             {
-
-
+                model.mm_input_proj_w = get_tensor(TN_MM_INP_PROJ);
+                model.mm_soft_emb_norm_w = get_tensor(TN_MM_SOFT_EMB_N);
             } break;
         case PROJECTOR_TYPE_IDEFICS3:
             {
-
+                model.projection = get_tensor(TN_MM_PROJECTOR);
             } break;
         case PROJECTOR_TYPE_PIXTRAL:
             {
-
-
-
-
+                model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight"));
+                model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"), false);
+                model.mm_2_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
+                model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"), false);
                 // [IMG_BREAK] token embedding
-
+                model.token_embd_img_break = get_tensor(TN_TOK_IMG_BREAK);
                 // for mistral small 3.1
-
-
+                model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM, false);
+                model.mm_patch_merger_w = get_tensor(TN_MM_PATCH_MERGER, false);
             } break;
         case PROJECTOR_TYPE_ULTRAVOX:
             {
-
-
-
-
-
-
-
-
+                model.conv1d_1_w = get_tensor(string_format(TN_CONV1D, 1, "weight"));
+                model.conv1d_1_b = get_tensor(string_format(TN_CONV1D, 1, "bias"));
+                model.conv1d_2_w = get_tensor(string_format(TN_CONV1D, 2, "weight"));
+                model.conv1d_2_b = get_tensor(string_format(TN_CONV1D, 2, "bias"));
+                model.mm_1_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "weight"));
+                model.mm_2_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "weight"));
+                model.mm_norm_pre_w = get_tensor(string_format(TN_MM_NORM_PRE, "weight"));
+                model.mm_norm_mid_w = get_tensor(string_format(TN_MM_NORM_MID, "weight"));
+            } break;
+        case PROJECTOR_TYPE_QWEN2A:
+            {
+                model.conv1d_1_w = get_tensor(string_format(TN_CONV1D, 1, "weight"));
+                model.conv1d_1_b = get_tensor(string_format(TN_CONV1D, 1, "bias"));
+                model.conv1d_2_w = get_tensor(string_format(TN_CONV1D, 2, "weight"));
+                model.conv1d_2_b = get_tensor(string_format(TN_CONV1D, 2, "bias"));
+                model.mm_fc_w = get_tensor(string_format(TN_MM_AUDIO_FC, "weight"));
+                model.mm_fc_b = get_tensor(string_format(TN_MM_AUDIO_FC, "bias"));
             } break;
         case PROJECTOR_TYPE_INTERNVL:
             {
-
-
-
-
-
-
+                model.mm_0_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 0, "weight"));
+                model.mm_0_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 0, "bias"));
+                model.mm_1_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 1, "weight"));
+                model.mm_1_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 1, "bias"));
+                model.mm_3_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 3, "weight"));
+                model.mm_3_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 3, "bias"));
             } break;
         case PROJECTOR_TYPE_LLAMA4:
             {
-
-
-
+                model.mm_model_proj = get_tensor(TN_MM_PROJECTOR);
+                model.mm_model_mlp_1_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 1, "weight"));
+                model.mm_model_mlp_2_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 2, "weight"));
             } break;
         default:
             LM_GGML_ASSERT(false && "unknown projector type");
@@ -2575,21 +2648,20 @@ struct clip_model_loader {
         }
     }
 
-    void alloc_compute_meta() {
-        const auto & hparams = ctx_clip.
+    void alloc_compute_meta(clip_ctx & ctx_clip) {
+        const auto & hparams = ctx_clip.model.hparams;
         ctx_clip.buf_compute_meta.resize(ctx_clip.max_nodes * lm_ggml_tensor_overhead() + lm_ggml_graph_overhead());
 
         // create a fake batch
         clip_image_f32_batch batch;
         clip_image_f32_ptr img(clip_image_f32_init());
-        if (
+        if (ctx_clip.model.modality == CLIP_MODALITY_VISION) {
             img->nx = hparams.warmup_image_size;
             img->ny = hparams.warmup_image_size;
         } else {
-            img->nx =
+            img->nx = hparams.warmup_audio_size;
             img->ny = hparams.n_mel_bins;
         }
-        img->buf.resize(img->nx * img->ny * 3);
         batch.entries.push_back(std::move(img));
 
         lm_ggml_cgraph * gf = clip_image_build_graph(&ctx_clip, batch);
@@ -2665,25 +2737,57 @@ struct clip_model_loader {
             output[i] = values[i];
         }
     }
+
+    void set_llava_uhd_res_candidates(clip_model & model, const int max_patches_per_side) {
+        auto & hparams = model.hparams;
+        for (int x = 1; x <= max_patches_per_side; x++) {
+            for (int y = 1; y <= max_patches_per_side; y++) {
+                if (x == 1 && y == 1) {
+                    continue; // skip the first point
+                }
+                hparams.image_res_candidates.push_back(clip_image_size{
+                    x*hparams.image_size,
+                    y*hparams.image_size,
+                });
+            }
+        }
+    }
 };
 
-struct
+struct clip_init_result clip_init(const char * fname, struct clip_context_params ctx_params) {
     g_logger_state.verbosity_thold = ctx_params.verbosity;
-    clip_ctx *
+    clip_ctx * ctx_vision = nullptr;
+    clip_ctx * ctx_audio = nullptr;
 
     try {
-
-
-        loader.
-
-
+        clip_model_loader loader(fname);
+
+        if (loader.has_vision) {
+            ctx_vision = new clip_ctx(ctx_params);
+            loader.load_hparams(ctx_vision->model, CLIP_MODALITY_VISION);
+            loader.load_tensors(*ctx_vision);
+            loader.alloc_compute_meta(*ctx_vision);
+        }
+
+        if (loader.has_audio) {
+            ctx_audio = new clip_ctx(ctx_params);
+            loader.load_hparams(ctx_audio->model, CLIP_MODALITY_AUDIO);
+            loader.load_tensors(*ctx_audio);
+            loader.alloc_compute_meta(*ctx_audio);
+        }
+
    } catch (const std::exception & e) {
        LOG_ERR("%s: failed to load model '%s': %s\n", __func__, fname, e.what());
-
-
+        if (ctx_vision) {
+            delete ctx_vision;
+        }
+        if (ctx_audio) {
+            delete ctx_audio;
+        }
+        return {nullptr, nullptr};
    }
 
-    return
+    return {ctx_vision, ctx_audio};
 }
 
 struct clip_image_size * clip_image_size_init() {
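Note on the hunk above: the new set_llava_uhd_res_candidates() helper simply enumerates every whole-tile grid up to max_patches_per_side and skips the 1x1 case. Below is a minimal standalone sketch of that enumeration; the struct name and the 336/3 values are illustrative assumptions, not values taken from the package.

```cpp
// Standalone sketch (not part of the diff) of the candidate-resolution enumeration.
#include <cstdio>
#include <vector>

struct image_size { int width, height; };

int main() {
    const int image_size_px        = 336; // assumed base tile size
    const int max_patches_per_side = 3;   // assumed grid limit

    std::vector<image_size> candidates;
    for (int x = 1; x <= max_patches_per_side; x++) {
        for (int y = 1; y <= max_patches_per_side; y++) {
            if (x == 1 && y == 1) {
                continue; // the 1x1 grid is skipped, as in the diff
            }
            candidates.push_back({x * image_size_px, y * image_size_px});
        }
    }
    for (const auto & c : candidates) {
        std::printf("%d x %d\n", c.width, c.height); // e.g. 336 x 672, 672 x 336, ...
    }
    return 0;
}
```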
@@ -2757,30 +2861,6 @@ void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny
     memcpy(img->buf.data(), rgb_pixels, img->buf.size());
 }
 
-bool clip_image_load_from_file(const char * fname, clip_image_u8 * img) {
-    int nx, ny, nc;
-    auto * data = stbi_load(fname, &nx, &ny, &nc, 3);
-    if (!data) {
-        LOG_ERR("%s: failed to load image '%s'\n", __func__, fname);
-        return false;
-    }
-    clip_build_img_from_pixels(data, nx, ny, img);
-    stbi_image_free(data);
-    return true;
-}
-
-bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img) {
-    int nx, ny, nc;
-    auto * data = stbi_load_from_memory(bytes, bytes_length, &nx, &ny, &nc, 3);
-    if (!data) {
-        LOG_ERR("%s: failed to decode image bytes\n", __func__);
-        return false;
-    }
-    clip_build_img_from_pixels(data, nx, ny, img);
-    stbi_image_free(data);
-    return true;
-}
-
 // Normalize image to float32 - careful with pytorch .to(model.device, dtype=torch.float16) - this sometimes reduces precision (32>16>32), sometimes not
 static void normalize_image_u8_to_f32(const clip_image_u8 & src, clip_image_f32 & dst, const float mean[3], const float std[3]) {
     dst.nx = src.nx;
@@ -3026,36 +3106,41 @@ struct llava_uhd {
         bool padding_refined = false; // if true, refine image will be padded to the grid size (e.g. llava-1.6)
     };
 
-    static int get_max_slices(struct clip_ctx * ctx) {
-        if (clip_is_minicpmv(ctx)) {
-            return 9;
-        }
-        return 0;
-    }
-
     static slice_instructions get_slice_instructions(struct clip_ctx * ctx, const clip_image_size & original_size) {
         slice_instructions res;
         const int patch_size = clip_get_patch_size(ctx);
         const int slice_size = clip_get_image_size(ctx);
-        const int max_slice_nums = get_max_slices(ctx);
         const int original_width = original_size.width;
         const int original_height = original_size.height;
-
-        const
-        const
-
-
+
+        const bool has_slices = original_size.width > slice_size || original_size.height > slice_size;
+        const bool has_pinpoints = !ctx->model.hparams.image_res_candidates.empty();
+
+        if (!has_slices) {
+            // skip slicing logic
+            res.overview_size = clip_image_size{slice_size, slice_size};
+            res.refined_size = clip_image_size{0, 0};
+            res.grid_size = clip_image_size{0, 0};
+
+            return res;
+        }
 
         if (has_pinpoints) {
             // has pinpoints, use them to calculate the grid size (e.g. llava-1.6)
             auto refine_size = llava_uhd::select_best_resolution(
-
-
+                original_size,
+                ctx->model.hparams.image_res_candidates);
             res.overview_size = clip_image_size{slice_size, slice_size};
             res.refined_size = refine_size;
             res.grid_size = clip_image_size{0, 0};
             res.padding_refined = true;
-
+
+            LOG_DBG("%s: using pinpoints for slicing\n", __func__);
+            LOG_DBG("%s: original size: %d x %d, overview size: %d x %d, refined size: %d x %d\n",
+                    __func__, original_width, original_height,
+                    res.overview_size.width, res.overview_size.height,
+                    res.refined_size.width, res.refined_size.height);
+
             for (int y = 0; y < refine_size.height; y += slice_size) {
                 for (int x = 0; x < refine_size.width; x += slice_size) {
                     slice_coordinates slice;
@@ -3064,13 +3149,16 @@ struct llava_uhd {
                     slice.size.width = std::min(slice_size, refine_size.width - x);
                     slice.size.height = std::min(slice_size, refine_size.height - y);
                     res.slices.push_back(slice);
-
-
-
+                    LOG_DBG("%s: slice %d: x=%d, y=%d, size=%dx%d\n",
+                            __func__, (int)res.slices.size() - 1,
+                            slice.x, slice.y, slice.size.width, slice.size.height);
                 }
-                res.grid_size.height++;
             }
 
+            res.grid_size.height = refine_size.height / slice_size;
+            res.grid_size.width = refine_size.width / slice_size;
+            LOG_DBG("%s: grid size: %d x %d\n", __func__, res.grid_size.width, res.grid_size.height);
+
             return res;
         }
 
@@ -3079,17 +3167,23 @@ struct llava_uhd {
         auto best_size = get_best_resize(original_size, slice_size, patch_size, !has_slices);
         res.overview_size = best_size;
 
-
-        //
-
-
+        {
+            const int max_slice_nums = 9; // TODO: this is only used by minicpmv, maybe remove it
+            const float log_ratio = log((float)original_width / original_height);
+            const float ratio = (float)original_width * original_height / (slice_size * slice_size);
+            const int multiple = fmin(ceil(ratio), max_slice_nums);
 
-        } else {
             auto best_grid = get_best_grid(max_slice_nums, multiple, log_ratio);
             auto refine_size = get_refine_size(original_size, best_grid, slice_size, patch_size, true);
             res.grid_size = best_grid;
             res.refined_size = refine_size;
 
+            LOG_DBG("%s: original size: %d x %d, overview size: %d x %d, refined size: %d x %d, grid size: %d x %d\n",
+                    __func__, original_width, original_height,
+                    res.overview_size.width, res.overview_size.height,
+                    res.refined_size.width, res.refined_size.height,
+                    res.grid_size.width, res.grid_size.height);
+
             int width = refine_size.width;
             int height = refine_size.height;
             int grid_x = int(width / best_grid.width);
@@ -3106,7 +3200,9 @@ struct llava_uhd {
                     slice.size.width = grid_x;
                     slice.size.height = grid_y;
                     res.slices.push_back(slice);
-
+                    LOG_DBG("%s: slice %d: x=%d, y=%d, size=%dx%d\n",
+                            __func__, (int)res.slices.size() - 1,
+                            slice.x, slice.y, slice.size.width, slice.size.height);
                 }
             }
         }
@@ -3164,48 +3260,55 @@ private:
         return res;
     }
 
+    static clip_image_size resize_maintain_aspect_ratio(const clip_image_size & orig, const clip_image_size & target_max) {
+        float scale_width = static_cast<float>(target_max.width) / orig.width;
+        float scale_height = static_cast<float>(target_max.height) / orig.height;
+        float scale = std::min(scale_width, scale_height);
+        return clip_image_size{
+            static_cast<int>(orig.width * scale),
+            static_cast<int>(orig.height * scale),
+        };
+    }
+
     /**
      * Selects the best resolution from a list of possible resolutions based on the original size.
      *
+     * For example, when given a list of resolutions:
+     *  - 100x100
+     *  - 200x100
+     *  - 100x200
+     *  - 200x200
+     *
+     * And an input image of size 111x200, then 100x200 is the best fit (least wasted resolution).
+     *
     * @param original_size The original size of the image
     * @param possible_resolutions A list of possible resolutions
     * @return The best fit resolution
     */
    static clip_image_size select_best_resolution(const clip_image_size & original_size, const std::vector<clip_image_size> & possible_resolutions) {
-        int original_width = original_size.width;
-        int original_height = original_size.height;
        clip_image_size best_fit;
+        int min_wasted_area = std::numeric_limits<int>::max();
        int max_effective_resolution = 0;
-
-
-
-        int
-
-
-        int
-
-
-            int wasted_resolution = (width * height) - effective_resolution;
-            // LOG_INF("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution);
-            if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_resolution < min_wasted_resolution)) {
+
+        for (const clip_image_size & candidate : possible_resolutions) {
+            auto target_size = resize_maintain_aspect_ratio(original_size, candidate);
+            int effective_resolution = std::min(
+                target_size.width * target_size.height,
+                original_size.width * original_size.height);
+            int wasted_area = (candidate.width * candidate.height) - effective_resolution;
+
+            if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_area < min_wasted_area)) {
                max_effective_resolution = effective_resolution;
-
-                best_fit =
+                min_wasted_area = wasted_area;
+                best_fit = candidate;
            }
+
+            LOG_DBG("%s: candidate: %d x %d, target: %d x %d, wasted: %d, effective: %d\n", __func__, candidate.width, candidate.height, target_size.width, target_size.height, wasted_area, effective_resolution);
        }
 
        return best_fit;
    }
 
-    // used by llava 1.6 with custom list of pinpoints
-    static clip_image_size select_best_resolution(const std::vector<int32_t> & pinpoints, const clip_image_size & original_size) {
-        std::vector<clip_image_size> possible_resolutions; // TODO @ngxson : construct this inside hparams, not here
-        for (size_t i = 0; i < pinpoints.size(); i += 2) {
-            possible_resolutions.push_back(clip_image_size{pinpoints[i], pinpoints[i+1]});
-        }
-        return select_best_resolution(original_size, possible_resolutions);
-    }
-
    static int ensure_divide(int length, int patch_size) {
        return std::max(static_cast<int>(std::round(static_cast<float>(length) / patch_size) * patch_size), patch_size);
    }
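The doc comment in the hunk above describes the selection rule in words; the sketch below is a self-contained illustration of the same comparison (highest effective resolution first, smallest wasted area as the tie-breaker). The struct, helper names, and the 640x480 input are illustrative assumptions, not the package's API.

```cpp
// Standalone sketch (not from the package) of the "best fit resolution" comparison.
#include <algorithm>
#include <cstdio>
#include <limits>
#include <vector>

struct size2d { int width, height; };

// Downscale/upscale the original so it fits inside the candidate while keeping aspect ratio.
static size2d fit_keep_aspect(const size2d & orig, const size2d & target) {
    float scale = std::min((float)target.width / orig.width, (float)target.height / orig.height);
    return { (int)(orig.width * scale), (int)(orig.height * scale) };
}

static size2d pick_best(const size2d & orig, const std::vector<size2d> & candidates) {
    size2d best = candidates.front();
    int best_effective = 0;
    int best_wasted = std::numeric_limits<int>::max();
    for (const auto & cand : candidates) {
        size2d fitted  = fit_keep_aspect(orig, cand);
        int effective  = std::min(fitted.width * fitted.height, orig.width * orig.height);
        int wasted     = cand.width * cand.height - effective;
        if (effective > best_effective || (effective == best_effective && wasted < best_wasted)) {
            best_effective = effective;
            best_wasted    = wasted;
            best           = cand;
        }
    }
    return best;
}

int main() {
    // Assumed example: a 640x480 image against three tile grids of a 336px base size.
    std::vector<size2d> candidates = {{336, 336}, {672, 336}, {336, 672}};
    size2d best = pick_best({640, 480}, candidates);
    std::printf("best fit: %d x %d\n", best.width, best.height); // prints: best fit: 672 x 336
    return 0;
}
```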
@@ -3271,7 +3374,7 @@ private:
 bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, struct clip_image_f32_batch * res_imgs) {
     clip_image_size original_size{img->nx, img->ny};
     bool pad_to_square = true;
-    auto & params = ctx->
+    auto & params = ctx->model.hparams;
     // The model config actually contains all we need to decide on how to preprocess, here we automatically switch to the new llava-1.6 preprocessing
     if (params.mm_patch_merge_type == PATCH_MERGE_SPATIAL_UNPAD) {
         pad_to_square = false;
@@ -3284,7 +3387,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
         for (size_t i = 0; i < imgs.size(); ++i) {
             // clip_image_save_to_bmp(*imgs[i], "slice_" + std::to_string(i) + ".bmp");
             clip_image_f32_ptr res(clip_image_f32_init());
-            normalize_image_u8_to_f32(*imgs[i], *res,
+            normalize_image_u8_to_f32(*imgs[i], *res, params.image_mean, params.image_std);
             res_imgs->entries.push_back(std::move(res));
         }
 
@@ -3292,7 +3395,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
         res_imgs->grid_y = inst.grid_size.height;
         return true;
 
-    } else if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type == PROJECTOR_TYPE_QWEN25VL) {
+    } else if (ctx->proj_type() == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type() == PROJECTOR_TYPE_QWEN25VL) {
         clip_image_u8 resized;
         auto patch_size = params.patch_size * 2;
         auto new_size = image_manipulation::calc_size_preserved_ratio(original_size, patch_size, params.image_size);
@@ -3300,42 +3403,42 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
 
         clip_image_f32_ptr img_f32(clip_image_f32_init());
         // clip_image_f32_ptr res(clip_image_f32_init());
-        normalize_image_u8_to_f32(resized, *img_f32,
+        normalize_image_u8_to_f32(resized, *img_f32, params.image_mean, params.image_std);
         // res_imgs->data[0] = *res;
         res_imgs->entries.push_back(std::move(img_f32));
         return true;
     }
-    else if (ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE
-            || ctx->proj_type == PROJECTOR_TYPE_GEMMA3
-            || ctx->proj_type == PROJECTOR_TYPE_IDEFICS3
-            || ctx->proj_type == PROJECTOR_TYPE_INTERNVL // TODO @ngxson : support dynamic resolution
+    else if (ctx->proj_type() == PROJECTOR_TYPE_GLM_EDGE
+            || ctx->proj_type() == PROJECTOR_TYPE_GEMMA3
+            || ctx->proj_type() == PROJECTOR_TYPE_IDEFICS3
+            || ctx->proj_type() == PROJECTOR_TYPE_INTERNVL // TODO @ngxson : support dynamic resolution
     ) {
         clip_image_u8 resized_image;
         int sz = params.image_size;
         image_manipulation::resize_and_pad_image(*img, resized_image, {sz, sz});
         clip_image_f32_ptr img_f32(clip_image_f32_init());
         //clip_image_save_to_bmp(resized_image, "resized.bmp");
-        normalize_image_u8_to_f32(resized_image, *img_f32,
+        normalize_image_u8_to_f32(resized_image, *img_f32, params.image_mean, params.image_std);
         res_imgs->entries.push_back(std::move(img_f32));
         return true;
 
-    } else if (ctx->proj_type == PROJECTOR_TYPE_PIXTRAL) {
+    } else if (ctx->proj_type() == PROJECTOR_TYPE_PIXTRAL) {
         clip_image_u8 resized_image;
         auto new_size = image_manipulation::calc_size_preserved_ratio(original_size, params.patch_size, params.image_size);
         image_manipulation::bilinear_resize(*img, resized_image, new_size.width, new_size.height);
         clip_image_f32_ptr img_f32(clip_image_f32_init());
-        normalize_image_u8_to_f32(resized_image, *img_f32,
+        normalize_image_u8_to_f32(resized_image, *img_f32, params.image_mean, params.image_std);
         res_imgs->entries.push_back(std::move(img_f32));
         return true;
 
-    } else if (ctx->proj_type == PROJECTOR_TYPE_LLAMA4) {
-        LM_GGML_ASSERT(!params.
+    } else if (ctx->proj_type() == PROJECTOR_TYPE_LLAMA4) {
+        LM_GGML_ASSERT(!params.image_res_candidates.empty());
         auto const inst = llava_uhd::get_slice_instructions(ctx, original_size);
         std::vector<clip_image_u8_ptr> imgs = llava_uhd::slice_image(img, inst);
 
         for (size_t i = 0; i < imgs.size(); ++i) {
             clip_image_f32_ptr res(clip_image_f32_init());
-            normalize_image_u8_to_f32(*imgs[i], *res,
+            normalize_image_u8_to_f32(*imgs[i], *res, params.image_mean, params.image_std);
             res_imgs->entries.push_back(std::move(res));
         }
 
@@ -3365,11 +3468,11 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
         image_manipulation::resize_and_pad_image(*img, *temp, clip_image_size{params.image_size, params.image_size}, pad_color);
 
         clip_image_f32_ptr res(clip_image_f32_init());
-        normalize_image_u8_to_f32(*temp, *res,
+        normalize_image_u8_to_f32(*temp, *res, params.image_mean, params.image_std);
         res_imgs->entries.push_back(std::move(res));
         return true;
 
-    } else if (!params.
+    } else if (!params.image_res_candidates.empty()) {
         // "spatial_unpad" with "anyres" processing for llava-1.6
         auto const inst = llava_uhd::get_slice_instructions(ctx, original_size);
         std::vector<clip_image_u8_ptr> imgs = llava_uhd::slice_image(img, inst);
@@ -3377,7 +3480,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
         for (size_t i = 0; i < imgs.size(); ++i) {
             // clip_image_save_to_bmp(*imgs[i], "slice_" + std::to_string(i) + ".bmp");
             clip_image_f32_ptr res(clip_image_f32_init());
-            normalize_image_u8_to_f32(*imgs[i], *res,
+            normalize_image_u8_to_f32(*imgs[i], *res, params.image_mean, params.image_std);
             res_imgs->entries.push_back(std::move(res));
         }
 
@@ -3389,7 +3492,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
 }
 
 lm_ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx) {
-    return ctx->
+    return ctx->model.image_newline;
 }
 
 void clip_free(clip_ctx * ctx) {
@@ -3401,8 +3504,8 @@ void clip_free(clip_ctx * ctx) {
 
 // deprecated
 size_t clip_embd_nbytes(const struct clip_ctx * ctx) {
-    const int32_t nx = ctx->
-    const int32_t ny = ctx->
+    const int32_t nx = ctx->model.hparams.image_size;
+    const int32_t ny = ctx->model.hparams.image_size;
     return clip_embd_nbytes_by_img(ctx, nx, ny);
 }
 
@@ -3414,101 +3517,124 @@ size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_w, int img_h
 }
 
 int32_t clip_get_image_size(const struct clip_ctx * ctx) {
-    return ctx->
+    return ctx->model.hparams.image_size;
 }
 
 int32_t clip_get_patch_size(const struct clip_ctx * ctx) {
-    return ctx->
+    return ctx->model.hparams.patch_size;
 }
 
 int32_t clip_get_hidden_size(const struct clip_ctx * ctx) {
-    return ctx->
+    return ctx->model.hparams.n_embd;
 }
 
 const char * clip_patch_merge_type(const struct clip_ctx * ctx) {
-    return ctx->
-}
-
-const int32_t * clip_image_grid(const struct clip_ctx * ctx) {
-    if (ctx->vision_model.hparams.image_grid_pinpoints.size()) {
-        return &ctx->vision_model.hparams.image_grid_pinpoints.front();
-    }
-    return nullptr;
-}
-
-size_t get_clip_image_grid_size(const struct clip_ctx * ctx) {
-    return ctx->vision_model.hparams.image_grid_pinpoints.size();
+    return ctx->model.hparams.mm_patch_merge_type == PATCH_MERGE_SPATIAL_UNPAD ? "spatial_unpad" : "flat";
 }
 
 int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
-    const auto & params = ctx->
+    const auto & params = ctx->model.hparams;
     const int n_total = clip_n_output_tokens(ctx, img);
-    if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type == PROJECTOR_TYPE_QWEN25VL) {
+    if (ctx->proj_type() == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type() == PROJECTOR_TYPE_QWEN25VL) {
         return img->nx / (params.patch_size * 2) + (int)(img->nx % params.patch_size > 0);
     }
     return n_total;
 }
 
 int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
-    const auto & params = ctx->
-    if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type == PROJECTOR_TYPE_QWEN25VL) {
+    const auto & params = ctx->model.hparams;
+    if (ctx->proj_type() == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type() == PROJECTOR_TYPE_QWEN25VL) {
         return img->ny / (params.patch_size * 2) + (int)(img->ny % params.patch_size > 0);
     }
     return 1;
 }
 
 int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
-    const auto & params = ctx->
+    const auto & params = ctx->model.hparams;
 
-
-    int
+    // only for models using fixed size square images
+    int n_patches_sq = (params.image_size / params.patch_size) * (params.image_size / params.patch_size);
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    projector_type proj = ctx->proj_type();
+
+    switch (proj) {
+        case PROJECTOR_TYPE_MLP:
+        case PROJECTOR_TYPE_MLP_NORM:
+            {
+                // do nothing
+            } break;
+        case PROJECTOR_TYPE_LDP:
+        case PROJECTOR_TYPE_LDPV2:
+        case PROJECTOR_TYPE_GLM_EDGE:
+            {
+                n_patches_sq /= 4;
+                if (ctx->model.mm_glm_tok_boi) {
+                    n_patches_sq += 2; // for BOI and EOI token embeddings
+                }
+            } break;
+        case PROJECTOR_TYPE_MINICPMV:
+            {
+                if (params.minicpmv_version == 2) {
+                    n_patches_sq = 96;
+                } else if (params.minicpmv_version == 3) {
+                    n_patches_sq = 64;
+                } else if (params.minicpmv_version == 4) {
+                    n_patches_sq = 64;
+                } else {
+                    LM_GGML_ABORT("Unknown minicpmv version");
+                }
+            } break;
+        case PROJECTOR_TYPE_QWEN2VL:
+        case PROJECTOR_TYPE_QWEN25VL:
+            {
+                // dynamic size
+                int patch_size = params.patch_size * 2;
+                int x_patch = img->nx / patch_size + (int)(img->nx % patch_size > 0);
+                int y_patch = img->ny / patch_size + (int)(img->ny % patch_size > 0);
+                n_patches_sq = x_patch * y_patch;
+            } break;
+        case PROJECTOR_TYPE_GEMMA3:
+            {
+                int n_per_side = params.image_size / params.patch_size;
+                int n_per_side_2d_pool = n_per_side / params.proj_scale_factor;
+                n_patches_sq = n_per_side_2d_pool * n_per_side_2d_pool;
+            } break;
+        case PROJECTOR_TYPE_IDEFICS3:
+        case PROJECTOR_TYPE_INTERNVL:
+            {
+                // both W and H are divided by proj_scale_factor
+                n_patches_sq /= (params.proj_scale_factor * params.proj_scale_factor);
+            } break;
+        case PROJECTOR_TYPE_PIXTRAL:
+            {
+                // dynamic size
+                int n_merge = params.spatial_merge_size;
+                int n_patches_x = img->nx / params.patch_size / (n_merge > 0 ? n_merge : 1);
+                int n_patches_y = img->ny / params.patch_size / (n_merge > 0 ? n_merge : 1);
+                n_patches_sq = n_patches_y * n_patches_x + n_patches_y - 1; // + one [IMG_BREAK] per row, except the last row
+            } break;
+        case PROJECTOR_TYPE_LLAMA4:
+            {
+                int scale_factor = ctx->model.hparams.proj_scale_factor;
+                n_patches_sq /= (scale_factor * scale_factor);
+            } break;
+        case PROJECTOR_TYPE_ULTRAVOX:
+            {
+                const int proj_stack_factor = ctx->model.hparams.proj_stack_factor;
+                const int n_len = CLIP_ALIGN(img->nx, proj_stack_factor);
+                n_patches_sq = n_len / proj_stack_factor / 2;
+            } break;
+        case PROJECTOR_TYPE_QWEN2A:
+            {
+                // divide by 2 because of whisper
+                // another divide by 2 because of nn.AvgPool1d(2, stride=2)
+                n_patches_sq = img->nx / 4;
+            } break;
+        default:
+            LM_GGML_ABORT("unsupported projector type");
+    }
+
+    return n_patches_sq;
 }
 
 static std::vector<std::vector<std::vector<float>>> get_1d_sincos_pos_embed_from_grid_new(int embed_dim, const std::vector<std::vector<float>> & pos) {
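The new switch above reduces each projector's output-token count to short arithmetic. The sketch below reproduces three of those formulas with hypothetical sizes; every constant here is an assumption for illustration, not a value read from a real model.

```cpp
// Standalone sketch (not from the package) of the per-projector token-count arithmetic.
#include <cstdio>

int main() {
    // Qwen2-VL style: patches are merged 2x2, so the effective patch size doubles
    // and any partially covered edge still counts as a full patch.
    const int patch_size = 14 * 2;   // assumed ViT patch size of 14, merged 2x2
    const int nx = 1036, ny = 728;   // assumed preprocessed image size in pixels
    const int x_patch = nx / patch_size + (nx % patch_size > 0 ? 1 : 0);
    const int y_patch = ny / patch_size + (ny % patch_size > 0 ? 1 : 0);
    std::printf("qwen2vl tokens: %d\n", x_patch * y_patch); // 37 * 26 = 962

    // Pixtral style: merged grid of patches, plus one [IMG_BREAK] token per row
    // except the last row.
    const int n_patches_x = 24, n_patches_y = 18; // assumed merged grid
    std::printf("pixtral tokens: %d\n", n_patches_y * n_patches_x + n_patches_y - 1); // 449

    // Qwen2-Audio style: the mel length is halved by the whisper-style encoder and
    // halved again by an AvgPool1d(2, stride=2), i.e. divided by 4 overall.
    const int n_frames = 3000; // assumed mel frames
    std::printf("qwen2a tokens: %d\n", n_frames / 4); // 750
    return 0;
}
```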
@@ -3623,7 +3749,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
     lm_ggml_backend_sched_alloc_graph(ctx->sched.get(), gf);
 
     // set inputs
-    const auto & model = ctx->
+    const auto & model = ctx->model;
     const auto & hparams = model.hparams;
 
     const int image_size_width = imgs.entries[0]->nx;
@@ -3713,7 +3839,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
     }
 
     // set input per projector
-    switch (ctx->proj_type) {
+    switch (ctx->model.proj_type) {
         case PROJECTOR_TYPE_MINICPMV:
             {
                 // inspired from siglip:
@@ -3906,6 +4032,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
         case PROJECTOR_TYPE_GEMMA3:
         case PROJECTOR_TYPE_IDEFICS3:
         case PROJECTOR_TYPE_INTERNVL:
+        case PROJECTOR_TYPE_QWEN2A:
         case PROJECTOR_TYPE_ULTRAVOX:
             {
                 // do nothing
@@ -3966,7 +4093,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
     const int n_tokens_out = embeddings->ne[1];
     const int expected_n_tokens_out = clip_n_output_tokens(ctx, imgs.entries[0].get());
     if (n_tokens_out != expected_n_tokens_out) {
-        LOG_ERR("%s: expected %d tokens, got %d\n", __func__, expected_n_tokens_out, n_tokens_out);
+        LOG_ERR("%s: expected output %d tokens, got %d\n", __func__, expected_n_tokens_out, n_tokens_out);
         LM_GGML_ABORT("Invalid number of output tokens");
     }
 
@@ -3977,74 +4104,83 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
 }
 
 int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
-
+    const auto & hparams = ctx->model.hparams;
+    switch (ctx->model.proj_type) {
        case PROJECTOR_TYPE_LDP:
-            return ctx->
+            return ctx->model.mm_model_block_1_block_2_1_b->ne[0];
        case PROJECTOR_TYPE_LDPV2:
-            return ctx->
+            return ctx->model.mm_model_peg_0_b->ne[0];
        case PROJECTOR_TYPE_MLP:
        case PROJECTOR_TYPE_PIXTRAL:
-            return ctx->
+            return ctx->model.mm_2_w->ne[1];
        case PROJECTOR_TYPE_MLP_NORM:
-            return ctx->
+            return ctx->model.mm_3_b->ne[0];
        case PROJECTOR_TYPE_MINICPMV:
-            if (
+            if (hparams.minicpmv_version == 2) {
                return 4096;
-            } else if (
+            } else if (hparams.minicpmv_version == 3) {
                return 3584;
-            } else if (
+            } else if (hparams.minicpmv_version == 4) {
                return 3584;
            }
            LM_GGML_ABORT("Unknown minicpmv version");
        case PROJECTOR_TYPE_GLM_EDGE:
-            return ctx->
+            return ctx->model.mm_model_mlp_3_w->ne[1];
        case PROJECTOR_TYPE_QWEN2VL:
        case PROJECTOR_TYPE_QWEN25VL:
-            return ctx->
+            return ctx->model.mm_1_b->ne[0];
        case PROJECTOR_TYPE_GEMMA3:
-            return ctx->
+            return ctx->model.mm_input_proj_w->ne[0];
        case PROJECTOR_TYPE_IDEFICS3:
-            return ctx->
+            return ctx->model.projection->ne[1];
        case PROJECTOR_TYPE_ULTRAVOX:
-            return ctx->
+            return ctx->model.mm_2_w->ne[1];
        case PROJECTOR_TYPE_INTERNVL:
-            return ctx->
+            return ctx->model.mm_3_w->ne[1];
        case PROJECTOR_TYPE_LLAMA4:
-            return ctx->
+            return ctx->model.mm_model_proj->ne[1];
+        case PROJECTOR_TYPE_QWEN2A:
+            return ctx->model.mm_fc_w->ne[1];
        default:
            LM_GGML_ABORT("Unknown projector type");
    }
 }
 
 int clip_is_minicpmv(const struct clip_ctx * ctx) {
-    if (ctx->proj_type == PROJECTOR_TYPE_MINICPMV) {
-        return ctx->minicpmv_version;
+    if (ctx->proj_type() == PROJECTOR_TYPE_MINICPMV) {
+        return ctx->model.hparams.minicpmv_version;
    }
    return 0;
 }
 
 bool clip_is_glm(const struct clip_ctx * ctx) {
-    return ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE;
+    return ctx->proj_type() == PROJECTOR_TYPE_GLM_EDGE;
 }
 
 bool clip_is_qwen2vl(const struct clip_ctx * ctx) {
-    return ctx->proj_type == PROJECTOR_TYPE_QWEN2VL
+    return ctx->proj_type() == PROJECTOR_TYPE_QWEN2VL
+        || ctx->proj_type() == PROJECTOR_TYPE_QWEN25VL;
 }
 
 bool clip_is_llava(const struct clip_ctx * ctx) {
-    return ctx->has_llava_projector;
+    return ctx->model.hparams.has_llava_projector;
 }
 
 bool clip_is_gemma3(const struct clip_ctx * ctx) {
-    return ctx->proj_type == PROJECTOR_TYPE_GEMMA3;
+    return ctx->proj_type() == PROJECTOR_TYPE_GEMMA3;
 }
 
 bool clip_has_vision_encoder(const struct clip_ctx * ctx) {
-    return ctx->
+    return ctx->model.modality == CLIP_MODALITY_VISION;
 }
 
 bool clip_has_audio_encoder(const struct clip_ctx * ctx) {
-    return ctx->
+    return ctx->model.modality == CLIP_MODALITY_AUDIO;
+}
+
+bool clip_has_whisper_encoder(const struct clip_ctx * ctx) {
+    return ctx->proj_type() == PROJECTOR_TYPE_ULTRAVOX
+        || ctx->proj_type() == PROJECTOR_TYPE_QWEN2A;
 }
 
 bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec) {
@@ -4065,7 +4201,7 @@ bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img,
 //
 
 projector_type clip_get_projector_type(const struct clip_ctx * ctx) {
-    return ctx->proj_type;
+    return ctx->proj_type();
 }
 
 void clip_image_f32_batch_add_mel(struct clip_image_f32_batch * batch, int n_mel, int n_frames, float * mel) {