cui-llama.rn 1.7.3 → 1.7.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +217 -17
- package/android/src/main/CMakeLists.txt +34 -15
- package/android/src/main/java/com/rnllama/LlamaContext.java +94 -8
- package/android/src/main/java/com/rnllama/RNLlama.java +247 -0
- package/android/src/main/jni.cpp +213 -14
- package/android/src/main/jniLibs/arm64-v8a/librnllama.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod_i8mm.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_i8mm.so +0 -0
- package/android/src/main/jniLibs/x86_64/librnllama.so +0 -0
- package/android/src/main/jniLibs/x86_64/librnllama_x86_64.so +0 -0
- package/android/src/newarch/java/com/rnllama/RNLlamaModule.java +35 -0
- package/android/src/oldarch/java/com/rnllama/RNLlamaModule.java +34 -0
- package/cpp/README.md +1 -1
- package/cpp/chat-parser.cpp +385 -0
- package/cpp/chat-parser.h +120 -0
- package/cpp/chat.cpp +726 -596
- package/cpp/chat.h +71 -6
- package/cpp/common.cpp +56 -38
- package/cpp/common.h +9 -3
- package/cpp/ggml-backend-reg.cpp +5 -0
- package/cpp/ggml-backend.cpp +10 -2
- package/cpp/ggml-common.h +4 -0
- package/cpp/ggml-cpu/amx/amx.cpp +1 -1
- package/cpp/ggml-cpu/amx/mmq.cpp +11 -10
- package/cpp/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
- package/cpp/ggml-cpu/arch/arm/quants.c +4114 -0
- package/cpp/ggml-cpu/arch/arm/repack.cpp +2163 -0
- package/cpp/ggml-cpu/arch/x86/cpu-feats.cpp +327 -0
- package/cpp/ggml-cpu/arch/x86/quants.c +4311 -0
- package/cpp/ggml-cpu/{ggml-cpu-aarch64.cpp → arch/x86/repack.cpp} +79 -3225
- package/cpp/ggml-cpu/arch-fallback.h +184 -0
- package/cpp/ggml-cpu/common.h +4 -3
- package/cpp/ggml-cpu/ggml-cpu-impl.h +21 -16
- package/cpp/ggml-cpu/ggml-cpu.c +123 -104
- package/cpp/ggml-cpu/ggml-cpu.cpp +11 -8
- package/cpp/ggml-cpu/ops.cpp +330 -148
- package/cpp/ggml-cpu/ops.h +1 -0
- package/cpp/ggml-cpu/quants.c +1158 -0
- package/cpp/ggml-cpu/{ggml-cpu-quants.h → quants.h} +26 -0
- package/cpp/ggml-cpu/repack.cpp +1571 -0
- package/cpp/ggml-cpu/repack.h +98 -0
- package/cpp/ggml-cpu/simd-mappings.h +330 -38
- package/cpp/ggml-cpu/{ggml-cpu-traits.cpp → traits.cpp} +1 -1
- package/cpp/ggml-cpu/vec.cpp +87 -18
- package/cpp/ggml-cpu/vec.h +249 -94
- package/cpp/ggml-cpu.h +1 -0
- package/cpp/ggml-impl.h +63 -183
- package/cpp/ggml-llama-sim.metallib +0 -0
- package/cpp/ggml-llama.metallib +0 -0
- package/cpp/ggml-metal.m +152 -45
- package/cpp/ggml-quants.c +0 -2
- package/cpp/ggml.c +61 -21
- package/cpp/ggml.h +22 -3
- package/cpp/gguf.cpp +24 -3
- package/cpp/json-partial.cpp +256 -0
- package/cpp/json-partial.h +38 -0
- package/cpp/json-schema-to-grammar.cpp +5 -47
- package/cpp/json-schema-to-grammar.h +4 -4
- package/cpp/llama-arch.cpp +153 -3
- package/cpp/llama-arch.h +27 -1
- package/cpp/llama-batch.cpp +741 -272
- package/cpp/llama-batch.h +112 -54
- package/cpp/llama-chat.cpp +30 -8
- package/cpp/llama-chat.h +1 -0
- package/cpp/llama-context.cpp +524 -339
- package/cpp/llama-context.h +38 -17
- package/cpp/llama-cparams.cpp +4 -0
- package/cpp/llama-cparams.h +2 -0
- package/cpp/llama-grammar.cpp +12 -2
- package/cpp/llama-graph.cpp +431 -356
- package/cpp/llama-graph.h +126 -58
- package/cpp/llama-hparams.cpp +10 -2
- package/cpp/llama-hparams.h +19 -2
- package/cpp/llama-kv-cache-unified-iswa.cpp +279 -0
- package/cpp/llama-kv-cache-unified-iswa.h +128 -0
- package/cpp/llama-kv-cache-unified.cpp +1841 -0
- package/cpp/llama-kv-cache-unified.h +303 -0
- package/cpp/llama-kv-cells.h +439 -0
- package/cpp/llama-memory-hybrid.cpp +246 -0
- package/cpp/llama-memory-hybrid.h +138 -0
- package/cpp/llama-memory-recurrent.cpp +1112 -0
- package/cpp/llama-memory-recurrent.h +183 -0
- package/cpp/llama-memory.cpp +41 -0
- package/cpp/llama-memory.h +86 -5
- package/cpp/llama-mmap.cpp +1 -1
- package/cpp/llama-model-loader.cpp +42 -17
- package/cpp/llama-model-saver.cpp +1 -0
- package/cpp/llama-model.cpp +1639 -513
- package/cpp/llama-model.h +26 -0
- package/cpp/llama-sampling.cpp +2 -2
- package/cpp/llama-vocab.cpp +65 -28
- package/cpp/llama-vocab.h +1 -0
- package/cpp/llama.cpp +11 -7
- package/cpp/llama.h +150 -42
- package/cpp/minja/chat-template.hpp +1 -1
- package/cpp/minja/minja.hpp +1 -1
- package/cpp/{json.hpp → nlohmann/json.hpp} +3027 -2267
- package/cpp/nlohmann/json_fwd.hpp +187 -0
- package/cpp/regex-partial.cpp +204 -0
- package/cpp/regex-partial.h +56 -0
- package/cpp/rn-llama.cpp +646 -35
- package/cpp/rn-llama.h +32 -1
- package/cpp/rn-tts.h +39 -0
- package/cpp/sampling.cpp +7 -8
- package/cpp/tools/mtmd/clip-impl.h +5 -0
- package/cpp/tools/mtmd/clip.cpp +572 -436
- package/cpp/tools/mtmd/clip.h +14 -4
- package/cpp/tools/mtmd/mtmd-audio.cpp +0 -86
- package/cpp/tools/mtmd/mtmd-audio.h +2 -17
- package/cpp/tools/mtmd/mtmd-helper.cpp +175 -12
- package/cpp/tools/mtmd/mtmd-helper.h +91 -0
- package/cpp/tools/mtmd/mtmd.cpp +368 -248
- package/cpp/tools/mtmd/mtmd.h +6 -70
- package/cpp/unicode.cpp +5 -0
- package/ios/CMakeLists.txt +26 -6
- package/ios/RNLlama.h +1 -1
- package/ios/RNLlama.mm +153 -3
- package/ios/RNLlamaContext.h +9 -1
- package/ios/RNLlamaContext.mm +112 -9
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/chat-parser.h +120 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/chat.h +71 -6
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/common.h +9 -3
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-common.h +4 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu.h +1 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-impl.h +63 -183
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml.h +22 -3
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/json-partial.h +38 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/json-schema-to-grammar.h +4 -4
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-arch.h +27 -1
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-batch.h +112 -54
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-chat.h +1 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-context.h +38 -17
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-cparams.h +2 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-graph.h +126 -58
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-hparams.h +19 -2
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cache-unified-iswa.h +128 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cache-unified.h +303 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cells.h +439 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-memory-hybrid.h +138 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-memory-recurrent.h +183 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-memory.h +86 -5
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-model.h +26 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-vocab.h +1 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama.h +150 -42
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/minja/chat-template.hpp +1 -1
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/minja/minja.hpp +1 -1
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/{json.hpp → nlohmann/json.hpp} +3027 -2267
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/nlohmann/json_fwd.hpp +187 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/regex-partial.h +56 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/rn-llama.h +32 -1
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/rn-tts.h +39 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/ggml-llama.metallib +0 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/rnllama +0 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/chat-parser.h +120 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/chat.h +71 -6
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/common.h +9 -3
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-common.h +4 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu.h +1 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-impl.h +63 -183
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml.h +22 -3
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/json-partial.h +38 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/json-schema-to-grammar.h +4 -4
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-arch.h +27 -1
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-batch.h +112 -54
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-chat.h +1 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-context.h +38 -17
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-cparams.h +2 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-graph.h +126 -58
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-hparams.h +19 -2
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache-unified-iswa.h +128 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache-unified.h +303 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cells.h +439 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory-hybrid.h +138 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory-recurrent.h +183 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory.h +86 -5
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model.h +26 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-vocab.h +1 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama.h +150 -42
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/minja/chat-template.hpp +1 -1
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/minja/minja.hpp +1 -1
- package/ios/rnllama.xcframework/{tvos-arm64/rnllama.framework/Headers → ios-arm64_x86_64-simulator/rnllama.framework/Headers/nlohmann}/json.hpp +3027 -2267
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/nlohmann/json_fwd.hpp +187 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/regex-partial.h +56 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/rn-llama.h +32 -1
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/rn-tts.h +39 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/ggml-llama-sim.metallib +0 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/rnllama +0 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/chat-parser.h +120 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/chat.h +71 -6
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/common.h +9 -3
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-common.h +4 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu.h +1 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-impl.h +63 -183
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml.h +22 -3
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/json-partial.h +38 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/json-schema-to-grammar.h +4 -4
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-arch.h +27 -1
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-batch.h +112 -54
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-chat.h +1 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-context.h +38 -17
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-cparams.h +2 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-graph.h +126 -58
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-hparams.h +19 -2
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cache-unified-iswa.h +128 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cache-unified.h +303 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cells.h +439 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-memory-hybrid.h +138 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-memory-recurrent.h +183 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-memory.h +86 -5
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-model.h +26 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-vocab.h +1 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama.h +150 -42
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/minja/chat-template.hpp +1 -1
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/minja/minja.hpp +1 -1
- package/ios/rnllama.xcframework/{ios-arm64_x86_64-simulator/rnllama.framework/Headers → tvos-arm64/rnllama.framework/Headers/nlohmann}/json.hpp +3027 -2267
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/nlohmann/json_fwd.hpp +187 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/regex-partial.h +56 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/rn-llama.h +32 -1
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/rn-tts.h +39 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/ggml-llama.metallib +0 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/rnllama +0 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/chat-parser.h +120 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/chat.h +71 -6
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/common.h +9 -3
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-common.h +4 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu.h +1 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-impl.h +63 -183
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml.h +22 -3
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/json-partial.h +38 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/json-schema-to-grammar.h +4 -4
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-arch.h +27 -1
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-batch.h +112 -54
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-chat.h +1 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-context.h +38 -17
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-cparams.h +2 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-graph.h +126 -58
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-hparams.h +19 -2
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache-unified-iswa.h +128 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache-unified.h +303 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cells.h +439 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory-hybrid.h +138 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory-recurrent.h +183 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory.h +86 -5
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model.h +26 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-vocab.h +1 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama.h +150 -42
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/minja/chat-template.hpp +1 -1
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/minja/minja.hpp +1 -1
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/nlohmann/json.hpp +25526 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/nlohmann/json_fwd.hpp +187 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/regex-partial.h +56 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/rn-llama.h +32 -1
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/rn-tts.h +39 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/ggml-llama-sim.metallib +0 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/rnllama +0 -0
- package/jest/mock.js +24 -0
- package/package.json +1 -1
- package/src/NativeRNLlama.ts +46 -2
- package/src/index.ts +105 -1
- package/cpp/ggml-cpu/ggml-cpu-aarch64.h +0 -8
- package/cpp/ggml-cpu/ggml-cpu-quants.c +0 -13326
- package/cpp/ggml-cpu/sgemm.cpp +0 -3544
- package/cpp/ggml-cpu/sgemm.h +0 -14
- package/cpp/llama-kv-cache.cpp +0 -2827
- package/cpp/llama-kv-cache.h +0 -515
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cache.h +0 -515
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache.h +0 -515
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cache.h +0 -515
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/json.hpp +0 -24766
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache.h +0 -515
- /package/cpp/ggml-cpu/{ggml-cpu-traits.h → traits.h} +0 -0
- /package/cpp/tools/mtmd/{miniaudio.h → miniaudio/miniaudio.h} +0 -0
- /package/cpp/tools/mtmd/{stb_image.h → stb/stb_image.h} +0 -0
package/cpp/tools/mtmd/mtmd.cpp
CHANGED
@@ -95,15 +95,21 @@ mtmd_context_params mtmd_context_params_default() {
|
|
95
95
|
}
|
96
96
|
|
97
97
|
struct mtmd_context {
|
98
|
-
struct clip_ctx *
|
98
|
+
struct clip_ctx * ctx_v; // vision
|
99
|
+
struct clip_ctx * ctx_a; // audio
|
99
100
|
const struct llama_model * text_model;
|
100
101
|
std::vector<float> image_embd_v; // image embedding vector
|
101
102
|
|
102
103
|
bool print_timings;
|
103
104
|
int n_threads;
|
104
105
|
std::string media_marker;
|
105
|
-
|
106
|
-
|
106
|
+
const int n_embd_text;
|
107
|
+
|
108
|
+
// these are not token, but strings used to mark the beginning and end of image/audio embeddings
|
109
|
+
std::string img_beg;
|
110
|
+
std::string img_end;
|
111
|
+
std::string aud_beg;
|
112
|
+
std::string aud_end;
|
107
113
|
|
108
114
|
// for llava-uhd style models, we need special tokens in-between slices
|
109
115
|
// minicpmv calls them "slices", llama 4 calls them "tiles"
|
@@ -132,26 +138,61 @@ struct mtmd_context {
|
|
132
138
|
text_model (text_model),
|
133
139
|
print_timings(ctx_params.print_timings),
|
134
140
|
n_threads (ctx_params.n_threads),
|
135
|
-
media_marker (ctx_params.media_marker)
|
141
|
+
media_marker (ctx_params.media_marker),
|
142
|
+
n_embd_text (llama_model_n_embd(text_model))
|
136
143
|
{
|
137
144
|
if (std::string(ctx_params.image_marker) != MTMD_DEFAULT_IMAGE_MARKER) {
|
138
145
|
throw std::runtime_error("custom image_marker is not supported anymore, use media_marker instead");
|
139
146
|
}
|
140
147
|
|
148
|
+
if (media_marker.empty()) {
|
149
|
+
throw std::runtime_error("media_marker must not be empty");
|
150
|
+
}
|
151
|
+
|
141
152
|
clip_context_params ctx_clip_params;
|
142
153
|
ctx_clip_params.use_gpu = ctx_params.use_gpu;
|
143
154
|
ctx_clip_params.verbosity = ctx_params.verbosity;
|
144
|
-
|
145
|
-
|
155
|
+
auto res = clip_init(mmproj_fname, ctx_clip_params);
|
156
|
+
ctx_v = res.ctx_v;
|
157
|
+
ctx_a = res.ctx_a;
|
158
|
+
if (!ctx_v && !ctx_a) {
|
146
159
|
throw std::runtime_error(string_format("Failed to load CLIP model from %s\n", mmproj_fname));
|
147
160
|
}
|
148
161
|
|
149
|
-
|
150
|
-
|
151
|
-
|
162
|
+
// if both vision and audio mmproj are present, we need to validate their n_embd
|
163
|
+
if (ctx_v && ctx_a) {
|
164
|
+
int n_embd_v = clip_n_mmproj_embd(ctx_v);
|
165
|
+
int n_embd_a = clip_n_mmproj_embd(ctx_a);
|
166
|
+
if (n_embd_v != n_embd_a) {
|
167
|
+
throw std::runtime_error(string_format(
|
168
|
+
"mismatch between vision and audio mmproj (n_embd_v = %d, n_embd_a = %d)\n",
|
169
|
+
n_embd_v, n_embd_a));
|
170
|
+
}
|
171
|
+
}
|
172
|
+
|
173
|
+
// since we already validate n_embd of vision and audio mmproj,
|
174
|
+
// we can safely assume that they are the same
|
175
|
+
int n_embd_clip = clip_n_mmproj_embd(ctx_v ? ctx_v : ctx_a);
|
176
|
+
if (n_embd_text != n_embd_clip) {
|
177
|
+
throw std::runtime_error(string_format(
|
178
|
+
"mismatch between text model (n_embd = %d) and mmproj (n_embd = %d)\n"
|
179
|
+
"hint: you may be using wrong mmproj\n",
|
180
|
+
n_embd_text, n_embd_clip));
|
181
|
+
}
|
182
|
+
if (ctx_v) {
|
183
|
+
init_vision();
|
184
|
+
}
|
185
|
+
if (ctx_a) {
|
186
|
+
init_audio();
|
187
|
+
}
|
188
|
+
}
|
152
189
|
|
153
|
-
|
154
|
-
|
190
|
+
void init_vision() {
|
191
|
+
LM_GGML_ASSERT(ctx_v != nullptr);
|
192
|
+
use_mrope = clip_is_qwen2vl(ctx_v);
|
193
|
+
|
194
|
+
projector_type proj = clip_get_projector_type(ctx_v);
|
195
|
+
int minicpmv_version = clip_is_minicpmv(ctx_v);
|
155
196
|
if (minicpmv_version == 2) {
|
156
197
|
// minicpmv 2.5 format:
|
157
198
|
// <image> (overview) </image><slice><image> (slice) </image><image> (slice) </image>\n ... </slice>
|
@@ -196,24 +237,82 @@ struct mtmd_context {
|
|
196
237
|
ov_img_first = false; // overview image is last
|
197
238
|
}
|
198
239
|
|
199
|
-
|
240
|
+
// set boi/eoi
|
241
|
+
if (proj == PROJECTOR_TYPE_GEMMA3) {
|
242
|
+
// <start_of_image> ... (image embeddings) ... <end_of_image>
|
243
|
+
img_beg = "<start_of_image>";
|
244
|
+
img_end = "<end_of_image>";
|
245
|
+
|
246
|
+
} else if (proj == PROJECTOR_TYPE_IDEFICS3) {
|
247
|
+
// https://github.com/huggingface/transformers/blob/a42ba80fa520c784c8f11a973ca9034e5f859b79/src/transformers/models/idefics3/processing_idefics3.py#L192-L215
|
248
|
+
img_beg = "<fake_token_around_image><global-img>";
|
249
|
+
img_end = "<fake_token_around_image>";
|
250
|
+
|
251
|
+
} else if (proj == PROJECTOR_TYPE_PIXTRAL) {
|
252
|
+
// https://github.com/huggingface/transformers/blob/1cd110c6cb6a6237614130c470e9a902dbc1a4bd/docs/source/en/model_doc/pixtral.md
|
253
|
+
img_end = "[IMG_END]";
|
254
|
+
|
255
|
+
} else if (proj == PROJECTOR_TYPE_QWEN2VL || proj == PROJECTOR_TYPE_QWEN25VL) {
|
256
|
+
// <|vision_start|> ... (image embeddings) ... <|vision_end|>
|
257
|
+
img_beg = "<|vision_start|>";
|
258
|
+
img_end = "<|vision_end|>";
|
259
|
+
|
260
|
+
} else if (proj == PROJECTOR_TYPE_LLAMA4) {
|
261
|
+
// (more details in mtmd_context constructor)
|
262
|
+
img_beg = "<|image_start|>";
|
263
|
+
img_end = "<|image_end|>";
|
264
|
+
LOG_WRN("%s: llama 4 vision is known to have degraded quality:\n"
|
265
|
+
" https://github.com/ggml-org/llama.cpp/pull/13282\n", __func__);
|
266
|
+
|
267
|
+
} else if (proj == PROJECTOR_TYPE_INTERNVL) {
|
268
|
+
// <img> ... (image embeddings) ... </img>
|
269
|
+
img_beg = "<img>";
|
270
|
+
img_end = "</img>";
|
271
|
+
|
272
|
+
}
|
273
|
+
}
|
274
|
+
|
275
|
+
void init_audio() {
|
276
|
+
LM_GGML_ASSERT(ctx_a != nullptr);
|
277
|
+
projector_type proj = clip_get_projector_type(ctx_a);
|
278
|
+
|
279
|
+
if (clip_has_whisper_encoder(ctx_a)) {
|
200
280
|
// TODO @ngxson : check if model n_mel is 128 or 80
|
201
281
|
w_filters = whisper_precalc_filters::get_128_bins();
|
202
282
|
}
|
203
283
|
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
284
|
+
LOG_WRN("%s: audio input is in experimental stage and may have reduced quality:\n"
|
285
|
+
" https://github.com/ggml-org/llama.cpp/discussions/13759\n", __func__);
|
286
|
+
|
287
|
+
if (proj == PROJECTOR_TYPE_QWEN2A) {
|
288
|
+
// <|audio_bos|> ... (embeddings) ... <|audio_eos|>
|
289
|
+
aud_beg = "<|audio_bos|>";
|
290
|
+
aud_end = "<|audio_eos|>";
|
291
|
+
|
208
292
|
}
|
209
|
-
|
210
|
-
|
211
|
-
|
293
|
+
}
|
294
|
+
|
295
|
+
// get clip ctx based on chunk type
|
296
|
+
clip_ctx * get_clip_ctx(const mtmd_input_chunk * chunk) const {
|
297
|
+
if (chunk->type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
|
298
|
+
return ctx_v;
|
299
|
+
} else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
|
300
|
+
return ctx_a;
|
212
301
|
}
|
302
|
+
LM_GGML_ABORT("unknown chunk type");
|
303
|
+
}
|
304
|
+
|
305
|
+
projector_type proj_type_v() const {
|
306
|
+
return ctx_v ? clip_get_projector_type(ctx_v) : PROJECTOR_TYPE_UNKNOWN;
|
307
|
+
}
|
308
|
+
|
309
|
+
projector_type proj_type_a() const {
|
310
|
+
return ctx_a ? clip_get_projector_type(ctx_a) : PROJECTOR_TYPE_UNKNOWN;
|
213
311
|
}
|
214
312
|
|
215
313
|
~mtmd_context() {
|
216
|
-
clip_free(
|
314
|
+
clip_free(ctx_a);
|
315
|
+
clip_free(ctx_v);
|
217
316
|
}
|
218
317
|
|
219
318
|
private:
|
@@ -260,162 +359,137 @@ void mtmd_free(mtmd_context * ctx) {
|
|
260
359
|
}
|
261
360
|
}
|
262
361
|
|
263
|
-
|
264
|
-
|
265
|
-
const
|
266
|
-
const std::string & text,
|
267
|
-
bool add_special,
|
268
|
-
bool parse_special) {
|
269
|
-
// upper limit for the number of tokens
|
270
|
-
int n_tokens = text.length() + 2 * add_special;
|
271
|
-
std::vector<llama_token> result(n_tokens);
|
272
|
-
n_tokens = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
|
273
|
-
if (n_tokens < 0) {
|
274
|
-
result.resize(-n_tokens);
|
275
|
-
int check = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
|
276
|
-
LM_GGML_ASSERT(check == -n_tokens);
|
277
|
-
} else {
|
278
|
-
result.resize(n_tokens);
|
279
|
-
}
|
280
|
-
return result;
|
281
|
-
}
|
362
|
+
struct mtmd_tokenizer {
|
363
|
+
mtmd_context * ctx;
|
364
|
+
std::vector<const mtmd_bitmap *> bitmaps;
|
282
365
|
|
283
|
-
|
284
|
-
|
366
|
+
std::string input_text;
|
367
|
+
bool add_special;
|
368
|
+
bool parse_special;
|
369
|
+
const llama_vocab * vocab;
|
370
|
+
|
371
|
+
mtmd_input_chunks cur;
|
372
|
+
|
373
|
+
mtmd_tokenizer(mtmd_context * ctx,
|
285
374
|
const mtmd_input_text * text,
|
286
375
|
const mtmd_bitmap ** bitmaps,
|
287
|
-
size_t n_bitmaps) {
|
288
|
-
|
289
|
-
|
290
|
-
|
291
|
-
|
292
|
-
|
293
|
-
|
294
|
-
|
295
|
-
|
296
|
-
|
297
|
-
// a bit hacky here, but works for now
|
298
|
-
// for some models, we need to add prefix and suffix to the image embeddings
|
299
|
-
if (clip_is_gemma3(ctx->ctx_clip)) {
|
300
|
-
// gemma 3
|
301
|
-
// <start_of_image> ... (image embeddings) ... <end_of_image>
|
302
|
-
marker_modified = "<start_of_image>" + ctx->media_marker + "<end_of_image>";
|
303
|
-
string_replace_all(prompt_modified, ctx->media_marker, marker_modified);
|
304
|
-
|
305
|
-
} else if (proj_type == PROJECTOR_TYPE_IDEFICS3) {
|
306
|
-
// https://github.com/huggingface/transformers/blob/a42ba80fa520c784c8f11a973ca9034e5f859b79/src/transformers/models/idefics3/processing_idefics3.py#L192-L215
|
307
|
-
marker_modified = "<fake_token_around_image><global-img>" + ctx->media_marker + "<fake_token_around_image>";
|
308
|
-
string_replace_all(prompt_modified, ctx->media_marker, marker_modified);
|
309
|
-
|
310
|
-
} else if (proj_type == PROJECTOR_TYPE_PIXTRAL) {
|
311
|
-
// https://github.com/huggingface/transformers/blob/1cd110c6cb6a6237614130c470e9a902dbc1a4bd/docs/source/en/model_doc/pixtral.md
|
312
|
-
marker_modified = ctx->media_marker + "[IMG_END]";
|
313
|
-
string_replace_all(prompt_modified, ctx->media_marker, marker_modified);
|
314
|
-
|
315
|
-
} else if (proj_type == PROJECTOR_TYPE_QWEN2VL || proj_type == PROJECTOR_TYPE_QWEN25VL) {
|
316
|
-
// <|vision_start|> ... (image embeddings) ... <|vision_end|>
|
317
|
-
marker_modified = "<|vision_start|>" + ctx->media_marker + "<|vision_end|>";
|
318
|
-
string_replace_all(prompt_modified, ctx->media_marker, marker_modified);
|
319
|
-
|
320
|
-
} else if (proj_type == PROJECTOR_TYPE_LLAMA4) {
|
321
|
-
// (more details in mtmd_context constructor)
|
322
|
-
marker_modified = "<|image_start|>" + ctx->media_marker + "<|image_end|>";
|
323
|
-
string_replace_all(prompt_modified, ctx->media_marker, marker_modified);
|
324
|
-
|
325
|
-
} else if (proj_type == PROJECTOR_TYPE_INTERNVL) {
|
326
|
-
// <img> ... (image embeddings) ... </img>
|
327
|
-
marker_modified = "<img>" + ctx->media_marker + "</img>";
|
328
|
-
string_replace_all(prompt_modified, ctx->media_marker, marker_modified);
|
329
|
-
|
330
|
-
}
|
331
|
-
|
332
|
-
// llava-1.5, llava-1.6, Yi-VL, Yi-34B, granite: don't need to add prefix and suffix
|
333
|
-
// for glm-edge, BOI and EOI token's embeddings are not present in the text model
|
334
|
-
|
335
|
-
std::vector<std::string> parts = string_split_str(prompt_modified, ctx->media_marker);
|
336
|
-
output->entries.clear();
|
337
|
-
output->entries.reserve(parts.size());
|
338
|
-
|
339
|
-
size_t i_bm = 0;
|
340
|
-
|
341
|
-
// utility for adding raw tokens
|
342
|
-
auto add_text_chunk = [&output](std::vector<llama_token> && tokens) {
|
343
|
-
mtmd_input_chunk chunk{
|
344
|
-
MTMD_INPUT_CHUNK_TYPE_TEXT,
|
345
|
-
std::move(tokens),
|
346
|
-
nullptr, // image tokens
|
347
|
-
nullptr, // audio tokens
|
348
|
-
};
|
349
|
-
output->entries.emplace_back(std::move(chunk));
|
350
|
-
};
|
376
|
+
size_t n_bitmaps) : ctx(ctx), bitmaps(bitmaps, bitmaps + n_bitmaps) {
|
377
|
+
add_special = text->add_special;
|
378
|
+
parse_special = text->parse_special;
|
379
|
+
input_text = text->text;
|
380
|
+
vocab = llama_model_get_vocab(ctx->text_model);
|
381
|
+
|
382
|
+
// for compatibility, we convert image marker to media marker
|
383
|
+
string_replace_all(input_text, MTMD_DEFAULT_IMAGE_MARKER, ctx->media_marker);
|
384
|
+
}
|
351
385
|
|
352
|
-
|
353
|
-
|
354
|
-
std::vector<
|
386
|
+
int32_t tokenize(mtmd_input_chunks * output) {
|
387
|
+
cur.entries.clear();
|
388
|
+
std::vector<std::string> parts = split_text(input_text, ctx->media_marker);
|
389
|
+
size_t i_bm = 0; // index of the current bitmap
|
390
|
+
for (auto & part : parts) {
|
391
|
+
if (part == ctx->media_marker) {
|
392
|
+
// this is a marker, we should add the next bitmap
|
393
|
+
if (i_bm >= bitmaps.size()) {
|
394
|
+
LOG_ERR("%s: error: number of bitmaps (%zu) does not match number of markers (%zu)\n",
|
395
|
+
__func__, bitmaps.size(), parts.size() - 1);
|
396
|
+
return 1;
|
397
|
+
}
|
398
|
+
const mtmd_bitmap * bitmap = bitmaps[i_bm++];
|
399
|
+
int32_t res = add_media(bitmap);
|
400
|
+
if (res != 0) {
|
401
|
+
return res;
|
402
|
+
}
|
403
|
+
} else {
|
404
|
+
// this is a text part, we should add it as text
|
405
|
+
add_text(part, parse_special);
|
406
|
+
}
|
407
|
+
}
|
355
408
|
|
356
|
-
|
357
|
-
|
358
|
-
|
359
|
-
|
360
|
-
|
361
|
-
|
409
|
+
if (add_special && llama_vocab_get_add_bos(vocab)) {
|
410
|
+
// if first chunk is text, we add BOS token to first text chunk
|
411
|
+
// otherwise, create a new text chunk with BOS token
|
412
|
+
if (!cur.entries.empty() && cur.entries[0].type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
|
413
|
+
// add BOS token to the beginning of first text chunk
|
414
|
+
cur.entries[0].tokens_text.insert(cur.entries[0].tokens_text.begin(), llama_vocab_bos(vocab));
|
415
|
+
} else {
|
416
|
+
// create a new text chunk with BOS token at the beginning
|
417
|
+
mtmd_input_chunk bos_chunk{
|
418
|
+
MTMD_INPUT_CHUNK_TYPE_TEXT,
|
419
|
+
{llama_vocab_bos(vocab)},
|
420
|
+
nullptr, // image tokens
|
421
|
+
nullptr, // audio tokens
|
422
|
+
};
|
423
|
+
cur.entries.insert(cur.entries.begin(), std::move(bos_chunk));
|
424
|
+
}
|
425
|
+
}
|
362
426
|
|
363
|
-
|
364
|
-
|
365
|
-
|
366
|
-
std::move(image_tokens),
|
367
|
-
nullptr, // audio tokens
|
368
|
-
};
|
369
|
-
chunks.emplace_back(std::move(chunk));
|
427
|
+
if (add_special && llama_vocab_get_add_eos(vocab)) {
|
428
|
+
// if last chunk is text, we add EOS token to it
|
429
|
+
add_text({llama_vocab_eos(vocab)});
|
370
430
|
}
|
371
431
|
|
372
|
-
|
373
|
-
|
432
|
+
if (i_bm != bitmaps.size()) {
|
433
|
+
LOG_ERR("%s: error: number of bitmaps (%zu) does not match number of markers (%zu)\n",
|
434
|
+
__func__, bitmaps.size(), parts.size() - 1);
|
435
|
+
return 1;
|
436
|
+
}
|
437
|
+
|
438
|
+
*output = std::move(cur);
|
439
|
+
|
440
|
+
return 0;
|
441
|
+
}
|
442
|
+
|
443
|
+
void add_text(const std::string & txt, bool parse_special) {
|
444
|
+
LOG_DBG("%s: %s\n", __func__, txt.c_str());
|
445
|
+
auto tokens = mtmd_tokenize_text_internal(vocab, txt, /* add_special */ false, parse_special);
|
446
|
+
add_text(tokens);
|
447
|
+
}
|
374
448
|
|
375
|
-
|
376
|
-
// printf("tokenizing part: %s\n", part.c_str());
|
377
|
-
bool add_bos = &parts.front() == ∂
|
378
|
-
auto tokens = mtmd_tokenize_text_internal(vocab, part, text->add_special && add_bos, text->parse_special);
|
449
|
+
void add_text(const std::vector<llama_token> & tokens) {
|
379
450
|
if (tokens.empty()) {
|
380
|
-
|
451
|
+
return;
|
381
452
|
}
|
382
|
-
|
383
|
-
|
384
|
-
|
385
|
-
|
386
|
-
|
387
|
-
|
388
|
-
|
389
|
-
|
390
|
-
|
391
|
-
|
392
|
-
|
393
|
-
|
453
|
+
// if last entry is also a text chunk, add tokens to it instead of creating new chunk
|
454
|
+
if (!cur.entries.empty() && cur.entries.back().type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
|
455
|
+
cur.entries.back().tokens_text.insert(
|
456
|
+
cur.entries.back().tokens_text.end(),
|
457
|
+
tokens.begin(),
|
458
|
+
tokens.end());
|
459
|
+
} else {
|
460
|
+
mtmd_input_chunk chunk{
|
461
|
+
MTMD_INPUT_CHUNK_TYPE_TEXT,
|
462
|
+
tokens,
|
463
|
+
nullptr, // image tokens
|
464
|
+
nullptr, // audio tokens
|
465
|
+
};
|
466
|
+
cur.entries.emplace_back(std::move(chunk));
|
394
467
|
}
|
468
|
+
}
|
395
469
|
|
396
|
-
|
470
|
+
int32_t add_media(const mtmd_bitmap * bitmap) {
|
471
|
+
if (!bitmap->is_audio) {
|
397
472
|
// handle image
|
398
473
|
|
399
|
-
if (
|
400
|
-
LOG_ERR("%s: error: not enough images for %d parts\n", __func__, (int)parts.size());
|
401
|
-
return 1;
|
402
|
-
}
|
403
|
-
|
404
|
-
if (!ctx->has_vision) {
|
474
|
+
if (!ctx->ctx_v) {
|
405
475
|
LOG_ERR("%s: error: model does not support vision input\n", __func__);
|
406
476
|
return 2;
|
407
477
|
}
|
408
478
|
|
479
|
+
if (!ctx->img_beg.empty()) {
|
480
|
+
add_text(ctx->img_beg, true); // add image begin token
|
481
|
+
}
|
482
|
+
|
409
483
|
// convert mtmd_bitmap to clip_image_u8
|
410
484
|
clip_image_u8_ptr img_u8(clip_image_u8_init());
|
411
|
-
img_u8->nx =
|
412
|
-
img_u8->ny =
|
413
|
-
img_u8->buf.resize(
|
414
|
-
std::memcpy(img_u8->buf.data(),
|
485
|
+
img_u8->nx = bitmap->nx;
|
486
|
+
img_u8->ny = bitmap->ny;
|
487
|
+
img_u8->buf.resize(bitmap->data.size());
|
488
|
+
std::memcpy(img_u8->buf.data(), bitmap->data.data(), img_u8->nx * img_u8->ny * 3);
|
415
489
|
|
416
490
|
// preprocess image
|
417
491
|
clip_image_f32_batch batch_f32;
|
418
|
-
bool ok = clip_image_preprocess(ctx->
|
492
|
+
bool ok = clip_image_preprocess(ctx->ctx_v, img_u8.get(), &batch_f32);
|
419
493
|
if (!ok) {
|
420
494
|
LOG_ERR("Unable to preprocess image\n");
|
421
495
|
return 2;
|
@@ -427,8 +501,11 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
|
|
427
501
|
|| ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_6
|
428
502
|
|| ctx->slice_tmpl == MTMD_SLICE_TMPL_LLAMA4
|
429
503
|
) {
|
504
|
+
const int n_col = batch_f32.grid_x;
|
505
|
+
const int n_row = batch_f32.grid_y;
|
430
506
|
// split batch into chunks of single images
|
431
|
-
|
507
|
+
// NOTE: batch_f32 will be invalidated after this call
|
508
|
+
auto chunks = split_batch_to_chunk(std::move(batch_f32), bitmap->id);
|
432
509
|
LM_GGML_ASSERT(chunks.size() > 0);
|
433
510
|
|
434
511
|
auto ov_chunk = std::move(chunks.front());
|
@@ -437,66 +514,65 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
|
|
437
514
|
// add overview image (first)
|
438
515
|
if (ctx->ov_img_first) {
|
439
516
|
if (ctx->tok_ov_img_start != LLAMA_TOKEN_NULL) {
|
440
|
-
|
517
|
+
add_text({ctx->tok_ov_img_start});
|
441
518
|
}
|
442
|
-
|
519
|
+
cur.entries.emplace_back(std::move(ov_chunk));
|
443
520
|
if (ctx->tok_ov_img_end != LLAMA_TOKEN_NULL) {
|
444
|
-
|
521
|
+
add_text({ctx->tok_ov_img_end});
|
445
522
|
}
|
446
523
|
}
|
447
524
|
|
448
525
|
// add slices (or tiles)
|
449
526
|
if (!chunks.empty()) {
|
450
|
-
|
451
|
-
const int n_row = batch_f32.grid_y;
|
527
|
+
LM_GGML_ASSERT((int)chunks.size() == n_row * n_col);
|
452
528
|
if (ctx->tok_slices_start != LLAMA_TOKEN_NULL) {
|
453
|
-
|
529
|
+
add_text({ctx->tok_slices_start});
|
454
530
|
}
|
455
531
|
for (int y = 0; y < n_row; y++) {
|
456
532
|
for (int x = 0; x < n_col; x++) {
|
457
533
|
const bool is_last_in_row = (x == n_col - 1);
|
458
534
|
if (ctx->tok_sli_img_start != LLAMA_TOKEN_NULL) {
|
459
|
-
|
535
|
+
add_text({ctx->tok_sli_img_start});
|
460
536
|
}
|
461
|
-
|
537
|
+
cur.entries.emplace_back(std::move(chunks[y * n_col + x]));
|
462
538
|
if (ctx->tok_sli_img_end != LLAMA_TOKEN_NULL) {
|
463
|
-
|
539
|
+
add_text({ctx->tok_sli_img_end});
|
464
540
|
}
|
465
541
|
if (!is_last_in_row && ctx->tok_sli_img_mid != LLAMA_TOKEN_NULL) {
|
466
|
-
|
542
|
+
add_text({ctx->tok_sli_img_mid});
|
467
543
|
}
|
468
544
|
}
|
469
545
|
if ((y != n_row - 1 || ctx->tok_row_end_trail) && ctx->tok_row_end != LLAMA_TOKEN_NULL) {
|
470
|
-
|
546
|
+
add_text({ctx->tok_row_end});
|
471
547
|
}
|
472
548
|
}
|
473
549
|
if (ctx->tok_slices_end != LLAMA_TOKEN_NULL) {
|
474
|
-
|
550
|
+
add_text({ctx->tok_slices_end});
|
475
551
|
}
|
476
552
|
}
|
477
553
|
|
478
554
|
// add overview image (last)
|
479
555
|
if (!ctx->ov_img_first) {
|
480
556
|
if (ctx->tok_ov_img_start != LLAMA_TOKEN_NULL) {
|
481
|
-
|
557
|
+
add_text({ctx->tok_ov_img_start});
|
482
558
|
}
|
483
|
-
|
559
|
+
cur.entries.emplace_back(std::move(ov_chunk));
|
484
560
|
if (ctx->tok_ov_img_end != LLAMA_TOKEN_NULL) {
|
485
|
-
|
561
|
+
add_text({ctx->tok_ov_img_end});
|
486
562
|
}
|
487
563
|
}
|
488
564
|
|
489
565
|
} else {
|
490
566
|
size_t n_tokens = 0;
|
491
567
|
for (const auto & entry : batch_f32.entries) {
|
492
|
-
n_tokens += clip_n_output_tokens(ctx->
|
568
|
+
n_tokens += clip_n_output_tokens(ctx->ctx_v, entry.get());
|
493
569
|
}
|
494
570
|
|
495
571
|
mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens);
|
496
572
|
if (ctx->use_mrope) {
|
497
573
|
// for Qwen2VL, we need this information for M-RoPE decoding positions
|
498
|
-
image_tokens->nx = clip_n_output_tokens_x(ctx->
|
499
|
-
image_tokens->ny = clip_n_output_tokens_y(ctx->
|
574
|
+
image_tokens->nx = clip_n_output_tokens_x(ctx->ctx_v, batch_f32.entries[0].get());
|
575
|
+
image_tokens->ny = clip_n_output_tokens_y(ctx->ctx_v, batch_f32.entries[0].get());
|
500
576
|
image_tokens->use_mrope_pos = true;
|
501
577
|
} else {
|
502
578
|
// other models, we only need the total number of tokens
|
@@ -504,7 +580,7 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
|
|
504
580
|
image_tokens->ny = 1;
|
505
581
|
}
|
506
582
|
image_tokens->batch_f32 = std::move(batch_f32);
|
507
|
-
image_tokens->id =
|
583
|
+
image_tokens->id = bitmap->id; // optional
|
508
584
|
|
509
585
|
LOG_DBG("image_tokens->nx = %d\n", image_tokens->nx);
|
510
586
|
LOG_DBG("image_tokens->ny = %d\n", image_tokens->ny);
|
@@ -516,35 +592,35 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
|
|
516
592
|
std::move(image_tokens),
|
517
593
|
nullptr, // audio tokens
|
518
594
|
};
|
519
|
-
|
595
|
+
cur.entries.emplace_back(std::move(chunk));
|
520
596
|
}
|
521
597
|
|
522
|
-
|
523
|
-
|
598
|
+
if (!ctx->img_end.empty()) {
|
599
|
+
add_text(ctx->img_end, true); // add image end token
|
600
|
+
}
|
524
601
|
|
525
602
|
} else {
|
526
603
|
// handle audio
|
527
604
|
|
528
|
-
if (
|
529
|
-
LOG_ERR("%s: error: not enough images for %d parts\n", __func__, (int)parts.size());
|
530
|
-
return 1;
|
531
|
-
}
|
532
|
-
|
533
|
-
if (!ctx->has_audio) {
|
605
|
+
if (!ctx->ctx_a) {
|
534
606
|
LOG_ERR("%s: error: model does not support audio input\n", __func__);
|
535
607
|
return 2;
|
536
608
|
}
|
537
609
|
|
538
|
-
if (
|
610
|
+
if (bitmap->data.size() == 0) {
|
539
611
|
LOG_ERR("%s: error: empty audio data\n", __func__);
|
540
612
|
return 2;
|
541
613
|
}
|
542
614
|
|
615
|
+
if (!ctx->aud_beg.empty()) {
|
616
|
+
add_text(ctx->aud_beg, true); // add audio begin token
|
617
|
+
}
|
618
|
+
|
543
619
|
// preprocess audio
|
544
620
|
LM_GGML_ASSERT(ctx->w_filters.n_mel); // make sure we have filter preloaded
|
545
621
|
std::vector<whisper_preprocessor::whisper_mel> mel_spec_chunks;
|
546
|
-
const float * samples = (const float *)
|
547
|
-
size_t n_samples =
|
622
|
+
const float * samples = (const float *)bitmap->data.data();
|
623
|
+
size_t n_samples = bitmap->data.size() / sizeof(float);
|
548
624
|
bool ok = whisper_preprocessor::preprocess_audio(samples, n_samples, ctx->w_filters, mel_spec_chunks);
|
549
625
|
if (!ok) {
|
550
626
|
LOG_ERR("Unable to preprocess audio\n");
|
@@ -558,7 +634,7 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
|
|
558
634
|
mel_f32->nx = mel_spec.n_len;
|
559
635
|
mel_f32->ny = mel_spec.n_mel;
|
560
636
|
mel_f32->buf = std::move(mel_spec.data);
|
561
|
-
size_t n_tokens = clip_n_output_tokens(ctx->
|
637
|
+
size_t n_tokens = clip_n_output_tokens(ctx->ctx_a, mel_f32.get());
|
562
638
|
|
563
639
|
clip_image_f32_batch batch_f32;
|
564
640
|
batch_f32.is_audio = true;
|
@@ -567,7 +643,7 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
|
|
567
643
|
mtmd_audio_tokens_ptr audio_tokens(new mtmd_audio_tokens);
|
568
644
|
audio_tokens->n_tokens = n_tokens;
|
569
645
|
audio_tokens->batch_f32 = std::move(batch_f32);
|
570
|
-
audio_tokens->id =
|
646
|
+
audio_tokens->id = bitmap->id; // optional
|
571
647
|
|
572
648
|
LOG_DBG("audio_tokens->n_tokens = %d\n", audio_tokens->n_tokens);
|
573
649
|
|
@@ -577,15 +653,88 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
|
|
577
653
|
nullptr, // image tokens
|
578
654
|
std::move(audio_tokens),
|
579
655
|
};
|
580
|
-
|
656
|
+
cur.entries.emplace_back(std::move(chunk));
|
581
657
|
}
|
582
658
|
|
583
|
-
|
584
|
-
|
659
|
+
if (!ctx->aud_end.empty()) {
|
660
|
+
add_text(ctx->aud_end, true); // add audio end token
|
661
|
+
}
|
585
662
|
}
|
663
|
+
|
664
|
+
return 0;
|
586
665
|
}
|
587
666
|
|
588
|
-
|
667
|
+
std::vector<mtmd_input_chunk> split_batch_to_chunk(clip_image_f32_batch && batch_f32, const std::string & id) {
|
668
|
+
std::vector<mtmd_input_chunk> chunks;
|
669
|
+
|
670
|
+
for (auto & entry : batch_f32.entries) {
|
671
|
+
mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens);
|
672
|
+
image_tokens->nx = clip_n_output_tokens(ctx->ctx_v, entry.get());
|
673
|
+
image_tokens->ny = 1;
|
674
|
+
image_tokens->batch_f32.entries.push_back(std::move(entry));
|
675
|
+
image_tokens->id = id;
|
676
|
+
|
677
|
+
mtmd_input_chunk chunk{
|
678
|
+
MTMD_INPUT_CHUNK_TYPE_IMAGE,
|
679
|
+
{}, // text tokens
|
680
|
+
std::move(image_tokens),
|
681
|
+
nullptr, // audio tokens
|
682
|
+
};
|
683
|
+
chunks.emplace_back(std::move(chunk));
|
684
|
+
}
|
685
|
+
|
686
|
+
return chunks;
|
687
|
+
}
|
688
|
+
|
689
|
+
// for example: "a <__media__> b <__media__> c" --> "a", "<__media__>", "b", "<__media__>", "c"
|
690
|
+
static std::vector<std::string> split_text(const std::string & input, const std::string & delimiter) {
|
691
|
+
std::vector<std::string> result;
|
692
|
+
if (input.empty()) {
|
693
|
+
return result;
|
694
|
+
}
|
695
|
+
size_t start = 0;
|
696
|
+
size_t pos = 0;
|
697
|
+
while ((pos = input.find(delimiter, start)) != std::string::npos) {
|
698
|
+
if (pos > start) {
|
699
|
+
result.push_back(input.substr(start, pos - start));
|
700
|
+
}
|
701
|
+
result.push_back(delimiter);
|
702
|
+
start = pos + delimiter.length();
|
703
|
+
}
|
704
|
+
if (start < input.length()) {
|
705
|
+
result.push_back(input.substr(start));
|
706
|
+
}
|
707
|
+
return result;
|
708
|
+
}
|
709
|
+
|
710
|
+
// copied from common_tokenize
|
711
|
+
static std::vector<llama_token> mtmd_tokenize_text_internal(
|
712
|
+
const struct llama_vocab * vocab,
|
713
|
+
const std::string & text,
|
714
|
+
bool add_special,
|
715
|
+
bool parse_special) {
|
716
|
+
// upper limit for the number of tokens
|
717
|
+
int n_tokens = text.length() + 2 * add_special;
|
718
|
+
std::vector<llama_token> result(n_tokens);
|
719
|
+
n_tokens = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
|
720
|
+
if (n_tokens < 0) {
|
721
|
+
result.resize(-n_tokens);
|
722
|
+
int check = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
|
723
|
+
LM_GGML_ASSERT(check == -n_tokens);
|
724
|
+
} else {
|
725
|
+
result.resize(n_tokens);
|
726
|
+
}
|
727
|
+
return result;
|
728
|
+
}
|
729
|
+
};
|
730
|
+
|
731
|
+
int32_t mtmd_tokenize(mtmd_context * ctx,
|
732
|
+
mtmd_input_chunks * output,
|
733
|
+
const mtmd_input_text * text,
|
734
|
+
const mtmd_bitmap ** bitmaps,
|
735
|
+
size_t n_bitmaps) {
|
736
|
+
mtmd_tokenizer tokenizer(ctx, text, bitmaps, n_bitmaps);
|
737
|
+
return tokenizer.tokenize(output);
|
589
738
|
}
|
590
739
|
|
591
740
|
int32_t mtmd_encode_chunk(mtmd_context * ctx, const mtmd_input_chunk * chunk) {
|
@@ -593,41 +742,54 @@ int32_t mtmd_encode_chunk(mtmd_context * ctx, const mtmd_input_chunk * chunk) {
|
|
593
742
|
LOG_WRN("mtmd_encode_chunk has no effect for text chunks\n");
|
594
743
|
return 0;
|
595
744
|
} else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
|
745
|
+
if (!ctx->ctx_v) {
|
746
|
+
LOG_ERR("%s: model does not support vision input\n", __func__);
|
747
|
+
return 1;
|
748
|
+
}
|
596
749
|
return mtmd_encode(ctx, chunk->tokens_image.get());
|
597
750
|
} else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
|
598
|
-
|
751
|
+
if (!ctx->ctx_a) {
|
752
|
+
LOG_ERR("%s: model does not support audio input\n", __func__);
|
753
|
+
return 1;
|
754
|
+
}
|
755
|
+
int n_mmproj_embd = ctx->n_embd_text;
|
599
756
|
ctx->image_embd_v.resize(chunk->tokens_audio->n_tokens * n_mmproj_embd);
|
600
757
|
bool ok = clip_image_batch_encode(
|
601
|
-
ctx->
|
758
|
+
ctx->ctx_a,
|
602
759
|
ctx->n_threads,
|
603
760
|
&chunk->tokens_audio->batch_f32,
|
604
761
|
ctx->image_embd_v.data());
|
605
762
|
return ok ? 0 : 1;
|
606
763
|
}
|
607
764
|
|
608
|
-
LOG_ERR("
|
765
|
+
LOG_ERR("%s: unknown chunk type %d\n", __func__, (int)chunk->type);
|
609
766
|
return 1;
|
610
767
|
}
|
611
768
|
|
612
769
|
int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens) {
|
613
|
-
|
770
|
+
clip_ctx * ctx_clip = ctx->ctx_v;
|
771
|
+
if (!ctx_clip) {
|
772
|
+
LOG_ERR("%s: this API does not support non-vision input, please use mtmd_encode_chunk instead\n", __func__);
|
773
|
+
return 1;
|
774
|
+
}
|
775
|
+
int n_mmproj_embd = clip_n_mmproj_embd(ctx_clip);
|
614
776
|
ctx->image_embd_v.resize(image_tokens->n_tokens() * n_mmproj_embd);
|
615
777
|
bool ok = false;
|
616
778
|
|
617
|
-
if (clip_is_llava(
|
779
|
+
if (clip_is_llava(ctx_clip) || clip_is_minicpmv(ctx_clip) || clip_is_glm(ctx_clip)) {
|
618
780
|
// TODO @ngxson : llava does not support batched encoding ; this should be fixed inside clip_image_batch_encode()
|
619
781
|
const auto & entries = image_tokens->batch_f32.entries;
|
620
782
|
for (size_t i = 0; i < entries.size(); i++) {
|
621
|
-
int n_tokens_per_image = clip_n_output_tokens(
|
783
|
+
int n_tokens_per_image = clip_n_output_tokens(ctx_clip, entries[i].get());
|
622
784
|
ok = clip_image_encode(
|
623
|
-
|
785
|
+
ctx_clip,
|
624
786
|
ctx->n_threads,
|
625
787
|
entries[i].get(),
|
626
788
|
ctx->image_embd_v.data() + i*n_mmproj_embd*n_tokens_per_image);
|
627
789
|
}
|
628
790
|
} else {
|
629
791
|
ok = clip_image_batch_encode(
|
630
|
-
|
792
|
+
ctx_clip,
|
631
793
|
ctx->n_threads,
|
632
794
|
&image_tokens->batch_f32,
|
633
795
|
ctx->image_embd_v.data());
|
@@ -641,8 +803,7 @@ float * mtmd_get_output_embd(mtmd_context * ctx) {
|
|
641
803
|
}
|
642
804
|
|
643
805
|
bool mtmd_decode_use_non_causal(mtmd_context * ctx) {
|
644
|
-
|
645
|
-
if (proj_type == PROJECTOR_TYPE_GEMMA3) {
|
806
|
+
if (ctx->ctx_v && clip_get_projector_type(ctx->ctx_v) == PROJECTOR_TYPE_GEMMA3) {
|
646
807
|
return true;
|
647
808
|
}
|
648
809
|
return false;
|
@@ -653,60 +814,19 @@ bool mtmd_decode_use_mrope(mtmd_context * ctx) {
|
|
653
814
|
}
|
654
815
|
|
655
816
|
bool mtmd_support_vision(mtmd_context * ctx) {
|
656
|
-
return ctx->
|
817
|
+
return ctx->ctx_v != nullptr;
|
657
818
|
}
|
658
819
|
|
659
820
|
bool mtmd_support_audio(mtmd_context * ctx) {
|
660
|
-
return ctx->
|
661
|
-
}
|
662
|
-
|
663
|
-
// these 2 helpers below use internal clip_image_u8_ptr,
|
664
|
-
// so unfortunately they cannot moved to mtmd-helper.h
|
665
|
-
// however, in theory, user can decode image file to bitmap using
|
666
|
-
// whichever library they want, and then use mtmd_bitmap_init() to create bitmap
|
667
|
-
|
668
|
-
mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(const unsigned char * buf, size_t len) {
|
669
|
-
if (audio_helpers::is_audio_file((const char *)buf, len)) {
|
670
|
-
std::vector<float> pcmf32;
|
671
|
-
if (!audio_helpers::decode_audio_from_buf(buf, len, COMMON_SAMPLE_RATE, pcmf32)) {
|
672
|
-
LOG_ERR("Unable to read WAV audio file from buffer\n");
|
673
|
-
return nullptr;
|
674
|
-
}
|
675
|
-
return mtmd_bitmap_init_from_audio(pcmf32.size(), pcmf32.data());
|
676
|
-
}
|
677
|
-
|
678
|
-
clip_image_u8_ptr img_u8(clip_image_u8_init());
|
679
|
-
bool ok = clip_image_load_from_bytes(buf, len, img_u8.get());
|
680
|
-
if (!ok) {
|
681
|
-
LOG_ERR("Unable to load image from buffer\n");
|
682
|
-
return nullptr;
|
683
|
-
}
|
684
|
-
uint32_t nx, ny;
|
685
|
-
unsigned char * data = clip_image_u8_get_data(img_u8.get(), &nx, &ny);
|
686
|
-
return mtmd_bitmap_init(nx, ny, data);
|
821
|
+
return ctx->ctx_a != nullptr;
|
687
822
|
}
|
688
823
|
|
689
|
-
|
690
|
-
|
691
|
-
|
692
|
-
if (!f) {
|
693
|
-
LOG_ERR("Unable to open file %s: %s\n", fname, strerror(errno));
|
694
|
-
return nullptr;
|
824
|
+
int mtmd_get_audio_bitrate(mtmd_context * ctx) {
|
825
|
+
if (!ctx->ctx_a) {
|
826
|
+
return -1;
|
695
827
|
}
|
696
|
-
|
697
|
-
|
698
|
-
long file_size = ftell(f);
|
699
|
-
fseek(f, 0, SEEK_SET);
|
700
|
-
buf.resize(file_size);
|
701
|
-
|
702
|
-
size_t n_read = fread(buf.data(), 1, file_size, f);
|
703
|
-
fclose(f);
|
704
|
-
if (n_read != (size_t)file_size) {
|
705
|
-
LOG_ERR("Failed to read entire file %s", fname);
|
706
|
-
return nullptr;
|
707
|
-
}
|
708
|
-
|
709
|
-
return mtmd_helper_bitmap_init_from_buf(buf.data(), buf.size());
|
828
|
+
// for now, we assume that all audio models have the same bitrate
|
829
|
+
return 16000; // 16kHz
|
710
830
|
}
|
711
831
|
|
712
832
|
//
|