cui-llama.rn 1.7.3 → 1.7.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +217 -17
- package/android/src/main/CMakeLists.txt +34 -15
- package/android/src/main/java/com/rnllama/LlamaContext.java +94 -8
- package/android/src/main/java/com/rnllama/RNLlama.java +247 -0
- package/android/src/main/jni.cpp +213 -14
- package/android/src/main/jniLibs/arm64-v8a/librnllama.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod_i8mm.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_i8mm.so +0 -0
- package/android/src/main/jniLibs/x86_64/librnllama.so +0 -0
- package/android/src/main/jniLibs/x86_64/librnllama_x86_64.so +0 -0
- package/android/src/newarch/java/com/rnllama/RNLlamaModule.java +35 -0
- package/android/src/oldarch/java/com/rnllama/RNLlamaModule.java +34 -0
- package/cpp/README.md +1 -1
- package/cpp/chat-parser.cpp +385 -0
- package/cpp/chat-parser.h +120 -0
- package/cpp/chat.cpp +726 -596
- package/cpp/chat.h +71 -6
- package/cpp/common.cpp +56 -38
- package/cpp/common.h +9 -3
- package/cpp/ggml-backend-reg.cpp +5 -0
- package/cpp/ggml-backend.cpp +10 -2
- package/cpp/ggml-common.h +4 -0
- package/cpp/ggml-cpu/amx/amx.cpp +1 -1
- package/cpp/ggml-cpu/amx/mmq.cpp +11 -10
- package/cpp/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
- package/cpp/ggml-cpu/arch/arm/quants.c +4114 -0
- package/cpp/ggml-cpu/arch/arm/repack.cpp +2163 -0
- package/cpp/ggml-cpu/arch/x86/cpu-feats.cpp +327 -0
- package/cpp/ggml-cpu/arch/x86/quants.c +4311 -0
- package/cpp/ggml-cpu/{ggml-cpu-aarch64.cpp → arch/x86/repack.cpp} +79 -3225
- package/cpp/ggml-cpu/arch-fallback.h +184 -0
- package/cpp/ggml-cpu/common.h +4 -3
- package/cpp/ggml-cpu/ggml-cpu-impl.h +21 -16
- package/cpp/ggml-cpu/ggml-cpu.c +123 -104
- package/cpp/ggml-cpu/ggml-cpu.cpp +11 -8
- package/cpp/ggml-cpu/ops.cpp +330 -148
- package/cpp/ggml-cpu/ops.h +1 -0
- package/cpp/ggml-cpu/quants.c +1158 -0
- package/cpp/ggml-cpu/{ggml-cpu-quants.h → quants.h} +26 -0
- package/cpp/ggml-cpu/repack.cpp +1571 -0
- package/cpp/ggml-cpu/repack.h +98 -0
- package/cpp/ggml-cpu/simd-mappings.h +330 -38
- package/cpp/ggml-cpu/{ggml-cpu-traits.cpp → traits.cpp} +1 -1
- package/cpp/ggml-cpu/vec.cpp +87 -18
- package/cpp/ggml-cpu/vec.h +249 -94
- package/cpp/ggml-cpu.h +1 -0
- package/cpp/ggml-impl.h +63 -183
- package/cpp/ggml-llama-sim.metallib +0 -0
- package/cpp/ggml-llama.metallib +0 -0
- package/cpp/ggml-metal.m +152 -45
- package/cpp/ggml-quants.c +0 -2
- package/cpp/ggml.c +61 -21
- package/cpp/ggml.h +22 -3
- package/cpp/gguf.cpp +24 -3
- package/cpp/json-partial.cpp +256 -0
- package/cpp/json-partial.h +38 -0
- package/cpp/json-schema-to-grammar.cpp +5 -47
- package/cpp/json-schema-to-grammar.h +4 -4
- package/cpp/llama-arch.cpp +153 -3
- package/cpp/llama-arch.h +27 -1
- package/cpp/llama-batch.cpp +741 -272
- package/cpp/llama-batch.h +112 -54
- package/cpp/llama-chat.cpp +30 -8
- package/cpp/llama-chat.h +1 -0
- package/cpp/llama-context.cpp +524 -339
- package/cpp/llama-context.h +38 -17
- package/cpp/llama-cparams.cpp +4 -0
- package/cpp/llama-cparams.h +2 -0
- package/cpp/llama-grammar.cpp +12 -2
- package/cpp/llama-graph.cpp +431 -356
- package/cpp/llama-graph.h +126 -58
- package/cpp/llama-hparams.cpp +10 -2
- package/cpp/llama-hparams.h +19 -2
- package/cpp/llama-kv-cache-unified-iswa.cpp +279 -0
- package/cpp/llama-kv-cache-unified-iswa.h +128 -0
- package/cpp/llama-kv-cache-unified.cpp +1841 -0
- package/cpp/llama-kv-cache-unified.h +303 -0
- package/cpp/llama-kv-cells.h +439 -0
- package/cpp/llama-memory-hybrid.cpp +246 -0
- package/cpp/llama-memory-hybrid.h +138 -0
- package/cpp/llama-memory-recurrent.cpp +1112 -0
- package/cpp/llama-memory-recurrent.h +183 -0
- package/cpp/llama-memory.cpp +41 -0
- package/cpp/llama-memory.h +86 -5
- package/cpp/llama-mmap.cpp +1 -1
- package/cpp/llama-model-loader.cpp +42 -17
- package/cpp/llama-model-saver.cpp +1 -0
- package/cpp/llama-model.cpp +1639 -513
- package/cpp/llama-model.h +26 -0
- package/cpp/llama-sampling.cpp +2 -2
- package/cpp/llama-vocab.cpp +65 -28
- package/cpp/llama-vocab.h +1 -0
- package/cpp/llama.cpp +11 -7
- package/cpp/llama.h +150 -42
- package/cpp/minja/chat-template.hpp +1 -1
- package/cpp/minja/minja.hpp +1 -1
- package/cpp/{json.hpp → nlohmann/json.hpp} +3027 -2267
- package/cpp/nlohmann/json_fwd.hpp +187 -0
- package/cpp/regex-partial.cpp +204 -0
- package/cpp/regex-partial.h +56 -0
- package/cpp/rn-llama.cpp +646 -35
- package/cpp/rn-llama.h +32 -1
- package/cpp/rn-tts.h +39 -0
- package/cpp/sampling.cpp +7 -8
- package/cpp/tools/mtmd/clip-impl.h +5 -0
- package/cpp/tools/mtmd/clip.cpp +572 -436
- package/cpp/tools/mtmd/clip.h +14 -4
- package/cpp/tools/mtmd/mtmd-audio.cpp +0 -86
- package/cpp/tools/mtmd/mtmd-audio.h +2 -17
- package/cpp/tools/mtmd/mtmd-helper.cpp +175 -12
- package/cpp/tools/mtmd/mtmd-helper.h +91 -0
- package/cpp/tools/mtmd/mtmd.cpp +368 -248
- package/cpp/tools/mtmd/mtmd.h +6 -70
- package/cpp/unicode.cpp +5 -0
- package/ios/CMakeLists.txt +26 -6
- package/ios/RNLlama.h +1 -1
- package/ios/RNLlama.mm +153 -3
- package/ios/RNLlamaContext.h +9 -1
- package/ios/RNLlamaContext.mm +112 -9
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/chat-parser.h +120 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/chat.h +71 -6
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/common.h +9 -3
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-common.h +4 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu.h +1 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-impl.h +63 -183
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml.h +22 -3
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/json-partial.h +38 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/json-schema-to-grammar.h +4 -4
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-arch.h +27 -1
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-batch.h +112 -54
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-chat.h +1 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-context.h +38 -17
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-cparams.h +2 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-graph.h +126 -58
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-hparams.h +19 -2
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cache-unified-iswa.h +128 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cache-unified.h +303 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cells.h +439 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-memory-hybrid.h +138 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-memory-recurrent.h +183 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-memory.h +86 -5
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-model.h +26 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-vocab.h +1 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama.h +150 -42
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/minja/chat-template.hpp +1 -1
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/minja/minja.hpp +1 -1
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/{json.hpp → nlohmann/json.hpp} +3027 -2267
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/nlohmann/json_fwd.hpp +187 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/regex-partial.h +56 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/rn-llama.h +32 -1
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/rn-tts.h +39 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/ggml-llama.metallib +0 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/rnllama +0 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/chat-parser.h +120 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/chat.h +71 -6
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/common.h +9 -3
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-common.h +4 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu.h +1 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-impl.h +63 -183
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml.h +22 -3
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/json-partial.h +38 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/json-schema-to-grammar.h +4 -4
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-arch.h +27 -1
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-batch.h +112 -54
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-chat.h +1 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-context.h +38 -17
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-cparams.h +2 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-graph.h +126 -58
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-hparams.h +19 -2
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache-unified-iswa.h +128 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache-unified.h +303 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cells.h +439 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory-hybrid.h +138 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory-recurrent.h +183 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory.h +86 -5
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model.h +26 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-vocab.h +1 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama.h +150 -42
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/minja/chat-template.hpp +1 -1
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/minja/minja.hpp +1 -1
- package/ios/rnllama.xcframework/{tvos-arm64/rnllama.framework/Headers → ios-arm64_x86_64-simulator/rnllama.framework/Headers/nlohmann}/json.hpp +3027 -2267
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/nlohmann/json_fwd.hpp +187 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/regex-partial.h +56 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/rn-llama.h +32 -1
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/rn-tts.h +39 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/ggml-llama-sim.metallib +0 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/rnllama +0 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/chat-parser.h +120 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/chat.h +71 -6
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/common.h +9 -3
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-common.h +4 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu.h +1 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-impl.h +63 -183
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml.h +22 -3
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/json-partial.h +38 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/json-schema-to-grammar.h +4 -4
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-arch.h +27 -1
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-batch.h +112 -54
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-chat.h +1 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-context.h +38 -17
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-cparams.h +2 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-graph.h +126 -58
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-hparams.h +19 -2
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cache-unified-iswa.h +128 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cache-unified.h +303 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cells.h +439 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-memory-hybrid.h +138 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-memory-recurrent.h +183 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-memory.h +86 -5
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-model.h +26 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-vocab.h +1 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama.h +150 -42
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/minja/chat-template.hpp +1 -1
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/minja/minja.hpp +1 -1
- package/ios/rnllama.xcframework/{ios-arm64_x86_64-simulator/rnllama.framework/Headers → tvos-arm64/rnllama.framework/Headers/nlohmann}/json.hpp +3027 -2267
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/nlohmann/json_fwd.hpp +187 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/regex-partial.h +56 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/rn-llama.h +32 -1
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/rn-tts.h +39 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/ggml-llama.metallib +0 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/rnllama +0 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/chat-parser.h +120 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/chat.h +71 -6
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/common.h +9 -3
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-common.h +4 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu.h +1 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-impl.h +63 -183
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml.h +22 -3
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/json-partial.h +38 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/json-schema-to-grammar.h +4 -4
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-arch.h +27 -1
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-batch.h +112 -54
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-chat.h +1 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-context.h +38 -17
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-cparams.h +2 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-graph.h +126 -58
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-hparams.h +19 -2
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache-unified-iswa.h +128 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache-unified.h +303 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cells.h +439 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory-hybrid.h +138 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory-recurrent.h +183 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory.h +86 -5
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model.h +26 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-vocab.h +1 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama.h +150 -42
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/minja/chat-template.hpp +1 -1
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/minja/minja.hpp +1 -1
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/nlohmann/json.hpp +25526 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/nlohmann/json_fwd.hpp +187 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/regex-partial.h +56 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/rn-llama.h +32 -1
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/rn-tts.h +39 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/ggml-llama-sim.metallib +0 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/rnllama +0 -0
- package/jest/mock.js +24 -0
- package/package.json +1 -1
- package/src/NativeRNLlama.ts +46 -2
- package/src/index.ts +105 -1
- package/cpp/ggml-cpu/ggml-cpu-aarch64.h +0 -8
- package/cpp/ggml-cpu/ggml-cpu-quants.c +0 -13326
- package/cpp/ggml-cpu/sgemm.cpp +0 -3544
- package/cpp/ggml-cpu/sgemm.h +0 -14
- package/cpp/llama-kv-cache.cpp +0 -2827
- package/cpp/llama-kv-cache.h +0 -515
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cache.h +0 -515
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache.h +0 -515
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cache.h +0 -515
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/json.hpp +0 -24766
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache.h +0 -515
- /package/cpp/ggml-cpu/{ggml-cpu-traits.h → traits.h} +0 -0
- /package/cpp/tools/mtmd/{miniaudio.h → miniaudio/miniaudio.h} +0 -0
- /package/cpp/tools/mtmd/{stb_image.h → stb/stb_image.h} +0 -0
package/cpp/rn-llama.cpp
CHANGED
```diff
@@ -1,7 +1,9 @@
 #include "rn-llama.h"
+#include "rn-tts.h"
 
 // Include multimodal support
 #include "tools/mtmd/mtmd.h"
+#include "tools/mtmd/mtmd-helper.h"
 #include "tools/mtmd/clip.h"
 
 namespace rnllama {
@@ -23,38 +25,39 @@ static const std::string base64_chars =
     "abcdefghijklmnopqrstuvwxyz"
     "0123456789+/";
 
-
-
-
-
+static inline bool is_base64(uint8_t c) {
+    return (isalnum(c) || (c == '+') || (c == '/'));
+}
+
+using raw_buffer = std::vector<uint8_t>;
+
+static inline raw_buffer base64_decode(const std::string & encoded_string) {
     int i = 0;
     int j = 0;
     int in_ = 0;
-    unsigned char char_array_4[4], char_array_3[3];
 
-
-    if (isspace(encoded_string[in_])) {
-        in_++;
-        continue;
-    }
+    int in_len = encoded_string.size();
 
-
-
-    }
+    uint8_t char_array_4[4];
+    uint8_t char_array_3[3];
 
+    raw_buffer ret;
+
+    while (in_len-- && (encoded_string[in_] != '=') && is_base64(encoded_string[in_])) {
         char_array_4[i++] = encoded_string[in_]; in_++;
         if (i == 4) {
             for (i = 0; i < 4; i++) {
                 char_array_4[i] = base64_chars.find(char_array_4[i]);
             }
 
-            char_array_3[0] = (char_array_4[0] << 2) + ((char_array_4[1] & 0x30) >> 4);
+            char_array_3[0] = ((char_array_4[0] ) << 2) + ((char_array_4[1] & 0x30) >> 4);
             char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2);
-            char_array_3[2] = ((char_array_4[2] & 0x3) << 6) +
+            char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3];
 
-            for (i = 0; i < 3; i++) {
-
+            for (i = 0; (i < 3); i++) {
+                ret.push_back(char_array_3[i]);
             }
+
             i = 0;
         }
     }
@@ -68,16 +71,16 @@ static std::vector<uint8_t> base64_decode(const std::string &encoded_string) {
             char_array_4[j] = base64_chars.find(char_array_4[j]);
         }
 
-        char_array_3[0] = (char_array_4[0] << 2) + ((char_array_4[1] & 0x30) >> 4);
+        char_array_3[0] = ((char_array_4[0] ) << 2) + ((char_array_4[1] & 0x30) >> 4);
         char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2);
-        char_array_3[2] = ((char_array_4[2] & 0x3) << 6) +
+        char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3];
 
         for (j = 0; j < i - 1; j++) {
-
+            ret.push_back(char_array_3[j]);
         }
     }
 
-    return
+    return ret;
 }
 
 static const std::vector<lm_ggml_type> kv_cache_types = {
@@ -248,6 +251,7 @@ void llama_rn_context::rewind() {
     generated_text = "";
     generated_text.reserve(params.n_ctx);
     generated_token_probs.clear();
+    audio_tokens.clear();
     truncated = false;
     context_full = false;
     stopped_eos = false;
@@ -258,6 +262,8 @@ void llama_rn_context::rewind() {
     n_remain = 0;
     n_past = 0;
     params.sampling.n_prev = n_ctx;
+    next_token_uses_guide_token = true;
+    guide_tokens.clear();
 }
 
 bool llama_rn_context::initSampling() {
@@ -305,7 +311,8 @@ common_chat_params llama_rn_context::getFormattedChatWithJinja(
     const std::string &json_schema,
     const std::string &tools,
     const bool &parallel_tool_calls,
-    const std::string &tool_choice
+    const std::string &tool_choice,
+    const bool &enable_thinking
 ) const {
     common_chat_templates_inputs inputs;
     inputs.use_jinja = true;
@@ -321,7 +328,7 @@ common_chat_params llama_rn_context::getFormattedChatWithJinja(
     if (!json_schema.empty()) {
         inputs.json_schema = json::parse(json_schema);
     }
-    inputs.
+    inputs.enable_thinking = enable_thinking;
 
     // If chat_template is provided, create new one and use it (probably slow)
     if (!chat_template.empty()) {
@@ -419,7 +426,8 @@ void llama_rn_context::loadPrompt(const std::vector<std::string> &media_paths) {
     }
 
     // Manage KV cache
-
+    auto * kv = llama_get_memory(ctx);
+    llama_memory_seq_rm(kv, 0, n_past, -1);
 
     LOG_VERBOSE("prompt ingested, n_past: %d, cached: %s, to_eval: %s",
         n_past,
@@ -438,6 +446,10 @@ void llama_rn_context::loadPrompt(const std::vector<std::string> &media_paths) {
         n_past, embd.size(), num_prompt_tokens, has_media ? 1 : 0);
 }
 
+void llama_rn_context::setGuideTokens(const std::vector<llama_token> &tokens) {
+    guide_tokens = tokens;
+}
+
 void llama_rn_context::beginCompletion() {
     // number of tokens to keep when resetting context
     n_remain = params.n_predict;
@@ -469,8 +481,9 @@ completion_token_output llama_rn_context::nextToken()
         const int n_left = n_past - params.n_keep - 1;
         const int n_discard = n_left/2;
 
-
-
+        auto * kv = llama_get_memory(ctx);
+        llama_memory_seq_rm (kv, 0, params.n_keep + 1 , params.n_keep + n_discard + 1);
+        llama_memory_seq_add(kv, 0, params.n_keep + 1 + n_discard, n_past, -n_discard);
 
         for (size_t i = params.n_keep + 1 + n_discard; i < embd.size(); i++)
         {
@@ -528,7 +541,14 @@ completion_token_output llama_rn_context::nextToken()
         std::vector<llama_token_data> candidates;
         candidates.reserve(llama_vocab_n_tokens(vocab));
 
-
+        llama_token new_token_id = common_sampler_sample(ctx_sampling, ctx, -1);
+
+        if (next_token_uses_guide_token && !guide_tokens.empty() && !llama_vocab_is_control(vocab, new_token_id) && !llama_vocab_is_eog(vocab, new_token_id)) {
+            new_token_id = guide_tokens[0];
+            guide_tokens.erase(guide_tokens.begin());
+        }
+        next_token_uses_guide_token = (new_token_id == 198);
+        result.tok = new_token_id;
 
         llama_token_data_array cur_p = *common_sampler_get_candidates(ctx_sampling);
 
@@ -611,6 +631,13 @@ completion_token_output llama_rn_context::doCompletion()
     const std::string token_text = token_with_probs.tok == -1 ? "" : common_token_to_piece(ctx, token_with_probs.tok);
     generated_text += token_text;
 
+    if (isVocoderEnabled()) {
+        tts_type type = getTTSType();
+        if ((type == OUTETTS_V0_2 || type == OUTETTS_V0_3) && (token_with_probs.tok >= 151672 && token_with_probs.tok <= 155772)) {
+            audio_tokens.push_back(token_with_probs.tok);
+        }
+    }
+
     if (params.sampling.n_probs > 0)
     {
         generated_token_probs.push_back(token_with_probs);
@@ -687,6 +714,94 @@ std::vector<float> llama_rn_context::getEmbedding(common_params &embd_params)
     return out;
 }
 
+// Helper function to format rerank task: [BOS]query[EOS][SEP]doc[EOS]
+static std::vector<llama_token> format_rerank(const llama_vocab * vocab, const std::vector<llama_token> & query, const std::vector<llama_token> & doc) {
+    std::vector<llama_token> result;
+
+    // Get EOS token - use SEP token as fallback if EOS is not available
+    llama_token eos_token = llama_vocab_eos(vocab);
+    if (eos_token == LLAMA_TOKEN_NULL) {
+        eos_token = llama_vocab_sep(vocab);
+    }
+
+    result.reserve(doc.size() + query.size() + 4);
+    if (llama_vocab_get_add_bos(vocab)) {
+        result.push_back(llama_vocab_bos(vocab));
+    }
+    result.insert(result.end(), query.begin(), query.end());
+    if (llama_vocab_get_add_eos(vocab)) {
+        result.push_back(eos_token);
+    }
+    if (llama_vocab_get_add_sep(vocab)) {
+        result.push_back(llama_vocab_sep(vocab));
+    }
+    result.insert(result.end(), doc.begin(), doc.end());
+    if (llama_vocab_get_add_eos(vocab)) {
+        result.push_back(eos_token);
+    }
+
+    return result;
+}
+
+std::vector<float> llama_rn_context::rerank(const std::string &query, const std::vector<std::string> &documents)
+{
+    std::vector<float> scores;
+
+    // Check if this model supports reranking (requires rank pooling type)
+    const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
+    if (pooling_type != LLAMA_POOLING_TYPE_RANK) {
+        throw std::runtime_error("reranking not supported, pooling_type: " + std::to_string(pooling_type));
+    }
+
+    if (!params.embedding) {
+        throw std::runtime_error("embedding disabled but required for reranking");
+    }
+
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+    std::vector<llama_token> query_tokens = common_tokenize(vocab, query, false, true);
+
+    scores.reserve(documents.size());
+
+    for (size_t i = 0; i < documents.size(); ++i) {
+        rewind();
+        embd = {};
+
+        const std::string & document = documents[i];
+
+        std::vector<llama_token> doc_tokens = common_tokenize(vocab, document, false, true);
+
+        std::vector<llama_token> rerank_tokens = format_rerank(vocab, query_tokens, doc_tokens);
+
+        llama_memory_clear(llama_get_memory(ctx), false);
+
+        // Process the rerank input
+        try {
+            params.prompt = tokens_to_str(ctx, rerank_tokens.begin(), rerank_tokens.end());
+            initSampling();
+            loadPrompt({}); // No media paths for rerank
+            beginCompletion();
+            doCompletion();
+
+            // Get the rerank score (single embedding value for rank pooling)
+            float *data = llama_get_embeddings_seq(ctx, 0);
+            if (data) {
+                scores.push_back(data[0]); // For rank pooling, the score is the first (and only) dimension
+            } else {
+                scores.push_back(-1e6f); // Default low score if computation failed
+            }
+        } catch (const std::exception &e) {
+            LOG_WARNING("rerank computation failed for document %zu: %s", i, e.what());
+            scores.push_back(-1e6f);
+        }
+        endCompletion();
+
+        // Clear KV cache again to prepare for next document or restore original state
+        llama_memory_clear(llama_get_memory(ctx), false);
+    }
+
+    return scores;
+}
+
 std::string llama_rn_context::bench(int pp, int tg, int pl, int nr)
 {
     if (is_predicting) {
@@ -721,7 +836,7 @@ std::string llama_rn_context::bench(int pp, int tg, int pl, int nr)
         }
         batch.logits[batch.n_tokens - 1] = 1; // true
 
-
+        llama_memory_clear(llama_get_memory(ctx), true);
 
         const int64_t t_pp_start = llama_time_us();
         if (llama_decode(ctx, batch) != 0)
@@ -729,7 +844,8 @@ std::string llama_rn_context::bench(int pp, int tg, int pl, int nr)
             LOG_ERROR("llama_decode() failed during prompt", "");
         }
         const int64_t t_pp_end = llama_time_us();
-
+
+        llama_memory_clear(llama_get_memory(ctx), true);
 
         if (is_interrupted) break;
 
@@ -753,7 +869,7 @@ std::string llama_rn_context::bench(int pp, int tg, int pl, int nr)
 
         const int64_t t_tg_end = llama_time_us();
 
-
+        llama_memory_clear(llama_get_memory(ctx), true);
 
         const double t_pp = (t_pp_end - t_pp_start) / 1000000.0;
         const double t_tg = (t_tg_end - t_tg_start) / 1000000.0;
@@ -779,7 +895,7 @@ std::string llama_rn_context::bench(int pp, int tg, int pl, int nr)
         tg_std = 0;
     }
 
-    if (is_interrupted)
+    if (is_interrupted) llama_memory_clear(llama_get_memory(ctx), true);
     endCompletion();
 
     char model_desc[128];
@@ -903,11 +1019,11 @@ mtmd_tokenize_result tokenizeWithMedia(llama_rn_context_mtmd *mtmd_wrapper, cons
             }
 
             // Decode base64
-
+            raw_buffer media_data = base64_decode(base64_data);
             LOG_INFO("[DEBUG] Base64 decoded, size: %zu bytes", media_data.size());
 
             // Load bitmap from memory buffer using direct initialization
-            mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_buf(media_data.data(), media_data.size()));
+            mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_buf(mtmd_wrapper->mtmd_ctx, media_data.data(), media_data.size()));
             if (!bmp.ptr) {
                 bitmaps.entries.clear();
                 throw std::runtime_error("Failed to load base64 media");
@@ -942,7 +1058,7 @@ mtmd_tokenize_result tokenizeWithMedia(llama_rn_context_mtmd *mtmd_wrapper, cons
             fclose(file);
 
             // Create bitmap directly
-            mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_file(media_path.c_str()));
+            mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_file(mtmd_wrapper->mtmd_ctx, media_path.c_str()));
             if (!bmp.ptr) {
                 bitmaps.entries.clear();
                 throw std::runtime_error("Failed to load media");
@@ -1176,7 +1292,8 @@ void llama_rn_context::processMedia(
     }
 
     // Clear all KV cache entries after position n_past
-
+    auto * kv = llama_get_memory(ctx);
+    llama_memory_seq_rm(kv, 0, n_past, -1);
 
     LOG_INFO("[DEBUG] Evaluating chunks: n_past=%d, n_batch=%d", n_past, params.n_batch);
 
@@ -1282,4 +1399,498 @@ void llama_rn_context::releaseMultimodal() {
     }
 }
 
+struct llama_rn_context_vocoder {
+    common_init_result init_result;
+    llama_model *model = nullptr;
+    llama_context *ctx = nullptr;
+    tts_type type = UNKNOWN;
+};
+
+bool llama_rn_context::initVocoder(const std::string &vocoder_model_path) {
+    if (vocoder_wrapper != nullptr) {
+        return true;
+    }
+    params.model.path = vocoder_model_path;
+    params.embedding = true;
+    params.ctx_shift = false;
+    params.n_ubatch = params.n_batch;
+
+    llama_rn_context_vocoder *wrapper = new llama_rn_context_vocoder{
+        .init_result = common_init_from_params(params),
+    };
+
+    wrapper->model = wrapper->init_result.model.get();
+    wrapper->ctx = wrapper->init_result.context.get();
+
+    if (wrapper->model == nullptr || wrapper->ctx == nullptr) {
+        LOG_ERROR("Failed to load vocoder model: %s", vocoder_model_path.c_str());
+        delete wrapper;
+        return false;
+    }
+
+    wrapper->type = getTTSType();
+    vocoder_wrapper = wrapper;
+    has_vocoder = true;
+    return true;
+}
+
+bool llama_rn_context::isVocoderEnabled() const {
+    return has_vocoder && vocoder_wrapper != nullptr;
+}
+
+void llama_rn_context::releaseVocoder() {
+    if (vocoder_wrapper != nullptr) {
+        delete vocoder_wrapper;
+        vocoder_wrapper = nullptr;
+    }
+    has_vocoder = false;
+}
+
+tts_type llama_rn_context::getTTSType(json speaker) {
+    if (vocoder_wrapper == nullptr) {
+        return UNKNOWN;
+    }
+    if (speaker.is_object() && speaker.contains("version")) {
+        std::string version = speaker["version"].get<std::string>();
+        if (version == "0.2") {
+            return OUTETTS_V0_2;
+        } else if (version == "0.3") {
+            return OUTETTS_V0_3;
+        } else {
+            LOG_ERROR("Unsupported speaker version '%s'\n", version.c_str());
+        }
+    }
+    if (vocoder_wrapper->type != UNKNOWN) {
+        return vocoder_wrapper->type;
+    }
+    const char *chat_template = llama_model_chat_template(model, nullptr);
+    if (chat_template && std::string(chat_template) == "outetts-0.3") {
+        return OUTETTS_V0_3;
+    }
+    return OUTETTS_V0_2;
+}
+
+static std::string audio_text_from_speaker(json speaker, const tts_type type = OUTETTS_V0_2) {
+    std::string audio_text = "<|text_start|>";
+
+    if (type == OUTETTS_V0_2 || type == OUTETTS_V0_3) {
+        std::string separator = (type == OUTETTS_V0_3) ? "<|space|>" : "<|text_sep|>";
+        for (const auto &word : speaker["words"]) {
+            audio_text += word["word"].get<std::string>() + separator;
+        }
+    }
+
+    return audio_text;
+}
+
+static std::string audio_data_from_speaker(json speaker, const tts_type type = OUTETTS_V0_2) {
+    std::string audio_data = "<|audio_start|>\n";
+
+    if (type == OUTETTS_V0_2 || type == OUTETTS_V0_3) {
+        std::string code_start = (type == OUTETTS_V0_3) ? "" : "<|code_start|>";
+        std::string code_end = (type == OUTETTS_V0_3) ? "<|space|>" : "<|code_end|>";
+        for (const auto &word : speaker["words"]) {
+            std::string word_text = word["word"].get<std::string>();
+            double duration = word["duration"].get<double>();
+            std::vector<int> codes = word["codes"].get<std::vector<int>>();
+
+            // Create the audio output entry
+            std::ostringstream word_entry;
+            word_entry << word_text << "<|t_" << std::fixed << std::setprecision(2)
+                       << duration << "|>" + code_start;
+            for (const auto &Code : codes) {
+                word_entry << "<|" << Code << "|>";
+            }
+            word_entry << code_end << "\n";
+            audio_data += word_entry.str();
+        }
+    }
+
+    return audio_data;
+}
+
+static const std::map<int, std::string> ones = {
+    {0, "zero"}, {1, "one"}, {2, "two"}, {3, "three"}, {4, "four"},
+    {5, "five"}, {6, "six"}, {7, "seven"}, {8, "eight"}, {9, "nine"},
+    {10, "ten"}, {11, "eleven"}, {12, "twelve"}, {13, "thirteen"}, {14, "fourteen"},
+    {15, "fifteen"}, {16, "sixteen"}, {17, "seventeen"}, {18, "eighteen"}, {19, "nineteen"}
+};
+
+static const std::map<int, std::string> tens = {
+    {2, "twenty"}, {3, "thirty"}, {4, "forty"}, {5, "fifty"},
+    {6, "sixty"}, {7, "seventy"}, {8, "eighty"}, {9, "ninety"}
+};
+
+// Convert a number less than 1000 to words
+static std::string convert_less_than_thousand(int num) {
+    std::string result;
+
+    if (num >= 100) {
+        result += ones.at(num / 100) + " hundred ";
+        num %= 100;
+    }
+
+    if (num >= 20) {
+        result += tens.at(num / 10);
+        if (num % 10 > 0) {
+            result += "-" + ones.at(num % 10);
+        }
+    } else if (num > 0) {
+        result += ones.at(num);
+    }
+
+    return result;
+}
+
+static std::string number_to_words(const std::string & number_str) {
+    try {
+        size_t decimal_pos = number_str.find('.');
+        std::string integer_part = number_str.substr(0, decimal_pos);
+
+        int int_number = std::stoi(integer_part);
+        std::string result;
+
+        if (int_number == 0) {
+            result = "zero";
+        } else {
+            if (int_number >= 1000000000) {
+                int billions = int_number / 1000000000;
+                result += convert_less_than_thousand(billions) + " billion ";
+                int_number %= 1000000000;
+            }
+
+            if (int_number >= 1000000) {
+                int millions = int_number / 1000000;
+                result += convert_less_than_thousand(millions) + " million ";
+                int_number %= 1000000;
+            }
+
+            if (int_number >= 1000) {
+                int thousands = int_number / 1000;
+                result += convert_less_than_thousand(thousands) + " thousand ";
+                int_number %= 1000;
+            }
+
+            if (int_number > 0) {
+                result += convert_less_than_thousand(int_number);
+            }
+        }
+
+        // Handle decimal part
+        if (decimal_pos != std::string::npos) {
+            result += " point";
+            std::string decimal_part = number_str.substr(decimal_pos + 1);
+            for (char digit : decimal_part) {
+                result += " " + ones.at(digit - '0');
+            }
+        }
+
+        return result;
+    } catch (const std::exception& e) {
+        // Skip if fails
+        return " ";
+    }
+}
+
+static std::string replace_numbers_with_words(const std::string & input_text) {
+    std::regex number_pattern(R"(\d+(\.\d+)?)");
+    std::string result;
+    auto it = std::sregex_iterator(input_text.begin(), input_text.end(), number_pattern);
+    auto end = std::sregex_iterator();
+
+    size_t last_pos = 0;
+    for (std::sregex_iterator i = it; i != end; ++i) {
+        const std::smatch& match = *i;
+        result.append(input_text, last_pos, match.position() - last_pos);
+        result.append(number_to_words(match.str()));
+        last_pos = match.position() + match.length();
+    }
+    result.append(input_text, last_pos);
+
+    return result;
+}
+
+// Based on: https://github.com/edwko/OuteTTS/blob/a613e79c489d8256dd657ea9168d78de75895d82/outetts/version/v1/prompt_processor.py#L39
+static std::string process_text(const std::string & text, const tts_type tts_type = OUTETTS_V0_2) {
+
+    // For now I skipped text romanization as I am unsure how to handle
+    // uroman and MeCab implementations in C++
+    // maybe something like https://github.com/anyascii/anyascii/ could work.
+    // currently only English would be supported in this function
+
+    std::string processed_text = replace_numbers_with_words(text);
+
+    std::transform(processed_text.begin(), processed_text.end(),
+                   processed_text.begin(), ::tolower);
+
+    std::regex special_chars(R"([-_/,\.\\])");
+    processed_text = std::regex_replace(processed_text, special_chars, " ");
+
+    std::regex non_alpha(R"([^a-z\s])");
+    processed_text = std::regex_replace(processed_text, non_alpha, "");
+
+    std::regex multiple_spaces(R"(\s+)");
+    processed_text = std::regex_replace(processed_text, multiple_spaces, " ");
+
+    processed_text = std::regex_replace(processed_text, std::regex(R"(^\s+|\s+$)"), "");
+
+    /*
+        Replace spaces with the separator token same as in line 365
+
+        for (auto & c : prompt_user) {
+            if (c == ' ') {
+                prompt_clean += "<|text_sep|>";
+    */
+    std::string separator = (tts_type == OUTETTS_V0_3) ? "<|space|>" : "<|text_sep|>";
+    processed_text = std::regex_replace(processed_text, std::regex(R"(\s)"), separator);
+
+    return processed_text;
+}
+
+std::string llama_rn_context::getFormattedAudioCompletion(const std::string &speaker_json_str, const std::string &text_to_speak) {
+    if (!isVocoderEnabled()) {
+        throw std::runtime_error("Vocoder is not enabled but audio completion is requested");
+    }
+    std::string audio_text = default_audio_text;
+    std::string audio_data = default_audio_data;
+
+    json speaker = speaker_json_str.empty() ? json::object() : json::parse(speaker_json_str);
+    const tts_type type = getTTSType(speaker);
+    if (type == UNKNOWN) {
+        LOG_ERROR("Unknown TTS version");
+        return "";
+    }
+
+    if (type == OUTETTS_V0_3) {
+        audio_text = std::regex_replace(audio_text, std::regex(R"(<\|text_sep\|>)"), "<|space|>");
+        audio_data = std::regex_replace(audio_data, std::regex(R"(<\|code_start\|>)"), "");
+        audio_data = std::regex_replace(audio_data, std::regex(R"(<\|code_end\|>)"), "<|space|>");
+    }
+
+    if (!speaker_json_str.empty()) {
+        audio_text = audio_text_from_speaker(speaker, type);
+        audio_data = audio_data_from_speaker(speaker, type);
+    }
+
+    return "<|im_start|>\n" + audio_text + process_text(text_to_speak, type) + "<|text_end|>\n" + audio_data + "\n";
+}
+
+std::vector<llama_token> llama_rn_context::getAudioCompletionGuideTokens(const std::string &text_to_speak) {
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+    const tts_type type = getTTSType();
+    std::string clean_text = process_text(text_to_speak, type);
+
+    const std::string& delimiter = (type == OUTETTS_V0_3 ? "<|space|>" : "<|text_sep|>");
+
+    std::vector<llama_token> result;
+    size_t start = 0;
+    size_t end = clean_text.find(delimiter);
+
+    //first token is always a newline, as it was not previously added
+    result.push_back(common_tokenize(vocab, "\n", false, true)[0]);
+
+    while (end != std::string::npos) {
+        std::string current_word = clean_text.substr(start, end - start);
+        auto tmp = common_tokenize(vocab, current_word, false, true);
+        result.push_back(tmp[0]);
+        start = end + delimiter.length();
+        end = clean_text.find(delimiter, start);
+    }
+
+    // Add the last part
+    std::string current_word = clean_text.substr(start);
+    auto tmp = common_tokenize(vocab, current_word, false, true);
+    if (tmp.size() > 0) {
+        result.push_back(tmp[0]);
+    }
+    return result;
+}
+
+static void fill_hann_window(int length, bool periodic, float * output) {
+    int offset = -1;
+    if (periodic) {
+        offset = 0;
+    }
+    for (int i = 0; i < length; i++) {
+        output[i] = 0.5 * (1.0 - cosf((2.0 * M_PI * i) / (length + offset)));
+    }
+}
+
+static void twiddle(float * real, float * imag, int k, int N) {
+    float angle = 2 * M_PI * k / N;
+    *real = cos(angle);
+    *imag = sin(angle);
+}
+
+static void irfft(int n, const float * inp_cplx, float * out_real) {
+    int N = n / 2 + 1;
+
+    std::vector<float> real_input(N);
+    std::vector<float> imag_input(N);
+    for (int i = 0; i < N; ++i) {
+        real_input[i] = inp_cplx[2 * i];
+        imag_input[i] = inp_cplx[2 * i + 1];
+    }
+
+    std::vector<float> real_output(n);
+    std::vector<float> imag_output(n);
+
+    for (int k = 0; k < n; ++k) {
+        real_output[k] = 0.0f;
+        imag_output[k] = 0.0f;
+        for (int m = 0; m < N; ++m) {
+            float twiddle_real;
+            float twiddle_imag;
+
+            twiddle(&twiddle_real, &twiddle_imag, k * m, n);
+
+            real_output[k] += real_input[m] * twiddle_real - imag_input[m] * twiddle_imag;
+            imag_output[k] += real_input[m] * twiddle_imag + imag_input[m] * twiddle_real;
+        }
+    }
+
+    for (int i = 0; i < n; ++i) {
+        out_real[i] = real_output[i] / N;
+    }
+}
+
+static void fold(const std::vector<float> & data, int64_t n_out, int64_t n_win, int64_t n_hop, int64_t n_pad, std::vector<float> & output) {
+    int64_t output_height = n_out;
+    int64_t kernel_w = n_win;
+    int64_t stride_w = n_hop;
+    int64_t width = n_out;
+
+    output.resize(width, 0.0f);
+
+    int64_t col_idx = 0;
+    for (int64_t w_col = 0; w_col < width; ++w_col) {
+        int64_t start = w_col * stride_w - n_pad;
+        int64_t end = start + kernel_w;
+
+        for (int64_t w_im = start; w_im < end; ++w_im) {
+            if (w_im >= 0 && w_im < output_height && col_idx < (int64_t) data.size()) {
+                output[w_im] += data[col_idx];
+            }
+            col_idx++;
+        }
+    }
+
+    output.resize(n_out - 2 * n_pad);
+}
+
+static std::vector<float> embd_to_audio(
+        const float * embd,
+        const int n_codes,
+        const int n_embd,
+        const int n_thread) {
+    const int n_fft = 1280;
+    const int n_hop = 320;
+    const int n_win = 1280;
+    const int n_pad = (n_win - n_hop)/2;
+    const int n_out = (n_codes - 1)*n_hop + n_win;
+
+    std::vector<float> hann(n_fft);
+
+    fill_hann_window(hann.size(), true, hann.data());
+
+    int n_spec = n_embd*n_codes;
+
+    std::vector<float> E (n_spec);
+    std::vector<float> S (n_spec);
+    std::vector<float> ST(n_spec);
+
+    for (int l = 0; l < n_codes; ++l) {
+        for (int k = 0; k < n_embd; ++k) {
+            E[k*n_codes + l] = embd[l*n_embd + k];
+        }
+    }
+
+    for (int k = 0; k < n_embd/2; ++k) {
+        for (int l = 0; l < n_codes; ++l) {
+            float mag = E[(k           )*n_codes + l];
+            float phi = E[(k + n_embd/2)*n_codes + l];
+
+            mag = exp(mag);
+
+            if (mag > 1e2) {
+                mag = 1e2;
+            }
+            S[2*(k*n_codes + l) + 0] = mag*cosf(phi);
+            S[2*(k*n_codes + l) + 1] = mag*sinf(phi);
+        }
+    }
+
+    for (int l = 0; l < n_codes; ++l) {
+        for (int k = 0; k < n_embd/2; ++k) {
+            ST[l*n_embd + 2*k + 0] = S[2*(k*n_codes + l) + 0];
+            ST[l*n_embd + 2*k + 1] = S[2*(k*n_codes + l) + 1];
+        }
+    }
+
+    std::vector<float> res (n_codes*n_fft);
+    std::vector<float> hann2(n_codes*n_fft);
+
+    std::vector<std::thread> workers(n_thread);
+    for (int i = 0; i < n_thread; ++i) {
+        workers[i] = std::thread([&, i]() {
+            for (int l = i; l < n_codes; l += n_thread) {
+                irfft(n_fft, ST.data() + l*n_embd, res.data() + l*n_fft);
+                for (int j = 0; j < n_fft; ++j) {
+                    res  [l*n_fft + j] *= hann[j];
+                    hann2[l*n_fft + j]  = hann[j] * hann[j];
+                }
+            }
+        });
+    }
+    for (int i = 0; i < n_thread; ++i) {
+        workers[i].join();
+    }
+
+    std::vector<float> audio;
+    std::vector<float> env;
+
+    fold(res, n_out, n_win, n_hop, n_pad, audio);
+    fold(hann2, n_out, n_win, n_hop, n_pad, env); // TODO: can be done once
+
+    for (size_t i = 0; i < audio.size(); ++i) {
+        audio[i] /= env[i];
+    }
+
+    return audio;
+}
+
+std::vector<float> llama_rn_context::decodeAudioTokens(const std::vector<llama_token> &tokens) {
+    if (!isVocoderEnabled()) {
+        throw std::runtime_error("Vocoder is not enabled but audio completion is requested");
+    }
+    std::vector<llama_token> tokens_audio = tokens;
+    tts_type type = getTTSType();
+    if (type == OUTETTS_V0_3 || type == OUTETTS_V0_2) {
+        tokens_audio.erase(std::remove_if(tokens_audio.begin(), tokens_audio.end(), [](llama_token t) { return t < 151672 || t > 155772; }), tokens_audio.end());
+        for (auto & token : tokens_audio) {
+            token -= 151672;
+        }
+    } else {
+        LOG_ERROR("Unsupported audio tokens");
+        return std::vector<float>();
+    }
+    const int n_codes = tokens_audio.size();
+    llama_batch batch = llama_batch_init(n_codes, 0, 1);
+    for (size_t i = 0; i < tokens_audio.size(); ++i) {
+        llama_batch_add(&batch, tokens_audio[i], i, { 0 }, true);
+    }
+    if (batch.n_tokens != n_codes) {
+        LOG_ERROR("batch.n_tokens != n_codes: %d != %d", batch.n_tokens, n_codes);
+        return std::vector<float>();
+    }
+    if (llama_encode(vocoder_wrapper->ctx, batch) != 0) {
+        LOG_ERROR("llama_encode() failed");
+        return std::vector<float>();
+    }
+    llama_synchronize(vocoder_wrapper->ctx);
+    const int n_embd = llama_model_n_embd(vocoder_wrapper->model);
+    const float * embd = llama_get_embeddings(vocoder_wrapper->ctx);
+    return embd_to_audio(embd, n_codes, n_embd, params.cpuparams.n_threads);
+}
+
 }
```