cui-llama.rn 1.7.4 → 1.7.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +217 -17
- package/android/src/main/CMakeLists.txt +34 -15
- package/android/src/main/java/com/rnllama/LlamaContext.java +79 -5
- package/android/src/main/java/com/rnllama/RNLlama.java +237 -0
- package/android/src/main/jni.cpp +213 -14
- package/android/src/main/jniLibs/arm64-v8a/librnllama.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod_i8mm.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_i8mm.so +0 -0
- package/android/src/main/jniLibs/x86_64/librnllama.so +0 -0
- package/android/src/main/jniLibs/x86_64/librnllama_x86_64.so +0 -0
- package/android/src/newarch/java/com/rnllama/RNLlamaModule.java +35 -0
- package/android/src/oldarch/java/com/rnllama/RNLlamaModule.java +34 -0
- package/cpp/README.md +1 -1
- package/cpp/chat-parser.cpp +385 -0
- package/cpp/chat-parser.h +120 -0
- package/cpp/chat.cpp +726 -596
- package/cpp/chat.h +71 -6
- package/cpp/common.cpp +56 -38
- package/cpp/common.h +9 -3
- package/cpp/ggml-backend-reg.cpp +5 -0
- package/cpp/ggml-backend.cpp +10 -2
- package/cpp/ggml-common.h +4 -0
- package/cpp/ggml-cpu/amx/amx.cpp +1 -1
- package/cpp/ggml-cpu/amx/mmq.cpp +11 -10
- package/cpp/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
- package/cpp/ggml-cpu/arch/arm/quants.c +4114 -0
- package/cpp/ggml-cpu/arch/arm/repack.cpp +2163 -0
- package/cpp/ggml-cpu/arch/x86/cpu-feats.cpp +327 -0
- package/cpp/ggml-cpu/arch/x86/quants.c +4311 -0
- package/cpp/ggml-cpu/{ggml-cpu-aarch64.cpp → arch/x86/repack.cpp} +79 -3225
- package/cpp/ggml-cpu/arch-fallback.h +184 -0
- package/cpp/ggml-cpu/common.h +4 -3
- package/cpp/ggml-cpu/ggml-cpu-impl.h +21 -16
- package/cpp/ggml-cpu/ggml-cpu.c +123 -104
- package/cpp/ggml-cpu/ggml-cpu.cpp +11 -8
- package/cpp/ggml-cpu/ops.cpp +330 -148
- package/cpp/ggml-cpu/ops.h +1 -0
- package/cpp/ggml-cpu/quants.c +1158 -0
- package/cpp/ggml-cpu/{ggml-cpu-quants.h → quants.h} +26 -0
- package/cpp/ggml-cpu/repack.cpp +1571 -0
- package/cpp/ggml-cpu/repack.h +98 -0
- package/cpp/ggml-cpu/simd-mappings.h +330 -38
- package/cpp/ggml-cpu/{ggml-cpu-traits.cpp → traits.cpp} +1 -1
- package/cpp/ggml-cpu/vec.cpp +87 -18
- package/cpp/ggml-cpu/vec.h +249 -94
- package/cpp/ggml-cpu.h +1 -0
- package/cpp/ggml-impl.h +63 -183
- package/cpp/ggml-llama-sim.metallib +0 -0
- package/cpp/ggml-llama.metallib +0 -0
- package/cpp/ggml-metal.m +152 -45
- package/cpp/ggml-quants.c +0 -2
- package/cpp/ggml.c +61 -21
- package/cpp/ggml.h +22 -3
- package/cpp/gguf.cpp +24 -3
- package/cpp/json-partial.cpp +256 -0
- package/cpp/json-partial.h +38 -0
- package/cpp/json-schema-to-grammar.cpp +5 -47
- package/cpp/json-schema-to-grammar.h +4 -4
- package/cpp/llama-arch.cpp +153 -3
- package/cpp/llama-arch.h +27 -1
- package/cpp/llama-batch.cpp +741 -272
- package/cpp/llama-batch.h +112 -54
- package/cpp/llama-chat.cpp +30 -8
- package/cpp/llama-chat.h +1 -0
- package/cpp/llama-context.cpp +524 -339
- package/cpp/llama-context.h +38 -17
- package/cpp/llama-cparams.cpp +4 -0
- package/cpp/llama-cparams.h +2 -0
- package/cpp/llama-grammar.cpp +12 -2
- package/cpp/llama-graph.cpp +431 -356
- package/cpp/llama-graph.h +126 -58
- package/cpp/llama-hparams.cpp +10 -2
- package/cpp/llama-hparams.h +19 -2
- package/cpp/llama-kv-cache-unified-iswa.cpp +279 -0
- package/cpp/llama-kv-cache-unified-iswa.h +128 -0
- package/cpp/llama-kv-cache-unified.cpp +1841 -0
- package/cpp/llama-kv-cache-unified.h +303 -0
- package/cpp/llama-kv-cells.h +439 -0
- package/cpp/llama-memory-hybrid.cpp +246 -0
- package/cpp/llama-memory-hybrid.h +138 -0
- package/cpp/llama-memory-recurrent.cpp +1112 -0
- package/cpp/llama-memory-recurrent.h +183 -0
- package/cpp/llama-memory.cpp +41 -0
- package/cpp/llama-memory.h +86 -5
- package/cpp/llama-mmap.cpp +1 -1
- package/cpp/llama-model-loader.cpp +42 -17
- package/cpp/llama-model-saver.cpp +1 -0
- package/cpp/llama-model.cpp +1639 -513
- package/cpp/llama-model.h +26 -0
- package/cpp/llama-sampling.cpp +2 -2
- package/cpp/llama-vocab.cpp +65 -28
- package/cpp/llama-vocab.h +1 -0
- package/cpp/llama.cpp +11 -7
- package/cpp/llama.h +150 -42
- package/cpp/minja/chat-template.hpp +1 -1
- package/cpp/minja/minja.hpp +1 -1
- package/cpp/{json.hpp → nlohmann/json.hpp} +3027 -2267
- package/cpp/nlohmann/json_fwd.hpp +187 -0
- package/cpp/regex-partial.cpp +204 -0
- package/cpp/regex-partial.h +56 -0
- package/cpp/rn-llama.cpp +646 -35
- package/cpp/rn-llama.h +32 -1
- package/cpp/rn-tts.h +39 -0
- package/cpp/sampling.cpp +7 -8
- package/cpp/tools/mtmd/clip-impl.h +5 -0
- package/cpp/tools/mtmd/clip.cpp +572 -436
- package/cpp/tools/mtmd/clip.h +14 -4
- package/cpp/tools/mtmd/mtmd-audio.cpp +0 -86
- package/cpp/tools/mtmd/mtmd-audio.h +2 -17
- package/cpp/tools/mtmd/mtmd-helper.cpp +175 -12
- package/cpp/tools/mtmd/mtmd-helper.h +91 -0
- package/cpp/tools/mtmd/mtmd.cpp +368 -248
- package/cpp/tools/mtmd/mtmd.h +6 -70
- package/cpp/unicode.cpp +5 -0
- package/ios/CMakeLists.txt +26 -6
- package/ios/RNLlama.h +1 -1
- package/ios/RNLlama.mm +153 -3
- package/ios/RNLlamaContext.h +9 -1
- package/ios/RNLlamaContext.mm +112 -9
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/chat-parser.h +120 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/chat.h +71 -6
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/common.h +9 -3
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-common.h +4 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu.h +1 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-impl.h +63 -183
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml.h +22 -3
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/json-partial.h +38 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/json-schema-to-grammar.h +4 -4
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-arch.h +27 -1
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-batch.h +112 -54
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-chat.h +1 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-context.h +38 -17
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-cparams.h +2 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-graph.h +126 -58
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-hparams.h +19 -2
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cache-unified-iswa.h +128 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cache-unified.h +303 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cells.h +439 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-memory-hybrid.h +138 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-memory-recurrent.h +183 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-memory.h +86 -5
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-model.h +26 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-vocab.h +1 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama.h +150 -42
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/minja/chat-template.hpp +1 -1
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/minja/minja.hpp +1 -1
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/{json.hpp → nlohmann/json.hpp} +3027 -2267
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/nlohmann/json_fwd.hpp +187 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/regex-partial.h +56 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/rn-llama.h +32 -1
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/rn-tts.h +39 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/ggml-llama.metallib +0 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/rnllama +0 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/chat-parser.h +120 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/chat.h +71 -6
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/common.h +9 -3
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-common.h +4 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu.h +1 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-impl.h +63 -183
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml.h +22 -3
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/json-partial.h +38 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/json-schema-to-grammar.h +4 -4
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-arch.h +27 -1
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-batch.h +112 -54
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-chat.h +1 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-context.h +38 -17
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-cparams.h +2 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-graph.h +126 -58
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-hparams.h +19 -2
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache-unified-iswa.h +128 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache-unified.h +303 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cells.h +439 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory-hybrid.h +138 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory-recurrent.h +183 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory.h +86 -5
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model.h +26 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-vocab.h +1 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama.h +150 -42
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/minja/chat-template.hpp +1 -1
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/minja/minja.hpp +1 -1
- package/ios/rnllama.xcframework/{tvos-arm64/rnllama.framework/Headers → ios-arm64_x86_64-simulator/rnllama.framework/Headers/nlohmann}/json.hpp +3027 -2267
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/nlohmann/json_fwd.hpp +187 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/regex-partial.h +56 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/rn-llama.h +32 -1
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/rn-tts.h +39 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/ggml-llama-sim.metallib +0 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/rnllama +0 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/chat-parser.h +120 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/chat.h +71 -6
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/common.h +9 -3
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-common.h +4 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu.h +1 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-impl.h +63 -183
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml.h +22 -3
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/json-partial.h +38 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/json-schema-to-grammar.h +4 -4
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-arch.h +27 -1
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-batch.h +112 -54
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-chat.h +1 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-context.h +38 -17
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-cparams.h +2 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-graph.h +126 -58
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-hparams.h +19 -2
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cache-unified-iswa.h +128 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cache-unified.h +303 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cells.h +439 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-memory-hybrid.h +138 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-memory-recurrent.h +183 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-memory.h +86 -5
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-model.h +26 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-vocab.h +1 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama.h +150 -42
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/minja/chat-template.hpp +1 -1
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/minja/minja.hpp +1 -1
- package/ios/rnllama.xcframework/{ios-arm64_x86_64-simulator/rnllama.framework/Headers → tvos-arm64/rnllama.framework/Headers/nlohmann}/json.hpp +3027 -2267
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/nlohmann/json_fwd.hpp +187 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/regex-partial.h +56 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/rn-llama.h +32 -1
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/rn-tts.h +39 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/ggml-llama.metallib +0 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/rnllama +0 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/chat-parser.h +120 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/chat.h +71 -6
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/common.h +9 -3
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-common.h +4 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu.h +1 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-impl.h +63 -183
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml.h +22 -3
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/json-partial.h +38 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/json-schema-to-grammar.h +4 -4
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-arch.h +27 -1
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-batch.h +112 -54
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-chat.h +1 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-context.h +38 -17
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-cparams.h +2 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-graph.h +126 -58
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-hparams.h +19 -2
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache-unified-iswa.h +128 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache-unified.h +303 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cells.h +439 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory-hybrid.h +138 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory-recurrent.h +183 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory.h +86 -5
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model.h +26 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-vocab.h +1 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama.h +150 -42
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/minja/chat-template.hpp +1 -1
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/minja/minja.hpp +1 -1
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/nlohmann/json.hpp +25526 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/nlohmann/json_fwd.hpp +187 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/regex-partial.h +56 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/rn-llama.h +32 -1
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/rn-tts.h +39 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/ggml-llama-sim.metallib +0 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/rnllama +0 -0
- package/jest/mock.js +24 -0
- package/package.json +1 -1
- package/src/NativeRNLlama.ts +46 -2
- package/src/index.ts +105 -1
- package/cpp/ggml-cpu/ggml-cpu-aarch64.h +0 -8
- package/cpp/ggml-cpu/ggml-cpu-quants.c +0 -13326
- package/cpp/ggml-cpu/sgemm.cpp +0 -3544
- package/cpp/ggml-cpu/sgemm.h +0 -14
- package/cpp/llama-kv-cache.cpp +0 -2827
- package/cpp/llama-kv-cache.h +0 -515
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cache.h +0 -515
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache.h +0 -515
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cache.h +0 -515
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/json.hpp +0 -24766
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache.h +0 -515
- /package/cpp/ggml-cpu/{ggml-cpu-traits.h → traits.h} +0 -0
- /package/cpp/tools/mtmd/{miniaudio.h → miniaudio/miniaudio.h} +0 -0
- /package/cpp/tools/mtmd/{stb_image.h → stb/stb_image.h} +0 -0
package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/json-partial.h
ADDED
@@ -0,0 +1,38 @@
+#pragma once
+
+#include "nlohmann/json.hpp"
+
+// Healing marker (empty if the JSON was fully parsed / wasn't healed).
+struct common_healing_marker {
+    // Raw marker.
+    std::string marker;
+
+    // Cutting the `common_json.json.dump()` string at the (only) occurrence of this marker should yield the original partial JSON string (modulo spaces / if it had the same dump format).
+    std::string json_dump_marker;
+};
+
+// Represents a parsed JSON object, with its optional healing marker (a JSON dump fragment that can be used to find the position of healing in the JSON dump string)
+struct common_json {
+    nlohmann::ordered_json json;
+
+    common_healing_marker healing_marker;
+};
+
+// Parse the JSON string, healing (closing) any partial JSON if `healing_marker` is not empty.
+//
+// Healing completes partial JSON strings by adding a (possibly modified) healing marker, then whatever is needed to close the JSON.
+// This allows to parse the resulting healed JSON string, yet be able to cut it again if needed at the healing marker.
+// (this is used when parsing JSON outputs from the models, then crafting partial JSONs for the partial tool calls in OAI format).
+//
+// For instance, parsing `{` with a healing marker `foo` will produce a healed JSON `{"foo":1}`, w/ json_dump_marker = `"foo"` (which can be used to break the JSON again).
+bool common_json_parse(
+    const std::string & input,
+    const std::string & healing_marker,
+    common_json & out);
+
+// Parse the JSON string (see overload above), but advancing an iterator to the end of the input when the (potentially partial) parsing succeeds.
+bool common_json_parse(
+    std::string::const_iterator & it,
+    const std::string::const_iterator & end,
+    const std::string & healing_marker,
+    common_json & out);
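The header comments above describe a round trip: heal a truncated JSON string so it parses, then use `json_dump_marker` to cut the healed dump back down to the original prefix. A minimal usage sketch, assuming only the behavior documented in the comments above; the `$magic$` marker and the truncated tool-call fragment are invented for illustration:

```cpp
#include <cassert>
#include <string>

#include "json-partial.h"

int main() {
    // A truncated tool-call fragment, as a model might stream it.
    const std::string partial = "{\"name\": \"get_weather\", \"arguments\": {\"city\": \"Par";

    common_json out;

    // Heal with a marker that cannot occur in the input.
    if (common_json_parse(partial, "$magic$", out)) {
        // out.json is now a complete, parseable object ...
        const std::string dump = out.json.dump();

        // ... and, per the comment above, cutting the dump at
        // json_dump_marker recovers the partial prefix (modulo formatting).
        assert(dump.find(out.healing_marker.json_dump_marker) != std::string::npos);
    }

    return 0;
}
```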
package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/json-schema-to-grammar.h
CHANGED
@@ -1,9 +1,9 @@
 #pragma once
 
-#include "
-
-#
-#include
+#include "nlohmann/json_fwd.hpp"
+
+#include <functional>
+#include <string>
 
 std::string json_schema_to_grammar(const nlohmann::ordered_json & schema,
     bool force_gbnf = false);
package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-arch.h
CHANGED
@@ -24,6 +24,7 @@ enum llm_arch {
     LLM_ARCH_BERT,
     LLM_ARCH_NOMIC_BERT,
     LLM_ARCH_NOMIC_BERT_MOE,
+    LLM_ARCH_NEO_BERT,
     LLM_ARCH_JINA_BERT_V2,
     LLM_ARCH_BLOOM,
     LLM_ARCH_STABLELM,
@@ -45,6 +46,7 @@ enum llm_arch {
     LLM_ARCH_GEMMA,
     LLM_ARCH_GEMMA2,
     LLM_ARCH_GEMMA3,
+    LLM_ARCH_GEMMA3N,
     LLM_ARCH_STARCODER2,
     LLM_ARCH_MAMBA,
     LLM_ARCH_XVERSE,
@@ -76,6 +78,8 @@ enum llm_arch {
     LLM_ARCH_WAVTOKENIZER_DEC,
     LLM_ARCH_PLM,
     LLM_ARCH_BAILINGMOE,
+    LLM_ARCH_DOTS1,
+    LLM_ARCH_ARCEE,
     LLM_ARCH_UNKNOWN,
 };
 
@@ -148,6 +152,7 @@ enum llm_kv {
     LLM_KV_ATTENTION_SCALE,
     LLM_KV_ATTENTION_KEY_LENGTH_MLA,
     LLM_KV_ATTENTION_VALUE_LENGTH_MLA,
+    LLM_KV_ATTENTION_LAYER_INDICES,
 
     LLM_KV_ROPE_DIMENSION_COUNT,
     LLM_KV_ROPE_DIMENSION_SECTIONS,
@@ -190,13 +195,13 @@ enum llm_kv {
     LLM_KV_TOKENIZER_MASK_ID,
     LLM_KV_TOKENIZER_ADD_BOS,
     LLM_KV_TOKENIZER_ADD_EOS,
+    LLM_KV_TOKENIZER_ADD_SEP,
     LLM_KV_TOKENIZER_ADD_PREFIX,
     LLM_KV_TOKENIZER_REMOVE_EXTRA_WS,
     LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP,
     LLM_KV_TOKENIZER_HF_JSON,
     LLM_KV_TOKENIZER_RWKV,
     LLM_KV_TOKENIZER_CHAT_TEMPLATE,
-    LLM_KV_TOKENIZER_CHAT_TEMPLATE_N,
     LLM_KV_TOKENIZER_FIM_PRE_ID,
     LLM_KV_TOKENIZER_FIM_SUF_ID,
     LLM_KV_TOKENIZER_FIM_MID_ID,
@@ -213,6 +218,8 @@ enum llm_kv {
     LLM_KV_CONVNEXT_EMBEDDING_LENGTH,
     LLM_KV_CONVNEXT_BLOCK_COUNT,
 
+    LLM_KV_CLASSIFIER_OUTPUT_LABELS,
+
     // deprecated:
     LLM_KV_TOKENIZER_PREFIX_ID,
     LLM_KV_TOKENIZER_SUFFIX_ID,
@@ -263,6 +270,22 @@ enum llm_tensor {
     LLM_TENSOR_LAYER_OUT_NORM,
     LLM_TENSOR_POST_ATTN_NORM,
     LLM_TENSOR_POST_MLP_NORM,
+    LLM_TENSOR_PER_LAYER_TOKEN_EMBD, // gemma3n
+    LLM_TENSOR_PER_LAYER_MODEL_PROJ, // gemma3n
+    LLM_TENSOR_PER_LAYER_INP_GATE,   // gemma3n
+    LLM_TENSOR_PER_LAYER_PROJ,       // gemma3n
+    LLM_TENSOR_PER_LAYER_PROJ_NORM,  // gemma3n
+    LLM_TENSOR_PER_LAYER_POST_NORM,  // gemma3n
+    LLM_TENSOR_ALTUP_PROJ,           // gemma3n
+    LLM_TENSOR_ALTUP_UNEMBD_PROJ,    // gemma3n
+    LLM_TENSOR_ALTUP_CORRECT_COEF,   // gemma3n
+    LLM_TENSOR_ALTUP_CORRECT_SCALE,  // gemma3n
+    LLM_TENSOR_ALTUP_PREDICT_COEF,   // gemma3n
+    LLM_TENSOR_ALTUP_ROUTER,         // gemma3n
+    LLM_TENSOR_ALTUP_ROUTER_NORM,    // gemma3n
+    LLM_TENSOR_LAUREL_L,             // gemma3n
+    LLM_TENSOR_LAUREL_R,             // gemma3n
+    LLM_TENSOR_LAUREL_POST_NORM,     // gemma3n
     LLM_TENSOR_SSM_IN,
     LLM_TENSOR_SSM_CONV1D,
     LLM_TENSOR_SSM_X,
@@ -435,3 +458,6 @@ const char * llm_arch_name(llm_arch arch);
 llm_arch llm_arch_from_string(const std::string & name);
 
 const llm_tensor_info & llm_tensor_info_for(llm_tensor tensor);
+
+bool llm_arch_is_recurrent(const llm_arch & arch);
+bool llm_arch_is_hybrid   (const llm_arch & arch);
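The two new predicates line up with the memory split visible in the file list above (llama-kv-cache-unified*, llama-memory-recurrent, llama-memory-hybrid). A speculative sketch of the dispatch they enable; only the predicate signatures come from this diff, and the mapping to memory classes is inferred from the new headers:

```cpp
#include "llama-arch.h"

// Speculative dispatch sketch, not code from this package: which of the
// new memory implementations a given architecture would plausibly get.
const char * memory_kind_for(const llm_arch & arch) {
    if (llm_arch_is_hybrid(arch)) {
        return "llama_memory_hybrid";    // attention + recurrent layers mixed
    }
    if (llm_arch_is_recurrent(arch)) {
        return "llama_memory_recurrent"; // mamba / RWKV-style state, no KV cache
    }
    return "llama_kv_cache_unified";     // regular transformer KV cache (optionally iSWA)
}
```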
package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-batch.h
CHANGED
@@ -2,88 +2,146 @@
 
 #include "llama.h"
 
+#include "llama-cparams.h"
+
 #include <array>
 #include <vector>
+#include <set>
+#include <bitset>
+#include <unordered_map>
 
-//
-//
+// keep this struct lightweight
+// it points to data in `llama_batch_allocr`
 struct llama_ubatch {
     bool equal_seqs;
     // TODO: whole_seqs for embeddings?
 
-    uint32_t n_tokens;
-    uint32_t n_seq_tokens; // tokens per sequence
-    uint32_t n_seqs;
-
-
-
-
-
-
-
+    uint32_t n_tokens;     // total tokens (n_seq_tokens * n_seqs)
+    uint32_t n_seq_tokens; // tokens per sequence set
+    uint32_t n_seqs;       // sequence sets in the ubatch
+    uint32_t n_seqs_unq;   // unique sequence ids in the ubatch
+
+    // seq_id_unq: unique sequence ids in the ubatch
+    // seq_idx:    indices of the unique sequence ids in the ubatch in [0, n_seqs_unq)
+    //             used for extracting sequence pooled embeddings
+
+    //                          // size               | idx | val
+    llama_token  *  token;      // [n_tokens]         | i   | id, token
+    float        *  embd;       // [n_embd, n_tokens] | i   | embd
+    llama_pos    *  pos;        // [n_tokens]         | i   | pos
+    int32_t      *  n_seq_id;   // [n_tokens]         | i   | -
+    llama_seq_id ** seq_id;     // [n_tokens]         | s   | s0, s1, seq_id
+    llama_seq_id *  seq_id_unq; // [n_seqs_unq]       | s   | seq_id
+    int32_t      *  seq_idx;    // [LLAMA_MAX_SEQ]    | -   | seq_idx
+    int8_t       *  output;     // [n_tokens]         | i   | -
 };
 
-
-
+// a helper for sanitizing, fulfilling and splitting a batch
+class llama_batch_allocr {
+public:
+    llama_batch_allocr(uint32_t n_pos_per_embd);
 
-
+    // sanitize and auto-gen missing data in the input batch
+    // memory is optional. if provided will be used to check for sequence continuity and to determine the positions
+    bool init(
+            const llama_batch & batch_inp,
+            const llama_vocab & vocab,
+            const llama_memory_i * memory,
+            uint32_t n_embd,
+            bool output_all);
 
-
-    size_t length;
-};
+    const llama_batch & get_batch() const;
 
-
-
-    // tokens left in this batch
-    size_t n_tokens;
+    uint32_t get_n_tokens()  const;
+    uint32_t get_n_outputs() const;
 
-
+    // the array of output indices in the order they were encountered during the ubatch splitting
+    std::vector<int32_t> & get_out_ids();
 
-
+    // min/max positions of each sequence in the current ubatch
+    llama_pos seq_pos_min(llama_seq_id seq_id) const;
+    llama_pos seq_pos_max(llama_seq_id seq_id) const;
 
-    //
-
-    // batch indices of the output
-    std::vector<int64_t> out_ids;
-    std::vector<llama_sbatch_seq> seq;
+    // call once before splitting the batch to reset the internal state
+    void split_reset();
 
-
+    // simple split, unknown number of sequence sets of unequal lengths
+    llama_ubatch split_simple(uint32_t n_ubatch);
 
-    //
-
-    std::vector<float> ubatch_embd;
-    std::vector<llama_pos> ubatch_pos;
-    std::vector<int32_t> ubatch_n_seq_id;
-    std::vector<llama_seq_id *> ubatch_seq_id;
-    std::vector<int8_t> ubatch_output;
+    // make ubatches of equal-length sequences sets
+    llama_ubatch split_equal(uint32_t n_ubatch);
 
-
+    // sequence-set-wise split - each ubatch contains a single sequence-set
+    llama_ubatch split_seq(uint32_t n_ubatch);
 
-
+    // a helper method for creating a well-defined ubatch of tokens
+    // TODO: support embeddings if needed in the future
+    llama_ubatch ubatch_reserve(uint32_t n_seq_tokens, uint32_t n_seqs);
 
-
-
+private:
+    void clear();
 
-    //
-    llama_ubatch
+    // create the next ubatch based on the provided batch indices (idxs) and the number of sequence sets (n_seqs)
+    // return llama_ubatch.n_tokens == 0 if the entire batch was consumed
+    llama_ubatch ubatch_add(const std::vector<int32_t> & idxs, uint32_t n_seqs, bool equal_seqs);
 
-    //
-
+    // for debugging, start with LLAMA_BATCH_DEBUG=2
+    void ubatch_print(const llama_ubatch & ubatch, int debug);
 
-
-
-
+    llama_batch batch;
+
+    // only for debugging purposes
+    const llama_vocab * vocab;
+
+    // TODO: this is more of a temporary solution until we have a better way to handle multiple positions per token/embd
+    // ref: https://github.com/ggml-org/llama.cpp/issues/13694#issuecomment-2983871762
+    const uint32_t n_pos_per_embd;
 
-
-
-    struct llama_batch batch;
+    uint32_t n_embd;
+    uint32_t n_outputs;
 
     std::array<llama_seq_id, 1> seq_id_0 = { 0 }; // default sequence id
+
     std::vector<llama_pos>      pos;
     std::vector<int32_t>        n_seq_id;
     std::vector<llama_seq_id *> seq_id;
-    std::vector<
+    std::vector<llama_seq_id>   seq_id_unq;
+    std::vector<int32_t>        seq_idx;
+    std::vector<int8_t>         output;
+
+    using pos_set_t = std::set<llama_pos>;
+    using seq_cpl_t = std::vector<bool>;
+
+    std::vector<pos_set_t> seq_pos; // seq_pos[s]: the set of positions in sequence s
+    std::vector<seq_cpl_t> seq_cpl; // seq_cpl[s0][s1]: if sequence s0 is coupled to sequence s1
+
+    using idx_vec_t = std::vector<int32_t>;
+    using seq_set_t = std::bitset<LLAMA_MAX_SEQ>;
 
-    //
-
+    std::vector<seq_set_t> seq_set; // seq_set[i]: the sequence set of token i
+
+    std::unordered_map<seq_set_t, idx_vec_t> seq_set_map; // the indices at which the sequence set appears
+
+    // batch indices of the output
+    std::vector<int32_t> out_ids;
+
+    // used[i] indicates if token i has already been used in a previous ubatch
+    std::vector<bool> used;
+
+    // llama_ubatch points to this data:
+    struct ubatch {
+        std::vector<llama_token>    token;
+        std::vector<float>          embd;
+        std::vector<llama_pos>      pos;
+        std::vector<int32_t>        n_seq_id;
+        std::vector<llama_seq_id *> seq_id;
+        std::vector<llama_seq_id>   seq_id_unq;
+        std::vector<int32_t>        seq_idx;
+        std::vector<int8_t>         output;
+    };
+
+    // current splitting state:
+    std::vector<ubatch> ubatches;
+
+    int debug;
 };
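Read together, the comments above imply a reset-then-consume protocol: `init` sanitizes the logical batch, `split_reset` clears the splitting state, and repeated `split_*` calls hand out ubatches until an empty one signals exhaustion. A hypothetical driver loop under those assumptions; the `batch`, `vocab`, and size arguments are placeholders, and this API is internal to the llama.cpp core, not part of the package's public surface:

```cpp
#include "llama-batch.h"

// Hypothetical consumer of the new allocator API -- inferred from the
// declarations above, not taken from the package.
static bool consume_batch(const llama_batch & batch, const llama_vocab & vocab,
                          uint32_t n_embd, uint32_t n_ubatch) {
    llama_batch_allocr balloc(/*n_pos_per_embd =*/ 1);

    // sanitize the batch; with no memory module, positions must come from the batch itself
    if (!balloc.init(batch, vocab, /*memory =*/ nullptr, n_embd, /*output_all =*/ false)) {
        return false;
    }

    balloc.split_reset(); // must precede any split_* call

    while (true) {
        // simple split: consecutive chunks of at most n_ubatch tokens
        llama_ubatch ubatch = balloc.split_simple(n_ubatch);
        if (ubatch.n_tokens == 0) {
            break; // entire batch consumed (see the ubatch_add comment)
        }
        // ... run the ubatch through the model graph ...
    }

    return true;
}
```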
package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-context.h
CHANGED
@@ -1,7 +1,6 @@
 #pragma once
 
 #include "llama.h"
-#include "llama-batch.h"
 #include "llama-cparams.h"
 #include "llama-graph.h"
 #include "llama-adapter.h"
@@ -13,11 +12,14 @@
 #include <vector>
 
 struct llama_model;
-
+class llama_batch_allocr;
 
 class llama_io_read_i;
 class llama_io_write_i;
 
+struct llama_memory_i;
+struct llama_memory_context_i;
+
 struct llama_context {
     // init scheduler and compute buffers, reserve worst-case graphs
     llama_context(
@@ -44,10 +46,12 @@ struct llama_context {
     uint32_t n_threads() const;
     uint32_t n_threads_batch() const;
 
-
-    const llama_kv_cache * get_kv_self() const;
+    llama_memory_t get_memory() const;
 
-
+    // return true of the KV cache was updated
+    // TODO: remove
+    bool kv_self_update(bool optimize);
+    void kv_self_defrag_sched();
 
     enum llama_pooling_type pooling_type() const;
 
@@ -88,8 +92,18 @@ struct llama_context {
             int32_t il_start,
             int32_t il_end);
 
-
-
+    // process a single ubatch with a specific graph type
+    // if memory_context is provided, it will be applied first to the context's memory
+    // ret contains the status of the graph computation
+    // returns nullptr only if ret != LM_GGML_STATUS_SUCCESS
+    llm_graph_result_ptr process_ubatch(
+            const llama_ubatch & ubatch,
+            llm_graph_type gtype,
+            llama_memory_context_i * mctx,
+            lm_ggml_status & ret);
+
+    int encode(const llama_batch & batch_inp);
+    int decode(const llama_batch & batch_inp);
 
     //
     // state save/load
@@ -167,7 +181,7 @@ private:
 
     // Make sure enough space is available for outputs.
    // Returns max number of outputs for which space was reserved.
-
+    uint32_t output_reserve(int32_t n_outputs);
 
     //
     // graph
@@ -180,16 +194,18 @@ public:
     lm_ggml_cgraph * graph_init();
 
     // returns the result of lm_ggml_backend_sched_graph_compute_async execution
-    lm_ggml_status graph_compute(
-
-
+    lm_ggml_status graph_compute(lm_ggml_cgraph * gf, bool batched);
+
+    // reserve a graph with a dummy ubatch of the specified size
+    lm_ggml_cgraph * graph_reserve(uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx);
 
 private:
     llm_graph_result_ptr graph_build(
-
-
-
-
+            lm_ggml_context * ctx,
+            lm_ggml_cgraph * gf,
+            const llama_ubatch & ubatch,
+            llm_graph_type gtype,
+            const llama_memory_context_i * mctx);
 
     llm_graph_cb graph_get_cb() const;
 
@@ -214,6 +230,9 @@ private:
 
     std::unique_ptr<llama_memory_i> memory;
 
+    // TODO: temporary, until the llama_kv_self_defrag() API is removed
+    bool memory_force_optimize = false;
+
     // decode output (2-dimensional array: [n_outputs][n_vocab])
     size_t logits_size = 0; // capacity (of floats) for logits
     float * logits = nullptr;
@@ -227,8 +246,10 @@ private:
     // populated only when pooling_type != LLAMA_POOLING_TYPE_NONE
     std::map<llama_seq_id, std::vector<float>> embd_seq;
 
-
-
+    // reuse the batch_allocr to avoid unnecessary memory allocations
+    std::unique_ptr<llama_batch_allocr> balloc;
+
+    uint32_t n_outputs = 0; // number of actually-used outputs in the current ubatch or last logical batch
 
     std::vector<int32_t> output_ids; // map batch token positions to ids of the logits and embd buffers
 
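The new `balloc` member and `process_ubatch` entry point tie the two headers together: `decode` now splits the logical batch with `llama_batch_allocr` and pushes each ubatch through the graph. A speculative end-to-end sketch, not the package's implementation; it uses only names declared in these hunks plus `LLM_GRAPH_TYPE_DECODER` from llama-graph.h, and the error codes are invented:

```cpp
#include "llama-batch.h"
#include "llama-context.h"

// Speculative decode flow, not the package's code.
static int decode_flow(llama_context & ctx, llama_batch_allocr & balloc, uint32_t n_ubatch) {
    balloc.split_reset();

    while (true) {
        llama_ubatch ubatch = balloc.split_simple(n_ubatch);
        if (ubatch.n_tokens == 0) {
            break; // logical batch fully consumed
        }

        lm_ggml_status status;
        // nullptr memory context: operate on the context's current memory state
        llm_graph_result_ptr res = ctx.process_ubatch(ubatch, LLM_GRAPH_TYPE_DECODER, nullptr, status);
        if (res == nullptr) {
            return -1; // graph computation failed; `status` holds the lm_ggml error
        }
    }

    // logits / embeddings are then read back via output_ids
    return 0;
}
```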