cui-llama.rn 1.7.3 → 1.7.6
This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.
- package/README.md +217 -17
- package/android/src/main/CMakeLists.txt +34 -15
- package/android/src/main/java/com/rnllama/LlamaContext.java +94 -8
- package/android/src/main/java/com/rnllama/RNLlama.java +247 -0
- package/android/src/main/jni.cpp +213 -14
- package/android/src/main/jniLibs/arm64-v8a/librnllama.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod_i8mm.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_i8mm.so +0 -0
- package/android/src/main/jniLibs/x86_64/librnllama.so +0 -0
- package/android/src/main/jniLibs/x86_64/librnllama_x86_64.so +0 -0
- package/android/src/newarch/java/com/rnllama/RNLlamaModule.java +35 -0
- package/android/src/oldarch/java/com/rnllama/RNLlamaModule.java +34 -0
- package/cpp/README.md +1 -1
- package/cpp/chat-parser.cpp +385 -0
- package/cpp/chat-parser.h +120 -0
- package/cpp/chat.cpp +726 -596
- package/cpp/chat.h +71 -6
- package/cpp/common.cpp +56 -38
- package/cpp/common.h +9 -3
- package/cpp/ggml-backend-reg.cpp +5 -0
- package/cpp/ggml-backend.cpp +10 -2
- package/cpp/ggml-common.h +4 -0
- package/cpp/ggml-cpu/amx/amx.cpp +1 -1
- package/cpp/ggml-cpu/amx/mmq.cpp +11 -10
- package/cpp/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
- package/cpp/ggml-cpu/arch/arm/quants.c +4114 -0
- package/cpp/ggml-cpu/arch/arm/repack.cpp +2163 -0
- package/cpp/ggml-cpu/arch/x86/cpu-feats.cpp +327 -0
- package/cpp/ggml-cpu/arch/x86/quants.c +4311 -0
- package/cpp/ggml-cpu/{ggml-cpu-aarch64.cpp → arch/x86/repack.cpp} +79 -3225
- package/cpp/ggml-cpu/arch-fallback.h +184 -0
- package/cpp/ggml-cpu/common.h +4 -3
- package/cpp/ggml-cpu/ggml-cpu-impl.h +21 -16
- package/cpp/ggml-cpu/ggml-cpu.c +123 -104
- package/cpp/ggml-cpu/ggml-cpu.cpp +11 -8
- package/cpp/ggml-cpu/ops.cpp +330 -148
- package/cpp/ggml-cpu/ops.h +1 -0
- package/cpp/ggml-cpu/quants.c +1158 -0
- package/cpp/ggml-cpu/{ggml-cpu-quants.h → quants.h} +26 -0
- package/cpp/ggml-cpu/repack.cpp +1571 -0
- package/cpp/ggml-cpu/repack.h +98 -0
- package/cpp/ggml-cpu/simd-mappings.h +330 -38
- package/cpp/ggml-cpu/{ggml-cpu-traits.cpp → traits.cpp} +1 -1
- package/cpp/ggml-cpu/vec.cpp +87 -18
- package/cpp/ggml-cpu/vec.h +249 -94
- package/cpp/ggml-cpu.h +1 -0
- package/cpp/ggml-impl.h +63 -183
- package/cpp/ggml-llama-sim.metallib +0 -0
- package/cpp/ggml-llama.metallib +0 -0
- package/cpp/ggml-metal.m +152 -45
- package/cpp/ggml-quants.c +0 -2
- package/cpp/ggml.c +61 -21
- package/cpp/ggml.h +22 -3
- package/cpp/gguf.cpp +24 -3
- package/cpp/json-partial.cpp +256 -0
- package/cpp/json-partial.h +38 -0
- package/cpp/json-schema-to-grammar.cpp +5 -47
- package/cpp/json-schema-to-grammar.h +4 -4
- package/cpp/llama-arch.cpp +153 -3
- package/cpp/llama-arch.h +27 -1
- package/cpp/llama-batch.cpp +741 -272
- package/cpp/llama-batch.h +112 -54
- package/cpp/llama-chat.cpp +30 -8
- package/cpp/llama-chat.h +1 -0
- package/cpp/llama-context.cpp +524 -339
- package/cpp/llama-context.h +38 -17
- package/cpp/llama-cparams.cpp +4 -0
- package/cpp/llama-cparams.h +2 -0
- package/cpp/llama-grammar.cpp +12 -2
- package/cpp/llama-graph.cpp +431 -356
- package/cpp/llama-graph.h +126 -58
- package/cpp/llama-hparams.cpp +10 -2
- package/cpp/llama-hparams.h +19 -2
- package/cpp/llama-kv-cache-unified-iswa.cpp +279 -0
- package/cpp/llama-kv-cache-unified-iswa.h +128 -0
- package/cpp/llama-kv-cache-unified.cpp +1841 -0
- package/cpp/llama-kv-cache-unified.h +303 -0
- package/cpp/llama-kv-cells.h +439 -0
- package/cpp/llama-memory-hybrid.cpp +246 -0
- package/cpp/llama-memory-hybrid.h +138 -0
- package/cpp/llama-memory-recurrent.cpp +1112 -0
- package/cpp/llama-memory-recurrent.h +183 -0
- package/cpp/llama-memory.cpp +41 -0
- package/cpp/llama-memory.h +86 -5
- package/cpp/llama-mmap.cpp +1 -1
- package/cpp/llama-model-loader.cpp +42 -17
- package/cpp/llama-model-saver.cpp +1 -0
- package/cpp/llama-model.cpp +1639 -513
- package/cpp/llama-model.h +26 -0
- package/cpp/llama-sampling.cpp +2 -2
- package/cpp/llama-vocab.cpp +65 -28
- package/cpp/llama-vocab.h +1 -0
- package/cpp/llama.cpp +11 -7
- package/cpp/llama.h +150 -42
- package/cpp/minja/chat-template.hpp +1 -1
- package/cpp/minja/minja.hpp +1 -1
- package/cpp/{json.hpp → nlohmann/json.hpp} +3027 -2267
- package/cpp/nlohmann/json_fwd.hpp +187 -0
- package/cpp/regex-partial.cpp +204 -0
- package/cpp/regex-partial.h +56 -0
- package/cpp/rn-llama.cpp +646 -35
- package/cpp/rn-llama.h +32 -1
- package/cpp/rn-tts.h +39 -0
- package/cpp/sampling.cpp +7 -8
- package/cpp/tools/mtmd/clip-impl.h +5 -0
- package/cpp/tools/mtmd/clip.cpp +572 -436
- package/cpp/tools/mtmd/clip.h +14 -4
- package/cpp/tools/mtmd/mtmd-audio.cpp +0 -86
- package/cpp/tools/mtmd/mtmd-audio.h +2 -17
- package/cpp/tools/mtmd/mtmd-helper.cpp +175 -12
- package/cpp/tools/mtmd/mtmd-helper.h +91 -0
- package/cpp/tools/mtmd/mtmd.cpp +368 -248
- package/cpp/tools/mtmd/mtmd.h +6 -70
- package/cpp/unicode.cpp +5 -0
- package/ios/CMakeLists.txt +26 -6
- package/ios/RNLlama.h +1 -1
- package/ios/RNLlama.mm +153 -3
- package/ios/RNLlamaContext.h +9 -1
- package/ios/RNLlamaContext.mm +112 -9
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/chat-parser.h +120 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/chat.h +71 -6
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/common.h +9 -3
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-common.h +4 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu.h +1 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-impl.h +63 -183
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml.h +22 -3
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/json-partial.h +38 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/json-schema-to-grammar.h +4 -4
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-arch.h +27 -1
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-batch.h +112 -54
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-chat.h +1 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-context.h +38 -17
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-cparams.h +2 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-graph.h +126 -58
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-hparams.h +19 -2
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cache-unified-iswa.h +128 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cache-unified.h +303 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cells.h +439 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-memory-hybrid.h +138 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-memory-recurrent.h +183 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-memory.h +86 -5
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-model.h +26 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-vocab.h +1 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama.h +150 -42
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/minja/chat-template.hpp +1 -1
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/minja/minja.hpp +1 -1
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/{json.hpp → nlohmann/json.hpp} +3027 -2267
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/nlohmann/json_fwd.hpp +187 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/regex-partial.h +56 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/rn-llama.h +32 -1
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/rn-tts.h +39 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/ggml-llama.metallib +0 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/rnllama +0 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/chat-parser.h +120 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/chat.h +71 -6
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/common.h +9 -3
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-common.h +4 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu.h +1 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-impl.h +63 -183
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml.h +22 -3
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/json-partial.h +38 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/json-schema-to-grammar.h +4 -4
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-arch.h +27 -1
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-batch.h +112 -54
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-chat.h +1 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-context.h +38 -17
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-cparams.h +2 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-graph.h +126 -58
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-hparams.h +19 -2
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache-unified-iswa.h +128 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache-unified.h +303 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cells.h +439 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory-hybrid.h +138 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory-recurrent.h +183 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory.h +86 -5
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model.h +26 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-vocab.h +1 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama.h +150 -42
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/minja/chat-template.hpp +1 -1
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/minja/minja.hpp +1 -1
- package/ios/rnllama.xcframework/{tvos-arm64/rnllama.framework/Headers → ios-arm64_x86_64-simulator/rnllama.framework/Headers/nlohmann}/json.hpp +3027 -2267
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/nlohmann/json_fwd.hpp +187 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/regex-partial.h +56 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/rn-llama.h +32 -1
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/rn-tts.h +39 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/ggml-llama-sim.metallib +0 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/rnllama +0 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/chat-parser.h +120 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/chat.h +71 -6
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/common.h +9 -3
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-common.h +4 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu.h +1 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-impl.h +63 -183
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml.h +22 -3
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/json-partial.h +38 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/json-schema-to-grammar.h +4 -4
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-arch.h +27 -1
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-batch.h +112 -54
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-chat.h +1 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-context.h +38 -17
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-cparams.h +2 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-graph.h +126 -58
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-hparams.h +19 -2
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cache-unified-iswa.h +128 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cache-unified.h +303 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cells.h +439 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-memory-hybrid.h +138 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-memory-recurrent.h +183 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-memory.h +86 -5
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-model.h +26 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-vocab.h +1 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama.h +150 -42
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/minja/chat-template.hpp +1 -1
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/minja/minja.hpp +1 -1
- package/ios/rnllama.xcframework/{ios-arm64_x86_64-simulator/rnllama.framework/Headers → tvos-arm64/rnllama.framework/Headers/nlohmann}/json.hpp +3027 -2267
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/nlohmann/json_fwd.hpp +187 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/regex-partial.h +56 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/rn-llama.h +32 -1
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/rn-tts.h +39 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/ggml-llama.metallib +0 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/rnllama +0 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/chat-parser.h +120 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/chat.h +71 -6
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/common.h +9 -3
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-common.h +4 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu.h +1 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-impl.h +63 -183
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml.h +22 -3
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/json-partial.h +38 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/json-schema-to-grammar.h +4 -4
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-arch.h +27 -1
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-batch.h +112 -54
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-chat.h +1 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-context.h +38 -17
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-cparams.h +2 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-graph.h +126 -58
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-hparams.h +19 -2
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache-unified-iswa.h +128 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache-unified.h +303 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cells.h +439 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory-hybrid.h +138 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory-recurrent.h +183 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory.h +86 -5
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model.h +26 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-vocab.h +1 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama.h +150 -42
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/minja/chat-template.hpp +1 -1
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/minja/minja.hpp +1 -1
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/nlohmann/json.hpp +25526 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/nlohmann/json_fwd.hpp +187 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/regex-partial.h +56 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/rn-llama.h +32 -1
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/rn-tts.h +39 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/ggml-llama-sim.metallib +0 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/rnllama +0 -0
- package/jest/mock.js +24 -0
- package/package.json +1 -1
- package/src/NativeRNLlama.ts +46 -2
- package/src/index.ts +105 -1
- package/cpp/ggml-cpu/ggml-cpu-aarch64.h +0 -8
- package/cpp/ggml-cpu/ggml-cpu-quants.c +0 -13326
- package/cpp/ggml-cpu/sgemm.cpp +0 -3544
- package/cpp/ggml-cpu/sgemm.h +0 -14
- package/cpp/llama-kv-cache.cpp +0 -2827
- package/cpp/llama-kv-cache.h +0 -515
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cache.h +0 -515
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache.h +0 -515
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cache.h +0 -515
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/json.hpp +0 -24766
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache.h +0 -515
- /package/cpp/ggml-cpu/{ggml-cpu-traits.h → traits.h} +0 -0
- /package/cpp/tools/mtmd/{miniaudio.h → miniaudio/miniaudio.h} +0 -0
- /package/cpp/tools/mtmd/{stb_image.h → stb/stb_image.h} +0 -0
package/cpp/ggml-cpu/ggml-cpu.c
CHANGED
@@ -3,11 +3,11 @@
 
 #include "ggml-backend-impl.h"
 #include "ggml-backend.h"
-#include "
+#include "traits.h"
 #include "ggml-cpu-impl.h"
 #include "ggml-cpu.h"
 #include "ggml-impl.h"
-#include "
+#include "quants.h"
 #include "ggml-threading.h"
 #include "unary-ops.h"
 #include "binary-ops.h"
@@ -72,15 +72,13 @@
 #define UNUSED LM_GGML_UNUSED
 #define SWAP(x, y, T) do { T SWAP = x; (x) = y; (y) = SWAP; } while (0)
 
+// precomputed f32 table for f16 (256 KB) (simd-mappings.h)
+float lm_ggml_table_f32_f16[1 << 16];
+
 #if defined(__ARM_ARCH)
 struct lm_ggml_arm_arch_features_type {
-    int has_neon;
-    int has_dotprod;
-    int has_i8mm;
-    int has_sve;
     int sve_cnt;
-
-} lm_ggml_arm_arch_features = {-1, -1, -1, -1, 0, -1};
+} lm_ggml_arm_arch_features = { 0 };
 #endif
 
 
@@ -270,7 +268,11 @@ static const struct lm_ggml_type_traits_cpu type_traits_cpu[LM_GGML_TYPE_COUNT]
         .from_float = quantize_row_q4_K,
         .vec_dot = lm_ggml_vec_dot_q4_K_q8_K,
         .vec_dot_type = LM_GGML_TYPE_Q8_K,
+#if defined (__ARM_FEATURE_MATMUL_INT8)
+        .nrows = 2,
+#else
         .nrows = 1,
+#endif
     },
     [LM_GGML_TYPE_Q5_K] = {
         .from_float = quantize_row_q5_K,
@@ -555,6 +557,14 @@ void lm_ggml_barrier(struct lm_ggml_threadpool * tp) {
 #endif
 }
 
+void lm_ggml_threadpool_chunk_set(struct lm_ggml_threadpool * tp, int value) {
+    atomic_store_explicit(&tp->current_chunk, value, memory_order_relaxed);
+}
+
+int lm_ggml_threadpool_chunk_add(struct lm_ggml_threadpool * tp, int value) {
+    return atomic_fetch_add_explicit(&tp->current_chunk, value, memory_order_relaxed);
+}
+
 #if defined(__gnu_linux__)
 static cpu_set_t lm_ggml_get_numa_affinity(void) {
     cpu_set_t cpuset;
@@ -666,87 +676,15 @@ bool lm_ggml_is_numa(void) {
 
 #if defined(__linux__) && defined(__aarch64__)
 #include <sys/auxv.h>
-#elif defined(__APPLE__)
-#include <sys/sysctl.h>
-#endif
-
-#if !defined(HWCAP2_I8MM)
-#define HWCAP2_I8MM (1 << 13)
-#endif
-
-#if !defined(HWCAP2_SME)
-#define HWCAP2_SME (1 << 23)
 #endif
 
 static void lm_ggml_init_arm_arch_features(void) {
-#if defined(__linux__) && defined(__aarch64__)
-    uint32_t hwcap = getauxval(AT_HWCAP);
-    uint32_t hwcap2 = getauxval(AT_HWCAP2);
-
-    lm_ggml_arm_arch_features.has_neon = !!(hwcap & HWCAP_ASIMD);
-    lm_ggml_arm_arch_features.has_dotprod = !!(hwcap & HWCAP_ASIMDDP);
-    lm_ggml_arm_arch_features.has_i8mm = !!(hwcap2 & HWCAP2_I8MM);
-    lm_ggml_arm_arch_features.has_sve = !!(hwcap & HWCAP_SVE);
-    lm_ggml_arm_arch_features.has_sme = !!(hwcap2 & HWCAP2_SME);
-
-#if defined(__ARM_FEATURE_SVE)
+#if defined(__linux__) && defined(__aarch64__) && defined(__ARM_FEATURE_SVE)
     lm_ggml_arm_arch_features.sve_cnt = PR_SVE_VL_LEN_MASK & prctl(PR_SVE_GET_VL);
 #endif
-#elif defined(__APPLE__)
-    int oldp = 0;
-    size_t size = sizeof(oldp);
-    if (sysctlbyname("hw.optional.AdvSIMD", &oldp, &size, NULL, 0) != 0) {
-        oldp = 0;
-    }
-    lm_ggml_arm_arch_features.has_neon = oldp;
-
-    if (sysctlbyname("hw.optional.arm.FEAT_DotProd", &oldp, &size, NULL, 0) != 0) {
-        oldp = 0;
-    }
-    lm_ggml_arm_arch_features.has_dotprod = oldp;
-
-    if (sysctlbyname("hw.optional.arm.FEAT_I8MM", &oldp, &size, NULL, 0) != 0) {
-        oldp = 0;
-    }
-    lm_ggml_arm_arch_features.has_i8mm = oldp;
-
-    if (sysctlbyname("hw.optional.arm.FEAT_SME", &oldp, &size, NULL, 0) != 0) {
-        oldp = 0;
-    }
-    lm_ggml_arm_arch_features.has_sme = oldp;
-
-    lm_ggml_arm_arch_features.has_sve = 0;
-    lm_ggml_arm_arch_features.sve_cnt = 0;
-#else
-    // Run-time CPU feature detection not implemented for this platform, fallback to compile time
-#if defined(__ARM_NEON)
-    lm_ggml_arm_arch_features.has_neon = 1;
-#else
-    lm_ggml_arm_arch_features.has_neon = 0;
-#endif
-
-#if defined(__ARM_FEATURE_MATMUL_INT8)
-    lm_ggml_arm_arch_features.has_i8mm = 1;
-#else
-    lm_ggml_arm_arch_features.has_i8mm = 0;
-#endif
-
-#if defined(__ARM_FEATURE_SVE)
-    lm_ggml_arm_arch_features.has_sve = 1;
-    lm_ggml_arm_arch_features.sve_cnt = 16;
-#else
-    lm_ggml_arm_arch_features.has_sve = 0;
-    lm_ggml_arm_arch_features.sve_cnt = 0;
-#endif
-
-#if defined(__ARM_FEATURE_SME) || defined(__ARM_FEATURE_SME2)
-    lm_ggml_arm_arch_features.has_sme = 1;
-#else
-    lm_ggml_arm_arch_features.has_sme = 0;
-#endif
-#endif
 }
-
+
+#endif // __ARM_ARCH
 
 struct lm_ggml_tensor * lm_ggml_new_i32(struct lm_ggml_context * ctx, int32_t value) {
     LM_GGML_ASSERT(!lm_ggml_get_no_alloc(ctx));
@@ -801,7 +739,7 @@ struct lm_ggml_tensor * lm_ggml_set_i32 (struct lm_ggml_tensor * tensor, int32_t
             {
                 assert(tensor->nb[0] == sizeof(lm_ggml_fp16_t));
                 for (int i = 0; i < n; i++) {
-                    lm_ggml_vec_set_f16(nc, (lm_ggml_fp16_t *)(data + i*n1),
+                    lm_ggml_vec_set_f16(nc, (lm_ggml_fp16_t *)(data + i*n1), LM_GGML_CPU_FP32_TO_FP16(value));
                 }
             } break;
         case LM_GGML_TYPE_BF16:
@@ -860,7 +798,7 @@ struct lm_ggml_tensor * lm_ggml_set_f32(struct lm_ggml_tensor * tensor, float va
            {
                 assert(tensor->nb[0] == sizeof(lm_ggml_fp16_t));
                 for (int i = 0; i < n; i++) {
-                    lm_ggml_vec_set_f16(nc, (lm_ggml_fp16_t *)(data + i*n1),
+                    lm_ggml_vec_set_f16(nc, (lm_ggml_fp16_t *)(data + i*n1), LM_GGML_CPU_FP32_TO_FP16(value));
                 }
             } break;
         case LM_GGML_TYPE_BF16:
@@ -911,7 +849,7 @@ int32_t lm_ggml_get_i32_1d(const struct lm_ggml_tensor * tensor, int i) {
         case LM_GGML_TYPE_F16:
             {
                 LM_GGML_ASSERT(tensor->nb[0] == sizeof(lm_ggml_fp16_t));
-                return
+                return LM_GGML_CPU_FP16_TO_FP32(((lm_ggml_fp16_t *)(tensor->data))[i]);
             }
         case LM_GGML_TYPE_BF16:
             {
@@ -956,7 +894,7 @@ void lm_ggml_set_i32_1d(const struct lm_ggml_tensor * tensor, int i, int32_t val
         case LM_GGML_TYPE_F16:
             {
                 LM_GGML_ASSERT(tensor->nb[0] == sizeof(lm_ggml_fp16_t));
-                ((lm_ggml_fp16_t *)(tensor->data))[i] =
+                ((lm_ggml_fp16_t *)(tensor->data))[i] = LM_GGML_CPU_FP32_TO_FP16(value);
             } break;
         case LM_GGML_TYPE_BF16:
             {
@@ -985,7 +923,7 @@ int32_t lm_ggml_get_i32_nd(const struct lm_ggml_tensor * tensor, int i0, int i1,
         case LM_GGML_TYPE_I32:
             return ((int32_t *) data)[0];
         case LM_GGML_TYPE_F16:
-            return
+            return LM_GGML_CPU_FP16_TO_FP32(((lm_ggml_fp16_t *) data)[0]);
         case LM_GGML_TYPE_BF16:
             return LM_GGML_BF16_TO_FP32(((lm_ggml_bf16_t *) data)[0]);
         case LM_GGML_TYPE_F32:
@@ -1012,7 +950,7 @@ void lm_ggml_set_i32_nd(const struct lm_ggml_tensor * tensor, int i0, int i1, in
             } break;
         case LM_GGML_TYPE_F16:
             {
-                ((lm_ggml_fp16_t *)(data))[0] =
+                ((lm_ggml_fp16_t *)(data))[0] = LM_GGML_CPU_FP32_TO_FP16(value);
             } break;
         case LM_GGML_TYPE_BF16:
             {
@@ -1050,7 +988,7 @@ float lm_ggml_get_f32_1d(const struct lm_ggml_tensor * tensor, int i) {
             }
         case LM_GGML_TYPE_F16:
             {
-                return
+                return LM_GGML_CPU_FP16_TO_FP32(((lm_ggml_fp16_t *)(tensor->data))[i]);
             }
         case LM_GGML_TYPE_BF16:
             {
@@ -1089,7 +1027,7 @@ void lm_ggml_set_f32_1d(const struct lm_ggml_tensor * tensor, int i, float value
             } break;
         case LM_GGML_TYPE_F16:
             {
-                ((lm_ggml_fp16_t *)(tensor->data))[i] =
+                ((lm_ggml_fp16_t *)(tensor->data))[i] = LM_GGML_CPU_FP32_TO_FP16(value);
             } break;
         case LM_GGML_TYPE_BF16:
             {
@@ -1116,7 +1054,7 @@ float lm_ggml_get_f32_nd(const struct lm_ggml_tensor * tensor, int i0, int i1, i
         case LM_GGML_TYPE_I32:
             return ((int32_t *) data)[0];
         case LM_GGML_TYPE_F16:
-            return
+            return LM_GGML_CPU_FP16_TO_FP32(((lm_ggml_fp16_t *) data)[0]);
         case LM_GGML_TYPE_BF16:
             return LM_GGML_BF16_TO_FP32(((lm_ggml_bf16_t *) data)[0]);
         case LM_GGML_TYPE_F32:
@@ -1143,7 +1081,7 @@ void lm_ggml_set_f32_nd(const struct lm_ggml_tensor * tensor, int i0, int i1, in
             } break;
         case LM_GGML_TYPE_F16:
             {
-                ((lm_ggml_fp16_t *)(data))[0] =
+                ((lm_ggml_fp16_t *)(data))[0] = LM_GGML_CPU_FP32_TO_FP16(value);
             } break;
         case LM_GGML_TYPE_BF16:
             {
@@ -1955,6 +1893,10 @@ static void lm_ggml_compute_forward(struct lm_ggml_compute_params * params, stru
             {
                 lm_ggml_compute_forward_pad_reflect_1d(params, tensor);
             } break;
+        case LM_GGML_OP_ROLL:
+            {
+                lm_ggml_compute_forward_roll(params, tensor);
+            } break;
         case LM_GGML_OP_ARANGE:
             {
                 lm_ggml_compute_forward_arange(params, tensor);
@@ -2279,6 +2221,7 @@ static int lm_ggml_get_n_tasks(struct lm_ggml_tensor * node, int n_threads) {
         case LM_GGML_OP_UPSCALE:
         case LM_GGML_OP_PAD:
         case LM_GGML_OP_PAD_REFLECT_1D:
+        case LM_GGML_OP_ROLL:
        case LM_GGML_OP_ARANGE:
        case LM_GGML_OP_TIMESTEP_EMBEDDING:
        case LM_GGML_OP_ARGSORT:
@@ -2414,12 +2357,32 @@ static bool lm_ggml_thread_apply_priority(int32_t prio) {
     // This is up to the applications.
     DWORD p = THREAD_PRIORITY_NORMAL;
     switch (prio) {
+        case LM_GGML_SCHED_PRIO_LOW: p = THREAD_PRIORITY_BELOW_NORMAL; break;
        case LM_GGML_SCHED_PRIO_NORMAL: p = THREAD_PRIORITY_NORMAL; break;
        case LM_GGML_SCHED_PRIO_MEDIUM: p = THREAD_PRIORITY_ABOVE_NORMAL; break;
        case LM_GGML_SCHED_PRIO_HIGH: p = THREAD_PRIORITY_HIGHEST; break;
        case LM_GGML_SCHED_PRIO_REALTIME: p = THREAD_PRIORITY_TIME_CRITICAL; break;
    }
 
+    if (prio != LM_GGML_SCHED_PRIO_LOW) {
+        // Tell Windows that this thread should not be throttled (needs its own CPU core).
+        // Newer Windows 11 versions aggresively park (offline) CPU cores and often place
+        // all our threads onto the first 4 cores which results in terrible performance with
+        // n_threads > 4
+#if _WIN32_WINNT >= 0x0602
+        THREAD_POWER_THROTTLING_STATE t;
+        ZeroMemory(&t, sizeof(t));
+        t.Version = THREAD_POWER_THROTTLING_CURRENT_VERSION;
+        t.ControlMask = THREAD_POWER_THROTTLING_EXECUTION_SPEED;
+        t.StateMask = 0;
+
+        if (!SetThreadInformation(GetCurrentThread(), ThreadPowerThrottling, &t, sizeof(t))) {
+            LM_GGML_LOG_DEBUG("failed to disable thread power throttling %d : (%d)\n", prio, (int) GetLastError());
+            return false;
+        }
+#endif
+    }
+
     if (prio == LM_GGML_SCHED_PRIO_NORMAL) {
         // Keep inherited policy/priority
         return true;
@@ -2447,6 +2410,8 @@ static bool lm_ggml_thread_apply_priority(int32_t prio) {
     struct sched_param p;
     int32_t policy = SCHED_OTHER;
     switch (prio) {
+        // TODO: there seems to be no way to set lower prio on Apple platforms
+        case LM_GGML_SCHED_PRIO_LOW: policy = SCHED_OTHER; p.sched_priority = 0; break;
        case LM_GGML_SCHED_PRIO_NORMAL: policy = SCHED_OTHER; p.sched_priority = 0; break;
        case LM_GGML_SCHED_PRIO_MEDIUM: policy = SCHED_FIFO; p.sched_priority = 40; break;
        case LM_GGML_SCHED_PRIO_HIGH: policy = SCHED_FIFO; p.sched_priority = 80; break;
@@ -2503,6 +2468,7 @@ static bool lm_ggml_thread_apply_priority(int32_t prio) {
     struct sched_param p;
     int32_t policy = SCHED_OTHER;
     switch (prio) {
+        case LM_GGML_SCHED_PRIO_LOW: policy = SCHED_BATCH; p.sched_priority = 0; break;
        case LM_GGML_SCHED_PRIO_NORMAL: policy = SCHED_OTHER; p.sched_priority = 0; break;
        case LM_GGML_SCHED_PRIO_MEDIUM: policy = SCHED_FIFO; p.sched_priority = 40; break;
        case LM_GGML_SCHED_PRIO_HIGH: policy = SCHED_FIFO; p.sched_priority = 80; break;
@@ -3178,9 +3144,24 @@ void lm_ggml_cpu_fp32_to_fp16(const float * x, lm_ggml_fp16_t * y, int64_t n) {
         __m128i y_vec = _mm_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
         _mm_storel_epi64((__m128i *)(y + i), y_vec);
     }
+#elif defined(__NNPA__)
+    for (; i + 7 < n; i += 8) {
+        float32x4_t v_xh = vec_xl(0, (const float *)(x + i + 0));
+        float32x4_t v_xl = vec_xl(0, (const float *)(x + i + 4));
+        uint16x8_t v_yd = vec_round_from_fp32(v_xh, v_xl, 0);
+        uint16x8_t v_y = vec_convert_to_fp16(v_yd, 0);
+        vec_xst(v_y, 0, (lm_ggml_fp16_t *)(y + i));
+    }
+    for (; i + 3 < n; i += 4) {
+        float32x4_t v_x = vec_xl(0, (const float *)(x + i));
+        float32x4_t v_zero = vec_splats(0.0f);
+        uint16x8_t v_yd = vec_round_from_fp32(v_x, v_zero, 0);
+        uint16x8_t v_y = vec_convert_to_fp16(v_yd, 0);
+        vec_xst(v_y, 0, (lm_ggml_fp16_t *)(y + i));
+    }
 #endif
     for (; i < n; ++i) {
-        y[i] =
+        y[i] = LM_GGML_CPU_FP32_TO_FP16(x[i]);
     }
 }
 
@@ -3204,9 +3185,25 @@ void lm_ggml_cpu_fp16_to_fp32(const lm_ggml_fp16_t * x, float * y, int64_t n) {
         __m128 y_vec = _mm_cvtph_ps(x_vec);
         _mm_storeu_ps(y + i, y_vec);
     }
+#elif defined(__NNPA__)
+    for (; i + 7 < n; i += 8) {
+        uint16x8_t v_x = vec_xl(0, (const lm_ggml_fp16_t *)(x + i));
+        uint16x8_t v_yd = vec_convert_from_fp16(v_x, 0);
+        float32x4_t v_yh = vec_extend_to_fp32_hi(v_yd, 0);
+        float32x4_t v_yl = vec_extend_to_fp32_lo(v_yd, 0);
+        vec_xst(v_yh, 0, (float *)(y + i + 0));
+        vec_xst(v_yl, 0, (float *)(y + i + 4));
+    }
+    for (; i + 3 < n; i += 4) {
+        uint16x8_t v_x = vec_xl(0, (const lm_ggml_fp16_t *)(x + i));
+        uint16x8_t v_yd = vec_convert_from_fp16(v_x, 0);
+        float32x4_t v_yh = vec_extend_to_fp32_hi(v_yd, 0);
+        vec_xst(v_yh, 0, (float *)(y + i));
+    }
 #endif
+
     for (; i < n; ++i) {
-        y[i] =
+        y[i] = LM_GGML_CPU_FP16_TO_FP32(x[i]);
     }
 }
 
@@ -3406,9 +3403,17 @@ int lm_ggml_cpu_has_vxe(void) {
 #endif
 }
 
+int lm_ggml_cpu_has_nnpa(void) {
+#if defined(LM_GGML_NNPA)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
 int lm_ggml_cpu_has_neon(void) {
 #if defined(__ARM_ARCH) && defined(__ARM_NEON)
-    return
+    return 1;
 #else
     return 0;
 #endif
@@ -3416,7 +3421,7 @@ int lm_ggml_cpu_has_neon(void) {
 
 int lm_ggml_cpu_has_dotprod(void) {
 #if defined(__ARM_ARCH) && defined(__ARM_FEATURE_DOTPROD)
-    return
+    return 1;
 #else
     return 0;
 #endif
@@ -3424,7 +3429,7 @@ int lm_ggml_cpu_has_dotprod(void) {
 
 int lm_ggml_cpu_has_sve(void) {
 #if defined(__ARM_ARCH) && defined(__ARM_FEATURE_SVE)
-    return
+    return 1;
 #else
     return 0;
 #endif
@@ -3432,7 +3437,7 @@ int lm_ggml_cpu_has_sve(void) {
 
 int lm_ggml_cpu_has_matmul_int8(void) {
 #if defined(__ARM_ARCH) && defined(__ARM_FEATURE_MATMUL_INT8)
-    return
+    return 1;
 #else
     return 0;
 #endif
@@ -3448,14 +3453,14 @@ int lm_ggml_cpu_get_sve_cnt(void) {
 
 int lm_ggml_cpu_has_sme(void) {
 #if defined(__ARM_ARCH) && defined(__ARM_FEATURE_SME)
-    return
+    return 1;
 #else
     return 0;
 #endif
 }
 
 void lm_ggml_cpu_init(void) {
-    // needed to initialize
+    // needed to initialize lm_ggml_time
     {
         struct lm_ggml_init_params params = { 0, NULL, false };
         struct lm_ggml_context * ctx = lm_ggml_init(params);
@@ -3476,14 +3481,28 @@ void lm_ggml_cpu_init(void) {
                 uint16_t u16;
                 lm_ggml_fp16_t fp16;
             } u = {i};
-            float f =
-
-
+            float f = LM_GGML_COMPUTE_FP16_TO_FP32(u.fp16);
+            lm_ggml_table_f32_f16[i] = f;
+            lm_ggml_table_gelu_f16[i] = LM_GGML_CPU_FP32_TO_FP16(lm_ggml_gelu_f32(f));
+            lm_ggml_table_gelu_quick_f16[i] = LM_GGML_CPU_FP32_TO_FP16(lm_ggml_gelu_quick_f32(f));
         }
 
         const uint64_t t_end = lm_ggml_time_us(); UNUSED(t_end);
 
         LM_GGML_PRINT_DEBUG("%s: GELU, Quick GELU, SILU and EXP tables initialized in %f ms\n", __func__, (t_end - t_start)/1000.0);
+
+#ifdef LM_GGML_USE_OPENMP
+        //if (!getenv("OMP_WAIT_POLICY")) {
+        //    // set the wait policy to active, so that OpenMP threads don't sleep
+        //    putenv("OMP_WAIT_POLICY=active");
+        //}
+
+        if (!getenv("KMP_BLOCKTIME")) {
+            // set the time to wait before sleeping a thread
+            // this is less aggressive than setting the wait policy to active, but should achieve similar results in most cases
+            putenv("KMP_BLOCKTIME=200"); // 200ms
+        }
+#endif
     }
 
 #if defined(__ARM_ARCH)
package/cpp/ggml-cpu/ggml-cpu.cpp
CHANGED
@@ -1,8 +1,8 @@
 #include "ggml-backend.h"
 #include "ggml-backend-impl.h"
 #include "ggml-cpu.h"
-#include "
-#include "
+#include "repack.h"
+#include "traits.h"
 #include "ggml-impl.h"
 #include "amx/amx.h"
 
@@ -11,7 +11,7 @@
 #include <vector>
 
 #ifdef LM_GGML_USE_CPU_HBM
-#    include "
+#    include "hbm.h"
 #endif
 
 #ifdef LM_GGML_USE_CPU_KLEIDIAI
@@ -51,9 +51,9 @@ std::vector<lm_ggml_backend_buffer_type_t>& lm_ggml_backend_cpu_get_extra_buffer
     }
 #endif
 
-#ifdef
-    if (
-        bufts.push_back(
+#ifdef LM_GGML_USE_CPU_REPACK
+    if (lm_ggml_backend_cpu_repack_buffer_type()) {
+        bufts.push_back(lm_ggml_backend_cpu_repack_buffer_type());
     }
 #endif
 
@@ -578,6 +578,9 @@ static lm_ggml_backend_feature * lm_ggml_backend_cpu_get_features(lm_ggml_backen
     if (lm_ggml_cpu_has_vxe()) {
         features.push_back({ "VXE", "1" });
     }
+    if (lm_ggml_cpu_has_nnpa()) {
+        features.push_back({ "NNPA", "1" });
+    }
     if (lm_ggml_cpu_has_wasm_simd()) {
         features.push_back({ "WASM_SIMD", "1" });
     }
@@ -596,8 +599,8 @@ static lm_ggml_backend_feature * lm_ggml_backend_cpu_get_features(lm_ggml_backen
 #ifdef LM_GGML_USE_CPU_KLEIDIAI
     features.push_back({ "KLEIDIAI", "1" });
 #endif
-#ifdef
-    features.push_back({ "
+#ifdef LM_GGML_USE_CPU_REPACK
+    features.push_back({ "REPACK", "1" });
 #endif
 
     features.push_back({ nullptr, nullptr });