cui-llama.rn 1.7.4 → 1.7.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +217 -17
- package/android/src/main/CMakeLists.txt +34 -15
- package/android/src/main/java/com/rnllama/LlamaContext.java +79 -5
- package/android/src/main/java/com/rnllama/RNLlama.java +237 -0
- package/android/src/main/jni.cpp +213 -14
- package/android/src/main/jniLibs/arm64-v8a/librnllama.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod_i8mm.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_i8mm.so +0 -0
- package/android/src/main/jniLibs/x86_64/librnllama.so +0 -0
- package/android/src/main/jniLibs/x86_64/librnllama_x86_64.so +0 -0
- package/android/src/newarch/java/com/rnllama/RNLlamaModule.java +35 -0
- package/android/src/oldarch/java/com/rnllama/RNLlamaModule.java +34 -0
- package/cpp/README.md +1 -1
- package/cpp/chat-parser.cpp +385 -0
- package/cpp/chat-parser.h +120 -0
- package/cpp/chat.cpp +726 -596
- package/cpp/chat.h +71 -6
- package/cpp/common.cpp +56 -38
- package/cpp/common.h +9 -3
- package/cpp/ggml-backend-reg.cpp +5 -0
- package/cpp/ggml-backend.cpp +10 -2
- package/cpp/ggml-common.h +4 -0
- package/cpp/ggml-cpu/amx/amx.cpp +1 -1
- package/cpp/ggml-cpu/amx/mmq.cpp +11 -10
- package/cpp/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
- package/cpp/ggml-cpu/arch/arm/quants.c +4114 -0
- package/cpp/ggml-cpu/arch/arm/repack.cpp +2163 -0
- package/cpp/ggml-cpu/arch/x86/cpu-feats.cpp +327 -0
- package/cpp/ggml-cpu/arch/x86/quants.c +4311 -0
- package/cpp/ggml-cpu/{ggml-cpu-aarch64.cpp → arch/x86/repack.cpp} +79 -3225
- package/cpp/ggml-cpu/arch-fallback.h +184 -0
- package/cpp/ggml-cpu/common.h +4 -3
- package/cpp/ggml-cpu/ggml-cpu-impl.h +21 -16
- package/cpp/ggml-cpu/ggml-cpu.c +123 -104
- package/cpp/ggml-cpu/ggml-cpu.cpp +11 -8
- package/cpp/ggml-cpu/ops.cpp +330 -148
- package/cpp/ggml-cpu/ops.h +1 -0
- package/cpp/ggml-cpu/quants.c +1158 -0
- package/cpp/ggml-cpu/{ggml-cpu-quants.h → quants.h} +26 -0
- package/cpp/ggml-cpu/repack.cpp +1571 -0
- package/cpp/ggml-cpu/repack.h +98 -0
- package/cpp/ggml-cpu/simd-mappings.h +330 -38
- package/cpp/ggml-cpu/{ggml-cpu-traits.cpp → traits.cpp} +1 -1
- package/cpp/ggml-cpu/vec.cpp +87 -18
- package/cpp/ggml-cpu/vec.h +249 -94
- package/cpp/ggml-cpu.h +1 -0
- package/cpp/ggml-impl.h +63 -183
- package/cpp/ggml-llama-sim.metallib +0 -0
- package/cpp/ggml-llama.metallib +0 -0
- package/cpp/ggml-metal.m +152 -45
- package/cpp/ggml-quants.c +0 -2
- package/cpp/ggml.c +61 -21
- package/cpp/ggml.h +22 -3
- package/cpp/gguf.cpp +24 -3
- package/cpp/json-partial.cpp +256 -0
- package/cpp/json-partial.h +38 -0
- package/cpp/json-schema-to-grammar.cpp +5 -47
- package/cpp/json-schema-to-grammar.h +4 -4
- package/cpp/llama-arch.cpp +153 -3
- package/cpp/llama-arch.h +27 -1
- package/cpp/llama-batch.cpp +741 -272
- package/cpp/llama-batch.h +112 -54
- package/cpp/llama-chat.cpp +30 -8
- package/cpp/llama-chat.h +1 -0
- package/cpp/llama-context.cpp +524 -339
- package/cpp/llama-context.h +38 -17
- package/cpp/llama-cparams.cpp +4 -0
- package/cpp/llama-cparams.h +2 -0
- package/cpp/llama-grammar.cpp +12 -2
- package/cpp/llama-graph.cpp +431 -356
- package/cpp/llama-graph.h +126 -58
- package/cpp/llama-hparams.cpp +10 -2
- package/cpp/llama-hparams.h +19 -2
- package/cpp/llama-kv-cache-unified-iswa.cpp +279 -0
- package/cpp/llama-kv-cache-unified-iswa.h +128 -0
- package/cpp/llama-kv-cache-unified.cpp +1841 -0
- package/cpp/llama-kv-cache-unified.h +303 -0
- package/cpp/llama-kv-cells.h +439 -0
- package/cpp/llama-memory-hybrid.cpp +246 -0
- package/cpp/llama-memory-hybrid.h +138 -0
- package/cpp/llama-memory-recurrent.cpp +1112 -0
- package/cpp/llama-memory-recurrent.h +183 -0
- package/cpp/llama-memory.cpp +41 -0
- package/cpp/llama-memory.h +86 -5
- package/cpp/llama-mmap.cpp +1 -1
- package/cpp/llama-model-loader.cpp +42 -17
- package/cpp/llama-model-saver.cpp +1 -0
- package/cpp/llama-model.cpp +1639 -513
- package/cpp/llama-model.h +26 -0
- package/cpp/llama-sampling.cpp +2 -2
- package/cpp/llama-vocab.cpp +65 -28
- package/cpp/llama-vocab.h +1 -0
- package/cpp/llama.cpp +11 -7
- package/cpp/llama.h +150 -42
- package/cpp/minja/chat-template.hpp +1 -1
- package/cpp/minja/minja.hpp +1 -1
- package/cpp/{json.hpp → nlohmann/json.hpp} +3027 -2267
- package/cpp/nlohmann/json_fwd.hpp +187 -0
- package/cpp/regex-partial.cpp +204 -0
- package/cpp/regex-partial.h +56 -0
- package/cpp/rn-llama.cpp +646 -35
- package/cpp/rn-llama.h +32 -1
- package/cpp/rn-tts.h +39 -0
- package/cpp/sampling.cpp +7 -8
- package/cpp/tools/mtmd/clip-impl.h +5 -0
- package/cpp/tools/mtmd/clip.cpp +572 -436
- package/cpp/tools/mtmd/clip.h +14 -4
- package/cpp/tools/mtmd/mtmd-audio.cpp +0 -86
- package/cpp/tools/mtmd/mtmd-audio.h +2 -17
- package/cpp/tools/mtmd/mtmd-helper.cpp +175 -12
- package/cpp/tools/mtmd/mtmd-helper.h +91 -0
- package/cpp/tools/mtmd/mtmd.cpp +368 -248
- package/cpp/tools/mtmd/mtmd.h +6 -70
- package/cpp/unicode.cpp +5 -0
- package/ios/CMakeLists.txt +26 -6
- package/ios/RNLlama.h +1 -1
- package/ios/RNLlama.mm +153 -3
- package/ios/RNLlamaContext.h +9 -1
- package/ios/RNLlamaContext.mm +112 -9
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/chat-parser.h +120 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/chat.h +71 -6
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/common.h +9 -3
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-common.h +4 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu.h +1 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-impl.h +63 -183
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml.h +22 -3
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/json-partial.h +38 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/json-schema-to-grammar.h +4 -4
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-arch.h +27 -1
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-batch.h +112 -54
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-chat.h +1 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-context.h +38 -17
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-cparams.h +2 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-graph.h +126 -58
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-hparams.h +19 -2
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cache-unified-iswa.h +128 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cache-unified.h +303 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cells.h +439 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-memory-hybrid.h +138 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-memory-recurrent.h +183 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-memory.h +86 -5
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-model.h +26 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-vocab.h +1 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama.h +150 -42
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/minja/chat-template.hpp +1 -1
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/minja/minja.hpp +1 -1
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/{json.hpp → nlohmann/json.hpp} +3027 -2267
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/nlohmann/json_fwd.hpp +187 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/regex-partial.h +56 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/rn-llama.h +32 -1
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/rn-tts.h +39 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/ggml-llama.metallib +0 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/rnllama +0 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/chat-parser.h +120 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/chat.h +71 -6
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/common.h +9 -3
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-common.h +4 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu.h +1 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-impl.h +63 -183
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml.h +22 -3
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/json-partial.h +38 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/json-schema-to-grammar.h +4 -4
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-arch.h +27 -1
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-batch.h +112 -54
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-chat.h +1 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-context.h +38 -17
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-cparams.h +2 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-graph.h +126 -58
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-hparams.h +19 -2
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache-unified-iswa.h +128 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache-unified.h +303 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cells.h +439 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory-hybrid.h +138 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory-recurrent.h +183 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory.h +86 -5
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model.h +26 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-vocab.h +1 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama.h +150 -42
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/minja/chat-template.hpp +1 -1
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/minja/minja.hpp +1 -1
- package/ios/rnllama.xcframework/{tvos-arm64/rnllama.framework/Headers → ios-arm64_x86_64-simulator/rnllama.framework/Headers/nlohmann}/json.hpp +3027 -2267
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/nlohmann/json_fwd.hpp +187 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/regex-partial.h +56 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/rn-llama.h +32 -1
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/rn-tts.h +39 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/ggml-llama-sim.metallib +0 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/rnllama +0 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/chat-parser.h +120 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/chat.h +71 -6
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/common.h +9 -3
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-common.h +4 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu.h +1 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-impl.h +63 -183
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml.h +22 -3
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/json-partial.h +38 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/json-schema-to-grammar.h +4 -4
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-arch.h +27 -1
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-batch.h +112 -54
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-chat.h +1 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-context.h +38 -17
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-cparams.h +2 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-graph.h +126 -58
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-hparams.h +19 -2
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cache-unified-iswa.h +128 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cache-unified.h +303 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cells.h +439 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-memory-hybrid.h +138 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-memory-recurrent.h +183 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-memory.h +86 -5
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-model.h +26 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-vocab.h +1 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama.h +150 -42
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/minja/chat-template.hpp +1 -1
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/minja/minja.hpp +1 -1
- package/ios/rnllama.xcframework/{ios-arm64_x86_64-simulator/rnllama.framework/Headers → tvos-arm64/rnllama.framework/Headers/nlohmann}/json.hpp +3027 -2267
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/nlohmann/json_fwd.hpp +187 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/regex-partial.h +56 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/rn-llama.h +32 -1
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/rn-tts.h +39 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/ggml-llama.metallib +0 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/rnllama +0 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/chat-parser.h +120 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/chat.h +71 -6
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/common.h +9 -3
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-common.h +4 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu.h +1 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-impl.h +63 -183
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml.h +22 -3
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/json-partial.h +38 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/json-schema-to-grammar.h +4 -4
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-arch.h +27 -1
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-batch.h +112 -54
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-chat.h +1 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-context.h +38 -17
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-cparams.h +2 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-graph.h +126 -58
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-hparams.h +19 -2
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache-unified-iswa.h +128 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache-unified.h +303 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cells.h +439 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory-hybrid.h +138 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory-recurrent.h +183 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory.h +86 -5
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model.h +26 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-vocab.h +1 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama.h +150 -42
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/minja/chat-template.hpp +1 -1
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/minja/minja.hpp +1 -1
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/nlohmann/json.hpp +25526 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/nlohmann/json_fwd.hpp +187 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/regex-partial.h +56 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/rn-llama.h +32 -1
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/rn-tts.h +39 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/ggml-llama-sim.metallib +0 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/rnllama +0 -0
- package/jest/mock.js +24 -0
- package/package.json +1 -1
- package/src/NativeRNLlama.ts +46 -2
- package/src/index.ts +105 -1
- package/cpp/ggml-cpu/ggml-cpu-aarch64.h +0 -8
- package/cpp/ggml-cpu/ggml-cpu-quants.c +0 -13326
- package/cpp/ggml-cpu/sgemm.cpp +0 -3544
- package/cpp/ggml-cpu/sgemm.h +0 -14
- package/cpp/llama-kv-cache.cpp +0 -2827
- package/cpp/llama-kv-cache.h +0 -515
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cache.h +0 -515
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache.h +0 -515
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cache.h +0 -515
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/json.hpp +0 -24766
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache.h +0 -515
- /package/cpp/ggml-cpu/{ggml-cpu-traits.h → traits.h} +0 -0
- /package/cpp/tools/mtmd/{miniaudio.h → miniaudio/miniaudio.h} +0 -0
- /package/cpp/tools/mtmd/{stb_image.h → stb/stb_image.h} +0 -0
package/cpp/ggml-cpu/vec.h
CHANGED
@@ -5,6 +5,7 @@
 #include "ggml-impl.h"
 #include "simd-mappings.h"
 #include "ggml.h"
+#include "ggml-cpu.h"
 
 #if defined(LM_GGML_USE_ACCELERATE)
 #include <Accelerate/Accelerate.h>
@@ -57,7 +58,7 @@ inline static void lm_ggml_vec_set_bf16(const int n, lm_ggml_bf16_t * x, const l
 inline static void lm_ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] + y[i]; }
 inline static void lm_ggml_vec_add_f16 (const int n, lm_ggml_fp16_t * z, const lm_ggml_fp16_t * x, const lm_ggml_fp16_t * y) {
     for (int i = 0; i < n; ++i) {
-        z[i] =
+        z[i] = LM_GGML_CPU_FP32_TO_FP16(LM_GGML_CPU_FP16_TO_FP32(x[i]) + LM_GGML_CPU_FP16_TO_FP32(y[i]));
     }
 }
 inline static void lm_ggml_vec_add1_f32(const int n, float * z, const float * x, const float v) { for (int i = 0; i < n; ++i) z[i] = x[i] + v; }
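Note: every f16 hunk in this file applies the same mechanical change — the kernels widen operands to FP32, compute, and narrow the result back, with the conversions now routed through the CPU backend's LM_GGML_CPU_FP16_TO_FP32 / LM_GGML_CPU_FP32_TO_FP16 macros. A minimal sketch of that widen-compute-narrow pattern, using hypothetical normal-range-only helpers as stand-ins for the real macros (which also handle subnormals, inf/NaN and rounding, and may use hardware conversions or lookup tables):

#include <stdint.h>
#include <string.h>

static float h2f(uint16_t h) {                           /* fp16 -> fp32, normals only */
    uint32_t bits = ((uint32_t)(h & 0x8000) << 16)       /* sign */
                  | ((((h >> 10) & 0x1Fu) + 112u) << 23) /* rebias exponent 15 -> 127 */
                  | ((uint32_t)(h & 0x3FF) << 13);       /* widen mantissa 10 -> 23 bits */
    float f;
    memcpy(&f, &bits, sizeof f);
    return f;
}

static uint16_t f2h(float f) {                           /* fp32 -> fp16, truncating */
    uint32_t bits;
    memcpy(&bits, &f, sizeof bits);
    return (uint16_t)(((bits >> 16) & 0x8000u)
                    | ((((bits >> 23) & 0xFFu) - 112u) << 10)
                    | ((bits >> 13) & 0x3FFu));
}

/* the shape of the new lm_ggml_vec_add_f16 body: */
static void vec_add_h(int n, uint16_t *z, const uint16_t *x, const uint16_t *y) {
    for (int i = 0; i < n; ++i) {
        z[i] = f2h(h2f(x[i]) + h2f(y[i]));
    }
}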
@@ -66,7 +67,7 @@ inline static void lm_ggml_vec_acc1_f32(const int n, float * y, const float v)
 inline static void lm_ggml_vec_sub_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] - y[i]; }
 inline static void lm_ggml_vec_sub_f16 (const int n, lm_ggml_fp16_t * z, const lm_ggml_fp16_t * x, const lm_ggml_fp16_t * y) {
     for (int i = 0; i < n; ++i) {
-        z[i] =
+        z[i] = LM_GGML_CPU_FP32_TO_FP16(LM_GGML_CPU_FP16_TO_FP32(x[i]) - LM_GGML_CPU_FP16_TO_FP32(y[i]));
     }
 }
 inline static void lm_ggml_vec_set_f32 (const int n, float * x, const float v) { for (int i = 0; i < n; ++i) x[i] = v; }
@@ -74,20 +75,20 @@ inline static void lm_ggml_vec_cpy_f32 (const int n, float * y, const float * x)
 inline static void lm_ggml_vec_neg_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = -x[i]; }
 inline static void lm_ggml_vec_neg_f16 (const int n, lm_ggml_fp16_t * y, const lm_ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
-        y[i] =
+        y[i] = LM_GGML_CPU_FP32_TO_FP16(-LM_GGML_CPU_FP16_TO_FP32(x[i]));
     }
 }
 
 inline static void lm_ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]*y[i]; }
 inline static void lm_ggml_vec_mul_f16 (const int n, lm_ggml_fp16_t * z, const lm_ggml_fp16_t * x, const lm_ggml_fp16_t * y) {
     for (int i = 0; i < n; ++i) {
-        z[i] =
+        z[i] = LM_GGML_CPU_FP32_TO_FP16(LM_GGML_CPU_FP16_TO_FP32(x[i]) * LM_GGML_CPU_FP16_TO_FP32(y[i]));
     }
 }
 inline static void lm_ggml_vec_div_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]/y[i]; }
 inline static void lm_ggml_vec_div_f16 (const int n, lm_ggml_fp16_t * z, const lm_ggml_fp16_t * x, const lm_ggml_fp16_t * y) {
     for (int i = 0; i < n; ++i) {
-        z[i] =
+        z[i] = LM_GGML_CPU_FP32_TO_FP16(LM_GGML_CPU_FP16_TO_FP32(x[i]) / LM_GGML_CPU_FP16_TO_FP32(y[i]));
     }
 }
 
@@ -130,13 +131,13 @@ inline static void lm_ggml_vec_dot_f16_unroll(const int n, const int xs, float *
     // leftovers
     for (int i = np; i < n; ++i) {
         for (int j = 0; j < LM_GGML_VEC_DOT_UNROLL; ++j) {
-            sumf[j] += (lm_ggml_float)(
+            sumf[j] += (lm_ggml_float)(LM_GGML_CPU_FP16_TO_FP32(x[j][i])*LM_GGML_CPU_FP16_TO_FP32(y[i]));
         }
     }
 #else
     for (int i = 0; i < n; ++i) {
         for (int j = 0; j < LM_GGML_VEC_DOT_UNROLL; ++j) {
-            sumf[j] += (lm_ggml_float)(
+            sumf[j] += (lm_ggml_float)(LM_GGML_CPU_FP16_TO_FP32(x[j][i])*LM_GGML_CPU_FP16_TO_FP32(y[i]));
         }
     }
 #endif
@@ -148,27 +149,108 @@ inline static void lm_ggml_vec_dot_f16_unroll(const int n, const int xs, float *
 
 inline static void lm_ggml_vec_mad_f32(const int n, float * LM_GGML_RESTRICT y, const float * LM_GGML_RESTRICT x, const float v) {
 #if defined(LM_GGML_SIMD)
-
+    #if defined(__ARM_FEATURE_SVE)
 
-
+        const int sve_register_length = lm_ggml_cpu_get_sve_cnt() * 8;
+        const int lm_ggml_f32_epr = sve_register_length / 32;//8;//svcntw(); // SVE128:4, SVE256:8, SVE512:16
+        const int lm_ggml_f32_step = 8 * lm_ggml_f32_epr; // choose 8 SVE registers
+        LM_GGML_F32_VEC vx = LM_GGML_F32_VEC_SET1(v);
 
-
-
+        const int np = (n & ~(lm_ggml_f32_step - 1));
+        svfloat32_t ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8;
+        svfloat32_t ay1, ay2, ay3, ay4, ay5, ay6, ay7, ay8;
+        for (int i = 0; i < np; i += lm_ggml_f32_step) {
 
-
-
-
-            ay[j] = LM_GGML_F32_VEC_LOAD(y + i + j*LM_GGML_F32_EPR);
-            ay[j] = LM_GGML_F32_VEC_FMA(ay[j], ax[j], vx);
+            ax1 = LM_GGML_F32_VEC_LOAD(x + i);
+            ay1 = LM_GGML_F32_VEC_LOAD(y + i);
+            ay1 = LM_GGML_F32_VEC_FMA(ax1, vx, ay1);
 
-            LM_GGML_F32_VEC_STORE(y + i
+            LM_GGML_F32_VEC_STORE(y + i, ay1);
+
+            ax2 = LM_GGML_F32_VEC_LOAD(x + i + 1*lm_ggml_f32_epr);
+            ay2 = LM_GGML_F32_VEC_LOAD(y + i + 1*lm_ggml_f32_epr);
+            ay2 = LM_GGML_F32_VEC_FMA(ax2, vx, ay2);
+
+            LM_GGML_F32_VEC_STORE(y + i + 1*lm_ggml_f32_epr, ay2);
+
+            ax3 = LM_GGML_F32_VEC_LOAD(x + i + 2*lm_ggml_f32_epr);
+            ay3 = LM_GGML_F32_VEC_LOAD(y + i + 2*lm_ggml_f32_epr);
+            ay3 = LM_GGML_F32_VEC_FMA(ax3, vx, ay3);
+
+            LM_GGML_F32_VEC_STORE(y + i + 2*lm_ggml_f32_epr, ay3);
+
+            ax4 = LM_GGML_F32_VEC_LOAD(x + i + 3*lm_ggml_f32_epr);
+            ay4 = LM_GGML_F32_VEC_LOAD(y + i + 3*lm_ggml_f32_epr);
+            ay4 = LM_GGML_F32_VEC_FMA(ax4, vx, ay4);
+
+            LM_GGML_F32_VEC_STORE(y + i + 3*lm_ggml_f32_epr, ay4);
+
+            ax5 = LM_GGML_F32_VEC_LOAD(x + i + 4*lm_ggml_f32_epr);
+            ay5 = LM_GGML_F32_VEC_LOAD(y + i + 4*lm_ggml_f32_epr);
+            ay5 = LM_GGML_F32_VEC_FMA(ax5, vx, ay5);
+
+            LM_GGML_F32_VEC_STORE(y + i + 4*lm_ggml_f32_epr, ay5);
+
+            ax6 = LM_GGML_F32_VEC_LOAD(x + i + 5*lm_ggml_f32_epr);
+            ay6 = LM_GGML_F32_VEC_LOAD(y + i + 5*lm_ggml_f32_epr);
+            ay6 = LM_GGML_F32_VEC_FMA(ax6, vx, ay6);
+
+            LM_GGML_F32_VEC_STORE(y + i + 5*lm_ggml_f32_epr, ay6);
+
+            ax7 = LM_GGML_F32_VEC_LOAD(x + i + 6*lm_ggml_f32_epr);
+            ay7 = LM_GGML_F32_VEC_LOAD(y + i + 6*lm_ggml_f32_epr);
+            ay7 = LM_GGML_F32_VEC_FMA(ax7, vx, ay7);
+
+            LM_GGML_F32_VEC_STORE(y + i + 6*lm_ggml_f32_epr, ay7);
+
+            ax8 = LM_GGML_F32_VEC_LOAD(x + i + 7*lm_ggml_f32_epr);
+            ay8 = LM_GGML_F32_VEC_LOAD(y + i + 7*lm_ggml_f32_epr);
+            ay8 = LM_GGML_F32_VEC_FMA(ax8, vx, ay8);
+
+            LM_GGML_F32_VEC_STORE(y + i + 7*lm_ggml_f32_epr, ay8);
         }
-
+        // leftovers
+        // Since 8 unrolls are done in above loop, leftovers lie in range [0, lm_ggml_f32_step] which is handled in below loop
+        const int np2 = (n & ~(lm_ggml_f32_epr - 1));
+        for (int i = np; i < np2; i += lm_ggml_f32_epr) {
+            ax1 = LM_GGML_F32_VEC_LOAD(x + i);
+            ay1 = LM_GGML_F32_VEC_LOAD(y + i);
+            ay1 = LM_GGML_F32_VEC_FMA(ax1, vx, ay1);
+
+            LM_GGML_F32_VEC_STORE(y + i, ay1);
+        }
+        // maximum number of leftover elements will be less that lm_ggml_f32_epr. Apply predicated svmad on available elements only
+        if (np2 < n) {
+            svbool_t pg =svwhilelt_b32(np2, n);
+            ax1 = svld1_f32(pg, x + np2);
+            ay1 = svld1_f32(pg, y + np2);
+            ay1 = svmad_f32_m(pg, ax1, vx, ay1);
+
+            svst1_f32(pg, y + np2, ay1);
+        }
+    #else
+        const int np = (n & ~(LM_GGML_F32_STEP - 1));
 
-
-
-
-
+        LM_GGML_F32_VEC vx = LM_GGML_F32_VEC_SET1(v);
+
+        LM_GGML_F32_VEC ax[LM_GGML_F32_ARR];
+        LM_GGML_F32_VEC ay[LM_GGML_F32_ARR];
+
+        for (int i = 0; i < np; i += LM_GGML_F32_STEP) {
+            for (int j = 0; j < LM_GGML_F32_ARR; j++) {
+                ax[j] = LM_GGML_F32_VEC_LOAD(x + i + j*LM_GGML_F32_EPR);
+                ay[j] = LM_GGML_F32_VEC_LOAD(y + i + j*LM_GGML_F32_EPR);
+                ay[j] = LM_GGML_F32_VEC_FMA(ay[j], ax[j], vx);
+
+                LM_GGML_F32_VEC_STORE(y + i + j*LM_GGML_F32_EPR, ay[j]);
+            }
+        }
+
+        // leftovers
+        for (int i = np; i < n; ++i) {
+            y[i] += x[i]*v;
+        }
+    #endif
 #else
     // scalar
     for (int i = 0; i < n; ++i) {
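Note: the new SVE path above is vector-length-agnostic — lm_ggml_f32_epr is derived from the runtime register size via lm_ggml_cpu_get_sve_cnt() (which is why vec.h now includes ggml-cpu.h), the main loop consumes 8 SVE registers per iteration, and the final partial vector is handled with a governing predicate instead of a scalar loop. A minimal sketch of that predicated-tail idea (assumes an AArch64 toolchain with SVE enabled; axpy_f32_sve is an illustrative name, not part of the package), collapsed to a single whilelt-driven loop:

#include <arm_sve.h>

void axpy_f32_sve(int n, float *y, const float *x, float a) {
    const svfloat32_t va = svdup_n_f32(a);
    for (int i = 0; i < n; i += (int) svcntw()) {
        svbool_t pg = svwhilelt_b32(i, n);      // active lanes: those with i + lane < n
        svfloat32_t vx = svld1_f32(pg, x + i);  // predicated load, no out-of-bounds reads
        svfloat32_t vy = svld1_f32(pg, y + i);
        vy = svmla_f32_m(pg, vy, vx, va);       // y += a*x on active lanes only
        svst1_f32(pg, y + i, vy);               // predicated store
    }
}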
@@ -198,12 +280,12 @@ inline static void lm_ggml_vec_mad_f16(const int n, lm_ggml_fp16_t * LM_GGML_RES
 
     // leftovers
     for (int i = np; i < n; ++i) {
-        y[i] =
+        y[i] = LM_GGML_CPU_FP32_TO_FP16(LM_GGML_CPU_FP16_TO_FP32(y[i]) + LM_GGML_CPU_FP16_TO_FP32(x[i])*v);
     }
 #else
     // scalar
     for (int i = 0; i < n; ++i) {
-        y[i] =
+        y[i] = LM_GGML_CPU_FP32_TO_FP16(LM_GGML_CPU_FP16_TO_FP32(y[i]) + LM_GGML_CPU_FP16_TO_FP32(x[i])*v);
     }
 #endif
 }
@@ -220,36 +302,45 @@ inline static void lm_ggml_vec_mad_f32_unroll(const int n, const int xs, const i
     }
 
 #if defined(LM_GGML_SIMD)
-
+    #if defined(__ARM_FEATURE_SVE)
+        // scalar Route to scalar implementation //TODO: Write SVE code
+        for (int k = 0; k < LM_GGML_VEC_MAD_UNROLL; ++k) {
+            for (int i = 0; i < n; ++i) {
+                y[i] += x[k][i]*v[k][0];
+            }
+        }
+    #else
+        const int np = (n & ~(LM_GGML_F32_STEP - 1));
 
-
+        LM_GGML_F32_VEC vx[LM_GGML_VEC_MAD_UNROLL];
 
-
-
-
+        for (int k = 0; k < LM_GGML_VEC_MAD_UNROLL; ++k) {
+            vx[k] = LM_GGML_F32_VEC_SET1(v[k][0]);
+        }
 
-
-
+        LM_GGML_F32_VEC ax[LM_GGML_VEC_MAD_UNROLL][LM_GGML_F32_ARR];
+        LM_GGML_F32_VEC ay[LM_GGML_F32_ARR];
 
-
-
-
+        for (int i = 0; i < np; i += LM_GGML_F32_STEP) {
+            for (int j = 0; j < LM_GGML_F32_ARR; j++) {
+                ay[j] = LM_GGML_F32_VEC_LOAD(y + i + j*LM_GGML_F32_EPR);
 
-
-
-
-
+                for (int k = 0; k < LM_GGML_VEC_MAD_UNROLL; ++k) {
+                    ax[k][j] = LM_GGML_F32_VEC_LOAD(x[k] + i + j*LM_GGML_F32_EPR);
+                    ay[j] = LM_GGML_F32_VEC_FMA(ay[j], ax[k][j], vx[k]);
+                }
 
-
+                LM_GGML_F32_VEC_STORE(y + i + j*LM_GGML_F32_EPR, ay[j]);
+            }
         }
-    }
 
-
-
-
-
+        // leftovers
+        for (int k = 0; k < LM_GGML_VEC_MAD_UNROLL; ++k) {
+            for (int i = np; i < n; ++i) {
+                y[i] += x[k][i]*v[k][0];
+            }
         }
-
+    #endif
 #else
     // scalar
     for (int k = 0; k < LM_GGML_VEC_MAD_UNROLL; ++k) {
@@ -265,25 +356,53 @@ inline static void lm_ggml_vec_scale_f32(const int n, float * y, const float   v
 #if defined(LM_GGML_USE_ACCELERATE)
     vDSP_vsmul(y, 1, &v, y, 1, n);
 #elif defined(LM_GGML_SIMD)
-
+    #if defined(__ARM_FEATURE_SVE)
+        const int sve_register_length = lm_ggml_cpu_get_sve_cnt() * 8;
+        const int lm_ggml_f32_epr = sve_register_length / 32;//8;//svcntw(); // SVE128:4, SVE256:8, SVE512:16
+        const int lm_ggml_f32_step = 2 * lm_ggml_f32_epr;
+
+        LM_GGML_F32_VEC vx = LM_GGML_F32_VEC_SET1(v);
+        const int np = (n & ~(lm_ggml_f32_step - 1));
+        svfloat32_t ay1;
+        svfloat32_t ay2;
+        for (int i = 0; i < np; i += lm_ggml_f32_step) {
+            ay1 = LM_GGML_F32_VEC_LOAD(y + i);
+            ay1 = LM_GGML_F32_VEC_MUL(ay1, vx);
+            LM_GGML_F32_VEC_STORE(y + i, ay1);
+
+            ay2 = LM_GGML_F32_VEC_LOAD(y + i + 1*lm_ggml_f32_epr);
+            ay2 = LM_GGML_F32_VEC_MUL(ay2, vx);
+            LM_GGML_F32_VEC_STORE(y + i + 1*lm_ggml_f32_epr, ay2);
+        }
+        // leftovers
+        // maximum number of leftover elements will be less that lm_ggml_f32_epr. Apply predicated svmad on available elements only
+        if (np < n) {
+            svbool_t pg = svwhilelt_b32(np, n);
+            ay1 = svld1_f32(pg, y + np);
+            ay1 = svmul_f32_m(pg, ay1, vx);
+            svst1_f32(pg, y + np, ay1);
+        }
+    #else
+        const int np = (n & ~(LM_GGML_F32_STEP - 1));
 
-
+        LM_GGML_F32_VEC vx = LM_GGML_F32_VEC_SET1(v);
 
-
+        LM_GGML_F32_VEC ay[LM_GGML_F32_ARR];
 
-
-
-
-
+        for (int i = 0; i < np; i += LM_GGML_F32_STEP) {
+            for (int j = 0; j < LM_GGML_F32_ARR; j++) {
+                ay[j] = LM_GGML_F32_VEC_LOAD(y + i + j*LM_GGML_F32_EPR);
+                ay[j] = LM_GGML_F32_VEC_MUL(ay[j], vx);
 
-
+                LM_GGML_F32_VEC_STORE(y + i + j*LM_GGML_F32_EPR, ay[j]);
+            }
         }
-    }
 
-
-
-
-
+        // leftovers
+        for (int i = np; i < n; ++i) {
+            y[i] *= v;
+        }
+    #endif
 #else
     // scalar
     for (int i = 0; i < n; ++i) {
@@ -311,12 +430,12 @@ inline static void lm_ggml_vec_scale_f16(const int n, lm_ggml_fp16_t * y, const
 
     // leftovers
     for (int i = np; i < n; ++i) {
-        y[i] =
+        y[i] = LM_GGML_CPU_FP32_TO_FP16(LM_GGML_CPU_FP16_TO_FP32(y[i])*v);
     }
 #else
     // scalar
     for (int i = 0; i < n; ++i) {
-        y[i] =
+        y[i] = LM_GGML_CPU_FP32_TO_FP16(LM_GGML_CPU_FP16_TO_FP32(y[i])*v);
     }
 #endif
 }
@@ -325,103 +444,103 @@ inline static void lm_ggml_vec_norm_f32 (const int n, float * s, const float * x
 inline static void lm_ggml_vec_sqr_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]*x[i]; }
 inline static void lm_ggml_vec_sqr_f16 (const int n, lm_ggml_fp16_t * y, const lm_ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
-        float v =
-        y[i] =
+        float v = LM_GGML_CPU_FP16_TO_FP32(x[i]);
+        y[i] = LM_GGML_CPU_FP32_TO_FP16(v*v);
     }
 }
 inline static void lm_ggml_vec_sqrt_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sqrtf(x[i]); }
 inline static void lm_ggml_vec_sqrt_f16 (const int n, lm_ggml_fp16_t * y, const lm_ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
-        y[i] =
+        y[i] = LM_GGML_CPU_FP32_TO_FP16(sqrtf(LM_GGML_CPU_FP16_TO_FP32(x[i])));
     }
 }
 inline static void lm_ggml_vec_log_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = logf(x[i]); }
 inline static void lm_ggml_vec_log_f16 (const int n, lm_ggml_fp16_t * y, const lm_ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
-        y[i] =
+        y[i] = LM_GGML_CPU_FP32_TO_FP16(logf(LM_GGML_CPU_FP16_TO_FP32(x[i])));
     }
 }
 inline static void lm_ggml_vec_sin_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sinf(x[i]); }
 inline static void lm_ggml_vec_sin_f16 (const int n, lm_ggml_fp16_t * y, const lm_ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
-        y[i] =
+        y[i] = LM_GGML_CPU_FP32_TO_FP16(sinf(LM_GGML_CPU_FP16_TO_FP32(x[i])));
     }
 }
 inline static void lm_ggml_vec_cos_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = cosf(x[i]); }
 inline static void lm_ggml_vec_cos_f16 (const int n, lm_ggml_fp16_t * y, const lm_ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
-        y[i] =
+        y[i] = LM_GGML_CPU_FP32_TO_FP16(cosf(LM_GGML_CPU_FP16_TO_FP32(x[i])));
     }
 }
 inline static void lm_ggml_vec_abs_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fabsf(x[i]); }
 inline static void lm_ggml_vec_abs_f16 (const int n, lm_ggml_fp16_t * y, const lm_ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
-        y[i] =
+        y[i] = LM_GGML_CPU_FP32_TO_FP16(fabsf(LM_GGML_CPU_FP16_TO_FP32(x[i])));
     }
 }
 inline static void lm_ggml_vec_sgn_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : ((x[i] < 0.f) ? -1.f : 0.f); }
 inline static void lm_ggml_vec_sgn_f16 (const int n, lm_ggml_fp16_t * y, const lm_ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
-        float v =
-        y[i] =
+        float v = LM_GGML_CPU_FP16_TO_FP32(x[i]);
+        y[i] = LM_GGML_CPU_FP32_TO_FP16((v > 0.f) ? 1.f : ((v < 0.f) ? -1.f : 0.f));
     }
 }
 inline static void lm_ggml_vec_step_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : 0.f; }
 inline static void lm_ggml_vec_step_f16 (const int n, lm_ggml_fp16_t * y, const lm_ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
-        y[i] =
+        y[i] = LM_GGML_CPU_FP32_TO_FP16((LM_GGML_CPU_FP16_TO_FP32(x[i]) > 0.f) ? 1.f : 0.f);
     }
 }
 inline static void lm_ggml_vec_tanh_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = tanhf(x[i]); }
 inline static void lm_ggml_vec_tanh_f16 (const int n, lm_ggml_fp16_t * y, const lm_ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
-        y[i] =
+        y[i] = LM_GGML_CPU_FP32_TO_FP16(tanhf(LM_GGML_CPU_FP16_TO_FP32(x[i])));
     }
 }
 inline static void lm_ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expm1f(x[i]); }
 inline static void lm_ggml_vec_elu_f16 (const int n, lm_ggml_fp16_t * y, const lm_ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
-        y[i] =
+        y[i] = LM_GGML_CPU_FP32_TO_FP16(expm1f(LM_GGML_CPU_FP16_TO_FP32(x[i])));
     }
 }
 inline static void lm_ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }
 inline static void lm_ggml_vec_relu_f16 (const int n, lm_ggml_fp16_t * y, const lm_ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
-        float v =
-        y[i] =
+        float v = LM_GGML_CPU_FP16_TO_FP32(x[i]);
+        y[i] = LM_GGML_CPU_FP32_TO_FP16((v > 0.f) ? v : 0.f);
     }
 }
 inline static void lm_ggml_vec_leaky_relu_f32 (const int n, float * y, const float * x, const float ns) { for (int i = 0; i < n; ++i) y[i] = ((x[i] > 0.f) ? x[i] : 0.f) + ns * ((x[i] < 0.0f) ? x[i] : 0.f); }
 inline static void lm_ggml_vec_leaky_relu_f16 (const int n, lm_ggml_fp16_t * y, const lm_ggml_fp16_t * x, const float ns) {
     for (int i = 0; i < n; ++i) {
-        float v =
-        y[i] =
+        float v = LM_GGML_CPU_FP16_TO_FP32(x[i]);
+        y[i] = LM_GGML_CPU_FP32_TO_FP16(((v > 0.f) ? v : 0.f) + ns * ((v < 0.0f) ? v : 0.f));
     }
 }
 inline static void lm_ggml_vec_sigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = 1.f / (1.f + expf(-x[i])); }
 inline static void lm_ggml_vec_sigmoid_f16 (const int n, lm_ggml_fp16_t * y, const lm_ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
-        y[i] =
+        y[i] = LM_GGML_CPU_FP32_TO_FP16(1.f / (1.f + expf(-LM_GGML_CPU_FP16_TO_FP32(x[i]))));
     }
 }
 // TODO: optimize performance
 inline static void lm_ggml_vec_hardswish_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i] * fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); }
 inline static void lm_ggml_vec_hardswish_f16 (const int n, lm_ggml_fp16_t * y, const lm_ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
-        float v =
-        y[i] =
+        float v = LM_GGML_CPU_FP16_TO_FP32(x[i]);
+        y[i] = LM_GGML_CPU_FP32_TO_FP16(v * fminf(1.0f, fmaxf(0.0f, (v + 3.0f) / 6.0f)));
     }
 }
 inline static void lm_ggml_vec_hardsigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); }
 inline static void lm_ggml_vec_hardsigmoid_f16 (const int n, lm_ggml_fp16_t * y, const lm_ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
-        y[i] =
+        y[i] = LM_GGML_CPU_FP32_TO_FP16(fminf(1.0f, fmaxf(0.0f, (LM_GGML_CPU_FP16_TO_FP32(x[i]) + 3.0f) / 6.0f)));
     }
 }
 inline static void lm_ggml_vec_exp_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = expf(x[i]); }
 inline static void lm_ggml_vec_exp_f16 (const int n, lm_ggml_fp16_t * y, const lm_ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
-        y[i] =
+        y[i] = LM_GGML_CPU_FP32_TO_FP16(expf(LM_GGML_CPU_FP16_TO_FP32(x[i])));
     }
 }
 
@@ -443,9 +562,9 @@ inline static void lm_ggml_vec_gelu_f16(const int n, lm_ggml_fp16_t * y, const l
 
 inline static void lm_ggml_vec_gelu_erf_f16(const int n, lm_ggml_fp16_t * y, const lm_ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
-        float xi =
+        float xi = LM_GGML_CPU_FP16_TO_FP32(x[i]);
         float res = 0.5f*xi*(1.0f + erff(xi*SQRT_2_INV));
-        y[i] =
+        y[i] = LM_GGML_CPU_FP32_TO_FP16(res);
     }
 }
 
@@ -458,9 +577,9 @@ inline static void lm_ggml_vec_gelu_f32(const int n, float * y, const float * x)
         } else if (x[i] >= 10.0f) {
             y[i] = x[i];
         } else {
-            lm_ggml_fp16_t fp16 =
+            lm_ggml_fp16_t fp16 = LM_GGML_CPU_FP32_TO_FP16(x[i]);
             memcpy(&t, &fp16, sizeof(uint16_t));
-            y[i] =
+            y[i] = LM_GGML_CPU_FP16_TO_FP32(lm_ggml_table_gelu_f16[t]);
         }
     }
 }
@@ -494,9 +613,9 @@ inline static float lm_ggml_gelu_quick_f32(float x) {
 inline static void lm_ggml_vec_gelu_quick_f32(const int n, float * y, const float * x) {
     uint16_t t;
     for (int i = 0; i < n; ++i) {
-        lm_ggml_fp16_t fp16 =
+        lm_ggml_fp16_t fp16 = LM_GGML_CPU_FP32_TO_FP16(x[i]);
         memcpy(&t, &fp16, sizeof(uint16_t));
-        y[i] =
+        y[i] = LM_GGML_CPU_FP16_TO_FP32(lm_ggml_table_gelu_quick_f16[t]);
     }
 }
 #else
@@ -509,8 +628,8 @@ inline static void lm_ggml_vec_gelu_quick_f32(const int n, float * y, const floa
 
 inline static void lm_ggml_vec_gelu_quick_f16(const int n, lm_ggml_fp16_t * y, const lm_ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
-        float v =
-        y[i] =
+        float v = LM_GGML_CPU_FP16_TO_FP32(x[i]);
+        y[i] = LM_GGML_CPU_FP32_TO_FP16(v*(1.0f/(1.0f+expf(GELU_QUICK_COEF*v))));
     }
 }
 
@@ -519,8 +638,8 @@ inline static float lm_ggml_silu_f32(float x) {
     return x/(1.0f + expf(-x));
 }
 inline static lm_ggml_fp16_t lm_ggml_silu_f16(lm_ggml_fp16_t x) {
-    float v =
-    return
+    float v = LM_GGML_CPU_FP16_TO_FP32(x);
+    return LM_GGML_CPU_FP32_TO_FP16(v/(1.0f + expf(-v)));
 }
 
 #if __FINITE_MATH_ONLY__
@@ -528,6 +647,42 @@ inline static lm_ggml_fp16_t lm_ggml_silu_f16(lm_ggml_fp16_t x) {
 #error "ref: https://github.com/ggml-org/llama.cpp/pull/7154#issuecomment-2143844461"
 #endif
 
+/* Below function was borrowed from the GitHub repository:
+https://github.com/openvinotoolkit/openvino/blob/master/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/common.hpp */
+#if defined(__ARM_FEATURE_SVE) && defined(__aarch64__)
+inline static svfloat32_t exp_ps_sve(svbool_t pg, svfloat32_t src) {
+    // Constants
+    const svfloat32_t log2_e = svdup_n_f32(1.4426950409f);
+    const svfloat32_t ln2 = svdup_n_f32(0.6931473921f);
+    const svfloat32_t half_ln2_sq = svdup_n_f32(0.2413862043f);
+    const svuint32_t not_mask17 = svdup_n_u32(~((1u << 17) - 1));
+    const svfloat32_t one = svdup_n_f32(1.0f);
+    const svfloat32_t inactive1 = svdup_n_f32(0.0f);
+    const svint32_t inactive2 = svdup_n_s32(0);
+
+    // Algorithm starts here
+    svfloat32_t t0 = svmul_f32_m(pg, src, log2_e);      // y = x * log2(e)
+    svfloat32_t t1 = svrintm_f32_m(inactive1, pg, t0);  // rount to int (float)
+    svint32_t t2 = svcvt_s32_f32_m(inactive2, pg, t1);  // n
+
+    t1 = svsub_f32_m(pg, t0, t1);  // a = y - floor(y)
+    t1 = svadd_f32_m(pg, t1, one); // b = a + 1
+
+    svuint32_t t3 = svlsr_n_u32_m(pg, svreinterpret_u32_f32(t1), 17);  // v = b >> 17 (u32)
+    svfloat32_t t4 = svexpa_f32(t3);                                   // c = fexpa(v)
+    t4 = svscale_f32_m(pg, t4, t2);                                    // fexpa(v) * 2^(n)
+
+    // and_(t2.d, t1.d, not_mask17.d)
+    svfloat32_t t5 = svreinterpret_f32_u32(svand_u32_m(pg, svreinterpret_u32_f32(t1), not_mask17));
+    t5 = svsub_f32_m(pg, t1, t5);               // z
+    t0 = svmla_f32_m(pg, ln2, t5, half_ln2_sq); // ln2 + half_ln2_sq * z
+    t0 = svmla_f32_m(pg, one, t5, t0);          // 1 + (ln2 * z) + (half_ln2_sq * z * z)
+    t0 = svmul_f32_m(pg, t0, t4);               // Final result
+
+    return t0;
+}
+#endif
+
 #if defined(__ARM_NEON) && defined(__aarch64__)
 
 // adapted from arm limited optimized routine
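Note: exp_ps_sve evaluates exp(x) with the SVE FEXPA instruction. Reading the added code, the scheme appears to be: compute $y = x\log_2 e$, split off $n = \lfloor y \rfloor$, form $b = (y - n) + 1 \in [1, 2)$ so the exponent bits of $b$ are fixed, let FEXPA produce a table-based estimate from the top mantissa bits of $b$, and correct the discarded low 17 bits $z = b - (b \,\&\, \mathrm{not\_mask17})$ with a short polynomial:

$$e^x = 2^n \cdot 2^{\,b-1} \approx 2^n \cdot \mathrm{fexpa}(b \gg 17) \cdot \bigl(1 + c_1 z + c_2 z^2\bigr),$$

where $c_1 = 0.6931473921 \approx \ln 2$ and $c_2 = 0.2413862043 \approx (\ln 2)^2/2$ are the tuned constants in the code. This reading is an interpretation of the code above, not documentation shipped with the package.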
@@ -733,9 +888,9 @@ inline static float lm_ggml_silu_backward_f32(float x, float dy) {
 }
 
 inline static lm_ggml_fp16_t lm_ggml_silu_backward_f16(lm_ggml_fp16_t x, lm_ggml_fp16_t dy) {
-    const float v =
+    const float v = LM_GGML_CPU_FP16_TO_FP32(x);
     const float s = 1.0f/(1.0f + expf(-v));
-    return
+    return LM_GGML_CPU_FP32_TO_FP16(LM_GGML_CPU_FP16_TO_FP32(dy)*s*(1.0f + v*(1.0f - s)));
 }
 
 inline static void lm_ggml_vec_silu_backward_f32(const int n, float * dx, const float * x, const float * dy) {
@@ -773,7 +928,7 @@ inline static void lm_ggml_vec_sum_f32_ggf(const int n, lm_ggml_float * s, const
 inline static void lm_ggml_vec_sum_f16_ggf(const int n, float * s, const lm_ggml_fp16_t * x) {
     float sum = 0.0f;
     for (int i = 0; i < n; ++i) {
-        sum +=
+        sum += LM_GGML_CPU_FP16_TO_FP32(x[i]);
     }
     *s = sum;
 }
package/cpp/ggml-cpu.h
CHANGED
@@ -101,6 +101,7 @@ extern "C" {
     LM_GGML_BACKEND_API int lm_ggml_cpu_has_riscv_v     (void);
     LM_GGML_BACKEND_API int lm_ggml_cpu_has_vsx         (void);
     LM_GGML_BACKEND_API int lm_ggml_cpu_has_vxe         (void);
+    LM_GGML_BACKEND_API int lm_ggml_cpu_has_nnpa        (void);
     LM_GGML_BACKEND_API int lm_ggml_cpu_has_wasm_simd   (void);
    LM_GGML_BACKEND_API int lm_ggml_cpu_has_llamafile   (void);
 
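Note: the only API change here is the new lm_ggml_cpu_has_nnpa query. Judging by its placement next to the s390x lm_ggml_cpu_has_vxe check, it presumably reports IBM z (s390x) NNPA (Neural Network Processing Assist) support — an inference from the name, since the diff only shows the declaration. A hypothetical usage sketch against the declarations above:

#include <stdio.h>
#include "ggml-cpu.h"

int main(void) {
    // Capability queries return non-zero when the feature was detected at runtime.
    printf("vsx:  %d\n", lm_ggml_cpu_has_vsx());
    printf("vxe:  %d\n", lm_ggml_cpu_has_vxe());
    printf("nnpa: %d\n", lm_ggml_cpu_has_nnpa()); // added in this diff
    return 0;
}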