cui-llama.rn 1.7.3 → 1.7.6
This diff covers publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as published in their public registries.
- package/README.md +217 -17
- package/android/src/main/CMakeLists.txt +34 -15
- package/android/src/main/java/com/rnllama/LlamaContext.java +94 -8
- package/android/src/main/java/com/rnllama/RNLlama.java +247 -0
- package/android/src/main/jni.cpp +213 -14
- package/android/src/main/jniLibs/arm64-v8a/librnllama.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod_i8mm.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_i8mm.so +0 -0
- package/android/src/main/jniLibs/x86_64/librnllama.so +0 -0
- package/android/src/main/jniLibs/x86_64/librnllama_x86_64.so +0 -0
- package/android/src/newarch/java/com/rnllama/RNLlamaModule.java +35 -0
- package/android/src/oldarch/java/com/rnllama/RNLlamaModule.java +34 -0
- package/cpp/README.md +1 -1
- package/cpp/chat-parser.cpp +385 -0
- package/cpp/chat-parser.h +120 -0
- package/cpp/chat.cpp +726 -596
- package/cpp/chat.h +71 -6
- package/cpp/common.cpp +56 -38
- package/cpp/common.h +9 -3
- package/cpp/ggml-backend-reg.cpp +5 -0
- package/cpp/ggml-backend.cpp +10 -2
- package/cpp/ggml-common.h +4 -0
- package/cpp/ggml-cpu/amx/amx.cpp +1 -1
- package/cpp/ggml-cpu/amx/mmq.cpp +11 -10
- package/cpp/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
- package/cpp/ggml-cpu/arch/arm/quants.c +4114 -0
- package/cpp/ggml-cpu/arch/arm/repack.cpp +2163 -0
- package/cpp/ggml-cpu/arch/x86/cpu-feats.cpp +327 -0
- package/cpp/ggml-cpu/arch/x86/quants.c +4311 -0
- package/cpp/ggml-cpu/{ggml-cpu-aarch64.cpp → arch/x86/repack.cpp} +79 -3225
- package/cpp/ggml-cpu/arch-fallback.h +184 -0
- package/cpp/ggml-cpu/common.h +4 -3
- package/cpp/ggml-cpu/ggml-cpu-impl.h +21 -16
- package/cpp/ggml-cpu/ggml-cpu.c +123 -104
- package/cpp/ggml-cpu/ggml-cpu.cpp +11 -8
- package/cpp/ggml-cpu/ops.cpp +330 -148
- package/cpp/ggml-cpu/ops.h +1 -0
- package/cpp/ggml-cpu/quants.c +1158 -0
- package/cpp/ggml-cpu/{ggml-cpu-quants.h → quants.h} +26 -0
- package/cpp/ggml-cpu/repack.cpp +1571 -0
- package/cpp/ggml-cpu/repack.h +98 -0
- package/cpp/ggml-cpu/simd-mappings.h +330 -38
- package/cpp/ggml-cpu/{ggml-cpu-traits.cpp → traits.cpp} +1 -1
- package/cpp/ggml-cpu/vec.cpp +87 -18
- package/cpp/ggml-cpu/vec.h +249 -94
- package/cpp/ggml-cpu.h +1 -0
- package/cpp/ggml-impl.h +63 -183
- package/cpp/ggml-llama-sim.metallib +0 -0
- package/cpp/ggml-llama.metallib +0 -0
- package/cpp/ggml-metal.m +152 -45
- package/cpp/ggml-quants.c +0 -2
- package/cpp/ggml.c +61 -21
- package/cpp/ggml.h +22 -3
- package/cpp/gguf.cpp +24 -3
- package/cpp/json-partial.cpp +256 -0
- package/cpp/json-partial.h +38 -0
- package/cpp/json-schema-to-grammar.cpp +5 -47
- package/cpp/json-schema-to-grammar.h +4 -4
- package/cpp/llama-arch.cpp +153 -3
- package/cpp/llama-arch.h +27 -1
- package/cpp/llama-batch.cpp +741 -272
- package/cpp/llama-batch.h +112 -54
- package/cpp/llama-chat.cpp +30 -8
- package/cpp/llama-chat.h +1 -0
- package/cpp/llama-context.cpp +524 -339
- package/cpp/llama-context.h +38 -17
- package/cpp/llama-cparams.cpp +4 -0
- package/cpp/llama-cparams.h +2 -0
- package/cpp/llama-grammar.cpp +12 -2
- package/cpp/llama-graph.cpp +431 -356
- package/cpp/llama-graph.h +126 -58
- package/cpp/llama-hparams.cpp +10 -2
- package/cpp/llama-hparams.h +19 -2
- package/cpp/llama-kv-cache-unified-iswa.cpp +279 -0
- package/cpp/llama-kv-cache-unified-iswa.h +128 -0
- package/cpp/llama-kv-cache-unified.cpp +1841 -0
- package/cpp/llama-kv-cache-unified.h +303 -0
- package/cpp/llama-kv-cells.h +439 -0
- package/cpp/llama-memory-hybrid.cpp +246 -0
- package/cpp/llama-memory-hybrid.h +138 -0
- package/cpp/llama-memory-recurrent.cpp +1112 -0
- package/cpp/llama-memory-recurrent.h +183 -0
- package/cpp/llama-memory.cpp +41 -0
- package/cpp/llama-memory.h +86 -5
- package/cpp/llama-mmap.cpp +1 -1
- package/cpp/llama-model-loader.cpp +42 -17
- package/cpp/llama-model-saver.cpp +1 -0
- package/cpp/llama-model.cpp +1639 -513
- package/cpp/llama-model.h +26 -0
- package/cpp/llama-sampling.cpp +2 -2
- package/cpp/llama-vocab.cpp +65 -28
- package/cpp/llama-vocab.h +1 -0
- package/cpp/llama.cpp +11 -7
- package/cpp/llama.h +150 -42
- package/cpp/minja/chat-template.hpp +1 -1
- package/cpp/minja/minja.hpp +1 -1
- package/cpp/{json.hpp → nlohmann/json.hpp} +3027 -2267
- package/cpp/nlohmann/json_fwd.hpp +187 -0
- package/cpp/regex-partial.cpp +204 -0
- package/cpp/regex-partial.h +56 -0
- package/cpp/rn-llama.cpp +646 -35
- package/cpp/rn-llama.h +32 -1
- package/cpp/rn-tts.h +39 -0
- package/cpp/sampling.cpp +7 -8
- package/cpp/tools/mtmd/clip-impl.h +5 -0
- package/cpp/tools/mtmd/clip.cpp +572 -436
- package/cpp/tools/mtmd/clip.h +14 -4
- package/cpp/tools/mtmd/mtmd-audio.cpp +0 -86
- package/cpp/tools/mtmd/mtmd-audio.h +2 -17
- package/cpp/tools/mtmd/mtmd-helper.cpp +175 -12
- package/cpp/tools/mtmd/mtmd-helper.h +91 -0
- package/cpp/tools/mtmd/mtmd.cpp +368 -248
- package/cpp/tools/mtmd/mtmd.h +6 -70
- package/cpp/unicode.cpp +5 -0
- package/ios/CMakeLists.txt +26 -6
- package/ios/RNLlama.h +1 -1
- package/ios/RNLlama.mm +153 -3
- package/ios/RNLlamaContext.h +9 -1
- package/ios/RNLlamaContext.mm +112 -9
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/chat-parser.h +120 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/chat.h +71 -6
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/common.h +9 -3
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-common.h +4 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu.h +1 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-impl.h +63 -183
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml.h +22 -3
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/json-partial.h +38 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/json-schema-to-grammar.h +4 -4
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-arch.h +27 -1
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-batch.h +112 -54
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-chat.h +1 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-context.h +38 -17
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-cparams.h +2 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-graph.h +126 -58
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-hparams.h +19 -2
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cache-unified-iswa.h +128 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cache-unified.h +303 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cells.h +439 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-memory-hybrid.h +138 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-memory-recurrent.h +183 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-memory.h +86 -5
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-model.h +26 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-vocab.h +1 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama.h +150 -42
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/minja/chat-template.hpp +1 -1
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/minja/minja.hpp +1 -1
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/{json.hpp → nlohmann/json.hpp} +3027 -2267
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/nlohmann/json_fwd.hpp +187 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/regex-partial.h +56 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/rn-llama.h +32 -1
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/rn-tts.h +39 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/ggml-llama.metallib +0 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/rnllama +0 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/chat-parser.h +120 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/chat.h +71 -6
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/common.h +9 -3
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-common.h +4 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu.h +1 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-impl.h +63 -183
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml.h +22 -3
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/json-partial.h +38 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/json-schema-to-grammar.h +4 -4
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-arch.h +27 -1
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-batch.h +112 -54
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-chat.h +1 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-context.h +38 -17
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-cparams.h +2 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-graph.h +126 -58
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-hparams.h +19 -2
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache-unified-iswa.h +128 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache-unified.h +303 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cells.h +439 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory-hybrid.h +138 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory-recurrent.h +183 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory.h +86 -5
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model.h +26 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-vocab.h +1 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama.h +150 -42
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/minja/chat-template.hpp +1 -1
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/minja/minja.hpp +1 -1
- package/ios/rnllama.xcframework/{tvos-arm64/rnllama.framework/Headers → ios-arm64_x86_64-simulator/rnllama.framework/Headers/nlohmann}/json.hpp +3027 -2267
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/nlohmann/json_fwd.hpp +187 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/regex-partial.h +56 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/rn-llama.h +32 -1
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/rn-tts.h +39 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/ggml-llama-sim.metallib +0 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/rnllama +0 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/chat-parser.h +120 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/chat.h +71 -6
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/common.h +9 -3
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-common.h +4 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu.h +1 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-impl.h +63 -183
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml.h +22 -3
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/json-partial.h +38 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/json-schema-to-grammar.h +4 -4
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-arch.h +27 -1
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-batch.h +112 -54
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-chat.h +1 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-context.h +38 -17
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-cparams.h +2 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-graph.h +126 -58
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-hparams.h +19 -2
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cache-unified-iswa.h +128 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cache-unified.h +303 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cells.h +439 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-memory-hybrid.h +138 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-memory-recurrent.h +183 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-memory.h +86 -5
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-model.h +26 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-vocab.h +1 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama.h +150 -42
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/minja/chat-template.hpp +1 -1
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/minja/minja.hpp +1 -1
- package/ios/rnllama.xcframework/{ios-arm64_x86_64-simulator/rnllama.framework/Headers → tvos-arm64/rnllama.framework/Headers/nlohmann}/json.hpp +3027 -2267
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/nlohmann/json_fwd.hpp +187 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/regex-partial.h +56 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/rn-llama.h +32 -1
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/rn-tts.h +39 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/ggml-llama.metallib +0 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/rnllama +0 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/chat-parser.h +120 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/chat.h +71 -6
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/common.h +9 -3
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-common.h +4 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu.h +1 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-impl.h +63 -183
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml.h +22 -3
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/json-partial.h +38 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/json-schema-to-grammar.h +4 -4
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-arch.h +27 -1
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-batch.h +112 -54
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-chat.h +1 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-context.h +38 -17
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-cparams.h +2 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-graph.h +126 -58
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-hparams.h +19 -2
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache-unified-iswa.h +128 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache-unified.h +303 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cells.h +439 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory-hybrid.h +138 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory-recurrent.h +183 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory.h +86 -5
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model.h +26 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-vocab.h +1 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama.h +150 -42
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/minja/chat-template.hpp +1 -1
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/minja/minja.hpp +1 -1
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/nlohmann/json.hpp +25526 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/nlohmann/json_fwd.hpp +187 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/regex-partial.h +56 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/rn-llama.h +32 -1
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/rn-tts.h +39 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/ggml-llama-sim.metallib +0 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/rnllama +0 -0
- package/jest/mock.js +24 -0
- package/package.json +1 -1
- package/src/NativeRNLlama.ts +46 -2
- package/src/index.ts +105 -1
- package/cpp/ggml-cpu/ggml-cpu-aarch64.h +0 -8
- package/cpp/ggml-cpu/ggml-cpu-quants.c +0 -13326
- package/cpp/ggml-cpu/sgemm.cpp +0 -3544
- package/cpp/ggml-cpu/sgemm.h +0 -14
- package/cpp/llama-kv-cache.cpp +0 -2827
- package/cpp/llama-kv-cache.h +0 -515
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cache.h +0 -515
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache.h +0 -515
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cache.h +0 -515
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/json.hpp +0 -24766
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache.h +0 -515
- /package/cpp/ggml-cpu/{ggml-cpu-traits.h → traits.h} +0 -0
- /package/cpp/tools/mtmd/{miniaudio.h → miniaudio/miniaudio.h} +0 -0
- /package/cpp/tools/mtmd/{stb_image.h → stb/stb_image.h} +0 -0
package/cpp/ggml-cpu/repack.h (new file):

```diff
@@ -0,0 +1,98 @@
+#pragma once
+
+#define LM_GGML_COMMON_DECL_CPP
+#include "ggml-common.h"
+
+#include "traits.h"
+#include "ggml.h"
+
+// GGML internal header
+
+lm_ggml_backend_buffer_type_t lm_ggml_backend_cpu_repack_buffer_type(void);
+
+template <int K> constexpr int QK_0() {
+    if constexpr (K == 4) {
+        return QK4_0;
+    }
+    if constexpr (K == 8) {
+        return QK8_0;
+    }
+    return -1;
+}
+
+template <int K, int N> struct block {
+    lm_ggml_half d[N];                         // deltas for N qK_0 blocks
+    int8_t       qs[(QK_0<K>() * N * K) / 8];  // quants for N qK_0 blocks
+};
+
+// control size
+static_assert(sizeof(block<4, 4>) == 4 * sizeof(lm_ggml_half) + QK8_0 * 2, "wrong block<4,4> size/padding");
+static_assert(sizeof(block<4, 8>) == 8 * sizeof(lm_ggml_half) + QK8_0 * 4, "wrong block<4,8> size/padding");
+static_assert(sizeof(block<8, 4>) == 4 * sizeof(lm_ggml_half) + QK8_0 * 4, "wrong block<8,4> size/padding");
+static_assert(sizeof(block<8, 8>) == 8 * sizeof(lm_ggml_half) + QK8_0 * 8, "wrong block<8,8> size/padding");
+
+using block_q4_0x4 = block<4, 4>;
+using block_q4_0x8 = block<4, 8>;
+using block_q8_0x4 = block<8, 4>;
+using block_q8_0x8 = block<8, 8>;
+
+struct block_q4_Kx8 {
+    lm_ggml_half d[8];      // super-block scale for quantized scales
+    lm_ggml_half dmin[8];   // super-block scale for quantized mins
+    uint8_t scales[96];     // scales and mins, quantized with 6 bits
+    uint8_t qs[1024];       // 4--bit quants
+};
+
+static_assert(sizeof(block_q4_Kx8) == sizeof(lm_ggml_half) * 16 + K_SCALE_SIZE * 8 + QK_K * 4, "wrong q4_K block size/padding");
+
+struct block_q8_Kx4 {
+    float d[4];              // delta
+    int8_t qs[QK_K * 4];     // quants
+    int16_t bsums[QK_K / 4]; // sum of quants in groups of 16
+};
+
+static_assert(sizeof(block_q8_Kx4) == sizeof(float) * 4 + QK_K * 4 + (QK_K / 4) * sizeof(int16_t), "wrong q8_K block size/padding");
+
+struct block_iq4_nlx4 {
+    lm_ggml_half d[4];      // deltas for 4 iq4_nl blocks
+    uint8_t qs[QK4_NL * 2]; // nibbles / quants for 4 iq4_nl blocks
+};
+
+static_assert(sizeof(block_iq4_nlx4) == 4 * sizeof(lm_ggml_half) + QK4_NL * 2, "wrong iq4_nlx4 block size/padding");
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+void lm_ggml_quantize_mat_q8_0_4x4(const float * LM_GGML_RESTRICT x, void * LM_GGML_RESTRICT vy, int64_t k);
+void lm_ggml_quantize_mat_q8_0_4x8(const float * LM_GGML_RESTRICT x, void * LM_GGML_RESTRICT vy, int64_t k);
+void lm_ggml_quantize_mat_q8_K_4x8(const float * LM_GGML_RESTRICT x, void * LM_GGML_RESTRICT vy, int64_t k);
+void lm_ggml_gemv_q4_0_4x4_q8_0(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, const void * LM_GGML_RESTRICT vy, int nr, int nc);
+void lm_ggml_gemv_q4_0_4x8_q8_0(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, const void * LM_GGML_RESTRICT vy, int nr, int nc);
+void lm_ggml_gemv_q4_0_8x8_q8_0(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, const void * LM_GGML_RESTRICT vy, int nr, int nc);
+void lm_ggml_gemv_q4_K_8x8_q8_K(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, const void * LM_GGML_RESTRICT vy, int nr, int nc);
+void lm_ggml_gemv_iq4_nl_4x4_q8_0(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, const void * LM_GGML_RESTRICT vy, int nr, int nc);
+void lm_ggml_gemm_q4_0_4x4_q8_0(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, const void * LM_GGML_RESTRICT vy, int nr, int nc);
+void lm_ggml_gemm_q4_0_4x8_q8_0(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, const void * LM_GGML_RESTRICT vy, int nr, int nc);
+void lm_ggml_gemm_q4_0_8x8_q8_0(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, const void * LM_GGML_RESTRICT vy, int nr, int nc);
+void lm_ggml_gemm_q4_K_8x8_q8_K(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, const void * LM_GGML_RESTRICT vy, int nr, int nc);
+void lm_ggml_gemm_iq4_nl_4x4_q8_0(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, const void * LM_GGML_RESTRICT vy, int nr, int nc);
+
+// Native implementations
+void lm_ggml_quantize_mat_q8_0_4x4_generic(const float * LM_GGML_RESTRICT x, void * LM_GGML_RESTRICT vy, int64_t k);
+void lm_ggml_quantize_mat_q8_0_4x8_generic(const float * LM_GGML_RESTRICT x, void * LM_GGML_RESTRICT vy, int64_t k);
+void lm_ggml_quantize_mat_q8_K_4x8_generic(const float * LM_GGML_RESTRICT x, void * LM_GGML_RESTRICT vy, int64_t k);
+void lm_ggml_gemv_q4_0_4x4_q8_0_generic(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, const void * LM_GGML_RESTRICT vy, int nr, int nc);
+void lm_ggml_gemv_q4_0_4x8_q8_0_generic(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, const void * LM_GGML_RESTRICT vy, int nr, int nc);
+void lm_ggml_gemv_q4_0_8x8_q8_0_generic(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, const void * LM_GGML_RESTRICT vy, int nr, int nc);
+void lm_ggml_gemv_q4_K_8x8_q8_K_generic(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, const void * LM_GGML_RESTRICT vy, int nr, int nc);
+void lm_ggml_gemv_iq4_nl_4x4_q8_0_generic(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, const void * LM_GGML_RESTRICT vy, int nr, int nc);
+void lm_ggml_gemm_q4_0_4x4_q8_0_generic(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, const void * LM_GGML_RESTRICT vy, int nr, int nc);
+void lm_ggml_gemm_q4_0_4x8_q8_0_generic(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, const void * LM_GGML_RESTRICT vy, int nr, int nc);
+void lm_ggml_gemm_q4_0_8x8_q8_0_generic(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, const void * LM_GGML_RESTRICT vy, int nr, int nc);
+void lm_ggml_gemm_q4_K_8x8_q8_K_generic(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, const void * LM_GGML_RESTRICT vy, int nr, int nc);
+void lm_ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, const void * LM_GGML_RESTRICT vy, int nr, int nc);
+
+#if defined(__cplusplus)
+} // extern "C"
+#endif
```
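The new repack.h declares the packed multi-block layouts (block_q4_0x4, block_q4_0x8, block_q8_0x4, and so on) and the gemv/gemm kernels behind the CPU repack buffer type. Below is a minimal standalone sketch of how the block<K, N> template packs N sub-blocks and why the static_asserts hold, assuming the usual ggml-common.h constants (QK4_0 = QK8_0 = 32); lm_ggml_half is replaced by a plain uint16_t stand-in here, so this is an illustration rather than the library's own code:

```cpp
// Standalone sketch: how block<K, N> packs N quantization sub-blocks side by side.
// Assumes QK4_0 == QK8_0 == 32, as defined in ggml-common.h.
#include <cstdint>
#include <cstdio>

using ggml_half = uint16_t;          // stand-in for lm_ggml_half (raw IEEE fp16 bits)

constexpr int QK4_0 = 32;
constexpr int QK8_0 = 32;

template <int K> constexpr int QK_0() {
    if constexpr (K == 4) return QK4_0;
    if constexpr (K == 8) return QK8_0;
    return -1;
}

template <int K, int N> struct block {
    ggml_half d[N];                        // one fp16 delta per packed sub-block
    int8_t    qs[(QK_0<K>() * N * K) / 8]; // K-bit quants for N sub-blocks, byte-packed
};

int main() {
    // Mirrors the static_asserts in repack.h:
    // block<4,4>: 4 deltas (8 bytes) + 4 x 32 4-bit quants (64 bytes) = 72 bytes.
    std::printf("block<4,4> = %zu bytes\n", sizeof(block<4, 4>)); // 4*2 + 32*4*4/8 = 72
    // block<8,8>: 8 deltas (16 bytes) + 8 x 32 8-bit quants (256 bytes) = 272 bytes.
    std::printf("block<8,8> = %zu bytes\n", sizeof(block<8, 8>)); // 8*2 + 32*8*8/8 = 272
    return 0;
}
```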
package/cpp/ggml-cpu/simd-mappings.h:

```diff
@@ -2,10 +2,167 @@
 
 #include "ggml-cpu-impl.h"
 
+#ifdef __ARM_FEATURE_SVE
+#include <arm_sve.h>
+#endif // __ARM_FEATURE_SVE
+
+#if defined(__ARM_NEON) && !defined(__CUDACC__) && !defined(__MUSACC__)
+// if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
+//
+//   $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/
+//
+#include <arm_neon.h>
+#endif
+
+#if defined(__F16C__)
+#include <immintrin.h>
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 //
 // simd mappings
 //
 
+// FP16 to FP32 conversion
+
+// 16-bit float
+// on Arm, we use __fp16
+// on x86, we use uint16_t
+//
+// for old CUDA compilers (<= 11), we use uint16_t: ref https://github.com/ggml-org/llama.cpp/pull/10616
+// for MUSA compilers, we use uint16_t: ref https://github.com/ggml-org/llama.cpp/pull/11843
+//
+#if defined(__ARM_NEON) && !(defined(__CUDACC__) && __CUDACC_VER_MAJOR__ <= 11) && !defined(__MUSACC__)
+    #define LM_GGML_CPU_COMPUTE_FP16_TO_FP32(x) neon_compute_fp16_to_fp32(x)
+    #define LM_GGML_CPU_COMPUTE_FP32_TO_FP16(x) neon_compute_fp32_to_fp16(x)
+
+    #define LM_GGML_CPU_FP16_TO_FP32(x) LM_GGML_CPU_COMPUTE_FP16_TO_FP32(x)
+
+    static inline float neon_compute_fp16_to_fp32(lm_ggml_fp16_t h) {
+        __fp16 tmp;
+        memcpy(&tmp, &h, sizeof(lm_ggml_fp16_t));
+        return (float)tmp;
+    }
+
+    static inline lm_ggml_fp16_t neon_compute_fp32_to_fp16(float f) {
+        lm_ggml_fp16_t res;
+        __fp16 tmp = f;
+        memcpy(&res, &tmp, sizeof(lm_ggml_fp16_t));
+        return res;
+    }
+#elif defined(__F16C__)
+    #ifdef _MSC_VER
+        #define LM_GGML_CPU_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x)))
+        #define LM_GGML_CPU_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0)
+    #else
+        #define LM_GGML_CPU_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x)
+        #define LM_GGML_CPU_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0)
+    #endif
+#elif defined(__POWER9_VECTOR__)
+    #define LM_GGML_CPU_COMPUTE_FP16_TO_FP32(x) power_compute_fp16_to_fp32(x)
+    #define LM_GGML_CPU_COMPUTE_FP32_TO_FP16(x) power_compute_fp32_to_fp16(x)
+    /* the inline asm below is about 12% faster than the lookup method */
+    #define LM_GGML_CPU_FP16_TO_FP32(x) LM_GGML_CPU_COMPUTE_FP16_TO_FP32(x)
+    #define LM_GGML_CPU_FP32_TO_FP16(x) LM_GGML_CPU_COMPUTE_FP32_TO_FP16(x)
+
+    static inline float power_compute_fp16_to_fp32(lm_ggml_fp16_t h) {
+        float f;
+        double d;
+        __asm__(
+            "mtfprd %0,%2\n"
+            "xscvhpdp %0,%0\n"
+            "frsp %1,%0\n" :
+            /* temp */ "=d"(d),
+            /* out */  "=f"(f):
+            /* in */   "r"(h));
+        return f;
+    }
+
+    static inline lm_ggml_fp16_t power_compute_fp32_to_fp16(float f) {
+        double d;
+        lm_ggml_fp16_t r;
+        __asm__( /* xscvdphp can work on double or single precision */
+            "xscvdphp %0,%2\n"
+            "mffprd %1,%0\n" :
+            /* temp */ "=d"(d),
+            /* out */  "=r"(r):
+            /* in */   "f"(f));
+        return r;
+    }
+#elif defined(__riscv) && defined(__riscv_zfhmin)
+    static inline float riscv_compute_fp16_to_fp32(lm_ggml_fp16_t h) {
+        float f;
+        __asm__(
+            "fmv.h.x %[f], %[h]\n\t"
+            "fcvt.s.h %[f], %[f]"
+            : [f] "=&f" (f)
+            : [h] "r" (h)
+        );
+        return f;
+    }
+
+    static inline lm_ggml_fp16_t riscv_compute_fp32_to_fp16(float f) {
+        lm_ggml_fp16_t res;
+        __asm__(
+            "fcvt.h.s %[f], %[f]\n\t"
+            "fmv.x.h %[h], %[f]"
+            : [h] "=&r" (res)
+            : [f] "f" (f)
+        );
+        return res;
+    }
+
+    #define LM_GGML_CPU_COMPUTE_FP16_TO_FP32(x) riscv_compute_fp16_to_fp32(x)
+    #define LM_GGML_CPU_COMPUTE_FP32_TO_FP16(x) riscv_compute_fp32_to_fp16(x)
+    #define LM_GGML_CPU_FP16_TO_FP32(x) LM_GGML_CPU_COMPUTE_FP16_TO_FP32(x)
+    #define LM_GGML_CPU_FP32_TO_FP16(x) LM_GGML_CPU_COMPUTE_FP32_TO_FP16(x)
+#elif defined(__NNPA__)
+    #define LM_GGML_CPU_COMPUTE_FP16_TO_FP32(x) nnpa_compute_fp16_to_fp32(x)
+    #define LM_GGML_CPU_COMPUTE_FP32_TO_FP16(x) nnpa_compute_fp32_to_fp16(x)
+
+    #define LM_GGML_CPU_FP16_TO_FP32(x) LM_GGML_CPU_COMPUTE_FP16_TO_FP32(x)
+    #define LM_GGML_CPU_FP32_TO_FP16(x) LM_GGML_CPU_COMPUTE_FP32_TO_FP16(x)
+
+    static inline float nnpa_compute_fp16_to_fp32(lm_ggml_fp16_t h) {
+        uint16x8_t v_h = vec_splats(h);
+        uint16x8_t v_hd = vec_convert_from_fp16(v_h, 0);
+        return vec_extend_to_fp32_hi(v_hd, 0)[0];
+    }
+
+    static inline lm_ggml_fp16_t nnpa_compute_fp32_to_fp16(float f) {
+        float32x4_t v_f = vec_splats(f);
+        float32x4_t v_zero = vec_splats(0.0f);
+        uint16x8_t v_hd = vec_round_from_fp32(v_f, v_zero, 0);
+        uint16x8_t v_h = vec_convert_to_fp16(v_hd, 0);
+        return vec_extract(v_h, 0);
+    }
+#endif
+
+// precomputed f32 table for f16 (256 KB)
+// defined in ggml-cpu.c, initialized in lm_ggml_cpu_init()
+extern float lm_ggml_table_f32_f16[1 << 16];
+
+// On ARM NEON, it's quicker to directly convert x -> x instead of calling into lm_ggml_lookup_fp16_to_fp32,
+// so we define LM_GGML_CPU_FP16_TO_FP32 and LM_GGML_CPU_FP32_TO_FP16 elsewhere for NEON.
+// This is also true for POWER9.
+#if !defined(LM_GGML_CPU_FP16_TO_FP32)
+inline static float lm_ggml_lookup_fp16_to_fp32(lm_ggml_fp16_t f) {
+    uint16_t s;
+    memcpy(&s, &f, sizeof(uint16_t));
+    return lm_ggml_table_f32_f16[s];
+}
+
+#define LM_GGML_CPU_FP16_TO_FP32(x) lm_ggml_lookup_fp16_to_fp32(x)
+#endif
+
+#if !defined(LM_GGML_CPU_FP32_TO_FP16)
+#define LM_GGML_CPU_FP32_TO_FP16(x) LM_GGML_COMPUTE_FP32_TO_FP16(x)
+#endif
+
+
 // we define a common set of C macros which map to specific intrinsics based on the current architecture
 // we then implement the fundamental computation operations below using only these macros
 // adding support for new architectures requires to define the corresponding SIMD macros
@@ -17,7 +174,123 @@
 // number of elements to fit in a single register
 //
 
-#if defined(
+#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_FMA)
+
+#define LM_GGML_SIMD
+
+// F32 SVE
+#define LM_GGML_F32_EPR 8
+#define DEFAULT_PG svptrue_b32()
+
+#define LM_GGML_F32xt svfloat32_t
+#define LM_GGML_F32xt_ZERO svdup_n_f32(0.0f)
+#define LM_GGML_F32xt_SET1(x) svdup_n_f32(x)
+#define LM_GGML_F32xt_LOAD_IMPL(pg, a, ...) svld1_f32(pg, a)
+#define LM_GGML_F32xt_LOAD(...) LM_GGML_F32xt_LOAD_IMPL(DEFAULT_PG, __VA_ARGS__)
+#define LM_GGML_F32xt_STORE_IMPL(pg,a,b) svst1_f32(pg, a, b)
+#define LM_GGML_F32xt_STORE(...) LM_GGML_F32xt_STORE_IMPL(DEFAULT_PG, __VA_ARGS__)
+#define LM_GGML_F32xt_FMA_IMPL(pg, a, b, c) svmad_f32_m(pg, a, b, c)
+#define LM_GGML_F32xt_FMA(...) LM_GGML_F32xt_FMA_IMPL(DEFAULT_PG, __VA_ARGS__)
+#define LM_GGML_F32xt_ADD_IMPL(pg, a, b) svadd_f32_m(pg, a, b)
+#define LM_GGML_F32xt_ADD(...) LM_GGML_F32xt_ADD_IMPL(DEFAULT_PG, __VA_ARGS__)
+#define LM_GGML_F32xt_MUL_IMPL(pg, a, b) svmul_f32_m(pg, a, b)
+#define LM_GGML_F32xt_MUL(...) LM_GGML_F32xt_MUL_IMPL(DEFAULT_PG, __VA_ARGS__)
+#define LM_GGML_F32xt_REDUCE_ONE_IMPL(pg, a) svaddv(pg, a)
+#define LM_GGML_F32xt_REDUCE_ONE(...) LM_GGML_F32xt_REDUCE_ONE_IMPL(DEFAULT_PG, __VA_ARGS__)
+#define LM_GGML_F32xt_REDUCE_IMPL(pg, res, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8) \
+{ \
+    sum1 = svadd_f32_m(DEFAULT_PG, sum1, sum2); \
+    sum3 = svadd_f32_m(DEFAULT_PG, sum3, sum4); \
+    sum5 = svadd_f32_m(DEFAULT_PG, sum5, sum6); \
+    sum7 = svadd_f32_m(DEFAULT_PG, sum7, sum8); \
+    sum1 = svadd_f32_m(DEFAULT_PG, sum1, sum3); \
+    sum5 = svadd_f32_m(DEFAULT_PG, sum5, sum7); \
+    sum1 = svadd_f32_m(DEFAULT_PG, sum1, sum5); \
+    (res) = (lm_ggml_float) LM_GGML_F32xt_REDUCE_ONE(sum1); \
+}
+#define LM_GGML_F32xt_REDUCE(...) LM_GGML_F32xt_REDUCE_IMPL(DEFAULT_PG, __VA_ARGS__)
+
+#define LM_GGML_F32_VEC        LM_GGML_F32xt
+#define LM_GGML_F32_VEC_ZERO   LM_GGML_F32xt_ZERO
+#define LM_GGML_F32_VEC_SET1   LM_GGML_F32xt_SET1
+#define LM_GGML_F32_VEC_LOAD   LM_GGML_F32xt_LOAD
+#define LM_GGML_F32_VEC_STORE  LM_GGML_F32xt_STORE
+#define LM_GGML_F32_VEC_FMA    LM_GGML_F32xt_FMA
+#define LM_GGML_F32_VEC_ADD    LM_GGML_F32xt_ADD
+#define LM_GGML_F32_VEC_MUL    LM_GGML_F32xt_MUL
+#define LM_GGML_F32_VEC_REDUCE LM_GGML_F32xt_REDUCE
+
+// F16 NEON
+
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+    #define LM_GGML_F16_STEP 32
+    #define LM_GGML_F16_EPR  8
+
+    #define LM_GGML_F16x8              float16x8_t
+    #define LM_GGML_F16x8_ZERO         vdupq_n_f16(0.0f)
+    #define LM_GGML_F16x8_SET1(x)      vdupq_n_f16(x)
+    #define LM_GGML_F16x8_LOAD(x)      vld1q_f16((const __fp16 *)(x))
+    #define LM_GGML_F16x8_STORE        vst1q_f16
+    #define LM_GGML_F16x8_FMA(a, b, c) vfmaq_f16(a, b, c)
+    #define LM_GGML_F16x8_ADD          vaddq_f16
+    #define LM_GGML_F16x8_MUL          vmulq_f16
+    #define LM_GGML_F16x8_REDUCE(res, x)                             \
+    do {                                                             \
+        int offset = LM_GGML_F16_ARR >> 1;                           \
+        for (int i = 0; i < offset; ++i) {                           \
+            (x)[i] = vaddq_f16((x)[i], (x)[offset+i]);               \
+        }                                                            \
+        offset >>= 1;                                                \
+        for (int i = 0; i < offset; ++i) {                           \
+            (x)[i] = vaddq_f16((x)[i], (x)[offset+i]);               \
+        }                                                            \
+        offset >>= 1;                                                \
+        for (int i = 0; i < offset; ++i) {                           \
+            (x)[i] = vaddq_f16((x)[i], (x)[offset+i]);               \
+        }                                                            \
+        const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 ((x)[0]));  \
+        const float32x4_t t1 = vcvt_f32_f16(vget_high_f16((x)[0]));  \
+        (res) = (lm_ggml_float) vaddvq_f32(vaddq_f32(t0, t1));       \
+    } while (0)
+
+    #define LM_GGML_F16_VEC                LM_GGML_F16x8
+    #define LM_GGML_F16_VEC_ZERO           LM_GGML_F16x8_ZERO
+    #define LM_GGML_F16_VEC_SET1           LM_GGML_F16x8_SET1
+    #define LM_GGML_F16_VEC_LOAD(p, i)     LM_GGML_F16x8_LOAD(p)
+    #define LM_GGML_F16_VEC_STORE(p, r, i) LM_GGML_F16x8_STORE((__fp16 *)(p), (r)[i])
+    #define LM_GGML_F16_VEC_FMA            LM_GGML_F16x8_FMA
+    #define LM_GGML_F16_VEC_ADD            LM_GGML_F16x8_ADD
+    #define LM_GGML_F16_VEC_MUL            LM_GGML_F16x8_MUL
+    #define LM_GGML_F16_VEC_REDUCE         LM_GGML_F16x8_REDUCE
+#else
+    // if FP16 vector arithmetic is not supported, we use FP32 instead
+    // and take advantage of the vcvt_ functions to convert to/from FP16
+
+    #define LM_GGML_F16_STEP 16
+    #define LM_GGML_F16_EPR  4
+
+    #define LM_GGML_F32Cx4              float32x4_t
+    #define LM_GGML_F32Cx4_ZERO         vdupq_n_f32(0.0f)
+    #define LM_GGML_F32Cx4_SET1(x)      vdupq_n_f32(x)
+    #define LM_GGML_F32Cx4_LOAD(x)      vcvt_f32_f16(vld1_f16((const __fp16 *)(x)))
+    #define LM_GGML_F32Cx4_STORE(x, y)  vst1_f16(x, vcvt_f16_f32(y))
+    #define LM_GGML_F32Cx4_FMA(a, b, c) vfmaq_f32(a, b, c)
+    #define LM_GGML_F32Cx4_ADD          vaddq_f32
+    #define LM_GGML_F32Cx4_MUL          vmulq_f32
+    #define LM_GGML_F32Cx4_REDUCE       LM_GGML_F32x4_REDUCE
+
+    #define LM_GGML_F16_VEC                LM_GGML_F32Cx4
+    #define LM_GGML_F16_VEC_ZERO           LM_GGML_F32Cx4_ZERO
+    #define LM_GGML_F16_VEC_SET1           LM_GGML_F32Cx4_SET1
+    #define LM_GGML_F16_VEC_LOAD(p, i)     LM_GGML_F32Cx4_LOAD(p)
+    #define LM_GGML_F16_VEC_STORE(p, r, i) LM_GGML_F32Cx4_STORE((__fp16 *)(p), r[i])
+    #define LM_GGML_F16_VEC_FMA            LM_GGML_F32Cx4_FMA
+    #define LM_GGML_F16_VEC_ADD            LM_GGML_F32Cx4_ADD
+    #define LM_GGML_F16_VEC_MUL            LM_GGML_F32Cx4_MUL
+    #define LM_GGML_F16_VEC_REDUCE         LM_GGML_F32Cx4_REDUCE
+#endif
+
+#elif defined(__ARM_NEON) && defined(__ARM_FEATURE_FMA)
 
 #define LM_GGML_SIMD
 
```
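The hunks above introduce SVE-backed LM_GGML_F32xt_* primitives and point the generic LM_GGML_F32_VEC_* aliases at them when __ARM_FEATURE_SVE is available, so kernels written once against the generic names pick up whichever instruction set the header selects. Below is a simplified, standalone illustration of that mapping pattern; the macro names echo the header, but the definitions are scalar stand-ins rather than the real intrinsics:

```cpp
// Standalone sketch of the "SIMD mapping" pattern: kernels are written against
// generic VEC macros; each architecture maps them to its own type/intrinsics.
// Here the "vector" is a plain float so the sketch runs anywhere.
#include <cstdio>

#define F32_VEC              float
#define F32_VEC_ZERO         0.0f
#define F32_VEC_LOAD(p)      (*(p))
#define F32_VEC_FMA(a, b, c) ((a) + (b) * (c))       // multiply-accumulate: a + b*c (operand order illustrative)
#define F32_VEC_REDUCE(res, acc) do { (res) = (acc); } while (0)
#define F32_EPR              1                       // elements per "register" (scalar stand-in)

// A dot product written only in terms of the generic macros, mirroring how
// the ggml-cpu kernels stay ISA-agnostic.
static float vec_dot_f32(int n, const float * x, const float * y) {
    F32_VEC acc = F32_VEC_ZERO;
    int i = 0;
    for (; i + F32_EPR <= n; i += F32_EPR) {
        acc = F32_VEC_FMA(acc, F32_VEC_LOAD(x + i), F32_VEC_LOAD(y + i));
    }
    float res;
    F32_VEC_REDUCE(res, acc);
    for (; i < n; ++i) {   // scalar tail for leftover elements
        res += x[i] * y[i];
    }
    return res;
}

int main() {
    const float x[4] = {1, 2, 3, 4};
    const float y[4] = {5, 6, 7, 8};
    std::printf("dot = %f\n", vec_dot_f32(4, x, y)); // 70
    return 0;
}
```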
package/cpp/ggml-cpu/simd-mappings.h (continued):

```diff
@@ -299,7 +572,7 @@ static inline __m256 __avx_f32cx8_load(const lm_ggml_fp16_t * x) {
     float tmp[8];
 
     for (int i = 0; i < 8; i++) {
-        tmp[i] =
+        tmp[i] = LM_GGML_CPU_FP16_TO_FP32(x[i]);
     }
 
     return _mm256_loadu_ps(tmp);
@@ -310,7 +583,7 @@ static inline void __avx_f32cx8_store(lm_ggml_fp16_t *x, __m256 y) {
     _mm256_storeu_ps(arr, y);
 
     for (int i = 0; i < 8; i++)
-        x[i] =
+        x[i] = LM_GGML_CPU_FP32_TO_FP16(arr[i]);
 }
 #define LM_GGML_F32Cx8_LOAD(x)     __avx_f32cx8_load(x)
 #define LM_GGML_F32Cx8_STORE(x, y) __avx_f32cx8_store(x, y)
@@ -458,10 +731,10 @@ static inline unsigned char lm_ggml_endian_byte(int i) {
 inline static v128_t __wasm_f16x4_load(const lm_ggml_fp16_t * p) {
     float tmp[4];
 
-    tmp[0] =
-    tmp[1] =
-    tmp[2] =
-    tmp[3] =
+    tmp[0] = LM_GGML_CPU_FP16_TO_FP32(p[0]);
+    tmp[1] = LM_GGML_CPU_FP16_TO_FP32(p[1]);
+    tmp[2] = LM_GGML_CPU_FP16_TO_FP32(p[2]);
+    tmp[3] = LM_GGML_CPU_FP16_TO_FP32(p[3]);
 
     return wasm_v128_load(tmp);
 }
@@ -471,10 +744,10 @@ inline static void __wasm_f16x4_store(lm_ggml_fp16_t * p, v128_t x) {
 
     wasm_v128_store(tmp, x);
 
-    p[0] =
-    p[1] =
-    p[2] =
-    p[3] =
+    p[0] = LM_GGML_CPU_FP32_TO_FP16(tmp[0]);
+    p[1] = LM_GGML_CPU_FP32_TO_FP16(tmp[1]);
+    p[2] = LM_GGML_CPU_FP32_TO_FP16(tmp[2]);
+    p[3] = LM_GGML_CPU_FP32_TO_FP16(tmp[3]);
 }
 
 #define LM_GGML_F16x4 v128_t
@@ -574,10 +847,10 @@ inline static void __wasm_f16x4_store(lm_ggml_fp16_t * p, v128_t x) {
 static inline __m128 __sse_f16x4_load(const lm_ggml_fp16_t * x) {
     float tmp[4];
 
-    tmp[0] =
-    tmp[1] =
-    tmp[2] =
-    tmp[3] =
+    tmp[0] = LM_GGML_CPU_FP16_TO_FP32(x[0]);
+    tmp[1] = LM_GGML_CPU_FP16_TO_FP32(x[1]);
+    tmp[2] = LM_GGML_CPU_FP16_TO_FP32(x[2]);
+    tmp[3] = LM_GGML_CPU_FP16_TO_FP32(x[3]);
 
     return _mm_loadu_ps(tmp);
 }
@@ -587,10 +860,10 @@ static inline void __sse_f16x4_store(lm_ggml_fp16_t * x, __m128 y) {
 
     _mm_storeu_ps(arr, y);
 
-    x[0] =
-    x[1] =
-    x[2] =
-    x[3] =
+    x[0] = LM_GGML_CPU_FP32_TO_FP16(arr[0]);
+    x[1] = LM_GGML_CPU_FP32_TO_FP16(arr[1]);
+    x[2] = LM_GGML_CPU_FP32_TO_FP16(arr[2]);
+    x[3] = LM_GGML_CPU_FP32_TO_FP16(arr[3]);
 }
 
 #define LM_GGML_F32Cx4 __m128
@@ -712,7 +985,7 @@ static inline void __lasx_f32cx8_store(lm_ggml_fp16_t * x, __m256 y) {
 #define LM_GGML_F32x4_ZERO __lsx_vldi(0)
 #define LM_GGML_F32x4_SET1(x) __lsx_vinsgr2vr_w(__lsx_vldi(0),(x), 0)
 #define LM_GGML_F32x4_LOAD(x) __lsx_vld((x), 0)
-#define LM_GGML_F32x4_STORE(
+#define LM_GGML_F32x4_STORE(x, y) __lsx_vst(y, x, 0)
 #define LM_GGML_F32x4_FMA(a, b, c) __lsx_vfmadd_s(b, c, a)
 #define LM_GGML_F32x4_ADD __lsx_vfadd_s
 #define LM_GGML_F32x4_MUL __lsx_vfmul_s
@@ -758,10 +1031,10 @@ static inline void __lasx_f32cx8_store(lm_ggml_fp16_t * x, __m256 y) {
 static inline __m128 __lsx_f16x4_load(const lm_ggml_fp16_t * x) {
     float tmp[4];
 
-    tmp[0] =
-    tmp[1] =
-    tmp[2] =
-    tmp[3] =
+    tmp[0] = LM_GGML_CPU_FP16_TO_FP32(x[0]);
+    tmp[1] = LM_GGML_CPU_FP16_TO_FP32(x[1]);
+    tmp[2] = LM_GGML_CPU_FP16_TO_FP32(x[2]);
+    tmp[3] = LM_GGML_CPU_FP16_TO_FP32(x[3]);
 
     return __lsx_vld(tmp, 0);
 }
@@ -771,10 +1044,10 @@ static inline void __lsx_f16x4_store(lm_ggml_fp16_t * x, __m128 y) {
 
     __lsx_vst(y, arr, 0);
 
-    x[0] =
-    x[1] =
-    x[2] =
-    x[3] =
+    x[0] = LM_GGML_CPU_FP32_TO_FP16(arr[0]);
+    x[1] = LM_GGML_CPU_FP32_TO_FP16(arr[1]);
+    x[2] = LM_GGML_CPU_FP32_TO_FP16(arr[2]);
+    x[3] = LM_GGML_CPU_FP32_TO_FP16(arr[3]);
 }
 
 #define LM_GGML_F32Cx4 __m128
@@ -806,7 +1079,7 @@ static inline void __lsx_f16x4_store(lm_ggml_fp16_t * x, __m128 y) {
 #define LM_GGML_F32_STEP 32
 #define LM_GGML_F32_EPR  4
 
-#define LM_GGML_F32x4
+#define LM_GGML_F32x4 float32x4_t
 #define LM_GGML_F32x4_ZERO vec_splats(0.0f)
 #define LM_GGML_F32x4_SET1 vec_splats
 #define LM_GGML_F32x4_LOAD(p) vec_xl(0, p)
@@ -828,10 +1101,8 @@ static inline void __lsx_f16x4_store(lm_ggml_fp16_t * x, __m128 y) {
     for (int i = 0; i < offset; ++i) { \
         x[i] = vec_add(x[i], x[offset + i]); \
     } \
-
-
-    vec_extract(x[0], 2) + \
-    vec_extract(x[0], 3); \
+    float32x4_t tmp = x[0] + vec_reve(x[0]); \
+    res = tmp[0] + tmp[1]; \
 }
 
 #define LM_GGML_F32_VEC LM_GGML_F32x4
@@ -848,28 +1119,45 @@ static inline void __lsx_f16x4_store(lm_ggml_fp16_t * x, __m128 y) {
 #define LM_GGML_F16_STEP LM_GGML_F32_STEP
 #define LM_GGML_F16_EPR  LM_GGML_F32_EPR
 
-static inline
+static inline float32x4_t __lzs_f16cx4_load(const lm_ggml_fp16_t * x) {
+#if defined(__NNPA__)
+    uint16x8_t v_x = vec_xl(0, (const lm_ggml_fp16_t *)x);
+    uint16x8_t v_xd = vec_convert_from_fp16(v_x, 0);
+    return vec_extend_to_fp32_hi(v_xd, 0);
+#else
     float tmp[4];
 
    for (int i = 0; i < 4; i++) {
-        tmp[i] =
+        tmp[i] = LM_GGML_CPU_FP16_TO_FP32(x[i]);
    }
 
     // note: keep type-cast here to prevent compiler bugs
     // see: https://github.com/ggml-org/llama.cpp/issues/12846
     return vec_xl(0, (const float *)(tmp));
+#endif
 }
 
-static inline void __lzs_f16cx4_store(lm_ggml_fp16_t * x,
+static inline void __lzs_f16cx4_store(lm_ggml_fp16_t * x, float32x4_t v_y) {
+#if defined(__NNPA__)
+    float32x4_t v_zero = vec_splats(0.0f);
+    uint16x8_t v_xd = vec_round_from_fp32(v_y, v_zero, 0);
+    uint16x8_t v_x = vec_convert_to_fp16(v_xd, 0);
+
+    x[0] = vec_extract(v_x, 0);
+    x[1] = vec_extract(v_x, 1);
+    x[2] = vec_extract(v_x, 2);
+    x[3] = vec_extract(v_x, 3);
+#else
     float arr[4];
 
     // note: keep type-cast here to prevent compiler bugs
     // see: https://github.com/ggml-org/llama.cpp/issues/12846
-    vec_xst(
+    vec_xst(v_y, 0, (float *)(arr));
 
    for (int i = 0; i < 4; i++) {
-        x[i] =
+        x[i] = LM_GGML_CPU_FP32_TO_FP16(arr[i]);
    }
+#endif
 }
 
 #define LM_GGML_F16_VEC LM_GGML_F32x4
@@ -890,3 +1178,7 @@ static inline void __lzs_f16cx4_store(lm_ggml_fp16_t * x, __vector float y) {
 #define LM_GGML_F32_ARR (LM_GGML_F32_STEP/LM_GGML_F32_EPR)
 #define LM_GGML_F16_ARR (LM_GGML_F16_STEP/LM_GGML_F16_EPR)
 #endif
+
+#ifdef __cplusplus
+}
+#endif
```
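The remaining hunks are mostly mechanical: call sites that previously converted FP16 values inline (shown truncated as `tmp[i] =`, `x[i] =`, and so on in this view) now go through the LM_GGML_CPU_FP16_TO_FP32 / LM_GGML_CPU_FP32_TO_FP16 macros added earlier in this file; for the FP16-to-FP32 direction the generic fallback is the lm_ggml_table_f32_f16 lookup table. A self-contained sketch of that lookup-table idea follows; the table here is filled by a plain software decoder rather than the real lm_ggml_cpu_init() path:

```cpp
// Standalone sketch of the lookup-table fallback behind LM_GGML_CPU_FP16_TO_FP32:
// decode each 16-bit pattern once, then every conversion is a single table read.
// The decoder below is a generic software fp16 decoder, not the ggml one.
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <vector>

static float fp16_to_fp32_soft(uint16_t h) {
    const uint32_t sign = (h >> 15) & 0x1;
    const uint32_t exp  = (h >> 10) & 0x1F;
    const uint32_t mant =  h        & 0x3FF;
    float v;
    if (exp == 0) {
        v = std::ldexp((float) mant, -24);                    // zero / subnormal
    } else if (exp == 31) {
        v = mant ? NAN : INFINITY;                            // NaN / infinity
    } else {
        v = std::ldexp((float)(mant + 1024), (int) exp - 25); // normal numbers
    }
    return sign ? -v : v;
}

// Analogue of lm_ggml_table_f32_f16: one float per possible fp16 bit pattern (256 KB).
static std::vector<float> table_f32_f16;

static void init_table() {
    table_f32_f16.resize(1 << 16);
    for (uint32_t i = 0; i < (1u << 16); ++i) {
        table_f32_f16[i] = fp16_to_fp32_soft((uint16_t) i);
    }
}

// The lookup itself mirrors lm_ggml_lookup_fp16_to_fp32 from the diff above.
static inline float lookup_fp16_to_fp32(uint16_t h) {
    return table_f32_f16[h];
}

int main() {
    init_table();
    std::printf("%f\n", lookup_fp16_to_fp32(0x3C00)); // 1.0
    std::printf("%f\n", lookup_fp16_to_fp32(0xC000)); // -2.0
    std::printf("%f\n", lookup_fp16_to_fp32(0x3555)); // ~0.333
    return 0;
}
```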