@novastera-oss/llamarn 0.2.1 → 0.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +80 -14
- package/RNLlamaCpp.podspec +10 -3
- package/android/CMakeLists.txt +8 -0
- package/android/src/main/cpp/include/llama.h +62 -125
- package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
- package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
- package/cpp/PureCppImpl.cpp +9 -27
- package/cpp/SystemUtils.h +2 -2
- package/cpp/build-info.cpp +2 -2
- package/cpp/llama.cpp/README.md +11 -3
- package/cpp/llama.cpp/build-xcframework.sh +1 -0
- package/cpp/llama.cpp/common/CMakeLists.txt +8 -2
- package/cpp/llama.cpp/common/arg.cpp +153 -113
- package/cpp/llama.cpp/common/chat-parser.cpp +379 -0
- package/cpp/llama.cpp/common/chat-parser.h +117 -0
- package/cpp/llama.cpp/common/chat.cpp +847 -699
- package/cpp/llama.cpp/common/chat.h +73 -6
- package/cpp/llama.cpp/common/common.cpp +50 -82
- package/cpp/llama.cpp/common/common.h +21 -17
- package/cpp/llama.cpp/common/json-partial.cpp +255 -0
- package/cpp/llama.cpp/common/json-partial.h +37 -0
- package/cpp/llama.cpp/common/minja/chat-template.hpp +9 -5
- package/cpp/llama.cpp/common/minja/minja.hpp +69 -36
- package/cpp/llama.cpp/common/regex-partial.cpp +204 -0
- package/cpp/llama.cpp/common/regex-partial.h +56 -0
- package/cpp/llama.cpp/common/sampling.cpp +7 -8
- package/cpp/llama.cpp/convert_hf_to_gguf.py +453 -118
- package/cpp/llama.cpp/convert_hf_to_gguf_update.py +120 -68
- package/cpp/llama.cpp/ggml/CMakeLists.txt +2 -1
- package/cpp/llama.cpp/ggml/cmake/common.cmake +25 -0
- package/cpp/llama.cpp/ggml/include/ggml-opt.h +49 -28
- package/cpp/llama.cpp/ggml/include/ggml.h +26 -7
- package/cpp/llama.cpp/ggml/src/CMakeLists.txt +16 -10
- package/cpp/llama.cpp/ggml/src/ggml-backend.cpp +4 -1
- package/cpp/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +1 -0
- package/cpp/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +604 -0
- package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +42 -0
- package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +54 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +50 -51
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +5 -9
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +779 -19
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +22 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +88 -5
- package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +47 -12
- package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +264 -69
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +322 -100
- package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +117 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +85 -16
- package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.h +220 -49
- package/cpp/llama.cpp/ggml/src/ggml-cuda/acc.cu +40 -26
- package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/cpy.cu +11 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-common.cuh +15 -7
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +266 -64
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f16.cuh +49 -4
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f32.cuh +48 -4
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn.cu +2 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +5 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmq.cu +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/quantize.cu +7 -6
- package/cpp/llama.cpp/ggml/src/ggml-cuda/sum.cu +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cu +10 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cuh +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-impl.h +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +99 -17
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +200 -2
- package/cpp/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +8 -2
- package/cpp/llama.cpp/ggml/src/ggml-musa/mudnn.cu +112 -0
- package/cpp/llama.cpp/ggml/src/ggml-musa/mudnn.cuh +12 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +6 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +972 -178
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/argsort.cl +86 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/div.cl +72 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/group_norm.cl +72 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/sigmoid.cl +29 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/sub.cl +72 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/sum_rows.cl +39 -0
- package/cpp/llama.cpp/ggml/src/ggml-opt.cpp +373 -190
- package/cpp/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +29 -23
- package/cpp/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +5 -10
- package/cpp/llama.cpp/ggml/src/ggml-sycl/common.hpp +101 -5
- package/cpp/llama.cpp/ggml/src/ggml-sycl/concat.cpp +31 -33
- package/cpp/llama.cpp/ggml/src/ggml-sycl/conv.cpp +1 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/convert.cpp +29 -2
- package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +4 -5
- package/cpp/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +59 -21
- package/cpp/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +9 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +84 -72
- package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +37 -8
- package/cpp/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +1 -3
- package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +324 -129
- package/cpp/llama.cpp/ggml/src/ggml-sycl/gla.cpp +1 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +31 -2
- package/cpp/llama.cpp/ggml/src/ggml-sycl/norm.cpp +95 -68
- package/cpp/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +1 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/quants.hpp +22 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +1 -2
- package/cpp/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +1 -4
- package/cpp/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +2 -3
- package/cpp/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +69 -43
- package/cpp/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +2 -14
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +81 -91
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +432 -181
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +17 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_m.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +6 -152
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp +162 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +360 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +2 -118
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +12 -1
- package/cpp/llama.cpp/ggml/src/ggml.c +107 -36
- package/cpp/llama.cpp/ggml/src/gguf.cpp +33 -33
- package/cpp/llama.cpp/gguf-py/gguf/constants.py +100 -15
- package/cpp/llama.cpp/gguf-py/gguf/gguf_reader.py +1 -1
- package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +44 -12
- package/cpp/llama.cpp/gguf-py/gguf/scripts/gguf_editor_gui.py +21 -10
- package/cpp/llama.cpp/gguf-py/gguf/scripts/gguf_new_metadata.py +5 -2
- package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +128 -31
- package/cpp/llama.cpp/gguf-py/gguf/utility.py +1 -1
- package/cpp/llama.cpp/gguf-py/pyproject.toml +1 -1
- package/cpp/llama.cpp/include/llama.h +62 -125
- package/cpp/llama.cpp/models/ggml-vocab-bert-bge.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-bert-bge.gguf.out +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-command-r.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-command-r.gguf.out +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.out +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.out +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-falcon.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-falcon.gguf.out +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-gpt-2.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-gpt-2.gguf.out +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-llama-bpe.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-llama-bpe.gguf.out +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-llama-spm.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-llama-spm.gguf.out +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-mpt.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-mpt.gguf.out +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-nomic-bert-moe.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-phi-3.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-phi-3.gguf.out +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-qwen2.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-qwen2.gguf.out +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-refact.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-refact.gguf.out +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-starcoder.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-starcoder.gguf.out +1 -1
- package/cpp/llama.cpp/models/templates/Qwen-QwQ-32B.jinja +62 -0
- package/cpp/llama.cpp/models/templates/Qwen-Qwen3-0.6B.jinja +85 -0
- package/cpp/llama.cpp/models/templates/README.md +2 -0
- package/cpp/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +5 -1
- package/cpp/llama.cpp/requirements/requirements-convert_hf_to_gguf_update.txt +5 -1
- package/cpp/llama.cpp/requirements/requirements-convert_lora_to_gguf.txt +2 -0
- package/cpp/llama.cpp/requirements/requirements-gguf_editor_gui.txt +1 -1
- package/cpp/llama.cpp/src/CMakeLists.txt +2 -0
- package/cpp/llama.cpp/src/llama-arch.cpp +6 -0
- package/cpp/llama.cpp/src/llama-arch.h +2 -0
- package/cpp/llama.cpp/src/llama-batch.cpp +3 -1
- package/cpp/llama.cpp/src/llama-context.cpp +340 -123
- package/cpp/llama.cpp/src/llama-context.h +30 -0
- package/cpp/llama.cpp/src/llama-cparams.cpp +4 -0
- package/cpp/llama.cpp/src/llama-cparams.h +2 -0
- package/cpp/llama.cpp/src/llama-grammar.cpp +12 -2
- package/cpp/llama.cpp/src/llama-graph.cpp +157 -247
- package/cpp/llama.cpp/src/llama-graph.h +52 -7
- package/cpp/llama.cpp/src/llama-hparams.cpp +17 -1
- package/cpp/llama.cpp/src/llama-hparams.h +37 -5
- package/cpp/llama.cpp/src/llama-kv-cache.cpp +742 -481
- package/cpp/llama.cpp/src/llama-kv-cache.h +196 -99
- package/cpp/llama.cpp/src/llama-kv-cells.h +379 -0
- package/cpp/llama.cpp/src/llama-memory.h +4 -3
- package/cpp/llama.cpp/src/llama-model-loader.cpp +22 -17
- package/cpp/llama.cpp/src/llama-model-saver.cpp +281 -0
- package/cpp/llama.cpp/src/llama-model-saver.h +37 -0
- package/cpp/llama.cpp/src/llama-model.cpp +529 -172
- package/cpp/llama.cpp/src/llama-model.h +6 -1
- package/cpp/llama.cpp/src/llama-quant.cpp +15 -13
- package/cpp/llama.cpp/src/llama-sampling.cpp +2 -2
- package/cpp/llama.cpp/src/llama-vocab.cpp +35 -8
- package/cpp/llama.cpp/src/llama-vocab.h +6 -0
- package/cpp/llama.cpp/src/llama.cpp +14 -0
- package/cpp/rn-completion.cpp +60 -5
- package/ios/include/chat.h +73 -6
- package/ios/include/common/minja/chat-template.hpp +9 -5
- package/ios/include/common/minja/minja.hpp +69 -36
- package/ios/include/common.h +21 -17
- package/ios/include/llama.h +62 -125
- package/ios/libs/llama.xcframework/Info.plist +19 -19
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4617 -4487
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-opt.h +237 -0
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +26 -7
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +62 -125
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4638 -4508
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3557 -3435
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +237 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +26 -7
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +62 -125
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4638 -4508
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3559 -3437
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-opt.h +237 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +26 -7
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +62 -125
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-opt.h +237 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +26 -7
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +62 -125
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-opt.h +237 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +26 -7
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +62 -125
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4616 -4487
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-opt.h +237 -0
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +26 -7
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +62 -125
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4637 -4508
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3556 -3435
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +237 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +26 -7
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +62 -125
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4653 -4523
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-opt.h +237 -0
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +26 -7
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +62 -125
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4674 -4544
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3587 -3465
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +237 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +26 -7
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +62 -125
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/package.json +1 -1
- package/cpp/llama.cpp/common/stb_image.h +0 -7988
- package/cpp/llama.cpp/models/ggml-vocab-chameleon.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-chameleon.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-deepseek-r1-qwen.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-deepseek-r1-qwen.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-gpt-4o.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-gpt-4o.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-llama4.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-llama4.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-pixtral.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-pixtral.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.out +0 -46
|
@@ -17,29 +17,98 @@ void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * G
|
|
|
17
17
|
|
|
18
18
|
#if defined(GGML_SIMD)
|
|
19
19
|
float sumf = 0.0f;
|
|
20
|
-
const int np = (n & ~(GGML_F32_STEP - 1));
|
|
21
20
|
|
|
22
|
-
|
|
21
|
+
#if defined(__ARM_FEATURE_SVE)
|
|
22
|
+
const int sve_register_length = ggml_cpu_get_sve_cnt() * 8;
|
|
23
|
+
const int ggml_f32_epr = sve_register_length / 32;//8;//svcntw(); // SVE128:4, SVE256:8, SVE512:16
|
|
24
|
+
const int ggml_f32_step = 8 * ggml_f32_epr; // choose 8 SVE registers
|
|
25
|
+
|
|
26
|
+
const int np = (n & ~(ggml_f32_step - 1));
|
|
27
|
+
svfloat32_t sum1 = svdup_n_f32(0.0f);
|
|
28
|
+
svfloat32_t sum2 = svdup_n_f32(0.0f);
|
|
29
|
+
svfloat32_t sum3 = svdup_n_f32(0.0f);
|
|
30
|
+
svfloat32_t sum4 = svdup_n_f32(0.0f);
|
|
31
|
+
svfloat32_t sum5 = svdup_n_f32(0.0f);
|
|
32
|
+
svfloat32_t sum6 = svdup_n_f32(0.0f);
|
|
33
|
+
svfloat32_t sum7 = svdup_n_f32(0.0f);
|
|
34
|
+
svfloat32_t sum8 = svdup_n_f32(0.0f);
|
|
35
|
+
svfloat32_t ax1,ax2,ax3,ax4,ax5,ax6,ax7,ax8;
|
|
36
|
+
svfloat32_t ay1,ay2,ay3,ay4,ay5,ay6,ay7,ay8;
|
|
37
|
+
for (int i = 0; i < np; i += ggml_f32_step) {
|
|
38
|
+
ax1 = GGML_F32_VEC_LOAD(x + i);
|
|
39
|
+
ay1 = GGML_F32_VEC_LOAD(y + i);
|
|
40
|
+
sum1 = GGML_F32_VEC_FMA(ax1, ay1, sum1);
|
|
41
|
+
|
|
42
|
+
ax2 = GGML_F32_VEC_LOAD(x + i + 1*ggml_f32_epr);
|
|
43
|
+
ay2 = GGML_F32_VEC_LOAD(y + i + 1*ggml_f32_epr);
|
|
44
|
+
sum2 = GGML_F32_VEC_FMA(ax2, ay2, sum2);
|
|
45
|
+
|
|
46
|
+
ax3 = GGML_F32_VEC_LOAD(x + i + 2*ggml_f32_epr);
|
|
47
|
+
ay3 = GGML_F32_VEC_LOAD(y + i + 2*ggml_f32_epr);
|
|
48
|
+
sum3 = GGML_F32_VEC_FMA(ax3, ay3, sum3);
|
|
49
|
+
|
|
50
|
+
ax4 = GGML_F32_VEC_LOAD(x + i + 3*ggml_f32_epr);
|
|
51
|
+
ay4 = GGML_F32_VEC_LOAD(y + i + 3*ggml_f32_epr);
|
|
52
|
+
sum4 = GGML_F32_VEC_FMA(ax4, ay4, sum4);
|
|
53
|
+
|
|
54
|
+
ax5 = GGML_F32_VEC_LOAD(x + i + 4*ggml_f32_epr);
|
|
55
|
+
ay5 = GGML_F32_VEC_LOAD(y + i + 4*ggml_f32_epr);
|
|
56
|
+
sum5 = GGML_F32_VEC_FMA(ax5, ay5, sum5);
|
|
57
|
+
|
|
58
|
+
ax6 = GGML_F32_VEC_LOAD(x + i + 5*ggml_f32_epr);
|
|
59
|
+
ay6 = GGML_F32_VEC_LOAD(y + i + 5*ggml_f32_epr);
|
|
60
|
+
sum6 = GGML_F32_VEC_FMA(ax6, ay6, sum6);
|
|
61
|
+
|
|
62
|
+
ax7 = GGML_F32_VEC_LOAD(x + i + 6*ggml_f32_epr);
|
|
63
|
+
ay7 = GGML_F32_VEC_LOAD(y + i + 6*ggml_f32_epr);
|
|
64
|
+
sum7 = GGML_F32_VEC_FMA(ax7, ay7, sum7);
|
|
65
|
+
|
|
66
|
+
ax8 = GGML_F32_VEC_LOAD(x + i + 7*ggml_f32_epr);
|
|
67
|
+
ay8 = GGML_F32_VEC_LOAD(y + i + 7*ggml_f32_epr);
|
|
68
|
+
sum8 = GGML_F32_VEC_FMA(ax8, ay8, sum8);
|
|
69
|
+
}
|
|
70
|
+
// leftovers
|
|
71
|
+
// The loop above consumes 8 registers per iteration, so fewer than ggml_f32_step elements remain; the loop below handles them one full register at a time
|
|
72
|
+
const int np2 = (n & ~(ggml_f32_epr - 1));
|
|
73
|
+
for (int i = np; i < np2; i += ggml_f32_epr) {
|
|
74
|
+
ax1 = GGML_F32_VEC_LOAD(x + i);
|
|
75
|
+
ay1 = GGML_F32_VEC_LOAD(y + i);
|
|
76
|
+
sum1 = GGML_F32_VEC_FMA(ax1, ay1, sum1);
|
|
77
|
+
}
|
|
78
|
+
// the number of leftover elements is now less than ggml_f32_epr; apply a predicated svmad to the available elements only
|
|
79
|
+
if (np2 < n) {
|
|
80
|
+
svbool_t pg = svwhilelt_b32(np2, n);
|
|
81
|
+
ax1 = svld1_f32(pg, x + np2);
|
|
82
|
+
ay1 = svld1_f32(pg, y + np2);
|
|
83
|
+
sum1 = svmad_f32_m(pg, ax1, ay1, sum1);
|
|
84
|
+
}
|
|
85
|
+
// reduce the eight partial sums sum1..sum8 into sumf
|
|
86
|
+
GGML_F32_VEC_REDUCE(sumf, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8);
|
|
87
|
+
#else
|
|
88
|
+
const int np = (n & ~(GGML_F32_STEP - 1));
|
|
23
89
|
|
|
24
|
-
|
|
25
|
-
GGML_F32_VEC ay[GGML_F32_ARR];
|
|
90
|
+
GGML_F32_VEC sum[GGML_F32_ARR] = { GGML_F32_VEC_ZERO };
|
|
26
91
|
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
ax[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
|
|
30
|
-
ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
|
|
92
|
+
GGML_F32_VEC ax[GGML_F32_ARR];
|
|
93
|
+
GGML_F32_VEC ay[GGML_F32_ARR];
|
|
31
94
|
|
|
32
|
-
|
|
95
|
+
for (int i = 0; i < np; i += GGML_F32_STEP) {
|
|
96
|
+
for (int j = 0; j < GGML_F32_ARR; j++) {
|
|
97
|
+
ax[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
|
|
98
|
+
ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
|
|
99
|
+
|
|
100
|
+
sum[j] = GGML_F32_VEC_FMA(sum[j], ax[j], ay[j]);
|
|
101
|
+
}
|
|
33
102
|
}
|
|
34
|
-
}
|
|
35
103
|
|
|
36
|
-
|
|
37
|
-
|
|
104
|
+
// reduce the GGML_F32_ARR partial sums in sum[] into sumf
|
|
105
|
+
GGML_F32_VEC_REDUCE(sumf, sum);
|
|
38
106
|
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
107
|
+
// leftovers
|
|
108
|
+
for (int i = np; i < n; ++i) {
|
|
109
|
+
sumf += x[i]*y[i];
|
|
110
|
+
}
|
|
111
|
+
#endif
|
|
43
112
|
#else
|
|
44
113
|
// scalar
|
|
45
114
|
ggml_float sumf = 0.0;
|
|
@@ -5,6 +5,7 @@
|
|
|
5
5
|
#include "ggml-impl.h"
|
|
6
6
|
#include "simd-mappings.h"
|
|
7
7
|
#include "ggml.h"
|
|
8
|
+
#include "ggml-cpu.h"
|
|
8
9
|
|
|
9
10
|
#if defined(GGML_USE_ACCELERATE)
|
|
10
11
|
#include <Accelerate/Accelerate.h>
|
|
@@ -148,27 +149,108 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GG
|
|
|
148
149
|
|
|
149
150
|
inline static void ggml_vec_mad_f32(const int n, float * GGML_RESTRICT y, const float * GGML_RESTRICT x, const float v) {
|
|
150
151
|
#if defined(GGML_SIMD)
|
|
151
|
-
|
|
152
|
+
#if defined(__ARM_FEATURE_SVE)
|
|
152
153
|
|
|
153
|
-
|
|
154
|
+
const int sve_register_length = ggml_cpu_get_sve_cnt() * 8;
|
|
155
|
+
const int ggml_f32_epr = sve_register_length / 32;//8;//svcntw(); // SVE128:4, SVE256:8, SVE512:16
|
|
156
|
+
const int ggml_f32_step = 8 * ggml_f32_epr; // choose 8 SVE registers
|
|
157
|
+
GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);
|
|
154
158
|
|
|
155
|
-
|
|
156
|
-
|
|
159
|
+
const int np = (n & ~(ggml_f32_step - 1));
|
|
160
|
+
svfloat32_t ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8;
|
|
161
|
+
svfloat32_t ay1, ay2, ay3, ay4, ay5, ay6, ay7, ay8;
|
|
162
|
+
for (int i = 0; i < np; i += ggml_f32_step) {
|
|
157
163
|
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
|
|
162
|
-
ay[j] = GGML_F32_VEC_FMA(ay[j], ax[j], vx);
|
|
164
|
+
ax1 = GGML_F32_VEC_LOAD(x + i);
|
|
165
|
+
ay1 = GGML_F32_VEC_LOAD(y + i);
|
|
166
|
+
ay1 = GGML_F32_VEC_FMA(ax1, vx, ay1);
|
|
163
167
|
|
|
164
|
-
GGML_F32_VEC_STORE(y + i
|
|
168
|
+
GGML_F32_VEC_STORE(y + i, ay1);
|
|
169
|
+
|
|
170
|
+
ax2 = GGML_F32_VEC_LOAD(x + i + 1*ggml_f32_epr);
|
|
171
|
+
ay2 = GGML_F32_VEC_LOAD(y + i + 1*ggml_f32_epr);
|
|
172
|
+
ay2 = GGML_F32_VEC_FMA(ax2, vx, ay2);
|
|
173
|
+
|
|
174
|
+
GGML_F32_VEC_STORE(y + i + 1*ggml_f32_epr, ay2);
|
|
175
|
+
|
|
176
|
+
ax3 = GGML_F32_VEC_LOAD(x + i + 2*ggml_f32_epr);
|
|
177
|
+
ay3 = GGML_F32_VEC_LOAD(y + i + 2*ggml_f32_epr);
|
|
178
|
+
ay3 = GGML_F32_VEC_FMA(ax3, vx, ay3);
|
|
179
|
+
|
|
180
|
+
GGML_F32_VEC_STORE(y + i + 2*ggml_f32_epr, ay3);
|
|
181
|
+
|
|
182
|
+
ax4 = GGML_F32_VEC_LOAD(x + i + 3*ggml_f32_epr);
|
|
183
|
+
ay4 = GGML_F32_VEC_LOAD(y + i + 3*ggml_f32_epr);
|
|
184
|
+
ay4 = GGML_F32_VEC_FMA(ax4, vx, ay4);
|
|
185
|
+
|
|
186
|
+
GGML_F32_VEC_STORE(y + i + 3*ggml_f32_epr, ay4);
|
|
187
|
+
|
|
188
|
+
ax5 = GGML_F32_VEC_LOAD(x + i + 4*ggml_f32_epr);
|
|
189
|
+
ay5 = GGML_F32_VEC_LOAD(y + i + 4*ggml_f32_epr);
|
|
190
|
+
ay5 = GGML_F32_VEC_FMA(ax5, vx, ay5);
|
|
191
|
+
|
|
192
|
+
GGML_F32_VEC_STORE(y + i + 4*ggml_f32_epr, ay5);
|
|
193
|
+
|
|
194
|
+
ax6 = GGML_F32_VEC_LOAD(x + i + 5*ggml_f32_epr);
|
|
195
|
+
ay6 = GGML_F32_VEC_LOAD(y + i + 5*ggml_f32_epr);
|
|
196
|
+
ay6 = GGML_F32_VEC_FMA(ax6, vx, ay6);
|
|
197
|
+
|
|
198
|
+
GGML_F32_VEC_STORE(y + i + 5*ggml_f32_epr, ay6);
|
|
199
|
+
|
|
200
|
+
ax7 = GGML_F32_VEC_LOAD(x + i + 6*ggml_f32_epr);
|
|
201
|
+
ay7 = GGML_F32_VEC_LOAD(y + i + 6*ggml_f32_epr);
|
|
202
|
+
ay7 = GGML_F32_VEC_FMA(ax7, vx, ay7);
|
|
203
|
+
|
|
204
|
+
GGML_F32_VEC_STORE(y + i + 6*ggml_f32_epr, ay7);
|
|
205
|
+
|
|
206
|
+
ax8 = GGML_F32_VEC_LOAD(x + i + 7*ggml_f32_epr);
|
|
207
|
+
ay8 = GGML_F32_VEC_LOAD(y + i + 7*ggml_f32_epr);
|
|
208
|
+
ay8 = GGML_F32_VEC_FMA(ax8, vx, ay8);
|
|
209
|
+
|
|
210
|
+
GGML_F32_VEC_STORE(y + i + 7*ggml_f32_epr, ay8);
|
|
165
211
|
}
|
|
166
|
-
|
|
212
|
+
// leftovers
|
|
213
|
+
// The loop above consumes 8 registers per iteration, so fewer than ggml_f32_step elements remain; the loop below handles them one full register at a time
|
|
214
|
+
const int np2 = (n & ~(ggml_f32_epr - 1));
|
|
215
|
+
for (int i = np; i < np2; i += ggml_f32_epr) {
|
|
216
|
+
ax1 = GGML_F32_VEC_LOAD(x + i);
|
|
217
|
+
ay1 = GGML_F32_VEC_LOAD(y + i);
|
|
218
|
+
ay1 = GGML_F32_VEC_FMA(ax1, vx, ay1);
|
|
219
|
+
|
|
220
|
+
GGML_F32_VEC_STORE(y + i, ay1);
|
|
221
|
+
}
|
|
222
|
+
// the number of leftover elements is now less than ggml_f32_epr; apply a predicated svmad to the available elements only
|
|
223
|
+
if (np2 < n) {
|
|
224
|
+
svbool_t pg =svwhilelt_b32(np2, n);
|
|
225
|
+
ax1 = svld1_f32(pg, x + np2);
|
|
226
|
+
ay1 = svld1_f32(pg, y + np2);
|
|
227
|
+
ay1 = svmad_f32_m(pg, ax1, vx, ay1);
|
|
228
|
+
|
|
229
|
+
svst1_f32(pg, y + np2, ay1);
|
|
230
|
+
}
|
|
231
|
+
#else
|
|
232
|
+
const int np = (n & ~(GGML_F32_STEP - 1));
|
|
167
233
|
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
234
|
+
GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);
|
|
235
|
+
|
|
236
|
+
GGML_F32_VEC ax[GGML_F32_ARR];
|
|
237
|
+
GGML_F32_VEC ay[GGML_F32_ARR];
|
|
238
|
+
|
|
239
|
+
for (int i = 0; i < np; i += GGML_F32_STEP) {
|
|
240
|
+
for (int j = 0; j < GGML_F32_ARR; j++) {
|
|
241
|
+
ax[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
|
|
242
|
+
ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
|
|
243
|
+
ay[j] = GGML_F32_VEC_FMA(ay[j], ax[j], vx);
|
|
244
|
+
|
|
245
|
+
GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
|
|
246
|
+
}
|
|
247
|
+
}
|
|
248
|
+
|
|
249
|
+
// leftovers
|
|
250
|
+
for (int i = np; i < n; ++i) {
|
|
251
|
+
y[i] += x[i]*v;
|
|
252
|
+
}
|
|
253
|
+
#endif
|
|
172
254
|
#else
|
|
173
255
|
// scalar
|
|
174
256
|
for (int i = 0; i < n; ++i) {
|
|
@@ -220,36 +302,45 @@ inline static void ggml_vec_mad_f32_unroll(const int n, const int xs, const int
|
|
|
220
302
|
}
|
|
221
303
|
|
|
222
304
|
#if defined(GGML_SIMD)
|
|
223
|
-
|
|
305
|
+
#if defined(__ARM_FEATURE_SVE)
|
|
306
|
+
// TODO: write SVE code; until then, route to the scalar implementation
|
|
307
|
+
for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
|
|
308
|
+
for (int i = 0; i < n; ++i) {
|
|
309
|
+
y[i] += x[k][i]*v[k][0];
|
|
310
|
+
}
|
|
311
|
+
}
|
|
312
|
+
#else
|
|
313
|
+
const int np = (n & ~(GGML_F32_STEP - 1));
|
|
224
314
|
|
|
225
|
-
|
|
315
|
+
GGML_F32_VEC vx[GGML_VEC_MAD_UNROLL];
|
|
226
316
|
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
317
|
+
for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
|
|
318
|
+
vx[k] = GGML_F32_VEC_SET1(v[k][0]);
|
|
319
|
+
}
|
|
230
320
|
|
|
231
|
-
|
|
232
|
-
|
|
321
|
+
GGML_F32_VEC ax[GGML_VEC_MAD_UNROLL][GGML_F32_ARR];
|
|
322
|
+
GGML_F32_VEC ay[GGML_F32_ARR];
|
|
233
323
|
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
324
|
+
for (int i = 0; i < np; i += GGML_F32_STEP) {
|
|
325
|
+
for (int j = 0; j < GGML_F32_ARR; j++) {
|
|
326
|
+
ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
|
|
237
327
|
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
328
|
+
for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
|
|
329
|
+
ax[k][j] = GGML_F32_VEC_LOAD(x[k] + i + j*GGML_F32_EPR);
|
|
330
|
+
ay[j] = GGML_F32_VEC_FMA(ay[j], ax[k][j], vx[k]);
|
|
331
|
+
}
|
|
242
332
|
|
|
243
|
-
|
|
333
|
+
GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
|
|
334
|
+
}
|
|
244
335
|
}
|
|
245
|
-
}
|
|
246
336
|
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
337
|
+
// leftovers
|
|
338
|
+
for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
|
|
339
|
+
for (int i = np; i < n; ++i) {
|
|
340
|
+
y[i] += x[k][i]*v[k][0];
|
|
341
|
+
}
|
|
251
342
|
}
|
|
252
|
-
|
|
343
|
+
#endif
|
|
253
344
|
#else
|
|
254
345
|
// scalar
|
|
255
346
|
for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
|
|
@@ -265,25 +356,53 @@ inline static void ggml_vec_scale_f32(const int n, float * y, const float v) {
|
|
|
265
356
|
#if defined(GGML_USE_ACCELERATE)
|
|
266
357
|
vDSP_vsmul(y, 1, &v, y, 1, n);
|
|
267
358
|
#elif defined(GGML_SIMD)
|
|
268
|
-
|
|
359
|
+
#if defined(__ARM_FEATURE_SVE)
|
|
360
|
+
const int sve_register_length = ggml_cpu_get_sve_cnt() * 8;
|
|
361
|
+
const int ggml_f32_epr = sve_register_length / 32;//8;//svcntw(); // SVE128:4, SVE256:8, SVE512:16
|
|
362
|
+
const int ggml_f32_step = 2 * ggml_f32_epr;
|
|
363
|
+
|
|
364
|
+
GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);
|
|
365
|
+
const int np = (n & ~(ggml_f32_step - 1));
|
|
366
|
+
svfloat32_t ay1;
|
|
367
|
+
svfloat32_t ay2;
|
|
368
|
+
for (int i = 0; i < np; i += ggml_f32_step) {
|
|
369
|
+
ay1 = GGML_F32_VEC_LOAD(y + i);
|
|
370
|
+
ay1 = GGML_F32_VEC_MUL(ay1, vx);
|
|
371
|
+
GGML_F32_VEC_STORE(y + i, ay1);
|
|
372
|
+
|
|
373
|
+
ay2 = GGML_F32_VEC_LOAD(y + i + 1*ggml_f32_epr);
|
|
374
|
+
ay2 = GGML_F32_VEC_MUL(ay2, vx);
|
|
375
|
+
GGML_F32_VEC_STORE(y + i + 1*ggml_f32_epr, ay2);
|
|
376
|
+
}
|
|
377
|
+
// leftovers
|
|
378
|
+
// maximum number of leftover elements will be less that ggml_f32_epr. Apply predicated svmad on available elements only
|
|
379
|
+
if (np < n) {
|
|
380
|
+
svbool_t pg = svwhilelt_b32(np, n);
|
|
381
|
+
ay1 = svld1_f32(pg, y + np);
|
|
382
|
+
ay1 = svmul_f32_m(pg, ay1, vx);
|
|
383
|
+
svst1_f32(pg, y + np, ay1);
|
|
384
|
+
}
|
|
385
|
+
#else
|
|
386
|
+
const int np = (n & ~(GGML_F32_STEP - 1));
|
|
269
387
|
|
|
270
|
-
|
|
388
|
+
GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);
|
|
271
389
|
|
|
272
|
-
|
|
390
|
+
GGML_F32_VEC ay[GGML_F32_ARR];
|
|
273
391
|
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
392
|
+
for (int i = 0; i < np; i += GGML_F32_STEP) {
|
|
393
|
+
for (int j = 0; j < GGML_F32_ARR; j++) {
|
|
394
|
+
ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
|
|
395
|
+
ay[j] = GGML_F32_VEC_MUL(ay[j], vx);
|
|
278
396
|
|
|
279
|
-
|
|
397
|
+
GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
|
|
398
|
+
}
|
|
280
399
|
}
|
|
281
|
-
}
|
|
282
400
|
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
401
|
+
// leftovers
|
|
402
|
+
for (int i = np; i < n; ++i) {
|
|
403
|
+
y[i] *= v;
|
|
404
|
+
}
|
|
405
|
+
#endif
|
|
287
406
|
#else
|
|
288
407
|
// scalar
|
|
289
408
|
for (int i = 0; i < n; ++i) {
|
|
@@ -428,6 +547,7 @@ inline static void ggml_vec_exp_f16 (const int n, ggml_fp16_t * y, const ggml_fp
|
|
|
428
547
|
static const float GELU_COEF_A = 0.044715f;
|
|
429
548
|
static const float GELU_QUICK_COEF = -1.702f;
|
|
430
549
|
static const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
|
|
550
|
+
static const float SQRT_2_INV = 0.70710678118654752440084436210484f;
|
|
431
551
|
|
|
432
552
|
inline static float ggml_gelu_f32(float x) {
|
|
433
553
|
return 0.5f*x*(1.0f + tanhf(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x)));
|
|
@@ -440,6 +560,14 @@ inline static void ggml_vec_gelu_f16(const int n, ggml_fp16_t * y, const ggml_fp
|
|
|
440
560
|
}
|
|
441
561
|
}
|
|
442
562
|
|
|
563
|
+
inline static void ggml_vec_gelu_erf_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
|
|
564
|
+
for (int i = 0; i < n; ++i) {
|
|
565
|
+
float xi = GGML_FP16_TO_FP32(x[i]);
|
|
566
|
+
float res = 0.5f*xi*(1.0f + erff(xi*SQRT_2_INV));
|
|
567
|
+
y[i] = GGML_FP32_TO_FP16(res);
|
|
568
|
+
}
|
|
569
|
+
}
|
|
570
|
+
|
|
443
571
|
#ifdef GGML_GELU_FP16
|
|
444
572
|
inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) {
|
|
445
573
|
uint16_t t;
|
|
@@ -463,6 +591,13 @@ inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) {
|
|
|
463
591
|
}
|
|
464
592
|
#endif
|
|
465
593
|
|
|
594
|
+
inline static void ggml_vec_gelu_erf_f32(const int n, float * y, const float * x) {
|
|
595
|
+
for (int i = 0; i < n; ++i) {
|
|
596
|
+
float xi = x[i];
|
|
597
|
+
y[i] = 0.5f*xi*(1.0f + erff(xi*SQRT_2_INV));
|
|
598
|
+
}
|
|
599
|
+
}
|
|
600
|
+
|
|
466
601
|
inline static float ggml_gelu_quick_f32(float x) {
|
|
467
602
|
return x*(1.0f/(1.0f+expf(GELU_QUICK_COEF*x)));
|
|
468
603
|
}
|
|
@@ -512,6 +647,42 @@ inline static ggml_fp16_t ggml_silu_f16(ggml_fp16_t x) {
|
|
|
512
647
|
#error "ref: https://github.com/ggml-org/llama.cpp/pull/7154#issuecomment-2143844461"
|
|
513
648
|
#endif
|
|
514
649
|
|
|
650
|
+
/* Below function was borrowed from the GitHub repository:
|
|
651
|
+
https://github.com/openvinotoolkit/openvino/blob/master/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/common.hpp */
|
|
652
|
+
#if defined(__ARM_FEATURE_SVE) && defined(__aarch64__)
|
|
653
|
+
inline static svfloat32_t exp_ps_sve(svbool_t pg, svfloat32_t src) {
|
|
654
|
+
// Constants
|
|
655
|
+
const svfloat32_t log2_e = svdup_n_f32(1.4426950409f);
|
|
656
|
+
const svfloat32_t ln2 = svdup_n_f32(0.6931473921f);
|
|
657
|
+
const svfloat32_t half_ln2_sq = svdup_n_f32(0.2413862043f);
|
|
658
|
+
const svuint32_t not_mask17 = svdup_n_u32(~((1u << 17) - 1));
|
|
659
|
+
const svfloat32_t one = svdup_n_f32(1.0f);
|
|
660
|
+
const svfloat32_t inactive1 = svdup_n_f32(0.0f);
|
|
661
|
+
const svint32_t inactive2 = svdup_n_s32(0);
|
|
662
|
+
|
|
663
|
+
// Algorithm starts here
|
|
664
|
+
svfloat32_t t0 = svmul_f32_m(pg, src, log2_e); // y = x * log2(e)
|
|
665
|
+
svfloat32_t t1 = svrintm_f32_m(inactive1, pg, t0); // rount to int (float)
|
|
666
|
+
svint32_t t2 = svcvt_s32_f32_m(inactive2, pg, t1); // n
|
|
667
|
+
|
|
668
|
+
t1 = svsub_f32_m(pg, t0, t1); // a = y - floor(y)
|
|
669
|
+
t1 = svadd_f32_m(pg, t1, one); // b = a + 1
|
|
670
|
+
|
|
671
|
+
svuint32_t t3 = svlsr_n_u32_m(pg, svreinterpret_u32_f32(t1), 17); // v = b >> 17 (u32)
|
|
672
|
+
svfloat32_t t4 = svexpa_f32(t3); // c = fexpa(v)
|
|
673
|
+
t4 = svscale_f32_m(pg, t4, t2); // fexpa(v) * 2^(n)
|
|
674
|
+
|
|
675
|
+
// and_(t2.d, t1.d, not_mask17.d)
|
|
676
|
+
svfloat32_t t5 = svreinterpret_f32_u32(svand_u32_m(pg, svreinterpret_u32_f32(t1), not_mask17));
|
|
677
|
+
t5 = svsub_f32_m(pg, t1, t5); // z
|
|
678
|
+
t0 = svmla_f32_m(pg, ln2, t5, half_ln2_sq); // ln2 + half_ln2_sq * z
|
|
679
|
+
t0 = svmla_f32_m(pg, one, t5, t0); // 1 + (ln2 * z) + (half_ln2_sq * z * z)
|
|
680
|
+
t0 = svmul_f32_m(pg, t0, t4); // Final result
|
|
681
|
+
|
|
682
|
+
return t0;
|
|
683
|
+
}
|
|
684
|
+
#endif
|
|
685
|
+
|
|
515
686
|
#if defined(__ARM_NEON) && defined(__aarch64__)
|
|
516
687
|
|
|
517
688
|
// adapted from arm limited optimized routine
|
|
@@ -1,47 +1,61 @@
|
|
|
1
1
|
#include "acc.cuh"
|
|
2
2
|
|
|
3
|
-
static __global__ void acc_f32(const float * x, const float * y, float * dst, const
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
const
|
|
3
|
+
static __global__ void acc_f32(const float * x, const float * y, float * dst, const int64_t ne,
|
|
4
|
+
const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t ne13,
|
|
5
|
+
const int64_t s11, const int64_t s12, const int64_t s13, const int64_t offset) {
|
|
6
|
+
const int64_t i = blockDim.x * blockIdx.x + threadIdx.x;
|
|
7
|
+
|
|
7
8
|
if (i >= ne) {
|
|
8
9
|
return;
|
|
9
10
|
}
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
11
|
+
|
|
12
|
+
int64_t src1_idx = i - offset;
|
|
13
|
+
|
|
14
|
+
int64_t tmp = src1_idx;
|
|
15
|
+
const int64_t i13 = tmp / s13;
|
|
16
|
+
tmp -= i13 * s13;
|
|
17
|
+
const int64_t i12 = tmp / s12;
|
|
18
|
+
tmp -= i12 * s12;
|
|
19
|
+
const int64_t i11 = tmp / s11;
|
|
20
|
+
tmp -= i11 * s11;
|
|
21
|
+
const int64_t i10 = tmp;
|
|
22
|
+
|
|
23
|
+
float val = x[i];
|
|
24
|
+
if (src1_idx >= 0 && i10 < ne10 && i11 < ne11 && i12 < ne12 && i13 < ne13) {
|
|
25
|
+
val += y[((i13*ne12 + i12) * ne11 + i11) * ne10 + i10];
|
|
18
26
|
}
|
|
27
|
+
dst[i] = val;
|
|
19
28
|
}
|
|
20
29
|
|
|
21
|
-
static void acc_f32_cuda(const float * x, const float * y, float * dst, const
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
int num_blocks = (n_elements + CUDA_ACC_BLOCK_SIZE - 1) / CUDA_ACC_BLOCK_SIZE;
|
|
25
|
-
acc_f32<<<num_blocks, CUDA_ACC_BLOCK_SIZE, 0, stream>>>(x, y, dst, n_elements, ne10, ne11, ne12,
|
|
30
|
+
static void acc_f32_cuda(const float * x, const float * y, float * dst, const int64_t n_elements,
|
|
31
|
+
const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t ne13,
|
|
32
|
+
const int64_t s1, const int64_t s2, const int64_t s3, const int64_t offset, cudaStream_t stream) {
|
|
33
|
+
const int num_blocks = (n_elements + CUDA_ACC_BLOCK_SIZE - 1) / CUDA_ACC_BLOCK_SIZE;
|
|
34
|
+
acc_f32<<<num_blocks, CUDA_ACC_BLOCK_SIZE, 0, stream>>>(x, y, dst, n_elements, ne10, ne11, ne12, ne13, s1, s2, s3, offset);
|
|
26
35
|
}
|
|
27
36
|
|
|
28
37
|
void ggml_cuda_op_acc(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
|
29
38
|
const ggml_tensor * src0 = dst->src[0];
|
|
30
39
|
const ggml_tensor * src1 = dst->src[1];
|
|
31
|
-
|
|
32
|
-
const float *
|
|
33
|
-
float *
|
|
40
|
+
|
|
41
|
+
const float * src0_d = (const float *) src0->data;
|
|
42
|
+
const float * src1_d = (const float *) src1->data;
|
|
43
|
+
float * dst_d = (float *) dst->data;
|
|
44
|
+
|
|
34
45
|
cudaStream_t stream = ctx.stream();
|
|
35
46
|
|
|
36
47
|
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
|
37
48
|
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
|
38
49
|
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
|
39
|
-
GGML_ASSERT(dst->ne[3] == 1); // just 3D tensors supported
|
|
40
50
|
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
51
|
+
GGML_ASSERT(ggml_is_contiguous(src1));
|
|
52
|
+
GGML_ASSERT(dst->nb[0] == ggml_element_size(dst));
|
|
53
|
+
GGML_ASSERT(ggml_is_contiguously_allocated(dst));
|
|
54
|
+
|
|
55
|
+
const int64_t s1 = dst->op_params[0] / sizeof(float);
|
|
56
|
+
const int64_t s2 = dst->op_params[1] / sizeof(float);
|
|
57
|
+
const int64_t s3 = dst->op_params[2] / sizeof(float);
|
|
58
|
+
const int64_t offset = dst->op_params[3] / sizeof(float);
|
|
45
59
|
|
|
46
|
-
acc_f32_cuda(src0_d, src1_d, dst_d, ggml_nelements(dst), src1->ne[0], src1->ne[1], src1->ne[2],
|
|
60
|
+
acc_f32_cuda(src0_d, src1_d, dst_d, ggml_nelements(dst), src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3], s1, s2, s3, offset, stream);
|
|
47
61
|
}
|
|
@@ -168,7 +168,7 @@ void ggml_cuda_error(const char * stmt, const char * func, const char * file, in
|
|
|
168
168
|
|
|
169
169
|
#define CUBLAS_CHECK(err) CUDA_CHECK_GEN(err, CUBLAS_STATUS_SUCCESS, cublas_get_error_str)
|
|
170
170
|
|
|
171
|
-
#if !defined(GGML_USE_HIP)
|
|
171
|
+
#if !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)
|
|
172
172
|
static const char * cu_get_error_str(CUresult err) {
|
|
173
173
|
const char * err_str;
|
|
174
174
|
cuGetErrorString(err, &err_str);
|
|
@@ -1,5 +1,8 @@
|
|
|
1
1
|
#include "cpy.cuh"
|
|
2
2
|
#include "dequantize.cuh"
|
|
3
|
+
#ifdef GGML_USE_MUSA
|
|
4
|
+
#include "ggml-musa/mudnn.cuh"
|
|
5
|
+
#endif // GGML_USE_MUSA
|
|
3
6
|
|
|
4
7
|
typedef void (*cpy_kernel_t)(const char * cx, char * cdst);
|
|
5
8
|
|
|
@@ -597,7 +600,14 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
|
|
|
597
600
|
#endif
|
|
598
601
|
if (src0->type == src1->type && ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) {
|
|
599
602
|
GGML_ASSERT(ggml_nbytes(src0) == ggml_nbytes(src1));
|
|
600
|
-
|
|
603
|
+
#ifdef GGML_USE_MUSA
|
|
604
|
+
if (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16) {
|
|
605
|
+
CUDA_CHECK(mudnnMemcpyAsync(ctx, src1, src0));
|
|
606
|
+
} else
|
|
607
|
+
#endif // GGML_USE_MUSA
|
|
608
|
+
{
|
|
609
|
+
CUDA_CHECK(cudaMemcpyAsync(src1_ddc, src0_ddc, ggml_nbytes(src0), cudaMemcpyDeviceToDevice, main_stream));
|
|
610
|
+
}
|
|
601
611
|
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
|
|
602
612
|
ggml_cpy_f32_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
|
|
603
613
|
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_BF16) {
|
|
@@ -623,8 +623,8 @@ static __global__ void flash_attn_combine_results(
|
|
|
623
623
|
__builtin_assume(tid < D);
|
|
624
624
|
|
|
625
625
|
extern __shared__ float2 meta[];
|
|
626
|
-
|
|
627
|
-
((float *) meta)[
|
|
626
|
+
for (int i = tid; i < 2*parallel_blocks; i += D) {
|
|
627
|
+
((float *) meta)[i] = ((const float *)VKQ_meta) [blockIdx.z*(2*parallel_blocks) + i];
|
|
628
628
|
}
|
|
629
629
|
|
|
630
630
|
__syncthreads();
|
|
@@ -678,10 +678,14 @@ void launch_fattn(
|
|
|
678
678
|
) {
|
|
679
679
|
constexpr int ncols = ncols1 * ncols2;
|
|
680
680
|
|
|
681
|
+
const bool is_mla = DV == 512; // TODO better parameterization
|
|
682
|
+
|
|
681
683
|
const ggml_tensor * Q = dst->src[0];
|
|
682
684
|
const ggml_tensor * K = dst->src[1];
|
|
683
685
|
const ggml_tensor * V = dst->src[2];
|
|
684
686
|
|
|
687
|
+
GGML_ASSERT(V || is_mla);
|
|
688
|
+
|
|
685
689
|
const ggml_tensor * mask = dst->src[3];
|
|
686
690
|
|
|
687
691
|
ggml_tensor * KQV = dst;
|
|
@@ -689,6 +693,10 @@ void launch_fattn(
|
|
|
689
693
|
GGML_ASSERT(Q->type == GGML_TYPE_F32);
|
|
690
694
|
GGML_ASSERT(KQV->type == GGML_TYPE_F32);
|
|
691
695
|
|
|
696
|
+
GGML_ASSERT( Q->nb[0] == ggml_element_size(Q));
|
|
697
|
+
GGML_ASSERT( K->nb[0] == ggml_element_size(K));
|
|
698
|
+
GGML_ASSERT(!V || V->nb[0] == ggml_element_size(V));
|
|
699
|
+
|
|
692
700
|
GGML_ASSERT(!mask || mask->type == GGML_TYPE_F16);
|
|
693
701
|
GGML_ASSERT(!mask || mask->ne[1] >= GGML_PAD(Q->ne[1], 16) &&
|
|
694
702
|
"the Flash-Attention CUDA kernel requires the mask to be padded to 16 and at least n_queries big");
|
|
@@ -713,10 +721,10 @@ void launch_fattn(
|
|
|
713
721
|
size_t nb12 = K->nb[2];
|
|
714
722
|
size_t nb13 = K->nb[3];
|
|
715
723
|
|
|
716
|
-
const char * V_data = (const char *) V->data;
|
|
717
|
-
size_t nb21 = V->nb[1];
|
|
718
|
-
size_t nb22 = V->nb[2];
|
|
719
|
-
size_t nb23 = V->nb[3];
|
|
724
|
+
const char * V_data = V ? (const char *) V->data : nullptr;
|
|
725
|
+
size_t nb21 = V ? V->nb[1] : nb11;
|
|
726
|
+
size_t nb22 = V ? V->nb[2] : nb12;
|
|
727
|
+
size_t nb23 = V ? V->nb[3] : nb13;
|
|
720
728
|
|
|
721
729
|
if (need_f16_K && K->type != GGML_TYPE_F16) {
|
|
722
730
|
GGML_ASSERT(ggml_is_contiguously_allocated(K));
|
|
@@ -733,7 +741,7 @@ void launch_fattn(
|
|
|
733
741
|
nb13 = nb13*bs*sizeof(half)/ts;
|
|
734
742
|
}
|
|
735
743
|
|
|
736
|
-
if (need_f16_V && V->type != GGML_TYPE_F16) {
|
|
744
|
+
if (V && need_f16_V && V->type != GGML_TYPE_F16) {
|
|
737
745
|
GGML_ASSERT(ggml_is_contiguously_allocated(V));
|
|
738
746
|
V_f16.alloc(ggml_nelements(V));
|
|
739
747
|
to_fp16_cuda_t to_fp16 = ggml_get_to_fp16_cuda(V->type);
|