@novastera-oss/llamarn 0.2.7 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/android/src/main/cpp/include/llama.h +8 -3
- package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libggml-base.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libggml.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libllama.so +0 -0
- package/android/src/main/jniLibs/x86/libggml-base.so +0 -0
- package/android/src/main/jniLibs/x86/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/x86/libggml.so +0 -0
- package/android/src/main/jniLibs/x86/libllama.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
- package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
- package/cpp/LlamaCppModel.cpp +56 -22
- package/cpp/build-info.cpp +2 -2
- package/cpp/llama.cpp/CMakeLists.txt +1 -2
- package/cpp/llama.cpp/README.md +4 -5
- package/cpp/llama.cpp/build-xcframework.sh +1 -1
- package/cpp/llama.cpp/common/CMakeLists.txt +4 -5
- package/cpp/llama.cpp/common/arg.cpp +24 -0
- package/cpp/llama.cpp/common/chat.cpp +37 -20
- package/cpp/llama.cpp/common/chat.h +2 -0
- package/cpp/llama.cpp/common/common.cpp +3 -0
- package/cpp/llama.cpp/common/common.h +5 -0
- package/cpp/llama.cpp/common/json-schema-to-grammar.cpp +3 -46
- package/cpp/llama.cpp/convert_hf_to_gguf.py +860 -23
- package/cpp/llama.cpp/convert_hf_to_gguf_update.py +9 -0
- package/cpp/llama.cpp/ggml/CMakeLists.txt +8 -2
- package/cpp/llama.cpp/ggml/include/ggml-backend.h +1 -1
- package/cpp/llama.cpp/ggml/include/ggml-cpu.h +2 -0
- package/cpp/llama.cpp/ggml/include/ggml.h +206 -10
- package/cpp/llama.cpp/ggml/src/CMakeLists.txt +17 -1
- package/cpp/llama.cpp/ggml/src/ggml-backend-reg.cpp +0 -8
- package/cpp/llama.cpp/ggml/src/ggml-backend.cpp +36 -18
- package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +68 -5
- package/cpp/llama.cpp/ggml/src/ggml-cann/common.h +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +16 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +37 -3
- package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +10 -9
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +109 -108
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +1027 -1038
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +53 -52
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +56 -55
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +42 -41
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +24 -23
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +29 -28
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +30 -29
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +83 -82
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +20 -19
- package/cpp/llama.cpp/ggml/src/ggml-cpu/common.h +3 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +9 -3
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +111 -103
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +3 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +1405 -240
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.h +8 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/quants.c +25 -24
- package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.cpp +56 -40
- package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +212 -34
- package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +35 -11
- package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.h +187 -54
- package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +71 -29
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-dw.cu +161 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-dw.cuh +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-transpose.cu +91 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-transpose.cuh +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/convert.cu +22 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/convert.cuh +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/cross-entropy-loss.cu +2 -14
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-common.cuh +4 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +8 -4
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f16.cu +6 -4
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f32.cu +14 -12
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f16.cuh +5 -3
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f32.cuh +15 -10
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-wmma-f16.cu +12 -6
- package/cpp/llama.cpp/ggml/src/ggml-cuda/getrows.cu +8 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +269 -110
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mean.cu +19 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mean.cuh +3 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmq.cuh +2 -8
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmv.cu +257 -87
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmv.cuh +2 -3
- package/cpp/llama.cpp/ggml/src/ggml-cuda/rope.cu +21 -27
- package/cpp/llama.cpp/ggml/src/ggml-cuda/scale.cu +8 -6
- package/cpp/llama.cpp/ggml/src/ggml-cuda/softmax.cu +119 -58
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ssm-conv.cu +10 -2
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ssm-scan.cu +192 -52
- package/cpp/llama.cpp/ggml/src/ggml-cuda/sumrows.cu +5 -18
- package/cpp/llama.cpp/ggml/src/ggml-cuda/sumrows.cuh +0 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cu +97 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cuh +11 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/upscale.cu +92 -6
- package/cpp/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +14 -5
- package/cpp/llama.cpp/ggml/src/ggml-impl.h +125 -183
- package/cpp/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +4 -2
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +51 -9
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +394 -80
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +616 -239
- package/cpp/llama.cpp/ggml/src/ggml-musa/mudnn.cuh +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +3 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +741 -571
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/gelu.cl +27 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/glu.cl +337 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mat_f16_f32.cl +130 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/scale.cl +3 -2
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/set_rows.cl +95 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_4_f16.cl +24 -11
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_4_f32.cl +24 -11
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_f16.cl +24 -11
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_f32.cl +24 -11
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/upscale.cl +2 -3
- package/cpp/llama.cpp/ggml/src/ggml-quants.c +6 -6
- package/cpp/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +5 -6
- package/cpp/llama.cpp/ggml/src/ggml-sycl/common.hpp +1 -24
- package/cpp/llama.cpp/ggml/src/ggml-sycl/concat.cpp +28 -41
- package/cpp/llama.cpp/ggml/src/ggml-sycl/conv.cpp +4 -10
- package/cpp/llama.cpp/ggml/src/ggml-sycl/convert.cpp +99 -166
- package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +94 -72
- package/cpp/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +49 -67
- package/cpp/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +31 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +697 -1098
- package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +18 -9
- package/cpp/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +6 -9
- package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +104 -62
- package/cpp/llama.cpp/ggml/src/ggml-sycl/gla.cpp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +60 -80
- package/cpp/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +132 -201
- package/cpp/llama.cpp/ggml/src/ggml-sycl/norm.cpp +55 -74
- package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +39 -38
- package/cpp/llama.cpp/ggml/src/ggml-sycl/set_rows.cpp +131 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/set_rows.hpp +8 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +3 -3
- package/cpp/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +3 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +3 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +3 -8
- package/cpp/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +12 -16
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +12 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +767 -292
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +58 -7
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +28 -23
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp +14 -9
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +38 -32
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +32 -27
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +44 -12
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu.comp +13 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu_erf.comp +27 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu_quick.comp +11 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/gelu_erf.comp +39 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.comp +15 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/glu_main.comp +29 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +128 -72
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +38 -9
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/reglu.comp +9 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +12 -3
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/roll.comp +46 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +7 -9
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +7 -9
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +7 -9
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +20 -4
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/swiglu.comp +9 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +69 -5
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +23 -3
- package/cpp/llama.cpp/ggml/src/ggml.c +449 -72
- package/cpp/llama.cpp/ggml/src/gguf.cpp +13 -2
- package/cpp/llama.cpp/gguf-py/gguf/constants.py +285 -0
- package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +27 -0
- package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +137 -21
- package/cpp/llama.cpp/gguf-py/gguf/vocab.py +109 -7
- package/cpp/llama.cpp/gguf-py/pyproject.toml +2 -2
- package/cpp/llama.cpp/include/llama.h +8 -43
- package/cpp/llama.cpp/models/templates/Mistral-Small-3.2-24B-Instruct-2506.jinja +124 -0
- package/cpp/llama.cpp/src/llama-arch.cpp +265 -3
- package/cpp/llama.cpp/src/llama-arch.h +36 -1
- package/cpp/llama.cpp/src/llama-batch.cpp +596 -359
- package/cpp/llama.cpp/src/llama-batch.h +105 -70
- package/cpp/llama.cpp/src/llama-chat.cpp +26 -6
- package/cpp/llama.cpp/src/llama-chat.h +1 -0
- package/cpp/llama.cpp/src/llama-context.cpp +101 -107
- package/cpp/llama.cpp/src/llama-context.h +13 -13
- package/cpp/llama.cpp/src/llama-graph.cpp +286 -404
- package/cpp/llama.cpp/src/llama-graph.h +78 -79
- package/cpp/llama.cpp/src/llama-hparams.cpp +11 -1
- package/cpp/llama.cpp/src/llama-hparams.h +11 -0
- package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +74 -66
- package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.h +23 -26
- package/cpp/llama.cpp/src/llama-kv-cache-unified.cpp +312 -157
- package/cpp/llama.cpp/src/llama-kv-cache-unified.h +79 -46
- package/cpp/llama.cpp/src/llama-kv-cells.h +97 -21
- package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +73 -69
- package/cpp/llama.cpp/src/llama-memory-hybrid.h +19 -22
- package/cpp/llama.cpp/src/llama-memory-recurrent.cpp +88 -77
- package/cpp/llama.cpp/src/llama-memory-recurrent.h +15 -20
- package/cpp/llama.cpp/src/llama-memory.cpp +17 -0
- package/cpp/llama.cpp/src/llama-memory.h +21 -22
- package/cpp/llama.cpp/src/llama-model-saver.cpp +1 -0
- package/cpp/llama.cpp/src/llama-model.cpp +5301 -2922
- package/cpp/llama.cpp/src/llama-model.h +40 -0
- package/cpp/llama.cpp/src/llama-quant.cpp +88 -5
- package/cpp/llama.cpp/src/llama-vocab.cpp +37 -3
- package/cpp/llama.cpp/src/llama-vocab.h +42 -0
- package/cpp/rn-utils.h +3 -0
- package/ios/include/chat.h +2 -0
- package/ios/include/common.h +5 -0
- package/ios/include/llama.h +8 -43
- package/ios/libs/llama.xcframework/Info.plist +19 -19
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5059 -4863
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-backend.h +1 -1
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +206 -10
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +8 -43
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5030 -4834
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3889 -3742
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +1 -1
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +206 -10
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +8 -43
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5030 -4834
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3891 -3744
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-backend.h +1 -1
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +206 -10
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +8 -43
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-backend.h +1 -1
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +206 -10
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +8 -43
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-backend.h +1 -1
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +206 -10
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +8 -43
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5059 -4863
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-backend.h +1 -1
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +206 -10
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +8 -43
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5030 -4834
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3889 -3742
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +1 -1
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +206 -10
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +8 -43
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5095 -4900
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-backend.h +1 -1
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +206 -10
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +8 -43
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5066 -4871
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3919 -3773
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +1 -1
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +206 -10
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +8 -43
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/package.json +1 -1
- package/cpp/llama.cpp/ggml/include/ggml-kompute.h +0 -50
- package/cpp/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +0 -166
- package/cpp/llama.cpp/ggml/src/ggml-kompute/ggml-kompute.cpp +0 -2251
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/common.comp +0 -112
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_add.comp +0 -58
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_addrow.comp +0 -25
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f16.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f32.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f16.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f32.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_diagmask.comp +0 -30
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_gelu.comp +0 -22
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows.comp +0 -17
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f16.comp +0 -31
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f32.comp +0 -31
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_0.comp +0 -38
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_1.comp +0 -39
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q6_k.comp +0 -44
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_f16.comp +0 -69
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_mat_f32.comp +0 -51
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_0.comp +0 -33
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_1.comp +0 -35
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_k.comp +0 -140
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q6_k.comp +0 -106
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q8_0.comp +0 -73
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n_pre.comp +0 -28
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_norm.comp +0 -84
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_relu.comp +0 -21
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rmsnorm.comp +0 -53
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f16.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f32.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f16.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f32.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_scale.comp +0 -19
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_scale_8.comp +0 -23
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_silu.comp +0 -22
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_softmax.comp +0 -72
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/rope_common.comp +0 -71
|
@@ -58,7 +58,7 @@ inline static void ggml_vec_set_bf16(const int n, ggml_bf16_t * x, const ggml_bf
|
|
|
58
58
|
inline static void ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] + y[i]; }
|
|
59
59
|
inline static void ggml_vec_add_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) {
|
|
60
60
|
for (int i = 0; i < n; ++i) {
|
|
61
|
-
z[i] =
|
|
61
|
+
z[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(x[i]) + GGML_CPU_FP16_TO_FP32(y[i]));
|
|
62
62
|
}
|
|
63
63
|
}
|
|
64
64
|
inline static void ggml_vec_add1_f32(const int n, float * z, const float * x, const float v) { for (int i = 0; i < n; ++i) z[i] = x[i] + v; }
|
|
@@ -67,7 +67,7 @@ inline static void ggml_vec_acc1_f32(const int n, float * y, const float v)
|
|
|
67
67
|
inline static void ggml_vec_sub_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] - y[i]; }
|
|
68
68
|
inline static void ggml_vec_sub_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) {
|
|
69
69
|
for (int i = 0; i < n; ++i) {
|
|
70
|
-
z[i] =
|
|
70
|
+
z[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(x[i]) - GGML_CPU_FP16_TO_FP32(y[i]));
|
|
71
71
|
}
|
|
72
72
|
}
|
|
73
73
|
inline static void ggml_vec_set_f32 (const int n, float * x, const float v) { for (int i = 0; i < n; ++i) x[i] = v; }
|
|
@@ -75,20 +75,20 @@ inline static void ggml_vec_cpy_f32 (const int n, float * y, const float * x)
|
|
|
75
75
|
inline static void ggml_vec_neg_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = -x[i]; }
|
|
76
76
|
inline static void ggml_vec_neg_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
|
|
77
77
|
for (int i = 0; i < n; ++i) {
|
|
78
|
-
y[i] =
|
|
78
|
+
y[i] = GGML_CPU_FP32_TO_FP16(-GGML_CPU_FP16_TO_FP32(x[i]));
|
|
79
79
|
}
|
|
80
80
|
}
|
|
81
81
|
|
|
82
82
|
inline static void ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]*y[i]; }
|
|
83
83
|
inline static void ggml_vec_mul_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) {
|
|
84
84
|
for (int i = 0; i < n; ++i) {
|
|
85
|
-
z[i] =
|
|
85
|
+
z[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(x[i]) * GGML_CPU_FP16_TO_FP32(y[i]));
|
|
86
86
|
}
|
|
87
87
|
}
|
|
88
88
|
inline static void ggml_vec_div_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]/y[i]; }
|
|
89
89
|
inline static void ggml_vec_div_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) {
|
|
90
90
|
for (int i = 0; i < n; ++i) {
|
|
91
|
-
z[i] =
|
|
91
|
+
z[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(x[i]) / GGML_CPU_FP16_TO_FP32(y[i]));
|
|
92
92
|
}
|
|
93
93
|
}
|
|
94
94
|
|
|
@@ -131,13 +131,13 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GG
|
|
|
131
131
|
// leftovers
|
|
132
132
|
for (int i = np; i < n; ++i) {
|
|
133
133
|
for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) {
|
|
134
|
-
sumf[j] += (ggml_float)(
|
|
134
|
+
sumf[j] += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[j][i])*GGML_CPU_FP16_TO_FP32(y[i]));
|
|
135
135
|
}
|
|
136
136
|
}
|
|
137
137
|
#else
|
|
138
138
|
for (int i = 0; i < n; ++i) {
|
|
139
139
|
for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) {
|
|
140
|
-
sumf[j] += (ggml_float)(
|
|
140
|
+
sumf[j] += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[j][i])*GGML_CPU_FP16_TO_FP32(y[i]));
|
|
141
141
|
}
|
|
142
142
|
}
|
|
143
143
|
#endif
|
|
@@ -163,49 +163,49 @@ inline static void ggml_vec_mad_f32(const int n, float * GGML_RESTRICT y, const
|
|
|
163
163
|
|
|
164
164
|
ax1 = GGML_F32_VEC_LOAD(x + i);
|
|
165
165
|
ay1 = GGML_F32_VEC_LOAD(y + i);
|
|
166
|
-
ay1 = GGML_F32_VEC_FMA(ax1, vx
|
|
166
|
+
ay1 = GGML_F32_VEC_FMA(ay1, ax1, vx);
|
|
167
167
|
|
|
168
168
|
GGML_F32_VEC_STORE(y + i, ay1);
|
|
169
169
|
|
|
170
170
|
ax2 = GGML_F32_VEC_LOAD(x + i + 1*ggml_f32_epr);
|
|
171
171
|
ay2 = GGML_F32_VEC_LOAD(y + i + 1*ggml_f32_epr);
|
|
172
|
-
ay2 = GGML_F32_VEC_FMA(ax2, vx
|
|
172
|
+
ay2 = GGML_F32_VEC_FMA(ay2, ax2, vx);
|
|
173
173
|
|
|
174
174
|
GGML_F32_VEC_STORE(y + i + 1*ggml_f32_epr, ay2);
|
|
175
175
|
|
|
176
176
|
ax3 = GGML_F32_VEC_LOAD(x + i + 2*ggml_f32_epr);
|
|
177
177
|
ay3 = GGML_F32_VEC_LOAD(y + i + 2*ggml_f32_epr);
|
|
178
|
-
ay3 = GGML_F32_VEC_FMA(ax3, vx
|
|
178
|
+
ay3 = GGML_F32_VEC_FMA(ay3, ax3, vx);
|
|
179
179
|
|
|
180
180
|
GGML_F32_VEC_STORE(y + i + 2*ggml_f32_epr, ay3);
|
|
181
181
|
|
|
182
182
|
ax4 = GGML_F32_VEC_LOAD(x + i + 3*ggml_f32_epr);
|
|
183
183
|
ay4 = GGML_F32_VEC_LOAD(y + i + 3*ggml_f32_epr);
|
|
184
|
-
ay4 = GGML_F32_VEC_FMA(ax4, vx
|
|
184
|
+
ay4 = GGML_F32_VEC_FMA(ay4, ax4, vx);
|
|
185
185
|
|
|
186
186
|
GGML_F32_VEC_STORE(y + i + 3*ggml_f32_epr, ay4);
|
|
187
187
|
|
|
188
188
|
ax5 = GGML_F32_VEC_LOAD(x + i + 4*ggml_f32_epr);
|
|
189
189
|
ay5 = GGML_F32_VEC_LOAD(y + i + 4*ggml_f32_epr);
|
|
190
|
-
ay5 = GGML_F32_VEC_FMA(ax5, vx
|
|
190
|
+
ay5 = GGML_F32_VEC_FMA(ay5, ax5, vx);
|
|
191
191
|
|
|
192
192
|
GGML_F32_VEC_STORE(y + i + 4*ggml_f32_epr, ay5);
|
|
193
193
|
|
|
194
194
|
ax6 = GGML_F32_VEC_LOAD(x + i + 5*ggml_f32_epr);
|
|
195
195
|
ay6 = GGML_F32_VEC_LOAD(y + i + 5*ggml_f32_epr);
|
|
196
|
-
ay6 = GGML_F32_VEC_FMA(ax6, vx
|
|
196
|
+
ay6 = GGML_F32_VEC_FMA(ay6, ax6, vx);
|
|
197
197
|
|
|
198
198
|
GGML_F32_VEC_STORE(y + i + 5*ggml_f32_epr, ay6);
|
|
199
199
|
|
|
200
200
|
ax7 = GGML_F32_VEC_LOAD(x + i + 6*ggml_f32_epr);
|
|
201
201
|
ay7 = GGML_F32_VEC_LOAD(y + i + 6*ggml_f32_epr);
|
|
202
|
-
ay7 = GGML_F32_VEC_FMA(ax7, vx
|
|
202
|
+
ay7 = GGML_F32_VEC_FMA(ay7, ax7, vx);
|
|
203
203
|
|
|
204
204
|
GGML_F32_VEC_STORE(y + i + 6*ggml_f32_epr, ay7);
|
|
205
205
|
|
|
206
206
|
ax8 = GGML_F32_VEC_LOAD(x + i + 7*ggml_f32_epr);
|
|
207
207
|
ay8 = GGML_F32_VEC_LOAD(y + i + 7*ggml_f32_epr);
|
|
208
|
-
ay8 = GGML_F32_VEC_FMA(ax8, vx
|
|
208
|
+
ay8 = GGML_F32_VEC_FMA(ay8, ax8, vx);
|
|
209
209
|
|
|
210
210
|
GGML_F32_VEC_STORE(y + i + 7*ggml_f32_epr, ay8);
|
|
211
211
|
}
|
|
@@ -215,7 +215,7 @@ inline static void ggml_vec_mad_f32(const int n, float * GGML_RESTRICT y, const
|
|
|
215
215
|
for (int i = np; i < np2; i += ggml_f32_epr) {
|
|
216
216
|
ax1 = GGML_F32_VEC_LOAD(x + i);
|
|
217
217
|
ay1 = GGML_F32_VEC_LOAD(y + i);
|
|
218
|
-
ay1 = GGML_F32_VEC_FMA(ax1, vx
|
|
218
|
+
ay1 = GGML_F32_VEC_FMA(ay1, ax1, vx);
|
|
219
219
|
|
|
220
220
|
GGML_F32_VEC_STORE(y + i, ay1);
|
|
221
221
|
}
|
|
@@ -280,12 +280,12 @@ inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * GGML_RESTRICT y,
|
|
|
280
280
|
|
|
281
281
|
// leftovers
|
|
282
282
|
for (int i = np; i < n; ++i) {
|
|
283
|
-
y[i] =
|
|
283
|
+
y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i]) + GGML_CPU_FP16_TO_FP32(x[i])*v);
|
|
284
284
|
}
|
|
285
285
|
#else
|
|
286
286
|
// scalar
|
|
287
287
|
for (int i = 0; i < n; ++i) {
|
|
288
|
-
y[i] =
|
|
288
|
+
y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i]) + GGML_CPU_FP16_TO_FP32(x[i])*v);
|
|
289
289
|
}
|
|
290
290
|
#endif
|
|
291
291
|
}
|
|
@@ -351,6 +351,45 @@ inline static void ggml_vec_mad_f32_unroll(const int n, const int xs, const int
|
|
|
351
351
|
#endif
|
|
352
352
|
}
|
|
353
353
|
|
|
354
|
+
inline static void ggml_vec_mad1_f32(const int n, float * y, const float * x, const float s, const float b) {
|
|
355
|
+
#if defined(GGML_USE_ACCELERATE)
|
|
356
|
+
vDSP_vsmsa(x, 1, &s, &b, y, 1, n);
|
|
357
|
+
#elif defined(GGML_SIMD)
|
|
358
|
+
#if defined(__ARM_FEATURE_SVE)
|
|
359
|
+
// scalar ; TODO: Write SVE code
|
|
360
|
+
for (int i = 0; i < n; ++i) {
|
|
361
|
+
y[i] = x[i]*s + b;
|
|
362
|
+
}
|
|
363
|
+
#else
|
|
364
|
+
const int np = (n & ~(GGML_F32_STEP - 1));
|
|
365
|
+
|
|
366
|
+
GGML_F32_VEC vs = GGML_F32_VEC_SET1(s);
|
|
367
|
+
GGML_F32_VEC vb = GGML_F32_VEC_SET1(b);
|
|
368
|
+
|
|
369
|
+
GGML_F32_VEC ay[GGML_F32_ARR];
|
|
370
|
+
|
|
371
|
+
for (int i = 0; i < np; i += GGML_F32_STEP) {
|
|
372
|
+
for (int j = 0; j < GGML_F32_ARR; j++) {
|
|
373
|
+
ay[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
|
|
374
|
+
ay[j] = GGML_F32_VEC_FMA(ay[j], vs, vb);
|
|
375
|
+
|
|
376
|
+
GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
|
|
377
|
+
}
|
|
378
|
+
}
|
|
379
|
+
|
|
380
|
+
// leftovers
|
|
381
|
+
for (int i = np; i < n; ++i) {
|
|
382
|
+
y[i] = x[i]*s + b;
|
|
383
|
+
}
|
|
384
|
+
#endif
|
|
385
|
+
#else
|
|
386
|
+
// scalar
|
|
387
|
+
for (int i = 0; i < n; ++i) {
|
|
388
|
+
y[i] = x[i]*s + b;
|
|
389
|
+
}
|
|
390
|
+
#endif
|
|
391
|
+
}
|
|
392
|
+
|
|
354
393
|
//inline static void ggml_vec_scale_f32(const int n, float * y, const float v) { for (int i = 0; i < n; ++i) y[i] *= v; }
|
|
355
394
|
inline static void ggml_vec_scale_f32(const int n, float * y, const float v) {
|
|
356
395
|
#if defined(GGML_USE_ACCELERATE)
|
|
@@ -430,12 +469,12 @@ inline static void ggml_vec_scale_f16(const int n, ggml_fp16_t * y, const float
|
|
|
430
469
|
|
|
431
470
|
// leftovers
|
|
432
471
|
for (int i = np; i < n; ++i) {
|
|
433
|
-
y[i] =
|
|
472
|
+
y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i])*v);
|
|
434
473
|
}
|
|
435
474
|
#else
|
|
436
475
|
// scalar
|
|
437
476
|
for (int i = 0; i < n; ++i) {
|
|
438
|
-
y[i] =
|
|
477
|
+
y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i])*v);
|
|
439
478
|
}
|
|
440
479
|
#endif
|
|
441
480
|
}
|
|
@@ -444,103 +483,103 @@ inline static void ggml_vec_norm_f32 (const int n, float * s, const float * x) {
|
|
|
444
483
|
inline static void ggml_vec_sqr_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]*x[i]; }
|
|
445
484
|
inline static void ggml_vec_sqr_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
|
|
446
485
|
for (int i = 0; i < n; ++i) {
|
|
447
|
-
float v =
|
|
448
|
-
y[i] =
|
|
486
|
+
float v = GGML_CPU_FP16_TO_FP32(x[i]);
|
|
487
|
+
y[i] = GGML_CPU_FP32_TO_FP16(v*v);
|
|
449
488
|
}
|
|
450
489
|
}
|
|
451
490
|
inline static void ggml_vec_sqrt_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sqrtf(x[i]); }
|
|
452
491
|
inline static void ggml_vec_sqrt_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
|
|
453
492
|
for (int i = 0; i < n; ++i) {
|
|
454
|
-
y[i] =
|
|
493
|
+
y[i] = GGML_CPU_FP32_TO_FP16(sqrtf(GGML_CPU_FP16_TO_FP32(x[i])));
|
|
455
494
|
}
|
|
456
495
|
}
|
|
457
496
|
inline static void ggml_vec_log_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = logf(x[i]); }
|
|
458
497
|
inline static void ggml_vec_log_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
|
|
459
498
|
for (int i = 0; i < n; ++i) {
|
|
460
|
-
y[i] =
|
|
499
|
+
y[i] = GGML_CPU_FP32_TO_FP16(logf(GGML_CPU_FP16_TO_FP32(x[i])));
|
|
461
500
|
}
|
|
462
501
|
}
|
|
463
502
|
inline static void ggml_vec_sin_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sinf(x[i]); }
|
|
464
503
|
inline static void ggml_vec_sin_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
|
|
465
504
|
for (int i = 0; i < n; ++i) {
|
|
466
|
-
y[i] =
|
|
505
|
+
y[i] = GGML_CPU_FP32_TO_FP16(sinf(GGML_CPU_FP16_TO_FP32(x[i])));
|
|
467
506
|
}
|
|
468
507
|
}
|
|
469
508
|
inline static void ggml_vec_cos_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = cosf(x[i]); }
|
|
470
509
|
inline static void ggml_vec_cos_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
|
|
471
510
|
for (int i = 0; i < n; ++i) {
|
|
472
|
-
y[i] =
|
|
511
|
+
y[i] = GGML_CPU_FP32_TO_FP16(cosf(GGML_CPU_FP16_TO_FP32(x[i])));
|
|
473
512
|
}
|
|
474
513
|
}
|
|
475
514
|
inline static void ggml_vec_abs_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fabsf(x[i]); }
|
|
476
515
|
inline static void ggml_vec_abs_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
|
|
477
516
|
for (int i = 0; i < n; ++i) {
|
|
478
|
-
y[i] =
|
|
517
|
+
y[i] = GGML_CPU_FP32_TO_FP16(fabsf(GGML_CPU_FP16_TO_FP32(x[i])));
|
|
479
518
|
}
|
|
480
519
|
}
|
|
481
520
|
inline static void ggml_vec_sgn_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : ((x[i] < 0.f) ? -1.f : 0.f); }
|
|
482
521
|
inline static void ggml_vec_sgn_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
|
|
483
522
|
for (int i = 0; i < n; ++i) {
|
|
484
|
-
float v =
|
|
485
|
-
y[i] =
|
|
523
|
+
float v = GGML_CPU_FP16_TO_FP32(x[i]);
|
|
524
|
+
y[i] = GGML_CPU_FP32_TO_FP16((v > 0.f) ? 1.f : ((v < 0.f) ? -1.f : 0.f));
|
|
486
525
|
}
|
|
487
526
|
}
|
|
488
527
|
inline static void ggml_vec_step_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : 0.f; }
|
|
489
528
|
inline static void ggml_vec_step_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
|
|
490
529
|
for (int i = 0; i < n; ++i) {
|
|
491
|
-
y[i] =
|
|
530
|
+
y[i] = GGML_CPU_FP32_TO_FP16((GGML_CPU_FP16_TO_FP32(x[i]) > 0.f) ? 1.f : 0.f);
|
|
492
531
|
}
|
|
493
532
|
}
|
|
494
533
|
inline static void ggml_vec_tanh_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = tanhf(x[i]); }
|
|
495
534
|
inline static void ggml_vec_tanh_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
|
|
496
535
|
for (int i = 0; i < n; ++i) {
|
|
497
|
-
y[i] =
|
|
536
|
+
y[i] = GGML_CPU_FP32_TO_FP16(tanhf(GGML_CPU_FP16_TO_FP32(x[i])));
|
|
498
537
|
}
|
|
499
538
|
}
|
|
500
539
|
inline static void ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expm1f(x[i]); }
|
|
501
540
|
inline static void ggml_vec_elu_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
|
|
502
541
|
for (int i = 0; i < n; ++i) {
|
|
503
|
-
y[i] =
|
|
542
|
+
y[i] = GGML_CPU_FP32_TO_FP16(expm1f(GGML_CPU_FP16_TO_FP32(x[i])));
|
|
504
543
|
}
|
|
505
544
|
}
|
|
506
545
|
inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }
|
|
507
546
|
inline static void ggml_vec_relu_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
|
|
508
547
|
for (int i = 0; i < n; ++i) {
|
|
509
|
-
float v =
|
|
510
|
-
y[i] =
|
|
548
|
+
float v = GGML_CPU_FP16_TO_FP32(x[i]);
|
|
549
|
+
y[i] = GGML_CPU_FP32_TO_FP16((v > 0.f) ? v : 0.f);
|
|
511
550
|
}
|
|
512
551
|
}
|
|
513
552
|
inline static void ggml_vec_leaky_relu_f32 (const int n, float * y, const float * x, const float ns) { for (int i = 0; i < n; ++i) y[i] = ((x[i] > 0.f) ? x[i] : 0.f) + ns * ((x[i] < 0.0f) ? x[i] : 0.f); }
|
|
514
553
|
inline static void ggml_vec_leaky_relu_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const float ns) {
|
|
515
554
|
for (int i = 0; i < n; ++i) {
|
|
516
|
-
float v =
|
|
517
|
-
y[i] =
|
|
555
|
+
float v = GGML_CPU_FP16_TO_FP32(x[i]);
|
|
556
|
+
y[i] = GGML_CPU_FP32_TO_FP16(((v > 0.f) ? v : 0.f) + ns * ((v < 0.0f) ? v : 0.f));
|
|
518
557
|
}
|
|
519
558
|
}
|
|
520
559
|
inline static void ggml_vec_sigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = 1.f / (1.f + expf(-x[i])); }
|
|
521
560
|
inline static void ggml_vec_sigmoid_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
|
|
522
561
|
for (int i = 0; i < n; ++i) {
|
|
523
|
-
y[i] =
|
|
562
|
+
y[i] = GGML_CPU_FP32_TO_FP16(1.f / (1.f + expf(-GGML_CPU_FP16_TO_FP32(x[i]))));
|
|
524
563
|
}
|
|
525
564
|
}
|
|
526
565
|
// TODO: optimize performance
|
|
527
566
|
inline static void ggml_vec_hardswish_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i] * fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); }
|
|
528
567
|
inline static void ggml_vec_hardswish_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
|
|
529
568
|
for (int i = 0; i < n; ++i) {
|
|
530
|
-
float v =
|
|
531
|
-
y[i] =
|
|
569
|
+
float v = GGML_CPU_FP16_TO_FP32(x[i]);
|
|
570
|
+
y[i] = GGML_CPU_FP32_TO_FP16(v * fminf(1.0f, fmaxf(0.0f, (v + 3.0f) / 6.0f)));
|
|
532
571
|
}
|
|
533
572
|
}
|
|
534
573
|
inline static void ggml_vec_hardsigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); }
|
|
535
574
|
inline static void ggml_vec_hardsigmoid_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
|
|
536
575
|
for (int i = 0; i < n; ++i) {
|
|
537
|
-
y[i] =
|
|
576
|
+
y[i] = GGML_CPU_FP32_TO_FP16(fminf(1.0f, fmaxf(0.0f, (GGML_CPU_FP16_TO_FP32(x[i]) + 3.0f) / 6.0f)));
|
|
538
577
|
}
|
|
539
578
|
}
|
|
540
579
|
inline static void ggml_vec_exp_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = expf(x[i]); }
|
|
541
580
|
inline static void ggml_vec_exp_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
|
|
542
581
|
for (int i = 0; i < n; ++i) {
|
|
543
|
-
y[i] =
|
|
582
|
+
y[i] = GGML_CPU_FP32_TO_FP16(expf(GGML_CPU_FP16_TO_FP32(x[i])));
|
|
544
583
|
}
|
|
545
584
|
}
|
|
546
585
|
|
|
@@ -562,9 +601,9 @@ inline static void ggml_vec_gelu_f16(const int n, ggml_fp16_t * y, const ggml_fp
|
|
|
562
601
|
|
|
563
602
|
inline static void ggml_vec_gelu_erf_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
|
|
564
603
|
for (int i = 0; i < n; ++i) {
|
|
565
|
-
float xi =
|
|
604
|
+
float xi = GGML_CPU_FP16_TO_FP32(x[i]);
|
|
566
605
|
float res = 0.5f*xi*(1.0f + erff(xi*SQRT_2_INV));
|
|
567
|
-
y[i] =
|
|
606
|
+
y[i] = GGML_CPU_FP32_TO_FP16(res);
|
|
568
607
|
}
|
|
569
608
|
}
|
|
570
609
|
|
|
@@ -577,9 +616,9 @@ inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) {
|
|
|
577
616
|
} else if (x[i] >= 10.0f) {
|
|
578
617
|
y[i] = x[i];
|
|
579
618
|
} else {
|
|
580
|
-
ggml_fp16_t fp16 =
|
|
619
|
+
ggml_fp16_t fp16 = GGML_CPU_FP32_TO_FP16(x[i]);
|
|
581
620
|
memcpy(&t, &fp16, sizeof(uint16_t));
|
|
582
|
-
y[i] =
|
|
621
|
+
y[i] = GGML_CPU_FP16_TO_FP32(ggml_table_gelu_f16[t]);
|
|
583
622
|
}
|
|
584
623
|
}
|
|
585
624
|
}
|
|
@@ -613,9 +652,9 @@ inline static float ggml_gelu_quick_f32(float x) {
|
|
|
613
652
|
inline static void ggml_vec_gelu_quick_f32(const int n, float * y, const float * x) {
|
|
614
653
|
uint16_t t;
|
|
615
654
|
for (int i = 0; i < n; ++i) {
|
|
616
|
-
ggml_fp16_t fp16 =
|
|
655
|
+
ggml_fp16_t fp16 = GGML_CPU_FP32_TO_FP16(x[i]);
|
|
617
656
|
memcpy(&t, &fp16, sizeof(uint16_t));
|
|
618
|
-
y[i] =
|
|
657
|
+
y[i] = GGML_CPU_FP16_TO_FP32(ggml_table_gelu_quick_f16[t]);
|
|
619
658
|
}
|
|
620
659
|
}
|
|
621
660
|
#else
|
|
@@ -628,8 +667,8 @@ inline static void ggml_vec_gelu_quick_f32(const int n, float * y, const float *
|
|
|
628
667
|
|
|
629
668
|
inline static void ggml_vec_gelu_quick_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
|
|
630
669
|
for (int i = 0; i < n; ++i) {
|
|
631
|
-
float v =
|
|
632
|
-
y[i] =
|
|
670
|
+
float v = GGML_CPU_FP16_TO_FP32(x[i]);
|
|
671
|
+
y[i] = GGML_CPU_FP32_TO_FP16(v*(1.0f/(1.0f+expf(GELU_QUICK_COEF*v))));
|
|
633
672
|
}
|
|
634
673
|
}
|
|
635
674
|
|
|
@@ -638,8 +677,8 @@ inline static float ggml_silu_f32(float x) {
|
|
|
638
677
|
return x/(1.0f + expf(-x));
|
|
639
678
|
}
|
|
640
679
|
inline static ggml_fp16_t ggml_silu_f16(ggml_fp16_t x) {
|
|
641
|
-
float v =
|
|
642
|
-
return
|
|
680
|
+
float v = GGML_CPU_FP16_TO_FP32(x);
|
|
681
|
+
return GGML_CPU_FP32_TO_FP16(v/(1.0f + expf(-v)));
|
|
643
682
|
}
|
|
644
683
|
|
|
645
684
|
#if __FINITE_MATH_ONLY__
|
|
@@ -888,9 +927,9 @@ inline static float ggml_silu_backward_f32(float x, float dy) {
|
|
|
888
927
|
}
|
|
889
928
|
|
|
890
929
|
inline static ggml_fp16_t ggml_silu_backward_f16(ggml_fp16_t x, ggml_fp16_t dy) {
|
|
891
|
-
const float v =
|
|
930
|
+
const float v = GGML_CPU_FP16_TO_FP32(x);
|
|
892
931
|
const float s = 1.0f/(1.0f + expf(-v));
|
|
893
|
-
return
|
|
932
|
+
return GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(dy)*s*(1.0f + v*(1.0f - s)));
|
|
894
933
|
}
|
|
895
934
|
|
|
896
935
|
inline static void ggml_vec_silu_backward_f32(const int n, float * dx, const float * x, const float * dy) {
|
|
@@ -905,6 +944,100 @@ inline static void ggml_vec_silu_backward_f16(const int n, ggml_fp16_t * dx, con
|
|
|
905
944
|
}
|
|
906
945
|
}
|
|
907
946
|
|
|
947
|
+
inline static void ggml_vec_reglu_f32 (const int n, float * y, const float * x, const float * g) {
|
|
948
|
+
for (int i = 0; i < n; ++i) {
|
|
949
|
+
y[i] = (x[i] > 0.f) ? x[i] * g[i] : 0.f;
|
|
950
|
+
}
|
|
951
|
+
}
|
|
952
|
+
|
|
953
|
+
inline static void ggml_vec_reglu_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) {
|
|
954
|
+
for (int i = 0; i < n; ++i) {
|
|
955
|
+
float v = GGML_CPU_FP16_TO_FP32(x[i]);
|
|
956
|
+
y[i] = GGML_CPU_FP32_TO_FP16((v > 0.f) ? v * GGML_CPU_FP16_TO_FP32(g[i]) : 0.f);
|
|
957
|
+
}
|
|
958
|
+
}
|
|
959
|
+
|
|
960
|
+
#ifdef GGML_GELU_FP16
|
|
961
|
+
inline static void ggml_vec_geglu_f32(const int n, float * y, const float * x, const float * g) {
|
|
962
|
+
uint16_t t;
|
|
963
|
+
for (int i = 0; i < n; ++i) {
|
|
964
|
+
if (x[i] <= -10.0f) {
|
|
965
|
+
y[i] = 0.0f;
|
|
966
|
+
} else if (x[i] >= 10.0f) {
|
|
967
|
+
y[i] = x[i] * g[i];
|
|
968
|
+
} else {
|
|
969
|
+
ggml_fp16_t fp16 = GGML_CPU_FP32_TO_FP16(x[i]);
|
|
970
|
+
memcpy(&t, &fp16, sizeof(uint16_t));
|
|
971
|
+
y[i] = GGML_CPU_FP16_TO_FP32(ggml_table_gelu_f16[t]) * g[i];
|
|
972
|
+
}
|
|
973
|
+
}
|
|
974
|
+
}
|
|
975
|
+
#else
|
|
976
|
+
inline static void ggml_vec_geglu_f32(const int n, float * y, const float * x, const float * g) {
|
|
977
|
+
for (int i = 0; i < n; ++i) {
|
|
978
|
+
y[i] = ggml_gelu_f32(x[i]) * g[i];
|
|
979
|
+
}
|
|
980
|
+
}
|
|
981
|
+
#endif
|
|
982
|
+
|
|
983
|
+
inline static void ggml_vec_geglu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) {
|
|
984
|
+
const uint16_t * i16 = (const uint16_t *) x;
|
|
985
|
+
for (int i = 0; i < n; ++i) {
|
|
986
|
+
float v = GGML_CPU_FP16_TO_FP32(g[i]);
|
|
987
|
+
y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(ggml_table_gelu_f16[i16[i]]) * v);
|
|
988
|
+
}
|
|
989
|
+
}
|
|
990
|
+
|
|
991
|
+
void ggml_vec_swiglu_f32(const int n, float * y, const float * x, const float * g);
|
|
992
|
+
|
|
993
|
+
inline static void ggml_vec_swiglu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) {
|
|
994
|
+
for (int i = 0; i < n; ++i) {
|
|
995
|
+
float v = GGML_CPU_FP16_TO_FP32(x[i]);
|
|
996
|
+
float w = GGML_CPU_FP16_TO_FP32(g[i]);
|
|
997
|
+
y[i] = GGML_CPU_FP32_TO_FP16((v/(1.0f + expf(-v))) * w);
|
|
998
|
+
}
|
|
999
|
+
}
|
|
1000
|
+
|
|
1001
|
+
inline static void ggml_vec_geglu_erf_f32(const int n, float * y, const float * x, const float * g) {
|
|
1002
|
+
for (int i = 0; i < n; ++i) {
|
|
1003
|
+
float xi = x[i];
|
|
1004
|
+
y[i] = 0.5f * xi * (1.0f + erff(xi*SQRT_2_INV)) * g[i];
|
|
1005
|
+
}
|
|
1006
|
+
}
|
|
1007
|
+
|
|
1008
|
+
inline static void ggml_vec_geglu_erf_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) {
|
|
1009
|
+
for (int i = 0; i < n; ++i) {
|
|
1010
|
+
float xi = GGML_CPU_FP16_TO_FP32(x[i]);
|
|
1011
|
+
float gi = GGML_CPU_FP16_TO_FP32(g[i]);
|
|
1012
|
+
y[i] = GGML_CPU_FP32_TO_FP16(0.5f * xi * (1.0f + erff(xi*SQRT_2_INV)) * gi);
|
|
1013
|
+
}
|
|
1014
|
+
}
|
|
1015
|
+
|
|
1016
|
+
#ifdef GGML_GELU_QUICK_FP16
|
|
1017
|
+
inline static void ggml_vec_geglu_quick_f32(const int n, float * y, const float * x, const float * g) {
|
|
1018
|
+
uint16_t t;
|
|
1019
|
+
for (int i = 0; i < n; ++i) {
|
|
1020
|
+
ggml_fp16_t fp16 = GGML_CPU_FP32_TO_FP16(x[i]);
|
|
1021
|
+
memcpy(&t, &fp16, sizeof(uint16_t));
|
|
1022
|
+
y[i] = GGML_CPU_FP16_TO_FP32(ggml_table_gelu_quick_f16[t]) * g[i];
|
|
1023
|
+
}
|
|
1024
|
+
}
|
|
1025
|
+
#else
|
|
1026
|
+
inline static void ggml_vec_geglu_quick_f32(const int n, float * y, const float * x, const float * g) {
|
|
1027
|
+
for (int i = 0; i < n; ++i) {
|
|
1028
|
+
y[i] = ggml_gelu_quick_f32(x[i]) * g[i];
|
|
1029
|
+
}
|
|
1030
|
+
}
|
|
1031
|
+
#endif
|
|
1032
|
+
|
|
1033
|
+
inline static void ggml_vec_geglu_quick_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) {
|
|
1034
|
+
const uint16_t * i16 = (const uint16_t *) x;
|
|
1035
|
+
for (int i = 0; i < n; ++i) {
|
|
1036
|
+
float v = GGML_CPU_FP16_TO_FP32(g[i]);
|
|
1037
|
+
y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(ggml_table_gelu_quick_f16[i16[i]]) * v);
|
|
1038
|
+
}
|
|
1039
|
+
}
|
|
1040
|
+
|
|
908
1041
|
inline static void ggml_vec_sum_f32(const int n, float * s, const float * x) {
|
|
909
1042
|
#ifndef GGML_USE_ACCELERATE
|
|
910
1043
|
ggml_float sum = 0.0;
|
|
@@ -928,7 +1061,7 @@ inline static void ggml_vec_sum_f32_ggf(const int n, ggml_float * s, const float
|
|
|
928
1061
|
inline static void ggml_vec_sum_f16_ggf(const int n, float * s, const ggml_fp16_t * x) {
|
|
929
1062
|
float sum = 0.0f;
|
|
930
1063
|
for (int i = 0; i < n; ++i) {
|
|
931
|
-
sum +=
|
|
1064
|
+
sum += GGML_CPU_FP16_TO_FP32(x[i]);
|
|
932
1065
|
}
|
|
933
1066
|
*s = sum;
|
|
934
1067
|
}
|
|
@@ -19,10 +19,10 @@
|
|
|
19
19
|
#endif
|
|
20
20
|
#include "ggml-common.h"
|
|
21
21
|
|
|
22
|
-
#include <cstdio>
|
|
23
22
|
#include <array>
|
|
24
23
|
#include <cassert>
|
|
25
24
|
#include <cfloat>
|
|
25
|
+
#include <cstdio>
|
|
26
26
|
#include <string>
|
|
27
27
|
#include <vector>
|
|
28
28
|
|
|
@@ -76,11 +76,9 @@
|
|
|
76
76
|
#define GGML_CUDA_CC_IS_CDNA(cc) (cc >= GGML_CUDA_CC_CDNA && cc < GGML_CUDA_CC_RDNA1)
|
|
77
77
|
|
|
78
78
|
// Moore Threads
|
|
79
|
-
#define
|
|
80
|
-
|
|
81
|
-
#define
|
|
82
|
-
#define GGML_CUDA_CC_QY2 (GGML_CUDA_CC_OFFSET_MTHREADS + 0x220) // MTT S4000
|
|
83
|
-
#define GGML_CUDA_CC_NG (GGML_CUDA_CC_OFFSET_MTHREADS + 0x310) // TBD
|
|
79
|
+
#define GGML_CUDA_CC_QY1 (GGML_CUDA_CC_OFFSET_MTHREADS + 0x210) // MTT S80, MTT S3000
|
|
80
|
+
#define GGML_CUDA_CC_QY2 (GGML_CUDA_CC_OFFSET_MTHREADS + 0x220) // MTT S4000
|
|
81
|
+
#define GGML_CUDA_CC_NG (GGML_CUDA_CC_OFFSET_MTHREADS + 0x310) // TBD
|
|
84
82
|
|
|
85
83
|
#define GGML_CUDA_CC_IS_MTHREADS(cc) (cc >= GGML_CUDA_CC_OFFSET_MTHREADS && cc < GGML_CUDA_CC_OFFSET_AMD)
|
|
86
84
|
#define GGML_CUDA_CC_IS_QY1(cc) (cc >= GGML_CUDA_CC_QY1 && cc < GGML_CUDA_CC_QY2)
|
|
@@ -177,6 +175,23 @@ static const char * cu_get_error_str(CUresult err) {
|
|
|
177
175
|
#define CU_CHECK(err) CUDA_CHECK_GEN(err, CUDA_SUCCESS, cu_get_error_str)
|
|
178
176
|
#endif
|
|
179
177
|
|
|
178
|
+
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA)
|
|
179
|
+
# define CUDA_SET_SHARED_MEMORY_LIMIT(kernel, nbytes) \
|
|
180
|
+
do { \
|
|
181
|
+
static bool shared_memory_limit_raised[GGML_CUDA_MAX_DEVICES] = { false }; \
|
|
182
|
+
const int id = ggml_cuda_get_device(); \
|
|
183
|
+
if (!shared_memory_limit_raised[id]) { \
|
|
184
|
+
CUDA_CHECK(cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, nbytes)); \
|
|
185
|
+
shared_memory_limit_raised[id] = true; \
|
|
186
|
+
} \
|
|
187
|
+
} while (0)
|
|
188
|
+
#else
|
|
189
|
+
# define CUDA_SET_SHARED_MEMORY_LIMIT(kernel, nbytes) \
|
|
190
|
+
do { \
|
|
191
|
+
GGML_UNUSED(nbytes); \
|
|
192
|
+
} while (0)
|
|
193
|
+
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA)
|
|
194
|
+
|
|
180
195
|
#if CUDART_VERSION >= 11010 || defined(GGML_USE_MUSA)
|
|
181
196
|
#define GGML_CUDA_ASSUME(x) __builtin_assume(x)
|
|
182
197
|
#else
|
|
@@ -203,9 +218,9 @@ typedef float2 dfloat2;
|
|
|
203
218
|
#define FAST_FP16_AVAILABLE
|
|
204
219
|
#endif // defined(FP16_AVAILABLE) && __CUDA_ARCH__ != 610
|
|
205
220
|
|
|
206
|
-
#if !
|
|
221
|
+
#if (!defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA) || defined(GGML_USE_MUSA)
|
|
207
222
|
#define FP16_MMA_AVAILABLE
|
|
208
|
-
#endif // !
|
|
223
|
+
#endif // (!defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA) || defined(GGML_USE_MUSA)
|
|
209
224
|
|
|
210
225
|
#if defined(GGML_HIP_ROCWMMA_FATTN) && (defined(CDNA) || defined(RDNA3) || (defined(GGML_HIP_ROCWMMA_FATTN_GFX12) && defined(RDNA4)))
|
|
211
226
|
#define FP16_MMA_AVAILABLE
|
|
@@ -219,9 +234,9 @@ typedef float2 dfloat2;
|
|
|
219
234
|
#define CP_ASYNC_AVAILABLE
|
|
220
235
|
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
|
|
221
236
|
|
|
222
|
-
#if !defined(GGML_CUDA_NO_FA) && !(defined(GGML_USE_MUSA) &&
|
|
237
|
+
#if !defined(GGML_CUDA_NO_FA) && !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ < 220)
|
|
223
238
|
#define FLASH_ATTN_AVAILABLE
|
|
224
|
-
#endif // !defined(GGML_CUDA_NO_FA) && !(defined(GGML_USE_MUSA) &&
|
|
239
|
+
#endif // !defined(GGML_CUDA_NO_FA) && !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ < 220)
|
|
225
240
|
|
|
226
241
|
static bool fp16_available(const int cc) {
|
|
227
242
|
return ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_PASCAL;
|
|
@@ -233,7 +248,8 @@ static bool fast_fp16_available(const int cc) {
|
|
|
233
248
|
|
|
234
249
|
// To be used for feature selection of external libraries, e.g. cuBLAS.
|
|
235
250
|
static bool fast_fp16_hardware_available(const int cc) {
|
|
236
|
-
return (GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_PASCAL && cc != 610) || GGML_CUDA_CC_IS_AMD(cc)
|
|
251
|
+
return (GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_PASCAL && cc != 610) || GGML_CUDA_CC_IS_AMD(cc) ||
|
|
252
|
+
(GGML_CUDA_CC_IS_MTHREADS(cc) && cc >= GGML_CUDA_CC_QY2);
|
|
237
253
|
}
|
|
238
254
|
|
|
239
255
|
// Any FP16 tensor core instructions are available for ggml code.
|
|
@@ -241,15 +257,35 @@ static bool fp16_mma_available(const int cc) {
|
|
|
241
257
|
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && !defined(GGML_HIP_ROCWMMA_FATTN)
|
|
242
258
|
return false;
|
|
243
259
|
#else
|
|
244
|
-
|
|
245
|
-
GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA3(cc) ||
|
|
260
|
+
if ((GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA) ||
|
|
261
|
+
GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA3(cc) ||
|
|
262
|
+
GGML_CUDA_CC_IS_MTHREADS(cc)) {
|
|
263
|
+
return true;
|
|
264
|
+
} else if (GGML_CUDA_CC_IS_RDNA4(cc)) {
|
|
265
|
+
#if defined(GGML_HIP_ROCWMMA_FATTN) && defined(GGML_HIP_ROCWMMA_FATTN_GFX12)
|
|
266
|
+
return true;
|
|
267
|
+
#else
|
|
268
|
+
return false;
|
|
269
|
+
#endif // defined(GGML_HIP_ROCWMMA_FATTN) && defined(GGML_HIP_ROCWMMA_FATTN_GFX12)
|
|
270
|
+
} else {
|
|
271
|
+
return false;
|
|
272
|
+
}
|
|
246
273
|
#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && !defined(GGML_HIP_ROCWMMA_FATTN)
|
|
247
274
|
}
|
|
248
275
|
|
|
249
276
|
// To be used for feature selection of external libraries, e.g. cuBLAS.
|
|
250
277
|
static bool fp16_mma_hardware_available(const int cc) {
|
|
251
278
|
return (GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_VOLTA) ||
|
|
252
|
-
GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA3(cc) || GGML_CUDA_CC_IS_RDNA4(cc)
|
|
279
|
+
GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA3(cc) || GGML_CUDA_CC_IS_RDNA4(cc) ||
|
|
280
|
+
(GGML_CUDA_CC_IS_MTHREADS(cc) && cc >= GGML_CUDA_CC_QY2);
|
|
281
|
+
}
|
|
282
|
+
|
|
283
|
+
static bool bf16_mma_hardware_available(const int cc) {
|
|
284
|
+
return (GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_AMPERE) || GGML_CUDA_CC_IS_CDNA(cc) || cc >= GGML_CUDA_CC_RDNA3;
|
|
285
|
+
}
|
|
286
|
+
|
|
287
|
+
static bool fp32_mma_hardware_available(const int cc) {
|
|
288
|
+
return GGML_CUDA_CC_IS_CDNA(cc);
|
|
253
289
|
}
|
|
254
290
|
|
|
255
291
|
// Volta technically had FP16 tensor cores but they work very differently compared to Turing and later.
|
|
@@ -362,6 +398,26 @@ static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
|
|
|
362
398
|
#endif // FP16_AVAILABLE
|
|
363
399
|
}
|
|
364
400
|
|
|
401
|
+
// Row reduction kernel template - compute sum (norm=false) or mean (norm=true)
|
|
402
|
+
template<bool norm>
|
|
403
|
+
static __global__ void reduce_rows_f32(const float * x, float * dst, const int ncols) {
|
|
404
|
+
const int row = blockIdx.x;
|
|
405
|
+
const int col = threadIdx.x;
|
|
406
|
+
|
|
407
|
+
float sum = 0.0f;
|
|
408
|
+
for (int i = col; i < ncols; i += blockDim.x) {
|
|
409
|
+
sum += x[row * ncols + i];
|
|
410
|
+
}
|
|
411
|
+
|
|
412
|
+
sum = warp_reduce_sum(sum);
|
|
413
|
+
|
|
414
|
+
if (col != 0) {
|
|
415
|
+
return;
|
|
416
|
+
}
|
|
417
|
+
|
|
418
|
+
dst[row] = norm ? sum / ncols : sum;
|
|
419
|
+
}
|
|
420
|
+
|
|
365
421
|
template<int width = WARP_SIZE>
|
|
366
422
|
static __device__ __forceinline__ float warp_reduce_max(float x) {
|
|
367
423
|
#pragma unroll
|
|
@@ -767,21 +823,7 @@ struct ggml_backend_cuda_context {
|
|
|
767
823
|
name(GGML_CUDA_NAME + std::to_string(device)) {
|
|
768
824
|
}
|
|
769
825
|
|
|
770
|
-
~ggml_backend_cuda_context()
|
|
771
|
-
if (copy_event != nullptr) {
|
|
772
|
-
CUDA_CHECK(cudaEventDestroy(copy_event));
|
|
773
|
-
}
|
|
774
|
-
for (int i = 0; i < GGML_CUDA_MAX_DEVICES; ++i) {
|
|
775
|
-
for (int j = 0; j < GGML_CUDA_MAX_STREAMS; ++j) {
|
|
776
|
-
if (streams[i][j] != nullptr) {
|
|
777
|
-
CUDA_CHECK(cudaStreamDestroy(streams[i][j]));
|
|
778
|
-
}
|
|
779
|
-
}
|
|
780
|
-
if (cublas_handles[i] != nullptr) {
|
|
781
|
-
CUBLAS_CHECK(cublasDestroy(cublas_handles[i]));
|
|
782
|
-
}
|
|
783
|
-
}
|
|
784
|
-
}
|
|
826
|
+
~ggml_backend_cuda_context();
|
|
785
827
|
|
|
786
828
|
cudaStream_t stream(int device, int stream) {
|
|
787
829
|
if (streams[device][stream] == nullptr) {
|