@novastera-oss/llamarn 0.2.6 → 0.2.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/android/src/main/cpp/include/llama.h +141 -38
- package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
- package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
- package/cpp/LlamaCppModel.cpp +58 -24
- package/cpp/LlamaCppModel.h +3 -3
- package/cpp/PureCppImpl.cpp +1 -1
- package/cpp/PureCppImpl.h +2 -2
- package/cpp/build-info.cpp +2 -2
- package/cpp/llama.cpp/CMakeLists.txt +15 -4
- package/cpp/llama.cpp/Makefile +2 -2
- package/cpp/llama.cpp/README.md +32 -13
- package/cpp/llama.cpp/common/CMakeLists.txt +10 -20
- package/cpp/llama.cpp/common/arg.cpp +37 -6
- package/cpp/llama.cpp/common/build-info.cpp.in +2 -2
- package/cpp/llama.cpp/common/chat-parser.cpp +5 -0
- package/cpp/llama.cpp/common/chat-parser.h +2 -0
- package/cpp/llama.cpp/common/chat.cpp +12 -9
- package/cpp/llama.cpp/common/chat.h +1 -1
- package/cpp/llama.cpp/common/common.cpp +53 -40
- package/cpp/llama.cpp/common/common.h +6 -2
- package/cpp/llama.cpp/common/json-schema-to-grammar.cpp +3 -46
- package/cpp/llama.cpp/common/speculative.cpp +6 -4
- package/cpp/llama.cpp/convert_hf_to_gguf.py +215 -76
- package/cpp/llama.cpp/ggml/CMakeLists.txt +48 -2
- package/cpp/llama.cpp/ggml/cmake/common.cmake +1 -2
- package/cpp/llama.cpp/ggml/include/ggml-cpu.h +2 -0
- package/cpp/llama.cpp/ggml/include/ggml.h +33 -0
- package/cpp/llama.cpp/ggml/src/CMakeLists.txt +64 -13
- package/cpp/llama.cpp/ggml/src/ggml-backend-reg.cpp +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cann/common.h +6 -1
- package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +33 -9
- package/cpp/llama.cpp/ggml/src/ggml-common.h +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +124 -26
- package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +11 -10
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +4114 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +2163 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +2639 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +2732 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +2069 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +397 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +1300 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +1481 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +4311 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-aarch64.cpp → arch/x86/repack.cpp} +79 -3225
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +184 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/common.h +4 -3
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +16 -7
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +93 -104
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +12 -8
- package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-hbm.cpp → hbm.cpp} +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +58 -8
- package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.h +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +194 -69
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.h +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/quants.c +1158 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-quants.h → quants.h} +26 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.cpp +1571 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.h +98 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +213 -37
- package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-traits.cpp → traits.cpp} +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.h +45 -45
- package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +59 -37
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-dw.cu +161 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-dw.cuh +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-transpose.cu +91 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-transpose.cuh +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +4 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-wmma-f16.cu +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +90 -39
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mean.cu +19 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mean.cuh +3 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmv.cu +257 -87
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmv.cuh +2 -3
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ssm-scan.cu +6 -4
- package/cpp/llama.cpp/ggml/src/ggml-cuda/sumrows.cu +5 -18
- package/cpp/llama.cpp/ggml/src/ggml-cuda/sumrows.cuh +0 -1
- package/cpp/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-impl.h +61 -183
- package/cpp/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +11 -10
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +16 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +260 -49
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +497 -282
- package/cpp/llama.cpp/ggml/src/ggml-musa/mudnn.cuh +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1078 -468
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/concat.cl +109 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl +283 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/pad.cl +30 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/repeat.cl +39 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/tanh.cl +63 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/tsembd.cl +48 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/upscale.cl +121 -0
- package/cpp/llama.cpp/ggml/src/ggml-quants.c +0 -2
- package/cpp/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +18 -15
- package/cpp/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +5 -6
- package/cpp/llama.cpp/ggml/src/ggml-sycl/common.hpp +20 -48
- package/cpp/llama.cpp/ggml/src/ggml-sycl/concat.cpp +28 -41
- package/cpp/llama.cpp/ggml/src/ggml-sycl/conv.cpp +4 -10
- package/cpp/llama.cpp/ggml/src/ggml-sycl/convert.cpp +117 -165
- package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +192 -53
- package/cpp/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +32 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +49 -67
- package/cpp/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +31 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +99 -159
- package/cpp/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +3 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +8 -105
- package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +209 -92
- package/cpp/llama.cpp/ggml/src/ggml-sycl/gla.cpp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +60 -80
- package/cpp/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +158 -203
- package/cpp/llama.cpp/ggml/src/ggml-sycl/norm.cpp +55 -74
- package/cpp/llama.cpp/ggml/src/ggml-sycl/quants.hpp +38 -10
- package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +24 -20
- package/cpp/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +3 -3
- package/cpp/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +3 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +3 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +3 -8
- package/cpp/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +108 -16
- package/cpp/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +12 -16
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +36 -28
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +487 -247
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -12
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +98 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +2 -0
- package/cpp/llama.cpp/ggml/src/ggml.c +69 -19
- package/cpp/llama.cpp/ggml/src/gguf.cpp +5 -1
- package/cpp/llama.cpp/gguf-py/gguf/constants.py +133 -0
- package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +25 -1
- package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +78 -3
- package/cpp/llama.cpp/gguf-py/gguf/vocab.py +97 -4
- package/cpp/llama.cpp/gguf-py/pyproject.toml +2 -2
- package/cpp/llama.cpp/include/llama.h +141 -38
- package/cpp/llama.cpp/models/templates/Mistral-Small-3.2-24B-Instruct-2506.jinja +124 -0
- package/cpp/llama.cpp/requirements/requirements-compare-llama-bench.txt +1 -0
- package/cpp/llama.cpp/src/CMakeLists.txt +2 -2
- package/cpp/llama.cpp/src/llama-arch.cpp +150 -3
- package/cpp/llama.cpp/src/llama-arch.h +25 -1
- package/cpp/llama.cpp/src/llama-batch.cpp +736 -274
- package/cpp/llama.cpp/src/llama-batch.h +110 -57
- package/cpp/llama.cpp/src/llama-chat.cpp +30 -8
- package/cpp/llama.cpp/src/llama-chat.h +1 -0
- package/cpp/llama.cpp/src/llama-context.cpp +360 -266
- package/cpp/llama.cpp/src/llama-context.h +27 -23
- package/cpp/llama.cpp/src/llama-cparams.cpp +1 -1
- package/cpp/llama.cpp/src/llama-cparams.h +1 -1
- package/cpp/llama.cpp/src/llama-graph.cpp +411 -344
- package/cpp/llama.cpp/src/llama-graph.h +126 -58
- package/cpp/llama.cpp/src/llama-hparams.cpp +10 -2
- package/cpp/llama.cpp/src/llama-hparams.h +16 -2
- package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +103 -73
- package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.h +34 -42
- package/cpp/llama.cpp/src/llama-kv-cache-unified.cpp +345 -221
- package/cpp/llama.cpp/src/llama-kv-cache-unified.h +75 -50
- package/cpp/llama.cpp/src/llama-kv-cells.h +51 -22
- package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +246 -0
- package/cpp/llama.cpp/src/llama-memory-hybrid.h +138 -0
- package/cpp/llama.cpp/src/{llama-kv-cache-recurrent.cpp → llama-memory-recurrent.cpp} +302 -317
- package/cpp/llama.cpp/src/{llama-kv-cache-recurrent.h → llama-memory-recurrent.h} +60 -68
- package/cpp/llama.cpp/src/llama-memory.cpp +41 -0
- package/cpp/llama.cpp/src/llama-memory.h +73 -36
- package/cpp/llama.cpp/src/llama-mmap.cpp +1 -1
- package/cpp/llama.cpp/src/llama-model-loader.cpp +42 -17
- package/cpp/llama.cpp/src/llama-model-saver.cpp +1 -0
- package/cpp/llama.cpp/src/llama-model.cpp +1630 -511
- package/cpp/llama.cpp/src/llama-model.h +26 -0
- package/cpp/llama.cpp/src/llama-quant.cpp +89 -6
- package/cpp/llama.cpp/src/llama-vocab.cpp +58 -26
- package/cpp/llama.cpp/src/llama-vocab.h +1 -0
- package/cpp/llama.cpp/src/llama.cpp +11 -7
- package/cpp/llama.cpp/src/unicode.cpp +5 -0
- package/cpp/rn-completion.cpp +2 -2
- package/cpp/{rn-llama.hpp → rn-llama.h} +1 -1
- package/cpp/{rn-utils.hpp → rn-utils.h} +3 -0
- package/ios/include/chat.h +1 -1
- package/ios/include/common.h +6 -2
- package/ios/include/llama.h +141 -38
- package/ios/libs/llama.xcframework/Info.plist +15 -15
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4890 -4689
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +141 -38
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4710
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3764 -3622
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +141 -38
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4710
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3766 -3624
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +141 -38
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +141 -38
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +141 -38
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4890 -4689
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +141 -38
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4710
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3764 -3622
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +141 -38
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4926 -4725
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +141 -38
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4897 -4746
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3794 -3652
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +141 -38
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/package.json +1 -2
- package/cpp/llama.cpp/common/cmake/build-info-gen-cpp.cmake +0 -24
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +0 -8
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +0 -13891
- package/cpp/llama.cpp/src/llama-kv-cache.cpp +0 -1
- package/cpp/llama.cpp/src/llama-kv-cache.h +0 -44
- /package/cpp/llama.cpp/ggml/src/ggml-cpu/{cpu-feats-x86.cpp → arch/x86/cpu-feats.cpp} +0 -0
- /package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-hbm.h → hbm.h} +0 -0
- /package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-traits.h → traits.h} +0 -0
package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.h

@@ -58,7 +58,7 @@ inline static void ggml_vec_set_bf16(const int n, ggml_bf16_t * x, const ggml_bf
 inline static void ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] + y[i]; }
 inline static void ggml_vec_add_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) {
     for (int i = 0; i < n; ++i) {
-        z[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(x[i]) + GGML_FP16_TO_FP32(y[i]));
+        z[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(x[i]) + GGML_CPU_FP16_TO_FP32(y[i]));
     }
 }
 inline static void ggml_vec_add1_f32(const int n, float * z, const float * x, const float v) { for (int i = 0; i < n; ++i) z[i] = x[i] + v; }
@@ -67,7 +67,7 @@ inline static void ggml_vec_acc1_f32(const int n, float * y, const float v)
 inline static void ggml_vec_sub_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] - y[i]; }
 inline static void ggml_vec_sub_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) {
     for (int i = 0; i < n; ++i) {
-        z[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(x[i]) - GGML_FP16_TO_FP32(y[i]));
+        z[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(x[i]) - GGML_CPU_FP16_TO_FP32(y[i]));
     }
 }
 inline static void ggml_vec_set_f32 (const int n, float * x, const float v) { for (int i = 0; i < n; ++i) x[i] = v; }
@@ -75,20 +75,20 @@ inline static void ggml_vec_cpy_f32 (const int n, float * y, const float * x)
 inline static void ggml_vec_neg_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = -x[i]; }
 inline static void ggml_vec_neg_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
-        y[i] = GGML_FP32_TO_FP16(-GGML_FP16_TO_FP32(x[i]));
+        y[i] = GGML_CPU_FP32_TO_FP16(-GGML_CPU_FP16_TO_FP32(x[i]));
     }
 }
 
 inline static void ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]*y[i]; }
 inline static void ggml_vec_mul_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) {
     for (int i = 0; i < n; ++i) {
-        z[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(x[i]) * GGML_FP16_TO_FP32(y[i]));
+        z[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(x[i]) * GGML_CPU_FP16_TO_FP32(y[i]));
     }
 }
 inline static void ggml_vec_div_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]/y[i]; }
 inline static void ggml_vec_div_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) {
     for (int i = 0; i < n; ++i) {
-        z[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(x[i]) / GGML_FP16_TO_FP32(y[i]));
+        z[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(x[i]) / GGML_CPU_FP16_TO_FP32(y[i]));
     }
 }
 
@@ -131,13 +131,13 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GG
     // leftovers
     for (int i = np; i < n; ++i) {
         for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) {
-            sumf[j] += (ggml_float)(GGML_FP16_TO_FP32(x[j][i])*GGML_FP16_TO_FP32(y[i]));
+            sumf[j] += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[j][i])*GGML_CPU_FP16_TO_FP32(y[i]));
         }
     }
 #else
     for (int i = 0; i < n; ++i) {
         for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) {
-            sumf[j] += (ggml_float)(GGML_FP16_TO_FP32(x[j][i])*GGML_FP16_TO_FP32(y[i]));
+            sumf[j] += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[j][i])*GGML_CPU_FP16_TO_FP32(y[i]));
         }
     }
 #endif
@@ -280,12 +280,12 @@ inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * GGML_RESTRICT y,
 
     // leftovers
     for (int i = np; i < n; ++i) {
-        y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v);
+        y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i]) + GGML_CPU_FP16_TO_FP32(x[i])*v);
     }
 #else
     // scalar
     for (int i = 0; i < n; ++i) {
-        y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v);
+        y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i]) + GGML_CPU_FP16_TO_FP32(x[i])*v);
     }
 #endif
 }
@@ -430,12 +430,12 @@ inline static void ggml_vec_scale_f16(const int n, ggml_fp16_t * y, const float
 
     // leftovers
     for (int i = np; i < n; ++i) {
-        y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i])*v);
+        y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i])*v);
     }
 #else
     // scalar
     for (int i = 0; i < n; ++i) {
-        y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i])*v);
+        y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i])*v);
     }
 #endif
 }
@@ -444,103 +444,103 @@ inline static void ggml_vec_norm_f32 (const int n, float * s, const float * x) {
 inline static void ggml_vec_sqr_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]*x[i]; }
 inline static void ggml_vec_sqr_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
-        float v = GGML_FP16_TO_FP32(x[i]);
-        y[i] = GGML_FP32_TO_FP16(v*v);
+        float v = GGML_CPU_FP16_TO_FP32(x[i]);
+        y[i] = GGML_CPU_FP32_TO_FP16(v*v);
     }
 }
 inline static void ggml_vec_sqrt_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sqrtf(x[i]); }
 inline static void ggml_vec_sqrt_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
-        y[i] = GGML_FP32_TO_FP16(sqrtf(GGML_FP16_TO_FP32(x[i])));
+        y[i] = GGML_CPU_FP32_TO_FP16(sqrtf(GGML_CPU_FP16_TO_FP32(x[i])));
     }
 }
 inline static void ggml_vec_log_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = logf(x[i]); }
 inline static void ggml_vec_log_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
-        y[i] = GGML_FP32_TO_FP16(logf(GGML_FP16_TO_FP32(x[i])));
+        y[i] = GGML_CPU_FP32_TO_FP16(logf(GGML_CPU_FP16_TO_FP32(x[i])));
     }
 }
 inline static void ggml_vec_sin_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sinf(x[i]); }
 inline static void ggml_vec_sin_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
-        y[i] = GGML_FP32_TO_FP16(sinf(GGML_FP16_TO_FP32(x[i])));
+        y[i] = GGML_CPU_FP32_TO_FP16(sinf(GGML_CPU_FP16_TO_FP32(x[i])));
     }
 }
 inline static void ggml_vec_cos_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = cosf(x[i]); }
 inline static void ggml_vec_cos_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
-        y[i] = GGML_FP32_TO_FP16(cosf(GGML_FP16_TO_FP32(x[i])));
+        y[i] = GGML_CPU_FP32_TO_FP16(cosf(GGML_CPU_FP16_TO_FP32(x[i])));
     }
 }
 inline static void ggml_vec_abs_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fabsf(x[i]); }
 inline static void ggml_vec_abs_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
-        y[i] = GGML_FP32_TO_FP16(fabsf(GGML_FP16_TO_FP32(x[i])));
+        y[i] = GGML_CPU_FP32_TO_FP16(fabsf(GGML_CPU_FP16_TO_FP32(x[i])));
     }
 }
 inline static void ggml_vec_sgn_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : ((x[i] < 0.f) ? -1.f : 0.f); }
 inline static void ggml_vec_sgn_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
-        float v = GGML_FP16_TO_FP32(x[i]);
-        y[i] = GGML_FP32_TO_FP16((v > 0.f) ? 1.f : ((v < 0.f) ? -1.f : 0.f));
+        float v = GGML_CPU_FP16_TO_FP32(x[i]);
+        y[i] = GGML_CPU_FP32_TO_FP16((v > 0.f) ? 1.f : ((v < 0.f) ? -1.f : 0.f));
     }
 }
 inline static void ggml_vec_step_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : 0.f; }
 inline static void ggml_vec_step_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
-        y[i] = GGML_FP32_TO_FP16((GGML_FP16_TO_FP32(x[i]) > 0.f) ? 1.f : 0.f);
+        y[i] = GGML_CPU_FP32_TO_FP16((GGML_CPU_FP16_TO_FP32(x[i]) > 0.f) ? 1.f : 0.f);
     }
 }
 inline static void ggml_vec_tanh_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = tanhf(x[i]); }
 inline static void ggml_vec_tanh_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
-        y[i] = GGML_FP32_TO_FP16(tanhf(GGML_FP16_TO_FP32(x[i])));
+        y[i] = GGML_CPU_FP32_TO_FP16(tanhf(GGML_CPU_FP16_TO_FP32(x[i])));
     }
 }
 inline static void ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expm1f(x[i]); }
 inline static void ggml_vec_elu_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
-        y[i] = GGML_FP32_TO_FP16(expm1f(GGML_FP16_TO_FP32(x[i])));
+        y[i] = GGML_CPU_FP32_TO_FP16(expm1f(GGML_CPU_FP16_TO_FP32(x[i])));
     }
 }
 inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }
 inline static void ggml_vec_relu_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
-        float v = GGML_FP16_TO_FP32(x[i]);
-        y[i] = GGML_FP32_TO_FP16((v > 0.f) ? v : 0.f);
+        float v = GGML_CPU_FP16_TO_FP32(x[i]);
+        y[i] = GGML_CPU_FP32_TO_FP16((v > 0.f) ? v : 0.f);
     }
 }
 inline static void ggml_vec_leaky_relu_f32 (const int n, float * y, const float * x, const float ns) { for (int i = 0; i < n; ++i) y[i] = ((x[i] > 0.f) ? x[i] : 0.f) + ns * ((x[i] < 0.0f) ? x[i] : 0.f); }
 inline static void ggml_vec_leaky_relu_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const float ns) {
     for (int i = 0; i < n; ++i) {
-        float v = GGML_FP16_TO_FP32(x[i]);
-        y[i] = GGML_FP32_TO_FP16(((v > 0.f) ? v : 0.f) + ns * ((v < 0.0f) ? v : 0.f));
+        float v = GGML_CPU_FP16_TO_FP32(x[i]);
+        y[i] = GGML_CPU_FP32_TO_FP16(((v > 0.f) ? v : 0.f) + ns * ((v < 0.0f) ? v : 0.f));
     }
 }
 inline static void ggml_vec_sigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = 1.f / (1.f + expf(-x[i])); }
 inline static void ggml_vec_sigmoid_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
-        y[i] = GGML_FP32_TO_FP16(1.f / (1.f + expf(-GGML_FP16_TO_FP32(x[i]))));
+        y[i] = GGML_CPU_FP32_TO_FP16(1.f / (1.f + expf(-GGML_CPU_FP16_TO_FP32(x[i]))));
     }
 }
 // TODO: optimize performance
 inline static void ggml_vec_hardswish_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i] * fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); }
 inline static void ggml_vec_hardswish_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
-        float v = GGML_FP16_TO_FP32(x[i]);
-        y[i] = GGML_FP32_TO_FP16(v * fminf(1.0f, fmaxf(0.0f, (v + 3.0f) / 6.0f)));
+        float v = GGML_CPU_FP16_TO_FP32(x[i]);
+        y[i] = GGML_CPU_FP32_TO_FP16(v * fminf(1.0f, fmaxf(0.0f, (v + 3.0f) / 6.0f)));
     }
 }
 inline static void ggml_vec_hardsigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); }
 inline static void ggml_vec_hardsigmoid_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
-        y[i] = GGML_FP32_TO_FP16(fminf(1.0f, fmaxf(0.0f, (GGML_FP16_TO_FP32(x[i]) + 3.0f) / 6.0f)));
+        y[i] = GGML_CPU_FP32_TO_FP16(fminf(1.0f, fmaxf(0.0f, (GGML_CPU_FP16_TO_FP32(x[i]) + 3.0f) / 6.0f)));
     }
 }
 inline static void ggml_vec_exp_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = expf(x[i]); }
 inline static void ggml_vec_exp_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
-        y[i] = GGML_FP32_TO_FP16(expf(GGML_FP16_TO_FP32(x[i])));
+        y[i] = GGML_CPU_FP32_TO_FP16(expf(GGML_CPU_FP16_TO_FP32(x[i])));
     }
 }
 
@@ -562,9 +562,9 @@ inline static void ggml_vec_gelu_f16(const int n, ggml_fp16_t * y, const ggml_fp
 
 inline static void ggml_vec_gelu_erf_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
-        float xi = GGML_FP16_TO_FP32(x[i]);
+        float xi = GGML_CPU_FP16_TO_FP32(x[i]);
         float res = 0.5f*xi*(1.0f + erff(xi*SQRT_2_INV));
-        y[i] = GGML_FP32_TO_FP16(res);
+        y[i] = GGML_CPU_FP32_TO_FP16(res);
     }
 }
 
@@ -577,9 +577,9 @@ inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) {
         } else if (x[i] >= 10.0f) {
             y[i] = x[i];
         } else {
-            ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]);
+            ggml_fp16_t fp16 = GGML_CPU_FP32_TO_FP16(x[i]);
             memcpy(&t, &fp16, sizeof(uint16_t));
-            y[i] = GGML_FP16_TO_FP32(ggml_table_gelu_f16[t]);
+            y[i] = GGML_CPU_FP16_TO_FP32(ggml_table_gelu_f16[t]);
         }
     }
 }
@@ -613,9 +613,9 @@ inline static float ggml_gelu_quick_f32(float x) {
 inline static void ggml_vec_gelu_quick_f32(const int n, float * y, const float * x) {
     uint16_t t;
     for (int i = 0; i < n; ++i) {
-        ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]);
+        ggml_fp16_t fp16 = GGML_CPU_FP32_TO_FP16(x[i]);
         memcpy(&t, &fp16, sizeof(uint16_t));
-        y[i] = GGML_FP16_TO_FP32(ggml_table_gelu_quick_f16[t]);
+        y[i] = GGML_CPU_FP16_TO_FP32(ggml_table_gelu_quick_f16[t]);
     }
 }
 #else
@@ -628,8 +628,8 @@ inline static void ggml_vec_gelu_quick_f32(const int n, float * y, const float *
 
 inline static void ggml_vec_gelu_quick_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
-        float v = GGML_FP16_TO_FP32(x[i]);
-        y[i] = GGML_FP32_TO_FP16(v*(1.0f/(1.0f+expf(GELU_QUICK_COEF*v))));
+        float v = GGML_CPU_FP16_TO_FP32(x[i]);
+        y[i] = GGML_CPU_FP32_TO_FP16(v*(1.0f/(1.0f+expf(GELU_QUICK_COEF*v))));
     }
 }
 
@@ -638,8 +638,8 @@ inline static float ggml_silu_f32(float x) {
     return x/(1.0f + expf(-x));
 }
 inline static ggml_fp16_t ggml_silu_f16(ggml_fp16_t x) {
-    float v = GGML_FP16_TO_FP32(x);
-    return GGML_FP32_TO_FP16(v/(1.0f + expf(-v)));
+    float v = GGML_CPU_FP16_TO_FP32(x);
+    return GGML_CPU_FP32_TO_FP16(v/(1.0f + expf(-v)));
 }
 
 #if __FINITE_MATH_ONLY__
@@ -888,9 +888,9 @@ inline static float ggml_silu_backward_f32(float x, float dy) {
 }
 
 inline static ggml_fp16_t ggml_silu_backward_f16(ggml_fp16_t x, ggml_fp16_t dy) {
-    const float v = GGML_FP16_TO_FP32(x);
+    const float v = GGML_CPU_FP16_TO_FP32(x);
     const float s = 1.0f/(1.0f + expf(-v));
-    return GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(dy)*s*(1.0f + v*(1.0f - s)));
+    return GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(dy)*s*(1.0f + v*(1.0f - s)));
 }
 
 inline static void ggml_vec_silu_backward_f32(const int n, float * dx, const float * x, const float * dy) {
@@ -928,7 +928,7 @@ inline static void ggml_vec_sum_f32_ggf(const int n, ggml_float * s, const float
 inline static void ggml_vec_sum_f16_ggf(const int n, float * s, const ggml_fp16_t * x) {
    float sum = 0.0f;
    for (int i = 0; i < n; ++i) {
-       sum += GGML_FP16_TO_FP32(x[i]);
+       sum += GGML_CPU_FP16_TO_FP32(x[i]);
    }
    *s = sum;
 }
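Every vec.h hunk above follows one pattern: the scalar FP16 fallbacks widen to FP32, compute, then narrow back, now through the CPU-backend-specific GGML_CPU_FP16_TO_FP32 / GGML_CPU_FP32_TO_FP16 macros instead of the old GGML_* ones. A minimal sketch of that pattern, assuming the ggml-cpu headers that define ggml_fp16_t and these macros are on the include path (the helper name vec_unary_op_f16 is hypothetical, for illustration only):

    // Hypothetical helper, not part of the package: applies an FP32 unary op
    // elementwise to an FP16 vector using the widen-compute-narrow pattern.
    template <typename Op>
    static inline void vec_unary_op_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x, Op op) {
        for (int i = 0; i < n; ++i) {
            // widen to FP32, apply the op, narrow the result back to FP16
            y[i] = GGML_CPU_FP32_TO_FP16(op(GGML_CPU_FP16_TO_FP32(x[i])));
        }
    }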
package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh

@@ -19,10 +19,10 @@
 #endif
 #include "ggml-common.h"
 
-#include <cstdio>
 #include <array>
 #include <cassert>
 #include <cfloat>
+#include <cstdio>
 #include <string>
 #include <vector>
 
@@ -76,11 +76,9 @@
 #define GGML_CUDA_CC_IS_CDNA(cc) (cc >= GGML_CUDA_CC_CDNA && cc < GGML_CUDA_CC_RDNA1)
 
 // Moore Threads
-#define GGML_CUDA_MUSA_ARCH_IS_QY1 (__MUSA_ARCH__ <= 210)
-
-#define GGML_CUDA_CC_QY1 (GGML_CUDA_CC_OFFSET_MTHREADS + 0x210) // MTT S80, MTT S3000
-#define GGML_CUDA_CC_QY2 (GGML_CUDA_CC_OFFSET_MTHREADS + 0x220) // MTT S4000
-#define GGML_CUDA_CC_NG (GGML_CUDA_CC_OFFSET_MTHREADS + 0x310) // TBD
+#define GGML_CUDA_CC_QY1 (GGML_CUDA_CC_OFFSET_MTHREADS + 0x210) // MTT S80, MTT S3000
+#define GGML_CUDA_CC_QY2 (GGML_CUDA_CC_OFFSET_MTHREADS + 0x220) // MTT S4000
+#define GGML_CUDA_CC_NG (GGML_CUDA_CC_OFFSET_MTHREADS + 0x310) // TBD
 
 #define GGML_CUDA_CC_IS_MTHREADS(cc) (cc >= GGML_CUDA_CC_OFFSET_MTHREADS && cc < GGML_CUDA_CC_OFFSET_AMD)
 #define GGML_CUDA_CC_IS_QY1(cc) (cc >= GGML_CUDA_CC_QY1 && cc < GGML_CUDA_CC_QY2)
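The GGML_CUDA_CC_IS_* checks above work because each vendor's compute-capability values are packed into a disjoint integer range via per-vendor offsets. A toy sketch of the idea, with offset values assumed rather than taken from this diff (the real GGML_CUDA_CC_OFFSET_* constants are defined elsewhere in common.cuh):

    // Assumed offsets for illustration only - see common.cuh for the real values.
    enum : int { CC_OFFSET_MTHREADS = 0x0100000, CC_OFFSET_AMD = 0x1000000 };

    // One plain int classifies the device vendor with simple range checks,
    // mirroring GGML_CUDA_CC_IS_MTHREADS above.
    static bool is_mthreads(int cc) {
        return cc >= CC_OFFSET_MTHREADS && cc < CC_OFFSET_AMD;
    }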
@@ -203,13 +201,13 @@ typedef float2 dfloat2;
 #define FAST_FP16_AVAILABLE
 #endif // defined(FP16_AVAILABLE) && __CUDA_ARCH__ != 610
 
-#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) && __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
+#if (!defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA) || defined(GGML_USE_MUSA)
 #define FP16_MMA_AVAILABLE
-#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) && __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
+#endif // (!defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA) || defined(GGML_USE_MUSA)
 
-#if defined(GGML_HIP_ROCWMMA_FATTN) && (defined(CDNA) || defined(RDNA3) || defined(RDNA4))
+#if defined(GGML_HIP_ROCWMMA_FATTN) && (defined(CDNA) || defined(RDNA3) || (defined(GGML_HIP_ROCWMMA_FATTN_GFX12) && defined(RDNA4)))
 #define FP16_MMA_AVAILABLE
-#endif // defined(GGML_HIP_ROCWMMA_FATTN) && (defined(CDNA) || defined(RDNA3) || defined(RDNA4))
+#endif // defined(GGML_HIP_ROCWMMA_FATTN) && (defined(CDNA) || defined(RDNA3) || (defined(GGML_HIP_ROCWMMA_FATTN_GFX12) && defined(RDNA4)))
 
 #if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
 #define NEW_MMA_AVAILABLE
@@ -219,9 +217,9 @@ typedef float2 dfloat2;
 #define CP_ASYNC_AVAILABLE
 #endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
 
-#if !defined(GGML_CUDA_NO_FA) && !(defined(GGML_USE_MUSA) && GGML_CUDA_MUSA_ARCH_IS_QY1)
+#if !defined(GGML_CUDA_NO_FA) && !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ < 220)
 #define FLASH_ATTN_AVAILABLE
-#endif // !defined(GGML_CUDA_NO_FA) && !(defined(GGML_USE_MUSA) && GGML_CUDA_MUSA_ARCH_IS_QY1)
+#endif // !defined(GGML_CUDA_NO_FA) && !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ < 220)
 
 static bool fp16_available(const int cc) {
     return ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_PASCAL;
@@ -233,7 +231,8 @@ static bool fast_fp16_available(const int cc) {
 
 // To be used for feature selection of external libraries, e.g. cuBLAS.
 static bool fast_fp16_hardware_available(const int cc) {
-    return (GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_PASCAL && cc != 610) || GGML_CUDA_CC_IS_AMD(cc);
+    return (GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_PASCAL && cc != 610) || GGML_CUDA_CC_IS_AMD(cc) ||
+           (GGML_CUDA_CC_IS_MTHREADS(cc) && cc >= GGML_CUDA_CC_QY2);
 }
 
 // Any FP16 tensor core instructions are available for ggml code.
@@ -241,15 +240,35 @@ static bool fp16_mma_available(const int cc) {
 #if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && !defined(GGML_HIP_ROCWMMA_FATTN)
     return false;
 #else
-    return (GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA) ||
-        GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA3(cc) || GGML_CUDA_CC_IS_RDNA4(cc);
+    if ((GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA) ||
+        GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA3(cc) ||
+        GGML_CUDA_CC_IS_MTHREADS(cc)) {
+        return true;
+    } else if (GGML_CUDA_CC_IS_RDNA4(cc)) {
+#if defined(GGML_HIP_ROCWMMA_FATTN) && defined(GGML_HIP_ROCWMMA_FATTN_GFX12)
+        return true;
+#else
+        return false;
+#endif // defined(GGML_HIP_ROCWMMA_FATTN) && defined(GGML_HIP_ROCWMMA_FATTN_GFX12)
+    } else {
+        return false;
+    }
 #endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && !defined(GGML_HIP_ROCWMMA_FATTN)
 }
 
 // To be used for feature selection of external libraries, e.g. cuBLAS.
 static bool fp16_mma_hardware_available(const int cc) {
     return (GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_VOLTA) ||
-           GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA3(cc) || GGML_CUDA_CC_IS_RDNA4(cc);
+           GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA3(cc) || GGML_CUDA_CC_IS_RDNA4(cc) ||
+           (GGML_CUDA_CC_IS_MTHREADS(cc) && cc >= GGML_CUDA_CC_QY2);
+}
+
+static bool bf16_mma_hardware_available(const int cc) {
+    return (GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_AMPERE) || GGML_CUDA_CC_IS_CDNA(cc) || cc >= GGML_CUDA_CC_RDNA3;
+}
+
+static bool fp32_mma_hardware_available(const int cc) {
+    return GGML_CUDA_CC_IS_CDNA(cc);
 }
 
 // Volta technically had FP16 tensor cores but they work very differently compared to Turing and later.
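Note the split these helpers keep: *_available() answers "can the compiled kernels use the feature on this device", while *_hardware_available() answers "does the silicon have it", which matters for external libraries such as cuBLAS that ship their own kernels. A hedged usage sketch (the function name is hypothetical; it only illustrates how a caller might combine the two):

    // Hypothetical caller: prefer an FP16 tensor-core path only when both the
    // compiled kernels and the hardware support it; fall back otherwise.
    static bool use_fp16_tensor_cores(int cc) {
        return fp16_mma_available(cc) && fp16_mma_hardware_available(cc);
    }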
@@ -262,11 +281,11 @@ static bool cp_async_available(const int cc) {
 }
 
 static constexpr __device__ int ggml_cuda_get_physical_warp_size() {
-#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
-    return __AMDGCN_WAVEFRONT_SIZE;
+#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && (defined(__GFX9__) || defined(__GFX8__))
+    return 64;
 #else
     return 32;
-#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
+#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && (defined(__GFX9__) || defined(__GFX8__))
 }
 
 [[noreturn]]
@@ -362,6 +381,26 @@ static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
 #endif // FP16_AVAILABLE
 }
 
+// Row reduction kernel template - compute sum (norm=false) or mean (norm=true)
+template<bool norm>
+static __global__ void reduce_rows_f32(const float * x, float * dst, const int ncols) {
+    const int row = blockIdx.x;
+    const int col = threadIdx.x;
+
+    float sum = 0.0f;
+    for (int i = col; i < ncols; i += blockDim.x) {
+        sum += x[row * ncols + i];
+    }
+
+    sum = warp_reduce_sum(sum);
+
+    if (col != 0) {
+        return;
+    }
+
+    dst[row] = norm ? sum / ncols : sum;
+}
+
 template<int width = WARP_SIZE>
 static __device__ __forceinline__ float warp_reduce_max(float x) {
 #pragma unroll
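reduce_rows_f32 folds the row-sum and new row-mean kernels into one template (compare the mean.cu and sumrows.cu entries in the file list). A sketch of the host-side launch, assuming the usual ggml-cuda convention of one block of WARP_SIZE threads per row; the wrapper name is illustrative, not taken from this diff:

    // Illustrative wrapper: row-wise sum or mean over an nrows x ncols matrix.
    static void reduce_rows_f32_cuda(const float * x, float * dst, const int ncols,
                                     const int nrows, const bool mean, cudaStream_t stream) {
        const dim3 block_dims(WARP_SIZE, 1, 1); // one warp cooperates per row
        const dim3 block_nums(nrows, 1, 1);     // one block per row
        if (mean) {
            reduce_rows_f32<true><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
        } else {
            reduce_rows_f32<false><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
        }
    }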
@@ -466,9 +505,6 @@ static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, i
 #endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
 }
 
-// TODO: move to ggml-common.h
-static constexpr __device__ int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
-
 typedef void (*dequantize_kernel_t)(const void * vx, const int64_t ib, const int iqs, dfloat2 & v);
 
 static __device__ __forceinline__ float get_alibi_slope(
@@ -770,21 +806,7 @@ struct ggml_backend_cuda_context {
         name(GGML_CUDA_NAME + std::to_string(device)) {
     }
 
-    ~ggml_backend_cuda_context() {
-        if (copy_event != nullptr) {
-            CUDA_CHECK(cudaEventDestroy(copy_event));
-        }
-        for (int i = 0; i < GGML_CUDA_MAX_DEVICES; ++i) {
-            for (int j = 0; j < GGML_CUDA_MAX_STREAMS; ++j) {
-                if (streams[i][j] != nullptr) {
-                    CUDA_CHECK(cudaStreamDestroy(streams[i][j]));
-                }
-            }
-            if (cublas_handles[i] != nullptr) {
-                CUBLAS_CHECK(cublasDestroy(cublas_handles[i]));
-            }
-        }
-    }
+    ~ggml_backend_cuda_context();
 
     cudaStream_t stream(int device, int stream) {
         if (streams[device][stream] == nullptr) {
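The destructor body is not deleted here, only moved out of the header: the declaration stays in common.cuh and the definition lands in a single translation unit (plausibly ggml-cuda.cu, per its +90 -39 entry above), so the CUDA/cuBLAS teardown calls are compiled once instead of being inlined into every includer. A sketch of the out-of-line definition, reproducing the body removed above (the exact placement is an assumption):

    // Assumed to live in ggml-cuda.cu: out-of-line definition of the
    // destructor declared in common.cuh.
    ggml_backend_cuda_context::~ggml_backend_cuda_context() {
        if (copy_event != nullptr) {
            CUDA_CHECK(cudaEventDestroy(copy_event));
        }
        for (int i = 0; i < GGML_CUDA_MAX_DEVICES; ++i) {
            for (int j = 0; j < GGML_CUDA_MAX_STREAMS; ++j) {
                if (streams[i][j] != nullptr) {
                    CUDA_CHECK(cudaStreamDestroy(streams[i][j]));
                }
            }
            if (cublas_handles[i] != nullptr) {
                CUBLAS_CHECK(cublasDestroy(cublas_handles[i]));
            }
        }
    }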
package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-dw.cu

@@ -0,0 +1,161 @@
+#include "conv2d-dw.cuh"
+
+struct conv_params {
+    int in_w, in_h;
+    int out_w, out_h;
+    int kernel_w, kernel_h;
+    int stride_x, stride_y;
+    int padding_x, padding_y;
+    int dilation_x, dilation_y;
+    int channels, batches;
+};
+
+struct kernel_bounds {
+    int y_min, y_max;
+    int x_min, x_max;
+};
+
+__device__ __forceinline__ kernel_bounds calculate_kernel_bounds(int out_x, int out_y, const conv_params & params) {
+    kernel_bounds bounds;
+    bounds.y_min = max(0, (params.padding_y - out_y * params.stride_y + params.dilation_y - 1) / params.dilation_y);
+    bounds.y_max =
+        min(params.kernel_h,
+            (params.in_h + params.padding_y - out_y * params.stride_y + params.dilation_y - 1) / params.dilation_y);
+    bounds.x_min = max(0, (params.padding_x - out_x * params.stride_x + params.dilation_x - 1) / params.dilation_x);
+    bounds.x_max =
+        min(params.kernel_w,
+            (params.in_w + params.padding_x - out_x * params.stride_x + params.dilation_x - 1) / params.dilation_x);
+    return bounds;
+}
+
+__device__ __forceinline__ int calculate_input_coord(int out_coord, int kern_coord, int stride, int dilation, int padding) {
+    return out_coord * stride + kern_coord * dilation - padding;
+}
+
+struct whcn_layout {
+    __device__ static int input_index(int n, int c, int y, int x, const conv_params & params) {
+        return n * (params.channels * params.in_w * params.in_h) + c * params.in_w * params.in_h + y * params.in_w + x;
+    }
+
+    __device__ static int kernel_index(int c, int ky, int kx, const conv_params & params) {
+        return c * params.kernel_h * params.kernel_w + ky * params.kernel_w + kx;
+    }
+
+    __device__ static int output_index(int n, int c, int y, int x, const conv_params & params) {
+        return n * (params.channels * params.out_w * params.out_h) + c * params.out_w * params.out_h +
+               y * params.out_w + x;
+    }
+
+    __device__ static void unpack_indices(int global_idx, const conv_params & params, int & n, int & c, int & out_y,
+                                          int & out_x) {
+        out_x = global_idx % params.out_w;
+        out_y = (global_idx / params.out_w) % params.out_h;
+        c     = (global_idx / (params.out_w * params.out_h)) % params.channels;
+        n     = global_idx / (params.out_w * params.out_h * params.channels);
+    }
+};
+
+struct cwhn_layout {
+    __device__ static int input_index(int n, int c, int y, int x, const conv_params & params) {
+        return n * (params.channels * params.in_w * params.in_h) + (y * params.in_w + x) * params.channels + c;
+    }
+
+    __device__ static int kernel_index(int c, int ky, int kx, const conv_params & params) {
+        return (ky * params.kernel_w + kx) * params.channels + c;
+    }
+
+    __device__ static int output_index(int n, int c, int y, int x, const conv_params & params) {
+        return n * (params.channels * params.out_w * params.out_h) + y * (params.out_w * params.channels) +
+               x * params.channels + c;
+    }
+
+    __device__ static void unpack_indices(int global_idx, const conv_params & params, int & n, int & c, int & out_y,
+                                          int & out_x) {
+        c     = global_idx % params.channels;
+        out_x = (global_idx / params.channels) % params.out_w;
+        out_y = (global_idx / (params.channels * params.out_w)) % params.out_h;
+        n     = global_idx / (params.channels * params.out_w * params.out_h);
+    }
+};
+
+template <typename T, typename Layout>
+__global__ void conv2d_dw_kernel(const T * __restrict__ input, const T * __restrict__ kernel, T * __restrict__ output,
+                                 const int in_w, const int in_h, const int out_w, const int out_h,
+                                 const int kernel_w, const int kernel_h, const int stride_x, const int stride_y,
+                                 const int padding_x, const int padding_y, const int dilation_x, const int dilation_y,
+                                 const int channels, const int batches) {
+    const int global_idx     = blockIdx.x * blockDim.x + threadIdx.x;
+    const int total_elements = batches * channels * out_h * out_w;
+
+    if (global_idx >= total_elements) {
+        return;
+    }
+
+    conv_params params = { in_w,     in_h,      out_w,     out_h,      kernel_w,   kernel_h, stride_x,
+                           stride_y, padding_x, padding_y, dilation_x, dilation_y, channels, batches };
+
+    int batch_idx, channel_idx, out_y_idx, out_x_idx;
+    Layout::unpack_indices(global_idx, params, batch_idx, channel_idx, out_y_idx, out_x_idx);
+
+    T accumulator = 0;
+    kernel_bounds bounds = calculate_kernel_bounds(out_x_idx, out_y_idx, params);
+
+    for (int kern_y = bounds.y_min; kern_y < bounds.y_max; ++kern_y) {
+        int in_y_idx = calculate_input_coord(out_y_idx, kern_y, params.stride_y, params.dilation_y, params.padding_y);
+
+        for (int kern_x = bounds.x_min; kern_x < bounds.x_max; ++kern_x) {
+            int in_x_idx = calculate_input_coord(out_x_idx, kern_x, params.stride_x, params.dilation_x, params.padding_x);
+
+            const T input_val  = input[Layout::input_index(batch_idx, channel_idx, in_y_idx, in_x_idx, params)];
+            const T kernel_val = kernel[Layout::kernel_index(channel_idx, kern_y, kern_x, params)];
+
+            accumulator += input_val * kernel_val;
+        }
+    }
+
+    output[Layout::output_index(batch_idx, channel_idx, out_y_idx, out_x_idx, params)] = accumulator;
+}
+
+void ggml_cuda_op_conv2d_dw(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * kernel = dst->src[0];
+    const ggml_tensor * input  = dst->src[1];
+
+    GGML_ASSERT(kernel->type == GGML_TYPE_F32 && input->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
+    const float * w_d = (const float *) kernel->data;
+    const float * x_d = (const float *) input->data;
+    float *       y_d = (float *) dst->data;
+
+    const int32_t * p          = (const int32_t *) dst->op_params;
+    const int       stride_x   = p[0];
+    const int       stride_y   = p[1];
+    const int       padding_x  = p[2];
+    const int       padding_y  = p[3];
+    const int       dilation_x = p[4];
+    const int       dilation_y = p[5];
+
+    const int in_w     = input->ne[0];
+    const int in_h     = input->ne[1];
+    const int kernel_w = kernel->ne[0];
+    const int kernel_h = kernel->ne[1];
+    const int out_w    = dst->ne[0];
+    const int out_h    = dst->ne[1];
+    const int channels = dst->ne[2];
+    const int batches  = dst->ne[3];
+
+    cudaStream_t st = ctx.stream();
+
+    const int total  = batches * channels * out_h * out_w;
+    const int blocks = (total + CUDA_CONV2D_DW_BLOCK_SIZE - 1) / CUDA_CONV2D_DW_BLOCK_SIZE;
+
+    if (ggml_is_contiguous(input)) {
+        conv2d_dw_kernel<float, whcn_layout><<<blocks, CUDA_CONV2D_DW_BLOCK_SIZE, 0, st>>>(
+            x_d, w_d, y_d, in_w, in_h, out_w, out_h, kernel_w, kernel_h, stride_x, stride_y, padding_x, padding_y,
+            dilation_x, dilation_y, channels, batches);
+    } else if (ggml_is_contiguous_channels(input)) {
+        conv2d_dw_kernel<float, cwhn_layout><<<blocks, CUDA_CONV2D_DW_BLOCK_SIZE, 0, st>>>(
+            x_d, w_d, y_d, in_w, in_h, out_w, out_h, kernel_w, kernel_h, stride_x, stride_y, padding_x, padding_y,
+            dilation_x, dilation_y, channels, batches);
+    } else {
+        GGML_ABORT("Unsupported memory layout for conv_2d_dw");
+    }
+}
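For reference, the depthwise kernel above takes out_w/out_h straight from dst->ne and never recomputes them; they follow standard dilated-convolution arithmetic. A small sketch of that formula (standard convolution math, not code from this diff):

    // Output extent per spatial axis for a strided, padded, dilated convolution.
    static inline int conv2d_out_size(int in_size, int kernel_size, int stride, int padding, int dilation) {
        return (in_size + 2 * padding - dilation * (kernel_size - 1) - 1) / stride + 1;
    }

    // Example: a 5x5 input with a 3x3 kernel, stride 1, padding 1, dilation 1
    // keeps its spatial size: conv2d_out_size(5, 3, 1, 1, 1) == 5.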