@novastera-oss/llamarn 0.2.6 → 0.2.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/android/src/main/cpp/include/llama.h +141 -38
- package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
- package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
- package/cpp/LlamaCppModel.cpp +58 -24
- package/cpp/LlamaCppModel.h +3 -3
- package/cpp/PureCppImpl.cpp +1 -1
- package/cpp/PureCppImpl.h +2 -2
- package/cpp/build-info.cpp +2 -2
- package/cpp/llama.cpp/CMakeLists.txt +15 -4
- package/cpp/llama.cpp/Makefile +2 -2
- package/cpp/llama.cpp/README.md +32 -13
- package/cpp/llama.cpp/common/CMakeLists.txt +10 -20
- package/cpp/llama.cpp/common/arg.cpp +37 -6
- package/cpp/llama.cpp/common/build-info.cpp.in +2 -2
- package/cpp/llama.cpp/common/chat-parser.cpp +5 -0
- package/cpp/llama.cpp/common/chat-parser.h +2 -0
- package/cpp/llama.cpp/common/chat.cpp +12 -9
- package/cpp/llama.cpp/common/chat.h +1 -1
- package/cpp/llama.cpp/common/common.cpp +53 -40
- package/cpp/llama.cpp/common/common.h +6 -2
- package/cpp/llama.cpp/common/json-schema-to-grammar.cpp +3 -46
- package/cpp/llama.cpp/common/speculative.cpp +6 -4
- package/cpp/llama.cpp/convert_hf_to_gguf.py +215 -76
- package/cpp/llama.cpp/ggml/CMakeLists.txt +48 -2
- package/cpp/llama.cpp/ggml/cmake/common.cmake +1 -2
- package/cpp/llama.cpp/ggml/include/ggml-cpu.h +2 -0
- package/cpp/llama.cpp/ggml/include/ggml.h +33 -0
- package/cpp/llama.cpp/ggml/src/CMakeLists.txt +64 -13
- package/cpp/llama.cpp/ggml/src/ggml-backend-reg.cpp +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cann/common.h +6 -1
- package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +33 -9
- package/cpp/llama.cpp/ggml/src/ggml-common.h +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +124 -26
- package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +11 -10
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +4114 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +2163 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +2639 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +2732 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +2069 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +397 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +1300 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +1481 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +4311 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-aarch64.cpp → arch/x86/repack.cpp} +79 -3225
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +184 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/common.h +4 -3
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +16 -7
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +93 -104
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +12 -8
- package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-hbm.cpp → hbm.cpp} +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +58 -8
- package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.h +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +194 -69
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.h +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/quants.c +1158 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-quants.h → quants.h} +26 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.cpp +1571 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.h +98 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +213 -37
- package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-traits.cpp → traits.cpp} +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.h +45 -45
- package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +59 -37
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-dw.cu +161 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-dw.cuh +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-transpose.cu +91 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-transpose.cuh +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +4 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-wmma-f16.cu +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +90 -39
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mean.cu +19 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mean.cuh +3 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmv.cu +257 -87
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmv.cuh +2 -3
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ssm-scan.cu +6 -4
- package/cpp/llama.cpp/ggml/src/ggml-cuda/sumrows.cu +5 -18
- package/cpp/llama.cpp/ggml/src/ggml-cuda/sumrows.cuh +0 -1
- package/cpp/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-impl.h +61 -183
- package/cpp/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +11 -10
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +16 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +260 -49
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +497 -282
- package/cpp/llama.cpp/ggml/src/ggml-musa/mudnn.cuh +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1078 -468
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/concat.cl +109 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl +283 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/pad.cl +30 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/repeat.cl +39 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/tanh.cl +63 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/tsembd.cl +48 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/upscale.cl +121 -0
- package/cpp/llama.cpp/ggml/src/ggml-quants.c +0 -2
- package/cpp/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +18 -15
- package/cpp/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +5 -6
- package/cpp/llama.cpp/ggml/src/ggml-sycl/common.hpp +20 -48
- package/cpp/llama.cpp/ggml/src/ggml-sycl/concat.cpp +28 -41
- package/cpp/llama.cpp/ggml/src/ggml-sycl/conv.cpp +4 -10
- package/cpp/llama.cpp/ggml/src/ggml-sycl/convert.cpp +117 -165
- package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +192 -53
- package/cpp/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +32 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +49 -67
- package/cpp/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +31 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +99 -159
- package/cpp/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +3 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +8 -105
- package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +209 -92
- package/cpp/llama.cpp/ggml/src/ggml-sycl/gla.cpp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +60 -80
- package/cpp/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +158 -203
- package/cpp/llama.cpp/ggml/src/ggml-sycl/norm.cpp +55 -74
- package/cpp/llama.cpp/ggml/src/ggml-sycl/quants.hpp +38 -10
- package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +24 -20
- package/cpp/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +3 -3
- package/cpp/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +3 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +3 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +3 -8
- package/cpp/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +108 -16
- package/cpp/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +12 -16
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +36 -28
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +487 -247
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -12
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +98 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +2 -0
- package/cpp/llama.cpp/ggml/src/ggml.c +69 -19
- package/cpp/llama.cpp/ggml/src/gguf.cpp +5 -1
- package/cpp/llama.cpp/gguf-py/gguf/constants.py +133 -0
- package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +25 -1
- package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +78 -3
- package/cpp/llama.cpp/gguf-py/gguf/vocab.py +97 -4
- package/cpp/llama.cpp/gguf-py/pyproject.toml +2 -2
- package/cpp/llama.cpp/include/llama.h +141 -38
- package/cpp/llama.cpp/models/templates/Mistral-Small-3.2-24B-Instruct-2506.jinja +124 -0
- package/cpp/llama.cpp/requirements/requirements-compare-llama-bench.txt +1 -0
- package/cpp/llama.cpp/src/CMakeLists.txt +2 -2
- package/cpp/llama.cpp/src/llama-arch.cpp +150 -3
- package/cpp/llama.cpp/src/llama-arch.h +25 -1
- package/cpp/llama.cpp/src/llama-batch.cpp +736 -274
- package/cpp/llama.cpp/src/llama-batch.h +110 -57
- package/cpp/llama.cpp/src/llama-chat.cpp +30 -8
- package/cpp/llama.cpp/src/llama-chat.h +1 -0
- package/cpp/llama.cpp/src/llama-context.cpp +360 -266
- package/cpp/llama.cpp/src/llama-context.h +27 -23
- package/cpp/llama.cpp/src/llama-cparams.cpp +1 -1
- package/cpp/llama.cpp/src/llama-cparams.h +1 -1
- package/cpp/llama.cpp/src/llama-graph.cpp +411 -344
- package/cpp/llama.cpp/src/llama-graph.h +126 -58
- package/cpp/llama.cpp/src/llama-hparams.cpp +10 -2
- package/cpp/llama.cpp/src/llama-hparams.h +16 -2
- package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +103 -73
- package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.h +34 -42
- package/cpp/llama.cpp/src/llama-kv-cache-unified.cpp +345 -221
- package/cpp/llama.cpp/src/llama-kv-cache-unified.h +75 -50
- package/cpp/llama.cpp/src/llama-kv-cells.h +51 -22
- package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +246 -0
- package/cpp/llama.cpp/src/llama-memory-hybrid.h +138 -0
- package/cpp/llama.cpp/src/{llama-kv-cache-recurrent.cpp → llama-memory-recurrent.cpp} +302 -317
- package/cpp/llama.cpp/src/{llama-kv-cache-recurrent.h → llama-memory-recurrent.h} +60 -68
- package/cpp/llama.cpp/src/llama-memory.cpp +41 -0
- package/cpp/llama.cpp/src/llama-memory.h +73 -36
- package/cpp/llama.cpp/src/llama-mmap.cpp +1 -1
- package/cpp/llama.cpp/src/llama-model-loader.cpp +42 -17
- package/cpp/llama.cpp/src/llama-model-saver.cpp +1 -0
- package/cpp/llama.cpp/src/llama-model.cpp +1630 -511
- package/cpp/llama.cpp/src/llama-model.h +26 -0
- package/cpp/llama.cpp/src/llama-quant.cpp +89 -6
- package/cpp/llama.cpp/src/llama-vocab.cpp +58 -26
- package/cpp/llama.cpp/src/llama-vocab.h +1 -0
- package/cpp/llama.cpp/src/llama.cpp +11 -7
- package/cpp/llama.cpp/src/unicode.cpp +5 -0
- package/cpp/rn-completion.cpp +2 -2
- package/cpp/{rn-llama.hpp → rn-llama.h} +1 -1
- package/cpp/{rn-utils.hpp → rn-utils.h} +3 -0
- package/ios/include/chat.h +1 -1
- package/ios/include/common.h +6 -2
- package/ios/include/llama.h +141 -38
- package/ios/libs/llama.xcframework/Info.plist +15 -15
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4890 -4689
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +141 -38
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4710
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3764 -3622
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +141 -38
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4710
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3766 -3624
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +141 -38
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +141 -38
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +141 -38
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4890 -4689
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +141 -38
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4710
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3764 -3622
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +141 -38
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4926 -4725
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +141 -38
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4897 -4746
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3794 -3652
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +141 -38
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/package.json +1 -2
- package/cpp/llama.cpp/common/cmake/build-info-gen-cpp.cmake +0 -24
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +0 -8
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +0 -13891
- package/cpp/llama.cpp/src/llama-kv-cache.cpp +0 -1
- package/cpp/llama.cpp/src/llama-kv-cache.h +0 -44
- /package/cpp/llama.cpp/ggml/src/ggml-cpu/{cpu-feats-x86.cpp → arch/x86/cpu-feats.cpp} +0 -0
- /package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-hbm.h → hbm.h} +0 -0
- /package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-traits.h → traits.h} +0 -0
|
@@ -231,6 +231,71 @@ static ggml_cl_compiler_version get_adreno_cl_compiler_version(const char *drive
|
|
|
231
231
|
return { type, major, minor, patch };
|
|
232
232
|
}
|
|
233
233
|
|
|
234
|
+
// Profiling
|
|
235
|
+
struct ProfilingInfo {
|
|
236
|
+
std::string op_name;
|
|
237
|
+
std::string kernel_name;
|
|
238
|
+
|
|
239
|
+
cl_kernel kernel;
|
|
240
|
+
cl_event evt;
|
|
241
|
+
|
|
242
|
+
cl_ulong cmd_queued;
|
|
243
|
+
cl_ulong cmd_submit;
|
|
244
|
+
cl_ulong cmd_start;
|
|
245
|
+
cl_ulong cmd_end;
|
|
246
|
+
cl_ulong overhead_start;
|
|
247
|
+
cl_ulong overhead_end;
|
|
248
|
+
// For the times below, see spec for clGetEventProfilingInfo
|
|
249
|
+
// The time kernel spent in cmd queue - SUBMIT - QUEUED
|
|
250
|
+
cl_ulong cmd_queued_duration_ns;
|
|
251
|
+
// The time kernel spent for submission - START - SUBMIT
|
|
252
|
+
cl_ulong cmd_submit_duration_ns;
|
|
253
|
+
// Kernel execution time in nanoseconds - END - START
|
|
254
|
+
cl_ulong cmd_duration_ns;
|
|
255
|
+
// The time for the kernel to complete - COMPLETE - END
|
|
256
|
+
cl_ulong cmd_complete_duration_ns;
|
|
257
|
+
// Total time to finish the kernel - COMPELTE - QUEUED
|
|
258
|
+
cl_ulong cmd_total_duration_ns;
|
|
259
|
+
// Global and local work sizes.
|
|
260
|
+
size_t global_size[3];
|
|
261
|
+
size_t local_size[3];
|
|
262
|
+
// Op output size.
|
|
263
|
+
size_t output_size[4];
|
|
264
|
+
};
|
|
265
|
+
|
|
266
|
+
// Fills the launch-time fields of `info` for a kernel that has just been enqueued.
//
//   info        - record to populate (timing fields are left untouched)
//   evt         - event returned by the enqueue call
//   kernel      - kernel that was launched
//   work_dim    - number of leading entries of global_size/local_size that are valid
//   global_size - global work size, read for the first work_dim dimensions
//   local_size  - local work size, or NULL when the driver chooses it
//   tensor      - output tensor of the op; supplies the op name and output extents
static void populateProfilingInfo(
        ProfilingInfo& info, cl_event evt, cl_kernel kernel, cl_uint work_dim,
        size_t global_size[3], size_t local_size[3],
        const ggml_tensor * tensor) {
    info.op_name = tensor->name;
    info.kernel  = kernel;
    info.evt     = evt;

    // 0 means not specified, e.g., 2D workgroup, or NULL for driver to choose
    for (size_t & extent : info.local_size) {
        extent = 0;
    }
    for (size_t & extent : info.global_size) {
        extent = 0;
    }

    // Copy only the dimensions the launch actually used.
    for (cl_uint dim = 0; dim < work_dim; ++dim) {
        if (local_size) {
            info.local_size[dim] = local_size[dim];
        }
        info.global_size[dim] = global_size[dim];
    }

    for (int dim = 0; dim < 4; ++dim) {
        info.output_size[dim] = tensor->ne[dim];
    }
}
|
|
298
|
+
|
|
234
299
|
struct ggml_backend_opencl_context;
|
|
235
300
|
|
|
236
301
|
// backend device context
|
|
@@ -254,6 +319,8 @@ struct ggml_backend_opencl_device_context {
|
|
|
254
319
|
|
|
255
320
|
// backend context
|
|
256
321
|
struct ggml_backend_opencl_context {
|
|
322
|
+
int ref_count;
|
|
323
|
+
|
|
257
324
|
cl_device_id device;
|
|
258
325
|
std::string device_name;
|
|
259
326
|
|
|
@@ -315,6 +382,13 @@ struct ggml_backend_opencl_context {
|
|
|
315
382
|
cl_program program_softmax_4_f16;
|
|
316
383
|
cl_program program_argsort_f32_i32;
|
|
317
384
|
cl_program program_sum_rows_f32;
|
|
385
|
+
cl_program program_repeat;
|
|
386
|
+
cl_program program_pad;
|
|
387
|
+
cl_program program_tanh;
|
|
388
|
+
cl_program program_upscale;
|
|
389
|
+
cl_program program_concat;
|
|
390
|
+
cl_program program_tsembd;
|
|
391
|
+
cl_program program_mul_mv_id_q4_0_f32_8x_flat;
|
|
318
392
|
|
|
319
393
|
cl_kernel kernel_add, kernel_add_row;
|
|
320
394
|
cl_kernel kernel_mul, kernel_mul_row;
|
|
@@ -351,6 +425,118 @@ struct ggml_backend_opencl_context {
|
|
|
351
425
|
cl_kernel kernel_im2col_f32, kernel_im2col_f16;
|
|
352
426
|
cl_kernel kernel_argsort_f32_i32;
|
|
353
427
|
cl_kernel kernel_sum_rows_f32;
|
|
428
|
+
cl_kernel kernel_repeat;
|
|
429
|
+
cl_kernel kernel_pad;
|
|
430
|
+
cl_kernel kernel_tanh_f32_nd;
|
|
431
|
+
cl_kernel kernel_tanh_f16_nd;
|
|
432
|
+
cl_kernel kernel_upscale;
|
|
433
|
+
cl_kernel kernel_upscale_bilinear;
|
|
434
|
+
cl_kernel kernel_concat_f32_contiguous;
|
|
435
|
+
cl_kernel kernel_concat_f32_non_contiguous;
|
|
436
|
+
cl_kernel kernel_timestep_embedding;
|
|
437
|
+
cl_kernel kernel_mul_mv_id_q4_0_f32_8x_flat;
|
|
438
|
+
|
|
439
|
+
std::vector<ProfilingInfo> profiling_info;
|
|
440
|
+
|
|
441
|
+
void write_profiling_info() {
|
|
442
|
+
FILE * fperf = fopen("cl_profiling.csv", "w");
|
|
443
|
+
if (!fperf) {
|
|
444
|
+
GGML_LOG_ERROR("Failed to open cl_profiling.csv\n");
|
|
445
|
+
return;
|
|
446
|
+
}
|
|
447
|
+
|
|
448
|
+
// Populate profiling info
|
|
449
|
+
for (ProfilingInfo & info : profiling_info) {
|
|
450
|
+
cl_ulong cmd_queued;
|
|
451
|
+
cl_ulong cmd_submit;
|
|
452
|
+
cl_ulong cmd_start;
|
|
453
|
+
cl_ulong cmd_end;
|
|
454
|
+
cl_ulong cmd_complete;
|
|
455
|
+
|
|
456
|
+
CL_CHECK(clWaitForEvents(1, &info.evt));
|
|
457
|
+
CL_CHECK(clGetEventProfilingInfo(
|
|
458
|
+
info.evt, CL_PROFILING_COMMAND_QUEUED, sizeof(cl_ulong), &cmd_queued, NULL));
|
|
459
|
+
CL_CHECK(clGetEventProfilingInfo(
|
|
460
|
+
info.evt, CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &cmd_submit, NULL));
|
|
461
|
+
CL_CHECK(clGetEventProfilingInfo(
|
|
462
|
+
info.evt, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &cmd_start, NULL));
|
|
463
|
+
CL_CHECK(clGetEventProfilingInfo(
|
|
464
|
+
info.evt, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &cmd_end, NULL));
|
|
465
|
+
CL_CHECK(clGetEventProfilingInfo(
|
|
466
|
+
info.evt, CL_PROFILING_COMMAND_COMPLETE, sizeof(cl_ulong), &cmd_complete, NULL));
|
|
467
|
+
CL_CHECK(clReleaseEvent(info.evt));
|
|
468
|
+
|
|
469
|
+
char kernel_name[512];
|
|
470
|
+
CL_CHECK(clGetKernelInfo(info.kernel, CL_KERNEL_FUNCTION_NAME,
|
|
471
|
+
sizeof(kernel_name), kernel_name, NULL));
|
|
472
|
+
info.kernel_name = kernel_name;
|
|
473
|
+
|
|
474
|
+
info.cmd_queued = cmd_queued;
|
|
475
|
+
info.cmd_submit = cmd_submit;
|
|
476
|
+
info.cmd_start = cmd_start;
|
|
477
|
+
info.cmd_end = cmd_end;
|
|
478
|
+
|
|
479
|
+
info.cmd_queued_duration_ns = cmd_submit - cmd_queued;
|
|
480
|
+
info.cmd_submit_duration_ns = cmd_start - cmd_submit;
|
|
481
|
+
info.cmd_duration_ns = cmd_end - cmd_start;
|
|
482
|
+
info.cmd_complete_duration_ns = cmd_complete - cmd_end;
|
|
483
|
+
info.cmd_total_duration_ns = cmd_complete - cmd_queued;
|
|
484
|
+
}
|
|
485
|
+
|
|
486
|
+
// Dump a csv
|
|
487
|
+
float total_kernel_time = 0;
|
|
488
|
+
fprintf(fperf, "op name, kernel name, queued duration (ms), submit duration(ms), exec duration (ms), complete duration (ms), total duration (ms), global size, local size, output size\n");
|
|
489
|
+
for (const ProfilingInfo & info : profiling_info) {
|
|
490
|
+
total_kernel_time += info.cmd_duration_ns/1.e6f;
|
|
491
|
+
fprintf(fperf, "%s,%s,%f,%f,%f,%f,%f,%zux%zux%zu,%zux%zux%zu,%zux%zux%zux%zu\n",
|
|
492
|
+
info.op_name.c_str(), info.kernel_name.c_str(),
|
|
493
|
+
info.cmd_queued_duration_ns/1.e6f,
|
|
494
|
+
info.cmd_submit_duration_ns/1.e6f,
|
|
495
|
+
info.cmd_duration_ns/1.e6f,
|
|
496
|
+
info.cmd_complete_duration_ns/1.e6f,
|
|
497
|
+
info.cmd_total_duration_ns/1.e6f,
|
|
498
|
+
info.global_size[0], info.global_size[1], info.global_size[2],
|
|
499
|
+
info.local_size[0], info.local_size[1], info.local_size[2],
|
|
500
|
+
info.output_size[0], info.output_size[1], info.output_size[2], info.output_size[3]);
|
|
501
|
+
}
|
|
502
|
+
fclose(fperf);
|
|
503
|
+
|
|
504
|
+
GGML_LOG_INFO("ggml_opencl: total kernel time: %f\n", total_kernel_time);
|
|
505
|
+
|
|
506
|
+
// Dump a simple chrome trace
|
|
507
|
+
FILE* ftrace = fopen("cl_trace.json", "w");
|
|
508
|
+
if (!ftrace) {
|
|
509
|
+
GGML_LOG_ERROR("Failed to open cl_trace.json\n");
|
|
510
|
+
return;
|
|
511
|
+
}
|
|
512
|
+
|
|
513
|
+
fprintf(ftrace, "[\n");
|
|
514
|
+
for (const ProfilingInfo & info : profiling_info) {
|
|
515
|
+
fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"B\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Host\"},\n",
|
|
516
|
+
info.kernel_name.c_str(), info.cmd_queued/1000);
|
|
517
|
+
fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"E\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Host\"},\n",
|
|
518
|
+
info.kernel_name.c_str(), info.cmd_submit/1000);
|
|
519
|
+
|
|
520
|
+
fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"B\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Device\"},\n",
|
|
521
|
+
info.kernel_name.c_str(), info.cmd_start/1000);
|
|
522
|
+
fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"E\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Device\"},\n",
|
|
523
|
+
info.kernel_name.c_str(), info.cmd_end/1000);
|
|
524
|
+
}
|
|
525
|
+
fclose(ftrace);
|
|
526
|
+
}
|
|
527
|
+
|
|
528
|
+
// Enqueues `kernel` on this context's command queue.
// When GGML_OPENCL_PROFILING is defined, the launch's event is kept and a
// profiling record describing the launch (work sizes, kernel, output tensor)
// is appended to `profiling_info`; otherwise `tensor` is unused and no event
// is requested.
void enqueue_ndrange_kernel(cl_kernel kernel, cl_uint work_dim, size_t *global_work_size, size_t *local_work_size, const ggml_tensor * tensor) {
#ifndef GGML_OPENCL_PROFILING
    GGML_UNUSED(tensor);
    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, work_dim, NULL, global_work_size, local_work_size, 0, NULL, NULL));
#else
    cl_event evt;
    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, work_dim, NULL, global_work_size, local_work_size, 0, NULL, &evt));

    // Record the launch only after the enqueue has succeeded.
    profiling_info.emplace_back();
    ProfilingInfo & record = profiling_info.back();
    populateProfilingInfo(record, evt, kernel, work_dim, global_work_size, local_work_size, tensor);
#endif
}
|
|
354
540
|
|
|
355
541
|
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
|
|
356
542
|
// Transpose kernels
|
|
@@ -378,46 +564,19 @@ struct ggml_backend_opencl_context {
|
|
|
378
564
|
cl_kernel CL_mul_mat_vec_q4_0_f32_1d_4x_flat_11008_1_4096;
|
|
379
565
|
cl_kernel CL_mul_mat_vec_q4_0_f32_1d_4x_flat_32000_1_4096;
|
|
380
566
|
#endif // GGML_OPENCL_USE_ADRENO_KERNELS
|
|
381
|
-
};
|
|
382
|
-
|
|
383
|
-
// All registered devices with a default device in the front.
|
|
384
|
-
static std::vector<ggml_backend_device> g_ggml_backend_opencl_devices;
|
|
385
567
|
|
|
386
|
-
|
|
568
|
+
// Drops one reference to this context. When the count reaches zero and
// GGML_OPENCL_PROFILING is defined, the collected profiling data is written out.
void free() {
    if (--ref_count != 0) {
        return;
    }
#ifdef GGML_OPENCL_PROFILING
    write_profiling_info();
#endif
}
|
|
417
576
|
};
|
|
418
577
|
|
|
419
|
-
|
|
420
|
-
|
|
578
|
+
// All registered devices with a default device in the front.
|
|
579
|
+
static std::vector<ggml_backend_device> g_ggml_backend_opencl_devices;
|
|
421
580
|
|
|
422
581
|
inline std::string read_file(const std::string &path) {
|
|
423
582
|
std::ifstream ifs(path);
|
|
@@ -1097,6 +1256,166 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
|
|
|
1097
1256
|
GGML_LOG_CONT(".");
|
|
1098
1257
|
}
|
|
1099
1258
|
|
|
1259
|
+
// repeat
|
|
1260
|
+
{
|
|
1261
|
+
#ifdef GGML_OPENCL_EMBED_KERNELS
|
|
1262
|
+
const std::string kernel_src {
|
|
1263
|
+
#include "repeat.cl.h"
|
|
1264
|
+
};
|
|
1265
|
+
#else
|
|
1266
|
+
const std::string kernel_src = read_file("repeat.cl");
|
|
1267
|
+
#endif
|
|
1268
|
+
if (!kernel_src.empty()) {
|
|
1269
|
+
backend_ctx->program_repeat =
|
|
1270
|
+
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
|
1271
|
+
CL_CHECK((backend_ctx->kernel_repeat = clCreateKernel(backend_ctx->program_repeat, "kernel_repeat", &err), err));
|
|
1272
|
+
GGML_LOG_CONT(".");
|
|
1273
|
+
} else {
|
|
1274
|
+
GGML_LOG_WARN("ggml_opencl: repeat kernel source not found or empty. Repeat operations will not be available.\n");
|
|
1275
|
+
backend_ctx->program_repeat = nullptr;
|
|
1276
|
+
backend_ctx->kernel_repeat = nullptr;
|
|
1277
|
+
}
|
|
1278
|
+
}
|
|
1279
|
+
|
|
1280
|
+
// pad
|
|
1281
|
+
{
|
|
1282
|
+
#ifdef GGML_OPENCL_EMBED_KERNELS
|
|
1283
|
+
const std::string kernel_src {
|
|
1284
|
+
#include "pad.cl.h"
|
|
1285
|
+
};
|
|
1286
|
+
#else
|
|
1287
|
+
const std::string kernel_src = read_file("pad.cl");
|
|
1288
|
+
#endif
|
|
1289
|
+
if (!kernel_src.empty()) {
|
|
1290
|
+
backend_ctx->program_pad =
|
|
1291
|
+
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
|
1292
|
+
CL_CHECK((backend_ctx->kernel_pad = clCreateKernel(backend_ctx->program_pad, "kernel_pad", &err), err));
|
|
1293
|
+
GGML_LOG_CONT(".");
|
|
1294
|
+
} else {
|
|
1295
|
+
GGML_LOG_WARN("ggml_opencl: pad kernel source not found or empty. Pad operations will not be available.\n");
|
|
1296
|
+
backend_ctx->program_pad = nullptr;
|
|
1297
|
+
backend_ctx->kernel_pad = nullptr;
|
|
1298
|
+
}
|
|
1299
|
+
}
|
|
1300
|
+
|
|
1301
|
+
// tanh
|
|
1302
|
+
{
|
|
1303
|
+
#ifdef GGML_OPENCL_EMBED_KERNELS
|
|
1304
|
+
const std::string kernel_src {
|
|
1305
|
+
#include "tanh.cl.h"
|
|
1306
|
+
};
|
|
1307
|
+
#else
|
|
1308
|
+
const std::string kernel_src = read_file("tanh.cl");
|
|
1309
|
+
#endif
|
|
1310
|
+
if (!kernel_src.empty()) {
|
|
1311
|
+
backend_ctx->program_tanh =
|
|
1312
|
+
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
|
1313
|
+
CL_CHECK((backend_ctx->kernel_tanh_f32_nd = clCreateKernel(backend_ctx->program_tanh, "kernel_tanh_f32_nd", &err), err));
|
|
1314
|
+
CL_CHECK((backend_ctx->kernel_tanh_f16_nd = clCreateKernel(backend_ctx->program_tanh, "kernel_tanh_f16_nd", &err), err));
|
|
1315
|
+
GGML_LOG_CONT(".");
|
|
1316
|
+
} else {
|
|
1317
|
+
GGML_LOG_WARN("ggml_opencl: tanh kernel source not found or empty. Tanh operation will not be available.\n");
|
|
1318
|
+
backend_ctx->program_tanh = nullptr;
|
|
1319
|
+
backend_ctx->kernel_tanh_f32_nd = nullptr;
|
|
1320
|
+
backend_ctx->kernel_tanh_f16_nd = nullptr;
|
|
1321
|
+
}
|
|
1322
|
+
}
|
|
1323
|
+
|
|
1324
|
+
// upscale
|
|
1325
|
+
{
|
|
1326
|
+
#ifdef GGML_OPENCL_EMBED_KERNELS
|
|
1327
|
+
const std::string kernel_src {
|
|
1328
|
+
#include "upscale.cl.h"
|
|
1329
|
+
};
|
|
1330
|
+
#else
|
|
1331
|
+
const std::string kernel_src = read_file("upscale.cl");
|
|
1332
|
+
#endif
|
|
1333
|
+
if (!kernel_src.empty()) {
|
|
1334
|
+
backend_ctx->program_upscale =
|
|
1335
|
+
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
|
1336
|
+
CL_CHECK((backend_ctx->kernel_upscale = clCreateKernel(backend_ctx->program_upscale, "kernel_upscale", &err), err));
|
|
1337
|
+
if (backend_ctx->program_upscale) {
|
|
1338
|
+
cl_int err_bilinear;
|
|
1339
|
+
backend_ctx->kernel_upscale_bilinear = clCreateKernel(backend_ctx->program_upscale, "kernel_upscale_bilinear", &err_bilinear);
|
|
1340
|
+
if (err_bilinear != CL_SUCCESS) {
|
|
1341
|
+
GGML_LOG_WARN("ggml_opencl: kernel_upscale_bilinear not found in upscale.cl. Bilinear upscale will not be available. Error: %d\n", err_bilinear);
|
|
1342
|
+
backend_ctx->kernel_upscale_bilinear = nullptr;
|
|
1343
|
+
}
|
|
1344
|
+
} else {
|
|
1345
|
+
backend_ctx->kernel_upscale_bilinear = nullptr;
|
|
1346
|
+
}
|
|
1347
|
+
GGML_LOG_CONT(".");
|
|
1348
|
+
} else {
|
|
1349
|
+
GGML_LOG_WARN("ggml_opencl: upscale kernel source not found or empty. Upscale operations will not be available.\n");
|
|
1350
|
+
backend_ctx->program_upscale = nullptr;
|
|
1351
|
+
backend_ctx->kernel_upscale = nullptr;
|
|
1352
|
+
backend_ctx->kernel_upscale_bilinear = nullptr;
|
|
1353
|
+
}
|
|
1354
|
+
}
|
|
1355
|
+
|
|
1356
|
+
// concat
|
|
1357
|
+
{
|
|
1358
|
+
#ifdef GGML_OPENCL_EMBED_KERNELS
|
|
1359
|
+
const std::string kernel_src {
|
|
1360
|
+
#include "concat.cl.h"
|
|
1361
|
+
};
|
|
1362
|
+
#else
|
|
1363
|
+
|
|
1364
|
+
const std::string kernel_src = read_file("concat.cl");
|
|
1365
|
+
#endif
|
|
1366
|
+
if (!kernel_src.empty()) {
|
|
1367
|
+
backend_ctx->program_concat =
|
|
1368
|
+
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
|
1369
|
+
|
|
1370
|
+
CL_CHECK((backend_ctx->kernel_concat_f32_contiguous = clCreateKernel(backend_ctx->program_concat, "kernel_concat_f32_contiguous", &err), err));
|
|
1371
|
+
CL_CHECK((backend_ctx->kernel_concat_f32_non_contiguous = clCreateKernel(backend_ctx->program_concat, "kernel_concat_f32_non_contiguous", &err), err));
|
|
1372
|
+
GGML_LOG_CONT(".");
|
|
1373
|
+
} else {
|
|
1374
|
+
GGML_LOG_WARN("ggml_opencl: concat kernel source not found or empty. Concat operations will not be available.\n");
|
|
1375
|
+
backend_ctx->program_concat = nullptr;
|
|
1376
|
+
backend_ctx->kernel_concat_f32_contiguous = nullptr;
|
|
1377
|
+
backend_ctx->kernel_concat_f32_non_contiguous = nullptr;
|
|
1378
|
+
}
|
|
1379
|
+
}
|
|
1380
|
+
|
|
1381
|
+
// timestep_embedding
|
|
1382
|
+
{
|
|
1383
|
+
#ifdef GGML_OPENCL_EMBED_KERNELS
|
|
1384
|
+
const std::string kernel_src {
|
|
1385
|
+
#include "tsembd.cl.h"
|
|
1386
|
+
};
|
|
1387
|
+
#else
|
|
1388
|
+
|
|
1389
|
+
const std::string kernel_src = read_file("tsembd.cl");
|
|
1390
|
+
#endif
|
|
1391
|
+
if (!kernel_src.empty()) {
|
|
1392
|
+
backend_ctx->program_tsembd =
|
|
1393
|
+
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
|
1394
|
+
CL_CHECK((backend_ctx->kernel_timestep_embedding = clCreateKernel(backend_ctx->program_tsembd, "kernel_timestep_embedding", &err), err));
|
|
1395
|
+
GGML_LOG_CONT(".");
|
|
1396
|
+
} else {
|
|
1397
|
+
GGML_LOG_WARN("ggml_opencl: timestep_embedding kernel source not found or empty. This op will not be available.\n");
|
|
1398
|
+
backend_ctx->program_tsembd = nullptr;
|
|
1399
|
+
backend_ctx->kernel_timestep_embedding = nullptr;
|
|
1400
|
+
}
|
|
1401
|
+
}
|
|
1402
|
+
|
|
1403
|
+
// mul_mv_id_q4_0_f32_8x_flat
|
|
1404
|
+
{
|
|
1405
|
+
#ifdef GGML_OPENCL_EMBED_KERNELS
|
|
1406
|
+
const std::string kernel_src {
|
|
1407
|
+
#include "mul_mv_id_q4_0_f32_8x_flat.cl.h"
|
|
1408
|
+
};
|
|
1409
|
+
#else
|
|
1410
|
+
const std::string kernel_src = read_file("mul_mv_id_q4_0_f32_8x_flat.cl");
|
|
1411
|
+
#endif
|
|
1412
|
+
backend_ctx->program_mul_mv_id_q4_0_f32_8x_flat =
|
|
1413
|
+
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
|
1414
|
+
|
|
1415
|
+
CL_CHECK((backend_ctx->kernel_mul_mv_id_q4_0_f32_8x_flat = clCreateKernel(backend_ctx->program_mul_mv_id_q4_0_f32_8x_flat, "kernel_mul_mv_id_q4_0_f32_8x_flat", &err), err));
|
|
1416
|
+
GGML_LOG_CONT(".");
|
|
1417
|
+
}
|
|
1418
|
+
|
|
1100
1419
|
// Adreno kernels
|
|
1101
1420
|
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
|
|
1102
1421
|
// transpose
|
|
@@ -1492,6 +1811,12 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
|
|
|
1492
1811
|
backend_ctx->device = dev_ctx->device;
|
|
1493
1812
|
backend_ctx->gpu_family = GPU_FAMILY::UNKNOWN;
|
|
1494
1813
|
|
|
1814
|
+
// ref_count get increased in ggml_backend_opencl_device_init
|
|
1815
|
+
// This function is also used to retrieve backend context, so we don't want
|
|
1816
|
+
// to increase ref_count for each call. We only want to increase ref_count
|
|
1817
|
+
// when the associated device is initialized
|
|
1818
|
+
backend_ctx->ref_count = 0;
|
|
1819
|
+
|
|
1495
1820
|
if (strstr(dev_ctx->device_name.c_str(), "Adreno") ||
|
|
1496
1821
|
strstr(dev_ctx->device_name.c_str(), "Qualcomm") ||
|
|
1497
1822
|
strstr(dev_ctx->device_version.c_str(), "Adreno")) {
|
|
@@ -1664,93 +1989,22 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
|
|
|
1664
1989
|
return dev_ctx->backend_ctx;
|
|
1665
1990
|
}
|
|
1666
1991
|
|
|
1667
|
-
static void ggml_cl2_free(
|
|
1668
|
-
|
|
1669
|
-
|
|
1670
|
-
if (!fperf) {
|
|
1671
|
-
GGML_LOG_ERROR("Failed to open cl_profiling.csv\n");
|
|
1672
|
-
return;
|
|
1673
|
-
}
|
|
1992
|
+
static void ggml_cl2_free(ggml_backend_t backend) {
|
|
1993
|
+
ggml_backend_opencl_context * ctx = (ggml_backend_opencl_context *) backend->context;
|
|
1994
|
+
ctx->free();
|
|
1674
1995
|
|
|
1675
|
-
//
|
|
1676
|
-
|
|
1677
|
-
|
|
1678
|
-
|
|
1679
|
-
|
|
1680
|
-
|
|
1681
|
-
|
|
1682
|
-
|
|
1683
|
-
CL_CHECK(clWaitForEvents(1, &info.evt));
|
|
1684
|
-
CL_CHECK(clGetEventProfilingInfo(
|
|
1685
|
-
info.evt, CL_PROFILING_COMMAND_QUEUED, sizeof(cl_ulong), &cmd_queued, NULL));
|
|
1686
|
-
CL_CHECK(clGetEventProfilingInfo(
|
|
1687
|
-
info.evt, CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &cmd_submit, NULL));
|
|
1688
|
-
CL_CHECK(clGetEventProfilingInfo(
|
|
1689
|
-
info.evt, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &cmd_start, NULL));
|
|
1690
|
-
CL_CHECK(clGetEventProfilingInfo(
|
|
1691
|
-
info.evt, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &cmd_end, NULL));
|
|
1692
|
-
CL_CHECK(clGetEventProfilingInfo(
|
|
1693
|
-
info.evt, CL_PROFILING_COMMAND_COMPLETE, sizeof(cl_ulong), &cmd_complete, NULL));
|
|
1694
|
-
CL_CHECK(clReleaseEvent(info.evt));
|
|
1695
|
-
|
|
1696
|
-
char kernel_name[512];
|
|
1697
|
-
CL_CHECK(clGetKernelInfo(info.kernel, CL_KERNEL_FUNCTION_NAME,
|
|
1698
|
-
sizeof(kernel_name), kernel_name, NULL));
|
|
1699
|
-
info.kernel_name = kernel_name;
|
|
1700
|
-
|
|
1701
|
-
info.cmd_queued = cmd_queued;
|
|
1702
|
-
info.cmd_submit = cmd_submit;
|
|
1703
|
-
info.cmd_start = cmd_start;
|
|
1704
|
-
info.cmd_end = cmd_end;
|
|
1705
|
-
|
|
1706
|
-
info.cmd_queued_duration_ns = cmd_submit - cmd_queued;
|
|
1707
|
-
info.cmd_submit_duration_ns = cmd_start - cmd_submit;
|
|
1708
|
-
info.cmd_duration_ns = cmd_end - cmd_start;
|
|
1709
|
-
info.cmd_complete_duration_ns = cmd_complete - cmd_end;
|
|
1710
|
-
info.cmd_total_duration_ns = cmd_complete - cmd_queued;
|
|
1711
|
-
}
|
|
1712
|
-
|
|
1713
|
-
// Dump a csv
|
|
1714
|
-
float total_kernel_time = 0;
|
|
1715
|
-
fprintf(fperf, "op name, kernel name, queued duration (ms), submit duration(ms), exec duration (ms), complete duration (ms), total duration (ms), global size, local size, output size\n");
|
|
1716
|
-
for (const ProfilingInfo & info : g_profiling_info) {
|
|
1717
|
-
total_kernel_time += info.cmd_duration_ns/1.e6f;
|
|
1718
|
-
fprintf(fperf, "%s,%s,%f,%f,%f,%f,%f,%zux%zux%zu,%zux%zux%zu,%zux%zux%zux%zu\n",
|
|
1719
|
-
info.op_name.c_str(), info.kernel_name.c_str(),
|
|
1720
|
-
info.cmd_queued_duration_ns/1.e6f,
|
|
1721
|
-
info.cmd_submit_duration_ns/1.e6f,
|
|
1722
|
-
info.cmd_duration_ns/1.e6f,
|
|
1723
|
-
info.cmd_complete_duration_ns/1.e6f,
|
|
1724
|
-
info.cmd_total_duration_ns/1.e6f,
|
|
1725
|
-
info.global_size[0], info.global_size[1], info.global_size[2],
|
|
1726
|
-
info.local_size[0], info.local_size[1], info.local_size[2],
|
|
1727
|
-
info.output_size[0], info.output_size[1], info.output_size[2], info.output_size[3]);
|
|
1728
|
-
}
|
|
1729
|
-
fclose(fperf);
|
|
1730
|
-
|
|
1731
|
-
GGML_LOG_INFO("ggml_opencl: total kernel time: %f\n", total_kernel_time);
|
|
1732
|
-
|
|
1733
|
-
// Dump a simple chrome trace
|
|
1734
|
-
FILE* ftrace = fopen("cl_trace.json", "w");
|
|
1735
|
-
if (!ftrace) {
|
|
1736
|
-
GGML_LOG_ERROR("Failed to open cl_trace.json\n");
|
|
1737
|
-
return;
|
|
1996
|
+
// The CL context is shared by all backends, release it if all backends have been released
|
|
1997
|
+
bool should_release_opencl = true;
|
|
1998
|
+
for (auto device : g_ggml_backend_opencl_devices) {
|
|
1999
|
+
ggml_backend_opencl_device_context * ctx_dev = (ggml_backend_opencl_device_context *) device.context;
|
|
2000
|
+
if (ctx_dev->backend_ctx->ref_count > 0) {
|
|
2001
|
+
should_release_opencl = false;
|
|
2002
|
+
}
|
|
1738
2003
|
}
|
|
1739
2004
|
|
|
1740
|
-
|
|
1741
|
-
|
|
1742
|
-
fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"B\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Host\"},\n",
|
|
1743
|
-
info.kernel_name.c_str(), info.cmd_queued/1000);
|
|
1744
|
-
fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"E\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Host\"},\n",
|
|
1745
|
-
info.kernel_name.c_str(), info.cmd_submit/1000);
|
|
1746
|
-
|
|
1747
|
-
fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"B\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Device\"},\n",
|
|
1748
|
-
info.kernel_name.c_str(), info.cmd_start/1000);
|
|
1749
|
-
fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"E\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Device\"},\n",
|
|
1750
|
-
info.kernel_name.c_str(), info.cmd_end/1000);
|
|
2005
|
+
if (should_release_opencl) {
|
|
2006
|
+
CL_CHECK(clReleaseContext(ctx->context));
|
|
1751
2007
|
}
|
|
1752
|
-
fclose(ftrace);
|
|
1753
|
-
#endif
|
|
1754
2008
|
}
|
|
1755
2009
|
|
|
1756
2010
|
//------------------------------------------------------------------------------
|
|
@@ -1834,9 +2088,7 @@ static const char * ggml_backend_opencl_name(ggml_backend_t backend) {
|
|
|
1834
2088
|
}
|
|
1835
2089
|
|
|
1836
2090
|
static void ggml_backend_opencl_free(ggml_backend_t backend) {
|
|
1837
|
-
ggml_cl2_free();
|
|
1838
|
-
|
|
1839
|
-
GGML_UNUSED(backend);
|
|
2091
|
+
ggml_cl2_free(backend);
|
|
1840
2092
|
}
|
|
1841
2093
|
|
|
1842
2094
|
static void ggml_backend_opencl_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
|
@@ -1863,7 +2115,12 @@ static bool ggml_backend_opencl_cpy_tensor_async(ggml_backend_t backend, const g
|
|
|
1863
2115
|
}
|
|
1864
2116
|
|
|
1865
2117
|
static void ggml_backend_opencl_synchronize(ggml_backend_t backend) {
|
|
1866
|
-
|
|
2118
|
+
auto * backend_ctx = static_cast<ggml_backend_opencl_context *>(backend->context);
|
|
2119
|
+
|
|
2120
|
+
cl_event evt;
|
|
2121
|
+
CL_CHECK(clEnqueueBarrierWithWaitList(backend_ctx->queue, 0, nullptr, &evt));
|
|
2122
|
+
CL_CHECK(clWaitForEvents(1, &evt));
|
|
2123
|
+
CL_CHECK(clReleaseEvent(evt));
|
|
1867
2124
|
}
|
|
1868
2125
|
|
|
1869
2126
|
// Syncronizes the 'backend_ctx's device with others so that commands
|
|
@@ -1976,9 +2233,12 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
|
|
|
1976
2233
|
case GGML_UNARY_OP_SILU:
|
|
1977
2234
|
case GGML_UNARY_OP_RELU:
|
|
1978
2235
|
case GGML_UNARY_OP_GELU_QUICK:
|
|
1979
|
-
|
|
2236
|
+
return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
|
|
1980
2237
|
case GGML_UNARY_OP_SIGMOID:
|
|
1981
2238
|
return ggml_is_contiguous(op->src[0]);
|
|
2239
|
+
case GGML_UNARY_OP_TANH:
|
|
2240
|
+
return (op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32) ||
|
|
2241
|
+
(op->src[0]->type == GGML_TYPE_F16 && op->type == GGML_TYPE_F16);
|
|
1982
2242
|
default:
|
|
1983
2243
|
return false;
|
|
1984
2244
|
}
|
|
@@ -1988,6 +2248,17 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
|
|
|
1988
2248
|
case GGML_OP_NORM:
|
|
1989
2249
|
case GGML_OP_RMS_NORM:
|
|
1990
2250
|
return true;
|
|
2251
|
+
case GGML_OP_REPEAT:
|
|
2252
|
+
return op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32; // Assuming F32 for now, can be expanded
|
|
2253
|
+
case GGML_OP_PAD:
|
|
2254
|
+
return op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32 &&
|
|
2255
|
+
op->src[0]->ne[3] == 1 && op->ne[3] == 1;
|
|
2256
|
+
case GGML_OP_UPSCALE:
|
|
2257
|
+
return op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32;
|
|
2258
|
+
case GGML_OP_CONCAT:
|
|
2259
|
+
return op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32;
|
|
2260
|
+
case GGML_OP_TIMESTEP_EMBEDDING:
|
|
2261
|
+
return op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32;
|
|
1991
2262
|
case GGML_OP_GROUP_NORM:
|
|
1992
2263
|
return ggml_is_contiguous(op->src[0]);
|
|
1993
2264
|
case GGML_OP_MUL_MAT:
|
|
@@ -2000,6 +2271,13 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
|
|
|
2000
2271
|
return op->src[1]->type == GGML_TYPE_F32 && ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]);
|
|
2001
2272
|
}
|
|
2002
2273
|
return false;
|
|
2274
|
+
case GGML_OP_MUL_MAT_ID:
|
|
2275
|
+
if (op->src[0]->type == GGML_TYPE_Q4_0) {
|
|
2276
|
+
if (op->src[1]->type == GGML_TYPE_F32) {
|
|
2277
|
+
return ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]);
|
|
2278
|
+
}
|
|
2279
|
+
}
|
|
2280
|
+
return false;
|
|
2003
2281
|
case GGML_OP_RESHAPE:
|
|
2004
2282
|
case GGML_OP_VIEW:
|
|
2005
2283
|
case GGML_OP_PERMUTE:
|
|
@@ -2052,7 +2330,7 @@ static ggml_backend_i ggml_backend_opencl_i = {
|
|
|
2052
2330
|
/* .set_tensor_async = */ NULL, /* ggml_backend_opencl_set_tensor_async */
|
|
2053
2331
|
/* .get_tensor_async = */ NULL, /* ggml_backend_opencl_get_tensor_async */
|
|
2054
2332
|
/* .cpy_tensor_async = */ NULL, /* ggml_backend_opencl_cpy_tensor_async */
|
|
2055
|
-
/* .synchronize = */
|
|
2333
|
+
/* .synchronize = */ ggml_backend_opencl_synchronize,
|
|
2056
2334
|
/* .graph_plan_create = */ NULL,
|
|
2057
2335
|
/* .graph_plan_free = */ NULL,
|
|
2058
2336
|
/* .graph_plan_update = */ NULL,
|
|
@@ -2696,6 +2974,8 @@ static void ggml_backend_opencl_device_get_props(ggml_backend_dev_t dev, struct
|
|
|
2696
2974
|
|
|
2697
2975
|
static ggml_backend_t ggml_backend_opencl_device_init(ggml_backend_dev_t dev, const char * params) {
|
|
2698
2976
|
ggml_backend_opencl_context * backend_ctx = ggml_cl2_init(dev);
|
|
2977
|
+
// Getting a new reference to the backend, increase ref_count
|
|
2978
|
+
backend_ctx->ref_count++;
|
|
2699
2979
|
|
|
2700
2980
|
ggml_backend_t backend = new ggml_backend {
|
|
2701
2981
|
/* .guid = */ ggml_backend_opencl_guid(),
|
|
@@ -2956,31 +3236,6 @@ static void dump_tensor(ggml_backend_t backend, const struct ggml_tensor * tenso
|
|
|
2956
3236
|
#define dump_tensor(tensor)
|
|
2957
3237
|
#endif
|
|
2958
3238
|
|
|
2959
|
-
//------------------------------------------------------------------------------
|
|
2960
|
-
// Profiling utility
|
|
2961
|
-
//------------------------------------------------------------------------------
|
|
2962
|
-
#ifdef GGML_OPENCL_PROFILING
|
|
2963
|
-
static void populateProfilingInfo(
|
|
2964
|
-
ProfilingInfo& info, cl_event evt, cl_kernel kernel,
|
|
2965
|
-
size_t global_size[3], size_t local_size[3],
|
|
2966
|
-
const ggml_tensor * tensor) {
|
|
2967
|
-
info.op_name = tensor->name;
|
|
2968
|
-
info.kernel = kernel;
|
|
2969
|
-
info.evt = evt;
|
|
2970
|
-
|
|
2971
|
-
info.local_size[0] = local_size[0];
|
|
2972
|
-
info.local_size[1] = local_size[1];
|
|
2973
|
-
info.local_size[2] = local_size[2];
|
|
2974
|
-
info.global_size[0] = global_size[0];
|
|
2975
|
-
info.global_size[1] = global_size[1];
|
|
2976
|
-
info.global_size[2] = global_size[2];
|
|
2977
|
-
info.output_size[0] = tensor->ne[0];
|
|
2978
|
-
info.output_size[1] = tensor->ne[1];
|
|
2979
|
-
info.output_size[2] = tensor->ne[2];
|
|
2980
|
-
info.output_size[3] = tensor->ne[3];
|
|
2981
|
-
}
|
|
2982
|
-
#endif
|
|
2983
|
-
|
|
2984
3239
|
//------------------------------------------------------------------------------
|
|
2985
3240
|
// Ops
|
|
2986
3241
|
//------------------------------------------------------------------------------
|
|
@@ -3024,7 +3279,6 @@ static void ggml_cl_get_rows(ggml_backend_t backend, const ggml_tensor * src0, c
|
|
|
3024
3279
|
const cl_ulong nb2 = dst ? dst->nb[2] : 0;
|
|
3025
3280
|
|
|
3026
3281
|
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
|
3027
|
-
cl_command_queue queue = backend_ctx->queue;
|
|
3028
3282
|
|
|
3029
3283
|
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
|
|
3030
3284
|
ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
|
|
@@ -3068,15 +3322,7 @@ static void ggml_cl_get_rows(ggml_backend_t backend, const ggml_tensor * src0, c
|
|
|
3068
3322
|
size_t global_work_size[] = {(size_t)ne10, (size_t)ne11, 1};
|
|
3069
3323
|
size_t local_work_size[] = {1, 1, 1};
|
|
3070
3324
|
|
|
3071
|
-
|
|
3072
|
-
cl_event evt;
|
|
3073
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
|
|
3074
|
-
|
|
3075
|
-
g_profiling_info.emplace_back();
|
|
3076
|
-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
|
|
3077
|
-
#else
|
|
3078
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
|
|
3079
|
-
#endif
|
|
3325
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
|
3080
3326
|
}
|
|
3081
3327
|
|
|
3082
3328
|
static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
|
@@ -3118,7 +3364,6 @@ static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const
|
|
|
3118
3364
|
const cl_ulong nb3 = dst ? dst->nb[3] : 0;
|
|
3119
3365
|
|
|
3120
3366
|
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
|
3121
|
-
cl_command_queue queue = backend_ctx->queue;
|
|
3122
3367
|
|
|
3123
3368
|
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
|
|
3124
3369
|
ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
|
|
@@ -3193,29 +3438,13 @@ static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const
|
|
|
3193
3438
|
local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
|
|
3194
3439
|
}
|
|
3195
3440
|
|
|
3196
|
-
|
|
3197
|
-
cl_event evt;
|
|
3198
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
|
|
3199
|
-
|
|
3200
|
-
g_profiling_info.emplace_back();
|
|
3201
|
-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
|
|
3202
|
-
#else
|
|
3203
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
|
|
3204
|
-
#endif
|
|
3441
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
|
|
3205
3442
|
} else {
|
|
3206
3443
|
unsigned int nth = MIN(64, ne0);
|
|
3207
3444
|
size_t global_work_size[] = {ne01*nth, (size_t)ne02, (size_t)ne03};
|
|
3208
3445
|
size_t local_work_size[] = {nth, 1, 1};
|
|
3209
3446
|
|
|
3210
|
-
|
|
3211
|
-
cl_event evt;
|
|
3212
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
|
|
3213
|
-
|
|
3214
|
-
g_profiling_info.emplace_back();
|
|
3215
|
-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
|
|
3216
|
-
#else
|
|
3217
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
|
|
3218
|
-
#endif
|
|
3447
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
|
3219
3448
|
}
|
|
3220
3449
|
}
|
|
3221
3450
|
|
|
@@ -3258,7 +3487,6 @@ static void ggml_cl_mul(ggml_backend_t backend, const ggml_tensor * src0, const
|
|
|
3258
3487
|
const cl_ulong nb3 = dst ? dst->nb[3] : 0;
|
|
3259
3488
|
|
|
3260
3489
|
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
|
3261
|
-
cl_command_queue queue = backend_ctx->queue;
|
|
3262
3490
|
|
|
3263
3491
|
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
|
|
3264
3492
|
ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
|
|
@@ -3333,29 +3561,13 @@ static void ggml_cl_mul(ggml_backend_t backend, const ggml_tensor * src0, const
|
|
|
3333
3561
|
local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
|
|
3334
3562
|
}
|
|
3335
3563
|
|
|
3336
|
-
|
|
3337
|
-
cl_event evt;
|
|
3338
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
|
|
3339
|
-
|
|
3340
|
-
g_profiling_info.emplace_back();
|
|
3341
|
-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
|
|
3342
|
-
#else
|
|
3343
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
|
|
3344
|
-
#endif
|
|
3564
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
|
|
3345
3565
|
} else {
|
|
3346
3566
|
unsigned int nth = MIN(64, ne0);
|
|
3347
3567
|
size_t global_work_size[] = {ne01*nth, (size_t)ne02, (size_t)ne03};
|
|
3348
3568
|
size_t local_work_size[] = {nth, 1, 1};
|
|
3349
3569
|
|
|
3350
|
-
|
|
3351
|
-
cl_event evt;
|
|
3352
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
|
|
3353
|
-
|
|
3354
|
-
g_profiling_info.emplace_back();
|
|
3355
|
-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
|
|
3356
|
-
#else
|
|
3357
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
|
|
3358
|
-
#endif
|
|
3570
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
|
3359
3571
|
}
|
|
3360
3572
|
}
|
|
3361
3573
|
|
|
@@ -3395,7 +3607,6 @@ static void ggml_cl_div(ggml_backend_t backend, const ggml_tensor * src0, const
|
|
|
3395
3607
|
const cl_ulong nb3 = dst->nb[3];
|
|
3396
3608
|
|
|
3397
3609
|
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
|
3398
|
-
cl_command_queue queue = backend_ctx->queue;
|
|
3399
3610
|
|
|
3400
3611
|
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
|
|
3401
3612
|
ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
|
|
@@ -3458,29 +3669,13 @@ static void ggml_cl_div(ggml_backend_t backend, const ggml_tensor * src0, const
|
|
|
3458
3669
|
size_t global_work_size[] = {(size_t)n, 1, 1};
|
|
3459
3670
|
size_t local_work_size[] = {64, 1, 1};
|
|
3460
3671
|
|
|
3461
|
-
|
|
3462
|
-
cl_event evt;
|
|
3463
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
|
|
3464
|
-
|
|
3465
|
-
g_profiling_info.emplace_back();
|
|
3466
|
-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
|
|
3467
|
-
#else
|
|
3468
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
|
|
3469
|
-
#endif
|
|
3672
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
|
3470
3673
|
} else {
|
|
3471
3674
|
unsigned int nth = MIN(64, ne0);
|
|
3472
3675
|
size_t global_work_size[] = {ne01*nth, (size_t)ne02, (size_t)ne03};
|
|
3473
3676
|
size_t local_work_size[] = {nth, 1, 1};
|
|
3474
3677
|
|
|
3475
|
-
|
|
3476
|
-
cl_event evt;
|
|
3477
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
|
|
3478
|
-
|
|
3479
|
-
g_profiling_info.emplace_back();
|
|
3480
|
-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
|
|
3481
|
-
#else
|
|
3482
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
|
|
3483
|
-
#endif
|
|
3678
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
|
3484
3679
|
}
|
|
3485
3680
|
}
|
|
3486
3681
|
|
|
@@ -3520,7 +3715,6 @@ static void ggml_cl_sub(ggml_backend_t backend, const ggml_tensor * src0, const
|
|
|
3520
3715
|
const cl_ulong nb3 = dst->nb[3];
|
|
3521
3716
|
|
|
3522
3717
|
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
|
3523
|
-
cl_command_queue queue = backend_ctx->queue;
|
|
3524
3718
|
|
|
3525
3719
|
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
|
|
3526
3720
|
ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
|
|
@@ -3583,29 +3777,13 @@ static void ggml_cl_sub(ggml_backend_t backend, const ggml_tensor * src0, const
|
|
|
3583
3777
|
size_t global_work_size[] = {(size_t)n, 1, 1};
|
|
3584
3778
|
size_t local_work_size[] = {64, 1, 1};
|
|
3585
3779
|
|
|
3586
|
-
|
|
3587
|
-
cl_event evt;
|
|
3588
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
|
|
3589
|
-
|
|
3590
|
-
g_profiling_info.emplace_back();
|
|
3591
|
-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
|
|
3592
|
-
#else
|
|
3593
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
|
|
3594
|
-
#endif
|
|
3780
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
|
3595
3781
|
} else {
|
|
3596
3782
|
unsigned int nth = MIN(64, ne0);
|
|
3597
3783
|
size_t global_work_size[] = {ne01*nth, (size_t)ne02, (size_t)ne03};
|
|
3598
3784
|
size_t local_work_size[] = {nth, 1, 1};
|
|
3599
3785
|
|
|
3600
|
-
|
|
3601
|
-
cl_event evt;
|
|
3602
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
|
|
3603
|
-
|
|
3604
|
-
g_profiling_info.emplace_back();
|
|
3605
|
-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
|
|
3606
|
-
#else
|
|
3607
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
|
|
3608
|
-
#endif
|
|
3786
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
|
3609
3787
|
}
|
|
3610
3788
|
}
|
|
3611
3789
|
|
|
@@ -3618,7 +3796,6 @@ static void ggml_cl_gelu(ggml_backend_t backend, const ggml_tensor * src0, const
|
|
|
3618
3796
|
UNUSED(src1);
|
|
3619
3797
|
|
|
3620
3798
|
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
|
3621
|
-
cl_command_queue queue = backend_ctx->queue;
|
|
3622
3799
|
|
|
3623
3800
|
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
|
|
3624
3801
|
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
|
|
@@ -3645,15 +3822,7 @@ static void ggml_cl_gelu(ggml_backend_t backend, const ggml_tensor * src0, const
|
|
|
3645
3822
|
size_t global_work_size[] = {(size_t)n, 1, 1};
|
|
3646
3823
|
size_t local_work_size[] = {64, 1, 1};
|
|
3647
3824
|
|
|
3648
|
-
|
|
3649
|
-
cl_event evt;
|
|
3650
|
-
clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt);
|
|
3651
|
-
|
|
3652
|
-
g_profiling_info.emplace_back();
|
|
3653
|
-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
|
|
3654
|
-
#else
|
|
3655
|
-
clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL);
|
|
3656
|
-
#endif
|
|
3825
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
|
3657
3826
|
}
|
|
3658
3827
|
|
|
3659
3828
|
static void ggml_cl_gelu_quick(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
|
@@ -3665,7 +3834,6 @@ static void ggml_cl_gelu_quick(ggml_backend_t backend, const ggml_tensor * src0,
|
|
|
3665
3834
|
UNUSED(src1);
|
|
3666
3835
|
|
|
3667
3836
|
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
|
3668
|
-
cl_command_queue queue = backend_ctx->queue;
|
|
3669
3837
|
|
|
3670
3838
|
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
|
|
3671
3839
|
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
|
|
@@ -3692,15 +3860,7 @@ static void ggml_cl_gelu_quick(ggml_backend_t backend, const ggml_tensor * src0,
|
|
|
3692
3860
|
size_t global_work_size[] = {(size_t)n, 1, 1};
|
|
3693
3861
|
size_t local_work_size[] = {64, 1, 1};
|
|
3694
3862
|
|
|
3695
|
-
|
|
3696
|
-
cl_event evt;
|
|
3697
|
-
clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt);
|
|
3698
|
-
|
|
3699
|
-
g_profiling_info.emplace_back();
|
|
3700
|
-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
|
|
3701
|
-
#else
|
|
3702
|
-
clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL);
|
|
3703
|
-
#endif
|
|
3863
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
|
3704
3864
|
}
|
|
3705
3865
|
|
|
3706
3866
|
static void ggml_cl_silu(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
|
@@ -3712,7 +3872,6 @@ static void ggml_cl_silu(ggml_backend_t backend, const ggml_tensor * src0, const
|
|
|
3712
3872
|
UNUSED(src1);
|
|
3713
3873
|
|
|
3714
3874
|
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
|
3715
|
-
cl_command_queue queue = backend_ctx->queue;
|
|
3716
3875
|
|
|
3717
3876
|
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
|
|
3718
3877
|
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
|
|
@@ -3744,15 +3903,7 @@ static void ggml_cl_silu(ggml_backend_t backend, const ggml_tensor * src0, const
|
|
|
3744
3903
|
local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
|
|
3745
3904
|
}
|
|
3746
3905
|
|
|
3747
|
-
|
|
3748
|
-
cl_event evt;
|
|
3749
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
|
|
3750
|
-
|
|
3751
|
-
g_profiling_info.emplace_back();
|
|
3752
|
-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
|
|
3753
|
-
#else
|
|
3754
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
|
|
3755
|
-
#endif
|
|
3906
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
|
|
3756
3907
|
}
|
|
3757
3908
|
|
|
3758
3909
|
static void ggml_cl_relu(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
|
@@ -3764,7 +3915,6 @@ static void ggml_cl_relu(ggml_backend_t backend, const ggml_tensor * src0, const
|
|
|
3764
3915
|
UNUSED(src1);
|
|
3765
3916
|
|
|
3766
3917
|
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
|
3767
|
-
cl_command_queue queue = backend_ctx->queue;
|
|
3768
3918
|
|
|
3769
3919
|
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
|
|
3770
3920
|
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
|
|
@@ -3789,15 +3939,7 @@ static void ggml_cl_relu(ggml_backend_t backend, const ggml_tensor * src0, const
|
|
|
3789
3939
|
local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
|
|
3790
3940
|
}
|
|
3791
3941
|
|
|
3792
|
-
|
|
3793
|
-
cl_event evt;
|
|
3794
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
|
|
3795
|
-
|
|
3796
|
-
g_profiling_info.emplace_back();
|
|
3797
|
-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
|
|
3798
|
-
#else
|
|
3799
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
|
|
3800
|
-
#endif
|
|
3942
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
|
|
3801
3943
|
}
|
|
3802
3944
|
|
|
3803
3945
|
static void ggml_cl_sigmoid(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
|
@@ -3809,7 +3951,6 @@ static void ggml_cl_sigmoid(ggml_backend_t backend, const ggml_tensor * src0, co
|
|
|
3809
3951
|
UNUSED(src1);
|
|
3810
3952
|
|
|
3811
3953
|
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
|
3812
|
-
cl_command_queue queue = backend_ctx->queue;
|
|
3813
3954
|
|
|
3814
3955
|
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
|
|
3815
3956
|
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
|
|
@@ -3841,15 +3982,7 @@ static void ggml_cl_sigmoid(ggml_backend_t backend, const ggml_tensor * src0, co
|
|
|
3841
3982
|
local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
|
|
3842
3983
|
}
|
|
3843
3984
|
|
|
3844
|
-
|
|
3845
|
-
cl_event evt;
|
|
3846
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
|
|
3847
|
-
|
|
3848
|
-
g_profiling_info.emplace_back();
|
|
3849
|
-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
|
|
3850
|
-
#else
|
|
3851
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
|
|
3852
|
-
#endif
|
|
3985
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
|
|
3853
3986
|
}
|
|
3854
3987
|
|
|
3855
3988
|
static void ggml_cl_clamp(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
|
@@ -3861,7 +3994,6 @@ static void ggml_cl_clamp(ggml_backend_t backend, const ggml_tensor * src0, cons
|
|
|
3861
3994
|
UNUSED(src1);
|
|
3862
3995
|
|
|
3863
3996
|
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
|
3864
|
-
cl_command_queue queue = backend_ctx->queue;
|
|
3865
3997
|
|
|
3866
3998
|
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
|
|
3867
3999
|
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
|
|
@@ -3893,15 +4025,7 @@ static void ggml_cl_clamp(ggml_backend_t backend, const ggml_tensor * src0, cons
|
|
|
3893
4025
|
local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
|
|
3894
4026
|
}
|
|
3895
4027
|
|
|
3896
|
-
|
|
3897
|
-
cl_event evt;
|
|
3898
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
|
|
3899
|
-
|
|
3900
|
-
g_profiling_info.emplace_back();
|
|
3901
|
-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
|
|
3902
|
-
#else
|
|
3903
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
|
|
3904
|
-
#endif
|
|
4028
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
|
|
3905
4029
|
}
|
|
3906
4030
|
|
|
3907
4031
|
static void ggml_cl_norm(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
|
@@ -3913,7 +4037,6 @@ static void ggml_cl_norm(ggml_backend_t backend, const ggml_tensor * src0, const
|
|
|
3913
4037
|
UNUSED(src1);
|
|
3914
4038
|
|
|
3915
4039
|
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
|
3916
|
-
cl_command_queue queue = backend_ctx->queue;
|
|
3917
4040
|
|
|
3918
4041
|
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
|
|
3919
4042
|
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
|
|
@@ -3954,15 +4077,7 @@ static void ggml_cl_norm(ggml_backend_t backend, const ggml_tensor * src0, const
|
|
|
3954
4077
|
size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
|
|
3955
4078
|
size_t local_work_size[] = {(size_t)nth, 1, 1};
|
|
3956
4079
|
|
|
3957
|
-
|
|
3958
|
-
cl_event evt;
|
|
3959
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
|
|
3960
|
-
|
|
3961
|
-
g_profiling_info.emplace_back();
|
|
3962
|
-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
|
|
3963
|
-
#else
|
|
3964
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
|
|
3965
|
-
#endif
|
|
4080
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
|
3966
4081
|
}
|
|
3967
4082
|
|
|
3968
4083
|
static void ggml_cl_rms_norm(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
|
@@ -3974,7 +4089,6 @@ static void ggml_cl_rms_norm(ggml_backend_t backend, const ggml_tensor * src0, c
|
|
|
3974
4089
|
UNUSED(src1);
|
|
3975
4090
|
|
|
3976
4091
|
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
|
3977
|
-
cl_command_queue queue = backend_ctx->queue;
|
|
3978
4092
|
|
|
3979
4093
|
//ggml_backend_opencl_device_context * dev_ctx =
|
|
3980
4094
|
// (ggml_backend_opencl_device_context *)backend->device->context;
|
|
@@ -4038,15 +4152,7 @@ static void ggml_cl_rms_norm(ggml_backend_t backend, const ggml_tensor * src0, c
|
|
|
4038
4152
|
// This is local memory - the size depends on subgroup size.
|
|
4039
4153
|
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(float)*nth/sgs, NULL));
|
|
4040
4154
|
|
|
4041
|
-
|
|
4042
|
-
cl_event evt;
|
|
4043
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
|
|
4044
|
-
|
|
4045
|
-
g_profiling_info.emplace_back();
|
|
4046
|
-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
|
|
4047
|
-
#else
|
|
4048
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
|
|
4049
|
-
#endif
|
|
4155
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
|
4050
4156
|
}
|
|
4051
4157
|
|
|
4052
4158
|
static void ggml_cl_group_norm(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
|
@@ -4058,7 +4164,6 @@ static void ggml_cl_group_norm(ggml_backend_t backend, const ggml_tensor * src0,
|
|
|
4058
4164
|
UNUSED(src1);
|
|
4059
4165
|
|
|
4060
4166
|
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
|
4061
|
-
cl_command_queue queue = backend_ctx->queue;
|
|
4062
4167
|
|
|
4063
4168
|
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
|
|
4064
4169
|
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
|
|
@@ -4097,15 +4202,487 @@ static void ggml_cl_group_norm(ggml_backend_t backend, const ggml_tensor * src0,
|
|
|
4097
4202
|
size_t global_work_size[] = {(size_t)n_groups*sgs, 1, 1};
|
|
4098
4203
|
size_t local_work_size[] = {(size_t)sgs, 1, 1};
|
|
4099
4204
|
|
|
4100
|
-
|
|
4101
|
-
|
|
4102
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
|
|
4205
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
|
4206
|
+
}
|
|
4103
4207
|
|
|
4104
|
-
|
|
4105
|
-
|
|
4106
|
-
|
|
4107
|
-
|
|
4108
|
-
|
|
4208
|
+
static void ggml_cl_tanh(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
|
4209
|
+
GGML_ASSERT(src0);
|
|
4210
|
+
GGML_ASSERT(src0->extra);
|
|
4211
|
+
GGML_ASSERT(dst);
|
|
4212
|
+
GGML_ASSERT(dst->extra);
|
|
4213
|
+
|
|
4214
|
+
UNUSED(src1);
|
|
4215
|
+
|
|
4216
|
+
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
|
4217
|
+
|
|
4218
|
+
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
|
|
4219
|
+
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
|
|
4220
|
+
|
|
4221
|
+
cl_ulong offset0_abs = extra0->offset + src0->view_offs;
|
|
4222
|
+
cl_ulong offsetd_abs = extrad->offset + dst->view_offs;
|
|
4223
|
+
|
|
4224
|
+
cl_kernel kernel;
|
|
4225
|
+
if (dst->type == GGML_TYPE_F32) {
|
|
4226
|
+
kernel = backend_ctx->kernel_tanh_f32_nd;
|
|
4227
|
+
} else if (dst->type == GGML_TYPE_F16) {
|
|
4228
|
+
kernel = backend_ctx->kernel_tanh_f16_nd;
|
|
4229
|
+
} else {
|
|
4230
|
+
GGML_ASSERT(false && "Unsupported type for ggml_cl_tanh");
|
|
4231
|
+
}
|
|
4232
|
+
GGML_ASSERT(kernel != nullptr);
|
|
4233
|
+
|
|
4234
|
+
const int ne00 = src0->ne[0]; const int ne01 = src0->ne[1]; const int ne02 = src0->ne[2]; const int ne03 = src0->ne[3];
|
|
4235
|
+
const cl_ulong nb00 = src0->nb[0]; const cl_ulong nb01 = src0->nb[1]; const cl_ulong nb02 = src0->nb[2]; const cl_ulong nb03 = src0->nb[3];
|
|
4236
|
+
|
|
4237
|
+
const int ne10 = dst->ne[0]; const int ne11 = dst->ne[1]; const int ne12 = dst->ne[2]; const int ne13 = dst->ne[3];
|
|
4238
|
+
const cl_ulong nb10 = dst->nb[0]; const cl_ulong nb11 = dst->nb[1]; const cl_ulong nb12 = dst->nb[2]; const cl_ulong nb13 = dst->nb[3];
|
|
4239
|
+
|
|
4240
|
+
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
|
|
4241
|
+
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0_abs));
|
|
4242
|
+
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
|
|
4243
|
+
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd_abs));
|
|
4244
|
+
|
|
4245
|
+
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00));
|
|
4246
|
+
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne01));
|
|
4247
|
+
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne02));
|
|
4248
|
+
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne03));
|
|
4249
|
+
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb00));
|
|
4250
|
+
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb01));
|
|
4251
|
+
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong),&nb02));
|
|
4252
|
+
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong),&nb03));
|
|
4253
|
+
|
|
4254
|
+
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne10));
|
|
4255
|
+
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne11));
|
|
4256
|
+
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne12));
|
|
4257
|
+
CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne13));
|
|
4258
|
+
CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong),&nb10));
|
|
4259
|
+
CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong),&nb11));
|
|
4260
|
+
CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong),&nb12));
|
|
4261
|
+
CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong),&nb13));
|
|
4262
|
+
|
|
4263
|
+
size_t global_work_size[3];
|
|
4264
|
+
if (ne10 == 0 || ne11 == 0 || ne12 == 0 || ne13 == 0) { // Handle case of 0 elements
|
|
4265
|
+
return;
|
|
4266
|
+
}
|
|
4267
|
+
global_work_size[0] = (size_t)ne10;
|
|
4268
|
+
global_work_size[1] = (size_t)ne11;
|
|
4269
|
+
global_work_size[2] = (size_t)ne12;
|
|
4270
|
+
|
|
4271
|
+
size_t lws0 = 16, lws1 = 4, lws2 = 1;
|
|
4272
|
+
if (ne10 < 16) lws0 = ne10;
|
|
4273
|
+
if (ne11 < 4) lws1 = ne11;
|
|
4274
|
+
if (ne12 < 1) lws2 = ne12 > 0 ? ne12 : 1;
|
|
4275
|
+
|
|
4276
|
+
while (lws0 * lws1 * lws2 > 256 && lws0 > 1) lws0 /= 2;
|
|
4277
|
+
while (lws0 * lws1 * lws2 > 256 && lws1 > 1) lws1 /= 2;
|
|
4278
|
+
while (lws0 * lws1 * lws2 > 256 && lws2 > 1) lws2 /= 2;
|
|
4279
|
+
|
|
4280
|
+
|
|
4281
|
+
size_t local_work_size[] = {lws0, lws1, lws2};
|
|
4282
|
+
|
|
4283
|
+
size_t* local_work_size_ptr = local_work_size;
|
|
4284
|
+
if (!backend_ctx->non_uniform_workgroups) {
|
|
4285
|
+
if (global_work_size[0] % local_work_size[0] != 0 ||
|
|
4286
|
+
global_work_size[1] % local_work_size[1] != 0 ||
|
|
4287
|
+
global_work_size[2] % local_work_size[2] != 0) {
|
|
4288
|
+
local_work_size_ptr = NULL;
|
|
4289
|
+
}
|
|
4290
|
+
}
|
|
4291
|
+
if (global_work_size[0] == 0 || global_work_size[1] == 0 || global_work_size[2] == 0) return;
|
|
4292
|
+
|
|
4293
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
|
|
4294
|
+
}
|
|
4295
|
+
|
|
4296
|
+
static void ggml_cl_repeat(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1_shape_def, ggml_tensor * dst) {
|
|
4297
|
+
GGML_ASSERT(src0);
|
|
4298
|
+
GGML_ASSERT(src0->extra);
|
|
4299
|
+
GGML_ASSERT(dst);
|
|
4300
|
+
GGML_ASSERT(dst->extra);
|
|
4301
|
+
GGML_ASSERT(dst->type == src0->type);
|
|
4302
|
+
|
|
4303
|
+
UNUSED(src1_shape_def);
|
|
4304
|
+
|
|
4305
|
+
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
|
4306
|
+
|
|
4307
|
+
if (backend_ctx->kernel_repeat == nullptr) {
|
|
4308
|
+
GGML_LOG_WARN("%s: repeat kernel not available, skipping OpenCL execution.\n", __func__);
|
|
4309
|
+
return;
|
|
4310
|
+
}
|
|
4311
|
+
|
|
4312
|
+
ggml_tensor_extra_cl * extra_src0 = (ggml_tensor_extra_cl *)src0->extra;
|
|
4313
|
+
ggml_tensor_extra_cl * extra_dst = (ggml_tensor_extra_cl *)dst->extra;
|
|
4314
|
+
|
|
4315
|
+
cl_ulong off_src0 = extra_src0->offset + src0->view_offs;
|
|
4316
|
+
cl_ulong off_dst = extra_dst->offset + dst->view_offs;
|
|
4317
|
+
|
|
4318
|
+
const int src0_ne0 = src0->ne[0]; const int src0_ne1 = src0->ne[1]; const int src0_ne2 = src0->ne[2]; const int src0_ne3 = src0->ne[3];
|
|
4319
|
+
const cl_ulong src0_nb0 = src0->nb[0]; const cl_ulong src0_nb1 = src0->nb[1]; const cl_ulong src0_nb2 = src0->nb[2]; const cl_ulong src0_nb3 = src0->nb[3];
|
|
4320
|
+
|
|
4321
|
+
const int dst_ne0 = dst->ne[0]; const int dst_ne1 = dst->ne[1]; const int dst_ne2 = dst->ne[2]; const int dst_ne3 = dst->ne[3];
|
|
4322
|
+
const cl_ulong dst_nb0 = dst->nb[0]; const cl_ulong dst_nb1 = dst->nb[1]; const cl_ulong dst_nb2 = dst->nb[2]; const cl_ulong dst_nb3 = dst->nb[3];
|
|
4323
|
+
|
|
4324
|
+
cl_kernel kernel = backend_ctx->kernel_repeat;
|
|
4325
|
+
|
|
4326
|
+
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra_src0->data_device));
|
|
4327
|
+
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra_dst->data_device));
|
|
4328
|
+
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_ulong), &off_src0));
|
|
4329
|
+
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &off_dst));
|
|
4330
|
+
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &src0_ne0));
|
|
4331
|
+
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &src0_ne1));
|
|
4332
|
+
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &src0_ne2));
|
|
4333
|
+
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &src0_ne3));
|
|
4334
|
+
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &src0_nb0));
|
|
4335
|
+
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &src0_nb1));
|
|
4336
|
+
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &src0_nb2));
|
|
4337
|
+
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &src0_nb3));
|
|
4338
|
+
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &dst_ne0));
|
|
4339
|
+
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &dst_ne1));
|
|
4340
|
+
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &dst_ne2));
|
|
4341
|
+
CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &dst_ne3));
|
|
4342
|
+
CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &dst_nb0));
|
|
4343
|
+
CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &dst_nb1));
|
|
4344
|
+
CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &dst_nb2));
|
|
4345
|
+
CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &dst_nb3));
|
|
4346
|
+
|
|
4347
|
+
size_t gws0 = dst_ne1 > 0 ? (size_t)dst_ne1 : 1;
|
|
4348
|
+
size_t gws1 = dst_ne2 > 0 ? (size_t)dst_ne2 : 1;
|
|
4349
|
+
size_t gws2 = dst_ne3 > 0 ? (size_t)dst_ne3 : 1;
|
|
4350
|
+
|
|
4351
|
+
size_t global_work_size[] = { gws0, gws1, gws2 };
|
|
4352
|
+
|
|
4353
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, NULL, dst);
|
|
4354
|
+
}
|
|
4355
|
+
|
|
4356
|
+
static void ggml_cl_pad(ggml_backend_t backend, const ggml_tensor * src0, ggml_tensor * dst) {
|
|
4357
|
+
GGML_ASSERT(src0);
|
|
4358
|
+
GGML_ASSERT(src0->extra);
|
|
4359
|
+
GGML_ASSERT(dst);
|
|
4360
|
+
GGML_ASSERT(dst->extra);
|
|
4361
|
+
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
|
4362
|
+
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
|
4363
|
+
GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1);
|
|
4364
|
+
|
|
4365
|
+
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
|
4366
|
+
|
|
4367
|
+
if (backend_ctx->kernel_pad == nullptr) {
|
|
4368
|
+
GGML_LOG_WARN("%s: pad kernel not available, skipping OpenCL execution.\n", __func__);
|
|
4369
|
+
return;
|
|
4370
|
+
}
|
|
4371
|
+
|
|
4372
|
+
ggml_tensor_extra_cl * extra_src0 = (ggml_tensor_extra_cl *)src0->extra;
|
|
4373
|
+
ggml_tensor_extra_cl * extra_dst = (ggml_tensor_extra_cl *)dst->extra;
|
|
4374
|
+
|
|
4375
|
+
cl_ulong off_src0 = extra_src0->offset + src0->view_offs;
|
|
4376
|
+
cl_ulong off_dst = extra_dst->offset + dst->view_offs;
|
|
4377
|
+
|
|
4378
|
+
const int s_ne0 = src0->ne[0];
|
|
4379
|
+
const int s_ne1 = src0->ne[1];
|
|
4380
|
+
const int s_ne2 = src0->ne[2];
|
|
4381
|
+
|
|
4382
|
+
const int d_ne0 = dst->ne[0];
|
|
4383
|
+
const int d_ne1 = dst->ne[1];
|
|
4384
|
+
const int d_ne2 = dst->ne[2];
|
|
4385
|
+
|
|
4386
|
+
cl_kernel kernel = backend_ctx->kernel_pad;
|
|
4387
|
+
|
|
4388
|
+
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra_src0->data_device));
|
|
4389
|
+
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &off_src0));
|
|
4390
|
+
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra_dst->data_device));
|
|
4391
|
+
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &off_dst));
|
|
4392
|
+
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &s_ne0));
|
|
4393
|
+
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &s_ne1));
|
|
4394
|
+
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &s_ne2));
|
|
4395
|
+
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &d_ne0));
|
|
4396
|
+
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &d_ne1));
|
|
4397
|
+
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &d_ne2));
|
|
4398
|
+
|
|
4399
|
+
size_t lws0 = 64;
|
|
4400
|
+
size_t gws0 = (( (size_t)d_ne0 + lws0 - 1 ) / lws0) * lws0;
|
|
4401
|
+
|
|
4402
|
+
size_t global_work_size[] = { gws0, (size_t)d_ne1, (size_t)d_ne2 };
|
|
4403
|
+
size_t local_work_size[] = { lws0, 1, 1 };
|
|
4404
|
+
|
|
4405
|
+
size_t * local_work_size_ptr = local_work_size;
|
|
4406
|
+
if (d_ne0 % lws0 != 0 && !backend_ctx->non_uniform_workgroups) {
|
|
4407
|
+
local_work_size_ptr = nullptr;
|
|
4408
|
+
}
|
|
4409
|
+
|
|
4410
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
|
|
4411
|
+
}
|
|
4412
|
+
|
|
4413
|
+
static void ggml_cl_upscale(ggml_backend_t backend, const ggml_tensor * src0, ggml_tensor * dst) {
|
|
4414
|
+
GGML_ASSERT(src0);
|
|
4415
|
+
GGML_ASSERT(src0->extra);
|
|
4416
|
+
GGML_ASSERT(dst);
|
|
4417
|
+
GGML_ASSERT(dst->extra);
|
|
4418
|
+
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
|
4419
|
+
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
|
4420
|
+
|
|
4421
|
+
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
|
4422
|
+
|
|
4423
|
+
const ggml_scale_mode mode = (ggml_scale_mode) ggml_get_op_params_i32(dst, 0);
|
|
4424
|
+
cl_kernel kernel = nullptr;
|
|
4425
|
+
|
|
4426
|
+
if (mode == GGML_SCALE_MODE_NEAREST) {
|
|
4427
|
+
kernel = backend_ctx->kernel_upscale;
|
|
4428
|
+
if (kernel == nullptr) {
|
|
4429
|
+
GGML_LOG_WARN("%s: nearest upscale kernel not available, skipping OpenCL execution.\n", __func__);
|
|
4430
|
+
return;
|
|
4431
|
+
}
|
|
4432
|
+
} else if (mode == GGML_SCALE_MODE_BILINEAR) {
|
|
4433
|
+
kernel = backend_ctx->kernel_upscale_bilinear;
|
|
4434
|
+
if (kernel == nullptr) {
|
|
4435
|
+
GGML_LOG_WARN("%s: bilinear upscale kernel not available, skipping OpenCL execution.\n", __func__);
|
|
4436
|
+
return;
|
|
4437
|
+
}
|
|
4438
|
+
} else {
|
|
4439
|
+
GGML_LOG_WARN("%s: unsupported upscale mode %d, skipping OpenCL execution.\n", __func__, mode);
|
|
4440
|
+
return;
|
|
4441
|
+
}
|
|
4442
|
+
|
|
4443
|
+
ggml_tensor_extra_cl * extra_src0 = (ggml_tensor_extra_cl *)src0->extra;
|
|
4444
|
+
ggml_tensor_extra_cl * extra_dst = (ggml_tensor_extra_cl *)dst->extra;
|
|
4445
|
+
|
|
4446
|
+
cl_ulong off_src0 = extra_src0->offset + src0->view_offs;
|
|
4447
|
+
cl_ulong off_dst = extra_dst->offset + dst->view_offs;
|
|
4448
|
+
|
|
4449
|
+
const cl_ulong nb00 = src0->nb[0];
|
|
4450
|
+
const cl_ulong nb01 = src0->nb[1];
|
|
4451
|
+
const cl_ulong nb02 = src0->nb[2];
|
|
4452
|
+
const cl_ulong nb03 = src0->nb[3];
|
|
4453
|
+
|
|
4454
|
+
const int ne00_src = src0->ne[0];
|
|
4455
|
+
const int ne01_src = src0->ne[1];
|
|
4456
|
+
|
|
4457
|
+
const int ne10_dst = dst->ne[0];
|
|
4458
|
+
const int ne11_dst = dst->ne[1];
|
|
4459
|
+
const int ne12_dst = dst->ne[2];
|
|
4460
|
+
const int ne13_dst = dst->ne[3];
|
|
4461
|
+
|
|
4462
|
+
const float sf0 = (float)dst->ne[0] / src0->ne[0];
|
|
4463
|
+
const float sf1 = (float)dst->ne[1] / src0->ne[1];
|
|
4464
|
+
const float sf2 = (float)dst->ne[2] / src0->ne[2];
|
|
4465
|
+
const float sf3 = (float)dst->ne[3] / src0->ne[3];
|
|
4466
|
+
|
|
4467
|
+
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra_src0->data_device));
|
|
4468
|
+
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &off_src0));
|
|
4469
|
+
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra_dst->data_device));
|
|
4470
|
+
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &off_dst));
|
|
4471
|
+
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_ulong), &nb00));
|
|
4472
|
+
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &nb01));
|
|
4473
|
+
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_ulong), &nb02));
|
|
4474
|
+
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &nb03));
|
|
4475
|
+
|
|
4476
|
+
if (mode == GGML_SCALE_MODE_NEAREST) {
|
|
4477
|
+
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne10_dst));
|
|
4478
|
+
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne11_dst));
|
|
4479
|
+
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne12_dst));
|
|
4480
|
+
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne13_dst));
|
|
4481
|
+
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(float), &sf0));
|
|
4482
|
+
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(float), &sf1));
|
|
4483
|
+
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(float), &sf2));
|
|
4484
|
+
CL_CHECK(clSetKernelArg(kernel, 15, sizeof(float), &sf3));
|
|
4485
|
+
} else if (mode == GGML_SCALE_MODE_BILINEAR) {
|
|
4486
|
+
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne00_src));
|
|
4487
|
+
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne01_src));
|
|
4488
|
+
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne10_dst));
|
|
4489
|
+
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne11_dst));
|
|
4490
|
+
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne12_dst));
|
|
4491
|
+
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne13_dst));
|
|
4492
|
+
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(float), &sf0));
|
|
4493
|
+
CL_CHECK(clSetKernelArg(kernel, 15, sizeof(float), &sf1));
|
|
4494
|
+
CL_CHECK(clSetKernelArg(kernel, 16, sizeof(float), &sf2));
|
|
4495
|
+
CL_CHECK(clSetKernelArg(kernel, 17, sizeof(float), &sf3));
|
|
4496
|
+
}
|
|
4497
|
+
|
|
4498
|
+
|
|
4499
|
+
size_t dst_total_elements = (size_t)ne10_dst * ne11_dst * ne12_dst * ne13_dst;
|
|
4500
|
+
if (dst_total_elements == 0) {
|
|
4501
|
+
return;
|
|
4502
|
+
}
|
|
4503
|
+
size_t global_work_size[] = { dst_total_elements, 1, 1 };
|
|
4504
|
+
size_t local_work_size_pref = 256;
|
|
4505
|
+
size_t local_work_size[] = { MIN(local_work_size_pref, dst_total_elements), 1, 1};
|
|
4506
|
+
|
|
4507
|
+
size_t * local_work_size_ptr = local_work_size;
|
|
4508
|
+
if (dst_total_elements % local_work_size[0] != 0 && !backend_ctx->non_uniform_workgroups) {
|
|
4509
|
+
local_work_size_ptr = nullptr;
|
|
4510
|
+
}
|
|
4511
|
+
|
|
4512
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
|
|
4513
|
+
}
|
|
4514
|
+
|
|
4515
|
+
static void ggml_cl_concat(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
|
4516
|
+
GGML_ASSERT(src0);
|
|
4517
|
+
GGML_ASSERT(src0->extra);
|
|
4518
|
+
GGML_ASSERT(src1);
|
|
4519
|
+
GGML_ASSERT(src1->extra);
|
|
4520
|
+
GGML_ASSERT(dst);
|
|
4521
|
+
GGML_ASSERT(dst->extra);
|
|
4522
|
+
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
|
4523
|
+
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
|
4524
|
+
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
|
4525
|
+
|
|
4526
|
+
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
|
4527
|
+
cl_command_queue queue = backend_ctx->queue;
|
|
4528
|
+
|
|
4529
|
+
if (backend_ctx->kernel_concat_f32_contiguous == nullptr || backend_ctx->kernel_concat_f32_non_contiguous == nullptr) {
|
|
4530
|
+
GGML_LOG_WARN("%s: concat kernels not available, skipping OpenCL execution.\n", __func__);
|
|
4531
|
+
return;
|
|
4532
|
+
}
|
|
4533
|
+
|
|
4534
|
+
ggml_tensor_extra_cl * extra0_cl = (ggml_tensor_extra_cl *)src0->extra;
|
|
4535
|
+
ggml_tensor_extra_cl * extra1_cl = (ggml_tensor_extra_cl *)src1->extra;
|
|
4536
|
+
ggml_tensor_extra_cl * extrad_cl = (ggml_tensor_extra_cl *)dst->extra;
|
|
4537
|
+
|
|
4538
|
+
cl_ulong off_src0 = extra0_cl->offset + src0->view_offs;
|
|
4539
|
+
cl_ulong off_src1 = extra1_cl->offset + src1->view_offs;
|
|
4540
|
+
cl_ulong off_dst = extrad_cl->offset + dst->view_offs;
|
|
4541
|
+
|
|
4542
|
+
const int32_t dim = ((const int32_t *) dst->op_params)[0];
|
|
4543
|
+
GGML_ASSERT(dim >= 0 && dim <= 3);
|
|
4544
|
+
|
|
4545
|
+
if (ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && ggml_is_contiguous(dst)) {
|
|
4546
|
+
if (dim == 3) {
|
|
4547
|
+
|
|
4548
|
+
size_t nbytes_src0 = ggml_nbytes(src0);
|
|
4549
|
+
size_t nbytes_src1 = ggml_nbytes(src1);
|
|
4550
|
+
|
|
4551
|
+
CL_CHECK(clEnqueueCopyBuffer(queue, extra0_cl->data_device, extrad_cl->data_device,
|
|
4552
|
+
off_src0, off_dst, nbytes_src0, 0, NULL, NULL));
|
|
4553
|
+
CL_CHECK(clEnqueueCopyBuffer(queue, extra1_cl->data_device, extrad_cl->data_device,
|
|
4554
|
+
off_src1, off_dst + nbytes_src0, nbytes_src1, 0, NULL, NULL));
|
|
4555
|
+
} else {
|
|
4556
|
+
|
|
4557
|
+
cl_kernel kernel = backend_ctx->kernel_concat_f32_contiguous;
|
|
4558
|
+
size_t global_work_size[3];
|
|
4559
|
+
|
|
4560
|
+
for (int i3 = 0; i3 < dst->ne[3]; ++i3) {
|
|
4561
|
+
cl_ulong current_off_src0 = off_src0 + (i3 * src0->nb[3]);
|
|
4562
|
+
cl_ulong current_off_src1 = off_src1 + (i3 * src1->nb[3]);
|
|
4563
|
+
cl_ulong current_off_dst = off_dst + (i3 * dst->nb[3]);
|
|
4564
|
+
|
|
4565
|
+
int d_ne00 = src0->ne[0]; int d_ne01 = src0->ne[1]; int d_ne02 = src0->ne[2];
|
|
4566
|
+
int d_ne10 = src1->ne[0]; int d_ne11 = src1->ne[1]; int d_ne12 = src1->ne[2];
|
|
4567
|
+
int d_ne0 = dst->ne[0]; int d_ne1 = dst->ne[1]; int d_ne2 = dst->ne[2];
|
|
4568
|
+
|
|
4569
|
+
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_cl->data_device));
|
|
4570
|
+
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &current_off_src0));
|
|
4571
|
+
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1_cl->data_device));
|
|
4572
|
+
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &current_off_src1));
|
|
4573
|
+
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad_cl->data_device));
|
|
4574
|
+
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &current_off_dst));
|
|
4575
|
+
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &d_ne00));
|
|
4576
|
+
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &d_ne01));
|
|
4577
|
+
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &d_ne02));
|
|
4578
|
+
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &d_ne10));
|
|
4579
|
+
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &d_ne11));
|
|
4580
|
+
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &d_ne12));
|
|
4581
|
+
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &d_ne0));
|
|
4582
|
+
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &d_ne1));
|
|
4583
|
+
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &d_ne2));
|
|
4584
|
+
CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &dim));
|
|
4585
|
+
|
|
4586
|
+
global_work_size[0] = d_ne0;
|
|
4587
|
+
global_work_size[1] = d_ne1;
|
|
4588
|
+
global_work_size[2] = d_ne2;
|
|
4589
|
+
|
|
4590
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, NULL, dst);
|
|
4591
|
+
}
|
|
4592
|
+
}
|
|
4593
|
+
} else {
|
|
4594
|
+
cl_kernel kernel = backend_ctx->kernel_concat_f32_non_contiguous;
|
|
4595
|
+
|
|
4596
|
+
long ne00 = src0->ne[0], ne01 = src0->ne[1], ne02 = src0->ne[2], ne03 = src0->ne[3];
|
|
4597
|
+
cl_ulong nb00 = src0->nb[0], nb01 = src0->nb[1], nb02 = src0->nb[2], nb03 = src0->nb[3];
|
|
4598
|
+
|
|
4599
|
+
cl_ulong nb10 = src1->nb[0], nb11 = src1->nb[1], nb12 = src1->nb[2], nb13 = src1->nb[3];
|
|
4600
|
+
|
|
4601
|
+
long d_ne0 = dst->ne[0], d_ne1 = dst->ne[1], d_ne2 = dst->ne[2], d_ne3 = dst->ne[3];
|
|
4602
|
+
cl_ulong d_nb0 = dst->nb[0], d_nb1 = dst->nb[1], d_nb2 = dst->nb[2], d_nb3 = dst->nb[3];
|
|
4603
|
+
|
|
4604
|
+
|
|
4605
|
+
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_cl->data_device));
|
|
4606
|
+
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &off_src0));
|
|
4607
|
+
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1_cl->data_device));
|
|
4608
|
+
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &off_src1));
|
|
4609
|
+
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad_cl->data_device));
|
|
4610
|
+
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &off_dst));
|
|
4611
|
+
|
|
4612
|
+
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(long), &ne00));
|
|
4613
|
+
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(long), &ne01));
|
|
4614
|
+
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(long), &ne02));
|
|
4615
|
+
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(long), &ne03));
|
|
4616
|
+
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb00));
|
|
4617
|
+
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb01));
|
|
4618
|
+
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb02));
|
|
4619
|
+
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb03));
|
|
4620
|
+
|
|
4621
|
+
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb10));
|
|
4622
|
+
CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &nb11));
|
|
4623
|
+
CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb12));
|
|
4624
|
+
CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb13));
|
|
4625
|
+
|
|
4626
|
+
CL_CHECK(clSetKernelArg(kernel, 18, sizeof(long), &d_ne0));
|
|
4627
|
+
CL_CHECK(clSetKernelArg(kernel, 19, sizeof(long), &d_ne1));
|
|
4628
|
+
CL_CHECK(clSetKernelArg(kernel, 20, sizeof(long), &d_ne2));
|
|
4629
|
+
CL_CHECK(clSetKernelArg(kernel, 21, sizeof(long), &d_ne3));
|
|
4630
|
+
CL_CHECK(clSetKernelArg(kernel, 22, sizeof(cl_ulong), &d_nb0));
|
|
4631
|
+
CL_CHECK(clSetKernelArg(kernel, 23, sizeof(cl_ulong), &d_nb1));
|
|
4632
|
+
CL_CHECK(clSetKernelArg(kernel, 24, sizeof(cl_ulong), &d_nb2));
|
|
4633
|
+
CL_CHECK(clSetKernelArg(kernel, 25, sizeof(cl_ulong), &d_nb3));
|
|
4634
|
+
CL_CHECK(clSetKernelArg(kernel, 26, sizeof(int), &dim));
|
|
4635
|
+
|
|
4636
|
+
size_t global_work_size_nc[] = { d_ne1 > 0 ? (size_t)d_ne1 : 1,
|
|
4637
|
+
d_ne2 > 0 ? (size_t)d_ne2 : 1,
|
|
4638
|
+
d_ne3 > 0 ? (size_t)d_ne3 : 1 };
|
|
4639
|
+
|
|
4640
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size_nc, NULL, dst);
|
|
4641
|
+
}
|
|
4642
|
+
}
|
|
4643
|
+
|
|
4644
|
+
// Enqueues the OpenCL timestep_embedding kernel that fills `dst` from `src0`.
//
// Both tensors must be F32 and must carry OpenCL extras. The first two entries of
// dst->op_params are forwarded to the kernel together with dst->nb[1]. When the
// kernel handle is null a warning is logged and nothing is enqueued.
static void ggml_cl_timestep_embedding(ggml_backend_t backend, const ggml_tensor * src0, ggml_tensor * dst) {
    GGML_ASSERT(src0);
    GGML_ASSERT(src0->extra);
    GGML_ASSERT(dst);
    GGML_ASSERT(dst->extra);
    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT(dst->type == GGML_TYPE_F32);

    ggml_backend_opencl_context * ctx = static_cast<ggml_backend_opencl_context *>(backend->context);

    cl_kernel kernel = ctx->kernel_timestep_embedding;
    if (kernel == nullptr) {
        GGML_LOG_WARN("%s: timestep_embedding kernel not available, skipping OpenCL execution.\n", __func__);
        return;
    }

    ggml_tensor_extra_cl * src_extra = static_cast<ggml_tensor_extra_cl *>(src0->extra);
    ggml_tensor_extra_cl * dst_extra = static_cast<ggml_tensor_extra_cl *>(dst->extra);

    // Byte offsets into the device buffers, including any view offset.
    cl_ulong src_offset = src_extra->offset + src0->view_offs;
    cl_ulong dst_offset = dst_extra->offset + dst->view_offs;

    // Scalar kernel arguments (passed as plain int).
    const int row_stride_bytes = dst->nb[1];
    const int embed_dim        = dst->op_params[0];
    const int period_max       = dst->op_params[1];

    CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem),   &src_extra->data_device));
    CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &src_offset));
    CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem),   &dst_extra->data_device));
    CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &dst_offset));
    CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int),      &row_stride_bytes));
    CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int),      &embed_dim));
    CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int),      &period_max));

    // Dimension 0 covers ceil(dim / 2) + 1 work-items, dimension 1 one per element of src0's first axis.
    size_t global_work_size[] = {
        (size_t)(((embed_dim + 1) / 2) + 1),
        (size_t)src0->ne[0],
        1,
    };

    // Local size is left to the driver (NULL).
    ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, NULL, dst);
}
|
|
4110
4687
|
|
|
4111
4688
|
static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
|
@@ -4120,7 +4697,6 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
|
|
|
4120
4697
|
const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT;
|
|
4121
4698
|
|
|
4122
4699
|
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
|
4123
|
-
cl_command_queue queue = backend_ctx->queue;
|
|
4124
4700
|
|
|
4125
4701
|
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
|
|
4126
4702
|
ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
|
|
@@ -4325,15 +4901,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
|
|
|
4325
4901
|
static_cast<size_t>(padded_height_B)
|
|
4326
4902
|
};
|
|
4327
4903
|
|
|
4328
|
-
|
|
4329
|
-
cl_event evt;
|
|
4330
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global_size_t, local_size_t, 0, NULL, &evt));
|
|
4331
|
-
|
|
4332
|
-
g_profiling_info.emplace_back();
|
|
4333
|
-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_size_t, local_size_t, dst);
|
|
4334
|
-
#else
|
|
4335
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global_size_t, local_size_t, 0, NULL, NULL));
|
|
4336
|
-
#endif
|
|
4904
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 2, global_size_t, local_size_t, dst);
|
|
4337
4905
|
} else {
|
|
4338
4906
|
// no need to transpose B in other cases
|
|
4339
4907
|
// create an image for B from sub_buffer
|
|
@@ -4455,16 +5023,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
|
|
|
4455
5023
|
|
|
4456
5024
|
// enqueue kernel with profiling
|
|
4457
5025
|
// <--------------------------------------------> //
|
|
4458
|
-
|
|
4459
|
-
cl_event evt;
|
|
4460
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
|
|
4461
|
-
|
|
4462
|
-
g_profiling_info.emplace_back();
|
|
4463
|
-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
|
|
4464
|
-
// enqueue kernel without profiling
|
|
4465
|
-
#else
|
|
4466
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
|
|
4467
|
-
#endif
|
|
5026
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
|
4468
5027
|
// <--------------------------------------------> //
|
|
4469
5028
|
|
|
4470
5029
|
// deallocate sub buffers and images
|
|
@@ -4544,15 +5103,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
|
|
|
4544
5103
|
global_work_size[2] = (size_t)ne12*ne13;
|
|
4545
5104
|
}
|
|
4546
5105
|
|
|
4547
|
-
|
|
4548
|
-
cl_event evt;
|
|
4549
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
|
|
4550
|
-
|
|
4551
|
-
g_profiling_info.emplace_back();
|
|
4552
|
-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
|
|
4553
|
-
#else
|
|
4554
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
|
|
4555
|
-
#endif
|
|
5106
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
|
4556
5107
|
return;
|
|
4557
5108
|
}
|
|
4558
5109
|
#else // GGML_OPENCL_SOA_Q
|
|
@@ -4782,15 +5333,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
|
|
|
4782
5333
|
size_t global_work_size[] = {(size_t)(ne01 + ndst-1)/ndst*nth0, (size_t)ne11*nth1, (size_t)ne12*ne13};
|
|
4783
5334
|
size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1};
|
|
4784
5335
|
|
|
4785
|
-
|
|
4786
|
-
cl_event evt;
|
|
4787
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
|
|
4788
|
-
|
|
4789
|
-
g_profiling_info.emplace_back();
|
|
4790
|
-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
|
|
4791
|
-
#else
|
|
4792
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
|
|
4793
|
-
#endif
|
|
5336
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
|
4794
5337
|
} else if (src0t == GGML_TYPE_Q4_K) {
|
|
4795
5338
|
GGML_ASSERT(false && "not implemented");
|
|
4796
5339
|
} else if (src0t == GGML_TYPE_Q3_K) {
|
|
@@ -4801,31 +5344,136 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
|
|
|
4801
5344
|
size_t global_work_size[] = {(size_t)(ne01+1)/2*nth0, (size_t)ne11*nth1, (size_t)ne12*ne13};
|
|
4802
5345
|
size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1};
|
|
4803
5346
|
|
|
4804
|
-
|
|
4805
|
-
cl_event evt;
|
|
4806
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
|
|
4807
|
-
|
|
4808
|
-
g_profiling_info.emplace_back();
|
|
4809
|
-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
|
|
4810
|
-
#else
|
|
4811
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
|
|
4812
|
-
#endif
|
|
5347
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
|
4813
5348
|
} else {
|
|
4814
5349
|
int64_t ny = (ne11 + nrows - 1)/nrows;
|
|
4815
5350
|
|
|
4816
5351
|
size_t global_work_size[] = {(size_t)ne01*nth0, (size_t)ny*nth1, (size_t)ne12*ne13};
|
|
4817
5352
|
size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1};
|
|
4818
5353
|
|
|
4819
|
-
|
|
4820
|
-
|
|
4821
|
-
|
|
5354
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
|
5355
|
+
}
|
|
5356
|
+
}
|
|
4822
5357
|
|
|
4823
|
-
|
|
4824
|
-
|
|
4825
|
-
|
|
4826
|
-
|
|
5358
|
+
// Enqueues the OpenCL kernel for the indexed matrix multiplication (MUL_MAT_ID) op.
//
//   src0 - weights; only GGML_TYPE_Q4_0 is handled, and it is accessed through the
//          ggml_tensor_extra_cl_q4_0 extra (its `q` and `d` buffers)
//   src1 - second operand; src1->ne[0] must equal src0->ne[0]
//   dst  - output tensor; dst->src[2] supplies the third operand (src2), whose
//          ne[0] is the number of used experts and ne[1] the number of rows
//
// Any other src0 type aborts with "not implemented". The Q4_0 path needs the
// ggml_tensor_extra_cl_q4_0 extra, which is only declared when GGML_OPENCL_SOA_Q is
// defined, so the case is compiled under the same guard; without the macro Q4_0
// falls through to the "not implemented" assert instead of failing to compile.
static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    GGML_ASSERT(src0);
    GGML_ASSERT(src0->extra);
    GGML_ASSERT(src1);
    GGML_ASSERT(src1->extra);
    GGML_ASSERT(dst);
    GGML_ASSERT(dst->extra);

    const ggml_tensor * src2 = dst->src[2];
    GGML_ASSERT(src2);
    GGML_ASSERT(src2->extra);

    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;

    ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
    ggml_tensor_extra_cl * extra2 = (ggml_tensor_extra_cl *)src2->extra;
    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;

    // Byte offsets into the device buffers, including any view offset.
    cl_ulong offset1 = extra1->offset + src1->view_offs;
    cl_ulong offset2 = extra2->offset + src2->view_offs;
    cl_ulong offsetd = extrad->offset + dst->view_offs;

#ifdef GGML_OPENCL_SOA_Q
    ggml_tensor_extra_cl_q4_0 * extra0_q4_0 = (ggml_tensor_extra_cl_q4_0 *)src0->extra;
#endif

    const int ne00 = src0->ne[0];
    const int ne01 = src0->ne[1];
    const int ne02 = src0->ne[2];
    const int ne03 = src0->ne[3];

    const cl_ulong nb00 = src0->nb[0];
    const cl_ulong nb02 = src0->nb[2];

    const int ne10 = src1->ne[0];
    const int ne11 = src1->ne[1];
    const int ne12 = src1->ne[2];
    const int ne13 = src1->ne[3];

    const cl_ulong nb11 = src1->nb[1];
    const cl_ulong nb12 = src1->nb[2];

    const int ne20 = src2->ne[0];
    const int ne21 = src2->ne[1];

    const cl_ulong nb21 = src2->nb[1];

    const int ne0 = dst->ne[0];
    const int ne1 = dst->ne[1];

    const int r2 = ne12/ne02;
    const int r3 = ne13/ne03;
    const int dst_rows = ne20*ne21; // ne20 = n_used_experts, ne21 = n_rows

    GGML_ASSERT(ne00 == ne10);

    int sgs   = 32; // subgroup size
    int nsg   = 1;  // number of subgroups
    int nrows = 1;  // number of row in src1
    int ndst  = 4;  // number of values produced by each subgroup

    // Initialised so the handle is never indeterminate on the default path.
    cl_kernel kernel = nullptr;

    // subgroup mat vec
    switch (src0->type) {
#ifdef GGML_OPENCL_SOA_Q
        case GGML_TYPE_Q4_0: {
            kernel = backend_ctx->kernel_mul_mv_id_q4_0_f32_8x_flat;

            // Launch geometry is chosen per GPU family; other families are not supported.
            if (backend_ctx->gpu_family == INTEL) {
                sgs  = 16;
                nsg  = 1;
                ndst = 8;
            } else if (backend_ctx->gpu_family == ADRENO) {
                sgs  = 64;
                nsg  = 1;
                ndst = 8;
            } else {
                GGML_ASSERT(false && "TODO: Unknown GPU");
            }

            CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &extra0_q4_0->q));
            CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_mem),   &extra0_q4_0->d));
            CL_CHECK(clSetKernelArg(kernel,  2, sizeof(cl_mem),   &extra1->data_device));
            CL_CHECK(clSetKernelArg(kernel,  3, sizeof(cl_ulong), &offset1));
            CL_CHECK(clSetKernelArg(kernel,  4, sizeof(cl_mem),   &extra2->data_device));
            CL_CHECK(clSetKernelArg(kernel,  5, sizeof(cl_ulong), &offset2));
            CL_CHECK(clSetKernelArg(kernel,  6, sizeof(cl_mem),   &extrad->data_device));
            CL_CHECK(clSetKernelArg(kernel,  7, sizeof(cl_ulong), &offsetd));
            CL_CHECK(clSetKernelArg(kernel,  8, sizeof(int),      &ne00));
            CL_CHECK(clSetKernelArg(kernel,  9, sizeof(int),      &ne01));
            CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int),      &ne02));
            CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb00));
            CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb02));
            CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int),      &ne10));
            CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int),      &ne11));
            CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int),      &ne12));
            CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb11));
            CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb12));
            CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int),      &ne20));
            CL_CHECK(clSetKernelArg(kernel, 19, sizeof(int),      &ne21));
            CL_CHECK(clSetKernelArg(kernel, 20, sizeof(cl_ulong), &nb21));
            CL_CHECK(clSetKernelArg(kernel, 21, sizeof(int),      &ne0));
            CL_CHECK(clSetKernelArg(kernel, 22, sizeof(int),      &ne1));
            CL_CHECK(clSetKernelArg(kernel, 23, sizeof(int),      &r2));
            CL_CHECK(clSetKernelArg(kernel, 24, sizeof(int),      &r3));

            break;
        }
#endif // GGML_OPENCL_SOA_Q
        default:
            GGML_ASSERT(false && "not implemented");
    }

    // Dimension 2 of the launch covers one slot per (used expert, row) pair.
    int _ne1 = 1;
    int ne123 = dst_rows;

    size_t global_work_size[] = {(size_t)(ne01+ndst*nsg-1)/(ndst*nsg)*sgs, (size_t)(_ne1+nrows-1)/nrows*nsg, (size_t)ne123};
    size_t local_work_size[] = {(size_t)sgs, (size_t)nsg, 1};

    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
}
|
|
4830
5478
|
|
|
4831
5479
|
static void ggml_cl_scale(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
|
@@ -4838,7 +5486,6 @@ static void ggml_cl_scale(ggml_backend_t backend, const ggml_tensor * src0, cons
|
|
|
4838
5486
|
GGML_ASSERT(ggml_is_contiguous(src0));
|
|
4839
5487
|
|
|
4840
5488
|
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
|
4841
|
-
cl_command_queue queue = backend_ctx->queue;
|
|
4842
5489
|
|
|
4843
5490
|
float scale;
|
|
4844
5491
|
memcpy(&scale, dst->op_params, sizeof(scale));
|
|
@@ -4867,15 +5514,7 @@ static void ggml_cl_scale(ggml_backend_t backend, const ggml_tensor * src0, cons
|
|
|
4867
5514
|
local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
|
|
4868
5515
|
}
|
|
4869
5516
|
|
|
4870
|
-
|
|
4871
|
-
cl_event evt;
|
|
4872
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
|
|
4873
|
-
|
|
4874
|
-
g_profiling_info.emplace_back();
|
|
4875
|
-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
|
|
4876
|
-
#else
|
|
4877
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
|
|
4878
|
-
#endif
|
|
5517
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
|
|
4879
5518
|
}
|
|
4880
5519
|
|
|
4881
5520
|
static void ggml_cl_cpy(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
|
@@ -4912,7 +5551,6 @@ static void ggml_cl_cpy(ggml_backend_t backend, const ggml_tensor * src0, const
|
|
|
4912
5551
|
const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT;
|
|
4913
5552
|
|
|
4914
5553
|
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
|
4915
|
-
cl_command_queue queue = backend_ctx->queue;
|
|
4916
5554
|
|
|
4917
5555
|
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
|
|
4918
5556
|
ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
|
|
@@ -4977,15 +5615,7 @@ static void ggml_cl_cpy(ggml_backend_t backend, const ggml_tensor * src0, const
|
|
|
4977
5615
|
size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
|
|
4978
5616
|
size_t local_work_size[] = {(size_t)nth, 1, 1};
|
|
4979
5617
|
|
|
4980
|
-
|
|
4981
|
-
cl_event evt;
|
|
4982
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
|
|
4983
|
-
|
|
4984
|
-
g_profiling_info.emplace_back();
|
|
4985
|
-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, src1);
|
|
4986
|
-
#else
|
|
4987
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
|
|
4988
|
-
#endif
|
|
5618
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, src1);
|
|
4989
5619
|
}
|
|
4990
5620
|
|
|
4991
5621
|
static void ggml_cl_dup(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
|
@@ -5008,7 +5638,6 @@ static void ggml_cl_diag_mask_inf(ggml_backend_t backend, const ggml_tensor * sr
|
|
|
5008
5638
|
const int ne02 = src0 ? src0->ne[2] : 0;
|
|
5009
5639
|
|
|
5010
5640
|
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
|
5011
|
-
cl_command_queue queue = backend_ctx->queue;
|
|
5012
5641
|
|
|
5013
5642
|
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
|
|
5014
5643
|
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
|
|
@@ -5032,15 +5661,7 @@ static void ggml_cl_diag_mask_inf(ggml_backend_t backend, const ggml_tensor * sr
|
|
|
5032
5661
|
size_t global_work_size[] = {(size_t)ne00*ne01*ne02/8, 1, 1};
|
|
5033
5662
|
size_t local_work_size[] = {64, 1, 1};
|
|
5034
5663
|
|
|
5035
|
-
|
|
5036
|
-
cl_event evt;
|
|
5037
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
|
|
5038
|
-
|
|
5039
|
-
g_profiling_info.emplace_back();
|
|
5040
|
-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
|
|
5041
|
-
#else
|
|
5042
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
|
|
5043
|
-
#endif
|
|
5664
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
|
5044
5665
|
} else {
|
|
5045
5666
|
kernel = backend_ctx->kernel_diag_mask_inf;
|
|
5046
5667
|
|
|
@@ -5060,15 +5681,7 @@ static void ggml_cl_diag_mask_inf(ggml_backend_t backend, const ggml_tensor * sr
|
|
|
5060
5681
|
local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
|
|
5061
5682
|
}
|
|
5062
5683
|
|
|
5063
|
-
|
|
5064
|
-
cl_event evt;
|
|
5065
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
|
|
5066
|
-
|
|
5067
|
-
g_profiling_info.emplace_back();
|
|
5068
|
-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
|
|
5069
|
-
#else
|
|
5070
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
|
|
5071
|
-
#endif
|
|
5684
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
|
|
5072
5685
|
}
|
|
5073
5686
|
}
|
|
5074
5687
|
|
|
@@ -5088,7 +5701,6 @@ static void ggml_cl_soft_max(ggml_backend_t backend, const ggml_tensor * src0, c
|
|
|
5088
5701
|
}
|
|
5089
5702
|
|
|
5090
5703
|
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
|
5091
|
-
cl_command_queue queue = backend_ctx->queue;
|
|
5092
5704
|
|
|
5093
5705
|
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
|
|
5094
5706
|
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
|
|
@@ -5168,15 +5780,7 @@ static void ggml_cl_soft_max(ggml_backend_t backend, const ggml_tensor * src0, c
|
|
|
5168
5780
|
size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
|
|
5169
5781
|
size_t local_work_size[] = {(size_t)nth, 1, 1};
|
|
5170
5782
|
|
|
5171
|
-
|
|
5172
|
-
cl_event evt;
|
|
5173
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
|
|
5174
|
-
|
|
5175
|
-
g_profiling_info.emplace_back();
|
|
5176
|
-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
|
|
5177
|
-
#else
|
|
5178
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
|
|
5179
|
-
#endif
|
|
5783
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
|
5180
5784
|
}
|
|
5181
5785
|
|
|
5182
5786
|
static void ggml_cl_rope(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
|
@@ -5188,7 +5792,6 @@ static void ggml_cl_rope(ggml_backend_t backend, const ggml_tensor * src0, const
|
|
|
5188
5792
|
GGML_ASSERT(dst->extra);
|
|
5189
5793
|
|
|
5190
5794
|
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
|
5191
|
-
cl_command_queue queue = backend_ctx->queue;
|
|
5192
5795
|
|
|
5193
5796
|
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
|
|
5194
5797
|
ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
|
|
@@ -5354,15 +5957,7 @@ static void ggml_cl_rope(ggml_backend_t backend, const ggml_tensor * src0, const
|
|
|
5354
5957
|
size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
|
|
5355
5958
|
size_t local_work_size[] = {(size_t)nth, 1, 1};
|
|
5356
5959
|
|
|
5357
|
-
|
|
5358
|
-
cl_event evt;
|
|
5359
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
|
|
5360
|
-
|
|
5361
|
-
g_profiling_info.emplace_back();
|
|
5362
|
-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
|
|
5363
|
-
#else
|
|
5364
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
|
|
5365
|
-
#endif
|
|
5960
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
|
5366
5961
|
}
|
|
5367
5962
|
|
|
5368
5963
|
static void ggml_cl_im2col(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
|
@@ -5377,7 +5972,6 @@ static void ggml_cl_im2col(ggml_backend_t backend, const ggml_tensor * src0, con
|
|
|
5377
5972
|
GGML_ASSERT(dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32);
|
|
5378
5973
|
|
|
5379
5974
|
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
|
5380
|
-
cl_command_queue queue = backend_ctx->queue;
|
|
5381
5975
|
|
|
5382
5976
|
ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
|
|
5383
5977
|
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
|
|
@@ -5446,15 +6040,7 @@ static void ggml_cl_im2col(ggml_backend_t backend, const ggml_tensor * src0, con
|
|
|
5446
6040
|
size_t global_work_size[] = {(size_t)num_blocks*256, (size_t)OH, (size_t)batch*IC};
|
|
5447
6041
|
size_t local_work_size[] = {256, 1, 1};
|
|
5448
6042
|
|
|
5449
|
-
|
|
5450
|
-
cl_event evt;
|
|
5451
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
|
|
5452
|
-
|
|
5453
|
-
g_profiling_info.emplace_back();
|
|
5454
|
-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
|
|
5455
|
-
#else
|
|
5456
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
|
|
5457
|
-
#endif
|
|
6043
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
|
5458
6044
|
}
|
|
5459
6045
|
|
|
5460
6046
|
static void ggml_cl_argsort(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
|
@@ -5469,7 +6055,6 @@ static void ggml_cl_argsort(ggml_backend_t backend, const ggml_tensor * src0, co
|
|
|
5469
6055
|
GGML_ASSERT(ggml_is_contiguous(src0));
|
|
5470
6056
|
|
|
5471
6057
|
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
|
5472
|
-
cl_command_queue queue = backend_ctx->queue;
|
|
5473
6058
|
|
|
5474
6059
|
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
|
|
5475
6060
|
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
|
|
@@ -5501,15 +6086,7 @@ static void ggml_cl_argsort(ggml_backend_t backend, const ggml_tensor * src0, co
|
|
|
5501
6086
|
size_t global_work_size[] = {(size_t)ne00_padded, (size_t)nrows, (size_t)1};
|
|
5502
6087
|
size_t local_work_size[] = {(size_t)ne00_padded, 1, 1};
|
|
5503
6088
|
|
|
5504
|
-
|
|
5505
|
-
cl_event evt;
|
|
5506
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
|
|
5507
|
-
|
|
5508
|
-
g_profiling_info.emplace_back();
|
|
5509
|
-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
|
|
5510
|
-
#else
|
|
5511
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
|
|
5512
|
-
#endif
|
|
6089
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
|
5513
6090
|
}
|
|
5514
6091
|
|
|
5515
6092
|
static void ggml_cl_sum_rows(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
|
@@ -5523,7 +6100,6 @@ static void ggml_cl_sum_rows(ggml_backend_t backend, const ggml_tensor * src0, c
|
|
|
5523
6100
|
GGML_ASSERT(ggml_is_contiguous(src0));
|
|
5524
6101
|
|
|
5525
6102
|
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
|
5526
|
-
cl_command_queue queue = backend_ctx->queue;
|
|
5527
6103
|
|
|
5528
6104
|
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
|
|
5529
6105
|
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
|
|
@@ -5564,15 +6140,7 @@ static void ggml_cl_sum_rows(ggml_backend_t backend, const ggml_tensor * src0, c
|
|
|
5564
6140
|
size_t global_work_size[] = {(size_t)ne01, (size_t)ne02, (size_t)ne03};
|
|
5565
6141
|
size_t local_work_size[] = {(size_t)64, 1, 1};
|
|
5566
6142
|
|
|
5567
|
-
|
|
5568
|
-
cl_event evt;
|
|
5569
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
|
|
5570
|
-
|
|
5571
|
-
g_profiling_info.emplace_back();
|
|
5572
|
-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
|
|
5573
|
-
#else
|
|
5574
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
|
|
5575
|
-
#endif
|
|
6143
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
|
5576
6144
|
}
|
|
5577
6145
|
|
|
5578
6146
|
//------------------------------------------------------------------------------
|
|
@@ -5667,6 +6235,12 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
|
|
|
5667
6235
|
}
|
|
5668
6236
|
func = ggml_cl_sigmoid;
|
|
5669
6237
|
break;
|
|
6238
|
+
case GGML_UNARY_OP_TANH:
|
|
6239
|
+
if (!any_on_device) {
|
|
6240
|
+
return false;
|
|
6241
|
+
}
|
|
6242
|
+
func = ggml_cl_tanh;
|
|
6243
|
+
break;
|
|
5670
6244
|
default:
|
|
5671
6245
|
return false;
|
|
5672
6246
|
} break;
|
|
@@ -5694,12 +6268,48 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
|
|
|
5694
6268
|
}
|
|
5695
6269
|
func = ggml_cl_group_norm;
|
|
5696
6270
|
break;
|
|
6271
|
+
case GGML_OP_REPEAT:
|
|
6272
|
+
if (!any_on_device) {
|
|
6273
|
+
return false;
|
|
6274
|
+
}
|
|
6275
|
+
func = ggml_cl_repeat;
|
|
6276
|
+
break;
|
|
6277
|
+
case GGML_OP_PAD:
|
|
6278
|
+
if (!any_on_device) {
|
|
6279
|
+
return false;
|
|
6280
|
+
}
|
|
6281
|
+
ggml_cl_pad(backend, tensor->src[0], tensor);
|
|
6282
|
+
return true;
|
|
6283
|
+
case GGML_OP_UPSCALE:
|
|
6284
|
+
if (!any_on_device) {
|
|
6285
|
+
return false;
|
|
6286
|
+
}
|
|
6287
|
+
ggml_cl_upscale(backend, tensor->src[0], tensor);
|
|
6288
|
+
return true;
|
|
6289
|
+
case GGML_OP_CONCAT:
|
|
6290
|
+
if (!any_on_device) {
|
|
6291
|
+
return false;
|
|
6292
|
+
}
|
|
6293
|
+
func = ggml_cl_concat;
|
|
6294
|
+
break;
|
|
6295
|
+
case GGML_OP_TIMESTEP_EMBEDDING:
|
|
6296
|
+
if (!any_on_device) {
|
|
6297
|
+
return false;
|
|
6298
|
+
}
|
|
6299
|
+
ggml_cl_timestep_embedding(backend, tensor->src[0], tensor);
|
|
6300
|
+
return true;
|
|
5697
6301
|
case GGML_OP_MUL_MAT:
|
|
5698
6302
|
if (!any_on_device && !ggml_cl_can_mul_mat(tensor->src[0], tensor->src[1], tensor)) {
|
|
5699
6303
|
return false;
|
|
5700
6304
|
}
|
|
5701
6305
|
func = ggml_cl_mul_mat;
|
|
5702
6306
|
break;
|
|
6307
|
+
case GGML_OP_MUL_MAT_ID:
|
|
6308
|
+
if (!any_on_device) {
|
|
6309
|
+
return false;
|
|
6310
|
+
}
|
|
6311
|
+
func = ggml_cl_mul_mat_id;
|
|
6312
|
+
break;
|
|
5703
6313
|
case GGML_OP_SCALE:
|
|
5704
6314
|
if (!any_on_device) {
|
|
5705
6315
|
return false;
|