@novastera-oss/llamarn 0.2.7 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/android/src/main/cpp/include/llama.h +8 -3
- package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libggml-base.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libggml.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libllama.so +0 -0
- package/android/src/main/jniLibs/x86/libggml-base.so +0 -0
- package/android/src/main/jniLibs/x86/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/x86/libggml.so +0 -0
- package/android/src/main/jniLibs/x86/libllama.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
- package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
- package/cpp/LlamaCppModel.cpp +56 -22
- package/cpp/build-info.cpp +2 -2
- package/cpp/llama.cpp/CMakeLists.txt +1 -2
- package/cpp/llama.cpp/README.md +4 -5
- package/cpp/llama.cpp/build-xcframework.sh +1 -1
- package/cpp/llama.cpp/common/CMakeLists.txt +4 -5
- package/cpp/llama.cpp/common/arg.cpp +24 -0
- package/cpp/llama.cpp/common/chat.cpp +37 -20
- package/cpp/llama.cpp/common/chat.h +2 -0
- package/cpp/llama.cpp/common/common.cpp +3 -0
- package/cpp/llama.cpp/common/common.h +5 -0
- package/cpp/llama.cpp/common/json-schema-to-grammar.cpp +3 -46
- package/cpp/llama.cpp/convert_hf_to_gguf.py +860 -23
- package/cpp/llama.cpp/convert_hf_to_gguf_update.py +9 -0
- package/cpp/llama.cpp/ggml/CMakeLists.txt +8 -2
- package/cpp/llama.cpp/ggml/include/ggml-backend.h +1 -1
- package/cpp/llama.cpp/ggml/include/ggml-cpu.h +2 -0
- package/cpp/llama.cpp/ggml/include/ggml.h +206 -10
- package/cpp/llama.cpp/ggml/src/CMakeLists.txt +17 -1
- package/cpp/llama.cpp/ggml/src/ggml-backend-reg.cpp +0 -8
- package/cpp/llama.cpp/ggml/src/ggml-backend.cpp +36 -18
- package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +68 -5
- package/cpp/llama.cpp/ggml/src/ggml-cann/common.h +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +16 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +37 -3
- package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +10 -9
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +109 -108
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +1027 -1038
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +53 -52
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +56 -55
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +42 -41
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +24 -23
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +29 -28
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +30 -29
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +83 -82
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +20 -19
- package/cpp/llama.cpp/ggml/src/ggml-cpu/common.h +3 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +9 -3
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +111 -103
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +3 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +1405 -240
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.h +8 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/quants.c +25 -24
- package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.cpp +56 -40
- package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +212 -34
- package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +35 -11
- package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.h +187 -54
- package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +71 -29
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-dw.cu +161 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-dw.cuh +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-transpose.cu +91 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-transpose.cuh +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/convert.cu +22 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/convert.cuh +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/cross-entropy-loss.cu +2 -14
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-common.cuh +4 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +8 -4
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f16.cu +6 -4
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f32.cu +14 -12
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f16.cuh +5 -3
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f32.cuh +15 -10
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-wmma-f16.cu +12 -6
- package/cpp/llama.cpp/ggml/src/ggml-cuda/getrows.cu +8 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +269 -110
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mean.cu +19 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mean.cuh +3 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmq.cuh +2 -8
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmv.cu +257 -87
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmv.cuh +2 -3
- package/cpp/llama.cpp/ggml/src/ggml-cuda/rope.cu +21 -27
- package/cpp/llama.cpp/ggml/src/ggml-cuda/scale.cu +8 -6
- package/cpp/llama.cpp/ggml/src/ggml-cuda/softmax.cu +119 -58
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ssm-conv.cu +10 -2
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ssm-scan.cu +192 -52
- package/cpp/llama.cpp/ggml/src/ggml-cuda/sumrows.cu +5 -18
- package/cpp/llama.cpp/ggml/src/ggml-cuda/sumrows.cuh +0 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cu +97 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cuh +11 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/upscale.cu +92 -6
- package/cpp/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +14 -5
- package/cpp/llama.cpp/ggml/src/ggml-impl.h +125 -183
- package/cpp/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +4 -2
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +51 -9
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +394 -80
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +616 -239
- package/cpp/llama.cpp/ggml/src/ggml-musa/mudnn.cuh +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +3 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +741 -571
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/gelu.cl +27 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/glu.cl +337 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mat_f16_f32.cl +130 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/scale.cl +3 -2
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/set_rows.cl +95 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_4_f16.cl +24 -11
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_4_f32.cl +24 -11
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_f16.cl +24 -11
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_f32.cl +24 -11
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/upscale.cl +2 -3
- package/cpp/llama.cpp/ggml/src/ggml-quants.c +6 -6
- package/cpp/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +5 -6
- package/cpp/llama.cpp/ggml/src/ggml-sycl/common.hpp +1 -24
- package/cpp/llama.cpp/ggml/src/ggml-sycl/concat.cpp +28 -41
- package/cpp/llama.cpp/ggml/src/ggml-sycl/conv.cpp +4 -10
- package/cpp/llama.cpp/ggml/src/ggml-sycl/convert.cpp +99 -166
- package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +94 -72
- package/cpp/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +49 -67
- package/cpp/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +31 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +697 -1098
- package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +18 -9
- package/cpp/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +6 -9
- package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +104 -62
- package/cpp/llama.cpp/ggml/src/ggml-sycl/gla.cpp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +60 -80
- package/cpp/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +132 -201
- package/cpp/llama.cpp/ggml/src/ggml-sycl/norm.cpp +55 -74
- package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +39 -38
- package/cpp/llama.cpp/ggml/src/ggml-sycl/set_rows.cpp +131 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/set_rows.hpp +8 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +3 -3
- package/cpp/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +3 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +3 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +3 -8
- package/cpp/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +12 -16
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +12 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +767 -292
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +58 -7
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +28 -23
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp +14 -9
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +38 -32
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +32 -27
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +44 -12
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu.comp +13 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu_erf.comp +27 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu_quick.comp +11 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/gelu_erf.comp +39 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.comp +15 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/glu_main.comp +29 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +128 -72
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +38 -9
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/reglu.comp +9 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +12 -3
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/roll.comp +46 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +7 -9
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +7 -9
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +7 -9
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +20 -4
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/swiglu.comp +9 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +69 -5
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +23 -3
- package/cpp/llama.cpp/ggml/src/ggml.c +449 -72
- package/cpp/llama.cpp/ggml/src/gguf.cpp +13 -2
- package/cpp/llama.cpp/gguf-py/gguf/constants.py +285 -0
- package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +27 -0
- package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +137 -21
- package/cpp/llama.cpp/gguf-py/gguf/vocab.py +109 -7
- package/cpp/llama.cpp/gguf-py/pyproject.toml +2 -2
- package/cpp/llama.cpp/include/llama.h +8 -43
- package/cpp/llama.cpp/models/templates/Mistral-Small-3.2-24B-Instruct-2506.jinja +124 -0
- package/cpp/llama.cpp/src/llama-arch.cpp +265 -3
- package/cpp/llama.cpp/src/llama-arch.h +36 -1
- package/cpp/llama.cpp/src/llama-batch.cpp +596 -359
- package/cpp/llama.cpp/src/llama-batch.h +105 -70
- package/cpp/llama.cpp/src/llama-chat.cpp +26 -6
- package/cpp/llama.cpp/src/llama-chat.h +1 -0
- package/cpp/llama.cpp/src/llama-context.cpp +101 -107
- package/cpp/llama.cpp/src/llama-context.h +13 -13
- package/cpp/llama.cpp/src/llama-graph.cpp +286 -404
- package/cpp/llama.cpp/src/llama-graph.h +78 -79
- package/cpp/llama.cpp/src/llama-hparams.cpp +11 -1
- package/cpp/llama.cpp/src/llama-hparams.h +11 -0
- package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +74 -66
- package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.h +23 -26
- package/cpp/llama.cpp/src/llama-kv-cache-unified.cpp +312 -157
- package/cpp/llama.cpp/src/llama-kv-cache-unified.h +79 -46
- package/cpp/llama.cpp/src/llama-kv-cells.h +97 -21
- package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +73 -69
- package/cpp/llama.cpp/src/llama-memory-hybrid.h +19 -22
- package/cpp/llama.cpp/src/llama-memory-recurrent.cpp +88 -77
- package/cpp/llama.cpp/src/llama-memory-recurrent.h +15 -20
- package/cpp/llama.cpp/src/llama-memory.cpp +17 -0
- package/cpp/llama.cpp/src/llama-memory.h +21 -22
- package/cpp/llama.cpp/src/llama-model-saver.cpp +1 -0
- package/cpp/llama.cpp/src/llama-model.cpp +5301 -2922
- package/cpp/llama.cpp/src/llama-model.h +40 -0
- package/cpp/llama.cpp/src/llama-quant.cpp +88 -5
- package/cpp/llama.cpp/src/llama-vocab.cpp +37 -3
- package/cpp/llama.cpp/src/llama-vocab.h +42 -0
- package/cpp/rn-utils.h +3 -0
- package/ios/include/chat.h +2 -0
- package/ios/include/common.h +5 -0
- package/ios/include/llama.h +8 -43
- package/ios/libs/llama.xcframework/Info.plist +19 -19
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5059 -4863
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-backend.h +1 -1
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +206 -10
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +8 -43
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5030 -4834
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3889 -3742
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +1 -1
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +206 -10
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +8 -43
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5030 -4834
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3891 -3744
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-backend.h +1 -1
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +206 -10
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +8 -43
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-backend.h +1 -1
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +206 -10
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +8 -43
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-backend.h +1 -1
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +206 -10
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +8 -43
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5059 -4863
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-backend.h +1 -1
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +206 -10
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +8 -43
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5030 -4834
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3889 -3742
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +1 -1
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +206 -10
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +8 -43
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5095 -4900
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-backend.h +1 -1
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +206 -10
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +8 -43
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5066 -4871
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3919 -3773
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +1 -1
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +206 -10
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +8 -43
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/package.json +1 -1
- package/cpp/llama.cpp/ggml/include/ggml-kompute.h +0 -50
- package/cpp/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +0 -166
- package/cpp/llama.cpp/ggml/src/ggml-kompute/ggml-kompute.cpp +0 -2251
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/common.comp +0 -112
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_add.comp +0 -58
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_addrow.comp +0 -25
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f16.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f32.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f16.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f32.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_diagmask.comp +0 -30
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_gelu.comp +0 -22
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows.comp +0 -17
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f16.comp +0 -31
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f32.comp +0 -31
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_0.comp +0 -38
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_1.comp +0 -39
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q6_k.comp +0 -44
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_f16.comp +0 -69
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_mat_f32.comp +0 -51
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_0.comp +0 -33
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_1.comp +0 -35
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_k.comp +0 -140
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q6_k.comp +0 -106
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q8_0.comp +0 -73
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n_pre.comp +0 -28
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_norm.comp +0 -84
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_relu.comp +0 -21
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rmsnorm.comp +0 -53
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f16.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f32.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f16.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f32.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_scale.comp +0 -19
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_scale_8.comp +0 -23
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_silu.comp +0 -22
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_softmax.comp +0 -72
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/rope_common.comp +0 -71
|
@@ -231,6 +231,71 @@ static ggml_cl_compiler_version get_adreno_cl_compiler_version(const char *drive
|
|
|
231
231
|
return { type, major, minor, patch };
|
|
232
232
|
}
|
|
233
233
|
|
|
234
|
+
// Profiling
|
|
235
|
+
struct ProfilingInfo {
|
|
236
|
+
std::string op_name;
|
|
237
|
+
std::string kernel_name;
|
|
238
|
+
|
|
239
|
+
cl_kernel kernel;
|
|
240
|
+
cl_event evt;
|
|
241
|
+
|
|
242
|
+
cl_ulong cmd_queued;
|
|
243
|
+
cl_ulong cmd_submit;
|
|
244
|
+
cl_ulong cmd_start;
|
|
245
|
+
cl_ulong cmd_end;
|
|
246
|
+
cl_ulong overhead_start;
|
|
247
|
+
cl_ulong overhead_end;
|
|
248
|
+
// For the times below, see spec for clGetEventProfilingInfo
|
|
249
|
+
// The time kernel spent in cmd queue - SUBMIT - QUEUED
|
|
250
|
+
cl_ulong cmd_queued_duration_ns;
|
|
251
|
+
// The time kernel spent for submission - START - SUBMIT
|
|
252
|
+
cl_ulong cmd_submit_duration_ns;
|
|
253
|
+
// Kernel execution time in nanoseconds - END - START
|
|
254
|
+
cl_ulong cmd_duration_ns;
|
|
255
|
+
// The time for the kernel to complete - COMPLETE - END
|
|
256
|
+
cl_ulong cmd_complete_duration_ns;
|
|
257
|
+
// Total time to finish the kernel - COMPLETE - QUEUED
|
|
258
|
+
cl_ulong cmd_total_duration_ns;
|
|
259
|
+
// Global and local work sizes.
|
|
260
|
+
size_t global_size[3];
|
|
261
|
+
size_t local_size[3];
|
|
262
|
+
// Op output size.
|
|
263
|
+
size_t output_size[4];
|
|
264
|
+
};
|
|
265
|
+
|
|
266
|
+
static void populateProfilingInfo(
|
|
267
|
+
ProfilingInfo& info, cl_event evt, cl_kernel kernel, cl_uint work_dim,
|
|
268
|
+
size_t global_size[3], size_t local_size[3],
|
|
269
|
+
const ggml_tensor * tensor) {
|
|
270
|
+
info.op_name = tensor->name;
|
|
271
|
+
info.kernel = kernel;
|
|
272
|
+
info.evt = evt;
|
|
273
|
+
|
|
274
|
+
// 0 means not specified, e.g., 2D workgroup, or NULL for driver to choose
|
|
275
|
+
info.local_size[0] = 0;
|
|
276
|
+
info.local_size[1] = 0;
|
|
277
|
+
info.local_size[2] = 0;
|
|
278
|
+
|
|
279
|
+
info.global_size[0] = 0;
|
|
280
|
+
info.global_size[1] = 0;
|
|
281
|
+
info.global_size[2] = 0;
|
|
282
|
+
|
|
283
|
+
if (local_size) {
|
|
284
|
+
for (cl_uint i = 0; i < work_dim; ++i) {
|
|
285
|
+
info.local_size[i] = local_size[i];
|
|
286
|
+
}
|
|
287
|
+
}
|
|
288
|
+
|
|
289
|
+
for (cl_uint i = 0; i < work_dim; ++i) {
|
|
290
|
+
info.global_size[i] = global_size[i];
|
|
291
|
+
}
|
|
292
|
+
|
|
293
|
+
info.output_size[0] = tensor->ne[0];
|
|
294
|
+
info.output_size[1] = tensor->ne[1];
|
|
295
|
+
info.output_size[2] = tensor->ne[2];
|
|
296
|
+
info.output_size[3] = tensor->ne[3];
|
|
297
|
+
}
|
|
298
|
+
|
|
234
299
|
struct ggml_backend_opencl_context;
|
|
235
300
|
|
|
236
301
|
// backend device context
|
|
@@ -254,6 +319,8 @@ struct ggml_backend_opencl_device_context {
|
|
|
254
319
|
|
|
255
320
|
// backend context
|
|
256
321
|
struct ggml_backend_opencl_context {
|
|
322
|
+
int ref_count;
|
|
323
|
+
|
|
257
324
|
cl_device_id device;
|
|
258
325
|
std::string device_name;
|
|
259
326
|
|
|
@@ -284,6 +351,8 @@ struct ggml_backend_opencl_context {
|
|
|
284
351
|
cl_program program_gemv_noshuffle_general;
|
|
285
352
|
cl_program program_gemv_noshuffle;
|
|
286
353
|
cl_program program_get_rows;
|
|
354
|
+
cl_program program_set_rows;
|
|
355
|
+
cl_program program_glu;
|
|
287
356
|
cl_program program_im2col_f16;
|
|
288
357
|
cl_program program_im2col_f32;
|
|
289
358
|
cl_program program_mul_mat_Ab_Bi_8x4;
|
|
@@ -299,6 +368,7 @@ struct ggml_backend_opencl_context {
|
|
|
299
368
|
cl_program program_mul_mv_f16_f32;
|
|
300
369
|
cl_program program_mul_mv_f32_f32;
|
|
301
370
|
cl_program program_mul;
|
|
371
|
+
cl_program program_mul_mat_f16_f32_tiled;
|
|
302
372
|
cl_program program_div;
|
|
303
373
|
cl_program program_sub;
|
|
304
374
|
cl_program program_norm;
|
|
@@ -330,10 +400,13 @@ struct ggml_backend_opencl_context {
|
|
|
330
400
|
cl_kernel kernel_scale;
|
|
331
401
|
cl_kernel kernel_silu, kernel_silu_4;
|
|
332
402
|
cl_kernel kernel_gelu, kernel_gelu_4;
|
|
403
|
+
cl_kernel kernel_gelu_erf, kernel_gelu_erf_4;
|
|
333
404
|
cl_kernel kernel_gelu_quick, kernel_gelu_quick_4;
|
|
334
405
|
cl_kernel kernel_relu;
|
|
335
406
|
cl_kernel kernel_sigmoid_f32, kernel_sigmoid_f16;
|
|
336
407
|
cl_kernel kernel_clamp;
|
|
408
|
+
cl_kernel kernel_geglu, kernel_reglu, kernel_swiglu, kernel_geglu_erf, kernel_geglu_quick,
|
|
409
|
+
kernel_geglu_f16, kernel_reglu_f16, kernel_swiglu_f16, kernel_geglu_erf_f16, kernel_geglu_quick_f16;
|
|
337
410
|
cl_kernel kernel_norm;
|
|
338
411
|
cl_kernel kernel_rms_norm;
|
|
339
412
|
cl_kernel kernel_group_norm;
|
|
@@ -341,6 +414,7 @@ struct ggml_backend_opencl_context {
|
|
|
341
414
|
cl_kernel kernel_soft_max, kernel_soft_max_4;
|
|
342
415
|
cl_kernel kernel_soft_max_f16, kernel_soft_max_4_f16;
|
|
343
416
|
cl_kernel kernel_get_rows_f32, kernel_get_rows_f16, kernel_get_rows_q4_0;
|
|
417
|
+
cl_kernel kernel_set_rows_f32, kernel_set_rows_f16;
|
|
344
418
|
cl_kernel kernel_rope_norm_f32, kernel_rope_norm_f16, kernel_rope_neox_f32, kernel_rope_neox_f16;
|
|
345
419
|
cl_kernel kernel_rope_multi_f32, kernel_rope_multi_f16, kernel_rope_vision_f32, kernel_rope_vision_f16;
|
|
346
420
|
cl_kernel kernel_cpy_f16_f16, kernel_cpy_f16_f32, kernel_cpy_f32_f16, kernel_cpy_f32_f32;
|
|
@@ -349,6 +423,7 @@ struct ggml_backend_opencl_context {
|
|
|
349
423
|
cl_kernel kernel_mul_mat_f16_f32_1row;
|
|
350
424
|
cl_kernel kernel_mul_mat_f16_f32;
|
|
351
425
|
cl_kernel kernel_mul_mat_f16_f32_l4;
|
|
426
|
+
cl_kernel kernel_mul_mat_f16_f32_tiled;
|
|
352
427
|
cl_kernel kernel_mul_mat_q4_0_f32, kernel_mul_mat_q4_0_f32_v;
|
|
353
428
|
cl_kernel kernel_convert_block_q4_0, kernel_restore_block_q4_0;
|
|
354
429
|
cl_kernel kernel_mul_mat_q4_0_f32_8x_flat;
|
|
@@ -369,6 +444,118 @@ struct ggml_backend_opencl_context {
|
|
|
369
444
|
cl_kernel kernel_timestep_embedding;
|
|
370
445
|
cl_kernel kernel_mul_mv_id_q4_0_f32_8x_flat;
|
|
371
446
|
|
|
447
|
+
std::vector<ProfilingInfo> profiling_info;
|
|
448
|
+
|
|
449
|
+
void write_profiling_info() {
|
|
450
|
+
FILE * fperf = fopen("cl_profiling.csv", "w");
|
|
451
|
+
if (!fperf) {
|
|
452
|
+
GGML_LOG_ERROR("Failed to open cl_profiling.csv\n");
|
|
453
|
+
return;
|
|
454
|
+
}
|
|
455
|
+
|
|
456
|
+
// Populate profiling info
|
|
457
|
+
for (ProfilingInfo & info : profiling_info) {
|
|
458
|
+
cl_ulong cmd_queued;
|
|
459
|
+
cl_ulong cmd_submit;
|
|
460
|
+
cl_ulong cmd_start;
|
|
461
|
+
cl_ulong cmd_end;
|
|
462
|
+
cl_ulong cmd_complete;
|
|
463
|
+
|
|
464
|
+
CL_CHECK(clWaitForEvents(1, &info.evt));
|
|
465
|
+
CL_CHECK(clGetEventProfilingInfo(
|
|
466
|
+
info.evt, CL_PROFILING_COMMAND_QUEUED, sizeof(cl_ulong), &cmd_queued, NULL));
|
|
467
|
+
CL_CHECK(clGetEventProfilingInfo(
|
|
468
|
+
info.evt, CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &cmd_submit, NULL));
|
|
469
|
+
CL_CHECK(clGetEventProfilingInfo(
|
|
470
|
+
info.evt, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &cmd_start, NULL));
|
|
471
|
+
CL_CHECK(clGetEventProfilingInfo(
|
|
472
|
+
info.evt, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &cmd_end, NULL));
|
|
473
|
+
CL_CHECK(clGetEventProfilingInfo(
|
|
474
|
+
info.evt, CL_PROFILING_COMMAND_COMPLETE, sizeof(cl_ulong), &cmd_complete, NULL));
|
|
475
|
+
CL_CHECK(clReleaseEvent(info.evt));
|
|
476
|
+
|
|
477
|
+
char kernel_name[512];
|
|
478
|
+
CL_CHECK(clGetKernelInfo(info.kernel, CL_KERNEL_FUNCTION_NAME,
|
|
479
|
+
sizeof(kernel_name), kernel_name, NULL));
|
|
480
|
+
info.kernel_name = kernel_name;
|
|
481
|
+
|
|
482
|
+
info.cmd_queued = cmd_queued;
|
|
483
|
+
info.cmd_submit = cmd_submit;
|
|
484
|
+
info.cmd_start = cmd_start;
|
|
485
|
+
info.cmd_end = cmd_end;
|
|
486
|
+
|
|
487
|
+
info.cmd_queued_duration_ns = cmd_submit - cmd_queued;
|
|
488
|
+
info.cmd_submit_duration_ns = cmd_start - cmd_submit;
|
|
489
|
+
info.cmd_duration_ns = cmd_end - cmd_start;
|
|
490
|
+
info.cmd_complete_duration_ns = cmd_complete - cmd_end;
|
|
491
|
+
info.cmd_total_duration_ns = cmd_complete - cmd_queued;
|
|
492
|
+
}
|
|
493
|
+
|
|
494
|
+
// Dump a csv
|
|
495
|
+
float total_kernel_time = 0;
|
|
496
|
+
fprintf(fperf, "op name, kernel name, queued duration (ms), submit duration(ms), exec duration (ms), complete duration (ms), total duration (ms), global size, local size, output size\n");
|
|
497
|
+
for (const ProfilingInfo & info : profiling_info) {
|
|
498
|
+
total_kernel_time += info.cmd_duration_ns/1.e6f;
|
|
499
|
+
fprintf(fperf, "%s,%s,%f,%f,%f,%f,%f,%zux%zux%zu,%zux%zux%zu,%zux%zux%zux%zu\n",
|
|
500
|
+
info.op_name.c_str(), info.kernel_name.c_str(),
|
|
501
|
+
info.cmd_queued_duration_ns/1.e6f,
|
|
502
|
+
info.cmd_submit_duration_ns/1.e6f,
|
|
503
|
+
info.cmd_duration_ns/1.e6f,
|
|
504
|
+
info.cmd_complete_duration_ns/1.e6f,
|
|
505
|
+
info.cmd_total_duration_ns/1.e6f,
|
|
506
|
+
info.global_size[0], info.global_size[1], info.global_size[2],
|
|
507
|
+
info.local_size[0], info.local_size[1], info.local_size[2],
|
|
508
|
+
info.output_size[0], info.output_size[1], info.output_size[2], info.output_size[3]);
|
|
509
|
+
}
|
|
510
|
+
fclose(fperf);
|
|
511
|
+
|
|
512
|
+
GGML_LOG_INFO("ggml_opencl: total kernel time: %f\n", total_kernel_time);
|
|
513
|
+
|
|
514
|
+
// Dump a simple chrome trace
|
|
515
|
+
FILE* ftrace = fopen("cl_trace.json", "w");
|
|
516
|
+
if (!ftrace) {
|
|
517
|
+
GGML_LOG_ERROR("Failed to open cl_trace.json\n");
|
|
518
|
+
return;
|
|
519
|
+
}
|
|
520
|
+
|
|
521
|
+
fprintf(ftrace, "[\n");
|
|
522
|
+
for (const ProfilingInfo & info : profiling_info) {
|
|
523
|
+
fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"B\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Host\"},\n",
|
|
524
|
+
info.kernel_name.c_str(), info.cmd_queued/1000);
|
|
525
|
+
fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"E\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Host\"},\n",
|
|
526
|
+
info.kernel_name.c_str(), info.cmd_submit/1000);
|
|
527
|
+
|
|
528
|
+
fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"B\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Device\"},\n",
|
|
529
|
+
info.kernel_name.c_str(), info.cmd_start/1000);
|
|
530
|
+
fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"E\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Device\"},\n",
|
|
531
|
+
info.kernel_name.c_str(), info.cmd_end/1000);
|
|
532
|
+
}
|
|
533
|
+
fclose(ftrace);
|
|
534
|
+
}
|
|
535
|
+
|
|
536
|
+
// Ask the OpenCL runtime for CL_KERNEL_WORK_GROUP_SIZE of `kernel` on this
// context's `device` and return it.
//
// kernel: the kernel object to query.
// returns: the value the runtime wrote for CL_KERNEL_WORK_GROUP_SIZE.
//
// The query result is checked with CL_CHECK, and the number of bytes the
// runtime reports having written must equal sizeof(size_t).
size_t get_kernel_workgroup_size(cl_kernel kernel) const {
    // Number of bytes the runtime reports for the queried parameter.
    size_t ret_size = 0;
    // Receives the queried work-group size.
    size_t workgroup_size = 0;

    CL_CHECK(clGetKernelWorkGroupInfo(kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &workgroup_size, &ret_size));
    GGML_ASSERT(sizeof(size_t) == ret_size);

    return workgroup_size;
}
|
|
545
|
+
|
|
546
|
+
// Enqueue `kernel` on this context's `queue` with the given NDRange geometry.
//
// kernel:            kernel to launch (its arguments must already be set by the caller).
// work_dim:          number of dimensions passed to clEnqueueNDRangeKernel.
// global_work_size:  global sizes, forwarded unchanged.
// local_work_size:   local sizes, forwarded unchanged.
// tensor:            only used when GGML_OPENCL_PROFILING is defined, where it is
//                    handed to populateProfilingInfo together with the launch event.
//
// Without GGML_OPENCL_PROFILING the launch requests no event. With it, the launch
// event is captured and a new entry is appended to `profiling_info`.
void enqueue_ndrange_kernel(cl_kernel kernel, cl_uint work_dim, size_t *global_work_size, size_t *local_work_size, const ggml_tensor * tensor) {
#ifndef GGML_OPENCL_PROFILING
    // Plain launch: no event requested, `tensor` is not needed.
    GGML_UNUSED(tensor);
    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, work_dim, NULL, global_work_size, local_work_size, 0, NULL, NULL));
#else
    // Profiling launch: keep the event so timings can be read back later.
    cl_event evt;
    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, work_dim, NULL, global_work_size, local_work_size, 0, NULL, &evt));

    // Append a fresh record and fill it in place.
    profiling_info.emplace_back();
    ProfilingInfo & record = profiling_info.back();
    populateProfilingInfo(record, evt, kernel, work_dim, global_work_size, local_work_size, tensor);
#endif
}
|
|
558
|
+
|
|
372
559
|
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
|
|
373
560
|
// Transpose kernels
|
|
374
561
|
cl_program program_transpose;
|
|
@@ -395,46 +582,19 @@ struct ggml_backend_opencl_context {
|
|
|
395
582
|
cl_kernel CL_mul_mat_vec_q4_0_f32_1d_4x_flat_11008_1_4096;
|
|
396
583
|
cl_kernel CL_mul_mat_vec_q4_0_f32_1d_4x_flat_32000_1_4096;
|
|
397
584
|
#endif // GGML_OPENCL_USE_ADRENO_KERNELS
|
|
398
|
-
};
|
|
399
|
-
|
|
400
|
-
// All registered devices with a default device in the front.
|
|
401
|
-
static std::vector<ggml_backend_device> g_ggml_backend_opencl_devices;
|
|
402
585
|
|
|
403
|
-
|
|
586
|
+
void free() {
|
|
587
|
+
ref_count--;
|
|
588
|
+
if (ref_count == 0) {
|
|
404
589
|
#ifdef GGML_OPENCL_PROFILING
|
|
405
|
-
|
|
406
|
-
|
|
407
|
-
|
|
408
|
-
|
|
409
|
-
cl_kernel kernel;
|
|
410
|
-
cl_event evt;
|
|
411
|
-
|
|
412
|
-
cl_ulong cmd_queued;
|
|
413
|
-
cl_ulong cmd_submit;
|
|
414
|
-
cl_ulong cmd_start;
|
|
415
|
-
cl_ulong cmd_end;
|
|
416
|
-
cl_ulong overhead_start;
|
|
417
|
-
cl_ulong overhead_end;
|
|
418
|
-
// For the times below, see spec for clGetEventProfilingInfo
|
|
419
|
-
// The time kernel spent in cmd queue - SUBMIT - QUEUED
|
|
420
|
-
cl_ulong cmd_queued_duration_ns;
|
|
421
|
-
// The time kernel spent for submission - START - SUBMIT
|
|
422
|
-
cl_ulong cmd_submit_duration_ns;
|
|
423
|
-
// Kernel execution time in nanoseconds - END - START
|
|
424
|
-
cl_ulong cmd_duration_ns;
|
|
425
|
-
// The time for the kernel to complete - COMPLETE - END
|
|
426
|
-
cl_ulong cmd_complete_duration_ns;
|
|
427
|
-
// Total time to finish the kernel - COMPELTE - QUEUED
|
|
428
|
-
cl_ulong cmd_total_duration_ns;
|
|
429
|
-
// Global and local work sizes.
|
|
430
|
-
size_t global_size[3];
|
|
431
|
-
size_t local_size[3];
|
|
432
|
-
// Op output size.
|
|
433
|
-
size_t output_size[4];
|
|
590
|
+
write_profiling_info();
|
|
591
|
+
#endif
|
|
592
|
+
}
|
|
593
|
+
}
|
|
434
594
|
};
|
|
435
595
|
|
|
436
|
-
|
|
437
|
-
|
|
596
|
+
// All registered devices with a default device in the front.
|
|
597
|
+
static std::vector<ggml_backend_device> g_ggml_backend_opencl_devices;
|
|
438
598
|
|
|
439
599
|
inline std::string read_file(const std::string &path) {
|
|
440
600
|
std::ifstream ifs(path);
|
|
@@ -591,11 +751,38 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
|
|
|
591
751
|
|
|
592
752
|
CL_CHECK((backend_ctx->kernel_gelu = clCreateKernel(backend_ctx->program_gelu, "kernel_gelu", &err), err));
|
|
593
753
|
CL_CHECK((backend_ctx->kernel_gelu_4 = clCreateKernel(backend_ctx->program_gelu, "kernel_gelu_4", &err), err));
|
|
754
|
+
CL_CHECK((backend_ctx->kernel_gelu_erf = clCreateKernel(backend_ctx->program_gelu, "kernel_gelu_erf", &err), err));
|
|
755
|
+
CL_CHECK((backend_ctx->kernel_gelu_erf_4 = clCreateKernel(backend_ctx->program_gelu, "kernel_gelu_erf_4", &err), err));
|
|
594
756
|
CL_CHECK((backend_ctx->kernel_gelu_quick = clCreateKernel(backend_ctx->program_gelu, "kernel_gelu_quick", &err), err));
|
|
595
757
|
CL_CHECK((backend_ctx->kernel_gelu_quick_4 = clCreateKernel(backend_ctx->program_gelu, "kernel_gelu_quick_4", &err), err));
|
|
596
758
|
GGML_LOG_CONT(".");
|
|
597
759
|
}
|
|
598
760
|
|
|
761
|
+
// glu
|
|
762
|
+
{
|
|
763
|
+
#ifdef GGML_OPENCL_EMBED_KERNELS
|
|
764
|
+
const std::string kernel_src {
|
|
765
|
+
#include "glu.cl.h"
|
|
766
|
+
};
|
|
767
|
+
#else
|
|
768
|
+
const std::string kernel_src = read_file("glu.cl");
|
|
769
|
+
#endif
|
|
770
|
+
backend_ctx->program_glu =
|
|
771
|
+
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
|
772
|
+
|
|
773
|
+
CL_CHECK((backend_ctx->kernel_geglu = clCreateKernel(backend_ctx->program_glu, "kernel_geglu", &err), err));
|
|
774
|
+
CL_CHECK((backend_ctx->kernel_reglu = clCreateKernel(backend_ctx->program_glu, "kernel_reglu", &err), err));
|
|
775
|
+
CL_CHECK((backend_ctx->kernel_swiglu = clCreateKernel(backend_ctx->program_glu, "kernel_swiglu", &err), err));
|
|
776
|
+
CL_CHECK((backend_ctx->kernel_geglu_erf = clCreateKernel(backend_ctx->program_glu, "kernel_geglu_erf", &err), err));
|
|
777
|
+
CL_CHECK((backend_ctx->kernel_geglu_quick = clCreateKernel(backend_ctx->program_glu, "kernel_geglu_quick", &err), err));
|
|
778
|
+
CL_CHECK((backend_ctx->kernel_geglu_f16 = clCreateKernel(backend_ctx->program_glu, "kernel_geglu_f16", &err), err));
|
|
779
|
+
CL_CHECK((backend_ctx->kernel_reglu_f16 = clCreateKernel(backend_ctx->program_glu, "kernel_reglu_f16", &err), err));
|
|
780
|
+
CL_CHECK((backend_ctx->kernel_swiglu_f16 = clCreateKernel(backend_ctx->program_glu, "kernel_swiglu_f16", &err), err));
|
|
781
|
+
CL_CHECK((backend_ctx->kernel_geglu_erf_f16 = clCreateKernel(backend_ctx->program_glu, "kernel_geglu_erf_f16", &err), err));
|
|
782
|
+
CL_CHECK((backend_ctx->kernel_geglu_quick_f16 = clCreateKernel(backend_ctx->program_glu, "kernel_geglu_quick_f16", &err), err));
|
|
783
|
+
GGML_LOG_CONT(".");
|
|
784
|
+
}
|
|
785
|
+
|
|
599
786
|
// get_rows
|
|
600
787
|
{
|
|
601
788
|
#ifdef GGML_OPENCL_EMBED_KERNELS
|
|
@@ -830,6 +1017,22 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
|
|
|
830
1017
|
GGML_LOG_CONT(".");
|
|
831
1018
|
}
|
|
832
1019
|
|
|
1020
|
+
// mul_mat_f16_f32_tiled
|
|
1021
|
+
{
|
|
1022
|
+
#ifdef GGML_OPENCL_EMBED_KERNELS
|
|
1023
|
+
const std::string kernel_src {
|
|
1024
|
+
#include "mul_mat_f16_f32.cl.h"
|
|
1025
|
+
};
|
|
1026
|
+
#else
|
|
1027
|
+
const std::string kernel_src = read_file("mul_mat_f16_f32.cl");
|
|
1028
|
+
#endif
|
|
1029
|
+
backend_ctx->program_mul_mat_f16_f32_tiled =
|
|
1030
|
+
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
|
1031
|
+
|
|
1032
|
+
CL_CHECK((backend_ctx->kernel_mul_mat_f16_f32_tiled = clCreateKernel(backend_ctx->program_mul_mat_f16_f32_tiled, "mul_mat_f16_f32", &err), err));
|
|
1033
|
+
GGML_LOG_CONT(".");
|
|
1034
|
+
}
|
|
1035
|
+
|
|
833
1036
|
// mul
|
|
834
1037
|
{
|
|
835
1038
|
#ifdef GGML_OPENCL_EMBED_KERNELS
|
|
@@ -1258,6 +1461,23 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
|
|
|
1258
1461
|
}
|
|
1259
1462
|
}
|
|
1260
1463
|
|
|
1464
|
+
// set_rows
|
|
1465
|
+
{
|
|
1466
|
+
#ifdef GGML_OPENCL_EMBED_KERNELS
|
|
1467
|
+
const std::string kernel_src {
|
|
1468
|
+
#include "set_rows.cl.h"
|
|
1469
|
+
};
|
|
1470
|
+
#else
|
|
1471
|
+
const std::string kernel_src = read_file("set_rows.cl");
|
|
1472
|
+
#endif
|
|
1473
|
+
backend_ctx->program_set_rows =
|
|
1474
|
+
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
|
1475
|
+
|
|
1476
|
+
CL_CHECK((backend_ctx->kernel_set_rows_f32 = clCreateKernel(backend_ctx->program_set_rows, "kernel_set_rows_f32", &err), err));
|
|
1477
|
+
CL_CHECK((backend_ctx->kernel_set_rows_f16 = clCreateKernel(backend_ctx->program_set_rows, "kernel_set_rows_f16", &err), err));
|
|
1478
|
+
GGML_LOG_CONT(".");
|
|
1479
|
+
}
|
|
1480
|
+
|
|
1261
1481
|
// mul_mv_id_q4_0_f32_8x_flat
|
|
1262
1482
|
{
|
|
1263
1483
|
#ifdef GGML_OPENCL_EMBED_KERNELS
|
|
@@ -1669,6 +1889,12 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
|
|
|
1669
1889
|
backend_ctx->device = dev_ctx->device;
|
|
1670
1890
|
backend_ctx->gpu_family = GPU_FAMILY::UNKNOWN;
|
|
1671
1891
|
|
|
1892
|
+
// ref_count gets increased in ggml_backend_opencl_device_init
|
|
1893
|
+
// This function is also used to retrieve backend context, so we don't want
|
|
1894
|
+
// to increase ref_count for each call. We only want to increase ref_count
|
|
1895
|
+
// when the associated device is initialized
|
|
1896
|
+
backend_ctx->ref_count = 0;
|
|
1897
|
+
|
|
1672
1898
|
if (strstr(dev_ctx->device_name.c_str(), "Adreno") ||
|
|
1673
1899
|
strstr(dev_ctx->device_name.c_str(), "Qualcomm") ||
|
|
1674
1900
|
strstr(dev_ctx->device_version.c_str(), "Adreno")) {
|
|
@@ -1841,93 +2067,22 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
|
|
|
1841
2067
|
return dev_ctx->backend_ctx;
|
|
1842
2068
|
}
|
|
1843
2069
|
|
|
1844
|
-
static void ggml_cl2_free(
|
|
1845
|
-
|
|
1846
|
-
|
|
1847
|
-
if (!fperf) {
|
|
1848
|
-
GGML_LOG_ERROR("Failed to open cl_profiling.csv\n");
|
|
1849
|
-
return;
|
|
1850
|
-
}
|
|
2070
|
+
static void ggml_cl2_free(ggml_backend_t backend) {
|
|
2071
|
+
ggml_backend_opencl_context * ctx = (ggml_backend_opencl_context *) backend->context;
|
|
2072
|
+
ctx->free();
|
|
1851
2073
|
|
|
1852
|
-
//
|
|
1853
|
-
|
|
1854
|
-
|
|
1855
|
-
|
|
1856
|
-
|
|
1857
|
-
|
|
1858
|
-
|
|
1859
|
-
|
|
1860
|
-
CL_CHECK(clWaitForEvents(1, &info.evt));
|
|
1861
|
-
CL_CHECK(clGetEventProfilingInfo(
|
|
1862
|
-
info.evt, CL_PROFILING_COMMAND_QUEUED, sizeof(cl_ulong), &cmd_queued, NULL));
|
|
1863
|
-
CL_CHECK(clGetEventProfilingInfo(
|
|
1864
|
-
info.evt, CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &cmd_submit, NULL));
|
|
1865
|
-
CL_CHECK(clGetEventProfilingInfo(
|
|
1866
|
-
info.evt, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &cmd_start, NULL));
|
|
1867
|
-
CL_CHECK(clGetEventProfilingInfo(
|
|
1868
|
-
info.evt, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &cmd_end, NULL));
|
|
1869
|
-
CL_CHECK(clGetEventProfilingInfo(
|
|
1870
|
-
info.evt, CL_PROFILING_COMMAND_COMPLETE, sizeof(cl_ulong), &cmd_complete, NULL));
|
|
1871
|
-
CL_CHECK(clReleaseEvent(info.evt));
|
|
1872
|
-
|
|
1873
|
-
char kernel_name[512];
|
|
1874
|
-
CL_CHECK(clGetKernelInfo(info.kernel, CL_KERNEL_FUNCTION_NAME,
|
|
1875
|
-
sizeof(kernel_name), kernel_name, NULL));
|
|
1876
|
-
info.kernel_name = kernel_name;
|
|
1877
|
-
|
|
1878
|
-
info.cmd_queued = cmd_queued;
|
|
1879
|
-
info.cmd_submit = cmd_submit;
|
|
1880
|
-
info.cmd_start = cmd_start;
|
|
1881
|
-
info.cmd_end = cmd_end;
|
|
1882
|
-
|
|
1883
|
-
info.cmd_queued_duration_ns = cmd_submit - cmd_queued;
|
|
1884
|
-
info.cmd_submit_duration_ns = cmd_start - cmd_submit;
|
|
1885
|
-
info.cmd_duration_ns = cmd_end - cmd_start;
|
|
1886
|
-
info.cmd_complete_duration_ns = cmd_complete - cmd_end;
|
|
1887
|
-
info.cmd_total_duration_ns = cmd_complete - cmd_queued;
|
|
1888
|
-
}
|
|
1889
|
-
|
|
1890
|
-
// Dump a csv
|
|
1891
|
-
float total_kernel_time = 0;
|
|
1892
|
-
fprintf(fperf, "op name, kernel name, queued duration (ms), submit duration(ms), exec duration (ms), complete duration (ms), total duration (ms), global size, local size, output size\n");
|
|
1893
|
-
for (const ProfilingInfo & info : g_profiling_info) {
|
|
1894
|
-
total_kernel_time += info.cmd_duration_ns/1.e6f;
|
|
1895
|
-
fprintf(fperf, "%s,%s,%f,%f,%f,%f,%f,%zux%zux%zu,%zux%zux%zu,%zux%zux%zux%zu\n",
|
|
1896
|
-
info.op_name.c_str(), info.kernel_name.c_str(),
|
|
1897
|
-
info.cmd_queued_duration_ns/1.e6f,
|
|
1898
|
-
info.cmd_submit_duration_ns/1.e6f,
|
|
1899
|
-
info.cmd_duration_ns/1.e6f,
|
|
1900
|
-
info.cmd_complete_duration_ns/1.e6f,
|
|
1901
|
-
info.cmd_total_duration_ns/1.e6f,
|
|
1902
|
-
info.global_size[0], info.global_size[1], info.global_size[2],
|
|
1903
|
-
info.local_size[0], info.local_size[1], info.local_size[2],
|
|
1904
|
-
info.output_size[0], info.output_size[1], info.output_size[2], info.output_size[3]);
|
|
1905
|
-
}
|
|
1906
|
-
fclose(fperf);
|
|
1907
|
-
|
|
1908
|
-
GGML_LOG_INFO("ggml_opencl: total kernel time: %f\n", total_kernel_time);
|
|
1909
|
-
|
|
1910
|
-
// Dump a simple chrome trace
|
|
1911
|
-
FILE* ftrace = fopen("cl_trace.json", "w");
|
|
1912
|
-
if (!ftrace) {
|
|
1913
|
-
GGML_LOG_ERROR("Failed to open cl_trace.json\n");
|
|
1914
|
-
return;
|
|
2074
|
+
// The CL context is shared by all backends, release it if all backends have been released
|
|
2075
|
+
bool should_release_opencl = true;
|
|
2076
|
+
for (auto device : g_ggml_backend_opencl_devices) {
|
|
2077
|
+
ggml_backend_opencl_device_context * ctx_dev = (ggml_backend_opencl_device_context *) device.context;
|
|
2078
|
+
if (ctx_dev->backend_ctx->ref_count > 0) {
|
|
2079
|
+
should_release_opencl = false;
|
|
2080
|
+
}
|
|
1915
2081
|
}
|
|
1916
2082
|
|
|
1917
|
-
|
|
1918
|
-
|
|
1919
|
-
fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"B\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Host\"},\n",
|
|
1920
|
-
info.kernel_name.c_str(), info.cmd_queued/1000);
|
|
1921
|
-
fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"E\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Host\"},\n",
|
|
1922
|
-
info.kernel_name.c_str(), info.cmd_submit/1000);
|
|
1923
|
-
|
|
1924
|
-
fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"B\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Device\"},\n",
|
|
1925
|
-
info.kernel_name.c_str(), info.cmd_start/1000);
|
|
1926
|
-
fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"E\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Device\"},\n",
|
|
1927
|
-
info.kernel_name.c_str(), info.cmd_end/1000);
|
|
2083
|
+
if (should_release_opencl) {
|
|
2084
|
+
CL_CHECK(clReleaseContext(ctx->context));
|
|
1928
2085
|
}
|
|
1929
|
-
fclose(ftrace);
|
|
1930
|
-
#endif
|
|
1931
2086
|
}
|
|
1932
2087
|
|
|
1933
2088
|
//------------------------------------------------------------------------------
|
|
@@ -2011,9 +2166,7 @@ static const char * ggml_backend_opencl_name(ggml_backend_t backend) {
|
|
|
2011
2166
|
}
|
|
2012
2167
|
|
|
2013
2168
|
static void ggml_backend_opencl_free(ggml_backend_t backend) {
|
|
2014
|
-
ggml_cl2_free();
|
|
2015
|
-
|
|
2016
|
-
GGML_UNUSED(backend);
|
|
2169
|
+
ggml_cl2_free(backend);
|
|
2017
2170
|
}
|
|
2018
2171
|
|
|
2019
2172
|
static void ggml_backend_opencl_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
|
@@ -2088,7 +2241,7 @@ static ggml_status ggml_backend_opencl_graph_compute(ggml_backend_t backend, ggm
|
|
|
2088
2241
|
// dependencies.
|
|
2089
2242
|
sync_with_other_backends(backend);
|
|
2090
2243
|
|
|
2091
|
-
if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
|
|
2244
|
+
if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
|
|
2092
2245
|
continue;
|
|
2093
2246
|
}
|
|
2094
2247
|
|
|
@@ -2123,6 +2276,21 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
|
|
|
2123
2276
|
default:
|
|
2124
2277
|
return false;
|
|
2125
2278
|
}
|
|
2279
|
+
case GGML_OP_SET_ROWS:
|
|
2280
|
+
{
|
|
2281
|
+
// TODO: add support
|
|
2282
|
+
// ref: https://github.com/ggml-org/llama.cpp/pull/14274
|
|
2283
|
+
if (op->src[0]->type != GGML_TYPE_F32) {
|
|
2284
|
+
return false;
|
|
2285
|
+
}
|
|
2286
|
+
switch (op->type) {
|
|
2287
|
+
case GGML_TYPE_F16:
|
|
2288
|
+
case GGML_TYPE_F32:
|
|
2289
|
+
return true;
|
|
2290
|
+
default:
|
|
2291
|
+
return false;
|
|
2292
|
+
}
|
|
2293
|
+
}
|
|
2126
2294
|
case GGML_OP_CPY:
|
|
2127
2295
|
case GGML_OP_DUP:
|
|
2128
2296
|
case GGML_OP_CONT:
|
|
@@ -2157,6 +2325,7 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
|
|
|
2157
2325
|
case GGML_UNARY_OP_GELU:
|
|
2158
2326
|
case GGML_UNARY_OP_SILU:
|
|
2159
2327
|
case GGML_UNARY_OP_RELU:
|
|
2328
|
+
case GGML_UNARY_OP_GELU_ERF:
|
|
2160
2329
|
case GGML_UNARY_OP_GELU_QUICK:
|
|
2161
2330
|
return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
|
|
2162
2331
|
case GGML_UNARY_OP_SIGMOID:
|
|
@@ -2167,6 +2336,17 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
|
|
|
2167
2336
|
default:
|
|
2168
2337
|
return false;
|
|
2169
2338
|
}
|
|
2339
|
+
case GGML_OP_GLU:
|
|
2340
|
+
switch (ggml_get_glu_op(op)) {
|
|
2341
|
+
case GGML_GLU_OP_GEGLU:
|
|
2342
|
+
case GGML_GLU_OP_REGLU:
|
|
2343
|
+
case GGML_GLU_OP_SWIGLU:
|
|
2344
|
+
case GGML_GLU_OP_GEGLU_ERF:
|
|
2345
|
+
case GGML_GLU_OP_GEGLU_QUICK:
|
|
2346
|
+
return ggml_is_contiguous_1(op->src[0]) && (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16);
|
|
2347
|
+
default:
|
|
2348
|
+
return false;
|
|
2349
|
+
}
|
|
2170
2350
|
case GGML_OP_CLAMP:
|
|
2171
2351
|
return op->src[0]->type == GGML_TYPE_F32;
|
|
2172
2352
|
case GGML_OP_SOFT_MAX:
|
|
@@ -2899,6 +3079,8 @@ static void ggml_backend_opencl_device_get_props(ggml_backend_dev_t dev, struct
|
|
|
2899
3079
|
|
|
2900
3080
|
static ggml_backend_t ggml_backend_opencl_device_init(ggml_backend_dev_t dev, const char * params) {
|
|
2901
3081
|
ggml_backend_opencl_context * backend_ctx = ggml_cl2_init(dev);
|
|
3082
|
+
// Getting a new reference to the backend, increase ref_count
|
|
3083
|
+
backend_ctx->ref_count++;
|
|
2902
3084
|
|
|
2903
3085
|
ggml_backend_t backend = new ggml_backend {
|
|
2904
3086
|
/* .guid = */ ggml_backend_opencl_guid(),
|
|
@@ -3089,7 +3271,7 @@ static void dump_tensor(ggml_backend_t backend, const struct ggml_tensor * tenso
|
|
|
3089
3271
|
|
|
3090
3272
|
// Open file and dump.
|
|
3091
3273
|
char fname[512];
|
|
3092
|
-
|
|
3274
|
+
snprintf(fname, sizeof(fname), "./tensor-dumps/%s.txt", tensor->name);
|
|
3093
3275
|
FILE * f = fopen(fname, "w");
|
|
3094
3276
|
if (!f) {
|
|
3095
3277
|
printf("Failed to open %s\n", fname);
|
|
@@ -3159,31 +3341,6 @@ static void dump_tensor(ggml_backend_t backend, const struct ggml_tensor * tenso
|
|
|
3159
3341
|
#define dump_tensor(tensor)
|
|
3160
3342
|
#endif
|
|
3161
3343
|
|
|
3162
|
-
//------------------------------------------------------------------------------
|
|
3163
|
-
// Profiling utility
|
|
3164
|
-
//------------------------------------------------------------------------------
|
|
3165
|
-
#ifdef GGML_OPENCL_PROFILING
|
|
3166
|
-
static void populateProfilingInfo(
|
|
3167
|
-
ProfilingInfo& info, cl_event evt, cl_kernel kernel,
|
|
3168
|
-
size_t global_size[3], size_t local_size[3],
|
|
3169
|
-
const ggml_tensor * tensor) {
|
|
3170
|
-
info.op_name = tensor->name;
|
|
3171
|
-
info.kernel = kernel;
|
|
3172
|
-
info.evt = evt;
|
|
3173
|
-
|
|
3174
|
-
info.local_size[0] = local_size[0];
|
|
3175
|
-
info.local_size[1] = local_size[1];
|
|
3176
|
-
info.local_size[2] = local_size[2];
|
|
3177
|
-
info.global_size[0] = global_size[0];
|
|
3178
|
-
info.global_size[1] = global_size[1];
|
|
3179
|
-
info.global_size[2] = global_size[2];
|
|
3180
|
-
info.output_size[0] = tensor->ne[0];
|
|
3181
|
-
info.output_size[1] = tensor->ne[1];
|
|
3182
|
-
info.output_size[2] = tensor->ne[2];
|
|
3183
|
-
info.output_size[3] = tensor->ne[3];
|
|
3184
|
-
}
|
|
3185
|
-
#endif
|
|
3186
|
-
|
|
3187
3344
|
//------------------------------------------------------------------------------
|
|
3188
3345
|
// Ops
|
|
3189
3346
|
//------------------------------------------------------------------------------
|
|
@@ -3227,7 +3384,6 @@ static void ggml_cl_get_rows(ggml_backend_t backend, const ggml_tensor * src0, c
|
|
|
3227
3384
|
const cl_ulong nb2 = dst ? dst->nb[2] : 0;
|
|
3228
3385
|
|
|
3229
3386
|
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
|
3230
|
-
cl_command_queue queue = backend_ctx->queue;
|
|
3231
3387
|
|
|
3232
3388
|
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
|
|
3233
3389
|
ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
|
|
@@ -3271,15 +3427,112 @@ static void ggml_cl_get_rows(ggml_backend_t backend, const ggml_tensor * src0, c
|
|
|
3271
3427
|
size_t global_work_size[] = {(size_t)ne10, (size_t)ne11, 1};
|
|
3272
3428
|
size_t local_work_size[] = {1, 1, 1};
|
|
3273
3429
|
|
|
3274
|
-
|
|
3275
|
-
|
|
3276
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
|
|
3430
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
|
3431
|
+
}
|
|
3277
3432
|
|
|
3278
|
-
|
|
3279
|
-
|
|
3280
|
-
|
|
3281
|
-
|
|
3282
|
-
|
|
3433
|
+
// Launch the set_rows kernel for `dst`.
//
// backend: backend whose context (ggml_backend_opencl_context) provides the
//          kernels, the GPU family and the enqueue helper.
// src0:    first source tensor; its ne[1..3] / nb[1..3] are passed to the kernel.
// src1:    second source tensor; its ne[1..2] / nb[0..2] are passed to the kernel.
//          NOTE(review): presumably the row indices - confirm against set_rows.cl.
// dst:     destination tensor; its type selects the f32 or f16 kernel.
//
// All three tensors must be non-null and carry a ggml_tensor_extra_cl in `extra`.
// Any dst type other than F32 / F16 aborts with "not implemented".
static void ggml_cl_set_rows(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    GGML_ASSERT(src0);
    GGML_ASSERT(src0->extra);
    GGML_ASSERT(src1);
    GGML_ASSERT(src1->extra);
    GGML_ASSERT(dst);
    GGML_ASSERT(dst->extra);

    // ne0 = ne00
    // ne2 = ne02
    // ne3 = ne03

    // src0 extents and byte strides.
    const int ne01 = src0->ne[1];
    const int ne02 = src0->ne[2];
    const int ne03 = src0->ne[3];

    const cl_ulong nb01 = src0->nb[1];
    const cl_ulong nb02 = src0->nb[2];
    const cl_ulong nb03 = src0->nb[3];

    // src1 extents and byte strides.
    const int ne11 = src1->ne[1];
    const int ne12 = src1->ne[2];

    const cl_ulong nb10 = src1->nb[0];
    const cl_ulong nb11 = src1->nb[1];
    const cl_ulong nb12 = src1->nb[2];

    // dst row length and byte strides.
    const int ne0 = dst->ne[0];

    const cl_ulong nb1 = dst->nb[1];
    const cl_ulong nb2 = dst->nb[2];
    const cl_ulong nb3 = dst->nb[3];

    // Number of blocks of dst->type in one destination row.
    const int nblk0 = ne0/ggml_blck_size(dst->type);

    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;

    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
    ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;

    // Byte offsets into the device buffers, including any view offset.
    cl_ulong offset0 = extra0->offset + src0->view_offs;
    cl_ulong offset1 = extra1->offset + src1->view_offs;
    cl_ulong offsetd = extrad->offset + dst->view_offs;

    // Pick the kernel variant matching the destination element type.
    cl_kernel kernel;

    switch (dst->type) {
        case GGML_TYPE_F32:
            kernel = backend_ctx->kernel_set_rows_f32;
            break;
        case GGML_TYPE_F16:
            kernel = backend_ctx->kernel_set_rows_f16;
            break;
        default:
            GGML_ABORT("not implemented");
    }

    CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &extra0->data_device));
    CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_ulong), &offset0));
    CL_CHECK(clSetKernelArg(kernel,  2, sizeof(cl_mem),   &extra1->data_device));
    CL_CHECK(clSetKernelArg(kernel,  3, sizeof(cl_ulong), &offset1));
    CL_CHECK(clSetKernelArg(kernel,  4, sizeof(cl_mem),   &extrad->data_device));
    CL_CHECK(clSetKernelArg(kernel,  5, sizeof(cl_ulong), &offsetd));
    CL_CHECK(clSetKernelArg(kernel,  6, sizeof(int),      &ne01));
    CL_CHECK(clSetKernelArg(kernel,  7, sizeof(cl_ulong), &nb01));
    CL_CHECK(clSetKernelArg(kernel,  8, sizeof(cl_ulong), &nb02));
    CL_CHECK(clSetKernelArg(kernel,  9, sizeof(cl_ulong), &nb03));
    CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int),      &ne11));
    CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int),      &ne12));
    CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb10));
    CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb11));
    CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb12));
    CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int),      &nblk0));
    CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb1));
    CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb2));
    CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &nb3));

    // Starting work-items-per-row guess: 32 on Intel, 64 on every other family.
    int nth0 = (backend_ctx->gpu_family == INTEL) ? 32 : 64;

    // Double the guess until it covers a row or reaches the kernel's limit.
    const int max_workgroup_size = (int) backend_ctx->get_kernel_workgroup_size(kernel);
    while (nth0 < nblk0 && nth0 < max_workgroup_size) {
        nth0 *= 2;
    }

    // The initial guess, or the last doubling, can exceed the kernel's limit
    // (e.g. a limit below 32/64, or one that is not a power of two). A local
    // size above CL_KERNEL_WORK_GROUP_SIZE is rejected by the runtime, so clamp
    // before deriving rows_per_workgroup from it.
    if (nth0 > max_workgroup_size) {
        nth0 = max_workgroup_size;
    }

    // When a row needs fewer work-items than nth0, pack several rows into one
    // work-group so the group still has at most the original nth0 work-items.
    int rows_per_workgroup = 1;
    if (nth0 > nblk0) {
        rows_per_workgroup = nth0 / nblk0;
        nth0               = nblk0;
    }

    // Dimension 0: one group of nth0 work-items per bundle of rows_per_workgroup rows.
    const size_t row_groups = (size_t)(ne01 + rows_per_workgroup - 1)/rows_per_workgroup;

    size_t global_work_size[] = {
        row_groups*nth0,
        (size_t)ne02*rows_per_workgroup,
        (size_t)ne03};
    size_t local_work_size[] = {(size_t)nth0, (size_t)rows_per_workgroup, 1};

    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
}
|
|
3284
3537
|
|
|
3285
3538
|
static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
|
@@ -3321,7 +3574,6 @@ static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const
|
|
|
3321
3574
|
const cl_ulong nb3 = dst ? dst->nb[3] : 0;
|
|
3322
3575
|
|
|
3323
3576
|
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
|
3324
|
-
cl_command_queue queue = backend_ctx->queue;
|
|
3325
3577
|
|
|
3326
3578
|
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
|
|
3327
3579
|
ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
|
|
@@ -3396,29 +3648,13 @@ static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const
|
|
|
3396
3648
|
local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
|
|
3397
3649
|
}
|
|
3398
3650
|
|
|
3399
|
-
|
|
3400
|
-
cl_event evt;
|
|
3401
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
|
|
3402
|
-
|
|
3403
|
-
g_profiling_info.emplace_back();
|
|
3404
|
-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
|
|
3405
|
-
#else
|
|
3406
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
|
|
3407
|
-
#endif
|
|
3651
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
|
|
3408
3652
|
} else {
|
|
3409
3653
|
unsigned int nth = MIN(64, ne0);
|
|
3410
3654
|
size_t global_work_size[] = {ne01*nth, (size_t)ne02, (size_t)ne03};
|
|
3411
3655
|
size_t local_work_size[] = {nth, 1, 1};
|
|
3412
3656
|
|
|
3413
|
-
|
|
3414
|
-
cl_event evt;
|
|
3415
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
|
|
3416
|
-
|
|
3417
|
-
g_profiling_info.emplace_back();
|
|
3418
|
-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
|
|
3419
|
-
#else
|
|
3420
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
|
|
3421
|
-
#endif
|
|
3657
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
|
3422
3658
|
}
|
|
3423
3659
|
}
|
|
3424
3660
|
|
|
@@ -3461,7 +3697,6 @@ static void ggml_cl_mul(ggml_backend_t backend, const ggml_tensor * src0, const
|
|
|
3461
3697
|
const cl_ulong nb3 = dst ? dst->nb[3] : 0;
|
|
3462
3698
|
|
|
3463
3699
|
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
|
3464
|
-
cl_command_queue queue = backend_ctx->queue;
|
|
3465
3700
|
|
|
3466
3701
|
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
|
|
3467
3702
|
ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
|
|
@@ -3536,29 +3771,13 @@ static void ggml_cl_mul(ggml_backend_t backend, const ggml_tensor * src0, const
|
|
|
3536
3771
|
local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
|
|
3537
3772
|
}
|
|
3538
3773
|
|
|
3539
|
-
|
|
3540
|
-
cl_event evt;
|
|
3541
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
|
|
3542
|
-
|
|
3543
|
-
g_profiling_info.emplace_back();
|
|
3544
|
-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
|
|
3545
|
-
#else
|
|
3546
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
|
|
3547
|
-
#endif
|
|
3774
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
|
|
3548
3775
|
} else {
|
|
3549
3776
|
unsigned int nth = MIN(64, ne0);
|
|
3550
3777
|
size_t global_work_size[] = {ne01*nth, (size_t)ne02, (size_t)ne03};
|
|
3551
3778
|
size_t local_work_size[] = {nth, 1, 1};
|
|
3552
3779
|
|
|
3553
|
-
|
|
3554
|
-
cl_event evt;
|
|
3555
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
|
|
3556
|
-
|
|
3557
|
-
g_profiling_info.emplace_back();
|
|
3558
|
-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
|
|
3559
|
-
#else
|
|
3560
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
|
|
3561
|
-
#endif
|
|
3780
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
|
3562
3781
|
}
|
|
3563
3782
|
}
|
|
3564
3783
|
|
|
@@ -3598,7 +3817,6 @@ static void ggml_cl_div(ggml_backend_t backend, const ggml_tensor * src0, const
|
|
|
3598
3817
|
const cl_ulong nb3 = dst->nb[3];
|
|
3599
3818
|
|
|
3600
3819
|
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
|
3601
|
-
cl_command_queue queue = backend_ctx->queue;
|
|
3602
3820
|
|
|
3603
3821
|
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
|
|
3604
3822
|
ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
|
|
@@ -3661,29 +3879,13 @@ static void ggml_cl_div(ggml_backend_t backend, const ggml_tensor * src0, const
|
|
|
3661
3879
|
size_t global_work_size[] = {(size_t)n, 1, 1};
|
|
3662
3880
|
size_t local_work_size[] = {64, 1, 1};
|
|
3663
3881
|
|
|
3664
|
-
|
|
3665
|
-
cl_event evt;
|
|
3666
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
|
|
3667
|
-
|
|
3668
|
-
g_profiling_info.emplace_back();
|
|
3669
|
-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
|
|
3670
|
-
#else
|
|
3671
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
|
|
3672
|
-
#endif
|
|
3882
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
|
3673
3883
|
} else {
|
|
3674
3884
|
unsigned int nth = MIN(64, ne0);
|
|
3675
3885
|
size_t global_work_size[] = {ne01*nth, (size_t)ne02, (size_t)ne03};
|
|
3676
3886
|
size_t local_work_size[] = {nth, 1, 1};
|
|
3677
3887
|
|
|
3678
|
-
|
|
3679
|
-
cl_event evt;
|
|
3680
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
|
|
3681
|
-
|
|
3682
|
-
g_profiling_info.emplace_back();
|
|
3683
|
-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
|
|
3684
|
-
#else
|
|
3685
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
|
|
3686
|
-
#endif
|
|
3888
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
|
3687
3889
|
}
|
|
3688
3890
|
}
|
|
3689
3891
|
|
|
@@ -3723,7 +3925,6 @@ static void ggml_cl_sub(ggml_backend_t backend, const ggml_tensor * src0, const
|
|
|
3723
3925
|
const cl_ulong nb3 = dst->nb[3];
|
|
3724
3926
|
|
|
3725
3927
|
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
|
3726
|
-
cl_command_queue queue = backend_ctx->queue;
|
|
3727
3928
|
|
|
3728
3929
|
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
|
|
3729
3930
|
ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
|
|
@@ -3786,29 +3987,13 @@ static void ggml_cl_sub(ggml_backend_t backend, const ggml_tensor * src0, const
|
|
|
3786
3987
|
size_t global_work_size[] = {(size_t)n, 1, 1};
|
|
3787
3988
|
size_t local_work_size[] = {64, 1, 1};
|
|
3788
3989
|
|
|
3789
|
-
|
|
3790
|
-
cl_event evt;
|
|
3791
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
|
|
3792
|
-
|
|
3793
|
-
g_profiling_info.emplace_back();
|
|
3794
|
-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
|
|
3795
|
-
#else
|
|
3796
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
|
|
3797
|
-
#endif
|
|
3990
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
|
3798
3991
|
} else {
|
|
3799
3992
|
unsigned int nth = MIN(64, ne0);
|
|
3800
3993
|
size_t global_work_size[] = {ne01*nth, (size_t)ne02, (size_t)ne03};
|
|
3801
3994
|
size_t local_work_size[] = {nth, 1, 1};
|
|
3802
3995
|
|
|
3803
|
-
|
|
3804
|
-
cl_event evt;
|
|
3805
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
|
|
3806
|
-
|
|
3807
|
-
g_profiling_info.emplace_back();
|
|
3808
|
-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
|
|
3809
|
-
#else
|
|
3810
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
|
|
3811
|
-
#endif
|
|
3996
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
|
3812
3997
|
}
|
|
3813
3998
|
}
|
|
3814
3999
|
|
|
@@ -3821,7 +4006,6 @@ static void ggml_cl_gelu(ggml_backend_t backend, const ggml_tensor * src0, const
|
|
|
3821
4006
|
UNUSED(src1);
|
|
3822
4007
|
|
|
3823
4008
|
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
|
3824
|
-
cl_command_queue queue = backend_ctx->queue;
|
|
3825
4009
|
|
|
3826
4010
|
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
|
|
3827
4011
|
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
|
|
@@ -3848,15 +4032,45 @@ static void ggml_cl_gelu(ggml_backend_t backend, const ggml_tensor * src0, const
|
|
|
3848
4032
|
size_t global_work_size[] = {(size_t)n, 1, 1};
|
|
3849
4033
|
size_t local_work_size[] = {64, 1, 1};
|
|
3850
4034
|
|
|
3851
|
-
|
|
3852
|
-
|
|
3853
|
-
clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt);
|
|
4035
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
|
4036
|
+
}
|
|
3854
4037
|
|
|
3855
|
-
|
|
3856
|
-
|
|
3857
|
-
|
|
3858
|
-
|
|
3859
|
-
|
|
4038
|
+
static void ggml_cl_gelu_erf(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
|
4039
|
+
GGML_ASSERT(src0);
|
|
4040
|
+
GGML_ASSERT(src0->extra);
|
|
4041
|
+
GGML_ASSERT(dst);
|
|
4042
|
+
GGML_ASSERT(dst->extra);
|
|
4043
|
+
|
|
4044
|
+
UNUSED(src1);
|
|
4045
|
+
|
|
4046
|
+
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
|
4047
|
+
|
|
4048
|
+
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
|
|
4049
|
+
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
|
|
4050
|
+
|
|
4051
|
+
cl_ulong offset0 = extra0->offset + src0->view_offs;
|
|
4052
|
+
cl_ulong offsetd = extrad->offset + dst->view_offs;
|
|
4053
|
+
|
|
4054
|
+
cl_kernel kernel;
|
|
4055
|
+
|
|
4056
|
+
int n = ggml_nelements(dst);
|
|
4057
|
+
|
|
4058
|
+
if (n % 4 == 0) {
|
|
4059
|
+
kernel = backend_ctx->kernel_gelu_erf_4;
|
|
4060
|
+
n /= 4;
|
|
4061
|
+
} else {
|
|
4062
|
+
kernel = backend_ctx->kernel_gelu_erf;
|
|
4063
|
+
}
|
|
4064
|
+
|
|
4065
|
+
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
|
|
4066
|
+
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
|
|
4067
|
+
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
|
|
4068
|
+
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
|
|
4069
|
+
|
|
4070
|
+
size_t global_work_size[] = {(size_t)n, 1, 1};
|
|
4071
|
+
size_t local_work_size[] = {64, 1, 1};
|
|
4072
|
+
|
|
4073
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
|
3860
4074
|
}
|
|
3861
4075
|
|
|
3862
4076
|
static void ggml_cl_gelu_quick(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
|
@@ -3868,7 +4082,6 @@ static void ggml_cl_gelu_quick(ggml_backend_t backend, const ggml_tensor * src0,
|
|
|
3868
4082
|
UNUSED(src1);
|
|
3869
4083
|
|
|
3870
4084
|
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
|
3871
|
-
cl_command_queue queue = backend_ctx->queue;
|
|
3872
4085
|
|
|
3873
4086
|
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
|
|
3874
4087
|
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
|
|
@@ -3895,15 +4108,7 @@ static void ggml_cl_gelu_quick(ggml_backend_t backend, const ggml_tensor * src0,
|
|
|
3895
4108
|
size_t global_work_size[] = {(size_t)n, 1, 1};
|
|
3896
4109
|
size_t local_work_size[] = {64, 1, 1};
|
|
3897
4110
|
|
|
3898
|
-
|
|
3899
|
-
cl_event evt;
|
|
3900
|
-
clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt);
|
|
3901
|
-
|
|
3902
|
-
g_profiling_info.emplace_back();
|
|
3903
|
-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
|
|
3904
|
-
#else
|
|
3905
|
-
clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL);
|
|
3906
|
-
#endif
|
|
4111
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
|
3907
4112
|
}
|
|
3908
4113
|
|
|
3909
4114
|
static void ggml_cl_silu(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
|
@@ -3915,7 +4120,6 @@ static void ggml_cl_silu(ggml_backend_t backend, const ggml_tensor * src0, const
|
|
|
3915
4120
|
UNUSED(src1);
|
|
3916
4121
|
|
|
3917
4122
|
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
|
3918
|
-
cl_command_queue queue = backend_ctx->queue;
|
|
3919
4123
|
|
|
3920
4124
|
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
|
|
3921
4125
|
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
|
|
@@ -3947,15 +4151,7 @@ static void ggml_cl_silu(ggml_backend_t backend, const ggml_tensor * src0, const
|
|
|
3947
4151
|
local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
|
|
3948
4152
|
}
|
|
3949
4153
|
|
|
3950
|
-
|
|
3951
|
-
cl_event evt;
|
|
3952
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
|
|
3953
|
-
|
|
3954
|
-
g_profiling_info.emplace_back();
|
|
3955
|
-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
|
|
3956
|
-
#else
|
|
3957
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
|
|
3958
|
-
#endif
|
|
4154
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
|
|
3959
4155
|
}
|
|
3960
4156
|
|
|
3961
4157
|
static void ggml_cl_relu(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
|
@@ -3967,7 +4163,6 @@ static void ggml_cl_relu(ggml_backend_t backend, const ggml_tensor * src0, const
|
|
|
3967
4163
|
UNUSED(src1);
|
|
3968
4164
|
|
|
3969
4165
|
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
|
3970
|
-
cl_command_queue queue = backend_ctx->queue;
|
|
3971
4166
|
|
|
3972
4167
|
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
|
|
3973
4168
|
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
|
|
@@ -3992,15 +4187,7 @@ static void ggml_cl_relu(ggml_backend_t backend, const ggml_tensor * src0, const
|
|
|
3992
4187
|
local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
|
|
3993
4188
|
}
|
|
3994
4189
|
|
|
3995
|
-
|
|
3996
|
-
cl_event evt;
|
|
3997
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
|
|
3998
|
-
|
|
3999
|
-
g_profiling_info.emplace_back();
|
|
4000
|
-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
|
|
4001
|
-
#else
|
|
4002
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
|
|
4003
|
-
#endif
|
|
4190
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
|
|
4004
4191
|
}
|
|
4005
4192
|
|
|
4006
4193
|
static void ggml_cl_sigmoid(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
|
@@ -4012,7 +4199,6 @@ static void ggml_cl_sigmoid(ggml_backend_t backend, const ggml_tensor * src0, co
|
|
|
4012
4199
|
UNUSED(src1);
|
|
4013
4200
|
|
|
4014
4201
|
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
|
4015
|
-
cl_command_queue queue = backend_ctx->queue;
|
|
4016
4202
|
|
|
4017
4203
|
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
|
|
4018
4204
|
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
|
|
@@ -4044,15 +4230,7 @@ static void ggml_cl_sigmoid(ggml_backend_t backend, const ggml_tensor * src0, co
|
|
|
4044
4230
|
local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
|
|
4045
4231
|
}
|
|
4046
4232
|
|
|
4047
|
-
|
|
4048
|
-
cl_event evt;
|
|
4049
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
|
|
4050
|
-
|
|
4051
|
-
g_profiling_info.emplace_back();
|
|
4052
|
-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
|
|
4053
|
-
#else
|
|
4054
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
|
|
4055
|
-
#endif
|
|
4233
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
|
|
4056
4234
|
}
|
|
4057
4235
|
|
|
4058
4236
|
static void ggml_cl_clamp(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
|
@@ -4064,7 +4242,6 @@ static void ggml_cl_clamp(ggml_backend_t backend, const ggml_tensor * src0, cons
|
|
|
4064
4242
|
UNUSED(src1);
|
|
4065
4243
|
|
|
4066
4244
|
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
|
4067
|
-
cl_command_queue queue = backend_ctx->queue;
|
|
4068
4245
|
|
|
4069
4246
|
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
|
|
4070
4247
|
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
|
|
@@ -4096,15 +4273,7 @@ static void ggml_cl_clamp(ggml_backend_t backend, const ggml_tensor * src0, cons
|
|
|
4096
4273
|
local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
|
|
4097
4274
|
}
|
|
4098
4275
|
|
|
4099
|
-
|
|
4100
|
-
cl_event evt;
|
|
4101
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
|
|
4102
|
-
|
|
4103
|
-
g_profiling_info.emplace_back();
|
|
4104
|
-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
|
|
4105
|
-
#else
|
|
4106
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
|
|
4107
|
-
#endif
|
|
4276
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
|
|
4108
4277
|
}
|
|
4109
4278
|
|
|
4110
4279
|
static void ggml_cl_norm(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
|
@@ -4116,7 +4285,6 @@ static void ggml_cl_norm(ggml_backend_t backend, const ggml_tensor * src0, const
|
|
|
4116
4285
|
UNUSED(src1);
|
|
4117
4286
|
|
|
4118
4287
|
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
|
4119
|
-
cl_command_queue queue = backend_ctx->queue;
|
|
4120
4288
|
|
|
4121
4289
|
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
|
|
4122
4290
|
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
|
|
@@ -4157,15 +4325,7 @@ static void ggml_cl_norm(ggml_backend_t backend, const ggml_tensor * src0, const
|
|
|
4157
4325
|
size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
|
|
4158
4326
|
size_t local_work_size[] = {(size_t)nth, 1, 1};
|
|
4159
4327
|
|
|
4160
|
-
|
|
4161
|
-
cl_event evt;
|
|
4162
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
|
|
4163
|
-
|
|
4164
|
-
g_profiling_info.emplace_back();
|
|
4165
|
-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
|
|
4166
|
-
#else
|
|
4167
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
|
|
4168
|
-
#endif
|
|
4328
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
|
4169
4329
|
}
|
|
4170
4330
|
|
|
4171
4331
|
static void ggml_cl_rms_norm(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
|
@@ -4177,7 +4337,6 @@ static void ggml_cl_rms_norm(ggml_backend_t backend, const ggml_tensor * src0, c
|
|
|
4177
4337
|
UNUSED(src1);
|
|
4178
4338
|
|
|
4179
4339
|
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
|
4180
|
-
cl_command_queue queue = backend_ctx->queue;
|
|
4181
4340
|
|
|
4182
4341
|
//ggml_backend_opencl_device_context * dev_ctx =
|
|
4183
4342
|
// (ggml_backend_opencl_device_context *)backend->device->context;
|
|
@@ -4241,15 +4400,7 @@ static void ggml_cl_rms_norm(ggml_backend_t backend, const ggml_tensor * src0, c
|
|
|
4241
4400
|
// This is local memory - the size depends on subgroup size.
|
|
4242
4401
|
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(float)*nth/sgs, NULL));
|
|
4243
4402
|
|
|
4244
|
-
|
|
4245
|
-
cl_event evt;
|
|
4246
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
|
|
4247
|
-
|
|
4248
|
-
g_profiling_info.emplace_back();
|
|
4249
|
-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
|
|
4250
|
-
#else
|
|
4251
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
|
|
4252
|
-
#endif
|
|
4403
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
|
4253
4404
|
}
|
|
4254
4405
|
|
|
4255
4406
|
static void ggml_cl_group_norm(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
|
@@ -4261,7 +4412,6 @@ static void ggml_cl_group_norm(ggml_backend_t backend, const ggml_tensor * src0,
|
|
|
4261
4412
|
UNUSED(src1);
|
|
4262
4413
|
|
|
4263
4414
|
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
|
4264
|
-
cl_command_queue queue = backend_ctx->queue;
|
|
4265
4415
|
|
|
4266
4416
|
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
|
|
4267
4417
|
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
|
|
@@ -4300,15 +4450,7 @@ static void ggml_cl_group_norm(ggml_backend_t backend, const ggml_tensor * src0,
|
|
|
4300
4450
|
size_t global_work_size[] = {(size_t)n_groups*sgs, 1, 1};
|
|
4301
4451
|
size_t local_work_size[] = {(size_t)sgs, 1, 1};
|
|
4302
4452
|
|
|
4303
|
-
|
|
4304
|
-
cl_event evt;
|
|
4305
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
|
|
4306
|
-
|
|
4307
|
-
g_profiling_info.emplace_back();
|
|
4308
|
-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
|
|
4309
|
-
#else
|
|
4310
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
|
|
4311
|
-
#endif
|
|
4453
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
|
4312
4454
|
}
|
|
4313
4455
|
|
|
4314
4456
|
static void ggml_cl_tanh(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
|
@@ -4320,7 +4462,6 @@ static void ggml_cl_tanh(ggml_backend_t backend, const ggml_tensor * src0, const
|
|
|
4320
4462
|
UNUSED(src1);
|
|
4321
4463
|
|
|
4322
4464
|
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
|
4323
|
-
cl_command_queue queue = backend_ctx->queue;
|
|
4324
4465
|
|
|
4325
4466
|
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
|
|
4326
4467
|
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
|
|
@@ -4397,16 +4538,7 @@ static void ggml_cl_tanh(ggml_backend_t backend, const ggml_tensor * src0, const
|
|
|
4397
4538
|
}
|
|
4398
4539
|
if (global_work_size[0] == 0 || global_work_size[1] == 0 || global_work_size[2] == 0) return;
|
|
4399
4540
|
|
|
4400
|
-
|
|
4401
|
-
#ifdef GGML_OPENCL_PROFILING
|
|
4402
|
-
cl_event evt;
|
|
4403
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
|
|
4404
|
-
|
|
4405
|
-
g_profiling_info.emplace_back();
|
|
4406
|
-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr ? local_work_size : (size_t[3]){0,0,0}, dst);
|
|
4407
|
-
#else
|
|
4408
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
|
|
4409
|
-
#endif
|
|
4541
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
|
|
4410
4542
|
}
|
|
4411
4543
|
|
|
4412
4544
|
static void ggml_cl_repeat(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1_shape_def, ggml_tensor * dst) {
|
|
@@ -4419,7 +4551,6 @@ static void ggml_cl_repeat(ggml_backend_t backend, const ggml_tensor * src0, con
|
|
|
4419
4551
|
UNUSED(src1_shape_def);
|
|
4420
4552
|
|
|
4421
4553
|
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
|
4422
|
-
cl_command_queue queue = backend_ctx->queue;
|
|
4423
4554
|
|
|
4424
4555
|
if (backend_ctx->kernel_repeat == nullptr) {
|
|
4425
4556
|
GGML_LOG_WARN("%s: repeat kernel not available, skipping OpenCL execution.\n", __func__);
|
|
@@ -4467,15 +4598,7 @@ static void ggml_cl_repeat(ggml_backend_t backend, const ggml_tensor * src0, con
|
|
|
4467
4598
|
|
|
4468
4599
|
size_t global_work_size[] = { gws0, gws1, gws2 };
|
|
4469
4600
|
|
|
4470
|
-
|
|
4471
|
-
cl_event evt;
|
|
4472
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, NULL, 0, NULL, &evt));
|
|
4473
|
-
|
|
4474
|
-
g_profiling_info.emplace_back();
|
|
4475
|
-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, (size_t[3]){0,0,0}, dst);
|
|
4476
|
-
#else
|
|
4477
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, NULL, 0, NULL, NULL));
|
|
4478
|
-
#endif
|
|
4601
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, NULL, dst);
|
|
4479
4602
|
}
|
|
4480
4603
|
|
|
4481
4604
|
static void ggml_cl_pad(ggml_backend_t backend, const ggml_tensor * src0, ggml_tensor * dst) {
|
|
@@ -4488,7 +4611,6 @@ static void ggml_cl_pad(ggml_backend_t backend, const ggml_tensor * src0, ggml_t
|
|
|
4488
4611
|
GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1);
|
|
4489
4612
|
|
|
4490
4613
|
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
|
4491
|
-
cl_command_queue queue = backend_ctx->queue;
|
|
4492
4614
|
|
|
4493
4615
|
if (backend_ctx->kernel_pad == nullptr) {
|
|
4494
4616
|
GGML_LOG_WARN("%s: pad kernel not available, skipping OpenCL execution.\n", __func__);
|
|
@@ -4533,15 +4655,7 @@ static void ggml_cl_pad(ggml_backend_t backend, const ggml_tensor * src0, ggml_t
|
|
|
4533
4655
|
local_work_size_ptr = nullptr;
|
|
4534
4656
|
}
|
|
4535
4657
|
|
|
4536
|
-
|
|
4537
|
-
cl_event evt;
|
|
4538
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
|
|
4539
|
-
|
|
4540
|
-
g_profiling_info.emplace_back();
|
|
4541
|
-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr ? local_work_size : (size_t[3]){0,0,0}, dst);
|
|
4542
|
-
#else
|
|
4543
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
|
|
4544
|
-
#endif
|
|
4658
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
|
|
4545
4659
|
}
|
|
4546
4660
|
|
|
4547
4661
|
static void ggml_cl_upscale(ggml_backend_t backend, const ggml_tensor * src0, ggml_tensor * dst) {
|
|
@@ -4553,9 +4667,9 @@ static void ggml_cl_upscale(ggml_backend_t backend, const ggml_tensor * src0, gg
|
|
|
4553
4667
|
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
|
4554
4668
|
|
|
4555
4669
|
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
|
4556
|
-
cl_command_queue queue = backend_ctx->queue;
|
|
4557
4670
|
|
|
4558
|
-
const
|
|
4671
|
+
const int mode_flags = (ggml_scale_mode) ggml_get_op_params_i32(dst, 0);
|
|
4672
|
+
const ggml_scale_mode mode = (ggml_scale_mode) (mode_flags & 0xFF);
|
|
4559
4673
|
cl_kernel kernel = nullptr;
|
|
4560
4674
|
|
|
4561
4675
|
if (mode == GGML_SCALE_MODE_NEAREST) {
|
|
@@ -4586,18 +4700,22 @@ static void ggml_cl_upscale(ggml_backend_t backend, const ggml_tensor * src0, gg
|
|
|
4586
4700
|
const cl_ulong nb02 = src0->nb[2];
|
|
4587
4701
|
const cl_ulong nb03 = src0->nb[3];
|
|
4588
4702
|
|
|
4589
|
-
const int
|
|
4590
|
-
const int
|
|
4703
|
+
const int ne00 = src0->ne[0];
|
|
4704
|
+
const int ne01 = src0->ne[1];
|
|
4705
|
+
const int ne02 = src0->ne[2];
|
|
4706
|
+
const int ne03 = src0->ne[3];
|
|
4707
|
+
|
|
4708
|
+
const int ne0 = dst->ne[0];
|
|
4709
|
+
const int ne1 = dst->ne[1];
|
|
4710
|
+
const int ne2 = dst->ne[2];
|
|
4711
|
+
const int ne3 = dst->ne[3];
|
|
4591
4712
|
|
|
4592
|
-
|
|
4593
|
-
|
|
4594
|
-
|
|
4595
|
-
|
|
4713
|
+
float sf0 = (float)ne0 / ne00;
|
|
4714
|
+
float sf1 = (float)ne1 / ne01;
|
|
4715
|
+
float sf2 = (float)ne2 / ne02;
|
|
4716
|
+
float sf3 = (float)ne3 / ne03;
|
|
4596
4717
|
|
|
4597
|
-
|
|
4598
|
-
const float sf1 = (float)dst->ne[1] / src0->ne[1];
|
|
4599
|
-
const float sf2 = (float)dst->ne[2] / src0->ne[2];
|
|
4600
|
-
const float sf3 = (float)dst->ne[3] / src0->ne[3];
|
|
4718
|
+
float pixel_offset = 0.5f;
|
|
4601
4719
|
|
|
4602
4720
|
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra_src0->data_device));
|
|
4603
4721
|
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &off_src0));
|
|
@@ -4609,29 +4727,36 @@ static void ggml_cl_upscale(ggml_backend_t backend, const ggml_tensor * src0, gg
|
|
|
4609
4727
|
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &nb03));
|
|
4610
4728
|
|
|
4611
4729
|
if (mode == GGML_SCALE_MODE_NEAREST) {
|
|
4612
|
-
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &
|
|
4613
|
-
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &
|
|
4614
|
-
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &
|
|
4615
|
-
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &
|
|
4730
|
+
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne0));
|
|
4731
|
+
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne1));
|
|
4732
|
+
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne2));
|
|
4733
|
+
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne3));
|
|
4616
4734
|
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(float), &sf0));
|
|
4617
4735
|
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(float), &sf1));
|
|
4618
4736
|
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(float), &sf2));
|
|
4619
4737
|
CL_CHECK(clSetKernelArg(kernel, 15, sizeof(float), &sf3));
|
|
4620
4738
|
} else if (mode == GGML_SCALE_MODE_BILINEAR) {
|
|
4621
|
-
|
|
4622
|
-
|
|
4623
|
-
|
|
4624
|
-
|
|
4625
|
-
|
|
4626
|
-
|
|
4739
|
+
if (mode_flags & GGML_SCALE_FLAG_ALIGN_CORNERS) {
|
|
4740
|
+
sf0 = (float)(ne0 - 1) / (ne00 - 1);
|
|
4741
|
+
sf1 = (float)(ne1 - 1) / (ne01 - 1);
|
|
4742
|
+
pixel_offset = 0.0f;
|
|
4743
|
+
}
|
|
4744
|
+
|
|
4745
|
+
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne00));
|
|
4746
|
+
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne01));
|
|
4747
|
+
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne0));
|
|
4748
|
+
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne1));
|
|
4749
|
+
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne2));
|
|
4750
|
+
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne3));
|
|
4627
4751
|
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(float), &sf0));
|
|
4628
4752
|
CL_CHECK(clSetKernelArg(kernel, 15, sizeof(float), &sf1));
|
|
4629
4753
|
CL_CHECK(clSetKernelArg(kernel, 16, sizeof(float), &sf2));
|
|
4630
4754
|
CL_CHECK(clSetKernelArg(kernel, 17, sizeof(float), &sf3));
|
|
4755
|
+
CL_CHECK(clSetKernelArg(kernel, 18, sizeof(float), &pixel_offset));
|
|
4631
4756
|
}
|
|
4632
4757
|
|
|
4633
4758
|
|
|
4634
|
-
size_t dst_total_elements = (size_t)
|
|
4759
|
+
size_t dst_total_elements = (size_t)ne0 * ne1 * ne2 * ne3;
|
|
4635
4760
|
if (dst_total_elements == 0) {
|
|
4636
4761
|
return;
|
|
4637
4762
|
}
|
|
@@ -4644,17 +4769,7 @@ static void ggml_cl_upscale(ggml_backend_t backend, const ggml_tensor * src0, gg
|
|
|
4644
4769
|
local_work_size_ptr = nullptr;
|
|
4645
4770
|
}
|
|
4646
4771
|
|
|
4647
|
-
|
|
4648
|
-
cl_event evt;
|
|
4649
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 1, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
|
|
4650
|
-
|
|
4651
|
-
g_profiling_info.emplace_back();
|
|
4652
|
-
size_t profiling_gws[3] = {global_work_size[0], 1, 1};
|
|
4653
|
-
size_t profiling_lws[3] = {local_work_size_ptr ? local_work_size[0] : 0, 1, 1};
|
|
4654
|
-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, profiling_gws, profiling_lws, dst);
|
|
4655
|
-
#else
|
|
4656
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 1, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
|
|
4657
|
-
#endif
|
|
4772
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
|
|
4658
4773
|
}
|
|
4659
4774
|
|
|
4660
4775
|
static void ggml_cl_concat(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
|
@@ -4732,7 +4847,7 @@ static void ggml_cl_concat(ggml_backend_t backend, const ggml_tensor * src0, con
|
|
|
4732
4847
|
global_work_size[1] = d_ne1;
|
|
4733
4848
|
global_work_size[2] = d_ne2;
|
|
4734
4849
|
|
|
4735
|
-
|
|
4850
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, NULL, dst);
|
|
4736
4851
|
}
|
|
4737
4852
|
}
|
|
4738
4853
|
} else {
|
|
@@ -4782,7 +4897,7 @@ static void ggml_cl_concat(ggml_backend_t backend, const ggml_tensor * src0, con
|
|
|
4782
4897
|
d_ne2 > 0 ? (size_t)d_ne2 : 1,
|
|
4783
4898
|
d_ne3 > 0 ? (size_t)d_ne3 : 1 };
|
|
4784
4899
|
|
|
4785
|
-
|
|
4900
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size_nc, NULL, dst);
|
|
4786
4901
|
}
|
|
4787
4902
|
}
|
|
4788
4903
|
|
|
@@ -4795,7 +4910,6 @@ static void ggml_cl_timestep_embedding(ggml_backend_t backend, const ggml_tensor
|
|
|
4795
4910
|
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
|
4796
4911
|
|
|
4797
4912
|
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
|
4798
|
-
cl_command_queue queue = backend_ctx->queue;
|
|
4799
4913
|
|
|
4800
4914
|
if (backend_ctx->kernel_timestep_embedding == nullptr) {
|
|
4801
4915
|
GGML_LOG_WARN("%s: timestep_embedding kernel not available, skipping OpenCL execution.\n", __func__);
|
|
@@ -4828,17 +4942,59 @@ static void ggml_cl_timestep_embedding(ggml_backend_t backend, const ggml_tensor
|
|
|
4828
4942
|
|
|
4829
4943
|
size_t global_work_size[] = {gws0, gws1, 1};
|
|
4830
4944
|
|
|
4831
|
-
|
|
4832
|
-
|
|
4833
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global_work_size, NULL, 0, NULL, &evt)); // Pass 2 for 2D problem
|
|
4945
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, NULL, dst);
|
|
4946
|
+
}
|
|
4834
4947
|
|
|
4835
|
-
|
|
4836
|
-
|
|
4837
|
-
|
|
4838
|
-
|
|
4839
|
-
|
|
4840
|
-
|
|
4841
|
-
|
|
4948
|
+
static void ggml_cl_mul_mat_f16_f32_tiled(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
|
4949
|
+
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
|
4950
|
+
|
|
4951
|
+
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
|
|
4952
|
+
ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
|
|
4953
|
+
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
|
|
4954
|
+
|
|
4955
|
+
cl_ulong offset0 = extra0->offset + src0->view_offs;
|
|
4956
|
+
cl_ulong offset1 = extra1->offset + src1->view_offs;
|
|
4957
|
+
cl_ulong offsetd = extrad->offset + dst->view_offs;
|
|
4958
|
+
|
|
4959
|
+
const int M = src0->ne[1];
|
|
4960
|
+
const int N = src1->ne[1];
|
|
4961
|
+
const int K = src0->ne[0];
|
|
4962
|
+
|
|
4963
|
+
cl_kernel kernel = backend_ctx->kernel_mul_mat_f16_f32_tiled;
|
|
4964
|
+
|
|
4965
|
+
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(int), &M));
|
|
4966
|
+
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(int), &N));
|
|
4967
|
+
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), &K));
|
|
4968
|
+
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra0->data_device));
|
|
4969
|
+
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_ulong), &offset0));
|
|
4970
|
+
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_mem), &extra1->data_device));
|
|
4971
|
+
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_ulong), &offset1));
|
|
4972
|
+
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_mem), &extrad->data_device));
|
|
4973
|
+
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &offsetd));
|
|
4974
|
+
|
|
4975
|
+
// Tiling parameters. These need to be tuned for optimal performance.
|
|
4976
|
+
// They must match the #defines in the kernel mul_mat_f16_f32.cl.
|
|
4977
|
+
//
|
|
4978
|
+
// OPWM / OPWN: Output tile size per Work-Group. A work-group computes a tile of size OPWM x OPWN.
|
|
4979
|
+
// TPWM / TPWN: Threads per Work-group. This is the work-group size.
|
|
4980
|
+
// OPTM / OPTN: Output elements per Thread. Each thread computes OPTM x OPTN elements.
|
|
4981
|
+
//
|
|
4982
|
+
// The following relationships must hold:
|
|
4983
|
+
// OPWM = TPWM * OPTM
|
|
4984
|
+
// OPWN = TPWN * OPTN
|
|
4985
|
+
//
|
|
4986
|
+
const int OPWM = 64;
|
|
4987
|
+
const int OPWN = 64;
|
|
4988
|
+
const int TPWM = 16;
|
|
4989
|
+
const int TPWN = 8;
|
|
4990
|
+
|
|
4991
|
+
size_t local_work_size[2] = { TPWM, TPWN };
|
|
4992
|
+
size_t global_work_size[2] = {
|
|
4993
|
+
(size_t) ((M + OPWM - 1) / OPWM) * TPWM,
|
|
4994
|
+
(size_t) ((N + OPWN - 1) / OPWN) * TPWN,
|
|
4995
|
+
};
|
|
4996
|
+
|
|
4997
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 2, global_work_size, local_work_size, dst);
|
|
4842
4998
|
}
|
|
4843
4999
|
|
|
4844
5000
|
static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
|
@@ -4853,7 +5009,18 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
|
|
|
4853
5009
|
const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT;
|
|
4854
5010
|
|
|
4855
5011
|
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
|
4856
|
-
|
|
5012
|
+
|
|
5013
|
+
if (src0t == GGML_TYPE_F16 && src1t == GGML_TYPE_F32 &&
|
|
5014
|
+
src0->ne[1] > 32 && // M > 32
|
|
5015
|
+
src1->ne[1] > 32 && // N > 32
|
|
5016
|
+
src0->ne[0] > 32 && // K > 32
|
|
5017
|
+
src0->ne[2] == 1 && src0->ne[3] == 1 &&
|
|
5018
|
+
src1->ne[2] == 1 && src1->ne[3] == 1 &&
|
|
5019
|
+
ggml_is_contiguous(src0) && ggml_is_contiguous(src1) &&
|
|
5020
|
+
backend_ctx->kernel_mul_mat_f16_f32_tiled != NULL) {
|
|
5021
|
+
ggml_cl_mul_mat_f16_f32_tiled(backend, src0, src1, dst);
|
|
5022
|
+
return;
|
|
5023
|
+
}
|
|
4857
5024
|
|
|
4858
5025
|
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
|
|
4859
5026
|
ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
|
|
@@ -5058,15 +5225,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
|
|
|
5058
5225
|
static_cast<size_t>(padded_height_B)
|
|
5059
5226
|
};
|
|
5060
5227
|
|
|
5061
|
-
|
|
5062
|
-
cl_event evt;
|
|
5063
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global_size_t, local_size_t, 0, NULL, &evt));
|
|
5064
|
-
|
|
5065
|
-
g_profiling_info.emplace_back();
|
|
5066
|
-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_size_t, local_size_t, dst);
|
|
5067
|
-
#else
|
|
5068
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global_size_t, local_size_t, 0, NULL, NULL));
|
|
5069
|
-
#endif
|
|
5228
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 2, global_size_t, local_size_t, dst);
|
|
5070
5229
|
} else {
|
|
5071
5230
|
// no need to transpose B in other cases
|
|
5072
5231
|
// create an image for B from sub_buffer
|
|
@@ -5188,16 +5347,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
|
|
|
5188
5347
|
|
|
5189
5348
|
// enqueue kernel with profiling
|
|
5190
5349
|
// <--------------------------------------------> //
|
|
5191
|
-
|
|
5192
|
-
cl_event evt;
|
|
5193
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
|
|
5194
|
-
|
|
5195
|
-
g_profiling_info.emplace_back();
|
|
5196
|
-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
|
|
5197
|
-
// enqueue kernel without profiling
|
|
5198
|
-
#else
|
|
5199
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
|
|
5200
|
-
#endif
|
|
5350
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
|
5201
5351
|
// <--------------------------------------------> //
|
|
5202
5352
|
|
|
5203
5353
|
// deallocate sub buffers and images
|
|
@@ -5277,15 +5427,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
|
|
|
5277
5427
|
global_work_size[2] = (size_t)ne12*ne13;
|
|
5278
5428
|
}
|
|
5279
5429
|
|
|
5280
|
-
|
|
5281
|
-
cl_event evt;
|
|
5282
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
|
|
5283
|
-
|
|
5284
|
-
g_profiling_info.emplace_back();
|
|
5285
|
-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
|
|
5286
|
-
#else
|
|
5287
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
|
|
5288
|
-
#endif
|
|
5430
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
|
5289
5431
|
return;
|
|
5290
5432
|
}
|
|
5291
5433
|
#else // GGML_OPENCL_SOA_Q
|
|
@@ -5515,15 +5657,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
|
|
|
5515
5657
|
size_t global_work_size[] = {(size_t)(ne01 + ndst-1)/ndst*nth0, (size_t)ne11*nth1, (size_t)ne12*ne13};
|
|
5516
5658
|
size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1};
|
|
5517
5659
|
|
|
5518
|
-
|
|
5519
|
-
cl_event evt;
|
|
5520
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
|
|
5521
|
-
|
|
5522
|
-
g_profiling_info.emplace_back();
|
|
5523
|
-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
|
|
5524
|
-
#else
|
|
5525
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
|
|
5526
|
-
#endif
|
|
5660
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
|
5527
5661
|
} else if (src0t == GGML_TYPE_Q4_K) {
|
|
5528
5662
|
GGML_ASSERT(false && "not implemented");
|
|
5529
5663
|
} else if (src0t == GGML_TYPE_Q3_K) {
|
|
@@ -5534,30 +5668,14 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
|
|
|
5534
5668
|
size_t global_work_size[] = {(size_t)(ne01+1)/2*nth0, (size_t)ne11*nth1, (size_t)ne12*ne13};
|
|
5535
5669
|
size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1};
|
|
5536
5670
|
|
|
5537
|
-
|
|
5538
|
-
cl_event evt;
|
|
5539
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
|
|
5540
|
-
|
|
5541
|
-
g_profiling_info.emplace_back();
|
|
5542
|
-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
|
|
5543
|
-
#else
|
|
5544
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
|
|
5545
|
-
#endif
|
|
5671
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
|
5546
5672
|
} else {
|
|
5547
5673
|
int64_t ny = (ne11 + nrows - 1)/nrows;
|
|
5548
5674
|
|
|
5549
5675
|
size_t global_work_size[] = {(size_t)ne01*nth0, (size_t)ny*nth1, (size_t)ne12*ne13};
|
|
5550
5676
|
size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1};
|
|
5551
5677
|
|
|
5552
|
-
|
|
5553
|
-
cl_event evt;
|
|
5554
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
|
|
5555
|
-
|
|
5556
|
-
g_profiling_info.emplace_back();
|
|
5557
|
-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
|
|
5558
|
-
#else
|
|
5559
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
|
|
5560
|
-
#endif
|
|
5678
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
|
5561
5679
|
}
|
|
5562
5680
|
}
|
|
5563
5681
|
|
|
@@ -5574,7 +5692,6 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
|
|
|
5574
5692
|
GGML_ASSERT(src2->extra);
|
|
5575
5693
|
|
|
5576
5694
|
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
|
5577
|
-
cl_command_queue queue = backend_ctx->queue;
|
|
5578
5695
|
|
|
5579
5696
|
ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
|
|
5580
5697
|
ggml_tensor_extra_cl * extra2 = (ggml_tensor_extra_cl *)src2->extra;
|
|
@@ -5680,15 +5797,7 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
|
|
|
5680
5797
|
size_t global_work_size[] = {(size_t)(ne01+ndst*nsg-1)/(ndst*nsg)*sgs, (size_t)(_ne1+nrows-1)/nrows*nsg, (size_t)ne123};
|
|
5681
5798
|
size_t local_work_size[] = {(size_t)sgs, (size_t)nsg, 1};
|
|
5682
5799
|
|
|
5683
|
-
|
|
5684
|
-
cl_event evt;
|
|
5685
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
|
|
5686
|
-
|
|
5687
|
-
g_profiling_info.emplace_back();
|
|
5688
|
-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
|
|
5689
|
-
#else
|
|
5690
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
|
|
5691
|
-
#endif
|
|
5800
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
|
5692
5801
|
}
|
|
5693
5802
|
|
|
5694
5803
|
static void ggml_cl_scale(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
|
@@ -5701,10 +5810,11 @@ static void ggml_cl_scale(ggml_backend_t backend, const ggml_tensor * src0, cons
|
|
|
5701
5810
|
GGML_ASSERT(ggml_is_contiguous(src0));
|
|
5702
5811
|
|
|
5703
5812
|
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
|
5704
|
-
cl_command_queue queue = backend_ctx->queue;
|
|
5705
5813
|
|
|
5706
5814
|
float scale;
|
|
5707
|
-
|
|
5815
|
+
float bias;
|
|
5816
|
+
memcpy(&scale, ((int32_t *) dst->op_params) + 0, sizeof(float));
|
|
5817
|
+
memcpy(&bias, ((int32_t *) dst->op_params) + 1, sizeof(float));
|
|
5708
5818
|
|
|
5709
5819
|
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
|
|
5710
5820
|
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
|
|
@@ -5719,6 +5829,7 @@ static void ggml_cl_scale(ggml_backend_t backend, const ggml_tensor * src0, cons
|
|
|
5719
5829
|
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
|
|
5720
5830
|
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
|
|
5721
5831
|
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(float), &scale));
|
|
5832
|
+
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(float), &bias));
|
|
5722
5833
|
|
|
5723
5834
|
int n = ggml_nelements(dst)/4;
|
|
5724
5835
|
|
|
@@ -5730,15 +5841,7 @@ static void ggml_cl_scale(ggml_backend_t backend, const ggml_tensor * src0, cons
|
|
|
5730
5841
|
local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
|
|
5731
5842
|
}
|
|
5732
5843
|
|
|
5733
|
-
|
|
5734
|
-
cl_event evt;
|
|
5735
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
|
|
5736
|
-
|
|
5737
|
-
g_profiling_info.emplace_back();
|
|
5738
|
-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
|
|
5739
|
-
#else
|
|
5740
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
|
|
5741
|
-
#endif
|
|
5844
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
|
|
5742
5845
|
}
|
|
5743
5846
|
|
|
5744
5847
|
static void ggml_cl_cpy(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
|
@@ -5775,7 +5878,6 @@ static void ggml_cl_cpy(ggml_backend_t backend, const ggml_tensor * src0, const
|
|
|
5775
5878
|
const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT;
|
|
5776
5879
|
|
|
5777
5880
|
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
|
5778
|
-
cl_command_queue queue = backend_ctx->queue;
|
|
5779
5881
|
|
|
5780
5882
|
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
|
|
5781
5883
|
ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
|
|
@@ -5840,15 +5942,7 @@ static void ggml_cl_cpy(ggml_backend_t backend, const ggml_tensor * src0, const
|
|
|
5840
5942
|
size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
|
|
5841
5943
|
size_t local_work_size[] = {(size_t)nth, 1, 1};
|
|
5842
5944
|
|
|
5843
|
-
|
|
5844
|
-
cl_event evt;
|
|
5845
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
|
|
5846
|
-
|
|
5847
|
-
g_profiling_info.emplace_back();
|
|
5848
|
-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, src1);
|
|
5849
|
-
#else
|
|
5850
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
|
|
5851
|
-
#endif
|
|
5945
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, src1);
|
|
5852
5946
|
}
|
|
5853
5947
|
|
|
5854
5948
|
static void ggml_cl_dup(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
|
@@ -5871,7 +5965,6 @@ static void ggml_cl_diag_mask_inf(ggml_backend_t backend, const ggml_tensor * sr
|
|
|
5871
5965
|
const int ne02 = src0 ? src0->ne[2] : 0;
|
|
5872
5966
|
|
|
5873
5967
|
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
|
5874
|
-
cl_command_queue queue = backend_ctx->queue;
|
|
5875
5968
|
|
|
5876
5969
|
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
|
|
5877
5970
|
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
|
|
@@ -5895,15 +5988,7 @@ static void ggml_cl_diag_mask_inf(ggml_backend_t backend, const ggml_tensor * sr
|
|
|
5895
5988
|
size_t global_work_size[] = {(size_t)ne00*ne01*ne02/8, 1, 1};
|
|
5896
5989
|
size_t local_work_size[] = {64, 1, 1};
|
|
5897
5990
|
|
|
5898
|
-
|
|
5899
|
-
cl_event evt;
|
|
5900
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
|
|
5901
|
-
|
|
5902
|
-
g_profiling_info.emplace_back();
|
|
5903
|
-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
|
|
5904
|
-
#else
|
|
5905
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
|
|
5906
|
-
#endif
|
|
5991
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
|
5907
5992
|
} else {
|
|
5908
5993
|
kernel = backend_ctx->kernel_diag_mask_inf;
|
|
5909
5994
|
|
|
@@ -5923,15 +6008,7 @@ static void ggml_cl_diag_mask_inf(ggml_backend_t backend, const ggml_tensor * sr
|
|
|
5923
6008
|
local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
|
|
5924
6009
|
}
|
|
5925
6010
|
|
|
5926
|
-
|
|
5927
|
-
cl_event evt;
|
|
5928
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
|
|
5929
|
-
|
|
5930
|
-
g_profiling_info.emplace_back();
|
|
5931
|
-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
|
|
5932
|
-
#else
|
|
5933
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
|
|
5934
|
-
#endif
|
|
6011
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
|
|
5935
6012
|
}
|
|
5936
6013
|
}
|
|
5937
6014
|
|
|
@@ -5951,7 +6028,6 @@ static void ggml_cl_soft_max(ggml_backend_t backend, const ggml_tensor * src0, c
|
|
|
5951
6028
|
}
|
|
5952
6029
|
|
|
5953
6030
|
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
|
5954
|
-
cl_command_queue queue = backend_ctx->queue;
|
|
5955
6031
|
|
|
5956
6032
|
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
|
|
5957
6033
|
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
|
|
@@ -5963,19 +6039,31 @@ static void ggml_cl_soft_max(ggml_backend_t backend, const ggml_tensor * src0, c
|
|
|
5963
6039
|
|
|
5964
6040
|
cl_ulong offset1 = extra1 ? extra1->offset + src1->view_offs : offset0;
|
|
5965
6041
|
|
|
5966
|
-
const int
|
|
5967
|
-
const int
|
|
5968
|
-
const int
|
|
5969
|
-
const int
|
|
6042
|
+
const int ne00 = src0->ne[0];
|
|
6043
|
+
const int ne01 = src0->ne[1];
|
|
6044
|
+
const int ne02 = src0->ne[2];
|
|
6045
|
+
const int ne03 = src0->ne[3];
|
|
6046
|
+
|
|
6047
|
+
const cl_long nb01 = src0->nb[1];
|
|
6048
|
+
const cl_long nb02 = src0->nb[2];
|
|
6049
|
+
const cl_long nb03 = src0->nb[3];
|
|
6050
|
+
|
|
6051
|
+
const int ne12 = src1 ? src1->ne[2] : 0;
|
|
6052
|
+
const int ne13 = src1 ? src1->ne[3] : 0;
|
|
6053
|
+
|
|
6054
|
+
const cl_long nb11 = src1 ? src1->nb[1] : 0;
|
|
6055
|
+
const cl_long nb12 = src1 ? src1->nb[2] : 0;
|
|
6056
|
+
const cl_long nb13 = src1 ? src1->nb[3] : 0;
|
|
6057
|
+
|
|
6058
|
+
const cl_long nb1 = dst->nb[1];
|
|
6059
|
+
const cl_long nb2 = dst->nb[2];
|
|
6060
|
+
const cl_long nb3 = dst->nb[3];
|
|
5970
6061
|
|
|
5971
6062
|
float scale, max_bias;
|
|
5972
6063
|
memcpy(&scale, dst->op_params + 0, sizeof(float));
|
|
5973
6064
|
memcpy(&max_bias, dst->op_params + 1, sizeof(float));
|
|
5974
6065
|
|
|
5975
|
-
const int
|
|
5976
|
-
const int nrows_y = src0->ne[1];
|
|
5977
|
-
|
|
5978
|
-
const int n_head = nrows_x/nrows_y;
|
|
6066
|
+
const int n_head = src0->ne[2];
|
|
5979
6067
|
const int n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head));
|
|
5980
6068
|
|
|
5981
6069
|
const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
|
|
@@ -6020,26 +6108,27 @@ static void ggml_cl_soft_max(ggml_backend_t backend, const ggml_tensor * src0, c
|
|
|
6020
6108
|
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
|
|
6021
6109
|
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
|
|
6022
6110
|
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00));
|
|
6023
|
-
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(
|
|
6024
|
-
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(
|
|
6025
|
-
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(
|
|
6026
|
-
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(
|
|
6027
|
-
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(
|
|
6028
|
-
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(
|
|
6029
|
-
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(
|
|
6111
|
+
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &nb01));
|
|
6112
|
+
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb02));
|
|
6113
|
+
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb03));
|
|
6114
|
+
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne12));
|
|
6115
|
+
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne13));
|
|
6116
|
+
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb11));
|
|
6117
|
+
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb12));
|
|
6118
|
+
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb13));
|
|
6119
|
+
CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &nb1));
|
|
6120
|
+
CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb2));
|
|
6121
|
+
CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb3));
|
|
6122
|
+
CL_CHECK(clSetKernelArg(kernel, 18, sizeof(float), &scale));
|
|
6123
|
+
CL_CHECK(clSetKernelArg(kernel, 19, sizeof(float), &max_bias));
|
|
6124
|
+
CL_CHECK(clSetKernelArg(kernel, 20, sizeof(float), &m0));
|
|
6125
|
+
CL_CHECK(clSetKernelArg(kernel, 21, sizeof(float), &m1));
|
|
6126
|
+
CL_CHECK(clSetKernelArg(kernel, 22, sizeof(int), &n_head_log2));
|
|
6030
6127
|
|
|
6031
6128
|
size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
|
|
6032
6129
|
size_t local_work_size[] = {(size_t)nth, 1, 1};
|
|
6033
6130
|
|
|
6034
|
-
|
|
6035
|
-
cl_event evt;
|
|
6036
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
|
|
6037
|
-
|
|
6038
|
-
g_profiling_info.emplace_back();
|
|
6039
|
-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
|
|
6040
|
-
#else
|
|
6041
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
|
|
6042
|
-
#endif
|
|
6131
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
|
6043
6132
|
}
|
|
6044
6133
|
|
|
6045
6134
|
static void ggml_cl_rope(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
|
@@ -6051,7 +6140,6 @@ static void ggml_cl_rope(ggml_backend_t backend, const ggml_tensor * src0, const
|
|
|
6051
6140
|
GGML_ASSERT(dst->extra);
|
|
6052
6141
|
|
|
6053
6142
|
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
|
6054
|
-
cl_command_queue queue = backend_ctx->queue;
|
|
6055
6143
|
|
|
6056
6144
|
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
|
|
6057
6145
|
ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
|
|
@@ -6217,15 +6305,7 @@ static void ggml_cl_rope(ggml_backend_t backend, const ggml_tensor * src0, const
|
|
|
6217
6305
|
size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
|
|
6218
6306
|
size_t local_work_size[] = {(size_t)nth, 1, 1};
|
|
6219
6307
|
|
|
6220
|
-
|
|
6221
|
-
cl_event evt;
|
|
6222
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
|
|
6223
|
-
|
|
6224
|
-
g_profiling_info.emplace_back();
|
|
6225
|
-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
|
|
6226
|
-
#else
|
|
6227
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
|
|
6228
|
-
#endif
|
|
6308
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
|
6229
6309
|
}
|
|
6230
6310
|
|
|
6231
6311
|
static void ggml_cl_im2col(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
|
@@ -6240,7 +6320,6 @@ static void ggml_cl_im2col(ggml_backend_t backend, const ggml_tensor * src0, con
|
|
|
6240
6320
|
GGML_ASSERT(dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32);
|
|
6241
6321
|
|
|
6242
6322
|
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
|
6243
|
-
cl_command_queue queue = backend_ctx->queue;
|
|
6244
6323
|
|
|
6245
6324
|
ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
|
|
6246
6325
|
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
|
|
@@ -6309,15 +6388,7 @@ static void ggml_cl_im2col(ggml_backend_t backend, const ggml_tensor * src0, con
|
|
|
6309
6388
|
size_t global_work_size[] = {(size_t)num_blocks*256, (size_t)OH, (size_t)batch*IC};
|
|
6310
6389
|
size_t local_work_size[] = {256, 1, 1};
|
|
6311
6390
|
|
|
6312
|
-
|
|
6313
|
-
cl_event evt;
|
|
6314
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
|
|
6315
|
-
|
|
6316
|
-
g_profiling_info.emplace_back();
|
|
6317
|
-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
|
|
6318
|
-
#else
|
|
6319
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
|
|
6320
|
-
#endif
|
|
6391
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
|
6321
6392
|
}
|
|
6322
6393
|
|
|
6323
6394
|
static void ggml_cl_argsort(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
|
@@ -6332,7 +6403,6 @@ static void ggml_cl_argsort(ggml_backend_t backend, const ggml_tensor * src0, co
|
|
|
6332
6403
|
GGML_ASSERT(ggml_is_contiguous(src0));
|
|
6333
6404
|
|
|
6334
6405
|
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
|
6335
|
-
cl_command_queue queue = backend_ctx->queue;
|
|
6336
6406
|
|
|
6337
6407
|
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
|
|
6338
6408
|
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
|
|
@@ -6364,15 +6434,7 @@ static void ggml_cl_argsort(ggml_backend_t backend, const ggml_tensor * src0, co
|
|
|
6364
6434
|
size_t global_work_size[] = {(size_t)ne00_padded, (size_t)nrows, (size_t)1};
|
|
6365
6435
|
size_t local_work_size[] = {(size_t)ne00_padded, 1, 1};
|
|
6366
6436
|
|
|
6367
|
-
|
|
6368
|
-
cl_event evt;
|
|
6369
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
|
|
6370
|
-
|
|
6371
|
-
g_profiling_info.emplace_back();
|
|
6372
|
-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
|
|
6373
|
-
#else
|
|
6374
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
|
|
6375
|
-
#endif
|
|
6437
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
|
6376
6438
|
}
|
|
6377
6439
|
|
|
6378
6440
|
static void ggml_cl_sum_rows(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
|
@@ -6386,7 +6448,6 @@ static void ggml_cl_sum_rows(ggml_backend_t backend, const ggml_tensor * src0, c
|
|
|
6386
6448
|
GGML_ASSERT(ggml_is_contiguous(src0));
|
|
6387
6449
|
|
|
6388
6450
|
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
|
6389
|
-
cl_command_queue queue = backend_ctx->queue;
|
|
6390
6451
|
|
|
6391
6452
|
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
|
|
6392
6453
|
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
|
|
@@ -6427,15 +6488,106 @@ static void ggml_cl_sum_rows(ggml_backend_t backend, const ggml_tensor * src0, c
|
|
|
6427
6488
|
size_t global_work_size[] = {(size_t)ne01, (size_t)ne02, (size_t)ne03};
|
|
6428
6489
|
size_t local_work_size[] = {(size_t)64, 1, 1};
|
|
6429
6490
|
|
|
6430
|
-
|
|
6431
|
-
|
|
6432
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
|
|
6491
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
|
6492
|
+
}
|
|
6433
6493
|
|
|
6434
|
-
|
|
6435
|
-
|
|
6436
|
-
|
|
6437
|
-
|
|
6438
|
-
|
|
6494
|
+
static void ggml_cl_glu(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
|
6495
|
+
GGML_ASSERT(src0);
|
|
6496
|
+
GGML_ASSERT(src0->extra);
|
|
6497
|
+
GGML_ASSERT(dst);
|
|
6498
|
+
GGML_ASSERT(dst->extra);
|
|
6499
|
+
|
|
6500
|
+
GGML_ASSERT(ggml_is_contiguous_1(src0));
|
|
6501
|
+
|
|
6502
|
+
if (src1) {
|
|
6503
|
+
GGML_ASSERT(src1);
|
|
6504
|
+
GGML_ASSERT(src1->extra);
|
|
6505
|
+
GGML_ASSERT(ggml_are_same_shape(src0, src1));
|
|
6506
|
+
}
|
|
6507
|
+
|
|
6508
|
+
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
|
6509
|
+
|
|
6510
|
+
cl_kernel kernel;
|
|
6511
|
+
switch (ggml_get_glu_op(dst)) {
|
|
6512
|
+
case GGML_GLU_OP_GEGLU:
|
|
6513
|
+
if (dst->type == GGML_TYPE_F32) {
|
|
6514
|
+
kernel = backend_ctx->kernel_geglu;
|
|
6515
|
+
} else {
|
|
6516
|
+
kernel = backend_ctx->kernel_geglu_f16;
|
|
6517
|
+
}
|
|
6518
|
+
break;
|
|
6519
|
+
case GGML_GLU_OP_REGLU:
|
|
6520
|
+
if (dst->type == GGML_TYPE_F32) {
|
|
6521
|
+
kernel = backend_ctx->kernel_reglu;
|
|
6522
|
+
} else {
|
|
6523
|
+
kernel = backend_ctx->kernel_reglu_f16;
|
|
6524
|
+
}
|
|
6525
|
+
break;
|
|
6526
|
+
case GGML_GLU_OP_SWIGLU:
|
|
6527
|
+
if (dst->type == GGML_TYPE_F32) {
|
|
6528
|
+
kernel = backend_ctx->kernel_swiglu;
|
|
6529
|
+
} else {
|
|
6530
|
+
kernel = backend_ctx->kernel_swiglu_f16;
|
|
6531
|
+
}
|
|
6532
|
+
break;
|
|
6533
|
+
case GGML_GLU_OP_GEGLU_ERF:
|
|
6534
|
+
if (dst->type == GGML_TYPE_F32) {
|
|
6535
|
+
kernel = backend_ctx->kernel_geglu_erf;
|
|
6536
|
+
} else {
|
|
6537
|
+
kernel = backend_ctx->kernel_geglu_erf_f16;
|
|
6538
|
+
}
|
|
6539
|
+
break;
|
|
6540
|
+
case GGML_GLU_OP_GEGLU_QUICK:
|
|
6541
|
+
if (dst->type == GGML_TYPE_F32) {
|
|
6542
|
+
kernel = backend_ctx->kernel_geglu_quick;
|
|
6543
|
+
} else {
|
|
6544
|
+
kernel = backend_ctx->kernel_geglu_quick_f16;
|
|
6545
|
+
}
|
|
6546
|
+
break;
|
|
6547
|
+
default:
|
|
6548
|
+
GGML_ABORT("Unsupported glu op");
|
|
6549
|
+
}
|
|
6550
|
+
|
|
6551
|
+
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
|
|
6552
|
+
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
|
|
6553
|
+
|
|
6554
|
+
ggml_tensor_extra_cl * extra1 = src1 ? (ggml_tensor_extra_cl *)src1->extra : nullptr;
|
|
6555
|
+
|
|
6556
|
+
cl_ulong offset0 = extra0->offset + src0->view_offs;
|
|
6557
|
+
cl_ulong offsetd = extrad->offset + dst->view_offs;
|
|
6558
|
+
|
|
6559
|
+
cl_ulong offset1 = extra1 ? extra1->offset + src1->view_offs : offset0;
|
|
6560
|
+
|
|
6561
|
+
const int ne0 = dst->ne[0];
|
|
6562
|
+
|
|
6563
|
+
const cl_ulong nb01 = src0->nb[1];
|
|
6564
|
+
const cl_ulong nb11 = src1 ? src1->nb[1] : nb01;
|
|
6565
|
+
|
|
6566
|
+
const cl_ulong nb1 = dst->nb[1];
|
|
6567
|
+
|
|
6568
|
+
const int swp = ((const int32_t *) dst->op_params)[1];
|
|
6569
|
+
const int ne00_off = src1 ? 0 : (swp ? ne0 : 0);
|
|
6570
|
+
const int ne10_off = src1 ? 0 : (swp ? 0 : ne0);
|
|
6571
|
+
|
|
6572
|
+
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
|
|
6573
|
+
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
|
|
6574
|
+
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), src1 ? &extra1->data_device : &extra0->data_device));
|
|
6575
|
+
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
|
|
6576
|
+
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
|
|
6577
|
+
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
|
|
6578
|
+
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_ulong), &nb01));
|
|
6579
|
+
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &nb11));
|
|
6580
|
+
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne0));
|
|
6581
|
+
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb1));
|
|
6582
|
+
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne00_off));
|
|
6583
|
+
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne10_off));
|
|
6584
|
+
|
|
6585
|
+
const size_t nrows = ggml_nrows(src0);
|
|
6586
|
+
size_t nth = 512;
|
|
6587
|
+
size_t global_work_size[] = {nrows*nth, 1, 1};
|
|
6588
|
+
size_t local_work_size[] = {nth, 1, 1};
|
|
6589
|
+
|
|
6590
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
|
6439
6591
|
}
|
|
6440
6592
|
|
|
6441
6593
|
//------------------------------------------------------------------------------
|
|
@@ -6461,6 +6613,12 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
|
|
|
6461
6613
|
}
|
|
6462
6614
|
func = ggml_cl_get_rows;
|
|
6463
6615
|
break;
|
|
6616
|
+
case GGML_OP_SET_ROWS:
|
|
6617
|
+
if (!any_on_device) {
|
|
6618
|
+
return false;
|
|
6619
|
+
}
|
|
6620
|
+
func = ggml_cl_set_rows;
|
|
6621
|
+
break;
|
|
6464
6622
|
case GGML_OP_CPY:
|
|
6465
6623
|
if (!any_on_device) {
|
|
6466
6624
|
return false;
|
|
@@ -6506,6 +6664,12 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
|
|
|
6506
6664
|
}
|
|
6507
6665
|
func = ggml_cl_gelu;
|
|
6508
6666
|
break;
|
|
6667
|
+
case GGML_UNARY_OP_GELU_ERF:
|
|
6668
|
+
if (!any_on_device) {
|
|
6669
|
+
return false;
|
|
6670
|
+
}
|
|
6671
|
+
func = ggml_cl_gelu_erf;
|
|
6672
|
+
break;
|
|
6509
6673
|
case GGML_UNARY_OP_GELU_QUICK:
|
|
6510
6674
|
if (!any_on_device) {
|
|
6511
6675
|
return false;
|
|
@@ -6539,6 +6703,12 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
|
|
|
6539
6703
|
default:
|
|
6540
6704
|
return false;
|
|
6541
6705
|
} break;
|
|
6706
|
+
case GGML_OP_GLU:
|
|
6707
|
+
if (!any_on_device) {
|
|
6708
|
+
return false;
|
|
6709
|
+
}
|
|
6710
|
+
func = ggml_cl_glu;
|
|
6711
|
+
break;
|
|
6542
6712
|
case GGML_OP_CLAMP:
|
|
6543
6713
|
if (!any_on_device) {
|
|
6544
6714
|
return false;
|