@novastera-oss/llamarn 0.2.9 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libggml-base.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libggml.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libllama.so +0 -0
- package/android/src/main/jniLibs/x86/libggml-base.so +0 -0
- package/android/src/main/jniLibs/x86/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/x86/libggml.so +0 -0
- package/android/src/main/jniLibs/x86/libllama.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
- package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
- package/cpp/build-info.cpp +2 -2
- package/cpp/llama.cpp/CMakeLists.txt +0 -1
- package/cpp/llama.cpp/README.md +4 -5
- package/cpp/llama.cpp/build-xcframework.sh +1 -1
- package/cpp/llama.cpp/common/CMakeLists.txt +4 -5
- package/cpp/llama.cpp/common/arg.cpp +17 -0
- package/cpp/llama.cpp/common/chat.cpp +37 -20
- package/cpp/llama.cpp/common/chat.h +2 -0
- package/cpp/llama.cpp/common/common.h +4 -0
- package/cpp/llama.cpp/convert_hf_to_gguf.py +745 -6
- package/cpp/llama.cpp/convert_hf_to_gguf_update.py +9 -0
- package/cpp/llama.cpp/ggml/CMakeLists.txt +7 -2
- package/cpp/llama.cpp/ggml/include/ggml-backend.h +1 -1
- package/cpp/llama.cpp/ggml/include/ggml.h +173 -10
- package/cpp/llama.cpp/ggml/src/CMakeLists.txt +0 -1
- package/cpp/llama.cpp/ggml/src/ggml-backend-reg.cpp +0 -8
- package/cpp/llama.cpp/ggml/src/ggml-backend.cpp +36 -18
- package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +68 -5
- package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +16 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +6 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +28 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +1203 -163
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.h +6 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +33 -9
- package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.h +142 -9
- package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +17 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/convert.cu +22 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/convert.cuh +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/cross-entropy-loss.cu +2 -14
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-common.cuh +4 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +8 -4
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f16.cu +6 -4
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f32.cu +14 -12
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f16.cuh +5 -3
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f32.cuh +15 -10
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-wmma-f16.cu +8 -6
- package/cpp/llama.cpp/ggml/src/ggml-cuda/getrows.cu +8 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +185 -79
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmq.cuh +2 -8
- package/cpp/llama.cpp/ggml/src/ggml-cuda/rope.cu +21 -27
- package/cpp/llama.cpp/ggml/src/ggml-cuda/scale.cu +8 -6
- package/cpp/llama.cpp/ggml/src/ggml-cuda/softmax.cu +119 -58
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ssm-conv.cu +10 -2
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ssm-scan.cu +192 -52
- package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cu +97 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cuh +11 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/upscale.cu +92 -6
- package/cpp/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +14 -5
- package/cpp/llama.cpp/ggml/src/ggml-impl.h +64 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +4 -2
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +35 -9
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +167 -39
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +254 -57
- package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +3 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +505 -40
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/gelu.cl +27 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/glu.cl +337 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mat_f16_f32.cl +130 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/scale.cl +3 -2
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/set_rows.cl +95 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_4_f16.cl +24 -11
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_4_f32.cl +24 -11
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_f16.cl +24 -11
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_f32.cl +24 -11
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/upscale.cl +2 -3
- package/cpp/llama.cpp/ggml/src/ggml-quants.c +6 -6
- package/cpp/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +693 -1034
- package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +18 -9
- package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +60 -9
- package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +15 -18
- package/cpp/llama.cpp/ggml/src/ggml-sycl/set_rows.cpp +131 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/set_rows.hpp +8 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +711 -292
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +58 -7
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +28 -23
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp +14 -9
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +38 -32
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +32 -27
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +44 -12
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu.comp +13 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu_erf.comp +27 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu_quick.comp +11 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/gelu_erf.comp +39 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.comp +15 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/glu_main.comp +29 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +128 -72
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +38 -9
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/reglu.comp +9 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +12 -3
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/roll.comp +46 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +7 -9
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +7 -9
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +7 -9
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +20 -4
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/swiglu.comp +9 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +69 -5
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +23 -3
- package/cpp/llama.cpp/ggml/src/ggml.c +382 -61
- package/cpp/llama.cpp/ggml/src/gguf.cpp +8 -1
- package/cpp/llama.cpp/gguf-py/gguf/constants.py +209 -0
- package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +8 -2
- package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +73 -21
- package/cpp/llama.cpp/gguf-py/gguf/vocab.py +12 -3
- package/cpp/llama.cpp/include/llama.h +0 -40
- package/cpp/llama.cpp/src/llama-arch.cpp +210 -3
- package/cpp/llama.cpp/src/llama-arch.h +18 -1
- package/cpp/llama.cpp/src/llama-batch.cpp +27 -1
- package/cpp/llama.cpp/src/llama-batch.h +8 -1
- package/cpp/llama.cpp/src/llama-chat.cpp +15 -0
- package/cpp/llama.cpp/src/llama-chat.h +1 -0
- package/cpp/llama.cpp/src/llama-graph.cpp +119 -184
- package/cpp/llama.cpp/src/llama-graph.h +47 -60
- package/cpp/llama.cpp/src/llama-hparams.cpp +7 -1
- package/cpp/llama.cpp/src/llama-hparams.h +3 -0
- package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +28 -18
- package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.h +4 -2
- package/cpp/llama.cpp/src/llama-kv-cache-unified.cpp +214 -65
- package/cpp/llama.cpp/src/llama-kv-cache-unified.h +62 -24
- package/cpp/llama.cpp/src/llama-kv-cells.h +62 -10
- package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +9 -4
- package/cpp/llama.cpp/src/llama-memory-hybrid.h +3 -1
- package/cpp/llama.cpp/src/llama-memory-recurrent.cpp +20 -10
- package/cpp/llama.cpp/src/llama-memory.cpp +17 -0
- package/cpp/llama.cpp/src/llama-memory.h +3 -0
- package/cpp/llama.cpp/src/llama-model.cpp +2530 -685
- package/cpp/llama.cpp/src/llama-model.h +18 -0
- package/cpp/llama.cpp/src/llama-quant.cpp +1 -0
- package/cpp/llama.cpp/src/llama-vocab.cpp +13 -2
- package/cpp/llama.cpp/src/llama-vocab.h +41 -0
- package/ios/include/chat.h +2 -0
- package/ios/include/common.h +4 -0
- package/ios/include/llama.h +0 -40
- package/ios/libs/llama.xcframework/Info.plist +19 -19
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5055 -4886
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-backend.h +1 -1
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +173 -10
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +0 -40
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5030 -4861
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3889 -3764
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +1 -1
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +173 -10
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +0 -40
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5030 -4861
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3891 -3766
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-backend.h +1 -1
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +173 -10
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +0 -40
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-backend.h +1 -1
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +173 -10
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +0 -40
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-backend.h +1 -1
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +173 -10
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +0 -40
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5059 -4890
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-backend.h +1 -1
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +173 -10
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +0 -40
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5030 -4861
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3889 -3764
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +1 -1
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +173 -10
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +0 -40
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5091 -4922
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-backend.h +1 -1
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +173 -10
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +0 -40
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5066 -4897
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3919 -3794
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +1 -1
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +173 -10
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +0 -40
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/package.json +1 -1
- package/cpp/llama.cpp/ggml/include/ggml-kompute.h +0 -50
- package/cpp/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +0 -166
- package/cpp/llama.cpp/ggml/src/ggml-kompute/ggml-kompute.cpp +0 -2251
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/common.comp +0 -112
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_add.comp +0 -58
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_addrow.comp +0 -25
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f16.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f32.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f16.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f32.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_diagmask.comp +0 -30
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_gelu.comp +0 -22
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows.comp +0 -17
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f16.comp +0 -31
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f32.comp +0 -31
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_0.comp +0 -38
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_1.comp +0 -39
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q6_k.comp +0 -44
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_f16.comp +0 -69
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_mat_f32.comp +0 -51
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_0.comp +0 -33
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_1.comp +0 -35
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_k.comp +0 -140
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q6_k.comp +0 -106
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q8_0.comp +0 -73
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n_pre.comp +0 -28
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_norm.comp +0 -84
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_relu.comp +0 -21
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rmsnorm.comp +0 -53
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f16.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f32.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f16.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f32.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_scale.comp +0 -19
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_scale_8.comp +0 -23
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_silu.comp +0 -22
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_softmax.comp +0 -72
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/rope_common.comp +0 -71
|
@@ -351,6 +351,8 @@ struct ggml_backend_opencl_context {
|
|
|
351
351
|
cl_program program_gemv_noshuffle_general;
|
|
352
352
|
cl_program program_gemv_noshuffle;
|
|
353
353
|
cl_program program_get_rows;
|
|
354
|
+
cl_program program_set_rows;
|
|
355
|
+
cl_program program_glu;
|
|
354
356
|
cl_program program_im2col_f16;
|
|
355
357
|
cl_program program_im2col_f32;
|
|
356
358
|
cl_program program_mul_mat_Ab_Bi_8x4;
|
|
@@ -366,6 +368,7 @@ struct ggml_backend_opencl_context {
|
|
|
366
368
|
cl_program program_mul_mv_f16_f32;
|
|
367
369
|
cl_program program_mul_mv_f32_f32;
|
|
368
370
|
cl_program program_mul;
|
|
371
|
+
cl_program program_mul_mat_f16_f32_tiled;
|
|
369
372
|
cl_program program_div;
|
|
370
373
|
cl_program program_sub;
|
|
371
374
|
cl_program program_norm;
|
|
@@ -397,10 +400,13 @@ struct ggml_backend_opencl_context {
|
|
|
397
400
|
cl_kernel kernel_scale;
|
|
398
401
|
cl_kernel kernel_silu, kernel_silu_4;
|
|
399
402
|
cl_kernel kernel_gelu, kernel_gelu_4;
|
|
403
|
+
cl_kernel kernel_gelu_erf, kernel_gelu_erf_4;
|
|
400
404
|
cl_kernel kernel_gelu_quick, kernel_gelu_quick_4;
|
|
401
405
|
cl_kernel kernel_relu;
|
|
402
406
|
cl_kernel kernel_sigmoid_f32, kernel_sigmoid_f16;
|
|
403
407
|
cl_kernel kernel_clamp;
|
|
408
|
+
cl_kernel kernel_geglu, kernel_reglu, kernel_swiglu, kernel_geglu_erf, kernel_geglu_quick,
|
|
409
|
+
kernel_geglu_f16, kernel_reglu_f16, kernel_swiglu_f16, kernel_geglu_erf_f16, kernel_geglu_quick_f16;
|
|
404
410
|
cl_kernel kernel_norm;
|
|
405
411
|
cl_kernel kernel_rms_norm;
|
|
406
412
|
cl_kernel kernel_group_norm;
|
|
@@ -408,6 +414,7 @@ struct ggml_backend_opencl_context {
|
|
|
408
414
|
cl_kernel kernel_soft_max, kernel_soft_max_4;
|
|
409
415
|
cl_kernel kernel_soft_max_f16, kernel_soft_max_4_f16;
|
|
410
416
|
cl_kernel kernel_get_rows_f32, kernel_get_rows_f16, kernel_get_rows_q4_0;
|
|
417
|
+
cl_kernel kernel_set_rows_f32, kernel_set_rows_f16;
|
|
411
418
|
cl_kernel kernel_rope_norm_f32, kernel_rope_norm_f16, kernel_rope_neox_f32, kernel_rope_neox_f16;
|
|
412
419
|
cl_kernel kernel_rope_multi_f32, kernel_rope_multi_f16, kernel_rope_vision_f32, kernel_rope_vision_f16;
|
|
413
420
|
cl_kernel kernel_cpy_f16_f16, kernel_cpy_f16_f32, kernel_cpy_f32_f16, kernel_cpy_f32_f32;
|
|
@@ -416,6 +423,7 @@ struct ggml_backend_opencl_context {
|
|
|
416
423
|
cl_kernel kernel_mul_mat_f16_f32_1row;
|
|
417
424
|
cl_kernel kernel_mul_mat_f16_f32;
|
|
418
425
|
cl_kernel kernel_mul_mat_f16_f32_l4;
|
|
426
|
+
cl_kernel kernel_mul_mat_f16_f32_tiled;
|
|
419
427
|
cl_kernel kernel_mul_mat_q4_0_f32, kernel_mul_mat_q4_0_f32_v;
|
|
420
428
|
cl_kernel kernel_convert_block_q4_0, kernel_restore_block_q4_0;
|
|
421
429
|
cl_kernel kernel_mul_mat_q4_0_f32_8x_flat;
|
|
@@ -525,6 +533,16 @@ struct ggml_backend_opencl_context {
|
|
|
525
533
|
fclose(ftrace);
|
|
526
534
|
}
|
|
527
535
|
|
|
536
|
+
size_t get_kernel_workgroup_size(cl_kernel kernel) const {
|
|
537
|
+
size_t workgroup_size = 0;
|
|
538
|
+
size_t ret_size = 0;
|
|
539
|
+
CL_CHECK(
|
|
540
|
+
clGetKernelWorkGroupInfo(kernel, device, CL_KERNEL_WORK_GROUP_SIZE,
|
|
541
|
+
sizeof(size_t), &workgroup_size, &ret_size));
|
|
542
|
+
GGML_ASSERT(sizeof(size_t) == ret_size);
|
|
543
|
+
return workgroup_size;
|
|
544
|
+
}
|
|
545
|
+
|
|
528
546
|
void enqueue_ndrange_kernel(cl_kernel kernel, cl_uint work_dim, size_t *global_work_size, size_t *local_work_size, const ggml_tensor * tensor) {
|
|
529
547
|
#ifdef GGML_OPENCL_PROFILING
|
|
530
548
|
cl_event evt;
|
|
@@ -733,11 +751,38 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
|
|
|
733
751
|
|
|
734
752
|
CL_CHECK((backend_ctx->kernel_gelu = clCreateKernel(backend_ctx->program_gelu, "kernel_gelu", &err), err));
|
|
735
753
|
CL_CHECK((backend_ctx->kernel_gelu_4 = clCreateKernel(backend_ctx->program_gelu, "kernel_gelu_4", &err), err));
|
|
754
|
+
CL_CHECK((backend_ctx->kernel_gelu_erf = clCreateKernel(backend_ctx->program_gelu, "kernel_gelu_erf", &err), err));
|
|
755
|
+
CL_CHECK((backend_ctx->kernel_gelu_erf_4 = clCreateKernel(backend_ctx->program_gelu, "kernel_gelu_erf_4", &err), err));
|
|
736
756
|
CL_CHECK((backend_ctx->kernel_gelu_quick = clCreateKernel(backend_ctx->program_gelu, "kernel_gelu_quick", &err), err));
|
|
737
757
|
CL_CHECK((backend_ctx->kernel_gelu_quick_4 = clCreateKernel(backend_ctx->program_gelu, "kernel_gelu_quick_4", &err), err));
|
|
738
758
|
GGML_LOG_CONT(".");
|
|
739
759
|
}
|
|
740
760
|
|
|
761
|
+
// glu
|
|
762
|
+
{
|
|
763
|
+
#ifdef GGML_OPENCL_EMBED_KERNELS
|
|
764
|
+
const std::string kernel_src {
|
|
765
|
+
#include "glu.cl.h"
|
|
766
|
+
};
|
|
767
|
+
#else
|
|
768
|
+
const std::string kernel_src = read_file("glu.cl");
|
|
769
|
+
#endif
|
|
770
|
+
backend_ctx->program_glu =
|
|
771
|
+
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
|
772
|
+
|
|
773
|
+
CL_CHECK((backend_ctx->kernel_geglu = clCreateKernel(backend_ctx->program_glu, "kernel_geglu", &err), err));
|
|
774
|
+
CL_CHECK((backend_ctx->kernel_reglu = clCreateKernel(backend_ctx->program_glu, "kernel_reglu", &err), err));
|
|
775
|
+
CL_CHECK((backend_ctx->kernel_swiglu = clCreateKernel(backend_ctx->program_glu, "kernel_swiglu", &err), err));
|
|
776
|
+
CL_CHECK((backend_ctx->kernel_geglu_erf = clCreateKernel(backend_ctx->program_glu, "kernel_geglu_erf", &err), err));
|
|
777
|
+
CL_CHECK((backend_ctx->kernel_geglu_quick = clCreateKernel(backend_ctx->program_glu, "kernel_geglu_quick", &err), err));
|
|
778
|
+
CL_CHECK((backend_ctx->kernel_geglu_f16 = clCreateKernel(backend_ctx->program_glu, "kernel_geglu_f16", &err), err));
|
|
779
|
+
CL_CHECK((backend_ctx->kernel_reglu_f16 = clCreateKernel(backend_ctx->program_glu, "kernel_reglu_f16", &err), err));
|
|
780
|
+
CL_CHECK((backend_ctx->kernel_swiglu_f16 = clCreateKernel(backend_ctx->program_glu, "kernel_swiglu_f16", &err), err));
|
|
781
|
+
CL_CHECK((backend_ctx->kernel_geglu_erf_f16 = clCreateKernel(backend_ctx->program_glu, "kernel_geglu_erf_f16", &err), err));
|
|
782
|
+
CL_CHECK((backend_ctx->kernel_geglu_quick_f16 = clCreateKernel(backend_ctx->program_glu, "kernel_geglu_quick_f16", &err), err));
|
|
783
|
+
GGML_LOG_CONT(".");
|
|
784
|
+
}
|
|
785
|
+
|
|
741
786
|
// get_rows
|
|
742
787
|
{
|
|
743
788
|
#ifdef GGML_OPENCL_EMBED_KERNELS
|
|
@@ -972,6 +1017,22 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
|
|
|
972
1017
|
GGML_LOG_CONT(".");
|
|
973
1018
|
}
|
|
974
1019
|
|
|
1020
|
+
// mul_mat_f16_f32_tiled
|
|
1021
|
+
{
|
|
1022
|
+
#ifdef GGML_OPENCL_EMBED_KERNELS
|
|
1023
|
+
const std::string kernel_src {
|
|
1024
|
+
#include "mul_mat_f16_f32.cl.h"
|
|
1025
|
+
};
|
|
1026
|
+
#else
|
|
1027
|
+
const std::string kernel_src = read_file("mul_mat_f16_f32.cl");
|
|
1028
|
+
#endif
|
|
1029
|
+
backend_ctx->program_mul_mat_f16_f32_tiled =
|
|
1030
|
+
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
|
1031
|
+
|
|
1032
|
+
CL_CHECK((backend_ctx->kernel_mul_mat_f16_f32_tiled = clCreateKernel(backend_ctx->program_mul_mat_f16_f32_tiled, "mul_mat_f16_f32", &err), err));
|
|
1033
|
+
GGML_LOG_CONT(".");
|
|
1034
|
+
}
|
|
1035
|
+
|
|
975
1036
|
// mul
|
|
976
1037
|
{
|
|
977
1038
|
#ifdef GGML_OPENCL_EMBED_KERNELS
|
|
@@ -1400,6 +1461,23 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
|
|
|
1400
1461
|
}
|
|
1401
1462
|
}
|
|
1402
1463
|
|
|
1464
|
+
// set_rows
|
|
1465
|
+
{
|
|
1466
|
+
#ifdef GGML_OPENCL_EMBED_KERNELS
|
|
1467
|
+
const std::string kernel_src {
|
|
1468
|
+
#include "set_rows.cl.h"
|
|
1469
|
+
};
|
|
1470
|
+
#else
|
|
1471
|
+
const std::string kernel_src = read_file("set_rows.cl");
|
|
1472
|
+
#endif
|
|
1473
|
+
backend_ctx->program_set_rows =
|
|
1474
|
+
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
|
1475
|
+
|
|
1476
|
+
CL_CHECK((backend_ctx->kernel_set_rows_f32 = clCreateKernel(backend_ctx->program_set_rows, "kernel_set_rows_f32", &err), err));
|
|
1477
|
+
CL_CHECK((backend_ctx->kernel_set_rows_f16 = clCreateKernel(backend_ctx->program_set_rows, "kernel_set_rows_f16", &err), err));
|
|
1478
|
+
GGML_LOG_CONT(".");
|
|
1479
|
+
}
|
|
1480
|
+
|
|
1403
1481
|
// mul_mv_id_q4_0_f32_8x_flat
|
|
1404
1482
|
{
|
|
1405
1483
|
#ifdef GGML_OPENCL_EMBED_KERNELS
|
|
@@ -2163,7 +2241,7 @@ static ggml_status ggml_backend_opencl_graph_compute(ggml_backend_t backend, ggm
|
|
|
2163
2241
|
// dependencies.
|
|
2164
2242
|
sync_with_other_backends(backend);
|
|
2165
2243
|
|
|
2166
|
-
if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
|
|
2244
|
+
if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
|
|
2167
2245
|
continue;
|
|
2168
2246
|
}
|
|
2169
2247
|
|
|
@@ -2198,6 +2276,21 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
|
|
|
2198
2276
|
default:
|
|
2199
2277
|
return false;
|
|
2200
2278
|
}
|
|
2279
|
+
case GGML_OP_SET_ROWS:
|
|
2280
|
+
{
|
|
2281
|
+
// TODO: add support
|
|
2282
|
+
// ref: https://github.com/ggml-org/llama.cpp/pull/14274
|
|
2283
|
+
if (op->src[0]->type != GGML_TYPE_F32) {
|
|
2284
|
+
return false;
|
|
2285
|
+
}
|
|
2286
|
+
switch (op->type) {
|
|
2287
|
+
case GGML_TYPE_F16:
|
|
2288
|
+
case GGML_TYPE_F32:
|
|
2289
|
+
return true;
|
|
2290
|
+
default:
|
|
2291
|
+
return false;
|
|
2292
|
+
}
|
|
2293
|
+
}
|
|
2201
2294
|
case GGML_OP_CPY:
|
|
2202
2295
|
case GGML_OP_DUP:
|
|
2203
2296
|
case GGML_OP_CONT:
|
|
@@ -2232,6 +2325,7 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
|
|
|
2232
2325
|
case GGML_UNARY_OP_GELU:
|
|
2233
2326
|
case GGML_UNARY_OP_SILU:
|
|
2234
2327
|
case GGML_UNARY_OP_RELU:
|
|
2328
|
+
case GGML_UNARY_OP_GELU_ERF:
|
|
2235
2329
|
case GGML_UNARY_OP_GELU_QUICK:
|
|
2236
2330
|
return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
|
|
2237
2331
|
case GGML_UNARY_OP_SIGMOID:
|
|
@@ -2242,6 +2336,17 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
|
|
|
2242
2336
|
default:
|
|
2243
2337
|
return false;
|
|
2244
2338
|
}
|
|
2339
|
+
case GGML_OP_GLU:
|
|
2340
|
+
switch (ggml_get_glu_op(op)) {
|
|
2341
|
+
case GGML_GLU_OP_GEGLU:
|
|
2342
|
+
case GGML_GLU_OP_REGLU:
|
|
2343
|
+
case GGML_GLU_OP_SWIGLU:
|
|
2344
|
+
case GGML_GLU_OP_GEGLU_ERF:
|
|
2345
|
+
case GGML_GLU_OP_GEGLU_QUICK:
|
|
2346
|
+
return ggml_is_contiguous_1(op->src[0]) && (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16);
|
|
2347
|
+
default:
|
|
2348
|
+
return false;
|
|
2349
|
+
}
|
|
2245
2350
|
case GGML_OP_CLAMP:
|
|
2246
2351
|
return op->src[0]->type == GGML_TYPE_F32;
|
|
2247
2352
|
case GGML_OP_SOFT_MAX:
|
|
@@ -3166,7 +3271,7 @@ static void dump_tensor(ggml_backend_t backend, const struct ggml_tensor * tenso
|
|
|
3166
3271
|
|
|
3167
3272
|
// Open file and dump.
|
|
3168
3273
|
char fname[512];
|
|
3169
|
-
|
|
3274
|
+
snprintf(fname, sizeof(fname), "./tensor-dumps/%s.txt", tensor->name);
|
|
3170
3275
|
FILE * f = fopen(fname, "w");
|
|
3171
3276
|
if (!f) {
|
|
3172
3277
|
printf("Failed to open %s\n", fname);
|
|
@@ -3325,6 +3430,111 @@ static void ggml_cl_get_rows(ggml_backend_t backend, const ggml_tensor * src0, c
|
|
|
3325
3430
|
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
|
3326
3431
|
}
|
|
3327
3432
|
|
|
3433
|
+
// Enqueue the OpenCL SET_ROWS kernel.
//
// src0 and src1 are the two kernel inputs and dst is the output; all three must
// already carry an OpenCL `extra`. (Presumably src0 holds the rows to store and
// src1 the destination row indices - confirm against the set_rows kernel.)
// Only F32 and F16 destinations are supported; any other dst type aborts.
static void ggml_cl_set_rows(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    GGML_ASSERT(src0);
    GGML_ASSERT(src0->extra);
    GGML_ASSERT(src1);
    GGML_ASSERT(src1->extra);
    GGML_ASSERT(dst);
    GGML_ASSERT(dst->extra);

    // ne0 = ne00
    // ne2 = ne02
    // ne3 = ne03

    const int ne01 = src0->ne[1];
    const int ne02 = src0->ne[2];
    const int ne03 = src0->ne[3];

    const cl_ulong nb01 = src0->nb[1];
    const cl_ulong nb02 = src0->nb[2];
    const cl_ulong nb03 = src0->nb[3];

    const int ne11 = src1->ne[1];
    const int ne12 = src1->ne[2];

    const cl_ulong nb10 = src1->nb[0];
    const cl_ulong nb11 = src1->nb[1];
    const cl_ulong nb12 = src1->nb[2];

    const int ne0 = dst->ne[0];

    const cl_ulong nb1 = dst->nb[1];
    const cl_ulong nb2 = dst->nb[2];
    const cl_ulong nb3 = dst->nb[3];

    // Number of blocks of dst->type in one destination row.
    const int nblk0 = ne0/ggml_blck_size(dst->type);

    // Nothing to do for empty input. Returning here also avoids the
    // nth0 / nblk0 division by zero below and a zero-sized NDRange launch.
    if (nblk0 == 0 || ne01 == 0 || ne02 == 0 || ne03 == 0) {
        return;
    }

    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;

    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
    ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;

    cl_ulong offset0 = extra0->offset + src0->view_offs;
    cl_ulong offset1 = extra1->offset + src1->view_offs;
    cl_ulong offsetd = extrad->offset + dst->view_offs;

    cl_kernel kernel;

    switch (dst->type) {
        case GGML_TYPE_F32:
            kernel = backend_ctx->kernel_set_rows_f32;
            break;
        case GGML_TYPE_F16:
            kernel = backend_ctx->kernel_set_rows_f16;
            break;
        default:
            GGML_ABORT("not implemented");
    }

    CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &extra0->data_device));
    CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_ulong), &offset0));
    CL_CHECK(clSetKernelArg(kernel,  2, sizeof(cl_mem),   &extra1->data_device));
    CL_CHECK(clSetKernelArg(kernel,  3, sizeof(cl_ulong), &offset1));
    CL_CHECK(clSetKernelArg(kernel,  4, sizeof(cl_mem),   &extrad->data_device));
    CL_CHECK(clSetKernelArg(kernel,  5, sizeof(cl_ulong), &offsetd));
    CL_CHECK(clSetKernelArg(kernel,  6, sizeof(int),      &ne01));
    CL_CHECK(clSetKernelArg(kernel,  7, sizeof(cl_ulong), &nb01));
    CL_CHECK(clSetKernelArg(kernel,  8, sizeof(cl_ulong), &nb02));
    CL_CHECK(clSetKernelArg(kernel,  9, sizeof(cl_ulong), &nb03));
    CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int),      &ne11));
    CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int),      &ne12));
    CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb10));
    CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb11));
    CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb12));
    CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int),      &nblk0));
    CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb1));
    CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb2));
    CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &nb3));

    // Initial thread count along dimension 0, chosen per GPU family.
    int nth0 = 64;
    if (backend_ctx->gpu_family == INTEL) {
        nth0 = 32;
    } else if (backend_ctx->gpu_family == ADRENO) {
        nth0 = 64;
    }

    // Double the thread count until it covers a row or reaches the kernel's limit.
    int max_workgroup_size = backend_ctx->get_kernel_workgroup_size(kernel);
    while (nth0 < nblk0 && nth0 < max_workgroup_size) {
        nth0 *= 2;
    }

    // If a row has fewer blocks than threads, pack several rows into one
    // work-group instead of leaving threads without work.
    int rows_per_workgroup = 1;
    if (nth0 > nblk0) {
        rows_per_workgroup = nth0 / nblk0;
        nth0 = nblk0;
    }

    size_t global_work_size[] = {
        (size_t)(ne01 + rows_per_workgroup - 1)/rows_per_workgroup*nth0,
        (size_t)ne02*rows_per_workgroup,
        (size_t)ne03};
    size_t local_work_size[] = {(size_t)nth0, (size_t)rows_per_workgroup, 1};

    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
}
|
|
3537
|
+
|
|
3328
3538
|
static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
|
3329
3539
|
GGML_ASSERT(src0);
|
|
3330
3540
|
GGML_ASSERT(src0->extra);
|
|
@@ -3825,6 +4035,44 @@ static void ggml_cl_gelu(ggml_backend_t backend, const ggml_tensor * src0, const
|
|
|
3825
4035
|
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
|
3826
4036
|
}
|
|
3827
4037
|
|
|
4038
|
+
// Enqueue the element-wise GELU (erf variant) kernel: dst = gelu_erf(src0).
//
// src1 is unused and only present to match the common op signature. When the
// element count is a multiple of 4 the `_4` kernel variant is used and the
// launch covers a quarter as many work-items.
static void ggml_cl_gelu_erf(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    GGML_ASSERT(src0);
    GGML_ASSERT(src0->extra);
    GGML_ASSERT(dst);
    GGML_ASSERT(dst->extra);

    UNUSED(src1);

    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;

    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;

    cl_ulong offset0 = extra0->offset + src0->view_offs;
    cl_ulong offsetd = extrad->offset + dst->view_offs;

    cl_kernel kernel;

    int n = ggml_nelements(dst);

    // An empty tensor would produce a zero-sized NDRange; there is nothing to compute.
    if (n == 0) {
        return;
    }

    if (n % 4 == 0) {
        kernel = backend_ctx->kernel_gelu_erf_4;
        n /= 4;
    } else {
        kernel = backend_ctx->kernel_gelu_erf;
    }

    CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem),   &extra0->data_device));
    CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
    CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem),   &extrad->data_device));
    CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));

    size_t global_work_size[] = {(size_t)n, 1, 1};
    size_t local_work_size[] = {64, 1, 1};

    // An explicit local size must evenly divide the global size unless the
    // device supports non-uniform work-groups. When it does not divide,
    // pass NULL and let the driver choose a valid work-group size.
    size_t * local_work_size_ptr = local_work_size;
    if (n % 64 != 0) {
        local_work_size_ptr = NULL;
    }

    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
}
|
|
4075
|
+
|
|
3828
4076
|
static void ggml_cl_gelu_quick(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
|
3829
4077
|
GGML_ASSERT(src0);
|
|
3830
4078
|
GGML_ASSERT(src0->extra);
|
|
@@ -4420,7 +4668,8 @@ static void ggml_cl_upscale(ggml_backend_t backend, const ggml_tensor * src0, gg
|
|
|
4420
4668
|
|
|
4421
4669
|
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
|
4422
4670
|
|
|
4423
|
-
const
|
|
4671
|
+
const int mode_flags = (ggml_scale_mode) ggml_get_op_params_i32(dst, 0);
|
|
4672
|
+
const ggml_scale_mode mode = (ggml_scale_mode) (mode_flags & 0xFF);
|
|
4424
4673
|
cl_kernel kernel = nullptr;
|
|
4425
4674
|
|
|
4426
4675
|
if (mode == GGML_SCALE_MODE_NEAREST) {
|
|
@@ -4451,18 +4700,22 @@ static void ggml_cl_upscale(ggml_backend_t backend, const ggml_tensor * src0, gg
|
|
|
4451
4700
|
const cl_ulong nb02 = src0->nb[2];
|
|
4452
4701
|
const cl_ulong nb03 = src0->nb[3];
|
|
4453
4702
|
|
|
4454
|
-
const int
|
|
4455
|
-
const int
|
|
4703
|
+
const int ne00 = src0->ne[0];
|
|
4704
|
+
const int ne01 = src0->ne[1];
|
|
4705
|
+
const int ne02 = src0->ne[2];
|
|
4706
|
+
const int ne03 = src0->ne[3];
|
|
4707
|
+
|
|
4708
|
+
const int ne0 = dst->ne[0];
|
|
4709
|
+
const int ne1 = dst->ne[1];
|
|
4710
|
+
const int ne2 = dst->ne[2];
|
|
4711
|
+
const int ne3 = dst->ne[3];
|
|
4456
4712
|
|
|
4457
|
-
|
|
4458
|
-
|
|
4459
|
-
|
|
4460
|
-
|
|
4713
|
+
float sf0 = (float)ne0 / ne00;
|
|
4714
|
+
float sf1 = (float)ne1 / ne01;
|
|
4715
|
+
float sf2 = (float)ne2 / ne02;
|
|
4716
|
+
float sf3 = (float)ne3 / ne03;
|
|
4461
4717
|
|
|
4462
|
-
|
|
4463
|
-
const float sf1 = (float)dst->ne[1] / src0->ne[1];
|
|
4464
|
-
const float sf2 = (float)dst->ne[2] / src0->ne[2];
|
|
4465
|
-
const float sf3 = (float)dst->ne[3] / src0->ne[3];
|
|
4718
|
+
float pixel_offset = 0.5f;
|
|
4466
4719
|
|
|
4467
4720
|
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra_src0->data_device));
|
|
4468
4721
|
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &off_src0));
|
|
@@ -4474,29 +4727,36 @@ static void ggml_cl_upscale(ggml_backend_t backend, const ggml_tensor * src0, gg
|
|
|
4474
4727
|
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &nb03));
|
|
4475
4728
|
|
|
4476
4729
|
if (mode == GGML_SCALE_MODE_NEAREST) {
|
|
4477
|
-
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &
|
|
4478
|
-
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &
|
|
4479
|
-
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &
|
|
4480
|
-
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &
|
|
4730
|
+
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne0));
|
|
4731
|
+
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne1));
|
|
4732
|
+
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne2));
|
|
4733
|
+
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne3));
|
|
4481
4734
|
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(float), &sf0));
|
|
4482
4735
|
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(float), &sf1));
|
|
4483
4736
|
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(float), &sf2));
|
|
4484
4737
|
CL_CHECK(clSetKernelArg(kernel, 15, sizeof(float), &sf3));
|
|
4485
4738
|
} else if (mode == GGML_SCALE_MODE_BILINEAR) {
|
|
4486
|
-
|
|
4487
|
-
|
|
4488
|
-
|
|
4489
|
-
|
|
4490
|
-
|
|
4491
|
-
|
|
4739
|
+
if (mode_flags & GGML_SCALE_FLAG_ALIGN_CORNERS) {
|
|
4740
|
+
sf0 = (float)(ne0 - 1) / (ne00 - 1);
|
|
4741
|
+
sf1 = (float)(ne1 - 1) / (ne01 - 1);
|
|
4742
|
+
pixel_offset = 0.0f;
|
|
4743
|
+
}
|
|
4744
|
+
|
|
4745
|
+
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne00));
|
|
4746
|
+
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne01));
|
|
4747
|
+
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne0));
|
|
4748
|
+
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne1));
|
|
4749
|
+
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne2));
|
|
4750
|
+
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne3));
|
|
4492
4751
|
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(float), &sf0));
|
|
4493
4752
|
CL_CHECK(clSetKernelArg(kernel, 15, sizeof(float), &sf1));
|
|
4494
4753
|
CL_CHECK(clSetKernelArg(kernel, 16, sizeof(float), &sf2));
|
|
4495
4754
|
CL_CHECK(clSetKernelArg(kernel, 17, sizeof(float), &sf3));
|
|
4755
|
+
CL_CHECK(clSetKernelArg(kernel, 18, sizeof(float), &pixel_offset));
|
|
4496
4756
|
}
|
|
4497
4757
|
|
|
4498
4758
|
|
|
4499
|
-
size_t dst_total_elements = (size_t)
|
|
4759
|
+
size_t dst_total_elements = (size_t)ne0 * ne1 * ne2 * ne3;
|
|
4500
4760
|
if (dst_total_elements == 0) {
|
|
4501
4761
|
return;
|
|
4502
4762
|
}
|
|
@@ -4685,6 +4945,58 @@ static void ggml_cl_timestep_embedding(ggml_backend_t backend, const ggml_tensor
|
|
|
4685
4945
|
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, NULL, dst);
|
|
4686
4946
|
}
|
|
4687
4947
|
|
|
4948
|
+
// Launch the tiled F16 x F32 matrix-multiplication kernel on a 2D NDRange.
//
// The matrix sizes are read as M = src0->ne[1], N = src1->ne[1] and
// K = src0->ne[0]. This function performs no validation of its own.
static void ggml_cl_mul_mat_f16_f32_tiled(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
    cl_kernel kernel = backend_ctx->kernel_mul_mat_f16_f32_tiled;

    // Device buffers and byte offsets (base offset plus view offset) of the operands.
    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
    ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;

    cl_ulong offset0 = extra0->offset + src0->view_offs;
    cl_ulong offset1 = extra1->offset + src1->view_offs;
    cl_ulong offsetd = extrad->offset + dst->view_offs;

    // Problem dimensions.
    const int K = src0->ne[0];
    const int M = src0->ne[1];
    const int N = src1->ne[1];

    cl_uint arg = 0;
    CL_CHECK(clSetKernelArg(kernel, arg++, sizeof(int),      &M));
    CL_CHECK(clSetKernelArg(kernel, arg++, sizeof(int),      &N));
    CL_CHECK(clSetKernelArg(kernel, arg++, sizeof(int),      &K));
    CL_CHECK(clSetKernelArg(kernel, arg++, sizeof(cl_mem),   &extra0->data_device));
    CL_CHECK(clSetKernelArg(kernel, arg++, sizeof(cl_ulong), &offset0));
    CL_CHECK(clSetKernelArg(kernel, arg++, sizeof(cl_mem),   &extra1->data_device));
    CL_CHECK(clSetKernelArg(kernel, arg++, sizeof(cl_ulong), &offset1));
    CL_CHECK(clSetKernelArg(kernel, arg++, sizeof(cl_mem),   &extrad->data_device));
    CL_CHECK(clSetKernelArg(kernel, arg++, sizeof(cl_ulong), &offsetd));

    // Tiling parameters; tune these for performance. They must stay in sync
    // with the #defines in the kernel source mul_mat_f16_f32.cl.
    //
    //   OPWM / OPWN: output tile (OPWM x OPWN) computed by one work-group.
    //   TPWM / TPWN: threads per work-group, i.e. the work-group size.
    //   OPTM / OPTN: output elements (OPTM x OPTN) computed by one thread.
    //
    // Required relationships:
    //   OPWM = TPWM * OPTM
    //   OPWN = TPWN * OPTN
    const int OPWM = 64;
    const int OPWN = 64;
    const int TPWM = 16;
    const int TPWN = 8;

    // One work-group per output tile, rounding the tile counts up.
    const size_t tiles_m = (size_t) ((M + OPWM - 1) / OPWM);
    const size_t tiles_n = (size_t) ((N + OPWN - 1) / OPWN);

    size_t local_work_size[2]  = { TPWM, TPWN };
    size_t global_work_size[2] = { tiles_m * TPWM, tiles_n * TPWN };

    backend_ctx->enqueue_ndrange_kernel(kernel, 2, global_work_size, local_work_size, dst);
}
|
|
4999
|
+
|
|
4688
5000
|
static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
|
4689
5001
|
GGML_ASSERT(src0);
|
|
4690
5002
|
GGML_ASSERT(src0->extra);
|
|
@@ -4698,6 +5010,18 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
|
|
|
4698
5010
|
|
|
4699
5011
|
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
|
4700
5012
|
|
|
5013
|
+
if (src0t == GGML_TYPE_F16 && src1t == GGML_TYPE_F32 &&
|
|
5014
|
+
src0->ne[1] > 32 && // M > 32
|
|
5015
|
+
src1->ne[1] > 32 && // N > 32
|
|
5016
|
+
src0->ne[0] > 32 && // K > 32
|
|
5017
|
+
src0->ne[2] == 1 && src0->ne[3] == 1 &&
|
|
5018
|
+
src1->ne[2] == 1 && src1->ne[3] == 1 &&
|
|
5019
|
+
ggml_is_contiguous(src0) && ggml_is_contiguous(src1) &&
|
|
5020
|
+
backend_ctx->kernel_mul_mat_f16_f32_tiled != NULL) {
|
|
5021
|
+
ggml_cl_mul_mat_f16_f32_tiled(backend, src0, src1, dst);
|
|
5022
|
+
return;
|
|
5023
|
+
}
|
|
5024
|
+
|
|
4701
5025
|
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
|
|
4702
5026
|
ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
|
|
4703
5027
|
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
|
|
@@ -5488,7 +5812,9 @@ static void ggml_cl_scale(ggml_backend_t backend, const ggml_tensor * src0, cons
|
|
|
5488
5812
|
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
|
5489
5813
|
|
|
5490
5814
|
float scale;
|
|
5491
|
-
|
|
5815
|
+
float bias;
|
|
5816
|
+
memcpy(&scale, ((int32_t *) dst->op_params) + 0, sizeof(float));
|
|
5817
|
+
memcpy(&bias, ((int32_t *) dst->op_params) + 1, sizeof(float));
|
|
5492
5818
|
|
|
5493
5819
|
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
|
|
5494
5820
|
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
|
|
@@ -5503,6 +5829,7 @@ static void ggml_cl_scale(ggml_backend_t backend, const ggml_tensor * src0, cons
|
|
|
5503
5829
|
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
|
|
5504
5830
|
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
|
|
5505
5831
|
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(float), &scale));
|
|
5832
|
+
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(float), &bias));
|
|
5506
5833
|
|
|
5507
5834
|
int n = ggml_nelements(dst)/4;
|
|
5508
5835
|
|
|
@@ -5712,19 +6039,31 @@ static void ggml_cl_soft_max(ggml_backend_t backend, const ggml_tensor * src0, c
|
|
|
5712
6039
|
|
|
5713
6040
|
cl_ulong offset1 = extra1 ? extra1->offset + src1->view_offs : offset0;
|
|
5714
6041
|
|
|
5715
|
-
const int
|
|
5716
|
-
const int
|
|
5717
|
-
const int
|
|
5718
|
-
const int
|
|
6042
|
+
const int ne00 = src0->ne[0];
|
|
6043
|
+
const int ne01 = src0->ne[1];
|
|
6044
|
+
const int ne02 = src0->ne[2];
|
|
6045
|
+
const int ne03 = src0->ne[3];
|
|
6046
|
+
|
|
6047
|
+
const cl_long nb01 = src0->nb[1];
|
|
6048
|
+
const cl_long nb02 = src0->nb[2];
|
|
6049
|
+
const cl_long nb03 = src0->nb[3];
|
|
6050
|
+
|
|
6051
|
+
const int ne12 = src1 ? src1->ne[2] : 0;
|
|
6052
|
+
const int ne13 = src1 ? src1->ne[3] : 0;
|
|
6053
|
+
|
|
6054
|
+
const cl_long nb11 = src1 ? src1->nb[1] : 0;
|
|
6055
|
+
const cl_long nb12 = src1 ? src1->nb[2] : 0;
|
|
6056
|
+
const cl_long nb13 = src1 ? src1->nb[3] : 0;
|
|
6057
|
+
|
|
6058
|
+
const cl_long nb1 = dst->nb[1];
|
|
6059
|
+
const cl_long nb2 = dst->nb[2];
|
|
6060
|
+
const cl_long nb3 = dst->nb[3];
|
|
5719
6061
|
|
|
5720
6062
|
float scale, max_bias;
|
|
5721
6063
|
memcpy(&scale, dst->op_params + 0, sizeof(float));
|
|
5722
6064
|
memcpy(&max_bias, dst->op_params + 1, sizeof(float));
|
|
5723
6065
|
|
|
5724
|
-
const int
|
|
5725
|
-
const int nrows_y = src0->ne[1];
|
|
5726
|
-
|
|
5727
|
-
const int n_head = nrows_x/nrows_y;
|
|
6066
|
+
const int n_head = src0->ne[2];
|
|
5728
6067
|
const int n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head));
|
|
5729
6068
|
|
|
5730
6069
|
const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
|
|
@@ -5769,13 +6108,22 @@ static void ggml_cl_soft_max(ggml_backend_t backend, const ggml_tensor * src0, c
|
|
|
5769
6108
|
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
|
|
5770
6109
|
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
|
|
5771
6110
|
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00));
|
|
5772
|
-
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(
|
|
5773
|
-
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(
|
|
5774
|
-
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(
|
|
5775
|
-
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(
|
|
5776
|
-
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(
|
|
5777
|
-
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(
|
|
5778
|
-
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(
|
|
6111
|
+
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &nb01));
|
|
6112
|
+
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb02));
|
|
6113
|
+
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb03));
|
|
6114
|
+
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne12));
|
|
6115
|
+
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne13));
|
|
6116
|
+
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb11));
|
|
6117
|
+
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb12));
|
|
6118
|
+
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb13));
|
|
6119
|
+
CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &nb1));
|
|
6120
|
+
CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb2));
|
|
6121
|
+
CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb3));
|
|
6122
|
+
CL_CHECK(clSetKernelArg(kernel, 18, sizeof(float), &scale));
|
|
6123
|
+
CL_CHECK(clSetKernelArg(kernel, 19, sizeof(float), &max_bias));
|
|
6124
|
+
CL_CHECK(clSetKernelArg(kernel, 20, sizeof(float), &m0));
|
|
6125
|
+
CL_CHECK(clSetKernelArg(kernel, 21, sizeof(float), &m1));
|
|
6126
|
+
CL_CHECK(clSetKernelArg(kernel, 22, sizeof(int), &n_head_log2));
|
|
5779
6127
|
|
|
5780
6128
|
size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
|
|
5781
6129
|
size_t local_work_size[] = {(size_t)nth, 1, 1};
|
|
@@ -6143,6 +6491,105 @@ static void ggml_cl_sum_rows(ggml_backend_t backend, const ggml_tensor * src0, c
|
|
|
6143
6491
|
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
|
6144
6492
|
}
|
|
6145
6493
|
|
|
6494
|
+
// Enqueue a gated-linear-unit kernel (GEGLU, REGLU, SWIGLU, GEGLU_ERF or
// GEGLU_QUICK) selected by ggml_get_glu_op(dst).
//
// src1 is optional. When present it is bound as the second input and must have
// the same shape as src0. When absent, src0 is bound as both inputs and an
// element offset of dst->ne[0] is applied to one of them; op_params[1] (`swp`)
// decides which. The F32 kernel is used when dst is F32, otherwise the `_f16`
// variant.
static void ggml_cl_glu(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    GGML_ASSERT(src0);
    GGML_ASSERT(src0->extra);
    GGML_ASSERT(dst);
    GGML_ASSERT(dst->extra);

    GGML_ASSERT(ggml_is_contiguous_1(src0));

    if (src1) {
        GGML_ASSERT(src1->extra);
        GGML_ASSERT(ggml_are_same_shape(src0, src1));
    }

    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;

    // Every op comes in an F32 flavour and an `_f16` flavour.
    const bool is_f32 = dst->type == GGML_TYPE_F32;

    cl_kernel kernel;
    switch (ggml_get_glu_op(dst)) {
        case GGML_GLU_OP_GEGLU:
            kernel = is_f32 ? backend_ctx->kernel_geglu : backend_ctx->kernel_geglu_f16;
            break;
        case GGML_GLU_OP_REGLU:
            kernel = is_f32 ? backend_ctx->kernel_reglu : backend_ctx->kernel_reglu_f16;
            break;
        case GGML_GLU_OP_SWIGLU:
            kernel = is_f32 ? backend_ctx->kernel_swiglu : backend_ctx->kernel_swiglu_f16;
            break;
        case GGML_GLU_OP_GEGLU_ERF:
            kernel = is_f32 ? backend_ctx->kernel_geglu_erf : backend_ctx->kernel_geglu_erf_f16;
            break;
        case GGML_GLU_OP_GEGLU_QUICK:
            kernel = is_f32 ? backend_ctx->kernel_geglu_quick : backend_ctx->kernel_geglu_quick_f16;
            break;
        default:
            GGML_ABORT("Unsupported glu op");
    }

    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
    ggml_tensor_extra_cl * extra1 = src1 ? (ggml_tensor_extra_cl *)src1->extra : nullptr;

    cl_ulong offset0 = extra0->offset + src0->view_offs;
    cl_ulong offsetd = extrad->offset + dst->view_offs;

    const int      ne0  = dst->ne[0];
    const cl_ulong nb01 = src0->nb[1];
    const cl_ulong nb1  = dst->nb[1];

    // Second-input offset and row stride fall back to src0's when src1 is absent.
    cl_ulong offset1 = offset0;
    cl_ulong nb11    = nb01;
    if (extra1) {
        offset1 = extra1->offset + src1->view_offs;
    }
    if (src1) {
        nb11 = src1->nb[1];
    }

    // Single-tensor case: one input starts ne0 elements into the row; `swp` picks which.
    const int swp = ((const int32_t *) dst->op_params)[1];
    int ne00_off = 0;
    int ne10_off = 0;
    if (!src1) {
        if (swp) {
            ne00_off = ne0;
        } else {
            ne10_off = ne0;
        }
    }

    CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &extra0->data_device));
    CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_ulong), &offset0));
    CL_CHECK(clSetKernelArg(kernel,  2, sizeof(cl_mem),   src1 ? &extra1->data_device : &extra0->data_device));
    CL_CHECK(clSetKernelArg(kernel,  3, sizeof(cl_ulong), &offset1));
    CL_CHECK(clSetKernelArg(kernel,  4, sizeof(cl_mem),   &extrad->data_device));
    CL_CHECK(clSetKernelArg(kernel,  5, sizeof(cl_ulong), &offsetd));
    CL_CHECK(clSetKernelArg(kernel,  6, sizeof(cl_ulong), &nb01));
    CL_CHECK(clSetKernelArg(kernel,  7, sizeof(cl_ulong), &nb11));
    CL_CHECK(clSetKernelArg(kernel,  8, sizeof(int),      &ne0));
    CL_CHECK(clSetKernelArg(kernel,  9, sizeof(cl_ulong), &nb1));
    CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int),      &ne00_off));
    CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int),      &ne10_off));

    // One work-group of `nth` threads per row of src0.
    size_t nth = 512;
    const size_t nrows = ggml_nrows(src0);

    size_t local_work_size[]  = {nth, 1, 1};
    size_t global_work_size[] = {nrows*nth, 1, 1};

    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
}
|
|
6592
|
+
|
|
6146
6593
|
//------------------------------------------------------------------------------
|
|
6147
6594
|
// Op offloading
|
|
6148
6595
|
//------------------------------------------------------------------------------
|
|
@@ -6166,6 +6613,12 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
|
|
|
6166
6613
|
}
|
|
6167
6614
|
func = ggml_cl_get_rows;
|
|
6168
6615
|
break;
|
|
6616
|
+
case GGML_OP_SET_ROWS:
|
|
6617
|
+
if (!any_on_device) {
|
|
6618
|
+
return false;
|
|
6619
|
+
}
|
|
6620
|
+
func = ggml_cl_set_rows;
|
|
6621
|
+
break;
|
|
6169
6622
|
case GGML_OP_CPY:
|
|
6170
6623
|
if (!any_on_device) {
|
|
6171
6624
|
return false;
|
|
@@ -6211,6 +6664,12 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
|
|
|
6211
6664
|
}
|
|
6212
6665
|
func = ggml_cl_gelu;
|
|
6213
6666
|
break;
|
|
6667
|
+
case GGML_UNARY_OP_GELU_ERF:
|
|
6668
|
+
if (!any_on_device) {
|
|
6669
|
+
return false;
|
|
6670
|
+
}
|
|
6671
|
+
func = ggml_cl_gelu_erf;
|
|
6672
|
+
break;
|
|
6214
6673
|
case GGML_UNARY_OP_GELU_QUICK:
|
|
6215
6674
|
if (!any_on_device) {
|
|
6216
6675
|
return false;
|
|
@@ -6244,6 +6703,12 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
|
|
|
6244
6703
|
default:
|
|
6245
6704
|
return false;
|
|
6246
6705
|
} break;
|
|
6706
|
+
case GGML_OP_GLU:
|
|
6707
|
+
if (!any_on_device) {
|
|
6708
|
+
return false;
|
|
6709
|
+
}
|
|
6710
|
+
func = ggml_cl_glu;
|
|
6711
|
+
break;
|
|
6247
6712
|
case GGML_OP_CLAMP:
|
|
6248
6713
|
if (!any_on_device) {
|
|
6249
6714
|
return false;
|