@novastera-oss/llamarn 0.2.1 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +80 -14
- package/RNLlamaCpp.podspec +10 -3
- package/android/CMakeLists.txt +8 -0
- package/android/src/main/cpp/include/llama.h +62 -125
- package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
- package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
- package/cpp/build-info.cpp +2 -2
- package/cpp/llama.cpp/README.md +11 -3
- package/cpp/llama.cpp/build-xcframework.sh +1 -0
- package/cpp/llama.cpp/common/CMakeLists.txt +8 -2
- package/cpp/llama.cpp/common/arg.cpp +153 -113
- package/cpp/llama.cpp/common/chat-parser.cpp +379 -0
- package/cpp/llama.cpp/common/chat-parser.h +117 -0
- package/cpp/llama.cpp/common/chat.cpp +847 -699
- package/cpp/llama.cpp/common/chat.h +73 -6
- package/cpp/llama.cpp/common/common.cpp +50 -82
- package/cpp/llama.cpp/common/common.h +21 -17
- package/cpp/llama.cpp/common/json-partial.cpp +255 -0
- package/cpp/llama.cpp/common/json-partial.h +37 -0
- package/cpp/llama.cpp/common/minja/chat-template.hpp +9 -5
- package/cpp/llama.cpp/common/minja/minja.hpp +69 -36
- package/cpp/llama.cpp/common/regex-partial.cpp +204 -0
- package/cpp/llama.cpp/common/regex-partial.h +56 -0
- package/cpp/llama.cpp/common/sampling.cpp +7 -8
- package/cpp/llama.cpp/convert_hf_to_gguf.py +453 -118
- package/cpp/llama.cpp/convert_hf_to_gguf_update.py +120 -68
- package/cpp/llama.cpp/ggml/CMakeLists.txt +2 -1
- package/cpp/llama.cpp/ggml/cmake/common.cmake +25 -0
- package/cpp/llama.cpp/ggml/include/ggml-opt.h +49 -28
- package/cpp/llama.cpp/ggml/include/ggml.h +26 -7
- package/cpp/llama.cpp/ggml/src/CMakeLists.txt +16 -10
- package/cpp/llama.cpp/ggml/src/ggml-backend.cpp +4 -1
- package/cpp/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +1 -0
- package/cpp/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +604 -0
- package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +42 -0
- package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +54 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +50 -51
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +5 -9
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +779 -19
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +22 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +88 -5
- package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +47 -12
- package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +264 -69
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +322 -100
- package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +117 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +85 -16
- package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.h +220 -49
- package/cpp/llama.cpp/ggml/src/ggml-cuda/acc.cu +40 -26
- package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/cpy.cu +11 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-common.cuh +15 -7
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +266 -64
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f16.cuh +49 -4
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f32.cuh +48 -4
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn.cu +2 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +5 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmq.cu +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/quantize.cu +7 -6
- package/cpp/llama.cpp/ggml/src/ggml-cuda/sum.cu +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cu +10 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cuh +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-impl.h +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +99 -17
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +200 -2
- package/cpp/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +8 -2
- package/cpp/llama.cpp/ggml/src/ggml-musa/mudnn.cu +112 -0
- package/cpp/llama.cpp/ggml/src/ggml-musa/mudnn.cuh +12 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +6 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +972 -178
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/argsort.cl +86 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/div.cl +72 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/group_norm.cl +72 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/sigmoid.cl +29 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/sub.cl +72 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/sum_rows.cl +39 -0
- package/cpp/llama.cpp/ggml/src/ggml-opt.cpp +373 -190
- package/cpp/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +29 -23
- package/cpp/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +5 -10
- package/cpp/llama.cpp/ggml/src/ggml-sycl/common.hpp +101 -5
- package/cpp/llama.cpp/ggml/src/ggml-sycl/concat.cpp +31 -33
- package/cpp/llama.cpp/ggml/src/ggml-sycl/conv.cpp +1 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/convert.cpp +29 -2
- package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +4 -5
- package/cpp/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +59 -21
- package/cpp/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +9 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +84 -72
- package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +37 -8
- package/cpp/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +1 -3
- package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +324 -129
- package/cpp/llama.cpp/ggml/src/ggml-sycl/gla.cpp +1 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +31 -2
- package/cpp/llama.cpp/ggml/src/ggml-sycl/norm.cpp +95 -68
- package/cpp/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +1 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/quants.hpp +22 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +1 -2
- package/cpp/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +1 -4
- package/cpp/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +2 -3
- package/cpp/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +69 -43
- package/cpp/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +2 -14
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +81 -91
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +432 -181
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +17 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_m.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +6 -152
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp +162 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +360 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +2 -118
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +12 -1
- package/cpp/llama.cpp/ggml/src/ggml.c +107 -36
- package/cpp/llama.cpp/ggml/src/gguf.cpp +33 -33
- package/cpp/llama.cpp/gguf-py/gguf/constants.py +100 -15
- package/cpp/llama.cpp/gguf-py/gguf/gguf_reader.py +1 -1
- package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +44 -12
- package/cpp/llama.cpp/gguf-py/gguf/scripts/gguf_editor_gui.py +21 -10
- package/cpp/llama.cpp/gguf-py/gguf/scripts/gguf_new_metadata.py +5 -2
- package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +128 -31
- package/cpp/llama.cpp/gguf-py/gguf/utility.py +1 -1
- package/cpp/llama.cpp/gguf-py/pyproject.toml +1 -1
- package/cpp/llama.cpp/include/llama.h +62 -125
- package/cpp/llama.cpp/models/ggml-vocab-bert-bge.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-bert-bge.gguf.out +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-command-r.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-command-r.gguf.out +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.out +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.out +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-falcon.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-falcon.gguf.out +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-gpt-2.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-gpt-2.gguf.out +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-llama-bpe.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-llama-bpe.gguf.out +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-llama-spm.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-llama-spm.gguf.out +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-mpt.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-mpt.gguf.out +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-nomic-bert-moe.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-phi-3.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-phi-3.gguf.out +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-qwen2.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-qwen2.gguf.out +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-refact.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-refact.gguf.out +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-starcoder.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-starcoder.gguf.out +1 -1
- package/cpp/llama.cpp/models/templates/Qwen-QwQ-32B.jinja +62 -0
- package/cpp/llama.cpp/models/templates/Qwen-Qwen3-0.6B.jinja +85 -0
- package/cpp/llama.cpp/models/templates/README.md +2 -0
- package/cpp/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +5 -1
- package/cpp/llama.cpp/requirements/requirements-convert_hf_to_gguf_update.txt +5 -1
- package/cpp/llama.cpp/requirements/requirements-convert_lora_to_gguf.txt +2 -0
- package/cpp/llama.cpp/requirements/requirements-gguf_editor_gui.txt +1 -1
- package/cpp/llama.cpp/src/CMakeLists.txt +2 -0
- package/cpp/llama.cpp/src/llama-arch.cpp +6 -0
- package/cpp/llama.cpp/src/llama-arch.h +2 -0
- package/cpp/llama.cpp/src/llama-batch.cpp +3 -1
- package/cpp/llama.cpp/src/llama-context.cpp +340 -123
- package/cpp/llama.cpp/src/llama-context.h +30 -0
- package/cpp/llama.cpp/src/llama-cparams.cpp +4 -0
- package/cpp/llama.cpp/src/llama-cparams.h +2 -0
- package/cpp/llama.cpp/src/llama-grammar.cpp +12 -2
- package/cpp/llama.cpp/src/llama-graph.cpp +157 -247
- package/cpp/llama.cpp/src/llama-graph.h +52 -7
- package/cpp/llama.cpp/src/llama-hparams.cpp +17 -1
- package/cpp/llama.cpp/src/llama-hparams.h +37 -5
- package/cpp/llama.cpp/src/llama-kv-cache.cpp +742 -481
- package/cpp/llama.cpp/src/llama-kv-cache.h +196 -99
- package/cpp/llama.cpp/src/llama-kv-cells.h +379 -0
- package/cpp/llama.cpp/src/llama-memory.h +4 -3
- package/cpp/llama.cpp/src/llama-model-loader.cpp +22 -17
- package/cpp/llama.cpp/src/llama-model-saver.cpp +281 -0
- package/cpp/llama.cpp/src/llama-model-saver.h +37 -0
- package/cpp/llama.cpp/src/llama-model.cpp +529 -172
- package/cpp/llama.cpp/src/llama-model.h +6 -1
- package/cpp/llama.cpp/src/llama-quant.cpp +15 -13
- package/cpp/llama.cpp/src/llama-sampling.cpp +2 -2
- package/cpp/llama.cpp/src/llama-vocab.cpp +35 -8
- package/cpp/llama.cpp/src/llama-vocab.h +6 -0
- package/cpp/llama.cpp/src/llama.cpp +14 -0
- package/cpp/rn-completion.cpp +4 -2
- package/ios/include/chat.h +73 -6
- package/ios/include/common/minja/chat-template.hpp +9 -5
- package/ios/include/common/minja/minja.hpp +69 -36
- package/ios/include/common.h +21 -17
- package/ios/include/llama.h +62 -125
- package/ios/libs/llama.xcframework/Info.plist +19 -19
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4617 -4487
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-opt.h +237 -0
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +26 -7
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +62 -125
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4638 -4508
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3557 -3435
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +237 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +26 -7
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +62 -125
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4638 -4508
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3559 -3437
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-opt.h +237 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +26 -7
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +62 -125
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-opt.h +237 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +26 -7
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +62 -125
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-opt.h +237 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +26 -7
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +62 -125
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4616 -4487
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-opt.h +237 -0
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +26 -7
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +62 -125
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4637 -4508
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3556 -3435
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +237 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +26 -7
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +62 -125
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4653 -4523
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-opt.h +237 -0
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +26 -7
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +62 -125
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4674 -4544
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3587 -3465
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +237 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +26 -7
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +62 -125
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/package.json +1 -1
- package/cpp/llama.cpp/common/stb_image.h +0 -7988
- package/cpp/llama.cpp/models/ggml-vocab-chameleon.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-chameleon.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-deepseek-r1-qwen.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-deepseek-r1-qwen.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-gpt-4o.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-gpt-4o.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-llama4.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-llama4.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-pixtral.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-pixtral.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.out +0 -46
package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp

@@ -27,6 +27,7 @@
 #include <cmath>
 #include <memory>
 #include <charconv>
+#include <mutex>
 
 #undef MIN
 #undef MAX
@@ -74,6 +75,7 @@ struct ggml_cl_version {
     cl_uint minor = 0;
 };
 
+
 struct ggml_cl_compiler_version {
     ADRENO_CL_COMPILER_TYPE type;
     int major = -1;
@@ -91,6 +93,14 @@ struct ggml_cl_compiler_version {
     }
 };
 
+static size_t align_to(size_t value, size_t to_alignment) {
+    GGML_ASSERT(to_alignment && "Invalid alignment (must be non-zero)");
+    GGML_ASSERT((to_alignment & (to_alignment - 1)) == 0 && "to_alignment must be power-of-two");
+
+    return ((value + to_alignment - 1) / to_alignment) * to_alignment;
+}
+
+
 // Parses a version string of form "XX.YY ". On an error returns ggml_cl_version with all zeroes.
 static ggml_cl_version parse_cl_version(std::string_view str) {
     size_t major_str_begin = 0;
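The `align_to` helper added above rounds a byte offset up to the next multiple of a power-of-two alignment; it is used further down in this diff to place OpenCL sub-buffer origins on the device's base-address alignment. A minimal standalone sketch of the same round-up (the bitwise form is equivalent to the divide/multiply form for power-of-two alignments; `main` here is only a hypothetical driver):

```cpp
#include <cassert>
#include <cstddef>
#include <cstdio>

// Round 'value' up to the next multiple of 'to_alignment' (a power of two).
static size_t align_to(size_t value, size_t to_alignment) {
    assert(to_alignment && (to_alignment & (to_alignment - 1)) == 0);
    return (value + to_alignment - 1) & ~(to_alignment - 1);
}

int main() {
    printf("%zu\n", align_to(100, 128)); // prints 128
    printf("%zu\n", align_to(128, 128)); // prints 128 (already aligned)
}
```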
@@ -221,13 +231,25 @@ static ggml_cl_compiler_version get_adreno_cl_compiler_version(const char *drive
     return { type, major, minor, patch };
 }
 
+struct ggml_backend_opencl_context;
+
 // backend device context
 struct ggml_backend_opencl_device_context {
     cl_platform_id platform;
     std::string platform_name;
 
-    cl_device_id device;
-    std::string device_name;
+    cl_device_id   device;
+    std::string    device_name;
+    cl_device_type device_type;
+    std::string    device_version;
+
+    // Initialized by ggml_cl2_init().
+    ggml_backend_opencl_context * backend_ctx = nullptr;
+
+    // Initialized by ggml_backend_opencl_device_get_buffer_type()
+    ggml_backend_buffer_type buffer_type;
+
+    cl_context context = nullptr;
 };
 
 // backend context
@@ -248,6 +270,8 @@ struct ggml_backend_opencl_context {
 
     int adreno_wave_size;
 
+    cl_bool non_uniform_workgroups;
+
     cl_context context;
     cl_command_queue queue;
 
@@ -275,27 +299,37 @@ struct ggml_backend_opencl_context {
     cl_program program_mul_mv_f16_f32;
     cl_program program_mul_mv_f32_f32;
     cl_program program_mul;
+    cl_program program_div;
+    cl_program program_sub;
     cl_program program_norm;
     cl_program program_relu;
     cl_program program_rms_norm;
+    cl_program program_group_norm;
     cl_program program_rope;
     cl_program program_scale;
     cl_program program_silu;
+    cl_program program_sigmoid;
     cl_program program_softmax_f32;
     cl_program program_softmax_f16;
     cl_program program_softmax_4_f32;
     cl_program program_softmax_4_f16;
+    cl_program program_argsort_f32_i32;
+    cl_program program_sum_rows_f32;
 
     cl_kernel kernel_add, kernel_add_row;
     cl_kernel kernel_mul, kernel_mul_row;
+    cl_kernel kernel_div, kernel_div_row;
+    cl_kernel kernel_sub, kernel_sub_row;
     cl_kernel kernel_scale;
     cl_kernel kernel_silu, kernel_silu_4;
     cl_kernel kernel_gelu, kernel_gelu_4;
     cl_kernel kernel_gelu_quick, kernel_gelu_quick_4;
     cl_kernel kernel_relu;
+    cl_kernel kernel_sigmoid_f32, kernel_sigmoid_f16;
     cl_kernel kernel_clamp;
     cl_kernel kernel_norm;
     cl_kernel kernel_rms_norm;
+    cl_kernel kernel_group_norm;
     cl_kernel kernel_diag_mask_inf, kernel_diag_mask_inf_8;
     cl_kernel kernel_soft_max, kernel_soft_max_4;
     cl_kernel kernel_soft_max_f16, kernel_soft_max_4_f16;
@@ -315,6 +349,8 @@ struct ggml_backend_opencl_context {
     cl_kernel kernel_mul_mat_q4_0_f32_1d_8x_flat, kernel_mul_mat_q4_0_f32_1d_16x_flat;
     cl_kernel kernel_mul_mv_q6_K_f32;
     cl_kernel kernel_im2col_f32, kernel_im2col_f16;
+    cl_kernel kernel_argsort_f32_i32;
+    cl_kernel kernel_sum_rows_f32;
 
 #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
     // Transpose kernels
@@ -344,15 +380,8 @@ struct ggml_backend_opencl_context {
 #endif // GGML_OPENCL_USE_ADRENO_KERNELS
 };
 
-
-static ggml_backend_opencl_device_context g_ggml_ctx_dev_main {
-    /*.platform =*/ nullptr,
-    /*.platform_nane =*/ "",
-    /*.device =*/ nullptr,
-    /*.device_name =*/ "",
-};
-
-static int ggml_backend_opencl_n_devices = 0;
+// All registered devices with a default device in the front.
+static std::vector<ggml_backend_device> g_ggml_backend_opencl_devices;
 
 // Profiling
 #ifdef GGML_OPENCL_PROFILING
@@ -969,6 +998,105 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
         GGML_LOG_CONT(".");
     }
 
+    // argsort
+    {
+#ifdef GGML_OPENCL_EMBED_KERNELS
+        const std::string kernel_src {
+            #include "argsort.cl.h"
+        };
+#else
+        const std::string kernel_src = read_file("argsort.cl");
+#endif
+        backend_ctx->program_argsort_f32_i32 =
+            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
+
+        CL_CHECK((backend_ctx->kernel_argsort_f32_i32 = clCreateKernel(backend_ctx->program_argsort_f32_i32, "kernel_argsort_f32_i32", &err), err));
+        GGML_LOG_CONT(".");
+    }
+
+    // div
+    {
+#ifdef GGML_OPENCL_EMBED_KERNELS
+        const std::string kernel_src {
+            #include "div.cl.h"
+        };
+#else
+        const std::string kernel_src = read_file("div.cl");
+#endif
+        backend_ctx->program_div =
+            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
+
+        CL_CHECK((backend_ctx->kernel_div = clCreateKernel(backend_ctx->program_div, "kernel_div", &err), err));
+        CL_CHECK((backend_ctx->kernel_div_row = clCreateKernel(backend_ctx->program_div, "kernel_div_row", &err), err));
+        GGML_LOG_CONT(".");
+    }
+
+    // sub
+    {
+#ifdef GGML_OPENCL_EMBED_KERNELS
+        const std::string kernel_src {
+            #include "sub.cl.h"
+        };
+#else
+        const std::string kernel_src = read_file("sub.cl");
+#endif
+        backend_ctx->program_sub =
+            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
+
+        CL_CHECK((backend_ctx->kernel_sub = clCreateKernel(backend_ctx->program_sub, "kernel_sub", &err), err));
+        CL_CHECK((backend_ctx->kernel_sub_row = clCreateKernel(backend_ctx->program_sub, "kernel_sub_row", &err), err));
+        GGML_LOG_CONT(".");
+    }
+
+    // sum_rows
+    {
+#ifdef GGML_OPENCL_EMBED_KERNELS
+        const std::string kernel_src {
+            #include "sum_rows.cl.h"
+        };
+#else
+        const std::string kernel_src = read_file("sum_rows.cl");
+#endif
+        backend_ctx->program_sum_rows_f32 =
+            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
+
+        CL_CHECK((backend_ctx->kernel_sum_rows_f32 = clCreateKernel(backend_ctx->program_sum_rows_f32, "kernel_sum_rows_f32", &err), err));
+        GGML_LOG_CONT(".");
+    }
+
+    // sigmoid
+    {
+#ifdef GGML_OPENCL_EMBED_KERNELS
+        const std::string kernel_src {
+            #include "sigmoid.cl.h"
+        };
+#else
+        const std::string kernel_src = read_file("sigmoid.cl");
+#endif
+        backend_ctx->program_sigmoid =
+            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
+
+        CL_CHECK((backend_ctx->kernel_sigmoid_f32 = clCreateKernel(backend_ctx->program_sigmoid, "kernel_sigmoid_f32", &err), err));
+        CL_CHECK((backend_ctx->kernel_sigmoid_f16 = clCreateKernel(backend_ctx->program_sigmoid, "kernel_sigmoid_f16", &err), err));
+        GGML_LOG_CONT(".");
+    }
+
+    // group_norm
+    {
+#ifdef GGML_OPENCL_EMBED_KERNELS
+        const std::string kernel_src {
+            #include "group_norm.cl.h"
+        };
+#else
+        const std::string kernel_src = read_file("group_norm.cl");
+#endif
+        backend_ctx->program_group_norm =
+            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
+
+        CL_CHECK((backend_ctx->kernel_group_norm = clCreateKernel(backend_ctx->program_group_norm, "kernel_group_norm", &err), err));
+        GGML_LOG_CONT(".");
+    }
+
     // Adreno kernels
 #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
     // transpose
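Each of the six new kernel blocks above repeats the file's existing load pattern: take the kernel source (embedded at build time via `GGML_OPENCL_EMBED_KERNELS` or read from disk), build a `cl_program`, and pull named kernels out of it. A reduced sketch of the raw OpenCL calls behind that pattern, with error handling collapsed to asserts; `source` and `name` stand in for, e.g., the contents of `div.cl` and `"kernel_div"`:

```cpp
#include <cassert>
#include <CL/cl.h>

// Build one OpenCL C source string and fetch a named kernel from it --
// roughly what build_program_from_source + clCreateKernel do in the diff.
static cl_kernel build_and_get_kernel(cl_context ctx, cl_device_id dev,
                                      const char * source, const char * name) {
    cl_int err = CL_SUCCESS;
    cl_program prog = clCreateProgramWithSource(ctx, 1, &source, nullptr, &err);
    assert(err == CL_SUCCESS);
    err = clBuildProgram(prog, 1, &dev, "" /* compile options */, nullptr, nullptr);
    assert(err == CL_SUCCESS);
    cl_kernel kernel = clCreateKernel(prog, name, &err);
    assert(err == CL_SUCCESS);
    return kernel;
}
```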
@@ -1107,25 +1235,19 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
     GGML_LOG_CONT("\n");
 }
 
-static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
-    static bool initialized = false;
-    static ggml_backend_opencl_context *backend_ctx = nullptr;
-
-    if (initialized) {
-        return backend_ctx;
-    }
+// XXX static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
+// XXX static bool initialized = false;
+// XXX static ggml_backend_opencl_context *backend_ctx = nullptr;
 
-    ggml_backend_opencl_device_context *dev_ctx = (ggml_backend_opencl_device_context *)dev->context;
-    GGML_ASSERT(dev_ctx);
-    GGML_ASSERT(dev_ctx->platform == nullptr);
-    GGML_ASSERT(dev_ctx->device == nullptr);
-    GGML_ASSERT(backend_ctx == nullptr);
+static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev);
 
-
-
-
+namespace /* anonymous */ {
+extern struct ggml_backend_device_i ggml_backend_opencl_device_i;
+}
 
-
+// Look for available and suitable devices.
+static std::vector<ggml_backend_device> ggml_opencl_probe_devices(ggml_backend_reg * reg) {
+    std::vector<ggml_backend_device> found_devices;
 
 #ifdef GGML_OPENCL_PROFILING
     GGML_LOG_INFO("ggml_opencl: OpenCL profiling enabled\n");
@@ -1158,11 +1280,12 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
     struct cl_device devices[NDEV];
     unsigned n_devices = 0;
     struct cl_device * default_device = NULL;
+    unsigned default_platform_number = 0;
 
     cl_platform_id platform_ids[NPLAT];
     if (clGetPlatformIDs(NPLAT, platform_ids, &n_platforms) != CL_SUCCESS) {
         GGML_LOG_ERROR("ggml_opencl: plaform IDs not available.\n");
-        return backend_ctx;
+        return found_devices;
     }
 
     for (unsigned i = 0; i < n_platforms; i++) {
@@ -1197,19 +1320,22 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
         }
 
         if (default_device == NULL && p->default_device != NULL) {
-            default_device = p->default_device;
+            default_device = p->default_device;
+            default_platform_number = i;
         }
     }
 
     if (n_devices == 0) {
         GGML_LOG_ERROR("ggml_opencl: could find any OpenCL devices.\n");
-        return backend_ctx;
+        return found_devices;
     }
 
-    char * user_platform_string = getenv("GGML_OPENCL_PLATFORM");
-    char * user_device_string = getenv("GGML_OPENCL_DEVICE");
-    int user_platform_number = -1;
-    int user_device_number = -1;
+    char * user_platform_string = getenv("GGML_OPENCL_PLATFORM");
+    char * user_device_string = getenv("GGML_OPENCL_DEVICE");
+    int user_platform_number = -1;
+    int user_device_number = -1;
+    cl_device * candidate_devices = nullptr;
+    unsigned n_candidate_devices = 0;
 
     unsigned n;
     if (user_platform_string != NULL && sscanf(user_platform_string, " %u", &n) == 1 && n < n_platforms) {
@@ -1224,12 +1350,11 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
             GGML_LOG_ERROR("ggml_opencl: invalid device number %d\n", user_device_number);
             exit(1);
         }
-        default_device = &platform->devices[user_device_number];
+        default_device = &platform->devices[user_device_number];
+        candidate_devices = platform->devices;
+        n_candidate_devices = platform->n_devices;
     } else {
-
-        struct cl_device * selected_devices = devices;
-        unsigned n_selected_devices = n_devices;
-
+        // Choose a platform by matching a substring.
         if (user_platform_number == -1 && user_platform_string != NULL && user_platform_string[0] != 0) {
             for (unsigned i = 0; i < n_platforms; i++) {
                 struct cl_platform * p = &platforms[i];
@@ -1244,20 +1369,20 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
                 exit(1);
             }
         }
-        if (user_platform_number != -1) {
-            struct cl_platform * p = &platforms[user_platform_number];
-            selected_devices = p->devices;
-            n_selected_devices = p->n_devices;
-            default_device = p->default_device;
-            if (n_selected_devices == 0) {
-                GGML_LOG_ERROR("ggml_opencl: selected platform #%d has no devices.\n", user_platform_number);
-                exit(1);
-            }
+
+        int platform_idx = user_platform_number != -1 ? user_platform_number : default_platform_number;
+        struct cl_platform * p = &platforms[platform_idx];
+        candidate_devices = p->devices;
+        n_candidate_devices = p->n_devices;
+        default_device = p->default_device;
+        if (n_candidate_devices == 0) {
+            GGML_LOG_ERROR("ggml_opencl: selected platform '%s' does not have any devices.\n", p->name);
+            exit(1);
         }
 
         if (user_device_number == -1 && user_device_string != NULL && user_device_string[0] != 0) {
-            for (unsigned i = 0; i < n_selected_devices; i++) {
-                struct cl_device * d = &selected_devices[i];
+            for (unsigned i = 0; i < n_candidate_devices; i++) {
+                struct cl_device * d = &candidate_devices[i];
                 if (strstr(d->name, user_device_string) != NULL) {
                     user_device_number = d->number;
                     break;
@@ -1269,71 +1394,145 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
         }
     }
     if (user_device_number != -1) {
-        selected_devices = &devices[user_device_number];
-        n_selected_devices = 1;
-        default_device = &selected_devices[0];
+        candidate_devices = &devices[user_device_number];
+        n_candidate_devices = 1;
+        default_device = &candidate_devices[0];
     }
 
-    GGML_ASSERT(n_selected_devices > 0);
+    GGML_ASSERT(n_candidate_devices > 0);
 
     if (default_device == NULL) {
-        default_device = &selected_devices[0];
+        default_device = &candidate_devices[0];
+        }
+    }
+
+    GGML_ASSERT(n_candidate_devices != 0 && candidate_devices);
+
+    // Put the default device in front.
+    for (unsigned i = 1; i < n_candidate_devices; i++) {
+        if (&candidate_devices[i] == default_device) {
+            std::swap(candidate_devices[0], candidate_devices[i]);
+            default_device = &candidate_devices[0];
+            break;
+        }
+    }
+
+    GGML_LOG_INFO("ggml_opencl: selected platform: '%s'\n", default_device->platform->name);
+
+    std::vector<cl_device_id> device_ids;
+    for (auto dev = candidate_devices, dev_end = candidate_devices + n_candidate_devices; dev != dev_end; dev++) {
+        device_ids.push_back(dev->id);
+    }
+
+    cl_int err;
+    cl_context shared_context;
+    cl_context_properties properties[] = { (intptr_t) CL_CONTEXT_PLATFORM, (intptr_t) default_device->platform->id, 0 };
+
+    CL_CHECK(
+        (shared_context = clCreateContext(properties, device_ids.size(), device_ids.data(), NULL, NULL, &err), err));
+
+    for (auto dev = candidate_devices, dev_end = candidate_devices + n_candidate_devices; dev != dev_end; dev++) {
+        GGML_LOG_INFO("\nggml_opencl: device: '%s (%s)'\n", dev->name, dev->version);
+
+        auto dev_ctx = std::unique_ptr<ggml_backend_opencl_device_context>(new ggml_backend_opencl_device_context{
+            /*.platform       =*/ dev->platform->id,
+            /*.platform_nane  =*/ dev->platform->name,
+            /*.device         =*/ dev->id,
+            /*.device_name    =*/ dev->name,
+            /*.device_type    =*/ dev->type,
+            /*.device_version =*/ dev->version,
+            /*.backend_ctx    =*/ nullptr,
+            /*.buffer_type    =*/ {},
+            /*.context        =*/ shared_context,
+        });
+
+        found_devices.push_back(ggml_backend_device{
+            /* .iface   = */ ggml_backend_opencl_device_i,
+            /* .reg     = */ reg,
+            /* .context = */ dev_ctx.get(),
+        });
+
+        if (!ggml_cl2_init(&found_devices.back())) {
+            found_devices.pop_back();
+            GGML_LOG_INFO("ggml_opencl: drop unsupported device.\n");
+            continue;
+        }
+
+        dev_ctx.release();
+    }
+
+    if (found_devices.size()) {
+        auto * dev_ctx = static_cast<ggml_backend_opencl_device_context *>(found_devices.front().context);
+        GGML_LOG_INFO("ggml_opencl: default device: '%s (%s)'\n", dev_ctx->device_name.c_str(),
+                      dev_ctx->device_version.c_str());
+
+        if (dev_ctx->device_type != CL_DEVICE_TYPE_GPU) {
+            GGML_LOG_WARN("ggml_opencl: warning, the default device is not a GPU: '%s'.\n",
+                          dev_ctx->device_name.c_str());
         }
     }
 
-
-
-
-
+    return found_devices;
+}
+
+// Initialize device if it is supported (returns nullptr if it is not).
+static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
+    GGML_ASSERT(dev);
+    GGML_ASSERT(dev->context);
+
+    ggml_backend_opencl_device_context * dev_ctx = (ggml_backend_opencl_device_context *) dev->context;
+    GGML_ASSERT(dev_ctx->platform);
+    GGML_ASSERT(dev_ctx->device);
+
+    if (dev_ctx->backend_ctx) {
+        return dev_ctx->backend_ctx;
     }
 
-    initialized = true;
-    backend_ctx = new ggml_backend_opencl_context();
-    backend_ctx->gpu_family = GPU_FAMILY::UNKNOWN;
+    auto backend_ctx = std::make_unique<ggml_backend_opencl_context>();
+    backend_ctx->device = dev_ctx->device;
+    backend_ctx->gpu_family = GPU_FAMILY::UNKNOWN;
 
-    if (strstr(default_device->name, "Adreno") ||
-        strstr(default_device->name, "Qualcomm") ||
-        strstr(default_device->version, "Adreno")) {
+    if (strstr(dev_ctx->device_name.c_str(), "Adreno") ||
+        strstr(dev_ctx->device_name.c_str(), "Qualcomm") ||
+        strstr(dev_ctx->device_version.c_str(), "Adreno")) {
         backend_ctx->gpu_family = GPU_FAMILY::ADRENO;
         // Usually device version contains the detailed device name
-        backend_ctx->adreno_gen = get_adreno_gpu_gen(default_device->version);
+        backend_ctx->adreno_gen = get_adreno_gpu_gen(dev_ctx->device_version.c_str());
         if (backend_ctx->adreno_gen == ADRENO_GPU_GEN::ADRENO_UNKNOWN) {
-            backend_ctx->adreno_gen = get_adreno_gpu_gen(default_device->name);
+            backend_ctx->adreno_gen = get_adreno_gpu_gen(dev_ctx->device_name.c_str());
         }
 
         // Use wave size of 64 for all Adreno GPUs.
         backend_ctx->adreno_wave_size = 64;
-    } else if (strstr(default_device->name, "Intel")) {
+    } else if (strstr(dev_ctx->device_name.c_str(), "Intel")) {
         backend_ctx->gpu_family = GPU_FAMILY::INTEL;
     } else {
-        GGML_LOG_ERROR("Unsupported GPU: %s\n", default_device->name);
+        GGML_LOG_ERROR("Unsupported GPU: %s\n", dev_ctx->device_name.c_str());
         backend_ctx->gpu_family = GPU_FAMILY::UNKNOWN;
-        return backend_ctx;
+        return nullptr;
     }
 
 #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
     if (backend_ctx->gpu_family != GPU_FAMILY::ADRENO) {
         GGML_LOG_ERROR("ggml_opencl: Adreno-specific kernels should not be enabled for non-Adreno GPUs; "
             "run on an Adreno GPU or recompile with CMake option `-DGGML_OPENCL_USE_ADRENO_KERNELS=OFF`\n");
-        return backend_ctx;
+        return nullptr;
     }
 #endif
 
     // Populate backend device name
-
-    dev_ctx->device_name = default_device->name;
-    backend_ctx->device_name = default_device->name;
+    backend_ctx->device_name = dev_ctx->device_name;
 
     // A local ref of cl_device_id for convenience
     cl_device_id device = backend_ctx->device;
 
-    ggml_cl_version platform_version = get_opencl_platform_version(default_device->platform->id);
+    ggml_cl_version platform_version = get_opencl_platform_version(dev_ctx->platform);
 
     // Check device OpenCL version, OpenCL 2.0 or above is required
     ggml_cl_version opencl_c_version = get_opencl_c_version(platform_version, device);
     if (opencl_c_version.major < 2) {
         GGML_LOG_ERROR("ggml_opencl: OpenCL 2.0 or above is required\n");
-        return backend_ctx;
+        return nullptr;
     }
 
     // Check driver version
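The restructured initialization above splits device discovery (`ggml_opencl_probe_devices`) from per-device setup (`ggml_cl2_init`), and gives every device of the selected platform one shared `cl_context`, which `supports_buft` later compares. A sketch of just the shared-context step, assuming `platform` and `ids` came from the usual `clGetPlatformIDs`/`clGetDeviceIDs` discovery:

```cpp
#include <vector>
#include <CL/cl.h>

// One cl_context spanning all candidate devices of a platform, so buffers
// created on it can be used by the queues of any of those devices.
cl_context create_shared_context(cl_platform_id platform,
                                 const std::vector<cl_device_id> & ids) {
    cl_context_properties props[] = {
        CL_CONTEXT_PLATFORM, (cl_context_properties) platform, 0
    };
    cl_int err = CL_SUCCESS;
    cl_context ctx = clCreateContext(props, (cl_uint) ids.size(), ids.data(),
                                     nullptr, nullptr, &err);
    return err == CL_SUCCESS ? ctx : nullptr;
}
```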
@@ -1364,7 +1563,7 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
     // fp16 is required
     if (!backend_ctx->fp16_support) {
         GGML_LOG_ERROR("ggml_opencl: device does not support FP16\n");
-        return backend_ctx;
+        return nullptr;
     }
 
     // If OpenCL 3.0 is supported, then check for cl_khr_subgroups, which becomes
@@ -1373,7 +1572,7 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
                strstr(ext_buffer, "cl_intel_subgroups") == NULL) {
         GGML_LOG_ERROR("ggml_opencl: device does not support subgroups (cl_khr_subgroups or cl_intel_subgroups) "
             "(note that subgroups is an optional feature in OpenCL 3.0)\n");
-        return backend_ctx;
+        return nullptr;
     }
 
     cl_uint base_align_in_bits;
@@ -1397,6 +1596,15 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
     GGML_LOG_INFO("ggml_opencl: SVM atomics support: %s\n",
         svm_caps & CL_DEVICE_SVM_ATOMICS ? "true" : "false");
 
+    if (opencl_c_version.major >= 3) {
+        CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_NON_UNIFORM_WORK_GROUP_SUPPORT, sizeof(cl_bool),
+                                 &backend_ctx->non_uniform_workgroups, 0));
+    } else {
+        GGML_ASSERT(opencl_c_version.major == 2);
+        // Non-uniform workgroup sizes is mandatory feature in v2.x.
+        backend_ctx->non_uniform_workgroups = true;
+    }
+
     // Print out configurations
 #ifdef GGML_OPENCL_SOA_Q
     GGML_LOG_INFO("ggml_opencl: flattening quantized weights representation as struct of arrays (GGML_OPENCL_SOA_Q)\n");
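The capability probe above exists because non-uniform work-groups are mandatory in OpenCL 2.x but became optional again in 3.0, where they must be queried via `CL_DEVICE_NON_UNIFORM_WORK_GROUP_SUPPORT`. The same check in isolation (a sketch; the version-major parameter mirrors `opencl_c_version.major` in the diff):

```cpp
#define CL_TARGET_OPENCL_VERSION 300
#include <CL/cl.h>

// True if the device accepts a global size that is not a multiple of the
// local size. Guaranteed in OpenCL 2.x, optional and queryable in 3.0.
bool non_uniform_workgroups_supported(cl_device_id dev, int opencl_c_major) {
    if (opencl_c_major < 3) {
        return true;
    }
    cl_bool supported = CL_FALSE;
    clGetDeviceInfo(dev, CL_DEVICE_NON_UNIFORM_WORK_GROUP_SUPPORT,
                    sizeof(supported), &supported, nullptr);
    return supported == CL_TRUE;
}
```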
@@ -1406,14 +1614,10 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
     GGML_LOG_INFO("ggml_opencl: using kernels optimized for Adreno (GGML_OPENCL_USE_ADRENO_KERNELS)\n");
 #endif // GGML_OPENCL_USE_ADRENO_KERNELS
 
-    cl_context_properties properties[] = {
-        (intptr_t)CL_CONTEXT_PLATFORM, (intptr_t)dev_ctx->platform, 0
-    };
-
-    CL_CHECK((backend_ctx->context = clCreateContext(properties, 1, &device, NULL, NULL, &err), err));
+    cl_int err;
 
     // A local ref of cl_context for convenience
-    cl_context context = backend_ctx->context;
+    cl_context context = backend_ctx->context = dev_ctx->context;
 
     //CL_CHECK((queue = clCreateCommandQueue(context, device, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &err),
     //    (err != CL_INVALID_QUEUE_PROPERTIES && err != CL_INVALID_VALUE ? err :
@@ -1426,7 +1630,7 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
     CL_CHECK((backend_ctx->queue = clCreateCommandQueue(context, device, command_queue_props, &err), err));
 
     // Load kernels
-    load_cl_kernels(backend_ctx, opencl_c_version);
+    load_cl_kernels(backend_ctx.get(), opencl_c_version);
 
 #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
     // Allocate intermediate buffers and images
@@ -1456,10 +1660,8 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
     CL_CHECK((backend_ctx->B_d_max = clCreateBuffer(context, 0, max_B_d_bytes, NULL, &err), err));
 #endif // GGML_OPENCL_USE_ADRENO_KERNELS
 
-
-
-
-    return backend_ctx;
+    dev_ctx->backend_ctx = backend_ctx.release();
+    return dev_ctx->backend_ctx;
 }
 
 static void ggml_cl2_free(void) {
@@ -1664,10 +1866,46 @@ static void ggml_backend_opencl_synchronize(ggml_backend_t backend) {
     GGML_UNUSED(backend);
 }
 
+// Syncronizes the 'backend_ctx's device with others so that commands
+// enqueued to it won't start until commands in the other devices have
+// completed.
+static void sync_with_other_backends(ggml_backend_opencl_context * backend_ctx) {
+    if (g_ggml_backend_opencl_devices.size() < 2)
+        return; // No other devices to synchronize with.
+
+    std::vector<cl_event> events;
+    events.reserve(g_ggml_backend_opencl_devices.size());
+
+    for (ggml_backend_device & backend_dev : g_ggml_backend_opencl_devices) {
+        auto * other_backend_ctx = ggml_cl2_init(&backend_dev);
+        if (backend_ctx != other_backend_ctx) {
+            cl_event ev;
+            CL_CHECK(clEnqueueMarkerWithWaitList(other_backend_ctx->queue, 0, nullptr, &ev));
+            CL_CHECK(clFlush(other_backend_ctx->queue));
+            events.push_back(ev);
+        }
+    }
+
+    CL_CHECK(clEnqueueBarrierWithWaitList(backend_ctx->queue, events.size(), events.data(), nullptr));
+    for (auto ev : events) {
+        CL_CHECK(clReleaseEvent(ev));
+    }
+}
+
+static void sync_with_other_backends(ggml_backend_t backend) {
+    auto * backend_ctx = static_cast<ggml_backend_opencl_context *>(backend->context);
+    sync_with_other_backends(backend_ctx);
+}
+
 static ggml_status ggml_backend_opencl_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
     for (int i = 0; i < cgraph->n_nodes; i++) {
         ggml_tensor * node = cgraph->nodes[i];
 
+        // NOTE: this may oversynchronize by synchronizing with
+        // backends/devices which don't compute 'cgraph's
+        // dependencies.
+        sync_with_other_backends(backend);
+
         if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
             continue;
         }
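`sync_with_other_backends` above is the standard OpenCL cross-queue ordering idiom: a marker event snapshots each foreign queue, the queues are flushed so the markers are actually submitted, and a barrier on the local queue waits on all of them. Stripped down to two queues, the pattern is roughly:

```cpp
#include <CL/cl.h>

// Commands enqueued to 'local' after this call will not start until
// everything already enqueued to 'other' has completed.
void wait_for_other_queue(cl_command_queue local, cl_command_queue other) {
    cl_event ev = nullptr;
    clEnqueueMarkerWithWaitList(other, 0, nullptr, &ev); // snapshot 'other'
    clFlush(other);                                      // ensure submission
    clEnqueueBarrierWithWaitList(local, 1, &ev, nullptr);
    clReleaseEvent(ev);
}
```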
@@ -1729,6 +1967,8 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
         case GGML_OP_ADD:
         case GGML_OP_SCALE:
         case GGML_OP_MUL:
+        case GGML_OP_DIV:
+        case GGML_OP_SUB:
             return op->src[0]->type == GGML_TYPE_F32;
         case GGML_OP_UNARY:
             switch (ggml_get_unary_op(op)) {
@@ -1736,7 +1976,9 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
                 case GGML_UNARY_OP_SILU:
                 case GGML_UNARY_OP_RELU:
                 case GGML_UNARY_OP_GELU_QUICK:
-                    return ggml_is_contiguous(op->src[0]);
+                    return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
+                case GGML_UNARY_OP_SIGMOID:
+                    return ggml_is_contiguous(op->src[0]);
                 default:
                     return false;
             }
@@ -1746,11 +1988,13 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
         case GGML_OP_NORM:
         case GGML_OP_RMS_NORM:
             return true;
+        case GGML_OP_GROUP_NORM:
+            return ggml_is_contiguous(op->src[0]);
         case GGML_OP_MUL_MAT:
             if (op->src[0]->type == GGML_TYPE_F16) {
                 return true;
             } else if (op->src[0]->type == GGML_TYPE_F32) {
-                return op->src[1]->type == GGML_TYPE_F32 && ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]);
+                return op->src[1]->type == GGML_TYPE_F32;
             } else if (op->src[0]->type == GGML_TYPE_Q4_0 ||
                        op->src[0]->type == GGML_TYPE_Q6_K) {
                 return op->src[1]->type == GGML_TYPE_F32 && ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]);
@@ -1785,6 +2029,10 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
         }
         case GGML_OP_IM2COL:
             return true;
+        case GGML_OP_ARGSORT:
+            return op->src[0]->type == GGML_TYPE_F32;
+        case GGML_OP_SUM_ROWS:
+            return op->src[0]->type == GGML_TYPE_F32 && ggml_is_contiguous(op->src[0]);
         default:
             return false;
     }
@@ -2058,15 +2306,16 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
         // The original tensor memory is divided into scales and quants, i.e.,
         // we first store scales, then quants.
         // Create subbuffer for scales.
-        region.origin = extra_orig->offset + tensor->view_offs + offset;
+        region.origin = align_to(extra_orig->offset + tensor->view_offs + offset, backend_ctx->alignment);
         region.size = size_d;
         extra->d = clCreateSubBuffer(
             extra_orig->data_device, CL_MEM_READ_WRITE,
             CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
         CL_CHECK(err);
+        auto previous_origin = region.origin;
 
         // Create subbuffer for quants.
-        region.origin = extra_orig->offset + tensor->view_offs + offset + size_d;
+        region.origin = align_to(previous_origin + size_d, backend_ctx->alignment);
         region.size = size_q;
         extra->q = clCreateSubBuffer(
             extra_orig->data_device, CL_MEM_READ_WRITE,
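The two `align_to` calls above matter because `clCreateSubBuffer` rejects origins that are not multiples of the device's base address alignment (`CL_MISALIGNED_SUB_BUFFER_OFFSET`). A sketch of how such an alignment value is typically obtained — presumably what `backend_ctx->alignment` holds, since the device reports `CL_DEVICE_MEM_BASE_ADDR_ALIGN` in bits:

```cpp
#include <CL/cl.h>

// Minimum byte alignment required for cl_buffer_region::origin.
size_t sub_buffer_alignment(cl_device_id dev) {
    cl_uint align_bits = 0;
    clGetDeviceInfo(dev, CL_DEVICE_MEM_BASE_ADDR_ALIGN,
                    sizeof(align_bits), &align_bits, nullptr);
    return align_bits / 8; // reported in bits, used in bytes
}
```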
@@ -2271,8 +2520,8 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer,
     cl_context context = backend_ctx->context;
     cl_command_queue queue = backend_ctx->queue;
 
-    // Make sure all previously submitted commands are finished.
-    CL_CHECK(clFinish(queue));
+    // Make sure all previously submitted commands in other devices are finished.
+    sync_with_other_backends(backend_ctx);
 
 #ifdef GGML_OPENCL_SOA_Q
     // In end-to-end runs, get_tensor is usually used to get back the logits,
@@ -2376,13 +2625,8 @@ static ggml_backend_buffer_t ggml_backend_opencl_buffer_type_alloc_buffer(ggml_b
 }
 
 static size_t ggml_backend_opencl_buffer_type_get_alignment(ggml_backend_buffer_type_t buffer_type) {
-
-    static cl_uint alignment = -1;
-    if (alignment == (cl_uint)-1) {
-        ggml_backend_opencl_context * backend_ctx = ggml_cl2_init(buffer_type->device);
-        alignment = backend_ctx->alignment;
-    }
-    return alignment;
+    ggml_backend_opencl_context * backend_ctx = ggml_cl2_init(buffer_type->device);
+    return backend_ctx->alignment;
 }
 
 static size_t ggml_backend_opencl_buffer_type_get_max_size(ggml_backend_buffer_type_t buffer_type) {
@@ -2409,16 +2653,6 @@ static ggml_backend_buffer_type_i ggml_backend_opencl_buffer_type_interface = {
     /* .is_host = */ NULL,
 };
 
-ggml_backend_buffer_type_t ggml_backend_opencl_buffer_type() {
-    static ggml_backend_buffer_type buffer_type = {
-        /* .iface = */ ggml_backend_opencl_buffer_type_interface,
-        /* .device = */ &g_ggml_backend_opencl_device,
-        /* .context = */ nullptr,
-    };
-
-    return &buffer_type;
-}
-
 //
 // backend device
 //
@@ -2476,9 +2710,15 @@ static ggml_backend_t ggml_backend_opencl_device_init(ggml_backend_dev_t dev, co
 }
 
 static ggml_backend_buffer_type_t ggml_backend_opencl_device_get_buffer_type(ggml_backend_dev_t dev) {
-    return ggml_backend_opencl_buffer_type();
+    auto * dev_ctx = static_cast<ggml_backend_opencl_device_context *>(dev->context);
 
-    GGML_UNUSED(dev);
+    dev_ctx->buffer_type = ggml_backend_buffer_type{
+        /* .iface = */ ggml_backend_opencl_buffer_type_interface,
+        /* .device = */ dev,
+        /* .context = */ nullptr,
+    };
+
+    return &dev_ctx->buffer_type;
 }
 
 static ggml_backend_buffer_t ggml_backend_opencl_device_buffer_from_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
@@ -2494,12 +2734,21 @@ static bool ggml_backend_opencl_device_supports_op(ggml_backend_dev_t dev, const
 }
 
 static bool ggml_backend_opencl_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
-    return buft->iface.get_name == ggml_backend_opencl_buffer_type_get_name;
+    // Check 'dev' and 'buffer_type' are not objects belonging to this backend.
+    if (dev->iface.get_name != ggml_backend_opencl_device_get_name ||
+        buft->iface.get_name != ggml_backend_opencl_buffer_type_get_name) {
+        return false;
+    }
 
-    GGML_UNUSED(dev);
+    // Check cl_context is the same. clEnqueue* commands may not use
+    // buffers from another cl_context.
+    ggml_backend_opencl_context * backend_ctx0 = ggml_cl2_init(dev);
+    ggml_backend_opencl_context * backend_ctx1 = ggml_cl2_init(buft->device);
+    return backend_ctx0->context == backend_ctx1->context;
 }
 
-static struct ggml_backend_device_i ggml_backend_opencl_device_i = {
+namespace /* anonymous */ {
+struct ggml_backend_device_i ggml_backend_opencl_device_i = {
     /* .get_name = */ ggml_backend_opencl_device_get_name,
     /* .get_description = */ ggml_backend_opencl_device_get_description,
     /* .get_memory = */ ggml_backend_opencl_device_get_memory,
@@ -2516,6 +2765,7 @@ static struct ggml_backend_device_i ggml_backend_opencl_device_i = {
     /* .event_free = */ NULL,
     /* .event_synchronize = */ NULL,
 };
+}
 
 // Backend registry
 
@@ -2526,15 +2776,15 @@ static const char * ggml_backend_opencl_reg_get_name(ggml_backend_reg_t reg) {
 }
 
 static size_t ggml_backend_opencl_reg_device_count(ggml_backend_reg_t reg) {
-    return ggml_backend_opencl_n_devices;
+    return g_ggml_backend_opencl_devices.size();
 
     GGML_UNUSED(reg);
 }
 
 static ggml_backend_dev_t ggml_backend_opencl_reg_device_get(ggml_backend_reg_t reg, size_t index) {
-    GGML_ASSERT(index == 0);
+    GGML_ASSERT(index < ggml_backend_opencl_reg_device_count(reg));
 
-    return &g_ggml_backend_opencl_device;
+    return &g_ggml_backend_opencl_devices[index];
 
     GGML_UNUSED(reg);
     GGML_UNUSED(index);
@@ -2548,27 +2798,23 @@ static struct ggml_backend_reg_i ggml_backend_opencl_reg_i = {
 };
 
 ggml_backend_reg_t ggml_backend_opencl_reg(void) {
-
+    static std::mutex mutex;
     static ggml_backend_reg reg;
     static bool initialized = false;
+    std::lock_guard<std::mutex> lock(mutex);
 
-    if (
-        reg
-
-
-            /* .context = */ NULL,
-        };
-
-        g_ggml_backend_opencl_device = ggml_backend_device {
-            /* .iface = */ ggml_backend_opencl_device_i,
-            /* .reg = */ &reg,
-            /* .context = */ &g_ggml_ctx_dev_main,
-        };
+    if (initialized) {
+        return &reg;
+    }
+    initialized = true;
 
-
+    g_ggml_backend_opencl_devices = ggml_opencl_probe_devices(&reg);
 
-
-
+    reg = ggml_backend_reg{
+        /* .api_version = */ GGML_BACKEND_API_VERSION,
+        /* .iface = */ ggml_backend_opencl_reg_i,
+        /* .context = */ NULL,
+    };
 
     return &reg;
 }
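Registration is now guarded by a function-local mutex plus an `initialized` flag, so two threads racing into `ggml_backend_opencl_reg` cannot both run device probing. A reduced sketch of the same idiom, with `probe_devices` as a placeholder for the real discovery step; `std::call_once` or a C++11 magic static would work equally well, the explicit flag simply mirrors the diff:

```cpp
#include <mutex>
#include <vector>

struct registry { std::vector<int> devices; };

static std::vector<int> probe_devices() { return {0, 1}; } // placeholder

registry * get_registry() {
    static std::mutex mutex;
    static registry reg;
    static bool initialized = false;

    // All callers serialize here; only the first one does the work.
    std::lock_guard<std::mutex> lock(mutex);
    if (initialized) {
        return &reg;
    }
    initialized = true;

    reg.devices = probe_devices(); // runs exactly once, under the lock
    return &reg;
}
```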
@@ -2942,14 +3188,19 @@ static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const
         size_t global_work_size[] = {(size_t)n, 1, 1};
         size_t local_work_size[] = {64, 1, 1};
 
+        size_t * local_work_size_ptr = local_work_size;
+        if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
+            local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
+        }
+
 #ifdef GGML_OPENCL_PROFILING
         cl_event evt;
-        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size,
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
 
         g_profiling_info.emplace_back();
-        populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size,
+        populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
 #else
-        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size,
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
 #endif
     } else {
         unsigned int nth = MIN(64, ne0);
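This pattern recurs in every elementwise op below: OpenCL 1.x requires the global work size to be a multiple of the local work size, so when `n` is not a multiple of 64 and the device cannot run non-uniform work-groups, the code passes `NULL` for the local size and lets the driver pick one. A standalone sketch of that decision under those assumptions (the `device_caps` struct and `pick_local_size` helper are illustrative, not the backend's real API):

```cpp
#include <cstddef>
#include <cstdio>

// Hypothetical stand-in for the capability the backend probes at init time
// (stored as backend_ctx->non_uniform_workgroups in the diff).
struct device_caps {
    bool non_uniform_workgroups; // true on devices that report the OpenCL 2.x feature
};

// Decide what to pass as the local work size for a 1-D elementwise kernel.
// Returns the preferred local size, or nullptr to let the driver choose.
const size_t * pick_local_size(const device_caps & caps, size_t n,
                               const size_t preferred[3]) {
    // OpenCL 1.x requires global % local == 0; if the tail does not divide
    // evenly and non-uniform work-groups are unavailable, defer to the driver.
    if (n % preferred[0] != 0 && !caps.non_uniform_workgroups) {
        return nullptr;
    }
    return preferred;
}

int main() {
    const size_t preferred[3] = {64, 1, 1};
    device_caps old_dev = {false};
    device_caps new_dev = {true};

    // n = 100 is not a multiple of 64: the OpenCL 1.x device falls back to NULL.
    printf("old device: %s\n", pick_local_size(old_dev, 100, preferred) ? "64" : "driver-chosen");
    // A device with non-uniform work-group support keeps the explicit size.
    printf("new device: %s\n", pick_local_size(new_dev, 100, preferred) ? "64" : "driver-chosen");
    return 0;
}
```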
@@ -3077,14 +3328,19 @@ static void ggml_cl_mul(ggml_backend_t backend, const ggml_tensor * src0, const
         size_t global_work_size[] = {(size_t)n, 1, 1};
         size_t local_work_size[] = {64, 1, 1};
 
+        size_t * local_work_size_ptr = local_work_size;
+        if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
+            local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
+        }
+
 #ifdef GGML_OPENCL_PROFILING
         cl_event evt;
-        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size,
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
 
         g_profiling_info.emplace_back();
-        populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size,
+        populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
 #else
-        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size,
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
 #endif
     } else {
         unsigned int nth = MIN(64, ne0);
@@ -3103,54 +3359,304 @@ static void ggml_cl_mul(ggml_backend_t backend, const ggml_tensor * src0, const
     }
 }
 
-static void
+static void ggml_cl_div(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_ASSERT(src0);
     GGML_ASSERT(src0->extra);
+    GGML_ASSERT(src1);
+    GGML_ASSERT(src1->extra);
     GGML_ASSERT(dst);
     GGML_ASSERT(dst->extra);
 
-
+    const int ne00 = src0->ne[0];
+    const int ne01 = src0->ne[1];
+    const int ne02 = src0->ne[2];
+    const int ne03 = src0->ne[3];
+
+    const cl_ulong nb00 = src0->nb[0];
+    const cl_ulong nb01 = src0->nb[1];
+    const cl_ulong nb02 = src0->nb[2];
+    const cl_ulong nb03 = src0->nb[3];
+
+    const int ne10 = src1->ne[0];
+    const int ne11 = src1->ne[1];
+    const int ne12 = src1->ne[2];
+    const int ne13 = src1->ne[3];
+
+    const cl_ulong nb10 = src1->nb[0];
+    const cl_ulong nb11 = src1->nb[1];
+    const cl_ulong nb12 = src1->nb[2];
+    const cl_ulong nb13 = src1->nb[3];
+
+    const int ne0 = dst->ne[0];
+
+    const cl_ulong nb0 = dst->nb[0];
+    const cl_ulong nb1 = dst->nb[1];
+    const cl_ulong nb2 = dst->nb[2];
+    const cl_ulong nb3 = dst->nb[3];
 
     ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
     cl_command_queue queue = backend_ctx->queue;
 
     ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
+    ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
     ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
 
     cl_ulong offset0 = extra0->offset + src0->view_offs;
+    cl_ulong offset1 = extra1->offset + src1->view_offs;
     cl_ulong offsetd = extrad->offset + dst->view_offs;
 
+    bool bcast_row = false;
     cl_kernel kernel;
 
-
+    if (ggml_nelements(src1) == ne10 && ggml_is_contiguous(src1) && ne00 % 4 == 0 && ne10 % 4 == 0) {
+        GGML_ASSERT(ggml_is_contiguous(src0));
 
-
-
-
+        // src1 is a row
+        GGML_ASSERT(ne11 == 1);
+
+        bcast_row = true;
+        int ne = ne00 / 4;
+        kernel = backend_ctx->kernel_div_row;
+
+        CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
+        CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+        CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
+        CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
+        CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
+        CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
+        CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne));
     } else {
-        kernel = backend_ctx->
+        kernel = backend_ctx->kernel_div;
+
+        CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
+        CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+        CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
+        CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
+        CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
+        CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
+        CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_ulong), &nb00));
+        CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &nb01));
+        CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb02));
+        CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb03));
+        CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne10));
+        CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne11));
+        CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne12));
+        CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne13));
+        CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb10));
+        CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &nb11));
+        CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb12));
+        CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb13));
+        CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &ne0));
+        CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb0));
+        CL_CHECK(clSetKernelArg(kernel, 20, sizeof(cl_ulong), &nb1));
+        CL_CHECK(clSetKernelArg(kernel, 21, sizeof(cl_ulong), &nb2));
+        CL_CHECK(clSetKernelArg(kernel, 22, sizeof(cl_ulong), &nb3));
     }
 
-
-
-
-
+    if (bcast_row) {
+        int n = ggml_nelements(dst)/4;
+        size_t global_work_size[] = {(size_t)n, 1, 1};
+        size_t local_work_size[] = {64, 1, 1};
 
-
-
+#ifdef GGML_OPENCL_PROFILING
+        cl_event evt;
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+
+        g_profiling_info.emplace_back();
+        populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+#else
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+#endif
+    } else {
+        unsigned int nth = MIN(64, ne0);
+        size_t global_work_size[] = {ne01*nth, (size_t)ne02, (size_t)ne03};
+        size_t local_work_size[] = {nth, 1, 1};
 
 #ifdef GGML_OPENCL_PROFILING
-
-
+        cl_event evt;
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
 
-
-
+        g_profiling_info.emplace_back();
+        populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
 #else
-
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
 #endif
+    }
 }
 
-static void
+static void ggml_cl_sub(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(src0);
+    GGML_ASSERT(src0->extra);
+    GGML_ASSERT(src1);
+    GGML_ASSERT(src1->extra);
+    GGML_ASSERT(dst);
+    GGML_ASSERT(dst->extra);
+
+    const int ne00 = src0->ne[0];
+    const int ne01 = src0->ne[1];
+    const int ne02 = src0->ne[2];
+    const int ne03 = src0->ne[3];
+
+    const cl_ulong nb00 = src0->nb[0];
+    const cl_ulong nb01 = src0->nb[1];
+    const cl_ulong nb02 = src0->nb[2];
+    const cl_ulong nb03 = src0->nb[3];
+
+    const int ne10 = src1->ne[0];
+    const int ne11 = src1->ne[1];
+    const int ne12 = src1->ne[2];
+    const int ne13 = src1->ne[3];
+
+    const cl_ulong nb10 = src1->nb[0];
+    const cl_ulong nb11 = src1->nb[1];
+    const cl_ulong nb12 = src1->nb[2];
+    const cl_ulong nb13 = src1->nb[3];
+
+    const int ne0 = dst->ne[0];
+
+    const cl_ulong nb0 = dst->nb[0];
+    const cl_ulong nb1 = dst->nb[1];
+    const cl_ulong nb2 = dst->nb[2];
+    const cl_ulong nb3 = dst->nb[3];
+
+    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+    cl_command_queue queue = backend_ctx->queue;
+
+    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
+    ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
+    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
+
+    cl_ulong offset0 = extra0->offset + src0->view_offs;
+    cl_ulong offset1 = extra1->offset + src1->view_offs;
+    cl_ulong offsetd = extrad->offset + dst->view_offs;
+
+    bool bcast_row = false;
+    cl_kernel kernel;
+
+    if (ggml_nelements(src1) == ne10 && ggml_is_contiguous(src1) && ne00 % 4 == 0 && ne10 % 4 == 0) {
+        GGML_ASSERT(ggml_is_contiguous(src0));
+
+        // src1 is a row
+        GGML_ASSERT(ne11 == 1);
+
+        bcast_row = true;
+        int ne = ne00 / 4;
+        kernel = backend_ctx->kernel_sub_row;
+
+        CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
+        CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+        CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
+        CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
+        CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
+        CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
+        CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne));
+    } else {
+        kernel = backend_ctx->kernel_sub;
+
+        CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
+        CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+        CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
+        CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
+        CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
+        CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
+        CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_ulong), &nb00));
+        CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &nb01));
+        CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb02));
+        CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb03));
+        CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne10));
+        CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne11));
+        CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne12));
+        CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne13));
+        CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb10));
+        CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &nb11));
+        CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb12));
+        CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb13));
+        CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &ne0));
+        CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb0));
+        CL_CHECK(clSetKernelArg(kernel, 20, sizeof(cl_ulong), &nb1));
+        CL_CHECK(clSetKernelArg(kernel, 21, sizeof(cl_ulong), &nb2));
+        CL_CHECK(clSetKernelArg(kernel, 22, sizeof(cl_ulong), &nb3));
+    }
+
+    if (bcast_row) {
+        int n = ggml_nelements(dst)/4;
+        size_t global_work_size[] = {(size_t)n, 1, 1};
+        size_t local_work_size[] = {64, 1, 1};
+
+#ifdef GGML_OPENCL_PROFILING
+        cl_event evt;
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+
+        g_profiling_info.emplace_back();
+        populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+#else
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+#endif
+    } else {
+        unsigned int nth = MIN(64, ne0);
+        size_t global_work_size[] = {ne01*nth, (size_t)ne02, (size_t)ne03};
+        size_t local_work_size[] = {nth, 1, 1};
+
+#ifdef GGML_OPENCL_PROFILING
+        cl_event evt;
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+
+        g_profiling_info.emplace_back();
+        populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+#else
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+#endif
+    }
+}
+
+static void ggml_cl_gelu(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(src0);
+    GGML_ASSERT(src0->extra);
+    GGML_ASSERT(dst);
+    GGML_ASSERT(dst->extra);
+
+    UNUSED(src1);
+
+    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+    cl_command_queue queue = backend_ctx->queue;
+
+    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
+    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
+
+    cl_ulong offset0 = extra0->offset + src0->view_offs;
+    cl_ulong offsetd = extrad->offset + dst->view_offs;
+
+    cl_kernel kernel;
+
+    int n = ggml_nelements(dst);
+
+    if (n % 4 == 0) {
+        kernel = backend_ctx->kernel_gelu_4;
+        n /= 4;
+    } else {
+        kernel = backend_ctx->kernel_gelu;
+    }
+
+    CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+    CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
+
+    size_t global_work_size[] = {(size_t)n, 1, 1};
+    size_t local_work_size[] = {64, 1, 1};
+
+#ifdef GGML_OPENCL_PROFILING
+    cl_event evt;
+    clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt);
+
+    g_profiling_info.emplace_back();
+    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+#else
+    clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL);
+#endif
+}
+
+static void ggml_cl_gelu_quick(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_ASSERT(src0);
     GGML_ASSERT(src0->extra);
     GGML_ASSERT(dst);
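The new `ggml_cl_div` and `ggml_cl_sub` mirror `ggml_cl_add`: when `src1` is a single contiguous row and both row widths are multiples of 4, they dispatch to a `*_row` kernel that can process `float4` elements; otherwise they fall back to the fully strided kernel that takes all 23 shape/stride arguments. A sketch of the same dispatch test in plain C++, with illustrative names standing in for the ggml tensor API:

```cpp
#include <cstdint>

// Shape of one operand, mirroring ggml's ne[] extents.
struct tensor_shape {
    int64_t ne[4];
    bool    contiguous;
};

static int64_t nelements(const tensor_shape & t) {
    return t.ne[0] * t.ne[1] * t.ne[2] * t.ne[3];
}

// True when src1 can be broadcast as a row with the vectorized kernel:
// it must be exactly one contiguous row, and both row widths must be
// float4-friendly (divisible by 4).
bool use_row_kernel(const tensor_shape & src0, const tensor_shape & src1) {
    return nelements(src1) == src1.ne[0] &&
           src1.contiguous              &&
           src0.ne[0] % 4 == 0          &&
           src1.ne[0] % 4 == 0;
}
```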
@@ -3233,14 +3739,19 @@ static void ggml_cl_silu(ggml_backend_t backend, const ggml_tensor * src0, const
     size_t global_work_size[] = {(size_t)n, 1, 1};
     size_t local_work_size[] = {64, 1, 1};
 
+    size_t * local_work_size_ptr = local_work_size;
+    if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
+        local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
+    }
+
 #ifdef GGML_OPENCL_PROFILING
     cl_event evt;
-    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size,
+    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
 
     g_profiling_info.emplace_back();
-    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size,
+    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
 #else
-    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size,
+    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
 #endif
 }
 
@@ -3273,14 +3784,71 @@ static void ggml_cl_relu(ggml_backend_t backend, const ggml_tensor * src0, const
     size_t global_work_size[] = {(size_t)n, 1, 1};
     size_t local_work_size[] = {64, 1, 1};
 
+    size_t * local_work_size_ptr = local_work_size;
+    if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
+        local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
+    }
+
 #ifdef GGML_OPENCL_PROFILING
     cl_event evt;
-    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size,
+    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
 
     g_profiling_info.emplace_back();
-    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size,
+    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
 #else
-    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size,
+    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
+#endif
+}
+
+static void ggml_cl_sigmoid(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(src0);
+    GGML_ASSERT(src0->extra);
+    GGML_ASSERT(dst);
+    GGML_ASSERT(dst->extra);
+
+    UNUSED(src1);
+
+    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+    cl_command_queue queue = backend_ctx->queue;
+
+    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
+    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
+
+    cl_ulong offset0 = extra0->offset + src0->view_offs;
+    cl_ulong offsetd = extrad->offset + dst->view_offs;
+
+    cl_kernel kernel;
+    if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+        kernel = backend_ctx->kernel_sigmoid_f32;
+    } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
+        kernel = backend_ctx->kernel_sigmoid_f16;
+    } else {
+        GGML_ASSERT(false && "Unsupported data types for sigmoid (input and output must be both f32 or f16)");
+    }
+
+    CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+    CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
+
+    const int64_t n = ggml_nelements(dst);
+
+    size_t global_work_size[] = {(size_t)n, 1, 1};
+    size_t local_work_size[] = {64, 1, 1};
+
+    size_t * local_work_size_ptr = local_work_size;
+    if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
+        local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
+    }
+
+#ifdef GGML_OPENCL_PROFILING
+    cl_event evt;
+    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
+
+    g_profiling_info.emplace_back();
+    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
+#else
+    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
 #endif
 }
 
@@ -3320,14 +3888,19 @@ static void ggml_cl_clamp(ggml_backend_t backend, const ggml_tensor * src0, cons
     size_t global_work_size[] = {(size_t)n, 1, 1};
     size_t local_work_size[] = {64, 1, 1};
 
+    size_t * local_work_size_ptr = local_work_size;
+    if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
+        local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
+    }
+
 #ifdef GGML_OPENCL_PROFILING
     cl_event evt;
-    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size,
+    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
 
     g_profiling_info.emplace_back();
-    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size,
+    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
 #else
-    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size,
+    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
 #endif
 }
 
@@ -3476,6 +4049,65 @@ static void ggml_cl_rms_norm(ggml_backend_t backend, const ggml_tensor * src0, c
 #endif
 }
 
+static void ggml_cl_group_norm(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(src0);
+    GGML_ASSERT(src0->extra);
+    GGML_ASSERT(dst);
+    GGML_ASSERT(dst->extra);
+
+    UNUSED(src1);
+
+    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+    cl_command_queue queue = backend_ctx->queue;
+
+    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
+    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
+
+    cl_ulong offset0 = extra0->offset + src0->view_offs;
+    cl_ulong offsetd = extrad->offset + dst->view_offs;
+
+    int32_t n_groups = ((const int32_t *) dst->op_params)[0];
+    int32_t group_size = src0->ne[0] * src0->ne[1] * ((src0->ne[2] + n_groups - 1) / n_groups);
+    float eps = ((const float *) dst->op_params)[1];
+
+    const int ne00 = src0->ne[0];
+    const int ne01 = src0->ne[1];
+    const int ne02 = src0->ne[2];
+    const int ne = ne00*ne01*ne02;
+
+    cl_kernel kernel = backend_ctx->kernel_group_norm;
+
+    size_t sgs = 64;
+    if (backend_ctx->gpu_family == ADRENO) {
+        sgs = 64;
+    } else if (backend_ctx->gpu_family == INTEL) {
+        sgs = 32;
+    } else {
+        GGML_ASSERT(false && "Unsupported GPU");
+    }
+
+    CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+    CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
+    CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne));
+    CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &group_size));
+    CL_CHECK(clSetKernelArg(kernel, 6, sizeof(float), &eps));
+
+    size_t global_work_size[] = {(size_t)n_groups*sgs, 1, 1};
+    size_t local_work_size[] = {(size_t)sgs, 1, 1};
+
+#ifdef GGML_OPENCL_PROFILING
+    cl_event evt;
+    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+
+    g_profiling_info.emplace_back();
+    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+#else
+    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+#endif
+}
+
 static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_ASSERT(src0);
     GGML_ASSERT(src0->extra);
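`ggml_cl_group_norm` splits the channel dimension into `n_groups` slices and launches one work-group of `sgs` threads per group, where `sgs` matches the GPU's subgroup width (64 on Adreno, 32 on Intel). A worked example of the same group-size arithmetic, with the extents chosen purely for illustration:

```cpp
#include <cstdio>

int main() {
    // Illustrative extents: 8x8 spatial, 32 channels, 4 normalization groups.
    const int ne00 = 8, ne01 = 8, ne02 = 32;
    const int n_groups = 4;

    // Same arithmetic as the diff: channels are split into ceil(ne02/n_groups)
    // slices, and each group covers that many full ne00*ne01 planes.
    const int group_size = ne00 * ne01 * ((ne02 + n_groups - 1) / n_groups);
    printf("group_size = %d\n", group_size); // 8*8*8 = 512 elements per group

    // One work-group of sgs threads is launched per group.
    const int sgs = 64; // 64 on Adreno, 32 on Intel in the diff
    printf("global = %d, local = %d\n", n_groups * sgs, sgs);
    return 0;
}
```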
@@ -4230,14 +4862,19 @@ static void ggml_cl_scale(ggml_backend_t backend, const ggml_tensor * src0, cons
     size_t global_work_size[] = {(size_t)n, 1, 1};
     size_t local_work_size[] = {64, 1, 1};
 
+    size_t * local_work_size_ptr = local_work_size;
+    if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
+        local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
+    }
+
 #ifdef GGML_OPENCL_PROFILING
     cl_event evt;
-    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size,
+    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
 
     g_profiling_info.emplace_back();
-    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size,
+    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
 #else
-    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size,
+    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
 #endif
 }
 
@@ -4418,14 +5055,19 @@ static void ggml_cl_diag_mask_inf(ggml_backend_t backend, const ggml_tensor * sr
         size_t global_work_size[] = {(size_t)ne00, (size_t)ne01, (size_t)ne02};
         size_t local_work_size[] = {64, 1, 1};
 
+        size_t * local_work_size_ptr = local_work_size;
+        if (ne00 % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
+            local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
+        }
+
 #ifdef GGML_OPENCL_PROFILING
         cl_event evt;
-        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size,
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
 
         g_profiling_info.emplace_back();
-        populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size,
+        populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
 #else
-        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size,
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
 #endif
     }
 }
@@ -4815,6 +5457,124 @@ static void ggml_cl_im2col(ggml_backend_t backend, const ggml_tensor * src0, con
 #endif
 }
 
+static void ggml_cl_argsort(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(src0);
+    GGML_ASSERT(src0->extra);
+    GGML_ASSERT(dst);
+    GGML_ASSERT(dst->extra);
+    GGML_UNUSED(src1);
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_I32);
+    GGML_ASSERT(ggml_is_contiguous(src0));
+
+    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+    cl_command_queue queue = backend_ctx->queue;
+
+    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
+    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
+
+    cl_ulong offset0 = extra0->offset + src0->view_offs;
+    cl_ulong offsetd = extrad->offset + dst->view_offs;
+
+    const int ne00 = src0->ne[0];
+    const int nrows = ggml_nrows(src0);
+
+    int ne00_padded = 1;
+    while (ne00_padded < ne00) {
+        ne00_padded *= 2;
+    }
+
+    int order = (enum ggml_sort_order) dst->op_params[0];
+
+    cl_kernel kernel = backend_ctx->kernel_argsort_f32_i32;
+
+    CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+    CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
+    CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00));
+    CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne00_padded));
+    CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &order));
+    CL_CHECK(clSetKernelArg(kernel, 7, ne00_padded*sizeof(int), NULL));
+
+    size_t global_work_size[] = {(size_t)ne00_padded, (size_t)nrows, (size_t)1};
+    size_t local_work_size[] = {(size_t)ne00_padded, 1, 1};
+
+#ifdef GGML_OPENCL_PROFILING
+    cl_event evt;
+    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+
+    g_profiling_info.emplace_back();
+    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+#else
+    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+#endif
+}
+
+static void ggml_cl_sum_rows(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(src0);
+    GGML_ASSERT(src0->extra);
+    GGML_ASSERT(dst);
+    GGML_ASSERT(dst->extra);
+    GGML_UNUSED(src1);
+
+    GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type));
+    GGML_ASSERT(ggml_is_contiguous(src0));
+
+    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+    cl_command_queue queue = backend_ctx->queue;
+
+    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
+    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
+
+    cl_ulong offset0 = extra0->offset + src0->view_offs;
+    cl_ulong offsetd = extrad->offset + dst->view_offs;
+
+    const int ne00 = src0->ne[0];
+    const int ne01 = src0->ne[1];
+    const int ne02 = src0->ne[2];
+    const int ne03 = src0->ne[3];
+
+    const cl_ulong nb01 = src0->nb[1];
+    const cl_ulong nb02 = src0->nb[2];
+    const cl_ulong nb03 = src0->nb[3];
+
+    const cl_ulong nb1 = dst->nb[1];
+    const cl_ulong nb2 = dst->nb[2];
+    const cl_ulong nb3 = dst->nb[3];
+
+    cl_kernel kernel = backend_ctx->kernel_sum_rows_f32;
+
+    CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+    CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
+    CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00));
+    CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne01));
+    CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne02));
+    CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne03));
+    CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb01));
+    CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb02));
+    CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb03));
+    CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb1));
+    CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb2));
+    CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb3));
+
+    size_t global_work_size[] = {(size_t)ne01, (size_t)ne02, (size_t)ne03};
+    size_t local_work_size[] = {(size_t)64, 1, 1};
+
+#ifdef GGML_OPENCL_PROFILING
+    cl_event evt;
+    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+
+    g_profiling_info.emplace_back();
+    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+#else
+    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+#endif
+}
+
 //------------------------------------------------------------------------------
 // Op offloading
 //------------------------------------------------------------------------------
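`ggml_cl_argsort` pads each row length up to the next power of two, sorts one row per work-group, and sizes the per-group scratch via `clSetKernelArg(kernel, 7, ne00_padded*sizeof(int), NULL)`, the standard OpenCL way to allocate local memory. Power-of-two lengths are what network sorts such as bitonic sort require, which is presumably why the padding is there. A small sketch of that rounding step and the local-memory budget it implies:

```cpp
#include <cstdio>
#include <initializer_list>

// Round up to the next power of two, matching the loop in the diff.
static int next_pow2(int n) {
    int p = 1;
    while (p < n) {
        p *= 2;
    }
    return p;
}

int main() {
    for (int ne00 : {1, 5, 64, 100, 1000}) {
        printf("ne00 = %4d -> padded = %4d\n", ne00, next_pow2(ne00));
    }
    // The padded length is also the per-row local memory budget:
    // clSetKernelArg(kernel, 7, ne00_padded * sizeof(int), NULL).
    return 0;
}
```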
@@ -4855,8 +5615,6 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
             if (!any_on_device) {
                 return false;
             }
-            GGML_ASSERT(ggml_is_contiguous(src0));
-            GGML_ASSERT(ggml_is_contiguous(src1));
             func = ggml_cl_add;
             break;
         case GGML_OP_MUL:
@@ -4865,6 +5623,18 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
             }
             func = ggml_cl_mul;
             break;
+        case GGML_OP_DIV:
+            if (!any_on_device) {
+                return false;
+            }
+            func = ggml_cl_div;
+            break;
+        case GGML_OP_SUB:
+            if (!any_on_device) {
+                return false;
+            }
+            func = ggml_cl_sub;
+            break;
         case GGML_OP_UNARY:
             switch (ggml_get_unary_op(tensor)) {
                 case GGML_UNARY_OP_GELU:
@@ -4891,6 +5661,12 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
                     }
                     func = ggml_cl_relu;
                     break;
+                case GGML_UNARY_OP_SIGMOID:
+                    if (!any_on_device) {
+                        return false;
+                    }
+                    func = ggml_cl_sigmoid;
+                    break;
                 default:
                     return false;
             } break;
@@ -4912,6 +5688,12 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
             }
             func = ggml_cl_rms_norm;
             break;
+        case GGML_OP_GROUP_NORM:
+            if (!any_on_device) {
+                return false;
+            }
+            func = ggml_cl_group_norm;
+            break;
         case GGML_OP_MUL_MAT:
             if (!any_on_device && !ggml_cl_can_mul_mat(tensor->src[0], tensor->src[1], tensor)) {
                 return false;
@@ -4957,6 +5739,18 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
             }
             func = ggml_cl_im2col;
             break;
+        case GGML_OP_ARGSORT:
+            if (!any_on_device) {
+                return false;
+            }
+            func = ggml_cl_argsort;
+            break;
+        case GGML_OP_SUM_ROWS:
+            if (!any_on_device) {
+                return false;
+            }
+            func = ggml_cl_sum_rows;
+            break;
         default:
             return false;
     }