whispercpp 1.3.5 → 1.3.6
This diff shows the content of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
- checksums.yaml +4 -4
- data/LICENSE +1 -1
- data/README.md +99 -2
- data/ext/extconf.rb +1 -0
- data/ext/ruby_whisper.c +20 -4
- data/ext/ruby_whisper.h +30 -2
- data/ext/ruby_whisper_context.c +216 -124
- data/ext/ruby_whisper_context_params.c +163 -0
- data/ext/ruby_whisper_model.c +0 -1
- data/ext/ruby_whisper_params.c +0 -1
- data/ext/ruby_whisper_segment.c +0 -1
- data/ext/ruby_whisper_token.c +29 -9
- data/ext/ruby_whisper_transcribe.cpp +4 -1
- data/ext/ruby_whisper_vad_context.c +48 -1
- data/ext/ruby_whisper_vad_context_detect.cpp +6 -5
- data/ext/ruby_whisper_vad_params.c +0 -1
- data/ext/ruby_whisper_vad_segment.c +0 -1
- data/ext/ruby_whisper_vad_segments.c +0 -1
- data/ext/sources/CMakeLists.txt +1 -1
- data/ext/sources/bindings/javascript/package.json +1 -1
- data/ext/sources/cmake/whisper-config.cmake.in +5 -40
- data/ext/sources/examples/bench/bench.cpp +23 -18
- data/ext/sources/examples/cli/cli.cpp +8 -0
- data/ext/sources/examples/common-ggml.cpp +2 -0
- data/ext/sources/examples/miniaudio.h +4507 -2131
- data/ext/sources/examples/server/server.cpp +18 -4
- data/ext/sources/examples/talk-llama/CMakeLists.txt +3 -2
- data/ext/sources/examples/talk-llama/llama-adapter.cpp +7 -13
- data/ext/sources/examples/talk-llama/llama-adapter.h +4 -3
- data/ext/sources/examples/talk-llama/llama-arch.cpp +335 -17
- data/ext/sources/examples/talk-llama/llama-arch.h +42 -0
- data/ext/sources/examples/talk-llama/llama-batch.cpp +3 -1
- data/ext/sources/examples/talk-llama/llama-chat.cpp +21 -1
- data/ext/sources/examples/talk-llama/llama-chat.h +1 -0
- data/ext/sources/examples/talk-llama/llama-context.cpp +508 -520
- data/ext/sources/examples/talk-llama/llama-context.h +27 -28
- data/ext/sources/examples/talk-llama/llama-cparams.h +5 -0
- data/ext/sources/examples/talk-llama/llama-ext.h +12 -0
- data/ext/sources/examples/talk-llama/llama-grammar.cpp +8 -8
- data/ext/sources/examples/talk-llama/llama-graph.cpp +583 -130
- data/ext/sources/examples/talk-llama/llama-graph.h +131 -10
- data/ext/sources/examples/talk-llama/llama-hparams.cpp +57 -40
- data/ext/sources/examples/talk-llama/llama-hparams.h +79 -10
- data/ext/sources/examples/talk-llama/llama-impl.cpp +4 -4
- data/ext/sources/examples/talk-llama/llama-impl.h +13 -1
- data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.cpp +3 -1
- data/ext/sources/examples/talk-llama/llama-kv-cache.cpp +274 -89
- data/ext/sources/examples/talk-llama/llama-kv-cache.h +2 -3
- data/ext/sources/examples/talk-llama/llama-memory-hybrid-iswa.cpp +275 -0
- data/ext/sources/examples/talk-llama/llama-memory-hybrid-iswa.h +140 -0
- data/ext/sources/examples/talk-llama/llama-memory-recurrent.cpp +11 -13
- data/ext/sources/examples/talk-llama/llama-mmap.cpp +28 -11
- data/ext/sources/examples/talk-llama/llama-model-loader.cpp +527 -119
- data/ext/sources/examples/talk-llama/llama-model-loader.h +35 -5
- data/ext/sources/examples/talk-llama/llama-model-saver.cpp +60 -46
- data/ext/sources/examples/talk-llama/llama-model-saver.h +5 -2
- data/ext/sources/examples/talk-llama/llama-model.cpp +1365 -647
- data/ext/sources/examples/talk-llama/llama-model.h +72 -19
- data/ext/sources/examples/talk-llama/llama-quant.cpp +578 -346
- data/ext/sources/examples/talk-llama/{llama-sampling.cpp → llama-sampler.cpp} +190 -76
- data/ext/sources/examples/talk-llama/{llama-sampling.h → llama-sampler.h} +0 -2
- data/ext/sources/examples/talk-llama/llama-vocab.cpp +118 -48
- data/ext/sources/examples/talk-llama/llama-vocab.h +5 -0
- data/ext/sources/examples/talk-llama/llama.cpp +76 -22
- data/ext/sources/examples/talk-llama/llama.h +63 -30
- data/ext/sources/examples/talk-llama/models/afmoe.cpp +2 -3
- data/ext/sources/examples/talk-llama/models/apertus.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/arcee.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/arctic.cpp +4 -5
- data/ext/sources/examples/talk-llama/models/baichuan.cpp +4 -3
- data/ext/sources/examples/talk-llama/models/bailingmoe.cpp +1 -2
- data/ext/sources/examples/talk-llama/models/bailingmoe2.cpp +3 -5
- data/ext/sources/examples/talk-llama/models/bert.cpp +13 -7
- data/ext/sources/examples/talk-llama/models/bitnet.cpp +9 -24
- data/ext/sources/examples/talk-llama/models/bloom.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/chameleon.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/chatglm.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/codeshell.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/cogvlm.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/cohere2-iswa.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/command-r.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/dbrx.cpp +4 -5
- data/ext/sources/examples/talk-llama/models/deci.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/deepseek.cpp +4 -6
- data/ext/sources/examples/talk-llama/models/deepseek2.cpp +24 -21
- data/ext/sources/examples/talk-llama/models/delta-net-base.cpp +445 -0
- data/ext/sources/examples/talk-llama/models/dots1.cpp +4 -6
- data/ext/sources/examples/talk-llama/models/dream.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/ernie4-5-moe.cpp +4 -6
- data/ext/sources/examples/talk-llama/models/ernie4-5.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/eurobert.cpp +97 -0
- data/ext/sources/examples/talk-llama/models/exaone-moe.cpp +145 -0
- data/ext/sources/examples/talk-llama/models/exaone.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/exaone4.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/falcon-h1.cpp +2 -4
- data/ext/sources/examples/talk-llama/models/falcon.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/gemma-embedding.cpp +1 -1
- data/ext/sources/examples/talk-llama/models/gemma.cpp +1 -1
- data/ext/sources/examples/talk-llama/models/gemma2-iswa.cpp +1 -1
- data/ext/sources/examples/talk-llama/models/gemma3.cpp +1 -1
- data/ext/sources/examples/talk-llama/models/gemma3n-iswa.cpp +7 -7
- data/ext/sources/examples/talk-llama/models/glm4-moe.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/glm4.cpp +14 -7
- data/ext/sources/examples/talk-llama/models/gpt2.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/gptneox.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/granite-hybrid.cpp +4 -5
- data/ext/sources/examples/talk-llama/models/granite.cpp +4 -5
- data/ext/sources/examples/talk-llama/models/grok.cpp +4 -4
- data/ext/sources/examples/talk-llama/models/grovemoe.cpp +5 -7
- data/ext/sources/examples/talk-llama/models/hunyuan-dense.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/hunyuan-moe.cpp +4 -5
- data/ext/sources/examples/talk-llama/models/internlm2.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/jais.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/jais2.cpp +123 -0
- data/ext/sources/examples/talk-llama/models/jamba.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/kimi-linear.cpp +381 -0
- data/ext/sources/examples/talk-llama/models/lfm2.cpp +145 -124
- data/ext/sources/examples/talk-llama/models/llada-moe.cpp +4 -4
- data/ext/sources/examples/talk-llama/models/llada.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/llama-iswa.cpp +4 -4
- data/ext/sources/examples/talk-llama/models/llama.cpp +18 -11
- data/ext/sources/examples/talk-llama/models/maincoder.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/{graph-context-mamba.cpp → mamba-base.cpp} +9 -3
- data/ext/sources/examples/talk-llama/models/mamba.cpp +1 -2
- data/ext/sources/examples/talk-llama/models/mimo2-iswa.cpp +11 -5
- data/ext/sources/examples/talk-llama/models/minicpm3.cpp +14 -13
- data/ext/sources/examples/talk-llama/models/minimax-m2.cpp +4 -5
- data/ext/sources/examples/talk-llama/models/mistral3.cpp +4 -4
- data/ext/sources/examples/talk-llama/models/models.h +181 -46
- data/ext/sources/examples/talk-llama/models/modern-bert.cpp +2 -9
- data/ext/sources/examples/talk-llama/models/mpt.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/nemotron-h.cpp +26 -14
- data/ext/sources/examples/talk-llama/models/nemotron.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/neo-bert.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/olmo.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/olmo2.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/olmoe.cpp +4 -4
- data/ext/sources/examples/talk-llama/models/openai-moe-iswa.cpp +1 -1
- data/ext/sources/examples/talk-llama/models/openelm.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/orion.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/paddleocr.cpp +122 -0
- data/ext/sources/examples/talk-llama/models/pangu-embedded.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/phi2.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/phi3.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/plamo.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/plamo2.cpp +9 -5
- data/ext/sources/examples/talk-llama/models/plamo3.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/plm.cpp +15 -14
- data/ext/sources/examples/talk-llama/models/qwen.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/qwen2.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/qwen2moe.cpp +4 -4
- data/ext/sources/examples/talk-llama/models/qwen2vl.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/qwen3.cpp +12 -9
- data/ext/sources/examples/talk-llama/models/qwen35.cpp +381 -0
- data/ext/sources/examples/talk-llama/models/qwen35moe.cpp +422 -0
- data/ext/sources/examples/talk-llama/models/qwen3moe.cpp +15 -8
- data/ext/sources/examples/talk-llama/models/qwen3next.cpp +84 -432
- data/ext/sources/examples/talk-llama/models/qwen3vl-moe.cpp +9 -18
- data/ext/sources/examples/talk-llama/models/qwen3vl.cpp +8 -17
- data/ext/sources/examples/talk-llama/models/refact.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/rnd1.cpp +4 -4
- data/ext/sources/examples/talk-llama/models/rwkv6-base.cpp +2 -0
- data/ext/sources/examples/talk-llama/models/rwkv7-base.cpp +2 -0
- data/ext/sources/examples/talk-llama/models/seed-oss.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/smallthinker.cpp +4 -4
- data/ext/sources/examples/talk-llama/models/smollm3.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/stablelm.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/starcoder.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/starcoder2.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/step35-iswa.cpp +165 -0
- data/ext/sources/examples/talk-llama/models/t5-dec.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/t5-enc.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/xverse.cpp +3 -3
- data/ext/sources/examples/talk-llama/unicode.cpp +21 -65
- data/ext/sources/ggml/CMakeLists.txt +9 -3
- data/ext/sources/ggml/include/ggml-backend.h +1 -1
- data/ext/sources/ggml/include/ggml-cann.h +1 -1
- data/ext/sources/ggml/include/ggml-cpu.h +5 -0
- data/ext/sources/ggml/include/ggml-openvino.h +37 -0
- data/ext/sources/ggml/include/ggml-opt.h +1 -1
- data/ext/sources/ggml/include/ggml-rpc.h +6 -1
- data/ext/sources/ggml/include/ggml-virtgpu.h +14 -0
- data/ext/sources/ggml/include/ggml.h +56 -9
- data/ext/sources/ggml/src/CMakeLists.txt +3 -0
- data/ext/sources/ggml/src/ggml-alloc.c +4 -9
- data/ext/sources/ggml/src/ggml-backend-dl.cpp +48 -0
- data/ext/sources/ggml/src/ggml-backend-dl.h +45 -0
- data/ext/sources/ggml/src/ggml-backend-reg.cpp +28 -86
- data/ext/sources/ggml/src/ggml-backend.cpp +5 -2
- data/ext/sources/ggml/src/ggml-blas/CMakeLists.txt +1 -1
- data/ext/sources/ggml/src/ggml-blas/ggml-blas.cpp +6 -2
- data/ext/sources/ggml/src/ggml-cann/acl_tensor.cpp +1 -1
- data/ext/sources/ggml/src/ggml-cann/acl_tensor.h +1 -1
- data/ext/sources/ggml/src/ggml-cann/aclnn_ops.cpp +348 -189
- data/ext/sources/ggml/src/ggml-cann/aclnn_ops.h +40 -85
- data/ext/sources/ggml/src/ggml-cann/common.h +3 -4
- data/ext/sources/ggml/src/ggml-cann/ggml-cann.cpp +44 -62
- data/ext/sources/ggml/src/ggml-common.h +11 -0
- data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +16 -11
- data/ext/sources/ggml/src/ggml-cpu/amx/amx.cpp +42 -19
- data/ext/sources/ggml/src/ggml-cpu/amx/common.h +34 -10
- data/ext/sources/ggml/src/ggml-cpu/amx/mmq.cpp +85 -85
- data/ext/sources/ggml/src/ggml-cpu/arch/arm/quants.c +85 -1
- data/ext/sources/ggml/src/ggml-cpu/arch/arm/repack.cpp +2744 -548
- data/ext/sources/ggml/src/ggml-cpu/arch/riscv/quants.c +1653 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/riscv/repack.cpp +1391 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/s390/quants.c +8 -10
- data/ext/sources/ggml/src/ggml-cpu/arch/x86/quants.c +9 -9
- data/ext/sources/ggml/src/ggml-cpu/arch/x86/repack.cpp +118 -18
- data/ext/sources/ggml/src/ggml-cpu/arch-fallback.h +107 -26
- data/ext/sources/ggml/src/ggml-cpu/binary-ops.cpp +2 -6
- data/ext/sources/ggml/src/ggml-cpu/common.h +8 -0
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-impl.h +3 -0
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +59 -12
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.cpp +15 -0
- data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.cpp +21 -20
- data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +965 -252
- data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +584 -197
- data/ext/sources/ggml/src/ggml-cpu/ops.cpp +903 -188
- data/ext/sources/ggml/src/ggml-cpu/ops.h +1 -0
- data/ext/sources/ggml/src/ggml-cpu/quants.c +40 -0
- data/ext/sources/ggml/src/ggml-cpu/quants.h +3 -0
- data/ext/sources/ggml/src/ggml-cpu/repack.cpp +2890 -679
- data/ext/sources/ggml/src/ggml-cpu/repack.h +119 -8
- data/ext/sources/ggml/src/ggml-cpu/simd-gemm.h +136 -0
- data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +111 -3
- data/ext/sources/ggml/src/ggml-cpu/unary-ops.cpp +1 -1
- data/ext/sources/ggml/src/ggml-cpu/vec.cpp +17 -0
- data/ext/sources/ggml/src/ggml-cuda/CMakeLists.txt +1 -1
- data/ext/sources/ggml/src/ggml-cuda/argsort.cu +19 -10
- data/ext/sources/ggml/src/ggml-cuda/binbcast.cu +32 -30
- data/ext/sources/ggml/src/ggml-cuda/common.cuh +134 -18
- data/ext/sources/ggml/src/ggml-cuda/convert.cu +41 -27
- data/ext/sources/ggml/src/ggml-cuda/cpy.cu +6 -3
- data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +78 -64
- data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +384 -143
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cuh +36 -22
- data/ext/sources/ggml/src/ggml-cuda/fattn-vec.cuh +3 -3
- data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +26 -5
- data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cuh +1 -1
- data/ext/sources/ggml/src/ggml-cuda/fattn.cu +127 -12
- data/ext/sources/ggml/src/ggml-cuda/gated_delta_net.cu +263 -0
- data/ext/sources/ggml/src/ggml-cuda/gated_delta_net.cuh +4 -0
- data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +595 -200
- data/ext/sources/ggml/src/ggml-cuda/mean.cu +9 -8
- data/ext/sources/ggml/src/ggml-cuda/mma.cuh +173 -6
- data/ext/sources/ggml/src/ggml-cuda/mmf.cu +30 -10
- data/ext/sources/ggml/src/ggml-cuda/mmf.cuh +158 -85
- data/ext/sources/ggml/src/ggml-cuda/mmq.cuh +34 -22
- data/ext/sources/ggml/src/ggml-cuda/mmvf.cu +127 -67
- data/ext/sources/ggml/src/ggml-cuda/mmvf.cuh +2 -0
- data/ext/sources/ggml/src/ggml-cuda/mmvq.cu +157 -65
- data/ext/sources/ggml/src/ggml-cuda/mmvq.cuh +1 -0
- data/ext/sources/ggml/src/ggml-cuda/norm.cu +18 -76
- data/ext/sources/ggml/src/ggml-cuda/pad.cu +13 -10
- data/ext/sources/ggml/src/ggml-cuda/quantize.cu +1 -1
- data/ext/sources/ggml/src/ggml-cuda/reduce_rows.cuh +2 -16
- data/ext/sources/ggml/src/ggml-cuda/rope.cu +233 -133
- data/ext/sources/ggml/src/ggml-cuda/softmax.cu +8 -83
- data/ext/sources/ggml/src/ggml-cuda/solve_tri.cu +1 -1
- data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cu +56 -32
- data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cuh +1 -1
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_32.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_4.cu +1 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_32.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_4.cu +1 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_4.cu +1 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_4.cu +1 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +3 -3
- data/ext/sources/ggml/src/ggml-cuda/top-k.cu +0 -1
- data/ext/sources/ggml/src/ggml-cuda/topk-moe.cu +199 -135
- data/ext/sources/ggml/src/ggml-cuda/topk-moe.cuh +20 -14
- data/ext/sources/ggml/src/ggml-cuda/unary.cu +55 -0
- data/ext/sources/ggml/src/ggml-cuda/unary.cuh +2 -0
- data/ext/sources/ggml/src/ggml-cuda/vecdotq.cuh +31 -17
- data/ext/sources/ggml/src/ggml-cuda/vendors/hip.h +10 -0
- data/ext/sources/ggml/src/ggml-hexagon/CMakeLists.txt +82 -45
- data/ext/sources/ggml/src/ggml-hexagon/ggml-hexagon.cpp +334 -160
- data/ext/sources/ggml/src/ggml-hexagon/htp/CMakeLists.txt +7 -5
- data/ext/sources/ggml/src/ggml-hexagon/htp/act-ops.c +328 -197
- data/ext/sources/ggml/src/ggml-hexagon/htp/argsort-ops.c +281 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/binary-ops.c +765 -234
- data/ext/sources/ggml/src/ggml-hexagon/htp/cpy-ops.c +252 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/flash-attn-ops.c +412 -265
- data/ext/sources/ggml/src/ggml-hexagon/htp/get-rows-ops.c +23 -23
- data/ext/sources/ggml/src/ggml-hexagon/htp/{htp-dma.c → hex-dma.c} +1 -1
- data/ext/sources/ggml/src/ggml-hexagon/htp/{htp-dma.h → hex-dma.h} +28 -3
- data/ext/sources/ggml/src/ggml-hexagon/htp/hex-dump.h +77 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hex-fastdiv.h +37 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hex-utils.h +51 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ctx.h +1 -1
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp-msg.h +27 -37
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ops.h +6 -35
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-arith.h +443 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-base.h +240 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-copy.h +245 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-div.h +251 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-dump.h +129 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-exp.h +215 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-floor.h +100 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-inverse.h +210 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-reduce.h +296 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-scale.h +133 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sigmoid.h +141 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sqrt.h +126 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-types.h +36 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-utils.h +20 -1347
- data/ext/sources/ggml/src/ggml-hexagon/htp/main.c +211 -13
- data/ext/sources/ggml/src/ggml-hexagon/htp/matmul-ops.c +1119 -952
- data/ext/sources/ggml/src/ggml-hexagon/htp/rope-ops.c +254 -244
- data/ext/sources/ggml/src/ggml-hexagon/htp/set-rows-ops.c +36 -36
- data/ext/sources/ggml/src/ggml-hexagon/htp/softmax-ops.c +155 -138
- data/ext/sources/ggml/src/ggml-hexagon/htp/ssm-conv.c +339 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/sum-rows-ops.c +128 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/unary-ops.c +209 -114
- data/ext/sources/ggml/src/ggml-hexagon/htp/worker-pool.c +1 -5
- data/ext/sources/ggml/src/ggml-hexagon/htp-drv.cpp +418 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp-drv.h +121 -0
- data/ext/sources/ggml/src/ggml-hexagon/libdl.h +79 -0
- data/ext/sources/ggml/src/ggml-hexagon/libggml-htp.inf +38 -0
- data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +6 -0
- data/ext/sources/ggml/src/ggml-impl.h +62 -0
- data/ext/sources/ggml/src/ggml-metal/CMakeLists.txt +10 -10
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-common.cpp +13 -2
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.h +8 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.m +147 -17
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.cpp +274 -73
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.h +22 -4
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.m +102 -36
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +174 -23
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.cpp +580 -280
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.h +5 -4
- data/ext/sources/ggml/src/ggml-metal/ggml-metal.cpp +320 -107
- data/ext/sources/ggml/src/ggml-metal/ggml-metal.metal +1068 -825
- data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +19 -1
- data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +3108 -636
- data/ext/sources/ggml/src/ggml-opencl/kernels/concat.cl +41 -99
- data/ext/sources/ggml/src/ggml-opencl/kernels/cpy.cl +45 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/cumsum.cl +139 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/cvt.cl +204 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/diag.cl +27 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/exp.cl +125 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/expm1.cl +87 -56
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q4_1_f32.cl +132 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_general_q8_0_f32.cl +195 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q4_1_f32.cl +283 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/l2_norm.cl +71 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mean.cl +114 -13
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q4_0_f32_l4_lm.cl +163 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q4_1_f32_l4_lm.cl +165 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q6_k_f32_l4_lm.cl +158 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_8x4.cl +129 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32.cl +219 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32_flat.cl +229 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_k_f32.cl +180 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/{mul_mv_q6_k.cl → mul_mv_q6_k_f32.cl} +4 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32_flat.cl +194 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/neg.cl +125 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/repeat.cl +31 -32
- data/ext/sources/ggml/src/ggml-opencl/kernels/scale.cl +14 -4
- data/ext/sources/ggml/src/ggml-opencl/kernels/softplus.cl +88 -60
- data/ext/sources/ggml/src/ggml-opencl/kernels/solve_tri.cl +51 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/sum_rows.cl +114 -13
- data/ext/sources/ggml/src/ggml-opencl/kernels/tanh.cl +94 -48
- data/ext/sources/ggml/src/ggml-opencl/kernels/transpose.cl +26 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/tri.cl +32 -0
- data/ext/sources/ggml/src/ggml-openvino/.clang-format +154 -0
- data/ext/sources/ggml/src/ggml-openvino/CMakeLists.txt +22 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-decoder.cpp +975 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-decoder.h +294 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-openvino-extra.cpp +373 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-openvino-extra.h +182 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-openvino.cpp +1110 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-quants.cpp +884 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-quants.h +153 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/decoder.h +74 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/frontend.cpp +27 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/frontend.h +23 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/input_model.cpp +17 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/input_model.h +29 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/node_context.h +112 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/cont.cpp +48 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/cpy.cpp +21 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp +90 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/get_rows.cpp +69 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/glu_geglu.cpp +61 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp +62 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/mulmat.cpp +90 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/permute.cpp +102 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/reshape.cpp +83 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp +46 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/rope.cpp +123 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/scale.cpp +41 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/set_rows.cpp +76 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/softmax.cpp +89 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/transpose.cpp +23 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/unary_silu.cpp +27 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/view.cpp +53 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op_table.cpp +46 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op_table.h +39 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.cpp +123 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.h +17 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.cpp +60 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.h +17 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/mark_decompression_convert_constant_folding.h +29 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/squeeze_matmul.cpp +58 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/squeeze_matmul.h +17 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/translate_session.cpp +293 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/translate_session.h +28 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/utils.cpp +226 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/utils.h +85 -0
- data/ext/sources/ggml/src/ggml-openvino/utils.cpp +823 -0
- data/ext/sources/ggml/src/ggml-openvino/utils.h +123 -0
- data/ext/sources/ggml/src/ggml-quants.c +96 -5
- data/ext/sources/ggml/src/ggml-quants.h +3 -0
- data/ext/sources/ggml/src/ggml-sycl/CMakeLists.txt +15 -88
- data/ext/sources/ggml/src/ggml-sycl/add-id.cpp +5 -1
- data/ext/sources/ggml/src/ggml-sycl/backend.hpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/binbcast.cpp +21 -20
- data/ext/sources/ggml/src/ggml-sycl/common.hpp +315 -10
- data/ext/sources/ggml/src/ggml-sycl/convert.cpp +69 -1
- data/ext/sources/ggml/src/ggml-sycl/convert.hpp +22 -1
- data/ext/sources/ggml/src/ggml-sycl/count-equal.cpp +1 -1
- data/ext/sources/ggml/src/ggml-sycl/dpct/helper.hpp +791 -47
- data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +78 -68
- data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +2 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn-common.hpp +1179 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn-tile.cpp +55 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn-tile.hpp +1338 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn-vec.hpp +667 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn.cpp +225 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn.hpp +22 -0
- data/ext/sources/ggml/src/ggml-sycl/gated_delta_net.cpp +309 -0
- data/ext/sources/ggml/src/ggml-sycl/gated_delta_net.hpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/ggml-sycl.cpp +316 -51
- data/ext/sources/ggml/src/ggml-sycl/norm.cpp +65 -66
- data/ext/sources/ggml/src/ggml-sycl/outprod.cpp +3 -3
- data/ext/sources/ggml/src/ggml-sycl/presets.hpp +3 -0
- data/ext/sources/ggml/src/ggml-sycl/quants.hpp +1 -1
- data/ext/sources/ggml/src/ggml-sycl/rope.cpp +450 -287
- data/ext/sources/ggml/src/ggml-sycl/rope.hpp +6 -0
- data/ext/sources/ggml/src/ggml-sycl/softmax.cpp +6 -6
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq112-dv112.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq128-dv128.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq256-dv256.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq40-dv40.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq576-dv512.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq64-dv64.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq72-dv72.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq80-dv80.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq96-dv96.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-f16.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q8_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-f16.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q8_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-f16.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q8_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-f16.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q8_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-f16.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q8_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-f16.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q8_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +13 -0
- data/ext/sources/ggml/src/ggml-sycl/wkv.cpp +1 -1
- data/ext/sources/ggml/src/ggml-virtgpu/CMakeLists.txt +70 -0
- data/ext/sources/ggml/src/ggml-virtgpu/apir_cs_ggml-rpc-front.cpp +87 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/CMakeLists.txt +21 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/apir_cs_ggml-rpc-back.cpp +115 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-convert.h +13 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-backend.cpp +102 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer-type.cpp +105 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer.cpp +179 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-device.cpp +148 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched.cpp +51 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched.gen.h +73 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched.h +27 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-virgl-apir.h +32 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend.cpp +144 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/api_remoting.h +95 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_backend.gen.h +94 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_backend.h +50 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_cs.h +378 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_cs_ggml.h +232 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_cs_rpc.h +58 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-buffer-type.cpp +81 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-buffer.cpp +119 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-device.cpp +158 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-reg.cpp +213 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend.cpp +69 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-remoting.h +71 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggmlremoting_functions.yaml +166 -0
- data/ext/sources/ggml/src/ggml-virtgpu/include/apir_hw.h +9 -0
- data/ext/sources/ggml/src/ggml-virtgpu/regenerate_remoting.py +333 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-apir.h +15 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-backend.cpp +58 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-buffer-type.cpp +110 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-buffer.cpp +173 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-device.cpp +192 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-impl.h +36 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward.gen.h +53 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-shm.cpp +98 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-shm.h +23 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-utils.cpp +179 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-utils.h +86 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu.cpp +544 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu.h +117 -0
- data/ext/sources/ggml/src/ggml-vulkan/CMakeLists.txt +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/ggml-vulkan.cpp +1250 -465
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/acc.comp +16 -8
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/elu.comp +27 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +374 -170
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.glsl +66 -22
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +389 -201
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +106 -58
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_mask_opt.comp +162 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +9 -8
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gated_delta_net.comp +128 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/l2_norm.comp +12 -9
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.glsl +20 -17
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +11 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +8 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_id_funcs.glsl +3 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +5 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.glsl +3 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +2 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl +36 -63
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +7 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +7 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +7 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl +10 -5
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_vision.comp +7 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sgn.comp +21 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/ssm_conv.comp +16 -10
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +55 -35
- data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp +1314 -109
- data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu.cpp +1660 -1371
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/argmax.wgsl +72 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/argsort.wgsl +106 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/argsort_merge.wgsl +134 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary.wgsl +141 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +65 -72
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/concat.wgsl +75 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cpy.tmpl.wgsl +6 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cumsum.wgsl +66 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +40 -5
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn.wgsl +105 -60
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{get_rows.tmpl.wgsl → get_rows.wgsl} +53 -259
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{mul_mat.tmpl.wgsl → mul_mat.wgsl} +68 -257
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl +692 -23
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{mul_mat_reg_tile.tmpl.wgsl → mul_mat_reg_tile.wgsl} +28 -128
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{mul_mat_subgroup_matrix.tmpl.wgsl → mul_mat_subgroup_matrix.wgsl} +31 -137
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.wgsl +480 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/pad.wgsl +86 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/repeat.wgsl +67 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{scale.tmpl.wgsl → scale.wgsl} +9 -36
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +40 -12
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/sum_rows.wgsl +55 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/unary.wgsl +193 -0
- data/ext/sources/ggml/src/ggml-zdnn/ggml-zdnn.cpp +6 -1
- data/ext/sources/ggml/src/ggml-zendnn/CMakeLists.txt +31 -32
- data/ext/sources/ggml/src/ggml-zendnn/ggml-zendnn.cpp +9 -6
- data/ext/sources/ggml/src/ggml.c +167 -33
- data/ext/sources/ggml/src/gguf.cpp +229 -44
- data/ext/sources/src/whisper.cpp +6 -28
- data/sig/whisper.rbs +43 -2
- data/test/test_context_params.rb +82 -0
- data/test/test_token.rb +11 -0
- data/test/test_vad_context.rb +58 -8
- data/test/test_whisper.rb +20 -0
- data/whispercpp.gemspec +1 -1
- metadata +240 -28
- data/ext/sources/ggml/cmake/BuildTypes.cmake +0 -54
- data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm-ppc.h +0 -333
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-exp.c +0 -94
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-inverse.c +0 -72
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sigmoid.c +0 -49
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-utils.c +0 -1020
- data/ext/sources/ggml/src/ggml-hexagon/htp/ops-utils.h +0 -149
- data/ext/sources/ggml/src/ggml-hexagon/htp-utils.c +0 -454
- data/ext/sources/ggml/src/ggml-hexagon/htp-utils.h +0 -221
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/bin_op.tmpl.wgsl +0 -188
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary_head.tmpl +0 -45
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.tmpl.wgsl +0 -267
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.tmpl.wgsl +0 -112
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/unary_op.wgsl +0 -483
data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp

@@ -226,7 +226,8 @@ static ADRENO_GPU_GEN get_adreno_gpu_gen(const char *device_name) {
         return ADRENO_GPU_GEN::A7X;
     }
 
-    if (strstr(device_name, "830")
+    if (strstr(device_name, "830") ||
+        strstr(device_name, "840")) {
         return ADRENO_GPU_GEN::A8X;
     }
 
@@ -312,7 +313,7 @@ struct ProfilingInfo {
     cl_ulong cmd_duration_ns;
     // The time for the kernel to complete - COMPLETE - END
     cl_ulong cmd_complete_duration_ns;
-    // Total time to finish the kernel -
+    // Total time to finish the kernel - COMPLETE - QUEUED
     cl_ulong cmd_total_duration_ns;
     // Global and local work sizes.
     size_t global_size[3];
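These durations map directly onto OpenCL's event-profiling counters, which advance QUEUED → SUBMIT → START → END → COMPLETE. A minimal sketch of how a ProfilingInfo `info` could be filled in, assuming a queue created with CL_QUEUE_PROFILING_ENABLE, an event `evt` returned by clEnqueueNDRangeKernel, support for CL_PROFILING_COMMAND_COMPLETE (OpenCL 2.0+), and that cmd_duration_ns covers START → END as the neighboring comments suggest; illustrative only, not the backend's actual code:

    // Sketch: deriving the ProfilingInfo durations from one OpenCL event.
    cl_ulong t_queued = 0, t_start = 0, t_end = 0, t_complete = 0;
    clGetEventProfilingInfo(evt, CL_PROFILING_COMMAND_QUEUED,   sizeof(cl_ulong), &t_queued,   NULL);
    clGetEventProfilingInfo(evt, CL_PROFILING_COMMAND_START,    sizeof(cl_ulong), &t_start,    NULL);
    clGetEventProfilingInfo(evt, CL_PROFILING_COMMAND_END,      sizeof(cl_ulong), &t_end,      NULL);
    clGetEventProfilingInfo(evt, CL_PROFILING_COMMAND_COMPLETE, sizeof(cl_ulong), &t_complete, NULL);

    info.cmd_duration_ns          = t_end      - t_start;   // START -> END
    info.cmd_complete_duration_ns = t_complete - t_end;     // END -> COMPLETE
    info.cmd_total_duration_ns    = t_complete - t_queued;  // QUEUED -> COMPLETE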
@@ -398,6 +399,7 @@ struct ggml_backend_opencl_context {
     int adreno_wave_size;
 
     cl_bool non_uniform_workgroups;
+    size_t image_max_buffer_size;
 
     cl_context context;
     cl_command_queue queue;
@@ -407,10 +409,13 @@ struct ggml_backend_opencl_context {
     ggml_cl_buffer prealloc_scales_trans;
     ggml_cl_buffer prealloc_act_trans;
 
+    // prealloc buffers for src0 and src1
+    ggml_cl_buffer prealloc_src0;
+    ggml_cl_buffer prealloc_src1;
+
     cl_program program_add;
     cl_program program_add_id;
     cl_program program_clamp;
-    cl_program program_cpy;
     cl_program program_cvt;
     cl_program program_diag_mask_inf;
     cl_program program_gelu;
@@ -447,7 +452,6 @@ struct ggml_backend_opencl_context {
     cl_program program_rms_norm;
     cl_program program_group_norm;
     cl_program program_rope;
-    cl_program program_scale;
     cl_program program_silu;
     cl_program program_sigmoid;
     cl_program program_softmax_f32;
@@ -456,11 +460,8 @@ struct ggml_backend_opencl_context {
     cl_program program_softmax_4_f16;
     cl_program program_argsort_f32_i32;
     cl_program program_sum_rows_f32;
-    cl_program program_repeat;
     cl_program program_pad;
-    cl_program program_tanh;
     cl_program program_upscale;
-    cl_program program_concat;
     cl_program program_conv_2d_f16;
     cl_program program_conv_2d_f32;
     cl_program program_conv_2d_f16_f32;
@@ -479,24 +480,27 @@ struct ggml_backend_opencl_context {
     cl_kernel kernel_div, kernel_div_row, kernel_div_f16, kernel_div_row_f16;
     cl_kernel kernel_sub, kernel_sub_row, kernel_sub_f16, kernel_sub_row_f16;
     cl_kernel kernel_add_id;
-    cl_kernel
+    cl_kernel kernel_scale_f32, kernel_scale_f32_4;
     cl_kernel kernel_sqr_cont_f32, kernel_sqr_cont_f32_4, kernel_sqr_cont_f16, kernel_sqr_cont_f16_4;
     cl_kernel kernel_sqrt_cont_f32, kernel_sqrt_cont_f32_4, kernel_sqrt_cont_f16, kernel_sqrt_cont_f16_4;
-    cl_kernel kernel_mean_f32;
+    cl_kernel kernel_mean_f32, kernel_mean_f32_4;
     cl_kernel kernel_silu, kernel_silu_4;
     cl_kernel kernel_gelu, kernel_gelu_4;
     cl_kernel kernel_gelu_erf, kernel_gelu_erf_4;
     cl_kernel kernel_gelu_quick, kernel_gelu_quick_4;
     cl_kernel kernel_relu;
     cl_kernel kernel_sigmoid_f32, kernel_sigmoid_f16;
+    cl_kernel kernel_tri;
     cl_kernel kernel_fill;
     cl_kernel kernel_clamp;
     cl_kernel kernel_geglu, kernel_reglu, kernel_swiglu, kernel_swiglu_oai, kernel_geglu_erf, kernel_geglu_quick,
         kernel_geglu_f16, kernel_reglu_f16, kernel_swiglu_f16, kernel_geglu_erf_f16, kernel_geglu_quick_f16;
     cl_kernel kernel_norm, kernel_norm_mul_add;
     cl_kernel kernel_rms_norm, kernel_rms_norm_mul;
+    cl_kernel kernel_l2_norm_f32;
     cl_kernel kernel_group_norm, kernel_group_norm_mul_add;
     cl_kernel kernel_diag_mask_inf, kernel_diag_mask_inf_8;
+    cl_kernel kernel_diag_f32;
     cl_kernel kernel_soft_max, kernel_soft_max_4;
     cl_kernel kernel_soft_max_f16, kernel_soft_max_4_f16;
     std::map<std::pair<int, int>, cl_kernel> kernels_flash_attn_f16;
@@ -511,7 +515,7 @@ struct ggml_backend_opencl_context {
     cl_kernel kernel_set_rows_f32_i64, kernel_set_rows_f32_i32, kernel_set_rows_f16_i64, kernel_set_rows_f16_i32;
     cl_kernel kernel_rope_norm_f32, kernel_rope_norm_f16, kernel_rope_neox_f32, kernel_rope_neox_f16;
     cl_kernel kernel_rope_multi_f32, kernel_rope_multi_f16, kernel_rope_vision_f32, kernel_rope_vision_f16;
-    cl_kernel kernel_cpy_f16_f16, kernel_cpy_f16_f32, kernel_cpy_f32_f16, kernel_cpy_f32_f32;
+    cl_kernel kernel_cpy_f16_f16, kernel_cpy_f16_f32, kernel_cpy_f32_f16, kernel_cpy_f32_f32, kernel_cpy_i32_i32;
     cl_kernel kernel_mul_mat_f32_f32;
     cl_kernel kernel_mul_mat_f16_f16;
     cl_kernel kernel_mul_mat_f16_f32_1row;
@@ -522,30 +526,43 @@ struct ggml_backend_opencl_context {
     cl_kernel kernel_mul_mm_f16_f32_kq;
     cl_kernel kernel_mul_mat_q4_0_f32, kernel_mul_mat_q4_0_f32_v;
     cl_kernel kernel_convert_block_q4_0, kernel_restore_block_q4_0;
+    cl_kernel kernel_convert_block_q4_1, kernel_restore_block_q4_1;
     cl_kernel kernel_convert_block_mxfp4, kernel_convert_block_mxfp4_trans, kernel_restore_block_mxfp4, kernel_restore_block_mxfp4_trans;
-    cl_kernel kernel_convert_block_q8_0, kernel_restore_block_q8_0;
+    cl_kernel kernel_convert_block_q8_0, kernel_restore_block_q8_0, kernel_restore_block_q8_0_trans;
     cl_kernel kernel_mul_mat_q4_0_f32_8x_flat;
     cl_kernel kernel_convert_block_q4_0_noshuffle;
     cl_kernel kernel_restore_block_q4_0_noshuffle;
+    cl_kernel kernel_convert_block_q4_1_noshuffle;
+    cl_kernel kernel_restore_block_q4_1_noshuffle;
+    cl_kernel kernel_convert_block_q6_K, kernel_restore_block_q6_K;
     cl_kernel kernel_mul_mat_q4_0_f32_1d_8x_flat, kernel_mul_mat_q4_0_f32_1d_16x_flat;
+    cl_kernel kernel_mul_mv_q4_1_f32;
+    cl_kernel kernel_mul_mv_q4_1_f32_flat;
+    cl_kernel kernel_mul_mv_q4_K_f32;
     cl_kernel kernel_mul_mv_q6_K_f32;
+    cl_kernel kernel_mul_mv_q6_K_f32_flat;
     cl_kernel kernel_mul_mv_mxfp4_f32, kernel_mul_mv_mxfp4_f32_flat;
     cl_kernel kernel_mul_mv_q8_0_f32, kernel_mul_mv_q8_0_f32_flat;
+    cl_kernel kernel_solve_tri_f32;
     cl_kernel kernel_im2col_f32, kernel_im2col_f16;
     cl_kernel kernel_argsort_f32_i32;
-    cl_kernel kernel_sum_rows_f32;
-    cl_kernel
+    cl_kernel kernel_sum_rows_f32, kernel_sum_rows_f32_4;
+    cl_kernel kernel_cumsum_blk, kernel_cumsum_add;
+    cl_kernel kernel_repeat_f32;
     cl_kernel kernel_pad;
-    cl_kernel
-    cl_kernel
-    cl_kernel
-    cl_kernel
-    cl_kernel
-    cl_kernel
+    cl_kernel kernel_tanh_f32, kernel_tanh_f32_4, kernel_tanh_f32_nc;
+    cl_kernel kernel_tanh_f16, kernel_tanh_f16_4, kernel_tanh_f16_nc;
+    cl_kernel kernel_neg_f32, kernel_neg_f32_4, kernel_neg_f32_nc;
+    cl_kernel kernel_neg_f16, kernel_neg_f16_4, kernel_neg_f16_nc;
+    cl_kernel kernel_exp_f32, kernel_exp_f32_4, kernel_exp_f32_nc;
+    cl_kernel kernel_exp_f16, kernel_exp_f16_4, kernel_exp_f16_nc;
+    cl_kernel kernel_expm1_f32, kernel_expm1_f32_4, kernel_expm1_f32_nc;
+    cl_kernel kernel_expm1_f16, kernel_expm1_f16_4, kernel_expm1_f16_nc;
+    cl_kernel kernel_softplus_f32, kernel_softplus_f32_4, kernel_softplus_f32_nc;
+    cl_kernel kernel_softplus_f16, kernel_softplus_f16_4, kernel_softplus_f16_nc;
     cl_kernel kernel_upscale;
     cl_kernel kernel_upscale_bilinear;
-    cl_kernel
-    cl_kernel kernel_concat_f32_non_contiguous;
+    cl_kernel kernel_concat_f32;
     cl_kernel kernel_conv_2d_f16;
     cl_kernel kernel_conv_2d_f32;
     cl_kernel kernel_conv_2d_f16_f32;
@@ -558,7 +575,10 @@ struct ggml_backend_opencl_context {
     cl_kernel kernel_mul_mv_id_mxfp4_f32_flat;
     cl_kernel kernel_mul_mm_f32_f32_l4_lm;
     cl_kernel kernel_mul_mm_f16_f32_l4_lm;
+    cl_kernel kernel_mul_mm_q4_0_f32_l4_lm;
+    cl_kernel kernel_mul_mm_q4_1_f32_l4_lm;
     cl_kernel kernel_mul_mm_q8_0_f32_l4_lm;
+    cl_kernel kernel_mul_mm_q6_k_f32_l4_lm;
 
     std::vector<ProfilingInfo> profiling_info;
 
@@ -671,7 +691,9 @@ struct ggml_backend_opencl_context {
     cl_kernel kernel_transpose_32;
     cl_kernel kernel_transpose_32_16;
     cl_kernel kernel_transpose_16;
+    cl_kernel kernel_transpose_8_buf;
     cl_kernel kernel_transpose_16_buf;
+    cl_kernel kernel_transpose_32_buf;
     cl_kernel kernel_transpose_16_4x1;
 
     // Gemm and Gemv related programs, kernels, etc
@@ -687,6 +709,10 @@ struct ggml_backend_opencl_context {
     cl_kernel CL_mul_mat_vec_q4_0_f32_1d_4x_flat_4096_1_4096;
     cl_kernel CL_mul_mat_vec_q4_0_f32_1d_4x_flat_11008_1_4096;
     cl_kernel CL_mul_mat_vec_q4_0_f32_1d_4x_flat_32000_1_4096;
+    cl_kernel kernel_gemv_noshuffle_q4_1_f32;
+    cl_kernel kernel_gemm_noshuffle_q4_1_f32;
+    cl_kernel kernel_mul_mm_q8_0_f32_8x4;
+    cl_kernel CL_mul_mat_vec_q8_0_f32;
 #endif // GGML_OPENCL_USE_ADRENO_KERNELS
 
     void free() {
@@ -792,6 +818,24 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
         GGML_LOG_CONT(".");
     }
 
+    // tri
+    {
+#ifdef GGML_OPENCL_EMBED_KERNELS
+        const std::string kernel_src {
+            #include "tri.cl.h"
+        };
+#else
+        const std::string kernel_src = read_file("tri.cl");
+#endif
+        cl_program prog =
+            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
+
+        CL_CHECK((backend_ctx->kernel_tri = clCreateKernel(prog, "kernel_tri_f32", &err), err));
+        GGML_LOG_CONT(".");
+
+        CL_CHECK(clReleaseProgram(prog));
+    }
+
     // fill
     {
 #ifdef GGML_OPENCL_EMBED_KERNELS
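The `// tri` block above follows the loader pattern used throughout load_cl_kernels: with GGML_OPENCL_EMBED_KERNELS defined, each .cl file is baked into the binary through a generated header that expands to a single string literal, so the `#include` lands exactly where an initializer is expected; otherwise read_file pulls the kernel source from disk at runtime. A self-contained illustration of the embed mechanism, with a hypothetical stand-in for the generated tri.cl.h contents (the real header is produced by the build and is not part of this diff):

    #include <iostream>
    #include <string>

    static const std::string kernel_src {
        // In the real build this line is `#include "tri.cl.h"`; the raw string
        // literal below stands in for that generated header's contents, and the
        // kernel signature here is invented for illustration.
        R"(kernel void kernel_tri_f32(global const float * src, global float * dst) {
            /* kernel body elided */
        })"
    };

    int main() {
        // The source text is now ready to hand to clCreateProgramWithSource.
        std::cout << kernel_src << "\n";
    }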
@@ -835,13 +879,14 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
 #else
         const std::string kernel_src = read_file("cpy.cl");
 #endif
-
+        cl_program prog =
             build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
 
-        CL_CHECK((backend_ctx->kernel_cpy_f16_f16 = clCreateKernel(
-        CL_CHECK((backend_ctx->kernel_cpy_f16_f32 = clCreateKernel(
-        CL_CHECK((backend_ctx->kernel_cpy_f32_f16 = clCreateKernel(
-        CL_CHECK((backend_ctx->kernel_cpy_f32_f32 = clCreateKernel(
+        CL_CHECK((backend_ctx->kernel_cpy_f16_f16 = clCreateKernel(prog, "kernel_cpy_f16_f16", &err), err));
+        CL_CHECK((backend_ctx->kernel_cpy_f16_f32 = clCreateKernel(prog, "kernel_cpy_f16_f32", &err), err));
+        CL_CHECK((backend_ctx->kernel_cpy_f32_f16 = clCreateKernel(prog, "kernel_cpy_f32_f16", &err), err));
+        CL_CHECK((backend_ctx->kernel_cpy_f32_f32 = clCreateKernel(prog, "kernel_cpy_f32_f32", &err), err));
+        CL_CHECK((backend_ctx->kernel_cpy_i32_i32 = clCreateKernel(prog, "kernel_cpy_i32_i32", &err), err));
         GGML_LOG_CONT(".");
     }
 
@@ -861,12 +906,19 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
         CL_CHECK((backend_ctx->kernel_restore_block_q4_0_noshuffle = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q4_0_noshuffle", &err), err));
         CL_CHECK((backend_ctx->kernel_convert_block_q4_0 = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q4_0", &err), err));
         CL_CHECK((backend_ctx->kernel_restore_block_q4_0 = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q4_0", &err), err));
+        CL_CHECK((backend_ctx->kernel_convert_block_q4_1_noshuffle = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q4_1_noshuffle", &err), err));
+        CL_CHECK((backend_ctx->kernel_restore_block_q4_1_noshuffle = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q4_1_noshuffle", &err), err));
+        CL_CHECK((backend_ctx->kernel_convert_block_q4_1 = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q4_1", &err), err));
+        CL_CHECK((backend_ctx->kernel_restore_block_q4_1 = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q4_1", &err), err));
         CL_CHECK((backend_ctx->kernel_convert_block_mxfp4 = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_mxfp4", &err), err));
         CL_CHECK((backend_ctx->kernel_convert_block_mxfp4_trans = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_mxfp4_trans", &err), err));
         CL_CHECK((backend_ctx->kernel_restore_block_mxfp4_trans = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_mxfp4_trans", &err), err));
         CL_CHECK((backend_ctx->kernel_restore_block_mxfp4 = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_mxfp4", &err), err));
         CL_CHECK((backend_ctx->kernel_convert_block_q8_0 = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q8_0", &err), err));
         CL_CHECK((backend_ctx->kernel_restore_block_q8_0 = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q8_0", &err), err));
+        CL_CHECK((backend_ctx->kernel_restore_block_q8_0_trans = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q8_0_trans", &err), err));
+        CL_CHECK((backend_ctx->kernel_convert_block_q6_K = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q6_K", &err), err));
+        CL_CHECK((backend_ctx->kernel_restore_block_q6_K = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q6_K", &err), err));
         GGML_LOG_CONT(".");
     }
 
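Every CL_CHECK line above leans on the comma operator: `(ctx->kernel_x = clCreateKernel(prog, "kernel_x", &err), err)` stores the kernel handle, then the whole expression evaluates to `err`, which the macro validates. A plausible shape for CL_CHECK, assumed here for illustration (the actual definition lives elsewhere in ggml-opencl.cpp and may differ in detail):

    // Assumed definition, illustration only: abort loudly on any OpenCL error.
    #define CL_CHECK(expr)                                              \
        do {                                                            \
            cl_int status_ = (expr);                                    \
            if (status_ != CL_SUCCESS) {                                \
                GGML_LOG_ERROR("ggml_opencl: %s error %d at %s:%d\n",   \
                               #expr, status_, __FILE__, __LINE__);     \
                GGML_ABORT("OpenCL error");                             \
            }                                                           \
        } while (0)

This keeps each kernel creation assigned and checked in a single statement, which is why the pattern repeats verbatim for every kernel in the file.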
@@ -887,6 +939,23 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
         GGML_LOG_CONT(".");
     }

+    // diag
+    {
+#ifdef GGML_OPENCL_EMBED_KERNELS
+        const std::string kernel_src {
+            #include "diag.cl.h"
+        };
+#else
+        const std::string kernel_src = read_file("diag.cl");
+#endif
+        cl_program prog =
+            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
+
+        CL_CHECK((backend_ctx->kernel_diag_f32 = clCreateKernel(prog, "kernel_diag_f32", &err), err));
+        CL_CHECK(clReleaseProgram(prog));
+        GGML_LOG_CONT(".");
+    }
+
     // gelu
     {
 #ifdef GGML_OPENCL_EMBED_KERNELS

@@ -952,6 +1021,23 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
         GGML_LOG_CONT(".");
     }

+    // solve_tri_f32
+    {
+#ifdef GGML_OPENCL_EMBED_KERNELS
+        const std::string kernel_src {
+            #include "solve_tri.cl.h"
+        };
+#else
+        const std::string kernel_src = read_file("solve_tri.cl");
+#endif
+        cl_program prog =
+            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
+
+        CL_CHECK((backend_ctx->kernel_solve_tri_f32 = clCreateKernel(prog, "kernel_solve_tri_f32", &err), err));
+        GGML_LOG_CONT(".");
+        CL_CHECK(clReleaseProgram(prog));
+    }
+
     // im2col_f32
     {
 #ifdef GGML_OPENCL_EMBED_KERNELS
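Note: every kernel block added in this file follows the same load-or-embed pattern. With GGML_OPENCL_EMBED_KERNELS the #include sits inside a braced initializer, which only works because the generated .cl.h header expands to a single string literal; without it, the .cl source is read from disk at startup. A minimal sketch of the assumed mechanism (the literal below stands in for the generated header):

    #include <string>

    // A generated diag.cl.h is assumed to boil down to one raw string literal
    // holding the kernel source, so the include is a valid initializer.
    const std::string kernel_src {
        R"(kernel void kernel_diag_f32(global float * dst) { /* ... */ })"
    };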
@@ -1072,14 +1158,65 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
         GGML_LOG_CONT(".");
     }

-    //
+    // mul_mv_q4_1_f32
+    {
+#ifdef GGML_OPENCL_EMBED_KERNELS
+        const std::string kernel_src {
+            #include "mul_mv_q4_1_f32.cl.h"
+        };
+#else
+        const std::string kernel_src = read_file("mul_mv_q4_1_f32.cl");
+#endif
+        cl_program prog =
+            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
+
+        CL_CHECK((backend_ctx->kernel_mul_mv_q4_1_f32 = clCreateKernel(prog, "kernel_mul_mv_q4_1_f32", &err), err));
+        CL_CHECK(clReleaseProgram(prog));
+        GGML_LOG_CONT(".");
+    }
+
+    // mul_mv_q4_1_f32_flat
+    {
+#ifdef GGML_OPENCL_EMBED_KERNELS
+        const std::string kernel_src {
+            #include "mul_mv_q4_1_f32_flat.cl.h"
+        };
+#else
+        const std::string kernel_src = read_file("mul_mv_q4_1_f32_flat.cl");
+#endif
+        cl_program prog =
+            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
+
+        CL_CHECK((backend_ctx->kernel_mul_mv_q4_1_f32_flat = clCreateKernel(prog, "kernel_mul_mv_q4_1_f32_flat", &err), err));
+        CL_CHECK(clReleaseProgram(prog));
+        GGML_LOG_CONT(".");
+    }
+
+    // mul_mv_q4_k_f32
+    {
+#ifdef GGML_OPENCL_EMBED_KERNELS
+        const std::string kernel_src {
+            #include "mul_mv_q4_k_f32.cl.h"
+        };
+#else
+        const std::string kernel_src = read_file("mul_mv_q4_k_f32.cl");
+#endif
+        cl_program prog =
+            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
+
+        CL_CHECK((backend_ctx->kernel_mul_mv_q4_K_f32 = clCreateKernel(prog, "kernel_mul_mv_q4_K_f32", &err), err));
+        CL_CHECK(clReleaseProgram(prog));
+        GGML_LOG_CONT(".");
+    }
+
+    // mul_mv_q6_k_f32
     {
 #ifdef GGML_OPENCL_EMBED_KERNELS
         const std::string kernel_src {
-            #include "
+            #include "mul_mv_q6_k_f32.cl.h"
         };
 #else
-        const std::string kernel_src = read_file("
+        const std::string kernel_src = read_file("mul_mv_q6_k_f32.cl");
 #endif
         backend_ctx->program_mul_mv_q6_K =
             build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);

@@ -1088,6 +1225,23 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
         GGML_LOG_CONT(".");
     }

+    // mul_mv_q6_k_f32_flat
+    {
+#ifdef GGML_OPENCL_EMBED_KERNELS
+        const std::string kernel_src {
+            #include "mul_mv_q6_k_f32_flat.cl.h"
+        };
+#else
+        const std::string kernel_src = read_file("mul_mv_q6_k_f32_flat.cl");
+#endif
+        cl_program prog =
+            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
+
+        CL_CHECK((backend_ctx->kernel_mul_mv_q6_K_f32_flat = clCreateKernel(prog, "kernel_mul_mv_q6_K_f32_flat", &err), err));
+        CL_CHECK(clReleaseProgram(prog));
+        GGML_LOG_CONT(".");
+    }
+
     // mul_mv_q8_0_f32
     {
 #ifdef GGML_OPENCL_EMBED_KERNELS

@@ -1280,6 +1434,38 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
         GGML_LOG_CONT(".");
     }

+    // mul_mm_q4_0_f32_l4_lm
+    {
+#ifdef GGML_OPENCL_EMBED_KERNELS
+        const std::string kernel_src {
+            #include "mul_mm_q4_0_f32_l4_lm.cl.h"
+        };
+#else
+        const std::string kernel_src = read_file("mul_mm_q4_0_f32_l4_lm.cl");
+#endif
+        cl_program prog =
+            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
+
+        CL_CHECK((backend_ctx->kernel_mul_mm_q4_0_f32_l4_lm = clCreateKernel(prog, "kernel_mul_mm_q4_0_f32_l4_lm", &err), err));
+        GGML_LOG_CONT(".");
+    }
+
+    // mul_mm_q4_1_f32_l4_lm
+    {
+#ifdef GGML_OPENCL_EMBED_KERNELS
+        const std::string kernel_src {
+            #include "mul_mm_q4_1_f32_l4_lm.cl.h"
+        };
+#else
+        const std::string kernel_src = read_file("mul_mm_q4_1_f32_l4_lm.cl");
+#endif
+        cl_program prog =
+            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
+
+        CL_CHECK((backend_ctx->kernel_mul_mm_q4_1_f32_l4_lm = clCreateKernel(prog, "kernel_mul_mm_q4_1_f32_l4_lm", &err), err));
+        GGML_LOG_CONT(".");
+    }
+
     // mul_mm_q8_0_f32_l4_lm
     {
 #ifdef GGML_OPENCL_EMBED_KERNELS

@@ -1296,6 +1482,23 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
         GGML_LOG_CONT(".");
     }

+    // mul_mm_q6_k_f32_l4_lm
+    {
+#ifdef GGML_OPENCL_EMBED_KERNELS
+        const std::string kernel_src {
+            #include "mul_mm_q6_k_f32_l4_lm.cl.h"
+        };
+#else
+        const std::string kernel_src = read_file("mul_mm_q6_k_f32_l4_lm.cl");
+#endif
+        cl_program prog =
+            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
+
+        CL_CHECK((backend_ctx->kernel_mul_mm_q6_k_f32_l4_lm = clCreateKernel(prog, "kernel_mul_mm_q6_k_f32_l4_lm", &err), err));
+        CL_CHECK(clReleaseProgram(prog));
+        GGML_LOG_CONT(".");
+    }
+
     // mul_mm_f16_f32_kq_kqv
     {
 #ifdef GGML_OPENCL_EMBED_KERNELS

@@ -1384,6 +1587,23 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
         GGML_LOG_CONT(".");
     }

+    // l2_norm
+    {
+#ifdef GGML_OPENCL_EMBED_KERNELS
+        const std::string kernel_src {
+            #include "l2_norm.cl.h"
+        };
+#else
+        const std::string kernel_src = read_file("l2_norm.cl");
+#endif
+        cl_program prog =
+            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
+
+        CL_CHECK((backend_ctx->kernel_l2_norm_f32 = clCreateKernel(prog, "kernel_l2_norm_f32", &err), err));
+        CL_CHECK(clReleaseProgram(prog));
+        GGML_LOG_CONT(".");
+    }
+
     // rope
     {
 #ifdef GGML_OPENCL_EMBED_KERNELS

@@ -1416,10 +1636,12 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
 #else
         const std::string kernel_src = read_file("scale.cl");
 #endif
-
+        cl_program prog =
             build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);

-        CL_CHECK((backend_ctx->
+        CL_CHECK((backend_ctx->kernel_scale_f32 = clCreateKernel(prog, "kernel_scale_f32", &err), err));
+        CL_CHECK((backend_ctx->kernel_scale_f32_4 = clCreateKernel(prog, "kernel_scale_f32_4", &err), err));
+        CL_CHECK(clReleaseProgram(prog));
         GGML_LOG_CONT(".");
     }

@@ -1664,6 +1886,7 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
             build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);

         CL_CHECK((backend_ctx->kernel_mean_f32 = clCreateKernel(prog, "kernel_mean_f32", &err), err));
+        CL_CHECK((backend_ctx->kernel_mean_f32_4 = clCreateKernel(prog, "kernel_mean_f32_4", &err), err));

         CL_CHECK(clReleaseProgram(prog));
         GGML_LOG_CONT(".");

@@ -1701,7 +1924,26 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
             build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);

         CL_CHECK((backend_ctx->kernel_sum_rows_f32 = clCreateKernel(backend_ctx->program_sum_rows_f32, "kernel_sum_rows_f32", &err), err));
+        CL_CHECK((backend_ctx->kernel_sum_rows_f32_4 = clCreateKernel(backend_ctx->program_sum_rows_f32, "kernel_sum_rows_f32_4", &err), err));
+        GGML_LOG_CONT(".");
+    }
+
+    // cumsum
+    {
+#ifdef GGML_OPENCL_EMBED_KERNELS
+        const std::string kernel_src {
+            #include "cumsum.cl.h"
+        };
+#else
+        const std::string kernel_src = read_file("cumsum.cl");
+#endif
+        cl_program prog;
+        prog = build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
+
+        CL_CHECK((backend_ctx->kernel_cumsum_blk = clCreateKernel(prog, "kernel_cumsum_blk", &err), err));
+        CL_CHECK((backend_ctx->kernel_cumsum_add = clCreateKernel(prog, "kernel_cumsum_add", &err), err));
         GGML_LOG_CONT(".");
+        CL_CHECK(clReleaseProgram(prog));
     }

     // sigmoid
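Note: cumsum is loaded as two kernels, kernel_cumsum_blk and kernel_cumsum_add, which suggests the classic two-pass parallel prefix sum: scan fixed-size blocks independently, then add each block's carry to all later blocks. A CPU reference sketch of that scheme (illustrative, not the kernel source):

    #include <algorithm>
    #include <cstddef>
    #include <vector>

    void cumsum_two_pass(std::vector<float> & x, size_t blk = 256) {
        const size_t n = x.size();
        std::vector<float> carry;
        // Pass 1: inclusive scan inside each block (role of kernel_cumsum_blk).
        for (size_t b = 0; b < n; b += blk) {
            const size_t end = std::min(n, b + blk);
            for (size_t i = b + 1; i < end; ++i) x[i] += x[i - 1];
            carry.push_back(x[end - 1]); // block total
        }
        // Exclusive scan of the per-block totals (small, done once).
        float run = 0.0f;
        for (float & c : carry) { float t = c; c = run; run += t; }
        // Pass 2: add each block's offset to its elements (role of kernel_cumsum_add).
        for (size_t b = 0, k = 0; b < n; b += blk, ++k) {
            const size_t end = std::min(n, b + blk);
            for (size_t i = b; i < end; ++i) x[i] += carry[k];
        }
    }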
@@ -1747,16 +1989,11 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
 #else
         const std::string kernel_src = read_file("repeat.cl");
 #endif
-
-        backend_ctx->
-
-
-
-        } else {
-            GGML_LOG_WARN("ggml_opencl: repeat kernel source not found or empty. Repeat operations will not be available.\n");
-            backend_ctx->program_repeat = nullptr;
-            backend_ctx->kernel_repeat = nullptr;
-        }
+        cl_program prog =
+            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
+        CL_CHECK((backend_ctx->kernel_repeat_f32 = clCreateKernel(prog, "kernel_repeat_f32", &err), err));
+        CL_CHECK(clReleaseProgram(prog));
+        GGML_LOG_CONT(".");
     }

     // pad

@@ -1789,18 +2026,58 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
 #else
         const std::string kernel_src = read_file("tanh.cl");
 #endif
-
-        backend_ctx->
-
-
-
-
-
-
-
-
-
-
+        cl_program prog =
+            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
+        CL_CHECK((backend_ctx->kernel_tanh_f32 = clCreateKernel(prog, "kernel_tanh_f32", &err), err));
+        CL_CHECK((backend_ctx->kernel_tanh_f32_4 = clCreateKernel(prog, "kernel_tanh_f32_4", &err), err));
+        CL_CHECK((backend_ctx->kernel_tanh_f32_nc = clCreateKernel(prog, "kernel_tanh_f32_nc", &err), err));
+        CL_CHECK((backend_ctx->kernel_tanh_f16 = clCreateKernel(prog, "kernel_tanh_f16", &err), err));
+        CL_CHECK((backend_ctx->kernel_tanh_f16_4 = clCreateKernel(prog, "kernel_tanh_f16_4", &err), err));
+        CL_CHECK((backend_ctx->kernel_tanh_f16_nc = clCreateKernel(prog, "kernel_tanh_f16_nc", &err), err));
+        CL_CHECK(clReleaseProgram(prog));
+        GGML_LOG_CONT(".");
+    }
+
+    // neg
+    {
+#ifdef GGML_OPENCL_EMBED_KERNELS
+        const std::string kernel_src {
+            #include "neg.cl.h"
+        };
+#else
+        const std::string kernel_src = read_file("neg.cl");
+#endif
+        cl_program prog =
+            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
+        CL_CHECK((backend_ctx->kernel_neg_f32 = clCreateKernel(prog, "kernel_neg_f32", &err), err));
+        CL_CHECK((backend_ctx->kernel_neg_f32_4 = clCreateKernel(prog, "kernel_neg_f32_4", &err), err));
+        CL_CHECK((backend_ctx->kernel_neg_f32_nc = clCreateKernel(prog, "kernel_neg_f32_nc", &err), err));
+        CL_CHECK((backend_ctx->kernel_neg_f16 = clCreateKernel(prog, "kernel_neg_f16", &err), err));
+        CL_CHECK((backend_ctx->kernel_neg_f16_4 = clCreateKernel(prog, "kernel_neg_f16_4", &err), err));
+        CL_CHECK((backend_ctx->kernel_neg_f16_nc = clCreateKernel(prog, "kernel_neg_f16_nc", &err), err));
+        CL_CHECK(clReleaseProgram(prog));
+        GGML_LOG_CONT(".");
+    }
+
+    // exp
+    {
+#ifdef GGML_OPENCL_EMBED_KERNELS
+        const std::string kernel_src {
+            #include "exp.cl.h"
+        };
+#else
+        const std::string kernel_src = read_file("exp.cl");
+#endif
+        cl_program prog =
+            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
+        CL_CHECK((backend_ctx->kernel_exp_f32 = clCreateKernel(prog, "kernel_exp_f32", &err), err));
+        CL_CHECK((backend_ctx->kernel_exp_f32_4 = clCreateKernel(prog, "kernel_exp_f32_4", &err), err));
+        CL_CHECK((backend_ctx->kernel_exp_f32_nc = clCreateKernel(prog, "kernel_exp_f32_nc", &err), err));
+        CL_CHECK((backend_ctx->kernel_exp_f16 = clCreateKernel(prog, "kernel_exp_f16", &err), err));
+        CL_CHECK((backend_ctx->kernel_exp_f16_4 = clCreateKernel(prog, "kernel_exp_f16_4", &err), err));
+        CL_CHECK((backend_ctx->kernel_exp_f16_nc = clCreateKernel(prog, "kernel_exp_f16_nc", &err), err));
+        CL_CHECK(clReleaseProgram(prog));
+        GGML_LOG_CONT(".");
     }

     // expm1

@@ -1812,20 +2089,16 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
 #else
         const std::string kernel_src = read_file("expm1.cl");
 #endif
-        cl_program prog
-
-
-
-
-
-
-
-            GGML_LOG_WARN("ggml_opencl: expm1 kernel source not found or empty. Expm1 operation will not be available.\n");
-            prog = nullptr;
-            backend_ctx->kernel_expm1_f32_nd = nullptr;
-            backend_ctx->kernel_expm1_f16_nd = nullptr;
-        }
+        cl_program prog =
+            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
+        CL_CHECK((backend_ctx->kernel_expm1_f32 = clCreateKernel(prog, "kernel_expm1_f32", &err), err));
+        CL_CHECK((backend_ctx->kernel_expm1_f32_4 = clCreateKernel(prog, "kernel_expm1_f32_4", &err), err));
+        CL_CHECK((backend_ctx->kernel_expm1_f32_nc = clCreateKernel(prog, "kernel_expm1_f32_nc", &err), err));
+        CL_CHECK((backend_ctx->kernel_expm1_f16 = clCreateKernel(prog, "kernel_expm1_f16", &err), err));
+        CL_CHECK((backend_ctx->kernel_expm1_f16_4 = clCreateKernel(prog, "kernel_expm1_f16_4", &err), err));
+        CL_CHECK((backend_ctx->kernel_expm1_f16_nc = clCreateKernel(prog, "kernel_expm1_f16_nc", &err), err));
         CL_CHECK(clReleaseProgram(prog));
+        GGML_LOG_CONT(".");
     }

     // softplus

@@ -1837,20 +2110,16 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
 #else
         const std::string kernel_src = read_file("softplus.cl");
 #endif
-        cl_program prog
-
-
-
-
-
-
-
-            GGML_LOG_WARN("ggml_opencl: softplus kernel source not found or empty. Softplus operation will not be available.\n");
-            prog = nullptr;
-            backend_ctx->kernel_softplus_f32_nd = nullptr;
-            backend_ctx->kernel_softplus_f16_nd = nullptr;
-        }
+        cl_program prog =
+            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
+        CL_CHECK((backend_ctx->kernel_softplus_f32 = clCreateKernel(prog, "kernel_softplus_f32", &err), err));
+        CL_CHECK((backend_ctx->kernel_softplus_f32_4 = clCreateKernel(prog, "kernel_softplus_f32_4", &err), err));
+        CL_CHECK((backend_ctx->kernel_softplus_f32_nc = clCreateKernel(prog, "kernel_softplus_f32_nc", &err), err));
+        CL_CHECK((backend_ctx->kernel_softplus_f16 = clCreateKernel(prog, "kernel_softplus_f16", &err), err));
+        CL_CHECK((backend_ctx->kernel_softplus_f16_4 = clCreateKernel(prog, "kernel_softplus_f16_4", &err), err));
+        CL_CHECK((backend_ctx->kernel_softplus_f16_nc = clCreateKernel(prog, "kernel_softplus_f16_nc", &err), err));
         CL_CHECK(clReleaseProgram(prog));
+        GGML_LOG_CONT(".");
     }

     // upscale

@@ -1892,22 +2161,13 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
             #include "concat.cl.h"
         };
 #else
-
         const std::string kernel_src = read_file("concat.cl");
 #endif
-
-        backend_ctx->
-
-
-
-            CL_CHECK((backend_ctx->kernel_concat_f32_non_contiguous = clCreateKernel(backend_ctx->program_concat, "kernel_concat_f32_non_contiguous", &err), err));
-            GGML_LOG_CONT(".");
-        } else {
-            GGML_LOG_WARN("ggml_opencl: concat kernel source not found or empty. Concat operations will not be available.\n");
-            backend_ctx->program_concat = nullptr;
-            backend_ctx->kernel_concat_f32_contiguous = nullptr;
-            backend_ctx->kernel_concat_f32_non_contiguous = nullptr;
-        }
+        cl_program prog =
+            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
+        CL_CHECK((backend_ctx->kernel_concat_f32 = clCreateKernel(prog, "kernel_concat_f32", &err), err));
+        CL_CHECK(clReleaseProgram(prog));
+        GGML_LOG_CONT(".");
     }

     // timestep_embedding
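Note: each reworked unary op now registers six kernels: f32/f16, each in a plain, a _4, and an _nc variant. Judging by the names (an assumption, since the kernel sources are not in this hunk), _4 is a float4-vectorized path for contiguous data whose length is a multiple of 4, and _nc is a strided fallback for non-contiguous tensors. A sketch of what such a scalar/vector pair looks like, written as the embedded OpenCL C string the loader would compile (illustrative source, not from the package):

    static const char * k_exp_sketch = R"(
    kernel void kernel_exp_f32(global const float * src, global float * dst) {
        const size_t i = get_global_id(0);
        dst[i] = exp(src[i]);
    }
    kernel void kernel_exp_f32_4(global const float4 * src, global float4 * dst) {
        const size_t i = get_global_id(0); // one work-item covers 4 values
        dst[i] = exp(src[i]);              // exp() is overloaded for float4
    }
    )";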
@@ -2107,7 +2367,9 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
         CL_CHECK((backend_ctx->kernel_transpose_32_16 = clCreateKernel(backend_ctx->program_transpose, "kernel_transpose_32_16", &err), err));
         CL_CHECK((backend_ctx->kernel_transpose_32 = clCreateKernel(backend_ctx->program_transpose, "kernel_transpose_32", &err), err));
         CL_CHECK((backend_ctx->kernel_transpose_16 = clCreateKernel(backend_ctx->program_transpose, "kernel_transpose_16", &err), err));
+        CL_CHECK((backend_ctx->kernel_transpose_8_buf = clCreateKernel(backend_ctx->program_transpose, "kernel_transpose_8_buf", &err), err));
         CL_CHECK((backend_ctx->kernel_transpose_16_buf = clCreateKernel(backend_ctx->program_transpose, "kernel_transpose_16_buf", &err), err));
+        CL_CHECK((backend_ctx->kernel_transpose_32_buf = clCreateKernel(backend_ctx->program_transpose, "kernel_transpose_32_buf", &err), err));
         CL_CHECK((backend_ctx->kernel_transpose_16_4x1 = clCreateKernel(backend_ctx->program_transpose, "kernel_transpose_16_4x1", &err), err));
         GGML_LOG_CONT(".");
     }

@@ -2227,42 +2489,121 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
         GGML_LOG_CONT(".");
     }

-
-        " -cl-mad-enable "
-        " -cl-fast-relaxed-math";
-
-    // gemv_moe_mxfp4_f32
+    // gemm_noshuffle_q4_1_f32
     {
 #ifdef GGML_OPENCL_EMBED_KERNELS
         const std::string kernel_src {
-            #include "
-
+            #include "gemm_noshuffle_q4_1_f32.cl.h"
+        };
 #else
-        const std::string kernel_src = read_file("
+        const std::string kernel_src = read_file("gemm_noshuffle_q4_1_f32.cl");
 #endif
-        backend_ctx->
-
-
-        CL_CHECK((backend_ctx->kernel_gemv_moe_mxfp4_f32 = clCreateKernel(backend_ctx->program_gemv_moe_mxfp4_f32, "kernel_gemv_moe_mxfp4_f32", &err), err));
+        cl_program prog = build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
+        CL_CHECK((backend_ctx->kernel_gemm_noshuffle_q4_1_f32 = clCreateKernel(prog, "kernel_gemm_noshuffle_q4_1_f32", &err), err));
+        CL_CHECK(clReleaseProgram(prog));
         GGML_LOG_CONT(".");
     }

-    //
+    // gemv_noshuffle_q4_1_f32
     {
+        std::string CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
+            " -cl-mad-enable ";
+        if (backend_ctx->has_vector_subgroup_broadcast) {
+            CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
+        }
+
 #ifdef GGML_OPENCL_EMBED_KERNELS
         const std::string kernel_src {
-            #include "
+            #include "gemv_noshuffle_q4_1_f32.cl.h"
         };
 #else
-        const std::string kernel_src = read_file("
+        const std::string kernel_src = read_file("gemv_noshuffle_q4_1_f32.cl");
 #endif
-        backend_ctx->program_gemm_moe_mxfp4_f32 =
-            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), CL_moe_compile_opts);

-
+        cl_program prog = build_program_from_source(
+            backend_ctx->context, backend_ctx->device, kernel_src.c_str(), CL_gemv_compile_opts);
+
+        CL_CHECK((backend_ctx->kernel_gemv_noshuffle_q4_1_f32 = clCreateKernel(prog, "kernel_gemv_noshuffle_q4_1_f32", &err), err));
+        CL_CHECK(clReleaseProgram(prog));
         GGML_LOG_CONT(".");
     }
-
+
+    // mul_mm_q8_0_f32_8x4
+    {
+#ifdef GGML_OPENCL_EMBED_KERNELS
+        const std::string kernel_src_q8_8x4_gemm {
+            #include "mul_mm_q8_0_f32_8x4.cl.h"
+        };
+#else
+        const std::string kernel_src_q8_8x4_gemm = read_file("mul_mm_q8_0_f32_8x4.cl");
+#endif
+        backend_ctx->program_CL_gemm = build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src_q8_8x4_gemm.c_str(), compile_opts);
+        CL_CHECK((backend_ctx->kernel_mul_mm_q8_0_f32_8x4 = clCreateKernel(backend_ctx->program_CL_gemm, "kernel_mul_mm_q8_0_f32_8x4", &err), err));
+        GGML_LOG_CONT(".");
+    }
+
+    // gemv_noshuffle_general_q8_0_f32
+    {
+        std::string CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
+            " -cl-mad-enable "
+            " -DSIMDGROUP_WIDTH=" +
+            std::to_string(backend_ctx->adreno_wave_size);
+        if (backend_ctx->has_vector_subgroup_broadcast) {
+            CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
+        }
+
+#ifdef GGML_OPENCL_EMBED_KERNELS
+        const std::string kernel_src_CL_gemv_general {
+            #include "gemv_noshuffle_general_q8_0_f32.cl.h"
+        };
+#else
+        const std::string kernel_src_CL_gemv_general = read_file("gemv_noshuffle_general_q8_0_f32.cl");
+#endif
+
+        cl_program prog = build_program_from_source(
+            backend_ctx->context, backend_ctx->device, kernel_src_CL_gemv_general.c_str(), CL_gemv_compile_opts);
+
+        CL_CHECK((backend_ctx->CL_mul_mat_vec_q8_0_f32 = clCreateKernel(prog, "kernel_gemv_noshuffle_q8_0_f32", &err), err));
+        CL_CHECK(clReleaseProgram(prog));
+        GGML_LOG_CONT(".");
+    }
+
+    std::string CL_moe_compile_opts = std::string("-cl-std=") + opencl_c_std +
+        " -cl-mad-enable "
+        " -cl-fast-relaxed-math";
+
+    // gemv_moe_mxfp4_f32
+    {
+#ifdef GGML_OPENCL_EMBED_KERNELS
+        const std::string kernel_src {
+            #include "gemv_moe_mxfp4_f32.cl.h"
+        };
+#else
+        const std::string kernel_src = read_file("gemv_moe_mxfp4_f32.cl");
+#endif
+        backend_ctx->program_gemv_moe_mxfp4_f32 =
+            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), CL_moe_compile_opts);
+
+        CL_CHECK((backend_ctx->kernel_gemv_moe_mxfp4_f32 = clCreateKernel(backend_ctx->program_gemv_moe_mxfp4_f32, "kernel_gemv_moe_mxfp4_f32", &err), err));
+        GGML_LOG_CONT(".");
+    }
+
+    // gemm_moe_mxfp4_f32
+    {
+#ifdef GGML_OPENCL_EMBED_KERNELS
+        const std::string kernel_src {
+            #include "gemm_moe_mxfp4_f32.cl.h"
+        };
+#else
+        const std::string kernel_src = read_file("gemm_moe_mxfp4_f32.cl");
+#endif
+        backend_ctx->program_gemm_moe_mxfp4_f32 =
+            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), CL_moe_compile_opts);
+
+        CL_CHECK((backend_ctx->kernel_gemm_moe_mxfp4_f32 = clCreateKernel(backend_ctx->program_gemm_moe_mxfp4_f32, "kernel_gemm_moe_mxfp4_f32", &err), err));
+        GGML_LOG_CONT(".");
+    }
+#endif // GGML_OPENCL_USE_ADRENO_KERNELS
     GGML_LOG_CONT("\n");
 }

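Note: the Adreno-only blocks assemble per-program build options instead of reusing the shared compile_opts: the gemv paths add -cl-mad-enable plus a SIMDGROUP_WIDTH define sized to the wave size, and conditionally -DVECTOR_SUB_GROUP_BROADCAT (spelling as in the source); the MoE paths add -cl-fast-relaxed-math. The pattern, factored into a hypothetical helper for clarity:

    #include <string>

    static std::string make_gemv_opts(const std::string & opencl_c_std,
                                      int wave_size, bool vec_bcast) {
        std::string opts = "-cl-std=" + opencl_c_std +
                           " -cl-mad-enable"
                           " -DSIMDGROUP_WIDTH=" + std::to_string(wave_size);
        if (vec_bcast) {
            opts += " -DVECTOR_SUB_GROUP_BROADCAT";
        }
        return opts; // handed to clBuildProgram via build_program_from_source
    }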
@@ -2315,7 +2656,7 @@ static std::vector<ggml_backend_device> ggml_opencl_probe_devices(ggml_backend_r

     cl_platform_id platform_ids[NPLAT];
     if (clGetPlatformIDs(NPLAT, platform_ids, &n_platforms) != CL_SUCCESS) {
-        GGML_LOG_ERROR("ggml_opencl:
+        GGML_LOG_ERROR("ggml_opencl: platform IDs not available.\n");
         return found_devices;
     }

@@ -2621,6 +2962,9 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
     clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(size_t), &backend_ctx->max_alloc_size, NULL);
     GGML_LOG_INFO("ggml_opencl: max mem alloc size: %zu MB\n", backend_ctx->max_alloc_size/1024/1024);

+    clGetDeviceInfo(device, CL_DEVICE_IMAGE_MAX_BUFFER_SIZE, sizeof(size_t), &backend_ctx->image_max_buffer_size, NULL);
+    GGML_LOG_INFO("ggml_opencl: device max image buffer size (pixels): %lu\n", backend_ctx->image_max_buffer_size);
+
     clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &backend_ctx->max_workgroup_size, NULL);
     GGML_LOG_INFO("ggml_opencl: device max workgroup size: %lu\n", backend_ctx->max_workgroup_size);

@@ -2729,6 +3073,82 @@ static void ggml_cl2_free(ggml_backend_t backend) {
     }
 }

+#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
+static void transpose_2d(
+    ggml_backend_opencl_context * backend_ctx,
+    cl_kernel kernel,
+    cl_mem src, cl_mem dst, size_t size,
+    cl_int stride, cl_int rows,
+    bool blocking = true
+) {
+    static ggml_cl_buffer buf;
+
+    cl_event evt;
+    cl_int err;
+
+    buf.allocate(backend_ctx->context, size);
+
+    cl_mem trans;
+    cl_buffer_region region;
+
+    region.origin = 0;
+    region.size = size;
+    CL_CHECK((trans = clCreateSubBuffer(
+        buf.buffer, CL_MEM_READ_WRITE,
+        CL_BUFFER_CREATE_TYPE_REGION, &region, &err), err));
+
+    CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &src));
+    CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &trans));
+    CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_int), &stride));
+    CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_int), &rows));
+
+    size_t local_size[3] = {64, 1, 1};
+    size_t global_size[3] = {(size_t)stride, (size_t)rows, 1};;
+    CL_CHECK(clEnqueueNDRangeKernel(backend_ctx->queue, kernel, 3, NULL,
+        global_size, local_size, 0, NULL, NULL));
+
+    if (blocking) {
+        CL_CHECK(clEnqueueCopyBuffer(backend_ctx->queue, trans, dst, 0, 0, size, 0, NULL, &evt));
+        CL_CHECK(clWaitForEvents(1, &evt));
+        CL_CHECK(clReleaseEvent(evt));
+    } else {
+        CL_CHECK(clEnqueueCopyBuffer(backend_ctx->queue, trans, dst, 0, 0, size, 0, NULL, NULL));
+    }
+
+    CL_CHECK(clReleaseMemObject(trans));
+}
+
+static void transpose_2d_as_8b(
+    ggml_backend_opencl_context * backend_ctx,
+    cl_mem src, cl_mem dst, size_t size,
+    cl_int stride, cl_int rows,
+    bool blocking = true
+) {
+    transpose_2d(backend_ctx, backend_ctx->kernel_transpose_8_buf,
+        src, dst, size, stride, rows, blocking);
+}
+
+static void transpose_2d_as_16b(
+    ggml_backend_opencl_context * backend_ctx,
+    cl_mem src, cl_mem dst, size_t size,
+    cl_int stride, cl_int rows,
+    bool blocking = true
+) {
+    transpose_2d(backend_ctx, backend_ctx->kernel_transpose_16_buf,
+        src, dst, size, stride, rows, blocking);
+}
+
+static void transpose_2d_as_32b(
+    ggml_backend_opencl_context * backend_ctx,
+    cl_mem src, cl_mem dst, size_t size,
+    cl_int stride, cl_int rows,
+    bool blocking = true
+) {
+    transpose_2d(backend_ctx, backend_ctx->kernel_transpose_32_buf,
+        src, dst, size, stride, rows, blocking);
+}
+#endif // GGML_OPENCL_USE_ADRENO_KERNELS
+
 //------------------------------------------------------------------------------
 // Tensor extra management
 //------------------------------------------------------------------------------
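Note: transpose_2d enqueues the transpose into a scratch subbuffer and only afterwards copies the result into dst, so calling it with src == dst is safe. The Q4_1 upload path later in this diff relies on that to transpose its weight planes in place:

    // In-place transpose of an M x (K/4) plane of ushorts, as set_tensor does below.
    transpose_2d_as_16b(backend_ctx, extra->q, extra->q, size_q, /*stride=*/K / 4, /*rows=*/M);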
@@ -2796,6 +3216,59 @@ struct ggml_tensor_extra_cl_q4_0 {
     }
 };

+struct ggml_tensor_extra_cl_q4_1 {
+    // Quantized values.
+    cl_mem q = nullptr;
+    // Quantized values in image1d_buffer_t.
+    cl_mem q_img = nullptr;
+    // Scales.
+    cl_mem d = nullptr;
+    // Scales in image1d_buffer_t.
+    cl_mem d_img = nullptr;
+    // Min
+    cl_mem m = nullptr;
+    // Min in image1d_buffer_t.
+    cl_mem m_img = nullptr;
+    // Size of quantized values.
+    size_t size_q = 0;
+    // Size of scales.
+    size_t size_d = 0;
+    // Size of min values.
+    size_t size_m = 0;
+
+    ~ggml_tensor_extra_cl_q4_1() {
+        reset();
+    }
+
+    void reset() {
+        // q and d are subbuffers into the bigger buffer allocated in ggml_backend_buffer.
+        // They must be properly released so that the original buffer can be
+        // properly released to avoid memory leak.
+        if (q != nullptr) {
+            CL_CHECK(clReleaseMemObject(q));
+            q = nullptr;
+        }
+        if (d != nullptr) {
+            CL_CHECK(clReleaseMemObject(d));
+            d = nullptr;
+        }
+        if (m != nullptr) {
+            CL_CHECK(clReleaseMemObject(m));
+            m = nullptr;
+        }
+        // Currently, q_img and d_img are only initialized when SMALL_ALLOC is
+        // enabled. They point to the images in ggml_backend_opencl_buffer_context.
+        // So, there is no need to release them here.
+        // TODO: initialize them for non SMALL_PATH path, or remove them.
+        q_img = nullptr;
+        d_img = nullptr;
+        m_img = nullptr;
+        size_q = 0;
+        size_d = 0;
+        size_m = 0;
+    }
+};
+
 struct ggml_tensor_extra_cl_mxfp4 {
     // Quantized values.
     cl_mem q = nullptr;

@@ -2874,6 +3347,50 @@ struct ggml_tensor_extra_cl_q8_0 {
     }
 };

+struct ggml_tensor_extra_cl_q6_K {
+    // Lower 4 bits of quantized weights.
+    cl_mem ql = nullptr;
+    // Upper 2 bits of quantized weights.
+    cl_mem qh = nullptr;
+    // Scales for each block.
+    cl_mem s = nullptr;
+    // Scales for each super block.
+    cl_mem d = nullptr;
+
+    size_t size_ql = 0;
+    size_t size_qh = 0;
+    size_t size_s = 0;
+    size_t size_d = 0;
+
+    ~ggml_tensor_extra_cl_q6_K() {
+        reset();
+    }
+
+    void reset() {
+        if (ql != nullptr) {
+            CL_CHECK(clReleaseMemObject(ql));
+            ql = nullptr;
+        }
+        if (qh != nullptr) {
+            CL_CHECK(clReleaseMemObject(qh));
+            qh = nullptr;
+        }
+        if (s != nullptr) {
+            CL_CHECK(clReleaseMemObject(s));
+            s = nullptr;
+        }
+        if (d != nullptr) {
+            CL_CHECK(clReleaseMemObject(d));
+            d = nullptr;
+        }
+
+        size_ql = 0;
+        size_qh = 0;
+        size_s = 0;
+        size_d = 0;
+    }
+};
+
 //------------------------------------------------------------------------------
 // Backend API
 //------------------------------------------------------------------------------
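Note: the new q6_K extra mirrors ggml's Q6_K super-block, whose four arrays map one-to-one onto the ql/qh/s/d buffers. For reference, the layout from ggml-common.h (a sketch for orientation, not part of this diff):

    #define QK_K 256
    typedef struct {
        uint8_t   ql[QK_K / 2];      // lower 4 bits of the 6-bit quants
        uint8_t   qh[QK_K / 4];      // upper 2 bits, four per byte
        int8_t    scales[QK_K / 16]; // one 8-bit scale per 16 weights
        ggml_half d;                 // super-block scale
    } block_q6_K;                    // 128 + 64 + 16 + 2 = 210 bytes per 256 weights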
@@ -2923,7 +3440,7 @@ static void ggml_backend_opencl_synchronize(ggml_backend_t backend) {
     CL_CHECK(clReleaseEvent(evt));
 }

-//
+// Synchronizes the 'backend_ctx's device with others so that commands
 // enqueued to it won't start until commands in the other devices have
 // completed.
 static void sync_with_other_backends(ggml_backend_opencl_context * backend_ctx) {

@@ -3040,6 +3557,10 @@ static ggml_status ggml_backend_opencl_graph_compute(ggml_backend_t backend, ggm
             continue;
         }

+        if ((node->flags & GGML_TENSOR_FLAG_COMPUTE) == 0) {
+            continue;
+        }
+
         if (!backend_ctx->disable_fusion && ggml_opencl_can_fuse(cgraph, i, { GGML_OP_NORM, GGML_OP_MUL, GGML_OP_ADD })) {
             ggml_opencl_op_norm_fused(backend, node, cgraph->nodes[i+1], cgraph->nodes[i+2]);
             i += 2;
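Note: the added flag check skips graph nodes that are not marked for computation, and it sits in front of the existing fusion fast path. The surrounding loop structure, reduced to a sketch (the predicate and emitter names are hypothetical stand-ins for the diff's helpers):

    for (int i = 0; i < cgraph->n_nodes; ++i) {
        ggml_tensor * node = cgraph->nodes[i];
        if ((node->flags & GGML_TENSOR_FLAG_COMPUTE) == 0) {
            continue; // nothing to run for this node
        }
        if (can_fuse_norm_mul_add(cgraph, i)) { // hypothetical predicate
            emit_fused_norm_mul_add(cgraph, i); // hypothetical emitter
            i += 2; // skip the MUL and ADD absorbed into the fused kernel
            continue;
        }
        emit_single_op(node); // hypothetical fallback
    }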
@@ -3124,9 +3645,21 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
                 default:
                     return false;
             }
+            case GGML_TYPE_I32:
+                switch (op->type) {
+                    case GGML_TYPE_I32:
+                        return true;
+                    default:
+                        return false;
+                }
             default:
                 return false;
         }
+        case GGML_OP_SET: {
+            return (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_I32) &&
+                   op->type == op->src[0]->type &&
+                   op->type == op->src[1]->type;
+        }
         case GGML_OP_SCALE:
             return op->src[0]->type == GGML_TYPE_F32 && ggml_is_contiguous(op->src[0]);
         case GGML_OP_ADD:

@@ -3160,14 +3693,13 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
             case GGML_UNARY_OP_SIGMOID:
                 return ggml_is_contiguous(op->src[0]);
             case GGML_UNARY_OP_TANH:
-
-
+            case GGML_UNARY_OP_NEG:
+            case GGML_UNARY_OP_EXP:
+                return op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16;
             case GGML_UNARY_OP_EXPM1:
-                return
-                    (op->src[0]->type == GGML_TYPE_F16 && op->type == GGML_TYPE_F16);
+                return op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16;
             case GGML_UNARY_OP_SOFTPLUS:
-                return
-                    (op->src[0]->type == GGML_TYPE_F16 && op->type == GGML_TYPE_F16);
+                return op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16;
             default:
                 return false;
             }

@@ -3183,6 +3715,8 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
             default:
                 return false;
         }
+        case GGML_OP_TRI:
+            return op->type == GGML_TYPE_F32 && ggml_is_contiguous(op);
         case GGML_OP_FILL:
            return op->type == GGML_TYPE_F32 && ggml_is_contiguous(op);
         case GGML_OP_CLAMP:

@@ -3192,6 +3726,8 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
             return true;
         case GGML_OP_RMS_NORM:
             return op->ne[0] % 4 == 0 && ggml_is_contiguous_rows(op->src[0]);
+        case GGML_OP_L2_NORM:
+            return ggml_is_contiguous_rows(op->src[0]);
         case GGML_OP_REPEAT:
             return op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32; // Assuming F32 for now, can be expanded
         case GGML_OP_PAD:

@@ -3223,7 +3759,9 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
                 return true;
             } else if (op->src[0]->type == GGML_TYPE_F32) {
                 return op->src[1]->type == GGML_TYPE_F32;
-            } else if (op->src[0]->type == GGML_TYPE_Q4_0
+            } else if (op->src[0]->type == GGML_TYPE_Q4_0 || op->src[0]->type == GGML_TYPE_Q4_1 ||
+                       op->src[0]->type == GGML_TYPE_MXFP4 ||
+                       op->src[0]->type == GGML_TYPE_Q4_K ||
                        op->src[0]->type == GGML_TYPE_Q6_K) {
                 return op->src[1]->type == GGML_TYPE_F32 && ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]);
             } else if (op->src[0]->type == GGML_TYPE_Q8_0) {

@@ -3244,6 +3782,8 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
         case GGML_OP_PERMUTE:
         case GGML_OP_TRANSPOSE:
             return true;
+        case GGML_OP_DIAG:
+            return true;
         case GGML_OP_DIAG_MASK_INF:
             return op->ne[3] == 1;
         case GGML_OP_ROPE: {

@@ -3266,6 +3806,8 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
             }
             return true;
         }
+        case GGML_OP_SOLVE_TRI:
+            return op->src[0]->type == GGML_TYPE_F32 && ggml_is_contiguous(op->src[0]);
         case GGML_OP_IM2COL:
             return true;
         case GGML_OP_ARGSORT: {

@@ -3280,8 +3822,10 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
             return cols <= max_workgroup_size && op->src[0]->type == GGML_TYPE_F32;
         }
         case GGML_OP_SUM_ROWS:
-        case
+        case GGML_OP_CUMSUM:
             return op->src[0]->type == GGML_TYPE_F32 && ggml_is_contiguous(op->src[0]);
+        case GGML_OP_MEAN:
+            return op->src[0]->type == GGML_TYPE_F32;
         case GGML_OP_FLASH_ATTN_EXT:
         {
             const ggml_tensor * q = op->src[0];
@@ -3412,6 +3956,12 @@ struct ggml_backend_opencl_buffer_context {
         for (ggml_tensor_extra_cl_q8_0 * e : temp_tensor_extras_q8_0_in_use) {
             delete e;
         }
+        for (ggml_tensor_extra_cl_q6_K * e : temp_tensor_extras_q6_K) {
+            delete e;
+        }
+        for (ggml_tensor_extra_cl_q6_K * e : temp_tensor_extras_q6_K_in_use) {
+            delete e;
+        }
     }

     ggml_tensor_extra_cl * ggml_opencl_alloc_temp_tensor_extra() {

@@ -3444,6 +3994,21 @@ struct ggml_backend_opencl_buffer_context {
         return extra;
     }

+    ggml_tensor_extra_cl_q4_1 * ggml_opencl_alloc_temp_tensor_extra_q4_1() {
+        ggml_tensor_extra_cl_q4_1 * extra;
+        if (temp_tensor_extras_q4_1.empty()) {
+            extra = new ggml_tensor_extra_cl_q4_1();
+        } else {
+            extra = temp_tensor_extras_q4_1.back();
+            temp_tensor_extras_q4_1.pop_back();
+        }
+
+        temp_tensor_extras_q4_1_in_use.push_back(extra);
+
+        extra->reset();
+        return extra;
+    }
+
     ggml_tensor_extra_cl_mxfp4 * ggml_opencl_alloc_temp_tensor_extra_mxfp4() {
         ggml_tensor_extra_cl_mxfp4 * extra;
         if (temp_tensor_extras_mxfp4.empty()) {

@@ -3474,6 +4039,21 @@ struct ggml_backend_opencl_buffer_context {
         return extra;
     }

+    ggml_tensor_extra_cl_q6_K * ggml_opencl_alloc_temp_tensor_extra_q6_K() {
+        ggml_tensor_extra_cl_q6_K * extra;
+        if (temp_tensor_extras_q6_K.empty()) {
+            extra = new ggml_tensor_extra_cl_q6_K();
+        } else {
+            extra = temp_tensor_extras_q6_K.back();
+            temp_tensor_extras_q6_K.pop_back();
+        }
+
+        temp_tensor_extras_q6_K_in_use.push_back(extra);
+
+        extra->reset();
+        return extra;
+    }
+
     void reset() {
         for (ggml_tensor_extra_cl * e : temp_tensor_extras_in_use) {
             temp_tensor_extras.push_back(e);

@@ -3485,6 +4065,11 @@ struct ggml_backend_opencl_buffer_context {
         }
         temp_tensor_extras_q4_0_in_use.clear();

+        for (ggml_tensor_extra_cl_q4_1 * e : temp_tensor_extras_q4_1_in_use) {
+            temp_tensor_extras_q4_1.push_back(e);
+        }
+        temp_tensor_extras_q4_1_in_use.clear();
+
         for (ggml_tensor_extra_cl_mxfp4 * e : temp_tensor_extras_mxfp4_in_use) {
             temp_tensor_extras_mxfp4.push_back(e);
         }

@@ -3494,6 +4079,11 @@ struct ggml_backend_opencl_buffer_context {
             temp_tensor_extras_q8_0.push_back(e);
         }
         temp_tensor_extras_q8_0_in_use.clear();
+
+        for (ggml_tensor_extra_cl_q6_K * e : temp_tensor_extras_q6_K_in_use) {
+            temp_tensor_extras_q6_K.push_back(e);
+        }
+        temp_tensor_extras_q6_K_in_use.clear();
     }

     // Pools for extras. Available extras are in `temp_tensor_extras`. Extras

@@ -3505,14 +4095,18 @@ struct ggml_backend_opencl_buffer_context {
     std::vector<ggml_tensor_extra_cl *> temp_tensor_extras_in_use;
     std::vector<ggml_tensor_extra_cl_q4_0 *> temp_tensor_extras_q4_0;
     std::vector<ggml_tensor_extra_cl_q4_0 *> temp_tensor_extras_q4_0_in_use;
+    std::vector<ggml_tensor_extra_cl_q4_1 *> temp_tensor_extras_q4_1;
+    std::vector<ggml_tensor_extra_cl_q4_1 *> temp_tensor_extras_q4_1_in_use;
     std::vector<ggml_tensor_extra_cl_mxfp4 *> temp_tensor_extras_mxfp4;
     std::vector<ggml_tensor_extra_cl_mxfp4 *> temp_tensor_extras_mxfp4_in_use;
     std::vector<ggml_tensor_extra_cl_q8_0 *> temp_tensor_extras_q8_0;
    std::vector<ggml_tensor_extra_cl_q8_0 *> temp_tensor_extras_q8_0_in_use;
+    std::vector<ggml_tensor_extra_cl_q6_K *> temp_tensor_extras_q6_K;
+    std::vector<ggml_tensor_extra_cl_q6_K *> temp_tensor_extras_q6_K_in_use;

     // The buffer_context is initially created by ggml_backend_buft_alloc_buffer
     // before any tensor is initialized (at the beginning of alloc_tensor_range).
-    // Hence, there is
+    // Hence, there is always a buffer object in this vector. When each tensor is
     // being initialized, this original buffer object will be released if both
     // flattening and small allocation are enabled, and additional buffer
     // objects will be created in init_tensor to represent flattened quantized
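Note: the q4_1 and q6_K additions instantiate, by hand, the same two-vector pool already used for the other quantized extras: acquire pops from a free list (or allocates), parks the object on an in-use list, and the reset() loops return everything to the free list between graph evaluations. The generic shape of the pattern (illustrative; the diff does not use a template):

    #include <vector>

    template <typename T> // T needs a reset() method, like the extras above
    struct extra_pool {
        std::vector<T *> free_list, in_use;

        T * acquire() {
            T * e;
            if (free_list.empty()) {
                e = new T();
            } else {
                e = free_list.back();
                free_list.pop_back();
            }
            in_use.push_back(e);
            e->reset();
            return e;
        }
        void recycle() { // return everything to the pool for reuse
            for (T * e : in_use) free_list.push_back(e);
            in_use.clear();
        }
    };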
@@ -3550,7 +4144,7 @@ static enum ggml_status ggml_backend_opencl_buffer_init_tensor(ggml_backend_buff
         // Reuse extra of the parent tensor. The offset of this view tensor
         // becomes `extra->offset + view_offs` and needs to be calculated when
         // it is used. This changes is needed because of the change to
-        // ggml_alloc.c in https://github.com/
+        // ggml_alloc.c in https://github.com/ggml-org/llama.cpp/pull/7640.
         // `buffer` passed in here will always be `tensor->buffer`. It is OK
         // to allocate extras from the same buffer context for ordinary
         // intermediate tensors. But for views into kv cache tensors, doing so

@@ -3599,6 +4193,15 @@ inline bool use_adreno_moe_kernels(const ggml_backend_opencl_context *backend_ct
     return ((strstr(tensor->name, "ffn") != NULL) || (strstr(tensor->name, "as") != NULL)) && (ne01 % 64 == 0);
 }

+inline bool enable_adreno_trans_weight(const ggml_backend_opencl_context *backend_ctx, const ggml_tensor *tensor) {
+
+    bool adreno_kernel = use_adreno_kernels(backend_ctx, tensor);
+
+    size_t elem_num = tensor->ne[0] * tensor->ne[1] * tensor->ne[2] * tensor->ne[3];
+
+    return ((elem_num < 128 * 1024 * 1024) && adreno_kernel); // max element num: 2**27
+}
+
 static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
     ggml_backend_opencl_context *backend_ctx = ggml_cl2_init(buffer->buft->device);

@@ -3638,7 +4241,7 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
     //GGML_ASSERT(offset == 0);

     // We create subbuffers from the original tensor buffer for scales and
-    // quants - i.e., scales and quants are aliases into the buffer
+    // quants - i.e., scales and quants are aliases into the buffer object
     // that backs the original tensor. This is a cleaner way to adapt to the
     // new memory management.
     // In the old code, we allocate new buffers for scales and quants

@@ -3863,17 +4466,18 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
         return;

     }
-    if (tensor->type ==
+    if (tensor->type == GGML_TYPE_Q4_1) {
         ggml_tensor_extra_cl * extra_orig = (ggml_tensor_extra_cl *)tensor->extra;
         GGML_ASSERT(extra_orig && "Tesnors in OpenCL backend should have been allocated and initialized");

         // Allocate the new extra and create aliases from the original.
         ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
-
+        ggml_tensor_extra_cl_q4_1 * extra = ctx->ggml_opencl_alloc_temp_tensor_extra_q4_1();

-        size_t
+        size_t size_d = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*sizeof(ggml_fp16_t);
+        size_t size_m = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*sizeof(ggml_fp16_t);
         size_t size_q = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*ggml_blck_size(tensor->type)/2;
-        GGML_ASSERT(
+        GGML_ASSERT(size_d + size_m + size_q == ggml_nbytes(tensor) && "Incorrect tensor size");

         cl_int err;
         cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE,
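Note: the restored size arithmetic is consistent with the 20-byte Q4_1 block. Worked check of the GGML_ASSERT above for a hypothetical 32768-element tensor (block size 32, so 1024 blocks):

    // size_d = 1024 * sizeof(ggml_fp16_t) = 2048 bytes  (one scale per block)
    // size_m = 1024 * sizeof(ggml_fp16_t) = 2048 bytes  (one min per block)
    // size_q = 32768 / 2                  = 16384 bytes (two 4-bit quants per byte)
    // total  = 20480 bytes = 1024 * 20 = ggml_nbytes(tensor)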
@@ -3883,83 +4487,175 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
             queue, data_device, CL_TRUE, 0,
             ggml_nbytes(tensor), data, 0, NULL, NULL));

-        // The original tensor memory is divided into scales and quants, i.e.,
-        // we first store scales, then quants.
         cl_buffer_region region;

+        // The original tensor memory is divided into scales and quants, i.e.,
+        // we first store scales, mins, then quants.
         // Create subbuffer for scales.
         region.origin = align_to(extra_orig->offset + tensor->view_offs + offset, backend_ctx->alignment);
-        region.size =
-        extra->
+        region.size = size_d;
+        extra->d = clCreateSubBuffer(
             extra_orig->data_device, CL_MEM_READ_WRITE,
             CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
         CL_CHECK(err);
         auto previous_origin = region.origin;

+        // Create subbuffer for mins.
+        region.origin = align_to(previous_origin + size_d, backend_ctx->alignment);
+        region.size = size_m;
+        extra->m = clCreateSubBuffer(
+            extra_orig->data_device, CL_MEM_READ_WRITE,
+            CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
+        CL_CHECK(err);
+        previous_origin = region.origin;
+
         // Create subbuffer for quants.
-        region.origin = align_to(previous_origin +
+        region.origin = align_to(previous_origin + size_m, backend_ctx->alignment);
         region.size = size_q;
         extra->q = clCreateSubBuffer(
             extra_orig->data_device, CL_MEM_READ_WRITE,
             CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
         CL_CHECK(err);

-#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
-
-        cl_kernel kernel = backend_ctx->kernel_convert_block_mxfp4_trans;
-
-        int ne00 = tensor->ne[0];
-        int ne01 = tensor->ne[1];
-        int ne02 = tensor->ne[2];
-        CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_device));
-        CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->q));
-        CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->e));
-        CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &ne00));
-        CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne01));
-
-        size_t global_work_size[3] = {static_cast<size_t>(((ne01 + 63) / 64) * 64), static_cast<size_t>(ne00 / 32), static_cast<size_t>(ne02)};
-        size_t local_work_size[3] = {64, 2, 1};
-
-        cl_event evt;
-        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
-        CL_CHECK(clWaitForEvents(1, &evt));
-        CL_CHECK(clReleaseMemObject(data_device));
-        tensor->extra = extra;
+#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
+        cl_kernel kernel = backend_ctx->kernel_convert_block_q4_1;

-
+        if (use_adreno_kernels(backend_ctx, tensor)) {
+            kernel = backend_ctx->kernel_convert_block_q4_1_noshuffle;
         }
-#
-        cl_kernel kernel = backend_ctx->
-
+#else
+        cl_kernel kernel = backend_ctx->kernel_convert_block_q4_1;
+#endif // GGML_OPENCL_USE_ADRENO_KERNELS
         CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_device));
         CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->q));
-        CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->
+        CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->d));
+        CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra->m));

-        size_t global_work_size[
-        size_t local_work_size[
+        size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
+        size_t local_work_size[] = {64, 1, 1};

         cl_event evt;
         CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
         CL_CHECK(clWaitForEvents(1, &evt));
         CL_CHECK(clReleaseMemObject(data_device));

-        // Create image for Q
-        cl_image_format img_format_q = {CL_RG, CL_UNSIGNED_INT32};
-        cl_image_desc img_desc_q = {
-            CL_MEM_OBJECT_IMAGE1D_BUFFER,
-            static_cast<size_t>(ggml_nelements(tensor)/32*2),
-            0, 0, 0, 0, 0, 0, 0,
-            { extra->q }
-        };
-        extra->q_img = clCreateImage(context, CL_MEM_READ_ONLY, &img_format_q, &img_desc_q, NULL, &err);
         tensor->extra = extra;

-
-
-
-
-
-
+#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
+        if (use_adreno_kernels(backend_ctx, tensor)) {
+
+            int M = tensor->ne[1];
+            int K = tensor->ne[0];
+
+            GGML_ASSERT(K % 32 == 0);
+
+            // Transpose q as ushort
+            transpose_2d_as_16b(backend_ctx, extra->q, extra->q, size_q, K/4, M);
+            // Transpose d as ushort
+            transpose_2d_as_16b(backend_ctx, extra->d, extra->d, size_d, K/32, M);
+            // Transpose m as ushort
+            transpose_2d_as_16b(backend_ctx, extra->m, extra->m, size_m, K/32, M);
+        }
+#endif // GGML_OPENCL_USE_ADRENO_KERNELS
+        return;
+    }
+    if (tensor->type == GGML_TYPE_MXFP4) {
+        ggml_tensor_extra_cl * extra_orig = (ggml_tensor_extra_cl *)tensor->extra;
+        GGML_ASSERT(extra_orig && "Tesnors in OpenCL backend should have been allocated and initialized");
+
+        // Allocate the new extra and create aliases from the original.
+        ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
+        ggml_tensor_extra_cl_mxfp4 * extra = ctx->ggml_opencl_alloc_temp_tensor_extra_mxfp4();
+
+        size_t size_e = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*sizeof(char);
+        size_t size_q = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*ggml_blck_size(tensor->type)/2;
+        GGML_ASSERT(size_e + size_q == ggml_nbytes(tensor) && "Incorrect tensor size");
+
+        cl_int err;
+        cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE,
+            ggml_nbytes(tensor), NULL, &err);
+        CL_CHECK(err);
+        CL_CHECK(clEnqueueWriteBuffer(
+            queue, data_device, CL_TRUE, 0,
+            ggml_nbytes(tensor), data, 0, NULL, NULL));
+
+        // The original tensor memory is divided into scales and quants, i.e.,
+        // we first store scales, then quants.
+        cl_buffer_region region;
+
+        // Create subbuffer for scales.
+        region.origin = align_to(extra_orig->offset + tensor->view_offs + offset, backend_ctx->alignment);
+        region.size = size_e;
+        extra->e = clCreateSubBuffer(
+            extra_orig->data_device, CL_MEM_READ_WRITE,
+            CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
+        CL_CHECK(err);
+        auto previous_origin = region.origin;
+
+        // Create subbuffer for quants.
+        region.origin = align_to(previous_origin + size_e, backend_ctx->alignment);
+        region.size = size_q;
+        extra->q = clCreateSubBuffer(
+            extra_orig->data_device, CL_MEM_READ_WRITE,
+            CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
+        CL_CHECK(err);
+
+#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
+        if (use_adreno_moe_kernels(backend_ctx, tensor)) {
+            cl_kernel kernel = backend_ctx->kernel_convert_block_mxfp4_trans;
+
+            int ne00 = tensor->ne[0];
|
|
4608
|
+
int ne01 = tensor->ne[1];
|
|
4609
|
+
int ne02 = tensor->ne[2];
|
|
4610
|
+
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_device));
|
|
4611
|
+
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->q));
|
|
4612
|
+
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->e));
|
|
4613
|
+
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &ne00));
|
|
4614
|
+
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne01));
|
|
4615
|
+
|
|
4616
|
+
size_t global_work_size[3] = {static_cast<size_t>(((ne01 + 63) / 64) * 64), static_cast<size_t>(ne00 / 32), static_cast<size_t>(ne02)};
|
|
4617
|
+
size_t local_work_size[3] = {64, 2, 1};
|
|
4618
|
+
|
|
4619
|
+
cl_event evt;
|
|
4620
|
+
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
|
|
4621
|
+
CL_CHECK(clWaitForEvents(1, &evt));
|
|
4622
|
+
CL_CHECK(clReleaseMemObject(data_device));
|
|
4623
|
+
tensor->extra = extra;
|
|
4624
|
+
|
|
4625
|
+
return;
|
|
4626
|
+
}
|
|
4627
|
+
#endif
|
|
4628
|
+
cl_kernel kernel = backend_ctx->kernel_convert_block_mxfp4;
|
|
4629
|
+
|
|
4630
|
+
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_device));
|
|
4631
|
+
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->q));
|
|
4632
|
+
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->e));
|
|
4633
|
+
|
|
4634
|
+
size_t global_work_size[3] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
|
|
4635
|
+
size_t local_work_size[3] = {64, 1, 1};
|
|
4636
|
+
|
|
4637
|
+
cl_event evt;
|
|
4638
|
+
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
|
|
4639
|
+
CL_CHECK(clWaitForEvents(1, &evt));
|
|
4640
|
+
CL_CHECK(clReleaseMemObject(data_device));
|
|
4641
|
+
|
|
4642
|
+
// Create image for Q
|
|
4643
|
+
cl_image_format img_format_q = {CL_RG, CL_UNSIGNED_INT32};
|
|
4644
|
+
cl_image_desc img_desc_q = {
|
|
4645
|
+
CL_MEM_OBJECT_IMAGE1D_BUFFER,
|
|
4646
|
+
static_cast<size_t>(ggml_nelements(tensor)/32*2),
|
|
4647
|
+
0, 0, 0, 0, 0, 0, 0,
|
|
4648
|
+
{ extra->q }
|
|
4649
|
+
};
|
|
4650
|
+
extra->q_img = clCreateImage(context, CL_MEM_READ_ONLY, &img_format_q, &img_desc_q, NULL, &err);
|
|
4651
|
+
tensor->extra = extra;
|
|
4652
|
+
|
|
4653
|
+
return;
|
|
4654
|
+
}
|
|
4655
|
+
if (tensor->type == GGML_TYPE_Q8_0) {
|
|
4656
|
+
ggml_tensor_extra_cl * extra_orig = (ggml_tensor_extra_cl *)tensor->extra;
|
|
4657
|
+
GGML_ASSERT(extra_orig && "Tesnors in OpenCL backend should have been allocated and initialized");
|
|
4658
|
+
|
|
3963
4659
|
// Allocate the new extra and create aliases from the original.
|
|
3964
4660
|
ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
|
|
3965
4661
|
ggml_tensor_extra_cl_q8_0 * extra = ctx->ggml_opencl_alloc_temp_tensor_extra_q8_0();
|
|
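The carving seen above for Q4_1 and MXFP4 (and, below, for Q6_K) follows one pattern: each quantization component becomes its own aligned sub-buffer of the parent allocation, with the next origin computed from the previous one. A minimal sketch of that idea, assuming align_to rounds an offset up to the next multiple of the device's sub-buffer alignment (a power of two):

    #include <cstddef>

    // Round `offset` up to the next multiple of `alignment` (power of two),
    // mirroring what the backend's align_to helper is assumed to do.
    static size_t align_to(size_t offset, size_t alignment) {
        return (offset + alignment - 1) & ~(alignment - 1);
    }

    // Chain previous_origin + size through align_to the way the set_tensor
    // path above computes each clCreateSubBuffer region.origin.
    static void carve_regions(size_t base, size_t alignment,
                              const size_t * sizes, size_t * origins, int n) {
        size_t origin = align_to(base, alignment);
        for (int i = 0; i < n; ++i) {
            origins[i] = origin;
            origin = align_to(origin + sizes[i], alignment);
        }
    }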
@@ -4013,6 +4709,216 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,

          tensor->extra = extra;

+         // Transpose the weights and scales
+#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
+         if (enable_adreno_trans_weight(backend_ctx, tensor)) {
+
+             int M = tensor->ne[1]; // ne01
+             int K = tensor->ne[0]; // ne00
+
+             GGML_ASSERT(K % 32 == 0);
+             GGML_ASSERT(M % 4 == 0);
+             GGML_ASSERT(tensor->ne[2] == 1);
+             GGML_ASSERT(tensor->ne[3] == 1);
+
+             // Transpose weights
+             size_t q_size_bytes = K * M / 4 * sizeof(float);
+             cl_buffer_region region;
+             region.origin = 0;
+             region.size = q_size_bytes;
+             cl_mem qT_d = clCreateSubBuffer(
+                 backend_ctx->prealloc_quant_trans.buffer,
+                 0,
+                 CL_BUFFER_CREATE_TYPE_REGION,
+                 &region,
+                 &err);
+             CL_CHECK(err);
+
+             cl_mem q_d_image1D;
+             cl_mem qT_d_image1D;
+
+             cl_image_format img_fmt_1d;
+             cl_image_desc img_desc_1d;
+
+             img_fmt_1d = { CL_RGBA, CL_FLOAT };
+             memset(&img_desc_1d, 0, sizeof(img_desc_1d));
+             img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
+             img_desc_1d.image_width = M * K / 4 / 4;
+             img_desc_1d.buffer = extra->q;
+             q_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err);
+             CL_CHECK(err);
+
+             img_fmt_1d = { CL_RGBA, CL_FLOAT };
+             memset(&img_desc_1d, 0, sizeof(img_desc_1d));
+             img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
+             img_desc_1d.image_width = M * K / 4 / 4;
+             img_desc_1d.buffer = qT_d;
+             qT_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err);
+             CL_CHECK(err);
+
+             int height_q = M / 4;
+             int width_q = K / 4 / 4;
+             kernel = backend_ctx->kernel_transpose_32;
+
+             CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &q_d_image1D));
+             CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &qT_d_image1D));
+             CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), &height_q));
+             CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &width_q));
+
+             size_t local_size_q[3] = {4, 16, 1};
+             size_t global_size_q[3] = {static_cast<size_t>(width_q), static_cast<size_t>(height_q), 1};
+             CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_size_q, local_size_q, 0, NULL, &evt));
+             CL_CHECK(clWaitForEvents(1, &evt));
+
+             // Transpose scales
+             size_t d_size_bytes = M * (K / 32) * 2;
+             region.origin = 0;
+             region.size = d_size_bytes;
+             cl_mem dT_d = clCreateSubBuffer(
+                 backend_ctx->prealloc_scales_trans.buffer,
+                 0,
+                 CL_BUFFER_CREATE_TYPE_REGION,
+                 &region,
+                 &err);
+             CL_CHECK(err);
+
+             cl_mem d_d_image1D;
+             cl_mem dT_d_image1D;
+
+             memset(&img_desc_1d, 0, sizeof(img_desc_1d));
+             img_fmt_1d = { CL_R, CL_HALF_FLOAT };
+             img_desc_1d.image_width = M * K / 32;
+             img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
+             img_desc_1d.buffer = extra->d;
+             d_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err);
+             CL_CHECK(err);
+
+             img_fmt_1d = { CL_RGBA, CL_HALF_FLOAT };
+             memset(&img_desc_1d, 0, sizeof(img_desc_1d));
+             img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
+             img_desc_1d.image_width = M * K / 32 / 4;
+             img_desc_1d.buffer = dT_d;
+             dT_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err);
+             CL_CHECK(err);
+
+             int height_s = M / 4;
+             int width_s = K / 32;
+
+             kernel = backend_ctx->kernel_transpose_16_4x1;
+
+             CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_d_image1D));
+             CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &dT_d_image1D));
+             CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), &height_s));
+             CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &width_s));
+
+             size_t local_size_s[3] = {4, 16, 1};
+             size_t global_size_s[3] = {static_cast<size_t>(width_s), static_cast<size_t>(height_s), 1};
+             CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_size_s, local_size_s, 0, NULL, &evt));
+             CL_CHECK(clWaitForEvents(1, &evt));
+
+             // copy transposed buffer contents to original buffers
+             CL_CHECK(clEnqueueCopyBuffer(queue, qT_d, extra->q, 0, 0, q_size_bytes, 0, NULL, &evt));
+             CL_CHECK(clWaitForEvents(1, &evt));
+
+             CL_CHECK(clEnqueueCopyBuffer(queue, dT_d, extra->d, 0, 0, d_size_bytes, 0, NULL, &evt));
+             CL_CHECK(clWaitForEvents(1, &evt));
+
+             CL_CHECK(clReleaseMemObject(qT_d));
+             CL_CHECK(clReleaseMemObject(dT_d));
+
+             CL_CHECK(clReleaseMemObject(q_d_image1D));
+             CL_CHECK(clReleaseMemObject(d_d_image1D));
+             CL_CHECK(clReleaseMemObject(qT_d_image1D));
+             CL_CHECK(clReleaseMemObject(dT_d_image1D));
+         } // end transpose
+#endif // GGML_OPENCL_USE_ADRENO_KERNELS
+
+         return;
+     }
+     if (tensor->type == GGML_TYPE_Q6_K) {
+         ggml_tensor_extra_cl * extra_orig = (ggml_tensor_extra_cl *)tensor->extra;
+         GGML_ASSERT(extra_orig && "Tensors in OpenCL backend should have been allocated and initialized");
+
+         // Allocate the new extra and create aliases from the original.
+         ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
+         ggml_tensor_extra_cl_q6_K * extra = ctx->ggml_opencl_alloc_temp_tensor_extra_q6_K();
+
+         size_t size_ql = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*ggml_blck_size(tensor->type)/2;
+         size_t size_qh = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*ggml_blck_size(tensor->type)/4;
+         size_t size_s = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*ggml_blck_size(tensor->type)/16;
+         size_t size_d = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*sizeof(ggml_fp16_t);
+         GGML_ASSERT(size_ql + size_qh + size_s + size_d == ggml_nbytes(tensor) &&
+                     "Incorrect tensor size");
+
+         cl_int err;
+         cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE,
+             ggml_nbytes(tensor), NULL, &err);
+         CL_CHECK(err);
+         CL_CHECK(clEnqueueWriteBuffer(
+             queue, data_device, CL_TRUE, 0,
+             ggml_nbytes(tensor), data, 0, NULL, NULL));
+
+         cl_buffer_region region;
+
+         // Subbuffer for ql
+         region.origin = align_to(extra_orig->offset + tensor->view_offs + offset, backend_ctx->alignment);
+         region.size = size_ql;
+         extra->ql = clCreateSubBuffer(
+             extra_orig->data_device, CL_MEM_READ_WRITE,
+             CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
+         CL_CHECK(err);
+         auto previous_origin = region.origin;
+
+         // Subbuffer for qh
+         region.origin = align_to(previous_origin + size_ql, backend_ctx->alignment);
+         region.size = size_qh;
+         extra->qh = clCreateSubBuffer(
+             extra_orig->data_device, CL_MEM_READ_WRITE,
+             CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
+         CL_CHECK(err);
+         previous_origin = region.origin;
+
+         // Subbuffer for scales
+         region.origin = align_to(previous_origin + size_qh, backend_ctx->alignment);
+         region.size = size_s;
+         extra->s = clCreateSubBuffer(
+             extra_orig->data_device, CL_MEM_READ_WRITE,
+             CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
+         CL_CHECK(err);
+         previous_origin = region.origin;
+
+         // Create subbuffer for d.
+         region.origin = align_to(previous_origin + size_s, backend_ctx->alignment);
+         region.size = size_d;
+         extra->d = clCreateSubBuffer(
+             extra_orig->data_device, CL_MEM_READ_WRITE,
+             CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
+         CL_CHECK(err);
+         previous_origin = region.origin;
+
+         // Flatten the weights
+         cl_kernel kernel = backend_ctx->kernel_convert_block_q6_K;
+
+         CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_device));
+         CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->ql));
+         CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->qh));
+         CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra->s));
+         CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extra->d));
+
+         size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
+         size_t local_work_size[] = {64, 1, 1};
+
+         cl_event evt;
+         CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+         CL_CHECK(clWaitForEvents(1, &evt));
+         CL_CHECK(clReleaseMemObject(data_device));
+
+         extra->size_ql = size_ql;
+         extra->size_qh = size_qh;
+         extra->size_s = size_s;
+         extra->size_d = size_d;
+
+         tensor->extra = extra;
          return;
      }
 #endif // GGML_OPENCL_SOA_Q
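The four size_* formulas in the Q6_K branch reduce to the standard per-block layout of block_q6_K: 128 bytes of low quant bits, 64 bytes of high bits, 16 scale bytes, and one fp16 super-scale per 256-element block. A quick, self-contained check of the assert's arithmetic, under the assumption that ggml_blck_size(GGML_TYPE_Q6_K) is 256:

    #include <cassert>
    #include <cstddef>

    int main() {
        const size_t blck  = 256;          // assumed Q6_K block size
        const size_t nelem = 4096 * blck;  // any multiple of the block size

        const size_t size_ql = nelem / blck * blck / 2;   // 128 B per block
        const size_t size_qh = nelem / blck * blck / 4;   //  64 B per block
        const size_t size_s  = nelem / blck * blck / 16;  //  16 B per block
        const size_t size_d  = nelem / blck * 2;          //   2 B per block (fp16)

        // 128 + 64 + 16 + 2 = 210 bytes per block, the size of block_q6_K.
        assert(size_ql + size_qh + size_s + size_d == nelem / blck * 210);
        return 0;
    }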
@@ -4155,28 +5061,103 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer,
              size, data, 0, NULL, NULL));
          CL_CHECK(clReleaseMemObject(data_device));
          return;
-     }
-
+     }
+     if (tensor->type == GGML_TYPE_Q4_1) {
+         ggml_tensor_extra_cl_q4_1 * extra = (ggml_tensor_extra_cl_q4_1 *)tensor->extra;
+
+#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
+         if (use_adreno_kernels(backend_ctx, tensor)) {
+             static ggml_cl_buffer buf_trans_q;
+             static ggml_cl_buffer buf_trans_m;
+             static ggml_cl_buffer buf_trans_d;
+             static ggml_cl_buffer buf_unpacked;
+
+             cl_int M = tensor->ne[1];
+             cl_int K = tensor->ne[0];
+
+             GGML_ASSERT(K % ggml_blck_size(tensor->type) == 0);
+
+             size_t size_q = (ggml_nelements(tensor)/ggml_blck_size(tensor->type))*ggml_blck_size(tensor->type)/2;
+             size_t size_d = (ggml_nelements(tensor)/ggml_blck_size(tensor->type))*sizeof(ggml_fp16_t);
+             size_t size_m = (ggml_nelements(tensor)/ggml_blck_size(tensor->type))*sizeof(ggml_fp16_t);
+             GGML_ASSERT(size_d + size_q + size_m == ggml_nbytes(tensor) && "Incorrect tensor size");
+
+             buf_trans_q.allocate(backend_ctx->context, size_q);
+             buf_trans_m.allocate(backend_ctx->context, size_m);
+             buf_trans_d.allocate(backend_ctx->context, size_d);
+             buf_unpacked.allocate(backend_ctx->context, ggml_nbytes(tensor));
+
+             // transpose q, d, m back
+             transpose_2d_as_16b(backend_ctx, extra->q, buf_trans_q.buffer, size_q, M, K/4);
+             transpose_2d_as_16b(backend_ctx, extra->d, buf_trans_d.buffer, size_d, M, K/32);
+             transpose_2d_as_16b(backend_ctx, extra->m, buf_trans_m.buffer, size_m, M, K/32);
+
+             cl_uchar mask_0F = 0x0F;
+             cl_uchar mask_F0 = 0xF0;
+
+             size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
+             size_t local_work_size[] = {1, 1, 1};
+
+             cl_kernel kernel = backend_ctx->kernel_restore_block_q4_1_noshuffle;
+             CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &buf_trans_q.buffer));
+             CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &buf_trans_d.buffer));
+             CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &buf_trans_m.buffer));
+             CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &buf_unpacked.buffer));
+             CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_uchar), &mask_0F));
+             CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_uchar), &mask_F0));
+
+             CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+             CL_CHECK(clEnqueueReadBuffer(queue, buf_unpacked.buffer, CL_TRUE, offset, size, data, 0, NULL, NULL));
+             return;
+         }
+#endif

          cl_int err;
          cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE,
              ggml_nbytes(tensor), NULL, &err);
          CL_CHECK(err);

-
-
-
+         cl_kernel kernel = backend_ctx->kernel_restore_block_q4_1;
+         CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->q));
+         CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->d));
+         CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->m));
+         CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &data_device));

-
-
-         int ne02 = tensor->ne[2];
-         CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->q));
-         CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->e));
-         CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &data_device));
-         CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_int), &ne00));
-         CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_int), &ne01));
+         size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
+         size_t local_work_size[] = {1, 1, 1};

-
+         cl_event evt;
+         CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
+             global_work_size, local_work_size, 0, NULL, &evt));
+         CL_CHECK(clWaitForEvents(1, &evt));
+         CL_CHECK(clEnqueueReadBuffer(
+             queue, data_device, CL_TRUE, offset,
+             size, data, 0, NULL, NULL));
+         CL_CHECK(clReleaseMemObject(data_device));
+         return;
+     }
+     if (tensor->type == GGML_TYPE_MXFP4) {
+         ggml_tensor_extra_cl_mxfp4 * extra = (ggml_tensor_extra_cl_mxfp4 *)tensor->extra;
+
+         cl_int err;
+         cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE,
+             ggml_nbytes(tensor), NULL, &err);
+         CL_CHECK(err);
+
+#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
+         if (use_adreno_moe_kernels(backend_ctx, tensor)) {
+             cl_kernel kernel = backend_ctx->kernel_restore_block_mxfp4_trans;
+
+             int ne00 = tensor->ne[0];
+             int ne01 = tensor->ne[1];
+             int ne02 = tensor->ne[2];
+             CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->q));
+             CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->e));
+             CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &data_device));
+             CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_int), &ne00));
+             CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_int), &ne01));
+
+             size_t global_work_size[3] = {static_cast<size_t>(((ne01 + 63) / 64) * 64), static_cast<size_t>(ne00 / 32), static_cast<size_t>(ne02)};
              size_t local_work_size[3] = {64, 2, 1};

              cl_event evt;
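This restore path reverses the set_tensor split: the flat q, d, and m planes are transposed back and re-interleaved into the standard Q4_1 block layout before being read to the host. For reference, a sketch of the per-block arithmetic behind the `size_d + size_q + size_m == ggml_nbytes(tensor)` assert, assuming the usual Q4_1 block of 32 elements:

    #include <cassert>
    #include <cstddef>
    #include <cstdint>

    int main() {
        const size_t blck    = 32;    // assumed Q4_1 block size
        const size_t nblocks = 1024;  // arbitrary block count

        const size_t size_q = nblocks * blck / 2;          // 16 B of 4-bit quants
        const size_t size_d = nblocks * sizeof(uint16_t);  //  2 B scale (fp16)
        const size_t size_m = nblocks * sizeof(uint16_t);  //  2 B min (fp16)

        // 16 + 2 + 2 = 20 bytes per block, matching sizeof(block_q4_1).
        assert(size_q + size_d + size_m == nblocks * 20);
        return 0;
    }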
@@ -4216,6 +5197,36 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer,
              ggml_nbytes(tensor), NULL, &err);
          CL_CHECK(err);

+#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
+         if (enable_adreno_trans_weight(backend_ctx, tensor)) {
+             cl_kernel kernel = backend_ctx->kernel_restore_block_q8_0_trans;
+
+             int ne00 = tensor->ne[0];
+             int ne01 = tensor->ne[1];
+             GGML_ASSERT(tensor->ne[2] == 1);
+             GGML_ASSERT(tensor->ne[3] == 1);
+
+             CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->q));
+             CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->d));
+             CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &data_device));
+             CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_int), &ne00));
+             CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_int), &ne01));
+
+             size_t global_work_size[3] = {static_cast<size_t>(((ne01 + 63) / 64) * 64), 1, 1};
+             size_t local_work_size[3] = {64, 1, 1};
+
+             cl_event evt;
+             CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
+                 global_work_size, local_work_size, 0, NULL, &evt));
+             CL_CHECK(clWaitForEvents(1, &evt));
+
+             CL_CHECK(clEnqueueReadBuffer(
+                 queue, data_device, CL_TRUE, offset,
+                 size, data, 0, NULL, NULL));
+             CL_CHECK(clReleaseMemObject(data_device));
+             return;
+         }
+#endif
          cl_kernel kernel = backend_ctx->kernel_restore_block_q8_0;
          CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->q));
          CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->d));
@@ -4224,6 +5235,34 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer,
          size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
          size_t local_work_size[] = {1, 1, 1};

+         cl_event evt;
+         CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
+             global_work_size, local_work_size, 0, NULL, &evt));
+         CL_CHECK(clWaitForEvents(1, &evt));
+         CL_CHECK(clEnqueueReadBuffer(
+             queue, data_device, CL_TRUE, offset,
+             size, data, 0, NULL, NULL));
+         CL_CHECK(clReleaseMemObject(data_device));
+         return;
+     }
+     if (tensor->type == GGML_TYPE_Q6_K) {
+         ggml_tensor_extra_cl_q6_K * extra = (ggml_tensor_extra_cl_q6_K *)tensor->extra;
+
+         cl_int err;
+         cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE,
+             ggml_nbytes(tensor), NULL, &err);
+         CL_CHECK(err);
+
+         cl_kernel kernel = backend_ctx->kernel_restore_block_q6_K;
+         CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->ql));
+         CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->qh));
+         CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->s));
+         CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra->d));
+         CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &data_device));
+
+         size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
+         size_t local_work_size[] = {1, 1, 1};
+
          cl_event evt;
          CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
              global_work_size, local_work_size, 0, NULL, &evt));
@@ -4347,7 +5386,8 @@ static const char * ggml_backend_opencl_device_get_description(ggml_backend_dev_
  }

  static void ggml_backend_opencl_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
-
+     // no memory to report
+     *free = 0;
      *total = 0;

      GGML_UNUSED(dev);
@@ -4666,6 +5706,81 @@ static bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct gg
             (ne0 >= 32 && ne1 >= 32 && ne10 >= 32);
  }

+ // Copy a noncontiguous tensor to contiguous tensor. ne[] remains the same but
+ // nb[] is recalculated such that tensor is contiguous.
+ static void ggml_cl_copy_to_contiguous(ggml_backend_t backend, const ggml_tensor * src, cl_mem dst,
+         cl_ulong &nb0, cl_ulong &nb1, cl_ulong &nb2, cl_ulong &nb3) {
+     ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+
+     const int tensor_type_size = ggml_type_size(src->type);
+
+     const int ne00 = src->ne[0];
+     const int ne01 = src->ne[1];
+     const int ne02 = src->ne[2];
+     const int ne03 = src->ne[3];
+
+     const cl_ulong nb00 = src->nb[0];
+     const cl_ulong nb01 = src->nb[1];
+     const cl_ulong nb02 = src->nb[2];
+     const cl_ulong nb03 = src->nb[3];
+
+     const int ne0 = src->ne[0];
+     const int ne1 = src->ne[1];
+     const int ne2 = src->ne[2];
+     const int ne3 = src->ne[3];
+
+     nb0 = tensor_type_size;
+     nb1 = tensor_type_size*ne00;
+     nb2 = tensor_type_size*ne00*ne01;
+     nb3 = tensor_type_size*ne00*ne01*ne02;
+
+     ggml_tensor_extra_cl * extra = (ggml_tensor_extra_cl *)src->extra;
+
+     cl_ulong offset0 = extra->offset + src->view_offs;
+     cl_ulong offsetd = 0;
+
+     cl_kernel kernel;
+
+     switch (src->type) {
+         case GGML_TYPE_F32:
+             kernel = backend_ctx->kernel_cpy_f32_f32;
+             break;
+         case GGML_TYPE_F16:
+             kernel = backend_ctx->kernel_cpy_f16_f16;
+             break;
+         default:
+             GGML_ASSERT(false && "not implemented");
+     }
+
+     CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->data_device));
+     CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+     CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &dst));
+     CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
+     CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00));
+     CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne01));
+     CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne02));
+     CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne03));
+     CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb00));
+     CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb01));
+     CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb02));
+     CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb03));
+     CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne0));
+     CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne1));
+     CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne2));
+     CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne3));
+     CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb0));
+     CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb1));
+     CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &nb2));
+     CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb3));
+
+     const int nth = MIN(64, ne00);
+
+     size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
+     size_t local_work_size[] = {(size_t)nth, 1, 1};
+
+     backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, src);
+ }
+
  static void ggml_cl_nop(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
      UNUSED(backend);
      UNUSED(src0);
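ggml_cl_copy_to_contiguous keeps ne[] unchanged and rewrites nb[] as plain row-major byte strides, so the copy kernel can read through the source's original strides and write densely. The stride recomputation on its own, as a small sketch:

    #include <cstdint>

    // Recompute byte strides for a dense (contiguous) 4-D tensor with extents
    // ne[0..3] and element size ts, matching the nb0..nb3 assignments above.
    static void contiguous_strides(const int64_t ne[4], int64_t ts, int64_t nb[4]) {
        nb[0] = ts;
        nb[1] = ts * ne[0];
        nb[2] = ts * ne[0] * ne[1];
        nb[3] = ts * ne[0] * ne[1] * ne[2];
    }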
@@ -4681,19 +5796,12 @@ static void ggml_cl_get_rows(ggml_backend_t backend, const ggml_tensor * src0, c
      GGML_ASSERT(dst);
      GGML_ASSERT(dst->extra);

-
-
-
-
-
-
-     const int ne11 = src1->ne[1];
-     const int ne12 = src1->ne[2];
-     const cl_ulong nb11 = src1->nb[1];
-     const cl_ulong nb12 = src1->nb[2];
-     const cl_ulong nb1 = dst->nb[1];
-     const cl_ulong nb2 = dst->nb[2];
-     const cl_ulong nb3 = dst->nb[3];
+     GGML_TENSOR_LOCALS(int, ne0, src0, ne);
+     GGML_TENSOR_LOCALS(cl_ulong, nb0, src0, nb);
+     GGML_TENSOR_LOCALS(int, ne1, src1, ne);
+     GGML_TENSOR_LOCALS(cl_ulong, nb1, src1, nb);
+     GGML_TENSOR_LOCALS(int, ne, dst, ne);
+     GGML_TENSOR_LOCALS(cl_ulong, nb, dst, nb);

      ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;

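GGML_TENSOR_LOCALS replaces the hand-written extent and stride locals deleted above: it declares one local per dimension from the named tensor field (ne10..ne13, nb10..nb13, and so on). Only as an illustration of the idea, a simplified macro of the same shape (the real definition lives in ggml's internal headers and may differ):

    // Hypothetical simplified version of the GGML_TENSOR_LOCALS pattern:
    // TENSOR_LOCALS(int, ne1, src1, ne) expands to ne10..ne13 locals.
    #define TENSOR_LOCALS(type, prefix, pointer, array) \
        const type prefix##0 = (pointer)->array[0];     \
        const type prefix##1 = (pointer)->array[1];     \
        const type prefix##2 = (pointer)->array[2];     \
        const type prefix##3 = (pointer)->array[3];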
@@ -4739,8 +5847,14 @@ static void ggml_cl_get_rows(ggml_backend_t backend, const ggml_tensor * src0, c
      CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &nb2));
      CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb3));

-
-
+     int max_workgroup_size = backend_ctx->get_kernel_workgroup_size(kernel);
+     int nth = 1;
+     while (nth < ne00 && 2*nth <= max_workgroup_size) {
+         nth *= 2;
+     }
+
+     size_t global_work_size[] = {(size_t)ne10*nth, (size_t)ne11, (size_t)ne12};
+     size_t local_work_size[] = {(size_t)nth, 1, 1};

      backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
  }
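The loop added here picks the largest power of two that stays below both the row length and the kernel's maximum workgroup size, so the local size always divides the global size exactly. The same selection, isolated:

    // Largest power-of-two thread count that does not exceed either the row
    // length `ne00` or the kernel's `max_workgroup_size`.
    static int pick_nth(int ne00, int max_workgroup_size) {
        int nth = 1;
        while (nth < ne00 && 2 * nth <= max_workgroup_size) {
            nth *= 2;
        }
        return nth;
    }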
@@ -5595,7 +6709,6 @@ static void ggml_cl_mean(ggml_backend_t backend, const ggml_tensor * src0, const
      GGML_UNUSED(src1);

      GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type));
-     GGML_ASSERT(ggml_is_contiguous(src0));

      ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;

@@ -5618,7 +6731,14 @@ static void ggml_cl_mean(ggml_backend_t backend, const ggml_tensor * src0, const
      const cl_ulong nb2 = dst->nb[2];
      const cl_ulong nb3 = dst->nb[3];

-     cl_kernel kernel
+     cl_kernel kernel;
+
+     const bool is_c4 = ne00 % 4 == 0;
+     if (is_c4) {
+         kernel = backend_ctx->kernel_mean_f32_4;
+     } else {
+         kernel = backend_ctx->kernel_mean_f32;
+     }

      CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
      CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
@@ -5635,7 +6755,7 @@ static void ggml_cl_mean(ggml_backend_t backend, const ggml_tensor * src0, const
      CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb2));
      CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb3));

-     size_t global_work_size[] = {(size_t)ne01, (size_t)ne02, (size_t)ne03};
+     size_t global_work_size[] = {64 * (size_t)ne01, (size_t)ne02, (size_t)ne03};
      size_t local_work_size[] = {(size_t)64, 1, 1};

      backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
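This one-line change matters because the local size is fixed at {64, 1, 1}: pre-2.0 OpenCL requires each global dimension to be a multiple of the corresponding local dimension, and scaling the first dimension to 64*ne01 both satisfies that and gives every output row a full workgroup. As a sketch of the invariant being preserved:

    #include <cstddef>

    // Round each global dimension up to a multiple of the local dimension,
    // the divisibility rule OpenCL 1.x enqueues require.
    static void pad_global(size_t global[3], const size_t local[3]) {
        for (int d = 0; d < 3; ++d) {
            global[d] = (global[d] + local[d] - 1) / local[d] * local[d];
        }
    }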
@@ -5941,6 +7061,44 @@ static void ggml_cl_sigmoid(ggml_backend_t backend, const ggml_tensor * src0, co
      backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
  }

+ static void ggml_cl_tri(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+     GGML_ASSERT(src0);
+     GGML_ASSERT(src0->extra);
+     GGML_ASSERT(dst);
+     GGML_ASSERT(dst->extra);
+
+     UNUSED(src1);
+
+     ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+
+     ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
+     ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
+
+     cl_ulong offset0 = extra0->offset + src0->view_offs;
+     cl_ulong offsetd = extrad->offset + dst->view_offs;
+
+     const int tri_type = ggml_get_op_params_i32(dst, 0);
+     const int64_t n = ggml_nelements(dst);
+     const int ne0 = dst->ne[0];
+     const int ne1 = dst->ne[1];
+
+     cl_kernel kernel = backend_ctx->kernel_tri;
+
+     CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
+     CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+     CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
+     CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
+     CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &n));
+     CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne0));
+     CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne1));
+     CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &tri_type));
+
+     size_t local_work_size[1] = { 256 };
+     size_t global_work_size[1] = { ((size_t)n + local_work_size[0] - 1) / local_work_size[0] * local_work_size[0] };
+
+     backend_ctx->enqueue_ndrange_kernel(kernel, 1, global_work_size, local_work_size, dst);
+ }
+
  static void ggml_cl_fill(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
      GGML_ASSERT(dst);
      GGML_ASSERT(dst->extra);
@@ -6436,6 +7594,64 @@ static void ggml_cl_group_norm(ggml_backend_t backend, const ggml_tensor * src0,
      backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
  }

+ static void ggml_cl_l2_norm(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+     GGML_ASSERT(src0);
+     GGML_ASSERT(src0->extra);
+     GGML_ASSERT(dst);
+     GGML_ASSERT(dst->extra);
+
+     UNUSED(src1);
+
+     ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+
+     ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
+     ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
+
+     cl_ulong offset0 = extra0->offset + src0->view_offs;
+     cl_ulong offsetd = extrad->offset + dst->view_offs;
+
+     float eps;
+     memcpy(&eps, dst->op_params, sizeof(float));
+
+     GGML_TENSOR_LOCALS(int, ne0, src0, ne);
+     GGML_TENSOR_LOCALS(cl_ulong, nb0, src0, nb);
+
+     size_t sgs;
+     if (backend_ctx->gpu_family == ADRENO) {
+         sgs = 64;
+     } else if (backend_ctx->gpu_family == INTEL) {
+         sgs = 32;
+     } else {
+         GGML_ASSERT(false && "Unsupported GPU");
+     }
+
+     cl_kernel kernel = backend_ctx->kernel_l2_norm_f32;
+
+     int nth = sgs;
+     while (nth < ne00 && nth < (int)backend_ctx->get_kernel_workgroup_size(kernel)) {
+         nth *= 2;
+     }
+
+     CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
+     CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+     CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
+     CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
+     CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00));
+     CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne01));
+     CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne02));
+     CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne03));
+     CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb01));
+     CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb02));
+     CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb03));
+     CL_CHECK(clSetKernelArg(kernel, 11, sizeof(float), &eps));
+     CL_CHECK(clSetKernelArg(kernel, 12, sizeof(float)*nth/sgs, NULL));
+
+     size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
+     size_t local_work_size[] = {(size_t)nth, 1, 1};
+
+     backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
+ }
+
  static void ggml_cl_tanh(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
      GGML_ASSERT(src0);
      GGML_ASSERT(src0->extra);
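The l2_norm kernel reduces per subgroup, which is why argument 12 passes local scratch of sizeof(float)*nth/sgs bytes: one partial sum per subgroup, where nth is the workgroup size and sgs the subgroup size (64 on Adreno, 32 on Intel in this code). The two sizing decisions, pulled out as a sketch:

    #include <cstddef>

    // Workgroup size: start at one subgroup and double while the row is
    // longer and the kernel's workgroup limit allows it.
    static size_t pick_nth(size_t sgs, size_t ne00, size_t max_wg) {
        size_t nth = sgs;
        while (nth < ne00 && nth < max_wg) {
            nth *= 2;
        }
        return nth;
    }

    // Local scratch: one float per subgroup (nth work-items, sgs lanes each).
    static size_t scratch_bytes(size_t nth, size_t sgs) {
        return sizeof(float) * (nth / sgs);
    }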
@@ -6449,82 +7665,172 @@ static void ggml_cl_tanh(ggml_backend_t backend, const ggml_tensor * src0, const
      ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
      ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;

-     cl_ulong
-     cl_ulong
+     cl_ulong offset0 = extra0->offset + src0->view_offs;
+     cl_ulong offsetd = extrad->offset + dst->view_offs;
+
+     const int ne00 = src0->ne[0];
+     const int ne01 = src0->ne[1];
+     const int ne02 = src0->ne[2];
+     const int ne03 = src0->ne[3];
+
+     const cl_ulong nb00 = src0->nb[0];
+     const cl_ulong nb01 = src0->nb[1];
+     const cl_ulong nb02 = src0->nb[2];
+     const cl_ulong nb03 = src0->nb[3];
+
+     const cl_ulong nb0 = dst->nb[0];
+     const cl_ulong nb1 = dst->nb[1];
+     const cl_ulong nb2 = dst->nb[2];
+     const cl_ulong nb3 = dst->nb[3];

      cl_kernel kernel;
-
-
-
-
+
+     if (ggml_is_contiguous(src0)) {
+         // Handle contiguous input
+         int n = ggml_nelements(dst);
+         if (n % 4 == 0) {
+             if (src0->type == GGML_TYPE_F32) {
+                 kernel = backend_ctx->kernel_tanh_f32_4;
+             } else {
+                 kernel = backend_ctx->kernel_tanh_f16_4;
+             }
+             n /= 4;
+         } else {
+             if (src0->type == GGML_TYPE_F32) {
+                 kernel = backend_ctx->kernel_tanh_f32;
+             } else {
+                 kernel = backend_ctx->kernel_tanh_f16;
+             }
+         }
+
+         CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
+         CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+         CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
+         CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
+
+         size_t global_work_size[] = {(size_t)n, 1, 1};
+         size_t local_work_size[] = {64, 1, 1};
+
+         size_t * local_work_size_ptr = local_work_size;
+         if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
+             local_work_size_ptr = nullptr;
+         }
+
+         backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
      } else {
-
+         // Handle non-contiguous input
+         if (src0->type == GGML_TYPE_F32) {
+             kernel = backend_ctx->kernel_tanh_f32_nc;
+         } else {
+             kernel = backend_ctx->kernel_tanh_f16_nc;
+         }
+
+         CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
+         CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+         CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
+         CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
+         CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00));
+         CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &nb00));
+         CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_ulong), &nb01));
+         CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &nb02));
+         CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb03));
+         CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb0));
+         CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb1));
+         CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb2));
+         CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb3));
+
+         int nth = 64;
+
+         size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
+         size_t local_work_size[] = {(size_t)nth, 1, 1};
+
+         backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
      }
-
+ }

-
-
+ static void ggml_cl_neg(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+     GGML_ASSERT(src0);
+     GGML_ASSERT(src0->extra);
+     GGML_ASSERT(dst);
+     GGML_ASSERT(dst->extra);

-
-     const cl_ulong nb10 = dst->nb[0]; const cl_ulong nb11 = dst->nb[1]; const cl_ulong nb12 = dst->nb[2]; const cl_ulong nb13 = dst->nb[3];
+     UNUSED(src1);

-
-     CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0_abs));
-     CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
-     CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd_abs));
+     ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;

-
-
-     CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne02));
-     CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne03));
-     CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb00));
-     CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb01));
-     CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong),&nb02));
-     CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong),&nb03));
+     ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
+     ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;

-
-
-     CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne12));
-     CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne13));
-     CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong),&nb10));
-     CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong),&nb11));
-     CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong),&nb12));
-     CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong),&nb13));
-
-     size_t global_work_size[3];
-     if (ne10 == 0 || ne11 == 0 || ne12 == 0 || ne13 == 0) { // Handle case of 0 elements
-         return;
-     }
-     global_work_size[0] = (size_t)ne10;
-     global_work_size[1] = (size_t)ne11;
-     global_work_size[2] = (size_t)ne12;
+     cl_ulong offset0 = extra0->offset + src0->view_offs;
+     cl_ulong offsetd = extrad->offset + dst->view_offs;

-
-
-
-
+     GGML_TENSOR_LOCALS(int, ne0, src0, ne);
+     GGML_TENSOR_LOCALS(cl_ulong, nb0, src0, nb);
+     GGML_TENSOR_LOCALS(int, ne, dst, ne);
+     GGML_TENSOR_LOCALS(cl_ulong, nb, dst, nb);

-
-
-
+     cl_kernel kernel;
+
+     if (ggml_is_contiguous(src0)) {
+         // Handle contiguous input
+         int n = ggml_nelements(dst);
+         if (n % 4 == 0) {
+             if (src0->type == GGML_TYPE_F32) {
+                 kernel = backend_ctx->kernel_neg_f32_4;
+             } else {
+                 kernel = backend_ctx->kernel_neg_f16_4;
+             }
+             n /= 4;
+         } else {
+             if (src0->type == GGML_TYPE_F32) {
+                 kernel = backend_ctx->kernel_neg_f32;
+             } else {
+                 kernel = backend_ctx->kernel_neg_f16;
+             }
+         }

+         CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
+         CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+         CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
+         CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
+         CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_int), &n));

-
+         size_t global_work_size[] = {(size_t)CEIL_DIV(n, 64)*64, 1, 1};
+         size_t local_work_size[] = {64, 1, 1};

-
-
-
-
-
-
+         backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
+     } else {
+         // Handle non-contiguous input
+         if (src0->type == GGML_TYPE_F32) {
+             kernel = backend_ctx->kernel_neg_f32_nc;
+         } else {
+             kernel = backend_ctx->kernel_neg_f16_nc;
          }
-     }
-     if (global_work_size[0] == 0 || global_work_size[1] == 0 || global_work_size[2] == 0) return;

-
+         CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
+         CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+         CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
+         CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
+         CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00));
+         CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &nb00));
+         CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_ulong), &nb01));
+         CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &nb02));
+         CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb03));
+         CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb0));
+         CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb1));
+         CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb2));
+         CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb3));
+
+         int nth = 64;
+
+         size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
+         size_t local_work_size[] = {(size_t)nth, 1, 1};
+
+         backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
+     }
  }

- static void
+ static void ggml_cl_exp(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
      GGML_ASSERT(src0);
      GGML_ASSERT(src0->extra);
      GGML_ASSERT(dst);
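tanh, neg, exp, and expm1 all share the same contiguous dispatch: when the flat element count is divisible by 4, the *_4 kernel variant processes float4/half4 vectors and the work size shrinks by a factor of 4. The selection logic, reduced to its shape:

    #include <cstdint>

    enum class Variant { Scalar, Vec4 };

    // Choose the vectorized kernel when the flat element count allows it,
    // rescaling n the way the branches above do (each work-item then
    // handles a 4-wide vector).
    static Variant pick_variant(int64_t &n) {
        if (n % 4 == 0) {
            n /= 4;
            return Variant::Vec4;
        }
        return Variant::Scalar;
    }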
@@ -6537,18 +7843,90 @@ static void ggml_cl_expm1(ggml_backend_t backend, const ggml_tensor * src0, cons
      ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
      ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;

-     cl_ulong
-     cl_ulong
+     cl_ulong offset0 = extra0->offset + src0->view_offs;
+     cl_ulong offsetd = extrad->offset + dst->view_offs;
+
+     GGML_TENSOR_LOCALS(int, ne0, src0, ne);
+     GGML_TENSOR_LOCALS(cl_ulong, nb0, src0, nb);
+     GGML_TENSOR_LOCALS(int, ne, dst, ne);
+     GGML_TENSOR_LOCALS(cl_ulong, nb, dst, nb);

      cl_kernel kernel;
-
-
-
-
+
+     if (ggml_is_contiguous(src0)) {
+         // Handle contiguous input
+         int n = ggml_nelements(dst);
+         if (n % 4 == 0) {
+             if (src0->type == GGML_TYPE_F32) {
+                 kernel = backend_ctx->kernel_exp_f32_4;
+             } else {
+                 kernel = backend_ctx->kernel_exp_f16_4;
+             }
+             n /= 4;
+         } else {
+             if (src0->type == GGML_TYPE_F32) {
+                 kernel = backend_ctx->kernel_exp_f32;
+             } else {
+                 kernel = backend_ctx->kernel_exp_f16;
+             }
+         }
+
+         CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
+         CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+         CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
+         CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
+         CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_int), &n));
+
+         size_t global_work_size[] = {(size_t)CEIL_DIV(n, 64)*64, 1, 1};
+         size_t local_work_size[] = {64, 1, 1};
+
+         backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
      } else {
-
+         // Handle non-contiguous input
+         if (src0->type == GGML_TYPE_F32) {
+             kernel = backend_ctx->kernel_exp_f32_nc;
+         } else {
+             kernel = backend_ctx->kernel_exp_f16_nc;
+         }
+
+         CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
+         CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+         CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
+         CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
+         CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00));
+         CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &nb00));
+         CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_ulong), &nb01));
+         CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &nb02));
+         CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb03));
+         CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb0));
+         CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb1));
+         CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb2));
+         CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb3));
+
+         int nth = 64;
+
+         size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
+         size_t local_work_size[] = {(size_t)nth, 1, 1};
+
+         backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
      }
-
+ }
+
+ static void ggml_cl_expm1(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+     GGML_ASSERT(src0);
+     GGML_ASSERT(src0->extra);
+     GGML_ASSERT(dst);
+     GGML_ASSERT(dst->extra);
+
+     UNUSED(src1);
+
+     ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+
+     ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
+     ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
+
+     cl_ulong offset0 = extra0->offset + src0->view_offs;
+     cl_ulong offsetd = extrad->offset + dst->view_offs;

      const int ne00 = src0->ne[0];
      const int ne01 = src0->ne[1];
@@ -6560,70 +7938,74 @@ static void ggml_cl_expm1(ggml_backend_t backend, const ggml_tensor * src0, cons
      const cl_ulong nb02 = src0->nb[2];
      const cl_ulong nb03 = src0->nb[3];

-     const
-     const
-     const
-     const
+     const cl_ulong nb0 = dst->nb[0];
+     const cl_ulong nb1 = dst->nb[1];
+     const cl_ulong nb2 = dst->nb[2];
+     const cl_ulong nb3 = dst->nb[3];

-
-     const cl_ulong nb11 = dst->nb[1];
-     const cl_ulong nb12 = dst->nb[2];
-     const cl_ulong nb13 = dst->nb[3];
+     cl_kernel kernel;

-
-
-
-
+     if (ggml_is_contiguous(src0)) {
+         // Handle contiguous input
+         int n = ggml_nelements(dst);
+         if (n % 4 == 0) {
+             if (src0->type == GGML_TYPE_F32) {
+                 kernel = backend_ctx->kernel_expm1_f32_4;
+             } else {
+                 kernel = backend_ctx->kernel_expm1_f16_4;
+             }
+             n /= 4;
+         } else {
+             if (src0->type == GGML_TYPE_F32) {
+                 kernel = backend_ctx->kernel_expm1_f32;
+             } else {
+                 kernel = backend_ctx->kernel_expm1_f16;
+             }
+         }

-
-
-
-
-     CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb00));
-     CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb01));
-     CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong),&nb02));
-     CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong),&nb03));
+         CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
+         CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+         CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
+         CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));

-
-
-     CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne12));
-     CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne13));
-     CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong),&nb10));
-     CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong),&nb11));
-     CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong),&nb12));
-     CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong),&nb13));
-
-     size_t global_work_size[3];
-     if (ne10 == 0 || ne11 == 0 || ne12 == 0 || ne13 == 0) { // Handle case of 0 elements
-         return;
-     }
-     global_work_size[0] = (size_t)ne10;
-     global_work_size[1] = (size_t)ne11;
-     global_work_size[2] = (size_t)ne12;
+         size_t global_work_size[] = {(size_t)n, 1, 1};
+         size_t local_work_size[] = {64, 1, 1};

-
-
-
-
+         size_t * local_work_size_ptr = local_work_size;
+         if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
+             local_work_size_ptr = nullptr;
+         }

-
-
-
+         backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
+     } else {
+         // Handle non-contiguous input
+         if (src0->type == GGML_TYPE_F32) {
+             kernel = backend_ctx->kernel_expm1_f32_nc;
+         } else {
+             kernel = backend_ctx->kernel_expm1_f16_nc;
+         }

+         CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
+         CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+         CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
+         CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
+         CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00));
+         CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &nb00));
+         CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_ulong), &nb01));
+         CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &nb02));
+         CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb03));
+         CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb0));
+         CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb1));
+         CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb2));
+         CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb3));
|
|
8001
|
+
|
|
8002
|
+
int nth = 64;
|
|
6613
8003
|
|
|
6614
|
-
|
|
8004
|
+
size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
|
|
8005
|
+
size_t local_work_size[] = {(size_t)nth, 1, 1};
|
|
6615
8006
|
|
|
6616
|
-
|
|
6617
|
-
if (!backend_ctx->non_uniform_workgroups) {
|
|
6618
|
-
if (global_work_size[0] % local_work_size[0] != 0 ||
|
|
6619
|
-
global_work_size[1] % local_work_size[1] != 0 ||
|
|
6620
|
-
global_work_size[2] % local_work_size[2] != 0) {
|
|
6621
|
-
local_work_size_ptr = NULL;
|
|
6622
|
-
}
|
|
8007
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
|
6623
8008
|
}
|
|
6624
|
-
if (global_work_size[0] == 0 || global_work_size[1] == 0 || global_work_size[2] == 0) return;
|
|
6625
|
-
|
|
6626
|
-
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
|
|
6627
8009
|
}
|
|
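Editor's note: the contiguous paths above share one host-side dispatch idiom worth calling out. When the element count is not a multiple of the 64-wide workgroup and the device cannot run non-uniform workgroups, the code passes a NULL local size so the driver picks one. A minimal standalone sketch of that decision follows; `non_uniform_workgroups` is a stand-in for the backend's capability flag, not the real context member.

    #include <cstddef>
    #include <cstdio>

    // If the global size does not divide evenly by the preferred workgroup
    // width and non-uniform workgroups are unsupported, return nullptr so
    // clEnqueueNDRangeKernel lets the driver choose the local size.
    static const size_t * pick_local_size(int n, const size_t local[3], bool non_uniform_workgroups) {
        if (n % (int)local[0] != 0 && !non_uniform_workgroups) {
            return nullptr; // driver-chosen local size
        }
        return local;
    }

    int main() {
        size_t local[3] = {64, 1, 1};
        printf("n=130: %s\n", pick_local_size(130, local, false) ? "explicit 64" : "driver-chosen");
        printf("n=128: %s\n", pick_local_size(128, local, false) ? "explicit 64" : "driver-chosen");
    }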
 
 static void ggml_cl_softplus(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {

@@ -6639,18 +8021,8 @@ static void ggml_cl_softplus(ggml_backend_t backend, const ggml_tensor * src0, c
     ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
     ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
 
-    cl_ulong
-    cl_ulong
-
-    cl_kernel kernel;
-    if (dst->type == GGML_TYPE_F32) {
-        kernel = backend_ctx->kernel_softplus_f32_nd;
-    } else if (dst->type == GGML_TYPE_F16) {
-        kernel = backend_ctx->kernel_softplus_f16_nd;
-    } else {
-        GGML_ASSERT(false && "Unsupported type for ggml_cl_softplus");
-    }
-    GGML_ASSERT(kernel != nullptr);
+    cl_ulong offset0 = extra0->offset + src0->view_offs;
+    cl_ulong offsetd = extrad->offset + dst->view_offs;
 
     const int ne00 = src0->ne[0];
     const int ne01 = src0->ne[1];

@@ -6662,70 +8034,74 @@ static void ggml_cl_softplus(ggml_backend_t backend, const ggml_tensor * src0, c
     const cl_ulong nb02 = src0->nb[2];
     const cl_ulong nb03 = src0->nb[3];
 
-    const
-    const
-    const
-    const
+    const cl_ulong nb0 = dst->nb[0];
+    const cl_ulong nb1 = dst->nb[1];
+    const cl_ulong nb2 = dst->nb[2];
+    const cl_ulong nb3 = dst->nb[3];
 
-
-    const cl_ulong nb11 = dst->nb[1];
-    const cl_ulong nb12 = dst->nb[2];
-    const cl_ulong nb13 = dst->nb[3];
+    cl_kernel kernel;
 
-
-
-
-
+    if (ggml_is_contiguous(src0)) {
+        // Handle contiguous input
+        int n = ggml_nelements(dst);
+        if (n % 4 == 0) {
+            if (src0->type == GGML_TYPE_F32) {
+                kernel = backend_ctx->kernel_softplus_f32_4;
+            } else {
+                kernel = backend_ctx->kernel_softplus_f16_4;
+            }
+            n /= 4;
+        } else {
+            if (src0->type == GGML_TYPE_F32) {
+                kernel = backend_ctx->kernel_softplus_f32;
+            } else {
+                kernel = backend_ctx->kernel_softplus_f16;
+            }
+        }
 
-
-
-
-
-    CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb00));
-    CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb01));
-    CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong),&nb02));
-    CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong),&nb03));
+        CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
+        CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+        CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
+        CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
 
-
-
-    CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne12));
-    CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne13));
-    CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong),&nb10));
-    CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong),&nb11));
-    CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong),&nb12));
-    CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong),&nb13));
-
-    size_t global_work_size[3];
-    if (ne10 == 0 || ne11 == 0 || ne12 == 0 || ne13 == 0) { // Handle case of 0 elements
-        return;
-    }
-    global_work_size[0] = (size_t)ne10;
-    global_work_size[1] = (size_t)ne11;
-    global_work_size[2] = (size_t)ne12;
+        size_t global_work_size[] = {(size_t)n, 1, 1};
+        size_t local_work_size[] = {64, 1, 1};
 
-
-
-
-
+        size_t * local_work_size_ptr = local_work_size;
+        if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
+            local_work_size_ptr = nullptr;
+        }
 
-
-
-
+        backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
+    } else {
+        // Handle non-contiguous input
+        if (src0->type == GGML_TYPE_F32) {
+            kernel = backend_ctx->kernel_softplus_f32_nc;
+        } else {
+            kernel = backend_ctx->kernel_softplus_f16_nc;
+        }
 
+        CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
+        CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+        CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
+        CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
+        CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00));
+        CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &nb00));
+        CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_ulong), &nb01));
+        CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &nb02));
+        CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb03));
+        CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb0));
+        CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb1));
+        CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb2));
+        CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb3));
+
+        int nth = 64;
 
-
+        size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
+        size_t local_work_size[] = {(size_t)nth, 1, 1};
 
-
-    if (!backend_ctx->non_uniform_workgroups) {
-        if (global_work_size[0] % local_work_size[0] != 0 ||
-            global_work_size[1] % local_work_size[1] != 0 ||
-            global_work_size[2] % local_work_size[2] != 0) {
-            local_work_size_ptr = NULL;
-        }
+        backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
     }
-    if (global_work_size[0] == 0 || global_work_size[1] == 0 || global_work_size[2] == 0) return;
-
-    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
 }
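Editor's note: exp, expm1, and softplus above all pick a vectorized kernel variant the same way: when the flattened element count divides by 4, a float4/half4 kernel handles four elements per work item and the dispatch count shrinks accordingly. A small sketch of that selection, with illustrative names rather than the backend's real kernel handles:

    #include <cstdio>

    enum class variant { scalar, vec4 };

    // Mirrors the n % 4 check in the contiguous paths: on the vec4 path the
    // work-item count is divided by 4 because each item processes a float4.
    static variant select_variant(int & n) {
        if (n % 4 == 0) {
            n /= 4;
            return variant::vec4;
        }
        return variant::scalar;
    }

    int main() {
        int n = 4096;
        variant v = select_variant(n);
        printf("dispatch %d work items (%s)\n", n, v == variant::vec4 ? "vec4" : "scalar");
    }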
 
 static void ggml_cl_repeat(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1_shape_def, ggml_tensor * dst) {

@@ -6739,53 +8115,58 @@ static void ggml_cl_repeat(ggml_backend_t backend, const ggml_tensor * src0, con
 
     ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
 
-
-
-        return;
-    }
+    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
+    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
 
-
-
+    cl_ulong offset0 = extra0->offset + src0->view_offs;
+    cl_ulong offsetd = extrad->offset + dst->view_offs;
 
-
-
+    const int ne00 = src0->ne[0];
+    const int ne01 = src0->ne[1];
+    const int ne02 = src0->ne[2];
+    const int ne03 = src0->ne[3];
+
+    const cl_ulong nb00 = src0->nb[0];
+    const cl_ulong nb01 = src0->nb[1];
+    const cl_ulong nb02 = src0->nb[2];
+    const cl_ulong nb03 = src0->nb[3];
+
+    const int ne0 = dst->ne[0];
+    const int ne1 = dst->ne[1];
+    const int ne2 = dst->ne[2];
+    const int ne3 = dst->ne[3];
 
-    const
-    const cl_ulong
+    const cl_ulong nb0 = dst->nb[0];
+    const cl_ulong nb1 = dst->nb[1];
+    const cl_ulong nb2 = dst->nb[2];
+    const cl_ulong nb3 = dst->nb[3];
+
+    cl_kernel kernel = backend_ctx->kernel_repeat_f32;
 
-
-
+    CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+    CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
+    CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00));
+    CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne01));
+    CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne02));
+    CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne03));
+    CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb00));
+    CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb01));
+    CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb02));
+    CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb03));
+    CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne0));
+    CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb0));
+    CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb1));
+    CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &nb2));
+    CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb3));
 
-
+    int nth = 64;
 
-
-
-    CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_ulong), &off_src0));
-    CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &off_dst));
-    CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &src0_ne0));
-    CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &src0_ne1));
-    CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &src0_ne2));
-    CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &src0_ne3));
-    CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &src0_nb0));
-    CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &src0_nb1));
-    CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &src0_nb2));
-    CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &src0_nb3));
-    CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &dst_ne0));
-    CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &dst_ne1));
-    CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &dst_ne2));
-    CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &dst_ne3));
-    CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &dst_nb0));
-    CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &dst_nb1));
-    CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &dst_nb2));
-    CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &dst_nb3));
-
-    size_t gws0 = dst_ne1 > 0 ? (size_t)dst_ne1 : 1;
-    size_t gws1 = dst_ne2 > 0 ? (size_t)dst_ne2 : 1;
-    size_t gws2 = dst_ne3 > 0 ? (size_t)dst_ne3 : 1;
-
-    size_t global_work_size[] = { gws0, gws1, gws2 };
+    size_t global_work_size[] = {(size_t)ne1*nth, (size_t)ne2, (size_t)ne3};
+    size_t local_work_size[] = {(size_t)nth, 1, 1};
 
-    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size,
+    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
 }
 
 static void ggml_cl_pad(ggml_backend_t backend, const ggml_tensor * src0, ggml_tensor * dst) {

@@ -7009,121 +8390,76 @@ static void ggml_cl_concat(ggml_backend_t backend, const ggml_tensor * src0, con
     GGML_ASSERT(dst->type == GGML_TYPE_F32);
 
     ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
-    cl_command_queue queue = backend_ctx->queue;
 
-
-
-
-
+    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
+    ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
+    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
+
+    cl_ulong offset0 = extra0->offset + src0->view_offs;
+    cl_ulong offset1 = extra1->offset + src1->view_offs;
+    cl_ulong offsetd = extrad->offset + dst->view_offs;
+
+    const int ne00 = src0->ne[0];
+    const int ne01 = src0->ne[1];
+    const int ne02 = src0->ne[2];
+    const int ne03 = src0->ne[3];
+
+    const cl_ulong nb00 = src0->nb[0];
+    const cl_ulong nb01 = src0->nb[1];
+    const cl_ulong nb02 = src0->nb[2];
+    const cl_ulong nb03 = src0->nb[3];
+
+    const cl_ulong nb10 = src1->nb[0];
+    const cl_ulong nb11 = src1->nb[1];
+    const cl_ulong nb12 = src1->nb[2];
+    const cl_ulong nb13 = src1->nb[3];
 
-
-
-
+    const int ne0 = dst->ne[0];
+    const int ne1 = dst->ne[1];
+    const int ne2 = dst->ne[2];
+    const int ne3 = dst->ne[3];
 
-    cl_ulong
-    cl_ulong
-    cl_ulong
+    const cl_ulong nb0 = dst->nb[0];
+    const cl_ulong nb1 = dst->nb[1];
+    const cl_ulong nb2 = dst->nb[2];
+    const cl_ulong nb3 = dst->nb[3];
 
-    const
+    const cl_int dim = ((const int32_t *) dst->op_params)[0];
     GGML_ASSERT(dim >= 0 && dim <= 3);
 
-
-    if (dim == 3) {
+    int nth = MIN(64, ne0);
 
-
-        size_t nbytes_src1 = ggml_nbytes(src1);
+    cl_kernel kernel = backend_ctx->kernel_concat_f32;
 
-
-
-
-
-
+    CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+    CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
+    CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
+    CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00));
+    CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01));
+    CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02));
+    CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne03));
+    CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb00));
+    CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb01));
+    CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb02));
+    CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb03));
+    CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb10));
+    CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &nb11));
+    CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb12));
+    CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb13));
+    CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &ne0));
+    CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb0));
+    CL_CHECK(clSetKernelArg(kernel, 20, sizeof(cl_ulong), &nb1));
+    CL_CHECK(clSetKernelArg(kernel, 21, sizeof(cl_ulong), &nb2));
+    CL_CHECK(clSetKernelArg(kernel, 22, sizeof(cl_ulong), &nb3));
+    CL_CHECK(clSetKernelArg(kernel, 23, sizeof(cl_int), &dim));
+
+    size_t global_work_size[] = {(size_t)ne1*nth, (size_t)ne2, (size_t)ne3};
+    size_t local_work_size[] = {(size_t)nth, 1, 1};
 
-
-        size_t global_work_size[3];
-
-        for (int i3 = 0; i3 < dst->ne[3]; ++i3) {
-            cl_ulong current_off_src0 = off_src0 + (i3 * src0->nb[3]);
-            cl_ulong current_off_src1 = off_src1 + (i3 * src1->nb[3]);
-            cl_ulong current_off_dst  = off_dst  + (i3 * dst->nb[3]);
-
-            int d_ne00 = src0->ne[0]; int d_ne01 = src0->ne[1]; int d_ne02 = src0->ne[2];
-            int d_ne10 = src1->ne[0]; int d_ne11 = src1->ne[1]; int d_ne12 = src1->ne[2];
-            int d_ne0 = dst->ne[0]; int d_ne1 = dst->ne[1]; int d_ne2 = dst->ne[2];
-
-            CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_cl->data_device));
-            CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &current_off_src0));
-            CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1_cl->data_device));
-            CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &current_off_src1));
-            CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad_cl->data_device));
-            CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &current_off_dst));
-            CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &d_ne00));
-            CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &d_ne01));
-            CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &d_ne02));
-            CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &d_ne10));
-            CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &d_ne11));
-            CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &d_ne12));
-            CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &d_ne0));
-            CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &d_ne1));
-            CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &d_ne2));
-            CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &dim));
-
-            global_work_size[0] = d_ne0;
-            global_work_size[1] = d_ne1;
-            global_work_size[2] = d_ne2;
-
-            backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, NULL, dst);
-        }
-    }
-    } else {
-        cl_kernel kernel = backend_ctx->kernel_concat_f32_non_contiguous;
-
-        cl_long ne00 = src0->ne[0], ne01 = src0->ne[1], ne02 = src0->ne[2], ne03 = src0->ne[3];
-        cl_ulong nb00 = src0->nb[0], nb01 = src0->nb[1], nb02 = src0->nb[2], nb03 = src0->nb[3];
-
-        cl_ulong nb10 = src1->nb[0], nb11 = src1->nb[1], nb12 = src1->nb[2], nb13 = src1->nb[3];
-
-        cl_long d_ne0 = dst->ne[0], d_ne1 = dst->ne[1], d_ne2 = dst->ne[2], d_ne3 = dst->ne[3];
-        cl_ulong d_nb0 = dst->nb[0], d_nb1 = dst->nb[1], d_nb2 = dst->nb[2], d_nb3 = dst->nb[3];
-
-
-        CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_cl->data_device));
-        CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &off_src0));
-        CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1_cl->data_device));
-        CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &off_src1));
-        CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad_cl->data_device));
-        CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &off_dst));
-
-        CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_long), &ne00));
-        CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_long), &ne01));
-        CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_long), &ne02));
-        CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_long), &ne03));
-        CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb00));
-        CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb01));
-        CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb02));
-        CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb03));
-
-        CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb10));
-        CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &nb11));
-        CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb12));
-        CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb13));
-
-        CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_long), &d_ne0));
-        CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_long), &d_ne1));
-        CL_CHECK(clSetKernelArg(kernel, 20, sizeof(cl_long), &d_ne2));
-        CL_CHECK(clSetKernelArg(kernel, 21, sizeof(cl_long), &d_ne3));
-        CL_CHECK(clSetKernelArg(kernel, 22, sizeof(cl_ulong), &d_nb0));
-        CL_CHECK(clSetKernelArg(kernel, 23, sizeof(cl_ulong), &d_nb1));
-        CL_CHECK(clSetKernelArg(kernel, 24, sizeof(cl_ulong), &d_nb2));
-        CL_CHECK(clSetKernelArg(kernel, 25, sizeof(cl_ulong), &d_nb3));
-        CL_CHECK(clSetKernelArg(kernel, 26, sizeof(int), &dim));
-
-        size_t global_work_size_nc[] = { d_ne1 > 0 ? (size_t)d_ne1 : 1,
-                                         d_ne2 > 0 ? (size_t)d_ne2 : 1,
-                                         d_ne3 > 0 ? (size_t)d_ne3 : 1 };
-
-        backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size_nc, NULL, dst);
-    }
+    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
 }
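Editor's note: the rewritten concat path reads its only parameter, the concatenation axis, from the destination tensor's op_params array and clamps the workgroup width to the row length. A minimal sketch of that parameter handling follows; the tensor struct here is a simplified stand-in for ggml's, where op_params is a fixed int32 array attached to each op.

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>

    struct fake_tensor {              // simplified stand-in for ggml_tensor
        int64_t ne[4];
        int32_t op_params[16];
    };

    int main() {
        fake_tensor dst = {};
        dst.ne[0] = 100;
        dst.op_params[0] = 2;         // concat along dim 2

        const int32_t dim = dst.op_params[0];
        const int nth = std::min<int>(64, (int)dst.ne[0]); // workgroup width clamp

        printf("dim=%d nth=%d\n", (int)dim, nth);
        return (dim >= 0 && dim <= 3) ? 0 : 1; // mirrors the GGML_ASSERT range check
    }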
 
 static void ggml_cl_timestep_embedding(ggml_backend_t backend, const ggml_tensor * src0, ggml_tensor * dst) {

@@ -7496,82 +8832,503 @@ static void ggml_cl_mul_mat_kq_kqv_adreno(ggml_backend_t backend, const ggml_ten
         region.size = nb02 * ne02;
     }
 
-    A_sub_buffer = clCreateSubBuffer((extra0->data_device), 0, CL_BUFFER_CREATE_TYPE_REGION, &region, &status);
+    A_sub_buffer = clCreateSubBuffer((extra0->data_device), 0, CL_BUFFER_CREATE_TYPE_REGION, &region, &status);
+    CL_CHECK(status);
+
+    // <--------------------------------------------> //
+
+    // create sub-buffer for B
+    // <--------------------------------------------> //
+    region.origin = (extra1->offset);
+    region.size = nb10 * ne10 * ne11 * ne12;
+    B_sub_buffer = clCreateSubBuffer((extra1->data_device), 0, CL_BUFFER_CREATE_TYPE_REGION, &region, &status);
+    CL_CHECK(status);
+    // <--------------------------------------------> //
+
+    img_fmt_1d = {CL_RGBA, CL_FLOAT};
+    memset(&img_desc_1d, 0, sizeof(img_desc_1d));
+    img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
+    if (nb01 > nb02) {
+        img_desc_1d.image_width = (nb01 * ne01 / 4)/4;
+    }
+    else {
+        img_desc_1d.image_width = (nb02 * ne02 / 4)/4;
+    }
+    img_desc_1d.buffer = A_sub_buffer;
+    A_image1d = clCreateImage(context, CL_MEM_READ_ONLY, &img_fmt_1d, &img_desc_1d, NULL, &status);
+    CL_CHECK(status);
+
+    // create sub-buffer for output C
+    // <--------------------------------------------> //
+    region.origin = (extrad->offset);
+    region.size = ne0 * ne1 * dst->ne[2] * dst->nb[0]; // size of C in bytes
+    D_sub_buffer = clCreateSubBuffer((extrad->data_device), 0, CL_BUFFER_CREATE_TYPE_REGION, &region, &status);
+    CL_CHECK(status);
+    // <--------------------------------------------> //
+
+    // create image for C output
+    // <--------------------------------------------> //
+    img_fmt_1d = {CL_R, CL_FLOAT};
+    memset(&img_desc_1d, 0, sizeof(img_desc_1d));
+    img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
+    img_desc_1d.image_width = ne0 * ne1 * dst->ne[2] * dst->nb[0] / 4;
+    img_desc_1d.buffer = D_sub_buffer;
+    D_image1d = clCreateImage(context, CL_MEM_WRITE_ONLY, &img_fmt_1d, &img_desc_1d, NULL, &status);
+    CL_CHECK(status);
+    // <--------------------------------------------> //
+
+    int offset_src0 = 0;
+    int offset_src1 = 0;
+
+    // set kernel args
+    // <--------------------------------------------> //
+    cl_uint k_arg = 0;
+    CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_mem), &A_image1d));
+    CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &offset_src0));
+    CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_mem), &B_sub_buffer));
+    CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &offset_src1));
+    CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_mem), &D_image1d));
+    CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &extrad->offset));
+    CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &M));
+    CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &K));
+    CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &N));
+    CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne02));
+    CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne12));
+    CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &nb01));
+
+    size_t global_work_size[3] = {64, static_cast<size_t>(((M+63)/64)), static_cast<size_t>(((N+31)/32)*ne12)};
+    size_t local_work_size[3] = {64, 1, 2};
+
+    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
+
+    // deallocate sub buffers and images
+    // <--------------------------------------------> //
+    CL_CHECK(clReleaseMemObject(A_image1d));
+    CL_CHECK(clReleaseMemObject(D_image1d));
+    CL_CHECK(clReleaseMemObject(A_sub_buffer));
+    CL_CHECK(clReleaseMemObject(B_sub_buffer));
+    CL_CHECK(clReleaseMemObject(D_sub_buffer));
+}
+
+static void ggml_cl_mul_mat_q4_1_f32_adreno(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
+    GGML_ASSERT(src0);
+    GGML_ASSERT(src0->extra);
+    GGML_ASSERT(src1);
+    GGML_ASSERT(src1->extra);
+    GGML_ASSERT(dst);
+    GGML_ASSERT(dst->extra);
+
+    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+
+    ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
+    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
+    ggml_tensor_extra_cl_q4_1 * extra0_q4_1 = (ggml_tensor_extra_cl_q4_1 *)src0->extra;
+
+    cl_ulong offset1 = extra1->offset + src1->view_offs;
+    cl_ulong offsetd = extrad->offset + dst->view_offs;
+
+    const int ne00 = src0->ne[0];
+    const int ne01 = src0->ne[1];
+
+    const int ne1 = dst->ne[1];
+
+    GGML_ASSERT(ne00 % ggml_blck_size(src0->type) == 0);
+
+    cl_context context = backend_ctx->context;
+    cl_kernel kernel;
+
+    cl_int err;
+    cl_image_format img_fmt;
+    cl_image_desc img_desc;
+    cl_buffer_region region;
+
+    int M = ne01;
+    int N = ne1;
+    int K = ne00;
+
+    if (ne1 == 1) {
+        cl_mem q_img = nullptr;
+        cl_mem b_sub_buf = nullptr;
+        cl_mem b_img = nullptr;
+
+        // image for q
+        img_fmt = { CL_R, CL_UNSIGNED_INT32};
+        memset(&img_desc, 0, sizeof(img_desc));
+        img_desc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
+        img_desc.image_width = M * K / 2 / 4;
+        img_desc.buffer = extra0_q4_1->q;
+        CL_CHECK((q_img = clCreateImage(context, CL_MEM_READ_ONLY, &img_fmt, &img_desc, NULL, &err), err));
+
+        // subbuffer for activations
+        region.origin = offset1;
+        region.size = K * N * sizeof(float);
+        CL_CHECK((b_sub_buf = clCreateSubBuffer(extra1->data_device, 0, CL_BUFFER_CREATE_TYPE_REGION, &region, &err), err));
+
+        // image for activations
+        img_fmt = {CL_RGBA, CL_FLOAT};
+        memset(&img_desc, 0, sizeof(img_desc));
+        img_desc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
+        img_desc.image_width = K * N / 4;
+        img_desc.buffer = b_sub_buf;
+        CL_CHECK((b_img = clCreateImage(context, CL_MEM_READ_ONLY, &img_fmt, &img_desc, NULL, &err), err));
+
+        kernel = backend_ctx->kernel_gemv_noshuffle_q4_1_f32;
+
+        CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &q_img));
+        CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_q4_1->d));
+        CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra0_q4_1->m));
+        CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &b_img));
+        CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
+        CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
+        CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_int), &ne00));
+        CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_int), &ne01));
+
+        size_t local_work_size[3] = {64, 4, 1};
+        size_t global_work_size[3] = {(size_t)CEIL_DIV(ne01/2, 64)*64, 4, 1};
+
+        backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
+
+        CL_CHECK(clReleaseMemObject(q_img));
+        CL_CHECK(clReleaseMemObject(b_sub_buf));
+        CL_CHECK(clReleaseMemObject(b_img));
+    } else {
+        cl_mem b_sub_buf = nullptr;
+        cl_mem b_sub_buf_trans = nullptr;
+        cl_mem b_img = nullptr;
+        cl_mem b_img_trans = nullptr;
+
+        // subbuffer for activations
+        region.origin = offset1;
+        region.size = K * N * sizeof(float);
+        CL_CHECK((b_sub_buf = clCreateSubBuffer(extra1->data_device, 0, CL_BUFFER_CREATE_TYPE_REGION, &region, &err), err));
+
+        // image for activations
+        img_fmt = {CL_RGBA, CL_FLOAT};
+        memset(&img_desc, 0, sizeof(img_desc));
+        img_desc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
+        img_desc.image_width = K * N / 4;
+        img_desc.buffer = b_sub_buf;
+        CL_CHECK((b_img = clCreateImage(context, CL_MEM_READ_ONLY, &img_fmt, &img_desc, NULL, &err), err));
+
+        // pad N to multiple of 8
+        int extra_elements = N % 8;
+        int padding = 0;
+        if (extra_elements > 0){
+            padding = 8 - extra_elements;
+        }
+
+        // subbuffer for transposed activations
+        region.origin = 0;
+        region.size = K * (N + padding) * sizeof(float)/2;
+        backend_ctx->prealloc_act_trans.allocate(context, region.size);
+        CL_CHECK((b_sub_buf_trans = clCreateSubBuffer(backend_ctx->prealloc_act_trans.buffer, 0, CL_BUFFER_CREATE_TYPE_REGION, &region, &err), err));
+
+        // image for transposed activations
+        img_fmt = {CL_RGBA, CL_HALF_FLOAT};
+        memset(&img_desc, 0, sizeof(img_desc));
+        img_desc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
+        img_desc.image_width = K * (N + padding) / 4;
+        img_desc.buffer = b_sub_buf_trans;
+        CL_CHECK((b_img_trans = clCreateImage(context, 0, &img_fmt, &img_desc, NULL, &err), err));
+
+        // transpose activations
+        int height_B = N/4;
+        if (height_B == 0) {
+            height_B = 1;
+        }
+        int width_B = K/4;
+        int padded_height_B = (N + padding)/4;
+
+        kernel = backend_ctx->kernel_transpose_32_16;
+        CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &b_img));
+        CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &b_img_trans));
+        CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), &height_B));
+        CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &width_B));
+        CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &padded_height_B));
+
+        size_t local_work_size_t[2] = { 1, 16 };
+        size_t global_work_size_t[2] = { (size_t)width_B, (size_t)padded_height_B };
+        backend_ctx->enqueue_ndrange_kernel(kernel, 2, global_work_size_t, local_work_size_t, dst);
+
+        // gemm
+        kernel = backend_ctx->kernel_gemm_noshuffle_q4_1_f32;
+        int padded_N = N + padding;
+
+        CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_q4_1->q));
+        CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_q4_1->d));
+        CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra0_q4_1->m));
+        CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &b_img_trans));
+        CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
+        CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
+        CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_int), &ne01));
+        CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_int), &padded_N));
+        CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_int), &ne00));
+        CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_int), &ne1));
+
+        size_t global_work_size[3] = {(size_t)CEIL_DIV(ne1, 8), (size_t)CEIL_DIV(ne01, 4), 1};
+        size_t local_work_size[3] = {1, 128, 1};
+
+        backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
+
+        CL_CHECK(clReleaseMemObject(b_sub_buf));
+        CL_CHECK(clReleaseMemObject(b_sub_buf_trans));
+        CL_CHECK(clReleaseMemObject(b_img));
+        CL_CHECK(clReleaseMemObject(b_img_trans));
+    }
+#else
+    GGML_UNUSED(backend);
+    GGML_UNUSED(src0);
+    GGML_UNUSED(src1);
+    GGML_UNUSED(dst);
+#endif
+}
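Editor's note: the Adreno q4_1 path above leans on two rounding idioms: padding the activation count N up to a multiple of 8 before the transpose/GEMM, and rounding a global work size up to a multiple of the workgroup width via the CEIL_DIV macro seen in the diff. A standalone sketch of both computations:

    #include <cstdio>

    #define CEIL_DIV(a, b) (((a) + (b) - 1) / (b))  // mirrors the macro in the diff

    int main() {
        // pad N to the next multiple of 8, as the transposed-activation path does
        int N = 13;
        int extra_elements = N % 8;
        int padding = extra_elements > 0 ? 8 - extra_elements : 0;
        printf("N=%d padded to %d\n", N, N + padding);      // 13 -> 16

        // round the gemv global size up to a multiple of the 64-wide workgroup
        int ne01 = 1000;
        size_t gws0 = (size_t)CEIL_DIV(ne01 / 2, 64) * 64;  // 500 -> 512
        printf("gemv global size = %zu\n", gws0);
    }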
9086
|
+
|
|
9087
|
+
static void ggml_cl_mul_mat_q8_0_f32_adreno(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
|
9088
|
+
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
|
|
9089
|
+
GGML_ASSERT(src0);
|
|
9090
|
+
GGML_ASSERT(src0->extra);
|
|
9091
|
+
GGML_ASSERT(src1);
|
|
9092
|
+
GGML_ASSERT(src1->extra);
|
|
9093
|
+
GGML_ASSERT(dst);
|
|
9094
|
+
GGML_ASSERT(dst->extra);
|
|
9095
|
+
|
|
9096
|
+
const enum ggml_type src0t = src0->type;
|
|
9097
|
+
const enum ggml_type src1t = src1->type;
|
|
9098
|
+
|
|
9099
|
+
GGML_ASSERT(src0t == GGML_TYPE_Q8_0);
|
|
9100
|
+
GGML_ASSERT(src1t == GGML_TYPE_F32);
|
|
9101
|
+
|
|
9102
|
+
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
|
9103
|
+
|
|
9104
|
+
ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
|
|
9105
|
+
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
|
|
9106
|
+
|
|
9107
|
+
ggml_tensor_extra_cl_q8_0 * extra0_q8_0 = (ggml_tensor_extra_cl_q8_0 *)src0->extra;
|
|
9108
|
+
|
|
9109
|
+
GGML_ASSERT(src1->view_offs == 0);
|
|
9110
|
+
GGML_ASSERT(dst->view_offs == 0);
|
|
9111
|
+
|
|
9112
|
+
const int ne00 = src0->ne[0];
|
|
9113
|
+
const int ne01 = src0->ne[1];
|
|
9114
|
+
const int ne02 = src0->ne[2];
|
|
9115
|
+
|
|
9116
|
+
const int ne10 = src1->ne[0];
|
|
9117
|
+
const int ne12 = src1->ne[2];
|
|
9118
|
+
|
|
9119
|
+
const int ne0 = dst->ne[0];
|
|
9120
|
+
const int ne1 = dst->ne[1];
|
|
9121
|
+
|
|
9122
|
+
GGML_ASSERT(ne00 == ne10);
|
|
9123
|
+
GGML_ASSERT((ne00 % 32) == 0);
|
|
9124
|
+
GGML_ASSERT(ne0 == ne01);
|
|
9125
|
+
|
|
9126
|
+
cl_context context = backend_ctx->context;
|
|
9127
|
+
cl_kernel kernel;
|
|
9128
|
+
|
|
9129
|
+
// init CL objects
|
|
9130
|
+
cl_int status;
|
|
9131
|
+
cl_image_format img_fmt_1d;
|
|
9132
|
+
cl_image_desc img_desc_1d;
|
|
9133
|
+
cl_buffer_region region;
|
|
9134
|
+
cl_mem A_image1d;
|
|
9135
|
+
cl_mem B_image1d;
|
|
9136
|
+
cl_mem B_sub_buffer;
|
|
9137
|
+
cl_mem S_image1d;
|
|
9138
|
+
|
|
9139
|
+
cl_mem D_image1d;
|
|
9140
|
+
cl_mem D_sub_buffer;
|
|
9141
|
+
|
|
9142
|
+
int M = ne01;
|
|
9143
|
+
int N = ne1;
|
|
9144
|
+
int K = ne00;
|
|
9145
|
+
|
|
9146
|
+
// create an image for A
|
|
9147
|
+
img_fmt_1d = { CL_R, CL_FLOAT};
|
|
9148
|
+
memset(&img_desc_1d, 0, sizeof(img_desc_1d));
|
|
9149
|
+
img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
|
|
9150
|
+
img_desc_1d.image_width = M * K / 4; // Divide by 4 for char -> float
|
|
9151
|
+
img_desc_1d.buffer = extra0_q8_0->q;
|
|
9152
|
+
A_image1d = clCreateImage(context, CL_MEM_READ_ONLY, &img_fmt_1d, &img_desc_1d, NULL, &status);
|
|
7500
9153
|
CL_CHECK(status);
|
|
7501
9154
|
|
|
7502
|
-
//
|
|
9155
|
+
// create an image for Scale
|
|
9156
|
+
img_fmt_1d = { CL_R, CL_HALF_FLOAT};
|
|
9157
|
+
memset(&img_desc_1d, 0, sizeof(img_desc_1d));
|
|
9158
|
+
img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
|
|
9159
|
+
img_desc_1d.image_width = M * K / 32; // Block size is 32
|
|
9160
|
+
img_desc_1d.buffer = extra0_q8_0->d;
|
|
9161
|
+
S_image1d = clCreateImage(context, CL_MEM_READ_ONLY, &img_fmt_1d, &img_desc_1d, NULL, &status);
|
|
9162
|
+
CL_CHECK(status);
|
|
7503
9163
|
|
|
7504
|
-
// create
|
|
7505
|
-
//
|
|
7506
|
-
region.
|
|
7507
|
-
region.size = nb10 * ne10 * ne11 * ne12;
|
|
9164
|
+
// create a sub_buffer for B
|
|
9165
|
+
region.origin = (extra1->offset); // + src1->view_offs);
|
|
9166
|
+
region.size = K * N * sizeof(float);
|
|
7508
9167
|
B_sub_buffer = clCreateSubBuffer((extra1->data_device), 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &status);
|
|
7509
9168
|
CL_CHECK(status);
|
|
7510
|
-
// <--------------------------------------------> //
|
|
7511
9169
|
|
|
9170
|
+
// create an image for B from sub_buffer: RGBA (OCL)
|
|
7512
9171
|
img_fmt_1d = {CL_RGBA, CL_FLOAT};
|
|
7513
9172
|
memset(&img_desc_1d, 0, sizeof(img_desc_1d));
|
|
7514
9173
|
img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
|
|
7515
|
-
|
|
7516
|
-
|
|
7517
|
-
|
|
7518
|
-
else {
|
|
7519
|
-
img_desc_1d.image_width = (nb02 * ne02 / 4)/4;
|
|
7520
|
-
}
|
|
7521
|
-
img_desc_1d.buffer = A_sub_buffer;
|
|
7522
|
-
A_image1d = clCreateImage(context, CL_MEM_READ_ONLY, &img_fmt_1d, &img_desc_1d, NULL, &status);
|
|
9174
|
+
img_desc_1d.image_width = K * N / 4;
|
|
9175
|
+
img_desc_1d.buffer = B_sub_buffer;
|
|
9176
|
+
B_image1d = clCreateImage(context, CL_MEM_READ_ONLY, &img_fmt_1d, &img_desc_1d, NULL, &status);
|
|
7523
9177
|
CL_CHECK(status);
|
|
7524
9178
|
|
|
7525
|
-
//
|
|
7526
|
-
//
|
|
7527
|
-
region.
|
|
7528
|
-
region.size = ne0 * ne1 * dst->ne[2] * dst->nb[0]; // size of C in bytes
|
|
9179
|
+
// Create subbuffer and image1d_buffer for dst
|
|
9180
|
+
region.origin = (extrad->offset); // + dst->view_offs;
|
|
9181
|
+
region.size = M * N * sizeof(float);
|
|
7529
9182
|
D_sub_buffer = clCreateSubBuffer((extrad->data_device), 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &status);
|
|
7530
9183
|
CL_CHECK(status);
|
|
7531
|
-
// <--------------------------------------------> //
|
|
7532
9184
|
|
|
7533
|
-
// create image for C output
|
|
7534
|
-
// <--------------------------------------------> //
|
|
7535
9185
|
img_fmt_1d = {CL_R, CL_FLOAT};
|
|
7536
9186
|
memset(&img_desc_1d, 0, sizeof(img_desc_1d));
|
|
7537
9187
|
img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
|
|
7538
|
-
img_desc_1d.image_width =
|
|
9188
|
+
img_desc_1d.image_width = M * N;
|
|
7539
9189
|
img_desc_1d.buffer = D_sub_buffer;
|
|
7540
9190
|
D_image1d = clCreateImage(context, CL_MEM_WRITE_ONLY, &img_fmt_1d, &img_desc_1d, NULL, &status);
|
|
7541
9191
|
CL_CHECK(status);
|
|
7542
|
-
// <--------------------------------------------> //
|
|
7543
9192
|
|
|
7544
|
-
|
|
7545
|
-
|
|
9193
|
+
size_t local_work_size[3] = {1, 1, 1};
|
|
9194
|
+
size_t global_work_size[3] = {1, 1, 1};
|
|
7546
9195
|
|
|
7547
|
-
|
|
7548
|
-
|
|
7549
|
-
cl_uint k_arg = 0;
|
|
7550
|
-
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_mem), &A_image1d));
|
|
7551
|
-
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &offset_src0));
|
|
7552
|
-
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_mem), &B_sub_buffer));
|
|
7553
|
-
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &offset_src1));
|
|
7554
|
-
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_mem), &D_image1d));
|
|
7555
|
-
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &extrad->offset));
|
|
7556
|
-
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &M));
|
|
7557
|
-
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &K));
|
|
7558
|
-
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &N));
|
|
7559
|
-
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne02));
|
|
7560
|
-
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne12));
|
|
7561
|
-
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &nb01));
|
|
9196
|
+
if (N == 1) {
|
|
9197
|
+
kernel = backend_ctx->CL_mul_mat_vec_q8_0_f32;
|
|
7562
9198
|
|
|
7563
|
-
|
|
7564
|
-
|
|
9199
|
+
int r2 = 1;
|
|
9200
|
+
int r3 = 1;
|
|
9201
|
+
cl_uint k_arg = 0;
|
|
9202
|
+
|
|
9203
|
+
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_mem), &A_image1d));
|
|
9204
|
+
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_mem), &extra0_q8_0->d));
|
|
9205
|
+
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_mem), &B_image1d));
|
|
9206
|
+
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_ulong), &extra1->offset));
|
|
9207
|
+
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_mem), &extrad->data_device));
|
|
9208
|
+
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_ulong), &extrad->offset));
|
|
9209
|
+
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne00));
|
|
9210
|
+
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne01));
|
|
9211
|
+
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne02));
|
|
9212
|
+
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne10));
|
|
9213
|
+
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne12));
|
|
9214
|
+
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne0));
|
|
9215
|
+
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne1));
|
|
9216
|
+
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &r2));
|
|
9217
|
+
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &r3));
|
|
9218
|
+
|
|
9219
|
+
size_t wavesize = backend_ctx->adreno_wave_size;
|
|
9220
|
+
local_work_size[0] = wavesize;
|
|
9221
|
+
local_work_size[1] = 4; // reduce factor
|
|
9222
|
+
local_work_size[2] = 1;
|
|
9223
|
+
|
|
9224
|
+
global_work_size[0] = ((M + wavesize - 1) / wavesize) * wavesize;
|
|
9225
|
+
global_work_size[1] = 4; // reduce factor
|
|
9226
|
+
global_work_size[2] = 1;
|
|
9227
|
+
} else {
|
|
9228
|
+
cl_ulong offsetd = extrad->offset + dst->view_offs;
|
|
9229
|
+
cl_mem B_image1d_trans = nullptr;
|
|
9230
|
+
// for B transpose
|
|
9231
|
+
cl_mem B_d = nullptr;
|
|
9232
|
+
int padding;
|
|
9233
|
+
|
|
9234
|
+
//how many extra elements beyond multiple of 8
|
|
9235
|
+
int extra_elements = N % 8;
|
|
9236
|
+
|
|
9237
|
+
//how much padding to add
|
|
9238
|
+
padding = 0;
|
|
9239
|
+
if (extra_elements > 0){
|
|
9240
|
+
padding = 8 - extra_elements;
|
|
9241
|
+
}
|
|
9242
|
+
|
|
9243
|
+
// Specify the starting offset (in bytes)
|
|
9244
|
+
region.origin = 0;
|
|
9245
|
+
// Specify the size of the sub-buffer (divide by 2 for FP16)
|
|
9246
|
+
region.size = K * (N + padding) * sizeof(float)/2;
|
|
9247
|
+
backend_ctx->prealloc_act_trans.allocate(context, region.size);
|
|
9248
|
+
B_d = clCreateSubBuffer(
|
|
9249
|
+
backend_ctx->prealloc_act_trans.buffer,
|
|
9250
|
+
0,
|
|
9251
|
+
CL_BUFFER_CREATE_TYPE_REGION,
|
|
9252
|
+
®ion,
|
|
9253
|
+
&status);
|
|
9254
|
+
CL_CHECK(status);
|
|
9255
|
+
|
|
9256
|
+
cl_image_format image_format_B_d_output = { CL_RGBA, CL_HALF_FLOAT }; //(CL_HALF_FLOAT for FP16)
|
|
9257
|
+
cl_image_desc image_desc_B_d_output = {
|
|
9258
|
+
CL_MEM_OBJECT_IMAGE1D_BUFFER,
|
|
9259
|
+
static_cast<size_t>(K * (N + padding)/4),
|
|
9260
|
+
0, 0, 0, 0, 0, 0, 0, { B_d }
|
|
9261
|
+
};
|
|
9262
|
+
B_image1d_trans = clCreateImage(
|
|
9263
|
+
context,
|
|
9264
|
+
0,
|
|
9265
|
+
&image_format_B_d_output,
|
|
9266
|
+
&image_desc_B_d_output,
|
|
9267
|
+
NULL,
|
|
9268
|
+
&status);
|
|
9269
|
+
CL_CHECK(status);
|
|
9270
|
+
|
|
9271
|
+
int height_B = N/4;
|
|
9272
|
+
if (height_B == 0) {
|
|
9273
|
+
height_B = 1;
|
|
9274
|
+
}
|
|
9275
|
+
int width_B = K/4;
|
|
9276
|
+
int padded_height_B = (N + padding)/4;
|
|
9277
|
+
|
|
9278
|
+
kernel = backend_ctx->kernel_transpose_32_16;
|
|
9279
|
+
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &B_image1d));
|
|
9280
|
+
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &B_image1d_trans));
|
|
9281
|
+
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), &height_B));
|
|
9282
|
+
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &width_B));
|
|
9283
|
+
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &padded_height_B));
|
|
9284
|
+
|
|
9285
|
+
size_t local_size_t[2] = { 1, 16 };
|
|
9286
|
+
size_t global_size_t[2] = {
|
|
9287
|
+
static_cast<size_t>(width_B),
|
|
9288
|
+
static_cast<size_t>(padded_height_B)
|
|
9289
|
+
};
|
|
9290
|
+
|
|
9291
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 2, global_size_t, local_size_t, dst);
|
|
9292
|
+
|
|
9293
|
+
kernel = backend_ctx->kernel_mul_mm_q8_0_f32_8x4;
|
|
9294
|
+
|
|
9295
|
+
int N_with_padding = N + padding;
|
|
9296
|
+
|
|
9297
|
+
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_q8_0->q));
|
|
9298
|
+
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_q8_0->d));
|
|
9299
|
+
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &B_image1d_trans));
|
|
9300
|
+
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extrad->data_device));
|
|
9301
|
+
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &K));
|
|
9302
|
+
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &M));
|
|
9303
|
+
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &N_with_padding));
|
|
9304
|
+
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &N));
|
|
9305
|
+
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &offsetd));
|
|
9306
|
+
|
|
9307
|
+
global_work_size[0] = (size_t)(N + 7) / 8;
|
|
9308
|
+
global_work_size[1] = (size_t)(M + 3) / 4;
|
|
9309
|
+
global_work_size[2] = 1;
|
|
7565
9310
|
|
|
9311
|
+
local_work_size[0] = 2;
|
|
9312
|
+
local_work_size[1] = 128;
|
|
9313
|
+
local_work_size[2] = 1;
|
|
9314
|
+
}
|
|
9315
|
+
|
|
9316
|
+
// enqueue kernel with profiling
|
|
7566
9317
|
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
|
7567
9318
|
|
|
7568
9319
|
// deallocate sub buffers and images
|
|
7569
|
-
// <--------------------------------------------> //
|
|
7570
9320
|
CL_CHECK(clReleaseMemObject(A_image1d));
|
|
7571
|
-
CL_CHECK(clReleaseMemObject(D_image1d));
|
|
7572
|
-
CL_CHECK(clReleaseMemObject(A_sub_buffer));
|
|
7573
9321
|
CL_CHECK(clReleaseMemObject(B_sub_buffer));
|
|
9322
|
+
CL_CHECK(clReleaseMemObject(B_image1d));
|
|
9323
|
+
CL_CHECK(clReleaseMemObject(S_image1d));
|
|
7574
9324
|
CL_CHECK(clReleaseMemObject(D_sub_buffer));
|
|
9325
|
+
CL_CHECK(clReleaseMemObject(D_image1d));
|
|
9326
|
+
#else
|
|
9327
|
+
GGML_UNUSED(backend);
|
|
9328
|
+
GGML_UNUSED(src0);
|
|
9329
|
+
GGML_UNUSED(src1);
|
|
9330
|
+
GGML_UNUSED(dst);
|
|
9331
|
+
#endif
|
|
7575
9332
|
}
|
|
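Editor's note: in the q8_0 path above, the N == 1 (matrix-vector) branch sizes its dispatch as one work item per output row M, rounded up to the device wave size, with a fixed reduce factor of 4 in the second dimension. A sketch of that shape computation; the wave size of 64 is an illustrative value, the backend reads it from `adreno_wave_size`.

    #include <cstdio>

    int main() {
        const size_t wavesize = 64;   // stand-in for backend_ctx->adreno_wave_size
        const size_t M = 1000;        // rows of the weight matrix

        size_t global[3], local[3];
        local[0]  = wavesize;
        local[1]  = 4;                // reduce factor
        local[2]  = 1;
        // round M up to a whole number of waves: 1000 -> 1024
        global[0] = ((M + wavesize - 1) / wavesize) * wavesize;
        global[1] = 4;
        global[2] = 1;

        printf("global = {%zu, %zu, %zu}\n", global[0], global[1], global[2]);
    }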
7576
9333
|
|
|
7577
9334
|
static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
|
@@ -7597,8 +9354,10 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
|
|
|
7597
9354
|
|
|
7598
9355
|
#ifdef GGML_OPENCL_SOA_Q
|
|
7599
9356
|
ggml_tensor_extra_cl_q4_0 * extra0_q4_0 = (ggml_tensor_extra_cl_q4_0 *)src0->extra;
|
|
9357
|
+
ggml_tensor_extra_cl_q4_1 * extra0_q4_1 = (ggml_tensor_extra_cl_q4_1 *)src0->extra;
|
|
7600
9358
|
ggml_tensor_extra_cl_mxfp4 * extra0_mxfp4 = (ggml_tensor_extra_cl_mxfp4 *)src0->extra;
|
|
7601
9359
|
ggml_tensor_extra_cl_q8_0 * extra0_q8_0 = (ggml_tensor_extra_cl_q8_0 *)src0->extra;
|
|
9360
|
+
ggml_tensor_extra_cl_q6_K * extra0_q6_K = (ggml_tensor_extra_cl_q6_K *)src0->extra;
|
|
7602
9361
|
#endif
|
|
7603
9362
|
|
|
7604
9363
|
const int ne00 = src0 ? src0->ne[0] : 0;
|
|
@@ -7641,9 +9400,12 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
|
|
|
7641
9400
|
cl_context context = backend_ctx->context;
|
|
7642
9401
|
|
|
7643
9402
|
if(src0t == GGML_TYPE_F16 && src1t == GGML_TYPE_F32){
|
|
7644
|
-
if (ne01 >= 64 && ne1 >= 32 && ne00 >= 16 && (ne12 % ne02) == 0
|
|
9403
|
+
if (ne01 >= 64 && ne1 >= 32 && ne00 >= 16 && (ne12 % ne02) == 0 &&
|
|
9404
|
+
// dst is wrapped with image1d_buffer, the size limit applies, also src0
|
|
9405
|
+
(ne0 * ne1 * dst->ne[2] * dst->nb[0] / 4 <= backend_ctx->image_max_buffer_size)) {
|
|
7645
9406
|
// For KQ
|
|
7646
9407
|
if (ggml_is_permuted(src0) && ggml_is_permuted(src1) &&
|
|
9408
|
+
((nb01 * ne01 / 4)/4 <= backend_ctx->image_max_buffer_size) &&
|
|
7647
9409
|
nb00 <= nb02 &&
|
|
7648
9410
|
nb02 <= nb01 &&
|
|
7649
9411
|
nb01 <= nb03 &&
|
|
@@ -7654,7 +9416,8 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
|
|
|
7654
9416
|
return;
|
|
7655
9417
|
}
|
|
7656
9418
|
// For KQV
|
|
7657
|
-
if (!ggml_is_contiguous(src0) && ggml_is_contiguous(src1)
|
|
9419
|
+
if (!ggml_is_contiguous(src0) && ggml_is_contiguous(src1) &&
|
|
9420
|
+
((nb02 * ne02 / 4)/4 <= backend_ctx->image_max_buffer_size)) {
|
|
7658
9421
|
ggml_cl_mul_mat_kq_kqv_adreno(backend, src0, src1, dst);
|
|
7659
9422
|
return;
|
|
7660
9423
|
}
|
|
@@ -7686,6 +9449,23 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
|
|
|
7686
9449
|
int padding;
|
|
7687
9450
|
// <--------------------------------------------> //
|
|
7688
9451
|
|
|
9452
|
+
// NOTE: Kernels using image1d_buffer_t (e.g., src0_q) would normally require
|
|
9453
|
+
// a limit check, but q4_0 / q4_1 tensors are very unlikely to exceed that
|
|
9454
|
+
// limit, so the check is omitted.
|
|
9455
|
+
|
|
9456
|
+
// q4_1 x fp32
|
|
9457
|
+
if (src0t == GGML_TYPE_Q4_1 && src1t == GGML_TYPE_F32) {
|
|
9458
|
+
ggml_cl_mul_mat_q4_1_f32_adreno(backend, src0, src1, dst);
|
|
9459
|
+
return;
|
|
9460
|
+
}
|
|
9461
|
+
|
|
9462
|
+
// q8_0 x fp32
|
|
9463
|
+
if (src0t == GGML_TYPE_Q8_0 && src1t == GGML_TYPE_F32 &&
|
|
9464
|
+
enable_adreno_trans_weight(backend_ctx, src0)) {
|
|
9465
|
+
ggml_cl_mul_mat_q8_0_f32_adreno(backend, src0, src1, dst);
|
|
9466
|
+
return;
|
|
9467
|
+
}
|
|
9468
|
+
|
|
7689
9469
|
// q4_0 x fp32
|
|
7690
9470
|
if(src0t == GGML_TYPE_Q4_0 && src1t == GGML_TYPE_F32) {
|
|
7691
9471
|
// TODO: remove duplicate definitions of image description + format -- move to top
|
|
@@ -7960,9 +9740,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
 
     // GEMM using local memory
     // Current BK = 16, so ne00 % 16 == 0
-    if (
-        ggml_is_contiguous(src1) &&
-        src1t == GGML_TYPE_F32 &&
+    if (src1t == GGML_TYPE_F32 &&
         ne00 % 16 == 0 &&
         ne11 > 1) {
         switch(src0t) {
@@ -7974,10 +9752,42 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
                 int batch_stride_b = ne10*ne11;
                 int batch_stride_d = ne0*ne1;
 
-                CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
-                CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
-                CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
-                CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
+                cl_mem mem_src0 = extra0->data_device;
+                cl_mem mem_src1 = extra1->data_device;
+
+                cl_ulong nb00_cont = nb00;
+                cl_ulong nb01_cont = nb01;
+                cl_ulong nb02_cont = nb02;
+                cl_ulong nb03_cont = nb03;
+
+                cl_ulong nb10_cont = nb10;
+                cl_ulong nb11_cont = nb11;
+                cl_ulong nb12_cont = nb12;
+                cl_ulong nb13_cont = nb13;
+
+                cl_ulong offset0_cont = offset0;
+                cl_ulong offset1_cont = offset1;
+
+                if (!ggml_is_contiguous(src0)) {
+                    backend_ctx->prealloc_src0.allocate(backend_ctx->context, ggml_nbytes(src0));
+                    ggml_cl_copy_to_contiguous(backend, src0, backend_ctx->prealloc_src0.buffer,
+                        nb00_cont, nb01_cont, nb02_cont, nb03_cont);
+                    mem_src0 = backend_ctx->prealloc_src0.buffer;
+                    offset0_cont = 0;
+                }
+
+                if (!ggml_is_contiguous(src1)) {
+                    backend_ctx->prealloc_src1.allocate(backend_ctx->context, ggml_nbytes(src1));
+                    ggml_cl_copy_to_contiguous(backend, src1, backend_ctx->prealloc_src1.buffer,
+                        nb10_cont, nb11_cont, nb12_cont, nb13_cont);
+                    mem_src1 = backend_ctx->prealloc_src1.buffer;
+                    offset1_cont = 0;
+                }
+
+                CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &mem_src0));
+                CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0_cont));
+                CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &mem_src1));
+                CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1_cont));
                 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
                 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
                 CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00));
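Instead of requiring `ggml_is_contiguous(src1)` up front, the local-memory GEMM path now packs a non-contiguous operand into a preallocated scratch buffer (`prealloc_src0` / `prealloc_src1`) via `ggml_cl_copy_to_contiguous`, then binds the kernel to the packed copy with a zero offset. A hedged CPU sketch of the packing step — walking a 4-D tensor by its byte strides and emitting elements densely:

```cpp
// Hedged CPU sketch of pack-to-contiguous: traverse src by its byte
// strides nb0..nb3 and write elements densely, so a GEMM kernel can
// assume contiguous input afterwards.
#include <cstdint>
#include <cstring>
#include <vector>

void pack_contiguous(const uint8_t * src, uint8_t * dst,
                     const int64_t ne[4],    // elements per dimension
                     const uint64_t nb[4],   // byte strides per dimension
                     size_t elem_size) {
    size_t o = 0;
    for (int64_t i3 = 0; i3 < ne[3]; ++i3)
    for (int64_t i2 = 0; i2 < ne[2]; ++i2)
    for (int64_t i1 = 0; i1 < ne[1]; ++i1)
    for (int64_t i0 = 0; i0 < ne[0]; ++i0) {
        const uint8_t * p = src + i0*nb[0] + i1*nb[1] + i2*nb[2] + i3*nb[3];
        std::memcpy(dst + o, p, elem_size);
        o += elem_size;
    }
}

int main() {
    // a 2x3 f32 view stored transposed: 12-byte stride along dim 0
    float storage[6] = {0, 1, 2, 3, 4, 5};
    const int64_t  ne[4] = {2, 3, 1, 1};
    const uint64_t nb[4] = {12, 4, 24, 24};
    std::vector<float> out(6);
    pack_contiguous(reinterpret_cast<uint8_t *>(storage),
                    reinterpret_cast<uint8_t *>(out.data()), ne, nb, 4);
    return out[1] == 3.0f ? 0 : 1;  // row-major after packing
}
```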
@@ -8009,8 +9819,82 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
                 int batch_stride_b = ne10*ne11;
                 int batch_stride_d = ne0*ne1;
 
-                CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
-                CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+                cl_mem mem_src0 = extra0->data_device;
+                cl_mem mem_src1 = extra1->data_device;
+
+                cl_ulong nb00_cont = nb00;
+                cl_ulong nb01_cont = nb01;
+                cl_ulong nb02_cont = nb02;
+                cl_ulong nb03_cont = nb03;
+
+                cl_ulong nb10_cont = nb10;
+                cl_ulong nb11_cont = nb11;
+                cl_ulong nb12_cont = nb12;
+                cl_ulong nb13_cont = nb13;
+
+                cl_ulong offset0_cont = offset0;
+                cl_ulong offset1_cont = offset1;
+
+                if (!ggml_is_contiguous(src0)) {
+                    backend_ctx->prealloc_src0.allocate(backend_ctx->context, ggml_nbytes(src0));
+                    ggml_cl_copy_to_contiguous(backend, src0, backend_ctx->prealloc_src0.buffer,
+                        nb00_cont, nb01_cont, nb02_cont, nb03_cont);
+                    mem_src0 = backend_ctx->prealloc_src0.buffer;
+                    offset0_cont = 0;
+                }
+
+                if (!ggml_is_contiguous(src1)) {
+                    backend_ctx->prealloc_src1.allocate(backend_ctx->context, ggml_nbytes(src1));
+                    ggml_cl_copy_to_contiguous(backend, src1, backend_ctx->prealloc_src1.buffer,
+                        nb10_cont, nb11_cont, nb12_cont, nb13_cont);
+                    mem_src1 = backend_ctx->prealloc_src1.buffer;
+                    offset1_cont = 0;
+                }
+
+                CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &mem_src0));
+                CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0_cont));
+                CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &mem_src1));
+                CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1_cont));
+                CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
+                CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
+                CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00));
+                CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01));
+                CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02));
+                CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne11));
+                CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne12));
+                CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne10)); // stride_a
+                CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne10)); // stride_b
+                CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne01)); // stride_d
+                CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &batch_stride_a));
+                CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &batch_stride_b));
+                CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &batch_stride_d));
+                CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &r2));
+                CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &r3));
+
+                // 64 is block tile size BM and BN - change here when BM and BN in the kernel are changed.
+                size_t global_work_size[] = {(size_t)(CEIL_DIV(ne01, 64)*nth0), (size_t)(CEIL_DIV(ne11, 64)), (size_t)ne12*ne13};
+                size_t local_work_size[] = {(size_t)nth0, 1, 1};
+
+                backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
+                return;
+            }
+            case GGML_TYPE_Q4_0: {
+                if (ne11 < 32) {
+                    break;
+                }
+                if (!ggml_is_contiguous(src0) || !ggml_is_contiguous(src1)) {
+                    break;
+                }
+
+                kernel = backend_ctx->kernel_mul_mm_q4_0_f32_l4_lm;
+                nth0 = 128; // calculated as (BM*BN)/(TM*TN)
+
+                int batch_stride_a = ne00*ne01;
+                int batch_stride_b = ne10*ne11;
+                int batch_stride_d = ne0*ne1;
+
+                CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_q4_0->q));
+                CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_q4_0->d));
                 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
                 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
                 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
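The `*_l4_lm` GEMM kernels tile the output into 64x64 blocks, one workgroup of `nth0` threads per tile, which is where the `CEIL_DIV(ne01, 64)*nth0` global size comes from. A minimal sketch of that ND-range computation (values are illustrative):

```cpp
// Hedged sketch of the tile-based ND-range sizing used by the *_l4_lm
// GEMM paths: one nth0-thread workgroup per 64x64 tile of the output.
#include <cstdio>

#define CEIL_DIV(a, b) (((a) + (b) - 1) / (b))

int main() {
    const int BM = 64, BN = 64;  // block tile sizes baked into the kernel
    const int nth0 = 128;        // threads per workgroup = (BM*BN)/(TM*TN)
    const int ne01 = 4096;       // output rows
    const int ne11 = 100;        // output columns (e.g., 100 tokens)

    size_t gws[3] = {(size_t)CEIL_DIV(ne01, BM) * nth0,
                     (size_t)CEIL_DIV(ne11, BN), 1};
    size_t lws[3] = {(size_t)nth0, 1, 1};

    // 64 tile-rows by 2 tile-columns -> 128 workgroups of 128 threads
    std::printf("groups: %zu x %zu, local: %zu\n",
                gws[0] / lws[0], gws[1], lws[0]);
    return 0;
}
```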
@@ -8036,10 +9920,57 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
                 backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
                 return;
             }
+            case GGML_TYPE_Q4_1: {
+                if (ne11 < 32) {
+                    break;
+                }
+                if (!ggml_is_contiguous(src0) || !ggml_is_contiguous(src1)) {
+                    break;
+                }
+
+                kernel = backend_ctx->kernel_mul_mm_q4_1_f32_l4_lm;
+                nth0 = 128; // calculated as (BM*BN)/(TM*TN)
+
+                int batch_stride_a = ne00*ne01;
+                int batch_stride_b = ne10*ne11;
+                int batch_stride_d = ne0*ne1;
+
+                CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_q4_1->q));
+                CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_q4_1->d));
+                CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra0_q4_1->m));
+                CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra1->data_device));
+                CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_ulong), &offset1));
+                CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_mem), &extrad->data_device));
+                CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_ulong), &offsetd));
+                CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne00));
+                CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne01));
+                CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne02));
+                CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne11));
+                CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne12));
+                CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne10)); // stride_a
+                CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne10)); // stride_b
+                CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne01)); // stride_d
+                CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &batch_stride_a));
+                CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &batch_stride_b));
+                CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &batch_stride_d));
+                CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &r2));
+                CL_CHECK(clSetKernelArg(kernel, 19, sizeof(int), &r3));
+
+                // 64 is block tile size BM and BN - change here when BM and BN in the kernel are changed.
+                size_t global_work_size[] = {(size_t)(CEIL_DIV(ne01, 64)*nth0), (size_t)(CEIL_DIV(ne11, 64)), (size_t)ne12*ne13};
+                size_t local_work_size[] = {(size_t)nth0, 1, 1};
+
+                backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
+                return;
+            }
             case GGML_TYPE_Q8_0: {
                 if (ne11 < 32) {
                     break;
                 }
+                if (!ggml_is_contiguous(src0) || !ggml_is_contiguous(src1)) {
+                    break;
+                }
+
                 kernel = backend_ctx->kernel_mul_mm_q8_0_f32_l4_lm;
                 nth0 = 128; // calculated as (BM*BN)/(TM*TN)
 
@@ -8074,6 +10005,50 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
                 backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
                 return;
             }
+            case GGML_TYPE_Q6_K: {
+                if (ne11 < 32) {
+                    break;
+                }
+                if (!ggml_is_contiguous(src0) || !ggml_is_contiguous(src1)) {
+                    break;
+                }
+
+                kernel = backend_ctx->kernel_mul_mm_q6_k_f32_l4_lm;
+                nth0 = 128; // calculated as (BM*BN)/(TM*TN)
+
+                int batch_stride_a = ne00*ne01;
+                int batch_stride_b = ne10*ne11;
+                int batch_stride_d = ne0*ne1;
+
+                CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_q6_K->ql));
+                CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_q6_K->qh));
+                CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra0_q6_K->s));
+                CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra0_q6_K->d));
+                CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extra1->data_device));
+                CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offset1));
+                CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_mem), &extrad->data_device));
+                CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &offsetd));
+                CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne00));
+                CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne01));
+                CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne02));
+                CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne11));
+                CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne12));
+                CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne10)); // stride_a
+                CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne10)); // stride_b
+                CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne01)); // stride_d
+                CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &batch_stride_a));
+                CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &batch_stride_b));
+                CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &batch_stride_d));
+                CL_CHECK(clSetKernelArg(kernel, 19, sizeof(int), &r2));
+                CL_CHECK(clSetKernelArg(kernel, 20, sizeof(int), &r3));
+
+                // 64 is block tile size BM and BN - change here when BM and BN in the kernel are changed.
+                size_t global_work_size[] = {(size_t)(CEIL_DIV(ne01, 64)*nth0), (size_t)(CEIL_DIV(ne11, 64)), (size_t)ne12*ne13};
+                size_t local_work_size[] = {(size_t)nth0, 1, 1};
+
+                backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
+                return;
+            }
             default:
                 break;
         }
@@ -8328,7 +10303,71 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
             CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &r3));
 #endif // GGML_OPENCL_SOA_Q
             break;
-        case GGML_TYPE_Q4_1:
+        case GGML_TYPE_Q4_1: {
+#ifdef GGML_OPENCL_SOA_Q
+            if (backend_ctx->gpu_family == INTEL) {
+                nth0 = 16;
+                nth1 = 1;
+                ndst = 4;
+            } else if (backend_ctx->gpu_family == ADRENO) {
+                nth0 = 64;
+                nth1 = 1;
+                ndst = 4;
+            } else {
+                GGML_ASSERT(false && "TODO: Unknown GPU");
+            }
+
+            kernel = backend_ctx->kernel_mul_mv_q4_1_f32_flat;
+
+            CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_q4_1->q));
+            CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_q4_1->d));
+            CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra0_q4_1->m));
+            CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra1->data_device));
+            CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_ulong), &offset1));
+            CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_mem), &extrad->data_device));
+            CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_ulong), &offsetd));
+            CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne00));
+            CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne01));
+            CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne02));
+            CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne10));
+            CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne12));
+            CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne0));
+            CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne1));
+            CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &r2));
+            CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &r3));
+#else
+            if (backend_ctx->gpu_family == INTEL) {
+                nth0 = 16;
+                nth1 = 1;
+                ndst = 4;
+            } else if (backend_ctx->gpu_family == ADRENO) {
+                nth0 = 64;
+                nth1 = 1;
+                ndst = 4;
+            } else {
+                GGML_ASSERT(false && "TODO: Unknown GPU");
+            }
+
+            kernel = backend_ctx->kernel_mul_mv_q4_1_f32;
+
+            CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
+            CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+            CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
+            CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
+            CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
+            CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
+            CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00));
+            CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01));
+            CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02));
+            CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne10));
+            CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne12));
+            CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne0));
+            CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne1));
+            CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &r2));
+            CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &r3));
+#endif // GGML_OPENCL_SOA_Q
+            break;
+        }
         case GGML_TYPE_Q8_0: {
 #ifdef GGML_OPENCL_SOA_Q
             kernel = backend_ctx->kernel_mul_mv_q8_0_f32_flat;
@@ -8407,19 +10446,89 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
 #endif // GGML_OPENCL_SOA_Q
             break;
         }
-        case GGML_TYPE_Q2_K:
-        case GGML_TYPE_Q3_K:
-        case GGML_TYPE_Q4_K:
+        case GGML_TYPE_Q2_K:
+        case GGML_TYPE_Q3_K:
+        case GGML_TYPE_Q4_K: {
+            kernel = backend_ctx->kernel_mul_mv_q4_K_f32;
+
+            if (backend_ctx->gpu_family == INTEL) {
+                nth0 = 16;
+                nth1 = 1;
+                ndst = 4;
+            } else if (backend_ctx->gpu_family == ADRENO) {
+                nth0 = 64;
+                nth1 = 1;
+                ndst = 4;
+            } else {
+                GGML_ASSERT(false && "TODO: Unknown GPU");
+            }
+
+            CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
+            CL_CHECK(clSetKernelArg(kernel, 1, sizeof(int), &offset0));
+            CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
+            CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &offset1));
+            CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
+            CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &offsetd));
+            CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00));
+            CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01));
+            CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb01));
+            CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb02));
+            CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb03));
+            CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne12));
+            CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb11));
+            CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb12));
+            CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb13));
+            CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne0));
+            CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &ne1));
+            CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &r2));
+            CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &r3));
+            break;
+        }
         case GGML_TYPE_Q5_K:
         case GGML_TYPE_Q6_K:
+#ifdef GGML_OPENCL_SOA_Q
+            kernel = backend_ctx->kernel_mul_mv_q6_K_f32_flat;
+
+            if (backend_ctx->gpu_family == INTEL) {
+                nth0 = 16;
+                nth1 = 2;
+                ndst = 4;
+            } else if (backend_ctx->gpu_family == ADRENO) {
+                nth0 = 64;
+                nth1 = 2;
+                ndst = 4;
+            } else {
+                GGML_ASSERT(false && "TODO: Unknown GPU");
+            }
+
+            CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_q6_K->ql));
+            CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_q6_K->qh));
+            CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra0_q6_K->s));
+            CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra0_q6_K->d));
+            CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extra1->data_device));
+            CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offset1));
+            CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_mem), &extrad->data_device));
+            CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &offsetd));
+            CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne00));
+            CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne01));
+            CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne02));
+            CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne10));
+            CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne12));
+            CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne0));
+            CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne1));
+            CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &r2));
+            CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &r3));
+#else
             kernel = backend_ctx->kernel_mul_mv_q6_K_f32;
 
             if (backend_ctx->gpu_family == INTEL) {
-                nth0 =
-                nth1 =
+                nth0 = 16;
+                nth1 = 2;
+                ndst = 1;
             } else if (backend_ctx->gpu_family == ADRENO) {
-                nth0 =
-                nth1 =
+                nth0 = 64;
+                nth1 = 2;
+                ndst = 1;
             } else {
                 GGML_ASSERT(false && "TODO: Unknown GPU");
             }
@@ -8439,6 +10548,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
             CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne1));
             CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &r2));
             CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &r3));
+#endif // GGML_OPENCL_SOA_Q
             break;
         case GGML_TYPE_MXFP4: {
 #ifdef GGML_OPENCL_SOA_Q
@@ -8535,13 +10645,16 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
 
         backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
     } else if (src0t == GGML_TYPE_Q4_K) {
-        GGML_ASSERT(false && "not implemented");
+        size_t global_work_size[] = {(size_t)(ne01+ndst*nth1-1)/(ndst*nth1)*nth0, (size_t)ne11*nth1, (size_t)ne12*ne13};
+        size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1};
+
+        backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
     } else if (src0t == GGML_TYPE_Q3_K) {
         GGML_ASSERT(false && "not implemented");
    } else if (src0t == GGML_TYPE_Q5_K) {
         GGML_ASSERT(false && "not implemented");
     } else if (src0t == GGML_TYPE_Q6_K) {
-        size_t global_work_size[] = {(size_t)(ne01+1)/
+        size_t global_work_size[] = {(size_t)(ne01+ndst*nth1-1)/(ndst*nth1)*nth0, (size_t)ne11*nth1, (size_t)ne12*ne13};
         size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1};
 
         backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
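The matrix-vector (mul_mv) launches size their grid differently from the GEMM tiles: each nth0 x nth1 workgroup produces `ndst` output rows, so the row count is rounded up to a multiple of `ndst*nth1` before being scaled by `nth0`. A minimal sketch of that rounding, with illustrative values:

```cpp
// Hedged sketch of the mul_mv grid sizing: round the row count up to a
// multiple of rows-per-workgroup (ndst*nth1), then scale by nth0 threads.
#include <cstdio>

int main() {
    const int nth0 = 64, nth1 = 2;  // local work size (Adreno-style values)
    const int ndst = 1;             // output rows per thread column
    const int ne01 = 4097;          // rows of src0

    const int rows_per_wg = ndst * nth1;
    size_t gws0 = (size_t)(ne01 + rows_per_wg - 1) / rows_per_wg * nth0;

    // 4097 rows at 2 rows/workgroup -> 2049 workgroups, no row skipped
    std::printf("workgroups along x: %zu\n", gws0 / nth0);
    return 0;
}
```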
@@ -8973,7 +11086,16 @@ static void ggml_cl_scale(ggml_backend_t backend, const ggml_tensor * src0, cons
     cl_ulong offset0 = extra0->offset + src0->view_offs;
     cl_ulong offsetd = extrad->offset + dst->view_offs;
 
-    cl_kernel kernel
+    cl_kernel kernel;
+
+    int n = ggml_nelements(dst);
+
+    if (n % 4 == 0) {
+        kernel = backend_ctx->kernel_scale_f32_4;
+        n /= 4;
+    } else {
+        kernel = backend_ctx->kernel_scale_f32;
+    }
 
     CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
     CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
@@ -8982,8 +11104,6 @@ static void ggml_cl_scale(ggml_backend_t backend, const ggml_tensor * src0, cons
     CL_CHECK(clSetKernelArg(kernel, 4, sizeof(float), &scale));
     CL_CHECK(clSetKernelArg(kernel, 5, sizeof(float), &bias));
 
-    int n = ggml_nelements(dst)/4;
-
     size_t global_work_size[] = {(size_t)n, 1, 1};
     size_t local_work_size[] = {64, 1, 1};
 
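The scale op previously assumed an element count divisible by 4; it now falls back to a scalar kernel when it is not, and only divides `n` by 4 when the float4 kernel is selected. A minimal sketch of that dispatch rule:

```cpp
// Hedged sketch of the vec4 dispatch rule in ggml_cl_scale: prefer a
// float4 kernel when the element count allows it, shrinking n to match.
#include <cstdio>

enum class Kern { scalar, vec4 };

Kern pick_scale_kernel(int & n) {
    if (n % 4 == 0) {    // each work item handles a float4
        n /= 4;
        return Kern::vec4;
    }
    return Kern::scalar; // each work item handles one float
}

int main() {
    int n = 1000003;     // odd-sized tensor falls back to the scalar kernel
    Kern k = pick_scale_kernel(n);
    std::printf("%s, %d work items\n",
                k == Kern::vec4 ? "vec4" : "scalar", n);
    return 0;
}
```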
@@ -9005,28 +11125,13 @@ static void ggml_cl_cpy(ggml_backend_t backend, const ggml_tensor * src0, const
     // GGML_OP_DUP and GGML_OP_CONT happen between src0 and dst.
     UNUSED(dst);
 
-    const int ne00 = src0 ? src0->ne[0] : 0;
-    const int ne01 = src0 ? src0->ne[1] : 0;
-    const int ne02 = src0 ? src0->ne[2] : 0;
-    const int ne03 = src0 ? src0->ne[3] : 0;
-
-    const cl_ulong nb00 = src0 ? src0->nb[0] : 0;
-    const cl_ulong nb01 = src0 ? src0->nb[1] : 0;
-    const cl_ulong nb02 = src0 ? src0->nb[2] : 0;
-    const cl_ulong nb03 = src0 ? src0->nb[3] : 0;
-
-    const int ne10 = src1 ? src1->ne[0] : 0;
-    const int ne11 = src1 ? src1->ne[1] : 0;
-    const int ne12 = src1 ? src1->ne[2] : 0;
-    const int ne13 = src1 ? src1->ne[3] : 0;
-
-    const cl_ulong nb10 = src1 ? src1->nb[0] : 0;
-    const cl_ulong nb11 = src1 ? src1->nb[1] : 0;
-    const cl_ulong nb12 = src1 ? src1->nb[2] : 0;
-    const cl_ulong nb13 = src1 ? src1->nb[3] : 0;
+    GGML_TENSOR_LOCALS(int, ne0, src0, ne);
+    GGML_TENSOR_LOCALS(cl_ulong, nb0, src0, nb);
+    GGML_TENSOR_LOCALS(int, ne1, src1, ne);
+    GGML_TENSOR_LOCALS(cl_ulong, nb1, src1, nb);
 
-    const enum ggml_type src0t = src0 ? src0->type : GGML_TYPE_COUNT;
-    const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT;
+    const enum ggml_type src0t = src0->type;
+    const enum ggml_type src1t = src1->type;
 
     ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
 
@@ -9063,6 +11168,15 @@ static void ggml_cl_cpy(ggml_backend_t backend, const ggml_tensor * src0, const
                     GGML_ASSERT(false && "not implemented");
             }
             break;
+        case GGML_TYPE_I32:
+            switch (src1t) {
+                case GGML_TYPE_I32:
+                    kernel = backend_ctx->kernel_cpy_i32_i32;
+                    break;
+                default:
+                    GGML_ASSERT(false && "not implemented");
+            }
+            break;
         default:
             GGML_ASSERT(false && "not implemented");
     }
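`GGML_TENSOR_LOCALS` collapses the sixteen hand-written locals above into four macro invocations. A hedged sketch of what one invocation expands to, following the upstream ggml macro (exact expansion may differ in detail): one local per dimension, named by appending 0..3 to the prefix.

```cpp
// Hedged sketch: GGML_TENSOR_LOCALS(int, ne1, src1, ne) behaves roughly
// like the four declarations below, pulling per-dimension values out of
// the tensor's ne/nb arrays. tensor_like stands in for ggml_tensor.
#include <cstdint>

struct tensor_like { int64_t ne[4]; uint64_t nb[4]; };

void example(const tensor_like * src1) {
    const int ne10 = src1->ne[0];  // prefix "ne1" + dimension index
    const int ne11 = src1->ne[1];
    const int ne12 = src1->ne[2];
    const int ne13 = src1->ne[3];
    (void)ne10; (void)ne11; (void)ne12; (void)ne13;
}

int main() {
    tensor_like t = {{2, 3, 4, 5}, {4, 8, 24, 96}};
    example(&t);
    return 0;
}
```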
@@ -9101,6 +11215,89 @@ static void ggml_cl_dup(ggml_backend_t backend, const ggml_tensor * src0, const
     UNUSED(src1);
 }
 
+static void ggml_cl_set(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(src0);
+    GGML_ASSERT(src0->extra);
+    GGML_ASSERT(src1);
+    GGML_ASSERT(src1->extra);
+    GGML_ASSERT(dst);
+    GGML_ASSERT(dst->extra);
+
+    GGML_ASSERT((src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_I32) &&
+                src1->type == src0->type && dst->type == src0->type);
+
+    GGML_TENSOR_LOCALS(int, ne0, src0, ne);
+    GGML_TENSOR_LOCALS(cl_ulong, nb0, src0, nb);
+    GGML_TENSOR_LOCALS(int, ne1, src1, ne);
+    GGML_TENSOR_LOCALS(cl_ulong, nb1, src1, nb);
+    GGML_TENSOR_LOCALS(int, ne, dst, ne);
+    GGML_TENSOR_LOCALS(cl_ulong, nb, dst, nb);
+
+    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+
+    ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
+    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
+
+    cl_ulong offset1 = extra1->offset + src1->view_offs;
+    cl_ulong offsetd = extrad->offset + dst->view_offs;
+
+    const cl_ulong pnb1 = ((const int32_t *)dst->op_params)[0];
+    const cl_ulong pnb2 = ((const int32_t *)dst->op_params)[1];
+    const cl_ulong pnb3 = ((const int32_t *)dst->op_params)[2];
+    const cl_ulong offs = ((const int32_t *)dst->op_params)[3];
+    const bool inplace = (bool)((const int32_t *)dst->op_params)[4];
+
+    cl_kernel kernel = nullptr;
+
+    // for inplace case, dst is a view of src0 and is updated on top of it
+    // so for non-inplace case, copy src0 to dst first
+    if (!inplace) {
+        ggml_cl_cpy(backend, src0, dst, nullptr);
+    }
+
+    // then copy src1 to dst with specified offset
+    if (src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+        kernel = backend_ctx->kernel_cpy_f32_f32;
+    } else if (src1->type == GGML_TYPE_I32 && dst->type == GGML_TYPE_I32) {
+        kernel = backend_ctx->kernel_cpy_i32_i32;
+    } else {
+        GGML_ASSERT(false && "not implemented");
+    }
+
+    offsetd += offs;
+    cl_ulong nb = ggml_element_size(dst);
+
+    CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra1->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset1));
+    CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
+    CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne10));
+    CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne11));
+    CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne12));
+    CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne13));
+    CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb10));
+    CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb11));
+    CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb12));
+    CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb13));
+    CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne10));
+    CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne11));
+    CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne12));
+    CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne13));
+    CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb));
+    CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &pnb1));
+    CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &pnb2));
+    CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &pnb3));
+
+    int max_local_size = backend_ctx->get_kernel_workgroup_size(kernel);
+
+    const int nth = MIN(max_local_size, ne00);
+
+    size_t global_work_size[] = {(size_t)ne11*nth, (size_t)ne12, (size_t)ne13};
+    size_t local_work_size[] = {(size_t)nth, 1, 1};
+
+    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
+}
+
 static void ggml_cl_diag_mask_inf(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_ASSERT(src0);
     GGML_ASSERT(src0->extra);
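The new `ggml_cl_set` implements GGML_OP_SET: unless the op is in-place, `dst` first becomes a copy of `src0`, and then `src1` is written into it at a byte offset (`offs`) with destination strides (`pnb1..pnb3`) taken from `op_params`. A hedged, simplified CPU reference of those semantics for a flat f32 case:

```cpp
// Hedged CPU reference for the SET semantics above, reduced to the flat
// f32 case: copy src0 into dst (non-inplace), then splice src1 in at an
// element offset. The real kernel also honors per-dimension strides.
#include <cstring>
#include <vector>

void set_f32(std::vector<float> & dst, const std::vector<float> & src0,
             const float * src1, int n1, size_t offs_elems, bool inplace) {
    if (!inplace) {
        dst = src0;                       // dst = copy of src0 first
    }
    std::memcpy(dst.data() + offs_elems,  // then overwrite at the offset
                src1, n1 * sizeof(float));
}

int main() {
    std::vector<float> src0 = {0, 0, 0, 0, 0, 0};
    float src1[2] = {7, 8};
    std::vector<float> dst;
    set_f32(dst, src0, src1, 2, /*offs_elems=*/3, /*inplace=*/false);
    return dst[3] == 7 && dst[4] == 8 ? 0 : 1;
}
```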
@@ -9163,6 +11360,49 @@ static void ggml_cl_diag_mask_inf(ggml_backend_t backend, const ggml_tensor * sr
     }
 }
 
+static void ggml_cl_diag(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(src0);
+    GGML_ASSERT(src0->extra);
+    GGML_ASSERT(dst);
+    GGML_ASSERT(dst->extra);
+
+    UNUSED(src1);
+
+    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+
+    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
+    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
+
+    cl_ulong offset0 = extra0->offset + src0->view_offs;
+    cl_ulong offsetd = extrad->offset + dst->view_offs;
+
+    GGML_TENSOR_LOCALS(int, ne0, src0, ne);
+    GGML_TENSOR_LOCALS(cl_ulong, nb0, src0, nb);
+    GGML_TENSOR_LOCALS(int, ne, dst, ne);
+    GGML_TENSOR_LOCALS(cl_ulong, nb, dst, nb);
+
+    cl_kernel kernel = backend_ctx->kernel_diag_f32;
+
+    CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+    CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
+    CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_ulong), &nb01));
+    CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &nb02));
+    CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_ulong), &nb03));
+    CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_int), &ne0));
+    CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb0));
+    CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb2));
+    CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb3));
+
+    int nth = 64;
+
+    size_t global_work_size[] = {(size_t)ne1*nth, (size_t)ne2, (size_t)ne3};
+    size_t local_work_size[] = {(size_t)nth, 1, 1};
+
+    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
+}
+
 static void ggml_cl_soft_max(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_ASSERT(src0);
     GGML_ASSERT(src0->extra);
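For reference, the DIAG operation turns each length-n row of the source into an n x n matrix with that row on the diagonal and zeros elsewhere. A minimal CPU sketch of the semantics (the kernel itself is in the OpenCL source, not shown in this diff):

```cpp
// Hedged CPU reference for DIAG: a length-n row becomes an n x n matrix
// with the row on the diagonal, zeros elsewhere.
#include <vector>

std::vector<float> diag(const std::vector<float> & row) {
    const size_t n = row.size();
    std::vector<float> out(n * n, 0.0f);
    for (size_t i = 0; i < n; ++i) {
        out[i * n + i] = row[i];
    }
    return out;
}

int main() {
    std::vector<float> d = diag({1, 2, 3});
    return d[4] == 2.0f ? 0 : 1;  // element (1,1)
}
```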
@@ -9474,6 +11714,72 @@ static void ggml_cl_rope(ggml_backend_t backend, const ggml_tensor * src0, const
     backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
 }
 
+static void ggml_cl_solve_tri(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(src0);
+    GGML_ASSERT(src0->extra);
+    GGML_ASSERT(src1);
+    GGML_ASSERT(src1->extra);
+    GGML_ASSERT(dst);
+    GGML_ASSERT(dst->extra);
+
+    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+
+    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
+    ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
+    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
+
+    cl_ulong offset0 = extra0->offset + src0->view_offs;
+    cl_ulong offset1 = extra1->offset + src1->view_offs;
+    cl_ulong offsetd = extrad->offset + dst->view_offs;
+
+    cl_kernel kernel = backend_ctx->kernel_solve_tri_f32;
+    GGML_ASSERT(kernel != nullptr);
+
+    const int n = src0->ne[0];
+    const int k = src1->ne[0];
+
+    const cl_ulong nb00 = src0->nb[0];
+    const cl_ulong nb01 = src0->nb[1];
+    const cl_ulong nb02 = src0->nb[2];
+    const cl_ulong nb03 = src0->nb[3];
+
+    const cl_ulong nb10 = src1->nb[0];
+    const cl_ulong nb11 = src1->nb[1];
+    const cl_ulong nb12 = src1->nb[2];
+    const cl_ulong nb13 = src1->nb[3];
+
+    const cl_ulong nb0 = dst->nb[0];
+    const cl_ulong nb1 = dst->nb[1];
+    const cl_ulong nb2 = dst->nb[2];
+    const cl_ulong nb3 = dst->nb[3];
+
+    CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+    CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
+    CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
+    CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &n));
+    CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &k));
+    CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb00));
+    CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb01));
+    CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong),&nb02));
+    CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong),&nb03));
+    CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong),&nb10));
+    CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong),&nb11));
+    CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong),&nb12));
+    CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong),&nb13));
+    CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong),&nb0));
+    CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong),&nb1));
+    CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong),&nb2));
+    CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong),&nb3));
+
+    size_t global_work_size[3]= { (size_t)k, (size_t)dst->ne[2], (size_t)dst->ne[3]};
+    size_t local_work_size[] = {16, 4, 1};
+
+    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
+}
+
 static void ggml_cl_im2col(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_ASSERT(src0);
     GGML_ASSERT(src1);
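SOLVE_TRI solves a triangular linear system per batch, with one work item per right-hand-side column (`k` along the first grid axis). A hedged CPU reference, assuming a lower-triangular system solved by forward substitution; the kernel's actual triangle orientation and data layout are defined in the OpenCL source, which this diff does not show:

```cpp
// Hedged CPU reference for SOLVE_TRI under an assumed lower-triangular
// orientation: solve A * x = b by forward substitution.
#include <cmath>
#include <vector>

std::vector<float> solve_lower(const std::vector<float> & A, // n x n, row-major
                               const std::vector<float> & b, int n) {
    std::vector<float> x(n);
    for (int i = 0; i < n; ++i) {
        float s = b[i];
        for (int j = 0; j < i; ++j) {
            s -= A[i * n + j] * x[j];   // subtract already-solved terms
        }
        x[i] = s / A[i * n + i];
    }
    return x;
}

int main() {
    // A = [[2,0],[1,1]], b = [4,5]  ->  x = [2,3]
    std::vector<float> x = solve_lower({2, 0, 1, 1}, {4, 5}, 2);
    return std::fabs(x[0] - 2) < 1e-6f && std::fabs(x[1] - 3) < 1e-6f ? 0 : 1;
}
```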
@@ -9611,7 +11917,6 @@ static void ggml_cl_sum_rows(ggml_backend_t backend, const ggml_tensor * src0, c
     GGML_UNUSED(src1);
 
     GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type));
-    GGML_ASSERT(ggml_is_contiguous(src0));
 
     ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
 
@@ -9634,7 +11939,14 @@ static void ggml_cl_sum_rows(ggml_backend_t backend, const ggml_tensor * src0, c
     const cl_ulong nb2 = dst->nb[2];
     const cl_ulong nb3 = dst->nb[3];
 
-    cl_kernel kernel = backend_ctx->kernel_sum_rows_f32;
+    cl_kernel kernel;
+
+    const bool is_c4 = ne00 % 4 == 0;
+    if (is_c4) {
+        kernel = backend_ctx->kernel_sum_rows_f32_4;
+    } else {
+        kernel = backend_ctx->kernel_sum_rows_f32;
+    }
 
     CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
     CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
@@ -9651,12 +11963,124 @@ static void ggml_cl_sum_rows(ggml_backend_t backend, const ggml_tensor * src0, c
     CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb2));
     CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb3));
 
-    size_t global_work_size[] = {(size_t)ne01, (size_t)ne02, (size_t)ne03};
+    size_t global_work_size[] = {64 * (size_t)ne01, (size_t)ne02, (size_t)ne03};
     size_t local_work_size[] = {(size_t)64, 1, 1};
 
     backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
 }
 
+static void ggml_cl_cumsum(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(src0);
+    GGML_ASSERT(src0->extra);
+    GGML_ASSERT(dst);
+    GGML_ASSERT(dst->extra);
+    GGML_UNUSED(src1);
+
+    GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type));
+    GGML_ASSERT(ggml_is_contiguous(src0));
+
+    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+
+    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
+    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
+
+    cl_ulong offset0 = extra0->offset + src0->view_offs;
+    cl_ulong offsetd = extrad->offset + dst->view_offs;
+
+    GGML_TENSOR_LOCALS(int, ne0, src0, ne);
+    GGML_TENSOR_LOCALS(cl_ulong, nb0, src0, nb);
+
+    cl_kernel kernel = backend_ctx->kernel_cumsum_blk;
+
+    int max_workgroup_size = backend_ctx->get_kernel_workgroup_size(kernel);
+    int nth = 1;
+    while (nth < ne00 && 2*nth <= max_workgroup_size) {
+        nth *= 2;
+    }
+
+    GGML_ASSERT(ne00 <= nth*nth);
+
+    const int net0 = CEIL_DIV(ne00, nth);
+    const int net1 = ne01;
+    const int net2 = ne02;
+    const int net3 = ne03;
+
+    const cl_ulong nbt0 = sizeof(float);
+    const cl_ulong nbt1 = net0*nbt0;
+    const cl_ulong nbt2 = net1*nbt1;
+    const cl_ulong nbt3 = net2*nbt2;
+
+    static ggml_cl_buffer tmp_buffer;
+    tmp_buffer.allocate(backend_ctx->context, net0*ne01*ne02*ne03*sizeof(float));
+
+    CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+    CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &tmp_buffer.buffer));
+    CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extrad->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_ulong), &offsetd));
+    CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne00));
+    CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne01));
+    CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne02));
+    CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne03));
+    CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb00));
+    CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb01));
+    CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb02));
+    CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb03));
+    CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &net0));
+    CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &net1));
+    CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &net2));
+
+    size_t global_work_size[] = { (size_t)(nth*net0*ne01), (size_t)ne02, (size_t)ne03};
+    size_t local_work_size[] = { (size_t)nth, 1, 1};
+
+    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
+
+    if(ne00 > nth) {
+        // if a single workgroup cannot handle an entire row, each workgroup
+        // computes a partial sum and stores to dst, tmp_buffer contains the sum
+        // of the each workgroup; cumsum this buffer and add to the partial sums in dst
+        cl_ulong offsett = 0;
+        kernel = backend_ctx->kernel_cumsum_blk;
+        CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &tmp_buffer.buffer));
+        CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offsett));
+        CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &tmp_buffer.buffer));
+        CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &tmp_buffer.buffer));
+        CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_ulong), &offsett));
+        CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &net0));
+        CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne01));
+        CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne02));
+        CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne03));
+        CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nbt0));
+        CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nbt1));
+        CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nbt2));
+        CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nbt3));
+        CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &net0));
+        CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &net1));
+        CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &net2));
+
+        size_t global_work_size_1[] = { (size_t)net1*nth, (size_t)net2, (size_t)net3};
+        size_t local_work_size_1[] = { (size_t)nth, 1, 1};
+        backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size_1, local_work_size_1, dst);
+
+        kernel = backend_ctx->kernel_cumsum_add;
+        CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &tmp_buffer.buffer));
+        CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extrad->data_device));
+        CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_ulong), &offsetd));
+        CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &ne00));
+        CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne01));
+        CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne02));
+        CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne03));
+        CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &nbt0));
+        CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &nbt1));
+        CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &nbt2));
+        CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &nbt3));
+
+        size_t global_work_size_2[] = { (size_t)(nth*net0*ne01), (size_t)ne02, (size_t)ne03};
+        size_t local_work_size_2[] = { (size_t)nth, 1, 1};
+        backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size_2, local_work_size_2, dst);
+    }
+}
+
 static void ggml_cl_glu(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_ASSERT(src0);
     GGML_ASSERT(src0->extra);
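The new cumsum falls back to a two-pass scheme when one workgroup cannot scan an entire row: `kernel_cumsum_blk` produces per-block partial scans plus a per-block total in `tmp_buffer`, the totals are scanned in turn, and `kernel_cumsum_add` adds each block's prefix back in. A hedged CPU sketch of the same three steps:

```cpp
// Hedged CPU sketch of the two-pass scan used by ggml_cl_cumsum: scan
// fixed-size blocks, scan the per-block totals (the tmp buffer), then add
// each block's prefix back in (kernel_cumsum_add's role).
#include <cstdio>
#include <vector>

void cumsum_two_pass(std::vector<float> & v, int block) {
    const int nblk = (int)(v.size() + block - 1) / block;
    std::vector<float> totals(nblk, 0.0f);
    for (int b = 0; b < nblk; ++b) {          // pass 1: scan inside each block
        float run = 0.0f;
        for (int i = b * block; i < (int)v.size() && i < (b + 1) * block; ++i) {
            run += v[i];
            v[i] = run;
        }
        totals[b] = run;                      // block total -> tmp buffer
    }
    for (int b = 1; b < nblk; ++b) {          // pass 2: scan the totals
        totals[b] += totals[b - 1];
    }
    for (int b = 1; b < nblk; ++b) {          // pass 3: add prefix per block
        for (int i = b * block; i < (int)v.size() && i < (b + 1) * block; ++i) {
            v[i] += totals[b - 1];
        }
    }
}

int main() {
    std::vector<float> v = {1, 1, 1, 1, 1, 1, 1};
    cumsum_two_pass(v, 3);
    std::printf("%g %g\n", v[3], v[6]);  // 4 and 7
    return 0;
}
```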
@@ -9802,6 +12226,12 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
         }
         func = ggml_cl_cpy;
         break;
+    case GGML_OP_SET:
+        if (!any_on_device) {
+            return false;
+        }
+        func = ggml_cl_set;
+        break;
     case GGML_OP_DUP:
     case GGML_OP_CONT:
         if (!any_on_device) {
@@ -9901,6 +12331,18 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
             }
             func = ggml_cl_tanh;
             break;
+        case GGML_UNARY_OP_NEG:
+            if (!any_on_device) {
+                return false;
+            }
+            func = ggml_cl_neg;
+            break;
+        case GGML_UNARY_OP_EXP:
+            if (!any_on_device) {
+                return false;
+            }
+            func = ggml_cl_exp;
+            break;
         case GGML_UNARY_OP_EXPM1:
             if (!any_on_device) {
                 return false;
@@ -9922,6 +12364,12 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
         }
         func = ggml_cl_glu;
         break;
+    case GGML_OP_TRI:
+        if (!any_on_device) {
+            return false;
+        }
+        func = ggml_cl_tri;
+        break;
     case GGML_OP_FILL:
         if (!any_on_device) {
             return false;
@@ -9946,6 +12394,12 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
         }
         func = ggml_cl_rms_norm;
         break;
+    case GGML_OP_L2_NORM:
+        if (!any_on_device) {
+            return false;
+        }
+        func = ggml_cl_l2_norm;
+        break;
     case GGML_OP_GROUP_NORM:
         if (!any_on_device) {
             return false;
@@ -10021,6 +12475,12 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
         }
         func = ggml_cl_nop;
         break;
+    case GGML_OP_DIAG:
+        if (!any_on_device) {
+            return false;
+        }
+        func = ggml_cl_diag;
+        break;
    case GGML_OP_DIAG_MASK_INF:
         if (!any_on_device) {
             return false;
@@ -10039,6 +12499,12 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
         }
         func = ggml_cl_rope;
         break;
+    case GGML_OP_SOLVE_TRI:
+        if (!any_on_device) {
+            return false;
+        }
+        func = ggml_cl_solve_tri;
+        break;
     case GGML_OP_IM2COL:
         if (!any_on_device) {
             return false;
@@ -10057,6 +12523,12 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
         }
         func = ggml_cl_sum_rows;
         break;
+    case GGML_OP_CUMSUM:
+        if (!any_on_device) {
+            return false;
+        }
+        func = ggml_cl_cumsum;
+        break;
     case GGML_OP_FLASH_ATTN_EXT:
         if (!any_on_device) {
             return false;