whispercpp 1.3.5 → 1.3.7
This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.document +3 -0
- data/.rdoc_options +2 -0
- data/LICENSE +1 -1
- data/README.md +133 -3
- data/Rakefile +18 -3
- data/ext/dependencies.rb +10 -4
- data/ext/dependencies_for_windows.rb +17 -0
- data/ext/extconf.rb +20 -7
- data/ext/options.rb +54 -14
- data/ext/options_for_windows.rb +51 -0
- data/ext/ruby_whisper.c +56 -46
- data/ext/ruby_whisper.h +165 -2
- data/ext/ruby_whisper_context.c +297 -126
- data/ext/ruby_whisper_context_params.c +163 -0
- data/ext/ruby_whisper_log_queue.c +180 -0
- data/ext/ruby_whisper_log_settable.h +47 -0
- data/ext/ruby_whisper_model.c +0 -1
- data/ext/ruby_whisper_parakeet.c +49 -0
- data/ext/ruby_whisper_parakeet_context.c +304 -0
- data/ext/ruby_whisper_parakeet_context_params.c +117 -0
- data/ext/ruby_whisper_parakeet_model.c +84 -0
- data/ext/ruby_whisper_parakeet_params.c +548 -0
- data/ext/ruby_whisper_parakeet_segment.c +157 -0
- data/ext/ruby_whisper_parakeet_token.c +188 -0
- data/ext/ruby_whisper_parakeet_transcribe.cpp +58 -0
- data/ext/ruby_whisper_params.c +256 -66
- data/ext/ruby_whisper_segment.c +6 -7
- data/ext/ruby_whisper_token.c +29 -9
- data/ext/ruby_whisper_transcribe.cpp +46 -16
- data/ext/ruby_whisper_vad_context.c +48 -1
- data/ext/ruby_whisper_vad_context_detect.cpp +6 -5
- data/ext/ruby_whisper_vad_params.c +0 -1
- data/ext/ruby_whisper_vad_segment.c +0 -1
- data/ext/ruby_whisper_vad_segments.c +0 -1
- data/ext/sources/CMakeLists.txt +41 -3
- data/ext/sources/CMakePresets.json +95 -0
- data/ext/sources/cmake/parakeet-config.cmake.in +30 -0
- data/ext/sources/cmake/parakeet.pc.in +10 -0
- data/ext/sources/cmake/whisper-config.cmake.in +5 -40
- data/ext/sources/cmake/whisper.pc.in +1 -1
- data/ext/sources/examples/CMakeLists.txt +4 -2
- data/ext/sources/examples/bench/bench.cpp +24 -19
- data/ext/sources/examples/cli/cli.cpp +51 -9
- data/ext/sources/examples/common-ggml.cpp +4 -0
- data/ext/sources/examples/common-whisper.cpp +139 -67
- data/ext/sources/examples/common-whisper.h +11 -0
- data/ext/sources/examples/ffmpeg-transcode.cpp +211 -341
- data/ext/sources/examples/miniaudio.h +4507 -2131
- data/ext/sources/examples/parakeet-cli/CMakeLists.txt +8 -0
- data/ext/sources/examples/parakeet-cli/parakeet-cli.cpp +243 -0
- data/ext/sources/examples/parakeet-quantize/CMakeLists.txt +7 -0
- data/ext/sources/examples/parakeet-quantize/parakeet-quantize.cpp +230 -0
- data/ext/sources/examples/server/server.cpp +213 -163
- data/ext/sources/ggml/CMakeLists.txt +29 -15
- data/ext/sources/ggml/cmake/FindNCCL.cmake +36 -0
- data/ext/sources/ggml/cmake/ggml-config.cmake.in +12 -2
- data/ext/sources/ggml/include/ggml-alloc.h +1 -0
- data/ext/sources/ggml/include/ggml-backend.h +73 -11
- data/ext/sources/ggml/include/ggml-cann.h +1 -1
- data/ext/sources/ggml/include/ggml-cpu.h +5 -0
- data/ext/sources/ggml/include/ggml-cuda.h +3 -0
- data/ext/sources/ggml/include/ggml-openvino.h +37 -0
- data/ext/sources/ggml/include/ggml-opt.h +1 -1
- data/ext/sources/ggml/include/ggml-rpc.h +8 -3
- data/ext/sources/ggml/include/ggml-virtgpu.h +14 -0
- data/ext/sources/ggml/include/ggml.h +155 -16
- data/ext/sources/ggml/include/gguf.h +10 -2
- data/ext/sources/ggml/src/CMakeLists.txt +25 -5
- data/ext/sources/ggml/src/ggml-alloc.c +9 -10
- data/ext/sources/ggml/src/ggml-backend-dl.cpp +48 -0
- data/ext/sources/ggml/src/ggml-backend-dl.h +45 -0
- data/ext/sources/ggml/src/ggml-backend-impl.h +22 -2
- data/ext/sources/ggml/src/ggml-backend-meta.cpp +2263 -0
- data/ext/sources/ggml/src/ggml-backend-reg.cpp +40 -86
- data/ext/sources/ggml/src/ggml-backend.cpp +114 -10
- data/ext/sources/ggml/src/ggml-blas/CMakeLists.txt +1 -1
- data/ext/sources/ggml/src/ggml-blas/ggml-blas.cpp +10 -2
- data/ext/sources/ggml/src/ggml-cann/acl_tensor.cpp +1 -1
- data/ext/sources/ggml/src/ggml-cann/acl_tensor.h +1 -1
- data/ext/sources/ggml/src/ggml-cann/aclnn_ops.cpp +1016 -442
- data/ext/sources/ggml/src/ggml-cann/aclnn_ops.h +111 -85
- data/ext/sources/ggml/src/ggml-cann/common.h +23 -14
- data/ext/sources/ggml/src/ggml-cann/ggml-cann.cpp +255 -92
- data/ext/sources/ggml/src/ggml-common.h +22 -0
- data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +68 -34
- data/ext/sources/ggml/src/ggml-cpu/amx/amx.cpp +44 -19
- data/ext/sources/ggml/src/ggml-cpu/amx/common.h +34 -10
- data/ext/sources/ggml/src/ggml-cpu/amx/mmq.cpp +101 -101
- data/ext/sources/ggml/src/ggml-cpu/arch/arm/quants.c +194 -1
- data/ext/sources/ggml/src/ggml-cpu/arch/arm/repack.cpp +2874 -613
- data/ext/sources/ggml/src/ggml-cpu/arch/loongarch/quants.c +151 -1
- data/ext/sources/ggml/src/ggml-cpu/arch/powerpc/quants.c +0 -1
- data/ext/sources/ggml/src/ggml-cpu/arch/riscv/quants.c +5480 -840
- data/ext/sources/ggml/src/ggml-cpu/arch/riscv/repack.cpp +1361 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/s390/quants.c +8 -11
- data/ext/sources/ggml/src/ggml-cpu/arch/wasm/quants.c +72 -1
- data/ext/sources/ggml/src/ggml-cpu/arch/x86/quants.c +186 -36
- data/ext/sources/ggml/src/ggml-cpu/arch/x86/repack.cpp +119 -19
- data/ext/sources/ggml/src/ggml-cpu/arch-fallback.h +112 -26
- data/ext/sources/ggml/src/ggml-cpu/binary-ops.cpp +2 -6
- data/ext/sources/ggml/src/ggml-cpu/cmake/FindSMTIME.cmake +32 -0
- data/ext/sources/ggml/src/ggml-cpu/common.h +8 -0
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-impl.h +13 -0
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +153 -16
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.cpp +17 -0
- data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.cpp +21 -20
- data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +976 -251
- data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +671 -266
- data/ext/sources/ggml/src/ggml-cpu/ops.cpp +1277 -263
- data/ext/sources/ggml/src/ggml-cpu/ops.h +4 -0
- data/ext/sources/ggml/src/ggml-cpu/quants.c +95 -0
- data/ext/sources/ggml/src/ggml-cpu/quants.h +6 -0
- data/ext/sources/ggml/src/ggml-cpu/repack.cpp +2893 -679
- data/ext/sources/ggml/src/ggml-cpu/repack.h +119 -8
- data/ext/sources/ggml/src/ggml-cpu/simd-gemm.h +226 -0
- data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +114 -19
- data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.cpp +1402 -687
- data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.h +8 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp +597 -2766
- data/ext/sources/ggml/src/ggml-cpu/spacemit/ime2_kernels.cpp +5768 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/ime_env.cpp +320 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/ime_env.h +55 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/ime_kernels.h +182 -19
- data/ext/sources/ggml/src/ggml-cpu/spacemit/repack.cpp +1795 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/repack.h +14 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/rvv_kernels.cpp +3178 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/rvv_kernels.h +95 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/spine_barrier.h +34 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/spine_mem_pool.cpp +760 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/spine_mem_pool.h +32 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/spine_tcm.h +409 -0
- data/ext/sources/ggml/src/ggml-cpu/unary-ops.cpp +1 -1
- data/ext/sources/ggml/src/ggml-cpu/vec.cpp +54 -53
- data/ext/sources/ggml/src/ggml-cpu/vec.h +225 -240
- data/ext/sources/ggml/src/ggml-cuda/CMakeLists.txt +18 -8
- data/ext/sources/ggml/src/ggml-cuda/allreduce.cu +971 -0
- data/ext/sources/ggml/src/ggml-cuda/allreduce.cuh +29 -0
- data/ext/sources/ggml/src/ggml-cuda/argsort.cu +73 -28
- data/ext/sources/ggml/src/ggml-cuda/binbcast.cu +69 -41
- data/ext/sources/ggml/src/ggml-cuda/binbcast.cuh +1 -0
- data/ext/sources/ggml/src/ggml-cuda/common.cuh +359 -29
- data/ext/sources/ggml/src/ggml-cuda/concat.cu +120 -114
- data/ext/sources/ggml/src/ggml-cuda/conv2d-transpose.cu +45 -21
- data/ext/sources/ggml/src/ggml-cuda/conv2d-transpose.cuh +1 -0
- data/ext/sources/ggml/src/ggml-cuda/convert.cu +94 -27
- data/ext/sources/ggml/src/ggml-cuda/convert.cuh +10 -0
- data/ext/sources/ggml/src/ggml-cuda/cpy.cu +20 -9
- data/ext/sources/ggml/src/ggml-cuda/dequantize.cuh +22 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +333 -85
- data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +632 -190
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cu +12 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cuh +162 -49
- data/ext/sources/ggml/src/ggml-cuda/fattn-vec.cuh +43 -18
- data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +44 -14
- data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cuh +1 -1
- data/ext/sources/ggml/src/ggml-cuda/fattn.cu +241 -23
- data/ext/sources/ggml/src/ggml-cuda/fattn.cuh +2 -0
- data/ext/sources/ggml/src/ggml-cuda/fwht.cu +101 -0
- data/ext/sources/ggml/src/ggml-cuda/fwht.cuh +4 -0
- data/ext/sources/ggml/src/ggml-cuda/gated_delta_net.cu +312 -0
- data/ext/sources/ggml/src/ggml-cuda/gated_delta_net.cuh +4 -0
- data/ext/sources/ggml/src/ggml-cuda/getrows.cu +34 -12
- data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +1454 -599
- data/ext/sources/ggml/src/ggml-cuda/im2col.cu +32 -29
- data/ext/sources/ggml/src/ggml-cuda/mean.cu +13 -10
- data/ext/sources/ggml/src/ggml-cuda/mma.cuh +397 -183
- data/ext/sources/ggml/src/ggml-cuda/mmf.cu +30 -10
- data/ext/sources/ggml/src/ggml-cuda/mmf.cuh +161 -88
- data/ext/sources/ggml/src/ggml-cuda/mmq.cu +18 -12
- data/ext/sources/ggml/src/ggml-cuda/mmq.cuh +522 -431
- data/ext/sources/ggml/src/ggml-cuda/mmvf.cu +139 -72
- data/ext/sources/ggml/src/ggml-cuda/mmvf.cuh +2 -0
- data/ext/sources/ggml/src/ggml-cuda/mmvq.cu +608 -88
- data/ext/sources/ggml/src/ggml-cuda/mmvq.cuh +6 -0
- data/ext/sources/ggml/src/ggml-cuda/norm.cu +47 -79
- data/ext/sources/ggml/src/ggml-cuda/out-prod.cu +23 -7
- data/ext/sources/ggml/src/ggml-cuda/pad.cu +13 -10
- data/ext/sources/ggml/src/ggml-cuda/quantize.cu +134 -27
- data/ext/sources/ggml/src/ggml-cuda/quantize.cuh +1 -1
- data/ext/sources/ggml/src/ggml-cuda/reduce_rows.cuh +7 -17
- data/ext/sources/ggml/src/ggml-cuda/rope.cu +244 -137
- data/ext/sources/ggml/src/ggml-cuda/scale.cu +4 -1
- data/ext/sources/ggml/src/ggml-cuda/set-rows.cu +14 -6
- data/ext/sources/ggml/src/ggml-cuda/snake.cu +72 -0
- data/ext/sources/ggml/src/ggml-cuda/snake.cuh +8 -0
- data/ext/sources/ggml/src/ggml-cuda/softcap.cu +4 -1
- data/ext/sources/ggml/src/ggml-cuda/softmax.cu +8 -83
- data/ext/sources/ggml/src/ggml-cuda/solve_tri.cu +1 -1
- data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cu +96 -40
- data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cuh +1 -1
- data/ext/sources/ggml/src/ggml-cuda/ssm-scan.cu +40 -18
- data/ext/sources/ggml/src/ggml-cuda/sumrows.cu +8 -4
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_16.cu +1 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_32.cu +6 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_8.cu +2 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_4.cu +2 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_16.cu +1 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_32.cu +6 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_4.cu +2 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_8.cu +2 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_16.cu +1 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_4.cu +2 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_8.cu +2 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_4.cu +2 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_8.cu +2 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq192-dv128.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq320-dv256.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq512-dv512.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-bf16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-f16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q4_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q4_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q5_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q5_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q8_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-bf16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-bf16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-bf16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-bf16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-bf16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-bf16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-nvfp4.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q1_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/top-k.cu +5 -5
- data/ext/sources/ggml/src/ggml-cuda/topk-moe.cu +202 -135
- data/ext/sources/ggml/src/ggml-cuda/topk-moe.cuh +20 -14
- data/ext/sources/ggml/src/ggml-cuda/unary.cu +86 -2
- data/ext/sources/ggml/src/ggml-cuda/unary.cuh +4 -0
- data/ext/sources/ggml/src/ggml-cuda/vecdotq.cuh +111 -17
- data/ext/sources/ggml/src/ggml-cuda/vendors/cuda.h +7 -2
- data/ext/sources/ggml/src/ggml-cuda/vendors/hip.h +30 -2
- data/ext/sources/ggml/src/ggml-cuda/vendors/musa.h +3 -0
- data/ext/sources/ggml/src/ggml-hexagon/CMakeLists.txt +84 -46
- data/ext/sources/ggml/src/ggml-hexagon/ggml-hexagon.cpp +1612 -753
- data/ext/sources/ggml/src/ggml-hexagon/htp/CMakeLists.txt +51 -11
- data/ext/sources/ggml/src/ggml-hexagon/htp/act-ops.c +361 -261
- data/ext/sources/ggml/src/ggml-hexagon/htp/argsort-ops.c +294 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/binary-ops.c +753 -241
- data/ext/sources/ggml/src/ggml-hexagon/htp/cmake-toolchain.cmake +5 -5
- data/ext/sources/ggml/src/ggml-hexagon/htp/concat-ops.c +277 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/cpy-ops.c +295 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/cumsum-ops.c +270 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/diag-ops.c +216 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/fill-ops.c +123 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/flash-attn-ops.c +471 -296
- data/ext/sources/ggml/src/ggml-hexagon/htp/gated-delta-net-ops.c +1148 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/get-rows-ops.c +159 -53
- data/ext/sources/ggml/src/ggml-hexagon/htp/{htp-dma.c → hex-dma.c} +3 -3
- data/ext/sources/ggml/src/ggml-hexagon/htp/hex-dma.h +372 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hex-dump.h +86 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hex-fastdiv.h +37 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hex-utils.h +137 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-flash-attn-ops.c +1878 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c +2066 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-ops.c +6 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-ops.h +88 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-profile.h +34 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-queue.c +158 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-queue.h +134 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-utils.h +200 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ctx.h +97 -14
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ops.h +163 -67
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp_iface.idl +9 -3
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-arith.h +443 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-base.h +308 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-copy.h +262 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-div.h +291 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-dump.h +129 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-exp.h +216 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-flash-attn.h +47 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-floor.h +100 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-inverse.h +210 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-log.h +65 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-pow.h +42 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-reduce.h +296 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-repl.h +74 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-scale.h +133 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sigmoid.h +142 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sin-cos.h +90 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sqrt.h +126 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-types.h +36 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-utils.h +18 -1348
- data/ext/sources/ggml/src/ggml-hexagon/htp/main.c +547 -635
- data/ext/sources/ggml/src/ggml-hexagon/htp/matmul-ops.c +3556 -1101
- data/ext/sources/ggml/src/ggml-hexagon/htp/pad-ops.c +547 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/repeat-ops.c +148 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/rope-ops.c +475 -269
- data/ext/sources/ggml/src/ggml-hexagon/htp/set-rows-ops.c +94 -72
- data/ext/sources/ggml/src/ggml-hexagon/htp/softmax-ops.c +222 -217
- data/ext/sources/ggml/src/ggml-hexagon/htp/solve-tri-ops.c +267 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/ssm-conv.c +432 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/sum-rows-ops.c +128 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/unary-ops.c +886 -117
- data/ext/sources/ggml/src/ggml-hexagon/htp/vtcm-utils.h +16 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/worker-pool.c +1 -5
- data/ext/sources/ggml/src/ggml-hexagon/htp-drv.cpp +418 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp-drv.h +121 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp-opnode.h +272 -0
- data/ext/sources/ggml/src/ggml-hexagon/libdl.h +79 -0
- data/ext/sources/ggml/src/ggml-hexagon/libggml-htp.inf +40 -0
- data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +28 -9
- data/ext/sources/ggml/src/ggml-impl.h +68 -1
- data/ext/sources/ggml/src/ggml-metal/CMakeLists.txt +10 -10
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-common.cpp +13 -2
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.h +8 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.m +147 -17
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.cpp +409 -83
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.h +54 -5
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.m +254 -52
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +254 -23
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.cpp +756 -285
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.h +7 -4
- data/ext/sources/ggml/src/ggml-metal/ggml-metal.cpp +359 -133
- data/ext/sources/ggml/src/ggml-metal/ggml-metal.metal +1867 -1123
- data/ext/sources/ggml/src/ggml-musa/CMakeLists.txt +5 -6
- data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +71 -4
- data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +14127 -5314
- data/ext/sources/ggml/src/ggml-opencl/kernels/concat.cl +97 -88
- data/ext/sources/ggml/src/ggml-opencl/kernels/cpy.cl +104 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/cumsum.cl +139 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/cvt.cl +1978 -67
- data/ext/sources/ggml/src/ggml-opencl/kernels/diag.cl +27 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/exp.cl +125 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/expm1.cl +87 -56
- data/ext/sources/ggml/src/ggml-opencl/kernels/gated_delta_net.cl +249 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_mxfp4_f32_ns.cl +306 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q4_0_f32_ns.cl +256 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q4_1_f32_ns.cl +258 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q4_k_f32_ns.cl +283 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q5_0_f32_ns.cl +260 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q5_1_f32_ns.cl +262 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q5_k_f32_ns.cl +288 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q6_k_f32_ns.cl +267 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_iq4_nl_f32.cl +150 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/{mul_mat_Ab_Bi_8x4.cl → gemm_noshuffle_q4_0_f32.cl} +1 -1
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q4_1_f32.cl +132 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q4_k_f32.cl +172 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q5_0_f32.cl +131 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q5_1_f32.cl +134 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q5_k_f32.cl +176 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q6_k_f32.cl +140 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q8_0_f32.cl +129 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_xmem_f16_f32_os8.cl +233 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_mxfp4_f32_ns.cl +165 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q4_0_f32_ns.cl +120 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q4_1_f32_ns.cl +123 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q4_k_f32_ns.cl +155 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q5_0_f32_ns.cl +123 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q5_1_f32_ns.cl +125 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q5_k_f32_ns.cl +160 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q6_k_f32_ns.cl +141 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_iq4_nl_f32.cl +302 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/{gemv_noshuffle_general.cl → gemv_noshuffle_q4_0_f32.cl} +5 -5
- data/ext/sources/ggml/src/ggml-opencl/kernels/{gemv_noshuffle.cl → gemv_noshuffle_q4_0_f32_spec.cl} +5 -5
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q4_1_f32.cl +283 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q4_k_f32.cl +318 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q5_0_f32.cl +291 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q5_1_f32.cl +294 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q5_k_f32.cl +326 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q6_k_f32.cl +293 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q8_0_f32.cl +195 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/get_rows.cl +15 -9
- data/ext/sources/ggml/src/ggml-opencl/kernels/l2_norm.cl +71 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mean.cl +114 -13
- data/ext/sources/ggml/src/ggml-opencl/kernels/moe_reorder_b.cl +30 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/moe_sort_by_expert.cl +82 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_iq4_nl_f32_l4_lm.cl +171 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q4_0_f32_l4_lm.cl +163 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q4_1_f32_l4_lm.cl +165 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q4_k_f32_l4_lm.cl +179 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q5_0_f32_l4_lm.cl +173 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q5_1_f32_l4_lm.cl +175 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q5_k_f32_l4_lm.cl +192 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q6_k_f32_l4_lm.cl +158 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_iq4_nl_f32.cl +164 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_iq4_nl_f32_flat.cl +202 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32.cl +219 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32_flat.cl +229 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_k_f32.cl +180 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_k_f32_flat.cl +196 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_0_f32.cl +241 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_0_f32_flat.cl +243 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_1_f32.cl +243 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_1_f32_flat.cl +247 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_k_f32.cl +187 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_k_f32_flat.cl +203 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/{mul_mv_q6_k.cl → mul_mv_q6_k_f32.cl} +4 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32_flat.cl +178 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/neg.cl +125 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/repeat.cl +31 -32
- data/ext/sources/ggml/src/ggml-opencl/kernels/scale.cl +14 -4
- data/ext/sources/ggml/src/ggml-opencl/kernels/softplus.cl +88 -60
- data/ext/sources/ggml/src/ggml-opencl/kernels/solve_tri.cl +51 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/sum_rows.cl +114 -13
- data/ext/sources/ggml/src/ggml-opencl/kernels/tanh.cl +94 -48
- data/ext/sources/ggml/src/ggml-opencl/kernels/transpose.cl +26 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/tri.cl +32 -0
- data/ext/sources/ggml/src/ggml-openvino/.clang-format +154 -0
- data/ext/sources/ggml/src/ggml-openvino/CMakeLists.txt +22 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-decoder.cpp +985 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-decoder.h +294 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-openvino-extra.cpp +380 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-openvino-extra.h +182 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-openvino.cpp +1132 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-quants.cpp +956 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-quants.h +153 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/decoder.h +74 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/frontend.cpp +27 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/frontend.h +23 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/input_model.cpp +17 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/input_model.h +29 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/node_context.h +112 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/cont.cpp +48 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/cpy.cpp +21 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp +90 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/get_rows.cpp +69 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/glu_geglu.cpp +61 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp +62 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/mulmat.cpp +90 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/permute.cpp +102 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/reshape.cpp +83 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp +46 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/rope.cpp +149 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/scale.cpp +41 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/set_rows.cpp +76 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/softmax.cpp +89 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/transpose.cpp +23 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/unary_gelu.cpp +25 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/unary_silu.cpp +27 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/view.cpp +53 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op_table.cpp +47 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op_table.h +40 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.cpp +60 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.h +17 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/mark_decompression_convert_constant_folding.h +29 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/squeeze_matmul.cpp +58 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/squeeze_matmul.h +17 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/rt_info/weightless_caching_attributes.hpp +41 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/translate_session.cpp +317 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/translate_session.h +28 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/utils.cpp +257 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/utils.h +86 -0
- data/ext/sources/ggml/src/ggml-openvino/utils.cpp +880 -0
- data/ext/sources/ggml/src/ggml-openvino/utils.h +143 -0
- data/ext/sources/ggml/src/ggml-opt.cpp +1 -0
- data/ext/sources/ggml/src/ggml-quants.c +385 -119
- data/ext/sources/ggml/src/ggml-quants.h +6 -0
- data/ext/sources/ggml/src/ggml-rpc/CMakeLists.txt +24 -0
- data/ext/sources/ggml/src/ggml-rpc/ggml-rpc.cpp +167 -311
- data/ext/sources/ggml/src/ggml-rpc/transport.cpp +683 -0
- data/ext/sources/ggml/src/ggml-rpc/transport.h +34 -0
- data/ext/sources/ggml/src/ggml-sycl/CMakeLists.txt +64 -91
- data/ext/sources/ggml/src/ggml-sycl/add-id.cpp +5 -1
- data/ext/sources/ggml/src/ggml-sycl/backend.hpp +4 -1
- data/ext/sources/ggml/src/ggml-sycl/binbcast.cpp +21 -20
- data/ext/sources/ggml/src/ggml-sycl/common.cpp +74 -2
- data/ext/sources/ggml/src/ggml-sycl/common.hpp +356 -11
- data/ext/sources/ggml/src/ggml-sycl/convert.cpp +184 -14
- data/ext/sources/ggml/src/ggml-sycl/convert.hpp +31 -1
- data/ext/sources/ggml/src/ggml-sycl/count-equal.cpp +1 -1
- data/ext/sources/ggml/src/ggml-sycl/cumsum.cpp +148 -0
- data/ext/sources/ggml/src/ggml-sycl/cumsum.hpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/dequantize.hpp +663 -0
- data/ext/sources/ggml/src/ggml-sycl/diag.cpp +67 -0
- data/ext/sources/ggml/src/ggml-sycl/diag.hpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/dmmv.cpp +586 -6
- data/ext/sources/ggml/src/ggml-sycl/dpct/helper.hpp +791 -47
- data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +77 -156
- data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +2 -2
- data/ext/sources/ggml/src/ggml-sycl/fattn-buffers.cpp +56 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn-buffers.hpp +63 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn-common.hpp +1181 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn-tile.cpp +59 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn-tile.hpp +1246 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn-vec.hpp +674 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn.cpp +227 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn.hpp +22 -0
- data/ext/sources/ggml/src/ggml-sycl/fill.cpp +55 -0
- data/ext/sources/ggml/src/ggml-sycl/fill.hpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/gated_delta_net.cpp +347 -0
- data/ext/sources/ggml/src/ggml-sycl/gated_delta_net.hpp +9 -0
- data/ext/sources/ggml/src/ggml-sycl/gemm.hpp +3 -0
- data/ext/sources/ggml/src/ggml-sycl/getrows.cpp +79 -3
- data/ext/sources/ggml/src/ggml-sycl/ggml-sycl.cpp +1134 -236
- data/ext/sources/ggml/src/ggml-sycl/im2col.cpp +353 -89
- data/ext/sources/ggml/src/ggml-sycl/im2col.hpp +5 -3
- data/ext/sources/ggml/src/ggml-sycl/mmvq.cpp +1344 -26
- data/ext/sources/ggml/src/ggml-sycl/mmvq.hpp +16 -0
- data/ext/sources/ggml/src/ggml-sycl/norm.cpp +65 -66
- data/ext/sources/ggml/src/ggml-sycl/outprod.cpp +3 -3
- data/ext/sources/ggml/src/ggml-sycl/pad.cpp +27 -27
- data/ext/sources/ggml/src/ggml-sycl/presets.hpp +3 -0
- data/ext/sources/ggml/src/ggml-sycl/quants.hpp +72 -1
- data/ext/sources/ggml/src/ggml-sycl/rope.cpp +450 -287
- data/ext/sources/ggml/src/ggml-sycl/rope.hpp +6 -0
- data/ext/sources/ggml/src/ggml-sycl/set_rows.cpp +7 -1
- data/ext/sources/ggml/src/ggml-sycl/softmax.cpp +6 -6
- data/ext/sources/ggml/src/ggml-sycl/solve_tri.cpp +172 -0
- data/ext/sources/ggml/src/ggml-sycl/solve_tri.hpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/ssm_conv.cpp +6 -1
- data/ext/sources/ggml/src/ggml-sycl/ssm_scan.cpp +156 -0
- data/ext/sources/ggml/src/ggml-sycl/ssm_scan.hpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/sycl_hw.cpp +62 -10
- data/ext/sources/ggml/src/ggml-sycl/sycl_hw.hpp +18 -6
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq112-dv112.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq128-dv128.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq256-dv256.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq40-dv40.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq512-dv512.cpp +6 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq576-dv512.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq64-dv64.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq72-dv72.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq80-dv80.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq96-dv96.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-f16.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_0.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_1.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_0.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_1.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q8_0.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-f16.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_0.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_1.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_0.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_1.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q8_0.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-f16.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_0.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_1.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_0.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_1.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q8_0.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-f16.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_0.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_1.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_0.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_1.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q8_0.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-f16.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_0.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_1.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_0.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_1.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q8_0.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-f16.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_0.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_1.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_0.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_1.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q8_0.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/type.hpp +112 -0
- data/ext/sources/ggml/src/ggml-sycl/upscale.cpp +410 -0
- data/ext/sources/ggml/src/ggml-sycl/upscale.hpp +9 -0
- data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +228 -53
- data/ext/sources/ggml/src/ggml-sycl/wkv.cpp +1 -1
- data/ext/sources/ggml/src/ggml-virtgpu/CMakeLists.txt +70 -0
- data/ext/sources/ggml/src/ggml-virtgpu/apir_cs_ggml-rpc-front.cpp +87 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/CMakeLists.txt +21 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/apir_cs_ggml-rpc-back.cpp +115 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-convert.h +13 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-backend.cpp +102 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer-type.cpp +105 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer.cpp +179 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-device.cpp +148 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched.cpp +51 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched.gen.h +73 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched.h +27 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-virgl-apir.h +32 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend.cpp +144 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/api_remoting.h +95 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_backend.gen.h +94 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_backend.h +50 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_cs.h +378 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_cs_ggml.h +232 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_cs_rpc.h +58 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-buffer-type.cpp +81 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-buffer.cpp +123 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-device.cpp +160 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-reg.cpp +213 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend.cpp +71 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-remoting.h +71 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggmlremoting_functions.yaml +166 -0
- data/ext/sources/ggml/src/ggml-virtgpu/include/apir_hw.h +9 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-apir.h +15 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-backend.cpp +58 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-buffer-type.cpp +110 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-buffer.cpp +173 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-device.cpp +192 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-impl.h +36 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward.gen.h +53 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-shm.cpp +99 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-shm.h +23 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-utils.cpp +179 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-utils.h +86 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu.cpp +545 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu.h +115 -0
- data/ext/sources/ggml/src/ggml-vulkan/CMakeLists.txt +12 -1
- data/ext/sources/ggml/src/ggml-vulkan/ggml-vulkan.cpp +3250 -940
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/acc.comp +16 -8
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/contig_copy.comp +6 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +146 -13
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy.comp +3 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +25 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.glsl +88 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.glsl +643 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_nvfp4.comp +32 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q1_0.comp +29 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/diag.comp +0 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dot_product_funcs.glsl +27 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/elu.comp +27 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/exp.comp +0 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/feature-tests/coopmat2_decode_vector.comp +7 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +533 -180
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.glsl +113 -68
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +412 -222
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +222 -83
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_dequant.glsl +131 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_mask_opt.comp +162 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_mmq_funcs.glsl +203 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +9 -8
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/fwht.comp +115 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gated_delta_net.comp +189 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/generic_binary_head.glsl +0 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.glsl +10 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_main.glsl +16 -6
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +76 -54
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col_3d.comp +0 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/l2_norm.comp +12 -9
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/log.comp +0 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp +122 -27
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.glsl +20 -17
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iface.glsl +6 -6
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp +1 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq_funcs.glsl +88 -55
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +22 -20
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +51 -14
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.glsl +159 -125
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_id_funcs.glsl +3 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +5 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.glsl +8 -8
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_shmem_types.glsl +24 -9
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp +0 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +2 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl +39 -63
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.glsl +0 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +7 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +7 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +7 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl +13 -7
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_vision.comp +7 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sgn.comp +21 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/snake.comp +49 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/ssm_conv.comp +27 -11
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/tri.comp +0 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/types.glsl +79 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +193 -149
- data/ext/sources/ggml/src/ggml-webgpu/CMakeLists.txt +5 -2
- data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp +3221 -97
- data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu.cpp +3493 -1997
- data/ext/sources/ggml/src/ggml-webgpu/pre_wgsl.hpp +37 -7
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/add_id.wgsl +64 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/argmax.wgsl +72 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/argsort.wgsl +106 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/argsort_merge.wgsl +134 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary.wgsl +142 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +115 -141
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/concat.wgsl +93 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/conv2d.wgsl +165 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{cpy.tmpl.wgsl → cpy.wgsl} +25 -44
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cumsum.wgsl +66 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn.wgsl +198 -230
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_quant_staging.tmpl +124 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_tile.wgsl +397 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_vec_blk.wgsl +101 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_vec_reduce.wgsl +84 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_vec_split.wgsl +619 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/gated_delta_net.wgsl +149 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{get_rows.tmpl.wgsl → get_rows.wgsl} +234 -335
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/glu.wgsl +155 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/im2col.wgsl +101 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl +871 -42
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_id.wgsl +195 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_id_gather.wgsl +52 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_id_vec.wgsl +154 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.wgsl +149 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{mul_mat_subgroup_matrix.tmpl.wgsl → mul_mat_subgroup_matrix.wgsl} +36 -138
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.wgsl +151 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec_acc.tmpl +1432 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec_q_acc.tmpl +303 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/pad.wgsl +86 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/quant_inner_loops.tmpl +21 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/quantize_q8.wgsl +173 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/repeat.wgsl +67 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm_mul.wgsl +152 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{rope.tmpl.wgsl → rope.wgsl} +71 -142
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/row_norm.wgsl +153 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{scale.tmpl.wgsl → scale.wgsl} +15 -40
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set.wgsl +109 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +39 -12
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows_quant.wgsl +224 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{soft_max.tmpl.wgsl → soft_max.wgsl} +106 -206
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/solve_tri.wgsl +121 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/ssm_conv.wgsl +65 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/ssm_scan.wgsl +193 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/sum_rows.wgsl +55 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/unary.wgsl +213 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/upscale.wgsl +240 -0
- data/ext/sources/ggml/src/ggml-zdnn/ggml-zdnn.cpp +24 -15
- data/ext/sources/ggml/src/ggml-zendnn/CMakeLists.txt +31 -32
- data/ext/sources/ggml/src/ggml-zendnn/ggml-zendnn.cpp +253 -16
- data/ext/sources/ggml/src/ggml.c +268 -52
- data/ext/sources/ggml/src/gguf.cpp +377 -47
- data/ext/sources/include/parakeet.h +342 -0
- data/ext/sources/include/whisper.h +10 -0
- data/ext/sources/media/matmul.png +0 -0
- data/ext/sources/src/CMakeLists.txt +23 -0
- data/ext/sources/src/parakeet-arch.h +188 -0
- data/ext/sources/src/parakeet.cpp +3838 -0
- data/ext/sources/src/whisper.cpp +62 -40
- data/extsources.rb +26 -10
- data/lib/whisper/log_settable.rb +36 -0
- data/lib/whisper/model/uri.rb +13 -1
- data/lib/whisper/output.rb +74 -0
- data/sig/whisper.rbs +445 -55
- data/test/helper.rb +2 -0
- data/test/jfk_reader/jfk_reader.c +50 -7
- data/test/test_callback.rb +1 -0
- data/test/test_context_params.rb +82 -0
- data/test/test_package.rb +6 -5
- data/test/test_parakeet.rb +28 -0
- data/test/test_parakeet_callback.rb +107 -0
- data/test/test_parakeet_context.rb +116 -0
- data/test/test_parakeet_context_params.rb +24 -0
- data/test/test_parakeet_model.rb +21 -0
- data/test/test_parakeet_params.rb +78 -0
- data/test/test_parakeet_segment.rb +42 -0
- data/test/test_parakeet_token.rb +73 -0
- data/test/test_params.rb +2 -0
- data/test/test_token.rb +11 -0
- data/test/test_vad_context.rb +58 -8
- data/test/test_vad_segment.rb +1 -1
- data/test/test_whisper.rb +44 -6
- data/whispercpp.gemspec +2 -2
- metadata +426 -280
- data/ext/sources/bindings/javascript/CMakeLists.txt +0 -41
- data/ext/sources/bindings/javascript/emscripten.cpp +0 -93
- data/ext/sources/bindings/javascript/libwhisper.worker.js +0 -1
- data/ext/sources/bindings/javascript/package.json +0 -26
- data/ext/sources/bindings/javascript/whisper.js +0 -19
- data/ext/sources/examples/addon.node/CMakeLists.txt +0 -31
- data/ext/sources/examples/addon.node/__test__/whisper.spec.js +0 -133
- data/ext/sources/examples/addon.node/addon.cpp +0 -557
- data/ext/sources/examples/addon.node/index.js +0 -59
- data/ext/sources/examples/addon.node/package.json +0 -16
- data/ext/sources/examples/addon.node/vad-example.js +0 -132
- data/ext/sources/examples/bench.wasm/CMakeLists.txt +0 -49
- data/ext/sources/examples/bench.wasm/emscripten.cpp +0 -87
- data/ext/sources/examples/bench.wasm/index-tmpl.html +0 -285
- data/ext/sources/examples/coi-serviceworker.js +0 -146
- data/ext/sources/examples/command/CMakeLists.txt +0 -10
- data/ext/sources/examples/command/command.cpp +0 -802
- data/ext/sources/examples/command/commands.txt +0 -9
- data/ext/sources/examples/command.wasm/CMakeLists.txt +0 -50
- data/ext/sources/examples/command.wasm/emscripten.cpp +0 -327
- data/ext/sources/examples/command.wasm/index-tmpl.html +0 -415
- data/ext/sources/examples/generate-karaoke.sh +0 -57
- data/ext/sources/examples/helpers.js +0 -191
- data/ext/sources/examples/livestream.sh +0 -112
- data/ext/sources/examples/lsp/CMakeLists.txt +0 -10
- data/ext/sources/examples/lsp/lsp.cpp +0 -471
- data/ext/sources/examples/lsp/whisper.vim +0 -362
- data/ext/sources/examples/python/test_whisper_processor.py +0 -7
- data/ext/sources/examples/python/whisper_processor.py +0 -54
- data/ext/sources/examples/server/bench.js +0 -29
- data/ext/sources/examples/server.py +0 -120
- data/ext/sources/examples/stream/CMakeLists.txt +0 -10
- data/ext/sources/examples/stream/stream.cpp +0 -437
- data/ext/sources/examples/stream.wasm/CMakeLists.txt +0 -49
- data/ext/sources/examples/stream.wasm/emscripten.cpp +0 -216
- data/ext/sources/examples/stream.wasm/index-tmpl.html +0 -491
- data/ext/sources/examples/sycl/CMakeLists.txt +0 -9
- data/ext/sources/examples/sycl/build.sh +0 -22
- data/ext/sources/examples/sycl/ls-sycl-device.cpp +0 -11
- data/ext/sources/examples/sycl/run-whisper.sh +0 -17
- data/ext/sources/examples/talk-llama/CMakeLists.txt +0 -47
- data/ext/sources/examples/talk-llama/eleven-labs.py +0 -80
- data/ext/sources/examples/talk-llama/llama-adapter.cpp +0 -494
- data/ext/sources/examples/talk-llama/llama-adapter.h +0 -88
- data/ext/sources/examples/talk-llama/llama-arch.cpp +0 -2559
- data/ext/sources/examples/talk-llama/llama-arch.h +0 -586
- data/ext/sources/examples/talk-llama/llama-batch.cpp +0 -917
- data/ext/sources/examples/talk-llama/llama-batch.h +0 -173
- data/ext/sources/examples/talk-llama/llama-chat.cpp +0 -876
- data/ext/sources/examples/talk-llama/llama-chat.h +0 -70
- data/ext/sources/examples/talk-llama/llama-context.cpp +0 -3645
- data/ext/sources/examples/talk-llama/llama-context.h +0 -360
- data/ext/sources/examples/talk-llama/llama-cparams.cpp +0 -5
- data/ext/sources/examples/talk-llama/llama-cparams.h +0 -42
- data/ext/sources/examples/talk-llama/llama-grammar.cpp +0 -1464
- data/ext/sources/examples/talk-llama/llama-grammar.h +0 -194
- data/ext/sources/examples/talk-llama/llama-graph.cpp +0 -2282
- data/ext/sources/examples/talk-llama/llama-graph.h +0 -910
- data/ext/sources/examples/talk-llama/llama-hparams.cpp +0 -241
- data/ext/sources/examples/talk-llama/llama-hparams.h +0 -284
- data/ext/sources/examples/talk-llama/llama-impl.cpp +0 -171
- data/ext/sources/examples/talk-llama/llama-impl.h +0 -63
- data/ext/sources/examples/talk-llama/llama-io.cpp +0 -15
- data/ext/sources/examples/talk-llama/llama-io.h +0 -35
- data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.cpp +0 -328
- data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.h +0 -137
- data/ext/sources/examples/talk-llama/llama-kv-cache.cpp +0 -2100
- data/ext/sources/examples/talk-llama/llama-kv-cache.h +0 -390
- data/ext/sources/examples/talk-llama/llama-kv-cells.h +0 -533
- data/ext/sources/examples/talk-llama/llama-memory-hybrid.cpp +0 -268
- data/ext/sources/examples/talk-llama/llama-memory-hybrid.h +0 -139
- data/ext/sources/examples/talk-llama/llama-memory-recurrent.cpp +0 -1167
- data/ext/sources/examples/talk-llama/llama-memory-recurrent.h +0 -182
- data/ext/sources/examples/talk-llama/llama-memory.cpp +0 -59
- data/ext/sources/examples/talk-llama/llama-memory.h +0 -122
- data/ext/sources/examples/talk-llama/llama-mmap.cpp +0 -735
- data/ext/sources/examples/talk-llama/llama-mmap.h +0 -73
- data/ext/sources/examples/talk-llama/llama-model-loader.cpp +0 -1247
- data/ext/sources/examples/talk-llama/llama-model-loader.h +0 -176
- data/ext/sources/examples/talk-llama/llama-model-saver.cpp +0 -285
- data/ext/sources/examples/talk-llama/llama-model-saver.h +0 -37
- data/ext/sources/examples/talk-llama/llama-model.cpp +0 -8338
- data/ext/sources/examples/talk-llama/llama-model.h +0 -544
- data/ext/sources/examples/talk-llama/llama-quant.cpp +0 -1072
- data/ext/sources/examples/talk-llama/llama-quant.h +0 -1
- data/ext/sources/examples/talk-llama/llama-sampling.cpp +0 -3771
- data/ext/sources/examples/talk-llama/llama-sampling.h +0 -44
- data/ext/sources/examples/talk-llama/llama-vocab.cpp +0 -3900
- data/ext/sources/examples/talk-llama/llama-vocab.h +0 -182
- data/ext/sources/examples/talk-llama/llama.cpp +0 -1140
- data/ext/sources/examples/talk-llama/llama.h +0 -1540
- data/ext/sources/examples/talk-llama/models/afmoe.cpp +0 -191
- data/ext/sources/examples/talk-llama/models/apertus.cpp +0 -125
- data/ext/sources/examples/talk-llama/models/arcee.cpp +0 -135
- data/ext/sources/examples/talk-llama/models/arctic.cpp +0 -138
- data/ext/sources/examples/talk-llama/models/arwkv7.cpp +0 -86
- data/ext/sources/examples/talk-llama/models/baichuan.cpp +0 -122
- data/ext/sources/examples/talk-llama/models/bailingmoe.cpp +0 -144
- data/ext/sources/examples/talk-llama/models/bailingmoe2.cpp +0 -135
- data/ext/sources/examples/talk-llama/models/bert.cpp +0 -178
- data/ext/sources/examples/talk-llama/models/bitnet.cpp +0 -160
- data/ext/sources/examples/talk-llama/models/bloom.cpp +0 -101
- data/ext/sources/examples/talk-llama/models/chameleon.cpp +0 -178
- data/ext/sources/examples/talk-llama/models/chatglm.cpp +0 -132
- data/ext/sources/examples/talk-llama/models/codeshell.cpp +0 -111
- data/ext/sources/examples/talk-llama/models/cogvlm.cpp +0 -102
- data/ext/sources/examples/talk-llama/models/cohere2-iswa.cpp +0 -134
- data/ext/sources/examples/talk-llama/models/command-r.cpp +0 -122
- data/ext/sources/examples/talk-llama/models/dbrx.cpp +0 -123
- data/ext/sources/examples/talk-llama/models/deci.cpp +0 -135
- data/ext/sources/examples/talk-llama/models/deepseek.cpp +0 -144
- data/ext/sources/examples/talk-llama/models/deepseek2.cpp +0 -259
- data/ext/sources/examples/talk-llama/models/dots1.cpp +0 -134
- data/ext/sources/examples/talk-llama/models/dream.cpp +0 -105
- data/ext/sources/examples/talk-llama/models/ernie4-5-moe.cpp +0 -150
- data/ext/sources/examples/talk-llama/models/ernie4-5.cpp +0 -110
- data/ext/sources/examples/talk-llama/models/exaone.cpp +0 -114
- data/ext/sources/examples/talk-llama/models/exaone4.cpp +0 -123
- data/ext/sources/examples/talk-llama/models/falcon-h1.cpp +0 -113
- data/ext/sources/examples/talk-llama/models/falcon.cpp +0 -120
- data/ext/sources/examples/talk-llama/models/gemma-embedding.cpp +0 -116
- data/ext/sources/examples/talk-llama/models/gemma.cpp +0 -112
- data/ext/sources/examples/talk-llama/models/gemma2-iswa.cpp +0 -128
- data/ext/sources/examples/talk-llama/models/gemma3.cpp +0 -155
- data/ext/sources/examples/talk-llama/models/gemma3n-iswa.cpp +0 -384
- data/ext/sources/examples/talk-llama/models/glm4-moe.cpp +0 -170
- data/ext/sources/examples/talk-llama/models/glm4.cpp +0 -150
- data/ext/sources/examples/talk-llama/models/gpt2.cpp +0 -105
- data/ext/sources/examples/talk-llama/models/gptneox.cpp +0 -144
- data/ext/sources/examples/talk-llama/models/granite-hybrid.cpp +0 -196
- data/ext/sources/examples/talk-llama/models/granite.cpp +0 -211
- data/ext/sources/examples/talk-llama/models/graph-context-mamba.cpp +0 -283
- data/ext/sources/examples/talk-llama/models/grok.cpp +0 -159
- data/ext/sources/examples/talk-llama/models/grovemoe.cpp +0 -141
- data/ext/sources/examples/talk-llama/models/hunyuan-dense.cpp +0 -132
- data/ext/sources/examples/talk-llama/models/hunyuan-moe.cpp +0 -154
- data/ext/sources/examples/talk-llama/models/internlm2.cpp +0 -120
- data/ext/sources/examples/talk-llama/models/jais.cpp +0 -86
- data/ext/sources/examples/talk-llama/models/jamba.cpp +0 -106
- data/ext/sources/examples/talk-llama/models/lfm2.cpp +0 -175
- data/ext/sources/examples/talk-llama/models/llada-moe.cpp +0 -122
- data/ext/sources/examples/talk-llama/models/llada.cpp +0 -99
- data/ext/sources/examples/talk-llama/models/llama-iswa.cpp +0 -178
- data/ext/sources/examples/talk-llama/models/llama.cpp +0 -168
- data/ext/sources/examples/talk-llama/models/maincoder.cpp +0 -117
- data/ext/sources/examples/talk-llama/models/mamba.cpp +0 -55
- data/ext/sources/examples/talk-llama/models/mimo2-iswa.cpp +0 -123
- data/ext/sources/examples/talk-llama/models/minicpm3.cpp +0 -199
- data/ext/sources/examples/talk-llama/models/minimax-m2.cpp +0 -124
- data/ext/sources/examples/talk-llama/models/mistral3.cpp +0 -160
- data/ext/sources/examples/talk-llama/models/models.h +0 -569
- data/ext/sources/examples/talk-llama/models/modern-bert.cpp +0 -116
- data/ext/sources/examples/talk-llama/models/mpt.cpp +0 -126
- data/ext/sources/examples/talk-llama/models/nemotron-h.cpp +0 -150
- data/ext/sources/examples/talk-llama/models/nemotron.cpp +0 -122
- data/ext/sources/examples/talk-llama/models/neo-bert.cpp +0 -104
- data/ext/sources/examples/talk-llama/models/olmo.cpp +0 -121
- data/ext/sources/examples/talk-llama/models/olmo2.cpp +0 -150
- data/ext/sources/examples/talk-llama/models/olmoe.cpp +0 -124
- data/ext/sources/examples/talk-llama/models/openai-moe-iswa.cpp +0 -127
- data/ext/sources/examples/talk-llama/models/openelm.cpp +0 -124
- data/ext/sources/examples/talk-llama/models/orion.cpp +0 -123
- data/ext/sources/examples/talk-llama/models/pangu-embedded.cpp +0 -121
- data/ext/sources/examples/talk-llama/models/phi2.cpp +0 -121
- data/ext/sources/examples/talk-llama/models/phi3.cpp +0 -152
- data/ext/sources/examples/talk-llama/models/plamo.cpp +0 -110
- data/ext/sources/examples/talk-llama/models/plamo2.cpp +0 -316
- data/ext/sources/examples/talk-llama/models/plamo3.cpp +0 -128
- data/ext/sources/examples/talk-llama/models/plm.cpp +0 -168
- data/ext/sources/examples/talk-llama/models/qwen.cpp +0 -108
- data/ext/sources/examples/talk-llama/models/qwen2.cpp +0 -126
- data/ext/sources/examples/talk-llama/models/qwen2moe.cpp +0 -151
- data/ext/sources/examples/talk-llama/models/qwen2vl.cpp +0 -117
- data/ext/sources/examples/talk-llama/models/qwen3.cpp +0 -117
- data/ext/sources/examples/talk-llama/models/qwen3moe.cpp +0 -124
- data/ext/sources/examples/talk-llama/models/qwen3next.cpp +0 -873
- data/ext/sources/examples/talk-llama/models/qwen3vl-moe.cpp +0 -149
- data/ext/sources/examples/talk-llama/models/qwen3vl.cpp +0 -141
- data/ext/sources/examples/talk-llama/models/refact.cpp +0 -94
- data/ext/sources/examples/talk-llama/models/rnd1.cpp +0 -126
- data/ext/sources/examples/talk-llama/models/rwkv6-base.cpp +0 -162
- data/ext/sources/examples/talk-llama/models/rwkv6.cpp +0 -94
- data/ext/sources/examples/talk-llama/models/rwkv6qwen2.cpp +0 -86
- data/ext/sources/examples/talk-llama/models/rwkv7-base.cpp +0 -135
- data/ext/sources/examples/talk-llama/models/rwkv7.cpp +0 -90
- data/ext/sources/examples/talk-llama/models/seed-oss.cpp +0 -124
- data/ext/sources/examples/talk-llama/models/smallthinker.cpp +0 -126
- data/ext/sources/examples/talk-llama/models/smollm3.cpp +0 -128
- data/ext/sources/examples/talk-llama/models/stablelm.cpp +0 -146
- data/ext/sources/examples/talk-llama/models/starcoder.cpp +0 -100
- data/ext/sources/examples/talk-llama/models/starcoder2.cpp +0 -121
- data/ext/sources/examples/talk-llama/models/t5-dec.cpp +0 -166
- data/ext/sources/examples/talk-llama/models/t5-enc.cpp +0 -96
- data/ext/sources/examples/talk-llama/models/wavtokenizer-dec.cpp +0 -149
- data/ext/sources/examples/talk-llama/models/xverse.cpp +0 -108
- data/ext/sources/examples/talk-llama/prompts/talk-alpaca.txt +0 -23
- data/ext/sources/examples/talk-llama/speak +0 -40
- data/ext/sources/examples/talk-llama/speak.bat +0 -1
- data/ext/sources/examples/talk-llama/speak.ps1 +0 -14
- data/ext/sources/examples/talk-llama/talk-llama.cpp +0 -813
- data/ext/sources/examples/talk-llama/unicode-data.cpp +0 -7034
- data/ext/sources/examples/talk-llama/unicode-data.h +0 -20
- data/ext/sources/examples/talk-llama/unicode.cpp +0 -1147
- data/ext/sources/examples/talk-llama/unicode.h +0 -111
- data/ext/sources/examples/wchess/CMakeLists.txt +0 -10
- data/ext/sources/examples/wchess/libwchess/CMakeLists.txt +0 -19
- data/ext/sources/examples/wchess/libwchess/Chessboard.cpp +0 -803
- data/ext/sources/examples/wchess/libwchess/Chessboard.h +0 -33
- data/ext/sources/examples/wchess/libwchess/WChess.cpp +0 -193
- data/ext/sources/examples/wchess/libwchess/WChess.h +0 -63
- data/ext/sources/examples/wchess/libwchess/test-chessboard.cpp +0 -117
- data/ext/sources/examples/wchess/wchess.cmd/CMakeLists.txt +0 -8
- data/ext/sources/examples/wchess/wchess.cmd/wchess.cmd.cpp +0 -253
- data/ext/sources/examples/whisper.wasm/CMakeLists.txt +0 -50
- data/ext/sources/examples/whisper.wasm/emscripten.cpp +0 -118
- data/ext/sources/examples/whisper.wasm/index-tmpl.html +0 -659
- data/ext/sources/ggml/cmake/BuildTypes.cmake +0 -54
- data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm-ppc.h +0 -333
- data/ext/sources/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +0 -99
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp-dma.h +0 -157
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp-msg.h +0 -165
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-exp.c +0 -94
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-inverse.c +0 -72
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sigmoid.c +0 -49
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-utils.c +0 -1020
- data/ext/sources/ggml/src/ggml-hexagon/htp/ops-utils.h +0 -149
- data/ext/sources/ggml/src/ggml-hexagon/htp-utils.c +0 -454
- data/ext/sources/ggml/src/ggml-hexagon/htp-utils.h +0 -221
- data/ext/sources/ggml/src/ggml-hexagon/op-desc.h +0 -153
- data/ext/sources/ggml/src/ggml-opencl/kernels/embed_kernel.py +0 -26
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rte.glsl +0 -5
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/bin_op.tmpl.wgsl +0 -188
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary_head.tmpl +0 -45
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +0 -147
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/glu.tmpl.wgsl +0 -323
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl +0 -907
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.tmpl.wgsl +0 -247
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.tmpl.wgsl +0 -267
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm.wgsl +0 -123
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.tmpl.wgsl +0 -112
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/unary_op.wgsl +0 -483
- data/ext/sources/tests/CMakeLists.txt +0 -112
- data/ext/sources/tests/earnings21/eval.mk +0 -58
- data/ext/sources/tests/earnings21/eval.py +0 -68
- data/ext/sources/tests/earnings21/normalizers/__init__.py +0 -2
- data/ext/sources/tests/earnings21/normalizers/basic.py +0 -80
- data/ext/sources/tests/earnings21/normalizers/english.json +0 -1741
- data/ext/sources/tests/earnings21/normalizers/english.py +0 -550
- data/ext/sources/tests/earnings21/requirements.txt +0 -6
- data/ext/sources/tests/en-0-ref.txt +0 -1
- data/ext/sources/tests/en-1-ref.txt +0 -1
- data/ext/sources/tests/en-2-ref.txt +0 -1
- data/ext/sources/tests/es-0-ref.txt +0 -1
- data/ext/sources/tests/librispeech/eval.mk +0 -39
- data/ext/sources/tests/librispeech/eval.py +0 -47
- data/ext/sources/tests/librispeech/normalizers/__init__.py +0 -2
- data/ext/sources/tests/librispeech/normalizers/basic.py +0 -80
- data/ext/sources/tests/librispeech/normalizers/english.json +0 -1741
- data/ext/sources/tests/librispeech/normalizers/english.py +0 -550
- data/ext/sources/tests/librispeech/requirements.txt +0 -6
- data/ext/sources/tests/run-tests.sh +0 -130
- data/ext/sources/tests/test-c.c +0 -3
- data/ext/sources/tests/test-vad-full.cpp +0 -56
- data/ext/sources/tests/test-vad.cpp +0 -83
- data/ext/sources/tests/test-whisper.js +0 -58
- data/lib/whisper/context.rb +0 -15
- data/lib/whisper/segment.rb +0 -58
|
@@ -19,6 +19,7 @@
|
|
|
19
19
|
#include <cstdlib>
|
|
20
20
|
#include <float.h>
|
|
21
21
|
#include <limits>
|
|
22
|
+
#include <optional>
|
|
22
23
|
#include <stdint.h>
|
|
23
24
|
#include <stdio.h>
|
|
24
25
|
#include <vector>
|
|
@@ -30,11 +31,21 @@
|
|
|
30
31
|
#include <regex>
|
|
31
32
|
|
|
32
33
|
#include <sycl/sycl.hpp>
|
|
34
|
+
#include <sycl/backend.hpp>
|
|
35
|
+
#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
|
|
36
|
+
#include <level_zero/ze_api.h>
|
|
37
|
+
#endif
|
|
33
38
|
#if defined(GGML_SYCL_GRAPH) && SYCL_EXT_ONEAPI_ASYNC_MEMORY_ALLOC
|
|
34
39
|
# include <sycl/ext/oneapi/experimental/async_alloc/async_alloc.hpp>
|
|
35
40
|
#endif
|
|
41
|
+
#if SYCL_EXT_ONEAPI_VIRTUAL_MEM
|
|
42
|
+
# include <sycl/ext/oneapi/virtual_mem/physical_mem.hpp>
|
|
43
|
+
# include <sycl/ext/oneapi/virtual_mem/virtual_mem.hpp>
|
|
44
|
+
# define GGML_SYCL_USE_VMM
|
|
45
|
+
#endif
|
|
36
46
|
#include <sycl/half_type.hpp>
|
|
37
47
|
|
|
48
|
+
#include "ggml.h"
|
|
38
49
|
#include "ggml-sycl.h"
|
|
39
50
|
#include "ggml-impl.h"
|
|
40
51
|
#include "ggml-backend-impl.h"
|
|
@@ -43,25 +54,35 @@
|
|
|
43
54
|
#include "ggml-sycl/backend.hpp"
|
|
44
55
|
#include "ggml-sycl/common.hpp"
|
|
45
56
|
#include "ggml-sycl/element_wise.hpp"
|
|
57
|
+
#include "ggml-sycl/gemm.hpp"
|
|
58
|
+
#include "ggml-sycl/getrows.hpp"
|
|
46
59
|
#include "ggml-sycl/norm.hpp"
|
|
47
60
|
#include "ggml-sycl/presets.hpp"
|
|
48
|
-
#include "ggml-sycl/
|
|
61
|
+
#include "ggml-sycl/quantize.hpp"
|
|
62
|
+
#include "ggml-sycl/repeat_back.hpp"
|
|
49
63
|
#include "ggml-sycl/set_rows.hpp"
|
|
50
64
|
#include "ggml-sycl/set.hpp"
|
|
51
|
-
#include "ggml-sycl/sycl_hw.hpp"
|
|
52
|
-
#include "ggml-sycl/getrows.hpp"
|
|
53
|
-
#include "ggml-sycl/repeat_back.hpp"
|
|
54
|
-
#include "ggml-sycl/quantize.hpp"
|
|
55
65
|
#include "ggml-sycl/ssm_conv.hpp"
|
|
56
|
-
#include "ggml.
|
|
66
|
+
#include "ggml-sycl/sycl_hw.hpp"
|
|
67
|
+
#include "ggml-sycl/ssm_scan.hpp"
|
|
68
|
+
#include "ggml-sycl/fill.hpp"
|
|
69
|
+
#include "ggml-sycl/cumsum.hpp"
|
|
70
|
+
#include "ggml-sycl/diag.hpp"
|
|
71
|
+
#include "ggml-sycl/solve_tri.hpp"
|
|
72
|
+
#include "ggml-sycl/gated_delta_net.hpp"
|
|
57
73
|
|
|
58
74
|
static bool g_sycl_loaded = false;
|
|
59
75
|
int g_ggml_sycl_debug = 0;
|
|
60
76
|
int g_ggml_sycl_disable_optimize = 0;
|
|
61
77
|
int g_ggml_sycl_disable_graph = 0;
|
|
62
78
|
int g_ggml_sycl_disable_dnn = 0;
|
|
79
|
+
int g_ggml_sycl_enable_vmm = 1;
|
|
63
80
|
int g_ggml_sycl_prioritize_dmmv = 0;
|
|
64
81
|
int g_ggml_sycl_use_async_mem_op = 0;
|
|
82
|
+
int g_ggml_sycl_use_async_mem_op_requested = 1;
|
|
83
|
+
int g_ggml_sycl_enable_level_zero = 0;
|
|
84
|
+
int g_ggml_sycl_enable_flash_attention = 1;
|
|
85
|
+
|
|
65
86
|
|
|
66
87
|
static ggml_sycl_device_info ggml_sycl_init() {
|
|
67
88
|
ggml_sycl_device_info info = {};
|
|
@@ -82,23 +103,50 @@ static ggml_sycl_device_info ggml_sycl_init() {
|
|
|
82
103
|
// GGML_LOG_INFO("%s: SYCL_USE_XMX: no\n", __func__);
|
|
83
104
|
// #endif
|
|
84
105
|
for (int i = 0; i < info.device_count; ++i) {
|
|
85
|
-
info.devices[i].vmm = 0;
|
|
86
106
|
dpct::device_info prop;
|
|
87
|
-
|
|
107
|
+
auto & device = dpct::dev_mgr::instance().get_device(i);
|
|
88
108
|
|
|
89
109
|
SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info(
|
|
90
110
|
prop, device)));
|
|
91
111
|
|
|
112
|
+
#if !defined(GGML_SYCL_USE_VMM)
|
|
113
|
+
info.devices[i].vmm = 0;
|
|
114
|
+
#else
|
|
115
|
+
info.devices[i].vmm = device.has(sycl::aspect::ext_oneapi_virtual_mem);
|
|
116
|
+
if (info.devices[i].vmm) {
|
|
117
|
+
// NB: SYCL's get_mem_granularity always returns the _minimum_ granularity,
|
|
118
|
+
// but the L0 API requires a larger page size for allocs above 2 MiB and
|
|
119
|
+
// rejects non-multiples with UR_RESULT_ERROR_INVALID_VALUE [sic].
|
|
120
|
+
// Here we clamp it to 2 MiB for simplicity, but other devices may require
|
|
121
|
+
// calling zeVirtualMemQueryPageSize or yet unexposed public API.
|
|
122
|
+
const size_t physical_page = 2ull << 20; // 2 MiB
|
|
123
|
+
info.devices[i].vmm_granularity = std::max<size_t>(
|
|
124
|
+
sycl::ext::oneapi::experimental::get_mem_granularity(
|
|
125
|
+
device, sycl::context(device)),
|
|
126
|
+
physical_page);
|
|
127
|
+
}
|
|
128
|
+
#endif
|
|
129
|
+
|
|
92
130
|
info.default_tensor_split[i] = total_vram;
|
|
93
131
|
total_vram += prop.get_global_mem_size();
|
|
94
132
|
|
|
95
133
|
info.devices[i].cc =
|
|
96
134
|
100 * prop.get_major_version() + 10 * prop.get_minor_version();
|
|
97
|
-
info.devices[i].nsm = prop.get_max_compute_units();
|
|
135
|
+
info.devices[i].nsm = prop.get_max_compute_units() / 16; //16: Number of Xe Cores
|
|
98
136
|
info.devices[i].opt_feature.reorder = device.ext_oneapi_architecture_is(syclex::arch_category::intel_gpu);
|
|
99
137
|
info.devices[i].smpbo = prop.get_local_mem_size();
|
|
138
|
+
info.devices[i].warp_size = WARP_SIZE;
|
|
100
139
|
|
|
101
140
|
info.max_work_group_sizes[i] = prop.get_max_work_group_size();
|
|
141
|
+
info.devices[i].max_wg_per_cu = info.max_work_group_sizes[i] / prop.get_max_compute_units();
|
|
142
|
+
info.devices[i].hw_info = get_device_hw_info(&device);
|
|
143
|
+
|
|
144
|
+
// Only check GPU devices; CPU devices use OpenCL and would otherwise
|
|
145
|
+
// disable Level Zero for the GPUs on systems without ONEAPI_DEVICE_SELECTOR set.
|
|
146
|
+
if (device.is_gpu() && device.default_queue().get_backend() != sycl::backend::ext_oneapi_level_zero) {
|
|
147
|
+
GGML_LOG_WARN("SYCL GPU device %d does not use Level Zero backend, disabling Level Zero memory API\n", i);
|
|
148
|
+
info.ext_oneapi_level_zero = false;
|
|
149
|
+
}
|
|
102
150
|
}
|
|
103
151
|
|
|
104
152
|
for (int id = 0; id < info.device_count; ++id) {
|
|
@@ -210,8 +258,54 @@ static void ggml_check_sycl() try {
|
|
|
210
258
|
g_ggml_sycl_disable_optimize = get_sycl_env("GGML_SYCL_DISABLE_OPT", 0);
|
|
211
259
|
g_ggml_sycl_disable_graph = get_sycl_env("GGML_SYCL_DISABLE_GRAPH", 1);
|
|
212
260
|
g_ggml_sycl_disable_dnn = get_sycl_env("GGML_SYCL_DISABLE_DNN", 0);
|
|
261
|
+
g_ggml_sycl_enable_vmm = get_sycl_env("GGML_SYCL_ENABLE_VMM", 1);
|
|
213
262
|
g_ggml_sycl_prioritize_dmmv = get_sycl_env("GGML_SYCL_PRIORITIZE_DMMV", 0);
|
|
263
|
+
#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
|
|
264
|
+
g_ggml_sycl_enable_level_zero = get_sycl_env("GGML_SYCL_ENABLE_LEVEL_ZERO", ggml_sycl_info().ext_oneapi_level_zero);
|
|
265
|
+
#else
|
|
266
|
+
g_ggml_sycl_enable_level_zero = 0;
|
|
267
|
+
#endif
|
|
268
|
+
|
|
269
|
+
#ifdef SYCL_FLASH_ATTN
|
|
270
|
+
g_ggml_sycl_enable_flash_attention = get_sycl_env("GGML_SYCL_ENABLE_FLASH_ATTN", 1);
|
|
271
|
+
#else
|
|
272
|
+
g_ggml_sycl_enable_flash_attention = 0;
|
|
273
|
+
#endif
|
|
274
|
+
|
|
214
275
|
GGML_SYCL_DEBUG("[SYCL] call ggml_check_sycl\n");
|
|
276
|
+
|
|
277
|
+
GGML_LOG_INFO("Build with Macros:\n");
|
|
278
|
+
#if defined(GGML_SYCL_FORCE_MMQ)
|
|
279
|
+
GGML_LOG_INFO(" GGML_SYCL_FORCE_MMQ: yes\n");
|
|
280
|
+
#else
|
|
281
|
+
GGML_LOG_INFO(" GGML_SYCL_FORCE_MMQ: no\n");
|
|
282
|
+
#endif
|
|
283
|
+
#if defined(GGML_SYCL_F16)
|
|
284
|
+
GGML_LOG_INFO(" GGML_SYCL_F16: yes\n");
|
|
285
|
+
#else
|
|
286
|
+
GGML_LOG_INFO(" GGML_SYCL_F16: no\n");
|
|
287
|
+
#endif
|
|
288
|
+
#if defined(GGML_SYCL_GRAPH)
|
|
289
|
+
GGML_LOG_INFO(" GGML_SYCL_GRAPH: yes\n");
|
|
290
|
+
#else
|
|
291
|
+
GGML_LOG_INFO(" GGML_SYCL_GRAPH: no\n");
|
|
292
|
+
#endif
|
|
293
|
+
#if defined(GGML_SYCL_DNNL)
|
|
294
|
+
GGML_LOG_INFO(" GGML_SYCL_DNNL: yes\n");
|
|
295
|
+
#else
|
|
296
|
+
GGML_LOG_INFO(" GGML_SYCL_DNNL: no\n");
|
|
297
|
+
#endif
|
|
298
|
+
#if defined(GGML_SYCL_SUPPORT_LEVEL_ZERO)
|
|
299
|
+
GGML_LOG_INFO(" GGML_SYCL_SUPPORT_LEVEL_ZERO: yes\n");
|
|
300
|
+
#else
|
|
301
|
+
GGML_LOG_INFO(" GGML_SYCL_SUPPORT_LEVEL_ZERO: no\n");
|
|
302
|
+
#endif
|
|
303
|
+
#if defined(GGML_SYCL_USE_VMM)
|
|
304
|
+
GGML_LOG_INFO(" GGML_SYCL_USE_VMM: yes\n");
|
|
305
|
+
#else
|
|
306
|
+
GGML_LOG_INFO(" GGML_SYCL_USE_VMM: no\n");
|
|
307
|
+
#endif
|
|
308
|
+
|
|
215
309
|
GGML_LOG_INFO("Running with Environment Variables:\n");
|
|
216
310
|
GGML_LOG_INFO(" GGML_SYCL_DEBUG: %d\n", g_ggml_sycl_debug);
|
|
217
311
|
GGML_LOG_INFO(" GGML_SYCL_DISABLE_OPT: %d\n", g_ggml_sycl_disable_optimize);
|
|
@@ -220,22 +314,30 @@ static void ggml_check_sycl() try {
|
|
|
220
314
|
#else
|
|
221
315
|
GGML_LOG_INFO(" GGML_SYCL_DISABLE_GRAPH: graph disabled by compile flag\n");
|
|
222
316
|
#endif
|
|
317
|
+
#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
|
|
318
|
+
GGML_LOG_INFO(" GGML_SYCL_ENABLE_LEVEL_ZERO: %d\n", g_ggml_sycl_enable_level_zero);
|
|
319
|
+
#else
|
|
320
|
+
GGML_LOG_INFO(" GGML_SYCL_ENABLE_LEVEL_ZERO: Level Zero disabled by compile flag\n");
|
|
321
|
+
#endif
|
|
223
322
|
#if GGML_SYCL_DNNL
|
|
224
323
|
GGML_LOG_INFO(" GGML_SYCL_DISABLE_DNN: %d\n", g_ggml_sycl_disable_dnn);
|
|
225
324
|
#else
|
|
226
325
|
GGML_LOG_INFO(" GGML_SYCL_DISABLE_DNN: DNN disabled by compile flag\n");
|
|
227
326
|
#endif
|
|
228
|
-
|
|
229
|
-
GGML_LOG_INFO("
|
|
230
|
-
#if defined(GGML_SYCL_FORCE_MMQ)
|
|
231
|
-
GGML_LOG_INFO(" GGML_SYCL_FORCE_MMQ: yes\n");
|
|
327
|
+
#if defined(GGML_SYCL_USE_VMM)
|
|
328
|
+
GGML_LOG_INFO(" GGML_SYCL_ENABLE_VMM: %d\n", g_ggml_sycl_enable_vmm);
|
|
232
329
|
#else
|
|
233
|
-
GGML_LOG_INFO("
|
|
330
|
+
GGML_LOG_INFO(" GGML_SYCL_ENABLE_VMM: virtual memory extension is not available\n");
|
|
234
331
|
#endif
|
|
235
|
-
|
|
236
|
-
|
|
332
|
+
GGML_LOG_INFO(" GGML_SYCL_PRIORITIZE_DMMV: %d\n", g_ggml_sycl_prioritize_dmmv);
|
|
333
|
+
g_ggml_sycl_use_async_mem_op_requested = get_sycl_env("GGML_SYCL_USE_ASYNC_MEM_OP", 1);
|
|
334
|
+
GGML_LOG_INFO(" GGML_SYCL_USE_ASYNC_MEM_OP: %d\n", g_ggml_sycl_use_async_mem_op_requested);
|
|
335
|
+
|
|
336
|
+
#ifdef SYCL_FLASH_ATTN
|
|
337
|
+
GGML_LOG_INFO(" GGML_SYCL_ENABLE_FLASH_ATTN: %d\n", g_ggml_sycl_enable_flash_attention);
|
|
237
338
|
#else
|
|
238
|
-
GGML_LOG_INFO("
|
|
339
|
+
GGML_LOG_INFO(" GGML_SYCL_ENABLE_FLASH_ATTN: %d disabled by compile flag\n",
|
|
340
|
+
g_ggml_sycl_enable_flash_attention);
|
|
239
341
|
#endif
|
|
240
342
|
|
|
241
343
|
/* NOT REMOVE, keep it for next optimize for XMX.
|
|
@@ -245,11 +347,11 @@ static void ggml_check_sycl() try {
|
|
|
245
347
|
fprintf(stderr, "%s: SYCL_USE_XMX: no\n", __func__);
|
|
246
348
|
#endif
|
|
247
349
|
*/
|
|
248
|
-
//
|
|
249
|
-
//
|
|
250
|
-
//
|
|
350
|
+
// Async USM allocation/free is also useful outside the graph path: it avoids the host waits in the reorder
|
|
351
|
+
// staging path while preserving queue ordering semantics. Graph support still depends on the extension being
|
|
352
|
+
// available, but it no longer needs to control the non-graph fast path.
|
|
251
353
|
#if defined(GGML_SYCL_GRAPH) && SYCL_EXT_ONEAPI_ASYNC_MEMORY_ALLOC
|
|
252
|
-
g_ggml_sycl_use_async_mem_op = !g_ggml_sycl_disable_graph;
|
|
354
|
+
g_ggml_sycl_use_async_mem_op = g_ggml_sycl_use_async_mem_op_requested || !g_ggml_sycl_disable_graph;
|
|
253
355
|
if (g_ggml_sycl_use_async_mem_op) {
|
|
254
356
|
for (unsigned int i = 0; i < dpct::dev_mgr::instance().device_count(); ++i) {
|
|
255
357
|
if (!dpct::dev_mgr::instance().get_device(i).has(sycl::aspect::ext_oneapi_async_memory_alloc)) {
|
|
@@ -333,7 +435,7 @@ struct ggml_backend_sycl_buffer_context {
|
|
|
333
435
|
~ggml_backend_sycl_buffer_context() {
|
|
334
436
|
if (dev_ptr != nullptr) {
|
|
335
437
|
ggml_sycl_set_device(device);
|
|
336
|
-
SYCL_CHECK(CHECK_TRY_ERROR(
|
|
438
|
+
SYCL_CHECK(CHECK_TRY_ERROR(ggml_sycl_free_device(dev_ptr, *stream)));
|
|
337
439
|
}
|
|
338
440
|
|
|
339
441
|
//release extra used by tensors
|
|
@@ -379,11 +481,22 @@ ggml_backend_sycl_buffer_init_tensor(ggml_backend_buffer_t buffer,
|
|
|
379
481
|
assert(tensor->view_src->buffer->buft == buffer->buft);
|
|
380
482
|
return GGML_STATUS_SUCCESS;
|
|
381
483
|
}
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
tensor->
|
|
386
|
-
|
|
484
|
+
|
|
485
|
+
if (!g_ggml_sycl_disable_optimize) {
|
|
486
|
+
// set reorder extra buffer based on supported type
|
|
487
|
+
switch (tensor->type) {
|
|
488
|
+
case GGML_TYPE_Q4_0:
|
|
489
|
+
case GGML_TYPE_Q8_0:
|
|
490
|
+
case GGML_TYPE_Q4_K:
|
|
491
|
+
case GGML_TYPE_Q6_K:{
|
|
492
|
+
ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu{};
|
|
493
|
+
tensor->extra = extra;
|
|
494
|
+
ctx->tensor_extras.push_back(extra);
|
|
495
|
+
break;
|
|
496
|
+
}
|
|
497
|
+
default:
|
|
498
|
+
break;
|
|
499
|
+
}
|
|
387
500
|
}
|
|
388
501
|
|
|
389
502
|
if (ggml_is_quantized(tensor->type)) {
|
|
@@ -455,8 +568,43 @@ catch (sycl::exception const &exc) {
|
|
|
455
568
|
std::exit(1);
|
|
456
569
|
}
|
|
457
570
|
|
|
571
|
+
#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
|
|
572
|
+
static bool ggml_sycl_is_l0_discrete_gpu(sycl::queue &q) {
|
|
573
|
+
if (!q.get_device().is_gpu() || q.get_backend() != sycl::backend::ext_oneapi_level_zero) {
|
|
574
|
+
return false;
|
|
575
|
+
}
|
|
576
|
+
|
|
577
|
+
ze_device_handle_t ze_dev = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(q.get_device());
|
|
578
|
+
ze_device_properties_t props = {};
|
|
579
|
+
props.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES;
|
|
580
|
+
ze_result_t r = zeDeviceGetProperties(ze_dev, &props);
|
|
581
|
+
return r == ZE_RESULT_SUCCESS && !(props.flags & ZE_DEVICE_PROPERTY_FLAG_INTEGRATED);
|
|
582
|
+
}
|
|
583
|
+
#endif
|
|
584
|
+
|
|
458
585
|
static void dev2dev_memcpy(sycl::queue &q_dst, sycl::queue &q_src, void *ptr_dst,
|
|
459
586
|
const void *ptr_src, size_t size) {
|
|
587
|
+
#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
|
|
588
|
+
// Use Level Zero direct copy for dGPU-to-dGPU transfers.
|
|
589
|
+
const bool l0_copy_supported =
|
|
590
|
+
ggml_sycl_is_l0_discrete_gpu(q_dst) && ggml_sycl_is_l0_discrete_gpu(q_src);
|
|
591
|
+
if (g_ggml_sycl_enable_level_zero && l0_copy_supported) {
|
|
592
|
+
auto ze_ctx = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(q_dst.get_context());
|
|
593
|
+
auto ze_dev = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(q_dst.get_device());
|
|
594
|
+
ze_command_queue_desc_t cq_desc = {ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC, nullptr, 0, 0,
|
|
595
|
+
0, ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS, ZE_COMMAND_QUEUE_PRIORITY_NORMAL};
|
|
596
|
+
ze_command_list_handle_t cl;
|
|
597
|
+
ze_result_t r = zeCommandListCreateImmediate(ze_ctx, ze_dev, &cq_desc, &cl);
|
|
598
|
+
if (r == ZE_RESULT_SUCCESS) {
|
|
599
|
+
r = zeCommandListAppendMemoryCopy(cl, ptr_dst, ptr_src, size, nullptr, 0, nullptr);
|
|
600
|
+
zeCommandListDestroy(cl);
|
|
601
|
+
if (r == ZE_RESULT_SUCCESS) {
|
|
602
|
+
return;
|
|
603
|
+
}
|
|
604
|
+
}
|
|
605
|
+
}
|
|
606
|
+
#endif
|
|
607
|
+
// Host-staged copy
|
|
460
608
|
char *host_buf = (char *)malloc(size);
|
|
461
609
|
q_src.memcpy(host_buf, (const char *)ptr_src, size).wait();
|
|
462
610
|
q_dst.memcpy((char *)ptr_dst, host_buf, size).wait();
|
|
@@ -537,9 +685,15 @@ static void ggml_backend_sycl_buffer_clear(ggml_backend_buffer_t buffer,
|
|
|
537
685
|
SYCL_CHECK(
|
|
538
686
|
CHECK_TRY_ERROR(dpct::get_current_device().queues_wait_and_throw()));
|
|
539
687
|
|
|
540
|
-
|
|
541
|
-
|
|
542
|
-
|
|
688
|
+
constexpr size_t MAX_CHUNK = 2ULL << 30; // 2 GiB
|
|
689
|
+
for (size_t off = 0; off < buffer->size; off += MAX_CHUNK) {
|
|
690
|
+
size_t chunk = std::min(buffer->size - off, MAX_CHUNK);
|
|
691
|
+
SYCL_CHECK(CHECK_TRY_ERROR(
|
|
692
|
+
(*stream)
|
|
693
|
+
.memset(static_cast<char*>(ctx->dev_ptr) + off, value, chunk)
|
|
694
|
+
.wait()
|
|
695
|
+
));
|
|
696
|
+
}
|
|
543
697
|
}
|
|
544
698
|
catch (sycl::exception const &exc) {
|
|
545
699
|
std::cerr << exc.what() << "Exception caught at file:" << __FILE__
|
|
@@ -589,6 +743,8 @@ static const ggml_backend_buffer_i ggml_backend_sycl_buffer_interface = {
|
|
|
589
743
|
/* .memset_tensor = */ ggml_backend_sycl_buffer_memset_tensor,
|
|
590
744
|
/* .set_tensor = */ ggml_backend_sycl_buffer_set_tensor,
|
|
591
745
|
/* .get_tensor = */ ggml_backend_sycl_buffer_get_tensor,
|
|
746
|
+
/* .set_tensor_2d = */ NULL,
|
|
747
|
+
/* .get_tensor_2d = */ NULL,
|
|
592
748
|
/* .cpy_tensor = */ ggml_backend_sycl_buffer_cpy_tensor,
|
|
593
749
|
/* .clear = */ ggml_backend_sycl_buffer_clear,
|
|
594
750
|
/* .reset = */ ggml_backend_sycl_buffer_reset,
|
|
@@ -618,8 +774,7 @@ ggml_backend_sycl_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
|
|
|
618
774
|
size = std::max(size, (size_t)1); // syclMalloc returns null for size 0
|
|
619
775
|
|
|
620
776
|
void * dev_ptr;
|
|
621
|
-
SYCL_CHECK(CHECK_TRY_ERROR(dev_ptr = (void *)
|
|
622
|
-
size, *stream)));
|
|
777
|
+
SYCL_CHECK(CHECK_TRY_ERROR(dev_ptr = (void *)ggml_sycl_malloc_device(size, *stream)));
|
|
623
778
|
if (!dev_ptr) {
|
|
624
779
|
GGML_LOG_ERROR("%s: can't allocate %lu Bytes of memory on device\n", __func__, size);
|
|
625
780
|
return nullptr;
|
|
@@ -634,7 +789,7 @@ catch (sycl::exception const &exc) {
|
|
|
634
789
|
}
|
|
635
790
|
|
|
636
791
|
static size_t ggml_backend_sycl_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
|
|
637
|
-
return
|
|
792
|
+
return SYCL_BUFFER_ALIGNMENT;
|
|
638
793
|
GGML_UNUSED(buft);
|
|
639
794
|
}
|
|
640
795
|
|
|
@@ -860,18 +1015,10 @@ ggml_backend_sycl_split_buffer_init_tensor(ggml_backend_buffer_t buffer,
|
|
|
860
1015
|
size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
|
|
861
1016
|
}
|
|
862
1017
|
|
|
863
|
-
// FIXME: do not crash if SYCL Buffer alloc fails
|
|
864
|
-
// currently, init_tensor cannot fail, it needs to be fixed in ggml-backend first
|
|
865
1018
|
ggml_sycl_set_device(i);
|
|
866
1019
|
const queue_ptr stream = ctx->streams[i];
|
|
867
1020
|
char * buf;
|
|
868
|
-
|
|
869
|
-
DPCT1009:208: SYCL uses exceptions to report errors and does not use the
|
|
870
|
-
error codes. The original code was commented out and a warning string
|
|
871
|
-
was inserted. You need to rewrite this code.
|
|
872
|
-
*/
|
|
873
|
-
SYCL_CHECK(CHECK_TRY_ERROR(buf = (char *)sycl::malloc_device(
|
|
874
|
-
size, *stream)));
|
|
1021
|
+
SYCL_CHECK(CHECK_TRY_ERROR(buf = (char *)ggml_sycl_malloc_device(size, *stream)));
|
|
875
1022
|
if (!buf) {
|
|
876
1023
|
char err_buf[1024];
|
|
877
1024
|
snprintf(err_buf, 1023, "%s: can't allocate %lu Bytes of memory on device\n", __func__, size);
|
|
@@ -1035,6 +1182,8 @@ static struct ggml_backend_buffer_i ggml_backend_sycl_split_buffer_interface = {
|
|
|
1035
1182
|
/* .memset_tensor = */ NULL,
|
|
1036
1183
|
/* .set_tensor = */ ggml_backend_sycl_split_buffer_set_tensor,
|
|
1037
1184
|
/* .get_tensor = */ ggml_backend_sycl_split_buffer_get_tensor,
|
|
1185
|
+
/* .set_tensor_2d = */ NULL,
|
|
1186
|
+
/* .get_tensor_2d = */ NULL,
|
|
1038
1187
|
/* .cpy_tensor = */ NULL,
|
|
1039
1188
|
/* .clear = */ ggml_backend_sycl_split_buffer_clear,
|
|
1040
1189
|
/* .reset = */ NULL,
|
|
@@ -1063,7 +1212,7 @@ static ggml_backend_buffer_t ggml_backend_sycl_split_buffer_type_alloc_buffer(gg
|
|
|
1063
1212
|
}
|
|
1064
1213
|
|
|
1065
1214
|
static size_t ggml_backend_sycl_split_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
|
|
1066
|
-
return
|
|
1215
|
+
return SYCL_BUFFER_ALIGNMENT;
|
|
1067
1216
|
GGML_UNUSED(buft);
|
|
1068
1217
|
}
|
|
1069
1218
|
|
|
@@ -1157,13 +1306,28 @@ static const char * ggml_backend_sycl_host_buffer_type_name(ggml_backend_buffer_
|
|
|
1157
1306
|
GGML_UNUSED(buft);
|
|
1158
1307
|
}
|
|
1159
1308
|
|
|
1309
|
+
inline void * aligned_malloc_host(size_t alignment, size_t size) {
|
|
1310
|
+
#ifdef _WIN32
|
|
1311
|
+
return _aligned_malloc(size, alignment);
|
|
1312
|
+
#else
|
|
1313
|
+
return aligned_alloc(alignment, size);
|
|
1314
|
+
#endif
|
|
1315
|
+
}
|
|
1316
|
+
|
|
1317
|
+
inline void free_aligned_mem_host(void * memblock) {
|
|
1318
|
+
#ifdef _WIN32
|
|
1319
|
+
_aligned_free(memblock);
|
|
1320
|
+
#else
|
|
1321
|
+
free(memblock);
|
|
1322
|
+
#endif
|
|
1323
|
+
}
|
|
1324
|
+
|
|
1160
1325
|
static void ggml_backend_sycl_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
|
1161
|
-
|
|
1326
|
+
free_aligned_mem_host((void *)buffer->context);
|
|
1162
1327
|
}
|
|
1163
1328
|
|
|
1164
1329
|
static ggml_backend_buffer_t ggml_backend_sycl_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
|
1165
|
-
void * ptr =
|
|
1166
|
-
|
|
1330
|
+
void * ptr = aligned_malloc_host(TENSOR_ALIGNMENT, size);
|
|
1167
1331
|
if (ptr == nullptr) {
|
|
1168
1332
|
// fallback to cpu buffer
|
|
1169
1333
|
return ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size);
|
|
@@ -1212,16 +1376,53 @@ struct ggml_sycl_pool_leg : public ggml_sycl_pool {
|
|
|
1212
1376
|
explicit ggml_sycl_pool_leg(queue_ptr qptr_, int device_) : device(device_), qptr(qptr_) {}
|
|
1213
1377
|
|
|
1214
1378
|
~ggml_sycl_pool_leg() {
|
|
1379
|
+
#ifdef DEBUG_SYCL_POOL
|
|
1380
|
+
int n_cached = 0;
|
|
1381
|
+
size_t bytes_cached = 0;
|
|
1382
|
+
for (int i = 0; i < MAX_SYCL_BUFFERS; ++i) {
|
|
1383
|
+
if (buffer_pool[i].ptr != nullptr) {
|
|
1384
|
+
++n_cached;
|
|
1385
|
+
bytes_cached += buffer_pool[i].size;
|
|
1386
|
+
}
|
|
1387
|
+
}
|
|
1388
|
+
GGML_LOG_INFO("%s: %d buffers, cached = %.2f MiB\n", __func__,
|
|
1389
|
+
n_cached, bytes_cached / 1024.0 / 1024.0);
|
|
1390
|
+
const auto slots = format_slots_in_alloc_order();
|
|
1391
|
+
if (!slots.empty()) {
|
|
1392
|
+
GGML_LOG_INFO("%s: slots MiB: %s\n", __func__, slots.c_str());
|
|
1393
|
+
}
|
|
1394
|
+
#endif
|
|
1395
|
+
|
|
1215
1396
|
for (int i = 0; i < MAX_SYCL_BUFFERS; ++i) {
|
|
1216
1397
|
ggml_sycl_buffer & b = buffer_pool[i];
|
|
1217
1398
|
if (b.ptr != nullptr) {
|
|
1218
|
-
SYCL_CHECK(CHECK_TRY_ERROR(
|
|
1399
|
+
SYCL_CHECK(CHECK_TRY_ERROR(ggml_sycl_free_device(b.ptr, *qptr)));
|
|
1219
1400
|
pool_size -= b.size;
|
|
1220
1401
|
}
|
|
1221
1402
|
}
|
|
1222
1403
|
GGML_ASSERT(pool_size == 0);
|
|
1223
1404
|
}
|
|
1224
1405
|
|
|
1406
|
+
#ifdef DEBUG_SYCL_POOL
|
|
1407
|
+
std::string format_slots_in_alloc_order() const {
|
|
1408
|
+
std::string line;
|
|
1409
|
+
char buf[32];
|
|
1410
|
+
bool first = true;
|
|
1411
|
+
for (int i = 0; i < MAX_SYCL_BUFFERS; ++i) {
|
|
1412
|
+
if (buffer_pool[i].ptr == nullptr) {
|
|
1413
|
+
continue;
|
|
1414
|
+
}
|
|
1415
|
+
if (!first) {
|
|
1416
|
+
line += '/';
|
|
1417
|
+
}
|
|
1418
|
+
first = false;
|
|
1419
|
+
snprintf(buf, sizeof(buf), "%.2f", buffer_pool[i].size / 1024.0 / 1024.0);
|
|
1420
|
+
line += buf;
|
|
1421
|
+
}
|
|
1422
|
+
return line;
|
|
1423
|
+
}
|
|
1424
|
+
#endif
|
|
1425
|
+
|
|
1225
1426
|
void * alloc(size_t size, size_t * actual_size) override {
|
|
1226
1427
|
#ifdef DEBUG_sycl_MALLOC
|
|
1227
1428
|
int nnz = 0;
|
|
@@ -1263,9 +1464,7 @@ struct ggml_sycl_pool_leg : public ggml_sycl_pool {
|
|
|
1263
1464
|
void * ptr;
|
|
1264
1465
|
size_t look_ahead_size = (size_t) (1.05 * size);
|
|
1265
1466
|
|
|
1266
|
-
SYCL_CHECK(
|
|
1267
|
-
CHECK_TRY_ERROR(ptr = (void *)sycl::malloc_device(
|
|
1268
|
-
look_ahead_size, *qptr)));
|
|
1467
|
+
SYCL_CHECK(CHECK_TRY_ERROR(ptr = (void *)ggml_sycl_malloc_device(look_ahead_size, *qptr)));
|
|
1269
1468
|
if (!ptr) {
|
|
1270
1469
|
GGML_LOG_ERROR("%s: can't allocate %lu Bytes of memory on device/GPU\n", __func__, look_ahead_size);
|
|
1271
1470
|
return nullptr;
|
|
@@ -1293,11 +1492,126 @@ struct ggml_sycl_pool_leg : public ggml_sycl_pool {
|
|
|
1293
1492
|
}
|
|
1294
1493
|
}
|
|
1295
1494
|
GGML_LOG_WARN("WARNING: sycl buffer pool full, increase MAX_sycl_BUFFERS\n");
|
|
1296
|
-
SYCL_CHECK(CHECK_TRY_ERROR(
|
|
1495
|
+
SYCL_CHECK(CHECK_TRY_ERROR(ggml_sycl_free_device(ptr, *qptr)));
|
|
1297
1496
|
pool_size -= size;
|
|
1298
1497
|
}
|
|
1299
1498
|
};
|
|
1300
1499
|
|
|
1500
|
+
// pool with virtual memory management
|
|
1501
|
+
#if defined(GGML_SYCL_USE_VMM)
|
|
1502
|
+
struct ggml_sycl_pool_vmm : public ggml_sycl_pool {
|
|
1503
|
+
static const size_t SYCL_POOL_VMM_MAX_SIZE = 1ull << 35; // 32 GB
|
|
1504
|
+
|
|
1505
|
+
int device;
|
|
1506
|
+
sycl::context ctx;
|
|
1507
|
+
sycl::device dev;
|
|
1508
|
+
|
|
1509
|
+
uintptr_t pool_addr = 0;
|
|
1510
|
+
size_t pool_used = 0;
|
|
1511
|
+
size_t pool_size = 0;
|
|
1512
|
+
size_t granularity;
|
|
1513
|
+
|
|
1514
|
+
// physical_mem owns the commits (unlike cuMemMap)
|
|
1515
|
+
struct mapping {
|
|
1516
|
+
sycl::ext::oneapi::experimental::physical_mem phys;
|
|
1517
|
+
void * map_ptr;
|
|
1518
|
+
};
|
|
1519
|
+
std::vector<mapping> mappings;
|
|
1520
|
+
|
|
1521
|
+
explicit ggml_sycl_pool_vmm(queue_ptr qptr_, int device_) :
|
|
1522
|
+
device(device_),
|
|
1523
|
+
ctx(qptr_->get_context()),
|
|
1524
|
+
dev(qptr_->get_device()),
|
|
1525
|
+
granularity(ggml_sycl_info().devices[device_].vmm_granularity) {
|
|
1526
|
+
}
|
|
1527
|
+
|
|
1528
|
+
~ggml_sycl_pool_vmm() {
|
|
1529
|
+
if (pool_addr == 0) {
|
|
1530
|
+
return;
|
|
1531
|
+
}
|
|
1532
|
+
|
|
1533
|
+
// Per spec, unmap must (a) match the exact (ptr, size) of an earlier
|
|
1534
|
+
// physical_mem::map() call and (b) precede destruction of the
|
|
1535
|
+
// physical_mem objects (their dtors won't unmap).
|
|
1536
|
+
for (auto & m : mappings) {
|
|
1537
|
+
SYCL_CHECK(CHECK_TRY_ERROR(sycl::ext::oneapi::experimental::unmap(
|
|
1538
|
+
m.map_ptr, m.phys.size(), ctx)));
|
|
1539
|
+
}
|
|
1540
|
+
SYCL_CHECK(CHECK_TRY_ERROR(sycl::ext::oneapi::experimental::free_virtual_mem(
|
|
1541
|
+
pool_addr, SYCL_POOL_VMM_MAX_SIZE, ctx)));
|
|
1542
|
+
}
|
|
1543
|
+
|
|
1544
|
+
void * alloc(size_t size, size_t * actual_size) override {
|
|
1545
|
+
// round up the allocation size to the alignment to ensure that all allocations are aligned for all data types
|
|
1546
|
+
size = GGML_PAD(size, SYCL_BUFFER_ALIGNMENT);
|
|
1547
|
+
|
|
1548
|
+
size_t avail = pool_size - pool_used;
|
|
1549
|
+
|
|
1550
|
+
if (size > avail) {
|
|
1551
|
+
// round up to the next multiple of the granularity
|
|
1552
|
+
size_t reserve_size = GGML_PAD(size - avail, granularity);
|
|
1553
|
+
|
|
1554
|
+
GGML_ASSERT(pool_size + reserve_size <= SYCL_POOL_VMM_MAX_SIZE);
|
|
1555
|
+
|
|
1556
|
+
// allocate more physical memory
|
|
1557
|
+
std::optional<sycl::ext::oneapi::experimental::physical_mem> phys;
|
|
1558
|
+
SYCL_CHECK(CHECK_TRY_ERROR(phys.emplace(dev, ctx, reserve_size)));
|
|
1559
|
+
|
|
1560
|
+
// reserve virtual address space (if not already reserved)
|
|
1561
|
+
if (pool_addr == 0) {
|
|
1562
|
+
SYCL_CHECK(CHECK_TRY_ERROR(
|
|
1563
|
+
pool_addr = sycl::ext::oneapi::experimental::reserve_virtual_mem(
|
|
1564
|
+
SYCL_POOL_VMM_MAX_SIZE, ctx)));
|
|
1565
|
+
}
|
|
1566
|
+
|
|
1567
|
+
// map at the end of the pool
|
|
1568
|
+
void * map_ptr = nullptr;
|
|
1569
|
+
SYCL_CHECK(CHECK_TRY_ERROR(
|
|
1570
|
+
map_ptr = phys->map(pool_addr + pool_size, reserve_size,
|
|
1571
|
+
sycl::ext::oneapi::experimental::address_access_mode::read_write)));
|
|
1572
|
+
|
|
1573
|
+
// stash these so we could unmap this exact range in dtor
|
|
1574
|
+
mappings.push_back({
|
|
1575
|
+
std::move(*phys),
|
|
1576
|
+
map_ptr,
|
|
1577
|
+
});
|
|
1578
|
+
|
|
1579
|
+
// add to the pool
|
|
1580
|
+
pool_size += reserve_size;
|
|
1581
|
+
|
|
1582
|
+
#ifdef DEBUG_SYCL_MALLOC
|
|
1583
|
+
GGML_LOG_INFO("sycl pool[%d]: size increased to %llu MB (reserved %llu MB)\n",
|
|
1584
|
+
device, (unsigned long long) (pool_size/1024/1024),
|
|
1585
|
+
(unsigned long long) (reserve_size/1024/1024));
|
|
1586
|
+
#endif
|
|
1587
|
+
}
|
|
1588
|
+
|
|
1589
|
+
GGML_ASSERT(pool_addr != 0);
|
|
1590
|
+
|
|
1591
|
+
void * ptr = reinterpret_cast<void *>(pool_addr + pool_used);
|
|
1592
|
+
*actual_size = size;
|
|
1593
|
+
pool_used += size;
|
|
1594
|
+
|
|
1595
|
+
#ifdef DEBUG_SYCL_MALLOC
|
|
1596
|
+
GGML_LOG_INFO("sycl pool[%d]: allocated %llu bytes at %p\n", device, (unsigned long long) size, ptr);
|
|
1597
|
+
#endif
|
|
1598
|
+
|
|
1599
|
+
return ptr;
|
|
1600
|
+
}
|
|
1601
|
+
|
|
1602
|
+
// Returns `size` bytes to the pool by lowering `pool_used`.
//
// The pool behaves like a stack: after the decrement, `ptr` must be exactly
// the new top of the pool, otherwise the assertion below fires.
//
//   ptr   pointer previously returned by alloc()
//   size  the size that alloc() reported through `actual_size`
void free(void * ptr, size_t size) override {
#ifdef DEBUG_SYCL_MALLOC
    GGML_LOG_INFO("sycl pool[%d]: freed %llu bytes at %p\n", device, (unsigned long long) size, ptr);
#endif

    pool_used -= size;

    // all deallocations must be in reverse order of the allocations
    GGML_ASSERT(ptr == reinterpret_cast<void *>(pool_addr + pool_used));
}
|
|
1612
|
+
};
|
|
1613
|
+
#endif // defined(GGML_SYCL_USE_VMM)
|
|
1614
|
+
|
|
1301
1615
|
struct ggml_sycl_pool_host : public ggml_sycl_pool {
|
|
1302
1616
|
queue_ptr qptr;
|
|
1303
1617
|
int device;
|
|
@@ -1378,15 +1692,18 @@ std::unique_ptr<ggml_sycl_pool> ggml_backend_sycl_context::new_pool_for_host(que
|
|
|
1378
1692
|
}
|
|
1379
1693
|
|
|
1380
1694
|
// Creates the memory pool used for device allocations on `device`.
//
// When the build defines GGML_SYCL_USE_VMM, the runtime switch
// g_ggml_sycl_enable_vmm is set and the device reports `vmm`, a
// ggml_sycl_pool_vmm is returned; in every other case a ggml_sycl_pool_leg is
// returned.
std::unique_ptr<ggml_sycl_pool> ggml_backend_sycl_context::new_pool_for_device(queue_ptr qptr, int device) {
#if defined(GGML_SYCL_USE_VMM)
    const bool use_vmm_pool = g_ggml_sycl_enable_vmm && ggml_sycl_info().devices[device].vmm;
    if (use_vmm_pool) {
        return std::make_unique<ggml_sycl_pool_vmm>(qptr, device);
    }
#endif // defined(GGML_SYCL_USE_VMM)
    return std::make_unique<ggml_sycl_pool_leg>(qptr, device);
}
|
|
1387
1702
|
|
|
1388
|
-
|
|
1389
|
-
|
|
1703
|
+
|
|
1704
|
+
// Creates a ggml_sycl_fattn_kv_buffers object for the given queue and device
// and hands ownership to the caller.
std::unique_ptr<ggml_sycl_fattn_kv_buffers> ggml_backend_sycl_context::new_fattn_kv_buffers(queue_ptr qptr, int device) {
    return std::make_unique<ggml_sycl_fattn_kv_buffers>(qptr, device);
}
|
|
1390
1707
|
|
|
1391
1708
|
/// kernels
|
|
1392
1709
|
typedef void (*ggml_sycl_op_mul_mat_t)(
|
|
@@ -1825,6 +2142,110 @@ static void argsort_f32_i32_sycl(const float *x, int *dst, const int ncols,
|
|
|
1825
2142
|
}
|
|
1826
2143
|
}
|
|
1827
2144
|
|
|
2145
|
+
static void top_k_f32_sycl(
|
|
2146
|
+
const float * src,
|
|
2147
|
+
int32_t * dst_indices,
|
|
2148
|
+
const int64_t ncols,
|
|
2149
|
+
const int64_t nrows,
|
|
2150
|
+
const int k,
|
|
2151
|
+
dpct::queue_ptr main_stream
|
|
2152
|
+
) {
|
|
2153
|
+
const int block_size = 128;
|
|
2154
|
+
|
|
2155
|
+
const sycl::range<1> block_dims(block_size);
|
|
2156
|
+
const sycl::range<1> grid_dims(nrows);
|
|
2157
|
+
|
|
2158
|
+
main_stream->submit([&](sycl::handler &cgh) {
|
|
2159
|
+
sycl::local_accessor<float, 1> shared_vals(sycl::range<1>(block_size * k), cgh);
|
|
2160
|
+
sycl::local_accessor<int, 1> shared_idx(sycl::range<1>(block_size * k), cgh);
|
|
2161
|
+
|
|
2162
|
+
cgh.parallel_for(
|
|
2163
|
+
sycl::nd_range<1>(grid_dims * block_dims, block_dims),
|
|
2164
|
+
[=](sycl::nd_item<1> item_ct1) {
|
|
2165
|
+
const int row = item_ct1.get_group(0);
|
|
2166
|
+
const int tid = item_ct1.get_local_id(0);
|
|
2167
|
+
|
|
2168
|
+
if (row >= nrows) return;
|
|
2169
|
+
|
|
2170
|
+
const float * src_row = src + row * ncols;
|
|
2171
|
+
int32_t * dst_idx_row = dst_indices + row * k;
|
|
2172
|
+
|
|
2173
|
+
float local_vals[32];
|
|
2174
|
+
int local_idx[32];
|
|
2175
|
+
|
|
2176
|
+
for (int i = 0; i < k; i++) {
|
|
2177
|
+
local_vals[i] = -FLT_MAX;
|
|
2178
|
+
local_idx[i] = -1;
|
|
2179
|
+
}
|
|
2180
|
+
|
|
2181
|
+
for (int col = tid; col < ncols; col += block_size) {
|
|
2182
|
+
float val = src_row[col];
|
|
2183
|
+
|
|
2184
|
+
if (val > local_vals[k-1]) {
|
|
2185
|
+
int pos = k - 1;
|
|
2186
|
+
while (pos > 0 && val > local_vals[pos - 1]) {
|
|
2187
|
+
pos--;
|
|
2188
|
+
}
|
|
2189
|
+
|
|
2190
|
+
for (int i = k - 1; i > pos; i--) {
|
|
2191
|
+
local_vals[i] = local_vals[i - 1];
|
|
2192
|
+
local_idx[i] = local_idx[i - 1];
|
|
2193
|
+
}
|
|
2194
|
+
local_vals[pos] = val;
|
|
2195
|
+
local_idx[pos] = col;
|
|
2196
|
+
}
|
|
2197
|
+
}
|
|
2198
|
+
|
|
2199
|
+
for (int i = 0; i < k; i++) {
|
|
2200
|
+
shared_vals[tid * k + i] = local_vals[i];
|
|
2201
|
+
shared_idx[tid * k + i] = local_idx[i];
|
|
2202
|
+
}
|
|
2203
|
+
item_ct1.barrier(sycl::access::fence_space::local_space);
|
|
2204
|
+
|
|
2205
|
+
if (tid == 0) {
|
|
2206
|
+
float final_vals[32];
|
|
2207
|
+
int final_idx[32];
|
|
2208
|
+
|
|
2209
|
+
for (int i = 0; i < k; i++) {
|
|
2210
|
+
final_vals[i] = -FLT_MAX;
|
|
2211
|
+
final_idx[i] = -1;
|
|
2212
|
+
}
|
|
2213
|
+
|
|
2214
|
+
for (int t = 0; t < block_size; t++) {
|
|
2215
|
+
for (int i = 0; i < k; i++) {
|
|
2216
|
+
float val = shared_vals[t * k + i];
|
|
2217
|
+
int idx = shared_idx[t * k + i];
|
|
2218
|
+
|
|
2219
|
+
if (val > final_vals[k-1]) {
|
|
2220
|
+
int pos = k - 1;
|
|
2221
|
+
while (pos > 0 && val > final_vals[pos - 1]) {
|
|
2222
|
+
pos--;
|
|
2223
|
+
}
|
|
2224
|
+
|
|
2225
|
+
for (int j = k - 1; j > pos; j--) {
|
|
2226
|
+
final_vals[j] = final_vals[j - 1];
|
|
2227
|
+
final_idx[j] = final_idx[j - 1];
|
|
2228
|
+
}
|
|
2229
|
+
final_vals[pos] = val;
|
|
2230
|
+
final_idx[pos] = idx;
|
|
2231
|
+
}
|
|
2232
|
+
}
|
|
2233
|
+
}
|
|
2234
|
+
|
|
2235
|
+
for (int i = 0; i < k; i++) {
|
|
2236
|
+
dst_idx_row[i] = final_idx[i];
|
|
2237
|
+
}
|
|
2238
|
+
|
|
2239
|
+
if (k > 1) {
|
|
2240
|
+
int32_t temp = dst_idx_row[0];
|
|
2241
|
+
dst_idx_row[0] = dst_idx_row[1];
|
|
2242
|
+
dst_idx_row[1] = temp;
|
|
2243
|
+
}
|
|
2244
|
+
}
|
|
2245
|
+
});
|
|
2246
|
+
});
|
|
2247
|
+
}
|
|
2248
|
+
|
|
1828
2249
|
static void argmax_f32_i32_sycl(const float *x, int *dst, const int ncols,
|
|
1829
2250
|
const int nrows, queue_ptr stream) {
|
|
1830
2251
|
const sycl::range<3> block_dims(1, 1, SYCL_ARGMAX_BLOCK_SIZE);
|
|
@@ -2004,6 +2425,31 @@ inline void ggml_sycl_op_mul_mat_sycl(
|
|
|
2004
2425
|
#else
|
|
2005
2426
|
bool use_fp16 = false;
|
|
2006
2427
|
#endif
|
|
2428
|
+
|
|
2429
|
+
#if GGML_SYCL_DNNL && defined(GGML_SYCL_HAS_BF16)
|
|
2430
|
+
// Fast path for bf16 src0
|
|
2431
|
+
if (src0->type == GGML_TYPE_BF16 && !g_ggml_sycl_disable_dnn && ggml_is_contiguous(src0) &&
|
|
2432
|
+
row_diff == src0->ne[1]) {
|
|
2433
|
+
using bf16_t = sycl::ext::oneapi::bfloat16;
|
|
2434
|
+
ggml_sycl_pool_alloc<bf16_t> src1_as_bf16(ctx.pool(), src1_ncols*ne10);
|
|
2435
|
+
if (src1->type != GGML_TYPE_BF16) {
|
|
2436
|
+
const to_bf16_sycl_t to_bf16_sycl = ggml_get_to_bf16_sycl(src1->type, dst);
|
|
2437
|
+
GGML_ASSERT(to_bf16_sycl != nullptr);
|
|
2438
|
+
to_bf16_sycl(src1_ddf_i, src1_as_bf16.get(), src1_ncols*ne10, stream);
|
|
2439
|
+
} else {
|
|
2440
|
+
stream->memcpy(src1_as_bf16.get(), src1_ddf_i, src1_ncols*ne10*sizeof(bf16_t));
|
|
2441
|
+
}
|
|
2442
|
+
DnnlGemmWrapper::row_gemm(ctx, row_diff, src1_ncols, ne10,
|
|
2443
|
+
src0_dd_i, DnnlGemmWrapper::to_dt<bf16_t>(),
|
|
2444
|
+
src1_as_bf16.get(), DnnlGemmWrapper::to_dt<bf16_t>(),
|
|
2445
|
+
dst_dd_i, DnnlGemmWrapper::to_dt<float>(), stream);
|
|
2446
|
+
GGML_UNUSED(dst);
|
|
2447
|
+
GGML_UNUSED(src1_ddq_i);
|
|
2448
|
+
GGML_UNUSED(src1_padded_row_size);
|
|
2449
|
+
return;
|
|
2450
|
+
}
|
|
2451
|
+
#endif
|
|
2452
|
+
|
|
2007
2453
|
if ((src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && use_fp16 && ggml_is_contiguous(src0) &&
|
|
2008
2454
|
row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT) {
|
|
2009
2455
|
ggml_sycl_pool_alloc<sycl::half> src0_as_f16(ctx.pool());
|
|
@@ -2048,8 +2494,8 @@ inline void ggml_sycl_op_mul_mat_sycl(
|
|
|
2048
2494
|
const sycl::half alpha_f16 = 1.0f;
|
|
2049
2495
|
const sycl::half beta_f16 = 0.0f;
|
|
2050
2496
|
SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm(
|
|
2051
|
-
*stream, oneapi::
|
|
2052
|
-
oneapi::
|
|
2497
|
+
*stream, oneapi::mkl::transpose::trans,
|
|
2498
|
+
oneapi::mkl::transpose::nontrans, row_diff, src1_ncols, ne10,
|
|
2053
2499
|
&alpha_f16, src0_ptr, dpct::library_data_t::real_half, ne00,
|
|
2054
2500
|
src1_ptr, dpct::library_data_t::real_half, ne10, &beta_f16,
|
|
2055
2501
|
dst_f16.get(), dpct::library_data_t::real_half, ldc,
|
|
@@ -2081,21 +2527,25 @@ inline void ggml_sycl_op_mul_mat_sycl(
|
|
|
2081
2527
|
const float * src0_ddf_i = src0->type == GGML_TYPE_F32 ? (const float *) src0_dd_i : src0_ddq_as_f32.get();
|
|
2082
2528
|
const float * src1_ddf1_i = src1->type == GGML_TYPE_F32 ? (const float *) src1_ddf_i : src1_ddq_as_f32.get();
|
|
2083
2529
|
|
|
2530
|
+
{
|
|
2531
|
+
const int64_t gemm_flops = (int64_t)row_diff * src1_ncols * ne10;
|
|
2532
|
+
const bool use_mkl_direct = gemm_flops < 256 * 256 * 256;
|
|
2084
2533
|
#if GGML_SYCL_DNNL
|
|
2085
|
-
|
|
2086
|
-
|
|
2087
|
-
|
|
2088
|
-
|
|
2089
|
-
|
|
2090
|
-
|
|
2534
|
+
if (!g_ggml_sycl_disable_dnn && !use_mkl_direct) {
|
|
2535
|
+
DnnlGemmWrapper::row_gemm(ctx, row_diff, src1_ncols, ne10, src0_ddf_i,
|
|
2536
|
+
DnnlGemmWrapper::to_dt<float>(), src1_ddf1_i, DnnlGemmWrapper::to_dt<float>(),
|
|
2537
|
+
dst_dd_i, DnnlGemmWrapper::to_dt<float>(), stream);
|
|
2538
|
+
}
|
|
2539
|
+
else
|
|
2091
2540
|
#endif
|
|
2092
|
-
|
|
2093
|
-
|
|
2094
|
-
|
|
2095
|
-
|
|
2096
|
-
|
|
2097
|
-
|
|
2098
|
-
|
|
2541
|
+
{
|
|
2542
|
+
const float alpha = 1.0f;
|
|
2543
|
+
const float beta = 0.0f;
|
|
2544
|
+
SYCL_CHECK(CHECK_TRY_ERROR(oneapi::mkl::blas::column_major::gemm(
|
|
2545
|
+
*stream, oneapi::mkl::transpose::trans, oneapi::mkl::transpose::nontrans, row_diff,
|
|
2546
|
+
src1_ncols, ne10, dpct::get_value(&alpha, *stream), src0_ddf_i, ne00, src1_ddf1_i, ne10,
|
|
2547
|
+
dpct::get_value(&beta, *stream), dst_dd_i, ldc)));
|
|
2548
|
+
}
|
|
2099
2549
|
}
|
|
2100
2550
|
}
|
|
2101
2551
|
GGML_UNUSED(dst);
|
|
@@ -2216,6 +2666,30 @@ inline void ggml_sycl_op_argsort(ggml_backend_sycl_context & ctx, ggml_tensor *
|
|
|
2216
2666
|
main_stream, ctx.device);
|
|
2217
2667
|
}
|
|
2218
2668
|
|
|
2669
|
+
// Backend entry point for the top-k op: validates the tensors and launches
// top_k_f32_sycl on the context's stream.
//
//   dst->src[0]  F32, contiguous; ne[0] is the row length
//   dst          I32, contiguous; ne[0] is k (1..32, not larger than the row
//                length) and it has as many rows as the source
static void ggml_sycl_op_top_k(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * src0 = dst->src[0];

    GGML_ASSERT(src0);
    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT(dst->type == GGML_TYPE_I32);
    GGML_ASSERT(ggml_is_contiguous(src0));
    // the kernel writes k indices per row at dst + row * k, i.e. it assumes a dense destination
    GGML_ASSERT(ggml_is_contiguous(dst));

    // keep k at full width until it has been range-checked, so that an
    // oversized ne[0] cannot be truncated into the accepted range
    const int64_t k     = dst->ne[0];
    const int64_t ncols = src0->ne[0];
    const int64_t nrows = ggml_nrows(src0);

    GGML_ASSERT(k > 0 && k <= 32);
    GGML_ASSERT(k <= ncols);
    // one output row per input row
    GGML_ASSERT(ggml_nrows(dst) == nrows);

    dpct::queue_ptr main_stream = ctx.stream();
    SYCL_CHECK(ggml_sycl_set_device(ctx.device));

    const float * src0_dd = static_cast<const float *>(src0->data);
    int32_t *     dst_dd  = static_cast<int32_t *>(dst->data);

    top_k_f32_sycl(src0_dd, dst_dd, ncols, nrows, (int) k, main_stream);
}
|
|
2692
|
+
|
|
2219
2693
|
inline void ggml_sycl_op_argmax(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
2220
2694
|
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
|
|
2221
2695
|
GGML_ASSERT( dst->type == GGML_TYPE_I32);
|
|
@@ -2248,6 +2722,65 @@ inline void ggml_sycl_op_diag_mask_inf(ggml_backend_sycl_context & ctx, ggml_ten
|
|
|
2248
2722
|
diag_mask_inf_f32_sycl(src0_dd, dst_dd, ne00, nrows0, ne01, n_past, main_stream);
|
|
2249
2723
|
}
|
|
2250
2724
|
|
|
2725
|
+
static void tri_f32_sycl(
|
|
2726
|
+
const float * src,
|
|
2727
|
+
float * dst,
|
|
2728
|
+
const int64_t ne0,
|
|
2729
|
+
const int64_t ne1,
|
|
2730
|
+
const int64_t ne2,
|
|
2731
|
+
const int64_t ne3,
|
|
2732
|
+
const ggml_tri_type ttype,
|
|
2733
|
+
dpct::queue_ptr main_stream
|
|
2734
|
+
) {
|
|
2735
|
+
const size_t total = (size_t) ne0 * (size_t) ne1 * (size_t) ne2 * (size_t) ne3;
|
|
2736
|
+
|
|
2737
|
+
main_stream->parallel_for(sycl::range<1>(total), [=](sycl::id<1> tid) {
|
|
2738
|
+
const int64_t idx = (int64_t) tid[0];
|
|
2739
|
+
|
|
2740
|
+
const int64_t i0 = idx % ne0;
|
|
2741
|
+
const int64_t t1 = idx / ne0;
|
|
2742
|
+
const int64_t i1 = t1 % ne1;
|
|
2743
|
+
|
|
2744
|
+
bool keep = false;
|
|
2745
|
+
switch (ttype) {
|
|
2746
|
+
case GGML_TRI_TYPE_LOWER: keep = (i0 < i1); break;
|
|
2747
|
+
case GGML_TRI_TYPE_LOWER_DIAG: keep = (i0 <= i1); break;
|
|
2748
|
+
case GGML_TRI_TYPE_UPPER: keep = (i0 > i1); break;
|
|
2749
|
+
case GGML_TRI_TYPE_UPPER_DIAG: keep = (i0 >= i1); break;
|
|
2750
|
+
default: keep = false; break;
|
|
2751
|
+
}
|
|
2752
|
+
|
|
2753
|
+
dst[idx] = keep ? src[idx] : 0.0f;
|
|
2754
|
+
});
|
|
2755
|
+
}
|
|
2756
|
+
|
|
2757
|
+
// Backend entry point for the tri op: checks that source and destination are
// contiguous F32 tensors of the same shape, reads the triangle type from op
// parameter 0 and launches tri_f32_sycl on the context's stream.
static void ggml_sycl_op_tri(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * src0 = dst->src[0];
    GGML_ASSERT(src0);

    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT(dst->type == GGML_TYPE_F32);
    GGML_ASSERT(ggml_is_contiguous(src0));
    GGML_ASSERT(ggml_is_contiguous(dst));
    GGML_ASSERT(ggml_are_same_shape(src0, dst));

    dpct::queue_ptr main_stream = ctx.stream();
    SYCL_CHECK(ggml_sycl_set_device(ctx.device));

    // which part of each matrix is kept
    const ggml_tri_type ttype = (ggml_tri_type) ggml_get_op_params_i32(dst, 0);

    tri_f32_sycl(static_cast<const float *>(src0->data),
                 static_cast<float *>(dst->data),
                 src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
                 ttype, main_stream);
}
|
|
2782
|
+
|
|
2783
|
+
|
|
2251
2784
|
inline void ggml_sycl_op_scale(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
2252
2785
|
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
|
|
2253
2786
|
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
|
@@ -2810,7 +3343,7 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx, cons
|
|
|
2810
3343
|
|
|
2811
3344
|
}
|
|
2812
3345
|
#if GGML_SYCL_DNNL
|
|
2813
|
-
// oneDNN handles strided data and does not need overhead of
|
|
3346
|
+
// oneDNN handles strided data and does not need overhead of ggml_get_to_fp16_nc_sycl
|
|
2814
3347
|
const int64_t ne_src1 = src1->nb[last_str] * src1->ne[last_dim] / type_size_src1;
|
|
2815
3348
|
src1_f16_alloc.alloc(ne_src1);
|
|
2816
3349
|
const to_fp16_sycl_t to_fp16_sycl = ggml_get_to_fp16_sycl(src1->type, dst);
|
|
@@ -2819,7 +3352,7 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx, cons
|
|
|
2819
3352
|
# else
|
|
2820
3353
|
const int64_t ne_src1 = ggml_nelements(src1);
|
|
2821
3354
|
src1_f16_alloc.alloc(ne_src1);
|
|
2822
|
-
const to_fp16_nc_sycl_t to_fp16_nc_sycl =
|
|
3355
|
+
const to_fp16_nc_sycl_t to_fp16_nc_sycl = ggml_get_to_fp16_nc_sycl(src1->type);
|
|
2823
3356
|
GGML_ASSERT(to_fp16_nc_sycl != nullptr);
|
|
2824
3357
|
to_fp16_nc_sycl(src1_f16, src1_f16_alloc.get(), ne10, ne11, ne12, ne13, s11, s12, s13, queue);
|
|
2825
3358
|
#endif
|
|
@@ -2963,8 +3496,8 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx, cons
|
|
|
2963
3496
|
const int64_t smb = ne12 == 1 ? s13 : s12;
|
|
2964
3497
|
|
|
2965
3498
|
// there is no broadcast and src0, src1 are contiguous across dims 2, 3
|
|
2966
|
-
SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch(*queue, oneapi::
|
|
2967
|
-
oneapi::
|
|
3499
|
+
SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch(*queue, oneapi::mkl::transpose::trans,
|
|
3500
|
+
oneapi::mkl::transpose::nontrans, ne01, ne11, ne10, alpha,
|
|
2968
3501
|
src0_f16, dpct::library_data_t::real_half, nb01 / nb00, sma,
|
|
2969
3502
|
src1_f16, dpct::library_data_t::real_half, s11, smb, beta, dst_ddf,
|
|
2970
3503
|
mkl_data_type, ne0, ne1 * ne0, ne12 * ne13, mkl_compute_type)));
|
|
@@ -2988,7 +3521,7 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx, cons
|
|
|
2988
3521
|
});
|
|
2989
3522
|
|
|
2990
3523
|
SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch(
|
|
2991
|
-
*queue, oneapi::
|
|
3524
|
+
*queue, oneapi::mkl::transpose::trans, oneapi::mkl::transpose::nontrans, ne01, ne11, ne10, alpha,
|
|
2992
3525
|
(const void **) (ptrs_src.get() + 0 * ne23), dpct::library_data_t::real_half, nb01 / nb00,
|
|
2993
3526
|
(const void **) (ptrs_src.get() + 1 * ne23), dpct::library_data_t::real_half, s11, beta,
|
|
2994
3527
|
(void **) (ptrs_dst.get() + 0 * ne23), mkl_data_type, ne0, ne23, mkl_compute_type, matrix_info.get())));
|
|
@@ -3014,8 +3547,11 @@ inline bool ggml_sycl_supports_mmq(enum ggml_type type) {
|
|
|
3014
3547
|
inline bool ggml_sycl_supports_reorder_mul_mat_sycl(enum ggml_type type) {
|
|
3015
3548
|
switch (type) {
|
|
3016
3549
|
case GGML_TYPE_Q4_0:
|
|
3550
|
+
case GGML_TYPE_Q8_0:
|
|
3017
3551
|
return true;
|
|
3552
|
+
case GGML_TYPE_Q3_K:
|
|
3018
3553
|
case GGML_TYPE_Q4_K:
|
|
3554
|
+
case GGML_TYPE_Q5_K:
|
|
3019
3555
|
case GGML_TYPE_Q6_K:
|
|
3020
3556
|
return !g_ggml_sycl_prioritize_dmmv;
|
|
3021
3557
|
default:
|
|
@@ -3026,6 +3562,7 @@ inline bool ggml_sycl_supports_reorder_mul_mat_sycl(enum ggml_type type) {
|
|
|
3026
3562
|
inline bool ggml_sycl_supports_reorder_dmmv(enum ggml_type type) {
|
|
3027
3563
|
switch (type) {
|
|
3028
3564
|
case GGML_TYPE_Q4_0:
|
|
3565
|
+
case GGML_TYPE_Q8_0:
|
|
3029
3566
|
return true;
|
|
3030
3567
|
default:
|
|
3031
3568
|
return false;
|
|
@@ -3035,7 +3572,10 @@ inline bool ggml_sycl_supports_reorder_dmmv(enum ggml_type type) {
|
|
|
3035
3572
|
inline bool ggml_sycl_supports_reorder_mmvq(enum ggml_type type) {
|
|
3036
3573
|
switch (type) {
|
|
3037
3574
|
case GGML_TYPE_Q4_0:
|
|
3575
|
+
case GGML_TYPE_Q8_0:
|
|
3576
|
+
case GGML_TYPE_Q3_K:
|
|
3038
3577
|
case GGML_TYPE_Q4_K:
|
|
3578
|
+
case GGML_TYPE_Q5_K:
|
|
3039
3579
|
case GGML_TYPE_Q6_K:
|
|
3040
3580
|
return true;
|
|
3041
3581
|
default:
|
|
@@ -3056,6 +3596,7 @@ static bool ggml_sycl_supports_dmmv(enum ggml_type type) {
|
|
|
3056
3596
|
case GGML_TYPE_Q5_K:
|
|
3057
3597
|
case GGML_TYPE_Q6_K:
|
|
3058
3598
|
case GGML_TYPE_F16:
|
|
3599
|
+
case GGML_TYPE_BF16:
|
|
3059
3600
|
return true;
|
|
3060
3601
|
default:
|
|
3061
3602
|
return false;
|
|
@@ -3073,7 +3614,7 @@ static inline void * sycl_ext_malloc_device(dpct::queue_ptr stream, size_t size)
|
|
|
3073
3614
|
// If async allocation extension is not available, use_async should always be false.
|
|
3074
3615
|
GGML_ASSERT(!use_async);
|
|
3075
3616
|
#endif
|
|
3076
|
-
return
|
|
3617
|
+
return ggml_sycl_malloc_device(size, *stream);
|
|
3077
3618
|
}
|
|
3078
3619
|
|
|
3079
3620
|
static inline void sycl_ext_free(dpct::queue_ptr stream, void * ptr) {
|
|
@@ -3087,12 +3628,58 @@ static inline void sycl_ext_free(dpct::queue_ptr stream, void * ptr) {
|
|
|
3087
3628
|
// If async allocation extension is not available, use_async should always be false.
|
|
3088
3629
|
GGML_ASSERT(!use_async);
|
|
3089
3630
|
#endif
|
|
3090
|
-
|
|
3631
|
+
ggml_sycl_free_device(ptr, *stream);
|
|
3091
3632
|
}
|
|
3092
3633
|
|
|
3093
|
-
|
|
3634
|
+
// RAII wrapper for temporary reorder buffers with optional host memory fallback.
|
|
3635
|
+
// When device allocation fails and GGML_SYCL_HOST_MEM_FALLBACK is enabled,
|
|
3636
|
+
// falls back to host memory so the reorder kernel can still run (over PCIe).
|
|
3637
|
+
// Device access to host memory requires Linux kernel 6.8+ (Ubuntu 26.04+).
|
|
3638
|
+
struct sycl_reorder_temp_buffer {
|
|
3639
|
+
void * ptr = nullptr;
|
|
3640
|
+
dpct::queue_ptr stream;
|
|
3641
|
+
|
|
3642
|
+
sycl_reorder_temp_buffer(dpct::queue_ptr stream, size_t size) : stream(stream) {
|
|
3643
|
+
ptr = sycl_ext_malloc_device(stream, size);
|
|
3644
|
+
#ifdef GGML_SYCL_HOST_MEM_FALLBACK
|
|
3645
|
+
if (!ptr) {
|
|
3646
|
+
ptr = sycl::malloc_host(size, *stream);
|
|
3647
|
+
if (ptr) {
|
|
3648
|
+
host_fallback = true;
|
|
3649
|
+
GGML_LOG_WARN("%s: device alloc of %zu bytes failed, using host memory fallback\n", __func__, size);
|
|
3650
|
+
}
|
|
3651
|
+
}
|
|
3652
|
+
#endif
|
|
3653
|
+
}
|
|
3654
|
+
|
|
3655
|
+
~sycl_reorder_temp_buffer() {
|
|
3656
|
+
if (!ptr) {
|
|
3657
|
+
return;
|
|
3658
|
+
}
|
|
3659
|
+
if (host_fallback) {
|
|
3660
|
+
sycl::free(ptr, *stream);
|
|
3661
|
+
} else {
|
|
3662
|
+
sycl_ext_free(stream, ptr);
|
|
3663
|
+
}
|
|
3664
|
+
}
|
|
3665
|
+
|
|
3666
|
+
explicit operator bool() const { return ptr != nullptr; }
|
|
3667
|
+
|
|
3668
|
+
sycl_reorder_temp_buffer(const sycl_reorder_temp_buffer &) = delete;
|
|
3669
|
+
sycl_reorder_temp_buffer & operator=(const sycl_reorder_temp_buffer &) = delete;
|
|
3670
|
+
|
|
3671
|
+
private:
|
|
3672
|
+
bool host_fallback = false;
|
|
3673
|
+
};
|
|
3674
|
+
|
|
3675
|
+
static bool reorder_qw_q4_0(uint8_t * data_device, const int ncols, const int nrows, size_t size, size_t offset,
|
|
3094
3676
|
dpct::queue_ptr stream) {
|
|
3095
|
-
|
|
3677
|
+
sycl_reorder_temp_buffer tmp(stream, size);
|
|
3678
|
+
if (!tmp) {
|
|
3679
|
+
GGML_LOG_WARN("%s: failed to allocate %zu bytes for reorder temp buffer, skipping reorder\n", __func__, size);
|
|
3680
|
+
return false;
|
|
3681
|
+
}
|
|
3682
|
+
uint8_t * tmp_buf = static_cast<uint8_t *>(tmp.ptr);
|
|
3096
3683
|
|
|
3097
3684
|
sycl::event copy_event;
|
|
3098
3685
|
SYCL_CHECK(CHECK_TRY_ERROR(copy_event = stream->memcpy(tmp_buf, data_device, size)));
|
|
@@ -3121,16 +3708,60 @@ static void reorder_qw_q4_0(uint8_t * data_device, const int ncols, const int nr
|
|
|
3121
3708
|
if (!g_ggml_sycl_use_async_mem_op) {
|
|
3122
3709
|
reorder_event.wait_and_throw();
|
|
3123
3710
|
}
|
|
3124
|
-
|
|
3711
|
+
return true;
|
|
3125
3712
|
}
|
|
3126
3713
|
|
|
3127
|
-
static
|
|
3714
|
+
static bool reorder_qw_q8_0(uint8_t * data_device, const int ncols, const int nrows, size_t size, size_t offset,
|
|
3715
|
+
dpct::queue_ptr stream) {
|
|
3716
|
+
sycl_reorder_temp_buffer tmp(stream, size);
|
|
3717
|
+
if (!tmp) {
|
|
3718
|
+
GGML_LOG_WARN("%s: failed to allocate %zu bytes for reorder temp buffer, skipping reorder\n", __func__, size);
|
|
3719
|
+
return false;
|
|
3720
|
+
}
|
|
3721
|
+
uint8_t * tmp_buf = static_cast<uint8_t *>(tmp.ptr);
|
|
3722
|
+
|
|
3723
|
+
sycl::event copy_event;
|
|
3724
|
+
SYCL_CHECK(CHECK_TRY_ERROR(copy_event = stream->memcpy(tmp_buf, data_device, size)));
|
|
3725
|
+
if (!g_ggml_sycl_use_async_mem_op) {
|
|
3726
|
+
copy_event.wait();
|
|
3727
|
+
}
|
|
3728
|
+
|
|
3729
|
+
GGML_ASSERT((size % sizeof(block_q8_0) == 0));
|
|
3730
|
+
GGML_ASSERT((offset % sizeof(block_q8_0) == 0));
|
|
3731
|
+
int offset_blks = offset / sizeof(block_q8_0);
|
|
3732
|
+
auto qs_ptr = data_device + offset_blks * QK8_0;
|
|
3733
|
+
auto d_ptr = (sycl::half*)(qs_ptr + ncols * nrows) + offset_blks;
|
|
3734
|
+
|
|
3735
|
+
auto reorder_event = stream->parallel_for(
|
|
3736
|
+
size / sizeof(block_q8_0),
|
|
3737
|
+
[=](auto i) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
|
3738
|
+
const block_q8_0* x = (const block_q8_0*)tmp_buf;
|
|
3739
|
+
const int ib = i;
|
|
3740
|
+
|
|
3741
|
+
for (int j = 0; j < QK8_0; j++)
|
|
3742
|
+
{
|
|
3743
|
+
*((int8_t*)qs_ptr + ib * QK8_0 + j) = x[ib].qs[j];
|
|
3744
|
+
}
|
|
3745
|
+
*(d_ptr + ib) = x[ib].d;
|
|
3746
|
+
});
|
|
3747
|
+
if (!g_ggml_sycl_use_async_mem_op) {
|
|
3748
|
+
reorder_event.wait_and_throw();
|
|
3749
|
+
}
|
|
3750
|
+
return true;
|
|
3751
|
+
}
|
|
3752
|
+
|
|
3753
|
+
static bool reorder_qw_q4_k(uint8_t * data_device, size_t size, size_t offset, dpct::queue_ptr stream) {
|
|
3128
3754
|
GGML_ASSERT(size % sizeof(block_q4_K) == 0);
|
|
3129
3755
|
GGML_ASSERT(offset % sizeof(block_q4_K) == 0);
|
|
3130
3756
|
|
|
3131
3757
|
const int nblocks = size / sizeof(block_q4_K);
|
|
3132
3758
|
|
|
3133
|
-
|
|
3759
|
+
sycl_reorder_temp_buffer tmp(stream, size);
|
|
3760
|
+
if (!tmp) {
|
|
3761
|
+
GGML_LOG_WARN("%s: failed to allocate %zu bytes for reorder temp buffer, skipping reorder\n", __func__, size);
|
|
3762
|
+
return false;
|
|
3763
|
+
}
|
|
3764
|
+
uint8_t * tmp_buf = static_cast<uint8_t *>(tmp.ptr);
|
|
3134
3765
|
|
|
3135
3766
|
sycl::event copy_event;
|
|
3136
3767
|
SYCL_CHECK(CHECK_TRY_ERROR(copy_event = stream->memcpy(tmp_buf, data_device, size)));
|
|
@@ -3159,16 +3790,117 @@ static void reorder_qw_q4_k(uint8_t * data_device, size_t size, size_t offset, d
|
|
|
3159
3790
|
if (!g_ggml_sycl_use_async_mem_op) {
|
|
3160
3791
|
reorder_event.wait_and_throw();
|
|
3161
3792
|
}
|
|
3162
|
-
|
|
3793
|
+
return true;
|
|
3794
|
+
}
|
|
3795
|
+
|
|
3796
|
+
static bool reorder_qw_q3_k(uint8_t * data_device, size_t size, size_t offset, dpct::queue_ptr stream) {
|
|
3797
|
+
GGML_ASSERT(size % sizeof(block_q3_K) == 0);
|
|
3798
|
+
GGML_ASSERT(offset % sizeof(block_q3_K) == 0);
|
|
3799
|
+
|
|
3800
|
+
const int nblocks = size / sizeof(block_q3_K);
|
|
3801
|
+
|
|
3802
|
+
sycl_reorder_temp_buffer tmp(stream, size);
|
|
3803
|
+
if (!tmp) {
|
|
3804
|
+
GGML_LOG_WARN("%s: failed to allocate %zu bytes for reorder temp buffer, skipping reorder\n", __func__, size);
|
|
3805
|
+
return false;
|
|
3806
|
+
}
|
|
3807
|
+
uint8_t * tmp_buf = static_cast<uint8_t *>(tmp.ptr);
|
|
3808
|
+
|
|
3809
|
+
sycl::event copy_event;
|
|
3810
|
+
SYCL_CHECK(CHECK_TRY_ERROR(copy_event = stream->memcpy(tmp_buf, data_device, size)));
|
|
3811
|
+
if (!g_ggml_sycl_use_async_mem_op) {
|
|
3812
|
+
copy_event.wait();
|
|
3813
|
+
}
|
|
3814
|
+
|
|
3815
|
+
auto * qs_ptr = data_device;
|
|
3816
|
+
auto * hmask_ptr = qs_ptr + (QK_K / 4) * nblocks;
|
|
3817
|
+
auto * scales_ptr = hmask_ptr + (QK_K / 8) * nblocks;
|
|
3818
|
+
sycl::half * d_ptr = (sycl::half *) (scales_ptr + 12 * nblocks);
|
|
3819
|
+
|
|
3820
|
+
auto reorder_event = stream->parallel_for(nblocks, [=](auto i) {
|
|
3821
|
+
const block_q3_K * x = (const block_q3_K *) tmp_buf;
|
|
3822
|
+
const int ib = i;
|
|
3823
|
+
|
|
3824
|
+
for (int j = 0; j < QK_K / 4; ++j) {
|
|
3825
|
+
qs_ptr[ib * (QK_K / 4) + j] = x[ib].qs[j];
|
|
3826
|
+
}
|
|
3827
|
+
|
|
3828
|
+
for (int j = 0; j < QK_K / 8; ++j) {
|
|
3829
|
+
hmask_ptr[ib * (QK_K / 8) + j] = x[ib].hmask[j];
|
|
3830
|
+
}
|
|
3831
|
+
|
|
3832
|
+
for (int j = 0; j < 12; ++j) {
|
|
3833
|
+
scales_ptr[ib * 12 + j] = x[ib].scales[j];
|
|
3834
|
+
}
|
|
3835
|
+
|
|
3836
|
+
d_ptr[ib] = x[ib].d;
|
|
3837
|
+
});
|
|
3838
|
+
if (!g_ggml_sycl_use_async_mem_op) {
|
|
3839
|
+
reorder_event.wait_and_throw();
|
|
3840
|
+
}
|
|
3841
|
+
return true;
|
|
3842
|
+
}
|
|
3843
|
+
|
|
3844
|
+
static bool reorder_qw_q5_k(uint8_t * data_device, size_t size, size_t offset, dpct::queue_ptr stream) {
|
|
3845
|
+
GGML_ASSERT(size % sizeof(block_q5_K) == 0);
|
|
3846
|
+
GGML_ASSERT(offset % sizeof(block_q5_K) == 0);
|
|
3847
|
+
|
|
3848
|
+
const int nblocks = size / sizeof(block_q5_K);
|
|
3849
|
+
|
|
3850
|
+
sycl_reorder_temp_buffer tmp(stream, size);
|
|
3851
|
+
if (!tmp) {
|
|
3852
|
+
GGML_LOG_WARN("%s: failed to allocate %zu bytes for reorder temp buffer, skipping reorder\n", __func__, size);
|
|
3853
|
+
return false;
|
|
3854
|
+
}
|
|
3855
|
+
uint8_t * tmp_buf = static_cast<uint8_t *>(tmp.ptr);
|
|
3856
|
+
|
|
3857
|
+
sycl::event copy_event;
|
|
3858
|
+
SYCL_CHECK(CHECK_TRY_ERROR(copy_event = stream->memcpy(tmp_buf, data_device, size)));
|
|
3859
|
+
if (!g_ggml_sycl_use_async_mem_op) {
|
|
3860
|
+
copy_event.wait();
|
|
3861
|
+
}
|
|
3862
|
+
|
|
3863
|
+
auto * qs_ptr = data_device;
|
|
3864
|
+
auto * qh_ptr = qs_ptr + (QK_K / 2) * nblocks;
|
|
3865
|
+
auto * scales_ptr = qh_ptr + (QK_K / 8) * nblocks;
|
|
3866
|
+
auto * dm_ptr = (sycl::half2 *) (scales_ptr + K_SCALE_SIZE * nblocks);
|
|
3867
|
+
|
|
3868
|
+
auto reorder_event = stream->parallel_for(nblocks, [=](auto i) {
|
|
3869
|
+
const block_q5_K * x = (const block_q5_K *) tmp_buf;
|
|
3870
|
+
const int ib = i;
|
|
3871
|
+
|
|
3872
|
+
for (int j = 0; j < QK_K / 2; ++j) {
|
|
3873
|
+
qs_ptr[ib * (QK_K / 2) + j] = x[ib].qs[j];
|
|
3874
|
+
}
|
|
3875
|
+
|
|
3876
|
+
for (int j = 0; j < QK_K / 8; ++j) {
|
|
3877
|
+
qh_ptr[ib * (QK_K / 8) + j] = x[ib].qh[j];
|
|
3878
|
+
}
|
|
3879
|
+
|
|
3880
|
+
for (int j = 0; j < K_SCALE_SIZE; ++j) {
|
|
3881
|
+
scales_ptr[ib * K_SCALE_SIZE + j] = x[ib].scales[j];
|
|
3882
|
+
}
|
|
3883
|
+
|
|
3884
|
+
dm_ptr[ib] = x[ib].dm;
|
|
3885
|
+
});
|
|
3886
|
+
if (!g_ggml_sycl_use_async_mem_op) {
|
|
3887
|
+
reorder_event.wait_and_throw();
|
|
3888
|
+
}
|
|
3889
|
+
return true;
|
|
3163
3890
|
}
|
|
3164
3891
|
|
|
3165
|
-
static
|
|
3892
|
+
static bool reorder_qw_q6_k(uint8_t * data_device, size_t size, size_t offset, dpct::queue_ptr stream) {
|
|
3166
3893
|
GGML_ASSERT(size % sizeof(block_q6_K) == 0);
|
|
3167
3894
|
GGML_ASSERT(offset % sizeof(block_q6_K) == 0);
|
|
3168
3895
|
|
|
3169
3896
|
const int nblocks = size / sizeof(block_q6_K);
|
|
3170
3897
|
|
|
3171
|
-
|
|
3898
|
+
sycl_reorder_temp_buffer tmp(stream, size);
|
|
3899
|
+
if (!tmp) {
|
|
3900
|
+
GGML_LOG_WARN("%s: failed to allocate %zu bytes for reorder temp buffer, skipping reorder\n", __func__, size);
|
|
3901
|
+
return false;
|
|
3902
|
+
}
|
|
3903
|
+
uint8_t * tmp_buf = static_cast<uint8_t *>(tmp.ptr);
|
|
3172
3904
|
|
|
3173
3905
|
sycl::event copy_event;
|
|
3174
3906
|
SYCL_CHECK(CHECK_TRY_ERROR(copy_event = stream->memcpy(tmp_buf, data_device, size)));
|
|
@@ -3207,10 +3939,10 @@ static void reorder_qw_q6_k(uint8_t * data_device, size_t size, size_t offset, d
|
|
|
3207
3939
|
if (!g_ggml_sycl_use_async_mem_op) {
|
|
3208
3940
|
reorder_event.wait_and_throw();
|
|
3209
3941
|
}
|
|
3210
|
-
|
|
3942
|
+
return true;
|
|
3211
3943
|
}
|
|
3212
3944
|
|
|
3213
|
-
static
|
|
3945
|
+
static bool reorder_qw(const ggml_tensor * src0, dpct::queue_ptr stream) {
|
|
3214
3946
|
uint8_t * data_device = (uint8_t *) src0->data;
|
|
3215
3947
|
size_t ncols = src0->ne[0];
|
|
3216
3948
|
size_t nrows = src0->ne[1];
|
|
@@ -3218,17 +3950,20 @@ static void reorder_qw(const ggml_tensor * src0, dpct::queue_ptr stream) {
|
|
|
3218
3950
|
|
|
3219
3951
|
switch (src0->type) {
|
|
3220
3952
|
case GGML_TYPE_Q4_0:
|
|
3221
|
-
reorder_qw_q4_0(data_device, ncols, nrows, size, 0, stream);
|
|
3222
|
-
|
|
3953
|
+
return reorder_qw_q4_0(data_device, ncols, nrows, size, 0, stream);
|
|
3954
|
+
case GGML_TYPE_Q8_0:
|
|
3955
|
+
return reorder_qw_q8_0(data_device, ncols, nrows, size, 0, stream);
|
|
3956
|
+
case GGML_TYPE_Q3_K:
|
|
3957
|
+
return reorder_qw_q3_k(data_device, size, 0, stream);
|
|
3223
3958
|
case GGML_TYPE_Q4_K:
|
|
3224
|
-
reorder_qw_q4_k(data_device, size, 0, stream);
|
|
3225
|
-
|
|
3959
|
+
return reorder_qw_q4_k(data_device, size, 0, stream);
|
|
3960
|
+
case GGML_TYPE_Q5_K:
|
|
3961
|
+
return reorder_qw_q5_k(data_device, size, 0, stream);
|
|
3226
3962
|
case GGML_TYPE_Q6_K:
|
|
3227
|
-
reorder_qw_q6_k(data_device, size, 0, stream);
|
|
3228
|
-
break;
|
|
3963
|
+
return reorder_qw_q6_k(data_device, size, 0, stream);
|
|
3229
3964
|
default:
|
|
3230
3965
|
GGML_ABORT("reorder_qw() called with unsupported type");
|
|
3231
|
-
|
|
3966
|
+
return false;
|
|
3232
3967
|
}
|
|
3233
3968
|
}
|
|
3234
3969
|
|
|
@@ -3236,7 +3971,9 @@ static bool should_reorder_tensor(ggml_backend_sycl_context& ctx, const ggml_ten
|
|
|
3236
3971
|
return !g_ggml_sycl_disable_optimize && //allow optimize, controlled by $GGML_SYCL_DISABLE_OPT
|
|
3237
3972
|
ctx.opt_feature.reorder && //allow this device due to good perf, skip the devices with bad perf.
|
|
3238
3973
|
dst->op == GGML_OP_MUL_MAT && //limit to some supported cases of Q4_0, to do for more cases.
|
|
3239
|
-
|
|
3974
|
+
// ne[1] <= 8 so multi-column decode (spec / MTP verify) also bootstraps the reorder;
|
|
3975
|
+
// all reorderable types have a _switch_ncols kernel.
|
|
3976
|
+
dst->src[1]->ne[1] <= 8 && dst->src[1]->ne[2]==1 && dst->src[1]->ne[3]==1;
|
|
3240
3977
|
}
|
|
3241
3978
|
|
|
3242
3979
|
static void opt_for_reorder(ggml_backend_sycl_context * ctx, const ggml_tensor * src0, const ggml_tensor * /* src1 */,
|
|
@@ -3268,14 +4005,20 @@ static void opt_for_reorder(ggml_backend_sycl_context * ctx, const ggml_tensor *
|
|
|
3268
4005
|
break;
|
|
3269
4006
|
}
|
|
3270
4007
|
|
|
3271
|
-
reorder_qw(src0, ctx->stream())
|
|
3272
|
-
|
|
4008
|
+
if (reorder_qw(src0, ctx->stream())) {
|
|
4009
|
+
extra->optimized_feature.reorder = true; // Used to decode/dequan in next steps and avoid re-reordering
|
|
4010
|
+
}
|
|
3273
4011
|
}
|
|
3274
4012
|
|
|
3275
4013
|
|
|
3276
4014
|
static bool can_use_dequantize_mul_mat_vec(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
|
4015
|
+
// The F16/BF16 qk=1 kernel iterates with stride 2*DMMV_X, requiring ne[0] to be
|
|
4016
|
+
// a multiple of 2*DMMV_X. Quantized types use block-structured kernels that only
|
|
4017
|
+
// need ne[0] % DMMV_X == 0.
|
|
4018
|
+
const int64_t dmmv_x_required = (src0->type == GGML_TYPE_BF16 || src0->type == GGML_TYPE_F16) ?
|
|
4019
|
+
2*GGML_SYCL_DMMV_X : GGML_SYCL_DMMV_X;
|
|
3277
4020
|
return ggml_sycl_supports_dmmv(src0->type) && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 &&
|
|
3278
|
-
src0->ne[0] %
|
|
4021
|
+
src0->ne[0] % dmmv_x_required == 0 && src1->ne[1] == 1;
|
|
3279
4022
|
}
|
|
3280
4023
|
|
|
3281
4024
|
static bool can_use_mul_mat_vec_q(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
|
@@ -3316,19 +4059,25 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor
|
|
|
3316
4059
|
|
|
3317
4060
|
|
|
3318
4061
|
// mmvq and mmq need the __dp4a instruction which is available for gen12+
|
|
3319
|
-
// Workaround in https://github.com/
|
|
4062
|
+
// Workaround in https://github.com/ggml-org/llama.cpp/commit/95f84d5ce8b449a9b16009434aca800df504a02e
|
|
3320
4063
|
use_mul_mat_q = use_mul_mat_q && (src0->type != GGML_TYPE_IQ2_XXS);
|
|
3321
4064
|
#ifdef SYCL_USE_XMX
|
|
3322
4065
|
use_mul_mat_q = use_mul_mat_q && (src1->ne[1] <= MMQ_MAX_BATCH_SIZE);
|
|
3323
4066
|
#endif // SYCL_USE_XMX
|
|
3324
4067
|
|
|
3325
|
-
//
|
|
3326
|
-
|
|
3327
|
-
|
|
3328
|
-
|
|
3329
|
-
|
|
3330
|
-
|
|
3331
|
-
|
|
4068
|
+
// Dispatch becomes obscure with the reorder, MMVQ when the reorder optimization
|
|
4069
|
+
// is enabled takes precedence over DMMV, the current if-else implementation
|
|
4070
|
+
// requires disabling DMMV if both conditions are met
|
|
4071
|
+
|
|
4072
|
+
if (!g_ggml_sycl_prioritize_dmmv && ((should_reorder_tensor(ctx, dst) &&
|
|
4073
|
+
ggml_sycl_supports_reorder_mmvq(src0->type)))) {
|
|
4074
|
+
// Arc770 get benefit with Q4_0 by skipping it.
|
|
4075
|
+
if (!(ggml_sycl_info().devices[ctx.device].hw_info.arch ==
|
|
4076
|
+
gpu_arch::intel_gpu_acm_g10 &&
|
|
4077
|
+
src0->type == GGML_TYPE_Q4_0)) {
|
|
4078
|
+
use_dequantize_mul_mat_vec =
|
|
4079
|
+
use_dequantize_mul_mat_vec && !use_mul_mat_vec_q;
|
|
4080
|
+
}
|
|
3332
4081
|
}
|
|
3333
4082
|
|
|
3334
4083
|
if (!split && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
|
|
@@ -3373,35 +4122,17 @@ struct mmid_row_mapping {
|
|
|
3373
4122
|
|
|
3374
4123
|
__dpct_inline__ static void k_copy_src1_to_contiguous(
|
|
3375
4124
|
const char *__restrict__ src1_original, char *__restrict__ src1_contiguous,
|
|
3376
|
-
|
|
3377
|
-
const char *__restrict ids, int64_t i02, size_t ids_nb1, size_t ids_nb0,
|
|
4125
|
+
const mmid_row_mapping *__restrict__ row_mapping,
|
|
3378
4126
|
int64_t ne11, int64_t ne10, size_t nb11, size_t nb12,
|
|
3379
|
-
const sycl::nd_item<3> &item_ct1
|
|
3380
|
-
int32_t
|
|
3381
|
-
int32_t id = item_ct1.get_group(1);
|
|
3382
|
-
|
|
3383
|
-
const int32_t row_id_i = *(const int32_t *) (ids + iid1*ids_nb1 + id*ids_nb0);
|
|
4127
|
+
const sycl::nd_item<3> &item_ct1) {
|
|
4128
|
+
const int32_t src1_row = item_ct1.get_group(2);
|
|
3384
4129
|
|
|
3385
|
-
|
|
3386
|
-
|
|
3387
|
-
}
|
|
4130
|
+
const int32_t iid1 = row_mapping[src1_row].i2;
|
|
4131
|
+
const int32_t id = row_mapping[src1_row].i1;
|
|
3388
4132
|
|
|
3389
4133
|
const int64_t i11 = id % ne11;
|
|
3390
4134
|
const int64_t i12 = iid1;
|
|
3391
4135
|
|
|
3392
|
-
if (item_ct1.get_local_id(2) == 0) {
|
|
3393
|
-
src1_row =
|
|
3394
|
-
dpct::atomic_fetch_add<sycl::access::address_space::generic_space>(
|
|
3395
|
-
cur_src1_row, 1);
|
|
3396
|
-
row_mapping[src1_row] = {id, iid1};
|
|
3397
|
-
}
|
|
3398
|
-
/*
|
|
3399
|
-
DPCT1065:194: Consider replacing sycl::nd_item::barrier() with
|
|
3400
|
-
sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better
|
|
3401
|
-
performance if there is no access to global memory.
|
|
3402
|
-
*/
|
|
3403
|
-
item_ct1.barrier();
|
|
3404
|
-
|
|
3405
4136
|
const float * src1_row_original = (const float *)(src1_original + i11*nb11 + i12*nb12);
|
|
3406
4137
|
float * src1_row_contiguous = (float *)(src1_contiguous + src1_row*nb11);
|
|
3407
4138
|
|
|
@@ -3431,6 +4162,92 @@ __dpct_inline__ static void k_copy_dst_from_contiguous(
|
|
|
3431
4162
|
}
|
|
3432
4163
|
}
|
|
3433
4164
|
|
|
4165
|
+
// Fused MoE TG fast path. Returns false to fall back to the per-expert loop below.
|
|
4166
|
+
static bool ggml_sycl_mul_mat_id_mmvq_fused(
|
|
4167
|
+
ggml_backend_sycl_context & ctx, const ggml_tensor * src0,
|
|
4168
|
+
const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst)
|
|
4169
|
+
{
|
|
4170
|
+
const int64_t ne10 = src1->ne[0];
|
|
4171
|
+
const int64_t ne11 = src1->ne[1];
|
|
4172
|
+
const int64_t ne12 = src1->ne[2];
|
|
4173
|
+
if (ne12 != 1) return false;
|
|
4174
|
+
if (src1->type != GGML_TYPE_F32 || dst->type != GGML_TYPE_F32) return false;
|
|
4175
|
+
if (ne10 != src0->ne[0] || ne10 % QK8_1 != 0) return false;
|
|
4176
|
+
if (!ggml_is_contiguous(src1)) return false;
|
|
4177
|
+
|
|
4178
|
+
// Reorder layout not supported; fall back.
|
|
4179
|
+
const ggml_tensor_extra_gpu * src0_extra =
|
|
4180
|
+
static_cast<const ggml_tensor_extra_gpu *>(src0->extra);
|
|
4181
|
+
if (src0_extra && src0_extra->optimized_feature.reorder) return false;
|
|
4182
|
+
|
|
4183
|
+
const int64_t n_ids_per_group = ids->ne[0];
|
|
4184
|
+
if (ids->ne[1] != 1) return false;
|
|
4185
|
+
if (ne11 != 1 && ne11 != n_ids_per_group) return false;
|
|
4186
|
+
|
|
4187
|
+
const queue_ptr stream = ctx.stream();
|
|
4188
|
+
const int src1_padded_cols = GGML_PAD((int) ne10, MATRIX_ROW_PADDING);
|
|
4189
|
+
const int n_experts_used = (int) n_ids_per_group;
|
|
4190
|
+
const int nrows = (int) src0->ne[1];
|
|
4191
|
+
|
|
4192
|
+
ggml_sycl_pool_alloc<char> src1_q8_alloc(ctx.pool(),
|
|
4193
|
+
(size_t) ne11 * src1_padded_cols * sizeof(block_q8_1) / QK8_1);
|
|
4194
|
+
char * src1_ddq = src1_q8_alloc.get();
|
|
4195
|
+
quantize_row_q8_1_sycl<quantize_q8_1>(
|
|
4196
|
+
(const float *) src1->data, src1_ddq, (int) ne10, (int) ne11,
|
|
4197
|
+
src1_padded_cols, stream);
|
|
4198
|
+
|
|
4199
|
+
const size_t bytes_per_qrow = (size_t) src1_padded_cols * sizeof(block_q8_1) / QK8_1;
|
|
4200
|
+
const size_t src1_row_stride = (ne11 == 1) ? 0 : bytes_per_qrow;
|
|
4201
|
+
|
|
4202
|
+
return ggml_sycl_mul_mat_vec_q_id(
|
|
4203
|
+
src0->type, src0->data, src1_ddq, (const int32_t *) ids->data,
|
|
4204
|
+
(float *) dst->data, (int) ne10, nrows, n_experts_used,
|
|
4205
|
+
/*expert_weight_stride=*/ src0->nb[2],
|
|
4206
|
+
/*dst_row_stride=*/ dst->nb[1],
|
|
4207
|
+
src1_row_stride, stream);
|
|
4208
|
+
}
|
|
4209
|
+
|
|
4210
|
+
// counting sort of the routed rows by expert id (row_id_i, as chosen by the router):
|
|
4211
|
+
// builds a projection of a memory layout where each expert's slice is contiguous
|
|
4212
|
+
static void mmid_counting_sort_rows(
|
|
4213
|
+
const ggml_tensor * ids, const char * ids_host,
|
|
4214
|
+
int64_t n_ids, int64_t n_as, int64_t n_routed_rows,
|
|
4215
|
+
std::vector<int64_t> & expert_counts,
|
|
4216
|
+
std::vector<int64_t> & expert_row_offsets,
|
|
4217
|
+
std::vector<mmid_row_mapping> & routed_row_src) {
|
|
4218
|
+
|
|
4219
|
+
// frequencies: how many routed rows each expert "owns"
|
|
4220
|
+
expert_counts.assign(n_as, 0);
|
|
4221
|
+
for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) {
|
|
4222
|
+
for (int64_t id = 0; id < n_ids; id++) {
|
|
4223
|
+
const int32_t row_id_i = *(const int32_t *) (ids_host + iid1*ids->nb[1] + id*ids->nb[0]);
|
|
4224
|
+
GGML_ASSERT(row_id_i >= 0 && row_id_i < n_as);
|
|
4225
|
+
expert_counts[row_id_i]++;
|
|
4226
|
+
}
|
|
4227
|
+
}
|
|
4228
|
+
|
|
4229
|
+
// where each expert's slice starts (row indices) and the previous ends
|
|
4230
|
+
expert_row_offsets.assign(n_as + 1, 0);
|
|
4231
|
+
for (int64_t i02 = 0; i02 < n_as; i02++) {
|
|
4232
|
+
expert_row_offsets[i02 + 1] = expert_row_offsets[i02] + expert_counts[i02];
|
|
4233
|
+
}
|
|
4234
|
+
|
|
4235
|
+
std::vector<int64_t> expert_row_next = expert_row_offsets;
|
|
4236
|
+
routed_row_src.resize(n_routed_rows);
|
|
4237
|
+
for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) {
|
|
4238
|
+
for (int64_t id = 0; id < n_ids; id++) {
|
|
4239
|
+
const int32_t row_id_i = *(const int32_t *) (ids_host + iid1*ids->nb[1] + id*ids->nb[0]);
|
|
4240
|
+
GGML_ASSERT(row_id_i >= 0 && row_id_i < n_as);
|
|
4241
|
+
|
|
4242
|
+
// find and validate the next free row for a given expert (row_id_i)
|
|
4243
|
+
const int64_t routed_row = expert_row_next[row_id_i]++;
|
|
4244
|
+
GGML_ASSERT(routed_row >= expert_row_offsets[row_id_i]);
|
|
4245
|
+
GGML_ASSERT(routed_row < expert_row_offsets[row_id_i + 1]);
|
|
4246
|
+
routed_row_src[routed_row] = {(int32_t) id, (int32_t) iid1};
|
|
4247
|
+
}
|
|
4248
|
+
}
|
|
4249
|
+
}
|
|
4250
|
+
|
|
3434
4251
|
static void ggml_sycl_mul_mat_id(ggml_backend_sycl_context & ctx,
|
|
3435
4252
|
ggml_tensor *dst) try {
|
|
3436
4253
|
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/3);
|
|
@@ -3446,6 +4263,12 @@ static void ggml_sycl_mul_mat_id(ggml_backend_sycl_context & ctx,
|
|
|
3446
4263
|
const int64_t n_as = ne02;
|
|
3447
4264
|
const int64_t n_ids = ids->ne[0];
|
|
3448
4265
|
|
|
4266
|
+
if (ne12 == 1) {
|
|
4267
|
+
if (ggml_sycl_mul_mat_id_mmvq_fused(ctx, src0, src1, ids, dst)) {
|
|
4268
|
+
return;
|
|
4269
|
+
}
|
|
4270
|
+
}
|
|
4271
|
+
|
|
3449
4272
|
std::vector<char> ids_host(ggml_nbytes(ids));
|
|
3450
4273
|
const char * ids_dev = (const char *) ids->data;
|
|
3451
4274
|
|
|
@@ -3496,105 +4319,98 @@ static void ggml_sycl_mul_mat_id(ggml_backend_sycl_context & ctx,
|
|
|
3496
4319
|
}
|
|
3497
4320
|
}
|
|
3498
4321
|
} else {
|
|
3499
|
-
|
|
3500
|
-
ggml_sycl_pool_alloc<char>
|
|
4322
|
+
const int64_t n_routed_rows = ids->ne[1] * n_ids;
|
|
4323
|
+
ggml_sycl_pool_alloc<char> src1_contiguous(ctx.pool(), sizeof(float)*n_routed_rows*ne10);
|
|
4324
|
+
ggml_sycl_pool_alloc<char> dst_contiguous(ctx.pool(), sizeof(float)*n_routed_rows*ne0);
|
|
3501
4325
|
|
|
3502
4326
|
src1_row.data = src1_contiguous.get();
|
|
3503
4327
|
dst_row.data = dst_contiguous.get();
|
|
3504
4328
|
|
|
3505
|
-
|
|
3506
|
-
|
|
3507
|
-
|
|
3508
|
-
|
|
3509
|
-
|
|
4329
|
+
// how many "owned" routed rows to pass to each expert
|
|
4330
|
+
std::vector<int64_t> expert_row_counts;
|
|
4331
|
+
// where each expert's slice starts and the previous ends (row indices, right-exclusive)
|
|
4332
|
+
std::vector<int64_t> expert_row_offsets;
|
|
4333
|
+
// the sources (slot/token pairs) of contiguous rows to guide k_copy_src1_to_contiguous
|
|
4334
|
+
std::vector<mmid_row_mapping> routed_row_src;
|
|
3510
4335
|
|
|
3511
|
-
|
|
4336
|
+
mmid_counting_sort_rows(ids, ids_host.data(), n_ids, n_as, n_routed_rows,
|
|
4337
|
+
expert_row_counts, expert_row_offsets, routed_row_src);
|
|
3512
4338
|
|
|
3513
|
-
|
|
3514
|
-
|
|
3515
|
-
|
|
4339
|
+
ggml_sycl_pool_alloc<mmid_row_mapping> dev_row_mapping(ctx.pool(), n_routed_rows);
|
|
4340
|
+
SYCL_CHECK(CHECK_TRY_ERROR(
|
|
4341
|
+
stream->memcpy(dev_row_mapping.get(), routed_row_src.data(), n_routed_rows*sizeof(mmid_row_mapping))));
|
|
3516
4342
|
|
|
3517
|
-
|
|
3518
|
-
|
|
3519
|
-
|
|
4343
|
+
const unsigned int max_work_group_size = ggml_sycl_info().max_work_group_sizes[ctx.device];
|
|
4344
|
+
assert(max_work_group_size % (WARP_SIZE * WARP_SIZE) == 0);
|
|
4345
|
+
|
|
4346
|
+
{
|
|
4347
|
+
sycl::range<3> block_dims(1, 1, std::min((unsigned int)ne10, max_work_group_size));
|
|
4348
|
+
sycl::range<3> grid_dims(1, 1, n_routed_rows);
|
|
4349
|
+
stream->submit([&](sycl::handler &cgh) {
|
|
4350
|
+
char *__restrict src1_contiguous_get =
|
|
4351
|
+
src1_contiguous.get();
|
|
4352
|
+
mmid_row_mapping *__restrict dev_row_mapping_get =
|
|
4353
|
+
dev_row_mapping.get();
|
|
4354
|
+
|
|
4355
|
+
cgh.parallel_for(
|
|
4356
|
+
sycl::nd_range<3>(grid_dims * block_dims, block_dims),
|
|
4357
|
+
[=](sycl::nd_item<3> item_ct1) {
|
|
4358
|
+
k_copy_src1_to_contiguous(
|
|
4359
|
+
src1_original, src1_contiguous_get,
|
|
4360
|
+
dev_row_mapping_get,
|
|
4361
|
+
ne11, ne10, nb11, nb12,
|
|
4362
|
+
item_ct1);
|
|
4363
|
+
});
|
|
4364
|
+
});
|
|
4365
|
+
}
|
|
4366
|
+
|
|
4367
|
+
for (int64_t i02 = 0; i02 < n_as; i02++) {
|
|
4368
|
+
const int64_t num_src1_rows = expert_row_counts[i02];
|
|
3520
4369
|
|
|
3521
4370
|
if (num_src1_rows == 0) {
|
|
3522
4371
|
continue;
|
|
3523
4372
|
}
|
|
3524
4373
|
|
|
3525
|
-
|
|
3526
|
-
ggml_sycl_pool_alloc<int> dev_cur_src1_row(ctx.pool(), 1);
|
|
3527
|
-
ggml_sycl_pool_alloc<mmid_row_mapping> dev_row_mapping(ctx.pool(), num_src1_rows);
|
|
3528
|
-
SYCL_CHECK(CHECK_TRY_ERROR(
|
|
3529
|
-
stream->memset(dev_cur_src1_row.get(), 0, sizeof(int))));
|
|
3530
|
-
|
|
3531
|
-
const unsigned int max_work_group_size = ggml_sycl_info().max_work_group_sizes[ctx.device];
|
|
3532
|
-
assert(max_work_group_size % (WARP_SIZE * WARP_SIZE) == 0);
|
|
3533
|
-
|
|
3534
|
-
{
|
|
3535
|
-
sycl::range<3> block_dims(1, 1, std::min((unsigned int)ne10, max_work_group_size));
|
|
3536
|
-
sycl::range<3> grid_dims(1, n_ids, ids->ne[1]);
|
|
3537
|
-
stream->submit([&](sycl::handler &cgh) {
|
|
3538
|
-
sycl::local_accessor<int, 0> src1_row_acc(cgh);
|
|
3539
|
-
|
|
3540
|
-
char *__restrict src1_contiguous_get =
|
|
3541
|
-
src1_contiguous.get();
|
|
3542
|
-
int *__restrict dev_cur_src1_row_get =
|
|
3543
|
-
dev_cur_src1_row.get();
|
|
3544
|
-
mmid_row_mapping *__restrict dev_row_mapping_get =
|
|
3545
|
-
dev_row_mapping.get();
|
|
3546
|
-
size_t ids_nb_ct6 = ids->nb[1];
|
|
3547
|
-
size_t ids_nb_ct7 = ids->nb[0];
|
|
3548
|
-
|
|
3549
|
-
cgh.parallel_for(
|
|
3550
|
-
sycl::nd_range<3>(grid_dims * block_dims, block_dims),
|
|
3551
|
-
[=](sycl::nd_item<3> item_ct1) {
|
|
3552
|
-
k_copy_src1_to_contiguous(
|
|
3553
|
-
src1_original, src1_contiguous_get,
|
|
3554
|
-
dev_cur_src1_row_get,
|
|
3555
|
-
dev_row_mapping_get, ids_dev, i02,
|
|
3556
|
-
ids_nb_ct6, ids_nb_ct7, ne11, ne10, nb11, nb12,
|
|
3557
|
-
item_ct1, src1_row_acc);
|
|
3558
|
-
});
|
|
3559
|
-
});
|
|
3560
|
-
}
|
|
4374
|
+
const int64_t expert_row_offset = expert_row_offsets[i02];
|
|
3561
4375
|
|
|
3562
4376
|
src0_row.data = src0_original + i02*nb02;
|
|
3563
4377
|
|
|
3564
4378
|
GGML_ASSERT(nb11 == sizeof(float)*ne10);
|
|
3565
4379
|
GGML_ASSERT(nb1 == sizeof(float)*ne0);
|
|
4380
|
+
src1_row.data = src1_contiguous.get() + expert_row_offset*nb11;
|
|
3566
4381
|
src1_row.ne[1] = num_src1_rows;
|
|
3567
4382
|
|
|
3568
4383
|
src1_row.nb[1] = nb11;
|
|
3569
4384
|
src1_row.nb[2] = num_src1_rows*nb11;
|
|
3570
4385
|
src1_row.nb[3] = num_src1_rows*nb11;
|
|
3571
4386
|
|
|
4387
|
+
dst_row.data = dst_contiguous.get() + expert_row_offset*nb1;
|
|
3572
4388
|
dst_row.ne[1] = num_src1_rows;
|
|
3573
4389
|
dst_row.nb[1] = nb1;
|
|
3574
4390
|
dst_row.nb[2] = num_src1_rows*nb1;
|
|
3575
4391
|
dst_row.nb[3] = num_src1_rows*nb1;
|
|
3576
4392
|
|
|
3577
4393
|
ggml_sycl_mul_mat(ctx, &src0_row, &src1_row, &dst_row);
|
|
4394
|
+
}
|
|
3578
4395
|
|
|
3579
|
-
|
|
3580
|
-
|
|
3581
|
-
|
|
3582
|
-
|
|
3583
|
-
|
|
3584
|
-
|
|
3585
|
-
|
|
3586
|
-
|
|
3587
|
-
|
|
3588
|
-
|
|
3589
|
-
|
|
3590
|
-
|
|
3591
|
-
|
|
3592
|
-
|
|
3593
|
-
|
|
3594
|
-
|
|
3595
|
-
|
|
3596
|
-
|
|
3597
|
-
}
|
|
4396
|
+
{
|
|
4397
|
+
sycl::range<3> block_dims(1, 1, std::min((unsigned int)ne0, max_work_group_size));
|
|
4398
|
+
sycl::range<3> grid_dims(1, 1, n_routed_rows);
|
|
4399
|
+
stream->submit([&](sycl::handler &cgh) {
|
|
4400
|
+
const char *__restrict dst_contiguous_get =
|
|
4401
|
+
dst_contiguous.get();
|
|
4402
|
+
const mmid_row_mapping *__restrict dev_row_mapping_get =
|
|
4403
|
+
dev_row_mapping.get();
|
|
4404
|
+
|
|
4405
|
+
cgh.parallel_for(
|
|
4406
|
+
sycl::nd_range<3>(grid_dims * block_dims, block_dims),
|
|
4407
|
+
[=](sycl::nd_item<3> item_ct1) {
|
|
4408
|
+
k_copy_dst_from_contiguous(dst_original,
|
|
4409
|
+
dst_contiguous_get,
|
|
4410
|
+
dev_row_mapping_get,
|
|
4411
|
+
ne0, nb1, nb2, item_ct1);
|
|
4412
|
+
});
|
|
4413
|
+
});
|
|
3598
4414
|
}
|
|
3599
4415
|
}
|
|
3600
4416
|
}
|
|
@@ -3624,6 +4440,11 @@ static void ggml_sycl_im2col(ggml_backend_sycl_context & ctx, ggml_tensor * dst)
|
|
|
3624
4440
|
ggml_sycl_op_im2col(ctx, dst);
|
|
3625
4441
|
}
|
|
3626
4442
|
|
|
4443
|
+
static void ggml_sycl_im2col_3d(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
4444
|
+
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2);
|
|
4445
|
+
ggml_sycl_op_im2col_3d(ctx, dst);
|
|
4446
|
+
}
|
|
4447
|
+
|
|
3627
4448
|
static void ggml_sycl_sum(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
3628
4449
|
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
|
|
3629
4450
|
GGML_ASSERT(ggml_is_contiguous(dst->src[0]));
|
|
@@ -3771,6 +4592,9 @@ static bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct gg
|
|
|
3771
4592
|
case GGML_UNARY_OP_EXP:
|
|
3772
4593
|
ggml_sycl_exp(ctx, dst);
|
|
3773
4594
|
break;
|
|
4595
|
+
case GGML_UNARY_OP_SOFTPLUS:
|
|
4596
|
+
ggml_sycl_softplus(ctx, dst);
|
|
4597
|
+
break;
|
|
3774
4598
|
case GGML_UNARY_OP_SGN:
|
|
3775
4599
|
ggml_sycl_sgn(ctx, dst);
|
|
3776
4600
|
break;
|
|
@@ -3897,6 +4721,9 @@ static bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct gg
|
|
|
3897
4721
|
case GGML_OP_TRANSPOSE:
|
|
3898
4722
|
GGML_SYCL_DEBUG("%s: Tensor NO-OP\n", __func__);
|
|
3899
4723
|
break;
|
|
4724
|
+
case GGML_OP_TRI:
|
|
4725
|
+
ggml_sycl_op_tri(ctx, dst);
|
|
4726
|
+
break;
|
|
3900
4727
|
case GGML_OP_DIAG_MASK_INF:
|
|
3901
4728
|
ggml_sycl_diag_mask_inf(ctx, dst);
|
|
3902
4729
|
break;
|
|
@@ -3909,9 +4736,15 @@ static bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct gg
|
|
|
3909
4736
|
case GGML_OP_ROPE:
|
|
3910
4737
|
ggml_sycl_rope(ctx, dst);
|
|
3911
4738
|
break;
|
|
4739
|
+
case GGML_OP_ROPE_BACK:
|
|
4740
|
+
ggml_sycl_rope_back(ctx, dst);
|
|
4741
|
+
break;
|
|
3912
4742
|
case GGML_OP_IM2COL:
|
|
3913
4743
|
ggml_sycl_im2col(ctx, dst);
|
|
3914
4744
|
break;
|
|
4745
|
+
case GGML_OP_IM2COL_3D:
|
|
4746
|
+
ggml_sycl_im2col_3d(ctx, dst);
|
|
4747
|
+
break;
|
|
3915
4748
|
case GGML_OP_POOL_2D:
|
|
3916
4749
|
ggml_sycl_pool2d(ctx, dst);
|
|
3917
4750
|
break;
|
|
@@ -3927,6 +4760,9 @@ static bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct gg
|
|
|
3927
4760
|
case GGML_OP_ARGSORT:
|
|
3928
4761
|
ggml_sycl_argsort(ctx, dst);
|
|
3929
4762
|
break;
|
|
4763
|
+
case GGML_OP_TOP_K:
|
|
4764
|
+
ggml_sycl_op_top_k(ctx, dst);
|
|
4765
|
+
break;
|
|
3930
4766
|
case GGML_OP_TIMESTEP_EMBEDDING:
|
|
3931
4767
|
ggml_sycl_op_timestep_embedding(ctx, dst);
|
|
3932
4768
|
break;
|
|
@@ -3939,15 +4775,36 @@ static bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct gg
|
|
|
3939
4775
|
case GGML_OP_GATED_LINEAR_ATTN:
|
|
3940
4776
|
ggml_sycl_op_gated_linear_attn(ctx, dst);
|
|
3941
4777
|
break;
|
|
4778
|
+
case GGML_OP_GATED_DELTA_NET:
|
|
4779
|
+
ggml_sycl_gated_delta_net(ctx, dst);
|
|
4780
|
+
break;
|
|
3942
4781
|
case GGML_OP_SSM_CONV:
|
|
3943
4782
|
ggml_sycl_ssm_conv(ctx, dst);
|
|
3944
4783
|
break;
|
|
4784
|
+
case GGML_OP_SSM_SCAN:
|
|
4785
|
+
ggml_sycl_ssm_scan(ctx, dst);
|
|
4786
|
+
break;
|
|
4787
|
+
case GGML_OP_FILL:
|
|
4788
|
+
ggml_sycl_fill(ctx, dst);
|
|
4789
|
+
break;
|
|
4790
|
+
case GGML_OP_CUMSUM:
|
|
4791
|
+
ggml_sycl_cumsum(ctx, dst);
|
|
4792
|
+
break;
|
|
4793
|
+
case GGML_OP_DIAG:
|
|
4794
|
+
ggml_sycl_diag(ctx, dst);
|
|
4795
|
+
break;
|
|
4796
|
+
case GGML_OP_SOLVE_TRI:
|
|
4797
|
+
ggml_sycl_solve_tri(ctx, dst);
|
|
4798
|
+
break;
|
|
3945
4799
|
case GGML_OP_ROLL:
|
|
3946
4800
|
ggml_sycl_roll(ctx, dst);
|
|
3947
4801
|
break;
|
|
3948
4802
|
case GGML_OP_ARANGE:
|
|
3949
4803
|
ggml_sycl_arange(ctx, dst);
|
|
3950
4804
|
break;
|
|
4805
|
+
case GGML_OP_FLASH_ATTN_EXT:
|
|
4806
|
+
ggml_sycl_flash_attn_ext(ctx, dst);
|
|
4807
|
+
break;
|
|
3951
4808
|
default:
|
|
3952
4809
|
return false;
|
|
3953
4810
|
}
|
|
@@ -3978,16 +4835,6 @@ void ggml_backend_sycl_get_device_memory(int device, size_t *free,
|
|
|
3978
4835
|
GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_get_device_memory\n");
|
|
3979
4836
|
ggml_sycl_set_device(device);
|
|
3980
4837
|
|
|
3981
|
-
/*
|
|
3982
|
-
DPCT1009:218: SYCL uses exceptions to report errors and does not use the
|
|
3983
|
-
error codes. The original code was commented out and a warning string was
|
|
3984
|
-
inserted. You need to rewrite this code.
|
|
3985
|
-
*/
|
|
3986
|
-
/*
|
|
3987
|
-
DPCT1106:217: 'cudaMemGetInfo' was migrated with the Intel extensions for
|
|
3988
|
-
device information which may not be supported by all compilers or runtimes.
|
|
3989
|
-
You may need to adjust the code.
|
|
3990
|
-
*/
|
|
3991
4838
|
SYCL_CHECK(CHECK_TRY_ERROR(
|
|
3992
4839
|
dpct::dev_mgr::instance().get_device(device).get_memory_info(*free, *total)));
|
|
3993
4840
|
}
|
|
@@ -4109,6 +4956,9 @@ static void ggml_backend_sycl_graph_compute_impl(ggml_backend_sycl_context * syc
|
|
|
4109
4956
|
if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
|
|
4110
4957
|
continue;
|
|
4111
4958
|
}
|
|
4959
|
+
if ((node->flags & GGML_TENSOR_FLAG_COMPUTE) == 0) {
|
|
4960
|
+
continue;
|
|
4961
|
+
}
|
|
4112
4962
|
#ifndef NDEBUG
|
|
4113
4963
|
assert(node->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device));
|
|
4114
4964
|
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
|
@@ -4252,6 +5102,8 @@ static ggml_backend_i ggml_backend_sycl_interface = {
|
|
|
4252
5102
|
/* .free = */ ggml_backend_sycl_free,
|
|
4253
5103
|
/* .set_tensor_async = */ ggml_backend_sycl_set_tensor_async,
|
|
4254
5104
|
/* .get_tensor_async = */ ggml_backend_sycl_get_tensor_async,
|
|
5105
|
+
/* .set_tensor_2d_async = */ NULL,
|
|
5106
|
+
/* .get_tensor_2d_async = */ NULL,
|
|
4255
5107
|
/* .cpy_tensor_async = */ NULL, // ggml_backend_sycl_cpy_tensor_async,
|
|
4256
5108
|
// // TODO: update for the new
|
|
4257
5109
|
// interface
|
|
@@ -4386,10 +5238,11 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
|
|
|
4386
5238
|
case GGML_UNARY_OP_GELU_QUICK:
|
|
4387
5239
|
case GGML_UNARY_OP_GELU_ERF:
|
|
4388
5240
|
case GGML_UNARY_OP_EXP:
|
|
5241
|
+
case GGML_UNARY_OP_SOFTPLUS:
|
|
4389
5242
|
case GGML_UNARY_OP_ELU:
|
|
5243
|
+
case GGML_UNARY_OP_CEIL:
|
|
4390
5244
|
return true;
|
|
4391
5245
|
case GGML_UNARY_OP_FLOOR:
|
|
4392
|
-
case GGML_UNARY_OP_CEIL:
|
|
4393
5246
|
case GGML_UNARY_OP_ROUND:
|
|
4394
5247
|
case GGML_UNARY_OP_TRUNC:
|
|
4395
5248
|
#if defined (GGML_SYCL_F16)
|
|
@@ -4419,26 +5272,19 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
|
|
|
4419
5272
|
struct ggml_tensor * a = op->src[0];
|
|
4420
5273
|
struct ggml_tensor * b = op->src[1];
|
|
4421
5274
|
|
|
4422
|
-
|
|
5275
|
+
// disable Q1_0 until implementation
|
|
5276
|
+
if (a->type == GGML_TYPE_Q1_0 || b->type == GGML_TYPE_Q1_0) {
|
|
4423
5277
|
return false;
|
|
4424
5278
|
}
|
|
4425
|
-
|
|
4426
|
-
if (
|
|
4427
|
-
a_type == GGML_TYPE_IQ3_XXS || a_type == GGML_TYPE_IQ3_S ||
|
|
4428
|
-
a_type == GGML_TYPE_IQ2_XXS || a_type == GGML_TYPE_IQ2_XS || a_type == GGML_TYPE_IQ2_S ||
|
|
4429
|
-
a_type == GGML_TYPE_IQ1_S || a_type == GGML_TYPE_IQ1_M
|
|
4430
|
-
) {
|
|
4431
|
-
if (b->ne[1] == 1 && ggml_nrows(b) > 1) {
|
|
4432
|
-
return false;
|
|
4433
|
-
}
|
|
4434
|
-
}
|
|
4435
|
-
ggml_type src0_type = op->src[0]->type;
|
|
4436
|
-
if (src0_type == GGML_TYPE_BF16 ) {
|
|
4437
|
-
// TODO: support GGML_TYPE_BF16
|
|
4438
|
-
// FIXME: keep a list of supported types to avoid breaking the backend when a new type is added
|
|
5279
|
+
|
|
5280
|
+
if (a->ne[3] != b->ne[3]) {
|
|
4439
5281
|
return false;
|
|
4440
5282
|
}
|
|
4441
5283
|
|
|
5284
|
+
ggml_type src0_type = op->src[0]->type;
|
|
5285
|
+
|
|
5286
|
+
|
|
5287
|
+
|
|
4442
5288
|
// TODO: The configuration below needs more work to be supported with oneDNN
|
|
4443
5289
|
if (ggml_is_permuted(a) && !ggml_is_contiguous(a) &&
|
|
4444
5290
|
a->ne[2] > 1 && a->ne[3] > 1 && src0_type == GGML_TYPE_F16) {
|
|
@@ -4457,12 +5303,31 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
|
|
|
4457
5303
|
case GGML_OP_GET_ROWS:
|
|
4458
5304
|
{
|
|
4459
5305
|
switch (op->src[0]->type) {
|
|
5306
|
+
case GGML_TYPE_I32:
|
|
4460
5307
|
case GGML_TYPE_F16:
|
|
5308
|
+
case GGML_TYPE_BF16:
|
|
4461
5309
|
case GGML_TYPE_F32:
|
|
5310
|
+
case GGML_TYPE_Q1_0:
|
|
5311
|
+
case GGML_TYPE_MXFP4:
|
|
5312
|
+
case GGML_TYPE_NVFP4:
|
|
5313
|
+
case GGML_TYPE_IQ2_XXS:
|
|
5314
|
+
case GGML_TYPE_IQ2_XS:
|
|
5315
|
+
case GGML_TYPE_IQ2_S:
|
|
5316
|
+
case GGML_TYPE_IQ3_XXS:
|
|
5317
|
+
case GGML_TYPE_IQ1_S:
|
|
5318
|
+
case GGML_TYPE_IQ1_M:
|
|
5319
|
+
case GGML_TYPE_IQ3_S:
|
|
5320
|
+
case GGML_TYPE_IQ4_NL:
|
|
5321
|
+
case GGML_TYPE_IQ4_XS:
|
|
5322
|
+
case GGML_TYPE_Q2_K:
|
|
5323
|
+
case GGML_TYPE_Q3_K:
|
|
4462
5324
|
case GGML_TYPE_Q4_0:
|
|
4463
5325
|
case GGML_TYPE_Q4_1:
|
|
5326
|
+
case GGML_TYPE_Q4_K:
|
|
4464
5327
|
case GGML_TYPE_Q5_0:
|
|
4465
5328
|
case GGML_TYPE_Q5_1:
|
|
5329
|
+
case GGML_TYPE_Q5_K:
|
|
5330
|
+
case GGML_TYPE_Q6_K:
|
|
4466
5331
|
case GGML_TYPE_Q8_0:
|
|
4467
5332
|
return true;
|
|
4468
5333
|
default:
|
|
@@ -4588,18 +5453,23 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
|
|
|
4588
5453
|
return (op->type == GGML_TYPE_F32 && op->src[0]->type == GGML_TYPE_F32) && (op->type == op->src[0]->type);
|
|
4589
5454
|
#endif
|
|
4590
5455
|
case GGML_OP_NORM:
|
|
4591
|
-
return true;
|
|
4592
5456
|
case GGML_OP_L2_NORM:
|
|
4593
5457
|
case GGML_OP_GROUP_NORM:
|
|
4594
|
-
return ggml_is_contiguous(op->src[0]);
|
|
4595
5458
|
case GGML_OP_RMS_NORM:
|
|
4596
|
-
return
|
|
5459
|
+
return true;
|
|
4597
5460
|
case GGML_OP_RMS_NORM_BACK:
|
|
4598
|
-
return (
|
|
5461
|
+
return ggml_is_contiguous(op->src[0]);
|
|
4599
5462
|
case GGML_OP_SCALE:
|
|
4600
5463
|
return true;
|
|
4601
5464
|
case GGML_OP_CONT:
|
|
4602
5465
|
return op->src[0]->type != GGML_TYPE_BF16;
|
|
5466
|
+
case GGML_OP_TRI:
|
|
5467
|
+
{
|
|
5468
|
+
const ggml_tensor * src0 = op->src[0];
|
|
5469
|
+
return src0 &&
|
|
5470
|
+
op->type == GGML_TYPE_F32 &&
|
|
5471
|
+
ggml_is_contiguous(src0);
|
|
5472
|
+
}
|
|
4603
5473
|
case GGML_OP_DIAG_MASK_INF:
|
|
4604
5474
|
return true;
|
|
4605
5475
|
case GGML_OP_SOFT_MAX:
|
|
@@ -4610,10 +5480,11 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
|
|
|
4610
5480
|
return max_bias == 0.0f;
|
|
4611
5481
|
}
|
|
4612
5482
|
case GGML_OP_ROPE:
|
|
5483
|
+
case GGML_OP_ROPE_BACK:
|
|
4613
5484
|
case GGML_OP_IM2COL:
|
|
4614
|
-
|
|
5485
|
+
case GGML_OP_IM2COL_3D:
|
|
4615
5486
|
case GGML_OP_UPSCALE:
|
|
4616
|
-
return
|
|
5487
|
+
return true;
|
|
4617
5488
|
case GGML_OP_SUM:
|
|
4618
5489
|
case GGML_OP_SUM_ROWS:
|
|
4619
5490
|
case GGML_OP_MEAN:
|
|
@@ -4621,20 +5492,30 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
|
|
|
4621
5492
|
case GGML_OP_ARGSORT:
|
|
4622
5493
|
return op->src[0]->ne[0] * sizeof(int) <=
|
|
4623
5494
|
ggml_sycl_info().devices[device].smpbo;
|
|
5495
|
+
case GGML_OP_TOP_K: {
|
|
5496
|
+
const ggml_tensor * src0 = op->src[0];
|
|
5497
|
+
const int k = op->ne[0];
|
|
5498
|
+
return src0 &&
|
|
5499
|
+
op->type == GGML_TYPE_I32 &&
|
|
5500
|
+
src0->type == GGML_TYPE_F32 &&
|
|
5501
|
+
ggml_is_contiguous(src0) &&
|
|
5502
|
+
k > 0 && k <= 32;
|
|
5503
|
+
}
|
|
4624
5504
|
case GGML_OP_POOL_2D:
|
|
4625
|
-
case GGML_OP_ACC:
|
|
4626
5505
|
return true;
|
|
5506
|
+
case GGML_OP_ACC:
|
|
5507
|
+
return ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]);
|
|
4627
5508
|
case GGML_OP_PAD:
|
|
4628
|
-
// TODO: add circular padding support for syscl, see https://github.com/ggml-org/llama.cpp/pull/16985
|
|
4629
5509
|
if (ggml_get_op_params_i32(op, 8) != 0) {
|
|
4630
5510
|
return false;
|
|
4631
5511
|
}
|
|
4632
|
-
return
|
|
5512
|
+
return true;
|
|
4633
5513
|
case GGML_OP_LEAKY_RELU:
|
|
4634
5514
|
case GGML_OP_TIMESTEP_EMBEDDING:
|
|
4635
5515
|
case GGML_OP_RWKV_WKV6:
|
|
4636
5516
|
case GGML_OP_RWKV_WKV7:
|
|
4637
5517
|
case GGML_OP_GATED_LINEAR_ATTN:
|
|
5518
|
+
case GGML_OP_GATED_DELTA_NET:
|
|
4638
5519
|
return true;
|
|
4639
5520
|
case GGML_OP_SSM_CONV:
|
|
4640
5521
|
return op->type == GGML_TYPE_F32 &&
|
|
@@ -4644,6 +5525,23 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
|
|
|
4644
5525
|
return op->type == GGML_TYPE_F32;
|
|
4645
5526
|
case GGML_OP_ARANGE:
|
|
4646
5527
|
return op->type == GGML_TYPE_F32;
|
|
5528
|
+
case GGML_OP_SSM_SCAN:
|
|
5529
|
+
if (op->src[3]->ne[0] == 1) {
|
|
5530
|
+
// Mamba2
|
|
5531
|
+
// (kernel only supports (d_state == 128 || d_state == 256) && d_head % WARP_SIZE == 0)
|
|
5532
|
+
return (op->src[0]->ne[0] == 128 || op->src[0]->ne[0] == 256) && op->src[0]->ne[1] % WARP_SIZE == 0;
|
|
5533
|
+
} else {
|
|
5534
|
+
// TODO Mamba-1 not yet ported to SYCL
|
|
5535
|
+
return false;
|
|
5536
|
+
}
|
|
5537
|
+
case GGML_OP_FILL:
|
|
5538
|
+
case GGML_OP_CUMSUM:
|
|
5539
|
+
case GGML_OP_DIAG:
|
|
5540
|
+
return true;
|
|
5541
|
+
case GGML_OP_SOLVE_TRI:
|
|
5542
|
+
return op->src[0]->ne[0] <= SYCL_SOLVE_TRI_MAX_N && op->src[1]->ne[0] <= SYCL_SOLVE_TRI_MAX_K;
|
|
5543
|
+
case GGML_OP_FLASH_ATTN_EXT:
|
|
5544
|
+
return ggml_sycl_flash_attn_ext_supported(device, op);
|
|
4647
5545
|
default:
|
|
4648
5546
|
return false;
|
|
4649
5547
|
}
|