whispercpp 1.3.6 → 1.3.7
This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.document +3 -0
- data/.rdoc_options +2 -0
- data/README.md +38 -5
- data/Rakefile +18 -3
- data/ext/dependencies.rb +10 -4
- data/ext/dependencies_for_windows.rb +17 -0
- data/ext/extconf.rb +20 -8
- data/ext/options.rb +54 -14
- data/ext/options_for_windows.rb +51 -0
- data/ext/ruby_whisper.c +36 -42
- data/ext/ruby_whisper.h +135 -0
- data/ext/ruby_whisper_context.c +107 -28
- data/ext/ruby_whisper_log_queue.c +180 -0
- data/ext/ruby_whisper_log_settable.h +47 -0
- data/ext/ruby_whisper_parakeet.c +49 -0
- data/ext/ruby_whisper_parakeet_context.c +304 -0
- data/ext/ruby_whisper_parakeet_context_params.c +117 -0
- data/ext/ruby_whisper_parakeet_model.c +84 -0
- data/ext/ruby_whisper_parakeet_params.c +548 -0
- data/ext/ruby_whisper_parakeet_segment.c +157 -0
- data/ext/ruby_whisper_parakeet_token.c +188 -0
- data/ext/ruby_whisper_parakeet_transcribe.cpp +58 -0
- data/ext/ruby_whisper_params.c +256 -65
- data/ext/ruby_whisper_segment.c +6 -6
- data/ext/ruby_whisper_transcribe.cpp +42 -15
- data/ext/sources/CMakeLists.txt +41 -3
- data/ext/sources/CMakePresets.json +95 -0
- data/ext/sources/cmake/parakeet-config.cmake.in +30 -0
- data/ext/sources/cmake/parakeet.pc.in +10 -0
- data/ext/sources/cmake/whisper.pc.in +1 -1
- data/ext/sources/examples/CMakeLists.txt +4 -2
- data/ext/sources/examples/bench/bench.cpp +1 -1
- data/ext/sources/examples/cli/cli.cpp +43 -9
- data/ext/sources/examples/common-ggml.cpp +2 -0
- data/ext/sources/examples/common-whisper.cpp +139 -67
- data/ext/sources/examples/common-whisper.h +11 -0
- data/ext/sources/examples/ffmpeg-transcode.cpp +211 -341
- data/ext/sources/examples/parakeet-cli/CMakeLists.txt +8 -0
- data/ext/sources/examples/parakeet-cli/parakeet-cli.cpp +243 -0
- data/ext/sources/examples/parakeet-quantize/CMakeLists.txt +7 -0
- data/ext/sources/examples/parakeet-quantize/parakeet-quantize.cpp +230 -0
- data/ext/sources/examples/server/server.cpp +199 -163
- data/ext/sources/ggml/CMakeLists.txt +21 -13
- data/ext/sources/ggml/cmake/FindNCCL.cmake +36 -0
- data/ext/sources/ggml/cmake/ggml-config.cmake.in +12 -2
- data/ext/sources/ggml/include/ggml-alloc.h +1 -0
- data/ext/sources/ggml/include/ggml-backend.h +72 -10
- data/ext/sources/ggml/include/ggml-cuda.h +3 -0
- data/ext/sources/ggml/include/ggml-rpc.h +3 -3
- data/ext/sources/ggml/include/ggml.h +101 -9
- data/ext/sources/ggml/include/gguf.h +10 -2
- data/ext/sources/ggml/src/CMakeLists.txt +22 -5
- data/ext/sources/ggml/src/ggml-alloc.c +5 -1
- data/ext/sources/ggml/src/ggml-backend-impl.h +22 -2
- data/ext/sources/ggml/src/ggml-backend-meta.cpp +2263 -0
- data/ext/sources/ggml/src/ggml-backend-reg.cpp +12 -0
- data/ext/sources/ggml/src/ggml-backend.cpp +110 -9
- data/ext/sources/ggml/src/ggml-blas/ggml-blas.cpp +4 -0
- data/ext/sources/ggml/src/ggml-cann/aclnn_ops.cpp +672 -257
- data/ext/sources/ggml/src/ggml-cann/aclnn_ops.h +71 -0
- data/ext/sources/ggml/src/ggml-cann/common.h +20 -10
- data/ext/sources/ggml/src/ggml-cann/ggml-cann.cpp +211 -30
- data/ext/sources/ggml/src/ggml-common.h +11 -0
- data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +58 -29
- data/ext/sources/ggml/src/ggml-cpu/amx/amx.cpp +2 -0
- data/ext/sources/ggml/src/ggml-cpu/amx/mmq.cpp +16 -16
- data/ext/sources/ggml/src/ggml-cpu/arch/arm/quants.c +116 -7
- data/ext/sources/ggml/src/ggml-cpu/arch/arm/repack.cpp +65 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/loongarch/quants.c +151 -1
- data/ext/sources/ggml/src/ggml-cpu/arch/powerpc/quants.c +0 -1
- data/ext/sources/ggml/src/ggml-cpu/arch/riscv/quants.c +4279 -1292
- data/ext/sources/ggml/src/ggml-cpu/arch/riscv/repack.cpp +5 -35
- data/ext/sources/ggml/src/ggml-cpu/arch/s390/quants.c +0 -1
- data/ext/sources/ggml/src/ggml-cpu/arch/wasm/quants.c +72 -1
- data/ext/sources/ggml/src/ggml-cpu/arch/x86/quants.c +177 -27
- data/ext/sources/ggml/src/ggml-cpu/arch/x86/repack.cpp +1 -1
- data/ext/sources/ggml/src/ggml-cpu/arch-fallback.h +5 -0
- data/ext/sources/ggml/src/ggml-cpu/cmake/FindSMTIME.cmake +32 -0
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-impl.h +10 -0
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +95 -5
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.cpp +2 -0
- data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +146 -134
- data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +88 -70
- data/ext/sources/ggml/src/ggml-cpu/ops.cpp +372 -73
- data/ext/sources/ggml/src/ggml-cpu/ops.h +3 -0
- data/ext/sources/ggml/src/ggml-cpu/quants.c +55 -0
- data/ext/sources/ggml/src/ggml-cpu/quants.h +3 -0
- data/ext/sources/ggml/src/ggml-cpu/repack.cpp +3 -0
- data/ext/sources/ggml/src/ggml-cpu/simd-gemm.h +90 -0
- data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +3 -16
- data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.cpp +1402 -687
- data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.h +8 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp +597 -2766
- data/ext/sources/ggml/src/ggml-cpu/spacemit/ime2_kernels.cpp +5768 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/ime_env.cpp +320 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/ime_env.h +55 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/ime_kernels.h +182 -19
- data/ext/sources/ggml/src/ggml-cpu/spacemit/repack.cpp +1795 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/repack.h +14 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/rvv_kernels.cpp +3178 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/rvv_kernels.h +95 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/spine_barrier.h +34 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/spine_mem_pool.cpp +760 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/spine_mem_pool.h +32 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/spine_tcm.h +409 -0
- data/ext/sources/ggml/src/ggml-cpu/vec.cpp +37 -53
- data/ext/sources/ggml/src/ggml-cpu/vec.h +225 -240
- data/ext/sources/ggml/src/ggml-cuda/CMakeLists.txt +17 -7
- data/ext/sources/ggml/src/ggml-cuda/allreduce.cu +971 -0
- data/ext/sources/ggml/src/ggml-cuda/allreduce.cuh +29 -0
- data/ext/sources/ggml/src/ggml-cuda/argsort.cu +62 -26
- data/ext/sources/ggml/src/ggml-cuda/binbcast.cu +44 -18
- data/ext/sources/ggml/src/ggml-cuda/binbcast.cuh +1 -0
- data/ext/sources/ggml/src/ggml-cuda/common.cuh +242 -28
- data/ext/sources/ggml/src/ggml-cuda/concat.cu +120 -114
- data/ext/sources/ggml/src/ggml-cuda/conv2d-transpose.cu +45 -21
- data/ext/sources/ggml/src/ggml-cuda/conv2d-transpose.cuh +1 -0
- data/ext/sources/ggml/src/ggml-cuda/convert.cu +53 -0
- data/ext/sources/ggml/src/ggml-cuda/convert.cuh +10 -0
- data/ext/sources/ggml/src/ggml-cuda/cpy.cu +14 -6
- data/ext/sources/ggml/src/ggml-cuda/dequantize.cuh +22 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +278 -44
- data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +331 -130
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cu +12 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cuh +126 -27
- data/ext/sources/ggml/src/ggml-cuda/fattn-vec.cuh +40 -15
- data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +18 -9
- data/ext/sources/ggml/src/ggml-cuda/fattn.cu +152 -49
- data/ext/sources/ggml/src/ggml-cuda/fattn.cuh +2 -0
- data/ext/sources/ggml/src/ggml-cuda/fwht.cu +101 -0
- data/ext/sources/ggml/src/ggml-cuda/fwht.cuh +4 -0
- data/ext/sources/ggml/src/ggml-cuda/gated_delta_net.cu +84 -35
- data/ext/sources/ggml/src/ggml-cuda/getrows.cu +34 -12
- data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +1069 -609
- data/ext/sources/ggml/src/ggml-cuda/im2col.cu +32 -29
- data/ext/sources/ggml/src/ggml-cuda/mean.cu +4 -2
- data/ext/sources/ggml/src/ggml-cuda/mma.cuh +242 -195
- data/ext/sources/ggml/src/ggml-cuda/mmf.cuh +3 -3
- data/ext/sources/ggml/src/ggml-cuda/mmq.cu +18 -12
- data/ext/sources/ggml/src/ggml-cuda/mmq.cuh +502 -423
- data/ext/sources/ggml/src/ggml-cuda/mmvf.cu +19 -12
- data/ext/sources/ggml/src/ggml-cuda/mmvq.cu +485 -57
- data/ext/sources/ggml/src/ggml-cuda/mmvq.cuh +6 -1
- data/ext/sources/ggml/src/ggml-cuda/norm.cu +36 -10
- data/ext/sources/ggml/src/ggml-cuda/out-prod.cu +23 -7
- data/ext/sources/ggml/src/ggml-cuda/quantize.cu +133 -26
- data/ext/sources/ggml/src/ggml-cuda/quantize.cuh +1 -1
- data/ext/sources/ggml/src/ggml-cuda/reduce_rows.cuh +5 -1
- data/ext/sources/ggml/src/ggml-cuda/rope.cu +11 -4
- data/ext/sources/ggml/src/ggml-cuda/scale.cu +4 -1
- data/ext/sources/ggml/src/ggml-cuda/set-rows.cu +14 -6
- data/ext/sources/ggml/src/ggml-cuda/snake.cu +72 -0
- data/ext/sources/ggml/src/ggml-cuda/snake.cuh +8 -0
- data/ext/sources/ggml/src/ggml-cuda/softcap.cu +4 -1
- data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cu +45 -13
- data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cuh +1 -1
- data/ext/sources/ggml/src/ggml-cuda/ssm-scan.cu +40 -18
- data/ext/sources/ggml/src/ggml-cuda/sumrows.cu +8 -4
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_16.cu +1 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_32.cu +1 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_8.cu +2 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_4.cu +1 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_16.cu +1 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_32.cu +1 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_4.cu +1 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_8.cu +2 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_16.cu +1 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_4.cu +1 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_8.cu +2 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_4.cu +1 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_8.cu +2 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq192-dv128.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq320-dv256.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq512-dv512.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-bf16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-f16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q4_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q4_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q5_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q5_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q8_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-bf16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-bf16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-bf16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-bf16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-bf16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-bf16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-nvfp4.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q1_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/top-k.cu +5 -4
- data/ext/sources/ggml/src/ggml-cuda/topk-moe.cu +26 -23
- data/ext/sources/ggml/src/ggml-cuda/unary.cu +31 -2
- data/ext/sources/ggml/src/ggml-cuda/unary.cuh +2 -0
- data/ext/sources/ggml/src/ggml-cuda/vecdotq.cuh +80 -0
- data/ext/sources/ggml/src/ggml-cuda/vendors/cuda.h +7 -2
- data/ext/sources/ggml/src/ggml-cuda/vendors/hip.h +22 -4
- data/ext/sources/ggml/src/ggml-cuda/vendors/musa.h +3 -0
- data/ext/sources/ggml/src/ggml-hexagon/CMakeLists.txt +2 -1
- data/ext/sources/ggml/src/ggml-hexagon/ggml-hexagon.cpp +1428 -743
- data/ext/sources/ggml/src/ggml-hexagon/htp/CMakeLists.txt +45 -7
- data/ext/sources/ggml/src/ggml-hexagon/htp/act-ops.c +53 -84
- data/ext/sources/ggml/src/ggml-hexagon/htp/argsort-ops.c +25 -12
- data/ext/sources/ggml/src/ggml-hexagon/htp/binary-ops.c +165 -184
- data/ext/sources/ggml/src/ggml-hexagon/htp/cmake-toolchain.cmake +5 -5
- data/ext/sources/ggml/src/ggml-hexagon/htp/concat-ops.c +277 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/cpy-ops.c +170 -127
- data/ext/sources/ggml/src/ggml-hexagon/htp/cumsum-ops.c +270 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/diag-ops.c +216 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/fill-ops.c +123 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/flash-attn-ops.c +125 -97
- data/ext/sources/ggml/src/ggml-hexagon/htp/gated-delta-net-ops.c +1148 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/get-rows-ops.c +148 -42
- data/ext/sources/ggml/src/ggml-hexagon/htp/hex-dma.c +2 -2
- data/ext/sources/ggml/src/ggml-hexagon/htp/hex-dma.h +252 -62
- data/ext/sources/ggml/src/ggml-hexagon/htp/hex-dump.h +9 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hex-utils.h +87 -1
- data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-flash-attn-ops.c +1878 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c +2066 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-ops.c +6 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-ops.h +88 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-profile.h +34 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-queue.c +158 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-queue.h +134 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-utils.h +200 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ctx.h +96 -13
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ops.h +182 -57
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp_iface.idl +9 -3
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-base.h +71 -3
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-copy.h +27 -10
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-div.h +63 -23
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-exp.h +9 -8
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-flash-attn.h +47 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-log.h +65 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-pow.h +42 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-repl.h +74 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sigmoid.h +1 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sin-cos.h +90 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-utils.h +5 -8
- data/ext/sources/ggml/src/ggml-hexagon/htp/main.c +529 -815
- data/ext/sources/ggml/src/ggml-hexagon/htp/matmul-ops.c +2522 -234
- data/ext/sources/ggml/src/ggml-hexagon/htp/pad-ops.c +547 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/repeat-ops.c +148 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/rope-ops.c +291 -95
- data/ext/sources/ggml/src/ggml-hexagon/htp/set-rows-ops.c +59 -37
- data/ext/sources/ggml/src/ggml-hexagon/htp/softmax-ops.c +121 -133
- data/ext/sources/ggml/src/ggml-hexagon/htp/solve-tri-ops.c +267 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/ssm-conv.c +244 -151
- data/ext/sources/ggml/src/ggml-hexagon/htp/sum-rows-ops.c +6 -6
- data/ext/sources/ggml/src/ggml-hexagon/htp/unary-ops.c +719 -45
- data/ext/sources/ggml/src/ggml-hexagon/htp/vtcm-utils.h +16 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp-opnode.h +272 -0
- data/ext/sources/ggml/src/ggml-hexagon/libggml-htp.inf +3 -1
- data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +22 -9
- data/ext/sources/ggml/src/ggml-impl.h +6 -1
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.cpp +138 -13
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.h +32 -1
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.m +164 -28
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +80 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.cpp +190 -19
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.h +2 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal.cpp +39 -26
- data/ext/sources/ggml/src/ggml-metal/ggml-metal.metal +823 -322
- data/ext/sources/ggml/src/ggml-musa/CMakeLists.txt +5 -6
- data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +54 -5
- data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +12248 -5907
- data/ext/sources/ggml/src/ggml-opencl/kernels/concat.cl +67 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/cpy.cl +59 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/cvt.cl +1819 -112
- data/ext/sources/ggml/src/ggml-opencl/kernels/gated_delta_net.cl +249 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_mxfp4_f32_ns.cl +306 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q4_0_f32_ns.cl +256 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q4_1_f32_ns.cl +258 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q4_k_f32_ns.cl +283 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q5_0_f32_ns.cl +260 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q5_1_f32_ns.cl +262 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q5_k_f32_ns.cl +288 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q6_k_f32_ns.cl +267 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_iq4_nl_f32.cl +150 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/{mul_mat_Ab_Bi_8x4.cl → gemm_noshuffle_q4_0_f32.cl} +1 -1
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q4_k_f32.cl +172 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q5_0_f32.cl +131 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q5_1_f32.cl +134 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q5_k_f32.cl +176 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q6_k_f32.cl +140 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/{mul_mm_q8_0_f32_8x4.cl → gemm_noshuffle_q8_0_f32.cl} +1 -1
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_xmem_f16_f32_os8.cl +233 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_mxfp4_f32_ns.cl +165 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q4_0_f32_ns.cl +120 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q4_1_f32_ns.cl +123 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q4_k_f32_ns.cl +155 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q5_0_f32_ns.cl +123 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q5_1_f32_ns.cl +125 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q5_k_f32_ns.cl +160 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q6_k_f32_ns.cl +141 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_iq4_nl_f32.cl +302 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/{gemv_noshuffle_general.cl → gemv_noshuffle_q4_0_f32.cl} +5 -5
- data/ext/sources/ggml/src/ggml-opencl/kernels/{gemv_noshuffle.cl → gemv_noshuffle_q4_0_f32_spec.cl} +5 -5
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q4_k_f32.cl +318 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q5_0_f32.cl +291 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q5_1_f32.cl +294 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q5_k_f32.cl +326 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q6_k_f32.cl +293 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/get_rows.cl +15 -9
- data/ext/sources/ggml/src/ggml-opencl/kernels/moe_reorder_b.cl +30 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/moe_sort_by_expert.cl +82 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_iq4_nl_f32_l4_lm.cl +171 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q4_k_f32_l4_lm.cl +179 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q5_0_f32_l4_lm.cl +173 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q5_1_f32_l4_lm.cl +175 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q5_k_f32_l4_lm.cl +192 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_iq4_nl_f32.cl +164 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_iq4_nl_f32_flat.cl +202 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_k_f32_flat.cl +196 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_0_f32.cl +241 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_0_f32_flat.cl +243 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_1_f32.cl +243 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_1_f32_flat.cl +247 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_k_f32.cl +187 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_k_f32_flat.cl +203 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32_flat.cl +48 -64
- data/ext/sources/ggml/src/ggml-openvino/ggml-decoder.cpp +15 -5
- data/ext/sources/ggml/src/ggml-openvino/ggml-openvino-extra.cpp +18 -11
- data/ext/sources/ggml/src/ggml-openvino/ggml-openvino.cpp +35 -13
- data/ext/sources/ggml/src/ggml-openvino/ggml-quants.cpp +264 -192
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/rope.cpp +33 -7
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/unary_gelu.cpp +25 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op_table.cpp +1 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op_table.h +1 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/rt_info/weightless_caching_attributes.hpp +41 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/translate_session.cpp +27 -3
- data/ext/sources/ggml/src/ggml-openvino/openvino/utils.cpp +67 -36
- data/ext/sources/ggml/src/ggml-openvino/openvino/utils.h +1 -0
- data/ext/sources/ggml/src/ggml-openvino/utils.cpp +101 -44
- data/ext/sources/ggml/src/ggml-openvino/utils.h +23 -3
- data/ext/sources/ggml/src/ggml-opt.cpp +1 -0
- data/ext/sources/ggml/src/ggml-quants.c +289 -114
- data/ext/sources/ggml/src/ggml-quants.h +3 -0
- data/ext/sources/ggml/src/ggml-rpc/CMakeLists.txt +24 -0
- data/ext/sources/ggml/src/ggml-rpc/ggml-rpc.cpp +167 -311
- data/ext/sources/ggml/src/ggml-rpc/transport.cpp +683 -0
- data/ext/sources/ggml/src/ggml-rpc/transport.h +34 -0
- data/ext/sources/ggml/src/ggml-sycl/CMakeLists.txt +50 -4
- data/ext/sources/ggml/src/ggml-sycl/add-id.cpp +1 -1
- data/ext/sources/ggml/src/ggml-sycl/backend.hpp +3 -1
- data/ext/sources/ggml/src/ggml-sycl/common.cpp +74 -2
- data/ext/sources/ggml/src/ggml-sycl/common.hpp +41 -1
- data/ext/sources/ggml/src/ggml-sycl/convert.cpp +115 -13
- data/ext/sources/ggml/src/ggml-sycl/convert.hpp +9 -0
- data/ext/sources/ggml/src/ggml-sycl/cumsum.cpp +148 -0
- data/ext/sources/ggml/src/ggml-sycl/cumsum.hpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/dequantize.hpp +663 -0
- data/ext/sources/ggml/src/ggml-sycl/diag.cpp +67 -0
- data/ext/sources/ggml/src/ggml-sycl/diag.hpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/dmmv.cpp +586 -6
- data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +1 -90
- data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +0 -2
- data/ext/sources/ggml/src/ggml-sycl/fattn-buffers.cpp +56 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn-buffers.hpp +63 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn-common.hpp +7 -5
- data/ext/sources/ggml/src/ggml-sycl/fattn-tile.cpp +4 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn-tile.hpp +76 -168
- data/ext/sources/ggml/src/ggml-sycl/fattn-vec.hpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn.cpp +3 -1
- data/ext/sources/ggml/src/ggml-sycl/fill.cpp +55 -0
- data/ext/sources/ggml/src/ggml-sycl/fill.hpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/gated_delta_net.cpp +69 -31
- data/ext/sources/ggml/src/ggml-sycl/gated_delta_net.hpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/gemm.hpp +3 -0
- data/ext/sources/ggml/src/ggml-sycl/getrows.cpp +79 -3
- data/ext/sources/ggml/src/ggml-sycl/ggml-sycl.cpp +823 -190
- data/ext/sources/ggml/src/ggml-sycl/im2col.cpp +353 -89
- data/ext/sources/ggml/src/ggml-sycl/im2col.hpp +5 -3
- data/ext/sources/ggml/src/ggml-sycl/mmvq.cpp +1344 -26
- data/ext/sources/ggml/src/ggml-sycl/mmvq.hpp +16 -0
- data/ext/sources/ggml/src/ggml-sycl/pad.cpp +27 -27
- data/ext/sources/ggml/src/ggml-sycl/quants.hpp +71 -0
- data/ext/sources/ggml/src/ggml-sycl/set_rows.cpp +7 -1
- data/ext/sources/ggml/src/ggml-sycl/solve_tri.cpp +172 -0
- data/ext/sources/ggml/src/ggml-sycl/solve_tri.hpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/ssm_conv.cpp +6 -1
- data/ext/sources/ggml/src/ggml-sycl/ssm_scan.cpp +156 -0
- data/ext/sources/ggml/src/ggml-sycl/ssm_scan.hpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/sycl_hw.cpp +62 -10
- data/ext/sources/ggml/src/ggml-sycl/sycl_hw.hpp +18 -6
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq512-dv512.cpp +6 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-f16.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_0.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_1.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_0.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_1.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q8_0.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-f16.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_0.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_1.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_0.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_1.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q8_0.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-f16.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_0.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_1.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_0.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_1.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q8_0.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-f16.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_0.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_1.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_0.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_1.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q8_0.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-f16.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_0.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_1.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_0.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_1.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q8_0.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-f16.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_0.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_1.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_0.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_1.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q8_0.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/type.hpp +112 -0
- data/ext/sources/ggml/src/ggml-sycl/upscale.cpp +410 -0
- data/ext/sources/ggml/src/ggml-sycl/upscale.hpp +9 -0
- data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +215 -53
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-buffer.cpp +4 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-device.cpp +2 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend.cpp +2 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-shm.cpp +1 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu.cpp +1 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu.h +0 -2
- data/ext/sources/ggml/src/ggml-vulkan/CMakeLists.txt +11 -0
- data/ext/sources/ggml/src/ggml-vulkan/ggml-vulkan.cpp +2060 -535
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/contig_copy.comp +6 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +146 -13
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy.comp +3 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +25 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.glsl +88 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.glsl +643 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_nvfp4.comp +32 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q1_0.comp +29 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/diag.comp +0 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dot_product_funcs.glsl +27 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/exp.comp +0 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/feature-tests/coopmat2_decode_vector.comp +7 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +197 -48
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.glsl +60 -59
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +115 -113
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +122 -31
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_dequant.glsl +131 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_mmq_funcs.glsl +203 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/fwht.comp +115 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gated_delta_net.comp +125 -64
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/generic_binary_head.glsl +0 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.glsl +10 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_main.glsl +16 -6
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +76 -54
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col_3d.comp +0 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/log.comp +0 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp +122 -27
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iface.glsl +6 -6
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp +1 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq_funcs.glsl +88 -55
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +11 -17
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +43 -10
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.glsl +159 -125
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.glsl +8 -8
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_shmem_types.glsl +24 -9
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp +0 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl +5 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.glsl +0 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl +3 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/snake.comp +49 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/ssm_conv.comp +11 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/tri.comp +0 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/types.glsl +79 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +171 -147
- data/ext/sources/ggml/src/ggml-webgpu/CMakeLists.txt +5 -2
- data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp +2202 -283
- data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu.cpp +2610 -1403
- data/ext/sources/ggml/src/ggml-webgpu/pre_wgsl.hpp +37 -7
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/add_id.wgsl +64 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary.wgsl +8 -7
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +76 -95
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/concat.wgsl +19 -1
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/conv2d.wgsl +165 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{cpy.tmpl.wgsl → cpy.wgsl} +25 -50
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn.wgsl +107 -184
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_quant_staging.tmpl +124 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_tile.wgsl +397 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_vec_blk.wgsl +101 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_vec_reduce.wgsl +84 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_vec_split.wgsl +619 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/gated_delta_net.wgsl +149 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/get_rows.wgsl +183 -78
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/glu.wgsl +155 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/im2col.wgsl +101 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl +655 -495
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_id.wgsl +195 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_id_gather.wgsl +52 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_id_vec.wgsl +154 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.wgsl +8 -6
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_subgroup_matrix.wgsl +5 -1
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.wgsl +80 -409
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec_acc.tmpl +1432 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec_q_acc.tmpl +303 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/quant_inner_loops.tmpl +21 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/quantize_q8.wgsl +173 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm_mul.wgsl +152 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{rope.tmpl.wgsl → rope.wgsl} +71 -142
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/row_norm.wgsl +153 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/scale.wgsl +6 -4
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set.wgsl +109 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +2 -3
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows_quant.wgsl +224 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{soft_max.tmpl.wgsl → soft_max.wgsl} +106 -206
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/solve_tri.wgsl +121 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/ssm_conv.wgsl +65 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/ssm_scan.wgsl +193 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/unary.wgsl +68 -48
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/upscale.wgsl +240 -0
- data/ext/sources/ggml/src/ggml-zdnn/ggml-zdnn.cpp +18 -14
- data/ext/sources/ggml/src/ggml-zendnn/CMakeLists.txt +1 -1
- data/ext/sources/ggml/src/ggml-zendnn/ggml-zendnn.cpp +244 -10
- data/ext/sources/ggml/src/ggml.c +110 -28
- data/ext/sources/ggml/src/gguf.cpp +173 -28
- data/ext/sources/include/parakeet.h +342 -0
- data/ext/sources/include/whisper.h +10 -0
- data/ext/sources/media/matmul.png +0 -0
- data/ext/sources/src/CMakeLists.txt +23 -0
- data/ext/sources/src/parakeet-arch.h +188 -0
- data/ext/sources/src/parakeet.cpp +3838 -0
- data/ext/sources/src/whisper.cpp +56 -12
- data/extsources.rb +26 -10
- data/lib/whisper/log_settable.rb +36 -0
- data/lib/whisper/model/uri.rb +13 -1
- data/lib/whisper/output.rb +74 -0
- data/sig/whisper.rbs +411 -62
- data/test/helper.rb +2 -0
- data/test/jfk_reader/jfk_reader.c +50 -7
- data/test/test_callback.rb +1 -0
- data/test/test_package.rb +6 -5
- data/test/test_parakeet.rb +28 -0
- data/test/test_parakeet_callback.rb +107 -0
- data/test/test_parakeet_context.rb +116 -0
- data/test/test_parakeet_context_params.rb +24 -0
- data/test/test_parakeet_model.rb +21 -0
- data/test/test_parakeet_params.rb +78 -0
- data/test/test_parakeet_segment.rb +42 -0
- data/test/test_parakeet_token.rb +73 -0
- data/test/test_params.rb +2 -0
- data/test/test_vad_segment.rb +1 -1
- data/test/test_whisper.rb +24 -6
- data/whispercpp.gemspec +2 -2
- metadata +215 -281
- data/ext/sources/bindings/javascript/CMakeLists.txt +0 -41
- data/ext/sources/bindings/javascript/emscripten.cpp +0 -93
- data/ext/sources/bindings/javascript/libwhisper.worker.js +0 -1
- data/ext/sources/bindings/javascript/package.json +0 -26
- data/ext/sources/bindings/javascript/whisper.js +0 -19
- data/ext/sources/examples/addon.node/CMakeLists.txt +0 -31
- data/ext/sources/examples/addon.node/__test__/whisper.spec.js +0 -133
- data/ext/sources/examples/addon.node/addon.cpp +0 -557
- data/ext/sources/examples/addon.node/index.js +0 -59
- data/ext/sources/examples/addon.node/package.json +0 -16
- data/ext/sources/examples/addon.node/vad-example.js +0 -132
- data/ext/sources/examples/bench.wasm/CMakeLists.txt +0 -49
- data/ext/sources/examples/bench.wasm/emscripten.cpp +0 -87
- data/ext/sources/examples/bench.wasm/index-tmpl.html +0 -285
- data/ext/sources/examples/coi-serviceworker.js +0 -146
- data/ext/sources/examples/command/CMakeLists.txt +0 -10
- data/ext/sources/examples/command/command.cpp +0 -802
- data/ext/sources/examples/command/commands.txt +0 -9
- data/ext/sources/examples/command.wasm/CMakeLists.txt +0 -50
- data/ext/sources/examples/command.wasm/emscripten.cpp +0 -327
- data/ext/sources/examples/command.wasm/index-tmpl.html +0 -415
- data/ext/sources/examples/generate-karaoke.sh +0 -57
- data/ext/sources/examples/helpers.js +0 -191
- data/ext/sources/examples/livestream.sh +0 -112
- data/ext/sources/examples/lsp/CMakeLists.txt +0 -10
- data/ext/sources/examples/lsp/lsp.cpp +0 -471
- data/ext/sources/examples/lsp/whisper.vim +0 -362
- data/ext/sources/examples/python/test_whisper_processor.py +0 -7
- data/ext/sources/examples/python/whisper_processor.py +0 -54
- data/ext/sources/examples/server/bench.js +0 -29
- data/ext/sources/examples/server.py +0 -120
- data/ext/sources/examples/stream/CMakeLists.txt +0 -10
- data/ext/sources/examples/stream/stream.cpp +0 -437
- data/ext/sources/examples/stream.wasm/CMakeLists.txt +0 -49
- data/ext/sources/examples/stream.wasm/emscripten.cpp +0 -216
- data/ext/sources/examples/stream.wasm/index-tmpl.html +0 -491
- data/ext/sources/examples/sycl/CMakeLists.txt +0 -9
- data/ext/sources/examples/sycl/build.sh +0 -22
- data/ext/sources/examples/sycl/ls-sycl-device.cpp +0 -11
- data/ext/sources/examples/sycl/run-whisper.sh +0 -17
- data/ext/sources/examples/talk-llama/CMakeLists.txt +0 -48
- data/ext/sources/examples/talk-llama/eleven-labs.py +0 -80
- data/ext/sources/examples/talk-llama/llama-adapter.cpp +0 -488
- data/ext/sources/examples/talk-llama/llama-adapter.h +0 -89
- data/ext/sources/examples/talk-llama/llama-arch.cpp +0 -2877
- data/ext/sources/examples/talk-llama/llama-arch.h +0 -628
- data/ext/sources/examples/talk-llama/llama-batch.cpp +0 -919
- data/ext/sources/examples/talk-llama/llama-batch.h +0 -173
- data/ext/sources/examples/talk-llama/llama-chat.cpp +0 -896
- data/ext/sources/examples/talk-llama/llama-chat.h +0 -71
- data/ext/sources/examples/talk-llama/llama-context.cpp +0 -3633
- data/ext/sources/examples/talk-llama/llama-context.h +0 -359
- data/ext/sources/examples/talk-llama/llama-cparams.cpp +0 -5
- data/ext/sources/examples/talk-llama/llama-cparams.h +0 -47
- data/ext/sources/examples/talk-llama/llama-ext.h +0 -12
- data/ext/sources/examples/talk-llama/llama-grammar.cpp +0 -1464
- data/ext/sources/examples/talk-llama/llama-grammar.h +0 -194
- data/ext/sources/examples/talk-llama/llama-graph.cpp +0 -2735
- data/ext/sources/examples/talk-llama/llama-graph.h +0 -1031
- data/ext/sources/examples/talk-llama/llama-hparams.cpp +0 -258
- data/ext/sources/examples/talk-llama/llama-hparams.h +0 -353
- data/ext/sources/examples/talk-llama/llama-impl.cpp +0 -171
- data/ext/sources/examples/talk-llama/llama-impl.h +0 -75
- data/ext/sources/examples/talk-llama/llama-io.cpp +0 -15
- data/ext/sources/examples/talk-llama/llama-io.h +0 -35
- data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.cpp +0 -330
- data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.h +0 -137
- data/ext/sources/examples/talk-llama/llama-kv-cache.cpp +0 -2285
- data/ext/sources/examples/talk-llama/llama-kv-cache.h +0 -389
- data/ext/sources/examples/talk-llama/llama-kv-cells.h +0 -533
- data/ext/sources/examples/talk-llama/llama-memory-hybrid-iswa.cpp +0 -275
- data/ext/sources/examples/talk-llama/llama-memory-hybrid-iswa.h +0 -140
- data/ext/sources/examples/talk-llama/llama-memory-hybrid.cpp +0 -268
- data/ext/sources/examples/talk-llama/llama-memory-hybrid.h +0 -139
- data/ext/sources/examples/talk-llama/llama-memory-recurrent.cpp +0 -1165
- data/ext/sources/examples/talk-llama/llama-memory-recurrent.h +0 -182
- data/ext/sources/examples/talk-llama/llama-memory.cpp +0 -59
- data/ext/sources/examples/talk-llama/llama-memory.h +0 -122
- data/ext/sources/examples/talk-llama/llama-mmap.cpp +0 -752
- data/ext/sources/examples/talk-llama/llama-mmap.h +0 -73
- data/ext/sources/examples/talk-llama/llama-model-loader.cpp +0 -1655
- data/ext/sources/examples/talk-llama/llama-model-loader.h +0 -206
- data/ext/sources/examples/talk-llama/llama-model-saver.cpp +0 -299
- data/ext/sources/examples/talk-llama/llama-model-saver.h +0 -40
- data/ext/sources/examples/talk-llama/llama-model.cpp +0 -9056
- data/ext/sources/examples/talk-llama/llama-model.h +0 -597
- data/ext/sources/examples/talk-llama/llama-quant.cpp +0 -1304
- data/ext/sources/examples/talk-llama/llama-quant.h +0 -1
- data/ext/sources/examples/talk-llama/llama-sampler.cpp +0 -3885
- data/ext/sources/examples/talk-llama/llama-sampler.h +0 -42
- data/ext/sources/examples/talk-llama/llama-vocab.cpp +0 -3970
- data/ext/sources/examples/talk-llama/llama-vocab.h +0 -187
- data/ext/sources/examples/talk-llama/llama.cpp +0 -1194
- data/ext/sources/examples/talk-llama/llama.h +0 -1573
- data/ext/sources/examples/talk-llama/models/afmoe.cpp +0 -190
- data/ext/sources/examples/talk-llama/models/apertus.cpp +0 -125
- data/ext/sources/examples/talk-llama/models/arcee.cpp +0 -135
- data/ext/sources/examples/talk-llama/models/arctic.cpp +0 -137
- data/ext/sources/examples/talk-llama/models/arwkv7.cpp +0 -86
- data/ext/sources/examples/talk-llama/models/baichuan.cpp +0 -123
- data/ext/sources/examples/talk-llama/models/bailingmoe.cpp +0 -143
- data/ext/sources/examples/talk-llama/models/bailingmoe2.cpp +0 -133
- data/ext/sources/examples/talk-llama/models/bert.cpp +0 -184
- data/ext/sources/examples/talk-llama/models/bitnet.cpp +0 -145
- data/ext/sources/examples/talk-llama/models/bloom.cpp +0 -101
- data/ext/sources/examples/talk-llama/models/chameleon.cpp +0 -178
- data/ext/sources/examples/talk-llama/models/chatglm.cpp +0 -132
- data/ext/sources/examples/talk-llama/models/codeshell.cpp +0 -111
- data/ext/sources/examples/talk-llama/models/cogvlm.cpp +0 -102
- data/ext/sources/examples/talk-llama/models/cohere2-iswa.cpp +0 -134
- data/ext/sources/examples/talk-llama/models/command-r.cpp +0 -122
- data/ext/sources/examples/talk-llama/models/dbrx.cpp +0 -122
- data/ext/sources/examples/talk-llama/models/deci.cpp +0 -135
- data/ext/sources/examples/talk-llama/models/deepseek.cpp +0 -142
- data/ext/sources/examples/talk-llama/models/deepseek2.cpp +0 -262
- data/ext/sources/examples/talk-llama/models/delta-net-base.cpp +0 -445
- data/ext/sources/examples/talk-llama/models/dots1.cpp +0 -132
- data/ext/sources/examples/talk-llama/models/dream.cpp +0 -105
- data/ext/sources/examples/talk-llama/models/ernie4-5-moe.cpp +0 -148
- data/ext/sources/examples/talk-llama/models/ernie4-5.cpp +0 -110
- data/ext/sources/examples/talk-llama/models/eurobert.cpp +0 -97
- data/ext/sources/examples/talk-llama/models/exaone-moe.cpp +0 -145
- data/ext/sources/examples/talk-llama/models/exaone.cpp +0 -114
- data/ext/sources/examples/talk-llama/models/exaone4.cpp +0 -123
- data/ext/sources/examples/talk-llama/models/falcon-h1.cpp +0 -111
- data/ext/sources/examples/talk-llama/models/falcon.cpp +0 -120
- data/ext/sources/examples/talk-llama/models/gemma-embedding.cpp +0 -116
- data/ext/sources/examples/talk-llama/models/gemma.cpp +0 -112
- data/ext/sources/examples/talk-llama/models/gemma2-iswa.cpp +0 -128
- data/ext/sources/examples/talk-llama/models/gemma3.cpp +0 -155
- data/ext/sources/examples/talk-llama/models/gemma3n-iswa.cpp +0 -384
- data/ext/sources/examples/talk-llama/models/glm4-moe.cpp +0 -170
- data/ext/sources/examples/talk-llama/models/glm4.cpp +0 -157
- data/ext/sources/examples/talk-llama/models/gpt2.cpp +0 -105
- data/ext/sources/examples/talk-llama/models/gptneox.cpp +0 -144
- data/ext/sources/examples/talk-llama/models/granite-hybrid.cpp +0 -195
- data/ext/sources/examples/talk-llama/models/granite.cpp +0 -210
- data/ext/sources/examples/talk-llama/models/grok.cpp +0 -159
- data/ext/sources/examples/talk-llama/models/grovemoe.cpp +0 -139
- data/ext/sources/examples/talk-llama/models/hunyuan-dense.cpp +0 -132
- data/ext/sources/examples/talk-llama/models/hunyuan-moe.cpp +0 -153
- data/ext/sources/examples/talk-llama/models/internlm2.cpp +0 -120
- data/ext/sources/examples/talk-llama/models/jais.cpp +0 -86
- data/ext/sources/examples/talk-llama/models/jais2.cpp +0 -123
- data/ext/sources/examples/talk-llama/models/jamba.cpp +0 -106
- data/ext/sources/examples/talk-llama/models/kimi-linear.cpp +0 -381
- data/ext/sources/examples/talk-llama/models/lfm2.cpp +0 -196
- data/ext/sources/examples/talk-llama/models/llada-moe.cpp +0 -122
- data/ext/sources/examples/talk-llama/models/llada.cpp +0 -99
- data/ext/sources/examples/talk-llama/models/llama-iswa.cpp +0 -178
- data/ext/sources/examples/talk-llama/models/llama.cpp +0 -175
- data/ext/sources/examples/talk-llama/models/maincoder.cpp +0 -117
- data/ext/sources/examples/talk-llama/models/mamba-base.cpp +0 -289
- data/ext/sources/examples/talk-llama/models/mamba.cpp +0 -54
- data/ext/sources/examples/talk-llama/models/mimo2-iswa.cpp +0 -129
- data/ext/sources/examples/talk-llama/models/minicpm3.cpp +0 -200
- data/ext/sources/examples/talk-llama/models/minimax-m2.cpp +0 -123
- data/ext/sources/examples/talk-llama/models/mistral3.cpp +0 -160
- data/ext/sources/examples/talk-llama/models/models.h +0 -704
- data/ext/sources/examples/talk-llama/models/modern-bert.cpp +0 -109
- data/ext/sources/examples/talk-llama/models/mpt.cpp +0 -126
- data/ext/sources/examples/talk-llama/models/nemotron-h.cpp +0 -162
- data/ext/sources/examples/talk-llama/models/nemotron.cpp +0 -122
- data/ext/sources/examples/talk-llama/models/neo-bert.cpp +0 -104
- data/ext/sources/examples/talk-llama/models/olmo.cpp +0 -121
- data/ext/sources/examples/talk-llama/models/olmo2.cpp +0 -150
- data/ext/sources/examples/talk-llama/models/olmoe.cpp +0 -124
- data/ext/sources/examples/talk-llama/models/openai-moe-iswa.cpp +0 -127
- data/ext/sources/examples/talk-llama/models/openelm.cpp +0 -124
- data/ext/sources/examples/talk-llama/models/orion.cpp +0 -123
- data/ext/sources/examples/talk-llama/models/paddleocr.cpp +0 -122
- data/ext/sources/examples/talk-llama/models/pangu-embedded.cpp +0 -121
- data/ext/sources/examples/talk-llama/models/phi2.cpp +0 -121
- data/ext/sources/examples/talk-llama/models/phi3.cpp +0 -152
- data/ext/sources/examples/talk-llama/models/plamo.cpp +0 -110
- data/ext/sources/examples/talk-llama/models/plamo2.cpp +0 -320
- data/ext/sources/examples/talk-llama/models/plamo3.cpp +0 -128
- data/ext/sources/examples/talk-llama/models/plm.cpp +0 -169
- data/ext/sources/examples/talk-llama/models/qwen.cpp +0 -108
- data/ext/sources/examples/talk-llama/models/qwen2.cpp +0 -126
- data/ext/sources/examples/talk-llama/models/qwen2moe.cpp +0 -151
- data/ext/sources/examples/talk-llama/models/qwen2vl.cpp +0 -117
- data/ext/sources/examples/talk-llama/models/qwen3.cpp +0 -120
- data/ext/sources/examples/talk-llama/models/qwen35.cpp +0 -381
- data/ext/sources/examples/talk-llama/models/qwen35moe.cpp +0 -422
- data/ext/sources/examples/talk-llama/models/qwen3moe.cpp +0 -131
- data/ext/sources/examples/talk-llama/models/qwen3next.cpp +0 -525
- data/ext/sources/examples/talk-llama/models/qwen3vl-moe.cpp +0 -140
- data/ext/sources/examples/talk-llama/models/qwen3vl.cpp +0 -132
- data/ext/sources/examples/talk-llama/models/refact.cpp +0 -94
- data/ext/sources/examples/talk-llama/models/rnd1.cpp +0 -126
- data/ext/sources/examples/talk-llama/models/rwkv6-base.cpp +0 -164
- data/ext/sources/examples/talk-llama/models/rwkv6.cpp +0 -94
- data/ext/sources/examples/talk-llama/models/rwkv6qwen2.cpp +0 -86
- data/ext/sources/examples/talk-llama/models/rwkv7-base.cpp +0 -137
- data/ext/sources/examples/talk-llama/models/rwkv7.cpp +0 -90
- data/ext/sources/examples/talk-llama/models/seed-oss.cpp +0 -124
- data/ext/sources/examples/talk-llama/models/smallthinker.cpp +0 -126
- data/ext/sources/examples/talk-llama/models/smollm3.cpp +0 -128
- data/ext/sources/examples/talk-llama/models/stablelm.cpp +0 -146
- data/ext/sources/examples/talk-llama/models/starcoder.cpp +0 -100
- data/ext/sources/examples/talk-llama/models/starcoder2.cpp +0 -121
- data/ext/sources/examples/talk-llama/models/step35-iswa.cpp +0 -165
- data/ext/sources/examples/talk-llama/models/t5-dec.cpp +0 -166
- data/ext/sources/examples/talk-llama/models/t5-enc.cpp +0 -96
- data/ext/sources/examples/talk-llama/models/wavtokenizer-dec.cpp +0 -149
- data/ext/sources/examples/talk-llama/models/xverse.cpp +0 -108
- data/ext/sources/examples/talk-llama/prompts/talk-alpaca.txt +0 -23
- data/ext/sources/examples/talk-llama/speak +0 -40
- data/ext/sources/examples/talk-llama/speak.bat +0 -1
- data/ext/sources/examples/talk-llama/speak.ps1 +0 -14
- data/ext/sources/examples/talk-llama/talk-llama.cpp +0 -813
- data/ext/sources/examples/talk-llama/unicode-data.cpp +0 -7034
- data/ext/sources/examples/talk-llama/unicode-data.h +0 -20
- data/ext/sources/examples/talk-llama/unicode.cpp +0 -1103
- data/ext/sources/examples/talk-llama/unicode.h +0 -111
- data/ext/sources/examples/wchess/CMakeLists.txt +0 -10
- data/ext/sources/examples/wchess/libwchess/CMakeLists.txt +0 -19
- data/ext/sources/examples/wchess/libwchess/Chessboard.cpp +0 -803
- data/ext/sources/examples/wchess/libwchess/Chessboard.h +0 -33
- data/ext/sources/examples/wchess/libwchess/WChess.cpp +0 -193
- data/ext/sources/examples/wchess/libwchess/WChess.h +0 -63
- data/ext/sources/examples/wchess/libwchess/test-chessboard.cpp +0 -117
- data/ext/sources/examples/wchess/wchess.cmd/CMakeLists.txt +0 -8
- data/ext/sources/examples/wchess/wchess.cmd/wchess.cmd.cpp +0 -253
- data/ext/sources/examples/whisper.wasm/CMakeLists.txt +0 -50
- data/ext/sources/examples/whisper.wasm/emscripten.cpp +0 -118
- data/ext/sources/examples/whisper.wasm/index-tmpl.html +0 -659
- data/ext/sources/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +0 -99
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp-msg.h +0 -155
- data/ext/sources/ggml/src/ggml-hexagon/op-desc.h +0 -153
- data/ext/sources/ggml/src/ggml-opencl/kernels/embed_kernel.py +0 -26
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.cpp +0 -123
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.h +0 -17
- data/ext/sources/ggml/src/ggml-virtgpu/regenerate_remoting.py +0 -333
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rte.glsl +0 -5
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +0 -182
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/glu.tmpl.wgsl +0 -323
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl +0 -718
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm.wgsl +0 -123
- data/ext/sources/tests/CMakeLists.txt +0 -112
- data/ext/sources/tests/earnings21/eval.mk +0 -58
- data/ext/sources/tests/earnings21/eval.py +0 -68
- data/ext/sources/tests/earnings21/normalizers/__init__.py +0 -2
- data/ext/sources/tests/earnings21/normalizers/basic.py +0 -80
- data/ext/sources/tests/earnings21/normalizers/english.json +0 -1741
- data/ext/sources/tests/earnings21/normalizers/english.py +0 -550
- data/ext/sources/tests/earnings21/requirements.txt +0 -6
- data/ext/sources/tests/en-0-ref.txt +0 -1
- data/ext/sources/tests/en-1-ref.txt +0 -1
- data/ext/sources/tests/en-2-ref.txt +0 -1
- data/ext/sources/tests/es-0-ref.txt +0 -1
- data/ext/sources/tests/librispeech/eval.mk +0 -39
- data/ext/sources/tests/librispeech/eval.py +0 -47
- data/ext/sources/tests/librispeech/normalizers/__init__.py +0 -2
- data/ext/sources/tests/librispeech/normalizers/basic.py +0 -80
- data/ext/sources/tests/librispeech/normalizers/english.json +0 -1741
- data/ext/sources/tests/librispeech/normalizers/english.py +0 -550
- data/ext/sources/tests/librispeech/requirements.txt +0 -6
- data/ext/sources/tests/run-tests.sh +0 -130
- data/ext/sources/tests/test-c.c +0 -3
- data/ext/sources/tests/test-vad-full.cpp +0 -56
- data/ext/sources/tests/test-vad.cpp +0 -83
- data/ext/sources/tests/test-whisper.js +0 -58
- data/lib/whisper/context.rb +0 -15
- data/lib/whisper/segment.rb +0 -58
- /data/ext/sources/ggml/src/ggml-opencl/kernels/{gemv_noshuffle_general_q8_0_f32.cl → gemv_noshuffle_q8_0_f32.cl} +0 -0
|
@@ -19,6 +19,7 @@
|
|
|
19
19
|
#include <cstdlib>
|
|
20
20
|
#include <float.h>
|
|
21
21
|
#include <limits>
|
|
22
|
+
#include <optional>
|
|
22
23
|
#include <stdint.h>
|
|
23
24
|
#include <stdio.h>
|
|
24
25
|
#include <vector>
|
|
@@ -30,9 +31,18 @@
|
|
|
30
31
|
#include <regex>
|
|
31
32
|
|
|
32
33
|
#include <sycl/sycl.hpp>
|
|
34
|
+
#include <sycl/backend.hpp>
|
|
35
|
+
#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
|
|
36
|
+
#include <level_zero/ze_api.h>
|
|
37
|
+
#endif
|
|
33
38
|
#if defined(GGML_SYCL_GRAPH) && SYCL_EXT_ONEAPI_ASYNC_MEMORY_ALLOC
|
|
34
39
|
# include <sycl/ext/oneapi/experimental/async_alloc/async_alloc.hpp>
|
|
35
40
|
#endif
|
|
41
|
+
#if SYCL_EXT_ONEAPI_VIRTUAL_MEM
|
|
42
|
+
# include <sycl/ext/oneapi/virtual_mem/physical_mem.hpp>
|
|
43
|
+
# include <sycl/ext/oneapi/virtual_mem/virtual_mem.hpp>
|
|
44
|
+
# define GGML_SYCL_USE_VMM
|
|
45
|
+
#endif
|
|
36
46
|
#include <sycl/half_type.hpp>
|
|
37
47
|
|
|
38
48
|
#include "ggml.h"
|
|
@@ -44,7 +54,6 @@
|
|
|
44
54
|
#include "ggml-sycl/backend.hpp"
|
|
45
55
|
#include "ggml-sycl/common.hpp"
|
|
46
56
|
#include "ggml-sycl/element_wise.hpp"
|
|
47
|
-
#include "ggml-sycl/gated_delta_net.hpp"
|
|
48
57
|
#include "ggml-sycl/gemm.hpp"
|
|
49
58
|
#include "ggml-sycl/getrows.hpp"
|
|
50
59
|
#include "ggml-sycl/norm.hpp"
|
|
@@ -55,15 +64,23 @@
|
|
|
55
64
|
#include "ggml-sycl/set.hpp"
|
|
56
65
|
#include "ggml-sycl/ssm_conv.hpp"
|
|
57
66
|
#include "ggml-sycl/sycl_hw.hpp"
|
|
58
|
-
|
|
67
|
+
#include "ggml-sycl/ssm_scan.hpp"
|
|
68
|
+
#include "ggml-sycl/fill.hpp"
|
|
69
|
+
#include "ggml-sycl/cumsum.hpp"
|
|
70
|
+
#include "ggml-sycl/diag.hpp"
|
|
71
|
+
#include "ggml-sycl/solve_tri.hpp"
|
|
72
|
+
#include "ggml-sycl/gated_delta_net.hpp"
|
|
59
73
|
|
|
60
74
|
static bool g_sycl_loaded = false;
|
|
61
75
|
int g_ggml_sycl_debug = 0;
|
|
62
76
|
int g_ggml_sycl_disable_optimize = 0;
|
|
63
77
|
int g_ggml_sycl_disable_graph = 0;
|
|
64
78
|
int g_ggml_sycl_disable_dnn = 0;
|
|
79
|
+
int g_ggml_sycl_enable_vmm = 1;
|
|
65
80
|
int g_ggml_sycl_prioritize_dmmv = 0;
|
|
66
81
|
int g_ggml_sycl_use_async_mem_op = 0;
|
|
82
|
+
int g_ggml_sycl_use_async_mem_op_requested = 1;
|
|
83
|
+
int g_ggml_sycl_enable_level_zero = 0;
|
|
67
84
|
int g_ggml_sycl_enable_flash_attention = 1;
|
|
68
85
|
|
|
69
86
|
|
|
@@ -86,13 +103,30 @@ static ggml_sycl_device_info ggml_sycl_init() {
|
|
|
86
103
|
// GGML_LOG_INFO("%s: SYCL_USE_XMX: no\n", __func__);
|
|
87
104
|
// #endif
|
|
88
105
|
for (int i = 0; i < info.device_count; ++i) {
|
|
89
|
-
info.devices[i].vmm = 0;
|
|
90
106
|
dpct::device_info prop;
|
|
91
|
-
|
|
107
|
+
auto & device = dpct::dev_mgr::instance().get_device(i);
|
|
92
108
|
|
|
93
109
|
SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info(
|
|
94
110
|
prop, device)));
|
|
95
111
|
|
|
112
|
+
#if !defined(GGML_SYCL_USE_VMM)
|
|
113
|
+
info.devices[i].vmm = 0;
|
|
114
|
+
#else
|
|
115
|
+
info.devices[i].vmm = device.has(sycl::aspect::ext_oneapi_virtual_mem);
|
|
116
|
+
if (info.devices[i].vmm) {
|
|
117
|
+
// NB: SYCL's get_mem_granularity always returns the _minimum_ granularity,
|
|
118
|
+
// but the L0 API requires a larger page size for allocs above 2 MiB and
|
|
119
|
+
// rejects non-multiples with UR_RESULT_ERROR_INVALID_VALUE [sic].
|
|
120
|
+
// Here we clamp it to 2 MiB for simplicity, but other devices may require
|
|
121
|
+
// calling zeVirtualMemQueryPageSize or yet unexposed public API.
|
|
122
|
+
const size_t physical_page = 2ull << 20; // 2 MiB
|
|
123
|
+
info.devices[i].vmm_granularity = std::max<size_t>(
|
|
124
|
+
sycl::ext::oneapi::experimental::get_mem_granularity(
|
|
125
|
+
device, sycl::context(device)),
|
|
126
|
+
physical_page);
|
|
127
|
+
}
|
|
128
|
+
#endif
|
|
129
|
+
|
|
96
130
|
info.default_tensor_split[i] = total_vram;
|
|
97
131
|
total_vram += prop.get_global_mem_size();
|
|
98
132
|
|
|
@@ -105,7 +139,14 @@ static ggml_sycl_device_info ggml_sycl_init() {
|
|
|
105
139
|
|
|
106
140
|
info.max_work_group_sizes[i] = prop.get_max_work_group_size();
|
|
107
141
|
info.devices[i].max_wg_per_cu = info.max_work_group_sizes[i] / prop.get_max_compute_units();
|
|
142
|
+
info.devices[i].hw_info = get_device_hw_info(&device);
|
|
108
143
|
|
|
144
|
+
// Only check GPU devices; CPU devices use OpenCL and would otherwise
|
|
145
|
+
// disable Level Zero for the GPUs on systems without ONEAPI_DEVICE_SELECTOR set.
|
|
146
|
+
if (device.is_gpu() && device.default_queue().get_backend() != sycl::backend::ext_oneapi_level_zero) {
|
|
147
|
+
GGML_LOG_WARN("SYCL GPU device %d does not use Level Zero backend, disabling Level Zero memory API\n", i);
|
|
148
|
+
info.ext_oneapi_level_zero = false;
|
|
149
|
+
}
|
|
109
150
|
}
|
|
110
151
|
|
|
111
152
|
for (int id = 0; id < info.device_count; ++id) {
|
|
@@ -217,7 +258,13 @@ static void ggml_check_sycl() try {
|
|
|
217
258
|
g_ggml_sycl_disable_optimize = get_sycl_env("GGML_SYCL_DISABLE_OPT", 0);
|
|
218
259
|
g_ggml_sycl_disable_graph = get_sycl_env("GGML_SYCL_DISABLE_GRAPH", 1);
|
|
219
260
|
g_ggml_sycl_disable_dnn = get_sycl_env("GGML_SYCL_DISABLE_DNN", 0);
|
|
261
|
+
g_ggml_sycl_enable_vmm = get_sycl_env("GGML_SYCL_ENABLE_VMM", 1);
|
|
220
262
|
g_ggml_sycl_prioritize_dmmv = get_sycl_env("GGML_SYCL_PRIORITIZE_DMMV", 0);
|
|
263
|
+
#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
|
|
264
|
+
g_ggml_sycl_enable_level_zero = get_sycl_env("GGML_SYCL_ENABLE_LEVEL_ZERO", ggml_sycl_info().ext_oneapi_level_zero);
|
|
265
|
+
#else
|
|
266
|
+
g_ggml_sycl_enable_level_zero = 0;
|
|
267
|
+
#endif
|
|
221
268
|
|
|
222
269
|
#ifdef SYCL_FLASH_ATTN
|
|
223
270
|
g_ggml_sycl_enable_flash_attention = get_sycl_env("GGML_SYCL_ENABLE_FLASH_ATTN", 1);
|
|
@@ -248,6 +295,16 @@ static void ggml_check_sycl() try {
|
|
|
248
295
|
#else
|
|
249
296
|
GGML_LOG_INFO(" GGML_SYCL_DNNL: no\n");
|
|
250
297
|
#endif
|
|
298
|
+
#if defined(GGML_SYCL_SUPPORT_LEVEL_ZERO)
|
|
299
|
+
GGML_LOG_INFO(" GGML_SYCL_SUPPORT_LEVEL_ZERO: yes\n");
|
|
300
|
+
#else
|
|
301
|
+
GGML_LOG_INFO(" GGML_SYCL_SUPPORT_LEVEL_ZERO: no\n");
|
|
302
|
+
#endif
|
|
303
|
+
#if defined(GGML_SYCL_USE_VMM)
|
|
304
|
+
GGML_LOG_INFO(" GGML_SYCL_USE_VMM: yes\n");
|
|
305
|
+
#else
|
|
306
|
+
GGML_LOG_INFO(" GGML_SYCL_USE_VMM: no\n");
|
|
307
|
+
#endif
|
|
251
308
|
|
|
252
309
|
GGML_LOG_INFO("Running with Environment Variables:\n");
|
|
253
310
|
GGML_LOG_INFO(" GGML_SYCL_DEBUG: %d\n", g_ggml_sycl_debug);
|
|
@@ -257,12 +314,24 @@ static void ggml_check_sycl() try {
|
|
|
257
314
|
#else
|
|
258
315
|
GGML_LOG_INFO(" GGML_SYCL_DISABLE_GRAPH: graph disabled by compile flag\n");
|
|
259
316
|
#endif
|
|
317
|
+
#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
|
|
318
|
+
GGML_LOG_INFO(" GGML_SYCL_ENABLE_LEVEL_ZERO: %d\n", g_ggml_sycl_enable_level_zero);
|
|
319
|
+
#else
|
|
320
|
+
GGML_LOG_INFO(" GGML_SYCL_ENABLE_LEVEL_ZERO: Level Zero disabled by compile flag\n");
|
|
321
|
+
#endif
|
|
260
322
|
#if GGML_SYCL_DNNL
|
|
261
323
|
GGML_LOG_INFO(" GGML_SYCL_DISABLE_DNN: %d\n", g_ggml_sycl_disable_dnn);
|
|
262
324
|
#else
|
|
263
325
|
GGML_LOG_INFO(" GGML_SYCL_DISABLE_DNN: DNN disabled by compile flag\n");
|
|
326
|
+
#endif
|
|
327
|
+
#if defined(GGML_SYCL_USE_VMM)
|
|
328
|
+
GGML_LOG_INFO(" GGML_SYCL_ENABLE_VMM: %d\n", g_ggml_sycl_enable_vmm);
|
|
329
|
+
#else
|
|
330
|
+
GGML_LOG_INFO(" GGML_SYCL_ENABLE_VMM: virtual memory extension is not available\n");
|
|
264
331
|
#endif
|
|
265
332
|
GGML_LOG_INFO(" GGML_SYCL_PRIORITIZE_DMMV: %d\n", g_ggml_sycl_prioritize_dmmv);
|
|
333
|
+
g_ggml_sycl_use_async_mem_op_requested = get_sycl_env("GGML_SYCL_USE_ASYNC_MEM_OP", 1);
|
|
334
|
+
GGML_LOG_INFO(" GGML_SYCL_USE_ASYNC_MEM_OP: %d\n", g_ggml_sycl_use_async_mem_op_requested);
|
|
266
335
|
|
|
267
336
|
#ifdef SYCL_FLASH_ATTN
|
|
268
337
|
GGML_LOG_INFO(" GGML_SYCL_ENABLE_FLASH_ATTN: %d\n", g_ggml_sycl_enable_flash_attention);
|
|
@@ -278,11 +347,11 @@ static void ggml_check_sycl() try {
|
|
|
278
347
|
fprintf(stderr, "%s: SYCL_USE_XMX: no\n", __func__);
|
|
279
348
|
#endif
|
|
280
349
|
*/
|
|
281
|
-
//
|
|
282
|
-
//
|
|
283
|
-
//
|
|
350
|
+
// Async USM allocation/free is also useful outside the graph path: it avoids the host waits in the reorder
|
|
351
|
+
// staging path while preserving queue ordering semantics. Graph support still depends on the extension being
|
|
352
|
+
// available, but it no longer needs to control the non-graph fast path.
|
|
284
353
|
#if defined(GGML_SYCL_GRAPH) && SYCL_EXT_ONEAPI_ASYNC_MEMORY_ALLOC
|
|
285
|
-
g_ggml_sycl_use_async_mem_op = !g_ggml_sycl_disable_graph;
|
|
354
|
+
g_ggml_sycl_use_async_mem_op = g_ggml_sycl_use_async_mem_op_requested || !g_ggml_sycl_disable_graph;
|
|
286
355
|
if (g_ggml_sycl_use_async_mem_op) {
|
|
287
356
|
for (unsigned int i = 0; i < dpct::dev_mgr::instance().device_count(); ++i) {
|
|
288
357
|
if (!dpct::dev_mgr::instance().get_device(i).has(sycl::aspect::ext_oneapi_async_memory_alloc)) {
|
|
@@ -366,7 +435,7 @@ struct ggml_backend_sycl_buffer_context {
|
|
|
366
435
|
~ggml_backend_sycl_buffer_context() {
|
|
367
436
|
if (dev_ptr != nullptr) {
|
|
368
437
|
ggml_sycl_set_device(device);
|
|
369
|
-
SYCL_CHECK(CHECK_TRY_ERROR(
|
|
438
|
+
SYCL_CHECK(CHECK_TRY_ERROR(ggml_sycl_free_device(dev_ptr, *stream)));
|
|
370
439
|
}
|
|
371
440
|
|
|
372
441
|
//release extra used by tensors
|
|
@@ -412,11 +481,22 @@ ggml_backend_sycl_buffer_init_tensor(ggml_backend_buffer_t buffer,
|
|
|
412
481
|
assert(tensor->view_src->buffer->buft == buffer->buft);
|
|
413
482
|
return GGML_STATUS_SUCCESS;
|
|
414
483
|
}
|
|
415
|
-
|
|
416
|
-
|
|
417
|
-
|
|
418
|
-
tensor->
|
|
419
|
-
|
|
484
|
+
|
|
485
|
+
if (!g_ggml_sycl_disable_optimize) {
|
|
486
|
+
// set reorder extra buffer based on supported type
|
|
487
|
+
switch (tensor->type) {
|
|
488
|
+
case GGML_TYPE_Q4_0:
|
|
489
|
+
case GGML_TYPE_Q8_0:
|
|
490
|
+
case GGML_TYPE_Q4_K:
|
|
491
|
+
case GGML_TYPE_Q6_K:{
|
|
492
|
+
ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu{};
|
|
493
|
+
tensor->extra = extra;
|
|
494
|
+
ctx->tensor_extras.push_back(extra);
|
|
495
|
+
break;
|
|
496
|
+
}
|
|
497
|
+
default:
|
|
498
|
+
break;
|
|
499
|
+
}
|
|
420
500
|
}
|
|
421
501
|
|
|
422
502
|
if (ggml_is_quantized(tensor->type)) {
|
|
@@ -488,8 +568,43 @@ catch (sycl::exception const &exc) {
|
|
|
488
568
|
std::exit(1);
|
|
489
569
|
}
|
|
490
570
|
|
|
571
|
+
#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
|
|
572
|
+
static bool ggml_sycl_is_l0_discrete_gpu(sycl::queue &q) {
|
|
573
|
+
if (!q.get_device().is_gpu() || q.get_backend() != sycl::backend::ext_oneapi_level_zero) {
|
|
574
|
+
return false;
|
|
575
|
+
}
|
|
576
|
+
|
|
577
|
+
ze_device_handle_t ze_dev = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(q.get_device());
|
|
578
|
+
ze_device_properties_t props = {};
|
|
579
|
+
props.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES;
|
|
580
|
+
ze_result_t r = zeDeviceGetProperties(ze_dev, &props);
|
|
581
|
+
return r == ZE_RESULT_SUCCESS && !(props.flags & ZE_DEVICE_PROPERTY_FLAG_INTEGRATED);
|
|
582
|
+
}
|
|
583
|
+
#endif
|
|
584
|
+
|
|
491
585
|
static void dev2dev_memcpy(sycl::queue &q_dst, sycl::queue &q_src, void *ptr_dst,
|
|
492
586
|
const void *ptr_src, size_t size) {
|
|
587
|
+
#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
|
|
588
|
+
// Use Level Zero direct copy for dGPU-to-dGPU transfers.
|
|
589
|
+
const bool l0_copy_supported =
|
|
590
|
+
ggml_sycl_is_l0_discrete_gpu(q_dst) && ggml_sycl_is_l0_discrete_gpu(q_src);
|
|
591
|
+
if (g_ggml_sycl_enable_level_zero && l0_copy_supported) {
|
|
592
|
+
auto ze_ctx = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(q_dst.get_context());
|
|
593
|
+
auto ze_dev = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(q_dst.get_device());
|
|
594
|
+
ze_command_queue_desc_t cq_desc = {ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC, nullptr, 0, 0,
|
|
595
|
+
0, ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS, ZE_COMMAND_QUEUE_PRIORITY_NORMAL};
|
|
596
|
+
ze_command_list_handle_t cl;
|
|
597
|
+
ze_result_t r = zeCommandListCreateImmediate(ze_ctx, ze_dev, &cq_desc, &cl);
|
|
598
|
+
if (r == ZE_RESULT_SUCCESS) {
|
|
599
|
+
r = zeCommandListAppendMemoryCopy(cl, ptr_dst, ptr_src, size, nullptr, 0, nullptr);
|
|
600
|
+
zeCommandListDestroy(cl);
|
|
601
|
+
if (r == ZE_RESULT_SUCCESS) {
|
|
602
|
+
return;
|
|
603
|
+
}
|
|
604
|
+
}
|
|
605
|
+
}
|
|
606
|
+
#endif
|
|
607
|
+
// Host-staged copy
|
|
493
608
|
char *host_buf = (char *)malloc(size);
|
|
494
609
|
q_src.memcpy(host_buf, (const char *)ptr_src, size).wait();
|
|
495
610
|
q_dst.memcpy((char *)ptr_dst, host_buf, size).wait();
|
|
@@ -570,9 +685,15 @@ static void ggml_backend_sycl_buffer_clear(ggml_backend_buffer_t buffer,
|
|
|
570
685
|
SYCL_CHECK(
|
|
571
686
|
CHECK_TRY_ERROR(dpct::get_current_device().queues_wait_and_throw()));
|
|
572
687
|
|
|
573
|
-
|
|
574
|
-
|
|
575
|
-
|
|
688
|
+
constexpr size_t MAX_CHUNK = 2ULL << 30; // 2 GiB
|
|
689
|
+
for (size_t off = 0; off < buffer->size; off += MAX_CHUNK) {
|
|
690
|
+
size_t chunk = std::min(buffer->size - off, MAX_CHUNK);
|
|
691
|
+
SYCL_CHECK(CHECK_TRY_ERROR(
|
|
692
|
+
(*stream)
|
|
693
|
+
.memset(static_cast<char*>(ctx->dev_ptr) + off, value, chunk)
|
|
694
|
+
.wait()
|
|
695
|
+
));
|
|
696
|
+
}
|
|
576
697
|
}
|
|
577
698
|
catch (sycl::exception const &exc) {
|
|
578
699
|
std::cerr << exc.what() << "Exception caught at file:" << __FILE__
|
|
@@ -622,6 +743,8 @@ static const ggml_backend_buffer_i ggml_backend_sycl_buffer_interface = {
|
|
|
622
743
|
/* .memset_tensor = */ ggml_backend_sycl_buffer_memset_tensor,
|
|
623
744
|
/* .set_tensor = */ ggml_backend_sycl_buffer_set_tensor,
|
|
624
745
|
/* .get_tensor = */ ggml_backend_sycl_buffer_get_tensor,
|
|
746
|
+
/* .set_tensor_2d = */ NULL,
|
|
747
|
+
/* .get_tensor_2d = */ NULL,
|
|
625
748
|
/* .cpy_tensor = */ ggml_backend_sycl_buffer_cpy_tensor,
|
|
626
749
|
/* .clear = */ ggml_backend_sycl_buffer_clear,
|
|
627
750
|
/* .reset = */ ggml_backend_sycl_buffer_reset,
|
|
@@ -651,8 +774,7 @@ ggml_backend_sycl_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
|
|
|
651
774
|
size = std::max(size, (size_t)1); // syclMalloc returns null for size 0
|
|
652
775
|
|
|
653
776
|
void * dev_ptr;
|
|
654
|
-
SYCL_CHECK(CHECK_TRY_ERROR(dev_ptr = (void *)
|
|
655
|
-
size, *stream)));
|
|
777
|
+
SYCL_CHECK(CHECK_TRY_ERROR(dev_ptr = (void *)ggml_sycl_malloc_device(size, *stream)));
|
|
656
778
|
if (!dev_ptr) {
|
|
657
779
|
GGML_LOG_ERROR("%s: can't allocate %lu Bytes of memory on device\n", __func__, size);
|
|
658
780
|
return nullptr;
|
|
@@ -667,7 +789,7 @@ catch (sycl::exception const &exc) {
|
|
|
667
789
|
}
|
|
668
790
|
|
|
669
791
|
static size_t ggml_backend_sycl_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
|
|
670
|
-
return
|
|
792
|
+
return SYCL_BUFFER_ALIGNMENT;
|
|
671
793
|
GGML_UNUSED(buft);
|
|
672
794
|
}
|
|
673
795
|
|
|
@@ -893,18 +1015,10 @@ ggml_backend_sycl_split_buffer_init_tensor(ggml_backend_buffer_t buffer,
|
|
|
893
1015
|
size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
|
|
894
1016
|
}
|
|
895
1017
|
|
|
896
|
-
// FIXME: do not crash if SYCL Buffer alloc fails
|
|
897
|
-
// currently, init_tensor cannot fail, it needs to be fixed in ggml-backend first
|
|
898
1018
|
ggml_sycl_set_device(i);
|
|
899
1019
|
const queue_ptr stream = ctx->streams[i];
|
|
900
1020
|
char * buf;
|
|
901
|
-
|
|
902
|
-
DPCT1009:208: SYCL uses exceptions to report errors and does not use the
|
|
903
|
-
error codes. The original code was commented out and a warning string
|
|
904
|
-
was inserted. You need to rewrite this code.
|
|
905
|
-
*/
|
|
906
|
-
SYCL_CHECK(CHECK_TRY_ERROR(buf = (char *)sycl::malloc_device(
|
|
907
|
-
size, *stream)));
|
|
1021
|
+
SYCL_CHECK(CHECK_TRY_ERROR(buf = (char *)ggml_sycl_malloc_device(size, *stream)));
|
|
908
1022
|
if (!buf) {
|
|
909
1023
|
char err_buf[1024];
|
|
910
1024
|
snprintf(err_buf, 1023, "%s: can't allocate %lu Bytes of memory on device\n", __func__, size);
|
|
@@ -1068,6 +1182,8 @@ static struct ggml_backend_buffer_i ggml_backend_sycl_split_buffer_interface = {
|
|
|
1068
1182
|
/* .memset_tensor = */ NULL,
|
|
1069
1183
|
/* .set_tensor = */ ggml_backend_sycl_split_buffer_set_tensor,
|
|
1070
1184
|
/* .get_tensor = */ ggml_backend_sycl_split_buffer_get_tensor,
|
|
1185
|
+
/* .set_tensor_2d = */ NULL,
|
|
1186
|
+
/* .get_tensor_2d = */ NULL,
|
|
1071
1187
|
/* .cpy_tensor = */ NULL,
|
|
1072
1188
|
/* .clear = */ ggml_backend_sycl_split_buffer_clear,
|
|
1073
1189
|
/* .reset = */ NULL,
|
|
@@ -1096,7 +1212,7 @@ static ggml_backend_buffer_t ggml_backend_sycl_split_buffer_type_alloc_buffer(gg
|
|
|
1096
1212
|
}
|
|
1097
1213
|
|
|
1098
1214
|
static size_t ggml_backend_sycl_split_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
|
|
1099
|
-
return
|
|
1215
|
+
return SYCL_BUFFER_ALIGNMENT;
|
|
1100
1216
|
GGML_UNUSED(buft);
|
|
1101
1217
|
}
|
|
1102
1218
|
|
|
@@ -1260,16 +1376,53 @@ struct ggml_sycl_pool_leg : public ggml_sycl_pool {
|
|
|
1260
1376
|
explicit ggml_sycl_pool_leg(queue_ptr qptr_, int device_) : device(device_), qptr(qptr_) {}
|
|
1261
1377
|
|
|
1262
1378
|
~ggml_sycl_pool_leg() {
|
|
1379
|
+
#ifdef DEBUG_SYCL_POOL
|
|
1380
|
+
int n_cached = 0;
|
|
1381
|
+
size_t bytes_cached = 0;
|
|
1382
|
+
for (int i = 0; i < MAX_SYCL_BUFFERS; ++i) {
|
|
1383
|
+
if (buffer_pool[i].ptr != nullptr) {
|
|
1384
|
+
++n_cached;
|
|
1385
|
+
bytes_cached += buffer_pool[i].size;
|
|
1386
|
+
}
|
|
1387
|
+
}
|
|
1388
|
+
GGML_LOG_INFO("%s: %d buffers, cached = %.2f MiB\n", __func__,
|
|
1389
|
+
n_cached, bytes_cached / 1024.0 / 1024.0);
|
|
1390
|
+
const auto slots = format_slots_in_alloc_order();
|
|
1391
|
+
if (!slots.empty()) {
|
|
1392
|
+
GGML_LOG_INFO("%s: slots MiB: %s\n", __func__, slots.c_str());
|
|
1393
|
+
}
|
|
1394
|
+
#endif
|
|
1395
|
+
|
|
1263
1396
|
for (int i = 0; i < MAX_SYCL_BUFFERS; ++i) {
|
|
1264
1397
|
ggml_sycl_buffer & b = buffer_pool[i];
|
|
1265
1398
|
if (b.ptr != nullptr) {
|
|
1266
|
-
SYCL_CHECK(CHECK_TRY_ERROR(
|
|
1399
|
+
SYCL_CHECK(CHECK_TRY_ERROR(ggml_sycl_free_device(b.ptr, *qptr)));
|
|
1267
1400
|
pool_size -= b.size;
|
|
1268
1401
|
}
|
|
1269
1402
|
}
|
|
1270
1403
|
GGML_ASSERT(pool_size == 0);
|
|
1271
1404
|
}
|
|
1272
1405
|
|
|
1406
|
+
#ifdef DEBUG_SYCL_POOL
|
|
1407
|
+
std::string format_slots_in_alloc_order() const {
|
|
1408
|
+
std::string line;
|
|
1409
|
+
char buf[32];
|
|
1410
|
+
bool first = true;
|
|
1411
|
+
for (int i = 0; i < MAX_SYCL_BUFFERS; ++i) {
|
|
1412
|
+
if (buffer_pool[i].ptr == nullptr) {
|
|
1413
|
+
continue;
|
|
1414
|
+
}
|
|
1415
|
+
if (!first) {
|
|
1416
|
+
line += '/';
|
|
1417
|
+
}
|
|
1418
|
+
first = false;
|
|
1419
|
+
snprintf(buf, sizeof(buf), "%.2f", buffer_pool[i].size / 1024.0 / 1024.0);
|
|
1420
|
+
line += buf;
|
|
1421
|
+
}
|
|
1422
|
+
return line;
|
|
1423
|
+
}
|
|
1424
|
+
#endif
|
|
1425
|
+
|
|
1273
1426
|
void * alloc(size_t size, size_t * actual_size) override {
|
|
1274
1427
|
#ifdef DEBUG_sycl_MALLOC
|
|
1275
1428
|
int nnz = 0;
|
|
@@ -1311,9 +1464,7 @@ struct ggml_sycl_pool_leg : public ggml_sycl_pool {
|
|
|
1311
1464
|
void * ptr;
|
|
1312
1465
|
size_t look_ahead_size = (size_t) (1.05 * size);
|
|
1313
1466
|
|
|
1314
|
-
SYCL_CHECK(
|
|
1315
|
-
CHECK_TRY_ERROR(ptr = (void *)sycl::malloc_device(
|
|
1316
|
-
look_ahead_size, *qptr)));
|
|
1467
|
+
SYCL_CHECK(CHECK_TRY_ERROR(ptr = (void *)ggml_sycl_malloc_device(look_ahead_size, *qptr)));
|
|
1317
1468
|
if (!ptr) {
|
|
1318
1469
|
GGML_LOG_ERROR("%s: can't allocate %lu Bytes of memory on device/GPU\n", __func__, look_ahead_size);
|
|
1319
1470
|
return nullptr;
|
|
@@ -1341,11 +1492,126 @@ struct ggml_sycl_pool_leg : public ggml_sycl_pool {
|
|
|
1341
1492
|
}
|
|
1342
1493
|
}
|
|
1343
1494
|
GGML_LOG_WARN("WARNING: sycl buffer pool full, increase MAX_sycl_BUFFERS\n");
|
|
1344
|
-
SYCL_CHECK(CHECK_TRY_ERROR(
|
|
1495
|
+
SYCL_CHECK(CHECK_TRY_ERROR(ggml_sycl_free_device(ptr, *qptr)));
|
|
1345
1496
|
pool_size -= size;
|
|
1346
1497
|
}
|
|
1347
1498
|
};
|
|
1348
1499
|
|
|
1500
|
+
// pool with virtual memory management
|
|
1501
|
+
#if defined(GGML_SYCL_USE_VMM)
|
|
1502
|
+
struct ggml_sycl_pool_vmm : public ggml_sycl_pool {
|
|
1503
|
+
static const size_t SYCL_POOL_VMM_MAX_SIZE = 1ull << 35; // 32 GB
|
|
1504
|
+
|
|
1505
|
+
int device;
|
|
1506
|
+
sycl::context ctx;
|
|
1507
|
+
sycl::device dev;
|
|
1508
|
+
|
|
1509
|
+
uintptr_t pool_addr = 0;
|
|
1510
|
+
size_t pool_used = 0;
|
|
1511
|
+
size_t pool_size = 0;
|
|
1512
|
+
size_t granularity;
|
|
1513
|
+
|
|
1514
|
+
// physical_mem owns the commits (unlike cuMemMap)
|
|
1515
|
+
struct mapping {
|
|
1516
|
+
sycl::ext::oneapi::experimental::physical_mem phys;
|
|
1517
|
+
void * map_ptr;
|
|
1518
|
+
};
|
|
1519
|
+
std::vector<mapping> mappings;
|
|
1520
|
+
|
|
1521
|
+
explicit ggml_sycl_pool_vmm(queue_ptr qptr_, int device_) :
|
|
1522
|
+
device(device_),
|
|
1523
|
+
ctx(qptr_->get_context()),
|
|
1524
|
+
dev(qptr_->get_device()),
|
|
1525
|
+
granularity(ggml_sycl_info().devices[device_].vmm_granularity) {
|
|
1526
|
+
}
|
|
1527
|
+
|
|
1528
|
+
~ggml_sycl_pool_vmm() {
|
|
1529
|
+
if (pool_addr == 0) {
|
|
1530
|
+
return;
|
|
1531
|
+
}
|
|
1532
|
+
|
|
1533
|
+
// Per spec, unmap must (a) match the exact (ptr, size) of an earlier
|
|
1534
|
+
// physical_mem::map() call and (b) precede destruction of the
|
|
1535
|
+
// physical_mem objects (their dtors won't unmap).
|
|
1536
|
+
for (auto & m : mappings) {
|
|
1537
|
+
SYCL_CHECK(CHECK_TRY_ERROR(sycl::ext::oneapi::experimental::unmap(
|
|
1538
|
+
m.map_ptr, m.phys.size(), ctx)));
|
|
1539
|
+
}
|
|
1540
|
+
SYCL_CHECK(CHECK_TRY_ERROR(sycl::ext::oneapi::experimental::free_virtual_mem(
|
|
1541
|
+
pool_addr, SYCL_POOL_VMM_MAX_SIZE, ctx)));
|
|
1542
|
+
}
|
|
1543
|
+
|
|
1544
|
+
void * alloc(size_t size, size_t * actual_size) override {
|
|
1545
|
+
// round up the allocation size to the alignment to ensure that all allocations are aligned for all data types
|
|
1546
|
+
size = GGML_PAD(size, SYCL_BUFFER_ALIGNMENT);
|
|
1547
|
+
|
|
1548
|
+
size_t avail = pool_size - pool_used;
|
|
1549
|
+
|
|
1550
|
+
if (size > avail) {
|
|
1551
|
+
// round up to the next multiple of the granularity
|
|
1552
|
+
size_t reserve_size = GGML_PAD(size - avail, granularity);
|
|
1553
|
+
|
|
1554
|
+
GGML_ASSERT(pool_size + reserve_size <= SYCL_POOL_VMM_MAX_SIZE);
|
|
1555
|
+
|
|
1556
|
+
// allocate more physical memory
|
|
1557
|
+
std::optional<sycl::ext::oneapi::experimental::physical_mem> phys;
|
|
1558
|
+
SYCL_CHECK(CHECK_TRY_ERROR(phys.emplace(dev, ctx, reserve_size)));
|
|
1559
|
+
|
|
1560
|
+
// reserve virtual address space (if not already reserved)
|
|
1561
|
+
if (pool_addr == 0) {
|
|
1562
|
+
SYCL_CHECK(CHECK_TRY_ERROR(
|
|
1563
|
+
pool_addr = sycl::ext::oneapi::experimental::reserve_virtual_mem(
|
|
1564
|
+
SYCL_POOL_VMM_MAX_SIZE, ctx)));
|
|
1565
|
+
}
|
|
1566
|
+
|
|
1567
|
+
// map at the end of the pool
|
|
1568
|
+
void * map_ptr = nullptr;
|
|
1569
|
+
SYCL_CHECK(CHECK_TRY_ERROR(
|
|
1570
|
+
map_ptr = phys->map(pool_addr + pool_size, reserve_size,
|
|
1571
|
+
sycl::ext::oneapi::experimental::address_access_mode::read_write)));
|
|
1572
|
+
|
|
1573
|
+
// stash these so we could unmap this exact range in dtor
|
|
1574
|
+
mappings.push_back({
|
|
1575
|
+
std::move(*phys),
|
|
1576
|
+
map_ptr,
|
|
1577
|
+
});
|
|
1578
|
+
|
|
1579
|
+
// add to the pool
|
|
1580
|
+
pool_size += reserve_size;
|
|
1581
|
+
|
|
1582
|
+
#ifdef DEBUG_SYCL_MALLOC
|
|
1583
|
+
GGML_LOG_INFO("sycl pool[%d]: size increased to %llu MB (reserved %llu MB)\n",
|
|
1584
|
+
device, (unsigned long long) (pool_size/1024/1024),
|
|
1585
|
+
(unsigned long long) (reserve_size/1024/1024));
|
|
1586
|
+
#endif
|
|
1587
|
+
}
|
|
1588
|
+
|
|
1589
|
+
GGML_ASSERT(pool_addr != 0);
|
|
1590
|
+
|
|
1591
|
+
void * ptr = reinterpret_cast<void *>(pool_addr + pool_used);
|
|
1592
|
+
*actual_size = size;
|
|
1593
|
+
pool_used += size;
|
|
1594
|
+
|
|
1595
|
+
#ifdef DEBUG_SYCL_MALLOC
|
|
1596
|
+
GGML_LOG_INFO("sycl pool[%d]: allocated %llu bytes at %p\n", device, (unsigned long long) size, ptr);
|
|
1597
|
+
#endif
|
|
1598
|
+
|
|
1599
|
+
return ptr;
|
|
1600
|
+
}
|
|
1601
|
+
|
|
1602
|
+
void free(void * ptr, size_t size) override {
|
|
1603
|
+
#ifdef DEBUG_SYCL_MALLOC
|
|
1604
|
+
GGML_LOG_INFO("sycl pool[%d]: freed %llu bytes at %p\n", device, (unsigned long long) size, ptr);
|
|
1605
|
+
#endif
|
|
1606
|
+
|
|
1607
|
+
pool_used -= size;
|
|
1608
|
+
|
|
1609
|
+
// all deallocations must be in reverse order of the allocations
|
|
1610
|
+
GGML_ASSERT(ptr == reinterpret_cast<void *>(pool_addr + pool_used));
|
|
1611
|
+
}
|
|
1612
|
+
};
|
|
1613
|
+
#endif // defined(GGML_SYCL_USE_VMM)
|
|
1614
|
+
|
|
1349
1615
|
struct ggml_sycl_pool_host : public ggml_sycl_pool {
|
|
1350
1616
|
queue_ptr qptr;
|
|
1351
1617
|
int device;
|
|
@@ -1426,15 +1692,18 @@ std::unique_ptr<ggml_sycl_pool> ggml_backend_sycl_context::new_pool_for_host(que
|
|
|
1426
1692
|
}
|
|
1427
1693
|
|
|
1428
1694
|
std::unique_ptr<ggml_sycl_pool> ggml_backend_sycl_context::new_pool_for_device(queue_ptr qptr, int device) {
|
|
1429
|
-
|
|
1430
|
-
|
|
1431
|
-
|
|
1432
|
-
|
|
1433
|
-
|
|
1695
|
+
#if defined(GGML_SYCL_USE_VMM)
|
|
1696
|
+
if (g_ggml_sycl_enable_vmm && ggml_sycl_info().devices[device].vmm) {
|
|
1697
|
+
return std::unique_ptr<ggml_sycl_pool>(new ggml_sycl_pool_vmm(qptr, device));
|
|
1698
|
+
}
|
|
1699
|
+
#endif // defined(GGML_SYCL_USE_VMM)
|
|
1700
|
+
return std::unique_ptr<ggml_sycl_pool>(new ggml_sycl_pool_leg(qptr, device));
|
|
1434
1701
|
}
|
|
1435
1702
|
|
|
1436
|
-
|
|
1437
|
-
|
|
1703
|
+
|
|
1704
|
+
std::unique_ptr<ggml_sycl_fattn_kv_buffers> ggml_backend_sycl_context::new_fattn_kv_buffers(queue_ptr qptr, int device) {
|
|
1705
|
+
return std::unique_ptr<ggml_sycl_fattn_kv_buffers>(new ggml_sycl_fattn_kv_buffers(qptr, device));
|
|
1706
|
+
}
|
|
1438
1707
|
|
|
1439
1708
|
/// kernels
|
|
1440
1709
|
typedef void (*ggml_sycl_op_mul_mat_t)(
|
|
@@ -2156,6 +2425,31 @@ inline void ggml_sycl_op_mul_mat_sycl(
|
|
|
2156
2425
|
#else
|
|
2157
2426
|
bool use_fp16 = false;
|
|
2158
2427
|
#endif
|
|
2428
|
+
|
|
2429
|
+
#if GGML_SYCL_DNNL && defined(GGML_SYCL_HAS_BF16)
|
|
2430
|
+
// Fast path for bf16 src0
|
|
2431
|
+
if (src0->type == GGML_TYPE_BF16 && !g_ggml_sycl_disable_dnn && ggml_is_contiguous(src0) &&
|
|
2432
|
+
row_diff == src0->ne[1]) {
|
|
2433
|
+
using bf16_t = sycl::ext::oneapi::bfloat16;
|
|
2434
|
+
ggml_sycl_pool_alloc<bf16_t> src1_as_bf16(ctx.pool(), src1_ncols*ne10);
|
|
2435
|
+
if (src1->type != GGML_TYPE_BF16) {
|
|
2436
|
+
const to_bf16_sycl_t to_bf16_sycl = ggml_get_to_bf16_sycl(src1->type, dst);
|
|
2437
|
+
GGML_ASSERT(to_bf16_sycl != nullptr);
|
|
2438
|
+
to_bf16_sycl(src1_ddf_i, src1_as_bf16.get(), src1_ncols*ne10, stream);
|
|
2439
|
+
} else {
|
|
2440
|
+
stream->memcpy(src1_as_bf16.get(), src1_ddf_i, src1_ncols*ne10*sizeof(bf16_t));
|
|
2441
|
+
}
|
|
2442
|
+
DnnlGemmWrapper::row_gemm(ctx, row_diff, src1_ncols, ne10,
|
|
2443
|
+
src0_dd_i, DnnlGemmWrapper::to_dt<bf16_t>(),
|
|
2444
|
+
src1_as_bf16.get(), DnnlGemmWrapper::to_dt<bf16_t>(),
|
|
2445
|
+
dst_dd_i, DnnlGemmWrapper::to_dt<float>(), stream);
|
|
2446
|
+
GGML_UNUSED(dst);
|
|
2447
|
+
GGML_UNUSED(src1_ddq_i);
|
|
2448
|
+
GGML_UNUSED(src1_padded_row_size);
|
|
2449
|
+
return;
|
|
2450
|
+
}
|
|
2451
|
+
#endif
|
|
2452
|
+
|
|
2159
2453
|
if ((src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && use_fp16 && ggml_is_contiguous(src0) &&
|
|
2160
2454
|
row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT) {
|
|
2161
2455
|
ggml_sycl_pool_alloc<sycl::half> src0_as_f16(ctx.pool());
|
|
@@ -2233,21 +2527,25 @@ inline void ggml_sycl_op_mul_mat_sycl(
|
|
|
2233
2527
|
const float * src0_ddf_i = src0->type == GGML_TYPE_F32 ? (const float *) src0_dd_i : src0_ddq_as_f32.get();
|
|
2234
2528
|
const float * src1_ddf1_i = src1->type == GGML_TYPE_F32 ? (const float *) src1_ddf_i : src1_ddq_as_f32.get();
|
|
2235
2529
|
|
|
2530
|
+
{
|
|
2531
|
+
const int64_t gemm_flops = (int64_t)row_diff * src1_ncols * ne10;
|
|
2532
|
+
const bool use_mkl_direct = gemm_flops < 256 * 256 * 256;
|
|
2236
2533
|
#if GGML_SYCL_DNNL
|
|
2237
|
-
|
|
2238
|
-
|
|
2239
|
-
|
|
2240
|
-
|
|
2241
|
-
|
|
2242
|
-
|
|
2534
|
+
if (!g_ggml_sycl_disable_dnn && !use_mkl_direct) {
|
|
2535
|
+
DnnlGemmWrapper::row_gemm(ctx, row_diff, src1_ncols, ne10, src0_ddf_i,
|
|
2536
|
+
DnnlGemmWrapper::to_dt<float>(), src1_ddf1_i, DnnlGemmWrapper::to_dt<float>(),
|
|
2537
|
+
dst_dd_i, DnnlGemmWrapper::to_dt<float>(), stream);
|
|
2538
|
+
}
|
|
2539
|
+
else
|
|
2243
2540
|
#endif
|
|
2244
|
-
|
|
2245
|
-
|
|
2246
|
-
|
|
2247
|
-
|
|
2248
|
-
|
|
2249
|
-
|
|
2250
|
-
|
|
2541
|
+
{
|
|
2542
|
+
const float alpha = 1.0f;
|
|
2543
|
+
const float beta = 0.0f;
|
|
2544
|
+
SYCL_CHECK(CHECK_TRY_ERROR(oneapi::mkl::blas::column_major::gemm(
|
|
2545
|
+
*stream, oneapi::mkl::transpose::trans, oneapi::mkl::transpose::nontrans, row_diff,
|
|
2546
|
+
src1_ncols, ne10, dpct::get_value(&alpha, *stream), src0_ddf_i, ne00, src1_ddf1_i, ne10,
|
|
2547
|
+
dpct::get_value(&beta, *stream), dst_dd_i, ldc)));
|
|
2548
|
+
}
|
|
2251
2549
|
}
|
|
2252
2550
|
}
|
|
2253
2551
|
GGML_UNUSED(dst);
|
|
@@ -3249,8 +3547,11 @@ inline bool ggml_sycl_supports_mmq(enum ggml_type type) {
|
|
|
3249
3547
|
inline bool ggml_sycl_supports_reorder_mul_mat_sycl(enum ggml_type type) {
|
|
3250
3548
|
switch (type) {
|
|
3251
3549
|
case GGML_TYPE_Q4_0:
|
|
3550
|
+
case GGML_TYPE_Q8_0:
|
|
3252
3551
|
return true;
|
|
3552
|
+
case GGML_TYPE_Q3_K:
|
|
3253
3553
|
case GGML_TYPE_Q4_K:
|
|
3554
|
+
case GGML_TYPE_Q5_K:
|
|
3254
3555
|
case GGML_TYPE_Q6_K:
|
|
3255
3556
|
return !g_ggml_sycl_prioritize_dmmv;
|
|
3256
3557
|
default:
|
|
@@ -3261,6 +3562,7 @@ inline bool ggml_sycl_supports_reorder_mul_mat_sycl(enum ggml_type type) {
|
|
|
3261
3562
|
inline bool ggml_sycl_supports_reorder_dmmv(enum ggml_type type) {
|
|
3262
3563
|
switch (type) {
|
|
3263
3564
|
case GGML_TYPE_Q4_0:
|
|
3565
|
+
case GGML_TYPE_Q8_0:
|
|
3264
3566
|
return true;
|
|
3265
3567
|
default:
|
|
3266
3568
|
return false;
|
|
@@ -3270,7 +3572,10 @@ inline bool ggml_sycl_supports_reorder_dmmv(enum ggml_type type) {
|
|
|
3270
3572
|
inline bool ggml_sycl_supports_reorder_mmvq(enum ggml_type type) {
|
|
3271
3573
|
switch (type) {
|
|
3272
3574
|
case GGML_TYPE_Q4_0:
|
|
3575
|
+
case GGML_TYPE_Q8_0:
|
|
3576
|
+
case GGML_TYPE_Q3_K:
|
|
3273
3577
|
case GGML_TYPE_Q4_K:
|
|
3578
|
+
case GGML_TYPE_Q5_K:
|
|
3274
3579
|
case GGML_TYPE_Q6_K:
|
|
3275
3580
|
return true;
|
|
3276
3581
|
default:
|
|
@@ -3291,6 +3596,7 @@ static bool ggml_sycl_supports_dmmv(enum ggml_type type) {
|
|
|
3291
3596
|
case GGML_TYPE_Q5_K:
|
|
3292
3597
|
case GGML_TYPE_Q6_K:
|
|
3293
3598
|
case GGML_TYPE_F16:
|
|
3599
|
+
case GGML_TYPE_BF16:
|
|
3294
3600
|
return true;
|
|
3295
3601
|
default:
|
|
3296
3602
|
return false;
|
|
@@ -3308,7 +3614,7 @@ static inline void * sycl_ext_malloc_device(dpct::queue_ptr stream, size_t size)
|
|
|
3308
3614
|
// If async allocation extension is not available, use_async should always be false.
|
|
3309
3615
|
GGML_ASSERT(!use_async);
|
|
3310
3616
|
#endif
|
|
3311
|
-
return
|
|
3617
|
+
return ggml_sycl_malloc_device(size, *stream);
|
|
3312
3618
|
}
|
|
3313
3619
|
|
|
3314
3620
|
static inline void sycl_ext_free(dpct::queue_ptr stream, void * ptr) {
|
|
@@ -3322,12 +3628,58 @@ static inline void sycl_ext_free(dpct::queue_ptr stream, void * ptr) {
|
|
|
3322
3628
|
// If async allocation extension is not available, use_async should always be false.
|
|
3323
3629
|
GGML_ASSERT(!use_async);
|
|
3324
3630
|
#endif
|
|
3325
|
-
|
|
3631
|
+
ggml_sycl_free_device(ptr, *stream);
|
|
3326
3632
|
}
|
|
3327
3633
|
|
|
3328
|
-
|
|
3634
|
+
// RAII wrapper for temporary reorder buffers with optional host memory fallback.
|
|
3635
|
+
// When device allocation fails and GGML_SYCL_HOST_MEM_FALLBACK is enabled,
|
|
3636
|
+
// falls back to host memory so the reorder kernel can still run (over PCIe).
|
|
3637
|
+
// Device access to host memory requires Linux kernel 6.8+ (Ubuntu 26.04+).
|
|
3638
|
+
struct sycl_reorder_temp_buffer {
|
|
3639
|
+
void * ptr = nullptr;
|
|
3640
|
+
dpct::queue_ptr stream;
|
|
3641
|
+
|
|
3642
|
+
sycl_reorder_temp_buffer(dpct::queue_ptr stream, size_t size) : stream(stream) {
|
|
3643
|
+
ptr = sycl_ext_malloc_device(stream, size);
|
|
3644
|
+
#ifdef GGML_SYCL_HOST_MEM_FALLBACK
|
|
3645
|
+
if (!ptr) {
|
|
3646
|
+
ptr = sycl::malloc_host(size, *stream);
|
|
3647
|
+
if (ptr) {
|
|
3648
|
+
host_fallback = true;
|
|
3649
|
+
GGML_LOG_WARN("%s: device alloc of %zu bytes failed, using host memory fallback\n", __func__, size);
|
|
3650
|
+
}
|
|
3651
|
+
}
|
|
3652
|
+
#endif
|
|
3653
|
+
}
|
|
3654
|
+
|
|
3655
|
+
~sycl_reorder_temp_buffer() {
|
|
3656
|
+
if (!ptr) {
|
|
3657
|
+
return;
|
|
3658
|
+
}
|
|
3659
|
+
if (host_fallback) {
|
|
3660
|
+
sycl::free(ptr, *stream);
|
|
3661
|
+
} else {
|
|
3662
|
+
sycl_ext_free(stream, ptr);
|
|
3663
|
+
}
|
|
3664
|
+
}
|
|
3665
|
+
|
|
3666
|
+
explicit operator bool() const { return ptr != nullptr; }
|
|
3667
|
+
|
|
3668
|
+
sycl_reorder_temp_buffer(const sycl_reorder_temp_buffer &) = delete;
|
|
3669
|
+
sycl_reorder_temp_buffer & operator=(const sycl_reorder_temp_buffer &) = delete;
|
|
3670
|
+
|
|
3671
|
+
private:
|
|
3672
|
+
bool host_fallback = false;
|
|
3673
|
+
};
|
|
3674
|
+
|
|
3675
|
+
static bool reorder_qw_q4_0(uint8_t * data_device, const int ncols, const int nrows, size_t size, size_t offset,
|
|
3329
3676
|
dpct::queue_ptr stream) {
|
|
3330
|
-
|
|
3677
|
+
sycl_reorder_temp_buffer tmp(stream, size);
|
|
3678
|
+
if (!tmp) {
|
|
3679
|
+
GGML_LOG_WARN("%s: failed to allocate %zu bytes for reorder temp buffer, skipping reorder\n", __func__, size);
|
|
3680
|
+
return false;
|
|
3681
|
+
}
|
|
3682
|
+
uint8_t * tmp_buf = static_cast<uint8_t *>(tmp.ptr);
|
|
3331
3683
|
|
|
3332
3684
|
sycl::event copy_event;
|
|
3333
3685
|
SYCL_CHECK(CHECK_TRY_ERROR(copy_event = stream->memcpy(tmp_buf, data_device, size)));
|
|
@@ -3356,16 +3708,60 @@ static void reorder_qw_q4_0(uint8_t * data_device, const int ncols, const int nr
|
|
|
3356
3708
|
if (!g_ggml_sycl_use_async_mem_op) {
|
|
3357
3709
|
reorder_event.wait_and_throw();
|
|
3358
3710
|
}
|
|
3359
|
-
|
|
3711
|
+
return true;
|
|
3712
|
+
}
|
|
3713
|
+
|
|
3714
|
+
static bool reorder_qw_q8_0(uint8_t * data_device, const int ncols, const int nrows, size_t size, size_t offset,
|
|
3715
|
+
dpct::queue_ptr stream) {
|
|
3716
|
+
sycl_reorder_temp_buffer tmp(stream, size);
|
|
3717
|
+
if (!tmp) {
|
|
3718
|
+
GGML_LOG_WARN("%s: failed to allocate %zu bytes for reorder temp buffer, skipping reorder\n", __func__, size);
|
|
3719
|
+
return false;
|
|
3720
|
+
}
|
|
3721
|
+
uint8_t * tmp_buf = static_cast<uint8_t *>(tmp.ptr);
|
|
3722
|
+
|
|
3723
|
+
sycl::event copy_event;
|
|
3724
|
+
SYCL_CHECK(CHECK_TRY_ERROR(copy_event = stream->memcpy(tmp_buf, data_device, size)));
|
|
3725
|
+
if (!g_ggml_sycl_use_async_mem_op) {
|
|
3726
|
+
copy_event.wait();
|
|
3727
|
+
}
|
|
3728
|
+
|
|
3729
|
+
GGML_ASSERT((size % sizeof(block_q8_0) == 0));
|
|
3730
|
+
GGML_ASSERT((offset % sizeof(block_q8_0) == 0));
|
|
3731
|
+
int offset_blks = offset / sizeof(block_q8_0);
|
|
3732
|
+
auto qs_ptr = data_device + offset_blks * QK8_0;
|
|
3733
|
+
auto d_ptr = (sycl::half*)(qs_ptr + ncols * nrows) + offset_blks;
|
|
3734
|
+
|
|
3735
|
+
auto reorder_event = stream->parallel_for(
|
|
3736
|
+
size / sizeof(block_q8_0),
|
|
3737
|
+
[=](auto i) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
|
3738
|
+
const block_q8_0* x = (const block_q8_0*)tmp_buf;
|
|
3739
|
+
const int ib = i;
|
|
3740
|
+
|
|
3741
|
+
for (int j = 0; j < QK8_0; j++)
|
|
3742
|
+
{
|
|
3743
|
+
*((int8_t*)qs_ptr + ib * QK8_0 + j) = x[ib].qs[j];
|
|
3744
|
+
}
|
|
3745
|
+
*(d_ptr + ib) = x[ib].d;
|
|
3746
|
+
});
|
|
3747
|
+
if (!g_ggml_sycl_use_async_mem_op) {
|
|
3748
|
+
reorder_event.wait_and_throw();
|
|
3749
|
+
}
|
|
3750
|
+
return true;
|
|
3360
3751
|
}
|
|
3361
3752
|
|
|
3362
|
-
static
|
|
3753
|
+
static bool reorder_qw_q4_k(uint8_t * data_device, size_t size, size_t offset, dpct::queue_ptr stream) {
|
|
3363
3754
|
GGML_ASSERT(size % sizeof(block_q4_K) == 0);
|
|
3364
3755
|
GGML_ASSERT(offset % sizeof(block_q4_K) == 0);
|
|
3365
3756
|
|
|
3366
3757
|
const int nblocks = size / sizeof(block_q4_K);
|
|
3367
3758
|
|
|
3368
|
-
|
|
3759
|
+
sycl_reorder_temp_buffer tmp(stream, size);
|
|
3760
|
+
if (!tmp) {
|
|
3761
|
+
GGML_LOG_WARN("%s: failed to allocate %zu bytes for reorder temp buffer, skipping reorder\n", __func__, size);
|
|
3762
|
+
return false;
|
|
3763
|
+
}
|
|
3764
|
+
uint8_t * tmp_buf = static_cast<uint8_t *>(tmp.ptr);
|
|
3369
3765
|
|
|
3370
3766
|
sycl::event copy_event;
|
|
3371
3767
|
SYCL_CHECK(CHECK_TRY_ERROR(copy_event = stream->memcpy(tmp_buf, data_device, size)));
|
|
@@ -3394,16 +3790,117 @@ static void reorder_qw_q4_k(uint8_t * data_device, size_t size, size_t offset, d
|
|
|
3394
3790
|
if (!g_ggml_sycl_use_async_mem_op) {
|
|
3395
3791
|
reorder_event.wait_and_throw();
|
|
3396
3792
|
}
|
|
3397
|
-
|
|
3793
|
+
return true;
|
|
3794
|
+
}
|
|
3795
|
+
|
|
3796
|
+
static bool reorder_qw_q3_k(uint8_t * data_device, size_t size, size_t offset, dpct::queue_ptr stream) {
|
|
3797
|
+
GGML_ASSERT(size % sizeof(block_q3_K) == 0);
|
|
3798
|
+
GGML_ASSERT(offset % sizeof(block_q3_K) == 0);
|
|
3799
|
+
|
|
3800
|
+
const int nblocks = size / sizeof(block_q3_K);
|
|
3801
|
+
|
|
3802
|
+
sycl_reorder_temp_buffer tmp(stream, size);
|
|
3803
|
+
if (!tmp) {
|
|
3804
|
+
GGML_LOG_WARN("%s: failed to allocate %zu bytes for reorder temp buffer, skipping reorder\n", __func__, size);
|
|
3805
|
+
return false;
|
|
3806
|
+
}
|
|
3807
|
+
uint8_t * tmp_buf = static_cast<uint8_t *>(tmp.ptr);
|
|
3808
|
+
|
|
3809
|
+
sycl::event copy_event;
|
|
3810
|
+
SYCL_CHECK(CHECK_TRY_ERROR(copy_event = stream->memcpy(tmp_buf, data_device, size)));
|
|
3811
|
+
if (!g_ggml_sycl_use_async_mem_op) {
|
|
3812
|
+
copy_event.wait();
|
|
3813
|
+
}
|
|
3814
|
+
|
|
3815
|
+
auto * qs_ptr = data_device;
|
|
3816
|
+
auto * hmask_ptr = qs_ptr + (QK_K / 4) * nblocks;
|
|
3817
|
+
auto * scales_ptr = hmask_ptr + (QK_K / 8) * nblocks;
|
|
3818
|
+
sycl::half * d_ptr = (sycl::half *) (scales_ptr + 12 * nblocks);
|
|
3819
|
+
|
|
3820
|
+
auto reorder_event = stream->parallel_for(nblocks, [=](auto i) {
|
|
3821
|
+
const block_q3_K * x = (const block_q3_K *) tmp_buf;
|
|
3822
|
+
const int ib = i;
|
|
3823
|
+
|
|
3824
|
+
for (int j = 0; j < QK_K / 4; ++j) {
|
|
3825
|
+
qs_ptr[ib * (QK_K / 4) + j] = x[ib].qs[j];
|
|
3826
|
+
}
|
|
3827
|
+
|
|
3828
|
+
for (int j = 0; j < QK_K / 8; ++j) {
|
|
3829
|
+
hmask_ptr[ib * (QK_K / 8) + j] = x[ib].hmask[j];
|
|
3830
|
+
}
|
|
3831
|
+
|
|
3832
|
+
for (int j = 0; j < 12; ++j) {
|
|
3833
|
+
scales_ptr[ib * 12 + j] = x[ib].scales[j];
|
|
3834
|
+
}
|
|
3835
|
+
|
|
3836
|
+
d_ptr[ib] = x[ib].d;
|
|
3837
|
+
});
|
|
3838
|
+
if (!g_ggml_sycl_use_async_mem_op) {
|
|
3839
|
+
reorder_event.wait_and_throw();
|
|
3840
|
+
}
|
|
3841
|
+
return true;
|
|
3398
3842
|
}
|
|
3399
3843
|
|
|
3400
|
-
static
|
|
3844
|
+
static bool reorder_qw_q5_k(uint8_t * data_device, size_t size, size_t offset, dpct::queue_ptr stream) {
|
|
3845
|
+
GGML_ASSERT(size % sizeof(block_q5_K) == 0);
|
|
3846
|
+
GGML_ASSERT(offset % sizeof(block_q5_K) == 0);
|
|
3847
|
+
|
|
3848
|
+
const int nblocks = size / sizeof(block_q5_K);
|
|
3849
|
+
|
|
3850
|
+
sycl_reorder_temp_buffer tmp(stream, size);
|
|
3851
|
+
if (!tmp) {
|
|
3852
|
+
GGML_LOG_WARN("%s: failed to allocate %zu bytes for reorder temp buffer, skipping reorder\n", __func__, size);
|
|
3853
|
+
return false;
|
|
3854
|
+
}
|
|
3855
|
+
uint8_t * tmp_buf = static_cast<uint8_t *>(tmp.ptr);
|
|
3856
|
+
|
|
3857
|
+
sycl::event copy_event;
|
|
3858
|
+
SYCL_CHECK(CHECK_TRY_ERROR(copy_event = stream->memcpy(tmp_buf, data_device, size)));
|
|
3859
|
+
if (!g_ggml_sycl_use_async_mem_op) {
|
|
3860
|
+
copy_event.wait();
|
|
3861
|
+
}
|
|
3862
|
+
|
|
3863
|
+
auto * qs_ptr = data_device;
|
|
3864
|
+
auto * qh_ptr = qs_ptr + (QK_K / 2) * nblocks;
|
|
3865
|
+
auto * scales_ptr = qh_ptr + (QK_K / 8) * nblocks;
|
|
3866
|
+
auto * dm_ptr = (sycl::half2 *) (scales_ptr + K_SCALE_SIZE * nblocks);
|
|
3867
|
+
|
|
3868
|
+
auto reorder_event = stream->parallel_for(nblocks, [=](auto i) {
|
|
3869
|
+
const block_q5_K * x = (const block_q5_K *) tmp_buf;
|
|
3870
|
+
const int ib = i;
|
|
3871
|
+
|
|
3872
|
+
for (int j = 0; j < QK_K / 2; ++j) {
|
|
3873
|
+
qs_ptr[ib * (QK_K / 2) + j] = x[ib].qs[j];
|
|
3874
|
+
}
|
|
3875
|
+
|
|
3876
|
+
for (int j = 0; j < QK_K / 8; ++j) {
|
|
3877
|
+
qh_ptr[ib * (QK_K / 8) + j] = x[ib].qh[j];
|
|
3878
|
+
}
|
|
3879
|
+
|
|
3880
|
+
for (int j = 0; j < K_SCALE_SIZE; ++j) {
|
|
3881
|
+
scales_ptr[ib * K_SCALE_SIZE + j] = x[ib].scales[j];
|
|
3882
|
+
}
|
|
3883
|
+
|
|
3884
|
+
dm_ptr[ib] = x[ib].dm;
|
|
3885
|
+
});
|
|
3886
|
+
if (!g_ggml_sycl_use_async_mem_op) {
|
|
3887
|
+
reorder_event.wait_and_throw();
|
|
3888
|
+
}
|
|
3889
|
+
return true;
|
|
3890
|
+
}
|
|
3891
|
+
|
|
3892
|
+
static bool reorder_qw_q6_k(uint8_t * data_device, size_t size, size_t offset, dpct::queue_ptr stream) {
|
|
3401
3893
|
GGML_ASSERT(size % sizeof(block_q6_K) == 0);
|
|
3402
3894
|
GGML_ASSERT(offset % sizeof(block_q6_K) == 0);
|
|
3403
3895
|
|
|
3404
3896
|
const int nblocks = size / sizeof(block_q6_K);
|
|
3405
3897
|
|
|
3406
|
-
|
|
3898
|
+
sycl_reorder_temp_buffer tmp(stream, size);
|
|
3899
|
+
if (!tmp) {
|
|
3900
|
+
GGML_LOG_WARN("%s: failed to allocate %zu bytes for reorder temp buffer, skipping reorder\n", __func__, size);
|
|
3901
|
+
return false;
|
|
3902
|
+
}
|
|
3903
|
+
uint8_t * tmp_buf = static_cast<uint8_t *>(tmp.ptr);
|
|
3407
3904
|
|
|
3408
3905
|
sycl::event copy_event;
|
|
3409
3906
|
SYCL_CHECK(CHECK_TRY_ERROR(copy_event = stream->memcpy(tmp_buf, data_device, size)));
|
|
@@ -3442,10 +3939,10 @@ static void reorder_qw_q6_k(uint8_t * data_device, size_t size, size_t offset, d
|
|
|
3442
3939
|
if (!g_ggml_sycl_use_async_mem_op) {
|
|
3443
3940
|
reorder_event.wait_and_throw();
|
|
3444
3941
|
}
|
|
3445
|
-
|
|
3942
|
+
return true;
|
|
3446
3943
|
}
|
|
3447
3944
|
|
|
3448
|
-
static
|
|
3945
|
+
static bool reorder_qw(const ggml_tensor * src0, dpct::queue_ptr stream) {
|
|
3449
3946
|
uint8_t * data_device = (uint8_t *) src0->data;
|
|
3450
3947
|
size_t ncols = src0->ne[0];
|
|
3451
3948
|
size_t nrows = src0->ne[1];
|
|
@@ -3453,17 +3950,20 @@ static void reorder_qw(const ggml_tensor * src0, dpct::queue_ptr stream) {
|
|
|
3453
3950
|
|
|
3454
3951
|
switch (src0->type) {
|
|
3455
3952
|
case GGML_TYPE_Q4_0:
|
|
3456
|
-
reorder_qw_q4_0(data_device, ncols, nrows, size, 0, stream);
|
|
3457
|
-
|
|
3953
|
+
return reorder_qw_q4_0(data_device, ncols, nrows, size, 0, stream);
|
|
3954
|
+
case GGML_TYPE_Q8_0:
|
|
3955
|
+
return reorder_qw_q8_0(data_device, ncols, nrows, size, 0, stream);
|
|
3956
|
+
case GGML_TYPE_Q3_K:
|
|
3957
|
+
return reorder_qw_q3_k(data_device, size, 0, stream);
|
|
3458
3958
|
case GGML_TYPE_Q4_K:
|
|
3459
|
-
reorder_qw_q4_k(data_device, size, 0, stream);
|
|
3460
|
-
|
|
3959
|
+
return reorder_qw_q4_k(data_device, size, 0, stream);
|
|
3960
|
+
case GGML_TYPE_Q5_K:
|
|
3961
|
+
return reorder_qw_q5_k(data_device, size, 0, stream);
|
|
3461
3962
|
case GGML_TYPE_Q6_K:
|
|
3462
|
-
reorder_qw_q6_k(data_device, size, 0, stream);
|
|
3463
|
-
break;
|
|
3963
|
+
return reorder_qw_q6_k(data_device, size, 0, stream);
|
|
3464
3964
|
default:
|
|
3465
3965
|
GGML_ABORT("reorder_qw() called with unsupported type");
|
|
3466
|
-
|
|
3966
|
+
return false;
|
|
3467
3967
|
}
|
|
3468
3968
|
}
|
|
3469
3969
|
|
|
@@ -3471,7 +3971,9 @@ static bool should_reorder_tensor(ggml_backend_sycl_context& ctx, const ggml_ten
|
|
|
3471
3971
|
return !g_ggml_sycl_disable_optimize && //allow optimize, controlled by $GGML_SYCL_DISABLE_OPT
|
|
3472
3972
|
ctx.opt_feature.reorder && //allow this device due to good perf, skip the devices with bad perf.
|
|
3473
3973
|
dst->op == GGML_OP_MUL_MAT && //limit to some supported cases of Q4_0, to do for more cases.
|
|
3474
|
-
|
|
3974
|
+
// ne[1] <= 8 so multi-column decode (spec / MTP verify) also bootstraps the reorder;
|
|
3975
|
+
// all reorderable types have a _switch_ncols kernel.
|
|
3976
|
+
dst->src[1]->ne[1] <= 8 && dst->src[1]->ne[2]==1 && dst->src[1]->ne[3]==1;
|
|
3475
3977
|
}
|
|
3476
3978
|
|
|
3477
3979
|
static void opt_for_reorder(ggml_backend_sycl_context * ctx, const ggml_tensor * src0, const ggml_tensor * /* src1 */,
|
|
@@ -3503,14 +4005,20 @@ static void opt_for_reorder(ggml_backend_sycl_context * ctx, const ggml_tensor *
|
|
|
3503
4005
|
break;
|
|
3504
4006
|
}
|
|
3505
4007
|
|
|
3506
|
-
reorder_qw(src0, ctx->stream())
|
|
3507
|
-
|
|
4008
|
+
if (reorder_qw(src0, ctx->stream())) {
|
|
4009
|
+
extra->optimized_feature.reorder = true; // Used to decode/dequan in next steps and avoid re-reordering
|
|
4010
|
+
}
|
|
3508
4011
|
}
|
|
3509
4012
|
|
|
3510
4013
|
|
|
3511
4014
|
// Decides whether the dequantize-mul-mat-vec (DMMV) path may be used for this mat-mul.
// True only when ggml_sycl_supports_dmmv() accepts src0's type, src1 and dst are both F32,
// src0->ne[0] is a multiple of the granularity computed below, and src1->ne[1] == 1.
// NOTE(review): this span is a rendered diff hunk; the line ending in "src0->ne[0] %" is the
// removed (old) side and was truncated by the renderer.
static bool can_use_dequantize_mul_mat_vec(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
|
4015
|
+
// The F16/BF16 qk=1 kernel iterates with stride 2*DMMV_X, requiring ne[0] to be
|
|
4016
|
+
// a multiple of 2*DMMV_X. Quantized types use block-structured kernels that only
|
|
4017
|
+
// need ne[0] % DMMV_X == 0.
|
|
4018
|
+
const int64_t dmmv_x_required = (src0->type == GGML_TYPE_BF16 || src0->type == GGML_TYPE_F16) ?
|
|
4019
|
+
2*GGML_SYCL_DMMV_X : GGML_SYCL_DMMV_X;
|
|
3512
4020
|
return ggml_sycl_supports_dmmv(src0->type) && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 &&
|
|
3513
|
-
src0->ne[0] %
|
|
4021
|
+
src0->ne[0] % dmmv_x_required == 0 && src1->ne[1] == 1;
|
|
3514
4022
|
}
|
|
3515
4023
|
|
|
3516
4024
|
static bool can_use_mul_mat_vec_q(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
|
@@ -3560,9 +4068,16 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor
|
|
|
3560
4068
|
// Dispatch becomes obscure with the reorder, MMVQ when the reorder optimization
|
|
3561
4069
|
// is enabled takes precedence over DMMV, the current if-else implementation
|
|
3562
4070
|
// requires disabling DMMV if both conditions are met
|
|
4071
|
+
|
|
3563
4072
|
if (!g_ggml_sycl_prioritize_dmmv && ((should_reorder_tensor(ctx, dst) &&
|
|
3564
4073
|
ggml_sycl_supports_reorder_mmvq(src0->type)))) {
|
|
3565
|
-
|
|
4074
|
+
// Arc770 benefits from skipping this override for Q4_0.
|
|
4075
|
+
if (!(ggml_sycl_info().devices[ctx.device].hw_info.arch ==
|
|
4076
|
+
gpu_arch::intel_gpu_acm_g10 &&
|
|
4077
|
+
src0->type == GGML_TYPE_Q4_0)) {
|
|
4078
|
+
use_dequantize_mul_mat_vec =
|
|
4079
|
+
use_dequantize_mul_mat_vec && !use_mul_mat_vec_q;
|
|
4080
|
+
}
|
|
3566
4081
|
}
|
|
3567
4082
|
|
|
3568
4083
|
if (!split && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
|
|
@@ -3607,35 +4122,17 @@ struct mmid_row_mapping {
|
|
|
3607
4122
|
|
|
3608
4123
|
__dpct_inline__ static void k_copy_src1_to_contiguous(
|
|
3609
4124
|
const char *__restrict__ src1_original, char *__restrict__ src1_contiguous,
|
|
3610
|
-
|
|
3611
|
-
const char *__restrict ids, int64_t i02, size_t ids_nb1, size_t ids_nb0,
|
|
4125
|
+
const mmid_row_mapping *__restrict__ row_mapping,
|
|
3612
4126
|
int64_t ne11, int64_t ne10, size_t nb11, size_t nb12,
|
|
3613
|
-
const sycl::nd_item<3> &item_ct1
|
|
3614
|
-
int32_t
|
|
3615
|
-
int32_t id = item_ct1.get_group(1);
|
|
4127
|
+
const sycl::nd_item<3> &item_ct1) {
|
|
4128
|
+
const int32_t src1_row = item_ct1.get_group(2);
|
|
3616
4129
|
|
|
3617
|
-
const int32_t
|
|
3618
|
-
|
|
3619
|
-
if (row_id_i != i02) {
|
|
3620
|
-
return;
|
|
3621
|
-
}
|
|
4130
|
+
const int32_t iid1 = row_mapping[src1_row].i2;
|
|
4131
|
+
const int32_t id = row_mapping[src1_row].i1;
|
|
3622
4132
|
|
|
3623
4133
|
const int64_t i11 = id % ne11;
|
|
3624
4134
|
const int64_t i12 = iid1;
|
|
3625
4135
|
|
|
3626
|
-
if (item_ct1.get_local_id(2) == 0) {
|
|
3627
|
-
src1_row =
|
|
3628
|
-
dpct::atomic_fetch_add<sycl::access::address_space::generic_space>(
|
|
3629
|
-
cur_src1_row, 1);
|
|
3630
|
-
row_mapping[src1_row] = {id, iid1};
|
|
3631
|
-
}
|
|
3632
|
-
/*
|
|
3633
|
-
DPCT1065:194: Consider replacing sycl::nd_item::barrier() with
|
|
3634
|
-
sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better
|
|
3635
|
-
performance if there is no access to global memory.
|
|
3636
|
-
*/
|
|
3637
|
-
item_ct1.barrier();
|
|
3638
|
-
|
|
3639
4136
|
const float * src1_row_original = (const float *)(src1_original + i11*nb11 + i12*nb12);
|
|
3640
4137
|
float * src1_row_contiguous = (float *)(src1_contiguous + src1_row*nb11);
|
|
3641
4138
|
|
|
@@ -3665,6 +4162,92 @@ __dpct_inline__ static void k_copy_dst_from_contiguous(
|
|
|
3665
4162
|
}
|
|
3666
4163
|
}
|
|
3667
4164
|
|
|
4165
|
+
// Fused MoE TG fast path. Returns false to fall back to the per-expert loop below.
|
|
4166
|
+
static bool ggml_sycl_mul_mat_id_mmvq_fused(
|
|
4167
|
+
ggml_backend_sycl_context & ctx, const ggml_tensor * src0,
|
|
4168
|
+
const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst)
|
|
4169
|
+
{
|
|
4170
|
+
const int64_t ne10 = src1->ne[0];
|
|
4171
|
+
const int64_t ne11 = src1->ne[1];
|
|
4172
|
+
const int64_t ne12 = src1->ne[2];
|
|
4173
|
+
if (ne12 != 1) return false;
|
|
4174
|
+
if (src1->type != GGML_TYPE_F32 || dst->type != GGML_TYPE_F32) return false;
|
|
4175
|
+
if (ne10 != src0->ne[0] || ne10 % QK8_1 != 0) return false;
|
|
4176
|
+
if (!ggml_is_contiguous(src1)) return false;
|
|
4177
|
+
|
|
4178
|
+
// Reorder layout not supported; fall back.
|
|
4179
|
+
const ggml_tensor_extra_gpu * src0_extra =
|
|
4180
|
+
static_cast<const ggml_tensor_extra_gpu *>(src0->extra);
|
|
4181
|
+
if (src0_extra && src0_extra->optimized_feature.reorder) return false;
|
|
4182
|
+
|
|
4183
|
+
const int64_t n_ids_per_group = ids->ne[0];
|
|
4184
|
+
if (ids->ne[1] != 1) return false;
|
|
4185
|
+
if (ne11 != 1 && ne11 != n_ids_per_group) return false;
|
|
4186
|
+
|
|
4187
|
+
const queue_ptr stream = ctx.stream();
|
|
4188
|
+
const int src1_padded_cols = GGML_PAD((int) ne10, MATRIX_ROW_PADDING);
|
|
4189
|
+
const int n_experts_used = (int) n_ids_per_group;
|
|
4190
|
+
const int nrows = (int) src0->ne[1];
|
|
4191
|
+
|
|
4192
|
+
ggml_sycl_pool_alloc<char> src1_q8_alloc(ctx.pool(),
|
|
4193
|
+
(size_t) ne11 * src1_padded_cols * sizeof(block_q8_1) / QK8_1);
|
|
4194
|
+
char * src1_ddq = src1_q8_alloc.get();
|
|
4195
|
+
quantize_row_q8_1_sycl<quantize_q8_1>(
|
|
4196
|
+
(const float *) src1->data, src1_ddq, (int) ne10, (int) ne11,
|
|
4197
|
+
src1_padded_cols, stream);
|
|
4198
|
+
|
|
4199
|
+
const size_t bytes_per_qrow = (size_t) src1_padded_cols * sizeof(block_q8_1) / QK8_1;
|
|
4200
|
+
const size_t src1_row_stride = (ne11 == 1) ? 0 : bytes_per_qrow;
|
|
4201
|
+
|
|
4202
|
+
return ggml_sycl_mul_mat_vec_q_id(
|
|
4203
|
+
src0->type, src0->data, src1_ddq, (const int32_t *) ids->data,
|
|
4204
|
+
(float *) dst->data, (int) ne10, nrows, n_experts_used,
|
|
4205
|
+
/*expert_weight_stride=*/ src0->nb[2],
|
|
4206
|
+
/*dst_row_stride=*/ dst->nb[1],
|
|
4207
|
+
src1_row_stride, stream);
|
|
4208
|
+
}
|
|
4209
|
+
|
|
4210
|
+
// counting sort of the routed rows by expert id (row_id_i, as chosen by the router):
|
|
4211
|
+
// builds a projection of a memory layout where each expert's slice is contiguous
|
|
4212
|
+
static void mmid_counting_sort_rows(
|
|
4213
|
+
const ggml_tensor * ids, const char * ids_host,
|
|
4214
|
+
int64_t n_ids, int64_t n_as, int64_t n_routed_rows,
|
|
4215
|
+
std::vector<int64_t> & expert_counts,
|
|
4216
|
+
std::vector<int64_t> & expert_row_offsets,
|
|
4217
|
+
std::vector<mmid_row_mapping> & routed_row_src) {
|
|
4218
|
+
|
|
4219
|
+
// frequencies: how many routed rows each expert "owns"
|
|
4220
|
+
expert_counts.assign(n_as, 0);
|
|
4221
|
+
for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) {
|
|
4222
|
+
for (int64_t id = 0; id < n_ids; id++) {
|
|
4223
|
+
const int32_t row_id_i = *(const int32_t *) (ids_host + iid1*ids->nb[1] + id*ids->nb[0]);
|
|
4224
|
+
GGML_ASSERT(row_id_i >= 0 && row_id_i < n_as);
|
|
4225
|
+
expert_counts[row_id_i]++;
|
|
4226
|
+
}
|
|
4227
|
+
}
|
|
4228
|
+
|
|
4229
|
+
// where each expert's slice starts (row indices) and the previous ends
|
|
4230
|
+
expert_row_offsets.assign(n_as + 1, 0);
|
|
4231
|
+
for (int64_t i02 = 0; i02 < n_as; i02++) {
|
|
4232
|
+
expert_row_offsets[i02 + 1] = expert_row_offsets[i02] + expert_counts[i02];
|
|
4233
|
+
}
|
|
4234
|
+
|
|
4235
|
+
std::vector<int64_t> expert_row_next = expert_row_offsets;
|
|
4236
|
+
routed_row_src.resize(n_routed_rows);
|
|
4237
|
+
for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) {
|
|
4238
|
+
for (int64_t id = 0; id < n_ids; id++) {
|
|
4239
|
+
const int32_t row_id_i = *(const int32_t *) (ids_host + iid1*ids->nb[1] + id*ids->nb[0]);
|
|
4240
|
+
GGML_ASSERT(row_id_i >= 0 && row_id_i < n_as);
|
|
4241
|
+
|
|
4242
|
+
// find and validate the next free row for a given expert (row_id_i)
|
|
4243
|
+
const int64_t routed_row = expert_row_next[row_id_i]++;
|
|
4244
|
+
GGML_ASSERT(routed_row >= expert_row_offsets[row_id_i]);
|
|
4245
|
+
GGML_ASSERT(routed_row < expert_row_offsets[row_id_i + 1]);
|
|
4246
|
+
routed_row_src[routed_row] = {(int32_t) id, (int32_t) iid1};
|
|
4247
|
+
}
|
|
4248
|
+
}
|
|
4249
|
+
}
|
|
4250
|
+
|
|
3668
4251
|
static void ggml_sycl_mul_mat_id(ggml_backend_sycl_context & ctx,
|
|
3669
4252
|
ggml_tensor *dst) try {
|
|
3670
4253
|
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/3);
|
|
@@ -3680,6 +4263,12 @@ static void ggml_sycl_mul_mat_id(ggml_backend_sycl_context & ctx,
|
|
|
3680
4263
|
const int64_t n_as = ne02;
|
|
3681
4264
|
const int64_t n_ids = ids->ne[0];
|
|
3682
4265
|
|
|
4266
|
+
if (ne12 == 1) {
|
|
4267
|
+
if (ggml_sycl_mul_mat_id_mmvq_fused(ctx, src0, src1, ids, dst)) {
|
|
4268
|
+
return;
|
|
4269
|
+
}
|
|
4270
|
+
}
|
|
4271
|
+
|
|
3683
4272
|
std::vector<char> ids_host(ggml_nbytes(ids));
|
|
3684
4273
|
const char * ids_dev = (const char *) ids->data;
|
|
3685
4274
|
|
|
@@ -3730,105 +4319,98 @@ static void ggml_sycl_mul_mat_id(ggml_backend_sycl_context & ctx,
|
|
|
3730
4319
|
}
|
|
3731
4320
|
}
|
|
3732
4321
|
} else {
|
|
3733
|
-
|
|
3734
|
-
ggml_sycl_pool_alloc<char>
|
|
4322
|
+
const int64_t n_routed_rows = ids->ne[1] * n_ids;
|
|
4323
|
+
ggml_sycl_pool_alloc<char> src1_contiguous(ctx.pool(), sizeof(float)*n_routed_rows*ne10);
|
|
4324
|
+
ggml_sycl_pool_alloc<char> dst_contiguous(ctx.pool(), sizeof(float)*n_routed_rows*ne0);
|
|
3735
4325
|
|
|
3736
4326
|
src1_row.data = src1_contiguous.get();
|
|
3737
4327
|
dst_row.data = dst_contiguous.get();
|
|
3738
4328
|
|
|
3739
|
-
|
|
3740
|
-
|
|
3741
|
-
|
|
3742
|
-
|
|
3743
|
-
|
|
4329
|
+
// how many "owned" routed rows to pass to each expert
|
|
4330
|
+
std::vector<int64_t> expert_row_counts;
|
|
4331
|
+
// where each expert's slice starts and the previous ends (row indices, right-exclusive)
|
|
4332
|
+
std::vector<int64_t> expert_row_offsets;
|
|
4333
|
+
// the sources (slot/token pairs) of contiguous rows to guide k_copy_src1_to_contiguous
|
|
4334
|
+
std::vector<mmid_row_mapping> routed_row_src;
|
|
3744
4335
|
|
|
3745
|
-
|
|
4336
|
+
mmid_counting_sort_rows(ids, ids_host.data(), n_ids, n_as, n_routed_rows,
|
|
4337
|
+
expert_row_counts, expert_row_offsets, routed_row_src);
|
|
3746
4338
|
|
|
3747
|
-
|
|
3748
|
-
|
|
3749
|
-
|
|
4339
|
+
ggml_sycl_pool_alloc<mmid_row_mapping> dev_row_mapping(ctx.pool(), n_routed_rows);
|
|
4340
|
+
SYCL_CHECK(CHECK_TRY_ERROR(
|
|
4341
|
+
stream->memcpy(dev_row_mapping.get(), routed_row_src.data(), n_routed_rows*sizeof(mmid_row_mapping))));
|
|
3750
4342
|
|
|
3751
|
-
|
|
3752
|
-
|
|
3753
|
-
|
|
4343
|
+
const unsigned int max_work_group_size = ggml_sycl_info().max_work_group_sizes[ctx.device];
|
|
4344
|
+
assert(max_work_group_size % (WARP_SIZE * WARP_SIZE) == 0);
|
|
4345
|
+
|
|
4346
|
+
{
|
|
4347
|
+
sycl::range<3> block_dims(1, 1, std::min((unsigned int)ne10, max_work_group_size));
|
|
4348
|
+
sycl::range<3> grid_dims(1, 1, n_routed_rows);
|
|
4349
|
+
stream->submit([&](sycl::handler &cgh) {
|
|
4350
|
+
char *__restrict src1_contiguous_get =
|
|
4351
|
+
src1_contiguous.get();
|
|
4352
|
+
mmid_row_mapping *__restrict dev_row_mapping_get =
|
|
4353
|
+
dev_row_mapping.get();
|
|
4354
|
+
|
|
4355
|
+
cgh.parallel_for(
|
|
4356
|
+
sycl::nd_range<3>(grid_dims * block_dims, block_dims),
|
|
4357
|
+
[=](sycl::nd_item<3> item_ct1) {
|
|
4358
|
+
k_copy_src1_to_contiguous(
|
|
4359
|
+
src1_original, src1_contiguous_get,
|
|
4360
|
+
dev_row_mapping_get,
|
|
4361
|
+
ne11, ne10, nb11, nb12,
|
|
4362
|
+
item_ct1);
|
|
4363
|
+
});
|
|
4364
|
+
});
|
|
4365
|
+
}
|
|
4366
|
+
|
|
4367
|
+
for (int64_t i02 = 0; i02 < n_as; i02++) {
|
|
4368
|
+
const int64_t num_src1_rows = expert_row_counts[i02];
|
|
3754
4369
|
|
|
3755
4370
|
if (num_src1_rows == 0) {
|
|
3756
4371
|
continue;
|
|
3757
4372
|
}
|
|
3758
4373
|
|
|
3759
|
-
|
|
3760
|
-
ggml_sycl_pool_alloc<int> dev_cur_src1_row(ctx.pool(), 1);
|
|
3761
|
-
ggml_sycl_pool_alloc<mmid_row_mapping> dev_row_mapping(ctx.pool(), num_src1_rows);
|
|
3762
|
-
SYCL_CHECK(CHECK_TRY_ERROR(
|
|
3763
|
-
stream->memset(dev_cur_src1_row.get(), 0, sizeof(int))));
|
|
3764
|
-
|
|
3765
|
-
const unsigned int max_work_group_size = ggml_sycl_info().max_work_group_sizes[ctx.device];
|
|
3766
|
-
assert(max_work_group_size % (WARP_SIZE * WARP_SIZE) == 0);
|
|
3767
|
-
|
|
3768
|
-
{
|
|
3769
|
-
sycl::range<3> block_dims(1, 1, std::min((unsigned int)ne10, max_work_group_size));
|
|
3770
|
-
sycl::range<3> grid_dims(1, n_ids, ids->ne[1]);
|
|
3771
|
-
stream->submit([&](sycl::handler &cgh) {
|
|
3772
|
-
sycl::local_accessor<int, 0> src1_row_acc(cgh);
|
|
3773
|
-
|
|
3774
|
-
char *__restrict src1_contiguous_get =
|
|
3775
|
-
src1_contiguous.get();
|
|
3776
|
-
int *__restrict dev_cur_src1_row_get =
|
|
3777
|
-
dev_cur_src1_row.get();
|
|
3778
|
-
mmid_row_mapping *__restrict dev_row_mapping_get =
|
|
3779
|
-
dev_row_mapping.get();
|
|
3780
|
-
size_t ids_nb_ct6 = ids->nb[1];
|
|
3781
|
-
size_t ids_nb_ct7 = ids->nb[0];
|
|
3782
|
-
|
|
3783
|
-
cgh.parallel_for(
|
|
3784
|
-
sycl::nd_range<3>(grid_dims * block_dims, block_dims),
|
|
3785
|
-
[=](sycl::nd_item<3> item_ct1) {
|
|
3786
|
-
k_copy_src1_to_contiguous(
|
|
3787
|
-
src1_original, src1_contiguous_get,
|
|
3788
|
-
dev_cur_src1_row_get,
|
|
3789
|
-
dev_row_mapping_get, ids_dev, i02,
|
|
3790
|
-
ids_nb_ct6, ids_nb_ct7, ne11, ne10, nb11, nb12,
|
|
3791
|
-
item_ct1, src1_row_acc);
|
|
3792
|
-
});
|
|
3793
|
-
});
|
|
3794
|
-
}
|
|
4374
|
+
const int64_t expert_row_offset = expert_row_offsets[i02];
|
|
3795
4375
|
|
|
3796
4376
|
src0_row.data = src0_original + i02*nb02;
|
|
3797
4377
|
|
|
3798
4378
|
GGML_ASSERT(nb11 == sizeof(float)*ne10);
|
|
3799
4379
|
GGML_ASSERT(nb1 == sizeof(float)*ne0);
|
|
4380
|
+
src1_row.data = src1_contiguous.get() + expert_row_offset*nb11;
|
|
3800
4381
|
src1_row.ne[1] = num_src1_rows;
|
|
3801
4382
|
|
|
3802
4383
|
src1_row.nb[1] = nb11;
|
|
3803
4384
|
src1_row.nb[2] = num_src1_rows*nb11;
|
|
3804
4385
|
src1_row.nb[3] = num_src1_rows*nb11;
|
|
3805
4386
|
|
|
4387
|
+
dst_row.data = dst_contiguous.get() + expert_row_offset*nb1;
|
|
3806
4388
|
dst_row.ne[1] = num_src1_rows;
|
|
3807
4389
|
dst_row.nb[1] = nb1;
|
|
3808
4390
|
dst_row.nb[2] = num_src1_rows*nb1;
|
|
3809
4391
|
dst_row.nb[3] = num_src1_rows*nb1;
|
|
3810
4392
|
|
|
3811
4393
|
ggml_sycl_mul_mat(ctx, &src0_row, &src1_row, &dst_row);
|
|
4394
|
+
}
|
|
3812
4395
|
|
|
3813
|
-
|
|
3814
|
-
|
|
3815
|
-
|
|
3816
|
-
|
|
3817
|
-
|
|
3818
|
-
|
|
3819
|
-
|
|
3820
|
-
|
|
3821
|
-
|
|
3822
|
-
|
|
3823
|
-
|
|
3824
|
-
|
|
3825
|
-
|
|
3826
|
-
|
|
3827
|
-
|
|
3828
|
-
|
|
3829
|
-
|
|
3830
|
-
|
|
3831
|
-
}
|
|
4396
|
+
{
|
|
4397
|
+
sycl::range<3> block_dims(1, 1, std::min((unsigned int)ne0, max_work_group_size));
|
|
4398
|
+
sycl::range<3> grid_dims(1, 1, n_routed_rows);
|
|
4399
|
+
stream->submit([&](sycl::handler &cgh) {
|
|
4400
|
+
const char *__restrict dst_contiguous_get =
|
|
4401
|
+
dst_contiguous.get();
|
|
4402
|
+
const mmid_row_mapping *__restrict dev_row_mapping_get =
|
|
4403
|
+
dev_row_mapping.get();
|
|
4404
|
+
|
|
4405
|
+
cgh.parallel_for(
|
|
4406
|
+
sycl::nd_range<3>(grid_dims * block_dims, block_dims),
|
|
4407
|
+
[=](sycl::nd_item<3> item_ct1) {
|
|
4408
|
+
k_copy_dst_from_contiguous(dst_original,
|
|
4409
|
+
dst_contiguous_get,
|
|
4410
|
+
dev_row_mapping_get,
|
|
4411
|
+
ne0, nb1, nb2, item_ct1);
|
|
4412
|
+
});
|
|
4413
|
+
});
|
|
3832
4414
|
}
|
|
3833
4415
|
}
|
|
3834
4416
|
}
|
|
@@ -3858,6 +4440,11 @@ static void ggml_sycl_im2col(ggml_backend_sycl_context & ctx, ggml_tensor * dst)
|
|
|
3858
4440
|
ggml_sycl_op_im2col(ctx, dst);
|
|
3859
4441
|
}
|
|
3860
4442
|
|
|
4443
|
+
// Backend entry point for GGML_OP_IM2COL_3D: sets up the debug-print scope for this op
// (tagged with this function's name and a source count of 2), then forwards to
// ggml_sycl_op_im2col_3d.
static void ggml_sycl_im2col_3d(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    scope_op_debug_print op_trace(__func__, dst, /*num_src=*/2);

    ggml_sycl_op_im2col_3d(ctx, dst);
}
|
|
4447
|
+
|
|
3861
4448
|
static void ggml_sycl_sum(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
3862
4449
|
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
|
|
3863
4450
|
GGML_ASSERT(ggml_is_contiguous(dst->src[0]));
|
|
@@ -4155,6 +4742,9 @@ static bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct gg
|
|
|
4155
4742
|
case GGML_OP_IM2COL:
|
|
4156
4743
|
ggml_sycl_im2col(ctx, dst);
|
|
4157
4744
|
break;
|
|
4745
|
+
case GGML_OP_IM2COL_3D:
|
|
4746
|
+
ggml_sycl_im2col_3d(ctx, dst);
|
|
4747
|
+
break;
|
|
4158
4748
|
case GGML_OP_POOL_2D:
|
|
4159
4749
|
ggml_sycl_pool2d(ctx, dst);
|
|
4160
4750
|
break;
|
|
@@ -4191,6 +4781,21 @@ static bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct gg
|
|
|
4191
4781
|
case GGML_OP_SSM_CONV:
|
|
4192
4782
|
ggml_sycl_ssm_conv(ctx, dst);
|
|
4193
4783
|
break;
|
|
4784
|
+
case GGML_OP_SSM_SCAN:
|
|
4785
|
+
ggml_sycl_ssm_scan(ctx, dst);
|
|
4786
|
+
break;
|
|
4787
|
+
case GGML_OP_FILL:
|
|
4788
|
+
ggml_sycl_fill(ctx, dst);
|
|
4789
|
+
break;
|
|
4790
|
+
case GGML_OP_CUMSUM:
|
|
4791
|
+
ggml_sycl_cumsum(ctx, dst);
|
|
4792
|
+
break;
|
|
4793
|
+
case GGML_OP_DIAG:
|
|
4794
|
+
ggml_sycl_diag(ctx, dst);
|
|
4795
|
+
break;
|
|
4796
|
+
case GGML_OP_SOLVE_TRI:
|
|
4797
|
+
ggml_sycl_solve_tri(ctx, dst);
|
|
4798
|
+
break;
|
|
4194
4799
|
case GGML_OP_ROLL:
|
|
4195
4800
|
ggml_sycl_roll(ctx, dst);
|
|
4196
4801
|
break;
|
|
@@ -4497,6 +5102,8 @@ static ggml_backend_i ggml_backend_sycl_interface = {
|
|
|
4497
5102
|
/* .free = */ ggml_backend_sycl_free,
|
|
4498
5103
|
/* .set_tensor_async = */ ggml_backend_sycl_set_tensor_async,
|
|
4499
5104
|
/* .get_tensor_async = */ ggml_backend_sycl_get_tensor_async,
|
|
5105
|
+
/* .set_tensor_2d_async = */ NULL,
|
|
5106
|
+
/* .get_tensor_2d_async = */ NULL,
|
|
4500
5107
|
/* .cpy_tensor_async = */ NULL, // ggml_backend_sycl_cpy_tensor_async,
|
|
4501
5108
|
// // TODO: update for the new
|
|
4502
5109
|
// interface
|
|
@@ -4665,26 +5272,19 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
|
|
|
4665
5272
|
struct ggml_tensor * a = op->src[0];
|
|
4666
5273
|
struct ggml_tensor * b = op->src[1];
|
|
4667
5274
|
|
|
4668
|
-
|
|
5275
|
+
// disable Q1_0 until implementation
|
|
5276
|
+
if (a->type == GGML_TYPE_Q1_0 || b->type == GGML_TYPE_Q1_0) {
|
|
4669
5277
|
return false;
|
|
4670
5278
|
}
|
|
4671
|
-
|
|
4672
|
-
if (
|
|
4673
|
-
a_type == GGML_TYPE_IQ3_XXS || a_type == GGML_TYPE_IQ3_S ||
|
|
4674
|
-
a_type == GGML_TYPE_IQ2_XXS || a_type == GGML_TYPE_IQ2_XS || a_type == GGML_TYPE_IQ2_S ||
|
|
4675
|
-
a_type == GGML_TYPE_IQ1_S || a_type == GGML_TYPE_IQ1_M
|
|
4676
|
-
) {
|
|
4677
|
-
if (b->ne[1] == 1 && ggml_nrows(b) > 1) {
|
|
4678
|
-
return false;
|
|
4679
|
-
}
|
|
4680
|
-
}
|
|
4681
|
-
ggml_type src0_type = op->src[0]->type;
|
|
4682
|
-
if (src0_type == GGML_TYPE_BF16 ) {
|
|
4683
|
-
// TODO: support GGML_TYPE_BF16
|
|
4684
|
-
// FIXME: keep a list of supported types to avoid breaking the backend when a new type is added
|
|
5279
|
+
|
|
5280
|
+
if (a->ne[3] != b->ne[3]) {
|
|
4685
5281
|
return false;
|
|
4686
5282
|
}
|
|
4687
5283
|
|
|
5284
|
+
ggml_type src0_type = op->src[0]->type;
|
|
5285
|
+
|
|
5286
|
+
|
|
5287
|
+
|
|
4688
5288
|
// TODO: The configuration below needs more work to be supported with oneDNN
|
|
4689
5289
|
if (ggml_is_permuted(a) && !ggml_is_contiguous(a) &&
|
|
4690
5290
|
a->ne[2] > 1 && a->ne[3] > 1 && src0_type == GGML_TYPE_F16) {
|
|
@@ -4703,12 +5303,31 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
|
|
|
4703
5303
|
case GGML_OP_GET_ROWS:
|
|
4704
5304
|
{
|
|
4705
5305
|
switch (op->src[0]->type) {
|
|
5306
|
+
case GGML_TYPE_I32:
|
|
4706
5307
|
case GGML_TYPE_F16:
|
|
5308
|
+
case GGML_TYPE_BF16:
|
|
4707
5309
|
case GGML_TYPE_F32:
|
|
5310
|
+
case GGML_TYPE_Q1_0:
|
|
5311
|
+
case GGML_TYPE_MXFP4:
|
|
5312
|
+
case GGML_TYPE_NVFP4:
|
|
5313
|
+
case GGML_TYPE_IQ2_XXS:
|
|
5314
|
+
case GGML_TYPE_IQ2_XS:
|
|
5315
|
+
case GGML_TYPE_IQ2_S:
|
|
5316
|
+
case GGML_TYPE_IQ3_XXS:
|
|
5317
|
+
case GGML_TYPE_IQ1_S:
|
|
5318
|
+
case GGML_TYPE_IQ1_M:
|
|
5319
|
+
case GGML_TYPE_IQ3_S:
|
|
5320
|
+
case GGML_TYPE_IQ4_NL:
|
|
5321
|
+
case GGML_TYPE_IQ4_XS:
|
|
5322
|
+
case GGML_TYPE_Q2_K:
|
|
5323
|
+
case GGML_TYPE_Q3_K:
|
|
4708
5324
|
case GGML_TYPE_Q4_0:
|
|
4709
5325
|
case GGML_TYPE_Q4_1:
|
|
5326
|
+
case GGML_TYPE_Q4_K:
|
|
4710
5327
|
case GGML_TYPE_Q5_0:
|
|
4711
5328
|
case GGML_TYPE_Q5_1:
|
|
5329
|
+
case GGML_TYPE_Q5_K:
|
|
5330
|
+
case GGML_TYPE_Q6_K:
|
|
4712
5331
|
case GGML_TYPE_Q8_0:
|
|
4713
5332
|
return true;
|
|
4714
5333
|
default:
|
|
@@ -4863,9 +5482,9 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
|
|
|
4863
5482
|
case GGML_OP_ROPE:
|
|
4864
5483
|
case GGML_OP_ROPE_BACK:
|
|
4865
5484
|
case GGML_OP_IM2COL:
|
|
4866
|
-
|
|
5485
|
+
case GGML_OP_IM2COL_3D:
|
|
4867
5486
|
case GGML_OP_UPSCALE:
|
|
4868
|
-
return
|
|
5487
|
+
return true;
|
|
4869
5488
|
case GGML_OP_SUM:
|
|
4870
5489
|
case GGML_OP_SUM_ROWS:
|
|
4871
5490
|
case GGML_OP_MEAN:
|
|
@@ -4887,11 +5506,10 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
|
|
|
4887
5506
|
case GGML_OP_ACC:
|
|
4888
5507
|
return ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]);
|
|
4889
5508
|
case GGML_OP_PAD:
|
|
4890
|
-
// TODO: add circular padding support for syscl, see https://github.com/ggml-org/llama.cpp/pull/16985
|
|
4891
5509
|
if (ggml_get_op_params_i32(op, 8) != 0) {
|
|
4892
5510
|
return false;
|
|
4893
5511
|
}
|
|
4894
|
-
return
|
|
5512
|
+
return true;
|
|
4895
5513
|
case GGML_OP_LEAKY_RELU:
|
|
4896
5514
|
case GGML_OP_TIMESTEP_EMBEDDING:
|
|
4897
5515
|
case GGML_OP_RWKV_WKV6:
|
|
@@ -4907,6 +5525,21 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
|
|
|
4907
5525
|
return op->type == GGML_TYPE_F32;
|
|
4908
5526
|
case GGML_OP_ARANGE:
|
|
4909
5527
|
return op->type == GGML_TYPE_F32;
|
|
5528
|
+
case GGML_OP_SSM_SCAN:
|
|
5529
|
+
if (op->src[3]->ne[0] == 1) {
|
|
5530
|
+
// Mamba2
|
|
5531
|
+
// (kernel only supports (d_state == 128 || d_state == 256) && d_head % WARP_SIZE == 0)
|
|
5532
|
+
return (op->src[0]->ne[0] == 128 || op->src[0]->ne[0] == 256) && op->src[0]->ne[1] % WARP_SIZE == 0;
|
|
5533
|
+
} else {
|
|
5534
|
+
// TODO Mamba-1 not yet ported to SYCL
|
|
5535
|
+
return false;
|
|
5536
|
+
}
|
|
5537
|
+
case GGML_OP_FILL:
|
|
5538
|
+
case GGML_OP_CUMSUM:
|
|
5539
|
+
case GGML_OP_DIAG:
|
|
5540
|
+
return true;
|
|
5541
|
+
case GGML_OP_SOLVE_TRI:
|
|
5542
|
+
return op->src[0]->ne[0] <= SYCL_SOLVE_TRI_MAX_N && op->src[1]->ne[0] <= SYCL_SOLVE_TRI_MAX_K;
|
|
4910
5543
|
case GGML_OP_FLASH_ATTN_EXT:
|
|
4911
5544
|
return ggml_sycl_flash_attn_ext_supported(device, op);
|
|
4912
5545
|
default:
|