whispercpp 1.3.6 → 1.3.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.document +3 -0
- data/.rdoc_options +2 -0
- data/README.md +38 -5
- data/Rakefile +18 -3
- data/ext/dependencies.rb +10 -4
- data/ext/dependencies_for_windows.rb +17 -0
- data/ext/extconf.rb +20 -8
- data/ext/options.rb +54 -14
- data/ext/options_for_windows.rb +51 -0
- data/ext/ruby_whisper.c +36 -42
- data/ext/ruby_whisper.h +135 -0
- data/ext/ruby_whisper_context.c +107 -28
- data/ext/ruby_whisper_log_queue.c +180 -0
- data/ext/ruby_whisper_log_settable.h +47 -0
- data/ext/ruby_whisper_parakeet.c +49 -0
- data/ext/ruby_whisper_parakeet_context.c +304 -0
- data/ext/ruby_whisper_parakeet_context_params.c +117 -0
- data/ext/ruby_whisper_parakeet_model.c +84 -0
- data/ext/ruby_whisper_parakeet_params.c +548 -0
- data/ext/ruby_whisper_parakeet_segment.c +157 -0
- data/ext/ruby_whisper_parakeet_token.c +188 -0
- data/ext/ruby_whisper_parakeet_transcribe.cpp +58 -0
- data/ext/ruby_whisper_params.c +256 -65
- data/ext/ruby_whisper_segment.c +6 -6
- data/ext/ruby_whisper_transcribe.cpp +42 -15
- data/ext/sources/CMakeLists.txt +41 -3
- data/ext/sources/CMakePresets.json +95 -0
- data/ext/sources/cmake/parakeet-config.cmake.in +30 -0
- data/ext/sources/cmake/parakeet.pc.in +10 -0
- data/ext/sources/cmake/whisper.pc.in +1 -1
- data/ext/sources/examples/CMakeLists.txt +4 -2
- data/ext/sources/examples/bench/bench.cpp +1 -1
- data/ext/sources/examples/cli/cli.cpp +43 -9
- data/ext/sources/examples/common-ggml.cpp +2 -0
- data/ext/sources/examples/common-whisper.cpp +139 -67
- data/ext/sources/examples/common-whisper.h +11 -0
- data/ext/sources/examples/ffmpeg-transcode.cpp +211 -341
- data/ext/sources/examples/parakeet-cli/CMakeLists.txt +8 -0
- data/ext/sources/examples/parakeet-cli/parakeet-cli.cpp +243 -0
- data/ext/sources/examples/parakeet-quantize/CMakeLists.txt +7 -0
- data/ext/sources/examples/parakeet-quantize/parakeet-quantize.cpp +230 -0
- data/ext/sources/examples/server/server.cpp +199 -163
- data/ext/sources/ggml/CMakeLists.txt +21 -13
- data/ext/sources/ggml/cmake/FindNCCL.cmake +36 -0
- data/ext/sources/ggml/cmake/ggml-config.cmake.in +12 -2
- data/ext/sources/ggml/include/ggml-alloc.h +1 -0
- data/ext/sources/ggml/include/ggml-backend.h +72 -10
- data/ext/sources/ggml/include/ggml-cuda.h +3 -0
- data/ext/sources/ggml/include/ggml-rpc.h +3 -3
- data/ext/sources/ggml/include/ggml.h +101 -9
- data/ext/sources/ggml/include/gguf.h +10 -2
- data/ext/sources/ggml/src/CMakeLists.txt +22 -5
- data/ext/sources/ggml/src/ggml-alloc.c +5 -1
- data/ext/sources/ggml/src/ggml-backend-impl.h +22 -2
- data/ext/sources/ggml/src/ggml-backend-meta.cpp +2263 -0
- data/ext/sources/ggml/src/ggml-backend-reg.cpp +12 -0
- data/ext/sources/ggml/src/ggml-backend.cpp +110 -9
- data/ext/sources/ggml/src/ggml-blas/ggml-blas.cpp +4 -0
- data/ext/sources/ggml/src/ggml-cann/aclnn_ops.cpp +672 -257
- data/ext/sources/ggml/src/ggml-cann/aclnn_ops.h +71 -0
- data/ext/sources/ggml/src/ggml-cann/common.h +20 -10
- data/ext/sources/ggml/src/ggml-cann/ggml-cann.cpp +211 -30
- data/ext/sources/ggml/src/ggml-common.h +11 -0
- data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +58 -29
- data/ext/sources/ggml/src/ggml-cpu/amx/amx.cpp +2 -0
- data/ext/sources/ggml/src/ggml-cpu/amx/mmq.cpp +16 -16
- data/ext/sources/ggml/src/ggml-cpu/arch/arm/quants.c +116 -7
- data/ext/sources/ggml/src/ggml-cpu/arch/arm/repack.cpp +65 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/loongarch/quants.c +151 -1
- data/ext/sources/ggml/src/ggml-cpu/arch/powerpc/quants.c +0 -1
- data/ext/sources/ggml/src/ggml-cpu/arch/riscv/quants.c +4279 -1292
- data/ext/sources/ggml/src/ggml-cpu/arch/riscv/repack.cpp +5 -35
- data/ext/sources/ggml/src/ggml-cpu/arch/s390/quants.c +0 -1
- data/ext/sources/ggml/src/ggml-cpu/arch/wasm/quants.c +72 -1
- data/ext/sources/ggml/src/ggml-cpu/arch/x86/quants.c +177 -27
- data/ext/sources/ggml/src/ggml-cpu/arch/x86/repack.cpp +1 -1
- data/ext/sources/ggml/src/ggml-cpu/arch-fallback.h +5 -0
- data/ext/sources/ggml/src/ggml-cpu/cmake/FindSMTIME.cmake +32 -0
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-impl.h +10 -0
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +95 -5
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.cpp +2 -0
- data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +146 -134
- data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +88 -70
- data/ext/sources/ggml/src/ggml-cpu/ops.cpp +372 -73
- data/ext/sources/ggml/src/ggml-cpu/ops.h +3 -0
- data/ext/sources/ggml/src/ggml-cpu/quants.c +55 -0
- data/ext/sources/ggml/src/ggml-cpu/quants.h +3 -0
- data/ext/sources/ggml/src/ggml-cpu/repack.cpp +3 -0
- data/ext/sources/ggml/src/ggml-cpu/simd-gemm.h +90 -0
- data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +3 -16
- data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.cpp +1402 -687
- data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.h +8 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp +597 -2766
- data/ext/sources/ggml/src/ggml-cpu/spacemit/ime2_kernels.cpp +5768 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/ime_env.cpp +320 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/ime_env.h +55 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/ime_kernels.h +182 -19
- data/ext/sources/ggml/src/ggml-cpu/spacemit/repack.cpp +1795 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/repack.h +14 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/rvv_kernels.cpp +3178 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/rvv_kernels.h +95 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/spine_barrier.h +34 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/spine_mem_pool.cpp +760 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/spine_mem_pool.h +32 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/spine_tcm.h +409 -0
- data/ext/sources/ggml/src/ggml-cpu/vec.cpp +37 -53
- data/ext/sources/ggml/src/ggml-cpu/vec.h +225 -240
- data/ext/sources/ggml/src/ggml-cuda/CMakeLists.txt +17 -7
- data/ext/sources/ggml/src/ggml-cuda/allreduce.cu +971 -0
- data/ext/sources/ggml/src/ggml-cuda/allreduce.cuh +29 -0
- data/ext/sources/ggml/src/ggml-cuda/argsort.cu +62 -26
- data/ext/sources/ggml/src/ggml-cuda/binbcast.cu +44 -18
- data/ext/sources/ggml/src/ggml-cuda/binbcast.cuh +1 -0
- data/ext/sources/ggml/src/ggml-cuda/common.cuh +242 -28
- data/ext/sources/ggml/src/ggml-cuda/concat.cu +120 -114
- data/ext/sources/ggml/src/ggml-cuda/conv2d-transpose.cu +45 -21
- data/ext/sources/ggml/src/ggml-cuda/conv2d-transpose.cuh +1 -0
- data/ext/sources/ggml/src/ggml-cuda/convert.cu +53 -0
- data/ext/sources/ggml/src/ggml-cuda/convert.cuh +10 -0
- data/ext/sources/ggml/src/ggml-cuda/cpy.cu +14 -6
- data/ext/sources/ggml/src/ggml-cuda/dequantize.cuh +22 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +278 -44
- data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +331 -130
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cu +12 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cuh +126 -27
- data/ext/sources/ggml/src/ggml-cuda/fattn-vec.cuh +40 -15
- data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +18 -9
- data/ext/sources/ggml/src/ggml-cuda/fattn.cu +152 -49
- data/ext/sources/ggml/src/ggml-cuda/fattn.cuh +2 -0
- data/ext/sources/ggml/src/ggml-cuda/fwht.cu +101 -0
- data/ext/sources/ggml/src/ggml-cuda/fwht.cuh +4 -0
- data/ext/sources/ggml/src/ggml-cuda/gated_delta_net.cu +84 -35
- data/ext/sources/ggml/src/ggml-cuda/getrows.cu +34 -12
- data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +1069 -609
- data/ext/sources/ggml/src/ggml-cuda/im2col.cu +32 -29
- data/ext/sources/ggml/src/ggml-cuda/mean.cu +4 -2
- data/ext/sources/ggml/src/ggml-cuda/mma.cuh +242 -195
- data/ext/sources/ggml/src/ggml-cuda/mmf.cuh +3 -3
- data/ext/sources/ggml/src/ggml-cuda/mmq.cu +18 -12
- data/ext/sources/ggml/src/ggml-cuda/mmq.cuh +502 -423
- data/ext/sources/ggml/src/ggml-cuda/mmvf.cu +19 -12
- data/ext/sources/ggml/src/ggml-cuda/mmvq.cu +485 -57
- data/ext/sources/ggml/src/ggml-cuda/mmvq.cuh +6 -1
- data/ext/sources/ggml/src/ggml-cuda/norm.cu +36 -10
- data/ext/sources/ggml/src/ggml-cuda/out-prod.cu +23 -7
- data/ext/sources/ggml/src/ggml-cuda/quantize.cu +133 -26
- data/ext/sources/ggml/src/ggml-cuda/quantize.cuh +1 -1
- data/ext/sources/ggml/src/ggml-cuda/reduce_rows.cuh +5 -1
- data/ext/sources/ggml/src/ggml-cuda/rope.cu +11 -4
- data/ext/sources/ggml/src/ggml-cuda/scale.cu +4 -1
- data/ext/sources/ggml/src/ggml-cuda/set-rows.cu +14 -6
- data/ext/sources/ggml/src/ggml-cuda/snake.cu +72 -0
- data/ext/sources/ggml/src/ggml-cuda/snake.cuh +8 -0
- data/ext/sources/ggml/src/ggml-cuda/softcap.cu +4 -1
- data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cu +45 -13
- data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cuh +1 -1
- data/ext/sources/ggml/src/ggml-cuda/ssm-scan.cu +40 -18
- data/ext/sources/ggml/src/ggml-cuda/sumrows.cu +8 -4
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_16.cu +1 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_32.cu +1 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_8.cu +2 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_4.cu +1 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_16.cu +1 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_32.cu +1 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_4.cu +1 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_8.cu +2 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_16.cu +1 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_4.cu +1 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_8.cu +2 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_4.cu +1 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_8.cu +2 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq192-dv128.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq320-dv256.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq512-dv512.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-bf16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-f16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q4_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q4_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q5_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q5_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q8_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-bf16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-bf16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-bf16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-bf16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-bf16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-bf16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-nvfp4.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q1_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/top-k.cu +5 -4
- data/ext/sources/ggml/src/ggml-cuda/topk-moe.cu +26 -23
- data/ext/sources/ggml/src/ggml-cuda/unary.cu +31 -2
- data/ext/sources/ggml/src/ggml-cuda/unary.cuh +2 -0
- data/ext/sources/ggml/src/ggml-cuda/vecdotq.cuh +80 -0
- data/ext/sources/ggml/src/ggml-cuda/vendors/cuda.h +7 -2
- data/ext/sources/ggml/src/ggml-cuda/vendors/hip.h +22 -4
- data/ext/sources/ggml/src/ggml-cuda/vendors/musa.h +3 -0
- data/ext/sources/ggml/src/ggml-hexagon/CMakeLists.txt +2 -1
- data/ext/sources/ggml/src/ggml-hexagon/ggml-hexagon.cpp +1428 -743
- data/ext/sources/ggml/src/ggml-hexagon/htp/CMakeLists.txt +45 -7
- data/ext/sources/ggml/src/ggml-hexagon/htp/act-ops.c +53 -84
- data/ext/sources/ggml/src/ggml-hexagon/htp/argsort-ops.c +25 -12
- data/ext/sources/ggml/src/ggml-hexagon/htp/binary-ops.c +165 -184
- data/ext/sources/ggml/src/ggml-hexagon/htp/cmake-toolchain.cmake +5 -5
- data/ext/sources/ggml/src/ggml-hexagon/htp/concat-ops.c +277 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/cpy-ops.c +170 -127
- data/ext/sources/ggml/src/ggml-hexagon/htp/cumsum-ops.c +270 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/diag-ops.c +216 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/fill-ops.c +123 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/flash-attn-ops.c +125 -97
- data/ext/sources/ggml/src/ggml-hexagon/htp/gated-delta-net-ops.c +1148 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/get-rows-ops.c +148 -42
- data/ext/sources/ggml/src/ggml-hexagon/htp/hex-dma.c +2 -2
- data/ext/sources/ggml/src/ggml-hexagon/htp/hex-dma.h +252 -62
- data/ext/sources/ggml/src/ggml-hexagon/htp/hex-dump.h +9 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hex-utils.h +87 -1
- data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-flash-attn-ops.c +1878 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c +2066 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-ops.c +6 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-ops.h +88 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-profile.h +34 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-queue.c +158 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-queue.h +134 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-utils.h +200 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ctx.h +96 -13
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ops.h +182 -57
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp_iface.idl +9 -3
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-base.h +71 -3
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-copy.h +27 -10
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-div.h +63 -23
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-exp.h +9 -8
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-flash-attn.h +47 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-log.h +65 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-pow.h +42 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-repl.h +74 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sigmoid.h +1 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sin-cos.h +90 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-utils.h +5 -8
- data/ext/sources/ggml/src/ggml-hexagon/htp/main.c +529 -815
- data/ext/sources/ggml/src/ggml-hexagon/htp/matmul-ops.c +2522 -234
- data/ext/sources/ggml/src/ggml-hexagon/htp/pad-ops.c +547 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/repeat-ops.c +148 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/rope-ops.c +291 -95
- data/ext/sources/ggml/src/ggml-hexagon/htp/set-rows-ops.c +59 -37
- data/ext/sources/ggml/src/ggml-hexagon/htp/softmax-ops.c +121 -133
- data/ext/sources/ggml/src/ggml-hexagon/htp/solve-tri-ops.c +267 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/ssm-conv.c +244 -151
- data/ext/sources/ggml/src/ggml-hexagon/htp/sum-rows-ops.c +6 -6
- data/ext/sources/ggml/src/ggml-hexagon/htp/unary-ops.c +719 -45
- data/ext/sources/ggml/src/ggml-hexagon/htp/vtcm-utils.h +16 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp-opnode.h +272 -0
- data/ext/sources/ggml/src/ggml-hexagon/libggml-htp.inf +3 -1
- data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +22 -9
- data/ext/sources/ggml/src/ggml-impl.h +6 -1
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.cpp +138 -13
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.h +32 -1
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.m +164 -28
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +80 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.cpp +190 -19
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.h +2 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal.cpp +39 -26
- data/ext/sources/ggml/src/ggml-metal/ggml-metal.metal +823 -322
- data/ext/sources/ggml/src/ggml-musa/CMakeLists.txt +5 -6
- data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +54 -5
- data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +12248 -5907
- data/ext/sources/ggml/src/ggml-opencl/kernels/concat.cl +67 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/cpy.cl +59 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/cvt.cl +1819 -112
- data/ext/sources/ggml/src/ggml-opencl/kernels/gated_delta_net.cl +249 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_mxfp4_f32_ns.cl +306 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q4_0_f32_ns.cl +256 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q4_1_f32_ns.cl +258 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q4_k_f32_ns.cl +283 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q5_0_f32_ns.cl +260 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q5_1_f32_ns.cl +262 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q5_k_f32_ns.cl +288 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q6_k_f32_ns.cl +267 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_iq4_nl_f32.cl +150 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/{mul_mat_Ab_Bi_8x4.cl → gemm_noshuffle_q4_0_f32.cl} +1 -1
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q4_k_f32.cl +172 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q5_0_f32.cl +131 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q5_1_f32.cl +134 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q5_k_f32.cl +176 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q6_k_f32.cl +140 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/{mul_mm_q8_0_f32_8x4.cl → gemm_noshuffle_q8_0_f32.cl} +1 -1
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_xmem_f16_f32_os8.cl +233 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_mxfp4_f32_ns.cl +165 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q4_0_f32_ns.cl +120 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q4_1_f32_ns.cl +123 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q4_k_f32_ns.cl +155 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q5_0_f32_ns.cl +123 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q5_1_f32_ns.cl +125 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q5_k_f32_ns.cl +160 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q6_k_f32_ns.cl +141 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_iq4_nl_f32.cl +302 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/{gemv_noshuffle_general.cl → gemv_noshuffle_q4_0_f32.cl} +5 -5
- data/ext/sources/ggml/src/ggml-opencl/kernels/{gemv_noshuffle.cl → gemv_noshuffle_q4_0_f32_spec.cl} +5 -5
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q4_k_f32.cl +318 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q5_0_f32.cl +291 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q5_1_f32.cl +294 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q5_k_f32.cl +326 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q6_k_f32.cl +293 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/get_rows.cl +15 -9
- data/ext/sources/ggml/src/ggml-opencl/kernels/moe_reorder_b.cl +30 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/moe_sort_by_expert.cl +82 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_iq4_nl_f32_l4_lm.cl +171 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q4_k_f32_l4_lm.cl +179 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q5_0_f32_l4_lm.cl +173 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q5_1_f32_l4_lm.cl +175 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q5_k_f32_l4_lm.cl +192 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_iq4_nl_f32.cl +164 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_iq4_nl_f32_flat.cl +202 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_k_f32_flat.cl +196 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_0_f32.cl +241 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_0_f32_flat.cl +243 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_1_f32.cl +243 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_1_f32_flat.cl +247 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_k_f32.cl +187 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_k_f32_flat.cl +203 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32_flat.cl +48 -64
- data/ext/sources/ggml/src/ggml-openvino/ggml-decoder.cpp +15 -5
- data/ext/sources/ggml/src/ggml-openvino/ggml-openvino-extra.cpp +18 -11
- data/ext/sources/ggml/src/ggml-openvino/ggml-openvino.cpp +35 -13
- data/ext/sources/ggml/src/ggml-openvino/ggml-quants.cpp +264 -192
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/rope.cpp +33 -7
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/unary_gelu.cpp +25 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op_table.cpp +1 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op_table.h +1 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/rt_info/weightless_caching_attributes.hpp +41 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/translate_session.cpp +27 -3
- data/ext/sources/ggml/src/ggml-openvino/openvino/utils.cpp +67 -36
- data/ext/sources/ggml/src/ggml-openvino/openvino/utils.h +1 -0
- data/ext/sources/ggml/src/ggml-openvino/utils.cpp +101 -44
- data/ext/sources/ggml/src/ggml-openvino/utils.h +23 -3
- data/ext/sources/ggml/src/ggml-opt.cpp +1 -0
- data/ext/sources/ggml/src/ggml-quants.c +289 -114
- data/ext/sources/ggml/src/ggml-quants.h +3 -0
- data/ext/sources/ggml/src/ggml-rpc/CMakeLists.txt +24 -0
- data/ext/sources/ggml/src/ggml-rpc/ggml-rpc.cpp +167 -311
- data/ext/sources/ggml/src/ggml-rpc/transport.cpp +683 -0
- data/ext/sources/ggml/src/ggml-rpc/transport.h +34 -0
- data/ext/sources/ggml/src/ggml-sycl/CMakeLists.txt +50 -4
- data/ext/sources/ggml/src/ggml-sycl/add-id.cpp +1 -1
- data/ext/sources/ggml/src/ggml-sycl/backend.hpp +3 -1
- data/ext/sources/ggml/src/ggml-sycl/common.cpp +74 -2
- data/ext/sources/ggml/src/ggml-sycl/common.hpp +41 -1
- data/ext/sources/ggml/src/ggml-sycl/convert.cpp +115 -13
- data/ext/sources/ggml/src/ggml-sycl/convert.hpp +9 -0
- data/ext/sources/ggml/src/ggml-sycl/cumsum.cpp +148 -0
- data/ext/sources/ggml/src/ggml-sycl/cumsum.hpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/dequantize.hpp +663 -0
- data/ext/sources/ggml/src/ggml-sycl/diag.cpp +67 -0
- data/ext/sources/ggml/src/ggml-sycl/diag.hpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/dmmv.cpp +586 -6
- data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +1 -90
- data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +0 -2
- data/ext/sources/ggml/src/ggml-sycl/fattn-buffers.cpp +56 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn-buffers.hpp +63 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn-common.hpp +7 -5
- data/ext/sources/ggml/src/ggml-sycl/fattn-tile.cpp +4 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn-tile.hpp +76 -168
- data/ext/sources/ggml/src/ggml-sycl/fattn-vec.hpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn.cpp +3 -1
- data/ext/sources/ggml/src/ggml-sycl/fill.cpp +55 -0
- data/ext/sources/ggml/src/ggml-sycl/fill.hpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/gated_delta_net.cpp +69 -31
- data/ext/sources/ggml/src/ggml-sycl/gated_delta_net.hpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/gemm.hpp +3 -0
- data/ext/sources/ggml/src/ggml-sycl/getrows.cpp +79 -3
- data/ext/sources/ggml/src/ggml-sycl/ggml-sycl.cpp +823 -190
- data/ext/sources/ggml/src/ggml-sycl/im2col.cpp +353 -89
- data/ext/sources/ggml/src/ggml-sycl/im2col.hpp +5 -3
- data/ext/sources/ggml/src/ggml-sycl/mmvq.cpp +1344 -26
- data/ext/sources/ggml/src/ggml-sycl/mmvq.hpp +16 -0
- data/ext/sources/ggml/src/ggml-sycl/pad.cpp +27 -27
- data/ext/sources/ggml/src/ggml-sycl/quants.hpp +71 -0
- data/ext/sources/ggml/src/ggml-sycl/set_rows.cpp +7 -1
- data/ext/sources/ggml/src/ggml-sycl/solve_tri.cpp +172 -0
- data/ext/sources/ggml/src/ggml-sycl/solve_tri.hpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/ssm_conv.cpp +6 -1
- data/ext/sources/ggml/src/ggml-sycl/ssm_scan.cpp +156 -0
- data/ext/sources/ggml/src/ggml-sycl/ssm_scan.hpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/sycl_hw.cpp +62 -10
- data/ext/sources/ggml/src/ggml-sycl/sycl_hw.hpp +18 -6
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq512-dv512.cpp +6 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-f16.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_0.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_1.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_0.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_1.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q8_0.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-f16.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_0.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_1.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_0.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_1.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q8_0.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-f16.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_0.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_1.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_0.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_1.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q8_0.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-f16.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_0.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_1.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_0.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_1.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q8_0.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-f16.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_0.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_1.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_0.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_1.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q8_0.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-f16.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_0.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_1.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_0.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_1.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q8_0.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/type.hpp +112 -0
- data/ext/sources/ggml/src/ggml-sycl/upscale.cpp +410 -0
- data/ext/sources/ggml/src/ggml-sycl/upscale.hpp +9 -0
- data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +215 -53
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-buffer.cpp +4 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-device.cpp +2 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend.cpp +2 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-shm.cpp +1 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu.cpp +1 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu.h +0 -2
- data/ext/sources/ggml/src/ggml-vulkan/CMakeLists.txt +11 -0
- data/ext/sources/ggml/src/ggml-vulkan/ggml-vulkan.cpp +2060 -535
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/contig_copy.comp +6 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +146 -13
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy.comp +3 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +25 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.glsl +88 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.glsl +643 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_nvfp4.comp +32 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q1_0.comp +29 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/diag.comp +0 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dot_product_funcs.glsl +27 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/exp.comp +0 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/feature-tests/coopmat2_decode_vector.comp +7 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +197 -48
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.glsl +60 -59
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +115 -113
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +122 -31
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_dequant.glsl +131 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_mmq_funcs.glsl +203 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/fwht.comp +115 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gated_delta_net.comp +125 -64
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/generic_binary_head.glsl +0 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.glsl +10 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_main.glsl +16 -6
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +76 -54
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col_3d.comp +0 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/log.comp +0 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp +122 -27
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iface.glsl +6 -6
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp +1 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq_funcs.glsl +88 -55
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +11 -17
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +43 -10
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.glsl +159 -125
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.glsl +8 -8
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_shmem_types.glsl +24 -9
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp +0 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl +5 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.glsl +0 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl +3 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/snake.comp +49 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/ssm_conv.comp +11 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/tri.comp +0 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/types.glsl +79 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +171 -147
- data/ext/sources/ggml/src/ggml-webgpu/CMakeLists.txt +5 -2
- data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp +2202 -283
- data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu.cpp +2610 -1403
- data/ext/sources/ggml/src/ggml-webgpu/pre_wgsl.hpp +37 -7
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/add_id.wgsl +64 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary.wgsl +8 -7
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +76 -95
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/concat.wgsl +19 -1
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/conv2d.wgsl +165 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{cpy.tmpl.wgsl → cpy.wgsl} +25 -50
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn.wgsl +107 -184
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_quant_staging.tmpl +124 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_tile.wgsl +397 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_vec_blk.wgsl +101 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_vec_reduce.wgsl +84 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_vec_split.wgsl +619 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/gated_delta_net.wgsl +149 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/get_rows.wgsl +183 -78
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/glu.wgsl +155 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/im2col.wgsl +101 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl +655 -495
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_id.wgsl +195 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_id_gather.wgsl +52 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_id_vec.wgsl +154 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.wgsl +8 -6
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_subgroup_matrix.wgsl +5 -1
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.wgsl +80 -409
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec_acc.tmpl +1432 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec_q_acc.tmpl +303 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/quant_inner_loops.tmpl +21 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/quantize_q8.wgsl +173 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm_mul.wgsl +152 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{rope.tmpl.wgsl → rope.wgsl} +71 -142
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/row_norm.wgsl +153 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/scale.wgsl +6 -4
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set.wgsl +109 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +2 -3
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows_quant.wgsl +224 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{soft_max.tmpl.wgsl → soft_max.wgsl} +106 -206
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/solve_tri.wgsl +121 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/ssm_conv.wgsl +65 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/ssm_scan.wgsl +193 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/unary.wgsl +68 -48
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/upscale.wgsl +240 -0
- data/ext/sources/ggml/src/ggml-zdnn/ggml-zdnn.cpp +18 -14
- data/ext/sources/ggml/src/ggml-zendnn/CMakeLists.txt +1 -1
- data/ext/sources/ggml/src/ggml-zendnn/ggml-zendnn.cpp +244 -10
- data/ext/sources/ggml/src/ggml.c +110 -28
- data/ext/sources/ggml/src/gguf.cpp +173 -28
- data/ext/sources/include/parakeet.h +342 -0
- data/ext/sources/include/whisper.h +10 -0
- data/ext/sources/media/matmul.png +0 -0
- data/ext/sources/src/CMakeLists.txt +23 -0
- data/ext/sources/src/parakeet-arch.h +188 -0
- data/ext/sources/src/parakeet.cpp +3838 -0
- data/ext/sources/src/whisper.cpp +56 -12
- data/extsources.rb +26 -10
- data/lib/whisper/log_settable.rb +36 -0
- data/lib/whisper/model/uri.rb +13 -1
- data/lib/whisper/output.rb +74 -0
- data/sig/whisper.rbs +411 -62
- data/test/helper.rb +2 -0
- data/test/jfk_reader/jfk_reader.c +50 -7
- data/test/test_callback.rb +1 -0
- data/test/test_package.rb +6 -5
- data/test/test_parakeet.rb +28 -0
- data/test/test_parakeet_callback.rb +107 -0
- data/test/test_parakeet_context.rb +116 -0
- data/test/test_parakeet_context_params.rb +24 -0
- data/test/test_parakeet_model.rb +21 -0
- data/test/test_parakeet_params.rb +78 -0
- data/test/test_parakeet_segment.rb +42 -0
- data/test/test_parakeet_token.rb +73 -0
- data/test/test_params.rb +2 -0
- data/test/test_vad_segment.rb +1 -1
- data/test/test_whisper.rb +24 -6
- data/whispercpp.gemspec +2 -2
- metadata +215 -281
- data/ext/sources/bindings/javascript/CMakeLists.txt +0 -41
- data/ext/sources/bindings/javascript/emscripten.cpp +0 -93
- data/ext/sources/bindings/javascript/libwhisper.worker.js +0 -1
- data/ext/sources/bindings/javascript/package.json +0 -26
- data/ext/sources/bindings/javascript/whisper.js +0 -19
- data/ext/sources/examples/addon.node/CMakeLists.txt +0 -31
- data/ext/sources/examples/addon.node/__test__/whisper.spec.js +0 -133
- data/ext/sources/examples/addon.node/addon.cpp +0 -557
- data/ext/sources/examples/addon.node/index.js +0 -59
- data/ext/sources/examples/addon.node/package.json +0 -16
- data/ext/sources/examples/addon.node/vad-example.js +0 -132
- data/ext/sources/examples/bench.wasm/CMakeLists.txt +0 -49
- data/ext/sources/examples/bench.wasm/emscripten.cpp +0 -87
- data/ext/sources/examples/bench.wasm/index-tmpl.html +0 -285
- data/ext/sources/examples/coi-serviceworker.js +0 -146
- data/ext/sources/examples/command/CMakeLists.txt +0 -10
- data/ext/sources/examples/command/command.cpp +0 -802
- data/ext/sources/examples/command/commands.txt +0 -9
- data/ext/sources/examples/command.wasm/CMakeLists.txt +0 -50
- data/ext/sources/examples/command.wasm/emscripten.cpp +0 -327
- data/ext/sources/examples/command.wasm/index-tmpl.html +0 -415
- data/ext/sources/examples/generate-karaoke.sh +0 -57
- data/ext/sources/examples/helpers.js +0 -191
- data/ext/sources/examples/livestream.sh +0 -112
- data/ext/sources/examples/lsp/CMakeLists.txt +0 -10
- data/ext/sources/examples/lsp/lsp.cpp +0 -471
- data/ext/sources/examples/lsp/whisper.vim +0 -362
- data/ext/sources/examples/python/test_whisper_processor.py +0 -7
- data/ext/sources/examples/python/whisper_processor.py +0 -54
- data/ext/sources/examples/server/bench.js +0 -29
- data/ext/sources/examples/server.py +0 -120
- data/ext/sources/examples/stream/CMakeLists.txt +0 -10
- data/ext/sources/examples/stream/stream.cpp +0 -437
- data/ext/sources/examples/stream.wasm/CMakeLists.txt +0 -49
- data/ext/sources/examples/stream.wasm/emscripten.cpp +0 -216
- data/ext/sources/examples/stream.wasm/index-tmpl.html +0 -491
- data/ext/sources/examples/sycl/CMakeLists.txt +0 -9
- data/ext/sources/examples/sycl/build.sh +0 -22
- data/ext/sources/examples/sycl/ls-sycl-device.cpp +0 -11
- data/ext/sources/examples/sycl/run-whisper.sh +0 -17
- data/ext/sources/examples/talk-llama/CMakeLists.txt +0 -48
- data/ext/sources/examples/talk-llama/eleven-labs.py +0 -80
- data/ext/sources/examples/talk-llama/llama-adapter.cpp +0 -488
- data/ext/sources/examples/talk-llama/llama-adapter.h +0 -89
- data/ext/sources/examples/talk-llama/llama-arch.cpp +0 -2877
- data/ext/sources/examples/talk-llama/llama-arch.h +0 -628
- data/ext/sources/examples/talk-llama/llama-batch.cpp +0 -919
- data/ext/sources/examples/talk-llama/llama-batch.h +0 -173
- data/ext/sources/examples/talk-llama/llama-chat.cpp +0 -896
- data/ext/sources/examples/talk-llama/llama-chat.h +0 -71
- data/ext/sources/examples/talk-llama/llama-context.cpp +0 -3633
- data/ext/sources/examples/talk-llama/llama-context.h +0 -359
- data/ext/sources/examples/talk-llama/llama-cparams.cpp +0 -5
- data/ext/sources/examples/talk-llama/llama-cparams.h +0 -47
- data/ext/sources/examples/talk-llama/llama-ext.h +0 -12
- data/ext/sources/examples/talk-llama/llama-grammar.cpp +0 -1464
- data/ext/sources/examples/talk-llama/llama-grammar.h +0 -194
- data/ext/sources/examples/talk-llama/llama-graph.cpp +0 -2735
- data/ext/sources/examples/talk-llama/llama-graph.h +0 -1031
- data/ext/sources/examples/talk-llama/llama-hparams.cpp +0 -258
- data/ext/sources/examples/talk-llama/llama-hparams.h +0 -353
- data/ext/sources/examples/talk-llama/llama-impl.cpp +0 -171
- data/ext/sources/examples/talk-llama/llama-impl.h +0 -75
- data/ext/sources/examples/talk-llama/llama-io.cpp +0 -15
- data/ext/sources/examples/talk-llama/llama-io.h +0 -35
- data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.cpp +0 -330
- data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.h +0 -137
- data/ext/sources/examples/talk-llama/llama-kv-cache.cpp +0 -2285
- data/ext/sources/examples/talk-llama/llama-kv-cache.h +0 -389
- data/ext/sources/examples/talk-llama/llama-kv-cells.h +0 -533
- data/ext/sources/examples/talk-llama/llama-memory-hybrid-iswa.cpp +0 -275
- data/ext/sources/examples/talk-llama/llama-memory-hybrid-iswa.h +0 -140
- data/ext/sources/examples/talk-llama/llama-memory-hybrid.cpp +0 -268
- data/ext/sources/examples/talk-llama/llama-memory-hybrid.h +0 -139
- data/ext/sources/examples/talk-llama/llama-memory-recurrent.cpp +0 -1165
- data/ext/sources/examples/talk-llama/llama-memory-recurrent.h +0 -182
- data/ext/sources/examples/talk-llama/llama-memory.cpp +0 -59
- data/ext/sources/examples/talk-llama/llama-memory.h +0 -122
- data/ext/sources/examples/talk-llama/llama-mmap.cpp +0 -752
- data/ext/sources/examples/talk-llama/llama-mmap.h +0 -73
- data/ext/sources/examples/talk-llama/llama-model-loader.cpp +0 -1655
- data/ext/sources/examples/talk-llama/llama-model-loader.h +0 -206
- data/ext/sources/examples/talk-llama/llama-model-saver.cpp +0 -299
- data/ext/sources/examples/talk-llama/llama-model-saver.h +0 -40
- data/ext/sources/examples/talk-llama/llama-model.cpp +0 -9056
- data/ext/sources/examples/talk-llama/llama-model.h +0 -597
- data/ext/sources/examples/talk-llama/llama-quant.cpp +0 -1304
- data/ext/sources/examples/talk-llama/llama-quant.h +0 -1
- data/ext/sources/examples/talk-llama/llama-sampler.cpp +0 -3885
- data/ext/sources/examples/talk-llama/llama-sampler.h +0 -42
- data/ext/sources/examples/talk-llama/llama-vocab.cpp +0 -3970
- data/ext/sources/examples/talk-llama/llama-vocab.h +0 -187
- data/ext/sources/examples/talk-llama/llama.cpp +0 -1194
- data/ext/sources/examples/talk-llama/llama.h +0 -1573
- data/ext/sources/examples/talk-llama/models/afmoe.cpp +0 -190
- data/ext/sources/examples/talk-llama/models/apertus.cpp +0 -125
- data/ext/sources/examples/talk-llama/models/arcee.cpp +0 -135
- data/ext/sources/examples/talk-llama/models/arctic.cpp +0 -137
- data/ext/sources/examples/talk-llama/models/arwkv7.cpp +0 -86
- data/ext/sources/examples/talk-llama/models/baichuan.cpp +0 -123
- data/ext/sources/examples/talk-llama/models/bailingmoe.cpp +0 -143
- data/ext/sources/examples/talk-llama/models/bailingmoe2.cpp +0 -133
- data/ext/sources/examples/talk-llama/models/bert.cpp +0 -184
- data/ext/sources/examples/talk-llama/models/bitnet.cpp +0 -145
- data/ext/sources/examples/talk-llama/models/bloom.cpp +0 -101
- data/ext/sources/examples/talk-llama/models/chameleon.cpp +0 -178
- data/ext/sources/examples/talk-llama/models/chatglm.cpp +0 -132
- data/ext/sources/examples/talk-llama/models/codeshell.cpp +0 -111
- data/ext/sources/examples/talk-llama/models/cogvlm.cpp +0 -102
- data/ext/sources/examples/talk-llama/models/cohere2-iswa.cpp +0 -134
- data/ext/sources/examples/talk-llama/models/command-r.cpp +0 -122
- data/ext/sources/examples/talk-llama/models/dbrx.cpp +0 -122
- data/ext/sources/examples/talk-llama/models/deci.cpp +0 -135
- data/ext/sources/examples/talk-llama/models/deepseek.cpp +0 -142
- data/ext/sources/examples/talk-llama/models/deepseek2.cpp +0 -262
- data/ext/sources/examples/talk-llama/models/delta-net-base.cpp +0 -445
- data/ext/sources/examples/talk-llama/models/dots1.cpp +0 -132
- data/ext/sources/examples/talk-llama/models/dream.cpp +0 -105
- data/ext/sources/examples/talk-llama/models/ernie4-5-moe.cpp +0 -148
- data/ext/sources/examples/talk-llama/models/ernie4-5.cpp +0 -110
- data/ext/sources/examples/talk-llama/models/eurobert.cpp +0 -97
- data/ext/sources/examples/talk-llama/models/exaone-moe.cpp +0 -145
- data/ext/sources/examples/talk-llama/models/exaone.cpp +0 -114
- data/ext/sources/examples/talk-llama/models/exaone4.cpp +0 -123
- data/ext/sources/examples/talk-llama/models/falcon-h1.cpp +0 -111
- data/ext/sources/examples/talk-llama/models/falcon.cpp +0 -120
- data/ext/sources/examples/talk-llama/models/gemma-embedding.cpp +0 -116
- data/ext/sources/examples/talk-llama/models/gemma.cpp +0 -112
- data/ext/sources/examples/talk-llama/models/gemma2-iswa.cpp +0 -128
- data/ext/sources/examples/talk-llama/models/gemma3.cpp +0 -155
- data/ext/sources/examples/talk-llama/models/gemma3n-iswa.cpp +0 -384
- data/ext/sources/examples/talk-llama/models/glm4-moe.cpp +0 -170
- data/ext/sources/examples/talk-llama/models/glm4.cpp +0 -157
- data/ext/sources/examples/talk-llama/models/gpt2.cpp +0 -105
- data/ext/sources/examples/talk-llama/models/gptneox.cpp +0 -144
- data/ext/sources/examples/talk-llama/models/granite-hybrid.cpp +0 -195
- data/ext/sources/examples/talk-llama/models/granite.cpp +0 -210
- data/ext/sources/examples/talk-llama/models/grok.cpp +0 -159
- data/ext/sources/examples/talk-llama/models/grovemoe.cpp +0 -139
- data/ext/sources/examples/talk-llama/models/hunyuan-dense.cpp +0 -132
- data/ext/sources/examples/talk-llama/models/hunyuan-moe.cpp +0 -153
- data/ext/sources/examples/talk-llama/models/internlm2.cpp +0 -120
- data/ext/sources/examples/talk-llama/models/jais.cpp +0 -86
- data/ext/sources/examples/talk-llama/models/jais2.cpp +0 -123
- data/ext/sources/examples/talk-llama/models/jamba.cpp +0 -106
- data/ext/sources/examples/talk-llama/models/kimi-linear.cpp +0 -381
- data/ext/sources/examples/talk-llama/models/lfm2.cpp +0 -196
- data/ext/sources/examples/talk-llama/models/llada-moe.cpp +0 -122
- data/ext/sources/examples/talk-llama/models/llada.cpp +0 -99
- data/ext/sources/examples/talk-llama/models/llama-iswa.cpp +0 -178
- data/ext/sources/examples/talk-llama/models/llama.cpp +0 -175
- data/ext/sources/examples/talk-llama/models/maincoder.cpp +0 -117
- data/ext/sources/examples/talk-llama/models/mamba-base.cpp +0 -289
- data/ext/sources/examples/talk-llama/models/mamba.cpp +0 -54
- data/ext/sources/examples/talk-llama/models/mimo2-iswa.cpp +0 -129
- data/ext/sources/examples/talk-llama/models/minicpm3.cpp +0 -200
- data/ext/sources/examples/talk-llama/models/minimax-m2.cpp +0 -123
- data/ext/sources/examples/talk-llama/models/mistral3.cpp +0 -160
- data/ext/sources/examples/talk-llama/models/models.h +0 -704
- data/ext/sources/examples/talk-llama/models/modern-bert.cpp +0 -109
- data/ext/sources/examples/talk-llama/models/mpt.cpp +0 -126
- data/ext/sources/examples/talk-llama/models/nemotron-h.cpp +0 -162
- data/ext/sources/examples/talk-llama/models/nemotron.cpp +0 -122
- data/ext/sources/examples/talk-llama/models/neo-bert.cpp +0 -104
- data/ext/sources/examples/talk-llama/models/olmo.cpp +0 -121
- data/ext/sources/examples/talk-llama/models/olmo2.cpp +0 -150
- data/ext/sources/examples/talk-llama/models/olmoe.cpp +0 -124
- data/ext/sources/examples/talk-llama/models/openai-moe-iswa.cpp +0 -127
- data/ext/sources/examples/talk-llama/models/openelm.cpp +0 -124
- data/ext/sources/examples/talk-llama/models/orion.cpp +0 -123
- data/ext/sources/examples/talk-llama/models/paddleocr.cpp +0 -122
- data/ext/sources/examples/talk-llama/models/pangu-embedded.cpp +0 -121
- data/ext/sources/examples/talk-llama/models/phi2.cpp +0 -121
- data/ext/sources/examples/talk-llama/models/phi3.cpp +0 -152
- data/ext/sources/examples/talk-llama/models/plamo.cpp +0 -110
- data/ext/sources/examples/talk-llama/models/plamo2.cpp +0 -320
- data/ext/sources/examples/talk-llama/models/plamo3.cpp +0 -128
- data/ext/sources/examples/talk-llama/models/plm.cpp +0 -169
- data/ext/sources/examples/talk-llama/models/qwen.cpp +0 -108
- data/ext/sources/examples/talk-llama/models/qwen2.cpp +0 -126
- data/ext/sources/examples/talk-llama/models/qwen2moe.cpp +0 -151
- data/ext/sources/examples/talk-llama/models/qwen2vl.cpp +0 -117
- data/ext/sources/examples/talk-llama/models/qwen3.cpp +0 -120
- data/ext/sources/examples/talk-llama/models/qwen35.cpp +0 -381
- data/ext/sources/examples/talk-llama/models/qwen35moe.cpp +0 -422
- data/ext/sources/examples/talk-llama/models/qwen3moe.cpp +0 -131
- data/ext/sources/examples/talk-llama/models/qwen3next.cpp +0 -525
- data/ext/sources/examples/talk-llama/models/qwen3vl-moe.cpp +0 -140
- data/ext/sources/examples/talk-llama/models/qwen3vl.cpp +0 -132
- data/ext/sources/examples/talk-llama/models/refact.cpp +0 -94
- data/ext/sources/examples/talk-llama/models/rnd1.cpp +0 -126
- data/ext/sources/examples/talk-llama/models/rwkv6-base.cpp +0 -164
- data/ext/sources/examples/talk-llama/models/rwkv6.cpp +0 -94
- data/ext/sources/examples/talk-llama/models/rwkv6qwen2.cpp +0 -86
- data/ext/sources/examples/talk-llama/models/rwkv7-base.cpp +0 -137
- data/ext/sources/examples/talk-llama/models/rwkv7.cpp +0 -90
- data/ext/sources/examples/talk-llama/models/seed-oss.cpp +0 -124
- data/ext/sources/examples/talk-llama/models/smallthinker.cpp +0 -126
- data/ext/sources/examples/talk-llama/models/smollm3.cpp +0 -128
- data/ext/sources/examples/talk-llama/models/stablelm.cpp +0 -146
- data/ext/sources/examples/talk-llama/models/starcoder.cpp +0 -100
- data/ext/sources/examples/talk-llama/models/starcoder2.cpp +0 -121
- data/ext/sources/examples/talk-llama/models/step35-iswa.cpp +0 -165
- data/ext/sources/examples/talk-llama/models/t5-dec.cpp +0 -166
- data/ext/sources/examples/talk-llama/models/t5-enc.cpp +0 -96
- data/ext/sources/examples/talk-llama/models/wavtokenizer-dec.cpp +0 -149
- data/ext/sources/examples/talk-llama/models/xverse.cpp +0 -108
- data/ext/sources/examples/talk-llama/prompts/talk-alpaca.txt +0 -23
- data/ext/sources/examples/talk-llama/speak +0 -40
- data/ext/sources/examples/talk-llama/speak.bat +0 -1
- data/ext/sources/examples/talk-llama/speak.ps1 +0 -14
- data/ext/sources/examples/talk-llama/talk-llama.cpp +0 -813
- data/ext/sources/examples/talk-llama/unicode-data.cpp +0 -7034
- data/ext/sources/examples/talk-llama/unicode-data.h +0 -20
- data/ext/sources/examples/talk-llama/unicode.cpp +0 -1103
- data/ext/sources/examples/talk-llama/unicode.h +0 -111
- data/ext/sources/examples/wchess/CMakeLists.txt +0 -10
- data/ext/sources/examples/wchess/libwchess/CMakeLists.txt +0 -19
- data/ext/sources/examples/wchess/libwchess/Chessboard.cpp +0 -803
- data/ext/sources/examples/wchess/libwchess/Chessboard.h +0 -33
- data/ext/sources/examples/wchess/libwchess/WChess.cpp +0 -193
- data/ext/sources/examples/wchess/libwchess/WChess.h +0 -63
- data/ext/sources/examples/wchess/libwchess/test-chessboard.cpp +0 -117
- data/ext/sources/examples/wchess/wchess.cmd/CMakeLists.txt +0 -8
- data/ext/sources/examples/wchess/wchess.cmd/wchess.cmd.cpp +0 -253
- data/ext/sources/examples/whisper.wasm/CMakeLists.txt +0 -50
- data/ext/sources/examples/whisper.wasm/emscripten.cpp +0 -118
- data/ext/sources/examples/whisper.wasm/index-tmpl.html +0 -659
- data/ext/sources/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +0 -99
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp-msg.h +0 -155
- data/ext/sources/ggml/src/ggml-hexagon/op-desc.h +0 -153
- data/ext/sources/ggml/src/ggml-opencl/kernels/embed_kernel.py +0 -26
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.cpp +0 -123
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.h +0 -17
- data/ext/sources/ggml/src/ggml-virtgpu/regenerate_remoting.py +0 -333
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rte.glsl +0 -5
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +0 -182
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/glu.tmpl.wgsl +0 -323
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl +0 -718
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm.wgsl +0 -123
- data/ext/sources/tests/CMakeLists.txt +0 -112
- data/ext/sources/tests/earnings21/eval.mk +0 -58
- data/ext/sources/tests/earnings21/eval.py +0 -68
- data/ext/sources/tests/earnings21/normalizers/__init__.py +0 -2
- data/ext/sources/tests/earnings21/normalizers/basic.py +0 -80
- data/ext/sources/tests/earnings21/normalizers/english.json +0 -1741
- data/ext/sources/tests/earnings21/normalizers/english.py +0 -550
- data/ext/sources/tests/earnings21/requirements.txt +0 -6
- data/ext/sources/tests/en-0-ref.txt +0 -1
- data/ext/sources/tests/en-1-ref.txt +0 -1
- data/ext/sources/tests/en-2-ref.txt +0 -1
- data/ext/sources/tests/es-0-ref.txt +0 -1
- data/ext/sources/tests/librispeech/eval.mk +0 -39
- data/ext/sources/tests/librispeech/eval.py +0 -47
- data/ext/sources/tests/librispeech/normalizers/__init__.py +0 -2
- data/ext/sources/tests/librispeech/normalizers/basic.py +0 -80
- data/ext/sources/tests/librispeech/normalizers/english.json +0 -1741
- data/ext/sources/tests/librispeech/normalizers/english.py +0 -550
- data/ext/sources/tests/librispeech/requirements.txt +0 -6
- data/ext/sources/tests/run-tests.sh +0 -130
- data/ext/sources/tests/test-c.c +0 -3
- data/ext/sources/tests/test-vad-full.cpp +0 -56
- data/ext/sources/tests/test-vad.cpp +0 -83
- data/ext/sources/tests/test-whisper.js +0 -58
- data/lib/whisper/context.rb +0 -15
- data/lib/whisper/segment.rb +0 -58
- /data/ext/sources/ggml/src/ggml-opencl/kernels/{gemv_noshuffle_general_q8_0_f32.cl → gemv_noshuffle_q8_0_f32.cl} +0 -0
|
@@ -14,11 +14,16 @@
|
|
|
14
14
|
#define GGML_SYCL_DEQUANTIZE_HPP
|
|
15
15
|
|
|
16
16
|
#include "common.hpp"
|
|
17
|
+
#include "convert.hpp"
|
|
17
18
|
|
|
18
19
|
// Function-pointer type matching the dequantize_* kernels in this header:
// vx is the untyped block data, ib the block index, iqs the index inside the
// block, and the decoded pair of values is written to v.
typedef void (*dequantize_kernel_t)(const void * vx, const int64_t ib, const int iqs, dfloat2 & v);
|
|
19
20
|
// Function-pointer type for the "reorder" kernel flavour: instead of a single
// vx pointer it takes two separate data pointers (d and qs) plus the block
// index ib and in-block index iqs, and writes the decoded pair to v.
// NOTE(review): no kernel with this signature is visible in this section;
// confirm the exact meaning of d/qs against the implementations.
typedef void (*dequantize_kernel_t_reorder)(const void *d, const int64_t ib, const void *qs,
|
|
20
21
|
const int iqs, dfloat2 &v);
|
|
21
22
|
|
|
23
|
+
// Forward declaration of get_scale_min_k4, which the Q4_K and Q5_K
// dequantizers in this header call. It is only declared when QK_K == 256.
// NOTE(review): the definition is not part of this section; confirm it is
// provided by a header included alongside this one.
#if QK_K == 256
|
|
24
|
+
static inline void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8_t & m);
|
|
25
|
+
#endif
|
|
26
|
+
|
|
22
27
|
static __dpct_inline__ void dequantize_q4_0(const void *vx, const int64_t ib,
|
|
23
28
|
const int iqs, dfloat2 &v) {
|
|
24
29
|
const block_q4_0 * x = (const block_q4_0 *) vx;
|
|
@@ -89,6 +94,474 @@ static __dpct_inline__ void dequantize_q4_1(const void *vx, const int64_t ib,
|
|
|
89
94
|
#endif // GGML_SYCL_F16
|
|
90
95
|
}
|
|
91
96
|
|
|
97
|
+
// Decodes two neighbouring Q4_K values (element indices iqs and iqs + 1 of
// block ib in vx) and stores them in v.x() and v.y().
//
// Elements are handled in chunks of 64 that share 32 packed bytes: the first
// 32 elements of a chunk use the low nibble, the last 32 the high nibble.
// Each half-chunk has its own scale/min pair fetched via get_scale_min_k4.
// Aborts when QK_K != 256.
static __dpct_inline__ void dequantize_q4_K(const void *vx, const int64_t ib,
                                            const int iqs, dfloat2 &v) {
#if QK_K == 256
    const block_q4_K & blk = ((const block_q4_K *) vx)[ib];
    const sycl::half2 scale_pair  = blk.dm;
    const float       super_scale = scale_pair[0];
    const float       super_min   = scale_pair[1];

    const auto decode = [&](const int elem) -> dfloat {
        const int  chunk  = elem / 64;
        const int  within = elem % 64;
        const bool upper  = within >= 32;  // second half of the chunk -> high nibble

        uint8_t sub_scale;
        uint8_t sub_min;
        get_scale_min_k4(2 * chunk + (upper ? 1 : 0), blk.scales, sub_scale, sub_min);

        const uint8_t packed = blk.qs[32 * chunk + (within & 31)];
        const uint8_t nibble = upper ? (packed >> 4) : (packed & 0xF);

        // value = nibble * (super_scale * sub_scale) - super_min * sub_min
        return sycl::fma((dfloat) nibble, (dfloat) (super_scale * sub_scale), (dfloat) (-super_min * sub_min));
    };

    v.x() = decode(iqs + 0);
    v.y() = decode(iqs + 1);
#else
    GGML_ABORT("Q4_K dequantize not supported for QK_K != 256");
#endif
}
|
|
127
|
+
|
|
128
|
+
// Decodes two neighbouring Q2_K values (element indices iqs and iqs + 1 of
// block ib in vx) and stores them in v.x() and v.y().
//
// Elements are split into parts of 128; inside a part, four groups of 32
// elements share the same 32 bytes and differ only by which 2-bit field of the
// byte they read. Each scales byte holds a 4-bit scale (low nibble) and a
// 4-bit min (high nibble). Aborts when QK_K != 256.
static __dpct_inline__ void dequantize_q2_K(const void *vx, const int64_t ib,
                                            const int iqs, dfloat2 &v) {
#if QK_K == 256
    const block_q2_K & blk = ((const block_q2_K *) vx)[ib];
    const float super_scale = blk.dm[0];
    const float super_min   = blk.dm[1];

    const auto decode = [&](const int elem) -> dfloat {
        const int part  = elem / 128;
        const int rem   = elem % 128;
        const int group = rem / 32;   // selects the 2-bit field inside the byte
        const int lane  = rem % 32;   // selects the byte inside the part

        const int     scale_idx = 8 * part + lane / 16;
        const uint8_t packed    = blk.qs[32 * part + lane];
        const uint8_t scale_min = blk.scales[scale_idx + 2 * group];

        const float scale  = super_scale * (scale_min & 0xF);
        const float offset = super_min * (scale_min >> 4);

        // value = two_bit_quant * scale - offset
        return sycl::fma((dfloat) ((packed >> (2 * group)) & 3), (dfloat) scale, (dfloat) (-offset));
    };

    v.x() = decode(iqs + 0);
    v.y() = decode(iqs + 1);
#else
    GGML_ABORT("Q2_K dequantize not supported for QK_K != 256");
#endif
}
|
|
156
|
+
|
|
157
|
+
// Decodes two neighbouring Q3_K values (element indices iqs and iqs + 1 of
// block ib in vx) and stores them in v.x() and v.y().
//
// Each value is a 2-bit field from qs; when the matching hmask bit is clear,
// 4 is subtracted. The per-sub-block scale is 6 bits wide and stored split:
// its low 4 bits come from a nibble of scales[0..7], its top 2 bits from a
// 2-bit field of scales[8..11]; 32 is subtracted before use.
// Aborts when QK_K != 256.
static __dpct_inline__ void dequantize_q3_K(const void *vx, const int64_t ib,
                                            const int iqs, dfloat2 &v) {
#if QK_K == 256
    const block_q3_K & blk = ((const block_q3_K *) vx)[ib];
    const float super_scale = blk.d;

    const auto decode = [&](const int elem) -> dfloat {
        const int part  = elem / 128;
        const int rem   = elem % 128;
        const int group = rem / 32;
        const int lane  = rem % 32;

        const int     scale_idx = 8 * part + 2 * group + lane / 16;
        const int     bit_shift = 2 * group;
        const uint8_t high_mask = 1 << (4 * part + group);

        // Reassemble the 6-bit scale from its split storage.
        int8_t packed_scale;
        if (scale_idx < 4) {
            packed_scale = (blk.scales[scale_idx - 0] & 0xF) | (((blk.scales[scale_idx + 8] >> 0) & 3) << 4);
        } else if (scale_idx < 8) {
            packed_scale = (blk.scales[scale_idx - 0] & 0xF) | (((blk.scales[scale_idx + 4] >> 2) & 3) << 4);
        } else if (scale_idx < 12) {
            packed_scale = (blk.scales[scale_idx - 8] >> 4) | (((blk.scales[scale_idx + 0] >> 4) & 3) << 4);
        } else {
            packed_scale = (blk.scales[scale_idx - 8] >> 4) | (((blk.scales[scale_idx - 4] >> 6) & 3) << 4);
        }

        const float   scale     = super_scale * (packed_scale - 32);
        const uint8_t packed    = blk.qs[32 * part + lane];
        const uint8_t high_bits = blk.hmask[lane];
        const int8_t  quant     = ((packed >> bit_shift) & 3) - ((high_bits & high_mask) ? 0 : 4);

        return (dfloat) (scale * quant);
    };

    v.x() = decode(iqs + 0);
    v.y() = decode(iqs + 1);
#else
    GGML_ABORT("Q3_K dequantize not supported for QK_K != 256");
#endif
}
|
|
193
|
+
|
|
194
|
+
// Decodes two neighbouring Q5_K values (element indices iqs and iqs + 1 of
// block ib in vx) and stores them in v.x() and v.y().
//
// The low 4 bits come from qs (low nibble for the first 32 elements of each
// 64-element chunk, high nibble for the last 32); the fifth bit comes from qh
// and contributes 16 when set. Each half-chunk has its own scale/min pair
// fetched via get_scale_min_k4. Aborts when QK_K != 256.
static __dpct_inline__ void dequantize_q5_K(const void *vx, const int64_t ib,
                                            const int iqs, dfloat2 &v) {
#if QK_K == 256
    const block_q5_K & blk = ((const block_q5_K *) vx)[ib];
    const float super_scale = blk.dm[0];
    const float super_min   = blk.dm[1];

    const auto decode = [&](const int elem) -> dfloat {
        const int  chunk     = elem / 64;
        const int  within    = elem % 64;
        const bool upper     = within >= 32;
        const int  sub_block = 2 * chunk + (upper ? 1 : 0);
        const int  lane      = within & 31;  // byte position inside the chunk

        const uint8_t packed    = blk.qs[32 * chunk + lane];
        const uint8_t high_bits = blk.qh[lane];
        const uint8_t nibble    = upper ? (packed >> 4) : (packed & 0xF);

        uint8_t sub_scale;
        uint8_t sub_min;
        get_scale_min_k4(sub_block, blk.scales, sub_scale, sub_min);

        const float   scale     = super_scale * sub_scale;
        const float   offset    = super_min * sub_min;
        const uint8_t high_mask = 1 << sub_block;  // one qh bit per sub-block

        // value = (nibble + 16 * fifth_bit) * scale - offset
        return sycl::fma((dfloat) (nibble + ((high_bits & high_mask) ? 16 : 0)), (dfloat) scale, (dfloat) (-offset));
    };

    v.x() = decode(iqs + 0);
    v.y() = decode(iqs + 1);
#else
    GGML_ABORT("Q5_K dequantize not supported for QK_K != 256");
#endif
}
|
|
229
|
+
|
|
230
|
+
// Decodes two neighbouring Q6_K values (element indices iqs and iqs + 1 of
// block ib in vx) and stores them in v.x() and v.y().
//
// Elements are split into parts of 128, each made of four groups of 32. A
// 6-bit quant is built from a nibble of ql (low 4 bits) and a 2-bit field of
// qh (top 2 bits); which nibble, which ql byte and which qh field depends on
// the group. The result is d * scale * (quant - 32).
// Aborts when QK_K != 256.
static __dpct_inline__ void dequantize_q6_K(const void *vx, const int64_t ib,
                                            const int iqs, dfloat2 &v) {
#if QK_K == 256
    const block_q6_K & blk = ((const block_q6_K *) vx)[ib];
    const float d = blk.d;

    const auto decode = [&](const int elem) -> dfloat {
        const int part  = elem / 128;
        const int rem   = elem % 128;
        const int lane  = rem & 31;
        const int group = rem / 32;

        const uint8_t  low_a  = blk.ql[64 * part + lane];
        const uint8_t  low_b  = blk.ql[64 * part + lane + 32];
        const uint8_t  high   = blk.qh[32 * part + lane];
        const int8_t * scales = blk.scales + (8 * part + lane / 16);

        uint8_t quant;
        int8_t  sub_scale;
        switch (group) {
            case 0:
                quant     = (low_a & 0xF) | (((high >> 0) & 3) << 4);
                sub_scale = scales[0];
                break;
            case 1:
                quant     = (low_b & 0xF) | (((high >> 2) & 3) << 4);
                sub_scale = scales[2];
                break;
            case 2:
                quant     = (low_a >> 4) | (((high >> 4) & 3) << 4);
                sub_scale = scales[4];
                break;
            default:
                quant     = (low_b >> 4) | (((high >> 6) & 3) << 4);
                sub_scale = scales[6];
                break;
        }

        return (dfloat) (d * sub_scale * ((int8_t) quant - 32));
    };

    v.x() = decode(iqs + 0);
    v.y() = decode(iqs + 1);
#else
    GGML_ABORT("Q6_K dequantize not supported for QK_K != 256");
#endif
}
|
|
273
|
+
|
|
274
|
+
// Decodes the two MXFP4 values packed in byte iqs of block ib in vx: the low
// nibble goes to v.x(), the high nibble to v.y(). Each nibble indexes
// kvalues_mxfp4; the looked-up value is multiplied by the block scale
// (converted from x[ib].e by ggml_sycl_e8m0_to_fp32) and by 0.5.
static __dpct_inline__ void dequantize_mxfp4(const void *vx, const int64_t ib,
                                             const int iqs, dfloat2 &v) {
    const block_mxfp4 & blk = ((const block_mxfp4 *) vx)[ib];

    const float   block_scale = ggml_sycl_e8m0_to_fp32(blk.e);
    const uint8_t packed      = blk.qs[iqs];

    const uint8_t low_code  = packed & 0xF;
    const uint8_t high_code = packed >> 4;

    v.x() = block_scale * kvalues_mxfp4[low_code] * 0.5f;
    v.y() = block_scale * kvalues_mxfp4[high_code] * 0.5f;
}
|
|
283
|
+
|
|
284
|
+
// Decodes two neighbouring Q1_0 values (bit indices iqs and iqs + 1 of block
// ib in vx) and stores them in v.x() and v.y(). Each value is one bit taken
// from qs (bit k lives in byte k / 8 at position k % 8): a set bit yields +d,
// a clear bit yields -d, where d is the block scale.
static __dpct_inline__ void dequantize_q1_0(const void *vx, const int64_t ib,
                                            const int iqs, dfloat2 &v) {
    const block_q1_0 & blk = ((const block_q1_0 *) vx)[ib];
    const dfloat d = blk.d;

    // Maps the bit at position pos to -1 (clear) or +1 (set).
    const auto sign_at = [&](const int pos) -> int {
        const int bit = (blk.qs[pos / 8] >> (pos % 8)) & 1;
        return 2 * bit - 1;
    };

    v.x() = sign_at(iqs + 0) * d;
    v.y() = sign_at(iqs + 1) * d;
}
|
|
298
|
+
|
|
299
|
+
// Dequantizes elements iqs and iqs + 1 of NVFP4 block ib into v.x() / v.y().
// Each sub-block of QK_NVFP4_SUB elements has its own UE4M3 scale in d[]; within a
// sub-block the first half of the elements live in the low nibbles and the second
// half in the high nibbles of the same QK_NVFP4_SUB / 2 bytes.
static __dpct_inline__ void dequantize_nvfp4(const void *vx, const int64_t ib,
                                             const int iqs, dfloat2 &v) {
    const block_nvfp4 * blocks = (const block_nvfp4 *) vx;
    const block_nvfp4 & blk    = blocks[ib];

    auto decode = [&](const int elem) -> dfloat {
        const int  half_sub  = QK_NVFP4_SUB / 2;
        const int  sub_block = elem / QK_NVFP4_SUB;
        const int  within    = elem % QK_NVFP4_SUB;
        const bool low_half  = within < half_sub;

        const uint8_t packed = blk.qs[sub_block * half_sub + within % half_sub];
        const uint8_t code   = low_half ? (packed & 0x0F) : (packed >> 4);
        const float   scale  = ggml_sycl_ue4m3_to_fp32(blk.d[sub_block]);

        return scale * kvalues_mxfp4[code];
    };

    v.x() = decode(iqs + 0);
    v.y() = decode(iqs + 1);
}
|
|
318
|
+
|
|
319
|
+
// Per-element dequantization for IQ2_XXS super-blocks (only built for QK_K == 256).
// Writes elements iqs and iqs + 1 of block ib into v.x() and v.y().
static __dpct_inline__ void dequantize_iq2_xxs(const void *vx, const int64_t ib,
                                               const int iqs, dfloat2 &v) {
#if QK_K == 256
    const block_iq2_xxs & blk = ((const block_iq2_xxs *) vx)[ib];

    auto decode = [&](const int elem) -> dfloat {
        const int group = elem / 32;  // 32-element group inside the super-block
        const int rem   = elem % 32;
        const int quad  = rem / 8;    // which 8-value grid entry of the group
        const int lane  = rem % 8;    // position inside that grid entry

        // Four uint16 words per group: the first two are read byte-wise as four grid
        // indices, the last two are combined into one 32-bit field that carries the
        // 7-bit sign indices and, in its top 4 bits, the group scale.
        const uint16_t * words    = blk.qs + 4 * group;
        const uint8_t  * grid_ids = (const uint8_t *) words;
        const uint32_t   packed   = words[2] | (words[3] << 16);

        const uint8_t * grid_row  = (const uint8_t *) (iq2xxs_grid + grid_ids[quad]);
        const uint8_t   sign_bits = ksigns_iq2xs[(packed >> (7 * quad)) & 127];
        const float     scale     = (float) blk.d * (0.5f + (packed >> 28)) * 0.25f;
        const float     sign      = (sign_bits & kmask_iq2xs[lane]) ? -1.f : 1.f;

        return scale * grid_row[lane] * sign;
    };

    v.x() = decode(iqs + 0);
    v.y() = decode(iqs + 1);
#else
    GGML_ABORT("IQ2_XXS dequantize not supported for QK_K != 256");
#endif
}
|
|
346
|
+
|
|
347
|
+
// Per-element dequantization for IQ2_XS super-blocks (only built for QK_K == 256).
// Writes elements iqs and iqs + 1 of block ib into v.x() and v.y().
static __dpct_inline__ void dequantize_iq2_xs(const void *vx, const int64_t ib,
                                              const int iqs, dfloat2 &v) {
#if QK_K == 256
    const block_iq2_xs & blk = ((const block_iq2_xs *) vx)[ib];

    auto decode = [&](const int elem) -> dfloat {
        const int group = elem / 32;
        const int rem   = elem % 32;
        const int quad  = rem / 8;
        const int lane  = rem % 8;

        // One uint16 per 8 values: low 9 bits select the grid row, high 7 bits the sign pattern.
        const uint16_t word = blk.qs[4 * group + quad];

        const uint8_t * grid_row  = (const uint8_t *) (iq2xs_grid + (word & 511));
        const uint8_t   sign_bits = ksigns_iq2xs[word >> 9];

        // Each scales byte holds two 4-bit scales; quads 0-1 use the low one, quads 2-3 the high one.
        const int   nibble = (blk.scales[group] >> (4 * (quad / 2))) & 0xf;
        const float scale  = (float) blk.d * (0.5f + nibble) * 0.25f;
        const float sign   = (sign_bits & kmask_iq2xs[lane]) ? -1.f : 1.f;

        return scale * grid_row[lane] * sign;
    };

    v.x() = decode(iqs + 0);
    v.y() = decode(iqs + 1);
#else
    GGML_ABORT("IQ2_XS dequantize not supported for QK_K != 256");
#endif
}
|
|
372
|
+
|
|
373
|
+
// Per-element dequantization for IQ2_S super-blocks (only built for QK_K == 256).
// Writes elements iqs and iqs + 1 of block ib into v.x() and v.y().
static __dpct_inline__ void dequantize_iq2_s(const void *vx, const int64_t ib,
                                             const int iqs, dfloat2 &v) {
#if QK_K == 256
    const block_iq2_s & blk = ((const block_iq2_s *) vx)[ib];

    auto decode = [&](const int elem) -> dfloat {
        const int group = elem / 32;
        const int rem   = elem % 32;
        const int quad  = rem / 8;
        const int lane  = rem % 8;
        const int slot  = 4 * group + quad;  // index of this 8-value entry in qs

        // 10-bit grid index: low 8 bits from qs, bits 8-9 taken from the group's qh byte.
        const uint16_t row_id = blk.qs[slot] | ((blk.qh[group] << (8 - 2 * quad)) & 0x300);
        const uint8_t * grid_row = (const uint8_t *) (iq2s_grid + row_id);

        // Sign bytes are stored in qs after the first QK_K / 8 grid-index bytes.
        const uint8_t sign_bits = blk.qs[QK_K / 8 + slot];

        const int   nibble = (blk.scales[group] >> (4 * (quad / 2))) & 0xf;
        const float scale  = (float) blk.d * (0.5f + nibble) * 0.25f;
        const float sign   = (sign_bits & kmask_iq2xs[lane]) ? -1.f : 1.f;

        return scale * grid_row[lane] * sign;
    };

    v.x() = decode(iqs + 0);
    v.y() = decode(iqs + 1);
#else
    GGML_ABORT("IQ2_S dequantize not supported for QK_K != 256");
#endif
}
|
|
398
|
+
|
|
399
|
+
// Per-element dequantization for IQ3_XXS super-blocks (only built for QK_K == 256).
// Writes elements iqs and iqs + 1 of block ib into v.x() and v.y().
static __dpct_inline__ void dequantize_iq3_xxs(const void *vx, const int64_t ib,
                                               const int iqs, dfloat2 &v) {
#if QK_K == 256
    const block_iq3_xxs & blk = ((const block_iq3_xxs *) vx)[ib];

    auto decode = [&](const int elem) -> dfloat {
        const int group = elem / 32;
        const int rem   = elem % 32;
        const int quad  = rem / 8;
        const int lane  = rem % 8;
        const int half  = lane / 4;  // 0: first grid entry of the pair, 1: second

        // Grid indices: 8 bytes per group, two per quad (4 values each).
        const uint8_t * ids      = blk.qs + 8 * group;
        const uint8_t * grid_row = (const uint8_t *) (iq3xxs_grid + ids[2 * quad + half]);

        // Scale/sign words follow the first QK_K / 4 bytes of qs, two uint16 per group.
        const uint16_t * words  = (const uint16_t *) (blk.qs + QK_K / 4) + 2 * group;
        const uint32_t   packed = words[0] | (words[1] << 16);

        const uint8_t sign_bits = ksigns_iq2xs[(packed >> (7 * quad)) & 127];
        const float   scale     = (float) blk.d * (0.5f + (packed >> 28)) * 0.5f;
        const float   sign      = (sign_bits & kmask_iq2xs[lane]) ? -1.f : 1.f;

        return scale * grid_row[lane - 4 * half] * sign;
    };

    v.x() = decode(iqs + 0);
    v.y() = decode(iqs + 1);
#else
    GGML_ABORT("IQ3_XXS dequantize not supported for QK_K != 256");
#endif
}
|
|
430
|
+
|
|
431
|
+
// Per-element dequantization for IQ3_S super-blocks (only built for QK_K == 256).
// Writes elements iqs and iqs + 1 of block ib into v.x() and v.y().
static __dpct_inline__ void dequantize_iq3_s(const void *vx, const int64_t ib,
                                             const int iqs, dfloat2 &v) {
#if QK_K == 256
    const block_iq3_s & blk = ((const block_iq3_s *) vx)[ib];

    auto decode = [&](const int elem) -> dfloat {
        const int group = elem / 32;
        const int rem   = elem % 32;
        const int quad  = rem / 8;
        const int lane  = rem % 8;
        const int half  = lane / 4;  // 0: first grid entry of the pair, 1: second

        // 9-bit grid index: low 8 bits from qs, bit 8 from the group's qh byte.
        // The first entry of a quad uses shift (8 - 2*quad), the second (7 - 2*quad).
        const uint8_t * ids    = blk.qs + 8 * group;
        const uint16_t  row_id = ids[2 * quad + half] | ((blk.qh[group] << (8 - half - 2 * quad)) & 256);
        const uint8_t * grid_row = (const uint8_t *) (iq3s_grid + row_id);

        // Two groups share one scales byte (low nibble: even group, high nibble: odd group).
        const int   nibble = (blk.scales[group / 2] >> (4 * (group % 2))) & 0xf;
        const float scale  = (float) blk.d * (1 + 2 * nibble);

        const uint8_t sign_bits = blk.signs[4 * group + quad];
        const float   sign      = (sign_bits & kmask_iq2xs[lane]) ? -1.f : 1.f;

        return scale * grid_row[lane - 4 * half] * sign;
    };

    v.x() = decode(iqs + 0);
    v.y() = decode(iqs + 1);
#else
    GGML_ABORT("IQ3_S dequantize not supported for QK_K != 256");
#endif
}
|
|
462
|
+
|
|
463
|
+
// Per-element dequantization for IQ1_S super-blocks (only built for QK_K == 256).
// Writes elements iqs and iqs + 1 of block ib into v.x() and v.y().
static __dpct_inline__ void dequantize_iq1_s(const void *vx, const int64_t ib,
                                             const int iqs, dfloat2 &v) {
#if QK_K == 256
    const block_iq1_s & blk = ((const block_iq1_s *) vx)[ib];

    auto decode = [&](const int elem) -> dfloat {
        const int group = elem / 32;
        const int rem   = elem % 32;
        const int quad  = rem / 8;
        const int lane  = rem % 8;

        // qh word of the group: bit 15 picks the delta sign, bits 12-14 the scale,
        // and three bits per quad extend the 8-bit grid index from qs.
        const uint16_t high = blk.qh[group];

        const float shift = (high & 0x8000) ? (-1.f - IQ1S_DELTA) : (-1.f + IQ1S_DELTA);
        const float scale = (float) blk.d * (2 * ((high >> 12) & 7) + 1);

        const uint16_t row_id = blk.qs[4 * group + quad] | (((high >> (3 * quad)) & 7) << 8);
        const uint32_t row    = iq1s_grid_gpu[row_id];

        // Lanes 0-3 read the low nibble of bytes 0-3 of the grid word, lanes 4-7 the high nibble.
        const int    bit_pos = (lane < 4) ? (8 * lane) : (8 * (lane - 4) + 4);
        const int8_t code    = (row >> bit_pos) & 0x0F;

        return scale * (code + shift);
    };

    v.x() = decode(iqs + 0);
    v.y() = decode(iqs + 1);
#else
    GGML_ABORT("IQ1_S dequantize not supported for QK_K != 256");
#endif
}
|
|
489
|
+
|
|
490
|
+
// Per-element dequantization for IQ1_M super-blocks (only built for QK_K == 256).
// Writes elements iqs and iqs + 1 of block ib into v.x() and v.y().
static __dpct_inline__ void dequantize_iq1_m(const void *vx, const int64_t ib,
                                             const int iqs, dfloat2 &v) {
#if QK_K == 256
    const block_iq1_m & blk = ((const block_iq1_m *) vx)[ib];

    auto decode = [&](const int elem) -> dfloat {
        const int group = elem / 32;
        const int rem   = elem % 32;
        const int quad  = rem / 8;
        const int lane  = rem % 8;

        // The fp16 super-block scale is assembled from the top 4 bits of each of the
        // four uint16 scale words.
        const uint16_t * words = (const uint16_t *) blk.scales;
        iq1m_scale_t super_scale;
        super_scale.u16 = (words[0] >> 12) | ((words[1] >> 8) & 0x00f0) | ((words[2] >> 4) & 0x0f00) | (words[3] & 0xf000);

        // One 3-bit sub-scale per 16 elements, four of them per scale word.
        const int   sub16 = 2 * group + quad / 2;
        const float scale = (float) super_scale.f16 * (2 * ((words[sub16 / 4] >> (3 * (sub16 % 4))) & 0x7) + 1);

        // Each qh byte serves two quads: a 4-bit field per quad, whose bit 3 selects
        // the delta sign and whose low 3 bits extend the grid index.
        const uint8_t high  = blk.qh[2 * group + quad / 2];
        const int     field = 4 * (quad % 2);
        const float   shift = (high & (0x08 << field)) ? (-1.f - IQ1M_DELTA) : (-1.f + IQ1M_DELTA);

        const uint16_t row_id = blk.qs[4 * group + quad] | (((high >> field) & 7) << 8);
        const uint32_t row    = iq1s_grid_gpu[row_id];

        // Lanes 0-3 read the low nibble of bytes 0-3 of the grid word, lanes 4-7 the high nibble.
        const int    bit_pos = (lane < 4) ? (8 * lane) : (8 * (lane - 4) + 4);
        const int8_t code    = (row >> bit_pos) & 0x0F;

        return scale * (code + shift);
    };

    v.x() = decode(iqs + 0);
    v.y() = decode(iqs + 1);
#else
    GGML_ABORT("IQ1_M dequantize not supported for QK_K != 256");
#endif
}
|
|
524
|
+
|
|
525
|
+
// Dequantizes elements iqs and iqs + 1 of IQ4_NL block ib into v.x() / v.y().
// Elements 0-15 are stored in the low nibbles of qs[0..15]; later elements are
// read from the high nibbles of the same bytes.
static __dpct_inline__ void dequantize_iq4_nl(const void *vx, const int64_t ib,
                                              const int iqs, dfloat2 &v) {
    const block_iq4_nl & blk   = ((const block_iq4_nl *) vx)[ib];
    const float          scale = (float) blk.d;

    auto decode = [&](const int elem) -> dfloat {
        const bool low_half = elem < 16;
        const int  code     = low_half ? (blk.qs[elem] & 0xF) : (blk.qs[elem - 16] >> 4);
        return scale * kvalues_iq4nl[code];
    };

    v.x() = decode(iqs + 0);
    v.y() = decode(iqs + 1);
}
|
|
540
|
+
|
|
541
|
+
// Per-element dequantization for IQ4_XS super-blocks (only built for QK_K == 256).
// Writes elements iqs and iqs + 1 of block ib into v.x() and v.y().
static __dpct_inline__ void dequantize_iq4_xs(const void *vx, const int64_t ib,
                                              const int iqs, dfloat2 &v) {
#if QK_K == 256
    const block_iq4_xs & blk = ((const block_iq4_xs *) vx)[ib];

    auto decode = [&](const int elem) -> dfloat {
        const int  group    = elem / 32;
        const int  rem      = elem % 32;
        const bool low_half = rem < 16;

        // 16 bytes per group: elements 0-15 in the low nibbles, 16-31 in the high nibbles.
        const uint8_t packed = blk.qs[16 * group + (low_half ? rem : (rem - 16))];
        const uint8_t code   = low_half ? (packed & 0x0F) : (packed >> 4);

        // 6-bit group scale: 4 low bits from scales_l, 2 high bits from scales_h, biased by 32.
        const int   low_bits  = (blk.scales_l[group / 2] >> (4 * (group % 2))) & 0xf;
        const int   high_bits = (blk.scales_h >> (2 * group)) & 3;
        const float scale     = (float) blk.d * ((low_bits | (high_bits << 4)) - 32);

        return scale * kvalues_iq4nl[code];
    };

    v.x() = decode(iqs + 0);
    v.y() = decode(iqs + 1);
#else
    GGML_ABORT("IQ4_XS dequantize not supported for QK_K != 256");
#endif
}
|
|
564
|
+
|
|
92
565
|
static __dpct_inline__ void dequantize_q5_0(const void *vx, const int64_t ib,
|
|
93
566
|
const int iqs, dfloat2 &v) {
|
|
94
567
|
const block_q5_0 * x = (const block_q5_0 *) vx;
|
|
@@ -143,6 +616,22 @@ static __dpct_inline__ void dequantize_q5_1(const void *vx, const int64_t ib,
|
|
|
143
616
|
#endif // GGML_SYCL_F16
|
|
144
617
|
}
|
|
145
618
|
|
|
619
|
+
// Dequantizes two consecutive Q8_0 values when scales and quants are stored in
// separate arrays.
//   d_ptr : array of sycl::half scales, indexed by block number ib
//   qs    : array of int8 quants; elements iqs and iqs + 1 are read
//   v     : receives the two scaled values
static __dpct_inline__ void dequantize_q8_0_reorder(const void *d_ptr, const int64_t ib, const void *qs,
                                                    const int iqs, dfloat2 &v) {
    const sycl::half * scales = (const sycl::half *) d_ptr;
    const int8_t     * quants = (const int8_t *) qs;

    const dfloat d = (const dfloat) scales[ib];

    v.x() = quants[iqs + 0];
    v.y() = quants[iqs + 1];

#ifdef GGML_SYCL_F16
    v.s0() *= d;
    v.s1() *= d;
#else
    v.x() *= d;
    v.y() *= d;
#endif // GGML_SYCL_F16
}
|
|
634
|
+
|
|
146
635
|
static __dpct_inline__ void dequantize_q8_0(const void *vx, const int64_t ib,
|
|
147
636
|
const int iqs, dfloat2 &v) {
|
|
148
637
|
const block_q8_0 * x = (const block_q8_0 *) vx;
|
|
@@ -222,6 +711,34 @@ static void dequantize_block_q4_0_reorder(const void * __restrict__ vx, dst_t *
|
|
|
222
711
|
|
|
223
712
|
}
|
|
224
713
|
|
|
714
|
+
// Dequantize Q8_0 from reorder layout: [all qs (k bytes)][all d values]
|
|
715
|
+
// Each thread handles one block of QK8_0 elements.
|
|
716
|
+
template<typename dst_t>
|
|
717
|
+
static void dequantize_block_q8_0_reorder(const void * __restrict__ vx, dst_t * __restrict__ yy, int64_t k,
|
|
718
|
+
const sycl::nd_item<3> &item_ct1) {
|
|
719
|
+
|
|
720
|
+
const int64_t i = item_ct1.get_group(2);
|
|
721
|
+
const int64_t tid = item_ct1.get_local_id(2);
|
|
722
|
+
const int lane_ib = i * WARP_SIZE + tid;
|
|
723
|
+
|
|
724
|
+
if (lane_ib >= k / QK8_0) {
|
|
725
|
+
return;
|
|
726
|
+
}
|
|
727
|
+
|
|
728
|
+
dst_t * y_ptr = yy + lane_ib * QK8_0;
|
|
729
|
+
|
|
730
|
+
auto qs = (const int8_t*)vx + lane_ib * QK8_0;
|
|
731
|
+
auto s_ptr = (const sycl::half*)((const uint8_t*)vx + k) + lane_ib;
|
|
732
|
+
|
|
733
|
+
const float d = float(*s_ptr);
|
|
734
|
+
|
|
735
|
+
#pragma unroll
|
|
736
|
+
for (int l = 0; l < QK8_0; ++l) {
|
|
737
|
+
y_ptr[l] = d * qs[l];
|
|
738
|
+
}
|
|
739
|
+
|
|
740
|
+
}
|
|
741
|
+
|
|
225
742
|
template<typename dst_t>
|
|
226
743
|
static void dequantize_block_q4_1(const void * __restrict__ vx, dst_t * __restrict__ yy, int64_t nb32,
|
|
227
744
|
const sycl::nd_item<3> &item_ct1) {
|
|
@@ -345,6 +862,63 @@ static void dequantize_block_q3_K(const void * __restrict__ vx, dst_t * __restri
|
|
|
345
862
|
|
|
346
863
|
}
|
|
347
864
|
|
|
865
|
+
template<typename dst_t>
|
|
866
|
+
static void dequantize_block_q3_K_reorder(const void * __restrict__ vx, dst_t * __restrict__ yy,
|
|
867
|
+
const sycl::nd_item<3> & item_ct1, int64_t n_blocks) {
|
|
868
|
+
#if QK_K == 256
|
|
869
|
+
const int64_t i = item_ct1.get_group(2);
|
|
870
|
+
if (i >= n_blocks) {
|
|
871
|
+
return;
|
|
872
|
+
}
|
|
873
|
+
|
|
874
|
+
const uint8_t * base = static_cast<const uint8_t *>(vx);
|
|
875
|
+
const size_t qs_offset = i * (QK_K / 4);
|
|
876
|
+
const size_t hmask_offset = n_blocks * (QK_K / 4) + i * (QK_K / 8);
|
|
877
|
+
const size_t scales_offset = n_blocks * (QK_K / 4) + n_blocks * (QK_K / 8) + i * 12;
|
|
878
|
+
const size_t d_offset = n_blocks * (QK_K / 4) + n_blocks * (QK_K / 8) + n_blocks * 12 +
|
|
879
|
+
i * sizeof(ggml_half);
|
|
880
|
+
|
|
881
|
+
const uint8_t * qs = base + qs_offset;
|
|
882
|
+
const uint8_t * hmask = base + hmask_offset;
|
|
883
|
+
const uint8_t * scales = base + scales_offset;
|
|
884
|
+
const float d_all = static_cast<float>(*reinterpret_cast<const ggml_half *>(base + d_offset));
|
|
885
|
+
|
|
886
|
+
const int64_t r = item_ct1.get_local_id(2) / 4;
|
|
887
|
+
const int64_t tid = r / 2;
|
|
888
|
+
const int64_t is0 = r % 2;
|
|
889
|
+
const int64_t l0 = 16 * is0 + 4 * (item_ct1.get_local_id(2) % 4);
|
|
890
|
+
const int64_t n = tid / 4;
|
|
891
|
+
const int64_t j = tid - 4 * n;
|
|
892
|
+
const int64_t is = 8 * n + 2 * j + is0;
|
|
893
|
+
const int shift = 2 * j;
|
|
894
|
+
uint8_t m = 1 << (4 * n + j);
|
|
895
|
+
|
|
896
|
+
uint8_t us = is < 4
|
|
897
|
+
? (scales[is - 0] & 0xF) | (((scales[is + 8] >> 0) & 3) << 4)
|
|
898
|
+
: is < 8
|
|
899
|
+
? (scales[is - 0] & 0xF) | (((scales[is + 4] >> 2) & 3) << 4)
|
|
900
|
+
: is < 12
|
|
901
|
+
? (scales[is - 8] >> 4) | (((scales[is + 0] >> 4) & 3) << 4)
|
|
902
|
+
: (scales[is - 8] >> 4) | (((scales[is - 4] >> 6) & 3) << 4);
|
|
903
|
+
|
|
904
|
+
const float dl = d_all * (us - 32);
|
|
905
|
+
|
|
906
|
+
dst_t * y = yy + i * QK_K + 128 * n + 32 * j;
|
|
907
|
+
const uint8_t * q = qs + 32 * n;
|
|
908
|
+
const uint8_t * hm = hmask;
|
|
909
|
+
|
|
910
|
+
for (int l = l0; l < l0 + 4; ++l) {
|
|
911
|
+
y[l] = dl * ((int8_t) ((q[l] >> shift) & 3) - ((hm[l] & m) ? 0 : 4));
|
|
912
|
+
}
|
|
913
|
+
#else
|
|
914
|
+
GGML_UNUSED(vx);
|
|
915
|
+
GGML_UNUSED(yy);
|
|
916
|
+
GGML_UNUSED(item_ct1);
|
|
917
|
+
GGML_UNUSED(n_blocks);
|
|
918
|
+
GGML_ABORT("Q3_K reorder dequantize not supported for QK_K != 256");
|
|
919
|
+
#endif
|
|
920
|
+
}
|
|
921
|
+
|
|
348
922
|
#if QK_K == 256
|
|
349
923
|
static inline void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8_t & m) {
|
|
350
924
|
if (j < 4) {
|
|
@@ -492,6 +1066,63 @@ static void dequantize_block_q5_K(const void * __restrict__ vx, dst_t * __restri
|
|
|
492
1066
|
#endif
|
|
493
1067
|
}
|
|
494
1068
|
|
|
1069
|
+
template <typename dst_t>
|
|
1070
|
+
static void dequantize_block_q5_K_reorder(const void * __restrict__ vx, dst_t * __restrict__ yy,
|
|
1071
|
+
uint8_t * scales_local, const sycl::nd_item<3> & item_ct1, int64_t n_blocks) {
|
|
1072
|
+
const int64_t ib = item_ct1.get_group(2);
|
|
1073
|
+
|
|
1074
|
+
#if QK_K == 256
|
|
1075
|
+
// assume 64 threads
|
|
1076
|
+
const int64_t tid = item_ct1.get_local_id(2);
|
|
1077
|
+
const int64_t il = tid / 16; // 0...3
|
|
1078
|
+
const int64_t ir = tid % 16; // 0...15
|
|
1079
|
+
const int64_t is = 2 * il;
|
|
1080
|
+
|
|
1081
|
+
dst_t * y = yy + ib * QK_K + 64 * il + 2 * ir;
|
|
1082
|
+
|
|
1083
|
+
const uint8_t * base = static_cast<const uint8_t *>(vx);
|
|
1084
|
+
|
|
1085
|
+
// Reordered layout: [qs (QK_K/2 per block)] [qh (QK_K/8 per block)] [scales (K_SCALE_SIZE per block)] [dm (half2 per block)]
|
|
1086
|
+
const size_t qs_offset = ib * (QK_K / 2);
|
|
1087
|
+
const size_t qh_offset = n_blocks * (QK_K / 2) + ib * (QK_K / 8);
|
|
1088
|
+
const size_t scales_offset = n_blocks * (QK_K / 2) + n_blocks * (QK_K / 8) + ib * K_SCALE_SIZE;
|
|
1089
|
+
const size_t dm_offset = n_blocks * (QK_K / 2) + n_blocks * (QK_K / 8) + n_blocks * K_SCALE_SIZE + ib * sizeof(ggml_half2);
|
|
1090
|
+
|
|
1091
|
+
const uint8_t * qs_ptr = base + qs_offset;
|
|
1092
|
+
const uint8_t * qh_ptr = base + qh_offset;
|
|
1093
|
+
const uint8_t * scales_ptr = base + scales_offset;
|
|
1094
|
+
const ggml_half2 dm_values = *reinterpret_cast<const ggml_half2 *>(base + dm_offset);
|
|
1095
|
+
|
|
1096
|
+
const float dall = dm_values.x();
|
|
1097
|
+
const float dmin = dm_values.y();
|
|
1098
|
+
|
|
1099
|
+
const uint8_t * ql = qs_ptr + 32 * il + 2 * ir;
|
|
1100
|
+
const uint8_t * qh = qh_ptr + 2 * ir;
|
|
1101
|
+
|
|
1102
|
+
if (tid < K_SCALE_SIZE) {
|
|
1103
|
+
scales_local[tid] = scales_ptr[tid];
|
|
1104
|
+
}
|
|
1105
|
+
|
|
1106
|
+
item_ct1.barrier(sycl::access::fence_space::local_space);
|
|
1107
|
+
|
|
1108
|
+
uint8_t sc, m;
|
|
1109
|
+
get_scale_min_k4(is + 0, scales_local, sc, m);
|
|
1110
|
+
const float d1 = dall * sc; const float m1 = dmin * m;
|
|
1111
|
+
get_scale_min_k4(is + 1, scales_local, sc, m);
|
|
1112
|
+
const float d2 = dall * sc; const float m2 = dmin * m;
|
|
1113
|
+
|
|
1114
|
+
uint8_t hm = 1 << (2 * il);
|
|
1115
|
+
y[ 0] = d1 * ((ql[ 0] & 0xF) + (qh[ 0] & hm ? 16 : 0)) - m1;
|
|
1116
|
+
y[ 1] = d1 * ((ql[ 1] & 0xF) + (qh[ 1] & hm ? 16 : 0)) - m1;
|
|
1117
|
+
hm <<= 1;
|
|
1118
|
+
y[32] = d2 * ((ql[ 0] >> 4) + (qh[ 0] & hm ? 16 : 0)) - m2;
|
|
1119
|
+
y[33] = d2 * ((ql[ 1] >> 4) + (qh[ 1] & hm ? 16 : 0)) - m2;
|
|
1120
|
+
#else
|
|
1121
|
+
GGML_UNUSED(ib); GGML_UNUSED(tid); GGML_UNUSED(yy); GGML_UNUSED(scales_local); GGML_UNUSED(n_blocks);
|
|
1122
|
+
GGML_ABORT("Q5_K reorder dequantize not supported for QK_K != 256");
|
|
1123
|
+
#endif
|
|
1124
|
+
}
|
|
1125
|
+
|
|
495
1126
|
template<typename dst_t>
|
|
496
1127
|
static void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restrict__ yy,
|
|
497
1128
|
const sycl::nd_item<3> &item_ct1) {
|
|
@@ -838,4 +1469,36 @@ static void dequantize_block_mxfp4(const void * __restrict__ vx, dst_t * __restr
|
|
|
838
1469
|
}
|
|
839
1470
|
}
|
|
840
1471
|
|
|
1472
|
+
|
|
1473
|
+
template <typename dst_t>
|
|
1474
|
+
static void dequantize_block_nvfp4(
|
|
1475
|
+
const void * __restrict__ vx,
|
|
1476
|
+
dst_t * __restrict__ yy,
|
|
1477
|
+
const int64_t ne) {
|
|
1478
|
+
auto item_ct1 = sycl::ext::oneapi::this_work_item::get_nd_item<3>();
|
|
1479
|
+
const int64_t i = item_ct1.get_group(2);
|
|
1480
|
+
const int tid = item_ct1.get_local_id(2);
|
|
1481
|
+
|
|
1482
|
+
const int64_t base = i * QK_NVFP4;
|
|
1483
|
+
if (base >= ne) {
|
|
1484
|
+
return;
|
|
1485
|
+
}
|
|
1486
|
+
|
|
1487
|
+
const block_nvfp4 * x = (const block_nvfp4 *) vx;
|
|
1488
|
+
const block_nvfp4 & xb = x[i];
|
|
1489
|
+
|
|
1490
|
+
const int sub = tid / (QK_NVFP4_SUB / 2);
|
|
1491
|
+
const int j = tid % (QK_NVFP4_SUB / 2);
|
|
1492
|
+
|
|
1493
|
+
const float d = ggml_sycl_ue4m3_to_fp32(xb.d[sub]);
|
|
1494
|
+
const uint8_t q = xb.qs[sub * (QK_NVFP4_SUB / 2) + j];
|
|
1495
|
+
|
|
1496
|
+
const int64_t y0 = base + sub * QK_NVFP4_SUB + j;
|
|
1497
|
+
const int64_t y1 = y0 + QK_NVFP4_SUB / 2;
|
|
1498
|
+
|
|
1499
|
+
yy[y0] = ggml_sycl_cast<dst_t>(d * kvalues_mxfp4[q & 0x0F]);
|
|
1500
|
+
yy[y1] = ggml_sycl_cast<dst_t>(d * kvalues_mxfp4[q >> 4]);
|
|
1501
|
+
}
|
|
1502
|
+
|
|
1503
|
+
|
|
841
1504
|
#endif // GGML_SYCL_DEQUANTIZE_HPP
|