whispercpp 1.3.5 → 1.3.7
This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.document +3 -0
- data/.rdoc_options +2 -0
- data/LICENSE +1 -1
- data/README.md +133 -3
- data/Rakefile +18 -3
- data/ext/dependencies.rb +10 -4
- data/ext/dependencies_for_windows.rb +17 -0
- data/ext/extconf.rb +20 -7
- data/ext/options.rb +54 -14
- data/ext/options_for_windows.rb +51 -0
- data/ext/ruby_whisper.c +56 -46
- data/ext/ruby_whisper.h +165 -2
- data/ext/ruby_whisper_context.c +297 -126
- data/ext/ruby_whisper_context_params.c +163 -0
- data/ext/ruby_whisper_log_queue.c +180 -0
- data/ext/ruby_whisper_log_settable.h +47 -0
- data/ext/ruby_whisper_model.c +0 -1
- data/ext/ruby_whisper_parakeet.c +49 -0
- data/ext/ruby_whisper_parakeet_context.c +304 -0
- data/ext/ruby_whisper_parakeet_context_params.c +117 -0
- data/ext/ruby_whisper_parakeet_model.c +84 -0
- data/ext/ruby_whisper_parakeet_params.c +548 -0
- data/ext/ruby_whisper_parakeet_segment.c +157 -0
- data/ext/ruby_whisper_parakeet_token.c +188 -0
- data/ext/ruby_whisper_parakeet_transcribe.cpp +58 -0
- data/ext/ruby_whisper_params.c +256 -66
- data/ext/ruby_whisper_segment.c +6 -7
- data/ext/ruby_whisper_token.c +29 -9
- data/ext/ruby_whisper_transcribe.cpp +46 -16
- data/ext/ruby_whisper_vad_context.c +48 -1
- data/ext/ruby_whisper_vad_context_detect.cpp +6 -5
- data/ext/ruby_whisper_vad_params.c +0 -1
- data/ext/ruby_whisper_vad_segment.c +0 -1
- data/ext/ruby_whisper_vad_segments.c +0 -1
- data/ext/sources/CMakeLists.txt +41 -3
- data/ext/sources/CMakePresets.json +95 -0
- data/ext/sources/cmake/parakeet-config.cmake.in +30 -0
- data/ext/sources/cmake/parakeet.pc.in +10 -0
- data/ext/sources/cmake/whisper-config.cmake.in +5 -40
- data/ext/sources/cmake/whisper.pc.in +1 -1
- data/ext/sources/examples/CMakeLists.txt +4 -2
- data/ext/sources/examples/bench/bench.cpp +24 -19
- data/ext/sources/examples/cli/cli.cpp +51 -9
- data/ext/sources/examples/common-ggml.cpp +4 -0
- data/ext/sources/examples/common-whisper.cpp +139 -67
- data/ext/sources/examples/common-whisper.h +11 -0
- data/ext/sources/examples/ffmpeg-transcode.cpp +211 -341
- data/ext/sources/examples/miniaudio.h +4507 -2131
- data/ext/sources/examples/parakeet-cli/CMakeLists.txt +8 -0
- data/ext/sources/examples/parakeet-cli/parakeet-cli.cpp +243 -0
- data/ext/sources/examples/parakeet-quantize/CMakeLists.txt +7 -0
- data/ext/sources/examples/parakeet-quantize/parakeet-quantize.cpp +230 -0
- data/ext/sources/examples/server/server.cpp +213 -163
- data/ext/sources/ggml/CMakeLists.txt +29 -15
- data/ext/sources/ggml/cmake/FindNCCL.cmake +36 -0
- data/ext/sources/ggml/cmake/ggml-config.cmake.in +12 -2
- data/ext/sources/ggml/include/ggml-alloc.h +1 -0
- data/ext/sources/ggml/include/ggml-backend.h +73 -11
- data/ext/sources/ggml/include/ggml-cann.h +1 -1
- data/ext/sources/ggml/include/ggml-cpu.h +5 -0
- data/ext/sources/ggml/include/ggml-cuda.h +3 -0
- data/ext/sources/ggml/include/ggml-openvino.h +37 -0
- data/ext/sources/ggml/include/ggml-opt.h +1 -1
- data/ext/sources/ggml/include/ggml-rpc.h +8 -3
- data/ext/sources/ggml/include/ggml-virtgpu.h +14 -0
- data/ext/sources/ggml/include/ggml.h +155 -16
- data/ext/sources/ggml/include/gguf.h +10 -2
- data/ext/sources/ggml/src/CMakeLists.txt +25 -5
- data/ext/sources/ggml/src/ggml-alloc.c +9 -10
- data/ext/sources/ggml/src/ggml-backend-dl.cpp +48 -0
- data/ext/sources/ggml/src/ggml-backend-dl.h +45 -0
- data/ext/sources/ggml/src/ggml-backend-impl.h +22 -2
- data/ext/sources/ggml/src/ggml-backend-meta.cpp +2263 -0
- data/ext/sources/ggml/src/ggml-backend-reg.cpp +40 -86
- data/ext/sources/ggml/src/ggml-backend.cpp +114 -10
- data/ext/sources/ggml/src/ggml-blas/CMakeLists.txt +1 -1
- data/ext/sources/ggml/src/ggml-blas/ggml-blas.cpp +10 -2
- data/ext/sources/ggml/src/ggml-cann/acl_tensor.cpp +1 -1
- data/ext/sources/ggml/src/ggml-cann/acl_tensor.h +1 -1
- data/ext/sources/ggml/src/ggml-cann/aclnn_ops.cpp +1016 -442
- data/ext/sources/ggml/src/ggml-cann/aclnn_ops.h +111 -85
- data/ext/sources/ggml/src/ggml-cann/common.h +23 -14
- data/ext/sources/ggml/src/ggml-cann/ggml-cann.cpp +255 -92
- data/ext/sources/ggml/src/ggml-common.h +22 -0
- data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +68 -34
- data/ext/sources/ggml/src/ggml-cpu/amx/amx.cpp +44 -19
- data/ext/sources/ggml/src/ggml-cpu/amx/common.h +34 -10
- data/ext/sources/ggml/src/ggml-cpu/amx/mmq.cpp +101 -101
- data/ext/sources/ggml/src/ggml-cpu/arch/arm/quants.c +194 -1
- data/ext/sources/ggml/src/ggml-cpu/arch/arm/repack.cpp +2874 -613
- data/ext/sources/ggml/src/ggml-cpu/arch/loongarch/quants.c +151 -1
- data/ext/sources/ggml/src/ggml-cpu/arch/powerpc/quants.c +0 -1
- data/ext/sources/ggml/src/ggml-cpu/arch/riscv/quants.c +5480 -840
- data/ext/sources/ggml/src/ggml-cpu/arch/riscv/repack.cpp +1361 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/s390/quants.c +8 -11
- data/ext/sources/ggml/src/ggml-cpu/arch/wasm/quants.c +72 -1
- data/ext/sources/ggml/src/ggml-cpu/arch/x86/quants.c +186 -36
- data/ext/sources/ggml/src/ggml-cpu/arch/x86/repack.cpp +119 -19
- data/ext/sources/ggml/src/ggml-cpu/arch-fallback.h +112 -26
- data/ext/sources/ggml/src/ggml-cpu/binary-ops.cpp +2 -6
- data/ext/sources/ggml/src/ggml-cpu/cmake/FindSMTIME.cmake +32 -0
- data/ext/sources/ggml/src/ggml-cpu/common.h +8 -0
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-impl.h +13 -0
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +153 -16
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.cpp +17 -0
- data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.cpp +21 -20
- data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +976 -251
- data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +671 -266
- data/ext/sources/ggml/src/ggml-cpu/ops.cpp +1277 -263
- data/ext/sources/ggml/src/ggml-cpu/ops.h +4 -0
- data/ext/sources/ggml/src/ggml-cpu/quants.c +95 -0
- data/ext/sources/ggml/src/ggml-cpu/quants.h +6 -0
- data/ext/sources/ggml/src/ggml-cpu/repack.cpp +2893 -679
- data/ext/sources/ggml/src/ggml-cpu/repack.h +119 -8
- data/ext/sources/ggml/src/ggml-cpu/simd-gemm.h +226 -0
- data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +114 -19
- data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.cpp +1402 -687
- data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.h +8 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp +597 -2766
- data/ext/sources/ggml/src/ggml-cpu/spacemit/ime2_kernels.cpp +5768 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/ime_env.cpp +320 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/ime_env.h +55 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/ime_kernels.h +182 -19
- data/ext/sources/ggml/src/ggml-cpu/spacemit/repack.cpp +1795 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/repack.h +14 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/rvv_kernels.cpp +3178 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/rvv_kernels.h +95 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/spine_barrier.h +34 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/spine_mem_pool.cpp +760 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/spine_mem_pool.h +32 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/spine_tcm.h +409 -0
- data/ext/sources/ggml/src/ggml-cpu/unary-ops.cpp +1 -1
- data/ext/sources/ggml/src/ggml-cpu/vec.cpp +54 -53
- data/ext/sources/ggml/src/ggml-cpu/vec.h +225 -240
- data/ext/sources/ggml/src/ggml-cuda/CMakeLists.txt +18 -8
- data/ext/sources/ggml/src/ggml-cuda/allreduce.cu +971 -0
- data/ext/sources/ggml/src/ggml-cuda/allreduce.cuh +29 -0
- data/ext/sources/ggml/src/ggml-cuda/argsort.cu +73 -28
- data/ext/sources/ggml/src/ggml-cuda/binbcast.cu +69 -41
- data/ext/sources/ggml/src/ggml-cuda/binbcast.cuh +1 -0
- data/ext/sources/ggml/src/ggml-cuda/common.cuh +359 -29
- data/ext/sources/ggml/src/ggml-cuda/concat.cu +120 -114
- data/ext/sources/ggml/src/ggml-cuda/conv2d-transpose.cu +45 -21
- data/ext/sources/ggml/src/ggml-cuda/conv2d-transpose.cuh +1 -0
- data/ext/sources/ggml/src/ggml-cuda/convert.cu +94 -27
- data/ext/sources/ggml/src/ggml-cuda/convert.cuh +10 -0
- data/ext/sources/ggml/src/ggml-cuda/cpy.cu +20 -9
- data/ext/sources/ggml/src/ggml-cuda/dequantize.cuh +22 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +333 -85
- data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +632 -190
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cu +12 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cuh +162 -49
- data/ext/sources/ggml/src/ggml-cuda/fattn-vec.cuh +43 -18
- data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +44 -14
- data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cuh +1 -1
- data/ext/sources/ggml/src/ggml-cuda/fattn.cu +241 -23
- data/ext/sources/ggml/src/ggml-cuda/fattn.cuh +2 -0
- data/ext/sources/ggml/src/ggml-cuda/fwht.cu +101 -0
- data/ext/sources/ggml/src/ggml-cuda/fwht.cuh +4 -0
- data/ext/sources/ggml/src/ggml-cuda/gated_delta_net.cu +312 -0
- data/ext/sources/ggml/src/ggml-cuda/gated_delta_net.cuh +4 -0
- data/ext/sources/ggml/src/ggml-cuda/getrows.cu +34 -12
- data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +1454 -599
- data/ext/sources/ggml/src/ggml-cuda/im2col.cu +32 -29
- data/ext/sources/ggml/src/ggml-cuda/mean.cu +13 -10
- data/ext/sources/ggml/src/ggml-cuda/mma.cuh +397 -183
- data/ext/sources/ggml/src/ggml-cuda/mmf.cu +30 -10
- data/ext/sources/ggml/src/ggml-cuda/mmf.cuh +161 -88
- data/ext/sources/ggml/src/ggml-cuda/mmq.cu +18 -12
- data/ext/sources/ggml/src/ggml-cuda/mmq.cuh +522 -431
- data/ext/sources/ggml/src/ggml-cuda/mmvf.cu +139 -72
- data/ext/sources/ggml/src/ggml-cuda/mmvf.cuh +2 -0
- data/ext/sources/ggml/src/ggml-cuda/mmvq.cu +608 -88
- data/ext/sources/ggml/src/ggml-cuda/mmvq.cuh +6 -0
- data/ext/sources/ggml/src/ggml-cuda/norm.cu +47 -79
- data/ext/sources/ggml/src/ggml-cuda/out-prod.cu +23 -7
- data/ext/sources/ggml/src/ggml-cuda/pad.cu +13 -10
- data/ext/sources/ggml/src/ggml-cuda/quantize.cu +134 -27
- data/ext/sources/ggml/src/ggml-cuda/quantize.cuh +1 -1
- data/ext/sources/ggml/src/ggml-cuda/reduce_rows.cuh +7 -17
- data/ext/sources/ggml/src/ggml-cuda/rope.cu +244 -137
- data/ext/sources/ggml/src/ggml-cuda/scale.cu +4 -1
- data/ext/sources/ggml/src/ggml-cuda/set-rows.cu +14 -6
- data/ext/sources/ggml/src/ggml-cuda/snake.cu +72 -0
- data/ext/sources/ggml/src/ggml-cuda/snake.cuh +8 -0
- data/ext/sources/ggml/src/ggml-cuda/softcap.cu +4 -1
- data/ext/sources/ggml/src/ggml-cuda/softmax.cu +8 -83
- data/ext/sources/ggml/src/ggml-cuda/solve_tri.cu +1 -1
- data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cu +96 -40
- data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cuh +1 -1
- data/ext/sources/ggml/src/ggml-cuda/ssm-scan.cu +40 -18
- data/ext/sources/ggml/src/ggml-cuda/sumrows.cu +8 -4
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_16.cu +1 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_32.cu +6 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_8.cu +2 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_4.cu +2 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_16.cu +1 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_32.cu +6 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_4.cu +2 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_8.cu +2 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_16.cu +1 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_4.cu +2 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_8.cu +2 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_4.cu +2 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_8.cu +2 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq192-dv128.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq320-dv256.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq512-dv512.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-bf16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-f16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q4_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q4_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q5_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q5_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q8_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-bf16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-bf16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-bf16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-bf16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-bf16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-bf16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-nvfp4.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q1_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/top-k.cu +5 -5
- data/ext/sources/ggml/src/ggml-cuda/topk-moe.cu +202 -135
- data/ext/sources/ggml/src/ggml-cuda/topk-moe.cuh +20 -14
- data/ext/sources/ggml/src/ggml-cuda/unary.cu +86 -2
- data/ext/sources/ggml/src/ggml-cuda/unary.cuh +4 -0
- data/ext/sources/ggml/src/ggml-cuda/vecdotq.cuh +111 -17
- data/ext/sources/ggml/src/ggml-cuda/vendors/cuda.h +7 -2
- data/ext/sources/ggml/src/ggml-cuda/vendors/hip.h +30 -2
- data/ext/sources/ggml/src/ggml-cuda/vendors/musa.h +3 -0
- data/ext/sources/ggml/src/ggml-hexagon/CMakeLists.txt +84 -46
- data/ext/sources/ggml/src/ggml-hexagon/ggml-hexagon.cpp +1612 -753
- data/ext/sources/ggml/src/ggml-hexagon/htp/CMakeLists.txt +51 -11
- data/ext/sources/ggml/src/ggml-hexagon/htp/act-ops.c +361 -261
- data/ext/sources/ggml/src/ggml-hexagon/htp/argsort-ops.c +294 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/binary-ops.c +753 -241
- data/ext/sources/ggml/src/ggml-hexagon/htp/cmake-toolchain.cmake +5 -5
- data/ext/sources/ggml/src/ggml-hexagon/htp/concat-ops.c +277 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/cpy-ops.c +295 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/cumsum-ops.c +270 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/diag-ops.c +216 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/fill-ops.c +123 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/flash-attn-ops.c +471 -296
- data/ext/sources/ggml/src/ggml-hexagon/htp/gated-delta-net-ops.c +1148 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/get-rows-ops.c +159 -53
- data/ext/sources/ggml/src/ggml-hexagon/htp/{htp-dma.c → hex-dma.c} +3 -3
- data/ext/sources/ggml/src/ggml-hexagon/htp/hex-dma.h +372 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hex-dump.h +86 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hex-fastdiv.h +37 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hex-utils.h +137 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-flash-attn-ops.c +1878 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c +2066 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-ops.c +6 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-ops.h +88 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-profile.h +34 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-queue.c +158 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-queue.h +134 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-utils.h +200 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ctx.h +97 -14
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ops.h +163 -67
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp_iface.idl +9 -3
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-arith.h +443 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-base.h +308 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-copy.h +262 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-div.h +291 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-dump.h +129 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-exp.h +216 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-flash-attn.h +47 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-floor.h +100 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-inverse.h +210 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-log.h +65 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-pow.h +42 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-reduce.h +296 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-repl.h +74 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-scale.h +133 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sigmoid.h +142 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sin-cos.h +90 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sqrt.h +126 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-types.h +36 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-utils.h +18 -1348
- data/ext/sources/ggml/src/ggml-hexagon/htp/main.c +547 -635
- data/ext/sources/ggml/src/ggml-hexagon/htp/matmul-ops.c +3556 -1101
- data/ext/sources/ggml/src/ggml-hexagon/htp/pad-ops.c +547 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/repeat-ops.c +148 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/rope-ops.c +475 -269
- data/ext/sources/ggml/src/ggml-hexagon/htp/set-rows-ops.c +94 -72
- data/ext/sources/ggml/src/ggml-hexagon/htp/softmax-ops.c +222 -217
- data/ext/sources/ggml/src/ggml-hexagon/htp/solve-tri-ops.c +267 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/ssm-conv.c +432 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/sum-rows-ops.c +128 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/unary-ops.c +886 -117
- data/ext/sources/ggml/src/ggml-hexagon/htp/vtcm-utils.h +16 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/worker-pool.c +1 -5
- data/ext/sources/ggml/src/ggml-hexagon/htp-drv.cpp +418 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp-drv.h +121 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp-opnode.h +272 -0
- data/ext/sources/ggml/src/ggml-hexagon/libdl.h +79 -0
- data/ext/sources/ggml/src/ggml-hexagon/libggml-htp.inf +40 -0
- data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +28 -9
- data/ext/sources/ggml/src/ggml-impl.h +68 -1
- data/ext/sources/ggml/src/ggml-metal/CMakeLists.txt +10 -10
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-common.cpp +13 -2
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.h +8 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.m +147 -17
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.cpp +409 -83
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.h +54 -5
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.m +254 -52
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +254 -23
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.cpp +756 -285
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.h +7 -4
- data/ext/sources/ggml/src/ggml-metal/ggml-metal.cpp +359 -133
- data/ext/sources/ggml/src/ggml-metal/ggml-metal.metal +1867 -1123
- data/ext/sources/ggml/src/ggml-musa/CMakeLists.txt +5 -6
- data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +71 -4
- data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +14127 -5314
- data/ext/sources/ggml/src/ggml-opencl/kernels/concat.cl +97 -88
- data/ext/sources/ggml/src/ggml-opencl/kernels/cpy.cl +104 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/cumsum.cl +139 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/cvt.cl +1978 -67
- data/ext/sources/ggml/src/ggml-opencl/kernels/diag.cl +27 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/exp.cl +125 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/expm1.cl +87 -56
- data/ext/sources/ggml/src/ggml-opencl/kernels/gated_delta_net.cl +249 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_mxfp4_f32_ns.cl +306 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q4_0_f32_ns.cl +256 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q4_1_f32_ns.cl +258 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q4_k_f32_ns.cl +283 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q5_0_f32_ns.cl +260 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q5_1_f32_ns.cl +262 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q5_k_f32_ns.cl +288 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q6_k_f32_ns.cl +267 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_iq4_nl_f32.cl +150 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/{mul_mat_Ab_Bi_8x4.cl → gemm_noshuffle_q4_0_f32.cl} +1 -1
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q4_1_f32.cl +132 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q4_k_f32.cl +172 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q5_0_f32.cl +131 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q5_1_f32.cl +134 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q5_k_f32.cl +176 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q6_k_f32.cl +140 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q8_0_f32.cl +129 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_xmem_f16_f32_os8.cl +233 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_mxfp4_f32_ns.cl +165 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q4_0_f32_ns.cl +120 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q4_1_f32_ns.cl +123 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q4_k_f32_ns.cl +155 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q5_0_f32_ns.cl +123 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q5_1_f32_ns.cl +125 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q5_k_f32_ns.cl +160 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q6_k_f32_ns.cl +141 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_iq4_nl_f32.cl +302 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/{gemv_noshuffle_general.cl → gemv_noshuffle_q4_0_f32.cl} +5 -5
- data/ext/sources/ggml/src/ggml-opencl/kernels/{gemv_noshuffle.cl → gemv_noshuffle_q4_0_f32_spec.cl} +5 -5
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q4_1_f32.cl +283 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q4_k_f32.cl +318 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q5_0_f32.cl +291 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q5_1_f32.cl +294 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q5_k_f32.cl +326 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q6_k_f32.cl +293 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q8_0_f32.cl +195 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/get_rows.cl +15 -9
- data/ext/sources/ggml/src/ggml-opencl/kernels/l2_norm.cl +71 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mean.cl +114 -13
- data/ext/sources/ggml/src/ggml-opencl/kernels/moe_reorder_b.cl +30 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/moe_sort_by_expert.cl +82 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_iq4_nl_f32_l4_lm.cl +171 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q4_0_f32_l4_lm.cl +163 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q4_1_f32_l4_lm.cl +165 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q4_k_f32_l4_lm.cl +179 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q5_0_f32_l4_lm.cl +173 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q5_1_f32_l4_lm.cl +175 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q5_k_f32_l4_lm.cl +192 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q6_k_f32_l4_lm.cl +158 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_iq4_nl_f32.cl +164 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_iq4_nl_f32_flat.cl +202 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32.cl +219 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32_flat.cl +229 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_k_f32.cl +180 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_k_f32_flat.cl +196 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_0_f32.cl +241 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_0_f32_flat.cl +243 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_1_f32.cl +243 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_1_f32_flat.cl +247 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_k_f32.cl +187 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_k_f32_flat.cl +203 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/{mul_mv_q6_k.cl → mul_mv_q6_k_f32.cl} +4 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32_flat.cl +178 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/neg.cl +125 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/repeat.cl +31 -32
- data/ext/sources/ggml/src/ggml-opencl/kernels/scale.cl +14 -4
- data/ext/sources/ggml/src/ggml-opencl/kernels/softplus.cl +88 -60
- data/ext/sources/ggml/src/ggml-opencl/kernels/solve_tri.cl +51 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/sum_rows.cl +114 -13
- data/ext/sources/ggml/src/ggml-opencl/kernels/tanh.cl +94 -48
- data/ext/sources/ggml/src/ggml-opencl/kernels/transpose.cl +26 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/tri.cl +32 -0
- data/ext/sources/ggml/src/ggml-openvino/.clang-format +154 -0
- data/ext/sources/ggml/src/ggml-openvino/CMakeLists.txt +22 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-decoder.cpp +985 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-decoder.h +294 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-openvino-extra.cpp +380 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-openvino-extra.h +182 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-openvino.cpp +1132 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-quants.cpp +956 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-quants.h +153 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/decoder.h +74 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/frontend.cpp +27 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/frontend.h +23 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/input_model.cpp +17 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/input_model.h +29 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/node_context.h +112 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/cont.cpp +48 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/cpy.cpp +21 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp +90 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/get_rows.cpp +69 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/glu_geglu.cpp +61 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp +62 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/mulmat.cpp +90 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/permute.cpp +102 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/reshape.cpp +83 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp +46 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/rope.cpp +149 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/scale.cpp +41 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/set_rows.cpp +76 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/softmax.cpp +89 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/transpose.cpp +23 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/unary_gelu.cpp +25 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/unary_silu.cpp +27 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/view.cpp +53 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op_table.cpp +47 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op_table.h +40 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.cpp +60 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.h +17 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/mark_decompression_convert_constant_folding.h +29 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/squeeze_matmul.cpp +58 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/squeeze_matmul.h +17 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/rt_info/weightless_caching_attributes.hpp +41 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/translate_session.cpp +317 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/translate_session.h +28 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/utils.cpp +257 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/utils.h +86 -0
- data/ext/sources/ggml/src/ggml-openvino/utils.cpp +880 -0
- data/ext/sources/ggml/src/ggml-openvino/utils.h +143 -0
- data/ext/sources/ggml/src/ggml-opt.cpp +1 -0
- data/ext/sources/ggml/src/ggml-quants.c +385 -119
- data/ext/sources/ggml/src/ggml-quants.h +6 -0
- data/ext/sources/ggml/src/ggml-rpc/CMakeLists.txt +24 -0
- data/ext/sources/ggml/src/ggml-rpc/ggml-rpc.cpp +167 -311
- data/ext/sources/ggml/src/ggml-rpc/transport.cpp +683 -0
- data/ext/sources/ggml/src/ggml-rpc/transport.h +34 -0
- data/ext/sources/ggml/src/ggml-sycl/CMakeLists.txt +64 -91
- data/ext/sources/ggml/src/ggml-sycl/add-id.cpp +5 -1
- data/ext/sources/ggml/src/ggml-sycl/backend.hpp +4 -1
- data/ext/sources/ggml/src/ggml-sycl/binbcast.cpp +21 -20
- data/ext/sources/ggml/src/ggml-sycl/common.cpp +74 -2
- data/ext/sources/ggml/src/ggml-sycl/common.hpp +356 -11
- data/ext/sources/ggml/src/ggml-sycl/convert.cpp +184 -14
- data/ext/sources/ggml/src/ggml-sycl/convert.hpp +31 -1
- data/ext/sources/ggml/src/ggml-sycl/count-equal.cpp +1 -1
- data/ext/sources/ggml/src/ggml-sycl/cumsum.cpp +148 -0
- data/ext/sources/ggml/src/ggml-sycl/cumsum.hpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/dequantize.hpp +663 -0
- data/ext/sources/ggml/src/ggml-sycl/diag.cpp +67 -0
- data/ext/sources/ggml/src/ggml-sycl/diag.hpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/dmmv.cpp +586 -6
- data/ext/sources/ggml/src/ggml-sycl/dpct/helper.hpp +791 -47
- data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +77 -156
- data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +2 -2
- data/ext/sources/ggml/src/ggml-sycl/fattn-buffers.cpp +56 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn-buffers.hpp +63 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn-common.hpp +1181 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn-tile.cpp +59 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn-tile.hpp +1246 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn-vec.hpp +674 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn.cpp +227 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn.hpp +22 -0
- data/ext/sources/ggml/src/ggml-sycl/fill.cpp +55 -0
- data/ext/sources/ggml/src/ggml-sycl/fill.hpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/gated_delta_net.cpp +347 -0
- data/ext/sources/ggml/src/ggml-sycl/gated_delta_net.hpp +9 -0
- data/ext/sources/ggml/src/ggml-sycl/gemm.hpp +3 -0
- data/ext/sources/ggml/src/ggml-sycl/getrows.cpp +79 -3
- data/ext/sources/ggml/src/ggml-sycl/ggml-sycl.cpp +1134 -236
- data/ext/sources/ggml/src/ggml-sycl/im2col.cpp +353 -89
- data/ext/sources/ggml/src/ggml-sycl/im2col.hpp +5 -3
- data/ext/sources/ggml/src/ggml-sycl/mmvq.cpp +1344 -26
- data/ext/sources/ggml/src/ggml-sycl/mmvq.hpp +16 -0
- data/ext/sources/ggml/src/ggml-sycl/norm.cpp +65 -66
- data/ext/sources/ggml/src/ggml-sycl/outprod.cpp +3 -3
- data/ext/sources/ggml/src/ggml-sycl/pad.cpp +27 -27
- data/ext/sources/ggml/src/ggml-sycl/presets.hpp +3 -0
- data/ext/sources/ggml/src/ggml-sycl/quants.hpp +72 -1
- data/ext/sources/ggml/src/ggml-sycl/rope.cpp +450 -287
- data/ext/sources/ggml/src/ggml-sycl/rope.hpp +6 -0
- data/ext/sources/ggml/src/ggml-sycl/set_rows.cpp +7 -1
- data/ext/sources/ggml/src/ggml-sycl/softmax.cpp +6 -6
- data/ext/sources/ggml/src/ggml-sycl/solve_tri.cpp +172 -0
- data/ext/sources/ggml/src/ggml-sycl/solve_tri.hpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/ssm_conv.cpp +6 -1
- data/ext/sources/ggml/src/ggml-sycl/ssm_scan.cpp +156 -0
- data/ext/sources/ggml/src/ggml-sycl/ssm_scan.hpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/sycl_hw.cpp +62 -10
- data/ext/sources/ggml/src/ggml-sycl/sycl_hw.hpp +18 -6
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq112-dv112.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq128-dv128.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq256-dv256.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq40-dv40.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq512-dv512.cpp +6 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq576-dv512.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq64-dv64.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq72-dv72.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq80-dv80.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq96-dv96.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-f16.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_0.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_1.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_0.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_1.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q8_0.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-f16.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_0.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_1.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_0.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_1.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q8_0.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-f16.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_0.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_1.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_0.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_1.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q8_0.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-f16.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_0.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_1.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_0.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_1.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q8_0.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-f16.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_0.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_1.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_0.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_1.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q8_0.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-f16.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_0.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_1.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_0.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_1.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q8_0.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/type.hpp +112 -0
- data/ext/sources/ggml/src/ggml-sycl/upscale.cpp +410 -0
- data/ext/sources/ggml/src/ggml-sycl/upscale.hpp +9 -0
- data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +228 -53
- data/ext/sources/ggml/src/ggml-sycl/wkv.cpp +1 -1
- data/ext/sources/ggml/src/ggml-virtgpu/CMakeLists.txt +70 -0
- data/ext/sources/ggml/src/ggml-virtgpu/apir_cs_ggml-rpc-front.cpp +87 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/CMakeLists.txt +21 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/apir_cs_ggml-rpc-back.cpp +115 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-convert.h +13 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-backend.cpp +102 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer-type.cpp +105 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer.cpp +179 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-device.cpp +148 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched.cpp +51 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched.gen.h +73 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched.h +27 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-virgl-apir.h +32 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend.cpp +144 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/api_remoting.h +95 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_backend.gen.h +94 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_backend.h +50 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_cs.h +378 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_cs_ggml.h +232 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_cs_rpc.h +58 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-buffer-type.cpp +81 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-buffer.cpp +123 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-device.cpp +160 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-reg.cpp +213 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend.cpp +71 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-remoting.h +71 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggmlremoting_functions.yaml +166 -0
- data/ext/sources/ggml/src/ggml-virtgpu/include/apir_hw.h +9 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-apir.h +15 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-backend.cpp +58 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-buffer-type.cpp +110 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-buffer.cpp +173 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-device.cpp +192 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-impl.h +36 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward.gen.h +53 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-shm.cpp +99 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-shm.h +23 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-utils.cpp +179 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-utils.h +86 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu.cpp +545 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu.h +115 -0
- data/ext/sources/ggml/src/ggml-vulkan/CMakeLists.txt +12 -1
- data/ext/sources/ggml/src/ggml-vulkan/ggml-vulkan.cpp +3250 -940
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/acc.comp +16 -8
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/contig_copy.comp +6 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +146 -13
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy.comp +3 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +25 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.glsl +88 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.glsl +643 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_nvfp4.comp +32 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q1_0.comp +29 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/diag.comp +0 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dot_product_funcs.glsl +27 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/elu.comp +27 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/exp.comp +0 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/feature-tests/coopmat2_decode_vector.comp +7 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +533 -180
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.glsl +113 -68
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +412 -222
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +222 -83
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_dequant.glsl +131 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_mask_opt.comp +162 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_mmq_funcs.glsl +203 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +9 -8
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/fwht.comp +115 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gated_delta_net.comp +189 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/generic_binary_head.glsl +0 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.glsl +10 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_main.glsl +16 -6
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +76 -54
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col_3d.comp +0 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/l2_norm.comp +12 -9
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/log.comp +0 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp +122 -27
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.glsl +20 -17
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iface.glsl +6 -6
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp +1 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq_funcs.glsl +88 -55
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +22 -20
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +51 -14
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.glsl +159 -125
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_id_funcs.glsl +3 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +5 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.glsl +8 -8
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_shmem_types.glsl +24 -9
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp +0 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +2 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl +39 -63
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.glsl +0 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +7 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +7 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +7 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl +13 -7
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_vision.comp +7 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sgn.comp +21 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/snake.comp +49 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/ssm_conv.comp +27 -11
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/tri.comp +0 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/types.glsl +79 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +193 -149
- data/ext/sources/ggml/src/ggml-webgpu/CMakeLists.txt +5 -2
- data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp +3221 -97
- data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu.cpp +3493 -1997
- data/ext/sources/ggml/src/ggml-webgpu/pre_wgsl.hpp +37 -7
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/add_id.wgsl +64 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/argmax.wgsl +72 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/argsort.wgsl +106 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/argsort_merge.wgsl +134 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary.wgsl +142 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +115 -141
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/concat.wgsl +93 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/conv2d.wgsl +165 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{cpy.tmpl.wgsl → cpy.wgsl} +25 -44
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cumsum.wgsl +66 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn.wgsl +198 -230
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_quant_staging.tmpl +124 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_tile.wgsl +397 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_vec_blk.wgsl +101 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_vec_reduce.wgsl +84 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_vec_split.wgsl +619 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/gated_delta_net.wgsl +149 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{get_rows.tmpl.wgsl → get_rows.wgsl} +234 -335
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/glu.wgsl +155 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/im2col.wgsl +101 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl +871 -42
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_id.wgsl +195 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_id_gather.wgsl +52 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_id_vec.wgsl +154 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.wgsl +149 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{mul_mat_subgroup_matrix.tmpl.wgsl → mul_mat_subgroup_matrix.wgsl} +36 -138
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.wgsl +151 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec_acc.tmpl +1432 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec_q_acc.tmpl +303 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/pad.wgsl +86 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/quant_inner_loops.tmpl +21 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/quantize_q8.wgsl +173 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/repeat.wgsl +67 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm_mul.wgsl +152 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{rope.tmpl.wgsl → rope.wgsl} +71 -142
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/row_norm.wgsl +153 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{scale.tmpl.wgsl → scale.wgsl} +15 -40
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set.wgsl +109 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +39 -12
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows_quant.wgsl +224 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{soft_max.tmpl.wgsl → soft_max.wgsl} +106 -206
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/solve_tri.wgsl +121 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/ssm_conv.wgsl +65 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/ssm_scan.wgsl +193 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/sum_rows.wgsl +55 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/unary.wgsl +213 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/upscale.wgsl +240 -0
- data/ext/sources/ggml/src/ggml-zdnn/ggml-zdnn.cpp +24 -15
- data/ext/sources/ggml/src/ggml-zendnn/CMakeLists.txt +31 -32
- data/ext/sources/ggml/src/ggml-zendnn/ggml-zendnn.cpp +253 -16
- data/ext/sources/ggml/src/ggml.c +268 -52
- data/ext/sources/ggml/src/gguf.cpp +377 -47
- data/ext/sources/include/parakeet.h +342 -0
- data/ext/sources/include/whisper.h +10 -0
- data/ext/sources/media/matmul.png +0 -0
- data/ext/sources/src/CMakeLists.txt +23 -0
- data/ext/sources/src/parakeet-arch.h +188 -0
- data/ext/sources/src/parakeet.cpp +3838 -0
- data/ext/sources/src/whisper.cpp +62 -40
- data/extsources.rb +26 -10
- data/lib/whisper/log_settable.rb +36 -0
- data/lib/whisper/model/uri.rb +13 -1
- data/lib/whisper/output.rb +74 -0
- data/sig/whisper.rbs +445 -55
- data/test/helper.rb +2 -0
- data/test/jfk_reader/jfk_reader.c +50 -7
- data/test/test_callback.rb +1 -0
- data/test/test_context_params.rb +82 -0
- data/test/test_package.rb +6 -5
- data/test/test_parakeet.rb +28 -0
- data/test/test_parakeet_callback.rb +107 -0
- data/test/test_parakeet_context.rb +116 -0
- data/test/test_parakeet_context_params.rb +24 -0
- data/test/test_parakeet_model.rb +21 -0
- data/test/test_parakeet_params.rb +78 -0
- data/test/test_parakeet_segment.rb +42 -0
- data/test/test_parakeet_token.rb +73 -0
- data/test/test_params.rb +2 -0
- data/test/test_token.rb +11 -0
- data/test/test_vad_context.rb +58 -8
- data/test/test_vad_segment.rb +1 -1
- data/test/test_whisper.rb +44 -6
- data/whispercpp.gemspec +2 -2
- metadata +426 -280
- data/ext/sources/bindings/javascript/CMakeLists.txt +0 -41
- data/ext/sources/bindings/javascript/emscripten.cpp +0 -93
- data/ext/sources/bindings/javascript/libwhisper.worker.js +0 -1
- data/ext/sources/bindings/javascript/package.json +0 -26
- data/ext/sources/bindings/javascript/whisper.js +0 -19
- data/ext/sources/examples/addon.node/CMakeLists.txt +0 -31
- data/ext/sources/examples/addon.node/__test__/whisper.spec.js +0 -133
- data/ext/sources/examples/addon.node/addon.cpp +0 -557
- data/ext/sources/examples/addon.node/index.js +0 -59
- data/ext/sources/examples/addon.node/package.json +0 -16
- data/ext/sources/examples/addon.node/vad-example.js +0 -132
- data/ext/sources/examples/bench.wasm/CMakeLists.txt +0 -49
- data/ext/sources/examples/bench.wasm/emscripten.cpp +0 -87
- data/ext/sources/examples/bench.wasm/index-tmpl.html +0 -285
- data/ext/sources/examples/coi-serviceworker.js +0 -146
- data/ext/sources/examples/command/CMakeLists.txt +0 -10
- data/ext/sources/examples/command/command.cpp +0 -802
- data/ext/sources/examples/command/commands.txt +0 -9
- data/ext/sources/examples/command.wasm/CMakeLists.txt +0 -50
- data/ext/sources/examples/command.wasm/emscripten.cpp +0 -327
- data/ext/sources/examples/command.wasm/index-tmpl.html +0 -415
- data/ext/sources/examples/generate-karaoke.sh +0 -57
- data/ext/sources/examples/helpers.js +0 -191
- data/ext/sources/examples/livestream.sh +0 -112
- data/ext/sources/examples/lsp/CMakeLists.txt +0 -10
- data/ext/sources/examples/lsp/lsp.cpp +0 -471
- data/ext/sources/examples/lsp/whisper.vim +0 -362
- data/ext/sources/examples/python/test_whisper_processor.py +0 -7
- data/ext/sources/examples/python/whisper_processor.py +0 -54
- data/ext/sources/examples/server/bench.js +0 -29
- data/ext/sources/examples/server.py +0 -120
- data/ext/sources/examples/stream/CMakeLists.txt +0 -10
- data/ext/sources/examples/stream/stream.cpp +0 -437
- data/ext/sources/examples/stream.wasm/CMakeLists.txt +0 -49
- data/ext/sources/examples/stream.wasm/emscripten.cpp +0 -216
- data/ext/sources/examples/stream.wasm/index-tmpl.html +0 -491
- data/ext/sources/examples/sycl/CMakeLists.txt +0 -9
- data/ext/sources/examples/sycl/build.sh +0 -22
- data/ext/sources/examples/sycl/ls-sycl-device.cpp +0 -11
- data/ext/sources/examples/sycl/run-whisper.sh +0 -17
- data/ext/sources/examples/talk-llama/CMakeLists.txt +0 -47
- data/ext/sources/examples/talk-llama/eleven-labs.py +0 -80
- data/ext/sources/examples/talk-llama/llama-adapter.cpp +0 -494
- data/ext/sources/examples/talk-llama/llama-adapter.h +0 -88
- data/ext/sources/examples/talk-llama/llama-arch.cpp +0 -2559
- data/ext/sources/examples/talk-llama/llama-arch.h +0 -586
- data/ext/sources/examples/talk-llama/llama-batch.cpp +0 -917
- data/ext/sources/examples/talk-llama/llama-batch.h +0 -173
- data/ext/sources/examples/talk-llama/llama-chat.cpp +0 -876
- data/ext/sources/examples/talk-llama/llama-chat.h +0 -70
- data/ext/sources/examples/talk-llama/llama-context.cpp +0 -3645
- data/ext/sources/examples/talk-llama/llama-context.h +0 -360
- data/ext/sources/examples/talk-llama/llama-cparams.cpp +0 -5
- data/ext/sources/examples/talk-llama/llama-cparams.h +0 -42
- data/ext/sources/examples/talk-llama/llama-grammar.cpp +0 -1464
- data/ext/sources/examples/talk-llama/llama-grammar.h +0 -194
- data/ext/sources/examples/talk-llama/llama-graph.cpp +0 -2282
- data/ext/sources/examples/talk-llama/llama-graph.h +0 -910
- data/ext/sources/examples/talk-llama/llama-hparams.cpp +0 -241
- data/ext/sources/examples/talk-llama/llama-hparams.h +0 -284
- data/ext/sources/examples/talk-llama/llama-impl.cpp +0 -171
- data/ext/sources/examples/talk-llama/llama-impl.h +0 -63
- data/ext/sources/examples/talk-llama/llama-io.cpp +0 -15
- data/ext/sources/examples/talk-llama/llama-io.h +0 -35
- data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.cpp +0 -328
- data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.h +0 -137
- data/ext/sources/examples/talk-llama/llama-kv-cache.cpp +0 -2100
- data/ext/sources/examples/talk-llama/llama-kv-cache.h +0 -390
- data/ext/sources/examples/talk-llama/llama-kv-cells.h +0 -533
- data/ext/sources/examples/talk-llama/llama-memory-hybrid.cpp +0 -268
- data/ext/sources/examples/talk-llama/llama-memory-hybrid.h +0 -139
- data/ext/sources/examples/talk-llama/llama-memory-recurrent.cpp +0 -1167
- data/ext/sources/examples/talk-llama/llama-memory-recurrent.h +0 -182
- data/ext/sources/examples/talk-llama/llama-memory.cpp +0 -59
- data/ext/sources/examples/talk-llama/llama-memory.h +0 -122
- data/ext/sources/examples/talk-llama/llama-mmap.cpp +0 -735
- data/ext/sources/examples/talk-llama/llama-mmap.h +0 -73
- data/ext/sources/examples/talk-llama/llama-model-loader.cpp +0 -1247
- data/ext/sources/examples/talk-llama/llama-model-loader.h +0 -176
- data/ext/sources/examples/talk-llama/llama-model-saver.cpp +0 -285
- data/ext/sources/examples/talk-llama/llama-model-saver.h +0 -37
- data/ext/sources/examples/talk-llama/llama-model.cpp +0 -8338
- data/ext/sources/examples/talk-llama/llama-model.h +0 -544
- data/ext/sources/examples/talk-llama/llama-quant.cpp +0 -1072
- data/ext/sources/examples/talk-llama/llama-quant.h +0 -1
- data/ext/sources/examples/talk-llama/llama-sampling.cpp +0 -3771
- data/ext/sources/examples/talk-llama/llama-sampling.h +0 -44
- data/ext/sources/examples/talk-llama/llama-vocab.cpp +0 -3900
- data/ext/sources/examples/talk-llama/llama-vocab.h +0 -182
- data/ext/sources/examples/talk-llama/llama.cpp +0 -1140
- data/ext/sources/examples/talk-llama/llama.h +0 -1540
- data/ext/sources/examples/talk-llama/models/afmoe.cpp +0 -191
- data/ext/sources/examples/talk-llama/models/apertus.cpp +0 -125
- data/ext/sources/examples/talk-llama/models/arcee.cpp +0 -135
- data/ext/sources/examples/talk-llama/models/arctic.cpp +0 -138
- data/ext/sources/examples/talk-llama/models/arwkv7.cpp +0 -86
- data/ext/sources/examples/talk-llama/models/baichuan.cpp +0 -122
- data/ext/sources/examples/talk-llama/models/bailingmoe.cpp +0 -144
- data/ext/sources/examples/talk-llama/models/bailingmoe2.cpp +0 -135
- data/ext/sources/examples/talk-llama/models/bert.cpp +0 -178
- data/ext/sources/examples/talk-llama/models/bitnet.cpp +0 -160
- data/ext/sources/examples/talk-llama/models/bloom.cpp +0 -101
- data/ext/sources/examples/talk-llama/models/chameleon.cpp +0 -178
- data/ext/sources/examples/talk-llama/models/chatglm.cpp +0 -132
- data/ext/sources/examples/talk-llama/models/codeshell.cpp +0 -111
- data/ext/sources/examples/talk-llama/models/cogvlm.cpp +0 -102
- data/ext/sources/examples/talk-llama/models/cohere2-iswa.cpp +0 -134
- data/ext/sources/examples/talk-llama/models/command-r.cpp +0 -122
- data/ext/sources/examples/talk-llama/models/dbrx.cpp +0 -123
- data/ext/sources/examples/talk-llama/models/deci.cpp +0 -135
- data/ext/sources/examples/talk-llama/models/deepseek.cpp +0 -144
- data/ext/sources/examples/talk-llama/models/deepseek2.cpp +0 -259
- data/ext/sources/examples/talk-llama/models/dots1.cpp +0 -134
- data/ext/sources/examples/talk-llama/models/dream.cpp +0 -105
- data/ext/sources/examples/talk-llama/models/ernie4-5-moe.cpp +0 -150
- data/ext/sources/examples/talk-llama/models/ernie4-5.cpp +0 -110
- data/ext/sources/examples/talk-llama/models/exaone.cpp +0 -114
- data/ext/sources/examples/talk-llama/models/exaone4.cpp +0 -123
- data/ext/sources/examples/talk-llama/models/falcon-h1.cpp +0 -113
- data/ext/sources/examples/talk-llama/models/falcon.cpp +0 -120
- data/ext/sources/examples/talk-llama/models/gemma-embedding.cpp +0 -116
- data/ext/sources/examples/talk-llama/models/gemma.cpp +0 -112
- data/ext/sources/examples/talk-llama/models/gemma2-iswa.cpp +0 -128
- data/ext/sources/examples/talk-llama/models/gemma3.cpp +0 -155
- data/ext/sources/examples/talk-llama/models/gemma3n-iswa.cpp +0 -384
- data/ext/sources/examples/talk-llama/models/glm4-moe.cpp +0 -170
- data/ext/sources/examples/talk-llama/models/glm4.cpp +0 -150
- data/ext/sources/examples/talk-llama/models/gpt2.cpp +0 -105
- data/ext/sources/examples/talk-llama/models/gptneox.cpp +0 -144
- data/ext/sources/examples/talk-llama/models/granite-hybrid.cpp +0 -196
- data/ext/sources/examples/talk-llama/models/granite.cpp +0 -211
- data/ext/sources/examples/talk-llama/models/graph-context-mamba.cpp +0 -283
- data/ext/sources/examples/talk-llama/models/grok.cpp +0 -159
- data/ext/sources/examples/talk-llama/models/grovemoe.cpp +0 -141
- data/ext/sources/examples/talk-llama/models/hunyuan-dense.cpp +0 -132
- data/ext/sources/examples/talk-llama/models/hunyuan-moe.cpp +0 -154
- data/ext/sources/examples/talk-llama/models/internlm2.cpp +0 -120
- data/ext/sources/examples/talk-llama/models/jais.cpp +0 -86
- data/ext/sources/examples/talk-llama/models/jamba.cpp +0 -106
- data/ext/sources/examples/talk-llama/models/lfm2.cpp +0 -175
- data/ext/sources/examples/talk-llama/models/llada-moe.cpp +0 -122
- data/ext/sources/examples/talk-llama/models/llada.cpp +0 -99
- data/ext/sources/examples/talk-llama/models/llama-iswa.cpp +0 -178
- data/ext/sources/examples/talk-llama/models/llama.cpp +0 -168
- data/ext/sources/examples/talk-llama/models/maincoder.cpp +0 -117
- data/ext/sources/examples/talk-llama/models/mamba.cpp +0 -55
- data/ext/sources/examples/talk-llama/models/mimo2-iswa.cpp +0 -123
- data/ext/sources/examples/talk-llama/models/minicpm3.cpp +0 -199
- data/ext/sources/examples/talk-llama/models/minimax-m2.cpp +0 -124
- data/ext/sources/examples/talk-llama/models/mistral3.cpp +0 -160
- data/ext/sources/examples/talk-llama/models/models.h +0 -569
- data/ext/sources/examples/talk-llama/models/modern-bert.cpp +0 -116
- data/ext/sources/examples/talk-llama/models/mpt.cpp +0 -126
- data/ext/sources/examples/talk-llama/models/nemotron-h.cpp +0 -150
- data/ext/sources/examples/talk-llama/models/nemotron.cpp +0 -122
- data/ext/sources/examples/talk-llama/models/neo-bert.cpp +0 -104
- data/ext/sources/examples/talk-llama/models/olmo.cpp +0 -121
- data/ext/sources/examples/talk-llama/models/olmo2.cpp +0 -150
- data/ext/sources/examples/talk-llama/models/olmoe.cpp +0 -124
- data/ext/sources/examples/talk-llama/models/openai-moe-iswa.cpp +0 -127
- data/ext/sources/examples/talk-llama/models/openelm.cpp +0 -124
- data/ext/sources/examples/talk-llama/models/orion.cpp +0 -123
- data/ext/sources/examples/talk-llama/models/pangu-embedded.cpp +0 -121
- data/ext/sources/examples/talk-llama/models/phi2.cpp +0 -121
- data/ext/sources/examples/talk-llama/models/phi3.cpp +0 -152
- data/ext/sources/examples/talk-llama/models/plamo.cpp +0 -110
- data/ext/sources/examples/talk-llama/models/plamo2.cpp +0 -316
- data/ext/sources/examples/talk-llama/models/plamo3.cpp +0 -128
- data/ext/sources/examples/talk-llama/models/plm.cpp +0 -168
- data/ext/sources/examples/talk-llama/models/qwen.cpp +0 -108
- data/ext/sources/examples/talk-llama/models/qwen2.cpp +0 -126
- data/ext/sources/examples/talk-llama/models/qwen2moe.cpp +0 -151
- data/ext/sources/examples/talk-llama/models/qwen2vl.cpp +0 -117
- data/ext/sources/examples/talk-llama/models/qwen3.cpp +0 -117
- data/ext/sources/examples/talk-llama/models/qwen3moe.cpp +0 -124
- data/ext/sources/examples/talk-llama/models/qwen3next.cpp +0 -873
- data/ext/sources/examples/talk-llama/models/qwen3vl-moe.cpp +0 -149
- data/ext/sources/examples/talk-llama/models/qwen3vl.cpp +0 -141
- data/ext/sources/examples/talk-llama/models/refact.cpp +0 -94
- data/ext/sources/examples/talk-llama/models/rnd1.cpp +0 -126
- data/ext/sources/examples/talk-llama/models/rwkv6-base.cpp +0 -162
- data/ext/sources/examples/talk-llama/models/rwkv6.cpp +0 -94
- data/ext/sources/examples/talk-llama/models/rwkv6qwen2.cpp +0 -86
- data/ext/sources/examples/talk-llama/models/rwkv7-base.cpp +0 -135
- data/ext/sources/examples/talk-llama/models/rwkv7.cpp +0 -90
- data/ext/sources/examples/talk-llama/models/seed-oss.cpp +0 -124
- data/ext/sources/examples/talk-llama/models/smallthinker.cpp +0 -126
- data/ext/sources/examples/talk-llama/models/smollm3.cpp +0 -128
- data/ext/sources/examples/talk-llama/models/stablelm.cpp +0 -146
- data/ext/sources/examples/talk-llama/models/starcoder.cpp +0 -100
- data/ext/sources/examples/talk-llama/models/starcoder2.cpp +0 -121
- data/ext/sources/examples/talk-llama/models/t5-dec.cpp +0 -166
- data/ext/sources/examples/talk-llama/models/t5-enc.cpp +0 -96
- data/ext/sources/examples/talk-llama/models/wavtokenizer-dec.cpp +0 -149
- data/ext/sources/examples/talk-llama/models/xverse.cpp +0 -108
- data/ext/sources/examples/talk-llama/prompts/talk-alpaca.txt +0 -23
- data/ext/sources/examples/talk-llama/speak +0 -40
- data/ext/sources/examples/talk-llama/speak.bat +0 -1
- data/ext/sources/examples/talk-llama/speak.ps1 +0 -14
- data/ext/sources/examples/talk-llama/talk-llama.cpp +0 -813
- data/ext/sources/examples/talk-llama/unicode-data.cpp +0 -7034
- data/ext/sources/examples/talk-llama/unicode-data.h +0 -20
- data/ext/sources/examples/talk-llama/unicode.cpp +0 -1147
- data/ext/sources/examples/talk-llama/unicode.h +0 -111
- data/ext/sources/examples/wchess/CMakeLists.txt +0 -10
- data/ext/sources/examples/wchess/libwchess/CMakeLists.txt +0 -19
- data/ext/sources/examples/wchess/libwchess/Chessboard.cpp +0 -803
- data/ext/sources/examples/wchess/libwchess/Chessboard.h +0 -33
- data/ext/sources/examples/wchess/libwchess/WChess.cpp +0 -193
- data/ext/sources/examples/wchess/libwchess/WChess.h +0 -63
- data/ext/sources/examples/wchess/libwchess/test-chessboard.cpp +0 -117
- data/ext/sources/examples/wchess/wchess.cmd/CMakeLists.txt +0 -8
- data/ext/sources/examples/wchess/wchess.cmd/wchess.cmd.cpp +0 -253
- data/ext/sources/examples/whisper.wasm/CMakeLists.txt +0 -50
- data/ext/sources/examples/whisper.wasm/emscripten.cpp +0 -118
- data/ext/sources/examples/whisper.wasm/index-tmpl.html +0 -659
- data/ext/sources/ggml/cmake/BuildTypes.cmake +0 -54
- data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm-ppc.h +0 -333
- data/ext/sources/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +0 -99
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp-dma.h +0 -157
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp-msg.h +0 -165
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-exp.c +0 -94
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-inverse.c +0 -72
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sigmoid.c +0 -49
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-utils.c +0 -1020
- data/ext/sources/ggml/src/ggml-hexagon/htp/ops-utils.h +0 -149
- data/ext/sources/ggml/src/ggml-hexagon/htp-utils.c +0 -454
- data/ext/sources/ggml/src/ggml-hexagon/htp-utils.h +0 -221
- data/ext/sources/ggml/src/ggml-hexagon/op-desc.h +0 -153
- data/ext/sources/ggml/src/ggml-opencl/kernels/embed_kernel.py +0 -26
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rte.glsl +0 -5
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/bin_op.tmpl.wgsl +0 -188
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary_head.tmpl +0 -45
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +0 -147
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/glu.tmpl.wgsl +0 -323
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl +0 -907
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.tmpl.wgsl +0 -247
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.tmpl.wgsl +0 -267
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm.wgsl +0 -123
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.tmpl.wgsl +0 -112
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/unary_op.wgsl +0 -483
- data/ext/sources/tests/CMakeLists.txt +0 -112
- data/ext/sources/tests/earnings21/eval.mk +0 -58
- data/ext/sources/tests/earnings21/eval.py +0 -68
- data/ext/sources/tests/earnings21/normalizers/__init__.py +0 -2
- data/ext/sources/tests/earnings21/normalizers/basic.py +0 -80
- data/ext/sources/tests/earnings21/normalizers/english.json +0 -1741
- data/ext/sources/tests/earnings21/normalizers/english.py +0 -550
- data/ext/sources/tests/earnings21/requirements.txt +0 -6
- data/ext/sources/tests/en-0-ref.txt +0 -1
- data/ext/sources/tests/en-1-ref.txt +0 -1
- data/ext/sources/tests/en-2-ref.txt +0 -1
- data/ext/sources/tests/es-0-ref.txt +0 -1
- data/ext/sources/tests/librispeech/eval.mk +0 -39
- data/ext/sources/tests/librispeech/eval.py +0 -47
- data/ext/sources/tests/librispeech/normalizers/__init__.py +0 -2
- data/ext/sources/tests/librispeech/normalizers/basic.py +0 -80
- data/ext/sources/tests/librispeech/normalizers/english.json +0 -1741
- data/ext/sources/tests/librispeech/normalizers/english.py +0 -550
- data/ext/sources/tests/librispeech/requirements.txt +0 -6
- data/ext/sources/tests/run-tests.sh +0 -130
- data/ext/sources/tests/test-c.c +0 -3
- data/ext/sources/tests/test-vad-full.cpp +0 -56
- data/ext/sources/tests/test-vad.cpp +0 -83
- data/ext/sources/tests/test-whisper.js +0 -58
- data/lib/whisper/context.rb +0 -15
- data/lib/whisper/segment.rb +0 -58
|
@@ -1,4 +1,12 @@
|
|
|
1
1
|
|
|
2
|
+
// Each format defines a scalar dequantFunc<T> plus a V=4 dequantFunc<T>_v
|
|
3
|
+
// passed as the optional vector decoder to coopMatLoadTensorNV via
|
|
4
|
+
// GL_NV_cooperative_matrix_decode_vector. When the driver doesn't support
|
|
5
|
+
// the extension, ggml-vulkan.cpp strips it from the compiled SPIR-V.
|
|
6
|
+
#ifdef GL_NV_cooperative_matrix_decode_vector
|
|
7
|
+
#extension GL_NV_cooperative_matrix_decode_vector : enable
|
|
8
|
+
#endif
|
|
9
|
+
|
|
2
10
|
#include "types.glsl"
|
|
3
11
|
|
|
4
12
|
layout(buffer_reference, std430, buffer_reference_align = 16) buffer decodeBufF32 {
|
|
@@ -13,6 +21,31 @@ float16_t dequantFuncF32(const in decodeBufF32 bl, const in uint blockCoords[2],
|
|
|
13
21
|
return vf16[idx];
|
|
14
22
|
}
|
|
15
23
|
|
|
24
|
+
// Buffer-reference view over a single block_q1_0, exposed as `block`.
// References of this type are declared with 2-byte alignment.
layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufQ1_0
{
    block_q1_0 block;
};
|
|
27
|
+
|
|
28
|
+
// Scalar Q1_0 decoder: returns +d when the element's bit is set, -d otherwise.
// coordInBlock[1] is the element index inside the block; blockCoords is unused.
float16_t dequantFuncQ1_0(const in decodeBufQ1_0 bl, const in uint blockCoords[2], const in uint coordInBlock[2])
{
    const uint elem = coordInBlock[1];
    const float16_t scale = bl.block.d;

    // Bits 3..6 of the element index pick the qs entry, bits 0..2 pick the bit in it.
    const uint entry  = (elem & 0x78) >> 3;
    const uint bitPos = elem & 0x7;
    const uint packed = uint(bl.block.qs[entry]);

    if (((packed >> bitPos) & 1u) != 0u) {
        return scale;
    }
    return -scale;
}
|
|
35
|
+
|
|
36
|
+
// Four-wide Q1_0 decoder: decodes the elements starting at coordInBlock[1].
// Each of the low four bits of the selected nibble yields +d (set) or -d (clear).
// NOTE(review): presumably only called with an element index that is a multiple of 4 - confirm.
f16vec4 dequantFuncQ1_0_v(const in decodeBufQ1_0 bl, const in uint blockCoords[2], const in uint coordInBlock[2])
{
    const uint elem = coordInBlock[1];
    const float16_t pos = bl.block.d;
    const float16_t neg = -pos;

    // (elem & 0x4u) is 0 or 4, i.e. it selects the low or the high nibble of the entry.
    const uint nibble = uint(bl.block.qs[elem >> 3]) >> (elem & 0x4u);

    const float16_t e0 = (nibble & 1u) != 0u ? pos : neg;
    const float16_t e1 = (nibble & 2u) != 0u ? pos : neg;
    const float16_t e2 = (nibble & 4u) != 0u ? pos : neg;
    const float16_t e3 = (nibble & 8u) != 0u ? pos : neg;
    return f16vec4(e0, e1, e2, e3);
}
|
|
48
|
+
|
|
16
49
|
// Buffer-reference view over a single block_q4_0_packed16, exposed as `block`.
// References of this type are declared with 2-byte alignment.
layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufQ4_0
{
    block_q4_0_packed16 block;
};
|
|
@@ -30,10 +63,28 @@ float16_t dequantFuncQ4_0(const in decodeBufQ4_0 bl, const in uint blockCoords[2
|
|
|
30
63
|
return ret;
|
|
31
64
|
}
|
|
32
65
|
|
|
66
|
+
// Four-wide Q4_0 decoder: returns (q - 8) * d for four nibbles selected by coordInBlock[1].
// NOTE(review): presumably only called with an element index that is a multiple of 4 - confirm.
f16vec4 dequantFuncQ4_0_v(const in decodeBufQ4_0 bl, const in uint blockCoords[2], const in uint coordInBlock[2])
{
    const uint elem = coordInBlock[1];

    // Bit 4 of the index chooses the low (shift 0) or high (shift 4) nibble of every byte.
    const uint nibbleShift = (elem & 0x10) >> 2;
    // Index of the first of two adjacent qs entries; even ({0,2,4,6}) for aligned indices.
    const uint first = (elem & 0xE) >> 1;

    // Merge the two entries into one 32-bit word, lower entry in bits 0..15.
    const uint lowHalf  = uint32_t(bl.block.qs[first]);
    const uint highHalf = uint32_t(bl.block.qs[first + 1u]);
    const uint word     = lowHalf | (highHalf << 16);

    // The per-byte 0x0F mask keeps exactly the wanted nibble of each byte.
    const u8vec4 quants = unpack8((word >> nibbleShift) & 0x0F0F0F0Fu);

    const float scale = float(bl.block.d);
    return f16vec4((vec4(quants) - vec4(8.0)) * vec4(scale));
}
|
|
79
|
+
|
|
33
80
|
// Buffer-reference view over a single block_q4_1, exposed as `block`.
// References of this type are declared with 4-byte alignment.
layout(buffer_reference, std430, buffer_reference_align = 4) buffer decodeBufQ4_1
{
    block_q4_1 block;
};
|
|
36
83
|
|
|
84
|
+
// Buffer-reference view over a single block_q4_1_packed32, exposed as `block`.
// Same 4-byte reference alignment as decodeBufQ4_1.
layout(buffer_reference, std430, buffer_reference_align = 4) buffer decodeBufQ4_1_packed32
{
    block_q4_1_packed32 block;
};
|
|
87
|
+
|
|
37
88
|
float16_t dequantFuncQ4_1(const in decodeBufQ4_1 bl, const in uint blockCoords[2], const in uint coordInBlock[2])
|
|
38
89
|
{
|
|
39
90
|
const float16_t d = bl.block.d;
|
|
@@ -48,10 +99,27 @@ float16_t dequantFuncQ4_1(const in decodeBufQ4_1 bl, const in uint blockCoords[2
|
|
|
48
99
|
return ret;
|
|
49
100
|
}
|
|
50
101
|
|
|
102
|
+
// Four-wide Q4_1 decoder: returns q * d + m for four nibbles selected by coordInBlock[1].
// The quants are read through the packed32 view of the same block reference.
// NOTE(review): presumably only called with an element index that is a multiple of 4 - confirm.
f16vec4 dequantFuncQ4_1_v(const in decodeBufQ4_1 bl, const in uint blockCoords[2], const in uint coordInBlock[2])
{
    decodeBufQ4_1_packed32 view32 = decodeBufQ4_1_packed32(bl);
    const uint elem = coordInBlock[1];

    // Bit 4 of the index chooses the low (shift 0) or high (shift 4) nibble of every byte.
    const uint nibbleShift = (elem & 0x10) >> 2;
    // Bits 2..3 of the index pick one of four 32-bit qs words.
    const uint wordIdx = (elem & 0xC) >> 2;

    const uint word = uint32_t(view32.block.qs[wordIdx]);
    const u8vec4 quants = unpack8((word >> nibbleShift) & 0x0F0F0F0Fu);

    const float scale  = float(bl.block.d);
    const float offset = float(bl.block.m);
    return f16vec4(vec4(quants) * vec4(scale) + vec4(offset));
}
|
|
114
|
+
|
|
51
115
|
// Buffer-reference view over a single block_q5_0, exposed as `block`.
// References of this type are declared with 2-byte alignment.
layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufQ5_0
{
    block_q5_0 block;
};
|
|
54
118
|
|
|
119
|
+
// Buffer-reference view over a single block_q5_0_packed16, exposed as `block`.
// Same 2-byte reference alignment as decodeBufQ5_0.
layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufQ5_0_packed16
{
    block_q5_0_packed16 block;
};
|
|
122
|
+
|
|
55
123
|
float16_t dequantFuncQ5_0(const in decodeBufQ5_0 bl, const in uint blockCoords[2], const in uint coordInBlock[2])
|
|
56
124
|
{
|
|
57
125
|
const float16_t d = bl.block.d;
|
|
@@ -70,10 +138,32 @@ float16_t dequantFuncQ5_0(const in decodeBufQ5_0 bl, const in uint blockCoords[2
|
|
|
70
138
|
return ret;
|
|
71
139
|
}
|
|
72
140
|
|
|
141
|
+
// Four-wide Q5_0 decoder: returns (low nibble + 16 * high bit - 16) * d for four
// elements selected by coordInBlock[1]. Data is read through the packed16 view.
// NOTE(review): presumably only called with an element index that is a multiple of 4 - confirm.
f16vec4 dequantFuncQ5_0_v(const in decodeBufQ5_0 bl, const in uint blockCoords[2], const in uint coordInBlock[2])
{
    decodeBufQ5_0_packed16 view16 = decodeBufQ5_0_packed16(bl);
    const uint elem = coordInBlock[1];

    // Bit 4 of the index chooses the low (shift 0) or high (shift 4) nibble of every byte.
    const uint nibbleShift = (elem & 0x10) >> 2;
    // Index of the first of two adjacent packed16 qs entries, in {0,2,4,6}.
    const uint first = (elem & 0xC) >> 1;

    const uint lowHalf  = uint32_t(view16.block.qs[first]);
    const uint highHalf = uint32_t(view16.block.qs[first + 1u]);
    const uint word     = lowHalf | (highHalf << 16);
    const u8vec4 lowBits = unpack8((word >> nibbleShift) & 0x0F0F0F0Fu);

    // Rebuild the 32-bit high-bit mask from its two halves, then shift so that
    // bits 0..3 hold the high bits of elements elem..elem+3.
    const uint highMask = (uint(view16.block.qh[1]) << 16) | uint(view16.block.qh[0]);
    const uint fromElem = highMask >> elem;
    const uvec4 fifthBit =
        (uvec4(fromElem, fromElem >> 1u, fromElem >> 2u, fromElem >> 3u) & uvec4(0x01u)) << 4u;

    const float scale = float(bl.block.d);
    return f16vec4((vec4(lowBits) + vec4(fifthBit) - vec4(16.0)) * vec4(scale));
}
|
|
158
|
+
|
|
73
159
|
// Buffer-reference view over a single block_q5_1, exposed as `block`.
// References of this type are declared with 8-byte alignment.
layout(buffer_reference, std430, buffer_reference_align = 8) buffer decodeBufQ5_1
{
    block_q5_1 block;
};
|
|
76
162
|
|
|
163
|
+
// Buffer-reference view over a single block_q5_1_packed32, exposed as `block`.
// Same 8-byte reference alignment as decodeBufQ5_1.
layout(buffer_reference, std430, buffer_reference_align = 8) buffer decodeBufQ5_1_packed32
{
    block_q5_1_packed32 block;
};
|
|
166
|
+
|
|
77
167
|
float16_t dequantFuncQ5_1(const in decodeBufQ5_1 bl, const in uint blockCoords[2], const in uint coordInBlock[2])
|
|
78
168
|
{
|
|
79
169
|
const float16_t d = bl.block.d;
|
|
@@ -93,6 +183,23 @@ float16_t dequantFuncQ5_1(const in decodeBufQ5_1 bl, const in uint blockCoords[2
|
|
|
93
183
|
return ret;
|
|
94
184
|
}
|
|
95
185
|
|
|
186
|
+
// Four-wide Q5_1 decoder: returns (low nibble + 16 * high bit) * d + m for four
// elements selected by coordInBlock[1]. Quants are read through the packed32 view.
// NOTE(review): presumably only called with an element index that is a multiple of 4 - confirm.
f16vec4 dequantFuncQ5_1_v(const in decodeBufQ5_1 bl, const in uint blockCoords[2], const in uint coordInBlock[2])
{
    decodeBufQ5_1_packed32 view32 = decodeBufQ5_1_packed32(bl);
    const uint elem = coordInBlock[1];

    // Bit 4 of the index chooses the low (shift 0) or high (shift 4) nibble of every byte.
    const uint nibbleShift = (elem & 0x10) >> 2;
    // Bits 2..3 of the index pick one of four 32-bit qs words.
    const uint wordIdx = (elem & 0xC) >> 2;

    const uint word = uint32_t(view32.block.qs[wordIdx]);
    const u8vec4 lowBits = unpack8((word >> nibbleShift) & 0x0F0F0F0Fu);

    // After the shift, bits 0..3 hold the high bits of elements elem..elem+3.
    const uint fromElem = bl.block.qh >> elem;
    const uvec4 fifthBit =
        (uvec4(fromElem, fromElem >> 1u, fromElem >> 2u, fromElem >> 3u) & uvec4(0x01u)) << 4u;

    const float scale  = float(bl.block.d);
    const float offset = float(bl.block.m);
    return f16vec4((vec4(lowBits) + vec4(fifthBit)) * vec4(scale) + vec4(offset));
}
|
|
202
|
+
|
|
96
203
|
// Buffer-reference view over a single block_q8_0_packed16, exposed as `block`.
// References of this type are declared with 2-byte alignment.
layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufQ8_0
{
    block_q8_0_packed16 block;
};
|
|
@@ -109,6 +216,17 @@ float16_t dequantFuncQ8_0(const in decodeBufQ8_0 bl, const in uint blockCoords[2
|
|
|
109
216
|
return ret;
|
|
110
217
|
}
|
|
111
218
|
|
|
219
|
+
// Four-wide Q8_0 decoder: returns q * d for four signed 8-bit quants starting at
// the packed16 entry coordInBlock[1] / 2.
// NOTE(review): presumably only called with an element index that is a multiple of 4 - confirm.
f16vec4 dequantFuncQ8_0_v(const in decodeBufQ8_0 bl, const in uint blockCoords[2], const in uint coordInBlock[2])
{
    const uint elem  = coordInBlock[1];
    const uint first = elem >> 1u;

    // Go through uint16_t before widening so each entry contributes exactly 16 bits.
    const uint lowHalf  = uint(uint16_t(bl.block.qs[first]));
    const uint highHalf = uint(uint16_t(bl.block.qs[first + 1u]));
    const uint word     = lowHalf | (highHalf << 16u);

    // Reinterpret the word as four signed bytes.
    const i8vec4 quants = unpack8(int32_t(word));

    const float scale = float(bl.block.d);
    return f16vec4(vec4(quants) * vec4(scale));
}
|
|
229
|
+
|
|
112
230
|
// Buffer-reference view over a single block_q2_K, exposed as `block`.
// References of this type are declared with 4-byte alignment.
layout(buffer_reference, std430, buffer_reference_align = 4) buffer decodeBufQ2_K
{
    block_q2_K block;
};
|
|
@@ -117,6 +235,10 @@ layout(buffer_reference, std430, buffer_reference_align = 16) buffer decodeBufQ2
|
|
|
117
235
|
block_q2_K_packed16 block;
|
|
118
236
|
};
|
|
119
237
|
|
|
238
|
+
// Buffer-reference view over a single block_q2_K_packed32, exposed as `block`.
// Same 4-byte reference alignment as decodeBufQ2_K.
layout(buffer_reference, std430, buffer_reference_align = 4) buffer decodeBufQ2_K_packed32
{
    block_q2_K_packed32 block;
};
|
|
241
|
+
|
|
120
242
|
float16_t dequantFuncQ2_K(const in decodeBufQ2_K bl, const in uint blockCoords[2], const in uint coordInBlock[2])
|
|
121
243
|
{
|
|
122
244
|
decodeBufQ2_K_packed16 bl16 = decodeBufQ2_K_packed16(bl);
|
|
@@ -135,10 +257,36 @@ float16_t dequantFuncQ2_K(const in decodeBufQ2_K bl, const in uint blockCoords[2
|
|
|
135
257
|
return ret;
|
|
136
258
|
}
|
|
137
259
|
|
|
260
|
+
// Four-wide Q2_K decoder: returns q * (dm.x * scale) - (dm.y * min) for four 2-bit
// quants selected by coordInBlock[1]. Quants are read through the packed32 view.
// The word-index derivation below relies on the element index being a multiple of 4.
f16vec4 dequantFuncQ2_K_v(const in decodeBufQ2_K bl, const in uint blockCoords[2], const in uint coordInBlock[2])
{
    decodeBufQ2_K_packed32 view32 = decodeBufQ2_K_packed32(bl);
    const uint elem = coordInBlock[1];
    const f16vec2 dmin = bl.block.dm;

    const uint subBlock = elem >> 4;            // 0..15, one scale entry per 16 elements
    const uint bitShift = (elem & 0x60) >> 4;   // 0,2,4,6

    // The packed16 index ((elem & 0x80) >> 3) + ((elem & 0x1E) >> 1) is even when
    // elem % 4 == 0, so the packed32 index is simply half of it.
    const uint wordIdx = ((elem & 0x80) >> 4) + ((elem & 0x1Cu) >> 2);
    const uint word    = uint32_t(view32.block.qs[wordIdx]);
    const u8vec4 quants = unpack8((word >> bitShift) & 0x03030303u);

    // Low nibble of the scale entry scales d, high nibble scales the minimum.
    const uint packedScale = bl.block.scales[subBlock];
    const float16_t subScale = dmin.x * float16_t(packedScale & 0xF);
    const float16_t subMin   = dmin.y * float16_t(packedScale >> 4);

    return f16vec4(vec4(quants) * vec4(float(subScale)) - vec4(float(subMin)));
}
|
|
281
|
+
|
|
138
282
|
// Buffer-reference view over a single block_q3_K, exposed as `block`.
// References of this type are declared with 2-byte alignment.
layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufQ3_K
{
    block_q3_K block;
};
|
|
141
285
|
|
|
286
|
+
// Buffer-reference view over a single block_q3_K_packed16, exposed as `block`.
// Same 2-byte reference alignment as decodeBufQ3_K.
layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufQ3_K_packed16
{
    block_q3_K_packed16 block;
};
|
|
289
|
+
|
|
142
290
|
float16_t dequantFuncQ3_K(const in decodeBufQ3_K bl, const in uint blockCoords[2], const in uint coordInBlock[2])
|
|
143
291
|
{
|
|
144
292
|
const uint idx = coordInBlock[1];
|
|
@@ -167,6 +315,47 @@ float16_t dequantFuncQ3_K(const in decodeBufQ3_K bl, const in uint blockCoords[2
|
|
|
167
315
|
return ret;
|
|
168
316
|
}
|
|
169
317
|
|
|
318
|
+
// Four-wide Q3_K decoder: returns (q2 | hbit << 2) - 4, times d * (scale6 - 32), for
// four elements selected by coordInBlock[1]. qs and hmask are read through the
// packed16 view. The index math relies on the element index being a multiple of 4.
f16vec4 dequantFuncQ3_K_v(const in decodeBufQ3_K bl, const in uint blockCoords[2], const in uint coordInBlock[2])
{
    decodeBufQ3_K_packed16 view16 = decodeBufQ3_K_packed16(bl);
    const uint elem = coordInBlock[1];

    const uint half128  = elem >> 7;                   // 0,1
    const uint subBlock = elem >> 4;                   // 0..15
    const uint quarter  = (elem & 0x60) >> 5;          // 0,1,2,3
    const uint lowShift = quarter << 1;                // 0,2,4,6
    const uint maskBit  = (half128 << 2) + quarter;    // 0..7, bit position in an hmask byte

    // The 6-bit scale is split: 4 low bits from one scales entry, 2 high bits from another.
    uint32_t nibbleIdx   = subBlock;
    uint32_t nibbleShift = 0u;
    if (subBlock >= 8) {
        nibbleIdx   = subBlock - 8;
        nibbleShift = 4u;
    }
    const uint32_t pairIdx   = subBlock + 8 - (subBlock / 4) * 4;
    const uint32_t pairShift = (subBlock / 4) * 2;

    const int8_t rawScale = int8_t(
        ((bl.block.scales[nibbleIdx] >> nibbleShift) & 0xF) |
        (((bl.block.scales[pairIdx] >> pairShift) & 3) << 4));
    const float16_t subScale = bl.block.d * float16_t(int(rawScale) - 32);

    // With elem % 4 == 0, (elem & 0x1F) equals (elem & 0x1C), a multiple of 4.
    const uint qsByte = (half128 << 5) + (elem & 0x1Cu);
    const uint hmByte = (elem & 0x1Cu);

    // Two adjacent packed16 entries are combined per array, so that byte j of each
    // word carries the data for element elem + j.
    const uint qsWord = uint32_t(view16.block.qs[qsByte >> 1])
                      | (uint32_t(view16.block.qs[(qsByte >> 1) + 1u]) << 16);
    const uint hmWord = uint32_t(view16.block.hmask[hmByte >> 1])
                      | (uint32_t(view16.block.hmask[(hmByte >> 1) + 1u]) << 16);

    // Shifts stay below 8, so the per-byte masks pick the wanted bits without
    // pulling in bits from a neighbouring byte.
    const uint low2 = (qsWord >> lowShift) & 0x03030303u;
    const uint high1 = (hmWord >> maskBit) & 0x01010101u;

    const ivec4 quants = ivec4(unpack8(low2 | (high1 << 2))) - ivec4(4);
    return f16vec4(vec4(quants) * vec4(float(subScale)));
}
|
|
358
|
+
|
|
170
359
|
// Buffer-reference view over a single block_q4_K, exposed as `block`.
// References of this type are declared with 16-byte alignment.
layout(buffer_reference, std430, buffer_reference_align = 16) buffer decodeBufQ4_K
{
    block_q4_K block;
};
|
|
@@ -175,6 +364,10 @@ layout(buffer_reference, std430, buffer_reference_align = 16) buffer decodeBufQ4
|
|
|
175
364
|
block_q4_K_packed16 block;
|
|
176
365
|
};
|
|
177
366
|
|
|
367
|
+
// Buffer-reference view over a single block_q4_K_packed32, exposed as `block`.
// Same 16-byte reference alignment as decodeBufQ4_K.
layout(buffer_reference, std430, buffer_reference_align = 16) buffer decodeBufQ4_K_packed32
{
    block_q4_K_packed32 block;
};
|
|
370
|
+
|
|
178
371
|
// Buffer-reference view over a single block_q4_K_packed128, exposed as `block`.
// Same 16-byte reference alignment as decodeBufQ4_K.
layout(buffer_reference, std430, buffer_reference_align = 16) buffer decodeBufQ4_K_packed128
{
    block_q4_K_packed128 block;
};
|
|
@@ -322,6 +515,55 @@ float16_t dequantFuncQ4_K(const in decodeBufQ4_K bl, const in uint blockCoords[2
|
|
|
322
515
|
return float16_t(ret);
|
|
323
516
|
}
|
|
324
517
|
|
|
518
|
+
// Four-wide Q4_K decoder: returns d * q - m for four nibbles selected by
// coordInBlock[1], where d and m are the per-sub-block scale and minimum.
// With IS_MUL_MM2 and DATA_A_Q4_K both defined, (d, m) is read from shAscales;
// otherwise it is unpacked from the first 128-bit word of the block.
// The element index lies in [0,256) and is a multiple of 4 for vector decode.
f16vec4 dequantFuncQ4_K_v(const in decodeBufQ4_K bl, const in uint blockCoords[2], const in uint coordInBlock[2])
{
    decodeBufQ4_K_packed32 view32 = decodeBufQ4_K_packed32(bl);
    decodeBufQ4_K_packed128 view128 = decodeBufQ4_K_packed128(bl);
    const uint elem = coordInBlock[1];

    const uint sub = elem >> 5; // 0..7

#if defined(IS_MUL_MM2) && defined(DATA_A_Q4_K)
    vec2 pair = shAscales[sub * shAscales_stride + (blockCoords[0] % BM)];
    float scale = pair.x;
    float minv = pair.y;
#else
    const uvec4 head = view128.block.q4k[0];
    // head.x carries the two half-float factors; head.y/z/w carry the packed 6-bit scales/mins.
    const vec2 dmin = vec2(unpackFloat2x16(head.x));

    const uint32_t scale0 = head.y;
    const uint32_t scale4 = head.z;
    const uint32_t scale8 = head.w;

    uint32_t scBits;
    uint32_t minBits;
    if (sub < 4) {
        // Sub-blocks 0..3: the values sit directly in the low 6 bits of each byte.
        scBits  = scale0;
        minBits = scale4;
    } else {
        // Sub-blocks 4..7: 4 low bits from scale8, 2 high bits from the top of scale0/scale4.
        scBits  = (scale8 & 0x0F0F0F0F) | ((scale0 & 0xC0C0C0C0) >> 2);
        minBits = ((scale8 & 0xF0F0F0F0) >> 4) | ((scale4 & 0xC0C0C0C0) >> 2);
    }

    // Pick the byte for this sub-block and keep its 6-bit value.
    const uint byteShift = 8 * (sub & 3);
    scBits  = (scBits >> byteShift) & 0x3F;
    minBits = (minBits >> byteShift) & 0x3F;

    const float scale = dmin.x * float(scBits);
    const float minv = dmin.y * float(minBits);
#endif

    // packed32 word index: (qs_i >> 1) == (elem >> 6) * 8 + ((elem & 0x1E) >> 2).
    // nibbleShift is 0 or 4 only, so one shift plus the 0x0F0F0F0F mask isolates all
    // four nibbles without inter-byte leakage.
    const uint nibbleShift = (elem & 0x20u) >> 3u;
    const uint wordIdx = (elem >> 6) * 8u + ((elem & 0x1Eu) >> 2);
    const uint word = uint32_t(view32.block.qs[wordIdx]);
    const u8vec4 quants = unpack8((word >> nibbleShift) & 0x0F0F0F0Fu);

    return f16vec4(vec4(scale) * vec4(quants) - vec4(minv));
}
|
|
566
|
+
|
|
325
567
|
// Buffer-reference view over a single block_q5_K, exposed as `block`.
// References of this type are declared with 16-byte alignment.
layout(buffer_reference, std430, buffer_reference_align = 16) buffer decodeBufQ5_K
{
    block_q5_K block;
};
|
|
@@ -334,6 +576,10 @@ layout(buffer_reference, std430, buffer_reference_align = 16) buffer decodeBufQ5
|
|
|
334
576
|
block_q5_K_packed128 block;
|
|
335
577
|
};
|
|
336
578
|
|
|
579
|
+
// Buffer-reference view over a single block_q5_K_packed32, exposed as `block`.
// Same 16-byte reference alignment as decodeBufQ5_K.
layout(buffer_reference, std430, buffer_reference_align = 16) buffer decodeBufQ5_K_packed32
{
    block_q5_K_packed32 block;
};
|
|
582
|
+
|
|
337
583
|
float16_t dequantFuncQ5_K(const in decodeBufQ5_K bl, const in uint blockCoords[2], const in uint coordInBlock[2])
|
|
338
584
|
{
|
|
339
585
|
decodeBufQ5_K_packed16 bl16 = decodeBufQ5_K_packed16(bl);
|
|
@@ -387,6 +633,58 @@ float16_t dequantFuncQ5_K(const in decodeBufQ5_K bl, const in uint blockCoords[2
|
|
|
387
633
|
return float16_t(ret);
|
|
388
634
|
}
|
|
389
635
|
|
|
636
|
+
// Four-wide Q5_K decoder: returns (low nibble | high bit << 4) * d - m for four
// elements selected by coordInBlock[1], where d and m are the per-sub-block scale
// and minimum. With IS_MUL_MM2 and DATA_A_Q5_K both defined, (d, m) is read from
// shAscales; otherwise it is unpacked from the first 128-bit word of the block.
// NOTE(review): presumably only called with an element index that is a multiple of 4 - confirm.
f16vec4 dequantFuncQ5_K_v(const in decodeBufQ5_K bl, const in uint blockCoords[2], const in uint coordInBlock[2])
{
    decodeBufQ5_K_packed32 view32 = decodeBufQ5_K_packed32(bl);
    decodeBufQ5_K_packed128 view128 = decodeBufQ5_K_packed128(bl);
    const uint elem = coordInBlock[1];
    const uint sub = elem >> 5;

#if defined(IS_MUL_MM2) && defined(DATA_A_Q5_K)
    vec2 pair = shAscales[sub * shAscales_stride + (blockCoords[0] % BM)];
    float scale = pair.x;
    float minv = pair.y;
#else
    const uvec4 head = view128.block.q5k[0];

    // head.x carries the two half-float factors; head.y/z/w carry the packed 6-bit scales/mins.
    const f16vec2 dmin = unpackFloat2x16(head.x);

    const uint32_t scale0 = head.y;
    const uint32_t scale4 = head.z;
    const uint32_t scale8 = head.w;

    uint32_t scBits;
    uint32_t minBits;
    if (sub < 4) {
        // Sub-blocks 0..3: the values sit directly in the low 6 bits of each byte.
        scBits  = scale0;
        minBits = scale4;
    } else {
        // Sub-blocks 4..7: 4 low bits from scale8, 2 high bits from the top of scale0/scale4.
        scBits  = (scale8 & 0x0F0F0F0F) | ((scale0 & 0xC0C0C0C0) >> 2);
        minBits = ((scale8 & 0xF0F0F0F0) >> 4) | ((scale4 & 0xC0C0C0C0) >> 2);
    }

    // Pick the byte for this sub-block and keep its 6-bit value.
    const uint byteShift = 8 * (sub & 3);
    scBits  = (scBits >> byteShift) & 0x3F;
    minBits = (minBits >> byteShift) & 0x3F;

    const float16_t scale = dmin.x * float16_t(scBits);
    const float16_t minv = dmin.y * float16_t(minBits);
#endif

    // nibbleShift is 0 or 4; the 0x0F0F0F0F mask covers the four nibbles either way
    // (no inter-byte leakage).
    const uint nibbleShift = (elem & 0x20u) >> 3u;
    const uint qsWordIdx = (elem >> 6) * 8u + ((elem & 0x1Eu) >> 2);
    const uint qhWordIdx = (elem & 0x1Eu) >> 2;

    const uint low4 = (uint32_t(view32.block.qs[qsWordIdx]) >> nibbleShift) & 0x0F0F0F0Fu;
    // qh keeps bit `sub` per element across 4 consecutive bytes, so a single
    // shift and mask extracts the high bit of all four elements.
    const uint high1 = ((uint32_t(view32.block.qh[qhWordIdx]) >> sub) & 0x01010101u) << 4u;

    const u8vec4 quants = unpack8(low4 | high1);
    return f16vec4(vec4(quants) * vec4(scale) - vec4(minv));
}
|
|
687
|
+
|
|
390
688
|
// Buffer-reference view over a single block_q6_K, exposed as `block`.
// References of this type are declared with 2-byte alignment.
layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufQ6_K
{
    block_q6_K block;
};
|
|
@@ -419,6 +717,35 @@ float16_t dequantFuncQ6_K(const in decodeBufQ6_K bl, const in uint blockCoords[2
|
|
|
419
717
|
return ret;
|
|
420
718
|
}
|
|
421
719
|
|
|
720
|
+
// Four-wide Q6_K decoder: returns ((low nibble | 2 high bits << 4) - 32) * d * scale
// for four elements selected by coordInBlock[1]. ql and qh are read through the
// packed16 view of the block.
// NOTE(review): presumably only called with an element index that is a multiple of 4 - confirm.
f16vec4 dequantFuncQ6_K_v(const in decodeBufQ6_K bl, const in uint blockCoords[2], const in uint coordInBlock[2])
{
    decodeBufQ6_K_packed16 view16 = decodeBufQ6_K_packed16(bl);
    const uint elem = coordInBlock[1];

    const uint upper       = (elem & 0x40) >> 6;
    const uint highShift   = (elem & 0x60) >> 4;   // 0,2,4,6
    const uint subBlock    = elem >> 4;
    const uint nibbleShift = upper * 4;            // 0 or 4

    const float16_t subScale = bl.block.d * float16_t(bl.block.scales[subBlock]);

    const uint qlFirst = ((elem & 0x80) >> 2) + ((elem & 0x3E) >> 1);
    const uint qhFirst = ((elem & 0x80) >> 3) + ((elem & 0x1E) >> 1);

    // Two adjacent packed16 entries are combined per array, so that byte j of each
    // word carries the data for element elem + j.
    const uint qlWord = uint32_t(view16.block.ql[qlFirst])
                      | (uint32_t(view16.block.ql[qlFirst + 1]) << 16);
    const uint qhWord = uint32_t(view16.block.qh[qhFirst])
                      | (uint32_t(view16.block.qh[qhFirst + 1]) << 16);

    // Shifts stay below 8, so the per-byte masks 0x0F / 0x03 keep only the wanted
    // bits; the two qh bits are then moved above the nibble.
    const uint low4  = (qlWord >> nibbleShift) & 0x0F0F0F0Fu;
    const uint high2 = ((qhWord >> highShift) & 0x03030303u) << 4u;

    const ivec4 quants = ivec4(unpack8(low4 | high2));
    return f16vec4((vec4(quants) - vec4(32.0f)) * vec4(float(subScale)));
}
|
|
748
|
+
|
|
422
749
|
#if defined(DATA_A_IQ1_S)
|
|
423
750
|
layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufIQ1_S {
|
|
424
751
|
block_iq1_s block;
|
|
@@ -441,6 +768,29 @@ float16_t dequantFuncIQ1_S(const in decodeBufIQ1_S bl, const in uint blockCoords
|
|
|
441
768
|
float16_t ret = float16_t(dl) * (float16_t(bitfieldExtract(int(grid), 2 * int(idx % 8), 2)) + float16_t(delta));
|
|
442
769
|
return ret;
|
|
443
770
|
}
|
|
771
|
+
|
|
772
|
+
// Four-wide IQ1_S decoder: looks up one iq1s_grid entry for the 8-element group
// containing coordInBlock[1] and returns (g + delta) * dl for the four 2-bit grid
// fields starting at field 0 or 4.
// NOTE(review): presumably only called with an element index that is a multiple of 4 - confirm.
f16vec4 dequantFuncIQ1_S_v(const in decodeBufIQ1_S bl, const in uint blockCoords[2], const in uint coordInBlock[2])
{
    const uint elem = coordInBlock[1];
    const float16_t blockScale = bl.block.d;

    const uint group32 = elem >> 5;
    const uint group8  = elem >> 3;
    const int  field0  = int(elem & 4); // 0 or 4: first grid field of this group of four

    const uint highBits = bl.block.qh[group32];
    const uint lowBits  = bl.block.qs[group8];

    // Bits 12..14 of qh give the sub-block multiplier (2 * s + 1); bit 15 flips the delta sign.
    const float subScale = float(blockScale) * float(2 * bitfieldExtract(highBits, 12, 3) + 1);
    const float shift = ((highBits & 0x8000u) != 0u) ? -IQ1S_DELTA : IQ1S_DELTA;

    // Grid index: 8 bits from qs plus 3 bits taken from qh for this group of 8.
    const uint gridWord = iq1s_grid[lowBits | (bitfieldExtract(highBits, 3 * int(group8 & 3), 3) << 8)];

    const ivec4 quants = ivec4(
        bitfieldExtract(int(gridWord), 2 * (field0 + 0), 2),
        bitfieldExtract(int(gridWord), 2 * (field0 + 1), 2),
        bitfieldExtract(int(gridWord), 2 * (field0 + 2), 2),
        bitfieldExtract(int(gridWord), 2 * (field0 + 3), 2));
    return f16vec4((vec4(quants) + vec4(shift)) * subScale);
}
|
|
444
794
|
#endif
|
|
445
795
|
|
|
446
796
|
#if defined(DATA_A_IQ1_M)
|
|
@@ -473,6 +823,33 @@ float16_t dequantFuncIQ1_M(const in decodeBufIQ1_M bl, const in uint blockCoords
|
|
|
473
823
|
float16_t ret = d * float16_t(dl) * (float16_t(bitfieldExtract(int(grid), 2 * i8, 2)) + float16_t(delta));
|
|
474
824
|
return ret;
|
|
475
825
|
}
|
|
826
|
+
|
|
827
|
+
// Four-wide IQ1_M decoder: rebuilds the half-float block scale from the top nibbles
// of the packed scales, looks up one iq1s_grid entry for the 8-element group
// containing coordInBlock[1], and returns (g + delta) * (d * dl) for the four 2-bit
// grid fields starting at field 0 or 4.
// NOTE(review): presumably only called with an element index that is a multiple of 4 - confirm.
f16vec4 dequantFuncIQ1_M_v(const in decodeBufIQ1_M bl, const in uint blockCoords[2], const in uint coordInBlock[2])
{
    decodeBufIQ1_M_packed64 view64 = decodeBufIQ1_M_packed64(bl);
    const uint elem = coordInBlock[1];

    // The 16-bit pattern of d is spread over the top nibble of each 16-bit half of
    // the two 32-bit scale words; gather the four nibbles back into one value.
    uvec2 scaleWords = unpack32(view64.block.scales);
    const float16_t blockScale = uint16BitsToHalf(uint16_t(
        ((scaleWords.x & 0xF000) >> 12) |
        ((scaleWords.x & 0xF0000000) >> 24) |
        ((scaleWords.y & 0xF000) >> 4) |
        ((scaleWords.y & 0xF0000000) >> 16)));

    const uint group8  = elem >> 3;
    const uint group16 = elem >> 4;
    const int  field0  = int(elem & 4); // 0 or 4: first grid field of this group of four

    const uint packedScale = bl.block.scales[group8 / 8];
    const uint lowBits  = bl.block.qs[group8];
    // Each qh entry serves two groups of 8; odd groups use its high nibble.
    const uint highBits = bl.block.qh[group16] >> (4 * (group8 & 1));

    const float subScale = 2.0 * float(bitfieldExtract(packedScale, 3 * int(group16 & 3), 3)) + 1.0;
    // Bit 3 of the nibble flips the delta sign; bits 0..2 extend the grid index.
    const float shift = ((highBits & 8u) != 0u) ? -IQ1S_DELTA : IQ1S_DELTA;
    const uint gridWord = iq1s_grid[lowBits | ((highBits & 7u) << 8)];

    const ivec4 quants = ivec4(
        bitfieldExtract(int(gridWord), 2 * (field0 + 0), 2),
        bitfieldExtract(int(gridWord), 2 * (field0 + 1), 2),
        bitfieldExtract(int(gridWord), 2 * (field0 + 2), 2),
        bitfieldExtract(int(gridWord), 2 * (field0 + 3), 2));
    return f16vec4((vec4(quants) + vec4(shift)) * (float(blockScale) * subScale));
}
|
|
476
853
|
#endif
|
|
477
854
|
|
|
478
855
|
#if defined(DATA_A_IQ2_XXS)
|
|
@@ -508,6 +885,33 @@ float16_t dequantFuncIQ2_XXS(const in decodeBufIQ2_XXS bl, const in uint blockCo
|
|
|
508
885
|
vec2 ret = dscale * g * ((sign & (1 << (idx & 7))) != 0 ? -1.0hf : 1.0hf);
|
|
509
886
|
return float16_t(ret[idx & 1]);
|
|
510
887
|
}
|
|
888
|
+
|
|
889
|
+
// Four-wide IQ2_XXS decoder: fetches four grid magnitudes from iq2xxs_grid for the
// 8-element group containing coordInBlock[1], applies the per-element sign bits and
// the sub-block scale d * 0.25 * (0.5 + s), where s is the top 4 bits of the
// sign/scale word.
// NOTE(review): presumably only called with an element index that is a multiple of 4 - confirm.
f16vec4 dequantFuncIQ2_XXS_v(const in decodeBufIQ2_XXS bl, const in uint blockCoords[2], const in uint coordInBlock[2])
{
    decodeBufIQ2_XXS_packed16 view16 = decodeBufIQ2_XXS_packed16(bl);
    const uint elem = coordInBlock[1];

    const uint group32 = elem >> 5;
    const uint group8  = (elem & 0x18) >> 3;
    const uint gridSlot = 8 * group32 + group8;

    const uint gridIdx = bl.block.qs[gridSlot];
    // 32-bit word built from two packed16 entries: four 7-bit sign fields plus a 4-bit scale on top.
    const uint signScale = pack32(u16vec2(view16.block.qs[4*group32+2], view16.block.qs[4*group32+3]));
    const float subScale = float(bl.block.d) * 0.25 * (0.5 + float(signScale >> 28));

    // 7 stored sign bits; bit 7 is filled from the bit count (only its lowest bit
    // can be observed through the masks applied below).
    uint signs = bitfieldExtract(signScale, 7 * int(group8), 7);
    signs |= bitCount(signs) << 7;
    const uint signsHere = signs >> (elem & 7u);

    // Each grid entry has two 32-bit halves; bit 2 of the index picks the half.
    const uint gridWord = iq2xxs_grid[gridIdx][(elem & 4) >> 2];
    const u8vec4 mags = unpack8(gridWord);

    return f16vec4(
        subScale * float(mags.x) * ((signsHere & 1u) != 0u ? -1.0 : 1.0),
        subScale * float(mags.y) * ((signsHere & 2u) != 0u ? -1.0 : 1.0),
        subScale * float(mags.z) * ((signsHere & 4u) != 0u ? -1.0 : 1.0),
        subScale * float(mags.w) * ((signsHere & 8u) != 0u ? -1.0 : 1.0));
}
|
|
511
915
|
#endif
|
|
512
916
|
|
|
513
917
|
#if defined(DATA_A_IQ2_XS)
|
|
@@ -536,6 +940,31 @@ float16_t dequantFuncIQ2_XS(const in decodeBufIQ2_XS bl, const in uint blockCoor
|
|
|
536
940
|
vec2 ret = dscale * g * ((sign & (1 << (idx & 7))) != 0 ? -1.0hf : 1.0hf);
|
|
537
941
|
return float16_t(ret[idx & 1]);
|
|
538
942
|
}
|
|
943
|
+
|
|
944
|
+
// Four-wide IQ2_XS decoder: fetches four grid magnitudes from iq2xs_grid for the
// 8-element group containing coordInBlock[1], applies the per-element sign bits and
// the sub-block scale d * 0.25 * (0.5 + s), where s is a 4-bit field of scales[].
// NOTE(review): presumably only called with an element index that is a multiple of 4 - confirm.
f16vec4 dequantFuncIQ2_XS_v(const in decodeBufIQ2_XS bl, const in uint blockCoords[2], const in uint coordInBlock[2])
{
    const uint elem = coordInBlock[1];

    const uint group32    = elem >> 5;
    const uint scaleShift = (elem & 0x10) >> 2; // 0 or 4: low or high nibble of the scale entry
    const uint group8     = elem >> 3;

    const uint16_t packed = bl.block.qs[group8];
    const float subScale = float(bl.block.d) * 0.25 * (0.5 + float((bl.block.scales[group32] >> scaleShift) & 0xF));

    // The top 7 bits of the entry are sign bits; bit 7 is filled from the bit count
    // (only its lowest bit can be observed through the masks applied below).
    uint signs = uint(packed >> 9);
    signs |= bitCount(signs) << 7;
    const uint signsHere = signs >> (elem & 7u);

    // The low 9 bits index the grid; bit 2 of the element index picks the 32-bit half.
    const uint gridWord = iq2xs_grid[packed & 0x1FF][(elem & 4) >> 2];
    const u8vec4 mags = unpack8(gridWord);

    return f16vec4(
        subScale * float(mags.x) * ((signsHere & 1u) != 0u ? -1.0 : 1.0),
        subScale * float(mags.y) * ((signsHere & 2u) != 0u ? -1.0 : 1.0),
        subScale * float(mags.z) * ((signsHere & 4u) != 0u ? -1.0 : 1.0),
        subScale * float(mags.w) * ((signsHere & 8u) != 0u ? -1.0 : 1.0));
}
|
|
539
968
|
#endif
|
|
540
969
|
|
|
541
970
|
#if defined(DATA_A_IQ2_S)
|
|
@@ -564,6 +993,32 @@ float16_t dequantFuncIQ2_S(const in decodeBufIQ2_S bl, const in uint blockCoords
|
|
|
564
993
|
const vec2 v = db * vec2(sign01) * vec2(unpack8(g2));
|
|
565
994
|
return float16_t(v[idx & 1]);
|
|
566
995
|
}
|
|
996
|
+
|
|
997
|
+
// Vector variant of the IQ2_S dequantizer: decodes one 32-bit grid word and
// returns its four scaled, sign-adjusted bytes as an f16vec4.
// Only coordInBlock[1] is read; blockCoords is unused.
// NOTE(review): presumably the caller passes an index that is a multiple of 4,
// so the result covers elements idx..idx+3 -- confirm against the caller.
f16vec4 dequantFuncIQ2_S_v(const in decodeBufIQ2_S bl, const in uint blockCoords[2], const in uint coordInBlock[2])
{
    const uint idx = coordInBlock[1];
    const uint group32 = idx >> 5;
    const uint group8 = idx >> 3;

    // 4-bit scale: bit 4 of idx selects the nibble of scales[group32].
    const uint scale = (bl.block.scales[group32] >> ((idx & 0x10) >> 2)) & 0xf;
    const float db = float(bl.block.d) * 0.25 * (0.5 + scale);

    // Grid row index: low 8 bits come from qs, bits 8..9 from a 2-bit field
    // of qh chosen by the 8-element group within the 32-element group.
    const uint qh_shift = 2 * (group8 % 4);
    const uint lo = bl.block.qs[group8];
    const uint qh = bl.block.qh[group32];
    const uint grid_word = iq2s_grid[lo | ((qh << (8 - qh_shift)) & 0x300)][(idx & 4) >> 2];
    const u8vec4 mag = unpack8(grid_word);

    // The sign byte for this 8-element group is read from qs at offset
    // QUANT_K / 8, then aligned so bit 0 belongs to element idx.
    const uint sb = uint(bl.block.qs[QUANT_K / 8 + group8]) >> (idx & 0x6u);

    // Per-component factor: -1.0 where the matching sign bit is set, else 1.0.
    const bvec4 negate = notEqual(uvec4(sb) & uvec4(1u, 2u, 4u, 8u), uvec4(0u));
    const vec4 sgn = mix(vec4(1.0), vec4(-1.0), negate);

    return f16vec4(db * vec4(mag) * sgn);
}
|
|
567
1022
|
#endif
|
|
568
1023
|
|
|
569
1024
|
#if defined(DATA_A_IQ3_XXS)
|
|
@@ -597,6 +1052,32 @@ float16_t dequantFuncIQ3_XXS(const in decodeBufIQ3_XXS bl, const in uint blockCo
|
|
|
597
1052
|
const vec2 v = db * vec2(sign01) * vec2(unpack8(grid).xy);
|
|
598
1053
|
return float16_t(v[idx & 1]);
|
|
599
1054
|
}
|
|
1055
|
+
|
|
1056
|
+
// Vector variant of the IQ3_XXS dequantizer: decodes one 32-bit grid word and
// returns its four scaled, sign-adjusted bytes as an f16vec4.
// Only coordInBlock[1] is read; blockCoords is unused.
// NOTE(review): presumably the caller passes an index that is a multiple of 4,
// so the result covers elements idx..idx+3 -- confirm against the caller.
f16vec4 dequantFuncIQ3_XXS_v(const in decodeBufIQ3_XXS bl, const in uint blockCoords[2], const in uint coordInBlock[2])
{
    // Same block viewed through the packed16 layout, used for the 16-bit reads below.
    decodeBufIQ3_XXS_packed16 bl16 = decodeBufIQ3_XXS_packed16(bl);
    const uint idx = coordInBlock[1];

    // One qs byte (grid index) per 4 elements.
    const uint grid_slot = idx >> 2;
    const u8vec4 mag = unpack8(iq3xxs_grid[uint(bl.block.qs[grid_slot])]);

    // The 32-bit sign/scale word for this 32-element group is read from qs
    // starting at byte offset QUANT_K / 4, as two 16-bit halves.
    const uint half_index = (QUANT_K / 4 + ((idx & 0xE0) >> 3)) / 2;
    const uint signs = pack32(u16vec2(bl16.block.qs[half_index], bl16.block.qs[half_index + 1u]));

    // The top 4 bits of the word hold the scale.
    const float db = float(bl.block.d) * 0.5 * (0.5 + (signs >> 28));

    // Extract the 7 stored sign bits for this 8-element group, fill bit 7
    // from their population count, and align bit 0 with element idx.
    const uint sign7 = bitfieldExtract(signs, 7 * (int(grid_slot / 2) % 4), 7);
    const uint sb = (sign7 | (bitCount(sign7) << 7)) >> (idx & 0x6u);

    // Per-component factor: -1.0 where the matching sign bit is set, else 1.0.
    const bvec4 negate = notEqual(uvec4(sb) & uvec4(1u, 2u, 4u, 8u), uvec4(0u));
    const vec4 sgn = mix(vec4(1.0), vec4(-1.0), negate);

    return f16vec4(db * vec4(mag) * sgn);
}
|
|
600
1081
|
#endif
|
|
601
1082
|
|
|
602
1083
|
#if defined(DATA_A_IQ3_S)
|
|
@@ -623,6 +1104,30 @@ float16_t dequantFuncIQ3_S(const in decodeBufIQ3_S bl, const in uint blockCoords
|
|
|
623
1104
|
|
|
624
1105
|
return float16_t(v[idx & 1]);
|
|
625
1106
|
}
|
|
1107
|
+
|
|
1108
|
+
// Vector variant of the IQ3_S dequantizer: decodes one 32-bit grid word and
// returns its four scaled, sign-adjusted bytes as an f16vec4.
// Only coordInBlock[1] is read; blockCoords is unused.
// NOTE(review): presumably the caller passes an index that is a multiple of 4,
// so the result covers elements idx..idx+3 -- confirm against the caller.
f16vec4 dequantFuncIQ3_S_v(const in decodeBufIQ3_S bl, const in uint blockCoords[2], const in uint coordInBlock[2])
{
    const uint idx = coordInBlock[1];
    const uint grid_slot = idx >> 2;   // one qs byte per 4 elements
    const uint group32 = idx >> 5;     // one qh byte per 32 elements

    // 9-bit grid index: 8 bits from qs plus one bit of qh moved into bit 8.
    const uint lo = bl.block.qs[grid_slot];
    const uint qh = bl.block.qh[group32];
    const uint grid_word = iq3s_grid[lo | ((qh << (8 - (grid_slot % 8))) & 256)];
    const u8vec4 mag = unpack8(grid_word);

    // One scales byte per 64 elements; the parity of group32 picks the nibble.
    const uint scale = bl.block.scales[grid_slot / 16];
    const float db = float(bl.block.d) * (1 + 2 * ((scale >> (4 * (group32 & 1))) & 0xf));

    // One signs byte per 8 elements, aligned so bit 0 belongs to element idx.
    const uint sb = uint(bl.block.signs[grid_slot / 2]) >> (idx & 0x6u);

    // Per-component factor: -1.0 where the matching sign bit is set, else 1.0.
    const bvec4 negate = notEqual(uvec4(sb) & uvec4(1u, 2u, 4u, 8u), uvec4(0u));
    const vec4 sgn = mix(vec4(1.0), vec4(-1.0), negate);

    return f16vec4(db * vec4(mag) * sgn);
}
|
|
626
1131
|
#endif
|
|
627
1132
|
|
|
628
1133
|
#if defined(DATA_A_IQ4_XS)
|
|
@@ -630,6 +1135,10 @@ layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufIQ4
|
|
|
630
1135
|
block_iq4_xs block;
|
|
631
1136
|
};
|
|
632
1137
|
|
|
1138
|
+
// Buffer-reference view of an IQ4_XS block through its packed32 layout,
// declared with 4-byte reference alignment.
layout(buffer_reference, std430, buffer_reference_align = 4) buffer decodeBufIQ4_XS_packed32
{
    block_iq4_xs_packed32 block;
};
|
|
1141
|
+
|
|
633
1142
|
float16_t dequantFuncIQ4_XS(const in decodeBufIQ4_XS bl, const in uint blockCoords[2], const in uint coordInBlock[2])
|
|
634
1143
|
{
|
|
635
1144
|
const float16_t d = bl.block.d;
|
|
@@ -645,6 +1154,30 @@ float16_t dequantFuncIQ4_XS(const in decodeBufIQ4_XS bl, const in uint blockCoor
|
|
|
645
1154
|
float16_t ret = d * float16_t(int(sl | (sh << 4)) - 32) * float16_t(kvalues_iq4nl[q]);
|
|
646
1155
|
return ret;
|
|
647
1156
|
}
|
|
1157
|
+
|
|
1158
|
+
// Vector variant of the IQ4_XS dequantizer: reads one 32-bit word of qs,
// takes one nibble from each of its four bytes, maps each through
// kvalues_iq4nl and scales the result.
// Only coordInBlock[1] is read; blockCoords is unused.
// NOTE(review): presumably the caller passes an index that is a multiple of 4,
// so the result covers elements idx..idx+3 -- confirm against the caller.
f16vec4 dequantFuncIQ4_XS_v(const in decodeBufIQ4_XS bl, const in uint blockCoords[2], const in uint coordInBlock[2])
{
    // Same block viewed through the packed32 layout for the 32-bit reads below.
    decodeBufIQ4_XS_packed32 bl32 = decodeBufIQ4_XS_packed32(bl);
    const uint idx = coordInBlock[1];
    const uint group32 = idx >> 5; // 0..7

    // 6-bit scale for the group: 4 low bits from scales_l, 2 high bits from
    // scales_h, with a bias of 32 subtracted. The product with d is done in
    // half precision.
    const uint scale_lo = (bl32.block.scales_l >> (4 * group32)) & 0xF;
    const uint scale_hi = (uint(bl32.block.scales_h) >> (2 * group32)) & 0x3;
    const float16_t dl = bl.block.d * float16_t(int(scale_lo | (scale_hi << 4)) - 32);

    // Four 32-bit words per group; bits 2..3 of idx choose the word and
    // bit 4 chooses the low (shift 0) or high (shift 4) nibble of each byte.
    const uint word_index = 4 * group32 + ((idx & 0xC) >> 2); // in [0,32)
    const uint nibble_shift = (idx & 0x10) >> 2;              // {0, 4}
    const u8vec4 q = unpack8((bl32.block.qs[word_index] >> nibble_shift) & 0x0F0F0F0Fu);

    const vec4 looked_up = vec4(
        float(kvalues_iq4nl[q.x]),
        float(kvalues_iq4nl[q.y]),
        float(kvalues_iq4nl[q.z]),
        float(kvalues_iq4nl[q.w]));

    return f16vec4(looked_up * float(dl));
}
|
|
648
1181
|
#endif
|
|
649
1182
|
|
|
650
1183
|
#if defined(DATA_A_IQ4_NL)
|
|
@@ -652,6 +1185,10 @@ layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufIQ4
|
|
|
652
1185
|
block_iq4_nl block;
|
|
653
1186
|
};
|
|
654
1187
|
|
|
1188
|
+
// Buffer-reference view of an IQ4_NL block through its packed16 layout,
// declared with 2-byte reference alignment.
layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufIQ4_NL_packed16
{
    block_iq4_nl_packed16 block;
};
|
|
1191
|
+
|
|
655
1192
|
float16_t dequantFuncIQ4_NL(const in decodeBufIQ4_NL bl, const in uint blockCoords[2], const in uint coordInBlock[2])
|
|
656
1193
|
{
|
|
657
1194
|
const float16_t d = bl.block.d;
|
|
@@ -664,6 +1201,24 @@ float16_t dequantFuncIQ4_NL(const in decodeBufIQ4_NL bl, const in uint blockCoor
|
|
|
664
1201
|
float16_t ret = float16_t(kvalues_iq4nl[qs]) * d;
|
|
665
1202
|
return ret;
|
|
666
1203
|
}
|
|
1204
|
+
|
|
1205
|
+
// Vector variant of the IQ4_NL dequantizer: assembles 32 bits of qs from two
// 16-bit reads, takes one nibble from each of the four bytes, maps each
// through kvalues_iq4nl and multiplies by the block scale d.
// Only coordInBlock[1] is read; blockCoords is unused.
// NOTE(review): presumably the caller passes an index that is a multiple of 4,
// so the result covers elements idx..idx+3 -- confirm against the caller.
f16vec4 dequantFuncIQ4_NL_v(const in decodeBufIQ4_NL bl, const in uint blockCoords[2], const in uint coordInBlock[2])
{
    // Same block viewed through the packed16 layout for the 16-bit reads below.
    decodeBufIQ4_NL_packed16 bl16 = decodeBufIQ4_NL_packed16(bl);
    const uint idx = coordInBlock[1];

    // Bits 2..3 of idx select the pair of 16-bit words (first index in {0,2,4,6}).
    const uint first_word = (idx & 0xC) >> 1;
    const uint lo16 = uint32_t(bl16.block.qs[first_word]);
    const uint hi16 = uint32_t(bl16.block.qs[first_word + 1u]);
    const uint packed = lo16 | (hi16 << 16);

    // Bit 4 of idx selects the low (shift 0) or high (shift 4) nibble; the
    // per-byte 0x0F mask then isolates that nibble in every byte.
    const uint nibble_shift = (idx & 0x10) >> 2;
    const u8vec4 q = unpack8((packed >> nibble_shift) & 0x0F0F0F0Fu);

    const float scale = float(bl.block.d);
    return f16vec4(scale * vec4(
        float(kvalues_iq4nl[q.x]),
        float(kvalues_iq4nl[q.y]),
        float(kvalues_iq4nl[q.z]),
        float(kvalues_iq4nl[q.w])));
}
|
|
667
1222
|
#endif
|
|
668
1223
|
|
|
669
1224
|
#if defined(DATA_A_MXFP4)
|
|
@@ -683,52 +1238,139 @@ float16_t dequantFuncMXFP4(const in decodeBufMXFP4 bl, const in uint blockCoords
|
|
|
683
1238
|
float16_t ret = float16_t(kvalues_mxfp4[qs] * d * 0.5);
|
|
684
1239
|
return ret;
|
|
685
1240
|
}
|
|
1241
|
+
|
|
1242
|
+
// Vector variant of the MXFP4 dequantizer: reads four consecutive qs bytes,
// takes the same nibble from each, maps it through kvalues_mxfp4 and applies
// the block scale times 0.5.
// Only coordInBlock[1] is read; blockCoords is unused.
// NOTE(review): reads qs[base .. base+3] with base = idx & 0xF, so this
// presumably relies on idx being a multiple of 4 -- confirm against the caller.
f16vec4 dequantFuncMXFP4_v(const in decodeBufMXFP4 bl, const in uint blockCoords[2], const in uint coordInBlock[2])
{
    const uint idx = coordInBlock[1];
    const float d = e8m0_to_fp32(bl.block.e);

    // The low 4 bits of idx select the first byte; bit 4 selects the low
    // (shift 0) or high (shift 4) nibble.
    const uint base = idx & 0xF;
    const uint nibble_shift = (idx & 0x10) >> 2;

    const uvec4 raw = uvec4(
        uint(bl.block.qs[base]),
        uint(bl.block.qs[base + 1u]),
        uint(bl.block.qs[base + 2u]),
        uint(bl.block.qs[base + 3u]));
    const uvec4 q = (raw >> nibble_shift) & 0xFu;

    const vec4 looked_up = vec4(
        float(kvalues_mxfp4[q.x]),
        float(kvalues_mxfp4[q.y]),
        float(kvalues_mxfp4[q.z]),
        float(kvalues_mxfp4[q.w]));

    return f16vec4(looked_up * d * 0.5f);
}
|
|
1261
|
+
#endif
|
|
1262
|
+
|
|
1263
|
+
#if defined(DATA_A_NVFP4)
|
|
1264
|
+
// Buffer-reference view of an NVFP4 block, declared with 4-byte reference alignment.
layout(buffer_reference, std430, buffer_reference_align = 4) buffer decodeBufNVFP4
{
    block_nvfp4 block;
};

// The same block viewed through its packed32 layout, also 4-byte aligned.
layout(buffer_reference, std430, buffer_reference_align = 4) buffer decodeBufNVFP4_packed32
{
    block_nvfp4_packed32 block;
};
|
|
1271
|
+
|
|
1272
|
+
// Scalar NVFP4 dequantizer: returns the element at coordInBlock[1] as a
// float16_t. blockCoords is unused.
float16_t dequantFuncNVFP4(const in decodeBufNVFP4 bl, const in uint blockCoords[2], const in uint coordInBlock[2])
{
    const uint idx = coordInBlock[1];

    // Bits 4..5 of idx select the 16-element sub-block, which has its own
    // scale in d[] and its own 8-byte slice of qs.
    const uint sub_bits = idx & 0x30;
    const float d = ue4m3_to_fp32(bl.block.d[sub_bits >> 4]);

    // The low 3 bits of idx pick the byte in the slice; bit 3 picks the low
    // (shift 0) or high (shift 4) nibble.
    const uint byte_index = (sub_bits >> 1) + (idx & 0x7);
    const uint nibble_shift = (idx & 0x8) >> 1;
    const uint code = (uint(bl.block.qs[byte_index]) >> nibble_shift) & 0xF;

    return float16_t(kvalues_mxfp4[code] * d * 0.5);
}
|
|
1283
|
+
|
|
1284
|
+
// Vector variant of the NVFP4 dequantizer: reads one 32-bit word of qs, takes
// one nibble from each of its four bytes, maps each through kvalues_mxfp4 and
// applies the sub-block scale times 0.5.
// Only coordInBlock[1] is read; blockCoords is unused.
// NOTE(review): presumably the caller passes an index that is a multiple of 4,
// so the result covers elements idx..idx+3 -- confirm against the caller.
f16vec4 dequantFuncNVFP4_v(const in decodeBufNVFP4 bl, const in uint blockCoords[2], const in uint coordInBlock[2])
{
    // Same block viewed through the packed32 layout for the 32-bit read below.
    decodeBufNVFP4_packed32 bl32 = decodeBufNVFP4_packed32(bl);
    const uint idx = coordInBlock[1];

    // One scale per 16 elements. Unlike the scalar version, the index is not masked.
    const float d = ue4m3_to_fp32(bl.block.d[idx >> 4]);

    // Two 32-bit words per 16-element sub-block: bits 4..5 of idx select the
    // sub-block and bit 2 selects the word within it.
    const uint word_index = ((idx & 0x30) >> 3) + ((idx & 0x4u) >> 2); // in [0,8)
    // Bit 3 of idx selects the low (shift 0) or high (shift 4) nibble of each byte.
    const uint nibble_shift = (idx & 0x8) >> 1;

    const uint packed = uint32_t(bl32.block.qs[word_index]);
    const u8vec4 q = unpack8((packed >> nibble_shift) & 0x0F0F0F0Fu);

    const vec4 looked_up = vec4(
        float(kvalues_mxfp4[q.x]),
        float(kvalues_mxfp4[q.y]),
        float(kvalues_mxfp4[q.z]),
        float(kvalues_mxfp4[q.w]));

    return f16vec4(looked_up * d * 0.5f);
}
|
|
686
1302
|
#endif
|
|
687
1303
|
|
|
688
|
-
#if defined(
|
|
1304
|
+
// Select the dequantization entry points for the A-matrix data type chosen at
// compile time. dequantFuncA is the scalar decoder and dequantFuncA_v the
// f16vec4 decoder for the same format.

// Plain block formats.
#if defined(DATA_A_Q1_0)
#define dequantFuncA dequantFuncQ1_0
#define dequantFuncA_v dequantFuncQ1_0_v
#elif defined(DATA_A_Q4_0)
#define dequantFuncA dequantFuncQ4_0
#define dequantFuncA_v dequantFuncQ4_0_v
#elif defined(DATA_A_Q4_1)
#define dequantFuncA dequantFuncQ4_1
#define dequantFuncA_v dequantFuncQ4_1_v
#elif defined(DATA_A_Q5_0)
#define dequantFuncA dequantFuncQ5_0
#define dequantFuncA_v dequantFuncQ5_0_v
#elif defined(DATA_A_Q5_1)
#define dequantFuncA dequantFuncQ5_1
#define dequantFuncA_v dequantFuncQ5_1_v
#elif defined(DATA_A_Q8_0)
#define dequantFuncA dequantFuncQ8_0
#define dequantFuncA_v dequantFuncQ8_0_v

// K-quant formats. Q4_K and Q5_K also select scale fetch/store helpers.
#elif defined(DATA_A_Q2_K)
#define dequantFuncA dequantFuncQ2_K
#define dequantFuncA_v dequantFuncQ2_K_v
#elif defined(DATA_A_Q3_K)
#define dequantFuncA dequantFuncQ3_K
#define dequantFuncA_v dequantFuncQ3_K_v
#elif defined(DATA_A_Q4_K)
#define dequantFuncA dequantFuncQ4_K
#define dequantFuncA_v dequantFuncQ4_K_v
#define fetch_scales fetch_scalesQ4_K
#define store_scales store_scalesQ4_K
#elif defined(DATA_A_Q5_K)
#define dequantFuncA dequantFuncQ5_K
#define dequantFuncA_v dequantFuncQ5_K_v
#define fetch_scales fetch_scalesQ5_K
// NOTE(review): Q5_K uses the Q4_K store helper; presumably intentional -- confirm.
#define store_scales store_scalesQ4_K
#elif defined(DATA_A_Q6_K)
#define dequantFuncA dequantFuncQ6_K
#define dequantFuncA_v dequantFuncQ6_K_v

// IQ formats.
#elif defined(DATA_A_IQ1_S)
#define dequantFuncA dequantFuncIQ1_S
#define dequantFuncA_v dequantFuncIQ1_S_v
#elif defined(DATA_A_IQ1_M)
#define dequantFuncA dequantFuncIQ1_M
#define dequantFuncA_v dequantFuncIQ1_M_v
#elif defined(DATA_A_IQ2_XXS)
#define dequantFuncA dequantFuncIQ2_XXS
#define dequantFuncA_v dequantFuncIQ2_XXS_v
#elif defined(DATA_A_IQ2_XS)
#define dequantFuncA dequantFuncIQ2_XS
#define dequantFuncA_v dequantFuncIQ2_XS_v
#elif defined(DATA_A_IQ2_S)
#define dequantFuncA dequantFuncIQ2_S
#define dequantFuncA_v dequantFuncIQ2_S_v
#elif defined(DATA_A_IQ3_XXS)
#define dequantFuncA dequantFuncIQ3_XXS
#define dequantFuncA_v dequantFuncIQ3_XXS_v
#elif defined(DATA_A_IQ3_S)
#define dequantFuncA dequantFuncIQ3_S
#define dequantFuncA_v dequantFuncIQ3_S_v
#elif defined(DATA_A_IQ4_XS)
#define dequantFuncA dequantFuncIQ4_XS
#define dequantFuncA_v dequantFuncIQ4_XS_v
#elif defined(DATA_A_IQ4_NL)
#define dequantFuncA dequantFuncIQ4_NL
#define dequantFuncA_v dequantFuncIQ4_NL_v

// FP4 formats.
#elif defined(DATA_A_MXFP4)
#define dequantFuncA dequantFuncMXFP4
#define dequantFuncA_v dequantFuncMXFP4_v
#elif defined(DATA_A_NVFP4)
#define dequantFuncA dequantFuncNVFP4
#define dequantFuncA_v dequantFuncNVFP4_v

// F32 defines only the scalar entry point; no dequantFuncA_v is set here.
#elif defined(DATA_A_F32)
#define dequantFuncA dequantFuncF32
#endif
|