whispercpp 1.3.5 → 1.3.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.document +3 -0
- data/.rdoc_options +2 -0
- data/LICENSE +1 -1
- data/README.md +133 -3
- data/Rakefile +18 -3
- data/ext/dependencies.rb +10 -4
- data/ext/dependencies_for_windows.rb +17 -0
- data/ext/extconf.rb +20 -7
- data/ext/options.rb +54 -14
- data/ext/options_for_windows.rb +51 -0
- data/ext/ruby_whisper.c +56 -46
- data/ext/ruby_whisper.h +165 -2
- data/ext/ruby_whisper_context.c +297 -126
- data/ext/ruby_whisper_context_params.c +163 -0
- data/ext/ruby_whisper_log_queue.c +180 -0
- data/ext/ruby_whisper_log_settable.h +47 -0
- data/ext/ruby_whisper_model.c +0 -1
- data/ext/ruby_whisper_parakeet.c +49 -0
- data/ext/ruby_whisper_parakeet_context.c +304 -0
- data/ext/ruby_whisper_parakeet_context_params.c +117 -0
- data/ext/ruby_whisper_parakeet_model.c +84 -0
- data/ext/ruby_whisper_parakeet_params.c +548 -0
- data/ext/ruby_whisper_parakeet_segment.c +157 -0
- data/ext/ruby_whisper_parakeet_token.c +188 -0
- data/ext/ruby_whisper_parakeet_transcribe.cpp +58 -0
- data/ext/ruby_whisper_params.c +256 -66
- data/ext/ruby_whisper_segment.c +6 -7
- data/ext/ruby_whisper_token.c +29 -9
- data/ext/ruby_whisper_transcribe.cpp +46 -16
- data/ext/ruby_whisper_vad_context.c +48 -1
- data/ext/ruby_whisper_vad_context_detect.cpp +6 -5
- data/ext/ruby_whisper_vad_params.c +0 -1
- data/ext/ruby_whisper_vad_segment.c +0 -1
- data/ext/ruby_whisper_vad_segments.c +0 -1
- data/ext/sources/CMakeLists.txt +41 -3
- data/ext/sources/CMakePresets.json +95 -0
- data/ext/sources/cmake/parakeet-config.cmake.in +30 -0
- data/ext/sources/cmake/parakeet.pc.in +10 -0
- data/ext/sources/cmake/whisper-config.cmake.in +5 -40
- data/ext/sources/cmake/whisper.pc.in +1 -1
- data/ext/sources/examples/CMakeLists.txt +4 -2
- data/ext/sources/examples/bench/bench.cpp +24 -19
- data/ext/sources/examples/cli/cli.cpp +51 -9
- data/ext/sources/examples/common-ggml.cpp +4 -0
- data/ext/sources/examples/common-whisper.cpp +139 -67
- data/ext/sources/examples/common-whisper.h +11 -0
- data/ext/sources/examples/ffmpeg-transcode.cpp +211 -341
- data/ext/sources/examples/miniaudio.h +4507 -2131
- data/ext/sources/examples/parakeet-cli/CMakeLists.txt +8 -0
- data/ext/sources/examples/parakeet-cli/parakeet-cli.cpp +243 -0
- data/ext/sources/examples/parakeet-quantize/CMakeLists.txt +7 -0
- data/ext/sources/examples/parakeet-quantize/parakeet-quantize.cpp +230 -0
- data/ext/sources/examples/server/server.cpp +213 -163
- data/ext/sources/ggml/CMakeLists.txt +29 -15
- data/ext/sources/ggml/cmake/FindNCCL.cmake +36 -0
- data/ext/sources/ggml/cmake/ggml-config.cmake.in +12 -2
- data/ext/sources/ggml/include/ggml-alloc.h +1 -0
- data/ext/sources/ggml/include/ggml-backend.h +73 -11
- data/ext/sources/ggml/include/ggml-cann.h +1 -1
- data/ext/sources/ggml/include/ggml-cpu.h +5 -0
- data/ext/sources/ggml/include/ggml-cuda.h +3 -0
- data/ext/sources/ggml/include/ggml-openvino.h +37 -0
- data/ext/sources/ggml/include/ggml-opt.h +1 -1
- data/ext/sources/ggml/include/ggml-rpc.h +8 -3
- data/ext/sources/ggml/include/ggml-virtgpu.h +14 -0
- data/ext/sources/ggml/include/ggml.h +155 -16
- data/ext/sources/ggml/include/gguf.h +10 -2
- data/ext/sources/ggml/src/CMakeLists.txt +25 -5
- data/ext/sources/ggml/src/ggml-alloc.c +9 -10
- data/ext/sources/ggml/src/ggml-backend-dl.cpp +48 -0
- data/ext/sources/ggml/src/ggml-backend-dl.h +45 -0
- data/ext/sources/ggml/src/ggml-backend-impl.h +22 -2
- data/ext/sources/ggml/src/ggml-backend-meta.cpp +2263 -0
- data/ext/sources/ggml/src/ggml-backend-reg.cpp +40 -86
- data/ext/sources/ggml/src/ggml-backend.cpp +114 -10
- data/ext/sources/ggml/src/ggml-blas/CMakeLists.txt +1 -1
- data/ext/sources/ggml/src/ggml-blas/ggml-blas.cpp +10 -2
- data/ext/sources/ggml/src/ggml-cann/acl_tensor.cpp +1 -1
- data/ext/sources/ggml/src/ggml-cann/acl_tensor.h +1 -1
- data/ext/sources/ggml/src/ggml-cann/aclnn_ops.cpp +1016 -442
- data/ext/sources/ggml/src/ggml-cann/aclnn_ops.h +111 -85
- data/ext/sources/ggml/src/ggml-cann/common.h +23 -14
- data/ext/sources/ggml/src/ggml-cann/ggml-cann.cpp +255 -92
- data/ext/sources/ggml/src/ggml-common.h +22 -0
- data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +68 -34
- data/ext/sources/ggml/src/ggml-cpu/amx/amx.cpp +44 -19
- data/ext/sources/ggml/src/ggml-cpu/amx/common.h +34 -10
- data/ext/sources/ggml/src/ggml-cpu/amx/mmq.cpp +101 -101
- data/ext/sources/ggml/src/ggml-cpu/arch/arm/quants.c +194 -1
- data/ext/sources/ggml/src/ggml-cpu/arch/arm/repack.cpp +2874 -613
- data/ext/sources/ggml/src/ggml-cpu/arch/loongarch/quants.c +151 -1
- data/ext/sources/ggml/src/ggml-cpu/arch/powerpc/quants.c +0 -1
- data/ext/sources/ggml/src/ggml-cpu/arch/riscv/quants.c +5480 -840
- data/ext/sources/ggml/src/ggml-cpu/arch/riscv/repack.cpp +1361 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/s390/quants.c +8 -11
- data/ext/sources/ggml/src/ggml-cpu/arch/wasm/quants.c +72 -1
- data/ext/sources/ggml/src/ggml-cpu/arch/x86/quants.c +186 -36
- data/ext/sources/ggml/src/ggml-cpu/arch/x86/repack.cpp +119 -19
- data/ext/sources/ggml/src/ggml-cpu/arch-fallback.h +112 -26
- data/ext/sources/ggml/src/ggml-cpu/binary-ops.cpp +2 -6
- data/ext/sources/ggml/src/ggml-cpu/cmake/FindSMTIME.cmake +32 -0
- data/ext/sources/ggml/src/ggml-cpu/common.h +8 -0
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-impl.h +13 -0
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +153 -16
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.cpp +17 -0
- data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.cpp +21 -20
- data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +976 -251
- data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +671 -266
- data/ext/sources/ggml/src/ggml-cpu/ops.cpp +1277 -263
- data/ext/sources/ggml/src/ggml-cpu/ops.h +4 -0
- data/ext/sources/ggml/src/ggml-cpu/quants.c +95 -0
- data/ext/sources/ggml/src/ggml-cpu/quants.h +6 -0
- data/ext/sources/ggml/src/ggml-cpu/repack.cpp +2893 -679
- data/ext/sources/ggml/src/ggml-cpu/repack.h +119 -8
- data/ext/sources/ggml/src/ggml-cpu/simd-gemm.h +226 -0
- data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +114 -19
- data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.cpp +1402 -687
- data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.h +8 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp +597 -2766
- data/ext/sources/ggml/src/ggml-cpu/spacemit/ime2_kernels.cpp +5768 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/ime_env.cpp +320 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/ime_env.h +55 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/ime_kernels.h +182 -19
- data/ext/sources/ggml/src/ggml-cpu/spacemit/repack.cpp +1795 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/repack.h +14 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/rvv_kernels.cpp +3178 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/rvv_kernels.h +95 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/spine_barrier.h +34 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/spine_mem_pool.cpp +760 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/spine_mem_pool.h +32 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/spine_tcm.h +409 -0
- data/ext/sources/ggml/src/ggml-cpu/unary-ops.cpp +1 -1
- data/ext/sources/ggml/src/ggml-cpu/vec.cpp +54 -53
- data/ext/sources/ggml/src/ggml-cpu/vec.h +225 -240
- data/ext/sources/ggml/src/ggml-cuda/CMakeLists.txt +18 -8
- data/ext/sources/ggml/src/ggml-cuda/allreduce.cu +971 -0
- data/ext/sources/ggml/src/ggml-cuda/allreduce.cuh +29 -0
- data/ext/sources/ggml/src/ggml-cuda/argsort.cu +73 -28
- data/ext/sources/ggml/src/ggml-cuda/binbcast.cu +69 -41
- data/ext/sources/ggml/src/ggml-cuda/binbcast.cuh +1 -0
- data/ext/sources/ggml/src/ggml-cuda/common.cuh +359 -29
- data/ext/sources/ggml/src/ggml-cuda/concat.cu +120 -114
- data/ext/sources/ggml/src/ggml-cuda/conv2d-transpose.cu +45 -21
- data/ext/sources/ggml/src/ggml-cuda/conv2d-transpose.cuh +1 -0
- data/ext/sources/ggml/src/ggml-cuda/convert.cu +94 -27
- data/ext/sources/ggml/src/ggml-cuda/convert.cuh +10 -0
- data/ext/sources/ggml/src/ggml-cuda/cpy.cu +20 -9
- data/ext/sources/ggml/src/ggml-cuda/dequantize.cuh +22 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +333 -85
- data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +632 -190
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cu +12 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cuh +162 -49
- data/ext/sources/ggml/src/ggml-cuda/fattn-vec.cuh +43 -18
- data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +44 -14
- data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cuh +1 -1
- data/ext/sources/ggml/src/ggml-cuda/fattn.cu +241 -23
- data/ext/sources/ggml/src/ggml-cuda/fattn.cuh +2 -0
- data/ext/sources/ggml/src/ggml-cuda/fwht.cu +101 -0
- data/ext/sources/ggml/src/ggml-cuda/fwht.cuh +4 -0
- data/ext/sources/ggml/src/ggml-cuda/gated_delta_net.cu +312 -0
- data/ext/sources/ggml/src/ggml-cuda/gated_delta_net.cuh +4 -0
- data/ext/sources/ggml/src/ggml-cuda/getrows.cu +34 -12
- data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +1454 -599
- data/ext/sources/ggml/src/ggml-cuda/im2col.cu +32 -29
- data/ext/sources/ggml/src/ggml-cuda/mean.cu +13 -10
- data/ext/sources/ggml/src/ggml-cuda/mma.cuh +397 -183
- data/ext/sources/ggml/src/ggml-cuda/mmf.cu +30 -10
- data/ext/sources/ggml/src/ggml-cuda/mmf.cuh +161 -88
- data/ext/sources/ggml/src/ggml-cuda/mmq.cu +18 -12
- data/ext/sources/ggml/src/ggml-cuda/mmq.cuh +522 -431
- data/ext/sources/ggml/src/ggml-cuda/mmvf.cu +139 -72
- data/ext/sources/ggml/src/ggml-cuda/mmvf.cuh +2 -0
- data/ext/sources/ggml/src/ggml-cuda/mmvq.cu +608 -88
- data/ext/sources/ggml/src/ggml-cuda/mmvq.cuh +6 -0
- data/ext/sources/ggml/src/ggml-cuda/norm.cu +47 -79
- data/ext/sources/ggml/src/ggml-cuda/out-prod.cu +23 -7
- data/ext/sources/ggml/src/ggml-cuda/pad.cu +13 -10
- data/ext/sources/ggml/src/ggml-cuda/quantize.cu +134 -27
- data/ext/sources/ggml/src/ggml-cuda/quantize.cuh +1 -1
- data/ext/sources/ggml/src/ggml-cuda/reduce_rows.cuh +7 -17
- data/ext/sources/ggml/src/ggml-cuda/rope.cu +244 -137
- data/ext/sources/ggml/src/ggml-cuda/scale.cu +4 -1
- data/ext/sources/ggml/src/ggml-cuda/set-rows.cu +14 -6
- data/ext/sources/ggml/src/ggml-cuda/snake.cu +72 -0
- data/ext/sources/ggml/src/ggml-cuda/snake.cuh +8 -0
- data/ext/sources/ggml/src/ggml-cuda/softcap.cu +4 -1
- data/ext/sources/ggml/src/ggml-cuda/softmax.cu +8 -83
- data/ext/sources/ggml/src/ggml-cuda/solve_tri.cu +1 -1
- data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cu +96 -40
- data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cuh +1 -1
- data/ext/sources/ggml/src/ggml-cuda/ssm-scan.cu +40 -18
- data/ext/sources/ggml/src/ggml-cuda/sumrows.cu +8 -4
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_16.cu +1 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_32.cu +6 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_8.cu +2 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_4.cu +2 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_16.cu +1 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_32.cu +6 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_4.cu +2 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_8.cu +2 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_16.cu +1 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_4.cu +2 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_8.cu +2 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_4.cu +2 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_8.cu +2 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq192-dv128.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq320-dv256.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq512-dv512.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-bf16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-f16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q4_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q4_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q5_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q5_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q8_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-bf16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-bf16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-bf16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-bf16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-bf16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-bf16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-nvfp4.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q1_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/top-k.cu +5 -5
- data/ext/sources/ggml/src/ggml-cuda/topk-moe.cu +202 -135
- data/ext/sources/ggml/src/ggml-cuda/topk-moe.cuh +20 -14
- data/ext/sources/ggml/src/ggml-cuda/unary.cu +86 -2
- data/ext/sources/ggml/src/ggml-cuda/unary.cuh +4 -0
- data/ext/sources/ggml/src/ggml-cuda/vecdotq.cuh +111 -17
- data/ext/sources/ggml/src/ggml-cuda/vendors/cuda.h +7 -2
- data/ext/sources/ggml/src/ggml-cuda/vendors/hip.h +30 -2
- data/ext/sources/ggml/src/ggml-cuda/vendors/musa.h +3 -0
- data/ext/sources/ggml/src/ggml-hexagon/CMakeLists.txt +84 -46
- data/ext/sources/ggml/src/ggml-hexagon/ggml-hexagon.cpp +1612 -753
- data/ext/sources/ggml/src/ggml-hexagon/htp/CMakeLists.txt +51 -11
- data/ext/sources/ggml/src/ggml-hexagon/htp/act-ops.c +361 -261
- data/ext/sources/ggml/src/ggml-hexagon/htp/argsort-ops.c +294 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/binary-ops.c +753 -241
- data/ext/sources/ggml/src/ggml-hexagon/htp/cmake-toolchain.cmake +5 -5
- data/ext/sources/ggml/src/ggml-hexagon/htp/concat-ops.c +277 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/cpy-ops.c +295 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/cumsum-ops.c +270 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/diag-ops.c +216 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/fill-ops.c +123 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/flash-attn-ops.c +471 -296
- data/ext/sources/ggml/src/ggml-hexagon/htp/gated-delta-net-ops.c +1148 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/get-rows-ops.c +159 -53
- data/ext/sources/ggml/src/ggml-hexagon/htp/{htp-dma.c → hex-dma.c} +3 -3
- data/ext/sources/ggml/src/ggml-hexagon/htp/hex-dma.h +372 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hex-dump.h +86 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hex-fastdiv.h +37 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hex-utils.h +137 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-flash-attn-ops.c +1878 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c +2066 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-ops.c +6 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-ops.h +88 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-profile.h +34 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-queue.c +158 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-queue.h +134 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-utils.h +200 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ctx.h +97 -14
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ops.h +163 -67
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp_iface.idl +9 -3
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-arith.h +443 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-base.h +308 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-copy.h +262 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-div.h +291 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-dump.h +129 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-exp.h +216 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-flash-attn.h +47 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-floor.h +100 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-inverse.h +210 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-log.h +65 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-pow.h +42 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-reduce.h +296 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-repl.h +74 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-scale.h +133 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sigmoid.h +142 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sin-cos.h +90 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sqrt.h +126 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-types.h +36 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-utils.h +18 -1348
- data/ext/sources/ggml/src/ggml-hexagon/htp/main.c +547 -635
- data/ext/sources/ggml/src/ggml-hexagon/htp/matmul-ops.c +3556 -1101
- data/ext/sources/ggml/src/ggml-hexagon/htp/pad-ops.c +547 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/repeat-ops.c +148 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/rope-ops.c +475 -269
- data/ext/sources/ggml/src/ggml-hexagon/htp/set-rows-ops.c +94 -72
- data/ext/sources/ggml/src/ggml-hexagon/htp/softmax-ops.c +222 -217
- data/ext/sources/ggml/src/ggml-hexagon/htp/solve-tri-ops.c +267 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/ssm-conv.c +432 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/sum-rows-ops.c +128 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/unary-ops.c +886 -117
- data/ext/sources/ggml/src/ggml-hexagon/htp/vtcm-utils.h +16 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/worker-pool.c +1 -5
- data/ext/sources/ggml/src/ggml-hexagon/htp-drv.cpp +418 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp-drv.h +121 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp-opnode.h +272 -0
- data/ext/sources/ggml/src/ggml-hexagon/libdl.h +79 -0
- data/ext/sources/ggml/src/ggml-hexagon/libggml-htp.inf +40 -0
- data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +28 -9
- data/ext/sources/ggml/src/ggml-impl.h +68 -1
- data/ext/sources/ggml/src/ggml-metal/CMakeLists.txt +10 -10
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-common.cpp +13 -2
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.h +8 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.m +147 -17
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.cpp +409 -83
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.h +54 -5
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.m +254 -52
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +254 -23
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.cpp +756 -285
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.h +7 -4
- data/ext/sources/ggml/src/ggml-metal/ggml-metal.cpp +359 -133
- data/ext/sources/ggml/src/ggml-metal/ggml-metal.metal +1867 -1123
- data/ext/sources/ggml/src/ggml-musa/CMakeLists.txt +5 -6
- data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +71 -4
- data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +14127 -5314
- data/ext/sources/ggml/src/ggml-opencl/kernels/concat.cl +97 -88
- data/ext/sources/ggml/src/ggml-opencl/kernels/cpy.cl +104 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/cumsum.cl +139 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/cvt.cl +1978 -67
- data/ext/sources/ggml/src/ggml-opencl/kernels/diag.cl +27 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/exp.cl +125 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/expm1.cl +87 -56
- data/ext/sources/ggml/src/ggml-opencl/kernels/gated_delta_net.cl +249 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_mxfp4_f32_ns.cl +306 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q4_0_f32_ns.cl +256 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q4_1_f32_ns.cl +258 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q4_k_f32_ns.cl +283 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q5_0_f32_ns.cl +260 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q5_1_f32_ns.cl +262 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q5_k_f32_ns.cl +288 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q6_k_f32_ns.cl +267 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_iq4_nl_f32.cl +150 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/{mul_mat_Ab_Bi_8x4.cl → gemm_noshuffle_q4_0_f32.cl} +1 -1
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q4_1_f32.cl +132 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q4_k_f32.cl +172 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q5_0_f32.cl +131 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q5_1_f32.cl +134 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q5_k_f32.cl +176 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q6_k_f32.cl +140 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q8_0_f32.cl +129 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_xmem_f16_f32_os8.cl +233 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_mxfp4_f32_ns.cl +165 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q4_0_f32_ns.cl +120 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q4_1_f32_ns.cl +123 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q4_k_f32_ns.cl +155 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q5_0_f32_ns.cl +123 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q5_1_f32_ns.cl +125 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q5_k_f32_ns.cl +160 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q6_k_f32_ns.cl +141 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_iq4_nl_f32.cl +302 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/{gemv_noshuffle_general.cl → gemv_noshuffle_q4_0_f32.cl} +5 -5
- data/ext/sources/ggml/src/ggml-opencl/kernels/{gemv_noshuffle.cl → gemv_noshuffle_q4_0_f32_spec.cl} +5 -5
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q4_1_f32.cl +283 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q4_k_f32.cl +318 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q5_0_f32.cl +291 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q5_1_f32.cl +294 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q5_k_f32.cl +326 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q6_k_f32.cl +293 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q8_0_f32.cl +195 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/get_rows.cl +15 -9
- data/ext/sources/ggml/src/ggml-opencl/kernels/l2_norm.cl +71 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mean.cl +114 -13
- data/ext/sources/ggml/src/ggml-opencl/kernels/moe_reorder_b.cl +30 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/moe_sort_by_expert.cl +82 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_iq4_nl_f32_l4_lm.cl +171 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q4_0_f32_l4_lm.cl +163 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q4_1_f32_l4_lm.cl +165 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q4_k_f32_l4_lm.cl +179 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q5_0_f32_l4_lm.cl +173 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q5_1_f32_l4_lm.cl +175 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q5_k_f32_l4_lm.cl +192 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q6_k_f32_l4_lm.cl +158 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_iq4_nl_f32.cl +164 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_iq4_nl_f32_flat.cl +202 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32.cl +219 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32_flat.cl +229 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_k_f32.cl +180 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_k_f32_flat.cl +196 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_0_f32.cl +241 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_0_f32_flat.cl +243 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_1_f32.cl +243 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_1_f32_flat.cl +247 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_k_f32.cl +187 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_k_f32_flat.cl +203 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/{mul_mv_q6_k.cl → mul_mv_q6_k_f32.cl} +4 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32_flat.cl +178 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/neg.cl +125 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/repeat.cl +31 -32
- data/ext/sources/ggml/src/ggml-opencl/kernels/scale.cl +14 -4
- data/ext/sources/ggml/src/ggml-opencl/kernels/softplus.cl +88 -60
- data/ext/sources/ggml/src/ggml-opencl/kernels/solve_tri.cl +51 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/sum_rows.cl +114 -13
- data/ext/sources/ggml/src/ggml-opencl/kernels/tanh.cl +94 -48
- data/ext/sources/ggml/src/ggml-opencl/kernels/transpose.cl +26 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/tri.cl +32 -0
- data/ext/sources/ggml/src/ggml-openvino/.clang-format +154 -0
- data/ext/sources/ggml/src/ggml-openvino/CMakeLists.txt +22 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-decoder.cpp +985 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-decoder.h +294 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-openvino-extra.cpp +380 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-openvino-extra.h +182 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-openvino.cpp +1132 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-quants.cpp +956 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-quants.h +153 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/decoder.h +74 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/frontend.cpp +27 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/frontend.h +23 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/input_model.cpp +17 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/input_model.h +29 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/node_context.h +112 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/cont.cpp +48 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/cpy.cpp +21 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp +90 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/get_rows.cpp +69 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/glu_geglu.cpp +61 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp +62 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/mulmat.cpp +90 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/permute.cpp +102 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/reshape.cpp +83 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp +46 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/rope.cpp +149 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/scale.cpp +41 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/set_rows.cpp +76 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/softmax.cpp +89 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/transpose.cpp +23 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/unary_gelu.cpp +25 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/unary_silu.cpp +27 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/view.cpp +53 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op_table.cpp +47 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op_table.h +40 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.cpp +60 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.h +17 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/mark_decompression_convert_constant_folding.h +29 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/squeeze_matmul.cpp +58 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/squeeze_matmul.h +17 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/rt_info/weightless_caching_attributes.hpp +41 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/translate_session.cpp +317 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/translate_session.h +28 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/utils.cpp +257 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/utils.h +86 -0
- data/ext/sources/ggml/src/ggml-openvino/utils.cpp +880 -0
- data/ext/sources/ggml/src/ggml-openvino/utils.h +143 -0
- data/ext/sources/ggml/src/ggml-opt.cpp +1 -0
- data/ext/sources/ggml/src/ggml-quants.c +385 -119
- data/ext/sources/ggml/src/ggml-quants.h +6 -0
- data/ext/sources/ggml/src/ggml-rpc/CMakeLists.txt +24 -0
- data/ext/sources/ggml/src/ggml-rpc/ggml-rpc.cpp +167 -311
- data/ext/sources/ggml/src/ggml-rpc/transport.cpp +683 -0
- data/ext/sources/ggml/src/ggml-rpc/transport.h +34 -0
- data/ext/sources/ggml/src/ggml-sycl/CMakeLists.txt +64 -91
- data/ext/sources/ggml/src/ggml-sycl/add-id.cpp +5 -1
- data/ext/sources/ggml/src/ggml-sycl/backend.hpp +4 -1
- data/ext/sources/ggml/src/ggml-sycl/binbcast.cpp +21 -20
- data/ext/sources/ggml/src/ggml-sycl/common.cpp +74 -2
- data/ext/sources/ggml/src/ggml-sycl/common.hpp +356 -11
- data/ext/sources/ggml/src/ggml-sycl/convert.cpp +184 -14
- data/ext/sources/ggml/src/ggml-sycl/convert.hpp +31 -1
- data/ext/sources/ggml/src/ggml-sycl/count-equal.cpp +1 -1
- data/ext/sources/ggml/src/ggml-sycl/cumsum.cpp +148 -0
- data/ext/sources/ggml/src/ggml-sycl/cumsum.hpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/dequantize.hpp +663 -0
- data/ext/sources/ggml/src/ggml-sycl/diag.cpp +67 -0
- data/ext/sources/ggml/src/ggml-sycl/diag.hpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/dmmv.cpp +586 -6
- data/ext/sources/ggml/src/ggml-sycl/dpct/helper.hpp +791 -47
- data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +77 -156
- data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +2 -2
- data/ext/sources/ggml/src/ggml-sycl/fattn-buffers.cpp +56 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn-buffers.hpp +63 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn-common.hpp +1181 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn-tile.cpp +59 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn-tile.hpp +1246 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn-vec.hpp +674 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn.cpp +227 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn.hpp +22 -0
- data/ext/sources/ggml/src/ggml-sycl/fill.cpp +55 -0
- data/ext/sources/ggml/src/ggml-sycl/fill.hpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/gated_delta_net.cpp +347 -0
- data/ext/sources/ggml/src/ggml-sycl/gated_delta_net.hpp +9 -0
- data/ext/sources/ggml/src/ggml-sycl/gemm.hpp +3 -0
- data/ext/sources/ggml/src/ggml-sycl/getrows.cpp +79 -3
- data/ext/sources/ggml/src/ggml-sycl/ggml-sycl.cpp +1134 -236
- data/ext/sources/ggml/src/ggml-sycl/im2col.cpp +353 -89
- data/ext/sources/ggml/src/ggml-sycl/im2col.hpp +5 -3
- data/ext/sources/ggml/src/ggml-sycl/mmvq.cpp +1344 -26
- data/ext/sources/ggml/src/ggml-sycl/mmvq.hpp +16 -0
- data/ext/sources/ggml/src/ggml-sycl/norm.cpp +65 -66
- data/ext/sources/ggml/src/ggml-sycl/outprod.cpp +3 -3
- data/ext/sources/ggml/src/ggml-sycl/pad.cpp +27 -27
- data/ext/sources/ggml/src/ggml-sycl/presets.hpp +3 -0
- data/ext/sources/ggml/src/ggml-sycl/quants.hpp +72 -1
- data/ext/sources/ggml/src/ggml-sycl/rope.cpp +450 -287
- data/ext/sources/ggml/src/ggml-sycl/rope.hpp +6 -0
- data/ext/sources/ggml/src/ggml-sycl/set_rows.cpp +7 -1
- data/ext/sources/ggml/src/ggml-sycl/softmax.cpp +6 -6
- data/ext/sources/ggml/src/ggml-sycl/solve_tri.cpp +172 -0
- data/ext/sources/ggml/src/ggml-sycl/solve_tri.hpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/ssm_conv.cpp +6 -1
- data/ext/sources/ggml/src/ggml-sycl/ssm_scan.cpp +156 -0
- data/ext/sources/ggml/src/ggml-sycl/ssm_scan.hpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/sycl_hw.cpp +62 -10
- data/ext/sources/ggml/src/ggml-sycl/sycl_hw.hpp +18 -6
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq112-dv112.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq128-dv128.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq256-dv256.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq40-dv40.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq512-dv512.cpp +6 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq576-dv512.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq64-dv64.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq72-dv72.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq80-dv80.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq96-dv96.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-f16.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_0.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_1.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_0.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_1.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q8_0.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-f16.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_0.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_1.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_0.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_1.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q8_0.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-f16.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_0.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_1.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_0.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_1.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q8_0.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-f16.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_0.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_1.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_0.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_1.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q8_0.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-f16.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_0.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_1.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_0.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_1.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q8_0.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-f16.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_0.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_1.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_0.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_1.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q8_0.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/type.hpp +112 -0
- data/ext/sources/ggml/src/ggml-sycl/upscale.cpp +410 -0
- data/ext/sources/ggml/src/ggml-sycl/upscale.hpp +9 -0
- data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +228 -53
- data/ext/sources/ggml/src/ggml-sycl/wkv.cpp +1 -1
- data/ext/sources/ggml/src/ggml-virtgpu/CMakeLists.txt +70 -0
- data/ext/sources/ggml/src/ggml-virtgpu/apir_cs_ggml-rpc-front.cpp +87 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/CMakeLists.txt +21 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/apir_cs_ggml-rpc-back.cpp +115 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-convert.h +13 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-backend.cpp +102 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer-type.cpp +105 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer.cpp +179 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-device.cpp +148 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched.cpp +51 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched.gen.h +73 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched.h +27 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-virgl-apir.h +32 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend.cpp +144 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/api_remoting.h +95 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_backend.gen.h +94 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_backend.h +50 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_cs.h +378 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_cs_ggml.h +232 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_cs_rpc.h +58 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-buffer-type.cpp +81 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-buffer.cpp +123 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-device.cpp +160 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-reg.cpp +213 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend.cpp +71 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-remoting.h +71 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggmlremoting_functions.yaml +166 -0
- data/ext/sources/ggml/src/ggml-virtgpu/include/apir_hw.h +9 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-apir.h +15 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-backend.cpp +58 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-buffer-type.cpp +110 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-buffer.cpp +173 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-device.cpp +192 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-impl.h +36 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward.gen.h +53 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-shm.cpp +99 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-shm.h +23 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-utils.cpp +179 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-utils.h +86 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu.cpp +545 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu.h +115 -0
- data/ext/sources/ggml/src/ggml-vulkan/CMakeLists.txt +12 -1
- data/ext/sources/ggml/src/ggml-vulkan/ggml-vulkan.cpp +3250 -940
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/acc.comp +16 -8
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/contig_copy.comp +6 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +146 -13
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy.comp +3 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +25 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.glsl +88 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.glsl +643 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_nvfp4.comp +32 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q1_0.comp +29 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/diag.comp +0 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dot_product_funcs.glsl +27 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/elu.comp +27 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/exp.comp +0 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/feature-tests/coopmat2_decode_vector.comp +7 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +533 -180
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.glsl +113 -68
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +412 -222
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +222 -83
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_dequant.glsl +131 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_mask_opt.comp +162 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_mmq_funcs.glsl +203 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +9 -8
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/fwht.comp +115 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gated_delta_net.comp +189 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/generic_binary_head.glsl +0 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.glsl +10 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_main.glsl +16 -6
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +76 -54
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col_3d.comp +0 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/l2_norm.comp +12 -9
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/log.comp +0 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp +122 -27
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.glsl +20 -17
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iface.glsl +6 -6
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp +1 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq_funcs.glsl +88 -55
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +22 -20
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +51 -14
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.glsl +159 -125
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_id_funcs.glsl +3 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +5 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.glsl +8 -8
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_shmem_types.glsl +24 -9
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp +0 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +2 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl +39 -63
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.glsl +0 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +7 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +7 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +7 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl +13 -7
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_vision.comp +7 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sgn.comp +21 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/snake.comp +49 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/ssm_conv.comp +27 -11
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/tri.comp +0 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/types.glsl +79 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +193 -149
- data/ext/sources/ggml/src/ggml-webgpu/CMakeLists.txt +5 -2
- data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp +3221 -97
- data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu.cpp +3493 -1997
- data/ext/sources/ggml/src/ggml-webgpu/pre_wgsl.hpp +37 -7
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/add_id.wgsl +64 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/argmax.wgsl +72 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/argsort.wgsl +106 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/argsort_merge.wgsl +134 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary.wgsl +142 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +115 -141
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/concat.wgsl +93 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/conv2d.wgsl +165 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{cpy.tmpl.wgsl → cpy.wgsl} +25 -44
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cumsum.wgsl +66 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn.wgsl +198 -230
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_quant_staging.tmpl +124 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_tile.wgsl +397 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_vec_blk.wgsl +101 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_vec_reduce.wgsl +84 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_vec_split.wgsl +619 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/gated_delta_net.wgsl +149 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{get_rows.tmpl.wgsl → get_rows.wgsl} +234 -335
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/glu.wgsl +155 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/im2col.wgsl +101 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl +871 -42
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_id.wgsl +195 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_id_gather.wgsl +52 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_id_vec.wgsl +154 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.wgsl +149 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{mul_mat_subgroup_matrix.tmpl.wgsl → mul_mat_subgroup_matrix.wgsl} +36 -138
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.wgsl +151 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec_acc.tmpl +1432 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec_q_acc.tmpl +303 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/pad.wgsl +86 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/quant_inner_loops.tmpl +21 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/quantize_q8.wgsl +173 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/repeat.wgsl +67 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm_mul.wgsl +152 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{rope.tmpl.wgsl → rope.wgsl} +71 -142
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/row_norm.wgsl +153 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{scale.tmpl.wgsl → scale.wgsl} +15 -40
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set.wgsl +109 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +39 -12
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows_quant.wgsl +224 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{soft_max.tmpl.wgsl → soft_max.wgsl} +106 -206
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/solve_tri.wgsl +121 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/ssm_conv.wgsl +65 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/ssm_scan.wgsl +193 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/sum_rows.wgsl +55 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/unary.wgsl +213 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/upscale.wgsl +240 -0
- data/ext/sources/ggml/src/ggml-zdnn/ggml-zdnn.cpp +24 -15
- data/ext/sources/ggml/src/ggml-zendnn/CMakeLists.txt +31 -32
- data/ext/sources/ggml/src/ggml-zendnn/ggml-zendnn.cpp +253 -16
- data/ext/sources/ggml/src/ggml.c +268 -52
- data/ext/sources/ggml/src/gguf.cpp +377 -47
- data/ext/sources/include/parakeet.h +342 -0
- data/ext/sources/include/whisper.h +10 -0
- data/ext/sources/media/matmul.png +0 -0
- data/ext/sources/src/CMakeLists.txt +23 -0
- data/ext/sources/src/parakeet-arch.h +188 -0
- data/ext/sources/src/parakeet.cpp +3838 -0
- data/ext/sources/src/whisper.cpp +62 -40
- data/extsources.rb +26 -10
- data/lib/whisper/log_settable.rb +36 -0
- data/lib/whisper/model/uri.rb +13 -1
- data/lib/whisper/output.rb +74 -0
- data/sig/whisper.rbs +445 -55
- data/test/helper.rb +2 -0
- data/test/jfk_reader/jfk_reader.c +50 -7
- data/test/test_callback.rb +1 -0
- data/test/test_context_params.rb +82 -0
- data/test/test_package.rb +6 -5
- data/test/test_parakeet.rb +28 -0
- data/test/test_parakeet_callback.rb +107 -0
- data/test/test_parakeet_context.rb +116 -0
- data/test/test_parakeet_context_params.rb +24 -0
- data/test/test_parakeet_model.rb +21 -0
- data/test/test_parakeet_params.rb +78 -0
- data/test/test_parakeet_segment.rb +42 -0
- data/test/test_parakeet_token.rb +73 -0
- data/test/test_params.rb +2 -0
- data/test/test_token.rb +11 -0
- data/test/test_vad_context.rb +58 -8
- data/test/test_vad_segment.rb +1 -1
- data/test/test_whisper.rb +44 -6
- data/whispercpp.gemspec +2 -2
- metadata +426 -280
- data/ext/sources/bindings/javascript/CMakeLists.txt +0 -41
- data/ext/sources/bindings/javascript/emscripten.cpp +0 -93
- data/ext/sources/bindings/javascript/libwhisper.worker.js +0 -1
- data/ext/sources/bindings/javascript/package.json +0 -26
- data/ext/sources/bindings/javascript/whisper.js +0 -19
- data/ext/sources/examples/addon.node/CMakeLists.txt +0 -31
- data/ext/sources/examples/addon.node/__test__/whisper.spec.js +0 -133
- data/ext/sources/examples/addon.node/addon.cpp +0 -557
- data/ext/sources/examples/addon.node/index.js +0 -59
- data/ext/sources/examples/addon.node/package.json +0 -16
- data/ext/sources/examples/addon.node/vad-example.js +0 -132
- data/ext/sources/examples/bench.wasm/CMakeLists.txt +0 -49
- data/ext/sources/examples/bench.wasm/emscripten.cpp +0 -87
- data/ext/sources/examples/bench.wasm/index-tmpl.html +0 -285
- data/ext/sources/examples/coi-serviceworker.js +0 -146
- data/ext/sources/examples/command/CMakeLists.txt +0 -10
- data/ext/sources/examples/command/command.cpp +0 -802
- data/ext/sources/examples/command/commands.txt +0 -9
- data/ext/sources/examples/command.wasm/CMakeLists.txt +0 -50
- data/ext/sources/examples/command.wasm/emscripten.cpp +0 -327
- data/ext/sources/examples/command.wasm/index-tmpl.html +0 -415
- data/ext/sources/examples/generate-karaoke.sh +0 -57
- data/ext/sources/examples/helpers.js +0 -191
- data/ext/sources/examples/livestream.sh +0 -112
- data/ext/sources/examples/lsp/CMakeLists.txt +0 -10
- data/ext/sources/examples/lsp/lsp.cpp +0 -471
- data/ext/sources/examples/lsp/whisper.vim +0 -362
- data/ext/sources/examples/python/test_whisper_processor.py +0 -7
- data/ext/sources/examples/python/whisper_processor.py +0 -54
- data/ext/sources/examples/server/bench.js +0 -29
- data/ext/sources/examples/server.py +0 -120
- data/ext/sources/examples/stream/CMakeLists.txt +0 -10
- data/ext/sources/examples/stream/stream.cpp +0 -437
- data/ext/sources/examples/stream.wasm/CMakeLists.txt +0 -49
- data/ext/sources/examples/stream.wasm/emscripten.cpp +0 -216
- data/ext/sources/examples/stream.wasm/index-tmpl.html +0 -491
- data/ext/sources/examples/sycl/CMakeLists.txt +0 -9
- data/ext/sources/examples/sycl/build.sh +0 -22
- data/ext/sources/examples/sycl/ls-sycl-device.cpp +0 -11
- data/ext/sources/examples/sycl/run-whisper.sh +0 -17
- data/ext/sources/examples/talk-llama/CMakeLists.txt +0 -47
- data/ext/sources/examples/talk-llama/eleven-labs.py +0 -80
- data/ext/sources/examples/talk-llama/llama-adapter.cpp +0 -494
- data/ext/sources/examples/talk-llama/llama-adapter.h +0 -88
- data/ext/sources/examples/talk-llama/llama-arch.cpp +0 -2559
- data/ext/sources/examples/talk-llama/llama-arch.h +0 -586
- data/ext/sources/examples/talk-llama/llama-batch.cpp +0 -917
- data/ext/sources/examples/talk-llama/llama-batch.h +0 -173
- data/ext/sources/examples/talk-llama/llama-chat.cpp +0 -876
- data/ext/sources/examples/talk-llama/llama-chat.h +0 -70
- data/ext/sources/examples/talk-llama/llama-context.cpp +0 -3645
- data/ext/sources/examples/talk-llama/llama-context.h +0 -360
- data/ext/sources/examples/talk-llama/llama-cparams.cpp +0 -5
- data/ext/sources/examples/talk-llama/llama-cparams.h +0 -42
- data/ext/sources/examples/talk-llama/llama-grammar.cpp +0 -1464
- data/ext/sources/examples/talk-llama/llama-grammar.h +0 -194
- data/ext/sources/examples/talk-llama/llama-graph.cpp +0 -2282
- data/ext/sources/examples/talk-llama/llama-graph.h +0 -910
- data/ext/sources/examples/talk-llama/llama-hparams.cpp +0 -241
- data/ext/sources/examples/talk-llama/llama-hparams.h +0 -284
- data/ext/sources/examples/talk-llama/llama-impl.cpp +0 -171
- data/ext/sources/examples/talk-llama/llama-impl.h +0 -63
- data/ext/sources/examples/talk-llama/llama-io.cpp +0 -15
- data/ext/sources/examples/talk-llama/llama-io.h +0 -35
- data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.cpp +0 -328
- data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.h +0 -137
- data/ext/sources/examples/talk-llama/llama-kv-cache.cpp +0 -2100
- data/ext/sources/examples/talk-llama/llama-kv-cache.h +0 -390
- data/ext/sources/examples/talk-llama/llama-kv-cells.h +0 -533
- data/ext/sources/examples/talk-llama/llama-memory-hybrid.cpp +0 -268
- data/ext/sources/examples/talk-llama/llama-memory-hybrid.h +0 -139
- data/ext/sources/examples/talk-llama/llama-memory-recurrent.cpp +0 -1167
- data/ext/sources/examples/talk-llama/llama-memory-recurrent.h +0 -182
- data/ext/sources/examples/talk-llama/llama-memory.cpp +0 -59
- data/ext/sources/examples/talk-llama/llama-memory.h +0 -122
- data/ext/sources/examples/talk-llama/llama-mmap.cpp +0 -735
- data/ext/sources/examples/talk-llama/llama-mmap.h +0 -73
- data/ext/sources/examples/talk-llama/llama-model-loader.cpp +0 -1247
- data/ext/sources/examples/talk-llama/llama-model-loader.h +0 -176
- data/ext/sources/examples/talk-llama/llama-model-saver.cpp +0 -285
- data/ext/sources/examples/talk-llama/llama-model-saver.h +0 -37
- data/ext/sources/examples/talk-llama/llama-model.cpp +0 -8338
- data/ext/sources/examples/talk-llama/llama-model.h +0 -544
- data/ext/sources/examples/talk-llama/llama-quant.cpp +0 -1072
- data/ext/sources/examples/talk-llama/llama-quant.h +0 -1
- data/ext/sources/examples/talk-llama/llama-sampling.cpp +0 -3771
- data/ext/sources/examples/talk-llama/llama-sampling.h +0 -44
- data/ext/sources/examples/talk-llama/llama-vocab.cpp +0 -3900
- data/ext/sources/examples/talk-llama/llama-vocab.h +0 -182
- data/ext/sources/examples/talk-llama/llama.cpp +0 -1140
- data/ext/sources/examples/talk-llama/llama.h +0 -1540
- data/ext/sources/examples/talk-llama/models/afmoe.cpp +0 -191
- data/ext/sources/examples/talk-llama/models/apertus.cpp +0 -125
- data/ext/sources/examples/talk-llama/models/arcee.cpp +0 -135
- data/ext/sources/examples/talk-llama/models/arctic.cpp +0 -138
- data/ext/sources/examples/talk-llama/models/arwkv7.cpp +0 -86
- data/ext/sources/examples/talk-llama/models/baichuan.cpp +0 -122
- data/ext/sources/examples/talk-llama/models/bailingmoe.cpp +0 -144
- data/ext/sources/examples/talk-llama/models/bailingmoe2.cpp +0 -135
- data/ext/sources/examples/talk-llama/models/bert.cpp +0 -178
- data/ext/sources/examples/talk-llama/models/bitnet.cpp +0 -160
- data/ext/sources/examples/talk-llama/models/bloom.cpp +0 -101
- data/ext/sources/examples/talk-llama/models/chameleon.cpp +0 -178
- data/ext/sources/examples/talk-llama/models/chatglm.cpp +0 -132
- data/ext/sources/examples/talk-llama/models/codeshell.cpp +0 -111
- data/ext/sources/examples/talk-llama/models/cogvlm.cpp +0 -102
- data/ext/sources/examples/talk-llama/models/cohere2-iswa.cpp +0 -134
- data/ext/sources/examples/talk-llama/models/command-r.cpp +0 -122
- data/ext/sources/examples/talk-llama/models/dbrx.cpp +0 -123
- data/ext/sources/examples/talk-llama/models/deci.cpp +0 -135
- data/ext/sources/examples/talk-llama/models/deepseek.cpp +0 -144
- data/ext/sources/examples/talk-llama/models/deepseek2.cpp +0 -259
- data/ext/sources/examples/talk-llama/models/dots1.cpp +0 -134
- data/ext/sources/examples/talk-llama/models/dream.cpp +0 -105
- data/ext/sources/examples/talk-llama/models/ernie4-5-moe.cpp +0 -150
- data/ext/sources/examples/talk-llama/models/ernie4-5.cpp +0 -110
- data/ext/sources/examples/talk-llama/models/exaone.cpp +0 -114
- data/ext/sources/examples/talk-llama/models/exaone4.cpp +0 -123
- data/ext/sources/examples/talk-llama/models/falcon-h1.cpp +0 -113
- data/ext/sources/examples/talk-llama/models/falcon.cpp +0 -120
- data/ext/sources/examples/talk-llama/models/gemma-embedding.cpp +0 -116
- data/ext/sources/examples/talk-llama/models/gemma.cpp +0 -112
- data/ext/sources/examples/talk-llama/models/gemma2-iswa.cpp +0 -128
- data/ext/sources/examples/talk-llama/models/gemma3.cpp +0 -155
- data/ext/sources/examples/talk-llama/models/gemma3n-iswa.cpp +0 -384
- data/ext/sources/examples/talk-llama/models/glm4-moe.cpp +0 -170
- data/ext/sources/examples/talk-llama/models/glm4.cpp +0 -150
- data/ext/sources/examples/talk-llama/models/gpt2.cpp +0 -105
- data/ext/sources/examples/talk-llama/models/gptneox.cpp +0 -144
- data/ext/sources/examples/talk-llama/models/granite-hybrid.cpp +0 -196
- data/ext/sources/examples/talk-llama/models/granite.cpp +0 -211
- data/ext/sources/examples/talk-llama/models/graph-context-mamba.cpp +0 -283
- data/ext/sources/examples/talk-llama/models/grok.cpp +0 -159
- data/ext/sources/examples/talk-llama/models/grovemoe.cpp +0 -141
- data/ext/sources/examples/talk-llama/models/hunyuan-dense.cpp +0 -132
- data/ext/sources/examples/talk-llama/models/hunyuan-moe.cpp +0 -154
- data/ext/sources/examples/talk-llama/models/internlm2.cpp +0 -120
- data/ext/sources/examples/talk-llama/models/jais.cpp +0 -86
- data/ext/sources/examples/talk-llama/models/jamba.cpp +0 -106
- data/ext/sources/examples/talk-llama/models/lfm2.cpp +0 -175
- data/ext/sources/examples/talk-llama/models/llada-moe.cpp +0 -122
- data/ext/sources/examples/talk-llama/models/llada.cpp +0 -99
- data/ext/sources/examples/talk-llama/models/llama-iswa.cpp +0 -178
- data/ext/sources/examples/talk-llama/models/llama.cpp +0 -168
- data/ext/sources/examples/talk-llama/models/maincoder.cpp +0 -117
- data/ext/sources/examples/talk-llama/models/mamba.cpp +0 -55
- data/ext/sources/examples/talk-llama/models/mimo2-iswa.cpp +0 -123
- data/ext/sources/examples/talk-llama/models/minicpm3.cpp +0 -199
- data/ext/sources/examples/talk-llama/models/minimax-m2.cpp +0 -124
- data/ext/sources/examples/talk-llama/models/mistral3.cpp +0 -160
- data/ext/sources/examples/talk-llama/models/models.h +0 -569
- data/ext/sources/examples/talk-llama/models/modern-bert.cpp +0 -116
- data/ext/sources/examples/talk-llama/models/mpt.cpp +0 -126
- data/ext/sources/examples/talk-llama/models/nemotron-h.cpp +0 -150
- data/ext/sources/examples/talk-llama/models/nemotron.cpp +0 -122
- data/ext/sources/examples/talk-llama/models/neo-bert.cpp +0 -104
- data/ext/sources/examples/talk-llama/models/olmo.cpp +0 -121
- data/ext/sources/examples/talk-llama/models/olmo2.cpp +0 -150
- data/ext/sources/examples/talk-llama/models/olmoe.cpp +0 -124
- data/ext/sources/examples/talk-llama/models/openai-moe-iswa.cpp +0 -127
- data/ext/sources/examples/talk-llama/models/openelm.cpp +0 -124
- data/ext/sources/examples/talk-llama/models/orion.cpp +0 -123
- data/ext/sources/examples/talk-llama/models/pangu-embedded.cpp +0 -121
- data/ext/sources/examples/talk-llama/models/phi2.cpp +0 -121
- data/ext/sources/examples/talk-llama/models/phi3.cpp +0 -152
- data/ext/sources/examples/talk-llama/models/plamo.cpp +0 -110
- data/ext/sources/examples/talk-llama/models/plamo2.cpp +0 -316
- data/ext/sources/examples/talk-llama/models/plamo3.cpp +0 -128
- data/ext/sources/examples/talk-llama/models/plm.cpp +0 -168
- data/ext/sources/examples/talk-llama/models/qwen.cpp +0 -108
- data/ext/sources/examples/talk-llama/models/qwen2.cpp +0 -126
- data/ext/sources/examples/talk-llama/models/qwen2moe.cpp +0 -151
- data/ext/sources/examples/talk-llama/models/qwen2vl.cpp +0 -117
- data/ext/sources/examples/talk-llama/models/qwen3.cpp +0 -117
- data/ext/sources/examples/talk-llama/models/qwen3moe.cpp +0 -124
- data/ext/sources/examples/talk-llama/models/qwen3next.cpp +0 -873
- data/ext/sources/examples/talk-llama/models/qwen3vl-moe.cpp +0 -149
- data/ext/sources/examples/talk-llama/models/qwen3vl.cpp +0 -141
- data/ext/sources/examples/talk-llama/models/refact.cpp +0 -94
- data/ext/sources/examples/talk-llama/models/rnd1.cpp +0 -126
- data/ext/sources/examples/talk-llama/models/rwkv6-base.cpp +0 -162
- data/ext/sources/examples/talk-llama/models/rwkv6.cpp +0 -94
- data/ext/sources/examples/talk-llama/models/rwkv6qwen2.cpp +0 -86
- data/ext/sources/examples/talk-llama/models/rwkv7-base.cpp +0 -135
- data/ext/sources/examples/talk-llama/models/rwkv7.cpp +0 -90
- data/ext/sources/examples/talk-llama/models/seed-oss.cpp +0 -124
- data/ext/sources/examples/talk-llama/models/smallthinker.cpp +0 -126
- data/ext/sources/examples/talk-llama/models/smollm3.cpp +0 -128
- data/ext/sources/examples/talk-llama/models/stablelm.cpp +0 -146
- data/ext/sources/examples/talk-llama/models/starcoder.cpp +0 -100
- data/ext/sources/examples/talk-llama/models/starcoder2.cpp +0 -121
- data/ext/sources/examples/talk-llama/models/t5-dec.cpp +0 -166
- data/ext/sources/examples/talk-llama/models/t5-enc.cpp +0 -96
- data/ext/sources/examples/talk-llama/models/wavtokenizer-dec.cpp +0 -149
- data/ext/sources/examples/talk-llama/models/xverse.cpp +0 -108
- data/ext/sources/examples/talk-llama/prompts/talk-alpaca.txt +0 -23
- data/ext/sources/examples/talk-llama/speak +0 -40
- data/ext/sources/examples/talk-llama/speak.bat +0 -1
- data/ext/sources/examples/talk-llama/speak.ps1 +0 -14
- data/ext/sources/examples/talk-llama/talk-llama.cpp +0 -813
- data/ext/sources/examples/talk-llama/unicode-data.cpp +0 -7034
- data/ext/sources/examples/talk-llama/unicode-data.h +0 -20
- data/ext/sources/examples/talk-llama/unicode.cpp +0 -1147
- data/ext/sources/examples/talk-llama/unicode.h +0 -111
- data/ext/sources/examples/wchess/CMakeLists.txt +0 -10
- data/ext/sources/examples/wchess/libwchess/CMakeLists.txt +0 -19
- data/ext/sources/examples/wchess/libwchess/Chessboard.cpp +0 -803
- data/ext/sources/examples/wchess/libwchess/Chessboard.h +0 -33
- data/ext/sources/examples/wchess/libwchess/WChess.cpp +0 -193
- data/ext/sources/examples/wchess/libwchess/WChess.h +0 -63
- data/ext/sources/examples/wchess/libwchess/test-chessboard.cpp +0 -117
- data/ext/sources/examples/wchess/wchess.cmd/CMakeLists.txt +0 -8
- data/ext/sources/examples/wchess/wchess.cmd/wchess.cmd.cpp +0 -253
- data/ext/sources/examples/whisper.wasm/CMakeLists.txt +0 -50
- data/ext/sources/examples/whisper.wasm/emscripten.cpp +0 -118
- data/ext/sources/examples/whisper.wasm/index-tmpl.html +0 -659
- data/ext/sources/ggml/cmake/BuildTypes.cmake +0 -54
- data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm-ppc.h +0 -333
- data/ext/sources/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +0 -99
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp-dma.h +0 -157
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp-msg.h +0 -165
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-exp.c +0 -94
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-inverse.c +0 -72
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sigmoid.c +0 -49
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-utils.c +0 -1020
- data/ext/sources/ggml/src/ggml-hexagon/htp/ops-utils.h +0 -149
- data/ext/sources/ggml/src/ggml-hexagon/htp-utils.c +0 -454
- data/ext/sources/ggml/src/ggml-hexagon/htp-utils.h +0 -221
- data/ext/sources/ggml/src/ggml-hexagon/op-desc.h +0 -153
- data/ext/sources/ggml/src/ggml-opencl/kernels/embed_kernel.py +0 -26
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rte.glsl +0 -5
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/bin_op.tmpl.wgsl +0 -188
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary_head.tmpl +0 -45
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +0 -147
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/glu.tmpl.wgsl +0 -323
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl +0 -907
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.tmpl.wgsl +0 -247
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.tmpl.wgsl +0 -267
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm.wgsl +0 -123
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.tmpl.wgsl +0 -112
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/unary_op.wgsl +0 -483
- data/ext/sources/tests/CMakeLists.txt +0 -112
- data/ext/sources/tests/earnings21/eval.mk +0 -58
- data/ext/sources/tests/earnings21/eval.py +0 -68
- data/ext/sources/tests/earnings21/normalizers/__init__.py +0 -2
- data/ext/sources/tests/earnings21/normalizers/basic.py +0 -80
- data/ext/sources/tests/earnings21/normalizers/english.json +0 -1741
- data/ext/sources/tests/earnings21/normalizers/english.py +0 -550
- data/ext/sources/tests/earnings21/requirements.txt +0 -6
- data/ext/sources/tests/en-0-ref.txt +0 -1
- data/ext/sources/tests/en-1-ref.txt +0 -1
- data/ext/sources/tests/en-2-ref.txt +0 -1
- data/ext/sources/tests/es-0-ref.txt +0 -1
- data/ext/sources/tests/librispeech/eval.mk +0 -39
- data/ext/sources/tests/librispeech/eval.py +0 -47
- data/ext/sources/tests/librispeech/normalizers/__init__.py +0 -2
- data/ext/sources/tests/librispeech/normalizers/basic.py +0 -80
- data/ext/sources/tests/librispeech/normalizers/english.json +0 -1741
- data/ext/sources/tests/librispeech/normalizers/english.py +0 -550
- data/ext/sources/tests/librispeech/requirements.txt +0 -6
- data/ext/sources/tests/run-tests.sh +0 -130
- data/ext/sources/tests/test-c.c +0 -3
- data/ext/sources/tests/test-vad-full.cpp +0 -56
- data/ext/sources/tests/test-vad.cpp +0 -83
- data/ext/sources/tests/test-whisper.js +0 -58
- data/lib/whisper/context.rb +0 -15
- data/lib/whisper/segment.rb +0 -58
|
@@ -28,6 +28,7 @@
|
|
|
28
28
|
#define QK8_0 32
|
|
29
29
|
#define QR8_0 1
|
|
30
30
|
#define QK_K 256
|
|
31
|
+
#define K_SCALE_SIZE (3 * QK_K / 64)
|
|
31
32
|
#define K_QUANTS_PER_ITERATION 2
|
|
32
33
|
|
|
33
34
|
typedef char int8_t;
|
|
@@ -46,6 +47,118 @@ struct block_q4_0
|
|
|
46
47
|
uint8_t qs[QK4_0 / 2];
|
|
47
48
|
};
|
|
48
49
|
|
|
50
|
+
//------------------------------------------------------------------------------
|
|
51
|
+
// block_q4_1
|
|
52
|
+
//------------------------------------------------------------------------------
|
|
53
|
+
struct block_q4_1 {
|
|
54
|
+
half d; // delta
|
|
55
|
+
half m; // min
|
|
56
|
+
uchar qs[QK4_1 / 2]; // nibbles / quants
|
|
57
|
+
};
|
|
58
|
+
|
|
59
|
+
//------------------------------------------------------------------------------
|
|
60
|
+
// block_q5_0
|
|
61
|
+
//------------------------------------------------------------------------------
|
|
62
|
+
struct block_q5_0 {
|
|
63
|
+
half d; // delta
|
|
64
|
+
uchar qh[4]; // 5-th bit of quants
|
|
65
|
+
uchar qs[QK5_0 / 2]; // nibbles / quants
|
|
66
|
+
};
|
|
67
|
+
|
|
68
|
+
//------------------------------------------------------------------------------
|
|
69
|
+
// block_q5_1
|
|
70
|
+
//------------------------------------------------------------------------------
|
|
71
|
+
struct block_q5_1 {
|
|
72
|
+
half d; // delta
|
|
73
|
+
half m; // min
|
|
74
|
+
uchar qh[4]; // 5-th bit of quants
|
|
75
|
+
uchar qs[QK5_1 / 2]; // nibbles / quants
|
|
76
|
+
};
|
|
77
|
+
|
|
78
|
+
//------------------------------------------------------------------------------
|
|
79
|
+
// block_q4_k
|
|
80
|
+
//------------------------------------------------------------------------------
|
|
81
|
+
struct block_q4_K {
|
|
82
|
+
half d; // delta
|
|
83
|
+
half dm; // min
|
|
84
|
+
uchar s[K_SCALE_SIZE];
|
|
85
|
+
uchar q[QK_K / 2]; // nibbles / quants
|
|
86
|
+
};
|
|
87
|
+
|
|
88
|
+
//------------------------------------------------------------------------------
|
|
89
|
+
// block_q5_k
|
|
90
|
+
//------------------------------------------------------------------------------
|
|
91
|
+
struct block_q5_K {
|
|
92
|
+
half d; // delta
|
|
93
|
+
half dm; // min
|
|
94
|
+
uchar s[K_SCALE_SIZE];
|
|
95
|
+
uchar qh[QK_K / 8];
|
|
96
|
+
uchar qs[QK_K / 2]; // nibbles / quants
|
|
97
|
+
};
|
|
98
|
+
|
|
99
|
+
//------------------------------------------------------------------------------
|
|
100
|
+
// block_q6_K
|
|
101
|
+
//------------------------------------------------------------------------------
|
|
102
|
+
struct block_q6_K {
|
|
103
|
+
uint8_t ql[QK_K/2]; // quants, lower 4 bits
|
|
104
|
+
uint8_t qh[QK_K/4]; // quants, upper 2 bits
|
|
105
|
+
int8_t scales[QK_K/16]; // scales, quantized with 8 bits
|
|
106
|
+
half d; // super-block scale
|
|
107
|
+
};
|
|
108
|
+
|
|
109
|
+
//------------------------------------------------------------------------------
|
|
110
|
+
// block_iq4_nl
|
|
111
|
+
//------------------------------------------------------------------------------
|
|
112
|
+
#define QK4_NL 32
|
|
113
|
+
|
|
114
|
+
struct block_iq4_nl
|
|
115
|
+
{
|
|
116
|
+
half d;
|
|
117
|
+
uint8_t qs[QK4_NL / 2];
|
|
118
|
+
};
|
|
119
|
+
|
|
120
|
+
//------------------------------------------------------------------------------
|
|
121
|
+
// bf16 to f16
|
|
122
|
+
//------------------------------------------------------------------------------
|
|
123
|
+
kernel void kernel_convert_bf16_to_f16(
|
|
124
|
+
global const ushort * src,
|
|
125
|
+
global half * dst,
|
|
126
|
+
ulong off_dst,
|
|
127
|
+
ulong n
|
|
128
|
+
) {
|
|
129
|
+
uint i = get_global_id(0);
|
|
130
|
+
if (i >= n) {
|
|
131
|
+
return;
|
|
132
|
+
}
|
|
133
|
+
|
|
134
|
+
dst[i + off_dst] = (half) as_float((uint) src[i] << 16);
|
|
135
|
+
}
|
|
136
|
+
|
|
137
|
+
//------------------------------------------------------------------------------
|
|
138
|
+
// f16 to bf16
|
|
139
|
+
//------------------------------------------------------------------------------
|
|
140
|
+
kernel void kernel_convert_f16_to_bf16(
|
|
141
|
+
global const half * src,
|
|
142
|
+
ulong off_src,
|
|
143
|
+
global ushort * dst,
|
|
144
|
+
ulong n
|
|
145
|
+
) {
|
|
146
|
+
uint i = get_global_id(0);
|
|
147
|
+
if (i >= n) {
|
|
148
|
+
return;
|
|
149
|
+
}
|
|
150
|
+
|
|
151
|
+
float f = (float) src[i + off_src];
|
|
152
|
+
uint bits = as_uint(f);
|
|
153
|
+
if ((bits & 0x7fffffffu) > 0x7f800000u) {
|
|
154
|
+
// nan to quiet nan
|
|
155
|
+
dst[i] = (ushort)((bits >> 16) | 0x40u);
|
|
156
|
+
} else {
|
|
157
|
+
uint rounded = bits + 0x7fffu + ((bits >> 16) & 1u);
|
|
158
|
+
dst[i] = (ushort)(rounded >> 16);
|
|
159
|
+
}
|
|
160
|
+
}
|
|
161
|
+
|
|
49
162
|
//------------------------------------------------------------------------------
|
|
50
163
|
// kernel_convert_block_q4_0
|
|
51
164
|
// Convert the block_q4_0 format to 2 separate arrays (AOS -> SOA).
|
|
@@ -138,76 +251,248 @@ kernel void kernel_restore_block_q4_0_noshuffle(
|
|
|
138
251
|
}
|
|
139
252
|
}
|
|
140
253
|
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
254
|
+
kernel void kernel_convert_block_q4_0_trans4_ns(
|
|
255
|
+
global struct block_q4_0 * src0,
|
|
256
|
+
__global uint * dst_q,
|
|
257
|
+
__global half * dst_d,
|
|
258
|
+
uint ne00,
|
|
259
|
+
uint ne01
|
|
260
|
+
) {
|
|
261
|
+
uint i00 = get_global_id(1);
|
|
262
|
+
uint i01 = get_global_id(0);
|
|
263
|
+
uint i02 = get_global_id(2);
|
|
264
|
+
|
|
265
|
+
if (i01 >= ne01) {
|
|
266
|
+
return;
|
|
267
|
+
}
|
|
268
|
+
|
|
269
|
+
uint ne00_blk = ne00 / QK4_0;
|
|
270
|
+
uint src_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01;
|
|
271
|
+
uint dst_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;
|
|
272
|
+
|
|
273
|
+
global struct block_q4_0 * b = src0 + src_blk_offset;
|
|
274
|
+
dst_d[dst_blk_offset] = b->d;
|
|
275
|
+
|
|
276
|
+
// extract quantization and unshuffle
|
|
277
|
+
ushort8 pre_block = ((global ushort8 *)(&(b->qs[0])))[0];
|
|
278
|
+
|
|
279
|
+
ushort8 post_block = (ushort8)(0);
|
|
280
|
+
|
|
281
|
+
uchar * pre_block_ptr = (uchar *)(&pre_block);
|
|
282
|
+
uchar * post_block_ptr = (uchar *)(&post_block);
|
|
283
|
+
|
|
284
|
+
for (int i = 0; i < QK4_0 / 4; ++i) {
|
|
285
|
+
uchar x0 = pre_block_ptr[2*i + 0];
|
|
286
|
+
uchar x1 = pre_block_ptr[2*i + 1];
|
|
287
|
+
|
|
288
|
+
post_block_ptr[i + 0 ] = convert_uchar(x0 & 0x0F) | convert_uchar((x1 & 0x0F) << 4);
|
|
289
|
+
post_block_ptr[i + QK4_0 / 4] = convert_uchar((x0 & 0xF0) >> 4) | convert_uchar(x1 & 0xF0);
|
|
290
|
+
}
|
|
291
|
+
|
|
292
|
+
uint4 q_block = as_uint4(post_block);
|
|
293
|
+
|
|
294
|
+
uint offset = i02 * ne00_blk * ne01 * 4 + i00 * ne01 * 4 + i01;
|
|
295
|
+
dst_q[offset] = q_block.x;
|
|
296
|
+
dst_q[offset + ne01] = q_block.y;
|
|
297
|
+
dst_q[offset + ne01 * 2] = q_block.z;
|
|
298
|
+
dst_q[offset + ne01 * 3] = q_block.w;
|
|
299
|
+
}
|
|
300
|
+
|
|
301
|
+
kernel void kernel_restore_block_q4_0_trans4_ns(
|
|
302
|
+
__global uint * src_q,
|
|
303
|
+
__global half * src_d,
|
|
304
|
+
__global struct block_q4_0 * dst0,
|
|
305
|
+
uint ne00,
|
|
306
|
+
uint ne01
|
|
307
|
+
) {
|
|
308
|
+
uint i00 = get_global_id(1);
|
|
309
|
+
uint i01 = get_global_id(0);
|
|
310
|
+
uint i02 = get_global_id(2);
|
|
311
|
+
|
|
312
|
+
if (i01 >= ne01) {
|
|
313
|
+
return;
|
|
314
|
+
}
|
|
315
|
+
|
|
316
|
+
uint ne00_blk = ne00 / QK4_0;
|
|
317
|
+
uint dst_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01;
|
|
318
|
+
uint src_d_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;
|
|
319
|
+
|
|
320
|
+
__global struct block_q4_0 * b = dst0 + dst_blk_offset;
|
|
321
|
+
b->d = src_d[src_d_offset];
|
|
322
|
+
|
|
323
|
+
// collect transposed quantization parts for a block
|
|
324
|
+
uint src_q_offset = i02 * ne00_blk * ne01 * 4 + i00 * ne01 * 4 + i01;
|
|
325
|
+
uint4 q_block;
|
|
326
|
+
q_block.x = src_q[src_q_offset];
|
|
327
|
+
q_block.y = src_q[src_q_offset + ne01];
|
|
328
|
+
q_block.z = src_q[src_q_offset + ne01 * 2];
|
|
329
|
+
q_block.w = src_q[src_q_offset + ne01 * 3];
|
|
330
|
+
|
|
331
|
+
ushort8 post_block = as_ushort8(q_block);
|
|
332
|
+
ushort8 pre_block = (ushort8)(0);
|
|
333
|
+
|
|
334
|
+
uchar * pre_block_ptr = (uchar *)(&pre_block);
|
|
335
|
+
uchar * post_block_ptr = (uchar *)(&post_block);
|
|
336
|
+
|
|
337
|
+
for (int i = 0; i < QK4_0 / 4; ++i) {
|
|
338
|
+
uchar x0 = post_block_ptr[i + 0];
|
|
339
|
+
uchar x1 = post_block_ptr[i + QK4_0 / 4];
|
|
340
|
+
|
|
341
|
+
pre_block_ptr[2 * i + 0] = convert_uchar(x0 & 0x0F) | convert_uchar((x1 & 0x0F) << 4);
|
|
342
|
+
pre_block_ptr[2 * i + 1] = convert_uchar((x0 & 0xF0) >> 4) | convert_uchar(x1 & 0xF0);
|
|
343
|
+
}
|
|
344
|
+
|
|
345
|
+
((__global ushort8 *)(&(b->qs[0])))[0] = pre_block;
|
|
346
|
+
}
|
|
149
347
|
|
|
150
348
|
//------------------------------------------------------------------------------
|
|
151
|
-
//
|
|
152
|
-
// Convert the
|
|
349
|
+
// kernel_convert_block_q4_1
|
|
350
|
+
// Convert the block_q4_1 format to 2 separate arrays (AOS -> SOA).
|
|
153
351
|
// This kernel does not deshuffle the bits.
|
|
154
352
|
//------------------------------------------------------------------------------
|
|
155
|
-
kernel void
|
|
156
|
-
global struct
|
|
353
|
+
kernel void kernel_convert_block_q4_1(
|
|
354
|
+
global struct block_q4_1 * src0,
|
|
157
355
|
global uchar * dst_q,
|
|
158
|
-
global
|
|
356
|
+
global half * dst_d,
|
|
357
|
+
global half * dst_m
|
|
159
358
|
) {
|
|
160
|
-
global struct
|
|
161
|
-
global uchar * q = (global uchar *) dst_q +
|
|
162
|
-
global
|
|
359
|
+
global struct block_q4_1 * b = (global struct block_q4_1 *) src0 + get_global_id(0);
|
|
360
|
+
global uchar * q = (global uchar *) dst_q + QK4_1/2*get_global_id(0);
|
|
361
|
+
global half * d = (global half *) dst_d + get_global_id(0);
|
|
362
|
+
global half * m = (global half *) dst_m + get_global_id(0);
|
|
163
363
|
|
|
164
|
-
*
|
|
364
|
+
*d = b->d;
|
|
365
|
+
*m = b->m;
|
|
165
366
|
|
|
166
|
-
for (int i = 0; i <
|
|
367
|
+
for (int i = 0; i < QK4_1/2; ++i) {
|
|
167
368
|
q[i] = b->qs[i];
|
|
168
369
|
}
|
|
169
370
|
}
|
|
170
371
|
|
|
171
|
-
kernel void
|
|
172
|
-
global
|
|
173
|
-
|
|
174
|
-
|
|
372
|
+
kernel void kernel_restore_block_q4_1(
|
|
373
|
+
global uchar * src_q,
|
|
374
|
+
global half * src_d,
|
|
375
|
+
global half * src_m,
|
|
376
|
+
global struct block_q4_1 * dst
|
|
377
|
+
) {
|
|
378
|
+
global struct block_q4_1 * b = (global struct block_q4_1 *) dst + get_global_id(0);
|
|
379
|
+
global uchar * q = (global uchar *) src_q + QK4_1/2*get_global_id(0);
|
|
380
|
+
global half * d = (global half *) src_d + get_global_id(0);
|
|
381
|
+
global half * m = (global half *) src_m + get_global_id(0);
|
|
382
|
+
|
|
383
|
+
b->d = *d;
|
|
384
|
+
b->m = *m;
|
|
385
|
+
for (int i = 0; i < QK4_1/2; ++i) {
|
|
386
|
+
b->qs[i] = q[i];
|
|
387
|
+
}
|
|
388
|
+
}
|
|
389
|
+
|
|
390
|
+
kernel void kernel_convert_block_q4_1_noshuffle(
|
|
391
|
+
global struct block_q4_1 * src0,
|
|
392
|
+
global uchar * dst_q,
|
|
393
|
+
global half * dst_d,
|
|
394
|
+
global half * dst_m
|
|
395
|
+
) {
|
|
396
|
+
global struct block_q4_1 * b = (global struct block_q4_1 *) src0 + get_global_id(0);
|
|
397
|
+
global uchar * q = (global uchar *) dst_q + QK4_1/2*get_global_id(0);
|
|
398
|
+
global half * d = (global half *) dst_d + get_global_id(0);
|
|
399
|
+
global half * m = (global half *) dst_m + get_global_id(0);
|
|
400
|
+
|
|
401
|
+
*d = b->d;
|
|
402
|
+
*m = b->m;
|
|
403
|
+
for (int i = 0; i < QK4_1/4; ++i) {
|
|
404
|
+
uchar x0 = b->qs[2*i + 0];
|
|
405
|
+
uchar x1 = b->qs[2*i + 1];
|
|
406
|
+
|
|
407
|
+
q[i + 0 ] = convert_uchar(x0 & 0x0F) | convert_uchar((x1 & 0x0F) << 4);
|
|
408
|
+
q[i + QK4_1/4] = convert_uchar((x0 & 0xF0) >> 4) | convert_uchar(x1 & 0xF0);
|
|
409
|
+
|
|
410
|
+
#ifdef ADRENO_GPU
|
|
411
|
+
if (get_global_id(0) == 65536*4096) {
|
|
412
|
+
printf("%04x - %02x\n", *(global ushort*)d, ((x0 & 0xF0) >> 4) | (x1 & 0xF0));
|
|
413
|
+
}
|
|
414
|
+
#endif
|
|
415
|
+
}
|
|
416
|
+
}
|
|
417
|
+
|
|
418
|
+
kernel void kernel_restore_block_q4_1_noshuffle(
|
|
419
|
+
global uchar * src_q,
|
|
420
|
+
global half * src_d,
|
|
421
|
+
global half * src_m,
|
|
422
|
+
global struct block_q4_1 * dst,
|
|
423
|
+
uchar mask_0F,
|
|
424
|
+
uchar mask_F0
|
|
425
|
+
) {
|
|
426
|
+
global struct block_q4_1 * b = (global struct block_q4_1 *) dst + get_global_id(0);
|
|
427
|
+
global uchar * q = (global uchar *) src_q + QK4_1/2*get_global_id(0);
|
|
428
|
+
global half * d = (global half *) src_d + get_global_id(0);
|
|
429
|
+
global half * m = (global half *) src_m + get_global_id(0);
|
|
430
|
+
|
|
431
|
+
b->d = *d;
|
|
432
|
+
b->m = *m;
|
|
433
|
+
for (int i = 0; i < QK4_1/4; ++i) {
|
|
434
|
+
uchar x0 = q[i + 0 ] ;
|
|
435
|
+
uchar x1 = q[i + QK4_1/4];
|
|
436
|
+
|
|
437
|
+
b->qs[2*i + 0] = convert_uchar((x0 & mask_0F) | ((x1 & mask_0F) << 4));
|
|
438
|
+
b->qs[2*i + 1] = convert_uchar(((x0 & mask_F0) >> 4) | (x1 & mask_F0));
|
|
439
|
+
}
|
|
440
|
+
}
|
|
441
|
+
|
|
442
|
+
kernel void kernel_convert_block_q4_1_trans4_ns(
|
|
443
|
+
__global struct block_q4_1 * src0,
|
|
444
|
+
__global uint * dst_q,
|
|
445
|
+
__global half * dst_d,
|
|
446
|
+
__global half * dst_m,
|
|
175
447
|
uint ne00,
|
|
176
448
|
uint ne01
|
|
177
449
|
) {
|
|
178
|
-
|
|
450
|
+
uint i00 = get_global_id(1);
|
|
179
451
|
uint i01 = get_global_id(0);
|
|
180
452
|
uint i02 = get_global_id(2);
|
|
181
453
|
|
|
182
|
-
|
|
454
|
+
if (i01 >= ne01) {
|
|
455
|
+
return;
|
|
456
|
+
}
|
|
457
|
+
|
|
458
|
+
uint ne00_blk = ne00 / QK4_1;
|
|
183
459
|
uint src_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01;
|
|
184
460
|
uint dst_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;
|
|
185
461
|
|
|
186
|
-
global struct
|
|
462
|
+
global struct block_q4_1 * b = src0 + src_blk_offset;
|
|
463
|
+
dst_d[dst_blk_offset] = b->d;
|
|
464
|
+
dst_m[dst_blk_offset] = b->m;
|
|
187
465
|
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
}
|
|
466
|
+
// extract quantization and unshuffle
|
|
467
|
+
ushort8 pre_block = ((global ushort8 *)(&(b->qs[0])))[0];
|
|
191
468
|
|
|
192
|
-
|
|
193
|
-
global uchar * src_q,
|
|
194
|
-
global half * src_e,
|
|
195
|
-
global struct block_mxfp4 * dst
|
|
196
|
-
) {
|
|
197
|
-
global struct block_mxfp4 * b = (global struct block_mxfp4 *) dst + get_global_id(0);
|
|
198
|
-
global uchar * q = (global uchar *) src_q + QK_MXFP4 / 2 * get_global_id(0);
|
|
199
|
-
global uchar * e = (global uchar *) src_e + get_global_id(0);
|
|
469
|
+
ushort8 post_block = (ushort8)(0);
|
|
200
470
|
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
471
|
+
uchar * pre_block_ptr = (uchar *)(&pre_block);
|
|
472
|
+
uchar * post_block_ptr = (uchar *)(&post_block);
|
|
473
|
+
|
|
474
|
+
for (int i = 0; i < QK4_1 / 4; ++i) {
|
|
475
|
+
uchar x0 = pre_block_ptr[2*i + 0];
|
|
476
|
+
uchar x1 = pre_block_ptr[2*i + 1];
|
|
477
|
+
|
|
478
|
+
post_block_ptr[i + 0 ] = convert_uchar(x0 & 0x0F) | convert_uchar((x1 & 0x0F) << 4);
|
|
479
|
+
post_block_ptr[i + QK4_1 / 4] = convert_uchar((x0 & 0xF0) >> 4) | convert_uchar(x1 & 0xF0);
|
|
204
480
|
}
|
|
481
|
+
|
|
482
|
+
uint4 q_block = as_uint4(post_block);
|
|
483
|
+
|
|
484
|
+
uint offset = i02 * ne00_blk * ne01 * 4 + i00 * ne01 * 4 + i01;
|
|
485
|
+
dst_q[offset] = q_block.x;
|
|
486
|
+
dst_q[offset + ne01] = q_block.y;
|
|
487
|
+
dst_q[offset + ne01 * 2] = q_block.z;
|
|
488
|
+
dst_q[offset + ne01 * 3] = q_block.w;
|
|
205
489
|
}
|
|
206
490
|
|
|
207
|
-
kernel void
|
|
208
|
-
__global
|
|
209
|
-
__global
|
|
210
|
-
|
|
491
|
+
kernel void kernel_restore_block_q4_1_trans4_ns(
|
|
492
|
+
__global uint * src_q,
|
|
493
|
+
__global half * src_d,
|
|
494
|
+
__global half * src_m,
|
|
495
|
+
__global struct block_q4_1 * dst0,
|
|
211
496
|
uint ne00,
|
|
212
497
|
uint ne01
|
|
213
498
|
) {
|
|
@@ -215,51 +500,1677 @@ kernel void kernel_restore_block_mxfp4_trans(
|
|
|
215
500
|
uint i01 = get_global_id(0);
|
|
216
501
|
uint i02 = get_global_id(2);
|
|
217
502
|
|
|
218
|
-
|
|
219
|
-
|
|
503
|
+
if (i01 >= ne01) {
|
|
504
|
+
return;
|
|
505
|
+
}
|
|
506
|
+
|
|
507
|
+
uint ne00_blk = ne00 / QK4_1;
|
|
220
508
|
uint dst_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01;
|
|
509
|
+
uint src_dm_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;
|
|
221
510
|
|
|
222
|
-
|
|
511
|
+
__global struct block_q4_1 * b = dst0 + dst_blk_offset;
|
|
512
|
+
b->d = src_d[src_dm_offset];
|
|
513
|
+
b->m = src_m[src_dm_offset];
|
|
223
514
|
|
|
224
|
-
|
|
225
|
-
|
|
515
|
+
// collect transposed quantization parts for a block
|
|
516
|
+
uint src_q_offset = i02 * ne00_blk * ne01 * 4 + i00 * ne01 * 4 + i01;
|
|
517
|
+
uint4 q_block;
|
|
518
|
+
q_block.x = src_q[src_q_offset];
|
|
519
|
+
q_block.y = src_q[src_q_offset + ne01];
|
|
520
|
+
q_block.z = src_q[src_q_offset + ne01 * 2];
|
|
521
|
+
q_block.w = src_q[src_q_offset + ne01 * 3];
|
|
522
|
+
|
|
523
|
+
ushort8 post_block = as_ushort8(q_block);
|
|
524
|
+
ushort8 pre_block = (ushort8)(0);
|
|
525
|
+
|
|
526
|
+
uchar * pre_block_ptr = (uchar *)(&pre_block);
|
|
527
|
+
uchar * post_block_ptr = (uchar *)(&post_block);
|
|
528
|
+
|
|
529
|
+
for (int i = 0; i < QK4_0 / 4; ++i) {
|
|
530
|
+
uchar x0 = post_block_ptr[i + 0];
|
|
531
|
+
uchar x1 = post_block_ptr[i + QK4_0 / 4];
|
|
532
|
+
|
|
533
|
+
pre_block_ptr[2 * i + 0] = convert_uchar(x0 & 0x0F) | convert_uchar((x1 & 0x0F) << 4);
|
|
534
|
+
pre_block_ptr[2 * i + 1] = convert_uchar((x0 & 0xF0) >> 4) | convert_uchar(x1 & 0xF0);
|
|
535
|
+
}
|
|
536
|
+
|
|
537
|
+
((__global ushort8 *)(&(b->qs[0])))[0] = pre_block;
|
|
226
538
|
}
|
|
227
539
|
|
|
228
540
|
//------------------------------------------------------------------------------
|
|
229
|
-
//
|
|
541
|
+
// kernel_convert_block_q5_0
|
|
542
|
+
// Convert the block_q5_0 format to 3 separate arrays (AOS -> SOA).
|
|
543
|
+
// This kernel does not deshuffle the bits.
|
|
230
544
|
//------------------------------------------------------------------------------
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
545
|
+
kernel void kernel_convert_block_q5_0(
|
|
546
|
+
global struct block_q5_0 * src0,
|
|
547
|
+
global uchar * dst_qs,
|
|
548
|
+
global uint * dst_qh,
|
|
549
|
+
global half * dst_d,
|
|
550
|
+
ulong n_blk
|
|
551
|
+
) {
|
|
552
|
+
if (get_global_id(0) >= n_blk) {
|
|
553
|
+
return;
|
|
554
|
+
}
|
|
235
555
|
|
|
236
|
-
|
|
237
|
-
global
|
|
556
|
+
global struct block_q5_0 * b = (global struct block_q5_0 *) src0 + get_global_id(0);
|
|
557
|
+
global uchar * qs = (global uchar *) dst_qs + (QK5_0/2)*get_global_id(0);
|
|
558
|
+
global uint * qh = (global uint *) dst_qh + get_global_id(0);
|
|
559
|
+
global half * d = (global half *) dst_d + get_global_id(0);
|
|
560
|
+
|
|
561
|
+
*d = b->d;
|
|
562
|
+
*qh = *((global uint *)(b->qh));
|
|
563
|
+
|
|
564
|
+
for (int i = 0; i < QK5_0/2; ++i) {
|
|
565
|
+
qs[i] = b->qs[i];
|
|
566
|
+
}
|
|
567
|
+
}
|
|
568
|
+
|
|
569
|
+
kernel void kernel_restore_block_q5_0(
|
|
570
|
+
global uchar * src_qs,
|
|
571
|
+
global uint * src_qh,
|
|
572
|
+
global half * src_d,
|
|
573
|
+
global struct block_q5_0 * dst
|
|
574
|
+
) {
|
|
575
|
+
global struct block_q5_0 * b = (global struct block_q5_0 *) dst + get_global_id(0);
|
|
576
|
+
global uchar * qs = (global uchar *) src_qs + (QK5_0/2)*get_global_id(0);
|
|
577
|
+
global uint * qh = (global uint *) src_qh + get_global_id(0);
|
|
578
|
+
global half * d = (global half *) src_d + get_global_id(0);
|
|
579
|
+
|
|
580
|
+
b->d = *d;
|
|
581
|
+
*((global uint *)(b->qh)) = *qh;
|
|
582
|
+
for (int i = 0; i < QK5_0/2; ++i) {
|
|
583
|
+
b->qs[i] = qs[i];
|
|
584
|
+
}
|
|
585
|
+
}
|
|
586
|
+
|
|
587
|
+
kernel void kernel_convert_block_q5_0_noshuffle(
|
|
588
|
+
global struct block_q5_0 * src0,
|
|
238
589
|
global uchar * dst_q,
|
|
590
|
+
global uint * dst_qh,
|
|
239
591
|
global half * dst_d
|
|
240
592
|
) {
|
|
241
|
-
global
|
|
242
|
-
global uchar
|
|
243
|
-
global
|
|
593
|
+
global struct block_q5_0 * b = (global struct block_q5_0 *) src0 + get_global_id(0);
|
|
594
|
+
global uchar * q = (global uchar *) dst_q + QK5_0/2*get_global_id(0);
|
|
595
|
+
global uint * qh = (global uint *) dst_qh + get_global_id(0);
|
|
596
|
+
global half * d = (global half *) dst_d + get_global_id(0);
|
|
244
597
|
|
|
245
598
|
*d = b->d;
|
|
599
|
+
*qh = *((global uint *)(b->qh));
|
|
246
600
|
|
|
247
|
-
for (int i = 0; i <
|
|
248
|
-
|
|
601
|
+
for (int i = 0; i < QK5_0/4; ++i) {
|
|
602
|
+
uchar x0 = b->qs[2*i + 0];
|
|
603
|
+
uchar x1 = b->qs[2*i + 1];
|
|
604
|
+
|
|
605
|
+
q[i + 0 ] = convert_uchar(x0 & 0x0F) | convert_uchar((x1 & 0x0F) << 4);
|
|
606
|
+
q[i + QK5_0/4] = convert_uchar((x0 & 0xF0) >> 4) | convert_uchar(x1 & 0xF0);
|
|
607
|
+
|
|
608
|
+
#ifdef ADRENO_GPU
|
|
609
|
+
if (get_global_id(0) == 65536*4096) {
|
|
610
|
+
printf("%04x - %02x\n", *(global ushort*)d, ((x0 & 0xF0) >> 4) | (x1 & 0xF0));
|
|
611
|
+
}
|
|
612
|
+
#endif
|
|
249
613
|
}
|
|
250
614
|
}
|
|
251
615
|
|
|
252
|
-
kernel void
|
|
616
|
+
kernel void kernel_restore_block_q5_0_noshuffle(
|
|
253
617
|
global uchar * src_q,
|
|
618
|
+
global uint * src_qh,
|
|
254
619
|
global half * src_d,
|
|
255
|
-
global
|
|
620
|
+
global struct block_q5_0 * dst,
|
|
621
|
+
uchar mask_0F,
|
|
622
|
+
uchar mask_F0
|
|
256
623
|
) {
|
|
257
|
-
global
|
|
258
|
-
global uchar
|
|
259
|
-
global
|
|
624
|
+
global struct block_q5_0 * b = (global struct block_q5_0 *) dst + get_global_id(0);
|
|
625
|
+
global uchar * q = (global uchar *) src_q + QK5_0/2*get_global_id(0);
|
|
626
|
+
global uint * qh = (global uint *) src_qh + get_global_id(0);
|
|
627
|
+
global half * d = (global half *) src_d + get_global_id(0);
|
|
260
628
|
|
|
261
629
|
b->d = *d;
|
|
262
|
-
|
|
263
|
-
|
|
630
|
+
*((global uint *)(b->qh)) = *qh;
|
|
631
|
+
|
|
632
|
+
for (int i = 0; i < QK5_0/4; ++i) {
|
|
633
|
+
uchar x0 = q[i + 0 ];
|
|
634
|
+
uchar x1 = q[i + QK5_0/4];
|
|
635
|
+
|
|
636
|
+
b->qs[2*i + 0] = convert_uchar((x0 & mask_0F) | ((x1 & mask_0F) << 4));
|
|
637
|
+
b->qs[2*i + 1] = convert_uchar(((x0 & mask_F0) >> 4) | (x1 & mask_F0));
|
|
638
|
+
}
|
|
639
|
+
}
|
|
640
|
+
|
|
641
|
+
kernel void kernel_convert_block_q5_0_trans4_ns(
|
|
642
|
+
__global struct block_q5_0 * src0,
|
|
643
|
+
__global uint * dst_qs,
|
|
644
|
+
__global uint * dst_qh,
|
|
645
|
+
__global half * dst_d,
|
|
646
|
+
uint ne00,
|
|
647
|
+
uint ne01
|
|
648
|
+
) {
|
|
649
|
+
uint i00 = get_global_id(1);
|
|
650
|
+
uint i01 = get_global_id(0);
|
|
651
|
+
uint i02 = get_global_id(2);
|
|
652
|
+
|
|
653
|
+
if (i01 >= ne01) {
|
|
654
|
+
return;
|
|
655
|
+
}
|
|
656
|
+
|
|
657
|
+
uint ne00_blk = ne00 / QK5_0;
|
|
658
|
+
uint src_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01;
|
|
659
|
+
uint dst_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;
|
|
660
|
+
|
|
661
|
+
global struct block_q5_0 * b = src0 + src_blk_offset;
|
|
662
|
+
dst_d[dst_blk_offset] = b->d;
|
|
663
|
+
|
|
664
|
+
dst_qh[dst_blk_offset] = ((global uint *)(&(b->qh[0])))[0];
|
|
665
|
+
|
|
666
|
+
// extract quantization and unshuffle
|
|
667
|
+
ushort8 pre_block = ((global ushort8 *)(&(b->qs[0])))[0];
|
|
668
|
+
ushort8 post_block = (ushort8)(0);
|
|
669
|
+
|
|
670
|
+
uchar * pre_block_ptr = (uchar *)(&pre_block);
|
|
671
|
+
uchar * post_block_ptr = (uchar *)(&post_block);
|
|
672
|
+
|
|
673
|
+
for (int i = 0; i < QK5_0 / 4; ++i) {
|
|
674
|
+
uchar x0 = pre_block_ptr[2*i + 0];
|
|
675
|
+
uchar x1 = pre_block_ptr[2*i + 1];
|
|
676
|
+
|
|
677
|
+
post_block_ptr[i + 0 ] = convert_uchar(x0 & 0x0F) | convert_uchar((x1 & 0x0F) << 4);
|
|
678
|
+
post_block_ptr[i + QK5_0 / 4] = convert_uchar((x0 & 0xF0) >> 4) | convert_uchar(x1 & 0xF0);
|
|
679
|
+
}
|
|
680
|
+
|
|
681
|
+
uint4 q_block = as_uint4(post_block);
|
|
682
|
+
|
|
683
|
+
uint offset = i02 * ne00_blk * ne01 * 4 + i00 * ne01 * 4 + i01;
|
|
684
|
+
dst_qs[offset] = q_block.x;
|
|
685
|
+
dst_qs[offset + ne01] = q_block.y;
|
|
686
|
+
dst_qs[offset + ne01 * 2] = q_block.z;
|
|
687
|
+
dst_qs[offset + ne01 * 3] = q_block.w;
|
|
688
|
+
}
|
|
689
|
+
|
|
690
|
+
kernel void kernel_restore_block_q5_0_trans4_ns(
|
|
691
|
+
__global uint * src_qs,
|
|
692
|
+
__global uint * src_qh,
|
|
693
|
+
__global half * src_d,
|
|
694
|
+
__global struct block_q5_0 * dst0,
|
|
695
|
+
uint ne00,
|
|
696
|
+
uint ne01
|
|
697
|
+
) {
|
|
698
|
+
int i00 = get_global_id(1);
|
|
699
|
+
uint i01 = get_global_id(0);
|
|
700
|
+
uint i02 = get_global_id(2);
|
|
701
|
+
|
|
702
|
+
if (i01 >= ne01) {
|
|
703
|
+
return;
|
|
704
|
+
}
|
|
705
|
+
|
|
706
|
+
uint ne00_blk = ne00 / QK5_0;
|
|
707
|
+
uint dst_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01;
|
|
708
|
+
uint src_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;
|
|
709
|
+
|
|
710
|
+
__global struct block_q5_0 * b = dst0 + dst_blk_offset;
|
|
711
|
+
b->d = src_d[src_blk_offset];
|
|
712
|
+
|
|
713
|
+
((__global uint *)(&(b->qh[0])))[0] = src_qh[src_blk_offset];
|
|
714
|
+
|
|
715
|
+
// collect transposed quantization parts for a block
|
|
716
|
+
uint src_q_offset = i02 * ne00_blk * ne01 * 4 + i00 * ne01 * 4 + i01;
|
|
717
|
+
uint4 q_block;
|
|
718
|
+
q_block.x = src_qs[src_q_offset];
|
|
719
|
+
q_block.y = src_qs[src_q_offset + ne01];
|
|
720
|
+
q_block.z = src_qs[src_q_offset + ne01 * 2];
|
|
721
|
+
q_block.w = src_qs[src_q_offset + ne01 * 3];
|
|
722
|
+
|
|
723
|
+
ushort8 post_block = as_ushort8(q_block);
|
|
724
|
+
ushort8 pre_block = (ushort8)(0);
|
|
725
|
+
|
|
726
|
+
uchar * pre_block_ptr = (uchar *)(&pre_block);
|
|
727
|
+
uchar * post_block_ptr = (uchar *)(&post_block);
|
|
728
|
+
|
|
729
|
+
for (int i = 0; i < QK5_0 / 4; ++i) {
|
|
730
|
+
uchar x0 = post_block_ptr[i + 0];
|
|
731
|
+
uchar x1 = post_block_ptr[i + QK5_0 / 4];
|
|
732
|
+
|
|
733
|
+
pre_block_ptr[2 * i + 0] = convert_uchar(x0 & 0x0F) | convert_uchar((x1 & 0x0F) << 4);
|
|
734
|
+
pre_block_ptr[2 * i + 1] = convert_uchar((x0 & 0xF0) >> 4) | convert_uchar(x1 & 0xF0);
|
|
735
|
+
}
|
|
736
|
+
|
|
737
|
+
((__global ushort8 *)(&(b->qs[0])))[0] = pre_block;
|
|
738
|
+
}
|
|
739
|
+
|
|
740
|
+
//------------------------------------------------------------------------------
|
|
741
|
+
// kernel_convert_block_q5_1
|
|
742
|
+
// Convert the block_q5_1 format to 4 separate arrays (AOS -> SOA).
|
|
743
|
+
// This kernel does not deshuffle the bits.
|
|
744
|
+
//------------------------------------------------------------------------------
|
|
745
|
+
// One work-item per block: scatters the fields of block number get_global_id(0)
// into the four destination arrays.
//   src0   - source blocks
//   dst_qs - QK5_1/2 quant bytes per block, copied unchanged
//   dst_qh - the four qh bytes of each block, moved as one 32-bit word
//   dst_d  - per-block d
//   dst_m  - per-block m
//   n_blk  - number of blocks; work-items at or beyond it do nothing
kernel void kernel_convert_block_q5_1(
    global struct block_q5_1 * src0,
    global uchar * dst_qs,
    global uint * dst_qh,
    global half * dst_d,
    global half * dst_m,
    ulong n_blk
) {
    const size_t blk = get_global_id(0);

    if (blk >= n_blk) {
        return;
    }

    global struct block_q5_1 * in = src0 + blk;

    dst_d[blk]  = in->d;
    dst_m[blk]  = in->m;
    dst_qh[blk] = *((global uint *)(in->qh));

    global uchar * out = dst_qs + (QK5_1/2)*blk;
    for (int k = 0; k < QK5_1/2; ++k) {
        out[k] = in->qs[k];
    }
}
|
|
771
|
+
|
|
772
|
+
// Inverse of the plain q5_1 split: one work-item per block gathers d, m, the
// 32-bit qh word and QK5_1/2 quant bytes from the separate arrays and writes
// them into dst[get_global_id(0)].
// There is no block-count argument, so no bounds check is performed here.
kernel void kernel_restore_block_q5_1(
    global uchar * src_qs,
    global uint * src_qh,
    global half * src_d,
    global half * src_m,
    global struct block_q5_1 * dst
) {
    const size_t blk = get_global_id(0);

    global struct block_q5_1 * out = dst + blk;

    out->d = src_d[blk];
    out->m = src_m[blk];
    *((global uint *)(out->qh)) = src_qh[blk];

    global uchar * in = src_qs + (QK5_1/2)*blk;
    for (int k = 0; k < QK5_1/2; ++k) {
        out->qs[k] = in[k];
    }
}
|
|
792
|
+
|
|
793
|
+
// One work-item per block: copies d, m and qh to their own arrays and
// regroups the quant nibbles. For each byte pair (qs[2k], qs[2k+1]) the two
// low nibbles are packed into output byte k and the two high nibbles into
// output byte k + QK5_1/4.
kernel void kernel_convert_block_q5_1_noshuffle(
    global struct block_q5_1 * src0,
    global uchar * dst_q,
    global uint * dst_qh,
    global half * dst_d,
    global half * dst_m
) {
    const size_t blk = get_global_id(0);

    global struct block_q5_1 * in  = src0 + blk;
    global uchar             * out = dst_q + QK5_1/2*blk;
    global half              * d   = dst_d + blk;

    *d          = in->d;
    dst_m[blk]  = in->m;
    dst_qh[blk] = *((global uint *)(in->qh));

    for (int k = 0; k < QK5_1/4; ++k) {
        uchar x0 = in->qs[2*k + 0];
        uchar x1 = in->qs[2*k + 1];

        out[k]           = convert_uchar(x0 & 0x0F) | convert_uchar((x1 & 0x0F) << 4);
        out[k + QK5_1/4] = convert_uchar((x0 & 0xF0) >> 4) | convert_uchar(x1 & 0xF0);

#ifdef ADRENO_GPU
        // NOTE(review): this branch only fires for work-item 65536*4096 and the
        // printf does not change any output. Presumably a deliberate Adreno
        // compiler workaround rather than leftover debugging - confirm before
        // removing it.
        if (get_global_id(0) == 65536*4096) {
            printf("%04x - %02x\n", *(global ushort*)d, ((x0 & 0xF0) >> 4) | (x1 & 0xF0));
        }
#endif
    }
}
|
|
824
|
+
|
|
825
|
+
// One work-item per block: rebuilds a block_q5_1 from the regrouped layout.
// Input byte k holds two low nibbles and byte k + QK5_1/4 the matching two
// high nibbles; they are interleaved back into qs[2k] and qs[2k+1].
// mask_0F / mask_F0 are the nibble masks, supplied by the caller as kernel
// arguments rather than written as literals.
kernel void kernel_restore_block_q5_1_noshuffle(
    global uchar * src_q,
    global uint * src_qh,
    global half * src_d,
    global half * src_m,
    global struct block_q5_1 * dst,
    uchar mask_0F,
    uchar mask_F0
) {
    const size_t blk = get_global_id(0);

    global struct block_q5_1 * out = dst + blk;
    global uchar             * in  = src_q + QK5_1/2*blk;

    out->d = src_d[blk];
    out->m = src_m[blk];
    *((global uint *)(out->qh)) = src_qh[blk];

    for (int k = 0; k < QK5_1/4; ++k) {
        const uchar lows  = in[k];
        const uchar highs = in[k + QK5_1/4];

        out->qs[2*k + 0] = convert_uchar((lows & mask_0F) | ((highs & mask_0F) << 4));
        out->qs[2*k + 1] = convert_uchar(((lows & mask_F0) >> 4) | (highs & mask_F0));
    }
}
|
|
852
|
+
|
|
853
|
+
// 3-D launch: dimension 0 indexes i01 (bounded by ne01), dimension 1 the block
// within a row (ne00 / QK5_1 blocks per row), dimension 2 the outer index.
// Reads block (i00, i01, i02) from the row-major source and writes
//   - d, m and the 32-bit qh word at the transposed index i01 + i00*ne01,
//   - the 16 regrouped quant bytes as four uints, each ne01 elements apart.
kernel void kernel_convert_block_q5_1_trans4_ns(
    __global struct block_q5_1 * src0,
    __global uint * dst_qs,
    __global uint * dst_qh,
    __global half * dst_d,
    __global half * dst_m,
    uint ne00,
    uint ne01
) {
    const uint blk_k = get_global_id(1);
    const uint row   = get_global_id(0);
    const uint outer = get_global_id(2);

    if (row >= ne01) {
        return;
    }

    const uint blks_per_row = ne00 / QK5_1;
    const uint plane        = outer * blks_per_row * ne01;
    const uint in_idx       = blk_k + row * blks_per_row + plane;
    const uint out_idx      = row + blk_k * ne01 + plane;

    global struct block_q5_1 * blk = src0 + in_idx;

    dst_d[out_idx]  = blk->d;
    dst_m[out_idx]  = blk->m;
    dst_qh[out_idx] = ((global uint *)(&(blk->qh[0])))[0];

    // Pull the 16 quant bytes into private memory, then split nibbles:
    // low nibbles of each byte pair go to the first 8 bytes, high nibbles to
    // the last 8.
    ushort8 raw       = ((global ushort8 *)(&(blk->qs[0])))[0];
    ushort8 regrouped = (ushort8)(0);

    uchar * in_bytes  = (uchar *)(&raw);
    uchar * out_bytes = (uchar *)(&regrouped);

    for (int k = 0; k < QK5_1 / 4; ++k) {
        const uchar a = in_bytes[2*k + 0];
        const uchar b = in_bytes[2*k + 1];

        out_bytes[k]             = convert_uchar(a & 0x0F) | convert_uchar((b & 0x0F) << 4);
        out_bytes[k + QK5_1 / 4] = convert_uchar((a & 0xF0) >> 4) | convert_uchar(b & 0xF0);
    }

    const uint4 words = as_uint4(regrouped);

    const uint first = plane * 4 + blk_k * ne01 * 4 + row;
    dst_qs[first]            = words.x;
    dst_qs[first + ne01]     = words.y;
    dst_qs[first + ne01 * 2] = words.z;
    dst_qs[first + ne01 * 3] = words.w;
}
|
|
903
|
+
|
|
904
|
+
// Inverse of kernel_convert_block_q5_1_trans4_ns.
// 3-D launch: dimension 0 indexes i01 (bounded by ne01), dimension 1 the block
// within a row (ne00 / QK5_1 blocks per row), dimension 2 the outer index.
// Reads d, m and qh from the transposed index i01 + i00*ne01, collects the
// four quant words stored ne01 elements apart, re-interleaves their nibbles
// and writes the block back at its row-major position in dst0.
kernel void kernel_restore_block_q5_1_trans4_ns(
    __global uint * src_qs,
    __global uint * src_qh,
    __global half * src_d,
    __global half * src_m,
    __global struct block_q5_1 * dst0,
    uint ne00,
    uint ne01
) {
    // All three indices are unsigned, matching the convert kernel; i00 was a
    // signed int here although it only ever takes part in unsigned arithmetic.
    uint i00 = get_global_id(1);
    uint i01 = get_global_id(0);
    uint i02 = get_global_id(2);

    if (i01 >= ne01) {
        return;
    }

    uint ne00_blk = ne00 / QK5_1;
    uint dst_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01;
    uint src_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;

    __global struct block_q5_1 * b = dst0 + dst_blk_offset;
    b->d = src_d[src_blk_offset];
    b->m = src_m[src_blk_offset];

    ((__global uint *)(&(b->qh[0])))[0] = src_qh[src_blk_offset];

    // collect transposed quantization parts for a block
    uint src_q_offset = i02 * ne00_blk * ne01 * 4 + i00 * ne01 * 4 + i01;
    uint4 q_block;
    q_block.x = src_qs[src_q_offset];
    q_block.y = src_qs[src_q_offset + ne01];
    q_block.z = src_qs[src_q_offset + ne01 * 2];
    q_block.w = src_qs[src_q_offset + ne01 * 3];

    ushort8 post_block = as_ushort8(q_block);
    ushort8 pre_block = (ushort8)(0);

    uchar * pre_block_ptr = (uchar *)(&pre_block);
    uchar * post_block_ptr = (uchar *)(&post_block);

    // Byte i holds two low nibbles, byte i + QK5_1/4 the matching high
    // nibbles; interleave them back into consecutive bytes.
    for (int i = 0; i < QK5_1 / 4; ++i) {
        uchar x0 = post_block_ptr[i + 0];
        uchar x1 = post_block_ptr[i + QK5_1 / 4];

        pre_block_ptr[2 * i + 0] = convert_uchar(x0 & 0x0F) | convert_uchar((x1 & 0x0F) << 4);
        pre_block_ptr[2 * i + 1] = convert_uchar((x0 & 0xF0) >> 4) | convert_uchar(x1 & 0xF0);
    }

    ((__global ushort8 *)(&(b->qs[0])))[0] = pre_block;
}
|
|
954
|
+
|
|
955
|
+
// 3-D launch: dimension 0 indexes i01 (bounded by ne01), dimension 1 the block
// within a row (ne00 / QK_K blocks per row), dimension 2 the outer index.
// For block (i00, i01, i02):
//   - d and dm go to the transposed index i01 + i00*ne01,
//   - the quant bytes are regrouped in 32-byte chunks (low nibbles of each
//     byte pair into the first 16 bytes, high nibbles into the last 16) and
//     written as 32 uints, each ne01 elements apart,
//   - the K_SCALE_SIZE scale bytes are copied to dst_s in row-major block order.
// mask_0F / mask_F0 are the nibble masks, passed in as kernel arguments.
kernel void kernel_convert_block_q4_k_trans4_ns(
    __global struct block_q4_K * src0,
    __global uint * dst_q,
    __global half * dst_d,
    __global half * dst_dm,
    __global uchar * dst_s,
    uint ne00,
    uint ne01,
    uchar mask_0F,
    uchar mask_F0
) {
    const uint blk_k = get_global_id(1);
    const uint row   = get_global_id(0);
    const uint outer = get_global_id(2);

    if (row >= ne01) {
        return;
    }

    const uint blks_per_row = ne00 / QK_K;
    const uint in_idx  = blk_k + row * blks_per_row + outer * blks_per_row * ne01;
    const uint out_idx = row + blk_k * ne01 + outer * blks_per_row * ne01;

    __global struct block_q4_K * blk = src0 + in_idx;

    dst_d [out_idx] = blk->d;
    dst_dm[out_idx] = blk->dm;

    // 32 private words (128 bytes) addressed both as bytes and as uints.
    uint words[32];
    uchar * bytes = (uchar *)words;
    for (int chunk = 0; chunk < QK_K / 64; ++chunk) {
        const int at = chunk * 32;
        for (int j = 0; j < 16; ++j) {
            const uchar a = blk->q[at + 2*j];
            const uchar b = blk->q[at + 2*j + 1];

            bytes[at + j     ] = convert_uchar(a & mask_0F) | convert_uchar((b & mask_0F) << 4);
            bytes[at + j + 16] = convert_uchar((a & mask_F0) >> 4) | convert_uchar(b & mask_F0);
        }
    }

    const uint first = outer * blks_per_row * ne01 * 32 + blk_k * ne01 * 32 + row;
    #pragma unroll
    for (int w = 0; w < 32; ++w) {
        dst_q[first + w * ne01] = words[w];
    }

    __global uchar * scales_out = dst_s + (outer * ne01 + row) * blks_per_row * K_SCALE_SIZE + blk_k * K_SCALE_SIZE;
    #pragma unroll
    for (int k = 0; k < K_SCALE_SIZE; ++k) {
        scales_out[k] = blk->s[k];
    }
}
|
|
1011
|
+
|
|
1012
|
+
// Inverse of kernel_convert_block_q4_k_trans4_ns.
// For the block at (i00 = block index along K, i01 = row, i02 = batch):
// reads d/dm from the transposed index, copies the K_SCALE_SIZE scale bytes,
// gathers the 32 quant words stored ne01 elements apart and re-interleaves
// their nibbles into the block's q bytes.
// mask_0F / mask_F0 are the nibble masks, passed in as kernel arguments.
kernel void kernel_restore_block_q4_k_trans4_ns(
    __global uint * src_q,
    __global half * src_d,
    __global half * src_dm,
    __global uchar * src_s,
    __global struct block_q4_K * dst0,
    uint ne00,
    uint ne01,
    uchar mask_0F,
    uchar mask_F0
) {
    const uint blk_k = get_global_id(1);
    const uint row   = get_global_id(0);
    const uint batch = get_global_id(2);

    if (row >= ne01) {
        return;
    }

    const uint blks_per_row = ne00 / QK_K;

    const uint in_idx  = row + blk_k * ne01 + batch * blks_per_row * ne01;
    const uint out_idx = blk_k + row * blks_per_row + batch * blks_per_row * ne01;

    __global struct block_q4_K * blk = dst0 + out_idx;

    blk->d  = src_d[in_idx];
    blk->dm = src_dm[in_idx];

    __global uchar * scales_in = src_s + (batch * ne01 + row) * blks_per_row * K_SCALE_SIZE + blk_k * K_SCALE_SIZE;
    for (int k = 0; k < K_SCALE_SIZE; ++k) {
        blk->s[k] = scales_in[k];
    }

    const uint first = batch * blks_per_row * ne01 * 32 + blk_k * ne01 * 32 + row;

    // 32 private words (128 bytes) addressed both as uints and as bytes.
    uint words[32];
    for (int w = 0; w < 32; ++w) {
        words[w] = src_q[first + w * ne01];
    }

    uchar * bytes = (uchar *)words;
    for (int chunk = 0; chunk < QK_K / 64; ++chunk) {
        const int at = chunk * 32;
        for (int j = 0; j < 16; ++j) {
            const uchar lows  = bytes[at + j];
            const uchar highs = bytes[at + j + 16];

            blk->q[at + 2*j]     = convert_uchar((lows & mask_0F) | ((highs & mask_0F) << 4));
            blk->q[at + 2*j + 1] = convert_uchar(((lows & mask_F0) >> 4) | (highs & mask_F0));
        }
    }
}
|
|
1066
|
+
|
|
1067
|
+
// 3-D launch: dimension 0 indexes i01 (bounded by ne01), dimension 1 the block
// within a row (ne00 / QK_K blocks per row), dimension 2 the outer index.
// For block (i00, i01, i02):
//   - d and dm go to the transposed index i01 + i00*ne01,
//   - qh is bit-transposed: output word k collects bit k of each of the 32 qh
//     bytes (byte l lands in bit l); the 8 words are stored ne01 apart,
//   - qs is regrouped in 32-byte chunks (low nibbles of each byte pair into
//     the first 16 bytes, high nibbles into the last 16) and written as 32
//     uints, each ne01 elements apart,
//   - the K_SCALE_SIZE scale bytes are copied to dst_s in row-major block order.
// mask_0F / mask_F0 are the nibble masks, passed in as kernel arguments.
kernel void kernel_convert_block_q5_k_trans4_ns(
    __global struct block_q5_K * src0,
    __global uint * dst_qs,
    __global uint * dst_qh,
    __global half * dst_d,
    __global half * dst_dm,
    __global uchar * dst_s,
    uint ne00,
    uint ne01,
    uchar mask_0F,
    uchar mask_F0
) {
    const uint blk_k = get_global_id(1);
    const uint row   = get_global_id(0);
    const uint outer = get_global_id(2);

    if (row >= ne01) {
        return;
    }

    const uint blks_per_row = ne00 / QK_K;
    const uint in_idx  = blk_k + row * blks_per_row + outer * blks_per_row * ne01;
    const uint out_idx = row + blk_k * ne01 + outer * blks_per_row * ne01;

    __global struct block_q5_K * blk = src0 + in_idx;

    dst_d [out_idx] = blk->d;
    dst_dm[out_idx] = blk->dm;

    for (int k = 0; k < 8; ++k) {
        uint plane = 0;
        for (int l = 0; l < 32; ++l) {
            plane |= ((uint)((blk->qh[l] >> k) & 1)) << l;
        }
        dst_qh[row + (blk_k * 8 + k) * ne01 + outer * blks_per_row * 8 * ne01] = plane;
    }

    // 32 private words (128 bytes) addressed both as bytes and as uints.
    uint words[32];
    uchar * bytes = (uchar *)words;
    for (int chunk = 0; chunk < QK_K / 64; ++chunk) {
        const int at = chunk * 32;
        for (int j = 0; j < 16; ++j) {
            const uchar a = blk->qs[at + 2*j];
            const uchar b = blk->qs[at + 2*j + 1];

            bytes[at + j     ] = convert_uchar(a & mask_0F) | convert_uchar((b & mask_0F) << 4);
            bytes[at + j + 16] = convert_uchar((a & mask_F0) >> 4) | convert_uchar(b & mask_F0);
        }
    }

    const uint first = outer * blks_per_row * ne01 * 32 + blk_k * ne01 * 32 + row;
    #pragma unroll
    for (int w = 0; w < 32; ++w) {
        dst_qs[first + w * ne01] = words[w];
    }

    __global uchar * scales_out = dst_s + (outer * ne01 + row) * blks_per_row * K_SCALE_SIZE + blk_k * K_SCALE_SIZE;
    #pragma unroll
    for (int k = 0; k < K_SCALE_SIZE; ++k) {
        scales_out[k] = blk->s[k];
    }
}
|
|
1136
|
+
|
|
1137
|
+
// Inverse of kernel_convert_block_q5_k_trans4_ns.
// For the block at (i00 = block index along K, i01 = row, i02 = batch):
//   - d and dm are read from the transposed index i01 + i00*ne01,
//   - the 8 bit-plane words of qh (stored ne01 apart) are transposed back:
//     bit l of word k becomes bit k of qh[l],
//   - the K_SCALE_SIZE scale bytes are copied from src_s,
//   - the 32 quant words (stored ne01 apart) are gathered and their nibbles
//     re-interleaved into qs.
// mask_0F / mask_F0 are the nibble masks, passed in as kernel arguments.
kernel void kernel_restore_block_q5_k_trans4_ns(
    __global uint * src_qs,
    __global uint * src_qh,
    __global half * src_d,
    __global half * src_dm,
    __global uchar * src_s,
    __global struct block_q5_K * dst0,
    uint ne00,
    uint ne01,
    uchar mask_0F,
    uchar mask_F0
) {
    uint i00 = get_global_id(1); // block index along K
    uint i01 = get_global_id(0); // row index
    uint i02 = get_global_id(2); // batch index

    if (i01 >= ne01) {
        return;
    }

    uint ne00_blk = ne00 / QK_K;

    uint src_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;
    uint dst_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01;

    __global struct block_q5_K * b = dst0 + dst_blk_offset;

    b->d  = src_d[src_blk_offset];
    b->dm = src_dm[src_blk_offset];

    // Load the 8 bit planes once, then assemble every qh byte in a private
    // register and store it a single time. This avoids zeroing b->qh in
    // global memory followed by 256 global read-modify-write updates.
    uint planes[8];
    for (int k = 0; k < 8; ++k) {
        planes[k] = src_qh[i01 + (i00 * 8 + k) * ne01 + i02 * ne00_blk * 8 * ne01];
    }
    for (int l = 0; l < 32; ++l) {
        uchar h = 0;
        for (int k = 0; k < 8; ++k) {
            h |= (uchar)(((planes[k] >> l) & 1u) << k);
        }
        b->qh[l] = h;
    }

    __global uchar * s_src = src_s + (i02 * ne01 + i01) * ne00_blk * K_SCALE_SIZE + i00 * K_SCALE_SIZE;
    for (int i = 0; i < K_SCALE_SIZE; ++i) {
        b->s[i] = s_src[i];
    }

    uint base = i02 * ne00_blk * ne01 * 32 + i00 * ne01 * 32 + i01;

    uint4 qv[8];
    for (int p = 0; p < 8; ++p) {
        qv[p].x = src_qs[base + (p * 4 + 0) * ne01];
        qv[p].y = src_qs[base + (p * 4 + 1) * ne01];
        qv[p].z = src_qs[base + (p * 4 + 2) * ne01];
        qv[p].w = src_qs[base + (p * 4 + 3) * ne01];
    }

    // Within each 32-byte chunk, byte j holds two low nibbles and byte j + 16
    // the matching high nibbles; interleave them back into consecutive bytes.
    uchar * qv_bytes = (uchar *)qv;
    for (int i = 0; i < QK_K / 64; ++i) {
        for (int j = 0; j < 16; ++j) {
            uchar lo = qv_bytes[i*32 + j];
            uchar hi = qv_bytes[i*32 + j + 16];
            b->qs[i*32 + 2*j]     = convert_uchar((lo & mask_0F) | ((hi & mask_0F) << 4));
            b->qs[i*32 + 2*j + 1] = convert_uchar(((lo & mask_F0) >> 4) | (hi & mask_F0));
        }
    }
}
|
|
1207
|
+
|
|
1208
|
+
// 3-D launch: dimension 0 indexes i01 (bounded by ne01), dimension 1 the block
// within a row (ne00 / QK_K blocks per row), dimension 2 the outer index.
// For block (i00, i01, i02):
//   - d goes to the transposed index i01 + i00*ne01,
//   - ql (128 bytes) is regrouped per 64-byte half: each 32-byte quarter
//     contributes 16 bytes of packed low nibbles and 16 bytes of packed high
//     nibbles; the result is written as 32 uints, each ne01 elements apart,
//   - qh (64 bytes) is split into its four 2-bit fields; each field of 16
//     consecutive bytes fills one uint, giving 16 uints stored ne01 apart,
//   - the 16 scale bytes are copied to dst_s in row-major block order.
// mask_0F / mask_F0 are the nibble masks, passed in as kernel arguments.
kernel void kernel_convert_block_q6_k_trans4_ns(
    __global struct block_q6_K * src0,
    __global uint * dst_ql,
    __global uint * dst_qh,
    __global half * dst_d,
    __global char * dst_s,
    uint ne00,
    uint ne01,
    uchar mask_0F,
    uchar mask_F0
) {
    const uint blk_k = get_global_id(1);
    const uint row   = get_global_id(0);
    const uint outer = get_global_id(2);

    if (row >= ne01) {
        return;
    }

    const uint blks_per_row = ne00 / QK_K;

    const uint in_idx  = blk_k + row * blks_per_row + outer * blks_per_row * ne01;
    const uint out_idx = row + blk_k * ne01 + outer * blks_per_row * ne01;

    __global struct block_q6_K * blk = src0 + in_idx;

    dst_d[out_idx] = blk->d;

    // 32 private words (128 bytes) addressed both as bytes and as uints.
    uint ql_words[32];
    uchar * ql_bytes = (uchar *)ql_words;
    for (int half_idx = 0; half_idx < 2; ++half_idx) {
        for (int quarter = 0; quarter < 2; ++quarter) {
            const int src_at = half_idx * 64 + quarter * 32;
            const int lo_at  = half_idx * 64 + quarter * 16;
            const int hi_at  = lo_at + 32;
            for (int j = 0; j < 16; ++j) {
                const uchar a = blk->ql[src_at + 2*j];
                const uchar b = blk->ql[src_at + 2*j + 1];
                ql_bytes[lo_at + j] = convert_uchar(a & mask_0F) | convert_uchar((b & mask_0F) << 4);
                ql_bytes[hi_at + j] = convert_uchar((a & mask_F0) >> 4) | convert_uchar(b & mask_F0);
            }
        }
    }

    const uint ql_first = outer * blks_per_row * ne01 * 32 + blk_k * ne01 * 32 + row;

    #pragma unroll
    for (int w = 0; w < 32; ++w) {
        dst_ql[ql_first + w * ne01] = ql_words[w];
    }

    uint qh_words[16] = {0};

    for (int n = 0; n < 2; ++n) {
        for (int l = 0; l < 32; ++l) {
            const uchar h   = blk->qh[n*32 + l];
            const int which = l / 16;        // first or second run of 16 bytes
            const int shift = (l % 16) * 2;  // 2 bits per source byte
            for (int f = 0; f < 4; ++f) {
                qh_words[(n*4 + f)*2 + which] |= ((uint)((h >> (2*f)) & 0x03)) << shift;
            }
        }
    }

    const uint qh_first = outer * blks_per_row * ne01 * 16 + blk_k * ne01 * 16 + row;

    for (int w = 0; w < 16; ++w) {
        dst_qh[qh_first + w * ne01] = qh_words[w];
    }

    __global char * scales_out = dst_s + (outer * ne01 + row) * blks_per_row * 16 + blk_k * 16;
    #pragma unroll
    for (int k = 0; k < 16; ++k) {
        scales_out[k] = blk->scales[k];
    }
}
|
|
1288
|
+
|
|
1289
|
+
// Inverse of kernel_convert_block_q6_k_trans4_ns.
// For the block at (i00 = block index along K, i01 = row, i02 = batch):
//   - d is read from the transposed index i01 + i00*ne01,
//   - the 32 ql words (stored ne01 apart) are gathered and their packed
//     low/high nibbles re-interleaved into the 128 ql bytes,
//   - the 16 qh words (stored ne01 apart) are gathered and each qh byte is
//     reassembled from its four 2-bit fields,
//   - the 16 scale bytes are copied from src_s.
// mask_0F / mask_F0 are the nibble masks, passed in as kernel arguments.
kernel void kernel_restore_block_q6_k_trans4_ns(
    __global uint * src_ql,
    __global uint * src_qh,
    __global half * src_d,
    __global char * src_s,
    __global struct block_q6_K * dst0,
    uint ne00,
    uint ne01,
    uchar mask_0F,
    uchar mask_F0
) {
    const uint blk_k = get_global_id(1);
    const uint row   = get_global_id(0);
    const uint batch = get_global_id(2);

    if (row >= ne01) {
        return;
    }

    const uint blks_per_row = ne00 / QK_K;

    const uint in_idx  = row + blk_k * ne01 + batch * blks_per_row * ne01;
    const uint out_idx = blk_k + row * blks_per_row + batch * blks_per_row * ne01;

    __global struct block_q6_K * blk = dst0 + out_idx;

    blk->d = src_d[in_idx];

    // 32 private words (128 bytes) addressed both as uints and as bytes.
    const uint ql_first = batch * blks_per_row * ne01 * 32 + blk_k * ne01 * 32 + row;
    uint ql_words[32];
    for (int w = 0; w < 32; ++w) {
        ql_words[w] = src_ql[ql_first + w * ne01];
    }

    uchar * ql_bytes = (uchar *)ql_words;
    for (int half_idx = 0; half_idx < 2; ++half_idx) {
        for (int j = 0; j < 16; ++j) {
            for (int quarter = 0; quarter < 2; ++quarter) {
                const int dst_at = half_idx * 64 + quarter * 32;
                const uchar lows  = ql_bytes[half_idx * 64 + quarter * 16 + j];
                const uchar highs = ql_bytes[half_idx * 64 + quarter * 16 + j + 32];
                blk->ql[dst_at + 2*j]     = convert_uchar((lows & mask_0F) | ((highs & mask_0F) << 4));
                blk->ql[dst_at + 2*j + 1] = convert_uchar(((lows & mask_F0) >> 4) | (highs & mask_F0));
            }
        }
    }

    const uint qh_first = batch * blks_per_row * ne01 * 16 + blk_k * ne01 * 16 + row;
    uint qh_words[16];
    for (int w = 0; w < 16; ++w) {
        qh_words[w] = src_qh[qh_first + w * ne01];
    }

    for (int n = 0; n < 2; ++n) {
        for (int l = 0; l < 32; ++l) {
            const int which = l / 16;        // first or second run of 16 bytes
            const int shift = (l % 16) * 2;  // 2 bits per destination byte
            uchar h = 0;
            for (int f = 0; f < 4; ++f) {
                const uchar field = (uchar)((qh_words[(n*4 + f)*2 + which] >> shift) & 0x03);
                h |= (uchar)(field << (2*f));
            }
            blk->qh[n*32 + l] = h;
        }
    }

    __global char * scales_in = src_s + (batch * ne01 + row) * blks_per_row * 16 + blk_k * 16;
    for (int k = 0; k < 16; ++k) {
        blk->scales[k] = scales_in[k];
    }
}
|
|
1363
|
+
|
|
1364
|
+
//------------------------------------------------------------------------------
|
|
1365
|
+
// block_mxfp4
|
|
1366
|
+
//------------------------------------------------------------------------------
|
|
1367
|
+
// Block-size constant for the MXFP4 format; qs below holds QK_MXFP4 / 2 bytes.
#define QK_MXFP4 32
|
|
1368
|
+
// One MXFP4 block: a single byte e followed by QK_MXFP4 / 2 bytes of quants.
// No alignment attribute or padding member is declared, so qs begins one byte
// into the struct.
struct block_mxfp4 {
|
|
1369
|
+
uchar e; // E8M0
|
|
1370
|
+
uchar qs[QK_MXFP4 / 2]; // presumably two 4-bit values per byte (the *_trans4_ns kernels split each byte into nibbles)
|
|
1371
|
+
};
|
|
1372
|
+
|
|
1373
|
+
//------------------------------------------------------------------------------
|
|
1374
|
+
// kernel_convert_block_mxfp4
|
|
1375
|
+
// Convert the block_mxfp4 format to 2 separate arrays (AOS -> SOA).
|
|
1376
|
+
// This kernel does not deshuffle the bits.
|
|
1377
|
+
//------------------------------------------------------------------------------
|
|
1378
|
+
// One work-item per block: copies the e byte of block get_global_id(0) to
// dst_e and its QK_MXFP4/2 quant bytes, unchanged, to dst_q.
// There is no block-count argument, so no bounds check is performed here.
kernel void kernel_convert_block_mxfp4(
    global struct block_mxfp4 * src0,
    global uchar * dst_q,
    global uchar * dst_e
) {
    const size_t blk = get_global_id(0);

    global struct block_mxfp4 * in  = src0 + blk;
    global uchar              * out = dst_q + QK_MXFP4 / 2 * blk;

    dst_e[blk] = in->e;

    for (int k = 0; k < QK_MXFP4 / 2; ++k) {
        out[k] = in->qs[k];
    }
}
|
|
1393
|
+
|
|
1394
|
+
// 3-D launch: dimension 0 indexes i01 (bounded by ne01), dimension 1 the block
// within a row (ne00 / QK_MXFP4 blocks per row), dimension 2 the outer index.
// Copies block (i00, i01, i02) from the row-major source to the transposed
// index i01 + i00*ne01: the 16 quant bytes as one uint4 in dst_q, the e byte
// in dst_e.
kernel void kernel_convert_block_mxfp4_trans(
    global struct block_mxfp4 * src0,
    __global uint4 * dst_q,
    __global uchar * dst_e,
    uint ne00,
    uint ne01
) {
    // Unsigned like the other two indices; i00 only takes part in unsigned
    // offset arithmetic.
    uint i00 = get_global_id(1);
    uint i01 = get_global_id(0);
    uint i02 = get_global_id(2);

    // Same guard as the *_trans4_ns kernels: a work-item beyond the last row
    // would otherwise alias another block's transposed slot.
    if (i01 >= ne01) {
        return;
    }

    uint ne00_blk = ne00 / QK_MXFP4;
    uint src_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01;
    uint dst_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;

    global struct block_mxfp4 * b = src0 + src_blk_offset;

    // qs sits at byte offset 1 of a 17-byte struct, so it is never aligned
    // for a uint4 access through a cast pointer. vload16 on a uchar pointer
    // has no such alignment requirement and yields the same 16 bytes.
    dst_q[dst_blk_offset] = as_uint4(vload16(0, b->qs));
    dst_e[dst_blk_offset] = b->e;
}
|
|
1414
|
+
|
|
1415
|
+
// One work-item per block: rebuilds block get_global_id(0) of dst from the
// separate arrays - one e byte from src_e and QK_MXFP4/2 quant bytes from
// src_q, copied unchanged.
// There is no block-count argument, so no bounds check is performed here.
kernel void kernel_restore_block_mxfp4(
    global uchar * src_q,
    // One byte per block, matching dst_e of kernel_convert_block_mxfp4. This
    // was declared as half* and cast to uchar* before use; the buffer bound
    // by the host is unaffected by the element type.
    global uchar * src_e,
    global struct block_mxfp4 * dst
) {
    global struct block_mxfp4 * b = (global struct block_mxfp4 *) dst + get_global_id(0);
    global uchar * q = (global uchar *) src_q + QK_MXFP4 / 2 * get_global_id(0);
    global uchar * e = src_e + get_global_id(0);

    b->e = *e;
    for (int i = 0; i < QK_MXFP4 / 2; ++i) {
        b->qs[i] = q[i];
    }
}
|
|
1429
|
+
|
|
1430
|
+
// Inverse of kernel_convert_block_mxfp4_trans.
// 3-D launch: dimension 0 indexes i01 (bounded by ne01), dimension 1 the block
// within a row (ne00 / QK_MXFP4 blocks per row), dimension 2 the outer index.
// Reads the uint4 of quants and the e byte from the transposed index
// i01 + i00*ne01 and writes them into the block at its row-major position.
kernel void kernel_restore_block_mxfp4_trans(
    __global uint4 * src_q,
    __global uchar * src_e,
    global struct block_mxfp4 * dst,
    uint ne00,
    uint ne01
) {
    // Unsigned like the other two indices; i00 only takes part in unsigned
    // offset arithmetic.
    uint i00 = get_global_id(1);
    uint i01 = get_global_id(0);
    uint i02 = get_global_id(2);

    // Same guard as the *_trans4_ns kernels: a work-item beyond the last row
    // would otherwise read another block's transposed slot and write past
    // its own row.
    if (i01 >= ne01) {
        return;
    }

    uint ne00_blk = ne00 / QK_MXFP4;
    uint src_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;
    uint dst_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01;

    global struct block_mxfp4 * b = dst + dst_blk_offset;

    // qs sits at byte offset 1 of a 17-byte struct, so it is never aligned
    // for a uint4 store through a cast pointer. vstore16 on a uchar pointer
    // has no such alignment requirement and writes the same 16 bytes.
    vstore16(as_uchar16(src_q[src_blk_offset]), 0, b->qs);
    b->e = src_e[src_blk_offset];
}
|
|
1450
|
+
|
|
1451
|
+
// 3-D launch: dimension 0 indexes i01 (bounded by ne01), dimension 1 the block
// within a row (ne00 / QK_MXFP4 blocks per row), dimension 2 the outer index.
// For block (i00, i01, i02):
//   - e goes to the transposed index i01 + i00*ne01,
//   - the 16 quant bytes are regrouped (low nibbles of each byte pair into
//     the first 8 bytes, high nibbles into the last 8) and written as four
//     uints, each ne01 elements apart.
kernel void kernel_convert_block_mxfp4_trans4_ns(
    global struct block_mxfp4 * src0,
    __global uint * dst_q,
    __global uchar * dst_e,
    uint ne00,
    uint ne01
) {
    uint i00 = get_global_id(1);
    uint i01 = get_global_id(0);
    uint i02 = get_global_id(2);

    if (i01 >= ne01) {
        return;
    }

    uint ne00_blk = ne00 / QK_MXFP4;
    uint src_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01;
    uint dst_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;

    global struct block_mxfp4 * b = src0 + src_blk_offset;
    dst_e[dst_blk_offset] = b->e;

    // extract quantization and unshuffle
    // qs sits at byte offset 1 of a 17-byte struct, so it is never aligned
    // for a ushort8 access through a cast pointer; vload16 on a uchar pointer
    // has no such alignment requirement and yields the same 16 bytes.
    uchar16 pre_block = vload16(0, b->qs);

    uint4 q_block = (uint4)(0);

    uchar * pre_block_ptr  = (uchar *)(&pre_block);
    uchar * post_block_ptr = (uchar *)(&q_block);

    for (int i = 0; i < QK_MXFP4 / 4; ++i) {
        uchar x0 = pre_block_ptr[2*i + 0];
        uchar x1 = pre_block_ptr[2*i + 1];

        post_block_ptr[i + 0           ] = convert_uchar(x0 & 0x0F) | convert_uchar((x1 & 0x0F) << 4);
        post_block_ptr[i + QK_MXFP4 / 4] = convert_uchar((x0 & 0xF0) >> 4) | convert_uchar(x1 & 0xF0);
    }

    uint offset = i02 * ne00_blk * ne01 * 4 + i00 * ne01 * 4 + i01;
    dst_q[offset]            = q_block.x;
    dst_q[offset + ne01]     = q_block.y;
    dst_q[offset + ne01 * 2] = q_block.z;
    dst_q[offset + ne01 * 3] = q_block.w;
}
|
|
1497
|
+
|
|
1498
|
+
// Inverse of kernel_convert_block_mxfp4_trans4_ns.
// 3-D launch: dimension 0 indexes i01 (bounded by ne01), dimension 1 the block
// within a row (ne00 / QK_MXFP4 blocks per row), dimension 2 the outer index.
// Reads e from the transposed index i01 + i00*ne01, gathers the four quant
// words stored ne01 elements apart, re-interleaves their nibbles and writes
// the block back at its row-major position in dst0.
kernel void kernel_restore_block_mxfp4_trans4_ns(
    __global uint * src_q,
    __global uchar * src_e,
    __global struct block_mxfp4 * dst0,
    uint ne00,
    uint ne01
) {
    uint i00 = get_global_id(1);
    uint i01 = get_global_id(0);
    uint i02 = get_global_id(2);

    if (i01 >= ne01) {
        return;
    }

    uint ne00_blk = ne00 / QK_MXFP4;
    uint dst_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01;
    uint src_e_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;

    __global struct block_mxfp4 * b = dst0 + dst_blk_offset;
    b->e = src_e[src_e_offset];

    // collect transposed quantization parts for a block
    uint src_q_offset = i02 * ne00_blk * ne01 * 4 + i00 * ne01 * 4 + i01;
    uint4 q_block;
    q_block.x = src_q[src_q_offset];
    q_block.y = src_q[src_q_offset + ne01];
    q_block.z = src_q[src_q_offset + ne01 * 2];
    q_block.w = src_q[src_q_offset + ne01 * 3];

    uchar16 pre_block = (uchar16)(0);

    uchar * pre_block_ptr  = (uchar *)(&pre_block);
    uchar * post_block_ptr = (uchar *)(&q_block);

    // Byte i holds two low nibbles, byte i + QK_MXFP4/4 the matching high
    // nibbles; interleave them back into consecutive bytes.
    for (int i = 0; i < QK_MXFP4 / 4; ++i) {
        uchar x0 = post_block_ptr[i + 0];
        uchar x1 = post_block_ptr[i + QK_MXFP4 / 4];

        pre_block_ptr[2 * i + 0] = convert_uchar(x0 & 0x0F) | convert_uchar((x1 & 0x0F) << 4);
        pre_block_ptr[2 * i + 1] = convert_uchar((x0 & 0xF0) >> 4) | convert_uchar(x1 & 0xF0);
    }

    // qs sits at byte offset 1 of a 17-byte struct, so it is never aligned
    // for a ushort8 store through a cast pointer; vstore16 on a uchar pointer
    // has no such alignment requirement and writes the same 16 bytes.
    vstore16(pre_block, 0, b->qs);
}
|
|
1544
|
+
|
|
1545
|
+
|
|
1546
|
+
//------------------------------------------------------------------------------
|
|
1547
|
+
// block_q8_0
|
|
1548
|
+
//------------------------------------------------------------------------------
|
|
1549
|
+
typedef struct {
|
|
1550
|
+
half d; // delta
|
|
1551
|
+
char qs[QK8_0]; // quants
|
|
1552
|
+
} block_q8_0;
|
|
1553
|
+
|
|
1554
|
+
kernel void kernel_convert_block_q8_0(
|
|
1555
|
+
global block_q8_0 * src0,
|
|
1556
|
+
global uchar * dst_q,
|
|
1557
|
+
global half * dst_d
|
|
1558
|
+
) {
|
|
1559
|
+
global block_q8_0 * b = (global block_q8_0 *) src0 + get_global_id(0);
|
|
1560
|
+
global uchar * q = (global uchar *) dst_q + QK8_0*get_global_id(0);
|
|
1561
|
+
global half * d = (global half *) dst_d + get_global_id(0);
|
|
1562
|
+
|
|
1563
|
+
*d = b->d;
|
|
1564
|
+
|
|
1565
|
+
for (int i = 0; i < QK8_0; ++i) {
|
|
1566
|
+
q[i] = b->qs[i];
|
|
1567
|
+
}
|
|
1568
|
+
}
|
|
1569
|
+
|
|
1570
|
+
kernel void kernel_restore_block_q8_0(
|
|
1571
|
+
global uchar * src_q,
|
|
1572
|
+
global half * src_d,
|
|
1573
|
+
global block_q8_0 * dst
|
|
1574
|
+
) {
|
|
1575
|
+
global block_q8_0 * b = (global block_q8_0 *) dst + get_global_id(0);
|
|
1576
|
+
global uchar * q = (global uchar *) src_q + QK8_0*get_global_id(0);
|
|
1577
|
+
global half * d = (global half *) src_d + get_global_id(0);
|
|
1578
|
+
|
|
1579
|
+
b->d = *d;
|
|
1580
|
+
for (int i = 0; i < QK8_0; ++i) {
|
|
1581
|
+
b->qs[i] = q[i];
|
|
1582
|
+
}
|
|
1583
|
+
}
|
|
1584
|
+
|
|
1585
|
+
kernel void kernel_restore_block_q8_0_trans(
|
|
1586
|
+
global uchar * src_q,
|
|
1587
|
+
global half * src_d,
|
|
1588
|
+
global block_q8_0 * dst,
|
|
1589
|
+
uint ne00,
|
|
1590
|
+
uint ne01
|
|
1591
|
+
){
|
|
1592
|
+
uint num_blk_per_row = ne00 / QK8_0;
|
|
1593
|
+
|
|
1594
|
+
global block_q8_0 * b = (global block_q8_0 *) dst + get_global_id(0) * num_blk_per_row;
|
|
1595
|
+
global uchar * q = (global uchar *) src_q + get_global_id(0) * 4; // 4 8-bit packed
|
|
1596
|
+
global half * d = (global half *) src_d + get_global_id(0);
|
|
1597
|
+
|
|
1598
|
+
for (uint blk = 0; blk < num_blk_per_row; blk++) {
|
|
1599
|
+
b->d = *d;
|
|
1600
|
+
|
|
1601
|
+
for (uint i = 0; i < QK8_0; i+=4) {
|
|
1602
|
+
b->qs[i] = q[0];
|
|
1603
|
+
b->qs[i+1] = q[1];
|
|
1604
|
+
b->qs[i+2] = q[2];
|
|
1605
|
+
b->qs[i+3] = q[3];
|
|
1606
|
+
|
|
1607
|
+
q += 4 * ne01; // M stride
|
|
1608
|
+
}
|
|
1609
|
+
|
|
1610
|
+
d += ne01;
|
|
1611
|
+
|
|
1612
|
+
b++;
|
|
1613
|
+
}
|
|
1614
|
+
}
|
|
1615
|
+
|
|
1616
|
+
//------------------------------------------------------------------------------
|
|
1617
|
+
// kernel_convert_block_q4_K
|
|
1618
|
+
// Convert the block_q4_K format to 4 separate arrays (AOS -> SOA).
|
|
1619
|
+
// This kernel does not deshuffle the bits.
|
|
1620
|
+
// Each thread processes a super block.
|
|
1621
|
+
// Mask args are just to keep the signature consistent with the no-shuffle
|
|
1622
|
+
// version and they are not used in this kernel.
|
|
1623
|
+
//------------------------------------------------------------------------------
|
|
1624
|
+
kernel void kernel_convert_block_q4_K(
|
|
1625
|
+
global struct block_q4_K * src0,
|
|
1626
|
+
global uchar * dst_q,
|
|
1627
|
+
global uchar * dst_s,
|
|
1628
|
+
global half * dst_d,
|
|
1629
|
+
global half * dst_dm,
|
|
1630
|
+
uchar mask_0F,
|
|
1631
|
+
uchar mask_F0
|
|
1632
|
+
) {
|
|
1633
|
+
global struct block_q4_K * b = (global struct block_q4_K *) src0 + get_global_id(0);
|
|
1634
|
+
global uchar * q = (global uchar *) dst_q + QK_K/2*get_global_id(0);
|
|
1635
|
+
global uchar * s = (global uchar *) dst_s + K_SCALE_SIZE*get_global_id(0);
|
|
1636
|
+
global half * d = (global half *) dst_d + get_global_id(0);
|
|
1637
|
+
global half * dm = (global half *) dst_dm + get_global_id(0);
|
|
1638
|
+
|
|
1639
|
+
*d = b->d;
|
|
1640
|
+
*dm = b->dm;
|
|
1641
|
+
|
|
1642
|
+
for (int i = 0; i < QK_K/2; ++i) {
|
|
1643
|
+
q[i] = b->q[i];
|
|
1644
|
+
}
|
|
1645
|
+
for (int i = 0; i < K_SCALE_SIZE; ++i) {
|
|
1646
|
+
s[i] = b->s[i];
|
|
1647
|
+
}
|
|
1648
|
+
}
|
|
1649
|
+
|
|
1650
|
+
// Restore block_q4_K from flattened arrays.
|
|
1651
|
+
// Each thread processes a super block.
|
|
1652
|
+
// Mask args are just to keep the signature consistent with the no-shuffle ones.
|
|
1653
|
+
kernel void kernel_restore_block_q4_K(
|
|
1654
|
+
global uchar * src_q,
|
|
1655
|
+
global uchar * src_s,
|
|
1656
|
+
global half * src_d,
|
|
1657
|
+
global half * src_dm,
|
|
1658
|
+
global struct block_q4_K * dst,
|
|
1659
|
+
uchar mask_0F,
|
|
1660
|
+
uchar mask_F0
|
|
1661
|
+
) {
|
|
1662
|
+
global struct block_q4_K * b = (global struct block_q4_K *) dst + get_global_id(0);
|
|
1663
|
+
global uchar * q = (global uchar *) src_q + QK_K/2*get_global_id(0);
|
|
1664
|
+
global uchar * s = (global uchar *) src_s + K_SCALE_SIZE*get_global_id(0);
|
|
1665
|
+
global half * d = (global half *) src_d + get_global_id(0);
|
|
1666
|
+
global half * dm = (global half *) src_dm + get_global_id(0);
|
|
1667
|
+
|
|
1668
|
+
b->d = *d;
|
|
1669
|
+
b->dm = *dm;
|
|
1670
|
+
|
|
1671
|
+
for (int i = 0; i < QK_K/2; ++i) {
|
|
1672
|
+
b->q[i] = q[i];
|
|
1673
|
+
}
|
|
1674
|
+
for (int i = 0; i < K_SCALE_SIZE; ++i) {
|
|
1675
|
+
b->s[i] = s[i];
|
|
1676
|
+
}
|
|
1677
|
+
}
|
|
1678
|
+
|
|
1679
|
+
kernel void kernel_convert_block_q4_K_noshuffle(
|
|
1680
|
+
global struct block_q4_K * src0,
|
|
1681
|
+
global uchar * dst_q,
|
|
1682
|
+
global uchar * dst_s,
|
|
1683
|
+
global half * dst_d,
|
|
1684
|
+
global half * dst_dm,
|
|
1685
|
+
uchar mask_0F,
|
|
1686
|
+
uchar mask_F0
|
|
1687
|
+
) {
|
|
1688
|
+
global struct block_q4_K * b = (global struct block_q4_K *) src0 + get_global_id(0);
|
|
1689
|
+
global uchar * q = (global uchar *) dst_q + QK_K/2 * get_global_id(0);
|
|
1690
|
+
global uchar * s = (global uchar *) dst_s + K_SCALE_SIZE * get_global_id(0);
|
|
1691
|
+
global half * d = (global half *) dst_d + get_global_id(0);
|
|
1692
|
+
global half * dm = (global half *) dst_dm + get_global_id(0);
|
|
1693
|
+
|
|
1694
|
+
*d = b->d;
|
|
1695
|
+
*dm = b->dm;
|
|
1696
|
+
|
|
1697
|
+
for (int i = 0; i < QK_K / 64; ++i) {
|
|
1698
|
+
for (int j = 0; j < 16; ++j) {
|
|
1699
|
+
uchar x0 = b->q[i*32 + 2*j];
|
|
1700
|
+
uchar x1 = b->q[i*32 + 2*j + 1];
|
|
1701
|
+
q[i*32 + j] = convert_uchar(x0 & mask_0F) | convert_uchar((x1 & mask_0F) << 4);
|
|
1702
|
+
q[i*32 + j + 16] = convert_uchar((x0 & mask_F0) >> 4) | convert_uchar(x1 & mask_F0);
|
|
1703
|
+
}
|
|
1704
|
+
}
|
|
1705
|
+
|
|
1706
|
+
for (int i = 0; i < K_SCALE_SIZE; ++i) {
|
|
1707
|
+
s[i] = b->s[i];
|
|
1708
|
+
}
|
|
1709
|
+
}
|
|
1710
|
+
|
|
1711
|
+
kernel void kernel_restore_block_q4_K_noshuffle(
|
|
1712
|
+
global uchar * src_q,
|
|
1713
|
+
global uchar * src_s,
|
|
1714
|
+
global half * src_d,
|
|
1715
|
+
global half * src_dm,
|
|
1716
|
+
global struct block_q4_K * dst,
|
|
1717
|
+
uchar mask_0F,
|
|
1718
|
+
uchar mask_F0
|
|
1719
|
+
) {
|
|
1720
|
+
global struct block_q4_K * b = (global struct block_q4_K *) dst + get_global_id(0);
|
|
1721
|
+
global uchar * q = (global uchar *) src_q + QK_K/2 * get_global_id(0);
|
|
1722
|
+
global uchar * s = (global uchar *) src_s + K_SCALE_SIZE * get_global_id(0);
|
|
1723
|
+
global half * d = (global half *) src_d + get_global_id(0);
|
|
1724
|
+
global half * dm = (global half *) src_dm + get_global_id(0);
|
|
1725
|
+
|
|
1726
|
+
b->d = *d;
|
|
1727
|
+
b->dm = *dm;
|
|
1728
|
+
|
|
1729
|
+
for (int i = 0; i < QK_K / 64; ++i) {
|
|
1730
|
+
for (int j = 0; j < 16; ++j) {
|
|
1731
|
+
uchar lo = q[i*32 + j];
|
|
1732
|
+
uchar hi = q[i*32 + j + 16];
|
|
1733
|
+
b->q[i*32 + 2*j] = convert_uchar((lo & mask_0F) | ((hi & mask_0F) << 4));
|
|
1734
|
+
b->q[i*32 + 2*j + 1] = convert_uchar(((lo & mask_F0) >> 4) | (hi & mask_F0));
|
|
1735
|
+
}
|
|
1736
|
+
}
|
|
1737
|
+
|
|
1738
|
+
for (int i = 0; i < K_SCALE_SIZE; ++i) {
|
|
1739
|
+
b->s[i] = s[i];
|
|
1740
|
+
}
|
|
1741
|
+
}
|
|
1742
|
+
|
|
1743
|
+
//------------------------------------------------------------------------------
|
|
1744
|
+
// kernel_convert_block_q5_K
|
|
1745
|
+
// Convert the block_q5_K format to 5 separate arrays (AOS -> SOA).
|
|
1746
|
+
// Each thread processes a super block.
|
|
1747
|
+
//------------------------------------------------------------------------------
|
|
1748
|
+
kernel void kernel_convert_block_q5_K(
|
|
1749
|
+
global struct block_q5_K * src0,
|
|
1750
|
+
global uchar * dst_q,
|
|
1751
|
+
global uchar * dst_qh,
|
|
1752
|
+
global uchar * dst_s,
|
|
1753
|
+
global half * dst_d,
|
|
1754
|
+
global half * dst_dm,
|
|
1755
|
+
uchar mask_0F,
|
|
1756
|
+
uchar mask_F0
|
|
1757
|
+
) {
|
|
1758
|
+
global struct block_q5_K * b = (global struct block_q5_K *) src0 + get_global_id(0);
|
|
1759
|
+
global uchar * q = (global uchar *) dst_q + QK_K/2*get_global_id(0);
|
|
1760
|
+
global uchar * qh = (global uchar *) dst_qh + QK_K/8*get_global_id(0);
|
|
1761
|
+
global uchar * s = (global uchar *) dst_s + K_SCALE_SIZE*get_global_id(0);
|
|
1762
|
+
global half * d = (global half *) dst_d + get_global_id(0);
|
|
1763
|
+
global half * dm = (global half *) dst_dm + get_global_id(0);
|
|
1764
|
+
|
|
1765
|
+
*d = b->d;
|
|
1766
|
+
*dm = b->dm;
|
|
1767
|
+
|
|
1768
|
+
for (int i = 0; i < QK_K/2; ++i) {
|
|
1769
|
+
q[i] = b->qs[i];
|
|
1770
|
+
}
|
|
1771
|
+
for (int i = 0; i < QK_K/8; ++i) {
|
|
1772
|
+
qh[i] = b->qh[i];
|
|
1773
|
+
}
|
|
1774
|
+
for (int i = 0; i < K_SCALE_SIZE; ++i) {
|
|
1775
|
+
s[i] = b->s[i];
|
|
1776
|
+
}
|
|
1777
|
+
}
|
|
1778
|
+
|
|
1779
|
+
// Restore block_q5_K from flattened arrays.
|
|
1780
|
+
// Each thread processes a super block.
|
|
1781
|
+
kernel void kernel_restore_block_q5_K(
|
|
1782
|
+
global uchar * src_q,
|
|
1783
|
+
global uchar * src_qh,
|
|
1784
|
+
global uchar * src_s,
|
|
1785
|
+
global half * src_d,
|
|
1786
|
+
global half * src_dm,
|
|
1787
|
+
global struct block_q5_K * dst,
|
|
1788
|
+
uchar mask_0F,
|
|
1789
|
+
uchar mask_F0
|
|
1790
|
+
) {
|
|
1791
|
+
global struct block_q5_K * b = (global struct block_q5_K *) dst + get_global_id(0);
|
|
1792
|
+
global uchar * q = (global uchar *) src_q + QK_K/2*get_global_id(0);
|
|
1793
|
+
global uchar * qh = (global uchar *) src_qh + QK_K/8*get_global_id(0);
|
|
1794
|
+
global uchar * s = (global uchar *) src_s + K_SCALE_SIZE*get_global_id(0);
|
|
1795
|
+
global half * d = (global half *) src_d + get_global_id(0);
|
|
1796
|
+
global half * dm = (global half *) src_dm + get_global_id(0);
|
|
1797
|
+
|
|
1798
|
+
b->d = *d;
|
|
1799
|
+
b->dm = *dm;
|
|
1800
|
+
|
|
1801
|
+
for (int i = 0; i < QK_K/2; ++i) {
|
|
1802
|
+
b->qs[i] = q[i];
|
|
1803
|
+
}
|
|
1804
|
+
for (int i = 0; i < QK_K/8; ++i) {
|
|
1805
|
+
b->qh[i] = qh[i];
|
|
1806
|
+
}
|
|
1807
|
+
for (int i = 0; i < K_SCALE_SIZE; ++i) {
|
|
1808
|
+
b->s[i] = s[i];
|
|
1809
|
+
}
|
|
1810
|
+
}
|
|
1811
|
+
|
|
1812
|
+
kernel void kernel_convert_block_q5_K_noshuffle(
|
|
1813
|
+
global struct block_q5_K * src0,
|
|
1814
|
+
global uchar * dst_q,
|
|
1815
|
+
global uchar * dst_qh,
|
|
1816
|
+
global uchar * dst_s,
|
|
1817
|
+
global half * dst_d,
|
|
1818
|
+
global half * dst_dm,
|
|
1819
|
+
uchar mask_0F,
|
|
1820
|
+
uchar mask_F0
|
|
1821
|
+
) {
|
|
1822
|
+
global struct block_q5_K * b = (global struct block_q5_K *) src0 + get_global_id(0);
|
|
1823
|
+
global uchar * q = (global uchar *) dst_q + QK_K/2 * get_global_id(0);
|
|
1824
|
+
global uchar * qh = (global uchar *) dst_qh + QK_K/8 * get_global_id(0);
|
|
1825
|
+
global uchar * s = (global uchar *) dst_s + K_SCALE_SIZE * get_global_id(0);
|
|
1826
|
+
global half * d = (global half *) dst_d + get_global_id(0);
|
|
1827
|
+
global half * dm = (global half *) dst_dm + get_global_id(0);
|
|
1828
|
+
|
|
1829
|
+
*d = b->d;
|
|
1830
|
+
*dm = b->dm;
|
|
1831
|
+
|
|
1832
|
+
for (int i = 0; i < QK_K / 64; ++i) {
|
|
1833
|
+
for (int j = 0; j < 16; ++j) {
|
|
1834
|
+
uchar x0 = b->qs[i*32 + 2*j];
|
|
1835
|
+
uchar x1 = b->qs[i*32 + 2*j + 1];
|
|
1836
|
+
q[i*32 + j] = convert_uchar(x0 & mask_0F) | convert_uchar((x1 & mask_0F) << 4);
|
|
1837
|
+
q[i*32 + j + 16] = convert_uchar((x0 & mask_F0) >> 4) | convert_uchar(x1 & mask_F0);
|
|
1838
|
+
}
|
|
1839
|
+
}
|
|
1840
|
+
|
|
1841
|
+
for (int l = 0; l < QK_K/8; ++l) {
|
|
1842
|
+
uchar x0 = 0;
|
|
1843
|
+
for (int i = 0; i < 8; ++i) {
|
|
1844
|
+
x0 |= ((b->qh[(l%4)*8+i] >> (l/4)) & 0x01) << i;
|
|
1845
|
+
}
|
|
1846
|
+
qh[l] = x0;
|
|
1847
|
+
}
|
|
1848
|
+
|
|
1849
|
+
for (int i = 0; i < K_SCALE_SIZE; ++i) {
|
|
1850
|
+
s[i] = b->s[i];
|
|
1851
|
+
}
|
|
1852
|
+
}
|
|
1853
|
+
|
|
1854
|
+
kernel void kernel_restore_block_q5_K_noshuffle(
|
|
1855
|
+
global uchar * src_q,
|
|
1856
|
+
global uchar * src_qh,
|
|
1857
|
+
global uchar * src_s,
|
|
1858
|
+
global half * src_d,
|
|
1859
|
+
global half * src_dm,
|
|
1860
|
+
global struct block_q5_K * dst,
|
|
1861
|
+
uchar mask_0F,
|
|
1862
|
+
uchar mask_F0
|
|
1863
|
+
) {
|
|
1864
|
+
global struct block_q5_K * b = (global struct block_q5_K *) dst + get_global_id(0);
|
|
1865
|
+
global uchar * q = (global uchar *) src_q + QK_K/2 * get_global_id(0);
|
|
1866
|
+
global uchar * qh = (global uchar *) src_qh + QK_K/8 * get_global_id(0);
|
|
1867
|
+
global uchar * s = (global uchar *) src_s + K_SCALE_SIZE * get_global_id(0);
|
|
1868
|
+
global half * d = (global half *) src_d + get_global_id(0);
|
|
1869
|
+
global half * dm = (global half *) src_dm + get_global_id(0);
|
|
1870
|
+
|
|
1871
|
+
b->d = *d;
|
|
1872
|
+
b->dm = *dm;
|
|
1873
|
+
|
|
1874
|
+
for (int i = 0; i < QK_K / 64; ++i) {
|
|
1875
|
+
for (int j = 0; j < 16; ++j) {
|
|
1876
|
+
uchar lo = q[i*32 + j];
|
|
1877
|
+
uchar hi = q[i*32 + j + 16];
|
|
1878
|
+
b->qs[i*32 + 2*j] = convert_uchar((lo & mask_0F) | ((hi & mask_0F) << 4));
|
|
1879
|
+
b->qs[i*32 + 2*j + 1] = convert_uchar(((lo & mask_F0) >> 4) | (hi & mask_F0));
|
|
1880
|
+
}
|
|
1881
|
+
}
|
|
1882
|
+
|
|
1883
|
+
for (int g = 0; g < 4; ++g) {
|
|
1884
|
+
for (int i = 0; i < 8; ++i) {
|
|
1885
|
+
uchar x0 = 0;
|
|
1886
|
+
for (int k = 0; k < 8; ++k) {
|
|
1887
|
+
x0 |= ((qh[4*k+g] >> i) & 0x01) << k;
|
|
1888
|
+
}
|
|
1889
|
+
b->qh[g*8+i] = x0;
|
|
1890
|
+
}
|
|
1891
|
+
}
|
|
1892
|
+
|
|
1893
|
+
for (int i = 0; i < K_SCALE_SIZE; ++i) {
|
|
1894
|
+
b->s[i] = s[i];
|
|
1895
|
+
}
|
|
1896
|
+
}
|
|
1897
|
+
|
|
1898
|
+
//------------------------------------------------------------------------------
|
|
1899
|
+
// kernel_convert_block_q6_K
|
|
1900
|
+
// Convert the block_q6_K format to 3 separate arrays (AOS -> SOA).
|
|
1901
|
+
// This kernel does not deshuffle the bits.
|
|
1902
|
+
// Each thread processes a super block.
|
|
1903
|
+
//------------------------------------------------------------------------------
|
|
1904
|
+
kernel void kernel_convert_block_q6_K(
|
|
1905
|
+
global struct block_q6_K * src0,
|
|
1906
|
+
global uchar * dst_ql,
|
|
1907
|
+
global uchar * dst_qh,
|
|
1908
|
+
global char * dst_s,
|
|
1909
|
+
global half * dst_d,
|
|
1910
|
+
uchar mask_lsb_8,
|
|
1911
|
+
ulong n_blk
|
|
1912
|
+
) {
|
|
1913
|
+
if (get_global_id(0) >= n_blk) {
|
|
1914
|
+
return;
|
|
1915
|
+
}
|
|
1916
|
+
global struct block_q6_K * b = (global struct block_q6_K *) src0 + get_global_id(0);
|
|
1917
|
+
global uchar * ql = (global uchar *) dst_ql + QK_K/2*get_global_id(0);
|
|
1918
|
+
global uchar * qh = (global uchar *) dst_qh + QK_K/4*get_global_id(0);
|
|
1919
|
+
global char * s = (global char *) dst_s + QK_K/16*get_global_id(0);
|
|
1920
|
+
global half * d = (global half *) dst_d + get_global_id(0);
|
|
1921
|
+
|
|
1922
|
+
*d = b->d;
|
|
1923
|
+
|
|
1924
|
+
for (int i = 0; i < QK_K/2; ++i) {
|
|
1925
|
+
ql[i] = b->ql[i];
|
|
1926
|
+
}
|
|
1927
|
+
for (int i = 0; i < QK_K/4; ++i) {
|
|
1928
|
+
qh[i] = b->qh[i];
|
|
1929
|
+
}
|
|
1930
|
+
for (int i = 0; i < QK_K/16; ++i) {
|
|
1931
|
+
s[i] = b->scales[i];
|
|
1932
|
+
}
|
|
1933
|
+
}
|
|
1934
|
+
|
|
1935
|
+
// Restore block_q6_K from flattened arrays.
|
|
1936
|
+
// Each thread processes a super block.
|
|
1937
|
+
kernel void kernel_restore_block_q6_K(
|
|
1938
|
+
global uchar * dst_ql,
|
|
1939
|
+
global uchar * dst_qh,
|
|
1940
|
+
global char * dst_s,
|
|
1941
|
+
global half * dst_d,
|
|
1942
|
+
global struct block_q6_K * dst,
|
|
1943
|
+
uchar mask_lsb_8,
|
|
1944
|
+
ulong n_blk
|
|
1945
|
+
) {
|
|
1946
|
+
if (get_global_id(0) >= n_blk) {
|
|
1947
|
+
return;
|
|
1948
|
+
}
|
|
1949
|
+
global struct block_q6_K * b = (global struct block_q6_K *) dst + get_global_id(0);
|
|
1950
|
+
global uchar * ql = (global uchar *) dst_ql + QK_K/2*get_global_id(0);
|
|
1951
|
+
global uchar * qh = (global uchar *) dst_qh + QK_K/4*get_global_id(0);
|
|
1952
|
+
global char * s = (global char *) dst_s + QK_K/16*get_global_id(0);
|
|
1953
|
+
global half * d = (global half *) dst_d + get_global_id(0);
|
|
1954
|
+
|
|
1955
|
+
b->d = *d;
|
|
1956
|
+
|
|
1957
|
+
for (int i = 0; i < QK_K/2; ++i) {
|
|
1958
|
+
b->ql[i] = ql[i];
|
|
1959
|
+
}
|
|
1960
|
+
for (int i = 0; i < QK_K/4; ++i) {
|
|
1961
|
+
b->qh[i] = qh[i];
|
|
1962
|
+
}
|
|
1963
|
+
for (int i = 0; i < QK_K/16; ++i) {
|
|
1964
|
+
b->scales[i] = s[i];
|
|
1965
|
+
}
|
|
1966
|
+
}
|
|
1967
|
+
|
|
1968
|
+
kernel void kernel_convert_block_q6_K_noshuffle(
|
|
1969
|
+
global struct block_q6_K * src0,
|
|
1970
|
+
global uchar * dst_ql,
|
|
1971
|
+
global uchar * dst_qh,
|
|
1972
|
+
global char * dst_s,
|
|
1973
|
+
global half * dst_d,
|
|
1974
|
+
uchar mask_lsb_8,
|
|
1975
|
+
ulong n_blk
|
|
1976
|
+
) {
|
|
1977
|
+
if (get_global_id(0) >= n_blk) {
|
|
1978
|
+
return;
|
|
1979
|
+
}
|
|
1980
|
+
global struct block_q6_K * b = (global struct block_q6_K *) src0 + get_global_id(0);
|
|
1981
|
+
global uchar * ql = (global uchar *) dst_ql + QK_K/2*get_global_id(0);
|
|
1982
|
+
global uchar * qh = (global uchar *) dst_qh + QK_K/4*get_global_id(0);
|
|
1983
|
+
global char * s = (global char *) dst_s + QK_K/16*get_global_id(0);
|
|
1984
|
+
global half * d = (global half *) dst_d + get_global_id(0);
|
|
1985
|
+
|
|
1986
|
+
*d = b->d;
|
|
1987
|
+
|
|
1988
|
+
for (int i = 0; i < QK_K/2/4; ++i) {
|
|
1989
|
+
uchar x0 = b->ql[i*2 + 0] & mask_lsb_8;
|
|
1990
|
+
uchar x1 = b->ql[i*2 + 1] & mask_lsb_8;
|
|
1991
|
+
ql[i + 0] = (x0 & 0x0F) | ((x1 & 0x0F) << 4);
|
|
1992
|
+
ql[i + 32] = ((x0 & 0xF0) >> 4) | (x1 & 0xF0);
|
|
1993
|
+
|
|
1994
|
+
uchar x2 = b->ql[i*2 + 0 + 64] & mask_lsb_8;
|
|
1995
|
+
uchar x3 = b->ql[i*2 + 1 + 64] & mask_lsb_8;
|
|
1996
|
+
ql[i + 64] = (x2 & 0x0F) | ((x3 & 0x0F) << 4);
|
|
1997
|
+
ql[i + 96] = ((x2 & 0xF0) >> 4) | (x3 & 0xF0);
|
|
1998
|
+
}
|
|
1999
|
+
|
|
2000
|
+
for (int i = 0; i < QK_K/4/8; ++i) {
|
|
2001
|
+
uchar x0 = b->qh[i*4 + 0] & mask_lsb_8;
|
|
2002
|
+
uchar x1 = b->qh[i*4 + 1] & mask_lsb_8;
|
|
2003
|
+
uchar x2 = b->qh[i*4 + 2] & mask_lsb_8;
|
|
2004
|
+
uchar x3 = b->qh[i*4 + 3] & mask_lsb_8;
|
|
2005
|
+
qh[i + 0] = (x0 & 0x03) | ((x1 & 0x03) << 2) | ((x2 & 0x03) << 4) | ((x3 & 0x03) << 6);
|
|
2006
|
+
qh[i + 8] = ((x0 & 0x0C) >> 2) | (x1 & 0x0C) | ((x2 & 0x0C) << 2) | ((x3 & 0x0C) << 4);
|
|
2007
|
+
qh[i + 16] = ((x0 & 0x30) >> 4) | ((x1 & 0x30) >> 2) | (x2 & 0x30) | ((x3 & 0x30) << 2);
|
|
2008
|
+
qh[i + 24] = ((x0 & 0xC0) >> 6) | ((x1 & 0xC0) >> 4) | ((x2 & 0xC0) >> 2) | (x3 & 0xC0);
|
|
2009
|
+
|
|
2010
|
+
uchar x4 = b->qh[i*4 + 0 + 32] & mask_lsb_8;
|
|
2011
|
+
uchar x5 = b->qh[i*4 + 1 + 32] & mask_lsb_8;
|
|
2012
|
+
uchar x6 = b->qh[i*4 + 2 + 32] & mask_lsb_8;
|
|
2013
|
+
uchar x7 = b->qh[i*4 + 3 + 32] & mask_lsb_8;
|
|
2014
|
+
qh[i + 32] = (x4 & 0x03) | ((x5 & 0x03) << 2) | ((x6 & 0x03) << 4) | ((x7 & 0x03) << 6);
|
|
2015
|
+
qh[i + 40] = ((x4 & 0x0C) >> 2) | (x5 & 0x0C) | ((x6 & 0x0C) << 2) | ((x7 & 0x0C) << 4);
|
|
2016
|
+
qh[i + 48] = ((x4 & 0x30) >> 4) | ((x5 & 0x30) >> 2) | (x6 & 0x30) | ((x7 & 0x30) << 2);
|
|
2017
|
+
qh[i + 56] = ((x4 & 0xC0) >> 6) | ((x5 & 0xC0) >> 4) | ((x6 & 0xC0) >> 2) | (x7 & 0xC0);
|
|
2018
|
+
}
|
|
2019
|
+
|
|
2020
|
+
for (int i = 0; i < QK_K/16; ++i) {
|
|
2021
|
+
s[i] = b->scales[i];
|
|
2022
|
+
}
|
|
2023
|
+
}
|
|
2024
|
+
|
|
2025
|
+
kernel void kernel_restore_block_q6_K_noshuffle(
|
|
2026
|
+
global uchar * src_ql,
|
|
2027
|
+
global uchar * src_qh,
|
|
2028
|
+
global char * src_s,
|
|
2029
|
+
global half * src_d,
|
|
2030
|
+
global struct block_q6_K * dst,
|
|
2031
|
+
uchar mask_lsb_8,
|
|
2032
|
+
ulong n_blk
|
|
2033
|
+
) {
|
|
2034
|
+
if (get_global_id(0) >= n_blk) {
|
|
2035
|
+
return;
|
|
2036
|
+
}
|
|
2037
|
+
global struct block_q6_K * b = (global struct block_q6_K *) dst + get_global_id(0);
|
|
2038
|
+
global uchar * ql = (global uchar *) src_ql + QK_K/2*get_global_id(0);
|
|
2039
|
+
global uchar * qh = (global uchar *) src_qh + QK_K/4*get_global_id(0);
|
|
2040
|
+
global char * s = (global char *) src_s + QK_K/16*get_global_id(0);
|
|
2041
|
+
global half * d = (global half *) src_d + get_global_id(0);
|
|
2042
|
+
|
|
2043
|
+
b->d = *d;
|
|
2044
|
+
|
|
2045
|
+
for (int i = 0; i < QK_K/2/4; ++i) {
|
|
2046
|
+
uchar x0 = ql[i + 0] & mask_lsb_8;
|
|
2047
|
+
uchar x1 = ql[i + 32] & mask_lsb_8;
|
|
2048
|
+
b->ql[i*2 + 0] = (x0 & 0x0F) | ((x1 & 0x0F) << 4);
|
|
2049
|
+
b->ql[i*2 + 1] = ((x0 & 0xF0) >> 4) | (x1 & 0xF0);
|
|
2050
|
+
|
|
2051
|
+
uchar x2 = ql[i + 64] & mask_lsb_8;
|
|
2052
|
+
uchar x3 = ql[i + 96] & mask_lsb_8;
|
|
2053
|
+
b->ql[i*2 + 0 + 64] = (x2 & 0x0F) | ((x3 & 0x0F) << 4);
|
|
2054
|
+
b->ql[i*2 + 1 + 64] = ((x2 & 0xF0) >> 4) | (x3 & 0xF0);
|
|
2055
|
+
}
|
|
2056
|
+
|
|
2057
|
+
for (int i = 0; i < QK_K/4/8; ++i) {
|
|
2058
|
+
uchar x0 = qh[i + 0] & mask_lsb_8;
|
|
2059
|
+
uchar x1 = qh[i + 8] & mask_lsb_8;
|
|
2060
|
+
uchar x2 = qh[i + 16] & mask_lsb_8;
|
|
2061
|
+
uchar x3 = qh[i + 24] & mask_lsb_8;
|
|
2062
|
+
b->qh[i*4 + 0] = (x0 & 0x03) | ((x1 & 0x03) << 2) | ((x2 & 0x03) << 4) | ((x3 & 0x03) << 6);
|
|
2063
|
+
b->qh[i*4 + 1] = ((x0 & 0x0C) >> 2) | (x1 & 0x0C) | ((x2 & 0x0C) << 2) | ((x3 & 0x0C) << 4);
|
|
2064
|
+
b->qh[i*4 + 2] = ((x0 & 0x30) >> 4) | ((x1 & 0x30) >> 2) | (x2 & 0x30) | ((x3 & 0x30) << 2);
|
|
2065
|
+
b->qh[i*4 + 3] = ((x0 & 0xC0) >> 6) | ((x1 & 0xC0) >> 4) | ((x2 & 0xC0) >> 2) | (x3 & 0xC0);
|
|
2066
|
+
|
|
2067
|
+
uchar x4 = qh[i + 0 + 32] & mask_lsb_8;
|
|
2068
|
+
uchar x5 = qh[i + 8 + 32] & mask_lsb_8;
|
|
2069
|
+
uchar x6 = qh[i + 16 + 32] & mask_lsb_8;
|
|
2070
|
+
uchar x7 = qh[i + 24 + 32] & mask_lsb_8;
|
|
2071
|
+
b->qh[i*4 + 0 + 32] = (x4 & 0x03) | ((x5 & 0x03) << 2) | ((x6 & 0x03) << 4) | ((x7 & 0x03) << 6);
|
|
2072
|
+
b->qh[i*4 + 1 + 32] = ((x4 & 0x0C) >> 2) | (x5 & 0x0C) | ((x6 & 0x0C) << 2) | ((x7 & 0x0C) << 4);
|
|
2073
|
+
b->qh[i*4 + 2 + 32] = ((x4 & 0x30) >> 4) | ((x5 & 0x30) >> 2) | (x6 & 0x30) | ((x7 & 0x30) << 2);
|
|
2074
|
+
b->qh[i*4 + 3 + 32] = ((x4 & 0xC0) >> 6) | ((x5 & 0xC0) >> 4) | ((x6 & 0xC0) >> 2) | (x7 & 0xC0);
|
|
2075
|
+
}
|
|
2076
|
+
|
|
2077
|
+
for (int i = 0; i < QK_K/16; ++i) {
|
|
2078
|
+
b->scales[i] = s[i];
|
|
2079
|
+
}
|
|
2080
|
+
}
|
|
2081
|
+
|
|
2082
|
+
//------------------------------------------------------------------------------
|
|
2083
|
+
// kernel_convert_block_iq4_nl
|
|
2084
|
+
// Convert the block_iq4_nl format to 2 separate arrays (AOS -> SOA).
|
|
2085
|
+
//------------------------------------------------------------------------------
|
|
2086
|
+
kernel void kernel_convert_block_iq4_nl(
|
|
2087
|
+
global struct block_iq4_nl * src0,
|
|
2088
|
+
global uchar * dst_q,
|
|
2089
|
+
global half * dst_d,
|
|
2090
|
+
uchar mask_0F,
|
|
2091
|
+
uchar mask_F0,
|
|
2092
|
+
ulong n_blk
|
|
2093
|
+
) {
|
|
2094
|
+
if (get_global_id(0) >= n_blk) {
|
|
2095
|
+
return;
|
|
2096
|
+
}
|
|
2097
|
+
global struct block_iq4_nl * b = (global struct block_iq4_nl *) src0 + get_global_id(0);
|
|
2098
|
+
global uchar * q = (global uchar *) dst_q + QK4_NL/2*get_global_id(0);
|
|
2099
|
+
global half * d = (global half *) dst_d + get_global_id(0);
|
|
2100
|
+
|
|
2101
|
+
*d = b->d;
|
|
2102
|
+
|
|
2103
|
+
for (int i = 0; i < QK4_NL/2; ++i) {
|
|
2104
|
+
q[i] = b->qs[i];
|
|
2105
|
+
}
|
|
2106
|
+
}
|
|
2107
|
+
|
|
2108
|
+
kernel void kernel_restore_block_iq4_nl(
|
|
2109
|
+
global uchar * src_q,
|
|
2110
|
+
global half * src_d,
|
|
2111
|
+
global struct block_iq4_nl * dst,
|
|
2112
|
+
ulong n_blk
|
|
2113
|
+
) {
|
|
2114
|
+
if (get_global_id(0) >= n_blk) {
|
|
2115
|
+
return;
|
|
2116
|
+
}
|
|
2117
|
+
global struct block_iq4_nl * b = (global struct block_iq4_nl *) dst + get_global_id(0);
|
|
2118
|
+
global uchar * q = (global uchar *) src_q + QK4_NL/2*get_global_id(0);
|
|
2119
|
+
global half * d = (global half *) src_d + get_global_id(0);
|
|
2120
|
+
|
|
2121
|
+
b->d = *d;
|
|
2122
|
+
|
|
2123
|
+
for (int i = 0; i < QK4_NL/2; ++i) {
|
|
2124
|
+
b->qs[i] = q[i];
|
|
2125
|
+
}
|
|
2126
|
+
}
|
|
2127
|
+
|
|
2128
|
+
kernel void kernel_convert_block_iq4_nl_noshuffle(
|
|
2129
|
+
global struct block_iq4_nl * src0,
|
|
2130
|
+
global uchar * dst_q,
|
|
2131
|
+
global half * dst_d,
|
|
2132
|
+
uchar mask_0F,
|
|
2133
|
+
uchar mask_F0,
|
|
2134
|
+
ulong n_blk
|
|
2135
|
+
) {
|
|
2136
|
+
if (get_global_id(0) >= n_blk) {
|
|
2137
|
+
return;
|
|
2138
|
+
}
|
|
2139
|
+
global struct block_iq4_nl * b = (global struct block_iq4_nl *) src0 + get_global_id(0);
|
|
2140
|
+
global uchar * q = (global uchar *) dst_q + QK4_NL/2*get_global_id(0);
|
|
2141
|
+
global half * d = (global half *) dst_d + get_global_id(0);
|
|
2142
|
+
|
|
2143
|
+
*d = b->d;
|
|
2144
|
+
for (int i = 0; i < QK4_NL/4; ++i) {
|
|
2145
|
+
uchar x0 = b->qs[2*i + 0];
|
|
2146
|
+
uchar x1 = b->qs[2*i + 1];
|
|
2147
|
+
|
|
2148
|
+
q[i + 0 ] = convert_uchar(x0 & mask_0F) | convert_uchar((x1 & mask_0F) << 4);
|
|
2149
|
+
q[i + QK4_NL/4] = convert_uchar((x0 & mask_F0) >> 4) | convert_uchar(x1 & mask_F0);
|
|
2150
|
+
}
|
|
2151
|
+
}
|
|
2152
|
+
|
|
2153
|
+
kernel void kernel_restore_block_iq4_nl_noshuffle(
|
|
2154
|
+
global uchar * src_q,
|
|
2155
|
+
global half * src_d,
|
|
2156
|
+
global struct block_iq4_nl * dst,
|
|
2157
|
+
uchar mask_0F,
|
|
2158
|
+
uchar mask_F0,
|
|
2159
|
+
ulong n_blk
|
|
2160
|
+
) {
|
|
2161
|
+
if (get_global_id(0) >= n_blk) {
|
|
2162
|
+
return;
|
|
2163
|
+
}
|
|
2164
|
+
global struct block_iq4_nl * b = (global struct block_iq4_nl *) dst + get_global_id(0);
|
|
2165
|
+
global uchar * q = (global uchar *) src_q + QK4_NL/2*get_global_id(0);
|
|
2166
|
+
global half * d = (global half *) src_d + get_global_id(0);
|
|
2167
|
+
|
|
2168
|
+
b->d = *d;
|
|
2169
|
+
for (int i = 0; i < QK4_NL/4; ++i) {
|
|
2170
|
+
uchar x0 = q[i + 0 ];
|
|
2171
|
+
uchar x1 = q[i + QK4_NL/4];
|
|
2172
|
+
|
|
2173
|
+
b->qs[2*i + 0] = convert_uchar((x0 & mask_0F) | ((x1 & mask_0F) << 4));
|
|
2174
|
+
b->qs[2*i + 1] = convert_uchar(((x0 & mask_F0) >> 4) | (x1 & mask_F0));
|
|
264
2175
|
}
|
|
265
2176
|
}
|