whispercpp 1.3.5 → 1.3.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.document +3 -0
- data/.rdoc_options +2 -0
- data/LICENSE +1 -1
- data/README.md +133 -3
- data/Rakefile +18 -3
- data/ext/dependencies.rb +10 -4
- data/ext/dependencies_for_windows.rb +17 -0
- data/ext/extconf.rb +20 -7
- data/ext/options.rb +54 -14
- data/ext/options_for_windows.rb +51 -0
- data/ext/ruby_whisper.c +56 -46
- data/ext/ruby_whisper.h +165 -2
- data/ext/ruby_whisper_context.c +297 -126
- data/ext/ruby_whisper_context_params.c +163 -0
- data/ext/ruby_whisper_log_queue.c +180 -0
- data/ext/ruby_whisper_log_settable.h +47 -0
- data/ext/ruby_whisper_model.c +0 -1
- data/ext/ruby_whisper_parakeet.c +49 -0
- data/ext/ruby_whisper_parakeet_context.c +304 -0
- data/ext/ruby_whisper_parakeet_context_params.c +117 -0
- data/ext/ruby_whisper_parakeet_model.c +84 -0
- data/ext/ruby_whisper_parakeet_params.c +548 -0
- data/ext/ruby_whisper_parakeet_segment.c +157 -0
- data/ext/ruby_whisper_parakeet_token.c +188 -0
- data/ext/ruby_whisper_parakeet_transcribe.cpp +58 -0
- data/ext/ruby_whisper_params.c +256 -66
- data/ext/ruby_whisper_segment.c +6 -7
- data/ext/ruby_whisper_token.c +29 -9
- data/ext/ruby_whisper_transcribe.cpp +46 -16
- data/ext/ruby_whisper_vad_context.c +48 -1
- data/ext/ruby_whisper_vad_context_detect.cpp +6 -5
- data/ext/ruby_whisper_vad_params.c +0 -1
- data/ext/ruby_whisper_vad_segment.c +0 -1
- data/ext/ruby_whisper_vad_segments.c +0 -1
- data/ext/sources/CMakeLists.txt +41 -3
- data/ext/sources/CMakePresets.json +95 -0
- data/ext/sources/cmake/parakeet-config.cmake.in +30 -0
- data/ext/sources/cmake/parakeet.pc.in +10 -0
- data/ext/sources/cmake/whisper-config.cmake.in +5 -40
- data/ext/sources/cmake/whisper.pc.in +1 -1
- data/ext/sources/examples/CMakeLists.txt +4 -2
- data/ext/sources/examples/bench/bench.cpp +24 -19
- data/ext/sources/examples/cli/cli.cpp +51 -9
- data/ext/sources/examples/common-ggml.cpp +4 -0
- data/ext/sources/examples/common-whisper.cpp +139 -67
- data/ext/sources/examples/common-whisper.h +11 -0
- data/ext/sources/examples/ffmpeg-transcode.cpp +211 -341
- data/ext/sources/examples/miniaudio.h +4507 -2131
- data/ext/sources/examples/parakeet-cli/CMakeLists.txt +8 -0
- data/ext/sources/examples/parakeet-cli/parakeet-cli.cpp +243 -0
- data/ext/sources/examples/parakeet-quantize/CMakeLists.txt +7 -0
- data/ext/sources/examples/parakeet-quantize/parakeet-quantize.cpp +230 -0
- data/ext/sources/examples/server/server.cpp +213 -163
- data/ext/sources/ggml/CMakeLists.txt +29 -15
- data/ext/sources/ggml/cmake/FindNCCL.cmake +36 -0
- data/ext/sources/ggml/cmake/ggml-config.cmake.in +12 -2
- data/ext/sources/ggml/include/ggml-alloc.h +1 -0
- data/ext/sources/ggml/include/ggml-backend.h +73 -11
- data/ext/sources/ggml/include/ggml-cann.h +1 -1
- data/ext/sources/ggml/include/ggml-cpu.h +5 -0
- data/ext/sources/ggml/include/ggml-cuda.h +3 -0
- data/ext/sources/ggml/include/ggml-openvino.h +37 -0
- data/ext/sources/ggml/include/ggml-opt.h +1 -1
- data/ext/sources/ggml/include/ggml-rpc.h +8 -3
- data/ext/sources/ggml/include/ggml-virtgpu.h +14 -0
- data/ext/sources/ggml/include/ggml.h +155 -16
- data/ext/sources/ggml/include/gguf.h +10 -2
- data/ext/sources/ggml/src/CMakeLists.txt +25 -5
- data/ext/sources/ggml/src/ggml-alloc.c +9 -10
- data/ext/sources/ggml/src/ggml-backend-dl.cpp +48 -0
- data/ext/sources/ggml/src/ggml-backend-dl.h +45 -0
- data/ext/sources/ggml/src/ggml-backend-impl.h +22 -2
- data/ext/sources/ggml/src/ggml-backend-meta.cpp +2263 -0
- data/ext/sources/ggml/src/ggml-backend-reg.cpp +40 -86
- data/ext/sources/ggml/src/ggml-backend.cpp +114 -10
- data/ext/sources/ggml/src/ggml-blas/CMakeLists.txt +1 -1
- data/ext/sources/ggml/src/ggml-blas/ggml-blas.cpp +10 -2
- data/ext/sources/ggml/src/ggml-cann/acl_tensor.cpp +1 -1
- data/ext/sources/ggml/src/ggml-cann/acl_tensor.h +1 -1
- data/ext/sources/ggml/src/ggml-cann/aclnn_ops.cpp +1016 -442
- data/ext/sources/ggml/src/ggml-cann/aclnn_ops.h +111 -85
- data/ext/sources/ggml/src/ggml-cann/common.h +23 -14
- data/ext/sources/ggml/src/ggml-cann/ggml-cann.cpp +255 -92
- data/ext/sources/ggml/src/ggml-common.h +22 -0
- data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +68 -34
- data/ext/sources/ggml/src/ggml-cpu/amx/amx.cpp +44 -19
- data/ext/sources/ggml/src/ggml-cpu/amx/common.h +34 -10
- data/ext/sources/ggml/src/ggml-cpu/amx/mmq.cpp +101 -101
- data/ext/sources/ggml/src/ggml-cpu/arch/arm/quants.c +194 -1
- data/ext/sources/ggml/src/ggml-cpu/arch/arm/repack.cpp +2874 -613
- data/ext/sources/ggml/src/ggml-cpu/arch/loongarch/quants.c +151 -1
- data/ext/sources/ggml/src/ggml-cpu/arch/powerpc/quants.c +0 -1
- data/ext/sources/ggml/src/ggml-cpu/arch/riscv/quants.c +5480 -840
- data/ext/sources/ggml/src/ggml-cpu/arch/riscv/repack.cpp +1361 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/s390/quants.c +8 -11
- data/ext/sources/ggml/src/ggml-cpu/arch/wasm/quants.c +72 -1
- data/ext/sources/ggml/src/ggml-cpu/arch/x86/quants.c +186 -36
- data/ext/sources/ggml/src/ggml-cpu/arch/x86/repack.cpp +119 -19
- data/ext/sources/ggml/src/ggml-cpu/arch-fallback.h +112 -26
- data/ext/sources/ggml/src/ggml-cpu/binary-ops.cpp +2 -6
- data/ext/sources/ggml/src/ggml-cpu/cmake/FindSMTIME.cmake +32 -0
- data/ext/sources/ggml/src/ggml-cpu/common.h +8 -0
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-impl.h +13 -0
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +153 -16
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.cpp +17 -0
- data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.cpp +21 -20
- data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +976 -251
- data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +671 -266
- data/ext/sources/ggml/src/ggml-cpu/ops.cpp +1277 -263
- data/ext/sources/ggml/src/ggml-cpu/ops.h +4 -0
- data/ext/sources/ggml/src/ggml-cpu/quants.c +95 -0
- data/ext/sources/ggml/src/ggml-cpu/quants.h +6 -0
- data/ext/sources/ggml/src/ggml-cpu/repack.cpp +2893 -679
- data/ext/sources/ggml/src/ggml-cpu/repack.h +119 -8
- data/ext/sources/ggml/src/ggml-cpu/simd-gemm.h +226 -0
- data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +114 -19
- data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.cpp +1402 -687
- data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.h +8 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp +597 -2766
- data/ext/sources/ggml/src/ggml-cpu/spacemit/ime2_kernels.cpp +5768 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/ime_env.cpp +320 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/ime_env.h +55 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/ime_kernels.h +182 -19
- data/ext/sources/ggml/src/ggml-cpu/spacemit/repack.cpp +1795 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/repack.h +14 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/rvv_kernels.cpp +3178 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/rvv_kernels.h +95 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/spine_barrier.h +34 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/spine_mem_pool.cpp +760 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/spine_mem_pool.h +32 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/spine_tcm.h +409 -0
- data/ext/sources/ggml/src/ggml-cpu/unary-ops.cpp +1 -1
- data/ext/sources/ggml/src/ggml-cpu/vec.cpp +54 -53
- data/ext/sources/ggml/src/ggml-cpu/vec.h +225 -240
- data/ext/sources/ggml/src/ggml-cuda/CMakeLists.txt +18 -8
- data/ext/sources/ggml/src/ggml-cuda/allreduce.cu +971 -0
- data/ext/sources/ggml/src/ggml-cuda/allreduce.cuh +29 -0
- data/ext/sources/ggml/src/ggml-cuda/argsort.cu +73 -28
- data/ext/sources/ggml/src/ggml-cuda/binbcast.cu +69 -41
- data/ext/sources/ggml/src/ggml-cuda/binbcast.cuh +1 -0
- data/ext/sources/ggml/src/ggml-cuda/common.cuh +359 -29
- data/ext/sources/ggml/src/ggml-cuda/concat.cu +120 -114
- data/ext/sources/ggml/src/ggml-cuda/conv2d-transpose.cu +45 -21
- data/ext/sources/ggml/src/ggml-cuda/conv2d-transpose.cuh +1 -0
- data/ext/sources/ggml/src/ggml-cuda/convert.cu +94 -27
- data/ext/sources/ggml/src/ggml-cuda/convert.cuh +10 -0
- data/ext/sources/ggml/src/ggml-cuda/cpy.cu +20 -9
- data/ext/sources/ggml/src/ggml-cuda/dequantize.cuh +22 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +333 -85
- data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +632 -190
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cu +12 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cuh +162 -49
- data/ext/sources/ggml/src/ggml-cuda/fattn-vec.cuh +43 -18
- data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +44 -14
- data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cuh +1 -1
- data/ext/sources/ggml/src/ggml-cuda/fattn.cu +241 -23
- data/ext/sources/ggml/src/ggml-cuda/fattn.cuh +2 -0
- data/ext/sources/ggml/src/ggml-cuda/fwht.cu +101 -0
- data/ext/sources/ggml/src/ggml-cuda/fwht.cuh +4 -0
- data/ext/sources/ggml/src/ggml-cuda/gated_delta_net.cu +312 -0
- data/ext/sources/ggml/src/ggml-cuda/gated_delta_net.cuh +4 -0
- data/ext/sources/ggml/src/ggml-cuda/getrows.cu +34 -12
- data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +1454 -599
- data/ext/sources/ggml/src/ggml-cuda/im2col.cu +32 -29
- data/ext/sources/ggml/src/ggml-cuda/mean.cu +13 -10
- data/ext/sources/ggml/src/ggml-cuda/mma.cuh +397 -183
- data/ext/sources/ggml/src/ggml-cuda/mmf.cu +30 -10
- data/ext/sources/ggml/src/ggml-cuda/mmf.cuh +161 -88
- data/ext/sources/ggml/src/ggml-cuda/mmq.cu +18 -12
- data/ext/sources/ggml/src/ggml-cuda/mmq.cuh +522 -431
- data/ext/sources/ggml/src/ggml-cuda/mmvf.cu +139 -72
- data/ext/sources/ggml/src/ggml-cuda/mmvf.cuh +2 -0
- data/ext/sources/ggml/src/ggml-cuda/mmvq.cu +608 -88
- data/ext/sources/ggml/src/ggml-cuda/mmvq.cuh +6 -0
- data/ext/sources/ggml/src/ggml-cuda/norm.cu +47 -79
- data/ext/sources/ggml/src/ggml-cuda/out-prod.cu +23 -7
- data/ext/sources/ggml/src/ggml-cuda/pad.cu +13 -10
- data/ext/sources/ggml/src/ggml-cuda/quantize.cu +134 -27
- data/ext/sources/ggml/src/ggml-cuda/quantize.cuh +1 -1
- data/ext/sources/ggml/src/ggml-cuda/reduce_rows.cuh +7 -17
- data/ext/sources/ggml/src/ggml-cuda/rope.cu +244 -137
- data/ext/sources/ggml/src/ggml-cuda/scale.cu +4 -1
- data/ext/sources/ggml/src/ggml-cuda/set-rows.cu +14 -6
- data/ext/sources/ggml/src/ggml-cuda/snake.cu +72 -0
- data/ext/sources/ggml/src/ggml-cuda/snake.cuh +8 -0
- data/ext/sources/ggml/src/ggml-cuda/softcap.cu +4 -1
- data/ext/sources/ggml/src/ggml-cuda/softmax.cu +8 -83
- data/ext/sources/ggml/src/ggml-cuda/solve_tri.cu +1 -1
- data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cu +96 -40
- data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cuh +1 -1
- data/ext/sources/ggml/src/ggml-cuda/ssm-scan.cu +40 -18
- data/ext/sources/ggml/src/ggml-cuda/sumrows.cu +8 -4
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_16.cu +1 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_32.cu +6 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_8.cu +2 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_4.cu +2 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_16.cu +1 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_32.cu +6 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_4.cu +2 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_8.cu +2 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_16.cu +1 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_4.cu +2 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_8.cu +2 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_4.cu +2 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_8.cu +2 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq192-dv128.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq320-dv256.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq512-dv512.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-bf16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-f16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q4_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q4_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q5_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q5_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q8_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-bf16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-bf16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-bf16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-bf16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-bf16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-bf16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-nvfp4.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q1_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/top-k.cu +5 -5
- data/ext/sources/ggml/src/ggml-cuda/topk-moe.cu +202 -135
- data/ext/sources/ggml/src/ggml-cuda/topk-moe.cuh +20 -14
- data/ext/sources/ggml/src/ggml-cuda/unary.cu +86 -2
- data/ext/sources/ggml/src/ggml-cuda/unary.cuh +4 -0
- data/ext/sources/ggml/src/ggml-cuda/vecdotq.cuh +111 -17
- data/ext/sources/ggml/src/ggml-cuda/vendors/cuda.h +7 -2
- data/ext/sources/ggml/src/ggml-cuda/vendors/hip.h +30 -2
- data/ext/sources/ggml/src/ggml-cuda/vendors/musa.h +3 -0
- data/ext/sources/ggml/src/ggml-hexagon/CMakeLists.txt +84 -46
- data/ext/sources/ggml/src/ggml-hexagon/ggml-hexagon.cpp +1612 -753
- data/ext/sources/ggml/src/ggml-hexagon/htp/CMakeLists.txt +51 -11
- data/ext/sources/ggml/src/ggml-hexagon/htp/act-ops.c +361 -261
- data/ext/sources/ggml/src/ggml-hexagon/htp/argsort-ops.c +294 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/binary-ops.c +753 -241
- data/ext/sources/ggml/src/ggml-hexagon/htp/cmake-toolchain.cmake +5 -5
- data/ext/sources/ggml/src/ggml-hexagon/htp/concat-ops.c +277 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/cpy-ops.c +295 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/cumsum-ops.c +270 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/diag-ops.c +216 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/fill-ops.c +123 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/flash-attn-ops.c +471 -296
- data/ext/sources/ggml/src/ggml-hexagon/htp/gated-delta-net-ops.c +1148 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/get-rows-ops.c +159 -53
- data/ext/sources/ggml/src/ggml-hexagon/htp/{htp-dma.c → hex-dma.c} +3 -3
- data/ext/sources/ggml/src/ggml-hexagon/htp/hex-dma.h +372 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hex-dump.h +86 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hex-fastdiv.h +37 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hex-utils.h +137 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-flash-attn-ops.c +1878 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c +2066 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-ops.c +6 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-ops.h +88 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-profile.h +34 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-queue.c +158 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-queue.h +134 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-utils.h +200 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ctx.h +97 -14
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ops.h +163 -67
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp_iface.idl +9 -3
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-arith.h +443 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-base.h +308 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-copy.h +262 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-div.h +291 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-dump.h +129 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-exp.h +216 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-flash-attn.h +47 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-floor.h +100 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-inverse.h +210 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-log.h +65 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-pow.h +42 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-reduce.h +296 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-repl.h +74 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-scale.h +133 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sigmoid.h +142 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sin-cos.h +90 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sqrt.h +126 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-types.h +36 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-utils.h +18 -1348
- data/ext/sources/ggml/src/ggml-hexagon/htp/main.c +547 -635
- data/ext/sources/ggml/src/ggml-hexagon/htp/matmul-ops.c +3556 -1101
- data/ext/sources/ggml/src/ggml-hexagon/htp/pad-ops.c +547 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/repeat-ops.c +148 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/rope-ops.c +475 -269
- data/ext/sources/ggml/src/ggml-hexagon/htp/set-rows-ops.c +94 -72
- data/ext/sources/ggml/src/ggml-hexagon/htp/softmax-ops.c +222 -217
- data/ext/sources/ggml/src/ggml-hexagon/htp/solve-tri-ops.c +267 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/ssm-conv.c +432 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/sum-rows-ops.c +128 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/unary-ops.c +886 -117
- data/ext/sources/ggml/src/ggml-hexagon/htp/vtcm-utils.h +16 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/worker-pool.c +1 -5
- data/ext/sources/ggml/src/ggml-hexagon/htp-drv.cpp +418 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp-drv.h +121 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp-opnode.h +272 -0
- data/ext/sources/ggml/src/ggml-hexagon/libdl.h +79 -0
- data/ext/sources/ggml/src/ggml-hexagon/libggml-htp.inf +40 -0
- data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +28 -9
- data/ext/sources/ggml/src/ggml-impl.h +68 -1
- data/ext/sources/ggml/src/ggml-metal/CMakeLists.txt +10 -10
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-common.cpp +13 -2
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.h +8 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.m +147 -17
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.cpp +409 -83
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.h +54 -5
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.m +254 -52
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +254 -23
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.cpp +756 -285
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.h +7 -4
- data/ext/sources/ggml/src/ggml-metal/ggml-metal.cpp +359 -133
- data/ext/sources/ggml/src/ggml-metal/ggml-metal.metal +1867 -1123
- data/ext/sources/ggml/src/ggml-musa/CMakeLists.txt +5 -6
- data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +71 -4
- data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +14127 -5314
- data/ext/sources/ggml/src/ggml-opencl/kernels/concat.cl +97 -88
- data/ext/sources/ggml/src/ggml-opencl/kernels/cpy.cl +104 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/cumsum.cl +139 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/cvt.cl +1978 -67
- data/ext/sources/ggml/src/ggml-opencl/kernels/diag.cl +27 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/exp.cl +125 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/expm1.cl +87 -56
- data/ext/sources/ggml/src/ggml-opencl/kernels/gated_delta_net.cl +249 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_mxfp4_f32_ns.cl +306 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q4_0_f32_ns.cl +256 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q4_1_f32_ns.cl +258 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q4_k_f32_ns.cl +283 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q5_0_f32_ns.cl +260 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q5_1_f32_ns.cl +262 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q5_k_f32_ns.cl +288 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q6_k_f32_ns.cl +267 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_iq4_nl_f32.cl +150 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/{mul_mat_Ab_Bi_8x4.cl → gemm_noshuffle_q4_0_f32.cl} +1 -1
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q4_1_f32.cl +132 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q4_k_f32.cl +172 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q5_0_f32.cl +131 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q5_1_f32.cl +134 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q5_k_f32.cl +176 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q6_k_f32.cl +140 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q8_0_f32.cl +129 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_xmem_f16_f32_os8.cl +233 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_mxfp4_f32_ns.cl +165 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q4_0_f32_ns.cl +120 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q4_1_f32_ns.cl +123 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q4_k_f32_ns.cl +155 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q5_0_f32_ns.cl +123 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q5_1_f32_ns.cl +125 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q5_k_f32_ns.cl +160 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q6_k_f32_ns.cl +141 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_iq4_nl_f32.cl +302 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/{gemv_noshuffle_general.cl → gemv_noshuffle_q4_0_f32.cl} +5 -5
- data/ext/sources/ggml/src/ggml-opencl/kernels/{gemv_noshuffle.cl → gemv_noshuffle_q4_0_f32_spec.cl} +5 -5
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q4_1_f32.cl +283 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q4_k_f32.cl +318 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q5_0_f32.cl +291 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q5_1_f32.cl +294 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q5_k_f32.cl +326 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q6_k_f32.cl +293 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q8_0_f32.cl +195 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/get_rows.cl +15 -9
- data/ext/sources/ggml/src/ggml-opencl/kernels/l2_norm.cl +71 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mean.cl +114 -13
- data/ext/sources/ggml/src/ggml-opencl/kernels/moe_reorder_b.cl +30 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/moe_sort_by_expert.cl +82 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_iq4_nl_f32_l4_lm.cl +171 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q4_0_f32_l4_lm.cl +163 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q4_1_f32_l4_lm.cl +165 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q4_k_f32_l4_lm.cl +179 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q5_0_f32_l4_lm.cl +173 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q5_1_f32_l4_lm.cl +175 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q5_k_f32_l4_lm.cl +192 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q6_k_f32_l4_lm.cl +158 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_iq4_nl_f32.cl +164 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_iq4_nl_f32_flat.cl +202 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32.cl +219 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32_flat.cl +229 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_k_f32.cl +180 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_k_f32_flat.cl +196 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_0_f32.cl +241 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_0_f32_flat.cl +243 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_1_f32.cl +243 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_1_f32_flat.cl +247 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_k_f32.cl +187 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_k_f32_flat.cl +203 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/{mul_mv_q6_k.cl → mul_mv_q6_k_f32.cl} +4 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32_flat.cl +178 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/neg.cl +125 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/repeat.cl +31 -32
- data/ext/sources/ggml/src/ggml-opencl/kernels/scale.cl +14 -4
- data/ext/sources/ggml/src/ggml-opencl/kernels/softplus.cl +88 -60
- data/ext/sources/ggml/src/ggml-opencl/kernels/solve_tri.cl +51 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/sum_rows.cl +114 -13
- data/ext/sources/ggml/src/ggml-opencl/kernels/tanh.cl +94 -48
- data/ext/sources/ggml/src/ggml-opencl/kernels/transpose.cl +26 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/tri.cl +32 -0
- data/ext/sources/ggml/src/ggml-openvino/.clang-format +154 -0
- data/ext/sources/ggml/src/ggml-openvino/CMakeLists.txt +22 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-decoder.cpp +985 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-decoder.h +294 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-openvino-extra.cpp +380 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-openvino-extra.h +182 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-openvino.cpp +1132 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-quants.cpp +956 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-quants.h +153 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/decoder.h +74 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/frontend.cpp +27 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/frontend.h +23 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/input_model.cpp +17 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/input_model.h +29 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/node_context.h +112 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/cont.cpp +48 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/cpy.cpp +21 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp +90 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/get_rows.cpp +69 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/glu_geglu.cpp +61 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp +62 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/mulmat.cpp +90 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/permute.cpp +102 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/reshape.cpp +83 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp +46 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/rope.cpp +149 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/scale.cpp +41 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/set_rows.cpp +76 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/softmax.cpp +89 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/transpose.cpp +23 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/unary_gelu.cpp +25 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/unary_silu.cpp +27 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/view.cpp +53 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op_table.cpp +47 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op_table.h +40 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.cpp +60 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.h +17 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/mark_decompression_convert_constant_folding.h +29 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/squeeze_matmul.cpp +58 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/squeeze_matmul.h +17 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/rt_info/weightless_caching_attributes.hpp +41 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/translate_session.cpp +317 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/translate_session.h +28 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/utils.cpp +257 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/utils.h +86 -0
- data/ext/sources/ggml/src/ggml-openvino/utils.cpp +880 -0
- data/ext/sources/ggml/src/ggml-openvino/utils.h +143 -0
- data/ext/sources/ggml/src/ggml-opt.cpp +1 -0
- data/ext/sources/ggml/src/ggml-quants.c +385 -119
- data/ext/sources/ggml/src/ggml-quants.h +6 -0
- data/ext/sources/ggml/src/ggml-rpc/CMakeLists.txt +24 -0
- data/ext/sources/ggml/src/ggml-rpc/ggml-rpc.cpp +167 -311
- data/ext/sources/ggml/src/ggml-rpc/transport.cpp +683 -0
- data/ext/sources/ggml/src/ggml-rpc/transport.h +34 -0
- data/ext/sources/ggml/src/ggml-sycl/CMakeLists.txt +64 -91
- data/ext/sources/ggml/src/ggml-sycl/add-id.cpp +5 -1
- data/ext/sources/ggml/src/ggml-sycl/backend.hpp +4 -1
- data/ext/sources/ggml/src/ggml-sycl/binbcast.cpp +21 -20
- data/ext/sources/ggml/src/ggml-sycl/common.cpp +74 -2
- data/ext/sources/ggml/src/ggml-sycl/common.hpp +356 -11
- data/ext/sources/ggml/src/ggml-sycl/convert.cpp +184 -14
- data/ext/sources/ggml/src/ggml-sycl/convert.hpp +31 -1
- data/ext/sources/ggml/src/ggml-sycl/count-equal.cpp +1 -1
- data/ext/sources/ggml/src/ggml-sycl/cumsum.cpp +148 -0
- data/ext/sources/ggml/src/ggml-sycl/cumsum.hpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/dequantize.hpp +663 -0
- data/ext/sources/ggml/src/ggml-sycl/diag.cpp +67 -0
- data/ext/sources/ggml/src/ggml-sycl/diag.hpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/dmmv.cpp +586 -6
- data/ext/sources/ggml/src/ggml-sycl/dpct/helper.hpp +791 -47
- data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +77 -156
- data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +2 -2
- data/ext/sources/ggml/src/ggml-sycl/fattn-buffers.cpp +56 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn-buffers.hpp +63 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn-common.hpp +1181 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn-tile.cpp +59 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn-tile.hpp +1246 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn-vec.hpp +674 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn.cpp +227 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn.hpp +22 -0
- data/ext/sources/ggml/src/ggml-sycl/fill.cpp +55 -0
- data/ext/sources/ggml/src/ggml-sycl/fill.hpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/gated_delta_net.cpp +347 -0
- data/ext/sources/ggml/src/ggml-sycl/gated_delta_net.hpp +9 -0
- data/ext/sources/ggml/src/ggml-sycl/gemm.hpp +3 -0
- data/ext/sources/ggml/src/ggml-sycl/getrows.cpp +79 -3
- data/ext/sources/ggml/src/ggml-sycl/ggml-sycl.cpp +1134 -236
- data/ext/sources/ggml/src/ggml-sycl/im2col.cpp +353 -89
- data/ext/sources/ggml/src/ggml-sycl/im2col.hpp +5 -3
- data/ext/sources/ggml/src/ggml-sycl/mmvq.cpp +1344 -26
- data/ext/sources/ggml/src/ggml-sycl/mmvq.hpp +16 -0
- data/ext/sources/ggml/src/ggml-sycl/norm.cpp +65 -66
- data/ext/sources/ggml/src/ggml-sycl/outprod.cpp +3 -3
- data/ext/sources/ggml/src/ggml-sycl/pad.cpp +27 -27
- data/ext/sources/ggml/src/ggml-sycl/presets.hpp +3 -0
- data/ext/sources/ggml/src/ggml-sycl/quants.hpp +72 -1
- data/ext/sources/ggml/src/ggml-sycl/rope.cpp +450 -287
- data/ext/sources/ggml/src/ggml-sycl/rope.hpp +6 -0
- data/ext/sources/ggml/src/ggml-sycl/set_rows.cpp +7 -1
- data/ext/sources/ggml/src/ggml-sycl/softmax.cpp +6 -6
- data/ext/sources/ggml/src/ggml-sycl/solve_tri.cpp +172 -0
- data/ext/sources/ggml/src/ggml-sycl/solve_tri.hpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/ssm_conv.cpp +6 -1
- data/ext/sources/ggml/src/ggml-sycl/ssm_scan.cpp +156 -0
- data/ext/sources/ggml/src/ggml-sycl/ssm_scan.hpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/sycl_hw.cpp +62 -10
- data/ext/sources/ggml/src/ggml-sycl/sycl_hw.hpp +18 -6
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq112-dv112.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq128-dv128.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq256-dv256.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq40-dv40.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq512-dv512.cpp +6 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq576-dv512.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq64-dv64.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq72-dv72.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq80-dv80.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq96-dv96.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-f16.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_0.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_1.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_0.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_1.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q8_0.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-f16.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_0.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_1.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_0.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_1.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q8_0.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-f16.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_0.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_1.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_0.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_1.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q8_0.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-f16.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_0.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_1.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_0.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_1.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q8_0.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-f16.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_0.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_1.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_0.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_1.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q8_0.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-f16.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_0.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_1.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_0.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_1.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q8_0.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/type.hpp +112 -0
- data/ext/sources/ggml/src/ggml-sycl/upscale.cpp +410 -0
- data/ext/sources/ggml/src/ggml-sycl/upscale.hpp +9 -0
- data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +228 -53
- data/ext/sources/ggml/src/ggml-sycl/wkv.cpp +1 -1
- data/ext/sources/ggml/src/ggml-virtgpu/CMakeLists.txt +70 -0
- data/ext/sources/ggml/src/ggml-virtgpu/apir_cs_ggml-rpc-front.cpp +87 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/CMakeLists.txt +21 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/apir_cs_ggml-rpc-back.cpp +115 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-convert.h +13 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-backend.cpp +102 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer-type.cpp +105 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer.cpp +179 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-device.cpp +148 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched.cpp +51 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched.gen.h +73 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched.h +27 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-virgl-apir.h +32 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend.cpp +144 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/api_remoting.h +95 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_backend.gen.h +94 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_backend.h +50 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_cs.h +378 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_cs_ggml.h +232 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_cs_rpc.h +58 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-buffer-type.cpp +81 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-buffer.cpp +123 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-device.cpp +160 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-reg.cpp +213 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend.cpp +71 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-remoting.h +71 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggmlremoting_functions.yaml +166 -0
- data/ext/sources/ggml/src/ggml-virtgpu/include/apir_hw.h +9 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-apir.h +15 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-backend.cpp +58 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-buffer-type.cpp +110 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-buffer.cpp +173 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-device.cpp +192 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-impl.h +36 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward.gen.h +53 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-shm.cpp +99 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-shm.h +23 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-utils.cpp +179 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-utils.h +86 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu.cpp +545 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu.h +115 -0
- data/ext/sources/ggml/src/ggml-vulkan/CMakeLists.txt +12 -1
- data/ext/sources/ggml/src/ggml-vulkan/ggml-vulkan.cpp +3250 -940
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/acc.comp +16 -8
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/contig_copy.comp +6 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +146 -13
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy.comp +3 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +25 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.glsl +88 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.glsl +643 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_nvfp4.comp +32 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q1_0.comp +29 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/diag.comp +0 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dot_product_funcs.glsl +27 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/elu.comp +27 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/exp.comp +0 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/feature-tests/coopmat2_decode_vector.comp +7 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +533 -180
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.glsl +113 -68
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +412 -222
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +222 -83
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_dequant.glsl +131 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_mask_opt.comp +162 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_mmq_funcs.glsl +203 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +9 -8
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/fwht.comp +115 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gated_delta_net.comp +189 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/generic_binary_head.glsl +0 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.glsl +10 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_main.glsl +16 -6
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +76 -54
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col_3d.comp +0 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/l2_norm.comp +12 -9
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/log.comp +0 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp +122 -27
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.glsl +20 -17
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iface.glsl +6 -6
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp +1 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq_funcs.glsl +88 -55
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +22 -20
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +51 -14
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.glsl +159 -125
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_id_funcs.glsl +3 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +5 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.glsl +8 -8
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_shmem_types.glsl +24 -9
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp +0 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +2 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl +39 -63
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.glsl +0 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +7 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +7 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +7 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl +13 -7
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_vision.comp +7 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sgn.comp +21 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/snake.comp +49 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/ssm_conv.comp +27 -11
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/tri.comp +0 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/types.glsl +79 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +193 -149
- data/ext/sources/ggml/src/ggml-webgpu/CMakeLists.txt +5 -2
- data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp +3221 -97
- data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu.cpp +3493 -1997
- data/ext/sources/ggml/src/ggml-webgpu/pre_wgsl.hpp +37 -7
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/add_id.wgsl +64 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/argmax.wgsl +72 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/argsort.wgsl +106 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/argsort_merge.wgsl +134 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary.wgsl +142 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +115 -141
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/concat.wgsl +93 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/conv2d.wgsl +165 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{cpy.tmpl.wgsl → cpy.wgsl} +25 -44
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cumsum.wgsl +66 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn.wgsl +198 -230
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_quant_staging.tmpl +124 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_tile.wgsl +397 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_vec_blk.wgsl +101 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_vec_reduce.wgsl +84 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_vec_split.wgsl +619 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/gated_delta_net.wgsl +149 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{get_rows.tmpl.wgsl → get_rows.wgsl} +234 -335
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/glu.wgsl +155 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/im2col.wgsl +101 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl +871 -42
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_id.wgsl +195 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_id_gather.wgsl +52 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_id_vec.wgsl +154 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.wgsl +149 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{mul_mat_subgroup_matrix.tmpl.wgsl → mul_mat_subgroup_matrix.wgsl} +36 -138
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.wgsl +151 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec_acc.tmpl +1432 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec_q_acc.tmpl +303 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/pad.wgsl +86 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/quant_inner_loops.tmpl +21 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/quantize_q8.wgsl +173 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/repeat.wgsl +67 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm_mul.wgsl +152 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{rope.tmpl.wgsl → rope.wgsl} +71 -142
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/row_norm.wgsl +153 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{scale.tmpl.wgsl → scale.wgsl} +15 -40
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set.wgsl +109 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +39 -12
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows_quant.wgsl +224 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{soft_max.tmpl.wgsl → soft_max.wgsl} +106 -206
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/solve_tri.wgsl +121 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/ssm_conv.wgsl +65 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/ssm_scan.wgsl +193 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/sum_rows.wgsl +55 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/unary.wgsl +213 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/upscale.wgsl +240 -0
- data/ext/sources/ggml/src/ggml-zdnn/ggml-zdnn.cpp +24 -15
- data/ext/sources/ggml/src/ggml-zendnn/CMakeLists.txt +31 -32
- data/ext/sources/ggml/src/ggml-zendnn/ggml-zendnn.cpp +253 -16
- data/ext/sources/ggml/src/ggml.c +268 -52
- data/ext/sources/ggml/src/gguf.cpp +377 -47
- data/ext/sources/include/parakeet.h +342 -0
- data/ext/sources/include/whisper.h +10 -0
- data/ext/sources/media/matmul.png +0 -0
- data/ext/sources/src/CMakeLists.txt +23 -0
- data/ext/sources/src/parakeet-arch.h +188 -0
- data/ext/sources/src/parakeet.cpp +3838 -0
- data/ext/sources/src/whisper.cpp +62 -40
- data/extsources.rb +26 -10
- data/lib/whisper/log_settable.rb +36 -0
- data/lib/whisper/model/uri.rb +13 -1
- data/lib/whisper/output.rb +74 -0
- data/sig/whisper.rbs +445 -55
- data/test/helper.rb +2 -0
- data/test/jfk_reader/jfk_reader.c +50 -7
- data/test/test_callback.rb +1 -0
- data/test/test_context_params.rb +82 -0
- data/test/test_package.rb +6 -5
- data/test/test_parakeet.rb +28 -0
- data/test/test_parakeet_callback.rb +107 -0
- data/test/test_parakeet_context.rb +116 -0
- data/test/test_parakeet_context_params.rb +24 -0
- data/test/test_parakeet_model.rb +21 -0
- data/test/test_parakeet_params.rb +78 -0
- data/test/test_parakeet_segment.rb +42 -0
- data/test/test_parakeet_token.rb +73 -0
- data/test/test_params.rb +2 -0
- data/test/test_token.rb +11 -0
- data/test/test_vad_context.rb +58 -8
- data/test/test_vad_segment.rb +1 -1
- data/test/test_whisper.rb +44 -6
- data/whispercpp.gemspec +2 -2
- metadata +426 -280
- data/ext/sources/bindings/javascript/CMakeLists.txt +0 -41
- data/ext/sources/bindings/javascript/emscripten.cpp +0 -93
- data/ext/sources/bindings/javascript/libwhisper.worker.js +0 -1
- data/ext/sources/bindings/javascript/package.json +0 -26
- data/ext/sources/bindings/javascript/whisper.js +0 -19
- data/ext/sources/examples/addon.node/CMakeLists.txt +0 -31
- data/ext/sources/examples/addon.node/__test__/whisper.spec.js +0 -133
- data/ext/sources/examples/addon.node/addon.cpp +0 -557
- data/ext/sources/examples/addon.node/index.js +0 -59
- data/ext/sources/examples/addon.node/package.json +0 -16
- data/ext/sources/examples/addon.node/vad-example.js +0 -132
- data/ext/sources/examples/bench.wasm/CMakeLists.txt +0 -49
- data/ext/sources/examples/bench.wasm/emscripten.cpp +0 -87
- data/ext/sources/examples/bench.wasm/index-tmpl.html +0 -285
- data/ext/sources/examples/coi-serviceworker.js +0 -146
- data/ext/sources/examples/command/CMakeLists.txt +0 -10
- data/ext/sources/examples/command/command.cpp +0 -802
- data/ext/sources/examples/command/commands.txt +0 -9
- data/ext/sources/examples/command.wasm/CMakeLists.txt +0 -50
- data/ext/sources/examples/command.wasm/emscripten.cpp +0 -327
- data/ext/sources/examples/command.wasm/index-tmpl.html +0 -415
- data/ext/sources/examples/generate-karaoke.sh +0 -57
- data/ext/sources/examples/helpers.js +0 -191
- data/ext/sources/examples/livestream.sh +0 -112
- data/ext/sources/examples/lsp/CMakeLists.txt +0 -10
- data/ext/sources/examples/lsp/lsp.cpp +0 -471
- data/ext/sources/examples/lsp/whisper.vim +0 -362
- data/ext/sources/examples/python/test_whisper_processor.py +0 -7
- data/ext/sources/examples/python/whisper_processor.py +0 -54
- data/ext/sources/examples/server/bench.js +0 -29
- data/ext/sources/examples/server.py +0 -120
- data/ext/sources/examples/stream/CMakeLists.txt +0 -10
- data/ext/sources/examples/stream/stream.cpp +0 -437
- data/ext/sources/examples/stream.wasm/CMakeLists.txt +0 -49
- data/ext/sources/examples/stream.wasm/emscripten.cpp +0 -216
- data/ext/sources/examples/stream.wasm/index-tmpl.html +0 -491
- data/ext/sources/examples/sycl/CMakeLists.txt +0 -9
- data/ext/sources/examples/sycl/build.sh +0 -22
- data/ext/sources/examples/sycl/ls-sycl-device.cpp +0 -11
- data/ext/sources/examples/sycl/run-whisper.sh +0 -17
- data/ext/sources/examples/talk-llama/CMakeLists.txt +0 -47
- data/ext/sources/examples/talk-llama/eleven-labs.py +0 -80
- data/ext/sources/examples/talk-llama/llama-adapter.cpp +0 -494
- data/ext/sources/examples/talk-llama/llama-adapter.h +0 -88
- data/ext/sources/examples/talk-llama/llama-arch.cpp +0 -2559
- data/ext/sources/examples/talk-llama/llama-arch.h +0 -586
- data/ext/sources/examples/talk-llama/llama-batch.cpp +0 -917
- data/ext/sources/examples/talk-llama/llama-batch.h +0 -173
- data/ext/sources/examples/talk-llama/llama-chat.cpp +0 -876
- data/ext/sources/examples/talk-llama/llama-chat.h +0 -70
- data/ext/sources/examples/talk-llama/llama-context.cpp +0 -3645
- data/ext/sources/examples/talk-llama/llama-context.h +0 -360
- data/ext/sources/examples/talk-llama/llama-cparams.cpp +0 -5
- data/ext/sources/examples/talk-llama/llama-cparams.h +0 -42
- data/ext/sources/examples/talk-llama/llama-grammar.cpp +0 -1464
- data/ext/sources/examples/talk-llama/llama-grammar.h +0 -194
- data/ext/sources/examples/talk-llama/llama-graph.cpp +0 -2282
- data/ext/sources/examples/talk-llama/llama-graph.h +0 -910
- data/ext/sources/examples/talk-llama/llama-hparams.cpp +0 -241
- data/ext/sources/examples/talk-llama/llama-hparams.h +0 -284
- data/ext/sources/examples/talk-llama/llama-impl.cpp +0 -171
- data/ext/sources/examples/talk-llama/llama-impl.h +0 -63
- data/ext/sources/examples/talk-llama/llama-io.cpp +0 -15
- data/ext/sources/examples/talk-llama/llama-io.h +0 -35
- data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.cpp +0 -328
- data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.h +0 -137
- data/ext/sources/examples/talk-llama/llama-kv-cache.cpp +0 -2100
- data/ext/sources/examples/talk-llama/llama-kv-cache.h +0 -390
- data/ext/sources/examples/talk-llama/llama-kv-cells.h +0 -533
- data/ext/sources/examples/talk-llama/llama-memory-hybrid.cpp +0 -268
- data/ext/sources/examples/talk-llama/llama-memory-hybrid.h +0 -139
- data/ext/sources/examples/talk-llama/llama-memory-recurrent.cpp +0 -1167
- data/ext/sources/examples/talk-llama/llama-memory-recurrent.h +0 -182
- data/ext/sources/examples/talk-llama/llama-memory.cpp +0 -59
- data/ext/sources/examples/talk-llama/llama-memory.h +0 -122
- data/ext/sources/examples/talk-llama/llama-mmap.cpp +0 -735
- data/ext/sources/examples/talk-llama/llama-mmap.h +0 -73
- data/ext/sources/examples/talk-llama/llama-model-loader.cpp +0 -1247
- data/ext/sources/examples/talk-llama/llama-model-loader.h +0 -176
- data/ext/sources/examples/talk-llama/llama-model-saver.cpp +0 -285
- data/ext/sources/examples/talk-llama/llama-model-saver.h +0 -37
- data/ext/sources/examples/talk-llama/llama-model.cpp +0 -8338
- data/ext/sources/examples/talk-llama/llama-model.h +0 -544
- data/ext/sources/examples/talk-llama/llama-quant.cpp +0 -1072
- data/ext/sources/examples/talk-llama/llama-quant.h +0 -1
- data/ext/sources/examples/talk-llama/llama-sampling.cpp +0 -3771
- data/ext/sources/examples/talk-llama/llama-sampling.h +0 -44
- data/ext/sources/examples/talk-llama/llama-vocab.cpp +0 -3900
- data/ext/sources/examples/talk-llama/llama-vocab.h +0 -182
- data/ext/sources/examples/talk-llama/llama.cpp +0 -1140
- data/ext/sources/examples/talk-llama/llama.h +0 -1540
- data/ext/sources/examples/talk-llama/models/afmoe.cpp +0 -191
- data/ext/sources/examples/talk-llama/models/apertus.cpp +0 -125
- data/ext/sources/examples/talk-llama/models/arcee.cpp +0 -135
- data/ext/sources/examples/talk-llama/models/arctic.cpp +0 -138
- data/ext/sources/examples/talk-llama/models/arwkv7.cpp +0 -86
- data/ext/sources/examples/talk-llama/models/baichuan.cpp +0 -122
- data/ext/sources/examples/talk-llama/models/bailingmoe.cpp +0 -144
- data/ext/sources/examples/talk-llama/models/bailingmoe2.cpp +0 -135
- data/ext/sources/examples/talk-llama/models/bert.cpp +0 -178
- data/ext/sources/examples/talk-llama/models/bitnet.cpp +0 -160
- data/ext/sources/examples/talk-llama/models/bloom.cpp +0 -101
- data/ext/sources/examples/talk-llama/models/chameleon.cpp +0 -178
- data/ext/sources/examples/talk-llama/models/chatglm.cpp +0 -132
- data/ext/sources/examples/talk-llama/models/codeshell.cpp +0 -111
- data/ext/sources/examples/talk-llama/models/cogvlm.cpp +0 -102
- data/ext/sources/examples/talk-llama/models/cohere2-iswa.cpp +0 -134
- data/ext/sources/examples/talk-llama/models/command-r.cpp +0 -122
- data/ext/sources/examples/talk-llama/models/dbrx.cpp +0 -123
- data/ext/sources/examples/talk-llama/models/deci.cpp +0 -135
- data/ext/sources/examples/talk-llama/models/deepseek.cpp +0 -144
- data/ext/sources/examples/talk-llama/models/deepseek2.cpp +0 -259
- data/ext/sources/examples/talk-llama/models/dots1.cpp +0 -134
- data/ext/sources/examples/talk-llama/models/dream.cpp +0 -105
- data/ext/sources/examples/talk-llama/models/ernie4-5-moe.cpp +0 -150
- data/ext/sources/examples/talk-llama/models/ernie4-5.cpp +0 -110
- data/ext/sources/examples/talk-llama/models/exaone.cpp +0 -114
- data/ext/sources/examples/talk-llama/models/exaone4.cpp +0 -123
- data/ext/sources/examples/talk-llama/models/falcon-h1.cpp +0 -113
- data/ext/sources/examples/talk-llama/models/falcon.cpp +0 -120
- data/ext/sources/examples/talk-llama/models/gemma-embedding.cpp +0 -116
- data/ext/sources/examples/talk-llama/models/gemma.cpp +0 -112
- data/ext/sources/examples/talk-llama/models/gemma2-iswa.cpp +0 -128
- data/ext/sources/examples/talk-llama/models/gemma3.cpp +0 -155
- data/ext/sources/examples/talk-llama/models/gemma3n-iswa.cpp +0 -384
- data/ext/sources/examples/talk-llama/models/glm4-moe.cpp +0 -170
- data/ext/sources/examples/talk-llama/models/glm4.cpp +0 -150
- data/ext/sources/examples/talk-llama/models/gpt2.cpp +0 -105
- data/ext/sources/examples/talk-llama/models/gptneox.cpp +0 -144
- data/ext/sources/examples/talk-llama/models/granite-hybrid.cpp +0 -196
- data/ext/sources/examples/talk-llama/models/granite.cpp +0 -211
- data/ext/sources/examples/talk-llama/models/graph-context-mamba.cpp +0 -283
- data/ext/sources/examples/talk-llama/models/grok.cpp +0 -159
- data/ext/sources/examples/talk-llama/models/grovemoe.cpp +0 -141
- data/ext/sources/examples/talk-llama/models/hunyuan-dense.cpp +0 -132
- data/ext/sources/examples/talk-llama/models/hunyuan-moe.cpp +0 -154
- data/ext/sources/examples/talk-llama/models/internlm2.cpp +0 -120
- data/ext/sources/examples/talk-llama/models/jais.cpp +0 -86
- data/ext/sources/examples/talk-llama/models/jamba.cpp +0 -106
- data/ext/sources/examples/talk-llama/models/lfm2.cpp +0 -175
- data/ext/sources/examples/talk-llama/models/llada-moe.cpp +0 -122
- data/ext/sources/examples/talk-llama/models/llada.cpp +0 -99
- data/ext/sources/examples/talk-llama/models/llama-iswa.cpp +0 -178
- data/ext/sources/examples/talk-llama/models/llama.cpp +0 -168
- data/ext/sources/examples/talk-llama/models/maincoder.cpp +0 -117
- data/ext/sources/examples/talk-llama/models/mamba.cpp +0 -55
- data/ext/sources/examples/talk-llama/models/mimo2-iswa.cpp +0 -123
- data/ext/sources/examples/talk-llama/models/minicpm3.cpp +0 -199
- data/ext/sources/examples/talk-llama/models/minimax-m2.cpp +0 -124
- data/ext/sources/examples/talk-llama/models/mistral3.cpp +0 -160
- data/ext/sources/examples/talk-llama/models/models.h +0 -569
- data/ext/sources/examples/talk-llama/models/modern-bert.cpp +0 -116
- data/ext/sources/examples/talk-llama/models/mpt.cpp +0 -126
- data/ext/sources/examples/talk-llama/models/nemotron-h.cpp +0 -150
- data/ext/sources/examples/talk-llama/models/nemotron.cpp +0 -122
- data/ext/sources/examples/talk-llama/models/neo-bert.cpp +0 -104
- data/ext/sources/examples/talk-llama/models/olmo.cpp +0 -121
- data/ext/sources/examples/talk-llama/models/olmo2.cpp +0 -150
- data/ext/sources/examples/talk-llama/models/olmoe.cpp +0 -124
- data/ext/sources/examples/talk-llama/models/openai-moe-iswa.cpp +0 -127
- data/ext/sources/examples/talk-llama/models/openelm.cpp +0 -124
- data/ext/sources/examples/talk-llama/models/orion.cpp +0 -123
- data/ext/sources/examples/talk-llama/models/pangu-embedded.cpp +0 -121
- data/ext/sources/examples/talk-llama/models/phi2.cpp +0 -121
- data/ext/sources/examples/talk-llama/models/phi3.cpp +0 -152
- data/ext/sources/examples/talk-llama/models/plamo.cpp +0 -110
- data/ext/sources/examples/talk-llama/models/plamo2.cpp +0 -316
- data/ext/sources/examples/talk-llama/models/plamo3.cpp +0 -128
- data/ext/sources/examples/talk-llama/models/plm.cpp +0 -168
- data/ext/sources/examples/talk-llama/models/qwen.cpp +0 -108
- data/ext/sources/examples/talk-llama/models/qwen2.cpp +0 -126
- data/ext/sources/examples/talk-llama/models/qwen2moe.cpp +0 -151
- data/ext/sources/examples/talk-llama/models/qwen2vl.cpp +0 -117
- data/ext/sources/examples/talk-llama/models/qwen3.cpp +0 -117
- data/ext/sources/examples/talk-llama/models/qwen3moe.cpp +0 -124
- data/ext/sources/examples/talk-llama/models/qwen3next.cpp +0 -873
- data/ext/sources/examples/talk-llama/models/qwen3vl-moe.cpp +0 -149
- data/ext/sources/examples/talk-llama/models/qwen3vl.cpp +0 -141
- data/ext/sources/examples/talk-llama/models/refact.cpp +0 -94
- data/ext/sources/examples/talk-llama/models/rnd1.cpp +0 -126
- data/ext/sources/examples/talk-llama/models/rwkv6-base.cpp +0 -162
- data/ext/sources/examples/talk-llama/models/rwkv6.cpp +0 -94
- data/ext/sources/examples/talk-llama/models/rwkv6qwen2.cpp +0 -86
- data/ext/sources/examples/talk-llama/models/rwkv7-base.cpp +0 -135
- data/ext/sources/examples/talk-llama/models/rwkv7.cpp +0 -90
- data/ext/sources/examples/talk-llama/models/seed-oss.cpp +0 -124
- data/ext/sources/examples/talk-llama/models/smallthinker.cpp +0 -126
- data/ext/sources/examples/talk-llama/models/smollm3.cpp +0 -128
- data/ext/sources/examples/talk-llama/models/stablelm.cpp +0 -146
- data/ext/sources/examples/talk-llama/models/starcoder.cpp +0 -100
- data/ext/sources/examples/talk-llama/models/starcoder2.cpp +0 -121
- data/ext/sources/examples/talk-llama/models/t5-dec.cpp +0 -166
- data/ext/sources/examples/talk-llama/models/t5-enc.cpp +0 -96
- data/ext/sources/examples/talk-llama/models/wavtokenizer-dec.cpp +0 -149
- data/ext/sources/examples/talk-llama/models/xverse.cpp +0 -108
- data/ext/sources/examples/talk-llama/prompts/talk-alpaca.txt +0 -23
- data/ext/sources/examples/talk-llama/speak +0 -40
- data/ext/sources/examples/talk-llama/speak.bat +0 -1
- data/ext/sources/examples/talk-llama/speak.ps1 +0 -14
- data/ext/sources/examples/talk-llama/talk-llama.cpp +0 -813
- data/ext/sources/examples/talk-llama/unicode-data.cpp +0 -7034
- data/ext/sources/examples/talk-llama/unicode-data.h +0 -20
- data/ext/sources/examples/talk-llama/unicode.cpp +0 -1147
- data/ext/sources/examples/talk-llama/unicode.h +0 -111
- data/ext/sources/examples/wchess/CMakeLists.txt +0 -10
- data/ext/sources/examples/wchess/libwchess/CMakeLists.txt +0 -19
- data/ext/sources/examples/wchess/libwchess/Chessboard.cpp +0 -803
- data/ext/sources/examples/wchess/libwchess/Chessboard.h +0 -33
- data/ext/sources/examples/wchess/libwchess/WChess.cpp +0 -193
- data/ext/sources/examples/wchess/libwchess/WChess.h +0 -63
- data/ext/sources/examples/wchess/libwchess/test-chessboard.cpp +0 -117
- data/ext/sources/examples/wchess/wchess.cmd/CMakeLists.txt +0 -8
- data/ext/sources/examples/wchess/wchess.cmd/wchess.cmd.cpp +0 -253
- data/ext/sources/examples/whisper.wasm/CMakeLists.txt +0 -50
- data/ext/sources/examples/whisper.wasm/emscripten.cpp +0 -118
- data/ext/sources/examples/whisper.wasm/index-tmpl.html +0 -659
- data/ext/sources/ggml/cmake/BuildTypes.cmake +0 -54
- data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm-ppc.h +0 -333
- data/ext/sources/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +0 -99
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp-dma.h +0 -157
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp-msg.h +0 -165
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-exp.c +0 -94
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-inverse.c +0 -72
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sigmoid.c +0 -49
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-utils.c +0 -1020
- data/ext/sources/ggml/src/ggml-hexagon/htp/ops-utils.h +0 -149
- data/ext/sources/ggml/src/ggml-hexagon/htp-utils.c +0 -454
- data/ext/sources/ggml/src/ggml-hexagon/htp-utils.h +0 -221
- data/ext/sources/ggml/src/ggml-hexagon/op-desc.h +0 -153
- data/ext/sources/ggml/src/ggml-opencl/kernels/embed_kernel.py +0 -26
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rte.glsl +0 -5
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/bin_op.tmpl.wgsl +0 -188
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary_head.tmpl +0 -45
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +0 -147
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/glu.tmpl.wgsl +0 -323
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl +0 -907
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.tmpl.wgsl +0 -247
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.tmpl.wgsl +0 -267
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm.wgsl +0 -123
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.tmpl.wgsl +0 -112
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/unary_op.wgsl +0 -483
- data/ext/sources/tests/CMakeLists.txt +0 -112
- data/ext/sources/tests/earnings21/eval.mk +0 -58
- data/ext/sources/tests/earnings21/eval.py +0 -68
- data/ext/sources/tests/earnings21/normalizers/__init__.py +0 -2
- data/ext/sources/tests/earnings21/normalizers/basic.py +0 -80
- data/ext/sources/tests/earnings21/normalizers/english.json +0 -1741
- data/ext/sources/tests/earnings21/normalizers/english.py +0 -550
- data/ext/sources/tests/earnings21/requirements.txt +0 -6
- data/ext/sources/tests/en-0-ref.txt +0 -1
- data/ext/sources/tests/en-1-ref.txt +0 -1
- data/ext/sources/tests/en-2-ref.txt +0 -1
- data/ext/sources/tests/es-0-ref.txt +0 -1
- data/ext/sources/tests/librispeech/eval.mk +0 -39
- data/ext/sources/tests/librispeech/eval.py +0 -47
- data/ext/sources/tests/librispeech/normalizers/__init__.py +0 -2
- data/ext/sources/tests/librispeech/normalizers/basic.py +0 -80
- data/ext/sources/tests/librispeech/normalizers/english.json +0 -1741
- data/ext/sources/tests/librispeech/normalizers/english.py +0 -550
- data/ext/sources/tests/librispeech/requirements.txt +0 -6
- data/ext/sources/tests/run-tests.sh +0 -130
- data/ext/sources/tests/test-c.c +0 -3
- data/ext/sources/tests/test-vad-full.cpp +0 -56
- data/ext/sources/tests/test-vad.cpp +0 -83
- data/ext/sources/tests/test-whisper.js +0 -58
- data/lib/whisper/context.rb +0 -15
- data/lib/whisper/segment.rb +0 -58
|
@@ -48,6 +48,90 @@ static inline int nearest_int(float fval) {
|
|
|
48
48
|
|
|
49
49
|
extern "C" {
|
|
50
50
|
|
|
51
|
+
#if defined __riscv_zvfh
|
|
52
|
+
void ggml_quantize_mat_q8_0_4x1_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
|
|
53
|
+
assert(QK8_0 == 32);
|
|
54
|
+
assert(k % QK8_0 == 0);
|
|
55
|
+
const int nb = k / QK8_0;
|
|
56
|
+
|
|
57
|
+
block_q8_0x4 * GGML_RESTRICT y = (block_q8_0x4 *) vy;
|
|
58
|
+
|
|
59
|
+
// scalar
|
|
60
|
+
const int blck_size_interleave = 1;
|
|
61
|
+
float srcv[4][QK8_0];
|
|
62
|
+
float id[4];
|
|
63
|
+
|
|
64
|
+
for (int i = 0; i < nb; i++) {
|
|
65
|
+
for (int row_iter = 0; row_iter < 4; row_iter++) {
|
|
66
|
+
float amax = 0.0f; // absolute max
|
|
67
|
+
|
|
68
|
+
for (int j = 0; j < QK8_0; j++) {
|
|
69
|
+
srcv[row_iter][j] = x[row_iter * k + i * QK8_0 + j];
|
|
70
|
+
amax = MAX(amax, fabsf(srcv[row_iter][j]));
|
|
71
|
+
}
|
|
72
|
+
|
|
73
|
+
const float d = amax / ((1 << 7) - 1);
|
|
74
|
+
id[row_iter] = d ? 1.0f / d : 0.0f;
|
|
75
|
+
|
|
76
|
+
y[i].d[row_iter] = GGML_CPU_FP32_TO_FP16(d);
|
|
77
|
+
}
|
|
78
|
+
|
|
79
|
+
for (int j = 0; j < QK8_0 * 4; j++) {
|
|
80
|
+
int src_offset = (j / (4 * blck_size_interleave)) * blck_size_interleave;
|
|
81
|
+
int src_id = (j % (4 * blck_size_interleave)) / blck_size_interleave;
|
|
82
|
+
src_offset += (j % blck_size_interleave);
|
|
83
|
+
|
|
84
|
+
float x0 = srcv[src_id][src_offset] * id[src_id];
|
|
85
|
+
y[i].qs[j] = roundf(x0);
|
|
86
|
+
}
|
|
87
|
+
}
|
|
88
|
+
}
|
|
89
|
+
|
|
90
|
+
void ggml_quantize_mat_q8_K_4x1_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
|
|
91
|
+
assert(QK_K == 256);
|
|
92
|
+
assert(k % QK_K == 0);
|
|
93
|
+
const int nb = k / QK_K;
|
|
94
|
+
|
|
95
|
+
block_q8_Kx4 * GGML_RESTRICT y = (block_q8_Kx4 *) vy;
|
|
96
|
+
|
|
97
|
+
const int blck_size_interleave = 1;
|
|
98
|
+
float srcv[4][QK_K];
|
|
99
|
+
float iscale[4];
|
|
100
|
+
|
|
101
|
+
for (int i = 0; i < nb; i++) {
|
|
102
|
+
for (int row_iter = 0; row_iter < 4; row_iter++) {
|
|
103
|
+
float amax = 0.0f; // absolute max
|
|
104
|
+
float max = 0;
|
|
105
|
+
|
|
106
|
+
for (int j = 0; j < QK_K; j++) {
|
|
107
|
+
srcv[row_iter][j] = x[row_iter * k + i * QK_K + j];
|
|
108
|
+
// Update the maximum value of the corresponding super block
|
|
109
|
+
if(amax < fabsf(srcv[row_iter][j])) {
|
|
110
|
+
amax = fabsf(srcv[row_iter][j]);
|
|
111
|
+
max = srcv[row_iter][j];
|
|
112
|
+
}
|
|
113
|
+
}
|
|
114
|
+
|
|
115
|
+
iscale[row_iter] = amax ? -127.f/max : 0;
|
|
116
|
+
y[i].d[row_iter] = amax ? 1/iscale[row_iter] : 0;
|
|
117
|
+
}
|
|
118
|
+
|
|
119
|
+
for (int j = 0; j < QK_K / 4; j++) {
|
|
120
|
+
y[i].bsums[j] = 0;
|
|
121
|
+
}
|
|
122
|
+
for (int j = 0; j < QK_K * 4; j++) {
|
|
123
|
+
int src_id = j % 4;
|
|
124
|
+
int src_offset = j / 4;
|
|
125
|
+
int index = ((j >> 6) << 2) + (j & 3);
|
|
126
|
+
|
|
127
|
+
float x0 = srcv[src_id][src_offset] * iscale[src_id];
|
|
128
|
+
y[i].qs[j] = nearest_int(x0);
|
|
129
|
+
y[i].bsums[index] += y[i].qs[j];
|
|
130
|
+
}
|
|
131
|
+
}
|
|
132
|
+
}
|
|
133
|
+
#endif
|
|
134
|
+
|
|
51
135
|
void ggml_quantize_mat_q8_0_4x4_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
|
|
52
136
|
assert(QK8_0 == 32);
|
|
53
137
|
assert(k % QK8_0 == 0);
|
|
@@ -124,7 +208,6 @@ void ggml_quantize_mat_q8_0_4x8_generic(const float * GGML_RESTRICT x, void * GG
|
|
|
124
208
|
}
|
|
125
209
|
}
|
|
126
210
|
|
|
127
|
-
|
|
128
211
|
void ggml_quantize_mat_q8_K_4x4_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
|
|
129
212
|
assert(QK_K == 256);
|
|
130
213
|
assert(k % QK_K == 0);
|
|
@@ -256,192 +339,289 @@ template <> void ggml_quantize_mat_t<8, GGML_TYPE_Q8_K>(const float * GGML_RESTR
|
|
|
256
339
|
ggml_quantize_mat_q8_K_4x8(x, vy, n_per_row);
|
|
257
340
|
}
|
|
258
341
|
|
|
259
|
-
|
|
342
|
+
#if defined __riscv_zvfh
|
|
343
|
+
template <> void ggml_quantize_mat_t<1, GGML_TYPE_Q8_0>(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t nrow, int64_t n_per_row) {
|
|
344
|
+
assert(nrow == 4);
|
|
345
|
+
UNUSED(nrow);
|
|
346
|
+
ggml_quantize_mat_q8_0_4x1(x, vy, n_per_row);
|
|
347
|
+
}
|
|
260
348
|
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
349
|
+
template <> void ggml_quantize_mat_t<1, GGML_TYPE_Q8_K>(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t nrow, int64_t n_per_row) {
|
|
350
|
+
assert(nrow == 4);
|
|
351
|
+
UNUSED(nrow);
|
|
352
|
+
ggml_quantize_mat_q8_K_4x1(x, vy, n_per_row);
|
|
353
|
+
}
|
|
354
|
+
#endif
|
|
355
|
+
|
|
356
|
+
template <int M, int N>
|
|
357
|
+
static void ggml_gemv_q6_K_NxM_q8_K_generic_impl(int n,
|
|
358
|
+
float * GGML_RESTRICT s,
|
|
359
|
+
size_t bs,
|
|
360
|
+
const void * GGML_RESTRICT vx,
|
|
361
|
+
const void * GGML_RESTRICT vy,
|
|
362
|
+
int nr,
|
|
363
|
+
int nc) {
|
|
364
|
+
constexpr int blocklen = M;
|
|
365
|
+
constexpr int ncols_interleaved = N;
|
|
366
|
+
const int qk = QK_K;
|
|
367
|
+
const int nb = n / qk;
|
|
368
|
+
const int blocks_per_half = 64 / blocklen;
|
|
266
369
|
|
|
267
|
-
assert(nr == 1);
|
|
268
370
|
assert(n % qk == 0);
|
|
269
371
|
assert(nc % ncols_interleaved == 0);
|
|
270
372
|
|
|
271
|
-
UNUSED(s);
|
|
272
373
|
UNUSED(bs);
|
|
273
|
-
UNUSED(vx);
|
|
274
|
-
UNUSED(vy);
|
|
275
374
|
UNUSED(nr);
|
|
276
|
-
UNUSED(nc);
|
|
277
|
-
UNUSED(nb);
|
|
278
|
-
UNUSED(ncols_interleaved);
|
|
279
|
-
UNUSED(blocklen);
|
|
280
375
|
|
|
281
|
-
float sumf[
|
|
282
|
-
int sumi;
|
|
376
|
+
float sumf[8];
|
|
283
377
|
|
|
284
|
-
const
|
|
378
|
+
const block_q8_K * a_ptr = (const block_q8_K *) vy;
|
|
285
379
|
for (int x = 0; x < nc / ncols_interleaved; x++) {
|
|
286
|
-
const
|
|
380
|
+
const block_q6_Kx8 * b_ptr = (const block_q6_Kx8 *) vx + (x * nb);
|
|
381
|
+
|
|
382
|
+
for (int j = 0; j < ncols_interleaved; j++) {
|
|
383
|
+
sumf[j] = 0.0f;
|
|
384
|
+
}
|
|
287
385
|
|
|
288
|
-
for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
|
|
289
386
|
for (int l = 0; l < nb; l++) {
|
|
290
387
|
for (int k = 0; k < (qk / (2 * blocklen)); k++) {
|
|
388
|
+
const int base_l = (k / blocks_per_half) * 128 + (k % blocks_per_half) * blocklen;
|
|
389
|
+
const int base_h = base_l + 64;
|
|
390
|
+
|
|
391
|
+
const int scale_idx_l = base_l / 16;
|
|
392
|
+
const int scale_idx_h = base_h / 16;
|
|
393
|
+
|
|
394
|
+
const int qh_shift_l = ((base_l % 128) / 32) * 2;
|
|
395
|
+
const int qh_shift_h = ((base_h % 128) / 32) * 2;
|
|
396
|
+
|
|
397
|
+
const int qh_half_l = (base_l / 128) * 32;
|
|
398
|
+
const int qh_half_h = (base_h / 128) * 32;
|
|
399
|
+
|
|
291
400
|
for (int j = 0; j < ncols_interleaved; j++) {
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
401
|
+
const int8_t scale_l = b_ptr[l].scales[scale_idx_l * ncols_interleaved + j];
|
|
402
|
+
const int8_t scale_h = b_ptr[l].scales[scale_idx_h * ncols_interleaved + j];
|
|
403
|
+
|
|
404
|
+
int sumi_l = 0;
|
|
405
|
+
int sumi_h = 0;
|
|
406
|
+
|
|
407
|
+
for (int i = 0; i < blocklen; i++) {
|
|
408
|
+
const int ql_pos = k * ncols_interleaved * blocklen + j * blocklen + i;
|
|
409
|
+
const int l_4 = b_ptr[l].ql[ql_pos] & 0xF;
|
|
410
|
+
const int hi_4 = (b_ptr[l].ql[ql_pos] >> 4) & 0xF;
|
|
411
|
+
|
|
412
|
+
const int qh_idx_l = qh_half_l + ((base_l + i) % 32);
|
|
413
|
+
const int qh_chunk_l = qh_idx_l / blocklen;
|
|
414
|
+
const int qh_pos_l = qh_idx_l % blocklen;
|
|
415
|
+
const int qh_offset_l = qh_chunk_l * (blocklen * ncols_interleaved) + j * blocklen + qh_pos_l;
|
|
416
|
+
const int hi_2_l = (b_ptr[l].qh[qh_offset_l] >> qh_shift_l) & 0x3;
|
|
417
|
+
|
|
418
|
+
const int qh_idx_h = qh_half_h + ((base_h + i) % 32);
|
|
419
|
+
const int qh_chunk_h = qh_idx_h / blocklen;
|
|
420
|
+
const int qh_pos_h = qh_idx_h % blocklen;
|
|
421
|
+
const int qh_offset_h = qh_chunk_h * (blocklen * ncols_interleaved) + j * blocklen + qh_pos_h;
|
|
422
|
+
const int hi_2_h = (b_ptr[l].qh[qh_offset_h] >> qh_shift_h) & 0x3;
|
|
423
|
+
|
|
424
|
+
const int q_l = ((hi_2_l << 4) | l_4) - 32;
|
|
425
|
+
const int q_h = ((hi_2_h << 4) | hi_4) - 32;
|
|
426
|
+
|
|
427
|
+
const int8_t a_l = a_ptr[l].qs[base_l + i];
|
|
428
|
+
const int8_t a_h = a_ptr[l].qs[base_h + i];
|
|
429
|
+
|
|
430
|
+
sumi_l += q_l * a_l;
|
|
431
|
+
sumi_h += q_h * a_h;
|
|
297
432
|
}
|
|
298
|
-
|
|
433
|
+
|
|
434
|
+
sumf[j] +=
|
|
435
|
+
(sumi_l * scale_l + sumi_h * scale_h) * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d;
|
|
299
436
|
}
|
|
300
437
|
}
|
|
301
438
|
}
|
|
302
|
-
|
|
439
|
+
|
|
440
|
+
for (int j = 0; j < ncols_interleaved; j++) {
|
|
441
|
+
s[x * ncols_interleaved + j] = sumf[j];
|
|
442
|
+
}
|
|
303
443
|
}
|
|
304
444
|
}
|
|
305
445
|
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
446
|
+
template <int M, int N>
|
|
447
|
+
static void ggml_gemm_q6_K_NxM_q8_K_generic_impl(int n,
|
|
448
|
+
float * GGML_RESTRICT s,
|
|
449
|
+
size_t bs,
|
|
450
|
+
const void * GGML_RESTRICT vx,
|
|
451
|
+
const void * GGML_RESTRICT vy,
|
|
452
|
+
int nr,
|
|
453
|
+
int nc) {
|
|
454
|
+
constexpr int blocklen = M;
|
|
455
|
+
constexpr int ncols_interleaved = N;
|
|
456
|
+
const int qk = QK_K;
|
|
457
|
+
const int nb = n / qk;
|
|
458
|
+
const int blocks_per_half = 64 / blocklen;
|
|
459
|
+
const int q8_half_stride = 512;
|
|
460
|
+
const int q8_low_high_step = 256;
|
|
311
461
|
|
|
312
|
-
assert
|
|
313
|
-
assert
|
|
462
|
+
assert(n % qk == 0);
|
|
463
|
+
assert(nr % 4 == 0);
|
|
464
|
+
assert(nc % ncols_interleaved == 0);
|
|
314
465
|
|
|
315
|
-
UNUSED(s);
|
|
316
466
|
UNUSED(bs);
|
|
317
|
-
UNUSED(vx);
|
|
318
|
-
UNUSED(vy);
|
|
319
|
-
UNUSED(nr);
|
|
320
|
-
UNUSED(nc);
|
|
321
|
-
UNUSED(nb);
|
|
322
|
-
UNUSED(ncols_interleaved);
|
|
323
|
-
UNUSED(blocklen);
|
|
324
467
|
|
|
325
|
-
float sumf[4];
|
|
326
|
-
int sumi;
|
|
468
|
+
float sumf[4][8];
|
|
327
469
|
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
470
|
+
for (int y = 0; y < nr / 4; y++) {
|
|
471
|
+
const block_q8_Kx4 * a_ptr = (const block_q8_Kx4 *) vy + (y * nb);
|
|
472
|
+
for (int x = 0; x < nc / ncols_interleaved; x++) {
|
|
473
|
+
const block_q6_Kx8 * b_ptr = (const block_q6_Kx8 *) vx + (x * nb);
|
|
331
474
|
|
|
332
|
-
|
|
333
|
-
for (int l = 0; l < nb; l++) {
|
|
334
|
-
for (int k = 0; k < (qk / (2 * blocklen)); k++) {
|
|
475
|
+
for (int m = 0; m < 4; m++) {
|
|
335
476
|
for (int j = 0; j < ncols_interleaved; j++) {
|
|
336
|
-
|
|
337
|
-
for (int i = 0; i < blocklen; ++i) {
|
|
338
|
-
const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
|
|
339
|
-
const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
|
|
340
|
-
sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
|
|
341
|
-
}
|
|
342
|
-
sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
|
|
477
|
+
sumf[m][j] = 0.0f;
|
|
343
478
|
}
|
|
344
479
|
}
|
|
345
|
-
}
|
|
346
|
-
for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
|
|
347
|
-
}
|
|
348
|
-
}
|
|
349
480
|
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
const int blocklen = 8;
|
|
481
|
+
for (int l = 0; l < nb; l++) {
|
|
482
|
+
for (int k = 0; k < (qk / (2 * blocklen)); k++) {
|
|
483
|
+
const int base_l = (k / blocks_per_half) * 128 + (k % blocks_per_half) * blocklen;
|
|
484
|
+
const int base_h = base_l + 64;
|
|
355
485
|
|
|
356
|
-
|
|
357
|
-
|
|
486
|
+
const int scale_idx_l = base_l / 16;
|
|
487
|
+
const int scale_idx_h = base_h / 16;
|
|
358
488
|
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
UNUSED(vx);
|
|
362
|
-
UNUSED(vy);
|
|
363
|
-
UNUSED(nr);
|
|
364
|
-
UNUSED(nc);
|
|
365
|
-
UNUSED(nb);
|
|
366
|
-
UNUSED(ncols_interleaved);
|
|
367
|
-
UNUSED(blocklen);
|
|
489
|
+
const int qh_shift_l = ((base_l % 128) / 32) * 2;
|
|
490
|
+
const int qh_shift_h = ((base_h % 128) / 32) * 2;
|
|
368
491
|
|
|
369
|
-
|
|
370
|
-
|
|
492
|
+
const int qh_half_l = (base_l / 128) * 32;
|
|
493
|
+
const int qh_half_h = (base_h / 128) * 32;
|
|
371
494
|
|
|
372
|
-
|
|
373
|
-
for (int x = 0; x < nc / ncols_interleaved; x++) {
|
|
374
|
-
const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb);
|
|
495
|
+
const int q8_base = (k / blocks_per_half) * q8_half_stride + (k % blocks_per_half) * (blocklen * 4);
|
|
375
496
|
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
|
|
497
|
+
for (int m = 0; m < 4; m++) {
|
|
498
|
+
for (int j = 0; j < ncols_interleaved; j++) {
|
|
499
|
+
const int8_t scale_l = b_ptr[l].scales[scale_idx_l * ncols_interleaved + j];
|
|
500
|
+
const int8_t scale_h = b_ptr[l].scales[scale_idx_h * ncols_interleaved + j];
|
|
501
|
+
|
|
502
|
+
int sumi_l = 0;
|
|
503
|
+
int sumi_h = 0;
|
|
504
|
+
|
|
505
|
+
for (int i = 0; i < blocklen; i++) {
|
|
506
|
+
const int ql_pos = k * ncols_interleaved * blocklen + j * blocklen + i;
|
|
507
|
+
const int l_4 = b_ptr[l].ql[ql_pos] & 0xF;
|
|
508
|
+
const int hi_4 = (b_ptr[l].ql[ql_pos] >> 4) & 0xF;
|
|
509
|
+
|
|
510
|
+
const int qh_idx_l = qh_half_l + ((base_l + i) % 32);
|
|
511
|
+
const int qh_chunk_l = qh_idx_l / blocklen;
|
|
512
|
+
const int qh_pos_l = qh_idx_l % blocklen;
|
|
513
|
+
const int qh_offset_l =
|
|
514
|
+
qh_chunk_l * (blocklen * ncols_interleaved) + j * blocklen + qh_pos_l;
|
|
515
|
+
const int hi_2_l = (b_ptr[l].qh[qh_offset_l] >> qh_shift_l) & 0x3;
|
|
516
|
+
|
|
517
|
+
const int qh_idx_h = qh_half_h + ((base_h + i) % 32);
|
|
518
|
+
const int qh_chunk_h = qh_idx_h / blocklen;
|
|
519
|
+
const int qh_pos_h = qh_idx_h % blocklen;
|
|
520
|
+
const int qh_offset_h =
|
|
521
|
+
qh_chunk_h * (blocklen * ncols_interleaved) + j * blocklen + qh_pos_h;
|
|
522
|
+
const int hi_2_h = (b_ptr[l].qh[qh_offset_h] >> qh_shift_h) & 0x3;
|
|
523
|
+
|
|
524
|
+
const int q_l = ((hi_2_l << 4) | l_4) - 32;
|
|
525
|
+
const int q_h = ((hi_2_h << 4) | hi_4) - 32;
|
|
526
|
+
|
|
527
|
+
const int8_t q8_l = a_ptr[l].qs[q8_base + m * blocklen + i];
|
|
528
|
+
const int8_t q8_h = a_ptr[l].qs[q8_base + m * blocklen + i + q8_low_high_step];
|
|
529
|
+
|
|
530
|
+
sumi_l += q_l * q8_l;
|
|
531
|
+
sumi_h += q_h * q8_h;
|
|
532
|
+
}
|
|
533
|
+
|
|
534
|
+
sumf[m][j] += (sumi_l * scale_l + sumi_h * scale_h) * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) *
|
|
535
|
+
a_ptr[l].d[m];
|
|
536
|
+
}
|
|
385
537
|
}
|
|
386
|
-
|
|
538
|
+
}
|
|
539
|
+
}
|
|
540
|
+
|
|
541
|
+
for (int m = 0; m < 4; m++) {
|
|
542
|
+
for (int j = 0; j < ncols_interleaved; j++) {
|
|
543
|
+
s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
|
|
387
544
|
}
|
|
388
545
|
}
|
|
389
546
|
}
|
|
390
|
-
for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
|
|
391
547
|
}
|
|
392
548
|
}
|
|
393
549
|
|
|
394
|
-
|
|
395
|
-
|
|
396
|
-
|
|
397
|
-
|
|
398
|
-
|
|
399
|
-
|
|
400
|
-
|
|
401
|
-
|
|
550
|
+
template <int M, int N>
|
|
551
|
+
static void ggml_gemv_q5_K_NxM_q8_K_generic_impl(int n,
|
|
552
|
+
float * GGML_RESTRICT s,
|
|
553
|
+
size_t bs,
|
|
554
|
+
const void * GGML_RESTRICT vx,
|
|
555
|
+
const void * GGML_RESTRICT vy,
|
|
556
|
+
int nr,
|
|
557
|
+
int nc) {
|
|
558
|
+
constexpr int blocklen = M;
|
|
559
|
+
constexpr int ncols_interleaved = N;
|
|
560
|
+
const int qk = QK_K;
|
|
561
|
+
const int nb = n / qk;
|
|
562
|
+
static const uint32_t kmask1 = 0x3f3f3f3f;
|
|
563
|
+
static const uint32_t kmask2 = 0x0f0f0f0f;
|
|
564
|
+
static const uint32_t kmask3 = 0x03030303;
|
|
402
565
|
|
|
403
|
-
assert
|
|
404
|
-
assert
|
|
566
|
+
assert(n % qk == 0);
|
|
567
|
+
assert(nc % ncols_interleaved == 0);
|
|
405
568
|
|
|
406
569
|
UNUSED(bs);
|
|
407
570
|
UNUSED(nr);
|
|
408
571
|
|
|
409
|
-
float
|
|
410
|
-
float
|
|
572
|
+
float sumf[ncols_interleaved];
|
|
573
|
+
float sum_minf[ncols_interleaved];
|
|
411
574
|
uint32_t utmp[32];
|
|
412
|
-
int
|
|
413
|
-
int
|
|
414
|
-
int
|
|
575
|
+
int sumi1;
|
|
576
|
+
int sumi2;
|
|
577
|
+
int sumi;
|
|
415
578
|
|
|
416
579
|
const block_q8_K * a_ptr = (const block_q8_K *) vy;
|
|
417
580
|
for (int x = 0; x < nc / ncols_interleaved; x++) {
|
|
418
|
-
const
|
|
581
|
+
const block_q5_Kx8 * b_ptr = (const block_q5_Kx8 *) vx + (x * nb);
|
|
419
582
|
|
|
420
583
|
for (int j = 0; j < ncols_interleaved; j++) {
|
|
421
|
-
sumf[j]
|
|
584
|
+
sumf[j] = 0.0;
|
|
422
585
|
sum_minf[j] = 0.0;
|
|
423
586
|
}
|
|
424
587
|
for (int l = 0; l < nb; l++) {
|
|
425
588
|
for (int sb = 0; sb < 8; sb++) {
|
|
426
|
-
memcpy(utmp + sb * 4, b_ptr[l].scales + sb *
|
|
427
|
-
utmp[sb * 4 + 3]
|
|
589
|
+
memcpy(utmp + sb * 4, b_ptr[l].scales + sb * K_SCALE_SIZE, K_SCALE_SIZE);
|
|
590
|
+
utmp[sb * 4 + 3] = ((utmp[sb * 4 + 2] >> 4) & kmask2) | (((utmp[sb * 4 + 1] >> 6) & kmask3) << 4);
|
|
428
591
|
const uint32_t uaux_0 = utmp[sb * 4 + 1] & kmask1;
|
|
429
|
-
utmp[sb * 4 + 1]
|
|
430
|
-
utmp[sb * 4 + 2]
|
|
592
|
+
utmp[sb * 4 + 1] = (utmp[sb * 4 + 2] & kmask2) | (((utmp[sb * 4 + 0] >> 6) & kmask3) << 4);
|
|
593
|
+
utmp[sb * 4 + 2] = uaux_0;
|
|
431
594
|
utmp[sb * 4 + 0] &= kmask1;
|
|
432
595
|
}
|
|
433
596
|
for (int k = 0; k < (qk / (2 * blocklen)); k++) {
|
|
434
|
-
|
|
435
|
-
uint8_t *
|
|
597
|
+
constexpr int scale_stride = 32;
|
|
598
|
+
uint8_t * scales_0 = (uint8_t *) utmp + (k / (32 / blocklen)) * scale_stride;
|
|
599
|
+
uint8_t * scales_1 = (uint8_t *) utmp + (k / (32 / blocklen)) * scale_stride + 16;
|
|
600
|
+
|
|
601
|
+
const int qh_shift = (k / (32 / blocklen)) * 2;
|
|
436
602
|
for (int j = 0; j < ncols_interleaved; j++) {
|
|
437
603
|
sumi1 = 0;
|
|
438
604
|
sumi2 = 0;
|
|
439
|
-
sumi
|
|
605
|
+
sumi = 0;
|
|
440
606
|
for (int i = 0; i < blocklen; ++i) {
|
|
441
|
-
const int
|
|
442
|
-
|
|
443
|
-
|
|
444
|
-
|
|
607
|
+
const int b_qs_offset = k * ncols_interleaved * blocklen + j * blocklen + i;
|
|
608
|
+
|
|
609
|
+
const int qh_idx = (k * blocklen + i) % 32;
|
|
610
|
+
const int qh_chunk = qh_idx / blocklen;
|
|
611
|
+
const int qh_pos = qh_idx % blocklen;
|
|
612
|
+
const int b_qh_offset = qh_chunk * (blocklen * ncols_interleaved) + j * blocklen + qh_pos;
|
|
613
|
+
|
|
614
|
+
const uint8_t qh_val = b_ptr[l].qh[b_qh_offset];
|
|
615
|
+
const uint8_t h0 = (qh_val >> qh_shift) & 1;
|
|
616
|
+
const uint8_t h1 = (qh_val >> (qh_shift + 1)) & 1;
|
|
617
|
+
|
|
618
|
+
const int v0 = (int8_t) ((b_ptr[l].qs[b_qs_offset] & 0xF) | (h0 << 4));
|
|
619
|
+
const int v1 = (int8_t) ((b_ptr[l].qs[b_qs_offset] >> 4) | (h1 << 4));
|
|
620
|
+
|
|
621
|
+
const int q8_offset = (k / (32 / blocklen)) * 64 + (k % (32 / blocklen)) * blocklen + i;
|
|
622
|
+
|
|
623
|
+
sumi1 = (v0 * a_ptr[l].qs[q8_offset]);
|
|
624
|
+
sumi2 = (v1 * a_ptr[l].qs[q8_offset + 32]);
|
|
445
625
|
sumi1 = sumi1 * scales_0[j];
|
|
446
626
|
sumi2 = sumi2 * scales_1[j];
|
|
447
627
|
sumi += sumi1 + sumi2;
|
|
@@ -452,7 +632,8 @@ void ggml_gemv_q4_K_8x4_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
|
|
|
452
632
|
for (int sb = 0; sb < 8; sb++) {
|
|
453
633
|
uint8_t * mins = (uint8_t *) utmp + 8 + sb * 16;
|
|
454
634
|
for (int j = 0; j < ncols_interleaved; j++) {
|
|
455
|
-
sum_minf[j] += mins[j] * (a_ptr[l].bsums[sb * 2] + a_ptr[l].bsums[sb * 2 + 1]) *
|
|
635
|
+
sum_minf[j] += mins[j] * (a_ptr[l].bsums[sb * 2] + a_ptr[l].bsums[sb * 2 + 1]) *
|
|
636
|
+
GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d;
|
|
456
637
|
}
|
|
457
638
|
}
|
|
458
639
|
}
|
|
@@ -462,17 +643,123 @@ void ggml_gemv_q4_K_8x4_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
|
|
|
462
643
|
}
|
|
463
644
|
}
|
|
464
645
|
|
|
465
|
-
|
|
466
|
-
|
|
646
|
+
template <int M, int N>
|
|
647
|
+
static void ggml_gemm_q5_K_NxM_q8_K_generic_impl(int n,
|
|
648
|
+
float * GGML_RESTRICT s,
|
|
649
|
+
size_t bs,
|
|
650
|
+
const void * GGML_RESTRICT vx,
|
|
651
|
+
const void * GGML_RESTRICT vy,
|
|
652
|
+
int nr,
|
|
653
|
+
int nc) {
|
|
654
|
+
constexpr int blocklen = M;
|
|
655
|
+
constexpr int ncols_interleaved = N;
|
|
656
|
+
const int qk = QK_K;
|
|
657
|
+
const int nb = n / qk;
|
|
658
|
+
static const uint32_t kmask1 = 0x3f3f3f3f;
|
|
659
|
+
static const uint32_t kmask2 = 0x0f0f0f0f;
|
|
660
|
+
static const uint32_t kmask3 = 0x03030303;
|
|
661
|
+
|
|
662
|
+
assert(n % qk == 0);
|
|
663
|
+
assert(nr % 4 == 0);
|
|
664
|
+
assert(nc % ncols_interleaved == 0);
|
|
665
|
+
|
|
666
|
+
float sumf[4][ncols_interleaved];
|
|
667
|
+
float sum_minf[4][ncols_interleaved];
|
|
668
|
+
uint32_t utmp[32];
|
|
669
|
+
int sumi1;
|
|
670
|
+
int sumi2;
|
|
671
|
+
int sumi;
|
|
672
|
+
|
|
673
|
+
for (int y = 0; y < nr / 4; y++) {
|
|
674
|
+
const block_q8_Kx4 * a_ptr = (const block_q8_Kx4 *) vy + (y * nb);
|
|
675
|
+
for (int x = 0; x < nc / ncols_interleaved; x++) {
|
|
676
|
+
const block_q5_Kx8 * b_ptr = (const block_q5_Kx8 *) vx + (x * nb);
|
|
677
|
+
for (int m = 0; m < 4; m++) {
|
|
678
|
+
for (int j = 0; j < ncols_interleaved; j++) {
|
|
679
|
+
sumf[m][j] = 0.0;
|
|
680
|
+
sum_minf[m][j] = 0.0;
|
|
681
|
+
}
|
|
682
|
+
}
|
|
683
|
+
for (int l = 0; l < nb; l++) {
|
|
684
|
+
for (int sb = 0; sb < 8; sb++) {
|
|
685
|
+
memcpy(utmp + sb * 4, b_ptr[l].scales + sb * K_SCALE_SIZE, K_SCALE_SIZE);
|
|
686
|
+
utmp[sb * 4 + 3] = ((utmp[sb * 4 + 2] >> 4) & kmask2) | (((utmp[sb * 4 + 1] >> 6) & kmask3) << 4);
|
|
687
|
+
const uint32_t uaux_0 = utmp[sb * 4 + 1] & kmask1;
|
|
688
|
+
utmp[sb * 4 + 1] = (utmp[sb * 4 + 2] & kmask2) | (((utmp[sb * 4 + 0] >> 6) & kmask3) << 4);
|
|
689
|
+
utmp[sb * 4 + 2] = uaux_0;
|
|
690
|
+
utmp[sb * 4 + 0] &= kmask1;
|
|
691
|
+
}
|
|
692
|
+
for (int k = 0; k < (qk / (2 * blocklen)); k++) {
|
|
693
|
+
constexpr int scale_stride = 32;
|
|
694
|
+
uint8_t * scales_0 = (uint8_t *) utmp + (k / (32 / blocklen)) * scale_stride;
|
|
695
|
+
uint8_t * scales_1 = (uint8_t *) utmp + (k / (32 / blocklen)) * scale_stride + 16;
|
|
696
|
+
|
|
697
|
+
const int qh_shift = (k / (32 / blocklen)) * 2;
|
|
698
|
+
for (int m = 0; m < 4; m++) {
|
|
699
|
+
for (int j = 0; j < ncols_interleaved; j++) {
|
|
700
|
+
sumi1 = 0;
|
|
701
|
+
sumi2 = 0;
|
|
702
|
+
sumi = 0;
|
|
703
|
+
for (int i = 0; i < blocklen; ++i) {
|
|
704
|
+
const int b_qs_offset = k * ncols_interleaved * blocklen + j * blocklen + i;
|
|
705
|
+
|
|
706
|
+
const int qh_idx = (k * blocklen + i) % 32;
|
|
707
|
+
const int qh_chunk = qh_idx / blocklen;
|
|
708
|
+
const int qh_pos = qh_idx % blocklen;
|
|
709
|
+
const int b_qh_offset =
|
|
710
|
+
qh_chunk * (blocklen * ncols_interleaved) + j * blocklen + qh_pos;
|
|
711
|
+
|
|
712
|
+
const uint8_t qh_val = b_ptr[l].qh[b_qh_offset];
|
|
713
|
+
const uint8_t h0 = (qh_val >> qh_shift) & 1;
|
|
714
|
+
const uint8_t h1 = (qh_val >> (qh_shift + 1)) & 1;
|
|
715
|
+
|
|
716
|
+
const int v0 = (int8_t) ((b_ptr[l].qs[b_qs_offset] & 0xF) | (h0 << 4));
|
|
717
|
+
const int v1 = (int8_t) ((b_ptr[l].qs[b_qs_offset] >> 4) | (h1 << 4));
|
|
718
|
+
|
|
719
|
+
const int q8_offset = (k / (32 / blocklen)) * 256 +
|
|
720
|
+
(k % (32 / blocklen)) * 4 * blocklen + m * blocklen + i;
|
|
721
|
+
|
|
722
|
+
sumi1 = (v0 * a_ptr[l].qs[q8_offset]);
|
|
723
|
+
sumi2 = (v1 * a_ptr[l].qs[q8_offset + 128]);
|
|
724
|
+
sumi1 = sumi1 * scales_0[j];
|
|
725
|
+
sumi2 = sumi2 * scales_1[j];
|
|
726
|
+
sumi += sumi1 + sumi2;
|
|
727
|
+
}
|
|
728
|
+
sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d[m];
|
|
729
|
+
}
|
|
730
|
+
}
|
|
731
|
+
}
|
|
732
|
+
for (int sb = 0; sb < 8; sb++) {
|
|
733
|
+
uint8_t * mins = (uint8_t *) utmp + 8 + sb * 16;
|
|
734
|
+
for (int m = 0; m < 4; m++) {
|
|
735
|
+
const int16_t * bsums = a_ptr[l].bsums + (sb * 8) + (m * 4) - ((sb % 2) * 6);
|
|
736
|
+
for (int j = 0; j < ncols_interleaved; j++) {
|
|
737
|
+
sum_minf[m][j] += mins[j] * (bsums[0] + bsums[1]) *
|
|
738
|
+
GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d[m];
|
|
739
|
+
}
|
|
740
|
+
}
|
|
741
|
+
}
|
|
742
|
+
}
|
|
743
|
+
for (int m = 0; m < 4; m++) {
|
|
744
|
+
for (int j = 0; j < ncols_interleaved; j++) {
|
|
745
|
+
s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j] - sum_minf[m][j];
|
|
746
|
+
}
|
|
747
|
+
}
|
|
748
|
+
}
|
|
749
|
+
}
|
|
750
|
+
}
|
|
751
|
+
|
|
752
|
+
extern "C" {
|
|
753
|
+
|
|
754
|
+
void ggml_gemv_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
|
755
|
+
const int qk = QK8_0;
|
|
467
756
|
const int nb = n / qk;
|
|
468
|
-
const int ncols_interleaved =
|
|
469
|
-
const int blocklen =
|
|
470
|
-
static const uint32_t kmask1 = 0x3f3f3f3f;
|
|
471
|
-
static const uint32_t kmask2 = 0x0f0f0f0f;
|
|
472
|
-
static const uint32_t kmask3 = 0x03030303;
|
|
757
|
+
const int ncols_interleaved = 4;
|
|
758
|
+
const int blocklen = 4;
|
|
473
759
|
|
|
474
|
-
assert
|
|
475
|
-
assert
|
|
760
|
+
assert(nr == 1);
|
|
761
|
+
assert(n % qk == 0);
|
|
762
|
+
assert(nc % ncols_interleaved == 0);
|
|
476
763
|
|
|
477
764
|
UNUSED(s);
|
|
478
765
|
UNUSED(bs);
|
|
@@ -484,66 +771,35 @@ void ggml_gemv_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
|
|
|
484
771
|
UNUSED(ncols_interleaved);
|
|
485
772
|
UNUSED(blocklen);
|
|
486
773
|
|
|
487
|
-
float sumf[
|
|
488
|
-
float sum_minf[8];
|
|
489
|
-
uint32_t utmp[32];
|
|
490
|
-
int sumi1;
|
|
491
|
-
int sumi2;
|
|
774
|
+
float sumf[4];
|
|
492
775
|
int sumi;
|
|
493
776
|
|
|
494
|
-
const
|
|
777
|
+
const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
|
|
495
778
|
for (int x = 0; x < nc / ncols_interleaved; x++) {
|
|
496
|
-
const
|
|
779
|
+
const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb);
|
|
497
780
|
|
|
498
|
-
for (int j = 0; j < ncols_interleaved; j++)
|
|
499
|
-
sumf[j] = 0.0;
|
|
500
|
-
sum_minf[j] = 0.0;
|
|
501
|
-
}
|
|
781
|
+
for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
|
|
502
782
|
for (int l = 0; l < nb; l++) {
|
|
503
|
-
for (int sb = 0; sb < 8; sb++) {
|
|
504
|
-
memcpy(utmp + sb * 4, b_ptr[l].scales + sb * 12, 12);
|
|
505
|
-
utmp[sb * 4 + 3] = ((utmp[sb * 4 + 2] >> 4) & kmask2) | (((utmp[sb * 4 + 1] >> 6) & kmask3) << 4);
|
|
506
|
-
const uint32_t uaux_0 = utmp[sb * 4 + 1] & kmask1;
|
|
507
|
-
utmp[sb * 4 + 1] = (utmp[sb * 4 + 2] & kmask2) | (((utmp[sb * 4 + 0] >> 6) & kmask3) << 4);
|
|
508
|
-
utmp[sb * 4 + 2] = uaux_0;
|
|
509
|
-
utmp[sb * 4 + 0] &= kmask1;
|
|
510
|
-
}
|
|
511
783
|
for (int k = 0; k < (qk / (2 * blocklen)); k++) {
|
|
512
|
-
uint8_t *scales_0 = (uint8_t*) utmp + (k / 4) * 32;
|
|
513
|
-
uint8_t *scales_1 = (uint8_t*) utmp + (k / 4) * 32 + 16;
|
|
514
784
|
for (int j = 0; j < ncols_interleaved; j++) {
|
|
515
|
-
sumi1 = 0;
|
|
516
|
-
sumi2 = 0;
|
|
517
785
|
sumi = 0;
|
|
518
786
|
for (int i = 0; i < blocklen; ++i) {
|
|
519
|
-
const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i]
|
|
520
|
-
const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i]
|
|
521
|
-
|
|
522
|
-
sumi2 = (v1 * a_ptr[l].qs[(k >> 2) * 64 + (k % 4) * blocklen + i + 32]);
|
|
523
|
-
sumi1 = sumi1 * scales_0[j];
|
|
524
|
-
sumi2 = sumi2 * scales_1[j];
|
|
525
|
-
sumi += sumi1 + sumi2;
|
|
787
|
+
const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
|
|
788
|
+
const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
|
|
789
|
+
sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
|
|
526
790
|
}
|
|
527
|
-
sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d;
|
|
528
|
-
}
|
|
529
|
-
}
|
|
530
|
-
for (int sb = 0; sb < 8; sb++) {
|
|
531
|
-
uint8_t *mins = (uint8_t*) utmp + 8 + sb * 16;
|
|
532
|
-
for (int j = 0; j < ncols_interleaved; j++) {
|
|
533
|
-
sum_minf[j] += mins[j] * (a_ptr[l].bsums[sb * 2] + a_ptr[l].bsums[sb * 2 + 1]) * GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d;
|
|
791
|
+
sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
|
|
534
792
|
}
|
|
535
793
|
}
|
|
536
794
|
}
|
|
537
|
-
for (int j = 0; j < ncols_interleaved; j++)
|
|
538
|
-
s[x * ncols_interleaved + j] = sumf[j] - sum_minf[j];
|
|
539
|
-
}
|
|
795
|
+
for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
|
|
540
796
|
}
|
|
541
797
|
}
|
|
542
798
|
|
|
543
|
-
void
|
|
544
|
-
const int qk =
|
|
799
|
+
void ggml_gemv_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
|
800
|
+
const int qk = QK8_0;
|
|
545
801
|
const int nb = n / qk;
|
|
546
|
-
const int ncols_interleaved =
|
|
802
|
+
const int ncols_interleaved = 4;
|
|
547
803
|
const int blocklen = 8;
|
|
548
804
|
|
|
549
805
|
assert (n % qk == 0);
|
|
@@ -559,82 +815,56 @@ void ggml_gemv_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
|
|
|
559
815
|
UNUSED(ncols_interleaved);
|
|
560
816
|
UNUSED(blocklen);
|
|
561
817
|
|
|
562
|
-
float sumf[
|
|
563
|
-
float sum_minf[8];
|
|
564
|
-
int sumi1,sumi2,sumi3,sumi4;
|
|
818
|
+
float sumf[4];
|
|
565
819
|
int sumi;
|
|
566
820
|
|
|
567
|
-
const
|
|
568
|
-
for(int x = 0; x < nc / ncols_interleaved; x++) {
|
|
569
|
-
const
|
|
570
|
-
|
|
571
|
-
|
|
572
|
-
sum_minf[j] = 0.0;
|
|
573
|
-
}
|
|
821
|
+
const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
|
|
822
|
+
for (int x = 0; x < nc / ncols_interleaved; x++) {
|
|
823
|
+
const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb);
|
|
824
|
+
|
|
825
|
+
for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
|
|
574
826
|
for (int l = 0; l < nb; l++) {
|
|
575
|
-
for (int k = 0; k < (qk / (
|
|
576
|
-
const uint8_t *scales_0 = b_ptr[l].scales + (k / 4) * 64 ;
|
|
577
|
-
const uint8_t *scales_1 = b_ptr[l].scales + (k / 4) * 64 + 16;
|
|
578
|
-
const uint8_t *scales_2 = b_ptr[l].scales + (k / 4) * 64 + 32;
|
|
579
|
-
const uint8_t *scales_3 = b_ptr[l].scales + (k / 4) * 64 + 48;
|
|
827
|
+
for (int k = 0; k < (qk / (2 * blocklen)); k++) {
|
|
580
828
|
for (int j = 0; j < ncols_interleaved; j++) {
|
|
581
|
-
sumi1 = 0;
|
|
582
|
-
sumi2 = 0;
|
|
583
|
-
sumi3 = 0;
|
|
584
|
-
sumi4 = 0;
|
|
585
829
|
sumi = 0;
|
|
586
|
-
int
|
|
587
|
-
|
|
588
|
-
const int
|
|
589
|
-
|
|
590
|
-
const int v2 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4 ) & 3);
|
|
591
|
-
const int v3 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 6 ) & 3);
|
|
592
|
-
sumi1 = (v0 * a_ptr[l].qs[(k >> 2) * 128 + (k % 4) * blocklen + i]);
|
|
593
|
-
sumi2 = (v1 * a_ptr[l].qs[(k >> 2) * 128 + (k % 4) * blocklen + i + 32]);
|
|
594
|
-
sumi3 = (v2 * a_ptr[l].qs[(k >> 2) * 128 + (k % 4) * blocklen + i + 64]);
|
|
595
|
-
sumi4 = (v3 * a_ptr[l].qs[(k >> 2) * 128 + (k % 4) * blocklen + i + 96]);
|
|
596
|
-
|
|
597
|
-
sumi1 = sumi1 * (scales_0[offset] & 0xF);
|
|
598
|
-
sumi2 = sumi2 * (scales_1[offset] & 0xF);
|
|
599
|
-
sumi3 = sumi3 * (scales_2[offset] & 0xF);
|
|
600
|
-
sumi4 = sumi4 * (scales_3[offset] & 0xF);
|
|
601
|
-
sumi += sumi1 + sumi2 + sumi3 + sumi4;
|
|
830
|
+
for (int i = 0; i < blocklen; ++i) {
|
|
831
|
+
const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
|
|
832
|
+
const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
|
|
833
|
+
sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
|
|
602
834
|
}
|
|
603
|
-
sumf[j] += sumi *
|
|
604
|
-
}
|
|
605
|
-
}
|
|
606
|
-
for(int sb = 0; sb < 8; sb++) {
|
|
607
|
-
const uint8_t *mins = b_ptr[l].scales + sb * 16;
|
|
608
|
-
for(int j = 0; j < ncols_interleaved; j++){
|
|
609
|
-
sum_minf[j] += ((mins[j * 2] >> 4) * a_ptr[l].bsums[sb * 2] + (mins[(j * 2)+ 1] >> 4) * a_ptr[l].bsums[sb * 2 + 1]) * GGML_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d;
|
|
835
|
+
sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
|
|
610
836
|
}
|
|
611
837
|
}
|
|
612
838
|
}
|
|
613
|
-
for (int j = 0; j < ncols_interleaved; j++)
|
|
614
|
-
s[x * ncols_interleaved + j] = sumf[j] - sum_minf[j];
|
|
615
|
-
}
|
|
839
|
+
for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
|
|
616
840
|
}
|
|
617
841
|
}
|
|
618
842
|
|
|
619
|
-
void
|
|
843
|
+
void ggml_gemv_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
|
620
844
|
const int qk = QK8_0;
|
|
621
845
|
const int nb = n / qk;
|
|
622
|
-
const int ncols_interleaved =
|
|
623
|
-
const int blocklen =
|
|
846
|
+
const int ncols_interleaved = 8;
|
|
847
|
+
const int blocklen = 8;
|
|
624
848
|
|
|
625
|
-
assert(
|
|
626
|
-
assert(
|
|
627
|
-
assert(nc % ncols_interleaved == 0);
|
|
849
|
+
assert (n % qk == 0);
|
|
850
|
+
assert (nc % ncols_interleaved == 0);
|
|
628
851
|
|
|
852
|
+
UNUSED(s);
|
|
629
853
|
UNUSED(bs);
|
|
854
|
+
UNUSED(vx);
|
|
855
|
+
UNUSED(vy);
|
|
630
856
|
UNUSED(nr);
|
|
857
|
+
UNUSED(nc);
|
|
858
|
+
UNUSED(nb);
|
|
859
|
+
UNUSED(ncols_interleaved);
|
|
860
|
+
UNUSED(blocklen);
|
|
631
861
|
|
|
632
|
-
float sumf[
|
|
862
|
+
float sumf[8];
|
|
633
863
|
int sumi;
|
|
634
864
|
|
|
635
865
|
const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
|
|
636
866
|
for (int x = 0; x < nc / ncols_interleaved; x++) {
|
|
637
|
-
const
|
|
867
|
+
const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb);
|
|
638
868
|
|
|
639
869
|
for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
|
|
640
870
|
for (int l = 0; l < nb; l++) {
|
|
@@ -642,9 +872,9 @@ void ggml_gemv_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs
|
|
|
642
872
|
for (int j = 0; j < ncols_interleaved; j++) {
|
|
643
873
|
sumi = 0;
|
|
644
874
|
for (int i = 0; i < blocklen; ++i) {
|
|
645
|
-
const int v0 =
|
|
646
|
-
const int v1 =
|
|
647
|
-
sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2]));
|
|
875
|
+
const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
|
|
876
|
+
const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
|
|
877
|
+
sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
|
|
648
878
|
}
|
|
649
879
|
sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
|
|
650
880
|
}
|
|
@@ -654,139 +884,1212 @@ void ggml_gemv_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs
|
|
|
654
884
|
}
|
|
655
885
|
}
|
|
656
886
|
|
|
657
|
-
void
|
|
658
|
-
const int qk =
|
|
887
|
+
void ggml_gemv_q4_K_8x4_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
|
888
|
+
const int qk = QK_K;
|
|
659
889
|
const int nb = n / qk;
|
|
660
890
|
const int ncols_interleaved = 8;
|
|
661
|
-
const int blocklen =
|
|
891
|
+
const int blocklen = 4;
|
|
892
|
+
static const uint32_t kmask1 = 0x3f3f3f3f;
|
|
893
|
+
static const uint32_t kmask2 = 0x0f0f0f0f;
|
|
894
|
+
static const uint32_t kmask3 = 0x03030303;
|
|
662
895
|
|
|
663
|
-
assert(
|
|
664
|
-
assert(
|
|
665
|
-
assert(nc % ncols_interleaved == 0);
|
|
896
|
+
assert (n % qk == 0);
|
|
897
|
+
assert (nc % ncols_interleaved == 0);
|
|
666
898
|
|
|
667
899
|
UNUSED(bs);
|
|
668
900
|
UNUSED(nr);
|
|
669
901
|
|
|
670
902
|
float sumf[8];
|
|
903
|
+
float sum_minf[8];
|
|
904
|
+
uint32_t utmp[32];
|
|
905
|
+
int sumi1;
|
|
906
|
+
int sumi2;
|
|
671
907
|
int sumi;
|
|
672
908
|
|
|
673
|
-
const
|
|
909
|
+
const block_q8_K * a_ptr = (const block_q8_K *) vy;
|
|
674
910
|
for (int x = 0; x < nc / ncols_interleaved; x++) {
|
|
675
|
-
const
|
|
911
|
+
const block_q4_Kx8 * b_ptr = (const block_q4_Kx8 *) vx + (x * nb);
|
|
676
912
|
|
|
677
|
-
for (int j = 0; j < ncols_interleaved; j++)
|
|
913
|
+
for (int j = 0; j < ncols_interleaved; j++) {
|
|
914
|
+
sumf[j] = 0.0;
|
|
915
|
+
sum_minf[j] = 0.0;
|
|
916
|
+
}
|
|
678
917
|
for (int l = 0; l < nb; l++) {
|
|
918
|
+
for (int sb = 0; sb < 8; sb++) {
|
|
919
|
+
memcpy(utmp + sb * 4, b_ptr[l].scales + sb * 12, 12);
|
|
920
|
+
utmp[sb * 4 + 3] = ((utmp[sb * 4 + 2] >> 4) & kmask2) | (((utmp[sb * 4 + 1] >> 6) & kmask3) << 4);
|
|
921
|
+
const uint32_t uaux_0 = utmp[sb * 4 + 1] & kmask1;
|
|
922
|
+
utmp[sb * 4 + 1] = (utmp[sb * 4 + 2] & kmask2) | (((utmp[sb * 4 + 0] >> 6) & kmask3) << 4);
|
|
923
|
+
utmp[sb * 4 + 2] = uaux_0;
|
|
924
|
+
utmp[sb * 4 + 0] &= kmask1;
|
|
925
|
+
}
|
|
679
926
|
for (int k = 0; k < (qk / (2 * blocklen)); k++) {
|
|
927
|
+
uint8_t * scales_0 = (uint8_t *) utmp + (k / 8) * 32;
|
|
928
|
+
uint8_t * scales_1 = (uint8_t *) utmp + (k / 8) * 32 + 16;
|
|
680
929
|
for (int j = 0; j < ncols_interleaved; j++) {
|
|
681
|
-
|
|
930
|
+
sumi1 = 0;
|
|
931
|
+
sumi2 = 0;
|
|
932
|
+
sumi = 0;
|
|
933
|
+
for (int i = 0; i < blocklen; ++i) {
|
|
934
|
+
const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF);
|
|
935
|
+
const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4);
|
|
936
|
+
sumi1 = (v0 * a_ptr[l].qs[(k / 8) * 64 + (k % 8) * blocklen + i]);
|
|
937
|
+
sumi2 = (v1 * a_ptr[l].qs[(k / 8) * 64 + (k % 8) * blocklen + i + 32]);
|
|
938
|
+
sumi1 = sumi1 * scales_0[j];
|
|
939
|
+
sumi2 = sumi2 * scales_1[j];
|
|
940
|
+
sumi += sumi1 + sumi2;
|
|
941
|
+
}
|
|
942
|
+
sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d;
|
|
943
|
+
}
|
|
944
|
+
}
|
|
945
|
+
for (int sb = 0; sb < 8; sb++) {
|
|
946
|
+
uint8_t * mins = (uint8_t *) utmp + 8 + sb * 16;
|
|
947
|
+
for (int j = 0; j < ncols_interleaved; j++) {
|
|
948
|
+
sum_minf[j] += mins[j] * (a_ptr[l].bsums[sb * 2] + a_ptr[l].bsums[sb * 2 + 1]) * GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d;
|
|
949
|
+
}
|
|
950
|
+
}
|
|
951
|
+
}
|
|
952
|
+
for (int j = 0; j < ncols_interleaved; j++) {
|
|
953
|
+
s[x * ncols_interleaved + j] = sumf[j] - sum_minf[j];
|
|
954
|
+
}
|
|
955
|
+
}
|
|
956
|
+
}
|
|
957
|
+
|
|
958
|
+
void ggml_gemv_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
|
959
|
+
const int qk = QK_K;
|
|
960
|
+
const int nb = n / qk;
|
|
961
|
+
const int ncols_interleaved = 8;
|
|
962
|
+
const int blocklen = 8;
|
|
963
|
+
static const uint32_t kmask1 = 0x3f3f3f3f;
|
|
964
|
+
static const uint32_t kmask2 = 0x0f0f0f0f;
|
|
965
|
+
static const uint32_t kmask3 = 0x03030303;
|
|
966
|
+
|
|
967
|
+
assert (n % qk == 0);
|
|
968
|
+
assert (nc % ncols_interleaved == 0);
|
|
969
|
+
|
|
970
|
+
UNUSED(bs);
|
|
971
|
+
UNUSED(nr);
|
|
972
|
+
|
|
973
|
+
float sumf[8];
|
|
974
|
+
float sum_minf[8];
|
|
975
|
+
uint32_t utmp[32];
|
|
976
|
+
int sumi1;
|
|
977
|
+
int sumi2;
|
|
978
|
+
int sumi;
|
|
979
|
+
|
|
980
|
+
const block_q8_K * a_ptr = (const block_q8_K *) vy;
|
|
981
|
+
for (int x = 0; x < nc / ncols_interleaved; x++) {
|
|
982
|
+
const block_q4_Kx8 * b_ptr = (const block_q4_Kx8 *) vx + (x * nb);
|
|
983
|
+
|
|
984
|
+
for (int j = 0; j < ncols_interleaved; j++) {
|
|
985
|
+
sumf[j] = 0.0;
|
|
986
|
+
sum_minf[j] = 0.0;
|
|
987
|
+
}
|
|
988
|
+
for (int l = 0; l < nb; l++) {
|
|
989
|
+
for (int sb = 0; sb < 8; sb++) {
|
|
990
|
+
memcpy(utmp + sb * 4, b_ptr[l].scales + sb * 12, 12);
|
|
991
|
+
utmp[sb * 4 + 3] = ((utmp[sb * 4 + 2] >> 4) & kmask2) | (((utmp[sb * 4 + 1] >> 6) & kmask3) << 4);
|
|
992
|
+
const uint32_t uaux_0 = utmp[sb * 4 + 1] & kmask1;
|
|
993
|
+
utmp[sb * 4 + 1] = (utmp[sb * 4 + 2] & kmask2) | (((utmp[sb * 4 + 0] >> 6) & kmask3) << 4);
|
|
994
|
+
utmp[sb * 4 + 2] = uaux_0;
|
|
995
|
+
utmp[sb * 4 + 0] &= kmask1;
|
|
996
|
+
}
|
|
997
|
+
for (int k = 0; k < (qk / (2 * blocklen)); k++) {
|
|
998
|
+
uint8_t *scales_0 = (uint8_t*) utmp + (k / 4) * 32;
|
|
999
|
+
uint8_t *scales_1 = (uint8_t*) utmp + (k / 4) * 32 + 16;
|
|
1000
|
+
for (int j = 0; j < ncols_interleaved; j++) {
|
|
1001
|
+
sumi1 = 0;
|
|
1002
|
+
sumi2 = 0;
|
|
1003
|
+
sumi = 0;
|
|
1004
|
+
for (int i = 0; i < blocklen; ++i) {
|
|
1005
|
+
const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF);
|
|
1006
|
+
const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4);
|
|
1007
|
+
sumi1 = (v0 * a_ptr[l].qs[(k >> 2) * 64 + (k % 4) * blocklen + i]);
|
|
1008
|
+
sumi2 = (v1 * a_ptr[l].qs[(k >> 2) * 64 + (k % 4) * blocklen + i + 32]);
|
|
1009
|
+
sumi1 = sumi1 * scales_0[j];
|
|
1010
|
+
sumi2 = sumi2 * scales_1[j];
|
|
1011
|
+
sumi += sumi1 + sumi2;
|
|
1012
|
+
}
|
|
1013
|
+
sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d;
|
|
1014
|
+
}
|
|
1015
|
+
}
|
|
1016
|
+
for (int sb = 0; sb < 8; sb++) {
|
|
1017
|
+
uint8_t *mins = (uint8_t*) utmp + 8 + sb * 16;
|
|
1018
|
+
for (int j = 0; j < ncols_interleaved; j++) {
|
|
1019
|
+
sum_minf[j] += mins[j] * (a_ptr[l].bsums[sb * 2] + a_ptr[l].bsums[sb * 2 + 1]) * GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d;
|
|
1020
|
+
}
|
|
1021
|
+
}
|
|
1022
|
+
}
|
|
1023
|
+
for (int j = 0; j < ncols_interleaved; j++) {
|
|
1024
|
+
s[x * ncols_interleaved + j] = sumf[j] - sum_minf[j];
|
|
1025
|
+
}
|
|
1026
|
+
}
|
|
1027
|
+
}
|
|
1028
|
+
|
|
1029
|
+
void ggml_gemv_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
|
1030
|
+
const int qk = QK_K;
|
|
1031
|
+
const int nb = n / qk;
|
|
1032
|
+
const int ncols_interleaved = 8;
|
|
1033
|
+
const int blocklen = 8;
|
|
1034
|
+
|
|
1035
|
+
assert (n % qk == 0);
|
|
1036
|
+
assert (nc % ncols_interleaved == 0);
|
|
1037
|
+
|
|
1038
|
+
UNUSED(s);
|
|
1039
|
+
UNUSED(bs);
|
|
1040
|
+
UNUSED(vx);
|
|
1041
|
+
UNUSED(vy);
|
|
1042
|
+
UNUSED(nr);
|
|
1043
|
+
UNUSED(nc);
|
|
1044
|
+
UNUSED(nb);
|
|
1045
|
+
UNUSED(ncols_interleaved);
|
|
1046
|
+
UNUSED(blocklen);
|
|
1047
|
+
|
|
1048
|
+
float sumf[8];
|
|
1049
|
+
float sum_minf[8];
|
|
1050
|
+
int sumi1,sumi2,sumi3,sumi4;
|
|
1051
|
+
int sumi;
|
|
1052
|
+
|
|
1053
|
+
const block_q8_K * a_ptr = (const block_q8_K *)vy;
|
|
1054
|
+
for(int x = 0; x < nc / ncols_interleaved; x++) {
|
|
1055
|
+
const block_q2_Kx8 * b_ptr = (const block_q2_Kx8 *) vx + (x * nb);
|
|
1056
|
+
for (int j = 0; j < ncols_interleaved; j++) {
|
|
1057
|
+
sumf[j] = 0.0;
|
|
1058
|
+
sum_minf[j] = 0.0;
|
|
1059
|
+
}
|
|
1060
|
+
for (int l = 0; l < nb; l++) {
|
|
1061
|
+
for (int k = 0; k < (qk / (4 * blocklen)); k++) {
|
|
1062
|
+
const uint8_t *scales_0 = b_ptr[l].scales + (k / 4) * 64 ;
|
|
1063
|
+
const uint8_t *scales_1 = b_ptr[l].scales + (k / 4) * 64 + 16;
|
|
1064
|
+
const uint8_t *scales_2 = b_ptr[l].scales + (k / 4) * 64 + 32;
|
|
1065
|
+
const uint8_t *scales_3 = b_ptr[l].scales + (k / 4) * 64 + 48;
|
|
1066
|
+
for (int j = 0; j < ncols_interleaved; j++) {
|
|
1067
|
+
sumi1 = 0;
|
|
1068
|
+
sumi2 = 0;
|
|
1069
|
+
sumi3 = 0;
|
|
1070
|
+
sumi4 = 0;
|
|
1071
|
+
sumi = 0;
|
|
1072
|
+
int offset = ((k / 2) % 2) + j * 2;
|
|
1073
|
+
for (int i = 0; i < blocklen; ++i){
|
|
1074
|
+
const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 3);
|
|
1075
|
+
const int v1 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 2 ) & 3);
|
|
1076
|
+
const int v2 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4 ) & 3);
|
|
1077
|
+
const int v3 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 6 ) & 3);
|
|
1078
|
+
sumi1 = (v0 * a_ptr[l].qs[(k >> 2) * 128 + (k % 4) * blocklen + i]);
|
|
1079
|
+
sumi2 = (v1 * a_ptr[l].qs[(k >> 2) * 128 + (k % 4) * blocklen + i + 32]);
|
|
1080
|
+
sumi3 = (v2 * a_ptr[l].qs[(k >> 2) * 128 + (k % 4) * blocklen + i + 64]);
|
|
1081
|
+
sumi4 = (v3 * a_ptr[l].qs[(k >> 2) * 128 + (k % 4) * blocklen + i + 96]);
|
|
1082
|
+
|
|
1083
|
+
sumi1 = sumi1 * (scales_0[offset] & 0xF);
|
|
1084
|
+
sumi2 = sumi2 * (scales_1[offset] & 0xF);
|
|
1085
|
+
sumi3 = sumi3 * (scales_2[offset] & 0xF);
|
|
1086
|
+
sumi4 = sumi4 * (scales_3[offset] & 0xF);
|
|
1087
|
+
sumi += sumi1 + sumi2 + sumi3 + sumi4;
|
|
1088
|
+
}
|
|
1089
|
+
sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d;
|
|
1090
|
+
}
|
|
1091
|
+
}
|
|
1092
|
+
for(int sb = 0; sb < 8; sb++) {
|
|
1093
|
+
const uint8_t *mins = b_ptr[l].scales + sb * 16;
|
|
1094
|
+
for(int j = 0; j < ncols_interleaved; j++){
|
|
1095
|
+
sum_minf[j] += ((mins[j * 2] >> 4) * a_ptr[l].bsums[sb * 2] + (mins[(j * 2)+ 1] >> 4) * a_ptr[l].bsums[sb * 2 + 1]) * GGML_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d;
|
|
1096
|
+
}
|
|
1097
|
+
}
|
|
1098
|
+
}
|
|
1099
|
+
for (int j = 0; j < ncols_interleaved; j++) {
|
|
1100
|
+
s[x * ncols_interleaved + j] = sumf[j] - sum_minf[j];
|
|
1101
|
+
}
|
|
1102
|
+
}
|
|
1103
|
+
}
|
|
1104
|
+
|
|
1105
|
+
void ggml_gemv_q5_K_8x4_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
|
1106
|
+
ggml_gemv_q5_K_NxM_q8_K_generic_impl<4, 8>(n, s, bs, vx, vy, nr, nc);
|
|
1107
|
+
}
|
|
1108
|
+
|
|
1109
|
+
void ggml_gemv_q5_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
|
1110
|
+
ggml_gemv_q5_K_NxM_q8_K_generic_impl<8, 8>(n, s, bs, vx, vy, nr, nc);
|
|
1111
|
+
}
|
|
1112
|
+
|
|
1113
|
+
|
|
1114
|
+
void ggml_gemv_q6_K_8x4_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
|
1115
|
+
ggml_gemv_q6_K_NxM_q8_K_generic_impl<4, 8>(n, s, bs, vx, vy, nr, nc);
|
|
1116
|
+
}
|
|
1117
|
+
|
|
1118
|
+
void ggml_gemv_q6_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
|
1119
|
+
ggml_gemv_q6_K_NxM_q8_K_generic_impl<8, 8>(n, s, bs, vx, vy, nr, nc);
|
|
1120
|
+
}
|
|
1121
|
+
|
|
1122
|
+
void ggml_gemv_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
|
1123
|
+
const int qk = QK8_0;
|
|
1124
|
+
const int nb = n / qk;
|
|
1125
|
+
const int ncols_interleaved = 4;
|
|
1126
|
+
const int blocklen = 4;
|
|
1127
|
+
|
|
1128
|
+
assert(nr == 1);
|
|
1129
|
+
assert(n % qk == 0);
|
|
1130
|
+
assert(nc % ncols_interleaved == 0);
|
|
1131
|
+
|
|
1132
|
+
UNUSED(bs);
|
|
1133
|
+
UNUSED(nr);
|
|
1134
|
+
|
|
1135
|
+
float sumf[4];
|
|
1136
|
+
int sumi;
|
|
1137
|
+
|
|
1138
|
+
const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
|
|
1139
|
+
for (int x = 0; x < nc / ncols_interleaved; x++) {
|
|
1140
|
+
const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb);
|
|
1141
|
+
|
|
1142
|
+
for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
|
|
1143
|
+
for (int l = 0; l < nb; l++) {
|
|
1144
|
+
for (int k = 0; k < (qk / (2 * blocklen)); k++) {
|
|
1145
|
+
for (int j = 0; j < ncols_interleaved; j++) {
|
|
1146
|
+
sumi = 0;
|
|
1147
|
+
for (int i = 0; i < blocklen; ++i) {
|
|
1148
|
+
const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
|
|
1149
|
+
const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
|
|
1150
|
+
sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2]));
|
|
1151
|
+
}
|
|
1152
|
+
sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
|
|
1153
|
+
}
|
|
1154
|
+
}
|
|
1155
|
+
}
|
|
1156
|
+
for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
|
|
1157
|
+
}
|
|
1158
|
+
}
|
|
1159
|
+
|
|
1160
|
+
void ggml_gemv_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
|
1161
|
+
const int qk = QK8_0;
|
|
1162
|
+
const int nb = n / qk;
|
|
1163
|
+
const int ncols_interleaved = 8;
|
|
1164
|
+
const int blocklen = 8;
|
|
1165
|
+
|
|
1166
|
+
assert(nr == 1);
|
|
1167
|
+
assert(n % qk == 0);
|
|
1168
|
+
assert(nc % ncols_interleaved == 0);
|
|
1169
|
+
|
|
1170
|
+
UNUSED(bs);
|
|
1171
|
+
UNUSED(nr);
|
|
1172
|
+
|
|
1173
|
+
float sumf[8];
|
|
1174
|
+
int sumi;
|
|
1175
|
+
|
|
1176
|
+
const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
|
|
1177
|
+
for (int x = 0; x < nc / ncols_interleaved; x++) {
|
|
1178
|
+
const block_iq4_nlx8 * b_ptr = (const block_iq4_nlx8 *) vx + (x * nb);
|
|
1179
|
+
|
|
1180
|
+
for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
|
|
1181
|
+
for (int l = 0; l < nb; l++) {
|
|
1182
|
+
for (int k = 0; k < (qk / (2 * blocklen)); k++) {
|
|
1183
|
+
for (int j = 0; j < ncols_interleaved; j++) {
|
|
1184
|
+
sumi = 0;
|
|
682
1185
|
for (int i = 0; i < blocklen; ++i) {
|
|
683
1186
|
const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
|
|
684
1187
|
const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
|
|
685
1188
|
sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2]));
|
|
686
1189
|
}
|
|
687
|
-
sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
|
|
1190
|
+
sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
|
|
1191
|
+
}
|
|
1192
|
+
}
|
|
1193
|
+
}
|
|
1194
|
+
for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
|
|
1195
|
+
}
|
|
1196
|
+
}
|
|
1197
|
+
|
|
1198
|
+
void ggml_gemv_mxfp4_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
|
1199
|
+
const int qk = QK8_0;
|
|
1200
|
+
const int nb = n / qk;
|
|
1201
|
+
const int ncols_interleaved = 4;
|
|
1202
|
+
const int blocklen = 4;
|
|
1203
|
+
|
|
1204
|
+
assert(nr == 1);
|
|
1205
|
+
assert(n % qk == 0);
|
|
1206
|
+
assert(nc % ncols_interleaved == 0);
|
|
1207
|
+
|
|
1208
|
+
UNUSED(bs);
|
|
1209
|
+
UNUSED(nr);
|
|
1210
|
+
|
|
1211
|
+
float sumf[4];
|
|
1212
|
+
int sumi;
|
|
1213
|
+
|
|
1214
|
+
const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
|
|
1215
|
+
for (int x = 0; x < nc / ncols_interleaved; x++) {
|
|
1216
|
+
const block_mxfp4x4 * b_ptr = (const block_mxfp4x4 *) vx + (x * nb);
|
|
1217
|
+
|
|
1218
|
+
for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
|
|
1219
|
+
for (int l = 0; l < nb; l++) {
|
|
1220
|
+
for (int k = 0; k < (qk / (2 * blocklen)); k++) {
|
|
1221
|
+
for (int j = 0; j < ncols_interleaved; j++) {
|
|
1222
|
+
sumi = 0;
|
|
1223
|
+
for (int i = 0; i < blocklen; ++i) {
|
|
1224
|
+
const int v0 = kvalues_mxfp4[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
|
|
1225
|
+
const int v1 = kvalues_mxfp4[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
|
|
1226
|
+
sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2]));
|
|
1227
|
+
}
|
|
1228
|
+
sumf[j] += sumi * GGML_CPU_E8M0_TO_FP32_HALF(b_ptr[l].e[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
|
|
1229
|
+
}
|
|
1230
|
+
}
|
|
1231
|
+
}
|
|
1232
|
+
for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
|
|
1233
|
+
}
|
|
1234
|
+
}
|
|
1235
|
+
|
|
1236
|
+
void ggml_gemv_mxfp4_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
|
1237
|
+
const int qk = QK8_0;
|
|
1238
|
+
const int nb = n / qk;
|
|
1239
|
+
const int ncols_interleaved = 8;
|
|
1240
|
+
const int blocklen = 8;
|
|
1241
|
+
|
|
1242
|
+
assert(nr == 1);
|
|
1243
|
+
assert(n % qk == 0);
|
|
1244
|
+
assert(nc % ncols_interleaved == 0);
|
|
1245
|
+
|
|
1246
|
+
UNUSED(bs);
|
|
1247
|
+
UNUSED(nr);
|
|
1248
|
+
|
|
1249
|
+
float sumf[8];
|
|
1250
|
+
int sumi;
|
|
1251
|
+
|
|
1252
|
+
const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
|
|
1253
|
+
for (int x = 0; x < nc / ncols_interleaved; x++) {
|
|
1254
|
+
const block_mxfp4x8 * b_ptr = (const block_mxfp4x8 *) vx + (x * nb);
|
|
1255
|
+
|
|
1256
|
+
for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
|
|
1257
|
+
for (int l = 0; l < nb; l++) {
|
|
1258
|
+
for (int k = 0; k < (qk / (2 * blocklen)); k++) {
|
|
1259
|
+
for (int j = 0; j < ncols_interleaved; j++) {
|
|
1260
|
+
sumi = 0;
|
|
1261
|
+
for (int i = 0; i < blocklen; ++i) {
|
|
1262
|
+
const int v0 = kvalues_mxfp4[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
|
|
1263
|
+
const int v1 = kvalues_mxfp4[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
|
|
1264
|
+
sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2]));
|
|
1265
|
+
}
|
|
1266
|
+
sumf[j] += sumi * GGML_CPU_E8M0_TO_FP32_HALF(b_ptr[l].e[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
|
|
1267
|
+
}
|
|
1268
|
+
}
|
|
1269
|
+
}
|
|
1270
|
+
for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
|
|
1271
|
+
}
|
|
1272
|
+
}
|
|
1273
|
+
|
|
1274
|
+
void ggml_gemv_q8_0_4x4_q8_0_generic(int n,
|
|
1275
|
+
float * GGML_RESTRICT s,
|
|
1276
|
+
size_t bs,
|
|
1277
|
+
const void * GGML_RESTRICT vx,
|
|
1278
|
+
const void * GGML_RESTRICT vy,
|
|
1279
|
+
int nr,
|
|
1280
|
+
int nc) {
|
|
1281
|
+
const int qk = QK8_0;
|
|
1282
|
+
const int nb = n / qk;
|
|
1283
|
+
const int ncols_interleaved = 4;
|
|
1284
|
+
const int blocklen = 4;
|
|
1285
|
+
|
|
1286
|
+
assert(nr == 1);
|
|
1287
|
+
assert(n % qk == 0);
|
|
1288
|
+
assert(nc % ncols_interleaved == 0);
|
|
1289
|
+
|
|
1290
|
+
UNUSED(bs);
|
|
1291
|
+
UNUSED(nr);
|
|
1292
|
+
|
|
1293
|
+
float sumf[4];
|
|
1294
|
+
int sumi;
|
|
1295
|
+
|
|
1296
|
+
const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
|
|
1297
|
+
for (int x = 0; x < nc / ncols_interleaved; x++) {
|
|
1298
|
+
const block_q8_0x4 * b_ptr = (const block_q8_0x4 *) vx + (x * nb);
|
|
1299
|
+
|
|
1300
|
+
for (int j = 0; j < ncols_interleaved; j++) {
|
|
1301
|
+
sumf[j] = 0.0;
|
|
1302
|
+
}
|
|
1303
|
+
for (int l = 0; l < nb; l++) {
|
|
1304
|
+
for (int k = 0; k < (qk / blocklen); k++) {
|
|
1305
|
+
for (int j = 0; j < ncols_interleaved; j++) {
|
|
1306
|
+
sumi = 0;
|
|
1307
|
+
for (int i = 0; i < blocklen; ++i) {
|
|
1308
|
+
const int v0 = b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i];
|
|
1309
|
+
sumi += v0 * a_ptr[l].qs[k * blocklen + i];
|
|
1310
|
+
}
|
|
1311
|
+
sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
|
|
1312
|
+
}
|
|
1313
|
+
}
|
|
1314
|
+
}
|
|
1315
|
+
for (int j = 0; j < ncols_interleaved; j++) {
|
|
1316
|
+
s[x * ncols_interleaved + j] = sumf[j];
|
|
1317
|
+
}
|
|
1318
|
+
}
|
|
1319
|
+
}
|
|
1320
|
+
|
|
1321
|
+
void ggml_gemv_q8_0_4x8_q8_0_generic(int n,
|
|
1322
|
+
float * GGML_RESTRICT s,
|
|
1323
|
+
size_t bs,
|
|
1324
|
+
const void * GGML_RESTRICT vx,
|
|
1325
|
+
const void * GGML_RESTRICT vy,
|
|
1326
|
+
int nr,
|
|
1327
|
+
int nc) {
|
|
1328
|
+
const int qk = QK8_0;
|
|
1329
|
+
const int nb = n / qk;
|
|
1330
|
+
const int ncols_interleaved = 4;
|
|
1331
|
+
const int blocklen = 8;
|
|
1332
|
+
|
|
1333
|
+
assert(nr == 1);
|
|
1334
|
+
assert(n % qk == 0);
|
|
1335
|
+
assert(nc % ncols_interleaved == 0);
|
|
1336
|
+
|
|
1337
|
+
UNUSED(bs);
|
|
1338
|
+
UNUSED(nr);
|
|
1339
|
+
|
|
1340
|
+
float sumf[4];
|
|
1341
|
+
int sumi;
|
|
1342
|
+
|
|
1343
|
+
const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
|
|
1344
|
+
for (int x = 0; x < nc / ncols_interleaved; x++) {
|
|
1345
|
+
const block_q8_0x4 * b_ptr = (const block_q8_0x4 *) vx + (x * nb);
|
|
1346
|
+
|
|
1347
|
+
for (int j = 0; j < ncols_interleaved; j++) {
|
|
1348
|
+
sumf[j] = 0.0;
|
|
1349
|
+
}
|
|
1350
|
+
for (int l = 0; l < nb; l++) {
|
|
1351
|
+
for (int k = 0; k < (qk / blocklen); k++) {
|
|
1352
|
+
for (int j = 0; j < ncols_interleaved; j++) {
|
|
1353
|
+
sumi = 0;
|
|
1354
|
+
for (int i = 0; i < blocklen; ++i) {
|
|
1355
|
+
const int v0 = b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i];
|
|
1356
|
+
sumi += v0 * a_ptr[l].qs[k * blocklen + i];
|
|
1357
|
+
}
|
|
1358
|
+
sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
|
|
1359
|
+
}
|
|
1360
|
+
}
|
|
1361
|
+
}
|
|
1362
|
+
for (int j = 0; j < ncols_interleaved; j++) {
|
|
1363
|
+
s[x * ncols_interleaved + j] = sumf[j];
|
|
1364
|
+
}
|
|
1365
|
+
}
|
|
1366
|
+
}
|
|
1367
|
+
|
|
1368
|
+
// Only enable these for RISC-V.
|
|
1369
|
+
#if defined __riscv_zvfh
|
|
1370
|
+
void ggml_gemv_q4_0_16x1_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
|
1371
|
+
const int qk = QK8_0;
|
|
1372
|
+
const int nb = n / qk;
|
|
1373
|
+
const int ncols_interleaved = 16;
|
|
1374
|
+
const int blocklen = 1;
|
|
1375
|
+
|
|
1376
|
+
assert (n % qk == 0);
|
|
1377
|
+
assert (nc % ncols_interleaved == 0);
|
|
1378
|
+
|
|
1379
|
+
UNUSED(s);
|
|
1380
|
+
UNUSED(bs);
|
|
1381
|
+
UNUSED(vx);
|
|
1382
|
+
UNUSED(vy);
|
|
1383
|
+
UNUSED(nr);
|
|
1384
|
+
UNUSED(nc);
|
|
1385
|
+
UNUSED(nb);
|
|
1386
|
+
UNUSED(ncols_interleaved);
|
|
1387
|
+
UNUSED(blocklen);
|
|
1388
|
+
|
|
1389
|
+
float sumf[16];
|
|
1390
|
+
int sumi;
|
|
1391
|
+
|
|
1392
|
+
const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
|
|
1393
|
+
for (int x = 0; x < nc / ncols_interleaved; x++) {
|
|
1394
|
+
const block_q4_0x16 * b_ptr = (const block_q4_0x16 *) vx + (x * nb);
|
|
1395
|
+
|
|
1396
|
+
for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
|
|
1397
|
+
for (int l = 0; l < nb; l++) {
|
|
1398
|
+
for (int k = 0; k < (qk / (2 * blocklen)); k++) {
|
|
1399
|
+
for (int j = 0; j < ncols_interleaved; j++) {
|
|
1400
|
+
sumi = 0;
|
|
1401
|
+
for (int i = 0; i < blocklen; ++i) {
|
|
1402
|
+
const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
|
|
1403
|
+
const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
|
|
1404
|
+
sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
|
|
1405
|
+
}
|
|
1406
|
+
sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
|
|
1407
|
+
}
|
|
1408
|
+
}
|
|
1409
|
+
}
|
|
1410
|
+
for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
|
|
1411
|
+
}
|
|
1412
|
+
}
|
|
1413
|
+
|
|
1414
|
+
void ggml_gemv_q4_K_16x1_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
|
1415
|
+
const int qk = QK_K;
|
|
1416
|
+
const int nb = n / qk;
|
|
1417
|
+
const int ncols_interleaved = 16;
|
|
1418
|
+
const int blocklen = 1;
|
|
1419
|
+
assert (n % qk == 0);
|
|
1420
|
+
assert (nc % ncols_interleaved == 0);
|
|
1421
|
+
UNUSED(s);
|
|
1422
|
+
UNUSED(bs);
|
|
1423
|
+
UNUSED(vx);
|
|
1424
|
+
UNUSED(vy);
|
|
1425
|
+
UNUSED(nr);
|
|
1426
|
+
UNUSED(nc);
|
|
1427
|
+
UNUSED(nb);
|
|
1428
|
+
UNUSED(ncols_interleaved);
|
|
1429
|
+
UNUSED(blocklen);
|
|
1430
|
+
float sumf[16];
|
|
1431
|
+
float sum_minf[16];
|
|
1432
|
+
uint8_t scales[128];
|
|
1433
|
+
uint8_t mins[128];
|
|
1434
|
+
int sumi1;
|
|
1435
|
+
int sumi2;
|
|
1436
|
+
int sumi;
|
|
1437
|
+
const block_q8_K * a_ptr = (const block_q8_K *) vy;
|
|
1438
|
+
for (int x = 0; x < nc / ncols_interleaved; x++) {
|
|
1439
|
+
const block_q4_Kx16 * b_ptr = (const block_q4_Kx16 *) vx + (x * nb);
|
|
1440
|
+
for (int j = 0; j < ncols_interleaved; j++) {
|
|
1441
|
+
sumf[j] = 0.0f;
|
|
1442
|
+
sum_minf[j] = 0.0f;
|
|
1443
|
+
}
|
|
1444
|
+
for (int l = 0; l < nb; l++) {
|
|
1445
|
+
for (int i = 0; i < 128; i++) {
|
|
1446
|
+
scales[i] = b_ptr[l].scales[i] & 0x0F;
|
|
1447
|
+
mins[i] = b_ptr[l].scales[i] >> 4;
|
|
1448
|
+
}
|
|
1449
|
+
for (int i = 0; i < 64; i++) {
|
|
1450
|
+
scales[i] |= (b_ptr[l].scales[128 + i] & 0x03) << 4;
|
|
1451
|
+
mins[i] |= (b_ptr[l].scales[128 + i] & 0x0C) << 2;
|
|
1452
|
+
scales[i + 64] |= (b_ptr[l].scales[128 + i] & 0x30);
|
|
1453
|
+
mins[i + 64] |= (b_ptr[l].scales[128 + i] & 0xC0) >> 2;
|
|
1454
|
+
}
|
|
1455
|
+
for (int sb = 0; sb < 8; sb++) {
|
|
1456
|
+
uint8_t *min = &mins[sb * 16];
|
|
1457
|
+
for (int j = 0; j < ncols_interleaved; j++) {
|
|
1458
|
+
sum_minf[j] += min[j] * (a_ptr[l].bsums[sb * 2] + a_ptr[l].bsums[sb * 2 + 1]) * GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d;
|
|
1459
|
+
}
|
|
1460
|
+
}
|
|
1461
|
+
for (int sb = 0; sb < 8; sb += 2) {
|
|
1462
|
+
uint8_t *scales_0 = &scales[sb * 16];
|
|
1463
|
+
uint8_t *scales_1 = &scales[(sb + 1) * 16];
|
|
1464
|
+
for (int i = 0; i < QK4_0; i++) {
|
|
1465
|
+
for (int j = 0; j < ncols_interleaved; j++) {
|
|
1466
|
+
sumi1 = 0;
|
|
1467
|
+
sumi2 = 0;
|
|
1468
|
+
sumi = 0;
|
|
1469
|
+
const int v0 = (int8_t) (b_ptr[l].qs[sb * 256 + i * 16 + j] & 0xF);
|
|
1470
|
+
const int v1 = (int8_t) (b_ptr[l].qs[sb * 256 + i * 16 + j] >> 4);
|
|
1471
|
+
sumi1 = (v0 * a_ptr[l].qs[sb * 32 + i]);
|
|
1472
|
+
sumi2 = (v1 * a_ptr[l].qs[sb * 32 + 32 + i]);
|
|
1473
|
+
sumi1 = sumi1 * scales_0[j];
|
|
1474
|
+
sumi2 = sumi2 * scales_1[j];
|
|
1475
|
+
sumi += sumi1 + sumi2;
|
|
1476
|
+
sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d;
|
|
1477
|
+
}
|
|
1478
|
+
}
|
|
1479
|
+
}
|
|
1480
|
+
}
|
|
1481
|
+
for (int j = 0; j < ncols_interleaved; j++) {
|
|
1482
|
+
s[x * ncols_interleaved + j] = sumf[j] - sum_minf[j];
|
|
1483
|
+
}
|
|
1484
|
+
}
|
|
1485
|
+
}
|
|
1486
|
+
|
|
1487
|
+
void ggml_gemv_iq4_nl_16x1_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
|
1488
|
+
const int qk = QK8_0;
|
|
1489
|
+
const int nb = n / qk;
|
|
1490
|
+
const int ncols_interleaved = 16;
|
|
1491
|
+
const int blocklen = 1;
|
|
1492
|
+
|
|
1493
|
+
assert(nr == 1);
|
|
1494
|
+
assert(n % qk == 0);
|
|
1495
|
+
assert(nc % ncols_interleaved == 0);
|
|
1496
|
+
|
|
1497
|
+
UNUSED(bs);
|
|
1498
|
+
UNUSED(nr);
|
|
1499
|
+
|
|
1500
|
+
float sumf[16];
|
|
1501
|
+
int sumi;
|
|
1502
|
+
|
|
1503
|
+
const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
|
|
1504
|
+
for (int x = 0; x < nc / ncols_interleaved; x++) {
|
|
1505
|
+
const block_iq4_nlx16 * b_ptr = (const block_iq4_nlx16 *) vx + (x * nb);
|
|
1506
|
+
|
|
1507
|
+
for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
|
|
1508
|
+
for (int l = 0; l < nb; l++) {
|
|
1509
|
+
for (int k = 0; k < (qk / (2 * blocklen)); k++) {
|
|
1510
|
+
for (int j = 0; j < ncols_interleaved; j++) {
|
|
1511
|
+
sumi = 0;
|
|
1512
|
+
for (int i = 0; i < blocklen; ++i) {
|
|
1513
|
+
const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
|
|
1514
|
+
const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
|
|
1515
|
+
sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2]));
|
|
1516
|
+
}
|
|
1517
|
+
sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
|
|
1518
|
+
}
|
|
1519
|
+
}
|
|
1520
|
+
}
|
|
1521
|
+
for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
|
|
1522
|
+
}
|
|
1523
|
+
}
|
|
1524
|
+
|
|
1525
|
+
void ggml_gemv_q8_0_16x1_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
|
1526
|
+
const int qk = QK8_0;
|
|
1527
|
+
const int nb = n / qk;
|
|
1528
|
+
const int ncols_interleaved = 16;
|
|
1529
|
+
const int blocklen = 1;
|
|
1530
|
+
|
|
1531
|
+
assert(nr == 1);
|
|
1532
|
+
assert(n % qk == 0);
|
|
1533
|
+
assert(nc % ncols_interleaved == 0);
|
|
1534
|
+
|
|
1535
|
+
UNUSED(bs);
|
|
1536
|
+
UNUSED(nr);
|
|
1537
|
+
|
|
1538
|
+
float sumf[16];
|
|
1539
|
+
int sumi;
|
|
1540
|
+
|
|
1541
|
+
const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
|
|
1542
|
+
for (int x = 0; x < nc / ncols_interleaved; x++) {
|
|
1543
|
+
const block_q8_0x16 * b_ptr = (const block_q8_0x16 *) vx + (x * nb);
|
|
1544
|
+
|
|
1545
|
+
for (int j = 0; j < ncols_interleaved; j++) {
|
|
1546
|
+
sumf[j] = 0.0;
|
|
1547
|
+
}
|
|
1548
|
+
for (int l = 0; l < nb; l++) {
|
|
1549
|
+
for (int k = 0; k < (qk / blocklen); k++) {
|
|
1550
|
+
for (int j = 0; j < ncols_interleaved; j++) {
|
|
1551
|
+
sumi = 0;
|
|
1552
|
+
for (int i = 0; i < blocklen; ++i) {
|
|
1553
|
+
const int v0 = b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i];
|
|
1554
|
+
sumi += v0 * a_ptr[l].qs[k * blocklen + i];
|
|
1555
|
+
}
|
|
1556
|
+
sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
|
|
1557
|
+
}
|
|
1558
|
+
}
|
|
1559
|
+
}
|
|
1560
|
+
for (int j = 0; j < ncols_interleaved; j++) {
|
|
1561
|
+
s[x * ncols_interleaved + j] = sumf[j];
|
|
1562
|
+
}
|
|
1563
|
+
}
|
|
1564
|
+
}
|
|
1565
|
+
|
|
1566
|
+
void ggml_gemv_q2_K_16x1_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
|
1567
|
+
assert(n % QK_K == 0);
|
|
1568
|
+
assert(nr == 1);
|
|
1569
|
+
assert(nc % 16 == 0);
|
|
1570
|
+
|
|
1571
|
+
UNUSED(bs);
|
|
1572
|
+
UNUSED(nr);
|
|
1573
|
+
|
|
1574
|
+
const int nb = n / QK_K;
|
|
1575
|
+
const block_q2_Kx16 * x = (const block_q2_Kx16 *)vx;
|
|
1576
|
+
const block_q8_K * y = (const block_q8_K *)vy;
|
|
1577
|
+
|
|
1578
|
+
// Layout: Even-Low(0,2,4,6), Odd-Low(1,3,5,7), Even-High(8...), Odd-High(9...)
|
|
1579
|
+
const int sb_perm[16] = {
|
|
1580
|
+
0, 4, 1, 5, 2, 6, 3, 7, // 0-7
|
|
1581
|
+
8, 12, 9, 13, 10, 14, 11, 15 // 8-15
|
|
1582
|
+
};
|
|
1583
|
+
|
|
1584
|
+
for (int col_tile = 0; col_tile < nc; col_tile += 16) {
|
|
1585
|
+
const block_q2_Kx16 * x_ptr = x + (col_tile / 16) * nb;
|
|
1586
|
+
const block_q8_K * y_ptr = y;
|
|
1587
|
+
|
|
1588
|
+
float sumf[16] = {0};
|
|
1589
|
+
|
|
1590
|
+
// Loop over K-blocks
|
|
1591
|
+
for (int k_block = 0; k_block < nb; ++k_block) {
|
|
1592
|
+
int32_t isum[16] = {0};
|
|
1593
|
+
int32_t summs[16] = {0};
|
|
1594
|
+
|
|
1595
|
+
const uint8_t * qs_rhs = x_ptr[k_block].qs;
|
|
1596
|
+
const uint8_t * sc_rhs = x_ptr[k_block].scales;
|
|
1597
|
+
const int8_t * qs_lhs = y_ptr[k_block].qs;
|
|
1598
|
+
const int16_t * bs_lhs = y_ptr[k_block].bsums;
|
|
1599
|
+
|
|
1600
|
+
// Iterate over sub-blocks 0..15
|
|
1601
|
+
for (int sb = 0; sb < 16; ++sb) {
|
|
1602
|
+
// Correction Term
|
|
1603
|
+
int16_t bsum = bs_lhs[sb];
|
|
1604
|
+
int scale_offset = sb_perm[sb] * 16;
|
|
1605
|
+
|
|
1606
|
+
for (int col = 0; col < 16; ++col) {
|
|
1607
|
+
uint8_t sc_val = sc_rhs[scale_offset + col];
|
|
1608
|
+
summs[col] += bsum * (sc_val >> 4); // Min is high 4 bits
|
|
1609
|
+
}
|
|
1610
|
+
|
|
1611
|
+
// Main Dot Product
|
|
1612
|
+
// Calculate base offsets for Q2 unpacking based on SB
|
|
1613
|
+
int byte_base;
|
|
1614
|
+
if (sb < 8) byte_base = (sb % 2 == 0) ? 0 : 16;
|
|
1615
|
+
else byte_base = (sb % 2 == 0) ? 32 : 48;
|
|
1616
|
+
|
|
1617
|
+
int shift = ((sb / 2) % 4) * 2;
|
|
1618
|
+
|
|
1619
|
+
for (int col = 0; col < 16; ++col) {
|
|
1620
|
+
uint8_t sc_val = sc_rhs[scale_offset + col];
|
|
1621
|
+
int32_t d_sb = sc_val & 0xF; // Scale is low 4 bits
|
|
1622
|
+
|
|
1623
|
+
// Process 16 elements (l=0..15)
|
|
1624
|
+
for (int l = 0; l < 16; ++l) {
|
|
1625
|
+
// Q2: Interleaved by column. Byte `l` contains 4 k-values.
|
|
1626
|
+
int qs_idx = (byte_base + l) * 16 + col;
|
|
1627
|
+
uint8_t q2_val = (qs_rhs[qs_idx] >> shift) & 3;
|
|
1628
|
+
|
|
1629
|
+
// Q8: Linear access
|
|
1630
|
+
int k = sb * 16 + l;
|
|
1631
|
+
int8_t q8_val = qs_lhs[k];
|
|
1632
|
+
|
|
1633
|
+
isum[col] += q8_val * q2_val * d_sb;
|
|
1634
|
+
}
|
|
1635
|
+
}
|
|
1636
|
+
}
|
|
1637
|
+
|
|
1638
|
+
// Finalize K-Block
|
|
1639
|
+
for (int col = 0; col < 16; ++col) {
|
|
1640
|
+
float d_lhs = y_ptr[k_block].d;
|
|
1641
|
+
float d_rhs = GGML_FP16_TO_FP32(x_ptr[k_block].d[col]);
|
|
1642
|
+
float dm_rhs = GGML_FP16_TO_FP32(x_ptr[k_block].dmin[col]);
|
|
1643
|
+
|
|
1644
|
+
float d_all = d_lhs * d_rhs;
|
|
1645
|
+
float d_min = d_lhs * dm_rhs;
|
|
1646
|
+
|
|
1647
|
+
sumf[col] += (isum[col] * d_all) - (summs[col] * d_min);
|
|
1648
|
+
}
|
|
1649
|
+
}
|
|
1650
|
+
|
|
1651
|
+
for (int col = 0; col < 16; ++col) {
|
|
1652
|
+
s[col_tile + col] = sumf[col];
|
|
1653
|
+
}
|
|
1654
|
+
}
|
|
1655
|
+
}
|
|
1656
|
+
#endif
|
|
1657
|
+
|
|
1658
|
+
void ggml_gemm_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
|
1659
|
+
const int qk = QK8_0;
|
|
1660
|
+
const int nb = n / qk;
|
|
1661
|
+
const int ncols_interleaved = 4;
|
|
1662
|
+
const int blocklen = 4;
|
|
1663
|
+
|
|
1664
|
+
assert (n % qk == 0);
|
|
1665
|
+
assert (nr % 4 == 0);
|
|
1666
|
+
assert (nc % ncols_interleaved == 0);
|
|
1667
|
+
|
|
1668
|
+
UNUSED(s);
|
|
1669
|
+
UNUSED(bs);
|
|
1670
|
+
UNUSED(vx);
|
|
1671
|
+
UNUSED(vy);
|
|
1672
|
+
UNUSED(nr);
|
|
1673
|
+
UNUSED(nc);
|
|
1674
|
+
UNUSED(nb);
|
|
1675
|
+
UNUSED(ncols_interleaved);
|
|
1676
|
+
UNUSED(blocklen);
|
|
1677
|
+
|
|
1678
|
+
{
|
|
1679
|
+
float sumf[4][4];
|
|
1680
|
+
int sumi;
|
|
1681
|
+
|
|
1682
|
+
for (int y = 0; y < nr / 4; y++) {
|
|
1683
|
+
const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
|
|
1684
|
+
for (int x = 0; x < nc / ncols_interleaved; x++) {
|
|
1685
|
+
const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb);
|
|
1686
|
+
for (int m = 0; m < 4; m++) {
|
|
1687
|
+
for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
|
|
1688
|
+
}
|
|
1689
|
+
for (int l = 0; l < nb; l++) {
|
|
1690
|
+
for (int k = 0; k < (qk / (2 * blocklen)); k++) {
|
|
1691
|
+
for (int m = 0; m < 4; m++) {
|
|
1692
|
+
for (int j = 0; j < ncols_interleaved; j++) {
|
|
1693
|
+
sumi = 0;
|
|
1694
|
+
for (int i = 0; i < blocklen; ++i) {
|
|
1695
|
+
const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
|
|
1696
|
+
const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
|
|
1697
|
+
sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
|
|
1698
|
+
(v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
|
|
1699
|
+
}
|
|
1700
|
+
sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
|
|
1701
|
+
}
|
|
1702
|
+
}
|
|
1703
|
+
}
|
|
1704
|
+
}
|
|
1705
|
+
for (int m = 0; m < 4; m++) {
|
|
1706
|
+
for (int j = 0; j < ncols_interleaved; j++)
|
|
1707
|
+
s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
|
|
1708
|
+
}
|
|
1709
|
+
}
|
|
1710
|
+
}
|
|
1711
|
+
}
|
|
1712
|
+
}
|
|
1713
|
+
|
|
1714
|
+
void ggml_gemm_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
|
1715
|
+
const int qk = QK8_0;
|
|
1716
|
+
const int nb = n / qk;
|
|
1717
|
+
const int ncols_interleaved = 4;
|
|
1718
|
+
const int blocklen = 8;
|
|
1719
|
+
|
|
1720
|
+
assert (n % qk == 0);
|
|
1721
|
+
assert (nr % 4 == 0);
|
|
1722
|
+
assert (nc % ncols_interleaved == 0);
|
|
1723
|
+
|
|
1724
|
+
UNUSED(s);
|
|
1725
|
+
UNUSED(bs);
|
|
1726
|
+
UNUSED(vx);
|
|
1727
|
+
UNUSED(vy);
|
|
1728
|
+
UNUSED(nr);
|
|
1729
|
+
UNUSED(nc);
|
|
1730
|
+
UNUSED(nb);
|
|
1731
|
+
UNUSED(ncols_interleaved);
|
|
1732
|
+
UNUSED(blocklen);
|
|
1733
|
+
|
|
1734
|
+
float sumf[4][4];
|
|
1735
|
+
int sumi;
|
|
1736
|
+
|
|
1737
|
+
for (int y = 0; y < nr / 4; y++) {
|
|
1738
|
+
const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
|
|
1739
|
+
for (int x = 0; x < nc / ncols_interleaved; x++) {
|
|
1740
|
+
const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb);
|
|
1741
|
+
for (int m = 0; m < 4; m++) {
|
|
1742
|
+
for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
|
|
1743
|
+
}
|
|
1744
|
+
for (int l = 0; l < nb; l++) {
|
|
1745
|
+
for (int k = 0; k < (qk / (2 * blocklen)); k++) {
|
|
1746
|
+
for (int m = 0; m < 4; m++) {
|
|
1747
|
+
for (int j = 0; j < ncols_interleaved; j++) {
|
|
1748
|
+
sumi = 0;
|
|
1749
|
+
for (int i = 0; i < blocklen; ++i) {
|
|
1750
|
+
const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
|
|
1751
|
+
const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
|
|
1752
|
+
sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
|
|
1753
|
+
(v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
|
|
1754
|
+
}
|
|
1755
|
+
sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
|
|
1756
|
+
}
|
|
1757
|
+
}
|
|
1758
|
+
}
|
|
1759
|
+
}
|
|
1760
|
+
for (int m = 0; m < 4; m++) {
|
|
1761
|
+
for (int j = 0; j < ncols_interleaved; j++)
|
|
1762
|
+
s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
|
|
1763
|
+
}
|
|
1764
|
+
}
|
|
1765
|
+
}
|
|
1766
|
+
}
|
|
1767
|
+
|
|
1768
|
+
void ggml_gemm_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
|
1769
|
+
const int qk = QK8_0;
|
|
1770
|
+
const int nb = n / qk;
|
|
1771
|
+
const int ncols_interleaved = 8;
|
|
1772
|
+
const int blocklen = 8;
|
|
1773
|
+
|
|
1774
|
+
assert (n % qk == 0);
|
|
1775
|
+
assert (nr % 4 == 0);
|
|
1776
|
+
assert (nc % ncols_interleaved == 0);
|
|
1777
|
+
|
|
1778
|
+
UNUSED(s);
|
|
1779
|
+
UNUSED(bs);
|
|
1780
|
+
UNUSED(vx);
|
|
1781
|
+
UNUSED(vy);
|
|
1782
|
+
UNUSED(nr);
|
|
1783
|
+
UNUSED(nc);
|
|
1784
|
+
UNUSED(nb);
|
|
1785
|
+
UNUSED(ncols_interleaved);
|
|
1786
|
+
UNUSED(blocklen);
|
|
1787
|
+
|
|
1788
|
+
float sumf[4][8];
|
|
1789
|
+
int sumi;
|
|
1790
|
+
|
|
1791
|
+
for (int y = 0; y < nr / 4; y++) {
|
|
1792
|
+
const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
|
|
1793
|
+
for (int x = 0; x < nc / ncols_interleaved; x++) {
|
|
1794
|
+
const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb);
|
|
1795
|
+
for (int m = 0; m < 4; m++) {
|
|
1796
|
+
for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
|
|
1797
|
+
}
|
|
1798
|
+
for (int l = 0; l < nb; l++) {
|
|
1799
|
+
for (int k = 0; k < (qk / (2 * blocklen)); k++) {
|
|
1800
|
+
for (int m = 0; m < 4; m++) {
|
|
1801
|
+
for (int j = 0; j < ncols_interleaved; j++) {
|
|
1802
|
+
sumi = 0;
|
|
1803
|
+
for (int i = 0; i < blocklen; ++i) {
|
|
1804
|
+
const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
|
|
1805
|
+
const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
|
|
1806
|
+
sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
|
|
1807
|
+
(v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
|
|
1808
|
+
}
|
|
1809
|
+
sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
|
|
1810
|
+
}
|
|
1811
|
+
}
|
|
1812
|
+
}
|
|
1813
|
+
}
|
|
1814
|
+
for (int m = 0; m < 4; m++) {
|
|
1815
|
+
for (int j = 0; j < ncols_interleaved; j++)
|
|
1816
|
+
s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
|
|
1817
|
+
}
|
|
1818
|
+
}
|
|
1819
|
+
}
|
|
1820
|
+
}
|
|
1821
|
+
|
|
1822
|
+
void ggml_gemm_q4_K_8x4_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
|
1823
|
+
const int qk = QK_K;
|
|
1824
|
+
const int nb = n / qk;
|
|
1825
|
+
const int ncols_interleaved = 8;
|
|
1826
|
+
const int blocklen = 4;
|
|
1827
|
+
static const uint32_t kmask1 = 0x3f3f3f3f;
|
|
1828
|
+
static const uint32_t kmask2 = 0x0f0f0f0f;
|
|
1829
|
+
static const uint32_t kmask3 = 0x03030303;
|
|
1830
|
+
|
|
1831
|
+
assert (n % qk == 0);
|
|
1832
|
+
assert (nr % 4 == 0);
|
|
1833
|
+
assert (nc % ncols_interleaved == 0);
|
|
1834
|
+
|
|
1835
|
+
UNUSED(nb);
|
|
1836
|
+
UNUSED(ncols_interleaved);
|
|
1837
|
+
UNUSED(blocklen);
|
|
1838
|
+
|
|
1839
|
+
float sumf[4][8];
|
|
1840
|
+
float sum_minf[4][8];
|
|
1841
|
+
uint32_t utmp[32];
|
|
1842
|
+
int sumi1;
|
|
1843
|
+
int sumi2;
|
|
1844
|
+
int sumi;
|
|
1845
|
+
|
|
1846
|
+
for (int y = 0; y < nr / 4; y++) {
|
|
1847
|
+
const block_q8_Kx4 * a_ptr = (const block_q8_Kx4 *) vy + (y * nb);
|
|
1848
|
+
for (int x = 0; x < nc / ncols_interleaved; x++) {
|
|
1849
|
+
const block_q4_Kx8 * b_ptr = (const block_q4_Kx8 *) vx + (x * nb);
|
|
1850
|
+
for (int m = 0; m < 4; m++) {
|
|
1851
|
+
for (int j = 0; j < ncols_interleaved; j++) {
|
|
1852
|
+
sumf[m][j] = 0.0;
|
|
1853
|
+
sum_minf[m][j] = 0.0;
|
|
1854
|
+
}
|
|
1855
|
+
}
|
|
1856
|
+
for (int l = 0; l < nb; l++) {
|
|
1857
|
+
for (int sb = 0; sb < 8; sb++) {
|
|
1858
|
+
memcpy(utmp + sb * 4, b_ptr[l].scales + sb * 12, 12);
|
|
1859
|
+
utmp[sb * 4 + 3] = ((utmp[sb * 4 + 2] >> 4) & kmask2) | (((utmp[sb * 4 + 1] >> 6) & kmask3) << 4);
|
|
1860
|
+
const uint32_t uaux_0 = utmp[sb * 4 + 1] & kmask1;
|
|
1861
|
+
utmp[sb * 4 + 1] = (utmp[sb * 4 + 2] & kmask2) | (((utmp[sb * 4 + 0] >> 6) & kmask3) << 4);
|
|
1862
|
+
utmp[sb * 4 + 2] = uaux_0;
|
|
1863
|
+
utmp[sb * 4 + 0] &= kmask1;
|
|
1864
|
+
}
|
|
1865
|
+
for (int k = 0; k < (qk / (2 * blocklen)); k++) {
|
|
1866
|
+
uint8_t * scales_0 = (uint8_t *) utmp + (k / 8) * 32;
|
|
1867
|
+
uint8_t * scales_1 = (uint8_t *) utmp + (k / 8) * 32 + 16;
|
|
1868
|
+
for (int m = 0; m < 4; m++) {
|
|
1869
|
+
for (int j = 0; j < ncols_interleaved; j++) {
|
|
1870
|
+
sumi1 = 0;
|
|
1871
|
+
sumi2 = 0;
|
|
1872
|
+
sumi = 0;
|
|
1873
|
+
for (int i = 0; i < blocklen; ++i) {
|
|
1874
|
+
const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF);
|
|
1875
|
+
const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4);
|
|
1876
|
+
sumi1 = (v0 * a_ptr[l].qs[(k / 8) * 256 + (k % 8) * 4 * blocklen + m * blocklen + i]);
|
|
1877
|
+
sumi2 = (v1 * a_ptr[l].qs[(k / 8) * 256 + (k % 8) * 4 * blocklen + m * blocklen + i + 128]);
|
|
1878
|
+
sumi1 = sumi1 * scales_0[j];
|
|
1879
|
+
sumi2 = sumi2 * scales_1[j];
|
|
1880
|
+
sumi += sumi1 + sumi2;
|
|
1881
|
+
}
|
|
1882
|
+
sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d[m];
|
|
1883
|
+
}
|
|
1884
|
+
}
|
|
1885
|
+
}
|
|
1886
|
+
for (int sb = 0; sb < 8; sb++) {
|
|
1887
|
+
uint8_t * mins = (uint8_t *) utmp + 8 + sb * 16;
|
|
1888
|
+
for(int m = 0; m < 4; m++) {
|
|
1889
|
+
const int16_t * bsums = a_ptr[l].bsums + (sb * 8) + (m * 4) - ((sb % 2) * 6);
|
|
1890
|
+
for(int j = 0; j < ncols_interleaved; j++) {
|
|
1891
|
+
sum_minf[m][j] += mins[j] * (bsums[0] + bsums[1]) * GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d[m];
|
|
1892
|
+
}
|
|
1893
|
+
}
|
|
1894
|
+
}
|
|
1895
|
+
}
|
|
1896
|
+
for (int m = 0; m < 4; m++) {
|
|
1897
|
+
for (int j = 0; j < ncols_interleaved; j++) {
|
|
1898
|
+
s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j] - sum_minf[m][j];
|
|
1899
|
+
}
|
|
1900
|
+
}
|
|
1901
|
+
}
|
|
1902
|
+
}
|
|
1903
|
+
}
|
|
1904
|
+
|
|
1905
|
+
void ggml_gemm_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
|
1906
|
+
const int qk = QK_K;
|
|
1907
|
+
const int nb = n / qk;
|
|
1908
|
+
const int ncols_interleaved = 8;
|
|
1909
|
+
const int blocklen = 8;
|
|
1910
|
+
static const uint32_t kmask1 = 0x3f3f3f3f;
|
|
1911
|
+
static const uint32_t kmask2 = 0x0f0f0f0f;
|
|
1912
|
+
static const uint32_t kmask3 = 0x03030303;
|
|
1913
|
+
|
|
1914
|
+
assert (n % qk == 0);
|
|
1915
|
+
assert (nr % 4 == 0);
|
|
1916
|
+
assert (nc % ncols_interleaved == 0);
|
|
1917
|
+
|
|
1918
|
+
UNUSED(bs);
|
|
1919
|
+
|
|
1920
|
+
float sumf[4][8];
|
|
1921
|
+
float sum_minf[4][8];
|
|
1922
|
+
uint32_t utmp[32];
|
|
1923
|
+
int sumi1;
|
|
1924
|
+
int sumi2;
|
|
1925
|
+
int sumi;
|
|
1926
|
+
|
|
1927
|
+
for (int y = 0; y < nr / 4; y++) {
|
|
1928
|
+
const block_q8_Kx4 * a_ptr = (const block_q8_Kx4 *) vy + (y * nb);
|
|
1929
|
+
for (int x = 0; x < nc / ncols_interleaved; x++) {
|
|
1930
|
+
const block_q4_Kx8 * b_ptr = (const block_q4_Kx8 *) vx + (x * nb);
|
|
1931
|
+
for (int m = 0; m < 4; m++) {
|
|
1932
|
+
for (int j = 0; j < ncols_interleaved; j++) {
|
|
1933
|
+
sumf[m][j] = 0.0;
|
|
1934
|
+
sum_minf[m][j] = 0.0;
|
|
1935
|
+
}
|
|
1936
|
+
}
|
|
1937
|
+
for (int l = 0; l < nb; l++) {
|
|
1938
|
+
for (int sb = 0; sb < 8; sb++) {
|
|
1939
|
+
memcpy(utmp + sb * 4, b_ptr[l].scales + sb * 12, 12);
|
|
1940
|
+
utmp[sb * 4 + 3] = ((utmp[sb * 4 + 2] >> 4) & kmask2) | (((utmp[sb * 4 + 1] >> 6) & kmask3) << 4);
|
|
1941
|
+
const uint32_t uaux_0 = utmp[sb * 4 + 1] & kmask1;
|
|
1942
|
+
utmp[sb * 4 + 1] = (utmp[sb * 4 + 2] & kmask2) | (((utmp[sb * 4 + 0] >> 6) & kmask3) << 4);
|
|
1943
|
+
utmp[sb * 4 + 2] = uaux_0;
|
|
1944
|
+
utmp[sb * 4 + 0] &= kmask1;
|
|
1945
|
+
}
|
|
1946
|
+
for (int k = 0; k < (qk / (2 * blocklen)); k++) {
|
|
1947
|
+
uint8_t *scales_0 = (uint8_t*) utmp + (k / 4) * 32;
|
|
1948
|
+
uint8_t *scales_1 = (uint8_t*) utmp + (k / 4) * 32 + 16;
|
|
1949
|
+
for (int m = 0; m < 4; m++) {
|
|
1950
|
+
for (int j = 0; j < ncols_interleaved; j++) {
|
|
1951
|
+
sumi1 = 0;
|
|
1952
|
+
sumi2 = 0;
|
|
1953
|
+
sumi = 0;
|
|
1954
|
+
for (int i = 0; i < blocklen; ++i) {
|
|
1955
|
+
const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF);
|
|
1956
|
+
const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4);
|
|
1957
|
+
sumi1 = (v0 * a_ptr[l].qs[(k >> 2) * 256 + (k % 4) * 4 * blocklen + m * blocklen + i]);
|
|
1958
|
+
sumi2 = (v1 * a_ptr[l].qs[(k >> 2) * 256 + (k % 4) * 4 * blocklen + m * blocklen + i + 128]);
|
|
1959
|
+
sumi1 = sumi1 * scales_0[j];
|
|
1960
|
+
sumi2 = sumi2 * scales_1[j];
|
|
1961
|
+
sumi += sumi1 + sumi2;
|
|
1962
|
+
}
|
|
1963
|
+
sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d[m];
|
|
1964
|
+
}
|
|
1965
|
+
}
|
|
1966
|
+
}
|
|
1967
|
+
for (int sb = 0; sb < 8; sb++) {
|
|
1968
|
+
uint8_t *mins = (uint8_t*) utmp + 8 + sb * 16;
|
|
1969
|
+
for(int m = 0; m < 4; m++) {
|
|
1970
|
+
const int16_t *bsums = a_ptr[l].bsums + (sb * 8) + (m * 4) - ((sb % 2) * 6);
|
|
1971
|
+
for(int j = 0; j < ncols_interleaved; j++) {
|
|
1972
|
+
sum_minf[m][j] += mins[j] * (bsums[0] + bsums[1]) * GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d[m];
|
|
1973
|
+
}
|
|
1974
|
+
}
|
|
1975
|
+
}
|
|
1976
|
+
}
|
|
1977
|
+
for (int m = 0; m < 4; m++) {
|
|
1978
|
+
for (int j = 0; j < ncols_interleaved; j++) {
|
|
1979
|
+
s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j] - sum_minf[m][j];
|
|
688
1980
|
}
|
|
689
1981
|
}
|
|
690
1982
|
}
|
|
691
|
-
for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
|
|
692
1983
|
}
|
|
693
1984
|
}
|
|
694
1985
|
|
|
695
|
-
void
|
|
696
|
-
|
|
697
|
-
|
|
698
|
-
|
|
699
|
-
|
|
700
|
-
int nr,
|
|
701
|
-
int nc) {
|
|
702
|
-
const int qk = QK8_0;
|
|
703
|
-
const int nb = n / qk;
|
|
704
|
-
const int ncols_interleaved = 4;
|
|
705
|
-
const int blocklen = 4;
|
|
1986
|
+
void ggml_gemm_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
|
1987
|
+
const int qk = QK_K;
|
|
1988
|
+
const int nb = n / qk;
|
|
1989
|
+
const int ncols_interleaved = 8;
|
|
1990
|
+
const int blocklen = 8;
|
|
706
1991
|
|
|
707
|
-
assert(
|
|
708
|
-
assert(
|
|
709
|
-
assert(nc % ncols_interleaved == 0);
|
|
1992
|
+
assert (n % qk == 0);
|
|
1993
|
+
assert (nr % 4 == 0);
|
|
1994
|
+
assert (nc % ncols_interleaved == 0);
|
|
710
1995
|
|
|
1996
|
+
UNUSED(s);
|
|
711
1997
|
UNUSED(bs);
|
|
1998
|
+
UNUSED(vx);
|
|
1999
|
+
UNUSED(vy);
|
|
712
2000
|
UNUSED(nr);
|
|
2001
|
+
UNUSED(nc);
|
|
2002
|
+
UNUSED(nb);
|
|
2003
|
+
UNUSED(ncols_interleaved);
|
|
2004
|
+
UNUSED(blocklen);
|
|
713
2005
|
|
|
714
|
-
float sumf[4];
|
|
715
|
-
|
|
716
|
-
|
|
717
|
-
|
|
718
|
-
for (int x = 0; x < nc / ncols_interleaved; x++) {
|
|
719
|
-
const block_q8_0x4 * b_ptr = (const block_q8_0x4 *) vx + (x * nb);
|
|
2006
|
+
float sumf[4][8];
|
|
2007
|
+
float sum_minf[4][8];
|
|
2008
|
+
int sumi1, sumi2, sumi3, sumi4;
|
|
2009
|
+
int sumi;
|
|
720
2010
|
|
|
721
|
-
|
|
722
|
-
|
|
723
|
-
|
|
724
|
-
|
|
725
|
-
for (int
|
|
2011
|
+
for (int y = 0; y < nr / 4; y++) {
|
|
2012
|
+
const block_q8_Kx4 * a_ptr = (const block_q8_Kx4 *) vy + (y * nb);
|
|
2013
|
+
for (int x = 0; x < nc / ncols_interleaved; x++) {
|
|
2014
|
+
const block_q2_Kx8 * b_ptr = (const block_q2_Kx8 *) vx + (x * nb);
|
|
2015
|
+
for (int m = 0; m < 4; m++) {
|
|
726
2016
|
for (int j = 0; j < ncols_interleaved; j++) {
|
|
727
|
-
|
|
728
|
-
|
|
729
|
-
|
|
730
|
-
|
|
2017
|
+
sumf[m][j] = 0.0;
|
|
2018
|
+
sum_minf[m][j] = 0.0;
|
|
2019
|
+
}
|
|
2020
|
+
}
|
|
2021
|
+
for (int l = 0; l < nb; l++) {
|
|
2022
|
+
for (int k = 0; k < (qk / (4 * blocklen)); k++) {
|
|
2023
|
+
|
|
2024
|
+
const uint8_t *scales_0 = b_ptr[l].scales + (k / 4) * 64 ;
|
|
2025
|
+
const uint8_t *scales_1 = b_ptr[l].scales + (k / 4) * 64 + 16;
|
|
2026
|
+
const uint8_t *scales_2 = b_ptr[l].scales + (k / 4) * 64 + 32;
|
|
2027
|
+
const uint8_t *scales_3 = b_ptr[l].scales + (k / 4) * 64 + 48;
|
|
2028
|
+
for (int m = 0; m < 4; m++) {
|
|
2029
|
+
for (int j = 0; j < ncols_interleaved; j++) {
|
|
2030
|
+
sumi1 = 0;
|
|
2031
|
+
sumi2 = 0;
|
|
2032
|
+
sumi3 = 0;
|
|
2033
|
+
sumi4 = 0;
|
|
2034
|
+
sumi = 0;
|
|
2035
|
+
int offset = ((k / 2) % 2) + j * 2;
|
|
2036
|
+
for (int i = 0; i < blocklen; ++i){
|
|
2037
|
+
const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 3);
|
|
2038
|
+
const int v1 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 2 ) & 3);
|
|
2039
|
+
const int v2 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4 ) & 3);
|
|
2040
|
+
const int v3 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 6 ) & 3);
|
|
2041
|
+
sumi1 = (v0 * a_ptr[l].qs[(k >> 2) * 512 + (k % 4) * 4 * blocklen + m * blocklen + i]);
|
|
2042
|
+
sumi2 = (v1 * a_ptr[l].qs[(k >> 2) * 512 + (k % 4) * 4 * blocklen + m * blocklen + i + 128]);
|
|
2043
|
+
sumi3 = (v2 * a_ptr[l].qs[(k >> 2) * 512 + (k % 4) * 4 * blocklen + m * blocklen + i + 256]);
|
|
2044
|
+
sumi4 = (v3 * a_ptr[l].qs[(k >> 2) * 512 + (k % 4) * 4 * blocklen + m * blocklen + i + 384]);
|
|
2045
|
+
sumi1 = sumi1 * (scales_0[offset] & 0xF);
|
|
2046
|
+
sumi2 = sumi2 * (scales_1[offset] & 0xF);
|
|
2047
|
+
sumi3 = sumi3 * (scales_2[offset] & 0xF);
|
|
2048
|
+
sumi4 = sumi4 * (scales_3[offset] & 0xF);
|
|
2049
|
+
sumi += sumi1 + sumi2 + sumi3 + sumi4;
|
|
2050
|
+
}
|
|
2051
|
+
sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d[m];
|
|
2052
|
+
}
|
|
2053
|
+
}
|
|
2054
|
+
}
|
|
2055
|
+
for(int sb = 0; sb < 8; sb++) {
|
|
2056
|
+
const uint8_t *mins = b_ptr[l].scales + sb * 16;
|
|
2057
|
+
for(int m = 0; m < 4; m++) {
|
|
2058
|
+
const int16_t *bsums = a_ptr[l].bsums + (sb * 8) + (m * 4) - ((sb % 2) * 6);
|
|
2059
|
+
for(int j = 0; j < ncols_interleaved; j++) {
|
|
2060
|
+
int mins_prod = ((mins[j * 2] >> 4) * bsums[0] + (mins[(j * 2)+ 1] >> 4) * bsums[1]);
|
|
2061
|
+
sum_minf[m][j] += (mins_prod) * GGML_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d[m];
|
|
2062
|
+
}
|
|
731
2063
|
}
|
|
732
|
-
sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
|
|
733
2064
|
}
|
|
734
2065
|
}
|
|
735
|
-
|
|
736
|
-
|
|
737
|
-
|
|
2066
|
+
|
|
2067
|
+
for (int m = 0; m < 4; m++) {
|
|
2068
|
+
for (int j = 0; j < ncols_interleaved; j++) {
|
|
2069
|
+
s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j] - sum_minf[m][j];
|
|
2070
|
+
}
|
|
2071
|
+
}
|
|
738
2072
|
}
|
|
739
2073
|
}
|
|
740
2074
|
}
|
|
741
2075
|
|
|
742
|
-
void
|
|
743
|
-
|
|
744
|
-
|
|
745
|
-
const void * GGML_RESTRICT vx,
|
|
746
|
-
const void * GGML_RESTRICT vy,
|
|
747
|
-
int nr,
|
|
748
|
-
int nc) {
|
|
749
|
-
const int qk = QK8_0;
|
|
750
|
-
const int nb = n / qk;
|
|
751
|
-
const int ncols_interleaved = 4;
|
|
752
|
-
const int blocklen = 8;
|
|
753
|
-
|
|
754
|
-
assert(nr == 1);
|
|
755
|
-
assert(n % qk == 0);
|
|
756
|
-
assert(nc % ncols_interleaved == 0);
|
|
757
|
-
|
|
758
|
-
UNUSED(bs);
|
|
759
|
-
UNUSED(nr);
|
|
2076
|
+
void ggml_gemm_q5_K_8x4_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
|
2077
|
+
ggml_gemm_q5_K_NxM_q8_K_generic_impl<4, 8>(n, s, bs, vx, vy, nr, nc);
|
|
2078
|
+
}
|
|
760
2079
|
|
|
761
|
-
|
|
762
|
-
|
|
2080
|
+
void ggml_gemm_q5_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
|
2081
|
+
ggml_gemm_q5_K_NxM_q8_K_generic_impl<8, 8>(n, s, bs, vx, vy, nr, nc);
|
|
2082
|
+
}
|
|
763
2083
|
|
|
764
|
-
|
|
765
|
-
|
|
766
|
-
|
|
2084
|
+
void ggml_gemm_q6_K_8x4_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
|
2085
|
+
ggml_gemm_q6_K_NxM_q8_K_generic_impl<4, 8>(n, s, bs, vx, vy, nr, nc);
|
|
2086
|
+
}
|
|
767
2087
|
|
|
768
|
-
|
|
769
|
-
|
|
770
|
-
}
|
|
771
|
-
for (int l = 0; l < nb; l++) {
|
|
772
|
-
for (int k = 0; k < (qk / blocklen); k++) {
|
|
773
|
-
for (int j = 0; j < ncols_interleaved; j++) {
|
|
774
|
-
sumi = 0;
|
|
775
|
-
for (int i = 0; i < blocklen; ++i) {
|
|
776
|
-
const int v0 = b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i];
|
|
777
|
-
sumi += v0 * a_ptr[l].qs[k * blocklen + i];
|
|
778
|
-
}
|
|
779
|
-
sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
|
|
780
|
-
}
|
|
781
|
-
}
|
|
782
|
-
}
|
|
783
|
-
for (int j = 0; j < ncols_interleaved; j++) {
|
|
784
|
-
s[x * ncols_interleaved + j] = sumf[j];
|
|
785
|
-
}
|
|
786
|
-
}
|
|
2088
|
+
void ggml_gemm_q6_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
|
2089
|
+
ggml_gemm_q6_K_NxM_q8_K_generic_impl<8, 8>(n, s, bs, vx, vy, nr, nc);
|
|
787
2090
|
}
|
|
788
2091
|
|
|
789
|
-
void
|
|
2092
|
+
void ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
|
790
2093
|
const int qk = QK8_0;
|
|
791
2094
|
const int nb = n / qk;
|
|
792
2095
|
const int ncols_interleaved = 4;
|
|
@@ -813,7 +2116,7 @@ void ggml_gemm_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
|
|
|
813
2116
|
for (int y = 0; y < nr / 4; y++) {
|
|
814
2117
|
const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
|
|
815
2118
|
for (int x = 0; x < nc / ncols_interleaved; x++) {
|
|
816
|
-
const
|
|
2119
|
+
const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb);
|
|
817
2120
|
for (int m = 0; m < 4; m++) {
|
|
818
2121
|
for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
|
|
819
2122
|
}
|
|
@@ -823,10 +2126,10 @@ void ggml_gemm_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
|
|
|
823
2126
|
for (int j = 0; j < ncols_interleaved; j++) {
|
|
824
2127
|
sumi = 0;
|
|
825
2128
|
for (int i = 0; i < blocklen; ++i) {
|
|
826
|
-
const int v0 =
|
|
827
|
-
const int v1 =
|
|
2129
|
+
const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
|
|
2130
|
+
const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
|
|
828
2131
|
sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
|
|
829
|
-
(v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4]))
|
|
2132
|
+
(v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4]));
|
|
830
2133
|
}
|
|
831
2134
|
sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
|
|
832
2135
|
}
|
|
@@ -842,33 +2145,23 @@ void ggml_gemm_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
|
|
|
842
2145
|
}
|
|
843
2146
|
}
|
|
844
2147
|
|
|
845
|
-
void
|
|
2148
|
+
void ggml_gemm_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
|
846
2149
|
const int qk = QK8_0;
|
|
847
2150
|
const int nb = n / qk;
|
|
848
|
-
const int ncols_interleaved =
|
|
2151
|
+
const int ncols_interleaved = 8;
|
|
849
2152
|
const int blocklen = 8;
|
|
850
2153
|
|
|
851
|
-
assert
|
|
852
|
-
assert
|
|
853
|
-
assert
|
|
854
|
-
|
|
855
|
-
UNUSED(s);
|
|
856
|
-
UNUSED(bs);
|
|
857
|
-
UNUSED(vx);
|
|
858
|
-
UNUSED(vy);
|
|
859
|
-
UNUSED(nr);
|
|
860
|
-
UNUSED(nc);
|
|
861
|
-
UNUSED(nb);
|
|
862
|
-
UNUSED(ncols_interleaved);
|
|
863
|
-
UNUSED(blocklen);
|
|
2154
|
+
assert(n % qk == 0);
|
|
2155
|
+
assert(nr % 4 == 0);
|
|
2156
|
+
assert(nc % ncols_interleaved == 0);
|
|
864
2157
|
|
|
865
|
-
float sumf[4][
|
|
2158
|
+
float sumf[4][8];
|
|
866
2159
|
int sumi;
|
|
867
2160
|
|
|
868
2161
|
for (int y = 0; y < nr / 4; y++) {
|
|
869
2162
|
const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
|
|
870
2163
|
for (int x = 0; x < nc / ncols_interleaved; x++) {
|
|
871
|
-
const
|
|
2164
|
+
const block_iq4_nlx8 * b_ptr = (const block_iq4_nlx8 *) vx + (x * nb);
|
|
872
2165
|
for (int m = 0; m < 4; m++) {
|
|
873
2166
|
for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
|
|
874
2167
|
}
|
|
@@ -878,10 +2171,10 @@ void ggml_gemm_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
|
|
|
878
2171
|
for (int j = 0; j < ncols_interleaved; j++) {
|
|
879
2172
|
sumi = 0;
|
|
880
2173
|
for (int i = 0; i < blocklen; ++i) {
|
|
881
|
-
const int v0 =
|
|
882
|
-
const int v1 =
|
|
2174
|
+
const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
|
|
2175
|
+
const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
|
|
883
2176
|
sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
|
|
884
|
-
|
|
2177
|
+
(v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4]));
|
|
885
2178
|
}
|
|
886
2179
|
sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
|
|
887
2180
|
}
|
|
@@ -896,25 +2189,59 @@ void ggml_gemm_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
|
|
|
896
2189
|
}
|
|
897
2190
|
}
|
|
898
2191
|
|
|
899
|
-
void
|
|
2192
|
+
void ggml_gemm_mxfp4_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
|
900
2193
|
const int qk = QK8_0;
|
|
901
2194
|
const int nb = n / qk;
|
|
902
|
-
const int ncols_interleaved =
|
|
903
|
-
const int blocklen =
|
|
2195
|
+
const int ncols_interleaved = 4;
|
|
2196
|
+
const int blocklen = 4;
|
|
904
2197
|
|
|
905
|
-
assert
|
|
906
|
-
assert
|
|
907
|
-
assert
|
|
2198
|
+
assert(n % qk == 0);
|
|
2199
|
+
assert(nr % 4 == 0);
|
|
2200
|
+
assert(nc % ncols_interleaved == 0);
|
|
908
2201
|
|
|
909
|
-
|
|
910
|
-
|
|
911
|
-
|
|
912
|
-
|
|
913
|
-
|
|
914
|
-
|
|
915
|
-
|
|
916
|
-
|
|
917
|
-
|
|
2202
|
+
float sumf[4][4];
|
|
2203
|
+
int sumi;
|
|
2204
|
+
|
|
2205
|
+
for (int y = 0; y < nr / 4; y++) {
|
|
2206
|
+
const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
|
|
2207
|
+
for (int x = 0; x < nc / ncols_interleaved; x++) {
|
|
2208
|
+
const block_mxfp4x4 * b_ptr = (const block_mxfp4x4 *) vx + (x * nb);
|
|
2209
|
+
for (int m = 0; m < 4; m++) {
|
|
2210
|
+
for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
|
|
2211
|
+
}
|
|
2212
|
+
for (int l = 0; l < nb; l++) {
|
|
2213
|
+
for (int k = 0; k < (qk / (2 * blocklen)); k++) {
|
|
2214
|
+
for (int m = 0; m < 4; m++) {
|
|
2215
|
+
for (int j = 0; j < ncols_interleaved; j++) {
|
|
2216
|
+
sumi = 0;
|
|
2217
|
+
for (int i = 0; i < blocklen; ++i) {
|
|
2218
|
+
const int v0 = kvalues_mxfp4[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
|
|
2219
|
+
const int v1 = kvalues_mxfp4[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
|
|
2220
|
+
sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
|
|
2221
|
+
(v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4]));
|
|
2222
|
+
}
|
|
2223
|
+
sumf[m][j] += sumi * GGML_CPU_E8M0_TO_FP32_HALF(b_ptr[l].e[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
|
|
2224
|
+
}
|
|
2225
|
+
}
|
|
2226
|
+
}
|
|
2227
|
+
}
|
|
2228
|
+
for (int m = 0; m < 4; m++) {
|
|
2229
|
+
for (int j = 0; j < ncols_interleaved; j++)
|
|
2230
|
+
s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
|
|
2231
|
+
}
|
|
2232
|
+
}
|
|
2233
|
+
}
|
|
2234
|
+
}
|
|
2235
|
+
|
|
2236
|
+
void ggml_gemm_mxfp4_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
|
2237
|
+
const int qk = QK8_0;
|
|
2238
|
+
const int nb = n / qk;
|
|
2239
|
+
const int ncols_interleaved = 8;
|
|
2240
|
+
const int blocklen = 8;
|
|
2241
|
+
|
|
2242
|
+
assert(n % qk == 0);
|
|
2243
|
+
assert(nr % 4 == 0);
|
|
2244
|
+
assert(nc % ncols_interleaved == 0);
|
|
918
2245
|
|
|
919
2246
|
float sumf[4][8];
|
|
920
2247
|
int sumi;
|
|
@@ -922,7 +2249,7 @@ void ggml_gemm_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
|
|
|
922
2249
|
for (int y = 0; y < nr / 4; y++) {
|
|
923
2250
|
const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
|
|
924
2251
|
for (int x = 0; x < nc / ncols_interleaved; x++) {
|
|
925
|
-
const
|
|
2252
|
+
const block_mxfp4x8 * b_ptr = (const block_mxfp4x8 *) vx + (x * nb);
|
|
926
2253
|
for (int m = 0; m < 4; m++) {
|
|
927
2254
|
for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
|
|
928
2255
|
}
|
|
@@ -932,12 +2259,12 @@ void ggml_gemm_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
|
|
|
932
2259
|
for (int j = 0; j < ncols_interleaved; j++) {
|
|
933
2260
|
sumi = 0;
|
|
934
2261
|
for (int i = 0; i < blocklen; ++i) {
|
|
935
|
-
const int v0 =
|
|
936
|
-
const int v1 =
|
|
2262
|
+
const int v0 = kvalues_mxfp4[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
|
|
2263
|
+
const int v1 = kvalues_mxfp4[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
|
|
937
2264
|
sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
|
|
938
|
-
(v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4]))
|
|
2265
|
+
(v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4]));
|
|
939
2266
|
}
|
|
940
|
-
sumf[m][j] += sumi *
|
|
2267
|
+
sumf[m][j] += sumi * GGML_CPU_E8M0_TO_FP32_HALF(b_ptr[l].e[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
|
|
941
2268
|
}
|
|
942
2269
|
}
|
|
943
2270
|
}
|
|
@@ -950,183 +2277,119 @@ void ggml_gemm_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
|
|
|
950
2277
|
}
|
|
951
2278
|
}
|
|
952
2279
|
|
|
953
|
-
void
|
|
954
|
-
|
|
955
|
-
|
|
956
|
-
|
|
957
|
-
|
|
958
|
-
|
|
959
|
-
|
|
960
|
-
|
|
961
|
-
|
|
962
|
-
|
|
963
|
-
|
|
964
|
-
assert (nc % ncols_interleaved == 0);
|
|
2280
|
+
void ggml_gemm_q8_0_4x4_q8_0_generic(int n,
|
|
2281
|
+
float * GGML_RESTRICT s,
|
|
2282
|
+
size_t bs,
|
|
2283
|
+
const void * GGML_RESTRICT vx,
|
|
2284
|
+
const void * GGML_RESTRICT vy,
|
|
2285
|
+
int nr,
|
|
2286
|
+
int nc) {
|
|
2287
|
+
const int qk = QK8_0;
|
|
2288
|
+
const int nb = n / qk;
|
|
2289
|
+
const int ncols_interleaved = 4;
|
|
2290
|
+
const int blocklen = 4;
|
|
965
2291
|
|
|
966
|
-
|
|
967
|
-
|
|
968
|
-
|
|
2292
|
+
assert(n % qk == 0);
|
|
2293
|
+
assert(nr % 4 == 0);
|
|
2294
|
+
assert(nc % ncols_interleaved == 0);
|
|
969
2295
|
|
|
970
|
-
float sumf[4][
|
|
971
|
-
|
|
972
|
-
uint32_t utmp[32];
|
|
973
|
-
int sumi1;
|
|
974
|
-
int sumi2;
|
|
975
|
-
int sumi;
|
|
2296
|
+
float sumf[4][4];
|
|
2297
|
+
int sumi;
|
|
976
2298
|
|
|
977
2299
|
for (int y = 0; y < nr / 4; y++) {
|
|
978
|
-
const
|
|
2300
|
+
const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
|
|
979
2301
|
for (int x = 0; x < nc / ncols_interleaved; x++) {
|
|
980
|
-
const
|
|
2302
|
+
const block_q8_0x4 * b_ptr = (const block_q8_0x4 *) vx + (x * nb);
|
|
981
2303
|
for (int m = 0; m < 4; m++) {
|
|
982
2304
|
for (int j = 0; j < ncols_interleaved; j++) {
|
|
983
2305
|
sumf[m][j] = 0.0;
|
|
984
|
-
sum_minf[m][j] = 0.0;
|
|
985
2306
|
}
|
|
986
2307
|
}
|
|
987
2308
|
for (int l = 0; l < nb; l++) {
|
|
988
|
-
for (int
|
|
989
|
-
memcpy(utmp + sb * 4, b_ptr[l].scales + sb * 12, 12);
|
|
990
|
-
utmp[sb * 4 + 3] = ((utmp[sb * 4 + 2] >> 4) & kmask2) | (((utmp[sb * 4 + 1] >> 6) & kmask3) << 4);
|
|
991
|
-
const uint32_t uaux_0 = utmp[sb * 4 + 1] & kmask1;
|
|
992
|
-
utmp[sb * 4 + 1] = (utmp[sb * 4 + 2] & kmask2) | (((utmp[sb * 4 + 0] >> 6) & kmask3) << 4);
|
|
993
|
-
utmp[sb * 4 + 2] = uaux_0;
|
|
994
|
-
utmp[sb * 4 + 0] &= kmask1;
|
|
995
|
-
}
|
|
996
|
-
for (int k = 0; k < (qk / (2 * blocklen)); k++) {
|
|
997
|
-
uint8_t * scales_0 = (uint8_t *) utmp + (k / 8) * 32;
|
|
998
|
-
uint8_t * scales_1 = (uint8_t *) utmp + (k / 8) * 32 + 16;
|
|
2309
|
+
for (int k = 0; k < (qk / blocklen); k++) {
|
|
999
2310
|
for (int m = 0; m < 4; m++) {
|
|
1000
2311
|
for (int j = 0; j < ncols_interleaved; j++) {
|
|
1001
|
-
sumi1 = 0;
|
|
1002
|
-
sumi2 = 0;
|
|
1003
2312
|
sumi = 0;
|
|
1004
2313
|
for (int i = 0; i < blocklen; ++i) {
|
|
1005
|
-
const int v0 =
|
|
1006
|
-
|
|
1007
|
-
sumi1 = (v0 * a_ptr[l].qs[(k / 8) * 256 + (k % 8) * 4 * blocklen + m * blocklen + i]);
|
|
1008
|
-
sumi2 = (v1 * a_ptr[l].qs[(k / 8) * 256 + (k % 8) * 4 * blocklen + m * blocklen + i + 128]);
|
|
1009
|
-
sumi1 = sumi1 * scales_0[j];
|
|
1010
|
-
sumi2 = sumi2 * scales_1[j];
|
|
1011
|
-
sumi += sumi1 + sumi2;
|
|
2314
|
+
const int v0 = b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i];
|
|
2315
|
+
sumi += v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i];
|
|
1012
2316
|
}
|
|
1013
|
-
sumf[m][j] +=
|
|
1014
|
-
|
|
1015
|
-
}
|
|
1016
|
-
}
|
|
1017
|
-
for (int sb = 0; sb < 8; sb++) {
|
|
1018
|
-
uint8_t * mins = (uint8_t *) utmp + 8 + sb * 16;
|
|
1019
|
-
for(int m = 0; m < 4; m++) {
|
|
1020
|
-
const int16_t * bsums = a_ptr[l].bsums + (sb * 8) + (m * 4) - ((sb % 2) * 6);
|
|
1021
|
-
for(int j = 0; j < ncols_interleaved; j++) {
|
|
1022
|
-
sum_minf[m][j] += mins[j] * (bsums[0] + bsums[1]) * GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d[m];
|
|
2317
|
+
sumf[m][j] +=
|
|
2318
|
+
sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
|
|
1023
2319
|
}
|
|
1024
2320
|
}
|
|
1025
2321
|
}
|
|
1026
2322
|
}
|
|
1027
2323
|
for (int m = 0; m < 4; m++) {
|
|
1028
2324
|
for (int j = 0; j < ncols_interleaved; j++) {
|
|
1029
|
-
s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j]
|
|
2325
|
+
s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
|
|
1030
2326
|
}
|
|
1031
2327
|
}
|
|
1032
2328
|
}
|
|
1033
2329
|
}
|
|
1034
2330
|
}
|
|
1035
2331
|
|
|
1036
|
-
void ggml_gemm_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
|
1037
|
-
const int qk = QK_K;
|
|
1038
|
-
const int nb = n / qk;
|
|
1039
|
-
const int ncols_interleaved = 8;
|
|
1040
|
-
const int blocklen = 8;
|
|
1041
|
-
static const uint32_t kmask1 = 0x3f3f3f3f;
|
|
1042
|
-
static const uint32_t kmask2 = 0x0f0f0f0f;
|
|
1043
|
-
static const uint32_t kmask3 = 0x03030303;
|
|
1044
2332
|
|
|
1045
|
-
assert (n % qk == 0);
|
|
1046
|
-
assert (nr % 4 == 0);
|
|
1047
|
-
assert (nc % ncols_interleaved == 0);
|
|
1048
2333
|
|
|
1049
|
-
|
|
1050
|
-
|
|
1051
|
-
|
|
1052
|
-
|
|
1053
|
-
|
|
1054
|
-
|
|
1055
|
-
|
|
1056
|
-
|
|
1057
|
-
|
|
2334
|
+
void ggml_gemm_q8_0_4x8_q8_0_generic(int n,
|
|
2335
|
+
float * GGML_RESTRICT s,
|
|
2336
|
+
size_t bs,
|
|
2337
|
+
const void * GGML_RESTRICT vx,
|
|
2338
|
+
const void * GGML_RESTRICT vy,
|
|
2339
|
+
int nr,
|
|
2340
|
+
int nc) {
|
|
2341
|
+
const int qk = QK8_0;
|
|
2342
|
+
const int nb = n / qk;
|
|
2343
|
+
const int ncols_interleaved = 4;
|
|
2344
|
+
const int blocklen = 8;
|
|
1058
2345
|
|
|
1059
|
-
|
|
1060
|
-
|
|
1061
|
-
|
|
1062
|
-
|
|
1063
|
-
|
|
1064
|
-
int
|
|
2346
|
+
assert(n % qk == 0);
|
|
2347
|
+
assert(nr % 4 == 0);
|
|
2348
|
+
assert(nc % ncols_interleaved == 0);
|
|
2349
|
+
|
|
2350
|
+
float sumf[4][4];
|
|
2351
|
+
int sumi;
|
|
1065
2352
|
|
|
1066
2353
|
for (int y = 0; y < nr / 4; y++) {
|
|
1067
|
-
const
|
|
2354
|
+
const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
|
|
1068
2355
|
for (int x = 0; x < nc / ncols_interleaved; x++) {
|
|
1069
|
-
const
|
|
2356
|
+
const block_q8_0x4 * b_ptr = (const block_q8_0x4 *) vx + (x * nb);
|
|
1070
2357
|
for (int m = 0; m < 4; m++) {
|
|
1071
2358
|
for (int j = 0; j < ncols_interleaved; j++) {
|
|
1072
2359
|
sumf[m][j] = 0.0;
|
|
1073
|
-
sum_minf[m][j] = 0.0;
|
|
1074
2360
|
}
|
|
1075
2361
|
}
|
|
1076
2362
|
for (int l = 0; l < nb; l++) {
|
|
1077
|
-
for (int
|
|
1078
|
-
memcpy(utmp + sb * 4, b_ptr[l].scales + sb * 12, 12);
|
|
1079
|
-
utmp[sb * 4 + 3] = ((utmp[sb * 4 + 2] >> 4) & kmask2) | (((utmp[sb * 4 + 1] >> 6) & kmask3) << 4);
|
|
1080
|
-
const uint32_t uaux_0 = utmp[sb * 4 + 1] & kmask1;
|
|
1081
|
-
utmp[sb * 4 + 1] = (utmp[sb * 4 + 2] & kmask2) | (((utmp[sb * 4 + 0] >> 6) & kmask3) << 4);
|
|
1082
|
-
utmp[sb * 4 + 2] = uaux_0;
|
|
1083
|
-
utmp[sb * 4 + 0] &= kmask1;
|
|
1084
|
-
}
|
|
1085
|
-
for (int k = 0; k < (qk / (2 * blocklen)); k++) {
|
|
1086
|
-
uint8_t *scales_0 = (uint8_t*) utmp + (k / 4) * 32;
|
|
1087
|
-
uint8_t *scales_1 = (uint8_t*) utmp + (k / 4) * 32 + 16;
|
|
2363
|
+
for (int k = 0; k < (qk / blocklen); k++) {
|
|
1088
2364
|
for (int m = 0; m < 4; m++) {
|
|
1089
2365
|
for (int j = 0; j < ncols_interleaved; j++) {
|
|
1090
|
-
sumi1 = 0;
|
|
1091
|
-
sumi2 = 0;
|
|
1092
2366
|
sumi = 0;
|
|
1093
2367
|
for (int i = 0; i < blocklen; ++i) {
|
|
1094
|
-
const int v0 =
|
|
1095
|
-
|
|
1096
|
-
sumi1 = (v0 * a_ptr[l].qs[(k >> 2) * 256 + (k % 4) * 4 * blocklen + m * blocklen + i]);
|
|
1097
|
-
sumi2 = (v1 * a_ptr[l].qs[(k >> 2) * 256 + (k % 4) * 4 * blocklen + m * blocklen + i + 128]);
|
|
1098
|
-
sumi1 = sumi1 * scales_0[j];
|
|
1099
|
-
sumi2 = sumi2 * scales_1[j];
|
|
1100
|
-
sumi += sumi1 + sumi2;
|
|
2368
|
+
const int v0 = b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i];
|
|
2369
|
+
sumi += v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i];
|
|
1101
2370
|
}
|
|
1102
|
-
sumf[m][j] +=
|
|
1103
|
-
|
|
1104
|
-
}
|
|
1105
|
-
}
|
|
1106
|
-
for (int sb = 0; sb < 8; sb++) {
|
|
1107
|
-
uint8_t *mins = (uint8_t*) utmp + 8 + sb * 16;
|
|
1108
|
-
for(int m = 0; m < 4; m++) {
|
|
1109
|
-
const int16_t *bsums = a_ptr[l].bsums + (sb * 8) + (m * 4) - ((sb % 2) * 6);
|
|
1110
|
-
for(int j = 0; j < ncols_interleaved; j++) {
|
|
1111
|
-
sum_minf[m][j] += mins[j] * (bsums[0] + bsums[1]) * GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d[m];
|
|
2371
|
+
sumf[m][j] +=
|
|
2372
|
+
sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
|
|
1112
2373
|
}
|
|
1113
2374
|
}
|
|
1114
2375
|
}
|
|
1115
2376
|
}
|
|
1116
2377
|
for (int m = 0; m < 4; m++) {
|
|
1117
2378
|
for (int j = 0; j < ncols_interleaved; j++) {
|
|
1118
|
-
s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j]
|
|
2379
|
+
s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
|
|
1119
2380
|
}
|
|
1120
2381
|
}
|
|
1121
2382
|
}
|
|
1122
2383
|
}
|
|
1123
2384
|
}
|
|
1124
2385
|
|
|
1125
|
-
|
|
1126
|
-
|
|
2386
|
+
// Only enable these for RISC-V.
|
|
2387
|
+
#if defined __riscv_zvfh
|
|
2388
|
+
void ggml_gemm_q4_0_16x1_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
|
2389
|
+
const int qk = QK8_0;
|
|
1127
2390
|
const int nb = n / qk;
|
|
1128
|
-
const int ncols_interleaved =
|
|
1129
|
-
const int blocklen =
|
|
2391
|
+
const int ncols_interleaved = 16;
|
|
2392
|
+
const int blocklen = 1;
|
|
1130
2393
|
|
|
1131
2394
|
assert (n % qk == 0);
|
|
1132
2395
|
assert (nr % 4 == 0);
|
|
@@ -1142,82 +2405,45 @@ void ggml_gemm_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
|
|
|
1142
2405
|
UNUSED(ncols_interleaved);
|
|
1143
2406
|
UNUSED(blocklen);
|
|
1144
2407
|
|
|
1145
|
-
float sumf[4][
|
|
1146
|
-
float sum_minf[4][8];
|
|
1147
|
-
int sumi1, sumi2, sumi3, sumi4;
|
|
2408
|
+
float sumf[4][16];
|
|
1148
2409
|
int sumi;
|
|
1149
2410
|
|
|
1150
2411
|
for (int y = 0; y < nr / 4; y++) {
|
|
1151
|
-
const
|
|
2412
|
+
const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
|
|
1152
2413
|
for (int x = 0; x < nc / ncols_interleaved; x++) {
|
|
1153
|
-
const
|
|
2414
|
+
const block_q4_0x16 * b_ptr = (const block_q4_0x16 *) vx + (x * nb);
|
|
1154
2415
|
for (int m = 0; m < 4; m++) {
|
|
1155
|
-
for (int j = 0; j < ncols_interleaved; j++)
|
|
1156
|
-
sumf[m][j] = 0.0;
|
|
1157
|
-
sum_minf[m][j] = 0.0;
|
|
1158
|
-
}
|
|
2416
|
+
for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
|
|
1159
2417
|
}
|
|
1160
2418
|
for (int l = 0; l < nb; l++) {
|
|
1161
|
-
for (int k = 0; k < (qk / (
|
|
1162
|
-
|
|
1163
|
-
const uint8_t *scales_0 = b_ptr[l].scales + (k / 4) * 64 ;
|
|
1164
|
-
const uint8_t *scales_1 = b_ptr[l].scales + (k / 4) * 64 + 16;
|
|
1165
|
-
const uint8_t *scales_2 = b_ptr[l].scales + (k / 4) * 64 + 32;
|
|
1166
|
-
const uint8_t *scales_3 = b_ptr[l].scales + (k / 4) * 64 + 48;
|
|
2419
|
+
for (int k = 0; k < (qk / (2 * blocklen)); k++) {
|
|
1167
2420
|
for (int m = 0; m < 4; m++) {
|
|
1168
2421
|
for (int j = 0; j < ncols_interleaved; j++) {
|
|
1169
|
-
sumi1 = 0;
|
|
1170
|
-
sumi2 = 0;
|
|
1171
|
-
sumi3 = 0;
|
|
1172
|
-
sumi4 = 0;
|
|
1173
2422
|
sumi = 0;
|
|
1174
|
-
int
|
|
1175
|
-
|
|
1176
|
-
const int
|
|
1177
|
-
|
|
1178
|
-
|
|
1179
|
-
const int v3 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 6 ) & 3);
|
|
1180
|
-
sumi1 = (v0 * a_ptr[l].qs[(k >> 2) * 512 + (k % 4) * 4 * blocklen + m * blocklen + i]);
|
|
1181
|
-
sumi2 = (v1 * a_ptr[l].qs[(k >> 2) * 512 + (k % 4) * 4 * blocklen + m * blocklen + i + 128]);
|
|
1182
|
-
sumi3 = (v2 * a_ptr[l].qs[(k >> 2) * 512 + (k % 4) * 4 * blocklen + m * blocklen + i + 256]);
|
|
1183
|
-
sumi4 = (v3 * a_ptr[l].qs[(k >> 2) * 512 + (k % 4) * 4 * blocklen + m * blocklen + i + 384]);
|
|
1184
|
-
sumi1 = sumi1 * (scales_0[offset] & 0xF);
|
|
1185
|
-
sumi2 = sumi2 * (scales_1[offset] & 0xF);
|
|
1186
|
-
sumi3 = sumi3 * (scales_2[offset] & 0xF);
|
|
1187
|
-
sumi4 = sumi4 * (scales_3[offset] & 0xF);
|
|
1188
|
-
sumi += sumi1 + sumi2 + sumi3 + sumi4;
|
|
2423
|
+
for (int i = 0; i < blocklen; ++i) {
|
|
2424
|
+
const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
|
|
2425
|
+
const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
|
|
2426
|
+
sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
|
|
2427
|
+
(v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
|
|
1189
2428
|
}
|
|
1190
|
-
sumf[m][j] += sumi *
|
|
1191
|
-
}
|
|
1192
|
-
}
|
|
1193
|
-
}
|
|
1194
|
-
for(int sb = 0; sb < 8; sb++) {
|
|
1195
|
-
const uint8_t *mins = b_ptr[l].scales + sb * 16;
|
|
1196
|
-
for(int m = 0; m < 4; m++) {
|
|
1197
|
-
const int16_t *bsums = a_ptr[l].bsums + (sb * 8) + (m * 4) - ((sb % 2) * 6);
|
|
1198
|
-
for(int j = 0; j < ncols_interleaved; j++) {
|
|
1199
|
-
int mins_prod = ((mins[j * 2] >> 4) * bsums[0] + (mins[(j * 2)+ 1] >> 4) * bsums[1]);
|
|
1200
|
-
sum_minf[m][j] += (mins_prod) * GGML_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d[m];
|
|
2429
|
+
sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
|
|
1201
2430
|
}
|
|
1202
2431
|
}
|
|
1203
2432
|
}
|
|
1204
2433
|
}
|
|
1205
|
-
|
|
1206
2434
|
for (int m = 0; m < 4; m++) {
|
|
1207
|
-
for (int j = 0; j < ncols_interleaved; j++)
|
|
1208
|
-
s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j]
|
|
1209
|
-
}
|
|
2435
|
+
for (int j = 0; j < ncols_interleaved; j++)
|
|
2436
|
+
s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
|
|
1210
2437
|
}
|
|
1211
2438
|
}
|
|
1212
2439
|
}
|
|
1213
2440
|
}
|
|
1214
2441
|
|
|
1215
|
-
|
|
1216
|
-
|
|
1217
|
-
const int qk = QK8_0;
|
|
2442
|
+
void ggml_gemm_q4_K_16x1_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
|
2443
|
+
const int qk = QK_K;
|
|
1218
2444
|
const int nb = n / qk;
|
|
1219
|
-
const int ncols_interleaved =
|
|
1220
|
-
const int blocklen =
|
|
2445
|
+
const int ncols_interleaved = 16;
|
|
2446
|
+
const int blocklen = 1;
|
|
1221
2447
|
|
|
1222
2448
|
assert (n % qk == 0);
|
|
1223
2449
|
assert (nr % 4 == 0);
|
|
@@ -1233,59 +2459,97 @@ void ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs
|
|
|
1233
2459
|
UNUSED(ncols_interleaved);
|
|
1234
2460
|
UNUSED(blocklen);
|
|
1235
2461
|
|
|
1236
|
-
|
|
1237
|
-
|
|
1238
|
-
|
|
2462
|
+
float sumf[4][16];
|
|
2463
|
+
float sum_minf[4][16];
|
|
2464
|
+
uint8_t scales[128];
|
|
2465
|
+
uint8_t mins[128];
|
|
2466
|
+
int sumi1;
|
|
2467
|
+
int sumi2;
|
|
2468
|
+
int sumi;
|
|
2469
|
+
|
|
2470
|
+
for (int y = 0; y < nr / 4; y++) {
|
|
2471
|
+
const block_q8_Kx4 * a_ptr = (const block_q8_Kx4 *) vy + (y * nb);
|
|
2472
|
+
for (int x = 0; x < nc / ncols_interleaved; x++) {
|
|
2473
|
+
const block_q4_Kx16 * b_ptr = (const block_q4_Kx16 *) vx + (x * nb);
|
|
2474
|
+
for (int m = 0; m < 4; m++) {
|
|
2475
|
+
for (int j = 0; j < ncols_interleaved; j++) {
|
|
2476
|
+
sumf[m][j] = 0.0;
|
|
2477
|
+
sum_minf[m][j] = 0.0;
|
|
2478
|
+
}
|
|
2479
|
+
}
|
|
2480
|
+
for (int l = 0; l < nb; l++) {
|
|
2481
|
+
for (int i = 0; i < 128; i++) {
|
|
2482
|
+
scales[i] = b_ptr[l].scales[i] & 0x0F;
|
|
2483
|
+
mins[i] = b_ptr[l].scales[i] >> 4;
|
|
2484
|
+
}
|
|
2485
|
+
for (int i = 0; i < 64; i++) {
|
|
2486
|
+
scales[i] |= (b_ptr[l].scales[128 + i] & 0x03) << 4;
|
|
2487
|
+
mins[i] |= (b_ptr[l].scales[128 + i] & 0x0C) << 2;
|
|
2488
|
+
scales[i + 64] |= (b_ptr[l].scales[128 + i] & 0x30);
|
|
2489
|
+
mins[i + 64] |= (b_ptr[l].scales[128 + i] & 0xC0) >> 2;
|
|
2490
|
+
}
|
|
1239
2491
|
|
|
1240
|
-
|
|
1241
|
-
|
|
1242
|
-
|
|
1243
|
-
|
|
1244
|
-
|
|
1245
|
-
|
|
2492
|
+
for (int sb = 0; sb < 8; sb++) {
|
|
2493
|
+
uint8_t *min = &mins[sb * 16];
|
|
2494
|
+
for(int m = 0; m < 4; m++) {
|
|
2495
|
+
const int16_t bsums = a_ptr[l].bsums[sb * 8 + m] + a_ptr[l].bsums[sb * 8 + m + 4];
|
|
2496
|
+
for(int j = 0; j < ncols_interleaved; j++) {
|
|
2497
|
+
sum_minf[m][j] += min[j] * bsums * GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d[m];
|
|
2498
|
+
}
|
|
2499
|
+
}
|
|
1246
2500
|
}
|
|
1247
|
-
|
|
1248
|
-
|
|
2501
|
+
|
|
2502
|
+
for (int sb = 0; sb < 8; sb += 2) {
|
|
2503
|
+
uint8_t *scales_0 = &scales[sb * 16];
|
|
2504
|
+
uint8_t *scales_1 = &scales[(sb + 1) * 16];
|
|
2505
|
+
|
|
2506
|
+
for (int i = 0; i < QK4_0; i++) {
|
|
1249
2507
|
for (int m = 0; m < 4; m++) {
|
|
1250
2508
|
for (int j = 0; j < ncols_interleaved; j++) {
|
|
2509
|
+
sumi1 = 0;
|
|
2510
|
+
sumi2 = 0;
|
|
1251
2511
|
sumi = 0;
|
|
1252
|
-
|
|
1253
|
-
|
|
1254
|
-
|
|
1255
|
-
|
|
1256
|
-
|
|
1257
|
-
|
|
1258
|
-
|
|
2512
|
+
|
|
2513
|
+
const int v0 = (int8_t) (b_ptr[l].qs[sb * 256 + i * 16 + j] & 0xF);
|
|
2514
|
+
const int v1 = (int8_t) (b_ptr[l].qs[sb * 256 + i * 16 + j] >> 4);
|
|
2515
|
+
sumi1 = (v0 * a_ptr[l].qs[sb * 4 * 32 + i * 4 + m]);
|
|
2516
|
+
sumi2 = (v1 * a_ptr[l].qs[sb * 4 * 32 + 32 * 4 + i * 4 + m]);
|
|
2517
|
+
sumi1 = sumi1 * scales_0[j];
|
|
2518
|
+
sumi2 = sumi2 * scales_1[j];
|
|
2519
|
+
sumi += sumi1 + sumi2;
|
|
2520
|
+
|
|
2521
|
+
sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d[m];
|
|
1259
2522
|
}
|
|
1260
2523
|
}
|
|
1261
2524
|
}
|
|
1262
2525
|
}
|
|
1263
|
-
|
|
1264
|
-
|
|
1265
|
-
|
|
2526
|
+
}
|
|
2527
|
+
for (int m = 0; m < 4; m++) {
|
|
2528
|
+
for (int j = 0; j < ncols_interleaved; j++) {
|
|
2529
|
+
s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j] - sum_minf[m][j];
|
|
1266
2530
|
}
|
|
1267
2531
|
}
|
|
1268
2532
|
}
|
|
1269
2533
|
}
|
|
1270
2534
|
}
|
|
1271
2535
|
|
|
1272
|
-
void
|
|
2536
|
+
void ggml_gemm_iq4_nl_16x1_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
|
1273
2537
|
const int qk = QK8_0;
|
|
1274
2538
|
const int nb = n / qk;
|
|
1275
|
-
const int ncols_interleaved =
|
|
1276
|
-
const int blocklen =
|
|
2539
|
+
const int ncols_interleaved = 16;
|
|
2540
|
+
const int blocklen = 1;
|
|
1277
2541
|
|
|
1278
2542
|
assert(n % qk == 0);
|
|
1279
2543
|
assert(nr % 4 == 0);
|
|
1280
2544
|
assert(nc % ncols_interleaved == 0);
|
|
1281
2545
|
|
|
1282
|
-
float sumf[4][
|
|
2546
|
+
float sumf[4][16];
|
|
1283
2547
|
int sumi;
|
|
1284
2548
|
|
|
1285
2549
|
for (int y = 0; y < nr / 4; y++) {
|
|
1286
2550
|
const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
|
|
1287
2551
|
for (int x = 0; x < nc / ncols_interleaved; x++) {
|
|
1288
|
-
const
|
|
2552
|
+
const block_iq4_nlx16 * b_ptr = (const block_iq4_nlx16 *) vx + (x * nb);
|
|
1289
2553
|
for (int m = 0; m < 4; m++) {
|
|
1290
2554
|
for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
|
|
1291
2555
|
}
|
|
@@ -1298,7 +2562,7 @@ void ggml_gemm_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs
|
|
|
1298
2562
|
const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
|
|
1299
2563
|
const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
|
|
1300
2564
|
sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
|
|
1301
|
-
(v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4]));
|
|
2565
|
+
(v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + (qk / 2) * 4]));
|
|
1302
2566
|
}
|
|
1303
2567
|
sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
|
|
1304
2568
|
}
|
|
@@ -1313,29 +2577,23 @@ void ggml_gemm_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs
|
|
|
1313
2577
|
}
|
|
1314
2578
|
}
|
|
1315
2579
|
|
|
1316
|
-
void
|
|
1317
|
-
float * GGML_RESTRICT s,
|
|
1318
|
-
size_t bs,
|
|
1319
|
-
const void * GGML_RESTRICT vx,
|
|
1320
|
-
const void * GGML_RESTRICT vy,
|
|
1321
|
-
int nr,
|
|
1322
|
-
int nc) {
|
|
2580
|
+
void ggml_gemm_q8_0_16x1_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
|
1323
2581
|
const int qk = QK8_0;
|
|
1324
2582
|
const int nb = n / qk;
|
|
1325
|
-
const int ncols_interleaved =
|
|
1326
|
-
const int blocklen =
|
|
2583
|
+
const int ncols_interleaved = 16;
|
|
2584
|
+
const int blocklen = 1;
|
|
1327
2585
|
|
|
1328
2586
|
assert(n % qk == 0);
|
|
1329
2587
|
assert(nr % 4 == 0);
|
|
1330
2588
|
assert(nc % ncols_interleaved == 0);
|
|
1331
2589
|
|
|
1332
|
-
float sumf[4][
|
|
2590
|
+
float sumf[4][16];
|
|
1333
2591
|
int sumi;
|
|
1334
2592
|
|
|
1335
2593
|
for (int y = 0; y < nr / 4; y++) {
|
|
1336
2594
|
const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
|
|
1337
2595
|
for (int x = 0; x < nc / ncols_interleaved; x++) {
|
|
1338
|
-
const
|
|
2596
|
+
const block_q8_0x16 * b_ptr = (const block_q8_0x16 *) vx + (x * nb);
|
|
1339
2597
|
for (int m = 0; m < 4; m++) {
|
|
1340
2598
|
for (int j = 0; j < ncols_interleaved; j++) {
|
|
1341
2599
|
sumf[m][j] = 0.0;
|
|
@@ -1365,57 +2623,102 @@ void ggml_gemm_q8_0_4x4_q8_0_generic(int n,
|
|
|
1365
2623
|
}
|
|
1366
2624
|
}
|
|
1367
2625
|
|
|
1368
|
-
void ggml_gemm_q8_0_4x8_q8_0_generic(int n,
|
|
1369
|
-
float * GGML_RESTRICT s,
|
|
1370
|
-
size_t bs,
|
|
1371
|
-
const void * GGML_RESTRICT vx,
|
|
1372
|
-
const void * GGML_RESTRICT vy,
|
|
1373
|
-
int nr,
|
|
1374
|
-
int nc) {
|
|
1375
|
-
const int qk = QK8_0;
|
|
1376
|
-
const int nb = n / qk;
|
|
1377
|
-
const int ncols_interleaved = 4;
|
|
1378
|
-
const int blocklen = 8;
|
|
1379
2626
|
|
|
1380
|
-
|
|
2627
|
+
void ggml_gemm_q2_K_16x1_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
|
2628
|
+
assert(n % QK_K == 0);
|
|
1381
2629
|
assert(nr % 4 == 0);
|
|
1382
|
-
assert(nc %
|
|
2630
|
+
assert(nc % 16 == 0);
|
|
2631
|
+
const int nb = n / QK_K;
|
|
2632
|
+
const block_q2_Kx16 * x = (const block_q2_Kx16 *)vx;
|
|
2633
|
+
const block_q8_Kx4 * y = (const block_q8_Kx4 *)vy;
|
|
2634
|
+
|
|
2635
|
+
const int sb_perm[16] = {
|
|
2636
|
+
0, 4, 1, 5, 2, 6, 3, 7,
|
|
2637
|
+
8, 12, 9, 13, 10, 14, 11, 15
|
|
2638
|
+
};
|
|
1383
2639
|
|
|
1384
|
-
|
|
1385
|
-
int
|
|
2640
|
+
// Iterate Rows in tiles of 4
|
|
2641
|
+
for (int row_tile = 0; row_tile < nr; row_tile += 4) {
|
|
2642
|
+
// Iterate Columns in tiles of 16
|
|
2643
|
+
for (int col_tile = 0; col_tile < nc; col_tile += 16) {
|
|
2644
|
+
|
|
2645
|
+
const block_q2_Kx16 * x_ptr = x + (col_tile / 16) * nb;
|
|
2646
|
+
const block_q8_Kx4 * y_ptr = y + (row_tile / 4) * nb;
|
|
2647
|
+
|
|
2648
|
+
float sumf[4][16];
|
|
2649
|
+
memset(sumf, 0, sizeof(sumf));
|
|
2650
|
+
|
|
2651
|
+
for (int k_block = 0; k_block < nb; ++k_block) {
|
|
2652
|
+
int32_t isum[4][16];
|
|
2653
|
+
int32_t summs[4][16];
|
|
2654
|
+
memset(isum, 0, sizeof(isum));
|
|
2655
|
+
memset(summs, 0, sizeof(summs));
|
|
2656
|
+
|
|
2657
|
+
const uint8_t * qs_rhs = x_ptr[k_block].qs;
|
|
2658
|
+
const uint8_t * sc_rhs = x_ptr[k_block].scales;
|
|
2659
|
+
const int8_t * qs_lhs = y_ptr[k_block].qs;
|
|
2660
|
+
const int16_t * bs_lhs = y_ptr[k_block].bsums;
|
|
2661
|
+
|
|
2662
|
+
for (int sb = 0; sb < 16; ++sb) {
|
|
2663
|
+
int scale_offset = sb_perm[sb] * 16;
|
|
2664
|
+
|
|
2665
|
+
int byte_base;
|
|
2666
|
+
if (sb < 8) byte_base = (sb % 2 == 0) ? 0 : 16;
|
|
2667
|
+
else byte_base = (sb % 2 == 0) ? 32 : 48;
|
|
2668
|
+
int shift = ((sb / 2) % 4) * 2;
|
|
2669
|
+
|
|
2670
|
+
for (int col = 0; col < 16; ++col) {
|
|
2671
|
+
uint8_t sc_val = sc_rhs[scale_offset + col];
|
|
2672
|
+
int32_t d_sb = sc_val & 0xF;
|
|
2673
|
+
int32_t m_sb = sc_val >> 4;
|
|
2674
|
+
|
|
2675
|
+
// Correction Term
|
|
2676
|
+
for (int r = 0; r < 4; ++r) {
|
|
2677
|
+
int bsum_idx = (sb / 4) * 16 + r * 4 + (sb % 4);
|
|
2678
|
+
summs[r][col] += bs_lhs[bsum_idx] * m_sb;
|
|
2679
|
+
}
|
|
1386
2680
|
|
|
1387
|
-
|
|
1388
|
-
|
|
1389
|
-
|
|
1390
|
-
|
|
1391
|
-
|
|
1392
|
-
|
|
1393
|
-
|
|
1394
|
-
|
|
1395
|
-
|
|
1396
|
-
|
|
1397
|
-
|
|
1398
|
-
|
|
1399
|
-
|
|
1400
|
-
sumi = 0;
|
|
1401
|
-
for (int i = 0; i < blocklen; ++i) {
|
|
1402
|
-
const int v0 = b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i];
|
|
1403
|
-
sumi += v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i];
|
|
2681
|
+
// Main Dot Product
|
|
2682
|
+
for (int l = 0; l < 16; ++l) {
|
|
2683
|
+
int qs_idx = (byte_base + l) * 16 + col;
|
|
2684
|
+
uint8_t q2_val = (qs_rhs[qs_idx] >> shift) & 3;
|
|
2685
|
+
|
|
2686
|
+
// Calculate Q8 index for this specific k and row
|
|
2687
|
+
int k = sb * 16 + l;
|
|
2688
|
+
int q8_idx = (k / 4) * 16 + (k % 4);
|
|
2689
|
+
|
|
2690
|
+
for (int r = 0; r < 4; ++r) {
|
|
2691
|
+
// Add r*4 to jump to the correct row within the 4x4 chunk
|
|
2692
|
+
int8_t q8_val = qs_lhs[q8_idx + r * 4];
|
|
2693
|
+
isum[r][col] += q8_val * q2_val * d_sb;
|
|
1404
2694
|
}
|
|
1405
|
-
sumf[m][j] +=
|
|
1406
|
-
sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
|
|
1407
2695
|
}
|
|
1408
2696
|
}
|
|
1409
2697
|
}
|
|
2698
|
+
|
|
2699
|
+
// Finalize K-Block
|
|
2700
|
+
for (int col = 0; col < 16; ++col) {
|
|
2701
|
+
float d_rhs = GGML_FP16_TO_FP32(x_ptr[k_block].d[col]);
|
|
2702
|
+
float dm_rhs = GGML_FP16_TO_FP32(x_ptr[k_block].dmin[col]);
|
|
2703
|
+
|
|
2704
|
+
for (int r = 0; r < 4; ++r) {
|
|
2705
|
+
float d_lhs = y_ptr[k_block].d[r];
|
|
2706
|
+
float d_all = d_lhs * d_rhs;
|
|
2707
|
+
float d_min = d_lhs * dm_rhs;
|
|
2708
|
+
sumf[r][col] += (isum[r][col] * d_all) - (summs[r][col] * d_min);
|
|
2709
|
+
}
|
|
2710
|
+
}
|
|
1410
2711
|
}
|
|
1411
|
-
|
|
1412
|
-
|
|
1413
|
-
|
|
2712
|
+
|
|
2713
|
+
for (int r = 0; r < 4; ++r) {
|
|
2714
|
+
for (int col = 0; col < 16; ++col) {
|
|
2715
|
+
s[(row_tile + r) * bs + (col_tile + col)] = sumf[r][col];
|
|
1414
2716
|
}
|
|
1415
2717
|
}
|
|
1416
2718
|
}
|
|
1417
2719
|
}
|
|
1418
2720
|
}
|
|
2721
|
+
#endif
|
|
1419
2722
|
|
|
1420
2723
|
} // extern "C"
|
|
1421
2724
|
|
|
@@ -1498,16 +2801,212 @@ static block_q4_0x8 make_block_q4_0x8(block_q4_0 * in, unsigned int blck_size_in
|
|
|
1498
2801
|
|
|
1499
2802
|
uint64_t elems;
|
|
1500
2803
|
memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t));
|
|
1501
|
-
elems ^= xor_mask;
|
|
2804
|
+
elems ^= xor_mask;
|
|
2805
|
+
memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t));
|
|
2806
|
+
}
|
|
2807
|
+
|
|
2808
|
+
return out;
|
|
2809
|
+
}
|
|
2810
|
+
|
|
2811
|
+
static block_q4_0x16 make_block_q4_0x16(block_q4_0 * in, unsigned int blck_size_interleave) {
|
|
2812
|
+
block_q4_0x16 out;
|
|
2813
|
+
|
|
2814
|
+
for (int i = 0; i < 16; i++) {
|
|
2815
|
+
out.d[i] = in[i].d;
|
|
2816
|
+
}
|
|
2817
|
+
|
|
2818
|
+
const int end = QK4_0 * 8 / blck_size_interleave;
|
|
2819
|
+
|
|
2820
|
+
if (blck_size_interleave == 1) {
|
|
2821
|
+
const uint8_t xor_mask = 0x88;
|
|
2822
|
+
for (int i = 0; i < end; ++i) {
|
|
2823
|
+
int src_id = i % 16;
|
|
2824
|
+
int src_offset = i / 16;
|
|
2825
|
+
int dst_offset = i;
|
|
2826
|
+
|
|
2827
|
+
out.qs[dst_offset] = in[src_id].qs[src_offset] ^ xor_mask;
|
|
2828
|
+
}
|
|
2829
|
+
} else {
|
|
2830
|
+
GGML_ASSERT(false);
|
|
2831
|
+
}
|
|
2832
|
+
|
|
2833
|
+
return out;
|
|
2834
|
+
}
|
|
2835
|
+
|
|
2836
|
+
static block_q4_Kx8 make_block_q4_Kx8(block_q4_K * in, unsigned int blck_size_interleave) {
|
|
2837
|
+
block_q4_Kx8 out;
|
|
2838
|
+
//Delta(scale) and dmin values of the eight Q4_K structures are copied onto the output interleaved structure
|
|
2839
|
+
for (int i = 0; i < 8; i++) {
|
|
2840
|
+
out.d[i] = in[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.d;
|
|
2841
|
+
}
|
|
2842
|
+
|
|
2843
|
+
for (int i = 0; i < 8; i++) {
|
|
2844
|
+
out.dmin[i] = in[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.dmin;
|
|
2845
|
+
}
|
|
2846
|
+
|
|
2847
|
+
const int end = QK_K * 4 / blck_size_interleave;
|
|
2848
|
+
|
|
2849
|
+
// Interleave Q4_K quants by taking 8 bytes at a time
|
|
2850
|
+
for (int i = 0; i < end; ++i) {
|
|
2851
|
+
int src_id = i % 8;
|
|
2852
|
+
int src_offset = (i / 8) * blck_size_interleave;
|
|
2853
|
+
int dst_offset = i * blck_size_interleave;
|
|
2854
|
+
|
|
2855
|
+
// buffer large enough for the max interleave block size (8 bytes)
|
|
2856
|
+
uint64_t elems;
|
|
2857
|
+
memcpy(&elems, &in[src_id].qs[src_offset], blck_size_interleave);
|
|
2858
|
+
memcpy(&out.qs[dst_offset], &elems, blck_size_interleave);
|
|
2859
|
+
}
|
|
2860
|
+
|
|
2861
|
+
// The below logic is designed so as to unpack and rearrange scales and mins values in Q4_K
|
|
2862
|
+
// Currently the Q4_K structure has 8 scales and 8 mins packed in 12 bytes ( 6 bits for each value)
|
|
2863
|
+
// The output Q4_Kx8 structure has 96 bytes
|
|
2864
|
+
// Every 12 byte is packed such that it contains scales and mins for corresponding sub blocks from Q4_K structure
|
|
2865
|
+
// For eg - First 12 bytes contains 8 scales and 8 mins - each of first sub block from different Q4_K structures
|
|
2866
|
+
uint8_t s[8], m[8];
|
|
2867
|
+
|
|
2868
|
+
for (int i = 0; i < 4; i++) {
|
|
2869
|
+
for (int j = 0; j < 8; j++) {
|
|
2870
|
+
s[j] = in[j].scales[i] & 63;
|
|
2871
|
+
m[j] = in[j].scales[i + 4] & 63;
|
|
2872
|
+
}
|
|
2873
|
+
|
|
2874
|
+
out.scales[i * 12] = (s[0] & 63) + ((s[4] & 48) << 2);
|
|
2875
|
+
out.scales[i * 12 + 1] = (s[1] & 63) + ((s[5] & 48) << 2);
|
|
2876
|
+
out.scales[i * 12 + 2] = (s[2] & 63) + ((s[6] & 48) << 2);
|
|
2877
|
+
out.scales[i * 12 + 3] = (s[3] & 63) + ((s[7] & 48) << 2);
|
|
2878
|
+
out.scales[i * 12 + 4] = (m[0] & 63) + ((m[4] & 48) << 2);
|
|
2879
|
+
out.scales[i * 12 + 5] = (m[1] & 63) + ((m[5] & 48) << 2);
|
|
2880
|
+
out.scales[i * 12 + 6] = (m[2] & 63) + ((m[6] & 48) << 2);
|
|
2881
|
+
out.scales[i * 12 + 7] = (m[3] & 63) + ((m[7] & 48) << 2);
|
|
2882
|
+
out.scales[i * 12 + 8] = (s[4] & 15) + ((m[4] & 15) << 4);
|
|
2883
|
+
out.scales[i * 12 + 9] = (s[5] & 15) + ((m[5] & 15) << 4);
|
|
2884
|
+
out.scales[i * 12 + 10] = (s[6] & 15) + ((m[6] & 15) << 4);
|
|
2885
|
+
out.scales[i * 12 + 11] = (s[7] & 15) + ((m[7] & 15) << 4);
|
|
2886
|
+
|
|
2887
|
+
}
|
|
2888
|
+
|
|
2889
|
+
for (int i = 0; i < 4; i++) {
|
|
2890
|
+
for (int j = 0; j < 8; j++) {
|
|
2891
|
+
s[j] = ((in[j].scales[i] & 192) >> 2) | (in[j].scales[i+8] & 15);
|
|
2892
|
+
m[j] = ((in[j].scales[i + 4] & 192) >> 2) | ((in[j].scales[i+8] & 240) >> 4);
|
|
2893
|
+
}
|
|
2894
|
+
|
|
2895
|
+
out.scales[i * 12 + 48] = (s[0] & 63) + ((s[4] & 48) << 2);
|
|
2896
|
+
out.scales[i * 12 + 49] = (s[1] & 63) + ((s[5] & 48) << 2);
|
|
2897
|
+
out.scales[i * 12 + 50] = (s[2] & 63) + ((s[6] & 48) << 2);
|
|
2898
|
+
out.scales[i * 12 + 51] = (s[3] & 63) + ((s[7] & 48) << 2);
|
|
2899
|
+
out.scales[i * 12 + 52] = (m[0] & 63) + ((m[4] & 48) << 2);
|
|
2900
|
+
out.scales[i * 12 + 53] = (m[1] & 63) + ((m[5] & 48) << 2);
|
|
2901
|
+
out.scales[i * 12 + 54] = (m[2] & 63) + ((m[6] & 48) << 2);
|
|
2902
|
+
out.scales[i * 12 + 55] = (m[3] & 63) + ((m[7] & 48) << 2);
|
|
2903
|
+
out.scales[i * 12 + 56] = (s[4] & 15) + ((m[4] & 15) << 4);
|
|
2904
|
+
out.scales[i * 12 + 57] = (s[5] & 15) + ((m[5] & 15) << 4);
|
|
2905
|
+
out.scales[i * 12 + 58] = (s[6] & 15) + ((m[6] & 15) << 4);
|
|
2906
|
+
out.scales[i * 12 + 59] = (s[7] & 15) + ((m[7] & 15) << 4);
|
|
2907
|
+
|
|
2908
|
+
}
|
|
2909
|
+
|
|
2910
|
+
return out;
|
|
2911
|
+
}
|
|
2912
|
+
|
|
2913
|
+
static block_q4_Kx16 make_block_q4_Kx16(block_q4_K * in, unsigned int blck_size_interleave) {
|
|
2914
|
+
block_q4_Kx16 out;
|
|
2915
|
+
//Delta(scale) and dmin values of the 16 Q4_K structures are copied onto the output interleaved structure
|
|
2916
|
+
for (int i = 0; i < 16; i++) {
|
|
2917
|
+
out.d[i] = in[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.d;
|
|
2918
|
+
}
|
|
2919
|
+
|
|
2920
|
+
for (int i = 0; i < 16; i++) {
|
|
2921
|
+
out.dmin[i] = in[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.dmin;
|
|
2922
|
+
}
|
|
2923
|
+
|
|
2924
|
+
const int end = QK_K * 8 / blck_size_interleave;
|
|
2925
|
+
|
|
2926
|
+
if (blck_size_interleave == 1) {
|
|
2927
|
+
for (int i = 0; i < end; ++i) {
|
|
2928
|
+
int src_id = i % 16;
|
|
2929
|
+
int src_offset = i / 16;
|
|
2930
|
+
int dst_offset = i;
|
|
2931
|
+
|
|
2932
|
+
out.qs[dst_offset] = in[src_id].qs[src_offset];
|
|
2933
|
+
}
|
|
2934
|
+
|
|
2935
|
+
// RVV repacking.
|
|
2936
|
+
//
|
|
2937
|
+
// Extract sums and mins for all 8 sub-blocks for each block of Q4_K.
|
|
2938
|
+
uint8_t s[128], m[128];
|
|
2939
|
+
for (int i = 0; i < 4; i++) {
|
|
2940
|
+
for (int j = 0; j < 16; j++) {
|
|
2941
|
+
s[i * 16 + j] = in[j].scales[i] & 63;
|
|
2942
|
+
m[i * 16 + j] = in[j].scales[i + 4] & 63;
|
|
2943
|
+
}
|
|
2944
|
+
}
|
|
2945
|
+
for (int i = 0; i < 4; i++) {
|
|
2946
|
+
for (int j = 0; j < 16; j++) {
|
|
2947
|
+
s[64 + i * 16 + j] = ((in[j].scales[i] & 192) >> 2) | (in[j].scales[i+8] & 15);
|
|
2948
|
+
m[64 + i * 16 + j] = ((in[j].scales[i + 4] & 192) >> 2) | ((in[j].scales[i+8] & 240) >> 4);
|
|
2949
|
+
}
|
|
2950
|
+
}
|
|
2951
|
+
|
|
2952
|
+
for (int i = 0; i < 128; i++) {
|
|
2953
|
+
out.scales[i] = (s[i] & 15) | ((m[i] & 15) << 4);
|
|
2954
|
+
}
|
|
2955
|
+
for (int i = 0; i < 64; i++) {
|
|
2956
|
+
out.scales[128 + i] = ((s[i] & 48) >> 4) | ((m[i] & 48) >> 2) | (s[64 + i] & 48) | ((m[64 + i] & 48) << 2);
|
|
2957
|
+
}
|
|
2958
|
+
} else {
|
|
2959
|
+
GGML_ASSERT(false);
|
|
2960
|
+
}
|
|
2961
|
+
|
|
2962
|
+
return out;
|
|
2963
|
+
}
|
|
2964
|
+
|
|
2965
|
+
static block_q2_Kx8 make_block_q2_Kx8(block_q2_K * in, unsigned int blck_size_interleave) {
|
|
2966
|
+
block_q2_Kx8 out;
|
|
2967
|
+
|
|
2968
|
+
// Delta(scale) and dmin values of the eight Q2_K structures are copied onto the output interleaved structure
|
|
2969
|
+
for (int i = 0; i < 8; i++) {
|
|
2970
|
+
out.d[i] = in[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.d;
|
|
2971
|
+
}
|
|
2972
|
+
|
|
2973
|
+
for (int i = 0; i < 8; i++) {
|
|
2974
|
+
out.dmin[i] = in[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.dmin;
|
|
2975
|
+
}
|
|
2976
|
+
|
|
2977
|
+
const int end = QK_K * 2 / blck_size_interleave;
|
|
2978
|
+
|
|
2979
|
+
// Interleave Q2_K quants by taking 8 bytes at a time
|
|
2980
|
+
for (int i = 0; i < end; ++i) {
|
|
2981
|
+
int src_id = i % 8;
|
|
2982
|
+
int src_offset = (i / 8) * blck_size_interleave;
|
|
2983
|
+
int dst_offset = i * blck_size_interleave;
|
|
2984
|
+
|
|
2985
|
+
uint64_t elems;
|
|
2986
|
+
memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t));
|
|
1502
2987
|
memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t));
|
|
1503
2988
|
}
|
|
1504
2989
|
|
|
2990
|
+
// The below logic is designed so as to unpack and rearrange scales and mins values in Q2_K
|
|
2991
|
+
// Currently the Q2_K structure has 16 scales and 16 mins packed in 16 bytes ( 4 bits for each value)
|
|
2992
|
+
// The output Q2_Kx8 structure has 128 bytes for storing scales and mins
|
|
2993
|
+
// Every 16 byte is packed such that it contains scales and mins for corresponding sub blocks from Q2_K structure
|
|
2994
|
+
// For eg - First 16 bytes contains 16 scales and 16 mins - each of first and second sub blocks from different Q2_K structures
|
|
2995
|
+
|
|
2996
|
+
for (int i = 0; i < 128; i++) {
|
|
2997
|
+
// Index for selecting which q2k super block
|
|
2998
|
+
int src1 = (i % 16) / 2;
|
|
2999
|
+
// Index for selecting scale
|
|
3000
|
+
int src2 = ((i / 16) * 2) + (i % 2);
|
|
3001
|
+
|
|
3002
|
+
out.scales[i] = in[src1].scales[src2];
|
|
3003
|
+
}
|
|
1505
3004
|
return out;
|
|
1506
3005
|
}
|
|
1507
3006
|
|
|
1508
|
-
static
|
|
1509
|
-
|
|
1510
|
-
//Delta(scale) and dmin values of the eight
|
|
3007
|
+
static block_q5_Kx8 make_block_q5_Kx8(block_q5_K * in, unsigned int blck_size_interleave) {
|
|
3008
|
+
block_q5_Kx8 out;
|
|
3009
|
+
//Delta(scale) and dmin values of the eight Q5_K structures are copied onto the output interleaved structure
|
|
1511
3010
|
for (int i = 0; i < 8; i++) {
|
|
1512
3011
|
out.d[i] = in[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.d;
|
|
1513
3012
|
}
|
|
@@ -1518,22 +3017,33 @@ static block_q4_Kx8 make_block_q4_Kx8(block_q4_K * in, unsigned int blck_size_in
|
|
|
1518
3017
|
|
|
1519
3018
|
const int end = QK_K * 4 / blck_size_interleave;
|
|
1520
3019
|
|
|
1521
|
-
// Interleave
|
|
3020
|
+
// Interleave Q5_K quants by taking blck_size_interleave bytes at a time
|
|
1522
3021
|
for (int i = 0; i < end; ++i) {
|
|
1523
|
-
int src_id
|
|
3022
|
+
int src_id = i % 8;
|
|
1524
3023
|
int src_offset = (i / 8) * blck_size_interleave;
|
|
1525
3024
|
int dst_offset = i * blck_size_interleave;
|
|
1526
3025
|
|
|
1527
|
-
|
|
1528
|
-
memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t));
|
|
1529
|
-
memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t));
|
|
3026
|
+
memcpy(&out.qs[dst_offset], &in[src_id].qs[src_offset], blck_size_interleave);
|
|
1530
3027
|
}
|
|
1531
3028
|
|
|
1532
|
-
//
|
|
1533
|
-
//
|
|
1534
|
-
//
|
|
1535
|
-
//
|
|
1536
|
-
|
|
3029
|
+
// Repeat for high bits with the same chunk size, since
|
|
3030
|
+
// the high bits are interleaved in Q5_K and the index is
|
|
3031
|
+
// qh_idx = (qs_idx % 32);
|
|
3032
|
+
// qh_val = qh[qh_idx] >> (qs_idx / 32);
|
|
3033
|
+
for (int i = 0; i < end / 4; ++i) {
|
|
3034
|
+
int src_id = i % 8;
|
|
3035
|
+
int src_offset = (i / 8) * blck_size_interleave;
|
|
3036
|
+
int dst_offset = i * blck_size_interleave;
|
|
3037
|
+
|
|
3038
|
+
memcpy(&out.qh[dst_offset], &in[src_id].qh[src_offset], blck_size_interleave);
|
|
3039
|
+
}
|
|
3040
|
+
|
|
3041
|
+
// The below logic is copied over from Q4_K
|
|
3042
|
+
// The point is to unpack all the scales and mins for each sub block every time we load 12 bytes.
|
|
3043
|
+
// Currently the Q5_K structure has 8 scales and 8 mins packed in 12 bytes ( 6 bits for each value)
|
|
3044
|
+
// The output Q5_Kx8 structure has 96 bytes
|
|
3045
|
+
// Every 12 byte is packed such that it contains scales and mins for corresponding sub blocks from Q5_K structure
|
|
3046
|
+
// For eg - First 12 bytes contains 8 scales and 8 mins - each of first sub block from different Q5_K structures
|
|
1537
3047
|
uint8_t s[8], m[8];
|
|
1538
3048
|
|
|
1539
3049
|
for (int i = 0; i < 4; i++) {
|
|
@@ -1554,13 +3064,12 @@ static block_q4_Kx8 make_block_q4_Kx8(block_q4_K * in, unsigned int blck_size_in
|
|
|
1554
3064
|
out.scales[i * 12 + 9] = (s[5] & 15) + ((m[5] & 15) << 4);
|
|
1555
3065
|
out.scales[i * 12 + 10] = (s[6] & 15) + ((m[6] & 15) << 4);
|
|
1556
3066
|
out.scales[i * 12 + 11] = (s[7] & 15) + ((m[7] & 15) << 4);
|
|
1557
|
-
|
|
1558
3067
|
}
|
|
1559
3068
|
|
|
1560
3069
|
for (int i = 0; i < 4; i++) {
|
|
1561
3070
|
for (int j = 0; j < 8; j++) {
|
|
1562
|
-
s[j] = ((in[j].scales[i] & 192) >> 2) | (in[j].scales[i+8] & 15);
|
|
1563
|
-
m[j] = ((in[j].scales[i + 4] & 192) >> 2) | ((in[j].scales[i+8] & 240) >> 4);
|
|
3071
|
+
s[j] = ((in[j].scales[i] & 192) >> 2) | (in[j].scales[i + 8] & 15);
|
|
3072
|
+
m[j] = ((in[j].scales[i + 4] & 192) >> 2) | ((in[j].scales[i + 8] & 240) >> 4);
|
|
1564
3073
|
}
|
|
1565
3074
|
|
|
1566
3075
|
out.scales[i * 12 + 48] = (s[0] & 63) + ((s[4] & 48) << 2);
|
|
@@ -1575,54 +3084,117 @@ static block_q4_Kx8 make_block_q4_Kx8(block_q4_K * in, unsigned int blck_size_in
|
|
|
1575
3084
|
out.scales[i * 12 + 57] = (s[5] & 15) + ((m[5] & 15) << 4);
|
|
1576
3085
|
out.scales[i * 12 + 58] = (s[6] & 15) + ((m[6] & 15) << 4);
|
|
1577
3086
|
out.scales[i * 12 + 59] = (s[7] & 15) + ((m[7] & 15) << 4);
|
|
1578
|
-
|
|
1579
3087
|
}
|
|
1580
3088
|
|
|
1581
3089
|
return out;
|
|
1582
3090
|
}
|
|
1583
3091
|
|
|
1584
|
-
static
|
|
1585
|
-
|
|
3092
|
+
static block_q6_Kx8 make_block_q6_Kx8(block_q6_K * in, unsigned int blck_size_interleave) {
|
|
3093
|
+
block_q6_Kx8 out;
|
|
3094
|
+
constexpr int n_blocks = 8; // Kx8
|
|
3095
|
+
for (int i = 0; i < n_blocks; i++) {
|
|
3096
|
+
out.d[i] = in[i].d;
|
|
3097
|
+
}
|
|
1586
3098
|
|
|
1587
|
-
|
|
1588
|
-
|
|
1589
|
-
|
|
3099
|
+
const int end_ls = QK_K * 4 / blck_size_interleave;
|
|
3100
|
+
// Interleave Q6_K quants by taking blck_size_interleave bytes at a time
|
|
3101
|
+
for (int i = 0; i < end_ls; ++i) {
|
|
3102
|
+
int src_id = i % n_blocks;
|
|
3103
|
+
int src_offset = (i / n_blocks) * blck_size_interleave;
|
|
3104
|
+
int dst_offset = i * blck_size_interleave;
|
|
3105
|
+
|
|
3106
|
+
uint64_t elem_ls;
|
|
3107
|
+
memcpy(&elem_ls, &in[src_id].ql[src_offset], blck_size_interleave);
|
|
3108
|
+
memcpy(&out.ql[dst_offset], &elem_ls, blck_size_interleave);
|
|
1590
3109
|
}
|
|
1591
3110
|
|
|
1592
|
-
|
|
3111
|
+
// Interleave high bits using same chunk size as low bits
|
|
3112
|
+
const int end_hs = end_ls / 2;
|
|
3113
|
+
for (int i = 0; i < end_hs; ++i) {
|
|
3114
|
+
int src_id = i % n_blocks;
|
|
3115
|
+
int src_offset = (i / n_blocks) * blck_size_interleave;
|
|
3116
|
+
int dst_offset = i * blck_size_interleave;
|
|
3117
|
+
|
|
3118
|
+
uint64_t elem_hs;
|
|
3119
|
+
memcpy(&elem_hs, &in[src_id].qh[src_offset], blck_size_interleave);
|
|
3120
|
+
memcpy(&out.qh[dst_offset], &elem_hs, blck_size_interleave);
|
|
3121
|
+
}
|
|
3122
|
+
|
|
3123
|
+
// The below logic is designed so as to unpack and rearrange scales in Q6_K
|
|
3124
|
+
// The output Q6_Kx8 structure interleaves the 8 bit scales in the same fashion as the quants
|
|
3125
|
+
// Q6_K structure has an 8-bit scale per 16 elements -> 16 scales
|
|
3126
|
+
// scales: [0 bl0 0 bl1 ... 0 bl7][1 bl0 ... 1 bl7] ... [15 bl0 ... 15 bl7] (bl = block)
|
|
3127
|
+
constexpr int n_scales = QK_K / 16;
|
|
3128
|
+
|
|
3129
|
+
for (int i = 0; i < n_blocks; i++) {
|
|
3130
|
+
for (int j = 0; j < n_scales; j++) {
|
|
3131
|
+
out.scales[j * n_blocks + i] = in[i].scales[j];
|
|
3132
|
+
}
|
|
3133
|
+
}
|
|
3134
|
+
|
|
3135
|
+
return out;
|
|
3136
|
+
}
|
|
3137
|
+
|
|
3138
|
+
static block_q2_Kx16 make_block_q2_Kx16(const block_q2_K * in, unsigned int blck_size_interleave) {
|
|
3139
|
+
block_q2_Kx16 out;
|
|
3140
|
+
constexpr int N_COLS = 16;
|
|
3141
|
+
|
|
3142
|
+
// 1. Copy Super-Scales (d) and Super-Mins (dmin)
|
|
3143
|
+
for (int i = 0; i < N_COLS; i++) {
|
|
3144
|
+
out.d[i] = in[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.d;
|
|
1593
3145
|
out.dmin[i] = in[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.dmin;
|
|
1594
3146
|
}
|
|
1595
3147
|
|
|
1596
|
-
|
|
3148
|
+
// 2. Interleave Q2_K Data
|
|
3149
|
+
const int bytes_per_col = 64;
|
|
3150
|
+
const int total_bytes = N_COLS * bytes_per_col;
|
|
3151
|
+
const int end = total_bytes / blck_size_interleave;
|
|
1597
3152
|
|
|
1598
|
-
// Interleave Q2_K quants by taking 8 bytes at a time
|
|
1599
3153
|
for (int i = 0; i < end; ++i) {
|
|
1600
|
-
int
|
|
1601
|
-
int src_offset = (i /
|
|
3154
|
+
int src_col_id = i % N_COLS;
|
|
3155
|
+
int src_offset = (i / N_COLS) * blck_size_interleave;
|
|
1602
3156
|
int dst_offset = i * blck_size_interleave;
|
|
1603
|
-
|
|
1604
|
-
uint64_t elems;
|
|
1605
|
-
memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t));
|
|
1606
|
-
memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t));
|
|
3157
|
+
memcpy(&out.qs[dst_offset], &in[src_col_id].qs[src_offset], blck_size_interleave);
|
|
1607
3158
|
}
|
|
1608
3159
|
|
|
1609
|
-
//
|
|
1610
|
-
|
|
1611
|
-
// The output Q2_Kx8 structure has 128 bytes for storing scales and mins
|
|
1612
|
-
// Every 16 byte is packed such that it contains scales and mins for corresponding sub blocks from Q2_K structure
|
|
1613
|
-
// For eg - First 16 bytes contains 16 scales and 16 mins - each of first and second sub blocks from different Q2_K structures
|
|
3160
|
+
// 3. Repack Scales into the Optimized "Sequential-Parallel" Layout
|
|
3161
|
+
int out_idx = 0;
|
|
1614
3162
|
|
|
1615
|
-
|
|
3163
|
+
// Arrays define the sub-block order for each group
|
|
3164
|
+
const int even_low_sbs[] = {0, 2, 4, 6};
|
|
3165
|
+
const int odd_low_sbs[] = {1, 3, 5, 7};
|
|
3166
|
+
const int even_high_sbs[] = {8, 10, 12, 14};
|
|
3167
|
+
const int odd_high_sbs[] = {9, 11, 13, 15};
|
|
1616
3168
|
|
|
1617
|
-
|
|
1618
|
-
|
|
1619
|
-
|
|
1620
|
-
|
|
3169
|
+
// Pack Group 1: Even-Low
|
|
3170
|
+
for (int sb : even_low_sbs) {
|
|
3171
|
+
for (int col = 0; col < N_COLS; col++) {
|
|
3172
|
+
out.scales[out_idx++] = in[col].scales[sb];
|
|
3173
|
+
}
|
|
3174
|
+
}
|
|
1621
3175
|
|
|
1622
|
-
|
|
3176
|
+
// Pack Group 2: Odd-Low
|
|
3177
|
+
for (int sb : odd_low_sbs) {
|
|
3178
|
+
for (int col = 0; col < N_COLS; col++) {
|
|
3179
|
+
out.scales[out_idx++] = in[col].scales[sb];
|
|
3180
|
+
}
|
|
3181
|
+
}
|
|
3182
|
+
|
|
3183
|
+
// Pack Group 3: Even-High
|
|
3184
|
+
for (int sb : even_high_sbs) {
|
|
3185
|
+
for (int col = 0; col < N_COLS; col++) {
|
|
3186
|
+
out.scales[out_idx++] = in[col].scales[sb];
|
|
3187
|
+
}
|
|
1623
3188
|
}
|
|
1624
|
-
return out;
|
|
1625
3189
|
|
|
3190
|
+
// Pack Group 4: Odd-High
|
|
3191
|
+
for (int sb : odd_high_sbs) {
|
|
3192
|
+
for (int col = 0; col < N_COLS; col++) {
|
|
3193
|
+
out.scales[out_idx++] = in[col].scales[sb];
|
|
3194
|
+
}
|
|
3195
|
+
}
|
|
3196
|
+
|
|
3197
|
+
return out;
|
|
1626
3198
|
}
|
|
1627
3199
|
|
|
1628
3200
|
static int repack_q4_0_to_q4_0_4_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
|
|
@@ -1687,6 +3259,36 @@ static int repack_q4_K_to_q4_K_8_bl(struct ggml_tensor * t, int interleave_block
|
|
|
1687
3259
|
GGML_UNUSED(data_size);
|
|
1688
3260
|
}
|
|
1689
3261
|
|
|
3262
|
+
static int repack_q4_K_to_q4_K_16_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
|
|
3263
|
+
GGML_ASSERT(t->type == GGML_TYPE_Q4_K);
|
|
3264
|
+
constexpr int nrows_interleaved = 16;
|
|
3265
|
+
|
|
3266
|
+
block_q4_Kx16 * dst = (block_q4_Kx16*)t->data;
|
|
3267
|
+
const block_q4_K * src = (const block_q4_K*) data;
|
|
3268
|
+
block_q4_K dst_tmp[16];
|
|
3269
|
+
int nrow = ggml_nrows(t);
|
|
3270
|
+
int nblocks = t->ne[0] / QK_K;
|
|
3271
|
+
|
|
3272
|
+
GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q4_K));
|
|
3273
|
+
|
|
3274
|
+
if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
|
|
3275
|
+
return -1;
|
|
3276
|
+
}
|
|
3277
|
+
|
|
3278
|
+
for (int b = 0; b < nrow; b += nrows_interleaved) {
|
|
3279
|
+
for (int64_t x = 0; x < nblocks; x++) {
|
|
3280
|
+
for (int i = 0; i < nrows_interleaved; i++ ) {
|
|
3281
|
+
dst_tmp[i] = src[x + i * nblocks];
|
|
3282
|
+
}
|
|
3283
|
+
*dst++ = make_block_q4_Kx16(dst_tmp, interleave_block);
|
|
3284
|
+
}
|
|
3285
|
+
src += nrows_interleaved * nblocks;
|
|
3286
|
+
}
|
|
3287
|
+
return 0;
|
|
3288
|
+
|
|
3289
|
+
GGML_UNUSED(data_size);
|
|
3290
|
+
}
|
|
3291
|
+
|
|
1690
3292
|
static int repack_q2_K_to_q2_K_8_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
|
|
1691
3293
|
GGML_ASSERT(t->type == GGML_TYPE_Q2_K);
|
|
1692
3294
|
GGML_ASSERT(interleave_block == 8);
|
|
@@ -1706,7 +3308,7 @@ static int repack_q2_K_to_q2_K_8_bl(struct ggml_tensor * t, int interleave_block
|
|
|
1706
3308
|
|
|
1707
3309
|
for (int b = 0; b < nrow; b += nrows_interleaved) {
|
|
1708
3310
|
for (int64_t x = 0; x < nblocks; x++) {
|
|
1709
|
-
for (int i
|
|
3311
|
+
for (int i = 0; i < nrows_interleaved; i++) {
|
|
1710
3312
|
dst_tmp[i] = src[x + i * nblocks];
|
|
1711
3313
|
}
|
|
1712
3314
|
*dst++ = make_block_q2_Kx8(dst_tmp, interleave_block);
|
|
@@ -1718,6 +3320,132 @@ static int repack_q2_K_to_q2_K_8_bl(struct ggml_tensor * t, int interleave_block
|
|
|
1718
3320
|
GGML_UNUSED(data_size);
|
|
1719
3321
|
}
|
|
1720
3322
|
|
|
3323
|
+
static int repack_q2_K_to_q2_K_16_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
|
|
3324
|
+
GGML_ASSERT(t->type == GGML_TYPE_Q2_K);
|
|
3325
|
+
constexpr int nrows_interleaved = 16;
|
|
3326
|
+
|
|
3327
|
+
block_q2_Kx16 * dst = (block_q2_Kx16*)t->data;
|
|
3328
|
+
const block_q2_K * src = (const block_q2_K*) data;
|
|
3329
|
+
|
|
3330
|
+
block_q2_K dst_tmp[nrows_interleaved];
|
|
3331
|
+
|
|
3332
|
+
int nrow = ggml_nrows(t);
|
|
3333
|
+
int nblocks = t->ne[0] / QK_K;
|
|
3334
|
+
|
|
3335
|
+
GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q2_K));
|
|
3336
|
+
|
|
3337
|
+
if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
|
|
3338
|
+
return -1;
|
|
3339
|
+
}
|
|
3340
|
+
|
|
3341
|
+
for (int b = 0; b < nrow; b += nrows_interleaved) {
|
|
3342
|
+
for (int64_t x = 0; x < nblocks; x++) {
|
|
3343
|
+
// This loop gathers 16 separate blocks (one from each column)
|
|
3344
|
+
// that correspond to the same K-dimension chunk.
|
|
3345
|
+
for (int i = 0; i < nrows_interleaved; i++ ) {
|
|
3346
|
+
dst_tmp[i] = src[x + i * nblocks];
|
|
3347
|
+
}
|
|
3348
|
+
|
|
3349
|
+
*dst++ = make_block_q2_Kx16(dst_tmp, interleave_block);
|
|
3350
|
+
}
|
|
3351
|
+
src += nrows_interleaved * nblocks;
|
|
3352
|
+
}
|
|
3353
|
+
return 0;
|
|
3354
|
+
|
|
3355
|
+
GGML_UNUSED(data_size);
|
|
3356
|
+
}
|
|
3357
|
+
|
|
3358
|
+
static int repack_q4_0_to_q4_0_16_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
|
|
3359
|
+
GGML_ASSERT(t->type == GGML_TYPE_Q4_0);
|
|
3360
|
+
constexpr int nrows_interleaved = 16;
|
|
3361
|
+
|
|
3362
|
+
block_q4_0x16 * dst = (block_q4_0x16*)t->data;
|
|
3363
|
+
const block_q4_0 * src = (const block_q4_0*) data;
|
|
3364
|
+
block_q4_0 dst_tmp[16];
|
|
3365
|
+
int nrow = ggml_nrows(t);
|
|
3366
|
+
int nblocks = t->ne[0] / QK4_0;
|
|
3367
|
+
|
|
3368
|
+
GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q4_0));
|
|
3369
|
+
|
|
3370
|
+
if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
|
|
3371
|
+
return -1;
|
|
3372
|
+
}
|
|
3373
|
+
|
|
3374
|
+
for (int b = 0; b < nrow; b += nrows_interleaved) {
|
|
3375
|
+
for (int64_t x = 0; x < nblocks; x++) {
|
|
3376
|
+
for (int i = 0; i < nrows_interleaved; i++ ) {
|
|
3377
|
+
dst_tmp[i] = src[x + i * nblocks];
|
|
3378
|
+
}
|
|
3379
|
+
*dst++ = make_block_q4_0x16(dst_tmp, interleave_block);
|
|
3380
|
+
}
|
|
3381
|
+
src += nrows_interleaved * nblocks;
|
|
3382
|
+
}
|
|
3383
|
+
return 0;
|
|
3384
|
+
|
|
3385
|
+
GGML_UNUSED(data_size);
|
|
3386
|
+
}
|
|
3387
|
+
|
|
3388
|
+
static int repack_q5_K_to_q5_K_8_bl(struct ggml_tensor * t,
|
|
3389
|
+
int interleave_block,
|
|
3390
|
+
const void * GGML_RESTRICT data,
|
|
3391
|
+
size_t data_size) {
|
|
3392
|
+
GGML_ASSERT(t->type == GGML_TYPE_Q5_K);
|
|
3393
|
+
GGML_ASSERT(interleave_block == 4 || interleave_block == 8);
|
|
3394
|
+
constexpr int nrows_interleaved = 8;
|
|
3395
|
+
|
|
3396
|
+
block_q5_Kx8 * dst = (block_q5_Kx8 *) t->data;
|
|
3397
|
+
const block_q5_K * src = (const block_q5_K *) data;
|
|
3398
|
+
block_q5_K dst_tmp[8];
|
|
3399
|
+
int nrow = ggml_nrows(t);
|
|
3400
|
+
int nblocks = t->ne[0] / QK_K;
|
|
3401
|
+
|
|
3402
|
+
GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q5_K));
|
|
3403
|
+
|
|
3404
|
+
if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
|
|
3405
|
+
return -1;
|
|
3406
|
+
}
|
|
3407
|
+
|
|
3408
|
+
for (int b = 0; b < nrow; b += nrows_interleaved) {
|
|
3409
|
+
for (int64_t x = 0; x < nblocks; x++) {
|
|
3410
|
+
for (int i = 0; i < nrows_interleaved; i++) {
|
|
3411
|
+
dst_tmp[i] = src[x + i * nblocks];
|
|
3412
|
+
}
|
|
3413
|
+
*dst++ = make_block_q5_Kx8(dst_tmp, interleave_block);
|
|
3414
|
+
}
|
|
3415
|
+
src += nrows_interleaved * nblocks;
|
|
3416
|
+
}
|
|
3417
|
+
return 0;
|
|
3418
|
+
}
|
|
3419
|
+
|
|
3420
|
+
static int repack_q6_K_to_q6_K_8_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
|
|
3421
|
+
GGML_ASSERT(t->type == GGML_TYPE_Q6_K);
|
|
3422
|
+
GGML_ASSERT(interleave_block == 4 || interleave_block == 8);
|
|
3423
|
+
constexpr int nrows_interleaved = 8;
|
|
3424
|
+
|
|
3425
|
+
block_q6_Kx8 * dst = (block_q6_Kx8 *)t->data;
|
|
3426
|
+
const block_q6_K * src = (const block_q6_K *) data;
|
|
3427
|
+
block_q6_K dst_tmp[8];
|
|
3428
|
+
int nrow = ggml_nrows(t);
|
|
3429
|
+
int nblocks = t->ne[0] / QK_K;
|
|
3430
|
+
|
|
3431
|
+
GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q6_K));
|
|
3432
|
+
|
|
3433
|
+
if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
|
|
3434
|
+
return -1;
|
|
3435
|
+
}
|
|
3436
|
+
|
|
3437
|
+
for (int b = 0; b < nrow; b += nrows_interleaved) {
|
|
3438
|
+
for (int64_t x = 0; x < nblocks; x++) {
|
|
3439
|
+
for (int i = 0; i < nrows_interleaved; i++) {
|
|
3440
|
+
dst_tmp[i] = src[x + i * nblocks];
|
|
3441
|
+
}
|
|
3442
|
+
*dst++ = make_block_q6_Kx8(dst_tmp, interleave_block);
|
|
3443
|
+
}
|
|
3444
|
+
src += nrows_interleaved * nblocks;
|
|
3445
|
+
}
|
|
3446
|
+
return 0;
|
|
3447
|
+
}
|
|
3448
|
+
|
|
1721
3449
|
static int repack_q4_0_to_q4_0_8_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
|
|
1722
3450
|
GGML_ASSERT(t->type == GGML_TYPE_Q4_0);
|
|
1723
3451
|
GGML_ASSERT(interleave_block == 8);
|
|
@@ -1757,9 +3485,63 @@ static int repack_q8_0_to_q8_0_4_bl(struct ggml_tensor * t,
|
|
|
1757
3485
|
GGML_ASSERT(interleave_block == 4 || interleave_block == 8);
|
|
1758
3486
|
constexpr int nrows_interleaved = 4;
|
|
1759
3487
|
|
|
1760
|
-
block_q8_0x4 * dst = (block_q8_0x4 *) t->data;
|
|
3488
|
+
block_q8_0x4 * dst = (block_q8_0x4 *) t->data;
|
|
3489
|
+
const block_q8_0 * src = (const block_q8_0 *) data;
|
|
3490
|
+
block_q8_0 dst_tmp[4];
|
|
3491
|
+
int nrow = ggml_nrows(t);
|
|
3492
|
+
int nblocks = t->ne[0] / QK8_0;
|
|
3493
|
+
|
|
3494
|
+
GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q8_0));
|
|
3495
|
+
|
|
3496
|
+
if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
|
|
3497
|
+
return -1;
|
|
3498
|
+
}
|
|
3499
|
+
|
|
3500
|
+
for (int b = 0; b < nrow; b += nrows_interleaved) {
|
|
3501
|
+
for (int64_t x = 0; x < nblocks; x++) {
|
|
3502
|
+
for (int i = 0; i < nrows_interleaved; i++) {
|
|
3503
|
+
dst_tmp[i] = src[x + i * nblocks];
|
|
3504
|
+
}
|
|
3505
|
+
*dst++ = make_block_q8_0x4(dst_tmp, interleave_block);
|
|
3506
|
+
}
|
|
3507
|
+
src += nrows_interleaved * nblocks;
|
|
3508
|
+
}
|
|
3509
|
+
return 0;
|
|
3510
|
+
}
|
|
3511
|
+
|
|
3512
|
+
static block_q8_0x16 make_block_q8_0x16(block_q8_0 * in, unsigned int blck_size_interleave) {
|
|
3513
|
+
block_q8_0x16 out;
|
|
3514
|
+
|
|
3515
|
+
for (int i = 0; i < 16; i++) {
|
|
3516
|
+
out.d[i] = in[i].d;
|
|
3517
|
+
}
|
|
3518
|
+
|
|
3519
|
+
const int end = QK8_0 * 16 / blck_size_interleave;
|
|
3520
|
+
|
|
3521
|
+
if (blck_size_interleave == 1) {
|
|
3522
|
+
for (int i = 0; i < end; ++i) {
|
|
3523
|
+
int src_id = i % 16;
|
|
3524
|
+
int src_offset = i / 16;
|
|
3525
|
+
int dst_offset = i;
|
|
3526
|
+
out.qs[dst_offset] = in[src_id].qs[src_offset];
|
|
3527
|
+
}
|
|
3528
|
+
} else {
|
|
3529
|
+
GGML_ASSERT(false);
|
|
3530
|
+
}
|
|
3531
|
+
|
|
3532
|
+
return out;
|
|
3533
|
+
}
|
|
3534
|
+
|
|
3535
|
+
static int repack_q8_0_to_q8_0_16_bl(struct ggml_tensor * t,
|
|
3536
|
+
int interleave_block,
|
|
3537
|
+
const void * GGML_RESTRICT data,
|
|
3538
|
+
size_t data_size) {
|
|
3539
|
+
GGML_ASSERT(t->type == GGML_TYPE_Q8_0);
|
|
3540
|
+
constexpr int nrows_interleaved = 16;
|
|
3541
|
+
|
|
3542
|
+
block_q8_0x16 * dst = (block_q8_0x16 *) t->data;
|
|
1761
3543
|
const block_q8_0 * src = (const block_q8_0 *) data;
|
|
1762
|
-
block_q8_0 dst_tmp[
|
|
3544
|
+
block_q8_0 dst_tmp[16];
|
|
1763
3545
|
int nrow = ggml_nrows(t);
|
|
1764
3546
|
int nblocks = t->ne[0] / QK8_0;
|
|
1765
3547
|
|
|
@@ -1774,7 +3556,7 @@ static int repack_q8_0_to_q8_0_4_bl(struct ggml_tensor * t,
|
|
|
1774
3556
|
for (int i = 0; i < nrows_interleaved; i++) {
|
|
1775
3557
|
dst_tmp[i] = src[x + i * nblocks];
|
|
1776
3558
|
}
|
|
1777
|
-
*dst++ =
|
|
3559
|
+
*dst++ = make_block_q8_0x16(dst_tmp, interleave_block);
|
|
1778
3560
|
}
|
|
1779
3561
|
src += nrows_interleaved * nblocks;
|
|
1780
3562
|
}
|
|
@@ -1906,6 +3688,177 @@ static int repack_iq4_nl_to_iq4_nl_8_bl(struct ggml_tensor * t, int interleave_b
|
|
|
1906
3688
|
GGML_UNUSED(data_size);
|
|
1907
3689
|
}
|
|
1908
3690
|
|
|
3691
|
+
static block_iq4_nlx16 make_block_iq4_nlx16(block_iq4_nl * in, unsigned int blck_size_interleave) {
|
|
3692
|
+
block_iq4_nlx16 out;
|
|
3693
|
+
|
|
3694
|
+
for (int i = 0; i < 16; i++) {
|
|
3695
|
+
out.d[i] = in[i].d;
|
|
3696
|
+
}
|
|
3697
|
+
|
|
3698
|
+
const int end = QK4_NL * 8 / blck_size_interleave;
|
|
3699
|
+
|
|
3700
|
+
if (blck_size_interleave == 1) {
|
|
3701
|
+
for (int i = 0; i < end; ++i) {
|
|
3702
|
+
int src_id = i % 16;
|
|
3703
|
+
int src_offset = i / 16;
|
|
3704
|
+
int dst_offset = i;
|
|
3705
|
+
|
|
3706
|
+
out.qs[dst_offset] = in[src_id].qs[src_offset];
|
|
3707
|
+
}
|
|
3708
|
+
} else {
|
|
3709
|
+
GGML_ASSERT(false);
|
|
3710
|
+
}
|
|
3711
|
+
|
|
3712
|
+
return out;
|
|
3713
|
+
}
|
|
3714
|
+
|
|
3715
|
+
static int repack_iq4_nl_to_iq4_nl_16_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
|
|
3716
|
+
GGML_ASSERT(t->type == GGML_TYPE_IQ4_NL);
|
|
3717
|
+
GGML_ASSERT(interleave_block == 1);
|
|
3718
|
+
|
|
3719
|
+
const block_iq4_nl * src = (const block_iq4_nl *)data;
|
|
3720
|
+
block_iq4_nlx16 * dst = ( block_iq4_nlx16 *)t->data;
|
|
3721
|
+
|
|
3722
|
+
block_iq4_nl dst_tmp[16];
|
|
3723
|
+
|
|
3724
|
+
int nrow = ggml_nrows(t);
|
|
3725
|
+
int nrows_interleaved = 16;
|
|
3726
|
+
int nblocks = t->ne[0] / QK4_NL;
|
|
3727
|
+
|
|
3728
|
+
GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_iq4_nl));
|
|
3729
|
+
|
|
3730
|
+
if (t->ne[1] % nrows_interleaved != 0) {
|
|
3731
|
+
return -1;
|
|
3732
|
+
}
|
|
3733
|
+
|
|
3734
|
+
for (int b = 0; b < nrow; b += nrows_interleaved) {
|
|
3735
|
+
for (int64_t x = 0; x < nblocks; x++) {
|
|
3736
|
+
for (int i = 0; i < nrows_interleaved; i++) {
|
|
3737
|
+
dst_tmp[i] = src[x + i * nblocks];
|
|
3738
|
+
}
|
|
3739
|
+
*dst++ = make_block_iq4_nlx16(dst_tmp, interleave_block);
|
|
3740
|
+
}
|
|
3741
|
+
src += nrows_interleaved * nblocks;
|
|
3742
|
+
}
|
|
3743
|
+
return 0;
|
|
3744
|
+
|
|
3745
|
+
GGML_UNUSED(data_size);
|
|
3746
|
+
}
|
|
3747
|
+
|
|
3748
|
+
static block_mxfp4x4 make_block_mxfp4x4(block_mxfp4 * in, unsigned int blck_size_interleave) {
|
|
3749
|
+
block_mxfp4x4 out;
|
|
3750
|
+
|
|
3751
|
+
for (int i = 0; i < 4; i++) {
|
|
3752
|
+
out.e[i] = in[i].e;
|
|
3753
|
+
}
|
|
3754
|
+
|
|
3755
|
+
const int end = QK_MXFP4 * 2 / blck_size_interleave;
|
|
3756
|
+
|
|
3757
|
+
if (blck_size_interleave == 4) {
|
|
3758
|
+
for (int i = 0; i < end; ++i) {
|
|
3759
|
+
int src_id = i % 4;
|
|
3760
|
+
int src_offset = (i / 4) * blck_size_interleave;
|
|
3761
|
+
int dst_offset = i * blck_size_interleave;
|
|
3762
|
+
|
|
3763
|
+
memcpy(&out.qs[dst_offset], &in[src_id].qs[src_offset], sizeof(uint32_t));
|
|
3764
|
+
}
|
|
3765
|
+
} else {
|
|
3766
|
+
GGML_ASSERT(false);
|
|
3767
|
+
}
|
|
3768
|
+
|
|
3769
|
+
return out;
|
|
3770
|
+
}
|
|
3771
|
+
|
|
3772
|
+
static int repack_mxfp4_to_mxfp4_4_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
|
|
3773
|
+
GGML_ASSERT(t->type == GGML_TYPE_MXFP4);
|
|
3774
|
+
GGML_ASSERT(interleave_block == 4);
|
|
3775
|
+
|
|
3776
|
+
const block_mxfp4 * src = (const block_mxfp4 *)data;
|
|
3777
|
+
block_mxfp4x4 * dst = ( block_mxfp4x4 *)t->data;
|
|
3778
|
+
|
|
3779
|
+
block_mxfp4 dst_tmp[4];
|
|
3780
|
+
|
|
3781
|
+
int nrow = ggml_nrows(t);
|
|
3782
|
+
int nrows_interleaved = 4;
|
|
3783
|
+
int nblocks = t->ne[0] / QK_MXFP4;
|
|
3784
|
+
|
|
3785
|
+
GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_mxfp4));
|
|
3786
|
+
|
|
3787
|
+
if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
|
|
3788
|
+
return -1;
|
|
3789
|
+
}
|
|
3790
|
+
|
|
3791
|
+
for (int b = 0; b < nrow; b += nrows_interleaved) {
|
|
3792
|
+
for (int64_t x = 0; x < nblocks; x++) {
|
|
3793
|
+
for (int i = 0; i < nrows_interleaved; i++) {
|
|
3794
|
+
dst_tmp[i] = src[x + i * nblocks];
|
|
3795
|
+
}
|
|
3796
|
+
*dst++ = make_block_mxfp4x4(dst_tmp, interleave_block);
|
|
3797
|
+
}
|
|
3798
|
+
src += nrows_interleaved * nblocks;
|
|
3799
|
+
}
|
|
3800
|
+
return 0;
|
|
3801
|
+
|
|
3802
|
+
GGML_UNUSED(data_size);
|
|
3803
|
+
}
|
|
3804
|
+
|
|
3805
|
+
static block_mxfp4x8 make_block_mxfp4x8(block_mxfp4 * in, unsigned int blck_size_interleave) {
|
|
3806
|
+
block_mxfp4x8 out;
|
|
3807
|
+
|
|
3808
|
+
for (int i = 0; i < 8; i++) {
|
|
3809
|
+
out.e[i] = in[i].e;
|
|
3810
|
+
}
|
|
3811
|
+
|
|
3812
|
+
const int end = QK_MXFP4 * 4 / blck_size_interleave;
|
|
3813
|
+
|
|
3814
|
+
if (blck_size_interleave == 8) {
|
|
3815
|
+
for (int i = 0; i < end; ++i) {
|
|
3816
|
+
int src_id = i % 8;
|
|
3817
|
+
int src_offset = (i / 8) * blck_size_interleave;
|
|
3818
|
+
int dst_offset = i * blck_size_interleave;
|
|
3819
|
+
|
|
3820
|
+
memcpy(&out.qs[dst_offset], &in[src_id].qs[src_offset], sizeof(uint64_t));
|
|
3821
|
+
}
|
|
3822
|
+
} else {
|
|
3823
|
+
GGML_ASSERT(false);
|
|
3824
|
+
}
|
|
3825
|
+
|
|
3826
|
+
return out;
|
|
3827
|
+
}
|
|
3828
|
+
|
|
3829
|
+
static int repack_mxfp4_to_mxfp4_8_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
|
|
3830
|
+
GGML_ASSERT(t->type == GGML_TYPE_MXFP4);
|
|
3831
|
+
GGML_ASSERT(interleave_block == 8);
|
|
3832
|
+
|
|
3833
|
+
const block_mxfp4 * src = (const block_mxfp4 *)data;
|
|
3834
|
+
block_mxfp4x8 * dst = ( block_mxfp4x8 *)t->data;
|
|
3835
|
+
|
|
3836
|
+
block_mxfp4 dst_tmp[8];
|
|
3837
|
+
|
|
3838
|
+
int nrow = ggml_nrows(t);
|
|
3839
|
+
int nrows_interleaved = 8;
|
|
3840
|
+
int nblocks = t->ne[0] / QK_MXFP4;
|
|
3841
|
+
|
|
3842
|
+
GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_mxfp4));
|
|
3843
|
+
|
|
3844
|
+
if (t->ne[1] % nrows_interleaved != 0) {
|
|
3845
|
+
return -1;
|
|
3846
|
+
}
|
|
3847
|
+
|
|
3848
|
+
for (int b = 0; b < nrow; b += nrows_interleaved) {
|
|
3849
|
+
for (int64_t x = 0; x < nblocks; x++) {
|
|
3850
|
+
for (int i = 0; i < nrows_interleaved; i++) {
|
|
3851
|
+
dst_tmp[i] = src[x + i * nblocks];
|
|
3852
|
+
}
|
|
3853
|
+
*dst++ = make_block_mxfp4x8(dst_tmp, interleave_block);
|
|
3854
|
+
}
|
|
3855
|
+
src += nrows_interleaved * nblocks;
|
|
3856
|
+
}
|
|
3857
|
+
return 0;
|
|
3858
|
+
|
|
3859
|
+
GGML_UNUSED(data_size);
|
|
3860
|
+
}
|
|
3861
|
+
|
|
1909
3862
|
namespace ggml::cpu::repack {
|
|
1910
3863
|
// repack
|
|
1911
3864
|
template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS>
|
|
@@ -1936,6 +3889,22 @@ template <> int repack<block_q2_K, 8, 8>(struct ggml_tensor * t, const void * da
|
|
|
1936
3889
|
return repack_q2_K_to_q2_K_8_bl(t, 8, data, data_size);
|
|
1937
3890
|
}
|
|
1938
3891
|
|
|
3892
|
+
template <> int repack<block_q5_K, 4, 8>(struct ggml_tensor * t, const void * data, size_t data_size) {
|
|
3893
|
+
return repack_q5_K_to_q5_K_8_bl(t, 4, data, data_size);
|
|
3894
|
+
}
|
|
3895
|
+
|
|
3896
|
+
template <> int repack<block_q5_K, 8, 8>(struct ggml_tensor * t, const void * data, size_t data_size) {
|
|
3897
|
+
return repack_q5_K_to_q5_K_8_bl(t, 8, data, data_size);
|
|
3898
|
+
}
|
|
3899
|
+
|
|
3900
|
+
template <> int repack<block_q6_K, 4, 8>(struct ggml_tensor * t, const void * data, size_t data_size) {
|
|
3901
|
+
return repack_q6_K_to_q6_K_8_bl(t, 4, data, data_size);
|
|
3902
|
+
}
|
|
3903
|
+
|
|
3904
|
+
template <> int repack<block_q6_K, 8, 8>(struct ggml_tensor * t, const void * data, size_t data_size) {
|
|
3905
|
+
return repack_q6_K_to_q6_K_8_bl(t, 8, data, data_size);
|
|
3906
|
+
}
|
|
3907
|
+
|
|
1939
3908
|
template <> int repack<block_iq4_nl, 4, 4>(struct ggml_tensor * t, const void * data, size_t data_size) {
|
|
1940
3909
|
return repack_iq4_nl_to_iq4_nl_4_bl(t, 4, data, data_size);
|
|
1941
3910
|
}
|
|
@@ -1949,6 +3918,14 @@ template <> int repack<block_iq4_nl, 8, 8>(struct ggml_tensor * t, const void *
|
|
|
1949
3918
|
return repack_iq4_nl_to_iq4_nl_8_bl(t, 8, data, data_size);
|
|
1950
3919
|
}
|
|
1951
3920
|
|
|
3921
|
+
template <> int repack<block_mxfp4, 4, 4>(struct ggml_tensor * t, const void * data, size_t data_size) {
|
|
3922
|
+
return repack_mxfp4_to_mxfp4_4_bl(t, 4, data, data_size);
|
|
3923
|
+
}
|
|
3924
|
+
|
|
3925
|
+
template <> int repack<block_mxfp4, 8, 8>(struct ggml_tensor * t, const void * data, size_t data_size) {
|
|
3926
|
+
return repack_mxfp4_to_mxfp4_8_bl(t, 8, data, data_size);
|
|
3927
|
+
}
|
|
3928
|
+
|
|
1952
3929
|
template <> int repack<block_q8_0, 4, 4>(struct ggml_tensor * t, const void * data, size_t data_size) {
|
|
1953
3930
|
return repack_q8_0_to_q8_0_4_bl(t, 4, data, data_size);
|
|
1954
3931
|
}
|
|
@@ -1957,6 +3934,28 @@ template <> int repack<block_q8_0, 8, 4>(struct ggml_tensor * t, const void * da
|
|
|
1957
3934
|
return repack_q8_0_to_q8_0_4_bl(t, 8, data, data_size);
|
|
1958
3935
|
}
|
|
1959
3936
|
|
|
3937
|
+
#if defined __riscv_zvfh
|
|
3938
|
+
template <> int repack<block_q4_0, 1, 16>(struct ggml_tensor * t, const void * data, size_t data_size) {
|
|
3939
|
+
return repack_q4_0_to_q4_0_16_bl(t, 1, data, data_size);
|
|
3940
|
+
}
|
|
3941
|
+
|
|
3942
|
+
template <> int repack<block_q4_K, 1, 16>(struct ggml_tensor * t, const void * data, size_t data_size) {
|
|
3943
|
+
return repack_q4_K_to_q4_K_16_bl(t, 1, data, data_size);
|
|
3944
|
+
}
|
|
3945
|
+
|
|
3946
|
+
template <> int repack<block_iq4_nl, 1, 16>(struct ggml_tensor * t, const void * data, size_t data_size) {
|
|
3947
|
+
return repack_iq4_nl_to_iq4_nl_16_bl(t, 1, data, data_size);
|
|
3948
|
+
}
|
|
3949
|
+
|
|
3950
|
+
template <> int repack<block_q8_0, 1, 16>(struct ggml_tensor * t, const void * data, size_t data_size) {
|
|
3951
|
+
return repack_q8_0_to_q8_0_16_bl(t, 1, data, data_size);
|
|
3952
|
+
}
|
|
3953
|
+
|
|
3954
|
+
template <> int repack<block_q2_K, 1, 16>(struct ggml_tensor * t, const void * data, size_t data_size) {
|
|
3955
|
+
return repack_q2_K_to_q2_K_16_bl(t, 1, data, data_size);
|
|
3956
|
+
}
|
|
3957
|
+
#endif
|
|
3958
|
+
|
|
1960
3959
|
// gemv
|
|
1961
3960
|
template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PARAM_TYPE>
|
|
1962
3961
|
void gemv(int, float *, size_t, const void *, const void *, int, int);
|
|
@@ -1973,6 +3972,17 @@ template <> void gemv<block_q4_0, 8, 8, GGML_TYPE_Q8_0>(int n, float * s, size_t
|
|
|
1973
3972
|
ggml_gemv_q4_0_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
|
|
1974
3973
|
}
|
|
1975
3974
|
|
|
3975
|
+
template <>
|
|
3976
|
+
void gemv<block_q2_K, 8, 8, GGML_TYPE_Q8_K>(int n,
|
|
3977
|
+
float * s,
|
|
3978
|
+
size_t bs,
|
|
3979
|
+
const void * vx,
|
|
3980
|
+
const void * vy,
|
|
3981
|
+
int nr,
|
|
3982
|
+
int nc) {
|
|
3983
|
+
ggml_gemv_q2_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
|
|
3984
|
+
}
|
|
3985
|
+
|
|
1976
3986
|
template <> void gemv<block_q4_K, 4, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
|
|
1977
3987
|
ggml_gemv_q4_K_8x4_q8_K(n, s, bs, vx, vy, nr, nc);
|
|
1978
3988
|
}
|
|
@@ -1981,8 +3991,20 @@ template <> void gemv<block_q4_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t
|
|
|
1981
3991
|
ggml_gemv_q4_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
|
|
1982
3992
|
}
|
|
1983
3993
|
|
|
1984
|
-
template <> void gemv<
|
|
1985
|
-
|
|
3994
|
+
template <> void gemv<block_q5_K, 4, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
|
|
3995
|
+
ggml_gemv_q5_K_8x4_q8_K(n, s, bs, vx, vy, nr, nc);
|
|
3996
|
+
}
|
|
3997
|
+
|
|
3998
|
+
template <> void gemv<block_q5_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
|
|
3999
|
+
ggml_gemv_q5_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
|
|
4000
|
+
}
|
|
4001
|
+
|
|
4002
|
+
template <> void gemv<block_q6_K, 4, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
|
|
4003
|
+
ggml_gemv_q6_K_8x4_q8_K(n, s, bs, vx, vy, nr, nc);
|
|
4004
|
+
}
|
|
4005
|
+
|
|
4006
|
+
template <> void gemv<block_q6_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
|
|
4007
|
+
ggml_gemv_q6_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
|
|
1986
4008
|
}
|
|
1987
4009
|
|
|
1988
4010
|
template <> void gemv<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
|
|
@@ -1993,6 +4015,14 @@ template <> void gemv<block_iq4_nl, 8, 8, GGML_TYPE_Q8_0>(int n, float * s, size
|
|
|
1993
4015
|
ggml_gemv_iq4_nl_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
|
|
1994
4016
|
}
|
|
1995
4017
|
|
|
4018
|
+
template <> void gemv<block_mxfp4, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
|
|
4019
|
+
ggml_gemv_mxfp4_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
|
|
4020
|
+
}
|
|
4021
|
+
|
|
4022
|
+
template <> void gemv<block_mxfp4, 8, 8, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
|
|
4023
|
+
ggml_gemv_mxfp4_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
|
|
4024
|
+
}
|
|
4025
|
+
|
|
1996
4026
|
template <> void gemv<block_q8_0, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
|
|
1997
4027
|
ggml_gemv_q8_0_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
|
|
1998
4028
|
}
|
|
@@ -2001,6 +4031,28 @@ template <> void gemv<block_q8_0, 8, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t
|
|
|
2001
4031
|
ggml_gemv_q8_0_4x8_q8_0(n, s, bs, vx, vy, nr, nc);
|
|
2002
4032
|
}
|
|
2003
4033
|
|
|
4034
|
+
#if defined __riscv_zvfh
|
|
4035
|
+
template <> void gemv<block_q4_0, 1, 16, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
|
|
4036
|
+
ggml_gemv_q4_0_16x1_q8_0(n, s, bs, vx, vy, nr, nc);
|
|
4037
|
+
}
|
|
4038
|
+
|
|
4039
|
+
template <> void gemv<block_q4_K, 1, 16, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
|
|
4040
|
+
ggml_gemv_q4_K_16x1_q8_K(n, s, bs, vx, vy, nr, nc);
|
|
4041
|
+
}
|
|
4042
|
+
|
|
4043
|
+
template <> void gemv<block_iq4_nl, 1, 16, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
|
|
4044
|
+
ggml_gemv_iq4_nl_16x1_q8_0(n, s, bs, vx, vy, nr, nc);
|
|
4045
|
+
}
|
|
4046
|
+
|
|
4047
|
+
template <> void gemv<block_q8_0, 1, 16, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
|
|
4048
|
+
ggml_gemv_q8_0_16x1_q8_0(n, s, bs, vx, vy, nr, nc);
|
|
4049
|
+
}
|
|
4050
|
+
|
|
4051
|
+
template <> void gemv<block_q2_K, 1, 16, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
|
|
4052
|
+
ggml_gemv_q2_K_16x1_q8_K(n, s, bs, vx, vy, nr, nc);
|
|
4053
|
+
}
|
|
4054
|
+
#endif
|
|
4055
|
+
|
|
2004
4056
|
// gemm
|
|
2005
4057
|
template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PARAM_TYPE>
|
|
2006
4058
|
void gemm(int, float *, size_t, const void *, const void *, int, int);
|
|
@@ -2013,20 +4065,43 @@ template <> void gemm<block_q4_0, 8, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t
|
|
|
2013
4065
|
ggml_gemm_q4_0_4x8_q8_0(n, s, bs, vx, vy, nr, nc);
|
|
2014
4066
|
}
|
|
2015
4067
|
|
|
2016
|
-
template <>
|
|
2017
|
-
|
|
4068
|
+
template <>
|
|
4069
|
+
void gemm<block_q4_0, 8, 8, GGML_TYPE_Q8_0>(int n,
|
|
4070
|
+
float * s,
|
|
4071
|
+
size_t bs,
|
|
4072
|
+
const void * vx,
|
|
4073
|
+
const void * vy,
|
|
4074
|
+
int nr,
|
|
4075
|
+
int nc) {
|
|
4076
|
+
ggml_gemm_q4_0_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
|
|
2018
4077
|
}
|
|
2019
4078
|
|
|
2020
|
-
template <> void gemm<
|
|
2021
|
-
|
|
4079
|
+
template <> void gemm<block_q2_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
|
|
4080
|
+
ggml_gemm_q2_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
|
|
4081
|
+
}
|
|
4082
|
+
|
|
4083
|
+
template <> void gemm<block_q4_K, 4, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
|
|
4084
|
+
ggml_gemm_q4_K_8x4_q8_K(n, s, bs, vx, vy, nr, nc);
|
|
2022
4085
|
}
|
|
2023
4086
|
|
|
2024
4087
|
template <> void gemm<block_q4_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
|
|
2025
4088
|
ggml_gemm_q4_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
|
|
2026
4089
|
}
|
|
2027
4090
|
|
|
2028
|
-
template <> void gemm<
|
|
2029
|
-
|
|
4091
|
+
template <> void gemm<block_q5_K, 4, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
|
|
4092
|
+
ggml_gemm_q5_K_8x4_q8_K(n, s, bs, vx, vy, nr, nc);
|
|
4093
|
+
}
|
|
4094
|
+
|
|
4095
|
+
template <> void gemm<block_q5_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
|
|
4096
|
+
ggml_gemm_q5_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
|
|
4097
|
+
}
|
|
4098
|
+
|
|
4099
|
+
template <> void gemm<block_q6_K, 4, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
|
|
4100
|
+
ggml_gemm_q6_K_8x4_q8_K(n, s, bs, vx, vy, nr, nc);
|
|
4101
|
+
}
|
|
4102
|
+
|
|
4103
|
+
template <> void gemm<block_q6_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
|
|
4104
|
+
ggml_gemm_q6_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
|
|
2030
4105
|
}
|
|
2031
4106
|
|
|
2032
4107
|
template <> void gemm<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
|
|
@@ -2037,6 +4112,14 @@ template <> void gemm<block_iq4_nl, 8, 8, GGML_TYPE_Q8_0>(int n, float * s, size
|
|
|
2037
4112
|
ggml_gemm_iq4_nl_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
|
|
2038
4113
|
}
|
|
2039
4114
|
|
|
4115
|
+
template <> void gemm<block_mxfp4, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
|
|
4116
|
+
ggml_gemm_mxfp4_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
|
|
4117
|
+
}
|
|
4118
|
+
|
|
4119
|
+
template <> void gemm<block_mxfp4, 8, 8, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
|
|
4120
|
+
ggml_gemm_mxfp4_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
|
|
4121
|
+
}
|
|
4122
|
+
|
|
2040
4123
|
template <> void gemm<block_q8_0, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
|
|
2041
4124
|
ggml_gemm_q8_0_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
|
|
2042
4125
|
}
|
|
@@ -2045,6 +4128,28 @@ template <> void gemm<block_q8_0, 8, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t
|
|
|
2045
4128
|
ggml_gemm_q8_0_4x8_q8_0(n, s, bs, vx, vy, nr, nc);
|
|
2046
4129
|
}
|
|
2047
4130
|
|
|
4131
|
+
#if defined __riscv_zvfh
|
|
4132
|
+
template <> void gemm<block_q4_0, 1, 16, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
|
|
4133
|
+
ggml_gemm_q4_0_16x1_q8_0(n, s, bs, vx, vy, nr, nc);
|
|
4134
|
+
}
|
|
4135
|
+
|
|
4136
|
+
template <> void gemm<block_q4_K, 1, 16, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
|
|
4137
|
+
ggml_gemm_q4_K_16x1_q8_K(n, s, bs, vx, vy, nr, nc);
|
|
4138
|
+
}
|
|
4139
|
+
|
|
4140
|
+
template <> void gemm<block_iq4_nl, 1, 16, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
|
|
4141
|
+
ggml_gemm_iq4_nl_16x1_q8_0(n, s, bs, vx, vy, nr, nc);
|
|
4142
|
+
}
|
|
4143
|
+
|
|
4144
|
+
template <> void gemm<block_q8_0, 1, 16, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
|
|
4145
|
+
ggml_gemm_q8_0_16x1_q8_0(n, s, bs, vx, vy, nr, nc);
|
|
4146
|
+
}
|
|
4147
|
+
|
|
4148
|
+
template <> void gemm<block_q2_K, 1, 16, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
|
|
4149
|
+
ggml_gemm_q2_K_16x1_q8_K(n, s, bs, vx, vy, nr, nc);
|
|
4150
|
+
}
|
|
4151
|
+
#endif
|
|
4152
|
+
|
|
2048
4153
|
class tensor_traits_base : public ggml::cpu::tensor_traits {
|
|
2049
4154
|
public:
|
|
2050
4155
|
virtual int repack(struct ggml_tensor * t, const void * data, size_t data_size) = 0;
|
|
@@ -2063,7 +4168,7 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PAR
|
|
|
2063
4168
|
case GGML_OP_MUL_MAT_ID:
|
|
2064
4169
|
{
|
|
2065
4170
|
size = ggml_row_size(PARAM_TYPE, ggml_nelements(op->src[1]));
|
|
2066
|
-
size = GGML_PAD(size, sizeof(int64_t)); // + padding for next
|
|
4171
|
+
size = GGML_PAD(size, sizeof(int64_t)); // + padding for next block.
|
|
2067
4172
|
|
|
2068
4173
|
const int64_t ne02 = op->src[0]->ne[2]; // n_as, n_expert
|
|
2069
4174
|
const int64_t ne12 = op->src[1]->ne[2]; // n_tokens
|
|
@@ -2328,7 +4433,7 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PAR
|
|
|
2328
4433
|
auto * wdata = (char *)params->wdata;
|
|
2329
4434
|
auto * wdata_src1_end = (char *)wdata + GGML_PAD(nbw3, sizeof(int64_t));
|
|
2330
4435
|
|
|
2331
|
-
// total of [n_as][ne12 + 1]
|
|
4436
|
+
// total of [n_as][ne12 + 1] elements of type mmid_row_mapping (2*int32_t = int64_t)
|
|
2332
4437
|
auto * matrix_row_counts = (int64_t *) (wdata_src1_end); // [n_as]
|
|
2333
4438
|
struct mmid_row_mapping * matrix_rows = (struct mmid_row_mapping *) (matrix_row_counts + n_as); // [n_as][ne12]
|
|
2334
4439
|
|
|
@@ -2393,20 +4498,19 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PAR
|
|
|
2393
4498
|
for (int ir1 = 0; ir1 < nr1; ir1++) {
|
|
2394
4499
|
struct mmid_row_mapping row_mapping = MMID_MATRIX_ROW(cur_a, ir1);
|
|
2395
4500
|
|
|
2396
|
-
const int id = row_mapping.i1;
|
|
4501
|
+
const int id = row_mapping.i1; // selected expert index
|
|
2397
4502
|
|
|
2398
4503
|
const int64_t i11 = id % ne11;
|
|
2399
|
-
const int64_t i12 = row_mapping.i2;
|
|
4504
|
+
const int64_t i12 = row_mapping.i2; // row index in src1
|
|
2400
4505
|
|
|
2401
|
-
const int64_t i1 = id;
|
|
2402
|
-
const int64_t i2 = i12;
|
|
4506
|
+
const int64_t i1 = id; // selected expert index
|
|
4507
|
+
const int64_t i2 = i12; // row
|
|
2403
4508
|
|
|
2404
4509
|
const auto * src1_col = (const char *) wdata + (i11 * nbw1 + i12 * nbw2);
|
|
2405
4510
|
|
|
2406
|
-
gemv<BLOC_TYPE, INTER_SIZE, NB_COLS, PARAM_TYPE>(
|
|
2407
|
-
|
|
2408
|
-
|
|
2409
|
-
src1_col, 1, src0_cur_end - src0_cur_start);
|
|
4511
|
+
gemv<BLOC_TYPE, INTER_SIZE, NB_COLS, PARAM_TYPE>(
|
|
4512
|
+
ne00, (float *) ((char *) dst->data + (i1 * nb1 + i2 * nb2)) + src0_cur_start, ne01,
|
|
4513
|
+
src0_cur + src0_cur_start * nb01, src1_col, 1, src0_cur_end - src0_cur_start);
|
|
2410
4514
|
}
|
|
2411
4515
|
}
|
|
2412
4516
|
#undef MMID_MATRIX_ROW
|
|
@@ -2422,7 +4526,6 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PAR
|
|
|
2422
4526
|
} // namespace ggml::cpu::repack
|
|
2423
4527
|
|
|
2424
4528
|
static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(const struct ggml_tensor * cur) {
|
|
2425
|
-
|
|
2426
4529
|
// instance for Q4
|
|
2427
4530
|
static const ggml::cpu::repack::tensor_traits<block_q4_0, 4, 4, GGML_TYPE_Q8_0> q4_0_4x4_q8_0;
|
|
2428
4531
|
static const ggml::cpu::repack::tensor_traits<block_q4_0, 8, 4, GGML_TYPE_Q8_0> q4_0_4x8_q8_0;
|
|
@@ -2432,6 +4535,14 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons
|
|
|
2432
4535
|
static const ggml::cpu::repack::tensor_traits<block_q4_K, 4, 8, GGML_TYPE_Q8_K> q4_K_8x4_q8_K;
|
|
2433
4536
|
static const ggml::cpu::repack::tensor_traits<block_q4_K, 8, 8, GGML_TYPE_Q8_K> q4_K_8x8_q8_K;
|
|
2434
4537
|
|
|
4538
|
+
// instance for Q5_K
|
|
4539
|
+
static const ggml::cpu::repack::tensor_traits<block_q5_K, 4, 8, GGML_TYPE_Q8_K> q5_K_8x4_q8_K;
|
|
4540
|
+
static const ggml::cpu::repack::tensor_traits<block_q5_K, 8, 8, GGML_TYPE_Q8_K> q5_K_8x8_q8_K;
|
|
4541
|
+
|
|
4542
|
+
// instance for Q6_K
|
|
4543
|
+
static const ggml::cpu::repack::tensor_traits<block_q6_K, 4, 8, GGML_TYPE_Q8_K> q6_K_8x4_q8_K;
|
|
4544
|
+
static const ggml::cpu::repack::tensor_traits<block_q6_K, 8, 8, GGML_TYPE_Q8_K> q6_K_8x8_q8_K;
|
|
4545
|
+
|
|
2435
4546
|
// instance for Q2
|
|
2436
4547
|
static const ggml::cpu::repack::tensor_traits<block_q2_K, 8, 8, GGML_TYPE_Q8_K> q2_K_8x8_q8_K;
|
|
2437
4548
|
|
|
@@ -2439,13 +4550,28 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons
|
|
|
2439
4550
|
static const ggml::cpu::repack::tensor_traits<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0> iq4_nl_4x4_q8_0;
|
|
2440
4551
|
static const ggml::cpu::repack::tensor_traits<block_iq4_nl, 8, 8, GGML_TYPE_Q8_0> iq4_nl_8x8_q8_0;
|
|
2441
4552
|
|
|
4553
|
+
// instance for MXFP4
|
|
4554
|
+
static const ggml::cpu::repack::tensor_traits<block_mxfp4, 4, 4, GGML_TYPE_Q8_0> mxfp4_4x4_q8_0;
|
|
4555
|
+
static const ggml::cpu::repack::tensor_traits<block_mxfp4, 8, 8, GGML_TYPE_Q8_0> mxfp4_8x8_q8_0;
|
|
4556
|
+
|
|
2442
4557
|
// instance for Q8_0
|
|
2443
4558
|
static const ggml::cpu::repack::tensor_traits<block_q8_0, 4, 4, GGML_TYPE_Q8_0> q8_0_4x4_q8_0;
|
|
2444
4559
|
static const ggml::cpu::repack::tensor_traits<block_q8_0, 8, 4, GGML_TYPE_Q8_0> q8_0_4x8_q8_0;
|
|
2445
4560
|
|
|
4561
|
+
// instances for RISC-V
|
|
4562
|
+
//
|
|
4563
|
+
// These implement outer-product style matrix multiplication kernels with
|
|
4564
|
+
// an interleave of 1.
|
|
4565
|
+
#if defined __riscv_zvfh
|
|
4566
|
+
static const ggml::cpu::repack::tensor_traits<block_q4_0, 1, 16, GGML_TYPE_Q8_0> q4_0_16x1_q8_0;
|
|
4567
|
+
static const ggml::cpu::repack::tensor_traits<block_q4_K, 1, 16, GGML_TYPE_Q8_K> q4_K_16x1_q8_K;
|
|
4568
|
+
static const ggml::cpu::repack::tensor_traits<block_iq4_nl, 1, 16, GGML_TYPE_Q8_0> iq4_nl_16x1_q8_0;
|
|
4569
|
+
static const ggml::cpu::repack::tensor_traits<block_q8_0, 1, 16, GGML_TYPE_Q8_0> q8_0_16x1_q8_0;
|
|
4570
|
+
static const ggml::cpu::repack::tensor_traits<block_q2_K, 1, 16, GGML_TYPE_Q8_K> q2_K_16x1_q8_K;
|
|
4571
|
+
#endif
|
|
4572
|
+
|
|
2446
4573
|
if (cur->type == GGML_TYPE_Q4_0) {
|
|
2447
|
-
if (ggml_cpu_has_avx2() || (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0)
|
|
2448
|
-
|| (ggml_cpu_has_riscv_v() && (ggml_cpu_get_rvv_vlen() >= QK4_0))) {
|
|
4574
|
+
if (ggml_cpu_has_avx2() || (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0)) {
|
|
2449
4575
|
if (cur->ne[1] % 8 == 0) {
|
|
2450
4576
|
return &q4_0_8x8_q8_0;
|
|
2451
4577
|
}
|
|
@@ -2460,6 +4586,17 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons
|
|
|
2460
4586
|
return &q4_0_4x4_q8_0;
|
|
2461
4587
|
}
|
|
2462
4588
|
}
|
|
4589
|
+
if (ggml_cpu_has_riscv_v()) {
|
|
4590
|
+
#if defined __riscv_zvfh
|
|
4591
|
+
switch (__riscv_vlenb() * 8) {
|
|
4592
|
+
case 128: { break; } // TODO
|
|
4593
|
+
case 256: { if (cur->ne[1] % 16 == 0) { return &q4_0_16x1_q8_0; } break; }
|
|
4594
|
+
case 512: { break; } // TODO
|
|
4595
|
+
case 1024: { break; } // TODO
|
|
4596
|
+
default: { return nullptr; }
|
|
4597
|
+
}
|
|
4598
|
+
#endif
|
|
4599
|
+
}
|
|
2463
4600
|
} else if (cur->type == GGML_TYPE_Q4_K) {
|
|
2464
4601
|
if (ggml_cpu_has_avx2()) {
|
|
2465
4602
|
if (cur->ne[1] % 8 == 0) {
|
|
@@ -2476,12 +4613,56 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons
|
|
|
2476
4613
|
return &q4_K_8x4_q8_K;
|
|
2477
4614
|
}
|
|
2478
4615
|
}
|
|
4616
|
+
if (ggml_cpu_has_riscv_v()) {
|
|
4617
|
+
#if defined __riscv_zvfh
|
|
4618
|
+
switch (__riscv_vlenb() * 8) {
|
|
4619
|
+
case 128: { break; } // TODO
|
|
4620
|
+
case 256: { if (cur->ne[1] % 16 == 0) { return &q4_K_16x1_q8_K; } break; }
|
|
4621
|
+
case 512: { break; } // TODO
|
|
4622
|
+
case 1024: { break; } // TODO
|
|
4623
|
+
default: { return nullptr; }
|
|
4624
|
+
}
|
|
4625
|
+
#endif
|
|
4626
|
+
}
|
|
2479
4627
|
} else if (cur->type == GGML_TYPE_Q2_K) {
|
|
2480
4628
|
if (ggml_cpu_has_avx512()) {
|
|
2481
4629
|
if (cur->ne[1] % 8 == 0) {
|
|
2482
4630
|
return &q2_K_8x8_q8_K;
|
|
2483
4631
|
}
|
|
2484
4632
|
}
|
|
4633
|
+
if (ggml_cpu_has_riscv_v()) {
|
|
4634
|
+
#if defined __riscv_zvfh
|
|
4635
|
+
switch (__riscv_vlenb() * 8) {
|
|
4636
|
+
case 128: { break; } // TODO
|
|
4637
|
+
case 256: { if (cur->ne[1] % 16 == 0) { return &q2_K_16x1_q8_K; } break; }
|
|
4638
|
+
case 512: { break; } // TODO
|
|
4639
|
+
case 1024: { break; } // TODO
|
|
4640
|
+
default: { return nullptr; }
|
|
4641
|
+
}
|
|
4642
|
+
#endif
|
|
4643
|
+
}
|
|
4644
|
+
} else if (cur->type == GGML_TYPE_Q5_K) {
|
|
4645
|
+
if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
|
|
4646
|
+
if (cur->ne[1] % 8 == 0) {
|
|
4647
|
+
return &q5_K_8x8_q8_K;
|
|
4648
|
+
}
|
|
4649
|
+
}
|
|
4650
|
+
if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
|
|
4651
|
+
if (cur->ne[1] % 8 == 0) {
|
|
4652
|
+
return &q5_K_8x4_q8_K;
|
|
4653
|
+
}
|
|
4654
|
+
}
|
|
4655
|
+
} else if (cur->type == GGML_TYPE_Q6_K) {
|
|
4656
|
+
if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
|
|
4657
|
+
if (cur->ne[1] % 8 == 0) {
|
|
4658
|
+
return &q6_K_8x8_q8_K;
|
|
4659
|
+
}
|
|
4660
|
+
}
|
|
4661
|
+
if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
|
|
4662
|
+
if (cur->ne[1] % 8 == 0) {
|
|
4663
|
+
return &q6_K_8x4_q8_K;
|
|
4664
|
+
}
|
|
4665
|
+
}
|
|
2485
4666
|
} else if (cur->type == GGML_TYPE_IQ4_NL) {
|
|
2486
4667
|
if (ggml_cpu_has_avx2()) {
|
|
2487
4668
|
if (cur->ne[1] % 8 == 0) {
|
|
@@ -2493,6 +4674,28 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons
|
|
|
2493
4674
|
return &iq4_nl_4x4_q8_0;
|
|
2494
4675
|
}
|
|
2495
4676
|
}
|
|
4677
|
+
if (ggml_cpu_has_riscv_v()) {
|
|
4678
|
+
#if defined __riscv_zvfh
|
|
4679
|
+
switch (__riscv_vlenb() * 8) {
|
|
4680
|
+
case 128: { break; } // TODO
|
|
4681
|
+
case 256: { if (cur->ne[1] % 16 == 0) { return &iq4_nl_16x1_q8_0; } break; }
|
|
4682
|
+
case 512: { break; } // TODO
|
|
4683
|
+
case 1024: { break; } // TODO
|
|
4684
|
+
default: { return nullptr; }
|
|
4685
|
+
}
|
|
4686
|
+
#endif
|
|
4687
|
+
}
|
|
4688
|
+
} else if (cur->type == GGML_TYPE_MXFP4) {
|
|
4689
|
+
if (ggml_cpu_has_avx2()) {
|
|
4690
|
+
if (cur->ne[1] % 8 == 0) {
|
|
4691
|
+
return &mxfp4_8x8_q8_0;
|
|
4692
|
+
}
|
|
4693
|
+
}
|
|
4694
|
+
if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
|
|
4695
|
+
if (cur->ne[1] % 4 == 0) {
|
|
4696
|
+
return &mxfp4_4x4_q8_0;
|
|
4697
|
+
}
|
|
4698
|
+
}
|
|
2496
4699
|
} else if (cur->type == GGML_TYPE_Q8_0) {
|
|
2497
4700
|
if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
|
|
2498
4701
|
if (cur->ne[1] % 4 == 0) {
|
|
@@ -2504,6 +4707,17 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons
|
|
|
2504
4707
|
return &q8_0_4x4_q8_0;
|
|
2505
4708
|
}
|
|
2506
4709
|
}
|
|
4710
|
+
if (ggml_cpu_has_riscv_v()) {
|
|
4711
|
+
#if defined __riscv_zvfh
|
|
4712
|
+
switch (__riscv_vlenb() * 8) {
|
|
4713
|
+
case 128: { break; } // TODO
|
|
4714
|
+
case 256: { if (cur->ne[1] % 16 == 0) { return &q8_0_16x1_q8_0; } break; }
|
|
4715
|
+
case 512: { break; } // TODO
|
|
4716
|
+
case 1024: { break; } // TODO
|
|
4717
|
+
default: { return nullptr; }
|
|
4718
|
+
}
|
|
4719
|
+
#endif
|
|
4720
|
+
}
|
|
2507
4721
|
}
|
|
2508
4722
|
|
|
2509
4723
|
return nullptr;
|