whispercpp 1.3.6 → 1.3.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.document +3 -0
- data/.rdoc_options +2 -0
- data/README.md +38 -5
- data/Rakefile +18 -3
- data/ext/dependencies.rb +10 -4
- data/ext/dependencies_for_windows.rb +17 -0
- data/ext/extconf.rb +20 -8
- data/ext/options.rb +54 -14
- data/ext/options_for_windows.rb +51 -0
- data/ext/ruby_whisper.c +36 -42
- data/ext/ruby_whisper.h +135 -0
- data/ext/ruby_whisper_context.c +107 -28
- data/ext/ruby_whisper_log_queue.c +180 -0
- data/ext/ruby_whisper_log_settable.h +47 -0
- data/ext/ruby_whisper_parakeet.c +49 -0
- data/ext/ruby_whisper_parakeet_context.c +304 -0
- data/ext/ruby_whisper_parakeet_context_params.c +117 -0
- data/ext/ruby_whisper_parakeet_model.c +84 -0
- data/ext/ruby_whisper_parakeet_params.c +548 -0
- data/ext/ruby_whisper_parakeet_segment.c +157 -0
- data/ext/ruby_whisper_parakeet_token.c +188 -0
- data/ext/ruby_whisper_parakeet_transcribe.cpp +58 -0
- data/ext/ruby_whisper_params.c +256 -65
- data/ext/ruby_whisper_segment.c +6 -6
- data/ext/ruby_whisper_transcribe.cpp +42 -15
- data/ext/sources/CMakeLists.txt +41 -3
- data/ext/sources/CMakePresets.json +95 -0
- data/ext/sources/cmake/parakeet-config.cmake.in +30 -0
- data/ext/sources/cmake/parakeet.pc.in +10 -0
- data/ext/sources/cmake/whisper.pc.in +1 -1
- data/ext/sources/examples/CMakeLists.txt +4 -2
- data/ext/sources/examples/bench/bench.cpp +1 -1
- data/ext/sources/examples/cli/cli.cpp +43 -9
- data/ext/sources/examples/common-ggml.cpp +2 -0
- data/ext/sources/examples/common-whisper.cpp +139 -67
- data/ext/sources/examples/common-whisper.h +11 -0
- data/ext/sources/examples/ffmpeg-transcode.cpp +211 -341
- data/ext/sources/examples/parakeet-cli/CMakeLists.txt +8 -0
- data/ext/sources/examples/parakeet-cli/parakeet-cli.cpp +243 -0
- data/ext/sources/examples/parakeet-quantize/CMakeLists.txt +7 -0
- data/ext/sources/examples/parakeet-quantize/parakeet-quantize.cpp +230 -0
- data/ext/sources/examples/server/server.cpp +199 -163
- data/ext/sources/ggml/CMakeLists.txt +21 -13
- data/ext/sources/ggml/cmake/FindNCCL.cmake +36 -0
- data/ext/sources/ggml/cmake/ggml-config.cmake.in +12 -2
- data/ext/sources/ggml/include/ggml-alloc.h +1 -0
- data/ext/sources/ggml/include/ggml-backend.h +72 -10
- data/ext/sources/ggml/include/ggml-cuda.h +3 -0
- data/ext/sources/ggml/include/ggml-rpc.h +3 -3
- data/ext/sources/ggml/include/ggml.h +101 -9
- data/ext/sources/ggml/include/gguf.h +10 -2
- data/ext/sources/ggml/src/CMakeLists.txt +22 -5
- data/ext/sources/ggml/src/ggml-alloc.c +5 -1
- data/ext/sources/ggml/src/ggml-backend-impl.h +22 -2
- data/ext/sources/ggml/src/ggml-backend-meta.cpp +2263 -0
- data/ext/sources/ggml/src/ggml-backend-reg.cpp +12 -0
- data/ext/sources/ggml/src/ggml-backend.cpp +110 -9
- data/ext/sources/ggml/src/ggml-blas/ggml-blas.cpp +4 -0
- data/ext/sources/ggml/src/ggml-cann/aclnn_ops.cpp +672 -257
- data/ext/sources/ggml/src/ggml-cann/aclnn_ops.h +71 -0
- data/ext/sources/ggml/src/ggml-cann/common.h +20 -10
- data/ext/sources/ggml/src/ggml-cann/ggml-cann.cpp +211 -30
- data/ext/sources/ggml/src/ggml-common.h +11 -0
- data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +58 -29
- data/ext/sources/ggml/src/ggml-cpu/amx/amx.cpp +2 -0
- data/ext/sources/ggml/src/ggml-cpu/amx/mmq.cpp +16 -16
- data/ext/sources/ggml/src/ggml-cpu/arch/arm/quants.c +116 -7
- data/ext/sources/ggml/src/ggml-cpu/arch/arm/repack.cpp +65 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/loongarch/quants.c +151 -1
- data/ext/sources/ggml/src/ggml-cpu/arch/powerpc/quants.c +0 -1
- data/ext/sources/ggml/src/ggml-cpu/arch/riscv/quants.c +4279 -1292
- data/ext/sources/ggml/src/ggml-cpu/arch/riscv/repack.cpp +5 -35
- data/ext/sources/ggml/src/ggml-cpu/arch/s390/quants.c +0 -1
- data/ext/sources/ggml/src/ggml-cpu/arch/wasm/quants.c +72 -1
- data/ext/sources/ggml/src/ggml-cpu/arch/x86/quants.c +177 -27
- data/ext/sources/ggml/src/ggml-cpu/arch/x86/repack.cpp +1 -1
- data/ext/sources/ggml/src/ggml-cpu/arch-fallback.h +5 -0
- data/ext/sources/ggml/src/ggml-cpu/cmake/FindSMTIME.cmake +32 -0
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-impl.h +10 -0
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +95 -5
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.cpp +2 -0
- data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +146 -134
- data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +88 -70
- data/ext/sources/ggml/src/ggml-cpu/ops.cpp +372 -73
- data/ext/sources/ggml/src/ggml-cpu/ops.h +3 -0
- data/ext/sources/ggml/src/ggml-cpu/quants.c +55 -0
- data/ext/sources/ggml/src/ggml-cpu/quants.h +3 -0
- data/ext/sources/ggml/src/ggml-cpu/repack.cpp +3 -0
- data/ext/sources/ggml/src/ggml-cpu/simd-gemm.h +90 -0
- data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +3 -16
- data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.cpp +1402 -687
- data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.h +8 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp +597 -2766
- data/ext/sources/ggml/src/ggml-cpu/spacemit/ime2_kernels.cpp +5768 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/ime_env.cpp +320 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/ime_env.h +55 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/ime_kernels.h +182 -19
- data/ext/sources/ggml/src/ggml-cpu/spacemit/repack.cpp +1795 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/repack.h +14 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/rvv_kernels.cpp +3178 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/rvv_kernels.h +95 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/spine_barrier.h +34 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/spine_mem_pool.cpp +760 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/spine_mem_pool.h +32 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/spine_tcm.h +409 -0
- data/ext/sources/ggml/src/ggml-cpu/vec.cpp +37 -53
- data/ext/sources/ggml/src/ggml-cpu/vec.h +225 -240
- data/ext/sources/ggml/src/ggml-cuda/CMakeLists.txt +17 -7
- data/ext/sources/ggml/src/ggml-cuda/allreduce.cu +971 -0
- data/ext/sources/ggml/src/ggml-cuda/allreduce.cuh +29 -0
- data/ext/sources/ggml/src/ggml-cuda/argsort.cu +62 -26
- data/ext/sources/ggml/src/ggml-cuda/binbcast.cu +44 -18
- data/ext/sources/ggml/src/ggml-cuda/binbcast.cuh +1 -0
- data/ext/sources/ggml/src/ggml-cuda/common.cuh +242 -28
- data/ext/sources/ggml/src/ggml-cuda/concat.cu +120 -114
- data/ext/sources/ggml/src/ggml-cuda/conv2d-transpose.cu +45 -21
- data/ext/sources/ggml/src/ggml-cuda/conv2d-transpose.cuh +1 -0
- data/ext/sources/ggml/src/ggml-cuda/convert.cu +53 -0
- data/ext/sources/ggml/src/ggml-cuda/convert.cuh +10 -0
- data/ext/sources/ggml/src/ggml-cuda/cpy.cu +14 -6
- data/ext/sources/ggml/src/ggml-cuda/dequantize.cuh +22 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +278 -44
- data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +331 -130
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cu +12 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cuh +126 -27
- data/ext/sources/ggml/src/ggml-cuda/fattn-vec.cuh +40 -15
- data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +18 -9
- data/ext/sources/ggml/src/ggml-cuda/fattn.cu +152 -49
- data/ext/sources/ggml/src/ggml-cuda/fattn.cuh +2 -0
- data/ext/sources/ggml/src/ggml-cuda/fwht.cu +101 -0
- data/ext/sources/ggml/src/ggml-cuda/fwht.cuh +4 -0
- data/ext/sources/ggml/src/ggml-cuda/gated_delta_net.cu +84 -35
- data/ext/sources/ggml/src/ggml-cuda/getrows.cu +34 -12
- data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +1069 -609
- data/ext/sources/ggml/src/ggml-cuda/im2col.cu +32 -29
- data/ext/sources/ggml/src/ggml-cuda/mean.cu +4 -2
- data/ext/sources/ggml/src/ggml-cuda/mma.cuh +242 -195
- data/ext/sources/ggml/src/ggml-cuda/mmf.cuh +3 -3
- data/ext/sources/ggml/src/ggml-cuda/mmq.cu +18 -12
- data/ext/sources/ggml/src/ggml-cuda/mmq.cuh +502 -423
- data/ext/sources/ggml/src/ggml-cuda/mmvf.cu +19 -12
- data/ext/sources/ggml/src/ggml-cuda/mmvq.cu +485 -57
- data/ext/sources/ggml/src/ggml-cuda/mmvq.cuh +6 -1
- data/ext/sources/ggml/src/ggml-cuda/norm.cu +36 -10
- data/ext/sources/ggml/src/ggml-cuda/out-prod.cu +23 -7
- data/ext/sources/ggml/src/ggml-cuda/quantize.cu +133 -26
- data/ext/sources/ggml/src/ggml-cuda/quantize.cuh +1 -1
- data/ext/sources/ggml/src/ggml-cuda/reduce_rows.cuh +5 -1
- data/ext/sources/ggml/src/ggml-cuda/rope.cu +11 -4
- data/ext/sources/ggml/src/ggml-cuda/scale.cu +4 -1
- data/ext/sources/ggml/src/ggml-cuda/set-rows.cu +14 -6
- data/ext/sources/ggml/src/ggml-cuda/snake.cu +72 -0
- data/ext/sources/ggml/src/ggml-cuda/snake.cuh +8 -0
- data/ext/sources/ggml/src/ggml-cuda/softcap.cu +4 -1
- data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cu +45 -13
- data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cuh +1 -1
- data/ext/sources/ggml/src/ggml-cuda/ssm-scan.cu +40 -18
- data/ext/sources/ggml/src/ggml-cuda/sumrows.cu +8 -4
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_16.cu +1 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_32.cu +1 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_8.cu +2 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_4.cu +1 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_16.cu +1 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_32.cu +1 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_4.cu +1 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_8.cu +2 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_16.cu +1 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_4.cu +1 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_8.cu +2 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_4.cu +1 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_8.cu +2 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq192-dv128.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq320-dv256.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq512-dv512.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-bf16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-f16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q4_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q4_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q5_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q5_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q8_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-bf16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-bf16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-bf16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-bf16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-bf16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-bf16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-nvfp4.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q1_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/top-k.cu +5 -4
- data/ext/sources/ggml/src/ggml-cuda/topk-moe.cu +26 -23
- data/ext/sources/ggml/src/ggml-cuda/unary.cu +31 -2
- data/ext/sources/ggml/src/ggml-cuda/unary.cuh +2 -0
- data/ext/sources/ggml/src/ggml-cuda/vecdotq.cuh +80 -0
- data/ext/sources/ggml/src/ggml-cuda/vendors/cuda.h +7 -2
- data/ext/sources/ggml/src/ggml-cuda/vendors/hip.h +22 -4
- data/ext/sources/ggml/src/ggml-cuda/vendors/musa.h +3 -0
- data/ext/sources/ggml/src/ggml-hexagon/CMakeLists.txt +2 -1
- data/ext/sources/ggml/src/ggml-hexagon/ggml-hexagon.cpp +1428 -743
- data/ext/sources/ggml/src/ggml-hexagon/htp/CMakeLists.txt +45 -7
- data/ext/sources/ggml/src/ggml-hexagon/htp/act-ops.c +53 -84
- data/ext/sources/ggml/src/ggml-hexagon/htp/argsort-ops.c +25 -12
- data/ext/sources/ggml/src/ggml-hexagon/htp/binary-ops.c +165 -184
- data/ext/sources/ggml/src/ggml-hexagon/htp/cmake-toolchain.cmake +5 -5
- data/ext/sources/ggml/src/ggml-hexagon/htp/concat-ops.c +277 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/cpy-ops.c +170 -127
- data/ext/sources/ggml/src/ggml-hexagon/htp/cumsum-ops.c +270 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/diag-ops.c +216 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/fill-ops.c +123 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/flash-attn-ops.c +125 -97
- data/ext/sources/ggml/src/ggml-hexagon/htp/gated-delta-net-ops.c +1148 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/get-rows-ops.c +148 -42
- data/ext/sources/ggml/src/ggml-hexagon/htp/hex-dma.c +2 -2
- data/ext/sources/ggml/src/ggml-hexagon/htp/hex-dma.h +252 -62
- data/ext/sources/ggml/src/ggml-hexagon/htp/hex-dump.h +9 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hex-utils.h +87 -1
- data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-flash-attn-ops.c +1878 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c +2066 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-ops.c +6 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-ops.h +88 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-profile.h +34 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-queue.c +158 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-queue.h +134 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-utils.h +200 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ctx.h +96 -13
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ops.h +182 -57
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp_iface.idl +9 -3
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-base.h +71 -3
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-copy.h +27 -10
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-div.h +63 -23
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-exp.h +9 -8
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-flash-attn.h +47 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-log.h +65 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-pow.h +42 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-repl.h +74 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sigmoid.h +1 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sin-cos.h +90 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-utils.h +5 -8
- data/ext/sources/ggml/src/ggml-hexagon/htp/main.c +529 -815
- data/ext/sources/ggml/src/ggml-hexagon/htp/matmul-ops.c +2522 -234
- data/ext/sources/ggml/src/ggml-hexagon/htp/pad-ops.c +547 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/repeat-ops.c +148 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/rope-ops.c +291 -95
- data/ext/sources/ggml/src/ggml-hexagon/htp/set-rows-ops.c +59 -37
- data/ext/sources/ggml/src/ggml-hexagon/htp/softmax-ops.c +121 -133
- data/ext/sources/ggml/src/ggml-hexagon/htp/solve-tri-ops.c +267 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/ssm-conv.c +244 -151
- data/ext/sources/ggml/src/ggml-hexagon/htp/sum-rows-ops.c +6 -6
- data/ext/sources/ggml/src/ggml-hexagon/htp/unary-ops.c +719 -45
- data/ext/sources/ggml/src/ggml-hexagon/htp/vtcm-utils.h +16 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp-opnode.h +272 -0
- data/ext/sources/ggml/src/ggml-hexagon/libggml-htp.inf +3 -1
- data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +22 -9
- data/ext/sources/ggml/src/ggml-impl.h +6 -1
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.cpp +138 -13
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.h +32 -1
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.m +164 -28
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +80 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.cpp +190 -19
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.h +2 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal.cpp +39 -26
- data/ext/sources/ggml/src/ggml-metal/ggml-metal.metal +823 -322
- data/ext/sources/ggml/src/ggml-musa/CMakeLists.txt +5 -6
- data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +54 -5
- data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +12248 -5907
- data/ext/sources/ggml/src/ggml-opencl/kernels/concat.cl +67 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/cpy.cl +59 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/cvt.cl +1819 -112
- data/ext/sources/ggml/src/ggml-opencl/kernels/gated_delta_net.cl +249 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_mxfp4_f32_ns.cl +306 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q4_0_f32_ns.cl +256 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q4_1_f32_ns.cl +258 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q4_k_f32_ns.cl +283 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q5_0_f32_ns.cl +260 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q5_1_f32_ns.cl +262 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q5_k_f32_ns.cl +288 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q6_k_f32_ns.cl +267 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_iq4_nl_f32.cl +150 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/{mul_mat_Ab_Bi_8x4.cl → gemm_noshuffle_q4_0_f32.cl} +1 -1
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q4_k_f32.cl +172 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q5_0_f32.cl +131 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q5_1_f32.cl +134 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q5_k_f32.cl +176 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q6_k_f32.cl +140 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/{mul_mm_q8_0_f32_8x4.cl → gemm_noshuffle_q8_0_f32.cl} +1 -1
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_xmem_f16_f32_os8.cl +233 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_mxfp4_f32_ns.cl +165 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q4_0_f32_ns.cl +120 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q4_1_f32_ns.cl +123 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q4_k_f32_ns.cl +155 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q5_0_f32_ns.cl +123 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q5_1_f32_ns.cl +125 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q5_k_f32_ns.cl +160 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q6_k_f32_ns.cl +141 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_iq4_nl_f32.cl +302 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/{gemv_noshuffle_general.cl → gemv_noshuffle_q4_0_f32.cl} +5 -5
- data/ext/sources/ggml/src/ggml-opencl/kernels/{gemv_noshuffle.cl → gemv_noshuffle_q4_0_f32_spec.cl} +5 -5
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q4_k_f32.cl +318 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q5_0_f32.cl +291 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q5_1_f32.cl +294 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q5_k_f32.cl +326 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q6_k_f32.cl +293 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/get_rows.cl +15 -9
- data/ext/sources/ggml/src/ggml-opencl/kernels/moe_reorder_b.cl +30 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/moe_sort_by_expert.cl +82 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_iq4_nl_f32_l4_lm.cl +171 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q4_k_f32_l4_lm.cl +179 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q5_0_f32_l4_lm.cl +173 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q5_1_f32_l4_lm.cl +175 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q5_k_f32_l4_lm.cl +192 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_iq4_nl_f32.cl +164 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_iq4_nl_f32_flat.cl +202 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_k_f32_flat.cl +196 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_0_f32.cl +241 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_0_f32_flat.cl +243 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_1_f32.cl +243 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_1_f32_flat.cl +247 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_k_f32.cl +187 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_k_f32_flat.cl +203 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32_flat.cl +48 -64
- data/ext/sources/ggml/src/ggml-openvino/ggml-decoder.cpp +15 -5
- data/ext/sources/ggml/src/ggml-openvino/ggml-openvino-extra.cpp +18 -11
- data/ext/sources/ggml/src/ggml-openvino/ggml-openvino.cpp +35 -13
- data/ext/sources/ggml/src/ggml-openvino/ggml-quants.cpp +264 -192
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/rope.cpp +33 -7
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/unary_gelu.cpp +25 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op_table.cpp +1 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op_table.h +1 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/rt_info/weightless_caching_attributes.hpp +41 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/translate_session.cpp +27 -3
- data/ext/sources/ggml/src/ggml-openvino/openvino/utils.cpp +67 -36
- data/ext/sources/ggml/src/ggml-openvino/openvino/utils.h +1 -0
- data/ext/sources/ggml/src/ggml-openvino/utils.cpp +101 -44
- data/ext/sources/ggml/src/ggml-openvino/utils.h +23 -3
- data/ext/sources/ggml/src/ggml-opt.cpp +1 -0
- data/ext/sources/ggml/src/ggml-quants.c +289 -114
- data/ext/sources/ggml/src/ggml-quants.h +3 -0
- data/ext/sources/ggml/src/ggml-rpc/CMakeLists.txt +24 -0
- data/ext/sources/ggml/src/ggml-rpc/ggml-rpc.cpp +167 -311
- data/ext/sources/ggml/src/ggml-rpc/transport.cpp +683 -0
- data/ext/sources/ggml/src/ggml-rpc/transport.h +34 -0
- data/ext/sources/ggml/src/ggml-sycl/CMakeLists.txt +50 -4
- data/ext/sources/ggml/src/ggml-sycl/add-id.cpp +1 -1
- data/ext/sources/ggml/src/ggml-sycl/backend.hpp +3 -1
- data/ext/sources/ggml/src/ggml-sycl/common.cpp +74 -2
- data/ext/sources/ggml/src/ggml-sycl/common.hpp +41 -1
- data/ext/sources/ggml/src/ggml-sycl/convert.cpp +115 -13
- data/ext/sources/ggml/src/ggml-sycl/convert.hpp +9 -0
- data/ext/sources/ggml/src/ggml-sycl/cumsum.cpp +148 -0
- data/ext/sources/ggml/src/ggml-sycl/cumsum.hpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/dequantize.hpp +663 -0
- data/ext/sources/ggml/src/ggml-sycl/diag.cpp +67 -0
- data/ext/sources/ggml/src/ggml-sycl/diag.hpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/dmmv.cpp +586 -6
- data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +1 -90
- data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +0 -2
- data/ext/sources/ggml/src/ggml-sycl/fattn-buffers.cpp +56 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn-buffers.hpp +63 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn-common.hpp +7 -5
- data/ext/sources/ggml/src/ggml-sycl/fattn-tile.cpp +4 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn-tile.hpp +76 -168
- data/ext/sources/ggml/src/ggml-sycl/fattn-vec.hpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn.cpp +3 -1
- data/ext/sources/ggml/src/ggml-sycl/fill.cpp +55 -0
- data/ext/sources/ggml/src/ggml-sycl/fill.hpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/gated_delta_net.cpp +69 -31
- data/ext/sources/ggml/src/ggml-sycl/gated_delta_net.hpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/gemm.hpp +3 -0
- data/ext/sources/ggml/src/ggml-sycl/getrows.cpp +79 -3
- data/ext/sources/ggml/src/ggml-sycl/ggml-sycl.cpp +823 -190
- data/ext/sources/ggml/src/ggml-sycl/im2col.cpp +353 -89
- data/ext/sources/ggml/src/ggml-sycl/im2col.hpp +5 -3
- data/ext/sources/ggml/src/ggml-sycl/mmvq.cpp +1344 -26
- data/ext/sources/ggml/src/ggml-sycl/mmvq.hpp +16 -0
- data/ext/sources/ggml/src/ggml-sycl/pad.cpp +27 -27
- data/ext/sources/ggml/src/ggml-sycl/quants.hpp +71 -0
- data/ext/sources/ggml/src/ggml-sycl/set_rows.cpp +7 -1
- data/ext/sources/ggml/src/ggml-sycl/solve_tri.cpp +172 -0
- data/ext/sources/ggml/src/ggml-sycl/solve_tri.hpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/ssm_conv.cpp +6 -1
- data/ext/sources/ggml/src/ggml-sycl/ssm_scan.cpp +156 -0
- data/ext/sources/ggml/src/ggml-sycl/ssm_scan.hpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/sycl_hw.cpp +62 -10
- data/ext/sources/ggml/src/ggml-sycl/sycl_hw.hpp +18 -6
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq512-dv512.cpp +6 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-f16.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_0.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_1.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_0.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_1.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q8_0.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-f16.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_0.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_1.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_0.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_1.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q8_0.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-f16.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_0.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_1.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_0.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_1.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q8_0.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-f16.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_0.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_1.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_0.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_1.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q8_0.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-f16.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_0.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_1.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_0.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_1.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q8_0.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-f16.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_0.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_1.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_0.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_1.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q8_0.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/type.hpp +112 -0
- data/ext/sources/ggml/src/ggml-sycl/upscale.cpp +410 -0
- data/ext/sources/ggml/src/ggml-sycl/upscale.hpp +9 -0
- data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +215 -53
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-buffer.cpp +4 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-device.cpp +2 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend.cpp +2 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-shm.cpp +1 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu.cpp +1 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu.h +0 -2
- data/ext/sources/ggml/src/ggml-vulkan/CMakeLists.txt +11 -0
- data/ext/sources/ggml/src/ggml-vulkan/ggml-vulkan.cpp +2060 -535
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/contig_copy.comp +6 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +146 -13
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy.comp +3 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +25 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.glsl +88 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.glsl +643 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_nvfp4.comp +32 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q1_0.comp +29 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/diag.comp +0 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dot_product_funcs.glsl +27 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/exp.comp +0 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/feature-tests/coopmat2_decode_vector.comp +7 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +197 -48
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.glsl +60 -59
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +115 -113
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +122 -31
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_dequant.glsl +131 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_mmq_funcs.glsl +203 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/fwht.comp +115 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gated_delta_net.comp +125 -64
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/generic_binary_head.glsl +0 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.glsl +10 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_main.glsl +16 -6
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +76 -54
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col_3d.comp +0 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/log.comp +0 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp +122 -27
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iface.glsl +6 -6
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp +1 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq_funcs.glsl +88 -55
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +11 -17
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +43 -10
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.glsl +159 -125
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.glsl +8 -8
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_shmem_types.glsl +24 -9
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp +0 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl +5 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.glsl +0 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl +3 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/snake.comp +49 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/ssm_conv.comp +11 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/tri.comp +0 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/types.glsl +79 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +171 -147
- data/ext/sources/ggml/src/ggml-webgpu/CMakeLists.txt +5 -2
- data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp +2202 -283
- data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu.cpp +2610 -1403
- data/ext/sources/ggml/src/ggml-webgpu/pre_wgsl.hpp +37 -7
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/add_id.wgsl +64 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary.wgsl +8 -7
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +76 -95
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/concat.wgsl +19 -1
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/conv2d.wgsl +165 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{cpy.tmpl.wgsl → cpy.wgsl} +25 -50
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn.wgsl +107 -184
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_quant_staging.tmpl +124 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_tile.wgsl +397 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_vec_blk.wgsl +101 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_vec_reduce.wgsl +84 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_vec_split.wgsl +619 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/gated_delta_net.wgsl +149 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/get_rows.wgsl +183 -78
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/glu.wgsl +155 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/im2col.wgsl +101 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl +655 -495
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_id.wgsl +195 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_id_gather.wgsl +52 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_id_vec.wgsl +154 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.wgsl +8 -6
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_subgroup_matrix.wgsl +5 -1
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.wgsl +80 -409
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec_acc.tmpl +1432 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec_q_acc.tmpl +303 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/quant_inner_loops.tmpl +21 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/quantize_q8.wgsl +173 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm_mul.wgsl +152 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{rope.tmpl.wgsl → rope.wgsl} +71 -142
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/row_norm.wgsl +153 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/scale.wgsl +6 -4
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set.wgsl +109 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +2 -3
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows_quant.wgsl +224 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{soft_max.tmpl.wgsl → soft_max.wgsl} +106 -206
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/solve_tri.wgsl +121 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/ssm_conv.wgsl +65 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/ssm_scan.wgsl +193 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/unary.wgsl +68 -48
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/upscale.wgsl +240 -0
- data/ext/sources/ggml/src/ggml-zdnn/ggml-zdnn.cpp +18 -14
- data/ext/sources/ggml/src/ggml-zendnn/CMakeLists.txt +1 -1
- data/ext/sources/ggml/src/ggml-zendnn/ggml-zendnn.cpp +244 -10
- data/ext/sources/ggml/src/ggml.c +110 -28
- data/ext/sources/ggml/src/gguf.cpp +173 -28
- data/ext/sources/include/parakeet.h +342 -0
- data/ext/sources/include/whisper.h +10 -0
- data/ext/sources/media/matmul.png +0 -0
- data/ext/sources/src/CMakeLists.txt +23 -0
- data/ext/sources/src/parakeet-arch.h +188 -0
- data/ext/sources/src/parakeet.cpp +3838 -0
- data/ext/sources/src/whisper.cpp +56 -12
- data/extsources.rb +26 -10
- data/lib/whisper/log_settable.rb +36 -0
- data/lib/whisper/model/uri.rb +13 -1
- data/lib/whisper/output.rb +74 -0
- data/sig/whisper.rbs +411 -62
- data/test/helper.rb +2 -0
- data/test/jfk_reader/jfk_reader.c +50 -7
- data/test/test_callback.rb +1 -0
- data/test/test_package.rb +6 -5
- data/test/test_parakeet.rb +28 -0
- data/test/test_parakeet_callback.rb +107 -0
- data/test/test_parakeet_context.rb +116 -0
- data/test/test_parakeet_context_params.rb +24 -0
- data/test/test_parakeet_model.rb +21 -0
- data/test/test_parakeet_params.rb +78 -0
- data/test/test_parakeet_segment.rb +42 -0
- data/test/test_parakeet_token.rb +73 -0
- data/test/test_params.rb +2 -0
- data/test/test_vad_segment.rb +1 -1
- data/test/test_whisper.rb +24 -6
- data/whispercpp.gemspec +2 -2
- metadata +215 -281
- data/ext/sources/bindings/javascript/CMakeLists.txt +0 -41
- data/ext/sources/bindings/javascript/emscripten.cpp +0 -93
- data/ext/sources/bindings/javascript/libwhisper.worker.js +0 -1
- data/ext/sources/bindings/javascript/package.json +0 -26
- data/ext/sources/bindings/javascript/whisper.js +0 -19
- data/ext/sources/examples/addon.node/CMakeLists.txt +0 -31
- data/ext/sources/examples/addon.node/__test__/whisper.spec.js +0 -133
- data/ext/sources/examples/addon.node/addon.cpp +0 -557
- data/ext/sources/examples/addon.node/index.js +0 -59
- data/ext/sources/examples/addon.node/package.json +0 -16
- data/ext/sources/examples/addon.node/vad-example.js +0 -132
- data/ext/sources/examples/bench.wasm/CMakeLists.txt +0 -49
- data/ext/sources/examples/bench.wasm/emscripten.cpp +0 -87
- data/ext/sources/examples/bench.wasm/index-tmpl.html +0 -285
- data/ext/sources/examples/coi-serviceworker.js +0 -146
- data/ext/sources/examples/command/CMakeLists.txt +0 -10
- data/ext/sources/examples/command/command.cpp +0 -802
- data/ext/sources/examples/command/commands.txt +0 -9
- data/ext/sources/examples/command.wasm/CMakeLists.txt +0 -50
- data/ext/sources/examples/command.wasm/emscripten.cpp +0 -327
- data/ext/sources/examples/command.wasm/index-tmpl.html +0 -415
- data/ext/sources/examples/generate-karaoke.sh +0 -57
- data/ext/sources/examples/helpers.js +0 -191
- data/ext/sources/examples/livestream.sh +0 -112
- data/ext/sources/examples/lsp/CMakeLists.txt +0 -10
- data/ext/sources/examples/lsp/lsp.cpp +0 -471
- data/ext/sources/examples/lsp/whisper.vim +0 -362
- data/ext/sources/examples/python/test_whisper_processor.py +0 -7
- data/ext/sources/examples/python/whisper_processor.py +0 -54
- data/ext/sources/examples/server/bench.js +0 -29
- data/ext/sources/examples/server.py +0 -120
- data/ext/sources/examples/stream/CMakeLists.txt +0 -10
- data/ext/sources/examples/stream/stream.cpp +0 -437
- data/ext/sources/examples/stream.wasm/CMakeLists.txt +0 -49
- data/ext/sources/examples/stream.wasm/emscripten.cpp +0 -216
- data/ext/sources/examples/stream.wasm/index-tmpl.html +0 -491
- data/ext/sources/examples/sycl/CMakeLists.txt +0 -9
- data/ext/sources/examples/sycl/build.sh +0 -22
- data/ext/sources/examples/sycl/ls-sycl-device.cpp +0 -11
- data/ext/sources/examples/sycl/run-whisper.sh +0 -17
- data/ext/sources/examples/talk-llama/CMakeLists.txt +0 -48
- data/ext/sources/examples/talk-llama/eleven-labs.py +0 -80
- data/ext/sources/examples/talk-llama/llama-adapter.cpp +0 -488
- data/ext/sources/examples/talk-llama/llama-adapter.h +0 -89
- data/ext/sources/examples/talk-llama/llama-arch.cpp +0 -2877
- data/ext/sources/examples/talk-llama/llama-arch.h +0 -628
- data/ext/sources/examples/talk-llama/llama-batch.cpp +0 -919
- data/ext/sources/examples/talk-llama/llama-batch.h +0 -173
- data/ext/sources/examples/talk-llama/llama-chat.cpp +0 -896
- data/ext/sources/examples/talk-llama/llama-chat.h +0 -71
- data/ext/sources/examples/talk-llama/llama-context.cpp +0 -3633
- data/ext/sources/examples/talk-llama/llama-context.h +0 -359
- data/ext/sources/examples/talk-llama/llama-cparams.cpp +0 -5
- data/ext/sources/examples/talk-llama/llama-cparams.h +0 -47
- data/ext/sources/examples/talk-llama/llama-ext.h +0 -12
- data/ext/sources/examples/talk-llama/llama-grammar.cpp +0 -1464
- data/ext/sources/examples/talk-llama/llama-grammar.h +0 -194
- data/ext/sources/examples/talk-llama/llama-graph.cpp +0 -2735
- data/ext/sources/examples/talk-llama/llama-graph.h +0 -1031
- data/ext/sources/examples/talk-llama/llama-hparams.cpp +0 -258
- data/ext/sources/examples/talk-llama/llama-hparams.h +0 -353
- data/ext/sources/examples/talk-llama/llama-impl.cpp +0 -171
- data/ext/sources/examples/talk-llama/llama-impl.h +0 -75
- data/ext/sources/examples/talk-llama/llama-io.cpp +0 -15
- data/ext/sources/examples/talk-llama/llama-io.h +0 -35
- data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.cpp +0 -330
- data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.h +0 -137
- data/ext/sources/examples/talk-llama/llama-kv-cache.cpp +0 -2285
- data/ext/sources/examples/talk-llama/llama-kv-cache.h +0 -389
- data/ext/sources/examples/talk-llama/llama-kv-cells.h +0 -533
- data/ext/sources/examples/talk-llama/llama-memory-hybrid-iswa.cpp +0 -275
- data/ext/sources/examples/talk-llama/llama-memory-hybrid-iswa.h +0 -140
- data/ext/sources/examples/talk-llama/llama-memory-hybrid.cpp +0 -268
- data/ext/sources/examples/talk-llama/llama-memory-hybrid.h +0 -139
- data/ext/sources/examples/talk-llama/llama-memory-recurrent.cpp +0 -1165
- data/ext/sources/examples/talk-llama/llama-memory-recurrent.h +0 -182
- data/ext/sources/examples/talk-llama/llama-memory.cpp +0 -59
- data/ext/sources/examples/talk-llama/llama-memory.h +0 -122
- data/ext/sources/examples/talk-llama/llama-mmap.cpp +0 -752
- data/ext/sources/examples/talk-llama/llama-mmap.h +0 -73
- data/ext/sources/examples/talk-llama/llama-model-loader.cpp +0 -1655
- data/ext/sources/examples/talk-llama/llama-model-loader.h +0 -206
- data/ext/sources/examples/talk-llama/llama-model-saver.cpp +0 -299
- data/ext/sources/examples/talk-llama/llama-model-saver.h +0 -40
- data/ext/sources/examples/talk-llama/llama-model.cpp +0 -9056
- data/ext/sources/examples/talk-llama/llama-model.h +0 -597
- data/ext/sources/examples/talk-llama/llama-quant.cpp +0 -1304
- data/ext/sources/examples/talk-llama/llama-quant.h +0 -1
- data/ext/sources/examples/talk-llama/llama-sampler.cpp +0 -3885
- data/ext/sources/examples/talk-llama/llama-sampler.h +0 -42
- data/ext/sources/examples/talk-llama/llama-vocab.cpp +0 -3970
- data/ext/sources/examples/talk-llama/llama-vocab.h +0 -187
- data/ext/sources/examples/talk-llama/llama.cpp +0 -1194
- data/ext/sources/examples/talk-llama/llama.h +0 -1573
- data/ext/sources/examples/talk-llama/models/afmoe.cpp +0 -190
- data/ext/sources/examples/talk-llama/models/apertus.cpp +0 -125
- data/ext/sources/examples/talk-llama/models/arcee.cpp +0 -135
- data/ext/sources/examples/talk-llama/models/arctic.cpp +0 -137
- data/ext/sources/examples/talk-llama/models/arwkv7.cpp +0 -86
- data/ext/sources/examples/talk-llama/models/baichuan.cpp +0 -123
- data/ext/sources/examples/talk-llama/models/bailingmoe.cpp +0 -143
- data/ext/sources/examples/talk-llama/models/bailingmoe2.cpp +0 -133
- data/ext/sources/examples/talk-llama/models/bert.cpp +0 -184
- data/ext/sources/examples/talk-llama/models/bitnet.cpp +0 -145
- data/ext/sources/examples/talk-llama/models/bloom.cpp +0 -101
- data/ext/sources/examples/talk-llama/models/chameleon.cpp +0 -178
- data/ext/sources/examples/talk-llama/models/chatglm.cpp +0 -132
- data/ext/sources/examples/talk-llama/models/codeshell.cpp +0 -111
- data/ext/sources/examples/talk-llama/models/cogvlm.cpp +0 -102
- data/ext/sources/examples/talk-llama/models/cohere2-iswa.cpp +0 -134
- data/ext/sources/examples/talk-llama/models/command-r.cpp +0 -122
- data/ext/sources/examples/talk-llama/models/dbrx.cpp +0 -122
- data/ext/sources/examples/talk-llama/models/deci.cpp +0 -135
- data/ext/sources/examples/talk-llama/models/deepseek.cpp +0 -142
- data/ext/sources/examples/talk-llama/models/deepseek2.cpp +0 -262
- data/ext/sources/examples/talk-llama/models/delta-net-base.cpp +0 -445
- data/ext/sources/examples/talk-llama/models/dots1.cpp +0 -132
- data/ext/sources/examples/talk-llama/models/dream.cpp +0 -105
- data/ext/sources/examples/talk-llama/models/ernie4-5-moe.cpp +0 -148
- data/ext/sources/examples/talk-llama/models/ernie4-5.cpp +0 -110
- data/ext/sources/examples/talk-llama/models/eurobert.cpp +0 -97
- data/ext/sources/examples/talk-llama/models/exaone-moe.cpp +0 -145
- data/ext/sources/examples/talk-llama/models/exaone.cpp +0 -114
- data/ext/sources/examples/talk-llama/models/exaone4.cpp +0 -123
- data/ext/sources/examples/talk-llama/models/falcon-h1.cpp +0 -111
- data/ext/sources/examples/talk-llama/models/falcon.cpp +0 -120
- data/ext/sources/examples/talk-llama/models/gemma-embedding.cpp +0 -116
- data/ext/sources/examples/talk-llama/models/gemma.cpp +0 -112
- data/ext/sources/examples/talk-llama/models/gemma2-iswa.cpp +0 -128
- data/ext/sources/examples/talk-llama/models/gemma3.cpp +0 -155
- data/ext/sources/examples/talk-llama/models/gemma3n-iswa.cpp +0 -384
- data/ext/sources/examples/talk-llama/models/glm4-moe.cpp +0 -170
- data/ext/sources/examples/talk-llama/models/glm4.cpp +0 -157
- data/ext/sources/examples/talk-llama/models/gpt2.cpp +0 -105
- data/ext/sources/examples/talk-llama/models/gptneox.cpp +0 -144
- data/ext/sources/examples/talk-llama/models/granite-hybrid.cpp +0 -195
- data/ext/sources/examples/talk-llama/models/granite.cpp +0 -210
- data/ext/sources/examples/talk-llama/models/grok.cpp +0 -159
- data/ext/sources/examples/talk-llama/models/grovemoe.cpp +0 -139
- data/ext/sources/examples/talk-llama/models/hunyuan-dense.cpp +0 -132
- data/ext/sources/examples/talk-llama/models/hunyuan-moe.cpp +0 -153
- data/ext/sources/examples/talk-llama/models/internlm2.cpp +0 -120
- data/ext/sources/examples/talk-llama/models/jais.cpp +0 -86
- data/ext/sources/examples/talk-llama/models/jais2.cpp +0 -123
- data/ext/sources/examples/talk-llama/models/jamba.cpp +0 -106
- data/ext/sources/examples/talk-llama/models/kimi-linear.cpp +0 -381
- data/ext/sources/examples/talk-llama/models/lfm2.cpp +0 -196
- data/ext/sources/examples/talk-llama/models/llada-moe.cpp +0 -122
- data/ext/sources/examples/talk-llama/models/llada.cpp +0 -99
- data/ext/sources/examples/talk-llama/models/llama-iswa.cpp +0 -178
- data/ext/sources/examples/talk-llama/models/llama.cpp +0 -175
- data/ext/sources/examples/talk-llama/models/maincoder.cpp +0 -117
- data/ext/sources/examples/talk-llama/models/mamba-base.cpp +0 -289
- data/ext/sources/examples/talk-llama/models/mamba.cpp +0 -54
- data/ext/sources/examples/talk-llama/models/mimo2-iswa.cpp +0 -129
- data/ext/sources/examples/talk-llama/models/minicpm3.cpp +0 -200
- data/ext/sources/examples/talk-llama/models/minimax-m2.cpp +0 -123
- data/ext/sources/examples/talk-llama/models/mistral3.cpp +0 -160
- data/ext/sources/examples/talk-llama/models/models.h +0 -704
- data/ext/sources/examples/talk-llama/models/modern-bert.cpp +0 -109
- data/ext/sources/examples/talk-llama/models/mpt.cpp +0 -126
- data/ext/sources/examples/talk-llama/models/nemotron-h.cpp +0 -162
- data/ext/sources/examples/talk-llama/models/nemotron.cpp +0 -122
- data/ext/sources/examples/talk-llama/models/neo-bert.cpp +0 -104
- data/ext/sources/examples/talk-llama/models/olmo.cpp +0 -121
- data/ext/sources/examples/talk-llama/models/olmo2.cpp +0 -150
- data/ext/sources/examples/talk-llama/models/olmoe.cpp +0 -124
- data/ext/sources/examples/talk-llama/models/openai-moe-iswa.cpp +0 -127
- data/ext/sources/examples/talk-llama/models/openelm.cpp +0 -124
- data/ext/sources/examples/talk-llama/models/orion.cpp +0 -123
- data/ext/sources/examples/talk-llama/models/paddleocr.cpp +0 -122
- data/ext/sources/examples/talk-llama/models/pangu-embedded.cpp +0 -121
- data/ext/sources/examples/talk-llama/models/phi2.cpp +0 -121
- data/ext/sources/examples/talk-llama/models/phi3.cpp +0 -152
- data/ext/sources/examples/talk-llama/models/plamo.cpp +0 -110
- data/ext/sources/examples/talk-llama/models/plamo2.cpp +0 -320
- data/ext/sources/examples/talk-llama/models/plamo3.cpp +0 -128
- data/ext/sources/examples/talk-llama/models/plm.cpp +0 -169
- data/ext/sources/examples/talk-llama/models/qwen.cpp +0 -108
- data/ext/sources/examples/talk-llama/models/qwen2.cpp +0 -126
- data/ext/sources/examples/talk-llama/models/qwen2moe.cpp +0 -151
- data/ext/sources/examples/talk-llama/models/qwen2vl.cpp +0 -117
- data/ext/sources/examples/talk-llama/models/qwen3.cpp +0 -120
- data/ext/sources/examples/talk-llama/models/qwen35.cpp +0 -381
- data/ext/sources/examples/talk-llama/models/qwen35moe.cpp +0 -422
- data/ext/sources/examples/talk-llama/models/qwen3moe.cpp +0 -131
- data/ext/sources/examples/talk-llama/models/qwen3next.cpp +0 -525
- data/ext/sources/examples/talk-llama/models/qwen3vl-moe.cpp +0 -140
- data/ext/sources/examples/talk-llama/models/qwen3vl.cpp +0 -132
- data/ext/sources/examples/talk-llama/models/refact.cpp +0 -94
- data/ext/sources/examples/talk-llama/models/rnd1.cpp +0 -126
- data/ext/sources/examples/talk-llama/models/rwkv6-base.cpp +0 -164
- data/ext/sources/examples/talk-llama/models/rwkv6.cpp +0 -94
- data/ext/sources/examples/talk-llama/models/rwkv6qwen2.cpp +0 -86
- data/ext/sources/examples/talk-llama/models/rwkv7-base.cpp +0 -137
- data/ext/sources/examples/talk-llama/models/rwkv7.cpp +0 -90
- data/ext/sources/examples/talk-llama/models/seed-oss.cpp +0 -124
- data/ext/sources/examples/talk-llama/models/smallthinker.cpp +0 -126
- data/ext/sources/examples/talk-llama/models/smollm3.cpp +0 -128
- data/ext/sources/examples/talk-llama/models/stablelm.cpp +0 -146
- data/ext/sources/examples/talk-llama/models/starcoder.cpp +0 -100
- data/ext/sources/examples/talk-llama/models/starcoder2.cpp +0 -121
- data/ext/sources/examples/talk-llama/models/step35-iswa.cpp +0 -165
- data/ext/sources/examples/talk-llama/models/t5-dec.cpp +0 -166
- data/ext/sources/examples/talk-llama/models/t5-enc.cpp +0 -96
- data/ext/sources/examples/talk-llama/models/wavtokenizer-dec.cpp +0 -149
- data/ext/sources/examples/talk-llama/models/xverse.cpp +0 -108
- data/ext/sources/examples/talk-llama/prompts/talk-alpaca.txt +0 -23
- data/ext/sources/examples/talk-llama/speak +0 -40
- data/ext/sources/examples/talk-llama/speak.bat +0 -1
- data/ext/sources/examples/talk-llama/speak.ps1 +0 -14
- data/ext/sources/examples/talk-llama/talk-llama.cpp +0 -813
- data/ext/sources/examples/talk-llama/unicode-data.cpp +0 -7034
- data/ext/sources/examples/talk-llama/unicode-data.h +0 -20
- data/ext/sources/examples/talk-llama/unicode.cpp +0 -1103
- data/ext/sources/examples/talk-llama/unicode.h +0 -111
- data/ext/sources/examples/wchess/CMakeLists.txt +0 -10
- data/ext/sources/examples/wchess/libwchess/CMakeLists.txt +0 -19
- data/ext/sources/examples/wchess/libwchess/Chessboard.cpp +0 -803
- data/ext/sources/examples/wchess/libwchess/Chessboard.h +0 -33
- data/ext/sources/examples/wchess/libwchess/WChess.cpp +0 -193
- data/ext/sources/examples/wchess/libwchess/WChess.h +0 -63
- data/ext/sources/examples/wchess/libwchess/test-chessboard.cpp +0 -117
- data/ext/sources/examples/wchess/wchess.cmd/CMakeLists.txt +0 -8
- data/ext/sources/examples/wchess/wchess.cmd/wchess.cmd.cpp +0 -253
- data/ext/sources/examples/whisper.wasm/CMakeLists.txt +0 -50
- data/ext/sources/examples/whisper.wasm/emscripten.cpp +0 -118
- data/ext/sources/examples/whisper.wasm/index-tmpl.html +0 -659
- data/ext/sources/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +0 -99
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp-msg.h +0 -155
- data/ext/sources/ggml/src/ggml-hexagon/op-desc.h +0 -153
- data/ext/sources/ggml/src/ggml-opencl/kernels/embed_kernel.py +0 -26
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.cpp +0 -123
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.h +0 -17
- data/ext/sources/ggml/src/ggml-virtgpu/regenerate_remoting.py +0 -333
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rte.glsl +0 -5
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +0 -182
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/glu.tmpl.wgsl +0 -323
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl +0 -718
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm.wgsl +0 -123
- data/ext/sources/tests/CMakeLists.txt +0 -112
- data/ext/sources/tests/earnings21/eval.mk +0 -58
- data/ext/sources/tests/earnings21/eval.py +0 -68
- data/ext/sources/tests/earnings21/normalizers/__init__.py +0 -2
- data/ext/sources/tests/earnings21/normalizers/basic.py +0 -80
- data/ext/sources/tests/earnings21/normalizers/english.json +0 -1741
- data/ext/sources/tests/earnings21/normalizers/english.py +0 -550
- data/ext/sources/tests/earnings21/requirements.txt +0 -6
- data/ext/sources/tests/en-0-ref.txt +0 -1
- data/ext/sources/tests/en-1-ref.txt +0 -1
- data/ext/sources/tests/en-2-ref.txt +0 -1
- data/ext/sources/tests/es-0-ref.txt +0 -1
- data/ext/sources/tests/librispeech/eval.mk +0 -39
- data/ext/sources/tests/librispeech/eval.py +0 -47
- data/ext/sources/tests/librispeech/normalizers/__init__.py +0 -2
- data/ext/sources/tests/librispeech/normalizers/basic.py +0 -80
- data/ext/sources/tests/librispeech/normalizers/english.json +0 -1741
- data/ext/sources/tests/librispeech/normalizers/english.py +0 -550
- data/ext/sources/tests/librispeech/requirements.txt +0 -6
- data/ext/sources/tests/run-tests.sh +0 -130
- data/ext/sources/tests/test-c.c +0 -3
- data/ext/sources/tests/test-vad-full.cpp +0 -56
- data/ext/sources/tests/test-vad.cpp +0 -83
- data/ext/sources/tests/test-whisper.js +0 -58
- data/lib/whisper/context.rb +0 -15
- data/lib/whisper/segment.rb +0 -58
- /data/ext/sources/ggml/src/ggml-opencl/kernels/{gemv_noshuffle_general_q8_0_f32.cl → gemv_noshuffle_q8_0_f32.cl} +0 -0
|
@@ -14,7 +14,7 @@
|
|
|
14
14
|
#define GGML_COMMON_DECL_C
|
|
15
15
|
#include "ggml-common.h"
|
|
16
16
|
#include "htp-ctx.h"
|
|
17
|
-
#include "htp-
|
|
17
|
+
#include "htp-ops.h"
|
|
18
18
|
#include "htp-ops.h"
|
|
19
19
|
|
|
20
20
|
#ifndef MIN
|
|
@@ -24,31 +24,29 @@
|
|
|
24
24
|
// Context for binary operations
|
|
25
25
|
struct htp_binary_context {
|
|
26
26
|
struct htp_ops_context * octx;
|
|
27
|
-
|
|
28
|
-
struct fastdiv_values
|
|
29
|
-
struct fastdiv_values
|
|
27
|
+
|
|
28
|
+
struct fastdiv_values src0_dim1_div; // ne01
|
|
29
|
+
struct fastdiv_values src0_dim2_div; // ne02
|
|
30
|
+
struct fastdiv_values src0_dim12_div;// ne03
|
|
30
31
|
|
|
31
32
|
struct fastdiv_values src1_dim1_div; // ne11
|
|
32
33
|
struct fastdiv_values src1_dim2_div; // ne12
|
|
33
34
|
struct fastdiv_values src1_dim3_div; // ne13
|
|
34
35
|
|
|
35
|
-
uint32_t nrows_per_thread;
|
|
36
|
-
bool split_at_ne01;
|
|
37
|
-
bool split_at_ne02;
|
|
38
|
-
|
|
39
|
-
// Precomputed values
|
|
40
36
|
uint32_t block_max;
|
|
37
|
+
uint32_t nrows_per_thread;
|
|
41
38
|
size_t src0_row_size_aligned;
|
|
42
39
|
size_t src1_row_size_aligned;
|
|
43
40
|
size_t dst_row_size_aligned;
|
|
44
|
-
|
|
45
|
-
|
|
41
|
+
|
|
42
|
+
bool split_at_ne01;
|
|
43
|
+
bool split_at_ne02;
|
|
46
44
|
};
|
|
47
45
|
|
|
48
|
-
#define htp_binary_preamble
|
|
49
|
-
const struct htp_tensor * src0 =
|
|
50
|
-
const struct htp_tensor * src1 =
|
|
51
|
-
struct htp_tensor *
|
|
46
|
+
#define htp_binary_preamble \
|
|
47
|
+
const struct htp_tensor * src0 = octx->src[0]; \
|
|
48
|
+
const struct htp_tensor * src1 = octx->src[1]; \
|
|
49
|
+
const struct htp_tensor * dst = octx->dst; \
|
|
52
50
|
\
|
|
53
51
|
const uint32_t ne00 = src0->ne[0]; \
|
|
54
52
|
const uint32_t ne01 = src0->ne[1]; \
|
|
@@ -72,12 +70,11 @@ struct htp_binary_context {
|
|
|
72
70
|
const uint32_t nb2 = dst->nb[2]; \
|
|
73
71
|
const uint32_t nb3 = dst->nb[3];
|
|
74
72
|
|
|
75
|
-
static inline uint32_t calc_block_size(struct htp_binary_context * bctx, uint32_t ir, uint32_t end_row,
|
|
76
|
-
uint32_t ne01, uint32_t ne02) {
|
|
73
|
+
static inline uint32_t calc_block_size(struct htp_binary_context * bctx, uint32_t ir, uint32_t end_row, uint32_t ne01, uint32_t ne02) {
|
|
77
74
|
uint32_t i03, i02, i01, rem;
|
|
78
|
-
i03 = fastdiv(ir, &bctx->
|
|
75
|
+
i03 = fastdiv(ir, &bctx->src0_dim12_div);
|
|
79
76
|
rem = ir - i03 * (ne02 * ne01);
|
|
80
|
-
i02 = fastdiv(rem, &bctx->
|
|
77
|
+
i02 = fastdiv(rem, &bctx->src0_dim1_div);
|
|
81
78
|
i01 = rem - i02 * ne01;
|
|
82
79
|
|
|
83
80
|
uint32_t rows_left = end_row - ir;
|
|
@@ -184,13 +181,15 @@ static void binary_job_scalar(unsigned int nth, unsigned int ith, void * data) {
|
|
|
184
181
|
struct htp_ops_context * octx = bctx->octx;
|
|
185
182
|
htp_binary_preamble;
|
|
186
183
|
|
|
187
|
-
const uint32_t src0_type = octx->
|
|
184
|
+
const uint32_t src0_type = octx->src[0]->type;
|
|
188
185
|
const uint32_t row_size_bytes = (src0_type == HTP_TYPE_F32) ? ne00 * sizeof(float) : ne00 * sizeof(_Float16);
|
|
189
186
|
const uint32_t total_rows = ne01 * ne02 * ne03;
|
|
190
187
|
const uint32_t start_row = bctx->nrows_per_thread * ith;
|
|
191
188
|
const uint32_t end_row = MIN(start_row + bctx->nrows_per_thread, total_rows);
|
|
192
189
|
if (start_row >= end_row) return;
|
|
193
190
|
|
|
191
|
+
FARF(HIGH, "binary-scalar: %d/%d (%u:%u) row-size %u (%u)", ith, nth, start_row, end_row, nb01, bctx->dst_row_size_aligned);
|
|
192
|
+
|
|
194
193
|
uint8_t * src0_spad_base = octx->src0_spad.data + (ith * octx->src0_spad.size_per_thread);
|
|
195
194
|
uint8_t * dst_spad_base = octx->dst_spad.data + (ith * octx->dst_spad.size_per_thread);
|
|
196
195
|
size_t src0_spad_half = octx->src0_spad.size_per_thread / 2;
|
|
@@ -204,9 +203,9 @@ static void binary_job_scalar(unsigned int nth, unsigned int ith, void * data) {
|
|
|
204
203
|
for (int k = 0; k < 2 && ir_prefetch < end_row; k++) {
|
|
205
204
|
uint32_t current_block_size = calc_block_size(bctx, ir_prefetch, end_row, ne01, ne02);
|
|
206
205
|
uint32_t i03, i02, i01, rem;
|
|
207
|
-
i03 = fastdiv(ir_prefetch, &bctx->
|
|
206
|
+
i03 = fastdiv(ir_prefetch, &bctx->src0_dim12_div);
|
|
208
207
|
rem = ir_prefetch - i03 * (ne02 * ne01);
|
|
209
|
-
i02 = fastdiv(rem, &bctx->
|
|
208
|
+
i02 = fastdiv(rem, &bctx->src0_dim1_div);
|
|
210
209
|
i01 = rem - i02 * ne01;
|
|
211
210
|
|
|
212
211
|
uint8_t * src0_curr = (uint8_t *)src0->data + i03 * nb03 + i02 * nb02 + i01 * nb01;
|
|
@@ -215,7 +214,7 @@ static void binary_job_scalar(unsigned int nth, unsigned int ith, void * data) {
|
|
|
215
214
|
uint8_t * s0_spad = src0_spad_base + spad_idx * src0_spad_half;
|
|
216
215
|
uint8_t * d_spad = dst_spad_base + spad_idx * dst_spad_half;
|
|
217
216
|
|
|
218
|
-
|
|
217
|
+
dma_queue_push(q, dma_make_ptr(dst_curr, d_spad), nb1, bctx->dst_row_size_aligned, row_size_bytes, 0);
|
|
219
218
|
dma_queue_push(q, dma_make_ptr(s0_spad, src0_curr), bctx->src0_row_size_aligned, nb01, row_size_bytes, current_block_size);
|
|
220
219
|
ir_prefetch += current_block_size;
|
|
221
220
|
spad_idx ^= 1;
|
|
@@ -229,9 +228,9 @@ static void binary_job_scalar(unsigned int nth, unsigned int ith, void * data) {
|
|
|
229
228
|
uint8_t * s0_spad = (uint8_t *) dma_queue_pop(q).dst;
|
|
230
229
|
|
|
231
230
|
uint32_t i03, i02, i01, rem;
|
|
232
|
-
i03 = fastdiv(ir, &bctx->
|
|
231
|
+
i03 = fastdiv(ir, &bctx->src0_dim12_div);
|
|
233
232
|
rem = ir - i03 * (ne02 * ne01);
|
|
234
|
-
i02 = fastdiv(rem, &bctx->
|
|
233
|
+
i02 = fastdiv(rem, &bctx->src0_dim1_div);
|
|
235
234
|
i01 = rem - i02 * ne01;
|
|
236
235
|
|
|
237
236
|
// src1 indices (broadcast/repeat)
|
|
@@ -255,9 +254,9 @@ static void binary_job_scalar(unsigned int nth, unsigned int ith, void * data) {
|
|
|
255
254
|
if (ir_prefetch < end_row) {
|
|
256
255
|
uint32_t next_block_size = calc_block_size(bctx, ir_prefetch, end_row, ne01, ne02);
|
|
257
256
|
uint32_t p03, p02, p01, prem;
|
|
258
|
-
p03 = fastdiv(ir_prefetch, &bctx->
|
|
257
|
+
p03 = fastdiv(ir_prefetch, &bctx->src0_dim12_div);
|
|
259
258
|
prem = ir_prefetch - p03 * (ne02 * ne01);
|
|
260
|
-
p02 = fastdiv(prem, &bctx->
|
|
259
|
+
p02 = fastdiv(prem, &bctx->src0_dim1_div);
|
|
261
260
|
p01 = prem - p02 * ne01;
|
|
262
261
|
uint8_t * s0_next = (uint8_t *)src0->data + p03 * nb03 + p02 * nb02 + p01 * nb01;
|
|
263
262
|
|
|
@@ -275,13 +274,15 @@ static void binary_job_vector_same_shape(unsigned int nth, unsigned int ith, voi
|
|
|
275
274
|
struct htp_ops_context * octx = bctx->octx;
|
|
276
275
|
htp_binary_preamble;
|
|
277
276
|
|
|
278
|
-
const uint32_t src0_type = octx->
|
|
277
|
+
const uint32_t src0_type = octx->src[0]->type;
|
|
279
278
|
const uint32_t row_size_bytes = (src0_type == HTP_TYPE_F32) ? ne00 * sizeof(float) : ne00 * sizeof(_Float16);
|
|
280
279
|
const uint32_t total_rows = ne01 * ne02 * ne03;
|
|
281
280
|
const uint32_t start_row = bctx->nrows_per_thread * ith;
|
|
282
281
|
const uint32_t end_row = MIN(start_row + bctx->nrows_per_thread, total_rows);
|
|
283
282
|
if (start_row >= end_row) return;
|
|
284
283
|
|
|
284
|
+
FARF(HIGH, "binary-same-shape: %d/%d (%u:%u) row-size %u (%u)", ith, nth, start_row, end_row, nb01, bctx->dst_row_size_aligned);
|
|
285
|
+
|
|
285
286
|
uint8_t * src0_spad_base = octx->src0_spad.data + (ith * octx->src0_spad.size_per_thread);
|
|
286
287
|
uint8_t * src1_spad_base = octx->src1_spad.data + (ith * octx->src1_spad.size_per_thread);
|
|
287
288
|
uint8_t * dst_spad_base = octx->dst_spad.data + (ith * octx->dst_spad.size_per_thread);
|
|
@@ -297,9 +298,9 @@ static void binary_job_vector_same_shape(unsigned int nth, unsigned int ith, voi
|
|
|
297
298
|
for (int k = 0; k < 2 && ir_prefetch < end_row; k++) {
|
|
298
299
|
uint32_t current_block_size = calc_block_size(bctx, ir_prefetch, end_row, ne01, ne02);
|
|
299
300
|
uint32_t i03, i02, i01, rem;
|
|
300
|
-
i03 = fastdiv(ir_prefetch, &bctx->
|
|
301
|
+
i03 = fastdiv(ir_prefetch, &bctx->src0_dim12_div);
|
|
301
302
|
rem = ir_prefetch - i03 * (ne02 * ne01);
|
|
302
|
-
i02 = fastdiv(rem, &bctx->
|
|
303
|
+
i02 = fastdiv(rem, &bctx->src0_dim1_div);
|
|
303
304
|
i01 = rem - i02 * ne01;
|
|
304
305
|
|
|
305
306
|
uint32_t i13 = (ne13 == 1) ? 0 : i03;
|
|
@@ -307,23 +308,23 @@ static void binary_job_vector_same_shape(unsigned int nth, unsigned int ith, voi
|
|
|
307
308
|
uint32_t i11 = (ne11 == 1) ? 0 : i01;
|
|
308
309
|
|
|
309
310
|
uint8_t * src0_curr = (uint8_t *)src0->data + i03 * nb03 + i02 * nb02 + i01 * nb01;
|
|
310
|
-
uint8_t *
|
|
311
|
+
uint8_t * src1_curr = (uint8_t *)src1->data + i13 * nb13 + i12 * nb12 + i11 * nb11;
|
|
311
312
|
uint8_t * dst_curr = (uint8_t *)dst->data + i03 * nb3 + i02 * nb2 + i01 * nb1;
|
|
312
313
|
|
|
313
314
|
uint8_t * s0_spad = src0_spad_base + spad_idx * src0_spad_half;
|
|
314
315
|
uint8_t * s1_spad = src1_spad_base + spad_idx * src1_spad_half;
|
|
315
316
|
uint8_t * d_spad = dst_spad_base + spad_idx * dst_spad_half;
|
|
316
317
|
|
|
317
|
-
|
|
318
|
+
dma_queue_push(q, dma_make_ptr(dst_curr, d_spad), nb1, bctx->dst_row_size_aligned, row_size_bytes, 0);
|
|
318
319
|
dma_queue_push(q, dma_make_ptr(s0_spad, src0_curr), bctx->src0_row_size_aligned, nb01, row_size_bytes, current_block_size);
|
|
319
|
-
dma_queue_push(q, dma_make_ptr(s1_spad,
|
|
320
|
+
dma_queue_push(q, dma_make_ptr(s1_spad, src1_curr), bctx->src1_row_size_aligned, nb11, row_size_bytes, current_block_size);
|
|
320
321
|
ir_prefetch += current_block_size;
|
|
321
322
|
spad_idx ^= 1;
|
|
322
323
|
}
|
|
323
324
|
|
|
324
325
|
for (uint32_t ir = start_row; ir < end_row; ) {
|
|
325
326
|
uint32_t current_block_size = calc_block_size(bctx, ir, end_row, ne01, ne02);
|
|
326
|
-
uint8_t * d_spad
|
|
327
|
+
uint8_t * d_spad = (uint8_t *) dma_queue_pop(q).src;
|
|
327
328
|
uint8_t * s0_spad = (uint8_t *) dma_queue_pop(q).dst;
|
|
328
329
|
uint8_t * s1_spad = (uint8_t *) dma_queue_pop(q).dst;
|
|
329
330
|
|
|
@@ -335,9 +336,9 @@ static void binary_job_vector_same_shape(unsigned int nth, unsigned int ith, voi
|
|
|
335
336
|
}
|
|
336
337
|
|
|
337
338
|
uint32_t i03, i02, i01, rem;
|
|
338
|
-
i03 = fastdiv(ir, &bctx->
|
|
339
|
+
i03 = fastdiv(ir, &bctx->src0_dim12_div);
|
|
339
340
|
rem = ir - i03 * (ne02 * ne01);
|
|
340
|
-
i02 = fastdiv(rem, &bctx->
|
|
341
|
+
i02 = fastdiv(rem, &bctx->src0_dim1_div);
|
|
341
342
|
i01 = rem - i02 * ne01;
|
|
342
343
|
uint8_t * dst_curr = (uint8_t *)dst->data + i03 * nb3 + i02 * nb2 + i01 * nb1;
|
|
343
344
|
dma_queue_push(q, dma_make_ptr(dst_curr, d_spad), nb1, bctx->dst_row_size_aligned, row_size_bytes, current_block_size);
|
|
@@ -345,9 +346,9 @@ static void binary_job_vector_same_shape(unsigned int nth, unsigned int ith, voi
|
|
|
345
346
|
if (ir_prefetch < end_row) {
|
|
346
347
|
uint32_t next_block_size = calc_block_size(bctx, ir_prefetch, end_row, ne01, ne02);
|
|
347
348
|
uint32_t p03, p02, p01, prem;
|
|
348
|
-
p03 = fastdiv(ir_prefetch, &bctx->
|
|
349
|
+
p03 = fastdiv(ir_prefetch, &bctx->src0_dim12_div);
|
|
349
350
|
prem = ir_prefetch - p03 * (ne02 * ne01);
|
|
350
|
-
p02 = fastdiv(prem, &bctx->
|
|
351
|
+
p02 = fastdiv(prem, &bctx->src0_dim1_div);
|
|
351
352
|
p01 = prem - p02 * ne01;
|
|
352
353
|
|
|
353
354
|
uint32_t p13 = (ne13 == 1) ? 0 : p03;
|
|
@@ -358,7 +359,7 @@ static void binary_job_vector_same_shape(unsigned int nth, unsigned int ith, voi
|
|
|
358
359
|
uint8_t * s1_next = (uint8_t *)src1->data + p13 * nb13 + p12 * nb12 + p11 * nb11;
|
|
359
360
|
|
|
360
361
|
dma_queue_push(q, dma_make_ptr(s0_spad, s0_next), bctx->src0_row_size_aligned, nb01, row_size_bytes, next_block_size);
|
|
361
|
-
dma_queue_push(q, dma_make_ptr(s1_spad, s1_next), bctx->src1_row_size_aligned,
|
|
362
|
+
dma_queue_push(q, dma_make_ptr(s1_spad, s1_next), bctx->src1_row_size_aligned, nb11, row_size_bytes, next_block_size);
|
|
362
363
|
|
|
363
364
|
ir_prefetch += next_block_size;
|
|
364
365
|
}
|
|
@@ -373,15 +374,17 @@ static void binary_job_vector_row_broadcast(unsigned int nth, unsigned int ith,
|
|
|
373
374
|
struct htp_ops_context * octx = bctx->octx;
|
|
374
375
|
htp_binary_preamble;
|
|
375
376
|
|
|
376
|
-
const uint32_t src0_type
|
|
377
|
+
const uint32_t src0_type = octx->src[0]->type;
|
|
377
378
|
const uint32_t row_size_bytes = (src0_type == HTP_TYPE_F32) ? ne00 * sizeof(float) : ne00 * sizeof(_Float16);
|
|
378
379
|
const uint32_t total_rows = ne01 * ne02 * ne03;
|
|
379
|
-
const uint32_t start_row
|
|
380
|
-
const uint32_t end_row
|
|
380
|
+
const uint32_t start_row = bctx->nrows_per_thread * ith;
|
|
381
|
+
const uint32_t end_row = MIN(start_row + bctx->nrows_per_thread, total_rows);
|
|
381
382
|
if (start_row >= end_row) return;
|
|
382
383
|
|
|
384
|
+
FARF(HIGH, "binary-row-bcast: %d/%d (%u:%u) row-size %u (%u)", ith, nth, start_row, end_row, nb01, bctx->dst_row_size_aligned);
|
|
385
|
+
|
|
383
386
|
uint8_t * src0_spad_base = octx->src0_spad.data + (ith * octx->src0_spad.size_per_thread);
|
|
384
|
-
uint8_t *
|
|
387
|
+
uint8_t * src1_spad_base = octx->src1_spad.data + (ith * octx->src1_spad.size_per_thread);
|
|
385
388
|
uint8_t * dst_spad_base = octx->dst_spad.data + (ith * octx->dst_spad.size_per_thread);
|
|
386
389
|
|
|
387
390
|
size_t src0_spad_half = octx->src0_spad.size_per_thread / 2;
|
|
@@ -391,15 +394,14 @@ static void binary_job_vector_row_broadcast(unsigned int nth, unsigned int ith,
|
|
|
391
394
|
uint32_t ir_prefetch = start_row;
|
|
392
395
|
int spad_idx = 0;
|
|
393
396
|
|
|
394
|
-
void * s1_ptr = (void *)
|
|
397
|
+
void * s1_ptr = (void *) src1_spad_base;
|
|
395
398
|
|
|
396
399
|
for (int k = 0; k < 2 && ir_prefetch < end_row; k++) {
|
|
397
400
|
uint32_t current_block_size = calc_block_size(bctx, ir_prefetch, end_row, ne01, ne02);
|
|
398
|
-
uint32_t i03
|
|
399
|
-
|
|
400
|
-
|
|
401
|
-
|
|
402
|
-
i01 = rem - i02 * ne01;
|
|
401
|
+
uint32_t i03 = fastdiv(ir_prefetch, &bctx->src0_dim12_div);
|
|
402
|
+
uint32_t rem = ir_prefetch - i03 * (ne02 * ne01);
|
|
403
|
+
uint32_t i02 = fastdiv(rem, &bctx->src0_dim1_div);
|
|
404
|
+
uint32_t i01 = rem - i02 * ne01;
|
|
403
405
|
|
|
404
406
|
uint8_t * src0_curr = (uint8_t *)src0->data + i03 * nb03 + i02 * nb02 + i01 * nb01;
|
|
405
407
|
uint8_t * dst_curr = (uint8_t *)dst->data + i03 * nb3 + i02 * nb2 + i01 * nb1;
|
|
@@ -407,7 +409,7 @@ static void binary_job_vector_row_broadcast(unsigned int nth, unsigned int ith,
|
|
|
407
409
|
uint8_t * s0_spad = src0_spad_base + spad_idx * src0_spad_half;
|
|
408
410
|
uint8_t * d_spad = dst_spad_base + spad_idx * dst_spad_half;
|
|
409
411
|
|
|
410
|
-
|
|
412
|
+
dma_queue_push(q, dma_make_ptr(dst_curr, d_spad), nb1, bctx->dst_row_size_aligned, row_size_bytes, 0);
|
|
411
413
|
dma_queue_push(q, dma_make_ptr(s0_spad, src0_curr), bctx->src0_row_size_aligned, nb01, row_size_bytes, current_block_size);
|
|
412
414
|
ir_prefetch += current_block_size;
|
|
413
415
|
spad_idx ^= 1;
|
|
@@ -415,7 +417,7 @@ static void binary_job_vector_row_broadcast(unsigned int nth, unsigned int ith,
|
|
|
415
417
|
|
|
416
418
|
for (uint32_t ir = start_row; ir < end_row; ) {
|
|
417
419
|
uint32_t current_block_size = calc_block_size(bctx, ir, end_row, ne01, ne02);
|
|
418
|
-
uint8_t * d_spad
|
|
420
|
+
uint8_t * d_spad = (uint8_t *) dma_queue_pop(q).src;
|
|
419
421
|
uint8_t * s0_spad = (uint8_t *) dma_queue_pop(q).dst;
|
|
420
422
|
|
|
421
423
|
for (uint32_t r = 0; r < current_block_size; r++) {
|
|
@@ -425,21 +427,19 @@ static void binary_job_vector_row_broadcast(unsigned int nth, unsigned int ith,
|
|
|
425
427
|
COMPUTE_VECTOR_OP_AAA(r_dst, r_src0, r_src1, src0_type, ne00);
|
|
426
428
|
}
|
|
427
429
|
|
|
428
|
-
uint32_t i03
|
|
429
|
-
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
i01 = rem - i02 * ne01;
|
|
430
|
+
uint32_t i03 = fastdiv(ir, &bctx->src0_dim12_div);
|
|
431
|
+
uint32_t rem = ir - i03 * (ne02 * ne01);
|
|
432
|
+
uint32_t i02 = fastdiv(rem, &bctx->src0_dim1_div);
|
|
433
|
+
uint32_t i01 = rem - i02 * ne01;
|
|
433
434
|
uint8_t * dst_curr = (uint8_t *)dst->data + i03 * nb3 + i02 * nb2 + i01 * nb1;
|
|
434
435
|
dma_queue_push(q, dma_make_ptr(dst_curr, d_spad), nb1, bctx->dst_row_size_aligned, row_size_bytes, current_block_size);
|
|
435
436
|
|
|
436
437
|
if (ir_prefetch < end_row) {
|
|
437
438
|
uint32_t next_block_size = calc_block_size(bctx, ir_prefetch, end_row, ne01, ne02);
|
|
438
|
-
uint32_t p03
|
|
439
|
-
|
|
440
|
-
|
|
441
|
-
|
|
442
|
-
p01 = prem - p02 * ne01;
|
|
439
|
+
uint32_t p03 = fastdiv(ir_prefetch, &bctx->src0_dim12_div);
|
|
440
|
+
uint32_t prem = ir_prefetch - p03 * (ne02 * ne01);
|
|
441
|
+
uint32_t p02 = fastdiv(prem, &bctx->src0_dim1_div);
|
|
442
|
+
uint32_t p01 = prem - p02 * ne01;
|
|
443
443
|
uint8_t * s0_next = (uint8_t *)src0->data + p03 * nb03 + p02 * nb02 + p01 * nb01;
|
|
444
444
|
dma_queue_push(q, dma_make_ptr(s0_spad, s0_next), bctx->src0_row_size_aligned, nb01, row_size_bytes, next_block_size);
|
|
445
445
|
ir_prefetch += next_block_size;
|
|
@@ -455,17 +455,19 @@ static void binary_job_vector_complex(unsigned int nth, unsigned int ith, void *
|
|
|
455
455
|
struct htp_ops_context * octx = bctx->octx;
|
|
456
456
|
htp_binary_preamble;
|
|
457
457
|
|
|
458
|
-
const uint32_t src0_type = octx->
|
|
458
|
+
const uint32_t src0_type = octx->src[0]->type;
|
|
459
459
|
const uint32_t row_size_bytes = (src0_type == HTP_TYPE_F32) ? ne00 * sizeof(float) : ne00 * sizeof(_Float16);
|
|
460
460
|
const uint32_t total_rows = ne01 * ne02 * ne03;
|
|
461
|
-
const uint32_t start_row
|
|
462
|
-
const uint32_t end_row
|
|
461
|
+
const uint32_t start_row = bctx->nrows_per_thread * ith;
|
|
462
|
+
const uint32_t end_row = MIN(start_row + bctx->nrows_per_thread, total_rows);
|
|
463
463
|
if (start_row >= end_row) return;
|
|
464
464
|
|
|
465
|
+
FARF(HIGH, "binary-complex: %d/%d (%u:%u) row-size %u (%u)", ith, nth, start_row, end_row, nb01, bctx->dst_row_size_aligned);
|
|
466
|
+
|
|
465
467
|
uint8_t * src0_spad_base = octx->src0_spad.data + (ith * octx->src0_spad.size_per_thread);
|
|
466
468
|
uint8_t * dst_spad_base = octx->dst_spad.data + (ith * octx->dst_spad.size_per_thread);
|
|
467
|
-
size_t src0_spad_half
|
|
468
|
-
size_t dst_spad_half
|
|
469
|
+
size_t src0_spad_half = octx->src0_spad.size_per_thread / 2;
|
|
470
|
+
size_t dst_spad_half = octx->dst_spad.size_per_thread / 2;
|
|
469
471
|
|
|
470
472
|
dma_queue * q = octx->ctx->dma[ith];
|
|
471
473
|
uint32_t ir_prefetch = start_row;
|
|
@@ -473,11 +475,10 @@ static void binary_job_vector_complex(unsigned int nth, unsigned int ith, void *
|
|
|
473
475
|
|
|
474
476
|
for (int k = 0; k < 2 && ir_prefetch < end_row; k++) {
|
|
475
477
|
uint32_t current_block_size = calc_block_size(bctx, ir_prefetch, end_row, ne01, ne02);
|
|
476
|
-
uint32_t i03
|
|
477
|
-
|
|
478
|
-
|
|
479
|
-
|
|
480
|
-
i01 = rem - i02 * ne01;
|
|
478
|
+
uint32_t i03 = fastdiv(ir_prefetch, &bctx->src0_dim12_div);
|
|
479
|
+
uint32_t rem = ir_prefetch - i03 * (ne02 * ne01);
|
|
480
|
+
uint32_t i02 = fastdiv(rem, &bctx->src0_dim1_div);
|
|
481
|
+
uint32_t i01 = rem - i02 * ne01;
|
|
481
482
|
|
|
482
483
|
uint8_t * src0_curr = (uint8_t *)src0->data + i03 * nb03 + i02 * nb02 + i01 * nb01;
|
|
483
484
|
uint8_t * dst_curr = (uint8_t *)dst->data + i03 * nb3 + i02 * nb2 + i01 * nb1;
|
|
@@ -485,7 +486,7 @@ static void binary_job_vector_complex(unsigned int nth, unsigned int ith, void *
|
|
|
485
486
|
uint8_t * s0_spad = src0_spad_base + spad_idx * src0_spad_half;
|
|
486
487
|
uint8_t * d_spad = dst_spad_base + spad_idx * dst_spad_half;
|
|
487
488
|
|
|
488
|
-
|
|
489
|
+
dma_queue_push(q, dma_make_ptr(dst_curr, d_spad), nb1, bctx->dst_row_size_aligned, row_size_bytes, 0);
|
|
489
490
|
dma_queue_push(q, dma_make_ptr(s0_spad, src0_curr), bctx->src0_row_size_aligned, nb01, row_size_bytes, current_block_size);
|
|
490
491
|
ir_prefetch += current_block_size;
|
|
491
492
|
spad_idx ^= 1;
|
|
@@ -496,11 +497,10 @@ static void binary_job_vector_complex(unsigned int nth, unsigned int ith, void *
|
|
|
496
497
|
uint8_t * d_spad = (uint8_t *) dma_queue_pop(q).src;
|
|
497
498
|
uint8_t * s0_spad = (uint8_t *) dma_queue_pop(q).dst;
|
|
498
499
|
|
|
499
|
-
uint32_t i03
|
|
500
|
-
|
|
501
|
-
|
|
502
|
-
|
|
503
|
-
i01 = rem - i02 * ne01;
|
|
500
|
+
uint32_t i03 = fastdiv(ir, &bctx->src0_dim12_div);
|
|
501
|
+
uint32_t rem = ir - i03 * (ne02 * ne01);
|
|
502
|
+
uint32_t i02 = fastdiv(rem, &bctx->src0_dim1_div);
|
|
503
|
+
uint32_t i01 = rem - i02 * ne01;
|
|
504
504
|
|
|
505
505
|
for (uint32_t r = 0; r < current_block_size; r++) {
|
|
506
506
|
uint32_t r_i01 = i01 + r;
|
|
@@ -521,11 +521,10 @@ static void binary_job_vector_complex(unsigned int nth, unsigned int ith, void *
|
|
|
521
521
|
|
|
522
522
|
if (ir_prefetch < end_row) {
|
|
523
523
|
uint32_t next_block_size = calc_block_size(bctx, ir_prefetch, end_row, ne01, ne02);
|
|
524
|
-
uint32_t p03
|
|
525
|
-
|
|
526
|
-
|
|
527
|
-
|
|
528
|
-
p01 = prem - p02 * ne01;
|
|
524
|
+
uint32_t p03 = fastdiv(ir_prefetch, &bctx->src0_dim12_div);
|
|
525
|
+
uint32_t prem = ir_prefetch - p03 * (ne02 * ne01);
|
|
526
|
+
uint32_t p02 = fastdiv(prem, &bctx->src0_dim1_div);
|
|
527
|
+
uint32_t p01 = prem - p02 * ne01;
|
|
529
528
|
uint8_t * s0_next = (uint8_t *)src0->data + p03 * nb03 + p02 * nb02 + p01 * nb01;
|
|
530
529
|
dma_queue_push(q, dma_make_ptr(s0_spad, s0_next), bctx->src0_row_size_aligned, nb01, row_size_bytes, next_block_size);
|
|
531
530
|
ir_prefetch += next_block_size;
|
|
@@ -541,18 +540,20 @@ static void binary_job_element_repeat(unsigned int nth, unsigned int ith, void *
|
|
|
541
540
|
struct htp_ops_context * octx = bctx->octx;
|
|
542
541
|
htp_binary_preamble;
|
|
543
542
|
|
|
544
|
-
const uint32_t src0_type = octx->
|
|
543
|
+
const uint32_t src0_type = octx->src[0]->type;
|
|
545
544
|
const uint32_t elem_size_bytes = (src0_type == HTP_TYPE_F32) ? sizeof(float) : sizeof(_Float16);
|
|
546
545
|
const uint32_t row_size_bytes = ne00 * elem_size_bytes;;
|
|
547
546
|
const uint32_t total_rows = ne01 * ne02 * ne03;
|
|
548
|
-
const uint32_t start_row
|
|
549
|
-
const uint32_t end_row
|
|
547
|
+
const uint32_t start_row = bctx->nrows_per_thread * ith;
|
|
548
|
+
const uint32_t end_row = MIN(start_row + bctx->nrows_per_thread, total_rows);
|
|
550
549
|
if (start_row >= end_row) return;
|
|
551
550
|
|
|
552
551
|
uint8_t * src0_spad_base = octx->src0_spad.data + (ith * octx->src0_spad.size_per_thread);
|
|
553
552
|
uint8_t * dst_spad_base = octx->dst_spad.data + (ith * octx->dst_spad.size_per_thread);
|
|
554
|
-
size_t src0_spad_half
|
|
555
|
-
size_t dst_spad_half
|
|
553
|
+
size_t src0_spad_half = octx->src0_spad.size_per_thread / 2;
|
|
554
|
+
size_t dst_spad_half = octx->dst_spad.size_per_thread / 2;
|
|
555
|
+
|
|
556
|
+
FARF(HIGH, "binary-repeat: %d/%d (%u:%u) row-size %u (%u)", ith, nth, start_row, end_row, nb01, bctx->dst_row_size_aligned);
|
|
556
557
|
|
|
557
558
|
dma_queue * q = octx->ctx->dma[ith];
|
|
558
559
|
uint32_t ir_prefetch = start_row;
|
|
@@ -560,11 +561,10 @@ static void binary_job_element_repeat(unsigned int nth, unsigned int ith, void *
|
|
|
560
561
|
|
|
561
562
|
for (int k = 0; k < 2 && ir_prefetch < end_row; k++) {
|
|
562
563
|
uint32_t current_block_size = calc_block_size(bctx, ir_prefetch, end_row, ne01, ne02);
|
|
563
|
-
uint32_t i03
|
|
564
|
-
|
|
565
|
-
|
|
566
|
-
|
|
567
|
-
i01 = rem - i02 * ne01;
|
|
564
|
+
uint32_t i03 = fastdiv(ir_prefetch, &bctx->src0_dim12_div);
|
|
565
|
+
uint32_t rem = ir_prefetch - i03 * (ne02 * ne01);
|
|
566
|
+
uint32_t i02 = fastdiv(rem, &bctx->src0_dim1_div);
|
|
567
|
+
uint32_t i01 = rem - i02 * ne01;
|
|
568
568
|
|
|
569
569
|
uint8_t * src0_curr = (uint8_t *)src0->data + i03 * nb03 + i02 * nb02 + i01 * nb01;
|
|
570
570
|
uint8_t * dst_curr = (uint8_t *)dst->data + i03 * nb3 + i02 * nb2 + i01 * nb1;
|
|
@@ -572,7 +572,7 @@ static void binary_job_element_repeat(unsigned int nth, unsigned int ith, void *
|
|
|
572
572
|
uint8_t * s0_spad = src0_spad_base + spad_idx * src0_spad_half;
|
|
573
573
|
uint8_t * d_spad = dst_spad_base + spad_idx * dst_spad_half;
|
|
574
574
|
|
|
575
|
-
|
|
575
|
+
dma_queue_push(q, dma_make_ptr(dst_curr, d_spad), nb1, bctx->dst_row_size_aligned, row_size_bytes, 0);
|
|
576
576
|
dma_queue_push(q, dma_make_ptr(s0_spad, src0_curr), bctx->src0_row_size_aligned, nb01, row_size_bytes, current_block_size);
|
|
577
577
|
ir_prefetch += current_block_size;
|
|
578
578
|
spad_idx ^= 1;
|
|
@@ -583,11 +583,10 @@ static void binary_job_element_repeat(unsigned int nth, unsigned int ith, void *
|
|
|
583
583
|
uint8_t * d_spad = (uint8_t *) dma_queue_pop(q).src;
|
|
584
584
|
uint8_t * s0_spad = (uint8_t *) dma_queue_pop(q).dst;
|
|
585
585
|
|
|
586
|
-
uint32_t i03
|
|
587
|
-
|
|
588
|
-
|
|
589
|
-
|
|
590
|
-
i01 = rem - i02 * ne01;
|
|
586
|
+
uint32_t i03 = fastdiv(ir, &bctx->src0_dim12_div);
|
|
587
|
+
uint32_t rem = ir - i03 * (ne02 * ne01);
|
|
588
|
+
uint32_t i02 = fastdiv(rem, &bctx->src0_dim1_div);
|
|
589
|
+
uint32_t i01 = rem - i02 * ne01;
|
|
591
590
|
|
|
592
591
|
for (uint32_t r = 0; r < current_block_size; r++) {
|
|
593
592
|
uint32_t r_i01 = i01 + r;
|
|
@@ -612,11 +611,10 @@ static void binary_job_element_repeat(unsigned int nth, unsigned int ith, void *
|
|
|
612
611
|
|
|
613
612
|
if (ir_prefetch < end_row) {
|
|
614
613
|
uint32_t next_block_size = calc_block_size(bctx, ir_prefetch, end_row, ne01, ne02);
|
|
615
|
-
uint32_t p03
|
|
616
|
-
|
|
617
|
-
|
|
618
|
-
|
|
619
|
-
p01 = prem - p02 * ne01;
|
|
614
|
+
uint32_t p03 = fastdiv(ir_prefetch, &bctx->src0_dim12_div);
|
|
615
|
+
uint32_t prem = ir_prefetch - p03 * (ne02 * ne01);
|
|
616
|
+
uint32_t p02 = fastdiv(prem, &bctx->src0_dim1_div);
|
|
617
|
+
uint32_t p01 = prem - p02 * ne01;
|
|
620
618
|
uint8_t * s0_next = (uint8_t *)src0->data + p03 * nb03 + p02 * nb02 + p01 * nb01;
|
|
621
619
|
dma_queue_push(q, dma_make_ptr(s0_spad, s0_next), bctx->src0_row_size_aligned, nb01, row_size_bytes, next_block_size);
|
|
622
620
|
ir_prefetch += next_block_size;
|
|
@@ -631,10 +629,10 @@ static void binary_job_add_id(unsigned int nth, unsigned int ith, void * data) {
|
|
|
631
629
|
struct htp_binary_context * bctx = (struct htp_binary_context *) data;
|
|
632
630
|
struct htp_ops_context * octx = bctx->octx;
|
|
633
631
|
|
|
634
|
-
const struct htp_tensor * src0 =
|
|
635
|
-
const struct htp_tensor * src1 =
|
|
636
|
-
const struct htp_tensor * src2 =
|
|
637
|
-
struct htp_tensor *
|
|
632
|
+
const struct htp_tensor * src0 = octx->src[0];
|
|
633
|
+
const struct htp_tensor * src1 = octx->src[1];
|
|
634
|
+
const struct htp_tensor * src2 = octx->src[2];
|
|
635
|
+
const struct htp_tensor * dst = octx->dst;
|
|
638
636
|
|
|
639
637
|
const uint32_t ne00 = src0->ne[0];
|
|
640
638
|
const uint32_t ne01 = src0->ne[1];
|
|
@@ -646,6 +644,7 @@ static void binary_job_add_id(unsigned int nth, unsigned int ith, void * data) {
|
|
|
646
644
|
const uint32_t nb02 = src0->nb[2];
|
|
647
645
|
const uint32_t nb03 = src0->nb[3];
|
|
648
646
|
const uint32_t nb11 = src1->nb[1]; // src1 row stride
|
|
647
|
+
|
|
649
648
|
const uint32_t nb1 = dst->nb[1];
|
|
650
649
|
const uint32_t nb2 = dst->nb[2];
|
|
651
650
|
const uint32_t nb3 = dst->nb[3];
|
|
@@ -657,8 +656,8 @@ static void binary_job_add_id(unsigned int nth, unsigned int ith, void * data) {
|
|
|
657
656
|
|
|
658
657
|
uint8_t * src0_spad_base = octx->src0_spad.data + (ith * octx->src0_spad.size_per_thread);
|
|
659
658
|
uint8_t * dst_spad_base = octx->dst_spad.data + (ith * octx->dst_spad.size_per_thread);
|
|
660
|
-
size_t src0_spad_half
|
|
661
|
-
size_t dst_spad_half
|
|
659
|
+
size_t src0_spad_half = octx->src0_spad.size_per_thread / 2;
|
|
660
|
+
size_t dst_spad_half = octx->dst_spad.size_per_thread / 2;
|
|
662
661
|
|
|
663
662
|
dma_queue * q = octx->ctx->dma[ith];
|
|
664
663
|
uint32_t ir_prefetch = start_row;
|
|
@@ -666,11 +665,10 @@ static void binary_job_add_id(unsigned int nth, unsigned int ith, void * data) {
|
|
|
666
665
|
|
|
667
666
|
for (int k = 0; k < 2 && ir_prefetch < end_row; k++) {
|
|
668
667
|
uint32_t current_block_size = calc_block_size(bctx, ir_prefetch, end_row, ne01, ne02);
|
|
669
|
-
uint32_t i03
|
|
670
|
-
|
|
671
|
-
|
|
672
|
-
|
|
673
|
-
i01 = rem - i02 * ne01;
|
|
668
|
+
uint32_t i03 = fastdiv(ir_prefetch, &bctx->src0_dim12_div);
|
|
669
|
+
uint32_t rem = ir_prefetch - i03 * (ne02 * ne01);
|
|
670
|
+
uint32_t i02 = fastdiv(rem, &bctx->src0_dim1_div);
|
|
671
|
+
uint32_t i01 = rem - i02 * ne01;
|
|
674
672
|
|
|
675
673
|
uint8_t * src0_curr = (uint8_t *)src0->data + i03 * nb03 + i02 * nb02 + i01 * nb01;
|
|
676
674
|
uint8_t * dst_curr = (uint8_t *)dst->data + i03 * nb3 + i02 * nb2 + i01 * nb1;
|
|
@@ -678,7 +676,7 @@ static void binary_job_add_id(unsigned int nth, unsigned int ith, void * data) {
|
|
|
678
676
|
uint8_t * s0_spad = src0_spad_base + spad_idx * src0_spad_half;
|
|
679
677
|
uint8_t * d_spad = dst_spad_base + spad_idx * dst_spad_half;
|
|
680
678
|
|
|
681
|
-
|
|
679
|
+
dma_queue_push(q, dma_make_ptr(dst_curr, d_spad), nb1, bctx->dst_row_size_aligned, ne00 * sizeof(float), 0);
|
|
682
680
|
dma_queue_push(q, dma_make_ptr(s0_spad, src0_curr), bctx->src0_row_size_aligned, nb01, ne00 * sizeof(float), current_block_size);
|
|
683
681
|
ir_prefetch += current_block_size;
|
|
684
682
|
spad_idx ^= 1;
|
|
@@ -689,11 +687,10 @@ static void binary_job_add_id(unsigned int nth, unsigned int ith, void * data) {
|
|
|
689
687
|
uint8_t * d_spad = (uint8_t *) dma_queue_pop(q).src;
|
|
690
688
|
uint8_t * s0_spad = (uint8_t *) dma_queue_pop(q).dst;
|
|
691
689
|
|
|
692
|
-
uint32_t i03
|
|
693
|
-
|
|
694
|
-
|
|
695
|
-
|
|
696
|
-
i01 = rem - i02 * ne01;
|
|
690
|
+
uint32_t i03 = fastdiv(ir, &bctx->src0_dim12_div);
|
|
691
|
+
uint32_t rem = ir - i03 * (ne02 * ne01);
|
|
692
|
+
uint32_t i02 = fastdiv(rem, &bctx->src0_dim1_div);
|
|
693
|
+
uint32_t i01 = rem - i02 * ne01;
|
|
697
694
|
|
|
698
695
|
for (uint32_t r = 0; r < current_block_size; r++) {
|
|
699
696
|
uint32_t r_i01 = i01 + r; // linear within block since we split at ne01
|
|
@@ -712,11 +709,10 @@ static void binary_job_add_id(unsigned int nth, unsigned int ith, void * data) {
|
|
|
712
709
|
|
|
713
710
|
if (ir_prefetch < end_row) {
|
|
714
711
|
uint32_t next_block_size = calc_block_size(bctx, ir_prefetch, end_row, ne01, ne02);
|
|
715
|
-
uint32_t p03
|
|
716
|
-
|
|
717
|
-
|
|
718
|
-
|
|
719
|
-
p01 = prem - p02 * ne01;
|
|
712
|
+
uint32_t p03 = fastdiv(ir_prefetch, &bctx->src0_dim12_div);
|
|
713
|
+
uint32_t prem = ir_prefetch - p03 * (ne02 * ne01);
|
|
714
|
+
uint32_t p02 = fastdiv(prem, &bctx->src0_dim1_div);
|
|
715
|
+
uint32_t p01 = prem - p02 * ne01;
|
|
720
716
|
uint8_t * s0_next = (uint8_t *)src0->data + p03 * nb03 + p02 * nb02 + p01 * nb01;
|
|
721
717
|
dma_queue_push(q, dma_make_ptr(s0_spad, s0_next), bctx->src0_row_size_aligned, nb01, ne00 * sizeof(float), next_block_size);
|
|
722
718
|
ir_prefetch += next_block_size;
|
|
@@ -727,52 +723,48 @@ static void binary_job_add_id(unsigned int nth, unsigned int ith, void * data) {
|
|
|
727
723
|
}
|
|
728
724
|
|
|
729
725
|
static int execute_op_binary(struct htp_ops_context * octx) {
|
|
730
|
-
const struct htp_tensor * src0 =
|
|
731
|
-
const struct htp_tensor * src1 =
|
|
732
|
-
struct htp_tensor *
|
|
726
|
+
const struct htp_tensor * src0 = octx->src[0];
|
|
727
|
+
const struct htp_tensor * src1 = octx->src[1];
|
|
728
|
+
const struct htp_tensor * dst = octx->dst;
|
|
733
729
|
|
|
734
730
|
const uint32_t src0_nrows = src0->ne[1] * src0->ne[2] * src0->ne[3];
|
|
735
731
|
const uint32_t n_threads = MIN(octx->n_threads, src0_nrows);
|
|
736
732
|
|
|
737
733
|
// Use packed row sizes for VTCM allocation
|
|
738
|
-
const uint32_t src0_type = octx->
|
|
734
|
+
const uint32_t src0_type = octx->src[0]->type;
|
|
739
735
|
const size_t elem_size = (src0_type == HTP_TYPE_F32) ? sizeof(float) : sizeof(_Float16);
|
|
740
736
|
const size_t src0_row_size = src0->ne[0] * elem_size;
|
|
741
737
|
const size_t src1_row_size = src1->ne[0] * elem_size;
|
|
742
|
-
const size_t dst_row_size = dst->ne[0]
|
|
738
|
+
const size_t dst_row_size = dst->ne[0] * elem_size;
|
|
743
739
|
|
|
744
|
-
|
|
745
|
-
const size_t src0_row_size_aligned = hex_round_up(src0_row_size, VLEN);
|
|
746
|
-
const size_t dst_row_size_aligned = hex_round_up(dst_row_size, VLEN);
|
|
740
|
+
size_t src0_row_size_aligned = hex_round_up(src0_row_size, VLEN);
|
|
747
741
|
size_t src1_row_size_aligned = hex_round_up(src1_row_size, VLEN);
|
|
742
|
+
size_t dst_row_size_aligned = hex_round_up(dst_row_size, VLEN);
|
|
748
743
|
|
|
749
744
|
bool is_add_id = (octx->op == HTP_OP_ADD_ID);
|
|
750
745
|
bool is_scalar = !is_add_id && (src1->ne[0] == 1);
|
|
751
746
|
|
|
752
|
-
|
|
753
|
-
|
|
747
|
+
bool is_transposed = (src0->nb[1] < src0_row_size || src1->nb[1] < src1_row_size || dst->nb[1] < dst_row_size);
|
|
748
|
+
|
|
749
|
+
bool is_same_shape = !is_add_id && !is_scalar && !is_transposed &&
|
|
750
|
+
(src1->ne[0] == src0->ne[0] && src0->ne[0] % VLEN == 0) &&
|
|
754
751
|
(src1->ne[1] == src0->ne[1] || src1->ne[1] == 1) &&
|
|
755
752
|
(src1->ne[2] == src0->ne[2] || src1->ne[2] == 1) &&
|
|
756
753
|
(src1->ne[3] == src0->ne[3] || src1->ne[3] == 1);
|
|
757
754
|
|
|
758
|
-
bool is_row_bcast =
|
|
759
|
-
bool
|
|
760
|
-
bool
|
|
755
|
+
bool is_row_bcast = is_same_shape && (src1->ne[1] == 1 && src1->ne[2] == 1 && src1->ne[3] == 1);
|
|
756
|
+
bool is_complex = !is_add_id && !is_scalar && !is_same_shape && (src1->ne[0] == src0->ne[0]);
|
|
757
|
+
bool is_repeat = !is_add_id && !is_scalar && !is_same_shape && (src1->ne[0] != src0->ne[0]);
|
|
761
758
|
|
|
762
759
|
size_t spad_row_total;
|
|
763
|
-
if (
|
|
764
|
-
spad_row_total = 2 * (src0_row_size_aligned + dst_row_size_aligned);
|
|
765
|
-
} else if (is_row_bcast) {
|
|
766
|
-
spad_row_total = 2 * (src0_row_size_aligned + dst_row_size_aligned);
|
|
767
|
-
} else if (use_vector_same) {
|
|
760
|
+
if (is_same_shape) {
|
|
768
761
|
spad_row_total = 2 * (src0_row_size_aligned + src1_row_size_aligned + dst_row_size_aligned);
|
|
769
|
-
} else if (is_add_id) {
|
|
770
|
-
spad_row_total = 2 * (src0_row_size_aligned + dst_row_size_aligned); // src1 read directly
|
|
771
762
|
} else {
|
|
772
763
|
spad_row_total = 2 * (src0_row_size_aligned + dst_row_size_aligned);
|
|
773
764
|
}
|
|
774
765
|
|
|
775
766
|
size_t rows_per_buffer = octx->ctx->vtcm_size / (n_threads * spad_row_total);
|
|
767
|
+
|
|
776
768
|
// Adjust for static src1 in row_bcast case
|
|
777
769
|
if (is_row_bcast) {
|
|
778
770
|
size_t needed_static = src1_row_size_aligned;
|
|
@@ -782,36 +774,34 @@ static int execute_op_binary(struct htp_ops_context * octx) {
|
|
|
782
774
|
}
|
|
783
775
|
|
|
784
776
|
if (rows_per_buffer < 1) {
|
|
785
|
-
|
|
786
|
-
|
|
777
|
+
FARF(ERROR, "binary: VTCM too small\n");
|
|
778
|
+
return HTP_STATUS_VTCM_TOO_SMALL;
|
|
787
779
|
}
|
|
788
780
|
|
|
789
781
|
octx->src0_spad.size_per_thread = rows_per_buffer * 2 * src0_row_size_aligned;
|
|
790
782
|
octx->dst_spad.size_per_thread = rows_per_buffer * 2 * dst_row_size_aligned;
|
|
791
783
|
|
|
792
|
-
if (is_scalar ||
|
|
793
|
-
octx->src1_spad.size_per_thread = 0;
|
|
794
|
-
} else if (is_row_bcast) {
|
|
784
|
+
if (is_add_id || is_scalar || is_complex || is_repeat || is_row_bcast) {
|
|
795
785
|
octx->src1_spad.size_per_thread = 0;
|
|
796
786
|
} else {
|
|
797
787
|
octx->src1_spad.size_per_thread = rows_per_buffer * 2 * src1_row_size_aligned;
|
|
798
788
|
}
|
|
799
789
|
|
|
790
|
+
octx->dst_spad.size = n_threads * octx->dst_spad.size_per_thread;
|
|
800
791
|
octx->src0_spad.size = n_threads * octx->src0_spad.size_per_thread;
|
|
801
792
|
if (is_row_bcast) {
|
|
802
793
|
octx->src1_spad.size = src1_row_size_aligned;
|
|
803
794
|
} else {
|
|
804
795
|
octx->src1_spad.size = n_threads * octx->src1_spad.size_per_thread;
|
|
805
796
|
}
|
|
806
|
-
octx->dst_spad.size = n_threads * octx->dst_spad.size_per_thread;
|
|
807
797
|
|
|
808
798
|
if (octx->ctx->vtcm_size < (octx->src0_spad.size + octx->src1_spad.size + octx->dst_spad.size)) {
|
|
809
799
|
return HTP_STATUS_VTCM_TOO_SMALL;
|
|
810
800
|
}
|
|
811
801
|
|
|
812
|
-
octx->src0_spad.data = octx->ctx->vtcm_base;
|
|
813
|
-
octx->src1_spad.data = octx->src0_spad.data + octx->src0_spad.size;
|
|
814
|
-
octx->dst_spad.data = octx->src1_spad.data + octx->src1_spad.size;
|
|
802
|
+
octx->src0_spad.data = octx->ctx->vtcm_base; octx->src0_spad.src = NULL;
|
|
803
|
+
octx->src1_spad.data = octx->src0_spad.data + octx->src0_spad.size; octx->src1_spad.src = NULL;
|
|
804
|
+
octx->dst_spad.data = octx->src1_spad.data + octx->src1_spad.size; octx->dst_spad.src = NULL;
|
|
815
805
|
|
|
816
806
|
if ((octx->flags & HTP_OPFLAGS_SKIP_COMPUTE)) {
|
|
817
807
|
return HTP_STATUS_OK;
|
|
@@ -823,46 +813,37 @@ static int execute_op_binary(struct htp_ops_context * octx) {
|
|
|
823
813
|
}
|
|
824
814
|
|
|
825
815
|
struct htp_binary_context bctx;
|
|
826
|
-
bctx.octx
|
|
827
|
-
bctx.nrows_per_thread
|
|
828
|
-
bctx.block_max
|
|
816
|
+
bctx.octx = octx;
|
|
817
|
+
bctx.nrows_per_thread = (src0_nrows + n_threads - 1) / n_threads;
|
|
818
|
+
bctx.block_max = rows_per_buffer;
|
|
829
819
|
bctx.src0_row_size_aligned = src0_row_size_aligned;
|
|
830
820
|
bctx.src1_row_size_aligned = src1_row_size_aligned;
|
|
831
821
|
bctx.dst_row_size_aligned = dst_row_size_aligned;
|
|
832
822
|
|
|
833
|
-
bctx.
|
|
834
|
-
bctx.
|
|
835
|
-
bctx.
|
|
823
|
+
bctx.src0_dim1_div = init_fastdiv_values(src0->ne[1]);
|
|
824
|
+
bctx.src0_dim2_div = init_fastdiv_values(src0->ne[2]);
|
|
825
|
+
bctx.src0_dim12_div = init_fastdiv_values(src0->ne[1] * src0->ne[2]);
|
|
836
826
|
|
|
837
|
-
bctx.src1_dim1_div
|
|
838
|
-
bctx.src1_dim2_div
|
|
839
|
-
bctx.src1_dim3_div
|
|
827
|
+
bctx.src1_dim1_div = init_fastdiv_values(src1->ne[1]);
|
|
828
|
+
bctx.src1_dim2_div = init_fastdiv_values(src1->ne[2]);
|
|
829
|
+
bctx.src1_dim3_div = init_fastdiv_values(src1->ne[3]);
|
|
840
830
|
|
|
841
831
|
bool src0_contig_dim1 = (src0->nb[2] == src0->ne[1] * src0->nb[1]);
|
|
842
|
-
bool dst_contig_dim1 = (dst->nb[2]
|
|
832
|
+
bool dst_contig_dim1 = (dst->nb[2] == src0->ne[1] * dst->nb[1]);
|
|
843
833
|
|
|
844
834
|
bool src0_contig_dim2 = (src0->nb[3] == src0->ne[2] * src0->nb[2]);
|
|
845
|
-
bool dst_contig_dim2 = (dst->nb[3]
|
|
846
|
-
|
|
847
|
-
bctx.split_at_ne01 = (src0->ne[2] > 1) &&
|
|
848
|
-
((src1->ne[1] > 1) || (src1->ne[2] > 1) || !src0_contig_dim1 || !dst_contig_dim1);
|
|
835
|
+
bool dst_contig_dim2 = (dst->nb[3] == src0->ne[2] * dst->nb[2]);
|
|
849
836
|
|
|
850
|
-
bctx.
|
|
851
|
-
|
|
852
|
-
|
|
853
|
-
// Precompute specific kernel parameters
|
|
854
|
-
if (use_vector_same) {
|
|
855
|
-
bctx.src1_dma_stride = (src1->ne[1] == 1) ? 0 : src1->nb[1];
|
|
856
|
-
bctx.src1_fetch_rows = (src1->ne[1] == 1) ? 1 : rows_per_buffer;
|
|
857
|
-
}
|
|
837
|
+
bctx.split_at_ne01 = (src0->ne[2] > 1) && ((src1->ne[1] > 1) || (src1->ne[2] > 1) || !src0_contig_dim1 || !dst_contig_dim1);
|
|
838
|
+
bctx.split_at_ne02 = (src0->ne[3] > 1) && ((src1->ne[2] > 1) || (src1->ne[3] > 1) || !src0_contig_dim2 || !dst_contig_dim2);
|
|
858
839
|
|
|
859
840
|
worker_callback_t worker_func;
|
|
860
|
-
if (is_add_id)
|
|
861
|
-
else if (is_scalar)
|
|
862
|
-
else if (is_row_bcast)
|
|
863
|
-
else if (
|
|
864
|
-
else if (
|
|
865
|
-
else
|
|
841
|
+
if (is_add_id) worker_func = binary_job_add_id;
|
|
842
|
+
else if (is_scalar) worker_func = binary_job_scalar;
|
|
843
|
+
else if (is_row_bcast) worker_func = binary_job_vector_row_broadcast;
|
|
844
|
+
else if (is_same_shape) worker_func = binary_job_vector_same_shape;
|
|
845
|
+
else if (is_complex) worker_func = binary_job_vector_complex;
|
|
846
|
+
else worker_func = binary_job_element_repeat;
|
|
866
847
|
|
|
867
848
|
if (is_row_bcast) {
|
|
868
849
|
dma_queue_pop(q);
|
|
@@ -876,12 +857,12 @@ static int execute_op_binary(struct htp_ops_context * octx) {
|
|
|
876
857
|
int op_binary(struct htp_ops_context * octx) {
|
|
877
858
|
|
|
878
859
|
// Does not support permutations of src1
|
|
879
|
-
const struct htp_tensor * src1 =
|
|
860
|
+
const struct htp_tensor * src1 = octx->src[1];
|
|
880
861
|
if (src1->nb[1] < src1->nb[0]) {
|
|
881
862
|
return HTP_STATUS_NO_SUPPORT;
|
|
882
863
|
}
|
|
883
864
|
|
|
884
|
-
const uint32_t src0_type = octx->
|
|
865
|
+
const uint32_t src0_type = octx->src[0]->type;
|
|
885
866
|
if ((src0_type == HTP_TYPE_F32) || (src0_type == HTP_TYPE_F16)) {
|
|
886
867
|
return execute_op_binary(octx);
|
|
887
868
|
}
|