whispercpp 1.3.6 → 1.3.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.document +3 -0
- data/.rdoc_options +2 -0
- data/README.md +38 -5
- data/Rakefile +18 -3
- data/ext/dependencies.rb +10 -4
- data/ext/dependencies_for_windows.rb +17 -0
- data/ext/extconf.rb +20 -8
- data/ext/options.rb +54 -14
- data/ext/options_for_windows.rb +51 -0
- data/ext/ruby_whisper.c +36 -42
- data/ext/ruby_whisper.h +135 -0
- data/ext/ruby_whisper_context.c +107 -28
- data/ext/ruby_whisper_log_queue.c +180 -0
- data/ext/ruby_whisper_log_settable.h +47 -0
- data/ext/ruby_whisper_parakeet.c +49 -0
- data/ext/ruby_whisper_parakeet_context.c +304 -0
- data/ext/ruby_whisper_parakeet_context_params.c +117 -0
- data/ext/ruby_whisper_parakeet_model.c +84 -0
- data/ext/ruby_whisper_parakeet_params.c +548 -0
- data/ext/ruby_whisper_parakeet_segment.c +157 -0
- data/ext/ruby_whisper_parakeet_token.c +188 -0
- data/ext/ruby_whisper_parakeet_transcribe.cpp +58 -0
- data/ext/ruby_whisper_params.c +256 -65
- data/ext/ruby_whisper_segment.c +6 -6
- data/ext/ruby_whisper_transcribe.cpp +42 -15
- data/ext/sources/CMakeLists.txt +41 -3
- data/ext/sources/CMakePresets.json +95 -0
- data/ext/sources/cmake/parakeet-config.cmake.in +30 -0
- data/ext/sources/cmake/parakeet.pc.in +10 -0
- data/ext/sources/cmake/whisper.pc.in +1 -1
- data/ext/sources/examples/CMakeLists.txt +4 -2
- data/ext/sources/examples/bench/bench.cpp +1 -1
- data/ext/sources/examples/cli/cli.cpp +43 -9
- data/ext/sources/examples/common-ggml.cpp +2 -0
- data/ext/sources/examples/common-whisper.cpp +139 -67
- data/ext/sources/examples/common-whisper.h +11 -0
- data/ext/sources/examples/ffmpeg-transcode.cpp +211 -341
- data/ext/sources/examples/parakeet-cli/CMakeLists.txt +8 -0
- data/ext/sources/examples/parakeet-cli/parakeet-cli.cpp +243 -0
- data/ext/sources/examples/parakeet-quantize/CMakeLists.txt +7 -0
- data/ext/sources/examples/parakeet-quantize/parakeet-quantize.cpp +230 -0
- data/ext/sources/examples/server/server.cpp +199 -163
- data/ext/sources/ggml/CMakeLists.txt +21 -13
- data/ext/sources/ggml/cmake/FindNCCL.cmake +36 -0
- data/ext/sources/ggml/cmake/ggml-config.cmake.in +12 -2
- data/ext/sources/ggml/include/ggml-alloc.h +1 -0
- data/ext/sources/ggml/include/ggml-backend.h +72 -10
- data/ext/sources/ggml/include/ggml-cuda.h +3 -0
- data/ext/sources/ggml/include/ggml-rpc.h +3 -3
- data/ext/sources/ggml/include/ggml.h +101 -9
- data/ext/sources/ggml/include/gguf.h +10 -2
- data/ext/sources/ggml/src/CMakeLists.txt +22 -5
- data/ext/sources/ggml/src/ggml-alloc.c +5 -1
- data/ext/sources/ggml/src/ggml-backend-impl.h +22 -2
- data/ext/sources/ggml/src/ggml-backend-meta.cpp +2263 -0
- data/ext/sources/ggml/src/ggml-backend-reg.cpp +12 -0
- data/ext/sources/ggml/src/ggml-backend.cpp +110 -9
- data/ext/sources/ggml/src/ggml-blas/ggml-blas.cpp +4 -0
- data/ext/sources/ggml/src/ggml-cann/aclnn_ops.cpp +672 -257
- data/ext/sources/ggml/src/ggml-cann/aclnn_ops.h +71 -0
- data/ext/sources/ggml/src/ggml-cann/common.h +20 -10
- data/ext/sources/ggml/src/ggml-cann/ggml-cann.cpp +211 -30
- data/ext/sources/ggml/src/ggml-common.h +11 -0
- data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +58 -29
- data/ext/sources/ggml/src/ggml-cpu/amx/amx.cpp +2 -0
- data/ext/sources/ggml/src/ggml-cpu/amx/mmq.cpp +16 -16
- data/ext/sources/ggml/src/ggml-cpu/arch/arm/quants.c +116 -7
- data/ext/sources/ggml/src/ggml-cpu/arch/arm/repack.cpp +65 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/loongarch/quants.c +151 -1
- data/ext/sources/ggml/src/ggml-cpu/arch/powerpc/quants.c +0 -1
- data/ext/sources/ggml/src/ggml-cpu/arch/riscv/quants.c +4279 -1292
- data/ext/sources/ggml/src/ggml-cpu/arch/riscv/repack.cpp +5 -35
- data/ext/sources/ggml/src/ggml-cpu/arch/s390/quants.c +0 -1
- data/ext/sources/ggml/src/ggml-cpu/arch/wasm/quants.c +72 -1
- data/ext/sources/ggml/src/ggml-cpu/arch/x86/quants.c +177 -27
- data/ext/sources/ggml/src/ggml-cpu/arch/x86/repack.cpp +1 -1
- data/ext/sources/ggml/src/ggml-cpu/arch-fallback.h +5 -0
- data/ext/sources/ggml/src/ggml-cpu/cmake/FindSMTIME.cmake +32 -0
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-impl.h +10 -0
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +95 -5
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.cpp +2 -0
- data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +146 -134
- data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +88 -70
- data/ext/sources/ggml/src/ggml-cpu/ops.cpp +372 -73
- data/ext/sources/ggml/src/ggml-cpu/ops.h +3 -0
- data/ext/sources/ggml/src/ggml-cpu/quants.c +55 -0
- data/ext/sources/ggml/src/ggml-cpu/quants.h +3 -0
- data/ext/sources/ggml/src/ggml-cpu/repack.cpp +3 -0
- data/ext/sources/ggml/src/ggml-cpu/simd-gemm.h +90 -0
- data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +3 -16
- data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.cpp +1402 -687
- data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.h +8 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp +597 -2766
- data/ext/sources/ggml/src/ggml-cpu/spacemit/ime2_kernels.cpp +5768 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/ime_env.cpp +320 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/ime_env.h +55 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/ime_kernels.h +182 -19
- data/ext/sources/ggml/src/ggml-cpu/spacemit/repack.cpp +1795 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/repack.h +14 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/rvv_kernels.cpp +3178 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/rvv_kernels.h +95 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/spine_barrier.h +34 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/spine_mem_pool.cpp +760 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/spine_mem_pool.h +32 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/spine_tcm.h +409 -0
- data/ext/sources/ggml/src/ggml-cpu/vec.cpp +37 -53
- data/ext/sources/ggml/src/ggml-cpu/vec.h +225 -240
- data/ext/sources/ggml/src/ggml-cuda/CMakeLists.txt +17 -7
- data/ext/sources/ggml/src/ggml-cuda/allreduce.cu +971 -0
- data/ext/sources/ggml/src/ggml-cuda/allreduce.cuh +29 -0
- data/ext/sources/ggml/src/ggml-cuda/argsort.cu +62 -26
- data/ext/sources/ggml/src/ggml-cuda/binbcast.cu +44 -18
- data/ext/sources/ggml/src/ggml-cuda/binbcast.cuh +1 -0
- data/ext/sources/ggml/src/ggml-cuda/common.cuh +242 -28
- data/ext/sources/ggml/src/ggml-cuda/concat.cu +120 -114
- data/ext/sources/ggml/src/ggml-cuda/conv2d-transpose.cu +45 -21
- data/ext/sources/ggml/src/ggml-cuda/conv2d-transpose.cuh +1 -0
- data/ext/sources/ggml/src/ggml-cuda/convert.cu +53 -0
- data/ext/sources/ggml/src/ggml-cuda/convert.cuh +10 -0
- data/ext/sources/ggml/src/ggml-cuda/cpy.cu +14 -6
- data/ext/sources/ggml/src/ggml-cuda/dequantize.cuh +22 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +278 -44
- data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +331 -130
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cu +12 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cuh +126 -27
- data/ext/sources/ggml/src/ggml-cuda/fattn-vec.cuh +40 -15
- data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +18 -9
- data/ext/sources/ggml/src/ggml-cuda/fattn.cu +152 -49
- data/ext/sources/ggml/src/ggml-cuda/fattn.cuh +2 -0
- data/ext/sources/ggml/src/ggml-cuda/fwht.cu +101 -0
- data/ext/sources/ggml/src/ggml-cuda/fwht.cuh +4 -0
- data/ext/sources/ggml/src/ggml-cuda/gated_delta_net.cu +84 -35
- data/ext/sources/ggml/src/ggml-cuda/getrows.cu +34 -12
- data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +1069 -609
- data/ext/sources/ggml/src/ggml-cuda/im2col.cu +32 -29
- data/ext/sources/ggml/src/ggml-cuda/mean.cu +4 -2
- data/ext/sources/ggml/src/ggml-cuda/mma.cuh +242 -195
- data/ext/sources/ggml/src/ggml-cuda/mmf.cuh +3 -3
- data/ext/sources/ggml/src/ggml-cuda/mmq.cu +18 -12
- data/ext/sources/ggml/src/ggml-cuda/mmq.cuh +502 -423
- data/ext/sources/ggml/src/ggml-cuda/mmvf.cu +19 -12
- data/ext/sources/ggml/src/ggml-cuda/mmvq.cu +485 -57
- data/ext/sources/ggml/src/ggml-cuda/mmvq.cuh +6 -1
- data/ext/sources/ggml/src/ggml-cuda/norm.cu +36 -10
- data/ext/sources/ggml/src/ggml-cuda/out-prod.cu +23 -7
- data/ext/sources/ggml/src/ggml-cuda/quantize.cu +133 -26
- data/ext/sources/ggml/src/ggml-cuda/quantize.cuh +1 -1
- data/ext/sources/ggml/src/ggml-cuda/reduce_rows.cuh +5 -1
- data/ext/sources/ggml/src/ggml-cuda/rope.cu +11 -4
- data/ext/sources/ggml/src/ggml-cuda/scale.cu +4 -1
- data/ext/sources/ggml/src/ggml-cuda/set-rows.cu +14 -6
- data/ext/sources/ggml/src/ggml-cuda/snake.cu +72 -0
- data/ext/sources/ggml/src/ggml-cuda/snake.cuh +8 -0
- data/ext/sources/ggml/src/ggml-cuda/softcap.cu +4 -1
- data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cu +45 -13
- data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cuh +1 -1
- data/ext/sources/ggml/src/ggml-cuda/ssm-scan.cu +40 -18
- data/ext/sources/ggml/src/ggml-cuda/sumrows.cu +8 -4
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_16.cu +1 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_32.cu +1 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_8.cu +2 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_4.cu +1 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_16.cu +1 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_32.cu +1 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_4.cu +1 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_8.cu +2 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_16.cu +1 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_4.cu +1 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_8.cu +2 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_4.cu +1 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_8.cu +2 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq192-dv128.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq320-dv256.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq512-dv512.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-bf16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-f16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q4_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q4_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q5_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q5_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q8_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-bf16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-bf16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-bf16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-bf16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-bf16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-bf16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-nvfp4.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q1_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/top-k.cu +5 -4
- data/ext/sources/ggml/src/ggml-cuda/topk-moe.cu +26 -23
- data/ext/sources/ggml/src/ggml-cuda/unary.cu +31 -2
- data/ext/sources/ggml/src/ggml-cuda/unary.cuh +2 -0
- data/ext/sources/ggml/src/ggml-cuda/vecdotq.cuh +80 -0
- data/ext/sources/ggml/src/ggml-cuda/vendors/cuda.h +7 -2
- data/ext/sources/ggml/src/ggml-cuda/vendors/hip.h +22 -4
- data/ext/sources/ggml/src/ggml-cuda/vendors/musa.h +3 -0
- data/ext/sources/ggml/src/ggml-hexagon/CMakeLists.txt +2 -1
- data/ext/sources/ggml/src/ggml-hexagon/ggml-hexagon.cpp +1428 -743
- data/ext/sources/ggml/src/ggml-hexagon/htp/CMakeLists.txt +45 -7
- data/ext/sources/ggml/src/ggml-hexagon/htp/act-ops.c +53 -84
- data/ext/sources/ggml/src/ggml-hexagon/htp/argsort-ops.c +25 -12
- data/ext/sources/ggml/src/ggml-hexagon/htp/binary-ops.c +165 -184
- data/ext/sources/ggml/src/ggml-hexagon/htp/cmake-toolchain.cmake +5 -5
- data/ext/sources/ggml/src/ggml-hexagon/htp/concat-ops.c +277 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/cpy-ops.c +170 -127
- data/ext/sources/ggml/src/ggml-hexagon/htp/cumsum-ops.c +270 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/diag-ops.c +216 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/fill-ops.c +123 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/flash-attn-ops.c +125 -97
- data/ext/sources/ggml/src/ggml-hexagon/htp/gated-delta-net-ops.c +1148 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/get-rows-ops.c +148 -42
- data/ext/sources/ggml/src/ggml-hexagon/htp/hex-dma.c +2 -2
- data/ext/sources/ggml/src/ggml-hexagon/htp/hex-dma.h +252 -62
- data/ext/sources/ggml/src/ggml-hexagon/htp/hex-dump.h +9 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hex-utils.h +87 -1
- data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-flash-attn-ops.c +1878 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c +2066 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-ops.c +6 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-ops.h +88 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-profile.h +34 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-queue.c +158 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-queue.h +134 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-utils.h +200 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ctx.h +96 -13
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ops.h +182 -57
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp_iface.idl +9 -3
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-base.h +71 -3
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-copy.h +27 -10
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-div.h +63 -23
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-exp.h +9 -8
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-flash-attn.h +47 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-log.h +65 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-pow.h +42 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-repl.h +74 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sigmoid.h +1 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sin-cos.h +90 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-utils.h +5 -8
- data/ext/sources/ggml/src/ggml-hexagon/htp/main.c +529 -815
- data/ext/sources/ggml/src/ggml-hexagon/htp/matmul-ops.c +2522 -234
- data/ext/sources/ggml/src/ggml-hexagon/htp/pad-ops.c +547 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/repeat-ops.c +148 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/rope-ops.c +291 -95
- data/ext/sources/ggml/src/ggml-hexagon/htp/set-rows-ops.c +59 -37
- data/ext/sources/ggml/src/ggml-hexagon/htp/softmax-ops.c +121 -133
- data/ext/sources/ggml/src/ggml-hexagon/htp/solve-tri-ops.c +267 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/ssm-conv.c +244 -151
- data/ext/sources/ggml/src/ggml-hexagon/htp/sum-rows-ops.c +6 -6
- data/ext/sources/ggml/src/ggml-hexagon/htp/unary-ops.c +719 -45
- data/ext/sources/ggml/src/ggml-hexagon/htp/vtcm-utils.h +16 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp-opnode.h +272 -0
- data/ext/sources/ggml/src/ggml-hexagon/libggml-htp.inf +3 -1
- data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +22 -9
- data/ext/sources/ggml/src/ggml-impl.h +6 -1
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.cpp +138 -13
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.h +32 -1
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.m +164 -28
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +80 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.cpp +190 -19
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.h +2 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal.cpp +39 -26
- data/ext/sources/ggml/src/ggml-metal/ggml-metal.metal +823 -322
- data/ext/sources/ggml/src/ggml-musa/CMakeLists.txt +5 -6
- data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +54 -5
- data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +12248 -5907
- data/ext/sources/ggml/src/ggml-opencl/kernels/concat.cl +67 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/cpy.cl +59 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/cvt.cl +1819 -112
- data/ext/sources/ggml/src/ggml-opencl/kernels/gated_delta_net.cl +249 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_mxfp4_f32_ns.cl +306 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q4_0_f32_ns.cl +256 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q4_1_f32_ns.cl +258 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q4_k_f32_ns.cl +283 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q5_0_f32_ns.cl +260 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q5_1_f32_ns.cl +262 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q5_k_f32_ns.cl +288 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q6_k_f32_ns.cl +267 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_iq4_nl_f32.cl +150 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/{mul_mat_Ab_Bi_8x4.cl → gemm_noshuffle_q4_0_f32.cl} +1 -1
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q4_k_f32.cl +172 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q5_0_f32.cl +131 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q5_1_f32.cl +134 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q5_k_f32.cl +176 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q6_k_f32.cl +140 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/{mul_mm_q8_0_f32_8x4.cl → gemm_noshuffle_q8_0_f32.cl} +1 -1
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_xmem_f16_f32_os8.cl +233 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_mxfp4_f32_ns.cl +165 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q4_0_f32_ns.cl +120 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q4_1_f32_ns.cl +123 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q4_k_f32_ns.cl +155 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q5_0_f32_ns.cl +123 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q5_1_f32_ns.cl +125 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q5_k_f32_ns.cl +160 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q6_k_f32_ns.cl +141 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_iq4_nl_f32.cl +302 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/{gemv_noshuffle_general.cl → gemv_noshuffle_q4_0_f32.cl} +5 -5
- data/ext/sources/ggml/src/ggml-opencl/kernels/{gemv_noshuffle.cl → gemv_noshuffle_q4_0_f32_spec.cl} +5 -5
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q4_k_f32.cl +318 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q5_0_f32.cl +291 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q5_1_f32.cl +294 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q5_k_f32.cl +326 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q6_k_f32.cl +293 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/get_rows.cl +15 -9
- data/ext/sources/ggml/src/ggml-opencl/kernels/moe_reorder_b.cl +30 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/moe_sort_by_expert.cl +82 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_iq4_nl_f32_l4_lm.cl +171 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q4_k_f32_l4_lm.cl +179 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q5_0_f32_l4_lm.cl +173 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q5_1_f32_l4_lm.cl +175 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q5_k_f32_l4_lm.cl +192 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_iq4_nl_f32.cl +164 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_iq4_nl_f32_flat.cl +202 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_k_f32_flat.cl +196 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_0_f32.cl +241 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_0_f32_flat.cl +243 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_1_f32.cl +243 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_1_f32_flat.cl +247 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_k_f32.cl +187 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_k_f32_flat.cl +203 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32_flat.cl +48 -64
- data/ext/sources/ggml/src/ggml-openvino/ggml-decoder.cpp +15 -5
- data/ext/sources/ggml/src/ggml-openvino/ggml-openvino-extra.cpp +18 -11
- data/ext/sources/ggml/src/ggml-openvino/ggml-openvino.cpp +35 -13
- data/ext/sources/ggml/src/ggml-openvino/ggml-quants.cpp +264 -192
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/rope.cpp +33 -7
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/unary_gelu.cpp +25 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op_table.cpp +1 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op_table.h +1 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/rt_info/weightless_caching_attributes.hpp +41 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/translate_session.cpp +27 -3
- data/ext/sources/ggml/src/ggml-openvino/openvino/utils.cpp +67 -36
- data/ext/sources/ggml/src/ggml-openvino/openvino/utils.h +1 -0
- data/ext/sources/ggml/src/ggml-openvino/utils.cpp +101 -44
- data/ext/sources/ggml/src/ggml-openvino/utils.h +23 -3
- data/ext/sources/ggml/src/ggml-opt.cpp +1 -0
- data/ext/sources/ggml/src/ggml-quants.c +289 -114
- data/ext/sources/ggml/src/ggml-quants.h +3 -0
- data/ext/sources/ggml/src/ggml-rpc/CMakeLists.txt +24 -0
- data/ext/sources/ggml/src/ggml-rpc/ggml-rpc.cpp +167 -311
- data/ext/sources/ggml/src/ggml-rpc/transport.cpp +683 -0
- data/ext/sources/ggml/src/ggml-rpc/transport.h +34 -0
- data/ext/sources/ggml/src/ggml-sycl/CMakeLists.txt +50 -4
- data/ext/sources/ggml/src/ggml-sycl/add-id.cpp +1 -1
- data/ext/sources/ggml/src/ggml-sycl/backend.hpp +3 -1
- data/ext/sources/ggml/src/ggml-sycl/common.cpp +74 -2
- data/ext/sources/ggml/src/ggml-sycl/common.hpp +41 -1
- data/ext/sources/ggml/src/ggml-sycl/convert.cpp +115 -13
- data/ext/sources/ggml/src/ggml-sycl/convert.hpp +9 -0
- data/ext/sources/ggml/src/ggml-sycl/cumsum.cpp +148 -0
- data/ext/sources/ggml/src/ggml-sycl/cumsum.hpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/dequantize.hpp +663 -0
- data/ext/sources/ggml/src/ggml-sycl/diag.cpp +67 -0
- data/ext/sources/ggml/src/ggml-sycl/diag.hpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/dmmv.cpp +586 -6
- data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +1 -90
- data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +0 -2
- data/ext/sources/ggml/src/ggml-sycl/fattn-buffers.cpp +56 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn-buffers.hpp +63 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn-common.hpp +7 -5
- data/ext/sources/ggml/src/ggml-sycl/fattn-tile.cpp +4 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn-tile.hpp +76 -168
- data/ext/sources/ggml/src/ggml-sycl/fattn-vec.hpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn.cpp +3 -1
- data/ext/sources/ggml/src/ggml-sycl/fill.cpp +55 -0
- data/ext/sources/ggml/src/ggml-sycl/fill.hpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/gated_delta_net.cpp +69 -31
- data/ext/sources/ggml/src/ggml-sycl/gated_delta_net.hpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/gemm.hpp +3 -0
- data/ext/sources/ggml/src/ggml-sycl/getrows.cpp +79 -3
- data/ext/sources/ggml/src/ggml-sycl/ggml-sycl.cpp +823 -190
- data/ext/sources/ggml/src/ggml-sycl/im2col.cpp +353 -89
- data/ext/sources/ggml/src/ggml-sycl/im2col.hpp +5 -3
- data/ext/sources/ggml/src/ggml-sycl/mmvq.cpp +1344 -26
- data/ext/sources/ggml/src/ggml-sycl/mmvq.hpp +16 -0
- data/ext/sources/ggml/src/ggml-sycl/pad.cpp +27 -27
- data/ext/sources/ggml/src/ggml-sycl/quants.hpp +71 -0
- data/ext/sources/ggml/src/ggml-sycl/set_rows.cpp +7 -1
- data/ext/sources/ggml/src/ggml-sycl/solve_tri.cpp +172 -0
- data/ext/sources/ggml/src/ggml-sycl/solve_tri.hpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/ssm_conv.cpp +6 -1
- data/ext/sources/ggml/src/ggml-sycl/ssm_scan.cpp +156 -0
- data/ext/sources/ggml/src/ggml-sycl/ssm_scan.hpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/sycl_hw.cpp +62 -10
- data/ext/sources/ggml/src/ggml-sycl/sycl_hw.hpp +18 -6
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq512-dv512.cpp +6 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-f16.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_0.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_1.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_0.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_1.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q8_0.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-f16.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_0.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_1.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_0.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_1.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q8_0.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-f16.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_0.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_1.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_0.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_1.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q8_0.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-f16.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_0.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_1.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_0.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_1.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q8_0.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-f16.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_0.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_1.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_0.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_1.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q8_0.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-f16.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_0.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_1.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_0.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_1.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q8_0.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/type.hpp +112 -0
- data/ext/sources/ggml/src/ggml-sycl/upscale.cpp +410 -0
- data/ext/sources/ggml/src/ggml-sycl/upscale.hpp +9 -0
- data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +215 -53
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-buffer.cpp +4 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-device.cpp +2 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend.cpp +2 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-shm.cpp +1 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu.cpp +1 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu.h +0 -2
- data/ext/sources/ggml/src/ggml-vulkan/CMakeLists.txt +11 -0
- data/ext/sources/ggml/src/ggml-vulkan/ggml-vulkan.cpp +2060 -535
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/contig_copy.comp +6 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +146 -13
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy.comp +3 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +25 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.glsl +88 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.glsl +643 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_nvfp4.comp +32 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q1_0.comp +29 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/diag.comp +0 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dot_product_funcs.glsl +27 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/exp.comp +0 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/feature-tests/coopmat2_decode_vector.comp +7 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +197 -48
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.glsl +60 -59
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +115 -113
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +122 -31
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_dequant.glsl +131 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_mmq_funcs.glsl +203 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/fwht.comp +115 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gated_delta_net.comp +125 -64
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/generic_binary_head.glsl +0 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.glsl +10 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_main.glsl +16 -6
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +76 -54
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col_3d.comp +0 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/log.comp +0 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp +122 -27
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iface.glsl +6 -6
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp +1 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq_funcs.glsl +88 -55
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +11 -17
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +43 -10
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.glsl +159 -125
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.glsl +8 -8
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_shmem_types.glsl +24 -9
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp +0 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl +5 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.glsl +0 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl +3 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/snake.comp +49 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/ssm_conv.comp +11 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/tri.comp +0 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/types.glsl +79 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +171 -147
- data/ext/sources/ggml/src/ggml-webgpu/CMakeLists.txt +5 -2
- data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp +2202 -283
- data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu.cpp +2610 -1403
- data/ext/sources/ggml/src/ggml-webgpu/pre_wgsl.hpp +37 -7
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/add_id.wgsl +64 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary.wgsl +8 -7
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +76 -95
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/concat.wgsl +19 -1
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/conv2d.wgsl +165 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{cpy.tmpl.wgsl → cpy.wgsl} +25 -50
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn.wgsl +107 -184
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_quant_staging.tmpl +124 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_tile.wgsl +397 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_vec_blk.wgsl +101 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_vec_reduce.wgsl +84 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_vec_split.wgsl +619 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/gated_delta_net.wgsl +149 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/get_rows.wgsl +183 -78
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/glu.wgsl +155 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/im2col.wgsl +101 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl +655 -495
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_id.wgsl +195 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_id_gather.wgsl +52 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_id_vec.wgsl +154 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.wgsl +8 -6
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_subgroup_matrix.wgsl +5 -1
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.wgsl +80 -409
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec_acc.tmpl +1432 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec_q_acc.tmpl +303 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/quant_inner_loops.tmpl +21 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/quantize_q8.wgsl +173 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm_mul.wgsl +152 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{rope.tmpl.wgsl → rope.wgsl} +71 -142
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/row_norm.wgsl +153 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/scale.wgsl +6 -4
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set.wgsl +109 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +2 -3
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows_quant.wgsl +224 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{soft_max.tmpl.wgsl → soft_max.wgsl} +106 -206
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/solve_tri.wgsl +121 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/ssm_conv.wgsl +65 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/ssm_scan.wgsl +193 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/unary.wgsl +68 -48
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/upscale.wgsl +240 -0
- data/ext/sources/ggml/src/ggml-zdnn/ggml-zdnn.cpp +18 -14
- data/ext/sources/ggml/src/ggml-zendnn/CMakeLists.txt +1 -1
- data/ext/sources/ggml/src/ggml-zendnn/ggml-zendnn.cpp +244 -10
- data/ext/sources/ggml/src/ggml.c +110 -28
- data/ext/sources/ggml/src/gguf.cpp +173 -28
- data/ext/sources/include/parakeet.h +342 -0
- data/ext/sources/include/whisper.h +10 -0
- data/ext/sources/media/matmul.png +0 -0
- data/ext/sources/src/CMakeLists.txt +23 -0
- data/ext/sources/src/parakeet-arch.h +188 -0
- data/ext/sources/src/parakeet.cpp +3838 -0
- data/ext/sources/src/whisper.cpp +56 -12
- data/extsources.rb +26 -10
- data/lib/whisper/log_settable.rb +36 -0
- data/lib/whisper/model/uri.rb +13 -1
- data/lib/whisper/output.rb +74 -0
- data/sig/whisper.rbs +411 -62
- data/test/helper.rb +2 -0
- data/test/jfk_reader/jfk_reader.c +50 -7
- data/test/test_callback.rb +1 -0
- data/test/test_package.rb +6 -5
- data/test/test_parakeet.rb +28 -0
- data/test/test_parakeet_callback.rb +107 -0
- data/test/test_parakeet_context.rb +116 -0
- data/test/test_parakeet_context_params.rb +24 -0
- data/test/test_parakeet_model.rb +21 -0
- data/test/test_parakeet_params.rb +78 -0
- data/test/test_parakeet_segment.rb +42 -0
- data/test/test_parakeet_token.rb +73 -0
- data/test/test_params.rb +2 -0
- data/test/test_vad_segment.rb +1 -1
- data/test/test_whisper.rb +24 -6
- data/whispercpp.gemspec +2 -2
- metadata +215 -281
- data/ext/sources/bindings/javascript/CMakeLists.txt +0 -41
- data/ext/sources/bindings/javascript/emscripten.cpp +0 -93
- data/ext/sources/bindings/javascript/libwhisper.worker.js +0 -1
- data/ext/sources/bindings/javascript/package.json +0 -26
- data/ext/sources/bindings/javascript/whisper.js +0 -19
- data/ext/sources/examples/addon.node/CMakeLists.txt +0 -31
- data/ext/sources/examples/addon.node/__test__/whisper.spec.js +0 -133
- data/ext/sources/examples/addon.node/addon.cpp +0 -557
- data/ext/sources/examples/addon.node/index.js +0 -59
- data/ext/sources/examples/addon.node/package.json +0 -16
- data/ext/sources/examples/addon.node/vad-example.js +0 -132
- data/ext/sources/examples/bench.wasm/CMakeLists.txt +0 -49
- data/ext/sources/examples/bench.wasm/emscripten.cpp +0 -87
- data/ext/sources/examples/bench.wasm/index-tmpl.html +0 -285
- data/ext/sources/examples/coi-serviceworker.js +0 -146
- data/ext/sources/examples/command/CMakeLists.txt +0 -10
- data/ext/sources/examples/command/command.cpp +0 -802
- data/ext/sources/examples/command/commands.txt +0 -9
- data/ext/sources/examples/command.wasm/CMakeLists.txt +0 -50
- data/ext/sources/examples/command.wasm/emscripten.cpp +0 -327
- data/ext/sources/examples/command.wasm/index-tmpl.html +0 -415
- data/ext/sources/examples/generate-karaoke.sh +0 -57
- data/ext/sources/examples/helpers.js +0 -191
- data/ext/sources/examples/livestream.sh +0 -112
- data/ext/sources/examples/lsp/CMakeLists.txt +0 -10
- data/ext/sources/examples/lsp/lsp.cpp +0 -471
- data/ext/sources/examples/lsp/whisper.vim +0 -362
- data/ext/sources/examples/python/test_whisper_processor.py +0 -7
- data/ext/sources/examples/python/whisper_processor.py +0 -54
- data/ext/sources/examples/server/bench.js +0 -29
- data/ext/sources/examples/server.py +0 -120
- data/ext/sources/examples/stream/CMakeLists.txt +0 -10
- data/ext/sources/examples/stream/stream.cpp +0 -437
- data/ext/sources/examples/stream.wasm/CMakeLists.txt +0 -49
- data/ext/sources/examples/stream.wasm/emscripten.cpp +0 -216
- data/ext/sources/examples/stream.wasm/index-tmpl.html +0 -491
- data/ext/sources/examples/sycl/CMakeLists.txt +0 -9
- data/ext/sources/examples/sycl/build.sh +0 -22
- data/ext/sources/examples/sycl/ls-sycl-device.cpp +0 -11
- data/ext/sources/examples/sycl/run-whisper.sh +0 -17
- data/ext/sources/examples/talk-llama/CMakeLists.txt +0 -48
- data/ext/sources/examples/talk-llama/eleven-labs.py +0 -80
- data/ext/sources/examples/talk-llama/llama-adapter.cpp +0 -488
- data/ext/sources/examples/talk-llama/llama-adapter.h +0 -89
- data/ext/sources/examples/talk-llama/llama-arch.cpp +0 -2877
- data/ext/sources/examples/talk-llama/llama-arch.h +0 -628
- data/ext/sources/examples/talk-llama/llama-batch.cpp +0 -919
- data/ext/sources/examples/talk-llama/llama-batch.h +0 -173
- data/ext/sources/examples/talk-llama/llama-chat.cpp +0 -896
- data/ext/sources/examples/talk-llama/llama-chat.h +0 -71
- data/ext/sources/examples/talk-llama/llama-context.cpp +0 -3633
- data/ext/sources/examples/talk-llama/llama-context.h +0 -359
- data/ext/sources/examples/talk-llama/llama-cparams.cpp +0 -5
- data/ext/sources/examples/talk-llama/llama-cparams.h +0 -47
- data/ext/sources/examples/talk-llama/llama-ext.h +0 -12
- data/ext/sources/examples/talk-llama/llama-grammar.cpp +0 -1464
- data/ext/sources/examples/talk-llama/llama-grammar.h +0 -194
- data/ext/sources/examples/talk-llama/llama-graph.cpp +0 -2735
- data/ext/sources/examples/talk-llama/llama-graph.h +0 -1031
- data/ext/sources/examples/talk-llama/llama-hparams.cpp +0 -258
- data/ext/sources/examples/talk-llama/llama-hparams.h +0 -353
- data/ext/sources/examples/talk-llama/llama-impl.cpp +0 -171
- data/ext/sources/examples/talk-llama/llama-impl.h +0 -75
- data/ext/sources/examples/talk-llama/llama-io.cpp +0 -15
- data/ext/sources/examples/talk-llama/llama-io.h +0 -35
- data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.cpp +0 -330
- data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.h +0 -137
- data/ext/sources/examples/talk-llama/llama-kv-cache.cpp +0 -2285
- data/ext/sources/examples/talk-llama/llama-kv-cache.h +0 -389
- data/ext/sources/examples/talk-llama/llama-kv-cells.h +0 -533
- data/ext/sources/examples/talk-llama/llama-memory-hybrid-iswa.cpp +0 -275
- data/ext/sources/examples/talk-llama/llama-memory-hybrid-iswa.h +0 -140
- data/ext/sources/examples/talk-llama/llama-memory-hybrid.cpp +0 -268
- data/ext/sources/examples/talk-llama/llama-memory-hybrid.h +0 -139
- data/ext/sources/examples/talk-llama/llama-memory-recurrent.cpp +0 -1165
- data/ext/sources/examples/talk-llama/llama-memory-recurrent.h +0 -182
- data/ext/sources/examples/talk-llama/llama-memory.cpp +0 -59
- data/ext/sources/examples/talk-llama/llama-memory.h +0 -122
- data/ext/sources/examples/talk-llama/llama-mmap.cpp +0 -752
- data/ext/sources/examples/talk-llama/llama-mmap.h +0 -73
- data/ext/sources/examples/talk-llama/llama-model-loader.cpp +0 -1655
- data/ext/sources/examples/talk-llama/llama-model-loader.h +0 -206
- data/ext/sources/examples/talk-llama/llama-model-saver.cpp +0 -299
- data/ext/sources/examples/talk-llama/llama-model-saver.h +0 -40
- data/ext/sources/examples/talk-llama/llama-model.cpp +0 -9056
- data/ext/sources/examples/talk-llama/llama-model.h +0 -597
- data/ext/sources/examples/talk-llama/llama-quant.cpp +0 -1304
- data/ext/sources/examples/talk-llama/llama-quant.h +0 -1
- data/ext/sources/examples/talk-llama/llama-sampler.cpp +0 -3885
- data/ext/sources/examples/talk-llama/llama-sampler.h +0 -42
- data/ext/sources/examples/talk-llama/llama-vocab.cpp +0 -3970
- data/ext/sources/examples/talk-llama/llama-vocab.h +0 -187
- data/ext/sources/examples/talk-llama/llama.cpp +0 -1194
- data/ext/sources/examples/talk-llama/llama.h +0 -1573
- data/ext/sources/examples/talk-llama/models/afmoe.cpp +0 -190
- data/ext/sources/examples/talk-llama/models/apertus.cpp +0 -125
- data/ext/sources/examples/talk-llama/models/arcee.cpp +0 -135
- data/ext/sources/examples/talk-llama/models/arctic.cpp +0 -137
- data/ext/sources/examples/talk-llama/models/arwkv7.cpp +0 -86
- data/ext/sources/examples/talk-llama/models/baichuan.cpp +0 -123
- data/ext/sources/examples/talk-llama/models/bailingmoe.cpp +0 -143
- data/ext/sources/examples/talk-llama/models/bailingmoe2.cpp +0 -133
- data/ext/sources/examples/talk-llama/models/bert.cpp +0 -184
- data/ext/sources/examples/talk-llama/models/bitnet.cpp +0 -145
- data/ext/sources/examples/talk-llama/models/bloom.cpp +0 -101
- data/ext/sources/examples/talk-llama/models/chameleon.cpp +0 -178
- data/ext/sources/examples/talk-llama/models/chatglm.cpp +0 -132
- data/ext/sources/examples/talk-llama/models/codeshell.cpp +0 -111
- data/ext/sources/examples/talk-llama/models/cogvlm.cpp +0 -102
- data/ext/sources/examples/talk-llama/models/cohere2-iswa.cpp +0 -134
- data/ext/sources/examples/talk-llama/models/command-r.cpp +0 -122
- data/ext/sources/examples/talk-llama/models/dbrx.cpp +0 -122
- data/ext/sources/examples/talk-llama/models/deci.cpp +0 -135
- data/ext/sources/examples/talk-llama/models/deepseek.cpp +0 -142
- data/ext/sources/examples/talk-llama/models/deepseek2.cpp +0 -262
- data/ext/sources/examples/talk-llama/models/delta-net-base.cpp +0 -445
- data/ext/sources/examples/talk-llama/models/dots1.cpp +0 -132
- data/ext/sources/examples/talk-llama/models/dream.cpp +0 -105
- data/ext/sources/examples/talk-llama/models/ernie4-5-moe.cpp +0 -148
- data/ext/sources/examples/talk-llama/models/ernie4-5.cpp +0 -110
- data/ext/sources/examples/talk-llama/models/eurobert.cpp +0 -97
- data/ext/sources/examples/talk-llama/models/exaone-moe.cpp +0 -145
- data/ext/sources/examples/talk-llama/models/exaone.cpp +0 -114
- data/ext/sources/examples/talk-llama/models/exaone4.cpp +0 -123
- data/ext/sources/examples/talk-llama/models/falcon-h1.cpp +0 -111
- data/ext/sources/examples/talk-llama/models/falcon.cpp +0 -120
- data/ext/sources/examples/talk-llama/models/gemma-embedding.cpp +0 -116
- data/ext/sources/examples/talk-llama/models/gemma.cpp +0 -112
- data/ext/sources/examples/talk-llama/models/gemma2-iswa.cpp +0 -128
- data/ext/sources/examples/talk-llama/models/gemma3.cpp +0 -155
- data/ext/sources/examples/talk-llama/models/gemma3n-iswa.cpp +0 -384
- data/ext/sources/examples/talk-llama/models/glm4-moe.cpp +0 -170
- data/ext/sources/examples/talk-llama/models/glm4.cpp +0 -157
- data/ext/sources/examples/talk-llama/models/gpt2.cpp +0 -105
- data/ext/sources/examples/talk-llama/models/gptneox.cpp +0 -144
- data/ext/sources/examples/talk-llama/models/granite-hybrid.cpp +0 -195
- data/ext/sources/examples/talk-llama/models/granite.cpp +0 -210
- data/ext/sources/examples/talk-llama/models/grok.cpp +0 -159
- data/ext/sources/examples/talk-llama/models/grovemoe.cpp +0 -139
- data/ext/sources/examples/talk-llama/models/hunyuan-dense.cpp +0 -132
- data/ext/sources/examples/talk-llama/models/hunyuan-moe.cpp +0 -153
- data/ext/sources/examples/talk-llama/models/internlm2.cpp +0 -120
- data/ext/sources/examples/talk-llama/models/jais.cpp +0 -86
- data/ext/sources/examples/talk-llama/models/jais2.cpp +0 -123
- data/ext/sources/examples/talk-llama/models/jamba.cpp +0 -106
- data/ext/sources/examples/talk-llama/models/kimi-linear.cpp +0 -381
- data/ext/sources/examples/talk-llama/models/lfm2.cpp +0 -196
- data/ext/sources/examples/talk-llama/models/llada-moe.cpp +0 -122
- data/ext/sources/examples/talk-llama/models/llada.cpp +0 -99
- data/ext/sources/examples/talk-llama/models/llama-iswa.cpp +0 -178
- data/ext/sources/examples/talk-llama/models/llama.cpp +0 -175
- data/ext/sources/examples/talk-llama/models/maincoder.cpp +0 -117
- data/ext/sources/examples/talk-llama/models/mamba-base.cpp +0 -289
- data/ext/sources/examples/talk-llama/models/mamba.cpp +0 -54
- data/ext/sources/examples/talk-llama/models/mimo2-iswa.cpp +0 -129
- data/ext/sources/examples/talk-llama/models/minicpm3.cpp +0 -200
- data/ext/sources/examples/talk-llama/models/minimax-m2.cpp +0 -123
- data/ext/sources/examples/talk-llama/models/mistral3.cpp +0 -160
- data/ext/sources/examples/talk-llama/models/models.h +0 -704
- data/ext/sources/examples/talk-llama/models/modern-bert.cpp +0 -109
- data/ext/sources/examples/talk-llama/models/mpt.cpp +0 -126
- data/ext/sources/examples/talk-llama/models/nemotron-h.cpp +0 -162
- data/ext/sources/examples/talk-llama/models/nemotron.cpp +0 -122
- data/ext/sources/examples/talk-llama/models/neo-bert.cpp +0 -104
- data/ext/sources/examples/talk-llama/models/olmo.cpp +0 -121
- data/ext/sources/examples/talk-llama/models/olmo2.cpp +0 -150
- data/ext/sources/examples/talk-llama/models/olmoe.cpp +0 -124
- data/ext/sources/examples/talk-llama/models/openai-moe-iswa.cpp +0 -127
- data/ext/sources/examples/talk-llama/models/openelm.cpp +0 -124
- data/ext/sources/examples/talk-llama/models/orion.cpp +0 -123
- data/ext/sources/examples/talk-llama/models/paddleocr.cpp +0 -122
- data/ext/sources/examples/talk-llama/models/pangu-embedded.cpp +0 -121
- data/ext/sources/examples/talk-llama/models/phi2.cpp +0 -121
- data/ext/sources/examples/talk-llama/models/phi3.cpp +0 -152
- data/ext/sources/examples/talk-llama/models/plamo.cpp +0 -110
- data/ext/sources/examples/talk-llama/models/plamo2.cpp +0 -320
- data/ext/sources/examples/talk-llama/models/plamo3.cpp +0 -128
- data/ext/sources/examples/talk-llama/models/plm.cpp +0 -169
- data/ext/sources/examples/talk-llama/models/qwen.cpp +0 -108
- data/ext/sources/examples/talk-llama/models/qwen2.cpp +0 -126
- data/ext/sources/examples/talk-llama/models/qwen2moe.cpp +0 -151
- data/ext/sources/examples/talk-llama/models/qwen2vl.cpp +0 -117
- data/ext/sources/examples/talk-llama/models/qwen3.cpp +0 -120
- data/ext/sources/examples/talk-llama/models/qwen35.cpp +0 -381
- data/ext/sources/examples/talk-llama/models/qwen35moe.cpp +0 -422
- data/ext/sources/examples/talk-llama/models/qwen3moe.cpp +0 -131
- data/ext/sources/examples/talk-llama/models/qwen3next.cpp +0 -525
- data/ext/sources/examples/talk-llama/models/qwen3vl-moe.cpp +0 -140
- data/ext/sources/examples/talk-llama/models/qwen3vl.cpp +0 -132
- data/ext/sources/examples/talk-llama/models/refact.cpp +0 -94
- data/ext/sources/examples/talk-llama/models/rnd1.cpp +0 -126
- data/ext/sources/examples/talk-llama/models/rwkv6-base.cpp +0 -164
- data/ext/sources/examples/talk-llama/models/rwkv6.cpp +0 -94
- data/ext/sources/examples/talk-llama/models/rwkv6qwen2.cpp +0 -86
- data/ext/sources/examples/talk-llama/models/rwkv7-base.cpp +0 -137
- data/ext/sources/examples/talk-llama/models/rwkv7.cpp +0 -90
- data/ext/sources/examples/talk-llama/models/seed-oss.cpp +0 -124
- data/ext/sources/examples/talk-llama/models/smallthinker.cpp +0 -126
- data/ext/sources/examples/talk-llama/models/smollm3.cpp +0 -128
- data/ext/sources/examples/talk-llama/models/stablelm.cpp +0 -146
- data/ext/sources/examples/talk-llama/models/starcoder.cpp +0 -100
- data/ext/sources/examples/talk-llama/models/starcoder2.cpp +0 -121
- data/ext/sources/examples/talk-llama/models/step35-iswa.cpp +0 -165
- data/ext/sources/examples/talk-llama/models/t5-dec.cpp +0 -166
- data/ext/sources/examples/talk-llama/models/t5-enc.cpp +0 -96
- data/ext/sources/examples/talk-llama/models/wavtokenizer-dec.cpp +0 -149
- data/ext/sources/examples/talk-llama/models/xverse.cpp +0 -108
- data/ext/sources/examples/talk-llama/prompts/talk-alpaca.txt +0 -23
- data/ext/sources/examples/talk-llama/speak +0 -40
- data/ext/sources/examples/talk-llama/speak.bat +0 -1
- data/ext/sources/examples/talk-llama/speak.ps1 +0 -14
- data/ext/sources/examples/talk-llama/talk-llama.cpp +0 -813
- data/ext/sources/examples/talk-llama/unicode-data.cpp +0 -7034
- data/ext/sources/examples/talk-llama/unicode-data.h +0 -20
- data/ext/sources/examples/talk-llama/unicode.cpp +0 -1103
- data/ext/sources/examples/talk-llama/unicode.h +0 -111
- data/ext/sources/examples/wchess/CMakeLists.txt +0 -10
- data/ext/sources/examples/wchess/libwchess/CMakeLists.txt +0 -19
- data/ext/sources/examples/wchess/libwchess/Chessboard.cpp +0 -803
- data/ext/sources/examples/wchess/libwchess/Chessboard.h +0 -33
- data/ext/sources/examples/wchess/libwchess/WChess.cpp +0 -193
- data/ext/sources/examples/wchess/libwchess/WChess.h +0 -63
- data/ext/sources/examples/wchess/libwchess/test-chessboard.cpp +0 -117
- data/ext/sources/examples/wchess/wchess.cmd/CMakeLists.txt +0 -8
- data/ext/sources/examples/wchess/wchess.cmd/wchess.cmd.cpp +0 -253
- data/ext/sources/examples/whisper.wasm/CMakeLists.txt +0 -50
- data/ext/sources/examples/whisper.wasm/emscripten.cpp +0 -118
- data/ext/sources/examples/whisper.wasm/index-tmpl.html +0 -659
- data/ext/sources/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +0 -99
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp-msg.h +0 -155
- data/ext/sources/ggml/src/ggml-hexagon/op-desc.h +0 -153
- data/ext/sources/ggml/src/ggml-opencl/kernels/embed_kernel.py +0 -26
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.cpp +0 -123
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.h +0 -17
- data/ext/sources/ggml/src/ggml-virtgpu/regenerate_remoting.py +0 -333
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rte.glsl +0 -5
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +0 -182
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/glu.tmpl.wgsl +0 -323
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl +0 -718
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm.wgsl +0 -123
- data/ext/sources/tests/CMakeLists.txt +0 -112
- data/ext/sources/tests/earnings21/eval.mk +0 -58
- data/ext/sources/tests/earnings21/eval.py +0 -68
- data/ext/sources/tests/earnings21/normalizers/__init__.py +0 -2
- data/ext/sources/tests/earnings21/normalizers/basic.py +0 -80
- data/ext/sources/tests/earnings21/normalizers/english.json +0 -1741
- data/ext/sources/tests/earnings21/normalizers/english.py +0 -550
- data/ext/sources/tests/earnings21/requirements.txt +0 -6
- data/ext/sources/tests/en-0-ref.txt +0 -1
- data/ext/sources/tests/en-1-ref.txt +0 -1
- data/ext/sources/tests/en-2-ref.txt +0 -1
- data/ext/sources/tests/es-0-ref.txt +0 -1
- data/ext/sources/tests/librispeech/eval.mk +0 -39
- data/ext/sources/tests/librispeech/eval.py +0 -47
- data/ext/sources/tests/librispeech/normalizers/__init__.py +0 -2
- data/ext/sources/tests/librispeech/normalizers/basic.py +0 -80
- data/ext/sources/tests/librispeech/normalizers/english.json +0 -1741
- data/ext/sources/tests/librispeech/normalizers/english.py +0 -550
- data/ext/sources/tests/librispeech/requirements.txt +0 -6
- data/ext/sources/tests/run-tests.sh +0 -130
- data/ext/sources/tests/test-c.c +0 -3
- data/ext/sources/tests/test-vad-full.cpp +0 -56
- data/ext/sources/tests/test-vad.cpp +0 -83
- data/ext/sources/tests/test-whisper.js +0 -58
- data/lib/whisper/context.rb +0 -15
- data/lib/whisper/segment.rb +0 -58
- /data/ext/sources/ggml/src/ggml-opencl/kernels/{gemv_noshuffle_general_q8_0_f32.cl → gemv_noshuffle_q8_0_f32.cl} +0 -0
|
@@ -1,5 +1,7 @@
|
|
|
1
1
|
#pragma clang diagnostic ignored "-Wgnu-zero-variadic-macro-arguments"
|
|
2
2
|
#pragma clang diagnostic ignored "-Wunused-function"
|
|
3
|
+
#pragma clang diagnostic ignored "-Wunused-variable"
|
|
4
|
+
#pragma clang diagnostic ignored "-Wunused-but-set-variable"
|
|
3
5
|
|
|
4
6
|
#include <HAP_farf.h>
|
|
5
7
|
#include <HAP_perf.h>
|
|
@@ -10,19 +12,23 @@
|
|
|
10
12
|
#include <HAP_mem.h>
|
|
11
13
|
#include <HAP_power.h>
|
|
12
14
|
#include <HAP_ps.h>
|
|
15
|
+
#include <HAP_dcvs.h>
|
|
13
16
|
#include <qurt.h>
|
|
14
17
|
#include <qurt_thread.h>
|
|
18
|
+
#include <qurt_memory.h>
|
|
15
19
|
#include <remote.h>
|
|
16
20
|
#include <string.h>
|
|
17
21
|
|
|
18
|
-
#include "hex-dma.h"
|
|
19
22
|
#include "hex-utils.h"
|
|
23
|
+
#include "hex-dma.h"
|
|
24
|
+
#include "hmx-queue.h"
|
|
20
25
|
|
|
21
26
|
#define GGML_COMMON_DECL_C
|
|
22
27
|
#include "ggml-common.h"
|
|
23
28
|
#include "htp-ctx.h"
|
|
24
|
-
#include "htp-msg.h"
|
|
25
29
|
#include "htp-ops.h"
|
|
30
|
+
#include "htp-ops.h"
|
|
31
|
+
#include "htp_iface.h"
|
|
26
32
|
#include "worker-pool.h"
|
|
27
33
|
|
|
28
34
|
AEEResult htp_iface_open(const char * uri, remote_handle64 * handle) {
|
|
@@ -34,7 +40,7 @@ AEEResult htp_iface_open(const char * uri, remote_handle64 * handle) {
|
|
|
34
40
|
return AEE_ENOMEMORY;
|
|
35
41
|
}
|
|
36
42
|
|
|
37
|
-
// Use the context structure as
|
|
43
|
+
// Use the context structure as the handle
|
|
38
44
|
*handle = (remote_handle64) ctx;
|
|
39
45
|
|
|
40
46
|
// Enable FARF logs
|
|
@@ -58,8 +64,7 @@ AEEResult htp_iface_open(const char * uri, remote_handle64 * handle) {
|
|
|
58
64
|
|
|
59
65
|
request.type = HAP_power_set_DCVS_v3;
|
|
60
66
|
request.dcvs_v3.set_dcvs_enable = TRUE;
|
|
61
|
-
request.dcvs_v3.dcvs_enable =
|
|
62
|
-
request.dcvs_v3.dcvs_option = HAP_DCVS_V2_PERFORMANCE_MODE;
|
|
67
|
+
request.dcvs_v3.dcvs_enable = FALSE;
|
|
63
68
|
request.dcvs_v3.set_bus_params = TRUE;
|
|
64
69
|
request.dcvs_v3.bus_params.min_corner = HAP_DCVS_VCORNER_MAX;
|
|
65
70
|
request.dcvs_v3.bus_params.max_corner = HAP_DCVS_VCORNER_MAX;
|
|
@@ -70,6 +75,10 @@ AEEResult htp_iface_open(const char * uri, remote_handle64 * handle) {
|
|
|
70
75
|
request.dcvs_v3.core_params.target_corner = HAP_DCVS_VCORNER_MAX;
|
|
71
76
|
request.dcvs_v3.set_sleep_disable = TRUE;
|
|
72
77
|
request.dcvs_v3.sleep_disable = TRUE;
|
|
78
|
+
|
|
79
|
+
#if (__HEXAGON_ARCH__ >= 79)
|
|
80
|
+
HAP_set_dcvs_v3_protected_bus_corners(&request, 1);
|
|
81
|
+
#endif
|
|
73
82
|
if ((err = HAP_power_set((void *) ctx, &request)) != 0) {
|
|
74
83
|
return err;
|
|
75
84
|
}
|
|
@@ -82,6 +91,27 @@ AEEResult htp_iface_open(const char * uri, remote_handle64 * handle) {
|
|
|
82
91
|
}
|
|
83
92
|
}
|
|
84
93
|
|
|
94
|
+
#if __HVX_ARCH__ >= 75
|
|
95
|
+
{
|
|
96
|
+
// Power on HMX and set HMX clock
|
|
97
|
+
HAP_power_request_t request;
|
|
98
|
+
memset(&request, 0, sizeof(HAP_power_request_t));
|
|
99
|
+
request.type = HAP_power_set_HMX_v2;
|
|
100
|
+
request.hmx_v2.set_power = TRUE;
|
|
101
|
+
request.hmx_v2.power_up = TRUE;
|
|
102
|
+
request.hmx_v2.set_clock = TRUE;
|
|
103
|
+
request.hmx_v2.target_corner = HAP_DCVS_EXP_VCORNER_MAX;
|
|
104
|
+
request.hmx_v2.min_corner = HAP_DCVS_EXP_VCORNER_MAX;
|
|
105
|
+
request.hmx_v2.max_corner = HAP_DCVS_EXP_VCORNER_MAX;
|
|
106
|
+
request.hmx_v2.perf_mode = HAP_CLK_PERF_HIGH;
|
|
107
|
+
FARF(ALWAYS, "Setting HMX clock\n");
|
|
108
|
+
err = HAP_power_set((void *) ctx, &request);
|
|
109
|
+
if (err != AEE_SUCCESS) {
|
|
110
|
+
FARF(ERROR, "ggml-hex: error setting HMX clock.");
|
|
111
|
+
return err;
|
|
112
|
+
}
|
|
113
|
+
}
|
|
114
|
+
#else
|
|
85
115
|
{
|
|
86
116
|
// Power on HMX
|
|
87
117
|
HAP_power_request_t request;
|
|
@@ -89,12 +119,61 @@ AEEResult htp_iface_open(const char * uri, remote_handle64 * handle) {
|
|
|
89
119
|
request.type = HAP_power_set_HMX;
|
|
90
120
|
request.hmx.power_up = TRUE;
|
|
91
121
|
FARF(ALWAYS, "Powering HMX on\n");
|
|
92
|
-
err = HAP_power_set((void *)
|
|
122
|
+
err = HAP_power_set((void *) ctx, &request);
|
|
93
123
|
if (err != AEE_SUCCESS) {
|
|
94
|
-
FARF(ERROR, "
|
|
124
|
+
FARF(ERROR, "ggml-hex: error powering on HMX.");
|
|
95
125
|
return err;
|
|
96
126
|
}
|
|
97
127
|
}
|
|
128
|
+
#endif
|
|
129
|
+
|
|
130
|
+
return AEE_SUCCESS;
|
|
131
|
+
}
|
|
132
|
+
|
|
133
|
+
AEEResult htp_iface_etm(remote_handle64 handle, uint32_t enable) {
|
|
134
|
+
int err = enable ? HAP_user_etm_enable() : HAP_user_etm_disable();
|
|
135
|
+
if (err) {
|
|
136
|
+
if (err == AEE_EVERSIONNOTSUPPORT) {
|
|
137
|
+
FARF(ERROR, "API HAP_user_etm_enable/disable is not supported\n");
|
|
138
|
+
} else {
|
|
139
|
+
FARF(ERROR, "Error executing HAP_user_etm_enable/disable with error code : 0x%x\n", err);
|
|
140
|
+
}
|
|
141
|
+
}
|
|
142
|
+
return err;
|
|
143
|
+
}
|
|
144
|
+
|
|
145
|
+
AEEResult htp_iface_profiler(remote_handle64 handle, uint32_t mode, const htp_iface_pmu_conf* pmu_conf) {
|
|
146
|
+
struct htp_context * ctx = (struct htp_context *) handle;
|
|
147
|
+
if (!ctx) {
|
|
148
|
+
return AEE_EBADPARM;
|
|
149
|
+
}
|
|
150
|
+
|
|
151
|
+
if (mode == HTP_PROF_PMU) {
|
|
152
|
+
const uint32_t* events = pmu_conf->events;
|
|
153
|
+
|
|
154
|
+
// Pack 4 event IDs (low 8 bits) into each 32-bit config register
|
|
155
|
+
uint32_t evtcfg = 0, evtcfg1 = 0, cfg = 0, i = 0;
|
|
156
|
+
for (; i < HEX_NUM_PMU_COUNTERS/2; i++) {
|
|
157
|
+
evtcfg |= ((events[i + 0] & 0xFF) << (i * 8));
|
|
158
|
+
evtcfg1 |= ((events[i + 4] & 0xFF) << (i * 8));
|
|
159
|
+
}
|
|
160
|
+
|
|
161
|
+
// For events >255 pack high 2 bits of all 8 event IDs into cfg register
|
|
162
|
+
// 2 bits per counter: bits [1:0] for counter 0, [3:2] for counter 1, etc.
|
|
163
|
+
for (i = 0; i < HEX_NUM_PMU_COUNTERS; i++) {
|
|
164
|
+
cfg |= (((events[i] >> 8) & 3) << (i * 2));
|
|
165
|
+
}
|
|
166
|
+
|
|
167
|
+
FARF(ALWAYS, "Configuring PMU registers: evtcfg = 0x%x, evtcfg1 = 0x%x, pmucfg = 0x%x", evtcfg, evtcfg1, cfg);
|
|
168
|
+
|
|
169
|
+
// Configure PMU registers
|
|
170
|
+
qurt_pmu_set(QURT_PMUCFG, cfg);
|
|
171
|
+
qurt_pmu_set(QURT_PMUEVTCFG, evtcfg);
|
|
172
|
+
qurt_pmu_set(QURT_PMUEVTCFG1, evtcfg1);
|
|
173
|
+
qurt_pmu_enable(1);
|
|
174
|
+
}
|
|
175
|
+
|
|
176
|
+
ctx->profiler = mode;
|
|
98
177
|
|
|
99
178
|
return AEE_SUCCESS;
|
|
100
179
|
}
|
|
@@ -111,91 +190,128 @@ AEEResult htp_iface_close(remote_handle64 handle) {
|
|
|
111
190
|
return AEE_EITEMBUSY;
|
|
112
191
|
}
|
|
113
192
|
|
|
193
|
+
// release the mmaps (if any)
|
|
194
|
+
for (uint32_t i=0; i<HTP_MAX_MMAPS; i++) {
|
|
195
|
+
if (ctx->mmap[i].size) {
|
|
196
|
+
#if __HVX_ARCH__ > 73
|
|
197
|
+
HAP_munmap2((void *) ctx->mmap[i].base, ctx->mmap[i].size);
|
|
198
|
+
#else
|
|
199
|
+
HAP_munmap((void *) ctx->mmap[i].base, ctx->mmap[i].size);
|
|
200
|
+
#endif
|
|
201
|
+
ctx->mmap[i].size = 0;
|
|
202
|
+
ctx->mmap[i].base = NULL;
|
|
203
|
+
ctx->mmap[i].fd = -1;
|
|
204
|
+
}
|
|
205
|
+
}
|
|
206
|
+
|
|
207
|
+
if (ctx->profiler) {
|
|
208
|
+
qurt_pmu_enable(1);
|
|
209
|
+
}
|
|
210
|
+
|
|
211
|
+
if (ctx->etm) {
|
|
212
|
+
HAP_user_etm_disable();
|
|
213
|
+
}
|
|
214
|
+
|
|
114
215
|
free(ctx);
|
|
115
216
|
return AEE_SUCCESS;
|
|
116
217
|
}
|
|
117
218
|
|
|
118
|
-
AEEResult
|
|
119
|
-
|
|
120
|
-
if (
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
219
|
+
AEEResult htp_iface_mmap(remote_handle64 handle, uint32_t fd, uint32_t size) {
|
|
220
|
+
struct htp_context * ctx = (struct htp_context *) handle;
|
|
221
|
+
if (!ctx) {
|
|
222
|
+
return AEE_EBADPARM;
|
|
223
|
+
}
|
|
224
|
+
|
|
225
|
+
// See if we already have this mapping
|
|
226
|
+
for (uint32_t i=0; i<HTP_MAX_MMAPS; i++) {
|
|
227
|
+
struct htp_mmap *m = &ctx->mmap[i];
|
|
228
|
+
if (m->fd == fd) {
|
|
229
|
+
return AEE_SUCCESS;
|
|
125
230
|
}
|
|
126
231
|
}
|
|
127
|
-
|
|
232
|
+
|
|
233
|
+
// Add new mapping
|
|
234
|
+
for (uint32_t i=0; i<HTP_MAX_MMAPS; i++) {
|
|
235
|
+
struct htp_mmap *m = &ctx->mmap[i];
|
|
236
|
+
if (!m->size) {
|
|
237
|
+
FARF(HIGH, "mmap : fd %u size %u", fd, size);
|
|
238
|
+
#if __HVX_ARCH__ > 73
|
|
239
|
+
void *va = HAP_mmap2(NULL, size, HAP_PROT_READ | HAP_PROT_WRITE, 0, fd, 0);
|
|
240
|
+
#else
|
|
241
|
+
if (size > HTP_MMAP_MAX_VMEM) { // HAP_mmap has a size limit of 2GB
|
|
242
|
+
FARF(ERROR, "mmap failed : size %u exceeds 2GB limit for HAP_mmap", (uint32_t) size);
|
|
243
|
+
abort(); // can't do much else at this point
|
|
244
|
+
}
|
|
245
|
+
|
|
246
|
+
void *va = HAP_mmap(NULL, size, HAP_PROT_READ | HAP_PROT_WRITE, 0, fd, 0);
|
|
247
|
+
#endif
|
|
248
|
+
if (va == (void*)-1) {
|
|
249
|
+
FARF(ERROR, "mmap failed : va %p fd %u size %u", va, fd, (uint32_t) size);
|
|
250
|
+
return AEE_EFAILED;
|
|
251
|
+
}
|
|
252
|
+
|
|
253
|
+
m->base = (uint64_t) va;
|
|
254
|
+
m->fd = fd;
|
|
255
|
+
m->size = size;
|
|
256
|
+
|
|
257
|
+
return AEE_SUCCESS;
|
|
258
|
+
}
|
|
259
|
+
}
|
|
260
|
+
|
|
261
|
+
return AEE_ENOMEMORY;
|
|
128
262
|
}
|
|
129
263
|
|
|
130
|
-
AEEResult
|
|
131
|
-
|
|
132
|
-
if (
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
264
|
+
AEEResult htp_iface_munmap(remote_handle64 handle, uint32 fd) {
|
|
265
|
+
struct htp_context * ctx = (struct htp_context *) handle;
|
|
266
|
+
if (!ctx) {
|
|
267
|
+
return AEE_EBADPARM;
|
|
268
|
+
}
|
|
269
|
+
|
|
270
|
+
for (uint32_t i=0; i<HTP_MAX_MMAPS; i++) {
|
|
271
|
+
struct htp_mmap *m = &ctx->mmap[i];
|
|
272
|
+
if (fd < 0 || m->fd == fd) {
|
|
273
|
+
FARF(HIGH, "unmmap : base %p fd %u size %u", (void*) m->base, m->fd, (uint32_t) m->size);
|
|
274
|
+
#if __HVX_ARCH__ > 73
|
|
275
|
+
HAP_munmap2((void *) m->base, m->size);
|
|
276
|
+
#else
|
|
277
|
+
HAP_munmap((void *) m->base, m->size);
|
|
278
|
+
#endif
|
|
279
|
+
m->size = 0;
|
|
280
|
+
m->base = NULL;
|
|
281
|
+
m->fd = -1;
|
|
137
282
|
}
|
|
138
283
|
}
|
|
139
|
-
|
|
284
|
+
|
|
285
|
+
return AEE_SUCCESS;
|
|
140
286
|
}
|
|
141
287
|
|
|
142
|
-
static
|
|
143
|
-
int err;
|
|
288
|
+
static void vtcm_acquire(struct htp_context * ctx) {
|
|
144
289
|
if (!ctx->vtcm_valid) {
|
|
145
|
-
|
|
146
|
-
// This way the resource manager will notify the other thread to release VTCM.
|
|
147
|
-
// Note that we need to reaquire VTCM at normal priority for this to work next time.
|
|
148
|
-
qurt_thread_set_priority(qurt_thread_get_id(), ctx->thread_prio - 10);
|
|
149
|
-
err = HAP_compute_res_acquire_cached(ctx->vtcm_rctx, 1000000);
|
|
290
|
+
int err = HAP_compute_res_acquire_cached(ctx->vtcm_rctx, 1000000u);
|
|
150
291
|
if (err != 0) {
|
|
151
|
-
FARF(ERROR, "
|
|
292
|
+
FARF(ERROR, "ggml-hex: failed to acquire VTCM: 0x%08x", (unsigned)err);
|
|
152
293
|
abort();
|
|
153
294
|
}
|
|
154
|
-
HAP_compute_res_release_cached(ctx->vtcm_rctx);
|
|
155
|
-
qurt_thread_set_priority(qurt_thread_get_id(), ctx->thread_prio);
|
|
156
295
|
|
|
157
|
-
|
|
158
|
-
if (err != 0) {
|
|
159
|
-
FARF(ERROR, "Failed to acquire VTCM: 0x%08x", (unsigned)err);
|
|
160
|
-
abort();
|
|
161
|
-
}
|
|
296
|
+
ctx->vtcm_needs_release = false;
|
|
162
297
|
ctx->vtcm_valid = true;
|
|
163
|
-
}
|
|
164
298
|
|
|
165
|
-
|
|
166
|
-
|
|
299
|
+
// Drop the priority to make sure we get the release callback from other GGML-HTP and QNN-HTP sessions
|
|
300
|
+
HAP_compute_res_update_priority(ctx->vtcm_rctx, ctx->thread_prio + 10);
|
|
301
|
+
}
|
|
167
302
|
}
|
|
168
303
|
|
|
169
|
-
static
|
|
170
|
-
ctx->
|
|
171
|
-
|
|
172
|
-
if (ctx->vtcm_valid && ctx->vtcm_needs_release) {
|
|
304
|
+
static void vtcm_release(struct htp_context * ctx) {
|
|
305
|
+
if (ctx->vtcm_valid) {
|
|
173
306
|
ctx->vtcm_valid = false;
|
|
174
307
|
ctx->vtcm_needs_release = false;
|
|
175
308
|
HAP_compute_res_release_cached(ctx->vtcm_rctx);
|
|
176
309
|
}
|
|
177
|
-
|
|
178
|
-
return 0;
|
|
179
310
|
}
|
|
180
311
|
|
|
181
312
|
static int vtcm_release_callback(unsigned int rctx, void * state) {
|
|
182
313
|
struct htp_context * ctx = (struct htp_context *) state;
|
|
183
|
-
|
|
184
|
-
if (!ctx || ctx->vtcm_rctx != rctx) {
|
|
185
|
-
return AEE_EBADPARM;
|
|
186
|
-
}
|
|
187
|
-
|
|
188
|
-
// If VTCM is not inuse (not processing Ops) release it right here
|
|
189
|
-
// otherwise we'll release it once we're done with the current Op.
|
|
190
|
-
|
|
191
|
-
if (ctx->vtcm_inuse) {
|
|
192
|
-
ctx->vtcm_needs_release = true;
|
|
193
|
-
return 0;
|
|
194
|
-
}
|
|
195
|
-
|
|
196
|
-
ctx->vtcm_valid = false;
|
|
197
|
-
HAP_compute_res_release_cached(ctx->vtcm_rctx);
|
|
198
|
-
|
|
314
|
+
ctx->vtcm_needs_release = true;
|
|
199
315
|
return 0;
|
|
200
316
|
}
|
|
201
317
|
|
|
@@ -207,7 +323,7 @@ static int vtcm_alloc(struct htp_context * ctx) {
|
|
|
207
323
|
HAP_compute_res_attr_init(&attr);
|
|
208
324
|
HAP_compute_res_attr_set_serialize(&attr, 0);
|
|
209
325
|
HAP_compute_res_attr_set_cache_mode(&attr, 1);
|
|
210
|
-
HAP_compute_res_attr_set_vtcm_param_v2(&attr, vtcm_size,
|
|
326
|
+
HAP_compute_res_attr_set_vtcm_param_v2(&attr, vtcm_size, vtcm_size, vtcm_size); // single page
|
|
211
327
|
HAP_compute_res_attr_set_release_callback(&attr, vtcm_release_callback, (void *) ctx);
|
|
212
328
|
HAP_compute_res_attr_set_hmx_param(&attr, 1);
|
|
213
329
|
|
|
@@ -229,7 +345,6 @@ static int vtcm_alloc(struct htp_context * ctx) {
|
|
|
229
345
|
ctx->vtcm_size = vtcm_size;
|
|
230
346
|
ctx->vtcm_rctx = rctx;
|
|
231
347
|
ctx->vtcm_valid = false;
|
|
232
|
-
ctx->vtcm_inuse = false;
|
|
233
348
|
ctx->vtcm_needs_release = false;
|
|
234
349
|
|
|
235
350
|
return 0;
|
|
@@ -246,7 +361,7 @@ static void vtcm_free(struct htp_context * ctx) {
|
|
|
246
361
|
static void htp_packet_callback(dspqueue_t queue, int error, void * context);
|
|
247
362
|
static void htp_error_callback(dspqueue_t queue, int error, void * context);
|
|
248
363
|
|
|
249
|
-
AEEResult htp_iface_start(remote_handle64 handle, uint32 sess_id, uint64 dsp_queue_id, uint32 n_hvx) {
|
|
364
|
+
AEEResult htp_iface_start(remote_handle64 handle, uint32 sess_id, uint64 dsp_queue_id, uint32 n_hvx, uint32 use_hmx, uint64_t max_vmem) {
|
|
250
365
|
struct htp_context * ctx = (struct htp_context *) handle;
|
|
251
366
|
|
|
252
367
|
if (!ctx) {
|
|
@@ -264,12 +379,12 @@ AEEResult htp_iface_start(remote_handle64 handle, uint32 sess_id, uint64 dsp_que
|
|
|
264
379
|
htp_error_callback, // Error callback; no errors expected on the DSP
|
|
265
380
|
(void *) ctx, // Callback context
|
|
266
381
|
&ctx->queue);
|
|
267
|
-
|
|
268
382
|
if (err) {
|
|
269
383
|
FARF(ERROR, "Queue import failed with 0x%08x", (unsigned) err);
|
|
270
384
|
return err;
|
|
271
385
|
}
|
|
272
386
|
|
|
387
|
+
ctx->max_vmem = max_vmem;
|
|
273
388
|
ctx->thread_id = qurt_thread_get_id();
|
|
274
389
|
ctx->thread_prio = qurt_thread_get_priority(ctx->thread_id);
|
|
275
390
|
|
|
@@ -280,6 +395,19 @@ AEEResult htp_iface_start(remote_handle64 handle, uint32 sess_id, uint64 dsp_que
|
|
|
280
395
|
return AEE_ENOMEMORY;
|
|
281
396
|
}
|
|
282
397
|
|
|
398
|
+
#ifdef HTP_HAS_HMX
|
|
399
|
+
ctx->hmx_enabled = use_hmx;
|
|
400
|
+
ctx->hmx_queue = NULL;
|
|
401
|
+
if (use_hmx) {
|
|
402
|
+
ctx->hmx_queue = hmx_queue_create(16, ctx->vtcm_rctx);
|
|
403
|
+
if (!ctx->hmx_queue) {
|
|
404
|
+
FARF(ERROR, "hmx-queue-create failed");
|
|
405
|
+
ctx->hmx_enabled = false;
|
|
406
|
+
}
|
|
407
|
+
}
|
|
408
|
+
FARF(HIGH, "HMX %s (use_hmx=%d)", ctx->hmx_enabled ? "enabled" : "disabled", use_hmx);
|
|
409
|
+
#endif
|
|
410
|
+
|
|
283
411
|
qurt_sysenv_max_hthreads_t hw_threads;
|
|
284
412
|
qurt_sysenv_get_max_hw_threads(&hw_threads);
|
|
285
413
|
uint32_t hw_nhvx = (qurt_hvx_get_units() >> 8) & 0xFF;
|
|
@@ -296,14 +424,21 @@ AEEResult htp_iface_start(remote_handle64 handle, uint32 sess_id, uint64 dsp_que
|
|
|
296
424
|
|
|
297
425
|
ctx->n_threads = n_hvx;
|
|
298
426
|
for (int i = 0; i < ctx->n_threads; i++) {
|
|
299
|
-
//
|
|
300
|
-
ctx->dma[i] = dma_queue_create(64);
|
|
427
|
+
ctx->dma[i] = dma_queue_create(256); // queue depth
|
|
301
428
|
}
|
|
302
429
|
|
|
430
|
+
ctx->ddr_spad_size = 512 * 1024; // 512 KB
|
|
431
|
+
ctx->ddr_spad_base = memalign(128, ctx->ddr_spad_size);
|
|
432
|
+
|
|
303
433
|
// init worker pool
|
|
304
434
|
err = worker_pool_init(&ctx->worker_pool, n_hvx);
|
|
305
435
|
if (err != AEE_SUCCESS) {
|
|
306
436
|
FARF(ERROR, "Unable to create worker pool");
|
|
437
|
+
if (ctx->ddr_spad_base) {
|
|
438
|
+
free(ctx->ddr_spad_base);
|
|
439
|
+
ctx->ddr_spad_base = NULL;
|
|
440
|
+
ctx->ddr_spad_size = 0;
|
|
441
|
+
}
|
|
307
442
|
return err;
|
|
308
443
|
}
|
|
309
444
|
|
|
@@ -341,8 +476,22 @@ AEEResult htp_iface_stop(remote_handle64 handle) {
|
|
|
341
476
|
dma_queue_delete(ctx->dma[i]);
|
|
342
477
|
}
|
|
343
478
|
|
|
479
|
+
#ifdef HTP_HAS_HMX
|
|
480
|
+
if (ctx->hmx_queue) {
|
|
481
|
+
hmx_queue_delete(ctx->hmx_queue);
|
|
482
|
+
ctx->hmx_queue = NULL;
|
|
483
|
+
}
|
|
484
|
+
ctx->hmx_enabled = false;
|
|
485
|
+
#endif
|
|
486
|
+
|
|
344
487
|
vtcm_free(ctx);
|
|
345
488
|
|
|
489
|
+
if (ctx->ddr_spad_base) {
|
|
490
|
+
free(ctx->ddr_spad_base);
|
|
491
|
+
ctx->ddr_spad_base = NULL;
|
|
492
|
+
ctx->ddr_spad_size = 0;
|
|
493
|
+
}
|
|
494
|
+
|
|
346
495
|
return AEE_SUCCESS;
|
|
347
496
|
}
|
|
348
497
|
|
|
@@ -354,846 +503,411 @@ static void htp_error_callback(dspqueue_t queue, int error, void * context) {
|
|
|
354
503
|
struct profile_data {
|
|
355
504
|
uint64_t usecs;
|
|
356
505
|
uint64_t cycles;
|
|
357
|
-
|
|
506
|
+
uint32_t pmu_counters[HEX_NUM_PMU_COUNTERS];
|
|
358
507
|
};
|
|
359
508
|
|
|
360
|
-
static inline void profile_start(struct profile_data * d) {
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
509
|
+
static inline void profile_start(uint32_t mode, struct profile_data * d) {
|
|
510
|
+
switch (mode) {
|
|
511
|
+
case HTP_PROF_PMU:
|
|
512
|
+
hex_get_pmu(d->pmu_counters);
|
|
513
|
+
// fallthrough
|
|
514
|
+
case HTP_PROF_BASIC:
|
|
515
|
+
d->usecs = HAP_perf_get_qtimer_count();
|
|
516
|
+
d->cycles = hex_get_cycles();
|
|
517
|
+
break;
|
|
518
|
+
default:
|
|
519
|
+
break;
|
|
520
|
+
}
|
|
364
521
|
}
|
|
365
522
|
|
|
366
|
-
static inline void profile_stop(struct profile_data * d) {
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
|
|
523
|
+
static inline void profile_stop(uint32_t mode, struct profile_data * d) {
|
|
524
|
+
uint32_t pmu_counters[HEX_NUM_PMU_COUNTERS];
|
|
525
|
+
switch (mode) {
|
|
526
|
+
case HTP_PROF_PMU:
|
|
527
|
+
hex_get_pmu(pmu_counters);
|
|
528
|
+
for (int i = 0; i < HEX_NUM_PMU_COUNTERS; i++) {
|
|
529
|
+
d->pmu_counters[i] = pmu_counters[i] - d->pmu_counters[i];
|
|
530
|
+
}
|
|
531
|
+
// fallthrough
|
|
532
|
+
case HTP_PROF_BASIC:
|
|
533
|
+
d->usecs = HAP_perf_qtimer_count_to_us(HAP_perf_get_qtimer_count() - d->usecs);
|
|
534
|
+
d->cycles = hex_get_cycles() - d->cycles;
|
|
535
|
+
break;
|
|
536
|
+
default:
|
|
537
|
+
break;
|
|
538
|
+
}
|
|
370
539
|
}
|
|
371
540
|
|
|
372
|
-
static int
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
size_t n_bufs,
|
|
377
|
-
struct profile_data * prof) {
|
|
378
|
-
// Prep response struct
|
|
379
|
-
struct htp_general_rsp rsp;
|
|
380
|
-
rsp.op = op;
|
|
381
|
-
rsp.status = status;
|
|
382
|
-
rsp.prof_usecs = prof->usecs;
|
|
383
|
-
rsp.prof_cycles = prof->cycles;
|
|
384
|
-
rsp.prof_pkts = prof->pkts;
|
|
385
|
-
|
|
386
|
-
int err = dspqueue_write(c->queue,
|
|
387
|
-
0, // Flags
|
|
388
|
-
n_bufs,
|
|
389
|
-
bufs, // Buffer references
|
|
390
|
-
sizeof(rsp),
|
|
391
|
-
(const uint8_t *) &rsp, // Message
|
|
392
|
-
DSPQUEUE_TIMEOUT_NONE);
|
|
541
|
+
static int execute_op(struct htp_ops_context * octx) {
|
|
542
|
+
switch (octx->op) {
|
|
543
|
+
case HTP_OP_MUL_MAT:
|
|
544
|
+
return op_matmul(octx);
|
|
393
545
|
|
|
394
|
-
|
|
395
|
-
|
|
396
|
-
}
|
|
546
|
+
case HTP_OP_MUL_MAT_ID:
|
|
547
|
+
return op_matmul_id(octx);
|
|
397
548
|
|
|
398
|
-
|
|
399
|
-
|
|
549
|
+
case HTP_OP_MUL:
|
|
550
|
+
case HTP_OP_ADD:
|
|
551
|
+
case HTP_OP_SUB:
|
|
552
|
+
case HTP_OP_DIV:
|
|
553
|
+
case HTP_OP_ADD_ID:
|
|
554
|
+
return op_binary(octx);
|
|
400
555
|
|
|
401
|
-
|
|
402
|
-
|
|
403
|
-
|
|
404
|
-
|
|
405
|
-
|
|
406
|
-
|
|
407
|
-
|
|
408
|
-
|
|
409
|
-
|
|
410
|
-
|
|
411
|
-
|
|
412
|
-
|
|
413
|
-
|
|
414
|
-
|
|
415
|
-
// Setup Op context
|
|
416
|
-
struct htp_ops_context octx = { 0 };
|
|
417
|
-
octx.ctx = ctx;
|
|
418
|
-
octx.src0 = req->src0;
|
|
419
|
-
octx.src1 = req->src1;
|
|
420
|
-
octx.dst = req->dst;
|
|
421
|
-
octx.flags = req->flags;
|
|
422
|
-
octx.op = req->op;
|
|
423
|
-
|
|
424
|
-
// Update data pointers
|
|
425
|
-
octx.src0.data = (uint32_t) bufs[0].ptr;
|
|
426
|
-
octx.src1.data = (uint32_t) bufs[1].ptr;
|
|
427
|
-
octx.dst.data = (uint32_t) bufs[2].ptr;
|
|
428
|
-
octx.n_threads = ctx->n_threads;
|
|
429
|
-
|
|
430
|
-
struct profile_data prof;
|
|
431
|
-
profile_start(&prof);
|
|
432
|
-
|
|
433
|
-
uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR;
|
|
434
|
-
if (vtcm_acquire(ctx) == AEE_SUCCESS) {
|
|
435
|
-
rsp_status = op_matmul(&octx);
|
|
436
|
-
vtcm_release(ctx);
|
|
437
|
-
}
|
|
556
|
+
case HTP_OP_NORM:
|
|
557
|
+
case HTP_OP_RMS_NORM:
|
|
558
|
+
case HTP_OP_RMS_NORM_MUL:
|
|
559
|
+
case HTP_OP_SCALE:
|
|
560
|
+
case HTP_OP_SQR:
|
|
561
|
+
case HTP_OP_SQRT:
|
|
562
|
+
case HTP_OP_UNARY_SOFTPLUS:
|
|
563
|
+
case HTP_OP_UNARY_SIGMOID:
|
|
564
|
+
case HTP_OP_UNARY_NEG:
|
|
565
|
+
case HTP_OP_UNARY_EXP:
|
|
566
|
+
case HTP_OP_UNARY_TANH:
|
|
567
|
+
case HTP_OP_L2_NORM:
|
|
568
|
+
return op_unary(octx);
|
|
438
569
|
|
|
439
|
-
|
|
440
|
-
|
|
441
|
-
|
|
570
|
+
case HTP_OP_UNARY_SILU:
|
|
571
|
+
case HTP_OP_UNARY_GELU:
|
|
572
|
+
case HTP_OP_GLU_SWIGLU:
|
|
573
|
+
case HTP_OP_GLU_SWIGLU_OAI:
|
|
574
|
+
case HTP_OP_GLU_GEGLU:
|
|
575
|
+
return op_activations(octx);
|
|
442
576
|
|
|
443
|
-
|
|
444
|
-
|
|
445
|
-
|
|
446
|
-
// We had written to the output buffer, we'd also need to flush it
|
|
447
|
-
rsp_bufs[0].fd = bufs[1].fd;
|
|
448
|
-
rsp_bufs[0].ptr = bufs[1].ptr;
|
|
449
|
-
rsp_bufs[0].offset = bufs[1].offset;
|
|
450
|
-
rsp_bufs[0].size = bufs[1].size;
|
|
451
|
-
rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP
|
|
452
|
-
DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU
|
|
453
|
-
|
|
454
|
-
// Setup Op context
|
|
455
|
-
struct htp_ops_context octx = { 0 };
|
|
456
|
-
octx.ctx = ctx;
|
|
457
|
-
octx.src0 = req->src0;
|
|
458
|
-
octx.dst = req->dst;
|
|
459
|
-
octx.flags = req->flags;
|
|
460
|
-
octx.op = req->op;
|
|
461
|
-
|
|
462
|
-
memcpy(octx.op_params, req->op_params, sizeof(octx.op_params));
|
|
463
|
-
|
|
464
|
-
// Update data pointers
|
|
465
|
-
octx.src0.data = (uint32_t) bufs[0].ptr;
|
|
466
|
-
octx.dst.data = (uint32_t) bufs[1].ptr;
|
|
467
|
-
octx.n_threads = ctx->n_threads;
|
|
468
|
-
|
|
469
|
-
struct profile_data prof;
|
|
470
|
-
profile_start(&prof);
|
|
471
|
-
|
|
472
|
-
uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR;
|
|
473
|
-
if (vtcm_acquire(ctx) == AEE_SUCCESS) {
|
|
474
|
-
rsp_status = op_argsort(&octx);
|
|
475
|
-
vtcm_release(ctx);
|
|
476
|
-
}
|
|
577
|
+
case HTP_OP_SOFTMAX:
|
|
578
|
+
return op_softmax(octx);
|
|
477
579
|
|
|
478
|
-
|
|
479
|
-
|
|
480
|
-
}
|
|
580
|
+
case HTP_OP_ROPE:
|
|
581
|
+
return op_rope(octx);
|
|
481
582
|
|
|
482
|
-
|
|
483
|
-
|
|
484
|
-
|
|
485
|
-
// We had written to the output buffer, we'd also need to flush it
|
|
486
|
-
rsp_bufs[0].fd = bufs[1].fd;
|
|
487
|
-
rsp_bufs[0].ptr = bufs[1].ptr;
|
|
488
|
-
rsp_bufs[0].offset = bufs[1].offset;
|
|
489
|
-
rsp_bufs[0].size = bufs[1].size;
|
|
490
|
-
rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP
|
|
491
|
-
DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU
|
|
492
|
-
|
|
493
|
-
// Setup Op context
|
|
494
|
-
struct htp_ops_context octx = { 0 };
|
|
495
|
-
octx.ctx = ctx;
|
|
496
|
-
octx.src0 = req->src0;
|
|
497
|
-
octx.dst = req->dst;
|
|
498
|
-
octx.flags = req->flags;
|
|
499
|
-
octx.op = req->op;
|
|
500
|
-
|
|
501
|
-
// Update data pointers
|
|
502
|
-
octx.src0.data = (uint32_t) bufs[0].ptr;
|
|
503
|
-
octx.dst.data = (uint32_t) bufs[1].ptr;
|
|
504
|
-
octx.n_threads = ctx->n_threads;
|
|
505
|
-
|
|
506
|
-
struct profile_data prof;
|
|
507
|
-
profile_start(&prof);
|
|
508
|
-
|
|
509
|
-
uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR;
|
|
510
|
-
if (vtcm_acquire(ctx) == AEE_SUCCESS) {
|
|
511
|
-
rsp_status = op_cpy(&octx);
|
|
512
|
-
vtcm_release(ctx);
|
|
513
|
-
}
|
|
583
|
+
case HTP_OP_FLASH_ATTN_EXT:
|
|
584
|
+
return op_flash_attn_ext(octx);
|
|
514
585
|
|
|
515
|
-
|
|
516
|
-
|
|
517
|
-
}
|
|
586
|
+
case HTP_OP_SET_ROWS:
|
|
587
|
+
return op_set_rows(octx);
|
|
518
588
|
|
|
519
|
-
|
|
520
|
-
|
|
521
|
-
|
|
522
|
-
// We had written to the output buffer, we'd also need to flush it
|
|
523
|
-
rsp_bufs[0].fd = bufs[2].fd;
|
|
524
|
-
rsp_bufs[0].ptr = bufs[2].ptr;
|
|
525
|
-
rsp_bufs[0].offset = bufs[2].offset;
|
|
526
|
-
rsp_bufs[0].size = bufs[2].size;
|
|
527
|
-
rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP
|
|
528
|
-
DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU
|
|
529
|
-
|
|
530
|
-
// Setup Op context
|
|
531
|
-
struct htp_ops_context octx = { 0 };
|
|
532
|
-
octx.ctx = ctx;
|
|
533
|
-
octx.src0 = req->src0;
|
|
534
|
-
octx.src1 = req->src1;
|
|
535
|
-
octx.dst = req->dst;
|
|
536
|
-
octx.flags = req->flags;
|
|
537
|
-
octx.op = req->op;
|
|
538
|
-
|
|
539
|
-
// Update data pointers
|
|
540
|
-
octx.src0.data = (uint32_t) bufs[0].ptr;
|
|
541
|
-
octx.src1.data = (uint32_t) bufs[1].ptr;
|
|
542
|
-
octx.dst.data = (uint32_t) bufs[2].ptr;
|
|
543
|
-
octx.n_threads = ctx->n_threads;
|
|
544
|
-
|
|
545
|
-
struct profile_data prof;
|
|
546
|
-
profile_start(&prof);
|
|
547
|
-
|
|
548
|
-
uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR;
|
|
549
|
-
if (vtcm_acquire(ctx) == AEE_SUCCESS) {
|
|
550
|
-
rsp_status = op_get_rows(&octx);
|
|
551
|
-
vtcm_release(ctx);
|
|
552
|
-
}
|
|
589
|
+
case HTP_OP_GET_ROWS:
|
|
590
|
+
return op_get_rows(octx);
|
|
553
591
|
|
|
554
|
-
|
|
555
|
-
|
|
556
|
-
}
|
|
592
|
+
case HTP_OP_SUM_ROWS:
|
|
593
|
+
return op_sum_rows(octx);
|
|
557
594
|
|
|
558
|
-
|
|
559
|
-
|
|
560
|
-
struct dspqueue_buffer * bufs,
|
|
561
|
-
size_t n_bufs) {
|
|
562
|
-
struct dspqueue_buffer rsp_bufs[1];
|
|
563
|
-
|
|
564
|
-
// We had written to the output buffer, we'd also need to flush it
|
|
565
|
-
rsp_bufs[0].fd = bufs[3].fd;
|
|
566
|
-
rsp_bufs[0].ptr = bufs[3].ptr;
|
|
567
|
-
rsp_bufs[0].size = bufs[3].size;
|
|
568
|
-
rsp_bufs[0].offset = bufs[3].offset;
|
|
569
|
-
rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP
|
|
570
|
-
DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU
|
|
571
|
-
|
|
572
|
-
// Setup Op context
|
|
573
|
-
struct htp_ops_context octx = { 0 };
|
|
574
|
-
octx.ctx = ctx;
|
|
575
|
-
octx.src0 = req->src0;
|
|
576
|
-
octx.src1 = req->src1;
|
|
577
|
-
octx.src2 = req->src2;
|
|
578
|
-
octx.dst = req->dst;
|
|
579
|
-
octx.flags = req->flags;
|
|
580
|
-
octx.op = req->op;
|
|
581
|
-
|
|
582
|
-
// Update data pointers
|
|
583
|
-
octx.src0.data = (uint32_t) bufs[0].ptr;
|
|
584
|
-
octx.src1.data = (uint32_t) bufs[1].ptr;
|
|
585
|
-
octx.src2.data = (uint32_t) bufs[2].ptr;
|
|
586
|
-
octx.dst.data = (uint32_t) bufs[3].ptr;
|
|
587
|
-
octx.n_threads = ctx->n_threads;
|
|
588
|
-
|
|
589
|
-
struct profile_data prof;
|
|
590
|
-
profile_start(&prof);
|
|
591
|
-
|
|
592
|
-
uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR;
|
|
593
|
-
if (vtcm_acquire(ctx) == AEE_SUCCESS) {
|
|
594
|
-
rsp_status = op_matmul_id(&octx);
|
|
595
|
-
vtcm_release(ctx);
|
|
596
|
-
}
|
|
595
|
+
case HTP_OP_CPY:
|
|
596
|
+
return op_cpy(octx);
|
|
597
597
|
|
|
598
|
-
|
|
599
|
-
|
|
600
|
-
}
|
|
598
|
+
case HTP_OP_REPEAT:
|
|
599
|
+
return op_repeat(octx);
|
|
601
600
|
|
|
602
|
-
|
|
603
|
-
|
|
604
|
-
|
|
605
|
-
// We had written to the output buffer, we'd also need to flush it
|
|
606
|
-
rsp_bufs[0].fd = bufs[2].fd;
|
|
607
|
-
rsp_bufs[0].ptr = bufs[2].ptr;
|
|
608
|
-
rsp_bufs[0].offset = bufs[2].offset;
|
|
609
|
-
rsp_bufs[0].size = bufs[2].size;
|
|
610
|
-
rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP
|
|
611
|
-
DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU
|
|
612
|
-
|
|
613
|
-
// Setup Op context
|
|
614
|
-
struct htp_ops_context octx = { 0 };
|
|
615
|
-
octx.ctx = ctx;
|
|
616
|
-
octx.src0 = req->src0;
|
|
617
|
-
octx.src1 = req->src1;
|
|
618
|
-
octx.dst = req->dst;
|
|
619
|
-
octx.flags = req->flags;
|
|
620
|
-
octx.op = req->op;
|
|
621
|
-
|
|
622
|
-
// Update data pointers
|
|
623
|
-
octx.src0.data = (uint32_t) bufs[0].ptr;
|
|
624
|
-
octx.src1.data = (uint32_t) bufs[1].ptr;
|
|
625
|
-
octx.dst.data = (uint32_t) bufs[2].ptr;
|
|
626
|
-
octx.n_threads = ctx->n_threads;
|
|
627
|
-
|
|
628
|
-
struct profile_data prof;
|
|
629
|
-
profile_start(&prof);
|
|
630
|
-
|
|
631
|
-
uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR;
|
|
632
|
-
if (vtcm_acquire(ctx) == AEE_SUCCESS) {
|
|
633
|
-
rsp_status = op_binary(&octx);
|
|
634
|
-
vtcm_release(ctx);
|
|
635
|
-
}
|
|
601
|
+
case HTP_OP_ARGSORT:
|
|
602
|
+
return op_argsort(octx);
|
|
636
603
|
|
|
637
|
-
|
|
638
|
-
|
|
639
|
-
}
|
|
604
|
+
case HTP_OP_SSM_CONV:
|
|
605
|
+
return op_ssm_conv(octx);
|
|
640
606
|
|
|
641
|
-
|
|
642
|
-
|
|
643
|
-
|
|
644
|
-
// We had written to the output buffer, we'd also need to flush it
|
|
645
|
-
rsp_bufs[0].fd = bufs[3].fd;
|
|
646
|
-
rsp_bufs[0].ptr = bufs[3].ptr;
|
|
647
|
-
rsp_bufs[0].offset = bufs[3].offset;
|
|
648
|
-
rsp_bufs[0].size = bufs[3].size;
|
|
649
|
-
rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP
|
|
650
|
-
DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU
|
|
651
|
-
|
|
652
|
-
// Setup Op context
|
|
653
|
-
struct htp_ops_context octx = { 0 };
|
|
654
|
-
octx.ctx = ctx;
|
|
655
|
-
octx.src0 = req->src0;
|
|
656
|
-
octx.src1 = req->src1;
|
|
657
|
-
octx.src2 = req->src2;
|
|
658
|
-
octx.dst = req->dst;
|
|
659
|
-
octx.flags = req->flags;
|
|
660
|
-
octx.op = req->op;
|
|
661
|
-
|
|
662
|
-
// Update data pointers
|
|
663
|
-
octx.src0.data = (uint32_t) bufs[0].ptr;
|
|
664
|
-
octx.src1.data = (uint32_t) bufs[1].ptr;
|
|
665
|
-
octx.src2.data = (uint32_t) bufs[2].ptr;
|
|
666
|
-
octx.dst.data = (uint32_t) bufs[3].ptr;
|
|
667
|
-
octx.n_threads = ctx->n_threads;
|
|
668
|
-
|
|
669
|
-
struct profile_data prof;
|
|
670
|
-
profile_start(&prof);
|
|
671
|
-
|
|
672
|
-
uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR;
|
|
673
|
-
if (vtcm_acquire(ctx) == AEE_SUCCESS) {
|
|
674
|
-
rsp_status = op_binary(&octx);
|
|
675
|
-
vtcm_release(ctx);
|
|
676
|
-
}
|
|
607
|
+
case HTP_OP_CUMSUM:
|
|
608
|
+
return op_cumsum(octx);
|
|
677
609
|
|
|
678
|
-
|
|
679
|
-
|
|
680
|
-
}
|
|
610
|
+
case HTP_OP_FILL:
|
|
611
|
+
return op_fill(octx);
|
|
681
612
|
|
|
682
|
-
|
|
683
|
-
|
|
684
|
-
|
|
685
|
-
// We had written to the output buffer, we'd also need to flush it
|
|
686
|
-
rsp_bufs[0].fd = bufs[1].fd;
|
|
687
|
-
rsp_bufs[0].ptr = bufs[1].ptr;
|
|
688
|
-
rsp_bufs[0].offset = bufs[1].offset;
|
|
689
|
-
rsp_bufs[0].size = bufs[1].size;
|
|
690
|
-
rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP
|
|
691
|
-
DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU
|
|
692
|
-
|
|
693
|
-
// Setup Op context
|
|
694
|
-
struct htp_ops_context octx = { 0 };
|
|
695
|
-
octx.ctx = ctx;
|
|
696
|
-
octx.src0 = req->src0;
|
|
697
|
-
octx.dst = req->dst;
|
|
698
|
-
octx.flags = req->flags;
|
|
699
|
-
octx.op = req->op;
|
|
700
|
-
|
|
701
|
-
memcpy(octx.op_params, req->op_params, sizeof(octx.op_params));
|
|
702
|
-
|
|
703
|
-
// Update data pointers
|
|
704
|
-
octx.src0.data = (uint32_t) bufs[0].ptr;
|
|
705
|
-
octx.dst.data = (uint32_t) bufs[1].ptr;
|
|
706
|
-
octx.n_threads = ctx->n_threads;
|
|
707
|
-
|
|
708
|
-
struct profile_data prof;
|
|
709
|
-
profile_start(&prof);
|
|
710
|
-
|
|
711
|
-
uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR;
|
|
712
|
-
if (vtcm_acquire(ctx) == AEE_SUCCESS) {
|
|
713
|
-
rsp_status = op_unary(&octx);
|
|
714
|
-
vtcm_release(ctx);
|
|
715
|
-
}
|
|
613
|
+
case HTP_OP_DIAG:
|
|
614
|
+
return op_diag(octx);
|
|
716
615
|
|
|
717
|
-
|
|
718
|
-
|
|
719
|
-
|
|
616
|
+
case HTP_OP_SOLVE_TRI:
|
|
617
|
+
return op_solve_tri(octx);
|
|
618
|
+
|
|
619
|
+
case HTP_OP_PAD:
|
|
620
|
+
return op_pad(octx);
|
|
621
|
+
|
|
622
|
+
case HTP_OP_CONCAT:
|
|
623
|
+
return op_concat(octx);
|
|
720
624
|
|
|
721
|
-
|
|
722
|
-
|
|
723
|
-
|
|
724
|
-
|
|
725
|
-
|
|
726
|
-
|
|
727
|
-
|
|
728
|
-
|
|
729
|
-
|
|
730
|
-
|
|
731
|
-
|
|
732
|
-
// Setup Op context
|
|
733
|
-
struct htp_ops_context octx = { 0 };
|
|
734
|
-
octx.ctx = ctx;
|
|
735
|
-
octx.src0 = req->src0;
|
|
736
|
-
octx.dst = req->dst;
|
|
737
|
-
octx.flags = req->flags;
|
|
738
|
-
octx.op = req->op;
|
|
739
|
-
|
|
740
|
-
memcpy(octx.op_params, req->op_params, sizeof(octx.op_params));
|
|
741
|
-
|
|
742
|
-
// Update data pointers
|
|
743
|
-
octx.src0.data = (uint32_t) bufs[0].ptr;
|
|
744
|
-
octx.dst.data = (uint32_t) bufs[1].ptr;
|
|
745
|
-
octx.n_threads = ctx->n_threads;
|
|
746
|
-
|
|
747
|
-
struct profile_data prof;
|
|
748
|
-
profile_start(&prof);
|
|
749
|
-
|
|
750
|
-
uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR;
|
|
751
|
-
if (vtcm_acquire(ctx) == AEE_SUCCESS) {
|
|
752
|
-
rsp_status = op_sum_rows(&octx);
|
|
753
|
-
vtcm_release(ctx);
|
|
625
|
+
case HTP_OP_GATED_DELTA_NET:
|
|
626
|
+
return op_gated_delta_net(octx);
|
|
627
|
+
|
|
628
|
+
case HTP_OP_TRI:
|
|
629
|
+
return op_tri(octx);
|
|
630
|
+
|
|
631
|
+
case HTP_OP_INVALID:
|
|
632
|
+
break;
|
|
633
|
+
|
|
634
|
+
// No default to catch missing cases
|
|
754
635
|
}
|
|
755
636
|
|
|
756
|
-
|
|
757
|
-
|
|
637
|
+
FARF(ERROR, "Unknown Op %u", octx->op);
|
|
638
|
+
return -1;
|
|
758
639
|
}
|
|
759
640
|
|
|
760
|
-
static
|
|
761
|
-
|
|
762
|
-
|
|
763
|
-
|
|
764
|
-
|
|
765
|
-
|
|
766
|
-
|
|
767
|
-
|
|
768
|
-
|
|
769
|
-
|
|
770
|
-
|
|
771
|
-
// Setup OP context
|
|
772
|
-
struct htp_ops_context octx = { 0 };
|
|
773
|
-
octx.ctx = ctx;
|
|
774
|
-
octx.src0 = req->src0;
|
|
775
|
-
octx.src1 = req->src1;
|
|
776
|
-
octx.dst = req->dst;
|
|
777
|
-
octx.flags = req->flags;
|
|
778
|
-
octx.op = req->op;
|
|
779
|
-
|
|
780
|
-
memcpy(octx.op_params, req->op_params, sizeof(octx.op_params));
|
|
781
|
-
|
|
782
|
-
// Update data pointers
|
|
783
|
-
octx.src0.data = (uint32_t) bufs[0].ptr;
|
|
784
|
-
octx.src1.data = (uint32_t) bufs[1].ptr;
|
|
785
|
-
octx.dst.data = (uint32_t) bufs[2].ptr;
|
|
786
|
-
octx.n_threads = ctx->n_threads;
|
|
787
|
-
|
|
788
|
-
struct profile_data prof;
|
|
789
|
-
profile_start(&prof);
|
|
790
|
-
|
|
791
|
-
uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR;
|
|
792
|
-
if (vtcm_acquire(ctx) == AEE_SUCCESS) {
|
|
793
|
-
rsp_status = op_ssm_conv(&octx);
|
|
794
|
-
vtcm_release(ctx);
|
|
641
|
+
static inline bool reuse_buf(struct htp_context *ctx, uint32_t *m_reuse, struct htp_buf_desc *b) {
|
|
642
|
+
b->base = NULL;
|
|
643
|
+
|
|
644
|
+
for (uint32_t i=0; i<HTP_MAX_MMAPS; i++) {
|
|
645
|
+
struct htp_mmap *m = ctx->mmap + i;
|
|
646
|
+
if (m->size && m->fd == b->fd) {
|
|
647
|
+
b->base = m->base;
|
|
648
|
+
*m_reuse |= (1 << i);
|
|
649
|
+
return true;
|
|
650
|
+
}
|
|
795
651
|
}
|
|
796
652
|
|
|
797
|
-
|
|
798
|
-
send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 1, &prof);
|
|
653
|
+
return false;
|
|
799
654
|
}
|
|
800
655
|
|
|
801
|
-
static void
|
|
802
|
-
|
|
803
|
-
|
|
804
|
-
|
|
805
|
-
|
|
806
|
-
|
|
807
|
-
|
|
808
|
-
|
|
809
|
-
|
|
810
|
-
|
|
811
|
-
|
|
812
|
-
rsp_bufs[0].offset = bufs[write_idx].offset;
|
|
813
|
-
rsp_bufs[0].size = bufs[write_idx].size;
|
|
814
|
-
rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP
|
|
815
|
-
DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU
|
|
816
|
-
|
|
817
|
-
// Setup Op context
|
|
818
|
-
struct htp_ops_context octx = { 0 };
|
|
819
|
-
octx.ctx = ctx;
|
|
820
|
-
octx.src0 = req->src0;
|
|
821
|
-
if (3 == n_bufs) {
|
|
822
|
-
octx.src1 = req->src1;
|
|
823
|
-
}
|
|
824
|
-
octx.dst = req->dst;
|
|
825
|
-
octx.flags = req->flags;
|
|
826
|
-
octx.op = req->op;
|
|
827
|
-
|
|
828
|
-
memcpy(octx.op_params, req->op_params, sizeof(octx.op_params));
|
|
829
|
-
|
|
830
|
-
// Update data pointers
|
|
831
|
-
octx.src0.data = (uint32_t) bufs[0].ptr;
|
|
832
|
-
if (3 == n_bufs) {
|
|
833
|
-
octx.src1.data = (uint32_t) bufs[1].ptr;
|
|
834
|
-
octx.dst.data = (uint32_t) bufs[2].ptr;
|
|
835
|
-
} else {
|
|
836
|
-
octx.dst.data = (uint32_t) bufs[1].ptr;
|
|
656
|
+
static inline void drop_mmap(struct htp_context *ctx, struct htp_mmap *m) {
|
|
657
|
+
if (m->size) {
|
|
658
|
+
FARF(HIGH, "unmap : fd %u base %p size %u", m->fd, (void*) m->base, (uint32_t) m->size);
|
|
659
|
+
#if __HVX_ARCH__ > 73
|
|
660
|
+
HAP_munmap2((void *) m->base, m->size);
|
|
661
|
+
#else
|
|
662
|
+
HAP_munmap((void *) m->base, m->size);
|
|
663
|
+
#endif
|
|
664
|
+
m->size = 0;
|
|
665
|
+
m->base = 0;
|
|
666
|
+
m->fd = -1;
|
|
837
667
|
}
|
|
838
|
-
|
|
839
|
-
|
|
840
|
-
struct profile_data prof;
|
|
841
|
-
profile_start(&prof);
|
|
668
|
+
}
|
|
842
669
|
|
|
843
|
-
|
|
844
|
-
if (
|
|
845
|
-
|
|
846
|
-
|
|
847
|
-
|
|
848
|
-
|
|
670
|
+
static inline void mmap_buf(struct htp_context *ctx, struct htp_buf_desc *b) {
|
|
671
|
+
if (b->base) return; // already mapped
|
|
672
|
+
|
|
673
|
+
// find unused mapping
|
|
674
|
+
for (uint32_t i=0; i < HTP_MAX_MMAPS; i++) {
|
|
675
|
+
struct htp_mmap *m = &ctx->mmap[i];
|
|
676
|
+
if (!m->size) {
|
|
677
|
+
#if __HVX_ARCH__ > 73
|
|
678
|
+
void *va = HAP_mmap2(NULL, b->size, HAP_PROT_READ | HAP_PROT_WRITE, 0, b->fd, 0);
|
|
679
|
+
#else
|
|
680
|
+
if (b->size > HTP_MMAP_MAX_VMEM) { // HAP_mmap has a size limit of 2GB
|
|
681
|
+
FARF(ERROR, "mmap failed : size %u exceeds 2GB limit for HAP_mmap", (uint32_t) b->size);
|
|
682
|
+
abort(); // can't do much else at this point
|
|
683
|
+
}
|
|
684
|
+
|
|
685
|
+
void *va = HAP_mmap(NULL, b->size, HAP_PROT_READ | HAP_PROT_WRITE, 0, b->fd, 0);
|
|
686
|
+
#endif
|
|
687
|
+
if (va == (void*)-1) {
|
|
688
|
+
FARF(ERROR, "mmap failed : va %p fd %u size %u", va, b->fd, (uint32_t) b->size);
|
|
689
|
+
abort(); // can't do much else at this point
|
|
690
|
+
}
|
|
691
|
+
|
|
692
|
+
m->base = b->base = (uint64_t) va;
|
|
693
|
+
m->fd = b->fd;
|
|
694
|
+
m->size = b->size;
|
|
695
|
+
|
|
696
|
+
FARF(HIGH, "mmap : fd %u base %p size %u", m->fd, (void*) m->base, (uint32_t) m->size);
|
|
697
|
+
return;
|
|
849
698
|
}
|
|
850
|
-
vtcm_release(ctx);
|
|
851
699
|
}
|
|
852
|
-
|
|
853
|
-
profile_stop(&prof);
|
|
854
|
-
send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 1, &prof);
|
|
855
700
|
}
|
|
856
701
|
|
|
857
|
-
static void
|
|
858
|
-
|
|
859
|
-
|
|
860
|
-
|
|
861
|
-
|
|
862
|
-
|
|
863
|
-
|
|
864
|
-
|
|
865
|
-
|
|
866
|
-
|
|
867
|
-
|
|
868
|
-
|
|
869
|
-
rsp_bufs[0].size = bufs[write_idx].size;
|
|
870
|
-
rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP
|
|
871
|
-
DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU
|
|
872
|
-
|
|
873
|
-
// Setup Op context
|
|
874
|
-
struct htp_ops_context octx = { 0 };
|
|
875
|
-
octx.ctx = ctx;
|
|
876
|
-
octx.src0 = req->src0;
|
|
877
|
-
octx.src1 = req->src1;
|
|
878
|
-
if (4 == n_bufs) {
|
|
879
|
-
octx.src2 = req->src2;
|
|
880
|
-
}
|
|
881
|
-
octx.dst = req->dst;
|
|
882
|
-
octx.flags = req->flags;
|
|
883
|
-
octx.op = req->op;
|
|
884
|
-
|
|
885
|
-
memcpy(octx.op_params, req->op_params, sizeof(octx.op_params));
|
|
886
|
-
|
|
887
|
-
// Update data pointers
|
|
888
|
-
octx.src0.data = (uint32_t) bufs[0].ptr;
|
|
889
|
-
octx.src1.data = (uint32_t) bufs[1].ptr;
|
|
890
|
-
if (4 == n_bufs) {
|
|
891
|
-
octx.src2.data = (uint32_t) bufs[2].ptr;
|
|
892
|
-
octx.dst.data = (uint32_t) bufs[3].ptr;
|
|
893
|
-
} else {
|
|
894
|
-
octx.dst.data = (uint32_t) bufs[2].ptr;
|
|
702
|
+
static void prep_op_bufs(struct htp_context *ctx, struct htp_buf_desc *bufs, uint32_t n_bufs) {
|
|
703
|
+
uint32_t m_reuse = 0; // mmap reuse mask (index from ctx->mmap array)
|
|
704
|
+
uint32_t b_reuse = 0; // buf reuse count
|
|
705
|
+
|
|
706
|
+
uint64_t m_vmem = 0; // mapped vmem
|
|
707
|
+
uint64_t e_vmem = 0; // extra vmem
|
|
708
|
+
|
|
709
|
+
// See what we can reuse
|
|
710
|
+
for (uint32_t i=0; i < n_bufs; i++) {
|
|
711
|
+
struct htp_buf_desc *b = bufs + i;
|
|
712
|
+
if (reuse_buf(ctx, &m_reuse, b)) { b_reuse++; } else { e_vmem += b->size; }
|
|
713
|
+
FARF(HIGH, "prep-buf #%u : pass0 fd %u base %p size %u flags 0x%x", i, b->fd, (void*) b->base, (uint32_t) b->size, b->flags);
|
|
895
714
|
}
|
|
896
|
-
octx.n_threads = ctx->n_threads;
|
|
897
715
|
|
|
898
|
-
|
|
899
|
-
profile_start(&prof);
|
|
716
|
+
if (b_reuse == n_bufs) return; // all bufs reuse existing mappings
|
|
900
717
|
|
|
901
|
-
|
|
902
|
-
|
|
903
|
-
|
|
904
|
-
|
|
718
|
+
// See how much vmem we have mmaped right now
|
|
719
|
+
for (uint32_t i=0; i<HTP_MAX_MMAPS; i++) { m_vmem += ctx->mmap[i].size; }
|
|
720
|
+
|
|
721
|
+
FARF(HIGH, "prep-bufs : pass1 mmap-vmem %zu extra-vmem %zu max-vmem %zu : n-bufs %u b-reuse %u",
|
|
722
|
+
(size_t) m_vmem, (size_t) e_vmem, (size_t) ctx->max_vmem, n_bufs, b_reuse);
|
|
723
|
+
|
|
724
|
+
if ((m_vmem + e_vmem) > ctx->max_vmem) {
|
|
725
|
+
// Drop unused mappings
|
|
726
|
+
for (uint32_t i=0; i < HTP_MAX_MMAPS; i++) {
|
|
727
|
+
bool used = m_reuse & (1<<i);
|
|
728
|
+
if (!used) { drop_mmap(ctx, ctx->mmap + i); }
|
|
729
|
+
}
|
|
905
730
|
}
|
|
906
731
|
|
|
907
|
-
|
|
908
|
-
|
|
732
|
+
// Create missing mappings
|
|
733
|
+
for (uint32_t i=0; i < n_bufs; i++) {
|
|
734
|
+
struct htp_buf_desc *b = bufs + i;
|
|
735
|
+
mmap_buf(ctx, b);
|
|
736
|
+
FARF(HIGH, "prep-buf #%u : pass1 fd %u base %p size %u flags 0x%x", i, b->fd, (void*) b->base, (uint32_t) b->size, b->flags);
|
|
737
|
+
}
|
|
909
738
|
}
|
|
910
739
|
|
|
911
|
-
static void
|
|
912
|
-
|
|
913
|
-
|
|
914
|
-
|
|
915
|
-
|
|
916
|
-
|
|
917
|
-
rsp_bufs[0].offset = bufs[2].offset;
|
|
918
|
-
rsp_bufs[0].size = bufs[2].size;
|
|
919
|
-
rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP
|
|
920
|
-
DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU
|
|
921
|
-
|
|
922
|
-
// Setup Op context
|
|
923
|
-
struct htp_ops_context octx = { 0 };
|
|
924
|
-
octx.ctx = ctx;
|
|
925
|
-
octx.src0 = req->src0;
|
|
926
|
-
octx.src1 = req->src1;
|
|
927
|
-
octx.dst = req->dst;
|
|
928
|
-
octx.flags = req->flags;
|
|
929
|
-
octx.op = req->op;
|
|
930
|
-
|
|
931
|
-
// Update data pointers
|
|
932
|
-
octx.src0.data = (uint32_t) bufs[0].ptr;
|
|
933
|
-
octx.src1.data = (uint32_t) bufs[1].ptr;
|
|
934
|
-
octx.dst.data = (uint32_t) bufs[2].ptr;
|
|
935
|
-
octx.n_threads = ctx->n_threads;
|
|
936
|
-
|
|
937
|
-
struct profile_data prof;
|
|
938
|
-
profile_start(&prof);
|
|
939
|
-
|
|
940
|
-
uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR;
|
|
941
|
-
if (vtcm_acquire(ctx) == AEE_SUCCESS) {
|
|
942
|
-
rsp_status = op_set_rows(&octx);
|
|
943
|
-
vtcm_release(ctx);
|
|
944
|
-
}
|
|
740
|
+
static void prep_tensor(struct htp_context *ctx, struct htp_buf_desc *bufs, uint32_t idx, struct htp_tensor *t) {
|
|
741
|
+
uint32_t offset = t->data;
|
|
742
|
+
uint32_t size = t->size;
|
|
743
|
+
uint32_t bi = t->bi;
|
|
744
|
+
|
|
745
|
+
t->data = bufs[bi].base + offset; // update data to the actual pointer
|
|
945
746
|
|
|
946
|
-
|
|
947
|
-
|
|
747
|
+
FARF(HIGH, "prep-tensor #%u: bi %u offset %u size %u data %p : %u:%u:%u:%u", idx, t->bi, offset, t->size, (void*) t->data,
|
|
748
|
+
t->ne[0], t->ne[1], t->ne[3], t->ne[3]);
|
|
948
749
|
}
|
|
949
750
|
|
|
950
|
-
static void
|
|
951
|
-
|
|
952
|
-
|
|
953
|
-
uint32_t n_bufs) {
|
|
954
|
-
// Setup Op context
|
|
955
|
-
struct htp_ops_context octx;
|
|
956
|
-
memset(&octx, 0, sizeof(octx));
|
|
957
|
-
|
|
958
|
-
octx.ctx = ctx;
|
|
959
|
-
octx.n_threads = ctx->n_threads;
|
|
960
|
-
|
|
961
|
-
octx.src0 = req->src0;
|
|
962
|
-
octx.src1 = req->src1;
|
|
963
|
-
octx.src2 = req->src2;
|
|
964
|
-
octx.src3 = req->src3;
|
|
965
|
-
octx.src4 = req->src4;
|
|
966
|
-
octx.dst = req->dst;
|
|
967
|
-
octx.flags = req->flags;
|
|
968
|
-
octx.op = req->op;
|
|
969
|
-
|
|
970
|
-
memcpy(octx.op_params, req->op_params, sizeof(octx.op_params));
|
|
971
|
-
|
|
972
|
-
// Update data pointers
|
|
973
|
-
octx.src0.data = (uint32_t) bufs[0].ptr;
|
|
974
|
-
octx.src1.data = (uint32_t) bufs[1].ptr;
|
|
975
|
-
octx.src2.data = (uint32_t) bufs[2].ptr;
|
|
976
|
-
|
|
977
|
-
int last_buf = 3;
|
|
978
|
-
|
|
979
|
-
if (octx.src3.ne[0]) {
|
|
980
|
-
octx.src3.data = (uint32_t) bufs[last_buf++].ptr; // mask is valid
|
|
751
|
+
static void prep_tensors(struct htp_context *ctx, struct htp_buf_desc *bufs, struct htp_tensor *tens, uint32_t n_tens) {
|
|
752
|
+
for (uint32_t i=0; i < n_tens; i++) {
|
|
753
|
+
prep_tensor(ctx, bufs, i, tens + i);
|
|
981
754
|
}
|
|
755
|
+
}
|
|
982
756
|
|
|
983
|
-
|
|
984
|
-
|
|
985
|
-
|
|
757
|
+
static void proc_op_req(struct htp_ops_context * octx, struct htp_tensor *tens, uint32_t idx, struct htp_op_desc * op) {
|
|
758
|
+
memcpy(octx->op_params, op->params, sizeof(octx->op_params));
|
|
759
|
+
octx->flags = op->flags;
|
|
760
|
+
octx->op = op->opcode;
|
|
986
761
|
|
|
987
|
-
octx
|
|
762
|
+
FARF(HIGH, "proc-op #%u: opcode %u flags 0x%x", idx, octx->op, octx->flags);
|
|
988
763
|
|
|
989
|
-
|
|
990
|
-
|
|
764
|
+
// Prep input tensors
|
|
765
|
+
for (uint32_t i=0; i<HTP_OP_MAX_INPUTS; i++) {
|
|
766
|
+
struct htp_tensor *src = op->src[i] == 0xffff ? NULL : tens + op->src[i];
|
|
991
767
|
|
|
992
|
-
|
|
993
|
-
|
|
994
|
-
|
|
995
|
-
|
|
768
|
+
octx->src[i] = src;
|
|
769
|
+
if (!src) continue;
|
|
770
|
+
|
|
771
|
+
if (!(src->flags & HTP_TENSOR_FLUSHED) && (src->flags & HTP_TENSOR_COMPUTE)) {
|
|
772
|
+
// flush compute buffers on input
|
|
773
|
+
hex_l2flush((void *) src->data, src->size);
|
|
774
|
+
}
|
|
775
|
+
|
|
776
|
+
FARF(HIGH, "prep-src #%u: data %p size %u : %u:%u:%u:%u", op->src[i], (void*) src->data, src->size,
|
|
777
|
+
src->ne[0], src->ne[1], src->ne[3], src->ne[3]);
|
|
996
778
|
}
|
|
997
779
|
|
|
998
|
-
|
|
780
|
+
// Prep output tensor
|
|
781
|
+
struct htp_tensor *dst = tens + op->dst;
|
|
782
|
+
|
|
783
|
+
octx->dst = dst;
|
|
784
|
+
|
|
785
|
+
FARF(HIGH, "prep-dst #%u: data %p size %u : %u:%u:%u:%u", op->dst, (void*) dst->data, dst->size,
|
|
786
|
+
dst->ne[0], dst->ne[1], dst->ne[3], dst->ne[3]);
|
|
787
|
+
|
|
788
|
+
(void) execute_op(octx);
|
|
999
789
|
|
|
1000
|
-
|
|
1001
|
-
|
|
1002
|
-
|
|
790
|
+
// flush buffers on output
|
|
791
|
+
hex_l2flush((void *) dst->data, dst->size);
|
|
792
|
+
dst->flags |= HTP_TENSOR_FLUSHED;
|
|
1003
793
|
|
|
1004
|
-
|
|
794
|
+
FARF(HIGH, "post-dst #%u: data %p size %u : %u:%u:%u:%u", op->dst, (void*) dst->data, dst->size,
|
|
795
|
+
dst->ne[0], dst->ne[1], dst->ne[3], dst->ne[3]);
|
|
1005
796
|
}
|
|
1006
797
|
|
|
798
|
+
#define DSPQUEUE_POLL_TIMEOUT_USEC 100
|
|
799
|
+
#define DSPQUEUE_POLL_COUNT 100
|
|
800
|
+
|
|
1007
801
|
static void htp_packet_callback(dspqueue_t queue, int error, void * context) {
|
|
1008
802
|
struct htp_context * ctx = (struct htp_context *) context;
|
|
1009
803
|
|
|
1010
|
-
|
|
1011
|
-
|
|
1012
|
-
|
|
1013
|
-
// keep the DSP busy as much as possible and avoid waiting for the CPU.
|
|
804
|
+
int err;
|
|
805
|
+
|
|
806
|
+
uint32_t poll_count = DSPQUEUE_POLL_COUNT;
|
|
1014
807
|
|
|
1015
|
-
|
|
1016
|
-
struct htp_general_req req;
|
|
1017
|
-
uint32_t req_size;
|
|
808
|
+
vtcm_acquire(ctx);
|
|
1018
809
|
|
|
1019
|
-
|
|
1020
|
-
|
|
1021
|
-
uint32_t
|
|
810
|
+
while (!ctx->vtcm_needs_release) {
|
|
811
|
+
struct htp_opbatch_req req;
|
|
812
|
+
uint32_t r_size = sizeof(req);
|
|
1022
813
|
|
|
1023
|
-
|
|
1024
|
-
|
|
1025
|
-
|
|
1026
|
-
&n_bufs, // Number of buffer references
|
|
1027
|
-
bufs, // Buffer references
|
|
1028
|
-
sizeof(req), // Max message length
|
|
1029
|
-
&req_size, // Message length
|
|
1030
|
-
(uint8_t *) &req); // Message
|
|
814
|
+
struct dspqueue_buffer dbuf;
|
|
815
|
+
uint32_t n_dbufs = 1;
|
|
816
|
+
uint32_t flags = 0;
|
|
1031
817
|
|
|
818
|
+
err = dspqueue_read_noblock(queue, &flags, n_dbufs, &n_dbufs, &dbuf, r_size, &r_size, (uint8_t *) &req);
|
|
1032
819
|
if (err == AEE_EWOULDBLOCK) {
|
|
1033
|
-
|
|
1034
|
-
|
|
820
|
+
if (--poll_count) {
|
|
821
|
+
qurt_sleep(DSPQUEUE_POLL_TIMEOUT_USEC);
|
|
822
|
+
continue;
|
|
823
|
+
}
|
|
824
|
+
break;
|
|
1035
825
|
}
|
|
1036
826
|
|
|
1037
827
|
if (err != 0) {
|
|
1038
828
|
FARF(ERROR, "dspqueue_read_noblock failed: 0x%08x", (unsigned) err);
|
|
1039
|
-
|
|
829
|
+
break;
|
|
1040
830
|
}
|
|
1041
831
|
|
|
1042
|
-
if (
|
|
1043
|
-
FARF(ERROR, "
|
|
832
|
+
if (r_size < sizeof(req) || n_dbufs != 1) {
|
|
833
|
+
FARF(ERROR, "invalid request : size %u n-dbufs %u", r_size, n_dbufs);
|
|
1044
834
|
continue;
|
|
1045
835
|
}
|
|
1046
836
|
|
|
1047
|
-
|
|
1048
|
-
|
|
1049
|
-
dspqueue_write_early_wakeup_noblock(ctx->queue, 10, 0);
|
|
1050
|
-
}
|
|
837
|
+
// Reset poll count for valid requests
|
|
838
|
+
poll_count = DSPQUEUE_POLL_COUNT;
|
|
1051
839
|
|
|
1052
|
-
|
|
1053
|
-
|
|
1054
|
-
|
|
1055
|
-
if (n_bufs != 3) {
|
|
1056
|
-
FARF(ERROR, "Bad matmul-req buffer list");
|
|
1057
|
-
continue;
|
|
1058
|
-
}
|
|
1059
|
-
proc_matmul_req(ctx, &req, bufs, n_bufs);
|
|
1060
|
-
break;
|
|
840
|
+
const uint32_t n_bufs = req.n_bufs;
|
|
841
|
+
const uint32_t n_tens = req.n_tensors;
|
|
842
|
+
const uint32_t n_ops = req.n_ops;
|
|
1061
843
|
|
|
1062
|
-
|
|
1063
|
-
|
|
1064
|
-
|
|
1065
|
-
|
|
1066
|
-
}
|
|
1067
|
-
proc_matmul_id_req(ctx, &req, bufs, n_bufs);
|
|
1068
|
-
break;
|
|
1069
|
-
|
|
1070
|
-
case HTP_OP_MUL:
|
|
1071
|
-
case HTP_OP_ADD:
|
|
1072
|
-
case HTP_OP_SUB:
|
|
1073
|
-
case HTP_OP_DIV:
|
|
1074
|
-
if (n_bufs != 3) {
|
|
1075
|
-
FARF(ERROR, "Bad binary-req buffer list");
|
|
1076
|
-
continue;
|
|
1077
|
-
}
|
|
1078
|
-
proc_binary_req(ctx, &req, bufs);
|
|
1079
|
-
break;
|
|
1080
|
-
|
|
1081
|
-
case HTP_OP_RMS_NORM:
|
|
1082
|
-
case HTP_OP_SCALE:
|
|
1083
|
-
if (n_bufs != 2) {
|
|
1084
|
-
FARF(ERROR, "Bad unary-req buffer list");
|
|
1085
|
-
continue;
|
|
1086
|
-
}
|
|
844
|
+
const uint32_t b_size = sizeof(struct htp_buf_desc) * n_bufs;
|
|
845
|
+
const uint32_t t_size = sizeof(struct htp_tensor) * n_tens;
|
|
846
|
+
const uint32_t o_size = sizeof(struct htp_op_desc) * n_ops;
|
|
847
|
+
const uint32_t p_size = sizeof(struct htp_prof_desc) * n_ops;
|
|
1087
848
|
|
|
1088
|
-
|
|
1089
|
-
|
|
849
|
+
if (dbuf.size < b_size + t_size + o_size + p_size) {
|
|
850
|
+
FARF(ERROR, "invalid opbatch memory block size %u", dbuf.size);
|
|
851
|
+
break;
|
|
852
|
+
}
|
|
1090
853
|
|
|
1091
|
-
|
|
1092
|
-
|
|
1093
|
-
if (n_bufs != 2) {
|
|
1094
|
-
FARF(ERROR, "Bad unary-req buffer list");
|
|
1095
|
-
continue;
|
|
1096
|
-
}
|
|
854
|
+
FARF(HIGH, "processing opbatch #%u: n-bufs %u n-tensors %u n-ops %u : m-size %u b-size %u t-size %u o-size %u", req.id,
|
|
855
|
+
n_bufs, n_tens, n_ops, dbuf.size, b_size, t_size, o_size);
|
|
1097
856
|
|
|
1098
|
-
|
|
1099
|
-
|
|
857
|
+
// Setup descriptor pointers
|
|
858
|
+
uint8_t * m_ptr = dbuf.ptr;
|
|
859
|
+
struct htp_buf_desc* bufs = (struct htp_buf_desc*) m_ptr; m_ptr += b_size;
|
|
860
|
+
struct htp_tensor* tens = (struct htp_tensor*) m_ptr; m_ptr += t_size;
|
|
861
|
+
struct htp_op_desc* ops = (struct htp_op_desc*) m_ptr; m_ptr += o_size;
|
|
862
|
+
struct htp_prof_desc* pds = (struct htp_prof_desc*) m_ptr;
|
|
1100
863
|
|
|
1101
|
-
|
|
1102
|
-
|
|
1103
|
-
FARF(ERROR, "Bad unary-req buffer list");
|
|
1104
|
-
continue;
|
|
1105
|
-
}
|
|
864
|
+
prep_op_bufs(ctx, bufs, n_bufs);
|
|
865
|
+
prep_tensors(ctx, bufs, tens, n_tens);
|
|
1106
866
|
|
|
1107
|
-
|
|
1108
|
-
|
|
867
|
+
struct htp_ops_context *octx = &ctx->octx;
|
|
868
|
+
memset(octx, 0, sizeof(*octx));
|
|
869
|
+
octx->n_threads = ctx->n_threads;
|
|
870
|
+
octx->ctx = ctx;
|
|
1109
871
|
|
|
1110
|
-
|
|
1111
|
-
|
|
1112
|
-
if (n_bufs != 2) {
|
|
1113
|
-
FARF(ERROR, "Bad act-req buffer list");
|
|
1114
|
-
continue;
|
|
1115
|
-
}
|
|
1116
|
-
proc_activations_req(ctx, &req, bufs, n_bufs);
|
|
1117
|
-
break;
|
|
1118
|
-
|
|
1119
|
-
case HTP_OP_GLU_SWIGLU:
|
|
1120
|
-
case HTP_OP_GLU_SWIGLU_OAI:
|
|
1121
|
-
case HTP_OP_SOFTMAX:
|
|
1122
|
-
case HTP_OP_GLU_GEGLU:
|
|
1123
|
-
if ((n_bufs != 2) && (n_bufs != 3)) {
|
|
1124
|
-
FARF(ERROR, "Bad act-req buffer list");
|
|
1125
|
-
continue;
|
|
1126
|
-
}
|
|
1127
|
-
proc_activations_req(ctx, &req, bufs, n_bufs);
|
|
1128
|
-
break;
|
|
872
|
+
for (uint32_t i=0; i < n_ops; i++) {
|
|
873
|
+
struct profile_data prof;
|
|
1129
874
|
|
|
1130
|
-
|
|
1131
|
-
|
|
1132
|
-
|
|
1133
|
-
|
|
1134
|
-
}
|
|
1135
|
-
proc_add_id_req(ctx, &req, bufs);
|
|
1136
|
-
break;
|
|
875
|
+
if (i == (n_ops-1)) {
|
|
876
|
+
// wake up the host before starting the last op
|
|
877
|
+
dspqueue_write_early_wakeup_noblock(queue, 0, 0);
|
|
878
|
+
}
|
|
1137
879
|
|
|
1138
|
-
|
|
1139
|
-
if ((n_bufs != 3) && (n_bufs != 4)) {
|
|
1140
|
-
FARF(ERROR, "Bad rope-req buffer list");
|
|
1141
|
-
continue;
|
|
1142
|
-
}
|
|
1143
|
-
proc_rope_req(ctx, &req, bufs, n_bufs);
|
|
1144
|
-
break;
|
|
880
|
+
profile_start(ctx->profiler, &prof);
|
|
1145
881
|
|
|
1146
|
-
|
|
1147
|
-
if (!(n_bufs >= 4 && n_bufs <= 6)) {
|
|
1148
|
-
FARF(ERROR, "Bad flash-attn-ext-req buffer list");
|
|
1149
|
-
continue;
|
|
1150
|
-
}
|
|
1151
|
-
proc_flash_attn_ext_req(ctx, &req, bufs, n_bufs);
|
|
1152
|
-
break;
|
|
882
|
+
proc_op_req(octx, tens, i, &ops[i]);
|
|
1153
883
|
|
|
1154
|
-
|
|
1155
|
-
if (n_bufs != 3) {
|
|
1156
|
-
FARF(ERROR, "Bad set-rows-req buffer list");
|
|
1157
|
-
continue;
|
|
1158
|
-
}
|
|
1159
|
-
proc_set_rows_req(ctx, &req, bufs);
|
|
1160
|
-
break;
|
|
1161
|
-
|
|
1162
|
-
case HTP_OP_GET_ROWS:
|
|
1163
|
-
if (n_bufs != 3) {
|
|
1164
|
-
FARF(ERROR, "Bad get-rows-req buffer list");
|
|
1165
|
-
continue;
|
|
1166
|
-
}
|
|
1167
|
-
proc_get_rows_req(ctx, &req, bufs);
|
|
1168
|
-
break;
|
|
884
|
+
profile_stop(ctx->profiler, &prof);
|
|
1169
885
|
|
|
1170
|
-
|
|
1171
|
-
|
|
1172
|
-
|
|
1173
|
-
|
|
886
|
+
if (ctx->profiler) {
|
|
887
|
+
pds[i].opcode = ops[i].opcode;
|
|
888
|
+
pds[i].usecs = prof.usecs;
|
|
889
|
+
pds[i].cycles = prof.cycles;
|
|
890
|
+
for (int j = 0; j < HEX_NUM_PMU_COUNTERS; j++) {
|
|
891
|
+
pds[i].pmu[j] = prof.pmu_counters[j];
|
|
1174
892
|
}
|
|
1175
|
-
|
|
1176
|
-
|
|
893
|
+
}
|
|
894
|
+
}
|
|
1177
895
|
|
|
1178
|
-
|
|
1179
|
-
|
|
1180
|
-
|
|
1181
|
-
|
|
1182
|
-
|
|
1183
|
-
|
|
1184
|
-
break;
|
|
896
|
+
struct htp_opbatch_rsp rsp;
|
|
897
|
+
rsp.id = req.id;
|
|
898
|
+
rsp.status = HTP_STATUS_OK;
|
|
899
|
+
rsp.n_bufs = n_bufs;
|
|
900
|
+
rsp.n_tensors = n_tens;
|
|
901
|
+
rsp.n_ops = n_ops;
|
|
1185
902
|
|
|
1186
|
-
|
|
1187
|
-
if (n_bufs != 3) {
|
|
1188
|
-
FARF(ERROR, "Bad ssm-conv-req buffer list");
|
|
1189
|
-
continue;
|
|
1190
|
-
}
|
|
1191
|
-
proc_ssm_conv_req(ctx, &req, bufs);
|
|
1192
|
-
break;
|
|
903
|
+
dbuf.flags = DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT;
|
|
1193
904
|
|
|
1194
|
-
|
|
1195
|
-
|
|
1196
|
-
|
|
905
|
+
err = dspqueue_write(queue, 0, 1, &dbuf, sizeof(rsp), (const uint8_t *) &rsp, DSPQUEUE_TIMEOUT_NONE);
|
|
906
|
+
if (err != 0) {
|
|
907
|
+
FARF(ERROR, "dspqueue_write failed: 0x%08x", (unsigned) err);
|
|
908
|
+
break;
|
|
1197
909
|
}
|
|
1198
910
|
}
|
|
911
|
+
|
|
912
|
+
vtcm_release(ctx);
|
|
1199
913
|
}
|