whispercpp 1.3.6 → 1.3.7
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.document +3 -0
- data/.rdoc_options +2 -0
- data/README.md +38 -5
- data/Rakefile +18 -3
- data/ext/dependencies.rb +10 -4
- data/ext/dependencies_for_windows.rb +17 -0
- data/ext/extconf.rb +20 -8
- data/ext/options.rb +54 -14
- data/ext/options_for_windows.rb +51 -0
- data/ext/ruby_whisper.c +36 -42
- data/ext/ruby_whisper.h +135 -0
- data/ext/ruby_whisper_context.c +107 -28
- data/ext/ruby_whisper_log_queue.c +180 -0
- data/ext/ruby_whisper_log_settable.h +47 -0
- data/ext/ruby_whisper_parakeet.c +49 -0
- data/ext/ruby_whisper_parakeet_context.c +304 -0
- data/ext/ruby_whisper_parakeet_context_params.c +117 -0
- data/ext/ruby_whisper_parakeet_model.c +84 -0
- data/ext/ruby_whisper_parakeet_params.c +548 -0
- data/ext/ruby_whisper_parakeet_segment.c +157 -0
- data/ext/ruby_whisper_parakeet_token.c +188 -0
- data/ext/ruby_whisper_parakeet_transcribe.cpp +58 -0
- data/ext/ruby_whisper_params.c +256 -65
- data/ext/ruby_whisper_segment.c +6 -6
- data/ext/ruby_whisper_transcribe.cpp +42 -15
- data/ext/sources/CMakeLists.txt +41 -3
- data/ext/sources/CMakePresets.json +95 -0
- data/ext/sources/cmake/parakeet-config.cmake.in +30 -0
- data/ext/sources/cmake/parakeet.pc.in +10 -0
- data/ext/sources/cmake/whisper.pc.in +1 -1
- data/ext/sources/examples/CMakeLists.txt +4 -2
- data/ext/sources/examples/bench/bench.cpp +1 -1
- data/ext/sources/examples/cli/cli.cpp +43 -9
- data/ext/sources/examples/common-ggml.cpp +2 -0
- data/ext/sources/examples/common-whisper.cpp +139 -67
- data/ext/sources/examples/common-whisper.h +11 -0
- data/ext/sources/examples/ffmpeg-transcode.cpp +211 -341
- data/ext/sources/examples/parakeet-cli/CMakeLists.txt +8 -0
- data/ext/sources/examples/parakeet-cli/parakeet-cli.cpp +243 -0
- data/ext/sources/examples/parakeet-quantize/CMakeLists.txt +7 -0
- data/ext/sources/examples/parakeet-quantize/parakeet-quantize.cpp +230 -0
- data/ext/sources/examples/server/server.cpp +199 -163
- data/ext/sources/ggml/CMakeLists.txt +21 -13
- data/ext/sources/ggml/cmake/FindNCCL.cmake +36 -0
- data/ext/sources/ggml/cmake/ggml-config.cmake.in +12 -2
- data/ext/sources/ggml/include/ggml-alloc.h +1 -0
- data/ext/sources/ggml/include/ggml-backend.h +72 -10
- data/ext/sources/ggml/include/ggml-cuda.h +3 -0
- data/ext/sources/ggml/include/ggml-rpc.h +3 -3
- data/ext/sources/ggml/include/ggml.h +101 -9
- data/ext/sources/ggml/include/gguf.h +10 -2
- data/ext/sources/ggml/src/CMakeLists.txt +22 -5
- data/ext/sources/ggml/src/ggml-alloc.c +5 -1
- data/ext/sources/ggml/src/ggml-backend-impl.h +22 -2
- data/ext/sources/ggml/src/ggml-backend-meta.cpp +2263 -0
- data/ext/sources/ggml/src/ggml-backend-reg.cpp +12 -0
- data/ext/sources/ggml/src/ggml-backend.cpp +110 -9
- data/ext/sources/ggml/src/ggml-blas/ggml-blas.cpp +4 -0
- data/ext/sources/ggml/src/ggml-cann/aclnn_ops.cpp +672 -257
- data/ext/sources/ggml/src/ggml-cann/aclnn_ops.h +71 -0
- data/ext/sources/ggml/src/ggml-cann/common.h +20 -10
- data/ext/sources/ggml/src/ggml-cann/ggml-cann.cpp +211 -30
- data/ext/sources/ggml/src/ggml-common.h +11 -0
- data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +58 -29
- data/ext/sources/ggml/src/ggml-cpu/amx/amx.cpp +2 -0
- data/ext/sources/ggml/src/ggml-cpu/amx/mmq.cpp +16 -16
- data/ext/sources/ggml/src/ggml-cpu/arch/arm/quants.c +116 -7
- data/ext/sources/ggml/src/ggml-cpu/arch/arm/repack.cpp +65 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/loongarch/quants.c +151 -1
- data/ext/sources/ggml/src/ggml-cpu/arch/powerpc/quants.c +0 -1
- data/ext/sources/ggml/src/ggml-cpu/arch/riscv/quants.c +4279 -1292
- data/ext/sources/ggml/src/ggml-cpu/arch/riscv/repack.cpp +5 -35
- data/ext/sources/ggml/src/ggml-cpu/arch/s390/quants.c +0 -1
- data/ext/sources/ggml/src/ggml-cpu/arch/wasm/quants.c +72 -1
- data/ext/sources/ggml/src/ggml-cpu/arch/x86/quants.c +177 -27
- data/ext/sources/ggml/src/ggml-cpu/arch/x86/repack.cpp +1 -1
- data/ext/sources/ggml/src/ggml-cpu/arch-fallback.h +5 -0
- data/ext/sources/ggml/src/ggml-cpu/cmake/FindSMTIME.cmake +32 -0
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-impl.h +10 -0
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +95 -5
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.cpp +2 -0
- data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +146 -134
- data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +88 -70
- data/ext/sources/ggml/src/ggml-cpu/ops.cpp +372 -73
- data/ext/sources/ggml/src/ggml-cpu/ops.h +3 -0
- data/ext/sources/ggml/src/ggml-cpu/quants.c +55 -0
- data/ext/sources/ggml/src/ggml-cpu/quants.h +3 -0
- data/ext/sources/ggml/src/ggml-cpu/repack.cpp +3 -0
- data/ext/sources/ggml/src/ggml-cpu/simd-gemm.h +90 -0
- data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +3 -16
- data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.cpp +1402 -687
- data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.h +8 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp +597 -2766
- data/ext/sources/ggml/src/ggml-cpu/spacemit/ime2_kernels.cpp +5768 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/ime_env.cpp +320 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/ime_env.h +55 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/ime_kernels.h +182 -19
- data/ext/sources/ggml/src/ggml-cpu/spacemit/repack.cpp +1795 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/repack.h +14 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/rvv_kernels.cpp +3178 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/rvv_kernels.h +95 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/spine_barrier.h +34 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/spine_mem_pool.cpp +760 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/spine_mem_pool.h +32 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/spine_tcm.h +409 -0
- data/ext/sources/ggml/src/ggml-cpu/vec.cpp +37 -53
- data/ext/sources/ggml/src/ggml-cpu/vec.h +225 -240
- data/ext/sources/ggml/src/ggml-cuda/CMakeLists.txt +17 -7
- data/ext/sources/ggml/src/ggml-cuda/allreduce.cu +971 -0
- data/ext/sources/ggml/src/ggml-cuda/allreduce.cuh +29 -0
- data/ext/sources/ggml/src/ggml-cuda/argsort.cu +62 -26
- data/ext/sources/ggml/src/ggml-cuda/binbcast.cu +44 -18
- data/ext/sources/ggml/src/ggml-cuda/binbcast.cuh +1 -0
- data/ext/sources/ggml/src/ggml-cuda/common.cuh +242 -28
- data/ext/sources/ggml/src/ggml-cuda/concat.cu +120 -114
- data/ext/sources/ggml/src/ggml-cuda/conv2d-transpose.cu +45 -21
- data/ext/sources/ggml/src/ggml-cuda/conv2d-transpose.cuh +1 -0
- data/ext/sources/ggml/src/ggml-cuda/convert.cu +53 -0
- data/ext/sources/ggml/src/ggml-cuda/convert.cuh +10 -0
- data/ext/sources/ggml/src/ggml-cuda/cpy.cu +14 -6
- data/ext/sources/ggml/src/ggml-cuda/dequantize.cuh +22 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +278 -44
- data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +331 -130
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cu +12 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cuh +126 -27
- data/ext/sources/ggml/src/ggml-cuda/fattn-vec.cuh +40 -15
- data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +18 -9
- data/ext/sources/ggml/src/ggml-cuda/fattn.cu +152 -49
- data/ext/sources/ggml/src/ggml-cuda/fattn.cuh +2 -0
- data/ext/sources/ggml/src/ggml-cuda/fwht.cu +101 -0
- data/ext/sources/ggml/src/ggml-cuda/fwht.cuh +4 -0
- data/ext/sources/ggml/src/ggml-cuda/gated_delta_net.cu +84 -35
- data/ext/sources/ggml/src/ggml-cuda/getrows.cu +34 -12
- data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +1069 -609
- data/ext/sources/ggml/src/ggml-cuda/im2col.cu +32 -29
- data/ext/sources/ggml/src/ggml-cuda/mean.cu +4 -2
- data/ext/sources/ggml/src/ggml-cuda/mma.cuh +242 -195
- data/ext/sources/ggml/src/ggml-cuda/mmf.cuh +3 -3
- data/ext/sources/ggml/src/ggml-cuda/mmq.cu +18 -12
- data/ext/sources/ggml/src/ggml-cuda/mmq.cuh +502 -423
- data/ext/sources/ggml/src/ggml-cuda/mmvf.cu +19 -12
- data/ext/sources/ggml/src/ggml-cuda/mmvq.cu +485 -57
- data/ext/sources/ggml/src/ggml-cuda/mmvq.cuh +6 -1
- data/ext/sources/ggml/src/ggml-cuda/norm.cu +36 -10
- data/ext/sources/ggml/src/ggml-cuda/out-prod.cu +23 -7
- data/ext/sources/ggml/src/ggml-cuda/quantize.cu +133 -26
- data/ext/sources/ggml/src/ggml-cuda/quantize.cuh +1 -1
- data/ext/sources/ggml/src/ggml-cuda/reduce_rows.cuh +5 -1
- data/ext/sources/ggml/src/ggml-cuda/rope.cu +11 -4
- data/ext/sources/ggml/src/ggml-cuda/scale.cu +4 -1
- data/ext/sources/ggml/src/ggml-cuda/set-rows.cu +14 -6
- data/ext/sources/ggml/src/ggml-cuda/snake.cu +72 -0
- data/ext/sources/ggml/src/ggml-cuda/snake.cuh +8 -0
- data/ext/sources/ggml/src/ggml-cuda/softcap.cu +4 -1
- data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cu +45 -13
- data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cuh +1 -1
- data/ext/sources/ggml/src/ggml-cuda/ssm-scan.cu +40 -18
- data/ext/sources/ggml/src/ggml-cuda/sumrows.cu +8 -4
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_16.cu +1 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_32.cu +1 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_8.cu +2 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_4.cu +1 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_16.cu +1 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_32.cu +1 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_4.cu +1 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_8.cu +2 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_16.cu +1 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_4.cu +1 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_8.cu +2 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_4.cu +1 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_8.cu +2 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq192-dv128.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq320-dv256.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq512-dv512.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-bf16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-f16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q4_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q4_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q5_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q5_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q8_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-bf16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-bf16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-bf16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-bf16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-bf16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-bf16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-nvfp4.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q1_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/top-k.cu +5 -4
- data/ext/sources/ggml/src/ggml-cuda/topk-moe.cu +26 -23
- data/ext/sources/ggml/src/ggml-cuda/unary.cu +31 -2
- data/ext/sources/ggml/src/ggml-cuda/unary.cuh +2 -0
- data/ext/sources/ggml/src/ggml-cuda/vecdotq.cuh +80 -0
- data/ext/sources/ggml/src/ggml-cuda/vendors/cuda.h +7 -2
- data/ext/sources/ggml/src/ggml-cuda/vendors/hip.h +22 -4
- data/ext/sources/ggml/src/ggml-cuda/vendors/musa.h +3 -0
- data/ext/sources/ggml/src/ggml-hexagon/CMakeLists.txt +2 -1
- data/ext/sources/ggml/src/ggml-hexagon/ggml-hexagon.cpp +1428 -743
- data/ext/sources/ggml/src/ggml-hexagon/htp/CMakeLists.txt +45 -7
- data/ext/sources/ggml/src/ggml-hexagon/htp/act-ops.c +53 -84
- data/ext/sources/ggml/src/ggml-hexagon/htp/argsort-ops.c +25 -12
- data/ext/sources/ggml/src/ggml-hexagon/htp/binary-ops.c +165 -184
- data/ext/sources/ggml/src/ggml-hexagon/htp/cmake-toolchain.cmake +5 -5
- data/ext/sources/ggml/src/ggml-hexagon/htp/concat-ops.c +277 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/cpy-ops.c +170 -127
- data/ext/sources/ggml/src/ggml-hexagon/htp/cumsum-ops.c +270 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/diag-ops.c +216 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/fill-ops.c +123 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/flash-attn-ops.c +125 -97
- data/ext/sources/ggml/src/ggml-hexagon/htp/gated-delta-net-ops.c +1148 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/get-rows-ops.c +148 -42
- data/ext/sources/ggml/src/ggml-hexagon/htp/hex-dma.c +2 -2
- data/ext/sources/ggml/src/ggml-hexagon/htp/hex-dma.h +252 -62
- data/ext/sources/ggml/src/ggml-hexagon/htp/hex-dump.h +9 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hex-utils.h +87 -1
- data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-flash-attn-ops.c +1878 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c +2066 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-ops.c +6 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-ops.h +88 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-profile.h +34 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-queue.c +158 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-queue.h +134 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-utils.h +200 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ctx.h +96 -13
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ops.h +182 -57
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp_iface.idl +9 -3
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-base.h +71 -3
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-copy.h +27 -10
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-div.h +63 -23
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-exp.h +9 -8
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-flash-attn.h +47 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-log.h +65 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-pow.h +42 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-repl.h +74 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sigmoid.h +1 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sin-cos.h +90 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-utils.h +5 -8
- data/ext/sources/ggml/src/ggml-hexagon/htp/main.c +529 -815
- data/ext/sources/ggml/src/ggml-hexagon/htp/matmul-ops.c +2522 -234
- data/ext/sources/ggml/src/ggml-hexagon/htp/pad-ops.c +547 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/repeat-ops.c +148 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/rope-ops.c +291 -95
- data/ext/sources/ggml/src/ggml-hexagon/htp/set-rows-ops.c +59 -37
- data/ext/sources/ggml/src/ggml-hexagon/htp/softmax-ops.c +121 -133
- data/ext/sources/ggml/src/ggml-hexagon/htp/solve-tri-ops.c +267 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/ssm-conv.c +244 -151
- data/ext/sources/ggml/src/ggml-hexagon/htp/sum-rows-ops.c +6 -6
- data/ext/sources/ggml/src/ggml-hexagon/htp/unary-ops.c +719 -45
- data/ext/sources/ggml/src/ggml-hexagon/htp/vtcm-utils.h +16 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp-opnode.h +272 -0
- data/ext/sources/ggml/src/ggml-hexagon/libggml-htp.inf +3 -1
- data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +22 -9
- data/ext/sources/ggml/src/ggml-impl.h +6 -1
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.cpp +138 -13
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.h +32 -1
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.m +164 -28
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +80 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.cpp +190 -19
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.h +2 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal.cpp +39 -26
- data/ext/sources/ggml/src/ggml-metal/ggml-metal.metal +823 -322
- data/ext/sources/ggml/src/ggml-musa/CMakeLists.txt +5 -6
- data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +54 -5
- data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +12248 -5907
- data/ext/sources/ggml/src/ggml-opencl/kernels/concat.cl +67 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/cpy.cl +59 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/cvt.cl +1819 -112
- data/ext/sources/ggml/src/ggml-opencl/kernels/gated_delta_net.cl +249 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_mxfp4_f32_ns.cl +306 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q4_0_f32_ns.cl +256 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q4_1_f32_ns.cl +258 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q4_k_f32_ns.cl +283 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q5_0_f32_ns.cl +260 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q5_1_f32_ns.cl +262 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q5_k_f32_ns.cl +288 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q6_k_f32_ns.cl +267 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_iq4_nl_f32.cl +150 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/{mul_mat_Ab_Bi_8x4.cl → gemm_noshuffle_q4_0_f32.cl} +1 -1
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q4_k_f32.cl +172 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q5_0_f32.cl +131 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q5_1_f32.cl +134 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q5_k_f32.cl +176 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q6_k_f32.cl +140 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/{mul_mm_q8_0_f32_8x4.cl → gemm_noshuffle_q8_0_f32.cl} +1 -1
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_xmem_f16_f32_os8.cl +233 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_mxfp4_f32_ns.cl +165 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q4_0_f32_ns.cl +120 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q4_1_f32_ns.cl +123 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q4_k_f32_ns.cl +155 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q5_0_f32_ns.cl +123 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q5_1_f32_ns.cl +125 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q5_k_f32_ns.cl +160 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q6_k_f32_ns.cl +141 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_iq4_nl_f32.cl +302 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/{gemv_noshuffle_general.cl → gemv_noshuffle_q4_0_f32.cl} +5 -5
- data/ext/sources/ggml/src/ggml-opencl/kernels/{gemv_noshuffle.cl → gemv_noshuffle_q4_0_f32_spec.cl} +5 -5
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q4_k_f32.cl +318 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q5_0_f32.cl +291 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q5_1_f32.cl +294 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q5_k_f32.cl +326 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q6_k_f32.cl +293 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/get_rows.cl +15 -9
- data/ext/sources/ggml/src/ggml-opencl/kernels/moe_reorder_b.cl +30 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/moe_sort_by_expert.cl +82 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_iq4_nl_f32_l4_lm.cl +171 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q4_k_f32_l4_lm.cl +179 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q5_0_f32_l4_lm.cl +173 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q5_1_f32_l4_lm.cl +175 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q5_k_f32_l4_lm.cl +192 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_iq4_nl_f32.cl +164 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_iq4_nl_f32_flat.cl +202 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_k_f32_flat.cl +196 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_0_f32.cl +241 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_0_f32_flat.cl +243 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_1_f32.cl +243 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_1_f32_flat.cl +247 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_k_f32.cl +187 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_k_f32_flat.cl +203 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32_flat.cl +48 -64
- data/ext/sources/ggml/src/ggml-openvino/ggml-decoder.cpp +15 -5
- data/ext/sources/ggml/src/ggml-openvino/ggml-openvino-extra.cpp +18 -11
- data/ext/sources/ggml/src/ggml-openvino/ggml-openvino.cpp +35 -13
- data/ext/sources/ggml/src/ggml-openvino/ggml-quants.cpp +264 -192
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/rope.cpp +33 -7
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/unary_gelu.cpp +25 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op_table.cpp +1 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op_table.h +1 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/rt_info/weightless_caching_attributes.hpp +41 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/translate_session.cpp +27 -3
- data/ext/sources/ggml/src/ggml-openvino/openvino/utils.cpp +67 -36
- data/ext/sources/ggml/src/ggml-openvino/openvino/utils.h +1 -0
- data/ext/sources/ggml/src/ggml-openvino/utils.cpp +101 -44
- data/ext/sources/ggml/src/ggml-openvino/utils.h +23 -3
- data/ext/sources/ggml/src/ggml-opt.cpp +1 -0
- data/ext/sources/ggml/src/ggml-quants.c +289 -114
- data/ext/sources/ggml/src/ggml-quants.h +3 -0
- data/ext/sources/ggml/src/ggml-rpc/CMakeLists.txt +24 -0
- data/ext/sources/ggml/src/ggml-rpc/ggml-rpc.cpp +167 -311
- data/ext/sources/ggml/src/ggml-rpc/transport.cpp +683 -0
- data/ext/sources/ggml/src/ggml-rpc/transport.h +34 -0
- data/ext/sources/ggml/src/ggml-sycl/CMakeLists.txt +50 -4
- data/ext/sources/ggml/src/ggml-sycl/add-id.cpp +1 -1
- data/ext/sources/ggml/src/ggml-sycl/backend.hpp +3 -1
- data/ext/sources/ggml/src/ggml-sycl/common.cpp +74 -2
- data/ext/sources/ggml/src/ggml-sycl/common.hpp +41 -1
- data/ext/sources/ggml/src/ggml-sycl/convert.cpp +115 -13
- data/ext/sources/ggml/src/ggml-sycl/convert.hpp +9 -0
- data/ext/sources/ggml/src/ggml-sycl/cumsum.cpp +148 -0
- data/ext/sources/ggml/src/ggml-sycl/cumsum.hpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/dequantize.hpp +663 -0
- data/ext/sources/ggml/src/ggml-sycl/diag.cpp +67 -0
- data/ext/sources/ggml/src/ggml-sycl/diag.hpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/dmmv.cpp +586 -6
- data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +1 -90
- data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +0 -2
- data/ext/sources/ggml/src/ggml-sycl/fattn-buffers.cpp +56 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn-buffers.hpp +63 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn-common.hpp +7 -5
- data/ext/sources/ggml/src/ggml-sycl/fattn-tile.cpp +4 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn-tile.hpp +76 -168
- data/ext/sources/ggml/src/ggml-sycl/fattn-vec.hpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn.cpp +3 -1
- data/ext/sources/ggml/src/ggml-sycl/fill.cpp +55 -0
- data/ext/sources/ggml/src/ggml-sycl/fill.hpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/gated_delta_net.cpp +69 -31
- data/ext/sources/ggml/src/ggml-sycl/gated_delta_net.hpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/gemm.hpp +3 -0
- data/ext/sources/ggml/src/ggml-sycl/getrows.cpp +79 -3
- data/ext/sources/ggml/src/ggml-sycl/ggml-sycl.cpp +823 -190
- data/ext/sources/ggml/src/ggml-sycl/im2col.cpp +353 -89
- data/ext/sources/ggml/src/ggml-sycl/im2col.hpp +5 -3
- data/ext/sources/ggml/src/ggml-sycl/mmvq.cpp +1344 -26
- data/ext/sources/ggml/src/ggml-sycl/mmvq.hpp +16 -0
- data/ext/sources/ggml/src/ggml-sycl/pad.cpp +27 -27
- data/ext/sources/ggml/src/ggml-sycl/quants.hpp +71 -0
- data/ext/sources/ggml/src/ggml-sycl/set_rows.cpp +7 -1
- data/ext/sources/ggml/src/ggml-sycl/solve_tri.cpp +172 -0
- data/ext/sources/ggml/src/ggml-sycl/solve_tri.hpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/ssm_conv.cpp +6 -1
- data/ext/sources/ggml/src/ggml-sycl/ssm_scan.cpp +156 -0
- data/ext/sources/ggml/src/ggml-sycl/ssm_scan.hpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/sycl_hw.cpp +62 -10
- data/ext/sources/ggml/src/ggml-sycl/sycl_hw.hpp +18 -6
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq512-dv512.cpp +6 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-f16.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_0.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_1.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_0.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_1.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q8_0.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-f16.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_0.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_1.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_0.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_1.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q8_0.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-f16.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_0.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_1.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_0.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_1.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q8_0.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-f16.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_0.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_1.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_0.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_1.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q8_0.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-f16.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_0.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_1.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_0.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_1.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q8_0.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-f16.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_0.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_1.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_0.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_1.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q8_0.cpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/type.hpp +112 -0
- data/ext/sources/ggml/src/ggml-sycl/upscale.cpp +410 -0
- data/ext/sources/ggml/src/ggml-sycl/upscale.hpp +9 -0
- data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +215 -53
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-buffer.cpp +4 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-device.cpp +2 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend.cpp +2 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-shm.cpp +1 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu.cpp +1 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu.h +0 -2
- data/ext/sources/ggml/src/ggml-vulkan/CMakeLists.txt +11 -0
- data/ext/sources/ggml/src/ggml-vulkan/ggml-vulkan.cpp +2060 -535
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/contig_copy.comp +6 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +146 -13
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy.comp +3 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +25 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.glsl +88 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.glsl +643 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_nvfp4.comp +32 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q1_0.comp +29 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/diag.comp +0 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dot_product_funcs.glsl +27 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/exp.comp +0 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/feature-tests/coopmat2_decode_vector.comp +7 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +197 -48
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.glsl +60 -59
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +115 -113
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +122 -31
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_dequant.glsl +131 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_mmq_funcs.glsl +203 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/fwht.comp +115 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gated_delta_net.comp +125 -64
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/generic_binary_head.glsl +0 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.glsl +10 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_main.glsl +16 -6
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +76 -54
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col_3d.comp +0 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/log.comp +0 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp +122 -27
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iface.glsl +6 -6
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp +1 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq_funcs.glsl +88 -55
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +11 -17
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +43 -10
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.glsl +159 -125
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.glsl +8 -8
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_shmem_types.glsl +24 -9
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp +0 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl +5 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.glsl +0 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl +3 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/snake.comp +49 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/ssm_conv.comp +11 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/tri.comp +0 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/types.glsl +79 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +171 -147
- data/ext/sources/ggml/src/ggml-webgpu/CMakeLists.txt +5 -2
- data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp +2202 -283
- data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu.cpp +2610 -1403
- data/ext/sources/ggml/src/ggml-webgpu/pre_wgsl.hpp +37 -7
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/add_id.wgsl +64 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary.wgsl +8 -7
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +76 -95
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/concat.wgsl +19 -1
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/conv2d.wgsl +165 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{cpy.tmpl.wgsl → cpy.wgsl} +25 -50
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn.wgsl +107 -184
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_quant_staging.tmpl +124 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_tile.wgsl +397 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_vec_blk.wgsl +101 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_vec_reduce.wgsl +84 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_vec_split.wgsl +619 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/gated_delta_net.wgsl +149 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/get_rows.wgsl +183 -78
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/glu.wgsl +155 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/im2col.wgsl +101 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl +655 -495
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_id.wgsl +195 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_id_gather.wgsl +52 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_id_vec.wgsl +154 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.wgsl +8 -6
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_subgroup_matrix.wgsl +5 -1
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.wgsl +80 -409
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec_acc.tmpl +1432 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec_q_acc.tmpl +303 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/quant_inner_loops.tmpl +21 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/quantize_q8.wgsl +173 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm_mul.wgsl +152 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{rope.tmpl.wgsl → rope.wgsl} +71 -142
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/row_norm.wgsl +153 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/scale.wgsl +6 -4
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set.wgsl +109 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +2 -3
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows_quant.wgsl +224 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{soft_max.tmpl.wgsl → soft_max.wgsl} +106 -206
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/solve_tri.wgsl +121 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/ssm_conv.wgsl +65 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/ssm_scan.wgsl +193 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/unary.wgsl +68 -48
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/upscale.wgsl +240 -0
- data/ext/sources/ggml/src/ggml-zdnn/ggml-zdnn.cpp +18 -14
- data/ext/sources/ggml/src/ggml-zendnn/CMakeLists.txt +1 -1
- data/ext/sources/ggml/src/ggml-zendnn/ggml-zendnn.cpp +244 -10
- data/ext/sources/ggml/src/ggml.c +110 -28
- data/ext/sources/ggml/src/gguf.cpp +173 -28
- data/ext/sources/include/parakeet.h +342 -0
- data/ext/sources/include/whisper.h +10 -0
- data/ext/sources/media/matmul.png +0 -0
- data/ext/sources/src/CMakeLists.txt +23 -0
- data/ext/sources/src/parakeet-arch.h +188 -0
- data/ext/sources/src/parakeet.cpp +3838 -0
- data/ext/sources/src/whisper.cpp +56 -12
- data/extsources.rb +26 -10
- data/lib/whisper/log_settable.rb +36 -0
- data/lib/whisper/model/uri.rb +13 -1
- data/lib/whisper/output.rb +74 -0
- data/sig/whisper.rbs +411 -62
- data/test/helper.rb +2 -0
- data/test/jfk_reader/jfk_reader.c +50 -7
- data/test/test_callback.rb +1 -0
- data/test/test_package.rb +6 -5
- data/test/test_parakeet.rb +28 -0
- data/test/test_parakeet_callback.rb +107 -0
- data/test/test_parakeet_context.rb +116 -0
- data/test/test_parakeet_context_params.rb +24 -0
- data/test/test_parakeet_model.rb +21 -0
- data/test/test_parakeet_params.rb +78 -0
- data/test/test_parakeet_segment.rb +42 -0
- data/test/test_parakeet_token.rb +73 -0
- data/test/test_params.rb +2 -0
- data/test/test_vad_segment.rb +1 -1
- data/test/test_whisper.rb +24 -6
- data/whispercpp.gemspec +2 -2
- metadata +215 -281
- data/ext/sources/bindings/javascript/CMakeLists.txt +0 -41
- data/ext/sources/bindings/javascript/emscripten.cpp +0 -93
- data/ext/sources/bindings/javascript/libwhisper.worker.js +0 -1
- data/ext/sources/bindings/javascript/package.json +0 -26
- data/ext/sources/bindings/javascript/whisper.js +0 -19
- data/ext/sources/examples/addon.node/CMakeLists.txt +0 -31
- data/ext/sources/examples/addon.node/__test__/whisper.spec.js +0 -133
- data/ext/sources/examples/addon.node/addon.cpp +0 -557
- data/ext/sources/examples/addon.node/index.js +0 -59
- data/ext/sources/examples/addon.node/package.json +0 -16
- data/ext/sources/examples/addon.node/vad-example.js +0 -132
- data/ext/sources/examples/bench.wasm/CMakeLists.txt +0 -49
- data/ext/sources/examples/bench.wasm/emscripten.cpp +0 -87
- data/ext/sources/examples/bench.wasm/index-tmpl.html +0 -285
- data/ext/sources/examples/coi-serviceworker.js +0 -146
- data/ext/sources/examples/command/CMakeLists.txt +0 -10
- data/ext/sources/examples/command/command.cpp +0 -802
- data/ext/sources/examples/command/commands.txt +0 -9
- data/ext/sources/examples/command.wasm/CMakeLists.txt +0 -50
- data/ext/sources/examples/command.wasm/emscripten.cpp +0 -327
- data/ext/sources/examples/command.wasm/index-tmpl.html +0 -415
- data/ext/sources/examples/generate-karaoke.sh +0 -57
- data/ext/sources/examples/helpers.js +0 -191
- data/ext/sources/examples/livestream.sh +0 -112
- data/ext/sources/examples/lsp/CMakeLists.txt +0 -10
- data/ext/sources/examples/lsp/lsp.cpp +0 -471
- data/ext/sources/examples/lsp/whisper.vim +0 -362
- data/ext/sources/examples/python/test_whisper_processor.py +0 -7
- data/ext/sources/examples/python/whisper_processor.py +0 -54
- data/ext/sources/examples/server/bench.js +0 -29
- data/ext/sources/examples/server.py +0 -120
- data/ext/sources/examples/stream/CMakeLists.txt +0 -10
- data/ext/sources/examples/stream/stream.cpp +0 -437
- data/ext/sources/examples/stream.wasm/CMakeLists.txt +0 -49
- data/ext/sources/examples/stream.wasm/emscripten.cpp +0 -216
- data/ext/sources/examples/stream.wasm/index-tmpl.html +0 -491
- data/ext/sources/examples/sycl/CMakeLists.txt +0 -9
- data/ext/sources/examples/sycl/build.sh +0 -22
- data/ext/sources/examples/sycl/ls-sycl-device.cpp +0 -11
- data/ext/sources/examples/sycl/run-whisper.sh +0 -17
- data/ext/sources/examples/talk-llama/CMakeLists.txt +0 -48
- data/ext/sources/examples/talk-llama/eleven-labs.py +0 -80
- data/ext/sources/examples/talk-llama/llama-adapter.cpp +0 -488
- data/ext/sources/examples/talk-llama/llama-adapter.h +0 -89
- data/ext/sources/examples/talk-llama/llama-arch.cpp +0 -2877
- data/ext/sources/examples/talk-llama/llama-arch.h +0 -628
- data/ext/sources/examples/talk-llama/llama-batch.cpp +0 -919
- data/ext/sources/examples/talk-llama/llama-batch.h +0 -173
- data/ext/sources/examples/talk-llama/llama-chat.cpp +0 -896
- data/ext/sources/examples/talk-llama/llama-chat.h +0 -71
- data/ext/sources/examples/talk-llama/llama-context.cpp +0 -3633
- data/ext/sources/examples/talk-llama/llama-context.h +0 -359
- data/ext/sources/examples/talk-llama/llama-cparams.cpp +0 -5
- data/ext/sources/examples/talk-llama/llama-cparams.h +0 -47
- data/ext/sources/examples/talk-llama/llama-ext.h +0 -12
- data/ext/sources/examples/talk-llama/llama-grammar.cpp +0 -1464
- data/ext/sources/examples/talk-llama/llama-grammar.h +0 -194
- data/ext/sources/examples/talk-llama/llama-graph.cpp +0 -2735
- data/ext/sources/examples/talk-llama/llama-graph.h +0 -1031
- data/ext/sources/examples/talk-llama/llama-hparams.cpp +0 -258
- data/ext/sources/examples/talk-llama/llama-hparams.h +0 -353
- data/ext/sources/examples/talk-llama/llama-impl.cpp +0 -171
- data/ext/sources/examples/talk-llama/llama-impl.h +0 -75
- data/ext/sources/examples/talk-llama/llama-io.cpp +0 -15
- data/ext/sources/examples/talk-llama/llama-io.h +0 -35
- data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.cpp +0 -330
- data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.h +0 -137
- data/ext/sources/examples/talk-llama/llama-kv-cache.cpp +0 -2285
- data/ext/sources/examples/talk-llama/llama-kv-cache.h +0 -389
- data/ext/sources/examples/talk-llama/llama-kv-cells.h +0 -533
- data/ext/sources/examples/talk-llama/llama-memory-hybrid-iswa.cpp +0 -275
- data/ext/sources/examples/talk-llama/llama-memory-hybrid-iswa.h +0 -140
- data/ext/sources/examples/talk-llama/llama-memory-hybrid.cpp +0 -268
- data/ext/sources/examples/talk-llama/llama-memory-hybrid.h +0 -139
- data/ext/sources/examples/talk-llama/llama-memory-recurrent.cpp +0 -1165
- data/ext/sources/examples/talk-llama/llama-memory-recurrent.h +0 -182
- data/ext/sources/examples/talk-llama/llama-memory.cpp +0 -59
- data/ext/sources/examples/talk-llama/llama-memory.h +0 -122
- data/ext/sources/examples/talk-llama/llama-mmap.cpp +0 -752
- data/ext/sources/examples/talk-llama/llama-mmap.h +0 -73
- data/ext/sources/examples/talk-llama/llama-model-loader.cpp +0 -1655
- data/ext/sources/examples/talk-llama/llama-model-loader.h +0 -206
- data/ext/sources/examples/talk-llama/llama-model-saver.cpp +0 -299
- data/ext/sources/examples/talk-llama/llama-model-saver.h +0 -40
- data/ext/sources/examples/talk-llama/llama-model.cpp +0 -9056
- data/ext/sources/examples/talk-llama/llama-model.h +0 -597
- data/ext/sources/examples/talk-llama/llama-quant.cpp +0 -1304
- data/ext/sources/examples/talk-llama/llama-quant.h +0 -1
- data/ext/sources/examples/talk-llama/llama-sampler.cpp +0 -3885
- data/ext/sources/examples/talk-llama/llama-sampler.h +0 -42
- data/ext/sources/examples/talk-llama/llama-vocab.cpp +0 -3970
- data/ext/sources/examples/talk-llama/llama-vocab.h +0 -187
- data/ext/sources/examples/talk-llama/llama.cpp +0 -1194
- data/ext/sources/examples/talk-llama/llama.h +0 -1573
- data/ext/sources/examples/talk-llama/models/afmoe.cpp +0 -190
- data/ext/sources/examples/talk-llama/models/apertus.cpp +0 -125
- data/ext/sources/examples/talk-llama/models/arcee.cpp +0 -135
- data/ext/sources/examples/talk-llama/models/arctic.cpp +0 -137
- data/ext/sources/examples/talk-llama/models/arwkv7.cpp +0 -86
- data/ext/sources/examples/talk-llama/models/baichuan.cpp +0 -123
- data/ext/sources/examples/talk-llama/models/bailingmoe.cpp +0 -143
- data/ext/sources/examples/talk-llama/models/bailingmoe2.cpp +0 -133
- data/ext/sources/examples/talk-llama/models/bert.cpp +0 -184
- data/ext/sources/examples/talk-llama/models/bitnet.cpp +0 -145
- data/ext/sources/examples/talk-llama/models/bloom.cpp +0 -101
- data/ext/sources/examples/talk-llama/models/chameleon.cpp +0 -178
- data/ext/sources/examples/talk-llama/models/chatglm.cpp +0 -132
- data/ext/sources/examples/talk-llama/models/codeshell.cpp +0 -111
- data/ext/sources/examples/talk-llama/models/cogvlm.cpp +0 -102
- data/ext/sources/examples/talk-llama/models/cohere2-iswa.cpp +0 -134
- data/ext/sources/examples/talk-llama/models/command-r.cpp +0 -122
- data/ext/sources/examples/talk-llama/models/dbrx.cpp +0 -122
- data/ext/sources/examples/talk-llama/models/deci.cpp +0 -135
- data/ext/sources/examples/talk-llama/models/deepseek.cpp +0 -142
- data/ext/sources/examples/talk-llama/models/deepseek2.cpp +0 -262
- data/ext/sources/examples/talk-llama/models/delta-net-base.cpp +0 -445
- data/ext/sources/examples/talk-llama/models/dots1.cpp +0 -132
- data/ext/sources/examples/talk-llama/models/dream.cpp +0 -105
- data/ext/sources/examples/talk-llama/models/ernie4-5-moe.cpp +0 -148
- data/ext/sources/examples/talk-llama/models/ernie4-5.cpp +0 -110
- data/ext/sources/examples/talk-llama/models/eurobert.cpp +0 -97
- data/ext/sources/examples/talk-llama/models/exaone-moe.cpp +0 -145
- data/ext/sources/examples/talk-llama/models/exaone.cpp +0 -114
- data/ext/sources/examples/talk-llama/models/exaone4.cpp +0 -123
- data/ext/sources/examples/talk-llama/models/falcon-h1.cpp +0 -111
- data/ext/sources/examples/talk-llama/models/falcon.cpp +0 -120
- data/ext/sources/examples/talk-llama/models/gemma-embedding.cpp +0 -116
- data/ext/sources/examples/talk-llama/models/gemma.cpp +0 -112
- data/ext/sources/examples/talk-llama/models/gemma2-iswa.cpp +0 -128
- data/ext/sources/examples/talk-llama/models/gemma3.cpp +0 -155
- data/ext/sources/examples/talk-llama/models/gemma3n-iswa.cpp +0 -384
- data/ext/sources/examples/talk-llama/models/glm4-moe.cpp +0 -170
- data/ext/sources/examples/talk-llama/models/glm4.cpp +0 -157
- data/ext/sources/examples/talk-llama/models/gpt2.cpp +0 -105
- data/ext/sources/examples/talk-llama/models/gptneox.cpp +0 -144
- data/ext/sources/examples/talk-llama/models/granite-hybrid.cpp +0 -195
- data/ext/sources/examples/talk-llama/models/granite.cpp +0 -210
- data/ext/sources/examples/talk-llama/models/grok.cpp +0 -159
- data/ext/sources/examples/talk-llama/models/grovemoe.cpp +0 -139
- data/ext/sources/examples/talk-llama/models/hunyuan-dense.cpp +0 -132
- data/ext/sources/examples/talk-llama/models/hunyuan-moe.cpp +0 -153
- data/ext/sources/examples/talk-llama/models/internlm2.cpp +0 -120
- data/ext/sources/examples/talk-llama/models/jais.cpp +0 -86
- data/ext/sources/examples/talk-llama/models/jais2.cpp +0 -123
- data/ext/sources/examples/talk-llama/models/jamba.cpp +0 -106
- data/ext/sources/examples/talk-llama/models/kimi-linear.cpp +0 -381
- data/ext/sources/examples/talk-llama/models/lfm2.cpp +0 -196
- data/ext/sources/examples/talk-llama/models/llada-moe.cpp +0 -122
- data/ext/sources/examples/talk-llama/models/llada.cpp +0 -99
- data/ext/sources/examples/talk-llama/models/llama-iswa.cpp +0 -178
- data/ext/sources/examples/talk-llama/models/llama.cpp +0 -175
- data/ext/sources/examples/talk-llama/models/maincoder.cpp +0 -117
- data/ext/sources/examples/talk-llama/models/mamba-base.cpp +0 -289
- data/ext/sources/examples/talk-llama/models/mamba.cpp +0 -54
- data/ext/sources/examples/talk-llama/models/mimo2-iswa.cpp +0 -129
- data/ext/sources/examples/talk-llama/models/minicpm3.cpp +0 -200
- data/ext/sources/examples/talk-llama/models/minimax-m2.cpp +0 -123
- data/ext/sources/examples/talk-llama/models/mistral3.cpp +0 -160
- data/ext/sources/examples/talk-llama/models/models.h +0 -704
- data/ext/sources/examples/talk-llama/models/modern-bert.cpp +0 -109
- data/ext/sources/examples/talk-llama/models/mpt.cpp +0 -126
- data/ext/sources/examples/talk-llama/models/nemotron-h.cpp +0 -162
- data/ext/sources/examples/talk-llama/models/nemotron.cpp +0 -122
- data/ext/sources/examples/talk-llama/models/neo-bert.cpp +0 -104
- data/ext/sources/examples/talk-llama/models/olmo.cpp +0 -121
- data/ext/sources/examples/talk-llama/models/olmo2.cpp +0 -150
- data/ext/sources/examples/talk-llama/models/olmoe.cpp +0 -124
- data/ext/sources/examples/talk-llama/models/openai-moe-iswa.cpp +0 -127
- data/ext/sources/examples/talk-llama/models/openelm.cpp +0 -124
- data/ext/sources/examples/talk-llama/models/orion.cpp +0 -123
- data/ext/sources/examples/talk-llama/models/paddleocr.cpp +0 -122
- data/ext/sources/examples/talk-llama/models/pangu-embedded.cpp +0 -121
- data/ext/sources/examples/talk-llama/models/phi2.cpp +0 -121
- data/ext/sources/examples/talk-llama/models/phi3.cpp +0 -152
- data/ext/sources/examples/talk-llama/models/plamo.cpp +0 -110
- data/ext/sources/examples/talk-llama/models/plamo2.cpp +0 -320
- data/ext/sources/examples/talk-llama/models/plamo3.cpp +0 -128
- data/ext/sources/examples/talk-llama/models/plm.cpp +0 -169
- data/ext/sources/examples/talk-llama/models/qwen.cpp +0 -108
- data/ext/sources/examples/talk-llama/models/qwen2.cpp +0 -126
- data/ext/sources/examples/talk-llama/models/qwen2moe.cpp +0 -151
- data/ext/sources/examples/talk-llama/models/qwen2vl.cpp +0 -117
- data/ext/sources/examples/talk-llama/models/qwen3.cpp +0 -120
- data/ext/sources/examples/talk-llama/models/qwen35.cpp +0 -381
- data/ext/sources/examples/talk-llama/models/qwen35moe.cpp +0 -422
- data/ext/sources/examples/talk-llama/models/qwen3moe.cpp +0 -131
- data/ext/sources/examples/talk-llama/models/qwen3next.cpp +0 -525
- data/ext/sources/examples/talk-llama/models/qwen3vl-moe.cpp +0 -140
- data/ext/sources/examples/talk-llama/models/qwen3vl.cpp +0 -132
- data/ext/sources/examples/talk-llama/models/refact.cpp +0 -94
- data/ext/sources/examples/talk-llama/models/rnd1.cpp +0 -126
- data/ext/sources/examples/talk-llama/models/rwkv6-base.cpp +0 -164
- data/ext/sources/examples/talk-llama/models/rwkv6.cpp +0 -94
- data/ext/sources/examples/talk-llama/models/rwkv6qwen2.cpp +0 -86
- data/ext/sources/examples/talk-llama/models/rwkv7-base.cpp +0 -137
- data/ext/sources/examples/talk-llama/models/rwkv7.cpp +0 -90
- data/ext/sources/examples/talk-llama/models/seed-oss.cpp +0 -124
- data/ext/sources/examples/talk-llama/models/smallthinker.cpp +0 -126
- data/ext/sources/examples/talk-llama/models/smollm3.cpp +0 -128
- data/ext/sources/examples/talk-llama/models/stablelm.cpp +0 -146
- data/ext/sources/examples/talk-llama/models/starcoder.cpp +0 -100
- data/ext/sources/examples/talk-llama/models/starcoder2.cpp +0 -121
- data/ext/sources/examples/talk-llama/models/step35-iswa.cpp +0 -165
- data/ext/sources/examples/talk-llama/models/t5-dec.cpp +0 -166
- data/ext/sources/examples/talk-llama/models/t5-enc.cpp +0 -96
- data/ext/sources/examples/talk-llama/models/wavtokenizer-dec.cpp +0 -149
- data/ext/sources/examples/talk-llama/models/xverse.cpp +0 -108
- data/ext/sources/examples/talk-llama/prompts/talk-alpaca.txt +0 -23
- data/ext/sources/examples/talk-llama/speak +0 -40
- data/ext/sources/examples/talk-llama/speak.bat +0 -1
- data/ext/sources/examples/talk-llama/speak.ps1 +0 -14
- data/ext/sources/examples/talk-llama/talk-llama.cpp +0 -813
- data/ext/sources/examples/talk-llama/unicode-data.cpp +0 -7034
- data/ext/sources/examples/talk-llama/unicode-data.h +0 -20
- data/ext/sources/examples/talk-llama/unicode.cpp +0 -1103
- data/ext/sources/examples/talk-llama/unicode.h +0 -111
- data/ext/sources/examples/wchess/CMakeLists.txt +0 -10
- data/ext/sources/examples/wchess/libwchess/CMakeLists.txt +0 -19
- data/ext/sources/examples/wchess/libwchess/Chessboard.cpp +0 -803
- data/ext/sources/examples/wchess/libwchess/Chessboard.h +0 -33
- data/ext/sources/examples/wchess/libwchess/WChess.cpp +0 -193
- data/ext/sources/examples/wchess/libwchess/WChess.h +0 -63
- data/ext/sources/examples/wchess/libwchess/test-chessboard.cpp +0 -117
- data/ext/sources/examples/wchess/wchess.cmd/CMakeLists.txt +0 -8
- data/ext/sources/examples/wchess/wchess.cmd/wchess.cmd.cpp +0 -253
- data/ext/sources/examples/whisper.wasm/CMakeLists.txt +0 -50
- data/ext/sources/examples/whisper.wasm/emscripten.cpp +0 -118
- data/ext/sources/examples/whisper.wasm/index-tmpl.html +0 -659
- data/ext/sources/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +0 -99
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp-msg.h +0 -155
- data/ext/sources/ggml/src/ggml-hexagon/op-desc.h +0 -153
- data/ext/sources/ggml/src/ggml-opencl/kernels/embed_kernel.py +0 -26
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.cpp +0 -123
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.h +0 -17
- data/ext/sources/ggml/src/ggml-virtgpu/regenerate_remoting.py +0 -333
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rte.glsl +0 -5
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +0 -182
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/glu.tmpl.wgsl +0 -323
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl +0 -718
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm.wgsl +0 -123
- data/ext/sources/tests/CMakeLists.txt +0 -112
- data/ext/sources/tests/earnings21/eval.mk +0 -58
- data/ext/sources/tests/earnings21/eval.py +0 -68
- data/ext/sources/tests/earnings21/normalizers/__init__.py +0 -2
- data/ext/sources/tests/earnings21/normalizers/basic.py +0 -80
- data/ext/sources/tests/earnings21/normalizers/english.json +0 -1741
- data/ext/sources/tests/earnings21/normalizers/english.py +0 -550
- data/ext/sources/tests/earnings21/requirements.txt +0 -6
- data/ext/sources/tests/en-0-ref.txt +0 -1
- data/ext/sources/tests/en-1-ref.txt +0 -1
- data/ext/sources/tests/en-2-ref.txt +0 -1
- data/ext/sources/tests/es-0-ref.txt +0 -1
- data/ext/sources/tests/librispeech/eval.mk +0 -39
- data/ext/sources/tests/librispeech/eval.py +0 -47
- data/ext/sources/tests/librispeech/normalizers/__init__.py +0 -2
- data/ext/sources/tests/librispeech/normalizers/basic.py +0 -80
- data/ext/sources/tests/librispeech/normalizers/english.json +0 -1741
- data/ext/sources/tests/librispeech/normalizers/english.py +0 -550
- data/ext/sources/tests/librispeech/requirements.txt +0 -6
- data/ext/sources/tests/run-tests.sh +0 -130
- data/ext/sources/tests/test-c.c +0 -3
- data/ext/sources/tests/test-vad-full.cpp +0 -56
- data/ext/sources/tests/test-vad.cpp +0 -83
- data/ext/sources/tests/test-whisper.js +0 -58
- data/lib/whisper/context.rb +0 -15
- data/lib/whisper/segment.rb +0 -58
- /data/ext/sources/ggml/src/ggml-opencl/kernels/{gemv_noshuffle_general_q8_0_f32.cl → gemv_noshuffle_q8_0_f32.cl} +0 -0
|
@@ -7,10 +7,17 @@
|
|
|
7
7
|
|
|
8
8
|
#include <atomic>
|
|
9
9
|
#include <chrono>
|
|
10
|
-
#include <cstddef>
|
|
11
10
|
#include <mutex>
|
|
11
|
+
#include <thread>
|
|
12
|
+
#include <cstddef>
|
|
12
13
|
#include <stdexcept>
|
|
13
14
|
#include <string>
|
|
15
|
+
#include <sstream>
|
|
16
|
+
#include <iomanip>
|
|
17
|
+
#include <unordered_set>
|
|
18
|
+
#include <unordered_map>
|
|
19
|
+
#include <regex>
|
|
20
|
+
#include <queue>
|
|
14
21
|
|
|
15
22
|
#ifdef _WIN32
|
|
16
23
|
# include <sal.h>
|
|
@@ -32,23 +39,38 @@
|
|
|
32
39
|
#include "ggml-hexagon.h"
|
|
33
40
|
#include "ggml-impl.h"
|
|
34
41
|
#include "ggml-quants.h"
|
|
35
|
-
#include "
|
|
36
|
-
#include "htp-
|
|
42
|
+
#include "htp-opnode.h"
|
|
43
|
+
#include "htp-ops.h"
|
|
37
44
|
#include "htp_iface.h"
|
|
38
45
|
#include "htp-drv.h"
|
|
39
46
|
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
static int
|
|
45
|
-
static
|
|
46
|
-
static
|
|
47
|
-
static int
|
|
47
|
+
using intvec = std::vector<int>;
|
|
48
|
+
using uintvec = std::vector<unsigned int>;
|
|
49
|
+
using u32vec = std::vector<uint32_t>;
|
|
50
|
+
|
|
51
|
+
static int opt_arch = 0; // autodetect
|
|
52
|
+
static size_t opt_ndev = 1;
|
|
53
|
+
static size_t opt_nhvx = 0; // use all
|
|
54
|
+
static int opt_use_hmx = 1; // when set, enable HMX; when 0, use HVX only
|
|
55
|
+
static size_t opt_vmem = HTP_OP_MAX_VMEM_DEFAULT; // max available va space for buffer mappings
|
|
56
|
+
static size_t opt_mbuf = 1ul * 1024 * 1024 * 1024; // max buffer size
|
|
57
|
+
static int opt_etm = 0;
|
|
58
|
+
static int opt_verbose = 0;
|
|
59
|
+
static int opt_profile = 0; // profiling mode (0-disabled, 1-basic, 2-pmu)
|
|
60
|
+
static int opt_hostbuf = 1; // hostbuf ON by default
|
|
61
|
+
|
|
62
|
+
// Default PMU events, if profiling with PMU (mode=2) is enabled
|
|
63
|
+
// See https://docs.qualcomm.com/doc/80-N2040-60/topic/pmu-events.html
|
|
64
|
+
// https://docs.qualcomm.com/doc/80-N2040-61/topic/hvx-pmu-events.html
|
|
65
|
+
static u32vec opt_pmu_evt { 0x3, 0x111, 0x100, 0x105, 0x240, 0x256, 0x7D, 0x8C };
|
|
48
66
|
|
|
49
67
|
// Enable all stages by default
|
|
50
|
-
static int
|
|
51
|
-
static int
|
|
68
|
+
static int opt_opstage = HTP_OPSTAGE_QUEUE | HTP_OPSTAGE_COMPUTE;
|
|
69
|
+
static int opt_opbatch = 1024; // max number of ops in a batch
|
|
70
|
+
static int opt_opqueue = 16; // max number of pending batches
|
|
71
|
+
static int opt_oppoll = 0; // polling for batch completions
|
|
72
|
+
|
|
73
|
+
static std::regex* opt_opfilter = NULL; // regex of ops to not claim
|
|
52
74
|
|
|
53
75
|
#define HEX_VERBOSE(...) \
|
|
54
76
|
if (opt_verbose) GGML_LOG_DEBUG(__VA_ARGS__)
|
|
@@ -80,47 +102,45 @@ static const char * status_to_str(uint32_t status) {
|
|
|
80
102
|
|
|
81
103
|
// ** debug helpers
|
|
82
104
|
|
|
83
|
-
static void ggml_hexagon_dump_op_exec(const std::string &sess_name, const
|
|
105
|
+
static void ggml_hexagon_dump_op_exec(const std::string &sess_name, const htp_opnode & node, const uint32_t req_flags) {
|
|
84
106
|
if (!opt_verbose) return;
|
|
85
107
|
|
|
86
|
-
|
|
108
|
+
htp_opformat fmt(node);
|
|
87
109
|
GGML_LOG_DEBUG("ggml-hex: %s execute-op %s: %s : %s : %s : %s : %s : flags 0x%x\n", sess_name.c_str(),
|
|
88
|
-
|
|
110
|
+
node.op_name().c_str(), fmt.names, fmt.dims, fmt.types, fmt.strides, fmt.buffs, req_flags);
|
|
89
111
|
}
|
|
90
112
|
|
|
91
113
|
static void ggml_hexagon_dump_op_supp(const std::string &sess_name, const struct ggml_tensor * op, bool supp) {
|
|
92
114
|
if (!opt_verbose) return;
|
|
93
115
|
|
|
94
|
-
|
|
95
|
-
GGML_LOG_DEBUG("ggml-hex: %s supports-op %s
|
|
96
|
-
|
|
116
|
+
htp_opformat fmt(htp_opformat(htp_opnode{const_cast<ggml_tensor*>(op), {}, HTP_OP_INVALID}));
|
|
117
|
+
GGML_LOG_DEBUG("ggml-hex: %s supports-op %s: %s : %s : %s : %s : %s : %s\n", sess_name.c_str(),
|
|
118
|
+
ggml_op_desc(op), fmt.names, fmt.dims, fmt.types, fmt.strides, fmt.buffs, supp ? "yes" : "no");
|
|
97
119
|
}
|
|
98
120
|
|
|
99
|
-
static void ggml_hexagon_dump_op_prof(const std::string &sess_name, const
|
|
100
|
-
uint32_t op_usec, uint32_t op_cycles, uint32_t
|
|
121
|
+
static void ggml_hexagon_dump_op_prof(const std::string &sess_name, const htp_opnode & node,
|
|
122
|
+
uint32_t op_usec, uint32_t op_cycles, const uint32_t pmu[]) {
|
|
101
123
|
if (!opt_profile) return;
|
|
102
124
|
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
125
|
+
char pmu_str[256] = "";
|
|
126
|
+
if (opt_profile > 1) {
|
|
127
|
+
static_assert(HTP_PROF_PMU_NCNT == 8, "current implementation assumes 8 PMU counters");
|
|
128
|
+
sprintf(pmu_str, " pmu [%u,%u,%u,%u,%u,%u,%u,%u]",
|
|
129
|
+
pmu[0], pmu[1], pmu[2], pmu[3], pmu[4], pmu[5], pmu[6], pmu[7]);
|
|
130
|
+
}
|
|
131
|
+
|
|
132
|
+
htp_opformat fmt(node);
|
|
133
|
+
GGML_LOG_DEBUG("ggml-hex: %s profile-op %s: %s : %s : %s : %s : usec %u cycles %u%s\n", sess_name.c_str(),
|
|
134
|
+
node.op_name().c_str(), fmt.names, fmt.dims, fmt.types, fmt.strides, op_usec, op_cycles, pmu_str);
|
|
107
135
|
}
|
|
108
136
|
|
|
109
137
|
// ** backend sessions
|
|
110
138
|
|
|
111
|
-
struct
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
void allocate(int dev_id) noexcept(false);
|
|
116
|
-
void release() noexcept(true);
|
|
117
|
-
|
|
118
|
-
void enqueue(struct htp_general_req &req, struct dspqueue_buffer *bufs, uint32_t n_bufs, bool sync = false);
|
|
119
|
-
void flush();
|
|
120
|
-
|
|
121
|
-
ggml_backend_buffer_type buffer_type = {};
|
|
122
|
-
ggml_backend_buffer_type repack_buffer_type = {};
|
|
139
|
+
struct ggml_hexagon_opbatch;
|
|
140
|
+
struct ggml_hexagon_opqueue;
|
|
141
|
+
struct htp_opnode;
|
|
123
142
|
|
|
143
|
+
struct ggml_hexagon_session {
|
|
124
144
|
std::string name;
|
|
125
145
|
remote_handle64 handle;
|
|
126
146
|
dspqueue_t queue;
|
|
@@ -132,87 +152,28 @@ struct ggml_hexagon_session {
|
|
|
132
152
|
bool valid_handle;
|
|
133
153
|
bool valid_queue;
|
|
134
154
|
bool valid_iface;
|
|
135
|
-
std::atomic<int> op_pending;
|
|
136
|
-
uint32_t prof_usecs;
|
|
137
|
-
uint32_t prof_cycles;
|
|
138
|
-
uint32_t prof_pkts;
|
|
139
|
-
};
|
|
140
|
-
|
|
141
|
-
void ggml_hexagon_session::enqueue(struct htp_general_req &req, struct dspqueue_buffer *bufs, uint32_t n_bufs, bool sync) {
|
|
142
|
-
// Bump pending flag (cleared in the session::flush once we get the response)
|
|
143
|
-
this->op_pending++; // atomic inc
|
|
144
|
-
|
|
145
|
-
int err = dspqueue_write(this->queue,
|
|
146
|
-
0, // flags - the framework will autoset this
|
|
147
|
-
n_bufs, // number of buffers
|
|
148
|
-
bufs, // buffer references
|
|
149
|
-
sizeof(req), // Message length
|
|
150
|
-
(const uint8_t *) &req, // Message
|
|
151
|
-
DSPQUEUE_TIMEOUT // Timeout
|
|
152
|
-
);
|
|
153
|
-
|
|
154
|
-
if (err != 0) {
|
|
155
|
-
GGML_ABORT("ggml-hex: %s dspqueue_write failed: 0x%08x\n", this->name.c_str(), (unsigned) err);
|
|
156
|
-
}
|
|
157
|
-
|
|
158
|
-
if (sync) {
|
|
159
|
-
flush();
|
|
160
|
-
}
|
|
161
|
-
}
|
|
162
|
-
|
|
163
|
-
// Flush HTP response queue i.e wait for all outstanding requests to complete
|
|
164
|
-
void ggml_hexagon_session::flush() {
|
|
165
|
-
dspqueue_t q = this->queue;
|
|
166
|
-
|
|
167
|
-
// Repeatedly read packets from the queue until it's empty. We don't
|
|
168
|
-
// necessarily get a separate callback for each packet, and new packets
|
|
169
|
-
// may arrive while we're processing the previous one.
|
|
170
|
-
|
|
171
|
-
while (this->op_pending) {
|
|
172
|
-
struct htp_general_rsp rsp;
|
|
173
|
-
uint32_t rsp_size;
|
|
174
|
-
uint32_t flags;
|
|
175
155
|
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
// Read response packet from queue
|
|
180
|
-
int err = dspqueue_read(q, &flags,
|
|
181
|
-
HTP_MAX_PACKET_BUFFERS, // Maximum number of buffer references
|
|
182
|
-
&n_bufs, // Number of buffer references
|
|
183
|
-
bufs, // Buffer references
|
|
184
|
-
sizeof(rsp), // Max message length
|
|
185
|
-
&rsp_size, // Message length
|
|
186
|
-
(uint8_t *) &rsp, // Message
|
|
187
|
-
DSPQUEUE_TIMEOUT); // Timeout
|
|
156
|
+
std::atomic<int> op_pending;
|
|
157
|
+
ggml_hexagon_opbatch* op_batch;
|
|
158
|
+
ggml_hexagon_opqueue* op_queue;
|
|
188
159
|
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
continue;
|
|
192
|
-
}
|
|
160
|
+
ggml_backend_buffer_type buffer_type = {};
|
|
161
|
+
ggml_backend_buffer_type repack_buffer_type = {};
|
|
193
162
|
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
}
|
|
163
|
+
ggml_hexagon_session(int dev_id, ggml_backend_dev_t dev) noexcept(false);
|
|
164
|
+
~ggml_hexagon_session() noexcept(true);
|
|
197
165
|
|
|
198
|
-
|
|
199
|
-
if (rsp_size != sizeof(rsp)) {
|
|
200
|
-
GGML_ABORT("ggml-hex: dspcall : bad response (size)\n");
|
|
201
|
-
}
|
|
166
|
+
const char* c_name() const { return name.c_str(); }
|
|
202
167
|
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
// TODO: handle errors
|
|
206
|
-
}
|
|
168
|
+
void allocate(int dev_id) noexcept(false);
|
|
169
|
+
void release() noexcept(true);
|
|
207
170
|
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
this->prof_cycles = rsp.prof_cycles;
|
|
211
|
-
this->prof_pkts = rsp.prof_pkts;
|
|
171
|
+
void enqueue_op(const htp_opnode & node);
|
|
172
|
+
void flush(bool all = true);
|
|
212
173
|
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
}
|
|
174
|
+
void flush_pending(bool all = false);
|
|
175
|
+
void flush_batch();
|
|
176
|
+
};
|
|
216
177
|
|
|
217
178
|
// ** backend buffers
|
|
218
179
|
|
|
@@ -226,82 +187,94 @@ struct ggml_backend_hexagon_buffer_type_context {
|
|
|
226
187
|
std::string name;
|
|
227
188
|
};
|
|
228
189
|
|
|
229
|
-
struct
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
190
|
+
struct ggml_hexagon_shared_buffer {
|
|
191
|
+
ggml_hexagon_session * sess;
|
|
192
|
+
uint8_t * base;
|
|
193
|
+
size_t size;
|
|
194
|
+
int fd;
|
|
195
|
+
bool mapped;
|
|
196
|
+
bool pinned;
|
|
197
|
+
|
|
198
|
+
void mmap() {
|
|
199
|
+
fastrpc_map_flags flags = this->pinned ? FASTRPC_MAP_FD : FASTRPC_MAP_FD_DELAYED;
|
|
234
200
|
|
|
235
|
-
int err = fastrpc_mmap(
|
|
201
|
+
int err = fastrpc_mmap(sess->domain_id, this->fd, (void *) this->base, 0, this->size, flags);
|
|
236
202
|
if (err != 0) {
|
|
237
|
-
GGML_LOG_ERROR("ggml-hex: buffer mapping failed : domain_id %d size %zu fd %d error 0x%08x\n",
|
|
238
|
-
|
|
239
|
-
|
|
203
|
+
GGML_LOG_ERROR("ggml-hex: %s buffer mapping failed : domain_id %d size %zu fd %d error 0x%08x\n", sess->c_name(),
|
|
204
|
+
sess->domain_id, this->size, this->fd, (unsigned) err);
|
|
205
|
+
throw std::runtime_error("ggml-hex: fastrpc_mmap failed (see log for details)");
|
|
240
206
|
}
|
|
241
207
|
|
|
242
|
-
|
|
243
|
-
|
|
208
|
+
HEX_VERBOSE("ggml-hex: %s mapped buffer: base %p size %zu fd %d pinned %u\n",
|
|
209
|
+
sess->c_name(), (void *) this->base, this->size, this->fd, pinned);
|
|
244
210
|
|
|
245
|
-
bool mmap() {
|
|
246
|
-
if (this->mapped) {
|
|
247
|
-
return true;
|
|
248
|
-
}
|
|
249
|
-
if (!mmap_to(this->sess)) {
|
|
250
|
-
return false;
|
|
251
|
-
}
|
|
252
211
|
this->mapped = true;
|
|
253
|
-
return true;
|
|
254
212
|
}
|
|
255
213
|
|
|
256
|
-
void
|
|
257
|
-
if (!this->mapped)
|
|
258
|
-
|
|
214
|
+
void unmap() {
|
|
215
|
+
if (!this->mapped) return;
|
|
216
|
+
|
|
217
|
+
if (!this->pinned) {
|
|
218
|
+
// HTP might still hold a reference, tell it drop it
|
|
219
|
+
htp_iface_munmap(sess->handle, this->fd);
|
|
259
220
|
}
|
|
260
221
|
|
|
261
|
-
fastrpc_munmap(
|
|
222
|
+
fastrpc_munmap(sess->domain_id, this->fd, (void *) this->base, this->size);
|
|
223
|
+
|
|
224
|
+
HEX_VERBOSE("ggml-hex: %s unmapped buffer: base %p size %zu fd %d\n", sess->c_name(),
|
|
225
|
+
(void *) this->base, size, this->fd);
|
|
226
|
+
|
|
262
227
|
this->mapped = false;
|
|
228
|
+
this->fd = -1;
|
|
263
229
|
}
|
|
264
230
|
|
|
265
|
-
|
|
266
|
-
|
|
231
|
+
void alloc(size_t size) {
|
|
232
|
+
if (this->base) return;
|
|
267
233
|
|
|
268
|
-
this->base = (uint8_t *) rpcmem_alloc2(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS
|
|
234
|
+
this->base = (uint8_t *) rpcmem_alloc2(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS, size);
|
|
269
235
|
if (!this->base) {
|
|
270
|
-
GGML_LOG_ERROR("ggml-hex: %s failed to allocate buffer : size %zu\n", sess->
|
|
236
|
+
GGML_LOG_ERROR("ggml-hex: %s failed to allocate buffer : size %zu\n", sess->c_name(), size);
|
|
271
237
|
throw std::runtime_error("ggml-hex: rpcmem_alloc failed (see log for details)");
|
|
272
238
|
}
|
|
273
239
|
|
|
274
240
|
this->fd = rpcmem_to_fd(this->base);
|
|
275
241
|
if (this->fd < 0) {
|
|
276
|
-
GGML_LOG_ERROR("ggml-hex: %s failed to get FD for buffer %p\n", sess->
|
|
277
|
-
rpcmem_free(this->base);
|
|
278
|
-
this->base = NULL;
|
|
242
|
+
GGML_LOG_ERROR("ggml-hex: %s failed to get FD for buffer %p\n", sess->c_name(), (void *) this->base);
|
|
279
243
|
throw std::runtime_error("ggml-hex: rpcmem_to_fd failed (see log for details)");
|
|
280
244
|
}
|
|
245
|
+
this->size = size;
|
|
281
246
|
|
|
282
|
-
HEX_VERBOSE("ggml-hex: %s allocated buffer: base %p size %zu fd %d
|
|
283
|
-
(void *) this->base, size, this->fd, (int)
|
|
247
|
+
HEX_VERBOSE("ggml-hex: %s allocated buffer: base %p size %zu fd %d pinned %d\n", sess->c_name(),
|
|
248
|
+
(void *) this->base, this->size, this->fd, (int) pinned);
|
|
249
|
+
mmap();
|
|
250
|
+
}
|
|
251
|
+
|
|
252
|
+
void free() {
|
|
253
|
+
if (!this->base) return;
|
|
254
|
+
|
|
255
|
+
unmap();
|
|
256
|
+
rpcmem_free(this->base);
|
|
257
|
+
|
|
258
|
+
HEX_VERBOSE("ggml-hex: %s freed buffer: base %p size %zu fd %d\n", sess->c_name(),
|
|
259
|
+
(void *) this->base, size, this->fd);
|
|
284
260
|
|
|
261
|
+
this->base = NULL;
|
|
262
|
+
}
|
|
263
|
+
|
|
264
|
+
ggml_hexagon_shared_buffer(ggml_hexagon_session * sess, size_t size, bool pinned = false) {
|
|
285
265
|
this->sess = sess;
|
|
286
|
-
this->size =
|
|
266
|
+
this->size = 0;
|
|
267
|
+
this->base = nullptr;
|
|
268
|
+
this->fd = -1;
|
|
287
269
|
this->mapped = false;
|
|
288
|
-
this->
|
|
289
|
-
}
|
|
270
|
+
this->pinned = pinned;
|
|
290
271
|
|
|
291
|
-
|
|
292
|
-
munmap();
|
|
293
|
-
if (this->base) {
|
|
294
|
-
rpcmem_free(this->base);
|
|
295
|
-
this->base = NULL;
|
|
296
|
-
}
|
|
272
|
+
alloc(size);
|
|
297
273
|
}
|
|
298
274
|
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
int fd;
|
|
303
|
-
bool mapped; // mmap is done
|
|
304
|
-
bool repack; // repacked buffer
|
|
275
|
+
~ggml_hexagon_shared_buffer() {
|
|
276
|
+
free();
|
|
277
|
+
}
|
|
305
278
|
};
|
|
306
279
|
|
|
307
280
|
static ggml_hexagon_session * ggml_backend_hexagon_buffer_get_sess(ggml_backend_buffer_t buffer) {
|
|
@@ -309,30 +282,26 @@ static ggml_hexagon_session * ggml_backend_hexagon_buffer_get_sess(ggml_backend_
|
|
|
309
282
|
}
|
|
310
283
|
|
|
311
284
|
static void ggml_backend_hexagon_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
|
312
|
-
auto
|
|
313
|
-
delete
|
|
285
|
+
auto sbuf = static_cast<ggml_hexagon_shared_buffer *>(buffer->context);
|
|
286
|
+
delete sbuf;
|
|
314
287
|
}
|
|
315
288
|
|
|
316
289
|
static void * ggml_backend_hexagon_buffer_get_base(ggml_backend_buffer_t buffer) {
|
|
317
|
-
auto
|
|
318
|
-
return
|
|
290
|
+
auto sbuf = static_cast<ggml_hexagon_shared_buffer *>(buffer->context);
|
|
291
|
+
return sbuf->base;
|
|
319
292
|
}
|
|
320
293
|
|
|
321
294
|
static enum ggml_status ggml_backend_hexagon_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
|
|
322
|
-
auto
|
|
323
|
-
auto sess =
|
|
295
|
+
auto sbuf = static_cast<ggml_hexagon_shared_buffer *>(buffer->context);
|
|
296
|
+
auto sess = sbuf->sess;
|
|
324
297
|
|
|
325
|
-
HEX_VERBOSE("ggml-hex: %s init-tensor %s : base %p data %p nbytes %zu usage %d
|
|
326
|
-
tensor->name, (void *)
|
|
327
|
-
(int) ctx->repack);
|
|
298
|
+
HEX_VERBOSE("ggml-hex: %s init-tensor %s : base %p data %p nbytes %zu usage %d\n", sess->c_name(),
|
|
299
|
+
tensor->name, (void *) sbuf->base, tensor->data, ggml_nbytes(tensor), (int) buffer->usage);
|
|
328
300
|
|
|
329
301
|
if (tensor->view_src != NULL && tensor->view_offs == 0) {
|
|
330
|
-
; // nothing to do for the view
|
|
331
|
-
} else {
|
|
332
|
-
if (!ctx->mapped) {
|
|
333
|
-
ctx->mmap();
|
|
334
|
-
}
|
|
302
|
+
return GGML_STATUS_SUCCESS; // nothing to do for the view
|
|
335
303
|
}
|
|
304
|
+
|
|
336
305
|
return GGML_STATUS_SUCCESS;
|
|
337
306
|
}
|
|
338
307
|
|
|
@@ -460,7 +429,7 @@ static void repack_row_q4x4x2(uint8_t * y, const block_q4_0 * x, int64_t k) {
|
|
|
460
429
|
d[7] = x[i * 8 + 7].d;
|
|
461
430
|
}
|
|
462
431
|
|
|
463
|
-
if (opt_verbose >
|
|
432
|
+
if (opt_verbose > 2) {
|
|
464
433
|
for (int i = 0; i < nb; i++) {
|
|
465
434
|
dump_packed_block_q4x4x2(y, i, k);
|
|
466
435
|
}
|
|
@@ -479,7 +448,7 @@ static void unpack_row_q4x4x2(block_q4_0 * x, const uint8_t * y, int64_t k) {
|
|
|
479
448
|
const uint8_t * y_q = y + 0; // quants first
|
|
480
449
|
const uint8_t * y_d = y + qrow_size; // then scales
|
|
481
450
|
|
|
482
|
-
if (opt_verbose >
|
|
451
|
+
if (opt_verbose > 2) {
|
|
483
452
|
for (int i = 0; i < nb; i++) {
|
|
484
453
|
dump_packed_block_q4x4x2(y, i, k);
|
|
485
454
|
}
|
|
@@ -583,7 +552,7 @@ static void repack_q4_0_q4x4x2(ggml_tensor * t, const void * data, size_t size)
|
|
|
583
552
|
|
|
584
553
|
size_t row_size = ggml_row_size(t->type, t->ne[0]);
|
|
585
554
|
size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_Q4_0x4x2)); // extra elements for the pad
|
|
586
|
-
size_t row_size_rp =
|
|
555
|
+
size_t row_size_rp = row_size_pd; // scratch must hold one full padded tile (qblk_size/2 quants + scales)
|
|
587
556
|
|
|
588
557
|
// Ensure we don't try to read more data than is available in the source buffer 'data'
|
|
589
558
|
// or write more than the tensor can hold.
|
|
@@ -644,7 +613,7 @@ static void repack_q4x4x2_q4_0(void * data, const ggml_tensor * t, size_t size)
|
|
|
644
613
|
|
|
645
614
|
size_t row_size = ggml_row_size(t->type, t->ne[0]);
|
|
646
615
|
size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_Q4_0x4x2)); // extra elements for the pad
|
|
647
|
-
size_t row_size_rp =
|
|
616
|
+
size_t row_size_rp = row_size_pd; // scratch must hold one full padded tile (qblk_size/2 quants + scales)
|
|
648
617
|
|
|
649
618
|
// Ensure we don't try to copy more data than the tensor actually contains.
|
|
650
619
|
const size_t total_tensor_size = (size_t)nrows * row_size;
|
|
@@ -693,6 +662,239 @@ static void repack_q4x4x2_q4_0(void * data, const ggml_tensor * t, size_t size)
|
|
|
693
662
|
ggml_aligned_free(buf_rp, row_size_rp);
|
|
694
663
|
}
|
|
695
664
|
|
|
665
|
+
static void unpack_q4_1_quants(uint8_t * qs, const block_q4_1 * x, unsigned int bi) {
|
|
666
|
+
static const int qk = QK4_1;
|
|
667
|
+
|
|
668
|
+
for (unsigned int i = 0; i < qk / 2; ++i) {
|
|
669
|
+
const int x0 = (x->qs[i] & 0x0F);
|
|
670
|
+
const int x1 = (x->qs[i] >> 4);
|
|
671
|
+
qs[bi * qk + i + 0] = x0;
|
|
672
|
+
qs[bi * qk + i + qk / 2] = x1;
|
|
673
|
+
}
|
|
674
|
+
}
|
|
675
|
+
|
|
676
|
+
static void pack_q4_1_quants(block_q4_1 * x, const uint8_t * qs, unsigned int bi) {
|
|
677
|
+
static const int qk = QK4_1;
|
|
678
|
+
|
|
679
|
+
for (unsigned int i = 0; i < qk / 2; ++i) {
|
|
680
|
+
const uint8_t x0 = qs[bi * qk + i + 0];
|
|
681
|
+
const uint8_t x1 = qs[bi * qk + i + qk / 2];
|
|
682
|
+
x->qs[i] = x0 | (x1 << 4);
|
|
683
|
+
}
|
|
684
|
+
}
|
|
685
|
+
|
|
686
|
+
static void repack_row_q4_1x4x2(uint8_t * y, const block_q4_1 * x, int64_t k) {
|
|
687
|
+
static const int qk = QK_Q4_0x4x2;
|
|
688
|
+
const int nb = (k + qk - 1) / qk; // number of blocks (padded)
|
|
689
|
+
const int nloe = k % qk; // leftovers
|
|
690
|
+
|
|
691
|
+
const int dblk_size = 8 * 4; // 8x (d, m) __fp16 = 32 bytes
|
|
692
|
+
const int qblk_size = qk / 2; // int4 = 128 bytes
|
|
693
|
+
const int qrow_size = k / 2; // int4 (not padded to blocks)
|
|
694
|
+
|
|
695
|
+
uint8_t * y_q = y + 0; // quants first
|
|
696
|
+
uint8_t * y_d = y + qrow_size; // then scales/offsets
|
|
697
|
+
|
|
698
|
+
// Repack the quants
|
|
699
|
+
for (int i = 0; i < nb; i++) {
|
|
700
|
+
uint8_t qs[QK_Q4_0x4x2]; // unpacked quants
|
|
701
|
+
unpack_q4_1_quants(qs, &x[i * 8 + 0], 0);
|
|
702
|
+
unpack_q4_1_quants(qs, &x[i * 8 + 1], 1);
|
|
703
|
+
unpack_q4_1_quants(qs, &x[i * 8 + 2], 2);
|
|
704
|
+
unpack_q4_1_quants(qs, &x[i * 8 + 3], 3);
|
|
705
|
+
unpack_q4_1_quants(qs, &x[i * 8 + 4], 4);
|
|
706
|
+
unpack_q4_1_quants(qs, &x[i * 8 + 5], 5);
|
|
707
|
+
unpack_q4_1_quants(qs, &x[i * 8 + 6], 6);
|
|
708
|
+
unpack_q4_1_quants(qs, &x[i * 8 + 7], 7);
|
|
709
|
+
|
|
710
|
+
bool partial = (nloe && i == nb-1);
|
|
711
|
+
|
|
712
|
+
uint8_t * q = y_q + (i * qblk_size);
|
|
713
|
+
for (int j = 0; j < qk / 2; j++) {
|
|
714
|
+
q[j] = partial ? (qs[j*2+1] << 4) | qs[j*2+0] : (qs[j+128] << 4) | qs[j+000];
|
|
715
|
+
}
|
|
716
|
+
}
|
|
717
|
+
|
|
718
|
+
// Repack the scales and offsets
|
|
719
|
+
for (int i = 0; i < nb; i++) {
|
|
720
|
+
ggml_half * d_m = (ggml_half *) (y_d + i * dblk_size);
|
|
721
|
+
for (int j = 0; j < 8; j++) {
|
|
722
|
+
d_m[j * 2 + 0] = x[i * 8 + j].d;
|
|
723
|
+
d_m[j * 2 + 1] = x[i * 8 + j].m;
|
|
724
|
+
}
|
|
725
|
+
}
|
|
726
|
+
}
|
|
727
|
+
|
|
728
|
+
static void unpack_row_q4_1x4x2(block_q4_1 * x, const uint8_t * y, int64_t k) {
|
|
729
|
+
static const int qk = QK_Q4_0x4x2;
|
|
730
|
+
const int nb = (k + qk - 1) / qk; // number of blocks (padded)
|
|
731
|
+
const int nloe = k % qk; // leftovers
|
|
732
|
+
|
|
733
|
+
const int dblk_size = 8 * 4; // 8x (d, m) __fp16 = 32 bytes
|
|
734
|
+
const int qblk_size = qk / 2; // int4 = 128 bytes
|
|
735
|
+
const int qrow_size = k / 2; // int4 (not padded to blocks)
|
|
736
|
+
|
|
737
|
+
const uint8_t * y_q = y + 0; // quants first
|
|
738
|
+
const uint8_t * y_d = y + qrow_size; // then scales/offsets
|
|
739
|
+
|
|
740
|
+
// Unpack the quants
|
|
741
|
+
for (int i = 0; i < nb; i++) {
|
|
742
|
+
uint8_t qs[QK_Q4_0x4x2];
|
|
743
|
+
bool partial = (nloe && i == nb-1);
|
|
744
|
+
|
|
745
|
+
const uint8_t * q = y_q + (i * qblk_size);
|
|
746
|
+
for (int j = 0; j < qk / 2; j++) {
|
|
747
|
+
if (partial) {
|
|
748
|
+
qs[j*2+0] = q[j] & 0x0F;
|
|
749
|
+
qs[j*2+1] = q[j] >> 4;
|
|
750
|
+
} else {
|
|
751
|
+
qs[j+000] = q[j] & 0x0F;
|
|
752
|
+
qs[j+128] = q[j] >> 4;
|
|
753
|
+
}
|
|
754
|
+
}
|
|
755
|
+
|
|
756
|
+
pack_q4_1_quants(&x[i * 8 + 0], qs, 0);
|
|
757
|
+
pack_q4_1_quants(&x[i * 8 + 1], qs, 1);
|
|
758
|
+
pack_q4_1_quants(&x[i * 8 + 2], qs, 2);
|
|
759
|
+
pack_q4_1_quants(&x[i * 8 + 3], qs, 3);
|
|
760
|
+
pack_q4_1_quants(&x[i * 8 + 4], qs, 4);
|
|
761
|
+
pack_q4_1_quants(&x[i * 8 + 5], qs, 5);
|
|
762
|
+
pack_q4_1_quants(&x[i * 8 + 6], qs, 6);
|
|
763
|
+
pack_q4_1_quants(&x[i * 8 + 7], qs, 7);
|
|
764
|
+
}
|
|
765
|
+
|
|
766
|
+
// Unpack the scales and offsets
|
|
767
|
+
for (int i = 0; i < nb; i++) {
|
|
768
|
+
const ggml_half * d_m = (const ggml_half *) (y_d + i * dblk_size);
|
|
769
|
+
for (int j = 0; j < 8; j++) {
|
|
770
|
+
x[i * 8 + j].d = d_m[j * 2 + 0];
|
|
771
|
+
x[i * 8 + j].m = d_m[j * 2 + 1];
|
|
772
|
+
}
|
|
773
|
+
}
|
|
774
|
+
}
|
|
775
|
+
|
|
776
|
+
static void init_row_q4_1x4x2(block_q4_1 * x, int64_t k) {
|
|
777
|
+
static const int qk = QK_Q4_0x4x2;
|
|
778
|
+
const int nb = (k + qk - 1) / qk; // number of blocks (padded)
|
|
779
|
+
|
|
780
|
+
uint8_t qs[QK_Q4_0x4x2]; // unpacked quants
|
|
781
|
+
memset(qs, 0, sizeof(qs));
|
|
782
|
+
|
|
783
|
+
for (int i = 0; i < nb; i++) {
|
|
784
|
+
pack_q4_1_quants(&x[i * 8 + 0], qs, 0);
|
|
785
|
+
pack_q4_1_quants(&x[i * 8 + 1], qs, 1);
|
|
786
|
+
pack_q4_1_quants(&x[i * 8 + 2], qs, 2);
|
|
787
|
+
pack_q4_1_quants(&x[i * 8 + 3], qs, 3);
|
|
788
|
+
pack_q4_1_quants(&x[i * 8 + 4], qs, 4);
|
|
789
|
+
pack_q4_1_quants(&x[i * 8 + 5], qs, 5);
|
|
790
|
+
pack_q4_1_quants(&x[i * 8 + 6], qs, 6);
|
|
791
|
+
pack_q4_1_quants(&x[i * 8 + 7], qs, 7);
|
|
792
|
+
}
|
|
793
|
+
|
|
794
|
+
for (int i = 0; i < nb; i++) {
|
|
795
|
+
for (int j = 0; j < 8; j++) {
|
|
796
|
+
x[i * 8 + j].d = 0;
|
|
797
|
+
x[i * 8 + j].m = 0;
|
|
798
|
+
}
|
|
799
|
+
}
|
|
800
|
+
}
|
|
801
|
+
|
|
802
|
+
static void repack_q4_1_q4x4x2(ggml_tensor * t, const void * data, size_t size) {
|
|
803
|
+
int64_t nrows = ggml_nrows(t);
|
|
804
|
+
|
|
805
|
+
size_t row_size = ggml_row_size(t->type, t->ne[0]);
|
|
806
|
+
size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_Q4_0x4x2));
|
|
807
|
+
size_t row_size_rp = row_size_pd; // scratch must hold one full padded tile (qblk_size/2 quants + scales)
|
|
808
|
+
|
|
809
|
+
const size_t total_tensor_size = (size_t)nrows * row_size;
|
|
810
|
+
const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size;
|
|
811
|
+
|
|
812
|
+
const int64_t n_full_rows = n_bytes_to_copy / row_size;
|
|
813
|
+
const size_t n_rem_bytes = n_bytes_to_copy % row_size;
|
|
814
|
+
|
|
815
|
+
void * buf_pd = ggml_aligned_malloc(row_size_pd);
|
|
816
|
+
GGML_ASSERT(buf_pd != NULL);
|
|
817
|
+
|
|
818
|
+
void * buf_rp = ggml_aligned_malloc(row_size_rp);
|
|
819
|
+
GGML_ASSERT(buf_rp != NULL);
|
|
820
|
+
|
|
821
|
+
HEX_VERBOSE("ggml-hex: repack-q4_1-q4x4x2 %s : data %p size %zu dims %ldx%ld row-size %zu\n", t->name, data, size,
|
|
822
|
+
t->ne[0], nrows, row_size);
|
|
823
|
+
|
|
824
|
+
init_row_q4_1x4x2((block_q4_1 *) buf_pd, t->ne[0]);
|
|
825
|
+
|
|
826
|
+
for (int64_t i = 0; i < n_full_rows; i++) {
|
|
827
|
+
const uint8_t * src = (const uint8_t *) data + (i * row_size);
|
|
828
|
+
uint8_t * dst = (uint8_t *) t->data + (i * row_size);
|
|
829
|
+
|
|
830
|
+
memcpy(buf_pd, src, row_size);
|
|
831
|
+
repack_row_q4_1x4x2((uint8_t *) buf_rp, (const block_q4_1 *) buf_pd, t->ne[0]);
|
|
832
|
+
memcpy(dst, buf_rp, row_size);
|
|
833
|
+
}
|
|
834
|
+
|
|
835
|
+
if (n_rem_bytes > 0) {
|
|
836
|
+
const int64_t i = n_full_rows;
|
|
837
|
+
const uint8_t * src = (const uint8_t *) data + (i * row_size);
|
|
838
|
+
uint8_t * dst = (uint8_t *) t->data + (i * row_size);
|
|
839
|
+
|
|
840
|
+
init_row_q4_1x4x2((block_q4_1 *) buf_pd, t->ne[0]);
|
|
841
|
+
memcpy(buf_pd, src, n_rem_bytes);
|
|
842
|
+
repack_row_q4_1x4x2((uint8_t *) buf_rp, (const block_q4_1 *) buf_pd, t->ne[0]);
|
|
843
|
+
memcpy(dst, buf_rp, n_rem_bytes);
|
|
844
|
+
}
|
|
845
|
+
|
|
846
|
+
ggml_aligned_free(buf_pd, row_size_pd);
|
|
847
|
+
ggml_aligned_free(buf_rp, row_size_rp);
|
|
848
|
+
}
|
|
849
|
+
|
|
850
|
+
static void repack_q4x4x2_q4_1(void * data, const ggml_tensor * t, size_t size) {
|
|
851
|
+
int64_t nrows = ggml_nrows(t);
|
|
852
|
+
|
|
853
|
+
size_t row_size = ggml_row_size(t->type, t->ne[0]);
|
|
854
|
+
size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_Q4_0x4x2));
|
|
855
|
+
size_t row_size_rp = row_size_pd; // scratch must hold one full padded tile (qblk_size/2 quants + scales)
|
|
856
|
+
|
|
857
|
+
const size_t total_tensor_size = (size_t)nrows * row_size;
|
|
858
|
+
const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size;
|
|
859
|
+
|
|
860
|
+
const int64_t n_full_rows = n_bytes_to_copy / row_size;
|
|
861
|
+
const size_t n_rem_bytes = n_bytes_to_copy % row_size;
|
|
862
|
+
|
|
863
|
+
void * buf_pd = ggml_aligned_malloc(row_size_pd);
|
|
864
|
+
GGML_ASSERT(buf_pd != NULL);
|
|
865
|
+
|
|
866
|
+
void * buf_rp = ggml_aligned_malloc(row_size_rp);
|
|
867
|
+
GGML_ASSERT(buf_rp != NULL);
|
|
868
|
+
|
|
869
|
+
HEX_VERBOSE("ggml-hex: repack-q4x4x2-q4_1 %s : data %p size %zu dims %ldx%ld row-size %zu\n", t->name, data, size,
|
|
870
|
+
t->ne[0], nrows, row_size);
|
|
871
|
+
|
|
872
|
+
memset(buf_rp, 0, row_size_rp); // clear-out padded buffer to make sure the tail is all zeros
|
|
873
|
+
|
|
874
|
+
for (int64_t i = 0; i < n_full_rows; i++) {
|
|
875
|
+
const uint8_t * src = (const uint8_t *) t->data + (i * row_size);
|
|
876
|
+
uint8_t * dst = (uint8_t *) data + (i * row_size);
|
|
877
|
+
|
|
878
|
+
memcpy(buf_rp, src, row_size);
|
|
879
|
+
unpack_row_q4_1x4x2((block_q4_1 *) buf_pd, (const uint8_t *) buf_rp, t->ne[0]);
|
|
880
|
+
memcpy(dst, buf_pd, row_size);
|
|
881
|
+
}
|
|
882
|
+
|
|
883
|
+
if (n_rem_bytes > 0) {
|
|
884
|
+
const int64_t i = n_full_rows;
|
|
885
|
+
const uint8_t * src = (const uint8_t *) t->data + (i * row_size);
|
|
886
|
+
uint8_t * dst = (uint8_t *) data + (i * row_size);
|
|
887
|
+
|
|
888
|
+
// We still need to read and unpack the entire source row because quantization is block-based.
|
|
889
|
+
memcpy(buf_rp, src, row_size);
|
|
890
|
+
unpack_row_q4_1x4x2((block_q4_1 *) buf_pd, (const uint8_t *) buf_rp, t->ne[0]);
|
|
891
|
+
memcpy(dst, buf_pd, n_rem_bytes);
|
|
892
|
+
}
|
|
893
|
+
|
|
894
|
+
ggml_aligned_free(buf_pd, row_size_pd);
|
|
895
|
+
ggml_aligned_free(buf_rp, row_size_rp);
|
|
896
|
+
}
|
|
897
|
+
|
|
696
898
|
// ======== Q8x4x2 ====================
|
|
697
899
|
static void dump_block_q8_0(const block_q8_0 * b, int i) {
|
|
698
900
|
HEX_VERBOSE("ggml-hex: repack q8_0 %d: %d %d %d %d ... %d %d %d %d : %.6f\n", i, b->qs[0], b->qs[1], b->qs[2],
|
|
@@ -795,7 +997,7 @@ static void repack_row_q8x4x2(uint8_t * y, const block_q8_0 * x, int64_t k) {
|
|
|
795
997
|
d[7] = x[i * 8 + 7].d;
|
|
796
998
|
}
|
|
797
999
|
|
|
798
|
-
if (opt_verbose >
|
|
1000
|
+
if (opt_verbose > 2) {
|
|
799
1001
|
for (int i = 0; i < nb; i++) {
|
|
800
1002
|
dump_packed_block_q8x4x2(y, i, k);
|
|
801
1003
|
}
|
|
@@ -813,7 +1015,7 @@ static void unpack_row_q8x4x2(block_q8_0 * x, const uint8_t * y, int64_t k) {
|
|
|
813
1015
|
const uint8_t * y_q = y + 0; // quants first
|
|
814
1016
|
const uint8_t * y_d = y + qrow_size; // then scales
|
|
815
1017
|
|
|
816
|
-
if (opt_verbose >
|
|
1018
|
+
if (opt_verbose > 2) {
|
|
817
1019
|
for (int i = 0; i < nb; i++) {
|
|
818
1020
|
dump_packed_block_q8x4x2(y, i, k);
|
|
819
1021
|
}
|
|
@@ -909,7 +1111,7 @@ static void repack_q8_0_q8x4x2(ggml_tensor * t, const void * data, size_t size)
|
|
|
909
1111
|
|
|
910
1112
|
size_t row_size = ggml_row_size(t->type, t->ne[0]);
|
|
911
1113
|
size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_Q8_0x4x2)); // extra elements for the pad
|
|
912
|
-
size_t row_size_rp =
|
|
1114
|
+
size_t row_size_rp = row_size_pd; // scratch must hold one full padded tile (qblk_size quants + scales)
|
|
913
1115
|
|
|
914
1116
|
// Ensure we don't try to read more data than is available in the source buffer 'data'
|
|
915
1117
|
// or write more than the tensor can hold.
|
|
@@ -970,7 +1172,7 @@ static void repack_q8x4x2_q8_0(void * data, const ggml_tensor * t, size_t size)
|
|
|
970
1172
|
|
|
971
1173
|
size_t row_size = ggml_row_size(t->type, t->ne[0]);
|
|
972
1174
|
size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_Q8_0x4x2)); // extra elements for the pad
|
|
973
|
-
size_t row_size_rp =
|
|
1175
|
+
size_t row_size_rp = row_size_pd; // scratch must hold one full padded tile (qblk_size quants + scales)
|
|
974
1176
|
|
|
975
1177
|
// Ensure we don't try to copy more data than the tensor actually contains.
|
|
976
1178
|
const size_t total_tensor_size = (size_t)nrows * row_size;
|
|
@@ -1148,7 +1350,7 @@ static void repack_row_mxfp4x4x2(uint8_t * y, const block_mxfp4 * x, int64_t k)
|
|
|
1148
1350
|
e[7] = x[i * 8 + 7].e;
|
|
1149
1351
|
}
|
|
1150
1352
|
|
|
1151
|
-
if (opt_verbose >
|
|
1353
|
+
if (opt_verbose > 2) {
|
|
1152
1354
|
for (int i = 0; i < nb; i++) {
|
|
1153
1355
|
dump_packed_block_mxfp4x4x2(y, i, k);
|
|
1154
1356
|
}
|
|
@@ -1167,7 +1369,7 @@ static void unpack_row_mxfp4x4x2(block_mxfp4 * x, const uint8_t * y, int64_t k)
|
|
|
1167
1369
|
const uint8_t * y_q = y + 0; // quants first
|
|
1168
1370
|
const uint8_t * y_e = y + qrow_size; // then scales
|
|
1169
1371
|
|
|
1170
|
-
if (opt_verbose >
|
|
1372
|
+
if (opt_verbose > 2) {
|
|
1171
1373
|
for (int i = 0; i < nb; i++) {
|
|
1172
1374
|
dump_packed_block_mxfp4x4x2(y, i, k);
|
|
1173
1375
|
}
|
|
@@ -1271,7 +1473,7 @@ static void repack_mxfp4_mxfp4x4x2(ggml_tensor * t, const void * data, size_t si
|
|
|
1271
1473
|
|
|
1272
1474
|
size_t row_size = ggml_row_size(t->type, t->ne[0]);
|
|
1273
1475
|
size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_MXFP4x4x2)); // extra elements for the pad
|
|
1274
|
-
size_t row_size_rp =
|
|
1476
|
+
size_t row_size_rp = row_size_pd; // scratch must hold one full padded tile (qblk_size/2 quants + scales)
|
|
1275
1477
|
|
|
1276
1478
|
// Ensure we don't try to read more data than is available in the source buffer 'data'
|
|
1277
1479
|
// or write more than the tensor can hold.
|
|
@@ -1332,7 +1534,7 @@ static void repack_mxfp4x4x2_mxfp4(void * data, const ggml_tensor * t, size_t si
|
|
|
1332
1534
|
|
|
1333
1535
|
size_t row_size = ggml_row_size(t->type, t->ne[0]);
|
|
1334
1536
|
size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_MXFP4x4x2)); // extra elements for the pad
|
|
1335
|
-
size_t row_size_rp =
|
|
1537
|
+
size_t row_size_rp = row_size_pd; // scratch must hold one full padded tile (qblk_size/2 quants + scales)
|
|
1336
1538
|
|
|
1337
1539
|
// Ensure we don't try to copy more data than the tensor actually contains.
|
|
1338
1540
|
const size_t total_tensor_size = (size_t)nrows * row_size;
|
|
@@ -1386,11 +1588,10 @@ static void ggml_backend_hexagon_buffer_set_tensor(ggml_backend_buffer_t buffer,
|
|
|
1386
1588
|
const void * data,
|
|
1387
1589
|
size_t offset,
|
|
1388
1590
|
size_t size) {
|
|
1389
|
-
auto
|
|
1390
|
-
auto sess =
|
|
1591
|
+
auto sbuf = (ggml_hexagon_shared_buffer *) buffer->context;
|
|
1592
|
+
auto sess = sbuf->sess;
|
|
1391
1593
|
|
|
1392
|
-
HEX_VERBOSE("ggml-hex: %s set-tensor %s : data %p offset %zu size %zu\n", sess->
|
|
1393
|
-
offset, size);
|
|
1594
|
+
HEX_VERBOSE("ggml-hex: %s set-tensor %s : data %p offset %zu size %zu\n", sess->c_name(), tensor->name, data, offset, size);
|
|
1394
1595
|
|
|
1395
1596
|
switch (tensor->type) {
|
|
1396
1597
|
case GGML_TYPE_Q4_0:
|
|
@@ -1399,10 +1600,23 @@ static void ggml_backend_hexagon_buffer_set_tensor(ggml_backend_buffer_t buffer,
|
|
|
1399
1600
|
repack_q4_0_q4x4x2(tensor, data, size);
|
|
1400
1601
|
break;
|
|
1401
1602
|
|
|
1402
|
-
case
|
|
1603
|
+
case GGML_TYPE_Q4_1:
|
|
1403
1604
|
GGML_ASSERT(offset == 0);
|
|
1404
1605
|
GGML_ASSERT(offset + size <= ggml_nbytes(tensor));
|
|
1405
|
-
|
|
1606
|
+
repack_q4_1_q4x4x2(tensor, data, size);
|
|
1607
|
+
break;
|
|
1608
|
+
|
|
1609
|
+
case GGML_TYPE_Q8_0:
|
|
1610
|
+
GGML_ASSERT(offset == 0);
|
|
1611
|
+
GGML_ASSERT(offset + size <= ggml_nbytes(tensor));
|
|
1612
|
+
repack_q8_0_q8x4x2(tensor, data, size);
|
|
1613
|
+
break;
|
|
1614
|
+
|
|
1615
|
+
case GGML_TYPE_IQ4_NL:
|
|
1616
|
+
GGML_ASSERT(offset == 0);
|
|
1617
|
+
GGML_ASSERT(offset + size <= ggml_nbytes(tensor));
|
|
1618
|
+
// IQ4_NL has identical block layout to Q4_0 (ggml_half d + uint8_t qs[16])
|
|
1619
|
+
repack_q4_0_q4x4x2(tensor, data, size);
|
|
1406
1620
|
break;
|
|
1407
1621
|
|
|
1408
1622
|
case GGML_TYPE_MXFP4:
|
|
@@ -1422,11 +1636,10 @@ static void ggml_backend_hexagon_buffer_get_tensor(ggml_backend_buffer_t buffer,
|
|
|
1422
1636
|
void * data,
|
|
1423
1637
|
size_t offset,
|
|
1424
1638
|
size_t size) {
|
|
1425
|
-
auto
|
|
1426
|
-
auto sess =
|
|
1639
|
+
auto sbuf = (ggml_hexagon_shared_buffer *) buffer->context;
|
|
1640
|
+
auto sess = sbuf->sess;
|
|
1427
1641
|
|
|
1428
|
-
HEX_VERBOSE("ggml-hex: %s get-tensor %s : data %p offset %zu size %zu\n", sess->
|
|
1429
|
-
offset, size);
|
|
1642
|
+
HEX_VERBOSE("ggml-hex: %s get-tensor %s : data %p offset %zu size %zu\n", sess->c_name(), tensor->name, data, offset, size);
|
|
1430
1643
|
|
|
1431
1644
|
switch (tensor->type) {
|
|
1432
1645
|
case GGML_TYPE_Q4_0:
|
|
@@ -1435,12 +1648,24 @@ static void ggml_backend_hexagon_buffer_get_tensor(ggml_backend_buffer_t buffer,
|
|
|
1435
1648
|
repack_q4x4x2_q4_0(data, tensor, size);
|
|
1436
1649
|
break;
|
|
1437
1650
|
|
|
1651
|
+
case GGML_TYPE_Q4_1:
|
|
1652
|
+
GGML_ASSERT(offset == 0);
|
|
1653
|
+
GGML_ASSERT(offset + size <= ggml_nbytes(tensor));
|
|
1654
|
+
repack_q4x4x2_q4_1(data, tensor, size);
|
|
1655
|
+
break;
|
|
1656
|
+
|
|
1438
1657
|
case GGML_TYPE_Q8_0:
|
|
1439
1658
|
GGML_ASSERT(offset == 0);
|
|
1440
1659
|
GGML_ASSERT(offset + size <= ggml_nbytes(tensor));
|
|
1441
1660
|
repack_q8x4x2_q8_0(data, tensor, size);
|
|
1442
1661
|
break;
|
|
1443
1662
|
|
|
1663
|
+
case GGML_TYPE_IQ4_NL:
|
|
1664
|
+
GGML_ASSERT(offset == 0);
|
|
1665
|
+
GGML_ASSERT(offset + size <= ggml_nbytes(tensor));
|
|
1666
|
+
repack_q4x4x2_q4_0(data, tensor, size);
|
|
1667
|
+
break;
|
|
1668
|
+
|
|
1444
1669
|
case GGML_TYPE_MXFP4:
|
|
1445
1670
|
GGML_ASSERT(offset == 0);
|
|
1446
1671
|
GGML_ASSERT(offset + size <= ggml_nbytes(tensor));
|
|
@@ -1464,10 +1689,10 @@ static bool ggml_backend_hexagon_buffer_cpy_tensor(ggml_backend_buffer_t bu
|
|
|
1464
1689
|
}
|
|
1465
1690
|
|
|
1466
1691
|
static void ggml_backend_hexagon_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
|
|
1467
|
-
auto
|
|
1468
|
-
auto sess =
|
|
1469
|
-
HEX_VERBOSE("ggml-hex: %s clear-buff base %p size %zu\n", sess->
|
|
1470
|
-
memset(
|
|
1692
|
+
auto sbuf = (ggml_hexagon_shared_buffer *) buffer->context;
|
|
1693
|
+
auto sess = sbuf->sess;
|
|
1694
|
+
HEX_VERBOSE("ggml-hex: %s clear-buff base %p size %zu\n", sess->c_name(), (void *) sbuf->base, sbuf->size);
|
|
1695
|
+
memset(sbuf->base, value, sbuf->size);
|
|
1471
1696
|
}
|
|
1472
1697
|
|
|
1473
1698
|
static ggml_backend_buffer_i ggml_backend_hexagon_buffer_interface = {
|
|
@@ -1477,6 +1702,8 @@ static ggml_backend_buffer_i ggml_backend_hexagon_buffer_interface = {
|
|
|
1477
1702
|
/* .memset_tensor = */ NULL,
|
|
1478
1703
|
/* .set_tensor = */ ggml_backend_hexagon_buffer_set_tensor,
|
|
1479
1704
|
/* .get_tensor = */ ggml_backend_hexagon_buffer_get_tensor,
|
|
1705
|
+
/* .set_tensor_2d = */ NULL,
|
|
1706
|
+
/* .get_tensor_2d = */ NULL,
|
|
1480
1707
|
/* .cpy_tensor = */ ggml_backend_hexagon_buffer_cpy_tensor,
|
|
1481
1708
|
/* .clear = */ ggml_backend_hexagon_buffer_clear,
|
|
1482
1709
|
/* .reset = */ NULL,
|
|
@@ -1492,10 +1719,11 @@ static ggml_backend_buffer_t ggml_backend_hexagon_buffer_type_alloc_buffer(
|
|
|
1492
1719
|
ggml_backend_buffer_type_t buffer_type, size_t size) {
|
|
1493
1720
|
auto sess = static_cast<ggml_backend_hexagon_buffer_type_context *>(buffer_type->context)->sess;
|
|
1494
1721
|
try {
|
|
1495
|
-
|
|
1496
|
-
|
|
1722
|
+
size += 4 * 1024; // guard page
|
|
1723
|
+
ggml_hexagon_shared_buffer * sbuf = new ggml_hexagon_shared_buffer(sess, size);
|
|
1724
|
+
return ggml_backend_buffer_init(buffer_type, ggml_backend_hexagon_buffer_interface, sbuf, size);
|
|
1497
1725
|
} catch (const std::exception & exc) {
|
|
1498
|
-
GGML_LOG_ERROR("ggml-hex: %s failed to allocate buffer context: %s\n", sess->
|
|
1726
|
+
GGML_LOG_ERROR("ggml-hex: %s failed to allocate buffer context (host): %s\n", sess->c_name(), exc.what());
|
|
1499
1727
|
return nullptr;
|
|
1500
1728
|
}
|
|
1501
1729
|
}
|
|
@@ -1504,10 +1732,11 @@ static ggml_backend_buffer_t ggml_backend_hexagon_repack_buffer_type_alloc_buffe
|
|
|
1504
1732
|
ggml_backend_buffer_type_t buffer_type, size_t size) {
|
|
1505
1733
|
auto sess = static_cast<ggml_backend_hexagon_buffer_type_context *>(buffer_type->context)->sess;
|
|
1506
1734
|
try {
|
|
1507
|
-
|
|
1508
|
-
|
|
1735
|
+
size += 4 * 1024; // guard page
|
|
1736
|
+
ggml_hexagon_shared_buffer * sbuf = new ggml_hexagon_shared_buffer(sess, size);
|
|
1737
|
+
return ggml_backend_buffer_init(buffer_type, ggml_backend_hexagon_buffer_interface, sbuf, size);
|
|
1509
1738
|
} catch (const std::exception & exc) {
|
|
1510
|
-
GGML_LOG_ERROR("ggml-hex: %s failed to allocate buffer context: %s\n", sess->
|
|
1739
|
+
GGML_LOG_ERROR("ggml-hex: %s failed to allocate buffer context (repack): %s\n", sess->c_name(), exc.what());
|
|
1511
1740
|
return nullptr;
|
|
1512
1741
|
}
|
|
1513
1742
|
}
|
|
@@ -1522,7 +1751,7 @@ static size_t ggml_backend_hexagon_buffer_type_get_alloc_size(ggml_backend_buffe
|
|
|
1522
1751
|
}
|
|
1523
1752
|
|
|
1524
1753
|
static size_t ggml_backend_hexagon_buffer_type_get_max_size(ggml_backend_buffer_type_t buffer_type) {
|
|
1525
|
-
return
|
|
1754
|
+
return opt_mbuf; // typically 1GB per buffer
|
|
1526
1755
|
GGML_UNUSED(buffer_type);
|
|
1527
1756
|
}
|
|
1528
1757
|
|
|
@@ -1554,6 +1783,448 @@ static ggml_backend_buffer_type_i ggml_backend_hexagon_repack_buffer_type_interf
|
|
|
1554
1783
|
/* .is_host = */ ggml_backend_hexagon_repack_buffer_type_is_host,
|
|
1555
1784
|
};
|
|
1556
1785
|
|
|
1786
|
+
struct ggml_hexagon_opbatch {
|
|
1787
|
+
ggml_hexagon_session* sess;
|
|
1788
|
+
|
|
1789
|
+
std::vector<htp_opnode> ops; // htp_opnode of ops
|
|
1790
|
+
|
|
1791
|
+
std::vector<htp_buf_desc> h_bufs; // htp buffer descriptors
|
|
1792
|
+
std::vector<htp_tensor> h_tens; // htp tensor descriptors
|
|
1793
|
+
std::vector<htp_op_desc> h_ops; // htp op descriptors
|
|
1794
|
+
|
|
1795
|
+
std::unordered_map<int, int> b_map; // buffer fd to index
|
|
1796
|
+
std::unordered_map<const ggml_tensor*, int> t_map; // tensor ptr to index
|
|
1797
|
+
std::unordered_multimap<void*, int> d_map; // tensor data to index
|
|
1798
|
+
|
|
1799
|
+
unsigned int n_bufs; // num buffers in the batch
|
|
1800
|
+
unsigned int n_tens; // num tensors ...
|
|
1801
|
+
unsigned int n_ops; // num ops ...
|
|
1802
|
+
size_t b_vmem; // sum of all buffer sizes
|
|
1803
|
+
|
|
1804
|
+
unsigned int n_bufs_max;
|
|
1805
|
+
unsigned int n_tens_max;
|
|
1806
|
+
unsigned int n_ops_max;
|
|
1807
|
+
size_t b_vmem_max;
|
|
1808
|
+
|
|
1809
|
+
void reset() {
|
|
1810
|
+
n_bufs = 0;
|
|
1811
|
+
n_tens = 0;
|
|
1812
|
+
n_ops = 0;
|
|
1813
|
+
b_vmem = 0;
|
|
1814
|
+
|
|
1815
|
+
b_map.clear();
|
|
1816
|
+
t_map.clear();
|
|
1817
|
+
d_map.clear();
|
|
1818
|
+
}
|
|
1819
|
+
|
|
1820
|
+
ggml_hexagon_opbatch(ggml_hexagon_session *sess, size_t batch_size, size_t max_vmem) {
|
|
1821
|
+
this->sess = sess;
|
|
1822
|
+
|
|
1823
|
+
n_bufs_max = HTP_OP_MAX_BUFS;
|
|
1824
|
+
n_ops_max = batch_size;
|
|
1825
|
+
n_tens_max = n_ops_max + n_ops_max * HTP_OP_MAX_INPUTS;
|
|
1826
|
+
|
|
1827
|
+
b_vmem_max = max_vmem;
|
|
1828
|
+
|
|
1829
|
+
ops.resize(n_ops_max);
|
|
1830
|
+
|
|
1831
|
+
h_bufs.resize(n_bufs_max);
|
|
1832
|
+
h_tens.resize(n_tens_max);
|
|
1833
|
+
h_ops.resize(n_ops_max);
|
|
1834
|
+
|
|
1835
|
+
b_map.reserve(n_bufs_max);
|
|
1836
|
+
t_map.reserve(n_tens_max);
|
|
1837
|
+
d_map.reserve(n_tens_max);
|
|
1838
|
+
|
|
1839
|
+
GGML_LOG_INFO("ggml-hex: %s op batching: n-bufs %u n-tensors %u n-ops %u vmem %zu\n",
|
|
1840
|
+
sess->c_name(), n_bufs_max, n_tens_max, n_ops_max, b_vmem_max);
|
|
1841
|
+
|
|
1842
|
+
reset();
|
|
1843
|
+
}
|
|
1844
|
+
|
|
1845
|
+
bool empty() const { return n_ops == 0; }
|
|
1846
|
+
|
|
1847
|
+
// add buffer and return its index
|
|
1848
|
+
int add_buffer(ggml_hexagon_shared_buffer * sbuf) {
|
|
1849
|
+
// Lookup by fd
|
|
1850
|
+
auto it = b_map.find(sbuf->fd);
|
|
1851
|
+
if (it != b_map.end()) { return it->second; }
|
|
1852
|
+
|
|
1853
|
+
// Add new buffer to the batch
|
|
1854
|
+
int bi = n_bufs++;
|
|
1855
|
+
GGML_ASSERT(n_bufs < HTP_OP_MAX_BUFS);
|
|
1856
|
+
|
|
1857
|
+
b_map.insert({sbuf->fd, bi});
|
|
1858
|
+
|
|
1859
|
+
htp_buf_desc &b = h_bufs[bi];
|
|
1860
|
+
b.base = (uint64_t) sbuf->base;
|
|
1861
|
+
b.fd = sbuf->fd;
|
|
1862
|
+
b.size = sbuf->size;
|
|
1863
|
+
|
|
1864
|
+
b_vmem += b.size;
|
|
1865
|
+
|
|
1866
|
+
HEX_VERBOSE("ggml-hex: add-buffer #%u : fd %d base %p size %zu : vmem %zu\n", bi, b.fd, (void*) sbuf->base, (size_t) b.size, b_vmem);
|
|
1867
|
+
|
|
1868
|
+
return bi;
|
|
1869
|
+
}
|
|
1870
|
+
|
|
1871
|
+
bool same_shape(const htp_tensor * h, const ggml_tensor * t) const {
|
|
1872
|
+
return (h->ne[0] == t->ne[0]) && (h->ne[1] == t->ne[1]) && (h->ne[2] == t->ne[2]) && (h->ne[3] == t->ne[3]) &&
|
|
1873
|
+
(h->nb[0] == t->nb[0]) && (h->nb[1] == t->nb[1]) && (h->nb[2] == t->nb[2]) && (h->nb[3] == t->nb[3]);
|
|
1874
|
+
}
|
|
1875
|
+
|
|
1876
|
+
// add tensor and return its index
|
|
1877
|
+
int add_tensor(const ggml_tensor * t) {
|
|
1878
|
+
auto sbuf = static_cast<ggml_hexagon_shared_buffer *>(t->buffer->context);
|
|
1879
|
+
|
|
1880
|
+
// First lookup by tensor data
|
|
1881
|
+
auto range = d_map.equal_range(t->data);
|
|
1882
|
+
for (auto it = range.first; it != range.second; ++it) {
|
|
1883
|
+
htp_tensor * h = &h_tens[it->second];
|
|
1884
|
+
if (same_shape(h, t)) { return it->second; }
|
|
1885
|
+
}
|
|
1886
|
+
|
|
1887
|
+
// Lookup by tensor ptr
|
|
1888
|
+
auto it = t_map.find(t);
|
|
1889
|
+
if (it != t_map.end()) { return it->second; }
|
|
1890
|
+
|
|
1891
|
+
// Add new tensor to the batch
|
|
1892
|
+
int ti = n_tens++;
|
|
1893
|
+
GGML_ASSERT(n_tens <= n_tens_max);
|
|
1894
|
+
|
|
1895
|
+
t_map.insert({t, ti});
|
|
1896
|
+
d_map.insert({t->data, ti});
|
|
1897
|
+
|
|
1898
|
+
uint64_t t_offset = (uint8_t *) t->data - sbuf->base;
|
|
1899
|
+
size_t t_size = ggml_nbytes(t);
|
|
1900
|
+
|
|
1901
|
+
htp_tensor &h = h_tens[ti];
|
|
1902
|
+
h.bi = add_buffer(sbuf);
|
|
1903
|
+
h.data = t_offset;
|
|
1904
|
+
h.size = t_size;
|
|
1905
|
+
h.type = t->type;
|
|
1906
|
+
h.ne[0] = t->ne[0]; h.ne[1] = t->ne[1]; h.ne[2] = t->ne[2]; h.ne[3] = t->ne[3];
|
|
1907
|
+
h.nb[0] = t->nb[0]; h.nb[1] = t->nb[1]; h.nb[2] = t->nb[2]; h.nb[3] = t->nb[3];
|
|
1908
|
+
|
|
1909
|
+
h.flags = 0;
|
|
1910
|
+
if (ggml_backend_buffer_get_usage(t->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE) {
|
|
1911
|
+
h.flags |= HTP_TENSOR_COMPUTE;
|
|
1912
|
+
}
|
|
1913
|
+
|
|
1914
|
+
HEX_VERBOSE("ggml-hex: add-tensor #%u %s : bi %d data %p offset %zu size %zu flags 0x%x : %zu:%zu:%zu:%zu\n",
|
|
1915
|
+
ti, t->name, h.bi, (void*) t->data, (size_t) t_offset, t_size, h.flags,
|
|
1916
|
+
(size_t) t->ne[0], (size_t) t->ne[1], (size_t) t->ne[2], (size_t) t->ne[3]);
|
|
1917
|
+
|
|
1918
|
+
return ti;
|
|
1919
|
+
}
|
|
1920
|
+
|
|
1921
|
+
bool fit_op(const htp_opnode & node) const {
|
|
1922
|
+
if (n_ops >= n_ops_max ) return false;
|
|
1923
|
+
|
|
1924
|
+
// check how much extras we will need
|
|
1925
|
+
size_t extra_bufs = 0;
|
|
1926
|
+
size_t extra_vmem = 0;
|
|
1927
|
+
size_t extra_tens = 0;
|
|
1928
|
+
|
|
1929
|
+
auto fit_tensor = [&](const ggml_tensor *t) {
|
|
1930
|
+
if (!t) return;
|
|
1931
|
+
if (!t_map.count(t)) {
|
|
1932
|
+
extra_tens++;
|
|
1933
|
+
|
|
1934
|
+
auto sbuf = static_cast<ggml_hexagon_shared_buffer *>(t->buffer->context);
|
|
1935
|
+
if (!b_map.count(sbuf->fd)) {
|
|
1936
|
+
extra_vmem += sbuf->size;
|
|
1937
|
+
extra_bufs += 1;
|
|
1938
|
+
}
|
|
1939
|
+
}
|
|
1940
|
+
};
|
|
1941
|
+
|
|
1942
|
+
for (const auto * src : node.get_inputs()) {
|
|
1943
|
+
fit_tensor(src);
|
|
1944
|
+
}
|
|
1945
|
+
fit_tensor(node.dst());
|
|
1946
|
+
|
|
1947
|
+
if ((extra_bufs + n_bufs) > n_bufs_max) return false;
|
|
1948
|
+
if ((extra_tens + n_tens) > n_tens_max) return false;
|
|
1949
|
+
if ((extra_vmem + b_vmem) > b_vmem_max) return false;
|
|
1950
|
+
|
|
1951
|
+
return true;
|
|
1952
|
+
}
|
|
1953
|
+
|
|
1954
|
+
// assumes that fit_op() was called first and returned true
|
|
1955
|
+
void add_op(const htp_opnode & node) {
|
|
1956
|
+
// Add new op
|
|
1957
|
+
|
|
1958
|
+
unsigned int n = n_ops++;
|
|
1959
|
+
GGML_ASSERT(n_ops <= n_ops_max);
|
|
1960
|
+
|
|
1961
|
+
ops[n] = node;
|
|
1962
|
+
|
|
1963
|
+
htp_op_desc &o = h_ops[n];
|
|
1964
|
+
memcpy(&o.params, &node.node->op_params, sizeof(node.node->op_params));
|
|
1965
|
+
o.opcode = node.opcode;
|
|
1966
|
+
o.flags = 0;
|
|
1967
|
+
|
|
1968
|
+
if (!(opt_opstage & HTP_OPSTAGE_COMPUTE)) {
|
|
1969
|
+
o.flags |= HTP_OPFLAGS_SKIP_COMPUTE;
|
|
1970
|
+
}
|
|
1971
|
+
|
|
1972
|
+
ggml_hexagon_dump_op_exec(sess->c_name(), node, o.flags);
|
|
1973
|
+
|
|
1974
|
+
auto inputs = node.get_inputs();
|
|
1975
|
+
for (unsigned int i=0; i < HTP_OP_MAX_INPUTS; i++) {
|
|
1976
|
+
o.src[i] = (i < inputs.size() && inputs[i]) ? add_tensor(inputs[i]) : 0xffff;
|
|
1977
|
+
}
|
|
1978
|
+
o.dst = add_tensor(node.dst());
|
|
1979
|
+
}
|
|
1980
|
+
};
|
|
1981
|
+
|
|
1982
|
+
struct ggml_hexagon_opqueue {
|
|
1983
|
+
// Shared buffer for storing batches
|
|
1984
|
+
ggml_hexagon_shared_buffer *shm_buf;
|
|
1985
|
+
size_t shm_blk_size;
|
|
1986
|
+
|
|
1987
|
+
using opvec = std::vector<htp_opnode>;
|
|
1988
|
+
|
|
1989
|
+
std::queue<unsigned int> done; // completed batch ids
|
|
1990
|
+
std::vector<opvec> op_cache; // per batch op cache
|
|
1991
|
+
std::vector<uint64_t> start_usec; // per batch start time
|
|
1992
|
+
|
|
1993
|
+
ggml_hexagon_opqueue(ggml_hexagon_session *sess, size_t batch_size, size_t depth) {
|
|
1994
|
+
size_t n_bufs = HTP_OP_MAX_BUFS;
|
|
1995
|
+
size_t n_ops = batch_size;
|
|
1996
|
+
size_t n_tensors = n_ops + n_ops * HTP_OP_MAX_INPUTS;
|
|
1997
|
+
|
|
1998
|
+
shm_blk_size = sizeof(htp_buf_desc) * n_bufs +
|
|
1999
|
+
sizeof(htp_tensor) * n_tensors +
|
|
2000
|
+
sizeof(htp_op_desc) * n_ops +
|
|
2001
|
+
sizeof(htp_prof_desc) * n_ops;
|
|
2002
|
+
|
|
2003
|
+
shm_buf = new ggml_hexagon_shared_buffer(sess, shm_blk_size * depth, true /* pinned */);
|
|
2004
|
+
|
|
2005
|
+
op_cache.resize(depth);
|
|
2006
|
+
start_usec.resize(depth, 0);
|
|
2007
|
+
|
|
2008
|
+
// init done queue
|
|
2009
|
+
for (unsigned int i = 0; i < depth; i++) { done.push(i); }
|
|
2010
|
+
|
|
2011
|
+
if (opt_verbose) {
|
|
2012
|
+
GGML_LOG_INFO("ggml-hex: %s allocated op-queue : batch-size %zu depth %zu shm-size %zu shm-block-size %zu\n",
|
|
2013
|
+
sess->c_name(), batch_size, depth, shm_buf->size, shm_blk_size);
|
|
2014
|
+
}
|
|
2015
|
+
}
|
|
2016
|
+
|
|
2017
|
+
~ggml_hexagon_opqueue() {
|
|
2018
|
+
delete shm_buf;
|
|
2019
|
+
}
|
|
2020
|
+
|
|
2021
|
+
// push new batch
|
|
2022
|
+
bool push(htp_opbatch_req& req, dspqueue_buffer& dbuf, ggml_hexagon_opbatch* op_batch) {
|
|
2023
|
+
static_assert(sizeof(htp_opbatch_req) % 8 == 0, "sizeof(htp_opbatch_req) must be multiple of 8");
|
|
2024
|
+
static_assert(sizeof(htp_opbatch_rsp) % 8 == 0, "sizeof(htp_opbatch_rsp) must be multiple of 8");
|
|
2025
|
+
static_assert(sizeof(htp_buf_desc) % 8 == 0, "sizeof(htp_buf_desc) must be multiple of 8");
|
|
2026
|
+
static_assert(sizeof(htp_tensor) % 8 == 0, "sizeof(htp_tensor) must be multiple of 8");
|
|
2027
|
+
static_assert(sizeof(htp_op_desc) % 8 == 0, "sizeof(htp_op_desc) must be multiple of 8");
|
|
2028
|
+
static_assert(sizeof(htp_prof_desc) % 8 == 0, "sizeof(htp_prof_desc) must be multiple of 8");
|
|
2029
|
+
|
|
2030
|
+
if (done.empty()) { return false; }
|
|
2031
|
+
|
|
2032
|
+
req.id = done.front(); done.pop(); // batch id
|
|
2033
|
+
req.n_bufs = op_batch->n_bufs;
|
|
2034
|
+
req.n_tensors = op_batch->n_tens;
|
|
2035
|
+
req.n_ops = op_batch->n_ops;
|
|
2036
|
+
|
|
2037
|
+
op_cache[req.id] = op_batch->ops;
|
|
2038
|
+
start_usec[req.id] = ggml_time_us();
|
|
2039
|
+
|
|
2040
|
+
const size_t b_size = sizeof(htp_buf_desc) * req.n_bufs;
|
|
2041
|
+
const size_t t_size = sizeof(htp_tensor) * req.n_tensors;
|
|
2042
|
+
const size_t o_size = sizeof(htp_op_desc) * req.n_ops;
|
|
2043
|
+
const size_t p_size = sizeof(htp_prof_desc) * req.n_ops;
|
|
2044
|
+
|
|
2045
|
+
dbuf.ptr = shm_buf->base + (req.id * shm_blk_size);
|
|
2046
|
+
dbuf.fd = shm_buf->fd;
|
|
2047
|
+
dbuf.flags = DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT;
|
|
2048
|
+
dbuf.offset = (uint8_t*) dbuf.ptr - (uint8_t*) shm_buf->base;
|
|
2049
|
+
dbuf.size = b_size + t_size + o_size + p_size;
|
|
2050
|
+
|
|
2051
|
+
GGML_ASSERT(dbuf.size <= shm_blk_size);
|
|
2052
|
+
|
|
2053
|
+
uint8_t * m_ptr = (uint8_t*) dbuf.ptr;
|
|
2054
|
+
uint8_t * b_ptr = m_ptr; m_ptr += b_size;
|
|
2055
|
+
uint8_t * t_ptr = m_ptr; m_ptr += t_size;
|
|
2056
|
+
uint8_t * o_ptr = m_ptr;
|
|
2057
|
+
|
|
2058
|
+
memcpy(b_ptr, (void *) op_batch->h_bufs.data(), b_size);
|
|
2059
|
+
memcpy(t_ptr, (void *) op_batch->h_tens.data(), t_size);
|
|
2060
|
+
memcpy(o_ptr, (void *) op_batch->h_ops.data(), o_size);
|
|
2061
|
+
|
|
2062
|
+
HEX_VERBOSE("ggml-hex: %s op-queue push batch #%u : n-bufs %u n-tensors %u n-ops %u vmem %zu : b-size %zu t-size %zu o-size %zu m-size %zu\n",
|
|
2063
|
+
shm_buf->sess->c_name(), req.id, req.n_bufs, req.n_tensors, req.n_ops, op_batch->b_vmem,
|
|
2064
|
+
b_size, t_size, o_size, (size_t) dbuf.size);
|
|
2065
|
+
|
|
2066
|
+
op_batch->reset();
|
|
2067
|
+
|
|
2068
|
+
if (opt_verbose > 1) {
|
|
2069
|
+
htp_buf_desc *b = (htp_buf_desc*) b_ptr;
|
|
2070
|
+
for (unsigned int i=0; i < req.n_bufs; i++) {
|
|
2071
|
+
GGML_LOG_DEBUG("ggml-hex: %s htp-buf #%u : fd %d base %p size %zu\n", shm_buf->sess->c_name(), i,
|
|
2072
|
+
b[i].fd, (void *) b[i].base, (size_t) b[i].size);
|
|
2073
|
+
}
|
|
2074
|
+
htp_tensor *t = (htp_tensor*) t_ptr;
|
|
2075
|
+
for (unsigned int i=0; i < req.n_tensors; i++) {
|
|
2076
|
+
GGML_LOG_DEBUG("ggml-hex: %s htp-tensor #%u : bi %u offset %u size %u : %zu:%zu:%zu:%zu\n",
|
|
2077
|
+
shm_buf->sess->c_name(), i, t[i].bi, t[i].data, t[i].size,
|
|
2078
|
+
(size_t) t[i].ne[0], (size_t) t[i].ne[1], (size_t) t[i].ne[2], (size_t) t[i].ne[3]);
|
|
2079
|
+
}
|
|
2080
|
+
}
|
|
2081
|
+
|
|
2082
|
+
return true;
|
|
2083
|
+
}
|
|
2084
|
+
|
|
2085
|
+
void pop(htp_opbatch_rsp rsp, dspqueue_buffer dbuf) {
|
|
2086
|
+
GGML_ASSERT(rsp.id < op_cache.size());
|
|
2087
|
+
|
|
2088
|
+
done.push(rsp.id);
|
|
2089
|
+
|
|
2090
|
+
const size_t b_size = sizeof(htp_buf_desc) * rsp.n_bufs;
|
|
2091
|
+
const size_t t_size = sizeof(htp_tensor) * rsp.n_tensors;
|
|
2092
|
+
const size_t o_size = sizeof(htp_op_desc) * rsp.n_ops;
|
|
2093
|
+
const size_t p_size = sizeof(htp_prof_desc) * rsp.n_ops;
|
|
2094
|
+
|
|
2095
|
+
const size_t m_size = b_size + t_size + o_size + p_size;
|
|
2096
|
+
GGML_ASSERT(m_size <= shm_blk_size);
|
|
2097
|
+
|
|
2098
|
+
HEX_VERBOSE("ggml-hex: %s op-queue pop batch #%u : n-bufs %u n-tensors %u n-ops %u : m-size %zu b-size %zu t-size %zu o-size %zu\n",
|
|
2099
|
+
shm_buf->sess->c_name(), rsp.id, rsp.n_bufs, rsp.n_tensors, rsp.n_ops,
|
|
2100
|
+
(size_t) dbuf.size, b_size, t_size, o_size);
|
|
2101
|
+
|
|
2102
|
+
uint8_t * m_ptr = (uint8_t*) dbuf.ptr;
|
|
2103
|
+
uint8_t * p_ptr = m_ptr + (b_size + t_size + o_size);
|
|
2104
|
+
|
|
2105
|
+
if (opt_profile && rsp.n_ops > 0) {
|
|
2106
|
+
auto & ops = op_cache[rsp.id];
|
|
2107
|
+
|
|
2108
|
+
uint64_t batch_usec = ggml_time_us() - start_usec[rsp.id];
|
|
2109
|
+
uint32_t htp_usec = 0;
|
|
2110
|
+
|
|
2111
|
+
GGML_ASSERT(rsp.n_ops <= ops.size());
|
|
2112
|
+
|
|
2113
|
+
const htp_prof_desc * pd = (const htp_prof_desc *) p_ptr;
|
|
2114
|
+
for (uint32_t i = 0; i < rsp.n_ops; i++) {
|
|
2115
|
+
htp_usec += pd[i].usecs;
|
|
2116
|
+
ggml_hexagon_dump_op_prof(shm_buf->sess->name, ops[i], pd[i].usecs, pd[i].cycles, pd[i].pmu);
|
|
2117
|
+
}
|
|
2118
|
+
|
|
2119
|
+
GGML_LOG_DEBUG("ggml-hex: %s profile-batch n-ops %u batch-dur-usec %lld htp-ops-usec %u\n",
|
|
2120
|
+
shm_buf->sess->c_name(), rsp.n_ops, (long long) batch_usec, htp_usec);
|
|
2121
|
+
}
|
|
2122
|
+
}
|
|
2123
|
+
};
|
|
2124
|
+
|
|
2125
|
+
// Flush HTP response queue i.e wait for all outstanding requests to complete
|
|
2126
|
+
void ggml_hexagon_session::flush_pending(bool all) {
|
|
2127
|
+
while (this->op_pending) {
|
|
2128
|
+
struct htp_opbatch_rsp rsp;
|
|
2129
|
+
uint32_t rsp_size;
|
|
2130
|
+
uint32_t flags;
|
|
2131
|
+
|
|
2132
|
+
struct dspqueue_buffer dbuf;
|
|
2133
|
+
uint32_t n_dbufs;
|
|
2134
|
+
|
|
2135
|
+
// Read response packet from queue
|
|
2136
|
+
const uint32_t timeo = opt_oppoll ? 0 : DSPQUEUE_TIMEOUT;
|
|
2137
|
+
int err = dspqueue_read(this->queue, &flags, 1, &n_dbufs, &dbuf, sizeof(rsp), &rsp_size, (uint8_t *) &rsp, timeo);
|
|
2138
|
+
if (err == AEE_EEXPIRED) {
|
|
2139
|
+
continue;
|
|
2140
|
+
}
|
|
2141
|
+
|
|
2142
|
+
if (err != 0) {
|
|
2143
|
+
GGML_ABORT("ggml-hex: dspqueue_read failed: 0x%08x\n", (unsigned) err);
|
|
2144
|
+
}
|
|
2145
|
+
|
|
2146
|
+
// Basic sanity checks
|
|
2147
|
+
if (rsp_size != sizeof(rsp) || n_dbufs != 1) {
|
|
2148
|
+
GGML_ABORT("ggml-hex: %s dspcall : bad response : size %u dspbufs %u\n", this->c_name(), rsp_size, n_dbufs);
|
|
2149
|
+
}
|
|
2150
|
+
|
|
2151
|
+
if (rsp.status != HTP_STATUS_OK) {
|
|
2152
|
+
GGML_LOG_ERROR("ggml-hex: %s dspcall : dsp-rsp: %s\n", this->c_name(), status_to_str(rsp.status));
|
|
2153
|
+
// TODO: handle errors
|
|
2154
|
+
}
|
|
2155
|
+
|
|
2156
|
+
op_queue->pop(rsp, dbuf);
|
|
2157
|
+
|
|
2158
|
+
this->op_pending--; // atomic dec
|
|
2159
|
+
|
|
2160
|
+
if (!all) break;
|
|
2161
|
+
}
|
|
2162
|
+
}
|
|
2163
|
+
|
|
2164
|
+
void ggml_hexagon_session::flush_batch() {
|
|
2165
|
+
if (op_batch->empty()) { return; }
|
|
2166
|
+
|
|
2167
|
+
htp_opbatch_req req {};
|
|
2168
|
+
dspqueue_buffer dbuf{};
|
|
2169
|
+
|
|
2170
|
+
if (!op_queue->push(req, dbuf, op_batch)) {
|
|
2171
|
+
flush_pending(false);
|
|
2172
|
+
op_queue->push(req, dbuf, op_batch);
|
|
2173
|
+
}
|
|
2174
|
+
|
|
2175
|
+
// Bump pending flag (cleared in the session::flush once we get the response)
|
|
2176
|
+
this->op_pending++; // atomic inc
|
|
2177
|
+
|
|
2178
|
+
HEX_VERBOSE("ggml-hex: %s queue-opbatch: %p size %u\n", this->c_name(), dbuf.ptr, dbuf.size);
|
|
2179
|
+
|
|
2180
|
+
int err = dspqueue_write(this->queue, 0, 1, &dbuf, sizeof(req), (const uint8_t*) &req, DSPQUEUE_TIMEOUT);
|
|
2181
|
+
if (err != 0) {
|
|
2182
|
+
GGML_ABORT("ggml-hex: %s dspqueue_write failed: 0x%08x\n", this->c_name(), (unsigned) err);
|
|
2183
|
+
}
|
|
2184
|
+
}
|
|
2185
|
+
|
|
2186
|
+
void ggml_hexagon_session::enqueue_op(const htp_opnode & node) {
|
|
2187
|
+
if (!op_batch->fit_op(node)) {
|
|
2188
|
+
flush_batch();
|
|
2189
|
+
}
|
|
2190
|
+
op_batch->add_op(node);
|
|
2191
|
+
}
|
|
2192
|
+
|
|
2193
|
+
// Flush HTP response queue i.e wait for all outstanding requests to complete
|
|
2194
|
+
void ggml_hexagon_session::flush(bool all) {
|
|
2195
|
+
flush_batch();
|
|
2196
|
+
flush_pending(all);
|
|
2197
|
+
}
|
|
2198
|
+
|
|
2199
|
+
static size_t ggml_hexagon_measure_max_vmem(ggml_hexagon_session *sess) {
|
|
2200
|
+
// Allocate a bunch pinned buffers till failure.
|
|
2201
|
+
// This is kind of expensive but handy for figuring out exactly how much we can mmap on a specific device.
|
|
2202
|
+
// Typically we're going to allocate all/most of these buffers anyway for the model weights.
|
|
2203
|
+
|
|
2204
|
+
std::vector<ggml_hexagon_shared_buffer *> sbufs;
|
|
2205
|
+
|
|
2206
|
+
const size_t MiB = 1024 * 1024;
|
|
2207
|
+
const size_t GiB = MiB * 1024;
|
|
2208
|
+
|
|
2209
|
+
size_t vmem = 0;
|
|
2210
|
+
size_t step = 256u * MiB;
|
|
2211
|
+
|
|
2212
|
+
try {
|
|
2213
|
+
sbufs.push_back(new ggml_hexagon_shared_buffer(sess, GiB, true)); vmem += GiB;
|
|
2214
|
+
sbufs.push_back(new ggml_hexagon_shared_buffer(sess, GiB, true)); vmem += GiB;
|
|
2215
|
+
sbufs.push_back(new ggml_hexagon_shared_buffer(sess, GiB, true)); vmem += GiB;
|
|
2216
|
+
|
|
2217
|
+
while (1) {
|
|
2218
|
+
sbufs.push_back(new ggml_hexagon_shared_buffer(sess, step, true));
|
|
2219
|
+
vmem += step;
|
|
2220
|
+
}
|
|
2221
|
+
} catch (...) { }
|
|
2222
|
+
|
|
2223
|
+
for (auto b : sbufs) { delete b; }
|
|
2224
|
+
|
|
2225
|
+
return vmem - step; // backoff to account for overhead from internal mappings
|
|
2226
|
+
}
|
|
2227
|
+
|
|
1557
2228
|
void ggml_hexagon_session::allocate(int dev_id) noexcept(false) {
|
|
1558
2229
|
this->valid_session = false;
|
|
1559
2230
|
this->valid_handle = false;
|
|
@@ -1566,11 +2237,8 @@ void ggml_hexagon_session::allocate(int dev_id) noexcept(false) {
|
|
|
1566
2237
|
this->name = std::string("HTP") + std::to_string(dev_id);
|
|
1567
2238
|
|
|
1568
2239
|
this->op_pending = 0;
|
|
1569
|
-
this->prof_usecs = 0;
|
|
1570
|
-
this->prof_cycles = 0;
|
|
1571
|
-
this->prof_pkts = 0;
|
|
1572
2240
|
|
|
1573
|
-
|
|
2241
|
+
GGML_LOG_DEBUG("ggml-hex: %s allocating new session\n", this->name.c_str());
|
|
1574
2242
|
|
|
1575
2243
|
domain * my_domain = get_domain(this->domain_id);
|
|
1576
2244
|
if (my_domain == NULL) {
|
|
@@ -1646,9 +2314,6 @@ void ggml_hexagon_session::allocate(int dev_id) noexcept(false) {
|
|
|
1646
2314
|
|
|
1647
2315
|
this->valid_handle = true;
|
|
1648
2316
|
|
|
1649
|
-
GGML_LOG_INFO("ggml-hex: new session: %s : session-id %d domain-id %d uri %s handle 0x%lx\n", this->name.c_str(),
|
|
1650
|
-
this->session_id, this->domain_id, session_uri, (unsigned long) this->handle);
|
|
1651
|
-
|
|
1652
2317
|
// Enable FastRPC QoS mode
|
|
1653
2318
|
{
|
|
1654
2319
|
struct remote_rpc_control_latency l;
|
|
@@ -1660,11 +2325,17 @@ void ggml_hexagon_session::allocate(int dev_id) noexcept(false) {
|
|
|
1660
2325
|
}
|
|
1661
2326
|
}
|
|
1662
2327
|
|
|
2328
|
+
GGML_LOG_INFO("ggml-hex: %s new session : session-id %d domain-id %d uri %s handle 0x%lx\n", this->c_name(),
|
|
2329
|
+
this->session_id, this->domain_id, session_uri, (unsigned long) this->handle);
|
|
2330
|
+
|
|
2331
|
+
const size_t req_q_size = (sizeof(htp_opbatch_req) * opt_opqueue * 2) + 1024;
|
|
2332
|
+
const size_t rsp_q_size = (sizeof(htp_opbatch_rsp) * opt_opqueue * 2) + 1024;
|
|
2333
|
+
|
|
1663
2334
|
// Now let's setup the DSP queue
|
|
1664
2335
|
err = dspqueue_create(this->domain_id,
|
|
1665
2336
|
0, // Flags
|
|
1666
|
-
|
|
1667
|
-
|
|
2337
|
+
req_q_size, // Request queue size (in bytes)
|
|
2338
|
+
rsp_q_size, // Response queue size (in bytes)
|
|
1668
2339
|
nullptr, // Read packet callback (we handle reads explicitly)
|
|
1669
2340
|
nullptr, // Error callback (we handle errors during reads)
|
|
1670
2341
|
(void *) this, // Callback context
|
|
@@ -1684,18 +2355,36 @@ void ggml_hexagon_session::allocate(int dev_id) noexcept(false) {
|
|
|
1684
2355
|
}
|
|
1685
2356
|
|
|
1686
2357
|
if (opt_etm) {
|
|
1687
|
-
err =
|
|
2358
|
+
err = htp_iface_etm(this->handle, 1);
|
|
1688
2359
|
if (err != 0) {
|
|
1689
2360
|
GGML_LOG_ERROR("ggml-hex: failed to enable ETM tracing: 0x%08x\n", (unsigned) err);
|
|
1690
2361
|
}
|
|
1691
2362
|
}
|
|
1692
2363
|
|
|
1693
|
-
|
|
1694
|
-
|
|
1695
|
-
|
|
1696
|
-
|
|
2364
|
+
if (opt_profile) {
|
|
2365
|
+
htp_iface_pmu_conf pmu_conf{};
|
|
2366
|
+
std::copy(opt_pmu_evt.begin(), opt_pmu_evt.end(), pmu_conf.events);
|
|
2367
|
+
|
|
2368
|
+
err = htp_iface_profiler(this->handle, opt_profile, &pmu_conf);
|
|
2369
|
+
if (err != 0) {
|
|
2370
|
+
GGML_LOG_ERROR("ggml-hex: failed to enable profiling: 0x%08x\n", (unsigned) err);
|
|
2371
|
+
}
|
|
2372
|
+
}
|
|
2373
|
+
|
|
2374
|
+
// Allocate buffers and state for op batching
|
|
2375
|
+
this->op_queue = new ggml_hexagon_opqueue(this, opt_opbatch, opt_opqueue);
|
|
2376
|
+
|
|
2377
|
+
if (!opt_vmem) {
|
|
2378
|
+
opt_vmem = ggml_hexagon_measure_max_vmem(this);
|
|
2379
|
+
GGML_LOG_INFO("ggml-hex: %s measured max vmem %zu\n", this->c_name(), opt_vmem);
|
|
2380
|
+
}
|
|
2381
|
+
|
|
2382
|
+
this->op_batch = new ggml_hexagon_opbatch(this, opt_opbatch, opt_vmem);
|
|
2383
|
+
|
|
2384
|
+
// Start dspqueue/opbatch processing
|
|
2385
|
+
err = htp_iface_start(this->handle, dev_id, this->queue_id, opt_nhvx, opt_use_hmx, opt_vmem);
|
|
1697
2386
|
if (err != 0) {
|
|
1698
|
-
GGML_LOG_ERROR("ggml-hex: failed to start session: 0x%08x\n", (unsigned) err);
|
|
2387
|
+
GGML_LOG_ERROR("ggml-hex: %s failed to start session: 0x%08x\n", this->c_name(), (unsigned) err);
|
|
1699
2388
|
throw std::runtime_error("ggml-hex: iface start failed (see log for details)");
|
|
1700
2389
|
}
|
|
1701
2390
|
this->valid_iface = true;
|
|
@@ -1706,21 +2395,32 @@ void ggml_hexagon_session::release() noexcept(true) {
|
|
|
1706
2395
|
|
|
1707
2396
|
int err;
|
|
1708
2397
|
|
|
1709
|
-
// Stop the DSP-side service and close the queue
|
|
1710
2398
|
if (this->valid_iface) {
|
|
2399
|
+
// Stop dspqueue/opbatch processing
|
|
1711
2400
|
err = htp_iface_stop(this->handle);
|
|
1712
2401
|
if (err != 0) {
|
|
1713
2402
|
GGML_ABORT("ggml-hex: htp_iface_stop failed: 0x%08x\n", (unsigned) err);
|
|
1714
2403
|
}
|
|
1715
2404
|
}
|
|
1716
2405
|
|
|
2406
|
+
delete this->op_batch;
|
|
2407
|
+
delete this->op_queue;
|
|
2408
|
+
|
|
1717
2409
|
if (opt_etm) {
|
|
1718
|
-
err =
|
|
2410
|
+
err = htp_iface_etm(this->handle, 0);
|
|
1719
2411
|
if (err != 0) {
|
|
1720
2412
|
GGML_LOG_ERROR("ggml-hex: warn : failed to disable ETM tracing: 0x%08x\n", (unsigned) err);
|
|
1721
2413
|
}
|
|
1722
2414
|
}
|
|
1723
2415
|
|
|
2416
|
+
if (opt_profile) {
|
|
2417
|
+
htp_iface_pmu_conf pmu_conf{};
|
|
2418
|
+
err = htp_iface_profiler(this->handle, 0, &pmu_conf);
|
|
2419
|
+
if (err != 0) {
|
|
2420
|
+
GGML_LOG_ERROR("ggml-hex: warn : failed to disable profiling: 0x%08x\n", (unsigned) err);
|
|
2421
|
+
}
|
|
2422
|
+
}
|
|
2423
|
+
|
|
1724
2424
|
if (this->valid_queue) {
|
|
1725
2425
|
err = dspqueue_close(queue);
|
|
1726
2426
|
if (err != 0) {
|
|
@@ -1737,6 +2437,9 @@ ggml_hexagon_session::ggml_hexagon_session(int dev_id, ggml_backend_dev_t dev) n
|
|
|
1737
2437
|
buffer_type.device = dev;
|
|
1738
2438
|
repack_buffer_type.device = dev;
|
|
1739
2439
|
|
|
2440
|
+
op_batch = nullptr;
|
|
2441
|
+
op_queue = nullptr;
|
|
2442
|
+
|
|
1740
2443
|
try {
|
|
1741
2444
|
allocate(dev_id);
|
|
1742
2445
|
|
|
@@ -1799,9 +2502,66 @@ static bool ggml_hexagon_supported_flash_attn_ext(const struct ggml_hexagon_sess
|
|
|
1799
2502
|
return false;
|
|
1800
2503
|
}
|
|
1801
2504
|
|
|
1802
|
-
|
|
2505
|
+
if (dst->ne[3] != 1) {
|
|
2506
|
+
return false;
|
|
2507
|
+
}
|
|
2508
|
+
|
|
2509
|
+
return true;
|
|
1803
2510
|
}
|
|
1804
2511
|
|
|
2512
|
+
static bool ggml_hexagon_supported_gated_delta_net(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
|
|
2513
|
+
const struct ggml_tensor * q = op->src[0];
|
|
2514
|
+
const struct ggml_tensor * k = op->src[1];
|
|
2515
|
+
const struct ggml_tensor * v = op->src[2];
|
|
2516
|
+
const struct ggml_tensor * g = op->src[3];
|
|
2517
|
+
const struct ggml_tensor * beta = op->src[4];
|
|
2518
|
+
const struct ggml_tensor * state = op->src[5];
|
|
2519
|
+
const struct ggml_tensor * dst = op;
|
|
2520
|
+
|
|
2521
|
+
if (!q || !k || !v || !g || !beta || !state) {
|
|
2522
|
+
return false;
|
|
2523
|
+
}
|
|
2524
|
+
|
|
2525
|
+
if (q->type != GGML_TYPE_F32 || k->type != GGML_TYPE_F32 || v->type != GGML_TYPE_F32 ||
|
|
2526
|
+
g->type != GGML_TYPE_F32 || beta->type != GGML_TYPE_F32 || state->type != GGML_TYPE_F32 ||
|
|
2527
|
+
dst->type != GGML_TYPE_F32) {
|
|
2528
|
+
return false;
|
|
2529
|
+
}
|
|
2530
|
+
|
|
2531
|
+
if (!ggml_is_contiguous_rows(q) || !ggml_is_contiguous_rows(k) || !ggml_is_contiguous_rows(v) ||
|
|
2532
|
+
!ggml_is_contiguous(g) || !ggml_is_contiguous(beta) || !ggml_is_contiguous(state) ||
|
|
2533
|
+
!ggml_is_contiguous(dst)) {
|
|
2534
|
+
return false;
|
|
2535
|
+
}
|
|
2536
|
+
|
|
2537
|
+
const int64_t S_v = v->ne[0];
|
|
2538
|
+
const int64_t H = v->ne[1];
|
|
2539
|
+
const int64_t n_tokens = v->ne[2];
|
|
2540
|
+
const int64_t n_seqs = v->ne[3];
|
|
2541
|
+
const int64_t K = ggml_get_op_params_i32(op, 0);
|
|
2542
|
+
|
|
2543
|
+
if (S_v <= 0 || S_v > 128 || H <= 0 || n_tokens <= 0 || n_seqs <= 0) {
|
|
2544
|
+
return false;
|
|
2545
|
+
}
|
|
2546
|
+
if (q->ne[0] != S_v || k->ne[0] != S_v || q->ne[1] <= 0 || k->ne[1] <= 0 ||
|
|
2547
|
+
q->ne[2] != n_tokens || k->ne[2] != n_tokens || q->ne[3] <= 0 || k->ne[3] <= 0 ||
|
|
2548
|
+
(n_seqs % q->ne[3]) != 0 || (n_seqs % k->ne[3]) != 0) {
|
|
2549
|
+
return false;
|
|
2550
|
+
}
|
|
2551
|
+
if ((g->ne[0] != 1 && g->ne[0] != S_v) || beta->ne[0] != 1) {
|
|
2552
|
+
return false;
|
|
2553
|
+
}
|
|
2554
|
+
// state holds s0 only [S_v, S_v, H, n_seqs]; K is op param 0.
|
|
2555
|
+
if (ggml_nelements(state) != S_v * S_v * H * n_seqs) {
|
|
2556
|
+
return false;
|
|
2557
|
+
}
|
|
2558
|
+
if (dst->ne[0] != S_v * H || dst->ne[1] != n_tokens * n_seqs + S_v * n_seqs * K) {
|
|
2559
|
+
return false;
|
|
2560
|
+
}
|
|
2561
|
+
|
|
2562
|
+
GGML_UNUSED(sess);
|
|
2563
|
+
return true;
|
|
2564
|
+
}
|
|
1805
2565
|
|
|
1806
2566
|
static bool ggml_hexagon_supported_mul_mat(const struct ggml_hexagon_session * sess, const struct ggml_tensor * dst) {
|
|
1807
2567
|
const struct ggml_tensor * src0 = dst->src[0];
|
|
@@ -1817,7 +2577,9 @@ static bool ggml_hexagon_supported_mul_mat(const struct ggml_hexagon_session * s
|
|
|
1817
2577
|
|
|
1818
2578
|
switch (src0->type) {
|
|
1819
2579
|
case GGML_TYPE_Q4_0:
|
|
2580
|
+
case GGML_TYPE_Q4_1:
|
|
1820
2581
|
case GGML_TYPE_Q8_0:
|
|
2582
|
+
case GGML_TYPE_IQ4_NL:
|
|
1821
2583
|
case GGML_TYPE_MXFP4:
|
|
1822
2584
|
if (src0->ne[0] % 32) {
|
|
1823
2585
|
return false;
|
|
@@ -1842,6 +2604,27 @@ static bool ggml_hexagon_supported_mul_mat(const struct ggml_hexagon_session * s
|
|
|
1842
2604
|
GGML_LOG_DEBUG("ggml_hexagon_supported_mul_mat: permuted F16 src0 not supported\n");
|
|
1843
2605
|
return false;
|
|
1844
2606
|
}
|
|
2607
|
+
if (src1->ne[2] < src0->ne[2] || src1->ne[3] < src0->ne[3]) {
|
|
2608
|
+
GGML_LOG_DEBUG("ggml_hexagon_supported_mul_mat: src1 broadcasting not supported\n");
|
|
2609
|
+
return false;
|
|
2610
|
+
}
|
|
2611
|
+
if (ggml_nrows(src1) > 1024) {
|
|
2612
|
+
return false; // no huge batches (for now)
|
|
2613
|
+
}
|
|
2614
|
+
break;
|
|
2615
|
+
|
|
2616
|
+
case GGML_TYPE_F32:
|
|
2617
|
+
if (src1->type != GGML_TYPE_F32) {
|
|
2618
|
+
return false;
|
|
2619
|
+
}
|
|
2620
|
+
if (src0->nb[1] < src0->nb[0]) {
|
|
2621
|
+
GGML_LOG_DEBUG("ggml_hexagon_supported_mul_mat: permuted F32 src0 not supported\n");
|
|
2622
|
+
return false;
|
|
2623
|
+
}
|
|
2624
|
+
if (src1->ne[2] < src0->ne[2] || src1->ne[3] < src0->ne[3]) {
|
|
2625
|
+
GGML_LOG_DEBUG("ggml_hexagon_supported_mul_mat: src1 broadcasting not supported\n");
|
|
2626
|
+
return false;
|
|
2627
|
+
}
|
|
1845
2628
|
if (ggml_nrows(src1) > 1024) {
|
|
1846
2629
|
return false; // no huge batches (for now)
|
|
1847
2630
|
}
|
|
@@ -1866,7 +2649,9 @@ static bool ggml_hexagon_supported_mul_mat_id(const struct ggml_hexagon_session
|
|
|
1866
2649
|
|
|
1867
2650
|
switch (src0->type) {
|
|
1868
2651
|
case GGML_TYPE_Q4_0:
|
|
2652
|
+
case GGML_TYPE_Q4_1:
|
|
1869
2653
|
case GGML_TYPE_Q8_0:
|
|
2654
|
+
case GGML_TYPE_IQ4_NL:
|
|
1870
2655
|
case GGML_TYPE_MXFP4:
|
|
1871
2656
|
if ((src0->ne[0] % 32)) {
|
|
1872
2657
|
return false;
|
|
@@ -1960,8 +2745,8 @@ static bool ggml_hexagon_supported_unary(const struct ggml_hexagon_session * ses
|
|
|
1960
2745
|
return false;
|
|
1961
2746
|
}
|
|
1962
2747
|
|
|
1963
|
-
//
|
|
1964
|
-
if (!ggml_is_contiguous(
|
|
2748
|
+
// dst must be contiguous; src0 may be non-contiguous
|
|
2749
|
+
if (!ggml_is_contiguous(dst)) {
|
|
1965
2750
|
return false;
|
|
1966
2751
|
}
|
|
1967
2752
|
|
|
@@ -2064,8 +2849,25 @@ static bool ggml_hexagon_supported_softmax(const struct ggml_hexagon_session * s
|
|
|
2064
2849
|
}
|
|
2065
2850
|
}
|
|
2066
2851
|
|
|
2067
|
-
|
|
2068
|
-
|
|
2852
|
+
// Reject non-HVX-aligned sizes when ne[0] > HVX_F32_LANES
|
|
2853
|
+
// The HVX softmax implementation has issues with tail handling for larger non-aligned sizes
|
|
2854
|
+
// Small sizes (ne[0] <= 32) work correctly with tail-only processing
|
|
2855
|
+
const int64_t ne0 = src0->ne[0];
|
|
2856
|
+
if (ne0 > 32 && (ne0 & (32 - 1)) != 0) {
|
|
2857
|
+
return false;
|
|
2858
|
+
}
|
|
2859
|
+
|
|
2860
|
+
// HVX vector size constraints for softmax
|
|
2861
|
+
#define SOFTMAX_MAX_ROW_SIZE 131072 // 128K elements max for numerical precision
|
|
2862
|
+
|
|
2863
|
+
// Reject very large row sizes to avoid numerical precision issues
|
|
2864
|
+
// Softmax accumulation over many elements can lead to precision loss
|
|
2865
|
+
if (ne0 > SOFTMAX_MAX_ROW_SIZE) {
|
|
2866
|
+
return false;
|
|
2867
|
+
}
|
|
2868
|
+
|
|
2869
|
+
return true;
|
|
2870
|
+
}
|
|
2069
2871
|
|
|
2070
2872
|
static bool ggml_hexagon_supported_set_rows(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
|
|
2071
2873
|
const struct ggml_tensor * src0 = op->src[0]; // values
|
|
@@ -2132,7 +2934,7 @@ static bool ggml_hexagon_supported_rope(const struct ggml_hexagon_session * sess
|
|
|
2132
2934
|
|
|
2133
2935
|
int mode = op_params[2];
|
|
2134
2936
|
|
|
2135
|
-
if (
|
|
2937
|
+
if (mode == GGML_ROPE_TYPE_VISION) {
|
|
2136
2938
|
return false;
|
|
2137
2939
|
}
|
|
2138
2940
|
if (mode & 1) {
|
|
@@ -2206,486 +3008,238 @@ static bool ggml_hexagon_supported_ssm_conv(const struct ggml_hexagon_session *
|
|
|
2206
3008
|
if (dst->ne[0] != d_inner || dst->ne[1] != n_t || dst->ne[2] != n_s) {
|
|
2207
3009
|
return false;
|
|
2208
3010
|
}
|
|
2209
|
-
|
|
2210
|
-
|
|
2211
|
-
|
|
3011
|
+
if (src0->nb[0] != sizeof(float) || src1->nb[0] != sizeof(float) || dst->nb[0] != sizeof(float)) {
|
|
3012
|
+
return false;
|
|
3013
|
+
}
|
|
3014
|
+
if (src0->nb[1] != src0->ne[0] * sizeof(float) || src1->nb[1] != src1->ne[0] * sizeof(float)) {
|
|
2212
3015
|
return false;
|
|
2213
3016
|
}
|
|
2214
3017
|
|
|
2215
3018
|
return true;
|
|
2216
3019
|
}
|
|
2217
3020
|
|
|
2218
|
-
|
|
2219
|
-
|
|
2220
|
-
|
|
2221
|
-
DSPQBUF_TYPE_CONSTANT,
|
|
2222
|
-
};
|
|
2223
|
-
|
|
2224
|
-
static void dspqbuf_dump(dspqueue_buffer * d, const struct ggml_tensor * t, dspqbuf_type type) {
|
|
2225
|
-
if (opt_verbose < 2) return;
|
|
2226
|
-
|
|
2227
|
-
auto buf = static_cast<ggml_backend_hexagon_buffer_context *>(t->buffer->context);
|
|
2228
|
-
auto sess = buf->sess;
|
|
2229
|
-
|
|
2230
|
-
GGML_LOG_DEBUG("ggml-hex: %s dspqbuf : %s base-addr %p base-size %zu data %p offset %u size %u\n", sess->name.c_str(),
|
|
2231
|
-
t->name, (void *) buf->base, buf->size, (void *) d->ptr, (unsigned int) d->offset,
|
|
2232
|
-
(unsigned int) d->size);
|
|
2233
|
-
}
|
|
2234
|
-
|
|
2235
|
-
// Init hexagon tensor from GGML tensor and Hexagon buffer
|
|
2236
|
-
static void htp_req_tensor_init(htp_tensor * h, const ggml_tensor * t) {
|
|
2237
|
-
h->data = 0; // updated by the receiver
|
|
2238
|
-
h->type = t->type;
|
|
2239
|
-
h->ne[0] = t->ne[0];
|
|
2240
|
-
h->ne[1] = t->ne[1];
|
|
2241
|
-
h->ne[2] = t->ne[2];
|
|
2242
|
-
h->ne[3] = t->ne[3];
|
|
2243
|
-
h->nb[0] = t->nb[0];
|
|
2244
|
-
h->nb[1] = t->nb[1];
|
|
2245
|
-
h->nb[2] = t->nb[2];
|
|
2246
|
-
h->nb[3] = t->nb[3];
|
|
2247
|
-
}
|
|
3021
|
+
static bool ggml_hexagon_supported_pad(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
|
|
3022
|
+
const struct ggml_tensor * src0 = op->src[0];
|
|
3023
|
+
const struct ggml_tensor * dst = op;
|
|
2248
3024
|
|
|
2249
|
-
|
|
2250
|
-
|
|
2251
|
-
return 0;
|
|
3025
|
+
if (src0->type != GGML_TYPE_F32 || dst->type != GGML_TYPE_F32) {
|
|
3026
|
+
return false;
|
|
2252
3027
|
}
|
|
2253
3028
|
|
|
2254
|
-
|
|
3029
|
+
GGML_UNUSED(sess);
|
|
3030
|
+
return true;
|
|
3031
|
+
}
|
|
2255
3032
|
|
|
2256
|
-
|
|
2257
|
-
|
|
2258
|
-
|
|
2259
|
-
d->offset = (uint8_t *) t->data - buf->base;
|
|
2260
|
-
d->size = ggml_nbytes(t);
|
|
3033
|
+
static bool ggml_hexagon_supported_cumsum(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
|
|
3034
|
+
const struct ggml_tensor * src0 = op->src[0];
|
|
3035
|
+
const struct ggml_tensor * dst = op;
|
|
2261
3036
|
|
|
2262
|
-
if (
|
|
2263
|
-
|
|
2264
|
-
d->size = 64;
|
|
3037
|
+
if (src0->type != GGML_TYPE_F32 || dst->type != GGML_TYPE_F32) {
|
|
3038
|
+
return false;
|
|
2265
3039
|
}
|
|
2266
3040
|
|
|
2267
|
-
|
|
2268
|
-
|
|
2269
|
-
// Flush CPU
|
|
2270
|
-
d->flags = DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER;
|
|
2271
|
-
break;
|
|
2272
|
-
case DSPQBUF_TYPE_CPU_WRITE_DSP_READ:
|
|
2273
|
-
// Flush CPU, Invalidate DSP
|
|
2274
|
-
d->flags = DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT;
|
|
2275
|
-
break;
|
|
2276
|
-
default:
|
|
2277
|
-
// Constant buffer, no cache maintenance
|
|
2278
|
-
d->flags = 0;
|
|
2279
|
-
break;
|
|
3041
|
+
if (!ggml_is_contiguous(src0) || !ggml_is_contiguous(dst)) {
|
|
3042
|
+
return false;
|
|
2280
3043
|
}
|
|
2281
3044
|
|
|
2282
|
-
|
|
2283
|
-
|
|
2284
|
-
dspqbuf_dump(d, t, type);
|
|
2285
|
-
|
|
2286
|
-
return 1;
|
|
3045
|
+
GGML_UNUSED(sess);
|
|
3046
|
+
return true;
|
|
2287
3047
|
}
|
|
2288
3048
|
|
|
2289
|
-
|
|
2290
|
-
|
|
2291
|
-
|
|
2292
|
-
static inline void ggml_hexagon_dispatch_op(ggml_hexagon_session *sess, const struct ggml_tensor * op, uint32_t flags) {
|
|
2293
|
-
uint64_t t = ggml_time_us();
|
|
2294
|
-
|
|
2295
|
-
// Construct HTP request
|
|
2296
|
-
htp_general_req req;
|
|
2297
|
-
memset(&req, 0, sizeof(req));
|
|
3049
|
+
static bool ggml_hexagon_supported_diag(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
|
|
3050
|
+
const struct ggml_tensor * src0 = op->src[0];
|
|
3051
|
+
const struct ggml_tensor * dst = op;
|
|
2298
3052
|
|
|
2299
|
-
|
|
2300
|
-
if (
|
|
2301
|
-
|
|
2302
|
-
}
|
|
2303
|
-
if (!(opt_opmask & HTP_OPMASK_COMPUTE)) {
|
|
2304
|
-
req.flags |= HTP_OPFLAGS_SKIP_COMPUTE;
|
|
3053
|
+
// diag only supports F32 currently
|
|
3054
|
+
if (src0->type != GGML_TYPE_F32 || dst->type != GGML_TYPE_F32) {
|
|
3055
|
+
return false;
|
|
2305
3056
|
}
|
|
2306
3057
|
|
|
2307
|
-
|
|
2308
|
-
|
|
2309
|
-
|
|
2310
|
-
dspqueue_buffer bufs[HTP_MAX_PACKET_BUFFERS];
|
|
2311
|
-
size_t n_bufs = _init_req_func(&req, bufs, op);
|
|
2312
|
-
sess->enqueue(req, bufs, n_bufs, opt_opsync);
|
|
3058
|
+
// Input must have ne[1] == 1 (vector input)
|
|
3059
|
+
if (src0->ne[1] != 1) {
|
|
3060
|
+
return false;
|
|
2313
3061
|
}
|
|
2314
3062
|
|
|
2315
|
-
|
|
3063
|
+
// Output must be square in first two dimensions
|
|
3064
|
+
if (dst->ne[0] != dst->ne[1] || dst->ne[0] != src0->ne[0]) {
|
|
3065
|
+
return false;
|
|
3066
|
+
}
|
|
2316
3067
|
|
|
2317
|
-
|
|
3068
|
+
GGML_UNUSED(sess);
|
|
3069
|
+
return true;
|
|
2318
3070
|
}
|
|
2319
3071
|
|
|
2320
|
-
|
|
2321
|
-
|
|
2322
|
-
|
|
2323
|
-
|
|
2324
|
-
|
|
2325
|
-
|
|
2326
|
-
|
|
2327
|
-
req->op = HTP_OP_MUL;
|
|
2328
|
-
break;
|
|
2329
|
-
case GGML_OP_ADD:
|
|
2330
|
-
req->op = HTP_OP_ADD;
|
|
2331
|
-
break;
|
|
2332
|
-
case GGML_OP_SUB:
|
|
2333
|
-
req->op = HTP_OP_SUB;
|
|
2334
|
-
break;
|
|
2335
|
-
case GGML_OP_DIV:
|
|
2336
|
-
req->op = HTP_OP_DIV;
|
|
2337
|
-
break;
|
|
2338
|
-
default:
|
|
2339
|
-
GGML_ABORT("ggml-hex: binary : unsupported op: %d\n", t->op);
|
|
2340
|
-
break;
|
|
3072
|
+
static bool ggml_hexagon_supported_solve_tri(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
|
|
3073
|
+
const struct ggml_tensor * src0 = op->src[0]; // A
|
|
3074
|
+
const struct ggml_tensor * src1 = op->src[1]; // B
|
|
3075
|
+
const struct ggml_tensor * dst = op; // X
|
|
3076
|
+
|
|
3077
|
+
if (!src0 || !src1) {
|
|
3078
|
+
return false;
|
|
2341
3079
|
}
|
|
2342
3080
|
|
|
2343
|
-
|
|
2344
|
-
|
|
2345
|
-
|
|
3081
|
+
if (src0->type != GGML_TYPE_F32 || src1->type != GGML_TYPE_F32 || dst->type != GGML_TYPE_F32) {
|
|
3082
|
+
return false;
|
|
3083
|
+
}
|
|
2346
3084
|
|
|
2347
|
-
|
|
2348
|
-
|
|
2349
|
-
|
|
2350
|
-
n_bufs += htp_req_buff_init(&req->dst, &bufs[n_bufs], t, DSPQBUF_TYPE_DSP_WRITE_CPU_READ);
|
|
3085
|
+
if (src0->ne[0] != src0->ne[1]) {
|
|
3086
|
+
return false;
|
|
3087
|
+
}
|
|
2351
3088
|
|
|
2352
|
-
|
|
2353
|
-
|
|
3089
|
+
if (src0->ne[1] != src1->ne[1]) {
|
|
3090
|
+
return false;
|
|
3091
|
+
}
|
|
2354
3092
|
|
|
2355
|
-
|
|
2356
|
-
|
|
3093
|
+
if (src0->ne[2] != src1->ne[2] || src0->ne[3] != src1->ne[3]) {
|
|
3094
|
+
return false;
|
|
3095
|
+
}
|
|
2357
3096
|
|
|
2358
|
-
|
|
2359
|
-
|
|
2360
|
-
|
|
3097
|
+
if (dst->ne[0] != src1->ne[0] || dst->ne[1] != src1->ne[1] || dst->ne[2] != src1->ne[2] || dst->ne[3] != src1->ne[3]) {
|
|
3098
|
+
return false;
|
|
3099
|
+
}
|
|
2361
3100
|
|
|
2362
|
-
|
|
3101
|
+
GGML_UNUSED(sess);
|
|
3102
|
+
return true;
|
|
2363
3103
|
}
|
|
2364
3104
|
|
|
2365
|
-
static
|
|
2366
|
-
req->op = HTP_OP_GET_ROWS;
|
|
2367
|
-
|
|
2368
|
-
size_t n_bufs = 0;
|
|
2369
|
-
n_bufs += htp_req_buff_init(&req->src0, &bufs[n_bufs], t->src[0], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
|
|
2370
|
-
n_bufs += htp_req_buff_init(&req->src1, &bufs[n_bufs], t->src[1], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
|
|
2371
|
-
n_bufs += htp_req_buff_init(&req->dst, &bufs[n_bufs], t, DSPQBUF_TYPE_DSP_WRITE_CPU_READ);
|
|
3105
|
+
static bool ggml_hexagon_supported_tri(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
|
|
2372
3106
|
|
|
2373
|
-
|
|
2374
|
-
|
|
3107
|
+
const struct ggml_tensor * src0 = op->src[0];
|
|
3108
|
+
const struct ggml_tensor * dst = op;
|
|
2375
3109
|
|
|
2376
|
-
|
|
2377
|
-
|
|
2378
|
-
|
|
3110
|
+
if (src0->type != GGML_TYPE_F32) { return false; }
|
|
3111
|
+
if (dst->type != GGML_TYPE_F32) { return false; }
|
|
3112
|
+
if (!ggml_are_same_shape(src0, dst)) { return false; }
|
|
3113
|
+
if (!ggml_is_contiguous(src0) || !ggml_is_contiguous(dst)) { return false; }
|
|
2379
3114
|
|
|
2380
|
-
|
|
2381
|
-
n_bufs += htp_req_buff_init(&req->src0, &bufs[n_bufs], t->src[0], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
|
|
2382
|
-
n_bufs += htp_req_buff_init(&req->dst, &bufs[n_bufs], t, DSPQBUF_TYPE_DSP_WRITE_CPU_READ);
|
|
3115
|
+
return true;
|
|
2383
3116
|
|
|
2384
|
-
|
|
3117
|
+
GGML_UNUSED(sess);
|
|
2385
3118
|
}
|
|
2386
3119
|
|
|
2387
|
-
|
|
2388
|
-
|
|
2389
|
-
|
|
2390
|
-
case GGML_OP_MUL_MAT_ID:
|
|
2391
|
-
req->op = HTP_OP_MUL_MAT_ID;
|
|
2392
|
-
break;
|
|
2393
|
-
case GGML_OP_ADD_ID:
|
|
2394
|
-
req->op = HTP_OP_ADD_ID;
|
|
2395
|
-
break;
|
|
2396
|
-
default:
|
|
2397
|
-
GGML_ABORT("ggml-hex: unsupported op: %d\n", t->op);
|
|
2398
|
-
}
|
|
2399
|
-
|
|
2400
|
-
// src0: Weights (mulmat) or Input Activations (other op).
|
|
2401
|
-
// If constant, no cache management is needed.
|
|
2402
|
-
// src1: Input Activations (mulmat) or Second Operand (binary op).
|
|
2403
|
-
// src2: Expert IDs (mulmat) or Activated Experts (other op).
|
|
2404
|
-
|
|
2405
|
-
size_t n_bufs = 0;
|
|
2406
|
-
n_bufs += htp_req_buff_init(&req->src0, &bufs[n_bufs], t->src[0], _is_src0_constant ? DSPQBUF_TYPE_CONSTANT : DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
|
|
2407
|
-
n_bufs += htp_req_buff_init(&req->src1, &bufs[n_bufs], t->src[1], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
|
|
2408
|
-
n_bufs += htp_req_buff_init(&req->src2, &bufs[n_bufs], t->src[2], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
|
|
2409
|
-
n_bufs += htp_req_buff_init(&req->dst, &bufs[n_bufs], t, DSPQBUF_TYPE_DSP_WRITE_CPU_READ);
|
|
2410
|
-
|
|
2411
|
-
return n_bufs;
|
|
3120
|
+
static const char * ggml_backend_hexagon_name(ggml_backend_t backend) {
|
|
3121
|
+
auto sess = static_cast<ggml_hexagon_session *>(backend->context);
|
|
3122
|
+
return sess->c_name();
|
|
2412
3123
|
}
|
|
2413
3124
|
|
|
2414
|
-
static
|
|
2415
|
-
|
|
2416
|
-
|
|
2417
|
-
|
|
2418
|
-
n_bufs += htp_req_buff_init(&req->src0, &bufs[n_bufs], t->src[0], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
|
|
2419
|
-
n_bufs += htp_req_buff_init(&req->src1, &bufs[n_bufs], t->src[1], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
|
|
2420
|
-
n_bufs += htp_req_buff_init(&req->dst, &bufs[n_bufs], t, DSPQBUF_TYPE_DSP_WRITE_CPU_READ);
|
|
2421
|
-
|
|
2422
|
-
return n_bufs;
|
|
3125
|
+
static void ggml_backend_hexagon_free(ggml_backend_t backend) {
|
|
3126
|
+
// we just need to delete the backend here
|
|
3127
|
+
// the sessions are allocated & freed as part of the registry
|
|
3128
|
+
delete backend;
|
|
2423
3129
|
}
|
|
2424
3130
|
|
|
2425
|
-
static
|
|
2426
|
-
memcpy(&req->op_params, &t->op_params, sizeof(t->op_params));
|
|
2427
|
-
|
|
2428
|
-
bool supported = false;
|
|
2429
|
-
|
|
3131
|
+
static htp_op_code op_remap_to_htp(const ggml_tensor * t) {
|
|
2430
3132
|
switch (t->op) {
|
|
2431
|
-
case
|
|
2432
|
-
|
|
2433
|
-
|
|
2434
|
-
|
|
2435
|
-
|
|
2436
|
-
case
|
|
2437
|
-
|
|
2438
|
-
|
|
2439
|
-
|
|
2440
|
-
|
|
2441
|
-
case
|
|
2442
|
-
|
|
2443
|
-
|
|
2444
|
-
|
|
2445
|
-
|
|
2446
|
-
case
|
|
2447
|
-
|
|
2448
|
-
|
|
2449
|
-
|
|
3133
|
+
case GGML_OP_FLASH_ATTN_EXT: return HTP_OP_FLASH_ATTN_EXT;
|
|
3134
|
+
case GGML_OP_MUL_MAT: return HTP_OP_MUL_MAT;
|
|
3135
|
+
case GGML_OP_MUL_MAT_ID: return HTP_OP_MUL_MAT_ID;
|
|
3136
|
+
case GGML_OP_MUL: return HTP_OP_MUL;
|
|
3137
|
+
case GGML_OP_ADD: return HTP_OP_ADD;
|
|
3138
|
+
case GGML_OP_ADD_ID: return HTP_OP_ADD_ID;
|
|
3139
|
+
case GGML_OP_SUB: return HTP_OP_SUB;
|
|
3140
|
+
case GGML_OP_DIV: return HTP_OP_DIV;
|
|
3141
|
+
case GGML_OP_CPY: return HTP_OP_CPY;
|
|
3142
|
+
case GGML_OP_CONT: return HTP_OP_CPY;
|
|
3143
|
+
case GGML_OP_GET_ROWS: return HTP_OP_GET_ROWS;
|
|
3144
|
+
case GGML_OP_SET_ROWS: return HTP_OP_SET_ROWS;
|
|
3145
|
+
case GGML_OP_SUM_ROWS: return HTP_OP_SUM_ROWS;
|
|
3146
|
+
case GGML_OP_ARGSORT: return HTP_OP_ARGSORT;
|
|
3147
|
+
case GGML_OP_NORM: return HTP_OP_NORM;
|
|
3148
|
+
case GGML_OP_L2_NORM: return HTP_OP_L2_NORM;
|
|
3149
|
+
case GGML_OP_RMS_NORM: return HTP_OP_RMS_NORM;
|
|
3150
|
+
case GGML_OP_CONCAT: return HTP_OP_CONCAT;
|
|
3151
|
+
case GGML_OP_SCALE: return HTP_OP_SCALE;
|
|
3152
|
+
case GGML_OP_SQR: return HTP_OP_SQR;
|
|
3153
|
+
case GGML_OP_SQRT: return HTP_OP_SQRT;
|
|
3154
|
+
case GGML_OP_SOFT_MAX: return HTP_OP_SOFTMAX;
|
|
3155
|
+
case GGML_OP_SSM_CONV: return HTP_OP_SSM_CONV;
|
|
3156
|
+
case GGML_OP_GATED_DELTA_NET: return HTP_OP_GATED_DELTA_NET;
|
|
3157
|
+
case GGML_OP_ROPE: return HTP_OP_ROPE;
|
|
3158
|
+
case GGML_OP_REPEAT: return HTP_OP_REPEAT;
|
|
3159
|
+
case GGML_OP_CUMSUM: return HTP_OP_CUMSUM;
|
|
3160
|
+
case GGML_OP_FILL: return HTP_OP_FILL;
|
|
3161
|
+
case GGML_OP_DIAG: return HTP_OP_DIAG;
|
|
3162
|
+
case GGML_OP_SOLVE_TRI: return HTP_OP_SOLVE_TRI;
|
|
3163
|
+
case GGML_OP_TRI: return HTP_OP_TRI;
|
|
3164
|
+
case GGML_OP_PAD: return HTP_OP_PAD;
|
|
2450
3165
|
|
|
2451
3166
|
case GGML_OP_UNARY:
|
|
2452
|
-
|
|
2453
|
-
|
|
2454
|
-
|
|
2455
|
-
|
|
2456
|
-
|
|
2457
|
-
|
|
3167
|
+
switch (ggml_get_unary_op(t)) {
|
|
3168
|
+
case GGML_UNARY_OP_SILU: return HTP_OP_UNARY_SILU;
|
|
3169
|
+
case GGML_UNARY_OP_GELU: return HTP_OP_UNARY_GELU;
|
|
3170
|
+
case GGML_UNARY_OP_GELU_QUICK: return HTP_OP_UNARY_GELU;
|
|
3171
|
+
case GGML_UNARY_OP_SIGMOID: return HTP_OP_UNARY_SIGMOID;
|
|
3172
|
+
case GGML_UNARY_OP_NEG: return HTP_OP_UNARY_NEG;
|
|
3173
|
+
case GGML_UNARY_OP_EXP: return HTP_OP_UNARY_EXP;
|
|
3174
|
+
case GGML_UNARY_OP_SOFTPLUS: return HTP_OP_UNARY_SOFTPLUS;
|
|
3175
|
+
case GGML_UNARY_OP_TANH: return HTP_OP_UNARY_TANH;
|
|
3176
|
+
default:
|
|
3177
|
+
break;
|
|
2458
3178
|
}
|
|
2459
3179
|
break;
|
|
2460
3180
|
|
|
2461
3181
|
case GGML_OP_GLU:
|
|
2462
|
-
|
|
2463
|
-
|
|
2464
|
-
|
|
2465
|
-
|
|
2466
|
-
|
|
2467
|
-
supported = true;
|
|
2468
|
-
} else if (ggml_get_glu_op(t) == GGML_GLU_OP_GEGLU) {
|
|
2469
|
-
req->op = HTP_OP_GLU_GEGLU;
|
|
2470
|
-
supported = true;
|
|
3182
|
+
switch (ggml_get_glu_op(t)) {
|
|
3183
|
+
case GGML_GLU_OP_SWIGLU: return HTP_OP_GLU_SWIGLU;
|
|
3184
|
+
case GGML_GLU_OP_SWIGLU_OAI: return HTP_OP_GLU_SWIGLU_OAI;
|
|
3185
|
+
case GGML_GLU_OP_GEGLU: return HTP_OP_GLU_GEGLU;
|
|
3186
|
+
default: break;
|
|
2471
3187
|
}
|
|
2472
3188
|
break;
|
|
2473
3189
|
|
|
2474
|
-
case GGML_OP_SOFT_MAX:
|
|
2475
|
-
req->op = HTP_OP_SOFTMAX;
|
|
2476
|
-
supported = true;
|
|
2477
|
-
break;
|
|
2478
|
-
|
|
2479
3190
|
default:
|
|
2480
|
-
|
|
3191
|
+
GGML_ABORT("\nggml-hex: graph-compute %s is not supported\n", ggml_op_desc(t));
|
|
2481
3192
|
}
|
|
2482
|
-
|
|
2483
|
-
if (!supported) {
|
|
2484
|
-
GGML_ABORT("ggml-hex: unary : unsupported op: %d\n", t->op);
|
|
2485
|
-
}
|
|
2486
|
-
|
|
2487
|
-
size_t n_bufs = 0;
|
|
2488
|
-
n_bufs += htp_req_buff_init(&req->src0, &bufs[n_bufs], t->src[0], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
|
|
2489
|
-
n_bufs += htp_req_buff_init(&req->src1, &bufs[n_bufs], t->src[1], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
|
|
2490
|
-
n_bufs += htp_req_buff_init(&req->dst, &bufs[n_bufs], t, DSPQBUF_TYPE_DSP_WRITE_CPU_READ);
|
|
2491
|
-
|
|
2492
|
-
return n_bufs;
|
|
2493
|
-
}
|
|
2494
|
-
|
|
2495
|
-
static inline size_t init_sum_rows_req(htp_general_req * req, dspqueue_buffer * bufs, const ggml_tensor * t) {
|
|
2496
|
-
memcpy(&req->op_params, &t->op_params, sizeof(t->op_params));
|
|
2497
|
-
req->op = HTP_OP_SUM_ROWS;
|
|
2498
|
-
|
|
2499
|
-
size_t n_bufs = 0;
|
|
2500
|
-
n_bufs += htp_req_buff_init(&req->src0, &bufs[n_bufs], t->src[0], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
|
|
2501
|
-
n_bufs += htp_req_buff_init(&req->dst, &bufs[n_bufs], t, DSPQBUF_TYPE_DSP_WRITE_CPU_READ);
|
|
2502
|
-
|
|
2503
|
-
return n_bufs;
|
|
3193
|
+
return HTP_OP_INVALID;
|
|
2504
3194
|
}
|
|
2505
3195
|
|
|
2506
|
-
static inline
|
|
2507
|
-
memcpy(&req->op_params, &t->op_params, sizeof(t->op_params));
|
|
2508
|
-
req->op = HTP_OP_ROPE;
|
|
2509
|
-
|
|
2510
|
-
size_t n_bufs = 0;
|
|
2511
|
-
n_bufs += htp_req_buff_init(&req->src0, &bufs[n_bufs], t->src[0], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
|
|
2512
|
-
n_bufs += htp_req_buff_init(&req->src1, &bufs[n_bufs], t->src[1], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
|
|
2513
|
-
n_bufs += htp_req_buff_init(&req->src2, &bufs[n_bufs], t->src[2], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
|
|
2514
|
-
n_bufs += htp_req_buff_init(&req->dst, &bufs[n_bufs], t, DSPQBUF_TYPE_DSP_WRITE_CPU_READ);
|
|
2515
|
-
|
|
2516
|
-
return n_bufs;
|
|
2517
|
-
}
|
|
2518
|
-
|
|
2519
|
-
static inline size_t init_flash_attn_ext_req(htp_general_req * req, dspqueue_buffer * bufs, const ggml_tensor * t) {
|
|
2520
|
-
memcpy(&req->op_params, &t->op_params, sizeof(t->op_params));
|
|
2521
|
-
req->op = HTP_OP_FLASH_ATTN_EXT;
|
|
2522
|
-
|
|
2523
|
-
size_t n_bufs = 0;
|
|
2524
|
-
n_bufs += htp_req_buff_init(&req->src0, &bufs[n_bufs], t->src[0], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
|
|
2525
|
-
n_bufs += htp_req_buff_init(&req->src1, &bufs[n_bufs], t->src[1], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
|
|
2526
|
-
n_bufs += htp_req_buff_init(&req->src2, &bufs[n_bufs], t->src[2], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
|
|
2527
|
-
n_bufs += htp_req_buff_init(&req->src3, &bufs[n_bufs], t->src[3], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
|
|
2528
|
-
n_bufs += htp_req_buff_init(&req->src4, &bufs[n_bufs], t->src[4], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
|
|
2529
|
-
n_bufs += htp_req_buff_init(&req->dst, &bufs[n_bufs], t, DSPQBUF_TYPE_DSP_WRITE_CPU_READ);
|
|
2530
|
-
|
|
2531
|
-
return n_bufs;
|
|
2532
|
-
}
|
|
2533
|
-
|
|
2534
|
-
static inline size_t init_ssm_conv_req(htp_general_req * req, dspqueue_buffer * bufs, const ggml_tensor * t) {
|
|
2535
|
-
req->op = HTP_OP_SSM_CONV;
|
|
2536
|
-
|
|
2537
|
-
size_t n_bufs = 0;
|
|
2538
|
-
n_bufs += htp_req_buff_init(&req->src0, &bufs[n_bufs], t->src[0], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
|
|
2539
|
-
n_bufs += htp_req_buff_init(&req->src1, &bufs[n_bufs], t->src[1], DSPQBUF_TYPE_CONSTANT);
|
|
2540
|
-
n_bufs += htp_req_buff_init(&req->dst, &bufs[n_bufs], t, DSPQBUF_TYPE_DSP_WRITE_CPU_READ);
|
|
2541
|
-
|
|
2542
|
-
return n_bufs;
|
|
2543
|
-
}
|
|
2544
|
-
|
|
2545
|
-
static const char * ggml_backend_hexagon_name(ggml_backend_t backend) {
|
|
2546
|
-
auto sess = static_cast<ggml_hexagon_session *>(backend->context);
|
|
2547
|
-
return sess->name.c_str();
|
|
2548
|
-
}
|
|
2549
|
-
|
|
2550
|
-
static void ggml_backend_hexagon_free(ggml_backend_t backend) {
|
|
2551
|
-
// we just need to delete the backend here
|
|
2552
|
-
// the sessions are allocated & freed as part of the registry
|
|
2553
|
-
delete backend;
|
|
2554
|
-
}
|
|
2555
|
-
|
|
2556
|
-
static inline bool op_reuse_src1(const ggml_tensor * op1, const ggml_tensor * op0) {
|
|
2557
|
-
return (op0 && op0->src[1] == op1->src[1] && ggml_is_quantized(op0->src[0]->type));
|
|
2558
|
-
}
|
|
2559
|
-
|
|
2560
|
-
static inline bool is_compute_op(ggml_tensor *node)
|
|
3196
|
+
static inline bool op_is_compute(ggml_tensor *node)
|
|
2561
3197
|
{
|
|
2562
3198
|
return !ggml_op_is_empty(node->op) && !ggml_is_empty(node) && (node->flags & GGML_TENSOR_FLAG_COMPUTE);
|
|
2563
3199
|
}
|
|
2564
3200
|
|
|
2565
|
-
// scan the graph and figure out last compute op index
|
|
2566
|
-
static inline int last_compute_op(ggml_cgraph * graph) {
|
|
2567
|
-
int last = 0;
|
|
2568
|
-
for (int i = 0; i < graph->n_nodes; ++i) {
|
|
2569
|
-
if (is_compute_op(graph->nodes[i])) {
|
|
2570
|
-
last = i;
|
|
2571
|
-
}
|
|
2572
|
-
}
|
|
2573
|
-
|
|
2574
|
-
return last;
|
|
2575
|
-
}
|
|
2576
|
-
|
|
2577
3201
|
static ggml_status ggml_backend_hexagon_graph_compute(ggml_backend_t backend, ggml_cgraph * graph) {
|
|
2578
3202
|
auto sess = static_cast<ggml_hexagon_session *>(backend->context);
|
|
2579
3203
|
|
|
2580
|
-
HEX_VERBOSE("ggml-hex: %s graph-compute n_nodes %d\n", sess->
|
|
2581
|
-
|
|
2582
|
-
const int last = last_compute_op(graph);
|
|
3204
|
+
HEX_VERBOSE("ggml-hex: %s graph-compute n_nodes %d\n", sess->c_name(), graph->n_nodes);
|
|
2583
3205
|
|
|
2584
|
-
|
|
3206
|
+
std::vector<htp_opnode> nodes;
|
|
3207
|
+
nodes.reserve(graph->n_nodes);
|
|
2585
3208
|
|
|
3209
|
+
// Fusion
|
|
2586
3210
|
for (int i = 0; i < graph->n_nodes; ++i) {
|
|
2587
|
-
ggml_tensor *
|
|
2588
|
-
|
|
2589
|
-
if (!is_compute_op(node)) {
|
|
3211
|
+
ggml_tensor * n = graph->nodes[i];
|
|
3212
|
+
if (!op_is_compute(n)) {
|
|
2590
3213
|
continue;
|
|
2591
3214
|
}
|
|
2592
3215
|
|
|
2593
|
-
|
|
2594
|
-
|
|
2595
|
-
// skip quantizer if src1 is reused
|
|
2596
|
-
if (op_reuse_src1(node, prev_op)) {
|
|
2597
|
-
flags |= HTP_OPFLAGS_SKIP_QUANTIZE;
|
|
2598
|
-
}
|
|
3216
|
+
ggml_tensor * next_node = (i + 1 < graph->n_nodes) ? graph->nodes[i + 1] : nullptr;
|
|
2599
3217
|
|
|
2600
|
-
|
|
3218
|
+
htp_opnode node = {
|
|
3219
|
+
/*.node =*/ n,
|
|
3220
|
+
/*.fused =*/ {},
|
|
3221
|
+
/*.opcode =*/ HTP_OP_INVALID
|
|
3222
|
+
};
|
|
2601
3223
|
|
|
2602
|
-
|
|
2603
|
-
|
|
2604
|
-
|
|
3224
|
+
if (n->op == GGML_OP_RMS_NORM && next_node) {
|
|
3225
|
+
if (next_node->op == GGML_OP_MUL && op_is_compute(next_node) && ggml_can_fuse(graph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL })) {
|
|
3226
|
+
node.add_fused(next_node);
|
|
3227
|
+
node.opcode = HTP_OP_RMS_NORM_MUL;
|
|
3228
|
+
i++; // skip the fused MUL node
|
|
3229
|
+
}
|
|
2605
3230
|
}
|
|
2606
3231
|
|
|
2607
|
-
|
|
2608
|
-
|
|
2609
|
-
|
|
2610
|
-
ggml_hexagon_dispatch_op<init_binary_req<true>>(sess, node, flags);
|
|
2611
|
-
} else {
|
|
2612
|
-
ggml_hexagon_dispatch_op<init_binary_req<false>>(sess, node, flags);
|
|
2613
|
-
}
|
|
2614
|
-
break;
|
|
2615
|
-
case GGML_OP_MUL_MAT_ID:
|
|
2616
|
-
if (ggml_is_quantized(node->src[0]->type)) {
|
|
2617
|
-
ggml_hexagon_dispatch_op<init_binary_id_req<true>>(sess, node, flags);
|
|
2618
|
-
} else {
|
|
2619
|
-
ggml_hexagon_dispatch_op<init_binary_id_req<false>>(sess, node, flags);
|
|
2620
|
-
}
|
|
2621
|
-
break;
|
|
2622
|
-
case GGML_OP_MUL:
|
|
2623
|
-
case GGML_OP_ADD:
|
|
2624
|
-
case GGML_OP_SUB:
|
|
2625
|
-
case GGML_OP_DIV:
|
|
2626
|
-
ggml_hexagon_dispatch_op<init_binary_req<false>>(sess, node, flags);
|
|
2627
|
-
break;
|
|
2628
|
-
case GGML_OP_ADD_ID:
|
|
2629
|
-
ggml_hexagon_dispatch_op<init_binary_id_req<false>>(sess, node, flags);
|
|
2630
|
-
break;
|
|
2631
|
-
case GGML_OP_RMS_NORM:
|
|
2632
|
-
case GGML_OP_SCALE:
|
|
2633
|
-
ggml_hexagon_dispatch_op<init_unary_req>(sess, node, flags);
|
|
2634
|
-
break;
|
|
2635
|
-
case GGML_OP_SQR:
|
|
2636
|
-
case GGML_OP_SQRT:
|
|
2637
|
-
ggml_hexagon_dispatch_op<init_unary_req>(sess, node, flags);
|
|
2638
|
-
break;
|
|
2639
|
-
case GGML_OP_SUM_ROWS:
|
|
2640
|
-
ggml_hexagon_dispatch_op<init_sum_rows_req>(sess, node, flags);
|
|
2641
|
-
break;
|
|
2642
|
-
case GGML_OP_UNARY:
|
|
2643
|
-
if ((ggml_get_unary_op(node) == GGML_UNARY_OP_SILU) ||
|
|
2644
|
-
(ggml_get_unary_op(node) == GGML_UNARY_OP_GELU)) {
|
|
2645
|
-
ggml_hexagon_dispatch_op<init_unary_req>(sess, node, flags);
|
|
2646
|
-
}
|
|
2647
|
-
break;
|
|
2648
|
-
case GGML_OP_GLU:
|
|
2649
|
-
if ((ggml_get_glu_op(node) == GGML_GLU_OP_SWIGLU) ||
|
|
2650
|
-
(ggml_get_glu_op(node) == GGML_GLU_OP_SWIGLU_OAI) ||
|
|
2651
|
-
(ggml_get_glu_op(node) == GGML_GLU_OP_GEGLU)) {
|
|
2652
|
-
ggml_hexagon_dispatch_op<init_unary_req>(sess, node, flags);
|
|
2653
|
-
}
|
|
2654
|
-
break;
|
|
2655
|
-
case GGML_OP_SOFT_MAX:
|
|
2656
|
-
ggml_hexagon_dispatch_op<init_unary_req>(sess, node, flags);
|
|
2657
|
-
break;
|
|
2658
|
-
|
|
2659
|
-
case GGML_OP_ROPE:
|
|
2660
|
-
ggml_hexagon_dispatch_op<init_rope_req>(sess, node, flags);
|
|
2661
|
-
break;
|
|
2662
|
-
|
|
2663
|
-
case GGML_OP_FLASH_ATTN_EXT:
|
|
2664
|
-
ggml_hexagon_dispatch_op<init_flash_attn_ext_req>(sess, node, flags);
|
|
2665
|
-
break;
|
|
2666
|
-
|
|
2667
|
-
case GGML_OP_SET_ROWS:
|
|
2668
|
-
ggml_hexagon_dispatch_op<init_set_rows_req>(sess, node, flags);
|
|
2669
|
-
break;
|
|
2670
|
-
|
|
2671
|
-
case GGML_OP_GET_ROWS:
|
|
2672
|
-
ggml_hexagon_dispatch_op<init_get_rows_req>(sess, node, flags);
|
|
2673
|
-
break;
|
|
2674
|
-
|
|
2675
|
-
case GGML_OP_CPY:
|
|
2676
|
-
ggml_hexagon_dispatch_op<init_cpy_req>(sess, node, flags);
|
|
2677
|
-
break;
|
|
2678
|
-
|
|
2679
|
-
case GGML_OP_ARGSORT:
|
|
2680
|
-
ggml_hexagon_dispatch_op<init_argsort_req>(sess, node, flags);
|
|
2681
|
-
break;
|
|
3232
|
+
if (node.opcode == HTP_OP_INVALID) {
|
|
3233
|
+
node.opcode = op_remap_to_htp(n);
|
|
3234
|
+
}
|
|
2682
3235
|
|
|
2683
|
-
|
|
2684
|
-
|
|
2685
|
-
break;
|
|
3236
|
+
nodes.push_back(std::move(node));
|
|
3237
|
+
}
|
|
2686
3238
|
|
|
2687
|
-
|
|
2688
|
-
|
|
3239
|
+
// Queue and execute
|
|
3240
|
+
if (opt_opstage & HTP_OPSTAGE_QUEUE) {
|
|
3241
|
+
for (const auto & node : nodes) {
|
|
3242
|
+
sess->enqueue_op(node);
|
|
2689
3243
|
}
|
|
2690
3244
|
}
|
|
2691
3245
|
|
|
@@ -2698,57 +3252,13 @@ static ggml_status ggml_backend_hexagon_graph_compute(ggml_backend_t backend, gg
|
|
|
2698
3252
|
static void ggml_backend_hexagon_synchronize(ggml_backend_t backend) {
|
|
2699
3253
|
auto sess = static_cast<ggml_hexagon_session *>(backend->context);
|
|
2700
3254
|
|
|
2701
|
-
HEX_VERBOSE("ggml-hex: %s synchronize\n", sess->
|
|
3255
|
+
HEX_VERBOSE("ggml-hex: %s synchronize\n", sess->c_name());
|
|
2702
3256
|
|
|
2703
3257
|
// Wait until all pending ops complete
|
|
2704
3258
|
sess->flush();
|
|
2705
3259
|
}
|
|
2706
3260
|
|
|
2707
|
-
|
|
2708
|
-
ggml_tensor * node;
|
|
2709
|
-
|
|
2710
|
-
std::vector<ggml_tensor *> fused;
|
|
2711
|
-
|
|
2712
|
-
ggml_op op() const {
|
|
2713
|
-
return node->op;
|
|
2714
|
-
}
|
|
2715
|
-
|
|
2716
|
-
const ggml_tensor * dst() const {
|
|
2717
|
-
return fused.empty() ? node : fused.back();
|
|
2718
|
-
}
|
|
2719
|
-
|
|
2720
|
-
const ggml_tensor * src0() const {
|
|
2721
|
-
return node->src[0];
|
|
2722
|
-
}
|
|
2723
|
-
|
|
2724
|
-
const ggml_tensor * src1() const {
|
|
2725
|
-
return node->src[1];
|
|
2726
|
-
}
|
|
2727
|
-
|
|
2728
|
-
bool is_empty() const {
|
|
2729
|
-
return ggml_op_is_empty(node->op);
|
|
2730
|
-
}
|
|
2731
|
-
|
|
2732
|
-
void add_fused(ggml_tensor * t) {
|
|
2733
|
-
fused.push_back(t);
|
|
2734
|
-
}
|
|
2735
|
-
|
|
2736
|
-
bool stackable() const {
|
|
2737
|
-
switch (this->op()) {
|
|
2738
|
-
case GGML_OP_MUL_MAT:
|
|
2739
|
-
case GGML_OP_MUL_MAT_ID:
|
|
2740
|
-
return ggml_is_quantized(this->src0()->type);
|
|
2741
|
-
default:
|
|
2742
|
-
return false;
|
|
2743
|
-
}
|
|
2744
|
-
}
|
|
2745
|
-
|
|
2746
|
-
bool same_input(const node_info& n) const {
|
|
2747
|
-
return n.src1() == this->src1();
|
|
2748
|
-
}
|
|
2749
|
-
};
|
|
2750
|
-
|
|
2751
|
-
static std::vector<int> ggml_hexagon_graph_optimize_reorder(const std::vector<node_info> & nodes) {
|
|
3261
|
+
static std::vector<int> ggml_hexagon_graph_optimize_reorder(const std::vector<htp_opnode> & nodes) {
|
|
2752
3262
|
const int n = nodes.size();
|
|
2753
3263
|
|
|
2754
3264
|
std::vector<int> res;
|
|
@@ -2802,14 +3312,14 @@ static void ggml_backend_hexagon_graph_optimize(ggml_backend_t backend, ggml_cgr
|
|
|
2802
3312
|
|
|
2803
3313
|
enum ggml_op ops[MAX_FUSE];
|
|
2804
3314
|
|
|
2805
|
-
std::vector<
|
|
3315
|
+
std::vector<htp_opnode> nodes;
|
|
2806
3316
|
nodes.reserve(gf->n_nodes);
|
|
2807
3317
|
|
|
2808
3318
|
// fuse nodes:
|
|
2809
3319
|
// we don't want to make reorders that break fusing, so we first pack all fusable tensors
|
|
2810
3320
|
// and perform the reorder over the fused nodes. after the reorder is done, we unfuse
|
|
2811
3321
|
for (int i = 0; i < n; i++) {
|
|
2812
|
-
|
|
3322
|
+
htp_opnode node = {
|
|
2813
3323
|
/*.node =*/gf->nodes[i],
|
|
2814
3324
|
/*.fused =*/{},
|
|
2815
3325
|
};
|
|
@@ -2876,6 +3386,8 @@ static struct ggml_backend_i hexagon_backend_i = {
|
|
|
2876
3386
|
/* .free = */ ggml_backend_hexagon_free,
|
|
2877
3387
|
/* .set_tensor_async = */ NULL,
|
|
2878
3388
|
/* .get_tensor_async = */ NULL,
|
|
3389
|
+
/* .set_tensor_2d_async = */ NULL,
|
|
3390
|
+
/* .get_tensor_2d_async = */ NULL,
|
|
2879
3391
|
/* .cpy_tensor_async = */ NULL,
|
|
2880
3392
|
/* .synchronize = */ ggml_backend_hexagon_synchronize,
|
|
2881
3393
|
/* .graph_plan_create = */ NULL,
|
|
@@ -2915,7 +3427,7 @@ static ggml_backend_t ggml_backend_hexagon_device_init(ggml_backend_dev_t dev, c
|
|
|
2915
3427
|
|
|
2916
3428
|
static const char * ggml_backend_hexagon_device_get_name(ggml_backend_dev_t dev) {
|
|
2917
3429
|
auto sess = static_cast<ggml_hexagon_session *>(dev->context);
|
|
2918
|
-
return sess->
|
|
3430
|
+
return sess->c_name();
|
|
2919
3431
|
|
|
2920
3432
|
GGML_UNUSED(dev);
|
|
2921
3433
|
}
|
|
@@ -2926,8 +3438,7 @@ static const char * ggml_backend_hexagon_device_get_description(ggml_backend_dev
|
|
|
2926
3438
|
}
|
|
2927
3439
|
|
|
2928
3440
|
static void ggml_backend_hexagon_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
|
|
2929
|
-
|
|
2930
|
-
*free = 2ULL * 1024 * 1024 * 1024;
|
|
3441
|
+
*free = 0;
|
|
2931
3442
|
*total = *free;
|
|
2932
3443
|
|
|
2933
3444
|
GGML_UNUSED(dev);
|
|
@@ -3006,9 +3517,77 @@ static bool ggml_hexagon_supported_cpy(const struct ggml_hexagon_session * sess,
|
|
|
3006
3517
|
return true;
|
|
3007
3518
|
}
|
|
3008
3519
|
|
|
3520
|
+
static bool ggml_hexagon_supported_cont(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
|
|
3521
|
+
GGML_UNUSED(sess);
|
|
3522
|
+
const struct ggml_tensor * src0 = op->src[0];
|
|
3523
|
+
|
|
3524
|
+
// CONT is same-type only, supports f32 and f16
|
|
3525
|
+
if (src0->type != GGML_TYPE_F32 && src0->type != GGML_TYPE_F16) return false;
|
|
3526
|
+
|
|
3527
|
+
return true;
|
|
3528
|
+
}
|
|
3529
|
+
|
|
3530
|
+
static bool ggml_hexagon_supported_repeat(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
|
|
3531
|
+
GGML_UNUSED(sess);
|
|
3532
|
+
const struct ggml_tensor * src0 = op->src[0];
|
|
3533
|
+
const struct ggml_tensor * dst = op;
|
|
3534
|
+
|
|
3535
|
+
// Support f32 and f16
|
|
3536
|
+
if (src0->type != GGML_TYPE_F32 && src0->type != GGML_TYPE_F16) return false;
|
|
3537
|
+
|
|
3538
|
+
// src and dst must be the same type
|
|
3539
|
+
if (src0->type != dst->type) return false;
|
|
3540
|
+
|
|
3541
|
+
// dst dims must be multiples of src dims
|
|
3542
|
+
if (dst->ne[0] % src0->ne[0] != 0) return false;
|
|
3543
|
+
if (dst->ne[1] % src0->ne[1] != 0) return false;
|
|
3544
|
+
if (dst->ne[2] % src0->ne[2] != 0) return false;
|
|
3545
|
+
if (dst->ne[3] % src0->ne[3] != 0) return false;
|
|
3546
|
+
|
|
3547
|
+
// require contiguous tensors (no transposition)
|
|
3548
|
+
if (ggml_is_transposed(src0) || ggml_is_transposed(dst)) return false;
|
|
3549
|
+
|
|
3550
|
+
return true;
|
|
3551
|
+
}
|
|
3552
|
+
|
|
3553
|
+
static bool ggml_hexagon_supported_concat(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
|
|
3554
|
+
int dim = ((const int32_t *) op->op_params)[0];
|
|
3555
|
+
if (dim < 0 || dim >= GGML_MAX_DIMS) {
|
|
3556
|
+
return false;
|
|
3557
|
+
}
|
|
3558
|
+
|
|
3559
|
+
for (int i = 0; i < GGML_MAX_SRC; ++i) {
|
|
3560
|
+
const struct ggml_tensor * src = op->src[i];
|
|
3561
|
+
if (!src) {
|
|
3562
|
+
continue;
|
|
3563
|
+
}
|
|
3564
|
+
if (src->type != GGML_TYPE_F32 && src->type != GGML_TYPE_I32 && src->type != GGML_TYPE_F16) {
|
|
3565
|
+
return false;
|
|
3566
|
+
}
|
|
3567
|
+
}
|
|
3568
|
+
|
|
3569
|
+
return true;
|
|
3570
|
+
}
|
|
3571
|
+
|
|
3572
|
+
static bool ggml_hexagon_supported_fill(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
|
|
3573
|
+
const struct ggml_tensor * dst = op;
|
|
3574
|
+
|
|
3575
|
+
if (dst->type != GGML_TYPE_F32 && dst->type != GGML_TYPE_F16) {
|
|
3576
|
+
return false;
|
|
3577
|
+
}
|
|
3578
|
+
|
|
3579
|
+
GGML_UNUSED(sess);
|
|
3580
|
+
return true;
|
|
3581
|
+
}
|
|
3582
|
+
|
|
3009
3583
|
static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
|
|
3010
3584
|
auto sess = static_cast<ggml_hexagon_session *>(dev->context);
|
|
3011
3585
|
|
|
3586
|
+
// reject ops that match the filter
|
|
3587
|
+
if (opt_opfilter && std::regex_match(ggml_op_desc(op), *opt_opfilter)) {
|
|
3588
|
+
return false;
|
|
3589
|
+
}
|
|
3590
|
+
|
|
3012
3591
|
// all srcs & dsts must be mapped to the same session
|
|
3013
3592
|
if (!ggml_hexagon_supported_buffers(sess, op)) {
|
|
3014
3593
|
ggml_hexagon_dump_op_supp(sess->name, op, false);
|
|
@@ -3025,6 +3604,13 @@ static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, cons
|
|
|
3025
3604
|
supp = true;
|
|
3026
3605
|
break;
|
|
3027
3606
|
|
|
3607
|
+
case GGML_OP_MUL:
|
|
3608
|
+
case GGML_OP_ADD:
|
|
3609
|
+
case GGML_OP_SUB:
|
|
3610
|
+
case GGML_OP_DIV:
|
|
3611
|
+
supp = ggml_hexagon_supported_binary(sess, op);
|
|
3612
|
+
break;
|
|
3613
|
+
|
|
3028
3614
|
case GGML_OP_MUL_MAT:
|
|
3029
3615
|
supp = ggml_hexagon_supported_mul_mat(sess, op);
|
|
3030
3616
|
break;
|
|
@@ -3033,17 +3619,12 @@ static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, cons
|
|
|
3033
3619
|
supp = ggml_hexagon_supported_mul_mat_id(sess, op);
|
|
3034
3620
|
break;
|
|
3035
3621
|
|
|
3036
|
-
case GGML_OP_MUL:
|
|
3037
|
-
case GGML_OP_ADD:
|
|
3038
|
-
case GGML_OP_SUB:
|
|
3039
|
-
case GGML_OP_DIV:
|
|
3040
|
-
supp = ggml_hexagon_supported_binary(sess, op);
|
|
3041
|
-
break;
|
|
3042
|
-
|
|
3043
3622
|
case GGML_OP_ADD_ID:
|
|
3044
3623
|
supp = ggml_hexagon_supported_add_id(sess, op);
|
|
3045
3624
|
break;
|
|
3046
3625
|
|
|
3626
|
+
case GGML_OP_NORM:
|
|
3627
|
+
case GGML_OP_L2_NORM:
|
|
3047
3628
|
case GGML_OP_RMS_NORM:
|
|
3048
3629
|
case GGML_OP_SCALE:
|
|
3049
3630
|
supp = ggml_hexagon_supported_unary(sess, op);
|
|
@@ -3063,21 +3644,36 @@ static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, cons
|
|
|
3063
3644
|
break;
|
|
3064
3645
|
|
|
3065
3646
|
case GGML_OP_UNARY:
|
|
3066
|
-
{
|
|
3067
|
-
|
|
3068
|
-
|
|
3647
|
+
switch (ggml_get_unary_op(op)) {
|
|
3648
|
+
case GGML_UNARY_OP_NEG:
|
|
3649
|
+
case GGML_UNARY_OP_EXP:
|
|
3650
|
+
case GGML_UNARY_OP_SIGMOID:
|
|
3651
|
+
case GGML_UNARY_OP_SOFTPLUS:
|
|
3652
|
+
case GGML_UNARY_OP_TANH:
|
|
3653
|
+
supp = ggml_hexagon_supported_unary(sess, op);
|
|
3654
|
+
break;
|
|
3655
|
+
case GGML_UNARY_OP_SILU:
|
|
3656
|
+
case GGML_UNARY_OP_GELU:
|
|
3657
|
+
case GGML_UNARY_OP_GELU_QUICK:
|
|
3069
3658
|
supp = ggml_hexagon_supported_activations(sess, op);
|
|
3070
|
-
|
|
3071
|
-
|
|
3659
|
+
break;
|
|
3660
|
+
default:
|
|
3661
|
+
break;
|
|
3072
3662
|
}
|
|
3663
|
+
break;
|
|
3664
|
+
|
|
3073
3665
|
case GGML_OP_GLU:
|
|
3074
|
-
{
|
|
3075
|
-
|
|
3076
|
-
|
|
3666
|
+
switch (ggml_get_glu_op(op)) {
|
|
3667
|
+
case GGML_GLU_OP_SWIGLU:
|
|
3668
|
+
case GGML_GLU_OP_SWIGLU_OAI:
|
|
3669
|
+
case GGML_GLU_OP_GEGLU:
|
|
3077
3670
|
supp = ggml_hexagon_supported_activations(sess, op);
|
|
3078
|
-
|
|
3079
|
-
|
|
3671
|
+
break;
|
|
3672
|
+
default:
|
|
3673
|
+
break;
|
|
3080
3674
|
}
|
|
3675
|
+
break;
|
|
3676
|
+
|
|
3081
3677
|
case GGML_OP_ROPE:
|
|
3082
3678
|
supp = ggml_hexagon_supported_rope(sess, op);
|
|
3083
3679
|
break;
|
|
@@ -3098,6 +3694,14 @@ static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, cons
|
|
|
3098
3694
|
supp = ggml_hexagon_supported_cpy(sess, op);
|
|
3099
3695
|
break;
|
|
3100
3696
|
|
|
3697
|
+
case GGML_OP_CONT:
|
|
3698
|
+
supp = ggml_hexagon_supported_cont(sess, op);
|
|
3699
|
+
break;
|
|
3700
|
+
|
|
3701
|
+
case GGML_OP_REPEAT:
|
|
3702
|
+
supp = ggml_hexagon_supported_repeat(sess, op);
|
|
3703
|
+
break;
|
|
3704
|
+
|
|
3101
3705
|
case GGML_OP_ARGSORT:
|
|
3102
3706
|
supp = ggml_hexagon_supported_argsort(sess, op);
|
|
3103
3707
|
break;
|
|
@@ -3106,6 +3710,38 @@ static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, cons
|
|
|
3106
3710
|
supp = ggml_hexagon_supported_ssm_conv(sess, op);
|
|
3107
3711
|
break;
|
|
3108
3712
|
|
|
3713
|
+
case GGML_OP_GATED_DELTA_NET:
|
|
3714
|
+
supp = ggml_hexagon_supported_gated_delta_net(sess, op);
|
|
3715
|
+
break;
|
|
3716
|
+
|
|
3717
|
+
case GGML_OP_CUMSUM:
|
|
3718
|
+
supp = ggml_hexagon_supported_cumsum(sess, op);
|
|
3719
|
+
break;
|
|
3720
|
+
|
|
3721
|
+
case GGML_OP_CONCAT:
|
|
3722
|
+
supp = ggml_hexagon_supported_concat(sess, op);
|
|
3723
|
+
break;
|
|
3724
|
+
|
|
3725
|
+
case GGML_OP_FILL:
|
|
3726
|
+
supp = ggml_hexagon_supported_fill(sess, op);
|
|
3727
|
+
break;
|
|
3728
|
+
|
|
3729
|
+
case GGML_OP_DIAG:
|
|
3730
|
+
supp = ggml_hexagon_supported_diag(sess, op);
|
|
3731
|
+
break;
|
|
3732
|
+
|
|
3733
|
+
case GGML_OP_SOLVE_TRI:
|
|
3734
|
+
supp = ggml_hexagon_supported_solve_tri(sess, op);
|
|
3735
|
+
break;
|
|
3736
|
+
|
|
3737
|
+
case GGML_OP_TRI:
|
|
3738
|
+
supp = ggml_hexagon_supported_tri(sess, op);
|
|
3739
|
+
break;
|
|
3740
|
+
|
|
3741
|
+
case GGML_OP_PAD:
|
|
3742
|
+
supp = ggml_hexagon_supported_pad(sess, op);
|
|
3743
|
+
break;
|
|
3744
|
+
|
|
3109
3745
|
default:
|
|
3110
3746
|
break;
|
|
3111
3747
|
}
|
|
@@ -3172,21 +3808,6 @@ struct ggml_hexagon_registry {
|
|
|
3172
3808
|
ggml_hexagon_registry::ggml_hexagon_registry(ggml_backend_reg_t reg) {
|
|
3173
3809
|
GGML_LOG_INFO("ggml-hex: Hexagon backend (experimental) : allocating new registry : ndev %zu\n", opt_ndev);
|
|
3174
3810
|
|
|
3175
|
-
if (!opt_arch) {
|
|
3176
|
-
int err = get_hex_arch_ver(CDSP_DOMAIN_ID, &opt_arch);
|
|
3177
|
-
if (err != 0) {
|
|
3178
|
-
GGML_LOG_ERROR("ggml-hex: failed to query HTP version (err %d) defaulting to v73\n", err);
|
|
3179
|
-
opt_arch = 73;
|
|
3180
|
-
}
|
|
3181
|
-
}
|
|
3182
|
-
|
|
3183
|
-
#if defined(__ANDROID__)
|
|
3184
|
-
if (opt_arch < 75) {
|
|
3185
|
-
opt_ndev = 1;
|
|
3186
|
-
GGML_LOG_WARN("ggml-hex: forcing ndev to 1 for SoCs archs lower than v75.\n");
|
|
3187
|
-
}
|
|
3188
|
-
#endif
|
|
3189
|
-
|
|
3190
3811
|
GGML_LOG_INFO("ggml-hex: Hexagon Arch version v%d\n", opt_arch);
|
|
3191
3812
|
|
|
3192
3813
|
// Create devices / sessions
|
|
@@ -3241,53 +3862,117 @@ static void * ggml_backend_hexagon_get_proc_address(ggml_backend_reg_t reg, cons
|
|
|
3241
3862
|
return NULL;
|
|
3242
3863
|
}
|
|
3243
3864
|
|
|
3865
|
+
// Parse a comma-separated list of unsigned integers into a vector.
//
// Each token is converted with strtoul() using base 0, so decimal, 0x-prefixed
// hex and 0-prefixed octal values are accepted.
//
// str : NUL-terminated list such as "1,0x20,3"; may be null
// Returns the parsed values in order. Returns an empty vector when str is null
// or empty, or when any token contains no parsable number, so that callers
// (which typically pass environment variables) can treat it as invalid input
// instead of hitting an uncaught exception.
template<typename T> std::vector<T> str_to_vec(const char* str) {
    std::vector<T> v;
    if (!str) {
        return v;
    }

    std::stringstream ss(str);
    std::string       t;

    while (std::getline(ss, t, ',')) {
        char *              end = nullptr;
        const unsigned long val = strtoul(t.c_str(), &end, 0);
        if (end == t.c_str()) {
            // nothing was converted: malformed or empty token
            return {};
        }
        v.push_back(static_cast<T>(val));
    }

    return v;
}
|
|
3876
|
+
|
|
3877
|
+
// Format a vector as a comma-separated list.
//
// Values are streamed with std::setbase(BASE) and std::showbase, so BASE 16
// yields 0x-prefixed output.
//
// v : values to format
// Returns the joined string with no trailing comma; an empty vector yields an
// empty string (the separator is written before each element after the first,
// so nothing has to be trimmed afterwards).
template<typename T, int BASE=10> std::string vec_to_str(const std::vector<T> & v) {
    std::stringstream ss;
    ss << std::setbase(BASE) << std::showbase;

    const char * sep = "";
    for (const auto & item : v) {
        ss << sep << item;
        sep = ",";
    }

    return ss.str();
}
|
|
3884
|
+
|
|
3244
3885
|
// One-time initialisation of the Hexagon backend registry.
//
// Reads the GGML_HEXAGON_* environment variables into the opt_* settings,
// determines the Hexagon arch version (queried from the device unless
// GGML_HEXAGON_ARCH is set), applies arch-dependent defaults and limits, and
// finally allocates the ggml_hexagon_registry stored in reg->context.
//
// reg : backend registration object whose context is populated
static void ggml_hexagon_init(ggml_backend_reg * reg) {
    // Basic sanity checks to make sure definitions match
    static_assert((unsigned int) HTP_TYPE_Q4_0 == (unsigned int) GGML_TYPE_Q4_0,
                  "please update hexagon_type to match ggml_type");
    static_assert((unsigned int) HTP_TYPE_Q4_1 == (unsigned int) GGML_TYPE_Q4_1,
                  "please update hexagon_type to match ggml_type");
    static_assert((unsigned int) HTP_TYPE_Q8_0 == (unsigned int) GGML_TYPE_Q8_0,
                  "please update hexagon_type to match ggml_type");
    static_assert((unsigned int) HTP_TYPE_MXFP4 == (unsigned int) GGML_TYPE_MXFP4,
                  "please update hexagon_type to match ggml_type");
    static_assert((unsigned int) HTP_TYPE_IQ4_NL == (unsigned int) GGML_TYPE_IQ4_NL,
                  "please update hexagon_type to match ggml_type");

    const char * str_verbose  = getenv("GGML_HEXAGON_VERBOSE");
    const char * str_hostbuf  = getenv("GGML_HEXAGON_HOSTBUF");
    const char * str_opstage  = getenv("GGML_HEXAGON_OPSTAGE");
    const char * str_opbatch  = getenv("GGML_HEXAGON_OPBATCH");
    const char * str_opqueue  = getenv("GGML_HEXAGON_OPQUEUE");
    const char * str_oppoll   = getenv("GGML_HEXAGON_OPPOLL");
    const char * str_opfilter = getenv("GGML_HEXAGON_OPFILTER");
    const char * str_profile  = getenv("GGML_HEXAGON_PROFILE");
    const char * str_etm      = getenv("GGML_HEXAGON_ETM");
    const char * str_nhvx     = getenv("GGML_HEXAGON_NHVX");
    const char * str_use_hmx  = getenv("GGML_HEXAGON_USE_HMX");
    const char * str_ndev     = getenv("GGML_HEXAGON_NDEV");
    const char * str_arch     = getenv("GGML_HEXAGON_ARCH");
    const char * str_vmem     = getenv("GGML_HEXAGON_VMEM");
    const char * str_mbuf     = getenv("GGML_HEXAGON_MBUF");

    // Init Arch first since it affects other defaults
    if (!str_arch) {
        int err = get_hex_arch_ver(CDSP_DOMAIN_ID, &opt_arch);
        if (err != 0) {
            GGML_LOG_ERROR("ggml-hex: failed to query HTP version (err %d) defaulting to v73\n", err);
            opt_arch = 73;
        }
    } else {
        // accept both "75" and "v75" / "V75"
        if (str_arch[0] == 'v' || str_arch[0] == 'V') {
            str_arch++;
        }
        opt_arch = strtoul(str_arch, NULL, 0);
    }

    size_t MiB = 1024 * 1024;

    // Update vmem default
    opt_vmem = opt_arch >= 75 ? HTP_OP_MAX_VMEM_DEFAULT : 3000 * MiB;

    auto RE_ICASE = std::regex_constants::icase;

    // Environment overrides; unset variables keep the current value unless noted
    opt_opfilter = str_opfilter ? new std::regex(str_opfilter, RE_ICASE) : NULL;
    opt_verbose  = str_verbose  ? atoi(str_verbose) : 0;
    opt_hostbuf  = str_hostbuf  ? atoi(str_hostbuf) : opt_hostbuf;
    opt_opstage  = str_opstage  ? strtoul(str_opstage, NULL, 0) : opt_opstage;
    opt_opbatch  = str_opbatch  ? strtoul(str_opbatch, NULL, 0) : opt_opbatch;
    opt_opqueue  = str_opqueue  ? strtoul(str_opqueue, NULL, 0) : opt_opqueue;
    opt_oppoll   = str_oppoll   ? strtoul(str_oppoll, NULL, 0) : opt_oppoll;
    opt_profile  = str_profile  ? atoi(str_profile) : 0;
    opt_etm      = str_etm      ? atoi(str_etm) : 0;
    opt_nhvx     = str_nhvx     ? strtoul(str_nhvx, NULL, 0) : opt_nhvx;
    opt_use_hmx  = str_use_hmx  ? atoi(str_use_hmx) : opt_use_hmx;
    opt_ndev     = str_ndev     ? strtoul(str_ndev, NULL, 0) : opt_ndev;
    opt_mbuf     = str_mbuf     ? strtoul(str_mbuf, NULL, 0) * MiB : opt_mbuf;
    opt_vmem     = str_vmem     ? strtoul(str_vmem, NULL, 0) * MiB : opt_vmem;

    if (opt_ndev > GGML_HEXAGON_MAX_SESSIONS) {
        opt_ndev = GGML_HEXAGON_MAX_SESSIONS;
    }

#if defined(__ANDROID__)
    if (opt_arch < 75) {
        opt_ndev = 1;
        GGML_LOG_WARN("ggml-hex: forcing ndev to 1 for SoCs archs lower than v75.\n");
    }
#endif

    if (str_profile) {
        // GGML_HEXAGON_PROFILE is either a single mode number or a list of
        // eight pmu events; any other element count disables profiling.
        opt_pmu_evt = [&]() -> std::vector<uint32_t> {
            auto v = str_to_vec<uint32_t>(str_profile);
            switch (v.size()) {
                case 1:  opt_profile = v[0]; return opt_pmu_evt; // mode with default pmu events
                case 8:  opt_profile = 2;    return v;           // mode with custom pmu events
                default: opt_profile = 0;    return {};          // garbage input
            }
        }();
        if (opt_profile == 1) opt_pmu_evt = {};
        GGML_LOG_INFO("ggml-hex: Profiling mode %u : pmu-evt [ %s ]\n", opt_profile,
                      vec_to_str<uint32_t, 16>(opt_pmu_evt).c_str());
    }

    reg->context = new ggml_hexagon_registry(reg);
}
|
|
3292
3977
|
|
|
3293
3978
|
static const struct ggml_backend_reg_i ggml_backend_hexagon_reg_i = {
|