whispercpp 1.3.5 → 1.3.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.document +3 -0
- data/.rdoc_options +2 -0
- data/LICENSE +1 -1
- data/README.md +133 -3
- data/Rakefile +18 -3
- data/ext/dependencies.rb +10 -4
- data/ext/dependencies_for_windows.rb +17 -0
- data/ext/extconf.rb +20 -7
- data/ext/options.rb +54 -14
- data/ext/options_for_windows.rb +51 -0
- data/ext/ruby_whisper.c +56 -46
- data/ext/ruby_whisper.h +165 -2
- data/ext/ruby_whisper_context.c +297 -126
- data/ext/ruby_whisper_context_params.c +163 -0
- data/ext/ruby_whisper_log_queue.c +180 -0
- data/ext/ruby_whisper_log_settable.h +47 -0
- data/ext/ruby_whisper_model.c +0 -1
- data/ext/ruby_whisper_parakeet.c +49 -0
- data/ext/ruby_whisper_parakeet_context.c +304 -0
- data/ext/ruby_whisper_parakeet_context_params.c +117 -0
- data/ext/ruby_whisper_parakeet_model.c +84 -0
- data/ext/ruby_whisper_parakeet_params.c +548 -0
- data/ext/ruby_whisper_parakeet_segment.c +157 -0
- data/ext/ruby_whisper_parakeet_token.c +188 -0
- data/ext/ruby_whisper_parakeet_transcribe.cpp +58 -0
- data/ext/ruby_whisper_params.c +256 -66
- data/ext/ruby_whisper_segment.c +6 -7
- data/ext/ruby_whisper_token.c +29 -9
- data/ext/ruby_whisper_transcribe.cpp +46 -16
- data/ext/ruby_whisper_vad_context.c +48 -1
- data/ext/ruby_whisper_vad_context_detect.cpp +6 -5
- data/ext/ruby_whisper_vad_params.c +0 -1
- data/ext/ruby_whisper_vad_segment.c +0 -1
- data/ext/ruby_whisper_vad_segments.c +0 -1
- data/ext/sources/CMakeLists.txt +41 -3
- data/ext/sources/CMakePresets.json +95 -0
- data/ext/sources/cmake/parakeet-config.cmake.in +30 -0
- data/ext/sources/cmake/parakeet.pc.in +10 -0
- data/ext/sources/cmake/whisper-config.cmake.in +5 -40
- data/ext/sources/cmake/whisper.pc.in +1 -1
- data/ext/sources/examples/CMakeLists.txt +4 -2
- data/ext/sources/examples/bench/bench.cpp +24 -19
- data/ext/sources/examples/cli/cli.cpp +51 -9
- data/ext/sources/examples/common-ggml.cpp +4 -0
- data/ext/sources/examples/common-whisper.cpp +139 -67
- data/ext/sources/examples/common-whisper.h +11 -0
- data/ext/sources/examples/ffmpeg-transcode.cpp +211 -341
- data/ext/sources/examples/miniaudio.h +4507 -2131
- data/ext/sources/examples/parakeet-cli/CMakeLists.txt +8 -0
- data/ext/sources/examples/parakeet-cli/parakeet-cli.cpp +243 -0
- data/ext/sources/examples/parakeet-quantize/CMakeLists.txt +7 -0
- data/ext/sources/examples/parakeet-quantize/parakeet-quantize.cpp +230 -0
- data/ext/sources/examples/server/server.cpp +213 -163
- data/ext/sources/ggml/CMakeLists.txt +29 -15
- data/ext/sources/ggml/cmake/FindNCCL.cmake +36 -0
- data/ext/sources/ggml/cmake/ggml-config.cmake.in +12 -2
- data/ext/sources/ggml/include/ggml-alloc.h +1 -0
- data/ext/sources/ggml/include/ggml-backend.h +73 -11
- data/ext/sources/ggml/include/ggml-cann.h +1 -1
- data/ext/sources/ggml/include/ggml-cpu.h +5 -0
- data/ext/sources/ggml/include/ggml-cuda.h +3 -0
- data/ext/sources/ggml/include/ggml-openvino.h +37 -0
- data/ext/sources/ggml/include/ggml-opt.h +1 -1
- data/ext/sources/ggml/include/ggml-rpc.h +8 -3
- data/ext/sources/ggml/include/ggml-virtgpu.h +14 -0
- data/ext/sources/ggml/include/ggml.h +155 -16
- data/ext/sources/ggml/include/gguf.h +10 -2
- data/ext/sources/ggml/src/CMakeLists.txt +25 -5
- data/ext/sources/ggml/src/ggml-alloc.c +9 -10
- data/ext/sources/ggml/src/ggml-backend-dl.cpp +48 -0
- data/ext/sources/ggml/src/ggml-backend-dl.h +45 -0
- data/ext/sources/ggml/src/ggml-backend-impl.h +22 -2
- data/ext/sources/ggml/src/ggml-backend-meta.cpp +2263 -0
- data/ext/sources/ggml/src/ggml-backend-reg.cpp +40 -86
- data/ext/sources/ggml/src/ggml-backend.cpp +114 -10
- data/ext/sources/ggml/src/ggml-blas/CMakeLists.txt +1 -1
- data/ext/sources/ggml/src/ggml-blas/ggml-blas.cpp +10 -2
- data/ext/sources/ggml/src/ggml-cann/acl_tensor.cpp +1 -1
- data/ext/sources/ggml/src/ggml-cann/acl_tensor.h +1 -1
- data/ext/sources/ggml/src/ggml-cann/aclnn_ops.cpp +1016 -442
- data/ext/sources/ggml/src/ggml-cann/aclnn_ops.h +111 -85
- data/ext/sources/ggml/src/ggml-cann/common.h +23 -14
- data/ext/sources/ggml/src/ggml-cann/ggml-cann.cpp +255 -92
- data/ext/sources/ggml/src/ggml-common.h +22 -0
- data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +68 -34
- data/ext/sources/ggml/src/ggml-cpu/amx/amx.cpp +44 -19
- data/ext/sources/ggml/src/ggml-cpu/amx/common.h +34 -10
- data/ext/sources/ggml/src/ggml-cpu/amx/mmq.cpp +101 -101
- data/ext/sources/ggml/src/ggml-cpu/arch/arm/quants.c +194 -1
- data/ext/sources/ggml/src/ggml-cpu/arch/arm/repack.cpp +2874 -613
- data/ext/sources/ggml/src/ggml-cpu/arch/loongarch/quants.c +151 -1
- data/ext/sources/ggml/src/ggml-cpu/arch/powerpc/quants.c +0 -1
- data/ext/sources/ggml/src/ggml-cpu/arch/riscv/quants.c +5480 -840
- data/ext/sources/ggml/src/ggml-cpu/arch/riscv/repack.cpp +1361 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/s390/quants.c +8 -11
- data/ext/sources/ggml/src/ggml-cpu/arch/wasm/quants.c +72 -1
- data/ext/sources/ggml/src/ggml-cpu/arch/x86/quants.c +186 -36
- data/ext/sources/ggml/src/ggml-cpu/arch/x86/repack.cpp +119 -19
- data/ext/sources/ggml/src/ggml-cpu/arch-fallback.h +112 -26
- data/ext/sources/ggml/src/ggml-cpu/binary-ops.cpp +2 -6
- data/ext/sources/ggml/src/ggml-cpu/cmake/FindSMTIME.cmake +32 -0
- data/ext/sources/ggml/src/ggml-cpu/common.h +8 -0
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-impl.h +13 -0
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +153 -16
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.cpp +17 -0
- data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.cpp +21 -20
- data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +976 -251
- data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +671 -266
- data/ext/sources/ggml/src/ggml-cpu/ops.cpp +1277 -263
- data/ext/sources/ggml/src/ggml-cpu/ops.h +4 -0
- data/ext/sources/ggml/src/ggml-cpu/quants.c +95 -0
- data/ext/sources/ggml/src/ggml-cpu/quants.h +6 -0
- data/ext/sources/ggml/src/ggml-cpu/repack.cpp +2893 -679
- data/ext/sources/ggml/src/ggml-cpu/repack.h +119 -8
- data/ext/sources/ggml/src/ggml-cpu/simd-gemm.h +226 -0
- data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +114 -19
- data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.cpp +1402 -687
- data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.h +8 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp +597 -2766
- data/ext/sources/ggml/src/ggml-cpu/spacemit/ime2_kernels.cpp +5768 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/ime_env.cpp +320 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/ime_env.h +55 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/ime_kernels.h +182 -19
- data/ext/sources/ggml/src/ggml-cpu/spacemit/repack.cpp +1795 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/repack.h +14 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/rvv_kernels.cpp +3178 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/rvv_kernels.h +95 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/spine_barrier.h +34 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/spine_mem_pool.cpp +760 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/spine_mem_pool.h +32 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/spine_tcm.h +409 -0
- data/ext/sources/ggml/src/ggml-cpu/unary-ops.cpp +1 -1
- data/ext/sources/ggml/src/ggml-cpu/vec.cpp +54 -53
- data/ext/sources/ggml/src/ggml-cpu/vec.h +225 -240
- data/ext/sources/ggml/src/ggml-cuda/CMakeLists.txt +18 -8
- data/ext/sources/ggml/src/ggml-cuda/allreduce.cu +971 -0
- data/ext/sources/ggml/src/ggml-cuda/allreduce.cuh +29 -0
- data/ext/sources/ggml/src/ggml-cuda/argsort.cu +73 -28
- data/ext/sources/ggml/src/ggml-cuda/binbcast.cu +69 -41
- data/ext/sources/ggml/src/ggml-cuda/binbcast.cuh +1 -0
- data/ext/sources/ggml/src/ggml-cuda/common.cuh +359 -29
- data/ext/sources/ggml/src/ggml-cuda/concat.cu +120 -114
- data/ext/sources/ggml/src/ggml-cuda/conv2d-transpose.cu +45 -21
- data/ext/sources/ggml/src/ggml-cuda/conv2d-transpose.cuh +1 -0
- data/ext/sources/ggml/src/ggml-cuda/convert.cu +94 -27
- data/ext/sources/ggml/src/ggml-cuda/convert.cuh +10 -0
- data/ext/sources/ggml/src/ggml-cuda/cpy.cu +20 -9
- data/ext/sources/ggml/src/ggml-cuda/dequantize.cuh +22 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +333 -85
- data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +632 -190
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cu +12 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cuh +162 -49
- data/ext/sources/ggml/src/ggml-cuda/fattn-vec.cuh +43 -18
- data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +44 -14
- data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cuh +1 -1
- data/ext/sources/ggml/src/ggml-cuda/fattn.cu +241 -23
- data/ext/sources/ggml/src/ggml-cuda/fattn.cuh +2 -0
- data/ext/sources/ggml/src/ggml-cuda/fwht.cu +101 -0
- data/ext/sources/ggml/src/ggml-cuda/fwht.cuh +4 -0
- data/ext/sources/ggml/src/ggml-cuda/gated_delta_net.cu +312 -0
- data/ext/sources/ggml/src/ggml-cuda/gated_delta_net.cuh +4 -0
- data/ext/sources/ggml/src/ggml-cuda/getrows.cu +34 -12
- data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +1454 -599
- data/ext/sources/ggml/src/ggml-cuda/im2col.cu +32 -29
- data/ext/sources/ggml/src/ggml-cuda/mean.cu +13 -10
- data/ext/sources/ggml/src/ggml-cuda/mma.cuh +397 -183
- data/ext/sources/ggml/src/ggml-cuda/mmf.cu +30 -10
- data/ext/sources/ggml/src/ggml-cuda/mmf.cuh +161 -88
- data/ext/sources/ggml/src/ggml-cuda/mmq.cu +18 -12
- data/ext/sources/ggml/src/ggml-cuda/mmq.cuh +522 -431
- data/ext/sources/ggml/src/ggml-cuda/mmvf.cu +139 -72
- data/ext/sources/ggml/src/ggml-cuda/mmvf.cuh +2 -0
- data/ext/sources/ggml/src/ggml-cuda/mmvq.cu +608 -88
- data/ext/sources/ggml/src/ggml-cuda/mmvq.cuh +6 -0
- data/ext/sources/ggml/src/ggml-cuda/norm.cu +47 -79
- data/ext/sources/ggml/src/ggml-cuda/out-prod.cu +23 -7
- data/ext/sources/ggml/src/ggml-cuda/pad.cu +13 -10
- data/ext/sources/ggml/src/ggml-cuda/quantize.cu +134 -27
- data/ext/sources/ggml/src/ggml-cuda/quantize.cuh +1 -1
- data/ext/sources/ggml/src/ggml-cuda/reduce_rows.cuh +7 -17
- data/ext/sources/ggml/src/ggml-cuda/rope.cu +244 -137
- data/ext/sources/ggml/src/ggml-cuda/scale.cu +4 -1
- data/ext/sources/ggml/src/ggml-cuda/set-rows.cu +14 -6
- data/ext/sources/ggml/src/ggml-cuda/snake.cu +72 -0
- data/ext/sources/ggml/src/ggml-cuda/snake.cuh +8 -0
- data/ext/sources/ggml/src/ggml-cuda/softcap.cu +4 -1
- data/ext/sources/ggml/src/ggml-cuda/softmax.cu +8 -83
- data/ext/sources/ggml/src/ggml-cuda/solve_tri.cu +1 -1
- data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cu +96 -40
- data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cuh +1 -1
- data/ext/sources/ggml/src/ggml-cuda/ssm-scan.cu +40 -18
- data/ext/sources/ggml/src/ggml-cuda/sumrows.cu +8 -4
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_16.cu +1 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_32.cu +6 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_8.cu +2 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_4.cu +2 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_16.cu +1 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_32.cu +6 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_4.cu +2 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_8.cu +2 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_16.cu +1 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_4.cu +2 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_8.cu +2 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_4.cu +2 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_8.cu +2 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq192-dv128.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq320-dv256.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq512-dv512.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-bf16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-f16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q4_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q4_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q5_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q5_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q8_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-bf16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-bf16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-bf16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-bf16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-bf16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-bf16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-nvfp4.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q1_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/top-k.cu +5 -5
- data/ext/sources/ggml/src/ggml-cuda/topk-moe.cu +202 -135
- data/ext/sources/ggml/src/ggml-cuda/topk-moe.cuh +20 -14
- data/ext/sources/ggml/src/ggml-cuda/unary.cu +86 -2
- data/ext/sources/ggml/src/ggml-cuda/unary.cuh +4 -0
- data/ext/sources/ggml/src/ggml-cuda/vecdotq.cuh +111 -17
- data/ext/sources/ggml/src/ggml-cuda/vendors/cuda.h +7 -2
- data/ext/sources/ggml/src/ggml-cuda/vendors/hip.h +30 -2
- data/ext/sources/ggml/src/ggml-cuda/vendors/musa.h +3 -0
- data/ext/sources/ggml/src/ggml-hexagon/CMakeLists.txt +84 -46
- data/ext/sources/ggml/src/ggml-hexagon/ggml-hexagon.cpp +1612 -753
- data/ext/sources/ggml/src/ggml-hexagon/htp/CMakeLists.txt +51 -11
- data/ext/sources/ggml/src/ggml-hexagon/htp/act-ops.c +361 -261
- data/ext/sources/ggml/src/ggml-hexagon/htp/argsort-ops.c +294 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/binary-ops.c +753 -241
- data/ext/sources/ggml/src/ggml-hexagon/htp/cmake-toolchain.cmake +5 -5
- data/ext/sources/ggml/src/ggml-hexagon/htp/concat-ops.c +277 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/cpy-ops.c +295 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/cumsum-ops.c +270 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/diag-ops.c +216 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/fill-ops.c +123 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/flash-attn-ops.c +471 -296
- data/ext/sources/ggml/src/ggml-hexagon/htp/gated-delta-net-ops.c +1148 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/get-rows-ops.c +159 -53
- data/ext/sources/ggml/src/ggml-hexagon/htp/{htp-dma.c → hex-dma.c} +3 -3
- data/ext/sources/ggml/src/ggml-hexagon/htp/hex-dma.h +372 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hex-dump.h +86 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hex-fastdiv.h +37 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hex-utils.h +137 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-flash-attn-ops.c +1878 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c +2066 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-ops.c +6 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-ops.h +88 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-profile.h +34 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-queue.c +158 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-queue.h +134 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-utils.h +200 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ctx.h +97 -14
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ops.h +163 -67
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp_iface.idl +9 -3
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-arith.h +443 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-base.h +308 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-copy.h +262 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-div.h +291 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-dump.h +129 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-exp.h +216 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-flash-attn.h +47 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-floor.h +100 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-inverse.h +210 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-log.h +65 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-pow.h +42 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-reduce.h +296 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-repl.h +74 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-scale.h +133 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sigmoid.h +142 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sin-cos.h +90 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sqrt.h +126 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-types.h +36 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-utils.h +18 -1348
- data/ext/sources/ggml/src/ggml-hexagon/htp/main.c +547 -635
- data/ext/sources/ggml/src/ggml-hexagon/htp/matmul-ops.c +3556 -1101
- data/ext/sources/ggml/src/ggml-hexagon/htp/pad-ops.c +547 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/repeat-ops.c +148 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/rope-ops.c +475 -269
- data/ext/sources/ggml/src/ggml-hexagon/htp/set-rows-ops.c +94 -72
- data/ext/sources/ggml/src/ggml-hexagon/htp/softmax-ops.c +222 -217
- data/ext/sources/ggml/src/ggml-hexagon/htp/solve-tri-ops.c +267 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/ssm-conv.c +432 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/sum-rows-ops.c +128 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/unary-ops.c +886 -117
- data/ext/sources/ggml/src/ggml-hexagon/htp/vtcm-utils.h +16 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/worker-pool.c +1 -5
- data/ext/sources/ggml/src/ggml-hexagon/htp-drv.cpp +418 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp-drv.h +121 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp-opnode.h +272 -0
- data/ext/sources/ggml/src/ggml-hexagon/libdl.h +79 -0
- data/ext/sources/ggml/src/ggml-hexagon/libggml-htp.inf +40 -0
- data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +28 -9
- data/ext/sources/ggml/src/ggml-impl.h +68 -1
- data/ext/sources/ggml/src/ggml-metal/CMakeLists.txt +10 -10
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-common.cpp +13 -2
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.h +8 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.m +147 -17
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.cpp +409 -83
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.h +54 -5
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.m +254 -52
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +254 -23
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.cpp +756 -285
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.h +7 -4
- data/ext/sources/ggml/src/ggml-metal/ggml-metal.cpp +359 -133
- data/ext/sources/ggml/src/ggml-metal/ggml-metal.metal +1867 -1123
- data/ext/sources/ggml/src/ggml-musa/CMakeLists.txt +5 -6
- data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +71 -4
- data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +14127 -5314
- data/ext/sources/ggml/src/ggml-opencl/kernels/concat.cl +97 -88
- data/ext/sources/ggml/src/ggml-opencl/kernels/cpy.cl +104 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/cumsum.cl +139 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/cvt.cl +1978 -67
- data/ext/sources/ggml/src/ggml-opencl/kernels/diag.cl +27 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/exp.cl +125 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/expm1.cl +87 -56
- data/ext/sources/ggml/src/ggml-opencl/kernels/gated_delta_net.cl +249 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_mxfp4_f32_ns.cl +306 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q4_0_f32_ns.cl +256 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q4_1_f32_ns.cl +258 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q4_k_f32_ns.cl +283 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q5_0_f32_ns.cl +260 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q5_1_f32_ns.cl +262 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q5_k_f32_ns.cl +288 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q6_k_f32_ns.cl +267 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_iq4_nl_f32.cl +150 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/{mul_mat_Ab_Bi_8x4.cl → gemm_noshuffle_q4_0_f32.cl} +1 -1
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q4_1_f32.cl +132 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q4_k_f32.cl +172 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q5_0_f32.cl +131 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q5_1_f32.cl +134 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q5_k_f32.cl +176 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q6_k_f32.cl +140 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q8_0_f32.cl +129 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_xmem_f16_f32_os8.cl +233 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_mxfp4_f32_ns.cl +165 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q4_0_f32_ns.cl +120 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q4_1_f32_ns.cl +123 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q4_k_f32_ns.cl +155 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q5_0_f32_ns.cl +123 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q5_1_f32_ns.cl +125 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q5_k_f32_ns.cl +160 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q6_k_f32_ns.cl +141 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_iq4_nl_f32.cl +302 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/{gemv_noshuffle_general.cl → gemv_noshuffle_q4_0_f32.cl} +5 -5
- data/ext/sources/ggml/src/ggml-opencl/kernels/{gemv_noshuffle.cl → gemv_noshuffle_q4_0_f32_spec.cl} +5 -5
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q4_1_f32.cl +283 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q4_k_f32.cl +318 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q5_0_f32.cl +291 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q5_1_f32.cl +294 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q5_k_f32.cl +326 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q6_k_f32.cl +293 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q8_0_f32.cl +195 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/get_rows.cl +15 -9
- data/ext/sources/ggml/src/ggml-opencl/kernels/l2_norm.cl +71 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mean.cl +114 -13
- data/ext/sources/ggml/src/ggml-opencl/kernels/moe_reorder_b.cl +30 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/moe_sort_by_expert.cl +82 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_iq4_nl_f32_l4_lm.cl +171 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q4_0_f32_l4_lm.cl +163 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q4_1_f32_l4_lm.cl +165 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q4_k_f32_l4_lm.cl +179 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q5_0_f32_l4_lm.cl +173 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q5_1_f32_l4_lm.cl +175 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q5_k_f32_l4_lm.cl +192 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q6_k_f32_l4_lm.cl +158 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_iq4_nl_f32.cl +164 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_iq4_nl_f32_flat.cl +202 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32.cl +219 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32_flat.cl +229 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_k_f32.cl +180 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_k_f32_flat.cl +196 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_0_f32.cl +241 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_0_f32_flat.cl +243 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_1_f32.cl +243 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_1_f32_flat.cl +247 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_k_f32.cl +187 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_k_f32_flat.cl +203 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/{mul_mv_q6_k.cl → mul_mv_q6_k_f32.cl} +4 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32_flat.cl +178 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/neg.cl +125 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/repeat.cl +31 -32
- data/ext/sources/ggml/src/ggml-opencl/kernels/scale.cl +14 -4
- data/ext/sources/ggml/src/ggml-opencl/kernels/softplus.cl +88 -60
- data/ext/sources/ggml/src/ggml-opencl/kernels/solve_tri.cl +51 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/sum_rows.cl +114 -13
- data/ext/sources/ggml/src/ggml-opencl/kernels/tanh.cl +94 -48
- data/ext/sources/ggml/src/ggml-opencl/kernels/transpose.cl +26 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/tri.cl +32 -0
- data/ext/sources/ggml/src/ggml-openvino/.clang-format +154 -0
- data/ext/sources/ggml/src/ggml-openvino/CMakeLists.txt +22 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-decoder.cpp +985 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-decoder.h +294 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-openvino-extra.cpp +380 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-openvino-extra.h +182 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-openvino.cpp +1132 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-quants.cpp +956 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-quants.h +153 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/decoder.h +74 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/frontend.cpp +27 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/frontend.h +23 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/input_model.cpp +17 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/input_model.h +29 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/node_context.h +112 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/cont.cpp +48 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/cpy.cpp +21 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp +90 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/get_rows.cpp +69 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/glu_geglu.cpp +61 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp +62 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/mulmat.cpp +90 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/permute.cpp +102 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/reshape.cpp +83 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp +46 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/rope.cpp +149 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/scale.cpp +41 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/set_rows.cpp +76 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/softmax.cpp +89 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/transpose.cpp +23 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/unary_gelu.cpp +25 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/unary_silu.cpp +27 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/view.cpp +53 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op_table.cpp +47 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op_table.h +40 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.cpp +60 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.h +17 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/mark_decompression_convert_constant_folding.h +29 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/squeeze_matmul.cpp +58 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/squeeze_matmul.h +17 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/rt_info/weightless_caching_attributes.hpp +41 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/translate_session.cpp +317 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/translate_session.h +28 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/utils.cpp +257 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/utils.h +86 -0
- data/ext/sources/ggml/src/ggml-openvino/utils.cpp +880 -0
- data/ext/sources/ggml/src/ggml-openvino/utils.h +143 -0
- data/ext/sources/ggml/src/ggml-opt.cpp +1 -0
- data/ext/sources/ggml/src/ggml-quants.c +385 -119
- data/ext/sources/ggml/src/ggml-quants.h +6 -0
- data/ext/sources/ggml/src/ggml-rpc/CMakeLists.txt +24 -0
- data/ext/sources/ggml/src/ggml-rpc/ggml-rpc.cpp +167 -311
- data/ext/sources/ggml/src/ggml-rpc/transport.cpp +683 -0
- data/ext/sources/ggml/src/ggml-rpc/transport.h +34 -0
- data/ext/sources/ggml/src/ggml-sycl/CMakeLists.txt +64 -91
- data/ext/sources/ggml/src/ggml-sycl/add-id.cpp +5 -1
- data/ext/sources/ggml/src/ggml-sycl/backend.hpp +4 -1
- data/ext/sources/ggml/src/ggml-sycl/binbcast.cpp +21 -20
- data/ext/sources/ggml/src/ggml-sycl/common.cpp +74 -2
- data/ext/sources/ggml/src/ggml-sycl/common.hpp +356 -11
- data/ext/sources/ggml/src/ggml-sycl/convert.cpp +184 -14
- data/ext/sources/ggml/src/ggml-sycl/convert.hpp +31 -1
- data/ext/sources/ggml/src/ggml-sycl/count-equal.cpp +1 -1
- data/ext/sources/ggml/src/ggml-sycl/cumsum.cpp +148 -0
- data/ext/sources/ggml/src/ggml-sycl/cumsum.hpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/dequantize.hpp +663 -0
- data/ext/sources/ggml/src/ggml-sycl/diag.cpp +67 -0
- data/ext/sources/ggml/src/ggml-sycl/diag.hpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/dmmv.cpp +586 -6
- data/ext/sources/ggml/src/ggml-sycl/dpct/helper.hpp +791 -47
- data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +77 -156
- data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +2 -2
- data/ext/sources/ggml/src/ggml-sycl/fattn-buffers.cpp +56 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn-buffers.hpp +63 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn-common.hpp +1181 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn-tile.cpp +59 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn-tile.hpp +1246 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn-vec.hpp +674 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn.cpp +227 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn.hpp +22 -0
- data/ext/sources/ggml/src/ggml-sycl/fill.cpp +55 -0
- data/ext/sources/ggml/src/ggml-sycl/fill.hpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/gated_delta_net.cpp +347 -0
- data/ext/sources/ggml/src/ggml-sycl/gated_delta_net.hpp +9 -0
- data/ext/sources/ggml/src/ggml-sycl/gemm.hpp +3 -0
- data/ext/sources/ggml/src/ggml-sycl/getrows.cpp +79 -3
- data/ext/sources/ggml/src/ggml-sycl/ggml-sycl.cpp +1134 -236
- data/ext/sources/ggml/src/ggml-sycl/im2col.cpp +353 -89
- data/ext/sources/ggml/src/ggml-sycl/im2col.hpp +5 -3
- data/ext/sources/ggml/src/ggml-sycl/mmvq.cpp +1344 -26
- data/ext/sources/ggml/src/ggml-sycl/mmvq.hpp +16 -0
- data/ext/sources/ggml/src/ggml-sycl/norm.cpp +65 -66
- data/ext/sources/ggml/src/ggml-sycl/outprod.cpp +3 -3
- data/ext/sources/ggml/src/ggml-sycl/pad.cpp +27 -27
- data/ext/sources/ggml/src/ggml-sycl/presets.hpp +3 -0
- data/ext/sources/ggml/src/ggml-sycl/quants.hpp +72 -1
- data/ext/sources/ggml/src/ggml-sycl/rope.cpp +450 -287
- data/ext/sources/ggml/src/ggml-sycl/rope.hpp +6 -0
- data/ext/sources/ggml/src/ggml-sycl/set_rows.cpp +7 -1
- data/ext/sources/ggml/src/ggml-sycl/softmax.cpp +6 -6
- data/ext/sources/ggml/src/ggml-sycl/solve_tri.cpp +172 -0
- data/ext/sources/ggml/src/ggml-sycl/solve_tri.hpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/ssm_conv.cpp +6 -1
- data/ext/sources/ggml/src/ggml-sycl/ssm_scan.cpp +156 -0
- data/ext/sources/ggml/src/ggml-sycl/ssm_scan.hpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/sycl_hw.cpp +62 -10
- data/ext/sources/ggml/src/ggml-sycl/sycl_hw.hpp +18 -6
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq112-dv112.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq128-dv128.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq256-dv256.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq40-dv40.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq512-dv512.cpp +6 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq576-dv512.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq64-dv64.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq72-dv72.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq80-dv80.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq96-dv96.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-f16.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_0.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_1.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_0.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_1.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q8_0.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-f16.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_0.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_1.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_0.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_1.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q8_0.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-f16.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_0.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_1.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_0.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_1.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q8_0.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-f16.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_0.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_1.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_0.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_1.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q8_0.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-f16.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_0.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_1.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_0.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_1.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q8_0.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-f16.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_0.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_1.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_0.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_1.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q8_0.cpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/type.hpp +112 -0
- data/ext/sources/ggml/src/ggml-sycl/upscale.cpp +410 -0
- data/ext/sources/ggml/src/ggml-sycl/upscale.hpp +9 -0
- data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +228 -53
- data/ext/sources/ggml/src/ggml-sycl/wkv.cpp +1 -1
- data/ext/sources/ggml/src/ggml-virtgpu/CMakeLists.txt +70 -0
- data/ext/sources/ggml/src/ggml-virtgpu/apir_cs_ggml-rpc-front.cpp +87 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/CMakeLists.txt +21 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/apir_cs_ggml-rpc-back.cpp +115 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-convert.h +13 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-backend.cpp +102 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer-type.cpp +105 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer.cpp +179 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-device.cpp +148 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched.cpp +51 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched.gen.h +73 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched.h +27 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-virgl-apir.h +32 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend.cpp +144 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/api_remoting.h +95 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_backend.gen.h +94 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_backend.h +50 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_cs.h +378 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_cs_ggml.h +232 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_cs_rpc.h +58 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-buffer-type.cpp +81 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-buffer.cpp +123 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-device.cpp +160 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-reg.cpp +213 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend.cpp +71 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-remoting.h +71 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggmlremoting_functions.yaml +166 -0
- data/ext/sources/ggml/src/ggml-virtgpu/include/apir_hw.h +9 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-apir.h +15 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-backend.cpp +58 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-buffer-type.cpp +110 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-buffer.cpp +173 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-device.cpp +192 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-impl.h +36 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward.gen.h +53 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-shm.cpp +99 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-shm.h +23 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-utils.cpp +179 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-utils.h +86 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu.cpp +545 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu.h +115 -0
- data/ext/sources/ggml/src/ggml-vulkan/CMakeLists.txt +12 -1
- data/ext/sources/ggml/src/ggml-vulkan/ggml-vulkan.cpp +3250 -940
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/acc.comp +16 -8
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/contig_copy.comp +6 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +146 -13
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy.comp +3 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +25 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.glsl +88 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.glsl +643 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_nvfp4.comp +32 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q1_0.comp +29 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/diag.comp +0 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dot_product_funcs.glsl +27 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/elu.comp +27 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/exp.comp +0 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/feature-tests/coopmat2_decode_vector.comp +7 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +533 -180
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.glsl +113 -68
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +412 -222
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +222 -83
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_dequant.glsl +131 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_mask_opt.comp +162 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_mmq_funcs.glsl +203 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +9 -8
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/fwht.comp +115 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gated_delta_net.comp +189 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/generic_binary_head.glsl +0 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.glsl +10 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_main.glsl +16 -6
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +76 -54
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col_3d.comp +0 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/l2_norm.comp +12 -9
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/log.comp +0 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp +122 -27
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.glsl +20 -17
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iface.glsl +6 -6
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp +1 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq_funcs.glsl +88 -55
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +22 -20
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +51 -14
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.glsl +159 -125
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_id_funcs.glsl +3 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +5 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.glsl +8 -8
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_shmem_types.glsl +24 -9
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp +0 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +2 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl +39 -63
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.glsl +0 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +7 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +7 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +7 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl +13 -7
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_vision.comp +7 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sgn.comp +21 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/snake.comp +49 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/ssm_conv.comp +27 -11
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/tri.comp +0 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/types.glsl +79 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +193 -149
- data/ext/sources/ggml/src/ggml-webgpu/CMakeLists.txt +5 -2
- data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp +3221 -97
- data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu.cpp +3493 -1997
- data/ext/sources/ggml/src/ggml-webgpu/pre_wgsl.hpp +37 -7
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/add_id.wgsl +64 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/argmax.wgsl +72 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/argsort.wgsl +106 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/argsort_merge.wgsl +134 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary.wgsl +142 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +115 -141
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/concat.wgsl +93 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/conv2d.wgsl +165 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{cpy.tmpl.wgsl → cpy.wgsl} +25 -44
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cumsum.wgsl +66 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn.wgsl +198 -230
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_quant_staging.tmpl +124 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_tile.wgsl +397 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_vec_blk.wgsl +101 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_vec_reduce.wgsl +84 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_vec_split.wgsl +619 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/gated_delta_net.wgsl +149 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{get_rows.tmpl.wgsl → get_rows.wgsl} +234 -335
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/glu.wgsl +155 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/im2col.wgsl +101 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl +871 -42
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_id.wgsl +195 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_id_gather.wgsl +52 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_id_vec.wgsl +154 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.wgsl +149 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{mul_mat_subgroup_matrix.tmpl.wgsl → mul_mat_subgroup_matrix.wgsl} +36 -138
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.wgsl +151 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec_acc.tmpl +1432 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec_q_acc.tmpl +303 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/pad.wgsl +86 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/quant_inner_loops.tmpl +21 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/quantize_q8.wgsl +173 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/repeat.wgsl +67 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm_mul.wgsl +152 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{rope.tmpl.wgsl → rope.wgsl} +71 -142
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/row_norm.wgsl +153 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{scale.tmpl.wgsl → scale.wgsl} +15 -40
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set.wgsl +109 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +39 -12
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows_quant.wgsl +224 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{soft_max.tmpl.wgsl → soft_max.wgsl} +106 -206
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/solve_tri.wgsl +121 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/ssm_conv.wgsl +65 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/ssm_scan.wgsl +193 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/sum_rows.wgsl +55 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/unary.wgsl +213 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/upscale.wgsl +240 -0
- data/ext/sources/ggml/src/ggml-zdnn/ggml-zdnn.cpp +24 -15
- data/ext/sources/ggml/src/ggml-zendnn/CMakeLists.txt +31 -32
- data/ext/sources/ggml/src/ggml-zendnn/ggml-zendnn.cpp +253 -16
- data/ext/sources/ggml/src/ggml.c +268 -52
- data/ext/sources/ggml/src/gguf.cpp +377 -47
- data/ext/sources/include/parakeet.h +342 -0
- data/ext/sources/include/whisper.h +10 -0
- data/ext/sources/media/matmul.png +0 -0
- data/ext/sources/src/CMakeLists.txt +23 -0
- data/ext/sources/src/parakeet-arch.h +188 -0
- data/ext/sources/src/parakeet.cpp +3838 -0
- data/ext/sources/src/whisper.cpp +62 -40
- data/extsources.rb +26 -10
- data/lib/whisper/log_settable.rb +36 -0
- data/lib/whisper/model/uri.rb +13 -1
- data/lib/whisper/output.rb +74 -0
- data/sig/whisper.rbs +445 -55
- data/test/helper.rb +2 -0
- data/test/jfk_reader/jfk_reader.c +50 -7
- data/test/test_callback.rb +1 -0
- data/test/test_context_params.rb +82 -0
- data/test/test_package.rb +6 -5
- data/test/test_parakeet.rb +28 -0
- data/test/test_parakeet_callback.rb +107 -0
- data/test/test_parakeet_context.rb +116 -0
- data/test/test_parakeet_context_params.rb +24 -0
- data/test/test_parakeet_model.rb +21 -0
- data/test/test_parakeet_params.rb +78 -0
- data/test/test_parakeet_segment.rb +42 -0
- data/test/test_parakeet_token.rb +73 -0
- data/test/test_params.rb +2 -0
- data/test/test_token.rb +11 -0
- data/test/test_vad_context.rb +58 -8
- data/test/test_vad_segment.rb +1 -1
- data/test/test_whisper.rb +44 -6
- data/whispercpp.gemspec +2 -2
- metadata +426 -280
- data/ext/sources/bindings/javascript/CMakeLists.txt +0 -41
- data/ext/sources/bindings/javascript/emscripten.cpp +0 -93
- data/ext/sources/bindings/javascript/libwhisper.worker.js +0 -1
- data/ext/sources/bindings/javascript/package.json +0 -26
- data/ext/sources/bindings/javascript/whisper.js +0 -19
- data/ext/sources/examples/addon.node/CMakeLists.txt +0 -31
- data/ext/sources/examples/addon.node/__test__/whisper.spec.js +0 -133
- data/ext/sources/examples/addon.node/addon.cpp +0 -557
- data/ext/sources/examples/addon.node/index.js +0 -59
- data/ext/sources/examples/addon.node/package.json +0 -16
- data/ext/sources/examples/addon.node/vad-example.js +0 -132
- data/ext/sources/examples/bench.wasm/CMakeLists.txt +0 -49
- data/ext/sources/examples/bench.wasm/emscripten.cpp +0 -87
- data/ext/sources/examples/bench.wasm/index-tmpl.html +0 -285
- data/ext/sources/examples/coi-serviceworker.js +0 -146
- data/ext/sources/examples/command/CMakeLists.txt +0 -10
- data/ext/sources/examples/command/command.cpp +0 -802
- data/ext/sources/examples/command/commands.txt +0 -9
- data/ext/sources/examples/command.wasm/CMakeLists.txt +0 -50
- data/ext/sources/examples/command.wasm/emscripten.cpp +0 -327
- data/ext/sources/examples/command.wasm/index-tmpl.html +0 -415
- data/ext/sources/examples/generate-karaoke.sh +0 -57
- data/ext/sources/examples/helpers.js +0 -191
- data/ext/sources/examples/livestream.sh +0 -112
- data/ext/sources/examples/lsp/CMakeLists.txt +0 -10
- data/ext/sources/examples/lsp/lsp.cpp +0 -471
- data/ext/sources/examples/lsp/whisper.vim +0 -362
- data/ext/sources/examples/python/test_whisper_processor.py +0 -7
- data/ext/sources/examples/python/whisper_processor.py +0 -54
- data/ext/sources/examples/server/bench.js +0 -29
- data/ext/sources/examples/server.py +0 -120
- data/ext/sources/examples/stream/CMakeLists.txt +0 -10
- data/ext/sources/examples/stream/stream.cpp +0 -437
- data/ext/sources/examples/stream.wasm/CMakeLists.txt +0 -49
- data/ext/sources/examples/stream.wasm/emscripten.cpp +0 -216
- data/ext/sources/examples/stream.wasm/index-tmpl.html +0 -491
- data/ext/sources/examples/sycl/CMakeLists.txt +0 -9
- data/ext/sources/examples/sycl/build.sh +0 -22
- data/ext/sources/examples/sycl/ls-sycl-device.cpp +0 -11
- data/ext/sources/examples/sycl/run-whisper.sh +0 -17
- data/ext/sources/examples/talk-llama/CMakeLists.txt +0 -47
- data/ext/sources/examples/talk-llama/eleven-labs.py +0 -80
- data/ext/sources/examples/talk-llama/llama-adapter.cpp +0 -494
- data/ext/sources/examples/talk-llama/llama-adapter.h +0 -88
- data/ext/sources/examples/talk-llama/llama-arch.cpp +0 -2559
- data/ext/sources/examples/talk-llama/llama-arch.h +0 -586
- data/ext/sources/examples/talk-llama/llama-batch.cpp +0 -917
- data/ext/sources/examples/talk-llama/llama-batch.h +0 -173
- data/ext/sources/examples/talk-llama/llama-chat.cpp +0 -876
- data/ext/sources/examples/talk-llama/llama-chat.h +0 -70
- data/ext/sources/examples/talk-llama/llama-context.cpp +0 -3645
- data/ext/sources/examples/talk-llama/llama-context.h +0 -360
- data/ext/sources/examples/talk-llama/llama-cparams.cpp +0 -5
- data/ext/sources/examples/talk-llama/llama-cparams.h +0 -42
- data/ext/sources/examples/talk-llama/llama-grammar.cpp +0 -1464
- data/ext/sources/examples/talk-llama/llama-grammar.h +0 -194
- data/ext/sources/examples/talk-llama/llama-graph.cpp +0 -2282
- data/ext/sources/examples/talk-llama/llama-graph.h +0 -910
- data/ext/sources/examples/talk-llama/llama-hparams.cpp +0 -241
- data/ext/sources/examples/talk-llama/llama-hparams.h +0 -284
- data/ext/sources/examples/talk-llama/llama-impl.cpp +0 -171
- data/ext/sources/examples/talk-llama/llama-impl.h +0 -63
- data/ext/sources/examples/talk-llama/llama-io.cpp +0 -15
- data/ext/sources/examples/talk-llama/llama-io.h +0 -35
- data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.cpp +0 -328
- data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.h +0 -137
- data/ext/sources/examples/talk-llama/llama-kv-cache.cpp +0 -2100
- data/ext/sources/examples/talk-llama/llama-kv-cache.h +0 -390
- data/ext/sources/examples/talk-llama/llama-kv-cells.h +0 -533
- data/ext/sources/examples/talk-llama/llama-memory-hybrid.cpp +0 -268
- data/ext/sources/examples/talk-llama/llama-memory-hybrid.h +0 -139
- data/ext/sources/examples/talk-llama/llama-memory-recurrent.cpp +0 -1167
- data/ext/sources/examples/talk-llama/llama-memory-recurrent.h +0 -182
- data/ext/sources/examples/talk-llama/llama-memory.cpp +0 -59
- data/ext/sources/examples/talk-llama/llama-memory.h +0 -122
- data/ext/sources/examples/talk-llama/llama-mmap.cpp +0 -735
- data/ext/sources/examples/talk-llama/llama-mmap.h +0 -73
- data/ext/sources/examples/talk-llama/llama-model-loader.cpp +0 -1247
- data/ext/sources/examples/talk-llama/llama-model-loader.h +0 -176
- data/ext/sources/examples/talk-llama/llama-model-saver.cpp +0 -285
- data/ext/sources/examples/talk-llama/llama-model-saver.h +0 -37
- data/ext/sources/examples/talk-llama/llama-model.cpp +0 -8338
- data/ext/sources/examples/talk-llama/llama-model.h +0 -544
- data/ext/sources/examples/talk-llama/llama-quant.cpp +0 -1072
- data/ext/sources/examples/talk-llama/llama-quant.h +0 -1
- data/ext/sources/examples/talk-llama/llama-sampling.cpp +0 -3771
- data/ext/sources/examples/talk-llama/llama-sampling.h +0 -44
- data/ext/sources/examples/talk-llama/llama-vocab.cpp +0 -3900
- data/ext/sources/examples/talk-llama/llama-vocab.h +0 -182
- data/ext/sources/examples/talk-llama/llama.cpp +0 -1140
- data/ext/sources/examples/talk-llama/llama.h +0 -1540
- data/ext/sources/examples/talk-llama/models/afmoe.cpp +0 -191
- data/ext/sources/examples/talk-llama/models/apertus.cpp +0 -125
- data/ext/sources/examples/talk-llama/models/arcee.cpp +0 -135
- data/ext/sources/examples/talk-llama/models/arctic.cpp +0 -138
- data/ext/sources/examples/talk-llama/models/arwkv7.cpp +0 -86
- data/ext/sources/examples/talk-llama/models/baichuan.cpp +0 -122
- data/ext/sources/examples/talk-llama/models/bailingmoe.cpp +0 -144
- data/ext/sources/examples/talk-llama/models/bailingmoe2.cpp +0 -135
- data/ext/sources/examples/talk-llama/models/bert.cpp +0 -178
- data/ext/sources/examples/talk-llama/models/bitnet.cpp +0 -160
- data/ext/sources/examples/talk-llama/models/bloom.cpp +0 -101
- data/ext/sources/examples/talk-llama/models/chameleon.cpp +0 -178
- data/ext/sources/examples/talk-llama/models/chatglm.cpp +0 -132
- data/ext/sources/examples/talk-llama/models/codeshell.cpp +0 -111
- data/ext/sources/examples/talk-llama/models/cogvlm.cpp +0 -102
- data/ext/sources/examples/talk-llama/models/cohere2-iswa.cpp +0 -134
- data/ext/sources/examples/talk-llama/models/command-r.cpp +0 -122
- data/ext/sources/examples/talk-llama/models/dbrx.cpp +0 -123
- data/ext/sources/examples/talk-llama/models/deci.cpp +0 -135
- data/ext/sources/examples/talk-llama/models/deepseek.cpp +0 -144
- data/ext/sources/examples/talk-llama/models/deepseek2.cpp +0 -259
- data/ext/sources/examples/talk-llama/models/dots1.cpp +0 -134
- data/ext/sources/examples/talk-llama/models/dream.cpp +0 -105
- data/ext/sources/examples/talk-llama/models/ernie4-5-moe.cpp +0 -150
- data/ext/sources/examples/talk-llama/models/ernie4-5.cpp +0 -110
- data/ext/sources/examples/talk-llama/models/exaone.cpp +0 -114
- data/ext/sources/examples/talk-llama/models/exaone4.cpp +0 -123
- data/ext/sources/examples/talk-llama/models/falcon-h1.cpp +0 -113
- data/ext/sources/examples/talk-llama/models/falcon.cpp +0 -120
- data/ext/sources/examples/talk-llama/models/gemma-embedding.cpp +0 -116
- data/ext/sources/examples/talk-llama/models/gemma.cpp +0 -112
- data/ext/sources/examples/talk-llama/models/gemma2-iswa.cpp +0 -128
- data/ext/sources/examples/talk-llama/models/gemma3.cpp +0 -155
- data/ext/sources/examples/talk-llama/models/gemma3n-iswa.cpp +0 -384
- data/ext/sources/examples/talk-llama/models/glm4-moe.cpp +0 -170
- data/ext/sources/examples/talk-llama/models/glm4.cpp +0 -150
- data/ext/sources/examples/talk-llama/models/gpt2.cpp +0 -105
- data/ext/sources/examples/talk-llama/models/gptneox.cpp +0 -144
- data/ext/sources/examples/talk-llama/models/granite-hybrid.cpp +0 -196
- data/ext/sources/examples/talk-llama/models/granite.cpp +0 -211
- data/ext/sources/examples/talk-llama/models/graph-context-mamba.cpp +0 -283
- data/ext/sources/examples/talk-llama/models/grok.cpp +0 -159
- data/ext/sources/examples/talk-llama/models/grovemoe.cpp +0 -141
- data/ext/sources/examples/talk-llama/models/hunyuan-dense.cpp +0 -132
- data/ext/sources/examples/talk-llama/models/hunyuan-moe.cpp +0 -154
- data/ext/sources/examples/talk-llama/models/internlm2.cpp +0 -120
- data/ext/sources/examples/talk-llama/models/jais.cpp +0 -86
- data/ext/sources/examples/talk-llama/models/jamba.cpp +0 -106
- data/ext/sources/examples/talk-llama/models/lfm2.cpp +0 -175
- data/ext/sources/examples/talk-llama/models/llada-moe.cpp +0 -122
- data/ext/sources/examples/talk-llama/models/llada.cpp +0 -99
- data/ext/sources/examples/talk-llama/models/llama-iswa.cpp +0 -178
- data/ext/sources/examples/talk-llama/models/llama.cpp +0 -168
- data/ext/sources/examples/talk-llama/models/maincoder.cpp +0 -117
- data/ext/sources/examples/talk-llama/models/mamba.cpp +0 -55
- data/ext/sources/examples/talk-llama/models/mimo2-iswa.cpp +0 -123
- data/ext/sources/examples/talk-llama/models/minicpm3.cpp +0 -199
- data/ext/sources/examples/talk-llama/models/minimax-m2.cpp +0 -124
- data/ext/sources/examples/talk-llama/models/mistral3.cpp +0 -160
- data/ext/sources/examples/talk-llama/models/models.h +0 -569
- data/ext/sources/examples/talk-llama/models/modern-bert.cpp +0 -116
- data/ext/sources/examples/talk-llama/models/mpt.cpp +0 -126
- data/ext/sources/examples/talk-llama/models/nemotron-h.cpp +0 -150
- data/ext/sources/examples/talk-llama/models/nemotron.cpp +0 -122
- data/ext/sources/examples/talk-llama/models/neo-bert.cpp +0 -104
- data/ext/sources/examples/talk-llama/models/olmo.cpp +0 -121
- data/ext/sources/examples/talk-llama/models/olmo2.cpp +0 -150
- data/ext/sources/examples/talk-llama/models/olmoe.cpp +0 -124
- data/ext/sources/examples/talk-llama/models/openai-moe-iswa.cpp +0 -127
- data/ext/sources/examples/talk-llama/models/openelm.cpp +0 -124
- data/ext/sources/examples/talk-llama/models/orion.cpp +0 -123
- data/ext/sources/examples/talk-llama/models/pangu-embedded.cpp +0 -121
- data/ext/sources/examples/talk-llama/models/phi2.cpp +0 -121
- data/ext/sources/examples/talk-llama/models/phi3.cpp +0 -152
- data/ext/sources/examples/talk-llama/models/plamo.cpp +0 -110
- data/ext/sources/examples/talk-llama/models/plamo2.cpp +0 -316
- data/ext/sources/examples/talk-llama/models/plamo3.cpp +0 -128
- data/ext/sources/examples/talk-llama/models/plm.cpp +0 -168
- data/ext/sources/examples/talk-llama/models/qwen.cpp +0 -108
- data/ext/sources/examples/talk-llama/models/qwen2.cpp +0 -126
- data/ext/sources/examples/talk-llama/models/qwen2moe.cpp +0 -151
- data/ext/sources/examples/talk-llama/models/qwen2vl.cpp +0 -117
- data/ext/sources/examples/talk-llama/models/qwen3.cpp +0 -117
- data/ext/sources/examples/talk-llama/models/qwen3moe.cpp +0 -124
- data/ext/sources/examples/talk-llama/models/qwen3next.cpp +0 -873
- data/ext/sources/examples/talk-llama/models/qwen3vl-moe.cpp +0 -149
- data/ext/sources/examples/talk-llama/models/qwen3vl.cpp +0 -141
- data/ext/sources/examples/talk-llama/models/refact.cpp +0 -94
- data/ext/sources/examples/talk-llama/models/rnd1.cpp +0 -126
- data/ext/sources/examples/talk-llama/models/rwkv6-base.cpp +0 -162
- data/ext/sources/examples/talk-llama/models/rwkv6.cpp +0 -94
- data/ext/sources/examples/talk-llama/models/rwkv6qwen2.cpp +0 -86
- data/ext/sources/examples/talk-llama/models/rwkv7-base.cpp +0 -135
- data/ext/sources/examples/talk-llama/models/rwkv7.cpp +0 -90
- data/ext/sources/examples/talk-llama/models/seed-oss.cpp +0 -124
- data/ext/sources/examples/talk-llama/models/smallthinker.cpp +0 -126
- data/ext/sources/examples/talk-llama/models/smollm3.cpp +0 -128
- data/ext/sources/examples/talk-llama/models/stablelm.cpp +0 -146
- data/ext/sources/examples/talk-llama/models/starcoder.cpp +0 -100
- data/ext/sources/examples/talk-llama/models/starcoder2.cpp +0 -121
- data/ext/sources/examples/talk-llama/models/t5-dec.cpp +0 -166
- data/ext/sources/examples/talk-llama/models/t5-enc.cpp +0 -96
- data/ext/sources/examples/talk-llama/models/wavtokenizer-dec.cpp +0 -149
- data/ext/sources/examples/talk-llama/models/xverse.cpp +0 -108
- data/ext/sources/examples/talk-llama/prompts/talk-alpaca.txt +0 -23
- data/ext/sources/examples/talk-llama/speak +0 -40
- data/ext/sources/examples/talk-llama/speak.bat +0 -1
- data/ext/sources/examples/talk-llama/speak.ps1 +0 -14
- data/ext/sources/examples/talk-llama/talk-llama.cpp +0 -813
- data/ext/sources/examples/talk-llama/unicode-data.cpp +0 -7034
- data/ext/sources/examples/talk-llama/unicode-data.h +0 -20
- data/ext/sources/examples/talk-llama/unicode.cpp +0 -1147
- data/ext/sources/examples/talk-llama/unicode.h +0 -111
- data/ext/sources/examples/wchess/CMakeLists.txt +0 -10
- data/ext/sources/examples/wchess/libwchess/CMakeLists.txt +0 -19
- data/ext/sources/examples/wchess/libwchess/Chessboard.cpp +0 -803
- data/ext/sources/examples/wchess/libwchess/Chessboard.h +0 -33
- data/ext/sources/examples/wchess/libwchess/WChess.cpp +0 -193
- data/ext/sources/examples/wchess/libwchess/WChess.h +0 -63
- data/ext/sources/examples/wchess/libwchess/test-chessboard.cpp +0 -117
- data/ext/sources/examples/wchess/wchess.cmd/CMakeLists.txt +0 -8
- data/ext/sources/examples/wchess/wchess.cmd/wchess.cmd.cpp +0 -253
- data/ext/sources/examples/whisper.wasm/CMakeLists.txt +0 -50
- data/ext/sources/examples/whisper.wasm/emscripten.cpp +0 -118
- data/ext/sources/examples/whisper.wasm/index-tmpl.html +0 -659
- data/ext/sources/ggml/cmake/BuildTypes.cmake +0 -54
- data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm-ppc.h +0 -333
- data/ext/sources/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +0 -99
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp-dma.h +0 -157
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp-msg.h +0 -165
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-exp.c +0 -94
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-inverse.c +0 -72
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sigmoid.c +0 -49
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-utils.c +0 -1020
- data/ext/sources/ggml/src/ggml-hexagon/htp/ops-utils.h +0 -149
- data/ext/sources/ggml/src/ggml-hexagon/htp-utils.c +0 -454
- data/ext/sources/ggml/src/ggml-hexagon/htp-utils.h +0 -221
- data/ext/sources/ggml/src/ggml-hexagon/op-desc.h +0 -153
- data/ext/sources/ggml/src/ggml-opencl/kernels/embed_kernel.py +0 -26
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rte.glsl +0 -5
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/bin_op.tmpl.wgsl +0 -188
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary_head.tmpl +0 -45
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +0 -147
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/glu.tmpl.wgsl +0 -323
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl +0 -907
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.tmpl.wgsl +0 -247
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.tmpl.wgsl +0 -267
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm.wgsl +0 -123
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.tmpl.wgsl +0 -112
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/unary_op.wgsl +0 -483
- data/ext/sources/tests/CMakeLists.txt +0 -112
- data/ext/sources/tests/earnings21/eval.mk +0 -58
- data/ext/sources/tests/earnings21/eval.py +0 -68
- data/ext/sources/tests/earnings21/normalizers/__init__.py +0 -2
- data/ext/sources/tests/earnings21/normalizers/basic.py +0 -80
- data/ext/sources/tests/earnings21/normalizers/english.json +0 -1741
- data/ext/sources/tests/earnings21/normalizers/english.py +0 -550
- data/ext/sources/tests/earnings21/requirements.txt +0 -6
- data/ext/sources/tests/en-0-ref.txt +0 -1
- data/ext/sources/tests/en-1-ref.txt +0 -1
- data/ext/sources/tests/en-2-ref.txt +0 -1
- data/ext/sources/tests/es-0-ref.txt +0 -1
- data/ext/sources/tests/librispeech/eval.mk +0 -39
- data/ext/sources/tests/librispeech/eval.py +0 -47
- data/ext/sources/tests/librispeech/normalizers/__init__.py +0 -2
- data/ext/sources/tests/librispeech/normalizers/basic.py +0 -80
- data/ext/sources/tests/librispeech/normalizers/english.json +0 -1741
- data/ext/sources/tests/librispeech/normalizers/english.py +0 -550
- data/ext/sources/tests/librispeech/requirements.txt +0 -6
- data/ext/sources/tests/run-tests.sh +0 -130
- data/ext/sources/tests/test-c.c +0 -3
- data/ext/sources/tests/test-vad-full.cpp +0 -56
- data/ext/sources/tests/test-vad.cpp +0 -83
- data/ext/sources/tests/test-whisper.js +0 -58
- data/lib/whisper/context.rb +0 -15
- data/lib/whisper/segment.rb +0 -58
|
@@ -7,16 +7,20 @@
|
|
|
7
7
|
|
|
8
8
|
#include <atomic>
|
|
9
9
|
#include <chrono>
|
|
10
|
-
#include <cstddef>
|
|
11
10
|
#include <mutex>
|
|
11
|
+
#include <thread>
|
|
12
|
+
#include <cstddef>
|
|
12
13
|
#include <stdexcept>
|
|
13
14
|
#include <string>
|
|
15
|
+
#include <sstream>
|
|
16
|
+
#include <iomanip>
|
|
17
|
+
#include <unordered_set>
|
|
18
|
+
#include <unordered_map>
|
|
19
|
+
#include <regex>
|
|
20
|
+
#include <queue>
|
|
14
21
|
|
|
15
22
|
#ifdef _WIN32
|
|
16
23
|
# include <sal.h>
|
|
17
|
-
# ifndef _WINDOWS
|
|
18
|
-
# define _WINDOWS
|
|
19
|
-
# endif
|
|
20
24
|
#else
|
|
21
25
|
# include <semaphore.h>
|
|
22
26
|
# include <unistd.h>
|
|
@@ -25,8 +29,6 @@
|
|
|
25
29
|
#pragma clang diagnostic ignored "-Wnested-anon-types"
|
|
26
30
|
#pragma clang diagnostic ignored "-Wgnu-anonymous-struct"
|
|
27
31
|
|
|
28
|
-
#include "htp-utils.h"
|
|
29
|
-
|
|
30
32
|
#include <AEEStdErr.h>
|
|
31
33
|
#include <dspqueue.h>
|
|
32
34
|
#include <rpcmem.h>
|
|
@@ -37,22 +39,38 @@
|
|
|
37
39
|
#include "ggml-hexagon.h"
|
|
38
40
|
#include "ggml-impl.h"
|
|
39
41
|
#include "ggml-quants.h"
|
|
40
|
-
#include "
|
|
41
|
-
#include "htp-
|
|
42
|
+
#include "htp-opnode.h"
|
|
43
|
+
#include "htp-ops.h"
|
|
42
44
|
#include "htp_iface.h"
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
static int
|
|
50
|
-
static
|
|
51
|
-
static
|
|
45
|
+
#include "htp-drv.h"
|
|
46
|
+
|
|
47
|
+
using intvec = std::vector<int>;
|
|
48
|
+
using uintvec = std::vector<unsigned int>;
|
|
49
|
+
using u32vec = std::vector<uint32_t>;
|
|
50
|
+
|
|
51
|
+
static int opt_arch = 0; // autodetect
|
|
52
|
+
static size_t opt_ndev = 1;
|
|
53
|
+
static size_t opt_nhvx = 0; // use all
|
|
54
|
+
static int opt_use_hmx = 1; // when set, enable HMX; when 0, use HVX only
|
|
55
|
+
static size_t opt_vmem = HTP_OP_MAX_VMEM_DEFAULT; // max available va space for buffer mappings
|
|
56
|
+
static size_t opt_mbuf = 1ul * 1024 * 1024 * 1024; // max buffer size
|
|
57
|
+
static int opt_etm = 0;
|
|
58
|
+
static int opt_verbose = 0;
|
|
59
|
+
static int opt_profile = 0; // profiling mode (0-disabled, 1-basic, 2-pmu)
|
|
60
|
+
static int opt_hostbuf = 1; // hostbuf ON by default
|
|
61
|
+
|
|
62
|
+
// Default PMU events, if profiling with PMU (mode=2) is enabled
|
|
63
|
+
// See https://docs.qualcomm.com/doc/80-N2040-60/topic/pmu-events.html
|
|
64
|
+
// https://docs.qualcomm.com/doc/80-N2040-61/topic/hvx-pmu-events.html
|
|
65
|
+
static u32vec opt_pmu_evt { 0x3, 0x111, 0x100, 0x105, 0x240, 0x256, 0x7D, 0x8C };
|
|
52
66
|
|
|
53
67
|
// Enable all stages by default
|
|
54
|
-
static int
|
|
55
|
-
static int
|
|
68
|
+
static int opt_opstage = HTP_OPSTAGE_QUEUE | HTP_OPSTAGE_COMPUTE;
|
|
69
|
+
static int opt_opbatch = 1024; // max number of ops in a batch
|
|
70
|
+
static int opt_opqueue = 16; // max number of pending batches
|
|
71
|
+
static int opt_oppoll = 0; // polling for batch completions
|
|
72
|
+
|
|
73
|
+
static std::regex* opt_opfilter = NULL; // regex of ops to not claim
|
|
56
74
|
|
|
57
75
|
#define HEX_VERBOSE(...) \
|
|
58
76
|
if (opt_verbose) GGML_LOG_DEBUG(__VA_ARGS__)
|
|
@@ -84,47 +102,45 @@ static const char * status_to_str(uint32_t status) {
|
|
|
84
102
|
|
|
85
103
|
// ** debug helpers
|
|
86
104
|
|
|
87
|
-
static void ggml_hexagon_dump_op_exec(const std::string &sess_name, const
|
|
105
|
+
static void ggml_hexagon_dump_op_exec(const std::string &sess_name, const htp_opnode & node, const uint32_t req_flags) {
|
|
88
106
|
if (!opt_verbose) return;
|
|
89
107
|
|
|
90
|
-
|
|
108
|
+
htp_opformat fmt(node);
|
|
91
109
|
GGML_LOG_DEBUG("ggml-hex: %s execute-op %s: %s : %s : %s : %s : %s : flags 0x%x\n", sess_name.c_str(),
|
|
92
|
-
|
|
110
|
+
node.op_name().c_str(), fmt.names, fmt.dims, fmt.types, fmt.strides, fmt.buffs, req_flags);
|
|
93
111
|
}
|
|
94
112
|
|
|
95
113
|
static void ggml_hexagon_dump_op_supp(const std::string &sess_name, const struct ggml_tensor * op, bool supp) {
|
|
96
114
|
if (!opt_verbose) return;
|
|
97
115
|
|
|
98
|
-
|
|
99
|
-
GGML_LOG_DEBUG("ggml-hex: %s supports-op %s
|
|
100
|
-
|
|
116
|
+
htp_opformat fmt(htp_opformat(htp_opnode{const_cast<ggml_tensor*>(op), {}, HTP_OP_INVALID}));
|
|
117
|
+
GGML_LOG_DEBUG("ggml-hex: %s supports-op %s: %s : %s : %s : %s : %s : %s\n", sess_name.c_str(),
|
|
118
|
+
ggml_op_desc(op), fmt.names, fmt.dims, fmt.types, fmt.strides, fmt.buffs, supp ? "yes" : "no");
|
|
101
119
|
}
|
|
102
120
|
|
|
103
|
-
static void ggml_hexagon_dump_op_prof(const std::string &sess_name, const
|
|
104
|
-
uint32_t op_usec, uint32_t op_cycles, uint32_t
|
|
121
|
+
static void ggml_hexagon_dump_op_prof(const std::string &sess_name, const htp_opnode & node,
|
|
122
|
+
uint32_t op_usec, uint32_t op_cycles, const uint32_t pmu[]) {
|
|
105
123
|
if (!opt_profile) return;
|
|
106
124
|
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
125
|
+
char pmu_str[256] = "";
|
|
126
|
+
if (opt_profile > 1) {
|
|
127
|
+
static_assert(HTP_PROF_PMU_NCNT == 8, "current implementation assumes 8 PMU counters");
|
|
128
|
+
sprintf(pmu_str, " pmu [%u,%u,%u,%u,%u,%u,%u,%u]",
|
|
129
|
+
pmu[0], pmu[1], pmu[2], pmu[3], pmu[4], pmu[5], pmu[6], pmu[7]);
|
|
130
|
+
}
|
|
131
|
+
|
|
132
|
+
htp_opformat fmt(node);
|
|
133
|
+
GGML_LOG_DEBUG("ggml-hex: %s profile-op %s: %s : %s : %s : %s : usec %u cycles %u%s\n", sess_name.c_str(),
|
|
134
|
+
node.op_name().c_str(), fmt.names, fmt.dims, fmt.types, fmt.strides, op_usec, op_cycles, pmu_str);
|
|
111
135
|
}
|
|
112
136
|
|
|
113
137
|
// ** backend sessions
|
|
114
138
|
|
|
115
|
-
struct
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
void allocate(int dev_id) noexcept(false);
|
|
120
|
-
void release() noexcept(true);
|
|
121
|
-
|
|
122
|
-
void enqueue(struct htp_general_req &req, struct dspqueue_buffer *bufs, uint32_t n_bufs, bool sync = false);
|
|
123
|
-
void flush();
|
|
124
|
-
|
|
125
|
-
ggml_backend_buffer_type buffer_type = {};
|
|
126
|
-
ggml_backend_buffer_type repack_buffer_type = {};
|
|
139
|
+
struct ggml_hexagon_opbatch;
|
|
140
|
+
struct ggml_hexagon_opqueue;
|
|
141
|
+
struct htp_opnode;
|
|
127
142
|
|
|
143
|
+
struct ggml_hexagon_session {
|
|
128
144
|
std::string name;
|
|
129
145
|
remote_handle64 handle;
|
|
130
146
|
dspqueue_t queue;
|
|
@@ -136,87 +152,28 @@ struct ggml_hexagon_session {
|
|
|
136
152
|
bool valid_handle;
|
|
137
153
|
bool valid_queue;
|
|
138
154
|
bool valid_iface;
|
|
139
|
-
std::atomic<int> op_pending;
|
|
140
|
-
uint32_t prof_usecs;
|
|
141
|
-
uint32_t prof_cycles;
|
|
142
|
-
uint32_t prof_pkts;
|
|
143
|
-
};
|
|
144
|
-
|
|
145
|
-
void ggml_hexagon_session::enqueue(struct htp_general_req &req, struct dspqueue_buffer *bufs, uint32_t n_bufs, bool sync) {
|
|
146
|
-
// Bump pending flag (cleared in the session::flush once we get the responce)
|
|
147
|
-
this->op_pending++; // atomic inc
|
|
148
|
-
|
|
149
|
-
int err = dspqueue_write(this->queue,
|
|
150
|
-
0, // flags - the framework will autoset this
|
|
151
|
-
n_bufs, // number of buffers
|
|
152
|
-
bufs, // buffer references
|
|
153
|
-
sizeof(req),
|
|
154
|
-
(const uint8_t *) &req, // Message
|
|
155
|
-
1000000 // Timeout
|
|
156
|
-
);
|
|
157
|
-
|
|
158
|
-
if (err != 0) {
|
|
159
|
-
GGML_ABORT("ggml-hex: %s dspqueue_write failed: 0x%08x\n", this->name.c_str(), (unsigned) err);
|
|
160
|
-
}
|
|
161
|
-
|
|
162
|
-
if (sync) {
|
|
163
|
-
flush();
|
|
164
|
-
}
|
|
165
|
-
}
|
|
166
|
-
|
|
167
|
-
// Flush HTP response queue i.e wait for all outstanding requests to complete
|
|
168
|
-
void ggml_hexagon_session::flush() {
|
|
169
|
-
dspqueue_t q = this->queue;
|
|
170
|
-
|
|
171
|
-
// Repeatedly read packets from the queue until it's empty. We don't
|
|
172
|
-
// necessarily get a separate callback for each packet, and new packets
|
|
173
|
-
// may arrive while we're processing the previous one.
|
|
174
|
-
|
|
175
|
-
while (this->op_pending) {
|
|
176
|
-
struct htp_general_rsp rsp;
|
|
177
|
-
uint32_t rsp_size;
|
|
178
|
-
uint32_t flags;
|
|
179
|
-
|
|
180
|
-
struct dspqueue_buffer bufs[HTP_MAX_PACKET_BUFFERS];
|
|
181
|
-
uint32_t n_bufs;
|
|
182
155
|
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
&n_bufs, // Number of buffer references
|
|
187
|
-
bufs, // Buffer references
|
|
188
|
-
sizeof(rsp), // Max message length
|
|
189
|
-
&rsp_size, // Message length
|
|
190
|
-
(uint8_t *) &rsp,
|
|
191
|
-
1000000); // Timeout
|
|
156
|
+
std::atomic<int> op_pending;
|
|
157
|
+
ggml_hexagon_opbatch* op_batch;
|
|
158
|
+
ggml_hexagon_opqueue* op_queue;
|
|
192
159
|
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
continue;
|
|
196
|
-
}
|
|
160
|
+
ggml_backend_buffer_type buffer_type = {};
|
|
161
|
+
ggml_backend_buffer_type repack_buffer_type = {};
|
|
197
162
|
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
}
|
|
163
|
+
ggml_hexagon_session(int dev_id, ggml_backend_dev_t dev) noexcept(false);
|
|
164
|
+
~ggml_hexagon_session() noexcept(true);
|
|
201
165
|
|
|
202
|
-
|
|
203
|
-
if (rsp_size != sizeof(rsp)) {
|
|
204
|
-
GGML_ABORT("ggml-hex: dspcall : bad response (size)\n");
|
|
205
|
-
}
|
|
166
|
+
const char* c_name() const { return name.c_str(); }
|
|
206
167
|
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
// TODO: handle errors
|
|
210
|
-
}
|
|
168
|
+
void allocate(int dev_id) noexcept(false);
|
|
169
|
+
void release() noexcept(true);
|
|
211
170
|
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
this->prof_cycles = rsp.prof_cycles;
|
|
215
|
-
this->prof_pkts = rsp.prof_pkts;
|
|
171
|
+
void enqueue_op(const htp_opnode & node);
|
|
172
|
+
void flush(bool all = true);
|
|
216
173
|
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
}
|
|
174
|
+
void flush_pending(bool all = false);
|
|
175
|
+
void flush_batch();
|
|
176
|
+
};
|
|
220
177
|
|
|
221
178
|
// ** backend buffers
|
|
222
179
|
|
|
@@ -230,88 +187,94 @@ struct ggml_backend_hexagon_buffer_type_context {
|
|
|
230
187
|
std::string name;
|
|
231
188
|
};
|
|
232
189
|
|
|
233
|
-
struct
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
190
|
+
struct ggml_hexagon_shared_buffer {
|
|
191
|
+
ggml_hexagon_session * sess;
|
|
192
|
+
uint8_t * base;
|
|
193
|
+
size_t size;
|
|
194
|
+
int fd;
|
|
195
|
+
bool mapped;
|
|
196
|
+
bool pinned;
|
|
197
|
+
|
|
198
|
+
void mmap() {
|
|
199
|
+
fastrpc_map_flags flags = this->pinned ? FASTRPC_MAP_FD : FASTRPC_MAP_FD_DELAYED;
|
|
238
200
|
|
|
239
|
-
int err = fastrpc_mmap(
|
|
201
|
+
int err = fastrpc_mmap(sess->domain_id, this->fd, (void *) this->base, 0, this->size, flags);
|
|
240
202
|
if (err != 0) {
|
|
241
|
-
GGML_LOG_ERROR("ggml-hex: buffer mapping failed : domain_id %d size %zu fd %d error 0x%08x\n",
|
|
242
|
-
|
|
243
|
-
|
|
203
|
+
GGML_LOG_ERROR("ggml-hex: %s buffer mapping failed : domain_id %d size %zu fd %d error 0x%08x\n", sess->c_name(),
|
|
204
|
+
sess->domain_id, this->size, this->fd, (unsigned) err);
|
|
205
|
+
throw std::runtime_error("ggml-hex: fastrpc_mmap failed (see log for details)");
|
|
244
206
|
}
|
|
245
207
|
|
|
246
|
-
|
|
247
|
-
|
|
208
|
+
HEX_VERBOSE("ggml-hex: %s mapped buffer: base %p size %zu fd %d pinned %u\n",
|
|
209
|
+
sess->c_name(), (void *) this->base, this->size, this->fd, pinned);
|
|
248
210
|
|
|
249
|
-
bool mmap() {
|
|
250
|
-
if (this->mapped) {
|
|
251
|
-
return true;
|
|
252
|
-
}
|
|
253
|
-
if (!mmap_to(this->sess)) {
|
|
254
|
-
return false;
|
|
255
|
-
}
|
|
256
211
|
this->mapped = true;
|
|
257
|
-
return true;
|
|
258
212
|
}
|
|
259
213
|
|
|
260
|
-
void
|
|
261
|
-
if (!this->mapped)
|
|
262
|
-
|
|
214
|
+
void unmap() {
|
|
215
|
+
if (!this->mapped) return;
|
|
216
|
+
|
|
217
|
+
if (!this->pinned) {
|
|
218
|
+
// HTP might still hold a reference, tell it drop it
|
|
219
|
+
htp_iface_munmap(sess->handle, this->fd);
|
|
263
220
|
}
|
|
264
221
|
|
|
265
|
-
fastrpc_munmap(
|
|
222
|
+
fastrpc_munmap(sess->domain_id, this->fd, (void *) this->base, this->size);
|
|
223
|
+
|
|
224
|
+
HEX_VERBOSE("ggml-hex: %s unmapped buffer: base %p size %zu fd %d\n", sess->c_name(),
|
|
225
|
+
(void *) this->base, size, this->fd);
|
|
226
|
+
|
|
266
227
|
this->mapped = false;
|
|
228
|
+
this->fd = -1;
|
|
267
229
|
}
|
|
268
230
|
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
if (rpcmem_alloc2) {
|
|
273
|
-
this->base = (uint8_t *) rpcmem_alloc2(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS | RPCMEM_HEAP_NOREG, size);
|
|
274
|
-
} else {
|
|
275
|
-
GGML_LOG_INFO("ggml-hex: %s rpcmem_alloc2 not found, falling back to rpcmem_alloc\n", sess->name.c_str());
|
|
276
|
-
this->base = (uint8_t *) rpcmem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS | RPCMEM_HEAP_NOREG, size);
|
|
277
|
-
}
|
|
231
|
+
void alloc(size_t size) {
|
|
232
|
+
if (this->base) return;
|
|
278
233
|
|
|
234
|
+
this->base = (uint8_t *) rpcmem_alloc2(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS, size);
|
|
279
235
|
if (!this->base) {
|
|
280
|
-
GGML_LOG_ERROR("ggml-hex: %s failed to allocate buffer : size %zu\n", sess->
|
|
236
|
+
GGML_LOG_ERROR("ggml-hex: %s failed to allocate buffer : size %zu\n", sess->c_name(), size);
|
|
281
237
|
throw std::runtime_error("ggml-hex: rpcmem_alloc failed (see log for details)");
|
|
282
238
|
}
|
|
283
239
|
|
|
284
240
|
this->fd = rpcmem_to_fd(this->base);
|
|
285
241
|
if (this->fd < 0) {
|
|
286
|
-
GGML_LOG_ERROR("ggml-hex: %s failed to get FD for buffer %p\n", sess->
|
|
287
|
-
rpcmem_free(this->base);
|
|
288
|
-
this->base = NULL;
|
|
242
|
+
GGML_LOG_ERROR("ggml-hex: %s failed to get FD for buffer %p\n", sess->c_name(), (void *) this->base);
|
|
289
243
|
throw std::runtime_error("ggml-hex: rpcmem_to_fd failed (see log for details)");
|
|
290
244
|
}
|
|
245
|
+
this->size = size;
|
|
246
|
+
|
|
247
|
+
HEX_VERBOSE("ggml-hex: %s allocated buffer: base %p size %zu fd %d pinned %d\n", sess->c_name(),
|
|
248
|
+
(void *) this->base, this->size, this->fd, (int) pinned);
|
|
249
|
+
mmap();
|
|
250
|
+
}
|
|
251
|
+
|
|
252
|
+
void free() {
|
|
253
|
+
if (!this->base) return;
|
|
254
|
+
|
|
255
|
+
unmap();
|
|
256
|
+
rpcmem_free(this->base);
|
|
257
|
+
|
|
258
|
+
HEX_VERBOSE("ggml-hex: %s freed buffer: base %p size %zu fd %d\n", sess->c_name(),
|
|
259
|
+
(void *) this->base, size, this->fd);
|
|
291
260
|
|
|
292
|
-
|
|
293
|
-
|
|
261
|
+
this->base = NULL;
|
|
262
|
+
}
|
|
294
263
|
|
|
264
|
+
ggml_hexagon_shared_buffer(ggml_hexagon_session * sess, size_t size, bool pinned = false) {
|
|
295
265
|
this->sess = sess;
|
|
296
|
-
this->size =
|
|
266
|
+
this->size = 0;
|
|
267
|
+
this->base = nullptr;
|
|
268
|
+
this->fd = -1;
|
|
297
269
|
this->mapped = false;
|
|
298
|
-
this->
|
|
299
|
-
}
|
|
270
|
+
this->pinned = pinned;
|
|
300
271
|
|
|
301
|
-
|
|
302
|
-
munmap();
|
|
303
|
-
if (this->base) {
|
|
304
|
-
rpcmem_free(this->base);
|
|
305
|
-
this->base = NULL;
|
|
306
|
-
}
|
|
272
|
+
alloc(size);
|
|
307
273
|
}
|
|
308
274
|
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
int fd;
|
|
313
|
-
bool mapped; // mmap is done
|
|
314
|
-
bool repack; // repacked buffer
|
|
275
|
+
~ggml_hexagon_shared_buffer() {
|
|
276
|
+
free();
|
|
277
|
+
}
|
|
315
278
|
};
|
|
316
279
|
|
|
317
280
|
static ggml_hexagon_session * ggml_backend_hexagon_buffer_get_sess(ggml_backend_buffer_t buffer) {
|
|
@@ -319,30 +282,26 @@ static ggml_hexagon_session * ggml_backend_hexagon_buffer_get_sess(ggml_backend_
|
|
|
319
282
|
}
|
|
320
283
|
|
|
321
284
|
static void ggml_backend_hexagon_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
|
322
|
-
auto
|
|
323
|
-
delete
|
|
285
|
+
auto sbuf = static_cast<ggml_hexagon_shared_buffer *>(buffer->context);
|
|
286
|
+
delete sbuf;
|
|
324
287
|
}
|
|
325
288
|
|
|
326
289
|
static void * ggml_backend_hexagon_buffer_get_base(ggml_backend_buffer_t buffer) {
|
|
327
|
-
auto
|
|
328
|
-
return
|
|
290
|
+
auto sbuf = static_cast<ggml_hexagon_shared_buffer *>(buffer->context);
|
|
291
|
+
return sbuf->base;
|
|
329
292
|
}
|
|
330
293
|
|
|
331
294
|
static enum ggml_status ggml_backend_hexagon_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
|
|
332
|
-
auto
|
|
333
|
-
auto sess =
|
|
295
|
+
auto sbuf = static_cast<ggml_hexagon_shared_buffer *>(buffer->context);
|
|
296
|
+
auto sess = sbuf->sess;
|
|
334
297
|
|
|
335
|
-
HEX_VERBOSE("ggml-hex: %s init-tensor %s : base %p data %p nbytes %zu usage %d
|
|
336
|
-
tensor->name, (void *)
|
|
337
|
-
(int) ctx->repack);
|
|
298
|
+
HEX_VERBOSE("ggml-hex: %s init-tensor %s : base %p data %p nbytes %zu usage %d\n", sess->c_name(),
|
|
299
|
+
tensor->name, (void *) sbuf->base, tensor->data, ggml_nbytes(tensor), (int) buffer->usage);
|
|
338
300
|
|
|
339
301
|
if (tensor->view_src != NULL && tensor->view_offs == 0) {
|
|
340
|
-
; // nothing to do for the view
|
|
341
|
-
} else {
|
|
342
|
-
if (!ctx->mapped) {
|
|
343
|
-
ctx->mmap();
|
|
344
|
-
}
|
|
302
|
+
return GGML_STATUS_SUCCESS; // nothing to do for the view
|
|
345
303
|
}
|
|
304
|
+
|
|
346
305
|
return GGML_STATUS_SUCCESS;
|
|
347
306
|
}
|
|
348
307
|
|
|
@@ -412,6 +371,7 @@ static void pack_q4_0_quants(block_q4_0 * x, const uint8_t * qs, unsigned int bi
|
|
|
412
371
|
static void repack_row_q4x4x2(uint8_t * y, const block_q4_0 * x, int64_t k) {
|
|
413
372
|
static const int qk = QK_Q4_0x4x2;
|
|
414
373
|
const int nb = (k + qk - 1) / qk; // number of blocks (padded)
|
|
374
|
+
const int nloe = k % qk; // leftovers
|
|
415
375
|
|
|
416
376
|
const int dblk_size = 8 * 2; // 8x __fp16
|
|
417
377
|
const int qblk_size = qk / 2; // int4
|
|
@@ -445,15 +405,17 @@ static void repack_row_q4x4x2(uint8_t * y, const block_q4_0 * x, int64_t k) {
|
|
|
445
405
|
unpack_q4_0_quants(qs, &x[i * 8 + 6], 6);
|
|
446
406
|
unpack_q4_0_quants(qs, &x[i * 8 + 7], 7);
|
|
447
407
|
|
|
408
|
+
bool partial = (nloe && i == nb-1);
|
|
409
|
+
|
|
448
410
|
uint8_t * q = y_q + (i * qblk_size);
|
|
449
411
|
for (int j = 0; j < qk / 2; j++) {
|
|
450
|
-
q[j] = (qs[j + 128] << 4) | qs[j];
|
|
412
|
+
q[j] = partial ? (qs[j*2+1] << 4) | qs[j*2+0] : (qs[j+128] << 4) | qs[j+000];
|
|
451
413
|
}
|
|
452
414
|
}
|
|
453
415
|
|
|
454
416
|
// Repack the scales
|
|
455
417
|
// Note: Do not combine with the loop above. For tensor sizes not multiple of 256 (QK_Q4_0x4x2)
|
|
456
|
-
// the last block is truncated and
|
|
418
|
+
// the last block is truncated and overridden by the scales.
|
|
457
419
|
for (int i = 0; i < nb; i++) {
|
|
458
420
|
// Repack the scales
|
|
459
421
|
ggml_half * d = (ggml_half *) (y_d + i * dblk_size);
|
|
@@ -467,7 +429,7 @@ static void repack_row_q4x4x2(uint8_t * y, const block_q4_0 * x, int64_t k) {
|
|
|
467
429
|
d[7] = x[i * 8 + 7].d;
|
|
468
430
|
}
|
|
469
431
|
|
|
470
|
-
if (opt_verbose >
|
|
432
|
+
if (opt_verbose > 2) {
|
|
471
433
|
for (int i = 0; i < nb; i++) {
|
|
472
434
|
dump_packed_block_q4x4x2(y, i, k);
|
|
473
435
|
}
|
|
@@ -477,6 +439,7 @@ static void repack_row_q4x4x2(uint8_t * y, const block_q4_0 * x, int64_t k) {
|
|
|
477
439
|
static void unpack_row_q4x4x2(block_q4_0 * x, const uint8_t * y, int64_t k) {
|
|
478
440
|
static const int qk = QK_Q4_0x4x2;
|
|
479
441
|
const int nb = (k + qk - 1) / qk; // number of blocks (padded)
|
|
442
|
+
const int nloe = k % qk; // leftovers
|
|
480
443
|
|
|
481
444
|
const int dblk_size = 8 * 2; // 8x __fp16
|
|
482
445
|
const int qblk_size = qk / 2; // int4
|
|
@@ -485,7 +448,7 @@ static void unpack_row_q4x4x2(block_q4_0 * x, const uint8_t * y, int64_t k) {
|
|
|
485
448
|
const uint8_t * y_q = y + 0; // quants first
|
|
486
449
|
const uint8_t * y_d = y + qrow_size; // then scales
|
|
487
450
|
|
|
488
|
-
if (opt_verbose >
|
|
451
|
+
if (opt_verbose > 2) {
|
|
489
452
|
for (int i = 0; i < nb; i++) {
|
|
490
453
|
dump_packed_block_q4x4x2(y, i, k);
|
|
491
454
|
}
|
|
@@ -495,10 +458,17 @@ static void unpack_row_q4x4x2(block_q4_0 * x, const uint8_t * y, int64_t k) {
|
|
|
495
458
|
for (int i = 0; i < nb; i++) {
|
|
496
459
|
uint8_t qs[QK_Q4_0x4x2]; // unpacked quants
|
|
497
460
|
|
|
461
|
+
bool partial = (nloe && i == nb-1);
|
|
462
|
+
|
|
498
463
|
const uint8_t * q = y_q + (i * qblk_size);
|
|
499
464
|
for (int j = 0; j < qk / 2; j++) {
|
|
500
|
-
|
|
501
|
-
|
|
465
|
+
if (partial) {
|
|
466
|
+
qs[j*2+0] = q[j] & 0xf;
|
|
467
|
+
qs[j*2+1] = q[j] >> 4;
|
|
468
|
+
} else {
|
|
469
|
+
qs[j+000] = q[j] & 0xf;
|
|
470
|
+
qs[j+128] = q[j] >> 4;
|
|
471
|
+
}
|
|
502
472
|
}
|
|
503
473
|
|
|
504
474
|
pack_q4_0_quants(&x[i * 8 + 0], qs, 0);
|
|
@@ -513,7 +483,7 @@ static void unpack_row_q4x4x2(block_q4_0 * x, const uint8_t * y, int64_t k) {
|
|
|
513
483
|
|
|
514
484
|
// Repack the scales
|
|
515
485
|
// Note: Do not combine with the loop above. For tensor sizes not multiple of 256 (QK_Q4_0x4x2)
|
|
516
|
-
// the last block is truncated and
|
|
486
|
+
// the last block is truncated and overridden by the scales.
|
|
517
487
|
for (int i = 0; i < nb; i++) {
|
|
518
488
|
// Unpack the scales
|
|
519
489
|
const ggml_half * d = (const ggml_half *) (y_d + i * dblk_size);
|
|
@@ -562,7 +532,7 @@ static void init_row_q4x4x2(block_q4_0 * x, int64_t k) {
|
|
|
562
532
|
|
|
563
533
|
// Init the scales
|
|
564
534
|
// Note: Do not combine with the loop above. For tensor sizes not multiple of 256 (QK_Q4_0x4x2)
|
|
565
|
-
// the last block is truncated and
|
|
535
|
+
// the last block is truncated and overridden by the scales.
|
|
566
536
|
for (int i = 0; i < nb; i++) {
|
|
567
537
|
// Unpack the scales
|
|
568
538
|
x[i * 8 + 0].d = 0;
|
|
@@ -582,7 +552,7 @@ static void repack_q4_0_q4x4x2(ggml_tensor * t, const void * data, size_t size)
|
|
|
582
552
|
|
|
583
553
|
size_t row_size = ggml_row_size(t->type, t->ne[0]);
|
|
584
554
|
size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_Q4_0x4x2)); // extra elements for the pad
|
|
585
|
-
size_t row_size_rp =
|
|
555
|
+
size_t row_size_rp = row_size_pd; // scratch must hold one full padded tile (qblk_size/2 quants + scales)
|
|
586
556
|
|
|
587
557
|
// Ensure we don't try to read more data than is available in the source buffer 'data'
|
|
588
558
|
// or write more than the tensor can hold.
|
|
@@ -643,7 +613,7 @@ static void repack_q4x4x2_q4_0(void * data, const ggml_tensor * t, size_t size)
|
|
|
643
613
|
|
|
644
614
|
size_t row_size = ggml_row_size(t->type, t->ne[0]);
|
|
645
615
|
size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_Q4_0x4x2)); // extra elements for the pad
|
|
646
|
-
size_t row_size_rp =
|
|
616
|
+
size_t row_size_rp = row_size_pd; // scratch must hold one full padded tile (qblk_size/2 quants + scales)
|
|
647
617
|
|
|
648
618
|
// Ensure we don't try to copy more data than the tensor actually contains.
|
|
649
619
|
const size_t total_tensor_size = (size_t)nrows * row_size;
|
|
@@ -692,6 +662,239 @@ static void repack_q4x4x2_q4_0(void * data, const ggml_tensor * t, size_t size)
|
|
|
692
662
|
ggml_aligned_free(buf_rp, row_size_rp);
|
|
693
663
|
}
|
|
694
664
|
|
|
665
|
+
static void unpack_q4_1_quants(uint8_t * qs, const block_q4_1 * x, unsigned int bi) {
|
|
666
|
+
static const int qk = QK4_1;
|
|
667
|
+
|
|
668
|
+
for (unsigned int i = 0; i < qk / 2; ++i) {
|
|
669
|
+
const int x0 = (x->qs[i] & 0x0F);
|
|
670
|
+
const int x1 = (x->qs[i] >> 4);
|
|
671
|
+
qs[bi * qk + i + 0] = x0;
|
|
672
|
+
qs[bi * qk + i + qk / 2] = x1;
|
|
673
|
+
}
|
|
674
|
+
}
|
|
675
|
+
|
|
676
|
+
static void pack_q4_1_quants(block_q4_1 * x, const uint8_t * qs, unsigned int bi) {
|
|
677
|
+
static const int qk = QK4_1;
|
|
678
|
+
|
|
679
|
+
for (unsigned int i = 0; i < qk / 2; ++i) {
|
|
680
|
+
const uint8_t x0 = qs[bi * qk + i + 0];
|
|
681
|
+
const uint8_t x1 = qs[bi * qk + i + qk / 2];
|
|
682
|
+
x->qs[i] = x0 | (x1 << 4);
|
|
683
|
+
}
|
|
684
|
+
}
|
|
685
|
+
|
|
686
|
+
static void repack_row_q4_1x4x2(uint8_t * y, const block_q4_1 * x, int64_t k) {
|
|
687
|
+
static const int qk = QK_Q4_0x4x2;
|
|
688
|
+
const int nb = (k + qk - 1) / qk; // number of blocks (padded)
|
|
689
|
+
const int nloe = k % qk; // leftovers
|
|
690
|
+
|
|
691
|
+
const int dblk_size = 8 * 4; // 8x (d, m) __fp16 = 32 bytes
|
|
692
|
+
const int qblk_size = qk / 2; // int4 = 128 bytes
|
|
693
|
+
const int qrow_size = k / 2; // int4 (not padded to blocks)
|
|
694
|
+
|
|
695
|
+
uint8_t * y_q = y + 0; // quants first
|
|
696
|
+
uint8_t * y_d = y + qrow_size; // then scales/offsets
|
|
697
|
+
|
|
698
|
+
// Repack the quants
|
|
699
|
+
for (int i = 0; i < nb; i++) {
|
|
700
|
+
uint8_t qs[QK_Q4_0x4x2]; // unpacked quants
|
|
701
|
+
unpack_q4_1_quants(qs, &x[i * 8 + 0], 0);
|
|
702
|
+
unpack_q4_1_quants(qs, &x[i * 8 + 1], 1);
|
|
703
|
+
unpack_q4_1_quants(qs, &x[i * 8 + 2], 2);
|
|
704
|
+
unpack_q4_1_quants(qs, &x[i * 8 + 3], 3);
|
|
705
|
+
unpack_q4_1_quants(qs, &x[i * 8 + 4], 4);
|
|
706
|
+
unpack_q4_1_quants(qs, &x[i * 8 + 5], 5);
|
|
707
|
+
unpack_q4_1_quants(qs, &x[i * 8 + 6], 6);
|
|
708
|
+
unpack_q4_1_quants(qs, &x[i * 8 + 7], 7);
|
|
709
|
+
|
|
710
|
+
bool partial = (nloe && i == nb-1);
|
|
711
|
+
|
|
712
|
+
uint8_t * q = y_q + (i * qblk_size);
|
|
713
|
+
for (int j = 0; j < qk / 2; j++) {
|
|
714
|
+
q[j] = partial ? (qs[j*2+1] << 4) | qs[j*2+0] : (qs[j+128] << 4) | qs[j+000];
|
|
715
|
+
}
|
|
716
|
+
}
|
|
717
|
+
|
|
718
|
+
// Repack the scales and offsets
|
|
719
|
+
for (int i = 0; i < nb; i++) {
|
|
720
|
+
ggml_half * d_m = (ggml_half *) (y_d + i * dblk_size);
|
|
721
|
+
for (int j = 0; j < 8; j++) {
|
|
722
|
+
d_m[j * 2 + 0] = x[i * 8 + j].d;
|
|
723
|
+
d_m[j * 2 + 1] = x[i * 8 + j].m;
|
|
724
|
+
}
|
|
725
|
+
}
|
|
726
|
+
}
|
|
727
|
+
|
|
728
|
+
// Convert one row of k elements from the packed q4x4x2 layout (y) back into
// an array of block_q4_1 (x).
//
// Layout of y as read here:
//   - k/2 bytes of 4-bit quants, addressed in tiles of qk/2 bytes,
//   - followed (at byte offset k/2) by 32 bytes per tile holding eight
//     (d, m) ggml_half pairs.
//
// x must have room for 8 blocks per tile, i.e. the row padded up to a multiple
// of QK_Q4_0x4x2 elements.
static void unpack_row_q4_1x4x2(block_q4_1 * x, const uint8_t * y, int64_t k) {
    static const int qk = QK_Q4_0x4x2;

    const int n_tiles   = (k + qk - 1) / qk;  // tiles in the row, last one possibly partial
    const int tail      = k % qk;             // elements in the partial tile (0 if none)
    const int q_stride  = qk / 2;             // bytes of packed quants per tile
    const int dm_stride = 8 * 4;              // bytes of (d, m) pairs per tile
    const int q_bytes   = k / 2;              // unpadded size of the quants section

    const uint8_t * quants  = y + 0;
    const uint8_t * dm_base = y + q_bytes;

    // Pass 1: expand the nibbles of every tile and hand them to the block packer.
    for (int tile = 0; tile < n_tiles; ++tile) {
        const uint8_t * src     = quants + (tile * q_stride);
        const bool      is_tail = (tail && tile == n_tiles - 1);

        uint8_t nib[QK_Q4_0x4x2];  // one value per element of the tile
        if (is_tail) {
            // Partial last tile: each byte carries two neighbouring elements.
            for (int b = 0; b < qk / 2; ++b) {
                nib[b * 2 + 0] = src[b] & 0x0F;
                nib[b * 2 + 1] = src[b] >> 4;
            }
        } else {
            // Full tile: byte b carries element b (low nibble) and element b + 128 (high nibble).
            for (int b = 0; b < qk / 2; ++b) {
                nib[b]       = src[b] & 0x0F;
                nib[b + 128] = src[b] >> 4;
            }
        }

        block_q4_1 * blk = &x[tile * 8];
        for (int s = 0; s < 8; ++s) {
            pack_q4_1_quants(&blk[s], nib, s);
        }
    }

    // Pass 2: restore the per-block scale (d) and offset (m).
    for (int tile = 0; tile < n_tiles; ++tile) {
        const ggml_half * dm  = (const ggml_half *) (dm_base + tile * dm_stride);
        block_q4_1 *      blk = &x[tile * 8];
        for (int s = 0; s < 8; ++s) {
            blk[s].d = dm[s * 2 + 0];
            blk[s].m = dm[s * 2 + 1];
        }
    }
}
|
|
775
|
+
|
|
776
|
+
// Initialise a padded row of block_q4_1 to "all zero": every block is packed
// from an all-zero quant buffer and gets d = 0 and m = 0.
//
// k is the number of elements in the row; the number of blocks touched is
// 8 * ceil(k / QK_Q4_0x4x2), so x must be sized for the padded row.
static void init_row_q4_1x4x2(block_q4_1 * x, int64_t k) {
    static const int qk = QK_Q4_0x4x2;

    const int n_tiles  = (k + qk - 1) / qk;  // tiles in the row (padded)
    const int n_blocks = n_tiles * 8;        // each tile spans 8 blocks

    uint8_t zero_qs[QK_Q4_0x4x2] = { 0 };    // unpacked quants, all zero

    // Quants first ...
    for (int b = 0; b < n_blocks; ++b) {
        pack_q4_1_quants(&x[b], zero_qs, b % 8);
    }

    // ... then scales and offsets, in a separate pass like the original.
    for (int b = 0; b < n_blocks; ++b) {
        x[b].d = 0;
        x[b].m = 0;
    }
}
|
|
801
|
+
|
|
802
|
+
// Copy Q4_1 data from a host buffer into tensor t, converting every row to the
// q4x4x2 layout on the way.
//
//   t    - destination tensor; t->data receives the repacked rows
//   data - source buffer holding rows of block_q4_1
//   size - number of source bytes available; clamped to the tensor size
//
// Whole rows are converted one at a time. A trailing partial row is zero-padded
// in the scratch buffer, converted, and only its first n_rem_bytes are stored.
static void repack_q4_1_q4x4x2(ggml_tensor * t, const void * data, size_t size) {
    int64_t nrows = ggml_nrows(t);

    const size_t row_size    = ggml_row_size(t->type, t->ne[0]);
    const size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_Q4_0x4x2));
    const size_t row_size_rp = row_size_pd;  // scratch must hold one full padded tile (qblk_size/2 quants + scales)

    // Never read past `data` and never write past the tensor.
    const size_t tensor_bytes = (size_t) nrows * row_size;
    size_t       copy_bytes   = tensor_bytes;
    if (size < tensor_bytes) {
        copy_bytes = size;
    }

    const int64_t whole_rows = copy_bytes / row_size;
    const size_t  tail_bytes = copy_bytes % row_size;

    void * buf_pd = ggml_aligned_malloc(row_size_pd);  // padded source row
    GGML_ASSERT(buf_pd != NULL);

    void * buf_rp = ggml_aligned_malloc(row_size_rp);  // repacked row
    GGML_ASSERT(buf_rp != NULL);

    HEX_VERBOSE("ggml-hex: repack-q4_1-q4x4x2 %s : data %p size %zu dims %ldx%ld row-size %zu\n", t->name, data, size,
                t->ne[0], nrows, row_size);

    // Stage `n_bytes` of row `row`, repack it, and store the same number of bytes.
    auto convert_row = [&](int64_t row, size_t n_bytes) {
        const uint8_t * src = (const uint8_t *) data + (row * row_size);
        uint8_t *       dst = (uint8_t *) t->data + (row * row_size);

        memcpy(buf_pd, src, n_bytes);
        repack_row_q4_1x4x2((uint8_t *) buf_rp, (const block_q4_1 *) buf_pd, t->ne[0]);
        memcpy(dst, buf_rp, n_bytes);
    };

    // Zero the scratch row once so the padding past row_size stays zero for all full rows.
    init_row_q4_1x4x2((block_q4_1 *) buf_pd, t->ne[0]);

    for (int64_t row = 0; row < whole_rows; ++row) {
        convert_row(row, row_size);
    }

    if (tail_bytes > 0) {
        // Re-zero the scratch row: only part of it is overwritten by the partial row.
        init_row_q4_1x4x2((block_q4_1 *) buf_pd, t->ne[0]);
        convert_row(whole_rows, tail_bytes);
    }

    ggml_aligned_free(buf_pd, row_size_pd);
    ggml_aligned_free(buf_rp, row_size_rp);
}
|
|
849
|
+
|
|
850
|
+
static void repack_q4x4x2_q4_1(void * data, const ggml_tensor * t, size_t size) {
|
|
851
|
+
int64_t nrows = ggml_nrows(t);
|
|
852
|
+
|
|
853
|
+
size_t row_size = ggml_row_size(t->type, t->ne[0]);
|
|
854
|
+
size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_Q4_0x4x2));
|
|
855
|
+
size_t row_size_rp = row_size_pd; // scratch must hold one full padded tile (qblk_size/2 quants + scales)
|
|
856
|
+
|
|
857
|
+
const size_t total_tensor_size = (size_t)nrows * row_size;
|
|
858
|
+
const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size;
|
|
859
|
+
|
|
860
|
+
const int64_t n_full_rows = n_bytes_to_copy / row_size;
|
|
861
|
+
const size_t n_rem_bytes = n_bytes_to_copy % row_size;
|
|
862
|
+
|
|
863
|
+
void * buf_pd = ggml_aligned_malloc(row_size_pd);
|
|
864
|
+
GGML_ASSERT(buf_pd != NULL);
|
|
865
|
+
|
|
866
|
+
void * buf_rp = ggml_aligned_malloc(row_size_rp);
|
|
867
|
+
GGML_ASSERT(buf_rp != NULL);
|
|
868
|
+
|
|
869
|
+
HEX_VERBOSE("ggml-hex: repack-q4x4x2-q4_1 %s : data %p size %zu dims %ldx%ld row-size %zu\n", t->name, data, size,
|
|
870
|
+
t->ne[0], nrows, row_size);
|
|
871
|
+
|
|
872
|
+
memset(buf_rp, 0, row_size_rp); // clear-out padded buffer to make sure the tail is all zeros
|
|
873
|
+
|
|
874
|
+
for (int64_t i = 0; i < n_full_rows; i++) {
|
|
875
|
+
const uint8_t * src = (const uint8_t *) t->data + (i * row_size);
|
|
876
|
+
uint8_t * dst = (uint8_t *) data + (i * row_size);
|
|
877
|
+
|
|
878
|
+
memcpy(buf_rp, src, row_size);
|
|
879
|
+
unpack_row_q4_1x4x2((block_q4_1 *) buf_pd, (const uint8_t *) buf_rp, t->ne[0]);
|
|
880
|
+
memcpy(dst, buf_pd, row_size);
|
|
881
|
+
}
|
|
882
|
+
|
|
883
|
+
if (n_rem_bytes > 0) {
|
|
884
|
+
const int64_t i = n_full_rows;
|
|
885
|
+
const uint8_t * src = (const uint8_t *) t->data + (i * row_size);
|
|
886
|
+
uint8_t * dst = (uint8_t *) data + (i * row_size);
|
|
887
|
+
|
|
888
|
+
// We still need to read and unpack the entire source row because quantization is block-based.
|
|
889
|
+
memcpy(buf_rp, src, row_size);
|
|
890
|
+
unpack_row_q4_1x4x2((block_q4_1 *) buf_pd, (const uint8_t *) buf_rp, t->ne[0]);
|
|
891
|
+
memcpy(dst, buf_pd, n_rem_bytes);
|
|
892
|
+
}
|
|
893
|
+
|
|
894
|
+
ggml_aligned_free(buf_pd, row_size_pd);
|
|
895
|
+
ggml_aligned_free(buf_rp, row_size_rp);
|
|
896
|
+
}
|
|
897
|
+
|
|
695
898
|
// ======== Q8x4x2 ====================
|
|
696
899
|
static void dump_block_q8_0(const block_q8_0 * b, int i) {
|
|
697
900
|
HEX_VERBOSE("ggml-hex: repack q8_0 %d: %d %d %d %d ... %d %d %d %d : %.6f\n", i, b->qs[0], b->qs[1], b->qs[2],
|
|
@@ -780,7 +983,7 @@ static void repack_row_q8x4x2(uint8_t * y, const block_q8_0 * x, int64_t k) {
|
|
|
780
983
|
|
|
781
984
|
// Repack the scales
|
|
782
985
|
// Note: Do not combine with the loop above. For tensor sizes not multiple of 256 (QK_Q8_0x4x2)
|
|
783
|
-
// the last block is truncated and
|
|
986
|
+
// the last block is truncated and overridden by the scales.
|
|
784
987
|
for (int i = 0; i < nb; i++) {
|
|
785
988
|
// Repack the scales
|
|
786
989
|
ggml_half * d = (ggml_half *) (y_d + i * dblk_size);
|
|
@@ -794,7 +997,7 @@ static void repack_row_q8x4x2(uint8_t * y, const block_q8_0 * x, int64_t k) {
|
|
|
794
997
|
d[7] = x[i * 8 + 7].d;
|
|
795
998
|
}
|
|
796
999
|
|
|
797
|
-
if (opt_verbose >
|
|
1000
|
+
if (opt_verbose > 2) {
|
|
798
1001
|
for (int i = 0; i < nb; i++) {
|
|
799
1002
|
dump_packed_block_q8x4x2(y, i, k);
|
|
800
1003
|
}
|
|
@@ -812,7 +1015,7 @@ static void unpack_row_q8x4x2(block_q8_0 * x, const uint8_t * y, int64_t k) {
|
|
|
812
1015
|
const uint8_t * y_q = y + 0; // quants first
|
|
813
1016
|
const uint8_t * y_d = y + qrow_size; // then scales
|
|
814
1017
|
|
|
815
|
-
if (opt_verbose >
|
|
1018
|
+
if (opt_verbose > 2) {
|
|
816
1019
|
for (int i = 0; i < nb; i++) {
|
|
817
1020
|
dump_packed_block_q8x4x2(y, i, k);
|
|
818
1021
|
}
|
|
@@ -839,7 +1042,7 @@ static void unpack_row_q8x4x2(block_q8_0 * x, const uint8_t * y, int64_t k) {
|
|
|
839
1042
|
|
|
840
1043
|
// Repack the scales
|
|
841
1044
|
// Note: Do not combine with the loop above. For tensor sizes not multiple of 256 (QK_Q8_0x4x2)
|
|
842
|
-
// the last block is truncated and
|
|
1045
|
+
// the last block is truncated and overridden by the scales.
|
|
843
1046
|
for (int i = 0; i < nb; i++) {
|
|
844
1047
|
// Unpack the scales
|
|
845
1048
|
const ggml_half * d = (const ggml_half *) (y_d + i * dblk_size);
|
|
@@ -888,7 +1091,7 @@ static void init_row_q8x4x2(block_q8_0 * x, int64_t k) {
|
|
|
888
1091
|
|
|
889
1092
|
// Init the scales
|
|
890
1093
|
// Note: Do not combine with the loop above. For tensor sizes not multiple of 256 (QK_Q8_0x4x2)
|
|
891
|
-
// the last block is truncated and
|
|
1094
|
+
// the last block is truncated and overridden by the scales.
|
|
892
1095
|
for (int i = 0; i < nb; i++) {
|
|
893
1096
|
// Unpack the scales
|
|
894
1097
|
x[i * 8 + 0].d = 0;
|
|
@@ -908,7 +1111,7 @@ static void repack_q8_0_q8x4x2(ggml_tensor * t, const void * data, size_t size)
|
|
|
908
1111
|
|
|
909
1112
|
size_t row_size = ggml_row_size(t->type, t->ne[0]);
|
|
910
1113
|
size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_Q8_0x4x2)); // extra elements for the pad
|
|
911
|
-
size_t row_size_rp =
|
|
1114
|
+
size_t row_size_rp = row_size_pd; // scratch must hold one full padded tile (qblk_size quants + scales)
|
|
912
1115
|
|
|
913
1116
|
// Ensure we don't try to read more data than is available in the source buffer 'data'
|
|
914
1117
|
// or write more than the tensor can hold.
|
|
@@ -969,7 +1172,7 @@ static void repack_q8x4x2_q8_0(void * data, const ggml_tensor * t, size_t size)
|
|
|
969
1172
|
|
|
970
1173
|
size_t row_size = ggml_row_size(t->type, t->ne[0]);
|
|
971
1174
|
size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_Q8_0x4x2)); // extra elements for the pad
|
|
972
|
-
size_t row_size_rp =
|
|
1175
|
+
size_t row_size_rp = row_size_pd; // scratch must hold one full padded tile (qblk_size quants + scales)
|
|
973
1176
|
|
|
974
1177
|
// Ensure we don't try to copy more data than the tensor actually contains.
|
|
975
1178
|
const size_t total_tensor_size = (size_t)nrows * row_size;
|
|
@@ -1088,6 +1291,7 @@ static void pack_mxfp4_quants(block_mxfp4 * x, const uint8_t * qs, unsigned int
|
|
|
1088
1291
|
static void repack_row_mxfp4x4x2(uint8_t * y, const block_mxfp4 * x, int64_t k) {
|
|
1089
1292
|
static const int qk = QK_MXFP4x4x2;
|
|
1090
1293
|
const int nb = (k + qk - 1) / qk; // number of blocks (padded)
|
|
1294
|
+
const int nloe = k % qk; // leftovers
|
|
1091
1295
|
|
|
1092
1296
|
const int eblk_size = 8 * 1; // 8x E8M0
|
|
1093
1297
|
const int qblk_size = qk / 2; // int4
|
|
@@ -1122,15 +1326,17 @@ static void repack_row_mxfp4x4x2(uint8_t * y, const block_mxfp4 * x, int64_t k)
|
|
|
1122
1326
|
unpack_mxfp4_quants(qs, &x[i * 8 + 6], 6);
|
|
1123
1327
|
unpack_mxfp4_quants(qs, &x[i * 8 + 7], 7);
|
|
1124
1328
|
|
|
1329
|
+
bool partial = (nloe && i == nb-1);
|
|
1330
|
+
|
|
1125
1331
|
uint8_t * q = y_q + (i * qblk_size);
|
|
1126
1332
|
for (int j = 0; j < qk / 2; j++) {
|
|
1127
|
-
q[j] = (qs[j + 128] << 4) | qs[j];
|
|
1333
|
+
q[j] = partial ? (qs[j*2+1] << 4) | qs[j*2+0] : (qs[j+128] << 4) | qs[j+000];
|
|
1128
1334
|
}
|
|
1129
1335
|
}
|
|
1130
1336
|
|
|
1131
1337
|
// Repack the scales
|
|
1132
1338
|
// Note: Do not combine with the loop above. For tensor sizes not multiple of 256 (QK_MXFP4x4x2)
|
|
1133
|
-
// the last block is truncated and
|
|
1339
|
+
// the last block is truncated and overridden by the scales.
|
|
1134
1340
|
for (int i = 0; i < nb; i++) {
|
|
1135
1341
|
// Repack the scales
|
|
1136
1342
|
uint8_t * e = (uint8_t *) (y_e + i * eblk_size);
|
|
@@ -1144,7 +1350,7 @@ static void repack_row_mxfp4x4x2(uint8_t * y, const block_mxfp4 * x, int64_t k)
|
|
|
1144
1350
|
e[7] = x[i * 8 + 7].e;
|
|
1145
1351
|
}
|
|
1146
1352
|
|
|
1147
|
-
if (opt_verbose >
|
|
1353
|
+
if (opt_verbose > 2) {
|
|
1148
1354
|
for (int i = 0; i < nb; i++) {
|
|
1149
1355
|
dump_packed_block_mxfp4x4x2(y, i, k);
|
|
1150
1356
|
}
|
|
@@ -1154,6 +1360,7 @@ static void repack_row_mxfp4x4x2(uint8_t * y, const block_mxfp4 * x, int64_t k)
|
|
|
1154
1360
|
static void unpack_row_mxfp4x4x2(block_mxfp4 * x, const uint8_t * y, int64_t k) {
|
|
1155
1361
|
static const int qk = QK_MXFP4x4x2;
|
|
1156
1362
|
const int nb = (k + qk - 1) / qk; // number of blocks (padded)
|
|
1363
|
+
const int nloe = k % qk; // leftovers
|
|
1157
1364
|
|
|
1158
1365
|
const int eblk_size = 8 * 1; // 8x E8M0
|
|
1159
1366
|
const int qblk_size = qk / 2; // int4
|
|
@@ -1162,7 +1369,7 @@ static void unpack_row_mxfp4x4x2(block_mxfp4 * x, const uint8_t * y, int64_t k)
|
|
|
1162
1369
|
const uint8_t * y_q = y + 0; // quants first
|
|
1163
1370
|
const uint8_t * y_e = y + qrow_size; // then scales
|
|
1164
1371
|
|
|
1165
|
-
if (opt_verbose >
|
|
1372
|
+
if (opt_verbose > 2) {
|
|
1166
1373
|
for (int i = 0; i < nb; i++) {
|
|
1167
1374
|
dump_packed_block_mxfp4x4x2(y, i, k);
|
|
1168
1375
|
}
|
|
@@ -1172,10 +1379,17 @@ static void unpack_row_mxfp4x4x2(block_mxfp4 * x, const uint8_t * y, int64_t k)
|
|
|
1172
1379
|
for (int i = 0; i < nb; i++) {
|
|
1173
1380
|
uint8_t qs[QK_MXFP4x4x2]; // unpacked quants
|
|
1174
1381
|
|
|
1382
|
+
bool partial = (nloe && i == nb-1);
|
|
1383
|
+
|
|
1175
1384
|
const uint8_t * q = y_q + (i * qblk_size);
|
|
1176
1385
|
for (int j = 0; j < qk / 2; j++) {
|
|
1177
|
-
|
|
1178
|
-
|
|
1386
|
+
if (partial) {
|
|
1387
|
+
qs[j*2+0] = q[j] & 0xf;
|
|
1388
|
+
qs[j*2+1] = q[j] >> 4;
|
|
1389
|
+
} else {
|
|
1390
|
+
qs[j+000] = q[j] & 0xf;
|
|
1391
|
+
qs[j+128] = q[j] >> 4;
|
|
1392
|
+
}
|
|
1179
1393
|
}
|
|
1180
1394
|
|
|
1181
1395
|
pack_mxfp4_quants(&x[i * 8 + 0], qs, 0);
|
|
@@ -1190,7 +1404,7 @@ static void unpack_row_mxfp4x4x2(block_mxfp4 * x, const uint8_t * y, int64_t k)
|
|
|
1190
1404
|
|
|
1191
1405
|
// Repack the scales
|
|
1192
1406
|
// Note: Do not combine with the loop above. For tensor sizes not multiple of 256 (QK_MXFP4x4x2)
|
|
1193
|
-
// the last block is truncated and
|
|
1407
|
+
// the last block is truncated and overridden by the scales.
|
|
1194
1408
|
for (int i = 0; i < nb; i++) {
|
|
1195
1409
|
// Unpack the scales
|
|
1196
1410
|
const uint8_t * e = (const uint8_t *) (y_e + i * eblk_size);
|
|
@@ -1239,7 +1453,7 @@ static void init_row_mxfp4x4x2(block_mxfp4 * x, int64_t k) {
|
|
|
1239
1453
|
|
|
1240
1454
|
// Init the scales
|
|
1241
1455
|
// Note: Do not combine with the loop above. For tensor sizes not multiple of 256 (QK_MXFP4x4x2)
|
|
1242
|
-
// the last block is truncated and
|
|
1456
|
+
// the last block is truncated and overridden by the scales.
|
|
1243
1457
|
for (int i = 0; i < nb; i++) {
|
|
1244
1458
|
// Unpack the scales
|
|
1245
1459
|
x[i * 8 + 0].e = 0;
|
|
@@ -1259,7 +1473,7 @@ static void repack_mxfp4_mxfp4x4x2(ggml_tensor * t, const void * data, size_t si
|
|
|
1259
1473
|
|
|
1260
1474
|
size_t row_size = ggml_row_size(t->type, t->ne[0]);
|
|
1261
1475
|
size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_MXFP4x4x2)); // extra elements for the pad
|
|
1262
|
-
size_t row_size_rp =
|
|
1476
|
+
size_t row_size_rp = row_size_pd; // scratch must hold one full padded tile (qblk_size/2 quants + scales)
|
|
1263
1477
|
|
|
1264
1478
|
// Ensure we don't try to read more data than is available in the source buffer 'data'
|
|
1265
1479
|
// or write more than the tensor can hold.
|
|
@@ -1320,7 +1534,7 @@ static void repack_mxfp4x4x2_mxfp4(void * data, const ggml_tensor * t, size_t si
|
|
|
1320
1534
|
|
|
1321
1535
|
size_t row_size = ggml_row_size(t->type, t->ne[0]);
|
|
1322
1536
|
size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_MXFP4x4x2)); // extra elements for the pad
|
|
1323
|
-
size_t row_size_rp =
|
|
1537
|
+
size_t row_size_rp = row_size_pd; // scratch must hold one full padded tile (qblk_size/2 quants + scales)
|
|
1324
1538
|
|
|
1325
1539
|
// Ensure we don't try to copy more data than the tensor actually contains.
|
|
1326
1540
|
const size_t total_tensor_size = (size_t)nrows * row_size;
|
|
@@ -1374,11 +1588,10 @@ static void ggml_backend_hexagon_buffer_set_tensor(ggml_backend_buffer_t buffer,
|
|
|
1374
1588
|
const void * data,
|
|
1375
1589
|
size_t offset,
|
|
1376
1590
|
size_t size) {
|
|
1377
|
-
auto
|
|
1378
|
-
auto sess =
|
|
1591
|
+
auto sbuf = (ggml_hexagon_shared_buffer *) buffer->context;
|
|
1592
|
+
auto sess = sbuf->sess;
|
|
1379
1593
|
|
|
1380
|
-
HEX_VERBOSE("ggml-hex: %s set-tensor %s : data %p offset %zu size %zu\n", sess->
|
|
1381
|
-
offset, size);
|
|
1594
|
+
HEX_VERBOSE("ggml-hex: %s set-tensor %s : data %p offset %zu size %zu\n", sess->c_name(), tensor->name, data, offset, size);
|
|
1382
1595
|
|
|
1383
1596
|
switch (tensor->type) {
|
|
1384
1597
|
case GGML_TYPE_Q4_0:
|
|
@@ -1387,10 +1600,23 @@ static void ggml_backend_hexagon_buffer_set_tensor(ggml_backend_buffer_t buffer,
|
|
|
1387
1600
|
repack_q4_0_q4x4x2(tensor, data, size);
|
|
1388
1601
|
break;
|
|
1389
1602
|
|
|
1390
|
-
case
|
|
1603
|
+
case GGML_TYPE_Q4_1:
|
|
1391
1604
|
GGML_ASSERT(offset == 0);
|
|
1392
1605
|
GGML_ASSERT(offset + size <= ggml_nbytes(tensor));
|
|
1393
|
-
|
|
1606
|
+
repack_q4_1_q4x4x2(tensor, data, size);
|
|
1607
|
+
break;
|
|
1608
|
+
|
|
1609
|
+
case GGML_TYPE_Q8_0:
|
|
1610
|
+
GGML_ASSERT(offset == 0);
|
|
1611
|
+
GGML_ASSERT(offset + size <= ggml_nbytes(tensor));
|
|
1612
|
+
repack_q8_0_q8x4x2(tensor, data, size);
|
|
1613
|
+
break;
|
|
1614
|
+
|
|
1615
|
+
case GGML_TYPE_IQ4_NL:
|
|
1616
|
+
GGML_ASSERT(offset == 0);
|
|
1617
|
+
GGML_ASSERT(offset + size <= ggml_nbytes(tensor));
|
|
1618
|
+
// IQ4_NL has identical block layout to Q4_0 (ggml_half d + uint8_t qs[16])
|
|
1619
|
+
repack_q4_0_q4x4x2(tensor, data, size);
|
|
1394
1620
|
break;
|
|
1395
1621
|
|
|
1396
1622
|
case GGML_TYPE_MXFP4:
|
|
@@ -1410,11 +1636,10 @@ static void ggml_backend_hexagon_buffer_get_tensor(ggml_backend_buffer_t buffer,
|
|
|
1410
1636
|
void * data,
|
|
1411
1637
|
size_t offset,
|
|
1412
1638
|
size_t size) {
|
|
1413
|
-
auto
|
|
1414
|
-
auto sess =
|
|
1639
|
+
auto sbuf = (ggml_hexagon_shared_buffer *) buffer->context;
|
|
1640
|
+
auto sess = sbuf->sess;
|
|
1415
1641
|
|
|
1416
|
-
HEX_VERBOSE("ggml-hex: %s get-tensor %s : data %p offset %zu size %zu\n", sess->
|
|
1417
|
-
offset, size);
|
|
1642
|
+
HEX_VERBOSE("ggml-hex: %s get-tensor %s : data %p offset %zu size %zu\n", sess->c_name(), tensor->name, data, offset, size);
|
|
1418
1643
|
|
|
1419
1644
|
switch (tensor->type) {
|
|
1420
1645
|
case GGML_TYPE_Q4_0:
|
|
@@ -1423,12 +1648,24 @@ static void ggml_backend_hexagon_buffer_get_tensor(ggml_backend_buffer_t buffer,
|
|
|
1423
1648
|
repack_q4x4x2_q4_0(data, tensor, size);
|
|
1424
1649
|
break;
|
|
1425
1650
|
|
|
1651
|
+
case GGML_TYPE_Q4_1:
|
|
1652
|
+
GGML_ASSERT(offset == 0);
|
|
1653
|
+
GGML_ASSERT(offset + size <= ggml_nbytes(tensor));
|
|
1654
|
+
repack_q4x4x2_q4_1(data, tensor, size);
|
|
1655
|
+
break;
|
|
1656
|
+
|
|
1426
1657
|
case GGML_TYPE_Q8_0:
|
|
1427
1658
|
GGML_ASSERT(offset == 0);
|
|
1428
1659
|
GGML_ASSERT(offset + size <= ggml_nbytes(tensor));
|
|
1429
1660
|
repack_q8x4x2_q8_0(data, tensor, size);
|
|
1430
1661
|
break;
|
|
1431
1662
|
|
|
1663
|
+
case GGML_TYPE_IQ4_NL:
|
|
1664
|
+
GGML_ASSERT(offset == 0);
|
|
1665
|
+
GGML_ASSERT(offset + size <= ggml_nbytes(tensor));
|
|
1666
|
+
repack_q4x4x2_q4_0(data, tensor, size);
|
|
1667
|
+
break;
|
|
1668
|
+
|
|
1432
1669
|
case GGML_TYPE_MXFP4:
|
|
1433
1670
|
GGML_ASSERT(offset == 0);
|
|
1434
1671
|
GGML_ASSERT(offset + size <= ggml_nbytes(tensor));
|
|
@@ -1452,10 +1689,10 @@ static bool ggml_backend_hexagon_buffer_cpy_tensor(ggml_backend_buffer_t bu
|
|
|
1452
1689
|
}
|
|
1453
1690
|
|
|
1454
1691
|
static void ggml_backend_hexagon_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
|
|
1455
|
-
auto
|
|
1456
|
-
auto sess =
|
|
1457
|
-
HEX_VERBOSE("ggml-hex: %s clear-buff base %p size %zu\n", sess->
|
|
1458
|
-
memset(
|
|
1692
|
+
auto sbuf = (ggml_hexagon_shared_buffer *) buffer->context;
|
|
1693
|
+
auto sess = sbuf->sess;
|
|
1694
|
+
HEX_VERBOSE("ggml-hex: %s clear-buff base %p size %zu\n", sess->c_name(), (void *) sbuf->base, sbuf->size);
|
|
1695
|
+
memset(sbuf->base, value, sbuf->size);
|
|
1459
1696
|
}
|
|
1460
1697
|
|
|
1461
1698
|
static ggml_backend_buffer_i ggml_backend_hexagon_buffer_interface = {
|
|
@@ -1465,6 +1702,8 @@ static ggml_backend_buffer_i ggml_backend_hexagon_buffer_interface = {
|
|
|
1465
1702
|
/* .memset_tensor = */ NULL,
|
|
1466
1703
|
/* .set_tensor = */ ggml_backend_hexagon_buffer_set_tensor,
|
|
1467
1704
|
/* .get_tensor = */ ggml_backend_hexagon_buffer_get_tensor,
|
|
1705
|
+
/* .set_tensor_2d = */ NULL,
|
|
1706
|
+
/* .get_tensor_2d = */ NULL,
|
|
1468
1707
|
/* .cpy_tensor = */ ggml_backend_hexagon_buffer_cpy_tensor,
|
|
1469
1708
|
/* .clear = */ ggml_backend_hexagon_buffer_clear,
|
|
1470
1709
|
/* .reset = */ NULL,
|
|
@@ -1480,10 +1719,11 @@ static ggml_backend_buffer_t ggml_backend_hexagon_buffer_type_alloc_buffer(
|
|
|
1480
1719
|
ggml_backend_buffer_type_t buffer_type, size_t size) {
|
|
1481
1720
|
auto sess = static_cast<ggml_backend_hexagon_buffer_type_context *>(buffer_type->context)->sess;
|
|
1482
1721
|
try {
|
|
1483
|
-
|
|
1484
|
-
|
|
1722
|
+
size += 4 * 1024; // guard page
|
|
1723
|
+
ggml_hexagon_shared_buffer * sbuf = new ggml_hexagon_shared_buffer(sess, size);
|
|
1724
|
+
return ggml_backend_buffer_init(buffer_type, ggml_backend_hexagon_buffer_interface, sbuf, size);
|
|
1485
1725
|
} catch (const std::exception & exc) {
|
|
1486
|
-
GGML_LOG_ERROR("ggml-hex: %s failed to allocate buffer context: %s\n", sess->
|
|
1726
|
+
GGML_LOG_ERROR("ggml-hex: %s failed to allocate buffer context (host): %s\n", sess->c_name(), exc.what());
|
|
1487
1727
|
return nullptr;
|
|
1488
1728
|
}
|
|
1489
1729
|
}
|
|
@@ -1492,10 +1732,11 @@ static ggml_backend_buffer_t ggml_backend_hexagon_repack_buffer_type_alloc_buffe
|
|
|
1492
1732
|
ggml_backend_buffer_type_t buffer_type, size_t size) {
|
|
1493
1733
|
auto sess = static_cast<ggml_backend_hexagon_buffer_type_context *>(buffer_type->context)->sess;
|
|
1494
1734
|
try {
|
|
1495
|
-
|
|
1496
|
-
|
|
1735
|
+
size += 4 * 1024; // guard page
|
|
1736
|
+
ggml_hexagon_shared_buffer * sbuf = new ggml_hexagon_shared_buffer(sess, size);
|
|
1737
|
+
return ggml_backend_buffer_init(buffer_type, ggml_backend_hexagon_buffer_interface, sbuf, size);
|
|
1497
1738
|
} catch (const std::exception & exc) {
|
|
1498
|
-
GGML_LOG_ERROR("ggml-hex: %s failed to allocate buffer context: %s\n", sess->
|
|
1739
|
+
GGML_LOG_ERROR("ggml-hex: %s failed to allocate buffer context (repack): %s\n", sess->c_name(), exc.what());
|
|
1499
1740
|
return nullptr;
|
|
1500
1741
|
}
|
|
1501
1742
|
}
|
|
@@ -1510,7 +1751,7 @@ static size_t ggml_backend_hexagon_buffer_type_get_alloc_size(ggml_backend_buffe
|
|
|
1510
1751
|
}
|
|
1511
1752
|
|
|
1512
1753
|
static size_t ggml_backend_hexagon_buffer_type_get_max_size(ggml_backend_buffer_type_t buffer_type) {
|
|
1513
|
-
return
|
|
1754
|
+
return opt_mbuf; // typically 1GB per buffer
|
|
1514
1755
|
GGML_UNUSED(buffer_type);
|
|
1515
1756
|
}
|
|
1516
1757
|
|
|
@@ -1542,6 +1783,448 @@ static ggml_backend_buffer_type_i ggml_backend_hexagon_repack_buffer_type_interf
|
|
|
1542
1783
|
/* .is_host = */ ggml_backend_hexagon_repack_buffer_type_is_host,
|
|
1543
1784
|
};
|
|
1544
1785
|
|
|
1786
|
+
struct ggml_hexagon_opbatch {
|
|
1787
|
+
ggml_hexagon_session* sess;
|
|
1788
|
+
|
|
1789
|
+
std::vector<htp_opnode> ops; // htp_opnode of ops
|
|
1790
|
+
|
|
1791
|
+
std::vector<htp_buf_desc> h_bufs; // htp buffer descriptors
|
|
1792
|
+
std::vector<htp_tensor> h_tens; // htp tensor descriptors
|
|
1793
|
+
std::vector<htp_op_desc> h_ops; // htp op descriptors
|
|
1794
|
+
|
|
1795
|
+
std::unordered_map<int, int> b_map; // buffer fd to index
|
|
1796
|
+
std::unordered_map<const ggml_tensor*, int> t_map; // tensor ptr to index
|
|
1797
|
+
std::unordered_multimap<void*, int> d_map; // tensor data to index
|
|
1798
|
+
|
|
1799
|
+
unsigned int n_bufs; // num buffers in the batch
|
|
1800
|
+
unsigned int n_tens; // num tensors ...
|
|
1801
|
+
unsigned int n_ops; // num ops ...
|
|
1802
|
+
size_t b_vmem; // sum of all buffer sizes
|
|
1803
|
+
|
|
1804
|
+
unsigned int n_bufs_max;
|
|
1805
|
+
unsigned int n_tens_max;
|
|
1806
|
+
unsigned int n_ops_max;
|
|
1807
|
+
size_t b_vmem_max;
|
|
1808
|
+
|
|
1809
|
+
void reset() {
|
|
1810
|
+
n_bufs = 0;
|
|
1811
|
+
n_tens = 0;
|
|
1812
|
+
n_ops = 0;
|
|
1813
|
+
b_vmem = 0;
|
|
1814
|
+
|
|
1815
|
+
b_map.clear();
|
|
1816
|
+
t_map.clear();
|
|
1817
|
+
d_map.clear();
|
|
1818
|
+
}
|
|
1819
|
+
|
|
1820
|
+
ggml_hexagon_opbatch(ggml_hexagon_session *sess, size_t batch_size, size_t max_vmem) {
|
|
1821
|
+
this->sess = sess;
|
|
1822
|
+
|
|
1823
|
+
n_bufs_max = HTP_OP_MAX_BUFS;
|
|
1824
|
+
n_ops_max = batch_size;
|
|
1825
|
+
n_tens_max = n_ops_max + n_ops_max * HTP_OP_MAX_INPUTS;
|
|
1826
|
+
|
|
1827
|
+
b_vmem_max = max_vmem;
|
|
1828
|
+
|
|
1829
|
+
ops.resize(n_ops_max);
|
|
1830
|
+
|
|
1831
|
+
h_bufs.resize(n_bufs_max);
|
|
1832
|
+
h_tens.resize(n_tens_max);
|
|
1833
|
+
h_ops.resize(n_ops_max);
|
|
1834
|
+
|
|
1835
|
+
b_map.reserve(n_bufs_max);
|
|
1836
|
+
t_map.reserve(n_tens_max);
|
|
1837
|
+
d_map.reserve(n_tens_max);
|
|
1838
|
+
|
|
1839
|
+
GGML_LOG_INFO("ggml-hex: %s op batching: n-bufs %u n-tensors %u n-ops %u vmem %zu\n",
|
|
1840
|
+
sess->c_name(), n_bufs_max, n_tens_max, n_ops_max, b_vmem_max);
|
|
1841
|
+
|
|
1842
|
+
reset();
|
|
1843
|
+
}
|
|
1844
|
+
|
|
1845
|
+
bool empty() const { return n_ops == 0; }
|
|
1846
|
+
|
|
1847
|
+
// add buffer and return its index
|
|
1848
|
+
int add_buffer(ggml_hexagon_shared_buffer * sbuf) {
|
|
1849
|
+
// Lookup by fd
|
|
1850
|
+
auto it = b_map.find(sbuf->fd);
|
|
1851
|
+
if (it != b_map.end()) { return it->second; }
|
|
1852
|
+
|
|
1853
|
+
// Add new buffer to the batch
|
|
1854
|
+
int bi = n_bufs++;
|
|
1855
|
+
GGML_ASSERT(n_bufs < HTP_OP_MAX_BUFS);
|
|
1856
|
+
|
|
1857
|
+
b_map.insert({sbuf->fd, bi});
|
|
1858
|
+
|
|
1859
|
+
htp_buf_desc &b = h_bufs[bi];
|
|
1860
|
+
b.base = (uint64_t) sbuf->base;
|
|
1861
|
+
b.fd = sbuf->fd;
|
|
1862
|
+
b.size = sbuf->size;
|
|
1863
|
+
|
|
1864
|
+
b_vmem += b.size;
|
|
1865
|
+
|
|
1866
|
+
HEX_VERBOSE("ggml-hex: add-buffer #%u : fd %d base %p size %zu : vmem %zu\n", bi, b.fd, (void*) sbuf->base, (size_t) b.size, b_vmem);
|
|
1867
|
+
|
|
1868
|
+
return bi;
|
|
1869
|
+
}
|
|
1870
|
+
|
|
1871
|
+
bool same_shape(const htp_tensor * h, const ggml_tensor * t) const {
|
|
1872
|
+
return (h->ne[0] == t->ne[0]) && (h->ne[1] == t->ne[1]) && (h->ne[2] == t->ne[2]) && (h->ne[3] == t->ne[3]) &&
|
|
1873
|
+
(h->nb[0] == t->nb[0]) && (h->nb[1] == t->nb[1]) && (h->nb[2] == t->nb[2]) && (h->nb[3] == t->nb[3]);
|
|
1874
|
+
}
|
|
1875
|
+
|
|
1876
|
+
// add tensor and return its index
|
|
1877
|
+
int add_tensor(const ggml_tensor * t) {
|
|
1878
|
+
auto sbuf = static_cast<ggml_hexagon_shared_buffer *>(t->buffer->context);
|
|
1879
|
+
|
|
1880
|
+
// First lookup by tensor data
|
|
1881
|
+
auto range = d_map.equal_range(t->data);
|
|
1882
|
+
for (auto it = range.first; it != range.second; ++it) {
|
|
1883
|
+
htp_tensor * h = &h_tens[it->second];
|
|
1884
|
+
if (same_shape(h, t)) { return it->second; }
|
|
1885
|
+
}
|
|
1886
|
+
|
|
1887
|
+
// Lookup by tensor ptr
|
|
1888
|
+
auto it = t_map.find(t);
|
|
1889
|
+
if (it != t_map.end()) { return it->second; }
|
|
1890
|
+
|
|
1891
|
+
// Add new tensor to the batch
|
|
1892
|
+
int ti = n_tens++;
|
|
1893
|
+
GGML_ASSERT(n_tens <= n_tens_max);
|
|
1894
|
+
|
|
1895
|
+
t_map.insert({t, ti});
|
|
1896
|
+
d_map.insert({t->data, ti});
|
|
1897
|
+
|
|
1898
|
+
uint64_t t_offset = (uint8_t *) t->data - sbuf->base;
|
|
1899
|
+
size_t t_size = ggml_nbytes(t);
|
|
1900
|
+
|
|
1901
|
+
htp_tensor &h = h_tens[ti];
|
|
1902
|
+
h.bi = add_buffer(sbuf);
|
|
1903
|
+
h.data = t_offset;
|
|
1904
|
+
h.size = t_size;
|
|
1905
|
+
h.type = t->type;
|
|
1906
|
+
h.ne[0] = t->ne[0]; h.ne[1] = t->ne[1]; h.ne[2] = t->ne[2]; h.ne[3] = t->ne[3];
|
|
1907
|
+
h.nb[0] = t->nb[0]; h.nb[1] = t->nb[1]; h.nb[2] = t->nb[2]; h.nb[3] = t->nb[3];
|
|
1908
|
+
|
|
1909
|
+
h.flags = 0;
|
|
1910
|
+
if (ggml_backend_buffer_get_usage(t->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE) {
|
|
1911
|
+
h.flags |= HTP_TENSOR_COMPUTE;
|
|
1912
|
+
}
|
|
1913
|
+
|
|
1914
|
+
HEX_VERBOSE("ggml-hex: add-tensor #%u %s : bi %d data %p offset %zu size %zu flags 0x%x : %zu:%zu:%zu:%zu\n",
|
|
1915
|
+
ti, t->name, h.bi, (void*) t->data, (size_t) t_offset, t_size, h.flags,
|
|
1916
|
+
(size_t) t->ne[0], (size_t) t->ne[1], (size_t) t->ne[2], (size_t) t->ne[3]);
|
|
1917
|
+
|
|
1918
|
+
return ti;
|
|
1919
|
+
}
|
|
1920
|
+
|
|
1921
|
+
bool fit_op(const htp_opnode & node) const {
|
|
1922
|
+
if (n_ops >= n_ops_max ) return false;
|
|
1923
|
+
|
|
1924
|
+
// check how much extras we will need
|
|
1925
|
+
size_t extra_bufs = 0;
|
|
1926
|
+
size_t extra_vmem = 0;
|
|
1927
|
+
size_t extra_tens = 0;
|
|
1928
|
+
|
|
1929
|
+
auto fit_tensor = [&](const ggml_tensor *t) {
|
|
1930
|
+
if (!t) return;
|
|
1931
|
+
if (!t_map.count(t)) {
|
|
1932
|
+
extra_tens++;
|
|
1933
|
+
|
|
1934
|
+
auto sbuf = static_cast<ggml_hexagon_shared_buffer *>(t->buffer->context);
|
|
1935
|
+
if (!b_map.count(sbuf->fd)) {
|
|
1936
|
+
extra_vmem += sbuf->size;
|
|
1937
|
+
extra_bufs += 1;
|
|
1938
|
+
}
|
|
1939
|
+
}
|
|
1940
|
+
};
|
|
1941
|
+
|
|
1942
|
+
for (const auto * src : node.get_inputs()) {
|
|
1943
|
+
fit_tensor(src);
|
|
1944
|
+
}
|
|
1945
|
+
fit_tensor(node.dst());
|
|
1946
|
+
|
|
1947
|
+
if ((extra_bufs + n_bufs) > n_bufs_max) return false;
|
|
1948
|
+
if ((extra_tens + n_tens) > n_tens_max) return false;
|
|
1949
|
+
if ((extra_vmem + b_vmem) > b_vmem_max) return false;
|
|
1950
|
+
|
|
1951
|
+
return true;
|
|
1952
|
+
}
|
|
1953
|
+
|
|
1954
|
+
// assumes that fit_op() was called first and returned true
|
|
1955
|
+
void add_op(const htp_opnode & node) {
|
|
1956
|
+
// Add new op
|
|
1957
|
+
|
|
1958
|
+
unsigned int n = n_ops++;
|
|
1959
|
+
GGML_ASSERT(n_ops <= n_ops_max);
|
|
1960
|
+
|
|
1961
|
+
ops[n] = node;
|
|
1962
|
+
|
|
1963
|
+
htp_op_desc &o = h_ops[n];
|
|
1964
|
+
memcpy(&o.params, &node.node->op_params, sizeof(node.node->op_params));
|
|
1965
|
+
o.opcode = node.opcode;
|
|
1966
|
+
o.flags = 0;
|
|
1967
|
+
|
|
1968
|
+
if (!(opt_opstage & HTP_OPSTAGE_COMPUTE)) {
|
|
1969
|
+
o.flags |= HTP_OPFLAGS_SKIP_COMPUTE;
|
|
1970
|
+
}
|
|
1971
|
+
|
|
1972
|
+
ggml_hexagon_dump_op_exec(sess->c_name(), node, o.flags);
|
|
1973
|
+
|
|
1974
|
+
auto inputs = node.get_inputs();
|
|
1975
|
+
for (unsigned int i=0; i < HTP_OP_MAX_INPUTS; i++) {
|
|
1976
|
+
o.src[i] = (i < inputs.size() && inputs[i]) ? add_tensor(inputs[i]) : 0xffff;
|
|
1977
|
+
}
|
|
1978
|
+
o.dst = add_tensor(node.dst());
|
|
1979
|
+
}
|
|
1980
|
+
};
|
|
1981
|
+
|
|
1982
|
+
struct ggml_hexagon_opqueue {
|
|
1983
|
+
// Shared buffer for storing batches
|
|
1984
|
+
ggml_hexagon_shared_buffer *shm_buf;
|
|
1985
|
+
size_t shm_blk_size;
|
|
1986
|
+
|
|
1987
|
+
using opvec = std::vector<htp_opnode>;
|
|
1988
|
+
|
|
1989
|
+
std::queue<unsigned int> done; // completed batch ids
|
|
1990
|
+
std::vector<opvec> op_cache; // per batch op cache
|
|
1991
|
+
std::vector<uint64_t> start_usec; // per batch start time
|
|
1992
|
+
|
|
1993
|
+
ggml_hexagon_opqueue(ggml_hexagon_session *sess, size_t batch_size, size_t depth) {
|
|
1994
|
+
size_t n_bufs = HTP_OP_MAX_BUFS;
|
|
1995
|
+
size_t n_ops = batch_size;
|
|
1996
|
+
size_t n_tensors = n_ops + n_ops * HTP_OP_MAX_INPUTS;
|
|
1997
|
+
|
|
1998
|
+
shm_blk_size = sizeof(htp_buf_desc) * n_bufs +
|
|
1999
|
+
sizeof(htp_tensor) * n_tensors +
|
|
2000
|
+
sizeof(htp_op_desc) * n_ops +
|
|
2001
|
+
sizeof(htp_prof_desc) * n_ops;
|
|
2002
|
+
|
|
2003
|
+
shm_buf = new ggml_hexagon_shared_buffer(sess, shm_blk_size * depth, true /* pinned */);
|
|
2004
|
+
|
|
2005
|
+
op_cache.resize(depth);
|
|
2006
|
+
start_usec.resize(depth, 0);
|
|
2007
|
+
|
|
2008
|
+
// init done queue
|
|
2009
|
+
for (unsigned int i = 0; i < depth; i++) { done.push(i); }
|
|
2010
|
+
|
|
2011
|
+
if (opt_verbose) {
|
|
2012
|
+
GGML_LOG_INFO("ggml-hex: %s allocated op-queue : batch-size %zu depth %zu shm-size %zu shm-block-size %zu\n",
|
|
2013
|
+
sess->c_name(), batch_size, depth, shm_buf->size, shm_blk_size);
|
|
2014
|
+
}
|
|
2015
|
+
}
|
|
2016
|
+
|
|
2017
|
+
~ggml_hexagon_opqueue() {
|
|
2018
|
+
delete shm_buf;
|
|
2019
|
+
}
|
|
2020
|
+
|
|
2021
|
+
// push new batch
|
|
2022
|
+
bool push(htp_opbatch_req& req, dspqueue_buffer& dbuf, ggml_hexagon_opbatch* op_batch) {
|
|
2023
|
+
static_assert(sizeof(htp_opbatch_req) % 8 == 0, "sizeof(htp_opbatch_req) must be multiple of 8");
|
|
2024
|
+
static_assert(sizeof(htp_opbatch_rsp) % 8 == 0, "sizeof(htp_opbatch_rsp) must be multiple of 8");
|
|
2025
|
+
static_assert(sizeof(htp_buf_desc) % 8 == 0, "sizeof(htp_buf_desc) must be multiple of 8");
|
|
2026
|
+
static_assert(sizeof(htp_tensor) % 8 == 0, "sizeof(htp_tensor) must be multiple of 8");
|
|
2027
|
+
static_assert(sizeof(htp_op_desc) % 8 == 0, "sizeof(htp_op_desc) must be multiple of 8");
|
|
2028
|
+
static_assert(sizeof(htp_prof_desc) % 8 == 0, "sizeof(htp_prof_desc) must be multiple of 8");
|
|
2029
|
+
|
|
2030
|
+
if (done.empty()) { return false; }
|
|
2031
|
+
|
|
2032
|
+
req.id = done.front(); done.pop(); // batch id
|
|
2033
|
+
req.n_bufs = op_batch->n_bufs;
|
|
2034
|
+
req.n_tensors = op_batch->n_tens;
|
|
2035
|
+
req.n_ops = op_batch->n_ops;
|
|
2036
|
+
|
|
2037
|
+
op_cache[req.id] = op_batch->ops;
|
|
2038
|
+
start_usec[req.id] = ggml_time_us();
|
|
2039
|
+
|
|
2040
|
+
const size_t b_size = sizeof(htp_buf_desc) * req.n_bufs;
|
|
2041
|
+
const size_t t_size = sizeof(htp_tensor) * req.n_tensors;
|
|
2042
|
+
const size_t o_size = sizeof(htp_op_desc) * req.n_ops;
|
|
2043
|
+
const size_t p_size = sizeof(htp_prof_desc) * req.n_ops;
|
|
2044
|
+
|
|
2045
|
+
dbuf.ptr = shm_buf->base + (req.id * shm_blk_size);
|
|
2046
|
+
dbuf.fd = shm_buf->fd;
|
|
2047
|
+
dbuf.flags = DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT;
|
|
2048
|
+
dbuf.offset = (uint8_t*) dbuf.ptr - (uint8_t*) shm_buf->base;
|
|
2049
|
+
dbuf.size = b_size + t_size + o_size + p_size;
|
|
2050
|
+
|
|
2051
|
+
GGML_ASSERT(dbuf.size <= shm_blk_size);
|
|
2052
|
+
|
|
2053
|
+
uint8_t * m_ptr = (uint8_t*) dbuf.ptr;
|
|
2054
|
+
uint8_t * b_ptr = m_ptr; m_ptr += b_size;
|
|
2055
|
+
uint8_t * t_ptr = m_ptr; m_ptr += t_size;
|
|
2056
|
+
uint8_t * o_ptr = m_ptr;
|
|
2057
|
+
|
|
2058
|
+
memcpy(b_ptr, (void *) op_batch->h_bufs.data(), b_size);
|
|
2059
|
+
memcpy(t_ptr, (void *) op_batch->h_tens.data(), t_size);
|
|
2060
|
+
memcpy(o_ptr, (void *) op_batch->h_ops.data(), o_size);
|
|
2061
|
+
|
|
2062
|
+
HEX_VERBOSE("ggml-hex: %s op-queue push batch #%u : n-bufs %u n-tensors %u n-ops %u vmem %zu : b-size %zu t-size %zu o-size %zu m-size %zu\n",
|
|
2063
|
+
shm_buf->sess->c_name(), req.id, req.n_bufs, req.n_tensors, req.n_ops, op_batch->b_vmem,
|
|
2064
|
+
b_size, t_size, o_size, (size_t) dbuf.size);
|
|
2065
|
+
|
|
2066
|
+
op_batch->reset();
|
|
2067
|
+
|
|
2068
|
+
if (opt_verbose > 1) {
|
|
2069
|
+
htp_buf_desc *b = (htp_buf_desc*) b_ptr;
|
|
2070
|
+
for (unsigned int i=0; i < req.n_bufs; i++) {
|
|
2071
|
+
GGML_LOG_DEBUG("ggml-hex: %s htp-buf #%u : fd %d base %p size %zu\n", shm_buf->sess->c_name(), i,
|
|
2072
|
+
b[i].fd, (void *) b[i].base, (size_t) b[i].size);
|
|
2073
|
+
}
|
|
2074
|
+
htp_tensor *t = (htp_tensor*) t_ptr;
|
|
2075
|
+
for (unsigned int i=0; i < req.n_tensors; i++) {
|
|
2076
|
+
GGML_LOG_DEBUG("ggml-hex: %s htp-tensor #%u : bi %u offset %u size %u : %zu:%zu:%zu:%zu\n",
|
|
2077
|
+
shm_buf->sess->c_name(), i, t[i].bi, t[i].data, t[i].size,
|
|
2078
|
+
(size_t) t[i].ne[0], (size_t) t[i].ne[1], (size_t) t[i].ne[2], (size_t) t[i].ne[3]);
|
|
2079
|
+
}
|
|
2080
|
+
}
|
|
2081
|
+
|
|
2082
|
+
return true;
|
|
2083
|
+
}
|
|
2084
|
+
|
|
2085
|
+
void pop(htp_opbatch_rsp rsp, dspqueue_buffer dbuf) {
|
|
2086
|
+
GGML_ASSERT(rsp.id < op_cache.size());
|
|
2087
|
+
|
|
2088
|
+
done.push(rsp.id);
|
|
2089
|
+
|
|
2090
|
+
const size_t b_size = sizeof(htp_buf_desc) * rsp.n_bufs;
|
|
2091
|
+
const size_t t_size = sizeof(htp_tensor) * rsp.n_tensors;
|
|
2092
|
+
const size_t o_size = sizeof(htp_op_desc) * rsp.n_ops;
|
|
2093
|
+
const size_t p_size = sizeof(htp_prof_desc) * rsp.n_ops;
|
|
2094
|
+
|
|
2095
|
+
const size_t m_size = b_size + t_size + o_size + p_size;
|
|
2096
|
+
GGML_ASSERT(m_size <= shm_blk_size);
|
|
2097
|
+
|
|
2098
|
+
HEX_VERBOSE("ggml-hex: %s op-queue pop batch #%u : n-bufs %u n-tensors %u n-ops %u : m-size %zu b-size %zu t-size %zu o-size %zu\n",
|
|
2099
|
+
shm_buf->sess->c_name(), rsp.id, rsp.n_bufs, rsp.n_tensors, rsp.n_ops,
|
|
2100
|
+
(size_t) dbuf.size, b_size, t_size, o_size);
|
|
2101
|
+
|
|
2102
|
+
uint8_t * m_ptr = (uint8_t*) dbuf.ptr;
|
|
2103
|
+
uint8_t * p_ptr = m_ptr + (b_size + t_size + o_size);
|
|
2104
|
+
|
|
2105
|
+
if (opt_profile && rsp.n_ops > 0) {
|
|
2106
|
+
auto & ops = op_cache[rsp.id];
|
|
2107
|
+
|
|
2108
|
+
uint64_t batch_usec = ggml_time_us() - start_usec[rsp.id];
|
|
2109
|
+
uint32_t htp_usec = 0;
|
|
2110
|
+
|
|
2111
|
+
GGML_ASSERT(rsp.n_ops <= ops.size());
|
|
2112
|
+
|
|
2113
|
+
const htp_prof_desc * pd = (const htp_prof_desc *) p_ptr;
|
|
2114
|
+
for (uint32_t i = 0; i < rsp.n_ops; i++) {
|
|
2115
|
+
htp_usec += pd[i].usecs;
|
|
2116
|
+
ggml_hexagon_dump_op_prof(shm_buf->sess->name, ops[i], pd[i].usecs, pd[i].cycles, pd[i].pmu);
|
|
2117
|
+
}
|
|
2118
|
+
|
|
2119
|
+
GGML_LOG_DEBUG("ggml-hex: %s profile-batch n-ops %u batch-dur-usec %lld htp-ops-usec %u\n",
|
|
2120
|
+
shm_buf->sess->c_name(), rsp.n_ops, (long long) batch_usec, htp_usec);
|
|
2121
|
+
}
|
|
2122
|
+
}
|
|
2123
|
+
};
|
|
2124
|
+
|
|
2125
|
+
// Flush HTP response queue i.e wait for all outstanding requests to complete
|
|
2126
|
+
void ggml_hexagon_session::flush_pending(bool all) {
|
|
2127
|
+
while (this->op_pending) {
|
|
2128
|
+
struct htp_opbatch_rsp rsp;
|
|
2129
|
+
uint32_t rsp_size;
|
|
2130
|
+
uint32_t flags;
|
|
2131
|
+
|
|
2132
|
+
struct dspqueue_buffer dbuf;
|
|
2133
|
+
uint32_t n_dbufs;
|
|
2134
|
+
|
|
2135
|
+
// Read response packet from queue
|
|
2136
|
+
const uint32_t timeo = opt_oppoll ? 0 : DSPQUEUE_TIMEOUT;
|
|
2137
|
+
int err = dspqueue_read(this->queue, &flags, 1, &n_dbufs, &dbuf, sizeof(rsp), &rsp_size, (uint8_t *) &rsp, timeo);
|
|
2138
|
+
if (err == AEE_EEXPIRED) {
|
|
2139
|
+
continue;
|
|
2140
|
+
}
|
|
2141
|
+
|
|
2142
|
+
if (err != 0) {
|
|
2143
|
+
GGML_ABORT("ggml-hex: dspqueue_read failed: 0x%08x\n", (unsigned) err);
|
|
2144
|
+
}
|
|
2145
|
+
|
|
2146
|
+
// Basic sanity checks
|
|
2147
|
+
if (rsp_size != sizeof(rsp) || n_dbufs != 1) {
|
|
2148
|
+
GGML_ABORT("ggml-hex: %s dspcall : bad response : size %u dspbufs %u\n", this->c_name(), rsp_size, n_dbufs);
|
|
2149
|
+
}
|
|
2150
|
+
|
|
2151
|
+
if (rsp.status != HTP_STATUS_OK) {
|
|
2152
|
+
GGML_LOG_ERROR("ggml-hex: %s dspcall : dsp-rsp: %s\n", this->c_name(), status_to_str(rsp.status));
|
|
2153
|
+
// TODO: handle errors
|
|
2154
|
+
}
|
|
2155
|
+
|
|
2156
|
+
op_queue->pop(rsp, dbuf);
|
|
2157
|
+
|
|
2158
|
+
this->op_pending--; // atomic dec
|
|
2159
|
+
|
|
2160
|
+
if (!all) break;
|
|
2161
|
+
}
|
|
2162
|
+
}
|
|
2163
|
+
|
|
2164
|
+
void ggml_hexagon_session::flush_batch() {
|
|
2165
|
+
if (op_batch->empty()) { return; }
|
|
2166
|
+
|
|
2167
|
+
htp_opbatch_req req {};
|
|
2168
|
+
dspqueue_buffer dbuf{};
|
|
2169
|
+
|
|
2170
|
+
if (!op_queue->push(req, dbuf, op_batch)) {
|
|
2171
|
+
flush_pending(false);
|
|
2172
|
+
op_queue->push(req, dbuf, op_batch);
|
|
2173
|
+
}
|
|
2174
|
+
|
|
2175
|
+
// Bump pending flag (cleared in the session::flush once we get the response)
|
|
2176
|
+
this->op_pending++; // atomic inc
|
|
2177
|
+
|
|
2178
|
+
HEX_VERBOSE("ggml-hex: %s queue-opbatch: %p size %u\n", this->c_name(), dbuf.ptr, dbuf.size);
|
|
2179
|
+
|
|
2180
|
+
int err = dspqueue_write(this->queue, 0, 1, &dbuf, sizeof(req), (const uint8_t*) &req, DSPQUEUE_TIMEOUT);
|
|
2181
|
+
if (err != 0) {
|
|
2182
|
+
GGML_ABORT("ggml-hex: %s dspqueue_write failed: 0x%08x\n", this->c_name(), (unsigned) err);
|
|
2183
|
+
}
|
|
2184
|
+
}
|
|
2185
|
+
|
|
2186
|
+
void ggml_hexagon_session::enqueue_op(const htp_opnode & node) {
|
|
2187
|
+
if (!op_batch->fit_op(node)) {
|
|
2188
|
+
flush_batch();
|
|
2189
|
+
}
|
|
2190
|
+
op_batch->add_op(node);
|
|
2191
|
+
}
|
|
2192
|
+
|
|
2193
|
+
// Flush HTP response queue i.e wait for all outstanding requests to complete
|
|
2194
|
+
void ggml_hexagon_session::flush(bool all) {
|
|
2195
|
+
flush_batch();
|
|
2196
|
+
flush_pending(all);
|
|
2197
|
+
}
|
|
2198
|
+
|
|
2199
|
+
// Probe how much pinned shared memory can be mapped for `sess`.
// Returns the number of bytes that were successfully allocated minus one 256 MiB step,
// or 0 when less than one step could be allocated.
static size_t ggml_hexagon_measure_max_vmem(ggml_hexagon_session *sess) {
    // Allocate a bunch of pinned buffers until allocation fails.
    // This is kind of expensive but handy for figuring out exactly how much we can mmap on a specific device.
    // Typically we're going to allocate all/most of these buffers anyway for the model weights.

    std::vector<ggml_hexagon_shared_buffer *> sbufs;

    const size_t MiB = 1024 * 1024;
    const size_t GiB = MiB * 1024;

    size_t       vmem = 0;
    const size_t step = 256u * MiB;

    try {
        // Start with three 1 GiB buffers, then continue in 256 MiB steps
        sbufs.push_back(new ggml_hexagon_shared_buffer(sess, GiB, true)); vmem += GiB;
        sbufs.push_back(new ggml_hexagon_shared_buffer(sess, GiB, true)); vmem += GiB;
        sbufs.push_back(new ggml_hexagon_shared_buffer(sess, GiB, true)); vmem += GiB;

        while (1) {
            sbufs.push_back(new ggml_hexagon_shared_buffer(sess, step, true));
            vmem += step;
        }
    } catch (...) {
        // Expected: an allocation failure is what terminates the probe
    }

    for (auto b : sbufs) { delete b; }

    // backoff to account for overhead from internal mappings;
    // clamp so that a failure of the very first allocation (vmem == 0) does not wrap around
    return vmem > step ? vmem - step : 0;
}
|
|
2227
|
+
|
|
1545
2228
|
void ggml_hexagon_session::allocate(int dev_id) noexcept(false) {
|
|
1546
2229
|
this->valid_session = false;
|
|
1547
2230
|
this->valid_handle = false;
|
|
@@ -1554,11 +2237,8 @@ void ggml_hexagon_session::allocate(int dev_id) noexcept(false) {
|
|
|
1554
2237
|
this->name = std::string("HTP") + std::to_string(dev_id);
|
|
1555
2238
|
|
|
1556
2239
|
this->op_pending = 0;
|
|
1557
|
-
this->prof_usecs = 0;
|
|
1558
|
-
this->prof_cycles = 0;
|
|
1559
|
-
this->prof_pkts = 0;
|
|
1560
2240
|
|
|
1561
|
-
|
|
2241
|
+
GGML_LOG_DEBUG("ggml-hex: %s allocating new session\n", this->name.c_str());
|
|
1562
2242
|
|
|
1563
2243
|
domain * my_domain = get_domain(this->domain_id);
|
|
1564
2244
|
if (my_domain == NULL) {
|
|
@@ -1634,9 +2314,6 @@ void ggml_hexagon_session::allocate(int dev_id) noexcept(false) {
|
|
|
1634
2314
|
|
|
1635
2315
|
this->valid_handle = true;
|
|
1636
2316
|
|
|
1637
|
-
GGML_LOG_INFO("ggml-hex: new session: %s : session-id %d domain-id %d uri %s handle 0x%lx\n", this->name.c_str(),
|
|
1638
|
-
this->session_id, this->domain_id, session_uri, (unsigned long) this->handle);
|
|
1639
|
-
|
|
1640
2317
|
// Enable FastRPC QoS mode
|
|
1641
2318
|
{
|
|
1642
2319
|
struct remote_rpc_control_latency l;
|
|
@@ -1648,11 +2325,17 @@ void ggml_hexagon_session::allocate(int dev_id) noexcept(false) {
|
|
|
1648
2325
|
}
|
|
1649
2326
|
}
|
|
1650
2327
|
|
|
2328
|
+
GGML_LOG_INFO("ggml-hex: %s new session : session-id %d domain-id %d uri %s handle 0x%lx\n", this->c_name(),
|
|
2329
|
+
this->session_id, this->domain_id, session_uri, (unsigned long) this->handle);
|
|
2330
|
+
|
|
2331
|
+
const size_t req_q_size = (sizeof(htp_opbatch_req) * opt_opqueue * 2) + 1024;
|
|
2332
|
+
const size_t rsp_q_size = (sizeof(htp_opbatch_rsp) * opt_opqueue * 2) + 1024;
|
|
2333
|
+
|
|
1651
2334
|
// Now let's setup the DSP queue
|
|
1652
2335
|
err = dspqueue_create(this->domain_id,
|
|
1653
2336
|
0, // Flags
|
|
1654
|
-
|
|
1655
|
-
|
|
2337
|
+
req_q_size, // Request queue size (in bytes)
|
|
2338
|
+
rsp_q_size, // Response queue size (in bytes)
|
|
1656
2339
|
nullptr, // Read packet callback (we handle reads explicitly)
|
|
1657
2340
|
nullptr, // Error callback (we handle errors during reads)
|
|
1658
2341
|
(void *) this, // Callback context
|
|
@@ -1672,18 +2355,36 @@ void ggml_hexagon_session::allocate(int dev_id) noexcept(false) {
|
|
|
1672
2355
|
}
|
|
1673
2356
|
|
|
1674
2357
|
if (opt_etm) {
|
|
1675
|
-
err =
|
|
2358
|
+
err = htp_iface_etm(this->handle, 1);
|
|
1676
2359
|
if (err != 0) {
|
|
1677
2360
|
GGML_LOG_ERROR("ggml-hex: failed to enable ETM tracing: 0x%08x\n", (unsigned) err);
|
|
1678
2361
|
}
|
|
1679
2362
|
}
|
|
1680
2363
|
|
|
1681
|
-
|
|
1682
|
-
|
|
1683
|
-
|
|
1684
|
-
|
|
2364
|
+
if (opt_profile) {
|
|
2365
|
+
htp_iface_pmu_conf pmu_conf{};
|
|
2366
|
+
std::copy(opt_pmu_evt.begin(), opt_pmu_evt.end(), pmu_conf.events);
|
|
2367
|
+
|
|
2368
|
+
err = htp_iface_profiler(this->handle, opt_profile, &pmu_conf);
|
|
2369
|
+
if (err != 0) {
|
|
2370
|
+
GGML_LOG_ERROR("ggml-hex: failed to enable profiling: 0x%08x\n", (unsigned) err);
|
|
2371
|
+
}
|
|
2372
|
+
}
|
|
2373
|
+
|
|
2374
|
+
// Allocate buffers and state for op batching
|
|
2375
|
+
this->op_queue = new ggml_hexagon_opqueue(this, opt_opbatch, opt_opqueue);
|
|
2376
|
+
|
|
2377
|
+
if (!opt_vmem) {
|
|
2378
|
+
opt_vmem = ggml_hexagon_measure_max_vmem(this);
|
|
2379
|
+
GGML_LOG_INFO("ggml-hex: %s measured max vmem %zu\n", this->c_name(), opt_vmem);
|
|
2380
|
+
}
|
|
2381
|
+
|
|
2382
|
+
this->op_batch = new ggml_hexagon_opbatch(this, opt_opbatch, opt_vmem);
|
|
2383
|
+
|
|
2384
|
+
// Start dspqueue/opbatch processing
|
|
2385
|
+
err = htp_iface_start(this->handle, dev_id, this->queue_id, opt_nhvx, opt_use_hmx, opt_vmem);
|
|
1685
2386
|
if (err != 0) {
|
|
1686
|
-
GGML_LOG_ERROR("ggml-hex: failed to start session: 0x%08x\n", (unsigned) err);
|
|
2387
|
+
GGML_LOG_ERROR("ggml-hex: %s failed to start session: 0x%08x\n", this->c_name(), (unsigned) err);
|
|
1687
2388
|
throw std::runtime_error("ggml-hex: iface start failed (see log for details)");
|
|
1688
2389
|
}
|
|
1689
2390
|
this->valid_iface = true;
|
|
@@ -1694,21 +2395,32 @@ void ggml_hexagon_session::release() noexcept(true) {
|
|
|
1694
2395
|
|
|
1695
2396
|
int err;
|
|
1696
2397
|
|
|
1697
|
-
// Stop the DSP-side service and close the queue
|
|
1698
2398
|
if (this->valid_iface) {
|
|
2399
|
+
// Stop dspqueue/opbatch processing
|
|
1699
2400
|
err = htp_iface_stop(this->handle);
|
|
1700
2401
|
if (err != 0) {
|
|
1701
2402
|
GGML_ABORT("ggml-hex: htp_iface_stop failed: 0x%08x\n", (unsigned) err);
|
|
1702
2403
|
}
|
|
1703
2404
|
}
|
|
1704
2405
|
|
|
2406
|
+
delete this->op_batch;
|
|
2407
|
+
delete this->op_queue;
|
|
2408
|
+
|
|
1705
2409
|
if (opt_etm) {
|
|
1706
|
-
err =
|
|
2410
|
+
err = htp_iface_etm(this->handle, 0);
|
|
1707
2411
|
if (err != 0) {
|
|
1708
2412
|
GGML_LOG_ERROR("ggml-hex: warn : failed to disable ETM tracing: 0x%08x\n", (unsigned) err);
|
|
1709
2413
|
}
|
|
1710
2414
|
}
|
|
1711
2415
|
|
|
2416
|
+
if (opt_profile) {
|
|
2417
|
+
htp_iface_pmu_conf pmu_conf{};
|
|
2418
|
+
err = htp_iface_profiler(this->handle, 0, &pmu_conf);
|
|
2419
|
+
if (err != 0) {
|
|
2420
|
+
GGML_LOG_ERROR("ggml-hex: warn : failed to disable profiling: 0x%08x\n", (unsigned) err);
|
|
2421
|
+
}
|
|
2422
|
+
}
|
|
2423
|
+
|
|
1712
2424
|
if (this->valid_queue) {
|
|
1713
2425
|
err = dspqueue_close(queue);
|
|
1714
2426
|
if (err != 0) {
|
|
@@ -1725,6 +2437,9 @@ ggml_hexagon_session::ggml_hexagon_session(int dev_id, ggml_backend_dev_t dev) n
|
|
|
1725
2437
|
buffer_type.device = dev;
|
|
1726
2438
|
repack_buffer_type.device = dev;
|
|
1727
2439
|
|
|
2440
|
+
op_batch = nullptr;
|
|
2441
|
+
op_queue = nullptr;
|
|
2442
|
+
|
|
1728
2443
|
try {
|
|
1729
2444
|
allocate(dev_id);
|
|
1730
2445
|
|
|
@@ -1753,24 +2468,10 @@ static bool ggml_backend_buffer_is_hexagon(const struct ggml_backend_buffer * b)
|
|
|
1753
2468
|
}
|
|
1754
2469
|
|
|
1755
2470
|
static inline bool ggml_backend_buffer_is_hexagon_repack(const struct ggml_backend_buffer * b) {
|
|
1756
|
-
|
|
1757
|
-
|
|
1758
|
-
|
|
1759
|
-
static bool hex_supported_dims2(const struct ggml_tensor * x, const struct ggml_tensor * y) {
|
|
1760
|
-
if (x->ne[0] != y->ne[0]) {
|
|
1761
|
-
return false;
|
|
1762
|
-
}
|
|
1763
|
-
if (x->ne[1] != y->ne[1]) {
|
|
1764
|
-
return false;
|
|
1765
|
-
}
|
|
1766
|
-
if (x->ne[2] != y->ne[2]) {
|
|
1767
|
-
return false;
|
|
2471
|
+
if (!opt_hostbuf) {
|
|
2472
|
+
return ggml_backend_buffer_is_hexagon(b);
|
|
1768
2473
|
}
|
|
1769
|
-
|
|
1770
|
-
return false;
|
|
1771
|
-
}
|
|
1772
|
-
|
|
1773
|
-
return true;
|
|
2474
|
+
return b->buft->iface.alloc_buffer == ggml_backend_hexagon_repack_buffer_type_alloc_buffer;
|
|
1774
2475
|
}
|
|
1775
2476
|
|
|
1776
2477
|
static bool ggml_hexagon_supported_flash_attn_ext(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
|
|
@@ -1801,44 +2502,64 @@ static bool ggml_hexagon_supported_flash_attn_ext(const struct ggml_hexagon_sess
|
|
|
1801
2502
|
return false;
|
|
1802
2503
|
}
|
|
1803
2504
|
|
|
1804
|
-
|
|
1805
|
-
|
|
2505
|
+
if (dst->ne[3] != 1) {
|
|
2506
|
+
return false;
|
|
2507
|
+
}
|
|
1806
2508
|
|
|
1807
|
-
|
|
1808
|
-
return t == GGML_TYPE_F32;
|
|
2509
|
+
return true;
|
|
1809
2510
|
}
|
|
1810
2511
|
|
|
1811
|
-
static bool
|
|
1812
|
-
|
|
1813
|
-
|
|
2512
|
+
static bool ggml_hexagon_supported_gated_delta_net(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
|
|
2513
|
+
const struct ggml_tensor * q = op->src[0];
|
|
2514
|
+
const struct ggml_tensor * k = op->src[1];
|
|
2515
|
+
const struct ggml_tensor * v = op->src[2];
|
|
2516
|
+
const struct ggml_tensor * g = op->src[3];
|
|
2517
|
+
const struct ggml_tensor * beta = op->src[4];
|
|
2518
|
+
const struct ggml_tensor * state = op->src[5];
|
|
2519
|
+
const struct ggml_tensor * dst = op;
|
|
1814
2520
|
|
|
1815
|
-
|
|
1816
|
-
|
|
1817
|
-
}
|
|
2521
|
+
if (!q || !k || !v || !g || !beta || !state) {
|
|
2522
|
+
return false;
|
|
2523
|
+
}
|
|
1818
2524
|
|
|
1819
|
-
|
|
1820
|
-
|
|
1821
|
-
|
|
2525
|
+
if (q->type != GGML_TYPE_F32 || k->type != GGML_TYPE_F32 || v->type != GGML_TYPE_F32 ||
|
|
2526
|
+
g->type != GGML_TYPE_F32 || beta->type != GGML_TYPE_F32 || state->type != GGML_TYPE_F32 ||
|
|
2527
|
+
dst->type != GGML_TYPE_F32) {
|
|
2528
|
+
return false;
|
|
2529
|
+
}
|
|
1822
2530
|
|
|
1823
|
-
|
|
1824
|
-
|
|
1825
|
-
|
|
2531
|
+
if (!ggml_is_contiguous_rows(q) || !ggml_is_contiguous_rows(k) || !ggml_is_contiguous_rows(v) ||
|
|
2532
|
+
!ggml_is_contiguous(g) || !ggml_is_contiguous(beta) || !ggml_is_contiguous(state) ||
|
|
2533
|
+
!ggml_is_contiguous(dst)) {
|
|
2534
|
+
return false;
|
|
2535
|
+
}
|
|
1826
2536
|
|
|
1827
|
-
|
|
1828
|
-
|
|
1829
|
-
|
|
2537
|
+
const int64_t S_v = v->ne[0];
|
|
2538
|
+
const int64_t H = v->ne[1];
|
|
2539
|
+
const int64_t n_tokens = v->ne[2];
|
|
2540
|
+
const int64_t n_seqs = v->ne[3];
|
|
2541
|
+
const int64_t K = ggml_get_op_params_i32(op, 0);
|
|
1830
2542
|
|
|
1831
|
-
|
|
1832
|
-
|
|
1833
|
-
|
|
2543
|
+
if (S_v <= 0 || S_v > 128 || H <= 0 || n_tokens <= 0 || n_seqs <= 0) {
|
|
2544
|
+
return false;
|
|
2545
|
+
}
|
|
2546
|
+
if (q->ne[0] != S_v || k->ne[0] != S_v || q->ne[1] <= 0 || k->ne[1] <= 0 ||
|
|
2547
|
+
q->ne[2] != n_tokens || k->ne[2] != n_tokens || q->ne[3] <= 0 || k->ne[3] <= 0 ||
|
|
2548
|
+
(n_seqs % q->ne[3]) != 0 || (n_seqs % k->ne[3]) != 0) {
|
|
1834
2549
|
return false;
|
|
1835
2550
|
}
|
|
1836
|
-
if (
|
|
2551
|
+
if ((g->ne[0] != 1 && g->ne[0] != S_v) || beta->ne[0] != 1) {
|
|
1837
2552
|
return false;
|
|
1838
2553
|
}
|
|
1839
|
-
|
|
2554
|
+
// state holds s0 only [S_v, S_v, H, n_seqs]; K is op param 0.
|
|
2555
|
+
if (ggml_nelements(state) != S_v * S_v * H * n_seqs) {
|
|
1840
2556
|
return false;
|
|
1841
2557
|
}
|
|
2558
|
+
if (dst->ne[0] != S_v * H || dst->ne[1] != n_tokens * n_seqs + S_v * n_seqs * K) {
|
|
2559
|
+
return false;
|
|
2560
|
+
}
|
|
2561
|
+
|
|
2562
|
+
GGML_UNUSED(sess);
|
|
1842
2563
|
return true;
|
|
1843
2564
|
}
|
|
1844
2565
|
|
|
@@ -1856,18 +2577,20 @@ static bool ggml_hexagon_supported_mul_mat(const struct ggml_hexagon_session * s
|
|
|
1856
2577
|
|
|
1857
2578
|
switch (src0->type) {
|
|
1858
2579
|
case GGML_TYPE_Q4_0:
|
|
2580
|
+
case GGML_TYPE_Q4_1:
|
|
1859
2581
|
case GGML_TYPE_Q8_0:
|
|
2582
|
+
case GGML_TYPE_IQ4_NL:
|
|
1860
2583
|
case GGML_TYPE_MXFP4:
|
|
1861
2584
|
if (src0->ne[0] % 32) {
|
|
1862
2585
|
return false;
|
|
1863
2586
|
}
|
|
1864
2587
|
|
|
1865
|
-
if (src0
|
|
2588
|
+
if (ggml_nrows(src0) > 16 * 1024) {
|
|
1866
2589
|
return false; // typically the lm-head which would be too large for VTCM
|
|
1867
2590
|
}
|
|
1868
2591
|
|
|
1869
|
-
if ((src1->ne[2] != 1 || src1->ne[3] != 1)
|
|
1870
|
-
return false;
|
|
2592
|
+
if (ggml_nrows(src1) > 1024 || src1->ne[2] != 1 || src1->ne[3] != 1) {
|
|
2593
|
+
return false; // no huge batches or broadcasting (for now)
|
|
1871
2594
|
}
|
|
1872
2595
|
|
|
1873
2596
|
// src0 (weights) must be repacked
|
|
@@ -1881,6 +2604,30 @@ static bool ggml_hexagon_supported_mul_mat(const struct ggml_hexagon_session * s
|
|
|
1881
2604
|
GGML_LOG_DEBUG("ggml_hexagon_supported_mul_mat: permuted F16 src0 not supported\n");
|
|
1882
2605
|
return false;
|
|
1883
2606
|
}
|
|
2607
|
+
if (src1->ne[2] < src0->ne[2] || src1->ne[3] < src0->ne[3]) {
|
|
2608
|
+
GGML_LOG_DEBUG("ggml_hexagon_supported_mul_mat: src1 broadcasting not supported\n");
|
|
2609
|
+
return false;
|
|
2610
|
+
}
|
|
2611
|
+
if (ggml_nrows(src1) > 1024) {
|
|
2612
|
+
return false; // no huge batches (for now)
|
|
2613
|
+
}
|
|
2614
|
+
break;
|
|
2615
|
+
|
|
2616
|
+
case GGML_TYPE_F32:
|
|
2617
|
+
if (src1->type != GGML_TYPE_F32) {
|
|
2618
|
+
return false;
|
|
2619
|
+
}
|
|
2620
|
+
if (src0->nb[1] < src0->nb[0]) {
|
|
2621
|
+
GGML_LOG_DEBUG("ggml_hexagon_supported_mul_mat: permuted F32 src0 not supported\n");
|
|
2622
|
+
return false;
|
|
2623
|
+
}
|
|
2624
|
+
if (src1->ne[2] < src0->ne[2] || src1->ne[3] < src0->ne[3]) {
|
|
2625
|
+
GGML_LOG_DEBUG("ggml_hexagon_supported_mul_mat: src1 broadcasting not supported\n");
|
|
2626
|
+
return false;
|
|
2627
|
+
}
|
|
2628
|
+
if (ggml_nrows(src1) > 1024) {
|
|
2629
|
+
return false; // no huge batches (for now)
|
|
2630
|
+
}
|
|
1884
2631
|
break;
|
|
1885
2632
|
|
|
1886
2633
|
default:
|
|
@@ -1902,7 +2649,9 @@ static bool ggml_hexagon_supported_mul_mat_id(const struct ggml_hexagon_session
|
|
|
1902
2649
|
|
|
1903
2650
|
switch (src0->type) {
|
|
1904
2651
|
case GGML_TYPE_Q4_0:
|
|
2652
|
+
case GGML_TYPE_Q4_1:
|
|
1905
2653
|
case GGML_TYPE_Q8_0:
|
|
2654
|
+
case GGML_TYPE_IQ4_NL:
|
|
1906
2655
|
case GGML_TYPE_MXFP4:
|
|
1907
2656
|
if ((src0->ne[0] % 32)) {
|
|
1908
2657
|
return false;
|
|
@@ -1926,24 +2675,30 @@ static bool ggml_hexagon_supported_binary(const struct ggml_hexagon_session * se
|
|
|
1926
2675
|
const struct ggml_tensor * src1 = op->src[1];
|
|
1927
2676
|
const struct ggml_tensor * dst = op;
|
|
1928
2677
|
|
|
1929
|
-
if (
|
|
1930
|
-
|
|
1931
|
-
|
|
1932
|
-
|
|
1933
|
-
|
|
2678
|
+
if (src0->type == GGML_TYPE_F32) {
|
|
2679
|
+
if (src1->type != GGML_TYPE_F32) {
|
|
2680
|
+
return false;
|
|
2681
|
+
}
|
|
2682
|
+
if (dst->type != GGML_TYPE_F32) {
|
|
2683
|
+
return false;
|
|
2684
|
+
}
|
|
1934
2685
|
}
|
|
1935
|
-
if (
|
|
1936
|
-
|
|
2686
|
+
else if (src0->type == GGML_TYPE_F16) {
|
|
2687
|
+
if (src1->type != GGML_TYPE_F16) {
|
|
2688
|
+
return false;
|
|
2689
|
+
}
|
|
2690
|
+
if (dst->type != GGML_TYPE_F16) {
|
|
2691
|
+
return false;
|
|
2692
|
+
}
|
|
1937
2693
|
}
|
|
1938
|
-
|
|
2694
|
+
else {
|
|
1939
2695
|
return false;
|
|
1940
2696
|
}
|
|
1941
|
-
|
|
2697
|
+
|
|
2698
|
+
if (!ggml_are_same_shape(src0, dst)) {
|
|
1942
2699
|
return false;
|
|
1943
2700
|
}
|
|
1944
|
-
|
|
1945
|
-
// TODO: add support for non-contigiuos tensors
|
|
1946
|
-
if (!ggml_is_contiguous(src0) || !ggml_is_contiguous(src1) || !ggml_is_contiguous(dst)) {
|
|
2701
|
+
if (!ggml_can_repeat(src1, src0) || ggml_is_permuted(src1)) {
|
|
1947
2702
|
return false;
|
|
1948
2703
|
}
|
|
1949
2704
|
|
|
@@ -1955,16 +2710,16 @@ static bool ggml_hexagon_supported_add_id(const struct ggml_hexagon_session * se
|
|
|
1955
2710
|
const struct ggml_tensor * src1 = op->src[1];
|
|
1956
2711
|
const struct ggml_tensor * dst = op;
|
|
1957
2712
|
|
|
1958
|
-
if (
|
|
2713
|
+
if (src0->type != GGML_TYPE_F32) {
|
|
1959
2714
|
return false;
|
|
1960
2715
|
}
|
|
1961
|
-
if (
|
|
2716
|
+
if (src1->type != GGML_TYPE_F32) {
|
|
1962
2717
|
return false;
|
|
1963
2718
|
}
|
|
1964
|
-
if (
|
|
2719
|
+
if (dst->type != GGML_TYPE_F32) {
|
|
1965
2720
|
return false;
|
|
1966
2721
|
}
|
|
1967
|
-
if (!
|
|
2722
|
+
if (!ggml_are_same_shape(src0, dst)) {
|
|
1968
2723
|
return false;
|
|
1969
2724
|
}
|
|
1970
2725
|
|
|
@@ -1980,13 +2735,32 @@ static bool ggml_hexagon_supported_unary(const struct ggml_hexagon_session * ses
|
|
|
1980
2735
|
const struct ggml_tensor * src0 = op->src[0];
|
|
1981
2736
|
const struct ggml_tensor * dst = op;
|
|
1982
2737
|
|
|
1983
|
-
if (
|
|
2738
|
+
if (src0->type != GGML_TYPE_F32) {
|
|
2739
|
+
return false;
|
|
2740
|
+
}
|
|
2741
|
+
if (dst->type != GGML_TYPE_F32) {
|
|
2742
|
+
return false;
|
|
2743
|
+
}
|
|
2744
|
+
if (!ggml_are_same_shape(src0, dst)) {
|
|
2745
|
+
return false;
|
|
2746
|
+
}
|
|
2747
|
+
|
|
2748
|
+
// dst must be contiguous; src0 may be non-contiguous
|
|
2749
|
+
if (!ggml_is_contiguous(dst)) {
|
|
1984
2750
|
return false;
|
|
1985
2751
|
}
|
|
1986
|
-
|
|
2752
|
+
|
|
2753
|
+
return true;
|
|
2754
|
+
}
|
|
2755
|
+
|
|
2756
|
+
static bool ggml_hexagon_supported_sum_rows(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
|
|
2757
|
+
const struct ggml_tensor * src0 = op->src[0];
|
|
2758
|
+
const struct ggml_tensor * dst = op;
|
|
2759
|
+
|
|
2760
|
+
if (src0->type != GGML_TYPE_F32) {
|
|
1987
2761
|
return false;
|
|
1988
2762
|
}
|
|
1989
|
-
if (
|
|
2763
|
+
if (dst->type != GGML_TYPE_F32) {
|
|
1990
2764
|
return false;
|
|
1991
2765
|
}
|
|
1992
2766
|
|
|
@@ -2004,10 +2778,10 @@ static bool ggml_hexagon_supported_activations(const struct ggml_hexagon_session
|
|
|
2004
2778
|
const struct ggml_tensor * src1 = op->src[1];
|
|
2005
2779
|
const struct ggml_tensor * dst = op;
|
|
2006
2780
|
|
|
2007
|
-
if (
|
|
2781
|
+
if (src0->type != GGML_TYPE_F32) {
|
|
2008
2782
|
return false;
|
|
2009
2783
|
}
|
|
2010
|
-
if (
|
|
2784
|
+
if (dst->type != GGML_TYPE_F32) {
|
|
2011
2785
|
return false;
|
|
2012
2786
|
}
|
|
2013
2787
|
|
|
@@ -2016,10 +2790,10 @@ static bool ggml_hexagon_supported_activations(const struct ggml_hexagon_session
|
|
|
2016
2790
|
}
|
|
2017
2791
|
|
|
2018
2792
|
if (src1) {
|
|
2019
|
-
if (
|
|
2793
|
+
if (src1->type != GGML_TYPE_F32) {
|
|
2020
2794
|
return false;
|
|
2021
2795
|
}
|
|
2022
|
-
if (!
|
|
2796
|
+
if (!ggml_are_same_shape(src0, src1)) {
|
|
2023
2797
|
return false;
|
|
2024
2798
|
}
|
|
2025
2799
|
if (!ggml_is_contiguous(src1)) {
|
|
@@ -2040,15 +2814,15 @@ static bool ggml_hexagon_supported_softmax(const struct ggml_hexagon_session * s
|
|
|
2040
2814
|
return false; // FIXME: add support for sinks
|
|
2041
2815
|
}
|
|
2042
2816
|
|
|
2043
|
-
if (
|
|
2817
|
+
if (src0->type != GGML_TYPE_F32) {
|
|
2044
2818
|
return false;
|
|
2045
2819
|
}
|
|
2046
|
-
if (
|
|
2820
|
+
if (dst->type != GGML_TYPE_F32) {
|
|
2047
2821
|
return false;
|
|
2048
2822
|
}
|
|
2049
2823
|
|
|
2050
2824
|
if (src1) {
|
|
2051
|
-
if (
|
|
2825
|
+
if (src1->type != GGML_TYPE_F32 && src1->type != GGML_TYPE_F16) {
|
|
2052
2826
|
return false;
|
|
2053
2827
|
}
|
|
2054
2828
|
if (src0->ne[0] != src1->ne[0]) {
|
|
@@ -2075,6 +2849,23 @@ static bool ggml_hexagon_supported_softmax(const struct ggml_hexagon_session * s
|
|
|
2075
2849
|
}
|
|
2076
2850
|
}
|
|
2077
2851
|
|
|
2852
|
+
// Reject non-HVX-aligned sizes when ne[0] > HVX_F32_LANES
|
|
2853
|
+
// The HVX softmax implementation has issues with tail handling for larger non-aligned sizes
|
|
2854
|
+
// Small sizes (ne[0] <= 32) work correctly with tail-only processing
|
|
2855
|
+
const int64_t ne0 = src0->ne[0];
|
|
2856
|
+
if (ne0 > 32 && (ne0 & (32 - 1)) != 0) {
|
|
2857
|
+
return false;
|
|
2858
|
+
}
|
|
2859
|
+
|
|
2860
|
+
// HVX vector size constraints for softmax
|
|
2861
|
+
#define SOFTMAX_MAX_ROW_SIZE 131072 // 128K elements max for numerical precision
|
|
2862
|
+
|
|
2863
|
+
// Reject very large row sizes to avoid numerical precision issues
|
|
2864
|
+
// Softmax accumulation over many elements can lead to precision loss
|
|
2865
|
+
if (ne0 > SOFTMAX_MAX_ROW_SIZE) {
|
|
2866
|
+
return false;
|
|
2867
|
+
}
|
|
2868
|
+
|
|
2078
2869
|
return true;
|
|
2079
2870
|
}
|
|
2080
2871
|
|
|
@@ -2118,12 +2909,32 @@ static bool ggml_hexagon_supported_get_rows(const struct ggml_hexagon_session *
|
|
|
2118
2909
|
return true;
|
|
2119
2910
|
}
|
|
2120
2911
|
|
|
2912
|
+
static bool ggml_hexagon_supported_argsort(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
|
|
2913
|
+
const struct ggml_tensor * src0 = op->src[0]; // values
|
|
2914
|
+
const struct ggml_tensor * dst = op; // indices
|
|
2915
|
+
|
|
2916
|
+
if (src0->type != GGML_TYPE_F32) {
|
|
2917
|
+
return false;
|
|
2918
|
+
}
|
|
2919
|
+
|
|
2920
|
+
if (dst->type != GGML_TYPE_I32) {
|
|
2921
|
+
return false;
|
|
2922
|
+
}
|
|
2923
|
+
|
|
2924
|
+
if (src0->ne[0] > (16*1024)) {
|
|
2925
|
+
// reject tensors with huge rows for now
|
|
2926
|
+
return false;
|
|
2927
|
+
}
|
|
2928
|
+
|
|
2929
|
+
return true;
|
|
2930
|
+
}
|
|
2931
|
+
|
|
2121
2932
|
static bool ggml_hexagon_supported_rope(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
|
|
2122
2933
|
const int32_t * op_params = &op->op_params[0];
|
|
2123
2934
|
|
|
2124
2935
|
int mode = op_params[2];
|
|
2125
2936
|
|
|
2126
|
-
if (
|
|
2937
|
+
if (mode == GGML_ROPE_TYPE_VISION) {
|
|
2127
2938
|
return false;
|
|
2128
2939
|
}
|
|
2129
2940
|
if (mode & 1) {
|
|
@@ -2135,17 +2946,17 @@ static bool ggml_hexagon_supported_rope(const struct ggml_hexagon_session * sess
|
|
|
2135
2946
|
const struct ggml_tensor * src2 = op->src[2];
|
|
2136
2947
|
const struct ggml_tensor * dst = op;
|
|
2137
2948
|
|
|
2138
|
-
if (
|
|
2949
|
+
if (src0->type != GGML_TYPE_F32) {
|
|
2139
2950
|
return false; // FIXME: add support for GGML_TYPE_F16 for src0
|
|
2140
2951
|
}
|
|
2141
|
-
if (
|
|
2952
|
+
if (dst->type != GGML_TYPE_F32) {
|
|
2142
2953
|
return false;
|
|
2143
2954
|
}
|
|
2144
|
-
if (
|
|
2955
|
+
if (src1->type != GGML_TYPE_I32) {
|
|
2145
2956
|
return false;
|
|
2146
2957
|
}
|
|
2147
2958
|
if (src2) {
|
|
2148
|
-
if (
|
|
2959
|
+
if (src2->type != GGML_TYPE_F32) {
|
|
2149
2960
|
return false;
|
|
2150
2961
|
}
|
|
2151
2962
|
int n_dims = op_params[1];
|
|
@@ -2168,277 +2979,147 @@ static bool ggml_hexagon_supported_rope(const struct ggml_hexagon_session * sess
|
|
|
2168
2979
|
return true;
|
|
2169
2980
|
}
|
|
2170
2981
|
|
|
2171
|
-
|
|
2172
|
-
|
|
2173
|
-
|
|
2174
|
-
|
|
2175
|
-
};
|
|
2176
|
-
|
|
2177
|
-
static void dspqbuf_dump(dspqueue_buffer * d, const struct ggml_tensor * t, dspqbuf_type type) {
|
|
2178
|
-
if (opt_verbose < 2) return;
|
|
2179
|
-
|
|
2180
|
-
auto buf = static_cast<ggml_backend_hexagon_buffer_context *>(t->buffer->context);
|
|
2181
|
-
auto sess = buf->sess;
|
|
2182
|
-
|
|
2183
|
-
GGML_LOG_DEBUG("ggml-hex: %s dspqbuf : %s base-addr %p base-size %zu data %p offset %u size %u\n", sess->name.c_str(),
|
|
2184
|
-
t->name, (void *) buf->base, buf->size, (void *) d->ptr, (unsigned int) d->offset,
|
|
2185
|
-
(unsigned int) d->size);
|
|
2186
|
-
}
|
|
2187
|
-
|
|
2188
|
-
// Init hexagon tensor from GGML tensor and Hexagon buffer
|
|
2189
|
-
static void htp_req_tensor_init(htp_tensor * h, const ggml_tensor * t) {
|
|
2190
|
-
h->data = 0; // updated by the receiver
|
|
2191
|
-
h->type = t->type;
|
|
2192
|
-
h->ne[0] = t->ne[0];
|
|
2193
|
-
h->ne[1] = t->ne[1];
|
|
2194
|
-
h->ne[2] = t->ne[2];
|
|
2195
|
-
h->ne[3] = t->ne[3];
|
|
2196
|
-
h->nb[0] = t->nb[0];
|
|
2197
|
-
h->nb[1] = t->nb[1];
|
|
2198
|
-
h->nb[2] = t->nb[2];
|
|
2199
|
-
h->nb[3] = t->nb[3];
|
|
2200
|
-
}
|
|
2982
|
+
static bool ggml_hexagon_supported_ssm_conv(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
|
|
2983
|
+
const struct ggml_tensor * src0 = op->src[0];
|
|
2984
|
+
const struct ggml_tensor * src1 = op->src[1];
|
|
2985
|
+
const struct ggml_tensor * dst = op;
|
|
2201
2986
|
|
|
2202
|
-
|
|
2203
|
-
if (
|
|
2204
|
-
return
|
|
2987
|
+
// Only support FP32 for now
|
|
2988
|
+
if (src0->type != GGML_TYPE_F32 || src1->type != GGML_TYPE_F32 || dst->type != GGML_TYPE_F32) {
|
|
2989
|
+
return false;
|
|
2205
2990
|
}
|
|
2206
2991
|
|
|
2207
|
-
|
|
2992
|
+
// Check IO tensor shapes and dims
|
|
2993
|
+
if (src0->ne[3] != 1 || src1->ne[2] != 1 || src1->ne[3] != 1 || dst->ne[3] != 1) {
|
|
2994
|
+
return false; // src0 should be effectively 3D
|
|
2995
|
+
}
|
|
2208
2996
|
|
|
2209
|
-
|
|
2210
|
-
|
|
2211
|
-
|
|
2212
|
-
|
|
2213
|
-
d->size = ggml_nbytes(t);
|
|
2997
|
+
const int d_conv = src1->ne[0];
|
|
2998
|
+
const int d_inner = src0->ne[1];
|
|
2999
|
+
const int n_t = dst->ne[1];
|
|
3000
|
+
const int n_s = dst->ne[2];
|
|
2214
3001
|
|
|
2215
|
-
if (
|
|
2216
|
-
|
|
2217
|
-
d->size = 64;
|
|
3002
|
+
if (src0->ne[0] != d_conv - 1 + n_t || src0->ne[1] != d_inner || src0->ne[2] != n_s) {
|
|
3003
|
+
return false;
|
|
2218
3004
|
}
|
|
2219
|
-
|
|
2220
|
-
|
|
2221
|
-
case DSPQBUF_TYPE_DSP_WRITE_CPU_READ:
|
|
2222
|
-
// Flush CPU
|
|
2223
|
-
d->flags = DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER;
|
|
2224
|
-
break;
|
|
2225
|
-
case DSPQBUF_TYPE_CPU_WRITE_DSP_READ:
|
|
2226
|
-
// Flush CPU, Invalidate DSP
|
|
2227
|
-
d->flags = DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT;
|
|
2228
|
-
break;
|
|
2229
|
-
default:
|
|
2230
|
-
// Constant buffer, no cache maintenance
|
|
2231
|
-
d->flags = 0;
|
|
2232
|
-
break;
|
|
3005
|
+
if (src1->ne[0] != d_conv || src1->ne[1] != d_inner) {
|
|
3006
|
+
return false;
|
|
2233
3007
|
}
|
|
2234
|
-
|
|
2235
|
-
|
|
2236
|
-
|
|
2237
|
-
dspqbuf_dump(d, t, type);
|
|
2238
|
-
|
|
2239
|
-
return 1;
|
|
2240
|
-
}
|
|
2241
|
-
|
|
2242
|
-
typedef size_t (*htp_req_init_func_t)(htp_general_req * req, dspqueue_buffer * bufs, const ggml_tensor * op);
|
|
2243
|
-
|
|
2244
|
-
template <htp_req_init_func_t _init_req_func>
|
|
2245
|
-
static inline void ggml_hexagon_dispatch_op(ggml_hexagon_session *sess, const struct ggml_tensor * op, uint32_t flags) {
|
|
2246
|
-
uint64_t t = ggml_time_us();
|
|
2247
|
-
|
|
2248
|
-
// Construct HTP request
|
|
2249
|
-
htp_general_req req;
|
|
2250
|
-
memset(&req, 0, sizeof(req));
|
|
2251
|
-
|
|
2252
|
-
req.flags = flags;
|
|
2253
|
-
if (!(opt_opmask & HTP_OPMASK_QUANTIZE)) {
|
|
2254
|
-
req.flags |= HTP_OPFLAGS_SKIP_QUANTIZE;
|
|
3008
|
+
if (dst->ne[0] != d_inner || dst->ne[1] != n_t || dst->ne[2] != n_s) {
|
|
3009
|
+
return false;
|
|
2255
3010
|
}
|
|
2256
|
-
if (
|
|
2257
|
-
|
|
3011
|
+
if (src0->nb[0] != sizeof(float) || src1->nb[0] != sizeof(float) || dst->nb[0] != sizeof(float)) {
|
|
3012
|
+
return false;
|
|
2258
3013
|
}
|
|
2259
|
-
|
|
2260
|
-
|
|
2261
|
-
|
|
2262
|
-
if ((opt_opmask & HTP_OPMASK_QUEUE)) {
|
|
2263
|
-
dspqueue_buffer bufs[HTP_MAX_PACKET_BUFFERS];
|
|
2264
|
-
size_t n_bufs = _init_req_func(&req, bufs, op);
|
|
2265
|
-
sess->enqueue(req, bufs, n_bufs, opt_opsync);
|
|
3014
|
+
if (src0->nb[1] != src0->ne[0] * sizeof(float) || src1->nb[1] != src1->ne[0] * sizeof(float)) {
|
|
3015
|
+
return false;
|
|
2266
3016
|
}
|
|
2267
3017
|
|
|
2268
|
-
|
|
2269
|
-
|
|
2270
|
-
ggml_hexagon_dump_op_prof(sess->name, op, sess->prof_usecs, sess->prof_cycles, sess->prof_pkts, t);
|
|
3018
|
+
return true;
|
|
2271
3019
|
}
|
|
2272
3020
|
|
|
2273
|
-
|
|
2274
|
-
|
|
2275
|
-
|
|
2276
|
-
case GGML_OP_MUL_MAT:
|
|
2277
|
-
req->op = HTP_OP_MUL_MAT;
|
|
2278
|
-
break;
|
|
2279
|
-
case GGML_OP_MUL:
|
|
2280
|
-
req->op = HTP_OP_MUL;
|
|
2281
|
-
break;
|
|
2282
|
-
case GGML_OP_ADD:
|
|
2283
|
-
req->op = HTP_OP_ADD;
|
|
2284
|
-
break;
|
|
2285
|
-
case GGML_OP_SUB:
|
|
2286
|
-
req->op = HTP_OP_SUB;
|
|
2287
|
-
break;
|
|
2288
|
-
default:
|
|
2289
|
-
GGML_ABORT("ggml-hex: binary : unsupported op: %d\n", t->op);
|
|
2290
|
-
break;
|
|
2291
|
-
}
|
|
2292
|
-
|
|
2293
|
-
// src0: Weights (mulmat) or First Operand (binary op).
|
|
2294
|
-
// If constant (e.g. weights), no cache management is needed.
|
|
2295
|
-
// src1: Input Activations (mulmat) or Second Operand (binary op).
|
|
3021
|
+
static bool ggml_hexagon_supported_pad(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
|
|
3022
|
+
const struct ggml_tensor * src0 = op->src[0];
|
|
3023
|
+
const struct ggml_tensor * dst = op;
|
|
2296
3024
|
|
|
2297
|
-
|
|
2298
|
-
|
|
2299
|
-
|
|
2300
|
-
n_bufs += htp_req_buff_init(&req->dst, &bufs[n_bufs], t, DSPQBUF_TYPE_DSP_WRITE_CPU_READ);
|
|
3025
|
+
if (src0->type != GGML_TYPE_F32 || dst->type != GGML_TYPE_F32) {
|
|
3026
|
+
return false;
|
|
3027
|
+
}
|
|
2301
3028
|
|
|
2302
|
-
|
|
3029
|
+
GGML_UNUSED(sess);
|
|
3030
|
+
return true;
|
|
2303
3031
|
}
|
|
2304
3032
|
|
|
2305
|
-
static
|
|
2306
|
-
|
|
2307
|
-
|
|
2308
|
-
size_t n_bufs = 0;
|
|
2309
|
-
n_bufs += htp_req_buff_init(&req->src0, &bufs[n_bufs], t->src[0], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
|
|
2310
|
-
n_bufs += htp_req_buff_init(&req->src1, &bufs[n_bufs], t->src[1], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
|
|
2311
|
-
n_bufs += htp_req_buff_init(&req->dst, &bufs[n_bufs], t, DSPQBUF_TYPE_DSP_WRITE_CPU_READ);
|
|
2312
|
-
|
|
2313
|
-
return n_bufs;
|
|
2314
|
-
}
|
|
3033
|
+
static bool ggml_hexagon_supported_cumsum(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
|
|
3034
|
+
const struct ggml_tensor * src0 = op->src[0];
|
|
3035
|
+
const struct ggml_tensor * dst = op;
|
|
2315
3036
|
|
|
2316
|
-
|
|
2317
|
-
|
|
2318
|
-
switch (t->op) {
|
|
2319
|
-
case GGML_OP_MUL_MAT_ID:
|
|
2320
|
-
req->op = HTP_OP_MUL_MAT_ID;
|
|
2321
|
-
break;
|
|
2322
|
-
case GGML_OP_ADD_ID:
|
|
2323
|
-
req->op = HTP_OP_ADD_ID;
|
|
2324
|
-
break;
|
|
2325
|
-
default:
|
|
2326
|
-
GGML_ABORT("ggml-hex: unsupported op: %d\n", t->op);
|
|
3037
|
+
if (src0->type != GGML_TYPE_F32 || dst->type != GGML_TYPE_F32) {
|
|
3038
|
+
return false;
|
|
2327
3039
|
}
|
|
2328
3040
|
|
|
2329
|
-
|
|
2330
|
-
|
|
2331
|
-
|
|
2332
|
-
// src2: Expert IDs (mulmat) or Activated Experts (other op).
|
|
2333
|
-
|
|
2334
|
-
size_t n_bufs = 0;
|
|
2335
|
-
n_bufs += htp_req_buff_init(&req->src0, &bufs[n_bufs], t->src[0], _is_src0_constant ? DSPQBUF_TYPE_CONSTANT : DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
|
|
2336
|
-
n_bufs += htp_req_buff_init(&req->src1, &bufs[n_bufs], t->src[1], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
|
|
2337
|
-
n_bufs += htp_req_buff_init(&req->src2, &bufs[n_bufs], t->src[2], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
|
|
2338
|
-
n_bufs += htp_req_buff_init(&req->dst, &bufs[n_bufs], t, DSPQBUF_TYPE_DSP_WRITE_CPU_READ);
|
|
3041
|
+
if (!ggml_is_contiguous(src0) || !ggml_is_contiguous(dst)) {
|
|
3042
|
+
return false;
|
|
3043
|
+
}
|
|
2339
3044
|
|
|
2340
|
-
|
|
3045
|
+
GGML_UNUSED(sess);
|
|
3046
|
+
return true;
|
|
2341
3047
|
}
|
|
2342
3048
|
|
|
2343
|
-
static
|
|
2344
|
-
|
|
2345
|
-
|
|
2346
|
-
size_t n_bufs = 0;
|
|
2347
|
-
n_bufs += htp_req_buff_init(&req->src0, &bufs[n_bufs], t->src[0], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
|
|
2348
|
-
n_bufs += htp_req_buff_init(&req->src1, &bufs[n_bufs], t->src[1], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
|
|
2349
|
-
n_bufs += htp_req_buff_init(&req->dst, &bufs[n_bufs], t, DSPQBUF_TYPE_DSP_WRITE_CPU_READ);
|
|
3049
|
+
static bool ggml_hexagon_supported_diag(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
|
|
3050
|
+
const struct ggml_tensor * src0 = op->src[0];
|
|
3051
|
+
const struct ggml_tensor * dst = op;
|
|
2350
3052
|
|
|
2351
|
-
|
|
2352
|
-
|
|
3053
|
+
// diag only supports F32 currently
|
|
3054
|
+
if (src0->type != GGML_TYPE_F32 || dst->type != GGML_TYPE_F32) {
|
|
3055
|
+
return false;
|
|
3056
|
+
}
|
|
2353
3057
|
|
|
2354
|
-
|
|
2355
|
-
|
|
3058
|
+
// Input must have ne[1] == 1 (vector input)
|
|
3059
|
+
if (src0->ne[1] != 1) {
|
|
3060
|
+
return false;
|
|
3061
|
+
}
|
|
2356
3062
|
|
|
2357
|
-
|
|
3063
|
+
// Output must be square in first two dimensions
|
|
3064
|
+
if (dst->ne[0] != dst->ne[1] || dst->ne[0] != src0->ne[0]) {
|
|
3065
|
+
return false;
|
|
3066
|
+
}
|
|
2358
3067
|
|
|
2359
|
-
|
|
2360
|
-
|
|
2361
|
-
|
|
2362
|
-
supported = true;
|
|
2363
|
-
break;
|
|
3068
|
+
GGML_UNUSED(sess);
|
|
3069
|
+
return true;
|
|
3070
|
+
}
|
|
2364
3071
|
|
|
2365
|
-
|
|
2366
|
-
|
|
2367
|
-
|
|
2368
|
-
|
|
3072
|
+
static bool ggml_hexagon_supported_solve_tri(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
|
|
3073
|
+
const struct ggml_tensor * src0 = op->src[0]; // A
|
|
3074
|
+
const struct ggml_tensor * src1 = op->src[1]; // B
|
|
3075
|
+
const struct ggml_tensor * dst = op; // X
|
|
2369
3076
|
|
|
2370
|
-
|
|
2371
|
-
|
|
2372
|
-
|
|
2373
|
-
supported = true;
|
|
2374
|
-
} else if (ggml_get_unary_op(t) == GGML_UNARY_OP_GELU) {
|
|
2375
|
-
req->op = HTP_OP_UNARY_GELU;
|
|
2376
|
-
supported = true;
|
|
2377
|
-
}
|
|
2378
|
-
break;
|
|
3077
|
+
if (!src0 || !src1) {
|
|
3078
|
+
return false;
|
|
3079
|
+
}
|
|
2379
3080
|
|
|
2380
|
-
|
|
2381
|
-
|
|
2382
|
-
|
|
2383
|
-
supported = true;
|
|
2384
|
-
} else if (ggml_get_glu_op(t) == GGML_GLU_OP_SWIGLU_OAI) {
|
|
2385
|
-
req->op = HTP_OP_GLU_SWIGLU_OAI;
|
|
2386
|
-
supported = true;
|
|
2387
|
-
}
|
|
2388
|
-
break;
|
|
3081
|
+
if (src0->type != GGML_TYPE_F32 || src1->type != GGML_TYPE_F32 || dst->type != GGML_TYPE_F32) {
|
|
3082
|
+
return false;
|
|
3083
|
+
}
|
|
2389
3084
|
|
|
2390
|
-
|
|
2391
|
-
|
|
2392
|
-
|
|
2393
|
-
break;
|
|
3085
|
+
if (src0->ne[0] != src0->ne[1]) {
|
|
3086
|
+
return false;
|
|
3087
|
+
}
|
|
2394
3088
|
|
|
2395
|
-
|
|
2396
|
-
|
|
3089
|
+
if (src0->ne[1] != src1->ne[1]) {
|
|
3090
|
+
return false;
|
|
2397
3091
|
}
|
|
2398
3092
|
|
|
2399
|
-
if (
|
|
2400
|
-
|
|
3093
|
+
if (src0->ne[2] != src1->ne[2] || src0->ne[3] != src1->ne[3]) {
|
|
3094
|
+
return false;
|
|
2401
3095
|
}
|
|
2402
3096
|
|
|
2403
|
-
|
|
2404
|
-
|
|
2405
|
-
|
|
2406
|
-
n_bufs += htp_req_buff_init(&req->dst, &bufs[n_bufs], t, DSPQBUF_TYPE_DSP_WRITE_CPU_READ);
|
|
3097
|
+
if (dst->ne[0] != src1->ne[0] || dst->ne[1] != src1->ne[1] || dst->ne[2] != src1->ne[2] || dst->ne[3] != src1->ne[3]) {
|
|
3098
|
+
return false;
|
|
3099
|
+
}
|
|
2407
3100
|
|
|
2408
|
-
|
|
3101
|
+
GGML_UNUSED(sess);
|
|
3102
|
+
return true;
|
|
2409
3103
|
}
|
|
2410
3104
|
|
|
2411
|
-
static
|
|
2412
|
-
memcpy(&req->op_params, &t->op_params, sizeof(t->op_params));
|
|
2413
|
-
req->op = HTP_OP_ROPE;
|
|
3105
|
+
static bool ggml_hexagon_supported_tri(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
|
|
2414
3106
|
|
|
2415
|
-
|
|
2416
|
-
|
|
2417
|
-
n_bufs += htp_req_buff_init(&req->src1, &bufs[n_bufs], t->src[1], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
|
|
2418
|
-
n_bufs += htp_req_buff_init(&req->src2, &bufs[n_bufs], t->src[2], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
|
|
2419
|
-
n_bufs += htp_req_buff_init(&req->dst, &bufs[n_bufs], t, DSPQBUF_TYPE_DSP_WRITE_CPU_READ);
|
|
2420
|
-
|
|
2421
|
-
return n_bufs;
|
|
2422
|
-
}
|
|
3107
|
+
const struct ggml_tensor * src0 = op->src[0];
|
|
3108
|
+
const struct ggml_tensor * dst = op;
|
|
2423
3109
|
|
|
2424
|
-
|
|
2425
|
-
|
|
2426
|
-
|
|
3110
|
+
if (src0->type != GGML_TYPE_F32) { return false; }
|
|
3111
|
+
if (dst->type != GGML_TYPE_F32) { return false; }
|
|
3112
|
+
if (!ggml_are_same_shape(src0, dst)) { return false; }
|
|
3113
|
+
if (!ggml_is_contiguous(src0) || !ggml_is_contiguous(dst)) { return false; }
|
|
2427
3114
|
|
|
2428
|
-
|
|
2429
|
-
n_bufs += htp_req_buff_init(&req->src0, &bufs[n_bufs], t->src[0], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
|
|
2430
|
-
n_bufs += htp_req_buff_init(&req->src1, &bufs[n_bufs], t->src[1], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
|
|
2431
|
-
n_bufs += htp_req_buff_init(&req->src2, &bufs[n_bufs], t->src[2], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
|
|
2432
|
-
n_bufs += htp_req_buff_init(&req->src3, &bufs[n_bufs], t->src[3], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
|
|
2433
|
-
n_bufs += htp_req_buff_init(&req->src4, &bufs[n_bufs], t->src[4], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
|
|
2434
|
-
n_bufs += htp_req_buff_init(&req->dst, &bufs[n_bufs], t, DSPQBUF_TYPE_DSP_WRITE_CPU_READ);
|
|
3115
|
+
return true;
|
|
2435
3116
|
|
|
2436
|
-
|
|
3117
|
+
GGML_UNUSED(sess);
|
|
2437
3118
|
}
|
|
2438
3119
|
|
|
2439
3120
|
static const char * ggml_backend_hexagon_name(ggml_backend_t backend) {
|
|
2440
3121
|
auto sess = static_cast<ggml_hexagon_session *>(backend->context);
|
|
2441
|
-
return sess->
|
|
3122
|
+
return sess->c_name();
|
|
2442
3123
|
}
|
|
2443
3124
|
|
|
2444
3125
|
static void ggml_backend_hexagon_free(ggml_backend_t backend) {
|
|
@@ -2447,118 +3128,118 @@ static void ggml_backend_hexagon_free(ggml_backend_t backend) {
|
|
|
2447
3128
|
delete backend;
|
|
2448
3129
|
}
|
|
2449
3130
|
|
|
2450
|
-
static
|
|
2451
|
-
|
|
2452
|
-
|
|
3131
|
+
static htp_op_code op_remap_to_htp(const ggml_tensor * t) {
|
|
3132
|
+
switch (t->op) {
|
|
3133
|
+
case GGML_OP_FLASH_ATTN_EXT: return HTP_OP_FLASH_ATTN_EXT;
|
|
3134
|
+
case GGML_OP_MUL_MAT: return HTP_OP_MUL_MAT;
|
|
3135
|
+
case GGML_OP_MUL_MAT_ID: return HTP_OP_MUL_MAT_ID;
|
|
3136
|
+
case GGML_OP_MUL: return HTP_OP_MUL;
|
|
3137
|
+
case GGML_OP_ADD: return HTP_OP_ADD;
|
|
3138
|
+
case GGML_OP_ADD_ID: return HTP_OP_ADD_ID;
|
|
3139
|
+
case GGML_OP_SUB: return HTP_OP_SUB;
|
|
3140
|
+
case GGML_OP_DIV: return HTP_OP_DIV;
|
|
3141
|
+
case GGML_OP_CPY: return HTP_OP_CPY;
|
|
3142
|
+
case GGML_OP_CONT: return HTP_OP_CPY;
|
|
3143
|
+
case GGML_OP_GET_ROWS: return HTP_OP_GET_ROWS;
|
|
3144
|
+
case GGML_OP_SET_ROWS: return HTP_OP_SET_ROWS;
|
|
3145
|
+
case GGML_OP_SUM_ROWS: return HTP_OP_SUM_ROWS;
|
|
3146
|
+
case GGML_OP_ARGSORT: return HTP_OP_ARGSORT;
|
|
3147
|
+
case GGML_OP_NORM: return HTP_OP_NORM;
|
|
3148
|
+
case GGML_OP_L2_NORM: return HTP_OP_L2_NORM;
|
|
3149
|
+
case GGML_OP_RMS_NORM: return HTP_OP_RMS_NORM;
|
|
3150
|
+
case GGML_OP_CONCAT: return HTP_OP_CONCAT;
|
|
3151
|
+
case GGML_OP_SCALE: return HTP_OP_SCALE;
|
|
3152
|
+
case GGML_OP_SQR: return HTP_OP_SQR;
|
|
3153
|
+
case GGML_OP_SQRT: return HTP_OP_SQRT;
|
|
3154
|
+
case GGML_OP_SOFT_MAX: return HTP_OP_SOFTMAX;
|
|
3155
|
+
case GGML_OP_SSM_CONV: return HTP_OP_SSM_CONV;
|
|
3156
|
+
case GGML_OP_GATED_DELTA_NET: return HTP_OP_GATED_DELTA_NET;
|
|
3157
|
+
case GGML_OP_ROPE: return HTP_OP_ROPE;
|
|
3158
|
+
case GGML_OP_REPEAT: return HTP_OP_REPEAT;
|
|
3159
|
+
case GGML_OP_CUMSUM: return HTP_OP_CUMSUM;
|
|
3160
|
+
case GGML_OP_FILL: return HTP_OP_FILL;
|
|
3161
|
+
case GGML_OP_DIAG: return HTP_OP_DIAG;
|
|
3162
|
+
case GGML_OP_SOLVE_TRI: return HTP_OP_SOLVE_TRI;
|
|
3163
|
+
case GGML_OP_TRI: return HTP_OP_TRI;
|
|
3164
|
+
case GGML_OP_PAD: return HTP_OP_PAD;
|
|
2453
3165
|
|
|
2454
|
-
|
|
2455
|
-
{
|
|
2456
|
-
|
|
2457
|
-
|
|
3166
|
+
case GGML_OP_UNARY:
|
|
3167
|
+
switch (ggml_get_unary_op(t)) {
|
|
3168
|
+
case GGML_UNARY_OP_SILU: return HTP_OP_UNARY_SILU;
|
|
3169
|
+
case GGML_UNARY_OP_GELU: return HTP_OP_UNARY_GELU;
|
|
3170
|
+
case GGML_UNARY_OP_GELU_QUICK: return HTP_OP_UNARY_GELU;
|
|
3171
|
+
case GGML_UNARY_OP_SIGMOID: return HTP_OP_UNARY_SIGMOID;
|
|
3172
|
+
case GGML_UNARY_OP_NEG: return HTP_OP_UNARY_NEG;
|
|
3173
|
+
case GGML_UNARY_OP_EXP: return HTP_OP_UNARY_EXP;
|
|
3174
|
+
case GGML_UNARY_OP_SOFTPLUS: return HTP_OP_UNARY_SOFTPLUS;
|
|
3175
|
+
case GGML_UNARY_OP_TANH: return HTP_OP_UNARY_TANH;
|
|
3176
|
+
default:
|
|
3177
|
+
break;
|
|
3178
|
+
}
|
|
3179
|
+
break;
|
|
2458
3180
|
|
|
2459
|
-
|
|
2460
|
-
|
|
2461
|
-
|
|
2462
|
-
|
|
2463
|
-
|
|
2464
|
-
|
|
2465
|
-
|
|
3181
|
+
case GGML_OP_GLU:
|
|
3182
|
+
switch (ggml_get_glu_op(t)) {
|
|
3183
|
+
case GGML_GLU_OP_SWIGLU: return HTP_OP_GLU_SWIGLU;
|
|
3184
|
+
case GGML_GLU_OP_SWIGLU_OAI: return HTP_OP_GLU_SWIGLU_OAI;
|
|
3185
|
+
case GGML_GLU_OP_GEGLU: return HTP_OP_GLU_GEGLU;
|
|
3186
|
+
default: break;
|
|
3187
|
+
}
|
|
3188
|
+
break;
|
|
3189
|
+
|
|
3190
|
+
default:
|
|
3191
|
+
GGML_ABORT("\nggml-hex: graph-compute %s is not supported\n", ggml_op_desc(t));
|
|
2466
3192
|
}
|
|
3193
|
+
return HTP_OP_INVALID;
|
|
3194
|
+
}
|
|
2467
3195
|
|
|
2468
|
-
|
|
3196
|
+
static inline bool op_is_compute(ggml_tensor *node)
|
|
3197
|
+
{
|
|
3198
|
+
return !ggml_op_is_empty(node->op) && !ggml_is_empty(node) && (node->flags & GGML_TENSOR_FLAG_COMPUTE);
|
|
2469
3199
|
}
|
|
2470
3200
|
|
|
2471
3201
|
static ggml_status ggml_backend_hexagon_graph_compute(ggml_backend_t backend, ggml_cgraph * graph) {
|
|
2472
3202
|
auto sess = static_cast<ggml_hexagon_session *>(backend->context);
|
|
2473
3203
|
|
|
2474
|
-
HEX_VERBOSE("ggml-hex: %s graph-compute n_nodes %d\n", sess->
|
|
2475
|
-
|
|
2476
|
-
const int last = last_compute_op(graph);
|
|
3204
|
+
HEX_VERBOSE("ggml-hex: %s graph-compute n_nodes %d\n", sess->c_name(), graph->n_nodes);
|
|
2477
3205
|
|
|
2478
|
-
|
|
3206
|
+
std::vector<htp_opnode> nodes;
|
|
3207
|
+
nodes.reserve(graph->n_nodes);
|
|
2479
3208
|
|
|
3209
|
+
// Fusion
|
|
2480
3210
|
for (int i = 0; i < graph->n_nodes; ++i) {
|
|
2481
|
-
ggml_tensor *
|
|
2482
|
-
|
|
2483
|
-
if (!is_compute_op(node)) {
|
|
3211
|
+
ggml_tensor * n = graph->nodes[i];
|
|
3212
|
+
if (!op_is_compute(n)) {
|
|
2484
3213
|
continue;
|
|
2485
3214
|
}
|
|
2486
3215
|
|
|
2487
|
-
|
|
3216
|
+
ggml_tensor * next_node = (i + 1 < graph->n_nodes) ? graph->nodes[i + 1] : nullptr;
|
|
2488
3217
|
|
|
2489
|
-
|
|
2490
|
-
|
|
2491
|
-
|
|
2492
|
-
|
|
3218
|
+
htp_opnode node = {
|
|
3219
|
+
/*.node =*/ n,
|
|
3220
|
+
/*.fused =*/ {},
|
|
3221
|
+
/*.opcode =*/ HTP_OP_INVALID
|
|
3222
|
+
};
|
|
2493
3223
|
|
|
2494
|
-
|
|
2495
|
-
|
|
2496
|
-
|
|
3224
|
+
if (n->op == GGML_OP_RMS_NORM && next_node) {
|
|
3225
|
+
if (next_node->op == GGML_OP_MUL && op_is_compute(next_node) && ggml_can_fuse(graph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL })) {
|
|
3226
|
+
node.add_fused(next_node);
|
|
3227
|
+
node.opcode = HTP_OP_RMS_NORM_MUL;
|
|
3228
|
+
i++; // skip the fused MUL node
|
|
3229
|
+
}
|
|
2497
3230
|
}
|
|
2498
3231
|
|
|
2499
|
-
|
|
2500
|
-
|
|
2501
|
-
|
|
2502
|
-
ggml_hexagon_dispatch_op<init_binary_req<true>>(sess, node, flags);
|
|
2503
|
-
} else {
|
|
2504
|
-
ggml_hexagon_dispatch_op<init_binary_req<false>>(sess, node, flags);
|
|
2505
|
-
}
|
|
2506
|
-
prev_quant_op = node;
|
|
2507
|
-
break;
|
|
2508
|
-
case GGML_OP_MUL_MAT_ID:
|
|
2509
|
-
if (ggml_is_quantized(node->src[0]->type)) {
|
|
2510
|
-
ggml_hexagon_dispatch_op<init_binary_id_req<true>>(sess, node, flags);
|
|
2511
|
-
} else {
|
|
2512
|
-
ggml_hexagon_dispatch_op<init_binary_id_req<false>>(sess, node, flags);
|
|
2513
|
-
}
|
|
2514
|
-
prev_quant_op = node;
|
|
2515
|
-
break;
|
|
2516
|
-
case GGML_OP_MUL:
|
|
2517
|
-
case GGML_OP_ADD:
|
|
2518
|
-
case GGML_OP_SUB:
|
|
2519
|
-
ggml_hexagon_dispatch_op<init_binary_req<false>>(sess, node, flags);
|
|
2520
|
-
break;
|
|
2521
|
-
case GGML_OP_ADD_ID:
|
|
2522
|
-
ggml_hexagon_dispatch_op<init_binary_id_req<false>>(sess, node, flags);
|
|
2523
|
-
break;
|
|
2524
|
-
case GGML_OP_RMS_NORM:
|
|
2525
|
-
case GGML_OP_SCALE:
|
|
2526
|
-
ggml_hexagon_dispatch_op<init_unary_req>(sess, node, flags);
|
|
2527
|
-
break;
|
|
2528
|
-
case GGML_OP_UNARY:
|
|
2529
|
-
if ((ggml_get_unary_op(node) == GGML_UNARY_OP_SILU) ||
|
|
2530
|
-
(ggml_get_unary_op(node) == GGML_UNARY_OP_GELU)) {
|
|
2531
|
-
ggml_hexagon_dispatch_op<init_unary_req>(sess, node, flags);
|
|
2532
|
-
}
|
|
2533
|
-
break;
|
|
2534
|
-
case GGML_OP_GLU:
|
|
2535
|
-
if ((ggml_get_glu_op(node) == GGML_GLU_OP_SWIGLU) ||
|
|
2536
|
-
(ggml_get_glu_op(node) == GGML_GLU_OP_SWIGLU_OAI)) {
|
|
2537
|
-
ggml_hexagon_dispatch_op<init_unary_req>(sess, node, flags);
|
|
2538
|
-
}
|
|
2539
|
-
break;
|
|
2540
|
-
case GGML_OP_SOFT_MAX:
|
|
2541
|
-
ggml_hexagon_dispatch_op<init_unary_req>(sess, node, flags);
|
|
2542
|
-
break;
|
|
2543
|
-
|
|
2544
|
-
case GGML_OP_ROPE:
|
|
2545
|
-
ggml_hexagon_dispatch_op<init_rope_req>(sess, node, flags);
|
|
2546
|
-
break;
|
|
2547
|
-
|
|
2548
|
-
case GGML_OP_FLASH_ATTN_EXT:
|
|
2549
|
-
ggml_hexagon_dispatch_op<init_flash_attn_ext_req>(sess, node, flags);
|
|
2550
|
-
break;
|
|
2551
|
-
|
|
2552
|
-
case GGML_OP_SET_ROWS:
|
|
2553
|
-
ggml_hexagon_dispatch_op<init_set_rows_req>(sess, node, flags);
|
|
2554
|
-
break;
|
|
3232
|
+
if (node.opcode == HTP_OP_INVALID) {
|
|
3233
|
+
node.opcode = op_remap_to_htp(n);
|
|
3234
|
+
}
|
|
2555
3235
|
|
|
2556
|
-
|
|
2557
|
-
|
|
2558
|
-
break;
|
|
3236
|
+
nodes.push_back(std::move(node));
|
|
3237
|
+
}
|
|
2559
3238
|
|
|
2560
|
-
|
|
2561
|
-
|
|
3239
|
+
// Queue and execute
|
|
3240
|
+
if (opt_opstage & HTP_OPSTAGE_QUEUE) {
|
|
3241
|
+
for (const auto & node : nodes) {
|
|
3242
|
+
sess->enqueue_op(node);
|
|
2562
3243
|
}
|
|
2563
3244
|
}
|
|
2564
3245
|
|
|
@@ -2571,57 +3252,13 @@ static ggml_status ggml_backend_hexagon_graph_compute(ggml_backend_t backend, gg
|
|
|
2571
3252
|
static void ggml_backend_hexagon_synchronize(ggml_backend_t backend) {
|
|
2572
3253
|
auto sess = static_cast<ggml_hexagon_session *>(backend->context);
|
|
2573
3254
|
|
|
2574
|
-
HEX_VERBOSE("ggml-hex: %s synchronize\n", sess->
|
|
3255
|
+
HEX_VERBOSE("ggml-hex: %s synchronize\n", sess->c_name());
|
|
2575
3256
|
|
|
2576
3257
|
// Wait until all pending ops complete
|
|
2577
3258
|
sess->flush();
|
|
2578
3259
|
}
|
|
2579
3260
|
|
|
2580
|
-
|
|
2581
|
-
ggml_tensor * node;
|
|
2582
|
-
|
|
2583
|
-
std::vector<ggml_tensor *> fused;
|
|
2584
|
-
|
|
2585
|
-
ggml_op op() const {
|
|
2586
|
-
return node->op;
|
|
2587
|
-
}
|
|
2588
|
-
|
|
2589
|
-
const ggml_tensor * dst() const {
|
|
2590
|
-
return fused.empty() ? node : fused.back();
|
|
2591
|
-
}
|
|
2592
|
-
|
|
2593
|
-
const ggml_tensor * src0() const {
|
|
2594
|
-
return node->src[0];
|
|
2595
|
-
}
|
|
2596
|
-
|
|
2597
|
-
const ggml_tensor * src1() const {
|
|
2598
|
-
return node->src[1];
|
|
2599
|
-
}
|
|
2600
|
-
|
|
2601
|
-
bool is_empty() const {
|
|
2602
|
-
return ggml_op_is_empty(node->op);
|
|
2603
|
-
}
|
|
2604
|
-
|
|
2605
|
-
void add_fused(ggml_tensor * t) {
|
|
2606
|
-
fused.push_back(t);
|
|
2607
|
-
}
|
|
2608
|
-
|
|
2609
|
-
bool stackable() const {
|
|
2610
|
-
switch (this->op()) {
|
|
2611
|
-
case GGML_OP_MUL_MAT:
|
|
2612
|
-
case GGML_OP_MUL_MAT_ID:
|
|
2613
|
-
return ggml_is_quantized(this->src0()->type);
|
|
2614
|
-
default:
|
|
2615
|
-
return false;
|
|
2616
|
-
}
|
|
2617
|
-
}
|
|
2618
|
-
|
|
2619
|
-
bool same_input(const node_info& n) const {
|
|
2620
|
-
return n.src1() == this->src1();
|
|
2621
|
-
}
|
|
2622
|
-
};
|
|
2623
|
-
|
|
2624
|
-
static std::vector<int> ggml_hexagon_graph_optimize_reorder(const std::vector<node_info> & nodes) {
|
|
3261
|
+
static std::vector<int> ggml_hexagon_graph_optimize_reorder(const std::vector<htp_opnode> & nodes) {
|
|
2625
3262
|
const int n = nodes.size();
|
|
2626
3263
|
|
|
2627
3264
|
std::vector<int> res;
|
|
@@ -2632,7 +3269,7 @@ static std::vector<int> ggml_hexagon_graph_optimize_reorder(const std::vector<no
|
|
|
2632
3269
|
// The main goal here is to stack the MUL_MAT ops with the same src1 input.
|
|
2633
3270
|
// This allows use to reuse dynamically quantized src1 in VTCM.
|
|
2634
3271
|
|
|
2635
|
-
// TODO: the current version might do incorrect
|
|
3272
|
+
// TODO: the current version might do incorrect reordering in cases where quantized src0
|
|
2636
3273
|
// input is an output of another Op.
|
|
2637
3274
|
|
|
2638
3275
|
for (int i0 = 0; i0 < n; i0++) {
|
|
@@ -2649,7 +3286,7 @@ static std::vector<int> ggml_hexagon_graph_optimize_reorder(const std::vector<no
|
|
|
2649
3286
|
}
|
|
2650
3287
|
|
|
2651
3288
|
// that many nodes forward to search for stackable nodes that can reuse VTCM
|
|
2652
|
-
constexpr int N_FORWARD =
|
|
3289
|
+
constexpr int N_FORWARD = 16;
|
|
2653
3290
|
|
|
2654
3291
|
for (int i1 = i0 + 1; i1 < i0 + N_FORWARD && i1 < n; i1++) {
|
|
2655
3292
|
if (used[i1]) {
|
|
@@ -2675,14 +3312,14 @@ static void ggml_backend_hexagon_graph_optimize(ggml_backend_t backend, ggml_cgr
|
|
|
2675
3312
|
|
|
2676
3313
|
enum ggml_op ops[MAX_FUSE];
|
|
2677
3314
|
|
|
2678
|
-
std::vector<
|
|
3315
|
+
std::vector<htp_opnode> nodes;
|
|
2679
3316
|
nodes.reserve(gf->n_nodes);
|
|
2680
3317
|
|
|
2681
3318
|
// fuse nodes:
|
|
2682
3319
|
// we don't want to make reorders that break fusing, so we first pack all fusable tensors
|
|
2683
3320
|
// and perform the reorder over the fused nodes. after the reorder is done, we unfuse
|
|
2684
3321
|
for (int i = 0; i < n; i++) {
|
|
2685
|
-
|
|
3322
|
+
htp_opnode node = {
|
|
2686
3323
|
/*.node =*/gf->nodes[i],
|
|
2687
3324
|
/*.fused =*/{},
|
|
2688
3325
|
};
|
|
@@ -2749,6 +3386,8 @@ static struct ggml_backend_i hexagon_backend_i = {
|
|
|
2749
3386
|
/* .free = */ ggml_backend_hexagon_free,
|
|
2750
3387
|
/* .set_tensor_async = */ NULL,
|
|
2751
3388
|
/* .get_tensor_async = */ NULL,
|
|
3389
|
+
/* .set_tensor_2d_async = */ NULL,
|
|
3390
|
+
/* .get_tensor_2d_async = */ NULL,
|
|
2752
3391
|
/* .cpy_tensor_async = */ NULL,
|
|
2753
3392
|
/* .synchronize = */ ggml_backend_hexagon_synchronize,
|
|
2754
3393
|
/* .graph_plan_create = */ NULL,
|
|
@@ -2788,7 +3427,7 @@ static ggml_backend_t ggml_backend_hexagon_device_init(ggml_backend_dev_t dev, c
|
|
|
2788
3427
|
|
|
2789
3428
|
static const char * ggml_backend_hexagon_device_get_name(ggml_backend_dev_t dev) {
|
|
2790
3429
|
auto sess = static_cast<ggml_hexagon_session *>(dev->context);
|
|
2791
|
-
return sess->
|
|
3430
|
+
return sess->c_name();
|
|
2792
3431
|
|
|
2793
3432
|
GGML_UNUSED(dev);
|
|
2794
3433
|
}
|
|
@@ -2799,8 +3438,7 @@ static const char * ggml_backend_hexagon_device_get_description(ggml_backend_dev
|
|
|
2799
3438
|
}
|
|
2800
3439
|
|
|
2801
3440
|
static void ggml_backend_hexagon_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
|
|
2802
|
-
|
|
2803
|
-
*free = 2ULL * 1024 * 1024 * 1024;
|
|
3441
|
+
*free = 0;
|
|
2804
3442
|
*total = *free;
|
|
2805
3443
|
|
|
2806
3444
|
GGML_UNUSED(dev);
|
|
@@ -2858,9 +3496,98 @@ static bool ggml_hexagon_supported_buffers(ggml_hexagon_session *sess, const str
|
|
|
2858
3496
|
return true;
|
|
2859
3497
|
}
|
|
2860
3498
|
|
|
3499
|
+
static bool ggml_hexagon_supported_cpy(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
|
|
3500
|
+
const struct ggml_tensor * src0 = op->src[0];
|
|
3501
|
+
const struct ggml_tensor * dst = op;
|
|
3502
|
+
|
|
3503
|
+
// for now we can do f32 -> f16 and f16 -> f32 (without reshaping)
|
|
3504
|
+
if (src0->type != GGML_TYPE_F32 && src0->type != GGML_TYPE_F16) return false;
|
|
3505
|
+
if ( dst->type != GGML_TYPE_F32 && dst->type != GGML_TYPE_F16) return false;
|
|
3506
|
+
|
|
3507
|
+
const bool sametype = (src0->type == dst->type);
|
|
3508
|
+
const bool transposed = ggml_is_transposed(src0) || ggml_is_transposed(dst);
|
|
3509
|
+
const bool sameshape = !transposed && ggml_are_same_shape(src0, dst);
|
|
3510
|
+
|
|
3511
|
+
// can handle any shape and any same-type (pretty slow if reshaping is required)
|
|
3512
|
+
if (sametype) return true;
|
|
3513
|
+
|
|
3514
|
+
// cannot handle re-shaping and type conversion at the same time
|
|
3515
|
+
if (!sameshape) return false;
|
|
3516
|
+
|
|
3517
|
+
return true;
|
|
3518
|
+
}
|
|
3519
|
+
|
|
3520
|
+
static bool ggml_hexagon_supported_cont(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
|
|
3521
|
+
GGML_UNUSED(sess);
|
|
3522
|
+
const struct ggml_tensor * src0 = op->src[0];
|
|
3523
|
+
|
|
3524
|
+
// CONT is same-type only, supports f32 and f16
|
|
3525
|
+
if (src0->type != GGML_TYPE_F32 && src0->type != GGML_TYPE_F16) return false;
|
|
3526
|
+
|
|
3527
|
+
return true;
|
|
3528
|
+
}
|
|
3529
|
+
|
|
3530
|
+
static bool ggml_hexagon_supported_repeat(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
|
|
3531
|
+
GGML_UNUSED(sess);
|
|
3532
|
+
const struct ggml_tensor * src0 = op->src[0];
|
|
3533
|
+
const struct ggml_tensor * dst = op;
|
|
3534
|
+
|
|
3535
|
+
// Support f32 and f16
|
|
3536
|
+
if (src0->type != GGML_TYPE_F32 && src0->type != GGML_TYPE_F16) return false;
|
|
3537
|
+
|
|
3538
|
+
// src and dst must be the same type
|
|
3539
|
+
if (src0->type != dst->type) return false;
|
|
3540
|
+
|
|
3541
|
+
// dst dims must be multiples of src dims
|
|
3542
|
+
if (dst->ne[0] % src0->ne[0] != 0) return false;
|
|
3543
|
+
if (dst->ne[1] % src0->ne[1] != 0) return false;
|
|
3544
|
+
if (dst->ne[2] % src0->ne[2] != 0) return false;
|
|
3545
|
+
if (dst->ne[3] % src0->ne[3] != 0) return false;
|
|
3546
|
+
|
|
3547
|
+
// require contiguous tensors (no transposition)
|
|
3548
|
+
if (ggml_is_transposed(src0) || ggml_is_transposed(dst)) return false;
|
|
3549
|
+
|
|
3550
|
+
return true;
|
|
3551
|
+
}
|
|
3552
|
+
|
|
3553
|
+
static bool ggml_hexagon_supported_concat(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
|
|
3554
|
+
int dim = ((const int32_t *) op->op_params)[0];
|
|
3555
|
+
if (dim < 0 || dim >= GGML_MAX_DIMS) {
|
|
3556
|
+
return false;
|
|
3557
|
+
}
|
|
3558
|
+
|
|
3559
|
+
for (int i = 0; i < GGML_MAX_SRC; ++i) {
|
|
3560
|
+
const struct ggml_tensor * src = op->src[i];
|
|
3561
|
+
if (!src) {
|
|
3562
|
+
continue;
|
|
3563
|
+
}
|
|
3564
|
+
if (src->type != GGML_TYPE_F32 && src->type != GGML_TYPE_I32 && src->type != GGML_TYPE_F16) {
|
|
3565
|
+
return false;
|
|
3566
|
+
}
|
|
3567
|
+
}
|
|
3568
|
+
|
|
3569
|
+
return true;
|
|
3570
|
+
}
|
|
3571
|
+
|
|
3572
|
+
static bool ggml_hexagon_supported_fill(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
|
|
3573
|
+
const struct ggml_tensor * dst = op;
|
|
3574
|
+
|
|
3575
|
+
if (dst->type != GGML_TYPE_F32 && dst->type != GGML_TYPE_F16) {
|
|
3576
|
+
return false;
|
|
3577
|
+
}
|
|
3578
|
+
|
|
3579
|
+
GGML_UNUSED(sess);
|
|
3580
|
+
return true;
|
|
3581
|
+
}
|
|
3582
|
+
|
|
2861
3583
|
static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
|
|
2862
3584
|
auto sess = static_cast<ggml_hexagon_session *>(dev->context);
|
|
2863
3585
|
|
|
3586
|
+
// reject ops that match the filter
|
|
3587
|
+
if (opt_opfilter && std::regex_match(ggml_op_desc(op), *opt_opfilter)) {
|
|
3588
|
+
return false;
|
|
3589
|
+
}
|
|
3590
|
+
|
|
2864
3591
|
// all srcs & dsts must be mapped to the same session
|
|
2865
3592
|
if (!ggml_hexagon_supported_buffers(sess, op)) {
|
|
2866
3593
|
ggml_hexagon_dump_op_supp(sess->name, op, false);
|
|
@@ -2877,6 +3604,13 @@ static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, cons
|
|
|
2877
3604
|
supp = true;
|
|
2878
3605
|
break;
|
|
2879
3606
|
|
|
3607
|
+
case GGML_OP_MUL:
|
|
3608
|
+
case GGML_OP_ADD:
|
|
3609
|
+
case GGML_OP_SUB:
|
|
3610
|
+
case GGML_OP_DIV:
|
|
3611
|
+
supp = ggml_hexagon_supported_binary(sess, op);
|
|
3612
|
+
break;
|
|
3613
|
+
|
|
2880
3614
|
case GGML_OP_MUL_MAT:
|
|
2881
3615
|
supp = ggml_hexagon_supported_mul_mat(sess, op);
|
|
2882
3616
|
break;
|
|
@@ -2885,41 +3619,61 @@ static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, cons
|
|
|
2885
3619
|
supp = ggml_hexagon_supported_mul_mat_id(sess, op);
|
|
2886
3620
|
break;
|
|
2887
3621
|
|
|
2888
|
-
case GGML_OP_MUL:
|
|
2889
|
-
case GGML_OP_ADD:
|
|
2890
|
-
case GGML_OP_SUB:
|
|
2891
|
-
supp = ggml_hexagon_supported_binary(sess, op);
|
|
2892
|
-
break;
|
|
2893
|
-
|
|
2894
3622
|
case GGML_OP_ADD_ID:
|
|
2895
3623
|
supp = ggml_hexagon_supported_add_id(sess, op);
|
|
2896
3624
|
break;
|
|
2897
3625
|
|
|
3626
|
+
case GGML_OP_NORM:
|
|
3627
|
+
case GGML_OP_L2_NORM:
|
|
2898
3628
|
case GGML_OP_RMS_NORM:
|
|
2899
3629
|
case GGML_OP_SCALE:
|
|
2900
3630
|
supp = ggml_hexagon_supported_unary(sess, op);
|
|
2901
3631
|
break;
|
|
2902
3632
|
|
|
3633
|
+
case GGML_OP_SQR:
|
|
3634
|
+
case GGML_OP_SQRT:
|
|
3635
|
+
supp = ggml_hexagon_supported_unary(sess, op);
|
|
3636
|
+
break;
|
|
3637
|
+
|
|
3638
|
+
case GGML_OP_SUM_ROWS:
|
|
3639
|
+
supp = ggml_hexagon_supported_sum_rows(sess, op);
|
|
3640
|
+
break;
|
|
3641
|
+
|
|
2903
3642
|
case GGML_OP_SOFT_MAX:
|
|
2904
3643
|
supp = ggml_hexagon_supported_softmax(sess, op);
|
|
2905
3644
|
break;
|
|
2906
3645
|
|
|
2907
3646
|
case GGML_OP_UNARY:
|
|
2908
|
-
{
|
|
2909
|
-
|
|
2910
|
-
|
|
3647
|
+
switch (ggml_get_unary_op(op)) {
|
|
3648
|
+
case GGML_UNARY_OP_NEG:
|
|
3649
|
+
case GGML_UNARY_OP_EXP:
|
|
3650
|
+
case GGML_UNARY_OP_SIGMOID:
|
|
3651
|
+
case GGML_UNARY_OP_SOFTPLUS:
|
|
3652
|
+
case GGML_UNARY_OP_TANH:
|
|
3653
|
+
supp = ggml_hexagon_supported_unary(sess, op);
|
|
3654
|
+
break;
|
|
3655
|
+
case GGML_UNARY_OP_SILU:
|
|
3656
|
+
case GGML_UNARY_OP_GELU:
|
|
3657
|
+
case GGML_UNARY_OP_GELU_QUICK:
|
|
2911
3658
|
supp = ggml_hexagon_supported_activations(sess, op);
|
|
2912
|
-
|
|
2913
|
-
|
|
3659
|
+
break;
|
|
3660
|
+
default:
|
|
3661
|
+
break;
|
|
2914
3662
|
}
|
|
3663
|
+
break;
|
|
3664
|
+
|
|
2915
3665
|
case GGML_OP_GLU:
|
|
2916
|
-
{
|
|
2917
|
-
|
|
2918
|
-
|
|
3666
|
+
switch (ggml_get_glu_op(op)) {
|
|
3667
|
+
case GGML_GLU_OP_SWIGLU:
|
|
3668
|
+
case GGML_GLU_OP_SWIGLU_OAI:
|
|
3669
|
+
case GGML_GLU_OP_GEGLU:
|
|
2919
3670
|
supp = ggml_hexagon_supported_activations(sess, op);
|
|
2920
|
-
|
|
2921
|
-
|
|
3671
|
+
break;
|
|
3672
|
+
default:
|
|
3673
|
+
break;
|
|
2922
3674
|
}
|
|
3675
|
+
break;
|
|
3676
|
+
|
|
2923
3677
|
case GGML_OP_ROPE:
|
|
2924
3678
|
supp = ggml_hexagon_supported_rope(sess, op);
|
|
2925
3679
|
break;
|
|
@@ -2936,6 +3690,58 @@ static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, cons
|
|
|
2936
3690
|
supp = ggml_hexagon_supported_get_rows(sess, op);
|
|
2937
3691
|
break;
|
|
2938
3692
|
|
|
3693
|
+
case GGML_OP_CPY:
|
|
3694
|
+
supp = ggml_hexagon_supported_cpy(sess, op);
|
|
3695
|
+
break;
|
|
3696
|
+
|
|
3697
|
+
case GGML_OP_CONT:
|
|
3698
|
+
supp = ggml_hexagon_supported_cont(sess, op);
|
|
3699
|
+
break;
|
|
3700
|
+
|
|
3701
|
+
case GGML_OP_REPEAT:
|
|
3702
|
+
supp = ggml_hexagon_supported_repeat(sess, op);
|
|
3703
|
+
break;
|
|
3704
|
+
|
|
3705
|
+
case GGML_OP_ARGSORT:
|
|
3706
|
+
supp = ggml_hexagon_supported_argsort(sess, op);
|
|
3707
|
+
break;
|
|
3708
|
+
|
|
3709
|
+
case GGML_OP_SSM_CONV:
|
|
3710
|
+
supp = ggml_hexagon_supported_ssm_conv(sess, op);
|
|
3711
|
+
break;
|
|
3712
|
+
|
|
3713
|
+
case GGML_OP_GATED_DELTA_NET:
|
|
3714
|
+
supp = ggml_hexagon_supported_gated_delta_net(sess, op);
|
|
3715
|
+
break;
|
|
3716
|
+
|
|
3717
|
+
case GGML_OP_CUMSUM:
|
|
3718
|
+
supp = ggml_hexagon_supported_cumsum(sess, op);
|
|
3719
|
+
break;
|
|
3720
|
+
|
|
3721
|
+
case GGML_OP_CONCAT:
|
|
3722
|
+
supp = ggml_hexagon_supported_concat(sess, op);
|
|
3723
|
+
break;
|
|
3724
|
+
|
|
3725
|
+
case GGML_OP_FILL:
|
|
3726
|
+
supp = ggml_hexagon_supported_fill(sess, op);
|
|
3727
|
+
break;
|
|
3728
|
+
|
|
3729
|
+
case GGML_OP_DIAG:
|
|
3730
|
+
supp = ggml_hexagon_supported_diag(sess, op);
|
|
3731
|
+
break;
|
|
3732
|
+
|
|
3733
|
+
case GGML_OP_SOLVE_TRI:
|
|
3734
|
+
supp = ggml_hexagon_supported_solve_tri(sess, op);
|
|
3735
|
+
break;
|
|
3736
|
+
|
|
3737
|
+
case GGML_OP_TRI:
|
|
3738
|
+
supp = ggml_hexagon_supported_tri(sess, op);
|
|
3739
|
+
break;
|
|
3740
|
+
|
|
3741
|
+
case GGML_OP_PAD:
|
|
3742
|
+
supp = ggml_hexagon_supported_pad(sess, op);
|
|
3743
|
+
break;
|
|
3744
|
+
|
|
2939
3745
|
default:
|
|
2940
3746
|
break;
|
|
2941
3747
|
}
|
|
@@ -3002,19 +3808,6 @@ struct ggml_hexagon_registry {
|
|
|
3002
3808
|
ggml_hexagon_registry::ggml_hexagon_registry(ggml_backend_reg_t reg) {
|
|
3003
3809
|
GGML_LOG_INFO("ggml-hex: Hexagon backend (experimental) : allocating new registry : ndev %zu\n", opt_ndev);
|
|
3004
3810
|
|
|
3005
|
-
if (!opt_arch) {
|
|
3006
|
-
int err = get_hex_arch_ver(CDSP_DOMAIN_ID, &opt_arch);
|
|
3007
|
-
if (err != 0) {
|
|
3008
|
-
GGML_LOG_ERROR("ggml-hex: failed to query HTP version (err %d) defaulting to v73\n", err);
|
|
3009
|
-
opt_arch = 73;
|
|
3010
|
-
}
|
|
3011
|
-
}
|
|
3012
|
-
|
|
3013
|
-
if (opt_arch < 75) {
|
|
3014
|
-
opt_ndev = 1;
|
|
3015
|
-
GGML_LOG_WARN("ggml-hex: forcing ndev to 1 for SoCs archs lower than v75.\n");
|
|
3016
|
-
}
|
|
3017
|
-
|
|
3018
3811
|
GGML_LOG_INFO("ggml-hex: Hexagon Arch version v%d\n", opt_arch);
|
|
3019
3812
|
|
|
3020
3813
|
// Create devices / sessions
|
|
@@ -3061,7 +3854,7 @@ static ggml_backend_dev_t ggml_backend_hexagon_reg_get_device(ggml_backend_reg_t
|
|
|
3061
3854
|
}
|
|
3062
3855
|
|
|
3063
3856
|
static void * ggml_backend_hexagon_get_proc_address(ggml_backend_reg_t reg, const char * name) {
|
|
3064
|
-
if (strcmp(name, "ggml_backend_dev_get_extra_bufts") == 0) {
|
|
3857
|
+
if (strcmp(name, "ggml_backend_dev_get_extra_bufts") == 0 && opt_hostbuf) {
|
|
3065
3858
|
ggml_backend_dev_get_extra_bufts_t fct = ggml_backend_hexagon_device_get_extra_buffers_type;
|
|
3066
3859
|
return (void *) fct;
|
|
3067
3860
|
}
|
|
@@ -3069,56 +3862,117 @@ static void * ggml_backend_hexagon_get_proc_address(ggml_backend_reg_t reg, cons
|
|
|
3069
3862
|
return NULL;
|
|
3070
3863
|
}
|
|
3071
3864
|
|
|
3865
|
+
// Parse a comma-separated list of unsigned integers into a vector.
//
// Each token is converted with strtoul() using base 0, so decimal ("12"),
// hex ("0x1f") and octal ("017") spellings are all accepted, and the result is
// narrowed to T.
//
// str : NUL-terminated list such as "1,0x10,7"; may be null.
//
// Returns the parsed values in input order. Returns an empty vector when str
// is null or empty, or when any token contains no parsable number (for example
// "abc" or the empty token in "1,,2"). The input typically comes from an
// environment variable, so malformed text is reported as "nothing parsed"
// instead of raising an exception during backend initialisation.
template<typename T> std::vector<T> str_to_vec(const char* str) {
    std::vector<T> v;
    if (str == nullptr) {
        return v;
    }

    std::stringstream ss(str);
    std::string       t;

    while (std::getline(ss, t, ',')) {
        char *              end = nullptr;
        const unsigned long val = strtoul(t.c_str(), &end, 0);
        if (end == t.c_str()) {
            // no digits were consumed: treat the whole list as garbage
            return {};
        }
        v.push_back(static_cast<T>(val));
    }

    return v;
}
|
|
3876
|
+
|
|
3877
|
+
// Format a vector as a comma-separated list, e.g. {16, 255} -> "0x10,0xff"
// with BASE = 16.
//
// BASE is passed to std::setbase (8, 10 or 16) and std::showbase is enabled,
// so hex values carry a "0x" prefix and octal values a leading "0".
//
// Returns an empty string for an empty vector. The separator is written before
// every element except the first, so there is no trailing comma to strip and
// no pop_back() on a possibly empty string.
template<typename T, int BASE=10> std::string vec_to_str(const std::vector<T> & v) {
    std::stringstream ss;
    ss << std::setbase(BASE) << std::showbase;

    const char * sep = "";
    for (const auto & i : v) {
        ss << sep << i;
        sep = ",";
    }

    return ss.str();
}
|
|
3884
|
+
|
|
3072
3885
|
static void ggml_hexagon_init(ggml_backend_reg * reg) {
|
|
3073
3886
|
// Basic sanity checks to make sure definitions match
|
|
3074
3887
|
static_assert((unsigned int) HTP_TYPE_Q4_0 == (unsigned int) GGML_TYPE_Q4_0,
|
|
3075
3888
|
"please update hexagon_type to match ggml_type");
|
|
3889
|
+
static_assert((unsigned int) HTP_TYPE_Q4_1 == (unsigned int) GGML_TYPE_Q4_1,
|
|
3890
|
+
"please update hexagon_type to match ggml_type");
|
|
3076
3891
|
static_assert((unsigned int) HTP_TYPE_Q8_0 == (unsigned int) GGML_TYPE_Q8_0,
|
|
3077
3892
|
"please update hexagon_type to match ggml_type");
|
|
3078
3893
|
static_assert((unsigned int) HTP_TYPE_MXFP4 == (unsigned int) GGML_TYPE_MXFP4,
|
|
3079
3894
|
"please update hexagon_type to match ggml_type");
|
|
3895
|
+
static_assert((unsigned int) HTP_TYPE_IQ4_NL == (unsigned int) GGML_TYPE_IQ4_NL,
|
|
3896
|
+
"please update hexagon_type to match ggml_type");
|
|
3897
|
+
|
|
3898
|
+
const char * str_verbose = getenv("GGML_HEXAGON_VERBOSE");
|
|
3899
|
+
const char * str_hostbuf = getenv("GGML_HEXAGON_HOSTBUF");
|
|
3900
|
+
const char * str_opstage = getenv("GGML_HEXAGON_OPSTAGE");
|
|
3901
|
+
const char * str_opbatch = getenv("GGML_HEXAGON_OPBATCH");
|
|
3902
|
+
const char * str_opqueue = getenv("GGML_HEXAGON_OPQUEUE");
|
|
3903
|
+
const char * str_oppoll = getenv("GGML_HEXAGON_OPPOLL");
|
|
3904
|
+
const char * str_opfilter = getenv("GGML_HEXAGON_OPFILTER");
|
|
3905
|
+
const char * str_profile = getenv("GGML_HEXAGON_PROFILE");
|
|
3906
|
+
const char * str_etm = getenv("GGML_HEXAGON_ETM");
|
|
3907
|
+
const char * str_nhvx = getenv("GGML_HEXAGON_NHVX");
|
|
3908
|
+
const char * str_use_hmx = getenv("GGML_HEXAGON_USE_HMX");
|
|
3909
|
+
const char * str_ndev = getenv("GGML_HEXAGON_NDEV");
|
|
3910
|
+
const char * str_arch = getenv("GGML_HEXAGON_ARCH");
|
|
3911
|
+
const char * str_vmem = getenv("GGML_HEXAGON_VMEM");
|
|
3912
|
+
const char * str_mbuf = getenv("GGML_HEXAGON_MBUF");
|
|
3913
|
+
|
|
3914
|
+
// Init Arch first since it affects other defaults
|
|
3915
|
+
if (!str_arch) {
|
|
3916
|
+
int err = get_hex_arch_ver(CDSP_DOMAIN_ID, &opt_arch);
|
|
3917
|
+
if (err != 0) {
|
|
3918
|
+
GGML_LOG_ERROR("ggml-hex: failed to query HTP version (err %d) defaulting to v73\n", err);
|
|
3919
|
+
opt_arch = 73;
|
|
3920
|
+
}
|
|
3921
|
+
} else {
|
|
3922
|
+
if (str_arch[0] == 'v' || str_arch[0] == 'V') {
|
|
3923
|
+
str_arch++;
|
|
3924
|
+
}
|
|
3925
|
+
opt_arch = strtoul(str_arch, NULL, 0);
|
|
3926
|
+
}
|
|
3080
3927
|
|
|
3081
|
-
|
|
3082
|
-
const char * str_hostbuf = getenv("GGML_HEXAGON_HOSTBUF");
|
|
3928
|
+
size_t MiB = 1024 * 1024;
|
|
3083
3929
|
|
|
3084
|
-
|
|
3085
|
-
|
|
3086
|
-
opt_etm = getenv("GGML_HEXAGON_ETM") != nullptr;
|
|
3087
|
-
opt_experimental = getenv("GGML_HEXAGON_EXPERIMENTAL") != nullptr;
|
|
3930
|
+
// Update vmem default
|
|
3931
|
+
opt_vmem = opt_arch >= 75 ? HTP_OP_MAX_VMEM_DEFAULT : 3000 * MiB;
|
|
3088
3932
|
|
|
3089
|
-
|
|
3090
|
-
if (str_opmask != nullptr) {
|
|
3091
|
-
opt_opmask = strtoul(str_opmask, NULL, 0);
|
|
3092
|
-
}
|
|
3093
|
-
opt_opsync = getenv("GGML_HEXAGON_OPSYNC") != nullptr;
|
|
3933
|
+
auto RE_ICASE = std::regex_constants::icase;
|
|
3094
3934
|
|
|
3095
|
-
|
|
3096
|
-
|
|
3097
|
-
|
|
3098
|
-
|
|
3099
|
-
|
|
3100
|
-
|
|
3101
|
-
|
|
3935
|
+
opt_opfilter = str_opfilter ? new std::regex(str_opfilter, RE_ICASE) : NULL;
|
|
3936
|
+
opt_verbose = str_verbose ? atoi(str_verbose) : 0;
|
|
3937
|
+
opt_hostbuf = str_hostbuf ? atoi(str_hostbuf) : opt_hostbuf;
|
|
3938
|
+
opt_opstage = str_opstage ? strtoul(str_opstage, NULL, 0) : opt_opstage;
|
|
3939
|
+
opt_opbatch = str_opbatch ? strtoul(str_opbatch, NULL, 0) : opt_opbatch;
|
|
3940
|
+
opt_opqueue = str_opqueue ? strtoul(str_opqueue, NULL, 0) : opt_opqueue;
|
|
3941
|
+
opt_oppoll = str_oppoll ? strtoul(str_oppoll, NULL, 0) : opt_oppoll;
|
|
3942
|
+
opt_profile = str_profile ? atoi(str_profile) : 0;
|
|
3943
|
+
opt_etm = str_etm ? atoi(str_etm) : 0;
|
|
3944
|
+
opt_nhvx = str_nhvx ? strtoul(str_nhvx, NULL, 0) : opt_nhvx;
|
|
3945
|
+
opt_use_hmx = str_use_hmx ? atoi(str_use_hmx) : opt_use_hmx;
|
|
3946
|
+
opt_ndev = str_ndev ? strtoul(str_ndev, NULL, 0) : opt_ndev;
|
|
3947
|
+
opt_hostbuf = str_hostbuf ? atoi(str_hostbuf) : opt_hostbuf;
|
|
3948
|
+
opt_mbuf = str_mbuf ? strtoul(str_mbuf, NULL, 0) * MiB : opt_mbuf;
|
|
3949
|
+
opt_vmem = str_vmem ? strtoul(str_vmem, NULL, 0) * MiB : opt_vmem;
|
|
3102
3950
|
|
|
3103
|
-
|
|
3104
|
-
|
|
3105
|
-
opt_nhvx = strtoul(str_nhvx, NULL, 0);
|
|
3951
|
+
if (opt_ndev > GGML_HEXAGON_MAX_SESSIONS) {
|
|
3952
|
+
opt_ndev = GGML_HEXAGON_MAX_SESSIONS;
|
|
3106
3953
|
}
|
|
3107
3954
|
|
|
3108
|
-
|
|
3109
|
-
if (
|
|
3110
|
-
|
|
3111
|
-
|
|
3112
|
-
}
|
|
3113
|
-
opt_arch = strtoul(str_arch, NULL, 0);
|
|
3955
|
+
#if defined(__ANDROID__)
|
|
3956
|
+
if (opt_arch < 75) {
|
|
3957
|
+
opt_ndev = 1;
|
|
3958
|
+
GGML_LOG_WARN("ggml-hex: forcing ndev to 1 for SoCs archs lower than v75.\n");
|
|
3114
3959
|
}
|
|
3960
|
+
#endif
|
|
3115
3961
|
|
|
3116
|
-
|
|
3962
|
+
if (str_profile) {
|
|
3963
|
+
opt_pmu_evt = [&]() -> std::vector<uint32_t> {
|
|
3964
|
+
auto v = str_to_vec<uint32_t>(str_profile);
|
|
3965
|
+
switch (v.size()) {
|
|
3966
|
+
case 1: opt_profile = v[0]; return opt_pmu_evt; // mode with default pmu events
|
|
3967
|
+
case 8: opt_profile = 2; return v; // mode with custom pmu events
|
|
3968
|
+
default: opt_profile = 0; return {}; // garbage input
|
|
3969
|
+
}}();
|
|
3970
|
+
if (opt_profile == 1) opt_pmu_evt = {};
|
|
3971
|
+
GGML_LOG_INFO("ggml-hex: Profiling mode %u : pmu-evt [ %s ]\n", opt_profile,
|
|
3972
|
+
vec_to_str<uint32_t, 16>(opt_pmu_evt).c_str());
|
|
3973
|
+
}
|
|
3117
3974
|
|
|
3118
3975
|
reg->context = new ggml_hexagon_registry(reg);
|
|
3119
|
-
|
|
3120
|
-
HEX_VERBOSE("ggml-hex: size-of-general-req %zu size-of-general-rsp %zu\n", sizeof(struct htp_general_req),
|
|
3121
|
-
sizeof(struct htp_general_rsp));
|
|
3122
3976
|
}
|
|
3123
3977
|
|
|
3124
3978
|
static const struct ggml_backend_reg_i ggml_backend_hexagon_reg_i = {
|
|
@@ -3139,6 +3993,11 @@ ggml_backend_reg_t ggml_backend_hexagon_reg(void) {
|
|
|
3139
3993
|
static std::mutex mutex;
|
|
3140
3994
|
std::lock_guard<std::mutex> lock(mutex);
|
|
3141
3995
|
if (!initialized) {
|
|
3996
|
+
auto nErr = htpdrv_init();
|
|
3997
|
+
if (nErr != AEE_SUCCESS) {
|
|
3998
|
+
return NULL;
|
|
3999
|
+
}
|
|
4000
|
+
|
|
3142
4001
|
ggml_hexagon_init(®);
|
|
3143
4002
|
}
|
|
3144
4003
|
|