whispercpp 1.3.5 → 1.3.6
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects the differences between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/LICENSE +1 -1
- data/README.md +99 -2
- data/ext/extconf.rb +1 -0
- data/ext/ruby_whisper.c +20 -4
- data/ext/ruby_whisper.h +30 -2
- data/ext/ruby_whisper_context.c +216 -124
- data/ext/ruby_whisper_context_params.c +163 -0
- data/ext/ruby_whisper_model.c +0 -1
- data/ext/ruby_whisper_params.c +0 -1
- data/ext/ruby_whisper_segment.c +0 -1
- data/ext/ruby_whisper_token.c +29 -9
- data/ext/ruby_whisper_transcribe.cpp +4 -1
- data/ext/ruby_whisper_vad_context.c +48 -1
- data/ext/ruby_whisper_vad_context_detect.cpp +6 -5
- data/ext/ruby_whisper_vad_params.c +0 -1
- data/ext/ruby_whisper_vad_segment.c +0 -1
- data/ext/ruby_whisper_vad_segments.c +0 -1
- data/ext/sources/CMakeLists.txt +1 -1
- data/ext/sources/bindings/javascript/package.json +1 -1
- data/ext/sources/cmake/whisper-config.cmake.in +5 -40
- data/ext/sources/examples/bench/bench.cpp +23 -18
- data/ext/sources/examples/cli/cli.cpp +8 -0
- data/ext/sources/examples/common-ggml.cpp +2 -0
- data/ext/sources/examples/miniaudio.h +4507 -2131
- data/ext/sources/examples/server/server.cpp +18 -4
- data/ext/sources/examples/talk-llama/CMakeLists.txt +3 -2
- data/ext/sources/examples/talk-llama/llama-adapter.cpp +7 -13
- data/ext/sources/examples/talk-llama/llama-adapter.h +4 -3
- data/ext/sources/examples/talk-llama/llama-arch.cpp +335 -17
- data/ext/sources/examples/talk-llama/llama-arch.h +42 -0
- data/ext/sources/examples/talk-llama/llama-batch.cpp +3 -1
- data/ext/sources/examples/talk-llama/llama-chat.cpp +21 -1
- data/ext/sources/examples/talk-llama/llama-chat.h +1 -0
- data/ext/sources/examples/talk-llama/llama-context.cpp +508 -520
- data/ext/sources/examples/talk-llama/llama-context.h +27 -28
- data/ext/sources/examples/talk-llama/llama-cparams.h +5 -0
- data/ext/sources/examples/talk-llama/llama-ext.h +12 -0
- data/ext/sources/examples/talk-llama/llama-grammar.cpp +8 -8
- data/ext/sources/examples/talk-llama/llama-graph.cpp +583 -130
- data/ext/sources/examples/talk-llama/llama-graph.h +131 -10
- data/ext/sources/examples/talk-llama/llama-hparams.cpp +57 -40
- data/ext/sources/examples/talk-llama/llama-hparams.h +79 -10
- data/ext/sources/examples/talk-llama/llama-impl.cpp +4 -4
- data/ext/sources/examples/talk-llama/llama-impl.h +13 -1
- data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.cpp +3 -1
- data/ext/sources/examples/talk-llama/llama-kv-cache.cpp +274 -89
- data/ext/sources/examples/talk-llama/llama-kv-cache.h +2 -3
- data/ext/sources/examples/talk-llama/llama-memory-hybrid-iswa.cpp +275 -0
- data/ext/sources/examples/talk-llama/llama-memory-hybrid-iswa.h +140 -0
- data/ext/sources/examples/talk-llama/llama-memory-recurrent.cpp +11 -13
- data/ext/sources/examples/talk-llama/llama-mmap.cpp +28 -11
- data/ext/sources/examples/talk-llama/llama-model-loader.cpp +527 -119
- data/ext/sources/examples/talk-llama/llama-model-loader.h +35 -5
- data/ext/sources/examples/talk-llama/llama-model-saver.cpp +60 -46
- data/ext/sources/examples/talk-llama/llama-model-saver.h +5 -2
- data/ext/sources/examples/talk-llama/llama-model.cpp +1365 -647
- data/ext/sources/examples/talk-llama/llama-model.h +72 -19
- data/ext/sources/examples/talk-llama/llama-quant.cpp +578 -346
- data/ext/sources/examples/talk-llama/{llama-sampling.cpp → llama-sampler.cpp} +190 -76
- data/ext/sources/examples/talk-llama/{llama-sampling.h → llama-sampler.h} +0 -2
- data/ext/sources/examples/talk-llama/llama-vocab.cpp +118 -48
- data/ext/sources/examples/talk-llama/llama-vocab.h +5 -0
- data/ext/sources/examples/talk-llama/llama.cpp +76 -22
- data/ext/sources/examples/talk-llama/llama.h +63 -30
- data/ext/sources/examples/talk-llama/models/afmoe.cpp +2 -3
- data/ext/sources/examples/talk-llama/models/apertus.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/arcee.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/arctic.cpp +4 -5
- data/ext/sources/examples/talk-llama/models/baichuan.cpp +4 -3
- data/ext/sources/examples/talk-llama/models/bailingmoe.cpp +1 -2
- data/ext/sources/examples/talk-llama/models/bailingmoe2.cpp +3 -5
- data/ext/sources/examples/talk-llama/models/bert.cpp +13 -7
- data/ext/sources/examples/talk-llama/models/bitnet.cpp +9 -24
- data/ext/sources/examples/talk-llama/models/bloom.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/chameleon.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/chatglm.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/codeshell.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/cogvlm.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/cohere2-iswa.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/command-r.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/dbrx.cpp +4 -5
- data/ext/sources/examples/talk-llama/models/deci.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/deepseek.cpp +4 -6
- data/ext/sources/examples/talk-llama/models/deepseek2.cpp +24 -21
- data/ext/sources/examples/talk-llama/models/delta-net-base.cpp +445 -0
- data/ext/sources/examples/talk-llama/models/dots1.cpp +4 -6
- data/ext/sources/examples/talk-llama/models/dream.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/ernie4-5-moe.cpp +4 -6
- data/ext/sources/examples/talk-llama/models/ernie4-5.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/eurobert.cpp +97 -0
- data/ext/sources/examples/talk-llama/models/exaone-moe.cpp +145 -0
- data/ext/sources/examples/talk-llama/models/exaone.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/exaone4.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/falcon-h1.cpp +2 -4
- data/ext/sources/examples/talk-llama/models/falcon.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/gemma-embedding.cpp +1 -1
- data/ext/sources/examples/talk-llama/models/gemma.cpp +1 -1
- data/ext/sources/examples/talk-llama/models/gemma2-iswa.cpp +1 -1
- data/ext/sources/examples/talk-llama/models/gemma3.cpp +1 -1
- data/ext/sources/examples/talk-llama/models/gemma3n-iswa.cpp +7 -7
- data/ext/sources/examples/talk-llama/models/glm4-moe.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/glm4.cpp +14 -7
- data/ext/sources/examples/talk-llama/models/gpt2.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/gptneox.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/granite-hybrid.cpp +4 -5
- data/ext/sources/examples/talk-llama/models/granite.cpp +4 -5
- data/ext/sources/examples/talk-llama/models/grok.cpp +4 -4
- data/ext/sources/examples/talk-llama/models/grovemoe.cpp +5 -7
- data/ext/sources/examples/talk-llama/models/hunyuan-dense.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/hunyuan-moe.cpp +4 -5
- data/ext/sources/examples/talk-llama/models/internlm2.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/jais.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/jais2.cpp +123 -0
- data/ext/sources/examples/talk-llama/models/jamba.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/kimi-linear.cpp +381 -0
- data/ext/sources/examples/talk-llama/models/lfm2.cpp +145 -124
- data/ext/sources/examples/talk-llama/models/llada-moe.cpp +4 -4
- data/ext/sources/examples/talk-llama/models/llada.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/llama-iswa.cpp +4 -4
- data/ext/sources/examples/talk-llama/models/llama.cpp +18 -11
- data/ext/sources/examples/talk-llama/models/maincoder.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/{graph-context-mamba.cpp → mamba-base.cpp} +9 -3
- data/ext/sources/examples/talk-llama/models/mamba.cpp +1 -2
- data/ext/sources/examples/talk-llama/models/mimo2-iswa.cpp +11 -5
- data/ext/sources/examples/talk-llama/models/minicpm3.cpp +14 -13
- data/ext/sources/examples/talk-llama/models/minimax-m2.cpp +4 -5
- data/ext/sources/examples/talk-llama/models/mistral3.cpp +4 -4
- data/ext/sources/examples/talk-llama/models/models.h +181 -46
- data/ext/sources/examples/talk-llama/models/modern-bert.cpp +2 -9
- data/ext/sources/examples/talk-llama/models/mpt.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/nemotron-h.cpp +26 -14
- data/ext/sources/examples/talk-llama/models/nemotron.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/neo-bert.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/olmo.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/olmo2.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/olmoe.cpp +4 -4
- data/ext/sources/examples/talk-llama/models/openai-moe-iswa.cpp +1 -1
- data/ext/sources/examples/talk-llama/models/openelm.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/orion.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/paddleocr.cpp +122 -0
- data/ext/sources/examples/talk-llama/models/pangu-embedded.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/phi2.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/phi3.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/plamo.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/plamo2.cpp +9 -5
- data/ext/sources/examples/talk-llama/models/plamo3.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/plm.cpp +15 -14
- data/ext/sources/examples/talk-llama/models/qwen.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/qwen2.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/qwen2moe.cpp +4 -4
- data/ext/sources/examples/talk-llama/models/qwen2vl.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/qwen3.cpp +12 -9
- data/ext/sources/examples/talk-llama/models/qwen35.cpp +381 -0
- data/ext/sources/examples/talk-llama/models/qwen35moe.cpp +422 -0
- data/ext/sources/examples/talk-llama/models/qwen3moe.cpp +15 -8
- data/ext/sources/examples/talk-llama/models/qwen3next.cpp +84 -432
- data/ext/sources/examples/talk-llama/models/qwen3vl-moe.cpp +9 -18
- data/ext/sources/examples/talk-llama/models/qwen3vl.cpp +8 -17
- data/ext/sources/examples/talk-llama/models/refact.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/rnd1.cpp +4 -4
- data/ext/sources/examples/talk-llama/models/rwkv6-base.cpp +2 -0
- data/ext/sources/examples/talk-llama/models/rwkv7-base.cpp +2 -0
- data/ext/sources/examples/talk-llama/models/seed-oss.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/smallthinker.cpp +4 -4
- data/ext/sources/examples/talk-llama/models/smollm3.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/stablelm.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/starcoder.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/starcoder2.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/step35-iswa.cpp +165 -0
- data/ext/sources/examples/talk-llama/models/t5-dec.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/t5-enc.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/xverse.cpp +3 -3
- data/ext/sources/examples/talk-llama/unicode.cpp +21 -65
- data/ext/sources/ggml/CMakeLists.txt +9 -3
- data/ext/sources/ggml/include/ggml-backend.h +1 -1
- data/ext/sources/ggml/include/ggml-cann.h +1 -1
- data/ext/sources/ggml/include/ggml-cpu.h +5 -0
- data/ext/sources/ggml/include/ggml-openvino.h +37 -0
- data/ext/sources/ggml/include/ggml-opt.h +1 -1
- data/ext/sources/ggml/include/ggml-rpc.h +6 -1
- data/ext/sources/ggml/include/ggml-virtgpu.h +14 -0
- data/ext/sources/ggml/include/ggml.h +56 -9
- data/ext/sources/ggml/src/CMakeLists.txt +3 -0
- data/ext/sources/ggml/src/ggml-alloc.c +4 -9
- data/ext/sources/ggml/src/ggml-backend-dl.cpp +48 -0
- data/ext/sources/ggml/src/ggml-backend-dl.h +45 -0
- data/ext/sources/ggml/src/ggml-backend-reg.cpp +28 -86
- data/ext/sources/ggml/src/ggml-backend.cpp +5 -2
- data/ext/sources/ggml/src/ggml-blas/CMakeLists.txt +1 -1
- data/ext/sources/ggml/src/ggml-blas/ggml-blas.cpp +6 -2
- data/ext/sources/ggml/src/ggml-cann/acl_tensor.cpp +1 -1
- data/ext/sources/ggml/src/ggml-cann/acl_tensor.h +1 -1
- data/ext/sources/ggml/src/ggml-cann/aclnn_ops.cpp +348 -189
- data/ext/sources/ggml/src/ggml-cann/aclnn_ops.h +40 -85
- data/ext/sources/ggml/src/ggml-cann/common.h +3 -4
- data/ext/sources/ggml/src/ggml-cann/ggml-cann.cpp +44 -62
- data/ext/sources/ggml/src/ggml-common.h +11 -0
- data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +16 -11
- data/ext/sources/ggml/src/ggml-cpu/amx/amx.cpp +42 -19
- data/ext/sources/ggml/src/ggml-cpu/amx/common.h +34 -10
- data/ext/sources/ggml/src/ggml-cpu/amx/mmq.cpp +85 -85
- data/ext/sources/ggml/src/ggml-cpu/arch/arm/quants.c +85 -1
- data/ext/sources/ggml/src/ggml-cpu/arch/arm/repack.cpp +2744 -548
- data/ext/sources/ggml/src/ggml-cpu/arch/riscv/quants.c +1653 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/riscv/repack.cpp +1391 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/s390/quants.c +8 -10
- data/ext/sources/ggml/src/ggml-cpu/arch/x86/quants.c +9 -9
- data/ext/sources/ggml/src/ggml-cpu/arch/x86/repack.cpp +118 -18
- data/ext/sources/ggml/src/ggml-cpu/arch-fallback.h +107 -26
- data/ext/sources/ggml/src/ggml-cpu/binary-ops.cpp +2 -6
- data/ext/sources/ggml/src/ggml-cpu/common.h +8 -0
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-impl.h +3 -0
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +59 -12
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.cpp +15 -0
- data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.cpp +21 -20
- data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +965 -252
- data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +584 -197
- data/ext/sources/ggml/src/ggml-cpu/ops.cpp +903 -188
- data/ext/sources/ggml/src/ggml-cpu/ops.h +1 -0
- data/ext/sources/ggml/src/ggml-cpu/quants.c +40 -0
- data/ext/sources/ggml/src/ggml-cpu/quants.h +3 -0
- data/ext/sources/ggml/src/ggml-cpu/repack.cpp +2890 -679
- data/ext/sources/ggml/src/ggml-cpu/repack.h +119 -8
- data/ext/sources/ggml/src/ggml-cpu/simd-gemm.h +136 -0
- data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +111 -3
- data/ext/sources/ggml/src/ggml-cpu/unary-ops.cpp +1 -1
- data/ext/sources/ggml/src/ggml-cpu/vec.cpp +17 -0
- data/ext/sources/ggml/src/ggml-cuda/CMakeLists.txt +1 -1
- data/ext/sources/ggml/src/ggml-cuda/argsort.cu +19 -10
- data/ext/sources/ggml/src/ggml-cuda/binbcast.cu +32 -30
- data/ext/sources/ggml/src/ggml-cuda/common.cuh +134 -18
- data/ext/sources/ggml/src/ggml-cuda/convert.cu +41 -27
- data/ext/sources/ggml/src/ggml-cuda/cpy.cu +6 -3
- data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +78 -64
- data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +384 -143
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cuh +36 -22
- data/ext/sources/ggml/src/ggml-cuda/fattn-vec.cuh +3 -3
- data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +26 -5
- data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cuh +1 -1
- data/ext/sources/ggml/src/ggml-cuda/fattn.cu +127 -12
- data/ext/sources/ggml/src/ggml-cuda/gated_delta_net.cu +263 -0
- data/ext/sources/ggml/src/ggml-cuda/gated_delta_net.cuh +4 -0
- data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +595 -200
- data/ext/sources/ggml/src/ggml-cuda/mean.cu +9 -8
- data/ext/sources/ggml/src/ggml-cuda/mma.cuh +173 -6
- data/ext/sources/ggml/src/ggml-cuda/mmf.cu +30 -10
- data/ext/sources/ggml/src/ggml-cuda/mmf.cuh +158 -85
- data/ext/sources/ggml/src/ggml-cuda/mmq.cuh +34 -22
- data/ext/sources/ggml/src/ggml-cuda/mmvf.cu +127 -67
- data/ext/sources/ggml/src/ggml-cuda/mmvf.cuh +2 -0
- data/ext/sources/ggml/src/ggml-cuda/mmvq.cu +157 -65
- data/ext/sources/ggml/src/ggml-cuda/mmvq.cuh +1 -0
- data/ext/sources/ggml/src/ggml-cuda/norm.cu +18 -76
- data/ext/sources/ggml/src/ggml-cuda/pad.cu +13 -10
- data/ext/sources/ggml/src/ggml-cuda/quantize.cu +1 -1
- data/ext/sources/ggml/src/ggml-cuda/reduce_rows.cuh +2 -16
- data/ext/sources/ggml/src/ggml-cuda/rope.cu +233 -133
- data/ext/sources/ggml/src/ggml-cuda/softmax.cu +8 -83
- data/ext/sources/ggml/src/ggml-cuda/solve_tri.cu +1 -1
- data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cu +56 -32
- data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cuh +1 -1
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_32.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_4.cu +1 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_32.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_4.cu +1 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_4.cu +1 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_4.cu +1 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +3 -3
- data/ext/sources/ggml/src/ggml-cuda/top-k.cu +0 -1
- data/ext/sources/ggml/src/ggml-cuda/topk-moe.cu +199 -135
- data/ext/sources/ggml/src/ggml-cuda/topk-moe.cuh +20 -14
- data/ext/sources/ggml/src/ggml-cuda/unary.cu +55 -0
- data/ext/sources/ggml/src/ggml-cuda/unary.cuh +2 -0
- data/ext/sources/ggml/src/ggml-cuda/vecdotq.cuh +31 -17
- data/ext/sources/ggml/src/ggml-cuda/vendors/hip.h +10 -0
- data/ext/sources/ggml/src/ggml-hexagon/CMakeLists.txt +82 -45
- data/ext/sources/ggml/src/ggml-hexagon/ggml-hexagon.cpp +334 -160
- data/ext/sources/ggml/src/ggml-hexagon/htp/CMakeLists.txt +7 -5
- data/ext/sources/ggml/src/ggml-hexagon/htp/act-ops.c +328 -197
- data/ext/sources/ggml/src/ggml-hexagon/htp/argsort-ops.c +281 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/binary-ops.c +765 -234
- data/ext/sources/ggml/src/ggml-hexagon/htp/cpy-ops.c +252 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/flash-attn-ops.c +412 -265
- data/ext/sources/ggml/src/ggml-hexagon/htp/get-rows-ops.c +23 -23
- data/ext/sources/ggml/src/ggml-hexagon/htp/{htp-dma.c → hex-dma.c} +1 -1
- data/ext/sources/ggml/src/ggml-hexagon/htp/{htp-dma.h → hex-dma.h} +28 -3
- data/ext/sources/ggml/src/ggml-hexagon/htp/hex-dump.h +77 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hex-fastdiv.h +37 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hex-utils.h +51 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ctx.h +1 -1
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp-msg.h +27 -37
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ops.h +6 -35
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-arith.h +443 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-base.h +240 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-copy.h +245 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-div.h +251 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-dump.h +129 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-exp.h +215 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-floor.h +100 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-inverse.h +210 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-reduce.h +296 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-scale.h +133 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sigmoid.h +141 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sqrt.h +126 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-types.h +36 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-utils.h +20 -1347
- data/ext/sources/ggml/src/ggml-hexagon/htp/main.c +211 -13
- data/ext/sources/ggml/src/ggml-hexagon/htp/matmul-ops.c +1119 -952
- data/ext/sources/ggml/src/ggml-hexagon/htp/rope-ops.c +254 -244
- data/ext/sources/ggml/src/ggml-hexagon/htp/set-rows-ops.c +36 -36
- data/ext/sources/ggml/src/ggml-hexagon/htp/softmax-ops.c +155 -138
- data/ext/sources/ggml/src/ggml-hexagon/htp/ssm-conv.c +339 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/sum-rows-ops.c +128 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/unary-ops.c +209 -114
- data/ext/sources/ggml/src/ggml-hexagon/htp/worker-pool.c +1 -5
- data/ext/sources/ggml/src/ggml-hexagon/htp-drv.cpp +418 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp-drv.h +121 -0
- data/ext/sources/ggml/src/ggml-hexagon/libdl.h +79 -0
- data/ext/sources/ggml/src/ggml-hexagon/libggml-htp.inf +38 -0
- data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +6 -0
- data/ext/sources/ggml/src/ggml-impl.h +62 -0
- data/ext/sources/ggml/src/ggml-metal/CMakeLists.txt +10 -10
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-common.cpp +13 -2
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.h +8 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.m +147 -17
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.cpp +274 -73
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.h +22 -4
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.m +102 -36
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +174 -23
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.cpp +580 -280
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.h +5 -4
- data/ext/sources/ggml/src/ggml-metal/ggml-metal.cpp +320 -107
- data/ext/sources/ggml/src/ggml-metal/ggml-metal.metal +1068 -825
- data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +19 -1
- data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +3108 -636
- data/ext/sources/ggml/src/ggml-opencl/kernels/concat.cl +41 -99
- data/ext/sources/ggml/src/ggml-opencl/kernels/cpy.cl +45 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/cumsum.cl +139 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/cvt.cl +204 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/diag.cl +27 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/exp.cl +125 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/expm1.cl +87 -56
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q4_1_f32.cl +132 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_general_q8_0_f32.cl +195 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q4_1_f32.cl +283 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/l2_norm.cl +71 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mean.cl +114 -13
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q4_0_f32_l4_lm.cl +163 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q4_1_f32_l4_lm.cl +165 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q6_k_f32_l4_lm.cl +158 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_8x4.cl +129 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32.cl +219 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32_flat.cl +229 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_k_f32.cl +180 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/{mul_mv_q6_k.cl → mul_mv_q6_k_f32.cl} +4 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32_flat.cl +194 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/neg.cl +125 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/repeat.cl +31 -32
- data/ext/sources/ggml/src/ggml-opencl/kernels/scale.cl +14 -4
- data/ext/sources/ggml/src/ggml-opencl/kernels/softplus.cl +88 -60
- data/ext/sources/ggml/src/ggml-opencl/kernels/solve_tri.cl +51 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/sum_rows.cl +114 -13
- data/ext/sources/ggml/src/ggml-opencl/kernels/tanh.cl +94 -48
- data/ext/sources/ggml/src/ggml-opencl/kernels/transpose.cl +26 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/tri.cl +32 -0
- data/ext/sources/ggml/src/ggml-openvino/.clang-format +154 -0
- data/ext/sources/ggml/src/ggml-openvino/CMakeLists.txt +22 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-decoder.cpp +975 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-decoder.h +294 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-openvino-extra.cpp +373 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-openvino-extra.h +182 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-openvino.cpp +1110 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-quants.cpp +884 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-quants.h +153 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/decoder.h +74 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/frontend.cpp +27 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/frontend.h +23 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/input_model.cpp +17 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/input_model.h +29 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/node_context.h +112 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/cont.cpp +48 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/cpy.cpp +21 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp +90 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/get_rows.cpp +69 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/glu_geglu.cpp +61 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp +62 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/mulmat.cpp +90 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/permute.cpp +102 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/reshape.cpp +83 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp +46 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/rope.cpp +123 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/scale.cpp +41 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/set_rows.cpp +76 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/softmax.cpp +89 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/transpose.cpp +23 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/unary_silu.cpp +27 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/view.cpp +53 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op_table.cpp +46 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op_table.h +39 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.cpp +123 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.h +17 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.cpp +60 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.h +17 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/mark_decompression_convert_constant_folding.h +29 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/squeeze_matmul.cpp +58 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/squeeze_matmul.h +17 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/translate_session.cpp +293 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/translate_session.h +28 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/utils.cpp +226 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/utils.h +85 -0
- data/ext/sources/ggml/src/ggml-openvino/utils.cpp +823 -0
- data/ext/sources/ggml/src/ggml-openvino/utils.h +123 -0
- data/ext/sources/ggml/src/ggml-quants.c +96 -5
- data/ext/sources/ggml/src/ggml-quants.h +3 -0
- data/ext/sources/ggml/src/ggml-sycl/CMakeLists.txt +15 -88
- data/ext/sources/ggml/src/ggml-sycl/add-id.cpp +5 -1
- data/ext/sources/ggml/src/ggml-sycl/backend.hpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/binbcast.cpp +21 -20
- data/ext/sources/ggml/src/ggml-sycl/common.hpp +315 -10
- data/ext/sources/ggml/src/ggml-sycl/convert.cpp +69 -1
- data/ext/sources/ggml/src/ggml-sycl/convert.hpp +22 -1
- data/ext/sources/ggml/src/ggml-sycl/count-equal.cpp +1 -1
- data/ext/sources/ggml/src/ggml-sycl/dpct/helper.hpp +791 -47
- data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +78 -68
- data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +2 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn-common.hpp +1179 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn-tile.cpp +55 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn-tile.hpp +1338 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn-vec.hpp +667 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn.cpp +225 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn.hpp +22 -0
- data/ext/sources/ggml/src/ggml-sycl/gated_delta_net.cpp +309 -0
- data/ext/sources/ggml/src/ggml-sycl/gated_delta_net.hpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/ggml-sycl.cpp +316 -51
- data/ext/sources/ggml/src/ggml-sycl/norm.cpp +65 -66
- data/ext/sources/ggml/src/ggml-sycl/outprod.cpp +3 -3
- data/ext/sources/ggml/src/ggml-sycl/presets.hpp +3 -0
- data/ext/sources/ggml/src/ggml-sycl/quants.hpp +1 -1
- data/ext/sources/ggml/src/ggml-sycl/rope.cpp +450 -287
- data/ext/sources/ggml/src/ggml-sycl/rope.hpp +6 -0
- data/ext/sources/ggml/src/ggml-sycl/softmax.cpp +6 -6
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq112-dv112.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq128-dv128.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq256-dv256.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq40-dv40.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq576-dv512.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq64-dv64.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq72-dv72.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq80-dv80.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq96-dv96.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-f16.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q8_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-f16.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q8_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-f16.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q8_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-f16.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q8_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-f16.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q8_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-f16.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q8_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +13 -0
- data/ext/sources/ggml/src/ggml-sycl/wkv.cpp +1 -1
- data/ext/sources/ggml/src/ggml-virtgpu/CMakeLists.txt +70 -0
- data/ext/sources/ggml/src/ggml-virtgpu/apir_cs_ggml-rpc-front.cpp +87 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/CMakeLists.txt +21 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/apir_cs_ggml-rpc-back.cpp +115 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-convert.h +13 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-backend.cpp +102 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer-type.cpp +105 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer.cpp +179 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-device.cpp +148 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched.cpp +51 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched.gen.h +73 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched.h +27 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-virgl-apir.h +32 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend.cpp +144 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/api_remoting.h +95 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_backend.gen.h +94 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_backend.h +50 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_cs.h +378 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_cs_ggml.h +232 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_cs_rpc.h +58 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-buffer-type.cpp +81 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-buffer.cpp +119 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-device.cpp +158 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-reg.cpp +213 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend.cpp +69 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-remoting.h +71 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggmlremoting_functions.yaml +166 -0
- data/ext/sources/ggml/src/ggml-virtgpu/include/apir_hw.h +9 -0
- data/ext/sources/ggml/src/ggml-virtgpu/regenerate_remoting.py +333 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-apir.h +15 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-backend.cpp +58 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-buffer-type.cpp +110 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-buffer.cpp +173 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-device.cpp +192 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-impl.h +36 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward.gen.h +53 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-shm.cpp +98 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-shm.h +23 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-utils.cpp +179 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-utils.h +86 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu.cpp +544 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu.h +117 -0
- data/ext/sources/ggml/src/ggml-vulkan/CMakeLists.txt +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/ggml-vulkan.cpp +1250 -465
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/acc.comp +16 -8
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/elu.comp +27 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +374 -170
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.glsl +66 -22
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +389 -201
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +106 -58
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_mask_opt.comp +162 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +9 -8
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gated_delta_net.comp +128 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/l2_norm.comp +12 -9
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.glsl +20 -17
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +11 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +8 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_id_funcs.glsl +3 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +5 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.glsl +3 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +2 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl +36 -63
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +7 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +7 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +7 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl +10 -5
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_vision.comp +7 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sgn.comp +21 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/ssm_conv.comp +16 -10
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +55 -35
- data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp +1314 -109
- data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu.cpp +1660 -1371
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/argmax.wgsl +72 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/argsort.wgsl +106 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/argsort_merge.wgsl +134 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary.wgsl +141 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +65 -72
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/concat.wgsl +75 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cpy.tmpl.wgsl +6 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cumsum.wgsl +66 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +40 -5
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn.wgsl +105 -60
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{get_rows.tmpl.wgsl → get_rows.wgsl} +53 -259
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{mul_mat.tmpl.wgsl → mul_mat.wgsl} +68 -257
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl +692 -23
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{mul_mat_reg_tile.tmpl.wgsl → mul_mat_reg_tile.wgsl} +28 -128
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{mul_mat_subgroup_matrix.tmpl.wgsl → mul_mat_subgroup_matrix.wgsl} +31 -137
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.wgsl +480 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/pad.wgsl +86 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/repeat.wgsl +67 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{scale.tmpl.wgsl → scale.wgsl} +9 -36
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +40 -12
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/sum_rows.wgsl +55 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/unary.wgsl +193 -0
- data/ext/sources/ggml/src/ggml-zdnn/ggml-zdnn.cpp +6 -1
- data/ext/sources/ggml/src/ggml-zendnn/CMakeLists.txt +31 -32
- data/ext/sources/ggml/src/ggml-zendnn/ggml-zendnn.cpp +9 -6
- data/ext/sources/ggml/src/ggml.c +167 -33
- data/ext/sources/ggml/src/gguf.cpp +229 -44
- data/ext/sources/src/whisper.cpp +6 -28
- data/sig/whisper.rbs +43 -2
- data/test/test_context_params.rb +82 -0
- data/test/test_token.rb +11 -0
- data/test/test_vad_context.rb +58 -8
- data/test/test_whisper.rb +20 -0
- data/whispercpp.gemspec +1 -1
- metadata +240 -28
- data/ext/sources/ggml/cmake/BuildTypes.cmake +0 -54
- data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm-ppc.h +0 -333
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-exp.c +0 -94
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-inverse.c +0 -72
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sigmoid.c +0 -49
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-utils.c +0 -1020
- data/ext/sources/ggml/src/ggml-hexagon/htp/ops-utils.h +0 -149
- data/ext/sources/ggml/src/ggml-hexagon/htp-utils.c +0 -454
- data/ext/sources/ggml/src/ggml-hexagon/htp-utils.h +0 -221
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/bin_op.tmpl.wgsl +0 -188
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary_head.tmpl +0 -45
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.tmpl.wgsl +0 -267
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.tmpl.wgsl +0 -112
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/unary_op.wgsl +0 -483
|
@@ -14,9 +14,6 @@
|
|
|
14
14
|
|
|
15
15
|
#ifdef _WIN32
|
|
16
16
|
# include <sal.h>
|
|
17
|
-
# ifndef _WINDOWS
|
|
18
|
-
# define _WINDOWS
|
|
19
|
-
# endif
|
|
20
17
|
#else
|
|
21
18
|
# include <semaphore.h>
|
|
22
19
|
# include <unistd.h>
|
|
@@ -25,8 +22,6 @@
|
|
|
25
22
|
#pragma clang diagnostic ignored "-Wnested-anon-types"
|
|
26
23
|
#pragma clang diagnostic ignored "-Wgnu-anonymous-struct"
|
|
27
24
|
|
|
28
|
-
#include "htp-utils.h"
|
|
29
|
-
|
|
30
25
|
#include <AEEStdErr.h>
|
|
31
26
|
#include <dspqueue.h>
|
|
32
27
|
#include <rpcmem.h>
|
|
@@ -40,14 +35,15 @@
|
|
|
40
35
|
#include "op-desc.h"
|
|
41
36
|
#include "htp-msg.h"
|
|
42
37
|
#include "htp_iface.h"
|
|
38
|
+
#include "htp-drv.h"
|
|
43
39
|
|
|
44
40
|
static size_t opt_ndev = 1;
|
|
45
|
-
static size_t opt_nhvx = 0;
|
|
46
|
-
static int opt_arch = 0;
|
|
41
|
+
static size_t opt_nhvx = 0; // use all
|
|
42
|
+
static int opt_arch = 0; // autodetect
|
|
47
43
|
static int opt_etm = 0;
|
|
48
44
|
static int opt_verbose = 0;
|
|
49
45
|
static int opt_profile = 0;
|
|
50
|
-
static int opt_hostbuf = 1;
|
|
46
|
+
static int opt_hostbuf = 1; // hostbuf ON by default
|
|
51
47
|
static int opt_experimental = 0;
|
|
52
48
|
|
|
53
49
|
// Enable all stages by default
|
|
@@ -143,16 +139,16 @@ struct ggml_hexagon_session {
|
|
|
143
139
|
};
|
|
144
140
|
|
|
145
141
|
void ggml_hexagon_session::enqueue(struct htp_general_req &req, struct dspqueue_buffer *bufs, uint32_t n_bufs, bool sync) {
|
|
146
|
-
// Bump pending flag (cleared in the session::flush once we get the
|
|
142
|
+
// Bump pending flag (cleared in the session::flush once we get the response)
|
|
147
143
|
this->op_pending++; // atomic inc
|
|
148
144
|
|
|
149
145
|
int err = dspqueue_write(this->queue,
|
|
150
146
|
0, // flags - the framework will autoset this
|
|
151
147
|
n_bufs, // number of buffers
|
|
152
148
|
bufs, // buffer references
|
|
153
|
-
sizeof(req),
|
|
149
|
+
sizeof(req), // Message length
|
|
154
150
|
(const uint8_t *) &req, // Message
|
|
155
|
-
|
|
151
|
+
DSPQUEUE_TIMEOUT // Timeout
|
|
156
152
|
);
|
|
157
153
|
|
|
158
154
|
if (err != 0) {
|
|
@@ -182,13 +178,13 @@ void ggml_hexagon_session::flush() {
|
|
|
182
178
|
|
|
183
179
|
// Read response packet from queue
|
|
184
180
|
int err = dspqueue_read(q, &flags,
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
181
|
+
HTP_MAX_PACKET_BUFFERS, // Maximum number of buffer references
|
|
182
|
+
&n_bufs, // Number of buffer references
|
|
183
|
+
bufs, // Buffer references
|
|
184
|
+
sizeof(rsp), // Max message length
|
|
185
|
+
&rsp_size, // Message length
|
|
186
|
+
(uint8_t *) &rsp, // Message
|
|
187
|
+
DSPQUEUE_TIMEOUT); // Timeout
|
|
192
188
|
|
|
193
189
|
if (err == AEE_EEXPIRED) {
|
|
194
190
|
// TODO: might need to bail out if the HTP is stuck on something
|
|
@@ -269,13 +265,7 @@ struct ggml_backend_hexagon_buffer_context {
|
|
|
269
265
|
ggml_backend_hexagon_buffer_context(ggml_hexagon_session * sess, size_t size, bool repack) {
|
|
270
266
|
size += 4 * 1024; // extra page for padding
|
|
271
267
|
|
|
272
|
-
|
|
273
|
-
this->base = (uint8_t *) rpcmem_alloc2(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS | RPCMEM_HEAP_NOREG, size);
|
|
274
|
-
} else {
|
|
275
|
-
GGML_LOG_INFO("ggml-hex: %s rpcmem_alloc2 not found, falling back to rpcmem_alloc\n", sess->name.c_str());
|
|
276
|
-
this->base = (uint8_t *) rpcmem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS | RPCMEM_HEAP_NOREG, size);
|
|
277
|
-
}
|
|
278
|
-
|
|
268
|
+
this->base = (uint8_t *) rpcmem_alloc2(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS | RPCMEM_HEAP_NOREG, size);
|
|
279
269
|
if (!this->base) {
|
|
280
270
|
GGML_LOG_ERROR("ggml-hex: %s failed to allocate buffer : size %zu\n", sess->name.c_str(), size);
|
|
281
271
|
throw std::runtime_error("ggml-hex: rpcmem_alloc failed (see log for details)");
|
|
@@ -412,6 +402,7 @@ static void pack_q4_0_quants(block_q4_0 * x, const uint8_t * qs, unsigned int bi
|
|
|
412
402
|
static void repack_row_q4x4x2(uint8_t * y, const block_q4_0 * x, int64_t k) {
|
|
413
403
|
static const int qk = QK_Q4_0x4x2;
|
|
414
404
|
const int nb = (k + qk - 1) / qk; // number of blocks (padded)
|
|
405
|
+
const int nloe = k % qk; // leftovers
|
|
415
406
|
|
|
416
407
|
const int dblk_size = 8 * 2; // 8x __fp16
|
|
417
408
|
const int qblk_size = qk / 2; // int4
|
|
@@ -445,15 +436,17 @@ static void repack_row_q4x4x2(uint8_t * y, const block_q4_0 * x, int64_t k) {
|
|
|
445
436
|
unpack_q4_0_quants(qs, &x[i * 8 + 6], 6);
|
|
446
437
|
unpack_q4_0_quants(qs, &x[i * 8 + 7], 7);
|
|
447
438
|
|
|
439
|
+
bool partial = (nloe && i == nb-1);
|
|
440
|
+
|
|
448
441
|
uint8_t * q = y_q + (i * qblk_size);
|
|
449
442
|
for (int j = 0; j < qk / 2; j++) {
|
|
450
|
-
q[j] = (qs[j + 128] << 4) | qs[j];
|
|
443
|
+
q[j] = partial ? (qs[j*2+1] << 4) | qs[j*2+0] : (qs[j+128] << 4) | qs[j+000];
|
|
451
444
|
}
|
|
452
445
|
}
|
|
453
446
|
|
|
454
447
|
// Repack the scales
|
|
455
448
|
// Note: Do not combine with the loop above. For tensor sizes not multiple of 256 (QK_Q4_0x4x2)
|
|
456
|
-
// the last block is truncated and
|
|
449
|
+
// the last block is truncated and overridden by the scales.
|
|
457
450
|
for (int i = 0; i < nb; i++) {
|
|
458
451
|
// Repack the scales
|
|
459
452
|
ggml_half * d = (ggml_half *) (y_d + i * dblk_size);
|
|
@@ -477,6 +470,7 @@ static void repack_row_q4x4x2(uint8_t * y, const block_q4_0 * x, int64_t k) {
|
|
|
477
470
|
static void unpack_row_q4x4x2(block_q4_0 * x, const uint8_t * y, int64_t k) {
|
|
478
471
|
static const int qk = QK_Q4_0x4x2;
|
|
479
472
|
const int nb = (k + qk - 1) / qk; // number of blocks (padded)
|
|
473
|
+
const int nloe = k % qk; // leftovers
|
|
480
474
|
|
|
481
475
|
const int dblk_size = 8 * 2; // 8x __fp16
|
|
482
476
|
const int qblk_size = qk / 2; // int4
|
|
@@ -495,10 +489,17 @@ static void unpack_row_q4x4x2(block_q4_0 * x, const uint8_t * y, int64_t k) {
|
|
|
495
489
|
for (int i = 0; i < nb; i++) {
|
|
496
490
|
uint8_t qs[QK_Q4_0x4x2]; // unpacked quants
|
|
497
491
|
|
|
492
|
+
bool partial = (nloe && i == nb-1);
|
|
493
|
+
|
|
498
494
|
const uint8_t * q = y_q + (i * qblk_size);
|
|
499
495
|
for (int j = 0; j < qk / 2; j++) {
|
|
500
|
-
|
|
501
|
-
|
|
496
|
+
if (partial) {
|
|
497
|
+
qs[j*2+0] = q[j] & 0xf;
|
|
498
|
+
qs[j*2+1] = q[j] >> 4;
|
|
499
|
+
} else {
|
|
500
|
+
qs[j+000] = q[j] & 0xf;
|
|
501
|
+
qs[j+128] = q[j] >> 4;
|
|
502
|
+
}
|
|
502
503
|
}
|
|
503
504
|
|
|
504
505
|
pack_q4_0_quants(&x[i * 8 + 0], qs, 0);
|
|
@@ -513,7 +514,7 @@ static void unpack_row_q4x4x2(block_q4_0 * x, const uint8_t * y, int64_t k) {
|
|
|
513
514
|
|
|
514
515
|
// Repack the scales
|
|
515
516
|
// Note: Do not combine with the loop above. For tensor sizes not multiple of 256 (QK_Q4_0x4x2)
|
|
516
|
-
// the last block is truncated and
|
|
517
|
+
// the last block is truncated and overridden by the scales.
|
|
517
518
|
for (int i = 0; i < nb; i++) {
|
|
518
519
|
// Unpack the scales
|
|
519
520
|
const ggml_half * d = (const ggml_half *) (y_d + i * dblk_size);
|
|
@@ -562,7 +563,7 @@ static void init_row_q4x4x2(block_q4_0 * x, int64_t k) {
|
|
|
562
563
|
|
|
563
564
|
// Init the scales
|
|
564
565
|
// Note: Do not combine with the loop above. For tensor sizes not multiple of 256 (QK_Q4_0x4x2)
|
|
565
|
-
// the last block is truncated and
|
|
566
|
+
// the last block is truncated and overridden by the scales.
|
|
566
567
|
for (int i = 0; i < nb; i++) {
|
|
567
568
|
// Unpack the scales
|
|
568
569
|
x[i * 8 + 0].d = 0;
|
|
@@ -780,7 +781,7 @@ static void repack_row_q8x4x2(uint8_t * y, const block_q8_0 * x, int64_t k) {
|
|
|
780
781
|
|
|
781
782
|
// Repack the scales
|
|
782
783
|
// Note: Do not combine with the loop above. For tensor sizes not multiple of 256 (QK_Q4_0x4x2)
|
|
783
|
-
// the last block is truncated and
|
|
784
|
+
// the last block is truncated and overridden by the scales.
|
|
784
785
|
for (int i = 0; i < nb; i++) {
|
|
785
786
|
// Repack the scales
|
|
786
787
|
ggml_half * d = (ggml_half *) (y_d + i * dblk_size);
|
|
@@ -839,7 +840,7 @@ static void unpack_row_q8x4x2(block_q8_0 * x, const uint8_t * y, int64_t k) {
|
|
|
839
840
|
|
|
840
841
|
// Repack the scales
|
|
841
842
|
// Note: Do not combine with the loop above. For tensor sizes not multiple of 256 (QK_Q4_0x4x2)
|
|
842
|
-
// the last block is truncated and
|
|
843
|
+
// the last block is truncated and overridden by the scales.
|
|
843
844
|
for (int i = 0; i < nb; i++) {
|
|
844
845
|
// Unpack the scales
|
|
845
846
|
const ggml_half * d = (const ggml_half *) (y_d + i * dblk_size);
|
|
@@ -888,7 +889,7 @@ static void init_row_q8x4x2(block_q8_0 * x, int64_t k) {
|
|
|
888
889
|
|
|
889
890
|
// Init the scales
|
|
890
891
|
// Note: Do not combine with the loop above. For tensor sizes not multiple of 256 (QK_Q8_0x4x2)
|
|
891
|
-
// the last block is truncated and
|
|
892
|
+
// the last block is truncated and overridden by the scales.
|
|
892
893
|
for (int i = 0; i < nb; i++) {
|
|
893
894
|
// Unpack the scales
|
|
894
895
|
x[i * 8 + 0].d = 0;
|
|
@@ -1088,6 +1089,7 @@ static void pack_mxfp4_quants(block_mxfp4 * x, const uint8_t * qs, unsigned int
|
|
|
1088
1089
|
static void repack_row_mxfp4x4x2(uint8_t * y, const block_mxfp4 * x, int64_t k) {
|
|
1089
1090
|
static const int qk = QK_MXFP4x4x2;
|
|
1090
1091
|
const int nb = (k + qk - 1) / qk; // number of blocks (padded)
|
|
1092
|
+
const int nloe = k % qk; // leftovers
|
|
1091
1093
|
|
|
1092
1094
|
const int eblk_size = 8 * 1; // 8x E8M0
|
|
1093
1095
|
const int qblk_size = qk / 2; // int4
|
|
@@ -1122,15 +1124,17 @@ static void repack_row_mxfp4x4x2(uint8_t * y, const block_mxfp4 * x, int64_t k)
|
|
|
1122
1124
|
unpack_mxfp4_quants(qs, &x[i * 8 + 6], 6);
|
|
1123
1125
|
unpack_mxfp4_quants(qs, &x[i * 8 + 7], 7);
|
|
1124
1126
|
|
|
1127
|
+
bool partial = (nloe && i == nb-1);
|
|
1128
|
+
|
|
1125
1129
|
uint8_t * q = y_q + (i * qblk_size);
|
|
1126
1130
|
for (int j = 0; j < qk / 2; j++) {
|
|
1127
|
-
q[j] = (qs[j + 128] << 4) | qs[j];
|
|
1131
|
+
q[j] = partial ? (qs[j*2+1] << 4) | qs[j*2+0] : (qs[j+128] << 4) | qs[j+000];
|
|
1128
1132
|
}
|
|
1129
1133
|
}
|
|
1130
1134
|
|
|
1131
1135
|
// Repack the scales
|
|
1132
1136
|
// Note: Do not combine with the loop above. For tensor sizes not multiple of 256 (QK_MXFP4x4x2)
|
|
1133
|
-
// the last block is truncated and
|
|
1137
|
+
// the last block is truncated and overridden by the scales.
|
|
1134
1138
|
for (int i = 0; i < nb; i++) {
|
|
1135
1139
|
// Repack the scales
|
|
1136
1140
|
uint8_t * e = (uint8_t *) (y_e + i * eblk_size);
|
|
@@ -1154,6 +1158,7 @@ static void repack_row_mxfp4x4x2(uint8_t * y, const block_mxfp4 * x, int64_t k)
|
|
|
1154
1158
|
static void unpack_row_mxfp4x4x2(block_mxfp4 * x, const uint8_t * y, int64_t k) {
|
|
1155
1159
|
static const int qk = QK_MXFP4x4x2;
|
|
1156
1160
|
const int nb = (k + qk - 1) / qk; // number of blocks (padded)
|
|
1161
|
+
const int nloe = k % qk; // leftovers
|
|
1157
1162
|
|
|
1158
1163
|
const int eblk_size = 8 * 1; // 8x E8M0
|
|
1159
1164
|
const int qblk_size = qk / 2; // int4
|
|
@@ -1172,10 +1177,17 @@ static void unpack_row_mxfp4x4x2(block_mxfp4 * x, const uint8_t * y, int64_t k)
|
|
|
1172
1177
|
for (int i = 0; i < nb; i++) {
|
|
1173
1178
|
uint8_t qs[QK_MXFP4x4x2]; // unpacked quants
|
|
1174
1179
|
|
|
1180
|
+
bool partial = (nloe && i == nb-1);
|
|
1181
|
+
|
|
1175
1182
|
const uint8_t * q = y_q + (i * qblk_size);
|
|
1176
1183
|
for (int j = 0; j < qk / 2; j++) {
|
|
1177
|
-
|
|
1178
|
-
|
|
1184
|
+
if (partial) {
|
|
1185
|
+
qs[j*2+0] = q[j] & 0xf;
|
|
1186
|
+
qs[j*2+1] = q[j] >> 4;
|
|
1187
|
+
} else {
|
|
1188
|
+
qs[j+000] = q[j] & 0xf;
|
|
1189
|
+
qs[j+128] = q[j] >> 4;
|
|
1190
|
+
}
|
|
1179
1191
|
}
|
|
1180
1192
|
|
|
1181
1193
|
pack_mxfp4_quants(&x[i * 8 + 0], qs, 0);
|
|
@@ -1190,7 +1202,7 @@ static void unpack_row_mxfp4x4x2(block_mxfp4 * x, const uint8_t * y, int64_t k)
|
|
|
1190
1202
|
|
|
1191
1203
|
// Repack the scales
|
|
1192
1204
|
// Note: Do not combine with the loop above. For tensor sizes not multiple of 256 (QK_MXFP4_0x4x2)
|
|
1193
|
-
// the last block is truncated and
|
|
1205
|
+
// the last block is truncated and overridden by the scales.
|
|
1194
1206
|
for (int i = 0; i < nb; i++) {
|
|
1195
1207
|
// Unpack the scales
|
|
1196
1208
|
const uint8_t * e = (const uint8_t *) (y_e + i * eblk_size);
|
|
@@ -1239,7 +1251,7 @@ static void init_row_mxfp4x4x2(block_mxfp4 * x, int64_t k) {
|
|
|
1239
1251
|
|
|
1240
1252
|
// Init the scales
|
|
1241
1253
|
// Note: Do not combine with the loop above. For tensor sizes not multiple of 256 (QK_MXFP4x4x2)
|
|
1242
|
-
// the last block is truncated and
|
|
1254
|
+
// the last block is truncated and overridden by the scales.
|
|
1243
1255
|
for (int i = 0; i < nb; i++) {
|
|
1244
1256
|
// Unpack the scales
|
|
1245
1257
|
x[i * 8 + 0].e = 0;
|
|
@@ -1753,24 +1765,10 @@ static bool ggml_backend_buffer_is_hexagon(const struct ggml_backend_buffer * b)
|
|
|
1753
1765
|
}
|
|
1754
1766
|
|
|
1755
1767
|
static inline bool ggml_backend_buffer_is_hexagon_repack(const struct ggml_backend_buffer * b) {
|
|
1756
|
-
|
|
1757
|
-
|
|
1758
|
-
|
|
1759
|
-
static bool hex_supported_dims2(const struct ggml_tensor * x, const struct ggml_tensor * y) {
|
|
1760
|
-
if (x->ne[0] != y->ne[0]) {
|
|
1761
|
-
return false;
|
|
1768
|
+
if (!opt_hostbuf) {
|
|
1769
|
+
return ggml_backend_buffer_is_hexagon(b);
|
|
1762
1770
|
}
|
|
1763
|
-
|
|
1764
|
-
return false;
|
|
1765
|
-
}
|
|
1766
|
-
if (x->ne[2] != y->ne[2]) {
|
|
1767
|
-
return false;
|
|
1768
|
-
}
|
|
1769
|
-
if (x->ne[3] != y->ne[3]) {
|
|
1770
|
-
return false;
|
|
1771
|
-
}
|
|
1772
|
-
|
|
1773
|
-
return true;
|
|
1771
|
+
return b->buft->iface.alloc_buffer == ggml_backend_hexagon_repack_buffer_type_alloc_buffer;
|
|
1774
1772
|
}
|
|
1775
1773
|
|
|
1776
1774
|
static bool ggml_hexagon_supported_flash_attn_ext(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
|
|
@@ -1804,43 +1802,6 @@ static bool ggml_hexagon_supported_flash_attn_ext(const struct ggml_hexagon_sess
|
|
|
1804
1802
|
return opt_experimental;
|
|
1805
1803
|
}
|
|
1806
1804
|
|
|
1807
|
-
static bool hex_supported_src0_type(ggml_type t) {
|
|
1808
|
-
return t == GGML_TYPE_F32;
|
|
1809
|
-
}
|
|
1810
|
-
|
|
1811
|
-
static bool hex_supported_src1_type(ggml_type t) {
|
|
1812
|
-
return t == GGML_TYPE_F32;
|
|
1813
|
-
}
|
|
1814
|
-
|
|
1815
|
-
static bool hex_supported_src2_type(ggml_type t) {
|
|
1816
|
-
return t == GGML_TYPE_F32;
|
|
1817
|
-
}
|
|
1818
|
-
|
|
1819
|
-
static bool hex_supported_src1_type2(ggml_type t) {
|
|
1820
|
-
return t == GGML_TYPE_F16;
|
|
1821
|
-
}
|
|
1822
|
-
|
|
1823
|
-
static bool hex_supported_src1_type3(ggml_type t) {
|
|
1824
|
-
return t == GGML_TYPE_I32;
|
|
1825
|
-
}
|
|
1826
|
-
|
|
1827
|
-
static bool hex_supported_dst_type(ggml_type t) {
|
|
1828
|
-
return t == GGML_TYPE_F32;
|
|
1829
|
-
}
|
|
1830
|
-
|
|
1831
|
-
static bool hex_supported_dims(const struct ggml_tensor * x, const struct ggml_tensor * y) {
|
|
1832
|
-
// TODO: support broadcast for ne[2 and 3]
|
|
1833
|
-
if (x->ne[0] != y->ne[0]) {
|
|
1834
|
-
return false;
|
|
1835
|
-
}
|
|
1836
|
-
if (x->ne[2] != y->ne[2]) {
|
|
1837
|
-
return false;
|
|
1838
|
-
}
|
|
1839
|
-
if (x->ne[3] != y->ne[3]) {
|
|
1840
|
-
return false;
|
|
1841
|
-
}
|
|
1842
|
-
return true;
|
|
1843
|
-
}
|
|
1844
1805
|
|
|
1845
1806
|
static bool ggml_hexagon_supported_mul_mat(const struct ggml_hexagon_session * sess, const struct ggml_tensor * dst) {
|
|
1846
1807
|
const struct ggml_tensor * src0 = dst->src[0];
|
|
@@ -1862,12 +1823,12 @@ static bool ggml_hexagon_supported_mul_mat(const struct ggml_hexagon_session * s
|
|
|
1862
1823
|
return false;
|
|
1863
1824
|
}
|
|
1864
1825
|
|
|
1865
|
-
if (src0
|
|
1826
|
+
if (ggml_nrows(src0) > 16 * 1024) {
|
|
1866
1827
|
return false; // typically the lm-head which would be too large for VTCM
|
|
1867
1828
|
}
|
|
1868
1829
|
|
|
1869
|
-
if ((src1->ne[2] != 1 || src1->ne[3] != 1)
|
|
1870
|
-
return false;
|
|
1830
|
+
if (ggml_nrows(src1) > 1024 || src1->ne[2] != 1 || src1->ne[3] != 1) {
|
|
1831
|
+
return false; // no huge batches or broadcasting (for now)
|
|
1871
1832
|
}
|
|
1872
1833
|
|
|
1873
1834
|
// src0 (weights) must be repacked
|
|
@@ -1881,6 +1842,9 @@ static bool ggml_hexagon_supported_mul_mat(const struct ggml_hexagon_session * s
|
|
|
1881
1842
|
GGML_LOG_DEBUG("ggml_hexagon_supported_mul_mat: permuted F16 src0 not supported\n");
|
|
1882
1843
|
return false;
|
|
1883
1844
|
}
|
|
1845
|
+
if (ggml_nrows(src1) > 1024) {
|
|
1846
|
+
return false; // no huge batches (for now)
|
|
1847
|
+
}
|
|
1884
1848
|
break;
|
|
1885
1849
|
|
|
1886
1850
|
default:
|
|
@@ -1926,24 +1890,30 @@ static bool ggml_hexagon_supported_binary(const struct ggml_hexagon_session * se
|
|
|
1926
1890
|
const struct ggml_tensor * src1 = op->src[1];
|
|
1927
1891
|
const struct ggml_tensor * dst = op;
|
|
1928
1892
|
|
|
1929
|
-
if (
|
|
1930
|
-
|
|
1931
|
-
|
|
1932
|
-
|
|
1933
|
-
|
|
1893
|
+
if (src0->type == GGML_TYPE_F32) {
|
|
1894
|
+
if (src1->type != GGML_TYPE_F32) {
|
|
1895
|
+
return false;
|
|
1896
|
+
}
|
|
1897
|
+
if (dst->type != GGML_TYPE_F32) {
|
|
1898
|
+
return false;
|
|
1899
|
+
}
|
|
1934
1900
|
}
|
|
1935
|
-
if (
|
|
1936
|
-
|
|
1901
|
+
else if (src0->type == GGML_TYPE_F16) {
|
|
1902
|
+
if (src1->type != GGML_TYPE_F16) {
|
|
1903
|
+
return false;
|
|
1904
|
+
}
|
|
1905
|
+
if (dst->type != GGML_TYPE_F16) {
|
|
1906
|
+
return false;
|
|
1907
|
+
}
|
|
1937
1908
|
}
|
|
1938
|
-
|
|
1909
|
+
else {
|
|
1939
1910
|
return false;
|
|
1940
1911
|
}
|
|
1941
|
-
|
|
1912
|
+
|
|
1913
|
+
if (!ggml_are_same_shape(src0, dst)) {
|
|
1942
1914
|
return false;
|
|
1943
1915
|
}
|
|
1944
|
-
|
|
1945
|
-
// TODO: add support for non-contigiuos tensors
|
|
1946
|
-
if (!ggml_is_contiguous(src0) || !ggml_is_contiguous(src1) || !ggml_is_contiguous(dst)) {
|
|
1916
|
+
if (!ggml_can_repeat(src1, src0) || ggml_is_permuted(src1)) {
|
|
1947
1917
|
return false;
|
|
1948
1918
|
}
|
|
1949
1919
|
|
|
@@ -1955,16 +1925,16 @@ static bool ggml_hexagon_supported_add_id(const struct ggml_hexagon_session * se
|
|
|
1955
1925
|
const struct ggml_tensor * src1 = op->src[1];
|
|
1956
1926
|
const struct ggml_tensor * dst = op;
|
|
1957
1927
|
|
|
1958
|
-
if (
|
|
1928
|
+
if (src0->type != GGML_TYPE_F32) {
|
|
1959
1929
|
return false;
|
|
1960
1930
|
}
|
|
1961
|
-
if (
|
|
1931
|
+
if (src1->type != GGML_TYPE_F32) {
|
|
1962
1932
|
return false;
|
|
1963
1933
|
}
|
|
1964
|
-
if (
|
|
1934
|
+
if (dst->type != GGML_TYPE_F32) {
|
|
1965
1935
|
return false;
|
|
1966
1936
|
}
|
|
1967
|
-
if (!
|
|
1937
|
+
if (!ggml_are_same_shape(src0, dst)) {
|
|
1968
1938
|
return false;
|
|
1969
1939
|
}
|
|
1970
1940
|
|
|
@@ -1980,13 +1950,32 @@ static bool ggml_hexagon_supported_unary(const struct ggml_hexagon_session * ses
|
|
|
1980
1950
|
const struct ggml_tensor * src0 = op->src[0];
|
|
1981
1951
|
const struct ggml_tensor * dst = op;
|
|
1982
1952
|
|
|
1983
|
-
if (
|
|
1953
|
+
if (src0->type != GGML_TYPE_F32) {
|
|
1954
|
+
return false;
|
|
1955
|
+
}
|
|
1956
|
+
if (dst->type != GGML_TYPE_F32) {
|
|
1957
|
+
return false;
|
|
1958
|
+
}
|
|
1959
|
+
if (!ggml_are_same_shape(src0, dst)) {
|
|
1960
|
+
return false;
|
|
1961
|
+
}
|
|
1962
|
+
|
|
1963
|
+
// TODO: add support for non-contigiuos tensors
|
|
1964
|
+
if (!ggml_is_contiguous(src0) || !ggml_is_contiguous(dst)) {
|
|
1984
1965
|
return false;
|
|
1985
1966
|
}
|
|
1986
|
-
|
|
1967
|
+
|
|
1968
|
+
return true;
|
|
1969
|
+
}
|
|
1970
|
+
|
|
1971
|
+
static bool ggml_hexagon_supported_sum_rows(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
|
|
1972
|
+
const struct ggml_tensor * src0 = op->src[0];
|
|
1973
|
+
const struct ggml_tensor * dst = op;
|
|
1974
|
+
|
|
1975
|
+
if (src0->type != GGML_TYPE_F32) {
|
|
1987
1976
|
return false;
|
|
1988
1977
|
}
|
|
1989
|
-
if (
|
|
1978
|
+
if (dst->type != GGML_TYPE_F32) {
|
|
1990
1979
|
return false;
|
|
1991
1980
|
}
|
|
1992
1981
|
|
|
@@ -2004,10 +1993,10 @@ static bool ggml_hexagon_supported_activations(const struct ggml_hexagon_session
|
|
|
2004
1993
|
const struct ggml_tensor * src1 = op->src[1];
|
|
2005
1994
|
const struct ggml_tensor * dst = op;
|
|
2006
1995
|
|
|
2007
|
-
if (
|
|
1996
|
+
if (src0->type != GGML_TYPE_F32) {
|
|
2008
1997
|
return false;
|
|
2009
1998
|
}
|
|
2010
|
-
if (
|
|
1999
|
+
if (dst->type != GGML_TYPE_F32) {
|
|
2011
2000
|
return false;
|
|
2012
2001
|
}
|
|
2013
2002
|
|
|
@@ -2016,10 +2005,10 @@ static bool ggml_hexagon_supported_activations(const struct ggml_hexagon_session
|
|
|
2016
2005
|
}
|
|
2017
2006
|
|
|
2018
2007
|
if (src1) {
|
|
2019
|
-
if (
|
|
2008
|
+
if (src1->type != GGML_TYPE_F32) {
|
|
2020
2009
|
return false;
|
|
2021
2010
|
}
|
|
2022
|
-
if (!
|
|
2011
|
+
if (!ggml_are_same_shape(src0, src1)) {
|
|
2023
2012
|
return false;
|
|
2024
2013
|
}
|
|
2025
2014
|
if (!ggml_is_contiguous(src1)) {
|
|
@@ -2040,15 +2029,15 @@ static bool ggml_hexagon_supported_softmax(const struct ggml_hexagon_session * s
|
|
|
2040
2029
|
return false; // FIXME: add support for sinks
|
|
2041
2030
|
}
|
|
2042
2031
|
|
|
2043
|
-
if (
|
|
2032
|
+
if (src0->type != GGML_TYPE_F32) {
|
|
2044
2033
|
return false;
|
|
2045
2034
|
}
|
|
2046
|
-
if (
|
|
2035
|
+
if (dst->type != GGML_TYPE_F32) {
|
|
2047
2036
|
return false;
|
|
2048
2037
|
}
|
|
2049
2038
|
|
|
2050
2039
|
if (src1) {
|
|
2051
|
-
if (
|
|
2040
|
+
if (src1->type != GGML_TYPE_F32 && src1->type != GGML_TYPE_F16) {
|
|
2052
2041
|
return false;
|
|
2053
2042
|
}
|
|
2054
2043
|
if (src0->ne[0] != src1->ne[0]) {
|
|
@@ -2118,6 +2107,26 @@ static bool ggml_hexagon_supported_get_rows(const struct ggml_hexagon_session *
|
|
|
2118
2107
|
return true;
|
|
2119
2108
|
}
|
|
2120
2109
|
|
|
2110
|
+
static bool ggml_hexagon_supported_argsort(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
|
|
2111
|
+
const struct ggml_tensor * src0 = op->src[0]; // values
|
|
2112
|
+
const struct ggml_tensor * dst = op; // indices
|
|
2113
|
+
|
|
2114
|
+
if (src0->type != GGML_TYPE_F32) {
|
|
2115
|
+
return false;
|
|
2116
|
+
}
|
|
2117
|
+
|
|
2118
|
+
if (dst->type != GGML_TYPE_I32) {
|
|
2119
|
+
return false;
|
|
2120
|
+
}
|
|
2121
|
+
|
|
2122
|
+
if (src0->ne[0] > (16*1024)) {
|
|
2123
|
+
// reject tensors with huge rows for now
|
|
2124
|
+
return false;
|
|
2125
|
+
}
|
|
2126
|
+
|
|
2127
|
+
return true;
|
|
2128
|
+
}
|
|
2129
|
+
|
|
2121
2130
|
static bool ggml_hexagon_supported_rope(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
|
|
2122
2131
|
const int32_t * op_params = &op->op_params[0];
|
|
2123
2132
|
|
|
@@ -2135,17 +2144,17 @@ static bool ggml_hexagon_supported_rope(const struct ggml_hexagon_session * sess
|
|
|
2135
2144
|
const struct ggml_tensor * src2 = op->src[2];
|
|
2136
2145
|
const struct ggml_tensor * dst = op;
|
|
2137
2146
|
|
|
2138
|
-
if (
|
|
2147
|
+
if (src0->type != GGML_TYPE_F32) {
|
|
2139
2148
|
return false; // FIXME: add support for GGML_TYPE_F16 for src0
|
|
2140
2149
|
}
|
|
2141
|
-
if (
|
|
2150
|
+
if (dst->type != GGML_TYPE_F32) {
|
|
2142
2151
|
return false;
|
|
2143
2152
|
}
|
|
2144
|
-
if (
|
|
2153
|
+
if (src1->type != GGML_TYPE_I32) {
|
|
2145
2154
|
return false;
|
|
2146
2155
|
}
|
|
2147
2156
|
if (src2) {
|
|
2148
|
-
if (
|
|
2157
|
+
if (src2->type != GGML_TYPE_F32) {
|
|
2149
2158
|
return false;
|
|
2150
2159
|
}
|
|
2151
2160
|
int n_dims = op_params[1];
|
|
@@ -2168,6 +2177,44 @@ static bool ggml_hexagon_supported_rope(const struct ggml_hexagon_session * sess
|
|
|
2168
2177
|
return true;
|
|
2169
2178
|
}
|
|
2170
2179
|
|
|
2180
|
+
static bool ggml_hexagon_supported_ssm_conv(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
|
|
2181
|
+
const struct ggml_tensor * src0 = op->src[0];
|
|
2182
|
+
const struct ggml_tensor * src1 = op->src[1];
|
|
2183
|
+
const struct ggml_tensor * dst = op;
|
|
2184
|
+
|
|
2185
|
+
// Only support FP32 for now
|
|
2186
|
+
if (src0->type != GGML_TYPE_F32 || src1->type != GGML_TYPE_F32 || dst->type != GGML_TYPE_F32) {
|
|
2187
|
+
return false;
|
|
2188
|
+
}
|
|
2189
|
+
|
|
2190
|
+
// Check IO tensor shapes and dims
|
|
2191
|
+
if (src0->ne[3] != 1 || src1->ne[2] != 1 || src1->ne[3] != 1 || dst->ne[3] != 1) {
|
|
2192
|
+
return false; // src0 should be effectively 3D
|
|
2193
|
+
}
|
|
2194
|
+
|
|
2195
|
+
const int d_conv = src1->ne[0];
|
|
2196
|
+
const int d_inner = src0->ne[1];
|
|
2197
|
+
const int n_t = dst->ne[1];
|
|
2198
|
+
const int n_s = dst->ne[2];
|
|
2199
|
+
|
|
2200
|
+
if (src0->ne[0] != d_conv - 1 + n_t || src0->ne[1] != d_inner || src0->ne[2] != n_s) {
|
|
2201
|
+
return false;
|
|
2202
|
+
}
|
|
2203
|
+
if (src1->ne[0] != d_conv || src1->ne[1] != d_inner) {
|
|
2204
|
+
return false;
|
|
2205
|
+
}
|
|
2206
|
+
if (dst->ne[0] != d_inner || dst->ne[1] != n_t || dst->ne[2] != n_s) {
|
|
2207
|
+
return false;
|
|
2208
|
+
}
|
|
2209
|
+
|
|
2210
|
+
// TODO: add support for non-contiguous tensors
|
|
2211
|
+
if (!ggml_is_contiguous(src0) || !ggml_is_contiguous(src1) || !ggml_is_contiguous(dst)) {
|
|
2212
|
+
return false;
|
|
2213
|
+
}
|
|
2214
|
+
|
|
2215
|
+
return true;
|
|
2216
|
+
}
|
|
2217
|
+
|
|
2171
2218
|
enum dspqbuf_type {
|
|
2172
2219
|
DSPQBUF_TYPE_DSP_WRITE_CPU_READ = 0,
|
|
2173
2220
|
DSPQBUF_TYPE_CPU_WRITE_DSP_READ,
|
|
@@ -2285,6 +2332,9 @@ static inline size_t init_binary_req(htp_general_req * req, dspqueue_buffer * bu
|
|
|
2285
2332
|
case GGML_OP_SUB:
|
|
2286
2333
|
req->op = HTP_OP_SUB;
|
|
2287
2334
|
break;
|
|
2335
|
+
case GGML_OP_DIV:
|
|
2336
|
+
req->op = HTP_OP_DIV;
|
|
2337
|
+
break;
|
|
2288
2338
|
default:
|
|
2289
2339
|
GGML_ABORT("ggml-hex: binary : unsupported op: %d\n", t->op);
|
|
2290
2340
|
break;
|
|
@@ -2302,6 +2352,16 @@ static inline size_t init_binary_req(htp_general_req * req, dspqueue_buffer * bu
|
|
|
2302
2352
|
return n_bufs;
|
|
2303
2353
|
}
|
|
2304
2354
|
|
|
2355
|
+
static inline size_t init_cpy_req(htp_general_req * req, dspqueue_buffer * bufs, const ggml_tensor * t) {
|
|
2356
|
+
req->op = HTP_OP_CPY;
|
|
2357
|
+
|
|
2358
|
+
size_t n_bufs = 0;
|
|
2359
|
+
n_bufs += htp_req_buff_init(&req->src0, &bufs[n_bufs], t->src[0], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
|
|
2360
|
+
n_bufs += htp_req_buff_init(&req->dst, &bufs[n_bufs], t, DSPQBUF_TYPE_DSP_WRITE_CPU_READ);
|
|
2361
|
+
|
|
2362
|
+
return n_bufs;
|
|
2363
|
+
}
|
|
2364
|
+
|
|
2305
2365
|
static inline size_t init_get_rows_req(htp_general_req * req, dspqueue_buffer * bufs, const ggml_tensor * t) {
|
|
2306
2366
|
req->op = HTP_OP_GET_ROWS;
|
|
2307
2367
|
|
|
@@ -2313,6 +2373,17 @@ static inline size_t init_get_rows_req(htp_general_req * req, dspqueue_buffer *
|
|
|
2313
2373
|
return n_bufs;
|
|
2314
2374
|
}
|
|
2315
2375
|
|
|
2376
|
+
static inline size_t init_argsort_req(htp_general_req * req, dspqueue_buffer * bufs, const ggml_tensor * t) {
|
|
2377
|
+
req->op = HTP_OP_ARGSORT;
|
|
2378
|
+
memcpy(&req->op_params, &t->op_params, sizeof(t->op_params));
|
|
2379
|
+
|
|
2380
|
+
size_t n_bufs = 0;
|
|
2381
|
+
n_bufs += htp_req_buff_init(&req->src0, &bufs[n_bufs], t->src[0], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
|
|
2382
|
+
n_bufs += htp_req_buff_init(&req->dst, &bufs[n_bufs], t, DSPQBUF_TYPE_DSP_WRITE_CPU_READ);
|
|
2383
|
+
|
|
2384
|
+
return n_bufs;
|
|
2385
|
+
}
|
|
2386
|
+
|
|
2316
2387
|
template <bool _is_src0_constant>
|
|
2317
2388
|
static inline size_t init_binary_id_req(htp_general_req * req, dspqueue_buffer * bufs, const ggml_tensor * t) {
|
|
2318
2389
|
switch (t->op) {
|
|
@@ -2367,6 +2438,16 @@ static inline size_t init_unary_req(htp_general_req * req, dspqueue_buffer * buf
|
|
|
2367
2438
|
supported = true;
|
|
2368
2439
|
break;
|
|
2369
2440
|
|
|
2441
|
+
case GGML_OP_SQR:
|
|
2442
|
+
req->op = HTP_OP_SQR;
|
|
2443
|
+
supported = true;
|
|
2444
|
+
break;
|
|
2445
|
+
|
|
2446
|
+
case GGML_OP_SQRT:
|
|
2447
|
+
req->op = HTP_OP_SQRT;
|
|
2448
|
+
supported = true;
|
|
2449
|
+
break;
|
|
2450
|
+
|
|
2370
2451
|
case GGML_OP_UNARY:
|
|
2371
2452
|
if (ggml_get_unary_op(t) == GGML_UNARY_OP_SILU) {
|
|
2372
2453
|
req->op = HTP_OP_UNARY_SILU;
|
|
@@ -2384,6 +2465,9 @@ static inline size_t init_unary_req(htp_general_req * req, dspqueue_buffer * buf
|
|
|
2384
2465
|
} else if (ggml_get_glu_op(t) == GGML_GLU_OP_SWIGLU_OAI) {
|
|
2385
2466
|
req->op = HTP_OP_GLU_SWIGLU_OAI;
|
|
2386
2467
|
supported = true;
|
|
2468
|
+
} else if (ggml_get_glu_op(t) == GGML_GLU_OP_GEGLU) {
|
|
2469
|
+
req->op = HTP_OP_GLU_GEGLU;
|
|
2470
|
+
supported = true;
|
|
2387
2471
|
}
|
|
2388
2472
|
break;
|
|
2389
2473
|
|
|
@@ -2408,6 +2492,17 @@ static inline size_t init_unary_req(htp_general_req * req, dspqueue_buffer * buf
|
|
|
2408
2492
|
return n_bufs;
|
|
2409
2493
|
}
|
|
2410
2494
|
|
|
2495
|
+
static inline size_t init_sum_rows_req(htp_general_req * req, dspqueue_buffer * bufs, const ggml_tensor * t) {
|
|
2496
|
+
memcpy(&req->op_params, &t->op_params, sizeof(t->op_params));
|
|
2497
|
+
req->op = HTP_OP_SUM_ROWS;
|
|
2498
|
+
|
|
2499
|
+
size_t n_bufs = 0;
|
|
2500
|
+
n_bufs += htp_req_buff_init(&req->src0, &bufs[n_bufs], t->src[0], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
|
|
2501
|
+
n_bufs += htp_req_buff_init(&req->dst, &bufs[n_bufs], t, DSPQBUF_TYPE_DSP_WRITE_CPU_READ);
|
|
2502
|
+
|
|
2503
|
+
return n_bufs;
|
|
2504
|
+
}
|
|
2505
|
+
|
|
2411
2506
|
static inline size_t init_rope_req(htp_general_req * req, dspqueue_buffer * bufs, const ggml_tensor * t) {
|
|
2412
2507
|
memcpy(&req->op_params, &t->op_params, sizeof(t->op_params));
|
|
2413
2508
|
req->op = HTP_OP_ROPE;
|
|
@@ -2436,6 +2531,17 @@ static inline size_t init_flash_attn_ext_req(htp_general_req * req, dspqueue_buf
|
|
|
2436
2531
|
return n_bufs;
|
|
2437
2532
|
}
|
|
2438
2533
|
|
|
2534
|
+
static inline size_t init_ssm_conv_req(htp_general_req * req, dspqueue_buffer * bufs, const ggml_tensor * t) {
|
|
2535
|
+
req->op = HTP_OP_SSM_CONV;
|
|
2536
|
+
|
|
2537
|
+
size_t n_bufs = 0;
|
|
2538
|
+
n_bufs += htp_req_buff_init(&req->src0, &bufs[n_bufs], t->src[0], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
|
|
2539
|
+
n_bufs += htp_req_buff_init(&req->src1, &bufs[n_bufs], t->src[1], DSPQBUF_TYPE_CONSTANT);
|
|
2540
|
+
n_bufs += htp_req_buff_init(&req->dst, &bufs[n_bufs], t, DSPQBUF_TYPE_DSP_WRITE_CPU_READ);
|
|
2541
|
+
|
|
2542
|
+
return n_bufs;
|
|
2543
|
+
}
|
|
2544
|
+
|
|
2439
2545
|
static const char * ggml_backend_hexagon_name(ggml_backend_t backend) {
|
|
2440
2546
|
auto sess = static_cast<ggml_hexagon_session *>(backend->context);
|
|
2441
2547
|
return sess->name.c_str();
|
|
@@ -2448,12 +2554,12 @@ static void ggml_backend_hexagon_free(ggml_backend_t backend) {
|
|
|
2448
2554
|
}
|
|
2449
2555
|
|
|
2450
2556
|
static inline bool op_reuse_src1(const ggml_tensor * op1, const ggml_tensor * op0) {
|
|
2451
|
-
return (op0 && op0->src[1] == op1->src[1] && ggml_is_quantized(op0->src[0]->type)
|
|
2557
|
+
return (op0 && op0->src[1] == op1->src[1] && ggml_is_quantized(op0->src[0]->type));
|
|
2452
2558
|
}
|
|
2453
2559
|
|
|
2454
2560
|
static inline bool is_compute_op(ggml_tensor *node)
|
|
2455
2561
|
{
|
|
2456
|
-
return !
|
|
2562
|
+
return !ggml_op_is_empty(node->op) && !ggml_is_empty(node) && (node->flags & GGML_TENSOR_FLAG_COMPUTE);
|
|
2457
2563
|
}
|
|
2458
2564
|
|
|
2459
2565
|
// scan the graph and figure out last compute op index
|
|
@@ -2475,7 +2581,7 @@ static ggml_status ggml_backend_hexagon_graph_compute(ggml_backend_t backend, gg
|
|
|
2475
2581
|
|
|
2476
2582
|
const int last = last_compute_op(graph);
|
|
2477
2583
|
|
|
2478
|
-
const struct ggml_tensor *
|
|
2584
|
+
const struct ggml_tensor * prev_op = nullptr; // prev executed op
|
|
2479
2585
|
|
|
2480
2586
|
for (int i = 0; i < graph->n_nodes; ++i) {
|
|
2481
2587
|
ggml_tensor * node = graph->nodes[i];
|
|
@@ -2487,10 +2593,12 @@ static ggml_status ggml_backend_hexagon_graph_compute(ggml_backend_t backend, gg
|
|
|
2487
2593
|
uint32_t flags = 0;
|
|
2488
2594
|
|
|
2489
2595
|
// skip quantizer if src1 is reused
|
|
2490
|
-
if (op_reuse_src1(node,
|
|
2596
|
+
if (op_reuse_src1(node, prev_op)) {
|
|
2491
2597
|
flags |= HTP_OPFLAGS_SKIP_QUANTIZE;
|
|
2492
2598
|
}
|
|
2493
2599
|
|
|
2600
|
+
prev_op = node;
|
|
2601
|
+
|
|
2494
2602
|
// ask for early notification for the last Op
|
|
2495
2603
|
if (i == last) {
|
|
2496
2604
|
flags |= HTP_OPFLAGS_EARLY_WAKEUP;
|
|
@@ -2503,7 +2611,6 @@ static ggml_status ggml_backend_hexagon_graph_compute(ggml_backend_t backend, gg
|
|
|
2503
2611
|
} else {
|
|
2504
2612
|
ggml_hexagon_dispatch_op<init_binary_req<false>>(sess, node, flags);
|
|
2505
2613
|
}
|
|
2506
|
-
prev_quant_op = node;
|
|
2507
2614
|
break;
|
|
2508
2615
|
case GGML_OP_MUL_MAT_ID:
|
|
2509
2616
|
if (ggml_is_quantized(node->src[0]->type)) {
|
|
@@ -2511,11 +2618,11 @@ static ggml_status ggml_backend_hexagon_graph_compute(ggml_backend_t backend, gg
|
|
|
2511
2618
|
} else {
|
|
2512
2619
|
ggml_hexagon_dispatch_op<init_binary_id_req<false>>(sess, node, flags);
|
|
2513
2620
|
}
|
|
2514
|
-
prev_quant_op = node;
|
|
2515
2621
|
break;
|
|
2516
2622
|
case GGML_OP_MUL:
|
|
2517
2623
|
case GGML_OP_ADD:
|
|
2518
2624
|
case GGML_OP_SUB:
|
|
2625
|
+
case GGML_OP_DIV:
|
|
2519
2626
|
ggml_hexagon_dispatch_op<init_binary_req<false>>(sess, node, flags);
|
|
2520
2627
|
break;
|
|
2521
2628
|
case GGML_OP_ADD_ID:
|
|
@@ -2525,6 +2632,13 @@ static ggml_status ggml_backend_hexagon_graph_compute(ggml_backend_t backend, gg
|
|
|
2525
2632
|
case GGML_OP_SCALE:
|
|
2526
2633
|
ggml_hexagon_dispatch_op<init_unary_req>(sess, node, flags);
|
|
2527
2634
|
break;
|
|
2635
|
+
case GGML_OP_SQR:
|
|
2636
|
+
case GGML_OP_SQRT:
|
|
2637
|
+
ggml_hexagon_dispatch_op<init_unary_req>(sess, node, flags);
|
|
2638
|
+
break;
|
|
2639
|
+
case GGML_OP_SUM_ROWS:
|
|
2640
|
+
ggml_hexagon_dispatch_op<init_sum_rows_req>(sess, node, flags);
|
|
2641
|
+
break;
|
|
2528
2642
|
case GGML_OP_UNARY:
|
|
2529
2643
|
if ((ggml_get_unary_op(node) == GGML_UNARY_OP_SILU) ||
|
|
2530
2644
|
(ggml_get_unary_op(node) == GGML_UNARY_OP_GELU)) {
|
|
@@ -2533,7 +2647,8 @@ static ggml_status ggml_backend_hexagon_graph_compute(ggml_backend_t backend, gg
|
|
|
2533
2647
|
break;
|
|
2534
2648
|
case GGML_OP_GLU:
|
|
2535
2649
|
if ((ggml_get_glu_op(node) == GGML_GLU_OP_SWIGLU) ||
|
|
2536
|
-
(ggml_get_glu_op(node) == GGML_GLU_OP_SWIGLU_OAI)
|
|
2650
|
+
(ggml_get_glu_op(node) == GGML_GLU_OP_SWIGLU_OAI) ||
|
|
2651
|
+
(ggml_get_glu_op(node) == GGML_GLU_OP_GEGLU)) {
|
|
2537
2652
|
ggml_hexagon_dispatch_op<init_unary_req>(sess, node, flags);
|
|
2538
2653
|
}
|
|
2539
2654
|
break;
|
|
@@ -2557,6 +2672,18 @@ static ggml_status ggml_backend_hexagon_graph_compute(ggml_backend_t backend, gg
|
|
|
2557
2672
|
ggml_hexagon_dispatch_op<init_get_rows_req>(sess, node, flags);
|
|
2558
2673
|
break;
|
|
2559
2674
|
|
|
2675
|
+
case GGML_OP_CPY:
|
|
2676
|
+
ggml_hexagon_dispatch_op<init_cpy_req>(sess, node, flags);
|
|
2677
|
+
break;
|
|
2678
|
+
|
|
2679
|
+
case GGML_OP_ARGSORT:
|
|
2680
|
+
ggml_hexagon_dispatch_op<init_argsort_req>(sess, node, flags);
|
|
2681
|
+
break;
|
|
2682
|
+
|
|
2683
|
+
case GGML_OP_SSM_CONV:
|
|
2684
|
+
ggml_hexagon_dispatch_op<init_ssm_conv_req>(sess, node, flags);
|
|
2685
|
+
break;
|
|
2686
|
+
|
|
2560
2687
|
default:
|
|
2561
2688
|
GGML_ABORT("\nggml-hex: graph-compute %s is not supported\n", ggml_op_desc(node));
|
|
2562
2689
|
}
|
|
@@ -2632,7 +2759,7 @@ static std::vector<int> ggml_hexagon_graph_optimize_reorder(const std::vector<no
|
|
|
2632
2759
|
// The main goal here is to stack the MUL_MAT ops with the same src1 input.
|
|
2633
2760
|
// This allows use to reuse dynamically quantized src1 in VTCM.
|
|
2634
2761
|
|
|
2635
|
-
// TODO: the current version might do incorrect
|
|
2762
|
+
// TODO: the current version might do incorrect reordering in cases where quantized src0
|
|
2636
2763
|
// input is an output of another Op.
|
|
2637
2764
|
|
|
2638
2765
|
for (int i0 = 0; i0 < n; i0++) {
|
|
@@ -2649,7 +2776,7 @@ static std::vector<int> ggml_hexagon_graph_optimize_reorder(const std::vector<no
|
|
|
2649
2776
|
}
|
|
2650
2777
|
|
|
2651
2778
|
// that many nodes forward to search for stackable nodes that can reuse VTCM
|
|
2652
|
-
constexpr int N_FORWARD =
|
|
2779
|
+
constexpr int N_FORWARD = 16;
|
|
2653
2780
|
|
|
2654
2781
|
for (int i1 = i0 + 1; i1 < i0 + N_FORWARD && i1 < n; i1++) {
|
|
2655
2782
|
if (used[i1]) {
|
|
@@ -2858,6 +2985,27 @@ static bool ggml_hexagon_supported_buffers(ggml_hexagon_session *sess, const str
|
|
|
2858
2985
|
return true;
|
|
2859
2986
|
}
|
|
2860
2987
|
|
|
2988
|
+
static bool ggml_hexagon_supported_cpy(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
|
|
2989
|
+
const struct ggml_tensor * src0 = op->src[0];
|
|
2990
|
+
const struct ggml_tensor * dst = op;
|
|
2991
|
+
|
|
2992
|
+
// for now we can do f32 -> f16 and f16 -> f32 (without reshaping)
|
|
2993
|
+
if (src0->type != GGML_TYPE_F32 && src0->type != GGML_TYPE_F16) return false;
|
|
2994
|
+
if ( dst->type != GGML_TYPE_F32 && dst->type != GGML_TYPE_F16) return false;
|
|
2995
|
+
|
|
2996
|
+
const bool sametype = (src0->type == dst->type);
|
|
2997
|
+
const bool transposed = ggml_is_transposed(src0) || ggml_is_transposed(dst);
|
|
2998
|
+
const bool sameshape = !transposed && ggml_are_same_shape(src0, dst);
|
|
2999
|
+
|
|
3000
|
+
// can handle any shape and any same-type (pretty slow if reshaping is required)
|
|
3001
|
+
if (sametype) return true;
|
|
3002
|
+
|
|
3003
|
+
// cannot handle re-shaping and type conversion at the same time
|
|
3004
|
+
if (!sameshape) return false;
|
|
3005
|
+
|
|
3006
|
+
return true;
|
|
3007
|
+
}
|
|
3008
|
+
|
|
2861
3009
|
static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
|
|
2862
3010
|
auto sess = static_cast<ggml_hexagon_session *>(dev->context);
|
|
2863
3011
|
|
|
@@ -2888,6 +3036,7 @@ static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, cons
|
|
|
2888
3036
|
case GGML_OP_MUL:
|
|
2889
3037
|
case GGML_OP_ADD:
|
|
2890
3038
|
case GGML_OP_SUB:
|
|
3039
|
+
case GGML_OP_DIV:
|
|
2891
3040
|
supp = ggml_hexagon_supported_binary(sess, op);
|
|
2892
3041
|
break;
|
|
2893
3042
|
|
|
@@ -2900,6 +3049,15 @@ static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, cons
|
|
|
2900
3049
|
supp = ggml_hexagon_supported_unary(sess, op);
|
|
2901
3050
|
break;
|
|
2902
3051
|
|
|
3052
|
+
case GGML_OP_SQR:
|
|
3053
|
+
case GGML_OP_SQRT:
|
|
3054
|
+
supp = ggml_hexagon_supported_unary(sess, op);
|
|
3055
|
+
break;
|
|
3056
|
+
|
|
3057
|
+
case GGML_OP_SUM_ROWS:
|
|
3058
|
+
supp = ggml_hexagon_supported_sum_rows(sess, op);
|
|
3059
|
+
break;
|
|
3060
|
+
|
|
2903
3061
|
case GGML_OP_SOFT_MAX:
|
|
2904
3062
|
supp = ggml_hexagon_supported_softmax(sess, op);
|
|
2905
3063
|
break;
|
|
@@ -2915,7 +3073,7 @@ static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, cons
|
|
|
2915
3073
|
case GGML_OP_GLU:
|
|
2916
3074
|
{
|
|
2917
3075
|
const auto glu_op = ggml_get_glu_op(op);
|
|
2918
|
-
if ((glu_op == GGML_GLU_OP_SWIGLU) || (glu_op == GGML_GLU_OP_SWIGLU_OAI)) {
|
|
3076
|
+
if ((glu_op == GGML_GLU_OP_SWIGLU) || (glu_op == GGML_GLU_OP_SWIGLU_OAI) || (glu_op == GGML_GLU_OP_GEGLU)) {
|
|
2919
3077
|
supp = ggml_hexagon_supported_activations(sess, op);
|
|
2920
3078
|
}
|
|
2921
3079
|
break;
|
|
@@ -2936,6 +3094,18 @@ static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, cons
|
|
|
2936
3094
|
supp = ggml_hexagon_supported_get_rows(sess, op);
|
|
2937
3095
|
break;
|
|
2938
3096
|
|
|
3097
|
+
case GGML_OP_CPY:
|
|
3098
|
+
supp = ggml_hexagon_supported_cpy(sess, op);
|
|
3099
|
+
break;
|
|
3100
|
+
|
|
3101
|
+
case GGML_OP_ARGSORT:
|
|
3102
|
+
supp = ggml_hexagon_supported_argsort(sess, op);
|
|
3103
|
+
break;
|
|
3104
|
+
|
|
3105
|
+
case GGML_OP_SSM_CONV:
|
|
3106
|
+
supp = ggml_hexagon_supported_ssm_conv(sess, op);
|
|
3107
|
+
break;
|
|
3108
|
+
|
|
2939
3109
|
default:
|
|
2940
3110
|
break;
|
|
2941
3111
|
}
|
|
@@ -3010,10 +3180,12 @@ ggml_hexagon_registry::ggml_hexagon_registry(ggml_backend_reg_t reg) {
|
|
|
3010
3180
|
}
|
|
3011
3181
|
}
|
|
3012
3182
|
|
|
3183
|
+
#if defined(__ANDROID__)
|
|
3013
3184
|
if (opt_arch < 75) {
|
|
3014
3185
|
opt_ndev = 1;
|
|
3015
3186
|
GGML_LOG_WARN("ggml-hex: forcing ndev to 1 for SoCs archs lower than v75.\n");
|
|
3016
3187
|
}
|
|
3188
|
+
#endif
|
|
3017
3189
|
|
|
3018
3190
|
GGML_LOG_INFO("ggml-hex: Hexagon Arch version v%d\n", opt_arch);
|
|
3019
3191
|
|
|
@@ -3061,7 +3233,7 @@ static ggml_backend_dev_t ggml_backend_hexagon_reg_get_device(ggml_backend_reg_t
|
|
|
3061
3233
|
}
|
|
3062
3234
|
|
|
3063
3235
|
static void * ggml_backend_hexagon_get_proc_address(ggml_backend_reg_t reg, const char * name) {
|
|
3064
|
-
if (strcmp(name, "ggml_backend_dev_get_extra_bufts") == 0) {
|
|
3236
|
+
if (strcmp(name, "ggml_backend_dev_get_extra_bufts") == 0 && opt_hostbuf) {
|
|
3065
3237
|
ggml_backend_dev_get_extra_bufts_t fct = ggml_backend_hexagon_device_get_extra_buffers_type;
|
|
3066
3238
|
return (void *) fct;
|
|
3067
3239
|
}
|
|
@@ -3078,34 +3250,31 @@ static void ggml_hexagon_init(ggml_backend_reg * reg) {
|
|
|
3078
3250
|
static_assert((unsigned int) HTP_TYPE_MXFP4 == (unsigned int) GGML_TYPE_MXFP4,
|
|
3079
3251
|
"please update hexagon_type to match ggml_type");
|
|
3080
3252
|
|
|
3253
|
+
const char * str_experimental = getenv("GGML_HEXAGON_EXPERIMENTAL");
|
|
3081
3254
|
const char * str_verbose = getenv("GGML_HEXAGON_VERBOSE");
|
|
3082
3255
|
const char * str_hostbuf = getenv("GGML_HEXAGON_HOSTBUF");
|
|
3083
|
-
|
|
3256
|
+
const char * str_opmask = getenv("GGML_HEXAGON_OPMASK");
|
|
3257
|
+
const char * str_opsync = getenv("GGML_HEXAGON_OPSYNC");
|
|
3258
|
+
const char * str_profile = getenv("GGML_HEXAGON_PROFILE");
|
|
3259
|
+
const char * str_etm = getenv("GGML_HEXAGON_ETM");
|
|
3260
|
+
const char * str_nhvx = getenv("GGML_HEXAGON_NHVX");
|
|
3261
|
+
const char * str_ndev = getenv("GGML_HEXAGON_NDEV");
|
|
3262
|
+
const char * str_arch = getenv("GGML_HEXAGON_ARCH");
|
|
3263
|
+
|
|
3264
|
+
opt_experimental = str_experimental ? atoi(str_experimental) : 0;
|
|
3084
3265
|
opt_verbose = str_verbose ? atoi(str_verbose) : 0;
|
|
3085
|
-
|
|
3086
|
-
|
|
3087
|
-
|
|
3266
|
+
opt_hostbuf = str_hostbuf ? atoi(str_hostbuf) : opt_hostbuf;
|
|
3267
|
+
opt_opmask = str_opmask ? strtoul(str_opmask, NULL, 0) : opt_opmask;
|
|
3268
|
+
opt_opsync = str_opsync ? atoi(str_opsync) : 0;
|
|
3269
|
+
opt_profile = str_profile ? atoi(str_profile) : 0;
|
|
3270
|
+
opt_etm = str_etm ? atoi(str_etm) : 0;
|
|
3271
|
+
opt_nhvx = str_nhvx ? strtoul(str_nhvx, NULL, 0) : opt_nhvx;
|
|
3272
|
+
opt_ndev = str_ndev ? strtoul(str_ndev, NULL, 0) : opt_ndev;
|
|
3088
3273
|
|
|
3089
|
-
|
|
3090
|
-
|
|
3091
|
-
opt_opmask = strtoul(str_opmask, NULL, 0);
|
|
3274
|
+
if (opt_ndev > GGML_HEXAGON_MAX_SESSIONS) {
|
|
3275
|
+
opt_ndev = GGML_HEXAGON_MAX_SESSIONS;
|
|
3092
3276
|
}
|
|
3093
|
-
opt_opsync = getenv("GGML_HEXAGON_OPSYNC") != nullptr;
|
|
3094
3277
|
|
|
3095
|
-
const char * str_ndev = getenv("GGML_HEXAGON_NDEV");
|
|
3096
|
-
if (str_ndev) {
|
|
3097
|
-
opt_ndev = strtoul(str_ndev, NULL, 0);
|
|
3098
|
-
if (opt_ndev > GGML_HEXAGON_MAX_SESSIONS) {
|
|
3099
|
-
opt_ndev = GGML_HEXAGON_MAX_SESSIONS;
|
|
3100
|
-
}
|
|
3101
|
-
}
|
|
3102
|
-
|
|
3103
|
-
const char * str_nhvx = getenv("GGML_HEXAGON_NHVX");
|
|
3104
|
-
if (str_nhvx) {
|
|
3105
|
-
opt_nhvx = strtoul(str_nhvx, NULL, 0);
|
|
3106
|
-
}
|
|
3107
|
-
|
|
3108
|
-
const char * str_arch = getenv("GGML_HEXAGON_ARCH");
|
|
3109
3278
|
if (str_arch) {
|
|
3110
3279
|
if (str_arch[0] == 'v') {
|
|
3111
3280
|
str_arch++;
|
|
@@ -3139,6 +3308,11 @@ ggml_backend_reg_t ggml_backend_hexagon_reg(void) {
|
|
|
3139
3308
|
static std::mutex mutex;
|
|
3140
3309
|
std::lock_guard<std::mutex> lock(mutex);
|
|
3141
3310
|
if (!initialized) {
|
|
3311
|
+
auto nErr = htpdrv_init();
|
|
3312
|
+
if (nErr != AEE_SUCCESS) {
|
|
3313
|
+
return NULL;
|
|
3314
|
+
}
|
|
3315
|
+
|
|
3142
3316
|
ggml_hexagon_init(®);
|
|
3143
3317
|
}
|
|
3144
3318
|
|