whispercpp 1.3.5 → 1.3.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/LICENSE +1 -1
- data/README.md +99 -2
- data/ext/extconf.rb +1 -0
- data/ext/ruby_whisper.c +20 -4
- data/ext/ruby_whisper.h +30 -2
- data/ext/ruby_whisper_context.c +216 -124
- data/ext/ruby_whisper_context_params.c +163 -0
- data/ext/ruby_whisper_model.c +0 -1
- data/ext/ruby_whisper_params.c +0 -1
- data/ext/ruby_whisper_segment.c +0 -1
- data/ext/ruby_whisper_token.c +29 -9
- data/ext/ruby_whisper_transcribe.cpp +4 -1
- data/ext/ruby_whisper_vad_context.c +48 -1
- data/ext/ruby_whisper_vad_context_detect.cpp +6 -5
- data/ext/ruby_whisper_vad_params.c +0 -1
- data/ext/ruby_whisper_vad_segment.c +0 -1
- data/ext/ruby_whisper_vad_segments.c +0 -1
- data/ext/sources/CMakeLists.txt +1 -1
- data/ext/sources/bindings/javascript/package.json +1 -1
- data/ext/sources/cmake/whisper-config.cmake.in +5 -40
- data/ext/sources/examples/bench/bench.cpp +23 -18
- data/ext/sources/examples/cli/cli.cpp +8 -0
- data/ext/sources/examples/common-ggml.cpp +2 -0
- data/ext/sources/examples/miniaudio.h +4507 -2131
- data/ext/sources/examples/server/server.cpp +18 -4
- data/ext/sources/examples/talk-llama/CMakeLists.txt +3 -2
- data/ext/sources/examples/talk-llama/llama-adapter.cpp +7 -13
- data/ext/sources/examples/talk-llama/llama-adapter.h +4 -3
- data/ext/sources/examples/talk-llama/llama-arch.cpp +335 -17
- data/ext/sources/examples/talk-llama/llama-arch.h +42 -0
- data/ext/sources/examples/talk-llama/llama-batch.cpp +3 -1
- data/ext/sources/examples/talk-llama/llama-chat.cpp +21 -1
- data/ext/sources/examples/talk-llama/llama-chat.h +1 -0
- data/ext/sources/examples/talk-llama/llama-context.cpp +508 -520
- data/ext/sources/examples/talk-llama/llama-context.h +27 -28
- data/ext/sources/examples/talk-llama/llama-cparams.h +5 -0
- data/ext/sources/examples/talk-llama/llama-ext.h +12 -0
- data/ext/sources/examples/talk-llama/llama-grammar.cpp +8 -8
- data/ext/sources/examples/talk-llama/llama-graph.cpp +583 -130
- data/ext/sources/examples/talk-llama/llama-graph.h +131 -10
- data/ext/sources/examples/talk-llama/llama-hparams.cpp +57 -40
- data/ext/sources/examples/talk-llama/llama-hparams.h +79 -10
- data/ext/sources/examples/talk-llama/llama-impl.cpp +4 -4
- data/ext/sources/examples/talk-llama/llama-impl.h +13 -1
- data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.cpp +3 -1
- data/ext/sources/examples/talk-llama/llama-kv-cache.cpp +274 -89
- data/ext/sources/examples/talk-llama/llama-kv-cache.h +2 -3
- data/ext/sources/examples/talk-llama/llama-memory-hybrid-iswa.cpp +275 -0
- data/ext/sources/examples/talk-llama/llama-memory-hybrid-iswa.h +140 -0
- data/ext/sources/examples/talk-llama/llama-memory-recurrent.cpp +11 -13
- data/ext/sources/examples/talk-llama/llama-mmap.cpp +28 -11
- data/ext/sources/examples/talk-llama/llama-model-loader.cpp +527 -119
- data/ext/sources/examples/talk-llama/llama-model-loader.h +35 -5
- data/ext/sources/examples/talk-llama/llama-model-saver.cpp +60 -46
- data/ext/sources/examples/talk-llama/llama-model-saver.h +5 -2
- data/ext/sources/examples/talk-llama/llama-model.cpp +1365 -647
- data/ext/sources/examples/talk-llama/llama-model.h +72 -19
- data/ext/sources/examples/talk-llama/llama-quant.cpp +578 -346
- data/ext/sources/examples/talk-llama/{llama-sampling.cpp → llama-sampler.cpp} +190 -76
- data/ext/sources/examples/talk-llama/{llama-sampling.h → llama-sampler.h} +0 -2
- data/ext/sources/examples/talk-llama/llama-vocab.cpp +118 -48
- data/ext/sources/examples/talk-llama/llama-vocab.h +5 -0
- data/ext/sources/examples/talk-llama/llama.cpp +76 -22
- data/ext/sources/examples/talk-llama/llama.h +63 -30
- data/ext/sources/examples/talk-llama/models/afmoe.cpp +2 -3
- data/ext/sources/examples/talk-llama/models/apertus.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/arcee.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/arctic.cpp +4 -5
- data/ext/sources/examples/talk-llama/models/baichuan.cpp +4 -3
- data/ext/sources/examples/talk-llama/models/bailingmoe.cpp +1 -2
- data/ext/sources/examples/talk-llama/models/bailingmoe2.cpp +3 -5
- data/ext/sources/examples/talk-llama/models/bert.cpp +13 -7
- data/ext/sources/examples/talk-llama/models/bitnet.cpp +9 -24
- data/ext/sources/examples/talk-llama/models/bloom.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/chameleon.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/chatglm.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/codeshell.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/cogvlm.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/cohere2-iswa.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/command-r.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/dbrx.cpp +4 -5
- data/ext/sources/examples/talk-llama/models/deci.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/deepseek.cpp +4 -6
- data/ext/sources/examples/talk-llama/models/deepseek2.cpp +24 -21
- data/ext/sources/examples/talk-llama/models/delta-net-base.cpp +445 -0
- data/ext/sources/examples/talk-llama/models/dots1.cpp +4 -6
- data/ext/sources/examples/talk-llama/models/dream.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/ernie4-5-moe.cpp +4 -6
- data/ext/sources/examples/talk-llama/models/ernie4-5.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/eurobert.cpp +97 -0
- data/ext/sources/examples/talk-llama/models/exaone-moe.cpp +145 -0
- data/ext/sources/examples/talk-llama/models/exaone.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/exaone4.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/falcon-h1.cpp +2 -4
- data/ext/sources/examples/talk-llama/models/falcon.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/gemma-embedding.cpp +1 -1
- data/ext/sources/examples/talk-llama/models/gemma.cpp +1 -1
- data/ext/sources/examples/talk-llama/models/gemma2-iswa.cpp +1 -1
- data/ext/sources/examples/talk-llama/models/gemma3.cpp +1 -1
- data/ext/sources/examples/talk-llama/models/gemma3n-iswa.cpp +7 -7
- data/ext/sources/examples/talk-llama/models/glm4-moe.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/glm4.cpp +14 -7
- data/ext/sources/examples/talk-llama/models/gpt2.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/gptneox.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/granite-hybrid.cpp +4 -5
- data/ext/sources/examples/talk-llama/models/granite.cpp +4 -5
- data/ext/sources/examples/talk-llama/models/grok.cpp +4 -4
- data/ext/sources/examples/talk-llama/models/grovemoe.cpp +5 -7
- data/ext/sources/examples/talk-llama/models/hunyuan-dense.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/hunyuan-moe.cpp +4 -5
- data/ext/sources/examples/talk-llama/models/internlm2.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/jais.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/jais2.cpp +123 -0
- data/ext/sources/examples/talk-llama/models/jamba.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/kimi-linear.cpp +381 -0
- data/ext/sources/examples/talk-llama/models/lfm2.cpp +145 -124
- data/ext/sources/examples/talk-llama/models/llada-moe.cpp +4 -4
- data/ext/sources/examples/talk-llama/models/llada.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/llama-iswa.cpp +4 -4
- data/ext/sources/examples/talk-llama/models/llama.cpp +18 -11
- data/ext/sources/examples/talk-llama/models/maincoder.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/{graph-context-mamba.cpp → mamba-base.cpp} +9 -3
- data/ext/sources/examples/talk-llama/models/mamba.cpp +1 -2
- data/ext/sources/examples/talk-llama/models/mimo2-iswa.cpp +11 -5
- data/ext/sources/examples/talk-llama/models/minicpm3.cpp +14 -13
- data/ext/sources/examples/talk-llama/models/minimax-m2.cpp +4 -5
- data/ext/sources/examples/talk-llama/models/mistral3.cpp +4 -4
- data/ext/sources/examples/talk-llama/models/models.h +181 -46
- data/ext/sources/examples/talk-llama/models/modern-bert.cpp +2 -9
- data/ext/sources/examples/talk-llama/models/mpt.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/nemotron-h.cpp +26 -14
- data/ext/sources/examples/talk-llama/models/nemotron.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/neo-bert.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/olmo.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/olmo2.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/olmoe.cpp +4 -4
- data/ext/sources/examples/talk-llama/models/openai-moe-iswa.cpp +1 -1
- data/ext/sources/examples/talk-llama/models/openelm.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/orion.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/paddleocr.cpp +122 -0
- data/ext/sources/examples/talk-llama/models/pangu-embedded.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/phi2.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/phi3.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/plamo.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/plamo2.cpp +9 -5
- data/ext/sources/examples/talk-llama/models/plamo3.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/plm.cpp +15 -14
- data/ext/sources/examples/talk-llama/models/qwen.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/qwen2.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/qwen2moe.cpp +4 -4
- data/ext/sources/examples/talk-llama/models/qwen2vl.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/qwen3.cpp +12 -9
- data/ext/sources/examples/talk-llama/models/qwen35.cpp +381 -0
- data/ext/sources/examples/talk-llama/models/qwen35moe.cpp +422 -0
- data/ext/sources/examples/talk-llama/models/qwen3moe.cpp +15 -8
- data/ext/sources/examples/talk-llama/models/qwen3next.cpp +84 -432
- data/ext/sources/examples/talk-llama/models/qwen3vl-moe.cpp +9 -18
- data/ext/sources/examples/talk-llama/models/qwen3vl.cpp +8 -17
- data/ext/sources/examples/talk-llama/models/refact.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/rnd1.cpp +4 -4
- data/ext/sources/examples/talk-llama/models/rwkv6-base.cpp +2 -0
- data/ext/sources/examples/talk-llama/models/rwkv7-base.cpp +2 -0
- data/ext/sources/examples/talk-llama/models/seed-oss.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/smallthinker.cpp +4 -4
- data/ext/sources/examples/talk-llama/models/smollm3.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/stablelm.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/starcoder.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/starcoder2.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/step35-iswa.cpp +165 -0
- data/ext/sources/examples/talk-llama/models/t5-dec.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/t5-enc.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/xverse.cpp +3 -3
- data/ext/sources/examples/talk-llama/unicode.cpp +21 -65
- data/ext/sources/ggml/CMakeLists.txt +9 -3
- data/ext/sources/ggml/include/ggml-backend.h +1 -1
- data/ext/sources/ggml/include/ggml-cann.h +1 -1
- data/ext/sources/ggml/include/ggml-cpu.h +5 -0
- data/ext/sources/ggml/include/ggml-openvino.h +37 -0
- data/ext/sources/ggml/include/ggml-opt.h +1 -1
- data/ext/sources/ggml/include/ggml-rpc.h +6 -1
- data/ext/sources/ggml/include/ggml-virtgpu.h +14 -0
- data/ext/sources/ggml/include/ggml.h +56 -9
- data/ext/sources/ggml/src/CMakeLists.txt +3 -0
- data/ext/sources/ggml/src/ggml-alloc.c +4 -9
- data/ext/sources/ggml/src/ggml-backend-dl.cpp +48 -0
- data/ext/sources/ggml/src/ggml-backend-dl.h +45 -0
- data/ext/sources/ggml/src/ggml-backend-reg.cpp +28 -86
- data/ext/sources/ggml/src/ggml-backend.cpp +5 -2
- data/ext/sources/ggml/src/ggml-blas/CMakeLists.txt +1 -1
- data/ext/sources/ggml/src/ggml-blas/ggml-blas.cpp +6 -2
- data/ext/sources/ggml/src/ggml-cann/acl_tensor.cpp +1 -1
- data/ext/sources/ggml/src/ggml-cann/acl_tensor.h +1 -1
- data/ext/sources/ggml/src/ggml-cann/aclnn_ops.cpp +348 -189
- data/ext/sources/ggml/src/ggml-cann/aclnn_ops.h +40 -85
- data/ext/sources/ggml/src/ggml-cann/common.h +3 -4
- data/ext/sources/ggml/src/ggml-cann/ggml-cann.cpp +44 -62
- data/ext/sources/ggml/src/ggml-common.h +11 -0
- data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +16 -11
- data/ext/sources/ggml/src/ggml-cpu/amx/amx.cpp +42 -19
- data/ext/sources/ggml/src/ggml-cpu/amx/common.h +34 -10
- data/ext/sources/ggml/src/ggml-cpu/amx/mmq.cpp +85 -85
- data/ext/sources/ggml/src/ggml-cpu/arch/arm/quants.c +85 -1
- data/ext/sources/ggml/src/ggml-cpu/arch/arm/repack.cpp +2744 -548
- data/ext/sources/ggml/src/ggml-cpu/arch/riscv/quants.c +1653 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/riscv/repack.cpp +1391 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/s390/quants.c +8 -10
- data/ext/sources/ggml/src/ggml-cpu/arch/x86/quants.c +9 -9
- data/ext/sources/ggml/src/ggml-cpu/arch/x86/repack.cpp +118 -18
- data/ext/sources/ggml/src/ggml-cpu/arch-fallback.h +107 -26
- data/ext/sources/ggml/src/ggml-cpu/binary-ops.cpp +2 -6
- data/ext/sources/ggml/src/ggml-cpu/common.h +8 -0
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-impl.h +3 -0
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +59 -12
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.cpp +15 -0
- data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.cpp +21 -20
- data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +965 -252
- data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +584 -197
- data/ext/sources/ggml/src/ggml-cpu/ops.cpp +903 -188
- data/ext/sources/ggml/src/ggml-cpu/ops.h +1 -0
- data/ext/sources/ggml/src/ggml-cpu/quants.c +40 -0
- data/ext/sources/ggml/src/ggml-cpu/quants.h +3 -0
- data/ext/sources/ggml/src/ggml-cpu/repack.cpp +2890 -679
- data/ext/sources/ggml/src/ggml-cpu/repack.h +119 -8
- data/ext/sources/ggml/src/ggml-cpu/simd-gemm.h +136 -0
- data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +111 -3
- data/ext/sources/ggml/src/ggml-cpu/unary-ops.cpp +1 -1
- data/ext/sources/ggml/src/ggml-cpu/vec.cpp +17 -0
- data/ext/sources/ggml/src/ggml-cuda/CMakeLists.txt +1 -1
- data/ext/sources/ggml/src/ggml-cuda/argsort.cu +19 -10
- data/ext/sources/ggml/src/ggml-cuda/binbcast.cu +32 -30
- data/ext/sources/ggml/src/ggml-cuda/common.cuh +134 -18
- data/ext/sources/ggml/src/ggml-cuda/convert.cu +41 -27
- data/ext/sources/ggml/src/ggml-cuda/cpy.cu +6 -3
- data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +78 -64
- data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +384 -143
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cuh +36 -22
- data/ext/sources/ggml/src/ggml-cuda/fattn-vec.cuh +3 -3
- data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +26 -5
- data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cuh +1 -1
- data/ext/sources/ggml/src/ggml-cuda/fattn.cu +127 -12
- data/ext/sources/ggml/src/ggml-cuda/gated_delta_net.cu +263 -0
- data/ext/sources/ggml/src/ggml-cuda/gated_delta_net.cuh +4 -0
- data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +595 -200
- data/ext/sources/ggml/src/ggml-cuda/mean.cu +9 -8
- data/ext/sources/ggml/src/ggml-cuda/mma.cuh +173 -6
- data/ext/sources/ggml/src/ggml-cuda/mmf.cu +30 -10
- data/ext/sources/ggml/src/ggml-cuda/mmf.cuh +158 -85
- data/ext/sources/ggml/src/ggml-cuda/mmq.cuh +34 -22
- data/ext/sources/ggml/src/ggml-cuda/mmvf.cu +127 -67
- data/ext/sources/ggml/src/ggml-cuda/mmvf.cuh +2 -0
- data/ext/sources/ggml/src/ggml-cuda/mmvq.cu +157 -65
- data/ext/sources/ggml/src/ggml-cuda/mmvq.cuh +1 -0
- data/ext/sources/ggml/src/ggml-cuda/norm.cu +18 -76
- data/ext/sources/ggml/src/ggml-cuda/pad.cu +13 -10
- data/ext/sources/ggml/src/ggml-cuda/quantize.cu +1 -1
- data/ext/sources/ggml/src/ggml-cuda/reduce_rows.cuh +2 -16
- data/ext/sources/ggml/src/ggml-cuda/rope.cu +233 -133
- data/ext/sources/ggml/src/ggml-cuda/softmax.cu +8 -83
- data/ext/sources/ggml/src/ggml-cuda/solve_tri.cu +1 -1
- data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cu +56 -32
- data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cuh +1 -1
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_32.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_4.cu +1 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_32.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_4.cu +1 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_4.cu +1 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_4.cu +1 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +3 -3
- data/ext/sources/ggml/src/ggml-cuda/top-k.cu +0 -1
- data/ext/sources/ggml/src/ggml-cuda/topk-moe.cu +199 -135
- data/ext/sources/ggml/src/ggml-cuda/topk-moe.cuh +20 -14
- data/ext/sources/ggml/src/ggml-cuda/unary.cu +55 -0
- data/ext/sources/ggml/src/ggml-cuda/unary.cuh +2 -0
- data/ext/sources/ggml/src/ggml-cuda/vecdotq.cuh +31 -17
- data/ext/sources/ggml/src/ggml-cuda/vendors/hip.h +10 -0
- data/ext/sources/ggml/src/ggml-hexagon/CMakeLists.txt +82 -45
- data/ext/sources/ggml/src/ggml-hexagon/ggml-hexagon.cpp +334 -160
- data/ext/sources/ggml/src/ggml-hexagon/htp/CMakeLists.txt +7 -5
- data/ext/sources/ggml/src/ggml-hexagon/htp/act-ops.c +328 -197
- data/ext/sources/ggml/src/ggml-hexagon/htp/argsort-ops.c +281 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/binary-ops.c +765 -234
- data/ext/sources/ggml/src/ggml-hexagon/htp/cpy-ops.c +252 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/flash-attn-ops.c +412 -265
- data/ext/sources/ggml/src/ggml-hexagon/htp/get-rows-ops.c +23 -23
- data/ext/sources/ggml/src/ggml-hexagon/htp/{htp-dma.c → hex-dma.c} +1 -1
- data/ext/sources/ggml/src/ggml-hexagon/htp/{htp-dma.h → hex-dma.h} +28 -3
- data/ext/sources/ggml/src/ggml-hexagon/htp/hex-dump.h +77 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hex-fastdiv.h +37 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hex-utils.h +51 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ctx.h +1 -1
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp-msg.h +27 -37
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ops.h +6 -35
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-arith.h +443 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-base.h +240 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-copy.h +245 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-div.h +251 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-dump.h +129 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-exp.h +215 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-floor.h +100 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-inverse.h +210 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-reduce.h +296 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-scale.h +133 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sigmoid.h +141 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sqrt.h +126 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-types.h +36 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-utils.h +20 -1347
- data/ext/sources/ggml/src/ggml-hexagon/htp/main.c +211 -13
- data/ext/sources/ggml/src/ggml-hexagon/htp/matmul-ops.c +1119 -952
- data/ext/sources/ggml/src/ggml-hexagon/htp/rope-ops.c +254 -244
- data/ext/sources/ggml/src/ggml-hexagon/htp/set-rows-ops.c +36 -36
- data/ext/sources/ggml/src/ggml-hexagon/htp/softmax-ops.c +155 -138
- data/ext/sources/ggml/src/ggml-hexagon/htp/ssm-conv.c +339 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/sum-rows-ops.c +128 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/unary-ops.c +209 -114
- data/ext/sources/ggml/src/ggml-hexagon/htp/worker-pool.c +1 -5
- data/ext/sources/ggml/src/ggml-hexagon/htp-drv.cpp +418 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp-drv.h +121 -0
- data/ext/sources/ggml/src/ggml-hexagon/libdl.h +79 -0
- data/ext/sources/ggml/src/ggml-hexagon/libggml-htp.inf +38 -0
- data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +6 -0
- data/ext/sources/ggml/src/ggml-impl.h +62 -0
- data/ext/sources/ggml/src/ggml-metal/CMakeLists.txt +10 -10
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-common.cpp +13 -2
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.h +8 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.m +147 -17
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.cpp +274 -73
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.h +22 -4
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.m +102 -36
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +174 -23
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.cpp +580 -280
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.h +5 -4
- data/ext/sources/ggml/src/ggml-metal/ggml-metal.cpp +320 -107
- data/ext/sources/ggml/src/ggml-metal/ggml-metal.metal +1068 -825
- data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +19 -1
- data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +3108 -636
- data/ext/sources/ggml/src/ggml-opencl/kernels/concat.cl +41 -99
- data/ext/sources/ggml/src/ggml-opencl/kernels/cpy.cl +45 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/cumsum.cl +139 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/cvt.cl +204 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/diag.cl +27 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/exp.cl +125 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/expm1.cl +87 -56
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q4_1_f32.cl +132 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_general_q8_0_f32.cl +195 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q4_1_f32.cl +283 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/l2_norm.cl +71 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mean.cl +114 -13
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q4_0_f32_l4_lm.cl +163 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q4_1_f32_l4_lm.cl +165 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q6_k_f32_l4_lm.cl +158 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_8x4.cl +129 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32.cl +219 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32_flat.cl +229 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_k_f32.cl +180 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/{mul_mv_q6_k.cl → mul_mv_q6_k_f32.cl} +4 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32_flat.cl +194 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/neg.cl +125 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/repeat.cl +31 -32
- data/ext/sources/ggml/src/ggml-opencl/kernels/scale.cl +14 -4
- data/ext/sources/ggml/src/ggml-opencl/kernels/softplus.cl +88 -60
- data/ext/sources/ggml/src/ggml-opencl/kernels/solve_tri.cl +51 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/sum_rows.cl +114 -13
- data/ext/sources/ggml/src/ggml-opencl/kernels/tanh.cl +94 -48
- data/ext/sources/ggml/src/ggml-opencl/kernels/transpose.cl +26 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/tri.cl +32 -0
- data/ext/sources/ggml/src/ggml-openvino/.clang-format +154 -0
- data/ext/sources/ggml/src/ggml-openvino/CMakeLists.txt +22 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-decoder.cpp +975 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-decoder.h +294 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-openvino-extra.cpp +373 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-openvino-extra.h +182 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-openvino.cpp +1110 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-quants.cpp +884 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-quants.h +153 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/decoder.h +74 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/frontend.cpp +27 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/frontend.h +23 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/input_model.cpp +17 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/input_model.h +29 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/node_context.h +112 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/cont.cpp +48 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/cpy.cpp +21 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp +90 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/get_rows.cpp +69 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/glu_geglu.cpp +61 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp +62 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/mulmat.cpp +90 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/permute.cpp +102 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/reshape.cpp +83 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp +46 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/rope.cpp +123 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/scale.cpp +41 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/set_rows.cpp +76 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/softmax.cpp +89 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/transpose.cpp +23 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/unary_silu.cpp +27 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/view.cpp +53 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op_table.cpp +46 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op_table.h +39 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.cpp +123 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.h +17 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.cpp +60 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.h +17 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/mark_decompression_convert_constant_folding.h +29 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/squeeze_matmul.cpp +58 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/squeeze_matmul.h +17 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/translate_session.cpp +293 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/translate_session.h +28 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/utils.cpp +226 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/utils.h +85 -0
- data/ext/sources/ggml/src/ggml-openvino/utils.cpp +823 -0
- data/ext/sources/ggml/src/ggml-openvino/utils.h +123 -0
- data/ext/sources/ggml/src/ggml-quants.c +96 -5
- data/ext/sources/ggml/src/ggml-quants.h +3 -0
- data/ext/sources/ggml/src/ggml-sycl/CMakeLists.txt +15 -88
- data/ext/sources/ggml/src/ggml-sycl/add-id.cpp +5 -1
- data/ext/sources/ggml/src/ggml-sycl/backend.hpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/binbcast.cpp +21 -20
- data/ext/sources/ggml/src/ggml-sycl/common.hpp +315 -10
- data/ext/sources/ggml/src/ggml-sycl/convert.cpp +69 -1
- data/ext/sources/ggml/src/ggml-sycl/convert.hpp +22 -1
- data/ext/sources/ggml/src/ggml-sycl/count-equal.cpp +1 -1
- data/ext/sources/ggml/src/ggml-sycl/dpct/helper.hpp +791 -47
- data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +78 -68
- data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +2 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn-common.hpp +1179 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn-tile.cpp +55 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn-tile.hpp +1338 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn-vec.hpp +667 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn.cpp +225 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn.hpp +22 -0
- data/ext/sources/ggml/src/ggml-sycl/gated_delta_net.cpp +309 -0
- data/ext/sources/ggml/src/ggml-sycl/gated_delta_net.hpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/ggml-sycl.cpp +316 -51
- data/ext/sources/ggml/src/ggml-sycl/norm.cpp +65 -66
- data/ext/sources/ggml/src/ggml-sycl/outprod.cpp +3 -3
- data/ext/sources/ggml/src/ggml-sycl/presets.hpp +3 -0
- data/ext/sources/ggml/src/ggml-sycl/quants.hpp +1 -1
- data/ext/sources/ggml/src/ggml-sycl/rope.cpp +450 -287
- data/ext/sources/ggml/src/ggml-sycl/rope.hpp +6 -0
- data/ext/sources/ggml/src/ggml-sycl/softmax.cpp +6 -6
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq112-dv112.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq128-dv128.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq256-dv256.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq40-dv40.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq576-dv512.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq64-dv64.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq72-dv72.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq80-dv80.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq96-dv96.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-f16.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q8_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-f16.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q8_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-f16.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q8_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-f16.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q8_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-f16.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q8_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-f16.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q8_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +13 -0
- data/ext/sources/ggml/src/ggml-sycl/wkv.cpp +1 -1
- data/ext/sources/ggml/src/ggml-virtgpu/CMakeLists.txt +70 -0
- data/ext/sources/ggml/src/ggml-virtgpu/apir_cs_ggml-rpc-front.cpp +87 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/CMakeLists.txt +21 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/apir_cs_ggml-rpc-back.cpp +115 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-convert.h +13 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-backend.cpp +102 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer-type.cpp +105 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer.cpp +179 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-device.cpp +148 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched.cpp +51 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched.gen.h +73 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched.h +27 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-virgl-apir.h +32 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend.cpp +144 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/api_remoting.h +95 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_backend.gen.h +94 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_backend.h +50 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_cs.h +378 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_cs_ggml.h +232 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_cs_rpc.h +58 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-buffer-type.cpp +81 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-buffer.cpp +119 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-device.cpp +158 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-reg.cpp +213 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend.cpp +69 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-remoting.h +71 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggmlremoting_functions.yaml +166 -0
- data/ext/sources/ggml/src/ggml-virtgpu/include/apir_hw.h +9 -0
- data/ext/sources/ggml/src/ggml-virtgpu/regenerate_remoting.py +333 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-apir.h +15 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-backend.cpp +58 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-buffer-type.cpp +110 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-buffer.cpp +173 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-device.cpp +192 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-impl.h +36 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward.gen.h +53 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-shm.cpp +98 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-shm.h +23 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-utils.cpp +179 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-utils.h +86 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu.cpp +544 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu.h +117 -0
- data/ext/sources/ggml/src/ggml-vulkan/CMakeLists.txt +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/ggml-vulkan.cpp +1250 -465
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/acc.comp +16 -8
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/elu.comp +27 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +374 -170
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.glsl +66 -22
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +389 -201
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +106 -58
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_mask_opt.comp +162 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +9 -8
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gated_delta_net.comp +128 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/l2_norm.comp +12 -9
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.glsl +20 -17
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +11 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +8 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_id_funcs.glsl +3 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +5 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.glsl +3 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +2 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl +36 -63
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +7 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +7 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +7 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl +10 -5
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_vision.comp +7 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sgn.comp +21 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/ssm_conv.comp +16 -10
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +55 -35
- data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp +1314 -109
- data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu.cpp +1660 -1371
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/argmax.wgsl +72 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/argsort.wgsl +106 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/argsort_merge.wgsl +134 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary.wgsl +141 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +65 -72
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/concat.wgsl +75 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cpy.tmpl.wgsl +6 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cumsum.wgsl +66 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +40 -5
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn.wgsl +105 -60
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{get_rows.tmpl.wgsl → get_rows.wgsl} +53 -259
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{mul_mat.tmpl.wgsl → mul_mat.wgsl} +68 -257
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl +692 -23
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{mul_mat_reg_tile.tmpl.wgsl → mul_mat_reg_tile.wgsl} +28 -128
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{mul_mat_subgroup_matrix.tmpl.wgsl → mul_mat_subgroup_matrix.wgsl} +31 -137
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.wgsl +480 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/pad.wgsl +86 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/repeat.wgsl +67 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{scale.tmpl.wgsl → scale.wgsl} +9 -36
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +40 -12
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/sum_rows.wgsl +55 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/unary.wgsl +193 -0
- data/ext/sources/ggml/src/ggml-zdnn/ggml-zdnn.cpp +6 -1
- data/ext/sources/ggml/src/ggml-zendnn/CMakeLists.txt +31 -32
- data/ext/sources/ggml/src/ggml-zendnn/ggml-zendnn.cpp +9 -6
- data/ext/sources/ggml/src/ggml.c +167 -33
- data/ext/sources/ggml/src/gguf.cpp +229 -44
- data/ext/sources/src/whisper.cpp +6 -28
- data/sig/whisper.rbs +43 -2
- data/test/test_context_params.rb +82 -0
- data/test/test_token.rb +11 -0
- data/test/test_vad_context.rb +58 -8
- data/test/test_whisper.rb +20 -0
- data/whispercpp.gemspec +1 -1
- metadata +240 -28
- data/ext/sources/ggml/cmake/BuildTypes.cmake +0 -54
- data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm-ppc.h +0 -333
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-exp.c +0 -94
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-inverse.c +0 -72
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sigmoid.c +0 -49
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-utils.c +0 -1020
- data/ext/sources/ggml/src/ggml-hexagon/htp/ops-utils.h +0 -149
- data/ext/sources/ggml/src/ggml-hexagon/htp-utils.c +0 -454
- data/ext/sources/ggml/src/ggml-hexagon/htp-utils.h +0 -221
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/bin_op.tmpl.wgsl +0 -188
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary_head.tmpl +0 -45
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.tmpl.wgsl +0 -267
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.tmpl.wgsl +0 -112
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/unary_op.wgsl +0 -483
|
@@ -2,24 +2,20 @@
|
|
|
2
2
|
#pragma clang diagnostic ignored "-Wunused-function"
|
|
3
3
|
#pragma clang diagnostic ignored "-Wunused-but-set-variable"
|
|
4
4
|
|
|
5
|
-
#ifdef HTP_DEBUG
|
|
6
|
-
# define FARF_HIGH 1
|
|
7
|
-
#endif
|
|
8
5
|
#include <HAP_farf.h>
|
|
9
|
-
#include <HAP_mem.h>
|
|
10
6
|
#include <HAP_perf.h>
|
|
11
|
-
|
|
12
|
-
#include <hexagon_types.h>
|
|
7
|
+
|
|
13
8
|
#include <math.h>
|
|
14
9
|
#include <string.h>
|
|
15
10
|
|
|
11
|
+
#include "hex-dma.h"
|
|
12
|
+
#include "hvx-utils.h"
|
|
13
|
+
|
|
16
14
|
#define GGML_COMMON_DECL_C
|
|
17
15
|
#include "ggml-common.h"
|
|
18
16
|
#include "htp-ctx.h"
|
|
19
17
|
#include "htp-msg.h"
|
|
20
18
|
#include "htp-ops.h"
|
|
21
|
-
#include "hvx-utils.h"
|
|
22
|
-
#include "ops-utils.h"
|
|
23
19
|
|
|
24
20
|
#define set_rows_preamble \
|
|
25
21
|
const uint32_t ne00 = octx->src0.ne[0]; \
|
|
@@ -47,11 +43,21 @@
|
|
|
47
43
|
\
|
|
48
44
|
const uint32_t nr = ne01;
|
|
49
45
|
|
|
50
|
-
|
|
46
|
+
struct htp_set_rows_context {
|
|
47
|
+
struct htp_ops_context * octx;
|
|
48
|
+
struct fastdiv_values div_ne12;
|
|
49
|
+
struct fastdiv_values div_ne11;
|
|
50
|
+
uint32_t src0_nrows_per_thread;
|
|
51
|
+
};
|
|
52
|
+
|
|
53
|
+
static void set_rows_thread_f32_f32(unsigned int nth, unsigned int ith, void *data) {
|
|
54
|
+
struct htp_set_rows_context * srctx = (struct htp_set_rows_context *)data;
|
|
55
|
+
struct htp_ops_context * octx = srctx->octx;
|
|
56
|
+
|
|
51
57
|
set_rows_preamble;
|
|
52
58
|
|
|
53
59
|
// parallelize by rows of src0
|
|
54
|
-
const uint32_t dr =
|
|
60
|
+
const uint32_t dr = srctx->src0_nrows_per_thread;
|
|
55
61
|
const uint32_t ir0 = dr * ith;
|
|
56
62
|
const uint32_t ir1 = (ir0 + dr < nr) ? (ir0 + dr) : nr;
|
|
57
63
|
|
|
@@ -60,8 +66,8 @@ static int set_rows_thread_f32_f32(struct htp_ops_context * octx, const int nth,
|
|
|
60
66
|
for (uint32_t i03 = 0; i03 < ne03; ++i03) {
|
|
61
67
|
for (uint32_t i02 = 0; i02 < ne02; ++i02) {
|
|
62
68
|
for (uint32_t i = ir0; i < ir1; ++i) {
|
|
63
|
-
const uint32_t i12 = fastmodulo(i03, ne12, &
|
|
64
|
-
const uint32_t i11 = fastmodulo(i02, ne11, &
|
|
69
|
+
const uint32_t i12 = fastmodulo(i03, ne12, &srctx->div_ne12);
|
|
70
|
+
const uint32_t i11 = fastmodulo(i02, ne11, &srctx->div_ne11);
|
|
65
71
|
const uint32_t i10 = i;
|
|
66
72
|
|
|
67
73
|
const uintptr_t src1_addr = octx->src1.data + i10*nb10 + i11*nb11 + i12*nb12;
|
|
@@ -76,19 +82,20 @@ static int set_rows_thread_f32_f32(struct htp_ops_context * octx, const int nth,
|
|
|
76
82
|
const uintptr_t dst_ptr = octx->dst.data + i1*nb1 + i02*nb2 + i03*nb3;
|
|
77
83
|
|
|
78
84
|
// copy row
|
|
79
|
-
|
|
85
|
+
hvx_copy_f32_uu((uint8_t *)dst_ptr, (const uint8_t *)src0_ptr, ne00);
|
|
80
86
|
}
|
|
81
87
|
}
|
|
82
88
|
}
|
|
83
|
-
|
|
84
|
-
return HTP_STATUS_OK;
|
|
85
89
|
}
|
|
86
90
|
|
|
87
|
-
static
|
|
91
|
+
static void set_rows_thread_f16_f32(unsigned int nth, unsigned int ith, void *data) {
|
|
92
|
+
struct htp_set_rows_context * srctx = (struct htp_set_rows_context *)data;
|
|
93
|
+
struct htp_ops_context * octx = srctx->octx;
|
|
94
|
+
|
|
88
95
|
set_rows_preamble;
|
|
89
96
|
|
|
90
97
|
// parallelize by rows of src0
|
|
91
|
-
const uint32_t dr =
|
|
98
|
+
const uint32_t dr = srctx->src0_nrows_per_thread;
|
|
92
99
|
const uint32_t ir0 = dr * ith;
|
|
93
100
|
const uint32_t ir1 = (ir0 + dr < nr) ? (ir0 + dr) : nr;
|
|
94
101
|
|
|
@@ -97,8 +104,8 @@ static int set_rows_thread_f16_f32(struct htp_ops_context * octx, const int nth,
|
|
|
97
104
|
for (uint32_t i03 = 0; i03 < ne03; ++i03) {
|
|
98
105
|
for (uint32_t i02 = 0; i02 < ne02; ++i02) {
|
|
99
106
|
for (uint32_t i = ir0; i < ir1; ++i) {
|
|
100
|
-
const uint32_t i12 = fastmodulo(i03, ne12, &
|
|
101
|
-
const uint32_t i11 = fastmodulo(i02, ne11, &
|
|
107
|
+
const uint32_t i12 = fastmodulo(i03, ne12, &srctx->div_ne12);
|
|
108
|
+
const uint32_t i11 = fastmodulo(i02, ne11, &srctx->div_ne11);
|
|
102
109
|
const uint32_t i10 = i;
|
|
103
110
|
|
|
104
111
|
const uintptr_t src1_addr = octx->src1.data + i10*nb10 + i11*nb11 + i12*nb12;
|
|
@@ -112,25 +119,17 @@ static int set_rows_thread_f16_f32(struct htp_ops_context * octx, const int nth,
|
|
|
112
119
|
const uint8_t* src0_ptr = (const uint8_t *) octx->src0.data + i*nb01 + i02*nb02 + i03*nb03;
|
|
113
120
|
uint8_t* dst_ptr = (uint8_t *) octx->dst.data + i1*nb1 + i02*nb2 + i03*nb3;
|
|
114
121
|
|
|
115
|
-
|
|
122
|
+
hvx_copy_f16_f32_uu(dst_ptr, src0_ptr, ne00);
|
|
116
123
|
}
|
|
117
124
|
}
|
|
118
125
|
}
|
|
119
|
-
|
|
120
|
-
return HTP_STATUS_OK;
|
|
121
|
-
}
|
|
122
|
-
|
|
123
|
-
static void set_rows_work_f16_f32(unsigned int n, unsigned int i, void *data) {
|
|
124
|
-
set_rows_thread_f16_f32((struct htp_ops_context *) data, n, i);
|
|
125
|
-
}
|
|
126
|
-
|
|
127
|
-
static void set_rows_work_f32_f32(unsigned int n, unsigned int i, void *data) {
|
|
128
|
-
set_rows_thread_f32_f32((struct htp_ops_context *) data, n, i);
|
|
129
126
|
}
|
|
130
127
|
|
|
131
128
|
int op_set_rows(struct htp_ops_context * octx) {
|
|
132
129
|
set_rows_preamble;
|
|
133
130
|
|
|
131
|
+
const uint32_t n_threads = MIN(nr, octx->n_threads);
|
|
132
|
+
|
|
134
133
|
if (octx->src0.type != HTP_TYPE_F32) {
|
|
135
134
|
return HTP_STATUS_NO_SUPPORT;
|
|
136
135
|
}
|
|
@@ -147,18 +146,19 @@ int op_set_rows(struct htp_ops_context * octx) {
|
|
|
147
146
|
return HTP_STATUS_OK;
|
|
148
147
|
}
|
|
149
148
|
|
|
150
|
-
|
|
151
|
-
octx
|
|
149
|
+
struct htp_set_rows_context srctx;
|
|
150
|
+
srctx.octx = octx;
|
|
151
|
+
srctx.div_ne12 = init_fastdiv_values(ne12);
|
|
152
|
+
srctx.div_ne11 = init_fastdiv_values(ne11);
|
|
152
153
|
|
|
153
|
-
|
|
154
|
-
octx->src0_nrows_per_thread = (nr + n_jobs - 1) / n_jobs;
|
|
154
|
+
srctx.src0_nrows_per_thread = (nr + n_threads - 1) / n_threads;
|
|
155
155
|
|
|
156
156
|
switch(octx->dst.type) {
|
|
157
157
|
case HTP_TYPE_F32:
|
|
158
|
-
worker_pool_run_func(octx->ctx->worker_pool,
|
|
158
|
+
worker_pool_run_func(octx->ctx->worker_pool, set_rows_thread_f32_f32, &srctx, n_threads);
|
|
159
159
|
break;
|
|
160
160
|
case HTP_TYPE_F16:
|
|
161
|
-
worker_pool_run_func(octx->ctx->worker_pool,
|
|
161
|
+
worker_pool_run_func(octx->ctx->worker_pool, set_rows_thread_f16_f32, &srctx, n_threads);
|
|
162
162
|
break;
|
|
163
163
|
default:
|
|
164
164
|
return HTP_STATUS_NO_SUPPORT;
|
|
@@ -2,27 +2,21 @@
|
|
|
2
2
|
#pragma clang diagnostic ignored "-Wunused-function"
|
|
3
3
|
#pragma clang diagnostic ignored "-Wunused-but-set-variable"
|
|
4
4
|
|
|
5
|
-
#ifdef HTP_DEBUG
|
|
6
|
-
# define FARF_HIGH 1
|
|
7
|
-
#endif
|
|
8
5
|
#include <HAP_farf.h>
|
|
9
|
-
#include <HAP_mem.h>
|
|
10
6
|
#include <HAP_perf.h>
|
|
11
|
-
|
|
12
|
-
#include <hexagon_protos.h>
|
|
13
|
-
#include <hexagon_types.h>
|
|
7
|
+
|
|
14
8
|
#include <math.h>
|
|
15
|
-
#include <qurt_thread.h>
|
|
16
9
|
#include <string.h>
|
|
17
10
|
|
|
11
|
+
#include "hex-dma.h"
|
|
12
|
+
#include "hvx-utils.h"
|
|
13
|
+
#include "hex-fastdiv.h"
|
|
14
|
+
|
|
18
15
|
#define GGML_COMMON_DECL_C
|
|
19
16
|
#include "ggml-common.h"
|
|
20
17
|
#include "htp-ctx.h"
|
|
21
|
-
#include "htp-dma.h"
|
|
22
18
|
#include "htp-msg.h"
|
|
23
19
|
#include "htp-ops.h"
|
|
24
|
-
#include "hvx-utils.h"
|
|
25
|
-
#include "ops-utils.h"
|
|
26
20
|
|
|
27
21
|
#define htp_softmax_preamble3 \
|
|
28
22
|
const uint32_t ne00 = src0->ne[0]; \
|
|
@@ -55,7 +49,7 @@
|
|
|
55
49
|
const uint32_t nb2 = dst->nb[2]; \
|
|
56
50
|
const uint32_t nb3 = dst->nb[3];
|
|
57
51
|
|
|
58
|
-
struct
|
|
52
|
+
struct htp_softmax_context {
|
|
59
53
|
bool use_f16;
|
|
60
54
|
bool use_src1;
|
|
61
55
|
uint32_t n_head;
|
|
@@ -66,28 +60,48 @@ struct softmax_th_ctx {
|
|
|
66
60
|
float m0;
|
|
67
61
|
float m1;
|
|
68
62
|
|
|
63
|
+
uint32_t src0_nrows_per_thread;
|
|
64
|
+
struct fastdiv_values fastdiv_ne01;
|
|
65
|
+
struct fastdiv_values fastdiv_ne02;
|
|
66
|
+
struct fastdiv_values fastdiv_ne12; // For mask broadcasting
|
|
67
|
+
struct fastdiv_values fastdiv_ne13; // For mask broadcasting
|
|
68
|
+
size_t spad_stride;
|
|
69
|
+
|
|
69
70
|
struct htp_ops_context * octx;
|
|
70
71
|
};
|
|
71
72
|
|
|
72
|
-
static void init_softmax_ctx(struct
|
|
73
|
+
static void init_softmax_ctx(struct htp_softmax_context * smctx, struct htp_ops_context * octx) {
|
|
73
74
|
const struct htp_tensor * src0 = &octx->src0;
|
|
74
75
|
const struct htp_tensor * src1 = &octx->src1;
|
|
75
76
|
|
|
76
|
-
memset(
|
|
77
|
+
memset(smctx, 0, sizeof(struct htp_softmax_context));
|
|
78
|
+
|
|
79
|
+
memcpy(&smctx->scale, (float *) octx->op_params, sizeof(float));
|
|
80
|
+
memcpy(&smctx->max_bias, (float *) octx->op_params + 1, sizeof(float));
|
|
81
|
+
|
|
82
|
+
smctx->n_head = src0->ne[2];
|
|
83
|
+
smctx->n_head_log2 = 1u << (uint32_t) floor(log2(smctx->n_head));
|
|
84
|
+
|
|
85
|
+
smctx->m0 = powf(2.0f, -(smctx->max_bias) / smctx->n_head_log2);
|
|
86
|
+
smctx->m1 = powf(2.0f, -(smctx->max_bias / 2.0f) / smctx->n_head_log2);
|
|
77
87
|
|
|
78
|
-
|
|
79
|
-
|
|
88
|
+
smctx->use_src1 = (src1->ne[0] != 0);
|
|
89
|
+
smctx->use_f16 = (src1->ne[0] != 0) && (src1->type == HTP_TYPE_F16);
|
|
80
90
|
|
|
81
|
-
|
|
82
|
-
softmax_ctx->n_head_log2 = 1u << (uint32_t) floor(log2(softmax_ctx->n_head));
|
|
91
|
+
smctx->octx = octx;
|
|
83
92
|
|
|
84
|
-
|
|
85
|
-
|
|
93
|
+
// Initialize fastdiv values
|
|
94
|
+
const uint32_t ne01 = src0->ne[1];
|
|
95
|
+
const uint32_t ne02 = src0->ne[2];
|
|
86
96
|
|
|
87
|
-
|
|
88
|
-
|
|
97
|
+
if (ne01 > 0) smctx->fastdiv_ne01 = init_fastdiv_values(ne01);
|
|
98
|
+
if (ne02 > 0) smctx->fastdiv_ne02 = init_fastdiv_values(ne02);
|
|
89
99
|
|
|
90
|
-
|
|
100
|
+
const uint32_t ne12 = (src1->ne[0]) ? src1->ne[2] : 1;
|
|
101
|
+
const uint32_t ne13 = (src1->ne[0]) ? src1->ne[3] : 1;
|
|
102
|
+
|
|
103
|
+
if (ne12 > 0) smctx->fastdiv_ne12 = init_fastdiv_values(ne12);
|
|
104
|
+
if (ne13 > 0) smctx->fastdiv_ne13 = init_fastdiv_values(ne13);
|
|
91
105
|
}
|
|
92
106
|
|
|
93
107
|
static void hvx_fast_softmax_prep_f32(const uint8_t * restrict src,
|
|
@@ -100,8 +114,8 @@ static void hvx_fast_softmax_prep_f32(const uint8_t * restrict src,
|
|
|
100
114
|
uint8_t * restrict dst_curr = dst;
|
|
101
115
|
const uint8_t * restrict mask_curr = mask;
|
|
102
116
|
|
|
103
|
-
HVX_Vector scale_vec =
|
|
104
|
-
HVX_Vector slope_vec =
|
|
117
|
+
HVX_Vector scale_vec = hvx_vec_splat_f32(scale);
|
|
118
|
+
HVX_Vector slope_vec = hvx_vec_splat_f32(slope);
|
|
105
119
|
|
|
106
120
|
int step_of_1 = num_elems >> 5;
|
|
107
121
|
|
|
@@ -134,9 +148,9 @@ static void hvx_fast_softmax_f32(const uint8_t * restrict src,
|
|
|
134
148
|
HVX_Vector * restrict v_dst = (HVX_Vector *) dst;
|
|
135
149
|
|
|
136
150
|
HVX_Vector sum_vec = Q6_V_vsplat_R(0x00000000);
|
|
137
|
-
HVX_Vector max_vec =
|
|
151
|
+
HVX_Vector max_vec = hvx_vec_splat_f32(((const float *) src)[0]);
|
|
138
152
|
HVX_Vector zero_v = Q6_V_vzero();
|
|
139
|
-
HVX_Vector one_v =
|
|
153
|
+
HVX_Vector one_v = hvx_vec_splat_f32(1.0);
|
|
140
154
|
|
|
141
155
|
int step_of_1 = num_elems >> 5;
|
|
142
156
|
|
|
@@ -146,26 +160,24 @@ static void hvx_fast_softmax_f32(const uint8_t * restrict src,
|
|
|
146
160
|
max_vec = Q6_Vsf_vmax_VsfVsf(max_vec, v1);
|
|
147
161
|
}
|
|
148
162
|
|
|
149
|
-
|
|
150
|
-
max_vec = hvx_vec_repl4(v);
|
|
163
|
+
max_vec = hvx_vec_reduce_max_f32(max_vec); // replicated over all lanes
|
|
151
164
|
|
|
152
165
|
#pragma unroll(4)
|
|
153
166
|
for (int i = 0; i < step_of_1; i++) {
|
|
154
167
|
HVX_Vector v1 = v_src[i];
|
|
155
168
|
HVX_Vector v2 = Q6_Vqf32_vsub_VsfVsf(v1, max_vec);
|
|
156
169
|
|
|
157
|
-
HVX_Vector v3 =
|
|
170
|
+
HVX_Vector v3 = hvx_vec_exp_f32(Q6_Vsf_equals_Vqf32(v2));
|
|
158
171
|
|
|
159
172
|
sum_vec = Q6_Vqf32_vadd_VsfVsf(Q6_Vsf_equals_Vqf32(sum_vec), v3);
|
|
160
173
|
|
|
161
174
|
v_pad[i] = v3;
|
|
162
175
|
}
|
|
163
176
|
|
|
164
|
-
|
|
165
|
-
sum_vec = hvx_vec_repl4(Q6_Vsf_equals_Vqf32(v));
|
|
177
|
+
sum_vec = hvx_vec_reduce_sum_f32(Q6_Vsf_equals_Vqf32(sum_vec)); // replicated over all lanes
|
|
166
178
|
|
|
167
179
|
HVX_VectorPred pos_sum = Q6_Q_vcmp_gt_VwVw(sum_vec, zero_v);
|
|
168
|
-
HVX_Vector v4 =
|
|
180
|
+
HVX_Vector v4 = hvx_vec_inverse_f32(sum_vec);
|
|
169
181
|
HVX_Vector scale_vec = Q6_V_vmux_QVV(pos_sum, v4, one_v);
|
|
170
182
|
|
|
171
183
|
#pragma unroll(4)
|
|
@@ -181,92 +193,18 @@ static float hvx_softmax_f32(const uint8_t * restrict src,
|
|
|
181
193
|
uint8_t * restrict spad,
|
|
182
194
|
const int num_elems,
|
|
183
195
|
const float max) {
|
|
184
|
-
hvx_sub_scalar_f32(src, max,
|
|
196
|
+
hvx_sub_scalar_f32(spad, src, max, num_elems);
|
|
185
197
|
|
|
186
198
|
hvx_exp_f32(spad, dst, num_elems, false);
|
|
187
199
|
|
|
188
|
-
float sum =
|
|
200
|
+
float sum = hvx_reduce_sum_f32(dst, num_elems);
|
|
189
201
|
|
|
190
202
|
return sum;
|
|
191
203
|
}
|
|
192
204
|
|
|
193
|
-
static void
|
|
194
|
-
struct
|
|
195
|
-
|
|
196
|
-
const struct htp_tensor * src0 = &octx->src0;
|
|
197
|
-
const struct htp_tensor * src1 = &octx->src1;
|
|
198
|
-
const struct htp_tensor * dst = &octx->dst;
|
|
199
|
-
|
|
200
|
-
htp_softmax_preamble3;
|
|
201
|
-
|
|
202
|
-
uint8_t * src0_spad_data = octx->src0_spad.data + (ith * nb01);
|
|
203
|
-
uint8_t * src1_spad_data = octx->src1_spad.data + (ith * nb01);
|
|
204
|
-
uint8_t * dst_spad_data = octx->dst_spad.data + (ith * nb1);
|
|
205
|
-
|
|
206
|
-
float * wp0 = (float *) src0_spad_data;
|
|
207
|
-
float * wp1 = (float *) src1_spad_data;
|
|
208
|
-
float * wp2 = (float *) dst_spad_data;
|
|
209
|
-
|
|
210
|
-
for (uint32_t i03 = 0; i03 < ne03; i03++) {
|
|
211
|
-
for (uint32_t i02 = 0; i02 < ne02; i02++) {
|
|
212
|
-
for (uint32_t i01 = ith; i01 < ne01; i01 += nth) {
|
|
213
|
-
const uint32_t i11 = i01;
|
|
214
|
-
const uint32_t i12 = i02 % ne12;
|
|
215
|
-
const uint32_t i13 = i03 % ne13;
|
|
216
|
-
|
|
217
|
-
// ALiBi
|
|
218
|
-
const uint32_t h = i02; // head
|
|
219
|
-
|
|
220
|
-
const float slope = (softmax_ctx->max_bias > 0.0f) ?
|
|
221
|
-
h < softmax_ctx->n_head_log2 ?
|
|
222
|
-
powf(softmax_ctx->m0, h + 1) :
|
|
223
|
-
powf(softmax_ctx->m1, 2 * (h - softmax_ctx->n_head_log2) + 1) :
|
|
224
|
-
1.0f;
|
|
225
|
-
|
|
226
|
-
float * sp = (float *) ((char *) octx->src0.data + i01 * nb01 + i02 * nb02 + i03 * nb03);
|
|
227
|
-
float * dp = (float *) ((char *) octx->dst.data + i01 * nb1 + i02 * nb2 + i03 * nb3);
|
|
228
|
-
|
|
229
|
-
// broadcast the mask across rows
|
|
230
|
-
__fp16 * mp_f16 = (softmax_ctx->use_src1) ?
|
|
231
|
-
(__fp16 *) ((char *) octx->src1.data + i11 * nb11 + i12 * nb12 + i13 * nb13) :
|
|
232
|
-
NULL;
|
|
233
|
-
float * mp_f32 = (softmax_ctx->use_src1) ?
|
|
234
|
-
(float *) ((char *) octx->src1.data + i11 * nb11 + i12 * nb12 + i13 * nb13) :
|
|
235
|
-
NULL;
|
|
236
|
-
|
|
237
|
-
if ((1 == opt_path) && (mp_f32) && !(softmax_ctx->use_f16)) {
|
|
238
|
-
hvx_fast_softmax_prep_f32((const uint8_t *) sp, (uint8_t *) wp0, ne00, softmax_ctx->scale,
|
|
239
|
-
(const uint8_t *) mp_f32, slope);
|
|
240
|
-
} else {
|
|
241
|
-
hvx_scale_f32((uint8_t *) wp0, (const uint8_t *) sp, ne00, softmax_ctx->scale);
|
|
242
|
-
if (mp_f32) {
|
|
243
|
-
if (softmax_ctx->use_f16) {
|
|
244
|
-
for (int i = 0; i < ne00; ++i) {
|
|
245
|
-
wp0[i] += slope * (float) mp_f16[i];
|
|
246
|
-
}
|
|
247
|
-
} else {
|
|
248
|
-
for (int i = 0; i < ne00; ++i) {
|
|
249
|
-
wp0[i] += slope * mp_f32[i];
|
|
250
|
-
}
|
|
251
|
-
}
|
|
252
|
-
}
|
|
253
|
-
}
|
|
254
|
-
|
|
255
|
-
if (1 == opt_path) {
|
|
256
|
-
hvx_fast_softmax_f32((const uint8_t *) wp0, (uint8_t *) dp, (uint8_t *) wp1, ne00);
|
|
257
|
-
} else {
|
|
258
|
-
float max = hvx_self_max_f32((const uint8_t *) wp0, ne00);
|
|
259
|
-
float sum = hvx_softmax_f32((const uint8_t *) wp0, (uint8_t *) wp2, (uint8_t *) wp1, ne00, max);
|
|
260
|
-
sum = sum > 0.0 ? (1.0 / sum) : 1;
|
|
261
|
-
hvx_scale_f32((uint8_t *) dp, (const uint8_t *) wp2, ne00, sum);
|
|
262
|
-
}
|
|
263
|
-
}
|
|
264
|
-
}
|
|
265
|
-
}
|
|
266
|
-
}
|
|
267
|
-
|
|
268
|
-
static void softmax_job_f32_per_thread(struct softmax_th_ctx * softmax_ctx, int nth, int ith) {
|
|
269
|
-
struct htp_ops_context * octx = softmax_ctx->octx;
|
|
205
|
+
static void softmax_job_f32(unsigned int nth, unsigned int ith, void * data) {
|
|
206
|
+
struct htp_softmax_context * smctx = (struct htp_softmax_context *) data;
|
|
207
|
+
struct htp_ops_context * octx = smctx->octx;
|
|
270
208
|
|
|
271
209
|
const struct htp_tensor * src0 = &octx->src0;
|
|
272
210
|
const struct htp_tensor * src1 = &octx->src1;
|
|
@@ -275,7 +213,7 @@ static void softmax_job_f32_per_thread(struct softmax_th_ctx * softmax_ctx, int
|
|
|
275
213
|
htp_softmax_preamble3;
|
|
276
214
|
|
|
277
215
|
const uint32_t src0_nrows = ne01 * ne02 * ne03; // src0 rows
|
|
278
|
-
const uint32_t src0_nrows_per_thread =
|
|
216
|
+
const uint32_t src0_nrows_per_thread = smctx->src0_nrows_per_thread;
|
|
279
217
|
|
|
280
218
|
const uint32_t src0_start_row = src0_nrows_per_thread * ith;
|
|
281
219
|
const uint32_t src0_end_row = MIN(src0_start_row + src0_nrows_per_thread, src0_nrows);
|
|
@@ -290,7 +228,7 @@ static void softmax_job_f32_per_thread(struct softmax_th_ctx * softmax_ctx, int
|
|
|
290
228
|
|
|
291
229
|
int is_aligned = 1;
|
|
292
230
|
int opt_path = 0;
|
|
293
|
-
if (!
|
|
231
|
+
if (!hex_is_aligned((void *) src0->data, VLEN) || !hex_is_aligned((void *) dst->data, VLEN)) {
|
|
294
232
|
is_aligned = 0;
|
|
295
233
|
FARF(HIGH, "softmax-f32: unaligned addresses in elementwise op, possibly slower execution\n");
|
|
296
234
|
}
|
|
@@ -298,20 +236,103 @@ static void softmax_job_f32_per_thread(struct softmax_th_ctx * softmax_ctx, int
|
|
|
298
236
|
opt_path = 1;
|
|
299
237
|
}
|
|
300
238
|
|
|
301
|
-
|
|
239
|
+
uint8_t * src0_spad_data = octx->src0_spad.data + (ith * smctx->spad_stride);
|
|
240
|
+
uint8_t * src1_spad_data = octx->src1_spad.data + (ith * smctx->spad_stride);
|
|
241
|
+
uint8_t * dst_spad_data = octx->dst_spad.data + (ith * smctx->spad_stride);
|
|
242
|
+
|
|
243
|
+
float * wp0 = (float *) src0_spad_data;
|
|
244
|
+
float * wp1 = (float *) src1_spad_data;
|
|
245
|
+
float * wp2 = (float *) dst_spad_data;
|
|
246
|
+
|
|
247
|
+
uint32_t prev_i2 = (uint32_t)-1;
|
|
248
|
+
float slope = 1.0f;
|
|
249
|
+
|
|
250
|
+
for (uint32_t r = src0_start_row; r < src0_end_row; ++r) {
|
|
251
|
+
uint32_t i1 = fastmodulo(r, ne01, &smctx->fastdiv_ne01);
|
|
252
|
+
uint32_t r_div_ne01 = fastdiv(r, &smctx->fastdiv_ne01);
|
|
253
|
+
uint32_t i2 = fastmodulo(r_div_ne01, ne02, &smctx->fastdiv_ne02);
|
|
254
|
+
uint32_t i3 = fastdiv(r_div_ne01, &smctx->fastdiv_ne02);
|
|
255
|
+
|
|
256
|
+
// Map to original logic indices
|
|
257
|
+
// i01 = i1
|
|
258
|
+
// i02 = i2
|
|
259
|
+
// i03 = i3
|
|
260
|
+
|
|
261
|
+
const uint32_t i11 = i1;
|
|
262
|
+
// const uint32_t i12 = i2 % ne12;
|
|
263
|
+
// const uint32_t i13 = i3 % ne13;
|
|
264
|
+
|
|
265
|
+
uint32_t i12, i13;
|
|
266
|
+
if (ne12 == ne02) {
|
|
267
|
+
i12 = i2;
|
|
268
|
+
} else {
|
|
269
|
+
i12 = fastmodulo(i2, ne12, &smctx->fastdiv_ne12);
|
|
270
|
+
}
|
|
271
|
+
|
|
272
|
+
if (ne13 == ne03) {
|
|
273
|
+
i13 = i3;
|
|
274
|
+
} else {
|
|
275
|
+
i13 = fastmodulo(i3, ne13, &smctx->fastdiv_ne13);
|
|
276
|
+
}
|
|
277
|
+
|
|
278
|
+
// ALiBi
|
|
279
|
+
if (i2 != prev_i2) {
|
|
280
|
+
const uint32_t h = i2; // head
|
|
281
|
+
|
|
282
|
+
slope = (smctx->max_bias > 0.0f) ?
|
|
283
|
+
h < smctx->n_head_log2 ?
|
|
284
|
+
powf(smctx->m0, h + 1) :
|
|
285
|
+
powf(smctx->m1, 2 * (h - smctx->n_head_log2) + 1) :
|
|
286
|
+
1.0f;
|
|
287
|
+
prev_i2 = i2;
|
|
288
|
+
}
|
|
289
|
+
|
|
290
|
+
float * sp = (float *) ((char *) octx->src0.data + i1 * nb01 + i2 * nb02 + i3 * nb03);
|
|
291
|
+
float * dp = (float *) ((char *) octx->dst.data + i1 * nb1 + i2 * nb2 + i3 * nb3);
|
|
292
|
+
|
|
293
|
+
// broadcast the mask across rows
|
|
294
|
+
__fp16 * mp_f16 = (smctx->use_src1) ?
|
|
295
|
+
(__fp16 *) ((char *) octx->src1.data + i11 * nb11 + i12 * nb12 + i13 * nb13) :
|
|
296
|
+
NULL;
|
|
297
|
+
float * mp_f32 = (smctx->use_src1) ?
|
|
298
|
+
(float *) ((char *) octx->src1.data + i11 * nb11 + i12 * nb12 + i13 * nb13) :
|
|
299
|
+
NULL;
|
|
300
|
+
|
|
301
|
+
if ((1 == opt_path) && (mp_f32) && !(smctx->use_f16)) {
|
|
302
|
+
hvx_fast_softmax_prep_f32((const uint8_t *) sp, (uint8_t *) wp0, ne00, smctx->scale,
|
|
303
|
+
(const uint8_t *) mp_f32, slope);
|
|
304
|
+
} else {
|
|
305
|
+
hvx_scale_f32((uint8_t *) wp0, (const uint8_t *) sp, ne00, smctx->scale);
|
|
306
|
+
if (mp_f32) {
|
|
307
|
+
if (smctx->use_f16) {
|
|
308
|
+
for (int i = 0; i < ne00; ++i) {
|
|
309
|
+
wp0[i] += slope * (float) mp_f16[i];
|
|
310
|
+
}
|
|
311
|
+
} else {
|
|
312
|
+
for (int i = 0; i < ne00; ++i) {
|
|
313
|
+
wp0[i] += slope * mp_f32[i];
|
|
314
|
+
}
|
|
315
|
+
}
|
|
316
|
+
}
|
|
317
|
+
}
|
|
318
|
+
|
|
319
|
+
if (1 == opt_path) {
|
|
320
|
+
hvx_fast_softmax_f32((const uint8_t *) wp0, (uint8_t *) dp, (uint8_t *) wp1, ne00);
|
|
321
|
+
} else {
|
|
322
|
+
float max = hvx_reduce_max_f32((const uint8_t *) wp0, ne00);
|
|
323
|
+
float sum = hvx_softmax_f32((const uint8_t *) wp0, (uint8_t *) wp2, (uint8_t *) wp1, ne00, max);
|
|
324
|
+
sum = sum > 0.0 ? (1.0 / sum) : 1;
|
|
325
|
+
hvx_scale_f32((uint8_t *) dp, (const uint8_t *) wp2, ne00, sum);
|
|
326
|
+
}
|
|
327
|
+
}
|
|
302
328
|
|
|
303
329
|
t2 = HAP_perf_get_qtimer_count();
|
|
304
330
|
|
|
305
331
|
FARF(HIGH, "softmax-f32 %d/%d/%d/%d: %ux%ux%ux%u (%u:%u) x %ux%ux%ux%u -> %ux%ux%ux%u usec %u\n", ith, nth,
|
|
306
|
-
|
|
332
|
+
smctx->use_f16, opt_path, ne00, ne01, ne02, ne03, src0_start_row, src0_end_row, ne10, ne11, ne12, ne13,
|
|
307
333
|
ne0, ne1, ne2, ne3, (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
|
|
308
334
|
}
|
|
309
335
|
|
|
310
|
-
static void softmax_job_dispatcher_f32(unsigned int n, unsigned int i, void * p_data) {
|
|
311
|
-
struct softmax_th_ctx * p_softmax_ctx = (struct softmax_th_ctx *) p_data;
|
|
312
|
-
softmax_job_f32_per_thread(p_softmax_ctx, n, i);
|
|
313
|
-
}
|
|
314
|
-
|
|
315
336
|
static int execute_op_softmax_f32(struct htp_ops_context * octx) {
|
|
316
337
|
int err = HTP_STATUS_OK;
|
|
317
338
|
|
|
@@ -319,17 +340,12 @@ static int execute_op_softmax_f32(struct htp_ops_context * octx) {
|
|
|
319
340
|
const struct htp_tensor * src1 = &octx->src1;
|
|
320
341
|
struct htp_tensor * dst = &octx->dst;
|
|
321
342
|
|
|
322
|
-
|
|
323
|
-
const char *
|
|
324
|
-
|
|
325
|
-
struct softmax_th_ctx softmax_ctx;
|
|
343
|
+
struct htp_softmax_context smctx;
|
|
344
|
+
const char * op_type = "softmax-f32";
|
|
326
345
|
|
|
327
346
|
switch (octx->op) {
|
|
328
347
|
case HTP_OP_SOFTMAX:
|
|
329
|
-
|
|
330
|
-
op_type = "softmax-f32";
|
|
331
|
-
|
|
332
|
-
init_softmax_ctx(&softmax_ctx, octx);
|
|
348
|
+
init_softmax_ctx(&smctx, octx);
|
|
333
349
|
break;
|
|
334
350
|
|
|
335
351
|
default:
|
|
@@ -337,7 +353,8 @@ static int execute_op_softmax_f32(struct htp_ops_context * octx) {
|
|
|
337
353
|
return HTP_STATUS_NO_SUPPORT;
|
|
338
354
|
}
|
|
339
355
|
|
|
340
|
-
const uint32_t
|
|
356
|
+
const uint32_t src0_nrows = src0->ne[1] * src0->ne[2] * src0->ne[3];
|
|
357
|
+
const uint32_t n_threads = MIN(octx->n_threads, src0_nrows);
|
|
341
358
|
|
|
342
359
|
const size_t src0_row_size = src0->nb[1];
|
|
343
360
|
const size_t src1_row_size = src0_row_size;
|
|
@@ -345,9 +362,12 @@ static int execute_op_softmax_f32(struct htp_ops_context * octx) {
|
|
|
345
362
|
|
|
346
363
|
// VTCM scratchpads for all tensors
|
|
347
364
|
// N rows per thread, padded to HVX vector size
|
|
348
|
-
octx->dst_spad.size =
|
|
349
|
-
octx->src0_spad.size =
|
|
350
|
-
octx->src1_spad.size =
|
|
365
|
+
octx->dst_spad.size = hex_round_up(dst_row_size, 128) * n_threads;
|
|
366
|
+
octx->src0_spad.size = hex_round_up(src0_row_size, 128) * n_threads;
|
|
367
|
+
octx->src1_spad.size = hex_round_up(src1_row_size, 128) * n_threads;
|
|
368
|
+
|
|
369
|
+
// Use stride for calculating offset
|
|
370
|
+
smctx.spad_stride = hex_round_up(src0_row_size, 128);
|
|
351
371
|
|
|
352
372
|
size_t spad_size = octx->src0_spad.size + octx->src1_spad.size + octx->dst_spad.size;
|
|
353
373
|
|
|
@@ -374,12 +394,9 @@ static int execute_op_softmax_f32(struct htp_ops_context * octx) {
|
|
|
374
394
|
octx->src1_spad.data = octx->src0_spad.data + octx->src0_spad.size;
|
|
375
395
|
octx->dst_spad.data = octx->src1_spad.data + octx->src1_spad.size;
|
|
376
396
|
|
|
377
|
-
uint32_t src0_nrows = src0->ne[1] * src0->ne[2] * src0->ne[3];
|
|
378
|
-
|
|
379
397
|
if (!(octx->flags & HTP_OPFLAGS_SKIP_COMPUTE)) {
|
|
380
|
-
|
|
381
|
-
octx->
|
|
382
|
-
worker_pool_run_func(octx->ctx->worker_pool, op_func, &softmax_ctx, n_jobs);
|
|
398
|
+
smctx.src0_nrows_per_thread = (src0_nrows + n_threads - 1) / n_threads;
|
|
399
|
+
worker_pool_run_func(octx->ctx->worker_pool, softmax_job_f32, &smctx, n_threads);
|
|
383
400
|
}
|
|
384
401
|
|
|
385
402
|
return err;
|