whispercpp 1.3.4 → 1.3.5
This diff shows the changes between the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +60 -43
- data/ext/extconf.rb +2 -2
- data/ext/ruby_whisper.c +14 -2
- data/ext/ruby_whisper.h +39 -0
- data/ext/ruby_whisper_context.c +22 -22
- data/ext/ruby_whisper_model.c +12 -12
- data/ext/ruby_whisper_params.c +47 -23
- data/ext/ruby_whisper_segment.c +84 -19
- data/ext/ruby_whisper_token.c +351 -0
- data/ext/ruby_whisper_transcribe.cpp +1 -1
- data/ext/ruby_whisper_vad_context.c +75 -0
- data/ext/ruby_whisper_vad_context_detect.cpp +50 -0
- data/ext/ruby_whisper_vad_segment.c +139 -0
- data/ext/ruby_whisper_vad_segments.c +106 -0
- data/ext/sources/CMakeLists.txt +4 -1
- data/ext/sources/bindings/javascript/package.json +1 -1
- data/ext/sources/cmake/arm64-apple-clang.cmake +16 -0
- data/ext/sources/cmake/arm64-windows-llvm.cmake +16 -0
- data/ext/sources/cmake/riscv64-spacemit-linux-gnu-gcc.cmake +29 -0
- data/ext/sources/cmake/x64-windows-llvm.cmake +5 -0
- data/ext/sources/examples/addon.node/vad-example.js +2 -2
- data/ext/sources/examples/cli/cli.cpp +121 -112
- data/ext/sources/examples/lsp/CMakeLists.txt +2 -1
- data/ext/sources/examples/quantize/CMakeLists.txt +2 -1
- data/ext/sources/examples/server/server.cpp +10 -11
- data/ext/sources/examples/talk-llama/CMakeLists.txt +5 -1
- data/ext/sources/examples/talk-llama/llama-adapter.cpp +12 -3
- data/ext/sources/examples/talk-llama/llama-adapter.h +7 -1
- data/ext/sources/examples/talk-llama/llama-arch.cpp +2046 -1974
- data/ext/sources/examples/talk-llama/llama-arch.h +67 -2
- data/ext/sources/examples/talk-llama/llama-batch.cpp +75 -33
- data/ext/sources/examples/talk-llama/llama-batch.h +17 -4
- data/ext/sources/examples/talk-llama/llama-chat.cpp +79 -3
- data/ext/sources/examples/talk-llama/llama-chat.h +4 -0
- data/ext/sources/examples/talk-llama/llama-context.cpp +775 -78
- data/ext/sources/examples/talk-llama/llama-context.h +57 -9
- data/ext/sources/examples/talk-llama/llama-cparams.h +1 -0
- data/ext/sources/examples/talk-llama/llama-grammar.cpp +288 -53
- data/ext/sources/examples/talk-llama/llama-grammar.h +22 -1
- data/ext/sources/examples/talk-llama/llama-graph.cpp +381 -64
- data/ext/sources/examples/talk-llama/llama-graph.h +103 -13
- data/ext/sources/examples/talk-llama/llama-hparams.cpp +26 -2
- data/ext/sources/examples/talk-llama/llama-hparams.h +41 -10
- data/ext/sources/examples/talk-llama/llama-impl.cpp +7 -3
- data/ext/sources/examples/talk-llama/llama-impl.h +1 -1
- data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.cpp +5 -3
- data/ext/sources/examples/talk-llama/llama-kv-cache.cpp +145 -65
- data/ext/sources/examples/talk-llama/llama-kv-cache.h +22 -7
- data/ext/sources/examples/talk-llama/llama-kv-cells.h +44 -2
- data/ext/sources/examples/talk-llama/llama-memory-hybrid.cpp +12 -10
- data/ext/sources/examples/talk-llama/llama-memory-recurrent.cpp +32 -19
- data/ext/sources/examples/talk-llama/llama-memory-recurrent.h +2 -2
- data/ext/sources/examples/talk-llama/llama-mmap.cpp +172 -37
- data/ext/sources/examples/talk-llama/llama-mmap.h +8 -3
- data/ext/sources/examples/talk-llama/llama-model-loader.cpp +91 -9
- data/ext/sources/examples/talk-llama/llama-model-loader.h +6 -0
- data/ext/sources/examples/talk-llama/llama-model-saver.cpp +3 -0
- data/ext/sources/examples/talk-llama/llama-model.cpp +1529 -13134
- data/ext/sources/examples/talk-llama/llama-model.h +44 -3
- data/ext/sources/examples/talk-llama/llama-quant.cpp +8 -23
- data/ext/sources/examples/talk-llama/llama-sampling.cpp +1294 -198
- data/ext/sources/examples/talk-llama/llama-sampling.h +19 -7
- data/ext/sources/examples/talk-llama/llama-vocab.cpp +133 -37
- data/ext/sources/examples/talk-llama/llama-vocab.h +45 -40
- data/ext/sources/examples/talk-llama/llama.cpp +729 -2
- data/ext/sources/examples/talk-llama/llama.h +152 -14
- data/ext/sources/examples/talk-llama/models/afmoe.cpp +191 -0
- data/ext/sources/examples/talk-llama/models/apertus.cpp +125 -0
- data/ext/sources/examples/talk-llama/models/arcee.cpp +135 -0
- data/ext/sources/examples/talk-llama/models/arctic.cpp +138 -0
- data/ext/sources/examples/talk-llama/models/arwkv7.cpp +86 -0
- data/ext/sources/examples/talk-llama/models/baichuan.cpp +122 -0
- data/ext/sources/examples/talk-llama/models/bailingmoe.cpp +144 -0
- data/ext/sources/examples/talk-llama/models/bailingmoe2.cpp +135 -0
- data/ext/sources/examples/talk-llama/models/bert.cpp +178 -0
- data/ext/sources/examples/talk-llama/models/bitnet.cpp +160 -0
- data/ext/sources/examples/talk-llama/models/bloom.cpp +101 -0
- data/ext/sources/examples/talk-llama/models/chameleon.cpp +178 -0
- data/ext/sources/examples/talk-llama/models/chatglm.cpp +132 -0
- data/ext/sources/examples/talk-llama/models/codeshell.cpp +111 -0
- data/ext/sources/examples/talk-llama/models/cogvlm.cpp +102 -0
- data/ext/sources/examples/talk-llama/models/cohere2-iswa.cpp +134 -0
- data/ext/sources/examples/talk-llama/models/command-r.cpp +122 -0
- data/ext/sources/examples/talk-llama/models/dbrx.cpp +123 -0
- data/ext/sources/examples/talk-llama/models/deci.cpp +135 -0
- data/ext/sources/examples/talk-llama/models/deepseek.cpp +144 -0
- data/ext/sources/examples/talk-llama/models/deepseek2.cpp +259 -0
- data/ext/sources/examples/talk-llama/models/dots1.cpp +134 -0
- data/ext/sources/examples/talk-llama/models/dream.cpp +105 -0
- data/ext/sources/examples/talk-llama/models/ernie4-5-moe.cpp +150 -0
- data/ext/sources/examples/talk-llama/models/ernie4-5.cpp +110 -0
- data/ext/sources/examples/talk-llama/models/exaone.cpp +114 -0
- data/ext/sources/examples/talk-llama/models/exaone4.cpp +123 -0
- data/ext/sources/examples/talk-llama/models/falcon-h1.cpp +113 -0
- data/ext/sources/examples/talk-llama/models/falcon.cpp +120 -0
- data/ext/sources/examples/talk-llama/models/gemma-embedding.cpp +116 -0
- data/ext/sources/examples/talk-llama/models/gemma.cpp +112 -0
- data/ext/sources/examples/talk-llama/models/gemma2-iswa.cpp +128 -0
- data/ext/sources/examples/talk-llama/models/gemma3.cpp +155 -0
- data/ext/sources/examples/talk-llama/models/gemma3n-iswa.cpp +384 -0
- data/ext/sources/examples/talk-llama/models/glm4-moe.cpp +170 -0
- data/ext/sources/examples/talk-llama/models/glm4.cpp +150 -0
- data/ext/sources/examples/talk-llama/models/gpt2.cpp +105 -0
- data/ext/sources/examples/talk-llama/models/gptneox.cpp +144 -0
- data/ext/sources/examples/talk-llama/models/granite-hybrid.cpp +196 -0
- data/ext/sources/examples/talk-llama/models/granite.cpp +211 -0
- data/ext/sources/examples/talk-llama/models/graph-context-mamba.cpp +283 -0
- data/ext/sources/examples/talk-llama/models/grok.cpp +159 -0
- data/ext/sources/examples/talk-llama/models/grovemoe.cpp +141 -0
- data/ext/sources/examples/talk-llama/models/hunyuan-dense.cpp +132 -0
- data/ext/sources/examples/talk-llama/models/hunyuan-moe.cpp +154 -0
- data/ext/sources/examples/talk-llama/models/internlm2.cpp +120 -0
- data/ext/sources/examples/talk-llama/models/jais.cpp +86 -0
- data/ext/sources/examples/talk-llama/models/jamba.cpp +106 -0
- data/ext/sources/examples/talk-llama/models/lfm2.cpp +175 -0
- data/ext/sources/examples/talk-llama/models/llada-moe.cpp +122 -0
- data/ext/sources/examples/talk-llama/models/llada.cpp +99 -0
- data/ext/sources/examples/talk-llama/models/llama-iswa.cpp +178 -0
- data/ext/sources/examples/talk-llama/models/llama.cpp +168 -0
- data/ext/sources/examples/talk-llama/models/maincoder.cpp +117 -0
- data/ext/sources/examples/talk-llama/models/mamba.cpp +55 -0
- data/ext/sources/examples/talk-llama/models/mimo2-iswa.cpp +123 -0
- data/ext/sources/examples/talk-llama/models/minicpm3.cpp +199 -0
- data/ext/sources/examples/talk-llama/models/minimax-m2.cpp +124 -0
- data/ext/sources/examples/talk-llama/models/mistral3.cpp +160 -0
- data/ext/sources/examples/talk-llama/models/models.h +569 -0
- data/ext/sources/examples/talk-llama/models/modern-bert.cpp +116 -0
- data/ext/sources/examples/talk-llama/models/mpt.cpp +126 -0
- data/ext/sources/examples/talk-llama/models/nemotron-h.cpp +150 -0
- data/ext/sources/examples/talk-llama/models/nemotron.cpp +122 -0
- data/ext/sources/examples/talk-llama/models/neo-bert.cpp +104 -0
- data/ext/sources/examples/talk-llama/models/olmo.cpp +121 -0
- data/ext/sources/examples/talk-llama/models/olmo2.cpp +150 -0
- data/ext/sources/examples/talk-llama/models/olmoe.cpp +124 -0
- data/ext/sources/examples/talk-llama/models/openai-moe-iswa.cpp +127 -0
- data/ext/sources/examples/talk-llama/models/openelm.cpp +124 -0
- data/ext/sources/examples/talk-llama/models/orion.cpp +123 -0
- data/ext/sources/examples/talk-llama/models/pangu-embedded.cpp +121 -0
- data/ext/sources/examples/talk-llama/models/phi2.cpp +121 -0
- data/ext/sources/examples/talk-llama/models/phi3.cpp +152 -0
- data/ext/sources/examples/talk-llama/models/plamo.cpp +110 -0
- data/ext/sources/examples/talk-llama/models/plamo2.cpp +316 -0
- data/ext/sources/examples/talk-llama/models/plamo3.cpp +128 -0
- data/ext/sources/examples/talk-llama/models/plm.cpp +168 -0
- data/ext/sources/examples/talk-llama/models/qwen.cpp +108 -0
- data/ext/sources/examples/talk-llama/models/qwen2.cpp +126 -0
- data/ext/sources/examples/talk-llama/models/qwen2moe.cpp +151 -0
- data/ext/sources/examples/talk-llama/models/qwen2vl.cpp +117 -0
- data/ext/sources/examples/talk-llama/models/qwen3.cpp +117 -0
- data/ext/sources/examples/talk-llama/models/qwen3moe.cpp +124 -0
- data/ext/sources/examples/talk-llama/models/qwen3next.cpp +873 -0
- data/ext/sources/examples/talk-llama/models/qwen3vl-moe.cpp +149 -0
- data/ext/sources/examples/talk-llama/models/qwen3vl.cpp +141 -0
- data/ext/sources/examples/talk-llama/models/refact.cpp +94 -0
- data/ext/sources/examples/talk-llama/models/rnd1.cpp +126 -0
- data/ext/sources/examples/talk-llama/models/rwkv6-base.cpp +162 -0
- data/ext/sources/examples/talk-llama/models/rwkv6.cpp +94 -0
- data/ext/sources/examples/talk-llama/models/rwkv6qwen2.cpp +86 -0
- data/ext/sources/examples/talk-llama/models/rwkv7-base.cpp +135 -0
- data/ext/sources/examples/talk-llama/models/rwkv7.cpp +90 -0
- data/ext/sources/examples/talk-llama/models/seed-oss.cpp +124 -0
- data/ext/sources/examples/talk-llama/models/smallthinker.cpp +126 -0
- data/ext/sources/examples/talk-llama/models/smollm3.cpp +128 -0
- data/ext/sources/examples/talk-llama/models/stablelm.cpp +146 -0
- data/ext/sources/examples/talk-llama/models/starcoder.cpp +100 -0
- data/ext/sources/examples/talk-llama/models/starcoder2.cpp +121 -0
- data/ext/sources/examples/talk-llama/models/t5-dec.cpp +166 -0
- data/ext/sources/examples/talk-llama/models/t5-enc.cpp +96 -0
- data/ext/sources/examples/talk-llama/models/wavtokenizer-dec.cpp +149 -0
- data/ext/sources/examples/talk-llama/models/xverse.cpp +108 -0
- data/ext/sources/examples/talk-llama/unicode.cpp +102 -16
- data/ext/sources/examples/vad-speech-segments/CMakeLists.txt +1 -1
- data/ext/sources/examples/whisper.wasm/index-tmpl.html +1 -1
- data/ext/sources/ggml/CMakeLists.txt +82 -54
- data/ext/sources/ggml/include/ggml-alloc.h +9 -0
- data/ext/sources/ggml/include/ggml-backend.h +4 -1
- data/ext/sources/ggml/include/ggml-cpu.h +1 -0
- data/ext/sources/ggml/include/ggml-hexagon.h +19 -0
- data/ext/sources/ggml/include/ggml-rpc.h +8 -11
- data/ext/sources/ggml/include/ggml-zendnn.h +22 -0
- data/ext/sources/ggml/include/ggml.h +190 -12
- data/ext/sources/ggml/src/CMakeLists.txt +82 -11
- data/ext/sources/ggml/src/ggml-alloc.c +124 -41
- data/ext/sources/ggml/src/ggml-backend-impl.h +1 -4
- data/ext/sources/ggml/src/ggml-backend-reg.cpp +27 -3
- data/ext/sources/ggml/src/ggml-backend.cpp +71 -21
- data/ext/sources/ggml/src/ggml-blas/CMakeLists.txt +17 -3
- data/ext/sources/ggml/src/ggml-blas/ggml-blas.cpp +5 -9
- data/ext/sources/ggml/src/ggml-cann/acl_tensor.cpp +57 -45
- data/ext/sources/ggml/src/ggml-cann/acl_tensor.h +138 -47
- data/ext/sources/ggml/src/ggml-cann/aclnn_ops.cpp +2179 -1696
- data/ext/sources/ggml/src/ggml-cann/aclnn_ops.h +238 -317
- data/ext/sources/ggml/src/ggml-cann/common.h +283 -208
- data/ext/sources/ggml/src/ggml-cann/ggml-cann.cpp +626 -776
- data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +156 -86
- data/ext/sources/ggml/src/ggml-cpu/amx/amx.cpp +1 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +4 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/arm/quants.c +428 -26
- data/ext/sources/ggml/src/ggml-cpu/arch/arm/repack.cpp +1004 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/loongarch/quants.c +4 -5
- data/ext/sources/ggml/src/ggml-cpu/arch/riscv/cpu-feats.cpp +38 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/riscv/quants.c +108 -49
- data/ext/sources/ggml/src/ggml-cpu/arch/s390/cpu-feats.cpp +50 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/x86/repack.cpp +6 -6
- data/ext/sources/ggml/src/ggml-cpu/arch-fallback.h +50 -2
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-impl.h +5 -3
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +195 -71
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.cpp +4 -0
- data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.cpp +573 -106
- data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.h +33 -44
- data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +298 -112
- data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm-ppc.h +333 -0
- data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +819 -125
- data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.h +6 -0
- data/ext/sources/ggml/src/ggml-cpu/ops.cpp +708 -431
- data/ext/sources/ggml/src/ggml-cpu/ops.h +5 -4
- data/ext/sources/ggml/src/ggml-cpu/repack.cpp +671 -31
- data/ext/sources/ggml/src/ggml-cpu/repack.h +14 -0
- data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +41 -43
- data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.cpp +3 -2
- data/ext/sources/ggml/src/ggml-cpu/unary-ops.cpp +151 -0
- data/ext/sources/ggml/src/ggml-cpu/unary-ops.h +7 -0
- data/ext/sources/ggml/src/ggml-cpu/vec.cpp +124 -1
- data/ext/sources/ggml/src/ggml-cpu/vec.h +261 -146
- data/ext/sources/ggml/src/ggml-cuda/CMakeLists.txt +72 -1
- data/ext/sources/ggml/src/ggml-cuda/argmax.cu +2 -2
- data/ext/sources/ggml/src/ggml-cuda/argsort.cu +123 -6
- data/ext/sources/ggml/src/ggml-cuda/argsort.cuh +16 -0
- data/ext/sources/ggml/src/ggml-cuda/binbcast.cu +1 -1
- data/ext/sources/ggml/src/ggml-cuda/common.cuh +353 -80
- data/ext/sources/ggml/src/ggml-cuda/convert.cuh +10 -0
- data/ext/sources/ggml/src/ggml-cuda/cpy-utils.cuh +1 -1
- data/ext/sources/ggml/src/ggml-cuda/cpy.cu +339 -246
- data/ext/sources/ggml/src/ggml-cuda/cpy.cuh +1 -5
- data/ext/sources/ggml/src/ggml-cuda/cumsum.cu +307 -0
- data/ext/sources/ggml/src/ggml-cuda/cumsum.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/diag.cu +77 -0
- data/ext/sources/ggml/src/ggml-cuda/diag.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +31 -21
- data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +663 -596
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cu +35 -741
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cuh +1241 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn-vec.cuh +30 -37
- data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +14 -13
- data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cuh +48 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn.cu +83 -37
- data/ext/sources/ggml/src/ggml-cuda/fill.cu +37 -0
- data/ext/sources/ggml/src/ggml-cuda/fill.cuh +3 -0
- data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +1155 -164
- data/ext/sources/ggml/src/ggml-cuda/mean.cu +5 -4
- data/ext/sources/ggml/src/ggml-cuda/mma.cuh +741 -48
- data/ext/sources/ggml/src/ggml-cuda/mmf.cu +60 -12
- data/ext/sources/ggml/src/ggml-cuda/mmf.cuh +381 -42
- data/ext/sources/ggml/src/ggml-cuda/mmid.cu +164 -0
- data/ext/sources/ggml/src/ggml-cuda/mmid.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/mmq.cu +69 -176
- data/ext/sources/ggml/src/ggml-cuda/mmq.cuh +498 -171
- data/ext/sources/ggml/src/ggml-cuda/mmvf.cu +375 -79
- data/ext/sources/ggml/src/ggml-cuda/mmvf.cuh +3 -2
- data/ext/sources/ggml/src/ggml-cuda/mmvq.cu +241 -95
- data/ext/sources/ggml/src/ggml-cuda/mmvq.cuh +1 -1
- data/ext/sources/ggml/src/ggml-cuda/pad.cu +64 -33
- data/ext/sources/ggml/src/ggml-cuda/quantize.cu +151 -0
- data/ext/sources/ggml/src/ggml-cuda/quantize.cuh +14 -0
- data/ext/sources/ggml/src/ggml-cuda/rope.cu +192 -77
- data/ext/sources/ggml/src/ggml-cuda/rope.cuh +2 -0
- data/ext/sources/ggml/src/ggml-cuda/set-rows.cu +101 -47
- data/ext/sources/ggml/src/ggml-cuda/set.cu +39 -0
- data/ext/sources/ggml/src/ggml-cuda/set.cuh +7 -0
- data/ext/sources/ggml/src/ggml-cuda/softmax.cu +203 -6
- data/ext/sources/ggml/src/ggml-cuda/solve_tri.cu +275 -0
- data/ext/sources/ggml/src/ggml-cuda/solve_tri.cuh +3 -0
- data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cu +14 -20
- data/ext/sources/ggml/src/ggml-cuda/ssm-scan.cu +49 -84
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq112-dv112.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq128-dv128.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq256-dv256.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq40-dv40.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq576-dv512.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq64-dv64.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq72-dv72.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq80-dv80.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq96-dv96.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +19 -1
- data/ext/sources/ggml/src/ggml-cuda/top-k.cu +96 -0
- data/ext/sources/ggml/src/ggml-cuda/top-k.cuh +3 -0
- data/ext/sources/ggml/src/ggml-cuda/topk-moe.cu +168 -76
- data/ext/sources/ggml/src/ggml-cuda/topk-moe.cuh +11 -4
- data/ext/sources/ggml/src/ggml-cuda/tri.cu +136 -0
- data/ext/sources/ggml/src/ggml-cuda/tri.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/unary.cu +105 -11
- data/ext/sources/ggml/src/ggml-cuda/unary.cuh +36 -0
- data/ext/sources/ggml/src/ggml-cuda/upscale.cu +163 -7
- data/ext/sources/ggml/src/ggml-cuda/vendors/cuda.h +4 -0
- data/ext/sources/ggml/src/ggml-cuda/vendors/hip.h +12 -1
- data/ext/sources/ggml/src/ggml-cuda/vendors/musa.h +6 -0
- data/ext/sources/ggml/src/ggml-hexagon/CMakeLists.txt +80 -0
- data/ext/sources/ggml/src/ggml-hexagon/ggml-hexagon.cpp +3151 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/CMakeLists.txt +44 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/act-ops.c +682 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/binary-ops.c +360 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/cmake-toolchain.cmake +157 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/flash-attn-ops.c +566 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/get-rows-ops.c +112 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ctx.h +35 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp-dma.c +63 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp-dma.h +157 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp-msg.h +165 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ops.h +92 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp_iface.idl +16 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-exp.c +94 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-inverse.c +72 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sigmoid.c +49 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-utils.c +1020 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-utils.h +1353 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/main.c +1001 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/matmul-ops.c +2503 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/ops-utils.h +149 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/rope-ops.c +487 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/set-rows-ops.c +168 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/softmax-ops.c +402 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/unary-ops.c +287 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/worker-pool.c +297 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/worker-pool.h +57 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp-utils.c +454 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp-utils.h +221 -0
- data/ext/sources/ggml/src/ggml-hexagon/op-desc.h +153 -0
- data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +8 -13
- data/ext/sources/ggml/src/ggml-impl.h +67 -6
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-common.cpp +2 -2
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.m +29 -20
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.cpp +652 -285
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.h +103 -56
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.m +496 -118
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +231 -9
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.cpp +1227 -224
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.h +12 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal.cpp +14 -8
- data/ext/sources/ggml/src/ggml-metal/ggml-metal.metal +1972 -704
- data/ext/sources/ggml/src/ggml-musa/CMakeLists.txt +3 -1
- data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +11 -0
- data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +1430 -120
- data/ext/sources/ggml/src/ggml-opencl/kernels/cvt.cl +63 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/expm1.cl +82 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/fill.cl +17 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/flash_attn_f32.cl +4 -3
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_mxfp4_f32.cl +162 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_mxfp4_f32.cl +156 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/get_rows.cl +36 -12
- data/ext/sources/ggml/src/ggml-opencl/kernels/mean.cl +39 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_kq_kqv.cl +273 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_l4_lm.cl +24 -10
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_f32_f32_l4_lm.cl +24 -10
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_l4_lm.cl +154 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/pad.cl +29 -20
- data/ext/sources/ggml/src/ggml-opencl/kernels/rms_norm.cl +25 -10
- data/ext/sources/ggml/src/ggml-opencl/kernels/rope.cl +50 -24
- data/ext/sources/ggml/src/ggml-opencl/kernels/set_rows.cl +35 -16
- data/ext/sources/ggml/src/ggml-opencl/kernels/softplus.cl +88 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/sqr.cl +53 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/sqrt.cl +53 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/ssm_conv.cl +77 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/transpose.cl +13 -0
- data/ext/sources/ggml/src/ggml-rpc/ggml-rpc.cpp +438 -156
- data/ext/sources/ggml/src/ggml-sycl/CMakeLists.txt +48 -3
- data/ext/sources/ggml/src/ggml-sycl/add-id.cpp +77 -0
- data/ext/sources/ggml/src/ggml-sycl/add-id.hpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/backend.hpp +6 -0
- data/ext/sources/ggml/src/ggml-sycl/binbcast.cpp +0 -9
- data/ext/sources/ggml/src/ggml-sycl/binbcast.hpp +0 -6
- data/ext/sources/ggml/src/ggml-sycl/common.hpp +117 -15
- data/ext/sources/ggml/src/ggml-sycl/concat.cpp +55 -44
- data/ext/sources/ggml/src/ggml-sycl/convert.cpp +34 -0
- data/ext/sources/ggml/src/ggml-sycl/count-equal.cpp +79 -0
- data/ext/sources/ggml/src/ggml-sycl/count-equal.hpp +9 -0
- data/ext/sources/ggml/src/ggml-sycl/cpy.cpp +0 -3
- data/ext/sources/ggml/src/ggml-sycl/dequantize.hpp +18 -0
- data/ext/sources/ggml/src/ggml-sycl/dpct/helper.hpp +76 -3
- data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +333 -300
- data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +10 -2
- data/ext/sources/ggml/src/ggml-sycl/ggml-sycl.cpp +335 -110
- data/ext/sources/ggml/src/ggml-sycl/mmvq.cpp +22 -0
- data/ext/sources/ggml/src/ggml-sycl/norm.cpp +156 -0
- data/ext/sources/ggml/src/ggml-sycl/norm.hpp +2 -0
- data/ext/sources/ggml/src/ggml-sycl/pad.cpp +97 -0
- data/ext/sources/ggml/src/ggml-sycl/pad.hpp +24 -0
- data/ext/sources/ggml/src/ggml-sycl/pad_reflect_1d.cpp +100 -0
- data/ext/sources/ggml/src/ggml-sycl/pad_reflect_1d.hpp +10 -0
- data/ext/sources/ggml/src/ggml-sycl/presets.hpp +2 -0
- data/ext/sources/ggml/src/ggml-sycl/repeat_back.cpp +76 -0
- data/ext/sources/ggml/src/ggml-sycl/repeat_back.hpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/roll.cpp +122 -0
- data/ext/sources/ggml/src/ggml-sycl/roll.hpp +20 -0
- data/ext/sources/ggml/src/ggml-sycl/rope.cpp +30 -17
- data/ext/sources/ggml/src/ggml-sycl/set.cpp +73 -0
- data/ext/sources/ggml/src/ggml-sycl/set.hpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/softmax.cpp +327 -162
- data/ext/sources/ggml/src/ggml-sycl/softmax.hpp +4 -0
- data/ext/sources/ggml/src/ggml-sycl/ssm_conv.cpp +127 -0
- data/ext/sources/ggml/src/ggml-sycl/ssm_conv.hpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +58 -0
- data/ext/sources/ggml/src/ggml-vulkan/CMakeLists.txt +38 -18
- data/ext/sources/ggml/src/ggml-vulkan/ggml-vulkan.cpp +5013 -2859
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/abs.comp +21 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/acc.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add1.comp +28 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add_id.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/arange.comp +20 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argmax.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp +33 -26
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argsort_large.comp +114 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/ceil.comp +22 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/clamp.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/concat.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/contig_copy.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_dw.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +47 -49
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +3 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +4 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_transpose.comp +67 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/cos.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/count_equal.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/count_experts.comp +51 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/cumsum.comp +83 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/cumsum_multipass1.comp +60 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/cumsum_multipass2.comp +66 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_f32.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{dequant_funcs.comp → dequant_funcs.glsl} +9 -21
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{dequant_funcs_cm2.comp → dequant_funcs_cm2.glsl} +18 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{dequant_head.comp → dequant_head.glsl} +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_m.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_s.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_s.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xs.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xxs.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_s.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_xxs.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_nl.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_xs.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_mxfp4.comp +3 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q2_k.comp +3 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q3_k.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_0.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_1.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_k.comp +3 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_0.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_1.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_k.comp +3 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q6_k.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q8_0.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/diag.comp +29 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/diag_mask_inf.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/div.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/exp.comp +3 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/fill.comp +19 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +39 -17
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{flash_attn_base.comp → flash_attn_base.glsl} +19 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +45 -7
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +50 -12
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/floor.comp +22 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu_erf.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu_quick.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gelu.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gelu_erf.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gelu_quick.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{generic_binary_head.comp → generic_binary_head.glsl} +17 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{generic_head.comp → generic_head.glsl} +2 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{generic_unary_head.comp → generic_unary_head.glsl} +7 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/get_rows.comp +4 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/get_rows_quant.comp +3 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{glu_head.comp → glu_head.glsl} +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/group_norm.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/hardsigmoid.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/hardswish.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +19 -7
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col_3d.comp +2 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/l2_norm.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/leaky_relu.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/log.comp +18 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{mul_mat_vec_base.comp → mul_mat_vec_base.glsl} +70 -25
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iface.glsl +35 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_m.comp +71 -21
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_s.comp +41 -25
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_s.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xs.comp +44 -26
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xxs.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_s.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_xxs.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_nc.comp +9 -7
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_p021.comp +9 -7
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp +4 -6
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q3_k.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp +4 -6
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp +4 -6
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp +39 -36
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq_funcs.glsl +494 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +78 -103
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +34 -23
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{mul_mm_funcs.comp → mul_mm_funcs.glsl} +69 -59
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_id_funcs.glsl +72 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +88 -228
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.glsl +454 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_shmem_types.glsl +78 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp +97 -13
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/neg.comp +20 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/norm.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_adamw.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_sgd.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/pad.comp +21 -6
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/pool2d.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/quantize_q8_1.comp +10 -10
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/reglu.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/relu.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/repeat.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/repeat_back.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +50 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_back.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_partials.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/roll.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl +234 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.glsl +20 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +6 -50
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +6 -33
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +6 -33
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl +28 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_vision.comp +6 -39
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/round.comp +29 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sigmoid.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/silu.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/silu_back.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sin.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_back.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large1.comp +62 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large2.comp +79 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large3.comp +65 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large_common.glsl +53 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/softplus.comp +23 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/solve_tri.comp +81 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sqrt.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/square.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/ssm_conv.comp +44 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/ssm_scan.comp +124 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/step.comp +22 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sub.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.comp +2 -25
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.glsl +25 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/swiglu.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/swiglu_oai.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/tanh.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/timestep_embedding.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/topk_argsort.comp +118 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/topk_moe.comp +213 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/topk_nary_search.comp +246 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/tri.comp +43 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/trunc.comp +22 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{types.comp → types.glsl} +345 -26
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +90 -12
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +335 -151
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/xielu.comp +35 -0
- data/ext/sources/ggml/src/ggml-webgpu/CMakeLists.txt +28 -2
- data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp +169 -0
- data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu.cpp +1964 -435
- data/ext/sources/ggml/src/ggml-webgpu/pre_wgsl.hpp +778 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/bin_op.tmpl.wgsl +188 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cpy.tmpl.wgsl +101 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +33 -10
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn.wgsl +591 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/get_rows.tmpl.wgsl +1 -1
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/glu.tmpl.wgsl +323 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl +6 -6
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl +97 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.tmpl.wgsl +247 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_subgroup_matrix.tmpl.wgsl +302 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.tmpl.wgsl +267 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm.wgsl +83 -17
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rope.tmpl.wgsl +295 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/scale.tmpl.wgsl +90 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.tmpl.wgsl +112 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/soft_max.tmpl.wgsl +345 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/unary_op.wgsl +483 -0
- data/ext/sources/ggml/src/ggml-zendnn/CMakeLists.txt +92 -0
- data/ext/sources/ggml/src/ggml-zendnn/ggml-zendnn.cpp +466 -0
- data/ext/sources/ggml/src/ggml.c +425 -33
- data/ext/sources/include/whisper.h +1 -0
- data/ext/sources/src/CMakeLists.txt +3 -1
- data/ext/sources/src/whisper.cpp +101 -35
- data/ext/sources/tests/CMakeLists.txt +2 -2
- data/ext/sources/tests/test-vad-full.cpp +4 -2
- data/ext/sources/tests/test-vad.cpp +1 -1
- data/extsources.rb +1 -0
- data/lib/whisper/model/uri.rb +17 -18
- data/sig/whisper.rbs +119 -2
- data/test/test_params.rb +16 -8
- data/test/test_segment.rb +0 -1
- data/test/test_token.rb +70 -0
- data/test/test_vad.rb +1 -1
- data/test/test_vad_context.rb +50 -0
- data/test/test_vad_segment.rb +19 -0
- data/test/test_vad_segments.rb +16 -0
- data/test/test_whisper.rb +7 -0
- data/whispercpp.gemspec +1 -1
- metadata +287 -34
- data/ext/sources/build-xcframework.sh +0 -571
- data/ext/sources/ggml/src/ggml-cann/Doxyfile +0 -2579
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.comp +0 -105
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.comp +0 -55
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/add.tmpl.wgsl +0 -44
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/add_in_place.tmpl.wgsl +0 -41
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl +0 -60
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul.tmpl.wgsl +0 -44
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_in_place.tmpl.wgsl +0 -41
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm_in_place.wgsl +0 -48
- /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{test_bfloat16_support.comp → feature-tests/bfloat16.comp} +0 -0
- /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{test_coopmat_support.comp → feature-tests/coopmat.comp} +0 -0
- /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{test_coopmat2_support.comp → feature-tests/coopmat2.comp} +0 -0
- /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{test_integer_dot_support.comp → feature-tests/integer_dot.comp} +0 -0
- /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{glu_main.comp → glu_main.glsl} +0 -0
- /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{rte.comp → rte.glsl} +0 -0
- /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{utils.comp → utils.glsl} +0 -0
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
#ifndef HTP_CTX_H
|
|
2
|
+
#define HTP_CTX_H
|
|
3
|
+
|
|
4
|
+
#include "htp-dma.h"
|
|
5
|
+
#include "worker-pool.h"
|
|
6
|
+
|
|
7
|
+
#include <assert.h>
|
|
8
|
+
#include <dspqueue.h>
|
|
9
|
+
#include <stdatomic.h>
|
|
10
|
+
#include <stdint.h>
|
|
11
|
+
|
|
12
|
+
#define HTP_MAX_NTHREADS 10
|
|
13
|
+
|
|
14
|
+
// Main context for htp DSP backend
|
|
15
|
+
struct htp_context {
|
|
16
|
+
dspqueue_t queue;
|
|
17
|
+
dma_queue * dma[HTP_MAX_NTHREADS];
|
|
18
|
+
worker_pool_context_t worker_pool;
|
|
19
|
+
uint32_t n_threads;
|
|
20
|
+
|
|
21
|
+
int thread_id;
|
|
22
|
+
int thread_prio;
|
|
23
|
+
|
|
24
|
+
uint8_t * vtcm_base;
|
|
25
|
+
size_t vtcm_size;
|
|
26
|
+
uint32_t vtcm_rctx;
|
|
27
|
+
|
|
28
|
+
atomic_bool vtcm_valid;
|
|
29
|
+
atomic_bool vtcm_inuse;
|
|
30
|
+
atomic_bool vtcm_needs_release;
|
|
31
|
+
|
|
32
|
+
uint32_t opmask;
|
|
33
|
+
};
|
|
34
|
+
|
|
35
|
+
#endif /* HTP_CTX_H */
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
#include "htp-dma.h"
|
|
2
|
+
|
|
3
|
+
#include <stdbool.h>
|
|
4
|
+
#include <stdlib.h>
|
|
5
|
+
#include <string.h>
|
|
6
|
+
|
|
7
|
+
#pragma clang diagnostic ignored "-Wunused-function"
|
|
8
|
+
|
|
9
|
+
/*
 * Round x up to the nearest power of two.
 *
 * Returns 1 for x <= 1. For x above 2^31 the result does not fit in 32 bits
 * and wraps to 0.
 */
static inline uint32_t pow2_ceil(uint32_t x) {
    if (x <= 1) {
        return 1;
    }

    // Unsigned accumulator: with a signed int the last shifts overflow
    // (undefined behaviour) once the result reaches 2^31.
    uint32_t p = 2;

    x--;
    while (x >>= 1) {
        p <<= 1;
    }
    return p;
}
|
|
20
|
+
|
|
21
|
+
dma_queue * dma_queue_create(size_t capacity) {
|
|
22
|
+
dma_queue * q = (dma_queue *) memalign(32, sizeof(dma_queue));
|
|
23
|
+
if (q == NULL) {
|
|
24
|
+
FARF(ERROR, "%s: failed to allocate DMA queue\n", __FUNCTION__);
|
|
25
|
+
return NULL;
|
|
26
|
+
}
|
|
27
|
+
|
|
28
|
+
capacity = pow2_ceil(capacity);
|
|
29
|
+
|
|
30
|
+
memset(q, 0, sizeof(dma_queue));
|
|
31
|
+
q->capacity = capacity;
|
|
32
|
+
q->idx_mask = capacity - 1;
|
|
33
|
+
|
|
34
|
+
q->desc = (hexagon_udma_descriptor_type1_t *) memalign(64, capacity * sizeof(hexagon_udma_descriptor_type1_t));
|
|
35
|
+
memset(q->desc, 0, capacity * sizeof(hexagon_udma_descriptor_type1_t));
|
|
36
|
+
|
|
37
|
+
q->dptr = (dma_ptr *) memalign(4, capacity * sizeof(dma_ptr));
|
|
38
|
+
memset(q->dptr, 0, capacity * sizeof(dma_ptr));
|
|
39
|
+
|
|
40
|
+
q->tail = &q->desc[capacity - 1];
|
|
41
|
+
|
|
42
|
+
if (!q->desc && !q->dptr) {
|
|
43
|
+
FARF(ERROR, "%s: failed to allocate DMA queue items\n", __FUNCTION__);
|
|
44
|
+
return NULL;
|
|
45
|
+
}
|
|
46
|
+
|
|
47
|
+
FARF(HIGH, "dma-queue: capacity %u\n", capacity);
|
|
48
|
+
|
|
49
|
+
return q;
|
|
50
|
+
}
|
|
51
|
+
|
|
52
|
+
/* Free a queue and the two arrays it owns; a NULL q is ignored. */
void dma_queue_delete(dma_queue * q) {
    if (q != NULL) {
        free(q->desc);
        free(q->dptr);
        free(q);
    }
}
|
|
60
|
+
|
|
61
|
+
/* Keep popping until dma_queue_pop() hands back an entry whose dst is NULL. */
void dma_queue_flush(dma_queue * q) {
    for (;;) {
        const dma_ptr done = dma_queue_pop(q);
        if (done.dst == NULL) {
            break;
        }
    }
}
|
|
@@ -0,0 +1,157 @@
|
|
|
1
|
+
#ifndef HTP_DMA_H
|
|
2
|
+
#define HTP_DMA_H
|
|
3
|
+
|
|
4
|
+
#include <HAP_farf.h>
|
|
5
|
+
#include <hexagon_protos.h>
|
|
6
|
+
#include <hexagon_types.h>
|
|
7
|
+
#include <stdbool.h>
|
|
8
|
+
#include <stdint.h>
|
|
9
|
+
|
|
10
|
+
#ifdef __cplusplus
|
|
11
|
+
extern "C" {
|
|
12
|
+
#endif
|
|
13
|
+
|
|
14
|
+
/* Destination/source address pair recorded for one DMA transfer. */
typedef struct {
    void *       dst;  // destination address
    const void * src;  // source address (never written through this pointer)
} dma_ptr;
|
|
18
|
+
|
|
19
|
+
typedef struct {
|
|
20
|
+
hexagon_udma_descriptor_type1_t * desc; // descriptor pointers
|
|
21
|
+
hexagon_udma_descriptor_type1_t * tail; // tail pointer
|
|
22
|
+
dma_ptr * dptr; // dst/src pointers
|
|
23
|
+
uint32_t push_idx;
|
|
24
|
+
uint32_t pop_idx;
|
|
25
|
+
uint32_t capacity;
|
|
26
|
+
uint32_t idx_mask;
|
|
27
|
+
} dma_queue;
|
|
28
|
+
|
|
29
|
+
dma_queue * dma_queue_create(size_t capacity);
|
|
30
|
+
void dma_queue_delete(dma_queue * q);
|
|
31
|
+
void dma_queue_flush(dma_queue * q);
|
|
32
|
+
|
|
33
|
+
// TODO: technically we don't need these and could use Q6_dmstart/wait/etc instead
|
|
34
|
+
// but those do not seem to always compile properly.
|
|
35
|
+
// Issues `release(next):at` followed by `dmstart(next)`.
// NOTE(review): presumably the release publishes earlier descriptor writes before the DMA engine is started at `next` - confirm against the Hexagon ISA manual.
static inline void dmstart(void * next) {
|
|
36
|
+
asm volatile(" release(%0):at" : : "r"(next));
|
|
37
|
+
asm volatile(" dmstart(%0)" : : "r"(next));
|
|
38
|
+
}
|
|
39
|
+
|
|
40
|
+
// Issues `release(next):at` followed by `dmlink(cur, next)`.
// NOTE(review): presumably this appends descriptor `next` after `cur` in the DMA chain - confirm against the Hexagon ISA manual.
static inline void dmlink(void * cur, void * next) {
|
|
41
|
+
asm volatile(" release(%0):at" : : "r"(next));
|
|
42
|
+
asm volatile(" dmlink(%0, %1)" : : "r"(cur), "r"(next));
|
|
43
|
+
}
|
|
44
|
+
|
|
45
|
+
static inline unsigned int dmpoll(void) {
|
|
46
|
+
unsigned int ret = 0;
|
|
47
|
+
asm volatile(" %0 = dmpoll" : "=r"(ret) : : "memory");
|
|
48
|
+
return ret;
|
|
49
|
+
}
|
|
50
|
+
|
|
51
|
+
static inline unsigned int dmwait(void) {
|
|
52
|
+
unsigned int ret = 0;
|
|
53
|
+
asm volatile(" %0 = dmwait" : "=r"(ret) : : "memory");
|
|
54
|
+
return ret;
|
|
55
|
+
}
|
|
56
|
+
|
|
57
|
+
static inline dma_ptr dma_make_ptr(void *dst, const void *src)
|
|
58
|
+
{
|
|
59
|
+
dma_ptr p = { dst, src };
|
|
60
|
+
return p;
|
|
61
|
+
}
|
|
62
|
+
|
|
63
|
+
static inline bool dma_queue_push(dma_queue * q,
|
|
64
|
+
dma_ptr dptr,
|
|
65
|
+
size_t dst_row_size,
|
|
66
|
+
size_t src_row_size,
|
|
67
|
+
size_t width, // width in bytes. number of bytes to transfer per row
|
|
68
|
+
size_t nrows) {
|
|
69
|
+
if (((q->push_idx + 1) & q->idx_mask) == q->pop_idx) {
|
|
70
|
+
FARF(ERROR, "dma-push: queue full\n");
|
|
71
|
+
return false;
|
|
72
|
+
}
|
|
73
|
+
|
|
74
|
+
hexagon_udma_descriptor_type1_t * desc = &q->desc[q->push_idx];
|
|
75
|
+
|
|
76
|
+
desc->next = NULL;
|
|
77
|
+
desc->length = 0;
|
|
78
|
+
desc->desctype = HEXAGON_UDMA_DESC_DESCTYPE_TYPE1;
|
|
79
|
+
desc->dstbypass = 1;
|
|
80
|
+
desc->srcbypass = 1;
|
|
81
|
+
#if __HVX_ARCH__ >= 73
|
|
82
|
+
desc->dstbypass = 1;
|
|
83
|
+
desc->srcbypass = 1;
|
|
84
|
+
#else
|
|
85
|
+
desc->dstbypass = 0;
|
|
86
|
+
desc->srcbypass = 1;
|
|
87
|
+
#endif
|
|
88
|
+
desc->order = 0;
|
|
89
|
+
desc->dstate = HEXAGON_UDMA_DESC_DSTATE_INCOMPLETE;
|
|
90
|
+
desc->src = (void *) dptr.src;
|
|
91
|
+
desc->dst = (void *) dptr.dst;
|
|
92
|
+
desc->allocation = 0;
|
|
93
|
+
desc->padding = 0;
|
|
94
|
+
desc->roiwidth = width;
|
|
95
|
+
desc->roiheight = nrows;
|
|
96
|
+
desc->srcstride = src_row_size;
|
|
97
|
+
desc->dststride = dst_row_size;
|
|
98
|
+
desc->srcwidthoffset = 0;
|
|
99
|
+
desc->dstwidthoffset = 0;
|
|
100
|
+
|
|
101
|
+
q->dptr[q->push_idx] = dptr;
|
|
102
|
+
|
|
103
|
+
dmlink(q->tail, desc);
|
|
104
|
+
q->tail = desc;
|
|
105
|
+
|
|
106
|
+
// FARF(ERROR, "dma-push: i %u len %u dst %p src %p\n", q->push_idx, len, dst, src);
|
|
107
|
+
q->push_idx = (q->push_idx + 1) & q->idx_mask;
|
|
108
|
+
return true;
|
|
109
|
+
}
|
|
110
|
+
|
|
111
|
+
static inline bool dma_queue_push_ddr_to_vtcm(dma_queue * q,
|
|
112
|
+
dma_ptr dptr,
|
|
113
|
+
size_t dst_row_size,
|
|
114
|
+
size_t src_row_size,
|
|
115
|
+
size_t nrows) {
|
|
116
|
+
return dma_queue_push(q, dptr, dst_row_size, src_row_size, src_row_size, nrows);
|
|
117
|
+
}
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
static inline bool dma_queue_push_vtcm_to_ddr(dma_queue * q,
|
|
121
|
+
dma_ptr dptr,
|
|
122
|
+
size_t dst_row_size,
|
|
123
|
+
size_t src_row_size,
|
|
124
|
+
size_t nrows) {
|
|
125
|
+
return dma_queue_push(q, dptr, dst_row_size, src_row_size, dst_row_size, nrows);
|
|
126
|
+
}
|
|
127
|
+
|
|
128
|
+
/*
 * Wait for the oldest queued descriptor to complete and return the dst/src
 * pair that was pushed with it. Returns a pair with dst == NULL when the
 * queue is empty.
 */
static inline dma_ptr dma_queue_pop(dma_queue * q) {
    if (q->push_idx == q->pop_idx) {
        dma_ptr none = { NULL };
        return none;
    }

    const uint32_t                    slot = q->pop_idx;
    hexagon_udma_descriptor_type1_t * desc = &q->desc[slot];

    // Poll until the descriptor in this slot is marked complete.
    do {
        dmpoll();
    } while (desc->dstate != HEXAGON_UDMA_DESC_DSTATE_COMPLETE);

    const dma_ptr done = q->dptr[slot];

    q->pop_idx = (slot + 1) & q->idx_mask;
    return done;
}
|
|
152
|
+
|
|
153
|
+
#ifdef __cplusplus
|
|
154
|
+
} // extern "C"
|
|
155
|
+
#endif
|
|
156
|
+
|
|
157
|
+
#endif /* HTP_DMA_H */
|
|
@@ -0,0 +1,165 @@
|
|
|
1
|
+
#ifndef HTP_MSG_H
|
|
2
|
+
#define HTP_MSG_H
|
|
3
|
+
|
|
4
|
+
#include <assert.h>
|
|
5
|
+
|
|
6
|
+
// ggml-common.h must be included prior to this header
|
|
7
|
+
|
|
8
|
+
/*
 * Mask to enable various stages of the Ops.
 * Used for debugging and profiling.
 */
enum {
    HTP_OPMASK_QUEUE    = 0x1,  // bit 0: Enable Queueing (ie calls into the DSP)
    HTP_OPMASK_QUANTIZE = 0x2,  // bit 1: Enable Quantize
    HTP_OPMASK_COMPUTE  = 0x4,  // bit 2: Enable Compute
};
|
|
15
|
+
|
|
16
|
+
/* Op flags */
enum {
    HTP_OPFLAGS_SKIP_QUANTIZE = 0x1,  // bit 0: Skip dynamic quantization (reuse quantized tensors)
    HTP_OPFLAGS_SKIP_COMPUTE  = 0x2,  // bit 1: Skip actual computation (used for profiling)
    HTP_OPFLAGS_EARLY_WAKEUP  = 0x4   // bit 2: Send early wakeup notification
};
|
|
22
|
+
|
|
23
|
+
/* Status codes; struct htp_general_rsp documents its status field as HTP_STATUS_... */
enum htp_status {
    HTP_STATUS_OK             = 1,
    HTP_STATUS_INTERNAL_ERR   = 2,
    HTP_STATUS_NO_SUPPORT     = 3,
    HTP_STATUS_INVAL_PARAMS   = 4,
    HTP_STATUS_VTCM_TOO_SMALL = 5,
};
|
|
30
|
+
|
|
31
|
+
/*
 * The values must match the ggml_type.
 * Duplicated here because we can't include full ggml.h in the htp build.
 * We have some static_asserts in the cpp code to ensure things are in sync.
 */
enum htp_data_type {
    HTP_TYPE_F32   = 0,
    HTP_TYPE_F16   = 1,
    HTP_TYPE_Q4_0  = 2,
    HTP_TYPE_Q8_0  = 8,
    HTP_TYPE_I32   = 26,
    HTP_TYPE_I64   = 27,
    HTP_TYPE_MXFP4 = 39,
    HTP_TYPE_COUNT  // one past the largest value listed above
};
|
|
44
|
+
|
|
45
|
+
/*
 * These values are manually translated over to HTP
 * !!!! DO NOT ALTER THE ORDER OF THE FIRST FOUR ENUMS !!!!
 */
enum htp_op {
    /* binary element-wise ops (order is fixed, see above) */
    HTP_OP_MUL = 0,
    HTP_OP_ADD = 1,
    HTP_OP_SUB = 2,
    HTP_OP_DIV = 3,

    HTP_OP_MUL_MAT        = 4,
    HTP_OP_MUL_MAT_ID     = 5,
    HTP_OP_RMS_NORM       = 6,
    HTP_OP_UNARY_SILU     = 7,
    HTP_OP_UNARY_GELU     = 8,
    HTP_OP_GLU_SWIGLU     = 9,
    HTP_OP_GLU_SWIGLU_OAI = 10,
    HTP_OP_SOFTMAX        = 11,
    HTP_OP_ADD_ID         = 12,
    HTP_OP_ROPE           = 13,
    HTP_OP_FLASH_ATTN_EXT = 14,
    HTP_OP_SET_ROWS       = 15,
    HTP_OP_SCALE          = 16,
    HTP_OP_GET_ROWS       = 17,

    INVALID  // one past the last op listed above
};
|
|
68
|
+
|
|
69
|
+
static inline size_t htp_type_block_size(uint32_t t) {
|
|
70
|
+
switch (t) {
|
|
71
|
+
case HTP_TYPE_F32:
|
|
72
|
+
return 1;
|
|
73
|
+
case HTP_TYPE_F16:
|
|
74
|
+
return 1;
|
|
75
|
+
case HTP_TYPE_Q4_0:
|
|
76
|
+
return QK4_0;
|
|
77
|
+
case HTP_TYPE_Q8_0:
|
|
78
|
+
return QK8_0;
|
|
79
|
+
case HTP_TYPE_MXFP4:
|
|
80
|
+
return QK_MXFP4;
|
|
81
|
+
default:
|
|
82
|
+
assert(0 && "unsupported HTP data type");
|
|
83
|
+
}
|
|
84
|
+
return 0;
|
|
85
|
+
}
|
|
86
|
+
|
|
87
|
+
static inline size_t htp_type_nbytes(uint32_t t) {
|
|
88
|
+
switch (t) {
|
|
89
|
+
case HTP_TYPE_F32:
|
|
90
|
+
return 4;
|
|
91
|
+
case HTP_TYPE_F16:
|
|
92
|
+
return 2;
|
|
93
|
+
case HTP_TYPE_Q4_0:
|
|
94
|
+
return sizeof(block_q4_0);
|
|
95
|
+
case HTP_TYPE_Q8_0:
|
|
96
|
+
return sizeof(block_q8_0);
|
|
97
|
+
case HTP_TYPE_MXFP4:
|
|
98
|
+
return sizeof(block_mxfp4);
|
|
99
|
+
default:
|
|
100
|
+
assert(0 && "unsupported HTP data type");
|
|
101
|
+
}
|
|
102
|
+
return 0;
|
|
103
|
+
}
|
|
104
|
+
|
|
105
|
+
static const char * htp_type_name(uint32_t t) {
|
|
106
|
+
switch (t) {
|
|
107
|
+
case HTP_TYPE_F32:
|
|
108
|
+
return "fp32";
|
|
109
|
+
case HTP_TYPE_F16:
|
|
110
|
+
return "fp16";
|
|
111
|
+
case HTP_TYPE_Q4_0:
|
|
112
|
+
return "q4_0";
|
|
113
|
+
case HTP_TYPE_Q8_0:
|
|
114
|
+
return "q8_0";
|
|
115
|
+
case HTP_TYPE_MXFP4:
|
|
116
|
+
return "mxfp4";
|
|
117
|
+
}
|
|
118
|
+
return 0;
|
|
119
|
+
}
|
|
120
|
+
|
|
121
|
+
// Internal types
|
|
122
|
+
#define QK_Q4_0x4x2 256 // 4x Q4_0 blocks packed with next 4x Q4_0 blocks (size in bytes 128)
|
|
123
|
+
#define QK_Q8_0x4x2 256 // 4x Q8_0 blocks concat with next 4x Q8_0 blocks
|
|
124
|
+
#define QK_MXFP4x4x2 256 // 4x MXFP4 blocks concat with next 4x MXFP4 blocks
|
|
125
|
+
|
|
126
|
+
#define HTP_MAX_DIMS 4
|
|
127
|
+
|
|
128
|
+
struct htp_tensor {
|
|
129
|
+
uint32_t data; // Buffer offset in the messages, and data pointer on the NSP
|
|
130
|
+
uint32_t type; // Data type
|
|
131
|
+
uint32_t ne[HTP_MAX_DIMS]; // Number of elements
|
|
132
|
+
uint32_t nb[HTP_MAX_DIMS]; // Stride in bytes (see ggml.h ggml_tensor)
|
|
133
|
+
};
|
|
134
|
+
|
|
135
|
+
#define HTP_MAX_OP_PARAMS 64
|
|
136
|
+
|
|
137
|
+
struct htp_general_req {
|
|
138
|
+
uint32_t op; // GGML/HTP Op
|
|
139
|
+
int32_t op_params[HTP_MAX_OP_PARAMS / sizeof(int32_t)];
|
|
140
|
+
// Params for the op, e.g. epsilon of RMS norm
|
|
141
|
+
uint32_t flags; // Request flags
|
|
142
|
+
|
|
143
|
+
struct htp_tensor src0; // Input0 tensor
|
|
144
|
+
struct htp_tensor src1; // Input1 tensor
|
|
145
|
+
struct htp_tensor src2; // Input2 tensor
|
|
146
|
+
struct htp_tensor src3; // Input3 tensor
|
|
147
|
+
struct htp_tensor src4; // Input4 tensor
|
|
148
|
+
struct htp_tensor dst; // Output tensor
|
|
149
|
+
|
|
150
|
+
// should be multiple of 64 bytes (cacheline)
|
|
151
|
+
};
|
|
152
|
+
|
|
153
|
+
/* General response: the op, a status and three profiling counters, padded to 64 bytes. */
struct htp_general_rsp {
    uint32_t op;      // GGML/HTP Op
    uint32_t status;  // HTP_STATUS_...

    /* profiling counters */
    uint32_t prof_usecs;   // Number of usec per request
    uint32_t prof_cycles;  // Number of cycles per request
    uint32_t prof_pkts;    // Number of instruction packets per request

    uint8_t unused[44];  // Pad to 64 bytes (5 * 4 + 44)
};
|
|
161
|
+
|
|
162
|
+
#define HTP_MAX_MESSAGE_SIZE sizeof(struct htp_general_req)
|
|
163
|
+
#define HTP_MAX_PACKET_BUFFERS 8
|
|
164
|
+
|
|
165
|
+
#endif /* HTP_MSG_H */
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
#ifndef HTP_OPS_H
|
|
2
|
+
#define HTP_OPS_H
|
|
3
|
+
|
|
4
|
+
#include "htp-ctx.h"
|
|
5
|
+
#include "htp-msg.h"
|
|
6
|
+
#include "worker-pool.h"
|
|
7
|
+
#include "ops-utils.h"
|
|
8
|
+
|
|
9
|
+
#include <assert.h>
|
|
10
|
+
#include <stdint.h>
|
|
11
|
+
|
|
12
|
+
// ggml-common.h must be included prior to this header
|
|
13
|
+
|
|
14
|
+
/* Buffer descriptor used for the *_spad members of struct htp_ops_context. */
struct htp_spad {
    uint8_t * data;
    size_t    stride, size, size_per_thread;
};
|
|
20
|
+
|
|
21
|
+
struct htp_ops_context {
|
|
22
|
+
struct htp_context * ctx;
|
|
23
|
+
|
|
24
|
+
enum htp_op op;
|
|
25
|
+
int32_t op_params[HTP_MAX_OP_PARAMS / sizeof(int32_t)];
|
|
26
|
+
|
|
27
|
+
struct htp_tensor src0;
|
|
28
|
+
struct htp_tensor src1;
|
|
29
|
+
struct htp_tensor src2;
|
|
30
|
+
struct htp_tensor src3;
|
|
31
|
+
struct htp_tensor src4;
|
|
32
|
+
struct htp_tensor dst;
|
|
33
|
+
|
|
34
|
+
struct htp_spad src0_spad;
|
|
35
|
+
struct htp_spad src1_spad;
|
|
36
|
+
struct htp_spad src2_spad;
|
|
37
|
+
struct htp_spad src3_spad;
|
|
38
|
+
struct htp_spad dst_spad;
|
|
39
|
+
|
|
40
|
+
worker_pool_context_t * wpool; // worker pool
|
|
41
|
+
uint32_t n_threads; // num threads
|
|
42
|
+
|
|
43
|
+
uint32_t src0_nrows_per_thread;
|
|
44
|
+
uint32_t src1_nrows_per_thread;
|
|
45
|
+
|
|
46
|
+
struct fastdiv_values src0_div1; // fastdiv values for ne1
|
|
47
|
+
struct fastdiv_values src0_div2; // fastdiv values for ne2
|
|
48
|
+
struct fastdiv_values src0_div3; // fastdiv values for ne3
|
|
49
|
+
struct fastdiv_values src0_div21; // fastdiv values for ne2 * ne1
|
|
50
|
+
|
|
51
|
+
struct fastdiv_values src1_div1; // fastdiv values for ne1
|
|
52
|
+
struct fastdiv_values src1_div2; // fastdiv values for ne2
|
|
53
|
+
struct fastdiv_values src1_div3; // fastdiv values for ne3
|
|
54
|
+
struct fastdiv_values src1_div21; // fastdiv values for ne2 * ne1
|
|
55
|
+
|
|
56
|
+
struct fastdiv_values src3_div1; // fastdiv values for ne1
|
|
57
|
+
struct fastdiv_values src3_div2; // fastdiv values for ne2
|
|
58
|
+
struct fastdiv_values src3_div3; // fastdiv values for ne3
|
|
59
|
+
struct fastdiv_values src3_div21; // fastdiv values for ne2 * ne1
|
|
60
|
+
|
|
61
|
+
struct fastdiv_values broadcast_rk2;
|
|
62
|
+
struct fastdiv_values broadcast_rk3;
|
|
63
|
+
struct fastdiv_values broadcast_rv2;
|
|
64
|
+
struct fastdiv_values broadcast_rv3;
|
|
65
|
+
|
|
66
|
+
struct fastdiv_values mm_div_ne12_ne1; // fastdiv values for ne12 * ne1
|
|
67
|
+
struct fastdiv_values mm_div_ne1; // fastdiv values for ne1
|
|
68
|
+
struct fastdiv_values mm_div_r2; // fastdiv values for ne12 / ne02
|
|
69
|
+
struct fastdiv_values mm_div_r3; // fastdiv values for ne13 / ne03
|
|
70
|
+
|
|
71
|
+
struct fastdiv_values set_rows_div_ne12; // fastdiv values for ne12
|
|
72
|
+
struct fastdiv_values set_rows_div_ne11; // fastdiv values for ne11
|
|
73
|
+
|
|
74
|
+
struct fastdiv_values get_rows_div_ne10; // fastdiv values for ne10
|
|
75
|
+
struct fastdiv_values get_rows_div_ne10_ne11; // fastdiv values for ne10 * ne11
|
|
76
|
+
|
|
77
|
+
uint32_t flags;
|
|
78
|
+
};
|
|
79
|
+
|
|
80
|
+
int op_matmul(struct htp_ops_context * octx);
|
|
81
|
+
int op_matmul_id(struct htp_ops_context * octx);
|
|
82
|
+
int op_binary(struct htp_ops_context * octx);
|
|
83
|
+
int op_unary(struct htp_ops_context * octx);
|
|
84
|
+
int op_activations(struct htp_ops_context * octx);
|
|
85
|
+
int op_softmax(struct htp_ops_context * octx);
|
|
86
|
+
int op_add_id(struct htp_ops_context * octx);
|
|
87
|
+
int op_rope(struct htp_ops_context * octx);
|
|
88
|
+
int op_flash_attn_ext(struct htp_ops_context * octx);
|
|
89
|
+
int op_set_rows(struct htp_ops_context * octx);
|
|
90
|
+
int op_get_rows(struct htp_ops_context * octx);
|
|
91
|
+
|
|
92
|
+
#endif /* HTP_OPS_H */
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
// FastRPC IDL interface for GGML HTP
|
|
2
|
+
|
|
3
|
+
#ifndef HTP_IDL
|
|
4
|
+
#define HTP_IDL
|
|
5
|
+
|
|
6
|
+
#include "AEEStdDef.idl"
|
|
7
|
+
#include "remote.idl"
|
|
8
|
+
|
|
9
|
+
// Remote interface with four calls: start, stop, enable_etm and disable_etm, each returning an AEEResult.
// start() takes a session id, a DSP queue id and n_hvx.
// NOTE(review): n_hvx is presumably the number of HVX threads to use - confirm against the implementation.
interface htp_iface : remote_handle64 {
|
|
10
|
+
AEEResult start(in uint32 sess_id, in uint64 dsp_queue_id, in uint32 n_hvx);
|
|
11
|
+
AEEResult stop();
|
|
12
|
+
AEEResult enable_etm();
|
|
13
|
+
AEEResult disable_etm();
|
|
14
|
+
};
|
|
15
|
+
|
|
16
|
+
#endif /* HTP_IDL */
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
#pragma clang diagnostic ignored "-Wunused-variable"
|
|
2
|
+
#pragma clang diagnostic ignored "-Wunused-function"
|
|
3
|
+
#pragma clang diagnostic ignored "-Wunused-but-set-variable"
|
|
4
|
+
|
|
5
|
+
#include <hexagon_protos.h>
|
|
6
|
+
#include <hexagon_types.h>
|
|
7
|
+
#include <math.h>
|
|
8
|
+
#include <string.h>
|
|
9
|
+
|
|
10
|
+
#define GGML_COMMON_DECL_C
|
|
11
|
+
#include "ggml-common.h"
|
|
12
|
+
#include "htp-ctx.h"
|
|
13
|
+
#include "htp-dma.h"
|
|
14
|
+
#include "htp-msg.h"
|
|
15
|
+
#include "htp-ops.h"
|
|
16
|
+
#include "hvx-utils.h"
|
|
17
|
+
#include "ops-utils.h"
|
|
18
|
+
|
|
19
|
+
/*
 * Run hvx_vec_exp_fp32() on in_vec, then substitute the corresponding lane
 * of inf wherever the input lane compares greater than max_exp.
 */
static inline HVX_Vector hvx_vec_exp_fp32_guard(HVX_Vector in_vec, HVX_Vector max_exp, HVX_Vector inf) {
    const HVX_Vector     exp_vec = hvx_vec_exp_fp32(in_vec);
    const HVX_VectorPred too_big = Q6_Q_vcmp_gt_VsfVsf(in_vec, max_exp);

    return Q6_V_vmux_QVV(too_big, inf, exp_vec);
}
|
|
26
|
+
|
|
27
|
+
void hvx_exp_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems, bool negate) {
|
|
28
|
+
int left_over = num_elems & (VLEN_FP32 - 1);
|
|
29
|
+
int num_elems_whole = num_elems - left_over;
|
|
30
|
+
|
|
31
|
+
int unaligned_addr = 0;
|
|
32
|
+
int unaligned_loop = 0;
|
|
33
|
+
if ((0 == htp_is_aligned((void *) src, VLEN)) || (0 == htp_is_aligned((void *) dst, VLEN))) {
|
|
34
|
+
FARF(HIGH, "hvx_exp_f32: unaligned address in hvx op, possibly slower execution\n");
|
|
35
|
+
unaligned_addr = 1;
|
|
36
|
+
}
|
|
37
|
+
// assert((0 == unaligned_addr) || (0 == num_elems_whole));
|
|
38
|
+
if ((1 == unaligned_addr) && (num_elems_whole != 0)) {
|
|
39
|
+
unaligned_loop = 1;
|
|
40
|
+
FARF(HIGH, "hvx_exp_f32: unaligned loop in hvx op, possibly slower execution\n");
|
|
41
|
+
}
|
|
42
|
+
|
|
43
|
+
HVX_Vector vec_out = Q6_V_vzero();
|
|
44
|
+
|
|
45
|
+
static const float kInf = INFINITY;
|
|
46
|
+
static const float kMaxExp = 88.02f; // log(INF)
|
|
47
|
+
|
|
48
|
+
const HVX_Vector max_exp = hvx_vec_splat_fp32(kMaxExp);
|
|
49
|
+
const HVX_Vector inf = hvx_vec_splat_fp32(kInf);
|
|
50
|
+
|
|
51
|
+
if (0 == unaligned_loop) {
|
|
52
|
+
HVX_Vector * p_vec_in1 = (HVX_Vector *) src;
|
|
53
|
+
HVX_Vector * p_vec_out = (HVX_Vector *) dst;
|
|
54
|
+
|
|
55
|
+
#pragma unroll(4)
|
|
56
|
+
for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
|
|
57
|
+
if (true == negate) {
|
|
58
|
+
HVX_Vector neg_vec_in = hvx_vec_neg_fp32(*p_vec_in1++);
|
|
59
|
+
*p_vec_out++ = hvx_vec_exp_fp32_guard(neg_vec_in, max_exp, inf);
|
|
60
|
+
} else {
|
|
61
|
+
*p_vec_out++ = hvx_vec_exp_fp32_guard(*p_vec_in1++, max_exp, inf);
|
|
62
|
+
}
|
|
63
|
+
}
|
|
64
|
+
} else {
|
|
65
|
+
#pragma unroll(4)
|
|
66
|
+
for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
|
|
67
|
+
HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32);
|
|
68
|
+
|
|
69
|
+
if (true == negate) {
|
|
70
|
+
HVX_Vector neg_vec_in = hvx_vec_neg_fp32(in);
|
|
71
|
+
*(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_exp_fp32_guard(neg_vec_in, max_exp, inf);
|
|
72
|
+
} else {
|
|
73
|
+
*(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_exp_fp32_guard(in, max_exp, inf);
|
|
74
|
+
}
|
|
75
|
+
}
|
|
76
|
+
}
|
|
77
|
+
|
|
78
|
+
if (left_over > 0) {
|
|
79
|
+
const float * srcf = (float *) src + num_elems_whole;
|
|
80
|
+
float * dstf = (float *) dst + num_elems_whole;
|
|
81
|
+
|
|
82
|
+
HVX_Vector in = *(HVX_UVector *) srcf;
|
|
83
|
+
|
|
84
|
+
if (true == negate) {
|
|
85
|
+
HVX_Vector neg_vec_in = hvx_vec_neg_fp32(in);
|
|
86
|
+
|
|
87
|
+
vec_out = hvx_vec_exp_fp32_guard(neg_vec_in, max_exp, inf);
|
|
88
|
+
} else {
|
|
89
|
+
vec_out = hvx_vec_exp_fp32_guard(in, max_exp, inf);
|
|
90
|
+
}
|
|
91
|
+
|
|
92
|
+
hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, vec_out);
|
|
93
|
+
}
|
|
94
|
+
}
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
#pragma clang diagnostic ignored "-Wunused-variable"
|
|
2
|
+
#pragma clang diagnostic ignored "-Wunused-function"
|
|
3
|
+
#pragma clang diagnostic ignored "-Wunused-but-set-variable"
|
|
4
|
+
|
|
5
|
+
#include <hexagon_protos.h>
|
|
6
|
+
#include <hexagon_types.h>
|
|
7
|
+
#include <math.h>
|
|
8
|
+
#include <string.h>
|
|
9
|
+
|
|
10
|
+
#define GGML_COMMON_DECL_C
|
|
11
|
+
#include "ggml-common.h"
|
|
12
|
+
#include "htp-ctx.h"
|
|
13
|
+
#include "htp-dma.h"
|
|
14
|
+
#include "htp-msg.h"
|
|
15
|
+
#include "htp-ops.h"
|
|
16
|
+
#include "hvx-utils.h"
|
|
17
|
+
#include "ops-utils.h"
|
|
18
|
+
|
|
19
|
+
// Guarded vector reciprocal.
//
// Evaluates hvx_vec_inverse_fp32() on v_sf and zeroes every lane of the result
// in which all bits selected by nan_inf_mask are set.
//
//   v_sf         - fp32 input lanes
//   nan_inf_mask - per-lane bit mask tested against the computed result
//
// Returns the computed vector with the flagged lanes replaced by zero.
static inline HVX_Vector hvx_vec_inverse_fp32_guard(HVX_Vector v_sf, HVX_Vector nan_inf_mask) {
    const HVX_Vector recip = hvx_vec_inverse_fp32(v_sf);

    // A lane is flagged when (recip & mask) == mask, compared as 32-bit words.
    const HVX_Vector     masked  = Q6_V_vand_VV(recip, nan_inf_mask);
    const HVX_VectorPred flagged = Q6_Q_vcmp_eq_VwVw(nan_inf_mask, masked);

    return Q6_V_vmux_QVV(flagged, Q6_V_vzero(), recip);
}
|
|
27
|
+
|
|
28
|
+
void hvx_inverse_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems) {
|
|
29
|
+
int left_over = num_elems & (VLEN_FP32 - 1);
|
|
30
|
+
int num_elems_whole = num_elems - left_over;
|
|
31
|
+
|
|
32
|
+
int unaligned_addr = 0;
|
|
33
|
+
int unaligned_loop = 0;
|
|
34
|
+
if ((0 == htp_is_aligned((void *) src, VLEN)) || (0 == htp_is_aligned((void *) dst, VLEN))) {
|
|
35
|
+
FARF(HIGH, "hvx_inverse_f32: unaligned address in hvx op, possibly slower execution\n");
|
|
36
|
+
unaligned_addr = 1;
|
|
37
|
+
}
|
|
38
|
+
// assert((0 == unaligned_addr) || (0 == num_elems_whole));
|
|
39
|
+
if ((1 == unaligned_addr) && (num_elems_whole != 0)) {
|
|
40
|
+
unaligned_loop = 1;
|
|
41
|
+
FARF(HIGH, "hvx_inverse_f32: unaligned loop in hvx op, possibly slower execution\n");
|
|
42
|
+
}
|
|
43
|
+
|
|
44
|
+
static const uint32_t kNanInfMask = 0x7f800000;
|
|
45
|
+
const HVX_Vector nan_inf_mask = Q6_V_vsplat_R(kNanInfMask);
|
|
46
|
+
|
|
47
|
+
if (0 == unaligned_loop) {
|
|
48
|
+
HVX_Vector * p_vec_in = (HVX_Vector *) src;
|
|
49
|
+
HVX_Vector * p_vec_out = (HVX_Vector *) dst;
|
|
50
|
+
|
|
51
|
+
#pragma unroll(4)
|
|
52
|
+
for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
|
|
53
|
+
*p_vec_out++ = hvx_vec_inverse_fp32_guard(*p_vec_in++, nan_inf_mask);
|
|
54
|
+
}
|
|
55
|
+
} else {
|
|
56
|
+
#pragma unroll(4)
|
|
57
|
+
for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
|
|
58
|
+
HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32);
|
|
59
|
+
*(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_inverse_fp32_guard(in, nan_inf_mask);
|
|
60
|
+
}
|
|
61
|
+
}
|
|
62
|
+
|
|
63
|
+
if (left_over > 0) {
|
|
64
|
+
const float * srcf = (float *) src + num_elems_whole;
|
|
65
|
+
float * dstf = (float *) dst + num_elems_whole;
|
|
66
|
+
|
|
67
|
+
HVX_Vector in = *(HVX_UVector *) srcf;
|
|
68
|
+
HVX_Vector out = hvx_vec_inverse_fp32_guard(in, nan_inf_mask);
|
|
69
|
+
|
|
70
|
+
hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, out);
|
|
71
|
+
}
|
|
72
|
+
}
|