whispercpp 1.3.4 → 1.3.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +60 -43
- data/ext/extconf.rb +2 -2
- data/ext/ruby_whisper.c +14 -2
- data/ext/ruby_whisper.h +39 -0
- data/ext/ruby_whisper_context.c +22 -22
- data/ext/ruby_whisper_model.c +12 -12
- data/ext/ruby_whisper_params.c +47 -23
- data/ext/ruby_whisper_segment.c +84 -19
- data/ext/ruby_whisper_token.c +351 -0
- data/ext/ruby_whisper_transcribe.cpp +1 -1
- data/ext/ruby_whisper_vad_context.c +75 -0
- data/ext/ruby_whisper_vad_context_detect.cpp +50 -0
- data/ext/ruby_whisper_vad_segment.c +139 -0
- data/ext/ruby_whisper_vad_segments.c +106 -0
- data/ext/sources/CMakeLists.txt +4 -1
- data/ext/sources/bindings/javascript/package.json +1 -1
- data/ext/sources/cmake/arm64-apple-clang.cmake +16 -0
- data/ext/sources/cmake/arm64-windows-llvm.cmake +16 -0
- data/ext/sources/cmake/riscv64-spacemit-linux-gnu-gcc.cmake +29 -0
- data/ext/sources/cmake/x64-windows-llvm.cmake +5 -0
- data/ext/sources/examples/addon.node/vad-example.js +2 -2
- data/ext/sources/examples/cli/cli.cpp +121 -112
- data/ext/sources/examples/lsp/CMakeLists.txt +2 -1
- data/ext/sources/examples/quantize/CMakeLists.txt +2 -1
- data/ext/sources/examples/server/server.cpp +10 -11
- data/ext/sources/examples/talk-llama/CMakeLists.txt +5 -1
- data/ext/sources/examples/talk-llama/llama-adapter.cpp +12 -3
- data/ext/sources/examples/talk-llama/llama-adapter.h +7 -1
- data/ext/sources/examples/talk-llama/llama-arch.cpp +2046 -1974
- data/ext/sources/examples/talk-llama/llama-arch.h +67 -2
- data/ext/sources/examples/talk-llama/llama-batch.cpp +75 -33
- data/ext/sources/examples/talk-llama/llama-batch.h +17 -4
- data/ext/sources/examples/talk-llama/llama-chat.cpp +79 -3
- data/ext/sources/examples/talk-llama/llama-chat.h +4 -0
- data/ext/sources/examples/talk-llama/llama-context.cpp +775 -78
- data/ext/sources/examples/talk-llama/llama-context.h +57 -9
- data/ext/sources/examples/talk-llama/llama-cparams.h +1 -0
- data/ext/sources/examples/talk-llama/llama-grammar.cpp +288 -53
- data/ext/sources/examples/talk-llama/llama-grammar.h +22 -1
- data/ext/sources/examples/talk-llama/llama-graph.cpp +381 -64
- data/ext/sources/examples/talk-llama/llama-graph.h +103 -13
- data/ext/sources/examples/talk-llama/llama-hparams.cpp +26 -2
- data/ext/sources/examples/talk-llama/llama-hparams.h +41 -10
- data/ext/sources/examples/talk-llama/llama-impl.cpp +7 -3
- data/ext/sources/examples/talk-llama/llama-impl.h +1 -1
- data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.cpp +5 -3
- data/ext/sources/examples/talk-llama/llama-kv-cache.cpp +145 -65
- data/ext/sources/examples/talk-llama/llama-kv-cache.h +22 -7
- data/ext/sources/examples/talk-llama/llama-kv-cells.h +44 -2
- data/ext/sources/examples/talk-llama/llama-memory-hybrid.cpp +12 -10
- data/ext/sources/examples/talk-llama/llama-memory-recurrent.cpp +32 -19
- data/ext/sources/examples/talk-llama/llama-memory-recurrent.h +2 -2
- data/ext/sources/examples/talk-llama/llama-mmap.cpp +172 -37
- data/ext/sources/examples/talk-llama/llama-mmap.h +8 -3
- data/ext/sources/examples/talk-llama/llama-model-loader.cpp +91 -9
- data/ext/sources/examples/talk-llama/llama-model-loader.h +6 -0
- data/ext/sources/examples/talk-llama/llama-model-saver.cpp +3 -0
- data/ext/sources/examples/talk-llama/llama-model.cpp +1529 -13134
- data/ext/sources/examples/talk-llama/llama-model.h +44 -3
- data/ext/sources/examples/talk-llama/llama-quant.cpp +8 -23
- data/ext/sources/examples/talk-llama/llama-sampling.cpp +1294 -198
- data/ext/sources/examples/talk-llama/llama-sampling.h +19 -7
- data/ext/sources/examples/talk-llama/llama-vocab.cpp +133 -37
- data/ext/sources/examples/talk-llama/llama-vocab.h +45 -40
- data/ext/sources/examples/talk-llama/llama.cpp +729 -2
- data/ext/sources/examples/talk-llama/llama.h +152 -14
- data/ext/sources/examples/talk-llama/models/afmoe.cpp +191 -0
- data/ext/sources/examples/talk-llama/models/apertus.cpp +125 -0
- data/ext/sources/examples/talk-llama/models/arcee.cpp +135 -0
- data/ext/sources/examples/talk-llama/models/arctic.cpp +138 -0
- data/ext/sources/examples/talk-llama/models/arwkv7.cpp +86 -0
- data/ext/sources/examples/talk-llama/models/baichuan.cpp +122 -0
- data/ext/sources/examples/talk-llama/models/bailingmoe.cpp +144 -0
- data/ext/sources/examples/talk-llama/models/bailingmoe2.cpp +135 -0
- data/ext/sources/examples/talk-llama/models/bert.cpp +178 -0
- data/ext/sources/examples/talk-llama/models/bitnet.cpp +160 -0
- data/ext/sources/examples/talk-llama/models/bloom.cpp +101 -0
- data/ext/sources/examples/talk-llama/models/chameleon.cpp +178 -0
- data/ext/sources/examples/talk-llama/models/chatglm.cpp +132 -0
- data/ext/sources/examples/talk-llama/models/codeshell.cpp +111 -0
- data/ext/sources/examples/talk-llama/models/cogvlm.cpp +102 -0
- data/ext/sources/examples/talk-llama/models/cohere2-iswa.cpp +134 -0
- data/ext/sources/examples/talk-llama/models/command-r.cpp +122 -0
- data/ext/sources/examples/talk-llama/models/dbrx.cpp +123 -0
- data/ext/sources/examples/talk-llama/models/deci.cpp +135 -0
- data/ext/sources/examples/talk-llama/models/deepseek.cpp +144 -0
- data/ext/sources/examples/talk-llama/models/deepseek2.cpp +259 -0
- data/ext/sources/examples/talk-llama/models/dots1.cpp +134 -0
- data/ext/sources/examples/talk-llama/models/dream.cpp +105 -0
- data/ext/sources/examples/talk-llama/models/ernie4-5-moe.cpp +150 -0
- data/ext/sources/examples/talk-llama/models/ernie4-5.cpp +110 -0
- data/ext/sources/examples/talk-llama/models/exaone.cpp +114 -0
- data/ext/sources/examples/talk-llama/models/exaone4.cpp +123 -0
- data/ext/sources/examples/talk-llama/models/falcon-h1.cpp +113 -0
- data/ext/sources/examples/talk-llama/models/falcon.cpp +120 -0
- data/ext/sources/examples/talk-llama/models/gemma-embedding.cpp +116 -0
- data/ext/sources/examples/talk-llama/models/gemma.cpp +112 -0
- data/ext/sources/examples/talk-llama/models/gemma2-iswa.cpp +128 -0
- data/ext/sources/examples/talk-llama/models/gemma3.cpp +155 -0
- data/ext/sources/examples/talk-llama/models/gemma3n-iswa.cpp +384 -0
- data/ext/sources/examples/talk-llama/models/glm4-moe.cpp +170 -0
- data/ext/sources/examples/talk-llama/models/glm4.cpp +150 -0
- data/ext/sources/examples/talk-llama/models/gpt2.cpp +105 -0
- data/ext/sources/examples/talk-llama/models/gptneox.cpp +144 -0
- data/ext/sources/examples/talk-llama/models/granite-hybrid.cpp +196 -0
- data/ext/sources/examples/talk-llama/models/granite.cpp +211 -0
- data/ext/sources/examples/talk-llama/models/graph-context-mamba.cpp +283 -0
- data/ext/sources/examples/talk-llama/models/grok.cpp +159 -0
- data/ext/sources/examples/talk-llama/models/grovemoe.cpp +141 -0
- data/ext/sources/examples/talk-llama/models/hunyuan-dense.cpp +132 -0
- data/ext/sources/examples/talk-llama/models/hunyuan-moe.cpp +154 -0
- data/ext/sources/examples/talk-llama/models/internlm2.cpp +120 -0
- data/ext/sources/examples/talk-llama/models/jais.cpp +86 -0
- data/ext/sources/examples/talk-llama/models/jamba.cpp +106 -0
- data/ext/sources/examples/talk-llama/models/lfm2.cpp +175 -0
- data/ext/sources/examples/talk-llama/models/llada-moe.cpp +122 -0
- data/ext/sources/examples/talk-llama/models/llada.cpp +99 -0
- data/ext/sources/examples/talk-llama/models/llama-iswa.cpp +178 -0
- data/ext/sources/examples/talk-llama/models/llama.cpp +168 -0
- data/ext/sources/examples/talk-llama/models/maincoder.cpp +117 -0
- data/ext/sources/examples/talk-llama/models/mamba.cpp +55 -0
- data/ext/sources/examples/talk-llama/models/mimo2-iswa.cpp +123 -0
- data/ext/sources/examples/talk-llama/models/minicpm3.cpp +199 -0
- data/ext/sources/examples/talk-llama/models/minimax-m2.cpp +124 -0
- data/ext/sources/examples/talk-llama/models/mistral3.cpp +160 -0
- data/ext/sources/examples/talk-llama/models/models.h +569 -0
- data/ext/sources/examples/talk-llama/models/modern-bert.cpp +116 -0
- data/ext/sources/examples/talk-llama/models/mpt.cpp +126 -0
- data/ext/sources/examples/talk-llama/models/nemotron-h.cpp +150 -0
- data/ext/sources/examples/talk-llama/models/nemotron.cpp +122 -0
- data/ext/sources/examples/talk-llama/models/neo-bert.cpp +104 -0
- data/ext/sources/examples/talk-llama/models/olmo.cpp +121 -0
- data/ext/sources/examples/talk-llama/models/olmo2.cpp +150 -0
- data/ext/sources/examples/talk-llama/models/olmoe.cpp +124 -0
- data/ext/sources/examples/talk-llama/models/openai-moe-iswa.cpp +127 -0
- data/ext/sources/examples/talk-llama/models/openelm.cpp +124 -0
- data/ext/sources/examples/talk-llama/models/orion.cpp +123 -0
- data/ext/sources/examples/talk-llama/models/pangu-embedded.cpp +121 -0
- data/ext/sources/examples/talk-llama/models/phi2.cpp +121 -0
- data/ext/sources/examples/talk-llama/models/phi3.cpp +152 -0
- data/ext/sources/examples/talk-llama/models/plamo.cpp +110 -0
- data/ext/sources/examples/talk-llama/models/plamo2.cpp +316 -0
- data/ext/sources/examples/talk-llama/models/plamo3.cpp +128 -0
- data/ext/sources/examples/talk-llama/models/plm.cpp +168 -0
- data/ext/sources/examples/talk-llama/models/qwen.cpp +108 -0
- data/ext/sources/examples/talk-llama/models/qwen2.cpp +126 -0
- data/ext/sources/examples/talk-llama/models/qwen2moe.cpp +151 -0
- data/ext/sources/examples/talk-llama/models/qwen2vl.cpp +117 -0
- data/ext/sources/examples/talk-llama/models/qwen3.cpp +117 -0
- data/ext/sources/examples/talk-llama/models/qwen3moe.cpp +124 -0
- data/ext/sources/examples/talk-llama/models/qwen3next.cpp +873 -0
- data/ext/sources/examples/talk-llama/models/qwen3vl-moe.cpp +149 -0
- data/ext/sources/examples/talk-llama/models/qwen3vl.cpp +141 -0
- data/ext/sources/examples/talk-llama/models/refact.cpp +94 -0
- data/ext/sources/examples/talk-llama/models/rnd1.cpp +126 -0
- data/ext/sources/examples/talk-llama/models/rwkv6-base.cpp +162 -0
- data/ext/sources/examples/talk-llama/models/rwkv6.cpp +94 -0
- data/ext/sources/examples/talk-llama/models/rwkv6qwen2.cpp +86 -0
- data/ext/sources/examples/talk-llama/models/rwkv7-base.cpp +135 -0
- data/ext/sources/examples/talk-llama/models/rwkv7.cpp +90 -0
- data/ext/sources/examples/talk-llama/models/seed-oss.cpp +124 -0
- data/ext/sources/examples/talk-llama/models/smallthinker.cpp +126 -0
- data/ext/sources/examples/talk-llama/models/smollm3.cpp +128 -0
- data/ext/sources/examples/talk-llama/models/stablelm.cpp +146 -0
- data/ext/sources/examples/talk-llama/models/starcoder.cpp +100 -0
- data/ext/sources/examples/talk-llama/models/starcoder2.cpp +121 -0
- data/ext/sources/examples/talk-llama/models/t5-dec.cpp +166 -0
- data/ext/sources/examples/talk-llama/models/t5-enc.cpp +96 -0
- data/ext/sources/examples/talk-llama/models/wavtokenizer-dec.cpp +149 -0
- data/ext/sources/examples/talk-llama/models/xverse.cpp +108 -0
- data/ext/sources/examples/talk-llama/unicode.cpp +102 -16
- data/ext/sources/examples/vad-speech-segments/CMakeLists.txt +1 -1
- data/ext/sources/examples/whisper.wasm/index-tmpl.html +1 -1
- data/ext/sources/ggml/CMakeLists.txt +82 -54
- data/ext/sources/ggml/include/ggml-alloc.h +9 -0
- data/ext/sources/ggml/include/ggml-backend.h +4 -1
- data/ext/sources/ggml/include/ggml-cpu.h +1 -0
- data/ext/sources/ggml/include/ggml-hexagon.h +19 -0
- data/ext/sources/ggml/include/ggml-rpc.h +8 -11
- data/ext/sources/ggml/include/ggml-zendnn.h +22 -0
- data/ext/sources/ggml/include/ggml.h +190 -12
- data/ext/sources/ggml/src/CMakeLists.txt +82 -11
- data/ext/sources/ggml/src/ggml-alloc.c +124 -41
- data/ext/sources/ggml/src/ggml-backend-impl.h +1 -4
- data/ext/sources/ggml/src/ggml-backend-reg.cpp +27 -3
- data/ext/sources/ggml/src/ggml-backend.cpp +71 -21
- data/ext/sources/ggml/src/ggml-blas/CMakeLists.txt +17 -3
- data/ext/sources/ggml/src/ggml-blas/ggml-blas.cpp +5 -9
- data/ext/sources/ggml/src/ggml-cann/acl_tensor.cpp +57 -45
- data/ext/sources/ggml/src/ggml-cann/acl_tensor.h +138 -47
- data/ext/sources/ggml/src/ggml-cann/aclnn_ops.cpp +2179 -1696
- data/ext/sources/ggml/src/ggml-cann/aclnn_ops.h +238 -317
- data/ext/sources/ggml/src/ggml-cann/common.h +283 -208
- data/ext/sources/ggml/src/ggml-cann/ggml-cann.cpp +626 -776
- data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +156 -86
- data/ext/sources/ggml/src/ggml-cpu/amx/amx.cpp +1 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +4 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/arm/quants.c +428 -26
- data/ext/sources/ggml/src/ggml-cpu/arch/arm/repack.cpp +1004 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/loongarch/quants.c +4 -5
- data/ext/sources/ggml/src/ggml-cpu/arch/riscv/cpu-feats.cpp +38 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/riscv/quants.c +108 -49
- data/ext/sources/ggml/src/ggml-cpu/arch/s390/cpu-feats.cpp +50 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/x86/repack.cpp +6 -6
- data/ext/sources/ggml/src/ggml-cpu/arch-fallback.h +50 -2
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-impl.h +5 -3
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +195 -71
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.cpp +4 -0
- data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.cpp +573 -106
- data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.h +33 -44
- data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +298 -112
- data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm-ppc.h +333 -0
- data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +819 -125
- data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.h +6 -0
- data/ext/sources/ggml/src/ggml-cpu/ops.cpp +708 -431
- data/ext/sources/ggml/src/ggml-cpu/ops.h +5 -4
- data/ext/sources/ggml/src/ggml-cpu/repack.cpp +671 -31
- data/ext/sources/ggml/src/ggml-cpu/repack.h +14 -0
- data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +41 -43
- data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.cpp +3 -2
- data/ext/sources/ggml/src/ggml-cpu/unary-ops.cpp +151 -0
- data/ext/sources/ggml/src/ggml-cpu/unary-ops.h +7 -0
- data/ext/sources/ggml/src/ggml-cpu/vec.cpp +124 -1
- data/ext/sources/ggml/src/ggml-cpu/vec.h +261 -146
- data/ext/sources/ggml/src/ggml-cuda/CMakeLists.txt +72 -1
- data/ext/sources/ggml/src/ggml-cuda/argmax.cu +2 -2
- data/ext/sources/ggml/src/ggml-cuda/argsort.cu +123 -6
- data/ext/sources/ggml/src/ggml-cuda/argsort.cuh +16 -0
- data/ext/sources/ggml/src/ggml-cuda/binbcast.cu +1 -1
- data/ext/sources/ggml/src/ggml-cuda/common.cuh +353 -80
- data/ext/sources/ggml/src/ggml-cuda/convert.cuh +10 -0
- data/ext/sources/ggml/src/ggml-cuda/cpy-utils.cuh +1 -1
- data/ext/sources/ggml/src/ggml-cuda/cpy.cu +339 -246
- data/ext/sources/ggml/src/ggml-cuda/cpy.cuh +1 -5
- data/ext/sources/ggml/src/ggml-cuda/cumsum.cu +307 -0
- data/ext/sources/ggml/src/ggml-cuda/cumsum.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/diag.cu +77 -0
- data/ext/sources/ggml/src/ggml-cuda/diag.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +31 -21
- data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +663 -596
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cu +35 -741
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cuh +1241 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn-vec.cuh +30 -37
- data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +14 -13
- data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cuh +48 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn.cu +83 -37
- data/ext/sources/ggml/src/ggml-cuda/fill.cu +37 -0
- data/ext/sources/ggml/src/ggml-cuda/fill.cuh +3 -0
- data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +1155 -164
- data/ext/sources/ggml/src/ggml-cuda/mean.cu +5 -4
- data/ext/sources/ggml/src/ggml-cuda/mma.cuh +741 -48
- data/ext/sources/ggml/src/ggml-cuda/mmf.cu +60 -12
- data/ext/sources/ggml/src/ggml-cuda/mmf.cuh +381 -42
- data/ext/sources/ggml/src/ggml-cuda/mmid.cu +164 -0
- data/ext/sources/ggml/src/ggml-cuda/mmid.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/mmq.cu +69 -176
- data/ext/sources/ggml/src/ggml-cuda/mmq.cuh +498 -171
- data/ext/sources/ggml/src/ggml-cuda/mmvf.cu +375 -79
- data/ext/sources/ggml/src/ggml-cuda/mmvf.cuh +3 -2
- data/ext/sources/ggml/src/ggml-cuda/mmvq.cu +241 -95
- data/ext/sources/ggml/src/ggml-cuda/mmvq.cuh +1 -1
- data/ext/sources/ggml/src/ggml-cuda/pad.cu +64 -33
- data/ext/sources/ggml/src/ggml-cuda/quantize.cu +151 -0
- data/ext/sources/ggml/src/ggml-cuda/quantize.cuh +14 -0
- data/ext/sources/ggml/src/ggml-cuda/rope.cu +192 -77
- data/ext/sources/ggml/src/ggml-cuda/rope.cuh +2 -0
- data/ext/sources/ggml/src/ggml-cuda/set-rows.cu +101 -47
- data/ext/sources/ggml/src/ggml-cuda/set.cu +39 -0
- data/ext/sources/ggml/src/ggml-cuda/set.cuh +7 -0
- data/ext/sources/ggml/src/ggml-cuda/softmax.cu +203 -6
- data/ext/sources/ggml/src/ggml-cuda/solve_tri.cu +275 -0
- data/ext/sources/ggml/src/ggml-cuda/solve_tri.cuh +3 -0
- data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cu +14 -20
- data/ext/sources/ggml/src/ggml-cuda/ssm-scan.cu +49 -84
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq112-dv112.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq128-dv128.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq256-dv256.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq40-dv40.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq576-dv512.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq64-dv64.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq72-dv72.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq80-dv80.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq96-dv96.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +19 -1
- data/ext/sources/ggml/src/ggml-cuda/top-k.cu +96 -0
- data/ext/sources/ggml/src/ggml-cuda/top-k.cuh +3 -0
- data/ext/sources/ggml/src/ggml-cuda/topk-moe.cu +168 -76
- data/ext/sources/ggml/src/ggml-cuda/topk-moe.cuh +11 -4
- data/ext/sources/ggml/src/ggml-cuda/tri.cu +136 -0
- data/ext/sources/ggml/src/ggml-cuda/tri.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/unary.cu +105 -11
- data/ext/sources/ggml/src/ggml-cuda/unary.cuh +36 -0
- data/ext/sources/ggml/src/ggml-cuda/upscale.cu +163 -7
- data/ext/sources/ggml/src/ggml-cuda/vendors/cuda.h +4 -0
- data/ext/sources/ggml/src/ggml-cuda/vendors/hip.h +12 -1
- data/ext/sources/ggml/src/ggml-cuda/vendors/musa.h +6 -0
- data/ext/sources/ggml/src/ggml-hexagon/CMakeLists.txt +80 -0
- data/ext/sources/ggml/src/ggml-hexagon/ggml-hexagon.cpp +3151 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/CMakeLists.txt +44 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/act-ops.c +682 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/binary-ops.c +360 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/cmake-toolchain.cmake +157 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/flash-attn-ops.c +566 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/get-rows-ops.c +112 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ctx.h +35 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp-dma.c +63 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp-dma.h +157 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp-msg.h +165 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ops.h +92 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp_iface.idl +16 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-exp.c +94 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-inverse.c +72 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sigmoid.c +49 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-utils.c +1020 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-utils.h +1353 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/main.c +1001 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/matmul-ops.c +2503 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/ops-utils.h +149 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/rope-ops.c +487 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/set-rows-ops.c +168 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/softmax-ops.c +402 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/unary-ops.c +287 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/worker-pool.c +297 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/worker-pool.h +57 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp-utils.c +454 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp-utils.h +221 -0
- data/ext/sources/ggml/src/ggml-hexagon/op-desc.h +153 -0
- data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +8 -13
- data/ext/sources/ggml/src/ggml-impl.h +67 -6
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-common.cpp +2 -2
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.m +29 -20
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.cpp +652 -285
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.h +103 -56
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.m +496 -118
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +231 -9
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.cpp +1227 -224
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.h +12 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal.cpp +14 -8
- data/ext/sources/ggml/src/ggml-metal/ggml-metal.metal +1972 -704
- data/ext/sources/ggml/src/ggml-musa/CMakeLists.txt +3 -1
- data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +11 -0
- data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +1430 -120
- data/ext/sources/ggml/src/ggml-opencl/kernels/cvt.cl +63 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/expm1.cl +82 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/fill.cl +17 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/flash_attn_f32.cl +4 -3
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_mxfp4_f32.cl +162 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_mxfp4_f32.cl +156 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/get_rows.cl +36 -12
- data/ext/sources/ggml/src/ggml-opencl/kernels/mean.cl +39 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_kq_kqv.cl +273 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_l4_lm.cl +24 -10
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_f32_f32_l4_lm.cl +24 -10
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_l4_lm.cl +154 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/pad.cl +29 -20
- data/ext/sources/ggml/src/ggml-opencl/kernels/rms_norm.cl +25 -10
- data/ext/sources/ggml/src/ggml-opencl/kernels/rope.cl +50 -24
- data/ext/sources/ggml/src/ggml-opencl/kernels/set_rows.cl +35 -16
- data/ext/sources/ggml/src/ggml-opencl/kernels/softplus.cl +88 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/sqr.cl +53 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/sqrt.cl +53 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/ssm_conv.cl +77 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/transpose.cl +13 -0
- data/ext/sources/ggml/src/ggml-rpc/ggml-rpc.cpp +438 -156
- data/ext/sources/ggml/src/ggml-sycl/CMakeLists.txt +48 -3
- data/ext/sources/ggml/src/ggml-sycl/add-id.cpp +77 -0
- data/ext/sources/ggml/src/ggml-sycl/add-id.hpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/backend.hpp +6 -0
- data/ext/sources/ggml/src/ggml-sycl/binbcast.cpp +0 -9
- data/ext/sources/ggml/src/ggml-sycl/binbcast.hpp +0 -6
- data/ext/sources/ggml/src/ggml-sycl/common.hpp +117 -15
- data/ext/sources/ggml/src/ggml-sycl/concat.cpp +55 -44
- data/ext/sources/ggml/src/ggml-sycl/convert.cpp +34 -0
- data/ext/sources/ggml/src/ggml-sycl/count-equal.cpp +79 -0
- data/ext/sources/ggml/src/ggml-sycl/count-equal.hpp +9 -0
- data/ext/sources/ggml/src/ggml-sycl/cpy.cpp +0 -3
- data/ext/sources/ggml/src/ggml-sycl/dequantize.hpp +18 -0
- data/ext/sources/ggml/src/ggml-sycl/dpct/helper.hpp +76 -3
- data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +333 -300
- data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +10 -2
- data/ext/sources/ggml/src/ggml-sycl/ggml-sycl.cpp +335 -110
- data/ext/sources/ggml/src/ggml-sycl/mmvq.cpp +22 -0
- data/ext/sources/ggml/src/ggml-sycl/norm.cpp +156 -0
- data/ext/sources/ggml/src/ggml-sycl/norm.hpp +2 -0
- data/ext/sources/ggml/src/ggml-sycl/pad.cpp +97 -0
- data/ext/sources/ggml/src/ggml-sycl/pad.hpp +24 -0
- data/ext/sources/ggml/src/ggml-sycl/pad_reflect_1d.cpp +100 -0
- data/ext/sources/ggml/src/ggml-sycl/pad_reflect_1d.hpp +10 -0
- data/ext/sources/ggml/src/ggml-sycl/presets.hpp +2 -0
- data/ext/sources/ggml/src/ggml-sycl/repeat_back.cpp +76 -0
- data/ext/sources/ggml/src/ggml-sycl/repeat_back.hpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/roll.cpp +122 -0
- data/ext/sources/ggml/src/ggml-sycl/roll.hpp +20 -0
- data/ext/sources/ggml/src/ggml-sycl/rope.cpp +30 -17
- data/ext/sources/ggml/src/ggml-sycl/set.cpp +73 -0
- data/ext/sources/ggml/src/ggml-sycl/set.hpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/softmax.cpp +327 -162
- data/ext/sources/ggml/src/ggml-sycl/softmax.hpp +4 -0
- data/ext/sources/ggml/src/ggml-sycl/ssm_conv.cpp +127 -0
- data/ext/sources/ggml/src/ggml-sycl/ssm_conv.hpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +58 -0
- data/ext/sources/ggml/src/ggml-vulkan/CMakeLists.txt +38 -18
- data/ext/sources/ggml/src/ggml-vulkan/ggml-vulkan.cpp +5013 -2859
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/abs.comp +21 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/acc.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add1.comp +28 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add_id.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/arange.comp +20 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argmax.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp +33 -26
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argsort_large.comp +114 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/ceil.comp +22 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/clamp.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/concat.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/contig_copy.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_dw.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +47 -49
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +3 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +4 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_transpose.comp +67 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/cos.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/count_equal.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/count_experts.comp +51 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/cumsum.comp +83 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/cumsum_multipass1.comp +60 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/cumsum_multipass2.comp +66 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_f32.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{dequant_funcs.comp → dequant_funcs.glsl} +9 -21
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{dequant_funcs_cm2.comp → dequant_funcs_cm2.glsl} +18 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{dequant_head.comp → dequant_head.glsl} +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_m.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_s.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_s.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xs.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xxs.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_s.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_xxs.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_nl.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_xs.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_mxfp4.comp +3 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q2_k.comp +3 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q3_k.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_0.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_1.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_k.comp +3 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_0.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_1.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_k.comp +3 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q6_k.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q8_0.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/diag.comp +29 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/diag_mask_inf.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/div.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/exp.comp +3 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/fill.comp +19 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +39 -17
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{flash_attn_base.comp → flash_attn_base.glsl} +19 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +45 -7
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +50 -12
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/floor.comp +22 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu_erf.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu_quick.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gelu.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gelu_erf.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gelu_quick.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{generic_binary_head.comp → generic_binary_head.glsl} +17 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{generic_head.comp → generic_head.glsl} +2 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{generic_unary_head.comp → generic_unary_head.glsl} +7 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/get_rows.comp +4 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/get_rows_quant.comp +3 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{glu_head.comp → glu_head.glsl} +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/group_norm.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/hardsigmoid.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/hardswish.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +19 -7
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col_3d.comp +2 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/l2_norm.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/leaky_relu.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/log.comp +18 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{mul_mat_vec_base.comp → mul_mat_vec_base.glsl} +70 -25
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iface.glsl +35 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_m.comp +71 -21
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_s.comp +41 -25
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_s.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xs.comp +44 -26
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xxs.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_s.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_xxs.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_nc.comp +9 -7
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_p021.comp +9 -7
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp +4 -6
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q3_k.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp +4 -6
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp +4 -6
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp +39 -36
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq_funcs.glsl +494 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +78 -103
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +34 -23
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{mul_mm_funcs.comp → mul_mm_funcs.glsl} +69 -59
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_id_funcs.glsl +72 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +88 -228
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.glsl +454 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_shmem_types.glsl +78 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp +97 -13
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/neg.comp +20 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/norm.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_adamw.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_sgd.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/pad.comp +21 -6
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/pool2d.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/quantize_q8_1.comp +10 -10
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/reglu.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/relu.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/repeat.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/repeat_back.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +50 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_back.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_partials.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/roll.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl +234 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.glsl +20 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +6 -50
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +6 -33
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +6 -33
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl +28 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_vision.comp +6 -39
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/round.comp +29 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sigmoid.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/silu.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/silu_back.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sin.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_back.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large1.comp +62 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large2.comp +79 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large3.comp +65 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large_common.glsl +53 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/softplus.comp +23 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/solve_tri.comp +81 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sqrt.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/square.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/ssm_conv.comp +44 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/ssm_scan.comp +124 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/step.comp +22 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sub.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.comp +2 -25
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.glsl +25 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/swiglu.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/swiglu_oai.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/tanh.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/timestep_embedding.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/topk_argsort.comp +118 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/topk_moe.comp +213 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/topk_nary_search.comp +246 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/tri.comp +43 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/trunc.comp +22 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{types.comp → types.glsl} +345 -26
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +90 -12
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +335 -151
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/xielu.comp +35 -0
- data/ext/sources/ggml/src/ggml-webgpu/CMakeLists.txt +28 -2
- data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp +169 -0
- data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu.cpp +1964 -435
- data/ext/sources/ggml/src/ggml-webgpu/pre_wgsl.hpp +778 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/bin_op.tmpl.wgsl +188 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cpy.tmpl.wgsl +101 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +33 -10
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn.wgsl +591 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/get_rows.tmpl.wgsl +1 -1
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/glu.tmpl.wgsl +323 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl +6 -6
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl +97 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.tmpl.wgsl +247 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_subgroup_matrix.tmpl.wgsl +302 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.tmpl.wgsl +267 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm.wgsl +83 -17
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rope.tmpl.wgsl +295 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/scale.tmpl.wgsl +90 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.tmpl.wgsl +112 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/soft_max.tmpl.wgsl +345 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/unary_op.wgsl +483 -0
- data/ext/sources/ggml/src/ggml-zendnn/CMakeLists.txt +92 -0
- data/ext/sources/ggml/src/ggml-zendnn/ggml-zendnn.cpp +466 -0
- data/ext/sources/ggml/src/ggml.c +425 -33
- data/ext/sources/include/whisper.h +1 -0
- data/ext/sources/src/CMakeLists.txt +3 -1
- data/ext/sources/src/whisper.cpp +101 -35
- data/ext/sources/tests/CMakeLists.txt +2 -2
- data/ext/sources/tests/test-vad-full.cpp +4 -2
- data/ext/sources/tests/test-vad.cpp +1 -1
- data/extsources.rb +1 -0
- data/lib/whisper/model/uri.rb +17 -18
- data/sig/whisper.rbs +119 -2
- data/test/test_params.rb +16 -8
- data/test/test_segment.rb +0 -1
- data/test/test_token.rb +70 -0
- data/test/test_vad.rb +1 -1
- data/test/test_vad_context.rb +50 -0
- data/test/test_vad_segment.rb +19 -0
- data/test/test_vad_segments.rb +16 -0
- data/test/test_whisper.rb +7 -0
- data/whispercpp.gemspec +1 -1
- metadata +287 -34
- data/ext/sources/build-xcframework.sh +0 -571
- data/ext/sources/ggml/src/ggml-cann/Doxyfile +0 -2579
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.comp +0 -105
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.comp +0 -55
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/add.tmpl.wgsl +0 -44
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/add_in_place.tmpl.wgsl +0 -41
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl +0 -60
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul.tmpl.wgsl +0 -44
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_in_place.tmpl.wgsl +0 -41
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm_in_place.wgsl +0 -48
- /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{test_bfloat16_support.comp → feature-tests/bfloat16.comp} +0 -0
- /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{test_coopmat_support.comp → feature-tests/coopmat.comp} +0 -0
- /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{test_coopmat2_support.comp → feature-tests/coopmat2.comp} +0 -0
- /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{test_integer_dot_support.comp → feature-tests/integer_dot.comp} +0 -0
- /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{glu_main.comp → glu_main.glsl} +0 -0
- /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{rte.comp → rte.glsl} +0 -0
- /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{utils.comp → utils.glsl} +0 -0
|
@@ -0,0 +1,287 @@
|
|
|
1
|
+
#pragma clang diagnostic ignored "-Wunused-variable"
|
|
2
|
+
#pragma clang diagnostic ignored "-Wunused-function"
|
|
3
|
+
#pragma clang diagnostic ignored "-Wunused-but-set-variable"
|
|
4
|
+
|
|
5
|
+
#ifdef HTP_DEBUG
|
|
6
|
+
# define FARF_HIGH 1
|
|
7
|
+
#endif
|
|
8
|
+
|
|
9
|
+
#include <HAP_farf.h>
|
|
10
|
+
#include <HAP_mem.h>
|
|
11
|
+
#include <HAP_perf.h>
|
|
12
|
+
#include <HAP_ps.h>
|
|
13
|
+
#include <hexagon_protos.h>
|
|
14
|
+
#include <hexagon_types.h>
|
|
15
|
+
#include <math.h>
|
|
16
|
+
#include <qurt_thread.h>
|
|
17
|
+
#include <string.h>
|
|
18
|
+
|
|
19
|
+
#define GGML_COMMON_DECL_C
|
|
20
|
+
#include "ggml-common.h"
|
|
21
|
+
#include "htp-ctx.h"
|
|
22
|
+
#include "htp-dma.h"
|
|
23
|
+
#include "htp-msg.h"
|
|
24
|
+
#include "htp-ops.h"
|
|
25
|
+
#include "hvx-utils.h"
|
|
26
|
+
#include "ops-utils.h"
|
|
27
|
+
|
|
28
|
+
// Declares local uint32_t copies of the tensor dimensions for a unary op.
// Expects pointers named `src` and `dst` to be in scope:
//   ne00..ne03 / nb00..nb03  <- src->ne[0..3] / src->nb[0..3]
//   ne0..ne3   / nb0..nb3    <- dst->ne[0..3] / dst->nb[0..3]
// The expansion already ends with ';', so `htp_unary_preamble;` adds one empty statement.
#define htp_unary_preamble                \
    const uint32_t ne00 = src->ne[0],     \
                   ne01 = src->ne[1],     \
                   ne02 = src->ne[2],     \
                   ne03 = src->ne[3];     \
                                          \
    const uint32_t ne0 = dst->ne[0],      \
                   ne1 = dst->ne[1],      \
                   ne2 = dst->ne[2],      \
                   ne3 = dst->ne[3];      \
                                          \
    const uint32_t nb00 = src->nb[0],     \
                   nb01 = src->nb[1],     \
                   nb02 = src->nb[2],     \
                   nb03 = src->nb[3];     \
                                          \
    const uint32_t nb0 = dst->nb[0],      \
                   nb1 = dst->nb[1],      \
                   nb2 = dst->nb[2],      \
                   nb3 = dst->nb[3];
|
|
48
|
+
|
|
49
|
+
static void hvx_fast_rms_norm_f32(const uint8_t * restrict src,
|
|
50
|
+
uint8_t * restrict dst,
|
|
51
|
+
uint8_t * restrict pad,
|
|
52
|
+
const int num_elems,
|
|
53
|
+
float epsilon) {
|
|
54
|
+
const HVX_Vector * restrict v_src = (HVX_Vector *) src;
|
|
55
|
+
HVX_Vector * restrict v_dst = (HVX_Vector *) dst;
|
|
56
|
+
|
|
57
|
+
HVX_Vector sum_v = Q6_V_vsplat_R(0x00000000);
|
|
58
|
+
HVX_Vector epsilon_v = hvx_vec_splat_fp32(epsilon);
|
|
59
|
+
|
|
60
|
+
int step_of_1 = num_elems >> 5;
|
|
61
|
+
#pragma unroll(4)
|
|
62
|
+
for (int i = 0; i < step_of_1; i++) {
|
|
63
|
+
HVX_Vector v1 = v_src[i];
|
|
64
|
+
HVX_Vector v2 = Q6_Vqf32_vmpy_VsfVsf(v1, v1);
|
|
65
|
+
sum_v = Q6_Vqf32_vadd_Vqf32Vqf32(sum_v, v2);
|
|
66
|
+
}
|
|
67
|
+
|
|
68
|
+
HVX_Vector reduced_sum = hvx_vec_qf32_reduce_sum(sum_v);
|
|
69
|
+
sum_v = hvx_vec_repl4(Q6_Vsf_equals_Vqf32(reduced_sum));
|
|
70
|
+
|
|
71
|
+
HVX_Vector t_v = hvx_vec_splat_fp32((float) num_elems);
|
|
72
|
+
HVX_Vector denom_v = hvx_vec_inverse_fp32(t_v);
|
|
73
|
+
HVX_Vector mean_v = Q6_Vqf32_vmpy_VsfVsf(sum_v, denom_v);
|
|
74
|
+
HVX_Vector mean_epsilon_v = Q6_Vqf32_vadd_Vqf32Vsf(mean_v, epsilon_v);
|
|
75
|
+
|
|
76
|
+
HVX_Vector scale_v = hvx_vec_rsqrt_fp32(Q6_Vsf_equals_Vqf32(mean_epsilon_v));
|
|
77
|
+
|
|
78
|
+
#pragma unroll(4)
|
|
79
|
+
for (int i = 0; i < step_of_1; i++) {
|
|
80
|
+
HVX_Vector v1 = v_src[i];
|
|
81
|
+
HVX_Vector v2 = Q6_Vqf32_vmpy_VsfVsf(v1, scale_v);
|
|
82
|
+
v_dst[i] = Q6_Vsf_equals_Vqf32(v2);
|
|
83
|
+
}
|
|
84
|
+
}
|
|
85
|
+
|
|
86
|
+
// Applies hvx_scale_offset_f32() to num_rows consecutive rows of row_elems floats each.
//
// The scale and bias are read as raw f32 bit patterns from op_params[0] and op_params[1].
// Before a row is processed, the following row (if any) is requested via htp_l2fetch().
// `spad` and `opt_path` are accepted for signature parity with the other unary kernels
// and are not used here.
static void scale_htp_f32(const float * restrict src,
                          float * restrict       dst,
                          uint8_t * restrict     spad,
                          const uint32_t         num_rows,
                          const uint32_t         row_elems,
                          const size_t           row_size,
                          int32_t *              op_params,
                          int                    opt_path) {
    float scale = 0.f;
    float bias  = 0.f;
    memcpy(&scale, &op_params[0], sizeof(float));
    memcpy(&bias, &op_params[1], sizeof(float));

    uint32_t row = 0;
    while (row < num_rows) {
        const uint32_t elem_off = row * row_elems;

        const float * restrict row_in  = src + elem_off;
        float * restrict       row_out = dst + elem_off;

        const int has_next_row = (row + 1 < num_rows);
        if (has_next_row) {
            htp_l2fetch(row_in + row_elems, 1, row_size, row_size);
        }

        hvx_scale_offset_f32((uint8_t *) row_out, (const uint8_t *) row_in, row_elems, scale, bias);

        row++;
    }
}
|
|
110
|
+
|
|
111
|
+
// RMS-normalizes num_rows consecutive rows of row_elems floats each.
//
// epsilon is read as a raw f32 bit pattern from op_params[0]. Before a row is processed,
// the following row (if any) is requested via htp_l2fetch().
// opt_path == 1 selects hvx_fast_rms_norm_f32(); any other value uses the generic path:
// hvx_sum_of_squares_f32() followed by hvx_scale_f32() with 1 / sqrt(mean + epsilon).
static void rms_norm_htp_f32(const float * restrict src,
                             float * restrict       dst,
                             uint8_t * restrict     spad,
                             const uint32_t         num_rows,
                             const uint32_t         row_elems,
                             const size_t           row_size,
                             int32_t *              op_params,
                             int                    opt_path) {
    float epsilon = 0.f;
    memcpy(&epsilon, op_params, sizeof(float));

    for (uint32_t row = 0; row < num_rows; row++) {
        const uint32_t elem_off = row * row_elems;

        const float * restrict row_in  = src + elem_off;
        float * restrict       row_out = dst + elem_off;

        if (row + 1 < num_rows) {
            htp_l2fetch(row_in + row_elems, 1, row_size, row_size);
        }

        if (1 != opt_path) {
            // generic path: scalar mean/scale, vector helpers for the reductions
            float sum_sq = hvx_sum_of_squares_f32((const uint8_t *) row_in, row_elems);

            const float mean_sq = sum_sq / row_elems;
            const float rscale  = 1.0f / sqrtf(mean_sq + epsilon);

            hvx_scale_f32((uint8_t *) row_out, (const uint8_t *) row_in, row_elems, rscale);
            continue;
        }

        hvx_fast_rms_norm_f32((const uint8_t *) row_in, (uint8_t *) row_out, spad, row_elems, epsilon);
    }
}
|
|
142
|
+
|
|
143
|
+
static void unary_job_f32_per_thread(const struct htp_tensor * src,
|
|
144
|
+
struct htp_tensor * dst,
|
|
145
|
+
uint8_t * spad,
|
|
146
|
+
int htp_op,
|
|
147
|
+
int32_t * op_params,
|
|
148
|
+
uint32_t nth,
|
|
149
|
+
uint32_t ith,
|
|
150
|
+
uint32_t src0_nrows_per_thread) {
|
|
151
|
+
htp_unary_preamble;
|
|
152
|
+
|
|
153
|
+
const size_t src0_row_size = nb01;
|
|
154
|
+
const size_t dst_row_size = nb1;
|
|
155
|
+
|
|
156
|
+
const uint32_t src0_nrows = ne01 * ne02 * ne03; // src0 rows
|
|
157
|
+
|
|
158
|
+
const uint32_t src0_start_row = src0_nrows_per_thread * ith;
|
|
159
|
+
const uint32_t src0_end_row = MIN(src0_start_row + src0_nrows_per_thread, src0_nrows);
|
|
160
|
+
|
|
161
|
+
// no work for this thread
|
|
162
|
+
if (src0_start_row >= src0_end_row) {
|
|
163
|
+
return;
|
|
164
|
+
}
|
|
165
|
+
|
|
166
|
+
uint64_t t1, t2;
|
|
167
|
+
t1 = HAP_perf_get_qtimer_count();
|
|
168
|
+
|
|
169
|
+
int is_aligned = 1;
|
|
170
|
+
int opt_path = 0;
|
|
171
|
+
if ((0 == htp_is_aligned((void *) src->data, VLEN)) || (0 == htp_is_aligned((void *) dst->data, VLEN))) {
|
|
172
|
+
is_aligned = 0;
|
|
173
|
+
FARF(HIGH, "unary-f32: unaligned addresses in unary op, possibly slower execution\n");
|
|
174
|
+
}
|
|
175
|
+
if ((1 == is_aligned) && !(nb01 & (VLEN - 1))) {
|
|
176
|
+
opt_path = 1;
|
|
177
|
+
}
|
|
178
|
+
|
|
179
|
+
const uint8_t * restrict data_src = (const uint8_t *) src->data;
|
|
180
|
+
uint8_t * restrict data_dst = (uint8_t *) dst->data;
|
|
181
|
+
|
|
182
|
+
const float * restrict src_th = (float *) (data_src + (src0_start_row * src0_row_size));
|
|
183
|
+
float * restrict dst_th = (float *) (data_dst + (src0_start_row * dst_row_size));
|
|
184
|
+
uint8_t * restrict spad_th = (uint8_t *) spad + (ith * nb01);
|
|
185
|
+
|
|
186
|
+
switch (htp_op) {
|
|
187
|
+
case HTP_OP_RMS_NORM:
|
|
188
|
+
rms_norm_htp_f32(src_th, dst_th, spad_th, src0_end_row - src0_start_row, ne0, nb1, op_params, opt_path);
|
|
189
|
+
break;
|
|
190
|
+
case HTP_OP_SCALE:
|
|
191
|
+
scale_htp_f32(src_th, dst_th, spad_th, src0_end_row - src0_start_row, ne0, nb1, op_params, opt_path);
|
|
192
|
+
break;
|
|
193
|
+
|
|
194
|
+
default:
|
|
195
|
+
break;
|
|
196
|
+
}
|
|
197
|
+
|
|
198
|
+
t2 = HAP_perf_get_qtimer_count();
|
|
199
|
+
|
|
200
|
+
FARF(HIGH, "unary-f32 %d/%d/%d: %ux%ux%ux%u (%u:%u) -> %ux%ux%ux%u usec %u\n", ith, nth, opt_path, src->ne[0],
|
|
201
|
+
src->ne[1], src->ne[2], src->ne[3], src0_start_row, src0_end_row, dst->ne[0], dst->ne[1], dst->ne[2],
|
|
202
|
+
dst->ne[3], (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
|
|
203
|
+
}
|
|
204
|
+
|
|
205
|
+
static void unary_job_dispatcher_f32(unsigned int n, unsigned int i, void * data) {
|
|
206
|
+
struct htp_ops_context * octx = (struct htp_ops_context *) data;
|
|
207
|
+
|
|
208
|
+
unary_job_f32_per_thread(&octx->src0, &octx->dst, octx->src0_spad.data, octx->op, octx->op_params, n, i,
|
|
209
|
+
octx->src0_nrows_per_thread);
|
|
210
|
+
}
|
|
211
|
+
|
|
212
|
+
static int execute_op_unary_f32(struct htp_ops_context * octx) {
|
|
213
|
+
int err = HTP_STATUS_OK;
|
|
214
|
+
|
|
215
|
+
const struct htp_tensor * src0 = &octx->src0;
|
|
216
|
+
struct htp_tensor * dst = &octx->dst;
|
|
217
|
+
|
|
218
|
+
worker_callback_t unary_op_func;
|
|
219
|
+
const char * op_type = NULL;
|
|
220
|
+
|
|
221
|
+
switch (octx->op) {
|
|
222
|
+
case HTP_OP_RMS_NORM:
|
|
223
|
+
unary_op_func = unary_job_dispatcher_f32;
|
|
224
|
+
op_type = "rmsnorm-f32";
|
|
225
|
+
break;
|
|
226
|
+
case HTP_OP_SCALE:
|
|
227
|
+
unary_op_func = unary_job_dispatcher_f32;
|
|
228
|
+
op_type = "scale-f32";
|
|
229
|
+
break;
|
|
230
|
+
|
|
231
|
+
default:
|
|
232
|
+
FARF(ERROR, "Unsupported unary Op %u\n", octx->op);
|
|
233
|
+
return HTP_STATUS_NO_SUPPORT;
|
|
234
|
+
}
|
|
235
|
+
|
|
236
|
+
const int n_threads = octx->n_threads;
|
|
237
|
+
const uint32_t src0_nrows = src0->ne[1] * src0->ne[2] * src0->ne[3];
|
|
238
|
+
|
|
239
|
+
const size_t src0_row_size = src0->nb[1];
|
|
240
|
+
const size_t dst_row_size = dst->nb[1];
|
|
241
|
+
|
|
242
|
+
// VTCM scratchpads for all tensors
|
|
243
|
+
octx->dst_spad.size = htp_round_up(dst_row_size, 128) * n_threads;
|
|
244
|
+
octx->src0_spad.size = htp_round_up(src0_row_size, 128) * n_threads;
|
|
245
|
+
|
|
246
|
+
size_t spad_size = octx->src0_spad.size + octx->dst_spad.size;
|
|
247
|
+
|
|
248
|
+
FARF(HIGH, "%s: (%ux%ux%ux%u) -> (%ux%ux%ux%u) : src0-spad-size %u src1-spad-size %u dst-spad-size %u\n", op_type,
|
|
249
|
+
src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3],
|
|
250
|
+
octx->src0_spad.size, octx->src1_spad.size, octx->dst_spad.size);
|
|
251
|
+
|
|
252
|
+
// Make sure the reserved vtcm size is sufficient
|
|
253
|
+
if (octx->ctx->vtcm_size < spad_size) {
|
|
254
|
+
FARF(ERROR, "unary-%s : current VTCM reservation %zu is too small, needed %zu\n", op_type, octx->ctx->vtcm_size,
|
|
255
|
+
spad_size);
|
|
256
|
+
return HTP_STATUS_VTCM_TOO_SMALL;
|
|
257
|
+
}
|
|
258
|
+
|
|
259
|
+
octx->src0_spad.data = octx->ctx->vtcm_base;
|
|
260
|
+
octx->dst_spad.data = octx->src0_spad.data + octx->src0_spad.size;
|
|
261
|
+
|
|
262
|
+
if (!(octx->flags & HTP_OPFLAGS_SKIP_COMPUTE)) {
|
|
263
|
+
uint32_t n_jobs = MIN(n_threads, src0_nrows);
|
|
264
|
+
|
|
265
|
+
octx->src0_nrows_per_thread = (src0_nrows + n_jobs - 1) / n_jobs;
|
|
266
|
+
|
|
267
|
+
worker_pool_run_func(octx->ctx->worker_pool, unary_op_func, octx, n_jobs);
|
|
268
|
+
}
|
|
269
|
+
|
|
270
|
+
return err;
|
|
271
|
+
}
|
|
272
|
+
|
|
273
|
+
int op_unary(struct htp_ops_context * octx) {
|
|
274
|
+
int err = HTP_STATUS_OK;
|
|
275
|
+
|
|
276
|
+
switch (octx->src0.type) {
|
|
277
|
+
case HTP_TYPE_F32:
|
|
278
|
+
err = execute_op_unary_f32(octx);
|
|
279
|
+
break;
|
|
280
|
+
|
|
281
|
+
default:
|
|
282
|
+
err = HTP_STATUS_NO_SUPPORT;
|
|
283
|
+
break;
|
|
284
|
+
}
|
|
285
|
+
|
|
286
|
+
return err;
|
|
287
|
+
}
|
|
@@ -0,0 +1,297 @@
|
|
|
1
|
+
#include "worker-pool.h"
|
|
2
|
+
|
|
3
|
+
#include <qurt.h>
|
|
4
|
+
#include <stdatomic.h>
|
|
5
|
+
#include <stdint.h>
|
|
6
|
+
#include <stdio.h>
|
|
7
|
+
#include <stdlib.h>
|
|
8
|
+
#include <string.h>
|
|
9
|
+
|
|
10
|
+
#ifdef HTP_DEBUG
|
|
11
|
+
# define FARF_HIGH 1
|
|
12
|
+
#endif
|
|
13
|
+
|
|
14
|
+
#include "HAP_farf.h"
|
|
15
|
+
|
|
16
|
+
#define WORKER_THREAD_STACK_SZ (2 * 16384)
|
|
17
|
+
#define LOWEST_USABLE_QURT_PRIO (254)
|
|
18
|
+
|
|
19
|
+
struct worker_pool_s;
|
|
20
|
+
|
|
21
|
+
// Per-worker context; a pointer to it is handed to the worker thread's entry point.
typedef struct {
    // pool this worker belongs to
    struct worker_pool_s * pool;
    // index of this worker within the pool
    unsigned int id;
} worker_context_t;
|
|
26
|
+
|
|
27
|
+
// internal structure kept in thread-local storage per instance of worker pool
|
|
28
|
+
typedef struct worker_pool_s {
|
|
29
|
+
worker_pool_job_t job[MAX_NUM_WORKERS]; // list of job descriptors
|
|
30
|
+
qurt_thread_t thread[MAX_NUM_WORKERS]; // thread ID's of the workers
|
|
31
|
+
worker_context_t context[MAX_NUM_WORKERS]; // worker contexts
|
|
32
|
+
void * stack[MAX_NUM_WORKERS]; // thread stack pointers
|
|
33
|
+
unsigned int n_threads; // number of workers in this pool
|
|
34
|
+
|
|
35
|
+
atomic_uint seqn; // seqno used to detect new jobs
|
|
36
|
+
atomic_uint next_job; // next job index
|
|
37
|
+
atomic_uint n_pending; // number of pending jobs
|
|
38
|
+
atomic_uint n_jobs; // number of current jobs
|
|
39
|
+
atomic_bool killed; // threads need to exit
|
|
40
|
+
} worker_pool_t;
|
|
41
|
+
|
|
42
|
+
static void worker_pool_main(void * context) {
|
|
43
|
+
worker_context_t * me = (worker_context_t *) context;
|
|
44
|
+
worker_pool_t * pool = me->pool;
|
|
45
|
+
|
|
46
|
+
FARF(HIGH, "worker-pool: thread %u started", me->id);
|
|
47
|
+
|
|
48
|
+
unsigned int prev_seqn = 0;
|
|
49
|
+
while (!atomic_load(&pool->killed)) {
|
|
50
|
+
unsigned int seqn = atomic_load(&pool->seqn);
|
|
51
|
+
if (seqn == prev_seqn) {
|
|
52
|
+
// Nothing to do
|
|
53
|
+
qurt_futex_wait(&pool->seqn, prev_seqn);
|
|
54
|
+
continue;
|
|
55
|
+
}
|
|
56
|
+
|
|
57
|
+
// New job
|
|
58
|
+
prev_seqn = seqn;
|
|
59
|
+
|
|
60
|
+
unsigned int n = atomic_load(&pool->n_jobs);
|
|
61
|
+
unsigned int i = atomic_fetch_add(&pool->next_job, 1);
|
|
62
|
+
if (i >= n) {
|
|
63
|
+
// Spurios wakeup
|
|
64
|
+
continue;
|
|
65
|
+
}
|
|
66
|
+
|
|
67
|
+
pool->job[i].func(n, i, pool->job[i].data);
|
|
68
|
+
|
|
69
|
+
atomic_fetch_sub(&pool->n_pending, 1);
|
|
70
|
+
}
|
|
71
|
+
|
|
72
|
+
FARF(HIGH, "worker-pool: thread %u stopped", me->id);
|
|
73
|
+
}
|
|
74
|
+
|
|
75
|
+
// Creates a worker pool with n_threads worker threads, each with a stack of stack_size bytes.
//
// One heap block holds all thread stacks followed by the pool structure itself; the block
// starts at stack[0], which is what worker_pool_release() frees.
//
// Parameters:
//   context    - receives the opaque pool handle on success; must not be NULL
//   n_threads  - number of workers, 1..MAX_NUM_WORKERS (the pool's arrays have
//                MAX_NUM_WORKERS slots)
//   stack_size - stack size in bytes for every worker
//
// Returns AEE_SUCCESS, or AEE_EBADPARM (bad arguments), AEE_ENOMEMORY (allocation failed),
// AEE_EQURTTHREADCREATE (a worker could not be started; already started workers are
// stopped and the memory is released).
AEEResult worker_pool_init_with_stack_size(worker_pool_context_t * context, uint32_t n_threads, uint32_t stack_size) {
    int err = 0;

    if (NULL == context) {
        FARF(ERROR, "NULL context passed to worker_pool_init().");
        return AEE_EBADPARM;
    }

    // The per-worker arrays in worker_pool_t are fixed-size; zero workers would also leave
    // stack[0] unset, which worker_pool_release() relies on.
    if (n_threads == 0 || n_threads > MAX_NUM_WORKERS) {
        FARF(ERROR, "worker-pool: invalid number of threads %u (max %u)", (unsigned int) n_threads,
             (unsigned int) MAX_NUM_WORKERS);
        return AEE_EBADPARM;
    }

    // Guard the size computation against wrap-around
    if (stack_size > (SIZE_MAX - sizeof(worker_pool_t)) / n_threads) {
        FARF(ERROR, "worker-pool: invalid stack size %u", (unsigned int) stack_size);
        return AEE_EBADPARM;
    }

    // Allocations
    const size_t stacks_size = (size_t) stack_size * n_threads;
    const size_t size        = stacks_size + sizeof(worker_pool_t);

    unsigned char * mem_blob = (unsigned char *) malloc(size);
    if (!mem_blob) {
        FARF(ERROR, "Could not allocate memory for worker pool!!");
        return AEE_ENOMEMORY;
    }

    // NOTE(review): the pool structure sits right behind the stacks, so its alignment
    // depends on stack_size being a suitably aligned value - confirm for custom sizes.
    worker_pool_t * me = (worker_pool_t *) (mem_blob + stacks_size);

    // name for the first worker, useful in debugging threads:
    // "0x" + 8 hex chars + ":" + "worker0" = 18 chars, the worker digit is name[17]
    char name[19];
    snprintf(name, sizeof(name), "0x%8x:worker0", (unsigned int) (uintptr_t) me);
    me->n_threads = n_threads;

    // initializations
    for (unsigned int i = 0; i < me->n_threads; i++) {
        me->stack[i]  = NULL;
        me->thread[i] = 0;

        me->context[i].id   = i;
        me->context[i].pool = me;
    }

    // initialize job queue
    me->n_pending = 0;
    me->n_jobs    = 0;
    me->next_job  = 0;
    me->seqn      = 0;
    me->killed    = 0;

    // priority - by default, match the creating thread's prio (clamped to the usable range)
    int prio = qurt_thread_get_priority(qurt_thread_get_id());
    if (prio < 1) {
        prio = 1;
    }
    if (prio > LOWEST_USABLE_QURT_PRIO) {
        prio = LOWEST_USABLE_QURT_PRIO;
    }

    // launch the workers
    qurt_thread_attr_t attr;
    qurt_thread_attr_init(&attr);

    for (unsigned int i = 0; i < me->n_threads; i++) {
        // set up stack
        me->stack[i] = mem_blob;
        mem_blob += stack_size;
        qurt_thread_attr_set_stack_addr(&attr, me->stack[i]);
        qurt_thread_attr_set_stack_size(&attr, stack_size);

        // set up name
        qurt_thread_attr_set_name(&attr, name);
        name[17] = (name[17] + 1);
        // name threads context:worker0, context:worker1, .. (recycle at 9, but num threads should be less than that anyway)
        if (name[17] > '9') {
            name[17] = '0';
        }

        qurt_thread_attr_set_priority(&attr, prio);

        // launch
        err = qurt_thread_create(&me->thread[i], &attr, worker_pool_main, (void *) &me->context[i]);
        if (err) {
            FARF(ERROR, "Could not launch worker threads!");
            // stop the workers that did start and free the blob
            worker_pool_context_t partial = (worker_pool_context_t) me;
            worker_pool_release(&partial);
            return AEE_EQURTTHREADCREATE;
        }
    }
    *context = (worker_pool_context_t *) me;
    return AEE_SUCCESS;
}
|
|
158
|
+
|
|
159
|
+
// Creates a worker pool whose workers use the default stack size (WORKER_THREAD_STACK_SZ).
// Arguments and return value are those of worker_pool_init_with_stack_size().
AEEResult worker_pool_init(worker_pool_context_t * context, uint32_t n_threads) {
    const uint32_t default_stack_size = WORKER_THREAD_STACK_SZ;

    return worker_pool_init_with_stack_size(context, n_threads, default_stack_size);
}
|
|
162
|
+
|
|
163
|
+
// clean up worker pool
|
|
164
|
+
// Stops all workers of the pool referenced by *context, frees the pool memory and sets
// *context to NULL. Does nothing when context or *context is NULL.
void worker_pool_release(worker_pool_context_t * context) {
    // tolerate a NULL handle pointer, same as worker_pool_init() rejects it
    if (NULL == context) {
        return;
    }

    worker_pool_t * me = (worker_pool_t *) *context;

    // if no worker pool exists, there is nothing to release
    if (NULL == me) {
        return;
    }

    // mark the pool as killed, then bump the sequence number and wake every worker
    atomic_store(&me->killed, 1);
    atomic_fetch_add(&me->seqn, 1);
    qurt_futex_wake(&me->seqn, me->n_threads);

    // de-initializations: wait for every worker that was actually created
    for (unsigned int i = 0; i < me->n_threads; i++) {
        if (me->thread[i]) {
            int status;
            (void) qurt_thread_join(me->thread[i], &status);
        }
    }

    // free allocated memory (stacks and pool were allocated as a single buffer starting at
    // stack[0]); `me` itself lives inside that buffer and must not be used afterwards
    free(me->stack[0]);

    *context = NULL;
}
|
|
191
|
+
|
|
192
|
+
// run jobs
|
|
193
|
+
// Runs n jobs and returns once all of them have finished.
//
// The calling thread executes job #0 itself; jobs #1..n-1 are picked up by the workers.
// After running job #0 the caller spins until the pending counter drops to zero.
//
// Parameters:
//   context - pool handle from worker_pool_init*()
//   job     - array of n job descriptors (copied into the pool)
//   n       - number of jobs, at most the pool's number of threads
//
// Returns AEE_SUCCESS (also for n == 0, where nothing is run), or AEE_EBADPARM for a NULL
// context, a NULL job array, or n larger than the number of threads.
AEEResult worker_pool_run_jobs(worker_pool_context_t context, worker_pool_job_t * job, unsigned int n) {
    worker_pool_t * me = (worker_pool_t *) context;
    if (NULL == me) {
        FARF(ERROR, "worker-pool: invalid context");
        return AEE_EBADPARM;
    }

    if (n > me->n_threads) {
        FARF(ERROR, "worker-pool: invalid number of jobs %u for n-threads %u", n, me->n_threads);
        return AEE_EBADPARM;
    }

    // nothing to run: do not touch job[0], it may be stale or never initialized
    if (n == 0) {
        return AEE_SUCCESS;
    }

    if (NULL == job) {
        FARF(ERROR, "worker-pool: invalid job list");
        return AEE_EBADPARM;
    }

    memcpy(me->job, job, sizeof(worker_pool_job_t) * n);

    if (n > 1) {
        // job #0 is reserved for the calling thread, workers start claiming at index 1
        atomic_store(&me->next_job, 1);
        atomic_store(&me->n_jobs, n);
        atomic_store(&me->n_pending, n - 1);

        // wake up workers
        atomic_fetch_add(&me->seqn, 1);
        qurt_futex_wake(&me->seqn, n - 1);
    }

    // main thread runs job #0
    me->job[0].func(n, 0, me->job[0].data);

    if (n > 1) {
        // busy-wait until every worker job has completed
        while (atomic_load(&me->n_pending)) {
        }
    }

    return AEE_SUCCESS;
}
|
|
227
|
+
|
|
228
|
+
// run func
|
|
229
|
+
// Runs the same callback `func` with the same `data` as n jobs on the pool.
//
// Builds n identical job descriptors and forwards them to worker_pool_run_jobs(); the
// return value is that function's result. n must not exceed MAX_NUM_WORKERS
// (AEE_EBADPARM otherwise) - no pool can have more threads than that.
AEEResult worker_pool_run_func(worker_pool_context_t context, worker_callback_t func, void * data, unsigned int n) {
    // fixed-size array instead of a VLA: job[n] is undefined for n == 0 and unbounded otherwise
    worker_pool_job_t job[MAX_NUM_WORKERS];

    if (n > MAX_NUM_WORKERS) {
        FARF(ERROR, "worker-pool: invalid number of jobs %u (max %u)", n, (unsigned int) MAX_NUM_WORKERS);
        return AEE_EBADPARM;
    }

    for (unsigned int i = 0; i < n; i++) {
        job[i].func = func;
        job[i].data = data;
    }

    return worker_pool_run_jobs(context, job, n);
}
|
|
239
|
+
|
|
240
|
+
// Sets the priority of every worker thread in the pool.
//
// prio is clamped to [1, LOWEST_USABLE_QURT_PRIO] before it is applied. All threads are
// attempted even if one of them fails.
// Returns AEE_ENOMORE when context is NULL, AEE_EBADPARM when setting the priority failed
// for at least one thread, AEE_SUCCESS otherwise.
AEEResult worker_pool_set_thread_priority(worker_pool_context_t context, unsigned int prio) {
    worker_pool_t * pool = (worker_pool_t *) context;

    // if no worker pool exists, return error.
    if (NULL == pool) {
        return AEE_ENOMORE;
    }

    // clamp the requested priority to the usable range
    unsigned int clamped = prio;
    if (clamped < 1) {
        clamped = 1;
    } else if (clamped > LOWEST_USABLE_QURT_PRIO) {
        clamped = LOWEST_USABLE_QURT_PRIO;
    }

    int status = AEE_SUCCESS;
    for (unsigned int t = 0; t < pool->n_threads; t++) {
        const int rc = qurt_thread_set_priority(pool->thread[t], (unsigned short) clamped);
        if (rc == 0) {
            continue;
        }

        status = AEE_EBADPARM;
        FARF(ERROR, "QURT failed to set priority of thread %d, ERROR = %d", pool->thread[t], rc);
    }

    return status;
}
|
|
266
|
+
|
|
267
|
+
// Copies the thread IDs of all workers in the pool into tids.
//
// tids must have room for as many entries as the pool has threads.
// Returns AEE_SUCCESS, or AEE_EBADPARM when context or tids is NULL.
AEEResult worker_pool_retrieve_thread_id(worker_pool_context_t context, unsigned int * tids) {
    worker_pool_t * me = (worker_pool_t *) context;
    if (!me) {
        FARF(ERROR, "worker-pool: invalid context");
        return AEE_EBADPARM;
    }

    if (!tids) {
        FARF(ERROR, "worker-pool: invalid thread id buffer");
        return AEE_EBADPARM;
    }

    // unsigned index to match the type of n_threads
    for (unsigned int i = 0; i < me->n_threads; i++) {
        tids[i] = me->thread[i];
    }

    return AEE_SUCCESS;
}
|
|
281
|
+
|
|
282
|
+
// Reports the priority of the pool's first worker thread through *prio.
//
// Returns 0 and stores the priority when the queried value is positive; otherwise stores
// 0 and returns AEE_EBADSTATE. Returns AEE_EBADPARM when context is NULL.
AEEResult worker_pool_get_thread_priority(worker_pool_context_t context, unsigned int * prio) {
    worker_pool_t * pool = (worker_pool_t *) context;
    if (NULL == pool) {
        FARF(ERROR, "worker-pool: invalid context");
        return AEE_EBADPARM;
    }

    const int worker0_prio = qurt_thread_get_priority(pool->thread[0]);
    if (worker0_prio <= 0) {
        *prio = 0;
        return AEE_EBADSTATE;
    }

    *prio = worker0_prio;
    return 0;
}
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
#ifndef HTP_WORKER_POOL_H
#define HTP_WORKER_POOL_H

// Export marker: keeps the annotated functions visible when this code is
// built into a shared library.
#define WORKERPOOL_API __attribute__((visibility("default")))

#include <AEEStdDef.h>
#include <AEEStdErr.h>
#include <stdint.h>

#ifdef __cplusplus
extern "C" {
#endif

/// Callback type executed by worker threads.
/// NOTE(review): presumably `n` is the total number of jobs and `i` the index
/// of this invocation, with the last argument being the job's `data` pointer -
/// confirm against the worker thread loop.
typedef void (*worker_callback_t)(unsigned int n, unsigned int i, void *);

/// Opaque handle to a worker pool instance.
typedef void * worker_pool_context_t;

/// One unit of work: the callback to run and the pointer stored alongside it.
typedef struct {
    worker_callback_t func;  // callback to invoke
    void *            data;  // user pointer associated with this job
} worker_pool_job_t;

/// Upper bound on the number of worker threads supported.
#define MAX_NUM_WORKERS 10

// Create a worker pool with `n_threads` workers; the handle is returned
// through `context`.
WORKERPOOL_API AEEResult worker_pool_init(worker_pool_context_t * context, uint32_t n_threads);

// Same as worker_pool_init, but with a caller-chosen stack size for the workers.
WORKERPOOL_API AEEResult worker_pool_init_with_stack_size(worker_pool_context_t * context, uint32_t n_threads, uint32_t stack_size);

// Kill the worker threads and release the resources held by the pool.
WORKERPOOL_API void worker_pool_release(worker_pool_context_t * context);

// Run the `n` jobs in the `job` array on the worker pool.
WORKERPOOL_API AEEResult worker_pool_run_jobs(worker_pool_context_t context, worker_pool_job_t * job, unsigned int n);

// Convenience form of worker_pool_run_jobs: runs `n` jobs that all share the
// same callback `func` and the same `data` pointer.
WORKERPOOL_API AEEResult worker_pool_run_func(worker_pool_context_t context, worker_callback_t func, void * data, unsigned int n);

// Set the priority of every worker thread. The requested value is clamped to
// the usable QuRT priority range before being applied.
WORKERPOOL_API AEEResult worker_pool_set_thread_priority(worker_pool_context_t context, unsigned int prio);

// Read the priority of the pool's first worker thread into `*prio`.
WORKERPOOL_API AEEResult worker_pool_get_thread_priority(worker_pool_context_t context, unsigned int * prio);

// Copy the id of each worker thread into `tids`, one entry per thread.
WORKERPOOL_API AEEResult worker_pool_retrieve_thread_id(worker_pool_context_t context, unsigned int * tids);

#ifdef __cplusplus
}
#endif

#endif  // HTP_WORKER_POOL_H
|