whispercpp 1.3.4 → 1.3.5
This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +60 -43
- data/ext/extconf.rb +2 -2
- data/ext/ruby_whisper.c +14 -2
- data/ext/ruby_whisper.h +39 -0
- data/ext/ruby_whisper_context.c +22 -22
- data/ext/ruby_whisper_model.c +12 -12
- data/ext/ruby_whisper_params.c +47 -23
- data/ext/ruby_whisper_segment.c +84 -19
- data/ext/ruby_whisper_token.c +351 -0
- data/ext/ruby_whisper_transcribe.cpp +1 -1
- data/ext/ruby_whisper_vad_context.c +75 -0
- data/ext/ruby_whisper_vad_context_detect.cpp +50 -0
- data/ext/ruby_whisper_vad_segment.c +139 -0
- data/ext/ruby_whisper_vad_segments.c +106 -0
- data/ext/sources/CMakeLists.txt +4 -1
- data/ext/sources/bindings/javascript/package.json +1 -1
- data/ext/sources/cmake/arm64-apple-clang.cmake +16 -0
- data/ext/sources/cmake/arm64-windows-llvm.cmake +16 -0
- data/ext/sources/cmake/riscv64-spacemit-linux-gnu-gcc.cmake +29 -0
- data/ext/sources/cmake/x64-windows-llvm.cmake +5 -0
- data/ext/sources/examples/addon.node/vad-example.js +2 -2
- data/ext/sources/examples/cli/cli.cpp +121 -112
- data/ext/sources/examples/lsp/CMakeLists.txt +2 -1
- data/ext/sources/examples/quantize/CMakeLists.txt +2 -1
- data/ext/sources/examples/server/server.cpp +10 -11
- data/ext/sources/examples/talk-llama/CMakeLists.txt +5 -1
- data/ext/sources/examples/talk-llama/llama-adapter.cpp +12 -3
- data/ext/sources/examples/talk-llama/llama-adapter.h +7 -1
- data/ext/sources/examples/talk-llama/llama-arch.cpp +2046 -1974
- data/ext/sources/examples/talk-llama/llama-arch.h +67 -2
- data/ext/sources/examples/talk-llama/llama-batch.cpp +75 -33
- data/ext/sources/examples/talk-llama/llama-batch.h +17 -4
- data/ext/sources/examples/talk-llama/llama-chat.cpp +79 -3
- data/ext/sources/examples/talk-llama/llama-chat.h +4 -0
- data/ext/sources/examples/talk-llama/llama-context.cpp +775 -78
- data/ext/sources/examples/talk-llama/llama-context.h +57 -9
- data/ext/sources/examples/talk-llama/llama-cparams.h +1 -0
- data/ext/sources/examples/talk-llama/llama-grammar.cpp +288 -53
- data/ext/sources/examples/talk-llama/llama-grammar.h +22 -1
- data/ext/sources/examples/talk-llama/llama-graph.cpp +381 -64
- data/ext/sources/examples/talk-llama/llama-graph.h +103 -13
- data/ext/sources/examples/talk-llama/llama-hparams.cpp +26 -2
- data/ext/sources/examples/talk-llama/llama-hparams.h +41 -10
- data/ext/sources/examples/talk-llama/llama-impl.cpp +7 -3
- data/ext/sources/examples/talk-llama/llama-impl.h +1 -1
- data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.cpp +5 -3
- data/ext/sources/examples/talk-llama/llama-kv-cache.cpp +145 -65
- data/ext/sources/examples/talk-llama/llama-kv-cache.h +22 -7
- data/ext/sources/examples/talk-llama/llama-kv-cells.h +44 -2
- data/ext/sources/examples/talk-llama/llama-memory-hybrid.cpp +12 -10
- data/ext/sources/examples/talk-llama/llama-memory-recurrent.cpp +32 -19
- data/ext/sources/examples/talk-llama/llama-memory-recurrent.h +2 -2
- data/ext/sources/examples/talk-llama/llama-mmap.cpp +172 -37
- data/ext/sources/examples/talk-llama/llama-mmap.h +8 -3
- data/ext/sources/examples/talk-llama/llama-model-loader.cpp +91 -9
- data/ext/sources/examples/talk-llama/llama-model-loader.h +6 -0
- data/ext/sources/examples/talk-llama/llama-model-saver.cpp +3 -0
- data/ext/sources/examples/talk-llama/llama-model.cpp +1529 -13134
- data/ext/sources/examples/talk-llama/llama-model.h +44 -3
- data/ext/sources/examples/talk-llama/llama-quant.cpp +8 -23
- data/ext/sources/examples/talk-llama/llama-sampling.cpp +1294 -198
- data/ext/sources/examples/talk-llama/llama-sampling.h +19 -7
- data/ext/sources/examples/talk-llama/llama-vocab.cpp +133 -37
- data/ext/sources/examples/talk-llama/llama-vocab.h +45 -40
- data/ext/sources/examples/talk-llama/llama.cpp +729 -2
- data/ext/sources/examples/talk-llama/llama.h +152 -14
- data/ext/sources/examples/talk-llama/models/afmoe.cpp +191 -0
- data/ext/sources/examples/talk-llama/models/apertus.cpp +125 -0
- data/ext/sources/examples/talk-llama/models/arcee.cpp +135 -0
- data/ext/sources/examples/talk-llama/models/arctic.cpp +138 -0
- data/ext/sources/examples/talk-llama/models/arwkv7.cpp +86 -0
- data/ext/sources/examples/talk-llama/models/baichuan.cpp +122 -0
- data/ext/sources/examples/talk-llama/models/bailingmoe.cpp +144 -0
- data/ext/sources/examples/talk-llama/models/bailingmoe2.cpp +135 -0
- data/ext/sources/examples/talk-llama/models/bert.cpp +178 -0
- data/ext/sources/examples/talk-llama/models/bitnet.cpp +160 -0
- data/ext/sources/examples/talk-llama/models/bloom.cpp +101 -0
- data/ext/sources/examples/talk-llama/models/chameleon.cpp +178 -0
- data/ext/sources/examples/talk-llama/models/chatglm.cpp +132 -0
- data/ext/sources/examples/talk-llama/models/codeshell.cpp +111 -0
- data/ext/sources/examples/talk-llama/models/cogvlm.cpp +102 -0
- data/ext/sources/examples/talk-llama/models/cohere2-iswa.cpp +134 -0
- data/ext/sources/examples/talk-llama/models/command-r.cpp +122 -0
- data/ext/sources/examples/talk-llama/models/dbrx.cpp +123 -0
- data/ext/sources/examples/talk-llama/models/deci.cpp +135 -0
- data/ext/sources/examples/talk-llama/models/deepseek.cpp +144 -0
- data/ext/sources/examples/talk-llama/models/deepseek2.cpp +259 -0
- data/ext/sources/examples/talk-llama/models/dots1.cpp +134 -0
- data/ext/sources/examples/talk-llama/models/dream.cpp +105 -0
- data/ext/sources/examples/talk-llama/models/ernie4-5-moe.cpp +150 -0
- data/ext/sources/examples/talk-llama/models/ernie4-5.cpp +110 -0
- data/ext/sources/examples/talk-llama/models/exaone.cpp +114 -0
- data/ext/sources/examples/talk-llama/models/exaone4.cpp +123 -0
- data/ext/sources/examples/talk-llama/models/falcon-h1.cpp +113 -0
- data/ext/sources/examples/talk-llama/models/falcon.cpp +120 -0
- data/ext/sources/examples/talk-llama/models/gemma-embedding.cpp +116 -0
- data/ext/sources/examples/talk-llama/models/gemma.cpp +112 -0
- data/ext/sources/examples/talk-llama/models/gemma2-iswa.cpp +128 -0
- data/ext/sources/examples/talk-llama/models/gemma3.cpp +155 -0
- data/ext/sources/examples/talk-llama/models/gemma3n-iswa.cpp +384 -0
- data/ext/sources/examples/talk-llama/models/glm4-moe.cpp +170 -0
- data/ext/sources/examples/talk-llama/models/glm4.cpp +150 -0
- data/ext/sources/examples/talk-llama/models/gpt2.cpp +105 -0
- data/ext/sources/examples/talk-llama/models/gptneox.cpp +144 -0
- data/ext/sources/examples/talk-llama/models/granite-hybrid.cpp +196 -0
- data/ext/sources/examples/talk-llama/models/granite.cpp +211 -0
- data/ext/sources/examples/talk-llama/models/graph-context-mamba.cpp +283 -0
- data/ext/sources/examples/talk-llama/models/grok.cpp +159 -0
- data/ext/sources/examples/talk-llama/models/grovemoe.cpp +141 -0
- data/ext/sources/examples/talk-llama/models/hunyuan-dense.cpp +132 -0
- data/ext/sources/examples/talk-llama/models/hunyuan-moe.cpp +154 -0
- data/ext/sources/examples/talk-llama/models/internlm2.cpp +120 -0
- data/ext/sources/examples/talk-llama/models/jais.cpp +86 -0
- data/ext/sources/examples/talk-llama/models/jamba.cpp +106 -0
- data/ext/sources/examples/talk-llama/models/lfm2.cpp +175 -0
- data/ext/sources/examples/talk-llama/models/llada-moe.cpp +122 -0
- data/ext/sources/examples/talk-llama/models/llada.cpp +99 -0
- data/ext/sources/examples/talk-llama/models/llama-iswa.cpp +178 -0
- data/ext/sources/examples/talk-llama/models/llama.cpp +168 -0
- data/ext/sources/examples/talk-llama/models/maincoder.cpp +117 -0
- data/ext/sources/examples/talk-llama/models/mamba.cpp +55 -0
- data/ext/sources/examples/talk-llama/models/mimo2-iswa.cpp +123 -0
- data/ext/sources/examples/talk-llama/models/minicpm3.cpp +199 -0
- data/ext/sources/examples/talk-llama/models/minimax-m2.cpp +124 -0
- data/ext/sources/examples/talk-llama/models/mistral3.cpp +160 -0
- data/ext/sources/examples/talk-llama/models/models.h +569 -0
- data/ext/sources/examples/talk-llama/models/modern-bert.cpp +116 -0
- data/ext/sources/examples/talk-llama/models/mpt.cpp +126 -0
- data/ext/sources/examples/talk-llama/models/nemotron-h.cpp +150 -0
- data/ext/sources/examples/talk-llama/models/nemotron.cpp +122 -0
- data/ext/sources/examples/talk-llama/models/neo-bert.cpp +104 -0
- data/ext/sources/examples/talk-llama/models/olmo.cpp +121 -0
- data/ext/sources/examples/talk-llama/models/olmo2.cpp +150 -0
- data/ext/sources/examples/talk-llama/models/olmoe.cpp +124 -0
- data/ext/sources/examples/talk-llama/models/openai-moe-iswa.cpp +127 -0
- data/ext/sources/examples/talk-llama/models/openelm.cpp +124 -0
- data/ext/sources/examples/talk-llama/models/orion.cpp +123 -0
- data/ext/sources/examples/talk-llama/models/pangu-embedded.cpp +121 -0
- data/ext/sources/examples/talk-llama/models/phi2.cpp +121 -0
- data/ext/sources/examples/talk-llama/models/phi3.cpp +152 -0
- data/ext/sources/examples/talk-llama/models/plamo.cpp +110 -0
- data/ext/sources/examples/talk-llama/models/plamo2.cpp +316 -0
- data/ext/sources/examples/talk-llama/models/plamo3.cpp +128 -0
- data/ext/sources/examples/talk-llama/models/plm.cpp +168 -0
- data/ext/sources/examples/talk-llama/models/qwen.cpp +108 -0
- data/ext/sources/examples/talk-llama/models/qwen2.cpp +126 -0
- data/ext/sources/examples/talk-llama/models/qwen2moe.cpp +151 -0
- data/ext/sources/examples/talk-llama/models/qwen2vl.cpp +117 -0
- data/ext/sources/examples/talk-llama/models/qwen3.cpp +117 -0
- data/ext/sources/examples/talk-llama/models/qwen3moe.cpp +124 -0
- data/ext/sources/examples/talk-llama/models/qwen3next.cpp +873 -0
- data/ext/sources/examples/talk-llama/models/qwen3vl-moe.cpp +149 -0
- data/ext/sources/examples/talk-llama/models/qwen3vl.cpp +141 -0
- data/ext/sources/examples/talk-llama/models/refact.cpp +94 -0
- data/ext/sources/examples/talk-llama/models/rnd1.cpp +126 -0
- data/ext/sources/examples/talk-llama/models/rwkv6-base.cpp +162 -0
- data/ext/sources/examples/talk-llama/models/rwkv6.cpp +94 -0
- data/ext/sources/examples/talk-llama/models/rwkv6qwen2.cpp +86 -0
- data/ext/sources/examples/talk-llama/models/rwkv7-base.cpp +135 -0
- data/ext/sources/examples/talk-llama/models/rwkv7.cpp +90 -0
- data/ext/sources/examples/talk-llama/models/seed-oss.cpp +124 -0
- data/ext/sources/examples/talk-llama/models/smallthinker.cpp +126 -0
- data/ext/sources/examples/talk-llama/models/smollm3.cpp +128 -0
- data/ext/sources/examples/talk-llama/models/stablelm.cpp +146 -0
- data/ext/sources/examples/talk-llama/models/starcoder.cpp +100 -0
- data/ext/sources/examples/talk-llama/models/starcoder2.cpp +121 -0
- data/ext/sources/examples/talk-llama/models/t5-dec.cpp +166 -0
- data/ext/sources/examples/talk-llama/models/t5-enc.cpp +96 -0
- data/ext/sources/examples/talk-llama/models/wavtokenizer-dec.cpp +149 -0
- data/ext/sources/examples/talk-llama/models/xverse.cpp +108 -0
- data/ext/sources/examples/talk-llama/unicode.cpp +102 -16
- data/ext/sources/examples/vad-speech-segments/CMakeLists.txt +1 -1
- data/ext/sources/examples/whisper.wasm/index-tmpl.html +1 -1
- data/ext/sources/ggml/CMakeLists.txt +82 -54
- data/ext/sources/ggml/include/ggml-alloc.h +9 -0
- data/ext/sources/ggml/include/ggml-backend.h +4 -1
- data/ext/sources/ggml/include/ggml-cpu.h +1 -0
- data/ext/sources/ggml/include/ggml-hexagon.h +19 -0
- data/ext/sources/ggml/include/ggml-rpc.h +8 -11
- data/ext/sources/ggml/include/ggml-zendnn.h +22 -0
- data/ext/sources/ggml/include/ggml.h +190 -12
- data/ext/sources/ggml/src/CMakeLists.txt +82 -11
- data/ext/sources/ggml/src/ggml-alloc.c +124 -41
- data/ext/sources/ggml/src/ggml-backend-impl.h +1 -4
- data/ext/sources/ggml/src/ggml-backend-reg.cpp +27 -3
- data/ext/sources/ggml/src/ggml-backend.cpp +71 -21
- data/ext/sources/ggml/src/ggml-blas/CMakeLists.txt +17 -3
- data/ext/sources/ggml/src/ggml-blas/ggml-blas.cpp +5 -9
- data/ext/sources/ggml/src/ggml-cann/acl_tensor.cpp +57 -45
- data/ext/sources/ggml/src/ggml-cann/acl_tensor.h +138 -47
- data/ext/sources/ggml/src/ggml-cann/aclnn_ops.cpp +2179 -1696
- data/ext/sources/ggml/src/ggml-cann/aclnn_ops.h +238 -317
- data/ext/sources/ggml/src/ggml-cann/common.h +283 -208
- data/ext/sources/ggml/src/ggml-cann/ggml-cann.cpp +626 -776
- data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +156 -86
- data/ext/sources/ggml/src/ggml-cpu/amx/amx.cpp +1 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +4 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/arm/quants.c +428 -26
- data/ext/sources/ggml/src/ggml-cpu/arch/arm/repack.cpp +1004 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/loongarch/quants.c +4 -5
- data/ext/sources/ggml/src/ggml-cpu/arch/riscv/cpu-feats.cpp +38 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/riscv/quants.c +108 -49
- data/ext/sources/ggml/src/ggml-cpu/arch/s390/cpu-feats.cpp +50 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/x86/repack.cpp +6 -6
- data/ext/sources/ggml/src/ggml-cpu/arch-fallback.h +50 -2
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-impl.h +5 -3
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +195 -71
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.cpp +4 -0
- data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.cpp +573 -106
- data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.h +33 -44
- data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +298 -112
- data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm-ppc.h +333 -0
- data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +819 -125
- data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.h +6 -0
- data/ext/sources/ggml/src/ggml-cpu/ops.cpp +708 -431
- data/ext/sources/ggml/src/ggml-cpu/ops.h +5 -4
- data/ext/sources/ggml/src/ggml-cpu/repack.cpp +671 -31
- data/ext/sources/ggml/src/ggml-cpu/repack.h +14 -0
- data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +41 -43
- data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.cpp +3 -2
- data/ext/sources/ggml/src/ggml-cpu/unary-ops.cpp +151 -0
- data/ext/sources/ggml/src/ggml-cpu/unary-ops.h +7 -0
- data/ext/sources/ggml/src/ggml-cpu/vec.cpp +124 -1
- data/ext/sources/ggml/src/ggml-cpu/vec.h +261 -146
- data/ext/sources/ggml/src/ggml-cuda/CMakeLists.txt +72 -1
- data/ext/sources/ggml/src/ggml-cuda/argmax.cu +2 -2
- data/ext/sources/ggml/src/ggml-cuda/argsort.cu +123 -6
- data/ext/sources/ggml/src/ggml-cuda/argsort.cuh +16 -0
- data/ext/sources/ggml/src/ggml-cuda/binbcast.cu +1 -1
- data/ext/sources/ggml/src/ggml-cuda/common.cuh +353 -80
- data/ext/sources/ggml/src/ggml-cuda/convert.cuh +10 -0
- data/ext/sources/ggml/src/ggml-cuda/cpy-utils.cuh +1 -1
- data/ext/sources/ggml/src/ggml-cuda/cpy.cu +339 -246
- data/ext/sources/ggml/src/ggml-cuda/cpy.cuh +1 -5
- data/ext/sources/ggml/src/ggml-cuda/cumsum.cu +307 -0
- data/ext/sources/ggml/src/ggml-cuda/cumsum.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/diag.cu +77 -0
- data/ext/sources/ggml/src/ggml-cuda/diag.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +31 -21
- data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +663 -596
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cu +35 -741
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cuh +1241 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn-vec.cuh +30 -37
- data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +14 -13
- data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cuh +48 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn.cu +83 -37
- data/ext/sources/ggml/src/ggml-cuda/fill.cu +37 -0
- data/ext/sources/ggml/src/ggml-cuda/fill.cuh +3 -0
- data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +1155 -164
- data/ext/sources/ggml/src/ggml-cuda/mean.cu +5 -4
- data/ext/sources/ggml/src/ggml-cuda/mma.cuh +741 -48
- data/ext/sources/ggml/src/ggml-cuda/mmf.cu +60 -12
- data/ext/sources/ggml/src/ggml-cuda/mmf.cuh +381 -42
- data/ext/sources/ggml/src/ggml-cuda/mmid.cu +164 -0
- data/ext/sources/ggml/src/ggml-cuda/mmid.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/mmq.cu +69 -176
- data/ext/sources/ggml/src/ggml-cuda/mmq.cuh +498 -171
- data/ext/sources/ggml/src/ggml-cuda/mmvf.cu +375 -79
- data/ext/sources/ggml/src/ggml-cuda/mmvf.cuh +3 -2
- data/ext/sources/ggml/src/ggml-cuda/mmvq.cu +241 -95
- data/ext/sources/ggml/src/ggml-cuda/mmvq.cuh +1 -1
- data/ext/sources/ggml/src/ggml-cuda/pad.cu +64 -33
- data/ext/sources/ggml/src/ggml-cuda/quantize.cu +151 -0
- data/ext/sources/ggml/src/ggml-cuda/quantize.cuh +14 -0
- data/ext/sources/ggml/src/ggml-cuda/rope.cu +192 -77
- data/ext/sources/ggml/src/ggml-cuda/rope.cuh +2 -0
- data/ext/sources/ggml/src/ggml-cuda/set-rows.cu +101 -47
- data/ext/sources/ggml/src/ggml-cuda/set.cu +39 -0
- data/ext/sources/ggml/src/ggml-cuda/set.cuh +7 -0
- data/ext/sources/ggml/src/ggml-cuda/softmax.cu +203 -6
- data/ext/sources/ggml/src/ggml-cuda/solve_tri.cu +275 -0
- data/ext/sources/ggml/src/ggml-cuda/solve_tri.cuh +3 -0
- data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cu +14 -20
- data/ext/sources/ggml/src/ggml-cuda/ssm-scan.cu +49 -84
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq112-dv112.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq128-dv128.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq256-dv256.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq40-dv40.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq576-dv512.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq64-dv64.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq72-dv72.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq80-dv80.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq96-dv96.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +19 -1
- data/ext/sources/ggml/src/ggml-cuda/top-k.cu +96 -0
- data/ext/sources/ggml/src/ggml-cuda/top-k.cuh +3 -0
- data/ext/sources/ggml/src/ggml-cuda/topk-moe.cu +168 -76
- data/ext/sources/ggml/src/ggml-cuda/topk-moe.cuh +11 -4
- data/ext/sources/ggml/src/ggml-cuda/tri.cu +136 -0
- data/ext/sources/ggml/src/ggml-cuda/tri.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/unary.cu +105 -11
- data/ext/sources/ggml/src/ggml-cuda/unary.cuh +36 -0
- data/ext/sources/ggml/src/ggml-cuda/upscale.cu +163 -7
- data/ext/sources/ggml/src/ggml-cuda/vendors/cuda.h +4 -0
- data/ext/sources/ggml/src/ggml-cuda/vendors/hip.h +12 -1
- data/ext/sources/ggml/src/ggml-cuda/vendors/musa.h +6 -0
- data/ext/sources/ggml/src/ggml-hexagon/CMakeLists.txt +80 -0
- data/ext/sources/ggml/src/ggml-hexagon/ggml-hexagon.cpp +3151 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/CMakeLists.txt +44 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/act-ops.c +682 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/binary-ops.c +360 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/cmake-toolchain.cmake +157 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/flash-attn-ops.c +566 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/get-rows-ops.c +112 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ctx.h +35 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp-dma.c +63 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp-dma.h +157 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp-msg.h +165 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ops.h +92 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp_iface.idl +16 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-exp.c +94 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-inverse.c +72 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sigmoid.c +49 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-utils.c +1020 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-utils.h +1353 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/main.c +1001 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/matmul-ops.c +2503 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/ops-utils.h +149 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/rope-ops.c +487 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/set-rows-ops.c +168 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/softmax-ops.c +402 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/unary-ops.c +287 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/worker-pool.c +297 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/worker-pool.h +57 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp-utils.c +454 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp-utils.h +221 -0
- data/ext/sources/ggml/src/ggml-hexagon/op-desc.h +153 -0
- data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +8 -13
- data/ext/sources/ggml/src/ggml-impl.h +67 -6
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-common.cpp +2 -2
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.m +29 -20
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.cpp +652 -285
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.h +103 -56
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.m +496 -118
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +231 -9
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.cpp +1227 -224
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.h +12 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal.cpp +14 -8
- data/ext/sources/ggml/src/ggml-metal/ggml-metal.metal +1972 -704
- data/ext/sources/ggml/src/ggml-musa/CMakeLists.txt +3 -1
- data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +11 -0
- data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +1430 -120
- data/ext/sources/ggml/src/ggml-opencl/kernels/cvt.cl +63 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/expm1.cl +82 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/fill.cl +17 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/flash_attn_f32.cl +4 -3
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_mxfp4_f32.cl +162 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_mxfp4_f32.cl +156 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/get_rows.cl +36 -12
- data/ext/sources/ggml/src/ggml-opencl/kernels/mean.cl +39 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_kq_kqv.cl +273 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_l4_lm.cl +24 -10
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_f32_f32_l4_lm.cl +24 -10
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_l4_lm.cl +154 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/pad.cl +29 -20
- data/ext/sources/ggml/src/ggml-opencl/kernels/rms_norm.cl +25 -10
- data/ext/sources/ggml/src/ggml-opencl/kernels/rope.cl +50 -24
- data/ext/sources/ggml/src/ggml-opencl/kernels/set_rows.cl +35 -16
- data/ext/sources/ggml/src/ggml-opencl/kernels/softplus.cl +88 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/sqr.cl +53 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/sqrt.cl +53 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/ssm_conv.cl +77 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/transpose.cl +13 -0
- data/ext/sources/ggml/src/ggml-rpc/ggml-rpc.cpp +438 -156
- data/ext/sources/ggml/src/ggml-sycl/CMakeLists.txt +48 -3
- data/ext/sources/ggml/src/ggml-sycl/add-id.cpp +77 -0
- data/ext/sources/ggml/src/ggml-sycl/add-id.hpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/backend.hpp +6 -0
- data/ext/sources/ggml/src/ggml-sycl/binbcast.cpp +0 -9
- data/ext/sources/ggml/src/ggml-sycl/binbcast.hpp +0 -6
- data/ext/sources/ggml/src/ggml-sycl/common.hpp +117 -15
- data/ext/sources/ggml/src/ggml-sycl/concat.cpp +55 -44
- data/ext/sources/ggml/src/ggml-sycl/convert.cpp +34 -0
- data/ext/sources/ggml/src/ggml-sycl/count-equal.cpp +79 -0
- data/ext/sources/ggml/src/ggml-sycl/count-equal.hpp +9 -0
- data/ext/sources/ggml/src/ggml-sycl/cpy.cpp +0 -3
- data/ext/sources/ggml/src/ggml-sycl/dequantize.hpp +18 -0
- data/ext/sources/ggml/src/ggml-sycl/dpct/helper.hpp +76 -3
- data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +333 -300
- data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +10 -2
- data/ext/sources/ggml/src/ggml-sycl/ggml-sycl.cpp +335 -110
- data/ext/sources/ggml/src/ggml-sycl/mmvq.cpp +22 -0
- data/ext/sources/ggml/src/ggml-sycl/norm.cpp +156 -0
- data/ext/sources/ggml/src/ggml-sycl/norm.hpp +2 -0
- data/ext/sources/ggml/src/ggml-sycl/pad.cpp +97 -0
- data/ext/sources/ggml/src/ggml-sycl/pad.hpp +24 -0
- data/ext/sources/ggml/src/ggml-sycl/pad_reflect_1d.cpp +100 -0
- data/ext/sources/ggml/src/ggml-sycl/pad_reflect_1d.hpp +10 -0
- data/ext/sources/ggml/src/ggml-sycl/presets.hpp +2 -0
- data/ext/sources/ggml/src/ggml-sycl/repeat_back.cpp +76 -0
- data/ext/sources/ggml/src/ggml-sycl/repeat_back.hpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/roll.cpp +122 -0
- data/ext/sources/ggml/src/ggml-sycl/roll.hpp +20 -0
- data/ext/sources/ggml/src/ggml-sycl/rope.cpp +30 -17
- data/ext/sources/ggml/src/ggml-sycl/set.cpp +73 -0
- data/ext/sources/ggml/src/ggml-sycl/set.hpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/softmax.cpp +327 -162
- data/ext/sources/ggml/src/ggml-sycl/softmax.hpp +4 -0
- data/ext/sources/ggml/src/ggml-sycl/ssm_conv.cpp +127 -0
- data/ext/sources/ggml/src/ggml-sycl/ssm_conv.hpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +58 -0
- data/ext/sources/ggml/src/ggml-vulkan/CMakeLists.txt +38 -18
- data/ext/sources/ggml/src/ggml-vulkan/ggml-vulkan.cpp +5013 -2859
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/abs.comp +21 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/acc.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add1.comp +28 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add_id.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/arange.comp +20 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argmax.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp +33 -26
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argsort_large.comp +114 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/ceil.comp +22 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/clamp.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/concat.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/contig_copy.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_dw.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +47 -49
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +3 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +4 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_transpose.comp +67 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/cos.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/count_equal.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/count_experts.comp +51 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/cumsum.comp +83 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/cumsum_multipass1.comp +60 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/cumsum_multipass2.comp +66 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_f32.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{dequant_funcs.comp → dequant_funcs.glsl} +9 -21
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{dequant_funcs_cm2.comp → dequant_funcs_cm2.glsl} +18 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{dequant_head.comp → dequant_head.glsl} +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_m.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_s.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_s.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xs.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xxs.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_s.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_xxs.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_nl.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_xs.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_mxfp4.comp +3 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q2_k.comp +3 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q3_k.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_0.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_1.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_k.comp +3 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_0.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_1.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_k.comp +3 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q6_k.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q8_0.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/diag.comp +29 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/diag_mask_inf.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/div.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/exp.comp +3 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/fill.comp +19 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +39 -17
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{flash_attn_base.comp → flash_attn_base.glsl} +19 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +45 -7
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +50 -12
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/floor.comp +22 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu_erf.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu_quick.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gelu.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gelu_erf.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gelu_quick.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{generic_binary_head.comp → generic_binary_head.glsl} +17 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{generic_head.comp → generic_head.glsl} +2 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{generic_unary_head.comp → generic_unary_head.glsl} +7 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/get_rows.comp +4 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/get_rows_quant.comp +3 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{glu_head.comp → glu_head.glsl} +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/group_norm.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/hardsigmoid.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/hardswish.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +19 -7
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col_3d.comp +2 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/l2_norm.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/leaky_relu.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/log.comp +18 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{mul_mat_vec_base.comp → mul_mat_vec_base.glsl} +70 -25
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iface.glsl +35 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_m.comp +71 -21
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_s.comp +41 -25
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_s.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xs.comp +44 -26
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xxs.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_s.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_xxs.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_nc.comp +9 -7
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_p021.comp +9 -7
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp +4 -6
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q3_k.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp +4 -6
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp +4 -6
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp +39 -36
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq_funcs.glsl +494 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +78 -103
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +34 -23
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{mul_mm_funcs.comp → mul_mm_funcs.glsl} +69 -59
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_id_funcs.glsl +72 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +88 -228
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.glsl +454 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_shmem_types.glsl +78 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp +97 -13
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/neg.comp +20 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/norm.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_adamw.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_sgd.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/pad.comp +21 -6
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/pool2d.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/quantize_q8_1.comp +10 -10
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/reglu.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/relu.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/repeat.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/repeat_back.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +50 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_back.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_partials.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/roll.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl +234 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.glsl +20 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +6 -50
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +6 -33
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +6 -33
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl +28 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_vision.comp +6 -39
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/round.comp +29 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sigmoid.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/silu.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/silu_back.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sin.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_back.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large1.comp +62 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large2.comp +79 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large3.comp +65 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large_common.glsl +53 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/softplus.comp +23 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/solve_tri.comp +81 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sqrt.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/square.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/ssm_conv.comp +44 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/ssm_scan.comp +124 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/step.comp +22 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sub.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.comp +2 -25
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.glsl +25 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/swiglu.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/swiglu_oai.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/tanh.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/timestep_embedding.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/topk_argsort.comp +118 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/topk_moe.comp +213 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/topk_nary_search.comp +246 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/tri.comp +43 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/trunc.comp +22 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{types.comp → types.glsl} +345 -26
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +90 -12
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +335 -151
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/xielu.comp +35 -0
- data/ext/sources/ggml/src/ggml-webgpu/CMakeLists.txt +28 -2
- data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp +169 -0
- data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu.cpp +1964 -435
- data/ext/sources/ggml/src/ggml-webgpu/pre_wgsl.hpp +778 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/bin_op.tmpl.wgsl +188 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cpy.tmpl.wgsl +101 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +33 -10
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn.wgsl +591 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/get_rows.tmpl.wgsl +1 -1
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/glu.tmpl.wgsl +323 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl +6 -6
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl +97 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.tmpl.wgsl +247 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_subgroup_matrix.tmpl.wgsl +302 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.tmpl.wgsl +267 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm.wgsl +83 -17
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rope.tmpl.wgsl +295 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/scale.tmpl.wgsl +90 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.tmpl.wgsl +112 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/soft_max.tmpl.wgsl +345 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/unary_op.wgsl +483 -0
- data/ext/sources/ggml/src/ggml-zendnn/CMakeLists.txt +92 -0
- data/ext/sources/ggml/src/ggml-zendnn/ggml-zendnn.cpp +466 -0
- data/ext/sources/ggml/src/ggml.c +425 -33
- data/ext/sources/include/whisper.h +1 -0
- data/ext/sources/src/CMakeLists.txt +3 -1
- data/ext/sources/src/whisper.cpp +101 -35
- data/ext/sources/tests/CMakeLists.txt +2 -2
- data/ext/sources/tests/test-vad-full.cpp +4 -2
- data/ext/sources/tests/test-vad.cpp +1 -1
- data/extsources.rb +1 -0
- data/lib/whisper/model/uri.rb +17 -18
- data/sig/whisper.rbs +119 -2
- data/test/test_params.rb +16 -8
- data/test/test_segment.rb +0 -1
- data/test/test_token.rb +70 -0
- data/test/test_vad.rb +1 -1
- data/test/test_vad_context.rb +50 -0
- data/test/test_vad_segment.rb +19 -0
- data/test/test_vad_segments.rb +16 -0
- data/test/test_whisper.rb +7 -0
- data/whispercpp.gemspec +1 -1
- metadata +287 -34
- data/ext/sources/build-xcframework.sh +0 -571
- data/ext/sources/ggml/src/ggml-cann/Doxyfile +0 -2579
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.comp +0 -105
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.comp +0 -55
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/add.tmpl.wgsl +0 -44
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/add_in_place.tmpl.wgsl +0 -41
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl +0 -60
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul.tmpl.wgsl +0 -44
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_in_place.tmpl.wgsl +0 -41
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm_in_place.wgsl +0 -48
- /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{test_bfloat16_support.comp → feature-tests/bfloat16.comp} +0 -0
- /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{test_coopmat_support.comp → feature-tests/coopmat.comp} +0 -0
- /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{test_coopmat2_support.comp → feature-tests/coopmat2.comp} +0 -0
- /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{test_integer_dot_support.comp → feature-tests/integer_dot.comp} +0 -0
- /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{glu_main.comp → glu_main.glsl} +0 -0
- /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{rte.comp → rte.glsl} +0 -0
- /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{utils.comp → utils.glsl} +0 -0
|
@@ -0,0 +1,1001 @@
|
|
|
1
|
+
#pragma clang diagnostic ignored "-Wgnu-zero-variadic-macro-arguments"
|
|
2
|
+
#pragma clang diagnostic ignored "-Wunused-function"
|
|
3
|
+
|
|
4
|
+
#define FARF_ERROR 1
|
|
5
|
+
#define FARF_HIGH 1
|
|
6
|
+
#define FARF_MEDIUM 0
|
|
7
|
+
#define FARF_LOW 0
|
|
8
|
+
#include <AEEStdErr.h>
|
|
9
|
+
#include <dspqueue.h>
|
|
10
|
+
#include <HAP_compute_res.h>
|
|
11
|
+
#include <HAP_etm_config.h>
|
|
12
|
+
#include <HAP_farf.h>
|
|
13
|
+
#include <HAP_mem.h>
|
|
14
|
+
#include <HAP_perf.h>
|
|
15
|
+
#include <HAP_power.h>
|
|
16
|
+
#include <HAP_ps.h>
|
|
17
|
+
#include <qurt.h>
|
|
18
|
+
#include <qurt_thread.h>
|
|
19
|
+
#include <remote.h>
|
|
20
|
+
#include <string.h>
|
|
21
|
+
|
|
22
|
+
#define GGML_COMMON_DECL_C
|
|
23
|
+
#include "ggml-common.h"
|
|
24
|
+
#include "htp-ctx.h"
|
|
25
|
+
#include "htp-dma.h"
|
|
26
|
+
#include "htp-msg.h"
|
|
27
|
+
#include "htp-ops.h"
|
|
28
|
+
#include "ops-utils.h"
|
|
29
|
+
#include "worker-pool.h"
|
|
30
|
+
|
|
31
|
+
// Open an HTP session.
//
// Allocates a zeroed htp_context, publishes it through `handle`, enables FARF
// runtime logging and then issues four HAP_power_set() requests: client class,
// DCVS v3 (all corners pinned to HAP_DCVS_VCORNER_MAX, sleep disabled), HVX
// power-up and HMX power-up.
//
// uri    - not used by this function.
// handle - receives the context pointer cast to remote_handle64 on success;
//          reset to 0 on failure.
//
// Returns AEE_SUCCESS, AEE_ENOMEMORY when the context cannot be allocated, or
// the first non-zero HAP_power_set() result. On any failure after allocation
// the context is freed so it cannot leak.
AEEResult htp_iface_open(const char * uri, remote_handle64 * handle) {
    struct htp_context * ctx;
    int                  err = 0;

    ctx = calloc(1, sizeof(*ctx));
    if (ctx == NULL) {
        return AEE_ENOMEMORY;
    }

    // Use the context structure as a handle
    *handle = (remote_handle64) ctx;

    // Enable FARF logs
    HAP_setFARFRuntimeLoggingParams(0xffff, NULL, 0);

    // Set client class
    {
        HAP_power_request_t request;
        memset(&request, 0, sizeof(request));
        request.type    = HAP_power_set_apptype;
        request.apptype = HAP_POWER_COMPUTE_CLIENT_CLASS;

        if ((err = HAP_power_set((void *) ctx, &request)) != 0) {
            goto fail;
        }
    }

    {
        HAP_power_request_t request;
        memset(&request, 0, sizeof(request));

        request.type                              = HAP_power_set_DCVS_v3;
        request.dcvs_v3.set_dcvs_enable           = TRUE;
        request.dcvs_v3.dcvs_enable               = TRUE;
        request.dcvs_v3.dcvs_option               = HAP_DCVS_V2_PERFORMANCE_MODE;
        request.dcvs_v3.set_bus_params            = TRUE;
        request.dcvs_v3.bus_params.min_corner     = HAP_DCVS_VCORNER_MAX;
        request.dcvs_v3.bus_params.max_corner     = HAP_DCVS_VCORNER_MAX;
        request.dcvs_v3.bus_params.target_corner  = HAP_DCVS_VCORNER_MAX;
        request.dcvs_v3.set_core_params           = TRUE;
        request.dcvs_v3.core_params.min_corner    = HAP_DCVS_VCORNER_MAX;
        request.dcvs_v3.core_params.max_corner    = HAP_DCVS_VCORNER_MAX;
        request.dcvs_v3.core_params.target_corner = HAP_DCVS_VCORNER_MAX;
        request.dcvs_v3.set_sleep_disable         = TRUE;
        request.dcvs_v3.sleep_disable             = TRUE;
        if ((err = HAP_power_set((void *) ctx, &request)) != 0) {
            goto fail;
        }

        memset(&request, 0, sizeof(request));
        request.type         = HAP_power_set_HVX;
        request.hvx.power_up = TRUE;
        if ((err = HAP_power_set((void *) ctx, &request)) != 0) {
            goto fail;
        }
    }

    {
        // Power on HMX
        HAP_power_request_t request;
        memset(&request, 0, sizeof(request));
        request.type         = HAP_power_set_HMX;
        request.hmx.power_up = TRUE;
        FARF(ALWAYS, "Powering HMX on\n");
        // Pass the same client context as the other power requests above
        // (previously this passed the address of the local pointer instead).
        err = HAP_power_set((void *) ctx, &request);
        if (err != AEE_SUCCESS) {
            FARF(ERROR, "Error powering on HMX.");
            goto fail;
        }
    }

    return AEE_SUCCESS;

fail:
    // Do not hand out a handle to memory we are about to free.
    *handle = 0;
    free(ctx);
    return err;
}
|
|
104
|
+
|
|
105
|
+
// Close an HTP session and free its context.
//
// Returns AEE_EBADPARM for a null handle and AEE_EITEMBUSY (without freeing
// anything) when the session's queue is still set; AEE_SUCCESS otherwise.
AEEResult htp_iface_close(remote_handle64 handle) {
    struct htp_context * session = (struct htp_context *) handle;

    if (!session) {
        return AEE_EBADPARM;
    }

    // A non-null queue means the session was not stopped; refuse to free it.
    if (session->queue) {
        FARF(ERROR, "Closing handle with queue still open");
        return AEE_EITEMBUSY;
    }

    free(session);

    return AEE_SUCCESS;
}
|
|
120
|
+
|
|
121
|
+
// Enable ETM via HAP_user_etm_enable() and log any failure.
//
// handle - not used by this function.
//
// Returns the HAP_user_etm_enable() result unchanged (0 on success).
AEEResult htp_iface_enable_etm(remote_handle64 handle) {
    const int rc = HAP_user_etm_enable();

    if (rc != 0) {
        // Distinguish "API not available" from any other failure in the log.
        if (rc != AEE_EVERSIONNOTSUPPORT) {
            FARF(ERROR, "Error executing HAP_user_etm_enable with error code : 0x%x\n", rc);
        } else {
            FARF(ERROR, "API HAP_user_etm_enable is not supported\n");
        }
    }

    return rc;
}
|
|
132
|
+
|
|
133
|
+
// Disable ETM via HAP_user_etm_disable() and log any failure.
//
// handle - not used by this function.
//
// Returns the HAP_user_etm_disable() result unchanged (0 on success).
AEEResult htp_iface_disable_etm(remote_handle64 handle) {
    const int rc = HAP_user_etm_disable();

    if (rc != 0) {
        // Distinguish "API not available" from any other failure in the log.
        if (rc != AEE_EVERSIONNOTSUPPORT) {
            FARF(ERROR, "Error executing HAP_user_etm_disable with error code : 0x%x\n", rc);
        } else {
            FARF(ERROR, "API HAP_user_etm_disable is not supported\n");
        }
    }

    return rc;
}
|
|
144
|
+
|
|
145
|
+
static int vtcm_acquire(struct htp_context * ctx) {
|
|
146
|
+
int err;
|
|
147
|
+
if (!ctx->vtcm_valid) {
|
|
148
|
+
// Temporarily bump thread priority to make sure it's higher than other sessions.
|
|
149
|
+
// This way the resource manager will notify the other thread to release VTCM.
|
|
150
|
+
// Note that we need to reaquire VTCM at normal priority for this to work next time.
|
|
151
|
+
qurt_thread_set_priority(qurt_thread_get_id(), ctx->thread_prio - 10);
|
|
152
|
+
err = HAP_compute_res_acquire_cached(ctx->vtcm_rctx, 1000000);
|
|
153
|
+
if (err != 0) {
|
|
154
|
+
FARF(ERROR, "Failed to acquire VTCM: 0x%08x", (unsigned)err);
|
|
155
|
+
abort();
|
|
156
|
+
}
|
|
157
|
+
HAP_compute_res_release_cached(ctx->vtcm_rctx);
|
|
158
|
+
qurt_thread_set_priority(qurt_thread_get_id(), ctx->thread_prio);
|
|
159
|
+
|
|
160
|
+
err = HAP_compute_res_acquire_cached(ctx->vtcm_rctx, 1000000);
|
|
161
|
+
if (err != 0) {
|
|
162
|
+
FARF(ERROR, "Failed to acquire VTCM: 0x%08x", (unsigned)err);
|
|
163
|
+
abort();
|
|
164
|
+
}
|
|
165
|
+
ctx->vtcm_valid = true;
|
|
166
|
+
}
|
|
167
|
+
|
|
168
|
+
ctx->vtcm_inuse = true;
|
|
169
|
+
return 0;
|
|
170
|
+
}
|
|
171
|
+
|
|
172
|
+
static int vtcm_release(struct htp_context * ctx) {
|
|
173
|
+
ctx->vtcm_inuse = false;
|
|
174
|
+
|
|
175
|
+
if (ctx->vtcm_valid && ctx->vtcm_needs_release) {
|
|
176
|
+
ctx->vtcm_valid = false;
|
|
177
|
+
ctx->vtcm_needs_release = false;
|
|
178
|
+
HAP_compute_res_release_cached(ctx->vtcm_rctx);
|
|
179
|
+
}
|
|
180
|
+
|
|
181
|
+
return 0;
|
|
182
|
+
}
|
|
183
|
+
|
|
184
|
+
static int vtcm_release_callback(unsigned int rctx, void * state) {
|
|
185
|
+
struct htp_context * ctx = (struct htp_context *) state;
|
|
186
|
+
|
|
187
|
+
if (!ctx || ctx->vtcm_rctx != rctx) {
|
|
188
|
+
return AEE_EBADPARM;
|
|
189
|
+
}
|
|
190
|
+
|
|
191
|
+
// If VTCM is not inuse (not processing Ops) release it right here
|
|
192
|
+
// otherwise we'll release it once we're done with the current Op.
|
|
193
|
+
|
|
194
|
+
if (ctx->vtcm_inuse) {
|
|
195
|
+
ctx->vtcm_needs_release = false;
|
|
196
|
+
return 0;
|
|
197
|
+
}
|
|
198
|
+
|
|
199
|
+
ctx->vtcm_valid = false;
|
|
200
|
+
HAP_compute_res_release_cached(ctx->vtcm_rctx);
|
|
201
|
+
|
|
202
|
+
return 0;
|
|
203
|
+
}
|
|
204
|
+
|
|
205
|
+
static int vtcm_alloc(struct htp_context * ctx) {
|
|
206
|
+
unsigned int vtcm_size = 8 * 1024 * 1024; // 8MB default
|
|
207
|
+
HAP_compute_res_query_VTCM(0, &vtcm_size, NULL, NULL, NULL);
|
|
208
|
+
|
|
209
|
+
compute_res_attr_t attr;
|
|
210
|
+
HAP_compute_res_attr_init(&attr);
|
|
211
|
+
HAP_compute_res_attr_set_serialize(&attr, 0);
|
|
212
|
+
HAP_compute_res_attr_set_cache_mode(&attr, 1);
|
|
213
|
+
HAP_compute_res_attr_set_vtcm_param_v2(&attr, vtcm_size, 0, vtcm_size);
|
|
214
|
+
HAP_compute_res_attr_set_release_callback(&attr, vtcm_release_callback, (void *) ctx);
|
|
215
|
+
HAP_compute_res_attr_set_hmx_param(&attr, 1);
|
|
216
|
+
|
|
217
|
+
// Allocate VTCM for scratch pads
|
|
218
|
+
uint32_t rctx = HAP_compute_res_acquire(&attr, 1000000 /* timeout */);
|
|
219
|
+
if (!rctx) {
|
|
220
|
+
FARF(ERROR, "failed to allocate %zu bytes VTCM\n", ctx->vtcm_size);
|
|
221
|
+
return AEE_ENOMEMORY;
|
|
222
|
+
}
|
|
223
|
+
|
|
224
|
+
void * vtcm_ptr;
|
|
225
|
+
if (HAP_compute_res_attr_get_vtcm_ptr_v2(&attr, &vtcm_ptr, &vtcm_size) != 0) {
|
|
226
|
+
HAP_compute_res_release(rctx);
|
|
227
|
+
FARF(ERROR, "failed to allocate %zu bytes VTCM (new)\n", ctx->vtcm_size);
|
|
228
|
+
return AEE_ENOMEMORY;
|
|
229
|
+
}
|
|
230
|
+
|
|
231
|
+
ctx->vtcm_base = (uint8_t *) vtcm_ptr;
|
|
232
|
+
ctx->vtcm_size = vtcm_size;
|
|
233
|
+
ctx->vtcm_rctx = rctx;
|
|
234
|
+
ctx->vtcm_valid = false;
|
|
235
|
+
ctx->vtcm_inuse = false;
|
|
236
|
+
ctx->vtcm_needs_release = false;
|
|
237
|
+
|
|
238
|
+
return 0;
|
|
239
|
+
}
|
|
240
|
+
|
|
241
|
+
static void vtcm_free(struct htp_context * ctx) {
|
|
242
|
+
if (ctx->vtcm_rctx) {
|
|
243
|
+
HAP_compute_res_release(ctx->vtcm_rctx);
|
|
244
|
+
ctx->vtcm_base = 0;
|
|
245
|
+
ctx->vtcm_rctx = 0;
|
|
246
|
+
}
|
|
247
|
+
}
|
|
248
|
+
|
|
249
|
+
static void htp_packet_callback(dspqueue_t queue, int error, void * context);
|
|
250
|
+
static void htp_error_callback(dspqueue_t queue, int error, void * context);
|
|
251
|
+
|
|
252
|
+
// Start a session: import the CPU-created dspqueue, allocate VTCM, create one
// DMA queue per worker thread and initialise the worker pool.
//
// handle       - session context returned by htp_iface_open().
// sess_id      - only used in the "session started" log line.
// dsp_queue_id - queue id passed to dspqueue_import().
// n_hvx        - requested thread count; 0 selects the value derived from
//                qurt_hvx_get_units(). The result is clamped to the hardware
//                thread count and to HTP_MAX_NTHREADS.
//
// Returns AEE_SUCCESS, AEE_EBADPARM for a null handle, AEE_EITEMBUSY when a
// queue is already open, the dspqueue_import() error, AEE_ENOMEMORY when VTCM
// cannot be allocated, or the worker_pool_init() error.
//
// On failure after the queue was imported, everything set up here is torn
// down again (queue closed and cleared, DMA queues deleted, VTCM freed) so
// the session can be started again or closed.
AEEResult htp_iface_start(remote_handle64 handle, uint32 sess_id, uint64 dsp_queue_id, uint32 n_hvx) {
    struct htp_context * ctx = (struct htp_context *) handle;

    if (!ctx) {
        return AEE_EBADPARM;
    }

    if (ctx->queue) {
        FARF(ERROR, "Queue already open");
        return AEE_EITEMBUSY;
    }

    // Number of DMA queues created so far; drives cleanup on failure.
    uint32_t n_dma = 0;

    // Import queue created on the CPU
    int err = dspqueue_import(dsp_queue_id,         // Queue ID from dspqueue_export
                              htp_packet_callback,  // Packet callback
                              htp_error_callback,   // Error callback; no errors expected on the DSP
                              (void *) ctx,         // Callback context
                              &ctx->queue);

    if (err) {
        FARF(ERROR, "Queue import failed with 0x%08x", (unsigned) err);
        return err;
    }

    ctx->thread_id   = qurt_thread_get_id();
    ctx->thread_prio = qurt_thread_get_priority(ctx->thread_id);

    // allocate VTCM
    err = vtcm_alloc(ctx);
    if (err != AEE_SUCCESS) {
        FARF(ERROR, "Unable to allocate VTCM");
        err = AEE_ENOMEMORY;
        goto fail;
    }

    qurt_sysenv_max_hthreads_t hw_threads;
    qurt_sysenv_get_max_hw_threads(&hw_threads);
    uint32_t hw_nhvx = (qurt_hvx_get_units() >> 8) & 0xFF;

    if (n_hvx == 0) {
        n_hvx = hw_nhvx;
    }
    if (n_hvx > hw_threads.max_hthreads) {
        n_hvx = hw_threads.max_hthreads;
    }
    if (n_hvx > HTP_MAX_NTHREADS) {
        n_hvx = HTP_MAX_NTHREADS;
    }

    ctx->n_threads = n_hvx;
    for (n_dma = 0; n_dma < n_hvx; n_dma++) {
        // see discussion https://github.com/ggml-org/llama.cpp/pull/18151#discussion_r2632388541
        ctx->dma[n_dma] = dma_queue_create(64);
    }

    // init worker pool
    err = worker_pool_init(&ctx->worker_pool, n_hvx);
    if (err != AEE_SUCCESS) {
        FARF(ERROR, "Unable to create worker pool");
        goto fail;
    }

    FARF(HIGH, "session %u started: n-hvx %u vtcm-size %zu vtcm-rctx %u n-threads %u thread-id %d thread-prio %d \n",
         sess_id, hw_nhvx, ctx->vtcm_size, ctx->vtcm_rctx, ctx->n_threads, ctx->thread_id, ctx->thread_prio);

    return AEE_SUCCESS;

fail:
    // Undo in the same order htp_iface_stop() uses: queue first, then DMA
    // queues, then VTCM. Leaving ctx->queue set would make both a retry and
    // htp_iface_close() fail with AEE_EITEMBUSY.
    dspqueue_close(ctx->queue);
    ctx->queue = NULL;

    for (uint32_t i = 0; i < n_dma; i++) {
        if (ctx->dma[i]) {
            dma_queue_delete(ctx->dma[i]);
            ctx->dma[i] = NULL;
        }
    }

    vtcm_free(ctx);  // no-op when vtcm_alloc() did not succeed

    return err;
}
|
|
318
|
+
|
|
319
|
+
// Stop a session: close the queue, then release the worker pool, the
// per-thread DMA queues and the VTCM resource.
//
// Returns AEE_EBADPARM for a null handle, AEE_EBADSTATE when no queue is
// open, the dspqueue_close() error if closing fails, AEE_SUCCESS otherwise.
//
// NOTE(review): when dspqueue_close() fails the function returns right after
// clearing ctx->queue, without releasing the worker pool, DMA queues or VTCM;
// confirm that this is intended.
AEEResult htp_iface_stop(remote_handle64 handle) {
    struct htp_context * session = (struct htp_context *) handle;

    if (!session) {
        return AEE_EBADPARM;
    }

    if (!session->queue) {
        FARF(ERROR, "Queue not open");
        return AEE_EBADSTATE;
    }

    // Close queue. dspqueue_close() will also wait for callbacks to finish.
    const int close_rc = dspqueue_close(session->queue);
    session->queue     = NULL;
    if (close_rc != 0) {
        FARF(ERROR, "Queue close failed with 0x%08x", (unsigned) close_rc);
        return close_rc;
    }

    // Release worker pool
    if (session->worker_pool) {
        worker_pool_release(&session->worker_pool);
    }

    // One DMA queue was created per thread in htp_iface_start().
    for (int t = 0; t < session->n_threads; t++) {
        dma_queue_delete(session->dma[t]);
    }

    vtcm_free(session);

    return AEE_SUCCESS;
}
|
|
351
|
+
|
|
352
|
+
// Queue error callback registered in htp_iface_start(); it only logs the
// error code.
static void htp_error_callback(dspqueue_t queue, int error, void * context) {
    (void) queue;
    (void) context;

    // No errors expected on the DSP.
    FARF(ERROR, "Error callback: 0x%08x", (unsigned) error);
}
|
|
356
|
+
|
|
357
|
+
// Per-request profiling counters.
//
// profile_start() stores starting snapshots in these fields and
// profile_stop() replaces each snapshot with the elapsed amount.
struct profile_data {
    uint64_t usecs;   // qtimer count at start; elapsed microseconds after stop
    uint64_t cycles;  // htp_get_cycles() at start; delta after stop
    uint64_t pkts;    // htp_get_pktcnt() at start; delta after stop
};
|
|
362
|
+
|
|
363
|
+
static inline void profile_start(struct profile_data * d) {
|
|
364
|
+
d->usecs = HAP_perf_get_qtimer_count();
|
|
365
|
+
d->cycles = htp_get_cycles();
|
|
366
|
+
d->pkts = htp_get_pktcnt();
|
|
367
|
+
}
|
|
368
|
+
|
|
369
|
+
static inline void profile_stop(struct profile_data * d) {
|
|
370
|
+
d->usecs = HAP_perf_qtimer_count_to_us(HAP_perf_get_qtimer_count() - d->usecs);
|
|
371
|
+
d->cycles = htp_get_cycles() - d->cycles;
|
|
372
|
+
d->pkts = htp_get_pktcnt() - d->pkts;
|
|
373
|
+
}
|
|
374
|
+
|
|
375
|
+
static int send_htp_rsp(struct htp_context * c,
|
|
376
|
+
uint32_t op,
|
|
377
|
+
uint32_t status,
|
|
378
|
+
struct dspqueue_buffer * bufs,
|
|
379
|
+
size_t n_bufs,
|
|
380
|
+
struct profile_data * prof) {
|
|
381
|
+
// Prep response struct
|
|
382
|
+
struct htp_general_rsp rsp;
|
|
383
|
+
rsp.op = op;
|
|
384
|
+
rsp.status = status;
|
|
385
|
+
rsp.prof_usecs = prof->usecs;
|
|
386
|
+
rsp.prof_cycles = prof->cycles;
|
|
387
|
+
rsp.prof_pkts = prof->pkts;
|
|
388
|
+
|
|
389
|
+
int err = dspqueue_write(c->queue,
|
|
390
|
+
0, // Flags
|
|
391
|
+
n_bufs,
|
|
392
|
+
bufs, // Buffer references
|
|
393
|
+
sizeof(rsp),
|
|
394
|
+
(const uint8_t *) &rsp, // Message
|
|
395
|
+
DSPQUEUE_TIMEOUT_NONE);
|
|
396
|
+
|
|
397
|
+
if (err != 0) {
|
|
398
|
+
FARF(ERROR, "dspqueue_write failed: 0x%08x", (unsigned) err);
|
|
399
|
+
}
|
|
400
|
+
|
|
401
|
+
return err;
|
|
402
|
+
}
|
|
403
|
+
|
|
404
|
+
static void proc_matmul_req(struct htp_context * ctx,
|
|
405
|
+
struct htp_general_req * req,
|
|
406
|
+
struct dspqueue_buffer * bufs,
|
|
407
|
+
size_t n_bufs) {
|
|
408
|
+
struct dspqueue_buffer rsp_bufs[1];
|
|
409
|
+
|
|
410
|
+
// We had written to the output buffer, we'd also need to flush it
|
|
411
|
+
rsp_bufs[0].fd = bufs[2].fd;
|
|
412
|
+
rsp_bufs[0].ptr = bufs[2].ptr;
|
|
413
|
+
rsp_bufs[0].size = bufs[2].size;
|
|
414
|
+
rsp_bufs[0].offset = bufs[2].offset;
|
|
415
|
+
rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP
|
|
416
|
+
DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU
|
|
417
|
+
|
|
418
|
+
// Setup Op context
|
|
419
|
+
struct htp_ops_context octx = { 0 };
|
|
420
|
+
octx.ctx = ctx;
|
|
421
|
+
octx.src0 = req->src0;
|
|
422
|
+
octx.src1 = req->src1;
|
|
423
|
+
octx.dst = req->dst;
|
|
424
|
+
octx.flags = req->flags;
|
|
425
|
+
octx.op = req->op;
|
|
426
|
+
|
|
427
|
+
// Update data pointers
|
|
428
|
+
octx.src0.data = (uint32_t) bufs[0].ptr;
|
|
429
|
+
octx.src1.data = (uint32_t) bufs[1].ptr;
|
|
430
|
+
octx.dst.data = (uint32_t) bufs[2].ptr;
|
|
431
|
+
octx.n_threads = ctx->n_threads;
|
|
432
|
+
|
|
433
|
+
struct profile_data prof;
|
|
434
|
+
profile_start(&prof);
|
|
435
|
+
|
|
436
|
+
uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR;
|
|
437
|
+
if (vtcm_acquire(ctx) == AEE_SUCCESS) {
|
|
438
|
+
rsp_status = op_matmul(&octx);
|
|
439
|
+
vtcm_release(ctx);
|
|
440
|
+
}
|
|
441
|
+
|
|
442
|
+
profile_stop(&prof);
|
|
443
|
+
send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 1, &prof);
|
|
444
|
+
}
|
|
445
|
+
|
|
446
|
+
static void proc_get_rows_req(struct htp_context * ctx, struct htp_general_req * req, struct dspqueue_buffer * bufs) {
|
|
447
|
+
struct dspqueue_buffer rsp_bufs[1];
|
|
448
|
+
|
|
449
|
+
// We had written to the output buffer, we'd also need to flush it
|
|
450
|
+
rsp_bufs[0].fd = bufs[2].fd;
|
|
451
|
+
rsp_bufs[0].ptr = bufs[2].ptr;
|
|
452
|
+
rsp_bufs[0].offset = bufs[2].offset;
|
|
453
|
+
rsp_bufs[0].size = bufs[2].size;
|
|
454
|
+
rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP
|
|
455
|
+
DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU
|
|
456
|
+
|
|
457
|
+
// Setup Op context
|
|
458
|
+
struct htp_ops_context octx = { 0 };
|
|
459
|
+
octx.ctx = ctx;
|
|
460
|
+
octx.src0 = req->src0;
|
|
461
|
+
octx.src1 = req->src1;
|
|
462
|
+
octx.dst = req->dst;
|
|
463
|
+
octx.flags = req->flags;
|
|
464
|
+
octx.op = req->op;
|
|
465
|
+
|
|
466
|
+
// Update data pointers
|
|
467
|
+
octx.src0.data = (uint32_t) bufs[0].ptr;
|
|
468
|
+
octx.src1.data = (uint32_t) bufs[1].ptr;
|
|
469
|
+
octx.dst.data = (uint32_t) bufs[2].ptr;
|
|
470
|
+
octx.n_threads = ctx->n_threads;
|
|
471
|
+
|
|
472
|
+
struct profile_data prof;
|
|
473
|
+
profile_start(&prof);
|
|
474
|
+
|
|
475
|
+
uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR;
|
|
476
|
+
if (vtcm_acquire(ctx) == AEE_SUCCESS) {
|
|
477
|
+
rsp_status = op_get_rows(&octx);
|
|
478
|
+
vtcm_release(ctx);
|
|
479
|
+
}
|
|
480
|
+
|
|
481
|
+
profile_stop(&prof);
|
|
482
|
+
send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 1, &prof);
|
|
483
|
+
}
|
|
484
|
+
|
|
485
|
+
static void proc_matmul_id_req(struct htp_context * ctx,
|
|
486
|
+
struct htp_general_req * req,
|
|
487
|
+
struct dspqueue_buffer * bufs,
|
|
488
|
+
size_t n_bufs) {
|
|
489
|
+
struct dspqueue_buffer rsp_bufs[1];
|
|
490
|
+
|
|
491
|
+
// We had written to the output buffer, we'd also need to flush it
|
|
492
|
+
rsp_bufs[0].fd = bufs[3].fd;
|
|
493
|
+
rsp_bufs[0].ptr = bufs[3].ptr;
|
|
494
|
+
rsp_bufs[0].size = bufs[3].size;
|
|
495
|
+
rsp_bufs[0].offset = bufs[3].offset;
|
|
496
|
+
rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP
|
|
497
|
+
DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU
|
|
498
|
+
|
|
499
|
+
// Setup Op context
|
|
500
|
+
struct htp_ops_context octx = { 0 };
|
|
501
|
+
octx.ctx = ctx;
|
|
502
|
+
octx.src0 = req->src0;
|
|
503
|
+
octx.src1 = req->src1;
|
|
504
|
+
octx.src2 = req->src2;
|
|
505
|
+
octx.dst = req->dst;
|
|
506
|
+
octx.flags = req->flags;
|
|
507
|
+
octx.op = req->op;
|
|
508
|
+
|
|
509
|
+
// Update data pointers
|
|
510
|
+
octx.src0.data = (uint32_t) bufs[0].ptr;
|
|
511
|
+
octx.src1.data = (uint32_t) bufs[1].ptr;
|
|
512
|
+
octx.src2.data = (uint32_t) bufs[2].ptr;
|
|
513
|
+
octx.dst.data = (uint32_t) bufs[3].ptr;
|
|
514
|
+
octx.n_threads = ctx->n_threads;
|
|
515
|
+
|
|
516
|
+
struct profile_data prof;
|
|
517
|
+
profile_start(&prof);
|
|
518
|
+
|
|
519
|
+
uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR;
|
|
520
|
+
if (vtcm_acquire(ctx) == AEE_SUCCESS) {
|
|
521
|
+
rsp_status = op_matmul_id(&octx);
|
|
522
|
+
vtcm_release(ctx);
|
|
523
|
+
}
|
|
524
|
+
|
|
525
|
+
profile_stop(&prof);
|
|
526
|
+
send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 1, &prof);
|
|
527
|
+
}
|
|
528
|
+
|
|
529
|
+
static void proc_binary_req(struct htp_context * ctx, struct htp_general_req * req, struct dspqueue_buffer * bufs) {
|
|
530
|
+
struct dspqueue_buffer rsp_bufs[1];
|
|
531
|
+
|
|
532
|
+
// We had written to the output buffer, we'd also need to flush it
|
|
533
|
+
rsp_bufs[0].fd = bufs[2].fd;
|
|
534
|
+
rsp_bufs[0].ptr = bufs[2].ptr;
|
|
535
|
+
rsp_bufs[0].offset = bufs[2].offset;
|
|
536
|
+
rsp_bufs[0].size = bufs[2].size;
|
|
537
|
+
rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP
|
|
538
|
+
DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU
|
|
539
|
+
|
|
540
|
+
// Setup Op context
|
|
541
|
+
struct htp_ops_context octx = { 0 };
|
|
542
|
+
octx.ctx = ctx;
|
|
543
|
+
octx.src0 = req->src0;
|
|
544
|
+
octx.src1 = req->src1;
|
|
545
|
+
octx.dst = req->dst;
|
|
546
|
+
octx.flags = req->flags;
|
|
547
|
+
octx.op = req->op;
|
|
548
|
+
|
|
549
|
+
// Update data pointers
|
|
550
|
+
octx.src0.data = (uint32_t) bufs[0].ptr;
|
|
551
|
+
octx.src1.data = (uint32_t) bufs[1].ptr;
|
|
552
|
+
octx.dst.data = (uint32_t) bufs[2].ptr;
|
|
553
|
+
octx.n_threads = ctx->n_threads;
|
|
554
|
+
|
|
555
|
+
struct profile_data prof;
|
|
556
|
+
profile_start(&prof);
|
|
557
|
+
|
|
558
|
+
uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR;
|
|
559
|
+
if (vtcm_acquire(ctx) == AEE_SUCCESS) {
|
|
560
|
+
rsp_status = op_binary(&octx);
|
|
561
|
+
vtcm_release(ctx);
|
|
562
|
+
}
|
|
563
|
+
|
|
564
|
+
profile_stop(&prof);
|
|
565
|
+
send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 1, &prof);
|
|
566
|
+
}
|
|
567
|
+
|
|
568
|
+
static void proc_add_id_req(struct htp_context * ctx, struct htp_general_req * req, struct dspqueue_buffer * bufs) {
|
|
569
|
+
struct dspqueue_buffer rsp_bufs[1];
|
|
570
|
+
|
|
571
|
+
// We had written to the output buffer, we'd also need to flush it
|
|
572
|
+
rsp_bufs[0].fd = bufs[3].fd;
|
|
573
|
+
rsp_bufs[0].ptr = bufs[3].ptr;
|
|
574
|
+
rsp_bufs[0].offset = bufs[3].offset;
|
|
575
|
+
rsp_bufs[0].size = bufs[3].size;
|
|
576
|
+
rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP
|
|
577
|
+
DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU
|
|
578
|
+
|
|
579
|
+
// Setup Op context
|
|
580
|
+
struct htp_ops_context octx = { 0 };
|
|
581
|
+
octx.ctx = ctx;
|
|
582
|
+
octx.src0 = req->src0;
|
|
583
|
+
octx.src1 = req->src1;
|
|
584
|
+
octx.src2 = req->src2;
|
|
585
|
+
octx.dst = req->dst;
|
|
586
|
+
octx.flags = req->flags;
|
|
587
|
+
octx.op = req->op;
|
|
588
|
+
|
|
589
|
+
// Update data pointers
|
|
590
|
+
octx.src0.data = (uint32_t) bufs[0].ptr;
|
|
591
|
+
octx.src1.data = (uint32_t) bufs[1].ptr;
|
|
592
|
+
octx.src2.data = (uint32_t) bufs[2].ptr;
|
|
593
|
+
octx.dst.data = (uint32_t) bufs[3].ptr;
|
|
594
|
+
octx.n_threads = ctx->n_threads;
|
|
595
|
+
|
|
596
|
+
struct profile_data prof;
|
|
597
|
+
profile_start(&prof);
|
|
598
|
+
|
|
599
|
+
uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR;
|
|
600
|
+
if (vtcm_acquire(ctx) == AEE_SUCCESS) {
|
|
601
|
+
rsp_status = op_binary(&octx);
|
|
602
|
+
vtcm_release(ctx);
|
|
603
|
+
}
|
|
604
|
+
|
|
605
|
+
profile_stop(&prof);
|
|
606
|
+
send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 1, &prof);
|
|
607
|
+
}
|
|
608
|
+
|
|
609
|
+
static void proc_unary_req(struct htp_context * ctx, struct htp_general_req * req, struct dspqueue_buffer * bufs) {
|
|
610
|
+
struct dspqueue_buffer rsp_bufs[HTP_MAX_PACKET_BUFFERS];
|
|
611
|
+
|
|
612
|
+
// We had written to the output buffer, we'd also need to flush it
|
|
613
|
+
rsp_bufs[0].fd = bufs[1].fd;
|
|
614
|
+
rsp_bufs[0].ptr = bufs[1].ptr;
|
|
615
|
+
rsp_bufs[0].offset = bufs[1].offset;
|
|
616
|
+
rsp_bufs[0].size = bufs[1].size;
|
|
617
|
+
rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP
|
|
618
|
+
DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU
|
|
619
|
+
|
|
620
|
+
// Setup Op context
|
|
621
|
+
struct htp_ops_context octx = { 0 };
|
|
622
|
+
octx.ctx = ctx;
|
|
623
|
+
octx.src0 = req->src0;
|
|
624
|
+
octx.dst = req->dst;
|
|
625
|
+
octx.flags = req->flags;
|
|
626
|
+
octx.op = req->op;
|
|
627
|
+
|
|
628
|
+
memcpy(octx.op_params, req->op_params, sizeof(octx.op_params));
|
|
629
|
+
|
|
630
|
+
// Update data pointers
|
|
631
|
+
octx.src0.data = (uint32_t) bufs[0].ptr;
|
|
632
|
+
octx.dst.data = (uint32_t) bufs[1].ptr;
|
|
633
|
+
octx.n_threads = ctx->n_threads;
|
|
634
|
+
|
|
635
|
+
struct profile_data prof;
|
|
636
|
+
profile_start(&prof);
|
|
637
|
+
|
|
638
|
+
uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR;
|
|
639
|
+
if (vtcm_acquire(ctx) == AEE_SUCCESS) {
|
|
640
|
+
rsp_status = op_unary(&octx);
|
|
641
|
+
vtcm_release(ctx);
|
|
642
|
+
}
|
|
643
|
+
|
|
644
|
+
profile_stop(&prof);
|
|
645
|
+
send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 1, &prof);
|
|
646
|
+
}
|
|
647
|
+
|
|
648
|
+
static void proc_activations_req(struct htp_context * ctx,
|
|
649
|
+
struct htp_general_req * req,
|
|
650
|
+
struct dspqueue_buffer * bufs,
|
|
651
|
+
uint32_t n_bufs) {
|
|
652
|
+
struct dspqueue_buffer rsp_bufs[HTP_MAX_PACKET_BUFFERS];
|
|
653
|
+
|
|
654
|
+
int write_idx = (n_bufs == 3) ? 2 : 1;
|
|
655
|
+
|
|
656
|
+
// We had written to the output buffer, we'd also need to flush it
|
|
657
|
+
rsp_bufs[0].fd = bufs[write_idx].fd;
|
|
658
|
+
rsp_bufs[0].ptr = bufs[write_idx].ptr;
|
|
659
|
+
rsp_bufs[0].offset = bufs[write_idx].offset;
|
|
660
|
+
rsp_bufs[0].size = bufs[write_idx].size;
|
|
661
|
+
rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP
|
|
662
|
+
DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU
|
|
663
|
+
|
|
664
|
+
// Setup Op context
|
|
665
|
+
struct htp_ops_context octx = { 0 };
|
|
666
|
+
octx.ctx = ctx;
|
|
667
|
+
octx.src0 = req->src0;
|
|
668
|
+
if (3 == n_bufs) {
|
|
669
|
+
octx.src1 = req->src1;
|
|
670
|
+
}
|
|
671
|
+
octx.dst = req->dst;
|
|
672
|
+
octx.flags = req->flags;
|
|
673
|
+
octx.op = req->op;
|
|
674
|
+
|
|
675
|
+
memcpy(octx.op_params, req->op_params, sizeof(octx.op_params));
|
|
676
|
+
|
|
677
|
+
// Update data pointers
|
|
678
|
+
octx.src0.data = (uint32_t) bufs[0].ptr;
|
|
679
|
+
if (3 == n_bufs) {
|
|
680
|
+
octx.src1.data = (uint32_t) bufs[1].ptr;
|
|
681
|
+
octx.dst.data = (uint32_t) bufs[2].ptr;
|
|
682
|
+
} else {
|
|
683
|
+
octx.dst.data = (uint32_t) bufs[1].ptr;
|
|
684
|
+
}
|
|
685
|
+
octx.n_threads = ctx->n_threads;
|
|
686
|
+
|
|
687
|
+
struct profile_data prof;
|
|
688
|
+
profile_start(&prof);
|
|
689
|
+
|
|
690
|
+
uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR;
|
|
691
|
+
if (vtcm_acquire(ctx) == AEE_SUCCESS) {
|
|
692
|
+
if (octx.op == HTP_OP_SOFTMAX) {
|
|
693
|
+
rsp_status = op_softmax(&octx);
|
|
694
|
+
} else {
|
|
695
|
+
rsp_status = op_activations(&octx);
|
|
696
|
+
}
|
|
697
|
+
vtcm_release(ctx);
|
|
698
|
+
}
|
|
699
|
+
|
|
700
|
+
profile_stop(&prof);
|
|
701
|
+
send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 1, &prof);
|
|
702
|
+
}
|
|
703
|
+
|
|
704
|
+
static void proc_rope_req(struct htp_context * ctx,
|
|
705
|
+
struct htp_general_req * req,
|
|
706
|
+
struct dspqueue_buffer * bufs,
|
|
707
|
+
uint32_t n_bufs) {
|
|
708
|
+
struct dspqueue_buffer rsp_bufs[HTP_MAX_PACKET_BUFFERS];
|
|
709
|
+
|
|
710
|
+
int write_idx = n_bufs - 1;
|
|
711
|
+
|
|
712
|
+
// We had written to the output buffer, we'd also need to flush it
|
|
713
|
+
rsp_bufs[0].fd = bufs[write_idx].fd;
|
|
714
|
+
rsp_bufs[0].ptr = bufs[write_idx].ptr;
|
|
715
|
+
rsp_bufs[0].offset = bufs[write_idx].offset;
|
|
716
|
+
rsp_bufs[0].size = bufs[write_idx].size;
|
|
717
|
+
rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP
|
|
718
|
+
DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU
|
|
719
|
+
|
|
720
|
+
// Setup Op context
|
|
721
|
+
struct htp_ops_context octx = { 0 };
|
|
722
|
+
octx.ctx = ctx;
|
|
723
|
+
octx.src0 = req->src0;
|
|
724
|
+
octx.src1 = req->src1;
|
|
725
|
+
if (4 == n_bufs) {
|
|
726
|
+
octx.src2 = req->src2;
|
|
727
|
+
}
|
|
728
|
+
octx.dst = req->dst;
|
|
729
|
+
octx.flags = req->flags;
|
|
730
|
+
octx.op = req->op;
|
|
731
|
+
|
|
732
|
+
memcpy(octx.op_params, req->op_params, sizeof(octx.op_params));
|
|
733
|
+
|
|
734
|
+
// Update data pointers
|
|
735
|
+
octx.src0.data = (uint32_t) bufs[0].ptr;
|
|
736
|
+
octx.src1.data = (uint32_t) bufs[1].ptr;
|
|
737
|
+
if (4 == n_bufs) {
|
|
738
|
+
octx.src2.data = (uint32_t) bufs[2].ptr;
|
|
739
|
+
octx.dst.data = (uint32_t) bufs[3].ptr;
|
|
740
|
+
} else {
|
|
741
|
+
octx.dst.data = (uint32_t) bufs[2].ptr;
|
|
742
|
+
}
|
|
743
|
+
octx.n_threads = ctx->n_threads;
|
|
744
|
+
|
|
745
|
+
struct profile_data prof;
|
|
746
|
+
profile_start(&prof);
|
|
747
|
+
|
|
748
|
+
uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR;
|
|
749
|
+
if (vtcm_acquire(ctx) == AEE_SUCCESS) {
|
|
750
|
+
rsp_status = op_rope(&octx);
|
|
751
|
+
vtcm_release(ctx);
|
|
752
|
+
}
|
|
753
|
+
|
|
754
|
+
profile_stop(&prof);
|
|
755
|
+
send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 1, &prof);
|
|
756
|
+
}
|
|
757
|
+
|
|
758
|
+
static void proc_set_rows_req(struct htp_context * ctx, struct htp_general_req * req, struct dspqueue_buffer * bufs) {
|
|
759
|
+
struct dspqueue_buffer rsp_bufs[1];
|
|
760
|
+
|
|
761
|
+
// We had written to the output buffer, we'd also need to flush it
|
|
762
|
+
rsp_bufs[0].fd = bufs[2].fd;
|
|
763
|
+
rsp_bufs[0].ptr = bufs[2].ptr;
|
|
764
|
+
rsp_bufs[0].offset = bufs[2].offset;
|
|
765
|
+
rsp_bufs[0].size = bufs[2].size;
|
|
766
|
+
rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP
|
|
767
|
+
DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU
|
|
768
|
+
|
|
769
|
+
// Setup Op context
|
|
770
|
+
struct htp_ops_context octx = { 0 };
|
|
771
|
+
octx.ctx = ctx;
|
|
772
|
+
octx.src0 = req->src0;
|
|
773
|
+
octx.src1 = req->src1;
|
|
774
|
+
octx.dst = req->dst;
|
|
775
|
+
octx.flags = req->flags;
|
|
776
|
+
octx.op = req->op;
|
|
777
|
+
|
|
778
|
+
// Update data pointers
|
|
779
|
+
octx.src0.data = (uint32_t) bufs[0].ptr;
|
|
780
|
+
octx.src1.data = (uint32_t) bufs[1].ptr;
|
|
781
|
+
octx.dst.data = (uint32_t) bufs[2].ptr;
|
|
782
|
+
octx.n_threads = ctx->n_threads;
|
|
783
|
+
|
|
784
|
+
struct profile_data prof;
|
|
785
|
+
profile_start(&prof);
|
|
786
|
+
|
|
787
|
+
uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR;
|
|
788
|
+
if (vtcm_acquire(ctx) == AEE_SUCCESS) {
|
|
789
|
+
rsp_status = op_set_rows(&octx);
|
|
790
|
+
vtcm_release(ctx);
|
|
791
|
+
}
|
|
792
|
+
|
|
793
|
+
profile_stop(&prof);
|
|
794
|
+
send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 1, &prof);
|
|
795
|
+
}
|
|
796
|
+
|
|
797
|
+
static void proc_flash_attn_ext_req(struct htp_context * ctx,
|
|
798
|
+
struct htp_general_req * req,
|
|
799
|
+
struct dspqueue_buffer * bufs,
|
|
800
|
+
uint32_t n_bufs) {
|
|
801
|
+
// Setup Op context
|
|
802
|
+
struct htp_ops_context octx;
|
|
803
|
+
memset(&octx, 0, sizeof(octx));
|
|
804
|
+
|
|
805
|
+
octx.ctx = ctx;
|
|
806
|
+
octx.n_threads = ctx->n_threads;
|
|
807
|
+
|
|
808
|
+
octx.src0 = req->src0;
|
|
809
|
+
octx.src1 = req->src1;
|
|
810
|
+
octx.src2 = req->src2;
|
|
811
|
+
octx.src3 = req->src3;
|
|
812
|
+
octx.src4 = req->src4;
|
|
813
|
+
octx.dst = req->dst;
|
|
814
|
+
octx.flags = req->flags;
|
|
815
|
+
octx.op = req->op;
|
|
816
|
+
|
|
817
|
+
memcpy(octx.op_params, req->op_params, sizeof(octx.op_params));
|
|
818
|
+
|
|
819
|
+
// Update data pointers
|
|
820
|
+
octx.src0.data = (uint32_t) bufs[0].ptr;
|
|
821
|
+
octx.src1.data = (uint32_t) bufs[1].ptr;
|
|
822
|
+
octx.src2.data = (uint32_t) bufs[2].ptr;
|
|
823
|
+
|
|
824
|
+
int last_buf = 3;
|
|
825
|
+
|
|
826
|
+
if (octx.src3.ne[0]) {
|
|
827
|
+
octx.src3.data = (uint32_t) bufs[last_buf++].ptr; // mask is valid
|
|
828
|
+
}
|
|
829
|
+
|
|
830
|
+
if (octx.src4.ne[0]) {
|
|
831
|
+
octx.src4.data = (uint32_t) bufs[last_buf++].ptr; // sinks is valid
|
|
832
|
+
}
|
|
833
|
+
|
|
834
|
+
octx.dst.data = (uint32_t) bufs[last_buf].ptr;
|
|
835
|
+
|
|
836
|
+
struct profile_data prof;
|
|
837
|
+
profile_start(&prof);
|
|
838
|
+
|
|
839
|
+
uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR;
|
|
840
|
+
if (vtcm_acquire(ctx) == AEE_SUCCESS) {
|
|
841
|
+
rsp_status = op_flash_attn_ext(&octx);
|
|
842
|
+
vtcm_release(ctx);
|
|
843
|
+
}
|
|
844
|
+
|
|
845
|
+
profile_stop(&prof);
|
|
846
|
+
|
|
847
|
+
struct dspqueue_buffer rsp_buf = bufs[last_buf];
|
|
848
|
+
rsp_buf.flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP
|
|
849
|
+
DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU
|
|
850
|
+
|
|
851
|
+
send_htp_rsp(ctx, req->op, rsp_status, &bufs[last_buf], 1, &prof);
|
|
852
|
+
}
|
|
853
|
+
|
|
854
|
+
// Packet callback: reads every request currently available on the queue and
// dispatches each one to the handler for its op.
//
// queue   - queue the requests are read from
// error   - not read by this function
// context - cast to struct htp_context * and handed to the op handlers
//
// Returns when the queue has no more packets (AEE_EWOULDBLOCK) or when a read
// fails. A request with the wrong size, a buffer count that does not fit its
// op, or an unknown op is logged and dropped without running a handler.
static void htp_packet_callback(dspqueue_t queue, int error, void * context) {
    struct htp_context * ctx = (struct htp_context *) context;

    // There is not necessarily one callback per packet, and new packets may
    // arrive while an earlier one is being processed. Draining the queue here
    // keeps the DSP busy as much as possible and avoids waiting for the CPU.
    for (;;) {
        struct htp_general_req req;
        struct dspqueue_buffer bufs[HTP_MAX_PACKET_BUFFERS];

        uint32_t req_size;
        uint32_t n_bufs;
        uint32_t flags;

        // Read packet from queue
        const int err = dspqueue_read_noblock(queue, &flags,
                                              HTP_MAX_PACKET_BUFFERS,  // Maximum number of buffer references
                                              &n_bufs,                 // Number of buffer references
                                              bufs,                    // Buffer references
                                              sizeof(req),             // Max message length
                                              &req_size,               // Message length
                                              (uint8_t *) &req);       // Message

        if (AEE_EWOULDBLOCK == err) {
            // Consumed all packets available for now
            return;
        }

        if (0 != err) {
            FARF(ERROR, "dspqueue_read_noblock failed: 0x%08x", (unsigned) err);
            return;
        }

        if (sizeof(req) != req_size) {
            FARF(ERROR, "Invalid request size");
            continue;
        }

        if (req.flags & HTP_OPFLAGS_EARLY_WAKEUP) {
            // Host wants early notification
            dspqueue_write_early_wakeup_noblock(ctx->queue, 10, 0);
        }

        // Dispatch on the op; each case checks the buffer count it expects
        switch (req.op) {
            case HTP_OP_MUL_MAT:
                if (3 == n_bufs) {
                    proc_matmul_req(ctx, &req, bufs, n_bufs);
                } else {
                    FARF(ERROR, "Bad matmul-req buffer list");
                }
                break;

            case HTP_OP_MUL_MAT_ID:
                if (4 == n_bufs) {
                    proc_matmul_id_req(ctx, &req, bufs, n_bufs);
                } else {
                    FARF(ERROR, "Bad matmul-id-req buffer list");
                }
                break;

            case HTP_OP_MUL:
            case HTP_OP_ADD:
            case HTP_OP_SUB:
                if (3 == n_bufs) {
                    proc_binary_req(ctx, &req, bufs);
                } else {
                    FARF(ERROR, "Bad binary-req buffer list");
                }
                break;

            case HTP_OP_RMS_NORM:
            case HTP_OP_SCALE:
                if (2 == n_bufs) {
                    proc_unary_req(ctx, &req, bufs);
                } else {
                    FARF(ERROR, "Bad unary-req buffer list");
                }
                break;

            case HTP_OP_UNARY_SILU:
            case HTP_OP_UNARY_GELU:
                if (2 == n_bufs) {
                    proc_activations_req(ctx, &req, bufs, n_bufs);
                } else {
                    FARF(ERROR, "Bad act-req buffer list");
                }
                break;

            case HTP_OP_GLU_SWIGLU:
            case HTP_OP_GLU_SWIGLU_OAI:
            case HTP_OP_SOFTMAX:
                if ((2 == n_bufs) || (3 == n_bufs)) {
                    proc_activations_req(ctx, &req, bufs, n_bufs);
                } else {
                    FARF(ERROR, "Bad act-req buffer list");
                }
                break;

            case HTP_OP_ADD_ID:
                if (4 == n_bufs) {
                    proc_add_id_req(ctx, &req, bufs);
                } else {
                    FARF(ERROR, "Bad add-id-req buffer list");
                }
                break;

            case HTP_OP_ROPE:
                if ((3 == n_bufs) || (4 == n_bufs)) {
                    proc_rope_req(ctx, &req, bufs, n_bufs);
                } else {
                    FARF(ERROR, "Bad rope-req buffer list");
                }
                break;

            case HTP_OP_FLASH_ATTN_EXT:
                if ((n_bufs >= 4) && (n_bufs <= 6)) {
                    proc_flash_attn_ext_req(ctx, &req, bufs, n_bufs);
                } else {
                    FARF(ERROR, "Bad flash-attn-ext-req buffer list");
                }
                break;

            case HTP_OP_SET_ROWS:
                if (3 == n_bufs) {
                    proc_set_rows_req(ctx, &req, bufs);
                } else {
                    FARF(ERROR, "Bad set-rows-req buffer list");
                }
                break;

            case HTP_OP_GET_ROWS:
                if (3 == n_bufs) {
                    proc_get_rows_req(ctx, &req, bufs);
                } else {
                    FARF(ERROR, "Bad get-rows-req buffer list");
                }
                break;

            default:
                FARF(ERROR, "Unknown Op %u", req.op);
                break;
        }
    }
}
|