whispercpp 1.3.5 → 1.3.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/LICENSE +1 -1
- data/README.md +99 -2
- data/ext/extconf.rb +1 -0
- data/ext/ruby_whisper.c +20 -4
- data/ext/ruby_whisper.h +30 -2
- data/ext/ruby_whisper_context.c +216 -124
- data/ext/ruby_whisper_context_params.c +163 -0
- data/ext/ruby_whisper_model.c +0 -1
- data/ext/ruby_whisper_params.c +0 -1
- data/ext/ruby_whisper_segment.c +0 -1
- data/ext/ruby_whisper_token.c +29 -9
- data/ext/ruby_whisper_transcribe.cpp +4 -1
- data/ext/ruby_whisper_vad_context.c +48 -1
- data/ext/ruby_whisper_vad_context_detect.cpp +6 -5
- data/ext/ruby_whisper_vad_params.c +0 -1
- data/ext/ruby_whisper_vad_segment.c +0 -1
- data/ext/ruby_whisper_vad_segments.c +0 -1
- data/ext/sources/CMakeLists.txt +1 -1
- data/ext/sources/bindings/javascript/package.json +1 -1
- data/ext/sources/cmake/whisper-config.cmake.in +5 -40
- data/ext/sources/examples/bench/bench.cpp +23 -18
- data/ext/sources/examples/cli/cli.cpp +8 -0
- data/ext/sources/examples/common-ggml.cpp +2 -0
- data/ext/sources/examples/miniaudio.h +4507 -2131
- data/ext/sources/examples/server/server.cpp +18 -4
- data/ext/sources/examples/talk-llama/CMakeLists.txt +3 -2
- data/ext/sources/examples/talk-llama/llama-adapter.cpp +7 -13
- data/ext/sources/examples/talk-llama/llama-adapter.h +4 -3
- data/ext/sources/examples/talk-llama/llama-arch.cpp +335 -17
- data/ext/sources/examples/talk-llama/llama-arch.h +42 -0
- data/ext/sources/examples/talk-llama/llama-batch.cpp +3 -1
- data/ext/sources/examples/talk-llama/llama-chat.cpp +21 -1
- data/ext/sources/examples/talk-llama/llama-chat.h +1 -0
- data/ext/sources/examples/talk-llama/llama-context.cpp +508 -520
- data/ext/sources/examples/talk-llama/llama-context.h +27 -28
- data/ext/sources/examples/talk-llama/llama-cparams.h +5 -0
- data/ext/sources/examples/talk-llama/llama-ext.h +12 -0
- data/ext/sources/examples/talk-llama/llama-grammar.cpp +8 -8
- data/ext/sources/examples/talk-llama/llama-graph.cpp +583 -130
- data/ext/sources/examples/talk-llama/llama-graph.h +131 -10
- data/ext/sources/examples/talk-llama/llama-hparams.cpp +57 -40
- data/ext/sources/examples/talk-llama/llama-hparams.h +79 -10
- data/ext/sources/examples/talk-llama/llama-impl.cpp +4 -4
- data/ext/sources/examples/talk-llama/llama-impl.h +13 -1
- data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.cpp +3 -1
- data/ext/sources/examples/talk-llama/llama-kv-cache.cpp +274 -89
- data/ext/sources/examples/talk-llama/llama-kv-cache.h +2 -3
- data/ext/sources/examples/talk-llama/llama-memory-hybrid-iswa.cpp +275 -0
- data/ext/sources/examples/talk-llama/llama-memory-hybrid-iswa.h +140 -0
- data/ext/sources/examples/talk-llama/llama-memory-recurrent.cpp +11 -13
- data/ext/sources/examples/talk-llama/llama-mmap.cpp +28 -11
- data/ext/sources/examples/talk-llama/llama-model-loader.cpp +527 -119
- data/ext/sources/examples/talk-llama/llama-model-loader.h +35 -5
- data/ext/sources/examples/talk-llama/llama-model-saver.cpp +60 -46
- data/ext/sources/examples/talk-llama/llama-model-saver.h +5 -2
- data/ext/sources/examples/talk-llama/llama-model.cpp +1365 -647
- data/ext/sources/examples/talk-llama/llama-model.h +72 -19
- data/ext/sources/examples/talk-llama/llama-quant.cpp +578 -346
- data/ext/sources/examples/talk-llama/{llama-sampling.cpp → llama-sampler.cpp} +190 -76
- data/ext/sources/examples/talk-llama/{llama-sampling.h → llama-sampler.h} +0 -2
- data/ext/sources/examples/talk-llama/llama-vocab.cpp +118 -48
- data/ext/sources/examples/talk-llama/llama-vocab.h +5 -0
- data/ext/sources/examples/talk-llama/llama.cpp +76 -22
- data/ext/sources/examples/talk-llama/llama.h +63 -30
- data/ext/sources/examples/talk-llama/models/afmoe.cpp +2 -3
- data/ext/sources/examples/talk-llama/models/apertus.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/arcee.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/arctic.cpp +4 -5
- data/ext/sources/examples/talk-llama/models/baichuan.cpp +4 -3
- data/ext/sources/examples/talk-llama/models/bailingmoe.cpp +1 -2
- data/ext/sources/examples/talk-llama/models/bailingmoe2.cpp +3 -5
- data/ext/sources/examples/talk-llama/models/bert.cpp +13 -7
- data/ext/sources/examples/talk-llama/models/bitnet.cpp +9 -24
- data/ext/sources/examples/talk-llama/models/bloom.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/chameleon.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/chatglm.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/codeshell.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/cogvlm.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/cohere2-iswa.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/command-r.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/dbrx.cpp +4 -5
- data/ext/sources/examples/talk-llama/models/deci.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/deepseek.cpp +4 -6
- data/ext/sources/examples/talk-llama/models/deepseek2.cpp +24 -21
- data/ext/sources/examples/talk-llama/models/delta-net-base.cpp +445 -0
- data/ext/sources/examples/talk-llama/models/dots1.cpp +4 -6
- data/ext/sources/examples/talk-llama/models/dream.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/ernie4-5-moe.cpp +4 -6
- data/ext/sources/examples/talk-llama/models/ernie4-5.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/eurobert.cpp +97 -0
- data/ext/sources/examples/talk-llama/models/exaone-moe.cpp +145 -0
- data/ext/sources/examples/talk-llama/models/exaone.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/exaone4.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/falcon-h1.cpp +2 -4
- data/ext/sources/examples/talk-llama/models/falcon.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/gemma-embedding.cpp +1 -1
- data/ext/sources/examples/talk-llama/models/gemma.cpp +1 -1
- data/ext/sources/examples/talk-llama/models/gemma2-iswa.cpp +1 -1
- data/ext/sources/examples/talk-llama/models/gemma3.cpp +1 -1
- data/ext/sources/examples/talk-llama/models/gemma3n-iswa.cpp +7 -7
- data/ext/sources/examples/talk-llama/models/glm4-moe.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/glm4.cpp +14 -7
- data/ext/sources/examples/talk-llama/models/gpt2.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/gptneox.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/granite-hybrid.cpp +4 -5
- data/ext/sources/examples/talk-llama/models/granite.cpp +4 -5
- data/ext/sources/examples/talk-llama/models/grok.cpp +4 -4
- data/ext/sources/examples/talk-llama/models/grovemoe.cpp +5 -7
- data/ext/sources/examples/talk-llama/models/hunyuan-dense.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/hunyuan-moe.cpp +4 -5
- data/ext/sources/examples/talk-llama/models/internlm2.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/jais.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/jais2.cpp +123 -0
- data/ext/sources/examples/talk-llama/models/jamba.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/kimi-linear.cpp +381 -0
- data/ext/sources/examples/talk-llama/models/lfm2.cpp +145 -124
- data/ext/sources/examples/talk-llama/models/llada-moe.cpp +4 -4
- data/ext/sources/examples/talk-llama/models/llada.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/llama-iswa.cpp +4 -4
- data/ext/sources/examples/talk-llama/models/llama.cpp +18 -11
- data/ext/sources/examples/talk-llama/models/maincoder.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/{graph-context-mamba.cpp → mamba-base.cpp} +9 -3
- data/ext/sources/examples/talk-llama/models/mamba.cpp +1 -2
- data/ext/sources/examples/talk-llama/models/mimo2-iswa.cpp +11 -5
- data/ext/sources/examples/talk-llama/models/minicpm3.cpp +14 -13
- data/ext/sources/examples/talk-llama/models/minimax-m2.cpp +4 -5
- data/ext/sources/examples/talk-llama/models/mistral3.cpp +4 -4
- data/ext/sources/examples/talk-llama/models/models.h +181 -46
- data/ext/sources/examples/talk-llama/models/modern-bert.cpp +2 -9
- data/ext/sources/examples/talk-llama/models/mpt.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/nemotron-h.cpp +26 -14
- data/ext/sources/examples/talk-llama/models/nemotron.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/neo-bert.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/olmo.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/olmo2.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/olmoe.cpp +4 -4
- data/ext/sources/examples/talk-llama/models/openai-moe-iswa.cpp +1 -1
- data/ext/sources/examples/talk-llama/models/openelm.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/orion.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/paddleocr.cpp +122 -0
- data/ext/sources/examples/talk-llama/models/pangu-embedded.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/phi2.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/phi3.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/plamo.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/plamo2.cpp +9 -5
- data/ext/sources/examples/talk-llama/models/plamo3.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/plm.cpp +15 -14
- data/ext/sources/examples/talk-llama/models/qwen.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/qwen2.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/qwen2moe.cpp +4 -4
- data/ext/sources/examples/talk-llama/models/qwen2vl.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/qwen3.cpp +12 -9
- data/ext/sources/examples/talk-llama/models/qwen35.cpp +381 -0
- data/ext/sources/examples/talk-llama/models/qwen35moe.cpp +422 -0
- data/ext/sources/examples/talk-llama/models/qwen3moe.cpp +15 -8
- data/ext/sources/examples/talk-llama/models/qwen3next.cpp +84 -432
- data/ext/sources/examples/talk-llama/models/qwen3vl-moe.cpp +9 -18
- data/ext/sources/examples/talk-llama/models/qwen3vl.cpp +8 -17
- data/ext/sources/examples/talk-llama/models/refact.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/rnd1.cpp +4 -4
- data/ext/sources/examples/talk-llama/models/rwkv6-base.cpp +2 -0
- data/ext/sources/examples/talk-llama/models/rwkv7-base.cpp +2 -0
- data/ext/sources/examples/talk-llama/models/seed-oss.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/smallthinker.cpp +4 -4
- data/ext/sources/examples/talk-llama/models/smollm3.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/stablelm.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/starcoder.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/starcoder2.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/step35-iswa.cpp +165 -0
- data/ext/sources/examples/talk-llama/models/t5-dec.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/t5-enc.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/xverse.cpp +3 -3
- data/ext/sources/examples/talk-llama/unicode.cpp +21 -65
- data/ext/sources/ggml/CMakeLists.txt +9 -3
- data/ext/sources/ggml/include/ggml-backend.h +1 -1
- data/ext/sources/ggml/include/ggml-cann.h +1 -1
- data/ext/sources/ggml/include/ggml-cpu.h +5 -0
- data/ext/sources/ggml/include/ggml-openvino.h +37 -0
- data/ext/sources/ggml/include/ggml-opt.h +1 -1
- data/ext/sources/ggml/include/ggml-rpc.h +6 -1
- data/ext/sources/ggml/include/ggml-virtgpu.h +14 -0
- data/ext/sources/ggml/include/ggml.h +56 -9
- data/ext/sources/ggml/src/CMakeLists.txt +3 -0
- data/ext/sources/ggml/src/ggml-alloc.c +4 -9
- data/ext/sources/ggml/src/ggml-backend-dl.cpp +48 -0
- data/ext/sources/ggml/src/ggml-backend-dl.h +45 -0
- data/ext/sources/ggml/src/ggml-backend-reg.cpp +28 -86
- data/ext/sources/ggml/src/ggml-backend.cpp +5 -2
- data/ext/sources/ggml/src/ggml-blas/CMakeLists.txt +1 -1
- data/ext/sources/ggml/src/ggml-blas/ggml-blas.cpp +6 -2
- data/ext/sources/ggml/src/ggml-cann/acl_tensor.cpp +1 -1
- data/ext/sources/ggml/src/ggml-cann/acl_tensor.h +1 -1
- data/ext/sources/ggml/src/ggml-cann/aclnn_ops.cpp +348 -189
- data/ext/sources/ggml/src/ggml-cann/aclnn_ops.h +40 -85
- data/ext/sources/ggml/src/ggml-cann/common.h +3 -4
- data/ext/sources/ggml/src/ggml-cann/ggml-cann.cpp +44 -62
- data/ext/sources/ggml/src/ggml-common.h +11 -0
- data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +16 -11
- data/ext/sources/ggml/src/ggml-cpu/amx/amx.cpp +42 -19
- data/ext/sources/ggml/src/ggml-cpu/amx/common.h +34 -10
- data/ext/sources/ggml/src/ggml-cpu/amx/mmq.cpp +85 -85
- data/ext/sources/ggml/src/ggml-cpu/arch/arm/quants.c +85 -1
- data/ext/sources/ggml/src/ggml-cpu/arch/arm/repack.cpp +2744 -548
- data/ext/sources/ggml/src/ggml-cpu/arch/riscv/quants.c +1653 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/riscv/repack.cpp +1391 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/s390/quants.c +8 -10
- data/ext/sources/ggml/src/ggml-cpu/arch/x86/quants.c +9 -9
- data/ext/sources/ggml/src/ggml-cpu/arch/x86/repack.cpp +118 -18
- data/ext/sources/ggml/src/ggml-cpu/arch-fallback.h +107 -26
- data/ext/sources/ggml/src/ggml-cpu/binary-ops.cpp +2 -6
- data/ext/sources/ggml/src/ggml-cpu/common.h +8 -0
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-impl.h +3 -0
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +59 -12
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.cpp +15 -0
- data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.cpp +21 -20
- data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +965 -252
- data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +584 -197
- data/ext/sources/ggml/src/ggml-cpu/ops.cpp +903 -188
- data/ext/sources/ggml/src/ggml-cpu/ops.h +1 -0
- data/ext/sources/ggml/src/ggml-cpu/quants.c +40 -0
- data/ext/sources/ggml/src/ggml-cpu/quants.h +3 -0
- data/ext/sources/ggml/src/ggml-cpu/repack.cpp +2890 -679
- data/ext/sources/ggml/src/ggml-cpu/repack.h +119 -8
- data/ext/sources/ggml/src/ggml-cpu/simd-gemm.h +136 -0
- data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +111 -3
- data/ext/sources/ggml/src/ggml-cpu/unary-ops.cpp +1 -1
- data/ext/sources/ggml/src/ggml-cpu/vec.cpp +17 -0
- data/ext/sources/ggml/src/ggml-cuda/CMakeLists.txt +1 -1
- data/ext/sources/ggml/src/ggml-cuda/argsort.cu +19 -10
- data/ext/sources/ggml/src/ggml-cuda/binbcast.cu +32 -30
- data/ext/sources/ggml/src/ggml-cuda/common.cuh +134 -18
- data/ext/sources/ggml/src/ggml-cuda/convert.cu +41 -27
- data/ext/sources/ggml/src/ggml-cuda/cpy.cu +6 -3
- data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +78 -64
- data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +384 -143
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cuh +36 -22
- data/ext/sources/ggml/src/ggml-cuda/fattn-vec.cuh +3 -3
- data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +26 -5
- data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cuh +1 -1
- data/ext/sources/ggml/src/ggml-cuda/fattn.cu +127 -12
- data/ext/sources/ggml/src/ggml-cuda/gated_delta_net.cu +263 -0
- data/ext/sources/ggml/src/ggml-cuda/gated_delta_net.cuh +4 -0
- data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +595 -200
- data/ext/sources/ggml/src/ggml-cuda/mean.cu +9 -8
- data/ext/sources/ggml/src/ggml-cuda/mma.cuh +173 -6
- data/ext/sources/ggml/src/ggml-cuda/mmf.cu +30 -10
- data/ext/sources/ggml/src/ggml-cuda/mmf.cuh +158 -85
- data/ext/sources/ggml/src/ggml-cuda/mmq.cuh +34 -22
- data/ext/sources/ggml/src/ggml-cuda/mmvf.cu +127 -67
- data/ext/sources/ggml/src/ggml-cuda/mmvf.cuh +2 -0
- data/ext/sources/ggml/src/ggml-cuda/mmvq.cu +157 -65
- data/ext/sources/ggml/src/ggml-cuda/mmvq.cuh +1 -0
- data/ext/sources/ggml/src/ggml-cuda/norm.cu +18 -76
- data/ext/sources/ggml/src/ggml-cuda/pad.cu +13 -10
- data/ext/sources/ggml/src/ggml-cuda/quantize.cu +1 -1
- data/ext/sources/ggml/src/ggml-cuda/reduce_rows.cuh +2 -16
- data/ext/sources/ggml/src/ggml-cuda/rope.cu +233 -133
- data/ext/sources/ggml/src/ggml-cuda/softmax.cu +8 -83
- data/ext/sources/ggml/src/ggml-cuda/solve_tri.cu +1 -1
- data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cu +56 -32
- data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cuh +1 -1
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_32.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_4.cu +1 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_32.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_4.cu +1 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_4.cu +1 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_4.cu +1 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +3 -3
- data/ext/sources/ggml/src/ggml-cuda/top-k.cu +0 -1
- data/ext/sources/ggml/src/ggml-cuda/topk-moe.cu +199 -135
- data/ext/sources/ggml/src/ggml-cuda/topk-moe.cuh +20 -14
- data/ext/sources/ggml/src/ggml-cuda/unary.cu +55 -0
- data/ext/sources/ggml/src/ggml-cuda/unary.cuh +2 -0
- data/ext/sources/ggml/src/ggml-cuda/vecdotq.cuh +31 -17
- data/ext/sources/ggml/src/ggml-cuda/vendors/hip.h +10 -0
- data/ext/sources/ggml/src/ggml-hexagon/CMakeLists.txt +82 -45
- data/ext/sources/ggml/src/ggml-hexagon/ggml-hexagon.cpp +334 -160
- data/ext/sources/ggml/src/ggml-hexagon/htp/CMakeLists.txt +7 -5
- data/ext/sources/ggml/src/ggml-hexagon/htp/act-ops.c +328 -197
- data/ext/sources/ggml/src/ggml-hexagon/htp/argsort-ops.c +281 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/binary-ops.c +765 -234
- data/ext/sources/ggml/src/ggml-hexagon/htp/cpy-ops.c +252 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/flash-attn-ops.c +412 -265
- data/ext/sources/ggml/src/ggml-hexagon/htp/get-rows-ops.c +23 -23
- data/ext/sources/ggml/src/ggml-hexagon/htp/{htp-dma.c → hex-dma.c} +1 -1
- data/ext/sources/ggml/src/ggml-hexagon/htp/{htp-dma.h → hex-dma.h} +28 -3
- data/ext/sources/ggml/src/ggml-hexagon/htp/hex-dump.h +77 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hex-fastdiv.h +37 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hex-utils.h +51 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ctx.h +1 -1
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp-msg.h +27 -37
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ops.h +6 -35
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-arith.h +443 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-base.h +240 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-copy.h +245 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-div.h +251 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-dump.h +129 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-exp.h +215 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-floor.h +100 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-inverse.h +210 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-reduce.h +296 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-scale.h +133 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sigmoid.h +141 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sqrt.h +126 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-types.h +36 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-utils.h +20 -1347
- data/ext/sources/ggml/src/ggml-hexagon/htp/main.c +211 -13
- data/ext/sources/ggml/src/ggml-hexagon/htp/matmul-ops.c +1119 -952
- data/ext/sources/ggml/src/ggml-hexagon/htp/rope-ops.c +254 -244
- data/ext/sources/ggml/src/ggml-hexagon/htp/set-rows-ops.c +36 -36
- data/ext/sources/ggml/src/ggml-hexagon/htp/softmax-ops.c +155 -138
- data/ext/sources/ggml/src/ggml-hexagon/htp/ssm-conv.c +339 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/sum-rows-ops.c +128 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/unary-ops.c +209 -114
- data/ext/sources/ggml/src/ggml-hexagon/htp/worker-pool.c +1 -5
- data/ext/sources/ggml/src/ggml-hexagon/htp-drv.cpp +418 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp-drv.h +121 -0
- data/ext/sources/ggml/src/ggml-hexagon/libdl.h +79 -0
- data/ext/sources/ggml/src/ggml-hexagon/libggml-htp.inf +38 -0
- data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +6 -0
- data/ext/sources/ggml/src/ggml-impl.h +62 -0
- data/ext/sources/ggml/src/ggml-metal/CMakeLists.txt +10 -10
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-common.cpp +13 -2
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.h +8 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.m +147 -17
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.cpp +274 -73
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.h +22 -4
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.m +102 -36
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +174 -23
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.cpp +580 -280
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.h +5 -4
- data/ext/sources/ggml/src/ggml-metal/ggml-metal.cpp +320 -107
- data/ext/sources/ggml/src/ggml-metal/ggml-metal.metal +1068 -825
- data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +19 -1
- data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +3108 -636
- data/ext/sources/ggml/src/ggml-opencl/kernels/concat.cl +41 -99
- data/ext/sources/ggml/src/ggml-opencl/kernels/cpy.cl +45 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/cumsum.cl +139 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/cvt.cl +204 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/diag.cl +27 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/exp.cl +125 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/expm1.cl +87 -56
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q4_1_f32.cl +132 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_general_q8_0_f32.cl +195 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q4_1_f32.cl +283 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/l2_norm.cl +71 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mean.cl +114 -13
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q4_0_f32_l4_lm.cl +163 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q4_1_f32_l4_lm.cl +165 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q6_k_f32_l4_lm.cl +158 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_8x4.cl +129 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32.cl +219 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32_flat.cl +229 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_k_f32.cl +180 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/{mul_mv_q6_k.cl → mul_mv_q6_k_f32.cl} +4 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32_flat.cl +194 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/neg.cl +125 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/repeat.cl +31 -32
- data/ext/sources/ggml/src/ggml-opencl/kernels/scale.cl +14 -4
- data/ext/sources/ggml/src/ggml-opencl/kernels/softplus.cl +88 -60
- data/ext/sources/ggml/src/ggml-opencl/kernels/solve_tri.cl +51 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/sum_rows.cl +114 -13
- data/ext/sources/ggml/src/ggml-opencl/kernels/tanh.cl +94 -48
- data/ext/sources/ggml/src/ggml-opencl/kernels/transpose.cl +26 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/tri.cl +32 -0
- data/ext/sources/ggml/src/ggml-openvino/.clang-format +154 -0
- data/ext/sources/ggml/src/ggml-openvino/CMakeLists.txt +22 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-decoder.cpp +975 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-decoder.h +294 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-openvino-extra.cpp +373 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-openvino-extra.h +182 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-openvino.cpp +1110 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-quants.cpp +884 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-quants.h +153 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/decoder.h +74 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/frontend.cpp +27 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/frontend.h +23 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/input_model.cpp +17 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/input_model.h +29 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/node_context.h +112 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/cont.cpp +48 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/cpy.cpp +21 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp +90 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/get_rows.cpp +69 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/glu_geglu.cpp +61 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp +62 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/mulmat.cpp +90 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/permute.cpp +102 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/reshape.cpp +83 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp +46 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/rope.cpp +123 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/scale.cpp +41 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/set_rows.cpp +76 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/softmax.cpp +89 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/transpose.cpp +23 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/unary_silu.cpp +27 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/view.cpp +53 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op_table.cpp +46 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op_table.h +39 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.cpp +123 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.h +17 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.cpp +60 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.h +17 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/mark_decompression_convert_constant_folding.h +29 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/squeeze_matmul.cpp +58 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/squeeze_matmul.h +17 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/translate_session.cpp +293 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/translate_session.h +28 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/utils.cpp +226 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/utils.h +85 -0
- data/ext/sources/ggml/src/ggml-openvino/utils.cpp +823 -0
- data/ext/sources/ggml/src/ggml-openvino/utils.h +123 -0
- data/ext/sources/ggml/src/ggml-quants.c +96 -5
- data/ext/sources/ggml/src/ggml-quants.h +3 -0
- data/ext/sources/ggml/src/ggml-sycl/CMakeLists.txt +15 -88
- data/ext/sources/ggml/src/ggml-sycl/add-id.cpp +5 -1
- data/ext/sources/ggml/src/ggml-sycl/backend.hpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/binbcast.cpp +21 -20
- data/ext/sources/ggml/src/ggml-sycl/common.hpp +315 -10
- data/ext/sources/ggml/src/ggml-sycl/convert.cpp +69 -1
- data/ext/sources/ggml/src/ggml-sycl/convert.hpp +22 -1
- data/ext/sources/ggml/src/ggml-sycl/count-equal.cpp +1 -1
- data/ext/sources/ggml/src/ggml-sycl/dpct/helper.hpp +791 -47
- data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +78 -68
- data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +2 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn-common.hpp +1179 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn-tile.cpp +55 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn-tile.hpp +1338 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn-vec.hpp +667 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn.cpp +225 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn.hpp +22 -0
- data/ext/sources/ggml/src/ggml-sycl/gated_delta_net.cpp +309 -0
- data/ext/sources/ggml/src/ggml-sycl/gated_delta_net.hpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/ggml-sycl.cpp +316 -51
- data/ext/sources/ggml/src/ggml-sycl/norm.cpp +65 -66
- data/ext/sources/ggml/src/ggml-sycl/outprod.cpp +3 -3
- data/ext/sources/ggml/src/ggml-sycl/presets.hpp +3 -0
- data/ext/sources/ggml/src/ggml-sycl/quants.hpp +1 -1
- data/ext/sources/ggml/src/ggml-sycl/rope.cpp +450 -287
- data/ext/sources/ggml/src/ggml-sycl/rope.hpp +6 -0
- data/ext/sources/ggml/src/ggml-sycl/softmax.cpp +6 -6
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq112-dv112.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq128-dv128.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq256-dv256.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq40-dv40.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq576-dv512.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq64-dv64.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq72-dv72.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq80-dv80.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq96-dv96.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-f16.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q8_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-f16.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q8_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-f16.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q8_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-f16.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q8_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-f16.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q8_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-f16.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q8_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +13 -0
- data/ext/sources/ggml/src/ggml-sycl/wkv.cpp +1 -1
- data/ext/sources/ggml/src/ggml-virtgpu/CMakeLists.txt +70 -0
- data/ext/sources/ggml/src/ggml-virtgpu/apir_cs_ggml-rpc-front.cpp +87 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/CMakeLists.txt +21 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/apir_cs_ggml-rpc-back.cpp +115 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-convert.h +13 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-backend.cpp +102 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer-type.cpp +105 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer.cpp +179 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-device.cpp +148 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched.cpp +51 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched.gen.h +73 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched.h +27 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-virgl-apir.h +32 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend.cpp +144 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/api_remoting.h +95 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_backend.gen.h +94 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_backend.h +50 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_cs.h +378 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_cs_ggml.h +232 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_cs_rpc.h +58 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-buffer-type.cpp +81 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-buffer.cpp +119 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-device.cpp +158 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-reg.cpp +213 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend.cpp +69 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-remoting.h +71 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggmlremoting_functions.yaml +166 -0
- data/ext/sources/ggml/src/ggml-virtgpu/include/apir_hw.h +9 -0
- data/ext/sources/ggml/src/ggml-virtgpu/regenerate_remoting.py +333 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-apir.h +15 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-backend.cpp +58 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-buffer-type.cpp +110 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-buffer.cpp +173 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-device.cpp +192 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-impl.h +36 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward.gen.h +53 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-shm.cpp +98 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-shm.h +23 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-utils.cpp +179 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-utils.h +86 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu.cpp +544 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu.h +117 -0
- data/ext/sources/ggml/src/ggml-vulkan/CMakeLists.txt +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/ggml-vulkan.cpp +1250 -465
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/acc.comp +16 -8
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/elu.comp +27 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +374 -170
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.glsl +66 -22
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +389 -201
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +106 -58
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_mask_opt.comp +162 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +9 -8
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gated_delta_net.comp +128 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/l2_norm.comp +12 -9
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.glsl +20 -17
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +11 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +8 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_id_funcs.glsl +3 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +5 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.glsl +3 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +2 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl +36 -63
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +7 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +7 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +7 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl +10 -5
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_vision.comp +7 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sgn.comp +21 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/ssm_conv.comp +16 -10
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +55 -35
- data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp +1314 -109
- data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu.cpp +1660 -1371
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/argmax.wgsl +72 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/argsort.wgsl +106 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/argsort_merge.wgsl +134 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary.wgsl +141 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +65 -72
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/concat.wgsl +75 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cpy.tmpl.wgsl +6 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cumsum.wgsl +66 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +40 -5
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn.wgsl +105 -60
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{get_rows.tmpl.wgsl → get_rows.wgsl} +53 -259
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{mul_mat.tmpl.wgsl → mul_mat.wgsl} +68 -257
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl +692 -23
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{mul_mat_reg_tile.tmpl.wgsl → mul_mat_reg_tile.wgsl} +28 -128
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{mul_mat_subgroup_matrix.tmpl.wgsl → mul_mat_subgroup_matrix.wgsl} +31 -137
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.wgsl +480 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/pad.wgsl +86 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/repeat.wgsl +67 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{scale.tmpl.wgsl → scale.wgsl} +9 -36
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +40 -12
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/sum_rows.wgsl +55 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/unary.wgsl +193 -0
- data/ext/sources/ggml/src/ggml-zdnn/ggml-zdnn.cpp +6 -1
- data/ext/sources/ggml/src/ggml-zendnn/CMakeLists.txt +31 -32
- data/ext/sources/ggml/src/ggml-zendnn/ggml-zendnn.cpp +9 -6
- data/ext/sources/ggml/src/ggml.c +167 -33
- data/ext/sources/ggml/src/gguf.cpp +229 -44
- data/ext/sources/src/whisper.cpp +6 -28
- data/sig/whisper.rbs +43 -2
- data/test/test_context_params.rb +82 -0
- data/test/test_token.rb +11 -0
- data/test/test_vad_context.rb +58 -8
- data/test/test_whisper.rb +20 -0
- data/whispercpp.gemspec +1 -1
- metadata +240 -28
- data/ext/sources/ggml/cmake/BuildTypes.cmake +0 -54
- data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm-ppc.h +0 -333
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-exp.c +0 -94
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-inverse.c +0 -72
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sigmoid.c +0 -49
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-utils.c +0 -1020
- data/ext/sources/ggml/src/ggml-hexagon/htp/ops-utils.h +0 -149
- data/ext/sources/ggml/src/ggml-hexagon/htp-utils.c +0 -454
- data/ext/sources/ggml/src/ggml-hexagon/htp-utils.h +0 -221
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/bin_op.tmpl.wgsl +0 -188
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary_head.tmpl +0 -45
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.tmpl.wgsl +0 -267
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.tmpl.wgsl +0 -112
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/unary_op.wgsl +0 -483
|
@@ -0,0 +1,153 @@
|
|
|
1
|
+
#pragma once
|
|
2
|
+
#include "ggml-openvino-extra.h" // For ExtraQuantType
|
|
3
|
+
#include "ggml.h"
|
|
4
|
+
|
|
5
|
+
#include <cstdint>
|
|
6
|
+
#include <openvino/op/constant.hpp>
|
|
7
|
+
#include <openvino/runtime/tensor.hpp>
|
|
8
|
+
|
|
9
|
+
void unpack_32_4(const uint8_t* data, uint8_t* dst);
|
|
10
|
+
|
|
11
|
+
void extract_q4_0_data(const ggml_tensor * tensor,
|
|
12
|
+
ov::Tensor & weights_arr,
|
|
13
|
+
ov::Tensor & scales_arr,
|
|
14
|
+
ov::Tensor & zp_arr);
|
|
15
|
+
|
|
16
|
+
void extract_q4_1_data(const ggml_tensor * tensor,
|
|
17
|
+
ov::Tensor & weights_arr,
|
|
18
|
+
ov::Tensor & scales_arr,
|
|
19
|
+
ov::Tensor & zp_arr,
|
|
20
|
+
bool use_bias = false);
|
|
21
|
+
|
|
22
|
+
void extract_q8_0_data(const ggml_tensor * tensor,
|
|
23
|
+
ov::Tensor & weights_arr,
|
|
24
|
+
ov::Tensor & scales_arr,
|
|
25
|
+
ov::Tensor & zp_arr);
|
|
26
|
+
|
|
27
|
+
void unpack_256_4(const uint8_t* data, uint8_t* dst);
|
|
28
|
+
|
|
29
|
+
void extract_q4_k_data(const ggml_tensor * tensor,
|
|
30
|
+
ov::Tensor & weights_arr,
|
|
31
|
+
ov::Tensor & scales_arr,
|
|
32
|
+
ov::Tensor & zp_arr,
|
|
33
|
+
bool use_bias = false);
|
|
34
|
+
|
|
35
|
+
void extract_q5_k_data(const ggml_tensor * tensor,
|
|
36
|
+
ov::Tensor & weights_arr,
|
|
37
|
+
ov::Tensor & scales_arr,
|
|
38
|
+
ov::Tensor & zp_arr,
|
|
39
|
+
bool use_bias = false);
|
|
40
|
+
|
|
41
|
+
void extract_q6_k_data(const ggml_tensor * tensor,
|
|
42
|
+
ov::Tensor & weights_arr,
|
|
43
|
+
ov::Tensor & scales_arr,
|
|
44
|
+
ov::Tensor & zp_arr);
|
|
45
|
+
|
|
46
|
+
static constexpr size_t GGML_QUANTIZATION_GROUP_SIZE = 32;
|
|
47
|
+
|
|
48
|
+
ov::Output<ov::Node> make_int8_weights(ov::Tensor & weight,
|
|
49
|
+
ov::Tensor & scales,
|
|
50
|
+
ov::Tensor & zp,
|
|
51
|
+
size_t group_size = GGML_QUANTIZATION_GROUP_SIZE,
|
|
52
|
+
bool use_bias = false);
|
|
53
|
+
|
|
54
|
+
ov::Output<ov::Node> make_int4_weights(ov::Tensor & weight,
|
|
55
|
+
ov::Tensor & scales,
|
|
56
|
+
ov::Tensor & zp,
|
|
57
|
+
size_t group_size = GGML_QUANTIZATION_GROUP_SIZE,
|
|
58
|
+
bool use_bias = false);
|
|
59
|
+
|
|
60
|
+
// Extract quantized weights from tensor and create weight subgraph
|
|
61
|
+
// If weights/scales/zp are provided (non-empty), uses them as output buffers
|
|
62
|
+
// Otherwise allocates new ov::Tensors internally
|
|
63
|
+
// Returns the weight node (make_int4_weights or make_int8_weights result)
|
|
64
|
+
std::shared_ptr<ov::Node> extract_quantized_weights(
|
|
65
|
+
const ggml_tensor * tensor,
|
|
66
|
+
const void * data, // Source data pointer (may differ from tensor->data)
|
|
67
|
+
ov::Tensor & weights,
|
|
68
|
+
ov::Tensor & scales,
|
|
69
|
+
ov::Tensor & zp,
|
|
70
|
+
bool use_bias = false); // Use fp bias instead of quantized zero_point (for test-backend-ops)
|
|
71
|
+
|
|
72
|
+
// Requantize weights from tensor to target format, writing to provided buffers
|
|
73
|
+
// For F16 target, only weights buffer is used (scales/zp ignored)
|
|
74
|
+
// Returns the weight node
|
|
75
|
+
std::shared_ptr<ov::Node> requantize_to_buffers(const ggml_tensor * tensor,
|
|
76
|
+
const void * data, // Source data pointer
|
|
77
|
+
ExtraQuantType requant_type,
|
|
78
|
+
int64_t block_size,
|
|
79
|
+
ov::Tensor & weights,
|
|
80
|
+
ov::Tensor & scales,
|
|
81
|
+
ov::Tensor & zp);
|
|
82
|
+
|
|
83
|
+
// Returns the printable name of a requantization target type.
// Any enumerator without a dedicated case below is reported as "unknown".
inline const char * extra_quant_type_name(ExtraQuantType t) {
    const char * label = "unknown";
    switch (t) {
        case ExtraQuantType::F16:
            label = "F16";
            break;
        case ExtraQuantType::Q4_0_C:
            label = "Q4_0_C";
            break;
        case ExtraQuantType::Q4_0_128:
            label = "Q4_0_128";
            break;
        case ExtraQuantType::Q8_0_C:
            label = "Q8_0_C";
            break;
        case ExtraQuantType::Q8_0_32:
            label = "Q8_0_32";
            break;
        case ExtraQuantType::Q8_1_C:
            label = "Q8_1_C";
            break;
        default:
            break;
    }
    return label;
}
|
|
101
|
+
|
|
102
|
+
// Result from process_weight_tensor containing the weight node and tensors.
|
|
103
|
+
// For quantized weights, also contains the extracted layout and scale/zp tensors.
|
|
104
|
+
struct OvWeight {
|
|
105
|
+
std::shared_ptr<ov::Node> weight_node;
|
|
106
|
+
ggml_openvino_extracted_layout layout; // Only meaningful for quantized (layout.total_size > 0)
|
|
107
|
+
ov::Tensor weights;
|
|
108
|
+
ov::Tensor scales;
|
|
109
|
+
ov::Tensor zp;
|
|
110
|
+
|
|
111
|
+
bool is_quantized() const { return layout.scales_size > 0; }
|
|
112
|
+
};
|
|
113
|
+
|
|
114
|
+
// Process weight tensor and create an OpenVINO weight node
|
|
115
|
+
// Handles F16/F32/BF16 and quantized weights, with optional requantization
|
|
116
|
+
// If output_base_ptr is nullptr, allocates internal buffers (for decoder use)
|
|
117
|
+
// If output_base_ptr is provided, uses pre-allocated buffers at specified offsets (for backend buffer use)
|
|
118
|
+
// Returns OvWeight with the weight node and optional quantized tensors
|
|
119
|
+
OvWeight process_weight_tensor(
|
|
120
|
+
const ggml_tensor * tensor,
|
|
121
|
+
const void * data, // Source data pointer (may differ from tensor->data)
|
|
122
|
+
void * output_base_ptr = nullptr, // Base pointer for output buffers (or nullptr for internal allocation)
|
|
123
|
+
bool use_bias = false); // Use fp bias instead of quantized zero_point, only used in test-backend-ops
|
|
124
|
+
|
|
125
|
+
void quantize_q4_0(const float * x,
|
|
126
|
+
ov::Tensor & weights_arr,
|
|
127
|
+
ov::Tensor & scales_arr,
|
|
128
|
+
ov::Tensor & zp_arr,
|
|
129
|
+
int64_t k,
|
|
130
|
+
int64_t qk);
|
|
131
|
+
void quantize_q8_1(const float * x,
|
|
132
|
+
ov::Tensor & weights_arr,
|
|
133
|
+
ov::Tensor & scales_arr,
|
|
134
|
+
ov::Tensor & zp_arr,
|
|
135
|
+
int64_t k,
|
|
136
|
+
int64_t qk);
|
|
137
|
+
void quantize_q8_0(const float * x,
|
|
138
|
+
ov::Tensor & weights_arr,
|
|
139
|
+
ov::Tensor & scales_arr,
|
|
140
|
+
ov::Tensor & zp_arr,
|
|
141
|
+
int64_t k,
|
|
142
|
+
int64_t qk);
|
|
143
|
+
|
|
144
|
+
namespace ov {
|
|
145
|
+
namespace op {
|
|
146
|
+
namespace util {
|
|
147
|
+
// From <openvino>/src/common/transformations/include/transformations/utils/utils.hpp
|
|
148
|
+
bool get_single_value(const std::shared_ptr<ov::op::v0::Constant>& const_node,
|
|
149
|
+
float& value,
|
|
150
|
+
bool check_value_range = true);
|
|
151
|
+
} // namespace util
|
|
152
|
+
} // namespace op
|
|
153
|
+
} // namespace ov
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
#pragma once
|
|
2
|
+
|
|
3
|
+
#include <cstdint>
|
|
4
|
+
#include <map>
|
|
5
|
+
#include <openvino/core/node.hpp>
|
|
6
|
+
#include <openvino/frontend/decoder.hpp>
|
|
7
|
+
#include <string>
|
|
8
|
+
|
|
9
|
+
namespace ov {
|
|
10
|
+
namespace frontend {
|
|
11
|
+
namespace ggml {
|
|
12
|
+
|
|
13
|
+
class GgmlDecoder : public DecoderBase {
|
|
14
|
+
public:
|
|
15
|
+
virtual ov::Any get_attribute(const std::string& name) const = 0;
|
|
16
|
+
|
|
17
|
+
virtual PartialShape get_input_shape(int node_idx, const std::string& name) const = 0;
|
|
18
|
+
|
|
19
|
+
virtual std::vector<size_t> get_input_stride(int node_idx, const std::string& name) const = 0;
|
|
20
|
+
|
|
21
|
+
virtual element::Type get_input_type(int node_idx, const std::string& name) const = 0;
|
|
22
|
+
|
|
23
|
+
virtual size_t get_input_size() const = 0;
|
|
24
|
+
|
|
25
|
+
virtual size_t get_input_size(int node_idx) const = 0;
|
|
26
|
+
|
|
27
|
+
virtual void get_input_node(size_t input_port_idx,
|
|
28
|
+
std::string& producer_name,
|
|
29
|
+
std::string& producer_output_port_name,
|
|
30
|
+
size_t& producer_output_port_index) const = 0;
|
|
31
|
+
|
|
32
|
+
virtual std::vector<std::string> get_input_names(int node_idx) const = 0;
|
|
33
|
+
|
|
34
|
+
virtual PartialShape get_output_shape(int node_idx) const = 0;
|
|
35
|
+
|
|
36
|
+
virtual element::Type get_output_type(const int node_idx) const = 0;
|
|
37
|
+
|
|
38
|
+
virtual int32_t* get_input_op_params(int node_idx, const std::string& name) const = 0;
|
|
39
|
+
|
|
40
|
+
virtual int32_t * get_output_op_params(int node_idx) const = 0;
|
|
41
|
+
|
|
42
|
+
virtual std::vector<std::string> get_output_names(int node_idx) const = 0;
|
|
43
|
+
|
|
44
|
+
virtual const std::string& get_op_type() const = 0;
|
|
45
|
+
|
|
46
|
+
virtual const std::string& get_op_type(int node_idx) const = 0;
|
|
47
|
+
|
|
48
|
+
virtual const std::string& get_op_name() const = 0;
|
|
49
|
+
|
|
50
|
+
virtual const std::string& get_op_name(int node_idx) const = 0;
|
|
51
|
+
|
|
52
|
+
virtual void visit_subgraph(std::function<void(std::shared_ptr<GgmlDecoder>, int node_idx)> node_visitor) const = 0;
|
|
53
|
+
|
|
54
|
+
virtual int get_op_case(int node_idx) const = 0;
|
|
55
|
+
|
|
56
|
+
virtual const std::map<std::string, std::shared_ptr<ov::Node>>& get_model_inputs() const = 0;
|
|
57
|
+
virtual const std::map<std::string, std::shared_ptr<ov::Node>>& get_model_extra_inputs() const = 0;
|
|
58
|
+
virtual const std::map<std::string, std::shared_ptr<ov::Node>>& get_model_weights() const = 0;
|
|
59
|
+
virtual std::vector<std::string> get_model_output_names() const = 0;
|
|
60
|
+
|
|
61
|
+
virtual int32_t* get_rope_params() const = 0;
|
|
62
|
+
|
|
63
|
+
virtual std::map<std::string, std::string> get_kv_param_res_names() const = 0;
|
|
64
|
+
|
|
65
|
+
virtual bool is_static() const = 0;
|
|
66
|
+
|
|
67
|
+
virtual bool is_stateful() const = 0;
|
|
68
|
+
|
|
69
|
+
virtual int is_swa_layer(int layer) const = 0;
|
|
70
|
+
};
|
|
71
|
+
|
|
72
|
+
} // namespace ggml
|
|
73
|
+
} // namespace frontend
|
|
74
|
+
} // namespace ov
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
#include "frontend.h"
|
|
2
|
+
|
|
3
|
+
#include "input_model.h"
|
|
4
|
+
#include "op_table.h"
|
|
5
|
+
#include "translate_session.h"
|
|
6
|
+
|
|
7
|
+
namespace ov {
|
|
8
|
+
namespace frontend {
|
|
9
|
+
namespace ggml {
|
|
10
|
+
|
|
11
|
+
// The front end keeps no state, so construction has nothing to do.
FrontEnd::FrontEnd() = default;
|
|
12
|
+
|
|
13
|
+
// Converts `model` into an OpenVINO model by running a TranslateSession over it.
//   model - input model; the check below fails with "Invalid input model" unless it is a ggml::InputModel
//   naive - forwarded unchanged to the TranslateSession constructor
// Returns whatever the session reports as its converted model.
std::shared_ptr<Model> FrontEnd::convert(const InputModel::Ptr & model, bool naive) {
    auto ggml_model = std::dynamic_pointer_cast<ggml::InputModel>(model);
    FRONT_END_GENERAL_CHECK(ggml_model, "Invalid input model");

    const auto & supported_ops = get_supported_ops();

    TranslateSession session(model, supported_ops, naive);
    return session.get_converted_model();
}
|
|
24
|
+
|
|
25
|
+
} // namespace ggml
|
|
26
|
+
} // namespace frontend
|
|
27
|
+
} // namespace ov
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
// Copyright (C) 2018-2024 Intel Corporation
|
|
2
|
+
// SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
//
|
|
4
|
+
|
|
5
|
+
#pragma once
|
|
6
|
+
|
|
7
|
+
#include <openvino/frontend/frontend.hpp>
|
|
8
|
+
|
|
9
|
+
namespace ov {
|
|
10
|
+
namespace frontend {
|
|
11
|
+
namespace ggml {
|
|
12
|
+
|
|
13
|
+
class FrontEnd {
|
|
14
|
+
public:
|
|
15
|
+
using Ptr = std::shared_ptr<FrontEnd>;
|
|
16
|
+
FrontEnd();
|
|
17
|
+
|
|
18
|
+
static std::shared_ptr<Model> convert(const InputModel::Ptr& model, bool naive = false);
|
|
19
|
+
};
|
|
20
|
+
|
|
21
|
+
} // namespace ggml
|
|
22
|
+
} // namespace frontend
|
|
23
|
+
} // namespace ov
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
#include "input_model.h"
|
|
2
|
+
|
|
3
|
+
#include "decoder.h"
|
|
4
|
+
|
|
5
|
+
namespace ov {
|
|
6
|
+
namespace frontend {
|
|
7
|
+
namespace ggml {
|
|
8
|
+
|
|
9
|
+
InputModel::InputModel(const std::shared_ptr<GgmlDecoder> & gdecoder) : m_decoder(gdecoder) {}
|
|
10
|
+
|
|
11
|
+
const std::shared_ptr<GgmlDecoder> & InputModel::get_model_decoder() const {
|
|
12
|
+
return m_decoder;
|
|
13
|
+
}
|
|
14
|
+
|
|
15
|
+
} // namespace ggml
|
|
16
|
+
} // namespace frontend
|
|
17
|
+
} // namespace ov
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
#pragma once
|
|
2
|
+
|
|
3
|
+
#include <openvino/frontend/input_model.hpp>
|
|
4
|
+
|
|
5
|
+
#include "decoder.h"
|
|
6
|
+
|
|
7
|
+
namespace ov {
|
|
8
|
+
namespace frontend {
|
|
9
|
+
namespace ggml {
|
|
10
|
+
|
|
11
|
+
class FrontEnd;
|
|
12
|
+
class GgmlDecoder;
|
|
13
|
+
using ov::frontend::ggml::GgmlDecoder;
|
|
14
|
+
|
|
15
|
+
class InputModel : public ov::frontend::InputModel {
|
|
16
|
+
friend class ::ov::frontend::ggml::FrontEnd;
|
|
17
|
+
|
|
18
|
+
public:
|
|
19
|
+
explicit InputModel(const std::shared_ptr<GgmlDecoder>& gdecoder);
|
|
20
|
+
|
|
21
|
+
const std::shared_ptr<GgmlDecoder>& get_model_decoder() const;
|
|
22
|
+
|
|
23
|
+
private:
|
|
24
|
+
std::shared_ptr<GgmlDecoder> m_decoder;
|
|
25
|
+
};
|
|
26
|
+
|
|
27
|
+
} // namespace ggml
|
|
28
|
+
} // namespace frontend
|
|
29
|
+
} // namespace ov
|
|
@@ -0,0 +1,112 @@
|
|
|
1
|
+
#pragma once
|
|
2
|
+
|
|
3
|
+
#include <cstdint>
|
|
4
|
+
#include <openvino/frontend/node_context.hpp>
|
|
5
|
+
#include <string>
|
|
6
|
+
|
|
7
|
+
#include "decoder.h"
|
|
8
|
+
|
|
9
|
+
namespace ov {
|
|
10
|
+
namespace frontend {
|
|
11
|
+
namespace ggml {
|
|
12
|
+
|
|
13
|
+
class TranslateSession;
|
|
14
|
+
|
|
15
|
+
typedef std::map<std::string, Output<Node>> TensorMap;
|
|
16
|
+
|
|
17
|
+
class NodeContext : public frontend::NodeContext {
|
|
18
|
+
public:
|
|
19
|
+
NodeContext(const std::shared_ptr<GgmlDecoder>& decoder,
|
|
20
|
+
std::shared_ptr<TensorMap>& tensor_map,
|
|
21
|
+
int node_idx,
|
|
22
|
+
TranslateSession* translate_session = nullptr)
|
|
23
|
+
: ov::frontend::NodeContext(decoder->get_op_type(node_idx)),
|
|
24
|
+
m_decoder(decoder),
|
|
25
|
+
m_tensor_map(tensor_map),
|
|
26
|
+
m_node_idx(node_idx),
|
|
27
|
+
m_translate_session(translate_session) {
|
|
28
|
+
m_input_names = decoder->get_input_names(m_node_idx);
|
|
29
|
+
m_output_names = decoder->get_output_names(m_node_idx);
|
|
30
|
+
}
|
|
31
|
+
|
|
32
|
+
TranslateSession* get_translate_session() const {
|
|
33
|
+
return m_translate_session;
|
|
34
|
+
}
|
|
35
|
+
|
|
36
|
+
const std::vector<std::string>& get_input_names() const { return m_input_names; }
|
|
37
|
+
|
|
38
|
+
size_t get_input_size() const override {
|
|
39
|
+
return m_decoder->get_input_size(m_node_idx);
|
|
40
|
+
}
|
|
41
|
+
|
|
42
|
+
ov::element::Type get_input_type(size_t index) const {
|
|
43
|
+
return m_decoder->get_input_type(m_node_idx, m_input_names[index]);
|
|
44
|
+
}
|
|
45
|
+
|
|
46
|
+
PartialShape get_input_shape(size_t input_index) const {
|
|
47
|
+
return m_decoder->get_input_shape(m_node_idx, m_input_names[input_index]);
|
|
48
|
+
}
|
|
49
|
+
|
|
50
|
+
std::vector<size_t> get_input_stride(size_t index) const {
|
|
51
|
+
return m_decoder->get_input_stride(m_node_idx, m_input_names[index]);
|
|
52
|
+
}
|
|
53
|
+
|
|
54
|
+
std::string get_output_name() const { return m_output_names[0]; }
|
|
55
|
+
|
|
56
|
+
PartialShape get_output_shape() const { return m_decoder->get_output_shape(m_node_idx); }
|
|
57
|
+
|
|
58
|
+
int32_t* get_input_op_params(size_t index) const {
|
|
59
|
+
return m_decoder->get_input_op_params(m_node_idx, m_input_names[index]);
|
|
60
|
+
}
|
|
61
|
+
|
|
62
|
+
int32_t * get_output_op_params() const { return m_decoder->get_output_op_params(m_node_idx); }
|
|
63
|
+
|
|
64
|
+
ov::element::Type get_output_type() const {
|
|
65
|
+
return m_decoder->get_output_type(m_node_idx);
|
|
66
|
+
}
|
|
67
|
+
|
|
68
|
+
Output<Node> get_input(int idx) const override {
|
|
69
|
+
return m_tensor_map->at(m_input_names[idx]);
|
|
70
|
+
}
|
|
71
|
+
|
|
72
|
+
Output<Node> get_input(const std::string& name) const override {
|
|
73
|
+
if (m_tensor_map->find(name) == m_tensor_map->end()) {
|
|
74
|
+
throw std::runtime_error("'" + name + "' not found in tensor map.");
|
|
75
|
+
}
|
|
76
|
+
return m_tensor_map->at(name);
|
|
77
|
+
}
|
|
78
|
+
|
|
79
|
+
bool has_input(const std::string& name) const {
|
|
80
|
+
return m_tensor_map->find(name) != m_tensor_map->end();
|
|
81
|
+
}
|
|
82
|
+
|
|
83
|
+
const std::string& get_name() const override {
|
|
84
|
+
return m_decoder->get_op_name(m_node_idx);
|
|
85
|
+
}
|
|
86
|
+
|
|
87
|
+
ov::Any get_attribute_as_any(const std::string& name) const override {
|
|
88
|
+
return m_decoder->get_attribute(name);
|
|
89
|
+
}
|
|
90
|
+
|
|
91
|
+
int get_op_case() const {
|
|
92
|
+
return m_decoder->get_op_case(m_node_idx);
|
|
93
|
+
}
|
|
94
|
+
|
|
95
|
+
bool is_static() const { return m_decoder->is_static(); }
|
|
96
|
+
|
|
97
|
+
bool is_stateful() const { return m_decoder->is_stateful(); }
|
|
98
|
+
|
|
99
|
+
private:
|
|
100
|
+
std::shared_ptr<GgmlDecoder> m_decoder;
|
|
101
|
+
std::shared_ptr<TensorMap>& m_tensor_map;
|
|
102
|
+
int m_node_idx;
|
|
103
|
+
TranslateSession* m_translate_session;
|
|
104
|
+
std::vector<std::string> m_input_names;
|
|
105
|
+
std::vector<std::string> m_output_names;
|
|
106
|
+
};
|
|
107
|
+
|
|
108
|
+
using CreatorFunction = std::function<ov::OutputVector(const ov::frontend::ggml::NodeContext&)>;
|
|
109
|
+
|
|
110
|
+
} // namespace ggml
|
|
111
|
+
} // namespace frontend
|
|
112
|
+
} // namespace ov
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
|
|
2
|
+
#include "../node_context.h"
|
|
3
|
+
#include "../op_table.h"
|
|
4
|
+
#include "../utils.h"
|
|
5
|
+
|
|
6
|
+
#include <climits>
|
|
7
|
+
#include <cstdint>
|
|
8
|
+
#include <memory>
|
|
9
|
+
#include <openvino/op/reshape.hpp>
|
|
10
|
+
#include <openvino/op/slice.hpp>
|
|
11
|
+
#include <vector>
|
|
12
|
+
|
|
13
|
+
namespace ov {
|
|
14
|
+
namespace frontend {
|
|
15
|
+
namespace ggml {
|
|
16
|
+
namespace op {
|
|
17
|
+
|
|
18
|
+
// Translates a ggml CONT node.
//
// Exactly one input is required. What happens depends on the op case reported by the context:
//   1 (input comes from a PERMUTE)   - rejected with std::runtime_error
//   2 (input comes from a TRANSPOSE) - the input is forwarded untouched (and not renamed)
//   3 (input comes from a VIEW)      - the input goes through process_view_input and the result
//                                      is renamed with the node name as suffix
// Any other op case fails the FRONT_END_CHECK_IMPLEMENTED check.
OutputVector translate_cont(const NodeContext & context) {
    num_inputs_check(context, 1, 1);

    int op_case = context.get_op_case();
    FRONT_END_CHECK_IMPLEMENTED(op_case == 1 || op_case == 2 || op_case == 3, "Unsupported CONT case");

    // No branch reads these shapes any more, but the to_shape() calls are kept so that the
    // function fails at the same point as before.
    // NOTE(review): presumably to_shape() rejects non-static shapes - confirm before removing.
    auto src_shape = context.get_input_shape(0).to_shape();
    auto dst_shape = context.get_output_shape().to_shape();

    if (op_case == 1) {
        // The input comes from a PERMUTE.
        // A Reshape used to be built after this throw; it could never execute and was removed.
        throw std::runtime_error("Code of this case might be outdated");
    }

    if (op_case == 2) {
        // The input comes from a TRANSPOSE
        return {context.get_input(0)};
    }

    // The input comes from a VIEW
    ov::Output<Node> res = process_view_input(context, 0);
    return rename_outputs_with_suffix({res}, context.get_name());
}
|
|
44
|
+
|
|
45
|
+
} // namespace op
|
|
46
|
+
} // namespace ggml
|
|
47
|
+
} // namespace frontend
|
|
48
|
+
} // namespace ov
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
#include "../node_context.h"
|
|
2
|
+
#include "../op_table.h"
|
|
3
|
+
#include "../utils.h"
|
|
4
|
+
|
|
5
|
+
#include <memory>
|
|
6
|
+
#include <openvino/op/convert.hpp>
|
|
7
|
+
|
|
8
|
+
namespace ov {
|
|
9
|
+
namespace frontend {
|
|
10
|
+
namespace ggml {
|
|
11
|
+
namespace op {
|
|
12
|
+
|
|
13
|
+
// Translates a ggml CPY node into a Convert of input 0 to the node's output element type.
// The resulting output is renamed with the node name as suffix.
OutputVector translate_cpy(const NodeContext & context) {
    const auto target_type = context.get_output_type();
    auto converted = std::make_shared<ov::op::v0::Convert>(context.get_input(0), target_type);
    return rename_outputs_with_suffix({converted}, context.get_name());
}
|
|
17
|
+
|
|
18
|
+
} // namespace op
|
|
19
|
+
} // namespace ggml
|
|
20
|
+
} // namespace frontend
|
|
21
|
+
} // namespace ov
|
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
#include "../node_context.h"
|
|
2
|
+
#include "../op_table.h"
|
|
3
|
+
#include "../utils.h"
|
|
4
|
+
|
|
5
|
+
#include <cstdint>
|
|
6
|
+
#include <memory>
|
|
7
|
+
#include <openvino/op/broadcast.hpp>
|
|
8
|
+
#include <openvino/op/concat.hpp>
|
|
9
|
+
#include <openvino/op/constant.hpp>
|
|
10
|
+
#include <openvino/op/convert.hpp>
|
|
11
|
+
#include <openvino/op/reshape.hpp>
|
|
12
|
+
#include <openvino/op/scaled_dot_product_attention.hpp>
|
|
13
|
+
#include <openvino/op/transpose.hpp>
|
|
14
|
+
#include <openvino/op/unsqueeze.hpp>
|
|
15
|
+
#include <string>
|
|
16
|
+
|
|
17
|
+
namespace ov {
|
|
18
|
+
namespace frontend {
|
|
19
|
+
namespace ggml {
|
|
20
|
+
namespace op {
|
|
21
|
+
|
|
22
|
+
// Translates a ggml FLASH_ATTN_EXT node into an OpenVINO ScaledDotProductAttention subgraph.
//
// Inputs (exactly four are required): 0 = Q, 1 = K, 2 = V, 3 = mask.
// Steps:
//   1. Q is converted to f16 and the scale (first float of the output op params) becomes an
//      f16 scalar constant.
//   2. The mask is taken from a pre-sliced tensor in the tensor map when one exists
//      ("KQ_mask_sliced", or "KQ_mask_swa_sliced" when the mask input name contains "swa");
//      otherwise it is sliced along axis 2 up to Q's dimension 2. It is converted to f16 if needed.
//   3. K and V are repeated across heads when Q has more heads than K.
//   4. The attention result is transposed with order {0, 2, 1, 3}, converted to f32 and renamed
//      with the node name as suffix.
OutputVector translate_flash_attn_ext(const NodeContext & context) {
    num_inputs_check(context, 4, 4);
    auto q_f32 = context.get_input(0);
    auto k = context.get_input(1);
    auto v = context.get_input(2);
    auto mask = context.get_input(3);

    // NOTE(review): the op params are declared as int32_t and read here through a float pointer.
    float * params = reinterpret_cast<float *>(context.get_output_op_params());
    float scale = params[0];
    // float max_bias      = params[1];
    // float logit_softcap = params[2];

    auto q = std::make_shared<ov::op::v0::Convert>(q_f32, ov::element::f16);
    auto scale_node = std::make_shared<ov::op::v0::Constant>(ov::element::f16, ov::Shape{}, std::vector<float>{scale});

    ov::Output<ov::Node> mask_sliced, res;
    std::string mask_name = "KQ_mask_sliced";
    if (context.get_input_names()[3].find("swa") != std::string::npos) {
        mask_name = "KQ_mask_swa_sliced";
    }
    if (context.has_input(mask_name)) {
        mask_sliced = context.get_input(mask_name);
    } else {
        auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0});
        auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1});
        auto two = ov::op::v0::Constant::create(ov::element::i64, {1}, {2});
        auto token_len = get_dimensions(q, {2});
        // Slice(data, start = 0, stop = token_len, step = 1, axes = 2)
        mask_sliced = std::make_shared<ov::op::v8::Slice>(mask, zero, token_len, one, two);
    }

    if (mask_sliced.get_element_type() != ov::element::f16) {
        mask_sliced = std::make_shared<ov::op::v0::Convert>(mask_sliced, ov::element::f16);
    }

    // Repeats every KV head `num_heads / num_heads_kv` times so that `kv` ends up with
    // `num_heads` heads. `head_size` is the last dimension of the tensor being tiled.
    // The tensor is returned unchanged when there is at most one KV head or nothing to repeat.
    auto tile_kv = [](int64_t num_heads, int64_t num_heads_kv, int64_t head_size, ov::Output<Node> kv) {
        // Test the divisor first: dividing before this guard would be a division by zero
        // for a zero-sized head dimension.
        if (num_heads_kv <= 1) {
            return kv;
        }
        const int64_t factor = num_heads / num_heads_kv;
        if (factor <= 1) {
            return kv;
        }

        // Insert a new axis after the head axis, broadcast it to `factor`, then fold it back
        // into the head axis.
        auto unsqueeze_axes = ov::op::v0::Constant::create(ov::element::i64, Shape{}, {2});
        ov::Output<ov::Node> kv_unsqueezed = std::make_shared<ov::op::v0::Unsqueeze>(kv, unsqueeze_axes);

        ov::Output<ov::Node> kv_broadcast_shape = ov::op::v0::Constant::create(
            ov::element::i64, {5}, {int64_t{1}, int64_t{1}, factor, int64_t{1}, int64_t{1}});
        // 0 keeps the leading dimension (special_zero = true below), -1 is inferred.
        ov::Output<ov::Node> new_kv_shape =
            ov::op::v0::Constant::create(ov::element::i64, {4}, {int64_t{0}, num_heads, int64_t{-1}, head_size});

        kv = std::make_shared<ov::op::v3::Broadcast>(kv_unsqueezed, kv_broadcast_shape,
                                                     ov::op::BroadcastType::BIDIRECTIONAL);
        kv = std::make_shared<ov::op::v1::Reshape>(kv, new_kv_shape, true);
        return kv;
    };

    auto q_shape = context.get_input_shape(0).to_shape();
    auto k_shape = context.get_input_shape(1).to_shape();
    auto v_shape = context.get_input_shape(2).to_shape();
    k = tile_kv(q_shape[1], k_shape[1], q_shape[3], k);
    // V is reshaped with its own head size; Q's head size is only correct when both are equal.
    v = tile_kv(q_shape[1], k_shape[1], v_shape[3], v);

    // The trailing `false` is the `causal` constructor argument: masking comes from mask_sliced.
    auto sdpa = std::make_shared<ov::op::v13::ScaledDotProductAttention>(q, k, v, mask_sliced, scale_node, false);
    res = std::make_shared<ov::op::v1::Transpose>(sdpa,
                                                  ov::op::v0::Constant::create(ov::element::i64, {4}, {0, 2, 1, 3}));
    res = std::make_shared<ov::op::v0::Convert>(res, ov::element::f32);
    return rename_outputs_with_suffix({res}, context.get_name());
}
|
|
86
|
+
|
|
87
|
+
} // namespace op
|
|
88
|
+
} // namespace ggml
|
|
89
|
+
} // namespace frontend
|
|
90
|
+
} // namespace ov
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
#include "../node_context.h"
|
|
2
|
+
#include "../op_table.h"
|
|
3
|
+
#include "../utils.h"
|
|
4
|
+
|
|
5
|
+
#include <openvino/core/node.hpp>
|
|
6
|
+
#include <openvino/core/node_output.hpp>
|
|
7
|
+
#include <openvino/op/constant.hpp>
|
|
8
|
+
#include <openvino/op/convert.hpp>
|
|
9
|
+
#include <openvino/op/gather.hpp>
|
|
10
|
+
#include <openvino/op/squeeze.hpp>
|
|
11
|
+
#include <openvino/op/unsqueeze.hpp>
|
|
12
|
+
|
|
13
|
+
namespace ov {
|
|
14
|
+
namespace frontend {
|
|
15
|
+
namespace ggml {
|
|
16
|
+
namespace op {
|
|
17
|
+
|
|
18
|
+
OutputVector translate_get_rows(const NodeContext & context) {
|
|
19
|
+
num_inputs_check(context, 2, 2);
|
|
20
|
+
|
|
21
|
+
int op_case = context.get_op_case();
|
|
22
|
+
|
|
23
|
+
Output<Node> res;
|
|
24
|
+
auto data = context.get_input(0);
|
|
25
|
+
auto indices = context.get_input(1);
|
|
26
|
+
|
|
27
|
+
if (op_case == 2) {
|
|
28
|
+
// The input comes from a VIEW
|
|
29
|
+
indices = process_view_input(context, 1);
|
|
30
|
+
}
|
|
31
|
+
|
|
32
|
+
// data[1,b,x,y] ind[1,1,b,x'] test-backend-ops case
|
|
33
|
+
// data[x,y] ind[1,1,1,x'] normal case
|
|
34
|
+
indices =
|
|
35
|
+
std::make_shared<ov::op::v0::Squeeze>(indices, ov::op::v0::Constant::create(ov::element::i64, {2}, {0, 1}));
|
|
36
|
+
if (data.get_partial_shape().rank() == 4) {
|
|
37
|
+
if (!(data.get_partial_shape()[1].is_dynamic()) && data.get_partial_shape()[1].get_length() == 1) {
|
|
38
|
+
// Work-around for a bug in ov cpu plugin for test-backend-ops
|
|
39
|
+
data = std::make_shared<ov::op::v0::Squeeze>(data,
|
|
40
|
+
ov::op::v0::Constant::create(ov::element::i64, {2}, {0, 1}));
|
|
41
|
+
auto axis = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{}, {0});
|
|
42
|
+
res = std::make_shared<ov::op::v8::Gather>(data, indices, axis);
|
|
43
|
+
} else {
|
|
44
|
+
auto axis = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{}, {1});
|
|
45
|
+
data =
|
|
46
|
+
std::make_shared<ov::op::v0::Squeeze>(data, ov::op::v0::Constant::create(ov::element::i64, {1}, {0}));
|
|
47
|
+
res = std::make_shared<ov::op::v8::Gather>(data, indices, axis, 1);
|
|
48
|
+
}
|
|
49
|
+
} else if (context.is_stateful() && data.get_partial_shape().rank() == 3) {
|
|
50
|
+
auto axis = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{}, {1});
|
|
51
|
+
res = std::make_shared<ov::op::v8::Gather>(data, indices, axis, 1);
|
|
52
|
+
} else {
|
|
53
|
+
auto axis = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{}, {0});
|
|
54
|
+
res = std::make_shared<ov::op::v8::Gather>(data, indices, axis);
|
|
55
|
+
}
|
|
56
|
+
|
|
57
|
+
if (res.get_element_type() != context.get_output_type()) {
|
|
58
|
+
res = std::make_shared<ov::op::v0::Convert>(res, context.get_output_type());
|
|
59
|
+
}
|
|
60
|
+
if (!(context.is_stateful())) {
|
|
61
|
+
res = std::make_shared<ov::op::v0::Unsqueeze>(res, ov::op::v0::Constant::create(ov::element::i64, {1}, {0}));
|
|
62
|
+
}
|
|
63
|
+
return rename_outputs_with_suffix({res}, context.get_name());
|
|
64
|
+
}
|
|
65
|
+
|
|
66
|
+
} // namespace op
|
|
67
|
+
} // namespace ggml
|
|
68
|
+
} // namespace frontend
|
|
69
|
+
} // namespace ov
|