whispercpp 1.3.5 → 1.3.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/LICENSE +1 -1
- data/README.md +99 -2
- data/ext/extconf.rb +1 -0
- data/ext/ruby_whisper.c +20 -4
- data/ext/ruby_whisper.h +30 -2
- data/ext/ruby_whisper_context.c +216 -124
- data/ext/ruby_whisper_context_params.c +163 -0
- data/ext/ruby_whisper_model.c +0 -1
- data/ext/ruby_whisper_params.c +0 -1
- data/ext/ruby_whisper_segment.c +0 -1
- data/ext/ruby_whisper_token.c +29 -9
- data/ext/ruby_whisper_transcribe.cpp +4 -1
- data/ext/ruby_whisper_vad_context.c +48 -1
- data/ext/ruby_whisper_vad_context_detect.cpp +6 -5
- data/ext/ruby_whisper_vad_params.c +0 -1
- data/ext/ruby_whisper_vad_segment.c +0 -1
- data/ext/ruby_whisper_vad_segments.c +0 -1
- data/ext/sources/CMakeLists.txt +1 -1
- data/ext/sources/bindings/javascript/package.json +1 -1
- data/ext/sources/cmake/whisper-config.cmake.in +5 -40
- data/ext/sources/examples/bench/bench.cpp +23 -18
- data/ext/sources/examples/cli/cli.cpp +8 -0
- data/ext/sources/examples/common-ggml.cpp +2 -0
- data/ext/sources/examples/miniaudio.h +4507 -2131
- data/ext/sources/examples/server/server.cpp +18 -4
- data/ext/sources/examples/talk-llama/CMakeLists.txt +3 -2
- data/ext/sources/examples/talk-llama/llama-adapter.cpp +7 -13
- data/ext/sources/examples/talk-llama/llama-adapter.h +4 -3
- data/ext/sources/examples/talk-llama/llama-arch.cpp +335 -17
- data/ext/sources/examples/talk-llama/llama-arch.h +42 -0
- data/ext/sources/examples/talk-llama/llama-batch.cpp +3 -1
- data/ext/sources/examples/talk-llama/llama-chat.cpp +21 -1
- data/ext/sources/examples/talk-llama/llama-chat.h +1 -0
- data/ext/sources/examples/talk-llama/llama-context.cpp +508 -520
- data/ext/sources/examples/talk-llama/llama-context.h +27 -28
- data/ext/sources/examples/talk-llama/llama-cparams.h +5 -0
- data/ext/sources/examples/talk-llama/llama-ext.h +12 -0
- data/ext/sources/examples/talk-llama/llama-grammar.cpp +8 -8
- data/ext/sources/examples/talk-llama/llama-graph.cpp +583 -130
- data/ext/sources/examples/talk-llama/llama-graph.h +131 -10
- data/ext/sources/examples/talk-llama/llama-hparams.cpp +57 -40
- data/ext/sources/examples/talk-llama/llama-hparams.h +79 -10
- data/ext/sources/examples/talk-llama/llama-impl.cpp +4 -4
- data/ext/sources/examples/talk-llama/llama-impl.h +13 -1
- data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.cpp +3 -1
- data/ext/sources/examples/talk-llama/llama-kv-cache.cpp +274 -89
- data/ext/sources/examples/talk-llama/llama-kv-cache.h +2 -3
- data/ext/sources/examples/talk-llama/llama-memory-hybrid-iswa.cpp +275 -0
- data/ext/sources/examples/talk-llama/llama-memory-hybrid-iswa.h +140 -0
- data/ext/sources/examples/talk-llama/llama-memory-recurrent.cpp +11 -13
- data/ext/sources/examples/talk-llama/llama-mmap.cpp +28 -11
- data/ext/sources/examples/talk-llama/llama-model-loader.cpp +527 -119
- data/ext/sources/examples/talk-llama/llama-model-loader.h +35 -5
- data/ext/sources/examples/talk-llama/llama-model-saver.cpp +60 -46
- data/ext/sources/examples/talk-llama/llama-model-saver.h +5 -2
- data/ext/sources/examples/talk-llama/llama-model.cpp +1365 -647
- data/ext/sources/examples/talk-llama/llama-model.h +72 -19
- data/ext/sources/examples/talk-llama/llama-quant.cpp +578 -346
- data/ext/sources/examples/talk-llama/{llama-sampling.cpp → llama-sampler.cpp} +190 -76
- data/ext/sources/examples/talk-llama/{llama-sampling.h → llama-sampler.h} +0 -2
- data/ext/sources/examples/talk-llama/llama-vocab.cpp +118 -48
- data/ext/sources/examples/talk-llama/llama-vocab.h +5 -0
- data/ext/sources/examples/talk-llama/llama.cpp +76 -22
- data/ext/sources/examples/talk-llama/llama.h +63 -30
- data/ext/sources/examples/talk-llama/models/afmoe.cpp +2 -3
- data/ext/sources/examples/talk-llama/models/apertus.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/arcee.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/arctic.cpp +4 -5
- data/ext/sources/examples/talk-llama/models/baichuan.cpp +4 -3
- data/ext/sources/examples/talk-llama/models/bailingmoe.cpp +1 -2
- data/ext/sources/examples/talk-llama/models/bailingmoe2.cpp +3 -5
- data/ext/sources/examples/talk-llama/models/bert.cpp +13 -7
- data/ext/sources/examples/talk-llama/models/bitnet.cpp +9 -24
- data/ext/sources/examples/talk-llama/models/bloom.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/chameleon.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/chatglm.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/codeshell.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/cogvlm.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/cohere2-iswa.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/command-r.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/dbrx.cpp +4 -5
- data/ext/sources/examples/talk-llama/models/deci.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/deepseek.cpp +4 -6
- data/ext/sources/examples/talk-llama/models/deepseek2.cpp +24 -21
- data/ext/sources/examples/talk-llama/models/delta-net-base.cpp +445 -0
- data/ext/sources/examples/talk-llama/models/dots1.cpp +4 -6
- data/ext/sources/examples/talk-llama/models/dream.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/ernie4-5-moe.cpp +4 -6
- data/ext/sources/examples/talk-llama/models/ernie4-5.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/eurobert.cpp +97 -0
- data/ext/sources/examples/talk-llama/models/exaone-moe.cpp +145 -0
- data/ext/sources/examples/talk-llama/models/exaone.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/exaone4.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/falcon-h1.cpp +2 -4
- data/ext/sources/examples/talk-llama/models/falcon.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/gemma-embedding.cpp +1 -1
- data/ext/sources/examples/talk-llama/models/gemma.cpp +1 -1
- data/ext/sources/examples/talk-llama/models/gemma2-iswa.cpp +1 -1
- data/ext/sources/examples/talk-llama/models/gemma3.cpp +1 -1
- data/ext/sources/examples/talk-llama/models/gemma3n-iswa.cpp +7 -7
- data/ext/sources/examples/talk-llama/models/glm4-moe.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/glm4.cpp +14 -7
- data/ext/sources/examples/talk-llama/models/gpt2.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/gptneox.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/granite-hybrid.cpp +4 -5
- data/ext/sources/examples/talk-llama/models/granite.cpp +4 -5
- data/ext/sources/examples/talk-llama/models/grok.cpp +4 -4
- data/ext/sources/examples/talk-llama/models/grovemoe.cpp +5 -7
- data/ext/sources/examples/talk-llama/models/hunyuan-dense.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/hunyuan-moe.cpp +4 -5
- data/ext/sources/examples/talk-llama/models/internlm2.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/jais.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/jais2.cpp +123 -0
- data/ext/sources/examples/talk-llama/models/jamba.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/kimi-linear.cpp +381 -0
- data/ext/sources/examples/talk-llama/models/lfm2.cpp +145 -124
- data/ext/sources/examples/talk-llama/models/llada-moe.cpp +4 -4
- data/ext/sources/examples/talk-llama/models/llada.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/llama-iswa.cpp +4 -4
- data/ext/sources/examples/talk-llama/models/llama.cpp +18 -11
- data/ext/sources/examples/talk-llama/models/maincoder.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/{graph-context-mamba.cpp → mamba-base.cpp} +9 -3
- data/ext/sources/examples/talk-llama/models/mamba.cpp +1 -2
- data/ext/sources/examples/talk-llama/models/mimo2-iswa.cpp +11 -5
- data/ext/sources/examples/talk-llama/models/minicpm3.cpp +14 -13
- data/ext/sources/examples/talk-llama/models/minimax-m2.cpp +4 -5
- data/ext/sources/examples/talk-llama/models/mistral3.cpp +4 -4
- data/ext/sources/examples/talk-llama/models/models.h +181 -46
- data/ext/sources/examples/talk-llama/models/modern-bert.cpp +2 -9
- data/ext/sources/examples/talk-llama/models/mpt.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/nemotron-h.cpp +26 -14
- data/ext/sources/examples/talk-llama/models/nemotron.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/neo-bert.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/olmo.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/olmo2.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/olmoe.cpp +4 -4
- data/ext/sources/examples/talk-llama/models/openai-moe-iswa.cpp +1 -1
- data/ext/sources/examples/talk-llama/models/openelm.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/orion.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/paddleocr.cpp +122 -0
- data/ext/sources/examples/talk-llama/models/pangu-embedded.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/phi2.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/phi3.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/plamo.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/plamo2.cpp +9 -5
- data/ext/sources/examples/talk-llama/models/plamo3.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/plm.cpp +15 -14
- data/ext/sources/examples/talk-llama/models/qwen.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/qwen2.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/qwen2moe.cpp +4 -4
- data/ext/sources/examples/talk-llama/models/qwen2vl.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/qwen3.cpp +12 -9
- data/ext/sources/examples/talk-llama/models/qwen35.cpp +381 -0
- data/ext/sources/examples/talk-llama/models/qwen35moe.cpp +422 -0
- data/ext/sources/examples/talk-llama/models/qwen3moe.cpp +15 -8
- data/ext/sources/examples/talk-llama/models/qwen3next.cpp +84 -432
- data/ext/sources/examples/talk-llama/models/qwen3vl-moe.cpp +9 -18
- data/ext/sources/examples/talk-llama/models/qwen3vl.cpp +8 -17
- data/ext/sources/examples/talk-llama/models/refact.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/rnd1.cpp +4 -4
- data/ext/sources/examples/talk-llama/models/rwkv6-base.cpp +2 -0
- data/ext/sources/examples/talk-llama/models/rwkv7-base.cpp +2 -0
- data/ext/sources/examples/talk-llama/models/seed-oss.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/smallthinker.cpp +4 -4
- data/ext/sources/examples/talk-llama/models/smollm3.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/stablelm.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/starcoder.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/starcoder2.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/step35-iswa.cpp +165 -0
- data/ext/sources/examples/talk-llama/models/t5-dec.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/t5-enc.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/xverse.cpp +3 -3
- data/ext/sources/examples/talk-llama/unicode.cpp +21 -65
- data/ext/sources/ggml/CMakeLists.txt +9 -3
- data/ext/sources/ggml/include/ggml-backend.h +1 -1
- data/ext/sources/ggml/include/ggml-cann.h +1 -1
- data/ext/sources/ggml/include/ggml-cpu.h +5 -0
- data/ext/sources/ggml/include/ggml-openvino.h +37 -0
- data/ext/sources/ggml/include/ggml-opt.h +1 -1
- data/ext/sources/ggml/include/ggml-rpc.h +6 -1
- data/ext/sources/ggml/include/ggml-virtgpu.h +14 -0
- data/ext/sources/ggml/include/ggml.h +56 -9
- data/ext/sources/ggml/src/CMakeLists.txt +3 -0
- data/ext/sources/ggml/src/ggml-alloc.c +4 -9
- data/ext/sources/ggml/src/ggml-backend-dl.cpp +48 -0
- data/ext/sources/ggml/src/ggml-backend-dl.h +45 -0
- data/ext/sources/ggml/src/ggml-backend-reg.cpp +28 -86
- data/ext/sources/ggml/src/ggml-backend.cpp +5 -2
- data/ext/sources/ggml/src/ggml-blas/CMakeLists.txt +1 -1
- data/ext/sources/ggml/src/ggml-blas/ggml-blas.cpp +6 -2
- data/ext/sources/ggml/src/ggml-cann/acl_tensor.cpp +1 -1
- data/ext/sources/ggml/src/ggml-cann/acl_tensor.h +1 -1
- data/ext/sources/ggml/src/ggml-cann/aclnn_ops.cpp +348 -189
- data/ext/sources/ggml/src/ggml-cann/aclnn_ops.h +40 -85
- data/ext/sources/ggml/src/ggml-cann/common.h +3 -4
- data/ext/sources/ggml/src/ggml-cann/ggml-cann.cpp +44 -62
- data/ext/sources/ggml/src/ggml-common.h +11 -0
- data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +16 -11
- data/ext/sources/ggml/src/ggml-cpu/amx/amx.cpp +42 -19
- data/ext/sources/ggml/src/ggml-cpu/amx/common.h +34 -10
- data/ext/sources/ggml/src/ggml-cpu/amx/mmq.cpp +85 -85
- data/ext/sources/ggml/src/ggml-cpu/arch/arm/quants.c +85 -1
- data/ext/sources/ggml/src/ggml-cpu/arch/arm/repack.cpp +2744 -548
- data/ext/sources/ggml/src/ggml-cpu/arch/riscv/quants.c +1653 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/riscv/repack.cpp +1391 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/s390/quants.c +8 -10
- data/ext/sources/ggml/src/ggml-cpu/arch/x86/quants.c +9 -9
- data/ext/sources/ggml/src/ggml-cpu/arch/x86/repack.cpp +118 -18
- data/ext/sources/ggml/src/ggml-cpu/arch-fallback.h +107 -26
- data/ext/sources/ggml/src/ggml-cpu/binary-ops.cpp +2 -6
- data/ext/sources/ggml/src/ggml-cpu/common.h +8 -0
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-impl.h +3 -0
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +59 -12
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.cpp +15 -0
- data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.cpp +21 -20
- data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +965 -252
- data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +584 -197
- data/ext/sources/ggml/src/ggml-cpu/ops.cpp +903 -188
- data/ext/sources/ggml/src/ggml-cpu/ops.h +1 -0
- data/ext/sources/ggml/src/ggml-cpu/quants.c +40 -0
- data/ext/sources/ggml/src/ggml-cpu/quants.h +3 -0
- data/ext/sources/ggml/src/ggml-cpu/repack.cpp +2890 -679
- data/ext/sources/ggml/src/ggml-cpu/repack.h +119 -8
- data/ext/sources/ggml/src/ggml-cpu/simd-gemm.h +136 -0
- data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +111 -3
- data/ext/sources/ggml/src/ggml-cpu/unary-ops.cpp +1 -1
- data/ext/sources/ggml/src/ggml-cpu/vec.cpp +17 -0
- data/ext/sources/ggml/src/ggml-cuda/CMakeLists.txt +1 -1
- data/ext/sources/ggml/src/ggml-cuda/argsort.cu +19 -10
- data/ext/sources/ggml/src/ggml-cuda/binbcast.cu +32 -30
- data/ext/sources/ggml/src/ggml-cuda/common.cuh +134 -18
- data/ext/sources/ggml/src/ggml-cuda/convert.cu +41 -27
- data/ext/sources/ggml/src/ggml-cuda/cpy.cu +6 -3
- data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +78 -64
- data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +384 -143
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cuh +36 -22
- data/ext/sources/ggml/src/ggml-cuda/fattn-vec.cuh +3 -3
- data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +26 -5
- data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cuh +1 -1
- data/ext/sources/ggml/src/ggml-cuda/fattn.cu +127 -12
- data/ext/sources/ggml/src/ggml-cuda/gated_delta_net.cu +263 -0
- data/ext/sources/ggml/src/ggml-cuda/gated_delta_net.cuh +4 -0
- data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +595 -200
- data/ext/sources/ggml/src/ggml-cuda/mean.cu +9 -8
- data/ext/sources/ggml/src/ggml-cuda/mma.cuh +173 -6
- data/ext/sources/ggml/src/ggml-cuda/mmf.cu +30 -10
- data/ext/sources/ggml/src/ggml-cuda/mmf.cuh +158 -85
- data/ext/sources/ggml/src/ggml-cuda/mmq.cuh +34 -22
- data/ext/sources/ggml/src/ggml-cuda/mmvf.cu +127 -67
- data/ext/sources/ggml/src/ggml-cuda/mmvf.cuh +2 -0
- data/ext/sources/ggml/src/ggml-cuda/mmvq.cu +157 -65
- data/ext/sources/ggml/src/ggml-cuda/mmvq.cuh +1 -0
- data/ext/sources/ggml/src/ggml-cuda/norm.cu +18 -76
- data/ext/sources/ggml/src/ggml-cuda/pad.cu +13 -10
- data/ext/sources/ggml/src/ggml-cuda/quantize.cu +1 -1
- data/ext/sources/ggml/src/ggml-cuda/reduce_rows.cuh +2 -16
- data/ext/sources/ggml/src/ggml-cuda/rope.cu +233 -133
- data/ext/sources/ggml/src/ggml-cuda/softmax.cu +8 -83
- data/ext/sources/ggml/src/ggml-cuda/solve_tri.cu +1 -1
- data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cu +56 -32
- data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cuh +1 -1
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_32.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_4.cu +1 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_32.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_4.cu +1 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_4.cu +1 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_4.cu +1 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +3 -3
- data/ext/sources/ggml/src/ggml-cuda/top-k.cu +0 -1
- data/ext/sources/ggml/src/ggml-cuda/topk-moe.cu +199 -135
- data/ext/sources/ggml/src/ggml-cuda/topk-moe.cuh +20 -14
- data/ext/sources/ggml/src/ggml-cuda/unary.cu +55 -0
- data/ext/sources/ggml/src/ggml-cuda/unary.cuh +2 -0
- data/ext/sources/ggml/src/ggml-cuda/vecdotq.cuh +31 -17
- data/ext/sources/ggml/src/ggml-cuda/vendors/hip.h +10 -0
- data/ext/sources/ggml/src/ggml-hexagon/CMakeLists.txt +82 -45
- data/ext/sources/ggml/src/ggml-hexagon/ggml-hexagon.cpp +334 -160
- data/ext/sources/ggml/src/ggml-hexagon/htp/CMakeLists.txt +7 -5
- data/ext/sources/ggml/src/ggml-hexagon/htp/act-ops.c +328 -197
- data/ext/sources/ggml/src/ggml-hexagon/htp/argsort-ops.c +281 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/binary-ops.c +765 -234
- data/ext/sources/ggml/src/ggml-hexagon/htp/cpy-ops.c +252 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/flash-attn-ops.c +412 -265
- data/ext/sources/ggml/src/ggml-hexagon/htp/get-rows-ops.c +23 -23
- data/ext/sources/ggml/src/ggml-hexagon/htp/{htp-dma.c → hex-dma.c} +1 -1
- data/ext/sources/ggml/src/ggml-hexagon/htp/{htp-dma.h → hex-dma.h} +28 -3
- data/ext/sources/ggml/src/ggml-hexagon/htp/hex-dump.h +77 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hex-fastdiv.h +37 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hex-utils.h +51 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ctx.h +1 -1
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp-msg.h +27 -37
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ops.h +6 -35
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-arith.h +443 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-base.h +240 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-copy.h +245 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-div.h +251 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-dump.h +129 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-exp.h +215 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-floor.h +100 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-inverse.h +210 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-reduce.h +296 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-scale.h +133 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sigmoid.h +141 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sqrt.h +126 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-types.h +36 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-utils.h +20 -1347
- data/ext/sources/ggml/src/ggml-hexagon/htp/main.c +211 -13
- data/ext/sources/ggml/src/ggml-hexagon/htp/matmul-ops.c +1119 -952
- data/ext/sources/ggml/src/ggml-hexagon/htp/rope-ops.c +254 -244
- data/ext/sources/ggml/src/ggml-hexagon/htp/set-rows-ops.c +36 -36
- data/ext/sources/ggml/src/ggml-hexagon/htp/softmax-ops.c +155 -138
- data/ext/sources/ggml/src/ggml-hexagon/htp/ssm-conv.c +339 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/sum-rows-ops.c +128 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/unary-ops.c +209 -114
- data/ext/sources/ggml/src/ggml-hexagon/htp/worker-pool.c +1 -5
- data/ext/sources/ggml/src/ggml-hexagon/htp-drv.cpp +418 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp-drv.h +121 -0
- data/ext/sources/ggml/src/ggml-hexagon/libdl.h +79 -0
- data/ext/sources/ggml/src/ggml-hexagon/libggml-htp.inf +38 -0
- data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +6 -0
- data/ext/sources/ggml/src/ggml-impl.h +62 -0
- data/ext/sources/ggml/src/ggml-metal/CMakeLists.txt +10 -10
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-common.cpp +13 -2
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.h +8 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.m +147 -17
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.cpp +274 -73
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.h +22 -4
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.m +102 -36
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +174 -23
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.cpp +580 -280
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.h +5 -4
- data/ext/sources/ggml/src/ggml-metal/ggml-metal.cpp +320 -107
- data/ext/sources/ggml/src/ggml-metal/ggml-metal.metal +1068 -825
- data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +19 -1
- data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +3108 -636
- data/ext/sources/ggml/src/ggml-opencl/kernels/concat.cl +41 -99
- data/ext/sources/ggml/src/ggml-opencl/kernels/cpy.cl +45 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/cumsum.cl +139 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/cvt.cl +204 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/diag.cl +27 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/exp.cl +125 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/expm1.cl +87 -56
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q4_1_f32.cl +132 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_general_q8_0_f32.cl +195 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q4_1_f32.cl +283 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/l2_norm.cl +71 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mean.cl +114 -13
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q4_0_f32_l4_lm.cl +163 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q4_1_f32_l4_lm.cl +165 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q6_k_f32_l4_lm.cl +158 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_8x4.cl +129 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32.cl +219 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32_flat.cl +229 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_k_f32.cl +180 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/{mul_mv_q6_k.cl → mul_mv_q6_k_f32.cl} +4 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32_flat.cl +194 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/neg.cl +125 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/repeat.cl +31 -32
- data/ext/sources/ggml/src/ggml-opencl/kernels/scale.cl +14 -4
- data/ext/sources/ggml/src/ggml-opencl/kernels/softplus.cl +88 -60
- data/ext/sources/ggml/src/ggml-opencl/kernels/solve_tri.cl +51 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/sum_rows.cl +114 -13
- data/ext/sources/ggml/src/ggml-opencl/kernels/tanh.cl +94 -48
- data/ext/sources/ggml/src/ggml-opencl/kernels/transpose.cl +26 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/tri.cl +32 -0
- data/ext/sources/ggml/src/ggml-openvino/.clang-format +154 -0
- data/ext/sources/ggml/src/ggml-openvino/CMakeLists.txt +22 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-decoder.cpp +975 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-decoder.h +294 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-openvino-extra.cpp +373 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-openvino-extra.h +182 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-openvino.cpp +1110 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-quants.cpp +884 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-quants.h +153 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/decoder.h +74 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/frontend.cpp +27 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/frontend.h +23 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/input_model.cpp +17 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/input_model.h +29 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/node_context.h +112 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/cont.cpp +48 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/cpy.cpp +21 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp +90 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/get_rows.cpp +69 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/glu_geglu.cpp +61 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp +62 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/mulmat.cpp +90 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/permute.cpp +102 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/reshape.cpp +83 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp +46 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/rope.cpp +123 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/scale.cpp +41 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/set_rows.cpp +76 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/softmax.cpp +89 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/transpose.cpp +23 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/unary_silu.cpp +27 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/view.cpp +53 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op_table.cpp +46 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op_table.h +39 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.cpp +123 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.h +17 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.cpp +60 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.h +17 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/mark_decompression_convert_constant_folding.h +29 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/squeeze_matmul.cpp +58 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/squeeze_matmul.h +17 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/translate_session.cpp +293 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/translate_session.h +28 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/utils.cpp +226 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/utils.h +85 -0
- data/ext/sources/ggml/src/ggml-openvino/utils.cpp +823 -0
- data/ext/sources/ggml/src/ggml-openvino/utils.h +123 -0
- data/ext/sources/ggml/src/ggml-quants.c +96 -5
- data/ext/sources/ggml/src/ggml-quants.h +3 -0
- data/ext/sources/ggml/src/ggml-sycl/CMakeLists.txt +15 -88
- data/ext/sources/ggml/src/ggml-sycl/add-id.cpp +5 -1
- data/ext/sources/ggml/src/ggml-sycl/backend.hpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/binbcast.cpp +21 -20
- data/ext/sources/ggml/src/ggml-sycl/common.hpp +315 -10
- data/ext/sources/ggml/src/ggml-sycl/convert.cpp +69 -1
- data/ext/sources/ggml/src/ggml-sycl/convert.hpp +22 -1
- data/ext/sources/ggml/src/ggml-sycl/count-equal.cpp +1 -1
- data/ext/sources/ggml/src/ggml-sycl/dpct/helper.hpp +791 -47
- data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +78 -68
- data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +2 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn-common.hpp +1179 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn-tile.cpp +55 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn-tile.hpp +1338 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn-vec.hpp +667 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn.cpp +225 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn.hpp +22 -0
- data/ext/sources/ggml/src/ggml-sycl/gated_delta_net.cpp +309 -0
- data/ext/sources/ggml/src/ggml-sycl/gated_delta_net.hpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/ggml-sycl.cpp +316 -51
- data/ext/sources/ggml/src/ggml-sycl/norm.cpp +65 -66
- data/ext/sources/ggml/src/ggml-sycl/outprod.cpp +3 -3
- data/ext/sources/ggml/src/ggml-sycl/presets.hpp +3 -0
- data/ext/sources/ggml/src/ggml-sycl/quants.hpp +1 -1
- data/ext/sources/ggml/src/ggml-sycl/rope.cpp +450 -287
- data/ext/sources/ggml/src/ggml-sycl/rope.hpp +6 -0
- data/ext/sources/ggml/src/ggml-sycl/softmax.cpp +6 -6
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq112-dv112.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq128-dv128.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq256-dv256.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq40-dv40.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq576-dv512.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq64-dv64.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq72-dv72.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq80-dv80.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq96-dv96.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-f16.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q8_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-f16.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q8_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-f16.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q8_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-f16.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q8_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-f16.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q8_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-f16.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q8_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +13 -0
- data/ext/sources/ggml/src/ggml-sycl/wkv.cpp +1 -1
- data/ext/sources/ggml/src/ggml-virtgpu/CMakeLists.txt +70 -0
- data/ext/sources/ggml/src/ggml-virtgpu/apir_cs_ggml-rpc-front.cpp +87 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/CMakeLists.txt +21 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/apir_cs_ggml-rpc-back.cpp +115 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-convert.h +13 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-backend.cpp +102 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer-type.cpp +105 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer.cpp +179 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-device.cpp +148 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched.cpp +51 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched.gen.h +73 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched.h +27 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-virgl-apir.h +32 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend.cpp +144 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/api_remoting.h +95 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_backend.gen.h +94 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_backend.h +50 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_cs.h +378 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_cs_ggml.h +232 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_cs_rpc.h +58 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-buffer-type.cpp +81 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-buffer.cpp +119 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-device.cpp +158 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-reg.cpp +213 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend.cpp +69 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-remoting.h +71 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggmlremoting_functions.yaml +166 -0
- data/ext/sources/ggml/src/ggml-virtgpu/include/apir_hw.h +9 -0
- data/ext/sources/ggml/src/ggml-virtgpu/regenerate_remoting.py +333 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-apir.h +15 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-backend.cpp +58 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-buffer-type.cpp +110 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-buffer.cpp +173 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-device.cpp +192 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-impl.h +36 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward.gen.h +53 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-shm.cpp +98 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-shm.h +23 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-utils.cpp +179 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-utils.h +86 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu.cpp +544 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu.h +117 -0
- data/ext/sources/ggml/src/ggml-vulkan/CMakeLists.txt +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/ggml-vulkan.cpp +1250 -465
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/acc.comp +16 -8
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/elu.comp +27 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +374 -170
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.glsl +66 -22
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +389 -201
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +106 -58
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_mask_opt.comp +162 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +9 -8
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gated_delta_net.comp +128 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/l2_norm.comp +12 -9
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.glsl +20 -17
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +11 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +8 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_id_funcs.glsl +3 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +5 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.glsl +3 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +2 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl +36 -63
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +7 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +7 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +7 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl +10 -5
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_vision.comp +7 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sgn.comp +21 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/ssm_conv.comp +16 -10
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +55 -35
- data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp +1314 -109
- data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu.cpp +1660 -1371
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/argmax.wgsl +72 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/argsort.wgsl +106 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/argsort_merge.wgsl +134 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary.wgsl +141 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +65 -72
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/concat.wgsl +75 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cpy.tmpl.wgsl +6 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cumsum.wgsl +66 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +40 -5
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn.wgsl +105 -60
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{get_rows.tmpl.wgsl → get_rows.wgsl} +53 -259
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{mul_mat.tmpl.wgsl → mul_mat.wgsl} +68 -257
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl +692 -23
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{mul_mat_reg_tile.tmpl.wgsl → mul_mat_reg_tile.wgsl} +28 -128
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{mul_mat_subgroup_matrix.tmpl.wgsl → mul_mat_subgroup_matrix.wgsl} +31 -137
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.wgsl +480 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/pad.wgsl +86 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/repeat.wgsl +67 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{scale.tmpl.wgsl → scale.wgsl} +9 -36
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +40 -12
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/sum_rows.wgsl +55 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/unary.wgsl +193 -0
- data/ext/sources/ggml/src/ggml-zdnn/ggml-zdnn.cpp +6 -1
- data/ext/sources/ggml/src/ggml-zendnn/CMakeLists.txt +31 -32
- data/ext/sources/ggml/src/ggml-zendnn/ggml-zendnn.cpp +9 -6
- data/ext/sources/ggml/src/ggml.c +167 -33
- data/ext/sources/ggml/src/gguf.cpp +229 -44
- data/ext/sources/src/whisper.cpp +6 -28
- data/sig/whisper.rbs +43 -2
- data/test/test_context_params.rb +82 -0
- data/test/test_token.rb +11 -0
- data/test/test_vad_context.rb +58 -8
- data/test/test_whisper.rb +20 -0
- data/whispercpp.gemspec +1 -1
- metadata +240 -28
- data/ext/sources/ggml/cmake/BuildTypes.cmake +0 -54
- data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm-ppc.h +0 -333
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-exp.c +0 -94
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-inverse.c +0 -72
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sigmoid.c +0 -49
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-utils.c +0 -1020
- data/ext/sources/ggml/src/ggml-hexagon/htp/ops-utils.h +0 -149
- data/ext/sources/ggml/src/ggml-hexagon/htp-utils.c +0 -454
- data/ext/sources/ggml/src/ggml-hexagon/htp-utils.h +0 -221
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/bin_op.tmpl.wgsl +0 -188
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary_head.tmpl +0 -45
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.tmpl.wgsl +0 -267
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.tmpl.wgsl +0 -112
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/unary_op.wgsl +0 -483
|
@@ -19,10 +19,13 @@
|
|
|
19
19
|
#include <string>
|
|
20
20
|
|
|
21
21
|
#include "dpct/helper.hpp"
|
|
22
|
+
#include "ggml.h"
|
|
23
|
+
#include "ggml-impl.h"
|
|
22
24
|
#include "ggml-sycl.h"
|
|
23
25
|
#include "presets.hpp"
|
|
24
26
|
#include "sycl_hw.hpp"
|
|
25
27
|
|
|
28
|
+
namespace syclexp = sycl::ext::oneapi::experimental;
|
|
26
29
|
|
|
27
30
|
#if GGML_SYCL_DNNL
|
|
28
31
|
#include "dnnl.hpp"
|
|
@@ -31,6 +34,9 @@
|
|
|
31
34
|
|
|
32
35
|
#define GGML_COMMON_DECL_SYCL
|
|
33
36
|
#define GGML_COMMON_IMPL_SYCL
|
|
37
|
+
#define SYCL_FLASH_ATTN //remove it to disable FLASH_ATTENTION in building.
|
|
38
|
+
#define SYCL_FAST_FP16 //don't change. remove it will break fattn-tile.hpp building
|
|
39
|
+
|
|
34
40
|
/* suppress warning spam */
|
|
35
41
|
#pragma clang diagnostic push
|
|
36
42
|
#pragma clang diagnostic ignored "-Wnested-anon-types"
|
|
@@ -45,6 +51,8 @@ void ggml_sycl_host_free(void* ptr);
|
|
|
45
51
|
extern int g_ggml_sycl_debug;
|
|
46
52
|
extern int g_ggml_sycl_disable_optimize;
|
|
47
53
|
extern int g_ggml_sycl_prioritize_dmmv;
|
|
54
|
+
extern int g_ggml_sycl_enable_flash_attention;
|
|
55
|
+
|
|
48
56
|
|
|
49
57
|
#if defined(__clang__) && __has_builtin(__builtin_expect)
|
|
50
58
|
// Hint the optimizer to pipeline the more likely following instruction in branches
|
|
@@ -76,10 +84,10 @@ extern int g_ggml_sycl_prioritize_dmmv;
|
|
|
76
84
|
|
|
77
85
|
|
|
78
86
|
#define __SYCL_ARCH__ DPCT_COMPATIBILITY_TEMP
|
|
79
|
-
#define VER_4VEC 610 // todo for
|
|
80
|
-
#define VER_GEN9 700 // todo for
|
|
81
|
-
#define VER_GEN12 1000000 // todo for
|
|
82
|
-
#define VER_GEN13 (VER_GEN12 + 1030) // todo for
|
|
87
|
+
#define VER_4VEC 610 // todo for hardware optimize.
|
|
88
|
+
#define VER_GEN9 700 // todo for hardware optimize.
|
|
89
|
+
#define VER_GEN12 1000000 // todo for hardware optimize.
|
|
90
|
+
#define VER_GEN13 (VER_GEN12 + 1030) // todo for hardware optimize.
|
|
83
91
|
|
|
84
92
|
#define GGML_SYCL_MAX_NODES 8192 // TODO: adapt to hardwares
|
|
85
93
|
|
|
@@ -170,6 +178,10 @@ static size_t g_scratch_offset = 0;
|
|
|
170
178
|
|
|
171
179
|
int get_current_device_id();
|
|
172
180
|
|
|
181
|
+
inline int ggml_sycl_get_device() {
|
|
182
|
+
return get_current_device_id();
|
|
183
|
+
}
|
|
184
|
+
|
|
173
185
|
inline dpct::err0 ggml_sycl_set_device(const int device) try {
|
|
174
186
|
int current_device_id;
|
|
175
187
|
SYCL_CHECK(CHECK_TRY_ERROR(current_device_id = get_current_device_id()));
|
|
@@ -194,11 +206,14 @@ struct optimize_feature {
|
|
|
194
206
|
};
|
|
195
207
|
|
|
196
208
|
struct sycl_device_info {
|
|
197
|
-
int
|
|
209
|
+
int cc; // compute capability
|
|
198
210
|
int nsm; // number of streaming multiprocessors (CUDA) maps to the maximum
|
|
199
211
|
// number of compute units on a SYCL device.
|
|
200
212
|
// size_t smpb; // max. shared memory per block
|
|
201
213
|
size_t smpbo; // max. shared memory per block (with opt-in)
|
|
214
|
+
int warp_size; // WARP_SIZE(16)|WARP_32_SIZE(32)|WARP_16_SIZE(16). For Intel GPU, 16 is better in most cases. Some OP support 32 only.
|
|
215
|
+
int max_wg_per_cu; // max work groups per compute unit - refer to
|
|
216
|
+
// cudaOccupancyMaxActiveBlocksPerMultiprocessor
|
|
202
217
|
bool vmm; // virtual memory support
|
|
203
218
|
size_t total_vram;
|
|
204
219
|
//sycl_hw_info hw_info; \\ device id and aarch, currently not used
|
|
@@ -435,13 +450,15 @@ warp_reduce_sum(sycl::float2 a, const sycl::nd_item<3>& item_ct1) {
|
|
|
435
450
|
return a;
|
|
436
451
|
}
|
|
437
452
|
|
|
438
|
-
|
|
453
|
+
/* use WARP_SIZE or WARP_32_SIZE*/
|
|
454
|
+
template <int width>
|
|
439
455
|
static __dpct_inline__ int warp_reduce_sum(int x) {
|
|
440
456
|
return sycl::reduce_over_group(
|
|
441
457
|
sycl::ext::oneapi::this_work_item::get_sub_group(), x, sycl::plus<>());
|
|
442
458
|
}
|
|
443
459
|
|
|
444
|
-
|
|
460
|
+
/* use WARP_SIZE or WARP_32_SIZE*/
|
|
461
|
+
template <int width>
|
|
445
462
|
static __dpct_inline__ float warp_reduce_sum(float x) {
|
|
446
463
|
#pragma unroll
|
|
447
464
|
for (int offset = width / 2; offset > 0; offset >>= 1) {
|
|
@@ -451,7 +468,19 @@ static __dpct_inline__ float warp_reduce_sum(float x) {
|
|
|
451
468
|
return x;
|
|
452
469
|
}
|
|
453
470
|
|
|
454
|
-
|
|
471
|
+
/* use WARP_SIZE or WARP_32_SIZE*/
|
|
472
|
+
template <int width>
|
|
473
|
+
static __dpct_inline__ float warp_reduce_sum(float x, const sycl::nd_item<3>& item_ct1) {
|
|
474
|
+
#pragma unroll
|
|
475
|
+
for (int offset = width / 2; offset > 0; offset >>= 1) {
|
|
476
|
+
x += dpct::permute_sub_group_by_xor(
|
|
477
|
+
item_ct1.get_sub_group(), x, offset);
|
|
478
|
+
}
|
|
479
|
+
return x;
|
|
480
|
+
}
|
|
481
|
+
|
|
482
|
+
/* use WARP_SIZE or WARP_32_SIZE*/
|
|
483
|
+
template <int width>
|
|
455
484
|
static __dpct_inline__ sycl::float2 warp_reduce_sum(sycl::float2 a) {
|
|
456
485
|
#pragma unroll
|
|
457
486
|
for (int offset = width / 2; offset > 0; offset >>= 1) {
|
|
@@ -465,7 +494,8 @@ static __dpct_inline__ sycl::float2 warp_reduce_sum(sycl::float2 a) {
|
|
|
465
494
|
return a;
|
|
466
495
|
}
|
|
467
496
|
|
|
468
|
-
|
|
497
|
+
/* use WARP_SIZE or WARP_32_SIZE*/
|
|
498
|
+
template <int width>
|
|
469
499
|
static __dpct_inline__ sycl::half2 warp_reduce_sum(sycl::half2 a) {
|
|
470
500
|
#pragma unroll
|
|
471
501
|
for (int offset = width / 2; offset > 0; offset >>= 1) {
|
|
@@ -481,7 +511,52 @@ static constexpr int ggml_sycl_get_physical_warp_size() {
|
|
|
481
511
|
return WARP_SIZE;
|
|
482
512
|
}
|
|
483
513
|
|
|
484
|
-
|
|
514
|
+
/* use WARP_SIZE or WARP_32_SIZE*/
|
|
515
|
+
template <int width>
|
|
516
|
+
static __dpct_inline__ int warp_reduce_all(int x) {
|
|
517
|
+
if (width == ggml_sycl_get_physical_warp_size()) {
|
|
518
|
+
return sycl::all_of_group(
|
|
519
|
+
sycl::ext::oneapi::this_work_item::get_sub_group(),
|
|
520
|
+
(~0xffffffff &
|
|
521
|
+
(0x1 << sycl::ext::oneapi::this_work_item::get_sub_group()
|
|
522
|
+
.get_local_linear_id())) ||
|
|
523
|
+
x);
|
|
524
|
+
} else {
|
|
525
|
+
#pragma unroll
|
|
526
|
+
for (int offset = width / 2; offset > 0; offset >>= 1) {
|
|
527
|
+
x = dpct::permute_sub_group_by_xor(
|
|
528
|
+
sycl::ext::oneapi::this_work_item::get_sub_group(), x,
|
|
529
|
+
offset, width) &&
|
|
530
|
+
x;
|
|
531
|
+
}
|
|
532
|
+
return x;
|
|
533
|
+
}
|
|
534
|
+
}
|
|
535
|
+
|
|
536
|
+
/* use WARP_SIZE or WARP_32_SIZE*/
|
|
537
|
+
template <int width>
|
|
538
|
+
static __dpct_inline__ int warp_reduce_any(int x) {
|
|
539
|
+
if (width == ggml_sycl_get_physical_warp_size()) {
|
|
540
|
+
return sycl::any_of_group(
|
|
541
|
+
sycl::ext::oneapi::this_work_item::get_sub_group(),
|
|
542
|
+
(0xffffffff &
|
|
543
|
+
(0x1 << sycl::ext::oneapi::this_work_item::get_sub_group()
|
|
544
|
+
.get_local_linear_id())) &&
|
|
545
|
+
x);
|
|
546
|
+
} else {
|
|
547
|
+
#pragma unroll
|
|
548
|
+
for (int offset = width / 2; offset > 0; offset >>= 1) {
|
|
549
|
+
x = dpct::permute_sub_group_by_xor(
|
|
550
|
+
sycl::ext::oneapi::this_work_item::get_sub_group(), x,
|
|
551
|
+
offset, width) ||
|
|
552
|
+
x;
|
|
553
|
+
}
|
|
554
|
+
return x;
|
|
555
|
+
}
|
|
556
|
+
}
|
|
557
|
+
|
|
558
|
+
/* use WARP_SIZE or WARP_32_SIZE*/
|
|
559
|
+
template <int width>
|
|
485
560
|
static __dpct_inline__ float warp_reduce_max(float x) {
|
|
486
561
|
#pragma unroll
|
|
487
562
|
for (int offset = width / 2; offset > 0; offset >>= 1) {
|
|
@@ -629,6 +704,42 @@ static const sycl::uint3 init_fastdiv_values(uint32_t d) {
|
|
|
629
704
|
return sycl::uint3(mp, L, d);
|
|
630
705
|
}
|
|
631
706
|
|
|
707
|
+
// Maximum number of bytes that can be copied in a single instruction.
|
|
708
|
+
// Set by test result.
|
|
709
|
+
static constexpr int ggml_sycl_get_max_cpy_bytes() {
|
|
710
|
+
return 16;
|
|
711
|
+
}
|
|
712
|
+
|
|
713
|
+
// Aligned memory transfers of 8/16 bytes can be faster than 2 transfers with 4 bytes.
|
|
714
|
+
template <int nbytes, int alignment = 0>
|
|
715
|
+
static __dpct_inline__ void ggml_sycl_memcpy_1(void * dst, const void * src) {
|
|
716
|
+
if constexpr (alignment != 0) {
|
|
717
|
+
static_assert(nbytes % alignment == 0, "bad alignment");
|
|
718
|
+
}
|
|
719
|
+
constexpr int nb_per_cpy = alignment == 0 ? nbytes : alignment;
|
|
720
|
+
|
|
721
|
+
#pragma unroll
|
|
722
|
+
for (int i = 0; i < nbytes/nb_per_cpy; ++i) {
|
|
723
|
+
if constexpr (nb_per_cpy == 1) {
|
|
724
|
+
((char *) dst)[i] = ((const char *) src)[i];
|
|
725
|
+
} else if constexpr (nb_per_cpy == 2) {
|
|
726
|
+
((short *) dst)[i] = ((const short *) src)[i];
|
|
727
|
+
} else if constexpr (nb_per_cpy == 4) {
|
|
728
|
+
((int *) dst)[i] = ((const int *) src)[i];
|
|
729
|
+
} else if constexpr (nb_per_cpy == 8) {
|
|
730
|
+
((sycl::int2 *) dst)[i] = ((const sycl::int2 *) src)[i];
|
|
731
|
+
} else if constexpr (nb_per_cpy == 16) {
|
|
732
|
+
((sycl::int4 *) dst)[i] = ((const sycl::int4 *) src)[i];
|
|
733
|
+
} else {
|
|
734
|
+
static_assert(nbytes == 0 && nbytes == -1, "bad nbytes");
|
|
735
|
+
}
|
|
736
|
+
}
|
|
737
|
+
}
|
|
738
|
+
template <typename T>
|
|
739
|
+
sycl::half2 __dpct_inline__ make_half2( T x, T y) {
|
|
740
|
+
sycl::half2 res(static_cast<sycl::half>(x),static_cast<sycl::half>(y));
|
|
741
|
+
return res;
|
|
742
|
+
}
|
|
632
743
|
|
|
633
744
|
static __dpct_inline__ uint32_t fastdiv(uint32_t n, const sycl::uint3 fastdiv_values) {
|
|
634
745
|
const uint32_t hi = sycl::mul_hi<unsigned>(n, fastdiv_values.x());
|
|
@@ -636,6 +747,17 @@ static __dpct_inline__ uint32_t fastdiv(uint32_t n, const sycl::uint3 fastdiv_va
|
|
|
636
747
|
}
|
|
637
748
|
|
|
638
749
|
|
|
750
|
+
template <typename T>
|
|
751
|
+
sycl::float2 __dpct_inline__ make_float2( T x, T y) {
|
|
752
|
+
sycl::float2 res(static_cast<float>(x),static_cast<float>(y));
|
|
753
|
+
return res;
|
|
754
|
+
}
|
|
755
|
+
|
|
756
|
+
sycl::float2 __dpct_inline__ __half22float2(sycl::half2 &H) {
|
|
757
|
+
sycl::float2 float2_value(static_cast<float>(H.x()), static_cast<float>(H.y()));
|
|
758
|
+
return float2_value;
|
|
759
|
+
}
|
|
760
|
+
|
|
639
761
|
static __dpct_inline__ sycl::uint2 fast_div_modulo(uint32_t n, const sycl::uint3 fastdiv_values) {
|
|
640
762
|
const uint32_t div_val = fastdiv(n, fastdiv_values);
|
|
641
763
|
const uint32_t mod_val = n - div_val * fastdiv_values.z();
|
|
@@ -659,5 +781,188 @@ static __dpct_inline__ float ggml_sycl_e8m0_to_fp32(uint8_t x) {
|
|
|
659
781
|
return result;
|
|
660
782
|
}
|
|
661
783
|
|
|
784
|
+
sycl::float2 __dpct_inline__ __half22float2(const sycl::half2 &H) {
|
|
785
|
+
sycl::float2 float2_value(static_cast<float>(H.x()), static_cast<float>(H.y()));
|
|
786
|
+
return float2_value;
|
|
787
|
+
}
|
|
788
|
+
|
|
789
|
+
float __dpct_inline__ __half2float(sycl::half H) {
|
|
790
|
+
return static_cast<float>(H);
|
|
791
|
+
}
|
|
792
|
+
|
|
793
|
+
static __dpct_inline__ void ggml_sycl_mad(float & acc, const float v, const float u) {
|
|
794
|
+
acc += v*u;
|
|
795
|
+
}
|
|
796
|
+
|
|
797
|
+
static __dpct_inline__ void ggml_sycl_mad(float & acc, const sycl::float2 v, const sycl::float2 u) {
|
|
798
|
+
acc += v.x() * u.x();
|
|
799
|
+
acc += v.y() * u.y();
|
|
800
|
+
}
|
|
801
|
+
|
|
802
|
+
static __dpct_inline__ void ggml_sycl_mad(float & acc, const sycl::half2 v, const sycl::half2 u) {
|
|
803
|
+
#ifdef GGML_SYCL_F16
|
|
804
|
+
const sycl::float2 tmp = (v * u).template convert<float, sycl::rounding_mode::automatic>();
|
|
805
|
+
acc += tmp.x() + tmp.y();
|
|
806
|
+
#else
|
|
807
|
+
const sycl::float2 tmpv = __half22float2(v);
|
|
808
|
+
const sycl::float2 tmpu = __half22float2(u);
|
|
809
|
+
acc += tmpv.x() * tmpu.x();
|
|
810
|
+
acc += tmpv.y() * tmpu.y();
|
|
811
|
+
#endif // GGML_SYCL_F16
|
|
812
|
+
}
|
|
813
|
+
|
|
814
|
+
static __dpct_inline__ void ggml_sycl_mad(sycl::half2 & acc, const sycl::half2 v, const sycl::half2 u) {
|
|
815
|
+
#ifdef GGML_SYCL_F16
|
|
816
|
+
acc += v*u;
|
|
817
|
+
#else
|
|
818
|
+
const sycl::float2 tmpv = __half22float2(v);
|
|
819
|
+
const sycl::float2 tmpu = __half22float2(u);
|
|
820
|
+
sycl::float2 tmpacc = __half22float2(acc);
|
|
821
|
+
// tmpacc.x += tmpv.x() * tmpu.x();
|
|
822
|
+
// tmpacc.y += tmpv.y() * tmpu.y();
|
|
823
|
+
sycl::float2 tmp1(tmpacc.x() + tmpv.x() * tmpu.x(), tmpacc.y() + tmpv.y() * tmpu.y());
|
|
824
|
+
acc = make_half2(tmp1.x(), tmp1.y());
|
|
825
|
+
#endif // GGML_SYCL_F16
|
|
826
|
+
}
|
|
827
|
+
|
|
828
|
+
template <int n>
|
|
829
|
+
struct ggml_sycl_unroll {
|
|
830
|
+
template <typename Func, typename... Args>
|
|
831
|
+
void operator()(const Func & f, Args... args) const {
|
|
832
|
+
f(n - 1, args...);
|
|
833
|
+
ggml_sycl_unroll<n - 1>{}(f, args...);
|
|
834
|
+
}
|
|
835
|
+
};
|
|
836
|
+
|
|
837
|
+
template <>
|
|
838
|
+
struct ggml_sycl_unroll<1> {
|
|
839
|
+
template <typename Func, typename... Args>
|
|
840
|
+
void operator()(const Func & f, Args... args) const {
|
|
841
|
+
f(0, args...);
|
|
842
|
+
}
|
|
843
|
+
};
|
|
844
|
+
|
|
845
|
+
static __dpct_inline__ sycl::half2 ggml_sycl_hmax2(const sycl::half2 a, const sycl::half2 b) {
|
|
846
|
+
sycl::half2 ret;
|
|
847
|
+
reinterpret_cast<sycl::half &>(ret.x()) =
|
|
848
|
+
sycl::vec<float, 1>(sycl::fmax(a[0], b[0])).convert<sycl::half, sycl::rounding_mode::automatic>()[0];
|
|
849
|
+
reinterpret_cast<sycl::half &>(ret.y()) =
|
|
850
|
+
sycl::vec<float, 1>(sycl::fmax(a[1], b[1])).convert<sycl::half, sycl::rounding_mode::automatic>()[0];
|
|
851
|
+
return ret;
|
|
852
|
+
}
|
|
853
|
+
|
|
854
|
+
static __dpct_inline__ sycl::half ggml_sycl_hmax(const sycl::half a, const sycl::half b) {
|
|
855
|
+
return sycl::vec<float, 1>(
|
|
856
|
+
sycl::fmax(sycl::vec<sycl::half, 1>(a).convert<float, sycl::rounding_mode::automatic>()[0],
|
|
857
|
+
sycl::vec<sycl::half, 1>(b).convert<float, sycl::rounding_mode::automatic>()[0]))
|
|
858
|
+
.convert<sycl::half, sycl::rounding_mode::automatic>()[0];
|
|
859
|
+
}
|
|
860
|
+
|
|
861
|
+
static __dpct_inline__ uint32_t __hgt2_mask(const sycl::half2 a, const sycl::half2 b) {
|
|
862
|
+
const uint32_t mask_low = 0x0000FFFF * (float(a[0]) > float(b[0]));
|
|
863
|
+
const uint32_t mask_high = 0xFFFF0000 * (float(a[1]) > float(b[1]));
|
|
864
|
+
return mask_low | mask_high;
|
|
865
|
+
}
|
|
866
|
+
|
|
867
|
+
static __dpct_inline__ uint32_t fastmodulo(uint32_t n, const sycl::uint3 fastdiv_values) {
|
|
868
|
+
// expects fastdiv_values to contain <mp, L, divisor> in <x, y, z> (see init_fastdiv_values)
|
|
869
|
+
return n - fastdiv(n, fastdiv_values) * fastdiv_values.z();
|
|
870
|
+
}
|
|
871
|
+
|
|
872
|
+
static bool fast_fp16_available(const int cc) {
|
|
873
|
+
GGML_UNUSED(cc);
|
|
874
|
+
return true; //Intel GPUs always support FP16.
|
|
875
|
+
}
|
|
876
|
+
|
|
877
|
+
enum class block_reduce_method {
|
|
878
|
+
MAX,
|
|
879
|
+
SUM,
|
|
880
|
+
};
|
|
881
|
+
|
|
882
|
+
template<block_reduce_method method_t, typename T, int warp_size>
|
|
883
|
+
struct block_reduce_policy;
|
|
884
|
+
|
|
885
|
+
template <typename T, typename... Ts>
|
|
886
|
+
inline constexpr bool is_any = (std::is_same_v<T, Ts> || ...);
|
|
887
|
+
|
|
888
|
+
template<typename...>
|
|
889
|
+
inline constexpr bool ggml_sycl_dependent_false_v = false;
|
|
890
|
+
|
|
891
|
+
#define WARP_32_SIZE 32
|
|
892
|
+
|
|
893
|
+
template <typename T, int warp_size> struct block_reduce_policy<block_reduce_method::SUM, T, warp_size> {
|
|
894
|
+
static T reduce(T val) {
|
|
895
|
+
if constexpr (is_any<T, float, sycl::float2, sycl::half2, int>) {
|
|
896
|
+
return warp_reduce_sum<warp_size>(val);
|
|
897
|
+
} else {
|
|
898
|
+
static_assert(ggml_sycl_dependent_false_v<T>, "Unsupported type for block reduce sum");
|
|
899
|
+
}
|
|
900
|
+
}
|
|
901
|
+
|
|
902
|
+
static T sentinel() {
|
|
903
|
+
if constexpr (std::is_same_v<T, float>) {
|
|
904
|
+
return 0.0f;
|
|
905
|
+
} else if constexpr (std::is_same_v<T, sycl::float2>) {
|
|
906
|
+
return sycl::float2(0.0f, 0.0f);
|
|
907
|
+
} else if constexpr (std::is_same_v<T, sycl::half2>) {
|
|
908
|
+
return sycl::half2(0.0f, 0.0f);
|
|
909
|
+
} else if constexpr (std::is_same_v<T, int>) {
|
|
910
|
+
return 0;
|
|
911
|
+
} else {
|
|
912
|
+
static_assert(ggml_sycl_dependent_false_v<T>, "Unsupported type for block reduce sum");
|
|
913
|
+
}
|
|
914
|
+
}
|
|
915
|
+
};
|
|
916
|
+
|
|
917
|
+
template <typename T, int warp_size> struct block_reduce_policy<block_reduce_method::MAX, T, warp_size> {
|
|
918
|
+
static T reduce(T val) {
|
|
919
|
+
if constexpr (is_any<T, float, sycl::half2>) {
|
|
920
|
+
return warp_reduce_max<warp_size>(val);
|
|
921
|
+
} else {
|
|
922
|
+
static_assert(ggml_sycl_dependent_false_v<T>, "Unsupported type for block reduce max");
|
|
923
|
+
}
|
|
924
|
+
}
|
|
925
|
+
|
|
926
|
+
static T sentinel() {
|
|
927
|
+
if constexpr (std::is_same_v<T, float>) {
|
|
928
|
+
return -INFINITY;
|
|
929
|
+
} else if constexpr (std::is_same_v<T, sycl::half2>) {
|
|
930
|
+
return sycl::half2(-INFINITY, -INFINITY);
|
|
931
|
+
} else {
|
|
932
|
+
static_assert(ggml_sycl_dependent_false_v<T>, "Unsupported type for block reduce max");
|
|
933
|
+
}
|
|
934
|
+
}
|
|
935
|
+
};
|
|
936
|
+
|
|
937
|
+
|
|
938
|
+
template <block_reduce_method reduce_method_t, int warp_size, typename T>
|
|
939
|
+
static T block_reduce(T val, T * shared_vals, int block_size_template) {
|
|
940
|
+
auto item_ct1 = sycl::ext::oneapi::this_work_item::get_nd_item<3>();
|
|
941
|
+
val = block_reduce_policy<reduce_method_t, T,warp_size>::reduce(val);
|
|
942
|
+
const int block_size = block_size_template == 0 ? item_ct1.get_local_range(2) : block_size_template;
|
|
943
|
+
const int nthreads = item_ct1.get_local_range(2);
|
|
944
|
+
const int nwarps = nthreads / WARP_SIZE;
|
|
945
|
+
|
|
946
|
+
if (block_size > warp_size) {
|
|
947
|
+
assert((block_size <= 1024) && (block_size % warp_size) == 0);
|
|
948
|
+
const int warp_id = item_ct1.get_local_id(2) / warp_size;
|
|
949
|
+
const int lane_id = item_ct1.get_local_id(2) % warp_size;
|
|
950
|
+
if (lane_id == 0) {
|
|
951
|
+
shared_vals[warp_id] = val;
|
|
952
|
+
}
|
|
953
|
+
item_ct1.barrier(sycl::access::fence_space::local_space);
|
|
954
|
+
|
|
955
|
+
size_t nreduce = nwarps / WARP_SIZE;
|
|
956
|
+
float tmp = 0.f;
|
|
957
|
+
if (lane_id < (static_cast<int>(block_size) / warp_size)) {
|
|
958
|
+
for (size_t i = 0; i < nreduce; i += 1)
|
|
959
|
+
{
|
|
960
|
+
tmp += shared_vals[lane_id + i * WARP_SIZE];
|
|
961
|
+
}
|
|
962
|
+
}
|
|
963
|
+
return block_reduce_policy<reduce_method_t, T, warp_size>::reduce(tmp);
|
|
964
|
+
}
|
|
965
|
+
return val;
|
|
966
|
+
}
|
|
662
967
|
|
|
663
968
|
#endif // GGML_SYCL_COMMON_HPP
|
|
@@ -482,6 +482,63 @@ static void dequantize_row_mxfp4_sycl(const void * vx, dst_t * y, const int64_t
|
|
|
482
482
|
});
|
|
483
483
|
}
|
|
484
484
|
|
|
485
|
+
template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
|
|
486
|
+
static void dequantize_block_nc(const void * __restrict__ vx, dst_t * __restrict__ y,
|
|
487
|
+
const int64_t ne00, const int64_t ne01, const int64_t ne02,
|
|
488
|
+
const int64_t s01, const int64_t s02, const int64_t s03) {
|
|
489
|
+
auto item_ct1 = sycl::ext::oneapi::this_work_item::get_nd_item<3>();
|
|
490
|
+
const int64_t i00 = 2 * (int64_t(item_ct1.get_local_range(2)) * item_ct1.get_group(2) + item_ct1.get_local_id(2));
|
|
491
|
+
|
|
492
|
+
if (i00 >= ne00) {
|
|
493
|
+
return;
|
|
494
|
+
}
|
|
495
|
+
|
|
496
|
+
const int64_t i01 = item_ct1.get_group(1);
|
|
497
|
+
const int64_t i02 = item_ct1.get_group(0) % ne02;
|
|
498
|
+
const int64_t i03 = item_ct1.get_group(0) / ne02;
|
|
499
|
+
|
|
500
|
+
const int64_t ibx0 = i03*s03 + i02*s02 + i01*s01;
|
|
501
|
+
|
|
502
|
+
const int64_t ib = ibx0 + i00/qk; // block index
|
|
503
|
+
const int64_t iqs = (i00%qk)/qr; // quant index
|
|
504
|
+
const int64_t iybs = i00 - i00%qk; // y block start index
|
|
505
|
+
const int64_t y_offset = qr == 1 ? 1 : qk/2;
|
|
506
|
+
|
|
507
|
+
// dequantize
|
|
508
|
+
#ifdef GGML_SYCL_F16
|
|
509
|
+
sycl::half2 v;
|
|
510
|
+
#else
|
|
511
|
+
sycl::float2 v;
|
|
512
|
+
#endif
|
|
513
|
+
|
|
514
|
+
dequantize_kernel(vx, ib, iqs, v);
|
|
515
|
+
|
|
516
|
+
const int64_t iy0 = ((i03*ne02 + i02)*ne01 + i01)*ne00 + iybs + iqs;
|
|
517
|
+
y[iy0 + 0] = ggml_sycl_cast<dst_t>(v.x());
|
|
518
|
+
y[iy0 + y_offset] = ggml_sycl_cast<dst_t>(v.y());
|
|
519
|
+
}
|
|
520
|
+
|
|
521
|
+
|
|
522
|
+
template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
|
|
523
|
+
static void dequantize_block_nc_sycl(const void * vx,
|
|
524
|
+
dst_t * y,
|
|
525
|
+
const int64_t ne00,
|
|
526
|
+
const int64_t ne01,
|
|
527
|
+
const int64_t ne02,
|
|
528
|
+
const int64_t ne03,
|
|
529
|
+
const int64_t s01,
|
|
530
|
+
const int64_t s02,
|
|
531
|
+
const int64_t s03,
|
|
532
|
+
dpct::queue_ptr stream) {
|
|
533
|
+
const dpct::dim3 num_blocks((ne00 + 2 * SYCL_DEQUANTIZE_BLOCK_SIZE - 1) / (2 * SYCL_DEQUANTIZE_BLOCK_SIZE), ne01,
|
|
534
|
+
ne02 * ne03);
|
|
535
|
+
stream->parallel_for(sycl::nd_range<3>(num_blocks * sycl::range<3>(1, 1, SYCL_DEQUANTIZE_BLOCK_SIZE),
|
|
536
|
+
sycl::range<3>(1, 1, SYCL_DEQUANTIZE_BLOCK_SIZE)),
|
|
537
|
+
[=](sycl::nd_item<3> item_ct1) {
|
|
538
|
+
GGML_UNUSED(item_ct1);
|
|
539
|
+
dequantize_block_nc<qk, qr, dequantize_kernel>(vx, y, ne00, ne01, ne02, s01, s02, s03);
|
|
540
|
+
});
|
|
541
|
+
}
|
|
485
542
|
template <typename src_t, typename dst_t>
|
|
486
543
|
static void convert_unary_nc(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t ne00, const int64_t ne01,
|
|
487
544
|
const int64_t ne02, const int64_t s01, const int64_t s02, const int64_t s03,
|
|
@@ -662,7 +719,8 @@ to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type, ggml_tensor *dst) {
|
|
|
662
719
|
}
|
|
663
720
|
}
|
|
664
721
|
|
|
665
|
-
|
|
722
|
+
|
|
723
|
+
to_fp16_nc_sycl_t ggml_get_to_fp16_nc_sycl(ggml_type type) {
|
|
666
724
|
switch (type) {
|
|
667
725
|
case GGML_TYPE_F32:
|
|
668
726
|
return convert_unary_nc_sycl<float>;
|
|
@@ -670,6 +728,16 @@ to_fp16_nc_sycl_t get_to_fp16_nc_sycl(ggml_type type) {
|
|
|
670
728
|
case GGML_TYPE_BF16:
|
|
671
729
|
return convert_unary_nc_sycl<sycl::ext::oneapi::bfloat16>;
|
|
672
730
|
#endif
|
|
731
|
+
case GGML_TYPE_Q4_0:
|
|
732
|
+
return dequantize_block_nc_sycl<QK4_0, QR4_0, dequantize_q4_0>;
|
|
733
|
+
case GGML_TYPE_Q4_1:
|
|
734
|
+
return dequantize_block_nc_sycl<QK4_1, QR4_1, dequantize_q4_1>;
|
|
735
|
+
case GGML_TYPE_Q5_0:
|
|
736
|
+
return dequantize_block_nc_sycl<QK5_0, QR5_0, dequantize_q5_0>;
|
|
737
|
+
case GGML_TYPE_Q5_1:
|
|
738
|
+
return dequantize_block_nc_sycl<QK5_1, QR5_1, dequantize_q5_1>;
|
|
739
|
+
case GGML_TYPE_Q8_0:
|
|
740
|
+
return dequantize_block_nc_sycl<QK8_0, QR8_0, dequantize_q8_0>;
|
|
673
741
|
default:
|
|
674
742
|
return nullptr;
|
|
675
743
|
}
|
|
@@ -29,6 +29,27 @@ using to_t_nc_sycl_t = void (*)(const void * x, T * y, int64_t ne00, int64_t ne0
|
|
|
29
29
|
int64_t s01, int64_t s02, int64_t s03, dpct::queue_ptr queue);
|
|
30
30
|
|
|
31
31
|
typedef to_t_nc_sycl_t<sycl::half> to_fp16_nc_sycl_t;
|
|
32
|
-
to_fp16_nc_sycl_t
|
|
32
|
+
to_fp16_nc_sycl_t ggml_get_to_fp16_nc_sycl(ggml_type type);
|
|
33
|
+
|
|
34
|
+
template<typename dst_t, typename src_t>
|
|
35
|
+
inline dst_t ggml_sycl_cast(src_t x) {
|
|
36
|
+
if constexpr (std::is_same_v<dst_t, src_t>) {
|
|
37
|
+
return x;
|
|
38
|
+
} else if constexpr (std::is_same_v<dst_t, sycl::ext::oneapi::bfloat16>) {
|
|
39
|
+
return sycl::ext::oneapi::bfloat16(float(x));
|
|
40
|
+
} else if constexpr (std::is_same_v<src_t, sycl::ext::oneapi::bfloat16>) {
|
|
41
|
+
return static_cast<float>(x);
|
|
42
|
+
} else if constexpr (std::is_same_v<src_t, sycl::float2> && std::is_same_v<dst_t, sycl::half2>) {
|
|
43
|
+
return x.template convert<sycl::half, sycl::rounding_mode::rte>();
|
|
44
|
+
} else if constexpr (std::is_same_v<src_t, sycl::float2> &&
|
|
45
|
+
std::is_same_v<dst_t, sycl::vec<sycl::ext::oneapi::bfloat16, 2>>) {
|
|
46
|
+
return {x.x, x.y};
|
|
47
|
+
} else if constexpr(std::is_same_v<dst_t, int32_t>) {
|
|
48
|
+
return int32_t(x);
|
|
49
|
+
} else {
|
|
50
|
+
return float(x);
|
|
51
|
+
}
|
|
52
|
+
}
|
|
53
|
+
|
|
33
54
|
|
|
34
55
|
#endif // GGML_SYCL_CONVERT_HPP
|