whispercpp 1.3.5 → 1.3.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/LICENSE +1 -1
- data/README.md +99 -2
- data/ext/extconf.rb +1 -0
- data/ext/ruby_whisper.c +20 -4
- data/ext/ruby_whisper.h +30 -2
- data/ext/ruby_whisper_context.c +216 -124
- data/ext/ruby_whisper_context_params.c +163 -0
- data/ext/ruby_whisper_model.c +0 -1
- data/ext/ruby_whisper_params.c +0 -1
- data/ext/ruby_whisper_segment.c +0 -1
- data/ext/ruby_whisper_token.c +29 -9
- data/ext/ruby_whisper_transcribe.cpp +4 -1
- data/ext/ruby_whisper_vad_context.c +48 -1
- data/ext/ruby_whisper_vad_context_detect.cpp +6 -5
- data/ext/ruby_whisper_vad_params.c +0 -1
- data/ext/ruby_whisper_vad_segment.c +0 -1
- data/ext/ruby_whisper_vad_segments.c +0 -1
- data/ext/sources/CMakeLists.txt +1 -1
- data/ext/sources/bindings/javascript/package.json +1 -1
- data/ext/sources/cmake/whisper-config.cmake.in +5 -40
- data/ext/sources/examples/bench/bench.cpp +23 -18
- data/ext/sources/examples/cli/cli.cpp +8 -0
- data/ext/sources/examples/common-ggml.cpp +2 -0
- data/ext/sources/examples/miniaudio.h +4507 -2131
- data/ext/sources/examples/server/server.cpp +18 -4
- data/ext/sources/examples/talk-llama/CMakeLists.txt +3 -2
- data/ext/sources/examples/talk-llama/llama-adapter.cpp +7 -13
- data/ext/sources/examples/talk-llama/llama-adapter.h +4 -3
- data/ext/sources/examples/talk-llama/llama-arch.cpp +335 -17
- data/ext/sources/examples/talk-llama/llama-arch.h +42 -0
- data/ext/sources/examples/talk-llama/llama-batch.cpp +3 -1
- data/ext/sources/examples/talk-llama/llama-chat.cpp +21 -1
- data/ext/sources/examples/talk-llama/llama-chat.h +1 -0
- data/ext/sources/examples/talk-llama/llama-context.cpp +508 -520
- data/ext/sources/examples/talk-llama/llama-context.h +27 -28
- data/ext/sources/examples/talk-llama/llama-cparams.h +5 -0
- data/ext/sources/examples/talk-llama/llama-ext.h +12 -0
- data/ext/sources/examples/talk-llama/llama-grammar.cpp +8 -8
- data/ext/sources/examples/talk-llama/llama-graph.cpp +583 -130
- data/ext/sources/examples/talk-llama/llama-graph.h +131 -10
- data/ext/sources/examples/talk-llama/llama-hparams.cpp +57 -40
- data/ext/sources/examples/talk-llama/llama-hparams.h +79 -10
- data/ext/sources/examples/talk-llama/llama-impl.cpp +4 -4
- data/ext/sources/examples/talk-llama/llama-impl.h +13 -1
- data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.cpp +3 -1
- data/ext/sources/examples/talk-llama/llama-kv-cache.cpp +274 -89
- data/ext/sources/examples/talk-llama/llama-kv-cache.h +2 -3
- data/ext/sources/examples/talk-llama/llama-memory-hybrid-iswa.cpp +275 -0
- data/ext/sources/examples/talk-llama/llama-memory-hybrid-iswa.h +140 -0
- data/ext/sources/examples/talk-llama/llama-memory-recurrent.cpp +11 -13
- data/ext/sources/examples/talk-llama/llama-mmap.cpp +28 -11
- data/ext/sources/examples/talk-llama/llama-model-loader.cpp +527 -119
- data/ext/sources/examples/talk-llama/llama-model-loader.h +35 -5
- data/ext/sources/examples/talk-llama/llama-model-saver.cpp +60 -46
- data/ext/sources/examples/talk-llama/llama-model-saver.h +5 -2
- data/ext/sources/examples/talk-llama/llama-model.cpp +1365 -647
- data/ext/sources/examples/talk-llama/llama-model.h +72 -19
- data/ext/sources/examples/talk-llama/llama-quant.cpp +578 -346
- data/ext/sources/examples/talk-llama/{llama-sampling.cpp → llama-sampler.cpp} +190 -76
- data/ext/sources/examples/talk-llama/{llama-sampling.h → llama-sampler.h} +0 -2
- data/ext/sources/examples/talk-llama/llama-vocab.cpp +118 -48
- data/ext/sources/examples/talk-llama/llama-vocab.h +5 -0
- data/ext/sources/examples/talk-llama/llama.cpp +76 -22
- data/ext/sources/examples/talk-llama/llama.h +63 -30
- data/ext/sources/examples/talk-llama/models/afmoe.cpp +2 -3
- data/ext/sources/examples/talk-llama/models/apertus.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/arcee.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/arctic.cpp +4 -5
- data/ext/sources/examples/talk-llama/models/baichuan.cpp +4 -3
- data/ext/sources/examples/talk-llama/models/bailingmoe.cpp +1 -2
- data/ext/sources/examples/talk-llama/models/bailingmoe2.cpp +3 -5
- data/ext/sources/examples/talk-llama/models/bert.cpp +13 -7
- data/ext/sources/examples/talk-llama/models/bitnet.cpp +9 -24
- data/ext/sources/examples/talk-llama/models/bloom.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/chameleon.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/chatglm.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/codeshell.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/cogvlm.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/cohere2-iswa.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/command-r.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/dbrx.cpp +4 -5
- data/ext/sources/examples/talk-llama/models/deci.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/deepseek.cpp +4 -6
- data/ext/sources/examples/talk-llama/models/deepseek2.cpp +24 -21
- data/ext/sources/examples/talk-llama/models/delta-net-base.cpp +445 -0
- data/ext/sources/examples/talk-llama/models/dots1.cpp +4 -6
- data/ext/sources/examples/talk-llama/models/dream.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/ernie4-5-moe.cpp +4 -6
- data/ext/sources/examples/talk-llama/models/ernie4-5.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/eurobert.cpp +97 -0
- data/ext/sources/examples/talk-llama/models/exaone-moe.cpp +145 -0
- data/ext/sources/examples/talk-llama/models/exaone.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/exaone4.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/falcon-h1.cpp +2 -4
- data/ext/sources/examples/talk-llama/models/falcon.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/gemma-embedding.cpp +1 -1
- data/ext/sources/examples/talk-llama/models/gemma.cpp +1 -1
- data/ext/sources/examples/talk-llama/models/gemma2-iswa.cpp +1 -1
- data/ext/sources/examples/talk-llama/models/gemma3.cpp +1 -1
- data/ext/sources/examples/talk-llama/models/gemma3n-iswa.cpp +7 -7
- data/ext/sources/examples/talk-llama/models/glm4-moe.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/glm4.cpp +14 -7
- data/ext/sources/examples/talk-llama/models/gpt2.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/gptneox.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/granite-hybrid.cpp +4 -5
- data/ext/sources/examples/talk-llama/models/granite.cpp +4 -5
- data/ext/sources/examples/talk-llama/models/grok.cpp +4 -4
- data/ext/sources/examples/talk-llama/models/grovemoe.cpp +5 -7
- data/ext/sources/examples/talk-llama/models/hunyuan-dense.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/hunyuan-moe.cpp +4 -5
- data/ext/sources/examples/talk-llama/models/internlm2.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/jais.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/jais2.cpp +123 -0
- data/ext/sources/examples/talk-llama/models/jamba.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/kimi-linear.cpp +381 -0
- data/ext/sources/examples/talk-llama/models/lfm2.cpp +145 -124
- data/ext/sources/examples/talk-llama/models/llada-moe.cpp +4 -4
- data/ext/sources/examples/talk-llama/models/llada.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/llama-iswa.cpp +4 -4
- data/ext/sources/examples/talk-llama/models/llama.cpp +18 -11
- data/ext/sources/examples/talk-llama/models/maincoder.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/{graph-context-mamba.cpp → mamba-base.cpp} +9 -3
- data/ext/sources/examples/talk-llama/models/mamba.cpp +1 -2
- data/ext/sources/examples/talk-llama/models/mimo2-iswa.cpp +11 -5
- data/ext/sources/examples/talk-llama/models/minicpm3.cpp +14 -13
- data/ext/sources/examples/talk-llama/models/minimax-m2.cpp +4 -5
- data/ext/sources/examples/talk-llama/models/mistral3.cpp +4 -4
- data/ext/sources/examples/talk-llama/models/models.h +181 -46
- data/ext/sources/examples/talk-llama/models/modern-bert.cpp +2 -9
- data/ext/sources/examples/talk-llama/models/mpt.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/nemotron-h.cpp +26 -14
- data/ext/sources/examples/talk-llama/models/nemotron.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/neo-bert.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/olmo.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/olmo2.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/olmoe.cpp +4 -4
- data/ext/sources/examples/talk-llama/models/openai-moe-iswa.cpp +1 -1
- data/ext/sources/examples/talk-llama/models/openelm.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/orion.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/paddleocr.cpp +122 -0
- data/ext/sources/examples/talk-llama/models/pangu-embedded.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/phi2.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/phi3.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/plamo.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/plamo2.cpp +9 -5
- data/ext/sources/examples/talk-llama/models/plamo3.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/plm.cpp +15 -14
- data/ext/sources/examples/talk-llama/models/qwen.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/qwen2.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/qwen2moe.cpp +4 -4
- data/ext/sources/examples/talk-llama/models/qwen2vl.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/qwen3.cpp +12 -9
- data/ext/sources/examples/talk-llama/models/qwen35.cpp +381 -0
- data/ext/sources/examples/talk-llama/models/qwen35moe.cpp +422 -0
- data/ext/sources/examples/talk-llama/models/qwen3moe.cpp +15 -8
- data/ext/sources/examples/talk-llama/models/qwen3next.cpp +84 -432
- data/ext/sources/examples/talk-llama/models/qwen3vl-moe.cpp +9 -18
- data/ext/sources/examples/talk-llama/models/qwen3vl.cpp +8 -17
- data/ext/sources/examples/talk-llama/models/refact.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/rnd1.cpp +4 -4
- data/ext/sources/examples/talk-llama/models/rwkv6-base.cpp +2 -0
- data/ext/sources/examples/talk-llama/models/rwkv7-base.cpp +2 -0
- data/ext/sources/examples/talk-llama/models/seed-oss.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/smallthinker.cpp +4 -4
- data/ext/sources/examples/talk-llama/models/smollm3.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/stablelm.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/starcoder.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/starcoder2.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/step35-iswa.cpp +165 -0
- data/ext/sources/examples/talk-llama/models/t5-dec.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/t5-enc.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/xverse.cpp +3 -3
- data/ext/sources/examples/talk-llama/unicode.cpp +21 -65
- data/ext/sources/ggml/CMakeLists.txt +9 -3
- data/ext/sources/ggml/include/ggml-backend.h +1 -1
- data/ext/sources/ggml/include/ggml-cann.h +1 -1
- data/ext/sources/ggml/include/ggml-cpu.h +5 -0
- data/ext/sources/ggml/include/ggml-openvino.h +37 -0
- data/ext/sources/ggml/include/ggml-opt.h +1 -1
- data/ext/sources/ggml/include/ggml-rpc.h +6 -1
- data/ext/sources/ggml/include/ggml-virtgpu.h +14 -0
- data/ext/sources/ggml/include/ggml.h +56 -9
- data/ext/sources/ggml/src/CMakeLists.txt +3 -0
- data/ext/sources/ggml/src/ggml-alloc.c +4 -9
- data/ext/sources/ggml/src/ggml-backend-dl.cpp +48 -0
- data/ext/sources/ggml/src/ggml-backend-dl.h +45 -0
- data/ext/sources/ggml/src/ggml-backend-reg.cpp +28 -86
- data/ext/sources/ggml/src/ggml-backend.cpp +5 -2
- data/ext/sources/ggml/src/ggml-blas/CMakeLists.txt +1 -1
- data/ext/sources/ggml/src/ggml-blas/ggml-blas.cpp +6 -2
- data/ext/sources/ggml/src/ggml-cann/acl_tensor.cpp +1 -1
- data/ext/sources/ggml/src/ggml-cann/acl_tensor.h +1 -1
- data/ext/sources/ggml/src/ggml-cann/aclnn_ops.cpp +348 -189
- data/ext/sources/ggml/src/ggml-cann/aclnn_ops.h +40 -85
- data/ext/sources/ggml/src/ggml-cann/common.h +3 -4
- data/ext/sources/ggml/src/ggml-cann/ggml-cann.cpp +44 -62
- data/ext/sources/ggml/src/ggml-common.h +11 -0
- data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +16 -11
- data/ext/sources/ggml/src/ggml-cpu/amx/amx.cpp +42 -19
- data/ext/sources/ggml/src/ggml-cpu/amx/common.h +34 -10
- data/ext/sources/ggml/src/ggml-cpu/amx/mmq.cpp +85 -85
- data/ext/sources/ggml/src/ggml-cpu/arch/arm/quants.c +85 -1
- data/ext/sources/ggml/src/ggml-cpu/arch/arm/repack.cpp +2744 -548
- data/ext/sources/ggml/src/ggml-cpu/arch/riscv/quants.c +1653 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/riscv/repack.cpp +1391 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/s390/quants.c +8 -10
- data/ext/sources/ggml/src/ggml-cpu/arch/x86/quants.c +9 -9
- data/ext/sources/ggml/src/ggml-cpu/arch/x86/repack.cpp +118 -18
- data/ext/sources/ggml/src/ggml-cpu/arch-fallback.h +107 -26
- data/ext/sources/ggml/src/ggml-cpu/binary-ops.cpp +2 -6
- data/ext/sources/ggml/src/ggml-cpu/common.h +8 -0
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-impl.h +3 -0
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +59 -12
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.cpp +15 -0
- data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.cpp +21 -20
- data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +965 -252
- data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +584 -197
- data/ext/sources/ggml/src/ggml-cpu/ops.cpp +903 -188
- data/ext/sources/ggml/src/ggml-cpu/ops.h +1 -0
- data/ext/sources/ggml/src/ggml-cpu/quants.c +40 -0
- data/ext/sources/ggml/src/ggml-cpu/quants.h +3 -0
- data/ext/sources/ggml/src/ggml-cpu/repack.cpp +2890 -679
- data/ext/sources/ggml/src/ggml-cpu/repack.h +119 -8
- data/ext/sources/ggml/src/ggml-cpu/simd-gemm.h +136 -0
- data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +111 -3
- data/ext/sources/ggml/src/ggml-cpu/unary-ops.cpp +1 -1
- data/ext/sources/ggml/src/ggml-cpu/vec.cpp +17 -0
- data/ext/sources/ggml/src/ggml-cuda/CMakeLists.txt +1 -1
- data/ext/sources/ggml/src/ggml-cuda/argsort.cu +19 -10
- data/ext/sources/ggml/src/ggml-cuda/binbcast.cu +32 -30
- data/ext/sources/ggml/src/ggml-cuda/common.cuh +134 -18
- data/ext/sources/ggml/src/ggml-cuda/convert.cu +41 -27
- data/ext/sources/ggml/src/ggml-cuda/cpy.cu +6 -3
- data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +78 -64
- data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +384 -143
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cuh +36 -22
- data/ext/sources/ggml/src/ggml-cuda/fattn-vec.cuh +3 -3
- data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +26 -5
- data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cuh +1 -1
- data/ext/sources/ggml/src/ggml-cuda/fattn.cu +127 -12
- data/ext/sources/ggml/src/ggml-cuda/gated_delta_net.cu +263 -0
- data/ext/sources/ggml/src/ggml-cuda/gated_delta_net.cuh +4 -0
- data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +595 -200
- data/ext/sources/ggml/src/ggml-cuda/mean.cu +9 -8
- data/ext/sources/ggml/src/ggml-cuda/mma.cuh +173 -6
- data/ext/sources/ggml/src/ggml-cuda/mmf.cu +30 -10
- data/ext/sources/ggml/src/ggml-cuda/mmf.cuh +158 -85
- data/ext/sources/ggml/src/ggml-cuda/mmq.cuh +34 -22
- data/ext/sources/ggml/src/ggml-cuda/mmvf.cu +127 -67
- data/ext/sources/ggml/src/ggml-cuda/mmvf.cuh +2 -0
- data/ext/sources/ggml/src/ggml-cuda/mmvq.cu +157 -65
- data/ext/sources/ggml/src/ggml-cuda/mmvq.cuh +1 -0
- data/ext/sources/ggml/src/ggml-cuda/norm.cu +18 -76
- data/ext/sources/ggml/src/ggml-cuda/pad.cu +13 -10
- data/ext/sources/ggml/src/ggml-cuda/quantize.cu +1 -1
- data/ext/sources/ggml/src/ggml-cuda/reduce_rows.cuh +2 -16
- data/ext/sources/ggml/src/ggml-cuda/rope.cu +233 -133
- data/ext/sources/ggml/src/ggml-cuda/softmax.cu +8 -83
- data/ext/sources/ggml/src/ggml-cuda/solve_tri.cu +1 -1
- data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cu +56 -32
- data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cuh +1 -1
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_32.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_4.cu +1 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_32.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_4.cu +1 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_4.cu +1 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_4.cu +1 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +3 -3
- data/ext/sources/ggml/src/ggml-cuda/top-k.cu +0 -1
- data/ext/sources/ggml/src/ggml-cuda/topk-moe.cu +199 -135
- data/ext/sources/ggml/src/ggml-cuda/topk-moe.cuh +20 -14
- data/ext/sources/ggml/src/ggml-cuda/unary.cu +55 -0
- data/ext/sources/ggml/src/ggml-cuda/unary.cuh +2 -0
- data/ext/sources/ggml/src/ggml-cuda/vecdotq.cuh +31 -17
- data/ext/sources/ggml/src/ggml-cuda/vendors/hip.h +10 -0
- data/ext/sources/ggml/src/ggml-hexagon/CMakeLists.txt +82 -45
- data/ext/sources/ggml/src/ggml-hexagon/ggml-hexagon.cpp +334 -160
- data/ext/sources/ggml/src/ggml-hexagon/htp/CMakeLists.txt +7 -5
- data/ext/sources/ggml/src/ggml-hexagon/htp/act-ops.c +328 -197
- data/ext/sources/ggml/src/ggml-hexagon/htp/argsort-ops.c +281 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/binary-ops.c +765 -234
- data/ext/sources/ggml/src/ggml-hexagon/htp/cpy-ops.c +252 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/flash-attn-ops.c +412 -265
- data/ext/sources/ggml/src/ggml-hexagon/htp/get-rows-ops.c +23 -23
- data/ext/sources/ggml/src/ggml-hexagon/htp/{htp-dma.c → hex-dma.c} +1 -1
- data/ext/sources/ggml/src/ggml-hexagon/htp/{htp-dma.h → hex-dma.h} +28 -3
- data/ext/sources/ggml/src/ggml-hexagon/htp/hex-dump.h +77 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hex-fastdiv.h +37 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hex-utils.h +51 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ctx.h +1 -1
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp-msg.h +27 -37
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ops.h +6 -35
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-arith.h +443 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-base.h +240 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-copy.h +245 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-div.h +251 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-dump.h +129 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-exp.h +215 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-floor.h +100 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-inverse.h +210 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-reduce.h +296 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-scale.h +133 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sigmoid.h +141 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sqrt.h +126 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-types.h +36 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-utils.h +20 -1347
- data/ext/sources/ggml/src/ggml-hexagon/htp/main.c +211 -13
- data/ext/sources/ggml/src/ggml-hexagon/htp/matmul-ops.c +1119 -952
- data/ext/sources/ggml/src/ggml-hexagon/htp/rope-ops.c +254 -244
- data/ext/sources/ggml/src/ggml-hexagon/htp/set-rows-ops.c +36 -36
- data/ext/sources/ggml/src/ggml-hexagon/htp/softmax-ops.c +155 -138
- data/ext/sources/ggml/src/ggml-hexagon/htp/ssm-conv.c +339 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/sum-rows-ops.c +128 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/unary-ops.c +209 -114
- data/ext/sources/ggml/src/ggml-hexagon/htp/worker-pool.c +1 -5
- data/ext/sources/ggml/src/ggml-hexagon/htp-drv.cpp +418 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp-drv.h +121 -0
- data/ext/sources/ggml/src/ggml-hexagon/libdl.h +79 -0
- data/ext/sources/ggml/src/ggml-hexagon/libggml-htp.inf +38 -0
- data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +6 -0
- data/ext/sources/ggml/src/ggml-impl.h +62 -0
- data/ext/sources/ggml/src/ggml-metal/CMakeLists.txt +10 -10
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-common.cpp +13 -2
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.h +8 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.m +147 -17
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.cpp +274 -73
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.h +22 -4
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.m +102 -36
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +174 -23
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.cpp +580 -280
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.h +5 -4
- data/ext/sources/ggml/src/ggml-metal/ggml-metal.cpp +320 -107
- data/ext/sources/ggml/src/ggml-metal/ggml-metal.metal +1068 -825
- data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +19 -1
- data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +3108 -636
- data/ext/sources/ggml/src/ggml-opencl/kernels/concat.cl +41 -99
- data/ext/sources/ggml/src/ggml-opencl/kernels/cpy.cl +45 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/cumsum.cl +139 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/cvt.cl +204 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/diag.cl +27 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/exp.cl +125 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/expm1.cl +87 -56
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q4_1_f32.cl +132 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_general_q8_0_f32.cl +195 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q4_1_f32.cl +283 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/l2_norm.cl +71 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mean.cl +114 -13
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q4_0_f32_l4_lm.cl +163 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q4_1_f32_l4_lm.cl +165 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q6_k_f32_l4_lm.cl +158 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_8x4.cl +129 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32.cl +219 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32_flat.cl +229 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_k_f32.cl +180 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/{mul_mv_q6_k.cl → mul_mv_q6_k_f32.cl} +4 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32_flat.cl +194 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/neg.cl +125 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/repeat.cl +31 -32
- data/ext/sources/ggml/src/ggml-opencl/kernels/scale.cl +14 -4
- data/ext/sources/ggml/src/ggml-opencl/kernels/softplus.cl +88 -60
- data/ext/sources/ggml/src/ggml-opencl/kernels/solve_tri.cl +51 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/sum_rows.cl +114 -13
- data/ext/sources/ggml/src/ggml-opencl/kernels/tanh.cl +94 -48
- data/ext/sources/ggml/src/ggml-opencl/kernels/transpose.cl +26 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/tri.cl +32 -0
- data/ext/sources/ggml/src/ggml-openvino/.clang-format +154 -0
- data/ext/sources/ggml/src/ggml-openvino/CMakeLists.txt +22 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-decoder.cpp +975 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-decoder.h +294 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-openvino-extra.cpp +373 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-openvino-extra.h +182 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-openvino.cpp +1110 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-quants.cpp +884 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-quants.h +153 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/decoder.h +74 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/frontend.cpp +27 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/frontend.h +23 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/input_model.cpp +17 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/input_model.h +29 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/node_context.h +112 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/cont.cpp +48 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/cpy.cpp +21 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp +90 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/get_rows.cpp +69 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/glu_geglu.cpp +61 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp +62 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/mulmat.cpp +90 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/permute.cpp +102 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/reshape.cpp +83 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp +46 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/rope.cpp +123 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/scale.cpp +41 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/set_rows.cpp +76 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/softmax.cpp +89 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/transpose.cpp +23 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/unary_silu.cpp +27 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/view.cpp +53 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op_table.cpp +46 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op_table.h +39 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.cpp +123 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.h +17 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.cpp +60 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.h +17 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/mark_decompression_convert_constant_folding.h +29 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/squeeze_matmul.cpp +58 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/squeeze_matmul.h +17 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/translate_session.cpp +293 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/translate_session.h +28 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/utils.cpp +226 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/utils.h +85 -0
- data/ext/sources/ggml/src/ggml-openvino/utils.cpp +823 -0
- data/ext/sources/ggml/src/ggml-openvino/utils.h +123 -0
- data/ext/sources/ggml/src/ggml-quants.c +96 -5
- data/ext/sources/ggml/src/ggml-quants.h +3 -0
- data/ext/sources/ggml/src/ggml-sycl/CMakeLists.txt +15 -88
- data/ext/sources/ggml/src/ggml-sycl/add-id.cpp +5 -1
- data/ext/sources/ggml/src/ggml-sycl/backend.hpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/binbcast.cpp +21 -20
- data/ext/sources/ggml/src/ggml-sycl/common.hpp +315 -10
- data/ext/sources/ggml/src/ggml-sycl/convert.cpp +69 -1
- data/ext/sources/ggml/src/ggml-sycl/convert.hpp +22 -1
- data/ext/sources/ggml/src/ggml-sycl/count-equal.cpp +1 -1
- data/ext/sources/ggml/src/ggml-sycl/dpct/helper.hpp +791 -47
- data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +78 -68
- data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +2 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn-common.hpp +1179 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn-tile.cpp +55 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn-tile.hpp +1338 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn-vec.hpp +667 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn.cpp +225 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn.hpp +22 -0
- data/ext/sources/ggml/src/ggml-sycl/gated_delta_net.cpp +309 -0
- data/ext/sources/ggml/src/ggml-sycl/gated_delta_net.hpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/ggml-sycl.cpp +316 -51
- data/ext/sources/ggml/src/ggml-sycl/norm.cpp +65 -66
- data/ext/sources/ggml/src/ggml-sycl/outprod.cpp +3 -3
- data/ext/sources/ggml/src/ggml-sycl/presets.hpp +3 -0
- data/ext/sources/ggml/src/ggml-sycl/quants.hpp +1 -1
- data/ext/sources/ggml/src/ggml-sycl/rope.cpp +450 -287
- data/ext/sources/ggml/src/ggml-sycl/rope.hpp +6 -0
- data/ext/sources/ggml/src/ggml-sycl/softmax.cpp +6 -6
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq112-dv112.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq128-dv128.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq256-dv256.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq40-dv40.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq576-dv512.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq64-dv64.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq72-dv72.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq80-dv80.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq96-dv96.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-f16.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q8_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-f16.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q8_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-f16.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q8_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-f16.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q8_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-f16.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q8_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-f16.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q8_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +13 -0
- data/ext/sources/ggml/src/ggml-sycl/wkv.cpp +1 -1
- data/ext/sources/ggml/src/ggml-virtgpu/CMakeLists.txt +70 -0
- data/ext/sources/ggml/src/ggml-virtgpu/apir_cs_ggml-rpc-front.cpp +87 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/CMakeLists.txt +21 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/apir_cs_ggml-rpc-back.cpp +115 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-convert.h +13 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-backend.cpp +102 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer-type.cpp +105 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer.cpp +179 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-device.cpp +148 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched.cpp +51 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched.gen.h +73 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched.h +27 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-virgl-apir.h +32 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend.cpp +144 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/api_remoting.h +95 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_backend.gen.h +94 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_backend.h +50 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_cs.h +378 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_cs_ggml.h +232 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_cs_rpc.h +58 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-buffer-type.cpp +81 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-buffer.cpp +119 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-device.cpp +158 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-reg.cpp +213 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend.cpp +69 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-remoting.h +71 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggmlremoting_functions.yaml +166 -0
- data/ext/sources/ggml/src/ggml-virtgpu/include/apir_hw.h +9 -0
- data/ext/sources/ggml/src/ggml-virtgpu/regenerate_remoting.py +333 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-apir.h +15 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-backend.cpp +58 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-buffer-type.cpp +110 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-buffer.cpp +173 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-device.cpp +192 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-impl.h +36 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward.gen.h +53 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-shm.cpp +98 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-shm.h +23 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-utils.cpp +179 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-utils.h +86 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu.cpp +544 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu.h +117 -0
- data/ext/sources/ggml/src/ggml-vulkan/CMakeLists.txt +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/ggml-vulkan.cpp +1250 -465
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/acc.comp +16 -8
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/elu.comp +27 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +374 -170
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.glsl +66 -22
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +389 -201
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +106 -58
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_mask_opt.comp +162 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +9 -8
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gated_delta_net.comp +128 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/l2_norm.comp +12 -9
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.glsl +20 -17
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +11 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +8 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_id_funcs.glsl +3 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +5 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.glsl +3 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +2 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl +36 -63
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +7 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +7 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +7 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl +10 -5
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_vision.comp +7 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sgn.comp +21 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/ssm_conv.comp +16 -10
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +55 -35
- data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp +1314 -109
- data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu.cpp +1660 -1371
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/argmax.wgsl +72 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/argsort.wgsl +106 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/argsort_merge.wgsl +134 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary.wgsl +141 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +65 -72
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/concat.wgsl +75 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cpy.tmpl.wgsl +6 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cumsum.wgsl +66 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +40 -5
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn.wgsl +105 -60
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{get_rows.tmpl.wgsl → get_rows.wgsl} +53 -259
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{mul_mat.tmpl.wgsl → mul_mat.wgsl} +68 -257
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl +692 -23
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{mul_mat_reg_tile.tmpl.wgsl → mul_mat_reg_tile.wgsl} +28 -128
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{mul_mat_subgroup_matrix.tmpl.wgsl → mul_mat_subgroup_matrix.wgsl} +31 -137
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.wgsl +480 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/pad.wgsl +86 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/repeat.wgsl +67 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{scale.tmpl.wgsl → scale.wgsl} +9 -36
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +40 -12
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/sum_rows.wgsl +55 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/unary.wgsl +193 -0
- data/ext/sources/ggml/src/ggml-zdnn/ggml-zdnn.cpp +6 -1
- data/ext/sources/ggml/src/ggml-zendnn/CMakeLists.txt +31 -32
- data/ext/sources/ggml/src/ggml-zendnn/ggml-zendnn.cpp +9 -6
- data/ext/sources/ggml/src/ggml.c +167 -33
- data/ext/sources/ggml/src/gguf.cpp +229 -44
- data/ext/sources/src/whisper.cpp +6 -28
- data/sig/whisper.rbs +43 -2
- data/test/test_context_params.rb +82 -0
- data/test/test_token.rb +11 -0
- data/test/test_vad_context.rb +58 -8
- data/test/test_whisper.rb +20 -0
- data/whispercpp.gemspec +1 -1
- metadata +240 -28
- data/ext/sources/ggml/cmake/BuildTypes.cmake +0 -54
- data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm-ppc.h +0 -333
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-exp.c +0 -94
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-inverse.c +0 -72
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sigmoid.c +0 -49
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-utils.c +0 -1020
- data/ext/sources/ggml/src/ggml-hexagon/htp/ops-utils.h +0 -149
- data/ext/sources/ggml/src/ggml-hexagon/htp-utils.c +0 -454
- data/ext/sources/ggml/src/ggml-hexagon/htp-utils.h +0 -221
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/bin_op.tmpl.wgsl +0 -188
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary_head.tmpl +0 -45
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.tmpl.wgsl +0 -267
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.tmpl.wgsl +0 -112
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/unary_op.wgsl +0 -483
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
#include "ggml-backend-impl.h"
#include "ggml-decoder.h"
#include "ggml-impl.h"

#include <algorithm>
#include <cstddef>
#include <map>
#include <memory>
#include <mutex>
#include <openvino/runtime/core.hpp>
#include <openvino/runtime/infer_request.hpp>
#include <string>
#include <unordered_map>
#include <vector>
|
|
13
|
+
|
|
14
|
+
struct graph_key {
|
|
15
|
+
int n_nodes;
|
|
16
|
+
std::string first_node_name;
|
|
17
|
+
std::string last_node_name;
|
|
18
|
+
|
|
19
|
+
graph_key(const ggml_cgraph * cgraph) : n_nodes(cgraph->n_nodes) {
|
|
20
|
+
if (n_nodes > 0) {
|
|
21
|
+
first_node_name = cgraph->nodes[0]->name;
|
|
22
|
+
last_node_name = cgraph->nodes[n_nodes - 1]->name;
|
|
23
|
+
}
|
|
24
|
+
}
|
|
25
|
+
|
|
26
|
+
bool operator==(const graph_key & other) const {
|
|
27
|
+
return n_nodes == other.n_nodes && first_node_name == other.first_node_name &&
|
|
28
|
+
last_node_name == other.last_node_name;
|
|
29
|
+
}
|
|
30
|
+
};
|
|
31
|
+
|
|
32
|
+
struct graph_key_hash {
|
|
33
|
+
size_t operator()(const graph_key & key) const {
|
|
34
|
+
size_t h = std::hash<int>{}(key.n_nodes);
|
|
35
|
+
if (key.n_nodes > 0) {
|
|
36
|
+
h ^= std::hash<std::string>{}(key.first_node_name) + 0x9e3779b9 + (h << 6) + (h >> 2);
|
|
37
|
+
h ^= std::hash<std::string>{}(key.last_node_name) + 0x9e3779b9 + (h << 6) + (h >> 2);
|
|
38
|
+
}
|
|
39
|
+
return h;
|
|
40
|
+
}
|
|
41
|
+
};
|
|
42
|
+
|
|
43
|
+
struct ov_runtime_context {
|
|
44
|
+
std::mutex ov_compute_mutex;
|
|
45
|
+
std::string device;
|
|
46
|
+
bool stateful;
|
|
47
|
+
std::unordered_map<graph_key, std::shared_ptr<GgmlOvDecoder>, graph_key_hash> decoder_cache;
|
|
48
|
+
std::unordered_map<graph_key, std::shared_ptr<ov::InferRequest>, graph_key_hash> infer_request_cache;
|
|
49
|
+
std::unordered_map<graph_key, std::shared_ptr<ov::InferRequest>, graph_key_hash> infer_request_cache_prefill;
|
|
50
|
+
std::unordered_map<graph_key, std::vector<std::string>, graph_key_hash> ov_input_names_cache;
|
|
51
|
+
std::unordered_map<graph_key, std::vector<std::string>, graph_key_hash> ov_output_names_cache;
|
|
52
|
+
//TODO: Stateful is only supported for single request at a time.
|
|
53
|
+
// Simultanous stateful inference request support to be added.
|
|
54
|
+
size_t stateful_kv_size;
|
|
55
|
+
std::map<std::string, std::string> kv_state_input_name_map;
|
|
56
|
+
|
|
57
|
+
ov_runtime_context() :
|
|
58
|
+
device("CPU"),
|
|
59
|
+
stateful(false),
|
|
60
|
+
stateful_kv_size(0) {}
|
|
61
|
+
};
|
|
62
|
+
|
|
63
|
+
enum ggml_status ov_graph_compute(struct ggml_cgraph * cgraph, ggml_backend_t backend);
|
|
64
|
+
|
|
65
|
+
enum ggml_status ov_graph_compute_dynamic(struct ggml_cgraph * cgraph, std::shared_ptr<ov_runtime_context> r_ctx);
|
|
66
|
+
enum ggml_status ov_graph_compute_static(struct ggml_cgraph * cgraph, std::shared_ptr<ov_runtime_context> r_ctx);
|
|
67
|
+
|
|
68
|
+
size_t checksum(const void * data, size_t size);
|
|
69
|
+
|
|
70
|
+
void print_input_tensor_info(const std::string & name, const ov::Tensor & tensor);
|
|
71
|
+
|
|
72
|
+
void print_output_tensor_info(const std::string & name, const ov::Tensor & tensor, const void * output_dst);
|
|
73
|
+
|
|
74
|
+
// Copies a row-major rows x cols matrix into a newly allocated row-major
// padded_rows x padded_cols matrix.
//
//   data        - source elements, rows * cols values; may be null
//   rows, cols  - extents of the source matrix
//   padded_rows, padded_cols - extents of the result
//   pad_value   - value stored in every cell not covered by the source
//
// Source rows/columns beyond the padded extents are dropped. A null `data`
// pointer yields a result filled entirely with pad_value.
template <typename T>
std::vector<T> pad_input(const T * data,
                         size_t    rows,
                         size_t    cols,
                         size_t    padded_rows,
                         size_t    padded_cols,
                         T         pad_value) {
    std::vector<T> padded(padded_rows * padded_cols, pad_value);

    // Extents of the region shared by source and destination; computed once
    // instead of on every loop iteration.
    const size_t copy_rows = std::min(rows, padded_rows);
    const size_t copy_cols = std::min(cols, padded_cols);
    if (data == nullptr || copy_rows == 0 || copy_cols == 0) {
        return padded;
    }

    for (size_t i = 0; i < copy_rows; ++i) {
        // Each row is contiguous on both sides, only the row strides differ.
        std::copy_n(data + i * cols, copy_cols,
                    padded.begin() + static_cast<std::ptrdiff_t>(i * padded_cols));
    }

    return padded;
}
|
|
91
|
+
|
|
92
|
+
template <typename T>
|
|
93
|
+
std::vector<T> pad_input(const ggml_tensor * tensor, size_t padded_rows, size_t padded_cols, T pad_value) {
|
|
94
|
+
return pad_input<T>(reinterpret_cast<const T *>(tensor->data),
|
|
95
|
+
static_cast<size_t>(tensor->ne[1]), // rows
|
|
96
|
+
static_cast<size_t>(tensor->ne[0]), // cols
|
|
97
|
+
padded_rows, padded_cols, pad_value);
|
|
98
|
+
}
|
|
99
|
+
|
|
100
|
+
void set_zero_diagonal(std::vector<float> & matrix, size_t rows, size_t cols);
|
|
101
|
+
|
|
102
|
+
const ggml_tensor * get_inp_pos_tensor(struct ggml_cgraph * cgraph);
|
|
103
|
+
|
|
104
|
+
bool get_is_prefill(const ggml_tensor * inp_pos);
|
|
105
|
+
|
|
106
|
+
ov::Tensor get_ov_input_tensor(std::shared_ptr<GgmlOvDecoder> ggml_decoder, const std::string & param_name);
|
|
107
|
+
ov::Tensor get_ov_input_tensor_static_decode(std::shared_ptr<GgmlOvDecoder> ggml_decoder,
|
|
108
|
+
const std::string & param_name);
|
|
109
|
+
ov::Tensor get_ov_input_tensor_static_prefill(std::shared_ptr<GgmlOvDecoder> ggml_decoder,
|
|
110
|
+
const std::string & param_name,
|
|
111
|
+
int chunk_index);
|
|
112
|
+
|
|
113
|
+
ov::Tensor create_ov_output_tensor(std::shared_ptr<GgmlOvDecoder> ggml_decoder,
|
|
114
|
+
std::shared_ptr<ov::InferRequest> infer_request,
|
|
115
|
+
int output_index,
|
|
116
|
+
const ggml_tensor * ggml_tensor);
|
|
117
|
+
|
|
118
|
+
bool is_naive(struct ggml_cgraph * cgraph);
|
|
119
|
+
|
|
120
|
+
enum ggml_status naive_compute(struct ggml_cgraph * cgraph,
|
|
121
|
+
ov::Core & core,
|
|
122
|
+
const std::string & device,
|
|
123
|
+
const ov::AnyMap & config);
|
|
@@ -304,6 +304,41 @@ void quantize_row_mxfp4_ref(const float * GGML_RESTRICT x, block_mxfp4 * GGML_RE
|
|
|
304
304
|
}
|
|
305
305
|
}
|
|
306
306
|
|
|
307
|
+
// Reference (scalar) quantization of k floats from x into NVFP4 blocks in y.
// Every block of QK_NVFP4 values is processed as QK_NVFP4/QK_NVFP4_SUB
// sub-blocks; each sub-block stores its own UE4M3 scale in d[] and packs two
// 4-bit codes per byte into qs[]. k must be a multiple of QK_NVFP4.
void quantize_row_nvfp4_ref(const float * GGML_RESTRICT x, block_nvfp4 * GGML_RESTRICT y, int64_t k) {
    static const int qk     = QK_NVFP4;
    static const int qk_sub = QK_NVFP4_SUB;
    static const int n_sub  = QK_NVFP4 / QK_NVFP4_SUB;

    assert(k % qk == 0);

    // Block count and index stay 64-bit: quantize_nvfp4 passes nrow*n_per_row,
    // so k can exceed INT_MAX and i*qk must not be evaluated in int.
    const int64_t nb = k / qk;

    for (int64_t i = 0; i < nb; i++) {
        for (int s = 0; s < n_sub; s++) {
            const float * xb = x + i*qk + s*qk_sub;

            // largest magnitude in this sub-block
            float amax = 0.0f;
            for (int j = 0; j < qk_sub; j++) {
                const float ax = fabsf(xb[j]);
                if (amax < ax) {
                    amax = ax;
                }
            }

            // UE4M3 scale: amax / 6.0 maps the max E2M1 value (6.0) to amax
            const uint8_t ue = ggml_fp32_to_ue4m3(amax / 6.0f);
            y[i].d[s] = ue;
            // select codes against the decoded scale, i.e. the value recovered from ue
            const float d = ggml_ue4m3_to_fp32(ue);

            for (int j = 0; j < qk_sub/2; ++j) {
                const uint8_t x0 = best_index_mxfp4(xb[0        + j], d);
                const uint8_t x1 = best_index_mxfp4(xb[qk_sub/2 + j], d);

                // low nibble: first half of the sub-block, high nibble: second half
                y[i].qs[s*(qk_sub/2) + j] = x0 | (x1 << 4);
            }
        }
    }
}
|
|
341
|
+
|
|
307
342
|
void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
|
|
308
343
|
static const int qk = QK4_0;
|
|
309
344
|
|
|
@@ -434,6 +469,31 @@ void dequantize_row_mxfp4(const block_mxfp4 * GGML_RESTRICT x, float * GGML_REST
|
|
|
434
469
|
}
|
|
435
470
|
}
|
|
436
471
|
|
|
472
|
+
// Expands k values stored as NVFP4 blocks in x back to floats in y.
// For every sub-block the UE4M3 scale in d[] is decoded once and multiplied
// with the kvalues_mxfp4 entry selected by each 4-bit code.
// k must be a multiple of QK_NVFP4.
void dequantize_row_nvfp4(const block_nvfp4 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
    static const int qk     = QK_NVFP4;
    static const int qk_sub = QK_NVFP4_SUB;
    static const int n_sub  = QK_NVFP4 / QK_NVFP4_SUB;

    assert(k % qk == 0);

    // Keep the block count and index 64-bit so k is not truncated and the
    // output offset i*qk is not computed in int.
    const int64_t nb = k / qk;

    for (int64_t i = 0; i < nb; i++) {
        for (int s = 0; s < n_sub; s++) {
            const float d  = ggml_ue4m3_to_fp32(x[i].d[s]);
            float *     yb = y + i*qk + s*qk_sub;

            for (int j = 0; j < qk_sub/2; ++j) {
                const int q = s*(qk_sub/2) + j;  // byte holding codes j and j + qk_sub/2

                const int8_t v0 = kvalues_mxfp4[x[i].qs[q] & 0x0F];
                const int8_t v1 = kvalues_mxfp4[x[i].qs[q] >>   4];

                // low nibble -> first half of the sub-block, high nibble -> second half
                yb[j + 0       ] = v0*d;
                yb[j + qk_sub/2] = v1*d;
            }
        }
    }
}
|
|
496
|
+
|
|
437
497
|
//
|
|
438
498
|
// 2-6 bit quantization in super-blocks
|
|
439
499
|
//
|
|
@@ -2098,6 +2158,12 @@ size_t quantize_mxfp4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst,
|
|
|
2098
2158
|
return nrow * ggml_row_size(GGML_TYPE_MXFP4, n_per_row);
|
|
2099
2159
|
}
|
|
2100
2160
|
|
|
2161
|
+
// Quantizes nrow rows of n_per_row floats from src into NVFP4 data at dst and
// returns the number of bytes covered by those rows. quant_weights is ignored.
size_t quantize_nvfp4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
    GGML_UNUSED(quant_weights);

    // all rows are handed to the reference quantizer as one contiguous run
    const int64_t n_values = (int64_t)nrow*n_per_row;
    quantize_row_nvfp4_ref(src, dst, n_values);

    const size_t row_bytes = ggml_row_size(GGML_TYPE_NVFP4, n_per_row);
    return nrow * row_bytes;
}
|
|
2166
|
+
|
|
2101
2167
|
// ====================== Ternary (de)-quantization (BitNet b1.58 and TriLMs)
|
|
2102
2168
|
|
|
2103
2169
|
void quantize_row_tq1_0_ref(const float * GGML_RESTRICT x, block_tq1_0 * GGML_RESTRICT y, int64_t k) {
|
|
@@ -3104,6 +3170,11 @@ static void quantize_row_iq2_xxs_impl(const float * GGML_RESTRICT x, void * GGML
|
|
|
3104
3170
|
}
|
|
3105
3171
|
float scale = make_qp_quants(32, kMaxQ+1, xval, (uint8_t*)L, weight);
|
|
3106
3172
|
float eff_max = scale*kMaxQ;
|
|
3173
|
+
if (eff_max <= 0) {
|
|
3174
|
+
scales[ib] = 0;
|
|
3175
|
+
memset(L, 0, 32);
|
|
3176
|
+
continue;
|
|
3177
|
+
}
|
|
3107
3178
|
float best = 0;
|
|
3108
3179
|
for (int is = -6; is <= 6; ++is) {
|
|
3109
3180
|
float id = (2*kMaxQ-1+is*0.1f)/eff_max;
|
|
@@ -3273,9 +3344,9 @@ static void quantize_row_iq2_xs_impl(const float * GGML_RESTRICT x, void * GGML_
|
|
|
3273
3344
|
}
|
|
3274
3345
|
float max = xval[0];
|
|
3275
3346
|
for (int i = 1; i < 16; ++i) max = MAX(max, xval[i]);
|
|
3347
|
+
memset(L, 0, 16);
|
|
3276
3348
|
if (max < GROUP_MAX_EPS) {
|
|
3277
3349
|
scales[ib] = 0;
|
|
3278
|
-
memset(L, 0, 16);
|
|
3279
3350
|
continue;
|
|
3280
3351
|
}
|
|
3281
3352
|
float best = 0;
|
|
@@ -3714,9 +3785,9 @@ static void quantize_row_iq3_xxs_impl(int grid_size, const float * GGML_RESTRICT
|
|
|
3714
3785
|
}
|
|
3715
3786
|
float max = xval[0];
|
|
3716
3787
|
for (int i = 1; i < 32; ++i) max = MAX(max, xval[i]);
|
|
3788
|
+
memset(L, 0, 32);
|
|
3717
3789
|
if (max < GROUP_MAX_EPS_IQ3_XXS) {
|
|
3718
3790
|
scales[ib] = 0;
|
|
3719
|
-
memset(L, 0, 32);
|
|
3720
3791
|
continue;
|
|
3721
3792
|
}
|
|
3722
3793
|
float best = 0;
|
|
@@ -3922,6 +3993,7 @@ static void quantize_row_iq3_s_impl(int block_size, const float * GGML_RESTRICT
|
|
|
3922
3993
|
}
|
|
3923
3994
|
float max = xval[0];
|
|
3924
3995
|
for (int i = 1; i < block_size; ++i) max = MAX(max, xval[i]);
|
|
3996
|
+
memset(L, 0, block_size);
|
|
3925
3997
|
if (!max) {
|
|
3926
3998
|
scales[ib] = 0;
|
|
3927
3999
|
continue;
|
|
@@ -4245,6 +4317,7 @@ static void quantize_row_iq1_s_impl(const float * GGML_RESTRICT x, void * GGML_R
|
|
|
4245
4317
|
for (int i = 1; i < block_size; ++i) max = MAX(max, fabsf(xb[i]));
|
|
4246
4318
|
if (max < GROUP_MAX_EPS_IQ1_S) {
|
|
4247
4319
|
scales[ib] = 0;
|
|
4320
|
+
shifts[ib] = 1;
|
|
4248
4321
|
memset(L, 1, block_size);
|
|
4249
4322
|
continue;
|
|
4250
4323
|
}
|
|
@@ -4285,7 +4358,12 @@ static void quantize_row_iq1_s_impl(const float * GGML_RESTRICT x, void * GGML_R
|
|
|
4285
4358
|
}
|
|
4286
4359
|
}
|
|
4287
4360
|
}
|
|
4288
|
-
|
|
4361
|
+
if (besti1 < 0 || besti2 < 0 || best_shift == 0) {
|
|
4362
|
+
scales[ib] = 0;
|
|
4363
|
+
shifts[ib] = 1;
|
|
4364
|
+
memset(L, 1, block_size);
|
|
4365
|
+
continue;
|
|
4366
|
+
}
|
|
4289
4367
|
for (int j = 0; j < besti1; ++j) L[idx[2*j]] = 0;
|
|
4290
4368
|
for (int j = besti1; j < besti2; ++j) L[idx[2*j]] = 1;
|
|
4291
4369
|
for (int j = besti2; j < block_size; ++j) L[idx[2*j]] = 2;
|
|
@@ -4429,6 +4507,7 @@ static void quantize_row_iq1_m_impl(const float * GGML_RESTRICT x, void * GGML_R
|
|
|
4429
4507
|
for (int i = 1; i < block_size; ++i) max = MAX(max, fabsf(xb[i]));
|
|
4430
4508
|
if (max < GROUP_MAX_EPS_IQ1_M) {
|
|
4431
4509
|
scales[ib] = 0;
|
|
4510
|
+
shifts[ib] = 0;
|
|
4432
4511
|
memset(L, 1, block_size);
|
|
4433
4512
|
continue;
|
|
4434
4513
|
}
|
|
@@ -4527,7 +4606,12 @@ static void quantize_row_iq1_m_impl(const float * GGML_RESTRICT x, void * GGML_R
|
|
|
4527
4606
|
}
|
|
4528
4607
|
}
|
|
4529
4608
|
}
|
|
4530
|
-
|
|
4609
|
+
if (besti1 < 0 || besti2 < 0 || best_k < 0) {
|
|
4610
|
+
scales[ib] = 0;
|
|
4611
|
+
shifts[ib] = 0;
|
|
4612
|
+
memset(L, 1, block_size);
|
|
4613
|
+
continue;
|
|
4614
|
+
}
|
|
4531
4615
|
for (int j = 0; j < besti1; ++j) L[idx[2*j]] = 0;
|
|
4532
4616
|
for (int j = besti1; j < besti2; ++j) L[idx[2*j]] = 1;
|
|
4533
4617
|
for (int j = besti2; j < block_size; ++j) L[idx[2*j]] = 2;
|
|
@@ -4683,7 +4767,7 @@ static void quantize_row_iq4_nl_impl(const int super_block_size, const int block
|
|
|
4683
4767
|
sumqx += w*q*xb[j];
|
|
4684
4768
|
sumq2 += w*q*q;
|
|
4685
4769
|
}
|
|
4686
|
-
d = sumqx/sumq2;
|
|
4770
|
+
d = sumq2 > 0 ? sumqx/sumq2 : 0.f;
|
|
4687
4771
|
float best = d*sumqx;
|
|
4688
4772
|
for (int itry = -ntry; itry <= ntry; ++itry) {
|
|
4689
4773
|
id = (itry + values[0])/max;
|
|
@@ -4874,6 +4958,7 @@ static void quantize_row_iq2_s_impl(const float * GGML_RESTRICT x, void * GGML_R
|
|
|
4874
4958
|
}
|
|
4875
4959
|
float max = xval[0];
|
|
4876
4960
|
for (int i = 1; i < 16; ++i) max = MAX(max, xval[i]);
|
|
4961
|
+
memset(L, 0, 16);
|
|
4877
4962
|
if (max < GROUP_MAX_EPS_IQ2_S) {
|
|
4878
4963
|
scales[ib] = 0;
|
|
4879
4964
|
continue;
|
|
@@ -5225,6 +5310,12 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte
|
|
|
5225
5310
|
{
|
|
5226
5311
|
VALIDATE_ROW_DATA_E_E8M0_IMPL(block_mxfp4, data, nb);
|
|
5227
5312
|
} break;
|
|
5313
|
+
case GGML_TYPE_NVFP4:
|
|
5314
|
+
{
|
|
5315
|
+
// UE4M3 scales are uint8_t — all byte values are valid
|
|
5316
|
+
GGML_UNUSED(data);
|
|
5317
|
+
GGML_UNUSED(nb);
|
|
5318
|
+
} break;
|
|
5228
5319
|
case GGML_TYPE_Q2_K:
|
|
5229
5320
|
{
|
|
5230
5321
|
VALIDATE_ROW_DATA_DM_F16_IMPL(block_q2_K, data, nb, d, dmin);
|
|
@@ -22,6 +22,7 @@ GGML_API void quantize_row_q8_0_ref(const float * GGML_RESTRICT x, block_q8_0 *
|
|
|
22
22
|
GGML_API void quantize_row_q8_1_ref(const float * GGML_RESTRICT x, block_q8_1 * GGML_RESTRICT y, int64_t k);
|
|
23
23
|
|
|
24
24
|
GGML_API void quantize_row_mxfp4_ref(const float * GGML_RESTRICT x, block_mxfp4 * GGML_RESTRICT y, int64_t k);
|
|
25
|
+
GGML_API void quantize_row_nvfp4_ref(const float * GGML_RESTRICT x, block_nvfp4 * GGML_RESTRICT y, int64_t k);
|
|
25
26
|
|
|
26
27
|
GGML_API void quantize_row_q2_K_ref(const float * GGML_RESTRICT x, block_q2_K * GGML_RESTRICT y, int64_t k);
|
|
27
28
|
GGML_API void quantize_row_q3_K_ref(const float * GGML_RESTRICT x, block_q3_K * GGML_RESTRICT y, int64_t k);
|
|
@@ -48,6 +49,7 @@ GGML_API void dequantize_row_q8_0(const block_q8_0 * GGML_RESTRICT x, float * GG
|
|
|
48
49
|
//GGML_API void dequantize_row_q8_1(const block_q8_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
|
49
50
|
|
|
50
51
|
GGML_API void dequantize_row_mxfp4(const block_mxfp4 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
|
52
|
+
GGML_API void dequantize_row_nvfp4(const block_nvfp4 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
|
51
53
|
|
|
52
54
|
GGML_API void dequantize_row_q2_K(const block_q2_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
|
53
55
|
GGML_API void dequantize_row_q3_K(const block_q3_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
|
@@ -95,6 +97,7 @@ GGML_API size_t quantize_q5_1(const float * GGML_RESTRICT src, void * GGML_RESTR
|
|
|
95
97
|
GGML_API size_t quantize_q8_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
|
96
98
|
|
|
97
99
|
GGML_API size_t quantize_mxfp4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
|
100
|
+
GGML_API size_t quantize_nvfp4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
|
98
101
|
|
|
99
102
|
GGML_API void iq2xs_init_impl(enum ggml_type type);
|
|
100
103
|
GGML_API void iq2xs_free_impl(enum ggml_type type);
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
message(STATUS "GGML_SYCL_TARGET=${GGML_SYCL_TARGET}")
|
|
2
2
|
|
|
3
|
-
if (NOT GGML_SYCL_TARGET MATCHES "^(INTEL
|
|
4
|
-
message(FATAL_ERROR "Invalid
|
|
3
|
+
if (NOT GGML_SYCL_TARGET MATCHES "^(INTEL)$")
|
|
4
|
+
message(FATAL_ERROR "GGML_SYCL_TARGET: Invalid target, the supported options are [INTEL]")
|
|
5
5
|
endif()
|
|
6
6
|
|
|
7
7
|
check_cxx_compiler_flag("-fsycl" SUPPORTS_SYCL)
|
|
@@ -25,6 +25,11 @@ ggml_add_backend_library(ggml-sycl
|
|
|
25
25
|
|
|
26
26
|
file(GLOB GGML_HEADERS_SYCL "*.hpp")
|
|
27
27
|
file(GLOB GGML_SOURCES_SYCL "*.cpp")
|
|
28
|
+
file(GLOB SRCS "template-instances/fattn-tile*.cpp")
|
|
29
|
+
list(APPEND GGML_SOURCES_SYCL ${SRCS})
|
|
30
|
+
file(GLOB SRCS "template-instances/fattn-vec*.cpp")
|
|
31
|
+
list(APPEND GGML_SOURCES_SYCL ${SRCS})
|
|
32
|
+
|
|
28
33
|
target_sources(ggml-sycl PRIVATE ${GGML_HEADERS_SYCL} ${GGML_SOURCES_SYCL})
|
|
29
34
|
|
|
30
35
|
if (WIN32)
|
|
@@ -125,106 +130,28 @@ endif()
|
|
|
125
130
|
target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_DNNL=${GGML_SYCL_DNNL})
|
|
126
131
|
|
|
127
132
|
if (GGML_SYCL_F16)
|
|
128
|
-
if (GGML_SYCL_TARGET STREQUAL "AMD")
|
|
129
|
-
message(WARNING "AMD target does not entirely support FP16 in the SYCL backend.")
|
|
130
|
-
endif()
|
|
131
133
|
add_compile_definitions(GGML_SYCL_F16)
|
|
132
134
|
endif()
|
|
133
135
|
|
|
134
136
|
if (GGML_SYCL_TARGET STREQUAL "INTEL")
|
|
135
137
|
add_compile_definitions(GGML_SYCL_WARP_SIZE=16)
|
|
136
138
|
target_link_options(ggml-sycl PRIVATE -Xs -ze-intel-greater-than-4GB-buffer-required)
|
|
137
|
-
elseif (GGML_SYCL_TARGET STREQUAL "NVIDIA")
|
|
138
|
-
add_compile_definitions(GGML_SYCL_WARP_SIZE=32)
|
|
139
|
-
elseif (GGML_SYCL_TARGET STREQUAL "AMD")
|
|
140
|
-
# INFO: Allowed Sub_group_sizes are not consistent through all
|
|
141
|
-
# hip targets. For example, 64 is used for certain models, but the backend
|
|
142
|
-
# does not support it.
|
|
143
|
-
# Target archs tested working: gfx1030, gfx1031, (Only tested sub_group_size = 32)
|
|
144
|
-
add_compile_definitions(GGML_SYCL_WARP_SIZE=32)
|
|
145
|
-
else()
|
|
146
|
-
# default for other target
|
|
147
|
-
add_compile_definitions(GGML_SYCL_WARP_SIZE=32)
|
|
148
|
-
endif()
|
|
149
|
-
|
|
150
|
-
if (GGML_SYCL_GRAPH)
|
|
151
|
-
target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_GRAPH)
|
|
152
|
-
endif()
|
|
153
139
|
|
|
154
|
-
# Link against Intel oneMKL
|
|
155
|
-
if (GGML_SYCL_TARGET STREQUAL "INTEL")
|
|
156
|
-
# Intel devices use Intel oneMKL directly instead of oneMath to avoid the limitation of linking Intel oneMKL statically
|
|
157
|
-
# See https://github.com/uxlfoundation/oneMath/issues/654
|
|
140
|
+
# Link against Intel oneMKL
|
|
158
141
|
if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
|
|
159
142
|
set(SYCL_COMPILER ON)
|
|
160
143
|
endif()
|
|
161
144
|
find_package(MKL REQUIRED)
|
|
162
145
|
target_link_libraries(ggml-sycl PRIVATE MKL::MKL_SYCL::BLAS)
|
|
163
|
-
target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_USE_INTEL_ONEMKL)
|
|
164
146
|
else()
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
include(FetchContent)
|
|
170
|
-
set(BUILD_FUNCTIONAL_TESTS False)
|
|
171
|
-
set(BUILD_EXAMPLES False)
|
|
172
|
-
set(TARGET_DOMAINS blas)
|
|
173
|
-
if (GGML_SYCL_TARGET STREQUAL "NVIDIA")
|
|
174
|
-
set(ENABLE_MKLCPU_BACKEND False)
|
|
175
|
-
set(ENABLE_MKLGPU_BACKEND False)
|
|
176
|
-
set(ENABLE_CUBLAS_BACKEND True)
|
|
177
|
-
elseif (GGML_SYCL_TARGET STREQUAL "AMD")
|
|
178
|
-
set(ENABLE_MKLCPU_BACKEND False)
|
|
179
|
-
set(ENABLE_MKLGPU_BACKEND False)
|
|
180
|
-
set(ENABLE_ROCBLAS_BACKEND True)
|
|
181
|
-
# Ensure setting a string variable here is not overriden by oneMath CACHE variables
|
|
182
|
-
cmake_policy(SET CMP0126 NEW)
|
|
183
|
-
# Setting the device architecture is only needed and useful for AMD devices in oneMath
|
|
184
|
-
set(HIP_TARGETS ${GGML_SYCL_DEVICE_ARCH} CACHE STRING "oneMath HIP target" FORCE)
|
|
185
|
-
endif()
|
|
186
|
-
FetchContent_Declare(
|
|
187
|
-
ONEMATH
|
|
188
|
-
GIT_REPOSITORY https://github.com/uxlfoundation/oneMath.git
|
|
189
|
-
GIT_TAG 8efe85f5aaebb37f1d8c503b7af66315feabf142
|
|
190
|
-
)
|
|
191
|
-
FetchContent_MakeAvailable(ONEMATH)
|
|
192
|
-
# Create alias to match with find_package targets name
|
|
193
|
-
function(onemath_alias target)
|
|
194
|
-
if (TARGET ${target}_obj)
|
|
195
|
-
# Silence verbose warnings from external libraries
|
|
196
|
-
target_compile_options(${target}_obj PRIVATE -w)
|
|
197
|
-
endif()
|
|
198
|
-
if (TARGET ${target})
|
|
199
|
-
add_library(ONEMATH::${target} ALIAS ${target})
|
|
200
|
-
endif()
|
|
201
|
-
endfunction()
|
|
202
|
-
onemath_alias(onemath)
|
|
203
|
-
onemath_alias(onemath_blas_mklcpu)
|
|
204
|
-
onemath_alias(onemath_blas_mklgpu)
|
|
205
|
-
onemath_alias(onemath_blas_cublas)
|
|
206
|
-
onemath_alias(onemath_blas_rocblas)
|
|
207
|
-
endif()
|
|
147
|
+
# default for other target
|
|
148
|
+
message(FATAL_ERROR "GGML_SYCL_TARGET is not supported")
|
|
149
|
+
add_compile_definitions(GGML_SYCL_WARP_SIZE=32)
|
|
150
|
+
endif()
|
|
208
151
|
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
target_compile_options(ggml-sycl PRIVATE "-fsycl-targets=nvptx64-nvidia-cuda")
|
|
213
|
-
target_link_options(ggml-sycl PRIVATE "-fsycl-targets=nvptx64-nvidia-cuda")
|
|
214
|
-
target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_NVIDIA)
|
|
215
|
-
elseif (GGML_SYCL_TARGET STREQUAL "AMD")
|
|
216
|
-
if (NOT GGML_SYCL_DEVICE_ARCH)
|
|
217
|
-
message(FATAL_ERROR "Can't enable SYCL hip backend, GGML_SYCL_DEVICE_ARCH has not been set.")
|
|
218
|
-
endif()
|
|
219
|
-
target_link_libraries(ggml-sycl PRIVATE ONEMATH::onemath_blas_rocblas)
|
|
220
|
-
target_compile_options(ggml-sycl PRIVATE "-fsycl-targets=amdgcn-amd-amdhsa")
|
|
221
|
-
target_link_options(ggml-sycl PRIVATE "-fsycl-targets=amdgcn-amd-amdhsa")
|
|
222
|
-
target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_AMD)
|
|
223
|
-
else()
|
|
224
|
-
# Fallback to oneMath runtime dispatcher
|
|
225
|
-
target_link_libraries(ggml-sycl PRIVATE ONEMATH::onemath)
|
|
226
|
-
target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_GENERIC)
|
|
227
|
-
endif()
|
|
152
|
+
if (GGML_SYCL_GRAPH)
|
|
153
|
+
message(STATUS "find GGML_SYCL_GRAPH")
|
|
154
|
+
target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_GRAPH)
|
|
228
155
|
endif()
|
|
229
156
|
|
|
230
157
|
if (GGML_SYCL_DEVICE_ARCH)
|
|
@@ -55,7 +55,11 @@ void ggml_sycl_add_id(ggml_backend_sycl_context& ctx, ggml_tensor* dst) {
|
|
|
55
55
|
const int32_t* src2_d = (const int32_t*)src2->data;
|
|
56
56
|
float* dst_d = (float*)dst->data;
|
|
57
57
|
|
|
58
|
-
int
|
|
58
|
+
const unsigned int max_work_group_size = ggml_sycl_info().max_work_group_sizes[ctx.device];
|
|
59
|
+
assert(work_group_size % (WARP_SIZE * WARP_SIZE) == 0);
|
|
60
|
+
|
|
61
|
+
int threads = std::min((unsigned int)ne00, max_work_group_size); // cols
|
|
62
|
+
|
|
59
63
|
ctx.stream()->parallel_for(
|
|
60
64
|
sycl::nd_range<3>(
|
|
61
65
|
sycl::range<3>(1, ne02, ne01) * sycl::range<3>(1, 1, threads),
|
|
@@ -11,8 +11,8 @@ static void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst_t * dst,
|
|
|
11
11
|
int ne0, int ne1, int ne2, int ne3,
|
|
12
12
|
int ne10, int ne11, int ne12, int ne13,
|
|
13
13
|
/*int s0, */ int s1, int s2, int s3,
|
|
14
|
-
|
|
15
|
-
|
|
14
|
+
int s00, int s01, int s02, int s03,
|
|
15
|
+
int s10, int s11, int s12, int s13,
|
|
16
16
|
const sycl::nd_item<3> &item_ct1) {
|
|
17
17
|
const int i0s = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
|
18
18
|
item_ct1.get_local_id(2);
|
|
@@ -44,7 +44,7 @@ static void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst_t * dst,
|
|
|
44
44
|
for (int i0 = i0s; i0 < ne0;
|
|
45
45
|
i0 += item_ct1.get_local_range(2) * item_ct1.get_group_range(2)) {
|
|
46
46
|
const int i10 = i0 % ne10;
|
|
47
|
-
dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
|
|
47
|
+
dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0*s00] : 0.0f, (float)src1_row[i10*s10]);
|
|
48
48
|
}
|
|
49
49
|
}
|
|
50
50
|
|
|
@@ -53,8 +53,8 @@ static void k_bin_bcast_unravel(const src0_t * src0, const src1_t * src1, dst_t
|
|
|
53
53
|
int ne0, int ne1, int ne2, int ne3,
|
|
54
54
|
int ne10, int ne11, int ne12, int ne13,
|
|
55
55
|
/*int s0, */ int s1, int s2, int s3,
|
|
56
|
-
|
|
57
|
-
|
|
56
|
+
int s00, int s01, int s02, int s03,
|
|
57
|
+
int s10, int s11, int s12, int s13,
|
|
58
58
|
const sycl::nd_item<3> &item_ct1) {
|
|
59
59
|
|
|
60
60
|
const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
|
@@ -82,7 +82,7 @@ static void k_bin_bcast_unravel(const src0_t * src0, const src1_t * src1, dst_t
|
|
|
82
82
|
dst_t * dst_row = dst + i_dst;
|
|
83
83
|
|
|
84
84
|
const int i10 = i0 % ne10;
|
|
85
|
-
dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
|
|
85
|
+
dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0*s00] : 0.0f, (float)src1_row[i10*s10]);
|
|
86
86
|
}
|
|
87
87
|
|
|
88
88
|
|
|
@@ -95,7 +95,8 @@ struct bin_bcast_sycl {
|
|
|
95
95
|
const int64_t ne3, const size_t nb00, const size_t nb01, const size_t nb02, const size_t nb03,
|
|
96
96
|
const size_t nb10, const size_t nb11, const size_t nb12, const size_t nb13, const size_t nb0,
|
|
97
97
|
const size_t nb1, const size_t nb2, const size_t nb3, const bool src0_is_contiguous,
|
|
98
|
-
const bool src1_is_contiguous, const bool
|
|
98
|
+
const bool src1_is_contiguous, const bool src0_is_permuted, const bool src1_is_permuted,
|
|
99
|
+
queue_ptr stream) {
|
|
99
100
|
int nr0 = ne10 / ne0;
|
|
100
101
|
int nr1 = ne11/ne1;
|
|
101
102
|
int nr2 = ne12/ne2;
|
|
@@ -123,7 +124,7 @@ struct bin_bcast_sycl {
|
|
|
123
124
|
cnb[3] *= cne[3];
|
|
124
125
|
};
|
|
125
126
|
|
|
126
|
-
if (src0_is_contiguous && src1_is_contiguous &&
|
|
127
|
+
if (src0_is_contiguous && src1_is_contiguous && !src0_is_permuted && !src1_is_permuted) {
|
|
127
128
|
for (int i = 0; i < 4; i++) {
|
|
128
129
|
if (nr[i] != 1) {
|
|
129
130
|
break;
|
|
@@ -164,7 +165,7 @@ struct bin_bcast_sycl {
|
|
|
164
165
|
size_t nb12 = cnb1[2];
|
|
165
166
|
size_t nb13 = cnb1[3];
|
|
166
167
|
|
|
167
|
-
size_t s0 = nb0 / sizeof(dst_t);
|
|
168
|
+
// size_t s0 = nb0 / sizeof(dst_t);
|
|
168
169
|
size_t s1 = nb1 / sizeof(dst_t);
|
|
169
170
|
size_t s2 = nb2 / sizeof(dst_t);
|
|
170
171
|
size_t s3 = nb3 / sizeof(dst_t);
|
|
@@ -196,9 +197,6 @@ struct bin_bcast_sycl {
|
|
|
196
197
|
GGML_ASSERT(nb12 % sizeof(src1_t) == 0);
|
|
197
198
|
GGML_ASSERT(nb13 % sizeof(src1_t) == 0);
|
|
198
199
|
|
|
199
|
-
GGML_ASSERT(s0 == 1);
|
|
200
|
-
GGML_ASSERT(s10 == 1);
|
|
201
|
-
|
|
202
200
|
const int block_size = 128;
|
|
203
201
|
|
|
204
202
|
int64_t hne0 = std::max(ne0/2LL, 1LL);
|
|
@@ -232,8 +230,8 @@ struct bin_bcast_sycl {
|
|
|
232
230
|
[=](sycl::nd_item<3> item_ct1) {
|
|
233
231
|
k_bin_bcast_unravel<bin_op>(
|
|
234
232
|
src0_dd, src1_dd, dst_dd, ne0, ne1, ne2, ne3,
|
|
235
|
-
ne10, ne11, ne12, ne13, s1, s2, s3, s01, s02,
|
|
236
|
-
s03, s11, s12, s13, item_ct1);
|
|
233
|
+
ne10, ne11, ne12, ne13, s1, s2, s3, s00, s01, s02,
|
|
234
|
+
s03, s10, s11, s12, s13, item_ct1);
|
|
237
235
|
});
|
|
238
236
|
}
|
|
239
237
|
} else {
|
|
@@ -251,7 +249,7 @@ struct bin_bcast_sycl {
|
|
|
251
249
|
[=](sycl::nd_item<3> item_ct1) {
|
|
252
250
|
k_bin_bcast<bin_op>(src0_dd, src1_dd, dst_dd, ne0, ne1,
|
|
253
251
|
ne2, ne3, ne10, ne11, ne12, ne13,
|
|
254
|
-
s1, s2, s3, s01, s02, s03, s11, s12, s13,
|
|
252
|
+
s1, s2, s3, s00, s01, s02, s03, s10, s11, s12, s13,
|
|
255
253
|
item_ct1);
|
|
256
254
|
});
|
|
257
255
|
}
|
|
@@ -268,24 +266,27 @@ inline void ggml_sycl_op_bin_bcast(ggml_backend_sycl_context & ctx, const ggml_t
|
|
|
268
266
|
if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
|
269
267
|
op()((const float *) src0->data, (const float *) src1->data, (float *) dst->data, ne00, ne01, ne02, ne03, ne10,
|
|
270
268
|
ne11, ne12, ne13, ne0, ne1, ne2, ne3, nb00, nb01, nb02, nb03, nb10, nb11, nb12, nb13, nb0, nb1, nb2, nb3,
|
|
271
|
-
ggml_is_contiguous(src0), ggml_is_contiguous(src1),
|
|
269
|
+
ggml_is_contiguous(src0), ggml_is_contiguous(src1), ggml_is_permuted(src0), ggml_is_permuted(src1), main_stream);
|
|
272
270
|
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
|
|
273
271
|
op()((const sycl::half *) src0->data, (const sycl::half *) src1->data, (sycl::half *) dst->data, ne00, ne01,
|
|
274
272
|
ne02, ne03, ne10, ne11, ne12, ne13, ne0, ne1, ne2, ne3, nb00, nb01, nb02, nb03, nb10, nb11, nb12, nb13,
|
|
275
|
-
nb0, nb1, nb2, nb3, ggml_is_contiguous(src0), ggml_is_contiguous(src1),
|
|
273
|
+
nb0, nb1, nb2, nb3, ggml_is_contiguous(src0), ggml_is_contiguous(src1), ggml_is_permuted(src0), ggml_is_permuted(src1),
|
|
276
274
|
main_stream);
|
|
277
275
|
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F16) {
|
|
278
276
|
op()((const sycl::half *) src0->data, (const float *) src1->data, (sycl::half *) dst->data, ne00, ne01, ne02,
|
|
279
277
|
ne03, ne10, ne11, ne12, ne13, ne0, ne1, ne2, ne3, nb00, nb01, nb02, nb03, nb10, nb11, nb12, nb13, nb0, nb1,
|
|
280
|
-
nb2, nb3, ggml_is_contiguous(src0), ggml_is_contiguous(src1),
|
|
278
|
+
nb2, nb3, ggml_is_contiguous(src0), ggml_is_contiguous(src1), ggml_is_permuted(src0), ggml_is_permuted(src1),
|
|
279
|
+
main_stream);
|
|
281
280
|
} else if (src0->type == GGML_TYPE_I32 && src1->type == GGML_TYPE_I32 && dst->type == GGML_TYPE_I32) {
|
|
282
281
|
op()((const int32_t *) src0->data, (const int32_t *) src1->data, (int32_t *) dst->data, ne00, ne01, ne02, ne03,
|
|
283
282
|
ne10, ne11, ne12, ne13, ne0, ne1, ne2, ne3, nb00, nb01, nb02, nb03, nb10, nb11, nb12, nb13, nb0, nb1, nb2,
|
|
284
|
-
nb3, ggml_is_contiguous(src0), ggml_is_contiguous(src1),
|
|
283
|
+
nb3, ggml_is_contiguous(src0), ggml_is_contiguous(src1), ggml_is_permuted(src0), ggml_is_permuted(src1),
|
|
284
|
+
main_stream);
|
|
285
285
|
} else if (src0->type == GGML_TYPE_I16 && src1->type == GGML_TYPE_I16 && dst->type == GGML_TYPE_I16) {
|
|
286
286
|
op()((const int16_t *) src0->data, (const int16_t *) src1->data, (int16_t *) dst->data, ne00, ne01, ne02, ne03,
|
|
287
287
|
ne10, ne11, ne12, ne13, ne0, ne1, ne2, ne3, nb00, nb01, nb02, nb03, nb10, nb11, nb12, nb13, nb0, nb1, nb2,
|
|
288
|
-
nb3, ggml_is_contiguous(src0), ggml_is_contiguous(src1),
|
|
288
|
+
nb3, ggml_is_contiguous(src0), ggml_is_contiguous(src1), ggml_is_permuted(src0), ggml_is_permuted(src1),
|
|
289
|
+
main_stream);
|
|
289
290
|
} else {
|
|
290
291
|
fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__, ggml_type_name(dst->type),
|
|
291
292
|
ggml_type_name(src0->type), ggml_type_name(src1->type));
|