whispercpp 1.3.2 → 1.3.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +6 -3
- data/README.md +71 -14
- data/Rakefile +20 -7
- data/ext/.gitignore +4 -6
- data/ext/dependencies.rb +36 -24
- data/ext/extconf.rb +1 -1
- data/ext/options.rb +48 -184
- data/ext/ruby_whisper.c +18 -0
- data/ext/ruby_whisper_context.c +43 -12
- data/ext/ruby_whisper_model.c +1 -1
- data/ext/ruby_whisper_params.c +59 -27
- data/ext/ruby_whisper_segment.c +81 -4
- data/ext/ruby_whisper_transcribe.cpp +13 -7
- data/ext/ruby_whisper_vad_params.c +1 -1
- data/ext/sources/CMakeLists.txt +5 -1
- data/ext/sources/bindings/javascript/package.json +1 -1
- data/ext/sources/build-xcframework.sh +24 -0
- data/ext/sources/examples/CMakeLists.txt +1 -0
- data/ext/sources/examples/addon.node/__test__/whisper.spec.js +120 -24
- data/ext/sources/examples/addon.node/addon.cpp +154 -35
- data/ext/sources/examples/addon.node/index.js +10 -5
- data/ext/sources/examples/addon.node/vad-example.js +132 -0
- data/ext/sources/examples/bench/bench.cpp +29 -18
- data/ext/sources/examples/bench.wasm/index-tmpl.html +10 -9
- data/ext/sources/examples/cli/cli.cpp +7 -4
- data/ext/sources/examples/command/command.cpp +58 -32
- data/ext/sources/examples/command.wasm/index-tmpl.html +5 -4
- data/ext/sources/examples/common-ggml.cpp +2 -0
- data/ext/sources/examples/common-whisper.cpp +14 -7
- data/ext/sources/examples/lsp/lsp.cpp +21 -17
- data/ext/sources/examples/quantize/quantize.cpp +3 -0
- data/ext/sources/examples/server/CMakeLists.txt +3 -0
- data/ext/sources/examples/server/server.cpp +193 -35
- data/ext/sources/examples/server.py +6 -1
- data/ext/sources/examples/stream/stream.cpp +10 -2
- data/ext/sources/examples/stream.wasm/emscripten.cpp +6 -6
- data/ext/sources/examples/stream.wasm/index-tmpl.html +82 -5
- data/ext/sources/examples/talk-llama/CMakeLists.txt +3 -0
- data/ext/sources/examples/talk-llama/llama-adapter.cpp +101 -4
- data/ext/sources/examples/talk-llama/llama-adapter.h +6 -0
- data/ext/sources/examples/talk-llama/llama-arch.cpp +756 -15
- data/ext/sources/examples/talk-llama/llama-arch.h +85 -1
- data/ext/sources/examples/talk-llama/llama-batch.cpp +773 -272
- data/ext/sources/examples/talk-llama/llama-batch.h +126 -55
- data/ext/sources/examples/talk-llama/llama-chat.cpp +150 -13
- data/ext/sources/examples/talk-llama/llama-chat.h +8 -0
- data/ext/sources/examples/talk-llama/llama-context.cpp +814 -542
- data/ext/sources/examples/talk-llama/llama-context.h +68 -32
- data/ext/sources/examples/talk-llama/llama-cparams.cpp +1 -1
- data/ext/sources/examples/talk-llama/llama-cparams.h +4 -4
- data/ext/sources/examples/talk-llama/llama-graph.cpp +787 -440
- data/ext/sources/examples/talk-llama/llama-graph.h +333 -153
- data/ext/sources/examples/talk-llama/llama-hparams.cpp +128 -6
- data/ext/sources/examples/talk-llama/llama-hparams.h +80 -17
- data/ext/sources/examples/talk-llama/llama-impl.h +2 -0
- data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.cpp +326 -0
- data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.h +137 -0
- data/ext/sources/examples/talk-llama/llama-kv-cache.cpp +1248 -1967
- data/ext/sources/examples/talk-llama/llama-kv-cache.h +218 -345
- data/ext/sources/examples/talk-llama/llama-kv-cells.h +164 -52
- data/ext/sources/examples/talk-llama/llama-memory-hybrid.cpp +266 -0
- data/ext/sources/examples/talk-llama/llama-memory-hybrid.h +139 -0
- data/ext/sources/examples/talk-llama/llama-memory-recurrent.cpp +1154 -0
- data/ext/sources/examples/talk-llama/llama-memory-recurrent.h +182 -0
- data/ext/sources/examples/talk-llama/llama-memory.cpp +58 -0
- data/ext/sources/examples/talk-llama/llama-memory.h +94 -4
- data/ext/sources/examples/talk-llama/llama-mmap.cpp +1 -1
- data/ext/sources/examples/talk-llama/llama-model-loader.cpp +44 -17
- data/ext/sources/examples/talk-llama/llama-model-loader.h +3 -2
- data/ext/sources/examples/talk-llama/llama-model-saver.cpp +1 -0
- data/ext/sources/examples/talk-llama/llama-model.cpp +11377 -5248
- data/ext/sources/examples/talk-llama/llama-model.h +87 -9
- data/ext/sources/examples/talk-llama/llama-quant.cpp +137 -16
- data/ext/sources/examples/talk-llama/llama-sampling.cpp +226 -126
- data/ext/sources/examples/talk-llama/llama-vocab.cpp +502 -38
- data/ext/sources/examples/talk-llama/llama-vocab.h +46 -0
- data/ext/sources/examples/talk-llama/llama.cpp +76 -17
- data/ext/sources/examples/talk-llama/llama.h +176 -151
- data/ext/sources/examples/talk-llama/talk-llama.cpp +11 -6
- data/ext/sources/examples/talk-llama/unicode.cpp +212 -0
- data/ext/sources/examples/talk-llama/unicode.h +45 -0
- data/ext/sources/examples/vad-speech-segments/speech.cpp +6 -0
- data/ext/sources/examples/wchess/wchess.cmd/wchess.cmd.cpp +6 -2
- data/ext/sources/examples/whisper.wasm/index-tmpl.html +17 -16
- data/ext/sources/ggml/CMakeLists.txt +106 -33
- data/ext/sources/ggml/cmake/common.cmake +24 -0
- data/ext/sources/ggml/cmake/ggml-config.cmake.in +132 -93
- data/ext/sources/ggml/include/ggml-backend.h +18 -2
- data/ext/sources/ggml/include/ggml-cpu.h +2 -0
- data/ext/sources/ggml/include/ggml-metal.h +1 -6
- data/ext/sources/ggml/include/ggml-opt.h +25 -6
- data/ext/sources/ggml/include/ggml-webgpu.h +19 -0
- data/ext/sources/ggml/include/ggml-zdnn.h +17 -0
- data/ext/sources/ggml/include/ggml.h +365 -21
- data/ext/sources/ggml/src/CMakeLists.txt +98 -25
- data/ext/sources/ggml/src/ggml-alloc.c +265 -141
- data/ext/sources/ggml/src/ggml-backend-impl.h +4 -1
- data/ext/sources/ggml/src/ggml-backend-reg.cpp +35 -13
- data/ext/sources/ggml/src/ggml-backend.cpp +266 -60
- data/ext/sources/ggml/src/ggml-blas/CMakeLists.txt +4 -4
- data/ext/sources/ggml/src/ggml-blas/ggml-blas.cpp +5 -4
- data/ext/sources/ggml/src/ggml-cann/CMakeLists.txt +15 -0
- data/ext/sources/ggml/src/ggml-cann/acl_tensor.cpp +3 -1
- data/ext/sources/ggml/src/ggml-cann/aclnn_ops.cpp +903 -717
- data/ext/sources/ggml/src/ggml-cann/aclnn_ops.h +143 -25
- data/ext/sources/ggml/src/ggml-cann/common.h +149 -2
- data/ext/sources/ggml/src/ggml-cann/ggml-cann.cpp +521 -78
- data/ext/sources/ggml/src/ggml-common.h +21 -0
- data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +165 -50
- data/ext/sources/ggml/src/ggml-cpu/amx/amx.cpp +5 -3
- data/ext/sources/ggml/src/ggml-cpu/amx/mmq.cpp +11 -10
- data/ext/sources/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/arm/quants.c +3650 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/arm/repack.cpp +1891 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/loongarch/quants.c +2160 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/powerpc/quants.c +2305 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/riscv/quants.c +1897 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/riscv/repack.cpp +342 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/s390/quants.c +1468 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/wasm/quants.c +1221 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/x86/quants.c +3820 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/x86/repack.cpp +6307 -0
- data/ext/sources/ggml/src/ggml-cpu/arch-fallback.h +214 -0
- data/ext/sources/ggml/src/ggml-cpu/common.h +18 -3
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-impl.h +23 -7
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +179 -110
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.cpp +44 -33
- data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-hbm.cpp → hbm.cpp} +1 -1
- data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.cpp +152 -18
- data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.h +7 -1
- data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +228 -98
- data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +532 -1124
- data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.h +5 -0
- data/ext/sources/ggml/src/ggml-cpu/ops.cpp +3374 -2081
- data/ext/sources/ggml/src/ggml-cpu/ops.h +13 -8
- data/ext/sources/ggml/src/ggml-cpu/quants.c +1193 -0
- data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-quants.h → quants.h} +34 -0
- data/ext/sources/ggml/src/ggml-cpu/repack.cpp +1982 -0
- data/ext/sources/ggml/src/ggml-cpu/repack.h +120 -0
- data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +367 -46
- data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.cpp +1024 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.h +13 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp +3196 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/ime_kernels.h +26 -0
- data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-traits.cpp → traits.cpp} +3 -3
- data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-traits.h → traits.h} +1 -1
- data/ext/sources/ggml/src/ggml-cpu/vec.cpp +272 -35
- data/ext/sources/ggml/src/ggml-cpu/vec.h +794 -142
- data/ext/sources/ggml/src/ggml-cuda/CMakeLists.txt +20 -16
- data/ext/sources/ggml/src/ggml-cuda/add-id.cu +58 -0
- data/ext/sources/ggml/src/ggml-cuda/add-id.cuh +3 -0
- data/ext/sources/ggml/src/ggml-cuda/binbcast.cu +330 -191
- data/ext/sources/ggml/src/ggml-cuda/binbcast.cuh +2 -0
- data/ext/sources/ggml/src/ggml-cuda/common.cuh +291 -81
- data/ext/sources/ggml/src/ggml-cuda/conv-transpose-1d.cu +1 -4
- data/ext/sources/ggml/src/ggml-cuda/conv2d-dw.cu +161 -0
- data/ext/sources/ggml/src/ggml-cuda/conv2d-dw.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/conv2d-transpose.cu +91 -0
- data/ext/sources/ggml/src/ggml-cuda/conv2d-transpose.cuh +4 -0
- data/ext/sources/ggml/src/ggml-cuda/conv2d.cu +166 -0
- data/ext/sources/ggml/src/ggml-cuda/conv2d.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/convert.cu +117 -22
- data/ext/sources/ggml/src/ggml-cuda/convert.cuh +20 -0
- data/ext/sources/ggml/src/ggml-cuda/cpy-utils.cuh +217 -0
- data/ext/sources/ggml/src/ggml-cuda/cpy.cu +64 -307
- data/ext/sources/ggml/src/ggml-cuda/cross-entropy-loss.cu +2 -14
- data/ext/sources/ggml/src/ggml-cuda/dequantize.cuh +14 -40
- data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +499 -368
- data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +142 -93
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cu +755 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cuh +3 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn-vec.cuh +593 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +90 -50
- data/ext/sources/ggml/src/ggml-cuda/fattn.cu +185 -198
- data/ext/sources/ggml/src/ggml-cuda/fattn.cuh +2 -0
- data/ext/sources/ggml/src/ggml-cuda/getrows.cu +50 -39
- data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +636 -222
- data/ext/sources/ggml/src/ggml-cuda/im2col.cu +196 -35
- data/ext/sources/ggml/src/ggml-cuda/im2col.cuh +1 -0
- data/ext/sources/ggml/src/ggml-cuda/mean.cu +73 -0
- data/ext/sources/ggml/src/ggml-cuda/mean.cuh +3 -0
- data/ext/sources/ggml/src/ggml-cuda/mma.cuh +198 -45
- data/ext/sources/ggml/src/ggml-cuda/mmf.cu +123 -0
- data/ext/sources/ggml/src/ggml-cuda/mmf.cuh +496 -0
- data/ext/sources/ggml/src/ggml-cuda/mmq.cu +206 -57
- data/ext/sources/ggml/src/ggml-cuda/mmq.cuh +1262 -721
- data/ext/sources/ggml/src/ggml-cuda/mmvf.cu +506 -0
- data/ext/sources/ggml/src/ggml-cuda/{mmv.cuh → mmvf.cuh} +4 -5
- data/ext/sources/ggml/src/ggml-cuda/mmvq.cu +64 -73
- data/ext/sources/ggml/src/ggml-cuda/norm.cu +284 -12
- data/ext/sources/ggml/src/ggml-cuda/norm.cuh +7 -0
- data/ext/sources/ggml/src/ggml-cuda/opt-step-sgd.cu +49 -0
- data/ext/sources/ggml/src/ggml-cuda/opt-step-sgd.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/pad.cu +46 -23
- data/ext/sources/ggml/src/ggml-cuda/pad_reflect_1d.cu +91 -0
- data/ext/sources/ggml/src/ggml-cuda/pad_reflect_1d.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/quantize.cu +12 -10
- data/ext/sources/ggml/src/ggml-cuda/reduce_rows.cuh +53 -0
- data/ext/sources/ggml/src/ggml-cuda/roll.cu +67 -0
- data/ext/sources/ggml/src/ggml-cuda/roll.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/rope.cu +21 -27
- data/ext/sources/ggml/src/ggml-cuda/scale.cu +14 -11
- data/ext/sources/ggml/src/ggml-cuda/set-rows.cu +276 -0
- data/ext/sources/ggml/src/ggml-cuda/set-rows.cuh +7 -0
- data/ext/sources/ggml/src/ggml-cuda/softcap.cu +34 -0
- data/ext/sources/ggml/src/ggml-cuda/softcap.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/softmax.cu +126 -59
- data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cu +10 -2
- data/ext/sources/ggml/src/ggml-cuda/ssm-scan.cu +322 -98
- data/ext/sources/ggml/src/ggml-cuda/sum.cu +6 -10
- data/ext/sources/ggml/src/ggml-cuda/sumrows.cu +23 -19
- data/ext/sources/ggml/src/ggml-cuda/sumrows.cuh +0 -1
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-f16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q4_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q4_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q5_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q5_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q8_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-f16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q5_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q5_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q8_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-f16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q4_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q4_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q5_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q5_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q8_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-f16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q4_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q4_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q5_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q5_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q8_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-f16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q4_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q4_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q5_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q5_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q8_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-f16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q4_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q4_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q5_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q5_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q8_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +21 -18
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_10.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_11.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_12.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_13.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_14.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_15.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_16.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_2.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_3.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_4.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_5.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_6.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_7.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_8.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_9.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-mxfp4.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/topk-moe.cu +259 -0
- data/ext/sources/ggml/src/ggml-cuda/topk-moe.cuh +14 -0
- data/ext/sources/ggml/src/ggml-cuda/tsembd.cu +3 -3
- data/ext/sources/ggml/src/ggml-cuda/unary.cu +179 -0
- data/ext/sources/ggml/src/ggml-cuda/unary.cuh +15 -0
- data/ext/sources/ggml/src/ggml-cuda/upscale.cu +92 -6
- data/ext/sources/ggml/src/ggml-cuda/vecdotq.cuh +110 -22
- data/ext/sources/ggml/src/ggml-cuda/vendors/cuda.h +4 -0
- data/ext/sources/ggml/src/ggml-cuda/vendors/hip.h +58 -36
- data/ext/sources/ggml/src/ggml-cuda/vendors/musa.h +4 -3
- data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +14 -2
- data/ext/sources/ggml/src/ggml-impl.h +229 -175
- data/ext/sources/ggml/src/ggml-metal/CMakeLists.txt +21 -17
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-common.cpp +446 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-common.h +52 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.h +33 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.m +600 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.cpp +1376 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.h +226 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.m +1308 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +163 -63
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.cpp +3158 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.h +82 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal.cpp +718 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal.metal +3208 -1575
- data/ext/sources/ggml/src/ggml-musa/CMakeLists.txt +18 -8
- data/ext/sources/ggml/src/ggml-musa/mudnn.cuh +2 -2
- data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +32 -0
- data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +4430 -792
- data/ext/sources/ggml/src/ggml-opencl/kernels/add.cl +107 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/add_id.cl +42 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/argsort.cl +86 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/concat.cl +109 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/conv2d.cl +185 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/conv2d_f16_f32.cl +176 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/cvt.cl +84 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/div.cl +138 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/flash_attn_f16.cl +370 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/flash_attn_f32.cl +370 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/flash_attn_f32_f16.cl +373 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gelu.cl +27 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/glu.cl +378 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/group_norm.cl +121 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/im2col_f16.cl +1 -1
- data/ext/sources/ggml/src/ggml-opencl/kernels/im2col_f32.cl +1 -1
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul.cl +73 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mat_f16_f32.cl +130 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_l4_lm.cl +132 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_f32_f32_l4_lm.cl +133 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32.cl +189 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32_flat.cl +176 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl +283 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32.cl +140 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32_flat.cl +222 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32.cl +144 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32_flat.cl +167 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32.cl +125 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32_flat.cl +202 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/norm.cl +80 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/pad.cl +30 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/repeat.cl +39 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/rms_norm.cl +79 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/scale.cl +3 -2
- data/ext/sources/ggml/src/ggml-opencl/kernels/set_rows.cl +189 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/sigmoid.cl +29 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_4_f16.cl +34 -13
- data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_4_f32.cl +34 -13
- data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_f16.cl +34 -13
- data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_f32.cl +34 -13
- data/ext/sources/ggml/src/ggml-opencl/kernels/sub.cl +138 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/sum_rows.cl +39 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/tanh.cl +63 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/transpose.cl +20 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/tsembd.cl +48 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/upscale.cl +120 -0
- data/ext/sources/ggml/src/ggml-opt.cpp +97 -41
- data/ext/sources/ggml/src/ggml-quants.c +117 -24
- data/ext/sources/ggml/src/ggml-quants.h +6 -0
- data/ext/sources/ggml/src/ggml-rpc/ggml-rpc.cpp +85 -62
- data/ext/sources/ggml/src/ggml-sycl/CMakeLists.txt +3 -3
- data/ext/sources/ggml/src/ggml-sycl/backend.hpp +2 -0
- data/ext/sources/ggml/src/ggml-sycl/binbcast.cpp +9 -0
- data/ext/sources/ggml/src/ggml-sycl/binbcast.hpp +6 -0
- data/ext/sources/ggml/src/ggml-sycl/common.hpp +20 -48
- data/ext/sources/ggml/src/ggml-sycl/concat.cpp +13 -17
- data/ext/sources/ggml/src/ggml-sycl/convert.cpp +21 -2
- data/ext/sources/ggml/src/ggml-sycl/cpy.cpp +116 -211
- data/ext/sources/ggml/src/ggml-sycl/cpy.hpp +213 -1
- data/ext/sources/ggml/src/ggml-sycl/dequantize.hpp +32 -0
- data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +700 -1041
- data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +20 -9
- data/ext/sources/ggml/src/ggml-sycl/gemm.hpp +17 -26
- data/ext/sources/ggml/src/ggml-sycl/getrows.cpp +2 -96
- data/ext/sources/ggml/src/ggml-sycl/ggml-sycl.cpp +393 -250
- data/ext/sources/ggml/src/ggml-sycl/im2col.cpp +1 -1
- data/ext/sources/ggml/src/ggml-sycl/mmvq.cpp +32 -8
- data/ext/sources/ggml/src/ggml-sycl/quantize.hpp +133 -0
- data/ext/sources/ggml/src/ggml-sycl/quants.hpp +38 -11
- data/ext/sources/ggml/src/ggml-sycl/rope.cpp +125 -21
- data/ext/sources/ggml/src/ggml-sycl/set_rows.cpp +234 -0
- data/ext/sources/ggml/src/ggml-sycl/set_rows.hpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/sycl_hw.cpp +3 -1
- data/ext/sources/ggml/src/ggml-sycl/sycl_hw.hpp +3 -0
- data/ext/sources/ggml/src/ggml-sycl/tsembd.cpp +4 -3
- data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +105 -17
- data/ext/sources/ggml/src/ggml-vulkan/CMakeLists.txt +36 -32
- data/ext/sources/ggml/src/ggml-vulkan/ggml-vulkan.cpp +4198 -1145
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -12
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add.comp +41 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add_id.comp +42 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argmax.comp +13 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp +39 -29
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +349 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +98 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +66 -12
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.comp +154 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.comp +21 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_s.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xxs.comp +2 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_s.comp +6 -5
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_xxs.comp +4 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_mxfp4.comp +32 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q2_k.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q3_k.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_k.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_k.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q6_k.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/exp.comp +21 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +69 -24
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp +60 -20
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +98 -42
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +64 -27
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +74 -13
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu.comp +13 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu_erf.comp +27 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu_quick.comp +11 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gelu_erf.comp +39 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/generic_binary_head.comp +4 -17
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/get_rows.comp +19 -10
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/get_rows_quant.comp +25 -15
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.comp +19 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_main.comp +29 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/hardsigmoid.comp +22 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/hardswish.comp +22 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +18 -14
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col_3d.comp +126 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.comp +65 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_nc.comp +11 -7
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp +140 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +144 -531
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +206 -38
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.comp +556 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +12 -5
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.comp +15 -9
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp +111 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_sgd.comp +22 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/pad.comp +24 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/quantize_q8_1.comp +53 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/reglu.comp +9 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +64 -11
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_partials.comp +65 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/roll.comp +46 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.comp +1 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +7 -9
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +7 -9
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +7 -9
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rte.comp +5 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +29 -7
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_back.comp +4 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sqrt.comp +17 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.comp +38 -5
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/swiglu.comp +9 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/swiglu_oai.comp +14 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/timestep_embedding.comp +4 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/types.comp +101 -9
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +69 -5
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/utils.comp +25 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +338 -71
- data/ext/sources/ggml/src/ggml-webgpu/CMakeLists.txt +54 -0
- data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu.cpp +1558 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/add.tmpl.wgsl +44 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/add_in_place.tmpl.wgsl +41 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary_head.tmpl +45 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +930 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl +60 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +124 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/get_rows.tmpl.wgsl +874 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl +40 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul.tmpl.wgsl +44 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_in_place.tmpl.wgsl +41 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl +907 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm.wgsl +57 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm_in_place.wgsl +48 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +81 -0
- data/ext/sources/ggml/src/ggml-zdnn/CMakeLists.txt +36 -0
- data/ext/sources/ggml/src/ggml-zdnn/common.hpp +59 -0
- data/ext/sources/ggml/src/ggml-zdnn/ggml-zdnn.cpp +628 -0
- data/ext/sources/ggml/src/ggml-zdnn/mmf.cpp +80 -0
- data/ext/sources/ggml/src/ggml-zdnn/mmf.hpp +12 -0
- data/ext/sources/ggml/src/ggml-zdnn/utils.cpp +79 -0
- data/ext/sources/ggml/src/ggml-zdnn/utils.hpp +19 -0
- data/ext/sources/ggml/src/ggml.c +802 -142
- data/ext/sources/ggml/src/ggml.cpp +26 -0
- data/ext/sources/ggml/src/gguf.cpp +32 -4
- data/ext/sources/include/whisper.h +2 -0
- data/ext/sources/src/CMakeLists.txt +2 -0
- data/ext/sources/src/coreml/whisper-compat.h +10 -0
- data/ext/sources/src/coreml/whisper-compat.m +35 -0
- data/ext/sources/src/coreml/whisper-decoder-impl.m +1 -0
- data/ext/sources/src/coreml/whisper-encoder-impl.m +1 -0
- data/ext/sources/src/whisper.cpp +241 -215
- data/ext/sources/tests/CMakeLists.txt +8 -1
- data/ext/sources/tests/test-vad-full.cpp +3 -3
- data/ext/sources/tests/test-vad.cpp +2 -2
- data/extsources.rb +15 -9
- data/lib/whisper/context.rb +15 -0
- data/lib/whisper/model/uri.rb +57 -2
- data/lib/whisper/segment.rb +58 -0
- data/sig/whisper.rbs +75 -38
- data/{tests → test}/helper.rb +1 -12
- data/{tests → test}/test_model.rb +9 -0
- data/test/test_package.rb +51 -0
- data/{tests → test}/test_params.rb +8 -0
- data/test/test_segment.rb +146 -0
- data/{tests → test}/test_whisper.rb +70 -0
- data/whispercpp.gemspec +2 -3
- metadata +246 -191
- data/ext/sources/.dockerignore +0 -3
- data/ext/sources/.github/workflows/bindings-ruby.yml +0 -21
- data/ext/sources/ci/run.sh +0 -336
- data/ext/sources/close-issue.yml +0 -28
- data/ext/sources/ggml/include/ggml-kompute.h +0 -50
- data/ext/sources/ggml/src/ggml-amx/CMakeLists.txt +0 -107
- data/ext/sources/ggml/src/ggml-amx/common.h +0 -94
- data/ext/sources/ggml/src/ggml-amx/ggml-amx.cpp +0 -446
- data/ext/sources/ggml/src/ggml-amx/mmq.cpp +0 -2510
- data/ext/sources/ggml/src/ggml-amx/mmq.h +0 -17
- data/ext/sources/ggml/src/ggml-cann/kernels/CMakeLists.txt +0 -30
- data/ext/sources/ggml/src/ggml-cann/kernels/ascendc_kernels.h +0 -19
- data/ext/sources/ggml/src/ggml-cann/kernels/dup.cpp +0 -234
- data/ext/sources/ggml/src/ggml-cann/kernels/get_row_f16.cpp +0 -197
- data/ext/sources/ggml/src/ggml-cann/kernels/get_row_f32.cpp +0 -190
- data/ext/sources/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -204
- data/ext/sources/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -191
- data/ext/sources/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -218
- data/ext/sources/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -216
- data/ext/sources/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -295
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +0 -6431
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +0 -8
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-quants.c +0 -13747
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f16.cu +0 -357
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f16.cuh +0 -3
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f32.cu +0 -365
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f32.cuh +0 -3
- data/ext/sources/ggml/src/ggml-cuda/fattn-vec-f16.cuh +0 -482
- data/ext/sources/ggml/src/ggml-cuda/fattn-vec-f32.cuh +0 -472
- data/ext/sources/ggml/src/ggml-cuda/mmv.cu +0 -336
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-kompute/CMakeLists.txt +0 -166
- data/ext/sources/ggml/src/ggml-kompute/ggml-kompute.cpp +0 -2251
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/common.comp +0 -112
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_add.comp +0 -58
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_addrow.comp +0 -25
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f16.comp +0 -52
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f32.comp +0 -52
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f16.comp +0 -52
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f32.comp +0 -52
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_diagmask.comp +0 -30
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_gelu.comp +0 -22
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows.comp +0 -17
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f16.comp +0 -31
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f32.comp +0 -31
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_0.comp +0 -38
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_1.comp +0 -39
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q6_k.comp +0 -44
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul.comp +0 -52
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_f16.comp +0 -69
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_mat_f32.comp +0 -51
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_0.comp +0 -33
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_1.comp +0 -35
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_k.comp +0 -140
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q6_k.comp +0 -106
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q8_0.comp +0 -73
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n.comp +0 -52
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n_pre.comp +0 -28
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_norm.comp +0 -84
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_relu.comp +0 -21
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rmsnorm.comp +0 -53
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f16.comp +0 -52
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f32.comp +0 -52
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f16.comp +0 -52
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f32.comp +0 -52
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_scale.comp +0 -19
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_scale_8.comp +0 -23
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_silu.comp +0 -22
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_softmax.comp +0 -72
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/rope_common.comp +0 -71
- data/ext/sources/ggml/src/ggml-metal/ggml-metal.m +0 -5998
- data/tests/test_package.rb +0 -46
- data/tests/test_segment.rb +0 -74
- /data/ext/sources/ggml/src/ggml-cpu/{cpu-feats-x86.cpp → arch/x86/cpu-feats.cpp} +0 -0
- /data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-hbm.h → hbm.h} +0 -0
- /data/{tests → test}/jfk_reader/.gitignore +0 -0
- /data/{tests → test}/jfk_reader/extconf.rb +0 -0
- /data/{tests → test}/jfk_reader/jfk_reader.c +0 -0
- /data/{tests → test}/test_callback.rb +0 -0
- /data/{tests → test}/test_error.rb +0 -0
- /data/{tests → test}/test_vad.rb +0 -0
- /data/{tests → test}/test_vad_params.rb +0 -0
@@ -41,8 +41,10 @@
|
|
41
41
|
#include "ggml-sycl/element_wise.hpp"
|
42
42
|
#include "ggml-sycl/presets.hpp"
|
43
43
|
#include "ggml-sycl/gemm.hpp"
|
44
|
+
#include "ggml-sycl/set_rows.hpp"
|
44
45
|
#include "ggml-sycl/sycl_hw.hpp"
|
45
46
|
#include "ggml-sycl/getrows.hpp"
|
47
|
+
#include "ggml-sycl/quantize.hpp"
|
46
48
|
#include "ggml.h"
|
47
49
|
|
48
50
|
static bool g_sycl_loaded = false;
|
@@ -83,9 +85,7 @@ static ggml_sycl_device_info ggml_sycl_init() {
|
|
83
85
|
|
84
86
|
info.devices[i].cc =
|
85
87
|
100 * prop.get_major_version() + 10 * prop.get_minor_version();
|
86
|
-
info.devices[i].
|
87
|
-
info.devices[i].opt_feature = check_gpu_optimize_feature(info.devices[i].hw_info.arch);
|
88
|
-
|
88
|
+
info.devices[i].opt_feature.reorder = device.ext_oneapi_architecture_is(syclex::arch_category::intel_gpu);
|
89
89
|
info.max_work_group_sizes[i] = prop.get_max_work_group_size();
|
90
90
|
}
|
91
91
|
|
@@ -195,7 +195,7 @@ static void ggml_check_sycl() try {
|
|
195
195
|
|
196
196
|
if (!initialized) {
|
197
197
|
g_ggml_sycl_debug = get_sycl_env("GGML_SYCL_DEBUG", 0);
|
198
|
-
g_ggml_sycl_disable_optimize= get_sycl_env("GGML_SYCL_DISABLE_OPT",
|
198
|
+
g_ggml_sycl_disable_optimize = get_sycl_env("GGML_SYCL_DISABLE_OPT", 0);
|
199
199
|
g_ggml_sycl_disable_graph = get_sycl_env("GGML_SYCL_DISABLE_GRAPH", 1);
|
200
200
|
g_ggml_sycl_disable_dnn = get_sycl_env("GGML_SYCL_DISABLE_DNN", 0);
|
201
201
|
g_ggml_sycl_prioritize_dmmv = get_sycl_env("GGML_SYCL_PRIORITIZE_DMMV", 0);
|
@@ -347,14 +347,15 @@ static enum ggml_status
|
|
347
347
|
ggml_backend_sycl_buffer_init_tensor(ggml_backend_buffer_t buffer,
|
348
348
|
ggml_tensor *tensor) try {
|
349
349
|
GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
|
350
|
-
|
350
|
+
GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": tensor", tensor, "\n").c_str());
|
351
351
|
ggml_backend_sycl_buffer_context * ctx = (ggml_backend_sycl_buffer_context *)buffer->context;
|
352
352
|
|
353
353
|
if (tensor->view_src != NULL) {
|
354
354
|
assert(tensor->view_src->buffer->buft == buffer->buft);
|
355
355
|
return GGML_STATUS_SUCCESS;
|
356
356
|
}
|
357
|
-
if ((tensor->type == GGML_TYPE_Q4_0 || tensor->type == GGML_TYPE_Q4_K
|
357
|
+
if ((tensor->type == GGML_TYPE_Q4_0 || tensor->type == GGML_TYPE_Q4_K || tensor->type == GGML_TYPE_Q6_K) &&
|
358
|
+
!g_ggml_sycl_disable_optimize) {
|
358
359
|
ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu{};
|
359
360
|
tensor->extra = extra;
|
360
361
|
ctx->tensor_extras.push_back(extra); //used to release it when destroy ctx.
|
@@ -384,7 +385,7 @@ static void ggml_backend_sycl_buffer_set_tensor(ggml_backend_buffer_t buffer,
|
|
384
385
|
const void *data, size_t offset,
|
385
386
|
size_t size) try {
|
386
387
|
GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
|
387
|
-
|
388
|
+
GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": tensor", tensor).c_str());
|
388
389
|
GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset);
|
389
390
|
ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context;
|
390
391
|
ggml_sycl_set_device(ctx->device);
|
@@ -412,7 +413,7 @@ static void ggml_backend_sycl_buffer_get_tensor(ggml_backend_buffer_t buffer,
|
|
412
413
|
void *data, size_t offset,
|
413
414
|
size_t size) try {
|
414
415
|
GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
|
415
|
-
|
416
|
+
GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": tensor", tensor).c_str());
|
416
417
|
GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset);
|
417
418
|
ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context;
|
418
419
|
|
@@ -443,8 +444,8 @@ ggml_backend_sycl_buffer_cpy_tensor(ggml_backend_buffer_t buffer,
|
|
443
444
|
ggml_tensor *dst) try {
|
444
445
|
bool is_cpy_supported = ggml_backend_buffer_is_sycl(src->buffer);
|
445
446
|
GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
|
446
|
-
|
447
|
-
|
447
|
+
GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": dst", dst).c_str());
|
448
|
+
GGML_SYCL_DEBUG("%s", debug_get_tensor_str(" src", src).c_str());
|
448
449
|
GGML_SYCL_DEBUG(" is_cpy_supported=%d\n", is_cpy_supported);
|
449
450
|
if (is_cpy_supported) {
|
450
451
|
ggml_backend_sycl_buffer_context * src_ctx = (ggml_backend_sycl_buffer_context *)src->buffer->context;
|
@@ -524,7 +525,7 @@ catch (sycl::exception const &exc) {
|
|
524
525
|
static void ggml_backend_sycl_buffer_memset_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, uint8_t value,
|
525
526
|
size_t offset, size_t size) {
|
526
527
|
GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
|
527
|
-
|
528
|
+
GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": tensor", tensor).c_str());
|
528
529
|
GGML_SYCL_DEBUG(" size=%zu offset=%zu value=%u\n", size, offset, value);
|
529
530
|
ggml_backend_sycl_buffer_context * ctx = (ggml_backend_sycl_buffer_context *) buffer->context;
|
530
531
|
SYCL_CHECK(ggml_sycl_set_device(ctx->device));
|
@@ -804,7 +805,7 @@ static enum ggml_status
|
|
804
805
|
ggml_backend_sycl_split_buffer_init_tensor(ggml_backend_buffer_t buffer,
|
805
806
|
ggml_tensor *tensor) try {
|
806
807
|
GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
|
807
|
-
|
808
|
+
GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": tensor", tensor, "\n").c_str());
|
808
809
|
GGML_ASSERT(tensor->view_src == nullptr); // views of split tensors are not supported
|
809
810
|
|
810
811
|
ggml_backend_sycl_split_buffer_context * ctx = (ggml_backend_sycl_split_buffer_context *)buffer->context;
|
@@ -890,7 +891,7 @@ ggml_backend_sycl_split_buffer_set_tensor(ggml_backend_buffer_t buffer,
|
|
890
891
|
ggml_tensor *tensor, const void *data,
|
891
892
|
size_t offset, size_t size) try {
|
892
893
|
GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
|
893
|
-
|
894
|
+
GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": tensor", tensor).c_str());
|
894
895
|
GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset);
|
895
896
|
// split tensors must always be set in their entirety at once
|
896
897
|
GGML_ASSERT(offset == 0);
|
@@ -946,7 +947,7 @@ ggml_backend_sycl_split_buffer_get_tensor(ggml_backend_buffer_t buffer,
|
|
946
947
|
const ggml_tensor *tensor, void *data,
|
947
948
|
size_t offset, size_t size) try {
|
948
949
|
GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
|
949
|
-
|
950
|
+
GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": tensor", tensor).c_str());
|
950
951
|
GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset);
|
951
952
|
// split tensors must always be set in their entirety at once
|
952
953
|
GGML_ASSERT(offset == 0);
|
@@ -1373,67 +1374,6 @@ typedef void (*ggml_sycl_op_mul_mat_t)(
|
|
1373
1374
|
|
1374
1375
|
|
1375
1376
|
|
1376
|
-
template<int QUANT_BLOCK_TILE>
|
1377
|
-
static void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int kx, const int kx_padded,
|
1378
|
-
const sycl::nd_item<3> &item_ct1) {
|
1379
|
-
const int ix = (item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
1380
|
-
item_ct1.get_local_id(2)) * QUANT_BLOCK_TILE;
|
1381
|
-
|
1382
|
-
if (ix >= kx_padded) {
|
1383
|
-
return;
|
1384
|
-
}
|
1385
|
-
|
1386
|
-
const int iy = item_ct1.get_local_range(1) * item_ct1.get_group(1) +
|
1387
|
-
item_ct1.get_local_id(1);
|
1388
|
-
|
1389
|
-
const int i_padded = iy*kx_padded + ix;
|
1390
|
-
|
1391
|
-
block_q8_1 * y = (block_q8_1 *) vy;
|
1392
|
-
|
1393
|
-
const int ib = i_padded / QK8_1; // block index
|
1394
|
-
const int iqs = i_padded % QK8_1; // quant index
|
1395
|
-
typedef sycl::vec<float, QUANT_BLOCK_TILE> TC;
|
1396
|
-
typedef sycl::vec<int8_t, QUANT_BLOCK_TILE> TQ;
|
1397
|
-
TC zeros;
|
1398
|
-
TQ qzeros;
|
1399
|
-
#pragma unroll
|
1400
|
-
for (int i = 0; i < QUANT_BLOCK_TILE; i++)
|
1401
|
-
{
|
1402
|
-
zeros[i] = 0.f;
|
1403
|
-
qzeros[i] = 0;
|
1404
|
-
}
|
1405
|
-
const TC xi = ix < kx ? *(const TC *)&x[iy * kx + ix] : zeros;
|
1406
|
-
float sum = xi[0];
|
1407
|
-
float amax = sycl::fabs(xi[0]);
|
1408
|
-
#pragma unroll
|
1409
|
-
for (int i = 1; i < QUANT_BLOCK_TILE; i++)
|
1410
|
-
{
|
1411
|
-
sum += xi[i];
|
1412
|
-
amax = sycl::fmax(sycl::fabs(xi[i]), amax);
|
1413
|
-
}
|
1414
|
-
sum = warp_reduce_sum(sum, item_ct1);
|
1415
|
-
amax = warp_reduce_max(amax, item_ct1);
|
1416
|
-
|
1417
|
-
const float d = amax / 127;
|
1418
|
-
TQ q = qzeros;
|
1419
|
-
if (amax != 0.0f)
|
1420
|
-
{
|
1421
|
-
#pragma unroll
|
1422
|
-
for (int i = 0; i < QUANT_BLOCK_TILE; i++) {
|
1423
|
-
q[i] = sycl::round(xi[i] / d);
|
1424
|
-
}
|
1425
|
-
}
|
1426
|
-
|
1427
|
-
*(TQ *)&y[ib].qs[iqs] = q;
|
1428
|
-
|
1429
|
-
if (iqs > 0) {
|
1430
|
-
return;
|
1431
|
-
}
|
1432
|
-
|
1433
|
-
reinterpret_cast<sycl::half &>(y[ib].ds.x()) = d;
|
1434
|
-
reinterpret_cast<sycl::half &>(y[ib].ds.y()) = sum;
|
1435
|
-
}
|
1436
|
-
|
1437
1377
|
static void mul_mat_p021_f16_f32(
|
1438
1378
|
const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst,
|
1439
1379
|
const int ncols_x, const int nrows_x, const int nchannels_x, const int nchannels_y,
|
@@ -1493,7 +1433,7 @@ static void mul_mat_p021_f16_f32(
|
|
1493
1433
|
|
1494
1434
|
static void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
|
1495
1435
|
const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst, const int ncols_x, const int nrows_x,
|
1496
|
-
const int row_stride_x, const int channel_stride_x, const int channel_x_divisor,
|
1436
|
+
const int row_stride_x, const int channel_stride_x,const int channel_stride_y, const int channel_x_divisor,
|
1497
1437
|
const sycl::nd_item<3> &item_ct1) {
|
1498
1438
|
|
1499
1439
|
const sycl::half *x = (const sycl::half *)vx;
|
@@ -1504,7 +1444,6 @@ static void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
|
|
1504
1444
|
item_ct1.get_local_id(0);
|
1505
1445
|
const int channel_x = channel / channel_x_divisor;
|
1506
1446
|
|
1507
|
-
const int nrows_y = ncols_x;
|
1508
1447
|
const int nrows_dst = nrows_x;
|
1509
1448
|
const int row_dst = row_x;
|
1510
1449
|
|
@@ -1523,7 +1462,7 @@ static void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
|
|
1523
1462
|
const int row_y = col_x;
|
1524
1463
|
|
1525
1464
|
const int ix = channel_x*channel_stride_x + row_x*row_stride_x + col_x;
|
1526
|
-
const int iy = channel*
|
1465
|
+
const int iy = channel * channel_stride_y + row_y;
|
1527
1466
|
|
1528
1467
|
const float xi =
|
1529
1468
|
sycl::vec<sycl::half, 1>(x[ix])
|
@@ -1643,7 +1582,7 @@ static void diag_mask_inf_f32(const float * x, float * dst, const int ncols, con
|
|
1643
1582
|
dst[i] = x[i] - (col > n_past + row % rows_per_channel) * FLT_MAX;
|
1644
1583
|
}
|
1645
1584
|
|
1646
|
-
static void scale_f32(const float * x, float * dst, const float scale, const int k,
|
1585
|
+
static void scale_f32(const float * x, float * dst, const float scale, const float bias, const int k,
|
1647
1586
|
const sycl::nd_item<3> &item_ct1) {
|
1648
1587
|
const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
1649
1588
|
item_ct1.get_local_id(2);
|
@@ -1652,7 +1591,7 @@ static void scale_f32(const float * x, float * dst, const float scale, const int
|
|
1652
1591
|
return;
|
1653
1592
|
}
|
1654
1593
|
|
1655
|
-
dst[i] = scale * x[i];
|
1594
|
+
dst[i] = scale * x[i] + bias;
|
1656
1595
|
}
|
1657
1596
|
|
1658
1597
|
|
@@ -1718,25 +1657,6 @@ static void pool2d_nchw_kernel(
|
|
1718
1657
|
o_ptr[cur_oh * ow + cur_ow] = res;
|
1719
1658
|
}
|
1720
1659
|
|
1721
|
-
static void quantize_row_q8_1_sycl(const float *x, void *vy, const int kx,
|
1722
|
-
const int ky, const int kx_padded,
|
1723
|
-
queue_ptr stream) {
|
1724
|
-
const int block_num_x = (kx_padded + SYCL_QUANTIZE_BLOCK_SIZE - 1) / SYCL_QUANTIZE_BLOCK_SIZE;
|
1725
|
-
const sycl::range<3> num_blocks(1, ky, block_num_x);
|
1726
|
-
int constexpr QUANT_BLOCK_TILE = QK8_1 / WARP_SIZE;
|
1727
|
-
static_assert(QK8_1 % WARP_SIZE == 0);
|
1728
|
-
const sycl::range<3> block_size(1, 1, SYCL_QUANTIZE_BLOCK_SIZE / QUANT_BLOCK_TILE);
|
1729
|
-
{
|
1730
|
-
dpct::has_capability_or_fail(stream->get_device(),
|
1731
|
-
{sycl::aspect::fp16});
|
1732
|
-
|
1733
|
-
stream->parallel_for(
|
1734
|
-
sycl::nd_range<3>(num_blocks * block_size, block_size),
|
1735
|
-
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
1736
|
-
quantize_q8_1<QUANT_BLOCK_TILE>(x, vy, kx, kx_padded, item_ct1);
|
1737
|
-
});
|
1738
|
-
}
|
1739
|
-
}
|
1740
1660
|
|
1741
1661
|
static void ggml_mul_mat_p021_f16_f32_sycl(const void *vx, const float *y,
|
1742
1662
|
float *dst, const int ncols_x,
|
@@ -1763,7 +1683,7 @@ static void ggml_mul_mat_p021_f16_f32_sycl(const void *vx, const float *y,
|
|
1763
1683
|
static void ggml_mul_mat_vec_nc_f16_f32_sycl(
|
1764
1684
|
const void *vx, const float *y, float *dst, const int ncols_x,
|
1765
1685
|
const int nrows_x, const int row_stride_x, const int nchannels_x,
|
1766
|
-
const int nchannels_y, const int channel_stride_x, queue_ptr stream) {
|
1686
|
+
const int nchannels_y, const int channel_stride_x, const int channel_stride_y, queue_ptr stream) {
|
1767
1687
|
|
1768
1688
|
const sycl::range<3> block_nums(nchannels_y, nrows_x, 1);
|
1769
1689
|
const sycl::range<3> block_dims(1, 1, WARP_SIZE);
|
@@ -1775,7 +1695,7 @@ static void ggml_mul_mat_vec_nc_f16_f32_sycl(
|
|
1775
1695
|
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
1776
1696
|
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
1777
1697
|
mul_mat_vec_nc_f16_f32(vx, y, dst, ncols_x, nrows_x,
|
1778
|
-
row_stride_x, channel_stride_x,
|
1698
|
+
row_stride_x, channel_stride_x, channel_stride_y,
|
1779
1699
|
nchannels_y / nchannels_x, item_ct1);
|
1780
1700
|
});
|
1781
1701
|
}
|
@@ -1783,7 +1703,7 @@ static void ggml_mul_mat_vec_nc_f16_f32_sycl(
|
|
1783
1703
|
|
1784
1704
|
|
1785
1705
|
|
1786
|
-
static void scale_f32_sycl(const float *x, float *dst, const float scale,
|
1706
|
+
static void scale_f32_sycl(const float *x, float *dst, const float scale, const float bias,
|
1787
1707
|
const int k, queue_ptr stream) {
|
1788
1708
|
const int num_blocks = (k + SYCL_SCALE_BLOCK_SIZE - 1) / SYCL_SCALE_BLOCK_SIZE;
|
1789
1709
|
stream->parallel_for(
|
@@ -1791,7 +1711,7 @@ static void scale_f32_sycl(const float *x, float *dst, const float scale,
|
|
1791
1711
|
sycl::range<3>(1, 1, SYCL_SCALE_BLOCK_SIZE),
|
1792
1712
|
sycl::range<3>(1, 1, SYCL_SCALE_BLOCK_SIZE)),
|
1793
1713
|
[=](sycl::nd_item<3> item_ct1) {
|
1794
|
-
scale_f32(x, dst, scale, k, item_ct1);
|
1714
|
+
scale_f32(x, dst, scale, bias, k, item_ct1);
|
1795
1715
|
});
|
1796
1716
|
}
|
1797
1717
|
|
@@ -2066,21 +1986,18 @@ inline void ggml_sycl_op_mul_mat_sycl(
|
|
2066
1986
|
const sycl::half *src1_ptr = src1->type == GGML_TYPE_F16
|
2067
1987
|
? (const sycl::half *)src1->data + src1_padded_row_size
|
2068
1988
|
: src1_as_f16.get();
|
2069
|
-
ggml_sycl_pool_alloc<sycl::half> dst_f16(ctx.pool(), row_diff * src1_ncols);
|
2070
1989
|
|
2071
1990
|
#if GGML_SYCL_DNNL
|
2072
1991
|
if (!g_ggml_sycl_disable_dnn) {
|
2073
|
-
|
2074
|
-
|
2075
|
-
|
2076
|
-
scope_op_debug_print scope_dbg_print(__func__, "/to_fp32_sycl", dst, /*num_src=*/2,
|
2077
|
-
" : converting dst to fp32");
|
2078
|
-
const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16, dst);
|
2079
|
-
to_fp32_sycl(dst_f16.get(), dst_dd_i, row_diff* src1_ncols, stream);
|
1992
|
+
DnnlGemmWrapper::row_gemm(ctx,row_diff, src1_ncols , ne10, src0_ptr,
|
1993
|
+
DnnlGemmWrapper::to_dt<sycl::half>(), src1_ptr, DnnlGemmWrapper::to_dt<sycl::half>(),
|
1994
|
+
dst_dd_i, DnnlGemmWrapper::to_dt<float>(), stream);
|
2080
1995
|
}
|
2081
1996
|
else
|
2082
1997
|
#endif
|
2083
1998
|
{
|
1999
|
+
ggml_sycl_pool_alloc<sycl::half> dst_f16(ctx.pool(), row_diff * src1_ncols);
|
2000
|
+
|
2084
2001
|
const sycl::half alpha_f16 = 1.0f;
|
2085
2002
|
const sycl::half beta_f16 = 0.0f;
|
2086
2003
|
SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm(
|
@@ -2119,8 +2036,8 @@ inline void ggml_sycl_op_mul_mat_sycl(
|
|
2119
2036
|
|
2120
2037
|
#if GGML_SYCL_DNNL
|
2121
2038
|
if (!g_ggml_sycl_disable_dnn) {
|
2122
|
-
DnnlGemmWrapper::row_gemm(ctx,
|
2123
|
-
DnnlGemmWrapper::to_dt<float>(),
|
2039
|
+
DnnlGemmWrapper::row_gemm(ctx, row_diff, src1_ncols, ne10, src0_ddf_i,
|
2040
|
+
DnnlGemmWrapper::to_dt<float>(), src1_ddf1_i, DnnlGemmWrapper::to_dt<float>(),
|
2124
2041
|
dst_dd_i, DnnlGemmWrapper::to_dt<float>(), stream);
|
2125
2042
|
}
|
2126
2043
|
else
|
@@ -2268,9 +2185,11 @@ inline void ggml_sycl_op_scale(ggml_backend_sycl_context & ctx, ggml_tensor * ds
|
|
2268
2185
|
float * dst_dd = static_cast<float *>(dst->data);
|
2269
2186
|
|
2270
2187
|
float scale;
|
2271
|
-
|
2188
|
+
float bias;
|
2189
|
+
memcpy(&scale, (float *) dst->op_params + 0, sizeof(float));
|
2190
|
+
memcpy(&bias, (float *) dst->op_params + 1, sizeof(float));
|
2272
2191
|
|
2273
|
-
scale_f32_sycl(src0_dd, dst_dd, scale, ggml_nelements(dst->src[0]), main_stream);
|
2192
|
+
scale_f32_sycl(src0_dd, dst_dd, scale, bias, ggml_nelements(dst->src[0]), main_stream);
|
2274
2193
|
/*
|
2275
2194
|
DPCT1010:87: SYCL uses exceptions to report errors and does not use the
|
2276
2195
|
error codes. The call was replaced with 0. You need to rewrite this code.
|
@@ -2319,10 +2238,10 @@ static void ggml_sycl_set_peer_access(const int n_tokens, int main_device) {
|
|
2319
2238
|
peer_access_enabled = enable_peer_access;
|
2320
2239
|
}
|
2321
2240
|
|
2241
|
+
template <template <int> typename quantize_f>
|
2322
2242
|
static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
|
2323
2243
|
const ggml_tensor *src1, ggml_tensor *dst,
|
2324
|
-
ggml_sycl_op_mul_mat_t op
|
2325
|
-
const bool convert_src1_to_q8_1) try {
|
2244
|
+
ggml_sycl_op_mul_mat_t op) try {
|
2326
2245
|
|
2327
2246
|
GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne);
|
2328
2247
|
|
@@ -2417,6 +2336,8 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
|
|
2417
2336
|
}
|
2418
2337
|
}
|
2419
2338
|
|
2339
|
+
constexpr bool quantize_enabled = !std::is_same_v<quantize_f<QK8_1 / WARP_SIZE>,
|
2340
|
+
no_quantize_q8_1<QK8_1 / WARP_SIZE>>;
|
2420
2341
|
for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
|
2421
2342
|
if ((!split && i != ctx.device) || dev[i].row_low == dev[i].row_high) {
|
2422
2343
|
continue;
|
@@ -2442,19 +2363,19 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
|
|
2442
2363
|
dev[i].src1_ddf = dev[i].src1_ddf_alloc.alloc(ctx.pool(i), ggml_nelements(src1));
|
2443
2364
|
}
|
2444
2365
|
|
2445
|
-
if (
|
2366
|
+
if constexpr(quantize_enabled) {
|
2446
2367
|
dev[i].src1_ddq = dev[i].src1_ddq_alloc.alloc(ctx.pool(i), nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs);
|
2447
2368
|
|
2448
2369
|
if (src1_on_device && src1_is_contiguous) {
|
2449
2370
|
scope_op_debug_print scope_dbg_print(__func__, "/quantize_row_q8_1_sycl", dst,
|
2450
2371
|
/*num_src=*/2, " : converting src1 to Q8_1");
|
2451
|
-
|
2452
|
-
|
2453
|
-
|
2454
|
-
|
2455
|
-
|
2456
|
-
|
2457
|
-
|
2372
|
+
try {
|
2373
|
+
quantize_row_q8_1_sycl<quantize_f>(dev[i].src1_ddf, dev[i].src1_ddq, ne10, nrows1, src1_padded_col_size, stream);
|
2374
|
+
} catch (sycl::exception const &exc) {
|
2375
|
+
std::cerr << "Quantize_row_q8_1_sycl error" << exc.what() << "Exception caught at file:" << __FILE__
|
2376
|
+
<< ", line:" << __LINE__ << std::endl;
|
2377
|
+
std::exit(1);
|
2378
|
+
}
|
2458
2379
|
}
|
2459
2380
|
}
|
2460
2381
|
|
@@ -2470,11 +2391,6 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
|
|
2470
2391
|
// here an event is recorded that signals that the main device has finished calculating the input data
|
2471
2392
|
if (split && used_devices > 1) {
|
2472
2393
|
ggml_sycl_set_device(ctx.device);
|
2473
|
-
/*
|
2474
|
-
DPCT1024:91: The original code returned the error code that was further
|
2475
|
-
consumed by the program logic. This original code was replaced with 0.
|
2476
|
-
You may need to rewrite the program logic consuming the error code.
|
2477
|
-
*/
|
2478
2394
|
SYCL_CHECK(CHECK_TRY_ERROR(
|
2479
2395
|
*src0_extra->events[ctx.device][0] =
|
2480
2396
|
ctx.stream()->ext_oneapi_submit_barrier()));
|
@@ -2498,11 +2414,6 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
|
|
2498
2414
|
|
2499
2415
|
// wait for main GPU data if necessary
|
2500
2416
|
if (split && (i != ctx.device || is != 0)) {
|
2501
|
-
/*
|
2502
|
-
DPCT1009:163: SYCL uses exceptions to report errors and does not
|
2503
|
-
use the error codes. The original code was commented out and a
|
2504
|
-
warning string was inserted. You need to rewrite this code.
|
2505
|
-
*/
|
2506
2417
|
SYCL_CHECK(CHECK_TRY_ERROR(stream->ext_oneapi_submit_barrier(
|
2507
2418
|
{*src0_extra->events[ctx.device][0]})));
|
2508
2419
|
}
|
@@ -2528,39 +2439,42 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
|
|
2528
2439
|
// copy src0, src1 to device if necessary
|
2529
2440
|
if (src1_is_contiguous) {
|
2530
2441
|
if (i != ctx.device) {
|
2531
|
-
if (
|
2442
|
+
if constexpr (quantize_enabled) {
|
2532
2443
|
char * src1_ddq_i_source = dev[ctx.device].src1_ddq + src1_ddq_i_offset;
|
2533
|
-
|
2534
|
-
|
2535
|
-
|
2536
|
-
|
2444
|
+
SYCL_CHECK(
|
2445
|
+
CHECK_TRY_ERROR(stream
|
2446
|
+
->memcpy(src1_ddq_i, src1_ddq_i_source,
|
2447
|
+
src1_ncols * src1_padded_col_size * q8_1_ts / q8_1_bs)
|
2448
|
+
.wait()));
|
2537
2449
|
} else {
|
2538
|
-
|
2539
2450
|
float * src1_ddf_i_source = (float *) src1_extra->data_device[ctx.device];
|
2540
|
-
src1_ddf_i_source += (i0*ne11 + src1_col_0) * ne10;
|
2451
|
+
src1_ddf_i_source += (i0 * ne11 + src1_col_0) * ne10;
|
2541
2452
|
|
2542
|
-
SYCL_CHECK(
|
2543
|
-
src1_ddf_i, src1_ddf_i_source,
|
2544
|
-
|
2453
|
+
SYCL_CHECK(
|
2454
|
+
CHECK_TRY_ERROR(dev2dev_memcpy(*stream, *main_stream, src1_ddf_i, src1_ddf_i_source,
|
2455
|
+
src1_ncols * ne10 * sizeof(float))));
|
2545
2456
|
}
|
2546
2457
|
}
|
2547
|
-
} else if (src1_on_device && !src1_is_contiguous) {
|
2548
|
-
SYCL_CHECK(ggml_sycl_cpy_tensor_2d(
|
2549
|
-
src1_ddf_i, src1, i03, i02, src1_col_0, src1_col_0+src1_ncols, stream));
|
2550
2458
|
} else {
|
2551
|
-
|
2552
|
-
|
2459
|
+
if (src1_on_device) {
|
2460
|
+
SYCL_CHECK(ggml_sycl_cpy_tensor_2d(src1_ddf_i, src1, i03, i02, src1_col_0,
|
2461
|
+
src1_col_0 + src1_ncols, stream));
|
2462
|
+
} else {
|
2463
|
+
GGML_ABORT("src1 is non-contiguous and not on device");
|
2464
|
+
}
|
2553
2465
|
|
2554
|
-
|
2555
|
-
|
2556
|
-
|
2557
|
-
|
2558
|
-
|
2559
|
-
|
2560
|
-
|
2561
|
-
|
2562
|
-
|
2563
|
-
|
2466
|
+
if constexpr (quantize_enabled) {
|
2467
|
+
scope_op_debug_print scope_dbg_print(__func__, "/quantize_row_q8_1_sycl", dst,
|
2468
|
+
/*num_src=*/2, " : converting src1 to Q8_1");
|
2469
|
+
try {
|
2470
|
+
quantize_row_q8_1_sycl<quantize_q8_1>(src1_ddf_i, src1_ddq_i, ne10, src1_ncols,
|
2471
|
+
src1_padded_col_size, stream);
|
2472
|
+
} catch (const sycl::exception & exc) {
|
2473
|
+
std::cerr << "Quantize_row_q8_1_sycl error" << exc.what()
|
2474
|
+
<< "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl;
|
2475
|
+
std::exit(1);
|
2476
|
+
}
|
2477
|
+
}
|
2564
2478
|
}
|
2565
2479
|
|
2566
2480
|
if (src1_col_0 == 0 && !src0_is_contiguous && i02 % i02_divisor == 0) {
|
@@ -2572,12 +2486,6 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
|
|
2572
2486
|
// do the computation
|
2573
2487
|
SYCL_CHECK(CHECK_TRY_ERROR(op(ctx, src0, src1, dst, src0_dd_i, src1_ddf_i, src1_ddq_i, dst_dd_i,
|
2574
2488
|
dev[i].row_low, dev[i].row_high, src1_ncols, src1_padded_col_size, stream)));
|
2575
|
-
/*
|
2576
|
-
DPCT1010:93: SYCL uses exceptions to report errors and does not
|
2577
|
-
use the error codes. The call was replaced with 0. You need to
|
2578
|
-
rewrite this code.
|
2579
|
-
*/
|
2580
|
-
SYCL_CHECK(0);
|
2581
2489
|
|
2582
2490
|
// copy dst to host or other device if necessary
|
2583
2491
|
if (!dst_on_device) {
|
@@ -2608,12 +2516,6 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
|
|
2608
2516
|
|
2609
2517
|
// add event for the main device to wait on until other device is done
|
2610
2518
|
if (split && (i != ctx.device || is != 0)) {
|
2611
|
-
/*
|
2612
|
-
DPCT1024:94: The original code returned the error code that
|
2613
|
-
was further consumed by the program logic. This original
|
2614
|
-
code was replaced with 0. You may need to rewrite the
|
2615
|
-
program logic consuming the error code.
|
2616
|
-
*/
|
2617
2519
|
SYCL_CHECK(CHECK_TRY_ERROR(
|
2618
2520
|
*src0_extra->events[i][is] =
|
2619
2521
|
stream->ext_oneapi_submit_barrier()));
|
@@ -2712,6 +2614,8 @@ static void ggml_sycl_mul_mat_vec_nc(ggml_backend_sycl_context & ctx, const ggml
|
|
2712
2614
|
GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(src0->buffer));
|
2713
2615
|
GGML_ASSERT(src0->type == GGML_TYPE_F16);
|
2714
2616
|
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
2617
|
+
GGML_ASSERT(src1->ne[1] == 1);
|
2618
|
+
GGML_ASSERT(src1->ne[3] == 1);
|
2715
2619
|
|
2716
2620
|
const int64_t ne00 = src0->ne[0];
|
2717
2621
|
const int64_t ne01 = src0->ne[1];
|
@@ -2721,6 +2625,7 @@ static void ggml_sycl_mul_mat_vec_nc(ggml_backend_sycl_context & ctx, const ggml
|
|
2721
2625
|
const int64_t nb02 = src0->nb[2];
|
2722
2626
|
|
2723
2627
|
const int64_t ne12 = src1->ne[2];
|
2628
|
+
const int64_t nb11 = src1->nb[1];
|
2724
2629
|
|
2725
2630
|
SYCL_CHECK(ggml_sycl_set_device(ctx.device));
|
2726
2631
|
queue_ptr main_stream = ctx.stream();
|
@@ -2731,8 +2636,9 @@ static void ggml_sycl_mul_mat_vec_nc(ggml_backend_sycl_context & ctx, const ggml
|
|
2731
2636
|
|
2732
2637
|
const int64_t row_stride_x = nb01 / sizeof(sycl::half);
|
2733
2638
|
const int64_t channel_stride_x = nb02 / sizeof(sycl::half);
|
2639
|
+
const int64_t channel_stride_y = nb11 / sizeof(float);
|
2734
2640
|
|
2735
|
-
ggml_mul_mat_vec_nc_f16_f32_sycl(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, main_stream);
|
2641
|
+
ggml_mul_mat_vec_nc_f16_f32_sycl(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x,channel_stride_y, main_stream);
|
2736
2642
|
}
|
2737
2643
|
catch (sycl::exception const &exc) {
|
2738
2644
|
std::cerr << exc.what() << "Exception caught at file:" << __FILE__
|
@@ -2786,8 +2692,11 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx, cons
|
|
2786
2692
|
float * dst_ddf = static_cast<float *>(dst->data);
|
2787
2693
|
|
2788
2694
|
const sycl::half * src1_f16 = static_cast<const sycl::half *>(src1->data);
|
2695
|
+
const size_t type_size_src0 = ggml_type_size(src0->type);
|
2789
2696
|
const size_t type_size_src1 = ggml_type_size(src1->type);
|
2790
|
-
|
2697
|
+
|
2698
|
+
bool is_src0_cont_2 = ggml_is_contiguous_2(src0);
|
2699
|
+
bool is_src1_cont_2 = ggml_is_contiguous_2(src1);
|
2791
2700
|
|
2792
2701
|
// SRC1 strides
|
2793
2702
|
int64_t s11 = nb11 / type_size_src1;
|
@@ -2799,16 +2708,47 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx, cons
|
|
2799
2708
|
if (src1->type != GGML_TYPE_F16) {
|
2800
2709
|
scope_op_debug_print scope_dbg_print(__func__, "/to_fp16_nc_sycl", dst, /*num_src=*/2,
|
2801
2710
|
" : converting src1 to fp16");
|
2802
|
-
|
2803
|
-
|
2711
|
+
|
2712
|
+
// iterate tensor dims and find the slowest moving dim and stride
|
2713
|
+
int last_dim=0;
|
2714
|
+
int last_str=0;
|
2715
|
+
size_t largest_str=0;
|
2716
|
+
for(int i = 0; i< 4; i++){
|
2717
|
+
// last stride is always the largest
|
2718
|
+
if(src1->nb[i] == largest_str){
|
2719
|
+
if(src1->ne[last_dim] == 1){
|
2720
|
+
last_str = i;
|
2721
|
+
last_dim = i;
|
2722
|
+
}
|
2723
|
+
}
|
2724
|
+
if(src1->nb[i] > largest_str){
|
2725
|
+
largest_str = src1->nb[i];
|
2726
|
+
last_str = i;
|
2727
|
+
last_dim = i;
|
2728
|
+
}
|
2729
|
+
|
2730
|
+
}
|
2731
|
+
#if GGML_SYCL_DNNL
|
2732
|
+
// oneDNN handles strided data and does not need overhead of get_to_fp16_nc_sycl
|
2733
|
+
const int64_t ne_src1 = src1->nb[last_str] * src1->ne[last_dim] / type_size_src1;
|
2734
|
+
src1_f16_alloc.alloc(ne_src1);
|
2735
|
+
const to_fp16_sycl_t to_fp16_sycl = ggml_get_to_fp16_sycl(src1->type, dst);
|
2736
|
+
GGML_ASSERT(to_fp16_sycl != nullptr);
|
2737
|
+
to_fp16_sycl(src1_f16, src1_f16_alloc.get(), ne_src1, queue);
|
2738
|
+
# else
|
2804
2739
|
const int64_t ne_src1 = ggml_nelements(src1);
|
2805
2740
|
src1_f16_alloc.alloc(ne_src1);
|
2741
|
+
const to_fp16_nc_sycl_t to_fp16_nc_sycl = get_to_fp16_nc_sycl(src1->type);
|
2742
|
+
GGML_ASSERT(to_fp16_nc_sycl != nullptr);
|
2806
2743
|
to_fp16_nc_sycl(src1_f16, src1_f16_alloc.get(), ne10, ne11, ne12, ne13, s11, s12, s13, queue);
|
2744
|
+
#endif
|
2807
2745
|
|
2808
2746
|
src1_f16 = src1_f16_alloc.get();
|
2809
2747
|
s11 = ne10;
|
2810
2748
|
s12 = ne11 * s11;
|
2811
2749
|
s13 = ne12 * s12;
|
2750
|
+
|
2751
|
+
is_src1_cont_2 = true;
|
2812
2752
|
}
|
2813
2753
|
|
2814
2754
|
ggml_sycl_pool_alloc<sycl::half> dst_f16(ctx.pool());
|
@@ -2837,48 +2777,115 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx, cons
|
|
2837
2777
|
|
2838
2778
|
#if GGML_SYCL_DNNL
|
2839
2779
|
if (!g_ggml_sycl_disable_dnn) {
|
2840
|
-
|
2841
|
-
|
2842
|
-
|
2843
|
-
|
2844
|
-
|
2845
|
-
|
2846
|
-
|
2847
|
-
|
2848
|
-
|
2849
|
-
|
2850
|
-
|
2851
|
-
|
2852
|
-
|
2853
|
-
|
2854
|
-
|
2855
|
-
|
2856
|
-
|
2857
|
-
|
2858
|
-
|
2780
|
+
int64_t str_a0 = nb00 / type_size_src0;
|
2781
|
+
int64_t str_a1 = nb01 / type_size_src0;
|
2782
|
+
int64_t str_a2 = nb02 / type_size_src0;
|
2783
|
+
|
2784
|
+
int64_t str_b0 = nb10 / type_size_src1;
|
2785
|
+
int64_t str_b1 = nb11 / type_size_src1;
|
2786
|
+
int64_t str_b2 = nb12 / type_size_src1;
|
2787
|
+
|
2788
|
+
auto launch_gemm_for_batches = [&ctx, queue](const sycl::half *src0,
|
2789
|
+
const sycl::half *src1, float *dst,
|
2790
|
+
int64_t a0, int64_t a1, int64_t batcha,
|
2791
|
+
int64_t /*b0*/, int64_t b1, int64_t batchb,
|
2792
|
+
int64_t sa0, int64_t sa1, int64_t sa2,
|
2793
|
+
int64_t sb0, int64_t sb1, int64_t sb2,
|
2794
|
+
int64_t sd2) {
|
2795
|
+
bool supported_broadcast = batchb == batcha ? true
|
2796
|
+
: batchb == 1 || batcha == 1 ? true
|
2797
|
+
: false;
|
2798
|
+
if (supported_broadcast) {
|
2799
|
+
DnnlGemmWrapper::gemm(ctx, a1, b1, a0, src0,
|
2800
|
+
DnnlGemmWrapper::to_dt<sycl::half>(), sa0, sa1, sa2, src1,
|
2801
|
+
DnnlGemmWrapper::to_dt<sycl::half>(), sb0, sb1, sb2, dst,
|
2802
|
+
DnnlGemmWrapper::to_dt<float>(), queue, batcha, batchb);
|
2803
|
+
} else {
|
2804
|
+
// iterate over batches from smaller set of matrices (matrix 0)
|
2805
|
+
int64_t batches0 = batcha;
|
2806
|
+
int64_t batches1 = batchb;
|
2807
|
+
|
2808
|
+
if (batches0 > batches1) {
|
2809
|
+
int64_t num_mul_mats = batches1;
|
2810
|
+
int64_t sub_batch = batches0 / num_mul_mats;
|
2811
|
+
// src0 is batched and bigger, shift and multiply with src1
|
2812
|
+
for (int64_t i0 = 0; i0 < num_mul_mats; i0++) {
|
2813
|
+
const sycl::half *src0_shifted = src0 + (sa2 * i0 * sub_batch);
|
2814
|
+
const sycl::half *src1_shifted = src1 + (sb2 * i0);
|
2815
|
+
float *dst_shifted = dst + (sd2 * i0 * sub_batch);
|
2816
|
+
DnnlGemmWrapper::gemm(ctx, a1, b1, a0, src0_shifted,
|
2817
|
+
DnnlGemmWrapper::to_dt<sycl::half>(), sa0, sa1, sa2,
|
2818
|
+
src1_shifted, DnnlGemmWrapper::to_dt<sycl::half>(), sb0,
|
2819
|
+
sb1, sb2, dst_shifted, DnnlGemmWrapper::to_dt<float>(),
|
2820
|
+
queue, sub_batch, 1);
|
2821
|
+
}
|
2822
|
+
} else {
|
2823
|
+
int64_t num_mul_mats = batches0;
|
2824
|
+
int64_t sub_batch = batches1 / num_mul_mats;
|
2825
|
+
// src1 is batched and bigger, shift and multiply with src0
|
2826
|
+
for (int64_t i1 = 0; i1 < num_mul_mats; i1++) {
|
2827
|
+
const sycl::half *src0_shifted = src0 + (sa2 * i1);
|
2828
|
+
const sycl::half *src1_shifted = src1 + (sb2 * i1 * sub_batch);
|
2829
|
+
float *dst_shifted = dst + (sd2 * i1 * sub_batch);
|
2830
|
+
DnnlGemmWrapper::gemm(ctx, a1, b1, a0, src0_shifted,
|
2831
|
+
DnnlGemmWrapper::to_dt<sycl::half>(), sa0, sa1, sa2,
|
2832
|
+
src1_shifted, DnnlGemmWrapper::to_dt<sycl::half>(), sb0,
|
2833
|
+
sb1, sb2, dst_shifted, DnnlGemmWrapper::to_dt<float>(),
|
2834
|
+
queue, 1, sub_batch);
|
2835
|
+
}
|
2836
|
+
}
|
2859
2837
|
}
|
2860
|
-
}
|
2861
|
-
|
2862
|
-
|
2863
|
-
|
2864
|
-
|
2865
|
-
|
2866
|
-
|
2867
|
-
|
2868
|
-
|
2838
|
+
};
|
2839
|
+
|
2840
|
+
const bool cont_batches_dim2_a = nb02 * ne02 == nb03;
|
2841
|
+
const bool cont_batches_dim2_b = nb12 * ne12 == nb13;
|
2842
|
+
const bool cont_batches_dim3_a = ne02 == 1 && nb02 * ne01 == nb03;
|
2843
|
+
const bool cont_batches_dim3_b = ne12 == 1 && nb12 * ne11 == nb13;
|
2844
|
+
if (cont_batches_dim2_a && cont_batches_dim2_b) {
|
2845
|
+
// A batch is considered contiguous if the dimension 2 is not strided
|
2846
|
+
int64_t batches0 = ne02 * ne03;
|
2847
|
+
int64_t batches1 = ne12 * ne13;
|
2848
|
+
launch_gemm_for_batches(src0_f16, src1_f16, dst_ddf, ne00, ne01, batches0,
|
2849
|
+
ne10, ne11, batches1, str_a0, str_a1, str_a2, str_b0, str_b1,
|
2850
|
+
str_b2, nb2 / sizeof(float));
|
2851
|
+
} else if (cont_batches_dim3_a && cont_batches_dim3_b) {
|
2852
|
+
// This case is similar to the one above with the difference that only the batch in dimension 3 is used and the dimension 2 is of size 1.
|
2853
|
+
int64_t batches0 = ne02 * ne03;
|
2854
|
+
int64_t batches1 = ne12 * ne13;
|
2855
|
+
int64_t str_a3 = nb03 / type_size_src0;
|
2856
|
+
int64_t str_b3 = nb13 / type_size_src1;
|
2857
|
+
launch_gemm_for_batches(src0_f16, src1_f16, dst_ddf, ne00, ne01, batches0,
|
2858
|
+
ne10, ne11, batches1, str_a0, str_a1, str_a3, str_b0, str_b1,
|
2859
|
+
str_b3, nb2 / sizeof(float));
|
2860
|
+
} else {
|
2861
|
+
for (int64_t b_a = 0; b_a < ne03; b_a++) {
|
2862
|
+
const sycl::half *src0_f16_shifted
|
2863
|
+
= src0_f16 + (nb03 * b_a / type_size_src0);
|
2864
|
+
const sycl::half *src1_f16_shifted
|
2865
|
+
= src1_f16 + (nb13 * b_a / type_size_src1);
|
2866
|
+
float *dst_shifted = dst_ddf + (nb3 * b_a / sizeof(float));
|
2867
|
+
int64_t batches0 = ne02;
|
2868
|
+
int64_t batches1 = ne12;
|
2869
|
+
launch_gemm_for_batches(src0_f16_shifted, src1_f16_shifted, dst_shifted,
|
2870
|
+
ne00, ne01, batches0, ne10, ne11, batches1, str_a0, str_a1,
|
2871
|
+
str_a2, str_b0, str_b1, str_b2, nb2 / sizeof(float));
|
2869
2872
|
}
|
2870
2873
|
}
|
2871
|
-
|
2874
|
+
|
2872
2875
|
}
|
2873
2876
|
else
|
2874
2877
|
#endif
|
2875
2878
|
{
|
2876
|
-
if (r2 == 1 && r3 == 1 &&
|
2879
|
+
if (r2 == 1 && r3 == 1 && is_src0_cont_2 && is_src1_cont_2) {
|
2880
|
+
// with a [0, 2, 1, 3] perm. and ne02==1 the matrix strides need to be determined from dim 3:
|
2881
|
+
const int64_t sma = ne02 == 1 ? nb03/nb00 : nb02/nb00;
|
2882
|
+
const int64_t smb = ne12 == 1 ? s13 : s12;
|
2883
|
+
|
2877
2884
|
// there is no broadcast and src0, src1 are contiguous across dims 2, 3
|
2878
2885
|
SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch(*queue, oneapi::math::transpose::trans,
|
2879
2886
|
oneapi::math::transpose::nontrans, ne01, ne11, ne10, alpha,
|
2880
|
-
src0_f16, dpct::library_data_t::real_half, nb01 / nb00,
|
2881
|
-
src1_f16, dpct::library_data_t::real_half, s11,
|
2887
|
+
src0_f16, dpct::library_data_t::real_half, nb01 / nb00, sma,
|
2888
|
+
src1_f16, dpct::library_data_t::real_half, s11, smb, beta, dst_ddf,
|
2882
2889
|
mkl_data_type, ne0, ne1 * ne0, ne12 * ne13, mkl_compute_type)));
|
2883
2890
|
} else {
|
2884
2891
|
const int ne23 = ne12 * ne13;
|
@@ -2928,6 +2935,7 @@ inline bool ggml_sycl_supports_reorder_mul_mat_sycl(enum ggml_type type) {
|
|
2928
2935
|
case GGML_TYPE_Q4_0:
|
2929
2936
|
return true;
|
2930
2937
|
case GGML_TYPE_Q4_K:
|
2938
|
+
case GGML_TYPE_Q6_K:
|
2931
2939
|
return !g_ggml_sycl_prioritize_dmmv;
|
2932
2940
|
default:
|
2933
2941
|
return false;
|
@@ -2947,6 +2955,7 @@ inline bool ggml_sycl_supports_reorder_mmvq(enum ggml_type type) {
|
|
2947
2955
|
switch (type) {
|
2948
2956
|
case GGML_TYPE_Q4_0:
|
2949
2957
|
case GGML_TYPE_Q4_K:
|
2958
|
+
case GGML_TYPE_Q6_K:
|
2950
2959
|
return true;
|
2951
2960
|
default:
|
2952
2961
|
return false;
|
@@ -3031,6 +3040,50 @@ static void reorder_qw_q4_k(uint8_t * data_device, size_t size, size_t offset, d
|
|
3031
3040
|
sycl::free(tmp_buf, *stream);
|
3032
3041
|
}
|
3033
3042
|
|
3043
|
+
static void reorder_qw_q6_k(uint8_t * data_device, size_t size, size_t offset, dpct::queue_ptr stream) {
|
3044
|
+
GGML_ASSERT(size % sizeof(block_q6_K) == 0);
|
3045
|
+
GGML_ASSERT(offset % sizeof(block_q6_K) == 0);
|
3046
|
+
|
3047
|
+
const int nblocks = size / sizeof(block_q6_K);
|
3048
|
+
|
3049
|
+
auto * tmp_buf = sycl::malloc_shared<uint8_t>(size, *stream);
|
3050
|
+
SYCL_CHECK(CHECK_TRY_ERROR((*stream).memcpy(tmp_buf, data_device, size).wait()));
|
3051
|
+
|
3052
|
+
auto * ql_ptr = data_device;
|
3053
|
+
auto * qh_ptr = ql_ptr + (QK_K / 2) * nblocks;
|
3054
|
+
auto * scales_ptr = qh_ptr + (QK_K / 4) * nblocks;
|
3055
|
+
sycl::half * dm_ptr = (sycl::half *) (scales_ptr + (QK_K / 16) * nblocks);
|
3056
|
+
|
3057
|
+
stream
|
3058
|
+
->parallel_for(nblocks,
|
3059
|
+
[=](auto i) {
|
3060
|
+
const block_q6_K * x = (const block_q6_K *) tmp_buf;
|
3061
|
+
const int ib = i;
|
3062
|
+
|
3063
|
+
const uint8_t * ql = x[ib].ql;
|
3064
|
+
const uint8_t * qh = x[ib].qh;
|
3065
|
+
uint8_t * base_ql_ptr = ql_ptr + (QK_K / 2) * ib;
|
3066
|
+
uint8_t * base_qh_ptr = qh_ptr + (QK_K / 4) * ib;
|
3067
|
+
uint8_t * base_scales_ptr = scales_ptr + (QK_K / 16) * ib;
|
3068
|
+
|
3069
|
+
for (int j = 0; j < QK_K / 2; ++j) {
|
3070
|
+
base_ql_ptr[j] = ql[j];
|
3071
|
+
}
|
3072
|
+
for (int j = 0; j < QK_K / 4; ++j) {
|
3073
|
+
base_qh_ptr[j] = qh[j];
|
3074
|
+
}
|
3075
|
+
|
3076
|
+
for (int j = 0; j < QK_K / 16; ++j) {
|
3077
|
+
base_scales_ptr[j] = x[ib].scales[j];
|
3078
|
+
}
|
3079
|
+
|
3080
|
+
dm_ptr[ib] = x[ib].d;
|
3081
|
+
})
|
3082
|
+
.wait_and_throw();
|
3083
|
+
|
3084
|
+
sycl::free(tmp_buf, *stream);
|
3085
|
+
}
|
3086
|
+
|
3034
3087
|
static void reorder_qw(const ggml_tensor * src0, dpct::queue_ptr stream) {
|
3035
3088
|
uint8_t * data_device = (uint8_t *) src0->data;
|
3036
3089
|
size_t ncols = src0->ne[0];
|
@@ -3044,6 +3097,9 @@ static void reorder_qw(const ggml_tensor * src0, dpct::queue_ptr stream) {
|
|
3044
3097
|
case GGML_TYPE_Q4_K:
|
3045
3098
|
reorder_qw_q4_k(data_device, size, 0, stream);
|
3046
3099
|
break;
|
3100
|
+
case GGML_TYPE_Q6_K:
|
3101
|
+
reorder_qw_q6_k(data_device, size, 0, stream);
|
3102
|
+
break;
|
3047
3103
|
default:
|
3048
3104
|
GGML_ABORT("reorder_qw() called with unsupported type");
|
3049
3105
|
break;
|
@@ -3159,26 +3215,27 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor
|
|
3159
3215
|
// The kernel from the if path is faster for that specific case, but does not support all mul mats.
|
3160
3216
|
ggml_sycl_mul_mat_batched_sycl(ctx, src0, src1, dst);
|
3161
3217
|
}
|
3162
|
-
} else if (!split && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) &&
|
3218
|
+
} else if (!split && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1 && src1->ne[3] == 1) {
|
3163
3219
|
// KQV single-batch
|
3164
3220
|
ggml_sycl_mul_mat_vec_nc(ctx, src0, src1, dst);
|
3165
|
-
} else if (!split && src0->type == GGML_TYPE_F16 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
|
3221
|
+
} else if (!split && src0->type == GGML_TYPE_F16 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2] * src1->ne[3] > 1) {
|
3166
3222
|
// KQ + KQV multi-batch
|
3167
3223
|
ggml_sycl_mul_mat_batched_sycl(ctx, src0, src1, dst);
|
3168
3224
|
} else if (use_dequantize_mul_mat_vec) {
|
3169
|
-
constexpr bool convert_src1_to_q8_1 = false;
|
3170
3225
|
opt_for_reorder(&ctx, src0, src1, dst, mul_mat_algo::DMMV);
|
3171
|
-
ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_dequantize_mul_mat_vec
|
3226
|
+
ggml_sycl_op_mul_mat<no_quantize_q8_1>(ctx, src0, src1, dst, ggml_sycl_op_dequantize_mul_mat_vec);
|
3172
3227
|
} else if (use_mul_mat_vec_q) {
|
3173
|
-
constexpr bool convert_src1_to_q8_1 = true;
|
3174
3228
|
opt_for_reorder(&ctx, src0, src1, dst, mul_mat_algo::MMVQ);
|
3175
|
-
|
3229
|
+
ggml_tensor_extra_gpu * extra = static_cast<ggml_tensor_extra_gpu *>(src0->extra);
|
3230
|
+
if (extra && extra->optimized_feature.reorder) {
|
3231
|
+
ggml_sycl_op_mul_mat<quantize_and_reorder_q8_1_soa>(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_vec_q);
|
3232
|
+
} else {
|
3233
|
+
ggml_sycl_op_mul_mat<quantize_q8_1>(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_vec_q);
|
3234
|
+
}
|
3176
3235
|
} else if (use_mul_mat_q) {
|
3177
|
-
|
3178
|
-
ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_q, convert_src1_to_q8_1);
|
3236
|
+
ggml_sycl_op_mul_mat<quantize_q8_1>(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_q);
|
3179
3237
|
} else {
|
3180
|
-
|
3181
|
-
ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_sycl, convert_src1_to_q8_1);
|
3238
|
+
ggml_sycl_op_mul_mat<no_quantize_q8_1>(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_sycl);
|
3182
3239
|
}
|
3183
3240
|
}
|
3184
3241
|
|
@@ -3345,8 +3402,11 @@ static void ggml_sycl_mul_mat_id(ggml_backend_sycl_context & ctx,
|
|
3345
3402
|
SYCL_CHECK(CHECK_TRY_ERROR(
|
3346
3403
|
stream->memset(dev_cur_src1_row.get(), 0, sizeof(int))));
|
3347
3404
|
|
3405
|
+
const unsigned int max_work_group_size = ggml_sycl_info().max_work_group_sizes[ctx.device];
|
3406
|
+
assert(max_work_group_size % (WARP_SIZE * WARP_SIZE) == 0);
|
3407
|
+
|
3348
3408
|
{
|
3349
|
-
sycl::range<3> block_dims(1, 1, std::min((unsigned int)ne10,
|
3409
|
+
sycl::range<3> block_dims(1, 1, std::min((unsigned int)ne10, max_work_group_size));
|
3350
3410
|
sycl::range<3> grid_dims(1, n_ids, ids->ne[1]);
|
3351
3411
|
stream->submit([&](sycl::handler &cgh) {
|
3352
3412
|
sycl::local_accessor<int, 0> src1_row_acc(cgh);
|
@@ -3391,7 +3451,7 @@ static void ggml_sycl_mul_mat_id(ggml_backend_sycl_context & ctx,
|
|
3391
3451
|
ggml_sycl_mul_mat(ctx, &src0_row, &src1_row, &dst_row);
|
3392
3452
|
|
3393
3453
|
{
|
3394
|
-
sycl::range<3> block_dims(1, 1, std::min((unsigned int)ne0,
|
3454
|
+
sycl::range<3> block_dims(1, 1, std::min((unsigned int)ne0, max_work_group_size));
|
3395
3455
|
sycl::range<3> grid_dims(1, 1, num_src1_rows);
|
3396
3456
|
stream->submit([&](sycl::handler &cgh) {
|
3397
3457
|
const char *__restrict dst_contiguous_get =
|
@@ -3504,6 +3564,9 @@ static bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct gg
|
|
3504
3564
|
case GGML_OP_GET_ROWS:
|
3505
3565
|
ggml_sycl_get_rows(ctx, dst);
|
3506
3566
|
break;
|
3567
|
+
case GGML_OP_SET_ROWS:
|
3568
|
+
ggml_sycl_op_set_rows(ctx, dst);
|
3569
|
+
break;
|
3507
3570
|
case GGML_OP_DUP:
|
3508
3571
|
ggml_sycl_dup(ctx, dst);
|
3509
3572
|
break;
|
@@ -3514,6 +3577,9 @@ static bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct gg
|
|
3514
3577
|
case GGML_OP_SUB:
|
3515
3578
|
ggml_sycl_sub(ctx, dst);
|
3516
3579
|
break;
|
3580
|
+
case GGML_OP_COUNT_EQUAL:
|
3581
|
+
ggml_sycl_count_equal(ctx, dst);
|
3582
|
+
break;
|
3517
3583
|
case GGML_OP_ACC:
|
3518
3584
|
ggml_sycl_acc(ctx, dst);
|
3519
3585
|
break;
|
@@ -3543,6 +3609,9 @@ static bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct gg
|
|
3543
3609
|
case GGML_UNARY_OP_GELU_QUICK:
|
3544
3610
|
ggml_sycl_gelu_quick(ctx, dst);
|
3545
3611
|
break;
|
3612
|
+
case GGML_UNARY_OP_GELU_ERF:
|
3613
|
+
ggml_sycl_gelu_erf(ctx, dst);
|
3614
|
+
break;
|
3546
3615
|
case GGML_UNARY_OP_TANH:
|
3547
3616
|
ggml_sycl_tanh(ctx, dst);
|
3548
3617
|
break;
|
@@ -3574,6 +3643,27 @@ static bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct gg
|
|
3574
3643
|
return false;
|
3575
3644
|
}
|
3576
3645
|
break;
|
3646
|
+
case GGML_OP_GLU:
|
3647
|
+
switch (ggml_get_glu_op(dst)) {
|
3648
|
+
case GGML_GLU_OP_REGLU:
|
3649
|
+
ggml_sycl_reglu(ctx, dst);
|
3650
|
+
break;
|
3651
|
+
case GGML_GLU_OP_GEGLU:
|
3652
|
+
ggml_sycl_geglu(ctx, dst);
|
3653
|
+
break;
|
3654
|
+
case GGML_GLU_OP_SWIGLU:
|
3655
|
+
ggml_sycl_swiglu(ctx, dst);
|
3656
|
+
break;
|
3657
|
+
case GGML_GLU_OP_GEGLU_ERF:
|
3658
|
+
ggml_sycl_geglu_erf(ctx, dst);
|
3659
|
+
break;
|
3660
|
+
case GGML_GLU_OP_GEGLU_QUICK:
|
3661
|
+
ggml_sycl_geglu_quick(ctx, dst);
|
3662
|
+
break;
|
3663
|
+
default:
|
3664
|
+
return false;
|
3665
|
+
}
|
3666
|
+
break;
|
3577
3667
|
case GGML_OP_NORM:
|
3578
3668
|
ggml_sycl_norm(ctx, dst);
|
3579
3669
|
break;
|
@@ -3752,7 +3842,7 @@ static void ggml_backend_sycl_set_tensor_async(ggml_backend_t backend,
|
|
3752
3842
|
const void *data, size_t offset,
|
3753
3843
|
size_t size) try {
|
3754
3844
|
GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
|
3755
|
-
|
3845
|
+
GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": tensor", tensor).c_str());
|
3756
3846
|
GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset);
|
3757
3847
|
ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
|
3758
3848
|
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
|
@@ -3773,7 +3863,7 @@ static void ggml_backend_sycl_get_tensor_async(ggml_backend_t backend,
|
|
3773
3863
|
void *data, size_t offset,
|
3774
3864
|
size_t size) try {
|
3775
3865
|
GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
|
3776
|
-
|
3866
|
+
GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": tensor", tensor).c_str());
|
3777
3867
|
GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset);
|
3778
3868
|
ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
|
3779
3869
|
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
|
@@ -3796,8 +3886,8 @@ static bool ggml_backend_sycl_cpy_tensor_async(ggml_backend_t backend,
|
|
3796
3886
|
bool is_cpy_supported = dst->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device) &&
|
3797
3887
|
ggml_backend_buffer_is_sycl(src->buffer);
|
3798
3888
|
GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
|
3799
|
-
|
3800
|
-
|
3889
|
+
GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": dst", dst).c_str());
|
3890
|
+
GGML_SYCL_DEBUG("%s", debug_get_tensor_str(" src", src).c_str());
|
3801
3891
|
GGML_SYCL_DEBUG(" is_cpy_supported=%d\n", is_cpy_supported);
|
3802
3892
|
if (is_cpy_supported) {
|
3803
3893
|
/*
|
@@ -3983,6 +4073,7 @@ static ggml_backend_i ggml_backend_sycl_interface = {
|
|
3983
4073
|
/* .graph_compute = */ ggml_backend_sycl_graph_compute,
|
3984
4074
|
/* .event_record = */ ggml_backend_sycl_event_record,
|
3985
4075
|
/* .event_wait = */ ggml_backend_sycl_event_wait,
|
4076
|
+
/* .graph_optimize = */ NULL,
|
3986
4077
|
};
|
3987
4078
|
|
3988
4079
|
static ggml_guid_t ggml_backend_sycl_guid() {
|
@@ -4096,6 +4187,7 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
|
|
4096
4187
|
case GGML_UNARY_OP_HARDSIGMOID:
|
4097
4188
|
case GGML_UNARY_OP_HARDSWISH:
|
4098
4189
|
case GGML_UNARY_OP_GELU_QUICK:
|
4190
|
+
case GGML_UNARY_OP_GELU_ERF:
|
4099
4191
|
case GGML_UNARY_OP_TANH:
|
4100
4192
|
case GGML_UNARY_OP_EXP:
|
4101
4193
|
case GGML_UNARY_OP_SGN:
|
@@ -4109,18 +4201,24 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
|
|
4109
4201
|
default:
|
4110
4202
|
return false;
|
4111
4203
|
}
|
4204
|
+
case GGML_OP_GLU:
|
4205
|
+
switch (ggml_get_glu_op(op)) {
|
4206
|
+
case GGML_GLU_OP_REGLU:
|
4207
|
+
case GGML_GLU_OP_GEGLU:
|
4208
|
+
case GGML_GLU_OP_SWIGLU:
|
4209
|
+
case GGML_GLU_OP_GEGLU_ERF:
|
4210
|
+
case GGML_GLU_OP_GEGLU_QUICK:
|
4211
|
+
return ggml_is_contiguous_1(op->src[0]);
|
4212
|
+
default:
|
4213
|
+
return false;
|
4214
|
+
}
|
4215
|
+
break;
|
4112
4216
|
case GGML_OP_MUL_MAT:
|
4113
4217
|
case GGML_OP_MUL_MAT_ID:
|
4114
4218
|
{
|
4115
|
-
struct ggml_tensor * a;
|
4116
|
-
struct ggml_tensor * b;
|
4117
|
-
|
4118
|
-
a = op->src[0];
|
4119
|
-
b = op->src[1];
|
4120
|
-
} else {
|
4121
|
-
a = op->src[2];
|
4122
|
-
b = op->src[1];
|
4123
|
-
}
|
4219
|
+
struct ggml_tensor * a = op->src[0];
|
4220
|
+
struct ggml_tensor * b = op->src[1];
|
4221
|
+
|
4124
4222
|
if (a->ne[3] != b->ne[3]) {
|
4125
4223
|
return false;
|
4126
4224
|
}
|
@@ -4135,7 +4233,18 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
|
|
4135
4233
|
}
|
4136
4234
|
}
|
4137
4235
|
ggml_type src0_type = op->src[0]->type;
|
4138
|
-
if (src0_type == GGML_TYPE_BF16) {
|
4236
|
+
if (src0_type == GGML_TYPE_BF16 || src0_type == GGML_TYPE_MXFP4) {
|
4237
|
+
// TODO: support MXFP4
|
4238
|
+
// FIXME: keep a list of supported types to avoid breaking the backend when a new type is added
|
4239
|
+
return false;
|
4240
|
+
}
|
4241
|
+
// TODO: The configuration below needs more work to be supported with oneDNN
|
4242
|
+
if (ggml_is_permuted(a) && !ggml_is_contiguous(a) && a->ne[2] > 1 && a->ne[3] > 1) {
|
4243
|
+
return false;
|
4244
|
+
}
|
4245
|
+
// TODO: This specific configuration can fail with oneDNN and needs more debugging
|
4246
|
+
if (!ggml_is_permuted(a) && ggml_is_permuted(b) && b->ne[2] > 1 && b->ne[3] > 1 &&
|
4247
|
+
a->ne[0] > 128 && a->ne[2] == 1 && src0_type == GGML_TYPE_F16) {
|
4139
4248
|
return false;
|
4140
4249
|
}
|
4141
4250
|
return true;
|
@@ -4157,10 +4266,21 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
|
|
4157
4266
|
return false;
|
4158
4267
|
}
|
4159
4268
|
}
|
4269
|
+
case GGML_OP_SET_ROWS:
|
4270
|
+
{
|
4271
|
+
return ((op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16 || op->type == GGML_TYPE_BF16 ||
|
4272
|
+
op->type == GGML_TYPE_Q8_0 || op->type == GGML_TYPE_Q5_1 || op->type == GGML_TYPE_Q5_0 ||
|
4273
|
+
op->type == GGML_TYPE_Q4_1 || op->type == GGML_TYPE_Q4_0 || op->type == GGML_TYPE_IQ4_NL) &&
|
4274
|
+
(op->src[1]->type == GGML_TYPE_I64 || op->src[1]->type == GGML_TYPE_I32));
|
4275
|
+
}
|
4276
|
+
break;
|
4160
4277
|
case GGML_OP_CPY:
|
4161
4278
|
{
|
4162
4279
|
ggml_type src0_type = op->src[0]->type;
|
4163
4280
|
ggml_type src1_type = op->src[1]->type;
|
4281
|
+
if (src0_type == src1_type && (ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1])) && src0_type != GGML_TYPE_BF16) {
|
4282
|
+
return true;
|
4283
|
+
}
|
4164
4284
|
if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) {
|
4165
4285
|
return true;
|
4166
4286
|
}
|
@@ -4206,6 +4326,21 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
|
|
4206
4326
|
if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_IQ4_NL) {
|
4207
4327
|
return true;
|
4208
4328
|
}
|
4329
|
+
if(src0_type == GGML_TYPE_Q8_0 && src1_type == GGML_TYPE_Q8_0) {
|
4330
|
+
return true;
|
4331
|
+
}
|
4332
|
+
if(src0_type == GGML_TYPE_Q5_0 && src1_type == GGML_TYPE_Q5_0) {
|
4333
|
+
return true;
|
4334
|
+
}
|
4335
|
+
if(src0_type == GGML_TYPE_Q5_1 && src1_type == GGML_TYPE_Q5_1) {
|
4336
|
+
return true;
|
4337
|
+
}
|
4338
|
+
if(src0_type == GGML_TYPE_Q4_0 && src1_type == GGML_TYPE_Q4_0) {
|
4339
|
+
return true;
|
4340
|
+
}
|
4341
|
+
if(src0_type == GGML_TYPE_Q4_1 && src1_type == GGML_TYPE_Q4_1) {
|
4342
|
+
return true;
|
4343
|
+
}
|
4209
4344
|
return false;
|
4210
4345
|
}
|
4211
4346
|
case GGML_OP_CONCAT:
|
@@ -4224,6 +4359,7 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
|
|
4224
4359
|
case GGML_OP_ADD:
|
4225
4360
|
case GGML_OP_ADD1:
|
4226
4361
|
case GGML_OP_SUB:
|
4362
|
+
case GGML_OP_COUNT_EQUAL:
|
4227
4363
|
case GGML_OP_MUL:
|
4228
4364
|
case GGML_OP_DIV:
|
4229
4365
|
case GGML_OP_REPEAT:
|
@@ -4240,37 +4376,44 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
|
|
4240
4376
|
return (op->type == GGML_TYPE_F32 && op->src[0]->type == GGML_TYPE_F32) && (op->type == op->src[0]->type);
|
4241
4377
|
#endif
|
4242
4378
|
case GGML_OP_NORM:
|
4243
|
-
case GGML_OP_RMS_NORM:
|
4244
4379
|
return true;
|
4245
4380
|
case GGML_OP_L2_NORM:
|
4246
4381
|
case GGML_OP_GROUP_NORM:
|
4247
4382
|
return ggml_is_contiguous(op->src[0]);
|
4383
|
+
case GGML_OP_RMS_NORM:
|
4384
|
+
return ((op->src[0]->ne[0] % WARP_SIZE) == 0);
|
4248
4385
|
case GGML_OP_SCALE:
|
4249
4386
|
return true;
|
4250
4387
|
case GGML_OP_CONT:
|
4251
4388
|
return op->src[0]->type != GGML_TYPE_BF16;
|
4252
|
-
case GGML_OP_DIAG_MASK_INF:
|
4253
4389
|
case GGML_OP_SOFT_MAX:
|
4254
|
-
|
4255
|
-
|
4256
|
-
|
4257
|
-
|
4258
|
-
|
4259
|
-
|
4260
|
-
|
4261
|
-
}
|
4262
|
-
return true;
|
4390
|
+
// TODO: support batching
|
4391
|
+
if (op->src[0]->ne[3] != 1) {
|
4392
|
+
return false;
|
4393
|
+
}
|
4394
|
+
// TODO: support attention sinks [TAG_ATTN_SINKS]
|
4395
|
+
if (op->src[2]) {
|
4396
|
+
return false;
|
4263
4397
|
}
|
4398
|
+
// TODO: support broadcast
|
4399
|
+
// ref: https://github.com/ggml-org/llama.cpp/pull/14435
|
4400
|
+
return !op->src[1] || (op->src[1]->ne[2] == 1 && op->src[1]->ne[3] == 1);
|
4401
|
+
case GGML_OP_DIAG_MASK_INF:
|
4402
|
+
case GGML_OP_ROPE:
|
4264
4403
|
case GGML_OP_IM2COL:
|
4265
4404
|
return true;
|
4266
4405
|
case GGML_OP_UPSCALE:
|
4267
4406
|
return op->src[0]->type == GGML_TYPE_F32 && op->op_params[0] == GGML_SCALE_MODE_NEAREST;
|
4268
|
-
case GGML_OP_POOL_2D:
|
4269
4407
|
case GGML_OP_SUM:
|
4270
4408
|
case GGML_OP_SUM_ROWS:
|
4271
4409
|
case GGML_OP_ARGSORT:
|
4410
|
+
return ggml_is_contiguous(op->src[0]);
|
4411
|
+
case GGML_OP_POOL_2D:
|
4272
4412
|
case GGML_OP_ACC:
|
4413
|
+
return true;
|
4273
4414
|
case GGML_OP_PAD:
|
4415
|
+
return (ggml_get_op_params_i32(op, 0) == 0) && (ggml_get_op_params_i32(op, 2) == 0) &&
|
4416
|
+
(ggml_get_op_params_i32(op, 4) == 0) && (ggml_get_op_params_i32(op, 6) == 0);
|
4274
4417
|
case GGML_OP_LEAKY_RELU:
|
4275
4418
|
case GGML_OP_TIMESTEP_EMBEDDING:
|
4276
4419
|
case GGML_OP_RWKV_WKV6:
|
@@ -4481,10 +4624,10 @@ ggml_backend_t ggml_backend_sycl_init(int device) {
|
|
4481
4624
|
};
|
4482
4625
|
|
4483
4626
|
ggml_backend_t sycl_backend = new ggml_backend {
|
4484
|
-
/* .guid
|
4485
|
-
/* .
|
4486
|
-
/* .device
|
4487
|
-
/* .context
|
4627
|
+
/* .guid = */ ggml_backend_sycl_guid(),
|
4628
|
+
/* .iface = */ ggml_backend_sycl_interface,
|
4629
|
+
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_sycl_reg(), device),
|
4630
|
+
/* .context = */ ctx
|
4488
4631
|
};
|
4489
4632
|
|
4490
4633
|
return sycl_backend;
|