whispercpp 1.3.2 → 1.3.4
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +6 -3
- data/README.md +71 -14
- data/Rakefile +20 -7
- data/ext/.gitignore +4 -6
- data/ext/dependencies.rb +36 -24
- data/ext/extconf.rb +1 -1
- data/ext/options.rb +48 -184
- data/ext/ruby_whisper.c +18 -0
- data/ext/ruby_whisper_context.c +43 -12
- data/ext/ruby_whisper_model.c +1 -1
- data/ext/ruby_whisper_params.c +59 -27
- data/ext/ruby_whisper_segment.c +81 -4
- data/ext/ruby_whisper_transcribe.cpp +13 -7
- data/ext/ruby_whisper_vad_params.c +1 -1
- data/ext/sources/CMakeLists.txt +5 -1
- data/ext/sources/bindings/javascript/package.json +1 -1
- data/ext/sources/build-xcframework.sh +24 -0
- data/ext/sources/examples/CMakeLists.txt +1 -0
- data/ext/sources/examples/addon.node/__test__/whisper.spec.js +120 -24
- data/ext/sources/examples/addon.node/addon.cpp +154 -35
- data/ext/sources/examples/addon.node/index.js +10 -5
- data/ext/sources/examples/addon.node/vad-example.js +132 -0
- data/ext/sources/examples/bench/bench.cpp +29 -18
- data/ext/sources/examples/bench.wasm/index-tmpl.html +10 -9
- data/ext/sources/examples/cli/cli.cpp +7 -4
- data/ext/sources/examples/command/command.cpp +58 -32
- data/ext/sources/examples/command.wasm/index-tmpl.html +5 -4
- data/ext/sources/examples/common-ggml.cpp +2 -0
- data/ext/sources/examples/common-whisper.cpp +14 -7
- data/ext/sources/examples/lsp/lsp.cpp +21 -17
- data/ext/sources/examples/quantize/quantize.cpp +3 -0
- data/ext/sources/examples/server/CMakeLists.txt +3 -0
- data/ext/sources/examples/server/server.cpp +193 -35
- data/ext/sources/examples/server.py +6 -1
- data/ext/sources/examples/stream/stream.cpp +10 -2
- data/ext/sources/examples/stream.wasm/emscripten.cpp +6 -6
- data/ext/sources/examples/stream.wasm/index-tmpl.html +82 -5
- data/ext/sources/examples/talk-llama/CMakeLists.txt +3 -0
- data/ext/sources/examples/talk-llama/llama-adapter.cpp +101 -4
- data/ext/sources/examples/talk-llama/llama-adapter.h +6 -0
- data/ext/sources/examples/talk-llama/llama-arch.cpp +756 -15
- data/ext/sources/examples/talk-llama/llama-arch.h +85 -1
- data/ext/sources/examples/talk-llama/llama-batch.cpp +773 -272
- data/ext/sources/examples/talk-llama/llama-batch.h +126 -55
- data/ext/sources/examples/talk-llama/llama-chat.cpp +150 -13
- data/ext/sources/examples/talk-llama/llama-chat.h +8 -0
- data/ext/sources/examples/talk-llama/llama-context.cpp +814 -542
- data/ext/sources/examples/talk-llama/llama-context.h +68 -32
- data/ext/sources/examples/talk-llama/llama-cparams.cpp +1 -1
- data/ext/sources/examples/talk-llama/llama-cparams.h +4 -4
- data/ext/sources/examples/talk-llama/llama-graph.cpp +787 -440
- data/ext/sources/examples/talk-llama/llama-graph.h +333 -153
- data/ext/sources/examples/talk-llama/llama-hparams.cpp +128 -6
- data/ext/sources/examples/talk-llama/llama-hparams.h +80 -17
- data/ext/sources/examples/talk-llama/llama-impl.h +2 -0
- data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.cpp +326 -0
- data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.h +137 -0
- data/ext/sources/examples/talk-llama/llama-kv-cache.cpp +1248 -1967
- data/ext/sources/examples/talk-llama/llama-kv-cache.h +218 -345
- data/ext/sources/examples/talk-llama/llama-kv-cells.h +164 -52
- data/ext/sources/examples/talk-llama/llama-memory-hybrid.cpp +266 -0
- data/ext/sources/examples/talk-llama/llama-memory-hybrid.h +139 -0
- data/ext/sources/examples/talk-llama/llama-memory-recurrent.cpp +1154 -0
- data/ext/sources/examples/talk-llama/llama-memory-recurrent.h +182 -0
- data/ext/sources/examples/talk-llama/llama-memory.cpp +58 -0
- data/ext/sources/examples/talk-llama/llama-memory.h +94 -4
- data/ext/sources/examples/talk-llama/llama-mmap.cpp +1 -1
- data/ext/sources/examples/talk-llama/llama-model-loader.cpp +44 -17
- data/ext/sources/examples/talk-llama/llama-model-loader.h +3 -2
- data/ext/sources/examples/talk-llama/llama-model-saver.cpp +1 -0
- data/ext/sources/examples/talk-llama/llama-model.cpp +11377 -5248
- data/ext/sources/examples/talk-llama/llama-model.h +87 -9
- data/ext/sources/examples/talk-llama/llama-quant.cpp +137 -16
- data/ext/sources/examples/talk-llama/llama-sampling.cpp +226 -126
- data/ext/sources/examples/talk-llama/llama-vocab.cpp +502 -38
- data/ext/sources/examples/talk-llama/llama-vocab.h +46 -0
- data/ext/sources/examples/talk-llama/llama.cpp +76 -17
- data/ext/sources/examples/talk-llama/llama.h +176 -151
- data/ext/sources/examples/talk-llama/talk-llama.cpp +11 -6
- data/ext/sources/examples/talk-llama/unicode.cpp +212 -0
- data/ext/sources/examples/talk-llama/unicode.h +45 -0
- data/ext/sources/examples/vad-speech-segments/speech.cpp +6 -0
- data/ext/sources/examples/wchess/wchess.cmd/wchess.cmd.cpp +6 -2
- data/ext/sources/examples/whisper.wasm/index-tmpl.html +17 -16
- data/ext/sources/ggml/CMakeLists.txt +106 -33
- data/ext/sources/ggml/cmake/common.cmake +24 -0
- data/ext/sources/ggml/cmake/ggml-config.cmake.in +132 -93
- data/ext/sources/ggml/include/ggml-backend.h +18 -2
- data/ext/sources/ggml/include/ggml-cpu.h +2 -0
- data/ext/sources/ggml/include/ggml-metal.h +1 -6
- data/ext/sources/ggml/include/ggml-opt.h +25 -6
- data/ext/sources/ggml/include/ggml-webgpu.h +19 -0
- data/ext/sources/ggml/include/ggml-zdnn.h +17 -0
- data/ext/sources/ggml/include/ggml.h +365 -21
- data/ext/sources/ggml/src/CMakeLists.txt +98 -25
- data/ext/sources/ggml/src/ggml-alloc.c +265 -141
- data/ext/sources/ggml/src/ggml-backend-impl.h +4 -1
- data/ext/sources/ggml/src/ggml-backend-reg.cpp +35 -13
- data/ext/sources/ggml/src/ggml-backend.cpp +266 -60
- data/ext/sources/ggml/src/ggml-blas/CMakeLists.txt +4 -4
- data/ext/sources/ggml/src/ggml-blas/ggml-blas.cpp +5 -4
- data/ext/sources/ggml/src/ggml-cann/CMakeLists.txt +15 -0
- data/ext/sources/ggml/src/ggml-cann/acl_tensor.cpp +3 -1
- data/ext/sources/ggml/src/ggml-cann/aclnn_ops.cpp +903 -717
- data/ext/sources/ggml/src/ggml-cann/aclnn_ops.h +143 -25
- data/ext/sources/ggml/src/ggml-cann/common.h +149 -2
- data/ext/sources/ggml/src/ggml-cann/ggml-cann.cpp +521 -78
- data/ext/sources/ggml/src/ggml-common.h +21 -0
- data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +165 -50
- data/ext/sources/ggml/src/ggml-cpu/amx/amx.cpp +5 -3
- data/ext/sources/ggml/src/ggml-cpu/amx/mmq.cpp +11 -10
- data/ext/sources/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/arm/quants.c +3650 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/arm/repack.cpp +1891 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/loongarch/quants.c +2160 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/powerpc/quants.c +2305 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/riscv/quants.c +1897 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/riscv/repack.cpp +342 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/s390/quants.c +1468 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/wasm/quants.c +1221 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/x86/quants.c +3820 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/x86/repack.cpp +6307 -0
- data/ext/sources/ggml/src/ggml-cpu/arch-fallback.h +214 -0
- data/ext/sources/ggml/src/ggml-cpu/common.h +18 -3
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-impl.h +23 -7
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +179 -110
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.cpp +44 -33
- data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-hbm.cpp → hbm.cpp} +1 -1
- data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.cpp +152 -18
- data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.h +7 -1
- data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +228 -98
- data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +532 -1124
- data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.h +5 -0
- data/ext/sources/ggml/src/ggml-cpu/ops.cpp +3374 -2081
- data/ext/sources/ggml/src/ggml-cpu/ops.h +13 -8
- data/ext/sources/ggml/src/ggml-cpu/quants.c +1193 -0
- data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-quants.h → quants.h} +34 -0
- data/ext/sources/ggml/src/ggml-cpu/repack.cpp +1982 -0
- data/ext/sources/ggml/src/ggml-cpu/repack.h +120 -0
- data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +367 -46
- data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.cpp +1024 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.h +13 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp +3196 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/ime_kernels.h +26 -0
- data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-traits.cpp → traits.cpp} +3 -3
- data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-traits.h → traits.h} +1 -1
- data/ext/sources/ggml/src/ggml-cpu/vec.cpp +272 -35
- data/ext/sources/ggml/src/ggml-cpu/vec.h +794 -142
- data/ext/sources/ggml/src/ggml-cuda/CMakeLists.txt +20 -16
- data/ext/sources/ggml/src/ggml-cuda/add-id.cu +58 -0
- data/ext/sources/ggml/src/ggml-cuda/add-id.cuh +3 -0
- data/ext/sources/ggml/src/ggml-cuda/binbcast.cu +330 -191
- data/ext/sources/ggml/src/ggml-cuda/binbcast.cuh +2 -0
- data/ext/sources/ggml/src/ggml-cuda/common.cuh +291 -81
- data/ext/sources/ggml/src/ggml-cuda/conv-transpose-1d.cu +1 -4
- data/ext/sources/ggml/src/ggml-cuda/conv2d-dw.cu +161 -0
- data/ext/sources/ggml/src/ggml-cuda/conv2d-dw.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/conv2d-transpose.cu +91 -0
- data/ext/sources/ggml/src/ggml-cuda/conv2d-transpose.cuh +4 -0
- data/ext/sources/ggml/src/ggml-cuda/conv2d.cu +166 -0
- data/ext/sources/ggml/src/ggml-cuda/conv2d.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/convert.cu +117 -22
- data/ext/sources/ggml/src/ggml-cuda/convert.cuh +20 -0
- data/ext/sources/ggml/src/ggml-cuda/cpy-utils.cuh +217 -0
- data/ext/sources/ggml/src/ggml-cuda/cpy.cu +64 -307
- data/ext/sources/ggml/src/ggml-cuda/cross-entropy-loss.cu +2 -14
- data/ext/sources/ggml/src/ggml-cuda/dequantize.cuh +14 -40
- data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +499 -368
- data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +142 -93
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cu +755 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cuh +3 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn-vec.cuh +593 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +90 -50
- data/ext/sources/ggml/src/ggml-cuda/fattn.cu +185 -198
- data/ext/sources/ggml/src/ggml-cuda/fattn.cuh +2 -0
- data/ext/sources/ggml/src/ggml-cuda/getrows.cu +50 -39
- data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +636 -222
- data/ext/sources/ggml/src/ggml-cuda/im2col.cu +196 -35
- data/ext/sources/ggml/src/ggml-cuda/im2col.cuh +1 -0
- data/ext/sources/ggml/src/ggml-cuda/mean.cu +73 -0
- data/ext/sources/ggml/src/ggml-cuda/mean.cuh +3 -0
- data/ext/sources/ggml/src/ggml-cuda/mma.cuh +198 -45
- data/ext/sources/ggml/src/ggml-cuda/mmf.cu +123 -0
- data/ext/sources/ggml/src/ggml-cuda/mmf.cuh +496 -0
- data/ext/sources/ggml/src/ggml-cuda/mmq.cu +206 -57
- data/ext/sources/ggml/src/ggml-cuda/mmq.cuh +1262 -721
- data/ext/sources/ggml/src/ggml-cuda/mmvf.cu +506 -0
- data/ext/sources/ggml/src/ggml-cuda/{mmv.cuh → mmvf.cuh} +4 -5
- data/ext/sources/ggml/src/ggml-cuda/mmvq.cu +64 -73
- data/ext/sources/ggml/src/ggml-cuda/norm.cu +284 -12
- data/ext/sources/ggml/src/ggml-cuda/norm.cuh +7 -0
- data/ext/sources/ggml/src/ggml-cuda/opt-step-sgd.cu +49 -0
- data/ext/sources/ggml/src/ggml-cuda/opt-step-sgd.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/pad.cu +46 -23
- data/ext/sources/ggml/src/ggml-cuda/pad_reflect_1d.cu +91 -0
- data/ext/sources/ggml/src/ggml-cuda/pad_reflect_1d.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/quantize.cu +12 -10
- data/ext/sources/ggml/src/ggml-cuda/reduce_rows.cuh +53 -0
- data/ext/sources/ggml/src/ggml-cuda/roll.cu +67 -0
- data/ext/sources/ggml/src/ggml-cuda/roll.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/rope.cu +21 -27
- data/ext/sources/ggml/src/ggml-cuda/scale.cu +14 -11
- data/ext/sources/ggml/src/ggml-cuda/set-rows.cu +276 -0
- data/ext/sources/ggml/src/ggml-cuda/set-rows.cuh +7 -0
- data/ext/sources/ggml/src/ggml-cuda/softcap.cu +34 -0
- data/ext/sources/ggml/src/ggml-cuda/softcap.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/softmax.cu +126 -59
- data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cu +10 -2
- data/ext/sources/ggml/src/ggml-cuda/ssm-scan.cu +322 -98
- data/ext/sources/ggml/src/ggml-cuda/sum.cu +6 -10
- data/ext/sources/ggml/src/ggml-cuda/sumrows.cu +23 -19
- data/ext/sources/ggml/src/ggml-cuda/sumrows.cuh +0 -1
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-f16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q4_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q4_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q5_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q5_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q8_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-f16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q5_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q5_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q8_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-f16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q4_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q4_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q5_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q5_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q8_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-f16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q4_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q4_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q5_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q5_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q8_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-f16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q4_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q4_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q5_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q5_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q8_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-f16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q4_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q4_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q5_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q5_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q8_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +21 -18
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_10.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_11.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_12.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_13.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_14.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_15.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_16.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_2.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_3.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_4.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_5.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_6.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_7.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_8.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_9.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-mxfp4.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/topk-moe.cu +259 -0
- data/ext/sources/ggml/src/ggml-cuda/topk-moe.cuh +14 -0
- data/ext/sources/ggml/src/ggml-cuda/tsembd.cu +3 -3
- data/ext/sources/ggml/src/ggml-cuda/unary.cu +179 -0
- data/ext/sources/ggml/src/ggml-cuda/unary.cuh +15 -0
- data/ext/sources/ggml/src/ggml-cuda/upscale.cu +92 -6
- data/ext/sources/ggml/src/ggml-cuda/vecdotq.cuh +110 -22
- data/ext/sources/ggml/src/ggml-cuda/vendors/cuda.h +4 -0
- data/ext/sources/ggml/src/ggml-cuda/vendors/hip.h +58 -36
- data/ext/sources/ggml/src/ggml-cuda/vendors/musa.h +4 -3
- data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +14 -2
- data/ext/sources/ggml/src/ggml-impl.h +229 -175
- data/ext/sources/ggml/src/ggml-metal/CMakeLists.txt +21 -17
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-common.cpp +446 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-common.h +52 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.h +33 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.m +600 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.cpp +1376 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.h +226 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.m +1308 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +163 -63
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.cpp +3158 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.h +82 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal.cpp +718 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal.metal +3208 -1575
- data/ext/sources/ggml/src/ggml-musa/CMakeLists.txt +18 -8
- data/ext/sources/ggml/src/ggml-musa/mudnn.cuh +2 -2
- data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +32 -0
- data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +4430 -792
- data/ext/sources/ggml/src/ggml-opencl/kernels/add.cl +107 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/add_id.cl +42 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/argsort.cl +86 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/concat.cl +109 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/conv2d.cl +185 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/conv2d_f16_f32.cl +176 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/cvt.cl +84 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/div.cl +138 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/flash_attn_f16.cl +370 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/flash_attn_f32.cl +370 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/flash_attn_f32_f16.cl +373 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gelu.cl +27 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/glu.cl +378 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/group_norm.cl +121 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/im2col_f16.cl +1 -1
- data/ext/sources/ggml/src/ggml-opencl/kernels/im2col_f32.cl +1 -1
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul.cl +73 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mat_f16_f32.cl +130 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_l4_lm.cl +132 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_f32_f32_l4_lm.cl +133 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32.cl +189 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32_flat.cl +176 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl +283 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32.cl +140 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32_flat.cl +222 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32.cl +144 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32_flat.cl +167 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32.cl +125 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32_flat.cl +202 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/norm.cl +80 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/pad.cl +30 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/repeat.cl +39 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/rms_norm.cl +79 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/scale.cl +3 -2
- data/ext/sources/ggml/src/ggml-opencl/kernels/set_rows.cl +189 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/sigmoid.cl +29 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_4_f16.cl +34 -13
- data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_4_f32.cl +34 -13
- data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_f16.cl +34 -13
- data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_f32.cl +34 -13
- data/ext/sources/ggml/src/ggml-opencl/kernels/sub.cl +138 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/sum_rows.cl +39 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/tanh.cl +63 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/transpose.cl +20 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/tsembd.cl +48 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/upscale.cl +120 -0
- data/ext/sources/ggml/src/ggml-opt.cpp +97 -41
- data/ext/sources/ggml/src/ggml-quants.c +117 -24
- data/ext/sources/ggml/src/ggml-quants.h +6 -0
- data/ext/sources/ggml/src/ggml-rpc/ggml-rpc.cpp +85 -62
- data/ext/sources/ggml/src/ggml-sycl/CMakeLists.txt +3 -3
- data/ext/sources/ggml/src/ggml-sycl/backend.hpp +2 -0
- data/ext/sources/ggml/src/ggml-sycl/binbcast.cpp +9 -0
- data/ext/sources/ggml/src/ggml-sycl/binbcast.hpp +6 -0
- data/ext/sources/ggml/src/ggml-sycl/common.hpp +20 -48
- data/ext/sources/ggml/src/ggml-sycl/concat.cpp +13 -17
- data/ext/sources/ggml/src/ggml-sycl/convert.cpp +21 -2
- data/ext/sources/ggml/src/ggml-sycl/cpy.cpp +116 -211
- data/ext/sources/ggml/src/ggml-sycl/cpy.hpp +213 -1
- data/ext/sources/ggml/src/ggml-sycl/dequantize.hpp +32 -0
- data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +700 -1041
- data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +20 -9
- data/ext/sources/ggml/src/ggml-sycl/gemm.hpp +17 -26
- data/ext/sources/ggml/src/ggml-sycl/getrows.cpp +2 -96
- data/ext/sources/ggml/src/ggml-sycl/ggml-sycl.cpp +393 -250
- data/ext/sources/ggml/src/ggml-sycl/im2col.cpp +1 -1
- data/ext/sources/ggml/src/ggml-sycl/mmvq.cpp +32 -8
- data/ext/sources/ggml/src/ggml-sycl/quantize.hpp +133 -0
- data/ext/sources/ggml/src/ggml-sycl/quants.hpp +38 -11
- data/ext/sources/ggml/src/ggml-sycl/rope.cpp +125 -21
- data/ext/sources/ggml/src/ggml-sycl/set_rows.cpp +234 -0
- data/ext/sources/ggml/src/ggml-sycl/set_rows.hpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/sycl_hw.cpp +3 -1
- data/ext/sources/ggml/src/ggml-sycl/sycl_hw.hpp +3 -0
- data/ext/sources/ggml/src/ggml-sycl/tsembd.cpp +4 -3
- data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +105 -17
- data/ext/sources/ggml/src/ggml-vulkan/CMakeLists.txt +36 -32
- data/ext/sources/ggml/src/ggml-vulkan/ggml-vulkan.cpp +4198 -1145
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -12
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add.comp +41 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add_id.comp +42 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argmax.comp +13 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp +39 -29
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +349 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +98 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +66 -12
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.comp +154 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.comp +21 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_s.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xxs.comp +2 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_s.comp +6 -5
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_xxs.comp +4 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_mxfp4.comp +32 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q2_k.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q3_k.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_k.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_k.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q6_k.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/exp.comp +21 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +69 -24
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp +60 -20
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +98 -42
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +64 -27
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +74 -13
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu.comp +13 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu_erf.comp +27 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu_quick.comp +11 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gelu_erf.comp +39 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/generic_binary_head.comp +4 -17
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/get_rows.comp +19 -10
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/get_rows_quant.comp +25 -15
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.comp +19 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_main.comp +29 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/hardsigmoid.comp +22 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/hardswish.comp +22 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +18 -14
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col_3d.comp +126 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.comp +65 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_nc.comp +11 -7
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp +140 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +144 -531
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +206 -38
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.comp +556 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +12 -5
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.comp +15 -9
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp +111 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_sgd.comp +22 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/pad.comp +24 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/quantize_q8_1.comp +53 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/reglu.comp +9 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +64 -11
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_partials.comp +65 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/roll.comp +46 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.comp +1 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +7 -9
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +7 -9
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +7 -9
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rte.comp +5 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +29 -7
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_back.comp +4 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sqrt.comp +17 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.comp +38 -5
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/swiglu.comp +9 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/swiglu_oai.comp +14 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/timestep_embedding.comp +4 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/types.comp +101 -9
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +69 -5
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/utils.comp +25 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +338 -71
- data/ext/sources/ggml/src/ggml-webgpu/CMakeLists.txt +54 -0
- data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu.cpp +1558 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/add.tmpl.wgsl +44 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/add_in_place.tmpl.wgsl +41 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary_head.tmpl +45 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +930 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl +60 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +124 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/get_rows.tmpl.wgsl +874 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl +40 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul.tmpl.wgsl +44 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_in_place.tmpl.wgsl +41 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl +907 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm.wgsl +57 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm_in_place.wgsl +48 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +81 -0
- data/ext/sources/ggml/src/ggml-zdnn/CMakeLists.txt +36 -0
- data/ext/sources/ggml/src/ggml-zdnn/common.hpp +59 -0
- data/ext/sources/ggml/src/ggml-zdnn/ggml-zdnn.cpp +628 -0
- data/ext/sources/ggml/src/ggml-zdnn/mmf.cpp +80 -0
- data/ext/sources/ggml/src/ggml-zdnn/mmf.hpp +12 -0
- data/ext/sources/ggml/src/ggml-zdnn/utils.cpp +79 -0
- data/ext/sources/ggml/src/ggml-zdnn/utils.hpp +19 -0
- data/ext/sources/ggml/src/ggml.c +802 -142
- data/ext/sources/ggml/src/ggml.cpp +26 -0
- data/ext/sources/ggml/src/gguf.cpp +32 -4
- data/ext/sources/include/whisper.h +2 -0
- data/ext/sources/src/CMakeLists.txt +2 -0
- data/ext/sources/src/coreml/whisper-compat.h +10 -0
- data/ext/sources/src/coreml/whisper-compat.m +35 -0
- data/ext/sources/src/coreml/whisper-decoder-impl.m +1 -0
- data/ext/sources/src/coreml/whisper-encoder-impl.m +1 -0
- data/ext/sources/src/whisper.cpp +241 -215
- data/ext/sources/tests/CMakeLists.txt +8 -1
- data/ext/sources/tests/test-vad-full.cpp +3 -3
- data/ext/sources/tests/test-vad.cpp +2 -2
- data/extsources.rb +15 -9
- data/lib/whisper/context.rb +15 -0
- data/lib/whisper/model/uri.rb +57 -2
- data/lib/whisper/segment.rb +58 -0
- data/sig/whisper.rbs +75 -38
- data/{tests → test}/helper.rb +1 -12
- data/{tests → test}/test_model.rb +9 -0
- data/test/test_package.rb +51 -0
- data/{tests → test}/test_params.rb +8 -0
- data/test/test_segment.rb +146 -0
- data/{tests → test}/test_whisper.rb +70 -0
- data/whispercpp.gemspec +2 -3
- metadata +246 -191
- data/ext/sources/.dockerignore +0 -3
- data/ext/sources/.github/workflows/bindings-ruby.yml +0 -21
- data/ext/sources/ci/run.sh +0 -336
- data/ext/sources/close-issue.yml +0 -28
- data/ext/sources/ggml/include/ggml-kompute.h +0 -50
- data/ext/sources/ggml/src/ggml-amx/CMakeLists.txt +0 -107
- data/ext/sources/ggml/src/ggml-amx/common.h +0 -94
- data/ext/sources/ggml/src/ggml-amx/ggml-amx.cpp +0 -446
- data/ext/sources/ggml/src/ggml-amx/mmq.cpp +0 -2510
- data/ext/sources/ggml/src/ggml-amx/mmq.h +0 -17
- data/ext/sources/ggml/src/ggml-cann/kernels/CMakeLists.txt +0 -30
- data/ext/sources/ggml/src/ggml-cann/kernels/ascendc_kernels.h +0 -19
- data/ext/sources/ggml/src/ggml-cann/kernels/dup.cpp +0 -234
- data/ext/sources/ggml/src/ggml-cann/kernels/get_row_f16.cpp +0 -197
- data/ext/sources/ggml/src/ggml-cann/kernels/get_row_f32.cpp +0 -190
- data/ext/sources/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -204
- data/ext/sources/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -191
- data/ext/sources/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -218
- data/ext/sources/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -216
- data/ext/sources/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -295
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +0 -6431
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +0 -8
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-quants.c +0 -13747
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f16.cu +0 -357
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f16.cuh +0 -3
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f32.cu +0 -365
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f32.cuh +0 -3
- data/ext/sources/ggml/src/ggml-cuda/fattn-vec-f16.cuh +0 -482
- data/ext/sources/ggml/src/ggml-cuda/fattn-vec-f32.cuh +0 -472
- data/ext/sources/ggml/src/ggml-cuda/mmv.cu +0 -336
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-kompute/CMakeLists.txt +0 -166
- data/ext/sources/ggml/src/ggml-kompute/ggml-kompute.cpp +0 -2251
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/common.comp +0 -112
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_add.comp +0 -58
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_addrow.comp +0 -25
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f16.comp +0 -52
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f32.comp +0 -52
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f16.comp +0 -52
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f32.comp +0 -52
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_diagmask.comp +0 -30
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_gelu.comp +0 -22
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows.comp +0 -17
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f16.comp +0 -31
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f32.comp +0 -31
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_0.comp +0 -38
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_1.comp +0 -39
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q6_k.comp +0 -44
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul.comp +0 -52
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_f16.comp +0 -69
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_mat_f32.comp +0 -51
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_0.comp +0 -33
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_1.comp +0 -35
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_k.comp +0 -140
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q6_k.comp +0 -106
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q8_0.comp +0 -73
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n.comp +0 -52
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n_pre.comp +0 -28
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_norm.comp +0 -84
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_relu.comp +0 -21
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rmsnorm.comp +0 -53
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f16.comp +0 -52
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f32.comp +0 -52
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f16.comp +0 -52
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f32.comp +0 -52
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_scale.comp +0 -19
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_scale_8.comp +0 -23
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_silu.comp +0 -22
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_softmax.comp +0 -72
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/rope_common.comp +0 -71
- data/ext/sources/ggml/src/ggml-metal/ggml-metal.m +0 -5998
- data/tests/test_package.rb +0 -46
- data/tests/test_segment.rb +0 -74
- /data/ext/sources/ggml/src/ggml-cpu/{cpu-feats-x86.cpp → arch/x86/cpu-feats.cpp} +0 -0
- /data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-hbm.h → hbm.h} +0 -0
- /data/{tests → test}/jfk_reader/.gitignore +0 -0
- /data/{tests → test}/jfk_reader/extconf.rb +0 -0
- /data/{tests → test}/jfk_reader/jfk_reader.c +0 -0
- /data/{tests → test}/test_callback.rb +0 -0
- /data/{tests → test}/test_error.rb +0 -0
- /data/{tests → test}/test_vad.rb +0 -0
- /data/{tests → test}/test_vad_params.rb +0 -0
@@ -24,6 +24,7 @@
|
|
24
24
|
|
25
25
|
#include <acl/acl.h>
|
26
26
|
#include <stdarg.h>
|
27
|
+
#include <aclnnop/aclnn_trans_matmul_weight.h>
|
27
28
|
|
28
29
|
#include <cmath>
|
29
30
|
#include <cstdio>
|
@@ -31,6 +32,8 @@
|
|
31
32
|
#include <mutex>
|
32
33
|
#include <queue>
|
33
34
|
#include <chrono>
|
35
|
+
#include <unordered_set>
|
36
|
+
#include <optional>
|
34
37
|
|
35
38
|
#include "ggml-impl.h"
|
36
39
|
#include "ggml-backend-impl.h"
|
@@ -72,13 +75,12 @@
|
|
72
75
|
* @param device The device ID to set.
|
73
76
|
*/
|
74
77
|
void ggml_cann_set_device(const int32_t device) {
|
75
|
-
|
76
|
-
|
77
|
-
// ACL_CHECK(aclrtGetDevice(&current_device));
|
78
|
+
int current_device = -1;
|
79
|
+
aclrtGetDevice(&current_device);
|
78
80
|
|
79
|
-
|
80
|
-
|
81
|
-
|
81
|
+
if (device == current_device) {
|
82
|
+
return;
|
83
|
+
}
|
82
84
|
ACL_CHECK(aclrtSetDevice(device));
|
83
85
|
}
|
84
86
|
|
@@ -93,6 +95,44 @@ int32_t ggml_cann_get_device() {
|
|
93
95
|
return id;
|
94
96
|
}
|
95
97
|
|
98
|
+
/**
|
99
|
+
* @brief Get the value of the specified environment variable (name).
|
100
|
+
* if not empty, return a std::string object
|
101
|
+
*/
|
102
|
+
std::optional<std::string> get_env(const std::string& name) {
|
103
|
+
const char* val = std::getenv(name.c_str());
|
104
|
+
if (!val) return std::nullopt;
|
105
|
+
std::string res = std::string(val);
|
106
|
+
std::transform(res.begin(), res.end(), res.begin(), ::tolower);
|
107
|
+
return res;
|
108
|
+
}
|
109
|
+
|
110
|
+
/**
|
111
|
+
* @brief Verify whether the environment variable is a valid value.
|
112
|
+
*/
|
113
|
+
bool parse_bool(const std::string& value) {
|
114
|
+
std::unordered_set<std::string> valid_values = {"on", "1", "yes", "y", "enable", "true"};
|
115
|
+
return valid_values.find(value) != valid_values.end();
|
116
|
+
}
|
117
|
+
|
118
|
+
/**
|
119
|
+
* @brief Parse a string as an integer, returning 0 if invalid.
|
120
|
+
*
|
121
|
+
* This function attempts to convert the input string `value` to an `int`.
|
122
|
+
* If the string is not a valid integer or is out of the `int` range,
|
123
|
+
* it returns 0.
|
124
|
+
*
|
125
|
+
* @param value The string to parse.
|
126
|
+
* @return The parsed integer, or 0 if conversion fails.
|
127
|
+
*/
|
128
|
+
int parse_integer(const std::string& value) {
|
129
|
+
try {
|
130
|
+
return std::stoi(value);
|
131
|
+
} catch (...) {
|
132
|
+
return 0;
|
133
|
+
}
|
134
|
+
}
|
135
|
+
|
96
136
|
/**
|
97
137
|
* @brief Initialize the CANN device information.
|
98
138
|
*
|
@@ -214,7 +254,7 @@ struct ggml_cann_pool_buf_prio : public ggml_cann_pool {
|
|
214
254
|
* @param device The device ID to associate with this buffer pool.
|
215
255
|
*/
|
216
256
|
explicit ggml_cann_pool_buf_prio(int device) : device(device) {
|
217
|
-
disable_clean =
|
257
|
+
disable_clean = parse_bool(get_env("GGML_CANN_DISABLE_BUF_POOL_CLEAN").value_or(""));
|
218
258
|
}
|
219
259
|
|
220
260
|
/**
|
@@ -410,7 +450,7 @@ struct ggml_cann_pool_buf : public ggml_cann_pool {
|
|
410
450
|
* @param device The device ID to associate with this buffer pool.
|
411
451
|
*/
|
412
452
|
explicit ggml_cann_pool_buf(int device) : device(device) {
|
413
|
-
disable_clean =
|
453
|
+
disable_clean = parse_bool(get_env("GGML_CANN_DISABLE_BUF_POOL_CLEAN").value_or(""));
|
414
454
|
}
|
415
455
|
|
416
456
|
/**
|
@@ -731,16 +771,18 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
|
|
731
771
|
*/
|
732
772
|
std::unique_ptr<ggml_cann_pool> ggml_backend_cann_context::new_pool_for_device(
|
733
773
|
int device) {
|
734
|
-
|
735
|
-
|
736
|
-
|
737
|
-
return std::unique_ptr<ggml_cann_pool>(new ggml_cann_pool_vmm(device));
|
738
|
-
}
|
739
|
-
bool enable_buf_prio = (getenv("GGML_CANN_ENABLE_BUF_PRIO_POOL") != nullptr);
|
740
|
-
if (enable_buf_prio) {
|
774
|
+
std::string mem_pool_type = get_env("GGML_CANN_MEM_POOL").value_or("");
|
775
|
+
|
776
|
+
if (mem_pool_type == "prio") {
|
741
777
|
GGML_LOG_INFO("%s: device %d use buffer pool with priority queue\n", __func__, device);
|
742
778
|
return std::unique_ptr<ggml_cann_pool>(new ggml_cann_pool_buf_prio(device));
|
743
779
|
}
|
780
|
+
|
781
|
+
if (ggml_cann_info().devices[device].vmm && mem_pool_type != "leg") {
|
782
|
+
GGML_LOG_INFO("%s: device %d use vmm pool\n", __func__, device);
|
783
|
+
return std::unique_ptr<ggml_cann_pool>(new ggml_cann_pool_vmm(device));
|
784
|
+
}
|
785
|
+
|
744
786
|
GGML_LOG_INFO("%s: device %d use buffer pool\n", __func__, device);
|
745
787
|
return std::unique_ptr<ggml_cann_pool>(new ggml_cann_pool_buf(device));
|
746
788
|
}
|
@@ -1091,6 +1133,98 @@ static enum ggml_status ggml_backend_cann_buffer_init_tensor(
|
|
1091
1133
|
return GGML_STATUS_SUCCESS;
|
1092
1134
|
}
|
1093
1135
|
|
1136
|
+
/**
|
1137
|
+
* @brief Workspace for caching NZ buffers per device.
|
1138
|
+
*
|
1139
|
+
* This struct manages a device buffer used in NZ computations. It supports
|
1140
|
+
* allocation, reallocation, and clearing of cached memory. The struct is
|
1141
|
+
* designed to be used with a global array, one per device.
|
1142
|
+
*/
|
1143
|
+
struct ggml_cann_nz_workspace {
|
1144
|
+
void* ptr; // Pointer to allocated device buffer
|
1145
|
+
size_t allocated; // Size of currently allocated buffer in bytes
|
1146
|
+
|
1147
|
+
/**
|
1148
|
+
* @brief Constructor. Initializes the workspace with no allocated memory.
|
1149
|
+
*/
|
1150
|
+
ggml_cann_nz_workspace() : ptr(nullptr), allocated(0) {}
|
1151
|
+
|
1152
|
+
/**
|
1153
|
+
* @brief Free cached memory and reset the workspace.
|
1154
|
+
*
|
1155
|
+
* If a buffer has been allocated, this function releases it using
|
1156
|
+
* aclrtFree and resets internal state.
|
1157
|
+
*/
|
1158
|
+
void clear() {
|
1159
|
+
if (ptr) {
|
1160
|
+
ACL_CHECK(aclrtFree(ptr));
|
1161
|
+
ptr = nullptr;
|
1162
|
+
allocated = 0;
|
1163
|
+
}
|
1164
|
+
}
|
1165
|
+
|
1166
|
+
/**
|
1167
|
+
* @brief Allocate or reallocate the workspace buffer.
|
1168
|
+
*
|
1169
|
+
* If the requested size is larger than the currently allocated size,
|
1170
|
+
* the old buffer will be freed and a new buffer of the requested size
|
1171
|
+
* will be allocated on the device.
|
1172
|
+
*
|
1173
|
+
* @param new_size Size in bytes to allocate for the workspace.
|
1174
|
+
*/
|
1175
|
+
void realloc(size_t new_size) {
|
1176
|
+
if (new_size > allocated) {
|
1177
|
+
clear();
|
1178
|
+
ACL_CHECK(aclrtMalloc(&ptr, new_size, ACL_MEM_MALLOC_HUGE_FIRST));
|
1179
|
+
allocated = new_size;
|
1180
|
+
}
|
1181
|
+
}
|
1182
|
+
|
1183
|
+
/**
|
1184
|
+
* @brief Get the device buffer pointer.
|
1185
|
+
*
|
1186
|
+
* @return Pointer to the allocated buffer, or nullptr if not allocated.
|
1187
|
+
*/
|
1188
|
+
void* get() const { return ptr; }
|
1189
|
+
};
|
1190
|
+
|
1191
|
+
/**
|
1192
|
+
* @brief Global array of NZ workspaces, one per device.
|
1193
|
+
*/
|
1194
|
+
static ggml_cann_nz_workspace g_nz_workspaces[GGML_CANN_MAX_DEVICES];
|
1195
|
+
|
1196
|
+
/**
|
1197
|
+
* @brief Convert tensor weights to NZ format using Ascend CANN API.
|
1198
|
+
*
|
1199
|
+
* This function creates a transposed tensor descriptor and performs the
|
1200
|
+
* TransMatmulWeight operation. Converting tensor formats can significantly
|
1201
|
+
* improve performance on certain hardware.
|
1202
|
+
*
|
1203
|
+
* @param tensor Pointer to the input ggml_tensor containing the weights.
|
1204
|
+
* @param offset Byte offset within the tensor data buffer where weights start.
|
1205
|
+
* @param device device id.
|
1206
|
+
*
|
1207
|
+
* @note The workspace buffer used in this function is managed globally and reused
|
1208
|
+
* across calls. This reduces overhead from repeated memory allocation and deallocation.
|
1209
|
+
*/
|
1210
|
+
static void weight_format_to_nz(ggml_tensor *tensor, size_t offset, int device) {
|
1211
|
+
aclTensor* weightTransposed = ggml_cann_create_tensor(tensor, tensor->ne,
|
1212
|
+
tensor->nb, 2, ACL_FORMAT_ND, offset);
|
1213
|
+
uint64_t workspaceSize = 0;
|
1214
|
+
aclOpExecutor *executor;
|
1215
|
+
|
1216
|
+
// TransMatmulWeight
|
1217
|
+
ACL_CHECK(aclnnTransMatmulWeightGetWorkspaceSize(weightTransposed,
|
1218
|
+
&workspaceSize, &executor));
|
1219
|
+
// Avoid frequent malloc/free of the workspace.
|
1220
|
+
g_nz_workspaces[device].realloc(workspaceSize);
|
1221
|
+
|
1222
|
+
void* g_nz_workspace = g_nz_workspaces[device].get();
|
1223
|
+
|
1224
|
+
ACL_CHECK(aclnnTransMatmulWeight(g_nz_workspace, workspaceSize, executor, nullptr));
|
1225
|
+
ACL_CHECK(aclDestroyTensor(weightTransposed));
|
1226
|
+
}
|
1227
|
+
|
1094
1228
|
// TODO: need handle tensor which has paddings.
|
1095
1229
|
/**
|
1096
1230
|
* @brief Set tensor data in a CANN buffer.
|
@@ -1115,9 +1249,16 @@ static void ggml_backend_cann_buffer_set_tensor(
|
|
1115
1249
|
// For acl, synchronous functions use this default stream.
|
1116
1250
|
// Why aclrtSynchronizeDevice?
|
1117
1251
|
|
1252
|
+
// Only check env once.
|
1253
|
+
static bool weight_to_nz = parse_bool(get_env("GGML_CANN_WEIGHT_NZ").value_or("on"));
|
1118
1254
|
if (!need_transform(tensor->type)) {
|
1119
1255
|
ACL_CHECK(aclrtMemcpy((char *)tensor->data + offset, size, data, size,
|
1120
1256
|
ACL_MEMCPY_HOST_TO_DEVICE));
|
1257
|
+
if (weight_to_nz && is_matmul_weight((const ggml_tensor*)tensor)) {
|
1258
|
+
GGML_ASSERT(tensor->ne[2] == 1);
|
1259
|
+
GGML_ASSERT(tensor->ne[3] == 1);
|
1260
|
+
weight_format_to_nz(tensor, offset, ctx->device);
|
1261
|
+
}
|
1121
1262
|
} else {
|
1122
1263
|
void *transform_buffer = malloc(size);
|
1123
1264
|
ggml_backend_cann_transform(tensor, data, transform_buffer);
|
@@ -1192,6 +1333,10 @@ static bool ggml_backend_cann_buffer_cpy_tensor(
|
|
1192
1333
|
ACL_MEMCPY_DEVICE_TO_DEVICE));
|
1193
1334
|
return true;
|
1194
1335
|
} else {
|
1336
|
+
#ifdef ASCEND_310P
|
1337
|
+
// TODO: Support 310p P2P copy
|
1338
|
+
return false;
|
1339
|
+
#endif
|
1195
1340
|
// Different device but can access by peer.
|
1196
1341
|
int32_t canAccessPeer = 0;
|
1197
1342
|
ACL_CHECK(aclrtDeviceCanAccessPeer(&canAccessPeer, src_ctx->device,
|
@@ -1351,20 +1496,32 @@ static size_t ggml_backend_cann_buffer_type_get_alloc_size(
|
|
1351
1496
|
size_t size = ggml_nbytes(tensor);
|
1352
1497
|
int64_t ne0 = tensor->ne[0];
|
1353
1498
|
|
1499
|
+
// Only check env once.
|
1500
|
+
static bool weight_to_nz = parse_bool(get_env("GGML_CANN_WEIGHT_NZ").value_or("on"));
|
1501
|
+
|
1354
1502
|
// last line must bigger than 32, because every single op deal at
|
1355
1503
|
// least 32 bytes.
|
1356
1504
|
// TODO: quantized type?
|
1357
1505
|
// int64_t line_size = ne0 * ggml_element_size(tensor);
|
1358
1506
|
// int64_t line_size_align_32 = (line_size + 31) & ~31;
|
1359
1507
|
// size += (line_size_align_32 - line_size);
|
1360
|
-
|
1361
|
-
// TODO: not support quantized yet.
|
1362
|
-
// TODO: consider un-continue tensor.
|
1363
1508
|
if (ggml_is_quantized(tensor->type)) {
|
1364
1509
|
if (ne0 % MATRIX_ROW_PADDING != 0) {
|
1365
1510
|
size += ggml_row_size(
|
1366
1511
|
tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
|
1367
1512
|
}
|
1513
|
+
} else if (weight_to_nz && is_matmul_weight((const ggml_tensor*)tensor)) {
|
1514
|
+
// NZ format weight are not support quantized yet.
|
1515
|
+
// If ND tensor transform to NZ, size may changed.
|
1516
|
+
int64_t shape[] = {tensor->ne[1], tensor->ne[0]};
|
1517
|
+
GGML_ASSERT(tensor->ne[2] == 1);
|
1518
|
+
GGML_ASSERT(tensor->ne[3] == 1);
|
1519
|
+
const aclIntArray *acl_shape = aclCreateIntArray(shape, 2);
|
1520
|
+
size_t new_size;
|
1521
|
+
ACL_CHECK(aclnnCalculateMatmulWeightSizeV2(acl_shape,
|
1522
|
+
ggml_cann_type_mapping(tensor->type), &new_size));
|
1523
|
+
ACL_CHECK(aclDestroyIntArray(acl_shape));
|
1524
|
+
size = std::max(size, new_size);
|
1368
1525
|
}
|
1369
1526
|
|
1370
1527
|
return size;
|
@@ -1570,6 +1727,9 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
|
|
1570
1727
|
case GGML_OP_GET_ROWS:
|
1571
1728
|
ggml_cann_get_rows(ctx, dst);
|
1572
1729
|
break;
|
1730
|
+
case GGML_OP_SET_ROWS:
|
1731
|
+
ggml_cann_set_rows(ctx, dst);
|
1732
|
+
break;
|
1573
1733
|
case GGML_OP_DUP:
|
1574
1734
|
ggml_cann_dup(ctx, dst);
|
1575
1735
|
break;
|
@@ -1592,16 +1752,18 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
|
|
1592
1752
|
case GGML_OP_UNARY:
|
1593
1753
|
switch (ggml_get_unary_op(dst)) {
|
1594
1754
|
case GGML_UNARY_OP_ABS:
|
1595
|
-
|
1755
|
+
GGML_CANN_CALL_OP_UNARY(Abs);
|
1596
1756
|
break;
|
1597
1757
|
case GGML_UNARY_OP_NEG:
|
1598
|
-
|
1758
|
+
GGML_CANN_CALL_OP_UNARY(Neg);
|
1599
1759
|
break;
|
1600
1760
|
case GGML_UNARY_OP_GELU:
|
1601
|
-
|
1761
|
+
case GGML_UNARY_OP_GELU_ERF:
|
1762
|
+
// aclnnGelu internally uses the erf-based approximation.
|
1763
|
+
GGML_CANN_CALL_OP_UNARY(Gelu);
|
1602
1764
|
break;
|
1603
1765
|
case GGML_UNARY_OP_SILU:
|
1604
|
-
|
1766
|
+
GGML_CANN_CALL_OP_UNARY(Silu);
|
1605
1767
|
break;
|
1606
1768
|
case GGML_UNARY_OP_GELU_QUICK: {
|
1607
1769
|
auto lambda = [](ggml_backend_cann_context& ctx,
|
@@ -1609,31 +1771,31 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
|
|
1609
1771
|
aclTensor* acl_dst) {
|
1610
1772
|
GGML_CANN_CALL_ACLNN_OP(ctx, GeluV2, acl_src, 0, acl_dst);
|
1611
1773
|
};
|
1612
|
-
|
1774
|
+
ggml_cann_op_unary(lambda, ctx, dst);
|
1613
1775
|
} break;
|
1614
1776
|
case GGML_UNARY_OP_TANH:
|
1615
|
-
|
1777
|
+
GGML_CANN_CALL_OP_UNARY(Tanh);
|
1616
1778
|
break;
|
1617
1779
|
case GGML_UNARY_OP_RELU:
|
1618
|
-
|
1780
|
+
GGML_CANN_CALL_OP_UNARY(Relu);
|
1619
1781
|
break;
|
1620
1782
|
case GGML_UNARY_OP_SIGMOID:
|
1621
|
-
|
1783
|
+
GGML_CANN_CALL_OP_UNARY(Sigmoid);
|
1622
1784
|
break;
|
1623
1785
|
case GGML_UNARY_OP_HARDSIGMOID:
|
1624
|
-
|
1786
|
+
GGML_CANN_CALL_OP_UNARY(Hardsigmoid);
|
1625
1787
|
break;
|
1626
1788
|
case GGML_UNARY_OP_HARDSWISH:
|
1627
|
-
|
1789
|
+
GGML_CANN_CALL_OP_UNARY(Hardswish);
|
1628
1790
|
break;
|
1629
1791
|
case GGML_UNARY_OP_EXP:
|
1630
|
-
|
1792
|
+
GGML_CANN_CALL_OP_UNARY(Exp);
|
1631
1793
|
break;
|
1632
1794
|
case GGML_UNARY_OP_ELU:
|
1633
1795
|
ggml_cann_elu(ctx, dst);
|
1634
1796
|
break;
|
1635
1797
|
case GGML_UNARY_OP_SGN:
|
1636
|
-
|
1798
|
+
GGML_CANN_CALL_OP_UNARY(Sign);
|
1637
1799
|
break;
|
1638
1800
|
case GGML_UNARY_OP_STEP:
|
1639
1801
|
ggml_cann_step(ctx, dst);
|
@@ -1642,6 +1804,31 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
|
|
1642
1804
|
return false;
|
1643
1805
|
}
|
1644
1806
|
break;
|
1807
|
+
case GGML_OP_GLU:
|
1808
|
+
switch (ggml_get_glu_op(dst)) {
|
1809
|
+
case GGML_GLU_OP_REGLU:
|
1810
|
+
GGML_CANN_CALL_OP_UNARY_GATED(Relu);
|
1811
|
+
break;
|
1812
|
+
case GGML_GLU_OP_GEGLU:
|
1813
|
+
case GGML_GLU_OP_GEGLU_ERF:
|
1814
|
+
// aclnnGelu internally uses the erf-based approximation.
|
1815
|
+
GGML_CANN_CALL_OP_UNARY_GATED(Gelu);
|
1816
|
+
break;
|
1817
|
+
case GGML_GLU_OP_SWIGLU:
|
1818
|
+
GGML_CANN_CALL_OP_UNARY_GATED(Silu);
|
1819
|
+
break;
|
1820
|
+
case GGML_GLU_OP_GEGLU_QUICK: {
|
1821
|
+
auto lambda = [](ggml_backend_cann_context& ctx,
|
1822
|
+
aclTensor* acl_src,
|
1823
|
+
aclTensor* acl_dst) {
|
1824
|
+
GGML_CANN_CALL_ACLNN_OP(ctx, GeluV2, acl_src, 0, acl_dst);
|
1825
|
+
};
|
1826
|
+
ggml_cann_op_unary_gated(lambda, ctx, dst);
|
1827
|
+
} break;
|
1828
|
+
default:
|
1829
|
+
return false;
|
1830
|
+
}
|
1831
|
+
break;
|
1645
1832
|
case GGML_OP_NORM:
|
1646
1833
|
ggml_cann_norm(ctx, dst);
|
1647
1834
|
break;
|
@@ -1684,7 +1871,7 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
|
|
1684
1871
|
ggml_cann_binary_op<aclnn_mul>(ctx, dst);
|
1685
1872
|
break;
|
1686
1873
|
case GGML_OP_SQRT:
|
1687
|
-
|
1874
|
+
GGML_CANN_CALL_OP_UNARY(Sqrt);
|
1688
1875
|
break;
|
1689
1876
|
case GGML_OP_CLAMP:
|
1690
1877
|
ggml_cann_clamp(ctx, dst);
|
@@ -1729,16 +1916,16 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
|
|
1729
1916
|
ggml_cann_argmax(ctx, dst);
|
1730
1917
|
break;
|
1731
1918
|
case GGML_OP_COS:
|
1732
|
-
|
1919
|
+
ggml_cann_op_unary<aclnn_cos>(ctx, dst);
|
1733
1920
|
break;
|
1734
1921
|
case GGML_OP_SIN:
|
1735
|
-
|
1922
|
+
ggml_cann_op_unary<aclnn_sin>(ctx, dst);
|
1736
1923
|
break;
|
1737
1924
|
case GGML_OP_CONV_TRANSPOSE_1D:
|
1738
1925
|
ggml_cann_conv_transpose_1d(ctx, dst);
|
1739
1926
|
break;
|
1740
1927
|
case GGML_OP_LOG:
|
1741
|
-
|
1928
|
+
GGML_CANN_CALL_OP_UNARY(Log);
|
1742
1929
|
break;
|
1743
1930
|
case GGML_OP_MEAN:
|
1744
1931
|
ggml_cann_mean(ctx, dst);
|
@@ -1871,6 +2058,8 @@ static bool ggml_backend_cann_cpy_tensor_async(
|
|
1871
2058
|
GGML_ASSERT(ggml_backend_is_cann(backend_src) ||
|
1872
2059
|
ggml_backend_is_cann(backend_dst));
|
1873
2060
|
|
2061
|
+
GGML_ASSERT(!is_matmul_weight((const ggml_tensor*)src));
|
2062
|
+
|
1874
2063
|
if (!ggml_backend_buffer_is_cann(src->buffer) ||
|
1875
2064
|
!ggml_backend_buffer_is_cann(dst->buffer)) {
|
1876
2065
|
return false;
|
@@ -1887,7 +2076,14 @@ static bool ggml_backend_cann_cpy_tensor_async(
|
|
1887
2076
|
(ggml_backend_cann_context*)backend_dst->context;
|
1888
2077
|
|
1889
2078
|
size_t copy_size = ggml_nbytes(dst);
|
2079
|
+
if (copy_size == 0) {
|
2080
|
+
return true;
|
2081
|
+
}
|
1890
2082
|
if (backend_src != backend_dst) {
|
2083
|
+
#ifdef ASCEND_310P
|
2084
|
+
// TODO: Support 310p P2P copy
|
2085
|
+
return false;
|
2086
|
+
#endif
|
1891
2087
|
ggml_backend_cann_buffer_context* buf_ctx_src =
|
1892
2088
|
(ggml_backend_cann_buffer_context*)buf_src->context;
|
1893
2089
|
ggml_backend_cann_buffer_context* buf_ctx_dst =
|
@@ -1904,7 +2100,6 @@ static bool ggml_backend_cann_cpy_tensor_async(
|
|
1904
2100
|
}
|
1905
2101
|
|
1906
2102
|
// need open both directions for memcpyasync between devices.
|
1907
|
-
ggml_cann_set_device(cann_ctx_dst->device);
|
1908
2103
|
ACL_CHECK(aclrtDeviceEnablePeerAccess(cann_ctx_src->device, 0));
|
1909
2104
|
ggml_cann_set_device(cann_ctx_src->device);
|
1910
2105
|
ACL_CHECK(aclrtDeviceEnablePeerAccess(cann_ctx_dst->device, 0));
|
@@ -1914,9 +2109,17 @@ static bool ggml_backend_cann_cpy_tensor_async(
|
|
1914
2109
|
ACL_CHECK(aclrtMemcpyAsync(dst->data, copy_size, src->data, copy_size,
|
1915
2110
|
ACL_MEMCPY_DEVICE_TO_DEVICE,
|
1916
2111
|
cann_ctx_src->stream()));
|
1917
|
-
|
1918
|
-
//TODO:
|
1919
|
-
|
2112
|
+
// record event on src stream after the copy
|
2113
|
+
// TODO: this event is not effective with acl graph mode, change to use aclrtSynchronizeStream
|
2114
|
+
// if (!cann_ctx_src->copy_event) {
|
2115
|
+
// ACL_CHECK(aclrtCreateEventWithFlag(&cann_ctx_src->copy_event, ACL_EVENT_SYNC));
|
2116
|
+
// }
|
2117
|
+
// ACL_CHECK(aclrtRecordEvent(cann_ctx_src->copy_event, cann_ctx_src->stream()));
|
2118
|
+
|
2119
|
+
// // wait on dst stream for the copy to complete
|
2120
|
+
// ggml_cann_set_device(cann_ctx_dst->device);
|
2121
|
+
// ACL_CHECK(aclrtStreamWaitEvent(cann_ctx_dst->stream(), cann_ctx_src->copy_event));
|
2122
|
+
ACL_CHECK(aclrtSynchronizeStream(cann_ctx_src->stream()));
|
1920
2123
|
} else {
|
1921
2124
|
// src and dst are on the same backend
|
1922
2125
|
ACL_CHECK(aclrtMemcpyAsync(dst->data, copy_size, src->data, copy_size,
|
@@ -1943,6 +2146,193 @@ static void ggml_backend_cann_synchronize(ggml_backend_t backend) {
|
|
1943
2146
|
ACL_CHECK(aclrtSynchronizeStream(cann_ctx->stream()));
|
1944
2147
|
}
|
1945
2148
|
|
2149
|
+
#ifdef USE_ACL_GRAPH
|
2150
|
+
/**
|
2151
|
+
* @brief Add a new CANN graph to the LRU cache by populating node properties from the ggml graph.
|
2152
|
+
*
|
2153
|
+
* This function creates a new ggml_cann_graph object and fills its node properties
|
2154
|
+
* (operation type, dimensions, strides, input sources, and operation parameters)
|
2155
|
+
* based on the current ggml computation graph.
|
2156
|
+
*
|
2157
|
+
* Each node in the ggml graph is mapped to a property entry in the new CANN graph:
|
2158
|
+
* - node address
|
2159
|
+
* - operation type
|
2160
|
+
* - shape (ne) and strides (nb)
|
2161
|
+
* - source tensor addresses
|
2162
|
+
* - operation parameters
|
2163
|
+
*
|
2164
|
+
* After initialization, the new graph is pushed into the LRU cache owned by the
|
2165
|
+
* CANN backend context. The cache takes ownership of the graph and manages its
|
2166
|
+
* lifetime (including deletion upon eviction).
|
2167
|
+
*
|
2168
|
+
* @param cann_ctx The CANN backend context containing the graph cache.
|
2169
|
+
* @param cgraph The current ggml computation graph.
|
2170
|
+
*/
|
2171
|
+
static void add_lru_matched_graph_node_properties(
|
2172
|
+
ggml_backend_cann_context * cann_ctx,
|
2173
|
+
ggml_cgraph * cgraph) {
|
2174
|
+
// Create a new ggml_cann_graph object on the heap (its lifetime is managed by the cache).
|
2175
|
+
ggml_cann_graph * new_graph = new ggml_cann_graph();
|
2176
|
+
new_graph->ggml_graph_properties.resize(cgraph->n_nodes);
|
2177
|
+
|
2178
|
+
for (int node_idx = 0; node_idx < cgraph->n_nodes; ++node_idx) {
|
2179
|
+
ggml_tensor * node = cgraph->nodes[node_idx];
|
2180
|
+
auto & prop = new_graph->ggml_graph_properties[node_idx];
|
2181
|
+
|
2182
|
+
prop.node_address = node->data;
|
2183
|
+
prop.node_op = node->op;
|
2184
|
+
|
2185
|
+
std::copy_n(node->ne, GGML_MAX_DIMS, prop.ne);
|
2186
|
+
std::copy_n(node->nb, GGML_MAX_DIMS, prop.nb);
|
2187
|
+
|
2188
|
+
for (int src = 0; src < GGML_MAX_SRC; ++src) {
|
2189
|
+
prop.src_address[src] = node->src[src] ? node->src[src]->data : nullptr;
|
2190
|
+
}
|
2191
|
+
|
2192
|
+
memcpy(prop.op_params, node->op_params, GGML_MAX_OP_PARAMS);
|
2193
|
+
}
|
2194
|
+
|
2195
|
+
// Insert into the LRU cache (cache takes ownership and will delete it when evicted).
|
2196
|
+
cann_ctx->graph_lru_cache.push(new_graph);
|
2197
|
+
}
|
2198
|
+
|
2199
|
+
/**
|
2200
|
+
* @brief Check if a ggml tensor node matches a previously captured CANN graph node.
|
2201
|
+
*
|
2202
|
+
* This function compares all relevant fields (address, op type, shape, source inputs, op params)
|
2203
|
+
* to determine whether the current node matches a previously recorded version.
|
2204
|
+
*
|
2205
|
+
* @param node The current ggml tensor node.
|
2206
|
+
* @param graph_node_properties The stored properties of a CANN graph node.
|
2207
|
+
* @return true if all fields match (excluding GGML_OP_VIEW); false otherwise.
|
2208
|
+
*/
|
2209
|
+
static bool ggml_graph_node_has_matching_properties(ggml_tensor * node, ggml_graph_node_properties * graph_node_properties) {
|
2210
|
+
if (node->data != graph_node_properties->node_address &&
|
2211
|
+
node->op != GGML_OP_VIEW) {
|
2212
|
+
return false;
|
2213
|
+
}
|
2214
|
+
if (node->op != graph_node_properties->node_op) {
|
2215
|
+
return false;
|
2216
|
+
}
|
2217
|
+
for (int i = 0; i < GGML_MAX_DIMS; i++) {
|
2218
|
+
if (node->ne[i] != graph_node_properties->ne[i]) {
|
2219
|
+
return false;
|
2220
|
+
}
|
2221
|
+
if (node->nb[i] != graph_node_properties->nb[i]) {
|
2222
|
+
return false;
|
2223
|
+
}
|
2224
|
+
}
|
2225
|
+
for (int i = 0; i < GGML_MAX_SRC; i++) {
|
2226
|
+
if (node->src[i] &&
|
2227
|
+
node->src[i]->data != graph_node_properties->src_address[i] &&
|
2228
|
+
node->op != GGML_OP_VIEW
|
2229
|
+
) {
|
2230
|
+
return false;
|
2231
|
+
}
|
2232
|
+
}
|
2233
|
+
if (node->op == GGML_OP_SCALE &&
|
2234
|
+
memcmp(graph_node_properties->op_params, node->op_params, GGML_MAX_OP_PARAMS) != 0) {
|
2235
|
+
return false;
|
2236
|
+
}
|
2237
|
+
return true;
|
2238
|
+
}
|
2239
|
+
|
2240
|
+
/**
|
2241
|
+
* @brief Check whether there is a cached CANN graph that matches the current ggml graph.
|
2242
|
+
*
|
2243
|
+
* This function iterates through the cached CANN graphs stored in the LRU cache and
|
2244
|
+
* compares them against the given ggml computation graph. A match requires that the
|
2245
|
+
* number of nodes is the same and that each node’s properties (operation type,
|
2246
|
+
* dimensions, strides, inputs, and operation parameters) are identical.
|
2247
|
+
*
|
2248
|
+
* If a matching graph is found, it is promoted to the front of the LRU cache and the
|
2249
|
+
* function returns true. Otherwise, the function returns false, indicating that a new
|
2250
|
+
* CANN graph needs to be captured.
|
2251
|
+
*
|
2252
|
+
* @param cann_ctx The CANN backend context containing the graph cache.
|
2253
|
+
* @param cgraph The current ggml computation graph.
|
2254
|
+
* @return true if a matching cached graph exists; false otherwise.
|
2255
|
+
*/
|
2256
|
+
static bool is_matched_graph(ggml_backend_cann_context * cann_ctx, ggml_cgraph * cgraph) {
|
2257
|
+
ggml_cann_graph_lru_cache &lru_cache = cann_ctx->graph_lru_cache;
|
2258
|
+
for (auto &graph_ptr : lru_cache.cache_list) {
|
2259
|
+
// Skip graphs with a different number of nodes.
|
2260
|
+
if (graph_ptr->ggml_graph_properties.size() != static_cast<size_t>(cgraph->n_nodes)) {
|
2261
|
+
continue;
|
2262
|
+
}
|
2263
|
+
|
2264
|
+
// Check if all nodes match.
|
2265
|
+
bool all_match = true;
|
2266
|
+
for (int i = 0; i < cgraph->n_nodes; ++i) {
|
2267
|
+
if (!ggml_graph_node_has_matching_properties(cgraph->nodes[i], &graph_ptr->ggml_graph_properties[i])) {
|
2268
|
+
all_match = false;
|
2269
|
+
break;
|
2270
|
+
}
|
2271
|
+
}
|
2272
|
+
|
2273
|
+
if (all_match) {
|
2274
|
+
// update cache_list && renturn graph_ptr
|
2275
|
+
lru_cache.move_to_front(graph_ptr);
|
2276
|
+
return true;
|
2277
|
+
}
|
2278
|
+
}
|
2279
|
+
|
2280
|
+
return false;
|
2281
|
+
}
|
2282
|
+
#endif // USE_ACL_GRAPH
|
2283
|
+
|
2284
|
+
/**
|
2285
|
+
* @brief Evaluate the computation graph and optionally capture or execute it using CANN graph API.
|
2286
|
+
*
|
2287
|
+
* If CANN graph execution is enabled and graph capture is required, this function begins
|
2288
|
+
* graph capture, runs the graph, ends capture, and stores the captured graph.
|
2289
|
+
*
|
2290
|
+
* Otherwise, it falls back to op-by-op execution using the CANN compute kernel dispatcher.
|
2291
|
+
*
|
2292
|
+
* @param cann_ctx The CANN backend context.
|
2293
|
+
* @param cgraph The ggml computation graph.
|
2294
|
+
* @param use_cann_graph Whether to use CANN graph execution.
|
2295
|
+
* @param cann_graph_update_required Whether graph capture is needed due to graph changes.
|
2296
|
+
*/
|
2297
|
+
static void evaluate_and_capture_cann_graph(ggml_backend_cann_context * cann_ctx, ggml_cgraph * cgraph,
|
2298
|
+
bool & use_cann_graph, bool & cann_graph_update_required) {
|
2299
|
+
#ifdef USE_ACL_GRAPH
|
2300
|
+
ggml_cann_graph* matched_graph = cann_ctx->graph_lru_cache.cache_list.front();
|
2301
|
+
if (use_cann_graph && cann_graph_update_required) {
|
2302
|
+
ACL_CHECK(aclmdlRICaptureBegin(cann_ctx->stream(), ACL_MODEL_RI_CAPTURE_MODE_GLOBAL));
|
2303
|
+
}
|
2304
|
+
#endif // USE_ACL_GRAPH
|
2305
|
+
// Only perform the graph execution if CANN graphs are not enabled, or we are capturing the graph.
|
2306
|
+
// With the use of CANN graphs, the execution will be performed by the graph launch.
|
2307
|
+
if (!use_cann_graph || cann_graph_update_required) {
|
2308
|
+
for (int i = 0; i < cgraph->n_nodes; i++) {
|
2309
|
+
ggml_tensor * node = cgraph->nodes[i];
|
2310
|
+
|
2311
|
+
if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
|
2312
|
+
continue;
|
2313
|
+
}
|
2314
|
+
|
2315
|
+
bool ok = ggml_cann_compute_forward(*cann_ctx, node);
|
2316
|
+
if (!ok) {
|
2317
|
+
GGML_LOG_ERROR("%s: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
|
2318
|
+
}
|
2319
|
+
GGML_ASSERT(ok);
|
2320
|
+
}
|
2321
|
+
}
|
2322
|
+
|
2323
|
+
#ifdef USE_ACL_GRAPH
|
2324
|
+
if (use_cann_graph && cann_graph_update_required) { // End CANN graph capture
|
2325
|
+
ACL_CHECK(aclmdlRICaptureEnd(cann_ctx->stream(), &matched_graph->graph));
|
2326
|
+
}
|
2327
|
+
|
2328
|
+
if (use_cann_graph) {
|
2329
|
+
// Execute graph
|
2330
|
+
ACL_CHECK(aclmdlRIExecuteAsync(matched_graph->graph, cann_ctx->stream()));
|
2331
|
+
}
|
2332
|
+
#endif // USE_ACL_GRAPH
|
2333
|
+
}
|
2334
|
+
|
2335
|
+
|
1946
2336
|
/**
|
1947
2337
|
* @brief Computes a computational graph using a CANN backend.
|
1948
2338
|
*
|
@@ -1959,24 +2349,53 @@ static enum ggml_status ggml_backend_cann_graph_compute(
|
|
1959
2349
|
ggml_backend_t backend, ggml_cgraph* cgraph) {
|
1960
2350
|
ggml_backend_cann_context* cann_ctx =
|
1961
2351
|
(ggml_backend_cann_context*)backend->context;
|
1962
|
-
|
1963
2352
|
ggml_cann_set_device(cann_ctx->device);
|
1964
|
-
|
1965
|
-
|
1966
|
-
|
1967
|
-
|
1968
|
-
|
1969
|
-
|
2353
|
+
g_nz_workspaces[cann_ctx->device].clear();
|
2354
|
+
|
2355
|
+
// calculate rope cache for the first layer on the current device.
|
2356
|
+
cann_ctx->rope_cache.cached = false;
|
2357
|
+
|
2358
|
+
#ifdef USE_ACL_GRAPH
|
2359
|
+
bool use_cann_graph = true;
|
2360
|
+
bool cann_graph_update_required = false;
|
2361
|
+
|
2362
|
+
static bool prefill_use_graph = parse_bool(get_env("GGML_CANN_PREFILL_USE_GRAPH").value_or(""));
|
2363
|
+
if (!prefill_use_graph) {
|
2364
|
+
// Do not use acl_graph for prefill.
|
2365
|
+
for (int i = 0; i < cgraph->n_nodes; i++) {
|
2366
|
+
ggml_tensor * node = cgraph->nodes[i];
|
2367
|
+
// TODO: Optimize here. Currently, we can only
|
2368
|
+
// get seq_len by FA's input.
|
2369
|
+
if (node->op == GGML_OP_FLASH_ATTN_EXT) {
|
2370
|
+
// Q -> src[0], shape: [B, S, N, D]
|
2371
|
+
use_cann_graph = (node->src[0]->ne[1] == 1);
|
2372
|
+
break;
|
2373
|
+
}
|
1970
2374
|
}
|
2375
|
+
}
|
1971
2376
|
|
1972
|
-
|
2377
|
+
if (!cann_ctx->acl_graph_mode) {
|
2378
|
+
use_cann_graph = false;
|
2379
|
+
}
|
1973
2380
|
|
1974
|
-
|
1975
|
-
|
1976
|
-
|
2381
|
+
if (use_cann_graph) {
|
2382
|
+
// If no matching graph is found, the graph needs to be recaptured.
|
2383
|
+
cann_graph_update_required = !is_matched_graph(cann_ctx, cgraph);
|
2384
|
+
if (cann_graph_update_required) {
|
2385
|
+
// If no matching graph is found, add a new ACL graph.
|
2386
|
+
add_lru_matched_graph_node_properties(cann_ctx, cgraph);
|
1977
2387
|
}
|
1978
|
-
GGML_ASSERT(ok);
|
1979
2388
|
}
|
2389
|
+
#else
|
2390
|
+
bool use_cann_graph = false;
|
2391
|
+
bool cann_graph_update_required = false;
|
2392
|
+
#endif // USE_ACL_GRAPH
|
2393
|
+
evaluate_and_capture_cann_graph(
|
2394
|
+
cann_ctx,
|
2395
|
+
cgraph,
|
2396
|
+
use_cann_graph,
|
2397
|
+
cann_graph_update_required
|
2398
|
+
);
|
1980
2399
|
|
1981
2400
|
return GGML_STATUS_SUCCESS;
|
1982
2401
|
}
|
@@ -2012,10 +2431,23 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
|
|
2012
2431
|
case GGML_UNARY_OP_ELU:
|
2013
2432
|
case GGML_UNARY_OP_SGN:
|
2014
2433
|
case GGML_UNARY_OP_STEP:
|
2434
|
+
case GGML_UNARY_OP_GELU_ERF:
|
2015
2435
|
return true;
|
2016
2436
|
default:
|
2017
2437
|
return false;
|
2018
2438
|
}
|
2439
|
+
case GGML_OP_GLU:
|
2440
|
+
switch (ggml_get_glu_op(op)) {
|
2441
|
+
case GGML_GLU_OP_REGLU:
|
2442
|
+
case GGML_GLU_OP_GEGLU:
|
2443
|
+
case GGML_GLU_OP_SWIGLU:
|
2444
|
+
case GGML_GLU_OP_GEGLU_ERF:
|
2445
|
+
case GGML_GLU_OP_GEGLU_QUICK:
|
2446
|
+
return true;
|
2447
|
+
default:
|
2448
|
+
return false;
|
2449
|
+
}
|
2450
|
+
break;
|
2019
2451
|
case GGML_OP_MUL_MAT: {
|
2020
2452
|
switch (op->src[0]->type) {
|
2021
2453
|
case GGML_TYPE_F16:
|
@@ -2024,7 +2456,7 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
|
|
2024
2456
|
case GGML_TYPE_Q8_0:
|
2025
2457
|
case GGML_TYPE_Q4_0:
|
2026
2458
|
#ifdef ASCEND_310P
|
2027
|
-
// Q4 && Q8 per group is not
|
2459
|
+
// Q4 && Q8 per group is not supported on 310p devices
|
2028
2460
|
return false;
|
2029
2461
|
#endif
|
2030
2462
|
// only support contiguous for quantized types.
|
@@ -2042,7 +2474,7 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
|
|
2042
2474
|
case GGML_TYPE_Q8_0:
|
2043
2475
|
case GGML_TYPE_Q4_0:
|
2044
2476
|
#ifdef ASCEND_310P
|
2045
|
-
// Q4 && Q8 per group is not
|
2477
|
+
// Q4 && Q8 per group is not supported on 310p devices
|
2046
2478
|
return false;
|
2047
2479
|
#endif
|
2048
2480
|
// only support contiguous for quantized types.
|
@@ -2062,6 +2494,15 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
|
|
2062
2494
|
return false;
|
2063
2495
|
}
|
2064
2496
|
} break;
|
2497
|
+
case GGML_OP_SET_ROWS: {
|
2498
|
+
switch (op->type) {
|
2499
|
+
case GGML_TYPE_F32:
|
2500
|
+
case GGML_TYPE_F16:
|
2501
|
+
return true;
|
2502
|
+
default:
|
2503
|
+
return false;
|
2504
|
+
}
|
2505
|
+
} break;
|
2065
2506
|
case GGML_OP_CPY: {
|
2066
2507
|
ggml_tensor *src = op->src[0];
|
2067
2508
|
if ((op->type != GGML_TYPE_F32 && op->type != GGML_TYPE_F16) ||
|
@@ -2070,12 +2511,6 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
|
|
2070
2511
|
// only support F32 and F16.
|
2071
2512
|
return false;
|
2072
2513
|
}
|
2073
|
-
|
2074
|
-
if (!ggml_are_same_shape(op, src) && !ggml_is_contiguous(op)) {
|
2075
|
-
// unsupport dst is not contiguous.
|
2076
|
-
return false;
|
2077
|
-
}
|
2078
|
-
|
2079
2514
|
return true;
|
2080
2515
|
} break;
|
2081
2516
|
case GGML_OP_CONT: {
|
@@ -2090,16 +2525,10 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
|
|
2090
2525
|
}
|
2091
2526
|
case GGML_OP_ROPE: {
|
2092
2527
|
// TODO: with ops-test v == 1
|
2093
|
-
float ext_factor = 0.0f;
|
2094
|
-
memcpy(&ext_factor, (const float *) op->op_params + 7, sizeof(float));
|
2095
2528
|
// TODO: n_dims <= ne0
|
2096
2529
|
if (op->src[0]->ne[0] != op->op_params[1]) {
|
2097
2530
|
return false;
|
2098
2531
|
}
|
2099
|
-
// TODO: ext_factor != 0
|
2100
|
-
if (ext_factor != 0) {
|
2101
|
-
return false;
|
2102
|
-
}
|
2103
2532
|
|
2104
2533
|
const int mode = ((const int32_t *) op->op_params)[2];
|
2105
2534
|
if (mode & GGML_ROPE_TYPE_MROPE) {
|
@@ -2108,10 +2537,11 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
|
|
2108
2537
|
if (mode & GGML_ROPE_TYPE_VISION) {
|
2109
2538
|
return false;
|
2110
2539
|
}
|
2111
|
-
|
2540
|
+
#ifdef ASCEND_310P
|
2112
2541
|
if(!ggml_is_contiguous(op->src[0])){
|
2113
2542
|
return false;
|
2114
2543
|
}
|
2544
|
+
#endif
|
2115
2545
|
return true;
|
2116
2546
|
}
|
2117
2547
|
case GGML_OP_UPSCALE: {
|
@@ -2141,8 +2571,8 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
|
|
2141
2571
|
// value of paddingW should be at most half of kernelW
|
2142
2572
|
return (p0 <= (k0 / 2)) && (p1 <= (k1 / 2));
|
2143
2573
|
}
|
2144
|
-
case GGML_OP_SUM:
|
2145
2574
|
case GGML_OP_DUP:
|
2575
|
+
case GGML_OP_SUM:
|
2146
2576
|
case GGML_OP_IM2COL:
|
2147
2577
|
case GGML_OP_CONCAT:
|
2148
2578
|
case GGML_OP_REPEAT:
|
@@ -2158,12 +2588,10 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
|
|
2158
2588
|
case GGML_OP_MUL:
|
2159
2589
|
case GGML_OP_DIV:
|
2160
2590
|
case GGML_OP_RMS_NORM:
|
2161
|
-
case GGML_OP_SCALE:
|
2162
2591
|
case GGML_OP_SQR:
|
2163
2592
|
case GGML_OP_SQRT:
|
2164
2593
|
case GGML_OP_CLAMP:
|
2165
2594
|
case GGML_OP_DIAG_MASK_INF:
|
2166
|
-
case GGML_OP_SOFT_MAX:
|
2167
2595
|
case GGML_OP_SUM_ROWS:
|
2168
2596
|
case GGML_OP_ARGSORT:
|
2169
2597
|
case GGML_OP_ACC:
|
@@ -2175,13 +2603,29 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
|
|
2175
2603
|
case GGML_OP_ARGMAX:
|
2176
2604
|
case GGML_OP_COS:
|
2177
2605
|
case GGML_OP_SIN:
|
2178
|
-
case GGML_OP_CONV_TRANSPOSE_1D:
|
2179
2606
|
case GGML_OP_LOG:
|
2180
2607
|
case GGML_OP_MEAN:
|
2181
2608
|
case GGML_OP_PAD_REFLECT_1D:
|
2182
2609
|
case GGML_OP_COUNT_EQUAL:
|
2183
2610
|
return true;
|
2611
|
+
case GGML_OP_CONV_TRANSPOSE_1D:
|
2612
|
+
// TODO: ((weightL - 1) * dilationW - padLeft)=1336 should not be larger than 255.
|
2613
|
+
return (op->src[0]->ne[0] - 1) <= 255;
|
2614
|
+
case GGML_OP_SCALE:
|
2615
|
+
float bias;
|
2616
|
+
memcpy(&bias, (const float *)(op->op_params) + 1, sizeof(float));
|
2617
|
+
return bias == 0.0f; // TODO: support bias != 0.0f
|
2618
|
+
case GGML_OP_SOFT_MAX:
|
2619
|
+
// TODO: support attention sinks [TAG_ATTN_SINKS]
|
2620
|
+
if (op->src[2]) {
|
2621
|
+
return false;
|
2622
|
+
}
|
2623
|
+
return true;
|
2184
2624
|
case GGML_OP_FLASH_ATTN_EXT:{
|
2625
|
+
#ifdef ASCEND_310P
|
2626
|
+
// FA is not supported on 310p devices
|
2627
|
+
return false;
|
2628
|
+
#endif
|
2185
2629
|
// derived from [ggml-cuda.cu]
|
2186
2630
|
if(op->src[1]->type != GGML_TYPE_F16 || op->src[2]->type != GGML_TYPE_F16){
|
2187
2631
|
return false;
|
@@ -2192,22 +2636,20 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
|
|
2192
2636
|
if(op->type != GGML_TYPE_F16 && op->type != GGML_TYPE_F32 && op->type != GGML_TYPE_BF16){
|
2193
2637
|
return false;
|
2194
2638
|
}
|
2195
|
-
|
2196
|
-
|
2197
|
-
return false;
|
2198
|
-
}
|
2199
|
-
if (op->src[0]->ne[0] == 192) {
|
2639
|
+
// TODO: support attention sinks [TAG_ATTN_SINKS]
|
2640
|
+
if (op->src[4]) {
|
2200
2641
|
return false;
|
2201
2642
|
}
|
2202
|
-
if (op->src[
|
2203
|
-
//
|
2643
|
+
if (op->src[1]->ne[0] != op->src[2]->ne[0]) {
|
2644
|
+
// different head sizes of K and V are not supported yet
|
2204
2645
|
return false;
|
2205
2646
|
}
|
2206
|
-
if (op->src[0]->ne[
|
2647
|
+
if (op->src[0]->ne[0] % 16 != 0) {
|
2648
|
+
// TODO: padding to support
|
2207
2649
|
return false;
|
2208
2650
|
}
|
2209
2651
|
float logitSoftcap = 0.0f;
|
2210
|
-
memcpy(&logitSoftcap,
|
2652
|
+
memcpy(&logitSoftcap, (const float *)(op->op_params) + 2, sizeof(float));
|
2211
2653
|
if(logitSoftcap != 0.0f) {
|
2212
2654
|
return false;
|
2213
2655
|
}
|
@@ -2314,6 +2756,7 @@ static const ggml_backend_i ggml_backend_cann_interface = {
|
|
2314
2756
|
/* .graph_compute = */ ggml_backend_cann_graph_compute,
|
2315
2757
|
/* .event_record = */ ggml_backend_cann_event_record,
|
2316
2758
|
/* .event_wait = */ ggml_backend_cann_event_wait,
|
2759
|
+
/* .graph_optimize = */ NULL,
|
2317
2760
|
};
|
2318
2761
|
|
2319
2762
|
/**
|