whispercpp 1.3.3 → 1.3.5
This diff reflects the changes between publicly available package versions that have been released to one of the supported registries, as they appear in those public registries, and is provided for informational purposes only.
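For reference, a minimal sketch of how a Bundler-managed project might pick up the newer release summarized below. The gem name comes from the header above; the pessimistic version constraint is an assumed preference, not something stated in this diff:

```ruby
# Gemfile — hypothetical consumer project adopting the updated gem.
source "https://rubygems.org"

# Allow patch-level updates within the 1.3.x series (assumed pinning policy).
gem "whispercpp", "~> 1.3.5"
```

With an existing lockfile on 1.3.3, running `bundle update whispercpp` would move just this gem forward within the constraint.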
- checksums.yaml +4 -4
- data/README.md +60 -43
- data/ext/extconf.rb +2 -2
- data/ext/ruby_whisper.c +14 -2
- data/ext/ruby_whisper.h +39 -0
- data/ext/ruby_whisper_context.c +22 -22
- data/ext/ruby_whisper_model.c +12 -12
- data/ext/ruby_whisper_params.c +79 -25
- data/ext/ruby_whisper_segment.c +84 -19
- data/ext/ruby_whisper_token.c +351 -0
- data/ext/ruby_whisper_transcribe.cpp +1 -1
- data/ext/ruby_whisper_vad_context.c +75 -0
- data/ext/ruby_whisper_vad_context_detect.cpp +50 -0
- data/ext/ruby_whisper_vad_segment.c +139 -0
- data/ext/ruby_whisper_vad_segments.c +106 -0
- data/ext/sources/CMakeLists.txt +4 -1
- data/ext/sources/bindings/javascript/package.json +1 -1
- data/ext/sources/cmake/arm64-apple-clang.cmake +16 -0
- data/ext/sources/cmake/arm64-windows-llvm.cmake +16 -0
- data/ext/sources/cmake/riscv64-spacemit-linux-gnu-gcc.cmake +29 -0
- data/ext/sources/cmake/x64-windows-llvm.cmake +5 -0
- data/ext/sources/examples/CMakeLists.txt +1 -0
- data/ext/sources/examples/addon.node/addon.cpp +19 -19
- data/ext/sources/examples/addon.node/index.js +7 -5
- data/ext/sources/examples/addon.node/vad-example.js +2 -2
- data/ext/sources/examples/bench/bench.cpp +26 -16
- data/ext/sources/examples/bench.wasm/index-tmpl.html +10 -9
- data/ext/sources/examples/cli/cli.cpp +122 -111
- data/ext/sources/examples/command/command.cpp +26 -24
- data/ext/sources/examples/command.wasm/index-tmpl.html +5 -4
- data/ext/sources/examples/common-ggml.cpp +2 -0
- data/ext/sources/examples/lsp/CMakeLists.txt +2 -1
- data/ext/sources/examples/lsp/lsp.cpp +19 -17
- data/ext/sources/examples/quantize/CMakeLists.txt +2 -1
- data/ext/sources/examples/server/server.cpp +34 -24
- data/ext/sources/examples/server.py +6 -1
- data/ext/sources/examples/stream/stream.cpp +4 -2
- data/ext/sources/examples/stream.wasm/emscripten.cpp +6 -6
- data/ext/sources/examples/stream.wasm/index-tmpl.html +82 -5
- data/ext/sources/examples/talk-llama/CMakeLists.txt +7 -3
- data/ext/sources/examples/talk-llama/llama-adapter.cpp +113 -7
- data/ext/sources/examples/talk-llama/llama-adapter.h +13 -1
- data/ext/sources/examples/talk-llama/llama-arch.cpp +2136 -1491
- data/ext/sources/examples/talk-llama/llama-arch.h +125 -3
- data/ext/sources/examples/talk-llama/llama-batch.cpp +174 -100
- data/ext/sources/examples/talk-llama/llama-batch.h +46 -20
- data/ext/sources/examples/talk-llama/llama-chat.cpp +199 -8
- data/ext/sources/examples/talk-llama/llama-chat.h +11 -0
- data/ext/sources/examples/talk-llama/llama-context.cpp +1213 -413
- data/ext/sources/examples/talk-llama/llama-context.h +99 -36
- data/ext/sources/examples/talk-llama/llama-cparams.h +5 -4
- data/ext/sources/examples/talk-llama/llama-grammar.cpp +288 -53
- data/ext/sources/examples/talk-llama/llama-grammar.h +22 -1
- data/ext/sources/examples/talk-llama/llama-graph.cpp +883 -294
- data/ext/sources/examples/talk-llama/llama-graph.h +361 -161
- data/ext/sources/examples/talk-llama/llama-hparams.cpp +144 -6
- data/ext/sources/examples/talk-llama/llama-hparams.h +100 -23
- data/ext/sources/examples/talk-llama/llama-impl.cpp +7 -3
- data/ext/sources/examples/talk-llama/llama-impl.h +3 -1
- data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.cpp +328 -0
- data/ext/sources/examples/talk-llama/{llama-kv-cache-unified-iswa.h → llama-kv-cache-iswa.h} +38 -29
- data/ext/sources/examples/talk-llama/llama-kv-cache.cpp +2100 -0
- data/ext/sources/examples/talk-llama/llama-kv-cache.h +373 -27
- data/ext/sources/examples/talk-llama/llama-kv-cells.h +124 -30
- data/ext/sources/examples/talk-llama/llama-memory-hybrid.cpp +63 -41
- data/ext/sources/examples/talk-llama/llama-memory-hybrid.h +30 -29
- data/ext/sources/examples/talk-llama/llama-memory-recurrent.cpp +77 -35
- data/ext/sources/examples/talk-llama/llama-memory-recurrent.h +15 -16
- data/ext/sources/examples/talk-llama/llama-memory.h +16 -10
- data/ext/sources/examples/talk-llama/llama-mmap.cpp +172 -37
- data/ext/sources/examples/talk-llama/llama-mmap.h +8 -3
- data/ext/sources/examples/talk-llama/llama-model-loader.cpp +93 -9
- data/ext/sources/examples/talk-llama/llama-model-loader.h +9 -2
- data/ext/sources/examples/talk-llama/llama-model-saver.cpp +3 -0
- data/ext/sources/examples/talk-llama/llama-model.cpp +3369 -10145
- data/ext/sources/examples/talk-llama/llama-model.h +104 -12
- data/ext/sources/examples/talk-llama/llama-quant.cpp +53 -30
- data/ext/sources/examples/talk-llama/llama-sampling.cpp +1520 -324
- data/ext/sources/examples/talk-llama/llama-sampling.h +19 -7
- data/ext/sources/examples/talk-llama/llama-vocab.cpp +562 -39
- data/ext/sources/examples/talk-llama/llama-vocab.h +50 -0
- data/ext/sources/examples/talk-llama/llama.cpp +794 -12
- data/ext/sources/examples/talk-llama/llama.h +246 -190
- data/ext/sources/examples/talk-llama/models/afmoe.cpp +191 -0
- data/ext/sources/examples/talk-llama/models/apertus.cpp +125 -0
- data/ext/sources/examples/talk-llama/models/arcee.cpp +135 -0
- data/ext/sources/examples/talk-llama/models/arctic.cpp +138 -0
- data/ext/sources/examples/talk-llama/models/arwkv7.cpp +86 -0
- data/ext/sources/examples/talk-llama/models/baichuan.cpp +122 -0
- data/ext/sources/examples/talk-llama/models/bailingmoe.cpp +144 -0
- data/ext/sources/examples/talk-llama/models/bailingmoe2.cpp +135 -0
- data/ext/sources/examples/talk-llama/models/bert.cpp +178 -0
- data/ext/sources/examples/talk-llama/models/bitnet.cpp +160 -0
- data/ext/sources/examples/talk-llama/models/bloom.cpp +101 -0
- data/ext/sources/examples/talk-llama/models/chameleon.cpp +178 -0
- data/ext/sources/examples/talk-llama/models/chatglm.cpp +132 -0
- data/ext/sources/examples/talk-llama/models/codeshell.cpp +111 -0
- data/ext/sources/examples/talk-llama/models/cogvlm.cpp +102 -0
- data/ext/sources/examples/talk-llama/models/cohere2-iswa.cpp +134 -0
- data/ext/sources/examples/talk-llama/models/command-r.cpp +122 -0
- data/ext/sources/examples/talk-llama/models/dbrx.cpp +123 -0
- data/ext/sources/examples/talk-llama/models/deci.cpp +135 -0
- data/ext/sources/examples/talk-llama/models/deepseek.cpp +144 -0
- data/ext/sources/examples/talk-llama/models/deepseek2.cpp +259 -0
- data/ext/sources/examples/talk-llama/models/dots1.cpp +134 -0
- data/ext/sources/examples/talk-llama/models/dream.cpp +105 -0
- data/ext/sources/examples/talk-llama/models/ernie4-5-moe.cpp +150 -0
- data/ext/sources/examples/talk-llama/models/ernie4-5.cpp +110 -0
- data/ext/sources/examples/talk-llama/models/exaone.cpp +114 -0
- data/ext/sources/examples/talk-llama/models/exaone4.cpp +123 -0
- data/ext/sources/examples/talk-llama/models/falcon-h1.cpp +113 -0
- data/ext/sources/examples/talk-llama/models/falcon.cpp +120 -0
- data/ext/sources/examples/talk-llama/models/gemma-embedding.cpp +116 -0
- data/ext/sources/examples/talk-llama/models/gemma.cpp +112 -0
- data/ext/sources/examples/talk-llama/models/gemma2-iswa.cpp +128 -0
- data/ext/sources/examples/talk-llama/models/gemma3.cpp +155 -0
- data/ext/sources/examples/talk-llama/models/gemma3n-iswa.cpp +384 -0
- data/ext/sources/examples/talk-llama/models/glm4-moe.cpp +170 -0
- data/ext/sources/examples/talk-llama/models/glm4.cpp +150 -0
- data/ext/sources/examples/talk-llama/models/gpt2.cpp +105 -0
- data/ext/sources/examples/talk-llama/models/gptneox.cpp +144 -0
- data/ext/sources/examples/talk-llama/models/granite-hybrid.cpp +196 -0
- data/ext/sources/examples/talk-llama/models/granite.cpp +211 -0
- data/ext/sources/examples/talk-llama/models/graph-context-mamba.cpp +283 -0
- data/ext/sources/examples/talk-llama/models/grok.cpp +159 -0
- data/ext/sources/examples/talk-llama/models/grovemoe.cpp +141 -0
- data/ext/sources/examples/talk-llama/models/hunyuan-dense.cpp +132 -0
- data/ext/sources/examples/talk-llama/models/hunyuan-moe.cpp +154 -0
- data/ext/sources/examples/talk-llama/models/internlm2.cpp +120 -0
- data/ext/sources/examples/talk-llama/models/jais.cpp +86 -0
- data/ext/sources/examples/talk-llama/models/jamba.cpp +106 -0
- data/ext/sources/examples/talk-llama/models/lfm2.cpp +175 -0
- data/ext/sources/examples/talk-llama/models/llada-moe.cpp +122 -0
- data/ext/sources/examples/talk-llama/models/llada.cpp +99 -0
- data/ext/sources/examples/talk-llama/models/llama-iswa.cpp +178 -0
- data/ext/sources/examples/talk-llama/models/llama.cpp +168 -0
- data/ext/sources/examples/talk-llama/models/maincoder.cpp +117 -0
- data/ext/sources/examples/talk-llama/models/mamba.cpp +55 -0
- data/ext/sources/examples/talk-llama/models/mimo2-iswa.cpp +123 -0
- data/ext/sources/examples/talk-llama/models/minicpm3.cpp +199 -0
- data/ext/sources/examples/talk-llama/models/minimax-m2.cpp +124 -0
- data/ext/sources/examples/talk-llama/models/mistral3.cpp +160 -0
- data/ext/sources/examples/talk-llama/models/models.h +569 -0
- data/ext/sources/examples/talk-llama/models/modern-bert.cpp +116 -0
- data/ext/sources/examples/talk-llama/models/mpt.cpp +126 -0
- data/ext/sources/examples/talk-llama/models/nemotron-h.cpp +150 -0
- data/ext/sources/examples/talk-llama/models/nemotron.cpp +122 -0
- data/ext/sources/examples/talk-llama/models/neo-bert.cpp +104 -0
- data/ext/sources/examples/talk-llama/models/olmo.cpp +121 -0
- data/ext/sources/examples/talk-llama/models/olmo2.cpp +150 -0
- data/ext/sources/examples/talk-llama/models/olmoe.cpp +124 -0
- data/ext/sources/examples/talk-llama/models/openai-moe-iswa.cpp +127 -0
- data/ext/sources/examples/talk-llama/models/openelm.cpp +124 -0
- data/ext/sources/examples/talk-llama/models/orion.cpp +123 -0
- data/ext/sources/examples/talk-llama/models/pangu-embedded.cpp +121 -0
- data/ext/sources/examples/talk-llama/models/phi2.cpp +121 -0
- data/ext/sources/examples/talk-llama/models/phi3.cpp +152 -0
- data/ext/sources/examples/talk-llama/models/plamo.cpp +110 -0
- data/ext/sources/examples/talk-llama/models/plamo2.cpp +316 -0
- data/ext/sources/examples/talk-llama/models/plamo3.cpp +128 -0
- data/ext/sources/examples/talk-llama/models/plm.cpp +168 -0
- data/ext/sources/examples/talk-llama/models/qwen.cpp +108 -0
- data/ext/sources/examples/talk-llama/models/qwen2.cpp +126 -0
- data/ext/sources/examples/talk-llama/models/qwen2moe.cpp +151 -0
- data/ext/sources/examples/talk-llama/models/qwen2vl.cpp +117 -0
- data/ext/sources/examples/talk-llama/models/qwen3.cpp +117 -0
- data/ext/sources/examples/talk-llama/models/qwen3moe.cpp +124 -0
- data/ext/sources/examples/talk-llama/models/qwen3next.cpp +873 -0
- data/ext/sources/examples/talk-llama/models/qwen3vl-moe.cpp +149 -0
- data/ext/sources/examples/talk-llama/models/qwen3vl.cpp +141 -0
- data/ext/sources/examples/talk-llama/models/refact.cpp +94 -0
- data/ext/sources/examples/talk-llama/models/rnd1.cpp +126 -0
- data/ext/sources/examples/talk-llama/models/rwkv6-base.cpp +162 -0
- data/ext/sources/examples/talk-llama/models/rwkv6.cpp +94 -0
- data/ext/sources/examples/talk-llama/models/rwkv6qwen2.cpp +86 -0
- data/ext/sources/examples/talk-llama/models/rwkv7-base.cpp +135 -0
- data/ext/sources/examples/talk-llama/models/rwkv7.cpp +90 -0
- data/ext/sources/examples/talk-llama/models/seed-oss.cpp +124 -0
- data/ext/sources/examples/talk-llama/models/smallthinker.cpp +126 -0
- data/ext/sources/examples/talk-llama/models/smollm3.cpp +128 -0
- data/ext/sources/examples/talk-llama/models/stablelm.cpp +146 -0
- data/ext/sources/examples/talk-llama/models/starcoder.cpp +100 -0
- data/ext/sources/examples/talk-llama/models/starcoder2.cpp +121 -0
- data/ext/sources/examples/talk-llama/models/t5-dec.cpp +166 -0
- data/ext/sources/examples/talk-llama/models/t5-enc.cpp +96 -0
- data/ext/sources/examples/talk-llama/models/wavtokenizer-dec.cpp +149 -0
- data/ext/sources/examples/talk-llama/models/xverse.cpp +108 -0
- data/ext/sources/examples/talk-llama/talk-llama.cpp +9 -6
- data/ext/sources/examples/talk-llama/unicode.cpp +309 -16
- data/ext/sources/examples/talk-llama/unicode.h +45 -0
- data/ext/sources/examples/vad-speech-segments/CMakeLists.txt +1 -1
- data/ext/sources/examples/wchess/wchess.cmd/wchess.cmd.cpp +4 -2
- data/ext/sources/examples/whisper.wasm/index-tmpl.html +18 -17
- data/ext/sources/ggml/CMakeLists.txt +135 -79
- data/ext/sources/ggml/cmake/ggml-config.cmake.in +132 -93
- data/ext/sources/ggml/include/ggml-alloc.h +9 -0
- data/ext/sources/ggml/include/ggml-backend.h +21 -2
- data/ext/sources/ggml/include/ggml-cpu.h +2 -1
- data/ext/sources/ggml/include/ggml-hexagon.h +19 -0
- data/ext/sources/ggml/include/ggml-metal.h +1 -6
- data/ext/sources/ggml/include/ggml-opt.h +25 -6
- data/ext/sources/ggml/include/ggml-rpc.h +8 -11
- data/ext/sources/ggml/include/ggml-webgpu.h +19 -0
- data/ext/sources/ggml/include/ggml-zdnn.h +17 -0
- data/ext/sources/ggml/include/ggml-zendnn.h +22 -0
- data/ext/sources/ggml/include/ggml.h +406 -23
- data/ext/sources/ggml/src/CMakeLists.txt +99 -13
- data/ext/sources/ggml/src/ggml-alloc.c +368 -161
- data/ext/sources/ggml/src/ggml-backend-impl.h +5 -5
- data/ext/sources/ggml/src/ggml-backend-reg.cpp +55 -14
- data/ext/sources/ggml/src/ggml-backend.cpp +290 -57
- data/ext/sources/ggml/src/ggml-blas/CMakeLists.txt +17 -3
- data/ext/sources/ggml/src/ggml-blas/ggml-blas.cpp +10 -13
- data/ext/sources/ggml/src/ggml-cann/CMakeLists.txt +14 -0
- data/ext/sources/ggml/src/ggml-cann/acl_tensor.cpp +59 -45
- data/ext/sources/ggml/src/ggml-cann/acl_tensor.h +138 -47
- data/ext/sources/ggml/src/ggml-cann/aclnn_ops.cpp +2586 -1917
- data/ext/sources/ggml/src/ggml-cann/aclnn_ops.h +348 -309
- data/ext/sources/ggml/src/ggml-cann/common.h +350 -133
- data/ext/sources/ggml/src/ggml-cann/ggml-cann.cpp +894 -625
- data/ext/sources/ggml/src/ggml-common.h +17 -0
- data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +167 -75
- data/ext/sources/ggml/src/ggml-cpu/amx/amx.cpp +5 -2
- data/ext/sources/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +4 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/arm/quants.c +560 -622
- data/ext/sources/ggml/src/ggml-cpu/arch/arm/repack.cpp +1002 -270
- data/ext/sources/ggml/src/ggml-cpu/arch/loongarch/quants.c +107 -587
- data/ext/sources/ggml/src/ggml-cpu/arch/powerpc/quants.c +162 -589
- data/ext/sources/ggml/src/ggml-cpu/arch/riscv/cpu-feats.cpp +38 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/riscv/quants.c +373 -486
- data/ext/sources/ggml/src/ggml-cpu/arch/riscv/repack.cpp +3 -58
- data/ext/sources/ggml/src/ggml-cpu/arch/s390/cpu-feats.cpp +50 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/s390/quants.c +521 -353
- data/ext/sources/ggml/src/ggml-cpu/arch/wasm/quants.c +54 -314
- data/ext/sources/ggml/src/ggml-cpu/arch/x86/quants.c +184 -675
- data/ext/sources/ggml/src/ggml-cpu/arch/x86/repack.cpp +4682 -1660
- data/ext/sources/ggml/src/ggml-cpu/arch-fallback.h +82 -4
- data/ext/sources/ggml/src/ggml-cpu/common.h +14 -0
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-impl.h +18 -9
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +263 -111
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.cpp +39 -28
- data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.cpp +683 -82
- data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.h +38 -43
- data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +435 -119
- data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm-ppc.h +333 -0
- data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1234 -1182
- data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.h +6 -0
- data/ext/sources/ggml/src/ggml-cpu/ops.cpp +2167 -1480
- data/ext/sources/ggml/src/ggml-cpu/ops.h +10 -12
- data/ext/sources/ggml/src/ggml-cpu/quants.c +35 -0
- data/ext/sources/ggml/src/ggml-cpu/quants.h +8 -0
- data/ext/sources/ggml/src/ggml-cpu/repack.cpp +1132 -81
- data/ext/sources/ggml/src/ggml-cpu/repack.h +36 -0
- data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +120 -93
- data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.cpp +1025 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.h +13 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp +3196 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/ime_kernels.h +26 -0
- data/ext/sources/ggml/src/ggml-cpu/traits.cpp +2 -2
- data/ext/sources/ggml/src/ggml-cpu/traits.h +1 -1
- data/ext/sources/ggml/src/ggml-cpu/unary-ops.cpp +151 -0
- data/ext/sources/ggml/src/ggml-cpu/unary-ops.h +7 -0
- data/ext/sources/ggml/src/ggml-cpu/vec.cpp +294 -27
- data/ext/sources/ggml/src/ggml-cpu/vec.h +606 -48
- data/ext/sources/ggml/src/ggml-cuda/CMakeLists.txt +92 -17
- data/ext/sources/ggml/src/ggml-cuda/add-id.cu +58 -0
- data/ext/sources/ggml/src/ggml-cuda/add-id.cuh +3 -0
- data/ext/sources/ggml/src/ggml-cuda/argmax.cu +2 -2
- data/ext/sources/ggml/src/ggml-cuda/argsort.cu +123 -6
- data/ext/sources/ggml/src/ggml-cuda/argsort.cuh +16 -0
- data/ext/sources/ggml/src/ggml-cuda/binbcast.cu +330 -191
- data/ext/sources/ggml/src/ggml-cuda/binbcast.cuh +2 -0
- data/ext/sources/ggml/src/ggml-cuda/common.cuh +588 -128
- data/ext/sources/ggml/src/ggml-cuda/conv-transpose-1d.cu +1 -4
- data/ext/sources/ggml/src/ggml-cuda/conv2d.cu +166 -0
- data/ext/sources/ggml/src/ggml-cuda/conv2d.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/convert.cu +95 -22
- data/ext/sources/ggml/src/ggml-cuda/convert.cuh +25 -0
- data/ext/sources/ggml/src/ggml-cuda/cpy-utils.cuh +217 -0
- data/ext/sources/ggml/src/ggml-cuda/cpy.cu +335 -485
- data/ext/sources/ggml/src/ggml-cuda/cpy.cuh +1 -5
- data/ext/sources/ggml/src/ggml-cuda/cross-entropy-loss.cu +2 -14
- data/ext/sources/ggml/src/ggml-cuda/cumsum.cu +307 -0
- data/ext/sources/ggml/src/ggml-cuda/cumsum.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/dequantize.cuh +14 -40
- data/ext/sources/ggml/src/ggml-cuda/diag.cu +77 -0
- data/ext/sources/ggml/src/ggml-cuda/diag.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +519 -378
- data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +750 -637
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cu +49 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cuh +1244 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn-vec.cuh +586 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +98 -61
- data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cuh +48 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn.cu +230 -197
- data/ext/sources/ggml/src/ggml-cuda/fattn.cuh +2 -0
- data/ext/sources/ggml/src/ggml-cuda/fill.cu +37 -0
- data/ext/sources/ggml/src/ggml-cuda/fill.cuh +3 -0
- data/ext/sources/ggml/src/ggml-cuda/getrows.cu +50 -39
- data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +1557 -294
- data/ext/sources/ggml/src/ggml-cuda/im2col.cu +196 -35
- data/ext/sources/ggml/src/ggml-cuda/im2col.cuh +1 -0
- data/ext/sources/ggml/src/ggml-cuda/mean.cu +57 -2
- data/ext/sources/ggml/src/ggml-cuda/mma.cuh +915 -69
- data/ext/sources/ggml/src/ggml-cuda/mmf.cu +171 -0
- data/ext/sources/ggml/src/ggml-cuda/mmf.cuh +835 -0
- data/ext/sources/ggml/src/ggml-cuda/mmid.cu +164 -0
- data/ext/sources/ggml/src/ggml-cuda/mmid.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/mmq.cu +109 -67
- data/ext/sources/ggml/src/ggml-cuda/mmq.cuh +1601 -733
- data/ext/sources/ggml/src/ggml-cuda/mmvf.cu +802 -0
- data/ext/sources/ggml/src/ggml-cuda/mmvf.cuh +12 -0
- data/ext/sources/ggml/src/ggml-cuda/mmvq.cu +286 -149
- data/ext/sources/ggml/src/ggml-cuda/mmvq.cuh +1 -1
- data/ext/sources/ggml/src/ggml-cuda/norm.cu +284 -12
- data/ext/sources/ggml/src/ggml-cuda/norm.cuh +7 -0
- data/ext/sources/ggml/src/ggml-cuda/opt-step-sgd.cu +49 -0
- data/ext/sources/ggml/src/ggml-cuda/opt-step-sgd.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/pad.cu +86 -32
- data/ext/sources/ggml/src/ggml-cuda/pad_reflect_1d.cu +91 -0
- data/ext/sources/ggml/src/ggml-cuda/pad_reflect_1d.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/quantize.cu +163 -10
- data/ext/sources/ggml/src/ggml-cuda/quantize.cuh +14 -0
- data/ext/sources/ggml/src/ggml-cuda/reduce_rows.cuh +53 -0
- data/ext/sources/ggml/src/ggml-cuda/roll.cu +67 -0
- data/ext/sources/ggml/src/ggml-cuda/roll.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/rope.cu +207 -98
- data/ext/sources/ggml/src/ggml-cuda/rope.cuh +2 -0
- data/ext/sources/ggml/src/ggml-cuda/scale.cu +14 -11
- data/ext/sources/ggml/src/ggml-cuda/set-rows.cu +330 -0
- data/ext/sources/ggml/src/ggml-cuda/set-rows.cuh +7 -0
- data/ext/sources/ggml/src/ggml-cuda/set.cu +39 -0
- data/ext/sources/ggml/src/ggml-cuda/set.cuh +7 -0
- data/ext/sources/ggml/src/ggml-cuda/softcap.cu +34 -0
- data/ext/sources/ggml/src/ggml-cuda/softcap.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/softmax.cu +325 -61
- data/ext/sources/ggml/src/ggml-cuda/solve_tri.cu +275 -0
- data/ext/sources/ggml/src/ggml-cuda/solve_tri.cuh +3 -0
- data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cu +14 -12
- data/ext/sources/ggml/src/ggml-cuda/ssm-scan.cu +291 -104
- data/ext/sources/ggml/src/ggml-cuda/sum.cu +6 -10
- data/ext/sources/ggml/src/ggml-cuda/sumrows.cu +21 -4
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq112-dv112.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq128-dv128.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq256-dv256.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq40-dv40.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq576-dv512.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq64-dv64.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq72-dv72.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq80-dv80.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq96-dv96.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-f16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q4_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q4_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q5_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q5_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q8_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-f16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q5_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q5_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q8_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-f16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q4_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q4_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q5_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q5_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q8_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-f16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q4_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q4_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q5_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q5_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q8_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-f16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q4_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q4_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q5_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q5_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q8_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-f16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q4_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q4_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q5_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q5_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q8_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +40 -19
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_10.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_11.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_12.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_13.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_14.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_15.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_16.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_2.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_3.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_4.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_5.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_6.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_7.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_8.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_9.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-mxfp4.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/top-k.cu +96 -0
- data/ext/sources/ggml/src/ggml-cuda/top-k.cuh +3 -0
- data/ext/sources/ggml/src/ggml-cuda/topk-moe.cu +351 -0
- data/ext/sources/ggml/src/ggml-cuda/topk-moe.cuh +21 -0
- data/ext/sources/ggml/src/ggml-cuda/tri.cu +136 -0
- data/ext/sources/ggml/src/ggml-cuda/tri.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/tsembd.cu +3 -3
- data/ext/sources/ggml/src/ggml-cuda/unary.cu +189 -5
- data/ext/sources/ggml/src/ggml-cuda/unary.cuh +44 -0
- data/ext/sources/ggml/src/ggml-cuda/upscale.cu +248 -6
- data/ext/sources/ggml/src/ggml-cuda/vecdotq.cuh +110 -22
- data/ext/sources/ggml/src/ggml-cuda/vendors/cuda.h +8 -0
- data/ext/sources/ggml/src/ggml-cuda/vendors/hip.h +70 -37
- data/ext/sources/ggml/src/ggml-cuda/vendors/musa.h +10 -3
- data/ext/sources/ggml/src/ggml-hexagon/CMakeLists.txt +80 -0
- data/ext/sources/ggml/src/ggml-hexagon/ggml-hexagon.cpp +3151 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/CMakeLists.txt +44 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/act-ops.c +682 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/binary-ops.c +360 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/cmake-toolchain.cmake +157 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/flash-attn-ops.c +566 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/get-rows-ops.c +112 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ctx.h +35 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp-dma.c +63 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp-dma.h +157 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp-msg.h +165 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ops.h +92 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp_iface.idl +16 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-exp.c +94 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-inverse.c +72 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sigmoid.c +49 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-utils.c +1020 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-utils.h +1353 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/main.c +1001 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/matmul-ops.c +2503 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/ops-utils.h +149 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/rope-ops.c +487 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/set-rows-ops.c +168 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/softmax-ops.c +402 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/unary-ops.c +287 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/worker-pool.c +297 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/worker-pool.h +57 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp-utils.c +454 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp-utils.h +221 -0
- data/ext/sources/ggml/src/ggml-hexagon/op-desc.h +153 -0
- data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +16 -13
- data/ext/sources/ggml/src/ggml-impl.h +186 -15
- data/ext/sources/ggml/src/ggml-metal/CMakeLists.txt +10 -7
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-common.cpp +446 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-common.h +52 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.h +33 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.m +609 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.cpp +1743 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.h +273 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.m +1686 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +356 -61
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.cpp +4161 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.h +94 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal.cpp +724 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal.metal +4495 -1876
- data/ext/sources/ggml/src/ggml-musa/CMakeLists.txt +21 -9
- data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +29 -0
- data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +4005 -427
- data/ext/sources/ggml/src/ggml-opencl/kernels/add.cl +107 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/add_id.cl +42 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/conv2d.cl +185 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/conv2d_f16_f32.cl +176 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/cvt.cl +147 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/div.cl +66 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/expm1.cl +82 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/fill.cl +17 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/flash_attn_f16.cl +370 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/flash_attn_f32.cl +371 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/flash_attn_f32_f16.cl +373 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gelu.cl +27 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_mxfp4_f32.cl +162 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_mxfp4_f32.cl +156 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/get_rows.cl +36 -12
- data/ext/sources/ggml/src/ggml-opencl/kernels/glu.cl +177 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/group_norm.cl +49 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/im2col_f16.cl +1 -1
- data/ext/sources/ggml/src/ggml-opencl/kernels/im2col_f32.cl +1 -1
- data/ext/sources/ggml/src/ggml-opencl/kernels/mean.cl +39 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul.cl +73 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mat_f16_f32.cl +130 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_kq_kqv.cl +273 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_l4_lm.cl +146 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_f32_f32_l4_lm.cl +147 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_l4_lm.cl +154 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32.cl +189 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32_flat.cl +176 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32.cl +140 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32_flat.cl +222 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32.cl +144 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32_flat.cl +167 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32.cl +125 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32_flat.cl +202 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/norm.cl +80 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/pad.cl +29 -20
- data/ext/sources/ggml/src/ggml-opencl/kernels/rms_norm.cl +94 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/rope.cl +50 -24
- data/ext/sources/ggml/src/ggml-opencl/kernels/scale.cl +3 -2
- data/ext/sources/ggml/src/ggml-opencl/kernels/set_rows.cl +208 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_4_f16.cl +34 -13
- data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_4_f32.cl +34 -13
- data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_f16.cl +34 -13
- data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_f32.cl +34 -13
- data/ext/sources/ggml/src/ggml-opencl/kernels/softplus.cl +88 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/sqr.cl +53 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/sqrt.cl +53 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/ssm_conv.cl +77 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/sub.cl +66 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/transpose.cl +33 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/tsembd.cl +2 -2
- data/ext/sources/ggml/src/ggml-opencl/kernels/upscale.cl +2 -3
- data/ext/sources/ggml/src/ggml-opt.cpp +97 -41
- data/ext/sources/ggml/src/ggml-quants.c +111 -16
- data/ext/sources/ggml/src/ggml-quants.h +6 -0
- data/ext/sources/ggml/src/ggml-rpc/ggml-rpc.cpp +497 -195
- data/ext/sources/ggml/src/ggml-sycl/CMakeLists.txt +48 -3
- data/ext/sources/ggml/src/ggml-sycl/add-id.cpp +77 -0
- data/ext/sources/ggml/src/ggml-sycl/add-id.hpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/backend.hpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/binbcast.cpp +6 -5
- data/ext/sources/ggml/src/ggml-sycl/common.hpp +117 -15
- data/ext/sources/ggml/src/ggml-sycl/concat.cpp +50 -30
- data/ext/sources/ggml/src/ggml-sycl/conv.cpp +10 -4
- data/ext/sources/ggml/src/ggml-sycl/convert.cpp +200 -99
- data/ext/sources/ggml/src/ggml-sycl/count-equal.cpp +79 -0
- data/ext/sources/ggml/src/ggml-sycl/count-equal.hpp +9 -0
- data/ext/sources/ggml/src/ggml-sycl/cpy.cpp +72 -309
- data/ext/sources/ggml/src/ggml-sycl/cpy.hpp +213 -1
- data/ext/sources/ggml/src/ggml-sycl/dequantize.hpp +18 -0
- data/ext/sources/ggml/src/ggml-sycl/dmmv.cpp +67 -49
- data/ext/sources/ggml/src/ggml-sycl/dpct/helper.hpp +77 -34
- data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +397 -314
- data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +12 -2
- data/ext/sources/ggml/src/ggml-sycl/gemm.hpp +14 -26
- data/ext/sources/ggml/src/ggml-sycl/getrows.cpp +9 -6
- data/ext/sources/ggml/src/ggml-sycl/ggml-sycl.cpp +643 -413
- data/ext/sources/ggml/src/ggml-sycl/gla.cpp +2 -2
- data/ext/sources/ggml/src/ggml-sycl/im2col.cpp +2 -2
- data/ext/sources/ggml/src/ggml-sycl/mmq.cpp +80 -60
- data/ext/sources/ggml/src/ggml-sycl/mmvq.cpp +223 -132
- data/ext/sources/ggml/src/ggml-sycl/norm.cpp +230 -55
- data/ext/sources/ggml/src/ggml-sycl/norm.hpp +2 -0
- data/ext/sources/ggml/src/ggml-sycl/pad.cpp +97 -0
- data/ext/sources/ggml/src/ggml-sycl/pad.hpp +24 -0
- data/ext/sources/ggml/src/ggml-sycl/pad_reflect_1d.cpp +100 -0
- data/ext/sources/ggml/src/ggml-sycl/pad_reflect_1d.hpp +10 -0
- data/ext/sources/ggml/src/ggml-sycl/presets.hpp +2 -0
- data/ext/sources/ggml/src/ggml-sycl/quantize.hpp +133 -0
- data/ext/sources/ggml/src/ggml-sycl/quants.hpp +8 -9
- data/ext/sources/ggml/src/ggml-sycl/repeat_back.cpp +76 -0
- data/ext/sources/ggml/src/ggml-sycl/repeat_back.hpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/roll.cpp +122 -0
- data/ext/sources/ggml/src/ggml-sycl/roll.hpp +20 -0
- data/ext/sources/ggml/src/ggml-sycl/rope.cpp +65 -59
- data/ext/sources/ggml/src/ggml-sycl/set.cpp +73 -0
- data/ext/sources/ggml/src/ggml-sycl/set.hpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/set_rows.cpp +234 -0
- data/ext/sources/ggml/src/ggml-sycl/set_rows.hpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/softmax.cpp +330 -165
- data/ext/sources/ggml/src/ggml-sycl/softmax.hpp +4 -0
- data/ext/sources/ggml/src/ggml-sycl/ssm_conv.cpp +127 -0
- data/ext/sources/ggml/src/ggml-sycl/ssm_conv.hpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/tsembd.cpp +12 -6
- data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +60 -6
- data/ext/sources/ggml/src/ggml-sycl/wkv.cpp +16 -12
- data/ext/sources/ggml/src/ggml-vulkan/CMakeLists.txt +38 -18
- data/ext/sources/ggml/src/ggml-vulkan/ggml-vulkan.cpp +7398 -2635
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/abs.comp +21 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/acc.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add.comp +43 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add1.comp +28 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add_id.comp +42 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/arange.comp +20 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argmax.comp +15 -6
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp +56 -39
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argsort_large.comp +114 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/ceil.comp +22 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/clamp.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/concat.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/contig_copy.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_dw.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +347 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +5 -5
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +67 -13
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_transpose.comp +67 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/cos.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/count_equal.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/count_experts.comp +51 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/cumsum.comp +83 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/cumsum_multipass1.comp +60 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/cumsum_multipass2.comp +66 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_f32.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{dequant_funcs.comp → dequant_funcs.glsl} +158 -16
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{dequant_funcs_cm2.comp → dequant_funcs_cm2.glsl} +38 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{dequant_head.comp → dequant_head.glsl} +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_m.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_s.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_s.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xs.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xxs.comp +3 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_s.comp +7 -6
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_xxs.comp +5 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_nl.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_xs.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_mxfp4.comp +32 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q2_k.comp +4 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q3_k.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_0.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_1.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_k.comp +4 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_0.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_1.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_k.comp +4 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q6_k.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q8_0.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/diag.comp +29 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/diag_mask_inf.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/div.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/exp.comp +21 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/fill.comp +19 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +103 -36
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.glsl +220 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +139 -45
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +113 -38
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +75 -14
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/floor.comp +22 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu_erf.comp +27 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu_quick.comp +11 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gelu.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gelu_erf.comp +39 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gelu_quick.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{generic_binary_head.comp → generic_binary_head.glsl} +19 -17
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{generic_head.comp → generic_head.glsl} +2 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{generic_unary_head.comp → generic_unary_head.glsl} +7 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/get_rows.comp +21 -12
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/get_rows_quant.comp +28 -18
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{glu_head.comp → glu_head.glsl} +4 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/group_norm.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/hardsigmoid.comp +22 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/hardswish.comp +22 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +33 -17
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col_3d.comp +125 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/l2_norm.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/leaky_relu.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/log.comp +18 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.glsl +227 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iface.glsl +35 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_m.comp +71 -21
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_s.comp +41 -25
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_s.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xs.comp +44 -26
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xxs.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_s.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_xxs.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_nc.comp +20 -14
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_p021.comp +9 -7
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp +4 -6
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q3_k.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp +4 -6
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp +4 -6
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp +143 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq_funcs.glsl +494 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +144 -556
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +230 -51
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.glsl +566 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_id_funcs.glsl +72 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +90 -223
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.glsl +454 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_shmem_types.glsl +78 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp +195 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/neg.comp +20 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/norm.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_adamw.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_sgd.comp +22 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/pad.comp +41 -5
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/pool2d.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/quantize_q8_1.comp +59 -9
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/reglu.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/relu.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/repeat.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/repeat_back.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +104 -14
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_back.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_partials.comp +65 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/roll.comp +46 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl +234 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.glsl +20 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +6 -52
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +6 -35
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +6 -35
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl +28 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_vision.comp +6 -39
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/round.comp +29 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rte.glsl +5 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp +3 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sigmoid.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/silu.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/silu_back.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sin.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +30 -8
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_back.comp +6 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large1.comp +62 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large2.comp +79 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large3.comp +65 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large_common.glsl +53 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/softplus.comp +23 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/solve_tri.comp +81 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sqrt.comp +17 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/square.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/ssm_conv.comp +44 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/ssm_scan.comp +124 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/step.comp +22 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sub.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.comp +16 -6
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.glsl +25 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/swiglu.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/swiglu_oai.comp +14 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/tanh.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/timestep_embedding.comp +5 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/topk_argsort.comp +118 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/topk_moe.comp +213 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/topk_nary_search.comp +246 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/tri.comp +43 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/trunc.comp +22 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{types.comp → types.glsl} +435 -24
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +148 -6
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/utils.glsl +25 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +619 -177
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/xielu.comp +35 -0
- data/ext/sources/ggml/src/ggml-webgpu/CMakeLists.txt +80 -0
- data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp +169 -0
- data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu.cpp +3087 -0
- data/ext/sources/ggml/src/ggml-webgpu/pre_wgsl.hpp +778 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/bin_op.tmpl.wgsl +188 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary_head.tmpl +45 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +930 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cpy.tmpl.wgsl +101 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +147 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn.wgsl +591 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/get_rows.tmpl.wgsl +874 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/glu.tmpl.wgsl +323 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl +40 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl +907 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl +97 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.tmpl.wgsl +247 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_subgroup_matrix.tmpl.wgsl +302 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.tmpl.wgsl +267 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm.wgsl +123 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rope.tmpl.wgsl +295 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/scale.tmpl.wgsl +90 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.tmpl.wgsl +112 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +81 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/soft_max.tmpl.wgsl +345 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/unary_op.wgsl +483 -0
- data/ext/sources/ggml/src/ggml-zdnn/CMakeLists.txt +36 -0
- data/ext/sources/ggml/src/ggml-zdnn/common.hpp +59 -0
- data/ext/sources/ggml/src/ggml-zdnn/ggml-zdnn.cpp +628 -0
- data/ext/sources/ggml/src/ggml-zdnn/mmf.cpp +80 -0
- data/ext/sources/ggml/src/ggml-zdnn/mmf.hpp +12 -0
- data/ext/sources/ggml/src/ggml-zdnn/utils.cpp +79 -0
- data/ext/sources/ggml/src/ggml-zdnn/utils.hpp +19 -0
- data/ext/sources/ggml/src/ggml-zendnn/CMakeLists.txt +92 -0
- data/ext/sources/ggml/src/ggml-zendnn/ggml-zendnn.cpp +466 -0
- data/ext/sources/ggml/src/ggml.c +901 -129
- data/ext/sources/ggml/src/gguf.cpp +8 -1
- data/ext/sources/include/whisper.h +1 -0
- data/ext/sources/src/CMakeLists.txt +3 -1
- data/ext/sources/src/whisper.cpp +124 -81
- data/ext/sources/tests/CMakeLists.txt +8 -1
- data/ext/sources/tests/test-vad-full.cpp +7 -5
- data/ext/sources/tests/test-vad.cpp +3 -3
- data/extsources.rb +1 -0
- data/lib/whisper/model/uri.rb +17 -18
- data/sig/whisper.rbs +126 -2
- data/test/test_params.rb +24 -8
- data/test/test_segment.rb +0 -1
- data/test/test_token.rb +70 -0
- data/test/test_vad.rb +1 -1
- data/test/test_vad_context.rb +50 -0
- data/test/test_vad_segment.rb +19 -0
- data/test/test_vad_segments.rb +16 -0
- data/test/test_whisper.rb +8 -1
- data/whispercpp.gemspec +1 -1
- metadata +439 -179
- data/ext/sources/build-xcframework.sh +0 -547
- data/ext/sources/examples/talk-llama/llama-kv-cache-unified-iswa.cpp +0 -279
- data/ext/sources/examples/talk-llama/llama-kv-cache-unified.cpp +0 -1841
- data/ext/sources/examples/talk-llama/llama-kv-cache-unified.h +0 -303
- data/ext/sources/ggml/include/ggml-kompute.h +0 -50
- data/ext/sources/ggml/src/ggml-amx/CMakeLists.txt +0 -107
- data/ext/sources/ggml/src/ggml-amx/common.h +0 -94
- data/ext/sources/ggml/src/ggml-amx/ggml-amx.cpp +0 -446
- data/ext/sources/ggml/src/ggml-amx/mmq.cpp +0 -2510
- data/ext/sources/ggml/src/ggml-amx/mmq.h +0 -17
- data/ext/sources/ggml/src/ggml-cann/Doxyfile +0 -2579
- data/ext/sources/ggml/src/ggml-cann/kernels/CMakeLists.txt +0 -30
- data/ext/sources/ggml/src/ggml-cann/kernels/ascendc_kernels.h +0 -19
- data/ext/sources/ggml/src/ggml-cann/kernels/dup.cpp +0 -234
- data/ext/sources/ggml/src/ggml-cann/kernels/get_row_f16.cpp +0 -197
- data/ext/sources/ggml/src/ggml-cann/kernels/get_row_f32.cpp +0 -190
- data/ext/sources/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -204
- data/ext/sources/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -191
- data/ext/sources/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -218
- data/ext/sources/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -216
- data/ext/sources/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -295
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f16.cu +0 -357
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f16.cuh +0 -3
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f32.cu +0 -365
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f32.cuh +0 -3
- data/ext/sources/ggml/src/ggml-cuda/fattn-vec-f16.cuh +0 -482
- data/ext/sources/ggml/src/ggml-cuda/fattn-vec-f32.cuh +0 -472
- data/ext/sources/ggml/src/ggml-cuda/mmv.cu +0 -506
- data/ext/sources/ggml/src/ggml-cuda/mmv.cuh +0 -11
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-kompute/CMakeLists.txt +0 -166
- data/ext/sources/ggml/src/ggml-kompute/ggml-kompute.cpp +0 -2251
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/common.comp +0 -112
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_add.comp +0 -58
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_addrow.comp +0 -25
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f16.comp +0 -52
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f32.comp +0 -52
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f16.comp +0 -52
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f32.comp +0 -52
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_diagmask.comp +0 -30
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_gelu.comp +0 -22
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows.comp +0 -17
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f16.comp +0 -31
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f32.comp +0 -31
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_0.comp +0 -38
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_1.comp +0 -39
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q6_k.comp +0 -44
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul.comp +0 -52
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_f16.comp +0 -69
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_mat_f32.comp +0 -51
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_0.comp +0 -33
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_1.comp +0 -35
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_k.comp +0 -140
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q6_k.comp +0 -106
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q8_0.comp +0 -73
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n.comp +0 -52
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n_pre.comp +0 -28
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_norm.comp +0 -84
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_relu.comp +0 -21
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rmsnorm.comp +0 -53
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f16.comp +0 -52
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f32.comp +0 -52
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f16.comp +0 -52
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f32.comp +0 -52
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_scale.comp +0 -19
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_scale_8.comp +0 -23
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_silu.comp +0 -22
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_softmax.comp +0 -72
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/rope_common.comp +0 -71
- data/ext/sources/ggml/src/ggml-metal/ggml-metal.m +0 -6280
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp +0 -162
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.comp +0 -118
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.comp +0 -99
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.comp +0 -58
- /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{test_bfloat16_support.comp → feature-tests/bfloat16.comp} +0 -0
- /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{test_coopmat_support.comp → feature-tests/coopmat.comp} +0 -0
- /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{test_coopmat2_support.comp → feature-tests/coopmat2.comp} +0 -0
- /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{test_integer_dot_support.comp → feature-tests/integer_dot.comp} +0 -0
- /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{glu_main.comp → glu_main.glsl} +0 -0
|
@@ -22,23 +22,24 @@
|
|
|
22
22
|
|
|
23
23
|
#include "ggml-cann.h"
|
|
24
24
|
|
|
25
|
+
#include "ggml-backend-impl.h"
|
|
26
|
+
#include "ggml-cann/aclnn_ops.h"
|
|
27
|
+
#include "ggml-cann/common.h"
|
|
28
|
+
#include "ggml-impl.h"
|
|
29
|
+
#include "ggml.h"
|
|
30
|
+
|
|
25
31
|
#include <acl/acl.h>
|
|
32
|
+
#include <aclnnop/aclnn_trans_matmul_weight.h>
|
|
26
33
|
#include <stdarg.h>
|
|
27
34
|
|
|
35
|
+
#include <chrono>
|
|
28
36
|
#include <cmath>
|
|
29
37
|
#include <cstdio>
|
|
30
38
|
#include <cstring>
|
|
31
39
|
#include <mutex>
|
|
40
|
+
#include <optional>
|
|
32
41
|
#include <queue>
|
|
33
|
-
#include <chrono>
|
|
34
42
|
#include <unordered_set>
|
|
35
|
-
#include <optional>
|
|
36
|
-
|
|
37
|
-
#include "ggml-impl.h"
|
|
38
|
-
#include "ggml-backend-impl.h"
|
|
39
|
-
#include "ggml-cann/aclnn_ops.h"
|
|
40
|
-
#include "ggml-cann/common.h"
|
|
41
|
-
#include "ggml.h"
|
|
42
43
|
|
|
43
44
|
#define GGML_COMMON_DECL_C
|
|
44
45
|
|
|
@@ -55,33 +56,41 @@
|
|
|
55
56
|
* @param line The line number where the error occurred.
|
|
56
57
|
* @param msg The error message.
|
|
57
58
|
*/
|
|
58
|
-
[[noreturn]] void ggml_cann_error(const char* stmt, const char* func,
|
|
59
|
-
const char* file, int line, const char* msg) {
|
|
59
|
+
[[noreturn]] void ggml_cann_error(const char * stmt, const char * func, const char * file, int line, const char * msg) {
|
|
60
60
|
int32_t id = -1;
|
|
61
61
|
aclrtGetDevice(&id);
|
|
62
62
|
|
|
63
63
|
GGML_LOG_ERROR("CANN error: %s\n", msg);
|
|
64
|
-
GGML_LOG_ERROR(" current device: %d, in function %s at %s:%d\n", id, func,
|
|
65
|
-
file, line);
|
|
64
|
+
GGML_LOG_ERROR(" current device: %d, in function %s at %s:%d\n", id, func, file, line);
|
|
66
65
|
GGML_LOG_ERROR(" %s\n", stmt);
|
|
67
66
|
// abort with GGML_ASSERT to get a stack trace
|
|
68
67
|
GGML_ABORT("CANN error");
|
|
69
68
|
}
|
|
70
69
|
|
|
70
|
+
// Thread-local variable to record the current device of this thread.
|
|
71
|
+
thread_local int g_current_cann_device = -1;
|
|
72
|
+
|
|
71
73
|
/**
|
|
72
|
-
* @brief
|
|
74
|
+
* @brief Set the CANN device to be used.
|
|
73
75
|
*
|
|
74
|
-
* @param device The device ID to set.
|
|
76
|
+
* @param device The target device ID to set.
|
|
75
77
|
*/
|
|
76
78
|
void ggml_cann_set_device(const int32_t device) {
|
|
77
|
-
//
|
|
78
|
-
//
|
|
79
|
-
//
|
|
79
|
+
// int current_device = -1;
|
|
80
|
+
// Note: In some CANN versions, if no device has been set yet,
|
|
81
|
+
// aclrtGetDevice(¤t_device) may return 0 by default.
|
|
82
|
+
// aclrtGetDevice(¤t_device);
|
|
83
|
+
|
|
84
|
+
// If the current device is already the target one, no need to switch.
|
|
85
|
+
if (device == g_current_cann_device) {
|
|
86
|
+
return;
|
|
87
|
+
}
|
|
80
88
|
|
|
81
|
-
//
|
|
82
|
-
// return;
|
|
83
|
-
// }
|
|
89
|
+
// Switch to the new device.
|
|
84
90
|
ACL_CHECK(aclrtSetDevice(device));
|
|
91
|
+
|
|
92
|
+
// Update the global device record.
|
|
93
|
+
g_current_cann_device = device;
|
|
85
94
|
}
|
|
86
95
|
|
|
87
96
|
/**
|
|
@@ -96,12 +105,14 @@ int32_t ggml_cann_get_device() {
|
|
|
96
105
|
}
|
|
97
106
|
|
|
98
107
|
/**
|
|
99
|
-
* @brief Get the value of the specified environment variable (name).
|
|
108
|
+
* @brief Get the value of the specified environment variable (name) as lowercase.
|
|
100
109
|
* if not empty, return a std::string object
|
|
101
110
|
*/
|
|
102
|
-
std::optional<std::string>
|
|
103
|
-
const char* val = std::getenv(name.c_str());
|
|
104
|
-
if (!val)
|
|
111
|
+
std::optional<std::string> get_env_as_lowercase(const std::string & name) {
|
|
112
|
+
const char * val = std::getenv(name.c_str());
|
|
113
|
+
if (!val) {
|
|
114
|
+
return std::nullopt;
|
|
115
|
+
}
|
|
105
116
|
std::string res = std::string(val);
|
|
106
117
|
std::transform(res.begin(), res.end(), res.begin(), ::tolower);
|
|
107
118
|
return res;
|
|
@@ -110,11 +121,29 @@ std::optional<std::string> get_env(const std::string& name) {
|
|
|
110
121
|
/**
|
|
111
122
|
* @brief Verify whether the environment variable is a valid value.
|
|
112
123
|
*/
|
|
113
|
-
bool parse_bool(const std::string& value) {
|
|
114
|
-
std::unordered_set<std::string> valid_values = {"on", "1", "yes", "y", "enable", "true"};
|
|
124
|
+
bool parse_bool(const std::string & value) {
|
|
125
|
+
static const std::unordered_set<std::string> valid_values = { "on", "1", "yes", "y", "enable", "true" };
|
|
115
126
|
return valid_values.find(value) != valid_values.end();
|
|
116
127
|
}
|
|
117
128
|
|
|
129
|
+
/**
|
|
130
|
+
* @brief Parse a string as an integer, returning 0 if invalid.
|
|
131
|
+
*
|
|
132
|
+
* This function attempts to convert the input string `value` to an `int`.
|
|
133
|
+
* If the string is not a valid integer or is out of the `int` range,
|
|
134
|
+
* it returns 0.
|
|
135
|
+
*
|
|
136
|
+
* @param value The string to parse.
|
|
137
|
+
* @return The parsed integer, or 0 if conversion fails.
|
|
138
|
+
*/
|
|
139
|
+
int parse_integer(const std::string & value) {
|
|
140
|
+
try {
|
|
141
|
+
return std::stoi(value);
|
|
142
|
+
} catch (...) {
|
|
143
|
+
return 0;
|
|
144
|
+
}
|
|
145
|
+
}
|
|
146
|
+
|
|
118
147
|
/**
|
|
119
148
|
* @brief Initialize the CANN device information.
|
|
120
149
|
*
|
|
@@ -126,11 +155,10 @@ bool parse_bool(const std::string& value) {
|
|
|
126
155
|
static ggml_cann_device_info ggml_cann_init() {
|
|
127
156
|
ggml_cann_device_info info = {};
|
|
128
157
|
|
|
129
|
-
aclError err = aclrtGetDeviceCount((uint32_t*)&info.device_count);
|
|
158
|
+
aclError err = aclrtGetDeviceCount((uint32_t *) &info.device_count);
|
|
130
159
|
|
|
131
160
|
if (err != ACL_SUCCESS) {
|
|
132
|
-
GGML_LOG_ERROR("%s: failed to initialize CANN: %s\n",
|
|
133
|
-
__func__, aclGetRecentErrMsg());
|
|
161
|
+
GGML_LOG_ERROR("%s: failed to initialize CANN: %s\n", __func__, aclGetRecentErrMsg());
|
|
134
162
|
return info;
|
|
135
163
|
}
|
|
136
164
|
|
|
@@ -138,16 +166,15 @@ static ggml_cann_device_info ggml_cann_init() {
|
|
|
138
166
|
|
|
139
167
|
for (int id = 0; id < info.device_count; ++id) {
|
|
140
168
|
aclrtPhysicalMemProp prop = {};
|
|
141
|
-
prop.handleType
|
|
142
|
-
prop.allocationType
|
|
143
|
-
prop.memAttr
|
|
144
|
-
prop.location.type
|
|
145
|
-
prop.location.id
|
|
146
|
-
prop.reserve
|
|
147
|
-
err
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
info.devices[id].vmm = err == ACL_SUCCESS;
|
|
169
|
+
prop.handleType = ACL_MEM_HANDLE_TYPE_NONE;
|
|
170
|
+
prop.allocationType = ACL_MEM_ALLOCATION_TYPE_PINNED;
|
|
171
|
+
prop.memAttr = ACL_HBM_MEM_HUGE;
|
|
172
|
+
prop.location.type = ACL_MEM_LOCATION_TYPE_DEVICE;
|
|
173
|
+
prop.location.id = id;
|
|
174
|
+
prop.reserve = 0;
|
|
175
|
+
err = aclrtMemGetAllocationGranularity(&prop, ACL_RT_MEM_ALLOC_GRANULARITY_RECOMMENDED,
|
|
176
|
+
&info.devices[id].vmm_granularity);
|
|
177
|
+
info.devices[id].vmm = err == ACL_SUCCESS;
|
|
151
178
|
|
|
152
179
|
size_t free, total;
|
|
153
180
|
ggml_backend_cann_get_device_memory(id, &free, &total);
|
|
@@ -167,7 +194,7 @@ static ggml_cann_device_info ggml_cann_init() {
|
|
|
167
194
|
*
|
|
168
195
|
* @return A reference to the structure containing the device information.
|
|
169
196
|
*/
|
|
170
|
-
const ggml_cann_device_info& ggml_cann_info() {
|
|
197
|
+
const ggml_cann_device_info & ggml_cann_info() {
|
|
171
198
|
static ggml_cann_device_info info = ggml_cann_init();
|
|
172
199
|
return info;
|
|
173
200
|
}
|
|
@@ -187,7 +214,7 @@ struct ggml_cann_pool_buf_prio : public ggml_cann_pool {
|
|
|
187
214
|
/**
|
|
188
215
|
* @brief The minimum free margin for a buffer.
|
|
189
216
|
*/
|
|
190
|
-
static const size_t min_free_margin = 1ull << 20;
|
|
217
|
+
static const size_t min_free_margin = 1ull << 20; // 1MB
|
|
191
218
|
|
|
192
219
|
/**
|
|
193
220
|
* @brief The alignment for buffer allocation.
|
|
@@ -208,22 +235,18 @@ struct ggml_cann_pool_buf_prio : public ggml_cann_pool {
|
|
|
208
235
|
* @brief Structure representing a CANN buffer.
|
|
209
236
|
*/
|
|
210
237
|
struct ggml_cann_buffer {
|
|
211
|
-
void*
|
|
212
|
-
size_t
|
|
213
|
-
std::chrono::steady_clock::time_point last_used;
|
|
238
|
+
void * ptr = nullptr; ///< Pointer to the buffer.
|
|
239
|
+
size_t size = 0; ///< Size of the buffer.
|
|
240
|
+
std::chrono::steady_clock::time_point last_used; ///< Last used time.
|
|
214
241
|
|
|
215
|
-
bool operator>(const ggml_cann_buffer& other) const {
|
|
216
|
-
return size > other.size;
|
|
217
|
-
}
|
|
242
|
+
bool operator>(const ggml_cann_buffer & other) const { return size > other.size; }
|
|
218
243
|
};
|
|
219
244
|
|
|
220
245
|
/**
|
|
221
246
|
* @brief Array of CANN buffers in the pool.
|
|
222
247
|
*/
|
|
223
|
-
std::unordered_map<void*, size_t>
|
|
224
|
-
std::priority_queue<ggml_cann_buffer,
|
|
225
|
-
std::vector<ggml_cann_buffer>,
|
|
226
|
-
std::greater<>> free_buffers ;
|
|
248
|
+
std::unordered_map<void *, size_t> buffer_pool;
|
|
249
|
+
std::priority_queue<ggml_cann_buffer, std::vector<ggml_cann_buffer>, std::greater<>> free_buffers;
|
|
227
250
|
|
|
228
251
|
/**
|
|
229
252
|
* @brief Total size of all buffers in the pool.
|
|
@@ -236,7 +259,7 @@ struct ggml_cann_pool_buf_prio : public ggml_cann_pool {
|
|
|
236
259
|
* @param device The device ID to associate with this buffer pool.
|
|
237
260
|
*/
|
|
238
261
|
explicit ggml_cann_pool_buf_prio(int device) : device(device) {
|
|
239
|
-
disable_clean = parse_bool(
|
|
262
|
+
disable_clean = parse_bool(get_env_as_lowercase("GGML_CANN_DISABLE_BUF_POOL_CLEAN").value_or(""));
|
|
240
263
|
}
|
|
241
264
|
|
|
242
265
|
/**
|
|
@@ -244,7 +267,7 @@ struct ggml_cann_pool_buf_prio : public ggml_cann_pool {
|
|
|
244
267
|
*/
|
|
245
268
|
~ggml_cann_pool_buf_prio() {
|
|
246
269
|
ggml_cann_set_device(device);
|
|
247
|
-
for (auto& [b_ptr, b_size] : buffer_pool) {
|
|
270
|
+
for (auto & [b_ptr, b_size] : buffer_pool) {
|
|
248
271
|
aclrtFree(b_ptr);
|
|
249
272
|
pool_size -= b_size;
|
|
250
273
|
}
|
|
@@ -260,14 +283,14 @@ struct ggml_cann_pool_buf_prio : public ggml_cann_pool {
|
|
|
260
283
|
* the allocated buffer.
|
|
261
284
|
* @return A pointer to the allocated buffer.
|
|
262
285
|
*/
|
|
263
|
-
void* alloc(size_t size, size_t* actual_size) override {
|
|
286
|
+
void * alloc(size_t size, size_t * actual_size) override {
|
|
264
287
|
size = GGML_PAD(size, alignment);
|
|
265
288
|
if (size == 0) {
|
|
266
289
|
size = alignment;
|
|
267
290
|
}
|
|
268
291
|
|
|
269
|
-
void* ptr = nullptr;
|
|
270
|
-
auto
|
|
292
|
+
void * ptr = nullptr;
|
|
293
|
+
auto now = std::chrono::steady_clock::now();
|
|
271
294
|
|
|
272
295
|
std::vector<ggml_cann_buffer> free_buffers_rest;
|
|
273
296
|
free_buffers_rest.reserve(free_buffers.size());
|
|
@@ -280,24 +303,22 @@ struct ggml_cann_pool_buf_prio : public ggml_cann_pool {
|
|
|
280
303
|
const size_t margin = b.size - size;
|
|
281
304
|
if (margin <= max_reuse_margin) {
|
|
282
305
|
*actual_size = b.size;
|
|
283
|
-
ptr
|
|
306
|
+
ptr = b.ptr;
|
|
284
307
|
#ifdef DEBUG_CANN_MALLOC
|
|
285
308
|
GGML_LOG_INFO(
|
|
286
309
|
"cann pool[%d]: reused %p, "
|
|
287
310
|
"pool_size = %5u MB, "
|
|
288
311
|
"size = %5u MB, "
|
|
289
312
|
"margin = %5u MB\n",
|
|
290
|
-
device, b.ptr,
|
|
291
|
-
(uint32_t)(GGML_PAD(
|
|
292
|
-
(uint32_t)(GGML_PAD(
|
|
293
|
-
(uint32_t)(GGML_PAD(margin, 1048576) / 1048576));
|
|
313
|
+
device, b.ptr, (uint32_t) (GGML_PAD(pool_size, 1048576) / 1048576),
|
|
314
|
+
(uint32_t) (GGML_PAD(size, 1048576) / 1048576),
|
|
315
|
+
(uint32_t) (GGML_PAD(margin, 1048576) / 1048576));
|
|
294
316
|
#endif
|
|
295
317
|
break;
|
|
296
318
|
}
|
|
297
319
|
}
|
|
298
320
|
|
|
299
|
-
bool should_clean = !disable_clean &&
|
|
300
|
-
b.size > min_free_margin &&
|
|
321
|
+
bool should_clean = !disable_clean && b.size > min_free_margin &&
|
|
301
322
|
std::chrono::duration_cast<std::chrono::milliseconds>(now - b.last_used).count() > 100;
|
|
302
323
|
if (should_clean) {
|
|
303
324
|
// free the buffer if the size is needed to be freed
|
|
@@ -309,20 +330,20 @@ struct ggml_cann_pool_buf_prio : public ggml_cann_pool {
|
|
|
309
330
|
"cann pool[%d]: clean %p, "
|
|
310
331
|
"pool_size = %5u MB, "
|
|
311
332
|
"size = %5u MB\n",
|
|
312
|
-
device, b.ptr,
|
|
313
|
-
(uint32_t)(GGML_PAD(
|
|
314
|
-
(uint32_t)(GGML_PAD(b.size, 1048576) / 1048576));
|
|
333
|
+
device, b.ptr, (uint32_t) (GGML_PAD(pool_size, 1048576) / 1048576),
|
|
334
|
+
(uint32_t) (GGML_PAD(b.size, 1048576) / 1048576));
|
|
315
335
|
#endif
|
|
316
336
|
continue;
|
|
317
337
|
}
|
|
318
338
|
free_buffers_rest.push_back(b);
|
|
319
339
|
}
|
|
320
|
-
for (ggml_cann_buffer &b : free_buffers_rest) {
|
|
340
|
+
for (ggml_cann_buffer & b : free_buffers_rest) {
|
|
321
341
|
free_buffers.push(std::move(b));
|
|
322
342
|
}
|
|
323
343
|
|
|
324
344
|
#ifdef DEBUG_CANN_MALLOC
|
|
325
|
-
GGML_LOG_INFO("cann pool[%d] free pool_size = %5u MB\n\n", device,
|
|
345
|
+
GGML_LOG_INFO("cann pool[%d] free pool_size = %5u MB\n\n", device,
|
|
346
|
+
(uint32_t) (GGML_PAD(pool_size, 1048576) / 1048576));
|
|
326
347
|
#endif
|
|
327
348
|
if (ptr != nullptr) {
|
|
328
349
|
return ptr;
|
|
@@ -338,8 +359,8 @@ struct ggml_cann_pool_buf_prio : public ggml_cann_pool {
|
|
|
338
359
|
"cann pool[%d]: allocate %p, "
|
|
339
360
|
"pool_size = %5u MB, "
|
|
340
361
|
"size = %5u MB\n",
|
|
341
|
-
device, ptr, (uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576),
|
|
342
|
-
(uint32_t)(GGML_PAD(size, 1048576) / 1048576));
|
|
362
|
+
device, ptr, (uint32_t) (GGML_PAD(pool_size, 1048576) / 1048576),
|
|
363
|
+
(uint32_t) (GGML_PAD(size, 1048576) / 1048576));
|
|
343
364
|
#endif
|
|
344
365
|
buffer_pool.emplace(ptr, size);
|
|
345
366
|
return ptr;
|
|
@@ -351,7 +372,7 @@ struct ggml_cann_pool_buf_prio : public ggml_cann_pool {
|
|
|
351
372
|
* @param ptr Pointer to the buffer to free.
|
|
352
373
|
* @param size Size of the buffer to free.
|
|
353
374
|
*/
|
|
354
|
-
void free(void* ptr, size_t size) override {
|
|
375
|
+
void free(void * ptr, size_t size) override {
|
|
355
376
|
GGML_UNUSED(size);
|
|
356
377
|
auto it = buffer_pool.find(ptr);
|
|
357
378
|
if (it == buffer_pool.end()) {
|
|
@@ -359,13 +380,12 @@ struct ggml_cann_pool_buf_prio : public ggml_cann_pool {
|
|
|
359
380
|
}
|
|
360
381
|
|
|
361
382
|
auto now = std::chrono::steady_clock::now();
|
|
362
|
-
free_buffers.emplace(ggml_cann_buffer{ptr, it->second, now});
|
|
383
|
+
free_buffers.emplace(ggml_cann_buffer{ ptr, it->second, now });
|
|
363
384
|
#ifdef DEBUG_CANN_MALLOC
|
|
364
385
|
GGML_LOG_INFO(
|
|
365
386
|
"cann pool[%d]: return %p, "
|
|
366
387
|
"pool_size = %5u MB\n",
|
|
367
|
-
device, ptr,
|
|
368
|
-
(uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576));
|
|
388
|
+
device, ptr, (uint32_t) (GGML_PAD(pool_size, 1048576) / 1048576));
|
|
369
389
|
#endif
|
|
370
390
|
}
|
|
371
391
|
};
|
|
@@ -384,7 +404,7 @@ struct ggml_cann_pool_buf : public ggml_cann_pool {
|
|
|
384
404
|
/**
|
|
385
405
|
* @brief The minimum free margin for a buffer.
|
|
386
406
|
*/
|
|
387
|
-
static const size_t min_free_margin = 1ull << 20;
|
|
407
|
+
static const size_t min_free_margin = 1ull << 20; // 1MB
|
|
388
408
|
|
|
389
409
|
/**
|
|
390
410
|
* @brief The alignment for buffer allocation.
|
|
@@ -410,10 +430,10 @@ struct ggml_cann_pool_buf : public ggml_cann_pool {
|
|
|
410
430
|
* @brief Structure representing a CANN buffer.
|
|
411
431
|
*/
|
|
412
432
|
struct ggml_cann_buffer {
|
|
413
|
-
void*
|
|
414
|
-
size_t
|
|
415
|
-
bool
|
|
416
|
-
std::chrono::steady_clock::time_point last_used;
|
|
433
|
+
void * ptr = nullptr; ///< Pointer to the buffer memory.
|
|
434
|
+
size_t size = 0; ///< Size of the buffer.
|
|
435
|
+
bool used = false; ///< Whether the buffer is currently in use.
|
|
436
|
+
std::chrono::steady_clock::time_point last_used; ///< Last used time.
|
|
417
437
|
};
|
|
418
438
|
|
|
419
439
|
/**
|
|
@@ -432,7 +452,7 @@ struct ggml_cann_pool_buf : public ggml_cann_pool {
|
|
|
432
452
|
* @param device The device ID to associate with this buffer pool.
|
|
433
453
|
*/
|
|
434
454
|
explicit ggml_cann_pool_buf(int device) : device(device) {
|
|
435
|
-
disable_clean = parse_bool(
|
|
455
|
+
disable_clean = parse_bool(get_env_as_lowercase("GGML_CANN_DISABLE_BUF_POOL_CLEAN").value_or(""));
|
|
436
456
|
}
|
|
437
457
|
|
|
438
458
|
/**
|
|
@@ -441,7 +461,7 @@ struct ggml_cann_pool_buf : public ggml_cann_pool {
|
|
|
441
461
|
~ggml_cann_pool_buf() {
|
|
442
462
|
ggml_cann_set_device(device);
|
|
443
463
|
for (int i = 0; i < MAX_BUFFERS; ++i) {
|
|
444
|
-
ggml_cann_buffer& b = buffer_pool[i];
|
|
464
|
+
ggml_cann_buffer & b = buffer_pool[i];
|
|
445
465
|
if (b.ptr != nullptr) {
|
|
446
466
|
aclrtFree(b.ptr);
|
|
447
467
|
pool_size -= b.size;
|
|
@@ -458,18 +478,18 @@ struct ggml_cann_pool_buf : public ggml_cann_pool {
|
|
|
458
478
|
* the allocated buffer.
|
|
459
479
|
* @return A pointer to the allocated buffer.
|
|
460
480
|
*/
|
|
461
|
-
void* alloc(size_t size, size_t* actual_size) override {
|
|
481
|
+
void * alloc(size_t size, size_t * actual_size) override {
|
|
462
482
|
size = GGML_PAD(size, alignment);
|
|
463
483
|
if (size == 0) {
|
|
464
484
|
size = alignment;
|
|
465
485
|
}
|
|
466
486
|
|
|
467
|
-
void* ptr = nullptr;
|
|
468
|
-
auto
|
|
487
|
+
void * ptr = nullptr;
|
|
488
|
+
auto now = std::chrono::steady_clock::now();
|
|
469
489
|
|
|
470
490
|
int i = 0;
|
|
471
491
|
for (; i < MAX_BUFFERS; ++i) {
|
|
472
|
-
ggml_cann_buffer& b = buffer_pool[i];
|
|
492
|
+
ggml_cann_buffer & b = buffer_pool[i];
|
|
473
493
|
if (b.ptr == nullptr) {
|
|
474
494
|
break;
|
|
475
495
|
}
|
|
@@ -481,25 +501,23 @@ struct ggml_cann_pool_buf : public ggml_cann_pool {
|
|
|
481
501
|
const size_t margin = b.size - size;
|
|
482
502
|
if (margin <= max_reuse_margin) {
|
|
483
503
|
*actual_size = b.size;
|
|
484
|
-
b.used
|
|
485
|
-
ptr
|
|
504
|
+
b.used = true;
|
|
505
|
+
ptr = b.ptr;
|
|
486
506
|
#ifdef DEBUG_CANN_MALLOC
|
|
487
507
|
GGML_LOG_INFO(
|
|
488
508
|
"cann pool[%d]: reused %p, "
|
|
489
509
|
"pool_size = %5u MB, "
|
|
490
510
|
"size = %5u MB, "
|
|
491
511
|
"margin = %5u MB\n",
|
|
492
|
-
device, b.ptr,
|
|
493
|
-
(uint32_t)(GGML_PAD(
|
|
494
|
-
(uint32_t)(GGML_PAD(
|
|
495
|
-
(uint32_t)(GGML_PAD(margin, 1048576) / 1048576));
|
|
512
|
+
device, b.ptr, (uint32_t) (GGML_PAD(pool_size, 1048576) / 1048576),
|
|
513
|
+
(uint32_t) (GGML_PAD(size, 1048576) / 1048576),
|
|
514
|
+
(uint32_t) (GGML_PAD(margin, 1048576) / 1048576));
|
|
496
515
|
#endif
|
|
497
516
|
break;
|
|
498
517
|
}
|
|
499
518
|
}
|
|
500
519
|
|
|
501
|
-
bool should_clean = !disable_clean &&
|
|
502
|
-
b.size > min_free_margin &&
|
|
520
|
+
bool should_clean = !disable_clean && b.size > min_free_margin &&
|
|
503
521
|
std::chrono::duration_cast<std::chrono::milliseconds>(now - b.last_used).count() > 100;
|
|
504
522
|
if (should_clean) {
|
|
505
523
|
// free the buffer if the size is needed to be freed
|
|
@@ -510,9 +528,8 @@ struct ggml_cann_pool_buf : public ggml_cann_pool {
|
|
|
510
528
|
"cann pool[%d]: clean %p, "
|
|
511
529
|
"pool_size = %5u MB, "
|
|
512
530
|
"size = %5u MB\n",
|
|
513
|
-
device, b.ptr,
|
|
514
|
-
(uint32_t)(GGML_PAD(
|
|
515
|
-
(uint32_t)(GGML_PAD(b.size, 1048576) / 1048576));
|
|
531
|
+
device, b.ptr, (uint32_t) (GGML_PAD(pool_size, 1048576) / 1048576),
|
|
532
|
+
(uint32_t) (GGML_PAD(b.size, 1048576) / 1048576));
|
|
516
533
|
#endif
|
|
517
534
|
b.ptr = nullptr;
|
|
518
535
|
}
|
|
@@ -523,13 +540,13 @@ struct ggml_cann_pool_buf : public ggml_cann_pool {
|
|
|
523
540
|
|
|
524
541
|
if (i < MAX_BUFFERS) {
|
|
525
542
|
// allocate a new buffer if no buffer can be reused
|
|
526
|
-
ggml_cann_buffer& b = buffer_pool[i];
|
|
543
|
+
ggml_cann_buffer & b = buffer_pool[i];
|
|
527
544
|
ggml_cann_set_device(device);
|
|
528
545
|
ACL_CHECK(aclrtMalloc(&b.ptr, size, ACL_MEM_MALLOC_HUGE_FIRST));
|
|
529
546
|
pool_size += size;
|
|
530
547
|
*actual_size = size;
|
|
531
|
-
b.size
|
|
532
|
-
b.used
|
|
548
|
+
b.size = size;
|
|
549
|
+
b.used = true;
|
|
533
550
|
if (i >= MAX_BUFFERS - 8) {
|
|
534
551
|
GGML_LOG_WARN("cann pool[%d]: slots almost full\n", device);
|
|
535
552
|
}
|
|
@@ -538,9 +555,8 @@ struct ggml_cann_pool_buf : public ggml_cann_pool {
|
|
|
538
555
|
"cann pool[%d]: allocate %p, "
|
|
539
556
|
"pool_size = %5u MB, "
|
|
540
557
|
"size = %5u MB\n",
|
|
541
|
-
device, b.ptr,
|
|
542
|
-
(uint32_t)(GGML_PAD(
|
|
543
|
-
(uint32_t)(GGML_PAD(b.size, 1048576) / 1048576));
|
|
558
|
+
device, b.ptr, (uint32_t) (GGML_PAD(pool_size, 1048576) / 1048576),
|
|
559
|
+
(uint32_t) (GGML_PAD(b.size, 1048576) / 1048576));
|
|
544
560
|
#endif
|
|
545
561
|
return b.ptr;
|
|
546
562
|
}
|
|
@@ -554,21 +570,20 @@ struct ggml_cann_pool_buf : public ggml_cann_pool {
|
|
|
554
570
|
* @param ptr Pointer to the buffer to free.
|
|
555
571
|
* @param size Size of the buffer to free.
|
|
556
572
|
*/
|
|
557
|
-
void free(void* ptr, size_t size) override {
|
|
573
|
+
void free(void * ptr, size_t size) override {
|
|
558
574
|
GGML_UNUSED(size);
|
|
559
575
|
for (int i = 0; i < MAX_BUFFERS; ++i) {
|
|
560
|
-
ggml_cann_buffer& b = buffer_pool[i];
|
|
576
|
+
ggml_cann_buffer & b = buffer_pool[i];
|
|
561
577
|
if (b.ptr != ptr) {
|
|
562
578
|
continue;
|
|
563
579
|
}
|
|
564
|
-
b.used
|
|
580
|
+
b.used = false;
|
|
565
581
|
b.last_used = std::chrono::steady_clock::now();
|
|
566
582
|
#ifdef DEBUG_CANN_MALLOC
|
|
567
583
|
GGML_LOG_INFO(
|
|
568
584
|
"cann pool[%d]: return %p, "
|
|
569
585
|
"pool_size = %5u MB\n",
|
|
570
|
-
device, b.ptr,
|
|
571
|
-
(uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576));
|
|
586
|
+
device, b.ptr, (uint32_t) (GGML_PAD(pool_size, 1048576) / 1048576));
|
|
572
587
|
#endif
|
|
573
588
|
return;
|
|
574
589
|
}
|
|
@@ -596,7 +611,7 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
|
|
|
596
611
|
/**
|
|
597
612
|
* @brief Pointer to the start of the virtual memory pool.
|
|
598
613
|
*/
|
|
599
|
-
void* pool_addr = 0;
|
|
614
|
+
void * pool_addr = 0;
|
|
600
615
|
|
|
601
616
|
/**
|
|
602
617
|
* @brief Amount of virtual memory used in the pool.
|
|
@@ -621,7 +636,7 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
|
|
|
621
636
|
/**
|
|
622
637
|
* @brief Offsets for the mapped memory regions.
|
|
623
638
|
*/
|
|
624
|
-
std::vector<void*> map_offsets;
|
|
639
|
+
std::vector<void *> map_offsets;
|
|
625
640
|
|
|
626
641
|
/**
|
|
627
642
|
* @brief Constructor to initialize the buffer pool with virtual memory for
|
|
@@ -629,11 +644,10 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
|
|
|
629
644
|
*
|
|
630
645
|
* @param device The device ID to associate with this buffer pool.
|
|
631
646
|
*/
|
|
632
|
-
explicit ggml_cann_pool_vmm(int device)
|
|
633
|
-
|
|
634
|
-
auto dev = ggml_cann_info().devices[device];
|
|
647
|
+
explicit ggml_cann_pool_vmm(int device) : device(device) {
|
|
648
|
+
auto dev = ggml_cann_info().devices[device];
|
|
635
649
|
granularity = dev.vmm_granularity;
|
|
636
|
-
max_size
|
|
650
|
+
max_size = dev.total_vram;
|
|
637
651
|
}
|
|
638
652
|
|
|
639
653
|
/**
|
|
@@ -641,10 +655,10 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
|
|
|
641
655
|
*/
|
|
642
656
|
~ggml_cann_pool_vmm() {
|
|
643
657
|
if (pool_addr != 0) {
|
|
644
|
-
for (auto& offset : map_offsets) {
|
|
658
|
+
for (auto & offset : map_offsets) {
|
|
645
659
|
ACL_CHECK(aclrtUnmapMem(offset));
|
|
646
660
|
}
|
|
647
|
-
for (auto& handle : handles) {
|
|
661
|
+
for (auto & handle : handles) {
|
|
648
662
|
ACL_CHECK(aclrtFreePhysical(handle));
|
|
649
663
|
}
|
|
650
664
|
ACL_CHECK(aclrtReleaseMemAddress(pool_addr));
|
|
@@ -659,11 +673,11 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
|
|
|
659
673
|
* the allocated buffer.
|
|
660
674
|
* @return A pointer to the allocated buffer.
|
|
661
675
|
*/
|
|
662
|
-
void* alloc(size_t size, size_t* actual_size) override {
|
|
676
|
+
void * alloc(size_t size, size_t * actual_size) override {
|
|
663
677
|
// round up the allocation size to the alignment to ensure that all
|
|
664
678
|
// allocations are aligned for all data types
|
|
665
679
|
const size_t alignment = 128;
|
|
666
|
-
size
|
|
680
|
+
size = GGML_PAD(size, alignment);
|
|
667
681
|
if (size == 0) {
|
|
668
682
|
size = alignment;
|
|
669
683
|
}
|
|
@@ -673,53 +687,51 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
|
|
|
673
687
|
if (size > avail) {
|
|
674
688
|
// round up to the next multiple of the granularity
|
|
675
689
|
size_t reserve_size = size - avail;
|
|
676
|
-
reserve_size
|
|
690
|
+
reserve_size = GGML_PAD(reserve_size, granularity);
|
|
677
691
|
|
|
678
692
|
GGML_ASSERT(pool_size + reserve_size <= max_size);
|
|
679
693
|
|
|
680
694
|
// allocate more physical memory
|
|
681
695
|
aclrtPhysicalMemProp prop = {};
|
|
682
|
-
prop.handleType
|
|
683
|
-
prop.allocationType
|
|
684
|
-
prop.memAttr
|
|
685
|
-
prop.location.type
|
|
686
|
-
prop.location.id
|
|
687
|
-
prop.reserve
|
|
696
|
+
prop.handleType = ACL_MEM_HANDLE_TYPE_NONE;
|
|
697
|
+
prop.allocationType = ACL_MEM_ALLOCATION_TYPE_PINNED;
|
|
698
|
+
prop.memAttr = ACL_HBM_MEM_HUGE;
|
|
699
|
+
prop.location.type = ACL_MEM_LOCATION_TYPE_DEVICE;
|
|
700
|
+
prop.location.id = device;
|
|
701
|
+
prop.reserve = 0;
|
|
688
702
|
aclrtDrvMemHandle handle;
|
|
689
703
|
ACL_CHECK(aclrtMallocPhysical(&handle, reserve_size, &prop, 0));
|
|
690
704
|
|
|
691
705
|
// reserve virtual address space (if not already reserved)
|
|
692
706
|
if (pool_addr == 0) {
|
|
693
|
-
ACL_CHECK(aclrtReserveMemAddress(
|
|
694
|
-
&pool_addr, max_size, 0, NULL, 1));
|
|
707
|
+
ACL_CHECK(aclrtReserveMemAddress(&pool_addr, max_size, 0, NULL, 1));
|
|
695
708
|
}
|
|
696
709
|
|
|
697
710
|
// map at the end of the pool
|
|
698
|
-
ACL_CHECK(aclrtMapMem((char*)pool_addr + pool_size, reserve_size, 0,
|
|
699
|
-
handle, 0));
|
|
711
|
+
ACL_CHECK(aclrtMapMem((char *) pool_addr + pool_size, reserve_size, 0, handle, 0));
|
|
700
712
|
|
|
701
713
|
handles.push_back(handle);
|
|
702
|
-
map_offsets.push_back((char*)pool_addr + pool_size);
|
|
714
|
+
map_offsets.push_back((char *) pool_addr + pool_size);
|
|
703
715
|
|
|
704
716
|
// add to the pool
|
|
705
717
|
pool_size += reserve_size;
|
|
706
718
|
|
|
707
719
|
#ifdef DEBUG_CANN_MALLOC
|
|
708
|
-
|
|
709
|
-
|
|
710
|
-
|
|
720
|
+
GGML_LOG_INFO("cann pool[%d]: size increased to %llu MB (reserved %llu MB)\n", device,
|
|
721
|
+
(unsigned long long) (pool_size / 1024 / 1024),
|
|
722
|
+
(unsigned long long) (reserve_size / 1024 / 1024));
|
|
711
723
|
#endif
|
|
712
724
|
}
|
|
713
725
|
|
|
714
726
|
GGML_ASSERT(pool_addr != 0);
|
|
715
727
|
|
|
716
|
-
void* ptr
|
|
728
|
+
void * ptr = (void *) ((char *) pool_addr + pool_used);
|
|
717
729
|
*actual_size = size;
|
|
718
730
|
pool_used += size;
|
|
719
731
|
|
|
720
732
|
#ifdef DEBUG_CANN_MALLOC
|
|
721
|
-
GGML_LOG_INFO("cann pool[%d]: allocated %llu bytes at %llx\n", device,
|
|
722
|
-
|
|
733
|
+
GGML_LOG_INFO("cann pool[%d]: allocated %llu bytes at %llx\n", device, (unsigned long long) size,
|
|
734
|
+
(unsigned long long) ptr);
|
|
723
735
|
#endif
|
|
724
736
|
return ptr;
|
|
725
737
|
}
|
|
@@ -730,16 +742,16 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
|
|
|
730
742
|
* @param ptr Pointer to the buffer to free.
|
|
731
743
|
* @param size Size of the buffer to free.
|
|
732
744
|
*/
|
|
733
|
-
void free(void* ptr, size_t size) override {
|
|
745
|
+
void free(void * ptr, size_t size) override {
|
|
734
746
|
#ifdef DEBUG_CANN_MALLOC
|
|
735
|
-
GGML_LOG_INFO("cann pool[%d]: freed %llu bytes at %llx\n", device,
|
|
736
|
-
|
|
747
|
+
GGML_LOG_INFO("cann pool[%d]: freed %llu bytes at %llx\n", device, (unsigned long long) size,
|
|
748
|
+
(unsigned long long) ptr);
|
|
737
749
|
#endif
|
|
738
750
|
|
|
739
751
|
pool_used -= size;
|
|
740
752
|
|
|
741
753
|
// all deallocations must be in reverse order of the allocations
|
|
742
|
-
GGML_ASSERT(ptr == (void*)((char*)pool_addr + pool_used));
|
|
754
|
+
GGML_ASSERT(ptr == (void *) ((char *) pool_addr + pool_used));
|
|
743
755
|
}
|
|
744
756
|
};
|
|
745
757
|
|
|
@@ -751,9 +763,8 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
|
|
|
751
763
|
* @param device The device ID for which to create the pool.
|
|
752
764
|
* @return A unique pointer to the created CANN pool.
|
|
753
765
|
*/
|
|
754
|
-
std::unique_ptr<ggml_cann_pool> ggml_backend_cann_context::new_pool_for_device(
|
|
755
|
-
|
|
756
|
-
std::string mem_pool_type = get_env("GGML_CANN_MEM_POOL").value_or("");
|
|
766
|
+
std::unique_ptr<ggml_cann_pool> ggml_backend_cann_context::new_pool_for_device(int device) {
|
|
767
|
+
std::string mem_pool_type = get_env_as_lowercase("GGML_CANN_MEM_POOL").value_or("");
|
|
757
768
|
|
|
758
769
|
if (mem_pool_type == "prio") {
|
|
759
770
|
GGML_LOG_INFO("%s: device %d use buffer pool with priority queue\n", __func__, device);
|
|
@@ -777,9 +788,8 @@ std::unique_ptr<ggml_cann_pool> ggml_backend_cann_context::new_pool_for_device(
|
|
|
777
788
|
* ID, device pointer, and a name derived from GGML_CANN_NAME and the device ID.
|
|
778
789
|
*/
|
|
779
790
|
struct ggml_backend_cann_buffer_context {
|
|
780
|
-
int32_t device;
|
|
781
|
-
void*
|
|
782
|
-
nullptr; ///< Pointer to the device memory allocated for the buffer.
|
|
791
|
+
int32_t device; ///< The device ID associated with this buffer context.
|
|
792
|
+
void * dev_ptr = nullptr; ///< Pointer to the device memory allocated for the buffer.
|
|
783
793
|
|
|
784
794
|
/**
|
|
785
795
|
* @brief Constructor to initialize the CANN buffer context.
|
|
@@ -787,9 +797,7 @@ struct ggml_backend_cann_buffer_context {
|
|
|
787
797
|
* @param device The device ID associated with this buffer context.
|
|
788
798
|
* @param dev_ptr Pointer to the device memory allocated for the buffer.
|
|
789
799
|
*/
|
|
790
|
-
ggml_backend_cann_buffer_context(int32_t device, void* dev_ptr)
|
|
791
|
-
: device(device),
|
|
792
|
-
dev_ptr(dev_ptr) {}
|
|
800
|
+
ggml_backend_cann_buffer_context(int32_t device, void * dev_ptr) : device(device), dev_ptr(dev_ptr) {}
|
|
793
801
|
|
|
794
802
|
/**
|
|
795
803
|
* @brief Destructor to free the device memory allocated for the buffer.
|
|
@@ -807,8 +815,8 @@ struct ggml_backend_cann_buffer_context {
|
|
|
807
815
|
* @return true if the buffer is a CANN buffer, false otherwise.
|
|
808
816
|
*/
|
|
809
817
|
static bool ggml_backend_buft_is_cann(ggml_backend_buffer_type_t buft);
|
|
810
|
-
|
|
811
|
-
|
|
818
|
+
|
|
819
|
+
static bool ggml_backend_buffer_is_cann(ggml_backend_buffer_t buffer) {
|
|
812
820
|
return ggml_backend_buft_is_cann(buffer->buft);
|
|
813
821
|
}
|
|
814
822
|
|
|
@@ -820,10 +828,8 @@ static bool ggml_backend_buffer_is_cann(
|
|
|
820
828
|
*
|
|
821
829
|
* @param buffer The CANN buffer to free.
|
|
822
830
|
*/
|
|
823
|
-
static void ggml_backend_cann_buffer_free_buffer(
|
|
824
|
-
|
|
825
|
-
ggml_backend_cann_buffer_context* ctx =
|
|
826
|
-
(ggml_backend_cann_buffer_context*)buffer->context;
|
|
831
|
+
static void ggml_backend_cann_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
|
832
|
+
ggml_backend_cann_buffer_context * ctx = (ggml_backend_cann_buffer_context *) buffer->context;
|
|
827
833
|
delete ctx;
|
|
828
834
|
}
|
|
829
835
|
|
|
@@ -836,10 +842,8 @@ static void ggml_backend_cann_buffer_free_buffer(
|
|
|
836
842
|
* @param buffer The CANN buffer whose base pointer is to be retrieved.
|
|
837
843
|
* @return A pointer to the base of the device memory allocated for the buffer.
|
|
838
844
|
*/
|
|
839
|
-
static void* ggml_backend_cann_buffer_get_base(
|
|
840
|
-
|
|
841
|
-
ggml_backend_cann_buffer_context* ctx =
|
|
842
|
-
(ggml_backend_cann_buffer_context*)buffer->context;
|
|
845
|
+
static void * ggml_backend_cann_buffer_get_base(ggml_backend_buffer_t buffer) {
|
|
846
|
+
ggml_backend_cann_buffer_context * ctx = (ggml_backend_cann_buffer_context *) buffer->context;
|
|
843
847
|
return ctx->dev_ptr;
|
|
844
848
|
}
|
|
845
849
|
|
|
@@ -856,21 +860,17 @@ static void* ggml_backend_cann_buffer_get_base(
|
|
|
856
860
|
* @param dst Pointer to the destination buffer where transformed data will be
|
|
857
861
|
* stored.
|
|
858
862
|
*/
|
|
859
|
-
static void ggml_backend_cann_transform_q4_0(ggml_tensor* tensor,
|
|
860
|
-
|
|
861
|
-
|
|
862
|
-
|
|
863
|
-
int64_t n_elems = ggml_nelements(tensor);
|
|
864
|
-
int64_t groups = n_elems / QK4_0;
|
|
865
|
-
size_t quant_bytes = n_elems * sizeof(uint8_t) / 2;
|
|
863
|
+
static void ggml_backend_cann_transform_q4_0(ggml_tensor * tensor, const void * src, void * dst) {
|
|
864
|
+
int64_t n_elems = ggml_nelements(tensor);
|
|
865
|
+
int64_t groups = n_elems / QK4_0;
|
|
866
|
+
size_t quant_bytes = n_elems * sizeof(uint8_t) / 2;
|
|
866
867
|
|
|
867
|
-
uint8_t*
|
|
868
|
-
uint16_t* scale_offset = (uint16_t*)((char*)dst + quant_bytes);
|
|
868
|
+
uint8_t * quant_offset = (uint8_t *) dst;
|
|
869
|
+
uint16_t * scale_offset = (uint16_t *) ((char *) dst + quant_bytes);
|
|
869
870
|
|
|
870
871
|
for (int i = 0; i < groups; i++) {
|
|
871
|
-
const block_q4_0* group =
|
|
872
|
-
|
|
873
|
-
*scale_offset = group->d;
|
|
872
|
+
const block_q4_0 * group = (const block_q4_0 *) ((const char *) src + i * sizeof(block_q4_0));
|
|
873
|
+
*scale_offset = group->d;
|
|
874
874
|
scale_offset++;
|
|
875
875
|
|
|
876
876
|
// 0-15
|
|
@@ -889,8 +889,7 @@ static void ggml_backend_cann_transform_q4_0(ggml_tensor* tensor,
|
|
|
889
889
|
}
|
|
890
890
|
|
|
891
891
|
// put (uint4b_t -8) into int4b_t
|
|
892
|
-
for (quant_offset = (uint8_t*)dst;
|
|
893
|
-
quant_offset < (uint8_t*)dst + quant_bytes; quant_offset++) {
|
|
892
|
+
for (quant_offset = (uint8_t *) dst; quant_offset < (uint8_t *) dst + quant_bytes; quant_offset++) {
|
|
894
893
|
(*quant_offset) ^= 0x88;
|
|
895
894
|
}
|
|
896
895
|
}
|
|
@@ -908,29 +907,27 @@ static void ggml_backend_cann_transform_q4_0(ggml_tensor* tensor,
|
|
|
908
907
|
* @param dst Pointer to the destination buffer where the Q4.0 formatted data
|
|
909
908
|
* will be stored.
|
|
910
909
|
*/
|
|
911
|
-
static void ggml_backend_cann_transform_back_q4_0(
|
|
912
|
-
|
|
910
|
+
static void ggml_backend_cann_transform_back_q4_0(const ggml_tensor * tensor, void * src, void * dst) {
|
|
911
|
+
int64_t n_elems = ggml_nelements(tensor);
|
|
912
|
+
int64_t groups = n_elems / QK4_0;
|
|
913
|
+
size_t quant_bytes = n_elems * sizeof(uint8_t) / 2;
|
|
913
914
|
|
|
914
|
-
|
|
915
|
-
|
|
916
|
-
size_t quant_bytes = n_elems * sizeof(uint8_t) / 2;
|
|
915
|
+
uint8_t * quant_offset = (uint8_t *) src;
|
|
916
|
+
uint16_t * scale_offset = (uint16_t *) ((char *) src + quant_bytes);
|
|
917
917
|
|
|
918
|
-
|
|
919
|
-
uint16_t* scale_offset = (uint16_t*)((char*)src + quant_bytes);
|
|
920
|
-
|
|
921
|
-
for (; quant_offset < (uint8_t*)src + quant_bytes; quant_offset++) {
|
|
918
|
+
for (; quant_offset < (uint8_t *) src + quant_bytes; quant_offset++) {
|
|
922
919
|
(*quant_offset) ^= 0x88;
|
|
923
920
|
}
|
|
924
|
-
quant_offset = (uint8_t*)src;
|
|
921
|
+
quant_offset = (uint8_t *) src;
|
|
925
922
|
|
|
926
923
|
for (int i = 0; i < groups; i++) {
|
|
927
|
-
block_q4_0* group = (block_q4_0*)((char*)dst + i * sizeof(block_q4_0));
|
|
928
|
-
group->d
|
|
924
|
+
block_q4_0 * group = (block_q4_0 *) ((char *) dst + i * sizeof(block_q4_0));
|
|
925
|
+
group->d = *scale_offset;
|
|
929
926
|
scale_offset++;
|
|
930
927
|
|
|
931
928
|
// 0-15
|
|
932
929
|
for (int j = 0; j < QK4_0 / 2; j += 2) {
|
|
933
|
-
group->qs[j]
|
|
930
|
+
group->qs[j] = ((*quant_offset) & 0x0F);
|
|
934
931
|
group->qs[j + 1] = ((*quant_offset) >> 4);
|
|
935
932
|
quant_offset++;
|
|
936
933
|
}
|
|
@@ -957,20 +954,17 @@ static void ggml_backend_cann_transform_back_q4_0(
|
|
|
957
954
|
* @param dst Pointer to the destination buffer where transformed data will be
|
|
958
955
|
* stored.
|
|
959
956
|
*/
|
|
960
|
-
static void ggml_backend_cann_transform_q8_0(ggml_tensor* tensor,
|
|
961
|
-
|
|
962
|
-
|
|
963
|
-
|
|
964
|
-
int64_t groups = n_elems / QK8_0;
|
|
965
|
-
size_t quant_bytes = n_elems * sizeof(uint8_t);
|
|
957
|
+
static void ggml_backend_cann_transform_q8_0(ggml_tensor * tensor, const void * src, void * dst) {
|
|
958
|
+
int64_t n_elems = ggml_nelements(tensor);
|
|
959
|
+
int64_t groups = n_elems / QK8_0;
|
|
960
|
+
size_t quant_bytes = n_elems * sizeof(uint8_t);
|
|
966
961
|
|
|
967
|
-
uint8_t*
|
|
968
|
-
uint16_t* scale_offset = (uint16_t*)((char*)dst + quant_bytes);
|
|
962
|
+
uint8_t * quant_offset = (uint8_t *) dst;
|
|
963
|
+
uint16_t * scale_offset = (uint16_t *) ((char *) dst + quant_bytes);
|
|
969
964
|
|
|
970
965
|
for (int i = 0; i < groups; i++) {
|
|
971
|
-
const block_q8_0* group =
|
|
972
|
-
|
|
973
|
-
*scale_offset = group->d;
|
|
966
|
+
const block_q8_0 * group = (const block_q8_0 *) ((const char *) src + i * sizeof(block_q8_0));
|
|
967
|
+
*scale_offset = group->d;
|
|
974
968
|
scale_offset++;
|
|
975
969
|
size_t group_quant_size = QK8_0 * sizeof(uint8_t);
|
|
976
970
|
memcpy(quant_offset, group->qs, group_quant_size);
|
|
@@ -991,19 +985,17 @@ static void ggml_backend_cann_transform_q8_0(ggml_tensor* tensor,
|
|
|
991
985
|
* @param dst Pointer to the destination buffer where the Q8.0 formatted data
|
|
992
986
|
* will be stored.
|
|
993
987
|
*/
|
|
994
|
-
static void ggml_backend_cann_transform_back_q8_0(
|
|
995
|
-
|
|
996
|
-
int64_t n_elems
|
|
997
|
-
|
|
998
|
-
size_t quant_bytes = n_elems * sizeof(uint8_t);
|
|
988
|
+
static void ggml_backend_cann_transform_back_q8_0(const ggml_tensor * tensor, const void * src, void * dst) {
|
|
989
|
+
int64_t n_elems = ggml_nelements(tensor);
|
|
990
|
+
int64_t groups = n_elems / QK8_0;
|
|
991
|
+
size_t quant_bytes = n_elems * sizeof(uint8_t);
|
|
999
992
|
|
|
1000
|
-
const uint8_t*
|
|
1001
|
-
const uint16_t* scale_offset =
|
|
1002
|
-
(const uint16_t*)((const char*)src + quant_bytes);
|
|
993
|
+
const uint8_t * quant_offset = (const uint8_t *) src;
|
|
994
|
+
const uint16_t * scale_offset = (const uint16_t *) ((const char *) src + quant_bytes);
|
|
1003
995
|
|
|
1004
996
|
for (int i = 0; i < groups; i++) {
|
|
1005
|
-
block_q8_0* group = (block_q8_0*)((char*)dst + i * sizeof(block_q8_0));
|
|
1006
|
-
group->d
|
|
997
|
+
block_q8_0 * group = (block_q8_0 *) ((char *) dst + i * sizeof(block_q8_0));
|
|
998
|
+
group->d = *scale_offset;
|
|
1007
999
|
scale_offset++;
|
|
1008
1000
|
size_t group_quant_size = QK8_0 * sizeof(uint8_t);
|
|
1009
1001
|
memcpy(group->qs, quant_offset, group_quant_size);
|
|
@@ -1023,8 +1015,7 @@ static void ggml_backend_cann_transform_back_q8_0(
|
|
|
1023
1015
|
* @param dst Pointer to the destination buffer where transformed data will be
|
|
1024
1016
|
* stored.
|
|
1025
1017
|
*/
|
|
1026
|
-
static void ggml_backend_cann_transform(ggml_tensor* tensor,
|
|
1027
|
-
const void* src, void* dst) {
|
|
1018
|
+
static void ggml_backend_cann_transform(ggml_tensor * tensor, const void * src, void * dst) {
|
|
1028
1019
|
switch (tensor->type) {
|
|
1029
1020
|
case GGML_TYPE_Q4_0:
|
|
1030
1021
|
ggml_backend_cann_transform_q4_0(tensor, src, dst);
|
|
@@ -1049,8 +1040,7 @@ static void ggml_backend_cann_transform(ggml_tensor* tensor,
|
|
|
1049
1040
|
* @param dst Pointer to the destination buffer where transformed tensor data
|
|
1050
1041
|
* will be stored.
|
|
1051
1042
|
*/
|
|
1052
|
-
static void ggml_backend_cann_transform_back(
|
|
1053
|
-
const ggml_tensor* tensor, void* src, void* dst) {
|
|
1043
|
+
static void ggml_backend_cann_transform_back(const ggml_tensor * tensor, void * src, void * dst) {
|
|
1054
1044
|
switch (tensor->type) {
|
|
1055
1045
|
case GGML_TYPE_Q4_0:
|
|
1056
1046
|
ggml_backend_cann_transform_back_q4_0(tensor, src, dst);
|
|
@@ -1091,8 +1081,7 @@ static bool need_transform(ggml_type type) {
|
|
|
1091
1081
|
* @param buffer The CANN buffer from which to initialize the tensor.
|
|
1092
1082
|
* @param tensor Pointer to the tensor to be initialized.
|
|
1093
1083
|
*/
|
|
1094
|
-
static enum ggml_status ggml_backend_cann_buffer_init_tensor(
|
|
1095
|
-
ggml_backend_buffer_t buffer, ggml_tensor* tensor) {
|
|
1084
|
+
static enum ggml_status ggml_backend_cann_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
|
|
1096
1085
|
if (tensor->view_src != NULL && tensor->view_offs == 0) {
|
|
1097
1086
|
GGML_ASSERT(tensor->view_src->buffer->buft == buffer->buft);
|
|
1098
1087
|
return GGML_STATUS_SUCCESS;
|
|
@@ -1103,18 +1092,105 @@ static enum ggml_status ggml_backend_cann_buffer_init_tensor(
|
|
|
1103
1092
|
if (ggml_is_quantized(tensor->type)) {
|
|
1104
1093
|
// Initialize padding to 0 to avoid possible NaN values
|
|
1105
1094
|
size_t original_size = ggml_nbytes(tensor);
|
|
1106
|
-
size_t padded_size
|
|
1107
|
-
ggml_backend_buft_get_alloc_size(buffer->buft, tensor);
|
|
1095
|
+
size_t padded_size = ggml_backend_buft_get_alloc_size(buffer->buft, tensor);
|
|
1108
1096
|
|
|
1109
1097
|
if (padded_size > original_size && tensor->view_src == nullptr) {
|
|
1110
1098
|
size_t memset_size = padded_size - original_size;
|
|
1111
|
-
ACL_CHECK(aclrtMemset((char*)tensor->data + original_size,
|
|
1112
|
-
memset_size, 0, memset_size));
|
|
1099
|
+
ACL_CHECK(aclrtMemset((char *) tensor->data + original_size, memset_size, 0, memset_size));
|
|
1113
1100
|
}
|
|
1114
1101
|
}
|
|
1115
1102
|
return GGML_STATUS_SUCCESS;
|
|
1116
1103
|
}
|
|
1117
1104
|
|
|
1105
|
+
/**
|
|
1106
|
+
* @brief Workspace for caching NZ buffers per device.
|
|
1107
|
+
*
|
|
1108
|
+
* This struct manages a device buffer used in NZ computations. It supports
|
|
1109
|
+
* allocation, reallocation, and clearing of cached memory. The struct is
|
|
1110
|
+
* designed to be used with a global array, one per device.
|
|
1111
|
+
*/
|
|
1112
|
+
struct ggml_cann_nz_workspace {
|
|
1113
|
+
void * ptr; // Pointer to allocated device buffer
|
|
1114
|
+
size_t allocated; // Size of currently allocated buffer in bytes
|
|
1115
|
+
|
|
1116
|
+
/**
|
|
1117
|
+
* @brief Constructor. Initializes the workspace with no allocated memory.
|
|
1118
|
+
*/
|
|
1119
|
+
ggml_cann_nz_workspace() : ptr(nullptr), allocated(0) {}
|
|
1120
|
+
|
|
1121
|
+
/**
|
|
1122
|
+
* @brief Free cached memory and reset the workspace.
|
|
1123
|
+
*
|
|
1124
|
+
* If a buffer has been allocated, this function releases it using
|
|
1125
|
+
* aclrtFree and resets internal state.
|
|
1126
|
+
*/
|
|
1127
|
+
void clear() {
|
|
1128
|
+
if (ptr) {
|
|
1129
|
+
ACL_CHECK(aclrtFree(ptr));
|
|
1130
|
+
ptr = nullptr;
|
|
1131
|
+
allocated = 0;
|
|
1132
|
+
}
|
|
1133
|
+
}
|
|
1134
|
+
|
|
1135
|
+
/**
|
|
1136
|
+
* @brief Allocate or reallocate the workspace buffer.
|
|
1137
|
+
*
|
|
1138
|
+
* If the requested size is larger than the currently allocated size,
|
|
1139
|
+
* the old buffer will be freed and a new buffer of the requested size
|
|
1140
|
+
* will be allocated on the device.
|
|
1141
|
+
*
|
|
1142
|
+
* @param new_size Size in bytes to allocate for the workspace.
|
|
1143
|
+
*/
|
|
1144
|
+
void realloc(size_t new_size) {
|
|
1145
|
+
if (new_size > allocated) {
|
|
1146
|
+
clear();
|
|
1147
|
+
ACL_CHECK(aclrtMalloc(&ptr, new_size, ACL_MEM_MALLOC_HUGE_FIRST));
|
|
1148
|
+
allocated = new_size;
|
|
1149
|
+
}
|
|
1150
|
+
}
|
|
1151
|
+
|
|
1152
|
+
/**
|
|
1153
|
+
* @brief Get the device buffer pointer.
|
|
1154
|
+
*
|
|
1155
|
+
* @return Pointer to the allocated buffer, or nullptr if not allocated.
|
|
1156
|
+
*/
|
|
1157
|
+
void * get() const { return ptr; }
|
|
1158
|
+
};
|
|
1159
|
+
|
|
1160
|
+
/**
|
|
1161
|
+
* @brief Global array of NZ workspaces, one per device.
|
|
1162
|
+
*/
|
|
1163
|
+
static ggml_cann_nz_workspace g_nz_workspaces[GGML_CANN_MAX_DEVICES];
|
|
1164
|
+
|
|
1165
|
+
/**
|
|
1166
|
+
* @brief Convert tensor weights to NZ format using Ascend CANN API.
|
|
1167
|
+
*
|
|
1168
|
+
* This function creates a transposed tensor descriptor and performs the
|
|
1169
|
+
* TransMatmulWeight operation. Converting tensor formats can significantly
|
|
1170
|
+
* improve performance on certain hardware.
|
|
1171
|
+
*
|
|
1172
|
+
* @param tensor Pointer to the input ggml_tensor containing the weights.
|
|
1173
|
+
* @param offset Byte offset within the tensor data buffer where weights start.
|
|
1174
|
+
* @param device device id.
|
|
1175
|
+
*
|
|
1176
|
+
* @note The workspace buffer used in this function is managed globally and reused
|
|
1177
|
+
* across calls. This reduces overhead from repeated memory allocation and deallocation.
|
|
1178
|
+
*/
|
|
1179
|
+
static void weight_format_to_nz(ggml_tensor * tensor, size_t offset, int device) {
|
|
1180
|
+
acl_tensor_ptr weightTransposed = ggml_cann_create_tensor(tensor, tensor->ne, tensor->nb, 2, ACL_FORMAT_ND, offset);
|
|
1181
|
+
uint64_t workspaceSize = 0;
|
|
1182
|
+
aclOpExecutor * executor;
|
|
1183
|
+
|
|
1184
|
+
// TransMatmulWeight
|
|
1185
|
+
ACL_CHECK(aclnnTransMatmulWeightGetWorkspaceSize(weightTransposed.get(), &workspaceSize, &executor));
|
|
1186
|
+
// Avoid frequent malloc/free of the workspace.
|
|
1187
|
+
g_nz_workspaces[device].realloc(workspaceSize);
|
|
1188
|
+
|
|
1189
|
+
void * g_nz_workspace = g_nz_workspaces[device].get();
|
|
1190
|
+
|
|
1191
|
+
ACL_CHECK(aclnnTransMatmulWeight(g_nz_workspace, workspaceSize, executor, nullptr));
|
|
1192
|
+
}
|
|
1193
|
+
|
|
1118
1194
|
// TODO: need handle tensor which has paddings.
|
|
1119
1195
|
/**
|
|
1120
1196
|
* @brief Set tensor data in a CANN buffer.
|
|
@@ -1128,27 +1204,32 @@ static enum ggml_status ggml_backend_cann_buffer_init_tensor(
|
|
|
1128
1204
|
* @param offset Offset in the source data from where to start copying.
|
|
1129
1205
|
* @param size Size of the data to be copied, in bytes.
|
|
1130
1206
|
*/
|
|
1131
|
-
static void ggml_backend_cann_buffer_set_tensor(
|
|
1132
|
-
|
|
1133
|
-
|
|
1134
|
-
|
|
1135
|
-
|
|
1207
|
+
static void ggml_backend_cann_buffer_set_tensor(ggml_backend_buffer_t buffer,
|
|
1208
|
+
ggml_tensor * tensor,
|
|
1209
|
+
const void * data,
|
|
1210
|
+
size_t offset,
|
|
1211
|
+
size_t size) {
|
|
1212
|
+
ggml_backend_cann_buffer_context * ctx = (ggml_backend_cann_buffer_context *) buffer->context;
|
|
1136
1213
|
|
|
1137
1214
|
ggml_cann_set_device(ctx->device);
|
|
1138
1215
|
// TODO: refer to cann(#6017), it use thread's default stream.
|
|
1139
1216
|
// For acl, synchronous functions use this default stream.
|
|
1140
1217
|
// Why aclrtSynchronizeDevice?
|
|
1141
1218
|
|
|
1219
|
+
// Only check env once.
|
|
1220
|
+
static bool weight_to_nz = parse_bool(get_env_as_lowercase("GGML_CANN_WEIGHT_NZ").value_or("on"));
|
|
1142
1221
|
if (!need_transform(tensor->type)) {
|
|
1143
|
-
ACL_CHECK(aclrtMemcpy((char *)tensor->data + offset, size, data, size,
|
|
1144
|
-
|
|
1222
|
+
ACL_CHECK(aclrtMemcpy((char *) tensor->data + offset, size, data, size, ACL_MEMCPY_HOST_TO_DEVICE));
|
|
1223
|
+
if (weight_to_nz && is_matmul_weight((const ggml_tensor *) tensor)) {
|
|
1224
|
+
GGML_ASSERT(tensor->ne[2] == 1);
|
|
1225
|
+
GGML_ASSERT(tensor->ne[3] == 1);
|
|
1226
|
+
weight_format_to_nz(tensor, offset, ctx->device);
|
|
1227
|
+
}
|
|
1145
1228
|
} else {
|
|
1146
|
-
void *transform_buffer = malloc(size);
|
|
1229
|
+
void * transform_buffer = malloc(size);
|
|
1147
1230
|
ggml_backend_cann_transform(tensor, data, transform_buffer);
|
|
1148
1231
|
|
|
1149
|
-
ACL_CHECK(aclrtMemcpy((char *)tensor->data + offset, size,
|
|
1150
|
-
transform_buffer, size,
|
|
1151
|
-
ACL_MEMCPY_HOST_TO_DEVICE));
|
|
1232
|
+
ACL_CHECK(aclrtMemcpy((char *) tensor->data + offset, size, transform_buffer, size, ACL_MEMCPY_HOST_TO_DEVICE));
|
|
1152
1233
|
free(transform_buffer);
|
|
1153
1234
|
}
|
|
1154
1235
|
}
|
|
@@ -1166,22 +1247,20 @@ static void ggml_backend_cann_buffer_set_tensor(
|
|
|
1166
1247
|
* @param offset Offset in the destination buffer where to start copying.
|
|
1167
1248
|
* @param size Size of the data to be copied, in bytes.
|
|
1168
1249
|
*/
|
|
1169
|
-
static void ggml_backend_cann_buffer_get_tensor(
|
|
1170
|
-
|
|
1171
|
-
|
|
1172
|
-
|
|
1173
|
-
|
|
1250
|
+
static void ggml_backend_cann_buffer_get_tensor(ggml_backend_buffer_t buffer,
|
|
1251
|
+
const ggml_tensor * tensor,
|
|
1252
|
+
void * data,
|
|
1253
|
+
size_t offset,
|
|
1254
|
+
size_t size) {
|
|
1255
|
+
ggml_backend_cann_buffer_context * ctx = (ggml_backend_cann_buffer_context *) buffer->context;
|
|
1174
1256
|
|
|
1175
1257
|
ggml_cann_set_device(ctx->device);
|
|
1176
1258
|
|
|
1177
1259
|
if (!need_transform(tensor->type)) {
|
|
1178
|
-
ACL_CHECK(aclrtMemcpy(data, size, (char*)tensor->data + offset, size,
|
|
1179
|
-
ACL_MEMCPY_DEVICE_TO_HOST));
|
|
1260
|
+
ACL_CHECK(aclrtMemcpy(data, size, (char *) tensor->data + offset, size, ACL_MEMCPY_DEVICE_TO_HOST));
|
|
1180
1261
|
} else {
|
|
1181
|
-
void* transform_buffer = malloc(size);
|
|
1182
|
-
ACL_CHECK(aclrtMemcpy(transform_buffer, size,
|
|
1183
|
-
(char*)tensor->data + offset, size,
|
|
1184
|
-
ACL_MEMCPY_DEVICE_TO_HOST));
|
|
1262
|
+
void * transform_buffer = malloc(size);
|
|
1263
|
+
ACL_CHECK(aclrtMemcpy(transform_buffer, size, (char *) tensor->data + offset, size, ACL_MEMCPY_DEVICE_TO_HOST));
|
|
1185
1264
|
ggml_backend_cann_transform_back(tensor, transform_buffer, data);
|
|
1186
1265
|
free(transform_buffer);
|
|
1187
1266
|
}
|
|
@@ -1200,31 +1279,31 @@ static void ggml_backend_cann_buffer_get_tensor(
|
|
|
1200
1279
|
* @param dst Pointer to the destination tensor where the data will be copied.
|
|
1201
1280
|
* @return true if the copy operation succeeded, false otherwise.
|
|
1202
1281
|
*/
|
|
1203
|
-
static bool ggml_backend_cann_buffer_cpy_tensor(
|
|
1204
|
-
|
|
1282
|
+
static bool ggml_backend_cann_buffer_cpy_tensor(ggml_backend_buffer_t buffer,
|
|
1283
|
+
const ggml_tensor * src,
|
|
1284
|
+
ggml_tensor * dst) {
|
|
1205
1285
|
if (ggml_backend_buffer_is_cann(src->buffer)) {
|
|
1206
|
-
ggml_backend_cann_buffer_context* src_ctx =
|
|
1207
|
-
|
|
1208
|
-
ggml_backend_cann_buffer_context* dst_ctx =
|
|
1209
|
-
(ggml_backend_cann_buffer_context*)buffer->context;
|
|
1286
|
+
ggml_backend_cann_buffer_context * src_ctx = (ggml_backend_cann_buffer_context *) src->buffer->context;
|
|
1287
|
+
ggml_backend_cann_buffer_context * dst_ctx = (ggml_backend_cann_buffer_context *) buffer->context;
|
|
1210
1288
|
|
|
1211
1289
|
size_t memcpy_size = ggml_nbytes(src);
|
|
1212
1290
|
// Same device.
|
|
1213
1291
|
if (src_ctx->device == dst_ctx->device) {
|
|
1214
|
-
ACL_CHECK(aclrtMemcpy((char*)dst->data, memcpy_size,
|
|
1215
|
-
(const char*)src->data, memcpy_size,
|
|
1292
|
+
ACL_CHECK(aclrtMemcpy((char *) dst->data, memcpy_size, (const char *) src->data, memcpy_size,
|
|
1216
1293
|
ACL_MEMCPY_DEVICE_TO_DEVICE));
|
|
1217
1294
|
return true;
|
|
1218
1295
|
} else {
|
|
1296
|
+
#ifdef ASCEND_310P
|
|
1297
|
+
// TODO: Support 310p P2P copy
|
|
1298
|
+
return false;
|
|
1299
|
+
#endif
|
|
1219
1300
|
// Different device but can access by peer.
|
|
1220
1301
|
int32_t canAccessPeer = 0;
|
|
1221
|
-
ACL_CHECK(aclrtDeviceCanAccessPeer(&canAccessPeer, src_ctx->device,
|
|
1222
|
-
dst_ctx->device));
|
|
1302
|
+
ACL_CHECK(aclrtDeviceCanAccessPeer(&canAccessPeer, src_ctx->device, dst_ctx->device));
|
|
1223
1303
|
if (canAccessPeer) {
|
|
1224
1304
|
ggml_cann_set_device(src_ctx->device);
|
|
1225
1305
|
ACL_CHECK(aclrtDeviceEnablePeerAccess(dst_ctx->device, 0));
|
|
1226
|
-
ACL_CHECK(aclrtMemcpy((char*)dst->data, memcpy_size,
|
|
1227
|
-
(const char*)src->data, memcpy_size,
|
|
1306
|
+
ACL_CHECK(aclrtMemcpy((char *) dst->data, memcpy_size, (const char *) src->data, memcpy_size,
|
|
1228
1307
|
ACL_MEMCPY_DEVICE_TO_DEVICE));
|
|
1229
1308
|
return true;
|
|
1230
1309
|
}
|
|
@@ -1242,10 +1321,8 @@ static bool ggml_backend_cann_buffer_cpy_tensor(
|
|
|
1242
1321
|
* @param buffer The CANN buffer to be cleared.
|
|
1243
1322
|
* @param value The value to which each byte in the buffer will be set.
|
|
1244
1323
|
*/
|
|
1245
|
-
static void ggml_backend_cann_buffer_clear(
|
|
1246
|
-
|
|
1247
|
-
ggml_backend_cann_buffer_context* ctx =
|
|
1248
|
-
(ggml_backend_cann_buffer_context*)buffer->context;
|
|
1324
|
+
static void ggml_backend_cann_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
|
|
1325
|
+
ggml_backend_cann_buffer_context * ctx = (ggml_backend_cann_buffer_context *) buffer->context;
|
|
1249
1326
|
|
|
1250
1327
|
ggml_cann_set_device(ctx->device);
|
|
1251
1328
|
ACL_CHECK(aclrtMemset(ctx->dev_ptr, buffer->size, value, buffer->size));
|
|
@@ -1275,9 +1352,8 @@ static const ggml_backend_buffer_i ggml_backend_cann_buffer_interface = {
|
|
|
1275
1352
|
* buffer type.
|
|
1276
1353
|
*/
|
|
1277
1354
|
struct ggml_backend_cann_buffer_type_context {
|
|
1278
|
-
int32_t
|
|
1279
|
-
|
|
1280
|
-
std::string name; /**< Name associated with the buffer context. */
|
|
1355
|
+
int32_t device; /**< Device identifier associated with the buffer context. */
|
|
1356
|
+
std::string name; /**< Name associated with the buffer context. */
|
|
1281
1357
|
};
|
|
1282
1358
|
|
|
1283
1359
|
/**
|
|
@@ -1289,10 +1365,8 @@ struct ggml_backend_cann_buffer_type_context {
|
|
|
1289
1365
|
* @param buft Pointer to the buffer type context.
|
|
1290
1366
|
* @return Const pointer to the C-style string containing the name.
|
|
1291
1367
|
*/
|
|
1292
|
-
static const char* ggml_backend_cann_buffer_type_name(
|
|
1293
|
-
|
|
1294
|
-
ggml_backend_cann_buffer_type_context* buft_ctx =
|
|
1295
|
-
(ggml_backend_cann_buffer_type_context*)buft->context;
|
|
1368
|
+
static const char * ggml_backend_cann_buffer_type_name(ggml_backend_buffer_type_t buft) {
|
|
1369
|
+
ggml_backend_cann_buffer_type_context * buft_ctx = (ggml_backend_cann_buffer_type_context *) buft->context;
|
|
1296
1370
|
|
|
1297
1371
|
return buft_ctx->name.c_str();
|
|
1298
1372
|
}
|
|
@@ -1307,34 +1381,27 @@ static const char* ggml_backend_cann_buffer_type_name(
|
|
|
1307
1381
|
* @param size Size in bytes of the buffer to allocate.
|
|
1308
1382
|
* @return Pointer to the allocated buffer, or nullptr if allocation fails.
|
|
1309
1383
|
*/
|
|
1310
|
-
static ggml_backend_buffer_t
|
|
1311
|
-
|
|
1312
|
-
size_t size) {
|
|
1313
|
-
ggml_backend_cann_buffer_type_context* buft_ctx =
|
|
1314
|
-
(ggml_backend_cann_buffer_type_context*)buft->context;
|
|
1384
|
+
static ggml_backend_buffer_t ggml_backend_cann_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
|
1385
|
+
ggml_backend_cann_buffer_type_context * buft_ctx = (ggml_backend_cann_buffer_type_context *) buft->context;
|
|
1315
1386
|
|
|
1316
1387
|
ggml_cann_set_device(buft_ctx->device);
|
|
1317
1388
|
|
|
1318
1389
|
const size_t alignment = 128;
|
|
1319
|
-
size
|
|
1390
|
+
size = GGML_PAD(size, alignment);
|
|
1320
1391
|
if (size == 0) {
|
|
1321
1392
|
size = alignment;
|
|
1322
1393
|
}
|
|
1323
|
-
void*
|
|
1394
|
+
void * dev_ptr;
|
|
1324
1395
|
aclError err = aclrtMalloc(&dev_ptr, size, ACL_MEM_MALLOC_HUGE_FIRST);
|
|
1325
1396
|
if (err != ACL_SUCCESS) {
|
|
1326
|
-
GGML_LOG_ERROR(
|
|
1327
|
-
|
|
1328
|
-
__func__, size / 1024.0 / 1024.0, buft_ctx->device,
|
|
1329
|
-
aclGetRecentErrMsg());
|
|
1397
|
+
GGML_LOG_ERROR("%s: allocating %.2f MiB on device %d: aclrtMalloc failed: %s\n", __func__,
|
|
1398
|
+
size / 1024.0 / 1024.0, buft_ctx->device, aclGetRecentErrMsg());
|
|
1330
1399
|
return nullptr;
|
|
1331
1400
|
}
|
|
1332
1401
|
|
|
1333
|
-
ggml_backend_cann_buffer_context* ctx =
|
|
1334
|
-
new ggml_backend_cann_buffer_context(buft_ctx->device, dev_ptr);
|
|
1402
|
+
ggml_backend_cann_buffer_context * ctx = new ggml_backend_cann_buffer_context(buft_ctx->device, dev_ptr);
|
|
1335
1403
|
|
|
1336
|
-
return ggml_backend_buffer_init(buft, ggml_backend_cann_buffer_interface,
|
|
1337
|
-
ctx, size);
|
|
1404
|
+
return ggml_backend_buffer_init(buft, ggml_backend_cann_buffer_interface, ctx, size);
|
|
1338
1405
|
}
|
|
1339
1406
|
|
|
1340
1407
|
/**
|
|
@@ -1349,8 +1416,7 @@ ggml_backend_cann_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
|
|
|
1349
1416
|
* @return The alignment requirement in bytes (fixed at 128 bytes for CANN
|
|
1350
1417
|
* buffers).
|
|
1351
1418
|
*/
|
|
1352
|
-
static size_t ggml_backend_cann_buffer_type_get_alignment(
|
|
1353
|
-
ggml_backend_buffer_type_t buft) {
|
|
1419
|
+
static size_t ggml_backend_cann_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
|
|
1354
1420
|
return 128;
|
|
1355
1421
|
|
|
1356
1422
|
GGML_UNUSED(buft);
|
|
@@ -1370,10 +1436,13 @@ static size_t ggml_backend_cann_buffer_type_get_alignment(
|
|
|
1370
1436
|
* @return The total allocation size in bytes required for the tensor in the
|
|
1371
1437
|
* CANN buffer.
|
|
1372
1438
|
*/
|
|
1373
|
-
static size_t ggml_backend_cann_buffer_type_get_alloc_size(
|
|
1374
|
-
|
|
1375
|
-
size_t
|
|
1376
|
-
int64_t ne0
|
|
1439
|
+
static size_t ggml_backend_cann_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft,
|
|
1440
|
+
const ggml_tensor * tensor) {
|
|
1441
|
+
size_t size = ggml_nbytes(tensor);
|
|
1442
|
+
int64_t ne0 = tensor->ne[0];
|
|
1443
|
+
|
|
1444
|
+
// Only check env once.
|
|
1445
|
+
static bool weight_to_nz = parse_bool(get_env_as_lowercase("GGML_CANN_WEIGHT_NZ").value_or("on"));
|
|
1377
1446
|
|
|
1378
1447
|
// last line must bigger than 32, because every single op deal at
|
|
1379
1448
|
// least 32 bytes.
|
|
@@ -1381,14 +1450,21 @@ static size_t ggml_backend_cann_buffer_type_get_alloc_size(
|
|
|
1381
1450
|
// int64_t line_size = ne0 * ggml_element_size(tensor);
|
|
1382
1451
|
// int64_t line_size_align_32 = (line_size + 31) & ~31;
|
|
1383
1452
|
// size += (line_size_align_32 - line_size);
|
|
1384
|
-
|
|
1385
|
-
// TODO: not support quantized yet.
|
|
1386
|
-
// TODO: consider un-continue tensor.
|
|
1387
1453
|
if (ggml_is_quantized(tensor->type)) {
|
|
1388
1454
|
if (ne0 % MATRIX_ROW_PADDING != 0) {
|
|
1389
|
-
size += ggml_row_size(
|
|
1390
|
-
tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
|
|
1455
|
+
size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
|
|
1391
1456
|
}
|
|
1457
|
+
} else if (weight_to_nz && is_matmul_weight((const ggml_tensor *) tensor)) {
|
|
1458
|
+
// NZ format weight are not support quantized yet.
|
|
1459
|
+
// If ND tensor transform to NZ, size may changed.
|
|
1460
|
+
int64_t shape[] = { tensor->ne[1], tensor->ne[0] };
|
|
1461
|
+
GGML_ASSERT(tensor->ne[2] == 1);
|
|
1462
|
+
GGML_ASSERT(tensor->ne[3] == 1);
|
|
1463
|
+
const aclIntArray * acl_shape = aclCreateIntArray(shape, 2);
|
|
1464
|
+
size_t new_size;
|
|
1465
|
+
ACL_CHECK(aclnnCalculateMatmulWeightSizeV2(acl_shape, ggml_cann_type_mapping(tensor->type), &new_size));
|
|
1466
|
+
ACL_CHECK(aclDestroyIntArray(acl_shape));
|
|
1467
|
+
size = std::max(size, new_size);
|
|
1392
1468
|
}
|
|
1393
1469
|
|
|
1394
1470
|
return size;
|
|
@@ -1427,17 +1503,15 @@ static const ggml_backend_buffer_type_i ggml_backend_cann_buffer_type_interface
|
|
|
1427
1503
|
* @return A pointer to the buffer type interface for the specified device, or
|
|
1428
1504
|
* nullptr if the device index is out of range.
|
|
1429
1505
|
*/
|
|
1430
|
-
ggml_backend_buffer_type_t
|
|
1431
|
-
|
|
1432
|
-
static std::mutex mutex;
|
|
1506
|
+
ggml_backend_buffer_type_t ggml_backend_cann_buffer_type(int32_t device) {
|
|
1507
|
+
static std::mutex mutex;
|
|
1433
1508
|
std::lock_guard<std::mutex> lock(mutex);
|
|
1434
1509
|
|
|
1435
1510
|
if (device >= ggml_backend_cann_get_device_count()) {
|
|
1436
1511
|
return nullptr;
|
|
1437
1512
|
}
|
|
1438
1513
|
|
|
1439
|
-
static ggml_backend_buffer_type
|
|
1440
|
-
ggml_backend_cann_buffer_types[GGML_CANN_MAX_DEVICES];
|
|
1514
|
+
static ggml_backend_buffer_type ggml_backend_cann_buffer_types[GGML_CANN_MAX_DEVICES];
|
|
1441
1515
|
|
|
1442
1516
|
static bool ggml_backend_cann_buffer_type_initialized = false;
|
|
1443
1517
|
|
|
@@ -1447,8 +1521,7 @@ ggml_backend_cann_buffer_type(int32_t device) {
|
|
|
1447
1521
|
/* .iface = */ ggml_backend_cann_buffer_type_interface,
|
|
1448
1522
|
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_cann_reg(), i),
|
|
1449
1523
|
/* .context = */
|
|
1450
|
-
|
|
1451
|
-
i, "CANN" + std::to_string(i)},
|
|
1524
|
+
new ggml_backend_cann_buffer_type_context{ i, "CANN" + std::to_string(i) },
|
|
1452
1525
|
};
|
|
1453
1526
|
}
|
|
1454
1527
|
ggml_backend_cann_buffer_type_initialized = true;
|
|
@@ -1512,16 +1585,16 @@ static void * ggml_cann_host_malloc(size_t size) {
|
|
|
1512
1585
|
}
|
|
1513
1586
|
|
|
1514
1587
|
const size_t alignment = 128;
|
|
1515
|
-
size
|
|
1588
|
+
size = GGML_PAD(size, alignment);
|
|
1516
1589
|
if (size == 0) {
|
|
1517
1590
|
size = alignment;
|
|
1518
1591
|
}
|
|
1519
1592
|
|
|
1520
|
-
void *
|
|
1521
|
-
aclError err
|
|
1593
|
+
void * hostPtr = nullptr;
|
|
1594
|
+
aclError err = aclrtMallocHost((void **) &hostPtr, size);
|
|
1522
1595
|
if (err != ACL_SUCCESS) {
|
|
1523
|
-
GGML_LOG_WARN("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__,
|
|
1524
|
-
|
|
1596
|
+
GGML_LOG_WARN("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__, size / 1024.0 / 1024.0,
|
|
1597
|
+
aclGetRecentErrMsg());
|
|
1525
1598
|
return nullptr;
|
|
1526
1599
|
}
|
|
1527
1600
|
return hostPtr;
|
|
@@ -1534,7 +1607,8 @@ static void * ggml_cann_host_malloc(size_t size) {
|
|
|
1534
1607
|
* @param size Size in bytes of the host buffer to allocate.
|
|
1535
1608
|
* @return Pointer to the allocated host buffer, or CPU buffer pointer if allocation fails.
|
|
1536
1609
|
*/
|
|
1537
|
-
static ggml_backend_buffer_t ggml_backend_cann_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
|
|
1610
|
+
static ggml_backend_buffer_t ggml_backend_cann_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
|
|
1611
|
+
size_t size) {
|
|
1538
1612
|
void * hostPtr = ggml_cann_host_malloc(size);
|
|
1539
1613
|
|
|
1540
1614
|
if (hostPtr == nullptr) {
|
|
@@ -1543,8 +1617,8 @@ static ggml_backend_buffer_t ggml_backend_cann_host_buffer_type_alloc_buffer(ggm
|
|
|
1543
1617
|
}
|
|
1544
1618
|
|
|
1545
1619
|
ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(hostPtr, size);
|
|
1546
|
-
buffer->buft
|
|
1547
|
-
buffer->iface.free_buffer
|
|
1620
|
+
buffer->buft = buft;
|
|
1621
|
+
buffer->iface.free_buffer = ggml_backend_cann_host_buffer_free;
|
|
1548
1622
|
|
|
1549
1623
|
return buffer;
|
|
1550
1624
|
}
|
|
@@ -1558,14 +1632,15 @@ static ggml_backend_buffer_t ggml_backend_cann_host_buffer_type_alloc_buffer(ggm
|
|
|
1558
1632
|
ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type() {
|
|
1559
1633
|
static struct ggml_backend_buffer_type ggml_backend_cann_buffer_type_host = {
|
|
1560
1634
|
/* .iface = */ {
|
|
1561
|
-
|
|
1562
|
-
|
|
1563
|
-
|
|
1564
|
-
|
|
1635
|
+
/* .get_name = */ ggml_backend_cann_host_buffer_type_name,
|
|
1636
|
+
/* .alloc_buffer = */ ggml_backend_cann_host_buffer_type_alloc_buffer,
|
|
1637
|
+
/* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
|
|
1638
|
+
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
|
|
1565
1639
|
/* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
|
|
1566
|
-
|
|
1567
|
-
|
|
1568
|
-
/* .device = */
|
|
1640
|
+
/* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
|
|
1641
|
+
},
|
|
1642
|
+
/* .device = */
|
|
1643
|
+
ggml_backend_reg_dev_get(ggml_backend_cann_reg(), 0),
|
|
1569
1644
|
/* .context = */ nullptr,
|
|
1570
1645
|
};
|
|
1571
1646
|
|
|
@@ -1585,8 +1660,7 @@ ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type() {
|
|
|
1585
1660
|
* stored.
|
|
1586
1661
|
* @return true if the computation was successful; false otherwise.
|
|
1587
1662
|
*/
|
|
1588
|
-
static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
|
|
1589
|
-
struct ggml_tensor* dst) {
|
|
1663
|
+
static bool ggml_cann_compute_forward(ggml_backend_cann_context & ctx, struct ggml_tensor * dst) {
|
|
1590
1664
|
switch (dst->op) {
|
|
1591
1665
|
case GGML_OP_REPEAT:
|
|
1592
1666
|
ggml_cann_repeat(ctx, dst);
|
|
@@ -1594,6 +1668,9 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
|
|
|
1594
1668
|
case GGML_OP_GET_ROWS:
|
|
1595
1669
|
ggml_cann_get_rows(ctx, dst);
|
|
1596
1670
|
break;
|
|
1671
|
+
case GGML_OP_SET_ROWS:
|
|
1672
|
+
ggml_cann_set_rows(ctx, dst);
|
|
1673
|
+
break;
|
|
1597
1674
|
case GGML_OP_DUP:
|
|
1598
1675
|
ggml_cann_dup(ctx, dst);
|
|
1599
1676
|
break;
|
|
@@ -1616,48 +1693,50 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
|
|
|
1616
1693
|
case GGML_OP_UNARY:
|
|
1617
1694
|
switch (ggml_get_unary_op(dst)) {
|
|
1618
1695
|
case GGML_UNARY_OP_ABS:
|
|
1619
|
-
|
|
1696
|
+
GGML_CANN_CALL_OP_UNARY(Abs);
|
|
1620
1697
|
break;
|
|
1621
1698
|
case GGML_UNARY_OP_NEG:
|
|
1622
|
-
|
|
1699
|
+
GGML_CANN_CALL_OP_UNARY(Neg);
|
|
1623
1700
|
break;
|
|
1624
1701
|
case GGML_UNARY_OP_GELU:
|
|
1625
|
-
|
|
1702
|
+
case GGML_UNARY_OP_GELU_ERF:
|
|
1703
|
+
// aclnnGelu internally uses the erf-based approximation.
|
|
1704
|
+
GGML_CANN_CALL_OP_UNARY(Gelu);
|
|
1626
1705
|
break;
|
|
1627
1706
|
case GGML_UNARY_OP_SILU:
|
|
1628
|
-
|
|
1707
|
+
GGML_CANN_CALL_OP_UNARY(Silu);
|
|
1708
|
+
break;
|
|
1709
|
+
case GGML_UNARY_OP_GELU_QUICK:
|
|
1710
|
+
{
|
|
1711
|
+
auto lambda = [](ggml_backend_cann_context & ctx, aclTensor * acl_src, aclTensor * acl_dst) {
|
|
1712
|
+
GGML_CANN_CALL_ACLNN_OP(ctx, GeluV2, acl_src, 0, acl_dst);
|
|
1713
|
+
};
|
|
1714
|
+
ggml_cann_op_unary(lambda, ctx, dst);
|
|
1715
|
+
}
|
|
1629
1716
|
break;
|
|
1630
|
-
case GGML_UNARY_OP_GELU_QUICK: {
|
|
1631
|
-
auto lambda = [](ggml_backend_cann_context& ctx,
|
|
1632
|
-
aclTensor* acl_src,
|
|
1633
|
-
aclTensor* acl_dst) {
|
|
1634
|
-
GGML_CANN_CALL_ACLNN_OP(ctx, GeluV2, acl_src, 0, acl_dst);
|
|
1635
|
-
};
|
|
1636
|
-
ggml_cann_unary_op(lambda, ctx, dst);
|
|
1637
|
-
} break;
|
|
1638
1717
|
case GGML_UNARY_OP_TANH:
|
|
1639
|
-
|
|
1718
|
+
GGML_CANN_CALL_OP_UNARY(Tanh);
|
|
1640
1719
|
break;
|
|
1641
1720
|
case GGML_UNARY_OP_RELU:
|
|
1642
|
-
|
|
1721
|
+
GGML_CANN_CALL_OP_UNARY(Relu);
|
|
1643
1722
|
break;
|
|
1644
1723
|
case GGML_UNARY_OP_SIGMOID:
|
|
1645
|
-
|
|
1724
|
+
GGML_CANN_CALL_OP_UNARY(Sigmoid);
|
|
1646
1725
|
break;
|
|
1647
1726
|
case GGML_UNARY_OP_HARDSIGMOID:
|
|
1648
|
-
|
|
1727
|
+
GGML_CANN_CALL_OP_UNARY(Hardsigmoid);
|
|
1649
1728
|
break;
|
|
1650
1729
|
case GGML_UNARY_OP_HARDSWISH:
|
|
1651
|
-
|
|
1730
|
+
GGML_CANN_CALL_OP_UNARY(Hardswish);
|
|
1652
1731
|
break;
|
|
1653
1732
|
case GGML_UNARY_OP_EXP:
|
|
1654
|
-
|
|
1733
|
+
GGML_CANN_CALL_OP_UNARY(Exp);
|
|
1655
1734
|
break;
|
|
1656
1735
|
case GGML_UNARY_OP_ELU:
|
|
1657
1736
|
ggml_cann_elu(ctx, dst);
|
|
1658
1737
|
break;
|
|
1659
1738
|
case GGML_UNARY_OP_SGN:
|
|
1660
|
-
|
|
1739
|
+
GGML_CANN_CALL_OP_UNARY(Sign);
|
|
1661
1740
|
break;
|
|
1662
1741
|
case GGML_UNARY_OP_STEP:
|
|
1663
1742
|
ggml_cann_step(ctx, dst);
|
|
@@ -1666,12 +1745,43 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
|
|
|
1666
1745
|
return false;
|
|
1667
1746
|
}
|
|
1668
1747
|
break;
|
|
1748
|
+
case GGML_OP_GLU:
|
|
1749
|
+
switch (ggml_get_glu_op(dst)) {
|
|
1750
|
+
case GGML_GLU_OP_REGLU:
|
|
1751
|
+
GGML_CANN_CALL_OP_UNARY_GATED(Relu);
|
|
1752
|
+
break;
|
|
1753
|
+
case GGML_GLU_OP_GEGLU:
|
|
1754
|
+
case GGML_GLU_OP_GEGLU_ERF:
|
|
1755
|
+
// aclnnGelu internally uses the erf-based approximation.
|
|
1756
|
+
GGML_CANN_CALL_OP_UNARY_GATED(Gelu);
|
|
1757
|
+
break;
|
|
1758
|
+
case GGML_GLU_OP_SWIGLU:
|
|
1759
|
+
GGML_CANN_CALL_OP_UNARY_GATED(Silu);
|
|
1760
|
+
break;
|
|
1761
|
+
case GGML_GLU_OP_GEGLU_QUICK:
|
|
1762
|
+
{
|
|
1763
|
+
auto lambda = [](ggml_backend_cann_context & ctx, aclTensor * acl_src, aclTensor * acl_dst) {
|
|
1764
|
+
GGML_CANN_CALL_ACLNN_OP(ctx, GeluV2, acl_src, 0, acl_dst);
|
|
1765
|
+
};
|
|
1766
|
+
ggml_cann_op_unary_gated(lambda, ctx, dst);
|
|
1767
|
+
}
|
|
1768
|
+
break;
|
|
1769
|
+
default:
|
|
1770
|
+
return false;
|
|
1771
|
+
}
|
|
1772
|
+
break;
|
|
1669
1773
|
case GGML_OP_NORM:
|
|
1670
1774
|
ggml_cann_norm(ctx, dst);
|
|
1671
1775
|
break;
|
|
1672
1776
|
case GGML_OP_GROUP_NORM:
|
|
1673
1777
|
ggml_cann_group_norm(ctx, dst);
|
|
1674
1778
|
break;
|
|
1779
|
+
case GGML_OP_L2_NORM:
|
|
1780
|
+
ggml_cann_l2_norm(ctx, dst);
|
|
1781
|
+
break;
|
|
1782
|
+
case GGML_OP_CROSS_ENTROPY_LOSS:
|
|
1783
|
+
ggml_cann_cross_entropy_loss(ctx, dst);
|
|
1784
|
+
break;
|
|
1675
1785
|
case GGML_OP_CONCAT:
|
|
1676
1786
|
ggml_cann_concat(ctx, dst);
|
|
1677
1787
|
break;
|
|
@@ -1708,7 +1818,7 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
|
|
|
1708
1818
|
ggml_cann_binary_op<aclnn_mul>(ctx, dst);
|
|
1709
1819
|
break;
|
|
1710
1820
|
case GGML_OP_SQRT:
|
|
1711
|
-
|
|
1821
|
+
GGML_CANN_CALL_OP_UNARY(Sqrt);
|
|
1712
1822
|
break;
|
|
1713
1823
|
case GGML_OP_CLAMP:
|
|
1714
1824
|
ggml_cann_clamp(ctx, dst);
|
|
@@ -1753,16 +1863,16 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
|
|
|
1753
1863
|
ggml_cann_argmax(ctx, dst);
|
|
1754
1864
|
break;
|
|
1755
1865
|
case GGML_OP_COS:
|
|
1756
|
-
|
|
1866
|
+
ggml_cann_op_unary<aclnn_cos>(ctx, dst);
|
|
1757
1867
|
break;
|
|
1758
1868
|
case GGML_OP_SIN:
|
|
1759
|
-
|
|
1869
|
+
ggml_cann_op_unary<aclnn_sin>(ctx, dst);
|
|
1760
1870
|
break;
|
|
1761
1871
|
case GGML_OP_CONV_TRANSPOSE_1D:
|
|
1762
1872
|
ggml_cann_conv_transpose_1d(ctx, dst);
|
|
1763
1873
|
break;
|
|
1764
1874
|
case GGML_OP_LOG:
|
|
1765
|
-
|
|
1875
|
+
GGML_CANN_CALL_OP_UNARY(Log);
|
|
1766
1876
|
break;
|
|
1767
1877
|
case GGML_OP_MEAN:
|
|
1768
1878
|
ggml_cann_mean(ctx, dst);
|
|
@@ -1776,6 +1886,12 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
|
|
|
1776
1886
|
case GGML_OP_FLASH_ATTN_EXT:
|
|
1777
1887
|
ggml_cann_flash_attn_ext(ctx, dst);
|
|
1778
1888
|
break;
|
|
1889
|
+
case GGML_OP_OUT_PROD:
|
|
1890
|
+
ggml_cann_out_prod(ctx, dst);
|
|
1891
|
+
break;
|
|
1892
|
+
case GGML_OP_SSM_CONV:
|
|
1893
|
+
ggml_cann_ssm_conv(ctx, dst);
|
|
1894
|
+
break;
|
|
1779
1895
|
default:
|
|
1780
1896
|
return false;
|
|
1781
1897
|
}
|
|
@@ -1793,9 +1909,8 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
|
|
|
1793
1909
|
* @param backend Pointer to the CANN backend structure.
|
|
1794
1910
|
* @return A pointer to a constant string representing the backend name.
|
|
1795
1911
|
*/
|
|
1796
|
-
static const char* ggml_backend_cann_name(ggml_backend_t backend) {
|
|
1797
|
-
ggml_backend_cann_context* cann_ctx =
|
|
1798
|
-
(ggml_backend_cann_context*)backend->context;
|
|
1912
|
+
static const char * ggml_backend_cann_name(ggml_backend_t backend) {
|
|
1913
|
+
ggml_backend_cann_context * cann_ctx = (ggml_backend_cann_context *) backend->context;
|
|
1799
1914
|
|
|
1800
1915
|
return cann_ctx->name.c_str();
|
|
1801
1916
|
}
|
|
@@ -1809,8 +1924,7 @@ static const char* ggml_backend_cann_name(ggml_backend_t backend) {
|
|
|
1809
1924
|
* @param backend Pointer to the CANN backend structure to be freed.
|
|
1810
1925
|
*/
|
|
1811
1926
|
static void ggml_backend_cann_free(ggml_backend_t backend) {
|
|
1812
|
-
ggml_backend_cann_context* cann_ctx =
|
|
1813
|
-
(ggml_backend_cann_context*)backend->context;
|
|
1927
|
+
ggml_backend_cann_context * cann_ctx = (ggml_backend_cann_context *) backend->context;
|
|
1814
1928
|
ACL_CHECK(aclrtSynchronizeDevice());
|
|
1815
1929
|
ACL_CHECK(aclrtResetDevice(cann_ctx->device));
|
|
1816
1930
|
|
|
@@ -1818,7 +1932,6 @@ static void ggml_backend_cann_free(ggml_backend_t backend) {
|
|
|
1818
1932
|
delete backend;
|
|
1819
1933
|
}
|
|
1820
1934
|
|
|
1821
|
-
|
|
1822
1935
|
/**
|
|
1823
1936
|
* @brief Sets tensor data asynchronously in the CANN backend.
|
|
1824
1937
|
*
|
|
@@ -1831,21 +1944,18 @@ static void ggml_backend_cann_free(ggml_backend_t backend) {
|
|
|
1831
1944
|
* @param size Size of the data to copy in bytes.
|
|
1832
1945
|
*/
|
|
1833
1946
|
static void ggml_backend_cann_set_tensor_async(ggml_backend_t backend,
|
|
1834
|
-
ggml_tensor *tensor,
|
|
1835
|
-
const void *data,
|
|
1836
|
-
size_t
|
|
1837
|
-
size_t
|
|
1838
|
-
ggml_backend_cann_context *cann_ctx =
|
|
1839
|
-
|
|
1840
|
-
|
|
1841
|
-
|
|
1842
|
-
|
|
1843
|
-
GGML_ASSERT(buf->buft == ggml_backend_cann_buffer_type(cann_ctx->device) &&
|
|
1844
|
-
"unsupported buffer type");
|
|
1947
|
+
ggml_tensor * tensor,
|
|
1948
|
+
const void * data,
|
|
1949
|
+
size_t offset,
|
|
1950
|
+
size_t size) {
|
|
1951
|
+
ggml_backend_cann_context * cann_ctx = (ggml_backend_cann_context *) backend->context;
|
|
1952
|
+
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
|
|
1953
|
+
|
|
1954
|
+
GGML_ASSERT(buf->buft == ggml_backend_cann_buffer_type(cann_ctx->device) && "unsupported buffer type");
|
|
1845
1955
|
GGML_ASSERT(!ggml_is_quantized(tensor->type));
|
|
1846
1956
|
|
|
1847
|
-
|
|
1848
|
-
|
|
1957
|
+
ACL_CHECK(aclrtMemcpyAsync((char *) tensor->data + offset, size, data, size, ACL_MEMCPY_HOST_TO_DEVICE,
|
|
1958
|
+
cann_ctx->stream()));
|
|
1849
1959
|
}
|
|
1850
1960
|
|
|
1851
1961
|
/**
|
|
@@ -1859,21 +1969,19 @@ static void ggml_backend_cann_set_tensor_async(ggml_backend_t backend,
|
|
|
1859
1969
|
* @param offset Offset in bytes within the host data.
|
|
1860
1970
|
* @param size Size of the data to copy in bytes.
|
|
1861
1971
|
*/
|
|
1862
|
-
static void ggml_backend_cann_get_tensor_async(
|
|
1863
|
-
|
|
1864
|
-
|
|
1865
|
-
|
|
1866
|
-
|
|
1867
|
-
|
|
1868
|
-
|
|
1869
|
-
|
|
1870
|
-
GGML_ASSERT(buf->buft == ggml_backend_cann_buffer_type(cann_ctx->device) &&
|
|
1871
|
-
"unsupported buffer type");
|
|
1972
|
+
static void ggml_backend_cann_get_tensor_async(ggml_backend_t backend,
|
|
1973
|
+
const ggml_tensor * tensor,
|
|
1974
|
+
void * data,
|
|
1975
|
+
size_t offset,
|
|
1976
|
+
size_t size) {
|
|
1977
|
+
ggml_backend_cann_context * cann_ctx = (ggml_backend_cann_context *) backend->context;
|
|
1978
|
+
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
|
|
1979
|
+
|
|
1980
|
+
GGML_ASSERT(buf->buft == ggml_backend_cann_buffer_type(cann_ctx->device) && "unsupported buffer type");
|
|
1872
1981
|
GGML_ASSERT(!ggml_is_quantized(tensor->type));
|
|
1873
1982
|
|
|
1874
|
-
|
|
1875
|
-
|
|
1876
|
-
|
|
1983
|
+
ACL_CHECK(aclrtMemcpyAsync(data, size, (char *) tensor->data + offset, size, ACL_MEMCPY_DEVICE_TO_HOST,
|
|
1984
|
+
cann_ctx->stream()));
|
|
1877
1985
|
}
|
|
1878
1986
|
|
|
1879
1987
|
/**
|
|
@@ -1889,62 +1997,67 @@ static void ggml_backend_cann_get_tensor_async(
|
|
|
1889
1997
|
* @param dst Pointer to the destination tensor to copy data to.
|
|
1890
1998
|
* @return true if the copy operation succeeds, false otherwise.
|
|
1891
1999
|
*/
|
|
1892
|
-
static bool ggml_backend_cann_cpy_tensor_async(
|
|
1893
|
-
|
|
1894
|
-
|
|
1895
|
-
|
|
1896
|
-
|
|
2000
|
+
static bool ggml_backend_cann_cpy_tensor_async(ggml_backend_t backend_src,
|
|
2001
|
+
ggml_backend_t backend_dst,
|
|
2002
|
+
const ggml_tensor * src,
|
|
2003
|
+
ggml_tensor * dst) {
|
|
2004
|
+
GGML_ASSERT(ggml_backend_is_cann(backend_src) || ggml_backend_is_cann(backend_dst));
|
|
2005
|
+
|
|
2006
|
+
GGML_ASSERT(!is_matmul_weight((const ggml_tensor *) src));
|
|
1897
2007
|
|
|
1898
|
-
if (!ggml_backend_buffer_is_cann(src->buffer) ||
|
|
1899
|
-
!ggml_backend_buffer_is_cann(dst->buffer)) {
|
|
2008
|
+
if (!ggml_backend_buffer_is_cann(src->buffer) || !ggml_backend_buffer_is_cann(dst->buffer)) {
|
|
1900
2009
|
return false;
|
|
1901
2010
|
}
|
|
1902
2011
|
|
|
1903
|
-
ggml_backend_buffer_t buf_src =
|
|
1904
|
-
|
|
1905
|
-
ggml_backend_buffer_t buf_dst =
|
|
1906
|
-
dst->view_src ? dst->view_src->buffer : dst->buffer;
|
|
2012
|
+
ggml_backend_buffer_t buf_src = src->view_src ? src->view_src->buffer : src->buffer;
|
|
2013
|
+
ggml_backend_buffer_t buf_dst = dst->view_src ? dst->view_src->buffer : dst->buffer;
|
|
1907
2014
|
|
|
1908
|
-
ggml_backend_cann_context* cann_ctx_src =
|
|
1909
|
-
|
|
1910
|
-
ggml_backend_cann_context* cann_ctx_dst =
|
|
1911
|
-
(ggml_backend_cann_context*)backend_dst->context;
|
|
2015
|
+
ggml_backend_cann_context * cann_ctx_src = (ggml_backend_cann_context *) backend_src->context;
|
|
2016
|
+
ggml_backend_cann_context * cann_ctx_dst = (ggml_backend_cann_context *) backend_dst->context;
|
|
1912
2017
|
|
|
1913
2018
|
size_t copy_size = ggml_nbytes(dst);
|
|
2019
|
+
if (copy_size == 0) {
|
|
2020
|
+
return true;
|
|
2021
|
+
}
|
|
1914
2022
|
if (backend_src != backend_dst) {
|
|
1915
|
-
|
|
1916
|
-
|
|
1917
|
-
|
|
1918
|
-
|
|
2023
|
+
#ifdef ASCEND_310P
|
|
2024
|
+
// TODO: Support 310p P2P copy
|
|
2025
|
+
return false;
|
|
2026
|
+
#endif
|
|
2027
|
+
ggml_backend_cann_buffer_context * buf_ctx_src = (ggml_backend_cann_buffer_context *) buf_src->context;
|
|
2028
|
+
ggml_backend_cann_buffer_context * buf_ctx_dst = (ggml_backend_cann_buffer_context *) buf_dst->context;
|
|
1919
2029
|
|
|
1920
2030
|
GGML_ASSERT(cann_ctx_src->device == buf_ctx_src->device);
|
|
1921
2031
|
GGML_ASSERT(cann_ctx_dst->device == buf_ctx_dst->device);
|
|
1922
2032
|
|
|
1923
2033
|
int32_t canAccessPeer = 0;
|
|
1924
|
-
ACL_CHECK(aclrtDeviceCanAccessPeer(&canAccessPeer, cann_ctx_src->device,
|
|
1925
|
-
cann_ctx_dst->device));
|
|
2034
|
+
ACL_CHECK(aclrtDeviceCanAccessPeer(&canAccessPeer, cann_ctx_src->device, cann_ctx_dst->device));
|
|
1926
2035
|
if (!canAccessPeer) {
|
|
1927
2036
|
return false;
|
|
1928
2037
|
}
|
|
1929
2038
|
|
|
1930
2039
|
// need open both directions for memcpyasync between devices.
|
|
1931
|
-
ggml_cann_set_device(cann_ctx_dst->device);
|
|
1932
2040
|
ACL_CHECK(aclrtDeviceEnablePeerAccess(cann_ctx_src->device, 0));
|
|
1933
2041
|
ggml_cann_set_device(cann_ctx_src->device);
|
|
1934
2042
|
ACL_CHECK(aclrtDeviceEnablePeerAccess(cann_ctx_dst->device, 0));
|
|
1935
2043
|
|
|
1936
2044
|
// wait for task_queue empty to keep task order.
|
|
1937
|
-
|
|
1938
|
-
ACL_CHECK(aclrtMemcpyAsync(dst->data, copy_size, src->data, copy_size,
|
|
1939
|
-
ACL_MEMCPY_DEVICE_TO_DEVICE,
|
|
2045
|
+
ACL_CHECK(aclrtMemcpyAsync(dst->data, copy_size, src->data, copy_size, ACL_MEMCPY_DEVICE_TO_DEVICE,
|
|
1940
2046
|
cann_ctx_src->stream()));
|
|
1941
|
-
|
|
1942
|
-
//TODO:
|
|
1943
|
-
|
|
2047
|
+
// record event on src stream after the copy
|
|
2048
|
+
// TODO: this event is not effective with acl graph mode, change to use aclrtSynchronizeStream
|
|
2049
|
+
// if (!cann_ctx_src->copy_event) {
|
|
2050
|
+
// ACL_CHECK(aclrtCreateEventWithFlag(&cann_ctx_src->copy_event, ACL_EVENT_SYNC));
|
|
2051
|
+
// }
|
|
2052
|
+
// ACL_CHECK(aclrtRecordEvent(cann_ctx_src->copy_event, cann_ctx_src->stream()));
|
|
2053
|
+
|
|
2054
|
+
// // wait on dst stream for the copy to complete
|
|
2055
|
+
// ggml_cann_set_device(cann_ctx_dst->device);
|
|
2056
|
+
// ACL_CHECK(aclrtStreamWaitEvent(cann_ctx_dst->stream(), cann_ctx_src->copy_event));
|
|
2057
|
+
ACL_CHECK(aclrtSynchronizeStream(cann_ctx_src->stream()));
|
|
1944
2058
|
} else {
|
|
1945
2059
|
// src and dst are on the same backend
|
|
1946
|
-
ACL_CHECK(aclrtMemcpyAsync(dst->data, copy_size, src->data, copy_size,
|
|
1947
|
-
ACL_MEMCPY_DEVICE_TO_DEVICE,
|
|
2060
|
+
ACL_CHECK(aclrtMemcpyAsync(dst->data, copy_size, src->data, copy_size, ACL_MEMCPY_DEVICE_TO_DEVICE,
|
|
1948
2061
|
cann_ctx_dst->stream()));
|
|
1949
2062
|
}
|
|
1950
2063
|
|
|
@@ -1960,13 +2073,110 @@ static bool ggml_backend_cann_cpy_tensor_async(
|
|
|
1960
2073
|
* @param backend Pointer to the CANN backend structure to synchronize.
|
|
1961
2074
|
*/
|
|
1962
2075
|
static void ggml_backend_cann_synchronize(ggml_backend_t backend) {
|
|
1963
|
-
ggml_backend_cann_context* cann_ctx =
|
|
1964
|
-
(ggml_backend_cann_context*)backend->context;
|
|
1965
|
-
cann_ctx->task_queue.wait();
|
|
2076
|
+
ggml_backend_cann_context * cann_ctx = (ggml_backend_cann_context *) backend->context;
|
|
1966
2077
|
ggml_cann_set_device(cann_ctx->device);
|
|
1967
2078
|
ACL_CHECK(aclrtSynchronizeStream(cann_ctx->stream()));
|
|
1968
2079
|
}
|
|
1969
2080
|
|
|
2081
|
+
/**
|
|
2082
|
+
* @brief Check if CANN backend can fuse the specified operation sequence
|
|
2083
|
+
*
|
|
2084
|
+
* This function determines whether an operation sequence starting from the specified node
|
|
2085
|
+
* can be fused into an optimized operation in the CANN backend. Operation fusion can reduce
|
|
2086
|
+
* memory access overhead and improve computational efficiency.
|
|
2087
|
+
*
|
|
2088
|
+
* @param cgraph Pointer to the computation graph
|
|
2089
|
+
* @param node_idx Index of the starting node in the computation graph
|
|
2090
|
+
* @param ops Sequence of operation types to check for fusion
|
|
2091
|
+
* @return true if the operations can be fused
|
|
2092
|
+
* @return false if the operations cannot be fused
|
|
2093
|
+
*/
|
|
2094
|
+
static bool ggml_cann_can_fuse(const struct ggml_cgraph * cgraph,
|
|
2095
|
+
int node_idx,
|
|
2096
|
+
std::initializer_list<enum ggml_op> ops) {
|
|
2097
|
+
if (!ggml_can_fuse(cgraph, node_idx, ops)) {
|
|
2098
|
+
return false;
|
|
2099
|
+
}
|
|
2100
|
+
|
|
2101
|
+
// CANN backend supports fusing ADD + RMS_NORM operations
|
|
2102
|
+
if ((ops.size() == 2) && ops.begin()[0] == GGML_OP_ADD && ops.begin()[1] == GGML_OP_RMS_NORM) {
|
|
2103
|
+
ggml_tensor * add_node = cgraph->nodes[node_idx];
|
|
2104
|
+
// TODO: support broadcast for ADD + RMS_NORM
|
|
2105
|
+
if (add_node->src[0]->ne[0] != add_node->src[1]->ne[0] || add_node->src[0]->ne[1] != add_node->src[1]->ne[1] ||
|
|
2106
|
+
add_node->src[0]->ne[2] != add_node->src[1]->ne[2] || add_node->src[0]->ne[3] != add_node->src[1]->ne[3]) {
|
|
2107
|
+
return false;
|
|
2108
|
+
}
|
|
2109
|
+
return true;
|
|
2110
|
+
}
|
|
2111
|
+
|
|
2112
|
+
return false;
|
|
2113
|
+
}
|
|
2114
|
+
|
|
2115
|
+
/**
|
|
2116
|
+
* @brief Evaluate the computation graph and optionally capture or execute it using CANN graph API.
|
|
2117
|
+
*
|
|
2118
|
+
* If CANN graph execution is enabled and graph capture is required, this function begins
|
|
2119
|
+
* graph capture, runs the graph, ends capture, and stores the captured graph.
|
|
2120
|
+
*
|
|
2121
|
+
* Otherwise, it falls back to op-by-op execution using the CANN compute kernel dispatcher.
|
|
2122
|
+
*
|
|
2123
|
+
* @param cann_ctx The CANN backend context.
|
|
2124
|
+
* @param cgraph The ggml computation graph.
|
|
2125
|
+
* @param use_cann_graph Whether to use CANN graph execution.
|
|
2126
|
+
* @param cann_graph_capture_required Whether graph capture is needed due to graph changes.
|
|
2127
|
+
*/
|
|
2128
|
+
static void evaluate_and_capture_cann_graph(ggml_backend_cann_context * cann_ctx,
|
|
2129
|
+
ggml_cgraph * cgraph,
|
|
2130
|
+
bool use_cann_graph,
|
|
2131
|
+
bool cann_graph_capture_required) {
|
|
2132
|
+
#ifdef USE_ACL_GRAPH
|
|
2133
|
+
if (use_cann_graph && cann_graph_capture_required) { // Begin CANN graph capture
|
|
2134
|
+
ACL_CHECK(aclmdlRICaptureBegin(cann_ctx->stream(), ACL_MODEL_RI_CAPTURE_MODE_GLOBAL));
|
|
2135
|
+
}
|
|
2136
|
+
#endif // USE_ACL_GRAPH
|
|
2137
|
+
// Only perform the graph execution if CANN graphs are not enabled, or we are capturing the graph.
|
|
2138
|
+
// With the use of CANN graphs, the execution will be performed by the graph launch.
|
|
2139
|
+
static bool opt_fusion = parse_bool(get_env_as_lowercase("GGML_CANN_OPERATOR_FUSION").value_or(""));
|
|
2140
|
+
|
|
2141
|
+
if (!use_cann_graph || cann_graph_capture_required) {
|
|
2142
|
+
for (int i = 0; i < cgraph->n_nodes; i++) {
|
|
2143
|
+
ggml_tensor * node = cgraph->nodes[i];
|
|
2144
|
+
if (opt_fusion) {
|
|
2145
|
+
if (ggml_cann_can_fuse(cgraph, i, { GGML_OP_ADD, GGML_OP_RMS_NORM })) {
|
|
2146
|
+
ggml_cann_op_add_rms_norm_fused(*cann_ctx, node, cgraph->nodes[i + 1]);
|
|
2147
|
+
i++;
|
|
2148
|
+
continue;
|
|
2149
|
+
}
|
|
2150
|
+
}
|
|
2151
|
+
|
|
2152
|
+
if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE ||
|
|
2153
|
+
node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
|
|
2154
|
+
continue;
|
|
2155
|
+
}
|
|
2156
|
+
|
|
2157
|
+
bool ok = ggml_cann_compute_forward(*cann_ctx, node);
|
|
2158
|
+
if (!ok) {
|
|
2159
|
+
GGML_LOG_ERROR("%s: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
|
|
2160
|
+
}
|
|
2161
|
+
GGML_ASSERT(ok);
|
|
2162
|
+
}
|
|
2163
|
+
}
|
|
2164
|
+
|
|
2165
|
+
#ifdef USE_ACL_GRAPH
|
|
2166
|
+
if (use_cann_graph) {
|
|
2167
|
+
GGML_ASSERT(!cann_ctx->graph_lru_cache.cache_list.empty());
|
|
2168
|
+
ggml_cann_graph * matched_graph = cann_ctx->graph_lru_cache.cache_list.front();
|
|
2169
|
+
|
|
2170
|
+
if (cann_graph_capture_required) { // End CANN graph capture
|
|
2171
|
+
ACL_CHECK(aclmdlRICaptureEnd(cann_ctx->stream(), &matched_graph->graph));
|
|
2172
|
+
}
|
|
2173
|
+
|
|
2174
|
+
// Execute CANN graph
|
|
2175
|
+
ACL_CHECK(aclmdlRIExecuteAsync(matched_graph->graph, cann_ctx->stream()));
|
|
2176
|
+
}
|
|
2177
|
+
#endif // USE_ACL_GRAPH
|
|
2178
|
+
}
|
|
2179
|
+
|
|
1970
2180
|
/**
|
|
1971
2181
|
* @brief Computes a computational graph using a CANN backend.
|
|
1972
2182
|
*
|
|
@@ -1979,28 +2189,50 @@ static void ggml_backend_cann_synchronize(ggml_backend_t backend) {
|
|
|
1979
2189
|
* @return enum ggml_status Returns GGML_STATUS_SUCCESS if computation
|
|
1980
2190
|
* completes successfully, otherwise an appropriate error status.
|
|
1981
2191
|
*/
|
|
1982
|
-
static enum ggml_status ggml_backend_cann_graph_compute(
|
|
1983
|
-
|
|
1984
|
-
ggml_backend_cann_context* cann_ctx =
|
|
1985
|
-
(ggml_backend_cann_context*)backend->context;
|
|
1986
|
-
|
|
2192
|
+
static enum ggml_status ggml_backend_cann_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
|
|
2193
|
+
ggml_backend_cann_context * cann_ctx = (ggml_backend_cann_context *) backend->context;
|
|
1987
2194
|
ggml_cann_set_device(cann_ctx->device);
|
|
1988
|
-
|
|
1989
|
-
|
|
1990
|
-
|
|
1991
|
-
|
|
1992
|
-
|
|
1993
|
-
|
|
2195
|
+
g_nz_workspaces[cann_ctx->device].clear();
|
|
2196
|
+
|
|
2197
|
+
// calculate rope cache for fist layer in current device.
|
|
2198
|
+
cann_ctx->rope_cache.cached = false;
|
|
2199
|
+
|
|
2200
|
+
bool graph_capture_required = false;
|
|
2201
|
+
#ifdef USE_ACL_GRAPH
|
|
2202
|
+
bool use_cann_graph = true;
|
|
2203
|
+
|
|
2204
|
+
static bool prefill_use_graph = parse_bool(get_env_as_lowercase("GGML_CANN_PREFILL_USE_GRAPH").value_or(""));
|
|
2205
|
+
if (!prefill_use_graph) {
|
|
2206
|
+
// Do not use acl_graph for prefill.
|
|
2207
|
+
for (int i = 0; i < cgraph->n_nodes; i++) {
|
|
2208
|
+
ggml_tensor * node = cgraph->nodes[i];
|
|
2209
|
+
// TODO: Optimize here. Currently, we can only
|
|
2210
|
+
// get seq_len by FA's input.
|
|
2211
|
+
if (node->op == GGML_OP_FLASH_ATTN_EXT) {
|
|
2212
|
+
// Q -> src[0], shape: [B, S, N, D]
|
|
2213
|
+
use_cann_graph = (node->src[0]->ne[1] == 1);
|
|
2214
|
+
break;
|
|
2215
|
+
}
|
|
1994
2216
|
}
|
|
2217
|
+
}
|
|
1995
2218
|
|
|
1996
|
-
|
|
2219
|
+
if (!cann_ctx->acl_graph_mode) {
|
|
2220
|
+
use_cann_graph = false;
|
|
2221
|
+
}
|
|
1997
2222
|
|
|
1998
|
-
|
|
1999
|
-
|
|
2000
|
-
|
|
2223
|
+
if (use_cann_graph) {
|
|
2224
|
+
// If no matching graph is found, the graph needs to be recaptured.
|
|
2225
|
+
graph_capture_required = !cann_ctx->graph_lru_cache.find_and_move_to_front(cgraph);
|
|
2226
|
+
if (graph_capture_required) {
|
|
2227
|
+
// If no matching graph is found, add a new ACL graph.
|
|
2228
|
+
ggml_cann_graph * new_graph = ggml_cann_graph::create_from_cgraph(cgraph);
|
|
2229
|
+
cann_ctx->graph_lru_cache.push(new_graph);
|
|
2001
2230
|
}
|
|
2002
|
-
GGML_ASSERT(ok);
|
|
2003
2231
|
}
|
|
2232
|
+
#else
|
|
2233
|
+
bool use_cann_graph = false;
|
|
2234
|
+
#endif // USE_ACL_GRAPH
|
|
2235
|
+
evaluate_and_capture_cann_graph(cann_ctx, cgraph, use_cann_graph, graph_capture_required);
|
|
2004
2236
|
|
|
2005
2237
|
return GGML_STATUS_SUCCESS;
|
|
2006
2238
|
}
|
|
@@ -2017,8 +2249,7 @@ static enum ggml_status ggml_backend_cann_graph_compute(
|
|
|
2017
2249
|
* @return bool Returns true if the operation is supported by the backend,
|
|
2018
2250
|
* otherwise false.
|
|
2019
2251
|
*/
|
|
2020
|
-
static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
|
|
2021
|
-
const ggml_tensor* op) {
|
|
2252
|
+
static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
|
|
2022
2253
|
switch (op->op) {
|
|
2023
2254
|
case GGML_OP_UNARY:
|
|
2024
2255
|
switch (ggml_get_unary_op(op)) {
|
|
@@ -2036,28 +2267,41 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
|
|
|
2036
2267
|
case GGML_UNARY_OP_ELU:
|
|
2037
2268
|
case GGML_UNARY_OP_SGN:
|
|
2038
2269
|
case GGML_UNARY_OP_STEP:
|
|
2270
|
+
case GGML_UNARY_OP_GELU_ERF:
|
|
2039
2271
|
return true;
|
|
2040
2272
|
default:
|
|
2041
2273
|
return false;
|
|
2042
2274
|
}
|
|
2043
|
-
case
|
|
2044
|
-
switch (op
|
|
2045
|
-
case
|
|
2046
|
-
case
|
|
2275
|
+
case GGML_OP_GLU:
|
|
2276
|
+
switch (ggml_get_glu_op(op)) {
|
|
2277
|
+
case GGML_GLU_OP_REGLU:
|
|
2278
|
+
case GGML_GLU_OP_GEGLU:
|
|
2279
|
+
case GGML_GLU_OP_SWIGLU:
|
|
2280
|
+
case GGML_GLU_OP_GEGLU_ERF:
|
|
2281
|
+
case GGML_GLU_OP_GEGLU_QUICK:
|
|
2047
2282
|
return true;
|
|
2048
|
-
case GGML_TYPE_Q8_0:
|
|
2049
|
-
case GGML_TYPE_Q4_0:
|
|
2050
|
-
#ifdef ASCEND_310P
|
|
2051
|
-
// Q4 && Q8 per group is not suppor on 310p device
|
|
2052
|
-
return false;
|
|
2053
|
-
#endif
|
|
2054
|
-
// only support contiguous for quantized types.
|
|
2055
|
-
return ggml_is_contiguous(op->src[0]) &&
|
|
2056
|
-
ggml_is_contiguous(op->src[1]);
|
|
2057
2283
|
default:
|
|
2058
2284
|
return false;
|
|
2059
2285
|
}
|
|
2060
|
-
|
|
2286
|
+
break;
|
|
2287
|
+
case GGML_OP_MUL_MAT:
|
|
2288
|
+
{
|
|
2289
|
+
switch (op->src[0]->type) {
|
|
2290
|
+
case GGML_TYPE_F16:
|
|
2291
|
+
case GGML_TYPE_F32:
|
|
2292
|
+
return true;
|
|
2293
|
+
case GGML_TYPE_Q8_0:
|
|
2294
|
+
case GGML_TYPE_Q4_0:
|
|
2295
|
+
#ifdef ASCEND_310P
|
|
2296
|
+
// Q4 && Q8 per group is not support on 310p device
|
|
2297
|
+
return false;
|
|
2298
|
+
#endif
|
|
2299
|
+
// only support contiguous for quantized types.
|
|
2300
|
+
return ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]);
|
|
2301
|
+
default:
|
|
2302
|
+
return false;
|
|
2303
|
+
}
|
|
2304
|
+
}
|
|
2061
2305
|
case GGML_OP_MUL_MAT_ID:
|
|
2062
2306
|
switch (op->src[0]->type) {
|
|
2063
2307
|
case GGML_TYPE_F16:
|
|
@@ -2066,106 +2310,112 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
|
|
|
2066
2310
|
case GGML_TYPE_Q8_0:
|
|
2067
2311
|
case GGML_TYPE_Q4_0:
|
|
2068
2312
|
#ifdef ASCEND_310P
|
|
2069
|
-
// Q4 && Q8 per group is not
|
|
2313
|
+
// Q4 && Q8 per group is not support on 310p device
|
|
2070
2314
|
return false;
|
|
2071
2315
|
#endif
|
|
2072
2316
|
// only support contiguous for quantized types.
|
|
2073
|
-
return ggml_is_contiguous(op->src[0]) &&
|
|
2074
|
-
ggml_is_contiguous(op->src[1]);
|
|
2317
|
+
return ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]);
|
|
2075
2318
|
default:
|
|
2076
2319
|
return false;
|
|
2077
2320
|
}
|
|
2078
2321
|
// embedding
|
|
2079
|
-
case GGML_OP_GET_ROWS:
|
|
2080
|
-
|
|
2081
|
-
|
|
2082
|
-
|
|
2083
|
-
|
|
2084
|
-
|
|
2085
|
-
|
|
2086
|
-
|
|
2087
|
-
|
|
2088
|
-
|
|
2089
|
-
case GGML_OP_CPY: {
|
|
2090
|
-
ggml_tensor *src = op->src[0];
|
|
2091
|
-
if ((op->type != GGML_TYPE_F32 && op->type != GGML_TYPE_F16) ||
|
|
2092
|
-
(src->type != GGML_TYPE_F32 &&
|
|
2093
|
-
src->type != GGML_TYPE_F16)) {
|
|
2094
|
-
// only support F32 and F16.
|
|
2095
|
-
return false;
|
|
2322
|
+
case GGML_OP_GET_ROWS:
|
|
2323
|
+
{
|
|
2324
|
+
switch (op->src[0]->type) {
|
|
2325
|
+
case GGML_TYPE_F32:
|
|
2326
|
+
case GGML_TYPE_F16:
|
|
2327
|
+
case GGML_TYPE_Q8_0:
|
|
2328
|
+
return true;
|
|
2329
|
+
default:
|
|
2330
|
+
return false;
|
|
2331
|
+
}
|
|
2096
2332
|
}
|
|
2097
|
-
|
|
2098
|
-
|
|
2099
|
-
|
|
2100
|
-
|
|
2333
|
+
break;
|
|
2334
|
+
case GGML_OP_SET_ROWS:
|
|
2335
|
+
{
|
|
2336
|
+
switch (op->type) {
|
|
2337
|
+
case GGML_TYPE_F32:
|
|
2338
|
+
case GGML_TYPE_F16:
|
|
2339
|
+
return true;
|
|
2340
|
+
default:
|
|
2341
|
+
return false;
|
|
2342
|
+
}
|
|
2101
2343
|
}
|
|
2102
|
-
|
|
2103
|
-
|
|
2104
|
-
|
|
2105
|
-
|
|
2106
|
-
|
|
2107
|
-
|
|
2108
|
-
|
|
2109
|
-
case GGML_TYPE_F16:
|
|
2110
|
-
return true;
|
|
2111
|
-
default:
|
|
2344
|
+
break;
|
|
2345
|
+
case GGML_OP_CPY:
|
|
2346
|
+
{
|
|
2347
|
+
ggml_tensor * src = op->src[0];
|
|
2348
|
+
if ((op->type != GGML_TYPE_F32 && op->type != GGML_TYPE_F16) ||
|
|
2349
|
+
(src->type != GGML_TYPE_F32 && src->type != GGML_TYPE_F16)) {
|
|
2350
|
+
// only support F32 and F16.
|
|
2112
2351
|
return false;
|
|
2352
|
+
}
|
|
2353
|
+
return true;
|
|
2113
2354
|
}
|
|
2114
|
-
|
|
2115
|
-
case
|
|
2116
|
-
|
|
2117
|
-
|
|
2118
|
-
|
|
2119
|
-
|
|
2120
|
-
|
|
2121
|
-
|
|
2122
|
-
|
|
2123
|
-
|
|
2124
|
-
|
|
2125
|
-
return false;
|
|
2126
|
-
}
|
|
2127
|
-
|
|
2128
|
-
const int mode = ((const int32_t *) op->op_params)[2];
|
|
2129
|
-
if (mode & GGML_ROPE_TYPE_MROPE) {
|
|
2130
|
-
return false;
|
|
2131
|
-
}
|
|
2132
|
-
if (mode & GGML_ROPE_TYPE_VISION) {
|
|
2133
|
-
return false;
|
|
2134
|
-
}
|
|
2135
|
-
|
|
2136
|
-
if(!ggml_is_contiguous(op->src[0])){
|
|
2137
|
-
return false;
|
|
2355
|
+
break;
|
|
2356
|
+
case GGML_OP_CONT:
|
|
2357
|
+
{
|
|
2358
|
+
// TODO: support GGML_TYPE_BF16
|
|
2359
|
+
switch (op->src[0]->type) {
|
|
2360
|
+
case GGML_TYPE_F32:
|
|
2361
|
+
case GGML_TYPE_F16:
|
|
2362
|
+
return true;
|
|
2363
|
+
default:
|
|
2364
|
+
return false;
|
|
2365
|
+
}
|
|
2138
2366
|
}
|
|
2139
|
-
|
|
2140
|
-
|
|
2141
|
-
|
|
2142
|
-
|
|
2143
|
-
|
|
2144
|
-
|
|
2145
|
-
|
|
2367
|
+
case GGML_OP_ROPE:
|
|
2368
|
+
{
|
|
2369
|
+
if (op->src[0]->ne[0] > 896) {
|
|
2370
|
+
return false;
|
|
2371
|
+
}
|
|
2372
|
+
#ifdef ASCEND_310P
|
|
2373
|
+
// TODO: Support rope_dim < ne00(dim)
|
|
2374
|
+
if (op->src[0]->ne[0] != op->op_params[1]) {
|
|
2375
|
+
return false;
|
|
2376
|
+
}
|
|
2377
|
+
if (!ggml_is_contiguous(op->src[0])) {
|
|
2378
|
+
return false;
|
|
2379
|
+
}
|
|
2380
|
+
#endif
|
|
2381
|
+
return true;
|
|
2146
2382
|
}
|
|
2147
|
-
|
|
2148
|
-
|
|
2383
|
+
case GGML_OP_UPSCALE:
|
|
2384
|
+
{
|
|
2385
|
+
// aclnnUpsampleNearest2dGetWorkspaceSize not support
|
|
2386
|
+
// selfDimN[2]/outDimN[2] or selfDimC[3]/outDimC[3] not equal
|
|
2387
|
+
if (op->src[0]->ne[2] * op->ne[3] != op->src[0]->ne[3] * op->ne[2]) {
|
|
2388
|
+
return false;
|
|
2389
|
+
}
|
|
2390
|
+
if (op->op_params[0] != GGML_SCALE_MODE_NEAREST) {
|
|
2391
|
+
return false;
|
|
2392
|
+
}
|
|
2393
|
+
if (op->op_params[0] & GGML_SCALE_FLAG_ANTIALIAS) {
|
|
2394
|
+
return false;
|
|
2395
|
+
}
|
|
2396
|
+
return true;
|
|
2149
2397
|
}
|
|
2150
|
-
|
|
2151
|
-
|
|
2152
|
-
|
|
2153
|
-
const int32_t * opts = (const int32_t *) op->op_params;
|
|
2398
|
+
case GGML_OP_POOL_2D:
|
|
2399
|
+
{
|
|
2400
|
+
const int32_t * opts = (const int32_t *) op->op_params;
|
|
2154
2401
|
#ifdef ASCEND_310P
|
|
2155
|
-
|
|
2156
|
-
|
|
2157
|
-
|
|
2158
|
-
|
|
2402
|
+
enum ggml_op_pool opt = static_cast<ggml_op_pool>(opts[0]);
|
|
2403
|
+
if (opt == GGML_OP_POOL_MAX) {
|
|
2404
|
+
return false;
|
|
2405
|
+
}
|
|
2159
2406
|
#endif
|
|
2160
|
-
|
|
2161
|
-
|
|
2162
|
-
|
|
2163
|
-
|
|
2164
|
-
|
|
2165
|
-
|
|
2166
|
-
|
|
2167
|
-
|
|
2407
|
+
const int k0 = opts[1];
|
|
2408
|
+
const int k1 = opts[2];
|
|
2409
|
+
const int p0 = opts[5];
|
|
2410
|
+
const int p1 = opts[6];
|
|
2411
|
+
// value of paddingH should be at most half of kernelH
|
|
2412
|
+
// value of paddingW should be at most half of kernelW
|
|
2413
|
+
return (p0 <= (k0 / 2)) && (p1 <= (k1 / 2));
|
|
2414
|
+
}
|
|
2168
2415
|
case GGML_OP_SUM:
|
|
2416
|
+
return ggml_is_contiguous_rows(op->src[0]);
|
|
2417
|
+
case GGML_OP_L2_NORM:
|
|
2418
|
+
case GGML_OP_CROSS_ENTROPY_LOSS:
|
|
2169
2419
|
case GGML_OP_DUP:
|
|
2170
2420
|
case GGML_OP_IM2COL:
|
|
2171
2421
|
case GGML_OP_CONCAT:
|
|
@@ -2182,61 +2432,93 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
|
|
|
2182
2432
|
case GGML_OP_MUL:
|
|
2183
2433
|
case GGML_OP_DIV:
|
|
2184
2434
|
case GGML_OP_RMS_NORM:
|
|
2185
|
-
case GGML_OP_SCALE:
|
|
2186
2435
|
case GGML_OP_SQR:
|
|
2187
2436
|
case GGML_OP_SQRT:
|
|
2188
2437
|
case GGML_OP_CLAMP:
|
|
2189
2438
|
case GGML_OP_DIAG_MASK_INF:
|
|
2190
|
-
case GGML_OP_SOFT_MAX:
|
|
2191
2439
|
case GGML_OP_SUM_ROWS:
|
|
2192
2440
|
case GGML_OP_ARGSORT:
|
|
2193
2441
|
case GGML_OP_ACC:
|
|
2194
2442
|
case GGML_OP_GROUP_NORM:
|
|
2443
|
+
return true;
|
|
2195
2444
|
case GGML_OP_PAD:
|
|
2445
|
+
// TODO: add circular padding support for cann, see https://github.com/ggml-org/llama.cpp/pull/16985
|
|
2446
|
+
return ggml_get_op_params_i32(op, 8) == 0;
|
|
2196
2447
|
case GGML_OP_ARANGE:
|
|
2197
2448
|
case GGML_OP_TIMESTEP_EMBEDDING:
|
|
2198
2449
|
case GGML_OP_LEAKY_RELU:
|
|
2199
2450
|
case GGML_OP_ARGMAX:
|
|
2200
2451
|
case GGML_OP_COS:
|
|
2201
2452
|
case GGML_OP_SIN:
|
|
2202
|
-
case GGML_OP_CONV_TRANSPOSE_1D:
|
|
2203
2453
|
case GGML_OP_LOG:
|
|
2204
2454
|
case GGML_OP_MEAN:
|
|
2205
2455
|
case GGML_OP_PAD_REFLECT_1D:
|
|
2206
2456
|
case GGML_OP_COUNT_EQUAL:
|
|
2207
2457
|
return true;
|
|
2208
|
-
case
|
|
2209
|
-
|
|
2210
|
-
|
|
2211
|
-
|
|
2212
|
-
}
|
|
2213
|
-
if(op->src[1]->type != GGML_TYPE_F16 && op->src[1]->type != GGML_TYPE_F32 && op->src[1]->type != GGML_TYPE_BF16){
|
|
2214
|
-
return false;
|
|
2215
|
-
}
|
|
2216
|
-
if(op->type != GGML_TYPE_F16 && op->type != GGML_TYPE_F32 && op->type != GGML_TYPE_BF16){
|
|
2217
|
-
return false;
|
|
2218
|
-
}
|
|
2219
|
-
if (op->src[1]->ne[0] != op->src[2]->ne[0]) {
|
|
2220
|
-
// different head sizes of K and V are not supported yet
|
|
2221
|
-
return false;
|
|
2222
|
-
}
|
|
2223
|
-
if (op->src[0]->ne[0] == 192) {
|
|
2224
|
-
return false;
|
|
2225
|
-
}
|
|
2226
|
-
if (op->src[0]->ne[0] == 576) {
|
|
2227
|
-
// DeepSeek MLA
|
|
2458
|
+
case GGML_OP_OUT_PROD:
|
|
2459
|
+
{
|
|
2460
|
+
#ifdef ASCEND_310P
|
|
2461
|
+
// Ger is not supported on 310p device
|
|
2228
2462
|
return false;
|
|
2463
|
+
#endif
|
|
2464
|
+
switch (op->src[0]->type) {
|
|
2465
|
+
case GGML_TYPE_F16:
|
|
2466
|
+
case GGML_TYPE_F32:
|
|
2467
|
+
return true;
|
|
2468
|
+
default:
|
|
2469
|
+
return false;
|
|
2470
|
+
}
|
|
2229
2471
|
}
|
|
2230
|
-
|
|
2472
|
+
case GGML_OP_CONV_TRANSPOSE_1D:
|
|
2473
|
+
return true;
|
|
2474
|
+
case GGML_OP_SCALE:
|
|
2475
|
+
float bias;
|
|
2476
|
+
memcpy(&bias, (const float *) (op->op_params) + 1, sizeof(float));
|
|
2477
|
+
return bias == 0.0f; // TODO: support bias != 0.0f
|
|
2478
|
+
case GGML_OP_SOFT_MAX:
|
|
2479
|
+
// TODO: support attention sinks [TAG_ATTN_SINKS]
|
|
2480
|
+
if (op->src[2]) {
|
|
2231
2481
|
return false;
|
|
2232
2482
|
}
|
|
2233
|
-
|
|
2234
|
-
|
|
2235
|
-
|
|
2483
|
+
return true;
|
|
2484
|
+
case GGML_OP_FLASH_ATTN_EXT:
|
|
2485
|
+
{
|
|
2486
|
+
#ifdef ASCEND_310P
|
|
2487
|
+
// FA not support on 310p device
|
|
2236
2488
|
return false;
|
|
2489
|
+
#endif
|
|
2490
|
+
// derived from [ggml-cuda.cu]
|
|
2491
|
+
if (op->src[1]->type != GGML_TYPE_F16 || op->src[2]->type != GGML_TYPE_F16) {
|
|
2492
|
+
return false;
|
|
2493
|
+
}
|
|
2494
|
+
if (op->src[1]->type != GGML_TYPE_F16 && op->src[1]->type != GGML_TYPE_F32 &&
|
|
2495
|
+
op->src[1]->type != GGML_TYPE_BF16) {
|
|
2496
|
+
return false;
|
|
2497
|
+
}
|
|
2498
|
+
if (op->type != GGML_TYPE_F16 && op->type != GGML_TYPE_F32 && op->type != GGML_TYPE_BF16) {
|
|
2499
|
+
return false;
|
|
2500
|
+
}
|
|
2501
|
+
// TODO: support attention sinks [TAG_ATTN_SINKS]
|
|
2502
|
+
if (op->src[4]) {
|
|
2503
|
+
return false;
|
|
2504
|
+
}
|
|
2505
|
+
if (op->src[1]->ne[0] != op->src[2]->ne[0]) {
|
|
2506
|
+
// different head sizes of K and V are not supported yet
|
|
2507
|
+
return false;
|
|
2508
|
+
}
|
|
2509
|
+
if (op->src[0]->ne[0] % 16 != 0) {
|
|
2510
|
+
// TODO: padding to support
|
|
2511
|
+
return false;
|
|
2512
|
+
}
|
|
2513
|
+
float logitSoftcap = 0.0f;
|
|
2514
|
+
memcpy(&logitSoftcap, (const float *) (op->op_params) + 2, sizeof(float));
|
|
2515
|
+
if (logitSoftcap != 0.0f) {
|
|
2516
|
+
return false;
|
|
2517
|
+
}
|
|
2518
|
+
return true;
|
|
2237
2519
|
}
|
|
2520
|
+
case GGML_OP_SSM_CONV:
|
|
2238
2521
|
return true;
|
|
2239
|
-
}
|
|
2240
2522
|
default:
|
|
2241
2523
|
return false;
|
|
2242
2524
|
}
|
|
@@ -2259,28 +2541,6 @@ static bool ggml_backend_buft_is_cann(ggml_backend_buffer_type_t buft) {
     return buft->iface.get_name == ggml_backend_cann_buffer_type_name;
 }
 
-/**
- * @brief Determines if a tensor operation should be offloaded to the CANN
- * backend.
- *
- * This function checks if a given tensor operation should be offloaded to the
- * CANN backend based on the operation type and the size of the tensor. It
- * returns true if the second dimension (ne[1]) of the tensor is greater than or
- * equal to the minimum batch size and the operation is not GGML_OP_GET_ROWS.
- *
- * @param backend Pointer to the CANN backend.
- * @param op Pointer to the tensor operation to check.
- * @return bool Returns true if the operation should be offloaded, otherwise
- * false.
- */
-static bool ggml_backend_cann_offload_op(ggml_backend_dev_t dev,
-                                         const ggml_tensor* op) {
-    const int min_batch_size = 32;
-    GGML_UNUSED(dev);
-
-    return op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS;
-}
-
 /**
  * @brief Records an event on the CANN backend stream.
  *
@@ -2290,9 +2550,8 @@ static bool ggml_backend_cann_offload_op(ggml_backend_dev_t dev,
  * @param event Pointer to the event structure to be recorded.
  */
 static void ggml_backend_cann_event_record(ggml_backend_t backend, ggml_backend_event_t event) {
-    ggml_backend_cann_context* cann_ctx =
-        (ggml_backend_cann_context*)backend->context;
-    ACL_CHECK(aclrtRecordEvent((aclrtEvent)event->context, cann_ctx->stream()));
+    ggml_backend_cann_context * cann_ctx = (ggml_backend_cann_context *) backend->context;
+    ACL_CHECK(aclrtRecordEvent((aclrtEvent) event->context, cann_ctx->stream()));
 }
 
 /**
@@ -2305,13 +2564,10 @@ static void ggml_backend_cann_event_record(ggml_backend_t backend, ggml_backend_
  * @param event Pointer to the event structure that the backend needs to wait
  * for.
  */
-static void ggml_backend_cann_event_wait(ggml_backend_t backend,
-                                         ggml_backend_event_t event) {
-    ggml_backend_cann_context* cann_ctx =
-        (ggml_backend_cann_context*)backend->context;
+static void ggml_backend_cann_event_wait(ggml_backend_t backend, ggml_backend_event_t event) {
+    ggml_backend_cann_context * cann_ctx = (ggml_backend_cann_context *) backend->context;
     if (ggml_backend_is_cann(backend)) {
-        ACL_CHECK(aclrtStreamWaitEvent(cann_ctx->stream(),
-                                       (aclrtEvent)event->context));
+        ACL_CHECK(aclrtStreamWaitEvent(cann_ctx->stream(), (aclrtEvent) event->context));
     } else {
         GGML_ABORT("fatal error");
     }
@@ -2338,6 +2594,7 @@ static const ggml_backend_i ggml_backend_cann_interface = {
     /* .graph_compute = */ ggml_backend_cann_graph_compute,
     /* .event_record = */ ggml_backend_cann_event_record,
     /* .event_wait = */ ggml_backend_cann_event_wait,
+    /* .graph_optimize = */ NULL,
 };
 
 /**
@@ -2349,30 +2606,31 @@ static const ggml_backend_i ggml_backend_cann_interface = {
  * @return A pointer to the static GUID.
  */
 static ggml_guid_t ggml_backend_cann_guid() {
-    static ggml_guid guid = {0xa1, 0x94, 0xaf, 0xac, 0xbd, 0x4f, 0x47, 0x34,
-                             0xbe, 0x1a, 0x9e, 0x71, 0x1f, 0x9e, 0xed, 0x64};
+    static ggml_guid guid = { 0xa1, 0x94, 0xaf, 0xac, 0xbd, 0x4f, 0x47, 0x34,
+                              0xbe, 0x1a, 0x9e, 0x71, 0x1f, 0x9e, 0xed, 0x64 };
     return &guid;
 }
 
 // backend device
 struct ggml_backend_cann_device_context {
-    int device;
+    int device;
     std::string name;
     std::string description;
+    int op_offload_min_batch_size;
 };
 
 static const char * ggml_backend_cann_device_get_name(ggml_backend_dev_t dev) {
-    ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *)dev->context;
+    ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *) dev->context;
     return ctx->name.c_str();
 }
 
-static const char* ggml_backend_cann_device_get_description(ggml_backend_dev_t dev) {
-    ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *)dev->context;
+static const char * ggml_backend_cann_device_get_description(ggml_backend_dev_t dev) {
+    ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *) dev->context;
     return ctx->description.c_str();
 }
 
 static void ggml_backend_cann_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
-    ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *)dev->context;
+    ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *) dev->context;
     ggml_backend_cann_get_device_memory(ctx->device, free, total);
 }
 
@@ -2399,7 +2657,7 @@ static void ggml_backend_cann_device_get_props(ggml_backend_dev_t dev, ggml_back
 
 static ggml_backend_t ggml_backend_cann_device_init(ggml_backend_dev_t dev, const char * params) {
     GGML_UNUSED(params);
-    ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *)dev->context;
+    ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *) dev->context;
     return ggml_backend_cann_init(ctx->device);
 }
 
@@ -2416,19 +2674,17 @@ static ggml_backend_t ggml_backend_cann_device_init(ggml_backend_dev_t dev, cons
  * @return bool Returns true if the CANN backend supports the buffer type,
  * otherwise false.
  */
-static bool ggml_backend_cann_supports_buft(
-    ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
+static bool ggml_backend_cann_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
     if (ggml_backend_buft_is_cann(buft)) {
-        ggml_backend_cann_device_context * dev_ctx = (ggml_backend_cann_device_context *)dev->context;
-        ggml_backend_cann_buffer_type_context * buft_ctx =
-            (ggml_backend_cann_buffer_type_context *)buft->context;
+        ggml_backend_cann_device_context * dev_ctx = (ggml_backend_cann_device_context *) dev->context;
+        ggml_backend_cann_buffer_type_context * buft_ctx = (ggml_backend_cann_buffer_type_context *) buft->context;
         return buft_ctx->device == dev_ctx->device;
     }
     return false;
 }
 
 static ggml_backend_buffer_type_t ggml_backend_cann_device_get_buffer_type(ggml_backend_dev_t dev) {
-    ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *)dev->context;
+    ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *) dev->context;
     return ggml_backend_cann_buffer_type(ctx->device);
 }
 
@@ -2437,6 +2693,26 @@ static ggml_backend_buffer_type_t ggml_backend_cann_device_get_host_buffer_type(
     return ggml_backend_cann_host_buffer_type();
 }
 
+/**
+ * @brief Determines if a tensor operation should be offloaded to the CANN
+ * backend.
+ *
+ * This function checks if a given tensor operation should be offloaded to the
+ * CANN backend based on the operation type and the size of the tensor. It
+ * returns true if the second dimension (ne[1]) of the tensor is greater than or
+ * equal to the minimum batch size and the operation is not GGML_OP_GET_ROWS.
+ *
+ * @param backend Pointer to the CANN backend.
+ * @param op Pointer to the tensor operation to check.
+ * @return bool Returns true if the operation should be offloaded, otherwise
+ * false.
+ */
+static bool ggml_backend_cann_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
+    ggml_backend_cann_device_context * dev_ctx = (ggml_backend_cann_device_context *)dev->context;
+
+    return op->ne[1] >= dev_ctx->op_offload_min_batch_size && op->op != GGML_OP_GET_ROWS;
+}
+
 /**
  * @brief Creates a new event for the CANN backend device.
  *
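The replacement ggml_backend_cann_offload_op added in this hunk reads its threshold from the per-device context rather than a hard-coded constant, but the rule itself is unchanged: an op is offloaded to the NPU only when its second dimension (ne[1], effectively the batch size) reaches the minimum and the op is not GGML_OP_GET_ROWS. A minimal sketch of that rule, detached from ggml and with invented names:

#include <cstdio>

// Hypothetical restatement of the offload gate, for illustration only.
static bool should_offload(long ne1, bool is_get_rows, int min_batch_size) {
    return ne1 >= min_batch_size && !is_get_rows;
}

int main() {
    // With the default threshold of 32, single-token decode stays on the host.
    std::printf("%d\n", should_offload(1,  false, 32));  // prints 0
    std::printf("%d\n", should_offload(64, false, 32));  // prints 1
    return 0;
}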
@@ -2447,9 +2723,8 @@ static ggml_backend_buffer_type_t ggml_backend_cann_device_get_host_buffer_type(
  * @param backend Pointer to the CANN backend.
  * @return ggml_backend_event_t Returns a pointer to the new event structure.
  */
-static ggml_backend_event_t ggml_backend_cann_device_event_new(
-    ggml_backend_dev_t dev) {
-    ggml_backend_cann_device_context * dev_ctx = (ggml_backend_cann_device_context *)dev->context;
+static ggml_backend_event_t ggml_backend_cann_device_event_new(ggml_backend_dev_t dev) {
+    ggml_backend_cann_device_context * dev_ctx = (ggml_backend_cann_device_context *) dev->context;
 
     ggml_cann_set_device(dev_ctx->device);
 
@@ -2471,7 +2746,7 @@ static ggml_backend_event_t ggml_backend_cann_device_event_new(
  * @param event Pointer to the event structure to be freed.
  */
 static void ggml_backend_cann_device_event_free(ggml_backend_dev_t dev, ggml_backend_event_t event) {
-    ACL_CHECK(aclrtDestroyEvent((aclrtEvent)event->context));
+    ACL_CHECK(aclrtDestroyEvent((aclrtEvent) event->context));
 
     delete event;
     GGML_UNUSED(dev);
@@ -2485,7 +2760,7 @@ static void ggml_backend_cann_device_event_free(ggml_backend_dev_t dev, ggml_bac
  * @param event Pointer to the event structure to be synchronized.
  */
 static void ggml_backend_cann_device_event_synchronize(ggml_backend_dev_t dev, ggml_backend_event_t event) {
-    ACL_CHECK(aclrtSynchronizeEvent((aclrtEvent)event->context));
+    ACL_CHECK(aclrtSynchronizeEvent((aclrtEvent) event->context));
 
     GGML_UNUSED(dev);
 }
@@ -2496,10 +2771,10 @@ static const ggml_backend_device_i ggml_backend_cann_device_interface = {
     /* .get_memory = */ ggml_backend_cann_device_get_memory,
     /* .get_type = */ ggml_backend_cann_device_get_type,
     /* .get_props = */ ggml_backend_cann_device_get_props,
-    /* .init_backend = */ ggml_backend_cann_device_init,
+    /* .init_backend = */ ggml_backend_cann_device_init, // called for every card
     /* .get_buffer_type = */ ggml_backend_cann_device_get_buffer_type,
     /* .get_host_buffer_type = */ ggml_backend_cann_device_get_host_buffer_type,
-    /* .buffer_from_host_ptr = */ NULL,
+    /* .buffer_from_host_ptr = */ NULL, // not supported for CANN
     /* .supports_op = */ ggml_backend_cann_supports_op,
     /* .supports_buft = */ ggml_backend_cann_supports_buft,
     /* .offload_op = */ ggml_backend_cann_offload_op,
@@ -2508,7 +2783,6 @@ static const ggml_backend_device_i ggml_backend_cann_device_interface = {
     /* .event_synchronize = */ ggml_backend_cann_device_event_synchronize,
 };
 
-
 // backend reg
 struct ggml_backend_cann_reg_context {
     std::vector<ggml_backend_dev_t> devices;
@@ -2520,12 +2794,12 @@ static const char * ggml_backend_cann_reg_get_name(ggml_backend_reg_t reg) {
 }
 
 static size_t ggml_backend_cann_reg_get_device_count(ggml_backend_reg_t reg) {
-    ggml_backend_cann_reg_context * ctx = (ggml_backend_cann_reg_context *)reg->context;
+    ggml_backend_cann_reg_context * ctx = (ggml_backend_cann_reg_context *) reg->context;
     return ctx->devices.size();
 }
 
 static ggml_backend_dev_t ggml_backend_cann_reg_get_device(ggml_backend_reg_t reg, size_t index) {
-    ggml_backend_cann_reg_context * ctx = (ggml_backend_cann_reg_context *)reg->context;
+    ggml_backend_cann_reg_context * ctx = (ggml_backend_cann_reg_context *) reg->context;
     GGML_ASSERT(index < ctx->devices.size());
     return ctx->devices[index];
 }
@@ -2547,34 +2821,32 @@ static const ggml_backend_reg_i ggml_backend_cann_reg_interface = {
 // backend registry, called only once for cann backend
 ggml_backend_reg_t ggml_backend_cann_reg() {
     static ggml_backend_reg reg;
-    static bool initialized = false;
+    static bool initialized = false;
 
     {
-        static std::mutex mutex;
+        static std::mutex mutex;
         std::lock_guard<std::mutex> lock(mutex);
         if (!initialized) {
            aclInit(nullptr);
            ggml_backend_cann_reg_context * ctx = new ggml_backend_cann_reg_context;
+           const int min_batch_size = getenv("GGML_OP_OFFLOAD_MIN_BATCH") ? atoi(getenv("GGML_OP_OFFLOAD_MIN_BATCH")) : 32;
 
            for (int i = 0; i < ggml_cann_info().device_count; i++) {
-               ggml_backend_cann_device_context* dev_ctx = new ggml_backend_cann_device_context();
-               dev_ctx->description = aclrtGetSocName();
-               dev_ctx->device = i;
-               dev_ctx->name = GGML_CANN_NAME + std::to_string(i);
+               ggml_backend_cann_device_context * dev_ctx = new ggml_backend_cann_device_context();
+               dev_ctx->description = aclrtGetSocName();
+               dev_ctx->device = i;
+               dev_ctx->name = GGML_CANN_NAME + std::to_string(i);
+               dev_ctx->op_offload_min_batch_size = min_batch_size;
               ggml_cann_set_device(i);
-               ggml_backend_dev_t dev = new ggml_backend_device {
-                   /* .iface = */ ggml_backend_cann_device_interface,
-                   /* .reg = */ &reg,
-                   /* .context = */ dev_ctx
-               };
+               ggml_backend_dev_t dev = new ggml_backend_device{ /* .iface = */ ggml_backend_cann_device_interface,
+                                                                 /* .reg = */ &reg,
+                                                                 /* .context = */ dev_ctx };
               ctx->devices.push_back(dev);
           }
 
-          reg = ggml_backend_reg {
-              /* .api_version = */ GGML_BACKEND_API_VERSION,
-              /* .iface = */ ggml_backend_cann_reg_interface,
-              /* .context = */ ctx
-          };
+          reg = ggml_backend_reg{ /* .api_version = */ GGML_BACKEND_API_VERSION,
+                                  /* .iface = */ ggml_backend_cann_reg_interface,
+                                  /* .context = */ ctx };
       }
 
       initialized = true;
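The registration path above now derives the offload threshold from the GGML_OP_OFFLOAD_MIN_BATCH environment variable, falling back to 32, and stores it on each device context as op_offload_min_batch_size. A minimal sketch of the same getenv-with-default pattern, detached from ggml; the helper name is invented:

#include <cstdlib>
#include <cstdio>

// Reads GGML_OP_OFFLOAD_MIN_BATCH from the environment, defaulting to `fallback`.
static int offload_min_batch_from_env(int fallback) {
    const char * v = std::getenv("GGML_OP_OFFLOAD_MIN_BATCH");
    return v ? std::atoi(v) : fallback;
}

int main() {
    // Prints 32 unless GGML_OP_OFFLOAD_MIN_BATCH is set in the environment,
    // e.g. exporting GGML_OP_OFFLOAD_MIN_BATCH=64 keeps smaller batches on the host.
    std::printf("min batch for offload: %d\n", offload_min_batch_from_env(32));
    return 0;
}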
@@ -2590,39 +2862,36 @@ ggml_backend_t ggml_backend_cann_init(int32_t device) {
         return nullptr;
     }
 
-    ggml_backend_cann_context* ctx = new ggml_backend_cann_context(device);
+    ggml_backend_cann_context * ctx = new ggml_backend_cann_context(device);
     if (ctx == nullptr) {
         GGML_LOG_ERROR("%s: error: failed to allocate context\n", __func__);
         return nullptr;
     }
     ggml_cann_set_device(ctx->device);
     ggml_backend_t cann_backend =
-        new ggml_backend{/* .guid = */ ggml_backend_cann_guid(),
-                         /* .interface = */ ggml_backend_cann_interface,
-                         /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cann_reg(), device),
-                         /* .context = */ ctx};
+        new ggml_backend{ /* .guid = */ ggml_backend_cann_guid(),
+                          /* .interface = */ ggml_backend_cann_interface,
+                          /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cann_reg(), device),
+                          /* .context = */ ctx };
 
     return cann_backend;
 }
 
 bool ggml_backend_is_cann(ggml_backend_t backend) {
-    return backend != NULL &&
-           ggml_guid_matches(backend->guid, ggml_backend_cann_guid());
+    return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_cann_guid());
 }
 
 int32_t ggml_backend_cann_get_device_count() {
     return ggml_cann_info().device_count;
 }
 
-void ggml_backend_cann_get_device_description(
-    int32_t device, char* description, size_t description_size) {
+void ggml_backend_cann_get_device_description(int32_t device, char * description, size_t description_size) {
     ggml_cann_set_device(device);
-    const char* soc_name = aclrtGetSocName();
+    const char * soc_name = aclrtGetSocName();
     snprintf(description, description_size, "%s", soc_name);
 }
 
-void ggml_backend_cann_get_device_memory(int32_t device, size_t* free,
-                                         size_t* total) {
+void ggml_backend_cann_get_device_memory(int32_t device, size_t * free, size_t * total) {
     ggml_cann_set_device(device);
     ACL_CHECK(aclrtGetMemInfo(ACL_HBM_MEM, free, total));
 }