whispercpp 1.3.3 → 1.3.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +60 -43
- data/ext/extconf.rb +2 -2
- data/ext/ruby_whisper.c +14 -2
- data/ext/ruby_whisper.h +39 -0
- data/ext/ruby_whisper_context.c +22 -22
- data/ext/ruby_whisper_model.c +12 -12
- data/ext/ruby_whisper_params.c +79 -25
- data/ext/ruby_whisper_segment.c +84 -19
- data/ext/ruby_whisper_token.c +351 -0
- data/ext/ruby_whisper_transcribe.cpp +1 -1
- data/ext/ruby_whisper_vad_context.c +75 -0
- data/ext/ruby_whisper_vad_context_detect.cpp +50 -0
- data/ext/ruby_whisper_vad_segment.c +139 -0
- data/ext/ruby_whisper_vad_segments.c +106 -0
- data/ext/sources/CMakeLists.txt +4 -1
- data/ext/sources/bindings/javascript/package.json +1 -1
- data/ext/sources/cmake/arm64-apple-clang.cmake +16 -0
- data/ext/sources/cmake/arm64-windows-llvm.cmake +16 -0
- data/ext/sources/cmake/riscv64-spacemit-linux-gnu-gcc.cmake +29 -0
- data/ext/sources/cmake/x64-windows-llvm.cmake +5 -0
- data/ext/sources/examples/CMakeLists.txt +1 -0
- data/ext/sources/examples/addon.node/addon.cpp +19 -19
- data/ext/sources/examples/addon.node/index.js +7 -5
- data/ext/sources/examples/addon.node/vad-example.js +2 -2
- data/ext/sources/examples/bench/bench.cpp +26 -16
- data/ext/sources/examples/bench.wasm/index-tmpl.html +10 -9
- data/ext/sources/examples/cli/cli.cpp +122 -111
- data/ext/sources/examples/command/command.cpp +26 -24
- data/ext/sources/examples/command.wasm/index-tmpl.html +5 -4
- data/ext/sources/examples/common-ggml.cpp +2 -0
- data/ext/sources/examples/lsp/CMakeLists.txt +2 -1
- data/ext/sources/examples/lsp/lsp.cpp +19 -17
- data/ext/sources/examples/quantize/CMakeLists.txt +2 -1
- data/ext/sources/examples/server/server.cpp +34 -24
- data/ext/sources/examples/server.py +6 -1
- data/ext/sources/examples/stream/stream.cpp +4 -2
- data/ext/sources/examples/stream.wasm/emscripten.cpp +6 -6
- data/ext/sources/examples/stream.wasm/index-tmpl.html +82 -5
- data/ext/sources/examples/talk-llama/CMakeLists.txt +7 -3
- data/ext/sources/examples/talk-llama/llama-adapter.cpp +113 -7
- data/ext/sources/examples/talk-llama/llama-adapter.h +13 -1
- data/ext/sources/examples/talk-llama/llama-arch.cpp +2136 -1491
- data/ext/sources/examples/talk-llama/llama-arch.h +125 -3
- data/ext/sources/examples/talk-llama/llama-batch.cpp +174 -100
- data/ext/sources/examples/talk-llama/llama-batch.h +46 -20
- data/ext/sources/examples/talk-llama/llama-chat.cpp +199 -8
- data/ext/sources/examples/talk-llama/llama-chat.h +11 -0
- data/ext/sources/examples/talk-llama/llama-context.cpp +1213 -413
- data/ext/sources/examples/talk-llama/llama-context.h +99 -36
- data/ext/sources/examples/talk-llama/llama-cparams.h +5 -4
- data/ext/sources/examples/talk-llama/llama-grammar.cpp +288 -53
- data/ext/sources/examples/talk-llama/llama-grammar.h +22 -1
- data/ext/sources/examples/talk-llama/llama-graph.cpp +883 -294
- data/ext/sources/examples/talk-llama/llama-graph.h +361 -161
- data/ext/sources/examples/talk-llama/llama-hparams.cpp +144 -6
- data/ext/sources/examples/talk-llama/llama-hparams.h +100 -23
- data/ext/sources/examples/talk-llama/llama-impl.cpp +7 -3
- data/ext/sources/examples/talk-llama/llama-impl.h +3 -1
- data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.cpp +328 -0
- data/ext/sources/examples/talk-llama/{llama-kv-cache-unified-iswa.h → llama-kv-cache-iswa.h} +38 -29
- data/ext/sources/examples/talk-llama/llama-kv-cache.cpp +2100 -0
- data/ext/sources/examples/talk-llama/llama-kv-cache.h +373 -27
- data/ext/sources/examples/talk-llama/llama-kv-cells.h +124 -30
- data/ext/sources/examples/talk-llama/llama-memory-hybrid.cpp +63 -41
- data/ext/sources/examples/talk-llama/llama-memory-hybrid.h +30 -29
- data/ext/sources/examples/talk-llama/llama-memory-recurrent.cpp +77 -35
- data/ext/sources/examples/talk-llama/llama-memory-recurrent.h +15 -16
- data/ext/sources/examples/talk-llama/llama-memory.h +16 -10
- data/ext/sources/examples/talk-llama/llama-mmap.cpp +172 -37
- data/ext/sources/examples/talk-llama/llama-mmap.h +8 -3
- data/ext/sources/examples/talk-llama/llama-model-loader.cpp +93 -9
- data/ext/sources/examples/talk-llama/llama-model-loader.h +9 -2
- data/ext/sources/examples/talk-llama/llama-model-saver.cpp +3 -0
- data/ext/sources/examples/talk-llama/llama-model.cpp +3369 -10145
- data/ext/sources/examples/talk-llama/llama-model.h +104 -12
- data/ext/sources/examples/talk-llama/llama-quant.cpp +53 -30
- data/ext/sources/examples/talk-llama/llama-sampling.cpp +1520 -324
- data/ext/sources/examples/talk-llama/llama-sampling.h +19 -7
- data/ext/sources/examples/talk-llama/llama-vocab.cpp +562 -39
- data/ext/sources/examples/talk-llama/llama-vocab.h +50 -0
- data/ext/sources/examples/talk-llama/llama.cpp +794 -12
- data/ext/sources/examples/talk-llama/llama.h +246 -190
- data/ext/sources/examples/talk-llama/models/afmoe.cpp +191 -0
- data/ext/sources/examples/talk-llama/models/apertus.cpp +125 -0
- data/ext/sources/examples/talk-llama/models/arcee.cpp +135 -0
- data/ext/sources/examples/talk-llama/models/arctic.cpp +138 -0
- data/ext/sources/examples/talk-llama/models/arwkv7.cpp +86 -0
- data/ext/sources/examples/talk-llama/models/baichuan.cpp +122 -0
- data/ext/sources/examples/talk-llama/models/bailingmoe.cpp +144 -0
- data/ext/sources/examples/talk-llama/models/bailingmoe2.cpp +135 -0
- data/ext/sources/examples/talk-llama/models/bert.cpp +178 -0
- data/ext/sources/examples/talk-llama/models/bitnet.cpp +160 -0
- data/ext/sources/examples/talk-llama/models/bloom.cpp +101 -0
- data/ext/sources/examples/talk-llama/models/chameleon.cpp +178 -0
- data/ext/sources/examples/talk-llama/models/chatglm.cpp +132 -0
- data/ext/sources/examples/talk-llama/models/codeshell.cpp +111 -0
- data/ext/sources/examples/talk-llama/models/cogvlm.cpp +102 -0
- data/ext/sources/examples/talk-llama/models/cohere2-iswa.cpp +134 -0
- data/ext/sources/examples/talk-llama/models/command-r.cpp +122 -0
- data/ext/sources/examples/talk-llama/models/dbrx.cpp +123 -0
- data/ext/sources/examples/talk-llama/models/deci.cpp +135 -0
- data/ext/sources/examples/talk-llama/models/deepseek.cpp +144 -0
- data/ext/sources/examples/talk-llama/models/deepseek2.cpp +259 -0
- data/ext/sources/examples/talk-llama/models/dots1.cpp +134 -0
- data/ext/sources/examples/talk-llama/models/dream.cpp +105 -0
- data/ext/sources/examples/talk-llama/models/ernie4-5-moe.cpp +150 -0
- data/ext/sources/examples/talk-llama/models/ernie4-5.cpp +110 -0
- data/ext/sources/examples/talk-llama/models/exaone.cpp +114 -0
- data/ext/sources/examples/talk-llama/models/exaone4.cpp +123 -0
- data/ext/sources/examples/talk-llama/models/falcon-h1.cpp +113 -0
- data/ext/sources/examples/talk-llama/models/falcon.cpp +120 -0
- data/ext/sources/examples/talk-llama/models/gemma-embedding.cpp +116 -0
- data/ext/sources/examples/talk-llama/models/gemma.cpp +112 -0
- data/ext/sources/examples/talk-llama/models/gemma2-iswa.cpp +128 -0
- data/ext/sources/examples/talk-llama/models/gemma3.cpp +155 -0
- data/ext/sources/examples/talk-llama/models/gemma3n-iswa.cpp +384 -0
- data/ext/sources/examples/talk-llama/models/glm4-moe.cpp +170 -0
- data/ext/sources/examples/talk-llama/models/glm4.cpp +150 -0
- data/ext/sources/examples/talk-llama/models/gpt2.cpp +105 -0
- data/ext/sources/examples/talk-llama/models/gptneox.cpp +144 -0
- data/ext/sources/examples/talk-llama/models/granite-hybrid.cpp +196 -0
- data/ext/sources/examples/talk-llama/models/granite.cpp +211 -0
- data/ext/sources/examples/talk-llama/models/graph-context-mamba.cpp +283 -0
- data/ext/sources/examples/talk-llama/models/grok.cpp +159 -0
- data/ext/sources/examples/talk-llama/models/grovemoe.cpp +141 -0
- data/ext/sources/examples/talk-llama/models/hunyuan-dense.cpp +132 -0
- data/ext/sources/examples/talk-llama/models/hunyuan-moe.cpp +154 -0
- data/ext/sources/examples/talk-llama/models/internlm2.cpp +120 -0
- data/ext/sources/examples/talk-llama/models/jais.cpp +86 -0
- data/ext/sources/examples/talk-llama/models/jamba.cpp +106 -0
- data/ext/sources/examples/talk-llama/models/lfm2.cpp +175 -0
- data/ext/sources/examples/talk-llama/models/llada-moe.cpp +122 -0
- data/ext/sources/examples/talk-llama/models/llada.cpp +99 -0
- data/ext/sources/examples/talk-llama/models/llama-iswa.cpp +178 -0
- data/ext/sources/examples/talk-llama/models/llama.cpp +168 -0
- data/ext/sources/examples/talk-llama/models/maincoder.cpp +117 -0
- data/ext/sources/examples/talk-llama/models/mamba.cpp +55 -0
- data/ext/sources/examples/talk-llama/models/mimo2-iswa.cpp +123 -0
- data/ext/sources/examples/talk-llama/models/minicpm3.cpp +199 -0
- data/ext/sources/examples/talk-llama/models/minimax-m2.cpp +124 -0
- data/ext/sources/examples/talk-llama/models/mistral3.cpp +160 -0
- data/ext/sources/examples/talk-llama/models/models.h +569 -0
- data/ext/sources/examples/talk-llama/models/modern-bert.cpp +116 -0
- data/ext/sources/examples/talk-llama/models/mpt.cpp +126 -0
- data/ext/sources/examples/talk-llama/models/nemotron-h.cpp +150 -0
- data/ext/sources/examples/talk-llama/models/nemotron.cpp +122 -0
- data/ext/sources/examples/talk-llama/models/neo-bert.cpp +104 -0
- data/ext/sources/examples/talk-llama/models/olmo.cpp +121 -0
- data/ext/sources/examples/talk-llama/models/olmo2.cpp +150 -0
- data/ext/sources/examples/talk-llama/models/olmoe.cpp +124 -0
- data/ext/sources/examples/talk-llama/models/openai-moe-iswa.cpp +127 -0
- data/ext/sources/examples/talk-llama/models/openelm.cpp +124 -0
- data/ext/sources/examples/talk-llama/models/orion.cpp +123 -0
- data/ext/sources/examples/talk-llama/models/pangu-embedded.cpp +121 -0
- data/ext/sources/examples/talk-llama/models/phi2.cpp +121 -0
- data/ext/sources/examples/talk-llama/models/phi3.cpp +152 -0
- data/ext/sources/examples/talk-llama/models/plamo.cpp +110 -0
- data/ext/sources/examples/talk-llama/models/plamo2.cpp +316 -0
- data/ext/sources/examples/talk-llama/models/plamo3.cpp +128 -0
- data/ext/sources/examples/talk-llama/models/plm.cpp +168 -0
- data/ext/sources/examples/talk-llama/models/qwen.cpp +108 -0
- data/ext/sources/examples/talk-llama/models/qwen2.cpp +126 -0
- data/ext/sources/examples/talk-llama/models/qwen2moe.cpp +151 -0
- data/ext/sources/examples/talk-llama/models/qwen2vl.cpp +117 -0
- data/ext/sources/examples/talk-llama/models/qwen3.cpp +117 -0
- data/ext/sources/examples/talk-llama/models/qwen3moe.cpp +124 -0
- data/ext/sources/examples/talk-llama/models/qwen3next.cpp +873 -0
- data/ext/sources/examples/talk-llama/models/qwen3vl-moe.cpp +149 -0
- data/ext/sources/examples/talk-llama/models/qwen3vl.cpp +141 -0
- data/ext/sources/examples/talk-llama/models/refact.cpp +94 -0
- data/ext/sources/examples/talk-llama/models/rnd1.cpp +126 -0
- data/ext/sources/examples/talk-llama/models/rwkv6-base.cpp +162 -0
- data/ext/sources/examples/talk-llama/models/rwkv6.cpp +94 -0
- data/ext/sources/examples/talk-llama/models/rwkv6qwen2.cpp +86 -0
- data/ext/sources/examples/talk-llama/models/rwkv7-base.cpp +135 -0
- data/ext/sources/examples/talk-llama/models/rwkv7.cpp +90 -0
- data/ext/sources/examples/talk-llama/models/seed-oss.cpp +124 -0
- data/ext/sources/examples/talk-llama/models/smallthinker.cpp +126 -0
- data/ext/sources/examples/talk-llama/models/smollm3.cpp +128 -0
- data/ext/sources/examples/talk-llama/models/stablelm.cpp +146 -0
- data/ext/sources/examples/talk-llama/models/starcoder.cpp +100 -0
- data/ext/sources/examples/talk-llama/models/starcoder2.cpp +121 -0
- data/ext/sources/examples/talk-llama/models/t5-dec.cpp +166 -0
- data/ext/sources/examples/talk-llama/models/t5-enc.cpp +96 -0
- data/ext/sources/examples/talk-llama/models/wavtokenizer-dec.cpp +149 -0
- data/ext/sources/examples/talk-llama/models/xverse.cpp +108 -0
- data/ext/sources/examples/talk-llama/talk-llama.cpp +9 -6
- data/ext/sources/examples/talk-llama/unicode.cpp +309 -16
- data/ext/sources/examples/talk-llama/unicode.h +45 -0
- data/ext/sources/examples/vad-speech-segments/CMakeLists.txt +1 -1
- data/ext/sources/examples/wchess/wchess.cmd/wchess.cmd.cpp +4 -2
- data/ext/sources/examples/whisper.wasm/index-tmpl.html +18 -17
- data/ext/sources/ggml/CMakeLists.txt +135 -79
- data/ext/sources/ggml/cmake/ggml-config.cmake.in +132 -93
- data/ext/sources/ggml/include/ggml-alloc.h +9 -0
- data/ext/sources/ggml/include/ggml-backend.h +21 -2
- data/ext/sources/ggml/include/ggml-cpu.h +2 -1
- data/ext/sources/ggml/include/ggml-hexagon.h +19 -0
- data/ext/sources/ggml/include/ggml-metal.h +1 -6
- data/ext/sources/ggml/include/ggml-opt.h +25 -6
- data/ext/sources/ggml/include/ggml-rpc.h +8 -11
- data/ext/sources/ggml/include/ggml-webgpu.h +19 -0
- data/ext/sources/ggml/include/ggml-zdnn.h +17 -0
- data/ext/sources/ggml/include/ggml-zendnn.h +22 -0
- data/ext/sources/ggml/include/ggml.h +406 -23
- data/ext/sources/ggml/src/CMakeLists.txt +99 -13
- data/ext/sources/ggml/src/ggml-alloc.c +368 -161
- data/ext/sources/ggml/src/ggml-backend-impl.h +5 -5
- data/ext/sources/ggml/src/ggml-backend-reg.cpp +55 -14
- data/ext/sources/ggml/src/ggml-backend.cpp +290 -57
- data/ext/sources/ggml/src/ggml-blas/CMakeLists.txt +17 -3
- data/ext/sources/ggml/src/ggml-blas/ggml-blas.cpp +10 -13
- data/ext/sources/ggml/src/ggml-cann/CMakeLists.txt +14 -0
- data/ext/sources/ggml/src/ggml-cann/acl_tensor.cpp +59 -45
- data/ext/sources/ggml/src/ggml-cann/acl_tensor.h +138 -47
- data/ext/sources/ggml/src/ggml-cann/aclnn_ops.cpp +2586 -1917
- data/ext/sources/ggml/src/ggml-cann/aclnn_ops.h +348 -309
- data/ext/sources/ggml/src/ggml-cann/common.h +350 -133
- data/ext/sources/ggml/src/ggml-cann/ggml-cann.cpp +894 -625
- data/ext/sources/ggml/src/ggml-common.h +17 -0
- data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +167 -75
- data/ext/sources/ggml/src/ggml-cpu/amx/amx.cpp +5 -2
- data/ext/sources/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +4 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/arm/quants.c +560 -622
- data/ext/sources/ggml/src/ggml-cpu/arch/arm/repack.cpp +1002 -270
- data/ext/sources/ggml/src/ggml-cpu/arch/loongarch/quants.c +107 -587
- data/ext/sources/ggml/src/ggml-cpu/arch/powerpc/quants.c +162 -589
- data/ext/sources/ggml/src/ggml-cpu/arch/riscv/cpu-feats.cpp +38 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/riscv/quants.c +373 -486
- data/ext/sources/ggml/src/ggml-cpu/arch/riscv/repack.cpp +3 -58
- data/ext/sources/ggml/src/ggml-cpu/arch/s390/cpu-feats.cpp +50 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/s390/quants.c +521 -353
- data/ext/sources/ggml/src/ggml-cpu/arch/wasm/quants.c +54 -314
- data/ext/sources/ggml/src/ggml-cpu/arch/x86/quants.c +184 -675
- data/ext/sources/ggml/src/ggml-cpu/arch/x86/repack.cpp +4682 -1660
- data/ext/sources/ggml/src/ggml-cpu/arch-fallback.h +82 -4
- data/ext/sources/ggml/src/ggml-cpu/common.h +14 -0
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-impl.h +18 -9
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +263 -111
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.cpp +39 -28
- data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.cpp +683 -82
- data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.h +38 -43
- data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +435 -119
- data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm-ppc.h +333 -0
- data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1234 -1182
- data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.h +6 -0
- data/ext/sources/ggml/src/ggml-cpu/ops.cpp +2167 -1480
- data/ext/sources/ggml/src/ggml-cpu/ops.h +10 -12
- data/ext/sources/ggml/src/ggml-cpu/quants.c +35 -0
- data/ext/sources/ggml/src/ggml-cpu/quants.h +8 -0
- data/ext/sources/ggml/src/ggml-cpu/repack.cpp +1132 -81
- data/ext/sources/ggml/src/ggml-cpu/repack.h +36 -0
- data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +120 -93
- data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.cpp +1025 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.h +13 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp +3196 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/ime_kernels.h +26 -0
- data/ext/sources/ggml/src/ggml-cpu/traits.cpp +2 -2
- data/ext/sources/ggml/src/ggml-cpu/traits.h +1 -1
- data/ext/sources/ggml/src/ggml-cpu/unary-ops.cpp +151 -0
- data/ext/sources/ggml/src/ggml-cpu/unary-ops.h +7 -0
- data/ext/sources/ggml/src/ggml-cpu/vec.cpp +294 -27
- data/ext/sources/ggml/src/ggml-cpu/vec.h +606 -48
- data/ext/sources/ggml/src/ggml-cuda/CMakeLists.txt +92 -17
- data/ext/sources/ggml/src/ggml-cuda/add-id.cu +58 -0
- data/ext/sources/ggml/src/ggml-cuda/add-id.cuh +3 -0
- data/ext/sources/ggml/src/ggml-cuda/argmax.cu +2 -2
- data/ext/sources/ggml/src/ggml-cuda/argsort.cu +123 -6
- data/ext/sources/ggml/src/ggml-cuda/argsort.cuh +16 -0
- data/ext/sources/ggml/src/ggml-cuda/binbcast.cu +330 -191
- data/ext/sources/ggml/src/ggml-cuda/binbcast.cuh +2 -0
- data/ext/sources/ggml/src/ggml-cuda/common.cuh +588 -128
- data/ext/sources/ggml/src/ggml-cuda/conv-transpose-1d.cu +1 -4
- data/ext/sources/ggml/src/ggml-cuda/conv2d.cu +166 -0
- data/ext/sources/ggml/src/ggml-cuda/conv2d.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/convert.cu +95 -22
- data/ext/sources/ggml/src/ggml-cuda/convert.cuh +25 -0
- data/ext/sources/ggml/src/ggml-cuda/cpy-utils.cuh +217 -0
- data/ext/sources/ggml/src/ggml-cuda/cpy.cu +335 -485
- data/ext/sources/ggml/src/ggml-cuda/cpy.cuh +1 -5
- data/ext/sources/ggml/src/ggml-cuda/cross-entropy-loss.cu +2 -14
- data/ext/sources/ggml/src/ggml-cuda/cumsum.cu +307 -0
- data/ext/sources/ggml/src/ggml-cuda/cumsum.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/dequantize.cuh +14 -40
- data/ext/sources/ggml/src/ggml-cuda/diag.cu +77 -0
- data/ext/sources/ggml/src/ggml-cuda/diag.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +519 -378
- data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +750 -637
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cu +49 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cuh +1244 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn-vec.cuh +586 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +98 -61
- data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cuh +48 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn.cu +230 -197
- data/ext/sources/ggml/src/ggml-cuda/fattn.cuh +2 -0
- data/ext/sources/ggml/src/ggml-cuda/fill.cu +37 -0
- data/ext/sources/ggml/src/ggml-cuda/fill.cuh +3 -0
- data/ext/sources/ggml/src/ggml-cuda/getrows.cu +50 -39
- data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +1557 -294
- data/ext/sources/ggml/src/ggml-cuda/im2col.cu +196 -35
- data/ext/sources/ggml/src/ggml-cuda/im2col.cuh +1 -0
- data/ext/sources/ggml/src/ggml-cuda/mean.cu +57 -2
- data/ext/sources/ggml/src/ggml-cuda/mma.cuh +915 -69
- data/ext/sources/ggml/src/ggml-cuda/mmf.cu +171 -0
- data/ext/sources/ggml/src/ggml-cuda/mmf.cuh +835 -0
- data/ext/sources/ggml/src/ggml-cuda/mmid.cu +164 -0
- data/ext/sources/ggml/src/ggml-cuda/mmid.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/mmq.cu +109 -67
- data/ext/sources/ggml/src/ggml-cuda/mmq.cuh +1601 -733
- data/ext/sources/ggml/src/ggml-cuda/mmvf.cu +802 -0
- data/ext/sources/ggml/src/ggml-cuda/mmvf.cuh +12 -0
- data/ext/sources/ggml/src/ggml-cuda/mmvq.cu +286 -149
- data/ext/sources/ggml/src/ggml-cuda/mmvq.cuh +1 -1
- data/ext/sources/ggml/src/ggml-cuda/norm.cu +284 -12
- data/ext/sources/ggml/src/ggml-cuda/norm.cuh +7 -0
- data/ext/sources/ggml/src/ggml-cuda/opt-step-sgd.cu +49 -0
- data/ext/sources/ggml/src/ggml-cuda/opt-step-sgd.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/pad.cu +86 -32
- data/ext/sources/ggml/src/ggml-cuda/pad_reflect_1d.cu +91 -0
- data/ext/sources/ggml/src/ggml-cuda/pad_reflect_1d.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/quantize.cu +163 -10
- data/ext/sources/ggml/src/ggml-cuda/quantize.cuh +14 -0
- data/ext/sources/ggml/src/ggml-cuda/reduce_rows.cuh +53 -0
- data/ext/sources/ggml/src/ggml-cuda/roll.cu +67 -0
- data/ext/sources/ggml/src/ggml-cuda/roll.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/rope.cu +207 -98
- data/ext/sources/ggml/src/ggml-cuda/rope.cuh +2 -0
- data/ext/sources/ggml/src/ggml-cuda/scale.cu +14 -11
- data/ext/sources/ggml/src/ggml-cuda/set-rows.cu +330 -0
- data/ext/sources/ggml/src/ggml-cuda/set-rows.cuh +7 -0
- data/ext/sources/ggml/src/ggml-cuda/set.cu +39 -0
- data/ext/sources/ggml/src/ggml-cuda/set.cuh +7 -0
- data/ext/sources/ggml/src/ggml-cuda/softcap.cu +34 -0
- data/ext/sources/ggml/src/ggml-cuda/softcap.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/softmax.cu +325 -61
- data/ext/sources/ggml/src/ggml-cuda/solve_tri.cu +275 -0
- data/ext/sources/ggml/src/ggml-cuda/solve_tri.cuh +3 -0
- data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cu +14 -12
- data/ext/sources/ggml/src/ggml-cuda/ssm-scan.cu +291 -104
- data/ext/sources/ggml/src/ggml-cuda/sum.cu +6 -10
- data/ext/sources/ggml/src/ggml-cuda/sumrows.cu +21 -4
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq112-dv112.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq128-dv128.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq256-dv256.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq40-dv40.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq576-dv512.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq64-dv64.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq72-dv72.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq80-dv80.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq96-dv96.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-f16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q4_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q4_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q5_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q5_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q8_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-f16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q5_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q5_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q8_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-f16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q4_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q4_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q5_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q5_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q8_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-f16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q4_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q4_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q5_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q5_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q8_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-f16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q4_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q4_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q5_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q5_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q8_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-f16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q4_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q4_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q5_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q5_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q8_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +40 -19
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_10.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_11.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_12.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_13.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_14.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_15.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_16.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_2.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_3.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_4.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_5.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_6.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_7.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_8.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_9.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-mxfp4.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/top-k.cu +96 -0
- data/ext/sources/ggml/src/ggml-cuda/top-k.cuh +3 -0
- data/ext/sources/ggml/src/ggml-cuda/topk-moe.cu +351 -0
- data/ext/sources/ggml/src/ggml-cuda/topk-moe.cuh +21 -0
- data/ext/sources/ggml/src/ggml-cuda/tri.cu +136 -0
- data/ext/sources/ggml/src/ggml-cuda/tri.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/tsembd.cu +3 -3
- data/ext/sources/ggml/src/ggml-cuda/unary.cu +189 -5
- data/ext/sources/ggml/src/ggml-cuda/unary.cuh +44 -0
- data/ext/sources/ggml/src/ggml-cuda/upscale.cu +248 -6
- data/ext/sources/ggml/src/ggml-cuda/vecdotq.cuh +110 -22
- data/ext/sources/ggml/src/ggml-cuda/vendors/cuda.h +8 -0
- data/ext/sources/ggml/src/ggml-cuda/vendors/hip.h +70 -37
- data/ext/sources/ggml/src/ggml-cuda/vendors/musa.h +10 -3
- data/ext/sources/ggml/src/ggml-hexagon/CMakeLists.txt +80 -0
- data/ext/sources/ggml/src/ggml-hexagon/ggml-hexagon.cpp +3151 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/CMakeLists.txt +44 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/act-ops.c +682 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/binary-ops.c +360 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/cmake-toolchain.cmake +157 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/flash-attn-ops.c +566 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/get-rows-ops.c +112 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ctx.h +35 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp-dma.c +63 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp-dma.h +157 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp-msg.h +165 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ops.h +92 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp_iface.idl +16 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-exp.c +94 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-inverse.c +72 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sigmoid.c +49 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-utils.c +1020 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-utils.h +1353 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/main.c +1001 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/matmul-ops.c +2503 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/ops-utils.h +149 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/rope-ops.c +487 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/set-rows-ops.c +168 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/softmax-ops.c +402 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/unary-ops.c +287 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/worker-pool.c +297 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/worker-pool.h +57 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp-utils.c +454 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp-utils.h +221 -0
- data/ext/sources/ggml/src/ggml-hexagon/op-desc.h +153 -0
- data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +16 -13
- data/ext/sources/ggml/src/ggml-impl.h +186 -15
- data/ext/sources/ggml/src/ggml-metal/CMakeLists.txt +10 -7
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-common.cpp +446 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-common.h +52 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.h +33 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.m +609 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.cpp +1743 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.h +273 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.m +1686 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +356 -61
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.cpp +4161 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.h +94 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal.cpp +724 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal.metal +4495 -1876
- data/ext/sources/ggml/src/ggml-musa/CMakeLists.txt +21 -9
- data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +29 -0
- data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +4005 -427
- data/ext/sources/ggml/src/ggml-opencl/kernels/add.cl +107 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/add_id.cl +42 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/conv2d.cl +185 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/conv2d_f16_f32.cl +176 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/cvt.cl +147 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/div.cl +66 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/expm1.cl +82 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/fill.cl +17 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/flash_attn_f16.cl +370 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/flash_attn_f32.cl +371 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/flash_attn_f32_f16.cl +373 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gelu.cl +27 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_mxfp4_f32.cl +162 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_mxfp4_f32.cl +156 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/get_rows.cl +36 -12
- data/ext/sources/ggml/src/ggml-opencl/kernels/glu.cl +177 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/group_norm.cl +49 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/im2col_f16.cl +1 -1
- data/ext/sources/ggml/src/ggml-opencl/kernels/im2col_f32.cl +1 -1
- data/ext/sources/ggml/src/ggml-opencl/kernels/mean.cl +39 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul.cl +73 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mat_f16_f32.cl +130 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_kq_kqv.cl +273 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_l4_lm.cl +146 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_f32_f32_l4_lm.cl +147 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_l4_lm.cl +154 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32.cl +189 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32_flat.cl +176 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32.cl +140 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32_flat.cl +222 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32.cl +144 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32_flat.cl +167 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32.cl +125 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32_flat.cl +202 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/norm.cl +80 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/pad.cl +29 -20
- data/ext/sources/ggml/src/ggml-opencl/kernels/rms_norm.cl +94 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/rope.cl +50 -24
- data/ext/sources/ggml/src/ggml-opencl/kernels/scale.cl +3 -2
- data/ext/sources/ggml/src/ggml-opencl/kernels/set_rows.cl +208 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_4_f16.cl +34 -13
- data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_4_f32.cl +34 -13
- data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_f16.cl +34 -13
- data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_f32.cl +34 -13
- data/ext/sources/ggml/src/ggml-opencl/kernels/softplus.cl +88 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/sqr.cl +53 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/sqrt.cl +53 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/ssm_conv.cl +77 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/sub.cl +66 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/transpose.cl +33 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/tsembd.cl +2 -2
- data/ext/sources/ggml/src/ggml-opencl/kernels/upscale.cl +2 -3
- data/ext/sources/ggml/src/ggml-opt.cpp +97 -41
- data/ext/sources/ggml/src/ggml-quants.c +111 -16
- data/ext/sources/ggml/src/ggml-quants.h +6 -0
- data/ext/sources/ggml/src/ggml-rpc/ggml-rpc.cpp +497 -195
- data/ext/sources/ggml/src/ggml-sycl/CMakeLists.txt +48 -3
- data/ext/sources/ggml/src/ggml-sycl/add-id.cpp +77 -0
- data/ext/sources/ggml/src/ggml-sycl/add-id.hpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/backend.hpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/binbcast.cpp +6 -5
- data/ext/sources/ggml/src/ggml-sycl/common.hpp +117 -15
- data/ext/sources/ggml/src/ggml-sycl/concat.cpp +50 -30
- data/ext/sources/ggml/src/ggml-sycl/conv.cpp +10 -4
- data/ext/sources/ggml/src/ggml-sycl/convert.cpp +200 -99
- data/ext/sources/ggml/src/ggml-sycl/count-equal.cpp +79 -0
- data/ext/sources/ggml/src/ggml-sycl/count-equal.hpp +9 -0
- data/ext/sources/ggml/src/ggml-sycl/cpy.cpp +72 -309
- data/ext/sources/ggml/src/ggml-sycl/cpy.hpp +213 -1
- data/ext/sources/ggml/src/ggml-sycl/dequantize.hpp +18 -0
- data/ext/sources/ggml/src/ggml-sycl/dmmv.cpp +67 -49
- data/ext/sources/ggml/src/ggml-sycl/dpct/helper.hpp +77 -34
- data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +397 -314
- data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +12 -2
- data/ext/sources/ggml/src/ggml-sycl/gemm.hpp +14 -26
- data/ext/sources/ggml/src/ggml-sycl/getrows.cpp +9 -6
- data/ext/sources/ggml/src/ggml-sycl/ggml-sycl.cpp +643 -413
- data/ext/sources/ggml/src/ggml-sycl/gla.cpp +2 -2
- data/ext/sources/ggml/src/ggml-sycl/im2col.cpp +2 -2
- data/ext/sources/ggml/src/ggml-sycl/mmq.cpp +80 -60
- data/ext/sources/ggml/src/ggml-sycl/mmvq.cpp +223 -132
- data/ext/sources/ggml/src/ggml-sycl/norm.cpp +230 -55
- data/ext/sources/ggml/src/ggml-sycl/norm.hpp +2 -0
- data/ext/sources/ggml/src/ggml-sycl/pad.cpp +97 -0
- data/ext/sources/ggml/src/ggml-sycl/pad.hpp +24 -0
- data/ext/sources/ggml/src/ggml-sycl/pad_reflect_1d.cpp +100 -0
- data/ext/sources/ggml/src/ggml-sycl/pad_reflect_1d.hpp +10 -0
- data/ext/sources/ggml/src/ggml-sycl/presets.hpp +2 -0
- data/ext/sources/ggml/src/ggml-sycl/quantize.hpp +133 -0
- data/ext/sources/ggml/src/ggml-sycl/quants.hpp +8 -9
- data/ext/sources/ggml/src/ggml-sycl/repeat_back.cpp +76 -0
- data/ext/sources/ggml/src/ggml-sycl/repeat_back.hpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/roll.cpp +122 -0
- data/ext/sources/ggml/src/ggml-sycl/roll.hpp +20 -0
- data/ext/sources/ggml/src/ggml-sycl/rope.cpp +65 -59
- data/ext/sources/ggml/src/ggml-sycl/set.cpp +73 -0
- data/ext/sources/ggml/src/ggml-sycl/set.hpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/set_rows.cpp +234 -0
- data/ext/sources/ggml/src/ggml-sycl/set_rows.hpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/softmax.cpp +330 -165
- data/ext/sources/ggml/src/ggml-sycl/softmax.hpp +4 -0
- data/ext/sources/ggml/src/ggml-sycl/ssm_conv.cpp +127 -0
- data/ext/sources/ggml/src/ggml-sycl/ssm_conv.hpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/tsembd.cpp +12 -6
- data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +60 -6
- data/ext/sources/ggml/src/ggml-sycl/wkv.cpp +16 -12
- data/ext/sources/ggml/src/ggml-vulkan/CMakeLists.txt +38 -18
- data/ext/sources/ggml/src/ggml-vulkan/ggml-vulkan.cpp +7398 -2635
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/abs.comp +21 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/acc.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add.comp +43 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add1.comp +28 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add_id.comp +42 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/arange.comp +20 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argmax.comp +15 -6
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp +56 -39
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argsort_large.comp +114 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/ceil.comp +22 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/clamp.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/concat.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/contig_copy.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_dw.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +347 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +5 -5
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +67 -13
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_transpose.comp +67 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/cos.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/count_equal.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/count_experts.comp +51 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/cumsum.comp +83 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/cumsum_multipass1.comp +60 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/cumsum_multipass2.comp +66 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_f32.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{dequant_funcs.comp → dequant_funcs.glsl} +158 -16
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{dequant_funcs_cm2.comp → dequant_funcs_cm2.glsl} +38 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{dequant_head.comp → dequant_head.glsl} +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_m.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_s.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_s.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xs.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xxs.comp +3 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_s.comp +7 -6
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_xxs.comp +5 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_nl.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_xs.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_mxfp4.comp +32 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q2_k.comp +4 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q3_k.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_0.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_1.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_k.comp +4 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_0.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_1.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_k.comp +4 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q6_k.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q8_0.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/diag.comp +29 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/diag_mask_inf.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/div.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/exp.comp +21 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/fill.comp +19 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +103 -36
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.glsl +220 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +139 -45
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +113 -38
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +75 -14
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/floor.comp +22 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu_erf.comp +27 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu_quick.comp +11 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gelu.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gelu_erf.comp +39 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gelu_quick.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{generic_binary_head.comp → generic_binary_head.glsl} +19 -17
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{generic_head.comp → generic_head.glsl} +2 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{generic_unary_head.comp → generic_unary_head.glsl} +7 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/get_rows.comp +21 -12
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/get_rows_quant.comp +28 -18
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{glu_head.comp → glu_head.glsl} +4 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/group_norm.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/hardsigmoid.comp +22 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/hardswish.comp +22 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +33 -17
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col_3d.comp +125 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/l2_norm.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/leaky_relu.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/log.comp +18 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.glsl +227 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iface.glsl +35 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_m.comp +71 -21
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_s.comp +41 -25
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_s.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xs.comp +44 -26
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xxs.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_s.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_xxs.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_nc.comp +20 -14
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_p021.comp +9 -7
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp +4 -6
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q3_k.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp +4 -6
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp +4 -6
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp +143 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq_funcs.glsl +494 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +144 -556
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +230 -51
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.glsl +566 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_id_funcs.glsl +72 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +90 -223
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.glsl +454 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_shmem_types.glsl +78 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp +195 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/neg.comp +20 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/norm.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_adamw.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_sgd.comp +22 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/pad.comp +41 -5
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/pool2d.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/quantize_q8_1.comp +59 -9
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/reglu.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/relu.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/repeat.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/repeat_back.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +104 -14
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_back.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_partials.comp +65 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/roll.comp +46 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl +234 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.glsl +20 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +6 -52
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +6 -35
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +6 -35
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl +28 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_vision.comp +6 -39
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/round.comp +29 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rte.glsl +5 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp +3 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sigmoid.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/silu.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/silu_back.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sin.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +30 -8
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_back.comp +6 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large1.comp +62 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large2.comp +79 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large3.comp +65 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large_common.glsl +53 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/softplus.comp +23 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/solve_tri.comp +81 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sqrt.comp +17 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/square.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/ssm_conv.comp +44 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/ssm_scan.comp +124 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/step.comp +22 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sub.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.comp +16 -6
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.glsl +25 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/swiglu.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/swiglu_oai.comp +14 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/tanh.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/timestep_embedding.comp +5 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/topk_argsort.comp +118 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/topk_moe.comp +213 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/topk_nary_search.comp +246 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/tri.comp +43 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/trunc.comp +22 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{types.comp → types.glsl} +435 -24
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +148 -6
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/utils.glsl +25 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +619 -177
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/xielu.comp +35 -0
- data/ext/sources/ggml/src/ggml-webgpu/CMakeLists.txt +80 -0
- data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp +169 -0
- data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu.cpp +3087 -0
- data/ext/sources/ggml/src/ggml-webgpu/pre_wgsl.hpp +778 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/bin_op.tmpl.wgsl +188 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary_head.tmpl +45 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +930 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cpy.tmpl.wgsl +101 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +147 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn.wgsl +591 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/get_rows.tmpl.wgsl +874 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/glu.tmpl.wgsl +323 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl +40 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl +907 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl +97 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.tmpl.wgsl +247 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_subgroup_matrix.tmpl.wgsl +302 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.tmpl.wgsl +267 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm.wgsl +123 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rope.tmpl.wgsl +295 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/scale.tmpl.wgsl +90 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.tmpl.wgsl +112 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +81 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/soft_max.tmpl.wgsl +345 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/unary_op.wgsl +483 -0
- data/ext/sources/ggml/src/ggml-zdnn/CMakeLists.txt +36 -0
- data/ext/sources/ggml/src/ggml-zdnn/common.hpp +59 -0
- data/ext/sources/ggml/src/ggml-zdnn/ggml-zdnn.cpp +628 -0
- data/ext/sources/ggml/src/ggml-zdnn/mmf.cpp +80 -0
- data/ext/sources/ggml/src/ggml-zdnn/mmf.hpp +12 -0
- data/ext/sources/ggml/src/ggml-zdnn/utils.cpp +79 -0
- data/ext/sources/ggml/src/ggml-zdnn/utils.hpp +19 -0
- data/ext/sources/ggml/src/ggml-zendnn/CMakeLists.txt +92 -0
- data/ext/sources/ggml/src/ggml-zendnn/ggml-zendnn.cpp +466 -0
- data/ext/sources/ggml/src/ggml.c +901 -129
- data/ext/sources/ggml/src/gguf.cpp +8 -1
- data/ext/sources/include/whisper.h +1 -0
- data/ext/sources/src/CMakeLists.txt +3 -1
- data/ext/sources/src/whisper.cpp +124 -81
- data/ext/sources/tests/CMakeLists.txt +8 -1
- data/ext/sources/tests/test-vad-full.cpp +7 -5
- data/ext/sources/tests/test-vad.cpp +3 -3
- data/extsources.rb +1 -0
- data/lib/whisper/model/uri.rb +17 -18
- data/sig/whisper.rbs +126 -2
- data/test/test_params.rb +24 -8
- data/test/test_segment.rb +0 -1
- data/test/test_token.rb +70 -0
- data/test/test_vad.rb +1 -1
- data/test/test_vad_context.rb +50 -0
- data/test/test_vad_segment.rb +19 -0
- data/test/test_vad_segments.rb +16 -0
- data/test/test_whisper.rb +8 -1
- data/whispercpp.gemspec +1 -1
- metadata +439 -179
- data/ext/sources/build-xcframework.sh +0 -547
- data/ext/sources/examples/talk-llama/llama-kv-cache-unified-iswa.cpp +0 -279
- data/ext/sources/examples/talk-llama/llama-kv-cache-unified.cpp +0 -1841
- data/ext/sources/examples/talk-llama/llama-kv-cache-unified.h +0 -303
- data/ext/sources/ggml/include/ggml-kompute.h +0 -50
- data/ext/sources/ggml/src/ggml-amx/CMakeLists.txt +0 -107
- data/ext/sources/ggml/src/ggml-amx/common.h +0 -94
- data/ext/sources/ggml/src/ggml-amx/ggml-amx.cpp +0 -446
- data/ext/sources/ggml/src/ggml-amx/mmq.cpp +0 -2510
- data/ext/sources/ggml/src/ggml-amx/mmq.h +0 -17
- data/ext/sources/ggml/src/ggml-cann/Doxyfile +0 -2579
- data/ext/sources/ggml/src/ggml-cann/kernels/CMakeLists.txt +0 -30
- data/ext/sources/ggml/src/ggml-cann/kernels/ascendc_kernels.h +0 -19
- data/ext/sources/ggml/src/ggml-cann/kernels/dup.cpp +0 -234
- data/ext/sources/ggml/src/ggml-cann/kernels/get_row_f16.cpp +0 -197
- data/ext/sources/ggml/src/ggml-cann/kernels/get_row_f32.cpp +0 -190
- data/ext/sources/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -204
- data/ext/sources/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -191
- data/ext/sources/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -218
- data/ext/sources/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -216
- data/ext/sources/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -295
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f16.cu +0 -357
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f16.cuh +0 -3
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f32.cu +0 -365
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f32.cuh +0 -3
- data/ext/sources/ggml/src/ggml-cuda/fattn-vec-f16.cuh +0 -482
- data/ext/sources/ggml/src/ggml-cuda/fattn-vec-f32.cuh +0 -472
- data/ext/sources/ggml/src/ggml-cuda/mmv.cu +0 -506
- data/ext/sources/ggml/src/ggml-cuda/mmv.cuh +0 -11
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-kompute/CMakeLists.txt +0 -166
- data/ext/sources/ggml/src/ggml-kompute/ggml-kompute.cpp +0 -2251
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/common.comp +0 -112
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_add.comp +0 -58
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_addrow.comp +0 -25
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f16.comp +0 -52
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f32.comp +0 -52
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f16.comp +0 -52
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f32.comp +0 -52
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_diagmask.comp +0 -30
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_gelu.comp +0 -22
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows.comp +0 -17
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f16.comp +0 -31
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f32.comp +0 -31
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_0.comp +0 -38
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_1.comp +0 -39
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q6_k.comp +0 -44
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul.comp +0 -52
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_f16.comp +0 -69
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_mat_f32.comp +0 -51
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_0.comp +0 -33
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_1.comp +0 -35
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_k.comp +0 -140
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q6_k.comp +0 -106
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q8_0.comp +0 -73
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n.comp +0 -52
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n_pre.comp +0 -28
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_norm.comp +0 -84
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_relu.comp +0 -21
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rmsnorm.comp +0 -53
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f16.comp +0 -52
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f32.comp +0 -52
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f16.comp +0 -52
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f32.comp +0 -52
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_scale.comp +0 -19
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_scale_8.comp +0 -23
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_silu.comp +0 -22
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_softmax.comp +0 -72
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/rope_common.comp +0 -71
- data/ext/sources/ggml/src/ggml-metal/ggml-metal.m +0 -6280
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp +0 -162
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.comp +0 -118
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.comp +0 -99
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.comp +0 -58
- /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{test_bfloat16_support.comp → feature-tests/bfloat16.comp} +0 -0
- /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{test_coopmat_support.comp → feature-tests/coopmat.comp} +0 -0
- /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{test_coopmat2_support.comp → feature-tests/coopmat2.comp} +0 -0
- /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{test_integer_dot_support.comp → feature-tests/integer_dot.comp} +0 -0
- /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{glu_main.comp → glu_main.glsl} +0 -0
|
@@ -19,9 +19,8 @@
|
|
|
19
19
|
#include <stdio.h>
|
|
20
20
|
#include <stdlib.h>
|
|
21
21
|
#include <string.h>
|
|
22
|
-
#include <string>
|
|
23
|
-
#include <vector>
|
|
24
22
|
#include <algorithm>
|
|
23
|
+
#include <vector>
|
|
25
24
|
|
|
26
25
|
#ifdef __APPLE__
|
|
27
26
|
#include <sys/types.h>
|
|
@@ -32,23 +31,26 @@
|
|
|
32
31
|
// backend buffer type
|
|
33
32
|
|
|
34
33
|
const char * ggml_backend_buft_name(ggml_backend_buffer_type_t buft) {
|
|
34
|
+
GGML_ASSERT(buft);
|
|
35
35
|
return buft->iface.get_name(buft);
|
|
36
36
|
}
|
|
37
37
|
|
|
38
38
|
ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
|
39
|
+
GGML_ASSERT(buft);
|
|
39
40
|
if (size == 0) {
|
|
40
41
|
// return a dummy buffer for zero-sized allocations
|
|
41
42
|
return ggml_backend_buffer_init(buft, {}, NULL, 0);
|
|
42
43
|
}
|
|
43
|
-
|
|
44
44
|
return buft->iface.alloc_buffer(buft, size);
|
|
45
45
|
}
|
|
46
46
|
|
|
47
47
|
size_t ggml_backend_buft_get_alignment(ggml_backend_buffer_type_t buft) {
|
|
48
|
+
GGML_ASSERT(buft);
|
|
48
49
|
return buft->iface.get_alignment(buft);
|
|
49
50
|
}
|
|
50
51
|
|
|
51
52
|
size_t ggml_backend_buft_get_max_size(ggml_backend_buffer_type_t buft) {
|
|
53
|
+
GGML_ASSERT(buft);
|
|
52
54
|
// get_max_size is optional, defaults to SIZE_MAX
|
|
53
55
|
if (buft->iface.get_max_size) {
|
|
54
56
|
return buft->iface.get_max_size(buft);
|
|
@@ -57,6 +59,7 @@ size_t ggml_backend_buft_get_max_size(ggml_backend_buffer_type_t buft) {
|
|
|
57
59
|
}
|
|
58
60
|
|
|
59
61
|
size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor) {
|
|
62
|
+
GGML_ASSERT(buft);
|
|
60
63
|
// get_alloc_size is optional, defaults to ggml_nbytes
|
|
61
64
|
if (buft->iface.get_alloc_size) {
|
|
62
65
|
size_t size = buft->iface.get_alloc_size(buft, tensor);
|
|
@@ -67,6 +70,7 @@ size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, const s
|
|
|
67
70
|
}
|
|
68
71
|
|
|
69
72
|
bool ggml_backend_buft_is_host(ggml_backend_buffer_type_t buft) {
|
|
73
|
+
GGML_ASSERT(buft);
|
|
70
74
|
if (buft->iface.is_host) {
|
|
71
75
|
return buft->iface.is_host(buft);
|
|
72
76
|
}
|
|
@@ -74,6 +78,7 @@ bool ggml_backend_buft_is_host(ggml_backend_buffer_type_t buft) {
|
|
|
74
78
|
}
|
|
75
79
|
|
|
76
80
|
ggml_backend_dev_t ggml_backend_buft_get_device(ggml_backend_buffer_type_t buft) {
|
|
81
|
+
GGML_ASSERT(buft);
|
|
77
82
|
return buft->device;
|
|
78
83
|
}
|
|
79
84
|
|
|
@@ -111,15 +116,23 @@ void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
|
|
|
111
116
|
}
|
|
112
117
|
|
|
113
118
|
size_t ggml_backend_buffer_get_size(ggml_backend_buffer_t buffer) {
|
|
119
|
+
GGML_ASSERT(buffer);
|
|
114
120
|
return buffer->size;
|
|
115
121
|
}
|
|
116
122
|
|
|
117
123
|
void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
|
|
124
|
+
GGML_ASSERT(buffer);
|
|
118
125
|
// get_base is optional if the buffer is zero-sized
|
|
119
126
|
if (buffer->size == 0) {
|
|
120
127
|
return NULL;
|
|
121
128
|
}
|
|
122
129
|
|
|
130
|
+
// FIXME JG: a multi_buffer has a non-zero size, according to the above comment get_base is not optional,
|
|
131
|
+
// I don't know whether the above comment is correct
|
|
132
|
+
if (!buffer->iface.get_base) {
|
|
133
|
+
return NULL;
|
|
134
|
+
}
|
|
135
|
+
|
|
123
136
|
void * base = buffer->iface.get_base(buffer);
|
|
124
137
|
|
|
125
138
|
GGML_ASSERT(base != NULL && "backend buffer base cannot be NULL");
|
|
@@ -128,6 +141,7 @@ void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
|
|
|
128
141
|
}
|
|
129
142
|
|
|
130
143
|
enum ggml_status ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
|
|
144
|
+
GGML_ASSERT(buffer);
|
|
131
145
|
// init_tensor is optional
|
|
132
146
|
if (buffer->iface.init_tensor) {
|
|
133
147
|
return buffer->iface.init_tensor(buffer, tensor);
|
|
@@ -136,6 +150,7 @@ enum ggml_status ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, s
|
|
|
136
150
|
}
|
|
137
151
|
|
|
138
152
|
void ggml_backend_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
|
|
153
|
+
GGML_ASSERT(buffer);
|
|
139
154
|
// clear is optional if the buffer is zero-sized
|
|
140
155
|
if (buffer->size == 0) {
|
|
141
156
|
return;
|
|
@@ -161,6 +176,7 @@ bool ggml_backend_buffer_is_host(ggml_backend_buffer_t buffer) {
|
|
|
161
176
|
}
|
|
162
177
|
|
|
163
178
|
void ggml_backend_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) {
|
|
179
|
+
GGML_ASSERT(buffer);
|
|
164
180
|
buffer->usage = usage;
|
|
165
181
|
|
|
166
182
|
// FIXME: add a generic callback to the buffer interface
|
|
@@ -170,14 +186,17 @@ void ggml_backend_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backe
|
|
|
170
186
|
}
|
|
171
187
|
|
|
172
188
|
enum ggml_backend_buffer_usage ggml_backend_buffer_get_usage(ggml_backend_buffer_t buffer) {
|
|
189
|
+
GGML_ASSERT(buffer);
|
|
173
190
|
return buffer->usage;
|
|
174
191
|
}
|
|
175
192
|
|
|
176
193
|
ggml_backend_buffer_type_t ggml_backend_buffer_get_type(ggml_backend_buffer_t buffer) {
|
|
194
|
+
GGML_ASSERT(buffer);
|
|
177
195
|
return buffer->buft;
|
|
178
196
|
}
|
|
179
197
|
|
|
180
198
|
void ggml_backend_buffer_reset(ggml_backend_buffer_t buffer) {
|
|
199
|
+
GGML_ASSERT(buffer);
|
|
181
200
|
if (buffer->iface.reset) {
|
|
182
201
|
buffer->iface.reset(buffer);
|
|
183
202
|
}
|
|
@@ -216,6 +235,7 @@ void ggml_backend_free(ggml_backend_t backend) {
|
|
|
216
235
|
}
|
|
217
236
|
|
|
218
237
|
ggml_backend_buffer_type_t ggml_backend_get_default_buffer_type(ggml_backend_t backend) {
|
|
238
|
+
GGML_ASSERT(backend);
|
|
219
239
|
return ggml_backend_dev_buffer_type(backend->device);
|
|
220
240
|
}
|
|
221
241
|
|
|
@@ -232,6 +252,8 @@ size_t ggml_backend_get_max_size(ggml_backend_t backend) {
|
|
|
232
252
|
}
|
|
233
253
|
|
|
234
254
|
void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
|
255
|
+
GGML_ASSERT(backend);
|
|
256
|
+
GGML_ASSERT(tensor);
|
|
235
257
|
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
|
|
236
258
|
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
|
|
237
259
|
|
|
@@ -243,6 +265,8 @@ void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor *
|
|
|
243
265
|
}
|
|
244
266
|
|
|
245
267
|
void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
|
268
|
+
GGML_ASSERT(backend);
|
|
269
|
+
GGML_ASSERT(tensor);
|
|
246
270
|
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
|
|
247
271
|
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
|
|
248
272
|
|
|
@@ -284,6 +308,7 @@ void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, siz
|
|
|
284
308
|
}
|
|
285
309
|
|
|
286
310
|
void ggml_backend_tensor_memset(struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
|
|
311
|
+
GGML_ASSERT(tensor);
|
|
287
312
|
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
|
|
288
313
|
|
|
289
314
|
if (size == 0) {
|
|
@@ -299,6 +324,7 @@ void ggml_backend_tensor_memset(struct ggml_tensor * tensor, uint8_t value, size
|
|
|
299
324
|
}
|
|
300
325
|
|
|
301
326
|
void ggml_backend_synchronize(ggml_backend_t backend) {
|
|
327
|
+
GGML_ASSERT(backend);
|
|
302
328
|
if (backend->iface.synchronize == NULL) {
|
|
303
329
|
return;
|
|
304
330
|
}
|
|
@@ -307,18 +333,21 @@ void ggml_backend_synchronize(ggml_backend_t backend) {
|
|
|
307
333
|
}
|
|
308
334
|
|
|
309
335
|
ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
|
|
336
|
+
GGML_ASSERT(backend);
|
|
310
337
|
GGML_ASSERT(backend->iface.graph_plan_create != NULL);
|
|
311
338
|
|
|
312
339
|
return backend->iface.graph_plan_create(backend, cgraph);
|
|
313
340
|
}
|
|
314
341
|
|
|
315
342
|
void ggml_backend_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
|
|
343
|
+
GGML_ASSERT(backend);
|
|
316
344
|
GGML_ASSERT(backend->iface.graph_plan_free != NULL);
|
|
317
345
|
|
|
318
346
|
backend->iface.graph_plan_free(backend, plan);
|
|
319
347
|
}
|
|
320
348
|
|
|
321
349
|
enum ggml_status ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
|
|
350
|
+
GGML_ASSERT(backend);
|
|
322
351
|
GGML_ASSERT(backend->iface.graph_plan_compute != NULL);
|
|
323
352
|
|
|
324
353
|
return backend->iface.graph_plan_compute(backend, plan);
|
|
@@ -331,42 +360,32 @@ enum ggml_status ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_
|
|
|
331
360
|
}
|
|
332
361
|
|
|
333
362
|
enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
|
|
363
|
+
GGML_ASSERT(backend);
|
|
334
364
|
return backend->iface.graph_compute(backend, cgraph);
|
|
335
365
|
}
|
|
336
366
|
|
|
337
367
|
bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
|
|
368
|
+
GGML_ASSERT(backend);
|
|
338
369
|
return ggml_backend_dev_supports_op(backend->device, op);
|
|
339
370
|
}
|
|
340
371
|
|
|
341
372
|
bool ggml_backend_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
|
|
373
|
+
GGML_ASSERT(backend);
|
|
342
374
|
return ggml_backend_dev_supports_buft(backend->device, buft);
|
|
343
375
|
}
|
|
344
376
|
|
|
345
377
|
bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op) {
|
|
378
|
+
GGML_ASSERT(backend);
|
|
346
379
|
return ggml_backend_dev_offload_op(backend->device, op);
|
|
347
380
|
}
|
|
348
381
|
|
|
349
382
|
ggml_backend_dev_t ggml_backend_get_device(ggml_backend_t backend) {
|
|
383
|
+
GGML_ASSERT(backend);
|
|
350
384
|
return backend->device;
|
|
351
385
|
}
|
|
352
386
|
|
|
353
387
|
// backend copy
|
|
354
388
|
|
|
355
|
-
static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
|
|
356
|
-
if (a->type != b->type) {
|
|
357
|
-
return false;
|
|
358
|
-
}
|
|
359
|
-
for (int i = 0; i < GGML_MAX_DIMS; i++) {
|
|
360
|
-
if (a->ne[i] != b->ne[i]) {
|
|
361
|
-
return false;
|
|
362
|
-
}
|
|
363
|
-
if (a->nb[i] != b->nb[i]) {
|
|
364
|
-
return false;
|
|
365
|
-
}
|
|
366
|
-
}
|
|
367
|
-
return true;
|
|
368
|
-
}
|
|
369
|
-
|
|
370
389
|
void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst) {
|
|
371
390
|
GGML_ASSERT(ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts");
|
|
372
391
|
|
|
@@ -397,6 +416,7 @@ void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t b
|
|
|
397
416
|
return;
|
|
398
417
|
}
|
|
399
418
|
|
|
419
|
+
GGML_ASSERT(backend_dst);
|
|
400
420
|
if (backend_dst->iface.cpy_tensor_async != NULL) {
|
|
401
421
|
if (backend_dst->iface.cpy_tensor_async(backend_src, backend_dst, src, dst)) {
|
|
402
422
|
return;
|
|
@@ -428,38 +448,52 @@ void ggml_backend_event_free(ggml_backend_event_t event) {
|
|
|
428
448
|
}
|
|
429
449
|
|
|
430
450
|
void ggml_backend_event_record(ggml_backend_event_t event, ggml_backend_t backend) {
|
|
451
|
+
GGML_ASSERT(backend);
|
|
431
452
|
GGML_ASSERT(backend->iface.event_record != NULL);
|
|
432
453
|
|
|
433
454
|
backend->iface.event_record(backend, event);
|
|
434
455
|
}
|
|
435
456
|
|
|
436
457
|
void ggml_backend_event_synchronize(ggml_backend_event_t event) {
|
|
458
|
+
GGML_ASSERT(event);
|
|
437
459
|
GGML_ASSERT(event->device->iface.event_synchronize);
|
|
438
460
|
|
|
439
461
|
event->device->iface.event_synchronize(event->device, event);
|
|
440
462
|
}
|
|
441
463
|
|
|
442
464
|
void ggml_backend_event_wait(ggml_backend_t backend, ggml_backend_event_t event) {
|
|
465
|
+
GGML_ASSERT(backend);
|
|
443
466
|
GGML_ASSERT(backend->iface.event_wait != NULL);
|
|
444
467
|
|
|
445
468
|
backend->iface.event_wait(backend, event);
|
|
446
469
|
}
|
|
447
470
|
|
|
471
|
+
static void ggml_backend_graph_optimize(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
|
|
472
|
+
GGML_ASSERT(backend);
|
|
473
|
+
if (backend->iface.graph_optimize != NULL) {
|
|
474
|
+
backend->iface.graph_optimize(backend, cgraph);
|
|
475
|
+
}
|
|
476
|
+
}
|
|
477
|
+
|
|
448
478
|
// Backend device
|
|
449
479
|
|
|
450
480
|
const char * ggml_backend_dev_name(ggml_backend_dev_t device) {
|
|
481
|
+
GGML_ASSERT(device);
|
|
451
482
|
return device->iface.get_name(device);
|
|
452
483
|
}
|
|
453
484
|
|
|
454
485
|
const char * ggml_backend_dev_description(ggml_backend_dev_t device) {
|
|
486
|
+
GGML_ASSERT(device);
|
|
455
487
|
return device->iface.get_description(device);
|
|
456
488
|
}
|
|
457
489
|
|
|
458
490
|
void ggml_backend_dev_memory(ggml_backend_dev_t device, size_t * free, size_t * total) {
|
|
491
|
+
GGML_ASSERT(device);
|
|
459
492
|
device->iface.get_memory(device, free, total);
|
|
460
493
|
}
|
|
461
494
|
|
|
462
495
|
enum ggml_backend_dev_type ggml_backend_dev_type(ggml_backend_dev_t device) {
|
|
496
|
+
GGML_ASSERT(device);
|
|
463
497
|
return device->iface.get_type(device);
|
|
464
498
|
}
|
|
465
499
|
|
|
@@ -469,18 +503,22 @@ void ggml_backend_dev_get_props(ggml_backend_dev_t device, struct ggml_backend_d
|
|
|
469
503
|
}
|
|
470
504
|
|
|
471
505
|
ggml_backend_reg_t ggml_backend_dev_backend_reg(ggml_backend_dev_t device) {
|
|
506
|
+
GGML_ASSERT(device);
|
|
472
507
|
return device->reg;
|
|
473
508
|
}
|
|
474
509
|
|
|
475
510
|
ggml_backend_t ggml_backend_dev_init(ggml_backend_dev_t device, const char * params) {
|
|
511
|
+
GGML_ASSERT(device);
|
|
476
512
|
return device->iface.init_backend(device, params);
|
|
477
513
|
}
|
|
478
514
|
|
|
479
515
|
ggml_backend_buffer_type_t ggml_backend_dev_buffer_type(ggml_backend_dev_t device) {
|
|
516
|
+
GGML_ASSERT(device);
|
|
480
517
|
return device->iface.get_buffer_type(device);
|
|
481
518
|
}
|
|
482
519
|
|
|
483
520
|
ggml_backend_buffer_type_t ggml_backend_dev_host_buffer_type(ggml_backend_dev_t device) {
|
|
521
|
+
GGML_ASSERT(device);
|
|
484
522
|
if (device->iface.get_host_buffer_type == NULL) {
|
|
485
523
|
return NULL;
|
|
486
524
|
}
|
|
@@ -489,18 +527,22 @@ ggml_backend_buffer_type_t ggml_backend_dev_host_buffer_type(ggml_backend_dev_t
|
|
|
489
527
|
}
|
|
490
528
|
|
|
491
529
|
ggml_backend_buffer_t ggml_backend_dev_buffer_from_host_ptr(ggml_backend_dev_t device, void * ptr, size_t size, size_t max_tensor_size) {
|
|
530
|
+
GGML_ASSERT(device);
|
|
492
531
|
return device->iface.buffer_from_host_ptr(device, ptr, size, max_tensor_size);
|
|
493
532
|
}
|
|
494
533
|
|
|
495
534
|
bool ggml_backend_dev_supports_op(ggml_backend_dev_t device, const struct ggml_tensor * op) {
|
|
535
|
+
GGML_ASSERT(device);
|
|
496
536
|
return device->iface.supports_op(device, op);
|
|
497
537
|
}
|
|
498
538
|
|
|
499
539
|
bool ggml_backend_dev_supports_buft(ggml_backend_dev_t device, ggml_backend_buffer_type_t buft) {
|
|
540
|
+
GGML_ASSERT(device);
|
|
500
541
|
return device->iface.supports_buft(device, buft);
|
|
501
542
|
}
|
|
502
543
|
|
|
503
544
|
bool ggml_backend_dev_offload_op(ggml_backend_dev_t device, const struct ggml_tensor * op) {
|
|
545
|
+
GGML_ASSERT(device);
|
|
504
546
|
if (device->iface.offload_op != NULL) {
|
|
505
547
|
return device->iface.offload_op(device, op);
|
|
506
548
|
}
|
|
@@ -511,18 +553,22 @@ bool ggml_backend_dev_offload_op(ggml_backend_dev_t device, const struct ggml_te
|
|
|
511
553
|
// Backend (reg)
|
|
512
554
|
|
|
513
555
|
const char * ggml_backend_reg_name(ggml_backend_reg_t reg) {
|
|
556
|
+
GGML_ASSERT(reg);
|
|
514
557
|
return reg->iface.get_name(reg);
|
|
515
558
|
}
|
|
516
559
|
|
|
517
560
|
size_t ggml_backend_reg_dev_count(ggml_backend_reg_t reg) {
|
|
561
|
+
GGML_ASSERT(reg);
|
|
518
562
|
return reg->iface.get_device_count(reg);
|
|
519
563
|
}
|
|
520
564
|
|
|
521
565
|
ggml_backend_dev_t ggml_backend_reg_dev_get(ggml_backend_reg_t reg, size_t index) {
|
|
566
|
+
GGML_ASSERT(reg);
|
|
522
567
|
return reg->iface.get_device(reg, index);
|
|
523
568
|
}
|
|
524
569
|
|
|
525
570
|
void * ggml_backend_reg_get_proc_address(ggml_backend_reg_t reg, const char * name) {
|
|
571
|
+
GGML_ASSERT(reg);
|
|
526
572
|
if (!reg->iface.get_proc_address) {
|
|
527
573
|
return NULL;
|
|
528
574
|
}
|
|
@@ -537,6 +583,7 @@ struct ggml_backend_multi_buffer_context {
|
|
|
537
583
|
};
|
|
538
584
|
|
|
539
585
|
static void ggml_backend_multi_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
|
586
|
+
GGML_ASSERT(buffer);
|
|
540
587
|
ggml_backend_multi_buffer_context * ctx = (ggml_backend_multi_buffer_context *) buffer->context;
|
|
541
588
|
for (size_t i = 0; i < ctx->n_buffers; i++) {
|
|
542
589
|
ggml_backend_buffer_free(ctx->buffers[i]);
|
|
@@ -547,6 +594,7 @@ static void ggml_backend_multi_buffer_free_buffer(ggml_backend_buffer_t buffer)
|
|
|
547
594
|
}
|
|
548
595
|
|
|
549
596
|
static void ggml_backend_multi_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
|
|
597
|
+
GGML_ASSERT(buffer);
|
|
550
598
|
ggml_backend_multi_buffer_context * ctx = (ggml_backend_multi_buffer_context *) buffer->context;
|
|
551
599
|
for (size_t i = 0; i < ctx->n_buffers; i++) {
|
|
552
600
|
ggml_backend_buffer_clear(ctx->buffers[i], value);
|
|
@@ -582,10 +630,12 @@ ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer
|
|
|
582
630
|
}
|
|
583
631
|
|
|
584
632
|
bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer) {
|
|
633
|
+
GGML_ASSERT(buffer);
|
|
585
634
|
return buffer->iface.free_buffer == ggml_backend_multi_buffer_free_buffer;
|
|
586
635
|
}
|
|
587
636
|
|
|
588
637
|
void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) {
|
|
638
|
+
GGML_ASSERT(buffer);
|
|
589
639
|
GGML_ASSERT(ggml_backend_buffer_is_multi_buffer(buffer));
|
|
590
640
|
ggml_backend_multi_buffer_context * ctx = (ggml_backend_multi_buffer_context *) buffer->context;
|
|
591
641
|
for (size_t i = 0; i < ctx->n_buffers; i++) {
|
|
@@ -613,7 +663,7 @@ static bool ggml_is_view_op(enum ggml_op op) {
|
|
|
613
663
|
#endif
|
|
614
664
|
|
|
615
665
|
#ifndef GGML_SCHED_MAX_SPLIT_INPUTS
|
|
616
|
-
#define GGML_SCHED_MAX_SPLIT_INPUTS
|
|
666
|
+
#define GGML_SCHED_MAX_SPLIT_INPUTS 30
|
|
617
667
|
#endif
|
|
618
668
|
|
|
619
669
|
#ifndef GGML_SCHED_MAX_COPIES
|
|
@@ -662,6 +712,7 @@ struct ggml_backend_sched {
|
|
|
662
712
|
// pipeline parallelism support
|
|
663
713
|
int n_copies;
|
|
664
714
|
int cur_copy;
|
|
715
|
+
int next_copy;
|
|
665
716
|
ggml_backend_event_t events[GGML_SCHED_MAX_BACKENDS][GGML_SCHED_MAX_COPIES];
|
|
666
717
|
struct ggml_tensor * graph_inputs[GGML_SCHED_MAX_SPLIT_INPUTS];
|
|
667
718
|
int n_graph_inputs;
|
|
@@ -677,6 +728,12 @@ struct ggml_backend_sched {
|
|
|
677
728
|
bool op_offload;
|
|
678
729
|
|
|
679
730
|
int debug;
|
|
731
|
+
|
|
732
|
+
// used for debugging graph reallocations [GGML_SCHED_DEBUG_REALLOC]
|
|
733
|
+
// ref: https://github.com/ggml-org/llama.cpp/pull/17617
|
|
734
|
+
int debug_realloc;
|
|
735
|
+
int debug_graph_size;
|
|
736
|
+
int debug_prev_graph_size;
|
|
680
737
|
};
|
|
681
738
|
|
|
682
739
|
#define hash_id(tensor) ggml_hash_find_or_insert(&sched->hash_set, tensor)
|
|
@@ -863,7 +920,7 @@ static void ggml_backend_sched_set_if_supported(ggml_backend_sched_t sched, stru
|
|
|
863
920
|
}
|
|
864
921
|
|
|
865
922
|
// assigns backends to ops and splits the graph into subgraphs that can be computed on the same backend
|
|
866
|
-
|
|
923
|
+
void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
|
|
867
924
|
// reset splits
|
|
868
925
|
sched->n_splits = 0;
|
|
869
926
|
sched->n_graph_inputs = 0;
|
|
@@ -1085,6 +1142,11 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
|
|
1085
1142
|
}
|
|
1086
1143
|
}
|
|
1087
1144
|
}
|
|
1145
|
+
// if the node is still unassigned, assign it to the first backend that supports it
|
|
1146
|
+
for (int b = 0; b < sched->n_backends && *cur_backend_id == -1; b++) {
|
|
1147
|
+
ggml_backend_sched_set_if_supported(sched, node, b, cur_backend_id);
|
|
1148
|
+
}
|
|
1149
|
+
GGML_ASSERT(*cur_backend_id != -1);
|
|
1088
1150
|
}
|
|
1089
1151
|
|
|
1090
1152
|
// pass 5: split graph, find tensors that need to be copied
|
|
@@ -1112,7 +1174,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
|
|
1112
1174
|
|
|
1113
1175
|
const int node_backend_id = tensor_backend_id(node);
|
|
1114
1176
|
|
|
1115
|
-
|
|
1177
|
+
GGML_ASSERT(node_backend_id != -1); // all nodes should be assigned by now, this can happen if there is no CPU fallback
|
|
1116
1178
|
|
|
1117
1179
|
// check if we should start a new split based on the sources of the current node
|
|
1118
1180
|
bool need_new_split = false;
|
|
@@ -1170,7 +1232,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
|
|
1170
1232
|
|
|
1171
1233
|
size_t src_id = hash_id(src);
|
|
1172
1234
|
const int src_backend_id = sched->hv_tensor_backend_ids[src_id];
|
|
1173
|
-
|
|
1235
|
+
GGML_ASSERT(src_backend_id != -1); // all inputs should be assigned by now
|
|
1174
1236
|
|
|
1175
1237
|
if (src->flags & GGML_TENSOR_FLAG_INPUT && sched->n_copies > 1) {
|
|
1176
1238
|
if (tensor_id_copy(src_id, src_backend_id, 0) == NULL) {
|
|
@@ -1183,10 +1245,8 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
|
|
1183
1245
|
tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
|
|
1184
1246
|
ggml_format_name(tensor_copy, "%s#%s#%d", ggml_backend_name(backend), src->name, c);
|
|
1185
1247
|
}
|
|
1186
|
-
|
|
1187
|
-
|
|
1188
|
-
ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
|
|
1189
|
-
}
|
|
1248
|
+
ggml_set_input(tensor_copy);
|
|
1249
|
+
ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
|
|
1190
1250
|
tensor_id_copy(src_id, src_backend_id, c) = tensor_copy;
|
|
1191
1251
|
SET_CAUSE(tensor_copy, "4.cpy");
|
|
1192
1252
|
}
|
|
@@ -1238,6 +1298,11 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
|
|
1238
1298
|
}
|
|
1239
1299
|
|
|
1240
1300
|
int graph_size = std::max(graph->n_nodes, graph->n_leafs) + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2*sched->n_copies;
|
|
1301
|
+
|
|
1302
|
+
// remember the actual graph_size for performing reallocation checks later [GGML_SCHED_DEBUG_REALLOC]
|
|
1303
|
+
sched->debug_prev_graph_size = sched->debug_graph_size;
|
|
1304
|
+
sched->debug_graph_size = graph_size;
|
|
1305
|
+
|
|
1241
1306
|
if (sched->graph.size < graph_size) {
|
|
1242
1307
|
sched->graph.size = graph_size;
|
|
1243
1308
|
sched->graph.nodes = (ggml_tensor **) realloc(sched->graph.nodes, graph_size * sizeof(struct ggml_tensor *));
|
|
@@ -1254,6 +1319,10 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
|
|
1254
1319
|
struct ggml_backend_sched_split * split = &sched->splits[i];
|
|
1255
1320
|
split->graph = ggml_graph_view(graph, split->i_start, split->i_end);
|
|
1256
1321
|
|
|
1322
|
+
// Optimize this split of the graph. This needs to happen before we make graph_copy,
|
|
1323
|
+
// so they are in sync.
|
|
1324
|
+
ggml_backend_graph_optimize(sched->backends[split->backend_id], &split->graph);
|
|
1325
|
+
|
|
1257
1326
|
// add inputs to the graph copy so that they are allocated by ggml-alloc at the start of the split
|
|
1258
1327
|
for (int j = 0; j < split->n_inputs; j++) {
|
|
1259
1328
|
assert(graph_copy->size > (graph_copy->n_nodes + 1));
|
|
@@ -1340,14 +1409,27 @@ static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
|
|
|
1340
1409
|
|
|
1341
1410
|
// allocate graph
|
|
1342
1411
|
if (backend_ids_changed || !ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) {
|
|
1412
|
+
#ifndef NDEBUG
|
|
1413
|
+
GGML_LOG_DEBUG("%s: failed to allocate graph, reserving (backend_ids_changed = %d)\n", __func__, backend_ids_changed);
|
|
1414
|
+
#endif
|
|
1415
|
+
|
|
1416
|
+
if (sched->debug_realloc > 0) {
|
|
1417
|
+
// we are interested only in situations where the graph was reallocated even though its size remained the same [GGML_SCHED_DEBUG_REALLOC]
|
|
1418
|
+
// example: https://github.com/ggml-org/llama.cpp/pull/17143
|
|
1419
|
+
const bool unexpected = !backend_ids_changed && sched->debug_prev_graph_size == sched->debug_graph_size;
|
|
1420
|
+
|
|
1421
|
+
if (unexpected || sched->debug_realloc > 1) {
|
|
1422
|
+
GGML_ABORT("%s: unexpected graph reallocation (graph size = %d, nodes = %d, leafs = %d), debug_realloc = %d\n", __func__,
|
|
1423
|
+
sched->debug_graph_size, sched->graph.n_nodes, sched->graph.n_leafs, sched->debug_realloc);
|
|
1424
|
+
}
|
|
1425
|
+
}
|
|
1426
|
+
|
|
1343
1427
|
// the re-allocation may cause the split inputs to be moved to a different address
|
|
1344
1428
|
// synchronize without ggml_backend_sched_synchronize to avoid changing cur_copy
|
|
1345
1429
|
for (int i = 0; i < sched->n_backends; i++) {
|
|
1346
1430
|
ggml_backend_synchronize(sched->backends[i]);
|
|
1347
1431
|
}
|
|
1348
|
-
|
|
1349
|
-
GGML_LOG_DEBUG("%s: failed to allocate graph, reserving (backend_ids_changed = %d)\n", __func__, backend_ids_changed);
|
|
1350
|
-
#endif
|
|
1432
|
+
|
|
1351
1433
|
ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids);
|
|
1352
1434
|
if (!ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) {
|
|
1353
1435
|
GGML_LOG_ERROR("%s: failed to allocate graph\n", __func__);
|
|
@@ -1359,17 +1441,22 @@ static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
|
|
|
1359
1441
|
}
|
|
1360
1442
|
|
|
1361
1443
|
static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) {
|
|
1444
|
+
GGML_ASSERT(sched);
|
|
1362
1445
|
struct ggml_backend_sched_split * splits = sched->splits;
|
|
1363
1446
|
|
|
1364
|
-
|
|
1365
|
-
|
|
1447
|
+
ggml_tensor * prev_ids_tensor = nullptr;
|
|
1448
|
+
std::vector<int32_t> ids;
|
|
1449
|
+
std::vector<ggml_bitset_t> used_ids;
|
|
1450
|
+
|
|
1451
|
+
for (int split_id = 0; split_id < sched->n_splits; split_id++) {
|
|
1452
|
+
struct ggml_backend_sched_split * split = &splits[split_id];
|
|
1366
1453
|
int split_backend_id = split->backend_id;
|
|
1367
1454
|
ggml_backend_t split_backend = sched->backends[split_backend_id];
|
|
1368
1455
|
|
|
1369
1456
|
// copy the input tensors to the split backend
|
|
1370
|
-
for (int
|
|
1371
|
-
ggml_backend_t input_backend = ggml_backend_sched_get_tensor_backend(sched, split->inputs[
|
|
1372
|
-
struct ggml_tensor * input = split->inputs[
|
|
1457
|
+
for (int input_id = 0; input_id < split->n_inputs; input_id++) {
|
|
1458
|
+
ggml_backend_t input_backend = ggml_backend_sched_get_tensor_backend(sched, split->inputs[input_id]);
|
|
1459
|
+
struct ggml_tensor * input = split->inputs[input_id];
|
|
1373
1460
|
struct ggml_tensor * input_cpy = tensor_copy(input, split_backend_id, sched->cur_copy);
|
|
1374
1461
|
|
|
1375
1462
|
if (input->flags & GGML_TENSOR_FLAG_INPUT) {
|
|
@@ -1387,16 +1474,104 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
|
|
|
1387
1474
|
} else {
|
|
1388
1475
|
ggml_backend_synchronize(split_backend);
|
|
1389
1476
|
}
|
|
1390
|
-
|
|
1391
|
-
//
|
|
1392
|
-
|
|
1477
|
+
|
|
1478
|
+
// when offloading MoE weights, we can reduce the amount of data copied by copying only the experts that are used
|
|
1479
|
+
ggml_tensor * node = split->graph.nodes[0];
|
|
1480
|
+
if (split->graph.n_nodes > 0 &&
|
|
1481
|
+
ggml_backend_buffer_get_usage(input->buffer) == GGML_BACKEND_BUFFER_USAGE_WEIGHTS &&
|
|
1482
|
+
ggml_backend_buffer_is_host(input->buffer) && (
|
|
1483
|
+
(node->src[0] == input_cpy && node->op == GGML_OP_MUL_MAT_ID)
|
|
1484
|
+
//|| (node->src[1] == input_cpy && node->op == GGML_OP_ADD_ID) /* GGML_OP_ADD_ID weights are small and not worth splitting */
|
|
1485
|
+
)) {
|
|
1486
|
+
|
|
1487
|
+
const int64_t n_expert = node->op == GGML_OP_MUL_MAT_ID ? input->ne[2] : input->ne[1];
|
|
1488
|
+
const size_t expert_size = node->op == GGML_OP_MUL_MAT_ID ? input->nb[2] : input->nb[1];
|
|
1489
|
+
|
|
1393
1490
|
ggml_backend_synchronize(input_backend);
|
|
1394
|
-
|
|
1395
|
-
|
|
1396
|
-
|
|
1397
|
-
|
|
1491
|
+
|
|
1492
|
+
// get the ids
|
|
1493
|
+
ggml_tensor * ids_tensor = node->src[2];
|
|
1494
|
+
ggml_backend_t ids_backend = split_backend;
|
|
1495
|
+
|
|
1496
|
+
// if the ids tensor is also an input of the split, it may not have been copied yet to the split backend
|
|
1497
|
+
// in that case, we use the original ids tensor
|
|
1498
|
+
for (int i = input_id + 1; i < split->n_inputs; i++) {
|
|
1499
|
+
if (ids_tensor == tensor_copy(split->inputs[i], split_backend_id, sched->cur_copy)) {
|
|
1500
|
+
ids_tensor = split->inputs[i];
|
|
1501
|
+
ids_backend = ggml_backend_sched_get_tensor_backend(sched, split->inputs[i]);
|
|
1502
|
+
break;
|
|
1503
|
+
}
|
|
1504
|
+
}
|
|
1505
|
+
|
|
1506
|
+
if (ids_tensor != prev_ids_tensor) {
|
|
1507
|
+
ids.resize(ggml_nbytes(ids_tensor) / sizeof(int32_t));
|
|
1508
|
+
ggml_backend_tensor_get_async(ids_backend, ids_tensor, ids.data(), 0, ggml_nbytes(ids_tensor));
|
|
1509
|
+
ggml_backend_synchronize(ids_backend);
|
|
1510
|
+
|
|
1511
|
+
// find the used experts
|
|
1512
|
+
used_ids.clear();
|
|
1513
|
+
used_ids.resize(ggml_bitset_size(n_expert));
|
|
1514
|
+
for (int64_t i1 = 0; i1 < ids_tensor->ne[1]; i1++) {
|
|
1515
|
+
for (int64_t i0 = 0; i0 < ids_tensor->ne[0]; i0++) {
|
|
1516
|
+
int32_t id = ids[i1 * ids_tensor->nb[1]/sizeof(int32_t) + i0 * ids_tensor->nb[0]/sizeof(int32_t)];
|
|
1517
|
+
GGML_ASSERT(id >= 0 && id < n_expert);
|
|
1518
|
+
ggml_bitset_set(used_ids.data(), id);
|
|
1519
|
+
}
|
|
1520
|
+
}
|
|
1521
|
+
|
|
1522
|
+
prev_ids_tensor = ids_tensor;
|
|
1523
|
+
}
|
|
1524
|
+
|
|
1525
|
+
// group consecutive experts and copy them together
|
|
1526
|
+
auto copy_experts = [&](int32_t first_id, int32_t last_id) {
|
|
1527
|
+
const size_t expert_offset = first_id * expert_size;
|
|
1528
|
+
const size_t expert_size_copy = (last_id - first_id + 1) * expert_size;
|
|
1529
|
+
const size_t padding = std::min<size_t>(expert_size, 512);
|
|
1530
|
+
const size_t padding_end = last_id < n_expert - 1 ? padding : 0;
|
|
1531
|
+
|
|
1532
|
+
ggml_backend_tensor_set_async(split_backend,
|
|
1533
|
+
input_cpy,
|
|
1534
|
+
(const uint8_t *)input->data + expert_offset, expert_offset,
|
|
1535
|
+
// copy a bit extra at the to ensure there are no NaNs in the padding of the last expert
|
|
1536
|
+
// this is necessary for MMQ in the CUDA backend
|
|
1537
|
+
expert_size_copy + padding_end);
|
|
1538
|
+
};
|
|
1539
|
+
|
|
1540
|
+
int id = 0;
|
|
1541
|
+
while (!ggml_bitset_get(used_ids.data(), id)) {
|
|
1542
|
+
id++;
|
|
1543
|
+
}
|
|
1544
|
+
int32_t first_id = id;
|
|
1545
|
+
int32_t last_id = first_id;
|
|
1546
|
+
|
|
1547
|
+
for (++id; id < n_expert; ++id) {
|
|
1548
|
+
if (!ggml_bitset_get(used_ids.data(), id)) {
|
|
1549
|
+
continue;
|
|
1550
|
+
}
|
|
1551
|
+
|
|
1552
|
+
if (id == last_id + 1) {
|
|
1553
|
+
last_id = id;
|
|
1554
|
+
continue;
|
|
1555
|
+
}
|
|
1556
|
+
|
|
1557
|
+
copy_experts(first_id, last_id);
|
|
1558
|
+
|
|
1559
|
+
first_id = id;
|
|
1560
|
+
last_id = id;
|
|
1561
|
+
}
|
|
1562
|
+
copy_experts(first_id, last_id);
|
|
1563
|
+
} else {
|
|
1564
|
+
// try async copy, but if not possible, we can still use a sync copy without synchronizing the dst backend, since we handle the synchronization here with multiple copies and events
|
|
1565
|
+
// TODO: add public function to facilitate this, since applications do not have direct access to the backend interface
|
|
1566
|
+
if (!split_backend->iface.cpy_tensor_async || !split_backend->iface.cpy_tensor_async(input_backend, split_backend, input, input_cpy)) {
|
|
1567
|
+
ggml_backend_synchronize(input_backend);
|
|
1568
|
+
if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
|
|
1569
|
+
ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
|
|
1570
|
+
} else {
|
|
1571
|
+
ggml_backend_synchronize(split_backend);
|
|
1572
|
+
}
|
|
1573
|
+
ggml_backend_tensor_copy(input, input_cpy);
|
|
1398
1574
|
}
|
|
1399
|
-
ggml_backend_tensor_copy(input, input_cpy);
|
|
1400
1575
|
}
|
|
1401
1576
|
}
|
|
1402
1577
|
}
|
|
@@ -1448,8 +1623,6 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
|
|
|
1448
1623
|
}
|
|
1449
1624
|
}
|
|
1450
1625
|
|
|
1451
|
-
sched->cur_copy = (sched->cur_copy + 1) % sched->n_copies;
|
|
1452
|
-
|
|
1453
1626
|
return GGML_STATUS_SUCCESS;
|
|
1454
1627
|
}
|
|
1455
1628
|
|
|
@@ -1468,6 +1641,14 @@ ggml_backend_sched_t ggml_backend_sched_new(
|
|
|
1468
1641
|
|
|
1469
1642
|
const char * GGML_SCHED_DEBUG = getenv("GGML_SCHED_DEBUG");
|
|
1470
1643
|
sched->debug = GGML_SCHED_DEBUG ? atoi(GGML_SCHED_DEBUG) : 0;
|
|
1644
|
+
|
|
1645
|
+
sched->debug_realloc = 0;
|
|
1646
|
+
#ifdef GGML_SCHED_NO_REALLOC
|
|
1647
|
+
sched->debug_realloc = 1;
|
|
1648
|
+
#endif
|
|
1649
|
+
const char * GGML_SCHED_DEBUG_REALLOC = getenv("GGML_SCHED_DEBUG_REALLOC");
|
|
1650
|
+
sched->debug_realloc = GGML_SCHED_DEBUG_REALLOC ? atoi(GGML_SCHED_DEBUG_REALLOC) : sched->debug_realloc;
|
|
1651
|
+
|
|
1471
1652
|
sched->n_backends = n_backends;
|
|
1472
1653
|
sched->n_copies = parallel ? GGML_SCHED_MAX_COPIES : 1;
|
|
1473
1654
|
|
|
@@ -1484,6 +1665,9 @@ ggml_backend_sched_t ggml_backend_sched_new(
|
|
|
1484
1665
|
sched->prev_node_backend_ids = (int *) calloc(nodes_size, sizeof(sched->prev_node_backend_ids[0]));
|
|
1485
1666
|
sched->prev_leaf_backend_ids = (int *) calloc(nodes_size, sizeof(sched->prev_leaf_backend_ids[0]));
|
|
1486
1667
|
|
|
1668
|
+
sched->debug_graph_size = 0;
|
|
1669
|
+
sched->debug_prev_graph_size = 0;
|
|
1670
|
+
|
|
1487
1671
|
sched->context_buffer_size = ggml_sched_max_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2*sizeof(struct ggml_tensor) + ggml_graph_overhead_custom(graph_size, false);
|
|
1488
1672
|
sched->context_buffer = (char *) malloc(sched->context_buffer_size);
|
|
1489
1673
|
|
|
@@ -1537,6 +1721,7 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
|
|
|
1537
1721
|
}
|
|
1538
1722
|
|
|
1539
1723
|
void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
|
|
1724
|
+
GGML_ASSERT(sched);
|
|
1540
1725
|
// reset state for the next run
|
|
1541
1726
|
if (!sched->is_reset) {
|
|
1542
1727
|
ggml_hash_set_reset(&sched->hash_set);
|
|
@@ -1547,13 +1732,28 @@ void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
|
|
|
1547
1732
|
sched->is_alloc = false;
|
|
1548
1733
|
}
|
|
1549
1734
|
|
|
1550
|
-
|
|
1735
|
+
void ggml_backend_sched_reserve_size(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph, size_t * sizes) {
|
|
1736
|
+
GGML_ASSERT(sched);
|
|
1551
1737
|
GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes + measure_graph->n_leafs);
|
|
1738
|
+
GGML_ASSERT(sizes);
|
|
1739
|
+
|
|
1740
|
+
ggml_backend_sched_reset(sched);
|
|
1741
|
+
|
|
1742
|
+
ggml_backend_sched_synchronize(sched);
|
|
1552
1743
|
|
|
1553
1744
|
ggml_backend_sched_split_graph(sched, measure_graph);
|
|
1554
1745
|
|
|
1746
|
+
ggml_gallocr_reserve_n_size(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids, sizes);
|
|
1747
|
+
}
|
|
1748
|
+
|
|
1749
|
+
bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
|
|
1750
|
+
GGML_ASSERT(sched);
|
|
1751
|
+
GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes + measure_graph->n_leafs);
|
|
1752
|
+
|
|
1555
1753
|
ggml_backend_sched_synchronize(sched);
|
|
1556
1754
|
|
|
1755
|
+
ggml_backend_sched_split_graph(sched, measure_graph);
|
|
1756
|
+
|
|
1557
1757
|
if (!ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids)) {
|
|
1558
1758
|
return false;
|
|
1559
1759
|
}
|
|
@@ -1564,7 +1764,12 @@ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph *
|
|
|
1564
1764
|
}
|
|
1565
1765
|
|
|
1566
1766
|
bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
|
|
1767
|
+
GGML_ASSERT(sched);
|
|
1567
1768
|
GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + graph->n_leafs);
|
|
1769
|
+
GGML_ASSERT(!sched->is_alloc);
|
|
1770
|
+
|
|
1771
|
+
sched->cur_copy = sched->next_copy;
|
|
1772
|
+
sched->next_copy = (sched->next_copy + 1) % sched->n_copies;
|
|
1568
1773
|
|
|
1569
1774
|
ggml_backend_sched_split_graph(sched, graph);
|
|
1570
1775
|
|
|
@@ -1584,6 +1789,7 @@ enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, st
|
|
|
1584
1789
|
}
|
|
1585
1790
|
|
|
1586
1791
|
enum ggml_status ggml_backend_sched_graph_compute_async(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
|
|
1792
|
+
GGML_ASSERT(sched);
|
|
1587
1793
|
if (!sched->is_reset && !sched->is_alloc) {
|
|
1588
1794
|
ggml_backend_sched_reset(sched);
|
|
1589
1795
|
}
|
|
@@ -1598,6 +1804,7 @@ enum ggml_status ggml_backend_sched_graph_compute_async(ggml_backend_sched_t sch
|
|
|
1598
1804
|
}
|
|
1599
1805
|
|
|
1600
1806
|
void ggml_backend_sched_synchronize(ggml_backend_sched_t sched) {
|
|
1807
|
+
GGML_ASSERT(sched);
|
|
1601
1808
|
for (int i = 0; i < sched->n_backends; i++) {
|
|
1602
1809
|
ggml_backend_synchronize(sched->backends[i]);
|
|
1603
1810
|
}
|
|
@@ -1605,33 +1812,47 @@ void ggml_backend_sched_synchronize(ggml_backend_sched_t sched) {
|
|
|
1605
1812
|
// if the graph is not already allocated, always use copy 0 after a synchronization
|
|
1606
1813
|
// this ensures that during generation the same copy is used every time,
|
|
1607
1814
|
// which avoids changes in the graph that could cause CUDA or other graphs to be disabled
|
|
1608
|
-
sched->
|
|
1815
|
+
sched->next_copy = 0;
|
|
1609
1816
|
}
|
|
1610
1817
|
}
|
|
1611
1818
|
|
|
1612
1819
|
void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data) {
|
|
1820
|
+
GGML_ASSERT(sched);
|
|
1613
1821
|
sched->callback_eval = callback;
|
|
1614
1822
|
sched->callback_eval_user_data = user_data;
|
|
1615
1823
|
}
|
|
1616
1824
|
|
|
1617
1825
|
int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched) {
|
|
1826
|
+
GGML_ASSERT(sched);
|
|
1618
1827
|
return sched->n_splits;
|
|
1619
1828
|
}
|
|
1620
1829
|
|
|
1621
1830
|
int ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched) {
|
|
1831
|
+
GGML_ASSERT(sched);
|
|
1622
1832
|
return sched->n_copies;
|
|
1623
1833
|
}
|
|
1624
1834
|
|
|
1625
1835
|
int ggml_backend_sched_get_n_backends(ggml_backend_sched_t sched) {
|
|
1836
|
+
GGML_ASSERT(sched);
|
|
1626
1837
|
return sched->n_backends;
|
|
1627
1838
|
}
|
|
1628
1839
|
|
|
1629
1840
|
ggml_backend_t ggml_backend_sched_get_backend(ggml_backend_sched_t sched, int i) {
|
|
1841
|
+
GGML_ASSERT(sched);
|
|
1630
1842
|
GGML_ASSERT(i >= 0 && i < sched->n_backends);
|
|
1631
1843
|
return sched->backends[i];
|
|
1632
1844
|
}
|
|
1633
1845
|
|
|
1846
|
+
ggml_backend_buffer_type_t ggml_backend_sched_get_buffer_type(ggml_backend_sched_t sched, ggml_backend_t backend) {
|
|
1847
|
+
GGML_ASSERT(sched);
|
|
1848
|
+
int backend_index = ggml_backend_sched_backend_id(sched, backend);
|
|
1849
|
+
GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
|
|
1850
|
+
|
|
1851
|
+
return sched->bufts[backend_index];
|
|
1852
|
+
}
|
|
1853
|
+
|
|
1634
1854
|
size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend) {
|
|
1855
|
+
GGML_ASSERT(sched);
|
|
1635
1856
|
int backend_index = ggml_backend_sched_backend_id(sched, backend);
|
|
1636
1857
|
GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
|
|
1637
1858
|
|
|
@@ -1639,6 +1860,7 @@ size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backe
|
|
|
1639
1860
|
}
|
|
1640
1861
|
|
|
1641
1862
|
void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {
|
|
1863
|
+
GGML_ASSERT(sched);
|
|
1642
1864
|
int backend_index = ggml_backend_sched_backend_id(sched, backend);
|
|
1643
1865
|
GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
|
|
1644
1866
|
tensor_backend_id(node) = backend_index;
|
|
@@ -1647,6 +1869,7 @@ void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct gg
|
|
|
1647
1869
|
}
|
|
1648
1870
|
|
|
1649
1871
|
ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node) {
|
|
1872
|
+
GGML_ASSERT(sched);
|
|
1650
1873
|
int backend_index = tensor_backend_id(node);
|
|
1651
1874
|
if (backend_index == -1) {
|
|
1652
1875
|
return NULL;
|
|
@@ -1657,6 +1880,7 @@ ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched,
|
|
|
1657
1880
|
// utils
|
|
1658
1881
|
|
|
1659
1882
|
enum ggml_status ggml_backend_view_init(struct ggml_tensor * tensor) {
|
|
1883
|
+
GGML_ASSERT(tensor);
|
|
1660
1884
|
GGML_ASSERT(tensor->buffer == NULL);
|
|
1661
1885
|
GGML_ASSERT(tensor->view_src != NULL);
|
|
1662
1886
|
GGML_ASSERT(tensor->view_src->buffer != NULL);
|
|
@@ -1668,6 +1892,7 @@ enum ggml_status ggml_backend_view_init(struct ggml_tensor * tensor) {
|
|
|
1668
1892
|
}
|
|
1669
1893
|
|
|
1670
1894
|
enum ggml_status ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr) {
|
|
1895
|
+
GGML_ASSERT(tensor);
|
|
1671
1896
|
GGML_ASSERT(tensor->buffer == NULL);
|
|
1672
1897
|
GGML_ASSERT(tensor->data == NULL);
|
|
1673
1898
|
GGML_ASSERT(tensor->view_src == NULL);
|
|
@@ -1741,6 +1966,7 @@ static void graph_copy_init_tensor(struct ggml_hash_set * hash_set, struct ggml_
|
|
|
1741
1966
|
}
|
|
1742
1967
|
|
|
1743
1968
|
struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph) {
|
|
1969
|
+
GGML_ASSERT(graph);
|
|
1744
1970
|
struct ggml_hash_set hash_set = ggml_hash_set_new(graph->visited_hash_set.size);
|
|
1745
1971
|
struct ggml_tensor ** node_copies = (ggml_tensor **) calloc(hash_set.size, sizeof(node_copies[0])); // NOLINT
|
|
1746
1972
|
bool * node_init = (bool *) calloc(hash_set.size, sizeof(node_init[0]));
|
|
@@ -1827,7 +2053,7 @@ void ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy) {
|
|
|
1827
2053
|
ggml_free(copy.ctx_unallocated);
|
|
1828
2054
|
}
|
|
1829
2055
|
|
|
1830
|
-
bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data, struct ggml_tensor *
|
|
2056
|
+
bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data, struct ggml_tensor const * const * test_nodes, size_t num_test_nodes) {
|
|
1831
2057
|
struct ggml_backend_graph_copy copy = ggml_backend_graph_copy(backend2, graph);
|
|
1832
2058
|
if (copy.buffer == NULL) {
|
|
1833
2059
|
return false;
|
|
@@ -1838,22 +2064,22 @@ bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t
|
|
|
1838
2064
|
|
|
1839
2065
|
assert(g1->n_nodes == g2->n_nodes);
|
|
1840
2066
|
|
|
1841
|
-
if (
|
|
1842
|
-
|
|
2067
|
+
if (num_test_nodes != 0) {
|
|
2068
|
+
GGML_ASSERT(test_nodes);
|
|
2069
|
+
// Compute the whole graph and only test the output for specific tensors
|
|
1843
2070
|
ggml_backend_graph_compute(backend1, g1);
|
|
1844
2071
|
ggml_backend_graph_compute(backend2, g2);
|
|
1845
2072
|
|
|
1846
|
-
|
|
2073
|
+
bool verified = false;
|
|
1847
2074
|
for (int i = 0; i < g1->n_nodes; i++) {
|
|
1848
|
-
|
|
1849
|
-
|
|
1850
|
-
|
|
1851
|
-
|
|
2075
|
+
for (size_t j = 0; j < num_test_nodes; ++j) {
|
|
2076
|
+
if (g1->nodes[i] == test_nodes[j]) {
|
|
2077
|
+
callback(i, g1->nodes[i], g2->nodes[i], user_data);
|
|
2078
|
+
verified = true;
|
|
2079
|
+
}
|
|
1852
2080
|
}
|
|
1853
2081
|
}
|
|
1854
|
-
GGML_ASSERT(
|
|
1855
|
-
|
|
1856
|
-
callback(test_node_idx, g1->nodes[test_node_idx], g2->nodes[test_node_idx], user_data);
|
|
2082
|
+
GGML_ASSERT(verified);
|
|
1857
2083
|
} else {
|
|
1858
2084
|
for (int i = 0; i < g1->n_nodes; i++) {
|
|
1859
2085
|
struct ggml_tensor * t1 = g1->nodes[i];
|
|
@@ -1885,6 +2111,7 @@ bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t
|
|
|
1885
2111
|
// CPU backend - buffer
|
|
1886
2112
|
|
|
1887
2113
|
static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
|
|
2114
|
+
GGML_ASSERT(buffer);
|
|
1888
2115
|
uintptr_t data = (uintptr_t)buffer->context;
|
|
1889
2116
|
|
|
1890
2117
|
// align the buffer
|
|
@@ -1896,28 +2123,33 @@ static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
|
|
|
1896
2123
|
}
|
|
1897
2124
|
|
|
1898
2125
|
static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
|
2126
|
+
GGML_ASSERT(buffer);
|
|
1899
2127
|
ggml_aligned_free(buffer->context, buffer->size);
|
|
1900
2128
|
}
|
|
1901
2129
|
|
|
1902
2130
|
static void ggml_backend_cpu_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
|
|
2131
|
+
GGML_ASSERT(tensor);
|
|
1903
2132
|
memset((char *)tensor->data + offset, value, size);
|
|
1904
2133
|
|
|
1905
2134
|
GGML_UNUSED(buffer);
|
|
1906
2135
|
}
|
|
1907
2136
|
|
|
1908
2137
|
static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
|
2138
|
+
GGML_ASSERT(tensor);
|
|
1909
2139
|
memcpy((char *)tensor->data + offset, data, size);
|
|
1910
2140
|
|
|
1911
2141
|
GGML_UNUSED(buffer);
|
|
1912
2142
|
}
|
|
1913
2143
|
|
|
1914
2144
|
static void ggml_backend_cpu_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
|
2145
|
+
GGML_ASSERT(tensor);
|
|
1915
2146
|
memcpy(data, (const char *)tensor->data + offset, size);
|
|
1916
2147
|
|
|
1917
2148
|
GGML_UNUSED(buffer);
|
|
1918
2149
|
}
|
|
1919
2150
|
|
|
1920
2151
|
static bool ggml_backend_cpu_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
|
|
2152
|
+
GGML_ASSERT(src);
|
|
1921
2153
|
if (ggml_backend_buffer_is_host(src->buffer)) {
|
|
1922
2154
|
memcpy(dst->data, src->data, ggml_nbytes(src));
|
|
1923
2155
|
return true;
|
|
@@ -1928,6 +2160,7 @@ static bool ggml_backend_cpu_buffer_cpy_tensor(ggml_backend_buffer_t buffer, con
|
|
|
1928
2160
|
}
|
|
1929
2161
|
|
|
1930
2162
|
static void ggml_backend_cpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
|
|
2163
|
+
GGML_ASSERT(buffer);
|
|
1931
2164
|
memset(buffer->context, value, buffer->size);
|
|
1932
2165
|
}
|
|
1933
2166
|
|