whispercpp 1.3.4 → 1.3.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/LICENSE +1 -1
- data/README.md +158 -44
- data/ext/extconf.rb +3 -2
- data/ext/ruby_whisper.c +34 -6
- data/ext/ruby_whisper.h +67 -0
- data/ext/ruby_whisper_context.c +236 -144
- data/ext/ruby_whisper_context_params.c +163 -0
- data/ext/ruby_whisper_model.c +12 -13
- data/ext/ruby_whisper_params.c +47 -24
- data/ext/ruby_whisper_segment.c +84 -20
- data/ext/ruby_whisper_token.c +371 -0
- data/ext/ruby_whisper_transcribe.cpp +5 -2
- data/ext/ruby_whisper_vad_context.c +122 -0
- data/ext/ruby_whisper_vad_context_detect.cpp +51 -0
- data/ext/ruby_whisper_vad_params.c +0 -1
- data/ext/ruby_whisper_vad_segment.c +138 -0
- data/ext/ruby_whisper_vad_segments.c +105 -0
- data/ext/sources/CMakeLists.txt +4 -1
- data/ext/sources/bindings/javascript/package.json +1 -1
- data/ext/sources/cmake/arm64-apple-clang.cmake +16 -0
- data/ext/sources/cmake/arm64-windows-llvm.cmake +16 -0
- data/ext/sources/cmake/riscv64-spacemit-linux-gnu-gcc.cmake +29 -0
- data/ext/sources/cmake/whisper-config.cmake.in +5 -40
- data/ext/sources/cmake/x64-windows-llvm.cmake +5 -0
- data/ext/sources/examples/addon.node/vad-example.js +2 -2
- data/ext/sources/examples/bench/bench.cpp +23 -18
- data/ext/sources/examples/cli/cli.cpp +129 -112
- data/ext/sources/examples/common-ggml.cpp +2 -0
- data/ext/sources/examples/lsp/CMakeLists.txt +2 -1
- data/ext/sources/examples/miniaudio.h +4507 -2131
- data/ext/sources/examples/quantize/CMakeLists.txt +2 -1
- data/ext/sources/examples/server/server.cpp +28 -15
- data/ext/sources/examples/talk-llama/CMakeLists.txt +8 -3
- data/ext/sources/examples/talk-llama/llama-adapter.cpp +5 -2
- data/ext/sources/examples/talk-llama/llama-adapter.h +7 -0
- data/ext/sources/examples/talk-llama/llama-arch.cpp +2378 -1988
- data/ext/sources/examples/talk-llama/llama-arch.h +109 -2
- data/ext/sources/examples/talk-llama/llama-batch.cpp +78 -34
- data/ext/sources/examples/talk-llama/llama-batch.h +17 -4
- data/ext/sources/examples/talk-llama/llama-chat.cpp +100 -4
- data/ext/sources/examples/talk-llama/llama-chat.h +5 -0
- data/ext/sources/examples/talk-llama/llama-context.cpp +1088 -403
- data/ext/sources/examples/talk-llama/llama-context.h +70 -23
- data/ext/sources/examples/talk-llama/llama-cparams.h +6 -0
- data/ext/sources/examples/talk-llama/llama-ext.h +12 -0
- data/ext/sources/examples/talk-llama/llama-grammar.cpp +295 -60
- data/ext/sources/examples/talk-llama/llama-grammar.h +22 -1
- data/ext/sources/examples/talk-llama/llama-graph.cpp +925 -155
- data/ext/sources/examples/talk-llama/llama-graph.h +234 -23
- data/ext/sources/examples/talk-llama/llama-hparams.cpp +79 -38
- data/ext/sources/examples/talk-llama/llama-hparams.h +118 -18
- data/ext/sources/examples/talk-llama/llama-impl.cpp +11 -7
- data/ext/sources/examples/talk-llama/llama-impl.h +14 -2
- data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.cpp +8 -4
- data/ext/sources/examples/talk-llama/llama-kv-cache.cpp +405 -140
- data/ext/sources/examples/talk-llama/llama-kv-cache.h +24 -10
- data/ext/sources/examples/talk-llama/llama-kv-cells.h +44 -2
- data/ext/sources/examples/talk-llama/llama-memory-hybrid-iswa.cpp +275 -0
- data/ext/sources/examples/talk-llama/llama-memory-hybrid-iswa.h +140 -0
- data/ext/sources/examples/talk-llama/llama-memory-hybrid.cpp +12 -10
- data/ext/sources/examples/talk-llama/llama-memory-recurrent.cpp +42 -31
- data/ext/sources/examples/talk-llama/llama-memory-recurrent.h +2 -2
- data/ext/sources/examples/talk-llama/llama-mmap.cpp +197 -45
- data/ext/sources/examples/talk-llama/llama-mmap.h +8 -3
- data/ext/sources/examples/talk-llama/llama-model-loader.cpp +606 -116
- data/ext/sources/examples/talk-llama/llama-model-loader.h +41 -5
- data/ext/sources/examples/talk-llama/llama-model-saver.cpp +61 -44
- data/ext/sources/examples/talk-llama/llama-model-saver.h +5 -2
- data/ext/sources/examples/talk-llama/llama-model.cpp +2756 -13643
- data/ext/sources/examples/talk-llama/llama-model.h +112 -18
- data/ext/sources/examples/talk-llama/llama-quant.cpp +582 -365
- data/ext/sources/examples/talk-llama/{llama-sampling.cpp → llama-sampler.cpp} +1409 -199
- data/ext/sources/examples/talk-llama/llama-sampler.h +42 -0
- data/ext/sources/examples/talk-llama/llama-vocab.cpp +248 -82
- data/ext/sources/examples/talk-llama/llama-vocab.h +50 -40
- data/ext/sources/examples/talk-llama/llama.cpp +802 -21
- data/ext/sources/examples/talk-llama/llama.h +210 -39
- data/ext/sources/examples/talk-llama/models/afmoe.cpp +190 -0
- data/ext/sources/examples/talk-llama/models/apertus.cpp +125 -0
- data/ext/sources/examples/talk-llama/models/arcee.cpp +135 -0
- data/ext/sources/examples/talk-llama/models/arctic.cpp +137 -0
- data/ext/sources/examples/talk-llama/models/arwkv7.cpp +86 -0
- data/ext/sources/examples/talk-llama/models/baichuan.cpp +123 -0
- data/ext/sources/examples/talk-llama/models/bailingmoe.cpp +143 -0
- data/ext/sources/examples/talk-llama/models/bailingmoe2.cpp +133 -0
- data/ext/sources/examples/talk-llama/models/bert.cpp +184 -0
- data/ext/sources/examples/talk-llama/models/bitnet.cpp +145 -0
- data/ext/sources/examples/talk-llama/models/bloom.cpp +101 -0
- data/ext/sources/examples/talk-llama/models/chameleon.cpp +178 -0
- data/ext/sources/examples/talk-llama/models/chatglm.cpp +132 -0
- data/ext/sources/examples/talk-llama/models/codeshell.cpp +111 -0
- data/ext/sources/examples/talk-llama/models/cogvlm.cpp +102 -0
- data/ext/sources/examples/talk-llama/models/cohere2-iswa.cpp +134 -0
- data/ext/sources/examples/talk-llama/models/command-r.cpp +122 -0
- data/ext/sources/examples/talk-llama/models/dbrx.cpp +122 -0
- data/ext/sources/examples/talk-llama/models/deci.cpp +135 -0
- data/ext/sources/examples/talk-llama/models/deepseek.cpp +142 -0
- data/ext/sources/examples/talk-llama/models/deepseek2.cpp +262 -0
- data/ext/sources/examples/talk-llama/models/delta-net-base.cpp +445 -0
- data/ext/sources/examples/talk-llama/models/dots1.cpp +132 -0
- data/ext/sources/examples/talk-llama/models/dream.cpp +105 -0
- data/ext/sources/examples/talk-llama/models/ernie4-5-moe.cpp +148 -0
- data/ext/sources/examples/talk-llama/models/ernie4-5.cpp +110 -0
- data/ext/sources/examples/talk-llama/models/eurobert.cpp +97 -0
- data/ext/sources/examples/talk-llama/models/exaone-moe.cpp +145 -0
- data/ext/sources/examples/talk-llama/models/exaone.cpp +114 -0
- data/ext/sources/examples/talk-llama/models/exaone4.cpp +123 -0
- data/ext/sources/examples/talk-llama/models/falcon-h1.cpp +111 -0
- data/ext/sources/examples/talk-llama/models/falcon.cpp +120 -0
- data/ext/sources/examples/talk-llama/models/gemma-embedding.cpp +116 -0
- data/ext/sources/examples/talk-llama/models/gemma.cpp +112 -0
- data/ext/sources/examples/talk-llama/models/gemma2-iswa.cpp +128 -0
- data/ext/sources/examples/talk-llama/models/gemma3.cpp +155 -0
- data/ext/sources/examples/talk-llama/models/gemma3n-iswa.cpp +384 -0
- data/ext/sources/examples/talk-llama/models/glm4-moe.cpp +170 -0
- data/ext/sources/examples/talk-llama/models/glm4.cpp +157 -0
- data/ext/sources/examples/talk-llama/models/gpt2.cpp +105 -0
- data/ext/sources/examples/talk-llama/models/gptneox.cpp +144 -0
- data/ext/sources/examples/talk-llama/models/granite-hybrid.cpp +195 -0
- data/ext/sources/examples/talk-llama/models/granite.cpp +210 -0
- data/ext/sources/examples/talk-llama/models/grok.cpp +159 -0
- data/ext/sources/examples/talk-llama/models/grovemoe.cpp +139 -0
- data/ext/sources/examples/talk-llama/models/hunyuan-dense.cpp +132 -0
- data/ext/sources/examples/talk-llama/models/hunyuan-moe.cpp +153 -0
- data/ext/sources/examples/talk-llama/models/internlm2.cpp +120 -0
- data/ext/sources/examples/talk-llama/models/jais.cpp +86 -0
- data/ext/sources/examples/talk-llama/models/jais2.cpp +123 -0
- data/ext/sources/examples/talk-llama/models/jamba.cpp +106 -0
- data/ext/sources/examples/talk-llama/models/kimi-linear.cpp +381 -0
- data/ext/sources/examples/talk-llama/models/lfm2.cpp +196 -0
- data/ext/sources/examples/talk-llama/models/llada-moe.cpp +122 -0
- data/ext/sources/examples/talk-llama/models/llada.cpp +99 -0
- data/ext/sources/examples/talk-llama/models/llama-iswa.cpp +178 -0
- data/ext/sources/examples/talk-llama/models/llama.cpp +175 -0
- data/ext/sources/examples/talk-llama/models/maincoder.cpp +117 -0
- data/ext/sources/examples/talk-llama/models/mamba-base.cpp +289 -0
- data/ext/sources/examples/talk-llama/models/mamba.cpp +54 -0
- data/ext/sources/examples/talk-llama/models/mimo2-iswa.cpp +129 -0
- data/ext/sources/examples/talk-llama/models/minicpm3.cpp +200 -0
- data/ext/sources/examples/talk-llama/models/minimax-m2.cpp +123 -0
- data/ext/sources/examples/talk-llama/models/mistral3.cpp +160 -0
- data/ext/sources/examples/talk-llama/models/models.h +704 -0
- data/ext/sources/examples/talk-llama/models/modern-bert.cpp +109 -0
- data/ext/sources/examples/talk-llama/models/mpt.cpp +126 -0
- data/ext/sources/examples/talk-llama/models/nemotron-h.cpp +162 -0
- data/ext/sources/examples/talk-llama/models/nemotron.cpp +122 -0
- data/ext/sources/examples/talk-llama/models/neo-bert.cpp +104 -0
- data/ext/sources/examples/talk-llama/models/olmo.cpp +121 -0
- data/ext/sources/examples/talk-llama/models/olmo2.cpp +150 -0
- data/ext/sources/examples/talk-llama/models/olmoe.cpp +124 -0
- data/ext/sources/examples/talk-llama/models/openai-moe-iswa.cpp +127 -0
- data/ext/sources/examples/talk-llama/models/openelm.cpp +124 -0
- data/ext/sources/examples/talk-llama/models/orion.cpp +123 -0
- data/ext/sources/examples/talk-llama/models/paddleocr.cpp +122 -0
- data/ext/sources/examples/talk-llama/models/pangu-embedded.cpp +121 -0
- data/ext/sources/examples/talk-llama/models/phi2.cpp +121 -0
- data/ext/sources/examples/talk-llama/models/phi3.cpp +152 -0
- data/ext/sources/examples/talk-llama/models/plamo.cpp +110 -0
- data/ext/sources/examples/talk-llama/models/plamo2.cpp +320 -0
- data/ext/sources/examples/talk-llama/models/plamo3.cpp +128 -0
- data/ext/sources/examples/talk-llama/models/plm.cpp +169 -0
- data/ext/sources/examples/talk-llama/models/qwen.cpp +108 -0
- data/ext/sources/examples/talk-llama/models/qwen2.cpp +126 -0
- data/ext/sources/examples/talk-llama/models/qwen2moe.cpp +151 -0
- data/ext/sources/examples/talk-llama/models/qwen2vl.cpp +117 -0
- data/ext/sources/examples/talk-llama/models/qwen3.cpp +120 -0
- data/ext/sources/examples/talk-llama/models/qwen35.cpp +381 -0
- data/ext/sources/examples/talk-llama/models/qwen35moe.cpp +422 -0
- data/ext/sources/examples/talk-llama/models/qwen3moe.cpp +131 -0
- data/ext/sources/examples/talk-llama/models/qwen3next.cpp +525 -0
- data/ext/sources/examples/talk-llama/models/qwen3vl-moe.cpp +140 -0
- data/ext/sources/examples/talk-llama/models/qwen3vl.cpp +132 -0
- data/ext/sources/examples/talk-llama/models/refact.cpp +94 -0
- data/ext/sources/examples/talk-llama/models/rnd1.cpp +126 -0
- data/ext/sources/examples/talk-llama/models/rwkv6-base.cpp +164 -0
- data/ext/sources/examples/talk-llama/models/rwkv6.cpp +94 -0
- data/ext/sources/examples/talk-llama/models/rwkv6qwen2.cpp +86 -0
- data/ext/sources/examples/talk-llama/models/rwkv7-base.cpp +137 -0
- data/ext/sources/examples/talk-llama/models/rwkv7.cpp +90 -0
- data/ext/sources/examples/talk-llama/models/seed-oss.cpp +124 -0
- data/ext/sources/examples/talk-llama/models/smallthinker.cpp +126 -0
- data/ext/sources/examples/talk-llama/models/smollm3.cpp +128 -0
- data/ext/sources/examples/talk-llama/models/stablelm.cpp +146 -0
- data/ext/sources/examples/talk-llama/models/starcoder.cpp +100 -0
- data/ext/sources/examples/talk-llama/models/starcoder2.cpp +121 -0
- data/ext/sources/examples/talk-llama/models/step35-iswa.cpp +165 -0
- data/ext/sources/examples/talk-llama/models/t5-dec.cpp +166 -0
- data/ext/sources/examples/talk-llama/models/t5-enc.cpp +96 -0
- data/ext/sources/examples/talk-llama/models/wavtokenizer-dec.cpp +149 -0
- data/ext/sources/examples/talk-llama/models/xverse.cpp +108 -0
- data/ext/sources/examples/talk-llama/unicode.cpp +121 -79
- data/ext/sources/examples/vad-speech-segments/CMakeLists.txt +1 -1
- data/ext/sources/examples/whisper.wasm/index-tmpl.html +1 -1
- data/ext/sources/ggml/CMakeLists.txt +90 -56
- data/ext/sources/ggml/include/ggml-alloc.h +9 -0
- data/ext/sources/ggml/include/ggml-backend.h +5 -2
- data/ext/sources/ggml/include/ggml-cann.h +1 -1
- data/ext/sources/ggml/include/ggml-cpu.h +6 -0
- data/ext/sources/ggml/include/ggml-hexagon.h +19 -0
- data/ext/sources/ggml/include/ggml-openvino.h +37 -0
- data/ext/sources/ggml/include/ggml-opt.h +1 -1
- data/ext/sources/ggml/include/ggml-rpc.h +14 -12
- data/ext/sources/ggml/include/ggml-virtgpu.h +14 -0
- data/ext/sources/ggml/include/ggml-zendnn.h +22 -0
- data/ext/sources/ggml/include/ggml.h +246 -21
- data/ext/sources/ggml/src/CMakeLists.txt +85 -11
- data/ext/sources/ggml/src/ggml-alloc.c +128 -50
- data/ext/sources/ggml/src/ggml-backend-dl.cpp +48 -0
- data/ext/sources/ggml/src/ggml-backend-dl.h +45 -0
- data/ext/sources/ggml/src/ggml-backend-impl.h +1 -4
- data/ext/sources/ggml/src/ggml-backend-reg.cpp +54 -88
- data/ext/sources/ggml/src/ggml-backend.cpp +76 -23
- data/ext/sources/ggml/src/ggml-blas/CMakeLists.txt +18 -4
- data/ext/sources/ggml/src/ggml-blas/ggml-blas.cpp +11 -11
- data/ext/sources/ggml/src/ggml-cann/acl_tensor.cpp +58 -46
- data/ext/sources/ggml/src/ggml-cann/acl_tensor.h +139 -48
- data/ext/sources/ggml/src/ggml-cann/aclnn_ops.cpp +2427 -1785
- data/ext/sources/ggml/src/ggml-cann/aclnn_ops.h +238 -362
- data/ext/sources/ggml/src/ggml-cann/common.h +285 -211
- data/ext/sources/ggml/src/ggml-cann/ggml-cann.cpp +663 -831
- data/ext/sources/ggml/src/ggml-common.h +11 -0
- data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +170 -95
- data/ext/sources/ggml/src/ggml-cpu/amx/amx.cpp +42 -18
- data/ext/sources/ggml/src/ggml-cpu/amx/common.h +34 -10
- data/ext/sources/ggml/src/ggml-cpu/amx/mmq.cpp +85 -85
- data/ext/sources/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +4 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/arm/quants.c +513 -27
- data/ext/sources/ggml/src/ggml-cpu/arch/arm/repack.cpp +4192 -992
- data/ext/sources/ggml/src/ggml-cpu/arch/loongarch/quants.c +4 -5
- data/ext/sources/ggml/src/ggml-cpu/arch/riscv/cpu-feats.cpp +38 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/riscv/quants.c +1761 -49
- data/ext/sources/ggml/src/ggml-cpu/arch/riscv/repack.cpp +1391 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/s390/cpu-feats.cpp +50 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/s390/quants.c +8 -10
- data/ext/sources/ggml/src/ggml-cpu/arch/x86/quants.c +9 -9
- data/ext/sources/ggml/src/ggml-cpu/arch/x86/repack.cpp +124 -24
- data/ext/sources/ggml/src/ggml-cpu/arch-fallback.h +157 -28
- data/ext/sources/ggml/src/ggml-cpu/binary-ops.cpp +2 -6
- data/ext/sources/ggml/src/ggml-cpu/common.h +8 -0
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-impl.h +8 -3
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +251 -80
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.cpp +19 -0
- data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.cpp +587 -119
- data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.h +33 -44
- data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +1093 -194
- data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1284 -203
- data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.h +6 -0
- data/ext/sources/ggml/src/ggml-cpu/ops.cpp +1519 -527
- data/ext/sources/ggml/src/ggml-cpu/ops.h +6 -4
- data/ext/sources/ggml/src/ggml-cpu/quants.c +40 -0
- data/ext/sources/ggml/src/ggml-cpu/quants.h +3 -0
- data/ext/sources/ggml/src/ggml-cpu/repack.cpp +3632 -781
- data/ext/sources/ggml/src/ggml-cpu/repack.h +129 -4
- data/ext/sources/ggml/src/ggml-cpu/simd-gemm.h +136 -0
- data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +152 -46
- data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.cpp +3 -2
- data/ext/sources/ggml/src/ggml-cpu/unary-ops.cpp +152 -1
- data/ext/sources/ggml/src/ggml-cpu/unary-ops.h +7 -0
- data/ext/sources/ggml/src/ggml-cpu/vec.cpp +140 -0
- data/ext/sources/ggml/src/ggml-cpu/vec.h +261 -146
- data/ext/sources/ggml/src/ggml-cuda/CMakeLists.txt +72 -1
- data/ext/sources/ggml/src/ggml-cuda/argmax.cu +2 -2
- data/ext/sources/ggml/src/ggml-cuda/argsort.cu +132 -6
- data/ext/sources/ggml/src/ggml-cuda/argsort.cuh +16 -0
- data/ext/sources/ggml/src/ggml-cuda/binbcast.cu +33 -31
- data/ext/sources/ggml/src/ggml-cuda/common.cuh +474 -85
- data/ext/sources/ggml/src/ggml-cuda/convert.cu +41 -27
- data/ext/sources/ggml/src/ggml-cuda/convert.cuh +10 -0
- data/ext/sources/ggml/src/ggml-cuda/cpy-utils.cuh +1 -1
- data/ext/sources/ggml/src/ggml-cuda/cpy.cu +342 -246
- data/ext/sources/ggml/src/ggml-cuda/cpy.cuh +1 -5
- data/ext/sources/ggml/src/ggml-cuda/cumsum.cu +307 -0
- data/ext/sources/ggml/src/ggml-cuda/cumsum.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/diag.cu +77 -0
- data/ext/sources/ggml/src/ggml-cuda/diag.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +98 -74
- data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +973 -665
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cu +35 -741
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cuh +1255 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn-vec.cuh +33 -40
- data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +40 -18
- data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cuh +48 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn.cu +206 -45
- data/ext/sources/ggml/src/ggml-cuda/fill.cu +37 -0
- data/ext/sources/ggml/src/ggml-cuda/fill.cuh +3 -0
- data/ext/sources/ggml/src/ggml-cuda/gated_delta_net.cu +263 -0
- data/ext/sources/ggml/src/ggml-cuda/gated_delta_net.cuh +4 -0
- data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +1688 -302
- data/ext/sources/ggml/src/ggml-cuda/mean.cu +12 -10
- data/ext/sources/ggml/src/ggml-cuda/mma.cuh +908 -48
- data/ext/sources/ggml/src/ggml-cuda/mmf.cu +88 -20
- data/ext/sources/ggml/src/ggml-cuda/mmf.cuh +502 -90
- data/ext/sources/ggml/src/ggml-cuda/mmid.cu +164 -0
- data/ext/sources/ggml/src/ggml-cuda/mmid.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/mmq.cu +69 -176
- data/ext/sources/ggml/src/ggml-cuda/mmq.cuh +532 -193
- data/ext/sources/ggml/src/ggml-cuda/mmvf.cu +460 -104
- data/ext/sources/ggml/src/ggml-cuda/mmvf.cuh +5 -2
- data/ext/sources/ggml/src/ggml-cuda/mmvq.cu +360 -122
- data/ext/sources/ggml/src/ggml-cuda/mmvq.cuh +2 -1
- data/ext/sources/ggml/src/ggml-cuda/norm.cu +18 -76
- data/ext/sources/ggml/src/ggml-cuda/pad.cu +73 -39
- data/ext/sources/ggml/src/ggml-cuda/quantize.cu +152 -1
- data/ext/sources/ggml/src/ggml-cuda/quantize.cuh +14 -0
- data/ext/sources/ggml/src/ggml-cuda/reduce_rows.cuh +2 -16
- data/ext/sources/ggml/src/ggml-cuda/rope.cu +364 -149
- data/ext/sources/ggml/src/ggml-cuda/rope.cuh +2 -0
- data/ext/sources/ggml/src/ggml-cuda/set-rows.cu +101 -47
- data/ext/sources/ggml/src/ggml-cuda/set.cu +39 -0
- data/ext/sources/ggml/src/ggml-cuda/set.cuh +7 -0
- data/ext/sources/ggml/src/ggml-cuda/softmax.cu +163 -41
- data/ext/sources/ggml/src/ggml-cuda/solve_tri.cu +275 -0
- data/ext/sources/ggml/src/ggml-cuda/solve_tri.cuh +3 -0
- data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cu +68 -50
- data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cuh +1 -1
- data/ext/sources/ggml/src/ggml-cuda/ssm-scan.cu +49 -84
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_32.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_4.cu +1 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_32.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_4.cu +1 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_4.cu +1 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_4.cu +1 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq112-dv112.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq128-dv128.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq256-dv256.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq40-dv40.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq576-dv512.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq64-dv64.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq72-dv72.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq80-dv80.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq96-dv96.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +22 -4
- data/ext/sources/ggml/src/ggml-cuda/top-k.cu +95 -0
- data/ext/sources/ggml/src/ggml-cuda/top-k.cuh +3 -0
- data/ext/sources/ggml/src/ggml-cuda/topk-moe.cu +275 -119
- data/ext/sources/ggml/src/ggml-cuda/topk-moe.cuh +20 -7
- data/ext/sources/ggml/src/ggml-cuda/tri.cu +136 -0
- data/ext/sources/ggml/src/ggml-cuda/tri.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/unary.cu +160 -11
- data/ext/sources/ggml/src/ggml-cuda/unary.cuh +38 -0
- data/ext/sources/ggml/src/ggml-cuda/upscale.cu +163 -7
- data/ext/sources/ggml/src/ggml-cuda/vecdotq.cuh +31 -17
- data/ext/sources/ggml/src/ggml-cuda/vendors/cuda.h +4 -0
- data/ext/sources/ggml/src/ggml-cuda/vendors/hip.h +22 -1
- data/ext/sources/ggml/src/ggml-cuda/vendors/musa.h +6 -0
- data/ext/sources/ggml/src/ggml-hexagon/CMakeLists.txt +117 -0
- data/ext/sources/ggml/src/ggml-hexagon/ggml-hexagon.cpp +3325 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/CMakeLists.txt +46 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/act-ops.c +813 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/argsort-ops.c +281 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/binary-ops.c +891 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/cmake-toolchain.cmake +157 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/cpy-ops.c +252 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/flash-attn-ops.c +713 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/get-rows-ops.c +112 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hex-dma.c +63 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hex-dma.h +182 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hex-dump.h +77 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hex-fastdiv.h +37 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hex-utils.h +51 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ctx.h +35 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp-msg.h +155 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ops.h +63 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp_iface.idl +16 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-arith.h +443 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-base.h +240 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-copy.h +245 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-div.h +251 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-dump.h +129 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-exp.h +215 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-floor.h +100 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-inverse.h +210 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-reduce.h +296 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-scale.h +133 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sigmoid.h +141 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sqrt.h +126 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-types.h +36 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-utils.h +26 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/main.c +1199 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/matmul-ops.c +2670 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/rope-ops.c +497 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/set-rows-ops.c +168 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/softmax-ops.c +419 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/ssm-conv.c +339 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/sum-rows-ops.c +128 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/unary-ops.c +382 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/worker-pool.c +293 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/worker-pool.h +57 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp-drv.cpp +418 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp-drv.h +121 -0
- data/ext/sources/ggml/src/ggml-hexagon/libdl.h +79 -0
- data/ext/sources/ggml/src/ggml-hexagon/libggml-htp.inf +38 -0
- data/ext/sources/ggml/src/ggml-hexagon/op-desc.h +153 -0
- data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +14 -13
- data/ext/sources/ggml/src/ggml-impl.h +129 -6
- data/ext/sources/ggml/src/ggml-metal/CMakeLists.txt +10 -10
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-common.cpp +15 -4
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.h +8 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.m +173 -34
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.cpp +912 -344
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.h +124 -59
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.m +588 -144
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +396 -23
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.cpp +1724 -421
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.h +16 -3
- data/ext/sources/ggml/src/ggml-metal/ggml-metal.cpp +333 -114
- data/ext/sources/ggml/src/ggml-metal/ggml-metal.metal +3050 -1539
- data/ext/sources/ggml/src/ggml-musa/CMakeLists.txt +3 -1
- data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +30 -1
- data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +4279 -497
- data/ext/sources/ggml/src/ggml-opencl/kernels/concat.cl +41 -99
- data/ext/sources/ggml/src/ggml-opencl/kernels/cpy.cl +45 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/cumsum.cl +139 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/cvt.cl +267 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/diag.cl +27 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/exp.cl +125 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/expm1.cl +113 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/fill.cl +17 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/flash_attn_f32.cl +4 -3
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_mxfp4_f32.cl +162 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q4_1_f32.cl +132 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_mxfp4_f32.cl +156 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_general_q8_0_f32.cl +195 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q4_1_f32.cl +283 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/get_rows.cl +36 -12
- data/ext/sources/ggml/src/ggml-opencl/kernels/l2_norm.cl +71 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mean.cl +140 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_kq_kqv.cl +273 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_l4_lm.cl +24 -10
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_f32_f32_l4_lm.cl +24 -10
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q4_0_f32_l4_lm.cl +163 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q4_1_f32_l4_lm.cl +165 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q6_k_f32_l4_lm.cl +158 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_8x4.cl +129 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_l4_lm.cl +154 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32.cl +219 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32_flat.cl +229 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_k_f32.cl +180 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/{mul_mv_q6_k.cl → mul_mv_q6_k_f32.cl} +4 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32_flat.cl +194 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/neg.cl +125 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/pad.cl +29 -20
- data/ext/sources/ggml/src/ggml-opencl/kernels/repeat.cl +31 -32
- data/ext/sources/ggml/src/ggml-opencl/kernels/rms_norm.cl +25 -10
- data/ext/sources/ggml/src/ggml-opencl/kernels/rope.cl +50 -24
- data/ext/sources/ggml/src/ggml-opencl/kernels/scale.cl +14 -4
- data/ext/sources/ggml/src/ggml-opencl/kernels/set_rows.cl +35 -16
- data/ext/sources/ggml/src/ggml-opencl/kernels/softplus.cl +116 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/solve_tri.cl +51 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/sqr.cl +53 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/sqrt.cl +53 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/ssm_conv.cl +77 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/sum_rows.cl +114 -13
- data/ext/sources/ggml/src/ggml-opencl/kernels/tanh.cl +94 -48
- data/ext/sources/ggml/src/ggml-opencl/kernels/transpose.cl +39 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/tri.cl +32 -0
- data/ext/sources/ggml/src/ggml-openvino/.clang-format +154 -0
- data/ext/sources/ggml/src/ggml-openvino/CMakeLists.txt +22 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-decoder.cpp +975 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-decoder.h +294 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-openvino-extra.cpp +373 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-openvino-extra.h +182 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-openvino.cpp +1110 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-quants.cpp +884 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-quants.h +153 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/decoder.h +74 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/frontend.cpp +27 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/frontend.h +23 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/input_model.cpp +17 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/input_model.h +29 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/node_context.h +112 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/cont.cpp +48 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/cpy.cpp +21 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp +90 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/get_rows.cpp +69 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/glu_geglu.cpp +61 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp +62 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/mulmat.cpp +90 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/permute.cpp +102 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/reshape.cpp +83 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp +46 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/rope.cpp +123 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/scale.cpp +41 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/set_rows.cpp +76 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/softmax.cpp +89 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/transpose.cpp +23 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/unary_silu.cpp +27 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/view.cpp +53 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op_table.cpp +46 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op_table.h +39 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.cpp +123 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.h +17 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.cpp +60 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.h +17 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/mark_decompression_convert_constant_folding.h +29 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/squeeze_matmul.cpp +58 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/squeeze_matmul.h +17 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/translate_session.cpp +293 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/translate_session.h +28 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/utils.cpp +226 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/utils.h +85 -0
- data/ext/sources/ggml/src/ggml-openvino/utils.cpp +823 -0
- data/ext/sources/ggml/src/ggml-openvino/utils.h +123 -0
- data/ext/sources/ggml/src/ggml-quants.c +96 -5
- data/ext/sources/ggml/src/ggml-quants.h +3 -0
- data/ext/sources/ggml/src/ggml-rpc/ggml-rpc.cpp +438 -156
- data/ext/sources/ggml/src/ggml-sycl/CMakeLists.txt +59 -87
- data/ext/sources/ggml/src/ggml-sycl/add-id.cpp +81 -0
- data/ext/sources/ggml/src/ggml-sycl/add-id.hpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/backend.hpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/binbcast.cpp +21 -29
- data/ext/sources/ggml/src/ggml-sycl/binbcast.hpp +0 -6
- data/ext/sources/ggml/src/ggml-sycl/common.hpp +427 -20
- data/ext/sources/ggml/src/ggml-sycl/concat.cpp +55 -44
- data/ext/sources/ggml/src/ggml-sycl/convert.cpp +103 -1
- data/ext/sources/ggml/src/ggml-sycl/convert.hpp +22 -1
- data/ext/sources/ggml/src/ggml-sycl/count-equal.cpp +79 -0
- data/ext/sources/ggml/src/ggml-sycl/count-equal.hpp +9 -0
- data/ext/sources/ggml/src/ggml-sycl/cpy.cpp +0 -3
- data/ext/sources/ggml/src/ggml-sycl/dequantize.hpp +18 -0
- data/ext/sources/ggml/src/ggml-sycl/dpct/helper.hpp +867 -50
- data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +401 -358
- data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +12 -2
- data/ext/sources/ggml/src/ggml-sycl/fattn-common.hpp +1179 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn-tile.cpp +55 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn-tile.hpp +1338 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn-vec.hpp +667 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn.cpp +225 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn.hpp +22 -0
- data/ext/sources/ggml/src/ggml-sycl/gated_delta_net.cpp +309 -0
- data/ext/sources/ggml/src/ggml-sycl/gated_delta_net.hpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/ggml-sycl.cpp +645 -155
- data/ext/sources/ggml/src/ggml-sycl/mmvq.cpp +22 -0
- data/ext/sources/ggml/src/ggml-sycl/norm.cpp +221 -66
- data/ext/sources/ggml/src/ggml-sycl/norm.hpp +2 -0
- data/ext/sources/ggml/src/ggml-sycl/outprod.cpp +3 -3
- data/ext/sources/ggml/src/ggml-sycl/pad.cpp +97 -0
- data/ext/sources/ggml/src/ggml-sycl/pad.hpp +24 -0
- data/ext/sources/ggml/src/ggml-sycl/pad_reflect_1d.cpp +100 -0
- data/ext/sources/ggml/src/ggml-sycl/pad_reflect_1d.hpp +10 -0
- data/ext/sources/ggml/src/ggml-sycl/presets.hpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/quants.hpp +1 -1
- data/ext/sources/ggml/src/ggml-sycl/repeat_back.cpp +76 -0
- data/ext/sources/ggml/src/ggml-sycl/repeat_back.hpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/roll.cpp +122 -0
- data/ext/sources/ggml/src/ggml-sycl/roll.hpp +20 -0
- data/ext/sources/ggml/src/ggml-sycl/rope.cpp +457 -281
- data/ext/sources/ggml/src/ggml-sycl/rope.hpp +6 -0
- data/ext/sources/ggml/src/ggml-sycl/set.cpp +73 -0
- data/ext/sources/ggml/src/ggml-sycl/set.hpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/softmax.cpp +327 -162
- data/ext/sources/ggml/src/ggml-sycl/softmax.hpp +4 -0
- data/ext/sources/ggml/src/ggml-sycl/ssm_conv.cpp +127 -0
- data/ext/sources/ggml/src/ggml-sycl/ssm_conv.hpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq112-dv112.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq128-dv128.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq256-dv256.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq40-dv40.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq576-dv512.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq64-dv64.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq72-dv72.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq80-dv80.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq96-dv96.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-f16.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q8_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-f16.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q8_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-f16.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q8_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-f16.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q8_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-f16.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q8_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-f16.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q8_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +71 -0
- data/ext/sources/ggml/src/ggml-sycl/wkv.cpp +1 -1
- data/ext/sources/ggml/src/ggml-virtgpu/CMakeLists.txt +70 -0
- data/ext/sources/ggml/src/ggml-virtgpu/apir_cs_ggml-rpc-front.cpp +87 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/CMakeLists.txt +21 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/apir_cs_ggml-rpc-back.cpp +115 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-convert.h +13 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-backend.cpp +102 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer-type.cpp +105 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer.cpp +179 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-device.cpp +148 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched.cpp +51 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched.gen.h +73 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched.h +27 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-virgl-apir.h +32 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend.cpp +144 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/api_remoting.h +95 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_backend.gen.h +94 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_backend.h +50 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_cs.h +378 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_cs_ggml.h +232 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_cs_rpc.h +58 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-buffer-type.cpp +81 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-buffer.cpp +119 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-device.cpp +158 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-reg.cpp +213 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend.cpp +69 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-remoting.h +71 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggmlremoting_functions.yaml +166 -0
- data/ext/sources/ggml/src/ggml-virtgpu/include/apir_hw.h +9 -0
- data/ext/sources/ggml/src/ggml-virtgpu/regenerate_remoting.py +333 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-apir.h +15 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-backend.cpp +58 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-buffer-type.cpp +110 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-buffer.cpp +173 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-device.cpp +192 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-impl.h +36 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward.gen.h +53 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-shm.cpp +98 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-shm.h +23 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-utils.cpp +179 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-utils.h +86 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu.cpp +544 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu.h +117 -0
- data/ext/sources/ggml/src/ggml-vulkan/CMakeLists.txt +39 -19
- data/ext/sources/ggml/src/ggml-vulkan/ggml-vulkan.cpp +5994 -3055
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/abs.comp +21 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/acc.comp +18 -10
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add1.comp +28 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add_id.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/arange.comp +20 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argmax.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp +33 -26
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argsort_large.comp +114 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/ceil.comp +22 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/clamp.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/concat.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/contig_copy.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_dw.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +47 -49
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +3 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +4 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_transpose.comp +67 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/cos.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/count_equal.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/count_experts.comp +51 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/cumsum.comp +83 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/cumsum_multipass1.comp +60 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/cumsum_multipass2.comp +66 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_f32.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{dequant_funcs.comp → dequant_funcs.glsl} +9 -21
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{dequant_funcs_cm2.comp → dequant_funcs_cm2.glsl} +18 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{dequant_head.comp → dequant_head.glsl} +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_m.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_s.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_s.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xs.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xxs.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_s.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_xxs.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_nl.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_xs.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_mxfp4.comp +3 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q2_k.comp +3 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q3_k.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_0.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_1.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_k.comp +3 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_0.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_1.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_k.comp +3 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q6_k.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q8_0.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/diag.comp +29 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/diag_mask_inf.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/div.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/elu.comp +27 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/exp.comp +3 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/fill.comp +19 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +386 -160
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{flash_attn_base.comp → flash_attn_base.glsl} +82 -20
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +400 -174
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +123 -37
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_mask_opt.comp +162 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +10 -9
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/floor.comp +22 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gated_delta_net.comp +128 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu_erf.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu_quick.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gelu.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gelu_erf.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gelu_quick.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{generic_binary_head.comp → generic_binary_head.glsl} +17 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{generic_head.comp → generic_head.glsl} +2 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{generic_unary_head.comp → generic_unary_head.glsl} +7 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/get_rows.comp +4 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/get_rows_quant.comp +3 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{glu_head.comp → glu_head.glsl} +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/group_norm.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/hardsigmoid.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/hardswish.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +19 -7
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col_3d.comp +2 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/l2_norm.comp +13 -10
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/leaky_relu.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/log.comp +18 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{mul_mat_vec_base.comp → mul_mat_vec_base.glsl} +77 -29
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iface.glsl +35 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_m.comp +71 -21
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_s.comp +41 -25
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_s.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xs.comp +44 -26
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xxs.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_s.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_xxs.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_nc.comp +9 -7
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_p021.comp +9 -7
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp +4 -6
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q3_k.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp +4 -6
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp +4 -6
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp +39 -36
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq_funcs.glsl +494 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +88 -105
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +41 -26
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{mul_mm_funcs.comp → mul_mm_funcs.glsl} +69 -59
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_id_funcs.glsl +74 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +92 -230
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.glsl +454 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_shmem_types.glsl +78 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp +97 -13
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/neg.comp +20 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/norm.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_adamw.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_sgd.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/pad.comp +21 -6
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/pool2d.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/quantize_q8_1.comp +10 -10
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/reglu.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/relu.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/repeat.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/repeat_back.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +49 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_back.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_partials.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/roll.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl +207 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.glsl +20 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +8 -49
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +8 -32
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +8 -32
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl +33 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_vision.comp +8 -38
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/round.comp +29 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sgn.comp +21 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sigmoid.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/silu.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/silu_back.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sin.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_back.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large1.comp +62 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large2.comp +79 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large3.comp +65 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large_common.glsl +53 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/softplus.comp +23 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/solve_tri.comp +81 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sqrt.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/square.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/ssm_conv.comp +50 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/ssm_scan.comp +124 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/step.comp +22 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sub.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.comp +2 -25
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.glsl +25 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/swiglu.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/swiglu_oai.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/tanh.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/timestep_embedding.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/topk_argsort.comp +118 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/topk_moe.comp +213 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/topk_nary_search.comp +246 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/tri.comp +43 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/trunc.comp +22 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{types.comp → types.glsl} +345 -26
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +90 -12
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +384 -180
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/xielu.comp +35 -0
- data/ext/sources/ggml/src/ggml-webgpu/CMakeLists.txt +28 -2
- data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp +1374 -0
- data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu.cpp +2544 -726
- data/ext/sources/ggml/src/ggml-webgpu/pre_wgsl.hpp +778 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/argmax.wgsl +72 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/argsort.wgsl +106 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/argsort_merge.wgsl +134 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary.wgsl +141 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +65 -72
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/concat.wgsl +75 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cpy.tmpl.wgsl +107 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cumsum.wgsl +66 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +73 -15
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn.wgsl +636 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{get_rows.tmpl.wgsl → get_rows.wgsl} +53 -259
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/glu.tmpl.wgsl +323 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{mul_mat.tmpl.wgsl → mul_mat.wgsl} +72 -261
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl +766 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.wgsl +147 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_subgroup_matrix.wgsl +196 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.wgsl +480 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/pad.wgsl +86 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/repeat.wgsl +67 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm.wgsl +83 -17
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rope.tmpl.wgsl +295 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/scale.wgsl +63 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +40 -12
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/soft_max.tmpl.wgsl +345 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/sum_rows.wgsl +55 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/unary.wgsl +193 -0
- data/ext/sources/ggml/src/ggml-zdnn/ggml-zdnn.cpp +6 -1
- data/ext/sources/ggml/src/ggml-zendnn/CMakeLists.txt +91 -0
- data/ext/sources/ggml/src/ggml-zendnn/ggml-zendnn.cpp +469 -0
- data/ext/sources/ggml/src/ggml.c +590 -64
- data/ext/sources/ggml/src/gguf.cpp +229 -44
- data/ext/sources/include/whisper.h +1 -0
- data/ext/sources/src/CMakeLists.txt +3 -1
- data/ext/sources/src/whisper.cpp +106 -62
- data/ext/sources/tests/CMakeLists.txt +2 -2
- data/ext/sources/tests/test-vad-full.cpp +4 -2
- data/ext/sources/tests/test-vad.cpp +1 -1
- data/extsources.rb +1 -0
- data/lib/whisper/model/uri.rb +17 -18
- data/sig/whisper.rbs +162 -4
- data/test/test_context_params.rb +82 -0
- data/test/test_params.rb +16 -8
- data/test/test_segment.rb +0 -1
- data/test/test_token.rb +81 -0
- data/test/test_vad.rb +1 -1
- data/test/test_vad_context.rb +100 -0
- data/test/test_vad_segment.rb +19 -0
- data/test/test_vad_segments.rb +16 -0
- data/test/test_whisper.rb +27 -0
- data/whispercpp.gemspec +1 -1
- metadata +502 -37
- data/ext/sources/build-xcframework.sh +0 -571
- data/ext/sources/examples/talk-llama/llama-sampling.h +0 -32
- data/ext/sources/ggml/cmake/BuildTypes.cmake +0 -54
- data/ext/sources/ggml/src/ggml-cann/Doxyfile +0 -2579
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.comp +0 -105
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.comp +0 -55
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/add.tmpl.wgsl +0 -44
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/add_in_place.tmpl.wgsl +0 -41
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary_head.tmpl +0 -45
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl +0 -60
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul.tmpl.wgsl +0 -44
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_in_place.tmpl.wgsl +0 -41
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm_in_place.wgsl +0 -48
- /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{test_bfloat16_support.comp → feature-tests/bfloat16.comp} +0 -0
- /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{test_coopmat_support.comp → feature-tests/coopmat.comp} +0 -0
- /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{test_coopmat2_support.comp → feature-tests/coopmat2.comp} +0 -0
- /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{test_integer_dot_support.comp → feature-tests/integer_dot.comp} +0 -0
- /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{glu_main.comp → glu_main.glsl} +0 -0
- /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{rte.comp → rte.glsl} +0 -0
- /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{utils.comp → utils.glsl} +0 -0
|
@@ -1,12 +1,13 @@
|
|
|
1
1
|
#import "ggml-metal-device.h"
|
|
2
2
|
|
|
3
3
|
#import "ggml-impl.h"
|
|
4
|
-
#import "ggml-threading.h"
|
|
5
4
|
|
|
6
5
|
#include <Foundation/Foundation.h>
|
|
7
6
|
|
|
8
7
|
#include <Metal/Metal.h>
|
|
9
8
|
|
|
9
|
+
#include <stdatomic.h>
|
|
10
|
+
|
|
10
11
|
#ifndef TARGET_OS_VISION
|
|
11
12
|
#define TARGET_OS_VISION 0
|
|
12
13
|
#endif
|
|
@@ -19,8 +20,9 @@
|
|
|
19
20
|
#define GGML_METAL_HAS_RESIDENCY_SETS 1
|
|
20
21
|
#endif
|
|
21
22
|
|
|
22
|
-
// overload of
|
|
23
|
+
// overload of MTLGPUFamilyMetalX (not available in some environments)
|
|
23
24
|
static const NSInteger MTLGPUFamilyMetal3_GGML = 5001;
|
|
25
|
+
static const NSInteger MTLGPUFamilyMetal4_GGML = 5002;
|
|
24
26
|
|
|
25
27
|
#if !GGML_METAL_EMBED_LIBRARY
|
|
26
28
|
// Here to assist with NSBundle Path Hack
|
|
@@ -69,14 +71,6 @@ void ggml_metal_cv_set_bool(ggml_metal_cv_t cv, bool value, int32_t idx) {
|
|
|
69
71
|
|
|
70
72
|
struct ggml_metal_pipeline {
|
|
71
73
|
id<MTLComputePipelineState> obj;
|
|
72
|
-
|
|
73
|
-
// suggested dispatch sizes
|
|
74
|
-
int nsg;
|
|
75
|
-
|
|
76
|
-
int nr0;
|
|
77
|
-
int nr1;
|
|
78
|
-
|
|
79
|
-
size_t smem;
|
|
80
74
|
};
|
|
81
75
|
|
|
82
76
|
ggml_metal_pipeline_t ggml_metal_pipeline_init(void) {
|
|
@@ -84,10 +78,6 @@ ggml_metal_pipeline_t ggml_metal_pipeline_init(void) {
|
|
|
84
78
|
|
|
85
79
|
*res = (struct ggml_metal_pipeline) {
|
|
86
80
|
/*.obj =*/ nil,
|
|
87
|
-
/*.nsg =*/ 0,
|
|
88
|
-
/*.nr0 =*/ 0,
|
|
89
|
-
/*.nr1 =*/ 0,
|
|
90
|
-
/*.smem =*/ 0,
|
|
91
81
|
};
|
|
92
82
|
|
|
93
83
|
return res;
|
|
@@ -99,40 +89,8 @@ void ggml_metal_pipeline_free(ggml_metal_pipeline_t pipeline) {
|
|
|
99
89
|
free(pipeline);
|
|
100
90
|
}
|
|
101
91
|
|
|
102
|
-
|
|
103
|
-
pipeline->
|
|
104
|
-
}
|
|
105
|
-
|
|
106
|
-
int ggml_metal_pipeline_get_nsg(ggml_metal_pipeline_t pipeline) {
|
|
107
|
-
return pipeline->nsg;
|
|
108
|
-
}
|
|
109
|
-
|
|
110
|
-
void ggml_metal_pipeline_set_nr0(ggml_metal_pipeline_t pipeline, int nr0) {
|
|
111
|
-
pipeline->nr0 = nr0;
|
|
112
|
-
}
|
|
113
|
-
|
|
114
|
-
int ggml_metal_pipeline_get_nr0(ggml_metal_pipeline_t pipeline) {
|
|
115
|
-
return pipeline->nr0;
|
|
116
|
-
}
|
|
117
|
-
|
|
118
|
-
void ggml_metal_pipeline_set_nr1(ggml_metal_pipeline_t pipeline, int nr1) {
|
|
119
|
-
pipeline->nr1 = nr1;
|
|
120
|
-
}
|
|
121
|
-
|
|
122
|
-
int ggml_metal_pipeline_get_nr1(ggml_metal_pipeline_t pipeline) {
|
|
123
|
-
return pipeline->nr1;
|
|
124
|
-
}
|
|
125
|
-
|
|
126
|
-
void ggml_metal_pipeline_set_smem(ggml_metal_pipeline_t pipeline, size_t smem) {
|
|
127
|
-
pipeline->smem = smem;
|
|
128
|
-
}
|
|
129
|
-
|
|
130
|
-
size_t ggml_metal_pipeline_get_smem(ggml_metal_pipeline_t pipeline) {
|
|
131
|
-
return pipeline->smem;
|
|
132
|
-
}
|
|
133
|
-
|
|
134
|
-
int ggml_metal_pipeline_max_theads_per_threadgroup(ggml_metal_pipeline_t pipeline) {
|
|
135
|
-
return pipeline->obj.maxTotalThreadsPerThreadgroup;
|
|
92
|
+
// Reports the maximum total number of threads per threadgroup of the Metal compute
// pipeline state wrapped by `pipeline`, converted to int.
int ggml_metal_pipeline_max_theads_per_threadgroup(struct ggml_metal_pipeline_with_params pipeline) {
    id<MTLComputePipelineState> pso = pipeline.pipeline->obj;

    return pso.maxTotalThreadsPerThreadgroup;
}
|
|
137
95
|
|
|
138
96
|
struct ggml_metal_library {
|
|
@@ -140,6 +98,8 @@ struct ggml_metal_library {
|
|
|
140
98
|
id<MTLDevice> device;
|
|
141
99
|
|
|
142
100
|
ggml_metal_pipelines_t pipelines; // cache of compiled pipelines
|
|
101
|
+
|
|
102
|
+
NSLock * lock;
|
|
143
103
|
};
|
|
144
104
|
|
|
145
105
|
ggml_metal_library_t ggml_metal_library_init(ggml_metal_device_t dev) {
|
|
@@ -256,6 +216,10 @@ ggml_metal_library_t ggml_metal_library_init(ggml_metal_device_t dev) {
|
|
|
256
216
|
[prep setObject:@"1" forKey:@"GGML_METAL_HAS_BF16"];
|
|
257
217
|
}
|
|
258
218
|
|
|
219
|
+
if (ggml_metal_device_get_props(dev)->has_tensor) {
|
|
220
|
+
[prep setObject:@"1" forKey:@"GGML_METAL_HAS_TENSOR"];
|
|
221
|
+
}
|
|
222
|
+
|
|
259
223
|
#if GGML_METAL_EMBED_LIBRARY
|
|
260
224
|
[prep setObject:@"1" forKey:@"GGML_METAL_EMBED_LIBRARY"];
|
|
261
225
|
#endif
|
|
@@ -286,9 +250,77 @@ ggml_metal_library_t ggml_metal_library_init(ggml_metal_device_t dev) {
|
|
|
286
250
|
|
|
287
251
|
ggml_metal_library_t res = calloc(1, sizeof(struct ggml_metal_library));
|
|
288
252
|
|
|
289
|
-
res->obj
|
|
290
|
-
res->device
|
|
253
|
+
res->obj = library;
|
|
254
|
+
res->device = device;
|
|
255
|
+
res->pipelines = ggml_metal_pipelines_init();
|
|
256
|
+
res->lock = [NSLock new];
|
|
257
|
+
|
|
258
|
+
return res;
|
|
259
|
+
}
|
|
260
|
+
|
|
261
|
+
// Compiles a Metal library from in-memory shader source.
//
//   dev     - device wrapper; its MTLDevice performs the compilation
//   source  - NUL-terminated shader source, decoded as UTF-8
//   verbose - when true, log the full compiler error text and the compile time
//
// Returns a newly allocated library wrapper, or NULL when `source` is NULL, cannot be
// decoded, the compiler reports an error, or the wrapper allocation fails.
ggml_metal_library_t ggml_metal_library_init_from_source(ggml_metal_device_t dev, const char * source, bool verbose) {
    if (source == NULL) {
        GGML_LOG_ERROR("%s: source is NULL\n", __func__);
        return NULL;
    }

    id<MTLDevice> device = ggml_metal_device_get_obj(dev);
    id<MTLLibrary> library = nil;

    const int64_t t_start = ggml_time_us();

    NSString * src = [[NSString alloc] initWithBytes:source
                                              length:strlen(source)
                                            encoding:NSUTF8StringEncoding];
    if (!src) {
        GGML_LOG_ERROR("%s: failed to create NSString from source\n", __func__);
        return NULL;
    }

    @autoreleasepool {
        // the error object is only valid while this pool is alive, so keep it scoped here
        NSError * error = nil;

        NSMutableDictionary * prep = [NSMutableDictionary dictionary];

        MTLCompileOptions * options = [MTLCompileOptions new];
        options.preprocessorMacros = prep;

        library = [device newLibraryWithSource:src options:options error:&error];
        if (error) {
            if (verbose) {
                GGML_LOG_ERROR("%s: error compiling source: %s\n", __func__, [[error description] UTF8String]);
            } else {
                GGML_LOG_ERROR("%s: error compiling source\n", __func__);
            }

            // any reported error is treated as a failure; if a library object was still
            // returned we own it (+1 from new...), so drop it instead of leaking it
            [library release];
            library = nil;
        }

        [options release];
    }

    [src release];

    if (!library) {
        if (verbose) {
            GGML_LOG_ERROR("%s: failed to create Metal library from source\n", __func__);
        }

        return NULL;
    }

    if (verbose) {
        GGML_LOG_INFO("%s: compiled in %.3f sec\n", __func__, (ggml_time_us() - t_start) / 1e6);
    }

    ggml_metal_library_t res = calloc(1, sizeof(struct ggml_metal_library));
    if (!res) {
        GGML_LOG_ERROR("%s: calloc failed\n", __func__);

        // nobody will take ownership of the compiled library - release it
        [library release];

        return NULL;
    }

    res->obj       = library;
    res->device    = device;
    res->pipelines = ggml_metal_pipelines_init();
    res->lock      = [NSLock new];

    return res;
}
|
|
@@ -304,26 +336,51 @@ void ggml_metal_library_free(ggml_metal_library_t lib) {
|
|
|
304
336
|
|
|
305
337
|
ggml_metal_pipelines_free(lib->pipelines);
|
|
306
338
|
|
|
339
|
+
[lib->lock release];
|
|
340
|
+
|
|
307
341
|
free(lib);
|
|
308
342
|
}
|
|
309
343
|
|
|
310
|
-
|
|
311
|
-
|
|
344
|
+
// Fetches a pipeline from the library's cache by name.
//
// The cache lookup runs while the library lock is held. The result carries whatever
// ggml_metal_pipelines_get returned in `pipeline`; every other field is zero/false.
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline(ggml_metal_library_t lib, const char * name) {
    struct ggml_metal_pipeline_with_params res = {
        /*.pipeline =*/ nil,
        /*.nsg      =*/ 0,
        /*.nr0      =*/ 0,
        /*.nr1      =*/ 0,
        /*.smem     =*/ 0,
        /*.c4       =*/ false,
        /*.cnt      =*/ false,
    };

    [lib->lock lock];
    res.pipeline = ggml_metal_pipelines_get(lib->pipelines, name);
    [lib->lock unlock];

    return res;
}
|
|
313
363
|
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
364
|
+
struct ggml_metal_pipeline_with_params ggml_metal_library_compile_pipeline(ggml_metal_library_t lib, const char * base, const char * name, ggml_metal_cv_t cv) {
|
|
365
|
+
struct ggml_metal_pipeline_with_params res = {
|
|
366
|
+
/*.pipeline =*/ nil,
|
|
367
|
+
/*.nsg =*/ 0,
|
|
368
|
+
/*.nr0 =*/ 0,
|
|
369
|
+
/*.nr1 =*/ 0,
|
|
370
|
+
/*.smem =*/ 0,
|
|
371
|
+
/*.c4 =*/ false,
|
|
372
|
+
/*.cnt =*/ false,
|
|
373
|
+
};
|
|
374
|
+
|
|
375
|
+
[lib->lock lock];
|
|
317
376
|
|
|
318
|
-
|
|
319
|
-
if (res) {
|
|
320
|
-
|
|
377
|
+
res.pipeline = ggml_metal_pipelines_get(lib->pipelines, name);
|
|
378
|
+
if (res.pipeline) {
|
|
379
|
+
[lib->lock unlock];
|
|
321
380
|
|
|
322
381
|
return res;
|
|
323
382
|
}
|
|
324
383
|
|
|
325
|
-
res = ggml_metal_pipeline_init();
|
|
326
|
-
|
|
327
384
|
@autoreleasepool {
|
|
328
385
|
NSError * error = nil;
|
|
329
386
|
|
|
@@ -338,28 +395,53 @@ ggml_metal_pipeline_t ggml_metal_library_compile_pipeline(ggml_metal_library_t l
|
|
|
338
395
|
mtl_function = [lib->obj newFunctionWithName:base_func constantValues:cv->obj error:&error];
|
|
339
396
|
}
|
|
340
397
|
if (!mtl_function) {
|
|
341
|
-
|
|
398
|
+
[lib->lock unlock];
|
|
342
399
|
|
|
343
|
-
GGML_LOG_ERROR("%s:
|
|
400
|
+
GGML_LOG_ERROR("%s: failed to compile pipeline: base = '%s', name = '%s'\n", __func__, base, name);
|
|
344
401
|
if (error) {
|
|
345
|
-
GGML_LOG_ERROR("%s:
|
|
402
|
+
GGML_LOG_ERROR("%s: %s\n", __func__, [[error description] UTF8String]);
|
|
346
403
|
}
|
|
347
404
|
|
|
348
|
-
return
|
|
405
|
+
return res;
|
|
349
406
|
}
|
|
350
407
|
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
ggml_metal_pipelines_add(lib->pipelines, name, res);
|
|
408
|
+
id<MTLComputePipelineState> obj = [lib->device newComputePipelineStateWithFunction:mtl_function error:&error];
|
|
354
409
|
|
|
355
410
|
[mtl_function release];
|
|
356
411
|
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
|
|
412
|
+
if (!obj) {
|
|
413
|
+
[lib->lock unlock];
|
|
414
|
+
|
|
415
|
+
GGML_LOG_ERROR("%s: failed to create pipeline state: base = '%s', name = '%s'\n", __func__, base, name);
|
|
416
|
+
if (error) {
|
|
417
|
+
GGML_LOG_ERROR("%s: %s\n", __func__, [[error description] UTF8String]);
|
|
418
|
+
}
|
|
419
|
+
|
|
420
|
+
return res;
|
|
421
|
+
}
|
|
422
|
+
|
|
423
|
+
GGML_LOG_DEBUG("%s: loaded %-40s %16p | th_max = %4d | th_width = %4d\n", __func__, name,
|
|
424
|
+
(void *) obj,
|
|
425
|
+
(int) obj.maxTotalThreadsPerThreadgroup,
|
|
426
|
+
(int) obj.threadExecutionWidth);
|
|
427
|
+
|
|
428
|
+
if (obj.maxTotalThreadsPerThreadgroup == 0 || obj.threadExecutionWidth == 0) {
|
|
429
|
+
[obj release];
|
|
430
|
+
|
|
431
|
+
[lib->lock unlock];
|
|
432
|
+
|
|
433
|
+
GGML_LOG_ERROR("%s: incompatible pipeline %s\n", __func__, name);
|
|
434
|
+
|
|
435
|
+
return res;
|
|
436
|
+
}
|
|
437
|
+
|
|
438
|
+
res.pipeline = ggml_metal_pipeline_init();
|
|
439
|
+
res.pipeline->obj = obj;
|
|
440
|
+
|
|
441
|
+
ggml_metal_pipelines_add(lib->pipelines, name, res.pipeline);
|
|
360
442
|
}
|
|
361
443
|
|
|
362
|
-
|
|
444
|
+
[lib->lock unlock];
|
|
363
445
|
|
|
364
446
|
return res;
|
|
365
447
|
}
|
|
@@ -401,8 +483,8 @@ void ggml_metal_encoder_debug_group_pop (ggml_metal_encoder_t encoder) {
|
|
|
401
483
|
[encoder->obj popDebugGroup];
|
|
402
484
|
}
|
|
403
485
|
|
|
404
|
-
void ggml_metal_encoder_set_pipeline(ggml_metal_encoder_t encoder,
|
|
405
|
-
[encoder->obj setComputePipelineState:pipeline->obj];
|
|
486
|
+
// Binds the compute pipeline state wrapped by `pipeline` to the encoder.
void ggml_metal_encoder_set_pipeline(ggml_metal_encoder_t encoder, struct ggml_metal_pipeline_with_params pipeline) {
    id<MTLComputePipelineState> pso = pipeline.pipeline->obj;

    [encoder->obj setComputePipelineState:pso];
}
|
|
407
489
|
|
|
408
490
|
void ggml_metal_encoder_set_bytes(ggml_metal_encoder_t encoder, void * data, size_t size, int idx) {
|
|
@@ -437,12 +519,110 @@ struct ggml_metal_device {
|
|
|
437
519
|
// ref: https://github.com/ggml-org/llama.cpp/pull/15906
|
|
438
520
|
id<MTLCommandQueue> mtl_queue;
|
|
439
521
|
|
|
522
|
+
ggml_metal_rsets_t rsets;
|
|
523
|
+
|
|
440
524
|
ggml_metal_library_t library;
|
|
441
525
|
|
|
442
526
|
struct ggml_metal_device_props props;
|
|
527
|
+
|
|
528
|
+
// virtual address for GPU memory allocations
|
|
529
|
+
atomic_uintptr_t addr_virt;
|
|
530
|
+
};
|
|
531
|
+
|
|
532
|
+
//
|
|
533
|
+
// MTLResidencySet wrapper
|
|
534
|
+
//
|
|
535
|
+
|
|
536
|
+
struct ggml_metal_rsets {
    // held while `data` is read or modified
    NSLock * lock;

    // the residency sets currently registered in the collection
    NSMutableArray * data;

    // number of seconds since the last graph computation
    // keep the residency sets wired for that amount of time to avoid being collected by the OS
    int keep_alive_s;

    // background heartbeat thread to keep the residency sets alive:
    //  - d_stop:  set to true to make the heartbeat loop exit
    //  - d_loop:  remaining heartbeat iterations that still request residency
    //  - d_group: dispatch group used to wait for the heartbeat to finish
    atomic_bool d_stop;
    atomic_int  d_loop;

    dispatch_group_t d_group;
};
|
|
444
551
|
|
|
445
|
-
|
|
552
|
+
// Creates an empty residency set collection and launches its background heartbeat.
//
// The keep-alive window is 3 minutes unless GGML_METAL_RESIDENCY_KEEP_ALIVE_S provides a
// positive number of seconds. Release the result with ggml_metal_rsets_free.
ggml_metal_rsets_t ggml_metal_rsets_init(void) {
    ggml_metal_rsets_t res = calloc(1, sizeof(struct ggml_metal_rsets));

    res->lock = [[NSLock alloc] init];
    res->data = [[NSMutableArray alloc] init];

    // by default keep the memory wired for 3 minutes
    const int keep_alive_default_s = 3*60;

    // optional override from the environment; non-positive values fall back to the default
    const char * env_keep_alive = getenv("GGML_METAL_RESIDENCY_KEEP_ALIVE_S");

    const int keep_alive_req_s = env_keep_alive ? atoi(env_keep_alive) : keep_alive_default_s;

    res->keep_alive_s = keep_alive_req_s > 0 ? keep_alive_req_s : keep_alive_default_s;

    GGML_LOG_INFO("%s: creating a residency set collection (keep_alive = %d s)\n", __func__, res->keep_alive_s);

    // the heartbeat sleeps half a second per iteration, hence 2 iterations per second of keep-alive
    atomic_store_explicit(&res->d_stop, false, memory_order_relaxed);
    atomic_store_explicit(&res->d_loop, 2*res->keep_alive_s, memory_order_relaxed);

    res->d_group = dispatch_group_create();

    // background task: periodically request residency for every set in the collection
    // the requests pause once the countdown (d_loop) reaches zero, i.e. after keep_alive_s of inactivity
    dispatch_group_async(res->d_group, dispatch_get_global_queue(QOS_CLASS_DEFAULT, 0), ^{
#if defined(GGML_METAL_HAS_RESIDENCY_SETS)
        if (@available(macOS 15.0, iOS 18.0, tvOS 18.0, visionOS 2.0, *)) {
            while (!atomic_load_explicit(&res->d_stop, memory_order_relaxed)) {
                const bool active = atomic_load_explicit(&res->d_loop, memory_order_relaxed) > 0;

                if (active) {
                    [res->lock lock];

                    for (id rset in res->data) {
                        [rset requestResidency];
                    }

                    atomic_fetch_sub_explicit(&res->d_loop, 1, memory_order_relaxed);

                    [res->lock unlock];
                }

                // half a second
                usleep(500 * 1000);
            }
        }
#endif
    });

    return res;
}
|
|
605
|
+
|
|
606
|
+
// Stops the heartbeat task and destroys the residency set collection.
// Passing NULL is a no-op. The collection must already be empty.
void ggml_metal_rsets_free(ggml_metal_rsets_t rsets) {
    if (!rsets) {
        return;
    }

    // note: if you hit this assert, most likely you haven't deallocated all Metal resources before exiting
    GGML_ASSERT([rsets->data count] == 0);

    // ask the heartbeat loop to exit and wait until the background task has finished
    atomic_store_explicit(&rsets->d_stop, true, memory_order_relaxed);
    dispatch_group_wait(rsets->d_group, DISPATCH_TIME_FOREVER);

    dispatch_release(rsets->d_group);

    [rsets->data release];
    [rsets->lock release];

    free(rsets);
}
|
|
624
|
+
|
|
625
|
+
ggml_metal_device_t ggml_metal_device_init(int device) {
|
|
446
626
|
ggml_metal_device_t dev = calloc(1, sizeof(struct ggml_metal_device));
|
|
447
627
|
|
|
448
628
|
assert(dev != NULL);
|
|
@@ -456,6 +636,9 @@ ggml_metal_device_t ggml_metal_device_init(void) {
|
|
|
456
636
|
GGML_LOG_ERROR("%s: error: failed to create command queue\n", __func__);
|
|
457
637
|
}
|
|
458
638
|
|
|
639
|
+
dev->addr_virt = 0x000000400ULL;
|
|
640
|
+
|
|
641
|
+
dev->props.device = device;
|
|
459
642
|
dev->props.has_simdgroup_reduction = [dev->mtl_device supportsFamily:MTLGPUFamilyApple7];
|
|
460
643
|
dev->props.has_simdgroup_reduction |= [dev->mtl_device supportsFamily:MTLGPUFamilyMetal3_GGML];
|
|
461
644
|
|
|
@@ -464,6 +647,128 @@ ggml_metal_device_t ggml_metal_device_init(void) {
|
|
|
464
647
|
|
|
465
648
|
dev->props.has_bfloat = [dev->mtl_device supportsFamily:MTLGPUFamilyMetal3_GGML];
|
|
466
649
|
dev->props.has_bfloat |= [dev->mtl_device supportsFamily:MTLGPUFamilyApple6];
|
|
650
|
+
if (getenv("GGML_METAL_BF16_DISABLE") != NULL) {
|
|
651
|
+
dev->props.has_bfloat = false;
|
|
652
|
+
}
|
|
653
|
+
|
|
654
|
+
dev->props.has_tensor = [dev->mtl_device supportsFamily:MTLGPUFamilyMetal4_GGML];
|
|
655
|
+
if (getenv("GGML_METAL_TENSOR_DISABLE") != NULL) {
|
|
656
|
+
dev->props.has_tensor = false;
|
|
657
|
+
}
|
|
658
|
+
|
|
659
|
+
// note: disable the tensor API by default for old chips because with the current implementation it is not useful
|
|
660
|
+
// - M2 Ultra: ~5% slower
|
|
661
|
+
// - M4, M4 Max: no significant difference
|
|
662
|
+
//
|
|
663
|
+
// TODO: try to update the tensor API kernels to at least match the simdgroup performance
|
|
664
|
+
if (getenv("GGML_METAL_TENSOR_ENABLE") == NULL &&
|
|
665
|
+
![[dev->mtl_device name] containsString:@"M5"] &&
|
|
666
|
+
![[dev->mtl_device name] containsString:@"M6"] &&
|
|
667
|
+
![[dev->mtl_device name] containsString:@"A19"] &&
|
|
668
|
+
![[dev->mtl_device name] containsString:@"A20"]) {
|
|
669
|
+
GGML_LOG_WARN("%s: tensor API disabled for pre-M5 and pre-A19 devices\n", __func__);
|
|
670
|
+
dev->props.has_tensor = false;
|
|
671
|
+
}
|
|
672
|
+
|
|
673
|
+
// double-check that the tensor API compiles
|
|
674
|
+
if (dev->props.has_tensor) {
|
|
675
|
+
const char * src_tensor_f16 = "\n"
|
|
676
|
+
"#include <metal_stdlib> \n"
|
|
677
|
+
"#include <metal_tensor> \n"
|
|
678
|
+
"#include <MetalPerformancePrimitives/MetalPerformancePrimitives.h> \n"
|
|
679
|
+
" \n"
|
|
680
|
+
"using namespace metal; \n"
|
|
681
|
+
"using namespace mpp::tensor_ops; \n"
|
|
682
|
+
" \n"
|
|
683
|
+
"kernel void dummy_kernel( \n"
|
|
684
|
+
" tensor<device half, dextents<int32_t, 2>> A [[buffer(0)]], \n"
|
|
685
|
+
" tensor<device half, dextents<int32_t, 2>> B [[buffer(1)]], \n"
|
|
686
|
+
" device float * C [[buffer(2)]], \n"
|
|
687
|
+
" uint2 tgid [[threadgroup_position_in_grid]]) \n"
|
|
688
|
+
"{ \n"
|
|
689
|
+
" auto tA = A.slice(0, (int)tgid.y); \n"
|
|
690
|
+
" auto tB = B.slice((int)tgid.x, 0); \n"
|
|
691
|
+
" \n"
|
|
692
|
+
" matmul2d< \n"
|
|
693
|
+
" matmul2d_descriptor(8, 8, dynamic_extent), \n"
|
|
694
|
+
" execution_simdgroups<4>> mm; \n"
|
|
695
|
+
" \n"
|
|
696
|
+
" auto cT = mm.get_destination_cooperative_tensor<decltype(tA), decltype(tB), float>(); \n"
|
|
697
|
+
" \n"
|
|
698
|
+
" auto sA = tA.slice(0, 0); \n"
|
|
699
|
+
" auto sB = tB.slice(0, 0); \n"
|
|
700
|
+
" mm.run(sB, sA, cT); \n"
|
|
701
|
+
" \n"
|
|
702
|
+
" auto tC = tensor<device float, dextents<int32_t, 2>, tensor_inline>(C, dextents<int32_t, 2>(4, 4)); \n"
|
|
703
|
+
" \n"
|
|
704
|
+
" cT.store(tC); \n"
|
|
705
|
+
"}";
|
|
706
|
+
|
|
707
|
+
GGML_LOG_INFO("%s: testing tensor API for f16 support\n", __func__);
|
|
708
|
+
ggml_metal_library_t lib = ggml_metal_library_init_from_source(dev, src_tensor_f16, false);
|
|
709
|
+
if (lib == NULL) {
|
|
710
|
+
GGML_LOG_WARN("%s: - the tensor API is not supported in this environment - disabling\n", __func__);
|
|
711
|
+
dev->props.has_tensor = false;
|
|
712
|
+
} else {
|
|
713
|
+
struct ggml_metal_pipeline_with_params ppl = ggml_metal_library_compile_pipeline(lib, "dummy_kernel", "dummy_kernel", nil);
|
|
714
|
+
if (!ppl.pipeline) {
|
|
715
|
+
GGML_LOG_WARN("%s: - the tensor API is not supported in this environment - disabling\n", __func__);
|
|
716
|
+
dev->props.has_tensor = false;
|
|
717
|
+
}
|
|
718
|
+
|
|
719
|
+
ggml_metal_library_free(lib);
|
|
720
|
+
}
|
|
721
|
+
}
|
|
722
|
+
|
|
723
|
+
// try to compile a dummy kernel to determine if the tensor API is supported for bfloat
|
|
724
|
+
if (dev->props.has_tensor && dev->props.has_bfloat) {
|
|
725
|
+
const char * src_tensor_bf16 = "\n"
|
|
726
|
+
"#include <metal_stdlib> \n"
|
|
727
|
+
"#include <metal_tensor> \n"
|
|
728
|
+
"#include <MetalPerformancePrimitives/MetalPerformancePrimitives.h> \n"
|
|
729
|
+
" \n"
|
|
730
|
+
"using namespace metal; \n"
|
|
731
|
+
"using namespace mpp::tensor_ops; \n"
|
|
732
|
+
" \n"
|
|
733
|
+
"kernel void dummy_kernel( \n"
|
|
734
|
+
" tensor<device bfloat, dextents<int32_t, 2>> A [[buffer(0)]], \n"
|
|
735
|
+
" tensor<device bfloat, dextents<int32_t, 2>> B [[buffer(1)]], \n"
|
|
736
|
+
" device float * C [[buffer(2)]], \n"
|
|
737
|
+
" uint2 tgid [[threadgroup_position_in_grid]]) \n"
|
|
738
|
+
"{ \n"
|
|
739
|
+
" auto tA = A.slice(0, (int)tgid.y); \n"
|
|
740
|
+
" auto tB = B.slice((int)tgid.x, 0); \n"
|
|
741
|
+
" \n"
|
|
742
|
+
" matmul2d< \n"
|
|
743
|
+
" matmul2d_descriptor(8, 8, dynamic_extent), \n"
|
|
744
|
+
" execution_simdgroups<4>> mm; \n"
|
|
745
|
+
" \n"
|
|
746
|
+
" auto cT = mm.get_destination_cooperative_tensor<decltype(tA), decltype(tB), float>(); \n"
|
|
747
|
+
" \n"
|
|
748
|
+
" auto sA = tA.slice(0, 0); \n"
|
|
749
|
+
" auto sB = tB.slice(0, 0); \n"
|
|
750
|
+
" mm.run(sB, sA, cT); \n"
|
|
751
|
+
" \n"
|
|
752
|
+
" auto tC = tensor<device float, dextents<int32_t, 2>, tensor_inline>(C, dextents<int32_t, 2>(4, 4)); \n"
|
|
753
|
+
" \n"
|
|
754
|
+
" cT.store(tC); \n"
|
|
755
|
+
"}";
|
|
756
|
+
|
|
757
|
+
GGML_LOG_INFO("%s: testing tensor API for bfloat support\n", __func__);
|
|
758
|
+
ggml_metal_library_t lib = ggml_metal_library_init_from_source(dev, src_tensor_bf16, false);
|
|
759
|
+
if (lib == NULL) {
|
|
760
|
+
GGML_LOG_WARN("%s: - the tensor API does not support bfloat - disabling bfloat support\n", __func__);
|
|
761
|
+
dev->props.has_bfloat = false;
|
|
762
|
+
} else {
|
|
763
|
+
struct ggml_metal_pipeline_with_params ppl = ggml_metal_library_compile_pipeline(lib, "dummy_kernel", "dummy_kernel", nil);
|
|
764
|
+
if (!ppl.pipeline) {
|
|
765
|
+
GGML_LOG_WARN("%s: - the tensor API does not support bfloat - disabling bfloat support\n", __func__);
|
|
766
|
+
dev->props.has_bfloat = false;
|
|
767
|
+
}
|
|
768
|
+
|
|
769
|
+
ggml_metal_library_free(lib);
|
|
770
|
+
}
|
|
771
|
+
}
|
|
467
772
|
|
|
468
773
|
dev->props.use_residency_sets = true;
|
|
469
774
|
#if defined(GGML_METAL_HAS_RESIDENCY_SETS)
|
|
@@ -471,25 +776,42 @@ ggml_metal_device_t ggml_metal_device_init(void) {
|
|
|
471
776
|
#endif
|
|
472
777
|
|
|
473
778
|
dev->props.use_shared_buffers = dev->props.has_unified_memory;
|
|
474
|
-
|
|
779
|
+
#if TARGET_OS_OSX
|
|
780
|
+
// In case of eGPU, shared memory may be preferable.
|
|
781
|
+
dev->props.use_shared_buffers |= [dev->mtl_device location] == MTLDeviceLocationExternal;
|
|
782
|
+
#endif
|
|
475
783
|
if (getenv("GGML_METAL_SHARED_BUFFERS_DISABLE") != NULL) {
|
|
476
784
|
dev->props.use_shared_buffers = false;
|
|
477
785
|
}
|
|
786
|
+
if (getenv("GGML_METAL_SHARED_BUFFERS_ENABLE") != NULL) {
|
|
787
|
+
dev->props.use_shared_buffers = true;
|
|
788
|
+
}
|
|
478
789
|
|
|
479
790
|
dev->props.supports_gpu_family_apple7 = [dev->mtl_device supportsFamily:MTLGPUFamilyApple7];
|
|
480
791
|
|
|
792
|
+
dev->props.op_offload_min_batch_size = getenv("GGML_OP_OFFLOAD_MIN_BATCH") ? atoi(getenv("GGML_OP_OFFLOAD_MIN_BATCH")) : 32;
|
|
793
|
+
|
|
481
794
|
dev->props.max_buffer_size = dev->mtl_device.maxBufferLength;
|
|
482
|
-
dev->props.max_working_set_size = dev->mtl_device.recommendedMaxWorkingSetSize;
|
|
483
795
|
dev->props.max_theadgroup_memory_size = dev->mtl_device.maxThreadgroupMemoryLength;
|
|
796
|
+
if (@available(macOS 10.12, iOS 16.0, *)) {
|
|
797
|
+
dev->props.max_working_set_size = dev->mtl_device.recommendedMaxWorkingSetSize;
|
|
798
|
+
} else {
|
|
799
|
+
dev->props.max_working_set_size = dev->mtl_device.maxBufferLength;
|
|
800
|
+
}
|
|
484
801
|
|
|
485
|
-
|
|
802
|
+
snprintf(dev->props.name, sizeof(dev->props.name), "%s%d", "MTL", device);
|
|
803
|
+
snprintf(dev->props.desc, sizeof(dev->props.desc), "%s", [[dev->mtl_device name] UTF8String]);
|
|
486
804
|
|
|
487
805
|
dev->library = ggml_metal_library_init(dev);
|
|
488
806
|
if (!dev->library) {
|
|
489
807
|
GGML_LOG_ERROR("%s: error: failed to create library\n", __func__);
|
|
490
808
|
}
|
|
491
809
|
|
|
492
|
-
|
|
810
|
+
if (dev->props.use_residency_sets) {
|
|
811
|
+
dev->rsets = ggml_metal_rsets_init();
|
|
812
|
+
} else {
|
|
813
|
+
dev->rsets = nil;
|
|
814
|
+
}
|
|
493
815
|
|
|
494
816
|
// print MTL GPU family:
|
|
495
817
|
GGML_LOG_INFO("%s: GPU name: %s\n", __func__, dev->props.name);
|
|
@@ -524,6 +846,7 @@ ggml_metal_device_t ggml_metal_device_init(void) {
|
|
|
524
846
|
GGML_LOG_INFO("%s: simdgroup matrix mul. = %s\n", __func__, dev->props.has_simdgroup_mm ? "true" : "false");
|
|
525
847
|
GGML_LOG_INFO("%s: has unified memory = %s\n", __func__, dev->props.has_unified_memory ? "true" : "false");
|
|
526
848
|
GGML_LOG_INFO("%s: has bfloat = %s\n", __func__, dev->props.has_bfloat ? "true" : "false");
|
|
849
|
+
GGML_LOG_INFO("%s: has tensor = %s\n", __func__, dev->props.has_tensor ? "true" : "false");
|
|
527
850
|
GGML_LOG_INFO("%s: use residency sets = %s\n", __func__, dev->props.use_residency_sets ? "true" : "false");
|
|
528
851
|
GGML_LOG_INFO("%s: use shared buffers = %s\n", __func__, dev->props.use_shared_buffers ? "true" : "false");
|
|
529
852
|
|
|
@@ -541,6 +864,8 @@ ggml_metal_device_t ggml_metal_device_init(void) {
|
|
|
541
864
|
void ggml_metal_device_free(ggml_metal_device_t dev) {
|
|
542
865
|
assert(dev != NULL);
|
|
543
866
|
|
|
867
|
+
ggml_metal_rsets_free(dev->rsets);
|
|
868
|
+
|
|
544
869
|
ggml_metal_library_free(dev->library);
|
|
545
870
|
dev->library = NULL;
|
|
546
871
|
|
|
@@ -569,6 +894,95 @@ ggml_metal_library_t ggml_metal_device_get_library(ggml_metal_device_t dev) {
|
|
|
569
894
|
return dev->library;
|
|
570
895
|
}
|
|
571
896
|
|
|
897
|
+
// Registers a residency set with the device's collection (nil sets are ignored).
// The device must have a residency set collection.
void ggml_metal_device_rsets_add(ggml_metal_device_t dev, ggml_metal_rset_t rset) {
    if (!rset) {
        return;
    }

    GGML_ASSERT(dev->rsets);

    ggml_metal_rsets_t rsets = dev->rsets;

    // the collection is shared with the heartbeat task - modify it under the lock
    [rsets->lock lock];
    [rsets->data addObject:rset];
    [rsets->lock unlock];
}
|
|
910
|
+
|
|
911
|
+
// Unregisters a residency set from the device's collection (nil sets are ignored).
// The device must have a residency set collection.
void ggml_metal_device_rsets_rm(ggml_metal_device_t dev, ggml_metal_rset_t rset) {
    if (!rset) {
        return;
    }

    GGML_ASSERT(dev->rsets);

    ggml_metal_rsets_t rsets = dev->rsets;

    // the collection is shared with the heartbeat task - modify it under the lock
    [rsets->lock lock];
    [rsets->data removeObject:rset];
    [rsets->lock unlock];
}
|
|
924
|
+
|
|
925
|
+
// Restarts the keep-alive countdown of the device's residency set collection.
// Does nothing when the device has no collection.
void ggml_metal_device_rsets_keep_alive(ggml_metal_device_t dev) {
    ggml_metal_rsets_t rsets = dev->rsets;

    if (!rsets) {
        return;
    }

    // two heartbeat iterations per second of keep-alive
    const int n_loop = 2*rsets->keep_alive_s;

    atomic_store_explicit(&rsets->d_loop, n_loop, memory_order_relaxed);
}
|
|
932
|
+
|
|
933
|
+
struct ggml_metal_event {
    // the underlying Metal event, stored as an untyped pointer
    void * obj; // id<MTLEvent>

    // counter of encoded signals: incremented on each signal, read as the value to wait for
    atomic_int value;
};
|
|
938
|
+
|
|
939
|
+
// Encodes a signal of the event into the command buffer.
// The event counter is incremented and the new (post-increment) value is signaled.
void ggml_metal_event_encode_signal(ggml_metal_event_t ev, ggml_metal_cmd_buf_t cmd_buf_raw) {
    id<MTLCommandBuffer> cmd_buf = (id<MTLCommandBuffer>) cmd_buf_raw;
    id<MTLEvent>         event   = (id<MTLEvent>) ev->obj;

    // fetch_add yields the previous value, so add 1 to get the value being signaled
    const int value_next = atomic_fetch_add_explicit(&ev->value, 1, memory_order_relaxed) + 1;

    [cmd_buf encodeSignalEvent:event value:value_next];
}
|
|
946
|
+
|
|
947
|
+
// Encodes a wait on the event into the command buffer.
// The wait targets the current value of the event counter.
void ggml_metal_event_encode_wait(ggml_metal_event_t ev, ggml_metal_cmd_buf_t cmd_buf_raw) {
    id<MTLCommandBuffer> cmd_buf = (id<MTLCommandBuffer>) cmd_buf_raw;
    id<MTLEvent>         event   = (id<MTLEvent>) ev->obj;

    const int value_cur = atomic_load_explicit(&ev->value, memory_order_relaxed);

    [cmd_buf encodeWaitForEvent:event value:value_cur];
}
|
|
954
|
+
|
|
955
|
+
// Creates a new event on the device with its counter set to 0.
//
// Returns the event wrapper (release it with ggml_metal_device_event_free), or NULL when
// the wrapper cannot be allocated.
ggml_metal_event_t ggml_metal_device_event_init(ggml_metal_device_t dev) {
    id<MTLEvent> event = [dev->mtl_device newEvent];

    ggml_metal_event_t ev = calloc(1, sizeof(struct ggml_metal_event));
    if (ev == NULL) {
        GGML_LOG_ERROR("%s: calloc failed\n", __func__);

        // the wrapper would have owned the event - release it so it does not leak
        [event release];

        return NULL;
    }

    // the wrapper keeps the +1 reference returned by newEvent
    ev->obj   = (__bridge void *) event;
    ev->value = 0;

    return ev;
}
|
|
965
|
+
|
|
966
|
+
// Releases the Metal event held by `ev` and frees the wrapper.
// Passing a NULL event is a no-op. `dev` is unused.
void ggml_metal_device_event_free(ggml_metal_device_t dev, ggml_metal_event_t ev) {
    GGML_UNUSED(dev);

    if (ev == NULL) {
        return;
    }

    id<MTLEvent> event = (id<MTLEvent>) ev->obj;
    [event release];

    free(ev);
}
|
|
974
|
+
|
|
975
|
+
// Blocks the calling thread until the event reaches its current counter value.
// This is done by committing a command buffer on the device queue that only waits for the event.
void ggml_metal_device_event_synchronize(ggml_metal_device_t dev, ggml_metal_event_t ev) {
    @autoreleasepool {
        id<MTLCommandBuffer> cmd_buf = [dev->mtl_queue commandBuffer];

        id<MTLEvent> event = ev->obj;

        const int value_cur = atomic_load_explicit(&ev->value, memory_order_relaxed);

        [cmd_buf encodeWaitForEvent:event value:value_cur];
        [cmd_buf commit];
        [cmd_buf waitUntilCompleted];
    }
}
|
|
985
|
+
|
|
572
986
|
void ggml_metal_device_get_memory(ggml_metal_device_t dev, size_t * free, size_t * total) {
|
|
573
987
|
if (@available(macOS 10.12, iOS 16.0, *)) {
|
|
574
988
|
*total = dev->mtl_device.recommendedMaxWorkingSetSize;
|
|
@@ -597,6 +1011,15 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te
|
|
|
597
1011
|
}
|
|
598
1012
|
|
|
599
1013
|
switch (op->op) {
|
|
1014
|
+
case GGML_OP_SCALE:
|
|
1015
|
+
case GGML_OP_FILL:
|
|
1016
|
+
case GGML_OP_CLAMP:
|
|
1017
|
+
case GGML_OP_SQR:
|
|
1018
|
+
case GGML_OP_SQRT:
|
|
1019
|
+
case GGML_OP_SIN:
|
|
1020
|
+
case GGML_OP_COS:
|
|
1021
|
+
case GGML_OP_LOG:
|
|
1022
|
+
return ggml_is_contiguous_rows(op->src[0]) && (op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16);
|
|
600
1023
|
case GGML_OP_UNARY:
|
|
601
1024
|
switch (ggml_get_unary_op(op)) {
|
|
602
1025
|
case GGML_UNARY_OP_TANH:
|
|
@@ -614,7 +1037,9 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te
|
|
|
614
1037
|
case GGML_UNARY_OP_HARDSWISH:
|
|
615
1038
|
case GGML_UNARY_OP_HARDSIGMOID:
|
|
616
1039
|
case GGML_UNARY_OP_EXP:
|
|
617
|
-
|
|
1040
|
+
case GGML_UNARY_OP_SOFTPLUS:
|
|
1041
|
+
case GGML_UNARY_OP_EXPM1:
|
|
1042
|
+
return ggml_is_contiguous_rows(op->src[0]) && (op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16);
|
|
618
1043
|
default:
|
|
619
1044
|
return false;
|
|
620
1045
|
}
|
|
@@ -642,27 +1067,32 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te
|
|
|
642
1067
|
case GGML_OP_MUL:
|
|
643
1068
|
case GGML_OP_DIV:
|
|
644
1069
|
case GGML_OP_ADD_ID:
|
|
645
|
-
return op->src[0]->type == GGML_TYPE_F32;
|
|
646
1070
|
case GGML_OP_ACC:
|
|
1071
|
+
return ggml_is_contiguous_rows(op->src[0]) && ggml_is_contiguous_rows(op->src[1]) && op->src[0]->type == GGML_TYPE_F32;
|
|
647
1072
|
case GGML_OP_REPEAT:
|
|
648
|
-
case GGML_OP_SCALE:
|
|
649
1073
|
case GGML_OP_CONV_TRANSPOSE_1D:
|
|
650
1074
|
return true;
|
|
651
|
-
case
|
|
652
|
-
return op->src[0]->
|
|
653
|
-
|
|
654
|
-
|
|
655
|
-
|
|
656
|
-
case
|
|
657
|
-
|
|
658
|
-
|
|
1075
|
+
case GGML_OP_CONV_TRANSPOSE_2D:
|
|
1076
|
+
return ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]) &&
|
|
1077
|
+
(op->src[0]->type == GGML_TYPE_F16 || op->src[0]->type == GGML_TYPE_F32) &&
|
|
1078
|
+
op->src[1]->type == GGML_TYPE_F32 &&
|
|
1079
|
+
op->type == GGML_TYPE_F32;
|
|
1080
|
+
case GGML_OP_SUM:
|
|
1081
|
+
return has_simdgroup_reduction && ggml_is_contiguous(op->src[0]);
|
|
1082
|
+
case GGML_OP_TRI:
|
|
1083
|
+
return ggml_is_contiguous_rows(op->src[0]);
|
|
659
1084
|
case GGML_OP_SUM_ROWS:
|
|
1085
|
+
case GGML_OP_CUMSUM:
|
|
660
1086
|
case GGML_OP_MEAN:
|
|
661
1087
|
case GGML_OP_SOFT_MAX:
|
|
662
1088
|
case GGML_OP_GROUP_NORM:
|
|
663
|
-
return has_simdgroup_reduction && ggml_is_contiguous_rows(op->src[0]);
|
|
664
1089
|
case GGML_OP_L2_NORM:
|
|
665
|
-
return has_simdgroup_reduction && (op->
|
|
1090
|
+
return has_simdgroup_reduction && ggml_is_contiguous_rows(op->src[0]);
|
|
1091
|
+
case GGML_OP_COUNT_EQUAL:
|
|
1092
|
+
return has_simdgroup_reduction &&
|
|
1093
|
+
op->src[0]->type == GGML_TYPE_I32 &&
|
|
1094
|
+
op->src[1]->type == GGML_TYPE_I32 &&
|
|
1095
|
+
op->type == GGML_TYPE_I64;
|
|
666
1096
|
case GGML_OP_ARGMAX:
|
|
667
1097
|
return has_simdgroup_reduction;
|
|
668
1098
|
case GGML_OP_NORM:
|
|
@@ -672,13 +1102,23 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te
|
|
|
672
1102
|
return true;
|
|
673
1103
|
case GGML_OP_IM2COL:
|
|
674
1104
|
return ggml_is_contiguous(op->src[1]) && op->src[1]->type == GGML_TYPE_F32 && (op->type == GGML_TYPE_F16 || op->type == GGML_TYPE_F32);
|
|
675
|
-
case
|
|
676
|
-
return
|
|
1105
|
+
case GGML_OP_CONV_2D:
|
|
1106
|
+
return ggml_is_contiguous(op->src[0]) &&
|
|
1107
|
+
op->src[1]->type == GGML_TYPE_F32 &&
|
|
1108
|
+
op->type == GGML_TYPE_F32 &&
|
|
1109
|
+
(op->src[0]->type == GGML_TYPE_F16 || op->src[0]->type == GGML_TYPE_F32);
|
|
677
1110
|
case GGML_OP_UPSCALE:
|
|
678
|
-
return op->src[0]->type == GGML_TYPE_F32
|
|
1111
|
+
return op->src[0]->type == GGML_TYPE_F32;
|
|
1112
|
+
case GGML_OP_POOL_1D:
|
|
1113
|
+
return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
|
|
679
1114
|
case GGML_OP_POOL_2D:
|
|
680
1115
|
return op->src[0]->type == GGML_TYPE_F32;
|
|
681
1116
|
case GGML_OP_PAD:
|
|
1117
|
+
// TODO: add circular padding support for metal, see https://github.com/ggml-org/llama.cpp/pull/16985
|
|
1118
|
+
if (ggml_get_op_params_i32(op, 8) != 0) {
|
|
1119
|
+
return false;
|
|
1120
|
+
}
|
|
1121
|
+
|
|
682
1122
|
return (ggml_get_op_params_i32(op, 0) == 0) && (ggml_get_op_params_i32(op, 2) == 0) &&
|
|
683
1123
|
(ggml_get_op_params_i32(op, 4) == 0) && (ggml_get_op_params_i32(op, 6) == 0);
|
|
684
1124
|
case GGML_OP_PAD_REFLECT_1D:
|
|
@@ -686,25 +1126,24 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te
|
|
|
686
1126
|
case GGML_OP_LEAKY_RELU:
|
|
687
1127
|
return op->src[0]->type == GGML_TYPE_F32;
|
|
688
1128
|
case GGML_OP_ARGSORT:
|
|
689
|
-
|
|
690
|
-
return op->src[0]->ne[0] <= 1024;
|
|
1129
|
+
case GGML_OP_TOP_K:
|
|
691
1130
|
case GGML_OP_ARANGE:
|
|
692
1131
|
return true;
|
|
693
1132
|
case GGML_OP_FLASH_ATTN_EXT:
|
|
694
1133
|
// for new head sizes, add checks here
|
|
695
|
-
if (op->src[0]->ne[0] !=
|
|
1134
|
+
if (op->src[0]->ne[0] != 32 &&
|
|
1135
|
+
op->src[0]->ne[0] != 40 &&
|
|
1136
|
+
op->src[0]->ne[0] != 48 &&
|
|
696
1137
|
op->src[0]->ne[0] != 64 &&
|
|
1138
|
+
op->src[0]->ne[0] != 72 &&
|
|
697
1139
|
op->src[0]->ne[0] != 80 &&
|
|
698
1140
|
op->src[0]->ne[0] != 96 &&
|
|
699
1141
|
op->src[0]->ne[0] != 112 &&
|
|
700
1142
|
op->src[0]->ne[0] != 128 &&
|
|
701
1143
|
op->src[0]->ne[0] != 192 &&
|
|
702
|
-
op->src[0]->ne[0] != 256
|
|
703
|
-
|
|
704
|
-
|
|
705
|
-
if (op->src[0]->ne[0] == 576) {
|
|
706
|
-
// DeepSeek sizes
|
|
707
|
-
// TODO: disabled for now, until optmized
|
|
1144
|
+
op->src[0]->ne[0] != 256 &&
|
|
1145
|
+
op->src[0]->ne[0] != 320 &&
|
|
1146
|
+
op->src[0]->ne[0] != 576) {
|
|
708
1147
|
return false;
|
|
709
1148
|
}
|
|
710
1149
|
if (op->src[1]->type != op->src[2]->type) {
|
|
@@ -717,9 +1156,13 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te
|
|
|
717
1156
|
case GGML_OP_RWKV_WKV6:
|
|
718
1157
|
case GGML_OP_RWKV_WKV7:
|
|
719
1158
|
return true;
|
|
1159
|
+
case GGML_OP_GATED_DELTA_NET:
|
|
1160
|
+
return has_simdgroup_reduction && op->src[2]->ne[0] % 32 == 0;
|
|
1161
|
+
case GGML_OP_SOLVE_TRI:
|
|
720
1162
|
case GGML_OP_MUL_MAT:
|
|
721
1163
|
case GGML_OP_MUL_MAT_ID:
|
|
722
|
-
return has_simdgroup_reduction;
|
|
1164
|
+
return has_simdgroup_reduction && op->src[0]->type != GGML_TYPE_NVFP4;
|
|
1165
|
+
case GGML_OP_SET:
|
|
723
1166
|
case GGML_OP_CPY:
|
|
724
1167
|
case GGML_OP_DUP:
|
|
725
1168
|
case GGML_OP_CONT:
|
|
@@ -770,15 +1213,13 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te
|
|
|
770
1213
|
return false;
|
|
771
1214
|
}
|
|
772
1215
|
case GGML_TYPE_I32:
|
|
773
|
-
return op->type == GGML_TYPE_F32;
|
|
1216
|
+
return op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_I32;
|
|
774
1217
|
default:
|
|
775
1218
|
return false;
|
|
776
1219
|
};
|
|
777
1220
|
}
|
|
778
1221
|
case GGML_OP_GET_ROWS:
|
|
779
|
-
|
|
780
|
-
return op->ne[3] == 1;
|
|
781
|
-
}
|
|
1222
|
+
return op->src[0]->type != GGML_TYPE_NVFP4;
|
|
782
1223
|
case GGML_OP_SET_ROWS:
|
|
783
1224
|
{
|
|
784
1225
|
if (op->src[0]->type != GGML_TYPE_F32) {
|
|
@@ -800,6 +1241,11 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te
|
|
|
800
1241
|
return false;
|
|
801
1242
|
};
|
|
802
1243
|
}
|
|
1244
|
+
case GGML_OP_DIAG:
|
|
1245
|
+
return true;
|
|
1246
|
+
case GGML_OP_OPT_STEP_ADAMW:
|
|
1247
|
+
case GGML_OP_OPT_STEP_SGD:
|
|
1248
|
+
return has_simdgroup_reduction;
|
|
803
1249
|
default:
|
|
804
1250
|
return false;
|
|
805
1251
|
}
|
|
@@ -824,7 +1270,7 @@ struct ggml_metal_buffer_wrapper {
|
|
|
824
1270
|
};
|
|
825
1271
|
|
|
826
1272
|
struct ggml_metal_buffer {
|
|
827
|
-
void * all_data;
|
|
1273
|
+
void * all_data;
|
|
828
1274
|
size_t all_size;
|
|
829
1275
|
|
|
830
1276
|
// if false, the Metal buffer data is allocated in private GPU memory and is not shared with the host
|
|
@@ -838,12 +1284,11 @@ struct ggml_metal_buffer {
|
|
|
838
1284
|
bool use_residency_sets;
|
|
839
1285
|
|
|
840
1286
|
// optional MTLResidencySet
|
|
841
|
-
// note: cannot use
|
|
1287
|
+
// note: cannot use explicitly "id<MTLResidencySet>" here because it is not available on certain OSes
|
|
842
1288
|
id rset;
|
|
843
1289
|
|
|
844
|
-
// pointers to global device
|
|
845
|
-
|
|
846
|
-
id<MTLCommandQueue> queue;
|
|
1290
|
+
// pointers to global device
|
|
1291
|
+
ggml_metal_device_t dev;
|
|
847
1292
|
};
|
|
848
1293
|
|
|
849
1294
|
static void ggml_metal_log_allocated_size(id<MTLDevice> device, size_t size_aligned) {
|
|
@@ -886,7 +1331,7 @@ static bool ggml_metal_buffer_rset_init(ggml_metal_buffer_t buf) {
|
|
|
886
1331
|
desc.initialCapacity = buf->n_buffers;
|
|
887
1332
|
|
|
888
1333
|
NSError * error;
|
|
889
|
-
buf->rset = [buf->
|
|
1334
|
+
buf->rset = [buf->dev->mtl_device newResidencySetWithDescriptor:desc error:&error];
|
|
890
1335
|
if (error) {
|
|
891
1336
|
GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
|
|
892
1337
|
[desc release];
|
|
@@ -947,6 +1392,8 @@ static void * ggml_metal_host_malloc(size_t n) {
|
|
|
947
1392
|
ggml_metal_buffer_t ggml_metal_buffer_init(ggml_metal_device_t dev, size_t size, bool shared) {
|
|
948
1393
|
ggml_metal_buffer_t res = calloc(1, sizeof(struct ggml_metal_buffer));
|
|
949
1394
|
|
|
1395
|
+
res->dev = dev;
|
|
1396
|
+
|
|
950
1397
|
const size_t size_page = sysconf(_SC_PAGESIZE);
|
|
951
1398
|
|
|
952
1399
|
size_t size_aligned = size;
|
|
@@ -962,16 +1409,14 @@ ggml_metal_buffer_t ggml_metal_buffer_init(ggml_metal_device_t dev, size_t size,
|
|
|
962
1409
|
if (shared) {
|
|
963
1410
|
res->all_data = ggml_metal_host_malloc(size_aligned);
|
|
964
1411
|
res->is_shared = true;
|
|
965
|
-
res->owned = true;
|
|
966
1412
|
} else {
|
|
967
|
-
//
|
|
968
|
-
res->all_data = (void *)
|
|
1413
|
+
// use virtual address
|
|
1414
|
+
res->all_data = (void *) atomic_fetch_add_explicit(&dev->addr_virt, size_aligned, memory_order_relaxed);
|
|
969
1415
|
res->is_shared = false;
|
|
970
1416
|
}
|
|
971
1417
|
res->all_size = size_aligned;
|
|
972
1418
|
|
|
973
|
-
res->
|
|
974
|
-
res->queue = ggml_metal_device_get_queue(dev);
|
|
1419
|
+
res->owned = true;
|
|
975
1420
|
|
|
976
1421
|
res->n_buffers = 1;
|
|
977
1422
|
|
|
@@ -980,15 +1425,13 @@ ggml_metal_buffer_t ggml_metal_buffer_init(ggml_metal_device_t dev, size_t size,
|
|
|
980
1425
|
res->buffers[0].metal = nil;
|
|
981
1426
|
|
|
982
1427
|
if (size_aligned > 0) {
|
|
983
|
-
if (props_dev->use_shared_buffers &&shared) {
|
|
984
|
-
res->buffers[0].metal = [res->
|
|
1428
|
+
if (props_dev->use_shared_buffers && shared) {
|
|
1429
|
+
res->buffers[0].metal = [res->dev->mtl_device newBufferWithBytesNoCopy:res->all_data
|
|
985
1430
|
length:size_aligned
|
|
986
1431
|
options:MTLResourceStorageModeShared
|
|
987
1432
|
deallocator:nil];
|
|
988
1433
|
} else {
|
|
989
|
-
res->buffers[0].metal = [res->
|
|
990
|
-
|
|
991
|
-
res->all_data = (void *) (res->buffers[0].metal.gpuAddress);
|
|
1434
|
+
res->buffers[0].metal = [res->dev->mtl_device newBufferWithLength:size_aligned options:MTLResourceStorageModePrivate];
|
|
992
1435
|
}
|
|
993
1436
|
}
|
|
994
1437
|
|
|
@@ -1009,6 +1452,8 @@ ggml_metal_buffer_t ggml_metal_buffer_init(ggml_metal_device_t dev, size_t size,
|
|
|
1009
1452
|
return NULL;
|
|
1010
1453
|
}
|
|
1011
1454
|
|
|
1455
|
+
ggml_metal_device_rsets_add(dev, res->rset);
|
|
1456
|
+
|
|
1012
1457
|
//ggml_metal_log_allocated_size(device, size_aligned);
|
|
1013
1458
|
|
|
1014
1459
|
return res;
|
|
@@ -1017,6 +1462,8 @@ ggml_metal_buffer_t ggml_metal_buffer_init(ggml_metal_device_t dev, size_t size,
|
|
|
1017
1462
|
ggml_metal_buffer_t ggml_metal_buffer_map(ggml_metal_device_t dev, void * ptr, size_t size, size_t max_tensor_size) {
|
|
1018
1463
|
ggml_metal_buffer_t res = calloc(1, sizeof(struct ggml_metal_buffer));
|
|
1019
1464
|
|
|
1465
|
+
res->dev = dev;
|
|
1466
|
+
|
|
1020
1467
|
res->all_data = ptr;
|
|
1021
1468
|
res->all_size = size;
|
|
1022
1469
|
|
|
@@ -1039,9 +1486,6 @@ ggml_metal_buffer_t ggml_metal_buffer_map(ggml_metal_device_t dev, void * ptr, s
|
|
|
1039
1486
|
size_aligned += (size_page - (size_aligned % size_page));
|
|
1040
1487
|
}
|
|
1041
1488
|
|
|
1042
|
-
res->device = ggml_metal_device_get_obj(dev);
|
|
1043
|
-
res->queue = ggml_metal_device_get_queue(dev);
|
|
1044
|
-
|
|
1045
1489
|
const struct ggml_metal_device_props * props_dev = ggml_metal_device_get_props(dev);
|
|
1046
1490
|
|
|
1047
1491
|
// the buffer fits into the max buffer size allowed by the device
|
|
@@ -1051,7 +1495,7 @@ ggml_metal_buffer_t ggml_metal_buffer_map(ggml_metal_device_t dev, void * ptr, s
|
|
|
1051
1495
|
res->buffers[res->n_buffers].metal = nil;
|
|
1052
1496
|
|
|
1053
1497
|
if (size_aligned > 0) {
|
|
1054
|
-
res->buffers[res->n_buffers].metal = [res->
|
|
1498
|
+
res->buffers[res->n_buffers].metal = [res->dev->mtl_device newBufferWithBytesNoCopy:ptr length:size_aligned options:MTLResourceStorageModeShared deallocator:nil];
|
|
1055
1499
|
|
|
1056
1500
|
if (res->buffers[res->n_buffers].metal == nil) {
|
|
1057
1501
|
GGML_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0);
|
|
@@ -1060,7 +1504,7 @@ ggml_metal_buffer_t ggml_metal_buffer_map(ggml_metal_device_t dev, void * ptr, s
|
|
|
1060
1504
|
}
|
|
1061
1505
|
}
|
|
1062
1506
|
|
|
1063
|
-
ggml_metal_log_allocated_size(res->
|
|
1507
|
+
ggml_metal_log_allocated_size(res->dev->mtl_device, size_aligned);
|
|
1064
1508
|
|
|
1065
1509
|
++res->n_buffers;
|
|
1066
1510
|
} else {
|
|
@@ -1078,7 +1522,7 @@ ggml_metal_buffer_t ggml_metal_buffer_map(ggml_metal_device_t dev, void * ptr, s
|
|
|
1078
1522
|
res->buffers[res->n_buffers].metal = nil;
|
|
1079
1523
|
|
|
1080
1524
|
if (size_step_aligned > 0) {
|
|
1081
|
-
res->buffers[res->n_buffers].metal = [res->
|
|
1525
|
+
res->buffers[res->n_buffers].metal = [res->dev->mtl_device newBufferWithBytesNoCopy:(void *) ((uint8_t *) ptr + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil];
|
|
1082
1526
|
|
|
1083
1527
|
if (res->buffers[res->n_buffers].metal == nil) {
|
|
1084
1528
|
GGML_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_step_aligned / 1024.0 / 1024.0);
|
|
@@ -1087,7 +1531,7 @@ ggml_metal_buffer_t ggml_metal_buffer_map(ggml_metal_device_t dev, void * ptr, s
|
|
|
1087
1531
|
}
|
|
1088
1532
|
}
|
|
1089
1533
|
|
|
1090
|
-
ggml_metal_log_allocated_size(res->
|
|
1534
|
+
ggml_metal_log_allocated_size(res->dev->mtl_device, size_step_aligned);
|
|
1091
1535
|
|
|
1092
1536
|
if (i + size_step < size) {
|
|
1093
1537
|
GGML_LOG_INFO("\n");
|
|
@@ -1105,10 +1549,14 @@ ggml_metal_buffer_t ggml_metal_buffer_map(ggml_metal_device_t dev, void * ptr, s
|
|
|
1105
1549
|
return NULL;
|
|
1106
1550
|
}
|
|
1107
1551
|
|
|
1552
|
+
ggml_metal_device_rsets_add(dev, res->rset);
|
|
1553
|
+
|
|
1108
1554
|
return res;
|
|
1109
1555
|
}
|
|
1110
1556
|
|
|
1111
1557
|
void ggml_metal_buffer_free(ggml_metal_buffer_t buf) {
|
|
1558
|
+
ggml_metal_device_rsets_rm(buf->dev, buf->rset);
|
|
1559
|
+
|
|
1112
1560
|
for (int i = 0; i < buf->n_buffers; i++) {
|
|
1113
1561
|
[buf->buffers[i].metal release];
|
|
1114
1562
|
}
|
|
@@ -1136,7 +1584,7 @@ bool ggml_metal_buffer_is_shared(ggml_metal_buffer_t buf) {
|
|
|
1136
1584
|
|
|
1137
1585
|
void ggml_metal_buffer_memset_tensor(ggml_metal_buffer_t buf, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
|
|
1138
1586
|
if (buf->is_shared) {
|
|
1139
|
-
memset((char *)tensor->data + offset, value, size);
|
|
1587
|
+
memset((char *) tensor->data + offset, value, size);
|
|
1140
1588
|
return;
|
|
1141
1589
|
}
|
|
1142
1590
|
|
|
@@ -1145,8 +1593,7 @@ void ggml_metal_buffer_memset_tensor(ggml_metal_buffer_t buf, struct ggml_tensor
|
|
|
1145
1593
|
struct ggml_metal_buffer_id bid_dst = ggml_metal_buffer_get_id(buf, tensor);
|
|
1146
1594
|
bid_dst.offs += offset;
|
|
1147
1595
|
|
|
1148
|
-
id<
|
|
1149
|
-
id<MTLCommandBuffer> cmd_buf = [queue commandBufferWithUnretainedReferences];
|
|
1596
|
+
id<MTLCommandBuffer> cmd_buf = [buf->dev->mtl_queue commandBufferWithUnretainedReferences];
|
|
1150
1597
|
|
|
1151
1598
|
{
|
|
1152
1599
|
id<MTLBlitCommandEncoder> encoder = [cmd_buf blitCommandEncoder];
|
|
@@ -1165,14 +1612,14 @@ void ggml_metal_buffer_memset_tensor(ggml_metal_buffer_t buf, struct ggml_tensor
|
|
|
1165
1612
|
|
|
1166
1613
|
void ggml_metal_buffer_set_tensor(ggml_metal_buffer_t buf, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
|
1167
1614
|
if (buf->is_shared) {
|
|
1168
|
-
memcpy((char *)tensor->data + offset, data, size);
|
|
1615
|
+
memcpy((char *) tensor->data + offset, data, size);
|
|
1169
1616
|
return;
|
|
1170
1617
|
}
|
|
1171
1618
|
|
|
1172
1619
|
@autoreleasepool {
|
|
1173
1620
|
// src
|
|
1174
1621
|
void * data_ptr = (void *)(uintptr_t) data; // "const cast" the src data
|
|
1175
|
-
id<MTLBuffer> buf_src = [buf->
|
|
1622
|
+
id<MTLBuffer> buf_src = [buf->dev->mtl_device newBufferWithBytesNoCopy:data_ptr
|
|
1176
1623
|
length:size
|
|
1177
1624
|
options:MTLResourceStorageModeShared
|
|
1178
1625
|
deallocator:nil];
|
|
@@ -1187,8 +1634,7 @@ void ggml_metal_buffer_set_tensor(ggml_metal_buffer_t buf, struct ggml_tensor *
|
|
|
1187
1634
|
// this is alternative to waitUntilCompleted, which should be faster, but don't seem to make much difference
|
|
1188
1635
|
dispatch_semaphore_t completion_semaphore = dispatch_semaphore_create(0);
|
|
1189
1636
|
|
|
1190
|
-
id<
|
|
1191
|
-
id<MTLCommandBuffer> cmd_buf = [queue commandBufferWithUnretainedReferences];
|
|
1637
|
+
id<MTLCommandBuffer> cmd_buf = [buf->dev->mtl_queue commandBufferWithUnretainedReferences];
|
|
1192
1638
|
|
|
1193
1639
|
{
|
|
1194
1640
|
id<MTLBlitCommandEncoder> encoder = [cmd_buf blitCommandEncoder];
|
|
@@ -1220,7 +1666,7 @@ void ggml_metal_buffer_set_tensor(ggml_metal_buffer_t buf, struct ggml_tensor *
|
|
|
1220
1666
|
|
|
1221
1667
|
void ggml_metal_buffer_get_tensor(ggml_metal_buffer_t buf, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
|
1222
1668
|
if (buf->is_shared) {
|
|
1223
|
-
memcpy(data, (const char *)tensor->data + offset, size);
|
|
1669
|
+
memcpy(data, (const char *) tensor->data + offset, size);
|
|
1224
1670
|
return;
|
|
1225
1671
|
}
|
|
1226
1672
|
|
|
@@ -1230,15 +1676,14 @@ void ggml_metal_buffer_get_tensor(ggml_metal_buffer_t buf, const struct ggml_ten
|
|
|
1230
1676
|
bid_src.offs += offset;
|
|
1231
1677
|
|
|
1232
1678
|
// dst
|
|
1233
|
-
id<MTLBuffer> buf_dst = [buf->
|
|
1679
|
+
id<MTLBuffer> buf_dst = [buf->dev->mtl_device newBufferWithBytesNoCopy:data
|
|
1234
1680
|
length:size
|
|
1235
1681
|
options:MTLResourceStorageModeShared
|
|
1236
1682
|
deallocator:nil];
|
|
1237
1683
|
|
|
1238
1684
|
GGML_ASSERT(buf_dst);
|
|
1239
1685
|
|
|
1240
|
-
id<
|
|
1241
|
-
id<MTLCommandBuffer> cmd_buf = [queue commandBufferWithUnretainedReferences];
|
|
1686
|
+
id<MTLCommandBuffer> cmd_buf = [buf->dev->mtl_queue commandBufferWithUnretainedReferences];
|
|
1242
1687
|
|
|
1243
1688
|
{
|
|
1244
1689
|
id<MTLBlitCommandEncoder> encoder = [cmd_buf blitCommandEncoder];
|
|
@@ -1264,8 +1709,7 @@ void ggml_metal_buffer_clear(ggml_metal_buffer_t buf, uint8_t value) {
|
|
|
1264
1709
|
}
|
|
1265
1710
|
|
|
1266
1711
|
@autoreleasepool {
|
|
1267
|
-
id<
|
|
1268
|
-
id<MTLCommandBuffer> cmd_buf = [queue commandBufferWithUnretainedReferences];
|
|
1712
|
+
id<MTLCommandBuffer> cmd_buf = [buf->dev->mtl_queue commandBufferWithUnretainedReferences];
|
|
1269
1713
|
|
|
1270
1714
|
{
|
|
1271
1715
|
id<MTLBlitCommandEncoder> encoder = [cmd_buf blitCommandEncoder];
|