whispercpp 1.3.4 → 1.3.5
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between those package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +60 -43
- data/ext/extconf.rb +2 -2
- data/ext/ruby_whisper.c +14 -2
- data/ext/ruby_whisper.h +39 -0
- data/ext/ruby_whisper_context.c +22 -22
- data/ext/ruby_whisper_model.c +12 -12
- data/ext/ruby_whisper_params.c +47 -23
- data/ext/ruby_whisper_segment.c +84 -19
- data/ext/ruby_whisper_token.c +351 -0
- data/ext/ruby_whisper_transcribe.cpp +1 -1
- data/ext/ruby_whisper_vad_context.c +75 -0
- data/ext/ruby_whisper_vad_context_detect.cpp +50 -0
- data/ext/ruby_whisper_vad_segment.c +139 -0
- data/ext/ruby_whisper_vad_segments.c +106 -0
- data/ext/sources/CMakeLists.txt +4 -1
- data/ext/sources/bindings/javascript/package.json +1 -1
- data/ext/sources/cmake/arm64-apple-clang.cmake +16 -0
- data/ext/sources/cmake/arm64-windows-llvm.cmake +16 -0
- data/ext/sources/cmake/riscv64-spacemit-linux-gnu-gcc.cmake +29 -0
- data/ext/sources/cmake/x64-windows-llvm.cmake +5 -0
- data/ext/sources/examples/addon.node/vad-example.js +2 -2
- data/ext/sources/examples/cli/cli.cpp +121 -112
- data/ext/sources/examples/lsp/CMakeLists.txt +2 -1
- data/ext/sources/examples/quantize/CMakeLists.txt +2 -1
- data/ext/sources/examples/server/server.cpp +10 -11
- data/ext/sources/examples/talk-llama/CMakeLists.txt +5 -1
- data/ext/sources/examples/talk-llama/llama-adapter.cpp +12 -3
- data/ext/sources/examples/talk-llama/llama-adapter.h +7 -1
- data/ext/sources/examples/talk-llama/llama-arch.cpp +2046 -1974
- data/ext/sources/examples/talk-llama/llama-arch.h +67 -2
- data/ext/sources/examples/talk-llama/llama-batch.cpp +75 -33
- data/ext/sources/examples/talk-llama/llama-batch.h +17 -4
- data/ext/sources/examples/talk-llama/llama-chat.cpp +79 -3
- data/ext/sources/examples/talk-llama/llama-chat.h +4 -0
- data/ext/sources/examples/talk-llama/llama-context.cpp +775 -78
- data/ext/sources/examples/talk-llama/llama-context.h +57 -9
- data/ext/sources/examples/talk-llama/llama-cparams.h +1 -0
- data/ext/sources/examples/talk-llama/llama-grammar.cpp +288 -53
- data/ext/sources/examples/talk-llama/llama-grammar.h +22 -1
- data/ext/sources/examples/talk-llama/llama-graph.cpp +381 -64
- data/ext/sources/examples/talk-llama/llama-graph.h +103 -13
- data/ext/sources/examples/talk-llama/llama-hparams.cpp +26 -2
- data/ext/sources/examples/talk-llama/llama-hparams.h +41 -10
- data/ext/sources/examples/talk-llama/llama-impl.cpp +7 -3
- data/ext/sources/examples/talk-llama/llama-impl.h +1 -1
- data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.cpp +5 -3
- data/ext/sources/examples/talk-llama/llama-kv-cache.cpp +145 -65
- data/ext/sources/examples/talk-llama/llama-kv-cache.h +22 -7
- data/ext/sources/examples/talk-llama/llama-kv-cells.h +44 -2
- data/ext/sources/examples/talk-llama/llama-memory-hybrid.cpp +12 -10
- data/ext/sources/examples/talk-llama/llama-memory-recurrent.cpp +32 -19
- data/ext/sources/examples/talk-llama/llama-memory-recurrent.h +2 -2
- data/ext/sources/examples/talk-llama/llama-mmap.cpp +172 -37
- data/ext/sources/examples/talk-llama/llama-mmap.h +8 -3
- data/ext/sources/examples/talk-llama/llama-model-loader.cpp +91 -9
- data/ext/sources/examples/talk-llama/llama-model-loader.h +6 -0
- data/ext/sources/examples/talk-llama/llama-model-saver.cpp +3 -0
- data/ext/sources/examples/talk-llama/llama-model.cpp +1529 -13134
- data/ext/sources/examples/talk-llama/llama-model.h +44 -3
- data/ext/sources/examples/talk-llama/llama-quant.cpp +8 -23
- data/ext/sources/examples/talk-llama/llama-sampling.cpp +1294 -198
- data/ext/sources/examples/talk-llama/llama-sampling.h +19 -7
- data/ext/sources/examples/talk-llama/llama-vocab.cpp +133 -37
- data/ext/sources/examples/talk-llama/llama-vocab.h +45 -40
- data/ext/sources/examples/talk-llama/llama.cpp +729 -2
- data/ext/sources/examples/talk-llama/llama.h +152 -14
- data/ext/sources/examples/talk-llama/models/afmoe.cpp +191 -0
- data/ext/sources/examples/talk-llama/models/apertus.cpp +125 -0
- data/ext/sources/examples/talk-llama/models/arcee.cpp +135 -0
- data/ext/sources/examples/talk-llama/models/arctic.cpp +138 -0
- data/ext/sources/examples/talk-llama/models/arwkv7.cpp +86 -0
- data/ext/sources/examples/talk-llama/models/baichuan.cpp +122 -0
- data/ext/sources/examples/talk-llama/models/bailingmoe.cpp +144 -0
- data/ext/sources/examples/talk-llama/models/bailingmoe2.cpp +135 -0
- data/ext/sources/examples/talk-llama/models/bert.cpp +178 -0
- data/ext/sources/examples/talk-llama/models/bitnet.cpp +160 -0
- data/ext/sources/examples/talk-llama/models/bloom.cpp +101 -0
- data/ext/sources/examples/talk-llama/models/chameleon.cpp +178 -0
- data/ext/sources/examples/talk-llama/models/chatglm.cpp +132 -0
- data/ext/sources/examples/talk-llama/models/codeshell.cpp +111 -0
- data/ext/sources/examples/talk-llama/models/cogvlm.cpp +102 -0
- data/ext/sources/examples/talk-llama/models/cohere2-iswa.cpp +134 -0
- data/ext/sources/examples/talk-llama/models/command-r.cpp +122 -0
- data/ext/sources/examples/talk-llama/models/dbrx.cpp +123 -0
- data/ext/sources/examples/talk-llama/models/deci.cpp +135 -0
- data/ext/sources/examples/talk-llama/models/deepseek.cpp +144 -0
- data/ext/sources/examples/talk-llama/models/deepseek2.cpp +259 -0
- data/ext/sources/examples/talk-llama/models/dots1.cpp +134 -0
- data/ext/sources/examples/talk-llama/models/dream.cpp +105 -0
- data/ext/sources/examples/talk-llama/models/ernie4-5-moe.cpp +150 -0
- data/ext/sources/examples/talk-llama/models/ernie4-5.cpp +110 -0
- data/ext/sources/examples/talk-llama/models/exaone.cpp +114 -0
- data/ext/sources/examples/talk-llama/models/exaone4.cpp +123 -0
- data/ext/sources/examples/talk-llama/models/falcon-h1.cpp +113 -0
- data/ext/sources/examples/talk-llama/models/falcon.cpp +120 -0
- data/ext/sources/examples/talk-llama/models/gemma-embedding.cpp +116 -0
- data/ext/sources/examples/talk-llama/models/gemma.cpp +112 -0
- data/ext/sources/examples/talk-llama/models/gemma2-iswa.cpp +128 -0
- data/ext/sources/examples/talk-llama/models/gemma3.cpp +155 -0
- data/ext/sources/examples/talk-llama/models/gemma3n-iswa.cpp +384 -0
- data/ext/sources/examples/talk-llama/models/glm4-moe.cpp +170 -0
- data/ext/sources/examples/talk-llama/models/glm4.cpp +150 -0
- data/ext/sources/examples/talk-llama/models/gpt2.cpp +105 -0
- data/ext/sources/examples/talk-llama/models/gptneox.cpp +144 -0
- data/ext/sources/examples/talk-llama/models/granite-hybrid.cpp +196 -0
- data/ext/sources/examples/talk-llama/models/granite.cpp +211 -0
- data/ext/sources/examples/talk-llama/models/graph-context-mamba.cpp +283 -0
- data/ext/sources/examples/talk-llama/models/grok.cpp +159 -0
- data/ext/sources/examples/talk-llama/models/grovemoe.cpp +141 -0
- data/ext/sources/examples/talk-llama/models/hunyuan-dense.cpp +132 -0
- data/ext/sources/examples/talk-llama/models/hunyuan-moe.cpp +154 -0
- data/ext/sources/examples/talk-llama/models/internlm2.cpp +120 -0
- data/ext/sources/examples/talk-llama/models/jais.cpp +86 -0
- data/ext/sources/examples/talk-llama/models/jamba.cpp +106 -0
- data/ext/sources/examples/talk-llama/models/lfm2.cpp +175 -0
- data/ext/sources/examples/talk-llama/models/llada-moe.cpp +122 -0
- data/ext/sources/examples/talk-llama/models/llada.cpp +99 -0
- data/ext/sources/examples/talk-llama/models/llama-iswa.cpp +178 -0
- data/ext/sources/examples/talk-llama/models/llama.cpp +168 -0
- data/ext/sources/examples/talk-llama/models/maincoder.cpp +117 -0
- data/ext/sources/examples/talk-llama/models/mamba.cpp +55 -0
- data/ext/sources/examples/talk-llama/models/mimo2-iswa.cpp +123 -0
- data/ext/sources/examples/talk-llama/models/minicpm3.cpp +199 -0
- data/ext/sources/examples/talk-llama/models/minimax-m2.cpp +124 -0
- data/ext/sources/examples/talk-llama/models/mistral3.cpp +160 -0
- data/ext/sources/examples/talk-llama/models/models.h +569 -0
- data/ext/sources/examples/talk-llama/models/modern-bert.cpp +116 -0
- data/ext/sources/examples/talk-llama/models/mpt.cpp +126 -0
- data/ext/sources/examples/talk-llama/models/nemotron-h.cpp +150 -0
- data/ext/sources/examples/talk-llama/models/nemotron.cpp +122 -0
- data/ext/sources/examples/talk-llama/models/neo-bert.cpp +104 -0
- data/ext/sources/examples/talk-llama/models/olmo.cpp +121 -0
- data/ext/sources/examples/talk-llama/models/olmo2.cpp +150 -0
- data/ext/sources/examples/talk-llama/models/olmoe.cpp +124 -0
- data/ext/sources/examples/talk-llama/models/openai-moe-iswa.cpp +127 -0
- data/ext/sources/examples/talk-llama/models/openelm.cpp +124 -0
- data/ext/sources/examples/talk-llama/models/orion.cpp +123 -0
- data/ext/sources/examples/talk-llama/models/pangu-embedded.cpp +121 -0
- data/ext/sources/examples/talk-llama/models/phi2.cpp +121 -0
- data/ext/sources/examples/talk-llama/models/phi3.cpp +152 -0
- data/ext/sources/examples/talk-llama/models/plamo.cpp +110 -0
- data/ext/sources/examples/talk-llama/models/plamo2.cpp +316 -0
- data/ext/sources/examples/talk-llama/models/plamo3.cpp +128 -0
- data/ext/sources/examples/talk-llama/models/plm.cpp +168 -0
- data/ext/sources/examples/talk-llama/models/qwen.cpp +108 -0
- data/ext/sources/examples/talk-llama/models/qwen2.cpp +126 -0
- data/ext/sources/examples/talk-llama/models/qwen2moe.cpp +151 -0
- data/ext/sources/examples/talk-llama/models/qwen2vl.cpp +117 -0
- data/ext/sources/examples/talk-llama/models/qwen3.cpp +117 -0
- data/ext/sources/examples/talk-llama/models/qwen3moe.cpp +124 -0
- data/ext/sources/examples/talk-llama/models/qwen3next.cpp +873 -0
- data/ext/sources/examples/talk-llama/models/qwen3vl-moe.cpp +149 -0
- data/ext/sources/examples/talk-llama/models/qwen3vl.cpp +141 -0
- data/ext/sources/examples/talk-llama/models/refact.cpp +94 -0
- data/ext/sources/examples/talk-llama/models/rnd1.cpp +126 -0
- data/ext/sources/examples/talk-llama/models/rwkv6-base.cpp +162 -0
- data/ext/sources/examples/talk-llama/models/rwkv6.cpp +94 -0
- data/ext/sources/examples/talk-llama/models/rwkv6qwen2.cpp +86 -0
- data/ext/sources/examples/talk-llama/models/rwkv7-base.cpp +135 -0
- data/ext/sources/examples/talk-llama/models/rwkv7.cpp +90 -0
- data/ext/sources/examples/talk-llama/models/seed-oss.cpp +124 -0
- data/ext/sources/examples/talk-llama/models/smallthinker.cpp +126 -0
- data/ext/sources/examples/talk-llama/models/smollm3.cpp +128 -0
- data/ext/sources/examples/talk-llama/models/stablelm.cpp +146 -0
- data/ext/sources/examples/talk-llama/models/starcoder.cpp +100 -0
- data/ext/sources/examples/talk-llama/models/starcoder2.cpp +121 -0
- data/ext/sources/examples/talk-llama/models/t5-dec.cpp +166 -0
- data/ext/sources/examples/talk-llama/models/t5-enc.cpp +96 -0
- data/ext/sources/examples/talk-llama/models/wavtokenizer-dec.cpp +149 -0
- data/ext/sources/examples/talk-llama/models/xverse.cpp +108 -0
- data/ext/sources/examples/talk-llama/unicode.cpp +102 -16
- data/ext/sources/examples/vad-speech-segments/CMakeLists.txt +1 -1
- data/ext/sources/examples/whisper.wasm/index-tmpl.html +1 -1
- data/ext/sources/ggml/CMakeLists.txt +82 -54
- data/ext/sources/ggml/include/ggml-alloc.h +9 -0
- data/ext/sources/ggml/include/ggml-backend.h +4 -1
- data/ext/sources/ggml/include/ggml-cpu.h +1 -0
- data/ext/sources/ggml/include/ggml-hexagon.h +19 -0
- data/ext/sources/ggml/include/ggml-rpc.h +8 -11
- data/ext/sources/ggml/include/ggml-zendnn.h +22 -0
- data/ext/sources/ggml/include/ggml.h +190 -12
- data/ext/sources/ggml/src/CMakeLists.txt +82 -11
- data/ext/sources/ggml/src/ggml-alloc.c +124 -41
- data/ext/sources/ggml/src/ggml-backend-impl.h +1 -4
- data/ext/sources/ggml/src/ggml-backend-reg.cpp +27 -3
- data/ext/sources/ggml/src/ggml-backend.cpp +71 -21
- data/ext/sources/ggml/src/ggml-blas/CMakeLists.txt +17 -3
- data/ext/sources/ggml/src/ggml-blas/ggml-blas.cpp +5 -9
- data/ext/sources/ggml/src/ggml-cann/acl_tensor.cpp +57 -45
- data/ext/sources/ggml/src/ggml-cann/acl_tensor.h +138 -47
- data/ext/sources/ggml/src/ggml-cann/aclnn_ops.cpp +2179 -1696
- data/ext/sources/ggml/src/ggml-cann/aclnn_ops.h +238 -317
- data/ext/sources/ggml/src/ggml-cann/common.h +283 -208
- data/ext/sources/ggml/src/ggml-cann/ggml-cann.cpp +626 -776
- data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +156 -86
- data/ext/sources/ggml/src/ggml-cpu/amx/amx.cpp +1 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +4 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/arm/quants.c +428 -26
- data/ext/sources/ggml/src/ggml-cpu/arch/arm/repack.cpp +1004 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/loongarch/quants.c +4 -5
- data/ext/sources/ggml/src/ggml-cpu/arch/riscv/cpu-feats.cpp +38 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/riscv/quants.c +108 -49
- data/ext/sources/ggml/src/ggml-cpu/arch/s390/cpu-feats.cpp +50 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/x86/repack.cpp +6 -6
- data/ext/sources/ggml/src/ggml-cpu/arch-fallback.h +50 -2
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-impl.h +5 -3
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +195 -71
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.cpp +4 -0
- data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.cpp +573 -106
- data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.h +33 -44
- data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +298 -112
- data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm-ppc.h +333 -0
- data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +819 -125
- data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.h +6 -0
- data/ext/sources/ggml/src/ggml-cpu/ops.cpp +708 -431
- data/ext/sources/ggml/src/ggml-cpu/ops.h +5 -4
- data/ext/sources/ggml/src/ggml-cpu/repack.cpp +671 -31
- data/ext/sources/ggml/src/ggml-cpu/repack.h +14 -0
- data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +41 -43
- data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.cpp +3 -2
- data/ext/sources/ggml/src/ggml-cpu/unary-ops.cpp +151 -0
- data/ext/sources/ggml/src/ggml-cpu/unary-ops.h +7 -0
- data/ext/sources/ggml/src/ggml-cpu/vec.cpp +124 -1
- data/ext/sources/ggml/src/ggml-cpu/vec.h +261 -146
- data/ext/sources/ggml/src/ggml-cuda/CMakeLists.txt +72 -1
- data/ext/sources/ggml/src/ggml-cuda/argmax.cu +2 -2
- data/ext/sources/ggml/src/ggml-cuda/argsort.cu +123 -6
- data/ext/sources/ggml/src/ggml-cuda/argsort.cuh +16 -0
- data/ext/sources/ggml/src/ggml-cuda/binbcast.cu +1 -1
- data/ext/sources/ggml/src/ggml-cuda/common.cuh +353 -80
- data/ext/sources/ggml/src/ggml-cuda/convert.cuh +10 -0
- data/ext/sources/ggml/src/ggml-cuda/cpy-utils.cuh +1 -1
- data/ext/sources/ggml/src/ggml-cuda/cpy.cu +339 -246
- data/ext/sources/ggml/src/ggml-cuda/cpy.cuh +1 -5
- data/ext/sources/ggml/src/ggml-cuda/cumsum.cu +307 -0
- data/ext/sources/ggml/src/ggml-cuda/cumsum.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/diag.cu +77 -0
- data/ext/sources/ggml/src/ggml-cuda/diag.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +31 -21
- data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +663 -596
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cu +35 -741
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cuh +1241 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn-vec.cuh +30 -37
- data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +14 -13
- data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cuh +48 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn.cu +83 -37
- data/ext/sources/ggml/src/ggml-cuda/fill.cu +37 -0
- data/ext/sources/ggml/src/ggml-cuda/fill.cuh +3 -0
- data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +1155 -164
- data/ext/sources/ggml/src/ggml-cuda/mean.cu +5 -4
- data/ext/sources/ggml/src/ggml-cuda/mma.cuh +741 -48
- data/ext/sources/ggml/src/ggml-cuda/mmf.cu +60 -12
- data/ext/sources/ggml/src/ggml-cuda/mmf.cuh +381 -42
- data/ext/sources/ggml/src/ggml-cuda/mmid.cu +164 -0
- data/ext/sources/ggml/src/ggml-cuda/mmid.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/mmq.cu +69 -176
- data/ext/sources/ggml/src/ggml-cuda/mmq.cuh +498 -171
- data/ext/sources/ggml/src/ggml-cuda/mmvf.cu +375 -79
- data/ext/sources/ggml/src/ggml-cuda/mmvf.cuh +3 -2
- data/ext/sources/ggml/src/ggml-cuda/mmvq.cu +241 -95
- data/ext/sources/ggml/src/ggml-cuda/mmvq.cuh +1 -1
- data/ext/sources/ggml/src/ggml-cuda/pad.cu +64 -33
- data/ext/sources/ggml/src/ggml-cuda/quantize.cu +151 -0
- data/ext/sources/ggml/src/ggml-cuda/quantize.cuh +14 -0
- data/ext/sources/ggml/src/ggml-cuda/rope.cu +192 -77
- data/ext/sources/ggml/src/ggml-cuda/rope.cuh +2 -0
- data/ext/sources/ggml/src/ggml-cuda/set-rows.cu +101 -47
- data/ext/sources/ggml/src/ggml-cuda/set.cu +39 -0
- data/ext/sources/ggml/src/ggml-cuda/set.cuh +7 -0
- data/ext/sources/ggml/src/ggml-cuda/softmax.cu +203 -6
- data/ext/sources/ggml/src/ggml-cuda/solve_tri.cu +275 -0
- data/ext/sources/ggml/src/ggml-cuda/solve_tri.cuh +3 -0
- data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cu +14 -20
- data/ext/sources/ggml/src/ggml-cuda/ssm-scan.cu +49 -84
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq112-dv112.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq128-dv128.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq256-dv256.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq40-dv40.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq576-dv512.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq64-dv64.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq72-dv72.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq80-dv80.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq96-dv96.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +19 -1
- data/ext/sources/ggml/src/ggml-cuda/top-k.cu +96 -0
- data/ext/sources/ggml/src/ggml-cuda/top-k.cuh +3 -0
- data/ext/sources/ggml/src/ggml-cuda/topk-moe.cu +168 -76
- data/ext/sources/ggml/src/ggml-cuda/topk-moe.cuh +11 -4
- data/ext/sources/ggml/src/ggml-cuda/tri.cu +136 -0
- data/ext/sources/ggml/src/ggml-cuda/tri.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/unary.cu +105 -11
- data/ext/sources/ggml/src/ggml-cuda/unary.cuh +36 -0
- data/ext/sources/ggml/src/ggml-cuda/upscale.cu +163 -7
- data/ext/sources/ggml/src/ggml-cuda/vendors/cuda.h +4 -0
- data/ext/sources/ggml/src/ggml-cuda/vendors/hip.h +12 -1
- data/ext/sources/ggml/src/ggml-cuda/vendors/musa.h +6 -0
- data/ext/sources/ggml/src/ggml-hexagon/CMakeLists.txt +80 -0
- data/ext/sources/ggml/src/ggml-hexagon/ggml-hexagon.cpp +3151 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/CMakeLists.txt +44 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/act-ops.c +682 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/binary-ops.c +360 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/cmake-toolchain.cmake +157 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/flash-attn-ops.c +566 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/get-rows-ops.c +112 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ctx.h +35 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp-dma.c +63 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp-dma.h +157 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp-msg.h +165 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ops.h +92 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp_iface.idl +16 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-exp.c +94 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-inverse.c +72 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sigmoid.c +49 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-utils.c +1020 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-utils.h +1353 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/main.c +1001 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/matmul-ops.c +2503 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/ops-utils.h +149 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/rope-ops.c +487 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/set-rows-ops.c +168 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/softmax-ops.c +402 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/unary-ops.c +287 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/worker-pool.c +297 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/worker-pool.h +57 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp-utils.c +454 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp-utils.h +221 -0
- data/ext/sources/ggml/src/ggml-hexagon/op-desc.h +153 -0
- data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +8 -13
- data/ext/sources/ggml/src/ggml-impl.h +67 -6
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-common.cpp +2 -2
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.m +29 -20
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.cpp +652 -285
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.h +103 -56
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.m +496 -118
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +231 -9
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.cpp +1227 -224
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.h +12 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal.cpp +14 -8
- data/ext/sources/ggml/src/ggml-metal/ggml-metal.metal +1972 -704
- data/ext/sources/ggml/src/ggml-musa/CMakeLists.txt +3 -1
- data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +11 -0
- data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +1430 -120
- data/ext/sources/ggml/src/ggml-opencl/kernels/cvt.cl +63 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/expm1.cl +82 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/fill.cl +17 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/flash_attn_f32.cl +4 -3
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_mxfp4_f32.cl +162 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_mxfp4_f32.cl +156 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/get_rows.cl +36 -12
- data/ext/sources/ggml/src/ggml-opencl/kernels/mean.cl +39 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_kq_kqv.cl +273 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_l4_lm.cl +24 -10
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_f32_f32_l4_lm.cl +24 -10
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_l4_lm.cl +154 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/pad.cl +29 -20
- data/ext/sources/ggml/src/ggml-opencl/kernels/rms_norm.cl +25 -10
- data/ext/sources/ggml/src/ggml-opencl/kernels/rope.cl +50 -24
- data/ext/sources/ggml/src/ggml-opencl/kernels/set_rows.cl +35 -16
- data/ext/sources/ggml/src/ggml-opencl/kernels/softplus.cl +88 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/sqr.cl +53 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/sqrt.cl +53 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/ssm_conv.cl +77 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/transpose.cl +13 -0
- data/ext/sources/ggml/src/ggml-rpc/ggml-rpc.cpp +438 -156
- data/ext/sources/ggml/src/ggml-sycl/CMakeLists.txt +48 -3
- data/ext/sources/ggml/src/ggml-sycl/add-id.cpp +77 -0
- data/ext/sources/ggml/src/ggml-sycl/add-id.hpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/backend.hpp +6 -0
- data/ext/sources/ggml/src/ggml-sycl/binbcast.cpp +0 -9
- data/ext/sources/ggml/src/ggml-sycl/binbcast.hpp +0 -6
- data/ext/sources/ggml/src/ggml-sycl/common.hpp +117 -15
- data/ext/sources/ggml/src/ggml-sycl/concat.cpp +55 -44
- data/ext/sources/ggml/src/ggml-sycl/convert.cpp +34 -0
- data/ext/sources/ggml/src/ggml-sycl/count-equal.cpp +79 -0
- data/ext/sources/ggml/src/ggml-sycl/count-equal.hpp +9 -0
- data/ext/sources/ggml/src/ggml-sycl/cpy.cpp +0 -3
- data/ext/sources/ggml/src/ggml-sycl/dequantize.hpp +18 -0
- data/ext/sources/ggml/src/ggml-sycl/dpct/helper.hpp +76 -3
- data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +333 -300
- data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +10 -2
- data/ext/sources/ggml/src/ggml-sycl/ggml-sycl.cpp +335 -110
- data/ext/sources/ggml/src/ggml-sycl/mmvq.cpp +22 -0
- data/ext/sources/ggml/src/ggml-sycl/norm.cpp +156 -0
- data/ext/sources/ggml/src/ggml-sycl/norm.hpp +2 -0
- data/ext/sources/ggml/src/ggml-sycl/pad.cpp +97 -0
- data/ext/sources/ggml/src/ggml-sycl/pad.hpp +24 -0
- data/ext/sources/ggml/src/ggml-sycl/pad_reflect_1d.cpp +100 -0
- data/ext/sources/ggml/src/ggml-sycl/pad_reflect_1d.hpp +10 -0
- data/ext/sources/ggml/src/ggml-sycl/presets.hpp +2 -0
- data/ext/sources/ggml/src/ggml-sycl/repeat_back.cpp +76 -0
- data/ext/sources/ggml/src/ggml-sycl/repeat_back.hpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/roll.cpp +122 -0
- data/ext/sources/ggml/src/ggml-sycl/roll.hpp +20 -0
- data/ext/sources/ggml/src/ggml-sycl/rope.cpp +30 -17
- data/ext/sources/ggml/src/ggml-sycl/set.cpp +73 -0
- data/ext/sources/ggml/src/ggml-sycl/set.hpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/softmax.cpp +327 -162
- data/ext/sources/ggml/src/ggml-sycl/softmax.hpp +4 -0
- data/ext/sources/ggml/src/ggml-sycl/ssm_conv.cpp +127 -0
- data/ext/sources/ggml/src/ggml-sycl/ssm_conv.hpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +58 -0
- data/ext/sources/ggml/src/ggml-vulkan/CMakeLists.txt +38 -18
- data/ext/sources/ggml/src/ggml-vulkan/ggml-vulkan.cpp +5013 -2859
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/abs.comp +21 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/acc.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add1.comp +28 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add_id.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/arange.comp +20 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argmax.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp +33 -26
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argsort_large.comp +114 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/ceil.comp +22 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/clamp.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/concat.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/contig_copy.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_dw.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +47 -49
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +3 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +4 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_transpose.comp +67 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/cos.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/count_equal.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/count_experts.comp +51 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/cumsum.comp +83 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/cumsum_multipass1.comp +60 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/cumsum_multipass2.comp +66 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_f32.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{dequant_funcs.comp → dequant_funcs.glsl} +9 -21
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{dequant_funcs_cm2.comp → dequant_funcs_cm2.glsl} +18 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{dequant_head.comp → dequant_head.glsl} +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_m.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_s.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_s.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xs.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xxs.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_s.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_xxs.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_nl.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_xs.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_mxfp4.comp +3 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q2_k.comp +3 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q3_k.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_0.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_1.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_k.comp +3 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_0.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_1.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_k.comp +3 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q6_k.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q8_0.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/diag.comp +29 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/diag_mask_inf.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/div.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/exp.comp +3 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/fill.comp +19 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +39 -17
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{flash_attn_base.comp → flash_attn_base.glsl} +19 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +45 -7
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +50 -12
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/floor.comp +22 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu_erf.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu_quick.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gelu.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gelu_erf.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gelu_quick.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{generic_binary_head.comp → generic_binary_head.glsl} +17 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{generic_head.comp → generic_head.glsl} +2 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{generic_unary_head.comp → generic_unary_head.glsl} +7 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/get_rows.comp +4 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/get_rows_quant.comp +3 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{glu_head.comp → glu_head.glsl} +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/group_norm.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/hardsigmoid.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/hardswish.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +19 -7
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col_3d.comp +2 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/l2_norm.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/leaky_relu.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/log.comp +18 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{mul_mat_vec_base.comp → mul_mat_vec_base.glsl} +70 -25
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iface.glsl +35 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_m.comp +71 -21
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_s.comp +41 -25
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_s.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xs.comp +44 -26
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xxs.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_s.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_xxs.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_nc.comp +9 -7
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_p021.comp +9 -7
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp +4 -6
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q3_k.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp +4 -6
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp +4 -6
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp +39 -36
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq_funcs.glsl +494 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +78 -103
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +34 -23
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{mul_mm_funcs.comp → mul_mm_funcs.glsl} +69 -59
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_id_funcs.glsl +72 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +88 -228
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.glsl +454 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_shmem_types.glsl +78 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp +97 -13
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/neg.comp +20 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/norm.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_adamw.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_sgd.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/pad.comp +21 -6
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/pool2d.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/quantize_q8_1.comp +10 -10
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/reglu.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/relu.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/repeat.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/repeat_back.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +50 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_back.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_partials.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/roll.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl +234 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.glsl +20 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +6 -50
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +6 -33
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +6 -33
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl +28 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_vision.comp +6 -39
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/round.comp +29 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sigmoid.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/silu.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/silu_back.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sin.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_back.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large1.comp +62 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large2.comp +79 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large3.comp +65 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large_common.glsl +53 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/softplus.comp +23 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/solve_tri.comp +81 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sqrt.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/square.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/ssm_conv.comp +44 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/ssm_scan.comp +124 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/step.comp +22 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sub.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.comp +2 -25
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.glsl +25 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/swiglu.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/swiglu_oai.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/tanh.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/timestep_embedding.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/topk_argsort.comp +118 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/topk_moe.comp +213 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/topk_nary_search.comp +246 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/tri.comp +43 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/trunc.comp +22 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{types.comp → types.glsl} +345 -26
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +90 -12
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +335 -151
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/xielu.comp +35 -0
- data/ext/sources/ggml/src/ggml-webgpu/CMakeLists.txt +28 -2
- data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp +169 -0
- data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu.cpp +1964 -435
- data/ext/sources/ggml/src/ggml-webgpu/pre_wgsl.hpp +778 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/bin_op.tmpl.wgsl +188 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cpy.tmpl.wgsl +101 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +33 -10
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn.wgsl +591 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/get_rows.tmpl.wgsl +1 -1
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/glu.tmpl.wgsl +323 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl +6 -6
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl +97 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.tmpl.wgsl +247 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_subgroup_matrix.tmpl.wgsl +302 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.tmpl.wgsl +267 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm.wgsl +83 -17
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rope.tmpl.wgsl +295 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/scale.tmpl.wgsl +90 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.tmpl.wgsl +112 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/soft_max.tmpl.wgsl +345 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/unary_op.wgsl +483 -0
- data/ext/sources/ggml/src/ggml-zendnn/CMakeLists.txt +92 -0
- data/ext/sources/ggml/src/ggml-zendnn/ggml-zendnn.cpp +466 -0
- data/ext/sources/ggml/src/ggml.c +425 -33
- data/ext/sources/include/whisper.h +1 -0
- data/ext/sources/src/CMakeLists.txt +3 -1
- data/ext/sources/src/whisper.cpp +101 -35
- data/ext/sources/tests/CMakeLists.txt +2 -2
- data/ext/sources/tests/test-vad-full.cpp +4 -2
- data/ext/sources/tests/test-vad.cpp +1 -1
- data/extsources.rb +1 -0
- data/lib/whisper/model/uri.rb +17 -18
- data/sig/whisper.rbs +119 -2
- data/test/test_params.rb +16 -8
- data/test/test_segment.rb +0 -1
- data/test/test_token.rb +70 -0
- data/test/test_vad.rb +1 -1
- data/test/test_vad_context.rb +50 -0
- data/test/test_vad_segment.rb +19 -0
- data/test/test_vad_segments.rb +16 -0
- data/test/test_whisper.rb +7 -0
- data/whispercpp.gemspec +1 -1
- metadata +287 -34
- data/ext/sources/build-xcframework.sh +0 -571
- data/ext/sources/ggml/src/ggml-cann/Doxyfile +0 -2579
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.comp +0 -105
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.comp +0 -55
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/add.tmpl.wgsl +0 -44
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/add_in_place.tmpl.wgsl +0 -41
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl +0 -60
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul.tmpl.wgsl +0 -44
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_in_place.tmpl.wgsl +0 -41
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm_in_place.wgsl +0 -48
- /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{test_bfloat16_support.comp → feature-tests/bfloat16.comp} +0 -0
- /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{test_coopmat_support.comp → feature-tests/coopmat.comp} +0 -0
- /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{test_coopmat2_support.comp → feature-tests/coopmat2.comp} +0 -0
- /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{test_integer_dot_support.comp → feature-tests/integer_dot.comp} +0 -0
- /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{glu_main.comp → glu_main.glsl} +0 -0
- /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{rte.comp → rte.glsl} +0 -0
- /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{utils.comp → utils.glsl} +0 -0
|
@@ -25,6 +25,7 @@ static bool ggml_is_view(const struct ggml_tensor * t) {
|
|
|
25
25
|
// ops that return true for this function must not use restrict pointers for their backend implementations
|
|
26
26
|
bool ggml_op_can_inplace(enum ggml_op op) {
|
|
27
27
|
switch (op) {
|
|
28
|
+
case GGML_OP_FILL:
|
|
28
29
|
case GGML_OP_SCALE:
|
|
29
30
|
case GGML_OP_DIAG_MASK_ZERO:
|
|
30
31
|
case GGML_OP_DIAG_MASK_INF:
|
|
@@ -226,16 +227,23 @@ static struct buffer_address ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * al
|
|
|
226
227
|
}
|
|
227
228
|
|
|
228
229
|
if (best_fit_block == -1) {
|
|
229
|
-
// no suitable block found, try the last block (this
|
|
230
|
+
// no suitable block found, try the last block (this may grow a chunks size)
|
|
231
|
+
int64_t best_reuse = INT64_MIN;
|
|
230
232
|
for (int c = 0; c < alloc->n_chunks; ++c) {
|
|
231
233
|
struct tallocr_chunk * chunk = alloc->chunks[c];
|
|
232
234
|
if (chunk->n_free_blocks > 0) {
|
|
233
235
|
struct free_block * block = &chunk->free_blocks[chunk->n_free_blocks - 1];
|
|
234
236
|
max_avail = MAX(max_avail, block->size);
|
|
235
|
-
|
|
237
|
+
int64_t reuse_factor = chunk->max_size - block->offset - size;
|
|
238
|
+
// reuse_factor < 0 : amount of extra memory that needs to be allocated
|
|
239
|
+
// reuse_factor = 0 : allocated free space exactly matches tensor size
|
|
240
|
+
// reuse_factor > 0 : superfluous memory that will remain unused
|
|
241
|
+
bool better_reuse = best_reuse < 0 && reuse_factor > best_reuse;
|
|
242
|
+
bool better_fit = reuse_factor >= 0 && reuse_factor < best_reuse;
|
|
243
|
+
if (block->size >= size && (better_reuse || better_fit)) {
|
|
236
244
|
best_fit_chunk = c;
|
|
237
245
|
best_fit_block = chunk->n_free_blocks - 1;
|
|
238
|
-
|
|
246
|
+
best_reuse = reuse_factor;
|
|
239
247
|
}
|
|
240
248
|
}
|
|
241
249
|
}
|
|
@@ -268,7 +276,7 @@ static struct buffer_address ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * al
|
|
|
268
276
|
#ifdef GGML_ALLOCATOR_DEBUG
|
|
269
277
|
add_allocated_tensor(alloc, addr, tensor);
|
|
270
278
|
size_t cur_max = addr.offset + size;
|
|
271
|
-
if (cur_max >
|
|
279
|
+
if (cur_max > chunk->max_size) {
|
|
272
280
|
// sort allocated_tensors by chunk/offset
|
|
273
281
|
for (int i = 0; i < 1024; i++) {
|
|
274
282
|
for (int j = i + 1; j < 1024; j++) {
|
|
@@ -304,16 +312,9 @@ static struct buffer_address ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * al
|
|
|
304
312
|
}
|
|
305
313
|
|
|
306
314
|
// this is a very naive implementation, but for our case the number of free blocks should be very small
|
|
307
|
-
static void
|
|
315
|
+
static void ggml_dyn_tallocr_free_bytes(struct ggml_dyn_tallocr * alloc, struct buffer_address addr, size_t size) {
|
|
308
316
|
size = aligned_offset(NULL, size, alloc->alignment);
|
|
309
317
|
|
|
310
|
-
AT_PRINTF("%s: freeing %s at {chunk=%d, offset=%zu} (%zu bytes) - n_free_blocks = %d\n",
|
|
311
|
-
__func__, tensor->name, addr.chunk, addr.offset, size, alloc->chunks[addr.chunk]->n_free_blocks);
|
|
312
|
-
|
|
313
|
-
#ifdef GGML_ALLOCATOR_DEBUG
|
|
314
|
-
remove_allocated_tensor(alloc, addr, tensor);
|
|
315
|
-
#endif
|
|
316
|
-
|
|
317
318
|
struct tallocr_chunk * chunk = alloc->chunks[addr.chunk];
|
|
318
319
|
|
|
319
320
|
// see if we can merge with an existing block
|
|
@@ -349,8 +350,6 @@ static void ggml_dyn_tallocr_free_tensor(struct ggml_dyn_tallocr * alloc, struct
|
|
|
349
350
|
}
|
|
350
351
|
// otherwise, add a new block
|
|
351
352
|
ggml_dyn_tallocr_insert_block(chunk, addr.offset, size);
|
|
352
|
-
|
|
353
|
-
GGML_UNUSED(tensor);
|
|
354
353
|
}
|
|
355
354
|
|
|
356
355
|
static void ggml_dyn_tallocr_reset(struct ggml_dyn_tallocr * alloc) {
|
|
@@ -392,12 +391,8 @@ static void ggml_dyn_tallocr_free(struct ggml_dyn_tallocr * alloc) {
|
|
|
392
391
|
free(alloc);
|
|
393
392
|
}
|
|
394
393
|
|
|
395
|
-
static size_t ggml_dyn_tallocr_max_size(struct ggml_dyn_tallocr * alloc) {
|
|
396
|
-
|
|
397
|
-
for (int i = 0; i < alloc->n_chunks; i++) {
|
|
398
|
-
max_size += alloc->chunks[i]->max_size;
|
|
399
|
-
}
|
|
400
|
-
return max_size;
|
|
394
|
+
static size_t ggml_dyn_tallocr_max_size(struct ggml_dyn_tallocr * alloc, int chunk) {
|
|
395
|
+
return chunk < alloc->n_chunks ? alloc->chunks[chunk]->max_size : 0;
|
|
401
396
|
}
|
|
402
397
|
|
|
403
398
|
|
|
@@ -417,10 +412,8 @@ static void ggml_vbuffer_free(struct vbuffer * buf) {
|
|
|
417
412
|
free(buf);
|
|
418
413
|
}
|
|
419
414
|
|
|
420
|
-
static
|
|
421
|
-
|
|
422
|
-
while (n < GGML_VBUFFER_MAX_CHUNKS && buf->chunks[n]) n++;
|
|
423
|
-
return n;
|
|
415
|
+
static size_t ggml_vbuffer_chunk_size(struct vbuffer * buf, int chunk) {
|
|
416
|
+
return buf->chunks[chunk] ? ggml_backend_buffer_get_size(buf->chunks[chunk]) : 0;
|
|
424
417
|
}
|
|
425
418
|
|
|
426
419
|
static size_t ggml_vbuffer_size(struct vbuffer * buf) {
|
|
@@ -601,7 +594,33 @@ static bool ggml_gallocr_is_own(ggml_gallocr_t galloc, struct ggml_tensor * t) {
|
|
|
601
594
|
}
|
|
602
595
|
|
|
603
596
|
static bool ggml_gallocr_is_allocated(ggml_gallocr_t galloc, struct ggml_tensor * t) {
|
|
604
|
-
return t->data != NULL
|
|
597
|
+
return t->data != NULL // tensor data already set externally
|
|
598
|
+
|| t->buffer // tensor on external buffer (but not yet allocated)
|
|
599
|
+
|| ggml_gallocr_is_own(galloc, t); // tensor will be allocated by galloc
|
|
600
|
+
}
|
|
601
|
+
|
|
602
|
+
// free the extra space at the end if the new tensor is smaller
|
|
603
|
+
static void ggml_gallocr_free_extra_space(ggml_gallocr_t galloc, struct ggml_tensor * node, struct ggml_tensor * parent) {
|
|
604
|
+
struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
|
|
605
|
+
struct hash_node * p_hn = ggml_gallocr_hash_get(galloc, parent);
|
|
606
|
+
|
|
607
|
+
size_t parent_size = ggml_backend_buft_get_alloc_size(galloc->bufts[p_hn->buffer_id], parent);
|
|
608
|
+
size_t node_size = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], node);
|
|
609
|
+
|
|
610
|
+
GGML_ASSERT(parent_size >= node_size);
|
|
611
|
+
|
|
612
|
+
// note: we want after the freeing the chunks to continue to be aligned
|
|
613
|
+
struct ggml_dyn_tallocr * p_alloc = galloc->buf_tallocs[p_hn->buffer_id];
|
|
614
|
+
parent_size = aligned_offset(NULL, parent_size, p_alloc->alignment);
|
|
615
|
+
node_size = aligned_offset(NULL, node_size, p_alloc->alignment);
|
|
616
|
+
|
|
617
|
+
if (parent_size > node_size) {
|
|
618
|
+
struct buffer_address p_addr = p_hn->addr;
|
|
619
|
+
p_addr.offset += node_size;
|
|
620
|
+
size_t extra_size = parent_size - node_size;
|
|
621
|
+
AT_PRINTF("freeing extra %zu bytes from parent %s for %s\n", extra_size, parent->name, node->name);
|
|
622
|
+
ggml_dyn_tallocr_free_bytes(p_alloc, p_addr, extra_size);
|
|
623
|
+
}
|
|
605
624
|
}
|
|
606
625
|
|
|
607
626
|
static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id) {
|
|
@@ -649,6 +668,7 @@ static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor
|
|
|
649
668
|
hn->addr = p_hn->addr;
|
|
650
669
|
p_hn->allocated = false; // avoid freeing the parent
|
|
651
670
|
view_src_hn->allocated = false;
|
|
671
|
+
ggml_gallocr_free_extra_space(galloc, node, view_src);
|
|
652
672
|
return;
|
|
653
673
|
}
|
|
654
674
|
} else {
|
|
@@ -656,6 +676,7 @@ static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor
|
|
|
656
676
|
hn->buffer_id = p_hn->buffer_id;
|
|
657
677
|
hn->addr = p_hn->addr;
|
|
658
678
|
p_hn->allocated = false; // avoid freeing the parent
|
|
679
|
+
ggml_gallocr_free_extra_space(galloc, node, parent);
|
|
659
680
|
return;
|
|
660
681
|
}
|
|
661
682
|
}
|
|
@@ -682,7 +703,14 @@ static void ggml_gallocr_free_node(ggml_gallocr_t galloc, struct ggml_tensor * n
|
|
|
682
703
|
struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id];
|
|
683
704
|
ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id];
|
|
684
705
|
size_t size = ggml_backend_buft_get_alloc_size(buft, node);
|
|
685
|
-
|
|
706
|
+
|
|
707
|
+
AT_PRINTF("%s: freeing %s at {chunk=%d, offset=%zu} (%zu bytes) - n_free_blocks = %d\n",
|
|
708
|
+
__func__, node->name, hn->addr.chunk, hn->addr.offset, size, alloc->chunks[hn->addr.chunk]->n_free_blocks);
|
|
709
|
+
#ifdef GGML_ALLOCATOR_DEBUG
|
|
710
|
+
remove_allocated_tensor(alloc, hn->addr, node);
|
|
711
|
+
#endif
|
|
712
|
+
|
|
713
|
+
ggml_dyn_tallocr_free_bytes(alloc, hn->addr, size);
|
|
686
714
|
hn->allocated = false;
|
|
687
715
|
}
|
|
688
716
|
|
|
@@ -797,7 +825,8 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
|
|
|
797
825
|
}
|
|
798
826
|
}
|
|
799
827
|
|
|
800
|
-
bool
|
|
828
|
+
static bool ggml_gallocr_reserve_n_impl(
|
|
829
|
+
ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids, bool no_alloc) {
|
|
801
830
|
size_t min_hash_size = graph->n_nodes + graph->n_leafs;
|
|
802
831
|
// add 25% margin to avoid hash collisions
|
|
803
832
|
min_hash_size += min_hash_size / 4;
|
|
@@ -885,20 +914,36 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
|
|
|
885
914
|
}
|
|
886
915
|
}
|
|
887
916
|
|
|
888
|
-
size_t cur_size = galloc->buffers[i] ? ggml_vbuffer_size(galloc->buffers[i]) : 0;
|
|
889
|
-
size_t new_size = ggml_dyn_tallocr_max_size(galloc->buf_tallocs[i]);
|
|
890
|
-
|
|
891
917
|
// even if there are no tensors allocated in this buffer, we still need to allocate it to initialize views
|
|
892
|
-
|
|
918
|
+
bool realloc = galloc->buffers[i] == NULL;
|
|
919
|
+
size_t new_size = 0;
|
|
920
|
+
for (int c = 0; c < galloc->buf_tallocs[i]->n_chunks; c++) {
|
|
921
|
+
size_t cur_chunk_size = galloc->buffers[i] ? ggml_vbuffer_chunk_size(galloc->buffers[i], c) : 0;
|
|
922
|
+
size_t new_chunk_size = ggml_dyn_tallocr_max_size(galloc->buf_tallocs[i], c);
|
|
923
|
+
new_size += new_chunk_size;
|
|
924
|
+
if (new_chunk_size > cur_chunk_size) {
|
|
925
|
+
realloc = true;
|
|
926
|
+
}
|
|
927
|
+
}
|
|
928
|
+
if (realloc) {
|
|
893
929
|
#ifndef NDEBUG
|
|
894
|
-
|
|
930
|
+
{
|
|
931
|
+
size_t cur_size = galloc->buffers[i] ? ggml_vbuffer_size(galloc->buffers[i]) : 0;
|
|
932
|
+
if (cur_size > 0) {
|
|
933
|
+
GGML_LOG_DEBUG("%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n",
|
|
934
|
+
__func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
|
|
935
|
+
}
|
|
936
|
+
}
|
|
895
937
|
#endif
|
|
896
|
-
|
|
897
938
|
ggml_vbuffer_free(galloc->buffers[i]);
|
|
898
|
-
|
|
899
|
-
|
|
900
|
-
|
|
901
|
-
|
|
939
|
+
if (no_alloc) {
|
|
940
|
+
galloc->buffers[i] = NULL;
|
|
941
|
+
} else {
|
|
942
|
+
galloc->buffers[i] = ggml_vbuffer_alloc(galloc->bufts[i], galloc->buf_tallocs[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
|
|
943
|
+
if (galloc->buffers[i] == NULL) {
|
|
944
|
+
GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
|
|
945
|
+
return false;
|
|
946
|
+
}
|
|
902
947
|
}
|
|
903
948
|
}
|
|
904
949
|
}
|
|
@@ -906,6 +951,21 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
|
|
|
906
951
|
return true;
|
|
907
952
|
}
|
|
908
953
|
|
|
954
|
+
void ggml_gallocr_reserve_n_size(
|
|
955
|
+
ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids, size_t * sizes) {
|
|
956
|
+
GGML_ASSERT(ggml_gallocr_reserve_n_impl(galloc, graph, node_buffer_ids, leaf_buffer_ids, /*no_alloc =*/ true));
|
|
957
|
+
for (int i = 0; i < galloc->n_buffers; i++) {
|
|
958
|
+
sizes[i] = 0;
|
|
959
|
+
for (int c = 0; c < galloc->buf_tallocs[i]->n_chunks; c++) {
|
|
960
|
+
sizes[i] += galloc->buf_tallocs[i]->chunks[c]->max_size;
|
|
961
|
+
}
|
|
962
|
+
}
|
|
963
|
+
}
|
|
964
|
+
|
|
965
|
+
bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
|
|
966
|
+
return ggml_gallocr_reserve_n_impl(galloc, graph, node_buffer_ids, leaf_buffer_ids, /*no_alloc =*/ false);
|
|
967
|
+
}
|
|
968
|
+
|
|
909
969
|
bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
|
|
910
970
|
return ggml_gallocr_reserve_n(galloc, graph, NULL, NULL);
|
|
911
971
|
}
|
|
@@ -1108,7 +1168,8 @@ static bool alloc_tensor_range(struct ggml_context * ctx,
|
|
|
1108
1168
|
return true;
|
|
1109
1169
|
}
|
|
1110
1170
|
|
|
1111
|
-
ggml_backend_buffer_t
|
|
1171
|
+
static ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft_impl(
|
|
1172
|
+
struct ggml_context * ctx, ggml_backend_buffer_type_t buft, size_t * nbytes_total, bool no_alloc) {
|
|
1112
1173
|
GGML_ASSERT(ggml_get_no_alloc(ctx) == true);
|
|
1113
1174
|
|
|
1114
1175
|
size_t alignment = ggml_backend_buft_get_alignment(buft);
|
|
@@ -1116,6 +1177,7 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
|
|
|
1116
1177
|
|
|
1117
1178
|
ggml_backend_buffer_t * buffers = NULL;
|
|
1118
1179
|
size_t n_buffers = 0;
|
|
1180
|
+
*nbytes_total = 0;
|
|
1119
1181
|
|
|
1120
1182
|
size_t cur_buf_size = 0;
|
|
1121
1183
|
struct ggml_tensor * first = ggml_get_first_tensor(ctx);
|
|
@@ -1127,10 +1189,11 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
|
|
|
1127
1189
|
|
|
1128
1190
|
if (cur_buf_size > 0 && (cur_buf_size + this_size) > max_size) {
|
|
1129
1191
|
// allocate tensors in the current buffer
|
|
1130
|
-
if (!alloc_tensor_range(ctx, first, t, buft, cur_buf_size, &buffers, &n_buffers)) {
|
|
1192
|
+
if (!no_alloc && !alloc_tensor_range(ctx, first, t, buft, cur_buf_size, &buffers, &n_buffers)) {
|
|
1131
1193
|
return NULL;
|
|
1132
1194
|
}
|
|
1133
1195
|
first = t;
|
|
1196
|
+
*nbytes_total += cur_buf_size;
|
|
1134
1197
|
cur_buf_size = this_size;
|
|
1135
1198
|
} else {
|
|
1136
1199
|
cur_buf_size += this_size;
|
|
@@ -1139,15 +1202,21 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
|
|
|
1139
1202
|
|
|
1140
1203
|
// allocate remaining tensors
|
|
1141
1204
|
if (cur_buf_size > 0) {
|
|
1142
|
-
|
|
1205
|
+
*nbytes_total += cur_buf_size;
|
|
1206
|
+
if (!no_alloc && !alloc_tensor_range(ctx, first, NULL, buft, cur_buf_size, &buffers, &n_buffers)) {
|
|
1143
1207
|
return NULL;
|
|
1144
1208
|
}
|
|
1145
1209
|
}
|
|
1146
1210
|
|
|
1211
|
+
if (no_alloc) {
|
|
1212
|
+
return NULL;
|
|
1213
|
+
}
|
|
1214
|
+
|
|
1147
1215
|
if (n_buffers == 0) {
|
|
1148
1216
|
#ifndef NDEBUG
|
|
1149
1217
|
GGML_LOG_DEBUG("%s: all tensors in the context are already allocated\n", __func__);
|
|
1150
1218
|
#endif
|
|
1219
|
+
GGML_ASSERT(!buffers);
|
|
1151
1220
|
return NULL;
|
|
1152
1221
|
}
|
|
1153
1222
|
|
|
@@ -1157,10 +1226,24 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
|
|
|
1157
1226
|
} else {
|
|
1158
1227
|
buffer = ggml_backend_multi_buffer_alloc_buffer(buffers, n_buffers);
|
|
1159
1228
|
}
|
|
1160
|
-
|
|
1229
|
+
if (buffers) {
|
|
1230
|
+
free(buffers); // can be NULL if context is empty or no_alloc
|
|
1231
|
+
}
|
|
1161
1232
|
return buffer;
|
|
1162
1233
|
}
|
|
1163
1234
|
|
|
1235
|
+
size_t ggml_backend_alloc_ctx_tensors_from_buft_size(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
|
|
1236
|
+
size_t nbytes_total = 0;
|
|
1237
|
+
ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft_impl(ctx, buft, &nbytes_total, /*no_alloc=*/ true);
|
|
1238
|
+
GGML_ASSERT(!buf);
|
|
1239
|
+
return nbytes_total;
|
|
1240
|
+
}
|
|
1241
|
+
|
|
1242
|
+
ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
|
|
1243
|
+
size_t nbytes_total = 0;
|
|
1244
|
+
return ggml_backend_alloc_ctx_tensors_from_buft_impl(ctx, buft, &nbytes_total, /*no_alloc =*/ false);
|
|
1245
|
+
}
|
|
1246
|
+
|
|
1164
1247
|
ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, ggml_backend_t backend) {
|
|
1165
1248
|
return ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_get_default_buffer_type(backend));
|
|
1166
1249
|
}
|
|
@@ -144,7 +144,7 @@ extern "C" {
|
|
|
144
144
|
// device description: short informative description of the device, could be the model name
|
|
145
145
|
const char * (*get_description)(ggml_backend_dev_t dev);
|
|
146
146
|
|
|
147
|
-
// device memory in bytes
|
|
147
|
+
// device memory in bytes: 0 bytes to indicate no memory to report
|
|
148
148
|
void (*get_memory)(ggml_backend_dev_t dev, size_t * free, size_t * total);
|
|
149
149
|
|
|
150
150
|
// device type
|
|
@@ -209,9 +209,6 @@ extern "C" {
|
|
|
209
209
|
void * context;
|
|
210
210
|
};
|
|
211
211
|
|
|
212
|
-
// Internal backend registry API
|
|
213
|
-
GGML_API void ggml_backend_register(ggml_backend_reg_t reg);
|
|
214
|
-
|
|
215
212
|
// Add backend dynamic loading support to the backend
|
|
216
213
|
|
|
217
214
|
// Initialize the backend
|
|
@@ -57,6 +57,10 @@
|
|
|
57
57
|
#include "ggml-opencl.h"
|
|
58
58
|
#endif
|
|
59
59
|
|
|
60
|
+
#ifdef GGML_USE_HEXAGON
|
|
61
|
+
#include "ggml-hexagon.h"
|
|
62
|
+
#endif
|
|
63
|
+
|
|
60
64
|
#ifdef GGML_USE_BLAS
|
|
61
65
|
#include "ggml-blas.h"
|
|
62
66
|
#endif
|
|
@@ -69,6 +73,10 @@
|
|
|
69
73
|
#include "ggml-cann.h"
|
|
70
74
|
#endif
|
|
71
75
|
|
|
76
|
+
#ifdef GGML_USE_ZENDNN
|
|
77
|
+
#include "ggml-zendnn.h"
|
|
78
|
+
#endif
|
|
79
|
+
|
|
72
80
|
// disable C++17 deprecation warning for std::codecvt_utf8
|
|
73
81
|
#if defined(__clang__)
|
|
74
82
|
# pragma clang diagnostic push
|
|
@@ -199,6 +207,12 @@ struct ggml_backend_registry {
|
|
|
199
207
|
#ifdef GGML_USE_OPENCL
|
|
200
208
|
register_backend(ggml_backend_opencl_reg());
|
|
201
209
|
#endif
|
|
210
|
+
#ifdef GGML_USE_ZENDNN
|
|
211
|
+
register_backend(ggml_backend_zendnn_reg());
|
|
212
|
+
#endif
|
|
213
|
+
#ifdef GGML_USE_HEXAGON
|
|
214
|
+
register_backend(ggml_backend_hexagon_reg());
|
|
215
|
+
#endif
|
|
202
216
|
#ifdef GGML_USE_CANN
|
|
203
217
|
register_backend(ggml_backend_cann_reg());
|
|
204
218
|
#endif
|
|
@@ -527,8 +541,12 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent,
|
|
|
527
541
|
fs::path best_path;
|
|
528
542
|
|
|
529
543
|
for (const auto & search_path : search_paths) {
|
|
530
|
-
if (!fs::exists(search_path)) {
|
|
531
|
-
|
|
544
|
+
if (std::error_code ec; !fs::exists(search_path, ec)) {
|
|
545
|
+
if (ec) {
|
|
546
|
+
GGML_LOG_DEBUG("%s: posix_stat(%s) failure, error-message: %s\n", __func__, path_str(search_path).c_str(), ec.message().c_str());
|
|
547
|
+
} else {
|
|
548
|
+
GGML_LOG_DEBUG("%s: search path %s does not exist\n", __func__, path_str(search_path).c_str());
|
|
549
|
+
}
|
|
532
550
|
continue;
|
|
533
551
|
}
|
|
534
552
|
fs::directory_iterator dir_it(search_path, fs::directory_options::skip_permission_denied);
|
|
@@ -568,8 +586,12 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent,
|
|
|
568
586
|
for (const auto & search_path : search_paths) {
|
|
569
587
|
fs::path filename = backend_filename_prefix().native() + name_path.native() + backend_filename_extension().native();
|
|
570
588
|
fs::path path = search_path / filename;
|
|
571
|
-
if (fs::exists(path)) {
|
|
589
|
+
if (std::error_code ec; fs::exists(path, ec)) {
|
|
572
590
|
return get_reg().load_backend(path, silent);
|
|
591
|
+
} else {
|
|
592
|
+
if (ec) {
|
|
593
|
+
GGML_LOG_DEBUG("%s: posix_stat(%s) failure, error-message: %s\n", __func__, path_str(path).c_str(), ec.message().c_str());
|
|
594
|
+
}
|
|
573
595
|
}
|
|
574
596
|
}
|
|
575
597
|
return nullptr;
|
|
@@ -590,6 +612,7 @@ void ggml_backend_load_all_from_path(const char * dir_path) {
|
|
|
590
612
|
#endif
|
|
591
613
|
|
|
592
614
|
ggml_backend_load_best("blas", silent, dir_path);
|
|
615
|
+
ggml_backend_load_best("zendnn", silent, dir_path);
|
|
593
616
|
ggml_backend_load_best("cann", silent, dir_path);
|
|
594
617
|
ggml_backend_load_best("cuda", silent, dir_path);
|
|
595
618
|
ggml_backend_load_best("hip", silent, dir_path);
|
|
@@ -598,6 +621,7 @@ void ggml_backend_load_all_from_path(const char * dir_path) {
|
|
|
598
621
|
ggml_backend_load_best("sycl", silent, dir_path);
|
|
599
622
|
ggml_backend_load_best("vulkan", silent, dir_path);
|
|
600
623
|
ggml_backend_load_best("opencl", silent, dir_path);
|
|
624
|
+
ggml_backend_load_best("hexagon", silent, dir_path);
|
|
601
625
|
ggml_backend_load_best("musa", silent, dir_path);
|
|
602
626
|
ggml_backend_load_best("cpu", silent, dir_path);
|
|
603
627
|
// check the environment variable GGML_BACKEND_PATH to load an out-of-tree backend
|
|
@@ -36,12 +36,11 @@ const char * ggml_backend_buft_name(ggml_backend_buffer_type_t buft) {
|
|
|
36
36
|
}
|
|
37
37
|
|
|
38
38
|
ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
|
39
|
+
GGML_ASSERT(buft);
|
|
39
40
|
if (size == 0) {
|
|
40
41
|
// return a dummy buffer for zero-sized allocations
|
|
41
42
|
return ggml_backend_buffer_init(buft, {}, NULL, 0);
|
|
42
43
|
}
|
|
43
|
-
|
|
44
|
-
GGML_ASSERT(buft);
|
|
45
44
|
return buft->iface.alloc_buffer(buft, size);
|
|
46
45
|
}
|
|
47
46
|
|
|
@@ -128,6 +127,12 @@ void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
|
|
|
128
127
|
return NULL;
|
|
129
128
|
}
|
|
130
129
|
|
|
130
|
+
// FIXME JG: a multi_buffer has a non-zero size, according to the above comment get_base is not optional,
|
|
131
|
+
// I don't know whether the above comment is correct
|
|
132
|
+
if (!buffer->iface.get_base) {
|
|
133
|
+
return NULL;
|
|
134
|
+
}
|
|
135
|
+
|
|
131
136
|
void * base = buffer->iface.get_base(buffer);
|
|
132
137
|
|
|
133
138
|
GGML_ASSERT(base != NULL && "backend buffer base cannot be NULL");
|
|
@@ -723,6 +728,12 @@ struct ggml_backend_sched {
|
|
|
723
728
|
bool op_offload;
|
|
724
729
|
|
|
725
730
|
int debug;
|
|
731
|
+
|
|
732
|
+
// used for debugging graph reallocations [GGML_SCHED_DEBUG_REALLOC]
|
|
733
|
+
// ref: https://github.com/ggml-org/llama.cpp/pull/17617
|
|
734
|
+
int debug_realloc;
|
|
735
|
+
int debug_graph_size;
|
|
736
|
+
int debug_prev_graph_size;
|
|
726
737
|
};
|
|
727
738
|
|
|
728
739
|
#define hash_id(tensor) ggml_hash_find_or_insert(&sched->hash_set, tensor)
|
|
@@ -1234,10 +1245,8 @@ void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgra
|
|
|
1234
1245
|
tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
|
|
1235
1246
|
ggml_format_name(tensor_copy, "%s#%s#%d", ggml_backend_name(backend), src->name, c);
|
|
1236
1247
|
}
|
|
1237
|
-
|
|
1238
|
-
|
|
1239
|
-
ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
|
|
1240
|
-
}
|
|
1248
|
+
ggml_set_input(tensor_copy);
|
|
1249
|
+
ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
|
|
1241
1250
|
tensor_id_copy(src_id, src_backend_id, c) = tensor_copy;
|
|
1242
1251
|
SET_CAUSE(tensor_copy, "4.cpy");
|
|
1243
1252
|
}
|
|
@@ -1289,6 +1298,11 @@ void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgra
|
|
|
1289
1298
|
}
|
|
1290
1299
|
|
|
1291
1300
|
int graph_size = std::max(graph->n_nodes, graph->n_leafs) + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2*sched->n_copies;
|
|
1301
|
+
|
|
1302
|
+
// remember the actual graph_size for performing reallocation checks later [GGML_SCHED_DEBUG_REALLOC]
|
|
1303
|
+
sched->debug_prev_graph_size = sched->debug_graph_size;
|
|
1304
|
+
sched->debug_graph_size = graph_size;
|
|
1305
|
+
|
|
1292
1306
|
if (sched->graph.size < graph_size) {
|
|
1293
1307
|
sched->graph.size = graph_size;
|
|
1294
1308
|
sched->graph.nodes = (ggml_tensor **) realloc(sched->graph.nodes, graph_size * sizeof(struct ggml_tensor *));
|
|
@@ -1395,14 +1409,27 @@ static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
|
|
|
1395
1409
|
|
|
1396
1410
|
// allocate graph
|
|
1397
1411
|
if (backend_ids_changed || !ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) {
|
|
1412
|
+
#ifndef NDEBUG
|
|
1413
|
+
GGML_LOG_DEBUG("%s: failed to allocate graph, reserving (backend_ids_changed = %d)\n", __func__, backend_ids_changed);
|
|
1414
|
+
#endif
|
|
1415
|
+
|
|
1416
|
+
if (sched->debug_realloc > 0) {
|
|
1417
|
+
// we are interested only in situations where the graph was reallocated even though its size remained the same [GGML_SCHED_DEBUG_REALLOC]
|
|
1418
|
+
// example: https://github.com/ggml-org/llama.cpp/pull/17143
|
|
1419
|
+
const bool unexpected = !backend_ids_changed && sched->debug_prev_graph_size == sched->debug_graph_size;
|
|
1420
|
+
|
|
1421
|
+
if (unexpected || sched->debug_realloc > 1) {
|
|
1422
|
+
GGML_ABORT("%s: unexpected graph reallocation (graph size = %d, nodes = %d, leafs = %d), debug_realloc = %d\n", __func__,
|
|
1423
|
+
sched->debug_graph_size, sched->graph.n_nodes, sched->graph.n_leafs, sched->debug_realloc);
|
|
1424
|
+
}
|
|
1425
|
+
}
|
|
1426
|
+
|
|
1398
1427
|
// the re-allocation may cause the split inputs to be moved to a different address
|
|
1399
1428
|
// synchronize without ggml_backend_sched_synchronize to avoid changing cur_copy
|
|
1400
1429
|
for (int i = 0; i < sched->n_backends; i++) {
|
|
1401
1430
|
ggml_backend_synchronize(sched->backends[i]);
|
|
1402
1431
|
}
|
|
1403
|
-
|
|
1404
|
-
GGML_LOG_DEBUG("%s: failed to allocate graph, reserving (backend_ids_changed = %d)\n", __func__, backend_ids_changed);
|
|
1405
|
-
#endif
|
|
1432
|
+
|
|
1406
1433
|
ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids);
|
|
1407
1434
|
if (!ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) {
|
|
1408
1435
|
GGML_LOG_ERROR("%s: failed to allocate graph\n", __func__);
|
|
@@ -1614,6 +1641,14 @@ ggml_backend_sched_t ggml_backend_sched_new(
|
|
|
1614
1641
|
|
|
1615
1642
|
const char * GGML_SCHED_DEBUG = getenv("GGML_SCHED_DEBUG");
|
|
1616
1643
|
sched->debug = GGML_SCHED_DEBUG ? atoi(GGML_SCHED_DEBUG) : 0;
|
|
1644
|
+
|
|
1645
|
+
sched->debug_realloc = 0;
|
|
1646
|
+
#ifdef GGML_SCHED_NO_REALLOC
|
|
1647
|
+
sched->debug_realloc = 1;
|
|
1648
|
+
#endif
|
|
1649
|
+
const char * GGML_SCHED_DEBUG_REALLOC = getenv("GGML_SCHED_DEBUG_REALLOC");
|
|
1650
|
+
sched->debug_realloc = GGML_SCHED_DEBUG_REALLOC ? atoi(GGML_SCHED_DEBUG_REALLOC) : sched->debug_realloc;
|
|
1651
|
+
|
|
1617
1652
|
sched->n_backends = n_backends;
|
|
1618
1653
|
sched->n_copies = parallel ? GGML_SCHED_MAX_COPIES : 1;
|
|
1619
1654
|
|
|
@@ -1630,6 +1665,9 @@ ggml_backend_sched_t ggml_backend_sched_new(
|
|
|
1630
1665
|
sched->prev_node_backend_ids = (int *) calloc(nodes_size, sizeof(sched->prev_node_backend_ids[0]));
|
|
1631
1666
|
sched->prev_leaf_backend_ids = (int *) calloc(nodes_size, sizeof(sched->prev_leaf_backend_ids[0]));
|
|
1632
1667
|
|
|
1668
|
+
sched->debug_graph_size = 0;
|
|
1669
|
+
sched->debug_prev_graph_size = 0;
|
|
1670
|
+
|
|
1633
1671
|
sched->context_buffer_size = ggml_sched_max_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2*sizeof(struct ggml_tensor) + ggml_graph_overhead_custom(graph_size, false);
|
|
1634
1672
|
sched->context_buffer = (char *) malloc(sched->context_buffer_size);
|
|
1635
1673
|
|
|
@@ -1694,9 +1732,10 @@ void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
|
|
|
1694
1732
|
sched->is_alloc = false;
|
|
1695
1733
|
}
|
|
1696
1734
|
|
|
1697
|
-
|
|
1735
|
+
void ggml_backend_sched_reserve_size(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph, size_t * sizes) {
|
|
1698
1736
|
GGML_ASSERT(sched);
|
|
1699
1737
|
GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes + measure_graph->n_leafs);
|
|
1738
|
+
GGML_ASSERT(sizes);
|
|
1700
1739
|
|
|
1701
1740
|
ggml_backend_sched_reset(sched);
|
|
1702
1741
|
|
|
@@ -1704,6 +1743,17 @@ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph *
|
|
|
1704
1743
|
|
|
1705
1744
|
ggml_backend_sched_split_graph(sched, measure_graph);
|
|
1706
1745
|
|
|
1746
|
+
ggml_gallocr_reserve_n_size(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids, sizes);
|
|
1747
|
+
}
|
|
1748
|
+
|
|
1749
|
+
bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
|
|
1750
|
+
GGML_ASSERT(sched);
|
|
1751
|
+
GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes + measure_graph->n_leafs);
|
|
1752
|
+
|
|
1753
|
+
ggml_backend_sched_synchronize(sched);
|
|
1754
|
+
|
|
1755
|
+
ggml_backend_sched_split_graph(sched, measure_graph);
|
|
1756
|
+
|
|
1707
1757
|
if (!ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids)) {
|
|
1708
1758
|
return false;
|
|
1709
1759
|
}
|
|
@@ -2003,7 +2053,7 @@ void ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy) {
|
|
|
2003
2053
|
ggml_free(copy.ctx_unallocated);
|
|
2004
2054
|
}
|
|
2005
2055
|
|
|
2006
|
-
bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data, struct ggml_tensor *
|
|
2056
|
+
bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data, struct ggml_tensor const * const * test_nodes, size_t num_test_nodes) {
|
|
2007
2057
|
struct ggml_backend_graph_copy copy = ggml_backend_graph_copy(backend2, graph);
|
|
2008
2058
|
if (copy.buffer == NULL) {
|
|
2009
2059
|
return false;
|
|
@@ -2014,22 +2064,22 @@ bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t
|
|
|
2014
2064
|
|
|
2015
2065
|
assert(g1->n_nodes == g2->n_nodes);
|
|
2016
2066
|
|
|
2017
|
-
if (
|
|
2018
|
-
|
|
2067
|
+
if (num_test_nodes != 0) {
|
|
2068
|
+
GGML_ASSERT(test_nodes);
|
|
2069
|
+
// Compute the whole graph and only test the output for specific tensors
|
|
2019
2070
|
ggml_backend_graph_compute(backend1, g1);
|
|
2020
2071
|
ggml_backend_graph_compute(backend2, g2);
|
|
2021
2072
|
|
|
2022
|
-
|
|
2073
|
+
bool verified = false;
|
|
2023
2074
|
for (int i = 0; i < g1->n_nodes; i++) {
|
|
2024
|
-
|
|
2025
|
-
|
|
2026
|
-
|
|
2027
|
-
|
|
2075
|
+
for (size_t j = 0; j < num_test_nodes; ++j) {
|
|
2076
|
+
if (g1->nodes[i] == test_nodes[j]) {
|
|
2077
|
+
callback(i, g1->nodes[i], g2->nodes[i], user_data);
|
|
2078
|
+
verified = true;
|
|
2079
|
+
}
|
|
2028
2080
|
}
|
|
2029
2081
|
}
|
|
2030
|
-
GGML_ASSERT(
|
|
2031
|
-
|
|
2032
|
-
callback(test_node_idx, g1->nodes[test_node_idx], g2->nodes[test_node_idx], user_data);
|
|
2082
|
+
GGML_ASSERT(verified);
|
|
2033
2083
|
} else {
|
|
2034
2084
|
for (int i = 0; i < g1->n_nodes; i++) {
|
|
2035
2085
|
struct ggml_tensor * t1 = g1->nodes[i];
|
|
@@ -32,14 +32,12 @@ if (BLAS_FOUND)
|
|
|
32
32
|
pkg_check_modules(DepBLAS openblas)
|
|
33
33
|
endif()
|
|
34
34
|
elseif (${GGML_BLAS_VENDOR} MATCHES "FLAME")
|
|
35
|
-
add_compile_definitions(GGML_BLAS_USE_BLIS)
|
|
36
35
|
pkg_check_modules(DepBLAS blis)
|
|
37
36
|
elseif (${GGML_BLAS_VENDOR} MATCHES "ATLAS")
|
|
38
37
|
pkg_check_modules(DepBLAS blas-atlas)
|
|
39
38
|
elseif (${GGML_BLAS_VENDOR} MATCHES "FlexiBLAS")
|
|
40
39
|
pkg_check_modules(DepBLAS flexiblas_api)
|
|
41
40
|
elseif (${GGML_BLAS_VENDOR} MATCHES "Intel")
|
|
42
|
-
add_compile_definitions(GGML_BLAS_USE_MKL)
|
|
43
41
|
# all Intel* libraries share the same include path
|
|
44
42
|
pkg_check_modules(DepBLAS mkl-sdl)
|
|
45
43
|
elseif (${GGML_BLAS_VENDOR} MATCHES "NVHPC")
|
|
@@ -74,10 +72,26 @@ if (BLAS_FOUND)
|
|
|
74
72
|
|
|
75
73
|
target_compile_options(ggml-blas PRIVATE ${BLAS_LINKER_FLAGS})
|
|
76
74
|
|
|
77
|
-
if ("${
|
|
75
|
+
if ("${GGML_BLAS_VENDOR}" STREQUAL "")
|
|
76
|
+
message(WARNING "GGML_BLAS_VENDOR is not set; some methods may not link properly.")
|
|
77
|
+
endif()
|
|
78
|
+
|
|
79
|
+
if ("${GGML_BLAS_VENDOR}" MATCHES "Intel" OR ("${BLAS_INCLUDE_DIRS}" MATCHES "mkl" AND "${GGML_BLAS_VENDOR}" MATCHES "Generic"))
|
|
78
80
|
add_compile_definitions(GGML_BLAS_USE_MKL)
|
|
79
81
|
endif()
|
|
80
82
|
|
|
83
|
+
if ("${GGML_BLAS_VENDOR}" MATCHES "OpenBLAS")
|
|
84
|
+
add_compile_definitions(GGML_BLAS_USE_OPENBLAS)
|
|
85
|
+
endif()
|
|
86
|
+
|
|
87
|
+
if ("${GGML_BLAS_VENDOR}" MATCHES "FLAME" OR "${GGML_BLAS_VENDOR}" MATCHES "AOCL" OR "${GGML_BLAS_VENDOR}" MATCHES "AOCL_mt")
|
|
88
|
+
add_compile_definitions(GGML_BLAS_USE_BLIS)
|
|
89
|
+
endif()
|
|
90
|
+
|
|
91
|
+
if ("${GGML_BLAS_VENDOR}" MATCHES "NVPL")
|
|
92
|
+
add_compile_definitions(GGML_BLAS_USE_NVPL)
|
|
93
|
+
endif()
|
|
94
|
+
|
|
81
95
|
target_link_libraries (ggml-blas PRIVATE ${BLAS_LIBRARIES})
|
|
82
96
|
target_include_directories(ggml-blas PRIVATE ${BLAS_INCLUDE_DIRS})
|
|
83
97
|
else()
|