whispercpp 1.3.4 → 1.3.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/LICENSE +1 -1
- data/README.md +158 -44
- data/ext/extconf.rb +3 -2
- data/ext/ruby_whisper.c +34 -6
- data/ext/ruby_whisper.h +67 -0
- data/ext/ruby_whisper_context.c +236 -144
- data/ext/ruby_whisper_context_params.c +163 -0
- data/ext/ruby_whisper_model.c +12 -13
- data/ext/ruby_whisper_params.c +47 -24
- data/ext/ruby_whisper_segment.c +84 -20
- data/ext/ruby_whisper_token.c +371 -0
- data/ext/ruby_whisper_transcribe.cpp +5 -2
- data/ext/ruby_whisper_vad_context.c +122 -0
- data/ext/ruby_whisper_vad_context_detect.cpp +51 -0
- data/ext/ruby_whisper_vad_params.c +0 -1
- data/ext/ruby_whisper_vad_segment.c +138 -0
- data/ext/ruby_whisper_vad_segments.c +105 -0
- data/ext/sources/CMakeLists.txt +4 -1
- data/ext/sources/bindings/javascript/package.json +1 -1
- data/ext/sources/cmake/arm64-apple-clang.cmake +16 -0
- data/ext/sources/cmake/arm64-windows-llvm.cmake +16 -0
- data/ext/sources/cmake/riscv64-spacemit-linux-gnu-gcc.cmake +29 -0
- data/ext/sources/cmake/whisper-config.cmake.in +5 -40
- data/ext/sources/cmake/x64-windows-llvm.cmake +5 -0
- data/ext/sources/examples/addon.node/vad-example.js +2 -2
- data/ext/sources/examples/bench/bench.cpp +23 -18
- data/ext/sources/examples/cli/cli.cpp +129 -112
- data/ext/sources/examples/common-ggml.cpp +2 -0
- data/ext/sources/examples/lsp/CMakeLists.txt +2 -1
- data/ext/sources/examples/miniaudio.h +4507 -2131
- data/ext/sources/examples/quantize/CMakeLists.txt +2 -1
- data/ext/sources/examples/server/server.cpp +28 -15
- data/ext/sources/examples/talk-llama/CMakeLists.txt +8 -3
- data/ext/sources/examples/talk-llama/llama-adapter.cpp +5 -2
- data/ext/sources/examples/talk-llama/llama-adapter.h +7 -0
- data/ext/sources/examples/talk-llama/llama-arch.cpp +2378 -1988
- data/ext/sources/examples/talk-llama/llama-arch.h +109 -2
- data/ext/sources/examples/talk-llama/llama-batch.cpp +78 -34
- data/ext/sources/examples/talk-llama/llama-batch.h +17 -4
- data/ext/sources/examples/talk-llama/llama-chat.cpp +100 -4
- data/ext/sources/examples/talk-llama/llama-chat.h +5 -0
- data/ext/sources/examples/talk-llama/llama-context.cpp +1088 -403
- data/ext/sources/examples/talk-llama/llama-context.h +70 -23
- data/ext/sources/examples/talk-llama/llama-cparams.h +6 -0
- data/ext/sources/examples/talk-llama/llama-ext.h +12 -0
- data/ext/sources/examples/talk-llama/llama-grammar.cpp +295 -60
- data/ext/sources/examples/talk-llama/llama-grammar.h +22 -1
- data/ext/sources/examples/talk-llama/llama-graph.cpp +925 -155
- data/ext/sources/examples/talk-llama/llama-graph.h +234 -23
- data/ext/sources/examples/talk-llama/llama-hparams.cpp +79 -38
- data/ext/sources/examples/talk-llama/llama-hparams.h +118 -18
- data/ext/sources/examples/talk-llama/llama-impl.cpp +11 -7
- data/ext/sources/examples/talk-llama/llama-impl.h +14 -2
- data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.cpp +8 -4
- data/ext/sources/examples/talk-llama/llama-kv-cache.cpp +405 -140
- data/ext/sources/examples/talk-llama/llama-kv-cache.h +24 -10
- data/ext/sources/examples/talk-llama/llama-kv-cells.h +44 -2
- data/ext/sources/examples/talk-llama/llama-memory-hybrid-iswa.cpp +275 -0
- data/ext/sources/examples/talk-llama/llama-memory-hybrid-iswa.h +140 -0
- data/ext/sources/examples/talk-llama/llama-memory-hybrid.cpp +12 -10
- data/ext/sources/examples/talk-llama/llama-memory-recurrent.cpp +42 -31
- data/ext/sources/examples/talk-llama/llama-memory-recurrent.h +2 -2
- data/ext/sources/examples/talk-llama/llama-mmap.cpp +197 -45
- data/ext/sources/examples/talk-llama/llama-mmap.h +8 -3
- data/ext/sources/examples/talk-llama/llama-model-loader.cpp +606 -116
- data/ext/sources/examples/talk-llama/llama-model-loader.h +41 -5
- data/ext/sources/examples/talk-llama/llama-model-saver.cpp +61 -44
- data/ext/sources/examples/talk-llama/llama-model-saver.h +5 -2
- data/ext/sources/examples/talk-llama/llama-model.cpp +2756 -13643
- data/ext/sources/examples/talk-llama/llama-model.h +112 -18
- data/ext/sources/examples/talk-llama/llama-quant.cpp +582 -365
- data/ext/sources/examples/talk-llama/{llama-sampling.cpp → llama-sampler.cpp} +1409 -199
- data/ext/sources/examples/talk-llama/llama-sampler.h +42 -0
- data/ext/sources/examples/talk-llama/llama-vocab.cpp +248 -82
- data/ext/sources/examples/talk-llama/llama-vocab.h +50 -40
- data/ext/sources/examples/talk-llama/llama.cpp +802 -21
- data/ext/sources/examples/talk-llama/llama.h +210 -39
- data/ext/sources/examples/talk-llama/models/afmoe.cpp +190 -0
- data/ext/sources/examples/talk-llama/models/apertus.cpp +125 -0
- data/ext/sources/examples/talk-llama/models/arcee.cpp +135 -0
- data/ext/sources/examples/talk-llama/models/arctic.cpp +137 -0
- data/ext/sources/examples/talk-llama/models/arwkv7.cpp +86 -0
- data/ext/sources/examples/talk-llama/models/baichuan.cpp +123 -0
- data/ext/sources/examples/talk-llama/models/bailingmoe.cpp +143 -0
- data/ext/sources/examples/talk-llama/models/bailingmoe2.cpp +133 -0
- data/ext/sources/examples/talk-llama/models/bert.cpp +184 -0
- data/ext/sources/examples/talk-llama/models/bitnet.cpp +145 -0
- data/ext/sources/examples/talk-llama/models/bloom.cpp +101 -0
- data/ext/sources/examples/talk-llama/models/chameleon.cpp +178 -0
- data/ext/sources/examples/talk-llama/models/chatglm.cpp +132 -0
- data/ext/sources/examples/talk-llama/models/codeshell.cpp +111 -0
- data/ext/sources/examples/talk-llama/models/cogvlm.cpp +102 -0
- data/ext/sources/examples/talk-llama/models/cohere2-iswa.cpp +134 -0
- data/ext/sources/examples/talk-llama/models/command-r.cpp +122 -0
- data/ext/sources/examples/talk-llama/models/dbrx.cpp +122 -0
- data/ext/sources/examples/talk-llama/models/deci.cpp +135 -0
- data/ext/sources/examples/talk-llama/models/deepseek.cpp +142 -0
- data/ext/sources/examples/talk-llama/models/deepseek2.cpp +262 -0
- data/ext/sources/examples/talk-llama/models/delta-net-base.cpp +445 -0
- data/ext/sources/examples/talk-llama/models/dots1.cpp +132 -0
- data/ext/sources/examples/talk-llama/models/dream.cpp +105 -0
- data/ext/sources/examples/talk-llama/models/ernie4-5-moe.cpp +148 -0
- data/ext/sources/examples/talk-llama/models/ernie4-5.cpp +110 -0
- data/ext/sources/examples/talk-llama/models/eurobert.cpp +97 -0
- data/ext/sources/examples/talk-llama/models/exaone-moe.cpp +145 -0
- data/ext/sources/examples/talk-llama/models/exaone.cpp +114 -0
- data/ext/sources/examples/talk-llama/models/exaone4.cpp +123 -0
- data/ext/sources/examples/talk-llama/models/falcon-h1.cpp +111 -0
- data/ext/sources/examples/talk-llama/models/falcon.cpp +120 -0
- data/ext/sources/examples/talk-llama/models/gemma-embedding.cpp +116 -0
- data/ext/sources/examples/talk-llama/models/gemma.cpp +112 -0
- data/ext/sources/examples/talk-llama/models/gemma2-iswa.cpp +128 -0
- data/ext/sources/examples/talk-llama/models/gemma3.cpp +155 -0
- data/ext/sources/examples/talk-llama/models/gemma3n-iswa.cpp +384 -0
- data/ext/sources/examples/talk-llama/models/glm4-moe.cpp +170 -0
- data/ext/sources/examples/talk-llama/models/glm4.cpp +157 -0
- data/ext/sources/examples/talk-llama/models/gpt2.cpp +105 -0
- data/ext/sources/examples/talk-llama/models/gptneox.cpp +144 -0
- data/ext/sources/examples/talk-llama/models/granite-hybrid.cpp +195 -0
- data/ext/sources/examples/talk-llama/models/granite.cpp +210 -0
- data/ext/sources/examples/talk-llama/models/grok.cpp +159 -0
- data/ext/sources/examples/talk-llama/models/grovemoe.cpp +139 -0
- data/ext/sources/examples/talk-llama/models/hunyuan-dense.cpp +132 -0
- data/ext/sources/examples/talk-llama/models/hunyuan-moe.cpp +153 -0
- data/ext/sources/examples/talk-llama/models/internlm2.cpp +120 -0
- data/ext/sources/examples/talk-llama/models/jais.cpp +86 -0
- data/ext/sources/examples/talk-llama/models/jais2.cpp +123 -0
- data/ext/sources/examples/talk-llama/models/jamba.cpp +106 -0
- data/ext/sources/examples/talk-llama/models/kimi-linear.cpp +381 -0
- data/ext/sources/examples/talk-llama/models/lfm2.cpp +196 -0
- data/ext/sources/examples/talk-llama/models/llada-moe.cpp +122 -0
- data/ext/sources/examples/talk-llama/models/llada.cpp +99 -0
- data/ext/sources/examples/talk-llama/models/llama-iswa.cpp +178 -0
- data/ext/sources/examples/talk-llama/models/llama.cpp +175 -0
- data/ext/sources/examples/talk-llama/models/maincoder.cpp +117 -0
- data/ext/sources/examples/talk-llama/models/mamba-base.cpp +289 -0
- data/ext/sources/examples/talk-llama/models/mamba.cpp +54 -0
- data/ext/sources/examples/talk-llama/models/mimo2-iswa.cpp +129 -0
- data/ext/sources/examples/talk-llama/models/minicpm3.cpp +200 -0
- data/ext/sources/examples/talk-llama/models/minimax-m2.cpp +123 -0
- data/ext/sources/examples/talk-llama/models/mistral3.cpp +160 -0
- data/ext/sources/examples/talk-llama/models/models.h +704 -0
- data/ext/sources/examples/talk-llama/models/modern-bert.cpp +109 -0
- data/ext/sources/examples/talk-llama/models/mpt.cpp +126 -0
- data/ext/sources/examples/talk-llama/models/nemotron-h.cpp +162 -0
- data/ext/sources/examples/talk-llama/models/nemotron.cpp +122 -0
- data/ext/sources/examples/talk-llama/models/neo-bert.cpp +104 -0
- data/ext/sources/examples/talk-llama/models/olmo.cpp +121 -0
- data/ext/sources/examples/talk-llama/models/olmo2.cpp +150 -0
- data/ext/sources/examples/talk-llama/models/olmoe.cpp +124 -0
- data/ext/sources/examples/talk-llama/models/openai-moe-iswa.cpp +127 -0
- data/ext/sources/examples/talk-llama/models/openelm.cpp +124 -0
- data/ext/sources/examples/talk-llama/models/orion.cpp +123 -0
- data/ext/sources/examples/talk-llama/models/paddleocr.cpp +122 -0
- data/ext/sources/examples/talk-llama/models/pangu-embedded.cpp +121 -0
- data/ext/sources/examples/talk-llama/models/phi2.cpp +121 -0
- data/ext/sources/examples/talk-llama/models/phi3.cpp +152 -0
- data/ext/sources/examples/talk-llama/models/plamo.cpp +110 -0
- data/ext/sources/examples/talk-llama/models/plamo2.cpp +320 -0
- data/ext/sources/examples/talk-llama/models/plamo3.cpp +128 -0
- data/ext/sources/examples/talk-llama/models/plm.cpp +169 -0
- data/ext/sources/examples/talk-llama/models/qwen.cpp +108 -0
- data/ext/sources/examples/talk-llama/models/qwen2.cpp +126 -0
- data/ext/sources/examples/talk-llama/models/qwen2moe.cpp +151 -0
- data/ext/sources/examples/talk-llama/models/qwen2vl.cpp +117 -0
- data/ext/sources/examples/talk-llama/models/qwen3.cpp +120 -0
- data/ext/sources/examples/talk-llama/models/qwen35.cpp +381 -0
- data/ext/sources/examples/talk-llama/models/qwen35moe.cpp +422 -0
- data/ext/sources/examples/talk-llama/models/qwen3moe.cpp +131 -0
- data/ext/sources/examples/talk-llama/models/qwen3next.cpp +525 -0
- data/ext/sources/examples/talk-llama/models/qwen3vl-moe.cpp +140 -0
- data/ext/sources/examples/talk-llama/models/qwen3vl.cpp +132 -0
- data/ext/sources/examples/talk-llama/models/refact.cpp +94 -0
- data/ext/sources/examples/talk-llama/models/rnd1.cpp +126 -0
- data/ext/sources/examples/talk-llama/models/rwkv6-base.cpp +164 -0
- data/ext/sources/examples/talk-llama/models/rwkv6.cpp +94 -0
- data/ext/sources/examples/talk-llama/models/rwkv6qwen2.cpp +86 -0
- data/ext/sources/examples/talk-llama/models/rwkv7-base.cpp +137 -0
- data/ext/sources/examples/talk-llama/models/rwkv7.cpp +90 -0
- data/ext/sources/examples/talk-llama/models/seed-oss.cpp +124 -0
- data/ext/sources/examples/talk-llama/models/smallthinker.cpp +126 -0
- data/ext/sources/examples/talk-llama/models/smollm3.cpp +128 -0
- data/ext/sources/examples/talk-llama/models/stablelm.cpp +146 -0
- data/ext/sources/examples/talk-llama/models/starcoder.cpp +100 -0
- data/ext/sources/examples/talk-llama/models/starcoder2.cpp +121 -0
- data/ext/sources/examples/talk-llama/models/step35-iswa.cpp +165 -0
- data/ext/sources/examples/talk-llama/models/t5-dec.cpp +166 -0
- data/ext/sources/examples/talk-llama/models/t5-enc.cpp +96 -0
- data/ext/sources/examples/talk-llama/models/wavtokenizer-dec.cpp +149 -0
- data/ext/sources/examples/talk-llama/models/xverse.cpp +108 -0
- data/ext/sources/examples/talk-llama/unicode.cpp +121 -79
- data/ext/sources/examples/vad-speech-segments/CMakeLists.txt +1 -1
- data/ext/sources/examples/whisper.wasm/index-tmpl.html +1 -1
- data/ext/sources/ggml/CMakeLists.txt +90 -56
- data/ext/sources/ggml/include/ggml-alloc.h +9 -0
- data/ext/sources/ggml/include/ggml-backend.h +5 -2
- data/ext/sources/ggml/include/ggml-cann.h +1 -1
- data/ext/sources/ggml/include/ggml-cpu.h +6 -0
- data/ext/sources/ggml/include/ggml-hexagon.h +19 -0
- data/ext/sources/ggml/include/ggml-openvino.h +37 -0
- data/ext/sources/ggml/include/ggml-opt.h +1 -1
- data/ext/sources/ggml/include/ggml-rpc.h +14 -12
- data/ext/sources/ggml/include/ggml-virtgpu.h +14 -0
- data/ext/sources/ggml/include/ggml-zendnn.h +22 -0
- data/ext/sources/ggml/include/ggml.h +246 -21
- data/ext/sources/ggml/src/CMakeLists.txt +85 -11
- data/ext/sources/ggml/src/ggml-alloc.c +128 -50
- data/ext/sources/ggml/src/ggml-backend-dl.cpp +48 -0
- data/ext/sources/ggml/src/ggml-backend-dl.h +45 -0
- data/ext/sources/ggml/src/ggml-backend-impl.h +1 -4
- data/ext/sources/ggml/src/ggml-backend-reg.cpp +54 -88
- data/ext/sources/ggml/src/ggml-backend.cpp +76 -23
- data/ext/sources/ggml/src/ggml-blas/CMakeLists.txt +18 -4
- data/ext/sources/ggml/src/ggml-blas/ggml-blas.cpp +11 -11
- data/ext/sources/ggml/src/ggml-cann/acl_tensor.cpp +58 -46
- data/ext/sources/ggml/src/ggml-cann/acl_tensor.h +139 -48
- data/ext/sources/ggml/src/ggml-cann/aclnn_ops.cpp +2427 -1785
- data/ext/sources/ggml/src/ggml-cann/aclnn_ops.h +238 -362
- data/ext/sources/ggml/src/ggml-cann/common.h +285 -211
- data/ext/sources/ggml/src/ggml-cann/ggml-cann.cpp +663 -831
- data/ext/sources/ggml/src/ggml-common.h +11 -0
- data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +170 -95
- data/ext/sources/ggml/src/ggml-cpu/amx/amx.cpp +42 -18
- data/ext/sources/ggml/src/ggml-cpu/amx/common.h +34 -10
- data/ext/sources/ggml/src/ggml-cpu/amx/mmq.cpp +85 -85
- data/ext/sources/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +4 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/arm/quants.c +513 -27
- data/ext/sources/ggml/src/ggml-cpu/arch/arm/repack.cpp +4192 -992
- data/ext/sources/ggml/src/ggml-cpu/arch/loongarch/quants.c +4 -5
- data/ext/sources/ggml/src/ggml-cpu/arch/riscv/cpu-feats.cpp +38 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/riscv/quants.c +1761 -49
- data/ext/sources/ggml/src/ggml-cpu/arch/riscv/repack.cpp +1391 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/s390/cpu-feats.cpp +50 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/s390/quants.c +8 -10
- data/ext/sources/ggml/src/ggml-cpu/arch/x86/quants.c +9 -9
- data/ext/sources/ggml/src/ggml-cpu/arch/x86/repack.cpp +124 -24
- data/ext/sources/ggml/src/ggml-cpu/arch-fallback.h +157 -28
- data/ext/sources/ggml/src/ggml-cpu/binary-ops.cpp +2 -6
- data/ext/sources/ggml/src/ggml-cpu/common.h +8 -0
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-impl.h +8 -3
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +251 -80
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.cpp +19 -0
- data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.cpp +587 -119
- data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.h +33 -44
- data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +1093 -194
- data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1284 -203
- data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.h +6 -0
- data/ext/sources/ggml/src/ggml-cpu/ops.cpp +1519 -527
- data/ext/sources/ggml/src/ggml-cpu/ops.h +6 -4
- data/ext/sources/ggml/src/ggml-cpu/quants.c +40 -0
- data/ext/sources/ggml/src/ggml-cpu/quants.h +3 -0
- data/ext/sources/ggml/src/ggml-cpu/repack.cpp +3632 -781
- data/ext/sources/ggml/src/ggml-cpu/repack.h +129 -4
- data/ext/sources/ggml/src/ggml-cpu/simd-gemm.h +136 -0
- data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +152 -46
- data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.cpp +3 -2
- data/ext/sources/ggml/src/ggml-cpu/unary-ops.cpp +152 -1
- data/ext/sources/ggml/src/ggml-cpu/unary-ops.h +7 -0
- data/ext/sources/ggml/src/ggml-cpu/vec.cpp +140 -0
- data/ext/sources/ggml/src/ggml-cpu/vec.h +261 -146
- data/ext/sources/ggml/src/ggml-cuda/CMakeLists.txt +72 -1
- data/ext/sources/ggml/src/ggml-cuda/argmax.cu +2 -2
- data/ext/sources/ggml/src/ggml-cuda/argsort.cu +132 -6
- data/ext/sources/ggml/src/ggml-cuda/argsort.cuh +16 -0
- data/ext/sources/ggml/src/ggml-cuda/binbcast.cu +33 -31
- data/ext/sources/ggml/src/ggml-cuda/common.cuh +474 -85
- data/ext/sources/ggml/src/ggml-cuda/convert.cu +41 -27
- data/ext/sources/ggml/src/ggml-cuda/convert.cuh +10 -0
- data/ext/sources/ggml/src/ggml-cuda/cpy-utils.cuh +1 -1
- data/ext/sources/ggml/src/ggml-cuda/cpy.cu +342 -246
- data/ext/sources/ggml/src/ggml-cuda/cpy.cuh +1 -5
- data/ext/sources/ggml/src/ggml-cuda/cumsum.cu +307 -0
- data/ext/sources/ggml/src/ggml-cuda/cumsum.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/diag.cu +77 -0
- data/ext/sources/ggml/src/ggml-cuda/diag.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +98 -74
- data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +973 -665
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cu +35 -741
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cuh +1255 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn-vec.cuh +33 -40
- data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +40 -18
- data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cuh +48 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn.cu +206 -45
- data/ext/sources/ggml/src/ggml-cuda/fill.cu +37 -0
- data/ext/sources/ggml/src/ggml-cuda/fill.cuh +3 -0
- data/ext/sources/ggml/src/ggml-cuda/gated_delta_net.cu +263 -0
- data/ext/sources/ggml/src/ggml-cuda/gated_delta_net.cuh +4 -0
- data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +1688 -302
- data/ext/sources/ggml/src/ggml-cuda/mean.cu +12 -10
- data/ext/sources/ggml/src/ggml-cuda/mma.cuh +908 -48
- data/ext/sources/ggml/src/ggml-cuda/mmf.cu +88 -20
- data/ext/sources/ggml/src/ggml-cuda/mmf.cuh +502 -90
- data/ext/sources/ggml/src/ggml-cuda/mmid.cu +164 -0
- data/ext/sources/ggml/src/ggml-cuda/mmid.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/mmq.cu +69 -176
- data/ext/sources/ggml/src/ggml-cuda/mmq.cuh +532 -193
- data/ext/sources/ggml/src/ggml-cuda/mmvf.cu +460 -104
- data/ext/sources/ggml/src/ggml-cuda/mmvf.cuh +5 -2
- data/ext/sources/ggml/src/ggml-cuda/mmvq.cu +360 -122
- data/ext/sources/ggml/src/ggml-cuda/mmvq.cuh +2 -1
- data/ext/sources/ggml/src/ggml-cuda/norm.cu +18 -76
- data/ext/sources/ggml/src/ggml-cuda/pad.cu +73 -39
- data/ext/sources/ggml/src/ggml-cuda/quantize.cu +152 -1
- data/ext/sources/ggml/src/ggml-cuda/quantize.cuh +14 -0
- data/ext/sources/ggml/src/ggml-cuda/reduce_rows.cuh +2 -16
- data/ext/sources/ggml/src/ggml-cuda/rope.cu +364 -149
- data/ext/sources/ggml/src/ggml-cuda/rope.cuh +2 -0
- data/ext/sources/ggml/src/ggml-cuda/set-rows.cu +101 -47
- data/ext/sources/ggml/src/ggml-cuda/set.cu +39 -0
- data/ext/sources/ggml/src/ggml-cuda/set.cuh +7 -0
- data/ext/sources/ggml/src/ggml-cuda/softmax.cu +163 -41
- data/ext/sources/ggml/src/ggml-cuda/solve_tri.cu +275 -0
- data/ext/sources/ggml/src/ggml-cuda/solve_tri.cuh +3 -0
- data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cu +68 -50
- data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cuh +1 -1
- data/ext/sources/ggml/src/ggml-cuda/ssm-scan.cu +49 -84
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_32.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_4.cu +1 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_32.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_4.cu +1 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_4.cu +1 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_4.cu +1 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq112-dv112.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq128-dv128.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq256-dv256.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq40-dv40.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq576-dv512.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq64-dv64.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq72-dv72.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq80-dv80.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq96-dv96.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +22 -4
- data/ext/sources/ggml/src/ggml-cuda/top-k.cu +95 -0
- data/ext/sources/ggml/src/ggml-cuda/top-k.cuh +3 -0
- data/ext/sources/ggml/src/ggml-cuda/topk-moe.cu +275 -119
- data/ext/sources/ggml/src/ggml-cuda/topk-moe.cuh +20 -7
- data/ext/sources/ggml/src/ggml-cuda/tri.cu +136 -0
- data/ext/sources/ggml/src/ggml-cuda/tri.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/unary.cu +160 -11
- data/ext/sources/ggml/src/ggml-cuda/unary.cuh +38 -0
- data/ext/sources/ggml/src/ggml-cuda/upscale.cu +163 -7
- data/ext/sources/ggml/src/ggml-cuda/vecdotq.cuh +31 -17
- data/ext/sources/ggml/src/ggml-cuda/vendors/cuda.h +4 -0
- data/ext/sources/ggml/src/ggml-cuda/vendors/hip.h +22 -1
- data/ext/sources/ggml/src/ggml-cuda/vendors/musa.h +6 -0
- data/ext/sources/ggml/src/ggml-hexagon/CMakeLists.txt +117 -0
- data/ext/sources/ggml/src/ggml-hexagon/ggml-hexagon.cpp +3325 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/CMakeLists.txt +46 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/act-ops.c +813 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/argsort-ops.c +281 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/binary-ops.c +891 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/cmake-toolchain.cmake +157 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/cpy-ops.c +252 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/flash-attn-ops.c +713 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/get-rows-ops.c +112 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hex-dma.c +63 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hex-dma.h +182 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hex-dump.h +77 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hex-fastdiv.h +37 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hex-utils.h +51 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ctx.h +35 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp-msg.h +155 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ops.h +63 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp_iface.idl +16 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-arith.h +443 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-base.h +240 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-copy.h +245 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-div.h +251 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-dump.h +129 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-exp.h +215 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-floor.h +100 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-inverse.h +210 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-reduce.h +296 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-scale.h +133 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sigmoid.h +141 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sqrt.h +126 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-types.h +36 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-utils.h +26 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/main.c +1199 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/matmul-ops.c +2670 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/rope-ops.c +497 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/set-rows-ops.c +168 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/softmax-ops.c +419 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/ssm-conv.c +339 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/sum-rows-ops.c +128 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/unary-ops.c +382 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/worker-pool.c +293 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/worker-pool.h +57 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp-drv.cpp +418 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp-drv.h +121 -0
- data/ext/sources/ggml/src/ggml-hexagon/libdl.h +79 -0
- data/ext/sources/ggml/src/ggml-hexagon/libggml-htp.inf +38 -0
- data/ext/sources/ggml/src/ggml-hexagon/op-desc.h +153 -0
- data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +14 -13
- data/ext/sources/ggml/src/ggml-impl.h +129 -6
- data/ext/sources/ggml/src/ggml-metal/CMakeLists.txt +10 -10
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-common.cpp +15 -4
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.h +8 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.m +173 -34
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.cpp +912 -344
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.h +124 -59
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.m +588 -144
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +396 -23
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.cpp +1724 -421
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.h +16 -3
- data/ext/sources/ggml/src/ggml-metal/ggml-metal.cpp +333 -114
- data/ext/sources/ggml/src/ggml-metal/ggml-metal.metal +3050 -1539
- data/ext/sources/ggml/src/ggml-musa/CMakeLists.txt +3 -1
- data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +30 -1
- data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +4279 -497
- data/ext/sources/ggml/src/ggml-opencl/kernels/concat.cl +41 -99
- data/ext/sources/ggml/src/ggml-opencl/kernels/cpy.cl +45 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/cumsum.cl +139 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/cvt.cl +267 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/diag.cl +27 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/exp.cl +125 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/expm1.cl +113 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/fill.cl +17 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/flash_attn_f32.cl +4 -3
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_mxfp4_f32.cl +162 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q4_1_f32.cl +132 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_mxfp4_f32.cl +156 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_general_q8_0_f32.cl +195 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q4_1_f32.cl +283 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/get_rows.cl +36 -12
- data/ext/sources/ggml/src/ggml-opencl/kernels/l2_norm.cl +71 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mean.cl +140 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_kq_kqv.cl +273 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_l4_lm.cl +24 -10
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_f32_f32_l4_lm.cl +24 -10
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q4_0_f32_l4_lm.cl +163 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q4_1_f32_l4_lm.cl +165 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q6_k_f32_l4_lm.cl +158 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_8x4.cl +129 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_l4_lm.cl +154 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32.cl +219 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32_flat.cl +229 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_k_f32.cl +180 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/{mul_mv_q6_k.cl → mul_mv_q6_k_f32.cl} +4 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32_flat.cl +194 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/neg.cl +125 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/pad.cl +29 -20
- data/ext/sources/ggml/src/ggml-opencl/kernels/repeat.cl +31 -32
- data/ext/sources/ggml/src/ggml-opencl/kernels/rms_norm.cl +25 -10
- data/ext/sources/ggml/src/ggml-opencl/kernels/rope.cl +50 -24
- data/ext/sources/ggml/src/ggml-opencl/kernels/scale.cl +14 -4
- data/ext/sources/ggml/src/ggml-opencl/kernels/set_rows.cl +35 -16
- data/ext/sources/ggml/src/ggml-opencl/kernels/softplus.cl +116 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/solve_tri.cl +51 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/sqr.cl +53 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/sqrt.cl +53 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/ssm_conv.cl +77 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/sum_rows.cl +114 -13
- data/ext/sources/ggml/src/ggml-opencl/kernels/tanh.cl +94 -48
- data/ext/sources/ggml/src/ggml-opencl/kernels/transpose.cl +39 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/tri.cl +32 -0
- data/ext/sources/ggml/src/ggml-openvino/.clang-format +154 -0
- data/ext/sources/ggml/src/ggml-openvino/CMakeLists.txt +22 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-decoder.cpp +975 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-decoder.h +294 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-openvino-extra.cpp +373 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-openvino-extra.h +182 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-openvino.cpp +1110 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-quants.cpp +884 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-quants.h +153 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/decoder.h +74 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/frontend.cpp +27 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/frontend.h +23 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/input_model.cpp +17 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/input_model.h +29 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/node_context.h +112 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/cont.cpp +48 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/cpy.cpp +21 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp +90 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/get_rows.cpp +69 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/glu_geglu.cpp +61 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp +62 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/mulmat.cpp +90 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/permute.cpp +102 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/reshape.cpp +83 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp +46 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/rope.cpp +123 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/scale.cpp +41 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/set_rows.cpp +76 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/softmax.cpp +89 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/transpose.cpp +23 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/unary_silu.cpp +27 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/view.cpp +53 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op_table.cpp +46 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op_table.h +39 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.cpp +123 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.h +17 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.cpp +60 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.h +17 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/mark_decompression_convert_constant_folding.h +29 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/squeeze_matmul.cpp +58 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/squeeze_matmul.h +17 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/translate_session.cpp +293 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/translate_session.h +28 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/utils.cpp +226 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/utils.h +85 -0
- data/ext/sources/ggml/src/ggml-openvino/utils.cpp +823 -0
- data/ext/sources/ggml/src/ggml-openvino/utils.h +123 -0
- data/ext/sources/ggml/src/ggml-quants.c +96 -5
- data/ext/sources/ggml/src/ggml-quants.h +3 -0
- data/ext/sources/ggml/src/ggml-rpc/ggml-rpc.cpp +438 -156
- data/ext/sources/ggml/src/ggml-sycl/CMakeLists.txt +59 -87
- data/ext/sources/ggml/src/ggml-sycl/add-id.cpp +81 -0
- data/ext/sources/ggml/src/ggml-sycl/add-id.hpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/backend.hpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/binbcast.cpp +21 -29
- data/ext/sources/ggml/src/ggml-sycl/binbcast.hpp +0 -6
- data/ext/sources/ggml/src/ggml-sycl/common.hpp +427 -20
- data/ext/sources/ggml/src/ggml-sycl/concat.cpp +55 -44
- data/ext/sources/ggml/src/ggml-sycl/convert.cpp +103 -1
- data/ext/sources/ggml/src/ggml-sycl/convert.hpp +22 -1
- data/ext/sources/ggml/src/ggml-sycl/count-equal.cpp +79 -0
- data/ext/sources/ggml/src/ggml-sycl/count-equal.hpp +9 -0
- data/ext/sources/ggml/src/ggml-sycl/cpy.cpp +0 -3
- data/ext/sources/ggml/src/ggml-sycl/dequantize.hpp +18 -0
- data/ext/sources/ggml/src/ggml-sycl/dpct/helper.hpp +867 -50
- data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +401 -358
- data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +12 -2
- data/ext/sources/ggml/src/ggml-sycl/fattn-common.hpp +1179 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn-tile.cpp +55 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn-tile.hpp +1338 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn-vec.hpp +667 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn.cpp +225 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn.hpp +22 -0
- data/ext/sources/ggml/src/ggml-sycl/gated_delta_net.cpp +309 -0
- data/ext/sources/ggml/src/ggml-sycl/gated_delta_net.hpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/ggml-sycl.cpp +645 -155
- data/ext/sources/ggml/src/ggml-sycl/mmvq.cpp +22 -0
- data/ext/sources/ggml/src/ggml-sycl/norm.cpp +221 -66
- data/ext/sources/ggml/src/ggml-sycl/norm.hpp +2 -0
- data/ext/sources/ggml/src/ggml-sycl/outprod.cpp +3 -3
- data/ext/sources/ggml/src/ggml-sycl/pad.cpp +97 -0
- data/ext/sources/ggml/src/ggml-sycl/pad.hpp +24 -0
- data/ext/sources/ggml/src/ggml-sycl/pad_reflect_1d.cpp +100 -0
- data/ext/sources/ggml/src/ggml-sycl/pad_reflect_1d.hpp +10 -0
- data/ext/sources/ggml/src/ggml-sycl/presets.hpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/quants.hpp +1 -1
- data/ext/sources/ggml/src/ggml-sycl/repeat_back.cpp +76 -0
- data/ext/sources/ggml/src/ggml-sycl/repeat_back.hpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/roll.cpp +122 -0
- data/ext/sources/ggml/src/ggml-sycl/roll.hpp +20 -0
- data/ext/sources/ggml/src/ggml-sycl/rope.cpp +457 -281
- data/ext/sources/ggml/src/ggml-sycl/rope.hpp +6 -0
- data/ext/sources/ggml/src/ggml-sycl/set.cpp +73 -0
- data/ext/sources/ggml/src/ggml-sycl/set.hpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/softmax.cpp +327 -162
- data/ext/sources/ggml/src/ggml-sycl/softmax.hpp +4 -0
- data/ext/sources/ggml/src/ggml-sycl/ssm_conv.cpp +127 -0
- data/ext/sources/ggml/src/ggml-sycl/ssm_conv.hpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq112-dv112.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq128-dv128.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq256-dv256.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq40-dv40.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq576-dv512.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq64-dv64.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq72-dv72.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq80-dv80.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq96-dv96.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-f16.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q8_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-f16.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q8_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-f16.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q8_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-f16.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q8_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-f16.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q8_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-f16.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q8_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +71 -0
- data/ext/sources/ggml/src/ggml-sycl/wkv.cpp +1 -1
- data/ext/sources/ggml/src/ggml-virtgpu/CMakeLists.txt +70 -0
- data/ext/sources/ggml/src/ggml-virtgpu/apir_cs_ggml-rpc-front.cpp +87 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/CMakeLists.txt +21 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/apir_cs_ggml-rpc-back.cpp +115 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-convert.h +13 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-backend.cpp +102 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer-type.cpp +105 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer.cpp +179 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-device.cpp +148 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched.cpp +51 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched.gen.h +73 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched.h +27 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-virgl-apir.h +32 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend.cpp +144 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/api_remoting.h +95 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_backend.gen.h +94 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_backend.h +50 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_cs.h +378 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_cs_ggml.h +232 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_cs_rpc.h +58 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-buffer-type.cpp +81 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-buffer.cpp +119 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-device.cpp +158 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-reg.cpp +213 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend.cpp +69 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-remoting.h +71 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggmlremoting_functions.yaml +166 -0
- data/ext/sources/ggml/src/ggml-virtgpu/include/apir_hw.h +9 -0
- data/ext/sources/ggml/src/ggml-virtgpu/regenerate_remoting.py +333 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-apir.h +15 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-backend.cpp +58 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-buffer-type.cpp +110 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-buffer.cpp +173 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-device.cpp +192 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-impl.h +36 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward.gen.h +53 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-shm.cpp +98 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-shm.h +23 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-utils.cpp +179 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-utils.h +86 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu.cpp +544 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu.h +117 -0
- data/ext/sources/ggml/src/ggml-vulkan/CMakeLists.txt +39 -19
- data/ext/sources/ggml/src/ggml-vulkan/ggml-vulkan.cpp +5994 -3055
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/abs.comp +21 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/acc.comp +18 -10
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add1.comp +28 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add_id.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/arange.comp +20 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argmax.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp +33 -26
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argsort_large.comp +114 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/ceil.comp +22 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/clamp.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/concat.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/contig_copy.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_dw.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +47 -49
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +3 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +4 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_transpose.comp +67 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/cos.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/count_equal.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/count_experts.comp +51 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/cumsum.comp +83 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/cumsum_multipass1.comp +60 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/cumsum_multipass2.comp +66 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_f32.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{dequant_funcs.comp → dequant_funcs.glsl} +9 -21
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{dequant_funcs_cm2.comp → dequant_funcs_cm2.glsl} +18 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{dequant_head.comp → dequant_head.glsl} +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_m.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_s.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_s.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xs.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xxs.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_s.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_xxs.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_nl.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_xs.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_mxfp4.comp +3 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q2_k.comp +3 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q3_k.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_0.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_1.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_k.comp +3 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_0.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_1.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_k.comp +3 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q6_k.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q8_0.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/diag.comp +29 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/diag_mask_inf.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/div.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/elu.comp +27 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/exp.comp +3 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/fill.comp +19 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +386 -160
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{flash_attn_base.comp → flash_attn_base.glsl} +82 -20
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +400 -174
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +123 -37
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_mask_opt.comp +162 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +10 -9
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/floor.comp +22 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gated_delta_net.comp +128 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu_erf.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu_quick.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gelu.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gelu_erf.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gelu_quick.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{generic_binary_head.comp → generic_binary_head.glsl} +17 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{generic_head.comp → generic_head.glsl} +2 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{generic_unary_head.comp → generic_unary_head.glsl} +7 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/get_rows.comp +4 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/get_rows_quant.comp +3 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{glu_head.comp → glu_head.glsl} +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/group_norm.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/hardsigmoid.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/hardswish.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +19 -7
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col_3d.comp +2 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/l2_norm.comp +13 -10
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/leaky_relu.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/log.comp +18 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{mul_mat_vec_base.comp → mul_mat_vec_base.glsl} +77 -29
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iface.glsl +35 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_m.comp +71 -21
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_s.comp +41 -25
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_s.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xs.comp +44 -26
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xxs.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_s.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_xxs.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_nc.comp +9 -7
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_p021.comp +9 -7
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp +4 -6
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q3_k.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp +4 -6
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp +4 -6
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp +39 -36
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq_funcs.glsl +494 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +88 -105
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +41 -26
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{mul_mm_funcs.comp → mul_mm_funcs.glsl} +69 -59
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_id_funcs.glsl +74 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +92 -230
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.glsl +454 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_shmem_types.glsl +78 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp +97 -13
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/neg.comp +20 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/norm.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_adamw.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_sgd.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/pad.comp +21 -6
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/pool2d.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/quantize_q8_1.comp +10 -10
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/reglu.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/relu.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/repeat.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/repeat_back.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +49 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_back.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_partials.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/roll.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl +207 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.glsl +20 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +8 -49
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +8 -32
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +8 -32
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl +33 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_vision.comp +8 -38
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/round.comp +29 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sgn.comp +21 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sigmoid.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/silu.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/silu_back.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sin.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_back.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large1.comp +62 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large2.comp +79 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large3.comp +65 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large_common.glsl +53 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/softplus.comp +23 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/solve_tri.comp +81 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sqrt.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/square.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/ssm_conv.comp +50 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/ssm_scan.comp +124 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/step.comp +22 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sub.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.comp +2 -25
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.glsl +25 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/swiglu.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/swiglu_oai.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/tanh.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/timestep_embedding.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/topk_argsort.comp +118 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/topk_moe.comp +213 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/topk_nary_search.comp +246 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/tri.comp +43 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/trunc.comp +22 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{types.comp → types.glsl} +345 -26
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +90 -12
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +384 -180
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/xielu.comp +35 -0
- data/ext/sources/ggml/src/ggml-webgpu/CMakeLists.txt +28 -2
- data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp +1374 -0
- data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu.cpp +2544 -726
- data/ext/sources/ggml/src/ggml-webgpu/pre_wgsl.hpp +778 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/argmax.wgsl +72 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/argsort.wgsl +106 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/argsort_merge.wgsl +134 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary.wgsl +141 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +65 -72
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/concat.wgsl +75 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cpy.tmpl.wgsl +107 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cumsum.wgsl +66 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +73 -15
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn.wgsl +636 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{get_rows.tmpl.wgsl → get_rows.wgsl} +53 -259
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/glu.tmpl.wgsl +323 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{mul_mat.tmpl.wgsl → mul_mat.wgsl} +72 -261
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl +766 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.wgsl +147 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_subgroup_matrix.wgsl +196 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.wgsl +480 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/pad.wgsl +86 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/repeat.wgsl +67 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm.wgsl +83 -17
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rope.tmpl.wgsl +295 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/scale.wgsl +63 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +40 -12
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/soft_max.tmpl.wgsl +345 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/sum_rows.wgsl +55 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/unary.wgsl +193 -0
- data/ext/sources/ggml/src/ggml-zdnn/ggml-zdnn.cpp +6 -1
- data/ext/sources/ggml/src/ggml-zendnn/CMakeLists.txt +91 -0
- data/ext/sources/ggml/src/ggml-zendnn/ggml-zendnn.cpp +469 -0
- data/ext/sources/ggml/src/ggml.c +590 -64
- data/ext/sources/ggml/src/gguf.cpp +229 -44
- data/ext/sources/include/whisper.h +1 -0
- data/ext/sources/src/CMakeLists.txt +3 -1
- data/ext/sources/src/whisper.cpp +106 -62
- data/ext/sources/tests/CMakeLists.txt +2 -2
- data/ext/sources/tests/test-vad-full.cpp +4 -2
- data/ext/sources/tests/test-vad.cpp +1 -1
- data/extsources.rb +1 -0
- data/lib/whisper/model/uri.rb +17 -18
- data/sig/whisper.rbs +162 -4
- data/test/test_context_params.rb +82 -0
- data/test/test_params.rb +16 -8
- data/test/test_segment.rb +0 -1
- data/test/test_token.rb +81 -0
- data/test/test_vad.rb +1 -1
- data/test/test_vad_context.rb +100 -0
- data/test/test_vad_segment.rb +19 -0
- data/test/test_vad_segments.rb +16 -0
- data/test/test_whisper.rb +27 -0
- data/whispercpp.gemspec +1 -1
- metadata +502 -37
- data/ext/sources/build-xcframework.sh +0 -571
- data/ext/sources/examples/talk-llama/llama-sampling.h +0 -32
- data/ext/sources/ggml/cmake/BuildTypes.cmake +0 -54
- data/ext/sources/ggml/src/ggml-cann/Doxyfile +0 -2579
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.comp +0 -105
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.comp +0 -55
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/add.tmpl.wgsl +0 -44
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/add_in_place.tmpl.wgsl +0 -41
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary_head.tmpl +0 -45
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl +0 -60
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul.tmpl.wgsl +0 -44
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_in_place.tmpl.wgsl +0 -41
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm_in_place.wgsl +0 -48
- /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{test_bfloat16_support.comp → feature-tests/bfloat16.comp} +0 -0
- /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{test_coopmat_support.comp → feature-tests/coopmat.comp} +0 -0
- /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{test_coopmat2_support.comp → feature-tests/coopmat2.comp} +0 -0
- /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{test_integer_dot_support.comp → feature-tests/integer_dot.comp} +0 -0
- /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{glu_main.comp → glu_main.glsl} +0 -0
- /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{rte.comp → rte.glsl} +0 -0
- /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{utils.comp → utils.glsl} +0 -0
|
@@ -1,6 +1,10 @@
|
|
|
1
|
+
#include "llama.h"
|
|
2
|
+
|
|
3
|
+
#include "ggml-cpp.h"
|
|
1
4
|
#include "llama-impl.h"
|
|
2
5
|
|
|
3
6
|
#include "llama-chat.h"
|
|
7
|
+
#include "llama-context.h"
|
|
4
8
|
#include "llama-mmap.h"
|
|
5
9
|
#include "llama-vocab.h"
|
|
6
10
|
#include "llama-model-loader.h"
|
|
@@ -9,13 +13,17 @@
|
|
|
9
13
|
|
|
10
14
|
#include "ggml.h"
|
|
11
15
|
#include "ggml-backend.h"
|
|
16
|
+
#include "gguf.h"
|
|
12
17
|
|
|
13
18
|
#include <algorithm>
|
|
19
|
+
#include <cassert>
|
|
20
|
+
#include <cinttypes>
|
|
14
21
|
#include <cstddef>
|
|
15
22
|
#include <cstdint>
|
|
16
23
|
#include <cstdio>
|
|
17
24
|
#include <cstring>
|
|
18
25
|
#include <ctime>
|
|
26
|
+
#include <stdexcept>
|
|
19
27
|
|
|
20
28
|
#if defined(_MSC_VER)
|
|
21
29
|
#pragma warning(disable: 4244 4267) // possible loss of data
|
|
@@ -37,9 +45,725 @@ const char * llama_flash_attn_type_name(enum llama_flash_attn_type flash_attn_ty
|
|
|
37
45
|
GGML_ABORT("fatal error");
|
|
38
46
|
}
|
|
39
47
|
|
|
48
|
+
struct llama_device_memory_data {
|
|
49
|
+
int64_t total;
|
|
50
|
+
int64_t free;
|
|
51
|
+
llama_memory_breakdown_data mb;
|
|
52
|
+
};
|
|
53
|
+
|
|
54
|
+
static std::vector<llama_device_memory_data> llama_get_device_memory_data(
|
|
55
|
+
const char * path_model, const llama_model_params * mparams, const llama_context_params * cparams,
|
|
56
|
+
std::vector<ggml_backend_dev_t> & devs, uint32_t & hp_ngl, uint32_t & hp_n_ctx_train, uint32_t & hp_n_expert,
|
|
57
|
+
const ggml_log_level log_level) {
|
|
58
|
+
struct user_data_t {
|
|
59
|
+
struct {
|
|
60
|
+
ggml_log_callback callback;
|
|
61
|
+
void * user_data;
|
|
62
|
+
} original_logger;
|
|
63
|
+
ggml_log_level min_level; // prints below this log level go to debug log
|
|
64
|
+
};
|
|
65
|
+
user_data_t ud;
|
|
66
|
+
llama_log_get(&ud.original_logger.callback, &ud.original_logger.user_data);
|
|
67
|
+
ud.min_level = log_level;
|
|
68
|
+
|
|
69
|
+
llama_log_set([](ggml_log_level level, const char * text, void * user_data) {
|
|
70
|
+
const user_data_t * ud = (const user_data_t *) user_data;
|
|
71
|
+
const ggml_log_level level_eff = level >= ud->min_level ? level : GGML_LOG_LEVEL_DEBUG;
|
|
72
|
+
ud->original_logger.callback(level_eff, text, ud->original_logger.user_data);
|
|
73
|
+
}, &ud);
|
|
74
|
+
|
|
75
|
+
llama_model_params mparams_copy = *mparams;
|
|
76
|
+
mparams_copy.no_alloc = true;
|
|
77
|
+
mparams_copy.use_mmap = false;
|
|
78
|
+
mparams_copy.use_mlock = false;
|
|
79
|
+
|
|
80
|
+
llama_model * model = llama_model_load_from_file(path_model, mparams_copy);
|
|
81
|
+
if (model == nullptr) {
|
|
82
|
+
llama_log_set(ud.original_logger.callback, ud.original_logger.user_data);
|
|
83
|
+
throw std::runtime_error("failed to load model");
|
|
84
|
+
}
|
|
85
|
+
|
|
86
|
+
llama_context * ctx = llama_init_from_model(model, *cparams);
|
|
87
|
+
if (ctx == nullptr) {
|
|
88
|
+
llama_model_free(model);
|
|
89
|
+
llama_log_set(ud.original_logger.callback, ud.original_logger.user_data);
|
|
90
|
+
throw std::runtime_error("failed to create llama_context from model");
|
|
91
|
+
}
|
|
92
|
+
|
|
93
|
+
std::vector<llama_device_memory_data> ret(model->devices.size());
|
|
94
|
+
|
|
95
|
+
std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> memory_breakdown = ctx->memory_breakdown();
|
|
96
|
+
|
|
97
|
+
for (const auto & [buft, mb] : memory_breakdown) {
|
|
98
|
+
if (ggml_backend_buft_is_host(buft)) {
|
|
99
|
+
continue;
|
|
100
|
+
}
|
|
101
|
+
|
|
102
|
+
ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft);
|
|
103
|
+
if (!dev) {
|
|
104
|
+
continue;
|
|
105
|
+
}
|
|
106
|
+
for (size_t i = 0; i < ret.size(); i++) {
|
|
107
|
+
if (model->devices[i] == dev) {
|
|
108
|
+
ret[i].mb.model += mb.model;
|
|
109
|
+
ret[i].mb.context += mb.context;
|
|
110
|
+
ret[i].mb.compute += mb.compute;
|
|
111
|
+
break;
|
|
112
|
+
}
|
|
113
|
+
}
|
|
114
|
+
}
|
|
115
|
+
for (size_t i = 0; i < ret.size(); i++) {
|
|
116
|
+
size_t free;
|
|
117
|
+
size_t total;
|
|
118
|
+
ggml_backend_dev_memory(model->devices[i], &free, &total);
|
|
119
|
+
|
|
120
|
+
// devices can return 0 bytes for free and total memory if they do not
|
|
121
|
+
// have any to report. in this case, we will use the host memory as a fallback
|
|
122
|
+
// fixes: https://github.com/ggml-org/llama.cpp/issues/18577
|
|
123
|
+
if (free == 0 && total == 0) {
|
|
124
|
+
ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
|
|
125
|
+
if (cpu_dev == nullptr) {
|
|
126
|
+
throw std::runtime_error(format("%s: no CPU backend found", __func__));
|
|
127
|
+
}
|
|
128
|
+
ggml_backend_dev_memory(cpu_dev, &free, &total);
|
|
129
|
+
}
|
|
130
|
+
ret[i].free = free;
|
|
131
|
+
ret[i].total = total;
|
|
132
|
+
}
|
|
133
|
+
|
|
134
|
+
devs = model->devices;
|
|
135
|
+
hp_ngl = model->hparams.n_layer;
|
|
136
|
+
hp_n_ctx_train = model->hparams.n_ctx_train;
|
|
137
|
+
hp_n_expert = model->hparams.n_expert;
|
|
138
|
+
|
|
139
|
+
llama_memory_breakdown_print(ctx); // goes to debug log
|
|
140
|
+
|
|
141
|
+
llama_free(ctx);
|
|
142
|
+
llama_model_free(model);
|
|
143
|
+
llama_log_set(ud.original_logger.callback, ud.original_logger.user_data);
|
|
144
|
+
return ret;
|
|
145
|
+
}
|
|
146
|
+
|
|
147
|
+
// Identifies which part of a layer is meant when distributing the tensors of that layer.
// The values are cumulative: each one covers everything covered by the values before it.
// NOTE: this enum is only used in llama_params_fit_impl but needs to be defined outside of it
//       to fix a Windows compilation issue.
enum layer_fraction_t {
    // nothing (value 0)
    LAYER_FRACTION_NONE = 0,
    // attention (value 1)
    LAYER_FRACTION_ATTN,
    // attention + up (value 2)
    LAYER_FRACTION_UP,
    // attention + up + gate (value 3)
    LAYER_FRACTION_GATE,
    // everything but sparse MoE weights (value 4)
    LAYER_FRACTION_MOE,
};
|
|
156
|
+
|
|
157
|
+
// Exception type thrown by llama_params_fit_impl when the parameters cannot be adjusted,
// e.g. because the user already set a conflicting parameter or a required buffer was not provided.
// It adds no state of its own, the message is carried by std::runtime_error.
class llama_params_fit_exception : public std::runtime_error {
public:
    // make every std::runtime_error constructor available for this type
    using std::runtime_error::runtime_error;
};
|
|
160
|
+
|
|
161
|
+
static void llama_params_fit_impl(
|
|
162
|
+
const char * path_model, struct llama_model_params * mparams, struct llama_context_params * cparams,
|
|
163
|
+
float * tensor_split, struct llama_model_tensor_buft_override * tensor_buft_overrides,
|
|
164
|
+
size_t * margins_s, uint32_t n_ctx_min, enum ggml_log_level log_level) {
|
|
165
|
+
constexpr int64_t MiB = 1024*1024;
|
|
166
|
+
typedef std::vector<llama_device_memory_data> dmds_t;
|
|
167
|
+
const llama_model_params default_mparams = llama_model_default_params();
|
|
168
|
+
|
|
169
|
+
std::vector<ggml_backend_dev_t> devs;
|
|
170
|
+
uint32_t hp_ngl = 0; // hparams.n_gpu_layers
|
|
171
|
+
uint32_t hp_nct = 0; // hparams.n_ctx_train
|
|
172
|
+
uint32_t hp_nex = 0; // hparams.n_expert
|
|
173
|
+
|
|
174
|
+
// step 1: get data for default parameters and check whether any changes are necessary in the first place
|
|
175
|
+
|
|
176
|
+
LLAMA_LOG_DEBUG("%s: getting device memory data for initial parameters:\n", __func__);
|
|
177
|
+
const dmds_t dmds_full = llama_get_device_memory_data(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
|
|
178
|
+
const size_t nd = devs.size(); // number of devices
|
|
179
|
+
if (nd == 0) {
|
|
180
|
+
LLAMA_LOG_INFO("%s: no devices with dedicated memory found\n", __func__);
|
|
181
|
+
return;
|
|
182
|
+
}
|
|
183
|
+
|
|
184
|
+
std::vector<int64_t> margins; // this function uses int64_t rather than size_t for memory sizes to more conveniently handle deficits
|
|
185
|
+
margins.reserve(nd);
|
|
186
|
+
for (size_t id = 0; id < nd; id++) {
|
|
187
|
+
margins.push_back(margins_s[id]);
|
|
188
|
+
}
|
|
189
|
+
|
|
190
|
+
std::vector<std::string> dev_names;
|
|
191
|
+
{
|
|
192
|
+
dev_names.reserve(nd);
|
|
193
|
+
size_t max_length = 0;
|
|
194
|
+
for (ggml_backend_dev_t dev : devs) {
|
|
195
|
+
std::string name = ggml_backend_dev_name(dev);
|
|
196
|
+
name += " (";
|
|
197
|
+
name += ggml_backend_dev_description(dev);
|
|
198
|
+
name += ")";
|
|
199
|
+
dev_names.push_back(name);
|
|
200
|
+
max_length = std::max(max_length, name.length());
|
|
201
|
+
}
|
|
202
|
+
for (std::string & dn : dev_names) {
|
|
203
|
+
dn.insert(dn.end(), max_length - dn.length(), ' ');
|
|
204
|
+
}
|
|
205
|
+
}
|
|
206
|
+
|
|
207
|
+
int64_t sum_free = 0;
|
|
208
|
+
int64_t sum_projected_free = 0;
|
|
209
|
+
int64_t sum_projected_used = 0;
|
|
210
|
+
int64_t sum_projected_model = 0;
|
|
211
|
+
std::vector<int64_t> projected_free_per_device;
|
|
212
|
+
projected_free_per_device.reserve(nd);
|
|
213
|
+
|
|
214
|
+
if (nd > 1) {
|
|
215
|
+
LLAMA_LOG_INFO("%s: projected memory use with initial parameters [MiB]:\n", __func__);
|
|
216
|
+
}
|
|
217
|
+
for (size_t id = 0; id < nd; id++) {
|
|
218
|
+
const llama_device_memory_data & dmd = dmds_full[id];
|
|
219
|
+
|
|
220
|
+
const int64_t projected_used = dmd.mb.total();
|
|
221
|
+
const int64_t projected_free = dmd.free - projected_used;
|
|
222
|
+
projected_free_per_device.push_back(projected_free);
|
|
223
|
+
|
|
224
|
+
sum_free += dmd.free;
|
|
225
|
+
sum_projected_used += projected_used;
|
|
226
|
+
sum_projected_free += projected_free;
|
|
227
|
+
sum_projected_model += dmd.mb.model;
|
|
228
|
+
|
|
229
|
+
if (nd > 1) {
|
|
230
|
+
LLAMA_LOG_INFO("%s: - %s: %6" PRId64 " total, %6" PRId64 " used, %6" PRId64 " free vs. target of %6" PRId64 "\n",
|
|
231
|
+
__func__, dev_names[id].c_str(), dmd.total/MiB, projected_used/MiB, projected_free/MiB, margins[id]/MiB);
|
|
232
|
+
}
|
|
233
|
+
}
|
|
234
|
+
assert(sum_free >= 0 && sum_projected_used >= 0);
|
|
235
|
+
LLAMA_LOG_INFO("%s: projected to use %" PRId64 " MiB of device memory vs. %" PRId64 " MiB of free device memory\n",
|
|
236
|
+
__func__, sum_projected_used/MiB, sum_free/MiB);
|
|
237
|
+
if (nd == 1) {
|
|
238
|
+
if (projected_free_per_device[0] >= margins[0]) {
|
|
239
|
+
LLAMA_LOG_INFO("%s: will leave %" PRId64 " >= %" PRId64 " MiB of free device memory, no changes needed\n",
|
|
240
|
+
__func__, projected_free_per_device[0]/MiB, margins[0]/MiB);
|
|
241
|
+
return;
|
|
242
|
+
}
|
|
243
|
+
} else {
|
|
244
|
+
bool changes_needed = false;
|
|
245
|
+
for (size_t id = 0; id < nd; id++) {
|
|
246
|
+
if (projected_free_per_device[id] < margins[id]) {
|
|
247
|
+
changes_needed = true;
|
|
248
|
+
break;
|
|
249
|
+
}
|
|
250
|
+
}
|
|
251
|
+
if (!changes_needed) {
|
|
252
|
+
LLAMA_LOG_INFO("%s: targets for free memory can be met on all devices, no changes needed\n", __func__);
|
|
253
|
+
return;
|
|
254
|
+
}
|
|
255
|
+
}
|
|
256
|
+
|
|
257
|
+
// step 2: try reducing memory use by reducing the context size
|
|
258
|
+
|
|
259
|
+
{
|
|
260
|
+
int64_t global_surplus = sum_projected_free;
|
|
261
|
+
for (size_t id = 0; id < nd; id++) {
|
|
262
|
+
global_surplus -= margins[id];
|
|
263
|
+
}
|
|
264
|
+
if (global_surplus < 0) {
|
|
265
|
+
if (nd == 1) {
|
|
266
|
+
LLAMA_LOG_INFO("%s: cannot meet free memory target of %" PRId64 " MiB, need to reduce device memory by %" PRId64 " MiB\n",
|
|
267
|
+
__func__, margins[0]/MiB, -global_surplus/MiB);
|
|
268
|
+
} else {
|
|
269
|
+
LLAMA_LOG_INFO(
|
|
270
|
+
"%s: cannot meet free memory targets on all devices, need to use %" PRId64 " MiB less in total\n",
|
|
271
|
+
__func__, -global_surplus/MiB);
|
|
272
|
+
}
|
|
273
|
+
if (cparams->n_ctx == 0) {
|
|
274
|
+
if (hp_nct > n_ctx_min) {
|
|
275
|
+
int64_t sum_used_target = sum_free;
|
|
276
|
+
for (size_t id = 0; id < nd; id++) {
|
|
277
|
+
sum_used_target -= margins[id];
|
|
278
|
+
}
|
|
279
|
+
if (nd > 1) {
|
|
280
|
+
// for multiple devices we need to be more conservative in terms of how much context we think can fit:
|
|
281
|
+
// - for dense models only whole layers can be assigned to devices
|
|
282
|
+
// - for MoE models only whole tensors can be assigned to devices, which we estimate to be <= 1/3 of a layer
|
|
283
|
+
// - on average we expect a waste of 0.5 layers/tensors per device
|
|
284
|
+
// - use slightly more than the expected average for nd devices to be safe
|
|
285
|
+
const int64_t model_per_layer = sum_projected_model / std::min(uint32_t(mparams->n_gpu_layers), hp_ngl);
|
|
286
|
+
sum_used_target -= (nd + 1) * model_per_layer / (hp_nex == 0 ? 2 : 6);
|
|
287
|
+
}
|
|
288
|
+
|
|
289
|
+
int64_t sum_projected_used_min_ctx = 0;
|
|
290
|
+
cparams->n_ctx = n_ctx_min;
|
|
291
|
+
const dmds_t dmds_min_ctx = llama_get_device_memory_data(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
|
|
292
|
+
for (const auto & dmd : dmds_min_ctx) {
|
|
293
|
+
sum_projected_used_min_ctx += dmd.mb.total();
|
|
294
|
+
}
|
|
295
|
+
if (sum_used_target > sum_projected_used_min_ctx) {
|
|
296
|
+
// linear interpolation between minimum and maximum context size:
|
|
297
|
+
cparams->n_ctx += (hp_nct - n_ctx_min) * (sum_used_target - sum_projected_used_min_ctx)
|
|
298
|
+
/ (sum_projected_used - sum_projected_used_min_ctx);
|
|
299
|
+
cparams->n_ctx = std::max(cparams->n_ctx - cparams->n_ctx % 256, n_ctx_min); // round down context for CUDA backend
|
|
300
|
+
|
|
301
|
+
const int64_t bytes_per_ctx = (sum_projected_used - sum_projected_used_min_ctx) / (hp_nct - n_ctx_min);
|
|
302
|
+
const int64_t memory_reduction = (hp_nct - cparams->n_ctx) * bytes_per_ctx;
|
|
303
|
+
LLAMA_LOG_INFO("%s: context size reduced from %" PRIu32 " to %" PRIu32 " -> need %" PRId64 " MiB less memory in total\n",
|
|
304
|
+
__func__, hp_nct, cparams->n_ctx, memory_reduction/MiB);
|
|
305
|
+
if (nd == 1) {
|
|
306
|
+
LLAMA_LOG_INFO("%s: entire model can be fit by reducing context\n", __func__);
|
|
307
|
+
return;
|
|
308
|
+
}
|
|
309
|
+
LLAMA_LOG_INFO("%s: entire model should be fit across devices by reducing context\n", __func__);
|
|
310
|
+
} else {
|
|
311
|
+
const int64_t memory_reduction = sum_projected_used - sum_projected_used_min_ctx;
|
|
312
|
+
LLAMA_LOG_INFO("%s: context size reduced from %" PRIu32 " to %" PRIu32 " -> need %" PRId64 " MiB less memory in total\n",
|
|
313
|
+
__func__, hp_nct, cparams->n_ctx, memory_reduction/MiB);
|
|
314
|
+
}
|
|
315
|
+
} else {
|
|
316
|
+
if (n_ctx_min == UINT32_MAX) {
|
|
317
|
+
LLAMA_LOG_INFO("%s: user has requested full context size of %" PRIu32 " -> no change\n", __func__, hp_nct);
|
|
318
|
+
} else {
|
|
319
|
+
LLAMA_LOG_INFO("%s: default model context size is %" PRIu32 " which is <= the min. context size of %" PRIu32 " -> no change\n",
|
|
320
|
+
__func__, hp_nct, n_ctx_min);
|
|
321
|
+
}
|
|
322
|
+
}
|
|
323
|
+
} else {
|
|
324
|
+
LLAMA_LOG_INFO("%s: context size set by user to %" PRIu32 " -> no change\n", __func__, cparams->n_ctx);
|
|
325
|
+
}
|
|
326
|
+
}
|
|
327
|
+
}
|
|
328
|
+
|
|
329
|
+
if (mparams->n_gpu_layers != default_mparams.n_gpu_layers) {
|
|
330
|
+
throw llama_params_fit_exception("n_gpu_layers already set by user to " + std::to_string(mparams->n_gpu_layers) + ", abort");
|
|
331
|
+
}
|
|
332
|
+
if (nd > 1) {
|
|
333
|
+
if (!tensor_split) {
|
|
334
|
+
throw llama_params_fit_exception("did not provide a buffer to write the tensor_split to, abort");
|
|
335
|
+
}
|
|
336
|
+
if (mparams->tensor_split) {
|
|
337
|
+
for (size_t id = 0; id < nd; id++) {
|
|
338
|
+
if (mparams->tensor_split[id] != 0.0f) {
|
|
339
|
+
throw llama_params_fit_exception("model_params::tensor_split already set by user, abort");
|
|
340
|
+
}
|
|
341
|
+
}
|
|
342
|
+
}
|
|
343
|
+
if (mparams->split_mode == LLAMA_SPLIT_MODE_ROW) {
|
|
344
|
+
throw llama_params_fit_exception("changing weight allocation for LLAMA_SPLIT_MODE_ROW not implemented, abort");
|
|
345
|
+
}
|
|
346
|
+
}
|
|
347
|
+
if (!tensor_buft_overrides) {
|
|
348
|
+
throw llama_params_fit_exception("did not provide buffer to set tensor_buft_overrides, abort");
|
|
349
|
+
}
|
|
350
|
+
if (mparams->tensor_buft_overrides && (mparams->tensor_buft_overrides->pattern || mparams->tensor_buft_overrides->buft)) {
|
|
351
|
+
throw llama_params_fit_exception("model_params::tensor_buft_overrides already set by user, abort");
|
|
352
|
+
}
|
|
353
|
+
|
|
354
|
+
// step 3: iteratively fill the back to front with "dense" layers
|
|
355
|
+
// - for a dense model simply fill full layers, giving each device a contiguous slice of the model
|
|
356
|
+
// - for a MoE model, same as dense model but with all MoE tensors in system memory
|
|
357
|
+
|
|
358
|
+
// utility function that returns a static C string matching the tensors for a specific layer index and layer fraction:
|
|
359
|
+
auto get_overflow_pattern = [&](const size_t il, const layer_fraction_t lf) -> const char * {
|
|
360
|
+
constexpr size_t n_strings = 1000;
|
|
361
|
+
if (il >= n_strings) {
|
|
362
|
+
throw std::runtime_error("at most " + std::to_string(n_strings) + " model layers are supported");
|
|
363
|
+
}
|
|
364
|
+
switch (lf) {
|
|
365
|
+
case LAYER_FRACTION_ATTN: {
|
|
366
|
+
static std::array<std::string, n_strings> patterns;
|
|
367
|
+
if (patterns[il].empty()) {
|
|
368
|
+
patterns[il] = "blk\\." + std::to_string(il) + "\\.ffn_(up|gate|down).*";
|
|
369
|
+
}
|
|
370
|
+
return patterns[il].c_str();
|
|
371
|
+
}
|
|
372
|
+
case LAYER_FRACTION_UP: {
|
|
373
|
+
static std::array<std::string, n_strings> patterns;
|
|
374
|
+
if (patterns[il].empty()) {
|
|
375
|
+
patterns[il] = "blk\\." + std::to_string(il) + "\\.ffn_(gate|down).*";
|
|
376
|
+
}
|
|
377
|
+
return patterns[il].c_str();
|
|
378
|
+
}
|
|
379
|
+
case LAYER_FRACTION_GATE: {
|
|
380
|
+
static std::array<std::string, n_strings> patterns;
|
|
381
|
+
if (patterns[il].empty()) {
|
|
382
|
+
patterns[il] = "blk\\." + std::to_string(il) + "\\.ffn_down.*";
|
|
383
|
+
}
|
|
384
|
+
return patterns[il].c_str();
|
|
385
|
+
}
|
|
386
|
+
case LAYER_FRACTION_MOE: {
|
|
387
|
+
static std::array<std::string, n_strings> patterns;
|
|
388
|
+
if (patterns[il].empty()) {
|
|
389
|
+
patterns[il] = "blk\\." + std::to_string(il) + "\\.ffn_(up|down|gate)_(ch|)exps";
|
|
390
|
+
}
|
|
391
|
+
return patterns[il].c_str();
|
|
392
|
+
}
|
|
393
|
+
default:
|
|
394
|
+
GGML_ABORT("fatal error");
|
|
395
|
+
}
|
|
396
|
+
};
|
|
397
|
+
|
|
398
|
+
struct ngl_t {
|
|
399
|
+
uint32_t n_layer = 0; // number of total layers
|
|
400
|
+
uint32_t n_part = 0; // number of partial layers, <= n_layer
|
|
401
|
+
|
|
402
|
+
// for the first partial layer varying parts can overflow, all further layers use LAYER_FRACTION_MOE:
|
|
403
|
+
layer_fraction_t overflow_type = LAYER_FRACTION_MOE;
|
|
404
|
+
|
|
405
|
+
uint32_t n_full() const {
|
|
406
|
+
assert(n_layer >= n_part);
|
|
407
|
+
return n_layer - n_part;
|
|
408
|
+
}
|
|
409
|
+
};
|
|
410
|
+
|
|
411
|
+
const size_t ntbo = llama_max_tensor_buft_overrides();
|
|
412
|
+
|
|
413
|
+
// utility function to set n_gpu_layers and tensor_split
|
|
414
|
+
auto set_ngl_tensor_split_tbo = [&](
|
|
415
|
+
const std::vector<ngl_t> & ngl_per_device,
|
|
416
|
+
const std::vector<ggml_backend_buffer_type_t> & overflow_bufts,
|
|
417
|
+
llama_model_params & mparams) {
|
|
418
|
+
mparams.n_gpu_layers = 0;
|
|
419
|
+
for (size_t id = 0; id < nd; id++) {
|
|
420
|
+
mparams.n_gpu_layers += ngl_per_device[id].n_layer;
|
|
421
|
+
if (nd > 1) {
|
|
422
|
+
tensor_split[id] = ngl_per_device[id].n_layer;
|
|
423
|
+
}
|
|
424
|
+
}
|
|
425
|
+
assert(uint32_t(mparams.n_gpu_layers) <= hp_ngl + 1);
|
|
426
|
+
uint32_t il0 = hp_ngl + 1 - mparams.n_gpu_layers; // start index for tensor buft overrides
|
|
427
|
+
|
|
428
|
+
mparams.tensor_split = tensor_split;
|
|
429
|
+
|
|
430
|
+
size_t itbo = 0;
|
|
431
|
+
for (size_t id = 0; id < nd; id++) {
|
|
432
|
+
il0 += ngl_per_device[id].n_full();
|
|
433
|
+
for (uint32_t il = il0; il < il0 + ngl_per_device[id].n_part; il++) {
|
|
434
|
+
if (itbo + 1 >= ntbo) {
|
|
435
|
+
tensor_buft_overrides[itbo].pattern = nullptr;
|
|
436
|
+
tensor_buft_overrides[itbo].buft = nullptr;
|
|
437
|
+
itbo++;
|
|
438
|
+
mparams.tensor_buft_overrides = tensor_buft_overrides;
|
|
439
|
+
throw llama_params_fit_exception("llama_max_tensor_buft_overrides() == "
|
|
440
|
+
+ std::to_string(ntbo) + " is insufficient for model");
|
|
441
|
+
}
|
|
442
|
+
tensor_buft_overrides[itbo].pattern = get_overflow_pattern(il, il == il0 ? ngl_per_device[id].overflow_type : LAYER_FRACTION_MOE);
|
|
443
|
+
tensor_buft_overrides[itbo].buft = il == il0 ? overflow_bufts[id] : ggml_backend_cpu_buffer_type();
|
|
444
|
+
itbo++;
|
|
445
|
+
}
|
|
446
|
+
il0 += ngl_per_device[id].n_part;
|
|
447
|
+
}
|
|
448
|
+
tensor_buft_overrides[itbo].pattern = nullptr;
|
|
449
|
+
tensor_buft_overrides[itbo].buft = nullptr;
|
|
450
|
+
itbo++;
|
|
451
|
+
mparams.tensor_buft_overrides = tensor_buft_overrides;
|
|
452
|
+
};
|
|
453
|
+
|
|
454
|
+
// utility function that returns the memory use per device for given numbers of layers per device
|
|
455
|
+
auto get_memory_for_layers = [&](
|
|
456
|
+
const char * func_name,
|
|
457
|
+
const std::vector<ngl_t> & ngl_per_device,
|
|
458
|
+
const std::vector<ggml_backend_buffer_type_t> & overflow_bufts) -> std::vector<int64_t> {
|
|
459
|
+
llama_model_params mparams_copy = *mparams;
|
|
460
|
+
set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, mparams_copy);
|
|
461
|
+
|
|
462
|
+
const dmds_t dmd_nl = llama_get_device_memory_data(
|
|
463
|
+
path_model, &mparams_copy, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
|
|
464
|
+
|
|
465
|
+
LLAMA_LOG_DEBUG("%s: memory for test allocation by device:\n", func_name);
|
|
466
|
+
for (size_t id = 0; id < nd; id++) {
|
|
467
|
+
const ngl_t & n = ngl_per_device[id];
|
|
468
|
+
LLAMA_LOG_DEBUG(
|
|
469
|
+
"%s: id=%zu, n_layer=%2" PRIu32 ", n_part=%2" PRIu32 ", overflow_type=%d, mem=%6" PRId64 " MiB\n",
|
|
470
|
+
func_name, id, n.n_layer, n.n_part, int(n.overflow_type), dmd_nl[id].mb.total()/MiB);
|
|
471
|
+
}
|
|
472
|
+
|
|
473
|
+
std::vector<int64_t> ret;
|
|
474
|
+
ret.reserve(nd);
|
|
475
|
+
for (const llama_device_memory_data & dmd : dmd_nl) {
|
|
476
|
+
ret.push_back(dmd.mb.total());
|
|
477
|
+
}
|
|
478
|
+
return ret;
|
|
479
|
+
};
|
|
480
|
+
|
|
481
|
+
int64_t global_surplus_cpu_moe = 0;
|
|
482
|
+
if (hp_nex > 0) {
|
|
483
|
+
const static std::string pattern_moe_all = "blk\\.\\d+\\.ffn_(up|down|gate)_(ch|)exps"; // matches all MoE tensors
|
|
484
|
+
ggml_backend_buffer_type_t cpu_buft = ggml_backend_cpu_buffer_type();
|
|
485
|
+
tensor_buft_overrides[0] = {pattern_moe_all.c_str(), cpu_buft};
|
|
486
|
+
tensor_buft_overrides[1] = {nullptr, nullptr};
|
|
487
|
+
mparams->tensor_buft_overrides = tensor_buft_overrides;
|
|
488
|
+
|
|
489
|
+
LLAMA_LOG_DEBUG("%s: getting device memory data with all MoE tensors moved to system memory:\n", __func__);
|
|
490
|
+
const dmds_t dmds_cpu_moe = llama_get_device_memory_data(
|
|
491
|
+
path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
|
|
492
|
+
|
|
493
|
+
for (size_t id = 0; id < nd; id++) {
|
|
494
|
+
global_surplus_cpu_moe += dmds_cpu_moe[id].free;
|
|
495
|
+
global_surplus_cpu_moe -= int64_t(dmds_cpu_moe[id].mb.total()) + margins[id];
|
|
496
|
+
}
|
|
497
|
+
|
|
498
|
+
if (global_surplus_cpu_moe > 0) {
|
|
499
|
+
LLAMA_LOG_INFO("%s: with only dense weights in device memory there is a total surplus of %" PRId64 " MiB\n",
|
|
500
|
+
__func__, global_surplus_cpu_moe/MiB);
|
|
501
|
+
} else {
|
|
502
|
+
LLAMA_LOG_INFO("%s: with only dense weights in device memory there is still a total deficit of %" PRId64 " MiB\n",
|
|
503
|
+
__func__, -global_surplus_cpu_moe/MiB);
|
|
504
|
+
}
|
|
505
|
+
|
|
506
|
+
// reset
|
|
507
|
+
tensor_buft_overrides[0] = {nullptr, nullptr};
|
|
508
|
+
mparams->tensor_buft_overrides = tensor_buft_overrides;
|
|
509
|
+
}
|
|
510
|
+
|
|
511
|
+
std::vector<int64_t> targets; // maximum acceptable memory use per device
|
|
512
|
+
targets.reserve(nd);
|
|
513
|
+
for (size_t id = 0; id < nd; id++) {
|
|
514
|
+
targets.push_back(dmds_full[id].free - margins[id]);
|
|
515
|
+
LLAMA_LOG_DEBUG("%s: id=%zu, target=%" PRId64 " MiB\n", __func__, id, targets[id]/MiB);
|
|
516
|
+
}
|
|
517
|
+
|
|
518
|
+
std::vector<ggml_backend_buffer_type_t> overflow_bufts; // which bufts the first partial layer of a device overflows to:
|
|
519
|
+
overflow_bufts.reserve(nd);
|
|
520
|
+
for (size_t id = 0; id < nd; id++) {
|
|
521
|
+
overflow_bufts.push_back(ggml_backend_cpu_buffer_type());
|
|
522
|
+
}
|
|
523
|
+
|
|
524
|
+
std::vector<ngl_t> ngl_per_device(nd);
|
|
525
|
+
std::vector<int64_t> mem = get_memory_for_layers(__func__, ngl_per_device, overflow_bufts);
|
|
526
|
+
|
|
527
|
+
// optimize the number of layers per device using the method of false position:
|
|
528
|
+
// - ngl_per_device has 0 layers for each device, lower bound
|
|
529
|
+
// - try a "high" configuration where a device is given all unassigned layers
|
|
530
|
+
// - interpolate the memory use / layer between low and high linearly to get a guess where it meets our target
|
|
531
|
+
// - check memory use of our guess, replace either the low or high bound
|
|
532
|
+
// - once we only have a difference of a single layer, stop and return the lower bound that just barely still fits
|
|
533
|
+
// - the last device has the output layer, which cannot be a partial layer
|
|
534
|
+
if (hp_nex == 0) {
|
|
535
|
+
LLAMA_LOG_INFO("%s: filling dense layers back-to-front:\n", __func__);
|
|
536
|
+
} else {
|
|
537
|
+
LLAMA_LOG_INFO("%s: filling dense-only layers back-to-front:\n", __func__);
|
|
538
|
+
}
|
|
539
|
+
for (int id = nd - 1; id >= 0; id--) {
|
|
540
|
+
uint32_t n_unassigned = hp_ngl + 1;
|
|
541
|
+
for (size_t jd = id + 1; jd < nd; ++jd) {
|
|
542
|
+
assert(n_unassigned >= ngl_per_device[jd].n_layer);
|
|
543
|
+
n_unassigned -= ngl_per_device[jd].n_layer;
|
|
544
|
+
}
|
|
545
|
+
|
|
546
|
+
std::vector<ngl_t> ngl_per_device_high = ngl_per_device;
|
|
547
|
+
ngl_per_device_high[id].n_layer = n_unassigned;
|
|
548
|
+
if (hp_nex > 0) {
|
|
549
|
+
ngl_per_device_high[id].n_part = size_t(id) < nd - 1 ? ngl_per_device_high[id].n_layer : ngl_per_device_high[id].n_layer - 1;
|
|
550
|
+
}
|
|
551
|
+
if (ngl_per_device_high[id].n_layer > 0) {
|
|
552
|
+
std::vector<int64_t> mem_high = get_memory_for_layers(__func__, ngl_per_device_high, overflow_bufts);
|
|
553
|
+
if (mem_high[id] > targets[id]) {
|
|
554
|
+
assert(ngl_per_device_high[id].n_layer > ngl_per_device[id].n_layer);
|
|
555
|
+
uint32_t delta = ngl_per_device_high[id].n_layer - ngl_per_device[id].n_layer;
|
|
556
|
+
LLAMA_LOG_DEBUG("%s: start filling device %" PRIu32 ", delta=%" PRIu32 "\n", __func__, id, delta);
|
|
557
|
+
while (delta > 1) {
|
|
558
|
+
uint32_t step_size = int64_t(delta) * (targets[id] - mem[id]) / (mem_high[id] - mem[id]);
|
|
559
|
+
step_size = std::max(step_size, uint32_t(1));
|
|
560
|
+
step_size = std::min(step_size, delta - 1);
|
|
561
|
+
|
|
562
|
+
std::vector<ngl_t> ngl_per_device_test = ngl_per_device;
|
|
563
|
+
ngl_per_device_test[id].n_layer += step_size;
|
|
564
|
+
if (hp_nex) {
|
|
565
|
+
ngl_per_device_test[id].n_part += size_t(id) == nd - 1 && ngl_per_device_test[id].n_part == 0 ?
|
|
566
|
+
step_size - 1 : step_size; // the first layer is the output layer which must always be full
|
|
567
|
+
}
|
|
568
|
+
const std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts);
|
|
569
|
+
|
|
570
|
+
if (mem_test[id] <= targets[id]) {
|
|
571
|
+
ngl_per_device = ngl_per_device_test;
|
|
572
|
+
mem = mem_test;
|
|
573
|
+
LLAMA_LOG_DEBUG("%s: set ngl_per_device[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device[id].n_layer);
|
|
574
|
+
} else {
|
|
575
|
+
ngl_per_device_high = ngl_per_device_test;
|
|
576
|
+
mem_high = mem_test;
|
|
577
|
+
LLAMA_LOG_DEBUG("%s: set ngl_per_device_high[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device_high[id].n_layer);
|
|
578
|
+
}
|
|
579
|
+
delta = ngl_per_device_high[id].n_layer - ngl_per_device[id].n_layer;
|
|
580
|
+
}
|
|
581
|
+
} else {
|
|
582
|
+
assert(ngl_per_device_high[id].n_layer == n_unassigned);
|
|
583
|
+
ngl_per_device = ngl_per_device_high;
|
|
584
|
+
mem = mem_high;
|
|
585
|
+
LLAMA_LOG_DEBUG("%s: set ngl_per_device[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device[id].n_layer);
|
|
586
|
+
}
|
|
587
|
+
}
|
|
588
|
+
|
|
589
|
+
const int64_t projected_margin = dmds_full[id].free - mem[id];
|
|
590
|
+
LLAMA_LOG_INFO(
|
|
591
|
+
"%s: - %s: %2" PRIu32 " layers, %6" PRId64 " MiB used, %6" PRId64 " MiB free\n",
|
|
592
|
+
__func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, mem[id]/MiB, projected_margin/MiB);
|
|
593
|
+
}
|
|
594
|
+
if (hp_nex == 0 || global_surplus_cpu_moe <= 0) {
|
|
595
|
+
set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams);
|
|
596
|
+
return;
|
|
597
|
+
}
|
|
598
|
+
|
|
599
|
+
// step 4: for a MoE model where all dense tensors fit,
|
|
600
|
+
// convert the dense-only layers in the back to full layers in the front until all devices are full
|
|
601
|
+
// essentially the same procedure as for the dense-only layers except front-to-back
|
|
602
|
+
// also, try fitting at least part of one more layer to reduce waste for "small" GPUs with e.g. 24 GiB VRAM
|
|
603
|
+
|
|
604
|
+
size_t id_dense_start = nd;
|
|
605
|
+
for (int id = nd - 1; id >= 0; id--) {
|
|
606
|
+
if (ngl_per_device[id].n_layer > 0) {
|
|
607
|
+
id_dense_start = id;
|
|
608
|
+
continue;
|
|
609
|
+
}
|
|
610
|
+
break;
|
|
611
|
+
}
|
|
612
|
+
assert(id_dense_start < nd);
|
|
613
|
+
|
|
614
|
+
LLAMA_LOG_INFO("%s: converting dense-only layers to full layers and filling them front-to-back with overflow to next device/system memory:\n", __func__);
|
|
615
|
+
for (size_t id = 0; id <= id_dense_start && id_dense_start < nd; id++) {
|
|
616
|
+
std::vector<ngl_t> ngl_per_device_high = ngl_per_device;
|
|
617
|
+
for (size_t jd = id_dense_start; jd < nd; jd++) {
|
|
618
|
+
const uint32_t n_layer_move = jd < nd - 1 ? ngl_per_device_high[jd].n_layer : ngl_per_device_high[jd].n_layer - 1;
|
|
619
|
+
ngl_per_device_high[id].n_layer += n_layer_move;
|
|
620
|
+
ngl_per_device_high[jd].n_layer -= n_layer_move;
|
|
621
|
+
ngl_per_device_high[jd].n_part = 0;
|
|
622
|
+
}
|
|
623
|
+
size_t id_dense_start_high = nd - 1;
|
|
624
|
+
std::vector<int64_t> mem_high = get_memory_for_layers(__func__, ngl_per_device_high, overflow_bufts);
|
|
625
|
+
|
|
626
|
+
if (mem_high[id] > targets[id]) {
|
|
627
|
+
assert(ngl_per_device_high[id].n_full() >= ngl_per_device[id].n_full());
|
|
628
|
+
uint32_t delta = ngl_per_device_high[id].n_full() - ngl_per_device[id].n_full();
|
|
629
|
+
while (delta > 1) {
|
|
630
|
+
uint32_t step_size = int64_t(delta) * (targets[id] - mem[id]) / (mem_high[id] - mem[id]);
|
|
631
|
+
step_size = std::max(step_size, uint32_t(1));
|
|
632
|
+
step_size = std::min(step_size, delta - 1);
|
|
633
|
+
|
|
634
|
+
std::vector<ngl_t> ngl_per_device_test = ngl_per_device;
|
|
635
|
+
size_t id_dense_start_test = id_dense_start;
|
|
636
|
+
uint32_t n_converted_test = 0;
|
|
637
|
+
for (;id_dense_start_test < nd; id_dense_start_test++) {
|
|
638
|
+
const uint32_t n_convert_jd = std::min(step_size - n_converted_test, ngl_per_device_test[id_dense_start_test].n_part);
|
|
639
|
+
ngl_per_device_test[id_dense_start_test].n_layer -= n_convert_jd;
|
|
640
|
+
ngl_per_device_test[id_dense_start_test].n_part -= n_convert_jd;
|
|
641
|
+
ngl_per_device_test[id].n_layer += n_convert_jd;
|
|
642
|
+
n_converted_test += n_convert_jd;
|
|
643
|
+
|
|
644
|
+
if (ngl_per_device_test[id_dense_start_test].n_part > 0) {
|
|
645
|
+
break;
|
|
646
|
+
}
|
|
647
|
+
}
|
|
648
|
+
const std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts);
|
|
649
|
+
|
|
650
|
+
if (mem_test[id] <= targets[id]) {
|
|
651
|
+
ngl_per_device = ngl_per_device_test;
|
|
652
|
+
mem = mem_test;
|
|
653
|
+
id_dense_start = id_dense_start_test;
|
|
654
|
+
LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start=%zu\n",
|
|
655
|
+
__func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
|
|
656
|
+
} else {
|
|
657
|
+
ngl_per_device_high = ngl_per_device_test;
|
|
658
|
+
mem_high = mem_test;
|
|
659
|
+
id_dense_start_high = id_dense_start_test;
|
|
660
|
+
LLAMA_LOG_DEBUG("%s: set ngl_per_device_high[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start_high=%zu\n",
|
|
661
|
+
__func__, id, ngl_per_device_high[id].n_layer, ngl_per_device_high[id].n_part, id_dense_start_high);
|
|
662
|
+
}
|
|
663
|
+
assert(ngl_per_device_high[id].n_full() >= ngl_per_device[id].n_full());
|
|
664
|
+
delta = ngl_per_device_high[id].n_full() - ngl_per_device[id].n_full();
|
|
665
|
+
}
|
|
666
|
+
} else {
|
|
667
|
+
ngl_per_device = ngl_per_device_high;
|
|
668
|
+
mem = mem_high;
|
|
669
|
+
id_dense_start = id_dense_start_high;
|
|
670
|
+
LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start=%zu\n",
|
|
671
|
+
__func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
|
|
672
|
+
}
|
|
673
|
+
|
|
674
|
+
// try to fit at least part of one more layer
|
|
675
|
+
if (ngl_per_device[id_dense_start].n_layer > (id < nd - 1 ? 0 : 1)) {
|
|
676
|
+
std::vector<ngl_t> ngl_per_device_test = ngl_per_device;
|
|
677
|
+
size_t id_dense_start_test = id_dense_start;
|
|
678
|
+
ngl_per_device_test[id_dense_start_test].n_layer--;
|
|
679
|
+
ngl_per_device_test[id_dense_start_test].n_part--;
|
|
680
|
+
ngl_per_device_test[id].n_layer++;
|
|
681
|
+
ngl_per_device_test[id].n_part++;
|
|
682
|
+
if (ngl_per_device_test[id_dense_start_test].n_part == 0) {
|
|
683
|
+
id_dense_start_test++;
|
|
684
|
+
}
|
|
685
|
+
ngl_per_device_test[id].overflow_type = LAYER_FRACTION_UP;
|
|
686
|
+
std::vector<ggml_backend_buffer_type_t> overflow_bufts_test = overflow_bufts;
|
|
687
|
+
if (id < nd - 1) {
|
|
688
|
+
overflow_bufts_test[id] = ggml_backend_dev_buffer_type(devs[id + 1]);
|
|
689
|
+
}
|
|
690
|
+
LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_UP\n", __func__);
|
|
691
|
+
std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts_test);
|
|
692
|
+
if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) {
|
|
693
|
+
ngl_per_device = ngl_per_device_test;
|
|
694
|
+
overflow_bufts = overflow_bufts_test;
|
|
695
|
+
mem = mem_test;
|
|
696
|
+
id_dense_start = id_dense_start_test;
|
|
697
|
+
LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", UP), id_dense_start=%zu\n",
|
|
698
|
+
__func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
|
|
699
|
+
|
|
700
|
+
ngl_per_device_test[id].overflow_type = LAYER_FRACTION_GATE;
|
|
701
|
+
LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_GATE\n", __func__);
|
|
702
|
+
mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts_test);
|
|
703
|
+
if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) {
|
|
704
|
+
ngl_per_device = ngl_per_device_test;
|
|
705
|
+
overflow_bufts = overflow_bufts_test;
|
|
706
|
+
mem = mem_test;
|
|
707
|
+
id_dense_start = id_dense_start_test;
|
|
708
|
+
LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", GATE), id_dense_start=%zu\n",
|
|
709
|
+
__func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
|
|
710
|
+
}
|
|
711
|
+
} else {
|
|
712
|
+
ngl_per_device_test[id].overflow_type = LAYER_FRACTION_ATTN;
|
|
713
|
+
LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_ATTN\n", __func__);
|
|
714
|
+
mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts_test);
|
|
715
|
+
if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) {
|
|
716
|
+
ngl_per_device = ngl_per_device_test;
|
|
717
|
+
overflow_bufts = overflow_bufts_test;
|
|
718
|
+
mem = mem_test;
|
|
719
|
+
id_dense_start = id_dense_start_test;
|
|
720
|
+
LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", ATTN), id_dense_start=%zu\n",
|
|
721
|
+
__func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
|
|
722
|
+
}
|
|
723
|
+
}
|
|
724
|
+
}
|
|
725
|
+
|
|
726
|
+
const int64_t projected_margin = dmds_full[id].free - mem[id];
|
|
727
|
+
LLAMA_LOG_INFO(
|
|
728
|
+
"%s: - %s: %2" PRIu32 " layers (%2" PRIu32 " overflowing), %6" PRId64 " MiB used, %6" PRId64 " MiB free\n",
|
|
729
|
+
__func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, ngl_per_device[id].n_part, mem[id]/MiB, projected_margin/MiB);
|
|
730
|
+
}
|
|
731
|
+
|
|
732
|
+
// print info for devices that were not changed during the conversion from dense only to full layers:
|
|
733
|
+
for (size_t id = id_dense_start + 1; id < nd; id++) {
|
|
734
|
+
const int64_t projected_margin = dmds_full[id].free - mem[id];
|
|
735
|
+
LLAMA_LOG_INFO(
|
|
736
|
+
"%s: - %s: %2" PRIu32 " layers (%2" PRIu32 " overflowing), %6" PRId64 " MiB used, %6" PRId64 " MiB free\n",
|
|
737
|
+
__func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, ngl_per_device[id].n_part, mem[id]/MiB, projected_margin/MiB);
|
|
738
|
+
}
|
|
739
|
+
|
|
740
|
+
set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams);
|
|
741
|
+
}
|
|
742
|
+
|
|
743
|
+
enum llama_params_fit_status llama_params_fit(
|
|
744
|
+
const char * path_model, struct llama_model_params * mparams, struct llama_context_params * cparams,
|
|
745
|
+
float * tensor_split, struct llama_model_tensor_buft_override * tensor_buft_overrides,
|
|
746
|
+
size_t * margins, uint32_t n_ctx_min, enum ggml_log_level log_level) {
|
|
747
|
+
const int64_t t0_us = llama_time_us();
|
|
748
|
+
llama_params_fit_status status = LLAMA_PARAMS_FIT_STATUS_SUCCESS;
|
|
749
|
+
try {
|
|
750
|
+
llama_params_fit_impl(path_model, mparams, cparams, tensor_split, tensor_buft_overrides, margins, n_ctx_min, log_level);
|
|
751
|
+
LLAMA_LOG_INFO("%s: successfully fit params to free device memory\n", __func__);
|
|
752
|
+
} catch (const llama_params_fit_exception & e) {
|
|
753
|
+
LLAMA_LOG_WARN("%s: failed to fit params to free device memory: %s\n", __func__, e.what());
|
|
754
|
+
status = LLAMA_PARAMS_FIT_STATUS_FAILURE;
|
|
755
|
+
} catch (const std::runtime_error & e) {
|
|
756
|
+
LLAMA_LOG_ERROR("%s: encountered an error while trying to fit params to free device memory: %s\n", __func__, e.what());
|
|
757
|
+
status = LLAMA_PARAMS_FIT_STATUS_ERROR;
|
|
758
|
+
}
|
|
759
|
+
const int64_t t1_us = llama_time_us();
|
|
760
|
+
LLAMA_LOG_INFO("%s: fitting params to free memory took %.2f seconds\n", __func__, (t1_us - t0_us) * 1e-6);
|
|
761
|
+
return status;
|
|
762
|
+
}
|
|
763
|
+
|
|
40
764
|
struct llama_sampler_chain_params llama_sampler_chain_default_params() {
|
|
41
765
|
struct llama_sampler_chain_params result = {
|
|
42
|
-
/*.no_perf
|
|
766
|
+
/*.no_perf =*/ true,
|
|
43
767
|
};
|
|
44
768
|
|
|
45
769
|
return result;
|
|
@@ -49,6 +773,10 @@ size_t llama_max_devices(void) {
|
|
|
49
773
|
return 16;
|
|
50
774
|
}
|
|
51
775
|
|
|
776
|
+
// Returns the fixed limit of 4096.
// NOTE(review): judging by the name this is the maximum number of tensor buffer type
// override entries a caller may supply - confirm against the header documentation.
size_t llama_max_tensor_buft_overrides() {
    return 4096;
}
|
|
779
|
+
|
|
52
780
|
bool llama_supports_mmap(void) {
|
|
53
781
|
return llama_mmap::SUPPORTED;
|
|
54
782
|
}
|
|
@@ -99,7 +827,8 @@ int64_t llama_time_us(void) {
|
|
|
99
827
|
}
|
|
100
828
|
|
|
101
829
|
// Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
|
|
102
|
-
static int llama_model_load(
|
|
830
|
+
static int llama_model_load(struct gguf_context * metadata, llama_model_set_tensor_data_t set_tensor_data, void * set_tensor_data_ud,
|
|
831
|
+
const std::string & fname, std::vector<std::string> & splits, llama_model & model, llama_model_params & params) {
|
|
103
832
|
// loading time will be recalculated after the first eval, so
|
|
104
833
|
// we take page faults deferred by mmap() into consideration
|
|
105
834
|
model.t_load_us = 0;
|
|
@@ -108,11 +837,13 @@ static int llama_model_load(const std::string & fname, std::vector<std::string>
|
|
|
108
837
|
model.t_start_us = tm.t_start_us;
|
|
109
838
|
|
|
110
839
|
try {
|
|
111
|
-
llama_model_loader ml(fname, splits, params.use_mmap, params.
|
|
840
|
+
llama_model_loader ml(metadata, set_tensor_data, set_tensor_data_ud, fname, splits, params.use_mmap, params.use_direct_io,
|
|
841
|
+
params.check_tensors, params.no_alloc, params.kv_overrides, params.tensor_buft_overrides);
|
|
112
842
|
|
|
113
843
|
ml.print_info();
|
|
114
844
|
|
|
115
845
|
model.hparams.vocab_only = params.vocab_only;
|
|
846
|
+
model.hparams.no_alloc = params.no_alloc;
|
|
116
847
|
|
|
117
848
|
try {
|
|
118
849
|
model.load_arch(ml);
|
|
@@ -124,6 +855,9 @@ static int llama_model_load(const std::string & fname, std::vector<std::string>
|
|
|
124
855
|
} catch(const std::exception & e) {
|
|
125
856
|
throw std::runtime_error("error loading model hyperparameters: " + std::string(e.what()));
|
|
126
857
|
}
|
|
858
|
+
if (model.arch == LLM_ARCH_CLIP) {
|
|
859
|
+
throw std::runtime_error("CLIP cannot be used as main model, use it with --mmproj instead");
|
|
860
|
+
}
|
|
127
861
|
try {
|
|
128
862
|
model.load_vocab(ml);
|
|
129
863
|
} catch(const std::exception & e) {
|
|
@@ -150,9 +884,13 @@ static int llama_model_load(const std::string & fname, std::vector<std::string>
|
|
|
150
884
|
}
|
|
151
885
|
|
|
152
886
|
static struct llama_model * llama_model_load_from_file_impl(
|
|
887
|
+
struct gguf_context * metadata,
|
|
888
|
+
llama_model_set_tensor_data_t set_tensor_data,
|
|
889
|
+
void * set_tensor_data_ud,
|
|
153
890
|
const std::string & path_model,
|
|
154
891
|
std::vector<std::string> & splits,
|
|
155
892
|
struct llama_model_params params) {
|
|
893
|
+
GGML_ASSERT((metadata == nullptr) != path_model.empty() && "exactly one out of metadata and path_model needs to be defined");
|
|
156
894
|
ggml_time_init();
|
|
157
895
|
|
|
158
896
|
if (!params.vocab_only && ggml_backend_reg_count() == 0) {
|
|
@@ -273,7 +1011,7 @@ static struct llama_model * llama_model_load_from_file_impl(
|
|
|
273
1011
|
props.memory_free/1024/1024);
|
|
274
1012
|
}
|
|
275
1013
|
|
|
276
|
-
const int status = llama_model_load(path_model, splits, *model, params);
|
|
1014
|
+
const int status = llama_model_load(metadata, set_tensor_data, set_tensor_data_ud, path_model, splits, *model, params);
|
|
277
1015
|
GGML_ASSERT(status <= 0);
|
|
278
1016
|
if (status < 0) {
|
|
279
1017
|
if (status == -1) {
|
|
@@ -289,6 +1027,18 @@ static struct llama_model * llama_model_load_from_file_impl(
|
|
|
289
1027
|
return model;
|
|
290
1028
|
}
|
|
291
1029
|
|
|
1030
|
+
struct llama_model * llama_model_init_from_user(
|
|
1031
|
+
struct gguf_context * metadata,
|
|
1032
|
+
llama_model_set_tensor_data_t set_tensor_data,
|
|
1033
|
+
void * set_tensor_data_ud,
|
|
1034
|
+
struct llama_model_params params) {
|
|
1035
|
+
GGML_ASSERT(metadata != nullptr);
|
|
1036
|
+
std::string path_model;
|
|
1037
|
+
std::vector<std::string> splits = {};
|
|
1038
|
+
params.use_mmap = false;
|
|
1039
|
+
params.use_extra_bufts = false;
|
|
1040
|
+
return llama_model_load_from_file_impl(metadata, set_tensor_data, set_tensor_data_ud, path_model, splits, params);
|
|
1041
|
+
}
|
|
292
1042
|
// deprecated
|
|
293
1043
|
struct llama_model * llama_load_model_from_file(
|
|
294
1044
|
const char * path_model,
|
|
@@ -300,7 +1050,7 @@ struct llama_model * llama_model_load_from_file(
|
|
|
300
1050
|
const char * path_model,
|
|
301
1051
|
struct llama_model_params params) {
|
|
302
1052
|
std::vector<std::string> splits = {};
|
|
303
|
-
return llama_model_load_from_file_impl(path_model, splits, params);
|
|
1053
|
+
return llama_model_load_from_file_impl(nullptr, nullptr, nullptr, path_model, splits, params);
|
|
304
1054
|
}
|
|
305
1055
|
|
|
306
1056
|
struct llama_model * llama_model_load_from_splits(
|
|
@@ -312,14 +1062,15 @@ struct llama_model * llama_model_load_from_splits(
|
|
|
312
1062
|
LLAMA_LOG_ERROR("%s: list of splits is empty\n", __func__);
|
|
313
1063
|
return nullptr;
|
|
314
1064
|
}
|
|
1065
|
+
splits.reserve(n_paths);
|
|
315
1066
|
for (size_t i = 0; i < n_paths; ++i) {
|
|
316
1067
|
splits.push_back(paths[i]);
|
|
317
1068
|
}
|
|
318
|
-
return llama_model_load_from_file_impl(splits.front(), splits, params);
|
|
1069
|
+
return llama_model_load_from_file_impl(nullptr, nullptr, nullptr, splits.front(), splits, params);
|
|
319
1070
|
}
|
|
320
1071
|
|
|
321
1072
|
void llama_model_save_to_file(const struct llama_model * model, const char * path_model) {
|
|
322
|
-
llama_model_saver ms(
|
|
1073
|
+
llama_model_saver ms(model);
|
|
323
1074
|
ms.add_kv_from_model();
|
|
324
1075
|
ms.add_tensors_from_model();
|
|
325
1076
|
ms.save(path_model);
|
|
@@ -364,25 +1115,55 @@ int32_t llama_chat_apply_template(
|
|
|
364
1115
|
// model split
|
|
365
1116
|
//
|
|
366
1117
|
|
|
367
|
-
|
|
1118
|
+
// Builds the file name of one shard of a split GGUF model.
//
// Formats "<path_prefix>-<split_no + 1>-of-<split_count>.gguf" into split_path, with both
// numbers zero-padded to at least 5 digits. split_no is zero-based, the number in the file
// name is one-based.
//
//   split_path  - output buffer
//   maxlen      - capacity of split_path in bytes, including the terminating NUL
//   path_prefix - NUL-terminated prefix placed in front of the shard suffix
//
// Returns the number of characters written (excluding the terminating NUL), or 0 if
// formatting failed or the full path did not fit into maxlen bytes. In the latter case
// split_path may contain a truncated path and should not be used.
int32_t llama_split_path(
        char * split_path,
        size_t maxlen,
        const char * path_prefix,
        int32_t split_no,
        int32_t split_count) {

    static const char * const SPLIT_PATH_FORMAT = "%s-%05d-of-%05d.gguf";

    const int written = snprintf(
        split_path,
        maxlen,
        SPLIT_PATH_FORMAT,
        path_prefix,
        split_no + 1,
        split_count
    );

    // snprintf returns a negative value on error, otherwise the length the complete output
    // would have had; a value >= maxlen therefore means the output was truncated
    if (written < 0 || static_cast<size_t>(written) >= maxlen) {
        return 0;
    }

    return static_cast<int32_t>(written);
}
|
|
374
1142
|
|
|
375
|
-
|
|
376
|
-
|
|
1143
|
+
int32_t llama_split_prefix(
|
|
1144
|
+
char * split_prefix,
|
|
1145
|
+
size_t maxlen,
|
|
1146
|
+
const char * split_path,
|
|
1147
|
+
int32_t split_no,
|
|
1148
|
+
int32_t split_count) {
|
|
1149
|
+
|
|
1150
|
+
const std::string str_split_path(split_path);
|
|
1151
|
+
|
|
377
1152
|
char postfix[32];
|
|
378
|
-
snprintf(postfix,
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
|
|
1153
|
+
snprintf(postfix, sizeof(postfix), "-%05d-of-%05d.gguf", split_no + 1, split_count);
|
|
1154
|
+
|
|
1155
|
+
const std::string str_postfix(postfix);
|
|
1156
|
+
if (str_split_path.size() <= str_postfix.size()) {
|
|
1157
|
+
return 0;
|
|
1158
|
+
}
|
|
1159
|
+
|
|
1160
|
+
const size_t size_prefix = str_split_path.size() - str_postfix.size();
|
|
1161
|
+
|
|
1162
|
+
if (str_split_path.compare(size_prefix, std::string::npos, str_postfix) == 0) {
|
|
1163
|
+
const size_t copy_len = std::min(size_prefix + 1, maxlen);
|
|
1164
|
+
snprintf(split_prefix, copy_len, "%s", split_path);
|
|
1165
|
+
|
|
1166
|
+
return (int32_t) size_prefix;
|
|
386
1167
|
}
|
|
387
1168
|
|
|
388
1169
|
return 0;
|