whispercpp 1.3.5 → 1.3.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/LICENSE +1 -1
- data/README.md +99 -2
- data/ext/extconf.rb +1 -0
- data/ext/ruby_whisper.c +20 -4
- data/ext/ruby_whisper.h +30 -2
- data/ext/ruby_whisper_context.c +216 -124
- data/ext/ruby_whisper_context_params.c +163 -0
- data/ext/ruby_whisper_model.c +0 -1
- data/ext/ruby_whisper_params.c +0 -1
- data/ext/ruby_whisper_segment.c +0 -1
- data/ext/ruby_whisper_token.c +29 -9
- data/ext/ruby_whisper_transcribe.cpp +4 -1
- data/ext/ruby_whisper_vad_context.c +48 -1
- data/ext/ruby_whisper_vad_context_detect.cpp +6 -5
- data/ext/ruby_whisper_vad_params.c +0 -1
- data/ext/ruby_whisper_vad_segment.c +0 -1
- data/ext/ruby_whisper_vad_segments.c +0 -1
- data/ext/sources/CMakeLists.txt +1 -1
- data/ext/sources/bindings/javascript/package.json +1 -1
- data/ext/sources/cmake/whisper-config.cmake.in +5 -40
- data/ext/sources/examples/bench/bench.cpp +23 -18
- data/ext/sources/examples/cli/cli.cpp +8 -0
- data/ext/sources/examples/common-ggml.cpp +2 -0
- data/ext/sources/examples/miniaudio.h +4507 -2131
- data/ext/sources/examples/server/server.cpp +18 -4
- data/ext/sources/examples/talk-llama/CMakeLists.txt +3 -2
- data/ext/sources/examples/talk-llama/llama-adapter.cpp +7 -13
- data/ext/sources/examples/talk-llama/llama-adapter.h +4 -3
- data/ext/sources/examples/talk-llama/llama-arch.cpp +335 -17
- data/ext/sources/examples/talk-llama/llama-arch.h +42 -0
- data/ext/sources/examples/talk-llama/llama-batch.cpp +3 -1
- data/ext/sources/examples/talk-llama/llama-chat.cpp +21 -1
- data/ext/sources/examples/talk-llama/llama-chat.h +1 -0
- data/ext/sources/examples/talk-llama/llama-context.cpp +508 -520
- data/ext/sources/examples/talk-llama/llama-context.h +27 -28
- data/ext/sources/examples/talk-llama/llama-cparams.h +5 -0
- data/ext/sources/examples/talk-llama/llama-ext.h +12 -0
- data/ext/sources/examples/talk-llama/llama-grammar.cpp +8 -8
- data/ext/sources/examples/talk-llama/llama-graph.cpp +583 -130
- data/ext/sources/examples/talk-llama/llama-graph.h +131 -10
- data/ext/sources/examples/talk-llama/llama-hparams.cpp +57 -40
- data/ext/sources/examples/talk-llama/llama-hparams.h +79 -10
- data/ext/sources/examples/talk-llama/llama-impl.cpp +4 -4
- data/ext/sources/examples/talk-llama/llama-impl.h +13 -1
- data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.cpp +3 -1
- data/ext/sources/examples/talk-llama/llama-kv-cache.cpp +274 -89
- data/ext/sources/examples/talk-llama/llama-kv-cache.h +2 -3
- data/ext/sources/examples/talk-llama/llama-memory-hybrid-iswa.cpp +275 -0
- data/ext/sources/examples/talk-llama/llama-memory-hybrid-iswa.h +140 -0
- data/ext/sources/examples/talk-llama/llama-memory-recurrent.cpp +11 -13
- data/ext/sources/examples/talk-llama/llama-mmap.cpp +28 -11
- data/ext/sources/examples/talk-llama/llama-model-loader.cpp +527 -119
- data/ext/sources/examples/talk-llama/llama-model-loader.h +35 -5
- data/ext/sources/examples/talk-llama/llama-model-saver.cpp +60 -46
- data/ext/sources/examples/talk-llama/llama-model-saver.h +5 -2
- data/ext/sources/examples/talk-llama/llama-model.cpp +1365 -647
- data/ext/sources/examples/talk-llama/llama-model.h +72 -19
- data/ext/sources/examples/talk-llama/llama-quant.cpp +578 -346
- data/ext/sources/examples/talk-llama/{llama-sampling.cpp → llama-sampler.cpp} +190 -76
- data/ext/sources/examples/talk-llama/{llama-sampling.h → llama-sampler.h} +0 -2
- data/ext/sources/examples/talk-llama/llama-vocab.cpp +118 -48
- data/ext/sources/examples/talk-llama/llama-vocab.h +5 -0
- data/ext/sources/examples/talk-llama/llama.cpp +76 -22
- data/ext/sources/examples/talk-llama/llama.h +63 -30
- data/ext/sources/examples/talk-llama/models/afmoe.cpp +2 -3
- data/ext/sources/examples/talk-llama/models/apertus.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/arcee.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/arctic.cpp +4 -5
- data/ext/sources/examples/talk-llama/models/baichuan.cpp +4 -3
- data/ext/sources/examples/talk-llama/models/bailingmoe.cpp +1 -2
- data/ext/sources/examples/talk-llama/models/bailingmoe2.cpp +3 -5
- data/ext/sources/examples/talk-llama/models/bert.cpp +13 -7
- data/ext/sources/examples/talk-llama/models/bitnet.cpp +9 -24
- data/ext/sources/examples/talk-llama/models/bloom.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/chameleon.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/chatglm.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/codeshell.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/cogvlm.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/cohere2-iswa.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/command-r.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/dbrx.cpp +4 -5
- data/ext/sources/examples/talk-llama/models/deci.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/deepseek.cpp +4 -6
- data/ext/sources/examples/talk-llama/models/deepseek2.cpp +24 -21
- data/ext/sources/examples/talk-llama/models/delta-net-base.cpp +445 -0
- data/ext/sources/examples/talk-llama/models/dots1.cpp +4 -6
- data/ext/sources/examples/talk-llama/models/dream.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/ernie4-5-moe.cpp +4 -6
- data/ext/sources/examples/talk-llama/models/ernie4-5.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/eurobert.cpp +97 -0
- data/ext/sources/examples/talk-llama/models/exaone-moe.cpp +145 -0
- data/ext/sources/examples/talk-llama/models/exaone.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/exaone4.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/falcon-h1.cpp +2 -4
- data/ext/sources/examples/talk-llama/models/falcon.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/gemma-embedding.cpp +1 -1
- data/ext/sources/examples/talk-llama/models/gemma.cpp +1 -1
- data/ext/sources/examples/talk-llama/models/gemma2-iswa.cpp +1 -1
- data/ext/sources/examples/talk-llama/models/gemma3.cpp +1 -1
- data/ext/sources/examples/talk-llama/models/gemma3n-iswa.cpp +7 -7
- data/ext/sources/examples/talk-llama/models/glm4-moe.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/glm4.cpp +14 -7
- data/ext/sources/examples/talk-llama/models/gpt2.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/gptneox.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/granite-hybrid.cpp +4 -5
- data/ext/sources/examples/talk-llama/models/granite.cpp +4 -5
- data/ext/sources/examples/talk-llama/models/grok.cpp +4 -4
- data/ext/sources/examples/talk-llama/models/grovemoe.cpp +5 -7
- data/ext/sources/examples/talk-llama/models/hunyuan-dense.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/hunyuan-moe.cpp +4 -5
- data/ext/sources/examples/talk-llama/models/internlm2.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/jais.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/jais2.cpp +123 -0
- data/ext/sources/examples/talk-llama/models/jamba.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/kimi-linear.cpp +381 -0
- data/ext/sources/examples/talk-llama/models/lfm2.cpp +145 -124
- data/ext/sources/examples/talk-llama/models/llada-moe.cpp +4 -4
- data/ext/sources/examples/talk-llama/models/llada.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/llama-iswa.cpp +4 -4
- data/ext/sources/examples/talk-llama/models/llama.cpp +18 -11
- data/ext/sources/examples/talk-llama/models/maincoder.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/{graph-context-mamba.cpp → mamba-base.cpp} +9 -3
- data/ext/sources/examples/talk-llama/models/mamba.cpp +1 -2
- data/ext/sources/examples/talk-llama/models/mimo2-iswa.cpp +11 -5
- data/ext/sources/examples/talk-llama/models/minicpm3.cpp +14 -13
- data/ext/sources/examples/talk-llama/models/minimax-m2.cpp +4 -5
- data/ext/sources/examples/talk-llama/models/mistral3.cpp +4 -4
- data/ext/sources/examples/talk-llama/models/models.h +181 -46
- data/ext/sources/examples/talk-llama/models/modern-bert.cpp +2 -9
- data/ext/sources/examples/talk-llama/models/mpt.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/nemotron-h.cpp +26 -14
- data/ext/sources/examples/talk-llama/models/nemotron.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/neo-bert.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/olmo.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/olmo2.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/olmoe.cpp +4 -4
- data/ext/sources/examples/talk-llama/models/openai-moe-iswa.cpp +1 -1
- data/ext/sources/examples/talk-llama/models/openelm.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/orion.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/paddleocr.cpp +122 -0
- data/ext/sources/examples/talk-llama/models/pangu-embedded.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/phi2.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/phi3.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/plamo.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/plamo2.cpp +9 -5
- data/ext/sources/examples/talk-llama/models/plamo3.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/plm.cpp +15 -14
- data/ext/sources/examples/talk-llama/models/qwen.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/qwen2.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/qwen2moe.cpp +4 -4
- data/ext/sources/examples/talk-llama/models/qwen2vl.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/qwen3.cpp +12 -9
- data/ext/sources/examples/talk-llama/models/qwen35.cpp +381 -0
- data/ext/sources/examples/talk-llama/models/qwen35moe.cpp +422 -0
- data/ext/sources/examples/talk-llama/models/qwen3moe.cpp +15 -8
- data/ext/sources/examples/talk-llama/models/qwen3next.cpp +84 -432
- data/ext/sources/examples/talk-llama/models/qwen3vl-moe.cpp +9 -18
- data/ext/sources/examples/talk-llama/models/qwen3vl.cpp +8 -17
- data/ext/sources/examples/talk-llama/models/refact.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/rnd1.cpp +4 -4
- data/ext/sources/examples/talk-llama/models/rwkv6-base.cpp +2 -0
- data/ext/sources/examples/talk-llama/models/rwkv7-base.cpp +2 -0
- data/ext/sources/examples/talk-llama/models/seed-oss.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/smallthinker.cpp +4 -4
- data/ext/sources/examples/talk-llama/models/smollm3.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/stablelm.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/starcoder.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/starcoder2.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/step35-iswa.cpp +165 -0
- data/ext/sources/examples/talk-llama/models/t5-dec.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/t5-enc.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/xverse.cpp +3 -3
- data/ext/sources/examples/talk-llama/unicode.cpp +21 -65
- data/ext/sources/ggml/CMakeLists.txt +9 -3
- data/ext/sources/ggml/include/ggml-backend.h +1 -1
- data/ext/sources/ggml/include/ggml-cann.h +1 -1
- data/ext/sources/ggml/include/ggml-cpu.h +5 -0
- data/ext/sources/ggml/include/ggml-openvino.h +37 -0
- data/ext/sources/ggml/include/ggml-opt.h +1 -1
- data/ext/sources/ggml/include/ggml-rpc.h +6 -1
- data/ext/sources/ggml/include/ggml-virtgpu.h +14 -0
- data/ext/sources/ggml/include/ggml.h +56 -9
- data/ext/sources/ggml/src/CMakeLists.txt +3 -0
- data/ext/sources/ggml/src/ggml-alloc.c +4 -9
- data/ext/sources/ggml/src/ggml-backend-dl.cpp +48 -0
- data/ext/sources/ggml/src/ggml-backend-dl.h +45 -0
- data/ext/sources/ggml/src/ggml-backend-reg.cpp +28 -86
- data/ext/sources/ggml/src/ggml-backend.cpp +5 -2
- data/ext/sources/ggml/src/ggml-blas/CMakeLists.txt +1 -1
- data/ext/sources/ggml/src/ggml-blas/ggml-blas.cpp +6 -2
- data/ext/sources/ggml/src/ggml-cann/acl_tensor.cpp +1 -1
- data/ext/sources/ggml/src/ggml-cann/acl_tensor.h +1 -1
- data/ext/sources/ggml/src/ggml-cann/aclnn_ops.cpp +348 -189
- data/ext/sources/ggml/src/ggml-cann/aclnn_ops.h +40 -85
- data/ext/sources/ggml/src/ggml-cann/common.h +3 -4
- data/ext/sources/ggml/src/ggml-cann/ggml-cann.cpp +44 -62
- data/ext/sources/ggml/src/ggml-common.h +11 -0
- data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +16 -11
- data/ext/sources/ggml/src/ggml-cpu/amx/amx.cpp +42 -19
- data/ext/sources/ggml/src/ggml-cpu/amx/common.h +34 -10
- data/ext/sources/ggml/src/ggml-cpu/amx/mmq.cpp +85 -85
- data/ext/sources/ggml/src/ggml-cpu/arch/arm/quants.c +85 -1
- data/ext/sources/ggml/src/ggml-cpu/arch/arm/repack.cpp +2744 -548
- data/ext/sources/ggml/src/ggml-cpu/arch/riscv/quants.c +1653 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/riscv/repack.cpp +1391 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/s390/quants.c +8 -10
- data/ext/sources/ggml/src/ggml-cpu/arch/x86/quants.c +9 -9
- data/ext/sources/ggml/src/ggml-cpu/arch/x86/repack.cpp +118 -18
- data/ext/sources/ggml/src/ggml-cpu/arch-fallback.h +107 -26
- data/ext/sources/ggml/src/ggml-cpu/binary-ops.cpp +2 -6
- data/ext/sources/ggml/src/ggml-cpu/common.h +8 -0
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-impl.h +3 -0
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +59 -12
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.cpp +15 -0
- data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.cpp +21 -20
- data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +965 -252
- data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +584 -197
- data/ext/sources/ggml/src/ggml-cpu/ops.cpp +903 -188
- data/ext/sources/ggml/src/ggml-cpu/ops.h +1 -0
- data/ext/sources/ggml/src/ggml-cpu/quants.c +40 -0
- data/ext/sources/ggml/src/ggml-cpu/quants.h +3 -0
- data/ext/sources/ggml/src/ggml-cpu/repack.cpp +2890 -679
- data/ext/sources/ggml/src/ggml-cpu/repack.h +119 -8
- data/ext/sources/ggml/src/ggml-cpu/simd-gemm.h +136 -0
- data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +111 -3
- data/ext/sources/ggml/src/ggml-cpu/unary-ops.cpp +1 -1
- data/ext/sources/ggml/src/ggml-cpu/vec.cpp +17 -0
- data/ext/sources/ggml/src/ggml-cuda/CMakeLists.txt +1 -1
- data/ext/sources/ggml/src/ggml-cuda/argsort.cu +19 -10
- data/ext/sources/ggml/src/ggml-cuda/binbcast.cu +32 -30
- data/ext/sources/ggml/src/ggml-cuda/common.cuh +134 -18
- data/ext/sources/ggml/src/ggml-cuda/convert.cu +41 -27
- data/ext/sources/ggml/src/ggml-cuda/cpy.cu +6 -3
- data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +78 -64
- data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +384 -143
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cuh +36 -22
- data/ext/sources/ggml/src/ggml-cuda/fattn-vec.cuh +3 -3
- data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +26 -5
- data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cuh +1 -1
- data/ext/sources/ggml/src/ggml-cuda/fattn.cu +127 -12
- data/ext/sources/ggml/src/ggml-cuda/gated_delta_net.cu +263 -0
- data/ext/sources/ggml/src/ggml-cuda/gated_delta_net.cuh +4 -0
- data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +595 -200
- data/ext/sources/ggml/src/ggml-cuda/mean.cu +9 -8
- data/ext/sources/ggml/src/ggml-cuda/mma.cuh +173 -6
- data/ext/sources/ggml/src/ggml-cuda/mmf.cu +30 -10
- data/ext/sources/ggml/src/ggml-cuda/mmf.cuh +158 -85
- data/ext/sources/ggml/src/ggml-cuda/mmq.cuh +34 -22
- data/ext/sources/ggml/src/ggml-cuda/mmvf.cu +127 -67
- data/ext/sources/ggml/src/ggml-cuda/mmvf.cuh +2 -0
- data/ext/sources/ggml/src/ggml-cuda/mmvq.cu +157 -65
- data/ext/sources/ggml/src/ggml-cuda/mmvq.cuh +1 -0
- data/ext/sources/ggml/src/ggml-cuda/norm.cu +18 -76
- data/ext/sources/ggml/src/ggml-cuda/pad.cu +13 -10
- data/ext/sources/ggml/src/ggml-cuda/quantize.cu +1 -1
- data/ext/sources/ggml/src/ggml-cuda/reduce_rows.cuh +2 -16
- data/ext/sources/ggml/src/ggml-cuda/rope.cu +233 -133
- data/ext/sources/ggml/src/ggml-cuda/softmax.cu +8 -83
- data/ext/sources/ggml/src/ggml-cuda/solve_tri.cu +1 -1
- data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cu +56 -32
- data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cuh +1 -1
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_32.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_4.cu +1 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_32.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_4.cu +1 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_4.cu +1 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_4.cu +1 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +3 -3
- data/ext/sources/ggml/src/ggml-cuda/top-k.cu +0 -1
- data/ext/sources/ggml/src/ggml-cuda/topk-moe.cu +199 -135
- data/ext/sources/ggml/src/ggml-cuda/topk-moe.cuh +20 -14
- data/ext/sources/ggml/src/ggml-cuda/unary.cu +55 -0
- data/ext/sources/ggml/src/ggml-cuda/unary.cuh +2 -0
- data/ext/sources/ggml/src/ggml-cuda/vecdotq.cuh +31 -17
- data/ext/sources/ggml/src/ggml-cuda/vendors/hip.h +10 -0
- data/ext/sources/ggml/src/ggml-hexagon/CMakeLists.txt +82 -45
- data/ext/sources/ggml/src/ggml-hexagon/ggml-hexagon.cpp +334 -160
- data/ext/sources/ggml/src/ggml-hexagon/htp/CMakeLists.txt +7 -5
- data/ext/sources/ggml/src/ggml-hexagon/htp/act-ops.c +328 -197
- data/ext/sources/ggml/src/ggml-hexagon/htp/argsort-ops.c +281 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/binary-ops.c +765 -234
- data/ext/sources/ggml/src/ggml-hexagon/htp/cpy-ops.c +252 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/flash-attn-ops.c +412 -265
- data/ext/sources/ggml/src/ggml-hexagon/htp/get-rows-ops.c +23 -23
- data/ext/sources/ggml/src/ggml-hexagon/htp/{htp-dma.c → hex-dma.c} +1 -1
- data/ext/sources/ggml/src/ggml-hexagon/htp/{htp-dma.h → hex-dma.h} +28 -3
- data/ext/sources/ggml/src/ggml-hexagon/htp/hex-dump.h +77 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hex-fastdiv.h +37 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hex-utils.h +51 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ctx.h +1 -1
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp-msg.h +27 -37
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ops.h +6 -35
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-arith.h +443 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-base.h +240 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-copy.h +245 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-div.h +251 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-dump.h +129 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-exp.h +215 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-floor.h +100 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-inverse.h +210 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-reduce.h +296 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-scale.h +133 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sigmoid.h +141 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sqrt.h +126 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-types.h +36 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-utils.h +20 -1347
- data/ext/sources/ggml/src/ggml-hexagon/htp/main.c +211 -13
- data/ext/sources/ggml/src/ggml-hexagon/htp/matmul-ops.c +1119 -952
- data/ext/sources/ggml/src/ggml-hexagon/htp/rope-ops.c +254 -244
- data/ext/sources/ggml/src/ggml-hexagon/htp/set-rows-ops.c +36 -36
- data/ext/sources/ggml/src/ggml-hexagon/htp/softmax-ops.c +155 -138
- data/ext/sources/ggml/src/ggml-hexagon/htp/ssm-conv.c +339 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/sum-rows-ops.c +128 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/unary-ops.c +209 -114
- data/ext/sources/ggml/src/ggml-hexagon/htp/worker-pool.c +1 -5
- data/ext/sources/ggml/src/ggml-hexagon/htp-drv.cpp +418 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp-drv.h +121 -0
- data/ext/sources/ggml/src/ggml-hexagon/libdl.h +79 -0
- data/ext/sources/ggml/src/ggml-hexagon/libggml-htp.inf +38 -0
- data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +6 -0
- data/ext/sources/ggml/src/ggml-impl.h +62 -0
- data/ext/sources/ggml/src/ggml-metal/CMakeLists.txt +10 -10
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-common.cpp +13 -2
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.h +8 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.m +147 -17
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.cpp +274 -73
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.h +22 -4
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.m +102 -36
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +174 -23
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.cpp +580 -280
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.h +5 -4
- data/ext/sources/ggml/src/ggml-metal/ggml-metal.cpp +320 -107
- data/ext/sources/ggml/src/ggml-metal/ggml-metal.metal +1068 -825
- data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +19 -1
- data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +3108 -636
- data/ext/sources/ggml/src/ggml-opencl/kernels/concat.cl +41 -99
- data/ext/sources/ggml/src/ggml-opencl/kernels/cpy.cl +45 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/cumsum.cl +139 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/cvt.cl +204 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/diag.cl +27 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/exp.cl +125 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/expm1.cl +87 -56
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q4_1_f32.cl +132 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_general_q8_0_f32.cl +195 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q4_1_f32.cl +283 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/l2_norm.cl +71 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mean.cl +114 -13
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q4_0_f32_l4_lm.cl +163 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q4_1_f32_l4_lm.cl +165 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q6_k_f32_l4_lm.cl +158 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_8x4.cl +129 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32.cl +219 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32_flat.cl +229 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_k_f32.cl +180 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/{mul_mv_q6_k.cl → mul_mv_q6_k_f32.cl} +4 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32_flat.cl +194 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/neg.cl +125 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/repeat.cl +31 -32
- data/ext/sources/ggml/src/ggml-opencl/kernels/scale.cl +14 -4
- data/ext/sources/ggml/src/ggml-opencl/kernels/softplus.cl +88 -60
- data/ext/sources/ggml/src/ggml-opencl/kernels/solve_tri.cl +51 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/sum_rows.cl +114 -13
- data/ext/sources/ggml/src/ggml-opencl/kernels/tanh.cl +94 -48
- data/ext/sources/ggml/src/ggml-opencl/kernels/transpose.cl +26 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/tri.cl +32 -0
- data/ext/sources/ggml/src/ggml-openvino/.clang-format +154 -0
- data/ext/sources/ggml/src/ggml-openvino/CMakeLists.txt +22 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-decoder.cpp +975 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-decoder.h +294 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-openvino-extra.cpp +373 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-openvino-extra.h +182 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-openvino.cpp +1110 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-quants.cpp +884 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-quants.h +153 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/decoder.h +74 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/frontend.cpp +27 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/frontend.h +23 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/input_model.cpp +17 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/input_model.h +29 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/node_context.h +112 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/cont.cpp +48 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/cpy.cpp +21 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp +90 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/get_rows.cpp +69 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/glu_geglu.cpp +61 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp +62 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/mulmat.cpp +90 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/permute.cpp +102 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/reshape.cpp +83 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp +46 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/rope.cpp +123 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/scale.cpp +41 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/set_rows.cpp +76 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/softmax.cpp +89 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/transpose.cpp +23 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/unary_silu.cpp +27 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/view.cpp +53 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op_table.cpp +46 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op_table.h +39 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.cpp +123 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.h +17 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.cpp +60 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.h +17 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/mark_decompression_convert_constant_folding.h +29 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/squeeze_matmul.cpp +58 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/squeeze_matmul.h +17 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/translate_session.cpp +293 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/translate_session.h +28 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/utils.cpp +226 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/utils.h +85 -0
- data/ext/sources/ggml/src/ggml-openvino/utils.cpp +823 -0
- data/ext/sources/ggml/src/ggml-openvino/utils.h +123 -0
- data/ext/sources/ggml/src/ggml-quants.c +96 -5
- data/ext/sources/ggml/src/ggml-quants.h +3 -0
- data/ext/sources/ggml/src/ggml-sycl/CMakeLists.txt +15 -88
- data/ext/sources/ggml/src/ggml-sycl/add-id.cpp +5 -1
- data/ext/sources/ggml/src/ggml-sycl/backend.hpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/binbcast.cpp +21 -20
- data/ext/sources/ggml/src/ggml-sycl/common.hpp +315 -10
- data/ext/sources/ggml/src/ggml-sycl/convert.cpp +69 -1
- data/ext/sources/ggml/src/ggml-sycl/convert.hpp +22 -1
- data/ext/sources/ggml/src/ggml-sycl/count-equal.cpp +1 -1
- data/ext/sources/ggml/src/ggml-sycl/dpct/helper.hpp +791 -47
- data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +78 -68
- data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +2 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn-common.hpp +1179 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn-tile.cpp +55 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn-tile.hpp +1338 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn-vec.hpp +667 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn.cpp +225 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn.hpp +22 -0
- data/ext/sources/ggml/src/ggml-sycl/gated_delta_net.cpp +309 -0
- data/ext/sources/ggml/src/ggml-sycl/gated_delta_net.hpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/ggml-sycl.cpp +316 -51
- data/ext/sources/ggml/src/ggml-sycl/norm.cpp +65 -66
- data/ext/sources/ggml/src/ggml-sycl/outprod.cpp +3 -3
- data/ext/sources/ggml/src/ggml-sycl/presets.hpp +3 -0
- data/ext/sources/ggml/src/ggml-sycl/quants.hpp +1 -1
- data/ext/sources/ggml/src/ggml-sycl/rope.cpp +450 -287
- data/ext/sources/ggml/src/ggml-sycl/rope.hpp +6 -0
- data/ext/sources/ggml/src/ggml-sycl/softmax.cpp +6 -6
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq112-dv112.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq128-dv128.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq256-dv256.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq40-dv40.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq576-dv512.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq64-dv64.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq72-dv72.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq80-dv80.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq96-dv96.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-f16.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q8_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-f16.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q8_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-f16.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q8_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-f16.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q8_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-f16.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q8_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-f16.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q8_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +13 -0
- data/ext/sources/ggml/src/ggml-sycl/wkv.cpp +1 -1
- data/ext/sources/ggml/src/ggml-virtgpu/CMakeLists.txt +70 -0
- data/ext/sources/ggml/src/ggml-virtgpu/apir_cs_ggml-rpc-front.cpp +87 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/CMakeLists.txt +21 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/apir_cs_ggml-rpc-back.cpp +115 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-convert.h +13 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-backend.cpp +102 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer-type.cpp +105 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer.cpp +179 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-device.cpp +148 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched.cpp +51 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched.gen.h +73 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched.h +27 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-virgl-apir.h +32 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend.cpp +144 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/api_remoting.h +95 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_backend.gen.h +94 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_backend.h +50 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_cs.h +378 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_cs_ggml.h +232 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_cs_rpc.h +58 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-buffer-type.cpp +81 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-buffer.cpp +119 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-device.cpp +158 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-reg.cpp +213 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend.cpp +69 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-remoting.h +71 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggmlremoting_functions.yaml +166 -0
- data/ext/sources/ggml/src/ggml-virtgpu/include/apir_hw.h +9 -0
- data/ext/sources/ggml/src/ggml-virtgpu/regenerate_remoting.py +333 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-apir.h +15 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-backend.cpp +58 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-buffer-type.cpp +110 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-buffer.cpp +173 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-device.cpp +192 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-impl.h +36 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward.gen.h +53 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-shm.cpp +98 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-shm.h +23 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-utils.cpp +179 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-utils.h +86 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu.cpp +544 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu.h +117 -0
- data/ext/sources/ggml/src/ggml-vulkan/CMakeLists.txt +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/ggml-vulkan.cpp +1250 -465
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/acc.comp +16 -8
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/elu.comp +27 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +374 -170
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.glsl +66 -22
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +389 -201
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +106 -58
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_mask_opt.comp +162 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +9 -8
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gated_delta_net.comp +128 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/l2_norm.comp +12 -9
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.glsl +20 -17
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +11 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +8 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_id_funcs.glsl +3 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +5 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.glsl +3 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +2 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl +36 -63
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +7 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +7 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +7 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl +10 -5
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_vision.comp +7 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sgn.comp +21 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/ssm_conv.comp +16 -10
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +55 -35
- data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp +1314 -109
- data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu.cpp +1660 -1371
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/argmax.wgsl +72 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/argsort.wgsl +106 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/argsort_merge.wgsl +134 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary.wgsl +141 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +65 -72
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/concat.wgsl +75 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cpy.tmpl.wgsl +6 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cumsum.wgsl +66 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +40 -5
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn.wgsl +105 -60
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{get_rows.tmpl.wgsl → get_rows.wgsl} +53 -259
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{mul_mat.tmpl.wgsl → mul_mat.wgsl} +68 -257
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl +692 -23
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{mul_mat_reg_tile.tmpl.wgsl → mul_mat_reg_tile.wgsl} +28 -128
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{mul_mat_subgroup_matrix.tmpl.wgsl → mul_mat_subgroup_matrix.wgsl} +31 -137
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.wgsl +480 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/pad.wgsl +86 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/repeat.wgsl +67 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{scale.tmpl.wgsl → scale.wgsl} +9 -36
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +40 -12
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/sum_rows.wgsl +55 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/unary.wgsl +193 -0
- data/ext/sources/ggml/src/ggml-zdnn/ggml-zdnn.cpp +6 -1
- data/ext/sources/ggml/src/ggml-zendnn/CMakeLists.txt +31 -32
- data/ext/sources/ggml/src/ggml-zendnn/ggml-zendnn.cpp +9 -6
- data/ext/sources/ggml/src/ggml.c +167 -33
- data/ext/sources/ggml/src/gguf.cpp +229 -44
- data/ext/sources/src/whisper.cpp +6 -28
- data/sig/whisper.rbs +43 -2
- data/test/test_context_params.rb +82 -0
- data/test/test_token.rb +11 -0
- data/test/test_vad_context.rb +58 -8
- data/test/test_whisper.rb +20 -0
- data/whispercpp.gemspec +1 -1
- metadata +240 -28
- data/ext/sources/ggml/cmake/BuildTypes.cmake +0 -54
- data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm-ppc.h +0 -333
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-exp.c +0 -94
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-inverse.c +0 -72
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sigmoid.c +0 -49
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-utils.c +0 -1020
- data/ext/sources/ggml/src/ggml-hexagon/htp/ops-utils.h +0 -149
- data/ext/sources/ggml/src/ggml-hexagon/htp-utils.c +0 -454
- data/ext/sources/ggml/src/ggml-hexagon/htp-utils.h +0 -221
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/bin_op.tmpl.wgsl +0 -188
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary_head.tmpl +0 -45
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.tmpl.wgsl +0 -267
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.tmpl.wgsl +0 -112
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/unary_op.wgsl +0 -483
|
@@ -0,0 +1,275 @@
|
|
|
1
|
+
#include "llama-memory-hybrid-iswa.h"
|
|
2
|
+
|
|
3
|
+
#include "llama-impl.h"
|
|
4
|
+
#include "llama-model.h"
|
|
5
|
+
#include "llama-context.h"
|
|
6
|
+
|
|
7
|
+
//
|
|
8
|
+
// llama_memory_hybrid_iswa
|
|
9
|
+
//
|
|
10
|
+
|
|
11
|
+
llama_memory_hybrid_iswa::llama_memory_hybrid_iswa(
|
|
12
|
+
const llama_model & model,
|
|
13
|
+
/* attn */
|
|
14
|
+
ggml_type type_k,
|
|
15
|
+
ggml_type type_v,
|
|
16
|
+
bool v_trans,
|
|
17
|
+
bool swa_full,
|
|
18
|
+
uint32_t kv_size,
|
|
19
|
+
uint32_t n_ubatch,
|
|
20
|
+
uint32_t n_pad,
|
|
21
|
+
/* recurrent */
|
|
22
|
+
ggml_type type_r,
|
|
23
|
+
ggml_type type_s,
|
|
24
|
+
uint32_t rs_size,
|
|
25
|
+
/* common */
|
|
26
|
+
uint32_t n_seq_max,
|
|
27
|
+
bool offload,
|
|
28
|
+
bool unified,
|
|
29
|
+
/* layer filters */
|
|
30
|
+
const layer_filter_cb & filter_attn,
|
|
31
|
+
const layer_filter_cb & filter_recr) :
|
|
32
|
+
hparams(model.hparams),
|
|
33
|
+
mem_attn(new llama_kv_cache_iswa(
|
|
34
|
+
model,
|
|
35
|
+
type_k,
|
|
36
|
+
type_v,
|
|
37
|
+
v_trans,
|
|
38
|
+
offload,
|
|
39
|
+
swa_full,
|
|
40
|
+
unified,
|
|
41
|
+
kv_size,
|
|
42
|
+
n_seq_max,
|
|
43
|
+
n_ubatch,
|
|
44
|
+
n_pad,
|
|
45
|
+
filter_attn == nullptr ?
|
|
46
|
+
[&](int32_t il) { return !hparams.is_recurrent(il); }
|
|
47
|
+
: filter_attn,
|
|
48
|
+
nullptr
|
|
49
|
+
)),
|
|
50
|
+
mem_recr(new llama_memory_recurrent(
|
|
51
|
+
model,
|
|
52
|
+
type_r,
|
|
53
|
+
type_s,
|
|
54
|
+
offload,
|
|
55
|
+
rs_size,
|
|
56
|
+
n_seq_max,
|
|
57
|
+
filter_recr == nullptr ?
|
|
58
|
+
[&](int32_t il) { return hparams.is_recurrent(il); }
|
|
59
|
+
: filter_recr
|
|
60
|
+
)) {}
|
|
61
|
+
|
|
62
|
+
llama_memory_context_ptr llama_memory_hybrid_iswa::init_batch(llama_batch_allocr & balloc, uint32_t n_ubatch, bool embd_all) {
|
|
63
|
+
do {
|
|
64
|
+
balloc.split_reset();
|
|
65
|
+
|
|
66
|
+
// follow the recurrent pattern for creating the ubatch splits
|
|
67
|
+
std::vector<llama_ubatch> ubatches;
|
|
68
|
+
|
|
69
|
+
while (true) {
|
|
70
|
+
llama_ubatch ubatch;
|
|
71
|
+
|
|
72
|
+
if (embd_all) {
|
|
73
|
+
// if all tokens are output, split by sequence
|
|
74
|
+
ubatch = balloc.split_seq(n_ubatch);
|
|
75
|
+
} else {
|
|
76
|
+
// TODO: non-sequential equal split can be done if using unified KV cache
|
|
77
|
+
// for simplicity, we always use sequential equal split for now
|
|
78
|
+
ubatch = balloc.split_equal(n_ubatch, true);
|
|
79
|
+
}
|
|
80
|
+
|
|
81
|
+
if (ubatch.n_tokens == 0) {
|
|
82
|
+
break;
|
|
83
|
+
}
|
|
84
|
+
|
|
85
|
+
ubatches.push_back(std::move(ubatch)); // NOLINT
|
|
86
|
+
}
|
|
87
|
+
|
|
88
|
+
if (balloc.get_n_used() < balloc.get_n_tokens()) {
|
|
89
|
+
// failed to find a suitable split
|
|
90
|
+
break;
|
|
91
|
+
}
|
|
92
|
+
|
|
93
|
+
// prepare the recurrent batches first
|
|
94
|
+
if (!mem_recr->prepare(ubatches)) {
|
|
95
|
+
// TODO: will the recurrent cache be in an undefined context at this point?
|
|
96
|
+
LLAMA_LOG_ERROR("%s: failed to prepare recurrent ubatches\n", __func__);
|
|
97
|
+
return std::make_unique<llama_memory_hybrid_iswa_context>(LLAMA_MEMORY_STATUS_FAILED_PREPARE);
|
|
98
|
+
}
|
|
99
|
+
|
|
100
|
+
// prepare the attention cache (iswa version returns both base and swa slot infos)
|
|
101
|
+
auto sinfos_base = mem_attn->get_base()->prepare(ubatches);
|
|
102
|
+
if (sinfos_base.empty()) {
|
|
103
|
+
LLAMA_LOG_ERROR("%s: failed to prepare attention base ubatches\n", __func__);
|
|
104
|
+
return std::make_unique<llama_memory_hybrid_iswa_context>(LLAMA_MEMORY_STATUS_FAILED_PREPARE);
|
|
105
|
+
}
|
|
106
|
+
|
|
107
|
+
auto sinfos_swa = mem_attn->get_swa()->prepare(ubatches);
|
|
108
|
+
if (sinfos_swa.empty()) {
|
|
109
|
+
LLAMA_LOG_ERROR("%s: failed to prepare attention swa ubatches\n", __func__);
|
|
110
|
+
return std::make_unique<llama_memory_hybrid_iswa_context>(LLAMA_MEMORY_STATUS_FAILED_PREPARE);
|
|
111
|
+
}
|
|
112
|
+
|
|
113
|
+
return std::make_unique<llama_memory_hybrid_iswa_context>(
|
|
114
|
+
this, std::move(sinfos_base), std::move(sinfos_swa), std::move(ubatches));
|
|
115
|
+
} while(false);
|
|
116
|
+
|
|
117
|
+
return std::make_unique<llama_memory_hybrid_iswa_context>(LLAMA_MEMORY_STATUS_FAILED_PREPARE);
|
|
118
|
+
}
|
|
119
|
+
|
|
120
|
+
llama_memory_context_ptr llama_memory_hybrid_iswa::init_full() {
|
|
121
|
+
return std::make_unique<llama_memory_hybrid_iswa_context>(this);
|
|
122
|
+
}
|
|
123
|
+
|
|
124
|
+
llama_memory_context_ptr llama_memory_hybrid_iswa::init_update(llama_context * lctx, bool optimize) {
|
|
125
|
+
return std::make_unique<llama_memory_hybrid_iswa_context>(this, lctx, optimize);
|
|
126
|
+
}
|
|
127
|
+
|
|
128
|
+
bool llama_memory_hybrid_iswa::get_can_shift() const {
|
|
129
|
+
// Shifting is trivially supported for recurrent
|
|
130
|
+
return mem_attn->get_can_shift();
|
|
131
|
+
}
|
|
132
|
+
|
|
133
|
+
void llama_memory_hybrid_iswa::clear(bool data) {
|
|
134
|
+
mem_attn->clear(data);
|
|
135
|
+
mem_recr->clear(data);
|
|
136
|
+
}
|
|
137
|
+
|
|
138
|
+
bool llama_memory_hybrid_iswa::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
|
|
139
|
+
// Try removing from the recurrent cache first since it may fail. If it does
|
|
140
|
+
// fail, the cache will not have been mutated.
|
|
141
|
+
if (!mem_recr->seq_rm(seq_id, p0, p1)) {
|
|
142
|
+
return false;
|
|
143
|
+
}
|
|
144
|
+
return mem_attn->seq_rm(seq_id, p0, p1);
|
|
145
|
+
}
|
|
146
|
+
|
|
147
|
+
void llama_memory_hybrid_iswa::seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) {
|
|
148
|
+
mem_attn->seq_cp(seq_id_src, seq_id_dst, p0, p1);
|
|
149
|
+
mem_recr->seq_cp(seq_id_src, seq_id_dst, p0, p1);
|
|
150
|
+
}
|
|
151
|
+
|
|
152
|
+
void llama_memory_hybrid_iswa::seq_keep(llama_seq_id seq_id) {
|
|
153
|
+
mem_attn->seq_keep(seq_id);
|
|
154
|
+
mem_recr->seq_keep(seq_id);
|
|
155
|
+
}
|
|
156
|
+
|
|
157
|
+
void llama_memory_hybrid_iswa::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos shift) {
|
|
158
|
+
mem_attn->seq_add(seq_id, p0, p1, shift);
|
|
159
|
+
mem_recr->seq_add(seq_id, p0, p1, shift);
|
|
160
|
+
}
|
|
161
|
+
|
|
162
|
+
void llama_memory_hybrid_iswa::seq_div(llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) {
|
|
163
|
+
mem_attn->seq_div(seq_id, p0, p1, d);
|
|
164
|
+
mem_recr->seq_div(seq_id, p0, p1, d);
|
|
165
|
+
}
|
|
166
|
+
|
|
167
|
+
llama_pos llama_memory_hybrid_iswa::seq_pos_min(llama_seq_id seq_id) const {
|
|
168
|
+
// the min of the total cache is the max of the two caches' min values
|
|
169
|
+
return std::max(mem_attn->seq_pos_min(seq_id), mem_recr->seq_pos_min(seq_id));
|
|
170
|
+
}
|
|
171
|
+
|
|
172
|
+
llama_pos llama_memory_hybrid_iswa::seq_pos_max(llama_seq_id seq_id) const {
|
|
173
|
+
// the max of the total cache is the min of the two caches' max values
|
|
174
|
+
return std::min(mem_attn->seq_pos_max(seq_id), mem_recr->seq_pos_max(seq_id));
|
|
175
|
+
}
|
|
176
|
+
|
|
177
|
+
std::map<ggml_backend_buffer_type_t, size_t> llama_memory_hybrid_iswa::memory_breakdown() const {
|
|
178
|
+
std::map<ggml_backend_buffer_type_t, size_t> mb = mem_attn->memory_breakdown();
|
|
179
|
+
for (const auto & buft_size : mem_recr->memory_breakdown()) {
|
|
180
|
+
mb[buft_size.first] += buft_size.second;
|
|
181
|
+
}
|
|
182
|
+
return mb;
|
|
183
|
+
}
|
|
184
|
+
|
|
185
|
+
void llama_memory_hybrid_iswa::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const {
|
|
186
|
+
mem_attn->state_write(io, seq_id, flags);
|
|
187
|
+
mem_recr->state_write(io, seq_id, flags);
|
|
188
|
+
}
|
|
189
|
+
|
|
190
|
+
void llama_memory_hybrid_iswa::state_read(llama_io_read_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) {
|
|
191
|
+
mem_attn->state_read(io, seq_id, flags);
|
|
192
|
+
mem_recr->state_read(io, seq_id, flags);
|
|
193
|
+
}
|
|
194
|
+
|
|
195
|
+
llama_kv_cache_iswa * llama_memory_hybrid_iswa::get_mem_attn() const {
|
|
196
|
+
return mem_attn.get();
|
|
197
|
+
}
|
|
198
|
+
|
|
199
|
+
llama_memory_recurrent * llama_memory_hybrid_iswa::get_mem_recr() const {
|
|
200
|
+
return mem_recr.get();
|
|
201
|
+
}
|
|
202
|
+
|
|
203
|
+
//
|
|
204
|
+
// llama_memory_hybrid_iswa_context
|
|
205
|
+
//
|
|
206
|
+
|
|
207
|
+
llama_memory_hybrid_iswa_context::llama_memory_hybrid_iswa_context(llama_memory_status status) : status(status) {}
|
|
208
|
+
|
|
209
|
+
llama_memory_hybrid_iswa_context::llama_memory_hybrid_iswa_context(llama_memory_hybrid_iswa * mem) :
|
|
210
|
+
ctx_attn(mem->get_mem_attn()->init_full()),
|
|
211
|
+
ctx_recr(mem->get_mem_recr()->init_full()),
|
|
212
|
+
status(llama_memory_status_combine(ctx_attn->get_status(), ctx_recr->get_status())) {
|
|
213
|
+
}
|
|
214
|
+
|
|
215
|
+
llama_memory_hybrid_iswa_context::llama_memory_hybrid_iswa_context(
|
|
216
|
+
llama_memory_hybrid_iswa * mem,
|
|
217
|
+
llama_context * lctx,
|
|
218
|
+
bool optimize) :
|
|
219
|
+
ctx_attn(mem->get_mem_attn()->init_update(lctx, optimize)),
|
|
220
|
+
ctx_recr(mem->get_mem_recr()->init_update(lctx, optimize)),
|
|
221
|
+
status(llama_memory_status_combine(ctx_attn->get_status(), ctx_recr->get_status())) {
|
|
222
|
+
}
|
|
223
|
+
|
|
224
|
+
llama_memory_hybrid_iswa_context::llama_memory_hybrid_iswa_context(
|
|
225
|
+
llama_memory_hybrid_iswa * mem,
|
|
226
|
+
slot_info_vec_t sinfos_base,
|
|
227
|
+
slot_info_vec_t sinfos_swa,
|
|
228
|
+
std::vector<llama_ubatch> ubatches) :
|
|
229
|
+
ubatches(std::move(ubatches)),
|
|
230
|
+
// note: here we copy the ubatches. not sure if this is ideal
|
|
231
|
+
ctx_attn(new llama_kv_cache_iswa_context(mem->get_mem_attn(), std::move(sinfos_base), std::move(sinfos_swa), this->ubatches)),
|
|
232
|
+
ctx_recr(new llama_memory_recurrent_context(mem->get_mem_recr(), this->ubatches)),
|
|
233
|
+
status(llama_memory_status_combine(ctx_attn->get_status(), ctx_recr->get_status())) {
|
|
234
|
+
}
|
|
235
|
+
|
|
236
|
+
bool llama_memory_hybrid_iswa_context::next() {
|
|
237
|
+
assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
|
|
238
|
+
|
|
239
|
+
ctx_attn->next();
|
|
240
|
+
ctx_recr->next();
|
|
241
|
+
|
|
242
|
+
if (++i_next >= ubatches.size()) {
|
|
243
|
+
return false;
|
|
244
|
+
}
|
|
245
|
+
|
|
246
|
+
return true;
|
|
247
|
+
}
|
|
248
|
+
|
|
249
|
+
bool llama_memory_hybrid_iswa_context::apply() {
|
|
250
|
+
assert(!llama_memory_status_is_fail(status));
|
|
251
|
+
|
|
252
|
+
bool res = true;
|
|
253
|
+
|
|
254
|
+
res = res & ctx_attn->apply();
|
|
255
|
+
res = res & ctx_recr->apply();
|
|
256
|
+
|
|
257
|
+
return res;
|
|
258
|
+
}
|
|
259
|
+
|
|
260
|
+
llama_memory_status llama_memory_hybrid_iswa_context::get_status() const {
|
|
261
|
+
return status;
|
|
262
|
+
}
|
|
263
|
+
|
|
264
|
+
const llama_ubatch & llama_memory_hybrid_iswa_context::get_ubatch() const {
|
|
265
|
+
assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
|
|
266
|
+
return ubatches[i_next];
|
|
267
|
+
}
|
|
268
|
+
|
|
269
|
+
const llama_kv_cache_iswa_context * llama_memory_hybrid_iswa_context::get_attn() const {
|
|
270
|
+
return static_cast<const llama_kv_cache_iswa_context *>(ctx_attn.get());
|
|
271
|
+
}
|
|
272
|
+
|
|
273
|
+
const llama_memory_recurrent_context * llama_memory_hybrid_iswa_context::get_recr() const {
|
|
274
|
+
return static_cast<const llama_memory_recurrent_context *>(ctx_recr.get());
|
|
275
|
+
}
|
|
@@ -0,0 +1,140 @@
|
|
|
1
|
+
#pragma once
|
|
2
|
+
|
|
3
|
+
#include "llama-batch.h"
|
|
4
|
+
#include "llama-graph.h"
|
|
5
|
+
#include "llama-kv-cache-iswa.h"
|
|
6
|
+
#include "llama-memory.h"
|
|
7
|
+
#include "llama-memory-recurrent.h"
|
|
8
|
+
|
|
9
|
+
#include <memory>
|
|
10
|
+
#include <vector>
|
|
11
|
+
|
|
12
|
+
//
|
|
13
|
+
// llama_memory_hybrid_iswa
|
|
14
|
+
//
|
|
15
|
+
|
|
16
|
+
// utilizes instances of llama_memory_recurrent and llama_kv_cache_iswa to
|
|
17
|
+
// support models where each layer may be either attention-based (with SWA support) or recurrent
|
|
18
|
+
|
|
19
|
+
class llama_memory_hybrid_iswa : public llama_memory_i {
|
|
20
|
+
public:
|
|
21
|
+
llama_memory_hybrid_iswa(
|
|
22
|
+
const llama_model & model,
|
|
23
|
+
/* attn */
|
|
24
|
+
ggml_type type_k,
|
|
25
|
+
ggml_type type_v,
|
|
26
|
+
bool v_trans,
|
|
27
|
+
bool swa_full,
|
|
28
|
+
uint32_t kv_size,
|
|
29
|
+
uint32_t n_ubatch,
|
|
30
|
+
uint32_t n_pad,
|
|
31
|
+
/* recurrent */
|
|
32
|
+
ggml_type type_r,
|
|
33
|
+
ggml_type type_s,
|
|
34
|
+
uint32_t rs_size,
|
|
35
|
+
/* common */
|
|
36
|
+
uint32_t n_seq_max,
|
|
37
|
+
bool offload,
|
|
38
|
+
bool unified,
|
|
39
|
+
/* layer filters */
|
|
40
|
+
const layer_filter_cb & filter_attn = nullptr,
|
|
41
|
+
const layer_filter_cb & filter_recr = nullptr);
|
|
42
|
+
|
|
43
|
+
~llama_memory_hybrid_iswa() = default;
|
|
44
|
+
|
|
45
|
+
//
|
|
46
|
+
// llama_memory_i
|
|
47
|
+
//
|
|
48
|
+
|
|
49
|
+
llama_memory_context_ptr init_batch(
|
|
50
|
+
llama_batch_allocr & balloc,
|
|
51
|
+
uint32_t n_ubatch,
|
|
52
|
+
bool embd_all) override;
|
|
53
|
+
|
|
54
|
+
llama_memory_context_ptr init_full() override;
|
|
55
|
+
|
|
56
|
+
llama_memory_context_ptr init_update(llama_context * lctx, bool optimize) override;
|
|
57
|
+
|
|
58
|
+
bool get_can_shift() const override;
|
|
59
|
+
|
|
60
|
+
void clear(bool data) override;
|
|
61
|
+
|
|
62
|
+
bool seq_rm (llama_seq_id seq_id, llama_pos p0, llama_pos p1) override;
|
|
63
|
+
void seq_cp (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) override;
|
|
64
|
+
void seq_keep(llama_seq_id seq_id) override;
|
|
65
|
+
void seq_add (llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos shift) override;
|
|
66
|
+
void seq_div (llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) override;
|
|
67
|
+
|
|
68
|
+
llama_pos seq_pos_min(llama_seq_id seq_id) const override;
|
|
69
|
+
llama_pos seq_pos_max(llama_seq_id seq_id) const override;
|
|
70
|
+
|
|
71
|
+
std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const override;
|
|
72
|
+
|
|
73
|
+
// state write/load
|
|
74
|
+
|
|
75
|
+
void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) const override;
|
|
76
|
+
void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) override;
|
|
77
|
+
|
|
78
|
+
//
|
|
79
|
+
// llama_memory_hybrid_iswa specific API
|
|
80
|
+
//
|
|
81
|
+
|
|
82
|
+
llama_kv_cache_iswa * get_mem_attn() const;
|
|
83
|
+
llama_memory_recurrent * get_mem_recr() const;
|
|
84
|
+
|
|
85
|
+
private:
|
|
86
|
+
const llama_hparams & hparams;
|
|
87
|
+
|
|
88
|
+
const std::unique_ptr<llama_kv_cache_iswa> mem_attn;
|
|
89
|
+
const std::unique_ptr<llama_memory_recurrent> mem_recr;
|
|
90
|
+
};
|
|
91
|
+
|
|
92
|
+
class llama_memory_hybrid_iswa_context : public llama_memory_context_i {
|
|
93
|
+
public:
|
|
94
|
+
using slot_info_vec_t = llama_kv_cache::slot_info_vec_t;
|
|
95
|
+
|
|
96
|
+
// init failure
|
|
97
|
+
explicit llama_memory_hybrid_iswa_context(llama_memory_status status);
|
|
98
|
+
|
|
99
|
+
// init full
|
|
100
|
+
explicit llama_memory_hybrid_iswa_context(llama_memory_hybrid_iswa * mem);
|
|
101
|
+
|
|
102
|
+
// init update
|
|
103
|
+
explicit llama_memory_hybrid_iswa_context(
|
|
104
|
+
llama_memory_hybrid_iswa * mem,
|
|
105
|
+
llama_context * lctx,
|
|
106
|
+
bool optimize);
|
|
107
|
+
|
|
108
|
+
// init success
|
|
109
|
+
llama_memory_hybrid_iswa_context(
|
|
110
|
+
llama_memory_hybrid_iswa * mem,
|
|
111
|
+
slot_info_vec_t sinfos_base,
|
|
112
|
+
slot_info_vec_t sinfos_swa,
|
|
113
|
+
std::vector<llama_ubatch> ubatches);
|
|
114
|
+
|
|
115
|
+
~llama_memory_hybrid_iswa_context() = default;
|
|
116
|
+
|
|
117
|
+
bool next() override;
|
|
118
|
+
bool apply() override;
|
|
119
|
+
|
|
120
|
+
llama_memory_status get_status() const override;
|
|
121
|
+
const llama_ubatch & get_ubatch() const override;
|
|
122
|
+
|
|
123
|
+
//
|
|
124
|
+
// llama_memory_hybrid_iswa_context
|
|
125
|
+
//
|
|
126
|
+
|
|
127
|
+
const llama_kv_cache_iswa_context * get_attn() const;
|
|
128
|
+
const llama_memory_recurrent_context * get_recr() const;
|
|
129
|
+
|
|
130
|
+
private:
|
|
131
|
+
// the index of the next ubatch to process
|
|
132
|
+
size_t i_next = 0;
|
|
133
|
+
|
|
134
|
+
std::vector<llama_ubatch> ubatches;
|
|
135
|
+
|
|
136
|
+
const llama_memory_context_ptr ctx_attn;
|
|
137
|
+
const llama_memory_context_ptr ctx_recr;
|
|
138
|
+
|
|
139
|
+
const llama_memory_status status;
|
|
140
|
+
};
|
|
@@ -163,7 +163,7 @@ bool llama_memory_recurrent::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos
|
|
|
163
163
|
const auto & cell = cells[tail_id];
|
|
164
164
|
// partial intersection is invalid if it includes the final pos
|
|
165
165
|
if (0 < p0 && p0 <= cell.pos && p1 > cell.pos) {
|
|
166
|
-
//printf("[DEBUG] inside `llama_memory_recurrent::seq_rm`: partial intersection is invalid, so returning false\n");
|
|
166
|
+
//printf("[DEBUG] inside `llama_memory_recurrent::seq_rm`: partial intersection is invalid, so returning false, p0 = %d, cell.pos = %d, p1 = %d\n", p0, cell.pos, p1);
|
|
167
167
|
return false;
|
|
168
168
|
}
|
|
169
169
|
// invalidate tails which will be cleared
|
|
@@ -785,23 +785,21 @@ void llama_memory_recurrent::state_write_data(llama_io_write_i & io, const std::
|
|
|
785
785
|
io.write(&s_trans, sizeof(s_trans));
|
|
786
786
|
io.write(&n_layer, sizeof(n_layer));
|
|
787
787
|
|
|
788
|
-
|
|
789
|
-
|
|
790
|
-
// Iterate and write all the keys first, each row is a cell
|
|
788
|
+
// Iterate and write all the R tensors first, each row is a cell
|
|
791
789
|
// Get whole range at a time
|
|
792
790
|
for (uint32_t il = 0; il < n_layer; ++il) {
|
|
793
791
|
// skip null layers (read_data will handle this by checking "r_l" and "s_l" for null)
|
|
794
792
|
if (r_l[il] == nullptr) continue;
|
|
795
793
|
|
|
796
|
-
// Write
|
|
794
|
+
// Write R tensor type
|
|
797
795
|
const int32_t r_type_i = (int32_t)r_l[il]->type;
|
|
798
796
|
io.write(&r_type_i, sizeof(r_type_i));
|
|
799
797
|
|
|
800
|
-
// Write row size of
|
|
798
|
+
// Write row size of R tensor
|
|
801
799
|
const uint64_t r_size_row = ggml_row_size(r_l[il]->type, hparams.n_embd_r());
|
|
802
800
|
io.write(&r_size_row, sizeof(r_size_row));
|
|
803
801
|
|
|
804
|
-
//
|
|
802
|
+
// Write each range of cells of r_size_row length
|
|
805
803
|
for (const auto & range : cell_ranges) {
|
|
806
804
|
const size_t range_size = range.second - range.first;
|
|
807
805
|
const size_t buf_size = range_size * r_size_row;
|
|
@@ -814,15 +812,15 @@ void llama_memory_recurrent::state_write_data(llama_io_write_i & io, const std::
|
|
|
814
812
|
// skip null layers (read_data will handle this by checking "r_l" and "s_l" for null)
|
|
815
813
|
if (s_l[il] == nullptr) continue;
|
|
816
814
|
|
|
817
|
-
// Write
|
|
815
|
+
// Write S tensor type
|
|
818
816
|
const int32_t s_type_i = (int32_t)s_l[il]->type;
|
|
819
817
|
io.write(&s_type_i, sizeof(s_type_i));
|
|
820
818
|
|
|
821
|
-
// Write row size of
|
|
819
|
+
// Write row size of S tensor
|
|
822
820
|
const uint64_t s_size_row = ggml_row_size(s_l[il]->type, hparams.n_embd_s());
|
|
823
821
|
io.write(&s_size_row, sizeof(s_size_row));
|
|
824
822
|
|
|
825
|
-
//
|
|
823
|
+
// Write each range of S tensor rows
|
|
826
824
|
for (const auto & range : cell_ranges) {
|
|
827
825
|
const size_t range_size = range.second - range.first;
|
|
828
826
|
const size_t buf_size = range_size * s_size_row;
|
|
@@ -830,7 +828,7 @@ void llama_memory_recurrent::state_write_data(llama_io_write_i & io, const std::
|
|
|
830
828
|
}
|
|
831
829
|
}
|
|
832
830
|
} else {
|
|
833
|
-
// When
|
|
831
|
+
// When S tensor is transposed, we also need the element size and get the element ranges from each row
|
|
834
832
|
const uint32_t mem_size = size;
|
|
835
833
|
for (uint32_t il = 0; il < n_layer; ++il) {
|
|
836
834
|
// skip null layers (read_data will handle this by checking "r_l" and "s_l" for null)
|
|
@@ -838,7 +836,7 @@ void llama_memory_recurrent::state_write_data(llama_io_write_i & io, const std::
|
|
|
838
836
|
|
|
839
837
|
const uint32_t n_embd_s = hparams.n_embd_s();
|
|
840
838
|
|
|
841
|
-
// Write
|
|
839
|
+
// Write S tensor type
|
|
842
840
|
const int32_t s_type_i = (int32_t)s_l[il]->type;
|
|
843
841
|
io.write(&s_type_i, sizeof(s_type_i));
|
|
844
842
|
|
|
@@ -851,7 +849,7 @@ void llama_memory_recurrent::state_write_data(llama_io_write_i & io, const std::
|
|
|
851
849
|
|
|
852
850
|
// For each row, we get the element values of each cell
|
|
853
851
|
for (uint32_t j = 0; j < n_embd_s; ++j) {
|
|
854
|
-
//
|
|
852
|
+
// Write each range of cells of s_size_el length
|
|
855
853
|
for (const auto & range : cell_ranges) {
|
|
856
854
|
const size_t range_size = range.second - range.first;
|
|
857
855
|
const size_t src_offset = (range.first + j * mem_size) * s_size_el;
|
|
@@ -244,11 +244,14 @@ struct llama_file::impl {
|
|
|
244
244
|
}
|
|
245
245
|
errno = 0;
|
|
246
246
|
if (fd == -1) {
|
|
247
|
-
|
|
247
|
+
const size_t curr_off = tell();
|
|
248
|
+
const size_t to_read = std::min(len, size - curr_off);
|
|
249
|
+
|
|
250
|
+
std::size_t ret = std::fread(ptr, to_read, 1, fp);
|
|
248
251
|
if (ferror(fp)) {
|
|
249
252
|
throw std::runtime_error(format("read error: %s", strerror(errno)));
|
|
250
253
|
}
|
|
251
|
-
if (ret != 1) {
|
|
254
|
+
if (to_read > 0 && ret != 1) {
|
|
252
255
|
throw std::runtime_error("unexpectedly reached end of file");
|
|
253
256
|
}
|
|
254
257
|
} else {
|
|
@@ -262,7 +265,8 @@ struct llama_file::impl {
|
|
|
262
265
|
continue; // Interrupted by signal, retry
|
|
263
266
|
}
|
|
264
267
|
// Fallback to std::fread in case the DMA controller cannot access the buffer
|
|
265
|
-
if (errno == EFAULT) {
|
|
268
|
+
if (errno == EFAULT || errno == EINVAL) {
|
|
269
|
+
LLAMA_LOG_WARN("%s: Falling back to buffered IO due to %s\n", __func__, strerror(errno));
|
|
266
270
|
auto curr_off = tell();
|
|
267
271
|
close(fd);
|
|
268
272
|
fd = -1;
|
|
@@ -381,6 +385,9 @@ int llama_file::file_id() const {
|
|
|
381
385
|
#ifdef _WIN32
|
|
382
386
|
return _fileno(pimpl->fp);
|
|
383
387
|
#else
|
|
388
|
+
if (pimpl->fd != -1) {
|
|
389
|
+
return pimpl->fd;
|
|
390
|
+
}
|
|
384
391
|
#if defined(fileno)
|
|
385
392
|
return fileno(pimpl->fp);
|
|
386
393
|
#else
|
|
@@ -497,6 +504,8 @@ struct llama_mmap::impl {
|
|
|
497
504
|
}
|
|
498
505
|
}
|
|
499
506
|
#elif defined(_WIN32)
|
|
507
|
+
HANDLE hMapping = nullptr;
|
|
508
|
+
|
|
500
509
|
impl(struct llama_file * file, size_t prefetch, bool numa) {
|
|
501
510
|
GGML_UNUSED(numa);
|
|
502
511
|
|
|
@@ -504,7 +513,7 @@ struct llama_mmap::impl {
|
|
|
504
513
|
|
|
505
514
|
HANDLE hFile = (HANDLE) _get_osfhandle(file->file_id());
|
|
506
515
|
|
|
507
|
-
|
|
516
|
+
hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
|
|
508
517
|
|
|
509
518
|
if (hMapping == NULL) {
|
|
510
519
|
DWORD error = GetLastError();
|
|
@@ -513,9 +522,9 @@ struct llama_mmap::impl {
|
|
|
513
522
|
|
|
514
523
|
addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0);
|
|
515
524
|
DWORD error = GetLastError();
|
|
516
|
-
CloseHandle(hMapping);
|
|
517
525
|
|
|
518
526
|
if (addr == NULL) {
|
|
527
|
+
CloseHandle(hMapping);
|
|
519
528
|
throw std::runtime_error(format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str()));
|
|
520
529
|
}
|
|
521
530
|
|
|
@@ -547,9 +556,17 @@ struct llama_mmap::impl {
|
|
|
547
556
|
}
|
|
548
557
|
|
|
549
558
|
~impl() {
|
|
550
|
-
if (
|
|
551
|
-
|
|
552
|
-
|
|
559
|
+
if (hMapping) {
|
|
560
|
+
if (addr) {
|
|
561
|
+
if (!UnmapViewOfFile(addr)) {
|
|
562
|
+
LLAMA_LOG_WARN("warning: UnmapViewOfFile failed: %s\n",
|
|
563
|
+
llama_format_win_err(GetLastError()).c_str());
|
|
564
|
+
}
|
|
565
|
+
}
|
|
566
|
+
if (!CloseHandle(hMapping)) {
|
|
567
|
+
LLAMA_LOG_WARN("warning: CloseHandle failed: %s\n",
|
|
568
|
+
llama_format_win_err(GetLastError()).c_str());
|
|
569
|
+
}
|
|
553
570
|
}
|
|
554
571
|
}
|
|
555
572
|
#else
|
|
@@ -611,9 +628,9 @@ struct llama_mlock::impl {
|
|
|
611
628
|
|
|
612
629
|
char* errmsg = std::strerror(errno);
|
|
613
630
|
bool suggest = (errno == ENOMEM);
|
|
614
|
-
#if defined(TARGET_OS_VISION) || defined(TARGET_OS_TV) || defined(_AIX)
|
|
615
|
-
// visionOS/tvOS
|
|
616
|
-
// Skip resource limit checks on
|
|
631
|
+
#if defined(TARGET_OS_VISION) || defined(TARGET_OS_TV) || defined(_AIX) || defined(__HAIKU__)
|
|
632
|
+
// visionOS/tvOS/Haiku don't support RLIMIT_MEMLOCK
|
|
633
|
+
// Skip resource limit checks on these platforms
|
|
617
634
|
suggest = false;
|
|
618
635
|
#else
|
|
619
636
|
struct rlimit lock_limit;
|