whispercpp 1.3.5 → 1.3.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/LICENSE +1 -1
- data/README.md +99 -2
- data/ext/extconf.rb +1 -0
- data/ext/ruby_whisper.c +20 -4
- data/ext/ruby_whisper.h +30 -2
- data/ext/ruby_whisper_context.c +216 -124
- data/ext/ruby_whisper_context_params.c +163 -0
- data/ext/ruby_whisper_model.c +0 -1
- data/ext/ruby_whisper_params.c +0 -1
- data/ext/ruby_whisper_segment.c +0 -1
- data/ext/ruby_whisper_token.c +29 -9
- data/ext/ruby_whisper_transcribe.cpp +4 -1
- data/ext/ruby_whisper_vad_context.c +48 -1
- data/ext/ruby_whisper_vad_context_detect.cpp +6 -5
- data/ext/ruby_whisper_vad_params.c +0 -1
- data/ext/ruby_whisper_vad_segment.c +0 -1
- data/ext/ruby_whisper_vad_segments.c +0 -1
- data/ext/sources/CMakeLists.txt +1 -1
- data/ext/sources/bindings/javascript/package.json +1 -1
- data/ext/sources/cmake/whisper-config.cmake.in +5 -40
- data/ext/sources/examples/bench/bench.cpp +23 -18
- data/ext/sources/examples/cli/cli.cpp +8 -0
- data/ext/sources/examples/common-ggml.cpp +2 -0
- data/ext/sources/examples/miniaudio.h +4507 -2131
- data/ext/sources/examples/server/server.cpp +18 -4
- data/ext/sources/examples/talk-llama/CMakeLists.txt +3 -2
- data/ext/sources/examples/talk-llama/llama-adapter.cpp +7 -13
- data/ext/sources/examples/talk-llama/llama-adapter.h +4 -3
- data/ext/sources/examples/talk-llama/llama-arch.cpp +335 -17
- data/ext/sources/examples/talk-llama/llama-arch.h +42 -0
- data/ext/sources/examples/talk-llama/llama-batch.cpp +3 -1
- data/ext/sources/examples/talk-llama/llama-chat.cpp +21 -1
- data/ext/sources/examples/talk-llama/llama-chat.h +1 -0
- data/ext/sources/examples/talk-llama/llama-context.cpp +508 -520
- data/ext/sources/examples/talk-llama/llama-context.h +27 -28
- data/ext/sources/examples/talk-llama/llama-cparams.h +5 -0
- data/ext/sources/examples/talk-llama/llama-ext.h +12 -0
- data/ext/sources/examples/talk-llama/llama-grammar.cpp +8 -8
- data/ext/sources/examples/talk-llama/llama-graph.cpp +583 -130
- data/ext/sources/examples/talk-llama/llama-graph.h +131 -10
- data/ext/sources/examples/talk-llama/llama-hparams.cpp +57 -40
- data/ext/sources/examples/talk-llama/llama-hparams.h +79 -10
- data/ext/sources/examples/talk-llama/llama-impl.cpp +4 -4
- data/ext/sources/examples/talk-llama/llama-impl.h +13 -1
- data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.cpp +3 -1
- data/ext/sources/examples/talk-llama/llama-kv-cache.cpp +274 -89
- data/ext/sources/examples/talk-llama/llama-kv-cache.h +2 -3
- data/ext/sources/examples/talk-llama/llama-memory-hybrid-iswa.cpp +275 -0
- data/ext/sources/examples/talk-llama/llama-memory-hybrid-iswa.h +140 -0
- data/ext/sources/examples/talk-llama/llama-memory-recurrent.cpp +11 -13
- data/ext/sources/examples/talk-llama/llama-mmap.cpp +28 -11
- data/ext/sources/examples/talk-llama/llama-model-loader.cpp +527 -119
- data/ext/sources/examples/talk-llama/llama-model-loader.h +35 -5
- data/ext/sources/examples/talk-llama/llama-model-saver.cpp +60 -46
- data/ext/sources/examples/talk-llama/llama-model-saver.h +5 -2
- data/ext/sources/examples/talk-llama/llama-model.cpp +1365 -647
- data/ext/sources/examples/talk-llama/llama-model.h +72 -19
- data/ext/sources/examples/talk-llama/llama-quant.cpp +578 -346
- data/ext/sources/examples/talk-llama/{llama-sampling.cpp → llama-sampler.cpp} +190 -76
- data/ext/sources/examples/talk-llama/{llama-sampling.h → llama-sampler.h} +0 -2
- data/ext/sources/examples/talk-llama/llama-vocab.cpp +118 -48
- data/ext/sources/examples/talk-llama/llama-vocab.h +5 -0
- data/ext/sources/examples/talk-llama/llama.cpp +76 -22
- data/ext/sources/examples/talk-llama/llama.h +63 -30
- data/ext/sources/examples/talk-llama/models/afmoe.cpp +2 -3
- data/ext/sources/examples/talk-llama/models/apertus.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/arcee.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/arctic.cpp +4 -5
- data/ext/sources/examples/talk-llama/models/baichuan.cpp +4 -3
- data/ext/sources/examples/talk-llama/models/bailingmoe.cpp +1 -2
- data/ext/sources/examples/talk-llama/models/bailingmoe2.cpp +3 -5
- data/ext/sources/examples/talk-llama/models/bert.cpp +13 -7
- data/ext/sources/examples/talk-llama/models/bitnet.cpp +9 -24
- data/ext/sources/examples/talk-llama/models/bloom.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/chameleon.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/chatglm.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/codeshell.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/cogvlm.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/cohere2-iswa.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/command-r.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/dbrx.cpp +4 -5
- data/ext/sources/examples/talk-llama/models/deci.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/deepseek.cpp +4 -6
- data/ext/sources/examples/talk-llama/models/deepseek2.cpp +24 -21
- data/ext/sources/examples/talk-llama/models/delta-net-base.cpp +445 -0
- data/ext/sources/examples/talk-llama/models/dots1.cpp +4 -6
- data/ext/sources/examples/talk-llama/models/dream.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/ernie4-5-moe.cpp +4 -6
- data/ext/sources/examples/talk-llama/models/ernie4-5.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/eurobert.cpp +97 -0
- data/ext/sources/examples/talk-llama/models/exaone-moe.cpp +145 -0
- data/ext/sources/examples/talk-llama/models/exaone.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/exaone4.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/falcon-h1.cpp +2 -4
- data/ext/sources/examples/talk-llama/models/falcon.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/gemma-embedding.cpp +1 -1
- data/ext/sources/examples/talk-llama/models/gemma.cpp +1 -1
- data/ext/sources/examples/talk-llama/models/gemma2-iswa.cpp +1 -1
- data/ext/sources/examples/talk-llama/models/gemma3.cpp +1 -1
- data/ext/sources/examples/talk-llama/models/gemma3n-iswa.cpp +7 -7
- data/ext/sources/examples/talk-llama/models/glm4-moe.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/glm4.cpp +14 -7
- data/ext/sources/examples/talk-llama/models/gpt2.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/gptneox.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/granite-hybrid.cpp +4 -5
- data/ext/sources/examples/talk-llama/models/granite.cpp +4 -5
- data/ext/sources/examples/talk-llama/models/grok.cpp +4 -4
- data/ext/sources/examples/talk-llama/models/grovemoe.cpp +5 -7
- data/ext/sources/examples/talk-llama/models/hunyuan-dense.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/hunyuan-moe.cpp +4 -5
- data/ext/sources/examples/talk-llama/models/internlm2.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/jais.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/jais2.cpp +123 -0
- data/ext/sources/examples/talk-llama/models/jamba.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/kimi-linear.cpp +381 -0
- data/ext/sources/examples/talk-llama/models/lfm2.cpp +145 -124
- data/ext/sources/examples/talk-llama/models/llada-moe.cpp +4 -4
- data/ext/sources/examples/talk-llama/models/llada.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/llama-iswa.cpp +4 -4
- data/ext/sources/examples/talk-llama/models/llama.cpp +18 -11
- data/ext/sources/examples/talk-llama/models/maincoder.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/{graph-context-mamba.cpp → mamba-base.cpp} +9 -3
- data/ext/sources/examples/talk-llama/models/mamba.cpp +1 -2
- data/ext/sources/examples/talk-llama/models/mimo2-iswa.cpp +11 -5
- data/ext/sources/examples/talk-llama/models/minicpm3.cpp +14 -13
- data/ext/sources/examples/talk-llama/models/minimax-m2.cpp +4 -5
- data/ext/sources/examples/talk-llama/models/mistral3.cpp +4 -4
- data/ext/sources/examples/talk-llama/models/models.h +181 -46
- data/ext/sources/examples/talk-llama/models/modern-bert.cpp +2 -9
- data/ext/sources/examples/talk-llama/models/mpt.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/nemotron-h.cpp +26 -14
- data/ext/sources/examples/talk-llama/models/nemotron.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/neo-bert.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/olmo.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/olmo2.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/olmoe.cpp +4 -4
- data/ext/sources/examples/talk-llama/models/openai-moe-iswa.cpp +1 -1
- data/ext/sources/examples/talk-llama/models/openelm.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/orion.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/paddleocr.cpp +122 -0
- data/ext/sources/examples/talk-llama/models/pangu-embedded.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/phi2.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/phi3.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/plamo.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/plamo2.cpp +9 -5
- data/ext/sources/examples/talk-llama/models/plamo3.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/plm.cpp +15 -14
- data/ext/sources/examples/talk-llama/models/qwen.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/qwen2.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/qwen2moe.cpp +4 -4
- data/ext/sources/examples/talk-llama/models/qwen2vl.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/qwen3.cpp +12 -9
- data/ext/sources/examples/talk-llama/models/qwen35.cpp +381 -0
- data/ext/sources/examples/talk-llama/models/qwen35moe.cpp +422 -0
- data/ext/sources/examples/talk-llama/models/qwen3moe.cpp +15 -8
- data/ext/sources/examples/talk-llama/models/qwen3next.cpp +84 -432
- data/ext/sources/examples/talk-llama/models/qwen3vl-moe.cpp +9 -18
- data/ext/sources/examples/talk-llama/models/qwen3vl.cpp +8 -17
- data/ext/sources/examples/talk-llama/models/refact.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/rnd1.cpp +4 -4
- data/ext/sources/examples/talk-llama/models/rwkv6-base.cpp +2 -0
- data/ext/sources/examples/talk-llama/models/rwkv7-base.cpp +2 -0
- data/ext/sources/examples/talk-llama/models/seed-oss.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/smallthinker.cpp +4 -4
- data/ext/sources/examples/talk-llama/models/smollm3.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/stablelm.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/starcoder.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/starcoder2.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/step35-iswa.cpp +165 -0
- data/ext/sources/examples/talk-llama/models/t5-dec.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/t5-enc.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/xverse.cpp +3 -3
- data/ext/sources/examples/talk-llama/unicode.cpp +21 -65
- data/ext/sources/ggml/CMakeLists.txt +9 -3
- data/ext/sources/ggml/include/ggml-backend.h +1 -1
- data/ext/sources/ggml/include/ggml-cann.h +1 -1
- data/ext/sources/ggml/include/ggml-cpu.h +5 -0
- data/ext/sources/ggml/include/ggml-openvino.h +37 -0
- data/ext/sources/ggml/include/ggml-opt.h +1 -1
- data/ext/sources/ggml/include/ggml-rpc.h +6 -1
- data/ext/sources/ggml/include/ggml-virtgpu.h +14 -0
- data/ext/sources/ggml/include/ggml.h +56 -9
- data/ext/sources/ggml/src/CMakeLists.txt +3 -0
- data/ext/sources/ggml/src/ggml-alloc.c +4 -9
- data/ext/sources/ggml/src/ggml-backend-dl.cpp +48 -0
- data/ext/sources/ggml/src/ggml-backend-dl.h +45 -0
- data/ext/sources/ggml/src/ggml-backend-reg.cpp +28 -86
- data/ext/sources/ggml/src/ggml-backend.cpp +5 -2
- data/ext/sources/ggml/src/ggml-blas/CMakeLists.txt +1 -1
- data/ext/sources/ggml/src/ggml-blas/ggml-blas.cpp +6 -2
- data/ext/sources/ggml/src/ggml-cann/acl_tensor.cpp +1 -1
- data/ext/sources/ggml/src/ggml-cann/acl_tensor.h +1 -1
- data/ext/sources/ggml/src/ggml-cann/aclnn_ops.cpp +348 -189
- data/ext/sources/ggml/src/ggml-cann/aclnn_ops.h +40 -85
- data/ext/sources/ggml/src/ggml-cann/common.h +3 -4
- data/ext/sources/ggml/src/ggml-cann/ggml-cann.cpp +44 -62
- data/ext/sources/ggml/src/ggml-common.h +11 -0
- data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +16 -11
- data/ext/sources/ggml/src/ggml-cpu/amx/amx.cpp +42 -19
- data/ext/sources/ggml/src/ggml-cpu/amx/common.h +34 -10
- data/ext/sources/ggml/src/ggml-cpu/amx/mmq.cpp +85 -85
- data/ext/sources/ggml/src/ggml-cpu/arch/arm/quants.c +85 -1
- data/ext/sources/ggml/src/ggml-cpu/arch/arm/repack.cpp +2744 -548
- data/ext/sources/ggml/src/ggml-cpu/arch/riscv/quants.c +1653 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/riscv/repack.cpp +1391 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/s390/quants.c +8 -10
- data/ext/sources/ggml/src/ggml-cpu/arch/x86/quants.c +9 -9
- data/ext/sources/ggml/src/ggml-cpu/arch/x86/repack.cpp +118 -18
- data/ext/sources/ggml/src/ggml-cpu/arch-fallback.h +107 -26
- data/ext/sources/ggml/src/ggml-cpu/binary-ops.cpp +2 -6
- data/ext/sources/ggml/src/ggml-cpu/common.h +8 -0
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-impl.h +3 -0
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +59 -12
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.cpp +15 -0
- data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.cpp +21 -20
- data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +965 -252
- data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +584 -197
- data/ext/sources/ggml/src/ggml-cpu/ops.cpp +903 -188
- data/ext/sources/ggml/src/ggml-cpu/ops.h +1 -0
- data/ext/sources/ggml/src/ggml-cpu/quants.c +40 -0
- data/ext/sources/ggml/src/ggml-cpu/quants.h +3 -0
- data/ext/sources/ggml/src/ggml-cpu/repack.cpp +2890 -679
- data/ext/sources/ggml/src/ggml-cpu/repack.h +119 -8
- data/ext/sources/ggml/src/ggml-cpu/simd-gemm.h +136 -0
- data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +111 -3
- data/ext/sources/ggml/src/ggml-cpu/unary-ops.cpp +1 -1
- data/ext/sources/ggml/src/ggml-cpu/vec.cpp +17 -0
- data/ext/sources/ggml/src/ggml-cuda/CMakeLists.txt +1 -1
- data/ext/sources/ggml/src/ggml-cuda/argsort.cu +19 -10
- data/ext/sources/ggml/src/ggml-cuda/binbcast.cu +32 -30
- data/ext/sources/ggml/src/ggml-cuda/common.cuh +134 -18
- data/ext/sources/ggml/src/ggml-cuda/convert.cu +41 -27
- data/ext/sources/ggml/src/ggml-cuda/cpy.cu +6 -3
- data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +78 -64
- data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +384 -143
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cuh +36 -22
- data/ext/sources/ggml/src/ggml-cuda/fattn-vec.cuh +3 -3
- data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +26 -5
- data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cuh +1 -1
- data/ext/sources/ggml/src/ggml-cuda/fattn.cu +127 -12
- data/ext/sources/ggml/src/ggml-cuda/gated_delta_net.cu +263 -0
- data/ext/sources/ggml/src/ggml-cuda/gated_delta_net.cuh +4 -0
- data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +595 -200
- data/ext/sources/ggml/src/ggml-cuda/mean.cu +9 -8
- data/ext/sources/ggml/src/ggml-cuda/mma.cuh +173 -6
- data/ext/sources/ggml/src/ggml-cuda/mmf.cu +30 -10
- data/ext/sources/ggml/src/ggml-cuda/mmf.cuh +158 -85
- data/ext/sources/ggml/src/ggml-cuda/mmq.cuh +34 -22
- data/ext/sources/ggml/src/ggml-cuda/mmvf.cu +127 -67
- data/ext/sources/ggml/src/ggml-cuda/mmvf.cuh +2 -0
- data/ext/sources/ggml/src/ggml-cuda/mmvq.cu +157 -65
- data/ext/sources/ggml/src/ggml-cuda/mmvq.cuh +1 -0
- data/ext/sources/ggml/src/ggml-cuda/norm.cu +18 -76
- data/ext/sources/ggml/src/ggml-cuda/pad.cu +13 -10
- data/ext/sources/ggml/src/ggml-cuda/quantize.cu +1 -1
- data/ext/sources/ggml/src/ggml-cuda/reduce_rows.cuh +2 -16
- data/ext/sources/ggml/src/ggml-cuda/rope.cu +233 -133
- data/ext/sources/ggml/src/ggml-cuda/softmax.cu +8 -83
- data/ext/sources/ggml/src/ggml-cuda/solve_tri.cu +1 -1
- data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cu +56 -32
- data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cuh +1 -1
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_32.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_4.cu +1 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_32.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_4.cu +1 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_4.cu +1 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_4.cu +1 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +3 -3
- data/ext/sources/ggml/src/ggml-cuda/top-k.cu +0 -1
- data/ext/sources/ggml/src/ggml-cuda/topk-moe.cu +199 -135
- data/ext/sources/ggml/src/ggml-cuda/topk-moe.cuh +20 -14
- data/ext/sources/ggml/src/ggml-cuda/unary.cu +55 -0
- data/ext/sources/ggml/src/ggml-cuda/unary.cuh +2 -0
- data/ext/sources/ggml/src/ggml-cuda/vecdotq.cuh +31 -17
- data/ext/sources/ggml/src/ggml-cuda/vendors/hip.h +10 -0
- data/ext/sources/ggml/src/ggml-hexagon/CMakeLists.txt +82 -45
- data/ext/sources/ggml/src/ggml-hexagon/ggml-hexagon.cpp +334 -160
- data/ext/sources/ggml/src/ggml-hexagon/htp/CMakeLists.txt +7 -5
- data/ext/sources/ggml/src/ggml-hexagon/htp/act-ops.c +328 -197
- data/ext/sources/ggml/src/ggml-hexagon/htp/argsort-ops.c +281 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/binary-ops.c +765 -234
- data/ext/sources/ggml/src/ggml-hexagon/htp/cpy-ops.c +252 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/flash-attn-ops.c +412 -265
- data/ext/sources/ggml/src/ggml-hexagon/htp/get-rows-ops.c +23 -23
- data/ext/sources/ggml/src/ggml-hexagon/htp/{htp-dma.c → hex-dma.c} +1 -1
- data/ext/sources/ggml/src/ggml-hexagon/htp/{htp-dma.h → hex-dma.h} +28 -3
- data/ext/sources/ggml/src/ggml-hexagon/htp/hex-dump.h +77 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hex-fastdiv.h +37 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hex-utils.h +51 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ctx.h +1 -1
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp-msg.h +27 -37
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ops.h +6 -35
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-arith.h +443 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-base.h +240 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-copy.h +245 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-div.h +251 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-dump.h +129 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-exp.h +215 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-floor.h +100 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-inverse.h +210 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-reduce.h +296 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-scale.h +133 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sigmoid.h +141 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sqrt.h +126 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-types.h +36 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-utils.h +20 -1347
- data/ext/sources/ggml/src/ggml-hexagon/htp/main.c +211 -13
- data/ext/sources/ggml/src/ggml-hexagon/htp/matmul-ops.c +1119 -952
- data/ext/sources/ggml/src/ggml-hexagon/htp/rope-ops.c +254 -244
- data/ext/sources/ggml/src/ggml-hexagon/htp/set-rows-ops.c +36 -36
- data/ext/sources/ggml/src/ggml-hexagon/htp/softmax-ops.c +155 -138
- data/ext/sources/ggml/src/ggml-hexagon/htp/ssm-conv.c +339 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/sum-rows-ops.c +128 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/unary-ops.c +209 -114
- data/ext/sources/ggml/src/ggml-hexagon/htp/worker-pool.c +1 -5
- data/ext/sources/ggml/src/ggml-hexagon/htp-drv.cpp +418 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp-drv.h +121 -0
- data/ext/sources/ggml/src/ggml-hexagon/libdl.h +79 -0
- data/ext/sources/ggml/src/ggml-hexagon/libggml-htp.inf +38 -0
- data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +6 -0
- data/ext/sources/ggml/src/ggml-impl.h +62 -0
- data/ext/sources/ggml/src/ggml-metal/CMakeLists.txt +10 -10
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-common.cpp +13 -2
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.h +8 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.m +147 -17
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.cpp +274 -73
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.h +22 -4
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.m +102 -36
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +174 -23
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.cpp +580 -280
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.h +5 -4
- data/ext/sources/ggml/src/ggml-metal/ggml-metal.cpp +320 -107
- data/ext/sources/ggml/src/ggml-metal/ggml-metal.metal +1068 -825
- data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +19 -1
- data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +3108 -636
- data/ext/sources/ggml/src/ggml-opencl/kernels/concat.cl +41 -99
- data/ext/sources/ggml/src/ggml-opencl/kernels/cpy.cl +45 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/cumsum.cl +139 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/cvt.cl +204 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/diag.cl +27 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/exp.cl +125 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/expm1.cl +87 -56
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q4_1_f32.cl +132 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_general_q8_0_f32.cl +195 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q4_1_f32.cl +283 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/l2_norm.cl +71 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mean.cl +114 -13
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q4_0_f32_l4_lm.cl +163 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q4_1_f32_l4_lm.cl +165 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q6_k_f32_l4_lm.cl +158 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_8x4.cl +129 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32.cl +219 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32_flat.cl +229 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_k_f32.cl +180 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/{mul_mv_q6_k.cl → mul_mv_q6_k_f32.cl} +4 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32_flat.cl +194 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/neg.cl +125 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/repeat.cl +31 -32
- data/ext/sources/ggml/src/ggml-opencl/kernels/scale.cl +14 -4
- data/ext/sources/ggml/src/ggml-opencl/kernels/softplus.cl +88 -60
- data/ext/sources/ggml/src/ggml-opencl/kernels/solve_tri.cl +51 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/sum_rows.cl +114 -13
- data/ext/sources/ggml/src/ggml-opencl/kernels/tanh.cl +94 -48
- data/ext/sources/ggml/src/ggml-opencl/kernels/transpose.cl +26 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/tri.cl +32 -0
- data/ext/sources/ggml/src/ggml-openvino/.clang-format +154 -0
- data/ext/sources/ggml/src/ggml-openvino/CMakeLists.txt +22 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-decoder.cpp +975 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-decoder.h +294 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-openvino-extra.cpp +373 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-openvino-extra.h +182 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-openvino.cpp +1110 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-quants.cpp +884 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-quants.h +153 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/decoder.h +74 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/frontend.cpp +27 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/frontend.h +23 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/input_model.cpp +17 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/input_model.h +29 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/node_context.h +112 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/cont.cpp +48 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/cpy.cpp +21 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp +90 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/get_rows.cpp +69 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/glu_geglu.cpp +61 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp +62 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/mulmat.cpp +90 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/permute.cpp +102 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/reshape.cpp +83 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp +46 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/rope.cpp +123 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/scale.cpp +41 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/set_rows.cpp +76 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/softmax.cpp +89 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/transpose.cpp +23 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/unary_silu.cpp +27 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/view.cpp +53 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op_table.cpp +46 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op_table.h +39 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.cpp +123 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.h +17 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.cpp +60 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.h +17 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/mark_decompression_convert_constant_folding.h +29 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/squeeze_matmul.cpp +58 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/squeeze_matmul.h +17 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/translate_session.cpp +293 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/translate_session.h +28 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/utils.cpp +226 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/utils.h +85 -0
- data/ext/sources/ggml/src/ggml-openvino/utils.cpp +823 -0
- data/ext/sources/ggml/src/ggml-openvino/utils.h +123 -0
- data/ext/sources/ggml/src/ggml-quants.c +96 -5
- data/ext/sources/ggml/src/ggml-quants.h +3 -0
- data/ext/sources/ggml/src/ggml-sycl/CMakeLists.txt +15 -88
- data/ext/sources/ggml/src/ggml-sycl/add-id.cpp +5 -1
- data/ext/sources/ggml/src/ggml-sycl/backend.hpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/binbcast.cpp +21 -20
- data/ext/sources/ggml/src/ggml-sycl/common.hpp +315 -10
- data/ext/sources/ggml/src/ggml-sycl/convert.cpp +69 -1
- data/ext/sources/ggml/src/ggml-sycl/convert.hpp +22 -1
- data/ext/sources/ggml/src/ggml-sycl/count-equal.cpp +1 -1
- data/ext/sources/ggml/src/ggml-sycl/dpct/helper.hpp +791 -47
- data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +78 -68
- data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +2 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn-common.hpp +1179 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn-tile.cpp +55 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn-tile.hpp +1338 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn-vec.hpp +667 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn.cpp +225 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn.hpp +22 -0
- data/ext/sources/ggml/src/ggml-sycl/gated_delta_net.cpp +309 -0
- data/ext/sources/ggml/src/ggml-sycl/gated_delta_net.hpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/ggml-sycl.cpp +316 -51
- data/ext/sources/ggml/src/ggml-sycl/norm.cpp +65 -66
- data/ext/sources/ggml/src/ggml-sycl/outprod.cpp +3 -3
- data/ext/sources/ggml/src/ggml-sycl/presets.hpp +3 -0
- data/ext/sources/ggml/src/ggml-sycl/quants.hpp +1 -1
- data/ext/sources/ggml/src/ggml-sycl/rope.cpp +450 -287
- data/ext/sources/ggml/src/ggml-sycl/rope.hpp +6 -0
- data/ext/sources/ggml/src/ggml-sycl/softmax.cpp +6 -6
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq112-dv112.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq128-dv128.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq256-dv256.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq40-dv40.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq576-dv512.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq64-dv64.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq72-dv72.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq80-dv80.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq96-dv96.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-f16.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q8_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-f16.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q8_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-f16.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q8_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-f16.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q8_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-f16.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q8_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-f16.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q8_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +13 -0
- data/ext/sources/ggml/src/ggml-sycl/wkv.cpp +1 -1
- data/ext/sources/ggml/src/ggml-virtgpu/CMakeLists.txt +70 -0
- data/ext/sources/ggml/src/ggml-virtgpu/apir_cs_ggml-rpc-front.cpp +87 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/CMakeLists.txt +21 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/apir_cs_ggml-rpc-back.cpp +115 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-convert.h +13 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-backend.cpp +102 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer-type.cpp +105 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer.cpp +179 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-device.cpp +148 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched.cpp +51 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched.gen.h +73 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched.h +27 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-virgl-apir.h +32 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend.cpp +144 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/api_remoting.h +95 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_backend.gen.h +94 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_backend.h +50 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_cs.h +378 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_cs_ggml.h +232 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_cs_rpc.h +58 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-buffer-type.cpp +81 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-buffer.cpp +119 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-device.cpp +158 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-reg.cpp +213 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend.cpp +69 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-remoting.h +71 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggmlremoting_functions.yaml +166 -0
- data/ext/sources/ggml/src/ggml-virtgpu/include/apir_hw.h +9 -0
- data/ext/sources/ggml/src/ggml-virtgpu/regenerate_remoting.py +333 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-apir.h +15 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-backend.cpp +58 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-buffer-type.cpp +110 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-buffer.cpp +173 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-device.cpp +192 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-impl.h +36 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward.gen.h +53 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-shm.cpp +98 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-shm.h +23 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-utils.cpp +179 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-utils.h +86 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu.cpp +544 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu.h +117 -0
- data/ext/sources/ggml/src/ggml-vulkan/CMakeLists.txt +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/ggml-vulkan.cpp +1250 -465
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/acc.comp +16 -8
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/elu.comp +27 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +374 -170
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.glsl +66 -22
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +389 -201
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +106 -58
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_mask_opt.comp +162 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +9 -8
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gated_delta_net.comp +128 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/l2_norm.comp +12 -9
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.glsl +20 -17
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +11 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +8 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_id_funcs.glsl +3 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +5 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.glsl +3 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +2 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl +36 -63
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +7 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +7 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +7 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl +10 -5
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_vision.comp +7 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sgn.comp +21 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/ssm_conv.comp +16 -10
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +55 -35
- data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp +1314 -109
- data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu.cpp +1660 -1371
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/argmax.wgsl +72 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/argsort.wgsl +106 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/argsort_merge.wgsl +134 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary.wgsl +141 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +65 -72
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/concat.wgsl +75 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cpy.tmpl.wgsl +6 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cumsum.wgsl +66 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +40 -5
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn.wgsl +105 -60
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{get_rows.tmpl.wgsl → get_rows.wgsl} +53 -259
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{mul_mat.tmpl.wgsl → mul_mat.wgsl} +68 -257
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl +692 -23
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{mul_mat_reg_tile.tmpl.wgsl → mul_mat_reg_tile.wgsl} +28 -128
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{mul_mat_subgroup_matrix.tmpl.wgsl → mul_mat_subgroup_matrix.wgsl} +31 -137
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.wgsl +480 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/pad.wgsl +86 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/repeat.wgsl +67 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{scale.tmpl.wgsl → scale.wgsl} +9 -36
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +40 -12
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/sum_rows.wgsl +55 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/unary.wgsl +193 -0
- data/ext/sources/ggml/src/ggml-zdnn/ggml-zdnn.cpp +6 -1
- data/ext/sources/ggml/src/ggml-zendnn/CMakeLists.txt +31 -32
- data/ext/sources/ggml/src/ggml-zendnn/ggml-zendnn.cpp +9 -6
- data/ext/sources/ggml/src/ggml.c +167 -33
- data/ext/sources/ggml/src/gguf.cpp +229 -44
- data/ext/sources/src/whisper.cpp +6 -28
- data/sig/whisper.rbs +43 -2
- data/test/test_context_params.rb +82 -0
- data/test/test_token.rb +11 -0
- data/test/test_vad_context.rb +58 -8
- data/test/test_whisper.rb +20 -0
- data/whispercpp.gemspec +1 -1
- metadata +240 -28
- data/ext/sources/ggml/cmake/BuildTypes.cmake +0 -54
- data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm-ppc.h +0 -333
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-exp.c +0 -94
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-inverse.c +0 -72
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sigmoid.c +0 -49
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-utils.c +0 -1020
- data/ext/sources/ggml/src/ggml-hexagon/htp/ops-utils.h +0 -149
- data/ext/sources/ggml/src/ggml-hexagon/htp-utils.c +0 -454
- data/ext/sources/ggml/src/ggml-hexagon/htp-utils.h +0 -221
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/bin_op.tmpl.wgsl +0 -188
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary_head.tmpl +0 -45
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.tmpl.wgsl +0 -267
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.tmpl.wgsl +0 -112
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/unary_op.wgsl +0 -483
|
@@ -24,6 +24,7 @@ class llama_kv_cache_context;
|
|
|
24
24
|
class llama_kv_cache_iswa_context;
|
|
25
25
|
class llama_memory_recurrent_context;
|
|
26
26
|
class llama_memory_hybrid_context;
|
|
27
|
+
class llama_memory_hybrid_iswa_context;
|
|
27
28
|
|
|
28
29
|
// certain models (typically multi-modal) can produce different types of graphs
|
|
29
30
|
enum llm_graph_type {
|
|
@@ -105,7 +106,7 @@ using llm_graph_input_ptr = std::unique_ptr<llm_graph_input_i>;
|
|
|
105
106
|
|
|
106
107
|
class llm_graph_input_embd : public llm_graph_input_i {
|
|
107
108
|
public:
|
|
108
|
-
llm_graph_input_embd()
|
|
109
|
+
llm_graph_input_embd(int64_t n_embd) : n_embd(n_embd) {}
|
|
109
110
|
virtual ~llm_graph_input_embd() = default;
|
|
110
111
|
|
|
111
112
|
void set_input(const llama_ubatch * ubatch) override;
|
|
@@ -114,6 +115,8 @@ public:
|
|
|
114
115
|
|
|
115
116
|
ggml_tensor * tokens = nullptr; // I32 [n_batch]
|
|
116
117
|
ggml_tensor * embd = nullptr; // F32 [n_embd, n_batch]
|
|
118
|
+
|
|
119
|
+
const int64_t n_embd = 0;
|
|
117
120
|
};
|
|
118
121
|
|
|
119
122
|
class llm_graph_input_pos : public llm_graph_input_i {
|
|
@@ -314,6 +317,39 @@ public:
|
|
|
314
317
|
const llama_kv_cache_context * mctx;
|
|
315
318
|
};
|
|
316
319
|
|
|
320
|
+
// V-less input for the KV cache
|
|
321
|
+
// ref: https://github.com/ggml-org/llama.cpp/pull/19067
|
|
322
|
+
class llm_graph_input_attn_k : public llm_graph_input_i {
|
|
323
|
+
public:
|
|
324
|
+
llm_graph_input_attn_k(
|
|
325
|
+
const llama_hparams & hparams,
|
|
326
|
+
const llama_cparams & cparams,
|
|
327
|
+
const llama_kv_cache_context * mctx) :
|
|
328
|
+
hparams(hparams),
|
|
329
|
+
cparams(cparams),
|
|
330
|
+
mctx(mctx) {
|
|
331
|
+
}
|
|
332
|
+
~llm_graph_input_attn_k() = default;
|
|
333
|
+
|
|
334
|
+
void set_input(const llama_ubatch * ubatch) override;
|
|
335
|
+
|
|
336
|
+
bool can_reuse(const llm_graph_params & params) override;
|
|
337
|
+
|
|
338
|
+
ggml_tensor * get_k_idxs() const { return self_k_idxs; }
|
|
339
|
+
|
|
340
|
+
ggml_tensor * get_kq_mask() const { return self_kq_mask_cnv; }
|
|
341
|
+
|
|
342
|
+
ggml_tensor * self_k_idxs = nullptr; // I64 [n_batch]
|
|
343
|
+
|
|
344
|
+
ggml_tensor * self_kq_mask = nullptr; // F32 [n_kv, n_batch/n_stream, 1, n_stream]
|
|
345
|
+
ggml_tensor * self_kq_mask_cnv = nullptr; // [n_kv, n_batch/n_stream, 1, n_stream]
|
|
346
|
+
|
|
347
|
+
const llama_hparams hparams;
|
|
348
|
+
const llama_cparams cparams;
|
|
349
|
+
|
|
350
|
+
const llama_kv_cache_context * mctx;
|
|
351
|
+
};
|
|
352
|
+
|
|
317
353
|
class llm_graph_input_attn_kv_iswa : public llm_graph_input_i {
|
|
318
354
|
public:
|
|
319
355
|
llm_graph_input_attn_kv_iswa(
|
|
@@ -397,6 +433,62 @@ public:
|
|
|
397
433
|
const llama_memory_hybrid_context * mctx;
|
|
398
434
|
};
|
|
399
435
|
|
|
436
|
+
class llm_graph_input_mem_hybrid_k : public llm_graph_input_i {
|
|
437
|
+
public:
|
|
438
|
+
llm_graph_input_mem_hybrid_k(
|
|
439
|
+
const llama_cparams & cparams,
|
|
440
|
+
std::unique_ptr<llm_graph_input_attn_k> inp_attn,
|
|
441
|
+
std::unique_ptr<llm_graph_input_rs> inp_rs,
|
|
442
|
+
const llama_memory_hybrid_context * mctx) :
|
|
443
|
+
inp_attn(std::move(inp_attn)),
|
|
444
|
+
inp_rs(std::move(inp_rs)),
|
|
445
|
+
cparams(cparams),
|
|
446
|
+
mctx(mctx) { }
|
|
447
|
+
virtual ~llm_graph_input_mem_hybrid_k() = default;
|
|
448
|
+
|
|
449
|
+
void set_input(const llama_ubatch * ubatch) override;
|
|
450
|
+
|
|
451
|
+
bool can_reuse(const llm_graph_params & params) override;
|
|
452
|
+
|
|
453
|
+
std::unique_ptr<llm_graph_input_attn_k> inp_attn;
|
|
454
|
+
std::unique_ptr<llm_graph_input_rs> inp_rs;
|
|
455
|
+
|
|
456
|
+
llm_graph_input_attn_k * get_attn() const { return inp_attn.get(); }
|
|
457
|
+
llm_graph_input_rs * get_recr() const { return inp_rs.get(); }
|
|
458
|
+
|
|
459
|
+
const llama_cparams cparams;
|
|
460
|
+
|
|
461
|
+
const llama_memory_hybrid_context * mctx;
|
|
462
|
+
};
|
|
463
|
+
|
|
464
|
+
class llm_graph_input_mem_hybrid_iswa : public llm_graph_input_i {
|
|
465
|
+
public:
|
|
466
|
+
llm_graph_input_mem_hybrid_iswa(
|
|
467
|
+
const llama_cparams & cparams,
|
|
468
|
+
std::unique_ptr<llm_graph_input_attn_kv_iswa> inp_attn,
|
|
469
|
+
std::unique_ptr<llm_graph_input_rs> inp_rs,
|
|
470
|
+
const llama_memory_hybrid_iswa_context * mctx) :
|
|
471
|
+
inp_attn(std::move(inp_attn)),
|
|
472
|
+
inp_rs(std::move(inp_rs)),
|
|
473
|
+
cparams(cparams),
|
|
474
|
+
mctx(mctx) { }
|
|
475
|
+
virtual ~llm_graph_input_mem_hybrid_iswa() = default;
|
|
476
|
+
|
|
477
|
+
void set_input(const llama_ubatch * ubatch) override;
|
|
478
|
+
|
|
479
|
+
bool can_reuse(const llm_graph_params & params) override;
|
|
480
|
+
|
|
481
|
+
std::unique_ptr<llm_graph_input_attn_kv_iswa> inp_attn;
|
|
482
|
+
std::unique_ptr<llm_graph_input_rs> inp_rs;
|
|
483
|
+
|
|
484
|
+
llm_graph_input_attn_kv_iswa * get_attn() const { return inp_attn.get(); }
|
|
485
|
+
llm_graph_input_rs * get_recr() const { return inp_rs.get(); }
|
|
486
|
+
|
|
487
|
+
const llama_cparams cparams;
|
|
488
|
+
|
|
489
|
+
const llama_memory_hybrid_iswa_context * mctx;
|
|
490
|
+
};
|
|
491
|
+
|
|
400
492
|
class llm_graph_input_sampling : public llm_graph_input_i {
|
|
401
493
|
public:
|
|
402
494
|
llm_graph_input_sampling(std::map<llama_seq_id, llama_sampler *> samplers) :
|
|
@@ -537,7 +629,7 @@ public:
|
|
|
537
629
|
|
|
538
630
|
virtual ~llm_graph_result() = default;
|
|
539
631
|
|
|
540
|
-
ggml_tensor *
|
|
632
|
+
ggml_tensor * get_inp_tokens() const { return t_inp_tokens; }
|
|
541
633
|
ggml_tensor * get_logits() const { return t_logits; }
|
|
542
634
|
ggml_tensor * get_embd() const { return t_embd; }
|
|
543
635
|
ggml_tensor * get_embd_pooled() const { return t_embd_pooled; }
|
|
@@ -564,7 +656,8 @@ public:
|
|
|
564
656
|
void set_params(const llm_graph_params & params);
|
|
565
657
|
|
|
566
658
|
// important graph nodes
|
|
567
|
-
ggml_tensor *
|
|
659
|
+
ggml_tensor * t_inp_tokens = nullptr;
|
|
660
|
+
ggml_tensor * t_inp_embd = nullptr; // [n_embd_inp, n_tokens]
|
|
568
661
|
ggml_tensor * t_logits = nullptr;
|
|
569
662
|
ggml_tensor * t_embd = nullptr;
|
|
570
663
|
ggml_tensor * t_embd_pooled = nullptr;
|
|
@@ -671,10 +764,11 @@ struct llm_graph_context {
|
|
|
671
764
|
ggml_tensor * cur,
|
|
672
765
|
int il) const;
|
|
673
766
|
|
|
674
|
-
// do mat_mul, while optionally apply lora
|
|
767
|
+
// do mat_mul, while optionally apply lora and per-tensor scale
|
|
675
768
|
ggml_tensor * build_lora_mm(
|
|
676
769
|
ggml_tensor * w,
|
|
677
|
-
ggml_tensor * cur
|
|
770
|
+
ggml_tensor * cur,
|
|
771
|
+
ggml_tensor * w_s = nullptr) const;
|
|
678
772
|
|
|
679
773
|
// do mat_mul_id, while optionally apply lora
|
|
680
774
|
ggml_tensor * build_lora_mm_id(
|
|
@@ -717,11 +811,14 @@ struct llm_graph_context {
|
|
|
717
811
|
int64_t n_expert_used,
|
|
718
812
|
llm_ffn_op_type type_op,
|
|
719
813
|
bool norm_w,
|
|
720
|
-
bool scale_w,
|
|
721
814
|
float w_scale,
|
|
722
815
|
llama_expert_gating_func_type gating_op,
|
|
723
816
|
int il,
|
|
724
|
-
ggml_tensor * probs_in = nullptr
|
|
817
|
+
ggml_tensor * probs_in = nullptr,
|
|
818
|
+
ggml_tensor * gate_up_exps = nullptr,
|
|
819
|
+
ggml_tensor * up_exps_s = nullptr,
|
|
820
|
+
ggml_tensor * gate_exps_s = nullptr,
|
|
821
|
+
ggml_tensor * down_exps_s = nullptr) const;
|
|
725
822
|
|
|
726
823
|
ggml_tensor * build_moe_ffn(
|
|
727
824
|
ggml_tensor * cur,
|
|
@@ -738,11 +835,15 @@ struct llm_graph_context {
|
|
|
738
835
|
int64_t n_expert_used,
|
|
739
836
|
llm_ffn_op_type type_op,
|
|
740
837
|
bool norm_w,
|
|
741
|
-
bool scale_w,
|
|
742
838
|
float w_scale,
|
|
743
839
|
llama_expert_gating_func_type gating_op,
|
|
744
840
|
int il,
|
|
745
|
-
ggml_tensor * probs_in = nullptr
|
|
841
|
+
ggml_tensor * probs_in = nullptr,
|
|
842
|
+
ggml_tensor * gate_up_exps = nullptr,
|
|
843
|
+
ggml_tensor * gate_up_exps_b = nullptr,
|
|
844
|
+
ggml_tensor * up_exps_s = nullptr,
|
|
845
|
+
ggml_tensor * gate_exps_s = nullptr,
|
|
846
|
+
ggml_tensor * down_exps_s = nullptr) const;
|
|
746
847
|
|
|
747
848
|
//
|
|
748
849
|
// inputs
|
|
@@ -801,6 +902,21 @@ struct llm_graph_context {
|
|
|
801
902
|
ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
|
|
802
903
|
ggml_tensor * kq_b,
|
|
803
904
|
ggml_tensor * sinks, // [n_head_q]
|
|
905
|
+
ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v] // TODO: remove
|
|
906
|
+
float kq_scale,
|
|
907
|
+
int il) const;
|
|
908
|
+
|
|
909
|
+
llm_graph_input_attn_k * build_attn_inp_k() const;
|
|
910
|
+
|
|
911
|
+
ggml_tensor * build_attn(
|
|
912
|
+
llm_graph_input_attn_k * inp,
|
|
913
|
+
ggml_tensor * wo,
|
|
914
|
+
ggml_tensor * wo_b,
|
|
915
|
+
ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
|
|
916
|
+
ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens]
|
|
917
|
+
ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
|
|
918
|
+
ggml_tensor * kq_b,
|
|
919
|
+
ggml_tensor * sinks, // [n_head_q]
|
|
804
920
|
ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
|
|
805
921
|
float kq_scale,
|
|
806
922
|
int il) const;
|
|
@@ -880,6 +996,9 @@ struct llm_graph_context {
|
|
|
880
996
|
//
|
|
881
997
|
|
|
882
998
|
llm_graph_input_mem_hybrid * build_inp_mem_hybrid() const;
|
|
999
|
+
llm_graph_input_mem_hybrid_k * build_inp_mem_hybrid_k() const;
|
|
1000
|
+
|
|
1001
|
+
llm_graph_input_mem_hybrid_iswa * build_inp_mem_hybrid_iswa() const;
|
|
883
1002
|
|
|
884
1003
|
//
|
|
885
1004
|
// pooling
|
|
@@ -889,7 +1008,8 @@ struct llm_graph_context {
|
|
|
889
1008
|
ggml_tensor * cls,
|
|
890
1009
|
ggml_tensor * cls_b,
|
|
891
1010
|
ggml_tensor * cls_out,
|
|
892
|
-
ggml_tensor * cls_out_b
|
|
1011
|
+
ggml_tensor * cls_out_b,
|
|
1012
|
+
ggml_tensor * cls_norm) const;
|
|
893
1013
|
|
|
894
1014
|
//
|
|
895
1015
|
// sampling (backend sampling)
|
|
@@ -903,6 +1023,7 @@ struct llm_graph_context {
|
|
|
903
1023
|
|
|
904
1024
|
void build_dense_out(
|
|
905
1025
|
ggml_tensor * dense_2,
|
|
1026
|
+
ggml_tensor * dense_2_b,
|
|
906
1027
|
ggml_tensor * dense_3) const;
|
|
907
1028
|
};
|
|
908
1029
|
|
|
@@ -62,6 +62,14 @@ uint32_t llama_hparams::n_gqa(uint32_t il) const {
|
|
|
62
62
|
return n_head/n_head_kv;
|
|
63
63
|
}
|
|
64
64
|
|
|
65
|
+
uint32_t llama_hparams::n_rot(uint32_t il) const {
|
|
66
|
+
if (il < n_layer) {
|
|
67
|
+
return is_swa(il) ? n_rot_swa : n_rot_full;
|
|
68
|
+
}
|
|
69
|
+
|
|
70
|
+
GGML_ABORT("fatal error");
|
|
71
|
+
}
|
|
72
|
+
|
|
65
73
|
uint32_t llama_hparams::n_embd_inp() const {
|
|
66
74
|
uint32_t n_embd_inp = n_embd;
|
|
67
75
|
|
|
@@ -72,20 +80,36 @@ uint32_t llama_hparams::n_embd_inp() const {
|
|
|
72
80
|
return n_embd_inp;
|
|
73
81
|
}
|
|
74
82
|
|
|
75
|
-
uint32_t llama_hparams::
|
|
76
|
-
return
|
|
83
|
+
uint32_t llama_hparams::n_embd_out() const {
|
|
84
|
+
return n_embd_out_impl > 0 ? n_embd_out_impl : n_embd;
|
|
85
|
+
}
|
|
86
|
+
|
|
87
|
+
uint32_t llama_hparams::n_embd_head_k(uint32_t il) const {
|
|
88
|
+
if (il < n_layer) {
|
|
89
|
+
return is_swa(il) ? n_embd_head_k_swa : n_embd_head_k_full;
|
|
90
|
+
}
|
|
91
|
+
|
|
92
|
+
GGML_ABORT("fatal error");
|
|
93
|
+
}
|
|
94
|
+
|
|
95
|
+
uint32_t llama_hparams::n_embd_head_v(uint32_t il) const {
|
|
96
|
+
if (il < n_layer) {
|
|
97
|
+
return is_swa(il) ? n_embd_head_v_swa : n_embd_head_v_full;
|
|
98
|
+
}
|
|
99
|
+
|
|
100
|
+
GGML_ABORT("fatal error");
|
|
77
101
|
}
|
|
78
102
|
|
|
79
103
|
uint32_t llama_hparams::n_embd_k_gqa(uint32_t il) const {
|
|
80
104
|
const uint32_t n_head_kv = this->n_head_kv(il);
|
|
81
105
|
|
|
82
|
-
return n_embd_head_k * n_head_kv;
|
|
106
|
+
return n_embd_head_k(il) * n_head_kv;
|
|
83
107
|
}
|
|
84
108
|
|
|
85
109
|
uint32_t llama_hparams::n_embd_v_gqa(uint32_t il) const {
|
|
86
110
|
const uint32_t n_head_kv = this->n_head_kv(il);
|
|
87
111
|
|
|
88
|
-
return n_embd_head_v * n_head_kv;
|
|
112
|
+
return n_embd_head_v(il) * n_head_kv;
|
|
89
113
|
}
|
|
90
114
|
|
|
91
115
|
bool llama_hparams::is_n_embd_k_gqa_variable() const {
|
|
@@ -139,6 +163,13 @@ uint32_t llama_hparams::n_embd_r() const {
|
|
|
139
163
|
return n_embd * (n_shortconv_l_cache - 1);
|
|
140
164
|
}
|
|
141
165
|
|
|
166
|
+
if (n_embd_head_kda != 0) {
|
|
167
|
+
// for Kimi KDA layers
|
|
168
|
+
// Conv state for Q, K, V: 3 * (d_conv - 1) * n_head * head_dim
|
|
169
|
+
const uint32_t d_inner = n_head() * n_embd_head_kda; // 32 * 128 = 4096
|
|
170
|
+
return 3 * (ssm_d_conv > 0 ? ssm_d_conv - 1 : 3) * d_inner;
|
|
171
|
+
}
|
|
172
|
+
|
|
142
173
|
// TODO: maybe support other convolution strides than 1
|
|
143
174
|
// NOTE: since the first column of the conv_state is shifted out each time, it's not actually needed
|
|
144
175
|
// Corresponds to Mamba's conv_states size
|
|
@@ -151,6 +182,13 @@ uint32_t llama_hparams::n_embd_s() const {
|
|
|
151
182
|
return n_embd * wkv_head_size;
|
|
152
183
|
}
|
|
153
184
|
|
|
185
|
+
if (n_embd_head_kda != 0) {
|
|
186
|
+
// for Kimi KDA layers
|
|
187
|
+
// Full recurrent state: head_dim * head_dim * n_head
|
|
188
|
+
// h tensor shape for delta attention: [head_dim, head_dim, n_head]
|
|
189
|
+
return n_embd_head_kda * n_embd_head_kda * n_head(); // 128 * 128 * 32 = 524288
|
|
190
|
+
}
|
|
191
|
+
|
|
154
192
|
// corresponds to Mamba's ssm_states size
|
|
155
193
|
return ssm_d_state * ssm_d_inner;
|
|
156
194
|
}
|
|
@@ -175,6 +213,21 @@ bool llama_hparams::is_swa(uint32_t il) const {
|
|
|
175
213
|
GGML_ABORT("fatal error");
|
|
176
214
|
}
|
|
177
215
|
|
|
216
|
+
bool llama_hparams::is_mla() const {
|
|
217
|
+
assert((n_embd_head_k_mla_impl == 0 && n_embd_head_v_mla_impl == 0) ||
|
|
218
|
+
(n_embd_head_k_mla_impl != 0 && n_embd_head_v_mla_impl != 0));
|
|
219
|
+
|
|
220
|
+
return n_embd_head_k_mla_impl != 0 && n_embd_head_v_mla_impl != 0;
|
|
221
|
+
}
|
|
222
|
+
|
|
223
|
+
uint32_t llama_hparams::n_embd_head_k_mla() const {
|
|
224
|
+
return is_mla() ? n_embd_head_k_mla_impl : n_embd_head_k();
|
|
225
|
+
}
|
|
226
|
+
|
|
227
|
+
uint32_t llama_hparams::n_embd_head_v_mla() const {
|
|
228
|
+
return is_mla() ? n_embd_head_v_mla_impl : n_embd_head_v();
|
|
229
|
+
}
|
|
230
|
+
|
|
178
231
|
bool llama_hparams::has_kv(uint32_t il) const {
|
|
179
232
|
if (n_layer_kv_from_start >= 0) {
|
|
180
233
|
if (il < (uint32_t) n_layer_kv_from_start) {
|
|
@@ -200,42 +253,6 @@ uint32_t llama_hparams::n_layer_kv() const {
|
|
|
200
253
|
return res;
|
|
201
254
|
}
|
|
202
255
|
|
|
203
|
-
bool llama_hparams::is_masked_swa(uint32_t n_swa, llama_swa_type swa_type, llama_pos p0, llama_pos p1) {
|
|
204
|
-
assert(p0 >= 0 && p1 >= 0);
|
|
205
|
-
|
|
206
|
-
switch (swa_type) {
|
|
207
|
-
case LLAMA_SWA_TYPE_NONE:
|
|
208
|
-
{
|
|
209
|
-
} break;
|
|
210
|
-
case LLAMA_SWA_TYPE_STANDARD:
|
|
211
|
-
{
|
|
212
|
-
if (p1 - p0 >= (int32_t) n_swa) {
|
|
213
|
-
return true;
|
|
214
|
-
}
|
|
215
|
-
} break;
|
|
216
|
-
case LLAMA_SWA_TYPE_CHUNKED:
|
|
217
|
-
{
|
|
218
|
-
const llama_pos pos_chunk_start = (p1 / n_swa) * n_swa;
|
|
219
|
-
|
|
220
|
-
if (p0 < pos_chunk_start) {
|
|
221
|
-
return true;
|
|
222
|
-
}
|
|
223
|
-
} break;
|
|
224
|
-
case LLAMA_SWA_TYPE_SYMMETRIC:
|
|
225
|
-
{
|
|
226
|
-
const int32_t half_n_swa = (int32_t) n_swa / 2;
|
|
227
|
-
const int32_t pos_diff = p1 - p0;
|
|
228
|
-
|
|
229
|
-
// Mask if outside the symmetric window
|
|
230
|
-
if (pos_diff < -half_n_swa || pos_diff > half_n_swa) {
|
|
231
|
-
return true;
|
|
232
|
-
}
|
|
233
|
-
} break;
|
|
234
|
-
}
|
|
235
|
-
|
|
236
|
-
return false;
|
|
237
|
-
}
|
|
238
|
-
|
|
239
256
|
bool llama_hparams::use_mrope() const {
|
|
240
257
|
return rope_sections[0] > 0 && rope_sections[1] > 0;
|
|
241
258
|
}
|
|
@@ -3,6 +3,7 @@
|
|
|
3
3
|
#include "llama.h"
|
|
4
4
|
|
|
5
5
|
#include <array>
|
|
6
|
+
#include <cassert>
|
|
6
7
|
|
|
7
8
|
// bump if necessary
|
|
8
9
|
#define LLAMA_MAX_LAYERS 512
|
|
@@ -41,19 +42,25 @@ struct llama_hparams {
|
|
|
41
42
|
|
|
42
43
|
uint32_t n_ctx_train; // context size the model was trained on
|
|
43
44
|
uint32_t n_embd;
|
|
44
|
-
uint32_t n_embd_features = 0;
|
|
45
45
|
uint32_t n_layer;
|
|
46
46
|
int32_t n_layer_kv_from_start = -1; // if non-negative, the first n_layer_kv_from_start layers have KV cache
|
|
47
|
-
uint32_t n_rot;
|
|
48
|
-
uint32_t n_embd_head_k; // dimension of keys (d_k). d_q is assumed to be the same, but there are n_head q heads, and only n_head_kv k-v heads
|
|
49
|
-
uint32_t n_embd_head_v; // dimension of values (d_v) aka n_embd_head
|
|
50
47
|
uint32_t n_expert = 0;
|
|
51
48
|
uint32_t n_expert_used = 0;
|
|
52
49
|
uint32_t n_rel_attn_bkts = 0;
|
|
53
50
|
|
|
51
|
+
// different head size for full_attention and SWA layers
|
|
52
|
+
uint32_t n_embd_head_k_full; // dimension of keys (d_k). d_q is assumed to be the same, but there are n_head q heads, and only n_head_kv k-v heads
|
|
53
|
+
uint32_t n_embd_head_v_full; // dimension of values (d_v) aka n_embd_head
|
|
54
|
+
uint32_t n_embd_head_k_swa;
|
|
55
|
+
uint32_t n_embd_head_v_swa;
|
|
56
|
+
|
|
57
|
+
// different RoPE dimensions for full_attention and SWA layers
|
|
58
|
+
uint32_t n_rot_full;
|
|
59
|
+
uint32_t n_rot_swa;
|
|
60
|
+
|
|
54
61
|
// note: deepseek2 using MLA converts into MQA with larger heads, then decompresses to MHA
|
|
55
|
-
uint32_t
|
|
56
|
-
uint32_t
|
|
62
|
+
uint32_t n_embd_head_k_mla_impl = 0;
|
|
63
|
+
uint32_t n_embd_head_v_mla_impl = 0;
|
|
57
64
|
|
|
58
65
|
// for WavTokenizer
|
|
59
66
|
struct llama_hparams_posnet posnet;
|
|
@@ -82,6 +89,7 @@ struct llama_hparams {
|
|
|
82
89
|
bool expert_weights_norm = false;
|
|
83
90
|
uint32_t expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_NONE;
|
|
84
91
|
uint32_t moe_every_n_layers = 0;
|
|
92
|
+
uint32_t moe_latent_size = 0;
|
|
85
93
|
uint32_t nextn_predict_layers = 0;
|
|
86
94
|
|
|
87
95
|
float f_norm_eps;
|
|
@@ -136,6 +144,9 @@ struct llama_hparams {
|
|
|
136
144
|
uint32_t ssm_dt_rank = 0;
|
|
137
145
|
uint32_t ssm_n_group = 0;
|
|
138
146
|
|
|
147
|
+
// for Kimi Linear KDA
|
|
148
|
+
uint32_t n_embd_head_kda = 0;
|
|
149
|
+
|
|
139
150
|
// for hybrid state space models
|
|
140
151
|
std::array<bool, LLAMA_MAX_LAYERS> recurrent_layer_arr;
|
|
141
152
|
|
|
@@ -163,7 +174,7 @@ struct llama_hparams {
|
|
|
163
174
|
uint32_t n_cls_out = 1;
|
|
164
175
|
|
|
165
176
|
// output embedding dimension (0 = use n_embd)
|
|
166
|
-
uint32_t
|
|
177
|
+
uint32_t n_embd_out_impl = 0;
|
|
167
178
|
|
|
168
179
|
// llama4 smallthinker
|
|
169
180
|
uint32_t n_moe_layer_step = 0;
|
|
@@ -190,11 +201,16 @@ struct llama_hparams {
|
|
|
190
201
|
std::array<float, LLAMA_MAX_LAYERS> xielu_beta;
|
|
191
202
|
std::array<float, LLAMA_MAX_LAYERS> xielu_eps;
|
|
192
203
|
|
|
204
|
+
// DSA (deepseek sparse attention)
|
|
205
|
+
uint32_t indexer_n_head = 0;
|
|
206
|
+
uint32_t indexer_head_size = 0;
|
|
207
|
+
uint32_t indexer_top_k = 0;
|
|
208
|
+
|
|
193
209
|
// qwen3vl deepstack
|
|
194
210
|
uint32_t n_deepstack_layers = 0;
|
|
195
211
|
|
|
196
212
|
// needed by encoder-decoder models (e.g. T5, FLAN-T5)
|
|
197
|
-
// ref: https://github.com/
|
|
213
|
+
// ref: https://github.com/ggml-org/llama.cpp/pull/8141
|
|
198
214
|
llama_token dec_start_token_id = LLAMA_TOKEN_NULL;
|
|
199
215
|
uint32_t dec_n_layer = 0;
|
|
200
216
|
|
|
@@ -202,6 +218,11 @@ struct llama_hparams {
|
|
|
202
218
|
enum llama_rope_type rope_type = LLAMA_ROPE_TYPE_NONE;
|
|
203
219
|
enum llama_rope_scaling_type rope_scaling_type_train = LLAMA_ROPE_SCALING_TYPE_NONE;
|
|
204
220
|
|
|
221
|
+
|
|
222
|
+
// Step35: optional per-layer clamps for (Swi)GLU
|
|
223
|
+
std::array<float, LLAMA_MAX_LAYERS> swiglu_clamp_exp; // clamping for expert FFN
|
|
224
|
+
std::array<float, LLAMA_MAX_LAYERS> swiglu_clamp_shexp; // shared expert
|
|
225
|
+
|
|
205
226
|
// this value n_pattern means that every nth layer is dense (i.e. non-SWA)
|
|
206
227
|
// dense_first means whether the pattern is start with a dense layer
|
|
207
228
|
// note that if n_pattern == 0, all layers are SWA
|
|
@@ -234,11 +255,17 @@ struct llama_hparams {
|
|
|
234
255
|
|
|
235
256
|
uint32_t n_gqa(uint32_t il = 0) const;
|
|
236
257
|
|
|
258
|
+
uint32_t n_rot(uint32_t il = 0) const;
|
|
259
|
+
|
|
237
260
|
// dimension of main + auxiliary input embeddings
|
|
238
261
|
uint32_t n_embd_inp() const;
|
|
239
262
|
|
|
240
263
|
// dimension of output embeddings
|
|
241
|
-
uint32_t
|
|
264
|
+
uint32_t n_embd_out() const;
|
|
265
|
+
|
|
266
|
+
// dimension of key/value embeddings for each head (per layer)
|
|
267
|
+
uint32_t n_embd_head_k(uint32_t il = 0) const;
|
|
268
|
+
uint32_t n_embd_head_v(uint32_t il = 0) const;
|
|
242
269
|
|
|
243
270
|
// dimension of key embeddings across all k-v heads
|
|
244
271
|
uint32_t n_embd_k_gqa(uint32_t il = 0) const;
|
|
@@ -268,15 +295,57 @@ struct llama_hparams {
|
|
|
268
295
|
|
|
269
296
|
bool is_swa(uint32_t il) const;
|
|
270
297
|
|
|
298
|
+
// note: currently only support if either all or none of the layers are MLA
|
|
299
|
+
bool is_mla() const;
|
|
300
|
+
|
|
301
|
+
uint32_t n_embd_head_k_mla() const;
|
|
302
|
+
uint32_t n_embd_head_v_mla() const;
|
|
303
|
+
|
|
271
304
|
bool has_kv(uint32_t il) const;
|
|
272
305
|
|
|
273
306
|
// number of layers for which has_kv() returns true
|
|
274
307
|
uint32_t n_layer_kv() const;
|
|
275
308
|
|
|
276
309
|
// note that this function uses different SWA parameters from those in the hparams
|
|
310
|
+
// note: inlined on purpose for performance reasons
|
|
277
311
|
// TODO: think of a better place for this function
|
|
278
312
|
// TODO: pack the SWA params in a struct?
|
|
279
|
-
static bool is_masked_swa(uint32_t n_swa, llama_swa_type swa_type, llama_pos p0, llama_pos p1)
|
|
313
|
+
static bool is_masked_swa(uint32_t n_swa, llama_swa_type swa_type, llama_pos p0, llama_pos p1) {
|
|
314
|
+
assert(p0 >= 0 && p1 >= 0);
|
|
315
|
+
|
|
316
|
+
switch (swa_type) {
|
|
317
|
+
case LLAMA_SWA_TYPE_NONE:
|
|
318
|
+
{
|
|
319
|
+
} break;
|
|
320
|
+
case LLAMA_SWA_TYPE_STANDARD:
|
|
321
|
+
{
|
|
322
|
+
if (p1 - p0 >= (int32_t) n_swa) {
|
|
323
|
+
return true;
|
|
324
|
+
}
|
|
325
|
+
} break;
|
|
326
|
+
case LLAMA_SWA_TYPE_CHUNKED:
|
|
327
|
+
{
|
|
328
|
+
const llama_pos pos_chunk_start = (p1 / n_swa) * n_swa;
|
|
329
|
+
|
|
330
|
+
if (p0 < pos_chunk_start) {
|
|
331
|
+
return true;
|
|
332
|
+
}
|
|
333
|
+
} break;
|
|
334
|
+
case LLAMA_SWA_TYPE_SYMMETRIC:
|
|
335
|
+
{
|
|
336
|
+
const int32_t half_n_swa = (int32_t) n_swa / 2;
|
|
337
|
+
const int32_t pos_diff = p1 - p0;
|
|
338
|
+
|
|
339
|
+
// Mask if outside the symmetric window
|
|
340
|
+
if (pos_diff < -half_n_swa || pos_diff > half_n_swa) {
|
|
341
|
+
return true;
|
|
342
|
+
}
|
|
343
|
+
} break;
|
|
344
|
+
}
|
|
345
|
+
|
|
346
|
+
return false;
|
|
347
|
+
}
|
|
348
|
+
|
|
280
349
|
|
|
281
350
|
bool use_mrope() const;
|
|
282
351
|
};
|
|
@@ -100,18 +100,18 @@ std::string format(const char * fmt, ...) {
|
|
|
100
100
|
|
|
101
101
|
std::string llama_format_tensor_shape(const std::vector<int64_t> & ne) {
|
|
102
102
|
char buf[256];
|
|
103
|
-
snprintf(buf, sizeof(buf), "%
|
|
103
|
+
snprintf(buf, sizeof(buf), "%6" PRId64, ne.at(0));
|
|
104
104
|
for (size_t i = 1; i < ne.size(); i++) {
|
|
105
|
-
snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), ", %
|
|
105
|
+
snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), ", %6" PRId64, ne.at(i));
|
|
106
106
|
}
|
|
107
107
|
return buf;
|
|
108
108
|
}
|
|
109
109
|
|
|
110
110
|
std::string llama_format_tensor_shape(const struct ggml_tensor * t) {
|
|
111
111
|
char buf[256];
|
|
112
|
-
snprintf(buf, sizeof(buf), "%
|
|
112
|
+
snprintf(buf, sizeof(buf), "%6" PRId64, t->ne[0]);
|
|
113
113
|
for (int i = 1; i < GGML_MAX_DIMS; i++) {
|
|
114
|
-
snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), ", %
|
|
114
|
+
snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), ", %6" PRId64, t->ne[i]);
|
|
115
115
|
}
|
|
116
116
|
return buf;
|
|
117
117
|
}
|
|
@@ -49,6 +49,16 @@ struct time_meas {
|
|
|
49
49
|
int64_t & t_acc;
|
|
50
50
|
};
|
|
51
51
|
|
|
52
|
+
template <typename T>
|
|
53
|
+
struct buffer_view {
|
|
54
|
+
T * data;
|
|
55
|
+
size_t size = 0;
|
|
56
|
+
|
|
57
|
+
bool has_data() const {
|
|
58
|
+
return data && size > 0;
|
|
59
|
+
}
|
|
60
|
+
};
|
|
61
|
+
|
|
52
62
|
void replace_all(std::string & s, const std::string & search, const std::string & replace);
|
|
53
63
|
|
|
54
64
|
// TODO: rename to llama_format ?
|
|
@@ -60,4 +70,6 @@ std::string llama_format_tensor_shape(const struct ggml_tensor * t);
|
|
|
60
70
|
|
|
61
71
|
std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i);
|
|
62
72
|
|
|
63
|
-
#define LLAMA_TENSOR_NAME_FATTN
|
|
73
|
+
#define LLAMA_TENSOR_NAME_FATTN "__fattn__"
|
|
74
|
+
#define LLAMA_TENSOR_NAME_FGDN_AR "__fgdn_ar__"
|
|
75
|
+
#define LLAMA_TENSOR_NAME_FGDN_CH "__fgdn_ch__"
|
|
@@ -218,7 +218,9 @@ llama_memory_context_ptr llama_kv_cache_iswa::init_update(llama_context * lctx,
|
|
|
218
218
|
}
|
|
219
219
|
|
|
220
220
|
bool llama_kv_cache_iswa::get_can_shift() const {
|
|
221
|
-
return kv_base->
|
|
221
|
+
return kv_base->get_can_shift() &&
|
|
222
|
+
kv_swa->get_can_shift() &&
|
|
223
|
+
kv_base->get_size() == kv_swa->get_size();
|
|
222
224
|
}
|
|
223
225
|
|
|
224
226
|
void llama_kv_cache_iswa::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const {
|