whispercpp 1.3.5 → 1.3.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/LICENSE +1 -1
- data/README.md +99 -2
- data/ext/extconf.rb +1 -0
- data/ext/ruby_whisper.c +20 -4
- data/ext/ruby_whisper.h +30 -2
- data/ext/ruby_whisper_context.c +216 -124
- data/ext/ruby_whisper_context_params.c +163 -0
- data/ext/ruby_whisper_model.c +0 -1
- data/ext/ruby_whisper_params.c +0 -1
- data/ext/ruby_whisper_segment.c +0 -1
- data/ext/ruby_whisper_token.c +29 -9
- data/ext/ruby_whisper_transcribe.cpp +4 -1
- data/ext/ruby_whisper_vad_context.c +48 -1
- data/ext/ruby_whisper_vad_context_detect.cpp +6 -5
- data/ext/ruby_whisper_vad_params.c +0 -1
- data/ext/ruby_whisper_vad_segment.c +0 -1
- data/ext/ruby_whisper_vad_segments.c +0 -1
- data/ext/sources/CMakeLists.txt +1 -1
- data/ext/sources/bindings/javascript/package.json +1 -1
- data/ext/sources/cmake/whisper-config.cmake.in +5 -40
- data/ext/sources/examples/bench/bench.cpp +23 -18
- data/ext/sources/examples/cli/cli.cpp +8 -0
- data/ext/sources/examples/common-ggml.cpp +2 -0
- data/ext/sources/examples/miniaudio.h +4507 -2131
- data/ext/sources/examples/server/server.cpp +18 -4
- data/ext/sources/examples/talk-llama/CMakeLists.txt +3 -2
- data/ext/sources/examples/talk-llama/llama-adapter.cpp +7 -13
- data/ext/sources/examples/talk-llama/llama-adapter.h +4 -3
- data/ext/sources/examples/talk-llama/llama-arch.cpp +335 -17
- data/ext/sources/examples/talk-llama/llama-arch.h +42 -0
- data/ext/sources/examples/talk-llama/llama-batch.cpp +3 -1
- data/ext/sources/examples/talk-llama/llama-chat.cpp +21 -1
- data/ext/sources/examples/talk-llama/llama-chat.h +1 -0
- data/ext/sources/examples/talk-llama/llama-context.cpp +508 -520
- data/ext/sources/examples/talk-llama/llama-context.h +27 -28
- data/ext/sources/examples/talk-llama/llama-cparams.h +5 -0
- data/ext/sources/examples/talk-llama/llama-ext.h +12 -0
- data/ext/sources/examples/talk-llama/llama-grammar.cpp +8 -8
- data/ext/sources/examples/talk-llama/llama-graph.cpp +583 -130
- data/ext/sources/examples/talk-llama/llama-graph.h +131 -10
- data/ext/sources/examples/talk-llama/llama-hparams.cpp +57 -40
- data/ext/sources/examples/talk-llama/llama-hparams.h +79 -10
- data/ext/sources/examples/talk-llama/llama-impl.cpp +4 -4
- data/ext/sources/examples/talk-llama/llama-impl.h +13 -1
- data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.cpp +3 -1
- data/ext/sources/examples/talk-llama/llama-kv-cache.cpp +274 -89
- data/ext/sources/examples/talk-llama/llama-kv-cache.h +2 -3
- data/ext/sources/examples/talk-llama/llama-memory-hybrid-iswa.cpp +275 -0
- data/ext/sources/examples/talk-llama/llama-memory-hybrid-iswa.h +140 -0
- data/ext/sources/examples/talk-llama/llama-memory-recurrent.cpp +11 -13
- data/ext/sources/examples/talk-llama/llama-mmap.cpp +28 -11
- data/ext/sources/examples/talk-llama/llama-model-loader.cpp +527 -119
- data/ext/sources/examples/talk-llama/llama-model-loader.h +35 -5
- data/ext/sources/examples/talk-llama/llama-model-saver.cpp +60 -46
- data/ext/sources/examples/talk-llama/llama-model-saver.h +5 -2
- data/ext/sources/examples/talk-llama/llama-model.cpp +1365 -647
- data/ext/sources/examples/talk-llama/llama-model.h +72 -19
- data/ext/sources/examples/talk-llama/llama-quant.cpp +578 -346
- data/ext/sources/examples/talk-llama/{llama-sampling.cpp → llama-sampler.cpp} +190 -76
- data/ext/sources/examples/talk-llama/{llama-sampling.h → llama-sampler.h} +0 -2
- data/ext/sources/examples/talk-llama/llama-vocab.cpp +118 -48
- data/ext/sources/examples/talk-llama/llama-vocab.h +5 -0
- data/ext/sources/examples/talk-llama/llama.cpp +76 -22
- data/ext/sources/examples/talk-llama/llama.h +63 -30
- data/ext/sources/examples/talk-llama/models/afmoe.cpp +2 -3
- data/ext/sources/examples/talk-llama/models/apertus.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/arcee.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/arctic.cpp +4 -5
- data/ext/sources/examples/talk-llama/models/baichuan.cpp +4 -3
- data/ext/sources/examples/talk-llama/models/bailingmoe.cpp +1 -2
- data/ext/sources/examples/talk-llama/models/bailingmoe2.cpp +3 -5
- data/ext/sources/examples/talk-llama/models/bert.cpp +13 -7
- data/ext/sources/examples/talk-llama/models/bitnet.cpp +9 -24
- data/ext/sources/examples/talk-llama/models/bloom.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/chameleon.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/chatglm.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/codeshell.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/cogvlm.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/cohere2-iswa.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/command-r.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/dbrx.cpp +4 -5
- data/ext/sources/examples/talk-llama/models/deci.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/deepseek.cpp +4 -6
- data/ext/sources/examples/talk-llama/models/deepseek2.cpp +24 -21
- data/ext/sources/examples/talk-llama/models/delta-net-base.cpp +445 -0
- data/ext/sources/examples/talk-llama/models/dots1.cpp +4 -6
- data/ext/sources/examples/talk-llama/models/dream.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/ernie4-5-moe.cpp +4 -6
- data/ext/sources/examples/talk-llama/models/ernie4-5.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/eurobert.cpp +97 -0
- data/ext/sources/examples/talk-llama/models/exaone-moe.cpp +145 -0
- data/ext/sources/examples/talk-llama/models/exaone.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/exaone4.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/falcon-h1.cpp +2 -4
- data/ext/sources/examples/talk-llama/models/falcon.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/gemma-embedding.cpp +1 -1
- data/ext/sources/examples/talk-llama/models/gemma.cpp +1 -1
- data/ext/sources/examples/talk-llama/models/gemma2-iswa.cpp +1 -1
- data/ext/sources/examples/talk-llama/models/gemma3.cpp +1 -1
- data/ext/sources/examples/talk-llama/models/gemma3n-iswa.cpp +7 -7
- data/ext/sources/examples/talk-llama/models/glm4-moe.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/glm4.cpp +14 -7
- data/ext/sources/examples/talk-llama/models/gpt2.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/gptneox.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/granite-hybrid.cpp +4 -5
- data/ext/sources/examples/talk-llama/models/granite.cpp +4 -5
- data/ext/sources/examples/talk-llama/models/grok.cpp +4 -4
- data/ext/sources/examples/talk-llama/models/grovemoe.cpp +5 -7
- data/ext/sources/examples/talk-llama/models/hunyuan-dense.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/hunyuan-moe.cpp +4 -5
- data/ext/sources/examples/talk-llama/models/internlm2.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/jais.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/jais2.cpp +123 -0
- data/ext/sources/examples/talk-llama/models/jamba.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/kimi-linear.cpp +381 -0
- data/ext/sources/examples/talk-llama/models/lfm2.cpp +145 -124
- data/ext/sources/examples/talk-llama/models/llada-moe.cpp +4 -4
- data/ext/sources/examples/talk-llama/models/llada.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/llama-iswa.cpp +4 -4
- data/ext/sources/examples/talk-llama/models/llama.cpp +18 -11
- data/ext/sources/examples/talk-llama/models/maincoder.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/{graph-context-mamba.cpp → mamba-base.cpp} +9 -3
- data/ext/sources/examples/talk-llama/models/mamba.cpp +1 -2
- data/ext/sources/examples/talk-llama/models/mimo2-iswa.cpp +11 -5
- data/ext/sources/examples/talk-llama/models/minicpm3.cpp +14 -13
- data/ext/sources/examples/talk-llama/models/minimax-m2.cpp +4 -5
- data/ext/sources/examples/talk-llama/models/mistral3.cpp +4 -4
- data/ext/sources/examples/talk-llama/models/models.h +181 -46
- data/ext/sources/examples/talk-llama/models/modern-bert.cpp +2 -9
- data/ext/sources/examples/talk-llama/models/mpt.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/nemotron-h.cpp +26 -14
- data/ext/sources/examples/talk-llama/models/nemotron.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/neo-bert.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/olmo.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/olmo2.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/olmoe.cpp +4 -4
- data/ext/sources/examples/talk-llama/models/openai-moe-iswa.cpp +1 -1
- data/ext/sources/examples/talk-llama/models/openelm.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/orion.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/paddleocr.cpp +122 -0
- data/ext/sources/examples/talk-llama/models/pangu-embedded.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/phi2.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/phi3.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/plamo.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/plamo2.cpp +9 -5
- data/ext/sources/examples/talk-llama/models/plamo3.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/plm.cpp +15 -14
- data/ext/sources/examples/talk-llama/models/qwen.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/qwen2.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/qwen2moe.cpp +4 -4
- data/ext/sources/examples/talk-llama/models/qwen2vl.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/qwen3.cpp +12 -9
- data/ext/sources/examples/talk-llama/models/qwen35.cpp +381 -0
- data/ext/sources/examples/talk-llama/models/qwen35moe.cpp +422 -0
- data/ext/sources/examples/talk-llama/models/qwen3moe.cpp +15 -8
- data/ext/sources/examples/talk-llama/models/qwen3next.cpp +84 -432
- data/ext/sources/examples/talk-llama/models/qwen3vl-moe.cpp +9 -18
- data/ext/sources/examples/talk-llama/models/qwen3vl.cpp +8 -17
- data/ext/sources/examples/talk-llama/models/refact.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/rnd1.cpp +4 -4
- data/ext/sources/examples/talk-llama/models/rwkv6-base.cpp +2 -0
- data/ext/sources/examples/talk-llama/models/rwkv7-base.cpp +2 -0
- data/ext/sources/examples/talk-llama/models/seed-oss.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/smallthinker.cpp +4 -4
- data/ext/sources/examples/talk-llama/models/smollm3.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/stablelm.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/starcoder.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/starcoder2.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/step35-iswa.cpp +165 -0
- data/ext/sources/examples/talk-llama/models/t5-dec.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/t5-enc.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/xverse.cpp +3 -3
- data/ext/sources/examples/talk-llama/unicode.cpp +21 -65
- data/ext/sources/ggml/CMakeLists.txt +9 -3
- data/ext/sources/ggml/include/ggml-backend.h +1 -1
- data/ext/sources/ggml/include/ggml-cann.h +1 -1
- data/ext/sources/ggml/include/ggml-cpu.h +5 -0
- data/ext/sources/ggml/include/ggml-openvino.h +37 -0
- data/ext/sources/ggml/include/ggml-opt.h +1 -1
- data/ext/sources/ggml/include/ggml-rpc.h +6 -1
- data/ext/sources/ggml/include/ggml-virtgpu.h +14 -0
- data/ext/sources/ggml/include/ggml.h +56 -9
- data/ext/sources/ggml/src/CMakeLists.txt +3 -0
- data/ext/sources/ggml/src/ggml-alloc.c +4 -9
- data/ext/sources/ggml/src/ggml-backend-dl.cpp +48 -0
- data/ext/sources/ggml/src/ggml-backend-dl.h +45 -0
- data/ext/sources/ggml/src/ggml-backend-reg.cpp +28 -86
- data/ext/sources/ggml/src/ggml-backend.cpp +5 -2
- data/ext/sources/ggml/src/ggml-blas/CMakeLists.txt +1 -1
- data/ext/sources/ggml/src/ggml-blas/ggml-blas.cpp +6 -2
- data/ext/sources/ggml/src/ggml-cann/acl_tensor.cpp +1 -1
- data/ext/sources/ggml/src/ggml-cann/acl_tensor.h +1 -1
- data/ext/sources/ggml/src/ggml-cann/aclnn_ops.cpp +348 -189
- data/ext/sources/ggml/src/ggml-cann/aclnn_ops.h +40 -85
- data/ext/sources/ggml/src/ggml-cann/common.h +3 -4
- data/ext/sources/ggml/src/ggml-cann/ggml-cann.cpp +44 -62
- data/ext/sources/ggml/src/ggml-common.h +11 -0
- data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +16 -11
- data/ext/sources/ggml/src/ggml-cpu/amx/amx.cpp +42 -19
- data/ext/sources/ggml/src/ggml-cpu/amx/common.h +34 -10
- data/ext/sources/ggml/src/ggml-cpu/amx/mmq.cpp +85 -85
- data/ext/sources/ggml/src/ggml-cpu/arch/arm/quants.c +85 -1
- data/ext/sources/ggml/src/ggml-cpu/arch/arm/repack.cpp +2744 -548
- data/ext/sources/ggml/src/ggml-cpu/arch/riscv/quants.c +1653 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/riscv/repack.cpp +1391 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/s390/quants.c +8 -10
- data/ext/sources/ggml/src/ggml-cpu/arch/x86/quants.c +9 -9
- data/ext/sources/ggml/src/ggml-cpu/arch/x86/repack.cpp +118 -18
- data/ext/sources/ggml/src/ggml-cpu/arch-fallback.h +107 -26
- data/ext/sources/ggml/src/ggml-cpu/binary-ops.cpp +2 -6
- data/ext/sources/ggml/src/ggml-cpu/common.h +8 -0
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-impl.h +3 -0
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +59 -12
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.cpp +15 -0
- data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.cpp +21 -20
- data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +965 -252
- data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +584 -197
- data/ext/sources/ggml/src/ggml-cpu/ops.cpp +903 -188
- data/ext/sources/ggml/src/ggml-cpu/ops.h +1 -0
- data/ext/sources/ggml/src/ggml-cpu/quants.c +40 -0
- data/ext/sources/ggml/src/ggml-cpu/quants.h +3 -0
- data/ext/sources/ggml/src/ggml-cpu/repack.cpp +2890 -679
- data/ext/sources/ggml/src/ggml-cpu/repack.h +119 -8
- data/ext/sources/ggml/src/ggml-cpu/simd-gemm.h +136 -0
- data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +111 -3
- data/ext/sources/ggml/src/ggml-cpu/unary-ops.cpp +1 -1
- data/ext/sources/ggml/src/ggml-cpu/vec.cpp +17 -0
- data/ext/sources/ggml/src/ggml-cuda/CMakeLists.txt +1 -1
- data/ext/sources/ggml/src/ggml-cuda/argsort.cu +19 -10
- data/ext/sources/ggml/src/ggml-cuda/binbcast.cu +32 -30
- data/ext/sources/ggml/src/ggml-cuda/common.cuh +134 -18
- data/ext/sources/ggml/src/ggml-cuda/convert.cu +41 -27
- data/ext/sources/ggml/src/ggml-cuda/cpy.cu +6 -3
- data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +78 -64
- data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +384 -143
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cuh +36 -22
- data/ext/sources/ggml/src/ggml-cuda/fattn-vec.cuh +3 -3
- data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +26 -5
- data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cuh +1 -1
- data/ext/sources/ggml/src/ggml-cuda/fattn.cu +127 -12
- data/ext/sources/ggml/src/ggml-cuda/gated_delta_net.cu +263 -0
- data/ext/sources/ggml/src/ggml-cuda/gated_delta_net.cuh +4 -0
- data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +595 -200
- data/ext/sources/ggml/src/ggml-cuda/mean.cu +9 -8
- data/ext/sources/ggml/src/ggml-cuda/mma.cuh +173 -6
- data/ext/sources/ggml/src/ggml-cuda/mmf.cu +30 -10
- data/ext/sources/ggml/src/ggml-cuda/mmf.cuh +158 -85
- data/ext/sources/ggml/src/ggml-cuda/mmq.cuh +34 -22
- data/ext/sources/ggml/src/ggml-cuda/mmvf.cu +127 -67
- data/ext/sources/ggml/src/ggml-cuda/mmvf.cuh +2 -0
- data/ext/sources/ggml/src/ggml-cuda/mmvq.cu +157 -65
- data/ext/sources/ggml/src/ggml-cuda/mmvq.cuh +1 -0
- data/ext/sources/ggml/src/ggml-cuda/norm.cu +18 -76
- data/ext/sources/ggml/src/ggml-cuda/pad.cu +13 -10
- data/ext/sources/ggml/src/ggml-cuda/quantize.cu +1 -1
- data/ext/sources/ggml/src/ggml-cuda/reduce_rows.cuh +2 -16
- data/ext/sources/ggml/src/ggml-cuda/rope.cu +233 -133
- data/ext/sources/ggml/src/ggml-cuda/softmax.cu +8 -83
- data/ext/sources/ggml/src/ggml-cuda/solve_tri.cu +1 -1
- data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cu +56 -32
- data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cuh +1 -1
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_32.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_4.cu +1 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_32.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_4.cu +1 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_4.cu +1 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_4.cu +1 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +3 -3
- data/ext/sources/ggml/src/ggml-cuda/top-k.cu +0 -1
- data/ext/sources/ggml/src/ggml-cuda/topk-moe.cu +199 -135
- data/ext/sources/ggml/src/ggml-cuda/topk-moe.cuh +20 -14
- data/ext/sources/ggml/src/ggml-cuda/unary.cu +55 -0
- data/ext/sources/ggml/src/ggml-cuda/unary.cuh +2 -0
- data/ext/sources/ggml/src/ggml-cuda/vecdotq.cuh +31 -17
- data/ext/sources/ggml/src/ggml-cuda/vendors/hip.h +10 -0
- data/ext/sources/ggml/src/ggml-hexagon/CMakeLists.txt +82 -45
- data/ext/sources/ggml/src/ggml-hexagon/ggml-hexagon.cpp +334 -160
- data/ext/sources/ggml/src/ggml-hexagon/htp/CMakeLists.txt +7 -5
- data/ext/sources/ggml/src/ggml-hexagon/htp/act-ops.c +328 -197
- data/ext/sources/ggml/src/ggml-hexagon/htp/argsort-ops.c +281 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/binary-ops.c +765 -234
- data/ext/sources/ggml/src/ggml-hexagon/htp/cpy-ops.c +252 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/flash-attn-ops.c +412 -265
- data/ext/sources/ggml/src/ggml-hexagon/htp/get-rows-ops.c +23 -23
- data/ext/sources/ggml/src/ggml-hexagon/htp/{htp-dma.c → hex-dma.c} +1 -1
- data/ext/sources/ggml/src/ggml-hexagon/htp/{htp-dma.h → hex-dma.h} +28 -3
- data/ext/sources/ggml/src/ggml-hexagon/htp/hex-dump.h +77 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hex-fastdiv.h +37 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hex-utils.h +51 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ctx.h +1 -1
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp-msg.h +27 -37
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ops.h +6 -35
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-arith.h +443 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-base.h +240 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-copy.h +245 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-div.h +251 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-dump.h +129 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-exp.h +215 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-floor.h +100 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-inverse.h +210 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-reduce.h +296 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-scale.h +133 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sigmoid.h +141 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sqrt.h +126 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-types.h +36 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-utils.h +20 -1347
- data/ext/sources/ggml/src/ggml-hexagon/htp/main.c +211 -13
- data/ext/sources/ggml/src/ggml-hexagon/htp/matmul-ops.c +1119 -952
- data/ext/sources/ggml/src/ggml-hexagon/htp/rope-ops.c +254 -244
- data/ext/sources/ggml/src/ggml-hexagon/htp/set-rows-ops.c +36 -36
- data/ext/sources/ggml/src/ggml-hexagon/htp/softmax-ops.c +155 -138
- data/ext/sources/ggml/src/ggml-hexagon/htp/ssm-conv.c +339 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/sum-rows-ops.c +128 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/unary-ops.c +209 -114
- data/ext/sources/ggml/src/ggml-hexagon/htp/worker-pool.c +1 -5
- data/ext/sources/ggml/src/ggml-hexagon/htp-drv.cpp +418 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp-drv.h +121 -0
- data/ext/sources/ggml/src/ggml-hexagon/libdl.h +79 -0
- data/ext/sources/ggml/src/ggml-hexagon/libggml-htp.inf +38 -0
- data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +6 -0
- data/ext/sources/ggml/src/ggml-impl.h +62 -0
- data/ext/sources/ggml/src/ggml-metal/CMakeLists.txt +10 -10
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-common.cpp +13 -2
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.h +8 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.m +147 -17
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.cpp +274 -73
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.h +22 -4
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.m +102 -36
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +174 -23
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.cpp +580 -280
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.h +5 -4
- data/ext/sources/ggml/src/ggml-metal/ggml-metal.cpp +320 -107
- data/ext/sources/ggml/src/ggml-metal/ggml-metal.metal +1068 -825
- data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +19 -1
- data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +3108 -636
- data/ext/sources/ggml/src/ggml-opencl/kernels/concat.cl +41 -99
- data/ext/sources/ggml/src/ggml-opencl/kernels/cpy.cl +45 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/cumsum.cl +139 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/cvt.cl +204 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/diag.cl +27 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/exp.cl +125 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/expm1.cl +87 -56
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q4_1_f32.cl +132 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_general_q8_0_f32.cl +195 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q4_1_f32.cl +283 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/l2_norm.cl +71 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mean.cl +114 -13
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q4_0_f32_l4_lm.cl +163 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q4_1_f32_l4_lm.cl +165 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q6_k_f32_l4_lm.cl +158 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_8x4.cl +129 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32.cl +219 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32_flat.cl +229 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_k_f32.cl +180 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/{mul_mv_q6_k.cl → mul_mv_q6_k_f32.cl} +4 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32_flat.cl +194 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/neg.cl +125 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/repeat.cl +31 -32
- data/ext/sources/ggml/src/ggml-opencl/kernels/scale.cl +14 -4
- data/ext/sources/ggml/src/ggml-opencl/kernels/softplus.cl +88 -60
- data/ext/sources/ggml/src/ggml-opencl/kernels/solve_tri.cl +51 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/sum_rows.cl +114 -13
- data/ext/sources/ggml/src/ggml-opencl/kernels/tanh.cl +94 -48
- data/ext/sources/ggml/src/ggml-opencl/kernels/transpose.cl +26 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/tri.cl +32 -0
- data/ext/sources/ggml/src/ggml-openvino/.clang-format +154 -0
- data/ext/sources/ggml/src/ggml-openvino/CMakeLists.txt +22 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-decoder.cpp +975 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-decoder.h +294 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-openvino-extra.cpp +373 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-openvino-extra.h +182 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-openvino.cpp +1110 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-quants.cpp +884 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-quants.h +153 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/decoder.h +74 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/frontend.cpp +27 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/frontend.h +23 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/input_model.cpp +17 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/input_model.h +29 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/node_context.h +112 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/cont.cpp +48 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/cpy.cpp +21 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp +90 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/get_rows.cpp +69 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/glu_geglu.cpp +61 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp +62 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/mulmat.cpp +90 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/permute.cpp +102 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/reshape.cpp +83 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp +46 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/rope.cpp +123 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/scale.cpp +41 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/set_rows.cpp +76 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/softmax.cpp +89 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/transpose.cpp +23 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/unary_silu.cpp +27 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/view.cpp +53 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op_table.cpp +46 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op_table.h +39 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.cpp +123 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.h +17 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.cpp +60 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.h +17 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/mark_decompression_convert_constant_folding.h +29 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/squeeze_matmul.cpp +58 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/squeeze_matmul.h +17 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/translate_session.cpp +293 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/translate_session.h +28 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/utils.cpp +226 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/utils.h +85 -0
- data/ext/sources/ggml/src/ggml-openvino/utils.cpp +823 -0
- data/ext/sources/ggml/src/ggml-openvino/utils.h +123 -0
- data/ext/sources/ggml/src/ggml-quants.c +96 -5
- data/ext/sources/ggml/src/ggml-quants.h +3 -0
- data/ext/sources/ggml/src/ggml-sycl/CMakeLists.txt +15 -88
- data/ext/sources/ggml/src/ggml-sycl/add-id.cpp +5 -1
- data/ext/sources/ggml/src/ggml-sycl/backend.hpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/binbcast.cpp +21 -20
- data/ext/sources/ggml/src/ggml-sycl/common.hpp +315 -10
- data/ext/sources/ggml/src/ggml-sycl/convert.cpp +69 -1
- data/ext/sources/ggml/src/ggml-sycl/convert.hpp +22 -1
- data/ext/sources/ggml/src/ggml-sycl/count-equal.cpp +1 -1
- data/ext/sources/ggml/src/ggml-sycl/dpct/helper.hpp +791 -47
- data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +78 -68
- data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +2 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn-common.hpp +1179 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn-tile.cpp +55 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn-tile.hpp +1338 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn-vec.hpp +667 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn.cpp +225 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn.hpp +22 -0
- data/ext/sources/ggml/src/ggml-sycl/gated_delta_net.cpp +309 -0
- data/ext/sources/ggml/src/ggml-sycl/gated_delta_net.hpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/ggml-sycl.cpp +316 -51
- data/ext/sources/ggml/src/ggml-sycl/norm.cpp +65 -66
- data/ext/sources/ggml/src/ggml-sycl/outprod.cpp +3 -3
- data/ext/sources/ggml/src/ggml-sycl/presets.hpp +3 -0
- data/ext/sources/ggml/src/ggml-sycl/quants.hpp +1 -1
- data/ext/sources/ggml/src/ggml-sycl/rope.cpp +450 -287
- data/ext/sources/ggml/src/ggml-sycl/rope.hpp +6 -0
- data/ext/sources/ggml/src/ggml-sycl/softmax.cpp +6 -6
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq112-dv112.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq128-dv128.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq256-dv256.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq40-dv40.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq576-dv512.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq64-dv64.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq72-dv72.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq80-dv80.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq96-dv96.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-f16.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q8_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-f16.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q8_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-f16.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q8_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-f16.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q8_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-f16.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q8_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-f16.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q8_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +13 -0
- data/ext/sources/ggml/src/ggml-sycl/wkv.cpp +1 -1
- data/ext/sources/ggml/src/ggml-virtgpu/CMakeLists.txt +70 -0
- data/ext/sources/ggml/src/ggml-virtgpu/apir_cs_ggml-rpc-front.cpp +87 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/CMakeLists.txt +21 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/apir_cs_ggml-rpc-back.cpp +115 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-convert.h +13 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-backend.cpp +102 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer-type.cpp +105 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer.cpp +179 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-device.cpp +148 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched.cpp +51 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched.gen.h +73 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched.h +27 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-virgl-apir.h +32 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend.cpp +144 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/api_remoting.h +95 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_backend.gen.h +94 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_backend.h +50 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_cs.h +378 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_cs_ggml.h +232 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_cs_rpc.h +58 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-buffer-type.cpp +81 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-buffer.cpp +119 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-device.cpp +158 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-reg.cpp +213 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend.cpp +69 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-remoting.h +71 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggmlremoting_functions.yaml +166 -0
- data/ext/sources/ggml/src/ggml-virtgpu/include/apir_hw.h +9 -0
- data/ext/sources/ggml/src/ggml-virtgpu/regenerate_remoting.py +333 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-apir.h +15 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-backend.cpp +58 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-buffer-type.cpp +110 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-buffer.cpp +173 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-device.cpp +192 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-impl.h +36 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward.gen.h +53 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-shm.cpp +98 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-shm.h +23 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-utils.cpp +179 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-utils.h +86 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu.cpp +544 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu.h +117 -0
- data/ext/sources/ggml/src/ggml-vulkan/CMakeLists.txt +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/ggml-vulkan.cpp +1250 -465
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/acc.comp +16 -8
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/elu.comp +27 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +374 -170
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.glsl +66 -22
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +389 -201
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +106 -58
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_mask_opt.comp +162 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +9 -8
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gated_delta_net.comp +128 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/l2_norm.comp +12 -9
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.glsl +20 -17
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +11 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +8 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_id_funcs.glsl +3 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +5 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.glsl +3 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +2 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl +36 -63
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +7 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +7 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +7 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl +10 -5
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_vision.comp +7 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sgn.comp +21 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/ssm_conv.comp +16 -10
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +55 -35
- data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp +1314 -109
- data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu.cpp +1660 -1371
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/argmax.wgsl +72 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/argsort.wgsl +106 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/argsort_merge.wgsl +134 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary.wgsl +141 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +65 -72
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/concat.wgsl +75 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cpy.tmpl.wgsl +6 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cumsum.wgsl +66 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +40 -5
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn.wgsl +105 -60
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{get_rows.tmpl.wgsl → get_rows.wgsl} +53 -259
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{mul_mat.tmpl.wgsl → mul_mat.wgsl} +68 -257
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl +692 -23
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{mul_mat_reg_tile.tmpl.wgsl → mul_mat_reg_tile.wgsl} +28 -128
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{mul_mat_subgroup_matrix.tmpl.wgsl → mul_mat_subgroup_matrix.wgsl} +31 -137
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.wgsl +480 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/pad.wgsl +86 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/repeat.wgsl +67 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{scale.tmpl.wgsl → scale.wgsl} +9 -36
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +40 -12
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/sum_rows.wgsl +55 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/unary.wgsl +193 -0
- data/ext/sources/ggml/src/ggml-zdnn/ggml-zdnn.cpp +6 -1
- data/ext/sources/ggml/src/ggml-zendnn/CMakeLists.txt +31 -32
- data/ext/sources/ggml/src/ggml-zendnn/ggml-zendnn.cpp +9 -6
- data/ext/sources/ggml/src/ggml.c +167 -33
- data/ext/sources/ggml/src/gguf.cpp +229 -44
- data/ext/sources/src/whisper.cpp +6 -28
- data/sig/whisper.rbs +43 -2
- data/test/test_context_params.rb +82 -0
- data/test/test_token.rb +11 -0
- data/test/test_vad_context.rb +58 -8
- data/test/test_whisper.rb +20 -0
- data/whispercpp.gemspec +1 -1
- metadata +240 -28
- data/ext/sources/ggml/cmake/BuildTypes.cmake +0 -54
- data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm-ppc.h +0 -333
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-exp.c +0 -94
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-inverse.c +0 -72
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sigmoid.c +0 -49
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-utils.c +0 -1020
- data/ext/sources/ggml/src/ggml-hexagon/htp/ops-utils.h +0 -149
- data/ext/sources/ggml/src/ggml-hexagon/htp-utils.c +0 -454
- data/ext/sources/ggml/src/ggml-hexagon/htp-utils.h +0 -221
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/bin_op.tmpl.wgsl +0 -188
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary_head.tmpl +0 -45
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.tmpl.wgsl +0 -267
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.tmpl.wgsl +0 -112
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/unary_op.wgsl +0 -483
|
@@ -1,23 +1,71 @@
|
|
|
1
1
|
#pragma once
|
|
2
2
|
|
|
3
|
-
#include "
|
|
4
|
-
#include "
|
|
3
|
+
#include "llama-model.h"
|
|
4
|
+
#include "llama-graph.h"
|
|
5
5
|
|
|
6
|
-
//
|
|
7
|
-
#include "../llama-memory-recurrent.h"
|
|
6
|
+
// note: almost all graphs require at least sqrtf, so include cmath globally
|
|
8
7
|
#include <cmath>
|
|
9
8
|
|
|
10
|
-
|
|
11
|
-
|
|
9
|
+
//
|
|
10
|
+
// base classes
|
|
11
|
+
//
|
|
12
12
|
|
|
13
|
-
|
|
13
|
+
struct llm_build_mamba_base : public llm_graph_context {
|
|
14
|
+
llm_build_mamba_base(const llm_graph_params & params);
|
|
15
|
+
|
|
16
|
+
virtual ~llm_build_mamba_base() = default;
|
|
14
17
|
|
|
15
18
|
ggml_tensor * build_mamba_layer(llm_graph_input_rs * inp, ggml_tensor * cur, const llama_model & model, const llama_ubatch & ubatch, int il);
|
|
16
19
|
ggml_tensor * build_mamba2_layer(llm_graph_input_rs * inp, ggml_tensor * cur, const llama_model & model, const llama_ubatch & ubatch, int il) const;
|
|
17
20
|
|
|
18
21
|
};
|
|
19
22
|
|
|
20
|
-
|
|
23
|
+
struct llm_build_delta_net_base : public llm_graph_context {
|
|
24
|
+
llm_build_delta_net_base(const llm_graph_params & params);
|
|
25
|
+
|
|
26
|
+
virtual ~llm_build_delta_net_base() = default;
|
|
27
|
+
|
|
28
|
+
// returns pair of output and new state
|
|
29
|
+
std::pair<ggml_tensor *, ggml_tensor *> build_delta_net_chunking(
|
|
30
|
+
ggml_tensor * q,
|
|
31
|
+
ggml_tensor * k,
|
|
32
|
+
ggml_tensor * v,
|
|
33
|
+
ggml_tensor * g,
|
|
34
|
+
ggml_tensor * b,
|
|
35
|
+
ggml_tensor * s,
|
|
36
|
+
int il);
|
|
37
|
+
|
|
38
|
+
// returns pair of output and new state
|
|
39
|
+
std::pair<ggml_tensor *, ggml_tensor *> build_delta_net_autoregressive(
|
|
40
|
+
ggml_tensor * q,
|
|
41
|
+
ggml_tensor * k,
|
|
42
|
+
ggml_tensor * v,
|
|
43
|
+
ggml_tensor * g,
|
|
44
|
+
ggml_tensor * b,
|
|
45
|
+
ggml_tensor * s,
|
|
46
|
+
int il);
|
|
47
|
+
|
|
48
|
+
// use the ggml_gated_delta_net fused operator
|
|
49
|
+
std::pair<ggml_tensor *, ggml_tensor *> build_delta_net_fused(
|
|
50
|
+
ggml_tensor * q,
|
|
51
|
+
ggml_tensor * k,
|
|
52
|
+
ggml_tensor * v,
|
|
53
|
+
ggml_tensor * g,
|
|
54
|
+
ggml_tensor * b,
|
|
55
|
+
ggml_tensor * s,
|
|
56
|
+
int il);
|
|
57
|
+
|
|
58
|
+
// choose one of two implementations above based on the number of tokens
|
|
59
|
+
std::pair<ggml_tensor *, ggml_tensor *> build_delta_net(
|
|
60
|
+
ggml_tensor * q,
|
|
61
|
+
ggml_tensor * k,
|
|
62
|
+
ggml_tensor * v,
|
|
63
|
+
ggml_tensor * g,
|
|
64
|
+
ggml_tensor * b,
|
|
65
|
+
ggml_tensor * s,
|
|
66
|
+
int il);
|
|
67
|
+
};
|
|
68
|
+
|
|
21
69
|
struct llm_build_rwkv6_base : public llm_graph_context {
|
|
22
70
|
const llama_model & model;
|
|
23
71
|
|
|
@@ -58,6 +106,10 @@ struct llm_build_rwkv7_base : public llm_graph_context {
|
|
|
58
106
|
int il) const;
|
|
59
107
|
};
|
|
60
108
|
|
|
109
|
+
//
|
|
110
|
+
// models
|
|
111
|
+
//
|
|
112
|
+
|
|
61
113
|
struct llm_build_afmoe : public llm_graph_context {
|
|
62
114
|
llm_build_afmoe(const llama_model & model, const llm_graph_params & params);
|
|
63
115
|
};
|
|
@@ -158,6 +210,10 @@ struct llm_build_ernie4_5_moe : public llm_graph_context {
|
|
|
158
210
|
llm_build_ernie4_5_moe(const llama_model & model, const llm_graph_params & params);
|
|
159
211
|
};
|
|
160
212
|
|
|
213
|
+
struct llm_build_paddleocr : public llm_graph_context {
|
|
214
|
+
llm_build_paddleocr(const llama_model & model, const llm_graph_params & params);
|
|
215
|
+
};
|
|
216
|
+
|
|
161
217
|
template <bool iswa>
|
|
162
218
|
struct llm_build_exaone4 : public llm_graph_context {
|
|
163
219
|
llm_build_exaone4(const llama_model & model, const llm_graph_params & params);
|
|
@@ -167,11 +223,15 @@ struct llm_build_exaone : public llm_graph_context {
|
|
|
167
223
|
llm_build_exaone(const llama_model & model, const llm_graph_params & params);
|
|
168
224
|
};
|
|
169
225
|
|
|
226
|
+
struct llm_build_exaone_moe : public llm_graph_context {
|
|
227
|
+
llm_build_exaone_moe(const llama_model & model, const llm_graph_params & params);
|
|
228
|
+
};
|
|
229
|
+
|
|
170
230
|
struct llm_build_falcon : public llm_graph_context {
|
|
171
231
|
llm_build_falcon(const llama_model & model, const llm_graph_params & params);
|
|
172
232
|
};
|
|
173
233
|
|
|
174
|
-
struct llm_build_falcon_h1 : public
|
|
234
|
+
struct llm_build_falcon_h1 : public llm_build_mamba_base {
|
|
175
235
|
llm_build_falcon_h1(const llama_model & model, const llm_graph_params & params);
|
|
176
236
|
};
|
|
177
237
|
|
|
@@ -249,7 +309,7 @@ private:
|
|
|
249
309
|
const int il);
|
|
250
310
|
};
|
|
251
311
|
|
|
252
|
-
struct llm_build_granite_hybrid : public
|
|
312
|
+
struct llm_build_granite_hybrid : public llm_build_mamba_base {
|
|
253
313
|
llm_build_granite_hybrid(const llama_model & model, const llm_graph_params & params);
|
|
254
314
|
ggml_tensor * build_layer_ffn(ggml_tensor * cur, ggml_tensor * inpSA, const llama_model & model, const int il);
|
|
255
315
|
ggml_tensor * build_attention_layer(ggml_tensor * cur, ggml_tensor * inp_pos, llm_graph_input_attn_kv * inp_attn,
|
|
@@ -280,19 +340,44 @@ struct llm_build_jais : public llm_graph_context {
|
|
|
280
340
|
llm_build_jais(const llama_model & model, const llm_graph_params & params);
|
|
281
341
|
};
|
|
282
342
|
|
|
283
|
-
struct
|
|
343
|
+
struct llm_build_jais2 : public llm_graph_context {
|
|
344
|
+
llm_build_jais2(const llama_model & model, const llm_graph_params & params);
|
|
345
|
+
};
|
|
346
|
+
|
|
347
|
+
struct llm_build_jamba : public llm_build_mamba_base {
|
|
284
348
|
llm_build_jamba(const llama_model & model, const llm_graph_params & params);
|
|
285
349
|
};
|
|
286
350
|
|
|
287
|
-
struct
|
|
351
|
+
struct llm_build_kimi_linear : public llm_build_delta_net_base {
|
|
352
|
+
llm_build_kimi_linear(const llama_model & model, const llm_graph_params & params);
|
|
353
|
+
|
|
354
|
+
std::pair<ggml_tensor *, ggml_tensor *> build_kda_autoregressive(
|
|
355
|
+
ggml_tensor * q,
|
|
356
|
+
ggml_tensor * k,
|
|
357
|
+
ggml_tensor * v,
|
|
358
|
+
ggml_tensor * gk,
|
|
359
|
+
ggml_tensor * beta,
|
|
360
|
+
ggml_tensor * state,
|
|
361
|
+
int il);
|
|
362
|
+
|
|
363
|
+
std::pair<ggml_tensor *, ggml_tensor *> build_kda_chunking(
|
|
364
|
+
ggml_tensor * q,
|
|
365
|
+
ggml_tensor * k,
|
|
366
|
+
ggml_tensor * v,
|
|
367
|
+
ggml_tensor * gk,
|
|
368
|
+
ggml_tensor * beta,
|
|
369
|
+
ggml_tensor * state,
|
|
370
|
+
ggml_tensor * causal_mask,
|
|
371
|
+
ggml_tensor * identity,
|
|
372
|
+
ggml_tensor * diag_mask,
|
|
373
|
+
int il);
|
|
374
|
+
|
|
288
375
|
const llama_model & model;
|
|
376
|
+
};
|
|
289
377
|
|
|
378
|
+
template <bool iswa>
|
|
379
|
+
struct llm_build_lfm2 : public llm_graph_context {
|
|
290
380
|
llm_build_lfm2(const llama_model & model, const llm_graph_params & params);
|
|
291
|
-
ggml_tensor * build_moe_feed_forward(ggml_tensor * cur, int il) const;
|
|
292
|
-
ggml_tensor * build_dense_feed_forward(ggml_tensor * cur, int il) const;
|
|
293
|
-
ggml_tensor * build_attn_block(ggml_tensor * cur, ggml_tensor * inp_pos, llm_graph_input_attn_kv * inp_attn, int il) const;
|
|
294
|
-
ggml_tensor * build_shortconv_block(ggml_tensor * cur, llm_graph_input_rs * inp_recr, int il);
|
|
295
|
-
|
|
296
381
|
};
|
|
297
382
|
|
|
298
383
|
struct llm_build_llada : public llm_graph_context {
|
|
@@ -316,7 +401,7 @@ struct llm_build_maincoder : public llm_graph_context {
|
|
|
316
401
|
llm_build_maincoder(const llama_model & model, const llm_graph_params & params);
|
|
317
402
|
};
|
|
318
403
|
|
|
319
|
-
struct llm_build_mamba : public
|
|
404
|
+
struct llm_build_mamba : public llm_build_mamba_base {
|
|
320
405
|
llm_build_mamba(const llama_model & model, const llm_graph_params & params);
|
|
321
406
|
};
|
|
322
407
|
|
|
@@ -348,17 +433,21 @@ struct llm_build_nemotron : public llm_graph_context {
|
|
|
348
433
|
llm_build_nemotron(const llama_model & model, const llm_graph_params & params);
|
|
349
434
|
};
|
|
350
435
|
|
|
351
|
-
struct llm_build_nemotron_h : public
|
|
436
|
+
struct llm_build_nemotron_h : public llm_build_mamba_base {
|
|
352
437
|
llm_build_nemotron_h(const llama_model & model, const llm_graph_params & params);
|
|
353
|
-
ggml_tensor * build_ffn_layer(ggml_tensor * cur, const llama_model & model,
|
|
438
|
+
ggml_tensor * build_ffn_layer(ggml_tensor * cur, const llama_model & model, int il);
|
|
354
439
|
ggml_tensor * build_attention_layer(ggml_tensor * cur, llm_graph_input_attn_kv * inp_attn,
|
|
355
|
-
const llama_model & model,
|
|
440
|
+
const llama_model & model, int64_t n_embd_head, int il);
|
|
356
441
|
};
|
|
357
442
|
|
|
358
443
|
struct llm_build_neo_bert : public llm_graph_context {
|
|
359
444
|
llm_build_neo_bert(const llama_model & model, const llm_graph_params & params);
|
|
360
445
|
};
|
|
361
446
|
|
|
447
|
+
struct llm_build_eurobert : public llm_graph_context {
|
|
448
|
+
llm_build_eurobert(const llama_model & model, const llm_graph_params & params);
|
|
449
|
+
};
|
|
450
|
+
|
|
362
451
|
template <bool iswa>
|
|
363
452
|
struct llm_build_olmo2 : public llm_graph_context {
|
|
364
453
|
llm_build_olmo2(const llama_model & model, const llm_graph_params & params);
|
|
@@ -397,7 +486,7 @@ struct llm_build_phi3 : public llm_graph_context {
|
|
|
397
486
|
llm_build_phi3(const llama_model & model, const llm_graph_params & params);
|
|
398
487
|
};
|
|
399
488
|
|
|
400
|
-
struct llm_build_plamo2 : public
|
|
489
|
+
struct llm_build_plamo2 : public llm_build_mamba_base {
|
|
401
490
|
llm_build_plamo2(const llama_model & model, const llm_graph_params & params);
|
|
402
491
|
private:
|
|
403
492
|
ggml_tensor * build_plamo2_mamba_layer(llm_graph_input_rs * inp, ggml_tensor * cur, const llama_model & model, const llama_ubatch & ubatch, int il);
|
|
@@ -445,7 +534,8 @@ struct llm_build_qwen3vl : public llm_graph_context {
|
|
|
445
534
|
struct llm_build_qwen3vlmoe : public llm_graph_context {
|
|
446
535
|
llm_build_qwen3vlmoe(const llama_model & model, const llm_graph_params & params);
|
|
447
536
|
};
|
|
448
|
-
|
|
537
|
+
|
|
538
|
+
struct llm_build_qwen3next : public llm_build_delta_net_base {
|
|
449
539
|
llm_build_qwen3next(const llama_model & model, const llm_graph_params & params);
|
|
450
540
|
private:
|
|
451
541
|
ggml_tensor * build_layer_attn(
|
|
@@ -457,37 +547,78 @@ private:
|
|
|
457
547
|
ggml_tensor * build_layer_attn_linear(
|
|
458
548
|
llm_graph_input_rs * inp,
|
|
459
549
|
ggml_tensor * cur,
|
|
460
|
-
ggml_tensor * causal_mask,
|
|
461
|
-
ggml_tensor * identity,
|
|
462
|
-
ggml_tensor * diag_mask,
|
|
463
550
|
int il);
|
|
464
551
|
|
|
465
552
|
ggml_tensor * build_layer_ffn(
|
|
466
553
|
ggml_tensor * cur,
|
|
467
554
|
int il);
|
|
468
555
|
|
|
469
|
-
|
|
470
|
-
|
|
471
|
-
ggml_tensor *
|
|
472
|
-
ggml_tensor *
|
|
473
|
-
|
|
474
|
-
|
|
475
|
-
|
|
476
|
-
|
|
477
|
-
ggml_tensor *
|
|
478
|
-
ggml_tensor * identity,
|
|
479
|
-
ggml_tensor * diag_mask,
|
|
556
|
+
ggml_tensor * build_norm_gated(
|
|
557
|
+
ggml_tensor * input,
|
|
558
|
+
ggml_tensor * weights,
|
|
559
|
+
ggml_tensor * gate,
|
|
560
|
+
int layer);
|
|
561
|
+
|
|
562
|
+
// returns pair of qkv, z
|
|
563
|
+
std::pair<ggml_tensor *, ggml_tensor *> build_qkvz(
|
|
564
|
+
ggml_tensor * input,
|
|
480
565
|
int il);
|
|
481
566
|
|
|
482
|
-
|
|
483
|
-
|
|
484
|
-
|
|
485
|
-
|
|
486
|
-
|
|
487
|
-
|
|
488
|
-
|
|
489
|
-
|
|
490
|
-
|
|
567
|
+
const llama_model & model;
|
|
568
|
+
};
|
|
569
|
+
|
|
570
|
+
struct llm_build_qwen35 : public llm_build_delta_net_base {
|
|
571
|
+
llm_build_qwen35(const llama_model & model, const llm_graph_params & params);
|
|
572
|
+
private:
|
|
573
|
+
ggml_tensor * build_layer_attn(
|
|
574
|
+
llm_graph_input_attn_kv * inp_attn,
|
|
575
|
+
ggml_tensor * cur,
|
|
576
|
+
ggml_tensor * inp_pos,
|
|
577
|
+
int * sections,
|
|
578
|
+
int il);
|
|
579
|
+
|
|
580
|
+
ggml_tensor * build_layer_attn_linear(
|
|
581
|
+
llm_graph_input_rs * inp,
|
|
582
|
+
ggml_tensor * cur,
|
|
583
|
+
int il);
|
|
584
|
+
|
|
585
|
+
ggml_tensor * build_layer_ffn(
|
|
586
|
+
ggml_tensor * cur,
|
|
587
|
+
int il);
|
|
588
|
+
|
|
589
|
+
ggml_tensor * build_norm_gated(
|
|
590
|
+
ggml_tensor * input,
|
|
591
|
+
ggml_tensor * weights,
|
|
592
|
+
ggml_tensor * gate,
|
|
593
|
+
int layer);
|
|
594
|
+
|
|
595
|
+
// returns pair of qkv, z
|
|
596
|
+
std::pair<ggml_tensor *, ggml_tensor *> build_qkvz(
|
|
597
|
+
ggml_tensor * input,
|
|
598
|
+
int il);
|
|
599
|
+
|
|
600
|
+
const llama_model & model;
|
|
601
|
+
};
|
|
602
|
+
|
|
603
|
+
// TODO: derive llm_build_delta_net_base instead
|
|
604
|
+
struct llm_build_qwen35moe : public llm_build_delta_net_base {
|
|
605
|
+
llm_build_qwen35moe(const llama_model & model, const llm_graph_params & params);
|
|
606
|
+
private:
|
|
607
|
+
ggml_tensor * build_layer_attn(
|
|
608
|
+
llm_graph_input_attn_kv * inp_attn,
|
|
609
|
+
ggml_tensor * cur,
|
|
610
|
+
ggml_tensor * inp_pos,
|
|
611
|
+
int * sections,
|
|
612
|
+
int il);
|
|
613
|
+
|
|
614
|
+
ggml_tensor * build_layer_attn_linear(
|
|
615
|
+
llm_graph_input_rs * inp,
|
|
616
|
+
ggml_tensor * cur,
|
|
617
|
+
int il);
|
|
618
|
+
|
|
619
|
+
ggml_tensor * build_layer_ffn(
|
|
620
|
+
ggml_tensor * cur,
|
|
621
|
+
int il);
|
|
491
622
|
|
|
492
623
|
ggml_tensor * build_norm_gated(
|
|
493
624
|
ggml_tensor * input,
|
|
@@ -552,6 +683,10 @@ struct llm_build_starcoder : public llm_graph_context {
|
|
|
552
683
|
llm_build_starcoder(const llama_model & model, const llm_graph_params & params);
|
|
553
684
|
};
|
|
554
685
|
|
|
686
|
+
struct llm_build_step35_iswa : public llm_graph_context {
|
|
687
|
+
llm_build_step35_iswa(const llama_model & model, const llm_graph_params & params);
|
|
688
|
+
};
|
|
689
|
+
|
|
555
690
|
struct llm_build_t5_dec : public llm_graph_context {
|
|
556
691
|
llm_build_t5_dec(const llama_model & model, const llm_graph_params & params);
|
|
557
692
|
};
|
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
#include "models.h"
|
|
2
2
|
|
|
3
3
|
llm_build_modern_bert::llm_build_modern_bert(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
|
4
|
-
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
4
|
+
const int64_t n_embd_head = hparams.n_embd_head_v();
|
|
5
5
|
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
|
6
6
|
|
|
7
|
-
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
7
|
+
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
|
8
8
|
|
|
9
9
|
ggml_tensor * cur;
|
|
10
10
|
ggml_tensor * inpL;
|
|
@@ -104,13 +104,6 @@ llm_build_modern_bert::llm_build_modern_bert(const llama_model & model, const ll
|
|
|
104
104
|
LLM_NORM, -1);
|
|
105
105
|
cb(cur, "final_norm_out", -1);
|
|
106
106
|
|
|
107
|
-
if (hparams.pooling_type == LLAMA_POOLING_TYPE_CLS) {
|
|
108
|
-
// extracting cls token
|
|
109
|
-
cur = ggml_view_1d(ctx0, cur, hparams.n_embd, 0);
|
|
110
|
-
cb(cur, "cls_pooled_embd", -1);
|
|
111
|
-
}
|
|
112
|
-
|
|
113
|
-
cb(cur, "res_embd", -1);
|
|
114
107
|
res->t_embd = cur;
|
|
115
108
|
ggml_build_forward_expand(gf, cur);
|
|
116
109
|
}
|
|
@@ -3,10 +3,10 @@
|
|
|
3
3
|
|
|
4
4
|
|
|
5
5
|
llm_build_mpt::llm_build_mpt(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
|
6
|
-
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
6
|
+
const int64_t n_embd_head = hparams.n_embd_head_v();
|
|
7
7
|
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
|
8
8
|
|
|
9
|
-
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
9
|
+
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
|
10
10
|
|
|
11
11
|
ggml_tensor * cur;
|
|
12
12
|
ggml_tensor * pos;
|
|
@@ -1,11 +1,9 @@
|
|
|
1
1
|
#include "models.h"
|
|
2
2
|
|
|
3
|
-
|
|
4
|
-
|
|
5
3
|
llm_build_nemotron_h::llm_build_nemotron_h(const llama_model & model, const llm_graph_params & params) :
|
|
6
|
-
|
|
7
|
-
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
8
|
-
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
4
|
+
llm_build_mamba_base(params) {
|
|
5
|
+
const int64_t n_embd_head = hparams.n_embd_head_v();
|
|
6
|
+
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
|
9
7
|
|
|
10
8
|
ggml_tensor * cur;
|
|
11
9
|
ggml_tensor * inpL;
|
|
@@ -65,9 +63,9 @@ llm_build_nemotron_h::llm_build_nemotron_h(const llama_model & model, const llm_
|
|
|
65
63
|
ggml_tensor * llm_build_nemotron_h::build_attention_layer(ggml_tensor * cur,
|
|
66
64
|
llm_graph_input_attn_kv * inp_attn,
|
|
67
65
|
const llama_model & model,
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
// compute Q and K
|
|
66
|
+
int64_t n_embd_head,
|
|
67
|
+
int il) {
|
|
68
|
+
// compute Q and K
|
|
71
69
|
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
|
72
70
|
cb(Qcur, "Qcur", il);
|
|
73
71
|
if (model.layers[il].bq) {
|
|
@@ -106,7 +104,7 @@ ggml_tensor * llm_build_nemotron_h::build_attention_layer(ggml_tensor *
|
|
|
106
104
|
return cur;
|
|
107
105
|
}
|
|
108
106
|
|
|
109
|
-
ggml_tensor * llm_build_nemotron_h::build_ffn_layer(ggml_tensor * cur, const llama_model & model,
|
|
107
|
+
ggml_tensor * llm_build_nemotron_h::build_ffn_layer(ggml_tensor * cur, const llama_model & model, int il) {
|
|
110
108
|
if (model.layers[il].ffn_gate_inp == nullptr) {
|
|
111
109
|
cur = build_ffn(cur,
|
|
112
110
|
model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
|
|
@@ -116,9 +114,18 @@ ggml_tensor * llm_build_nemotron_h::build_ffn_layer(ggml_tensor * cur, const lla
|
|
|
116
114
|
LLM_FFN_RELU_SQR, LLM_FFN_PAR, il);
|
|
117
115
|
cb(cur, "ffn_out", il);
|
|
118
116
|
} else {
|
|
119
|
-
ggml_tensor *
|
|
117
|
+
ggml_tensor * inp_emb = cur;
|
|
118
|
+
ggml_tensor * inp_latent = cur;
|
|
119
|
+
|
|
120
|
+
if (model.layers[il].ffn_latent_down) {
|
|
121
|
+
inp_latent = ggml_mul_mat(ctx0, model.layers[il].ffn_latent_down, cur);
|
|
122
|
+
}
|
|
123
|
+
|
|
124
|
+
ggml_tensor * router_logits = build_lora_mm(model.layers[il].ffn_gate_inp, cur);
|
|
125
|
+
cb(router_logits, "ffn_moe_logits", il);
|
|
126
|
+
|
|
120
127
|
ggml_tensor * moe_out =
|
|
121
|
-
build_moe_ffn(
|
|
128
|
+
build_moe_ffn(inp_latent,
|
|
122
129
|
model.layers[il].ffn_gate_inp,
|
|
123
130
|
model.layers[il].ffn_up_exps,
|
|
124
131
|
nullptr, // no gate
|
|
@@ -126,12 +133,17 @@ ggml_tensor * llm_build_nemotron_h::build_ffn_layer(ggml_tensor * cur, const lla
|
|
|
126
133
|
model.layers[il].ffn_exp_probs_b,
|
|
127
134
|
n_expert, n_expert_used,
|
|
128
135
|
LLM_FFN_RELU_SQR, hparams.expert_weights_norm,
|
|
129
|
-
|
|
136
|
+
hparams.expert_weights_scale,
|
|
130
137
|
LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID,
|
|
131
|
-
il
|
|
138
|
+
il,
|
|
139
|
+
router_logits);
|
|
132
140
|
cb(moe_out, "ffn_moe_out", il);
|
|
133
141
|
|
|
134
|
-
|
|
142
|
+
if (model.layers[il].ffn_latent_up) {
|
|
143
|
+
moe_out = ggml_mul_mat(ctx0, model.layers[il].ffn_latent_up, moe_out);
|
|
144
|
+
}
|
|
145
|
+
|
|
146
|
+
ggml_tensor * ffn_shexp = build_ffn(inp_emb,
|
|
135
147
|
model.layers[il].ffn_up_shexp, NULL, NULL,
|
|
136
148
|
NULL /* no gate */ , NULL, NULL,
|
|
137
149
|
model.layers[il].ffn_down_shexp, NULL, NULL,
|
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
#include "models.h"
|
|
2
2
|
|
|
3
3
|
llm_build_nemotron::llm_build_nemotron(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
|
4
|
-
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
4
|
+
const int64_t n_embd_head = hparams.n_embd_head_v();
|
|
5
5
|
|
|
6
|
-
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
7
|
-
//GGML_ASSERT(n_embd_head ==
|
|
6
|
+
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
|
7
|
+
//GGML_ASSERT(n_embd_head == n_rot);
|
|
8
8
|
|
|
9
9
|
ggml_tensor * cur;
|
|
10
10
|
ggml_tensor * inpL;
|
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
#include "models.h"
|
|
2
2
|
|
|
3
3
|
llm_build_neo_bert::llm_build_neo_bert(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
|
4
|
-
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
4
|
+
const int64_t n_embd_head = hparams.n_embd_head_v();
|
|
5
5
|
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
|
6
6
|
|
|
7
|
-
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
7
|
+
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
|
8
8
|
|
|
9
9
|
ggml_tensor * cur;
|
|
10
10
|
ggml_tensor * inpL;
|
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
#include "models.h"
|
|
2
2
|
|
|
3
3
|
llm_build_olmo::llm_build_olmo(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
|
4
|
-
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
4
|
+
const int64_t n_embd_head = hparams.n_embd_head_v();
|
|
5
5
|
|
|
6
|
-
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
7
|
-
GGML_ASSERT(n_embd_head ==
|
|
6
|
+
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
|
7
|
+
GGML_ASSERT(n_embd_head == n_rot);
|
|
8
8
|
|
|
9
9
|
ggml_tensor * cur;
|
|
10
10
|
ggml_tensor * inpL;
|
|
@@ -2,10 +2,10 @@
|
|
|
2
2
|
|
|
3
3
|
template <bool iswa>
|
|
4
4
|
llm_build_olmo2<iswa>::llm_build_olmo2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
|
5
|
-
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
5
|
+
const int64_t n_embd_head = hparams.n_embd_head_v();
|
|
6
6
|
|
|
7
|
-
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
8
|
-
GGML_ASSERT(n_embd_head ==
|
|
7
|
+
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
|
8
|
+
GGML_ASSERT(n_embd_head == n_rot);
|
|
9
9
|
|
|
10
10
|
ggml_tensor * cur;
|
|
11
11
|
ggml_tensor * inpL;
|
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
#include "models.h"
|
|
2
2
|
|
|
3
3
|
llm_build_olmoe::llm_build_olmoe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
|
4
|
-
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
4
|
+
const int64_t n_embd_head = hparams.n_embd_head_v();
|
|
5
5
|
|
|
6
|
-
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
7
|
-
GGML_ASSERT(n_embd_head ==
|
|
6
|
+
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
|
7
|
+
GGML_ASSERT(n_embd_head == n_rot);
|
|
8
8
|
|
|
9
9
|
ggml_tensor * cur;
|
|
10
10
|
ggml_tensor * inpL;
|
|
@@ -92,7 +92,7 @@ llm_build_olmoe::llm_build_olmoe(const llama_model & model, const llm_graph_para
|
|
|
92
92
|
nullptr,
|
|
93
93
|
n_expert, n_expert_used,
|
|
94
94
|
LLM_FFN_SILU, false,
|
|
95
|
-
|
|
95
|
+
hparams.expert_weights_scale,
|
|
96
96
|
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
|
|
97
97
|
il);
|
|
98
98
|
cb(cur, "ffn_moe_out", il);
|
|
@@ -95,7 +95,7 @@ llm_build_openai_moe_iswa::llm_build_openai_moe_iswa(const llama_model & model,
|
|
|
95
95
|
nullptr,
|
|
96
96
|
n_expert, n_expert_used,
|
|
97
97
|
LLM_FFN_SWIGLU_OAI_MOE, false,
|
|
98
|
-
|
|
98
|
+
hparams.expert_weights_scale,
|
|
99
99
|
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX_WEIGHT,
|
|
100
100
|
il);
|
|
101
101
|
cb(cur, "ffn_moe_out", il);
|
|
@@ -1,9 +1,9 @@
|
|
|
1
1
|
#include "models.h"
|
|
2
2
|
|
|
3
3
|
llm_build_openelm::llm_build_openelm(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
|
4
|
-
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
4
|
+
const int64_t n_embd_head = hparams.n_embd_head_v();
|
|
5
5
|
|
|
6
|
-
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
6
|
+
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
|
7
7
|
|
|
8
8
|
ggml_tensor * cur;
|
|
9
9
|
ggml_tensor * inpL;
|
|
@@ -43,7 +43,7 @@ llm_build_openelm::llm_build_openelm(const llama_model & model, const llm_graph_
|
|
|
43
43
|
ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, cur->nb[1], cur->nb[2], cur->nb[1]*n_head);
|
|
44
44
|
cb(Kcur, "Kcur", il);
|
|
45
45
|
|
|
46
|
-
ggml_tensor * Vcur =
|
|
46
|
+
ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, cur->nb[1], cur->nb[2], cur->nb[1]*(n_head+n_head_kv));
|
|
47
47
|
cb(Vcur, "Vcur", il);
|
|
48
48
|
|
|
49
49
|
Qcur = build_norm(Qcur,
|
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
#include "models.h"
|
|
2
2
|
|
|
3
3
|
llm_build_orion::llm_build_orion(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
|
4
|
-
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
4
|
+
const int64_t n_embd_head = hparams.n_embd_head_v();
|
|
5
5
|
|
|
6
|
-
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
7
|
-
GGML_ASSERT(n_embd_head ==
|
|
6
|
+
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
|
7
|
+
GGML_ASSERT(n_embd_head == n_rot);
|
|
8
8
|
|
|
9
9
|
ggml_tensor * cur;
|
|
10
10
|
ggml_tensor * inpL;
|