whispercpp 1.3.5 → 1.3.6
This diff summarizes the changes between publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/LICENSE +1 -1
- data/README.md +99 -2
- data/ext/extconf.rb +1 -0
- data/ext/ruby_whisper.c +20 -4
- data/ext/ruby_whisper.h +30 -2
- data/ext/ruby_whisper_context.c +216 -124
- data/ext/ruby_whisper_context_params.c +163 -0
- data/ext/ruby_whisper_model.c +0 -1
- data/ext/ruby_whisper_params.c +0 -1
- data/ext/ruby_whisper_segment.c +0 -1
- data/ext/ruby_whisper_token.c +29 -9
- data/ext/ruby_whisper_transcribe.cpp +4 -1
- data/ext/ruby_whisper_vad_context.c +48 -1
- data/ext/ruby_whisper_vad_context_detect.cpp +6 -5
- data/ext/ruby_whisper_vad_params.c +0 -1
- data/ext/ruby_whisper_vad_segment.c +0 -1
- data/ext/ruby_whisper_vad_segments.c +0 -1
- data/ext/sources/CMakeLists.txt +1 -1
- data/ext/sources/bindings/javascript/package.json +1 -1
- data/ext/sources/cmake/whisper-config.cmake.in +5 -40
- data/ext/sources/examples/bench/bench.cpp +23 -18
- data/ext/sources/examples/cli/cli.cpp +8 -0
- data/ext/sources/examples/common-ggml.cpp +2 -0
- data/ext/sources/examples/miniaudio.h +4507 -2131
- data/ext/sources/examples/server/server.cpp +18 -4
- data/ext/sources/examples/talk-llama/CMakeLists.txt +3 -2
- data/ext/sources/examples/talk-llama/llama-adapter.cpp +7 -13
- data/ext/sources/examples/talk-llama/llama-adapter.h +4 -3
- data/ext/sources/examples/talk-llama/llama-arch.cpp +335 -17
- data/ext/sources/examples/talk-llama/llama-arch.h +42 -0
- data/ext/sources/examples/talk-llama/llama-batch.cpp +3 -1
- data/ext/sources/examples/talk-llama/llama-chat.cpp +21 -1
- data/ext/sources/examples/talk-llama/llama-chat.h +1 -0
- data/ext/sources/examples/talk-llama/llama-context.cpp +508 -520
- data/ext/sources/examples/talk-llama/llama-context.h +27 -28
- data/ext/sources/examples/talk-llama/llama-cparams.h +5 -0
- data/ext/sources/examples/talk-llama/llama-ext.h +12 -0
- data/ext/sources/examples/talk-llama/llama-grammar.cpp +8 -8
- data/ext/sources/examples/talk-llama/llama-graph.cpp +583 -130
- data/ext/sources/examples/talk-llama/llama-graph.h +131 -10
- data/ext/sources/examples/talk-llama/llama-hparams.cpp +57 -40
- data/ext/sources/examples/talk-llama/llama-hparams.h +79 -10
- data/ext/sources/examples/talk-llama/llama-impl.cpp +4 -4
- data/ext/sources/examples/talk-llama/llama-impl.h +13 -1
- data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.cpp +3 -1
- data/ext/sources/examples/talk-llama/llama-kv-cache.cpp +274 -89
- data/ext/sources/examples/talk-llama/llama-kv-cache.h +2 -3
- data/ext/sources/examples/talk-llama/llama-memory-hybrid-iswa.cpp +275 -0
- data/ext/sources/examples/talk-llama/llama-memory-hybrid-iswa.h +140 -0
- data/ext/sources/examples/talk-llama/llama-memory-recurrent.cpp +11 -13
- data/ext/sources/examples/talk-llama/llama-mmap.cpp +28 -11
- data/ext/sources/examples/talk-llama/llama-model-loader.cpp +527 -119
- data/ext/sources/examples/talk-llama/llama-model-loader.h +35 -5
- data/ext/sources/examples/talk-llama/llama-model-saver.cpp +60 -46
- data/ext/sources/examples/talk-llama/llama-model-saver.h +5 -2
- data/ext/sources/examples/talk-llama/llama-model.cpp +1365 -647
- data/ext/sources/examples/talk-llama/llama-model.h +72 -19
- data/ext/sources/examples/talk-llama/llama-quant.cpp +578 -346
- data/ext/sources/examples/talk-llama/{llama-sampling.cpp → llama-sampler.cpp} +190 -76
- data/ext/sources/examples/talk-llama/{llama-sampling.h → llama-sampler.h} +0 -2
- data/ext/sources/examples/talk-llama/llama-vocab.cpp +118 -48
- data/ext/sources/examples/talk-llama/llama-vocab.h +5 -0
- data/ext/sources/examples/talk-llama/llama.cpp +76 -22
- data/ext/sources/examples/talk-llama/llama.h +63 -30
- data/ext/sources/examples/talk-llama/models/afmoe.cpp +2 -3
- data/ext/sources/examples/talk-llama/models/apertus.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/arcee.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/arctic.cpp +4 -5
- data/ext/sources/examples/talk-llama/models/baichuan.cpp +4 -3
- data/ext/sources/examples/talk-llama/models/bailingmoe.cpp +1 -2
- data/ext/sources/examples/talk-llama/models/bailingmoe2.cpp +3 -5
- data/ext/sources/examples/talk-llama/models/bert.cpp +13 -7
- data/ext/sources/examples/talk-llama/models/bitnet.cpp +9 -24
- data/ext/sources/examples/talk-llama/models/bloom.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/chameleon.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/chatglm.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/codeshell.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/cogvlm.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/cohere2-iswa.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/command-r.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/dbrx.cpp +4 -5
- data/ext/sources/examples/talk-llama/models/deci.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/deepseek.cpp +4 -6
- data/ext/sources/examples/talk-llama/models/deepseek2.cpp +24 -21
- data/ext/sources/examples/talk-llama/models/delta-net-base.cpp +445 -0
- data/ext/sources/examples/talk-llama/models/dots1.cpp +4 -6
- data/ext/sources/examples/talk-llama/models/dream.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/ernie4-5-moe.cpp +4 -6
- data/ext/sources/examples/talk-llama/models/ernie4-5.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/eurobert.cpp +97 -0
- data/ext/sources/examples/talk-llama/models/exaone-moe.cpp +145 -0
- data/ext/sources/examples/talk-llama/models/exaone.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/exaone4.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/falcon-h1.cpp +2 -4
- data/ext/sources/examples/talk-llama/models/falcon.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/gemma-embedding.cpp +1 -1
- data/ext/sources/examples/talk-llama/models/gemma.cpp +1 -1
- data/ext/sources/examples/talk-llama/models/gemma2-iswa.cpp +1 -1
- data/ext/sources/examples/talk-llama/models/gemma3.cpp +1 -1
- data/ext/sources/examples/talk-llama/models/gemma3n-iswa.cpp +7 -7
- data/ext/sources/examples/talk-llama/models/glm4-moe.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/glm4.cpp +14 -7
- data/ext/sources/examples/talk-llama/models/gpt2.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/gptneox.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/granite-hybrid.cpp +4 -5
- data/ext/sources/examples/talk-llama/models/granite.cpp +4 -5
- data/ext/sources/examples/talk-llama/models/grok.cpp +4 -4
- data/ext/sources/examples/talk-llama/models/grovemoe.cpp +5 -7
- data/ext/sources/examples/talk-llama/models/hunyuan-dense.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/hunyuan-moe.cpp +4 -5
- data/ext/sources/examples/talk-llama/models/internlm2.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/jais.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/jais2.cpp +123 -0
- data/ext/sources/examples/talk-llama/models/jamba.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/kimi-linear.cpp +381 -0
- data/ext/sources/examples/talk-llama/models/lfm2.cpp +145 -124
- data/ext/sources/examples/talk-llama/models/llada-moe.cpp +4 -4
- data/ext/sources/examples/talk-llama/models/llada.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/llama-iswa.cpp +4 -4
- data/ext/sources/examples/talk-llama/models/llama.cpp +18 -11
- data/ext/sources/examples/talk-llama/models/maincoder.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/{graph-context-mamba.cpp → mamba-base.cpp} +9 -3
- data/ext/sources/examples/talk-llama/models/mamba.cpp +1 -2
- data/ext/sources/examples/talk-llama/models/mimo2-iswa.cpp +11 -5
- data/ext/sources/examples/talk-llama/models/minicpm3.cpp +14 -13
- data/ext/sources/examples/talk-llama/models/minimax-m2.cpp +4 -5
- data/ext/sources/examples/talk-llama/models/mistral3.cpp +4 -4
- data/ext/sources/examples/talk-llama/models/models.h +181 -46
- data/ext/sources/examples/talk-llama/models/modern-bert.cpp +2 -9
- data/ext/sources/examples/talk-llama/models/mpt.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/nemotron-h.cpp +26 -14
- data/ext/sources/examples/talk-llama/models/nemotron.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/neo-bert.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/olmo.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/olmo2.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/olmoe.cpp +4 -4
- data/ext/sources/examples/talk-llama/models/openai-moe-iswa.cpp +1 -1
- data/ext/sources/examples/talk-llama/models/openelm.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/orion.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/paddleocr.cpp +122 -0
- data/ext/sources/examples/talk-llama/models/pangu-embedded.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/phi2.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/phi3.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/plamo.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/plamo2.cpp +9 -5
- data/ext/sources/examples/talk-llama/models/plamo3.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/plm.cpp +15 -14
- data/ext/sources/examples/talk-llama/models/qwen.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/qwen2.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/qwen2moe.cpp +4 -4
- data/ext/sources/examples/talk-llama/models/qwen2vl.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/qwen3.cpp +12 -9
- data/ext/sources/examples/talk-llama/models/qwen35.cpp +381 -0
- data/ext/sources/examples/talk-llama/models/qwen35moe.cpp +422 -0
- data/ext/sources/examples/talk-llama/models/qwen3moe.cpp +15 -8
- data/ext/sources/examples/talk-llama/models/qwen3next.cpp +84 -432
- data/ext/sources/examples/talk-llama/models/qwen3vl-moe.cpp +9 -18
- data/ext/sources/examples/talk-llama/models/qwen3vl.cpp +8 -17
- data/ext/sources/examples/talk-llama/models/refact.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/rnd1.cpp +4 -4
- data/ext/sources/examples/talk-llama/models/rwkv6-base.cpp +2 -0
- data/ext/sources/examples/talk-llama/models/rwkv7-base.cpp +2 -0
- data/ext/sources/examples/talk-llama/models/seed-oss.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/smallthinker.cpp +4 -4
- data/ext/sources/examples/talk-llama/models/smollm3.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/stablelm.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/starcoder.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/starcoder2.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/step35-iswa.cpp +165 -0
- data/ext/sources/examples/talk-llama/models/t5-dec.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/t5-enc.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/xverse.cpp +3 -3
- data/ext/sources/examples/talk-llama/unicode.cpp +21 -65
- data/ext/sources/ggml/CMakeLists.txt +9 -3
- data/ext/sources/ggml/include/ggml-backend.h +1 -1
- data/ext/sources/ggml/include/ggml-cann.h +1 -1
- data/ext/sources/ggml/include/ggml-cpu.h +5 -0
- data/ext/sources/ggml/include/ggml-openvino.h +37 -0
- data/ext/sources/ggml/include/ggml-opt.h +1 -1
- data/ext/sources/ggml/include/ggml-rpc.h +6 -1
- data/ext/sources/ggml/include/ggml-virtgpu.h +14 -0
- data/ext/sources/ggml/include/ggml.h +56 -9
- data/ext/sources/ggml/src/CMakeLists.txt +3 -0
- data/ext/sources/ggml/src/ggml-alloc.c +4 -9
- data/ext/sources/ggml/src/ggml-backend-dl.cpp +48 -0
- data/ext/sources/ggml/src/ggml-backend-dl.h +45 -0
- data/ext/sources/ggml/src/ggml-backend-reg.cpp +28 -86
- data/ext/sources/ggml/src/ggml-backend.cpp +5 -2
- data/ext/sources/ggml/src/ggml-blas/CMakeLists.txt +1 -1
- data/ext/sources/ggml/src/ggml-blas/ggml-blas.cpp +6 -2
- data/ext/sources/ggml/src/ggml-cann/acl_tensor.cpp +1 -1
- data/ext/sources/ggml/src/ggml-cann/acl_tensor.h +1 -1
- data/ext/sources/ggml/src/ggml-cann/aclnn_ops.cpp +348 -189
- data/ext/sources/ggml/src/ggml-cann/aclnn_ops.h +40 -85
- data/ext/sources/ggml/src/ggml-cann/common.h +3 -4
- data/ext/sources/ggml/src/ggml-cann/ggml-cann.cpp +44 -62
- data/ext/sources/ggml/src/ggml-common.h +11 -0
- data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +16 -11
- data/ext/sources/ggml/src/ggml-cpu/amx/amx.cpp +42 -19
- data/ext/sources/ggml/src/ggml-cpu/amx/common.h +34 -10
- data/ext/sources/ggml/src/ggml-cpu/amx/mmq.cpp +85 -85
- data/ext/sources/ggml/src/ggml-cpu/arch/arm/quants.c +85 -1
- data/ext/sources/ggml/src/ggml-cpu/arch/arm/repack.cpp +2744 -548
- data/ext/sources/ggml/src/ggml-cpu/arch/riscv/quants.c +1653 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/riscv/repack.cpp +1391 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/s390/quants.c +8 -10
- data/ext/sources/ggml/src/ggml-cpu/arch/x86/quants.c +9 -9
- data/ext/sources/ggml/src/ggml-cpu/arch/x86/repack.cpp +118 -18
- data/ext/sources/ggml/src/ggml-cpu/arch-fallback.h +107 -26
- data/ext/sources/ggml/src/ggml-cpu/binary-ops.cpp +2 -6
- data/ext/sources/ggml/src/ggml-cpu/common.h +8 -0
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-impl.h +3 -0
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +59 -12
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.cpp +15 -0
- data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.cpp +21 -20
- data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +965 -252
- data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +584 -197
- data/ext/sources/ggml/src/ggml-cpu/ops.cpp +903 -188
- data/ext/sources/ggml/src/ggml-cpu/ops.h +1 -0
- data/ext/sources/ggml/src/ggml-cpu/quants.c +40 -0
- data/ext/sources/ggml/src/ggml-cpu/quants.h +3 -0
- data/ext/sources/ggml/src/ggml-cpu/repack.cpp +2890 -679
- data/ext/sources/ggml/src/ggml-cpu/repack.h +119 -8
- data/ext/sources/ggml/src/ggml-cpu/simd-gemm.h +136 -0
- data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +111 -3
- data/ext/sources/ggml/src/ggml-cpu/unary-ops.cpp +1 -1
- data/ext/sources/ggml/src/ggml-cpu/vec.cpp +17 -0
- data/ext/sources/ggml/src/ggml-cuda/CMakeLists.txt +1 -1
- data/ext/sources/ggml/src/ggml-cuda/argsort.cu +19 -10
- data/ext/sources/ggml/src/ggml-cuda/binbcast.cu +32 -30
- data/ext/sources/ggml/src/ggml-cuda/common.cuh +134 -18
- data/ext/sources/ggml/src/ggml-cuda/convert.cu +41 -27
- data/ext/sources/ggml/src/ggml-cuda/cpy.cu +6 -3
- data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +78 -64
- data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +384 -143
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cuh +36 -22
- data/ext/sources/ggml/src/ggml-cuda/fattn-vec.cuh +3 -3
- data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +26 -5
- data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cuh +1 -1
- data/ext/sources/ggml/src/ggml-cuda/fattn.cu +127 -12
- data/ext/sources/ggml/src/ggml-cuda/gated_delta_net.cu +263 -0
- data/ext/sources/ggml/src/ggml-cuda/gated_delta_net.cuh +4 -0
- data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +595 -200
- data/ext/sources/ggml/src/ggml-cuda/mean.cu +9 -8
- data/ext/sources/ggml/src/ggml-cuda/mma.cuh +173 -6
- data/ext/sources/ggml/src/ggml-cuda/mmf.cu +30 -10
- data/ext/sources/ggml/src/ggml-cuda/mmf.cuh +158 -85
- data/ext/sources/ggml/src/ggml-cuda/mmq.cuh +34 -22
- data/ext/sources/ggml/src/ggml-cuda/mmvf.cu +127 -67
- data/ext/sources/ggml/src/ggml-cuda/mmvf.cuh +2 -0
- data/ext/sources/ggml/src/ggml-cuda/mmvq.cu +157 -65
- data/ext/sources/ggml/src/ggml-cuda/mmvq.cuh +1 -0
- data/ext/sources/ggml/src/ggml-cuda/norm.cu +18 -76
- data/ext/sources/ggml/src/ggml-cuda/pad.cu +13 -10
- data/ext/sources/ggml/src/ggml-cuda/quantize.cu +1 -1
- data/ext/sources/ggml/src/ggml-cuda/reduce_rows.cuh +2 -16
- data/ext/sources/ggml/src/ggml-cuda/rope.cu +233 -133
- data/ext/sources/ggml/src/ggml-cuda/softmax.cu +8 -83
- data/ext/sources/ggml/src/ggml-cuda/solve_tri.cu +1 -1
- data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cu +56 -32
- data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cuh +1 -1
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_32.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_4.cu +1 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_32.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_4.cu +1 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_4.cu +1 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_4.cu +1 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +3 -3
- data/ext/sources/ggml/src/ggml-cuda/top-k.cu +0 -1
- data/ext/sources/ggml/src/ggml-cuda/topk-moe.cu +199 -135
- data/ext/sources/ggml/src/ggml-cuda/topk-moe.cuh +20 -14
- data/ext/sources/ggml/src/ggml-cuda/unary.cu +55 -0
- data/ext/sources/ggml/src/ggml-cuda/unary.cuh +2 -0
- data/ext/sources/ggml/src/ggml-cuda/vecdotq.cuh +31 -17
- data/ext/sources/ggml/src/ggml-cuda/vendors/hip.h +10 -0
- data/ext/sources/ggml/src/ggml-hexagon/CMakeLists.txt +82 -45
- data/ext/sources/ggml/src/ggml-hexagon/ggml-hexagon.cpp +334 -160
- data/ext/sources/ggml/src/ggml-hexagon/htp/CMakeLists.txt +7 -5
- data/ext/sources/ggml/src/ggml-hexagon/htp/act-ops.c +328 -197
- data/ext/sources/ggml/src/ggml-hexagon/htp/argsort-ops.c +281 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/binary-ops.c +765 -234
- data/ext/sources/ggml/src/ggml-hexagon/htp/cpy-ops.c +252 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/flash-attn-ops.c +412 -265
- data/ext/sources/ggml/src/ggml-hexagon/htp/get-rows-ops.c +23 -23
- data/ext/sources/ggml/src/ggml-hexagon/htp/{htp-dma.c → hex-dma.c} +1 -1
- data/ext/sources/ggml/src/ggml-hexagon/htp/{htp-dma.h → hex-dma.h} +28 -3
- data/ext/sources/ggml/src/ggml-hexagon/htp/hex-dump.h +77 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hex-fastdiv.h +37 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hex-utils.h +51 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ctx.h +1 -1
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp-msg.h +27 -37
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ops.h +6 -35
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-arith.h +443 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-base.h +240 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-copy.h +245 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-div.h +251 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-dump.h +129 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-exp.h +215 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-floor.h +100 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-inverse.h +210 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-reduce.h +296 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-scale.h +133 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sigmoid.h +141 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sqrt.h +126 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-types.h +36 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-utils.h +20 -1347
- data/ext/sources/ggml/src/ggml-hexagon/htp/main.c +211 -13
- data/ext/sources/ggml/src/ggml-hexagon/htp/matmul-ops.c +1119 -952
- data/ext/sources/ggml/src/ggml-hexagon/htp/rope-ops.c +254 -244
- data/ext/sources/ggml/src/ggml-hexagon/htp/set-rows-ops.c +36 -36
- data/ext/sources/ggml/src/ggml-hexagon/htp/softmax-ops.c +155 -138
- data/ext/sources/ggml/src/ggml-hexagon/htp/ssm-conv.c +339 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/sum-rows-ops.c +128 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/unary-ops.c +209 -114
- data/ext/sources/ggml/src/ggml-hexagon/htp/worker-pool.c +1 -5
- data/ext/sources/ggml/src/ggml-hexagon/htp-drv.cpp +418 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp-drv.h +121 -0
- data/ext/sources/ggml/src/ggml-hexagon/libdl.h +79 -0
- data/ext/sources/ggml/src/ggml-hexagon/libggml-htp.inf +38 -0
- data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +6 -0
- data/ext/sources/ggml/src/ggml-impl.h +62 -0
- data/ext/sources/ggml/src/ggml-metal/CMakeLists.txt +10 -10
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-common.cpp +13 -2
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.h +8 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.m +147 -17
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.cpp +274 -73
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.h +22 -4
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.m +102 -36
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +174 -23
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.cpp +580 -280
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.h +5 -4
- data/ext/sources/ggml/src/ggml-metal/ggml-metal.cpp +320 -107
- data/ext/sources/ggml/src/ggml-metal/ggml-metal.metal +1068 -825
- data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +19 -1
- data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +3108 -636
- data/ext/sources/ggml/src/ggml-opencl/kernels/concat.cl +41 -99
- data/ext/sources/ggml/src/ggml-opencl/kernels/cpy.cl +45 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/cumsum.cl +139 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/cvt.cl +204 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/diag.cl +27 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/exp.cl +125 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/expm1.cl +87 -56
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q4_1_f32.cl +132 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_general_q8_0_f32.cl +195 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q4_1_f32.cl +283 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/l2_norm.cl +71 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mean.cl +114 -13
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q4_0_f32_l4_lm.cl +163 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q4_1_f32_l4_lm.cl +165 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q6_k_f32_l4_lm.cl +158 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_8x4.cl +129 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32.cl +219 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32_flat.cl +229 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_k_f32.cl +180 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/{mul_mv_q6_k.cl → mul_mv_q6_k_f32.cl} +4 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32_flat.cl +194 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/neg.cl +125 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/repeat.cl +31 -32
- data/ext/sources/ggml/src/ggml-opencl/kernels/scale.cl +14 -4
- data/ext/sources/ggml/src/ggml-opencl/kernels/softplus.cl +88 -60
- data/ext/sources/ggml/src/ggml-opencl/kernels/solve_tri.cl +51 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/sum_rows.cl +114 -13
- data/ext/sources/ggml/src/ggml-opencl/kernels/tanh.cl +94 -48
- data/ext/sources/ggml/src/ggml-opencl/kernels/transpose.cl +26 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/tri.cl +32 -0
- data/ext/sources/ggml/src/ggml-openvino/.clang-format +154 -0
- data/ext/sources/ggml/src/ggml-openvino/CMakeLists.txt +22 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-decoder.cpp +975 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-decoder.h +294 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-openvino-extra.cpp +373 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-openvino-extra.h +182 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-openvino.cpp +1110 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-quants.cpp +884 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-quants.h +153 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/decoder.h +74 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/frontend.cpp +27 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/frontend.h +23 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/input_model.cpp +17 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/input_model.h +29 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/node_context.h +112 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/cont.cpp +48 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/cpy.cpp +21 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp +90 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/get_rows.cpp +69 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/glu_geglu.cpp +61 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp +62 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/mulmat.cpp +90 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/permute.cpp +102 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/reshape.cpp +83 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp +46 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/rope.cpp +123 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/scale.cpp +41 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/set_rows.cpp +76 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/softmax.cpp +89 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/transpose.cpp +23 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/unary_silu.cpp +27 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/view.cpp +53 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op_table.cpp +46 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op_table.h +39 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.cpp +123 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.h +17 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.cpp +60 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.h +17 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/mark_decompression_convert_constant_folding.h +29 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/squeeze_matmul.cpp +58 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/squeeze_matmul.h +17 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/translate_session.cpp +293 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/translate_session.h +28 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/utils.cpp +226 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/utils.h +85 -0
- data/ext/sources/ggml/src/ggml-openvino/utils.cpp +823 -0
- data/ext/sources/ggml/src/ggml-openvino/utils.h +123 -0
- data/ext/sources/ggml/src/ggml-quants.c +96 -5
- data/ext/sources/ggml/src/ggml-quants.h +3 -0
- data/ext/sources/ggml/src/ggml-sycl/CMakeLists.txt +15 -88
- data/ext/sources/ggml/src/ggml-sycl/add-id.cpp +5 -1
- data/ext/sources/ggml/src/ggml-sycl/backend.hpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/binbcast.cpp +21 -20
- data/ext/sources/ggml/src/ggml-sycl/common.hpp +315 -10
- data/ext/sources/ggml/src/ggml-sycl/convert.cpp +69 -1
- data/ext/sources/ggml/src/ggml-sycl/convert.hpp +22 -1
- data/ext/sources/ggml/src/ggml-sycl/count-equal.cpp +1 -1
- data/ext/sources/ggml/src/ggml-sycl/dpct/helper.hpp +791 -47
- data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +78 -68
- data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +2 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn-common.hpp +1179 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn-tile.cpp +55 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn-tile.hpp +1338 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn-vec.hpp +667 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn.cpp +225 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn.hpp +22 -0
- data/ext/sources/ggml/src/ggml-sycl/gated_delta_net.cpp +309 -0
- data/ext/sources/ggml/src/ggml-sycl/gated_delta_net.hpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/ggml-sycl.cpp +316 -51
- data/ext/sources/ggml/src/ggml-sycl/norm.cpp +65 -66
- data/ext/sources/ggml/src/ggml-sycl/outprod.cpp +3 -3
- data/ext/sources/ggml/src/ggml-sycl/presets.hpp +3 -0
- data/ext/sources/ggml/src/ggml-sycl/quants.hpp +1 -1
- data/ext/sources/ggml/src/ggml-sycl/rope.cpp +450 -287
- data/ext/sources/ggml/src/ggml-sycl/rope.hpp +6 -0
- data/ext/sources/ggml/src/ggml-sycl/softmax.cpp +6 -6
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq112-dv112.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq128-dv128.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq256-dv256.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq40-dv40.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq576-dv512.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq64-dv64.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq72-dv72.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq80-dv80.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq96-dv96.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-f16.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q8_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-f16.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q8_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-f16.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q8_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-f16.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q8_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-f16.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q8_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-f16.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q8_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +13 -0
- data/ext/sources/ggml/src/ggml-sycl/wkv.cpp +1 -1
- data/ext/sources/ggml/src/ggml-virtgpu/CMakeLists.txt +70 -0
- data/ext/sources/ggml/src/ggml-virtgpu/apir_cs_ggml-rpc-front.cpp +87 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/CMakeLists.txt +21 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/apir_cs_ggml-rpc-back.cpp +115 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-convert.h +13 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-backend.cpp +102 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer-type.cpp +105 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer.cpp +179 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-device.cpp +148 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched.cpp +51 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched.gen.h +73 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched.h +27 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-virgl-apir.h +32 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend.cpp +144 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/api_remoting.h +95 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_backend.gen.h +94 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_backend.h +50 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_cs.h +378 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_cs_ggml.h +232 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_cs_rpc.h +58 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-buffer-type.cpp +81 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-buffer.cpp +119 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-device.cpp +158 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-reg.cpp +213 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend.cpp +69 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-remoting.h +71 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggmlremoting_functions.yaml +166 -0
- data/ext/sources/ggml/src/ggml-virtgpu/include/apir_hw.h +9 -0
- data/ext/sources/ggml/src/ggml-virtgpu/regenerate_remoting.py +333 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-apir.h +15 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-backend.cpp +58 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-buffer-type.cpp +110 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-buffer.cpp +173 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-device.cpp +192 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-impl.h +36 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward.gen.h +53 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-shm.cpp +98 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-shm.h +23 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-utils.cpp +179 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-utils.h +86 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu.cpp +544 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu.h +117 -0
- data/ext/sources/ggml/src/ggml-vulkan/CMakeLists.txt +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/ggml-vulkan.cpp +1250 -465
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/acc.comp +16 -8
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/elu.comp +27 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +374 -170
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.glsl +66 -22
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +389 -201
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +106 -58
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_mask_opt.comp +162 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +9 -8
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gated_delta_net.comp +128 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/l2_norm.comp +12 -9
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.glsl +20 -17
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +11 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +8 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_id_funcs.glsl +3 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +5 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.glsl +3 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +2 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl +36 -63
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +7 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +7 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +7 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl +10 -5
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_vision.comp +7 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sgn.comp +21 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/ssm_conv.comp +16 -10
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +55 -35
- data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp +1314 -109
- data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu.cpp +1660 -1371
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/argmax.wgsl +72 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/argsort.wgsl +106 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/argsort_merge.wgsl +134 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary.wgsl +141 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +65 -72
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/concat.wgsl +75 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cpy.tmpl.wgsl +6 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cumsum.wgsl +66 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +40 -5
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn.wgsl +105 -60
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{get_rows.tmpl.wgsl → get_rows.wgsl} +53 -259
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{mul_mat.tmpl.wgsl → mul_mat.wgsl} +68 -257
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl +692 -23
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{mul_mat_reg_tile.tmpl.wgsl → mul_mat_reg_tile.wgsl} +28 -128
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{mul_mat_subgroup_matrix.tmpl.wgsl → mul_mat_subgroup_matrix.wgsl} +31 -137
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.wgsl +480 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/pad.wgsl +86 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/repeat.wgsl +67 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{scale.tmpl.wgsl → scale.wgsl} +9 -36
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +40 -12
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/sum_rows.wgsl +55 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/unary.wgsl +193 -0
- data/ext/sources/ggml/src/ggml-zdnn/ggml-zdnn.cpp +6 -1
- data/ext/sources/ggml/src/ggml-zendnn/CMakeLists.txt +31 -32
- data/ext/sources/ggml/src/ggml-zendnn/ggml-zendnn.cpp +9 -6
- data/ext/sources/ggml/src/ggml.c +167 -33
- data/ext/sources/ggml/src/gguf.cpp +229 -44
- data/ext/sources/src/whisper.cpp +6 -28
- data/sig/whisper.rbs +43 -2
- data/test/test_context_params.rb +82 -0
- data/test/test_token.rb +11 -0
- data/test/test_vad_context.rb +58 -8
- data/test/test_whisper.rb +20 -0
- data/whispercpp.gemspec +1 -1
- metadata +240 -28
- data/ext/sources/ggml/cmake/BuildTypes.cmake +0 -54
- data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm-ppc.h +0 -333
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-exp.c +0 -94
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-inverse.c +0 -72
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sigmoid.c +0 -49
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-utils.c +0 -1020
- data/ext/sources/ggml/src/ggml-hexagon/htp/ops-utils.h +0 -149
- data/ext/sources/ggml/src/ggml-hexagon/htp-utils.c +0 -454
- data/ext/sources/ggml/src/ggml-hexagon/htp-utils.h +0 -221
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/bin_op.tmpl.wgsl +0 -188
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary_head.tmpl +0 -45
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.tmpl.wgsl +0 -267
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.tmpl.wgsl +0 -112
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/unary_op.wgsl +0 -483
|
@@ -1,11 +1,17 @@
|
|
|
1
1
|
#include "llama-model-loader.h"
|
|
2
2
|
|
|
3
|
+
#include "ggml-alloc.h"
|
|
3
4
|
#include "ggml.h"
|
|
5
|
+
#include "gguf.h"
|
|
6
|
+
#include "llama-hparams.h"
|
|
4
7
|
|
|
8
|
+
#include <algorithm>
|
|
5
9
|
#include <array>
|
|
6
10
|
#include <cinttypes>
|
|
11
|
+
#include <cstdint>
|
|
7
12
|
#include <cstring>
|
|
8
13
|
#include <future>
|
|
14
|
+
#include <regex>
|
|
9
15
|
|
|
10
16
|
static const size_t kiB = 1024;
|
|
11
17
|
static const size_t MiB = 1024*kiB;
|
|
@@ -36,6 +42,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
|
|
|
36
42
|
case LLAMA_FTYPE_MOSTLY_Q5_1: return "Q5_1";
|
|
37
43
|
case LLAMA_FTYPE_MOSTLY_Q8_0: return "Q8_0";
|
|
38
44
|
case LLAMA_FTYPE_MOSTLY_MXFP4_MOE: return "MXFP4 MoE";
|
|
45
|
+
case LLAMA_FTYPE_MOSTLY_NVFP4: return "NVFP4";
|
|
39
46
|
case LLAMA_FTYPE_MOSTLY_Q2_K: return "Q2_K - Medium";
|
|
40
47
|
case LLAMA_FTYPE_MOSTLY_Q2_K_S: return "Q2_K - Small";
|
|
41
48
|
case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "Q3_K - Small";
|
|
@@ -262,7 +269,7 @@ namespace GGUFMeta {
|
|
|
262
269
|
template<typename T>
|
|
263
270
|
typename std::enable_if<std::is_integral<T>::value, bool>::type
|
|
264
271
|
llama_model_loader::get_arr_n(const std::string & key, T & result, bool required) {
|
|
265
|
-
const int kid = gguf_find_key(
|
|
272
|
+
const int kid = gguf_find_key(metadata, key.c_str());
|
|
266
273
|
|
|
267
274
|
if (kid < 0) {
|
|
268
275
|
if (required) {
|
|
@@ -272,7 +279,7 @@ namespace GGUFMeta {
|
|
|
272
279
|
}
|
|
273
280
|
|
|
274
281
|
struct GGUFMeta::ArrayInfo arr_info =
|
|
275
|
-
GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(
|
|
282
|
+
GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(metadata, kid);
|
|
276
283
|
|
|
277
284
|
|
|
278
285
|
result = arr_info.length;
|
|
@@ -289,7 +296,7 @@ namespace GGUFMeta {
|
|
|
289
296
|
|
|
290
297
|
template<typename T>
|
|
291
298
|
bool llama_model_loader::get_arr(const std::string & key, std::vector<T> & result, bool required) {
|
|
292
|
-
const gguf_context * ctx =
|
|
299
|
+
const gguf_context * ctx = metadata;
|
|
293
300
|
const int kid = gguf_find_key(ctx, key.c_str());
|
|
294
301
|
|
|
295
302
|
if (kid < 0 || gguf_get_kv_type(ctx, kid) != GGUF_TYPE_ARRAY) {
|
|
@@ -330,7 +337,7 @@ namespace GGUFMeta {
|
|
|
330
337
|
|
|
331
338
|
template<typename T, size_t N_MAX>
|
|
332
339
|
bool llama_model_loader::get_arr(const std::string & key, std::array<T, N_MAX> & result, bool required) {
|
|
333
|
-
const gguf_context * ctx =
|
|
340
|
+
const gguf_context * ctx = metadata;
|
|
334
341
|
const int kid = gguf_find_key(ctx, key.c_str());
|
|
335
342
|
|
|
336
343
|
if (kid < 0 || gguf_get_kv_type(ctx, kid) != GGUF_TYPE_ARRAY) {
|
|
@@ -344,6 +351,7 @@ namespace GGUFMeta {
|
|
|
344
351
|
GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(ctx, kid);
|
|
345
352
|
|
|
346
353
|
switch (arr_info.gt) {
|
|
354
|
+
case GGUF_TYPE_BOOL:
|
|
347
355
|
case GGUF_TYPE_UINT32:
|
|
348
356
|
case GGUF_TYPE_INT32: GGML_ASSERT((std::is_same<T, int32_t>::value) ||
|
|
349
357
|
(std::is_same<T, uint32_t>::value)); break;
|
|
@@ -365,7 +373,13 @@ namespace GGUFMeta {
|
|
|
365
373
|
result[i] = value;
|
|
366
374
|
}
|
|
367
375
|
} else {
|
|
368
|
-
|
|
376
|
+
if (arr_info.gt == GGUF_TYPE_BOOL) {
|
|
377
|
+
std::transform((const bool *)arr_info.data, (const bool *)arr_info.data + arr_info.length, result.begin(), [](bool x) {
|
|
378
|
+
return static_cast<T>(x);
|
|
379
|
+
});
|
|
380
|
+
} else {
|
|
381
|
+
std::copy((const T*)arr_info.data, (const T *)arr_info.data + arr_info.length, result.begin());
|
|
382
|
+
}
|
|
369
383
|
}
|
|
370
384
|
|
|
371
385
|
return true;
|
|
@@ -385,7 +399,7 @@ namespace GGUFMeta {
|
|
|
385
399
|
const struct llama_model_kv_override * override =
|
|
386
400
|
it != kv_overrides.end() ? &it->second : nullptr;
|
|
387
401
|
|
|
388
|
-
const bool found = GGUFMeta::GKV<T>::set(
|
|
402
|
+
const bool found = GGUFMeta::GKV<T>::set(metadata, key, result, override);
|
|
389
403
|
|
|
390
404
|
if (required && !found) {
|
|
391
405
|
throw std::runtime_error(format("key not found in model: %s", key.c_str()));
|
|
@@ -419,7 +433,7 @@ namespace GGUFMeta {
|
|
|
419
433
|
// get array of n <= N_MAX elements, or a single element repeated n times
|
|
420
434
|
template<typename T, size_t N_MAX>
|
|
421
435
|
bool llama_model_loader::get_key_or_arr(const std::string & key, std::array<T, N_MAX> & result, uint32_t n, bool required) {
|
|
422
|
-
const int kid = gguf_find_key(
|
|
436
|
+
const int kid = gguf_find_key(metadata, key.c_str());
|
|
423
437
|
|
|
424
438
|
if (kid < 0) {
|
|
425
439
|
if (required) {
|
|
@@ -432,9 +446,9 @@ namespace GGUFMeta {
|
|
|
432
446
|
throw std::runtime_error(format("n > N_MAX: %u > %u for key %s", (uint32_t) n, (uint32_t) N_MAX, key.c_str()));
|
|
433
447
|
}
|
|
434
448
|
|
|
435
|
-
if (gguf_get_kv_type(
|
|
449
|
+
if (gguf_get_kv_type(metadata, kid) == GGUF_TYPE_ARRAY) {
|
|
436
450
|
struct GGUFMeta::ArrayInfo arr_info =
|
|
437
|
-
GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(
|
|
451
|
+
GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(metadata, kid);
|
|
438
452
|
|
|
439
453
|
if (n != arr_info.length) {
|
|
440
454
|
throw std::runtime_error(format("key %s has wrong array length; expected %u, got %u", key.c_str(), n, (uint32_t) arr_info.length));
|
|
@@ -465,7 +479,7 @@ namespace GGUFMeta {
|
|
|
465
479
|
bool llama_model_loader::get_key_or_arr(enum llm_kv kid, uint32_t & result, bool required) {
|
|
466
480
|
const std::string key = llm_kv(kid);
|
|
467
481
|
|
|
468
|
-
const int id = gguf_find_key(
|
|
482
|
+
const int id = gguf_find_key(metadata, key.c_str());
|
|
469
483
|
|
|
470
484
|
if (id < 0) {
|
|
471
485
|
if (required) {
|
|
@@ -475,7 +489,7 @@ namespace GGUFMeta {
|
|
|
475
489
|
}
|
|
476
490
|
|
|
477
491
|
// throw and error if type is an array
|
|
478
|
-
if (gguf_get_kv_type(
|
|
492
|
+
if (gguf_get_kv_type(metadata, id) == GGUF_TYPE_ARRAY) {
|
|
479
493
|
if (required) {
|
|
480
494
|
throw std::runtime_error(format("expected scalar, found array for key: %s", key.c_str()));
|
|
481
495
|
}
|
|
@@ -492,6 +506,9 @@ namespace GGUFMeta {
|
|
|
492
506
|
|
|
493
507
|
|
|
494
508
|
llama_model_loader::llama_model_loader(
|
|
509
|
+
struct gguf_context * meta,
|
|
510
|
+
llama_model_set_tensor_data_t set_tensor_data,
|
|
511
|
+
void * set_tensor_data_ud,
|
|
495
512
|
const std::string & fname,
|
|
496
513
|
std::vector<std::string> & splits,
|
|
497
514
|
bool use_mmap,
|
|
@@ -499,7 +516,8 @@ llama_model_loader::llama_model_loader(
|
|
|
499
516
|
bool check_tensors,
|
|
500
517
|
bool no_alloc,
|
|
501
518
|
const llama_model_kv_override * param_overrides_p,
|
|
502
|
-
const llama_model_tensor_buft_override * param_tensor_buft_overrides_p)
|
|
519
|
+
const llama_model_tensor_buft_override * param_tensor_buft_overrides_p)
|
|
520
|
+
: metadata(meta), set_tensor_data(set_tensor_data), set_tensor_data_ud(set_tensor_data_ud) {
|
|
503
521
|
int trace = 0;
|
|
504
522
|
if (getenv("LLAMA_TRACE")) {
|
|
505
523
|
trace = atoi(getenv("LLAMA_TRACE"));
|
|
@@ -513,130 +531,142 @@ llama_model_loader::llama_model_loader(
|
|
|
513
531
|
|
|
514
532
|
tensor_buft_overrides = param_tensor_buft_overrides_p;
|
|
515
533
|
|
|
516
|
-
|
|
517
|
-
|
|
518
|
-
|
|
519
|
-
|
|
520
|
-
|
|
521
|
-
|
|
522
|
-
|
|
523
|
-
meta.reset(gguf_init_from_file(fname.c_str(), params));
|
|
524
|
-
if (!meta) {
|
|
525
|
-
throw std::runtime_error(format("%s: failed to load model from %s", __func__, fname.c_str()));
|
|
526
|
-
}
|
|
534
|
+
if (!fname.empty()) {
|
|
535
|
+
// Load the main GGUF
|
|
536
|
+
struct ggml_context * ctx = NULL;
|
|
537
|
+
struct gguf_init_params params = {
|
|
538
|
+
/*.no_alloc = */ true,
|
|
539
|
+
/*.ctx = */ &ctx,
|
|
540
|
+
};
|
|
527
541
|
|
|
528
|
-
|
|
529
|
-
|
|
530
|
-
|
|
531
|
-
|
|
532
|
-
|
|
533
|
-
|
|
534
|
-
use_direct_io = use_direct_io && files.back()->has_direct_io();
|
|
542
|
+
metadata_ptr.reset(gguf_init_from_file(fname.c_str(), params));
|
|
543
|
+
metadata = metadata_ptr.get();
|
|
544
|
+
if (metadata == nullptr) {
|
|
545
|
+
throw std::runtime_error(format("%s: failed to load model from %s", __func__, fname.c_str()));
|
|
546
|
+
}
|
|
535
547
|
|
|
536
|
-
|
|
537
|
-
|
|
538
|
-
use_mmap = false;
|
|
539
|
-
LLAMA_LOG_WARN("%s: direct I/O is enabled, disabling mmap\n", __func__);
|
|
540
|
-
}
|
|
548
|
+
get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
|
|
549
|
+
llm_kv = LLM_KV(llm_arch_from_string(arch_name));
|
|
541
550
|
|
|
542
|
-
|
|
543
|
-
|
|
544
|
-
// so we build a unified tensors index for weights.
|
|
545
|
-
for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
|
|
546
|
-
std::string tensor_name = std::string(cur->name);
|
|
547
|
-
// make sure there is no duplicated tensor names
|
|
548
|
-
if (weights_map.find(tensor_name) != weights_map.end()) {
|
|
549
|
-
throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", ggml_get_name(cur)));
|
|
550
|
-
}
|
|
551
|
-
n_elements += ggml_nelements(cur);
|
|
552
|
-
n_bytes += ggml_nbytes(cur);
|
|
553
|
-
weights_map.emplace(tensor_name, llama_tensor_weight(files.back().get(), 0, meta.get(), cur));
|
|
554
|
-
}
|
|
555
|
-
uint16_t n_split = 0;
|
|
556
|
-
get_key(llm_kv(LLM_KV_SPLIT_COUNT), n_split, false);
|
|
551
|
+
files.emplace_back(new llama_file(fname.c_str(), "rb", use_direct_io));
|
|
552
|
+
contexts.emplace_back(ctx);
|
|
557
553
|
|
|
558
|
-
|
|
559
|
-
|
|
560
|
-
|
|
561
|
-
|
|
562
|
-
|
|
563
|
-
|
|
564
|
-
|
|
565
|
-
throw std::runtime_error(format("illegal split file idx: %d (file: %s), model must be loaded with the first split", idx, fname.c_str()));
|
|
566
|
-
}
|
|
554
|
+
if (use_mmap && use_direct_io) {
|
|
555
|
+
if (files.back()->has_direct_io()) {
|
|
556
|
+
LLAMA_LOG_WARN("%s: direct I/O is enabled, disabling mmap\n", __func__);
|
|
557
|
+
use_mmap = false;
|
|
558
|
+
} else {
|
|
559
|
+
LLAMA_LOG_WARN("%s: direct I/O is not available, using mmap\n", __func__);
|
|
560
|
+
use_direct_io = false;
|
|
567
561
|
|
|
568
|
-
|
|
569
|
-
|
|
570
|
-
|
|
562
|
+
// reopen file using std::fopen for mmap
|
|
563
|
+
files.pop_back();
|
|
564
|
+
files.emplace_back(new llama_file(fname.c_str(), "rb", false));
|
|
565
|
+
}
|
|
571
566
|
}
|
|
572
567
|
|
|
573
|
-
//
|
|
574
|
-
|
|
575
|
-
|
|
576
|
-
|
|
568
|
+
// Save tensors data offset of the main file.
|
|
569
|
+
// For subsidiary files, `meta` tensor data offset must not be used,
|
|
570
|
+
// so we build a unified tensors index for weights.
|
|
571
|
+
for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
|
|
572
|
+
std::string tensor_name = std::string(cur->name);
|
|
573
|
+
// make sure there is no duplicated tensor names
|
|
574
|
+
if (weights_map.find(tensor_name) != weights_map.end()) {
|
|
575
|
+
throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", ggml_get_name(cur)));
|
|
576
|
+
}
|
|
577
|
+
n_elements += ggml_nelements(cur);
|
|
578
|
+
n_bytes += ggml_nbytes(cur);
|
|
579
|
+
weights_map.emplace(tensor_name, llama_tensor_weight(files.back().get(), 0, metadata, cur));
|
|
580
|
+
}
|
|
581
|
+
uint16_t n_split = 0;
|
|
582
|
+
get_key(llm_kv(LLM_KV_SPLIT_COUNT), n_split, false);
|
|
583
|
+
|
|
584
|
+
// Load additional GGML contexts
|
|
585
|
+
if (n_split > 1) {
|
|
586
|
+
// make sure the main file is loaded first
|
|
587
|
+
uint16_t idx = 0;
|
|
588
|
+
const std::string kv_split_no = llm_kv(LLM_KV_SPLIT_NO);
|
|
589
|
+
get_key(kv_split_no, idx);
|
|
590
|
+
if (idx != 0) {
|
|
591
|
+
throw std::runtime_error(format("illegal split file idx: %d (file: %s), model must be loaded with the first split", idx, fname.c_str()));
|
|
592
|
+
}
|
|
577
593
|
|
|
578
|
-
|
|
579
|
-
|
|
580
|
-
|
|
594
|
+
// generate list of splits if needed
|
|
595
|
+
if (splits.empty()) {
|
|
596
|
+
splits = llama_get_list_splits(fname, idx, n_split);
|
|
597
|
+
}
|
|
581
598
|
|
|
582
|
-
|
|
583
|
-
|
|
584
|
-
|
|
599
|
+
// in case user give a custom list of splits, check if it matches the expected number
|
|
600
|
+
if (n_split != (uint16_t)splits.size()) {
|
|
601
|
+
throw std::runtime_error(format("invalid split count, given: %zu splits, but expected %d", splits.size(), n_split));
|
|
602
|
+
}
|
|
585
603
|
|
|
586
|
-
|
|
587
|
-
|
|
588
|
-
/*.ctx = */ &ctx,
|
|
589
|
-
};
|
|
590
|
-
gguf_context_ptr ctx_gguf { gguf_init_from_file(fname_split, split_params) };
|
|
591
|
-
if (!ctx_gguf) {
|
|
592
|
-
throw std::runtime_error(format("%s: failed to load GGUF split from %s", __func__, fname_split));
|
|
604
|
+
if (trace > 0) {
|
|
605
|
+
LLAMA_LOG_INFO("%s: loading additional %d GGUFs\n", __func__, n_split);
|
|
593
606
|
}
|
|
594
607
|
|
|
595
|
-
//
|
|
596
|
-
{
|
|
597
|
-
const
|
|
598
|
-
|
|
599
|
-
|
|
608
|
+
// load other splits
|
|
609
|
+
for (idx = 1; idx < n_split; idx++) {
|
|
610
|
+
const char * fname_split = splits[idx].c_str();
|
|
611
|
+
|
|
612
|
+
struct gguf_init_params split_params = {
|
|
613
|
+
/*.no_alloc = */ true,
|
|
614
|
+
/*.ctx = */ &ctx,
|
|
615
|
+
};
|
|
616
|
+
gguf_context_ptr ctx_gguf { gguf_init_from_file(fname_split, split_params) };
|
|
617
|
+
if (!ctx_gguf) {
|
|
618
|
+
throw std::runtime_error(format("%s: failed to load GGUF split from %s", __func__, fname_split));
|
|
600
619
|
}
|
|
601
|
-
|
|
602
|
-
|
|
603
|
-
|
|
620
|
+
|
|
621
|
+
// check idx
|
|
622
|
+
{
|
|
623
|
+
const int kid = gguf_find_key(ctx_gguf.get(), kv_split_no.c_str());
|
|
624
|
+
if (kid < 0) {
|
|
625
|
+
throw std::runtime_error(format("missing key %s in GGUF split %s", kv_split_no.c_str(), fname_split));
|
|
626
|
+
}
|
|
627
|
+
int idx_gguf = gguf_get_val_u16(ctx_gguf.get(), kid);
|
|
628
|
+
if (idx_gguf != idx) {
|
|
629
|
+
throw std::runtime_error(format("invalid split file idx: %d (file: %s), expected %d", idx_gguf, fname_split, idx));
|
|
630
|
+
}
|
|
604
631
|
}
|
|
605
|
-
}
|
|
606
632
|
|
|
607
|
-
|
|
608
|
-
|
|
633
|
+
files.emplace_back(new llama_file(fname_split, "rb", use_direct_io));
|
|
634
|
+
contexts.emplace_back(ctx);
|
|
609
635
|
|
|
610
|
-
|
|
611
|
-
|
|
612
|
-
|
|
613
|
-
|
|
614
|
-
|
|
615
|
-
|
|
636
|
+
// Save tensors data offset info of the shard.
|
|
637
|
+
for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
|
|
638
|
+
std::string tensor_name = std::string(cur->name);
|
|
639
|
+
// make sure there is no duplicated tensor names
|
|
640
|
+
if (weights_map.find(tensor_name) != weights_map.end()) {
|
|
641
|
+
throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", ggml_get_name(cur)));
|
|
642
|
+
}
|
|
643
|
+
n_elements += ggml_nelements(cur);
|
|
644
|
+
n_bytes += ggml_nbytes(cur);
|
|
645
|
+
weights_map.emplace(tensor_name, llama_tensor_weight(files.back().get(), idx, ctx_gguf.get(), cur));
|
|
616
646
|
}
|
|
617
|
-
n_elements += ggml_nelements(cur);
|
|
618
|
-
n_bytes += ggml_nbytes(cur);
|
|
619
|
-
weights_map.emplace(tensor_name, llama_tensor_weight(files.back().get(), idx, ctx_gguf.get(), cur));
|
|
620
647
|
}
|
|
621
|
-
}
|
|
622
648
|
|
|
623
|
-
|
|
649
|
+
get_key(llm_kv(LLM_KV_SPLIT_TENSORS_COUNT), n_tensors);
|
|
624
650
|
|
|
625
|
-
|
|
626
|
-
|
|
627
|
-
|
|
628
|
-
|
|
629
|
-
|
|
651
|
+
// sanity check
|
|
652
|
+
{
|
|
653
|
+
const int n_tensors_loaded = (int) weights_map.size();
|
|
654
|
+
if (n_tensors != n_tensors_loaded) {
|
|
655
|
+
throw std::runtime_error(format("corrupted model: %d tensors expected but %d found", n_tensors, n_tensors_loaded));
|
|
656
|
+
}
|
|
630
657
|
}
|
|
631
|
-
}
|
|
632
658
|
|
|
633
|
-
|
|
659
|
+
LLAMA_LOG_INFO("%s: additional %d GGUFs metadata loaded.\n", __func__, n_split - 1);
|
|
660
|
+
}
|
|
661
|
+
} else {
|
|
662
|
+
get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
|
|
663
|
+
llm_kv = LLM_KV(llm_arch_from_string(arch_name));
|
|
634
664
|
}
|
|
635
665
|
|
|
636
|
-
n_kv = gguf_get_n_kv(
|
|
666
|
+
n_kv = gguf_get_n_kv(metadata);
|
|
637
667
|
n_tensors = weights_map.size();
|
|
638
668
|
|
|
639
|
-
fver = (enum llama_fver) gguf_get_version(
|
|
669
|
+
fver = (enum llama_fver) gguf_get_version(metadata);
|
|
640
670
|
|
|
641
671
|
LLAMA_LOG_INFO("%s: loaded meta data with %d key-value pairs and %d tensors from %s (version %s)\n",
|
|
642
672
|
__func__, n_kv, n_tensors, fname.c_str(), llama_file_version_name(fver));
|
|
@@ -695,6 +725,7 @@ llama_model_loader::llama_model_loader(
|
|
|
695
725
|
case GGML_TYPE_IQ4_NL: ftype = LLAMA_FTYPE_MOSTLY_IQ4_NL; break;
|
|
696
726
|
case GGML_TYPE_IQ4_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ4_XS; break;
|
|
697
727
|
case GGML_TYPE_IQ3_S: ftype = LLAMA_FTYPE_MOSTLY_IQ3_S; break;
|
|
728
|
+
case GGML_TYPE_NVFP4: ftype = LLAMA_FTYPE_MOSTLY_NVFP4; break;
|
|
698
729
|
default:
|
|
699
730
|
{
|
|
700
731
|
LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
|
|
@@ -715,14 +746,14 @@ llama_model_loader::llama_model_loader(
|
|
|
715
746
|
LLAMA_LOG_INFO("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__);
|
|
716
747
|
|
|
717
748
|
for (int i = 0; i < n_kv; i++) {
|
|
718
|
-
const char * name = gguf_get_key(
|
|
719
|
-
const enum gguf_type type = gguf_get_kv_type(
|
|
749
|
+
const char * name = gguf_get_key(metadata, i);
|
|
750
|
+
const enum gguf_type type = gguf_get_kv_type(metadata, i);
|
|
720
751
|
const std::string type_name =
|
|
721
752
|
type == GGUF_TYPE_ARRAY
|
|
722
|
-
? format("%s[%s,%zu]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(
|
|
753
|
+
? format("%s[%s,%zu]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(metadata, i)), gguf_get_arr_n(metadata, i))
|
|
723
754
|
: gguf_type_name(type);
|
|
724
755
|
|
|
725
|
-
std::string value = gguf_kv_to_str(
|
|
756
|
+
std::string value = gguf_kv_to_str(metadata, i);
|
|
726
757
|
const size_t MAX_VALUE_LEN = 40;
|
|
727
758
|
if (value.size() > MAX_VALUE_LEN) {
|
|
728
759
|
value = format("%s...", value.substr(0, MAX_VALUE_LEN - 3).c_str());
|
|
@@ -824,15 +855,382 @@ const struct ggml_tensor * llama_model_loader::check_tensor_dims(const std::stri
|
|
|
824
855
|
return cur;
|
|
825
856
|
}
|
|
826
857
|
|
|
827
|
-
|
|
828
|
-
|
|
829
|
-
|
|
858
|
+
// checks if the weight tensor can be used with the specified buffer type and device
|
|
859
|
+
static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w, ggml_op op, ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev) {
|
|
860
|
+
GGML_ASSERT(w != nullptr);
|
|
861
|
+
|
|
862
|
+
if (op == GGML_OP_NONE) {
|
|
863
|
+
return true;
|
|
864
|
+
}
|
|
865
|
+
|
|
866
|
+
ggml_init_params params = {
|
|
867
|
+
/*.mem_size =*/ ggml_tensor_overhead()*8,
|
|
868
|
+
/*.mem_buffer =*/ NULL,
|
|
869
|
+
/*.no_alloc =*/ true,
|
|
870
|
+
};
|
|
871
|
+
ggml_context_ptr ctx_ptr { ggml_init(params) };
|
|
872
|
+
if (!ctx_ptr) {
|
|
873
|
+
throw std::runtime_error(format("failed to create ggml context"));
|
|
874
|
+
}
|
|
875
|
+
ggml_context * ctx = ctx_ptr.get();
|
|
876
|
+
|
|
877
|
+
ggml_tensor * op_tensor = nullptr;
|
|
878
|
+
|
|
879
|
+
switch (op) {
|
|
880
|
+
case GGML_OP_GET_ROWS:
|
|
881
|
+
{
|
|
882
|
+
ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 512);
|
|
883
|
+
op_tensor = ggml_get_rows(ctx, w, b);
|
|
884
|
+
} break;
|
|
885
|
+
case GGML_OP_MUL_MAT:
|
|
886
|
+
{
|
|
887
|
+
ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], 512, w->ne[2], w->ne[3]);
|
|
888
|
+
op_tensor = ggml_mul_mat(ctx, w, b);
|
|
889
|
+
} break;
|
|
890
|
+
case GGML_OP_MUL_MAT_ID:
|
|
891
|
+
{
|
|
892
|
+
const int n_expert_used = hparams.n_expert_used;
|
|
893
|
+
GGML_ASSERT(n_expert_used > 0);
|
|
894
|
+
ggml_tensor * b = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, w->ne[0], n_expert_used, 512);
|
|
895
|
+
ggml_tensor * ids = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_expert_used, 512);
|
|
896
|
+
op_tensor = ggml_mul_mat_id(ctx, w, b, ids);
|
|
897
|
+
} break;
|
|
898
|
+
case GGML_OP_ADD:
|
|
899
|
+
{
|
|
900
|
+
ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]);
|
|
901
|
+
op_tensor = ggml_add(ctx, a, w);
|
|
902
|
+
} break;
|
|
903
|
+
case GGML_OP_ADD_ID:
|
|
904
|
+
{
|
|
905
|
+
const int n_expert_used = hparams.n_expert_used;
|
|
906
|
+
GGML_ASSERT(n_expert_used > 0);
|
|
907
|
+
ggml_tensor * a = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, w->ne[0], n_expert_used, 512);
|
|
908
|
+
ggml_tensor * c = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_expert_used, 512);
|
|
909
|
+
op_tensor = ggml_add_id(ctx, a, w, c);
|
|
910
|
+
} break;
|
|
911
|
+
case GGML_OP_MUL:
|
|
912
|
+
{
|
|
913
|
+
ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]);
|
|
914
|
+
op_tensor = ggml_mul(ctx, a, w);
|
|
915
|
+
} break;
|
|
916
|
+
case GGML_OP_DIV:
|
|
917
|
+
{
|
|
918
|
+
ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, w->ne[0]);
|
|
919
|
+
op_tensor = ggml_div(ctx, a, w);
|
|
920
|
+
} break;
|
|
921
|
+
case GGML_OP_ROPE:
|
|
922
|
+
{
|
|
923
|
+
const int n_embd_head = hparams.n_embd_head_v();
|
|
924
|
+
const int n_head = hparams.n_head();
|
|
925
|
+
ggml_tensor * a = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd_head, n_head, 512);
|
|
926
|
+
ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 512);
|
|
927
|
+
op_tensor = ggml_rope_ext(
|
|
928
|
+
ctx, a, b, w,
|
|
929
|
+
0, 0, 0, 0, 0,
|
|
930
|
+
0, 0, 0, 0
|
|
931
|
+
);
|
|
932
|
+
|
|
933
|
+
} break;
|
|
934
|
+
case GGML_OP_SSM_CONV:
|
|
935
|
+
{
|
|
936
|
+
const int64_t n_seq_tokens = 512;
|
|
937
|
+
const int64_t n_seqs = 3;
|
|
938
|
+
ggml_tensor * conv_x = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, w->ne[0] - 1 + n_seq_tokens, w->ne[1], n_seqs);
|
|
939
|
+
op_tensor = ggml_ssm_conv(ctx, conv_x, w);
|
|
940
|
+
} break;
|
|
941
|
+
case GGML_OP_SSM_SCAN:
|
|
942
|
+
{
|
|
943
|
+
// w is ssm_a, which is used to distinguish Mamba-1 and Mamba-2
|
|
944
|
+
const int64_t d_state = w->ne[0] == 1 ? hparams.ssm_d_state : w->ne[0];
|
|
945
|
+
const int64_t n_head = w->ne[1];
|
|
946
|
+
const int64_t head_dim = hparams.ssm_d_inner / n_head;
|
|
947
|
+
const int64_t n_group = hparams.ssm_n_group ? hparams.ssm_n_group : 1;
|
|
948
|
+
const int64_t n_seq_tokens = 512;
|
|
949
|
+
const int64_t n_seqs = 3;
|
|
950
|
+
ggml_tensor * s = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_state, head_dim, n_head, n_seqs);
|
|
951
|
+
ggml_tensor * x = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, head_dim, n_head, n_seq_tokens, n_seqs);
|
|
952
|
+
ggml_tensor * dt = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_head, n_seq_tokens, n_seqs);
|
|
953
|
+
ggml_tensor * B = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_state, n_group, n_seq_tokens, n_seqs);
|
|
954
|
+
ggml_tensor * C = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_state, n_group, n_seq_tokens, n_seqs);
|
|
955
|
+
ggml_tensor * ids = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_seqs);
|
|
956
|
+
op_tensor = ggml_ssm_scan(ctx, s, x, dt, w, B, C, ids);
|
|
957
|
+
} break;
|
|
958
|
+
case GGML_OP_RWKV_WKV6:
|
|
959
|
+
{
|
|
960
|
+
// FIXME
|
|
961
|
+
const int64_t S = 123;
|
|
962
|
+
const int64_t H = 123;
|
|
963
|
+
const int64_t n_tokens = 123;
|
|
964
|
+
const int64_t n_seqs = 123;
|
|
965
|
+
ggml_tensor * k = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
|
|
966
|
+
ggml_tensor * v = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
|
|
967
|
+
ggml_tensor * r = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
|
|
968
|
+
ggml_tensor * tf = w;
|
|
969
|
+
ggml_tensor * td = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
|
|
970
|
+
ggml_tensor * state = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, S, n_seqs, S, H);
|
|
971
|
+
op_tensor = ggml_rwkv_wkv6(ctx, k, v, r, tf, td, state);
|
|
972
|
+
} break;
|
|
973
|
+
case GGML_OP_IM2COL:
|
|
974
|
+
{
|
|
975
|
+
const int n_embd_inp = hparams.n_embd_inp();
|
|
976
|
+
ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, n_embd_inp, w->ne[1], 1, 1);
|
|
977
|
+
op_tensor = ggml_im2col(ctx, w, b, 1, 0, 0, 0, 1, 0, false, GGML_TYPE_F16);
|
|
978
|
+
} break;
|
|
979
|
+
case GGML_OP_SCALE:
|
|
980
|
+
{
|
|
981
|
+
op_tensor = ggml_scale(ctx, w, 1.0f);
|
|
982
|
+
} break;
|
|
983
|
+
default:
|
|
984
|
+
GGML_ABORT("%s: missing test for op %s for tensor %s", __func__, ggml_op_name(op), w->name);
|
|
985
|
+
}
|
|
986
|
+
|
|
987
|
+
// create a temporary dummy buffer for the weight so that supports_op can check the buffer type
|
|
988
|
+
GGML_ASSERT(w->buffer == nullptr);
|
|
989
|
+
w->buffer = ggml_backend_buft_alloc_buffer(buft, 0);
|
|
990
|
+
bool op_supported = ggml_backend_dev_supports_op(dev, op_tensor);
|
|
991
|
+
ggml_backend_buffer_free(w->buffer);
|
|
992
|
+
w->buffer = nullptr;
|
|
993
|
+
|
|
994
|
+
return op_supported;
|
|
995
|
+
}
|
|
996
|
+
|
|
997
|
+
// find the first buffer type in the list that can use the tensor
|
|
998
|
+
static ggml_backend_buffer_type_t select_weight_buft(const llama_hparams & hparams, ggml_tensor * tensor, ggml_op op, const buft_list_t * buft_list) {
|
|
999
|
+
GGML_ASSERT(!buft_list->empty());
|
|
1000
|
+
for (const auto & cur : *buft_list) {
|
|
1001
|
+
ggml_backend_dev_t cur_dev = cur.first;
|
|
1002
|
+
ggml_backend_buffer_type_t cur_buft = cur.second;
|
|
1003
|
+
if (weight_buft_supported(hparams, tensor, op, cur_buft, cur_dev)) {
|
|
1004
|
+
return cur_buft;
|
|
1005
|
+
}
|
|
1006
|
+
}
|
|
1007
|
+
|
|
1008
|
+
return nullptr;
|
|
1009
|
+
}
|
|
1010
|
+
|
|
1011
|
+
struct ggml_tensor * llama_model_loader::create_tensor(
|
|
1012
|
+
const llama_hparams & hparams, const buft_list_t * buft_list_cpu, const buft_list_t * buft_list_input, const buft_list_t * buft_list_output,
|
|
1013
|
+
const buft_list_t * buft_list_layer, const LLM_TN_IMPL & tn, const std::initializer_list<int64_t> & ne, int flags) {
|
|
1014
|
+
auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
|
|
1015
|
+
auto it = ctx_map.find(buft);
|
|
1016
|
+
if (it == ctx_map.end()) {
|
|
1017
|
+
// one ggml context per buffer type
|
|
1018
|
+
int max_n_tensors = n_tensors;
|
|
1019
|
+
max_n_tensors += 1; // duplicated output tensor
|
|
1020
|
+
max_n_tensors += hparams.n_layer*2; // duplicated rope freq tensors
|
|
1021
|
+
if (files.empty()) {
|
|
1022
|
+
max_n_tensors += hparams.n_layer*256; // this should be well above what any model actually uses
|
|
1023
|
+
}
|
|
1024
|
+
const size_t ctx_size = ggml_tensor_overhead()*max_n_tensors;
|
|
1025
|
+
|
|
1026
|
+
ggml_init_params params = {
|
|
1027
|
+
/*.mem_size =*/ ctx_size,
|
|
1028
|
+
/*.mem_buffer =*/ NULL,
|
|
1029
|
+
/*.no_alloc =*/ true,
|
|
1030
|
+
};
|
|
1031
|
+
|
|
1032
|
+
ggml_context * ctx = ggml_init(params);
|
|
1033
|
+
if (!ctx) {
|
|
1034
|
+
throw std::runtime_error(format("failed to create ggml context"));
|
|
1035
|
+
}
|
|
1036
|
+
|
|
1037
|
+
ctx_map.emplace(buft, ctx);
|
|
1038
|
+
|
|
1039
|
+
return ctx;
|
|
1040
|
+
}
|
|
1041
|
+
return it->second.get();
|
|
1042
|
+
};
|
|
1043
|
+
|
|
1044
|
+
auto buft_for_tensor = [&](ggml_tensor * t_meta) -> ggml_backend_buffer_type_t {
|
|
1045
|
+
if (!t_meta) {
|
|
1046
|
+
if (flags & TENSOR_NOT_REQUIRED) {
|
|
1047
|
+
return nullptr;
|
|
1048
|
+
}
|
|
1049
|
+
throw std::runtime_error(format("missing tensor '%s'", tn.str().c_str()));
|
|
1050
|
+
}
|
|
1051
|
+
|
|
1052
|
+
// some models use the token embedding tensor as the output, but since these are used in different layers and with different ops
|
|
1053
|
+
// the tensor is duplicated
|
|
1054
|
+
// to handle this, we check if the tensor is duplicated, and if so, we assume that it is being loaded as the output tensor
|
|
1055
|
+
llm_tensor tn_tensor = tn.tensor;
|
|
1056
|
+
if (tn.tensor == LLM_TENSOR_TOKEN_EMBD && (flags & TENSOR_DUPLICATED)) {
|
|
1057
|
+
tn_tensor = LLM_TENSOR_OUTPUT;
|
|
1058
|
+
}
|
|
1059
|
+
|
|
1060
|
+
llm_tensor_info info;
|
|
1061
|
+
try {
|
|
1062
|
+
info = llm_tensor_info_for(tn_tensor);
|
|
1063
|
+
} catch (const std::out_of_range & e) {
|
|
1064
|
+
throw std::runtime_error(format("missing tensor info mapping for %s", tn.str().c_str()));
|
|
1065
|
+
}
|
|
1066
|
+
|
|
1067
|
+
// skip unused tensors
|
|
1068
|
+
if (info.op == GGML_OP_NONE || (flags & TENSOR_SKIP)) {
|
|
1069
|
+
const size_t nbytes = ggml_nbytes(t_meta);
|
|
1070
|
+
LLAMA_LOG_WARN("model has unused tensor %s (size = %zu bytes) -- ignoring\n", tn.str().c_str(), nbytes);
|
|
1071
|
+
|
|
1072
|
+
size_data -= nbytes;
|
|
1073
|
+
n_created++;
|
|
1074
|
+
|
|
1075
|
+
return nullptr;
|
|
1076
|
+
}
|
|
1077
|
+
|
|
1078
|
+
// tensors with "bias" suffix are always used with GGML_OP_ADD or GGML_OP_ADD_ID
|
|
1079
|
+
ggml_op op;
|
|
1080
|
+
bool bias = tn.suffix != nullptr && strcmp(tn.suffix, "bias") == 0;
|
|
1081
|
+
if (bias) {
|
|
1082
|
+
if (info.op == GGML_OP_MUL_MAT_ID) {
|
|
1083
|
+
op = GGML_OP_ADD_ID;
|
|
1084
|
+
} else {
|
|
1085
|
+
op = GGML_OP_ADD;
|
|
1086
|
+
}
|
|
1087
|
+
} else {
|
|
1088
|
+
op = info.op;
|
|
1089
|
+
}
|
|
1090
|
+
|
|
1091
|
+
// sanity checks
|
|
1092
|
+
if (info.layer == LLM_TENSOR_LAYER_INPUT || info.layer == LLM_TENSOR_LAYER_OUTPUT) {
|
|
1093
|
+
if (tn.bid != -1) {
|
|
1094
|
+
GGML_ABORT("input/output layer tensor %s used with a layer number", tn.str().c_str());
|
|
1095
|
+
}
|
|
1096
|
+
} else {
|
|
1097
|
+
if (tn.bid == -1) {
|
|
1098
|
+
GGML_ABORT("repeating layer tensor %s used without a layer number", tn.str().c_str());
|
|
1099
|
+
}
|
|
1100
|
+
}
|
|
1101
|
+
|
|
1102
|
+
// select the buffer type for this tensor
|
|
1103
|
+
const buft_list_t * buft_list;
|
|
1104
|
+
switch (info.layer) {
|
|
1105
|
+
case LLM_TENSOR_LAYER_INPUT:
|
|
1106
|
+
buft_list = buft_list_input;
|
|
1107
|
+
break;
|
|
1108
|
+
case LLM_TENSOR_LAYER_OUTPUT:
|
|
1109
|
+
buft_list = buft_list_output;
|
|
1110
|
+
break;
|
|
1111
|
+
case LLM_TENSOR_LAYER_REPEATING:
|
|
1112
|
+
GGML_ASSERT(buft_list_layer != nullptr);
|
|
1113
|
+
buft_list = buft_list_layer;
|
|
1114
|
+
break;
|
|
1115
|
+
default:
|
|
1116
|
+
GGML_ABORT("invalid layer %d for tensor %s", info.layer, tn.str().c_str());
|
|
1117
|
+
}
|
|
1118
|
+
|
|
1119
|
+
ggml_backend_buffer_type_t buft = nullptr;
|
|
1120
|
+
|
|
1121
|
+
// check overrides
|
|
1122
|
+
if (tensor_buft_overrides) {
|
|
1123
|
+
std::string tensor_name = tn.str();
|
|
1124
|
+
for (const auto * overrides = tensor_buft_overrides; overrides->pattern != nullptr; ++overrides) {
|
|
1125
|
+
std::regex pattern(overrides->pattern);
|
|
1126
|
+
if (std::regex_search(tensor_name, pattern)) {
|
|
1127
|
+
if (overrides->buft == ggml_backend_cpu_buffer_type()) {
|
|
1128
|
+
// when overriding to a CPU buffer, consider the extra buffer types
|
|
1129
|
+
buft = select_weight_buft(hparams, t_meta, op, buft_list_cpu);
|
|
1130
|
+
} else {
|
|
1131
|
+
buft = overrides->buft;
|
|
1132
|
+
}
|
|
1133
|
+
|
|
1134
|
+
LLAMA_LOG_DEBUG("tensor %s (%zu MiB %s) buffer type overridden to %s\n",
|
|
1135
|
+
tensor_name.c_str(),
|
|
1136
|
+
ggml_nbytes(t_meta) / 1024 / 1024, ggml_type_name(t_meta->type),
|
|
1137
|
+
ggml_backend_buft_name(buft));
|
|
1138
|
+
break;
|
|
1139
|
+
}
|
|
1140
|
+
}
|
|
1141
|
+
}
|
|
1142
|
+
|
|
1143
|
+
if (!buft) {
|
|
1144
|
+
buft = select_weight_buft(hparams, t_meta, op, buft_list);
|
|
1145
|
+
if (!buft) {
|
|
1146
|
+
throw std::runtime_error(format("failed to find a compatible buffer type for tensor %s", tn.str().c_str()));
|
|
1147
|
+
}
|
|
1148
|
+
}
|
|
1149
|
+
|
|
1150
|
+
// avoid using a host buffer when using mmap
|
|
1151
|
+
auto * buft_dev = ggml_backend_buft_get_device(buft);
|
|
1152
|
+
if (use_mmap && buft_dev && buft == ggml_backend_dev_host_buffer_type(buft_dev)) {
|
|
1153
|
+
auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
|
|
1154
|
+
if (!cpu_dev) {
|
|
1155
|
+
throw std::runtime_error("no CPU backend found");
|
|
1156
|
+
}
|
|
1157
|
+
buft = ggml_backend_dev_buffer_type(cpu_dev);
|
|
1158
|
+
}
|
|
1159
|
+
|
|
1160
|
+
if (buft != buft_list->front().second) {
|
|
1161
|
+
if (n_tensors_moved == 0) {
|
|
1162
|
+
first_tensor_moved_name = t_meta->name;
|
|
1163
|
+
first_tensor_moved_type_name = ggml_type_name(t_meta->type);
|
|
1164
|
+
first_moved_from_buft = buft_list->front().second;
|
|
1165
|
+
first_moved_to_buft = buft;
|
|
1166
|
+
}
|
|
1167
|
+
n_tensors_moved++;
|
|
1168
|
+
}
|
|
1169
|
+
|
|
1170
|
+
return buft;
|
|
1171
|
+
};
|
|
1172
|
+
|
|
1173
|
+
if (files.empty()) {
|
|
1174
|
+
if (flags & TENSOR_SKIP_IF_VIRTUAL) {
|
|
1175
|
+
return nullptr;
|
|
1176
|
+
}
|
|
1177
|
+
ggml_type type = GGML_TYPE_F32;
|
|
1178
|
+
const int64_t tid = gguf_find_tensor(metadata, tn.str().c_str());
|
|
1179
|
+
if (tid != -1) {
|
|
1180
|
+
type = gguf_get_tensor_type(metadata, tid);
|
|
1181
|
+
}
|
|
1182
|
+
|
|
1183
|
+
// for tensors that are not required some of the dimensions can be invalid:
|
|
1184
|
+
if (flags & TENSOR_NOT_REQUIRED) {
|
|
1185
|
+
for (size_t dim = 0; dim < ne.size(); dim++) {
|
|
1186
|
+
if (ne.begin()[dim] <= 0) {
|
|
1187
|
+
return nullptr;
|
|
1188
|
+
}
|
|
1189
|
+
}
|
|
1190
|
+
}
|
|
1191
|
+
|
|
1192
|
+
ggml_tensor t_meta;
|
|
1193
|
+
memset(&t_meta, 0, sizeof(ggml_tensor));
|
|
1194
|
+
t_meta.type = type;
|
|
1195
|
+
for (size_t dim = 0; dim < GGML_MAX_DIMS; dim++) {
|
|
1196
|
+
t_meta.ne[dim] = dim < ne.size() ? ne.begin()[dim] : 1;
|
|
1197
|
+
GGML_ASSERT(t_meta.ne[dim] >= 1);
|
|
1198
|
+
t_meta.nb[dim] = dim == 0 ? ggml_type_size(type) : t_meta.ne[dim-1]*t_meta.nb[dim-1];
|
|
1199
|
+
GGML_ASSERT(t_meta.nb[dim] >= 1);
|
|
1200
|
+
}
|
|
1201
|
+
ggml_set_name(&t_meta, tn.str().c_str());
|
|
1202
|
+
|
|
1203
|
+
ggml_backend_buffer_type_t buft = buft_for_tensor(&t_meta);
|
|
1204
|
+
GGML_ASSERT(buft != nullptr);
|
|
1205
|
+
ggml_context * ctx = ctx_for_buft(buft);
|
|
1206
|
+
ggml_tensor * ret = ggml_dup_tensor(ctx, &t_meta);
|
|
1207
|
+
ggml_set_name(ret, tn.str().c_str());
|
|
1208
|
+
return ret;
|
|
1209
|
+
}
|
|
1210
|
+
|
|
1211
|
+
ggml_tensor * t_meta = get_tensor_meta(tn.str().c_str());
|
|
1212
|
+
ggml_backend_buffer_type_t buft = buft_for_tensor(t_meta);
|
|
1213
|
+
if (buft == nullptr) {
|
|
1214
|
+
return nullptr; // return type is ggml_tensor *
|
|
1215
|
+
}
|
|
1216
|
+
ggml_context * ctx = ctx_for_buft(buft);
|
|
1217
|
+
|
|
1218
|
+
// if duplicated, check if the original tensor was allocated in the same buffer type context and avoid creating a new one
|
|
1219
|
+
if (flags & TENSOR_DUPLICATED) {
|
|
1220
|
+
ggml_tensor * t = ggml_get_tensor(ctx, tn.str().c_str());
|
|
1221
|
+
if (t) {
|
|
1222
|
+
return t;
|
|
1223
|
+
}
|
|
1224
|
+
}
|
|
1225
|
+
|
|
1226
|
+
LLAMA_LOG_DEBUG("%s: loading tensor %s\n", __func__, tn.str().c_str());
|
|
1227
|
+
const struct ggml_tensor * cur = check_tensor_dims(tn.str(), ne, !(flags & TENSOR_NOT_REQUIRED));
|
|
830
1228
|
|
|
831
1229
|
if (cur == NULL) {
|
|
832
1230
|
return NULL;
|
|
833
1231
|
}
|
|
834
1232
|
|
|
835
|
-
bool duplicated = flags & TENSOR_DUPLICATED;
|
|
1233
|
+
const bool duplicated = flags & TENSOR_DUPLICATED;
|
|
836
1234
|
|
|
837
1235
|
struct ggml_tensor * tensor = ggml_dup_tensor(ctx, cur);
|
|
838
1236
|
ggml_set_name(tensor, ggml_get_name(cur));
|
|
@@ -844,7 +1242,6 @@ struct ggml_tensor * llama_model_loader::create_tensor(struct ggml_context * ctx
|
|
|
844
1242
|
}
|
|
845
1243
|
|
|
846
1244
|
return tensor;
|
|
847
|
-
|
|
848
1245
|
}
|
|
849
1246
|
|
|
850
1247
|
struct ggml_tensor * llama_model_loader::create_tensor_as_view(struct ggml_context * ctx, struct ggml_tensor * base, const std::string & name, const std::initializer_list<int64_t> & ne, size_t offset, bool required) {
|
|
@@ -879,6 +1276,11 @@ void llama_model_loader::done_getting_tensors() const {
|
|
|
879
1276
|
if (n_created != n_tensors) {
|
|
880
1277
|
throw std::runtime_error(format("%s: wrong number of tensors; expected %d, got %d", __func__, n_tensors, n_created));
|
|
881
1278
|
}
|
|
1279
|
+
if (n_tensors_moved > 0) {
|
|
1280
|
+
LLAMA_LOG_DEBUG("%s: tensor '%s' (%s) (and %zu others) cannot be used with preferred buffer type %s, using %s instead\n",
|
|
1281
|
+
__func__, first_tensor_moved_name.c_str(), first_tensor_moved_type_name.c_str(), n_tensors_moved - 1,
|
|
1282
|
+
ggml_backend_buft_name(first_moved_from_buft), ggml_backend_buft_name(first_moved_to_buft));
|
|
1283
|
+
}
|
|
882
1284
|
}
|
|
883
1285
|
|
|
884
1286
|
void llama_model_loader::init_mappings(bool prefetch, llama_mlocks * mlock_mmaps) {
|
|
@@ -960,6 +1362,12 @@ bool llama_model_loader::load_all_data(
|
|
|
960
1362
|
llama_mlocks * lmlocks,
|
|
961
1363
|
llama_progress_callback progress_callback,
|
|
962
1364
|
void * progress_callback_user_data) {
|
|
1365
|
+
if (files.empty()) {
|
|
1366
|
+
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) {
|
|
1367
|
+
set_tensor_data(t, set_tensor_data_ud);
|
|
1368
|
+
}
|
|
1369
|
+
return true;
|
|
1370
|
+
}
|
|
963
1371
|
GGML_ASSERT(size_data != 0 && "call init_mappings() first");
|
|
964
1372
|
|
|
965
1373
|
std::vector<no_init<uint8_t>> read_buf;
|