whispercpp 1.3.5 → 1.3.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/LICENSE +1 -1
- data/README.md +99 -2
- data/ext/extconf.rb +1 -0
- data/ext/ruby_whisper.c +20 -4
- data/ext/ruby_whisper.h +30 -2
- data/ext/ruby_whisper_context.c +216 -124
- data/ext/ruby_whisper_context_params.c +163 -0
- data/ext/ruby_whisper_model.c +0 -1
- data/ext/ruby_whisper_params.c +0 -1
- data/ext/ruby_whisper_segment.c +0 -1
- data/ext/ruby_whisper_token.c +29 -9
- data/ext/ruby_whisper_transcribe.cpp +4 -1
- data/ext/ruby_whisper_vad_context.c +48 -1
- data/ext/ruby_whisper_vad_context_detect.cpp +6 -5
- data/ext/ruby_whisper_vad_params.c +0 -1
- data/ext/ruby_whisper_vad_segment.c +0 -1
- data/ext/ruby_whisper_vad_segments.c +0 -1
- data/ext/sources/CMakeLists.txt +1 -1
- data/ext/sources/bindings/javascript/package.json +1 -1
- data/ext/sources/cmake/whisper-config.cmake.in +5 -40
- data/ext/sources/examples/bench/bench.cpp +23 -18
- data/ext/sources/examples/cli/cli.cpp +8 -0
- data/ext/sources/examples/common-ggml.cpp +2 -0
- data/ext/sources/examples/miniaudio.h +4507 -2131
- data/ext/sources/examples/server/server.cpp +18 -4
- data/ext/sources/examples/talk-llama/CMakeLists.txt +3 -2
- data/ext/sources/examples/talk-llama/llama-adapter.cpp +7 -13
- data/ext/sources/examples/talk-llama/llama-adapter.h +4 -3
- data/ext/sources/examples/talk-llama/llama-arch.cpp +335 -17
- data/ext/sources/examples/talk-llama/llama-arch.h +42 -0
- data/ext/sources/examples/talk-llama/llama-batch.cpp +3 -1
- data/ext/sources/examples/talk-llama/llama-chat.cpp +21 -1
- data/ext/sources/examples/talk-llama/llama-chat.h +1 -0
- data/ext/sources/examples/talk-llama/llama-context.cpp +508 -520
- data/ext/sources/examples/talk-llama/llama-context.h +27 -28
- data/ext/sources/examples/talk-llama/llama-cparams.h +5 -0
- data/ext/sources/examples/talk-llama/llama-ext.h +12 -0
- data/ext/sources/examples/talk-llama/llama-grammar.cpp +8 -8
- data/ext/sources/examples/talk-llama/llama-graph.cpp +583 -130
- data/ext/sources/examples/talk-llama/llama-graph.h +131 -10
- data/ext/sources/examples/talk-llama/llama-hparams.cpp +57 -40
- data/ext/sources/examples/talk-llama/llama-hparams.h +79 -10
- data/ext/sources/examples/talk-llama/llama-impl.cpp +4 -4
- data/ext/sources/examples/talk-llama/llama-impl.h +13 -1
- data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.cpp +3 -1
- data/ext/sources/examples/talk-llama/llama-kv-cache.cpp +274 -89
- data/ext/sources/examples/talk-llama/llama-kv-cache.h +2 -3
- data/ext/sources/examples/talk-llama/llama-memory-hybrid-iswa.cpp +275 -0
- data/ext/sources/examples/talk-llama/llama-memory-hybrid-iswa.h +140 -0
- data/ext/sources/examples/talk-llama/llama-memory-recurrent.cpp +11 -13
- data/ext/sources/examples/talk-llama/llama-mmap.cpp +28 -11
- data/ext/sources/examples/talk-llama/llama-model-loader.cpp +527 -119
- data/ext/sources/examples/talk-llama/llama-model-loader.h +35 -5
- data/ext/sources/examples/talk-llama/llama-model-saver.cpp +60 -46
- data/ext/sources/examples/talk-llama/llama-model-saver.h +5 -2
- data/ext/sources/examples/talk-llama/llama-model.cpp +1365 -647
- data/ext/sources/examples/talk-llama/llama-model.h +72 -19
- data/ext/sources/examples/talk-llama/llama-quant.cpp +578 -346
- data/ext/sources/examples/talk-llama/{llama-sampling.cpp → llama-sampler.cpp} +190 -76
- data/ext/sources/examples/talk-llama/{llama-sampling.h → llama-sampler.h} +0 -2
- data/ext/sources/examples/talk-llama/llama-vocab.cpp +118 -48
- data/ext/sources/examples/talk-llama/llama-vocab.h +5 -0
- data/ext/sources/examples/talk-llama/llama.cpp +76 -22
- data/ext/sources/examples/talk-llama/llama.h +63 -30
- data/ext/sources/examples/talk-llama/models/afmoe.cpp +2 -3
- data/ext/sources/examples/talk-llama/models/apertus.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/arcee.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/arctic.cpp +4 -5
- data/ext/sources/examples/talk-llama/models/baichuan.cpp +4 -3
- data/ext/sources/examples/talk-llama/models/bailingmoe.cpp +1 -2
- data/ext/sources/examples/talk-llama/models/bailingmoe2.cpp +3 -5
- data/ext/sources/examples/talk-llama/models/bert.cpp +13 -7
- data/ext/sources/examples/talk-llama/models/bitnet.cpp +9 -24
- data/ext/sources/examples/talk-llama/models/bloom.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/chameleon.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/chatglm.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/codeshell.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/cogvlm.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/cohere2-iswa.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/command-r.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/dbrx.cpp +4 -5
- data/ext/sources/examples/talk-llama/models/deci.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/deepseek.cpp +4 -6
- data/ext/sources/examples/talk-llama/models/deepseek2.cpp +24 -21
- data/ext/sources/examples/talk-llama/models/delta-net-base.cpp +445 -0
- data/ext/sources/examples/talk-llama/models/dots1.cpp +4 -6
- data/ext/sources/examples/talk-llama/models/dream.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/ernie4-5-moe.cpp +4 -6
- data/ext/sources/examples/talk-llama/models/ernie4-5.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/eurobert.cpp +97 -0
- data/ext/sources/examples/talk-llama/models/exaone-moe.cpp +145 -0
- data/ext/sources/examples/talk-llama/models/exaone.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/exaone4.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/falcon-h1.cpp +2 -4
- data/ext/sources/examples/talk-llama/models/falcon.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/gemma-embedding.cpp +1 -1
- data/ext/sources/examples/talk-llama/models/gemma.cpp +1 -1
- data/ext/sources/examples/talk-llama/models/gemma2-iswa.cpp +1 -1
- data/ext/sources/examples/talk-llama/models/gemma3.cpp +1 -1
- data/ext/sources/examples/talk-llama/models/gemma3n-iswa.cpp +7 -7
- data/ext/sources/examples/talk-llama/models/glm4-moe.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/glm4.cpp +14 -7
- data/ext/sources/examples/talk-llama/models/gpt2.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/gptneox.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/granite-hybrid.cpp +4 -5
- data/ext/sources/examples/talk-llama/models/granite.cpp +4 -5
- data/ext/sources/examples/talk-llama/models/grok.cpp +4 -4
- data/ext/sources/examples/talk-llama/models/grovemoe.cpp +5 -7
- data/ext/sources/examples/talk-llama/models/hunyuan-dense.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/hunyuan-moe.cpp +4 -5
- data/ext/sources/examples/talk-llama/models/internlm2.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/jais.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/jais2.cpp +123 -0
- data/ext/sources/examples/talk-llama/models/jamba.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/kimi-linear.cpp +381 -0
- data/ext/sources/examples/talk-llama/models/lfm2.cpp +145 -124
- data/ext/sources/examples/talk-llama/models/llada-moe.cpp +4 -4
- data/ext/sources/examples/talk-llama/models/llada.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/llama-iswa.cpp +4 -4
- data/ext/sources/examples/talk-llama/models/llama.cpp +18 -11
- data/ext/sources/examples/talk-llama/models/maincoder.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/{graph-context-mamba.cpp → mamba-base.cpp} +9 -3
- data/ext/sources/examples/talk-llama/models/mamba.cpp +1 -2
- data/ext/sources/examples/talk-llama/models/mimo2-iswa.cpp +11 -5
- data/ext/sources/examples/talk-llama/models/minicpm3.cpp +14 -13
- data/ext/sources/examples/talk-llama/models/minimax-m2.cpp +4 -5
- data/ext/sources/examples/talk-llama/models/mistral3.cpp +4 -4
- data/ext/sources/examples/talk-llama/models/models.h +181 -46
- data/ext/sources/examples/talk-llama/models/modern-bert.cpp +2 -9
- data/ext/sources/examples/talk-llama/models/mpt.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/nemotron-h.cpp +26 -14
- data/ext/sources/examples/talk-llama/models/nemotron.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/neo-bert.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/olmo.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/olmo2.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/olmoe.cpp +4 -4
- data/ext/sources/examples/talk-llama/models/openai-moe-iswa.cpp +1 -1
- data/ext/sources/examples/talk-llama/models/openelm.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/orion.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/paddleocr.cpp +122 -0
- data/ext/sources/examples/talk-llama/models/pangu-embedded.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/phi2.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/phi3.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/plamo.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/plamo2.cpp +9 -5
- data/ext/sources/examples/talk-llama/models/plamo3.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/plm.cpp +15 -14
- data/ext/sources/examples/talk-llama/models/qwen.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/qwen2.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/qwen2moe.cpp +4 -4
- data/ext/sources/examples/talk-llama/models/qwen2vl.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/qwen3.cpp +12 -9
- data/ext/sources/examples/talk-llama/models/qwen35.cpp +381 -0
- data/ext/sources/examples/talk-llama/models/qwen35moe.cpp +422 -0
- data/ext/sources/examples/talk-llama/models/qwen3moe.cpp +15 -8
- data/ext/sources/examples/talk-llama/models/qwen3next.cpp +84 -432
- data/ext/sources/examples/talk-llama/models/qwen3vl-moe.cpp +9 -18
- data/ext/sources/examples/talk-llama/models/qwen3vl.cpp +8 -17
- data/ext/sources/examples/talk-llama/models/refact.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/rnd1.cpp +4 -4
- data/ext/sources/examples/talk-llama/models/rwkv6-base.cpp +2 -0
- data/ext/sources/examples/talk-llama/models/rwkv7-base.cpp +2 -0
- data/ext/sources/examples/talk-llama/models/seed-oss.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/smallthinker.cpp +4 -4
- data/ext/sources/examples/talk-llama/models/smollm3.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/stablelm.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/starcoder.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/starcoder2.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/step35-iswa.cpp +165 -0
- data/ext/sources/examples/talk-llama/models/t5-dec.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/t5-enc.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/xverse.cpp +3 -3
- data/ext/sources/examples/talk-llama/unicode.cpp +21 -65
- data/ext/sources/ggml/CMakeLists.txt +9 -3
- data/ext/sources/ggml/include/ggml-backend.h +1 -1
- data/ext/sources/ggml/include/ggml-cann.h +1 -1
- data/ext/sources/ggml/include/ggml-cpu.h +5 -0
- data/ext/sources/ggml/include/ggml-openvino.h +37 -0
- data/ext/sources/ggml/include/ggml-opt.h +1 -1
- data/ext/sources/ggml/include/ggml-rpc.h +6 -1
- data/ext/sources/ggml/include/ggml-virtgpu.h +14 -0
- data/ext/sources/ggml/include/ggml.h +56 -9
- data/ext/sources/ggml/src/CMakeLists.txt +3 -0
- data/ext/sources/ggml/src/ggml-alloc.c +4 -9
- data/ext/sources/ggml/src/ggml-backend-dl.cpp +48 -0
- data/ext/sources/ggml/src/ggml-backend-dl.h +45 -0
- data/ext/sources/ggml/src/ggml-backend-reg.cpp +28 -86
- data/ext/sources/ggml/src/ggml-backend.cpp +5 -2
- data/ext/sources/ggml/src/ggml-blas/CMakeLists.txt +1 -1
- data/ext/sources/ggml/src/ggml-blas/ggml-blas.cpp +6 -2
- data/ext/sources/ggml/src/ggml-cann/acl_tensor.cpp +1 -1
- data/ext/sources/ggml/src/ggml-cann/acl_tensor.h +1 -1
- data/ext/sources/ggml/src/ggml-cann/aclnn_ops.cpp +348 -189
- data/ext/sources/ggml/src/ggml-cann/aclnn_ops.h +40 -85
- data/ext/sources/ggml/src/ggml-cann/common.h +3 -4
- data/ext/sources/ggml/src/ggml-cann/ggml-cann.cpp +44 -62
- data/ext/sources/ggml/src/ggml-common.h +11 -0
- data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +16 -11
- data/ext/sources/ggml/src/ggml-cpu/amx/amx.cpp +42 -19
- data/ext/sources/ggml/src/ggml-cpu/amx/common.h +34 -10
- data/ext/sources/ggml/src/ggml-cpu/amx/mmq.cpp +85 -85
- data/ext/sources/ggml/src/ggml-cpu/arch/arm/quants.c +85 -1
- data/ext/sources/ggml/src/ggml-cpu/arch/arm/repack.cpp +2744 -548
- data/ext/sources/ggml/src/ggml-cpu/arch/riscv/quants.c +1653 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/riscv/repack.cpp +1391 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/s390/quants.c +8 -10
- data/ext/sources/ggml/src/ggml-cpu/arch/x86/quants.c +9 -9
- data/ext/sources/ggml/src/ggml-cpu/arch/x86/repack.cpp +118 -18
- data/ext/sources/ggml/src/ggml-cpu/arch-fallback.h +107 -26
- data/ext/sources/ggml/src/ggml-cpu/binary-ops.cpp +2 -6
- data/ext/sources/ggml/src/ggml-cpu/common.h +8 -0
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-impl.h +3 -0
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +59 -12
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.cpp +15 -0
- data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.cpp +21 -20
- data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +965 -252
- data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +584 -197
- data/ext/sources/ggml/src/ggml-cpu/ops.cpp +903 -188
- data/ext/sources/ggml/src/ggml-cpu/ops.h +1 -0
- data/ext/sources/ggml/src/ggml-cpu/quants.c +40 -0
- data/ext/sources/ggml/src/ggml-cpu/quants.h +3 -0
- data/ext/sources/ggml/src/ggml-cpu/repack.cpp +2890 -679
- data/ext/sources/ggml/src/ggml-cpu/repack.h +119 -8
- data/ext/sources/ggml/src/ggml-cpu/simd-gemm.h +136 -0
- data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +111 -3
- data/ext/sources/ggml/src/ggml-cpu/unary-ops.cpp +1 -1
- data/ext/sources/ggml/src/ggml-cpu/vec.cpp +17 -0
- data/ext/sources/ggml/src/ggml-cuda/CMakeLists.txt +1 -1
- data/ext/sources/ggml/src/ggml-cuda/argsort.cu +19 -10
- data/ext/sources/ggml/src/ggml-cuda/binbcast.cu +32 -30
- data/ext/sources/ggml/src/ggml-cuda/common.cuh +134 -18
- data/ext/sources/ggml/src/ggml-cuda/convert.cu +41 -27
- data/ext/sources/ggml/src/ggml-cuda/cpy.cu +6 -3
- data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +78 -64
- data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +384 -143
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cuh +36 -22
- data/ext/sources/ggml/src/ggml-cuda/fattn-vec.cuh +3 -3
- data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +26 -5
- data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cuh +1 -1
- data/ext/sources/ggml/src/ggml-cuda/fattn.cu +127 -12
- data/ext/sources/ggml/src/ggml-cuda/gated_delta_net.cu +263 -0
- data/ext/sources/ggml/src/ggml-cuda/gated_delta_net.cuh +4 -0
- data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +595 -200
- data/ext/sources/ggml/src/ggml-cuda/mean.cu +9 -8
- data/ext/sources/ggml/src/ggml-cuda/mma.cuh +173 -6
- data/ext/sources/ggml/src/ggml-cuda/mmf.cu +30 -10
- data/ext/sources/ggml/src/ggml-cuda/mmf.cuh +158 -85
- data/ext/sources/ggml/src/ggml-cuda/mmq.cuh +34 -22
- data/ext/sources/ggml/src/ggml-cuda/mmvf.cu +127 -67
- data/ext/sources/ggml/src/ggml-cuda/mmvf.cuh +2 -0
- data/ext/sources/ggml/src/ggml-cuda/mmvq.cu +157 -65
- data/ext/sources/ggml/src/ggml-cuda/mmvq.cuh +1 -0
- data/ext/sources/ggml/src/ggml-cuda/norm.cu +18 -76
- data/ext/sources/ggml/src/ggml-cuda/pad.cu +13 -10
- data/ext/sources/ggml/src/ggml-cuda/quantize.cu +1 -1
- data/ext/sources/ggml/src/ggml-cuda/reduce_rows.cuh +2 -16
- data/ext/sources/ggml/src/ggml-cuda/rope.cu +233 -133
- data/ext/sources/ggml/src/ggml-cuda/softmax.cu +8 -83
- data/ext/sources/ggml/src/ggml-cuda/solve_tri.cu +1 -1
- data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cu +56 -32
- data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cuh +1 -1
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_32.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_4.cu +1 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_32.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_4.cu +1 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_4.cu +1 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_4.cu +1 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +3 -3
- data/ext/sources/ggml/src/ggml-cuda/top-k.cu +0 -1
- data/ext/sources/ggml/src/ggml-cuda/topk-moe.cu +199 -135
- data/ext/sources/ggml/src/ggml-cuda/topk-moe.cuh +20 -14
- data/ext/sources/ggml/src/ggml-cuda/unary.cu +55 -0
- data/ext/sources/ggml/src/ggml-cuda/unary.cuh +2 -0
- data/ext/sources/ggml/src/ggml-cuda/vecdotq.cuh +31 -17
- data/ext/sources/ggml/src/ggml-cuda/vendors/hip.h +10 -0
- data/ext/sources/ggml/src/ggml-hexagon/CMakeLists.txt +82 -45
- data/ext/sources/ggml/src/ggml-hexagon/ggml-hexagon.cpp +334 -160
- data/ext/sources/ggml/src/ggml-hexagon/htp/CMakeLists.txt +7 -5
- data/ext/sources/ggml/src/ggml-hexagon/htp/act-ops.c +328 -197
- data/ext/sources/ggml/src/ggml-hexagon/htp/argsort-ops.c +281 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/binary-ops.c +765 -234
- data/ext/sources/ggml/src/ggml-hexagon/htp/cpy-ops.c +252 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/flash-attn-ops.c +412 -265
- data/ext/sources/ggml/src/ggml-hexagon/htp/get-rows-ops.c +23 -23
- data/ext/sources/ggml/src/ggml-hexagon/htp/{htp-dma.c → hex-dma.c} +1 -1
- data/ext/sources/ggml/src/ggml-hexagon/htp/{htp-dma.h → hex-dma.h} +28 -3
- data/ext/sources/ggml/src/ggml-hexagon/htp/hex-dump.h +77 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hex-fastdiv.h +37 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hex-utils.h +51 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ctx.h +1 -1
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp-msg.h +27 -37
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ops.h +6 -35
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-arith.h +443 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-base.h +240 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-copy.h +245 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-div.h +251 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-dump.h +129 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-exp.h +215 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-floor.h +100 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-inverse.h +210 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-reduce.h +296 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-scale.h +133 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sigmoid.h +141 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sqrt.h +126 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-types.h +36 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-utils.h +20 -1347
- data/ext/sources/ggml/src/ggml-hexagon/htp/main.c +211 -13
- data/ext/sources/ggml/src/ggml-hexagon/htp/matmul-ops.c +1119 -952
- data/ext/sources/ggml/src/ggml-hexagon/htp/rope-ops.c +254 -244
- data/ext/sources/ggml/src/ggml-hexagon/htp/set-rows-ops.c +36 -36
- data/ext/sources/ggml/src/ggml-hexagon/htp/softmax-ops.c +155 -138
- data/ext/sources/ggml/src/ggml-hexagon/htp/ssm-conv.c +339 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/sum-rows-ops.c +128 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/unary-ops.c +209 -114
- data/ext/sources/ggml/src/ggml-hexagon/htp/worker-pool.c +1 -5
- data/ext/sources/ggml/src/ggml-hexagon/htp-drv.cpp +418 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp-drv.h +121 -0
- data/ext/sources/ggml/src/ggml-hexagon/libdl.h +79 -0
- data/ext/sources/ggml/src/ggml-hexagon/libggml-htp.inf +38 -0
- data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +6 -0
- data/ext/sources/ggml/src/ggml-impl.h +62 -0
- data/ext/sources/ggml/src/ggml-metal/CMakeLists.txt +10 -10
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-common.cpp +13 -2
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.h +8 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.m +147 -17
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.cpp +274 -73
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.h +22 -4
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.m +102 -36
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +174 -23
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.cpp +580 -280
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.h +5 -4
- data/ext/sources/ggml/src/ggml-metal/ggml-metal.cpp +320 -107
- data/ext/sources/ggml/src/ggml-metal/ggml-metal.metal +1068 -825
- data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +19 -1
- data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +3108 -636
- data/ext/sources/ggml/src/ggml-opencl/kernels/concat.cl +41 -99
- data/ext/sources/ggml/src/ggml-opencl/kernels/cpy.cl +45 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/cumsum.cl +139 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/cvt.cl +204 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/diag.cl +27 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/exp.cl +125 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/expm1.cl +87 -56
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q4_1_f32.cl +132 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_general_q8_0_f32.cl +195 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q4_1_f32.cl +283 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/l2_norm.cl +71 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mean.cl +114 -13
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q4_0_f32_l4_lm.cl +163 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q4_1_f32_l4_lm.cl +165 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q6_k_f32_l4_lm.cl +158 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_8x4.cl +129 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32.cl +219 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32_flat.cl +229 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_k_f32.cl +180 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/{mul_mv_q6_k.cl → mul_mv_q6_k_f32.cl} +4 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32_flat.cl +194 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/neg.cl +125 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/repeat.cl +31 -32
- data/ext/sources/ggml/src/ggml-opencl/kernels/scale.cl +14 -4
- data/ext/sources/ggml/src/ggml-opencl/kernels/softplus.cl +88 -60
- data/ext/sources/ggml/src/ggml-opencl/kernels/solve_tri.cl +51 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/sum_rows.cl +114 -13
- data/ext/sources/ggml/src/ggml-opencl/kernels/tanh.cl +94 -48
- data/ext/sources/ggml/src/ggml-opencl/kernels/transpose.cl +26 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/tri.cl +32 -0
- data/ext/sources/ggml/src/ggml-openvino/.clang-format +154 -0
- data/ext/sources/ggml/src/ggml-openvino/CMakeLists.txt +22 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-decoder.cpp +975 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-decoder.h +294 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-openvino-extra.cpp +373 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-openvino-extra.h +182 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-openvino.cpp +1110 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-quants.cpp +884 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-quants.h +153 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/decoder.h +74 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/frontend.cpp +27 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/frontend.h +23 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/input_model.cpp +17 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/input_model.h +29 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/node_context.h +112 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/cont.cpp +48 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/cpy.cpp +21 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp +90 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/get_rows.cpp +69 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/glu_geglu.cpp +61 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp +62 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/mulmat.cpp +90 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/permute.cpp +102 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/reshape.cpp +83 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp +46 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/rope.cpp +123 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/scale.cpp +41 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/set_rows.cpp +76 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/softmax.cpp +89 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/transpose.cpp +23 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/unary_silu.cpp +27 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/view.cpp +53 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op_table.cpp +46 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op_table.h +39 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.cpp +123 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.h +17 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.cpp +60 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.h +17 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/mark_decompression_convert_constant_folding.h +29 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/squeeze_matmul.cpp +58 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/squeeze_matmul.h +17 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/translate_session.cpp +293 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/translate_session.h +28 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/utils.cpp +226 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/utils.h +85 -0
- data/ext/sources/ggml/src/ggml-openvino/utils.cpp +823 -0
- data/ext/sources/ggml/src/ggml-openvino/utils.h +123 -0
- data/ext/sources/ggml/src/ggml-quants.c +96 -5
- data/ext/sources/ggml/src/ggml-quants.h +3 -0
- data/ext/sources/ggml/src/ggml-sycl/CMakeLists.txt +15 -88
- data/ext/sources/ggml/src/ggml-sycl/add-id.cpp +5 -1
- data/ext/sources/ggml/src/ggml-sycl/backend.hpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/binbcast.cpp +21 -20
- data/ext/sources/ggml/src/ggml-sycl/common.hpp +315 -10
- data/ext/sources/ggml/src/ggml-sycl/convert.cpp +69 -1
- data/ext/sources/ggml/src/ggml-sycl/convert.hpp +22 -1
- data/ext/sources/ggml/src/ggml-sycl/count-equal.cpp +1 -1
- data/ext/sources/ggml/src/ggml-sycl/dpct/helper.hpp +791 -47
- data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +78 -68
- data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +2 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn-common.hpp +1179 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn-tile.cpp +55 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn-tile.hpp +1338 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn-vec.hpp +667 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn.cpp +225 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn.hpp +22 -0
- data/ext/sources/ggml/src/ggml-sycl/gated_delta_net.cpp +309 -0
- data/ext/sources/ggml/src/ggml-sycl/gated_delta_net.hpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/ggml-sycl.cpp +316 -51
- data/ext/sources/ggml/src/ggml-sycl/norm.cpp +65 -66
- data/ext/sources/ggml/src/ggml-sycl/outprod.cpp +3 -3
- data/ext/sources/ggml/src/ggml-sycl/presets.hpp +3 -0
- data/ext/sources/ggml/src/ggml-sycl/quants.hpp +1 -1
- data/ext/sources/ggml/src/ggml-sycl/rope.cpp +450 -287
- data/ext/sources/ggml/src/ggml-sycl/rope.hpp +6 -0
- data/ext/sources/ggml/src/ggml-sycl/softmax.cpp +6 -6
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq112-dv112.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq128-dv128.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq256-dv256.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq40-dv40.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq576-dv512.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq64-dv64.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq72-dv72.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq80-dv80.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq96-dv96.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-f16.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q8_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-f16.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q8_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-f16.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q8_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-f16.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q8_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-f16.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q8_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-f16.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q8_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +13 -0
- data/ext/sources/ggml/src/ggml-sycl/wkv.cpp +1 -1
- data/ext/sources/ggml/src/ggml-virtgpu/CMakeLists.txt +70 -0
- data/ext/sources/ggml/src/ggml-virtgpu/apir_cs_ggml-rpc-front.cpp +87 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/CMakeLists.txt +21 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/apir_cs_ggml-rpc-back.cpp +115 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-convert.h +13 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-backend.cpp +102 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer-type.cpp +105 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer.cpp +179 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-device.cpp +148 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched.cpp +51 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched.gen.h +73 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched.h +27 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-virgl-apir.h +32 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend.cpp +144 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/api_remoting.h +95 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_backend.gen.h +94 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_backend.h +50 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_cs.h +378 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_cs_ggml.h +232 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_cs_rpc.h +58 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-buffer-type.cpp +81 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-buffer.cpp +119 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-device.cpp +158 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-reg.cpp +213 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend.cpp +69 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-remoting.h +71 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggmlremoting_functions.yaml +166 -0
- data/ext/sources/ggml/src/ggml-virtgpu/include/apir_hw.h +9 -0
- data/ext/sources/ggml/src/ggml-virtgpu/regenerate_remoting.py +333 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-apir.h +15 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-backend.cpp +58 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-buffer-type.cpp +110 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-buffer.cpp +173 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-device.cpp +192 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-impl.h +36 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward.gen.h +53 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-shm.cpp +98 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-shm.h +23 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-utils.cpp +179 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-utils.h +86 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu.cpp +544 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu.h +117 -0
- data/ext/sources/ggml/src/ggml-vulkan/CMakeLists.txt +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/ggml-vulkan.cpp +1250 -465
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/acc.comp +16 -8
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/elu.comp +27 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +374 -170
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.glsl +66 -22
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +389 -201
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +106 -58
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_mask_opt.comp +162 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +9 -8
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gated_delta_net.comp +128 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/l2_norm.comp +12 -9
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.glsl +20 -17
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +11 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +8 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_id_funcs.glsl +3 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +5 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.glsl +3 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +2 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl +36 -63
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +7 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +7 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +7 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl +10 -5
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_vision.comp +7 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sgn.comp +21 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/ssm_conv.comp +16 -10
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +55 -35
- data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp +1314 -109
- data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu.cpp +1660 -1371
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/argmax.wgsl +72 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/argsort.wgsl +106 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/argsort_merge.wgsl +134 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary.wgsl +141 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +65 -72
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/concat.wgsl +75 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cpy.tmpl.wgsl +6 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cumsum.wgsl +66 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +40 -5
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn.wgsl +105 -60
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{get_rows.tmpl.wgsl → get_rows.wgsl} +53 -259
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{mul_mat.tmpl.wgsl → mul_mat.wgsl} +68 -257
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl +692 -23
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{mul_mat_reg_tile.tmpl.wgsl → mul_mat_reg_tile.wgsl} +28 -128
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{mul_mat_subgroup_matrix.tmpl.wgsl → mul_mat_subgroup_matrix.wgsl} +31 -137
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.wgsl +480 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/pad.wgsl +86 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/repeat.wgsl +67 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{scale.tmpl.wgsl → scale.wgsl} +9 -36
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +40 -12
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/sum_rows.wgsl +55 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/unary.wgsl +193 -0
- data/ext/sources/ggml/src/ggml-zdnn/ggml-zdnn.cpp +6 -1
- data/ext/sources/ggml/src/ggml-zendnn/CMakeLists.txt +31 -32
- data/ext/sources/ggml/src/ggml-zendnn/ggml-zendnn.cpp +9 -6
- data/ext/sources/ggml/src/ggml.c +167 -33
- data/ext/sources/ggml/src/gguf.cpp +229 -44
- data/ext/sources/src/whisper.cpp +6 -28
- data/sig/whisper.rbs +43 -2
- data/test/test_context_params.rb +82 -0
- data/test/test_token.rb +11 -0
- data/test/test_vad_context.rb +58 -8
- data/test/test_whisper.rb +20 -0
- data/whispercpp.gemspec +1 -1
- metadata +240 -28
- data/ext/sources/ggml/cmake/BuildTypes.cmake +0 -54
- data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm-ppc.h +0 -333
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-exp.c +0 -94
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-inverse.c +0 -72
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sigmoid.c +0 -49
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-utils.c +0 -1020
- data/ext/sources/ggml/src/ggml-hexagon/htp/ops-utils.h +0 -149
- data/ext/sources/ggml/src/ggml-hexagon/htp-utils.c +0 -454
- data/ext/sources/ggml/src/ggml-hexagon/htp-utils.h +0 -221
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/bin_op.tmpl.wgsl +0 -188
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary_head.tmpl +0 -45
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.tmpl.wgsl +0 -267
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.tmpl.wgsl +0 -112
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/unary_op.wgsl +0 -483
|
@@ -5,6 +5,7 @@
|
|
|
5
5
|
#include "ggml-cpu.h"
|
|
6
6
|
#include "ggml-backend.h"
|
|
7
7
|
#include "ggml-opt.h"
|
|
8
|
+
#include "gguf.h"
|
|
8
9
|
|
|
9
10
|
#include <stddef.h>
|
|
10
11
|
#include <stdint.h>
|
|
@@ -152,6 +153,7 @@ extern "C" {
|
|
|
152
153
|
LLAMA_FTYPE_MOSTLY_TQ1_0 = 36, // except 1d tensors
|
|
153
154
|
LLAMA_FTYPE_MOSTLY_TQ2_0 = 37, // except 1d tensors
|
|
154
155
|
LLAMA_FTYPE_MOSTLY_MXFP4_MOE = 38, // except 1d tensors
|
|
156
|
+
LLAMA_FTYPE_MOSTLY_NVFP4 = 39, // except 1d tensors
|
|
155
157
|
|
|
156
158
|
LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
|
|
157
159
|
};
|
|
@@ -309,7 +311,7 @@ extern "C" {
|
|
|
309
311
|
// Keep the booleans together to avoid misalignment during copy-by-value.
|
|
310
312
|
bool vocab_only; // only load the vocabulary, no weights
|
|
311
313
|
bool use_mmap; // use mmap if possible
|
|
312
|
-
bool use_direct_io; // use direct io, takes precedence over use_mmap
|
|
314
|
+
bool use_direct_io; // use direct io, takes precedence over use_mmap when supported
|
|
313
315
|
bool use_mlock; // force system to keep model in RAM
|
|
314
316
|
bool check_tensors; // validate model tensor data
|
|
315
317
|
bool use_extra_bufts; // use extra buffer types (used for weight repacking)
|
|
@@ -389,6 +391,7 @@ extern "C" {
|
|
|
389
391
|
bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
|
|
390
392
|
bool pure; // quantize all tensors to the default type
|
|
391
393
|
bool keep_split; // quantize to the same number of shards
|
|
394
|
+
bool dry_run; // calculate and show the final quantization size without performing quantization
|
|
392
395
|
void * imatrix; // pointer to importance matrix data
|
|
393
396
|
void * kv_overrides; // pointer to vector containing overrides
|
|
394
397
|
void * tensor_types; // pointer to vector containing tensor types
|
|
@@ -439,19 +442,30 @@ extern "C" {
|
|
|
439
442
|
|
|
440
443
|
LLAMA_API void llama_detach_threadpool(struct llama_context * ctx);
|
|
441
444
|
|
|
445
|
+
typedef void (*llama_model_set_tensor_data_t)(struct ggml_tensor * tensor, void * userdata);
|
|
446
|
+
|
|
447
|
+
// Create a new model from GGUF metadata as well as a function to set the tensor data
|
|
448
|
+
// - tensors are created as GGML_TYPE_F32 by default,
|
|
449
|
+
// override by adding a tensor with the same name but a different type to the context
|
|
450
|
+
LLAMA_API struct llama_model * llama_model_init_from_user(
|
|
451
|
+
struct gguf_context * metadata,
|
|
452
|
+
llama_model_set_tensor_data_t set_tensor_data, // function to initialize tensor data with
|
|
453
|
+
void * set_tensor_data_ud, // userdata for function
|
|
454
|
+
struct llama_model_params params);
|
|
455
|
+
|
|
442
456
|
DEPRECATED(LLAMA_API struct llama_model * llama_load_model_from_file(
|
|
443
457
|
const char * path_model,
|
|
444
458
|
struct llama_model_params params),
|
|
445
459
|
"use llama_model_load_from_file instead");
|
|
446
460
|
|
|
447
|
-
// Load
|
|
461
|
+
// Load a model from a file
|
|
448
462
|
// If the file is split into multiple parts, the file name must follow this pattern: <name>-%05d-of-%05d.gguf
|
|
449
463
|
// If the split file name does not follow this pattern, use llama_model_load_from_splits
|
|
450
464
|
LLAMA_API struct llama_model * llama_model_load_from_file(
|
|
451
465
|
const char * path_model,
|
|
452
466
|
struct llama_model_params params);
|
|
453
467
|
|
|
454
|
-
// Load
|
|
468
|
+
// Load a model from multiple splits (support custom naming scheme)
|
|
455
469
|
// The paths must be in the correct order
|
|
456
470
|
LLAMA_API struct llama_model * llama_model_load_from_splits(
|
|
457
471
|
const char ** paths,
|
|
@@ -482,13 +496,14 @@ extern "C" {
|
|
|
482
496
|
enum llama_params_fit_status {
|
|
483
497
|
LLAMA_PARAMS_FIT_STATUS_SUCCESS = 0, // found allocations that are projected to fit
|
|
484
498
|
LLAMA_PARAMS_FIT_STATUS_FAILURE = 1, // could not find allocations that are projected to fit
|
|
485
|
-
LLAMA_PARAMS_FIT_STATUS_ERROR = 2, // a hard error
|
|
499
|
+
LLAMA_PARAMS_FIT_STATUS_ERROR = 2, // a hard error occurred, e.g. because no model could be found at the specified path
|
|
486
500
|
};
|
|
487
501
|
|
|
488
502
|
// fits mparams and cparams to free device memory (assumes system memory is unlimited)
|
|
489
503
|
// - returns LLAMA_PARAMS_FIT_STATUS_SUCCESS if the parameters could be successfully modified to fit device memory
|
|
490
504
|
// - this function is NOT thread safe because it modifies the global llama logger state
|
|
491
505
|
// - only parameters that have the same value as in llama_default_model_params are modified
|
|
506
|
+
// with the exception of the context size which is modified if and only if equal to 0
|
|
492
507
|
LLAMA_API enum llama_params_fit_status llama_params_fit(
|
|
493
508
|
const char * path_model,
|
|
494
509
|
struct llama_model_params * mparams,
|
|
@@ -646,7 +661,8 @@ extern "C" {
|
|
|
646
661
|
|
|
647
662
|
// Manually free a LoRA adapter
|
|
648
663
|
// NOTE: loaded adapters will be freed when the associated model is deleted
|
|
649
|
-
LLAMA_API void llama_adapter_lora_free(struct llama_adapter_lora * adapter)
|
|
664
|
+
LLAMA_API DEPRECATED(void llama_adapter_lora_free(struct llama_adapter_lora * adapter),
|
|
665
|
+
"adapters are now freed together with the associated model");
|
|
650
666
|
|
|
651
667
|
// Get the invocation tokens if the current lora is an alora
|
|
652
668
|
LLAMA_API uint64_t llama_adapter_get_alora_n_invocation_tokens(const struct llama_adapter_lora * adapter);
|
|
@@ -654,21 +670,12 @@ extern "C" {
|
|
|
654
670
|
|
|
655
671
|
// The following functions operate on a llama_context, hence the naming: llama_verb_...
|
|
656
672
|
|
|
657
|
-
//
|
|
658
|
-
|
|
659
|
-
LLAMA_API int32_t llama_set_adapter_lora(
|
|
673
|
+
// Set LoRA adapters on the context. Will only modify if the adapters currently in context are different.
|
|
674
|
+
LLAMA_API int32_t llama_set_adapters_lora(
|
|
660
675
|
struct llama_context * ctx,
|
|
661
|
-
struct llama_adapter_lora
|
|
662
|
-
|
|
663
|
-
|
|
664
|
-
// Remove a specific LoRA adapter from given context
|
|
665
|
-
// Return -1 if the adapter is not present in the context
|
|
666
|
-
LLAMA_API int32_t llama_rm_adapter_lora(
|
|
667
|
-
struct llama_context * ctx,
|
|
668
|
-
struct llama_adapter_lora * adapter);
|
|
669
|
-
|
|
670
|
-
// Remove all LoRA adapters from given context
|
|
671
|
-
LLAMA_API void llama_clear_adapter_lora(struct llama_context * ctx);
|
|
676
|
+
struct llama_adapter_lora ** adapters,
|
|
677
|
+
size_t n_adapters,
|
|
678
|
+
float * scales);
|
|
672
679
|
|
|
673
680
|
// Apply a loaded control vector to a llama_context, or if data is NULL, clear
|
|
674
681
|
// the currently loaded vector.
|
|
@@ -676,7 +683,7 @@ extern "C" {
|
|
|
676
683
|
// to an n_embd x n_layers buffer starting from layer 1.
|
|
677
684
|
// il_start and il_end are the layer range the vector should apply to (both inclusive)
|
|
678
685
|
// See llama_control_vector_load in common to load a control vector.
|
|
679
|
-
LLAMA_API int32_t
|
|
686
|
+
LLAMA_API int32_t llama_set_adapter_cvec(
|
|
680
687
|
struct llama_context * ctx,
|
|
681
688
|
const float * data,
|
|
682
689
|
size_t len,
|
|
@@ -979,7 +986,7 @@ extern "C" {
|
|
|
979
986
|
|
|
980
987
|
// Logits for the ith token. For positive indices, Equivalent to:
|
|
981
988
|
// llama_get_logits(ctx) + ctx->output_ids[i]*n_vocab
|
|
982
|
-
// Negative
|
|
989
|
+
// Negative indices can be used to access logits in reverse order, -1 is the last logit.
|
|
983
990
|
// returns NULL for invalid ids.
|
|
984
991
|
LLAMA_API float * llama_get_logits_ith(struct llama_context * ctx, int32_t i);
|
|
985
992
|
|
|
@@ -994,7 +1001,7 @@ extern "C" {
|
|
|
994
1001
|
|
|
995
1002
|
// Get the embeddings for the ith token. For positive indices, Equivalent to:
|
|
996
1003
|
// llama_get_embeddings(ctx) + ctx->output_ids[i]*n_embd
|
|
997
|
-
// Negative
|
|
1004
|
+
// Negative indices can be used to access embeddings in reverse order, -1 is the last embedding.
|
|
998
1005
|
// shape: [n_embd] (1-dimensional)
|
|
999
1006
|
// returns NULL for invalid ids.
|
|
1000
1007
|
LLAMA_API float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i);
|
|
@@ -1014,9 +1021,9 @@ extern "C" {
|
|
|
1014
1021
|
// Returns LLAMA_TOKEN_NULL if no token was sampled.
|
|
1015
1022
|
LLAMA_API llama_token llama_get_sampled_token_ith(struct llama_context * ctx, int32_t i);
|
|
1016
1023
|
|
|
1017
|
-
// Get the backend sampled
|
|
1024
|
+
// Get the backend sampled probabilities for the ith token
|
|
1018
1025
|
// The index matches llama_get_sampled_token_ith().
|
|
1019
|
-
// Returns NULL if no
|
|
1026
|
+
// Returns NULL if no probabilities were generated.
|
|
1020
1027
|
LLAMA_API float * llama_get_sampled_probs_ith (struct llama_context * ctx, int32_t i);
|
|
1021
1028
|
LLAMA_API uint32_t llama_get_sampled_probs_count_ith(struct llama_context * ctx, int32_t i);
|
|
1022
1029
|
|
|
@@ -1148,9 +1155,9 @@ extern "C" {
|
|
|
1148
1155
|
//
|
|
1149
1156
|
|
|
1150
1157
|
/// Apply chat template. Inspired by hf apply_chat_template() on python.
|
|
1151
|
-
///
|
|
1158
|
+
///
|
|
1152
1159
|
/// NOTE: This function does not use a jinja parser. It only supports a pre-defined list of templates. See more: https://github.com/ggml-org/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template
|
|
1153
|
-
/// @param tmpl A Jinja template to use for this chat.
|
|
1160
|
+
/// @param tmpl A Jinja template to use for this chat.
|
|
1154
1161
|
/// @param chat Pointer to a list of multiple llama_chat_message
|
|
1155
1162
|
/// @param n_msg Number of llama_chat_message in this chat
|
|
1156
1163
|
/// @param add_ass Whether to end the prompt with the token(s) that indicate the start of an assistant message.
|
|
@@ -1255,7 +1262,6 @@ extern "C" {
|
|
|
1255
1262
|
// [EXPERIMENTAL]
|
|
1256
1263
|
// attach a sampler to the context
|
|
1257
1264
|
// note: prefer initializing the context with llama_context_params.samplers when possible
|
|
1258
|
-
// note: changing the samplers of a context can cause graph reallocations and degraded performance
|
|
1259
1265
|
LLAMA_API bool llama_set_sampler(struct llama_context * ctx, llama_seq_id seq_id, struct llama_sampler * smpl);
|
|
1260
1266
|
|
|
1261
1267
|
// mirror of llama_sampler_i:
|
|
@@ -1344,7 +1350,7 @@ extern "C" {
|
|
|
1344
1350
|
float tau,
|
|
1345
1351
|
float eta);
|
|
1346
1352
|
|
|
1347
|
-
/// @details
|
|
1353
|
+
/// @details Initializes a GBNF grammar, see grammars/README.md for details.
|
|
1348
1354
|
/// @param vocab The vocabulary that this grammar will be used with.
|
|
1349
1355
|
/// @param grammar_str The production rules for the grammar, encoded as a string. Returns an empty grammar if empty. Returns NULL if parsing of grammar_str fails.
|
|
1350
1356
|
/// @param grammar_root The name of the start symbol for the grammar.
|
|
@@ -1395,6 +1401,33 @@ extern "C" {
|
|
|
1395
1401
|
const char ** seq_breakers,
|
|
1396
1402
|
size_t num_breakers);
|
|
1397
1403
|
|
|
1404
|
+
/// adaptive-p: select tokens near a configurable target probability over time.
|
|
1405
|
+
///
|
|
1406
|
+
/// the adaptive-p sampler transforms the token probability distribution to favor tokens
|
|
1407
|
+
/// that fall near a user-configurable probability target.
|
|
1408
|
+
///
|
|
1409
|
+
/// internally, the sampler maintains an exponential moving average of the *ORIGINAL*
|
|
1410
|
+
/// probabilities of selected tokens at each sampling step. it uses this EMA to compute an
|
|
1411
|
+
/// adapted target probability at each sampling step, thus maintaining the desired target
|
|
1412
|
+
/// probability over time.
|
|
1413
|
+
///
|
|
1414
|
+
/// adaptive-p selects a token ID rather than just mutating candidates, so it must be last
|
|
1415
|
+
/// in the sampler chain (like mirostat, dist, greedy).
|
|
1416
|
+
///
|
|
1417
|
+
/// only mild truncation before this sampler is recommended. we suggest applying min-p
|
|
1418
|
+
/// before adaptive-p as the only other active sampler in the chain.
|
|
1419
|
+
///
|
|
1420
|
+
/// @param target select tokens near this probability (valid range 0.0 to 1.0; negative = disabled)
|
|
1421
|
+
/// @param decay EMA decay for adaptation; history ≈ 1/(1-decay) tokens (valid range 0.0 - 0.99)
|
|
1422
|
+
/// @param seed RNG seed
|
|
1423
|
+
///
|
|
1424
|
+
/// ref: https://github.com/ggml-org/llama.cpp/pull/17927
|
|
1425
|
+
///
|
|
1426
|
+
LLAMA_API struct llama_sampler * llama_sampler_init_adaptive_p(
|
|
1427
|
+
float target,
|
|
1428
|
+
float decay,
|
|
1429
|
+
uint32_t seed);
|
|
1430
|
+
|
|
1398
1431
|
LLAMA_API struct llama_sampler * llama_sampler_init_logit_bias(
|
|
1399
1432
|
int32_t n_vocab,
|
|
1400
1433
|
int32_t n_logit_bias,
|
|
@@ -1448,12 +1481,12 @@ extern "C" {
|
|
|
1448
1481
|
/// @details Build a split GGUF final path for this chunk.
|
|
1449
1482
|
/// llama_split_path(split_path, sizeof(split_path), "/models/ggml-model-q4_0", 2, 4) => split_path = "/models/ggml-model-q4_0-00002-of-00004.gguf"
|
|
1450
1483
|
// Returns the split_path length.
|
|
1451
|
-
LLAMA_API
|
|
1484
|
+
LLAMA_API int32_t llama_split_path(char * split_path, size_t maxlen, const char * path_prefix, int32_t split_no, int32_t split_count);
|
|
1452
1485
|
|
|
1453
1486
|
/// @details Extract the path prefix from the split_path if and only if the split_no and split_count match.
|
|
1454
1487
|
/// llama_split_prefix(split_prefix, 64, "/models/ggml-model-q4_0-00002-of-00004.gguf", 2, 4) => split_prefix = "/models/ggml-model-q4_0"
|
|
1455
1488
|
// Returns the split_prefix length.
|
|
1456
|
-
LLAMA_API
|
|
1489
|
+
LLAMA_API int32_t llama_split_prefix(char * split_prefix, size_t maxlen, const char * split_path, int32_t split_no, int32_t split_count);
|
|
1457
1490
|
|
|
1458
1491
|
// Print system information
|
|
1459
1492
|
LLAMA_API const char * llama_print_system_info(void);
|
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
#include "models.h"
|
|
2
2
|
|
|
3
3
|
llm_build_afmoe::llm_build_afmoe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
|
4
|
-
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
5
|
-
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
4
|
+
const int64_t n_embd_head = hparams.n_embd_head_v();
|
|
5
|
+
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
|
6
6
|
|
|
7
7
|
ggml_tensor * cur;
|
|
8
8
|
ggml_tensor * inpL;
|
|
@@ -127,7 +127,6 @@ llm_build_afmoe::llm_build_afmoe(const llama_model & model, const llm_graph_para
|
|
|
127
127
|
n_expert, n_expert_used,
|
|
128
128
|
LLM_FFN_SILU,
|
|
129
129
|
hparams.expert_weights_norm, // norm_w (route_norm=True)
|
|
130
|
-
hparams.expert_weights_scale, // scale_w
|
|
131
130
|
hparams.expert_weights_scale, // w_scale (route_scale=2.826)
|
|
132
131
|
(llama_expert_gating_func_type) hparams.expert_gating_func,
|
|
133
132
|
il);
|
|
@@ -3,10 +3,10 @@
|
|
|
3
3
|
|
|
4
4
|
|
|
5
5
|
llm_build_apertus::llm_build_apertus(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
|
6
|
-
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
6
|
+
const int64_t n_embd_head = hparams.n_embd_head_v();
|
|
7
7
|
|
|
8
|
-
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
9
|
-
GGML_ASSERT(n_embd_head ==
|
|
8
|
+
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
|
9
|
+
GGML_ASSERT(n_embd_head == n_rot);
|
|
10
10
|
|
|
11
11
|
ggml_tensor * cur;
|
|
12
12
|
ggml_tensor * inpL;
|
|
@@ -2,10 +2,10 @@
|
|
|
2
2
|
|
|
3
3
|
|
|
4
4
|
llm_build_arcee::llm_build_arcee(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
|
5
|
-
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
5
|
+
const int64_t n_embd_head = hparams.n_embd_head_v();
|
|
6
6
|
|
|
7
|
-
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
8
|
-
GGML_ASSERT(n_embd_head ==
|
|
7
|
+
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
|
8
|
+
GGML_ASSERT(n_embd_head == n_rot);
|
|
9
9
|
|
|
10
10
|
ggml_tensor * cur;
|
|
11
11
|
ggml_tensor * inpL;
|
|
@@ -1,11 +1,10 @@
|
|
|
1
1
|
#include "models.h"
|
|
2
2
|
|
|
3
|
-
|
|
4
3
|
llm_build_arctic::llm_build_arctic(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
|
5
|
-
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
4
|
+
const int64_t n_embd_head = hparams.n_embd_head_v();
|
|
6
5
|
|
|
7
|
-
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
8
|
-
GGML_ASSERT(n_embd_head ==
|
|
6
|
+
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
|
7
|
+
GGML_ASSERT(n_embd_head == n_rot);
|
|
9
8
|
|
|
10
9
|
ggml_tensor * cur;
|
|
11
10
|
ggml_tensor * inpL;
|
|
@@ -104,7 +103,7 @@ llm_build_arctic::llm_build_arctic(const llama_model & model, const llm_graph_pa
|
|
|
104
103
|
nullptr,
|
|
105
104
|
n_expert, n_expert_used,
|
|
106
105
|
LLM_FFN_SILU, true,
|
|
107
|
-
|
|
106
|
+
hparams.expert_weights_scale,
|
|
108
107
|
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
|
|
109
108
|
il);
|
|
110
109
|
cb(cur, "ffn_moe_out", il);
|
|
@@ -2,10 +2,10 @@
|
|
|
2
2
|
|
|
3
3
|
|
|
4
4
|
llm_build_baichuan::llm_build_baichuan(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
|
5
|
-
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
5
|
+
const int64_t n_embd_head = hparams.n_embd_head_v();
|
|
6
6
|
|
|
7
|
-
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
8
|
-
GGML_ASSERT(n_embd_head ==
|
|
7
|
+
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
|
8
|
+
GGML_ASSERT(n_embd_head == n_rot);
|
|
9
9
|
|
|
10
10
|
ggml_tensor * cur;
|
|
11
11
|
ggml_tensor * inpL;
|
|
@@ -56,6 +56,7 @@ llm_build_baichuan::llm_build_baichuan(const llama_model & model, const llm_grap
|
|
|
56
56
|
);
|
|
57
57
|
break;
|
|
58
58
|
case LLM_TYPE_13B:
|
|
59
|
+
case LLM_TYPE_UNKNOWN:
|
|
59
60
|
break;
|
|
60
61
|
default:
|
|
61
62
|
GGML_ABORT("fatal error");
|
|
@@ -1,6 +1,5 @@
|
|
|
1
1
|
#include "models.h"
|
|
2
2
|
|
|
3
|
-
|
|
4
3
|
llm_build_bailingmoe::llm_build_bailingmoe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
|
5
4
|
ggml_tensor * cur;
|
|
6
5
|
ggml_tensor * inpL;
|
|
@@ -97,7 +96,7 @@ llm_build_bailingmoe::llm_build_bailingmoe(const llama_model & model, const llm_
|
|
|
97
96
|
nullptr,
|
|
98
97
|
n_expert, n_expert_used,
|
|
99
98
|
LLM_FFN_SILU, hparams.expert_weights_norm,
|
|
100
|
-
|
|
99
|
+
hparams.expert_weights_scale,
|
|
101
100
|
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
|
|
102
101
|
il);
|
|
103
102
|
cb(moe_out, "ffn_moe_out", il);
|
|
@@ -1,13 +1,11 @@
|
|
|
1
1
|
#include "models.h"
|
|
2
2
|
|
|
3
|
-
|
|
4
|
-
|
|
5
3
|
llm_build_bailingmoe2::llm_build_bailingmoe2(const llama_model & model, const llm_graph_params & params) :
|
|
6
4
|
llm_graph_context(params) {
|
|
7
|
-
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
5
|
+
const int64_t n_embd_head = hparams.n_embd_head_v();
|
|
8
6
|
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
|
9
7
|
|
|
10
|
-
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
8
|
+
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
|
11
9
|
|
|
12
10
|
ggml_tensor * cur;
|
|
13
11
|
ggml_tensor * inpL;
|
|
@@ -90,7 +88,7 @@ llm_build_bailingmoe2::llm_build_bailingmoe2(const llama_model & model, const ll
|
|
|
90
88
|
model.layers[il].ffn_exp_probs_b,
|
|
91
89
|
n_expert, n_expert_used,
|
|
92
90
|
LLM_FFN_SILU, hparams.expert_weights_norm,
|
|
93
|
-
|
|
91
|
+
hparams.expert_weights_scale,
|
|
94
92
|
(llama_expert_gating_func_type) hparams.expert_gating_func,
|
|
95
93
|
il);
|
|
96
94
|
cb(moe_out, "ffn_moe_out", il);
|
|
@@ -1,12 +1,10 @@
|
|
|
1
1
|
#include "models.h"
|
|
2
2
|
|
|
3
|
-
|
|
4
|
-
|
|
5
3
|
llm_build_bert::llm_build_bert(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
|
6
|
-
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
4
|
+
const int64_t n_embd_head = hparams.n_embd_head_v();
|
|
7
5
|
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
|
8
6
|
|
|
9
|
-
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
7
|
+
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
|
10
8
|
|
|
11
9
|
ggml_tensor * cur;
|
|
12
10
|
ggml_tensor * inpL;
|
|
@@ -129,9 +127,17 @@ llm_build_bert::llm_build_bert(const llama_model & model, const llm_graph_params
|
|
|
129
127
|
// feed-forward network
|
|
130
128
|
if (hparams.moe_every_n_layers > 0 && il % hparams.moe_every_n_layers == 1) {
|
|
131
129
|
// MoE branch
|
|
132
|
-
cur = build_moe_ffn(cur,
|
|
133
|
-
|
|
134
|
-
|
|
130
|
+
cur = build_moe_ffn(cur,
|
|
131
|
+
model.layers[il].ffn_gate_inp,
|
|
132
|
+
model.layers[il].ffn_up_exps,
|
|
133
|
+
nullptr,
|
|
134
|
+
model.layers[il].ffn_down_exps,
|
|
135
|
+
nullptr,
|
|
136
|
+
hparams.n_expert, hparams.n_expert_used,
|
|
137
|
+
LLM_FFN_GELU, false,
|
|
138
|
+
hparams.expert_weights_scale,
|
|
139
|
+
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
|
|
140
|
+
il);
|
|
135
141
|
cb(cur, "ffn_moe_out", il);
|
|
136
142
|
} else if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_NOMIC_BERT_MOE ||
|
|
137
143
|
model.arch == LLM_ARCH_JINA_BERT_V3) {
|
|
@@ -2,9 +2,9 @@
|
|
|
2
2
|
|
|
3
3
|
|
|
4
4
|
llm_build_bitnet::llm_build_bitnet(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
|
5
|
-
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
5
|
+
const int64_t n_embd_head = hparams.n_embd_head_v();
|
|
6
6
|
|
|
7
|
-
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
7
|
+
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
|
8
8
|
|
|
9
9
|
ggml_tensor * cur;
|
|
10
10
|
ggml_tensor * inpL;
|
|
@@ -29,10 +29,7 @@ llm_build_bitnet::llm_build_bitnet(const llama_model & model, const llm_graph_pa
|
|
|
29
29
|
// self-attention
|
|
30
30
|
{
|
|
31
31
|
// compute Q and K and RoPE them
|
|
32
|
-
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
|
33
|
-
if (model.layers[il].wq_scale) {
|
|
34
|
-
Qcur = ggml_mul(ctx0, Qcur, model.layers[il].wq_scale);
|
|
35
|
-
}
|
|
32
|
+
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur, model.layers[il].wq_s);
|
|
36
33
|
cb(Qcur, "Qcur", il);
|
|
37
34
|
if (model.layers[il].bq) {
|
|
38
35
|
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
|
|
@@ -40,10 +37,7 @@ llm_build_bitnet::llm_build_bitnet(const llama_model & model, const llm_graph_pa
|
|
|
40
37
|
}
|
|
41
38
|
|
|
42
39
|
// B1.K
|
|
43
|
-
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
|
|
44
|
-
if (model.layers[il].wk_scale) {
|
|
45
|
-
Kcur = ggml_mul(ctx0, Kcur, model.layers[il].wk_scale);
|
|
46
|
-
}
|
|
40
|
+
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur, model.layers[il].wk_s);
|
|
47
41
|
cb(Kcur, "Kcur", il);
|
|
48
42
|
if (model.layers[il].bk) {
|
|
49
43
|
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
|
|
@@ -51,10 +45,7 @@ llm_build_bitnet::llm_build_bitnet(const llama_model & model, const llm_graph_pa
|
|
|
51
45
|
}
|
|
52
46
|
|
|
53
47
|
// B1.V
|
|
54
|
-
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
|
|
55
|
-
if (model.layers[il].wv_scale) {
|
|
56
|
-
Vcur = ggml_mul(ctx0, Vcur, model.layers[il].wv_scale);
|
|
57
|
-
}
|
|
48
|
+
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur, model.layers[il].wv_s);
|
|
58
49
|
cb(Vcur, "Vcur", il);
|
|
59
50
|
if (model.layers[il].bv) {
|
|
60
51
|
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
|
|
@@ -90,10 +81,7 @@ llm_build_bitnet::llm_build_bitnet(const llama_model & model, const llm_graph_pa
|
|
|
90
81
|
LLM_NORM_RMS, il);
|
|
91
82
|
cb(cur, "attn_sub_norm", il);
|
|
92
83
|
|
|
93
|
-
cur = build_lora_mm(model.layers[il].wo, cur);
|
|
94
|
-
if (model.layers[il].wo_scale) {
|
|
95
|
-
cur = ggml_mul(ctx0, cur, model.layers[il].wo_scale);
|
|
96
|
-
}
|
|
84
|
+
cur = build_lora_mm(model.layers[il].wo, cur, model.layers[il].wo_s);
|
|
97
85
|
if (model.layers[il].bo) {
|
|
98
86
|
cur = ggml_add(ctx0, cur, model.layers[il].bo);
|
|
99
87
|
}
|
|
@@ -115,8 +103,8 @@ llm_build_bitnet::llm_build_bitnet(const llama_model & model, const llm_graph_pa
|
|
|
115
103
|
cb(cur, "ffn_norm", il);
|
|
116
104
|
|
|
117
105
|
cur = build_ffn(cur,
|
|
118
|
-
model.layers[il].ffn_up, NULL, model.layers[il].ffn_up_scale,
|
|
119
|
-
model.layers[il].ffn_gate, NULL, model.layers[il].ffn_gate_scale,
|
|
106
|
+
model.layers[il].ffn_up, NULL, model.layers[il].ffn_up_s,
|
|
107
|
+
model.layers[il].ffn_gate, NULL, model.layers[il].ffn_gate_s,
|
|
120
108
|
NULL, NULL, NULL,
|
|
121
109
|
NULL,
|
|
122
110
|
LLM_FFN_SILU, LLM_FFN_PAR, il);
|
|
@@ -127,10 +115,7 @@ llm_build_bitnet::llm_build_bitnet(const llama_model & model, const llm_graph_pa
|
|
|
127
115
|
LLM_NORM_RMS, il);
|
|
128
116
|
cb(cur, "ffn_sub_norm", il);
|
|
129
117
|
|
|
130
|
-
cur = build_lora_mm(model.layers[il].ffn_down, cur);
|
|
131
|
-
if (model.layers[il].ffn_down_scale) {
|
|
132
|
-
cur = ggml_mul(ctx0, cur, model.layers[il].ffn_down_scale);
|
|
133
|
-
}
|
|
118
|
+
cur = build_lora_mm(model.layers[il].ffn_down, cur, model.layers[il].ffn_down_s);
|
|
134
119
|
cb(cur, "ffn_down", il);
|
|
135
120
|
|
|
136
121
|
cur = ggml_add(ctx0, cur, ffn_inp);
|
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
#include "models.h"
|
|
2
2
|
|
|
3
3
|
llm_build_bloom::llm_build_bloom(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
|
4
|
-
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
4
|
+
const int64_t n_embd_head = hparams.n_embd_head_v();
|
|
5
5
|
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
|
6
6
|
|
|
7
|
-
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
7
|
+
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
|
8
8
|
|
|
9
9
|
ggml_tensor * cur;
|
|
10
10
|
ggml_tensor * inpL;
|
|
@@ -3,10 +3,10 @@
|
|
|
3
3
|
#include <float.h>
|
|
4
4
|
|
|
5
5
|
llm_build_chameleon::llm_build_chameleon(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
|
6
|
-
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
6
|
+
const int64_t n_embd_head = hparams.n_embd_head_v();
|
|
7
7
|
|
|
8
|
-
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
9
|
-
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
|
8
|
+
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
|
9
|
+
GGML_ASSERT(n_embd_head == n_rot);
|
|
10
10
|
|
|
11
11
|
ggml_tensor * cur;
|
|
12
12
|
ggml_tensor * inpL;
|
|
@@ -2,10 +2,10 @@
|
|
|
2
2
|
|
|
3
3
|
|
|
4
4
|
llm_build_chatglm::llm_build_chatglm(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
|
5
|
-
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
5
|
+
const int64_t n_embd_head = hparams.n_embd_head_v();
|
|
6
6
|
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
|
7
7
|
|
|
8
|
-
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
8
|
+
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
|
9
9
|
|
|
10
10
|
ggml_tensor * cur;
|
|
11
11
|
ggml_tensor * inpL;
|
|
@@ -1,11 +1,11 @@
|
|
|
1
1
|
#include "models.h"
|
|
2
2
|
|
|
3
3
|
llm_build_codeshell::llm_build_codeshell(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
|
4
|
-
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
4
|
+
const int64_t n_embd_head = hparams.n_embd_head_v();
|
|
5
5
|
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
|
6
6
|
|
|
7
|
-
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
8
|
-
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
|
7
|
+
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
|
8
|
+
GGML_ASSERT(n_embd_head == n_rot);
|
|
9
9
|
|
|
10
10
|
ggml_tensor * cur;
|
|
11
11
|
ggml_tensor * inpL;
|
|
@@ -2,11 +2,11 @@
|
|
|
2
2
|
|
|
3
3
|
llm_build_cogvlm::llm_build_cogvlm(const llama_model & model, const llm_graph_params & params) :
|
|
4
4
|
llm_graph_context(params) {
|
|
5
|
-
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
5
|
+
const int64_t n_embd_head = hparams.n_embd_head_v();
|
|
6
6
|
const float kq_scale = 1.0f / sqrtf(float(n_embd_head));
|
|
7
7
|
|
|
8
|
-
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
9
|
-
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
|
8
|
+
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
|
9
|
+
GGML_ASSERT(n_embd_head == n_rot);
|
|
10
10
|
|
|
11
11
|
ggml_tensor * inpL;
|
|
12
12
|
ggml_tensor * cur;
|
|
@@ -1,9 +1,9 @@
|
|
|
1
1
|
#include "models.h"
|
|
2
2
|
|
|
3
3
|
llm_build_cohere2_iswa::llm_build_cohere2_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
|
4
|
-
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
4
|
+
const int64_t n_embd_head = hparams.n_embd_head_v();
|
|
5
5
|
|
|
6
|
-
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
6
|
+
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
|
7
7
|
|
|
8
8
|
const float f_logit_scale = hparams.f_logit_scale;
|
|
9
9
|
|
|
@@ -4,9 +4,9 @@
|
|
|
4
4
|
|
|
5
5
|
llm_build_command_r::llm_build_command_r(const llama_model & model, const llm_graph_params & params) :
|
|
6
6
|
llm_graph_context(params) {
|
|
7
|
-
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
7
|
+
const int64_t n_embd_head = hparams.n_embd_head_v();
|
|
8
8
|
|
|
9
|
-
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
9
|
+
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
|
10
10
|
|
|
11
11
|
const float f_logit_scale = hparams.f_logit_scale;
|
|
12
12
|
|
|
@@ -1,12 +1,11 @@
|
|
|
1
1
|
#include "models.h"
|
|
2
2
|
|
|
3
|
-
|
|
4
3
|
llm_build_dbrx::llm_build_dbrx(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
|
5
|
-
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
4
|
+
const int64_t n_embd_head = hparams.n_embd_head_v();
|
|
6
5
|
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
|
7
6
|
|
|
8
|
-
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
9
|
-
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
|
7
|
+
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
|
8
|
+
GGML_ASSERT(n_embd_head == n_rot);
|
|
10
9
|
|
|
11
10
|
ggml_tensor * cur;
|
|
12
11
|
ggml_tensor * inpL;
|
|
@@ -89,7 +88,7 @@ llm_build_dbrx::llm_build_dbrx(const llama_model & model, const llm_graph_params
|
|
|
89
88
|
nullptr,
|
|
90
89
|
n_expert, n_expert_used,
|
|
91
90
|
LLM_FFN_SILU, true,
|
|
92
|
-
|
|
91
|
+
hparams.expert_weights_scale,
|
|
93
92
|
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
|
|
94
93
|
il);
|
|
95
94
|
cb(cur, "ffn_moe_out", il);
|
|
@@ -3,10 +3,10 @@
|
|
|
3
3
|
|
|
4
4
|
|
|
5
5
|
llm_build_deci::llm_build_deci(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
|
6
|
-
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
6
|
+
const int64_t n_embd_head = hparams.n_embd_head_v();
|
|
7
7
|
|
|
8
|
-
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
9
|
-
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
|
8
|
+
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
|
9
|
+
GGML_ASSERT(n_embd_head == n_rot);
|
|
10
10
|
|
|
11
11
|
ggml_tensor * cur;
|
|
12
12
|
ggml_tensor * inpL;
|