whispercpp 1.3.4 → 1.3.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/LICENSE +1 -1
- data/README.md +158 -44
- data/ext/extconf.rb +3 -2
- data/ext/ruby_whisper.c +34 -6
- data/ext/ruby_whisper.h +67 -0
- data/ext/ruby_whisper_context.c +236 -144
- data/ext/ruby_whisper_context_params.c +163 -0
- data/ext/ruby_whisper_model.c +12 -13
- data/ext/ruby_whisper_params.c +47 -24
- data/ext/ruby_whisper_segment.c +84 -20
- data/ext/ruby_whisper_token.c +371 -0
- data/ext/ruby_whisper_transcribe.cpp +5 -2
- data/ext/ruby_whisper_vad_context.c +122 -0
- data/ext/ruby_whisper_vad_context_detect.cpp +51 -0
- data/ext/ruby_whisper_vad_params.c +0 -1
- data/ext/ruby_whisper_vad_segment.c +138 -0
- data/ext/ruby_whisper_vad_segments.c +105 -0
- data/ext/sources/CMakeLists.txt +4 -1
- data/ext/sources/bindings/javascript/package.json +1 -1
- data/ext/sources/cmake/arm64-apple-clang.cmake +16 -0
- data/ext/sources/cmake/arm64-windows-llvm.cmake +16 -0
- data/ext/sources/cmake/riscv64-spacemit-linux-gnu-gcc.cmake +29 -0
- data/ext/sources/cmake/whisper-config.cmake.in +5 -40
- data/ext/sources/cmake/x64-windows-llvm.cmake +5 -0
- data/ext/sources/examples/addon.node/vad-example.js +2 -2
- data/ext/sources/examples/bench/bench.cpp +23 -18
- data/ext/sources/examples/cli/cli.cpp +129 -112
- data/ext/sources/examples/common-ggml.cpp +2 -0
- data/ext/sources/examples/lsp/CMakeLists.txt +2 -1
- data/ext/sources/examples/miniaudio.h +4507 -2131
- data/ext/sources/examples/quantize/CMakeLists.txt +2 -1
- data/ext/sources/examples/server/server.cpp +28 -15
- data/ext/sources/examples/talk-llama/CMakeLists.txt +8 -3
- data/ext/sources/examples/talk-llama/llama-adapter.cpp +5 -2
- data/ext/sources/examples/talk-llama/llama-adapter.h +7 -0
- data/ext/sources/examples/talk-llama/llama-arch.cpp +2378 -1988
- data/ext/sources/examples/talk-llama/llama-arch.h +109 -2
- data/ext/sources/examples/talk-llama/llama-batch.cpp +78 -34
- data/ext/sources/examples/talk-llama/llama-batch.h +17 -4
- data/ext/sources/examples/talk-llama/llama-chat.cpp +100 -4
- data/ext/sources/examples/talk-llama/llama-chat.h +5 -0
- data/ext/sources/examples/talk-llama/llama-context.cpp +1088 -403
- data/ext/sources/examples/talk-llama/llama-context.h +70 -23
- data/ext/sources/examples/talk-llama/llama-cparams.h +6 -0
- data/ext/sources/examples/talk-llama/llama-ext.h +12 -0
- data/ext/sources/examples/talk-llama/llama-grammar.cpp +295 -60
- data/ext/sources/examples/talk-llama/llama-grammar.h +22 -1
- data/ext/sources/examples/talk-llama/llama-graph.cpp +925 -155
- data/ext/sources/examples/talk-llama/llama-graph.h +234 -23
- data/ext/sources/examples/talk-llama/llama-hparams.cpp +79 -38
- data/ext/sources/examples/talk-llama/llama-hparams.h +118 -18
- data/ext/sources/examples/talk-llama/llama-impl.cpp +11 -7
- data/ext/sources/examples/talk-llama/llama-impl.h +14 -2
- data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.cpp +8 -4
- data/ext/sources/examples/talk-llama/llama-kv-cache.cpp +405 -140
- data/ext/sources/examples/talk-llama/llama-kv-cache.h +24 -10
- data/ext/sources/examples/talk-llama/llama-kv-cells.h +44 -2
- data/ext/sources/examples/talk-llama/llama-memory-hybrid-iswa.cpp +275 -0
- data/ext/sources/examples/talk-llama/llama-memory-hybrid-iswa.h +140 -0
- data/ext/sources/examples/talk-llama/llama-memory-hybrid.cpp +12 -10
- data/ext/sources/examples/talk-llama/llama-memory-recurrent.cpp +42 -31
- data/ext/sources/examples/talk-llama/llama-memory-recurrent.h +2 -2
- data/ext/sources/examples/talk-llama/llama-mmap.cpp +197 -45
- data/ext/sources/examples/talk-llama/llama-mmap.h +8 -3
- data/ext/sources/examples/talk-llama/llama-model-loader.cpp +606 -116
- data/ext/sources/examples/talk-llama/llama-model-loader.h +41 -5
- data/ext/sources/examples/talk-llama/llama-model-saver.cpp +61 -44
- data/ext/sources/examples/talk-llama/llama-model-saver.h +5 -2
- data/ext/sources/examples/talk-llama/llama-model.cpp +2756 -13643
- data/ext/sources/examples/talk-llama/llama-model.h +112 -18
- data/ext/sources/examples/talk-llama/llama-quant.cpp +582 -365
- data/ext/sources/examples/talk-llama/{llama-sampling.cpp → llama-sampler.cpp} +1409 -199
- data/ext/sources/examples/talk-llama/llama-sampler.h +42 -0
- data/ext/sources/examples/talk-llama/llama-vocab.cpp +248 -82
- data/ext/sources/examples/talk-llama/llama-vocab.h +50 -40
- data/ext/sources/examples/talk-llama/llama.cpp +802 -21
- data/ext/sources/examples/talk-llama/llama.h +210 -39
- data/ext/sources/examples/talk-llama/models/afmoe.cpp +190 -0
- data/ext/sources/examples/talk-llama/models/apertus.cpp +125 -0
- data/ext/sources/examples/talk-llama/models/arcee.cpp +135 -0
- data/ext/sources/examples/talk-llama/models/arctic.cpp +137 -0
- data/ext/sources/examples/talk-llama/models/arwkv7.cpp +86 -0
- data/ext/sources/examples/talk-llama/models/baichuan.cpp +123 -0
- data/ext/sources/examples/talk-llama/models/bailingmoe.cpp +143 -0
- data/ext/sources/examples/talk-llama/models/bailingmoe2.cpp +133 -0
- data/ext/sources/examples/talk-llama/models/bert.cpp +184 -0
- data/ext/sources/examples/talk-llama/models/bitnet.cpp +145 -0
- data/ext/sources/examples/talk-llama/models/bloom.cpp +101 -0
- data/ext/sources/examples/talk-llama/models/chameleon.cpp +178 -0
- data/ext/sources/examples/talk-llama/models/chatglm.cpp +132 -0
- data/ext/sources/examples/talk-llama/models/codeshell.cpp +111 -0
- data/ext/sources/examples/talk-llama/models/cogvlm.cpp +102 -0
- data/ext/sources/examples/talk-llama/models/cohere2-iswa.cpp +134 -0
- data/ext/sources/examples/talk-llama/models/command-r.cpp +122 -0
- data/ext/sources/examples/talk-llama/models/dbrx.cpp +122 -0
- data/ext/sources/examples/talk-llama/models/deci.cpp +135 -0
- data/ext/sources/examples/talk-llama/models/deepseek.cpp +142 -0
- data/ext/sources/examples/talk-llama/models/deepseek2.cpp +262 -0
- data/ext/sources/examples/talk-llama/models/delta-net-base.cpp +445 -0
- data/ext/sources/examples/talk-llama/models/dots1.cpp +132 -0
- data/ext/sources/examples/talk-llama/models/dream.cpp +105 -0
- data/ext/sources/examples/talk-llama/models/ernie4-5-moe.cpp +148 -0
- data/ext/sources/examples/talk-llama/models/ernie4-5.cpp +110 -0
- data/ext/sources/examples/talk-llama/models/eurobert.cpp +97 -0
- data/ext/sources/examples/talk-llama/models/exaone-moe.cpp +145 -0
- data/ext/sources/examples/talk-llama/models/exaone.cpp +114 -0
- data/ext/sources/examples/talk-llama/models/exaone4.cpp +123 -0
- data/ext/sources/examples/talk-llama/models/falcon-h1.cpp +111 -0
- data/ext/sources/examples/talk-llama/models/falcon.cpp +120 -0
- data/ext/sources/examples/talk-llama/models/gemma-embedding.cpp +116 -0
- data/ext/sources/examples/talk-llama/models/gemma.cpp +112 -0
- data/ext/sources/examples/talk-llama/models/gemma2-iswa.cpp +128 -0
- data/ext/sources/examples/talk-llama/models/gemma3.cpp +155 -0
- data/ext/sources/examples/talk-llama/models/gemma3n-iswa.cpp +384 -0
- data/ext/sources/examples/talk-llama/models/glm4-moe.cpp +170 -0
- data/ext/sources/examples/talk-llama/models/glm4.cpp +157 -0
- data/ext/sources/examples/talk-llama/models/gpt2.cpp +105 -0
- data/ext/sources/examples/talk-llama/models/gptneox.cpp +144 -0
- data/ext/sources/examples/talk-llama/models/granite-hybrid.cpp +195 -0
- data/ext/sources/examples/talk-llama/models/granite.cpp +210 -0
- data/ext/sources/examples/talk-llama/models/grok.cpp +159 -0
- data/ext/sources/examples/talk-llama/models/grovemoe.cpp +139 -0
- data/ext/sources/examples/talk-llama/models/hunyuan-dense.cpp +132 -0
- data/ext/sources/examples/talk-llama/models/hunyuan-moe.cpp +153 -0
- data/ext/sources/examples/talk-llama/models/internlm2.cpp +120 -0
- data/ext/sources/examples/talk-llama/models/jais.cpp +86 -0
- data/ext/sources/examples/talk-llama/models/jais2.cpp +123 -0
- data/ext/sources/examples/talk-llama/models/jamba.cpp +106 -0
- data/ext/sources/examples/talk-llama/models/kimi-linear.cpp +381 -0
- data/ext/sources/examples/talk-llama/models/lfm2.cpp +196 -0
- data/ext/sources/examples/talk-llama/models/llada-moe.cpp +122 -0
- data/ext/sources/examples/talk-llama/models/llada.cpp +99 -0
- data/ext/sources/examples/talk-llama/models/llama-iswa.cpp +178 -0
- data/ext/sources/examples/talk-llama/models/llama.cpp +175 -0
- data/ext/sources/examples/talk-llama/models/maincoder.cpp +117 -0
- data/ext/sources/examples/talk-llama/models/mamba-base.cpp +289 -0
- data/ext/sources/examples/talk-llama/models/mamba.cpp +54 -0
- data/ext/sources/examples/talk-llama/models/mimo2-iswa.cpp +129 -0
- data/ext/sources/examples/talk-llama/models/minicpm3.cpp +200 -0
- data/ext/sources/examples/talk-llama/models/minimax-m2.cpp +123 -0
- data/ext/sources/examples/talk-llama/models/mistral3.cpp +160 -0
- data/ext/sources/examples/talk-llama/models/models.h +704 -0
- data/ext/sources/examples/talk-llama/models/modern-bert.cpp +109 -0
- data/ext/sources/examples/talk-llama/models/mpt.cpp +126 -0
- data/ext/sources/examples/talk-llama/models/nemotron-h.cpp +162 -0
- data/ext/sources/examples/talk-llama/models/nemotron.cpp +122 -0
- data/ext/sources/examples/talk-llama/models/neo-bert.cpp +104 -0
- data/ext/sources/examples/talk-llama/models/olmo.cpp +121 -0
- data/ext/sources/examples/talk-llama/models/olmo2.cpp +150 -0
- data/ext/sources/examples/talk-llama/models/olmoe.cpp +124 -0
- data/ext/sources/examples/talk-llama/models/openai-moe-iswa.cpp +127 -0
- data/ext/sources/examples/talk-llama/models/openelm.cpp +124 -0
- data/ext/sources/examples/talk-llama/models/orion.cpp +123 -0
- data/ext/sources/examples/talk-llama/models/paddleocr.cpp +122 -0
- data/ext/sources/examples/talk-llama/models/pangu-embedded.cpp +121 -0
- data/ext/sources/examples/talk-llama/models/phi2.cpp +121 -0
- data/ext/sources/examples/talk-llama/models/phi3.cpp +152 -0
- data/ext/sources/examples/talk-llama/models/plamo.cpp +110 -0
- data/ext/sources/examples/talk-llama/models/plamo2.cpp +320 -0
- data/ext/sources/examples/talk-llama/models/plamo3.cpp +128 -0
- data/ext/sources/examples/talk-llama/models/plm.cpp +169 -0
- data/ext/sources/examples/talk-llama/models/qwen.cpp +108 -0
- data/ext/sources/examples/talk-llama/models/qwen2.cpp +126 -0
- data/ext/sources/examples/talk-llama/models/qwen2moe.cpp +151 -0
- data/ext/sources/examples/talk-llama/models/qwen2vl.cpp +117 -0
- data/ext/sources/examples/talk-llama/models/qwen3.cpp +120 -0
- data/ext/sources/examples/talk-llama/models/qwen35.cpp +381 -0
- data/ext/sources/examples/talk-llama/models/qwen35moe.cpp +422 -0
- data/ext/sources/examples/talk-llama/models/qwen3moe.cpp +131 -0
- data/ext/sources/examples/talk-llama/models/qwen3next.cpp +525 -0
- data/ext/sources/examples/talk-llama/models/qwen3vl-moe.cpp +140 -0
- data/ext/sources/examples/talk-llama/models/qwen3vl.cpp +132 -0
- data/ext/sources/examples/talk-llama/models/refact.cpp +94 -0
- data/ext/sources/examples/talk-llama/models/rnd1.cpp +126 -0
- data/ext/sources/examples/talk-llama/models/rwkv6-base.cpp +164 -0
- data/ext/sources/examples/talk-llama/models/rwkv6.cpp +94 -0
- data/ext/sources/examples/talk-llama/models/rwkv6qwen2.cpp +86 -0
- data/ext/sources/examples/talk-llama/models/rwkv7-base.cpp +137 -0
- data/ext/sources/examples/talk-llama/models/rwkv7.cpp +90 -0
- data/ext/sources/examples/talk-llama/models/seed-oss.cpp +124 -0
- data/ext/sources/examples/talk-llama/models/smallthinker.cpp +126 -0
- data/ext/sources/examples/talk-llama/models/smollm3.cpp +128 -0
- data/ext/sources/examples/talk-llama/models/stablelm.cpp +146 -0
- data/ext/sources/examples/talk-llama/models/starcoder.cpp +100 -0
- data/ext/sources/examples/talk-llama/models/starcoder2.cpp +121 -0
- data/ext/sources/examples/talk-llama/models/step35-iswa.cpp +165 -0
- data/ext/sources/examples/talk-llama/models/t5-dec.cpp +166 -0
- data/ext/sources/examples/talk-llama/models/t5-enc.cpp +96 -0
- data/ext/sources/examples/talk-llama/models/wavtokenizer-dec.cpp +149 -0
- data/ext/sources/examples/talk-llama/models/xverse.cpp +108 -0
- data/ext/sources/examples/talk-llama/unicode.cpp +121 -79
- data/ext/sources/examples/vad-speech-segments/CMakeLists.txt +1 -1
- data/ext/sources/examples/whisper.wasm/index-tmpl.html +1 -1
- data/ext/sources/ggml/CMakeLists.txt +90 -56
- data/ext/sources/ggml/include/ggml-alloc.h +9 -0
- data/ext/sources/ggml/include/ggml-backend.h +5 -2
- data/ext/sources/ggml/include/ggml-cann.h +1 -1
- data/ext/sources/ggml/include/ggml-cpu.h +6 -0
- data/ext/sources/ggml/include/ggml-hexagon.h +19 -0
- data/ext/sources/ggml/include/ggml-openvino.h +37 -0
- data/ext/sources/ggml/include/ggml-opt.h +1 -1
- data/ext/sources/ggml/include/ggml-rpc.h +14 -12
- data/ext/sources/ggml/include/ggml-virtgpu.h +14 -0
- data/ext/sources/ggml/include/ggml-zendnn.h +22 -0
- data/ext/sources/ggml/include/ggml.h +246 -21
- data/ext/sources/ggml/src/CMakeLists.txt +85 -11
- data/ext/sources/ggml/src/ggml-alloc.c +128 -50
- data/ext/sources/ggml/src/ggml-backend-dl.cpp +48 -0
- data/ext/sources/ggml/src/ggml-backend-dl.h +45 -0
- data/ext/sources/ggml/src/ggml-backend-impl.h +1 -4
- data/ext/sources/ggml/src/ggml-backend-reg.cpp +54 -88
- data/ext/sources/ggml/src/ggml-backend.cpp +76 -23
- data/ext/sources/ggml/src/ggml-blas/CMakeLists.txt +18 -4
- data/ext/sources/ggml/src/ggml-blas/ggml-blas.cpp +11 -11
- data/ext/sources/ggml/src/ggml-cann/acl_tensor.cpp +58 -46
- data/ext/sources/ggml/src/ggml-cann/acl_tensor.h +139 -48
- data/ext/sources/ggml/src/ggml-cann/aclnn_ops.cpp +2427 -1785
- data/ext/sources/ggml/src/ggml-cann/aclnn_ops.h +238 -362
- data/ext/sources/ggml/src/ggml-cann/common.h +285 -211
- data/ext/sources/ggml/src/ggml-cann/ggml-cann.cpp +663 -831
- data/ext/sources/ggml/src/ggml-common.h +11 -0
- data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +170 -95
- data/ext/sources/ggml/src/ggml-cpu/amx/amx.cpp +42 -18
- data/ext/sources/ggml/src/ggml-cpu/amx/common.h +34 -10
- data/ext/sources/ggml/src/ggml-cpu/amx/mmq.cpp +85 -85
- data/ext/sources/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +4 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/arm/quants.c +513 -27
- data/ext/sources/ggml/src/ggml-cpu/arch/arm/repack.cpp +4192 -992
- data/ext/sources/ggml/src/ggml-cpu/arch/loongarch/quants.c +4 -5
- data/ext/sources/ggml/src/ggml-cpu/arch/riscv/cpu-feats.cpp +38 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/riscv/quants.c +1761 -49
- data/ext/sources/ggml/src/ggml-cpu/arch/riscv/repack.cpp +1391 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/s390/cpu-feats.cpp +50 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/s390/quants.c +8 -10
- data/ext/sources/ggml/src/ggml-cpu/arch/x86/quants.c +9 -9
- data/ext/sources/ggml/src/ggml-cpu/arch/x86/repack.cpp +124 -24
- data/ext/sources/ggml/src/ggml-cpu/arch-fallback.h +157 -28
- data/ext/sources/ggml/src/ggml-cpu/binary-ops.cpp +2 -6
- data/ext/sources/ggml/src/ggml-cpu/common.h +8 -0
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-impl.h +8 -3
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +251 -80
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.cpp +19 -0
- data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.cpp +587 -119
- data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.h +33 -44
- data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +1093 -194
- data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1284 -203
- data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.h +6 -0
- data/ext/sources/ggml/src/ggml-cpu/ops.cpp +1519 -527
- data/ext/sources/ggml/src/ggml-cpu/ops.h +6 -4
- data/ext/sources/ggml/src/ggml-cpu/quants.c +40 -0
- data/ext/sources/ggml/src/ggml-cpu/quants.h +3 -0
- data/ext/sources/ggml/src/ggml-cpu/repack.cpp +3632 -781
- data/ext/sources/ggml/src/ggml-cpu/repack.h +129 -4
- data/ext/sources/ggml/src/ggml-cpu/simd-gemm.h +136 -0
- data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +152 -46
- data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.cpp +3 -2
- data/ext/sources/ggml/src/ggml-cpu/unary-ops.cpp +152 -1
- data/ext/sources/ggml/src/ggml-cpu/unary-ops.h +7 -0
- data/ext/sources/ggml/src/ggml-cpu/vec.cpp +140 -0
- data/ext/sources/ggml/src/ggml-cpu/vec.h +261 -146
- data/ext/sources/ggml/src/ggml-cuda/CMakeLists.txt +72 -1
- data/ext/sources/ggml/src/ggml-cuda/argmax.cu +2 -2
- data/ext/sources/ggml/src/ggml-cuda/argsort.cu +132 -6
- data/ext/sources/ggml/src/ggml-cuda/argsort.cuh +16 -0
- data/ext/sources/ggml/src/ggml-cuda/binbcast.cu +33 -31
- data/ext/sources/ggml/src/ggml-cuda/common.cuh +474 -85
- data/ext/sources/ggml/src/ggml-cuda/convert.cu +41 -27
- data/ext/sources/ggml/src/ggml-cuda/convert.cuh +10 -0
- data/ext/sources/ggml/src/ggml-cuda/cpy-utils.cuh +1 -1
- data/ext/sources/ggml/src/ggml-cuda/cpy.cu +342 -246
- data/ext/sources/ggml/src/ggml-cuda/cpy.cuh +1 -5
- data/ext/sources/ggml/src/ggml-cuda/cumsum.cu +307 -0
- data/ext/sources/ggml/src/ggml-cuda/cumsum.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/diag.cu +77 -0
- data/ext/sources/ggml/src/ggml-cuda/diag.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +98 -74
- data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +973 -665
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cu +35 -741
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cuh +1255 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn-vec.cuh +33 -40
- data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +40 -18
- data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cuh +48 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn.cu +206 -45
- data/ext/sources/ggml/src/ggml-cuda/fill.cu +37 -0
- data/ext/sources/ggml/src/ggml-cuda/fill.cuh +3 -0
- data/ext/sources/ggml/src/ggml-cuda/gated_delta_net.cu +263 -0
- data/ext/sources/ggml/src/ggml-cuda/gated_delta_net.cuh +4 -0
- data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +1688 -302
- data/ext/sources/ggml/src/ggml-cuda/mean.cu +12 -10
- data/ext/sources/ggml/src/ggml-cuda/mma.cuh +908 -48
- data/ext/sources/ggml/src/ggml-cuda/mmf.cu +88 -20
- data/ext/sources/ggml/src/ggml-cuda/mmf.cuh +502 -90
- data/ext/sources/ggml/src/ggml-cuda/mmid.cu +164 -0
- data/ext/sources/ggml/src/ggml-cuda/mmid.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/mmq.cu +69 -176
- data/ext/sources/ggml/src/ggml-cuda/mmq.cuh +532 -193
- data/ext/sources/ggml/src/ggml-cuda/mmvf.cu +460 -104
- data/ext/sources/ggml/src/ggml-cuda/mmvf.cuh +5 -2
- data/ext/sources/ggml/src/ggml-cuda/mmvq.cu +360 -122
- data/ext/sources/ggml/src/ggml-cuda/mmvq.cuh +2 -1
- data/ext/sources/ggml/src/ggml-cuda/norm.cu +18 -76
- data/ext/sources/ggml/src/ggml-cuda/pad.cu +73 -39
- data/ext/sources/ggml/src/ggml-cuda/quantize.cu +152 -1
- data/ext/sources/ggml/src/ggml-cuda/quantize.cuh +14 -0
- data/ext/sources/ggml/src/ggml-cuda/reduce_rows.cuh +2 -16
- data/ext/sources/ggml/src/ggml-cuda/rope.cu +364 -149
- data/ext/sources/ggml/src/ggml-cuda/rope.cuh +2 -0
- data/ext/sources/ggml/src/ggml-cuda/set-rows.cu +101 -47
- data/ext/sources/ggml/src/ggml-cuda/set.cu +39 -0
- data/ext/sources/ggml/src/ggml-cuda/set.cuh +7 -0
- data/ext/sources/ggml/src/ggml-cuda/softmax.cu +163 -41
- data/ext/sources/ggml/src/ggml-cuda/solve_tri.cu +275 -0
- data/ext/sources/ggml/src/ggml-cuda/solve_tri.cuh +3 -0
- data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cu +68 -50
- data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cuh +1 -1
- data/ext/sources/ggml/src/ggml-cuda/ssm-scan.cu +49 -84
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_32.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_4.cu +1 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_32.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_4.cu +1 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_4.cu +1 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_4.cu +1 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq112-dv112.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq128-dv128.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq256-dv256.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq40-dv40.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq576-dv512.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq64-dv64.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq72-dv72.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq80-dv80.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq96-dv96.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +22 -4
- data/ext/sources/ggml/src/ggml-cuda/top-k.cu +95 -0
- data/ext/sources/ggml/src/ggml-cuda/top-k.cuh +3 -0
- data/ext/sources/ggml/src/ggml-cuda/topk-moe.cu +275 -119
- data/ext/sources/ggml/src/ggml-cuda/topk-moe.cuh +20 -7
- data/ext/sources/ggml/src/ggml-cuda/tri.cu +136 -0
- data/ext/sources/ggml/src/ggml-cuda/tri.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/unary.cu +160 -11
- data/ext/sources/ggml/src/ggml-cuda/unary.cuh +38 -0
- data/ext/sources/ggml/src/ggml-cuda/upscale.cu +163 -7
- data/ext/sources/ggml/src/ggml-cuda/vecdotq.cuh +31 -17
- data/ext/sources/ggml/src/ggml-cuda/vendors/cuda.h +4 -0
- data/ext/sources/ggml/src/ggml-cuda/vendors/hip.h +22 -1
- data/ext/sources/ggml/src/ggml-cuda/vendors/musa.h +6 -0
- data/ext/sources/ggml/src/ggml-hexagon/CMakeLists.txt +117 -0
- data/ext/sources/ggml/src/ggml-hexagon/ggml-hexagon.cpp +3325 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/CMakeLists.txt +46 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/act-ops.c +813 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/argsort-ops.c +281 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/binary-ops.c +891 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/cmake-toolchain.cmake +157 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/cpy-ops.c +252 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/flash-attn-ops.c +713 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/get-rows-ops.c +112 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hex-dma.c +63 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hex-dma.h +182 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hex-dump.h +77 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hex-fastdiv.h +37 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hex-utils.h +51 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ctx.h +35 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp-msg.h +155 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ops.h +63 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp_iface.idl +16 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-arith.h +443 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-base.h +240 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-copy.h +245 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-div.h +251 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-dump.h +129 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-exp.h +215 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-floor.h +100 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-inverse.h +210 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-reduce.h +296 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-scale.h +133 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sigmoid.h +141 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sqrt.h +126 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-types.h +36 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-utils.h +26 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/main.c +1199 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/matmul-ops.c +2670 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/rope-ops.c +497 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/set-rows-ops.c +168 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/softmax-ops.c +419 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/ssm-conv.c +339 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/sum-rows-ops.c +128 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/unary-ops.c +382 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/worker-pool.c +293 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/worker-pool.h +57 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp-drv.cpp +418 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp-drv.h +121 -0
- data/ext/sources/ggml/src/ggml-hexagon/libdl.h +79 -0
- data/ext/sources/ggml/src/ggml-hexagon/libggml-htp.inf +38 -0
- data/ext/sources/ggml/src/ggml-hexagon/op-desc.h +153 -0
- data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +14 -13
- data/ext/sources/ggml/src/ggml-impl.h +129 -6
- data/ext/sources/ggml/src/ggml-metal/CMakeLists.txt +10 -10
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-common.cpp +15 -4
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.h +8 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.m +173 -34
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.cpp +912 -344
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.h +124 -59
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.m +588 -144
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +396 -23
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.cpp +1724 -421
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.h +16 -3
- data/ext/sources/ggml/src/ggml-metal/ggml-metal.cpp +333 -114
- data/ext/sources/ggml/src/ggml-metal/ggml-metal.metal +3050 -1539
- data/ext/sources/ggml/src/ggml-musa/CMakeLists.txt +3 -1
- data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +30 -1
- data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +4279 -497
- data/ext/sources/ggml/src/ggml-opencl/kernels/concat.cl +41 -99
- data/ext/sources/ggml/src/ggml-opencl/kernels/cpy.cl +45 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/cumsum.cl +139 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/cvt.cl +267 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/diag.cl +27 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/exp.cl +125 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/expm1.cl +113 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/fill.cl +17 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/flash_attn_f32.cl +4 -3
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_mxfp4_f32.cl +162 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q4_1_f32.cl +132 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_mxfp4_f32.cl +156 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_general_q8_0_f32.cl +195 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q4_1_f32.cl +283 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/get_rows.cl +36 -12
- data/ext/sources/ggml/src/ggml-opencl/kernels/l2_norm.cl +71 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mean.cl +140 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_kq_kqv.cl +273 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_l4_lm.cl +24 -10
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_f32_f32_l4_lm.cl +24 -10
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q4_0_f32_l4_lm.cl +163 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q4_1_f32_l4_lm.cl +165 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q6_k_f32_l4_lm.cl +158 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_8x4.cl +129 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_l4_lm.cl +154 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32.cl +219 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32_flat.cl +229 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_k_f32.cl +180 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/{mul_mv_q6_k.cl → mul_mv_q6_k_f32.cl} +4 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32_flat.cl +194 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/neg.cl +125 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/pad.cl +29 -20
- data/ext/sources/ggml/src/ggml-opencl/kernels/repeat.cl +31 -32
- data/ext/sources/ggml/src/ggml-opencl/kernels/rms_norm.cl +25 -10
- data/ext/sources/ggml/src/ggml-opencl/kernels/rope.cl +50 -24
- data/ext/sources/ggml/src/ggml-opencl/kernels/scale.cl +14 -4
- data/ext/sources/ggml/src/ggml-opencl/kernels/set_rows.cl +35 -16
- data/ext/sources/ggml/src/ggml-opencl/kernels/softplus.cl +116 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/solve_tri.cl +51 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/sqr.cl +53 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/sqrt.cl +53 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/ssm_conv.cl +77 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/sum_rows.cl +114 -13
- data/ext/sources/ggml/src/ggml-opencl/kernels/tanh.cl +94 -48
- data/ext/sources/ggml/src/ggml-opencl/kernels/transpose.cl +39 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/tri.cl +32 -0
- data/ext/sources/ggml/src/ggml-openvino/.clang-format +154 -0
- data/ext/sources/ggml/src/ggml-openvino/CMakeLists.txt +22 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-decoder.cpp +975 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-decoder.h +294 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-openvino-extra.cpp +373 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-openvino-extra.h +182 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-openvino.cpp +1110 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-quants.cpp +884 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-quants.h +153 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/decoder.h +74 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/frontend.cpp +27 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/frontend.h +23 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/input_model.cpp +17 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/input_model.h +29 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/node_context.h +112 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/cont.cpp +48 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/cpy.cpp +21 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp +90 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/get_rows.cpp +69 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/glu_geglu.cpp +61 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp +62 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/mulmat.cpp +90 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/permute.cpp +102 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/reshape.cpp +83 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp +46 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/rope.cpp +123 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/scale.cpp +41 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/set_rows.cpp +76 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/softmax.cpp +89 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/transpose.cpp +23 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/unary_silu.cpp +27 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/view.cpp +53 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op_table.cpp +46 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op_table.h +39 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.cpp +123 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.h +17 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.cpp +60 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.h +17 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/mark_decompression_convert_constant_folding.h +29 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/squeeze_matmul.cpp +58 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/squeeze_matmul.h +17 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/translate_session.cpp +293 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/translate_session.h +28 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/utils.cpp +226 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/utils.h +85 -0
- data/ext/sources/ggml/src/ggml-openvino/utils.cpp +823 -0
- data/ext/sources/ggml/src/ggml-openvino/utils.h +123 -0
- data/ext/sources/ggml/src/ggml-quants.c +96 -5
- data/ext/sources/ggml/src/ggml-quants.h +3 -0
- data/ext/sources/ggml/src/ggml-rpc/ggml-rpc.cpp +438 -156
- data/ext/sources/ggml/src/ggml-sycl/CMakeLists.txt +59 -87
- data/ext/sources/ggml/src/ggml-sycl/add-id.cpp +81 -0
- data/ext/sources/ggml/src/ggml-sycl/add-id.hpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/backend.hpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/binbcast.cpp +21 -29
- data/ext/sources/ggml/src/ggml-sycl/binbcast.hpp +0 -6
- data/ext/sources/ggml/src/ggml-sycl/common.hpp +427 -20
- data/ext/sources/ggml/src/ggml-sycl/concat.cpp +55 -44
- data/ext/sources/ggml/src/ggml-sycl/convert.cpp +103 -1
- data/ext/sources/ggml/src/ggml-sycl/convert.hpp +22 -1
- data/ext/sources/ggml/src/ggml-sycl/count-equal.cpp +79 -0
- data/ext/sources/ggml/src/ggml-sycl/count-equal.hpp +9 -0
- data/ext/sources/ggml/src/ggml-sycl/cpy.cpp +0 -3
- data/ext/sources/ggml/src/ggml-sycl/dequantize.hpp +18 -0
- data/ext/sources/ggml/src/ggml-sycl/dpct/helper.hpp +867 -50
- data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +401 -358
- data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +12 -2
- data/ext/sources/ggml/src/ggml-sycl/fattn-common.hpp +1179 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn-tile.cpp +55 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn-tile.hpp +1338 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn-vec.hpp +667 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn.cpp +225 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn.hpp +22 -0
- data/ext/sources/ggml/src/ggml-sycl/gated_delta_net.cpp +309 -0
- data/ext/sources/ggml/src/ggml-sycl/gated_delta_net.hpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/ggml-sycl.cpp +645 -155
- data/ext/sources/ggml/src/ggml-sycl/mmvq.cpp +22 -0
- data/ext/sources/ggml/src/ggml-sycl/norm.cpp +221 -66
- data/ext/sources/ggml/src/ggml-sycl/norm.hpp +2 -0
- data/ext/sources/ggml/src/ggml-sycl/outprod.cpp +3 -3
- data/ext/sources/ggml/src/ggml-sycl/pad.cpp +97 -0
- data/ext/sources/ggml/src/ggml-sycl/pad.hpp +24 -0
- data/ext/sources/ggml/src/ggml-sycl/pad_reflect_1d.cpp +100 -0
- data/ext/sources/ggml/src/ggml-sycl/pad_reflect_1d.hpp +10 -0
- data/ext/sources/ggml/src/ggml-sycl/presets.hpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/quants.hpp +1 -1
- data/ext/sources/ggml/src/ggml-sycl/repeat_back.cpp +76 -0
- data/ext/sources/ggml/src/ggml-sycl/repeat_back.hpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/roll.cpp +122 -0
- data/ext/sources/ggml/src/ggml-sycl/roll.hpp +20 -0
- data/ext/sources/ggml/src/ggml-sycl/rope.cpp +457 -281
- data/ext/sources/ggml/src/ggml-sycl/rope.hpp +6 -0
- data/ext/sources/ggml/src/ggml-sycl/set.cpp +73 -0
- data/ext/sources/ggml/src/ggml-sycl/set.hpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/softmax.cpp +327 -162
- data/ext/sources/ggml/src/ggml-sycl/softmax.hpp +4 -0
- data/ext/sources/ggml/src/ggml-sycl/ssm_conv.cpp +127 -0
- data/ext/sources/ggml/src/ggml-sycl/ssm_conv.hpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq112-dv112.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq128-dv128.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq256-dv256.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq40-dv40.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq576-dv512.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq64-dv64.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq72-dv72.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq80-dv80.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq96-dv96.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-f16.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q8_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-f16.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q8_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-f16.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q8_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-f16.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q8_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-f16.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q8_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-f16.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q8_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +71 -0
- data/ext/sources/ggml/src/ggml-sycl/wkv.cpp +1 -1
- data/ext/sources/ggml/src/ggml-virtgpu/CMakeLists.txt +70 -0
- data/ext/sources/ggml/src/ggml-virtgpu/apir_cs_ggml-rpc-front.cpp +87 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/CMakeLists.txt +21 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/apir_cs_ggml-rpc-back.cpp +115 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-convert.h +13 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-backend.cpp +102 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer-type.cpp +105 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer.cpp +179 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-device.cpp +148 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched.cpp +51 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched.gen.h +73 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched.h +27 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-virgl-apir.h +32 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend.cpp +144 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/api_remoting.h +95 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_backend.gen.h +94 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_backend.h +50 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_cs.h +378 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_cs_ggml.h +232 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_cs_rpc.h +58 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-buffer-type.cpp +81 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-buffer.cpp +119 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-device.cpp +158 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-reg.cpp +213 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend.cpp +69 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-remoting.h +71 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggmlremoting_functions.yaml +166 -0
- data/ext/sources/ggml/src/ggml-virtgpu/include/apir_hw.h +9 -0
- data/ext/sources/ggml/src/ggml-virtgpu/regenerate_remoting.py +333 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-apir.h +15 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-backend.cpp +58 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-buffer-type.cpp +110 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-buffer.cpp +173 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-device.cpp +192 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-impl.h +36 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward.gen.h +53 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-shm.cpp +98 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-shm.h +23 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-utils.cpp +179 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-utils.h +86 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu.cpp +544 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu.h +117 -0
- data/ext/sources/ggml/src/ggml-vulkan/CMakeLists.txt +39 -19
- data/ext/sources/ggml/src/ggml-vulkan/ggml-vulkan.cpp +5994 -3055
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/abs.comp +21 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/acc.comp +18 -10
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add1.comp +28 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add_id.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/arange.comp +20 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argmax.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp +33 -26
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argsort_large.comp +114 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/ceil.comp +22 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/clamp.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/concat.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/contig_copy.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_dw.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +47 -49
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +3 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +4 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_transpose.comp +67 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/cos.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/count_equal.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/count_experts.comp +51 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/cumsum.comp +83 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/cumsum_multipass1.comp +60 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/cumsum_multipass2.comp +66 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_f32.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{dequant_funcs.comp → dequant_funcs.glsl} +9 -21
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{dequant_funcs_cm2.comp → dequant_funcs_cm2.glsl} +18 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{dequant_head.comp → dequant_head.glsl} +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_m.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_s.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_s.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xs.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xxs.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_s.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_xxs.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_nl.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_xs.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_mxfp4.comp +3 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q2_k.comp +3 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q3_k.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_0.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_1.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_k.comp +3 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_0.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_1.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_k.comp +3 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q6_k.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q8_0.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/diag.comp +29 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/diag_mask_inf.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/div.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/elu.comp +27 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/exp.comp +3 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/fill.comp +19 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +386 -160
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{flash_attn_base.comp → flash_attn_base.glsl} +82 -20
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +400 -174
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +123 -37
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_mask_opt.comp +162 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +10 -9
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/floor.comp +22 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gated_delta_net.comp +128 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu_erf.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu_quick.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gelu.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gelu_erf.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gelu_quick.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{generic_binary_head.comp → generic_binary_head.glsl} +17 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{generic_head.comp → generic_head.glsl} +2 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{generic_unary_head.comp → generic_unary_head.glsl} +7 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/get_rows.comp +4 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/get_rows_quant.comp +3 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{glu_head.comp → glu_head.glsl} +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/group_norm.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/hardsigmoid.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/hardswish.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +19 -7
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col_3d.comp +2 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/l2_norm.comp +13 -10
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/leaky_relu.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/log.comp +18 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{mul_mat_vec_base.comp → mul_mat_vec_base.glsl} +77 -29
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iface.glsl +35 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_m.comp +71 -21
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_s.comp +41 -25
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_s.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xs.comp +44 -26
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xxs.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_s.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_xxs.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_nc.comp +9 -7
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_p021.comp +9 -7
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp +4 -6
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q3_k.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp +4 -6
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp +4 -6
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp +39 -36
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq_funcs.glsl +494 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +88 -105
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +41 -26
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{mul_mm_funcs.comp → mul_mm_funcs.glsl} +69 -59
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_id_funcs.glsl +74 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +92 -230
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.glsl +454 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_shmem_types.glsl +78 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp +97 -13
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/neg.comp +20 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/norm.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_adamw.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_sgd.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/pad.comp +21 -6
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/pool2d.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/quantize_q8_1.comp +10 -10
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/reglu.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/relu.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/repeat.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/repeat_back.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +49 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_back.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_partials.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/roll.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl +207 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.glsl +20 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +8 -49
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +8 -32
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +8 -32
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl +33 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_vision.comp +8 -38
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/round.comp +29 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sgn.comp +21 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sigmoid.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/silu.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/silu_back.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sin.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_back.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large1.comp +62 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large2.comp +79 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large3.comp +65 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large_common.glsl +53 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/softplus.comp +23 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/solve_tri.comp +81 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sqrt.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/square.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/ssm_conv.comp +50 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/ssm_scan.comp +124 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/step.comp +22 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sub.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.comp +2 -25
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.glsl +25 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/swiglu.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/swiglu_oai.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/tanh.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/timestep_embedding.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/topk_argsort.comp +118 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/topk_moe.comp +213 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/topk_nary_search.comp +246 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/tri.comp +43 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/trunc.comp +22 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{types.comp → types.glsl} +345 -26
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +90 -12
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +384 -180
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/xielu.comp +35 -0
- data/ext/sources/ggml/src/ggml-webgpu/CMakeLists.txt +28 -2
- data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp +1374 -0
- data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu.cpp +2544 -726
- data/ext/sources/ggml/src/ggml-webgpu/pre_wgsl.hpp +778 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/argmax.wgsl +72 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/argsort.wgsl +106 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/argsort_merge.wgsl +134 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary.wgsl +141 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +65 -72
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/concat.wgsl +75 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cpy.tmpl.wgsl +107 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cumsum.wgsl +66 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +73 -15
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn.wgsl +636 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{get_rows.tmpl.wgsl → get_rows.wgsl} +53 -259
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/glu.tmpl.wgsl +323 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{mul_mat.tmpl.wgsl → mul_mat.wgsl} +72 -261
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl +766 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.wgsl +147 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_subgroup_matrix.wgsl +196 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.wgsl +480 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/pad.wgsl +86 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/repeat.wgsl +67 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm.wgsl +83 -17
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rope.tmpl.wgsl +295 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/scale.wgsl +63 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +40 -12
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/soft_max.tmpl.wgsl +345 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/sum_rows.wgsl +55 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/unary.wgsl +193 -0
- data/ext/sources/ggml/src/ggml-zdnn/ggml-zdnn.cpp +6 -1
- data/ext/sources/ggml/src/ggml-zendnn/CMakeLists.txt +91 -0
- data/ext/sources/ggml/src/ggml-zendnn/ggml-zendnn.cpp +469 -0
- data/ext/sources/ggml/src/ggml.c +590 -64
- data/ext/sources/ggml/src/gguf.cpp +229 -44
- data/ext/sources/include/whisper.h +1 -0
- data/ext/sources/src/CMakeLists.txt +3 -1
- data/ext/sources/src/whisper.cpp +106 -62
- data/ext/sources/tests/CMakeLists.txt +2 -2
- data/ext/sources/tests/test-vad-full.cpp +4 -2
- data/ext/sources/tests/test-vad.cpp +1 -1
- data/extsources.rb +1 -0
- data/lib/whisper/model/uri.rb +17 -18
- data/sig/whisper.rbs +162 -4
- data/test/test_context_params.rb +82 -0
- data/test/test_params.rb +16 -8
- data/test/test_segment.rb +0 -1
- data/test/test_token.rb +81 -0
- data/test/test_vad.rb +1 -1
- data/test/test_vad_context.rb +100 -0
- data/test/test_vad_segment.rb +19 -0
- data/test/test_vad_segments.rb +16 -0
- data/test/test_whisper.rb +27 -0
- data/whispercpp.gemspec +1 -1
- metadata +502 -37
- data/ext/sources/build-xcframework.sh +0 -571
- data/ext/sources/examples/talk-llama/llama-sampling.h +0 -32
- data/ext/sources/ggml/cmake/BuildTypes.cmake +0 -54
- data/ext/sources/ggml/src/ggml-cann/Doxyfile +0 -2579
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.comp +0 -105
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.comp +0 -55
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/add.tmpl.wgsl +0 -44
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/add_in_place.tmpl.wgsl +0 -41
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary_head.tmpl +0 -45
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl +0 -60
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul.tmpl.wgsl +0 -44
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_in_place.tmpl.wgsl +0 -41
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm_in_place.wgsl +0 -48
- /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{test_bfloat16_support.comp → feature-tests/bfloat16.comp} +0 -0
- /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{test_coopmat_support.comp → feature-tests/coopmat.comp} +0 -0
- /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{test_coopmat2_support.comp → feature-tests/coopmat2.comp} +0 -0
- /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{test_integer_dot_support.comp → feature-tests/integer_dot.comp} +0 -0
- /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{glu_main.comp → glu_main.glsl} +0 -0
- /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{rte.comp → rte.glsl} +0 -0
- /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{utils.comp → utils.glsl} +0 -0
|
@@ -1,11 +1,11 @@
|
|
|
1
|
-
#include "llama
|
|
1
|
+
#include "llama.h"
|
|
2
2
|
#include "llama-impl.h"
|
|
3
3
|
#include "llama-model.h"
|
|
4
4
|
#include "llama-model-loader.h"
|
|
5
5
|
|
|
6
|
-
#include <algorithm>
|
|
7
6
|
#include <cmath>
|
|
8
7
|
#include <cstring>
|
|
8
|
+
#include <string>
|
|
9
9
|
#include <cinttypes>
|
|
10
10
|
#include <fstream>
|
|
11
11
|
#include <mutex>
|
|
@@ -13,10 +13,28 @@
|
|
|
13
13
|
#include <thread>
|
|
14
14
|
#include <unordered_map>
|
|
15
15
|
|
|
16
|
-
//
|
|
17
|
-
struct
|
|
16
|
+
// result of parsing --tensor-type option
|
|
17
|
+
// (changes to this struct must be reflected in tools/quantize/quantize.cpp)
|
|
18
|
+
struct tensor_type_option {
|
|
18
19
|
std::string name;
|
|
19
|
-
ggml_type
|
|
20
|
+
ggml_type type = GGML_TYPE_COUNT;
|
|
21
|
+
};
|
|
22
|
+
|
|
23
|
+
// tensor categorization - used to avoid repeated string matching in quantization logic.
|
|
24
|
+
// this is different from LLM_TN - we want broad categories, not specific tensor names per arch.
|
|
25
|
+
enum class tensor_category {
|
|
26
|
+
TOKEN_EMBD,
|
|
27
|
+
ATTENTION_Q,
|
|
28
|
+
ATTENTION_V,
|
|
29
|
+
ATTENTION_K,
|
|
30
|
+
ATTENTION_QKV,
|
|
31
|
+
ATTENTION_KV_B,
|
|
32
|
+
ATTENTION_OUTPUT,
|
|
33
|
+
FFN_UP,
|
|
34
|
+
FFN_GATE,
|
|
35
|
+
FFN_DOWN,
|
|
36
|
+
OUTPUT,
|
|
37
|
+
OTHER
|
|
20
38
|
};
|
|
21
39
|
|
|
22
40
|
static void zeros(std::ofstream & file, size_t n) {
|
|
@@ -54,7 +72,7 @@ static std::string remap_layer(const std::string & orig_name, const std::vector<
|
|
|
54
72
|
return orig_name;
|
|
55
73
|
}
|
|
56
74
|
|
|
57
|
-
static std::string remap_imatrix
|
|
75
|
+
static std::string remap_imatrix(const std::string & orig_name, const std::map<int, std::string> & mapped) {
|
|
58
76
|
if (mapped.empty()) {
|
|
59
77
|
return orig_name;
|
|
60
78
|
}
|
|
@@ -76,6 +94,73 @@ static std::string remap_imatrix (const std::string & orig_name, const std::map<
|
|
|
76
94
|
return orig_name;
|
|
77
95
|
}
|
|
78
96
|
|
|
97
|
+
//
|
|
98
|
+
// helper functions for tensor name matching
|
|
99
|
+
//
|
|
100
|
+
|
|
101
|
+
static bool tensor_name_match_token_embd(const char * tensor_name) {
|
|
102
|
+
return std::strcmp(tensor_name, "token_embd.weight") == 0 ||
|
|
103
|
+
std::strcmp(tensor_name, "per_layer_token_embd.weight") == 0;
|
|
104
|
+
}
|
|
105
|
+
|
|
106
|
+
static bool tensor_name_match_output_weight(const char * tensor_name) {
|
|
107
|
+
return std::strcmp(tensor_name, "output.weight") == 0;
|
|
108
|
+
}
|
|
109
|
+
|
|
110
|
+
//
|
|
111
|
+
// tensor categorization for quantization
|
|
112
|
+
//
|
|
113
|
+
// (this is different from LLM_TN - we want broad categories, not specific tensor names per arch)
|
|
114
|
+
//
|
|
115
|
+
|
|
116
|
+
static tensor_category tensor_get_category(const std::string & tensor_name) {
|
|
117
|
+
if (tensor_name_match_output_weight(tensor_name.c_str())) {
|
|
118
|
+
return tensor_category::OUTPUT;
|
|
119
|
+
}
|
|
120
|
+
if (tensor_name_match_token_embd(tensor_name.c_str())) {
|
|
121
|
+
return tensor_category::TOKEN_EMBD;
|
|
122
|
+
}
|
|
123
|
+
if (tensor_name.find("attn_qkv.weight") != std::string::npos) {
|
|
124
|
+
return tensor_category::ATTENTION_QKV;
|
|
125
|
+
}
|
|
126
|
+
if (tensor_name.find("attn_kv_b.weight") != std::string::npos) {
|
|
127
|
+
return tensor_category::ATTENTION_KV_B;
|
|
128
|
+
}
|
|
129
|
+
if (tensor_name.find("attn_v.weight") != std::string::npos) {
|
|
130
|
+
return tensor_category::ATTENTION_V;
|
|
131
|
+
}
|
|
132
|
+
if (tensor_name.find("attn_k.weight") != std::string::npos) {
|
|
133
|
+
return tensor_category::ATTENTION_K;
|
|
134
|
+
}
|
|
135
|
+
if (tensor_name.find("attn_q.weight") != std::string::npos) {
|
|
136
|
+
return tensor_category::ATTENTION_Q;
|
|
137
|
+
}
|
|
138
|
+
if (tensor_name.find("attn_output.weight") != std::string::npos) {
|
|
139
|
+
return tensor_category::ATTENTION_OUTPUT;
|
|
140
|
+
}
|
|
141
|
+
if (tensor_name.find("ffn_up") != std::string::npos) {
|
|
142
|
+
return tensor_category::FFN_UP;
|
|
143
|
+
}
|
|
144
|
+
if (tensor_name.find("ffn_gate") != std::string::npos) {
|
|
145
|
+
return tensor_category::FFN_GATE;
|
|
146
|
+
}
|
|
147
|
+
if (tensor_name.find("ffn_down") != std::string::npos) {
|
|
148
|
+
return tensor_category::FFN_DOWN;
|
|
149
|
+
}
|
|
150
|
+
return tensor_category::OTHER;
|
|
151
|
+
}
|
|
152
|
+
|
|
153
|
+
// check if category is for attention-v-like tensors (more sensitive to quantization)
|
|
154
|
+
static bool category_is_attn_v(tensor_category cat) {
|
|
155
|
+
return cat == tensor_category::ATTENTION_V ||
|
|
156
|
+
cat == tensor_category::ATTENTION_QKV ||
|
|
157
|
+
cat == tensor_category::ATTENTION_KV_B;
|
|
158
|
+
}
|
|
159
|
+
|
|
160
|
+
//
|
|
161
|
+
// quantization state
|
|
162
|
+
//
|
|
163
|
+
|
|
79
164
|
struct quantize_state_impl {
|
|
80
165
|
const llama_model & model;
|
|
81
166
|
const llama_model_quantize_params * params;
|
|
@@ -89,20 +174,42 @@ struct quantize_state_impl {
|
|
|
89
174
|
int i_ffn_gate = 0;
|
|
90
175
|
int i_ffn_up = 0;
|
|
91
176
|
|
|
92
|
-
int n_k_quantized = 0;
|
|
93
177
|
int n_fallback = 0;
|
|
94
178
|
|
|
95
179
|
bool has_imatrix = false;
|
|
96
180
|
|
|
97
|
-
// used to figure out if a model
|
|
98
|
-
bool
|
|
181
|
+
// used to figure out if a model has tied embeddings (tok_embd shares weights with output)
|
|
182
|
+
bool has_tied_embeddings = true; // assume tied until we see output.weight
|
|
183
|
+
|
|
184
|
+
// tensor type override patterns (compiled once, used twice)
|
|
185
|
+
std::vector<std::pair<std::regex, ggml_type>> tensor_type_patterns;
|
|
186
|
+
|
|
187
|
+
quantize_state_impl(const llama_model & model, const llama_model_quantize_params * params):
|
|
188
|
+
model(model), params(params)
|
|
189
|
+
{
|
|
190
|
+
// compile regex patterns once - they are expensive
|
|
191
|
+
if (params->tensor_types) {
|
|
192
|
+
const auto & tensor_types = *static_cast<const std::vector<tensor_type_option> *>(params->tensor_types);
|
|
193
|
+
for (const auto & [tname, qtype] : tensor_types) {
|
|
194
|
+
tensor_type_patterns.emplace_back(std::regex(tname), qtype);
|
|
195
|
+
}
|
|
196
|
+
}
|
|
197
|
+
}
|
|
198
|
+
};
|
|
99
199
|
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
200
|
+
// per-tensor metadata, computed in the preliminary loop and used in the main loop
|
|
201
|
+
struct tensor_metadata {
|
|
202
|
+
ggml_type target_type;
|
|
203
|
+
tensor_category category;
|
|
204
|
+
std::string remapped_imatrix_name;
|
|
205
|
+
bool allows_quantization;
|
|
206
|
+
bool requires_imatrix;
|
|
104
207
|
};
|
|
105
208
|
|
|
209
|
+
//
|
|
210
|
+
// dequantization
|
|
211
|
+
//
|
|
212
|
+
|
|
106
213
|
static void llama_tensor_dequantize_impl(
|
|
107
214
|
ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<std::thread> & workers,
|
|
108
215
|
const size_t nelements, const int nthread
|
|
@@ -175,12 +282,132 @@ static void llama_tensor_dequantize_impl(
|
|
|
175
282
|
workers.clear();
|
|
176
283
|
}
|
|
177
284
|
|
|
178
|
-
|
|
285
|
+
//
|
|
286
|
+
// do we allow this tensor to be quantized?
|
|
287
|
+
//
|
|
288
|
+
|
|
289
|
+
static bool tensor_allows_quantization(const llama_model_quantize_params * params, llm_arch arch, const ggml_tensor * tensor) {
|
|
290
|
+
// trivial checks first -- no string ops needed
|
|
291
|
+
if (params->only_copy) return false;
|
|
292
|
+
|
|
293
|
+
// quantize only 2D and 3D tensors (experts)
|
|
294
|
+
if (ggml_n_dims(tensor) < 2) return false;
|
|
295
|
+
|
|
296
|
+
const std::string name = ggml_get_name(tensor);
|
|
297
|
+
|
|
298
|
+
// This used to be a regex, but <regex> has an extreme cost to compile times.
|
|
299
|
+
bool quantize = name.rfind("weight") == name.size() - 6; // ends with 'weight'?
|
|
300
|
+
|
|
301
|
+
// do not quantize norm tensors
|
|
302
|
+
quantize &= name.find("_norm.weight") == std::string::npos;
|
|
303
|
+
|
|
304
|
+
quantize &= params->quantize_output_tensor || name != "output.weight";
|
|
305
|
+
|
|
306
|
+
// do not quantize expert gating tensors
|
|
307
|
+
// NOTE: can't use LLM_TN here because the layer number is not known
|
|
308
|
+
quantize &= name.find("ffn_gate_inp.weight") == std::string::npos;
|
|
309
|
+
|
|
310
|
+
// these are very small (e.g. 4x4)
|
|
311
|
+
quantize &= name.find("altup") == std::string::npos;
|
|
312
|
+
quantize &= name.find("laurel") == std::string::npos;
|
|
313
|
+
|
|
314
|
+
// these are not too big so keep them as it is
|
|
315
|
+
quantize &= name.find("per_layer_model_proj") == std::string::npos;
|
|
316
|
+
|
|
317
|
+
// do not quantize positional embeddings and token types (BERT)
|
|
318
|
+
quantize &= name != LLM_TN(arch)(LLM_TENSOR_POS_EMBD, "weight");
|
|
319
|
+
quantize &= name != LLM_TN(arch)(LLM_TENSOR_TOKEN_TYPES, "weight");
|
|
320
|
+
|
|
321
|
+
// do not quantize Mamba/Kimi's small conv1d weights
|
|
322
|
+
// NOTE: can't use LLM_TN here because the layer number is not known
|
|
323
|
+
quantize &= name.find("ssm_conv1d") == std::string::npos;
|
|
324
|
+
quantize &= name.find("shortconv.conv.weight") == std::string::npos;
|
|
325
|
+
|
|
326
|
+
// do not quantize RWKV's small yet 2D weights
|
|
327
|
+
quantize &= name.find("time_mix_first.weight") == std::string::npos;
|
|
328
|
+
quantize &= name.find("time_mix_w0.weight") == std::string::npos;
|
|
329
|
+
quantize &= name.find("time_mix_w1.weight") == std::string::npos;
|
|
330
|
+
quantize &= name.find("time_mix_w2.weight") == std::string::npos;
|
|
331
|
+
quantize &= name.find("time_mix_v0.weight") == std::string::npos;
|
|
332
|
+
quantize &= name.find("time_mix_v1.weight") == std::string::npos;
|
|
333
|
+
quantize &= name.find("time_mix_v2.weight") == std::string::npos;
|
|
334
|
+
quantize &= name.find("time_mix_a0.weight") == std::string::npos;
|
|
335
|
+
quantize &= name.find("time_mix_a1.weight") == std::string::npos;
|
|
336
|
+
quantize &= name.find("time_mix_a2.weight") == std::string::npos;
|
|
337
|
+
quantize &= name.find("time_mix_g1.weight") == std::string::npos;
|
|
338
|
+
quantize &= name.find("time_mix_g2.weight") == std::string::npos;
|
|
339
|
+
quantize &= name.find("time_mix_decay_w1.weight") == std::string::npos;
|
|
340
|
+
quantize &= name.find("time_mix_decay_w2.weight") == std::string::npos;
|
|
341
|
+
quantize &= name.find("time_mix_lerp_fused.weight") == std::string::npos;
|
|
342
|
+
|
|
343
|
+
// do not quantize relative position bias (T5)
|
|
344
|
+
quantize &= name.find("attn_rel_b.weight") == std::string::npos;
|
|
345
|
+
|
|
346
|
+
// do not quantize specific multimodal tensors
|
|
347
|
+
quantize &= name.find(".position_embd.") == std::string::npos;
|
|
348
|
+
|
|
349
|
+
return quantize;
|
|
350
|
+
}
|
|
351
|
+
|
|
352
|
+
//
|
|
353
|
+
// tensor type selection
|
|
354
|
+
//
|
|
355
|
+
|
|
356
|
+
// incompatible tensor shapes are handled here - fallback to a compatible type
|
|
357
|
+
static ggml_type tensor_type_fallback(quantize_state_impl & qs, const ggml_tensor * t, const ggml_type target_type) {
|
|
358
|
+
ggml_type return_type = target_type;
|
|
359
|
+
|
|
360
|
+
const int64_t ncols = t->ne[0];
|
|
361
|
+
const int64_t qk_k = ggml_blck_size(target_type);
|
|
362
|
+
|
|
363
|
+
if (ncols % qk_k != 0) { // this tensor's shape is incompatible with this quant
|
|
364
|
+
LLAMA_LOG_WARN("warning: %-36s - ncols %6" PRId64 " not divisible by %3" PRId64 " (required for type %7s) ",
|
|
365
|
+
t->name, ncols, qk_k, ggml_type_name(target_type));
|
|
366
|
+
++qs.n_fallback;
|
|
367
|
+
|
|
368
|
+
switch (target_type) {
|
|
369
|
+
// types on the left: block size 256
|
|
370
|
+
case GGML_TYPE_IQ1_S:
|
|
371
|
+
case GGML_TYPE_IQ1_M:
|
|
372
|
+
case GGML_TYPE_IQ2_XXS:
|
|
373
|
+
case GGML_TYPE_IQ2_XS:
|
|
374
|
+
case GGML_TYPE_IQ2_S:
|
|
375
|
+
case GGML_TYPE_IQ3_XXS:
|
|
376
|
+
case GGML_TYPE_IQ3_S: // types on the right: block size 32
|
|
377
|
+
case GGML_TYPE_IQ4_XS: return_type = GGML_TYPE_IQ4_NL; break;
|
|
378
|
+
case GGML_TYPE_Q2_K:
|
|
379
|
+
case GGML_TYPE_Q3_K:
|
|
380
|
+
case GGML_TYPE_TQ1_0:
|
|
381
|
+
case GGML_TYPE_TQ2_0: return_type = GGML_TYPE_Q4_0; break;
|
|
382
|
+
case GGML_TYPE_Q4_K: return_type = GGML_TYPE_Q5_0; break;
|
|
383
|
+
case GGML_TYPE_Q5_K: return_type = GGML_TYPE_Q5_1; break;
|
|
384
|
+
case GGML_TYPE_Q6_K: return_type = GGML_TYPE_Q8_0; break;
|
|
385
|
+
default:
|
|
386
|
+
throw std::runtime_error(format("no tensor type fallback is defined for type %s",
|
|
387
|
+
ggml_type_name(target_type)));
|
|
388
|
+
}
|
|
389
|
+
if (ncols % ggml_blck_size(return_type) != 0) {
|
|
390
|
+
//
|
|
391
|
+
// the fallback return type is still not compatible for this tensor!
|
|
392
|
+
//
|
|
393
|
+
// most likely, this tensor's first dimension is not divisible by 32.
|
|
394
|
+
// this is very rare. we can either abort the quantization, or
|
|
395
|
+
// fallback to F16 / F32.
|
|
396
|
+
//
|
|
397
|
+
LLAMA_LOG_WARN("(WARNING: must use F16 due to unusual shape) ");
|
|
398
|
+
return_type = GGML_TYPE_F16;
|
|
399
|
+
}
|
|
400
|
+
LLAMA_LOG_WARN("-> falling back to %7s\n", ggml_type_name(return_type));
|
|
401
|
+
}
|
|
402
|
+
return return_type;
|
|
403
|
+
}
|
|
404
|
+
|
|
405
|
+
// internal standard logic for selecting the target tensor type based on tensor category, ftype, and model arch
|
|
406
|
+
static ggml_type llama_tensor_get_type_impl(quantize_state_impl & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype, tensor_category category) {
|
|
179
407
|
const std::string name = ggml_get_name(tensor);
|
|
180
408
|
|
|
181
409
|
// TODO: avoid hardcoded tensor names - use the TN_* constants
|
|
182
410
|
const llm_arch arch = qs.model.arch;
|
|
183
|
-
const auto tn = LLM_TN(arch);
|
|
184
411
|
|
|
185
412
|
auto use_more_bits = [](int i_layer, int n_layers) -> bool {
|
|
186
413
|
return i_layer < n_layers/8 || i_layer >= 7*n_layers/8 || (i_layer - n_layers/8)%3 == 2;
|
|
@@ -204,7 +431,7 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
|
|
|
204
431
|
|
|
205
432
|
// for arches that share the same tensor between the token embeddings and the output, we quantize the token embeddings
|
|
206
433
|
// with the quantization of the output tensor
|
|
207
|
-
if (
|
|
434
|
+
if (category == tensor_category::OUTPUT || (qs.has_tied_embeddings && category == tensor_category::TOKEN_EMBD)) {
|
|
208
435
|
if (qs.params->output_tensor_type < GGML_TYPE_COUNT) {
|
|
209
436
|
new_type = qs.params->output_tensor_type;
|
|
210
437
|
} else {
|
|
@@ -234,7 +461,7 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
|
|
|
234
461
|
} else {
|
|
235
462
|
new_type = GGML_TYPE_Q8_0;
|
|
236
463
|
}
|
|
237
|
-
} else if (
|
|
464
|
+
} else if (category == tensor_category::TOKEN_EMBD) {
|
|
238
465
|
if (qs.params->token_embedding_type < GGML_TYPE_COUNT) {
|
|
239
466
|
new_type = qs.params->token_embedding_type;
|
|
240
467
|
} else {
|
|
@@ -254,21 +481,21 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
|
|
|
254
481
|
}
|
|
255
482
|
} else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
|
|
256
483
|
ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
|
|
257
|
-
if (
|
|
484
|
+
if (category_is_attn_v(category)) {
|
|
258
485
|
if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K;
|
|
259
486
|
else new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
|
|
260
487
|
++qs.i_attention_wv;
|
|
261
488
|
}
|
|
262
|
-
else if (qs.model.hparams.n_expert == 8 &&
|
|
489
|
+
else if (qs.model.hparams.n_expert == 8 && category == tensor_category::ATTENTION_K) {
|
|
263
490
|
new_type = GGML_TYPE_Q4_K;
|
|
264
491
|
}
|
|
265
|
-
else if (
|
|
492
|
+
else if (category == tensor_category::FFN_DOWN) {
|
|
266
493
|
if (qs.i_ffn_down < qs.n_ffn_down/8) {
|
|
267
494
|
new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
|
|
268
495
|
}
|
|
269
496
|
++qs.i_ffn_down;
|
|
270
497
|
}
|
|
271
|
-
else if (
|
|
498
|
+
else if (category == tensor_category::ATTENTION_OUTPUT) {
|
|
272
499
|
if (qs.model.hparams.n_expert == 8) {
|
|
273
500
|
new_type = GGML_TYPE_Q5_K;
|
|
274
501
|
} else {
|
|
@@ -276,7 +503,7 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
|
|
|
276
503
|
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_S;
|
|
277
504
|
}
|
|
278
505
|
}
|
|
279
|
-
} else if (
|
|
506
|
+
} else if (category_is_attn_v(category)) {
|
|
280
507
|
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
|
|
281
508
|
new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
|
|
282
509
|
}
|
|
@@ -314,7 +541,7 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
|
|
|
314
541
|
new_type = GGML_TYPE_Q8_0;
|
|
315
542
|
}
|
|
316
543
|
++qs.i_attention_wv;
|
|
317
|
-
} else if (
|
|
544
|
+
} else if (category == tensor_category::ATTENTION_K) {
|
|
318
545
|
if (qs.model.hparams.n_expert == 8) {
|
|
319
546
|
// for the 8-expert model, bumping this to Q8_0 trades just ~128MB
|
|
320
547
|
// TODO: explore better strategies
|
|
@@ -326,14 +553,14 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
|
|
|
326
553
|
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
|
|
327
554
|
new_type = GGML_TYPE_IQ2_S;
|
|
328
555
|
}
|
|
329
|
-
} else if (
|
|
556
|
+
} else if (category == tensor_category::ATTENTION_Q) {
|
|
330
557
|
if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
|
|
331
558
|
new_type = GGML_TYPE_IQ3_XXS;
|
|
332
559
|
}
|
|
333
560
|
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
|
|
334
561
|
new_type = GGML_TYPE_IQ2_S;
|
|
335
562
|
}
|
|
336
|
-
} else if (
|
|
563
|
+
} else if (category == tensor_category::FFN_DOWN) {
|
|
337
564
|
auto info = layer_info(qs.i_ffn_down, qs.n_ffn_down, name.c_str());
|
|
338
565
|
int i_layer = info.first, n_layer = info.second;
|
|
339
566
|
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
|
|
@@ -378,7 +605,7 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
|
|
|
378
605
|
new_type = ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ? GGML_TYPE_Q4_1 : GGML_TYPE_Q5_1;
|
|
379
606
|
}
|
|
380
607
|
++qs.i_ffn_down;
|
|
381
|
-
} else if (
|
|
608
|
+
} else if (category == tensor_category::ATTENTION_OUTPUT) {
|
|
382
609
|
if (arch != LLM_ARCH_FALCON) {
|
|
383
610
|
if (qs.model.hparams.n_expert == 8) {
|
|
384
611
|
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
|
|
@@ -398,14 +625,14 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
|
|
|
398
625
|
if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
|
|
399
626
|
}
|
|
400
627
|
}
|
|
401
|
-
else if (
|
|
628
|
+
else if (category == tensor_category::ATTENTION_QKV) {
|
|
402
629
|
if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
|
|
403
630
|
new_type = GGML_TYPE_Q4_K;
|
|
404
631
|
}
|
|
405
632
|
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
|
|
406
633
|
else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
|
|
407
634
|
}
|
|
408
|
-
else if (
|
|
635
|
+
else if (category == tensor_category::FFN_GATE) {
|
|
409
636
|
auto info = layer_info(qs.i_ffn_gate, qs.n_ffn_gate, name.c_str());
|
|
410
637
|
int i_layer = info.first, n_layer = info.second;
|
|
411
638
|
if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) {
|
|
@@ -413,7 +640,7 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
|
|
|
413
640
|
}
|
|
414
641
|
++qs.i_ffn_gate;
|
|
415
642
|
}
|
|
416
|
-
else if (
|
|
643
|
+
else if (category == tensor_category::FFN_UP) {
|
|
417
644
|
auto info = layer_info(qs.i_ffn_up, qs.n_ffn_up, name.c_str());
|
|
418
645
|
int i_layer = info.first, n_layer = info.second;
|
|
419
646
|
if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) {
|
|
@@ -422,60 +649,58 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
|
|
|
422
649
|
++qs.i_ffn_up;
|
|
423
650
|
}
|
|
424
651
|
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
// IK: let's remove this, else Q2_K is almost the same as Q3_K_S
|
|
428
|
-
//else if (name.find("ffn_gate") != std::string::npos || name.find("ffn_up") != std::string::npos) {
|
|
429
|
-
// if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
|
|
430
|
-
//}
|
|
431
|
-
// This can be used to reduce the size of the Q5_K_S model.
|
|
432
|
-
// The associated PPL increase is fully in line with the size reduction
|
|
433
|
-
//else {
|
|
434
|
-
// if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q4_K;
|
|
435
|
-
//}
|
|
436
|
-
bool convert_incompatible_tensor = false;
|
|
437
|
-
{
|
|
438
|
-
const int64_t nx = tensor->ne[0];
|
|
439
|
-
const int64_t ny = tensor->ne[1];
|
|
440
|
-
const int64_t qk_k = ggml_blck_size(new_type);
|
|
652
|
+
return new_type;
|
|
653
|
+
}
|
|
441
654
|
|
|
442
|
-
|
|
443
|
-
|
|
444
|
-
|
|
445
|
-
|
|
446
|
-
|
|
447
|
-
|
|
655
|
+
// outer wrapper: determine the ggml_type that this tensor should be quantized to
|
|
656
|
+
static ggml_type llama_tensor_get_type(quantize_state_impl & qs, const llama_model_quantize_params * params, const ggml_tensor * tensor, ggml_type default_type, const tensor_metadata & tm) {
|
|
657
|
+
if (!tensor_allows_quantization(params, qs.model.arch, tensor)) {
|
|
658
|
+
return tensor->type;
|
|
659
|
+
}
|
|
660
|
+
if (params->token_embedding_type < GGML_TYPE_COUNT && tm.category == tensor_category::TOKEN_EMBD) {
|
|
661
|
+
return params->token_embedding_type;
|
|
662
|
+
}
|
|
663
|
+
if (params->output_tensor_type < GGML_TYPE_COUNT && tm.category == tensor_category::OUTPUT) {
|
|
664
|
+
return params->output_tensor_type;
|
|
448
665
|
}
|
|
449
666
|
|
|
450
|
-
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
|
|
454
|
-
|
|
455
|
-
|
|
456
|
-
|
|
457
|
-
|
|
458
|
-
|
|
459
|
-
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
|
|
667
|
+
ggml_type new_type = default_type;
|
|
668
|
+
|
|
669
|
+
// get more optimal quantization type based on the tensor shape, layer, etc.
|
|
670
|
+
if (!params->pure && ggml_is_quantized(default_type)) {
|
|
671
|
+
// if the user provided tensor types - use those
|
|
672
|
+
bool manual = false;
|
|
673
|
+
if (!qs.tensor_type_patterns.empty()) {
|
|
674
|
+
const std::string tensor_name(tensor->name);
|
|
675
|
+
for (const auto & [pattern, qtype] : qs.tensor_type_patterns) {
|
|
676
|
+
if (std::regex_search(tensor_name, pattern)) {
|
|
677
|
+
if (qtype != new_type) {
|
|
678
|
+
LLAMA_LOG_WARN("%s: %-36s - applying manual override: %s -> %s\n",
|
|
679
|
+
__func__, tensor_name.c_str(), ggml_type_name(new_type), ggml_type_name(qtype));
|
|
680
|
+
new_type = qtype;
|
|
681
|
+
manual = true;
|
|
682
|
+
break;
|
|
683
|
+
}
|
|
684
|
+
}
|
|
685
|
+
}
|
|
468
686
|
}
|
|
469
|
-
|
|
470
|
-
|
|
687
|
+
|
|
688
|
+
// if not manual - use the standard logic for choosing the quantization type based on the selected mixture
|
|
689
|
+
if (!manual) {
|
|
690
|
+
new_type = llama_tensor_get_type_impl(qs, new_type, tensor, params->ftype, tm.category);
|
|
471
691
|
}
|
|
472
|
-
|
|
473
|
-
|
|
692
|
+
|
|
693
|
+
// incompatible tensor shapes are handled here - fallback to a compatible type
|
|
694
|
+
new_type = tensor_type_fallback(qs, tensor, new_type);
|
|
474
695
|
}
|
|
475
696
|
|
|
476
697
|
return new_type;
|
|
477
698
|
}
|
|
478
699
|
|
|
700
|
+
//
|
|
701
|
+
// quantization implementation
|
|
702
|
+
//
|
|
703
|
+
|
|
479
704
|
static size_t llama_tensor_quantize_impl(enum ggml_type new_type, const float * f32_data, void * new_data, const int64_t chunk_size, int64_t nrows, int64_t n_per_row, const float * imatrix, std::vector<std::thread> & workers, const int nthread) {
|
|
480
705
|
if (nthread < 2) {
|
|
481
706
|
// single-thread
|
|
@@ -530,50 +755,85 @@ static size_t llama_tensor_quantize_impl(enum ggml_type new_type, const float *
|
|
|
530
755
|
return new_size;
|
|
531
756
|
}
|
|
532
757
|
|
|
533
|
-
|
|
534
|
-
|
|
535
|
-
|
|
758
|
+
//
|
|
759
|
+
// imatrix requirement check
|
|
760
|
+
//
|
|
761
|
+
|
|
762
|
+
static bool tensor_requires_imatrix(const char * tensor_name, const ggml_type dst_type, const llama_ftype ftype) {
|
|
763
|
+
if (tensor_name_match_token_embd(tensor_name) || tensor_name_match_output_weight(tensor_name)) {
|
|
764
|
+
return false;
|
|
765
|
+
}
|
|
766
|
+
switch (dst_type) {
|
|
767
|
+
case GGML_TYPE_IQ3_XXS:
|
|
768
|
+
case GGML_TYPE_IQ2_XXS:
|
|
769
|
+
case GGML_TYPE_IQ2_XS:
|
|
770
|
+
case GGML_TYPE_IQ2_S:
|
|
771
|
+
case GGML_TYPE_IQ1_M:
|
|
772
|
+
case GGML_TYPE_IQ1_S:
|
|
773
|
+
return true;
|
|
774
|
+
case GGML_TYPE_Q2_K:
|
|
775
|
+
// as a general rule, the k-type quantizations don't require imatrix data.
|
|
776
|
+
// the only exception is Q2_K tensors that are part of a Q2_K_S file.
|
|
777
|
+
return ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S;
|
|
778
|
+
default:
|
|
779
|
+
return false;
|
|
780
|
+
}
|
|
781
|
+
}
|
|
782
|
+
|
|
783
|
+
//
|
|
784
|
+
// given a file type, get the default tensor type
|
|
785
|
+
//
|
|
536
786
|
|
|
537
|
-
|
|
538
|
-
|
|
539
|
-
case
|
|
540
|
-
case
|
|
541
|
-
case
|
|
542
|
-
case
|
|
543
|
-
case
|
|
544
|
-
case
|
|
545
|
-
case
|
|
787
|
+
static ggml_type llama_ftype_get_default_type(llama_ftype ftype) {
|
|
788
|
+
switch (ftype) {
|
|
789
|
+
case LLAMA_FTYPE_MOSTLY_Q4_0: return GGML_TYPE_Q4_0;
|
|
790
|
+
case LLAMA_FTYPE_MOSTLY_Q4_1: return GGML_TYPE_Q4_1;
|
|
791
|
+
case LLAMA_FTYPE_MOSTLY_Q5_0: return GGML_TYPE_Q5_0;
|
|
792
|
+
case LLAMA_FTYPE_MOSTLY_Q5_1: return GGML_TYPE_Q5_1;
|
|
793
|
+
case LLAMA_FTYPE_MOSTLY_Q8_0: return GGML_TYPE_Q8_0;
|
|
794
|
+
case LLAMA_FTYPE_MOSTLY_F16: return GGML_TYPE_F16;
|
|
795
|
+
case LLAMA_FTYPE_MOSTLY_BF16: return GGML_TYPE_BF16;
|
|
796
|
+
case LLAMA_FTYPE_ALL_F32: return GGML_TYPE_F32;
|
|
546
797
|
|
|
547
|
-
case LLAMA_FTYPE_MOSTLY_MXFP4_MOE:
|
|
798
|
+
case LLAMA_FTYPE_MOSTLY_MXFP4_MOE: return GGML_TYPE_MXFP4;
|
|
548
799
|
|
|
549
800
|
// K-quants
|
|
550
801
|
case LLAMA_FTYPE_MOSTLY_Q2_K_S:
|
|
551
|
-
case LLAMA_FTYPE_MOSTLY_Q2_K:
|
|
552
|
-
case LLAMA_FTYPE_MOSTLY_IQ3_XS:
|
|
802
|
+
case LLAMA_FTYPE_MOSTLY_Q2_K: return GGML_TYPE_Q2_K;
|
|
803
|
+
case LLAMA_FTYPE_MOSTLY_IQ3_XS: return GGML_TYPE_IQ3_S;
|
|
553
804
|
case LLAMA_FTYPE_MOSTLY_Q3_K_S:
|
|
554
805
|
case LLAMA_FTYPE_MOSTLY_Q3_K_M:
|
|
555
|
-
case LLAMA_FTYPE_MOSTLY_Q3_K_L:
|
|
806
|
+
case LLAMA_FTYPE_MOSTLY_Q3_K_L: return GGML_TYPE_Q3_K;
|
|
556
807
|
case LLAMA_FTYPE_MOSTLY_Q4_K_S:
|
|
557
|
-
case LLAMA_FTYPE_MOSTLY_Q4_K_M:
|
|
808
|
+
case LLAMA_FTYPE_MOSTLY_Q4_K_M: return GGML_TYPE_Q4_K;
|
|
558
809
|
case LLAMA_FTYPE_MOSTLY_Q5_K_S:
|
|
559
|
-
case LLAMA_FTYPE_MOSTLY_Q5_K_M:
|
|
560
|
-
case LLAMA_FTYPE_MOSTLY_Q6_K:
|
|
561
|
-
case LLAMA_FTYPE_MOSTLY_TQ1_0:
|
|
562
|
-
case LLAMA_FTYPE_MOSTLY_TQ2_0:
|
|
563
|
-
case LLAMA_FTYPE_MOSTLY_IQ2_XXS:
|
|
564
|
-
case LLAMA_FTYPE_MOSTLY_IQ2_XS:
|
|
565
|
-
case LLAMA_FTYPE_MOSTLY_IQ2_S:
|
|
566
|
-
case LLAMA_FTYPE_MOSTLY_IQ2_M:
|
|
567
|
-
case LLAMA_FTYPE_MOSTLY_IQ3_XXS:
|
|
568
|
-
case LLAMA_FTYPE_MOSTLY_IQ1_S:
|
|
569
|
-
case LLAMA_FTYPE_MOSTLY_IQ1_M:
|
|
570
|
-
case LLAMA_FTYPE_MOSTLY_IQ4_NL:
|
|
571
|
-
case LLAMA_FTYPE_MOSTLY_IQ4_XS:
|
|
572
|
-
case LLAMA_FTYPE_MOSTLY_IQ3_S:
|
|
573
|
-
case LLAMA_FTYPE_MOSTLY_IQ3_M:
|
|
810
|
+
case LLAMA_FTYPE_MOSTLY_Q5_K_M: return GGML_TYPE_Q5_K;
|
|
811
|
+
case LLAMA_FTYPE_MOSTLY_Q6_K: return GGML_TYPE_Q6_K;
|
|
812
|
+
case LLAMA_FTYPE_MOSTLY_TQ1_0: return GGML_TYPE_TQ1_0;
|
|
813
|
+
case LLAMA_FTYPE_MOSTLY_TQ2_0: return GGML_TYPE_TQ2_0;
|
|
814
|
+
case LLAMA_FTYPE_MOSTLY_IQ2_XXS: return GGML_TYPE_IQ2_XXS;
|
|
815
|
+
case LLAMA_FTYPE_MOSTLY_IQ2_XS: return GGML_TYPE_IQ2_XS;
|
|
816
|
+
case LLAMA_FTYPE_MOSTLY_IQ2_S: return GGML_TYPE_IQ2_XS;
|
|
817
|
+
case LLAMA_FTYPE_MOSTLY_IQ2_M: return GGML_TYPE_IQ2_S;
|
|
818
|
+
case LLAMA_FTYPE_MOSTLY_IQ3_XXS: return GGML_TYPE_IQ3_XXS;
|
|
819
|
+
case LLAMA_FTYPE_MOSTLY_IQ1_S: return GGML_TYPE_IQ1_S;
|
|
820
|
+
case LLAMA_FTYPE_MOSTLY_IQ1_M: return GGML_TYPE_IQ1_M;
|
|
821
|
+
case LLAMA_FTYPE_MOSTLY_IQ4_NL: return GGML_TYPE_IQ4_NL;
|
|
822
|
+
case LLAMA_FTYPE_MOSTLY_IQ4_XS: return GGML_TYPE_IQ4_XS;
|
|
823
|
+
case LLAMA_FTYPE_MOSTLY_IQ3_S:
|
|
824
|
+
case LLAMA_FTYPE_MOSTLY_IQ3_M: return GGML_TYPE_IQ3_S;
|
|
574
825
|
|
|
575
826
|
default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
|
|
576
827
|
}
|
|
828
|
+
}
|
|
829
|
+
|
|
830
|
+
//
|
|
831
|
+
// main quantization driver
|
|
832
|
+
//
|
|
833
|
+
|
|
834
|
+
static void llama_model_quantize_impl(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
|
|
835
|
+
ggml_type default_type;
|
|
836
|
+
llama_ftype ftype = params->ftype;
|
|
577
837
|
|
|
578
838
|
int nthread = params->nthread;
|
|
579
839
|
|
|
@@ -581,6 +841,8 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
|
|
|
581
841
|
nthread = std::thread::hardware_concurrency();
|
|
582
842
|
}
|
|
583
843
|
|
|
844
|
+
default_type = llama_ftype_get_default_type(ftype);
|
|
845
|
+
|
|
584
846
|
// mmap consistently increases speed on Linux, and also increases speed on Windows with
|
|
585
847
|
// hot cache. It may cause a slowdown on macOS, possibly related to free memory.
|
|
586
848
|
#if defined(__linux__) || defined(_WIN32)
|
|
@@ -596,7 +858,8 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
|
|
|
596
858
|
}
|
|
597
859
|
|
|
598
860
|
std::vector<std::string> splits = {};
|
|
599
|
-
llama_model_loader ml(
|
|
861
|
+
llama_model_loader ml(/*metadata*/ nullptr, /*set_tensor_data*/ nullptr, /*set_tensor_data_ud*/ nullptr,
|
|
862
|
+
fname_inp, splits, use_mmap, /*use_direct_io*/ false, /*check_tensors*/ true, /*no_alloc*/ false, kv_overrides, nullptr);
|
|
600
863
|
ml.init_mappings(false); // no prefetching
|
|
601
864
|
|
|
602
865
|
llama_model model(llama_model_default_params());
|
|
@@ -614,7 +877,8 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
|
|
|
614
877
|
if (params->imatrix) {
|
|
615
878
|
imatrix_data = static_cast<const std::unordered_map<std::string, std::vector<float>>*>(params->imatrix);
|
|
616
879
|
if (imatrix_data) {
|
|
617
|
-
LLAMA_LOG_INFO("
|
|
880
|
+
LLAMA_LOG_INFO("\n%s: have importance matrix data with %d entries\n",
|
|
881
|
+
__func__, (int)imatrix_data->size());
|
|
618
882
|
qs.has_imatrix = true;
|
|
619
883
|
// check imatrix for nans or infs
|
|
620
884
|
for (const auto & kv : *imatrix_data) {
|
|
@@ -636,7 +900,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
|
|
|
636
900
|
}
|
|
637
901
|
|
|
638
902
|
// copy the KV pairs from the input file
|
|
639
|
-
gguf_set_kv (ctx_out.get(), ml.
|
|
903
|
+
gguf_set_kv (ctx_out.get(), ml.metadata);
|
|
640
904
|
gguf_set_val_u32(ctx_out.get(), "general.quantization_version", GGML_QNT_VERSION); // TODO: use LLM_KV
|
|
641
905
|
gguf_set_val_u32(ctx_out.get(), "general.file_type", ftype); // TODO: use LLM_KV
|
|
642
906
|
|
|
@@ -653,7 +917,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
|
|
|
653
917
|
gguf_set_val_f32(ctx_out.get(), o.key, o.val_f64);
|
|
654
918
|
} else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_INT) {
|
|
655
919
|
// Setting type to UINT32. See https://github.com/ggml-org/llama.cpp/pull/14182 for context
|
|
656
|
-
gguf_set_val_u32(ctx_out.get(), o.key, (uint32_t)abs(o.val_i64));
|
|
920
|
+
gguf_set_val_u32(ctx_out.get(), o.key, (uint32_t)std::abs(o.val_i64));
|
|
657
921
|
} else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_BOOL) {
|
|
658
922
|
gguf_set_val_bool(ctx_out.get(), o.key, o.val_bool);
|
|
659
923
|
} else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_STR) {
|
|
@@ -666,7 +930,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
|
|
|
666
930
|
|
|
667
931
|
std::map<int, std::string> mapped;
|
|
668
932
|
int blk_id = 0;
|
|
669
|
-
int pruned_attention_w = 0;
|
|
670
933
|
|
|
671
934
|
// make a list of weights
|
|
672
935
|
std::vector<const llama_model_loader::llama_tensor_weight *> tensors;
|
|
@@ -674,14 +937,11 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
|
|
|
674
937
|
for (const auto & it : ml.weights_map) {
|
|
675
938
|
const std::string remapped_name(remap_layer(it.first, prune_list, mapped, blk_id));
|
|
676
939
|
if (remapped_name.empty()) {
|
|
677
|
-
if (it.first.find("attn_v.weight") != std::string::npos ||
|
|
678
|
-
it.first.find("attn_qkv.weight") != std::string::npos ||
|
|
679
|
-
it.first.find("attn_kv_b.weight") != std::string::npos) {
|
|
680
|
-
pruned_attention_w++;
|
|
681
|
-
}
|
|
682
940
|
LLAMA_LOG_DEBUG("%s: pruning tensor %s\n", __func__, it.first.c_str());
|
|
683
941
|
continue;
|
|
684
|
-
}
|
|
942
|
+
}
|
|
943
|
+
|
|
944
|
+
if (remapped_name != it.first) {
|
|
685
945
|
ggml_set_name(it.second.tensor, remapped_name.c_str());
|
|
686
946
|
LLAMA_LOG_DEBUG("%s: tensor %s remapped to %s\n", __func__, it.first.c_str(), ggml_get_name(it.second.tensor));
|
|
687
947
|
}
|
|
@@ -701,49 +961,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
|
|
|
701
961
|
});
|
|
702
962
|
}
|
|
703
963
|
|
|
704
|
-
for (const auto * it : tensors) {
|
|
705
|
-
const struct ggml_tensor * tensor = it->tensor;
|
|
706
|
-
|
|
707
|
-
const std::string name = ggml_get_name(tensor);
|
|
708
|
-
|
|
709
|
-
// TODO: avoid hardcoded tensor names - use the TN_* constants
|
|
710
|
-
if (name.find("attn_v.weight") != std::string::npos ||
|
|
711
|
-
name.find("attn_qkv.weight") != std::string::npos ||
|
|
712
|
-
name.find("attn_kv_b.weight")!= std::string::npos) {
|
|
713
|
-
++qs.n_attention_wv;
|
|
714
|
-
} else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
|
|
715
|
-
qs.has_output = true;
|
|
716
|
-
}
|
|
717
|
-
}
|
|
718
|
-
|
|
719
|
-
qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer;
|
|
720
|
-
|
|
721
|
-
// sanity checks for models that have attention layers
|
|
722
|
-
if (qs.n_attention_wv != 0)
|
|
723
|
-
{
|
|
724
|
-
const auto & n_head_kv_iter = model.hparams.n_head_kv_arr.begin();
|
|
725
|
-
// attention layers have a non-zero number of kv heads
|
|
726
|
-
int32_t n_attn_layer = model.hparams.n_layer - std::count(n_head_kv_iter, n_head_kv_iter + model.hparams.n_layer, 0);
|
|
727
|
-
if (llama_model_has_encoder(&model)) {
|
|
728
|
-
// now n_attn_layer is the number of attention layers in the encoder
|
|
729
|
-
// for each decoder block, there are 2 attention layers
|
|
730
|
-
n_attn_layer += 2 * model.hparams.dec_n_layer;
|
|
731
|
-
}
|
|
732
|
-
GGML_ASSERT((qs.n_attention_wv == n_attn_layer - pruned_attention_w) && "n_attention_wv is unexpected");
|
|
733
|
-
}
|
|
734
|
-
|
|
735
|
-
size_t total_size_org = 0;
|
|
736
|
-
size_t total_size_new = 0;
|
|
737
|
-
|
|
738
|
-
std::vector<std::thread> workers;
|
|
739
|
-
workers.reserve(nthread);
|
|
740
|
-
|
|
741
964
|
int idx = 0;
|
|
742
|
-
|
|
743
|
-
std::vector<no_init<uint8_t>> read_data;
|
|
744
|
-
std::vector<no_init<uint8_t>> work;
|
|
745
|
-
std::vector<no_init<float>> f32_conv_buf;
|
|
746
|
-
|
|
747
965
|
uint16_t n_split = 1;
|
|
748
966
|
|
|
749
967
|
// Assume split index is continuous
|
|
@@ -755,14 +973,68 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
|
|
|
755
973
|
std::vector<gguf_context_ptr> ctx_outs(n_split);
|
|
756
974
|
ctx_outs[0] = std::move(ctx_out);
|
|
757
975
|
|
|
758
|
-
//
|
|
759
|
-
|
|
976
|
+
// compute tensor metadata once and cache it
|
|
977
|
+
std::vector<tensor_metadata> metadata(tensors.size());
|
|
978
|
+
|
|
979
|
+
// initialize quantization state before preliminary loop (counters for use_more_bits)
|
|
980
|
+
{
|
|
981
|
+
for (size_t i = 0; i < tensors.size(); ++i) {
|
|
982
|
+
const auto cat = tensor_get_category(tensors[i]->tensor->name);
|
|
983
|
+
if (category_is_attn_v(cat)) {
|
|
984
|
+
++qs.n_attention_wv;
|
|
985
|
+
}
|
|
986
|
+
if (cat == tensor_category::OUTPUT) {
|
|
987
|
+
qs.has_tied_embeddings = false;
|
|
988
|
+
}
|
|
989
|
+
metadata[i].category = cat; // save and re-use the category while we're at it
|
|
990
|
+
}
|
|
991
|
+
// these also need to be set to n_layer by default
|
|
992
|
+
qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)qs.model.hparams.n_layer;
|
|
993
|
+
}
|
|
994
|
+
|
|
995
|
+
// flag for --dry-run
|
|
996
|
+
bool will_require_imatrix = false;
|
|
997
|
+
|
|
998
|
+
//
|
|
999
|
+
// preliminary iteration over all weights
|
|
1000
|
+
//
|
|
1001
|
+
|
|
1002
|
+
for (size_t i = 0; i < tensors.size(); ++i) {
|
|
1003
|
+
const auto * it = tensors[i];
|
|
1004
|
+
const struct ggml_tensor * tensor = it->tensor;
|
|
1005
|
+
const std::string name = ggml_get_name(tensor);
|
|
1006
|
+
|
|
760
1007
|
uint16_t i_split = params->keep_split ? it->idx : 0;
|
|
761
|
-
ggml_tensor * tensor = it->tensor;
|
|
762
1008
|
if (!ctx_outs[i_split]) {
|
|
763
1009
|
ctx_outs[i_split].reset(gguf_init_empty());
|
|
764
1010
|
}
|
|
765
1011
|
gguf_add_tensor(ctx_outs[i_split].get(), tensor);
|
|
1012
|
+
|
|
1013
|
+
metadata[i].allows_quantization = tensor_allows_quantization(params, model.arch, tensor);
|
|
1014
|
+
|
|
1015
|
+
if (metadata[i].allows_quantization) {
|
|
1016
|
+
metadata[i].target_type = llama_tensor_get_type(qs, params, tensor, default_type, metadata[i]);
|
|
1017
|
+
} else {
|
|
1018
|
+
metadata[i].target_type = tensor->type;
|
|
1019
|
+
}
|
|
1020
|
+
|
|
1021
|
+
metadata[i].requires_imatrix = tensor_requires_imatrix(tensor->name, metadata[i].target_type, ftype);
|
|
1022
|
+
|
|
1023
|
+
if (params->imatrix) {
|
|
1024
|
+
metadata[i].remapped_imatrix_name = remap_imatrix(tensor->name, mapped);
|
|
1025
|
+
} else if (metadata[i].allows_quantization && metadata[i].requires_imatrix) {
|
|
1026
|
+
if (params->dry_run) {
|
|
1027
|
+
will_require_imatrix = true;
|
|
1028
|
+
} else {
|
|
1029
|
+
LLAMA_LOG_ERROR("\n============================================================================\n"
|
|
1030
|
+
" ERROR: this quantization requires an importance matrix!\n"
|
|
1031
|
+
" - offending tensor: %s\n"
|
|
1032
|
+
" - target type: %s\n"
|
|
1033
|
+
"============================================================================\n\n",
|
|
1034
|
+
name.c_str(), ggml_type_name(metadata[i].target_type));
|
|
1035
|
+
throw std::runtime_error("this quantization requires an imatrix!");
|
|
1036
|
+
}
|
|
1037
|
+
}
|
|
766
1038
|
}
|
|
767
1039
|
|
|
768
1040
|
// Set split info if needed
|
|
@@ -774,6 +1046,16 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
|
|
|
774
1046
|
}
|
|
775
1047
|
}
|
|
776
1048
|
|
|
1049
|
+
size_t total_size_org = 0;
|
|
1050
|
+
size_t total_size_new = 0;
|
|
1051
|
+
|
|
1052
|
+
std::vector<std::thread> workers;
|
|
1053
|
+
workers.reserve(nthread);
|
|
1054
|
+
|
|
1055
|
+
std::vector<no_init<uint8_t>> read_data;
|
|
1056
|
+
std::vector<no_init<uint8_t>> work;
|
|
1057
|
+
std::vector<no_init<float>> f32_conv_buf;
|
|
1058
|
+
|
|
777
1059
|
int cur_split = -1;
|
|
778
1060
|
std::ofstream fout;
|
|
779
1061
|
auto close_ofstream = [&]() {
|
|
@@ -803,248 +1085,182 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
|
|
|
803
1085
|
::zeros(fout, meta_size);
|
|
804
1086
|
};
|
|
805
1087
|
|
|
806
|
-
|
|
807
|
-
|
|
808
|
-
|
|
809
|
-
|
|
1088
|
+
// no output file for --dry-run
|
|
1089
|
+
if (!params->dry_run) {
|
|
1090
|
+
new_ofstream(0);
|
|
1091
|
+
}
|
|
1092
|
+
|
|
1093
|
+
//
|
|
1094
|
+
// main loop: iterate over all weights
|
|
1095
|
+
//
|
|
1096
|
+
|
|
1097
|
+
for (size_t i = 0; i < tensors.size(); ++i) {
|
|
1098
|
+
const auto & weight = *tensors[i];
|
|
1099
|
+
const auto & tm = metadata[i];
|
|
810
1100
|
ggml_tensor * tensor = weight.tensor;
|
|
811
|
-
|
|
1101
|
+
|
|
1102
|
+
if (!params->dry_run && (weight.idx != cur_split && params->keep_split)) {
|
|
812
1103
|
close_ofstream();
|
|
813
1104
|
new_ofstream(weight.idx);
|
|
814
1105
|
}
|
|
815
1106
|
|
|
816
1107
|
const std::string name = ggml_get_name(tensor);
|
|
1108
|
+
const size_t tensor_size = ggml_nbytes(tensor);
|
|
817
1109
|
|
|
818
|
-
if (!
|
|
819
|
-
if (
|
|
820
|
-
read_data.
|
|
1110
|
+
if (!params->dry_run) {
|
|
1111
|
+
if (!ml.use_mmap) {
|
|
1112
|
+
if (read_data.size() < tensor_size) {
|
|
1113
|
+
read_data.resize(tensor_size);
|
|
1114
|
+
}
|
|
1115
|
+
tensor->data = read_data.data();
|
|
821
1116
|
}
|
|
822
|
-
|
|
1117
|
+
ml.load_data_for(tensor);
|
|
823
1118
|
}
|
|
824
|
-
ml.load_data_for(tensor);
|
|
825
1119
|
|
|
826
|
-
LLAMA_LOG_INFO("[%4d/%4d]
|
|
1120
|
+
LLAMA_LOG_INFO("[%4d/%4d] %-36s - [%s], type = %6s, ",
|
|
827
1121
|
++idx, ml.n_tensors,
|
|
828
1122
|
ggml_get_name(tensor),
|
|
829
1123
|
llama_format_tensor_shape(tensor).c_str(),
|
|
830
1124
|
ggml_type_name(tensor->type));
|
|
831
1125
|
|
|
832
|
-
|
|
833
|
-
|
|
834
|
-
|
|
835
|
-
// quantize
|
|
836
|
-
|
|
837
|
-
|
|
838
|
-
|
|
839
|
-
quantize &= name.find("_norm.weight") == std::string::npos;
|
|
840
|
-
|
|
841
|
-
quantize &= params->quantize_output_tensor || name != "output.weight";
|
|
842
|
-
quantize &= !params->only_copy;
|
|
843
|
-
|
|
844
|
-
// do not quantize expert gating tensors
|
|
845
|
-
// NOTE: can't use LLM_TN here because the layer number is not known
|
|
846
|
-
quantize &= name.find("ffn_gate_inp.weight") == std::string::npos;
|
|
847
|
-
|
|
848
|
-
// these are very small (e.g. 4x4)
|
|
849
|
-
quantize &= name.find("altup") == std::string::npos;
|
|
850
|
-
quantize &= name.find("laurel") == std::string::npos;
|
|
851
|
-
|
|
852
|
-
// these are not too big so keep them as it is
|
|
853
|
-
quantize &= name.find("per_layer_model_proj") == std::string::npos;
|
|
854
|
-
|
|
855
|
-
// do not quantize positional embeddings and token types (BERT)
|
|
856
|
-
quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_POS_EMBD, "weight");
|
|
857
|
-
quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_TOKEN_TYPES, "weight");
|
|
858
|
-
|
|
859
|
-
// do not quantize Mamba's small yet 2D weights
|
|
860
|
-
// NOTE: can't use LLM_TN here because the layer number is not known
|
|
861
|
-
quantize &= name.find("ssm_conv1d.weight") == std::string::npos;
|
|
862
|
-
quantize &= name.find("shortconv.conv.weight") == std::string::npos;
|
|
863
|
-
|
|
864
|
-
// do not quantize RWKV's small yet 2D weights
|
|
865
|
-
quantize &= name.find("time_mix_first.weight") == std::string::npos;
|
|
866
|
-
quantize &= name.find("time_mix_w0.weight") == std::string::npos;
|
|
867
|
-
quantize &= name.find("time_mix_w1.weight") == std::string::npos;
|
|
868
|
-
quantize &= name.find("time_mix_w2.weight") == std::string::npos;
|
|
869
|
-
quantize &= name.find("time_mix_v0.weight") == std::string::npos;
|
|
870
|
-
quantize &= name.find("time_mix_v1.weight") == std::string::npos;
|
|
871
|
-
quantize &= name.find("time_mix_v2.weight") == std::string::npos;
|
|
872
|
-
quantize &= name.find("time_mix_a0.weight") == std::string::npos;
|
|
873
|
-
quantize &= name.find("time_mix_a1.weight") == std::string::npos;
|
|
874
|
-
quantize &= name.find("time_mix_a2.weight") == std::string::npos;
|
|
875
|
-
quantize &= name.find("time_mix_g1.weight") == std::string::npos;
|
|
876
|
-
quantize &= name.find("time_mix_g2.weight") == std::string::npos;
|
|
877
|
-
quantize &= name.find("time_mix_decay_w1.weight") == std::string::npos;
|
|
878
|
-
quantize &= name.find("time_mix_decay_w2.weight") == std::string::npos;
|
|
879
|
-
quantize &= name.find("time_mix_lerp_fused.weight") == std::string::npos;
|
|
880
|
-
|
|
881
|
-
// do not quantize relative position bias (T5)
|
|
882
|
-
quantize &= name.find("attn_rel_b.weight") == std::string::npos;
|
|
883
|
-
|
|
884
|
-
ggml_type new_type;
|
|
1126
|
+
const ggml_type cur_type = tensor->type;
|
|
1127
|
+
const ggml_type new_type = tm.target_type;
|
|
1128
|
+
|
|
1129
|
+
// If we've decided to quantize to the same type the tensor is already
|
|
1130
|
+
// in then there's nothing to do.
|
|
1131
|
+
bool quantize = cur_type != new_type;
|
|
1132
|
+
|
|
885
1133
|
void * new_data;
|
|
886
1134
|
size_t new_size;
|
|
887
1135
|
|
|
888
|
-
if (
|
|
889
|
-
|
|
890
|
-
|
|
891
|
-
|
|
892
|
-
|
|
893
|
-
|
|
894
|
-
|
|
895
|
-
|
|
896
|
-
if (
|
|
897
|
-
|
|
898
|
-
const std::string tensor_name(tensor->name);
|
|
899
|
-
for (const auto & [tname, qtype] : tensor_types) {
|
|
900
|
-
if (std::regex pattern(tname); std::regex_search(tensor_name, pattern)) {
|
|
901
|
-
if (qtype != new_type) {
|
|
902
|
-
LLAMA_LOG_DEBUG("(overriding %s) ", ggml_type_name(new_type));
|
|
903
|
-
new_type = qtype; // if two or more types are specified for the same tensor, the last match wins
|
|
904
|
-
}
|
|
905
|
-
}
|
|
906
|
-
}
|
|
1136
|
+
if (params->dry_run) {
|
|
1137
|
+
// the --dry-run option calculates the final quantization size without quantizing
|
|
1138
|
+
if (quantize) {
|
|
1139
|
+
new_size = ggml_nrows(tensor) * ggml_row_size(new_type, tensor->ne[0]);
|
|
1140
|
+
LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB (%s)\n",
|
|
1141
|
+
tensor_size/1024.0/1024.0,
|
|
1142
|
+
new_size/1024.0/1024.0,
|
|
1143
|
+
ggml_type_name(new_type));
|
|
1144
|
+
if (!will_require_imatrix && tm.requires_imatrix) {
|
|
1145
|
+
will_require_imatrix = true;
|
|
907
1146
|
}
|
|
1147
|
+
} else {
|
|
1148
|
+
new_size = tensor_size;
|
|
1149
|
+
LLAMA_LOG_INFO("size = %8.3f MiB\n", new_size/1024.0/1024.0);
|
|
908
1150
|
}
|
|
909
|
-
|
|
910
|
-
|
|
911
|
-
|
|
912
|
-
if (params->output_tensor_type < GGML_TYPE_COUNT && strcmp(tensor->name, "output.weight") == 0) {
|
|
913
|
-
new_type = params->output_tensor_type;
|
|
914
|
-
}
|
|
915
|
-
|
|
916
|
-
// If we've decided to quantize to the same type the tensor is already
|
|
917
|
-
// in then there's nothing to do.
|
|
918
|
-
quantize = tensor->type != new_type;
|
|
919
|
-
}
|
|
920
|
-
|
|
921
|
-
if (!quantize) {
|
|
922
|
-
new_type = tensor->type;
|
|
923
|
-
new_data = tensor->data;
|
|
924
|
-
new_size = ggml_nbytes(tensor);
|
|
925
|
-
LLAMA_LOG_INFO("size = %8.3f MiB\n", ggml_nbytes(tensor)/1024.0/1024.0);
|
|
1151
|
+
total_size_org += tensor_size;
|
|
1152
|
+
total_size_new += new_size;
|
|
1153
|
+
continue;
|
|
926
1154
|
} else {
|
|
927
|
-
|
|
1155
|
+
// no --dry-run, perform quantization
|
|
1156
|
+
if (!quantize) {
|
|
1157
|
+
new_data = tensor->data;
|
|
1158
|
+
new_size = tensor_size;
|
|
1159
|
+
LLAMA_LOG_INFO("size = %8.3f MiB\n", tensor_size/1024.0/1024.0);
|
|
1160
|
+
} else {
|
|
1161
|
+
const int64_t nelements = ggml_nelements(tensor);
|
|
928
1162
|
|
|
929
|
-
|
|
930
|
-
|
|
931
|
-
|
|
932
|
-
|
|
933
|
-
|
|
934
|
-
} else {
|
|
935
|
-
if (it->second.size() == (size_t)tensor->ne[0]*tensor->ne[2]) {
|
|
936
|
-
imatrix = it->second.data();
|
|
1163
|
+
const float * imatrix = nullptr;
|
|
1164
|
+
if (imatrix_data) {
|
|
1165
|
+
auto it = imatrix_data->find(tm.remapped_imatrix_name);
|
|
1166
|
+
if (it == imatrix_data->end()) {
|
|
1167
|
+
LLAMA_LOG_INFO("\n====== %s: did not find weights for %s\n", __func__, tensor->name);
|
|
937
1168
|
} else {
|
|
938
|
-
|
|
939
|
-
|
|
940
|
-
|
|
941
|
-
|
|
942
|
-
|
|
943
|
-
|
|
944
|
-
|
|
945
|
-
|
|
946
|
-
|
|
947
|
-
|
|
1169
|
+
if (it->second.size() == (size_t)tensor->ne[0]*tensor->ne[2]) {
|
|
1170
|
+
imatrix = it->second.data();
|
|
1171
|
+
} else {
|
|
1172
|
+
LLAMA_LOG_INFO("\n====== %s: imatrix size %d is different from tensor size %d for %s\n", __func__,
|
|
1173
|
+
int(it->second.size()), int(tensor->ne[0]*tensor->ne[2]), tensor->name);
|
|
1174
|
+
|
|
1175
|
+
// this can happen when quantizing an old mixtral model with split tensors with a new incompatible imatrix
|
|
1176
|
+
// this is a significant error and it may be good idea to abort the process if this happens,
|
|
1177
|
+
// since many people will miss the error and not realize that most of the model is being quantized without an imatrix
|
|
1178
|
+
// tok_embd should be ignored in this case, since it always causes this warning
|
|
1179
|
+
if (!tensor_name_match_token_embd(tensor->name)) {
|
|
1180
|
+
throw std::runtime_error(format("imatrix size %d is different from tensor size %d for %s",
|
|
1181
|
+
int(it->second.size()), int(tensor->ne[0]*tensor->ne[2]), tensor->name));
|
|
1182
|
+
}
|
|
948
1183
|
}
|
|
949
1184
|
}
|
|
950
1185
|
}
|
|
951
|
-
|
|
952
|
-
|
|
953
|
-
|
|
954
|
-
|
|
955
|
-
|
|
956
|
-
|
|
957
|
-
|
|
958
|
-
LLAMA_LOG_ERROR("\n\n============================================================\n");
|
|
959
|
-
LLAMA_LOG_ERROR("Missing importance matrix for tensor %s in a very low-bit quantization\n", tensor->name);
|
|
960
|
-
LLAMA_LOG_ERROR("The result will be garbage, so bailing out\n");
|
|
961
|
-
LLAMA_LOG_ERROR("============================================================\n\n");
|
|
962
|
-
throw std::runtime_error(format("Missing importance matrix for tensor %s in a very low-bit quantization", tensor->name));
|
|
963
|
-
}
|
|
1186
|
+
if (!imatrix && tm.requires_imatrix) {
|
|
1187
|
+
LLAMA_LOG_ERROR("\n\n============================================================\n");
|
|
1188
|
+
LLAMA_LOG_ERROR("Missing importance matrix for tensor %s in a very low-bit quantization\n", tensor->name);
|
|
1189
|
+
LLAMA_LOG_ERROR("The result will be garbage, so bailing out\n");
|
|
1190
|
+
LLAMA_LOG_ERROR("============================================================\n\n");
|
|
1191
|
+
throw std::runtime_error(format("Missing importance matrix for tensor %s in a very low-bit quantization", tensor->name));
|
|
1192
|
+
}
|
|
964
1193
|
|
|
965
|
-
|
|
1194
|
+
float * f32_data;
|
|
966
1195
|
|
|
967
|
-
|
|
968
|
-
|
|
969
|
-
|
|
970
|
-
|
|
971
|
-
|
|
972
|
-
|
|
973
|
-
|
|
974
|
-
|
|
1196
|
+
if (tensor->type == GGML_TYPE_F32) {
|
|
1197
|
+
f32_data = (float *) tensor->data;
|
|
1198
|
+
} else if (ggml_is_quantized(tensor->type) && !params->allow_requantize) {
|
|
1199
|
+
throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor->type)));
|
|
1200
|
+
} else {
|
|
1201
|
+
llama_tensor_dequantize_impl(tensor, f32_conv_buf, workers, nelements, nthread);
|
|
1202
|
+
f32_data = (float *) f32_conv_buf.data();
|
|
1203
|
+
}
|
|
975
1204
|
|
|
976
|
-
|
|
977
|
-
|
|
1205
|
+
LLAMA_LOG_INFO("converting to %s .. ", ggml_type_name(new_type));
|
|
1206
|
+
fflush(stdout);
|
|
978
1207
|
|
|
979
|
-
|
|
980
|
-
|
|
981
|
-
}
|
|
982
|
-
new_data = work.data();
|
|
983
|
-
|
|
984
|
-
const int64_t n_per_row = tensor->ne[0];
|
|
985
|
-
const int64_t nrows = tensor->ne[1];
|
|
986
|
-
|
|
987
|
-
static const int64_t min_chunk_size = 32 * 512;
|
|
988
|
-
const int64_t chunk_size = (n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row));
|
|
989
|
-
|
|
990
|
-
const int64_t nelements_matrix = tensor->ne[0] * tensor->ne[1];
|
|
991
|
-
const int64_t nchunk = (nelements_matrix + chunk_size - 1)/chunk_size;
|
|
992
|
-
const int64_t nthread_use = nthread > 1 ? std::max((int64_t)1, std::min((int64_t)nthread, nchunk)) : 1;
|
|
993
|
-
|
|
994
|
-
// quantize each expert separately since they have different importance matrices
|
|
995
|
-
new_size = 0;
|
|
996
|
-
for (int64_t i03 = 0; i03 < tensor->ne[2]; ++i03) {
|
|
997
|
-
const float * f32_data_03 = f32_data + i03 * nelements_matrix;
|
|
998
|
-
void * new_data_03 = (char *)new_data + ggml_row_size(new_type, n_per_row) * i03 * nrows;
|
|
999
|
-
const float * imatrix_03 = imatrix ? imatrix + i03 * n_per_row : nullptr;
|
|
1000
|
-
|
|
1001
|
-
new_size += llama_tensor_quantize_impl(new_type, f32_data_03, new_data_03, chunk_size, nrows, n_per_row, imatrix_03, workers, nthread_use);
|
|
1002
|
-
|
|
1003
|
-
// TODO: temporary sanity check that the F16 -> MXFP4 is lossless
|
|
1004
|
-
#if 0
|
|
1005
|
-
if (new_type == GGML_TYPE_MXFP4) {
|
|
1006
|
-
auto * x = f32_data_03;
|
|
1007
|
-
|
|
1008
|
-
//LLAMA_LOG_INFO("nrows = %d, n_per_row = %d\n", nrows, n_per_row);
|
|
1009
|
-
std::vector<float> deq(nrows*n_per_row);
|
|
1010
|
-
const ggml_type_traits * qtype = ggml_get_type_traits(new_type);
|
|
1011
|
-
qtype->to_float(new_data_03, deq.data(), deq.size());
|
|
1012
|
-
|
|
1013
|
-
double err = 0.0f;
|
|
1014
|
-
for (int i = 0; i < (int) deq.size(); ++i) {
|
|
1015
|
-
err += fabsf(deq[i] - x[i]);
|
|
1016
|
-
//if (fabsf(deq[i] - x[i]) > 0.00001 && i < 256) {
|
|
1017
|
-
if (deq[i] != x[i]) {
|
|
1018
|
-
LLAMA_LOG_INFO("deq[%d] = %f, x[%d] = %f\n", i, deq[i], i, x[i]);
|
|
1019
|
-
}
|
|
1020
|
-
}
|
|
1021
|
-
//LLAMA_LOG_INFO("err = %f\n", err);
|
|
1022
|
-
GGML_ASSERT(err == 0.00000);
|
|
1208
|
+
if (work.size() < (size_t)nelements * 4) {
|
|
1209
|
+
work.resize(nelements * 4); // upper bound on size
|
|
1023
1210
|
}
|
|
1024
|
-
|
|
1025
|
-
|
|
1026
|
-
|
|
1027
|
-
|
|
1028
|
-
total_size_org += ggml_nbytes(tensor);
|
|
1029
|
-
total_size_new += new_size;
|
|
1211
|
+
new_data = work.data();
|
|
1212
|
+
|
|
1213
|
+
const int64_t n_per_row = tensor->ne[0];
|
|
1214
|
+
const int64_t nrows = tensor->ne[1];
|
|
1030
1215
|
|
|
1031
|
-
|
|
1032
|
-
|
|
1033
|
-
GGML_ASSERT(gguf_get_tensor_size(ctx_outs[cur_split].get(), gguf_find_tensor(ctx_outs[cur_split].get(), name.c_str())) == new_size);
|
|
1034
|
-
gguf_set_tensor_data(ctx_outs[cur_split].get(), name.c_str(), new_data);
|
|
1216
|
+
static const int64_t min_chunk_size = 32 * 512;
|
|
1217
|
+
const int64_t chunk_size = (n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row));
|
|
1035
1218
|
|
|
1036
|
-
|
|
1037
|
-
|
|
1038
|
-
|
|
1219
|
+
const int64_t nelements_matrix = tensor->ne[0] * tensor->ne[1];
|
|
1220
|
+
const int64_t nchunk = (nelements_matrix + chunk_size - 1)/chunk_size;
|
|
1221
|
+
const int64_t nthread_use = nthread > 1 ? std::max((int64_t)1, std::min((int64_t)nthread, nchunk)) : 1;
|
|
1222
|
+
|
|
1223
|
+
// quantize each expert separately since they have different importance matrices
|
|
1224
|
+
new_size = 0;
|
|
1225
|
+
for (int64_t i03 = 0; i03 < tensor->ne[2]; ++i03) {
|
|
1226
|
+
const float * f32_data_03 = f32_data + i03 * nelements_matrix;
|
|
1227
|
+
void * new_data_03 = (char *)new_data + ggml_row_size(new_type, n_per_row) * i03 * nrows;
|
|
1228
|
+
const float * imatrix_03 = imatrix ? imatrix + i03 * n_per_row : nullptr;
|
|
1229
|
+
|
|
1230
|
+
new_size += llama_tensor_quantize_impl(new_type, f32_data_03, new_data_03, chunk_size, nrows, n_per_row, imatrix_03, workers, nthread_use);
|
|
1231
|
+
}
|
|
1232
|
+
LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB\n", tensor_size/1024.0/1024.0, new_size/1024.0/1024.0);
|
|
1233
|
+
}
|
|
1234
|
+
total_size_org += tensor_size;
|
|
1235
|
+
total_size_new += new_size;
|
|
1236
|
+
|
|
1237
|
+
// update the gguf meta data as we go
|
|
1238
|
+
gguf_set_tensor_type(ctx_outs[cur_split].get(), name.c_str(), new_type);
|
|
1239
|
+
GGML_ASSERT(gguf_get_tensor_size(ctx_outs[cur_split].get(), gguf_find_tensor(ctx_outs[cur_split].get(), name.c_str())) == new_size);
|
|
1240
|
+
gguf_set_tensor_data(ctx_outs[cur_split].get(), name.c_str(), new_data);
|
|
1241
|
+
|
|
1242
|
+
// write tensor data + padding
|
|
1243
|
+
fout.write((const char *) new_data, new_size);
|
|
1244
|
+
zeros(fout, GGML_PAD(new_size, align) - new_size);
|
|
1245
|
+
} // no --dry-run
|
|
1246
|
+
} // main loop
|
|
1247
|
+
|
|
1248
|
+
if (!params->dry_run) {
|
|
1249
|
+
close_ofstream();
|
|
1039
1250
|
}
|
|
1040
|
-
close_ofstream();
|
|
1041
1251
|
|
|
1042
|
-
LLAMA_LOG_INFO("%s: model size = %8.2f MiB\n", __func__, total_size_org/1024.0/1024.0);
|
|
1043
|
-
LLAMA_LOG_INFO("%s: quant size = %8.2f MiB\n", __func__, total_size_new/1024.0/1024.0);
|
|
1252
|
+
LLAMA_LOG_INFO("%s: model size = %8.2f MiB (%.2f BPW)\n", __func__, total_size_org/1024.0/1024.0, total_size_org*8.0/ml.n_elements);
|
|
1253
|
+
LLAMA_LOG_INFO("%s: quant size = %8.2f MiB (%.2f BPW)\n", __func__, total_size_new/1024.0/1024.0, total_size_new*8.0/ml.n_elements);
|
|
1254
|
+
|
|
1255
|
+
if (!params->imatrix && params->dry_run && will_require_imatrix) {
|
|
1256
|
+
LLAMA_LOG_WARN("%s: WARNING: dry run completed successfully, but actually completing this quantization will require an imatrix!\n",
|
|
1257
|
+
__func__
|
|
1258
|
+
);
|
|
1259
|
+
}
|
|
1044
1260
|
|
|
1045
1261
|
if (qs.n_fallback > 0) {
|
|
1046
1262
|
LLAMA_LOG_WARN("%s: WARNING: %d of %d tensor(s) required fallback quantization\n",
|
|
1047
|
-
__func__, qs.n_fallback,
|
|
1263
|
+
__func__, qs.n_fallback, ml.n_tensors);
|
|
1048
1264
|
}
|
|
1049
1265
|
}
|
|
1050
1266
|
|
|
@@ -1063,6 +1279,7 @@ llama_model_quantize_params llama_model_quantize_default_params() {
|
|
|
1063
1279
|
/*.only_copy =*/ false,
|
|
1064
1280
|
/*.pure =*/ false,
|
|
1065
1281
|
/*.keep_split =*/ false,
|
|
1282
|
+
/*.dry_run =*/ false,
|
|
1066
1283
|
/*.imatrix =*/ nullptr,
|
|
1067
1284
|
/*.kv_overrides =*/ nullptr,
|
|
1068
1285
|
/*.tensor_type =*/ nullptr,
|