whispercpp 1.3.5 → 1.3.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/LICENSE +1 -1
- data/README.md +99 -2
- data/ext/extconf.rb +1 -0
- data/ext/ruby_whisper.c +20 -4
- data/ext/ruby_whisper.h +30 -2
- data/ext/ruby_whisper_context.c +216 -124
- data/ext/ruby_whisper_context_params.c +163 -0
- data/ext/ruby_whisper_model.c +0 -1
- data/ext/ruby_whisper_params.c +0 -1
- data/ext/ruby_whisper_segment.c +0 -1
- data/ext/ruby_whisper_token.c +29 -9
- data/ext/ruby_whisper_transcribe.cpp +4 -1
- data/ext/ruby_whisper_vad_context.c +48 -1
- data/ext/ruby_whisper_vad_context_detect.cpp +6 -5
- data/ext/ruby_whisper_vad_params.c +0 -1
- data/ext/ruby_whisper_vad_segment.c +0 -1
- data/ext/ruby_whisper_vad_segments.c +0 -1
- data/ext/sources/CMakeLists.txt +1 -1
- data/ext/sources/bindings/javascript/package.json +1 -1
- data/ext/sources/cmake/whisper-config.cmake.in +5 -40
- data/ext/sources/examples/bench/bench.cpp +23 -18
- data/ext/sources/examples/cli/cli.cpp +8 -0
- data/ext/sources/examples/common-ggml.cpp +2 -0
- data/ext/sources/examples/miniaudio.h +4507 -2131
- data/ext/sources/examples/server/server.cpp +18 -4
- data/ext/sources/examples/talk-llama/CMakeLists.txt +3 -2
- data/ext/sources/examples/talk-llama/llama-adapter.cpp +7 -13
- data/ext/sources/examples/talk-llama/llama-adapter.h +4 -3
- data/ext/sources/examples/talk-llama/llama-arch.cpp +335 -17
- data/ext/sources/examples/talk-llama/llama-arch.h +42 -0
- data/ext/sources/examples/talk-llama/llama-batch.cpp +3 -1
- data/ext/sources/examples/talk-llama/llama-chat.cpp +21 -1
- data/ext/sources/examples/talk-llama/llama-chat.h +1 -0
- data/ext/sources/examples/talk-llama/llama-context.cpp +508 -520
- data/ext/sources/examples/talk-llama/llama-context.h +27 -28
- data/ext/sources/examples/talk-llama/llama-cparams.h +5 -0
- data/ext/sources/examples/talk-llama/llama-ext.h +12 -0
- data/ext/sources/examples/talk-llama/llama-grammar.cpp +8 -8
- data/ext/sources/examples/talk-llama/llama-graph.cpp +583 -130
- data/ext/sources/examples/talk-llama/llama-graph.h +131 -10
- data/ext/sources/examples/talk-llama/llama-hparams.cpp +57 -40
- data/ext/sources/examples/talk-llama/llama-hparams.h +79 -10
- data/ext/sources/examples/talk-llama/llama-impl.cpp +4 -4
- data/ext/sources/examples/talk-llama/llama-impl.h +13 -1
- data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.cpp +3 -1
- data/ext/sources/examples/talk-llama/llama-kv-cache.cpp +274 -89
- data/ext/sources/examples/talk-llama/llama-kv-cache.h +2 -3
- data/ext/sources/examples/talk-llama/llama-memory-hybrid-iswa.cpp +275 -0
- data/ext/sources/examples/talk-llama/llama-memory-hybrid-iswa.h +140 -0
- data/ext/sources/examples/talk-llama/llama-memory-recurrent.cpp +11 -13
- data/ext/sources/examples/talk-llama/llama-mmap.cpp +28 -11
- data/ext/sources/examples/talk-llama/llama-model-loader.cpp +527 -119
- data/ext/sources/examples/talk-llama/llama-model-loader.h +35 -5
- data/ext/sources/examples/talk-llama/llama-model-saver.cpp +60 -46
- data/ext/sources/examples/talk-llama/llama-model-saver.h +5 -2
- data/ext/sources/examples/talk-llama/llama-model.cpp +1365 -647
- data/ext/sources/examples/talk-llama/llama-model.h +72 -19
- data/ext/sources/examples/talk-llama/llama-quant.cpp +578 -346
- data/ext/sources/examples/talk-llama/{llama-sampling.cpp → llama-sampler.cpp} +190 -76
- data/ext/sources/examples/talk-llama/{llama-sampling.h → llama-sampler.h} +0 -2
- data/ext/sources/examples/talk-llama/llama-vocab.cpp +118 -48
- data/ext/sources/examples/talk-llama/llama-vocab.h +5 -0
- data/ext/sources/examples/talk-llama/llama.cpp +76 -22
- data/ext/sources/examples/talk-llama/llama.h +63 -30
- data/ext/sources/examples/talk-llama/models/afmoe.cpp +2 -3
- data/ext/sources/examples/talk-llama/models/apertus.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/arcee.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/arctic.cpp +4 -5
- data/ext/sources/examples/talk-llama/models/baichuan.cpp +4 -3
- data/ext/sources/examples/talk-llama/models/bailingmoe.cpp +1 -2
- data/ext/sources/examples/talk-llama/models/bailingmoe2.cpp +3 -5
- data/ext/sources/examples/talk-llama/models/bert.cpp +13 -7
- data/ext/sources/examples/talk-llama/models/bitnet.cpp +9 -24
- data/ext/sources/examples/talk-llama/models/bloom.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/chameleon.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/chatglm.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/codeshell.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/cogvlm.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/cohere2-iswa.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/command-r.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/dbrx.cpp +4 -5
- data/ext/sources/examples/talk-llama/models/deci.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/deepseek.cpp +4 -6
- data/ext/sources/examples/talk-llama/models/deepseek2.cpp +24 -21
- data/ext/sources/examples/talk-llama/models/delta-net-base.cpp +445 -0
- data/ext/sources/examples/talk-llama/models/dots1.cpp +4 -6
- data/ext/sources/examples/talk-llama/models/dream.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/ernie4-5-moe.cpp +4 -6
- data/ext/sources/examples/talk-llama/models/ernie4-5.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/eurobert.cpp +97 -0
- data/ext/sources/examples/talk-llama/models/exaone-moe.cpp +145 -0
- data/ext/sources/examples/talk-llama/models/exaone.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/exaone4.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/falcon-h1.cpp +2 -4
- data/ext/sources/examples/talk-llama/models/falcon.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/gemma-embedding.cpp +1 -1
- data/ext/sources/examples/talk-llama/models/gemma.cpp +1 -1
- data/ext/sources/examples/talk-llama/models/gemma2-iswa.cpp +1 -1
- data/ext/sources/examples/talk-llama/models/gemma3.cpp +1 -1
- data/ext/sources/examples/talk-llama/models/gemma3n-iswa.cpp +7 -7
- data/ext/sources/examples/talk-llama/models/glm4-moe.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/glm4.cpp +14 -7
- data/ext/sources/examples/talk-llama/models/gpt2.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/gptneox.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/granite-hybrid.cpp +4 -5
- data/ext/sources/examples/talk-llama/models/granite.cpp +4 -5
- data/ext/sources/examples/talk-llama/models/grok.cpp +4 -4
- data/ext/sources/examples/talk-llama/models/grovemoe.cpp +5 -7
- data/ext/sources/examples/talk-llama/models/hunyuan-dense.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/hunyuan-moe.cpp +4 -5
- data/ext/sources/examples/talk-llama/models/internlm2.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/jais.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/jais2.cpp +123 -0
- data/ext/sources/examples/talk-llama/models/jamba.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/kimi-linear.cpp +381 -0
- data/ext/sources/examples/talk-llama/models/lfm2.cpp +145 -124
- data/ext/sources/examples/talk-llama/models/llada-moe.cpp +4 -4
- data/ext/sources/examples/talk-llama/models/llada.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/llama-iswa.cpp +4 -4
- data/ext/sources/examples/talk-llama/models/llama.cpp +18 -11
- data/ext/sources/examples/talk-llama/models/maincoder.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/{graph-context-mamba.cpp → mamba-base.cpp} +9 -3
- data/ext/sources/examples/talk-llama/models/mamba.cpp +1 -2
- data/ext/sources/examples/talk-llama/models/mimo2-iswa.cpp +11 -5
- data/ext/sources/examples/talk-llama/models/minicpm3.cpp +14 -13
- data/ext/sources/examples/talk-llama/models/minimax-m2.cpp +4 -5
- data/ext/sources/examples/talk-llama/models/mistral3.cpp +4 -4
- data/ext/sources/examples/talk-llama/models/models.h +181 -46
- data/ext/sources/examples/talk-llama/models/modern-bert.cpp +2 -9
- data/ext/sources/examples/talk-llama/models/mpt.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/nemotron-h.cpp +26 -14
- data/ext/sources/examples/talk-llama/models/nemotron.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/neo-bert.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/olmo.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/olmo2.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/olmoe.cpp +4 -4
- data/ext/sources/examples/talk-llama/models/openai-moe-iswa.cpp +1 -1
- data/ext/sources/examples/talk-llama/models/openelm.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/orion.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/paddleocr.cpp +122 -0
- data/ext/sources/examples/talk-llama/models/pangu-embedded.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/phi2.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/phi3.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/plamo.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/plamo2.cpp +9 -5
- data/ext/sources/examples/talk-llama/models/plamo3.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/plm.cpp +15 -14
- data/ext/sources/examples/talk-llama/models/qwen.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/qwen2.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/qwen2moe.cpp +4 -4
- data/ext/sources/examples/talk-llama/models/qwen2vl.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/qwen3.cpp +12 -9
- data/ext/sources/examples/talk-llama/models/qwen35.cpp +381 -0
- data/ext/sources/examples/talk-llama/models/qwen35moe.cpp +422 -0
- data/ext/sources/examples/talk-llama/models/qwen3moe.cpp +15 -8
- data/ext/sources/examples/talk-llama/models/qwen3next.cpp +84 -432
- data/ext/sources/examples/talk-llama/models/qwen3vl-moe.cpp +9 -18
- data/ext/sources/examples/talk-llama/models/qwen3vl.cpp +8 -17
- data/ext/sources/examples/talk-llama/models/refact.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/rnd1.cpp +4 -4
- data/ext/sources/examples/talk-llama/models/rwkv6-base.cpp +2 -0
- data/ext/sources/examples/talk-llama/models/rwkv7-base.cpp +2 -0
- data/ext/sources/examples/talk-llama/models/seed-oss.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/smallthinker.cpp +4 -4
- data/ext/sources/examples/talk-llama/models/smollm3.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/stablelm.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/starcoder.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/starcoder2.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/step35-iswa.cpp +165 -0
- data/ext/sources/examples/talk-llama/models/t5-dec.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/t5-enc.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/xverse.cpp +3 -3
- data/ext/sources/examples/talk-llama/unicode.cpp +21 -65
- data/ext/sources/ggml/CMakeLists.txt +9 -3
- data/ext/sources/ggml/include/ggml-backend.h +1 -1
- data/ext/sources/ggml/include/ggml-cann.h +1 -1
- data/ext/sources/ggml/include/ggml-cpu.h +5 -0
- data/ext/sources/ggml/include/ggml-openvino.h +37 -0
- data/ext/sources/ggml/include/ggml-opt.h +1 -1
- data/ext/sources/ggml/include/ggml-rpc.h +6 -1
- data/ext/sources/ggml/include/ggml-virtgpu.h +14 -0
- data/ext/sources/ggml/include/ggml.h +56 -9
- data/ext/sources/ggml/src/CMakeLists.txt +3 -0
- data/ext/sources/ggml/src/ggml-alloc.c +4 -9
- data/ext/sources/ggml/src/ggml-backend-dl.cpp +48 -0
- data/ext/sources/ggml/src/ggml-backend-dl.h +45 -0
- data/ext/sources/ggml/src/ggml-backend-reg.cpp +28 -86
- data/ext/sources/ggml/src/ggml-backend.cpp +5 -2
- data/ext/sources/ggml/src/ggml-blas/CMakeLists.txt +1 -1
- data/ext/sources/ggml/src/ggml-blas/ggml-blas.cpp +6 -2
- data/ext/sources/ggml/src/ggml-cann/acl_tensor.cpp +1 -1
- data/ext/sources/ggml/src/ggml-cann/acl_tensor.h +1 -1
- data/ext/sources/ggml/src/ggml-cann/aclnn_ops.cpp +348 -189
- data/ext/sources/ggml/src/ggml-cann/aclnn_ops.h +40 -85
- data/ext/sources/ggml/src/ggml-cann/common.h +3 -4
- data/ext/sources/ggml/src/ggml-cann/ggml-cann.cpp +44 -62
- data/ext/sources/ggml/src/ggml-common.h +11 -0
- data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +16 -11
- data/ext/sources/ggml/src/ggml-cpu/amx/amx.cpp +42 -19
- data/ext/sources/ggml/src/ggml-cpu/amx/common.h +34 -10
- data/ext/sources/ggml/src/ggml-cpu/amx/mmq.cpp +85 -85
- data/ext/sources/ggml/src/ggml-cpu/arch/arm/quants.c +85 -1
- data/ext/sources/ggml/src/ggml-cpu/arch/arm/repack.cpp +2744 -548
- data/ext/sources/ggml/src/ggml-cpu/arch/riscv/quants.c +1653 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/riscv/repack.cpp +1391 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/s390/quants.c +8 -10
- data/ext/sources/ggml/src/ggml-cpu/arch/x86/quants.c +9 -9
- data/ext/sources/ggml/src/ggml-cpu/arch/x86/repack.cpp +118 -18
- data/ext/sources/ggml/src/ggml-cpu/arch-fallback.h +107 -26
- data/ext/sources/ggml/src/ggml-cpu/binary-ops.cpp +2 -6
- data/ext/sources/ggml/src/ggml-cpu/common.h +8 -0
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-impl.h +3 -0
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +59 -12
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.cpp +15 -0
- data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.cpp +21 -20
- data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +965 -252
- data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +584 -197
- data/ext/sources/ggml/src/ggml-cpu/ops.cpp +903 -188
- data/ext/sources/ggml/src/ggml-cpu/ops.h +1 -0
- data/ext/sources/ggml/src/ggml-cpu/quants.c +40 -0
- data/ext/sources/ggml/src/ggml-cpu/quants.h +3 -0
- data/ext/sources/ggml/src/ggml-cpu/repack.cpp +2890 -679
- data/ext/sources/ggml/src/ggml-cpu/repack.h +119 -8
- data/ext/sources/ggml/src/ggml-cpu/simd-gemm.h +136 -0
- data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +111 -3
- data/ext/sources/ggml/src/ggml-cpu/unary-ops.cpp +1 -1
- data/ext/sources/ggml/src/ggml-cpu/vec.cpp +17 -0
- data/ext/sources/ggml/src/ggml-cuda/CMakeLists.txt +1 -1
- data/ext/sources/ggml/src/ggml-cuda/argsort.cu +19 -10
- data/ext/sources/ggml/src/ggml-cuda/binbcast.cu +32 -30
- data/ext/sources/ggml/src/ggml-cuda/common.cuh +134 -18
- data/ext/sources/ggml/src/ggml-cuda/convert.cu +41 -27
- data/ext/sources/ggml/src/ggml-cuda/cpy.cu +6 -3
- data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +78 -64
- data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +384 -143
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cuh +36 -22
- data/ext/sources/ggml/src/ggml-cuda/fattn-vec.cuh +3 -3
- data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +26 -5
- data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cuh +1 -1
- data/ext/sources/ggml/src/ggml-cuda/fattn.cu +127 -12
- data/ext/sources/ggml/src/ggml-cuda/gated_delta_net.cu +263 -0
- data/ext/sources/ggml/src/ggml-cuda/gated_delta_net.cuh +4 -0
- data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +595 -200
- data/ext/sources/ggml/src/ggml-cuda/mean.cu +9 -8
- data/ext/sources/ggml/src/ggml-cuda/mma.cuh +173 -6
- data/ext/sources/ggml/src/ggml-cuda/mmf.cu +30 -10
- data/ext/sources/ggml/src/ggml-cuda/mmf.cuh +158 -85
- data/ext/sources/ggml/src/ggml-cuda/mmq.cuh +34 -22
- data/ext/sources/ggml/src/ggml-cuda/mmvf.cu +127 -67
- data/ext/sources/ggml/src/ggml-cuda/mmvf.cuh +2 -0
- data/ext/sources/ggml/src/ggml-cuda/mmvq.cu +157 -65
- data/ext/sources/ggml/src/ggml-cuda/mmvq.cuh +1 -0
- data/ext/sources/ggml/src/ggml-cuda/norm.cu +18 -76
- data/ext/sources/ggml/src/ggml-cuda/pad.cu +13 -10
- data/ext/sources/ggml/src/ggml-cuda/quantize.cu +1 -1
- data/ext/sources/ggml/src/ggml-cuda/reduce_rows.cuh +2 -16
- data/ext/sources/ggml/src/ggml-cuda/rope.cu +233 -133
- data/ext/sources/ggml/src/ggml-cuda/softmax.cu +8 -83
- data/ext/sources/ggml/src/ggml-cuda/solve_tri.cu +1 -1
- data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cu +56 -32
- data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cuh +1 -1
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_32.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_4.cu +1 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_32.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_4.cu +1 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_4.cu +1 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_4.cu +1 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +3 -3
- data/ext/sources/ggml/src/ggml-cuda/top-k.cu +0 -1
- data/ext/sources/ggml/src/ggml-cuda/topk-moe.cu +199 -135
- data/ext/sources/ggml/src/ggml-cuda/topk-moe.cuh +20 -14
- data/ext/sources/ggml/src/ggml-cuda/unary.cu +55 -0
- data/ext/sources/ggml/src/ggml-cuda/unary.cuh +2 -0
- data/ext/sources/ggml/src/ggml-cuda/vecdotq.cuh +31 -17
- data/ext/sources/ggml/src/ggml-cuda/vendors/hip.h +10 -0
- data/ext/sources/ggml/src/ggml-hexagon/CMakeLists.txt +82 -45
- data/ext/sources/ggml/src/ggml-hexagon/ggml-hexagon.cpp +334 -160
- data/ext/sources/ggml/src/ggml-hexagon/htp/CMakeLists.txt +7 -5
- data/ext/sources/ggml/src/ggml-hexagon/htp/act-ops.c +328 -197
- data/ext/sources/ggml/src/ggml-hexagon/htp/argsort-ops.c +281 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/binary-ops.c +765 -234
- data/ext/sources/ggml/src/ggml-hexagon/htp/cpy-ops.c +252 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/flash-attn-ops.c +412 -265
- data/ext/sources/ggml/src/ggml-hexagon/htp/get-rows-ops.c +23 -23
- data/ext/sources/ggml/src/ggml-hexagon/htp/{htp-dma.c → hex-dma.c} +1 -1
- data/ext/sources/ggml/src/ggml-hexagon/htp/{htp-dma.h → hex-dma.h} +28 -3
- data/ext/sources/ggml/src/ggml-hexagon/htp/hex-dump.h +77 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hex-fastdiv.h +37 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hex-utils.h +51 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ctx.h +1 -1
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp-msg.h +27 -37
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ops.h +6 -35
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-arith.h +443 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-base.h +240 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-copy.h +245 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-div.h +251 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-dump.h +129 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-exp.h +215 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-floor.h +100 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-inverse.h +210 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-reduce.h +296 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-scale.h +133 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sigmoid.h +141 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sqrt.h +126 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-types.h +36 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-utils.h +20 -1347
- data/ext/sources/ggml/src/ggml-hexagon/htp/main.c +211 -13
- data/ext/sources/ggml/src/ggml-hexagon/htp/matmul-ops.c +1119 -952
- data/ext/sources/ggml/src/ggml-hexagon/htp/rope-ops.c +254 -244
- data/ext/sources/ggml/src/ggml-hexagon/htp/set-rows-ops.c +36 -36
- data/ext/sources/ggml/src/ggml-hexagon/htp/softmax-ops.c +155 -138
- data/ext/sources/ggml/src/ggml-hexagon/htp/ssm-conv.c +339 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/sum-rows-ops.c +128 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/unary-ops.c +209 -114
- data/ext/sources/ggml/src/ggml-hexagon/htp/worker-pool.c +1 -5
- data/ext/sources/ggml/src/ggml-hexagon/htp-drv.cpp +418 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp-drv.h +121 -0
- data/ext/sources/ggml/src/ggml-hexagon/libdl.h +79 -0
- data/ext/sources/ggml/src/ggml-hexagon/libggml-htp.inf +38 -0
- data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +6 -0
- data/ext/sources/ggml/src/ggml-impl.h +62 -0
- data/ext/sources/ggml/src/ggml-metal/CMakeLists.txt +10 -10
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-common.cpp +13 -2
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.h +8 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.m +147 -17
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.cpp +274 -73
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.h +22 -4
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.m +102 -36
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +174 -23
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.cpp +580 -280
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.h +5 -4
- data/ext/sources/ggml/src/ggml-metal/ggml-metal.cpp +320 -107
- data/ext/sources/ggml/src/ggml-metal/ggml-metal.metal +1068 -825
- data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +19 -1
- data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +3108 -636
- data/ext/sources/ggml/src/ggml-opencl/kernels/concat.cl +41 -99
- data/ext/sources/ggml/src/ggml-opencl/kernels/cpy.cl +45 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/cumsum.cl +139 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/cvt.cl +204 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/diag.cl +27 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/exp.cl +125 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/expm1.cl +87 -56
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q4_1_f32.cl +132 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_general_q8_0_f32.cl +195 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q4_1_f32.cl +283 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/l2_norm.cl +71 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mean.cl +114 -13
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q4_0_f32_l4_lm.cl +163 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q4_1_f32_l4_lm.cl +165 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q6_k_f32_l4_lm.cl +158 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_8x4.cl +129 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32.cl +219 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32_flat.cl +229 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_k_f32.cl +180 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/{mul_mv_q6_k.cl → mul_mv_q6_k_f32.cl} +4 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32_flat.cl +194 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/neg.cl +125 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/repeat.cl +31 -32
- data/ext/sources/ggml/src/ggml-opencl/kernels/scale.cl +14 -4
- data/ext/sources/ggml/src/ggml-opencl/kernels/softplus.cl +88 -60
- data/ext/sources/ggml/src/ggml-opencl/kernels/solve_tri.cl +51 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/sum_rows.cl +114 -13
- data/ext/sources/ggml/src/ggml-opencl/kernels/tanh.cl +94 -48
- data/ext/sources/ggml/src/ggml-opencl/kernels/transpose.cl +26 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/tri.cl +32 -0
- data/ext/sources/ggml/src/ggml-openvino/.clang-format +154 -0
- data/ext/sources/ggml/src/ggml-openvino/CMakeLists.txt +22 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-decoder.cpp +975 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-decoder.h +294 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-openvino-extra.cpp +373 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-openvino-extra.h +182 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-openvino.cpp +1110 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-quants.cpp +884 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-quants.h +153 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/decoder.h +74 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/frontend.cpp +27 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/frontend.h +23 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/input_model.cpp +17 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/input_model.h +29 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/node_context.h +112 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/cont.cpp +48 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/cpy.cpp +21 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp +90 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/get_rows.cpp +69 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/glu_geglu.cpp +61 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp +62 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/mulmat.cpp +90 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/permute.cpp +102 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/reshape.cpp +83 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp +46 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/rope.cpp +123 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/scale.cpp +41 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/set_rows.cpp +76 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/softmax.cpp +89 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/transpose.cpp +23 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/unary_silu.cpp +27 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/view.cpp +53 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op_table.cpp +46 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op_table.h +39 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.cpp +123 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.h +17 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.cpp +60 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.h +17 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/mark_decompression_convert_constant_folding.h +29 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/squeeze_matmul.cpp +58 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/squeeze_matmul.h +17 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/translate_session.cpp +293 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/translate_session.h +28 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/utils.cpp +226 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/utils.h +85 -0
- data/ext/sources/ggml/src/ggml-openvino/utils.cpp +823 -0
- data/ext/sources/ggml/src/ggml-openvino/utils.h +123 -0
- data/ext/sources/ggml/src/ggml-quants.c +96 -5
- data/ext/sources/ggml/src/ggml-quants.h +3 -0
- data/ext/sources/ggml/src/ggml-sycl/CMakeLists.txt +15 -88
- data/ext/sources/ggml/src/ggml-sycl/add-id.cpp +5 -1
- data/ext/sources/ggml/src/ggml-sycl/backend.hpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/binbcast.cpp +21 -20
- data/ext/sources/ggml/src/ggml-sycl/common.hpp +315 -10
- data/ext/sources/ggml/src/ggml-sycl/convert.cpp +69 -1
- data/ext/sources/ggml/src/ggml-sycl/convert.hpp +22 -1
- data/ext/sources/ggml/src/ggml-sycl/count-equal.cpp +1 -1
- data/ext/sources/ggml/src/ggml-sycl/dpct/helper.hpp +791 -47
- data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +78 -68
- data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +2 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn-common.hpp +1179 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn-tile.cpp +55 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn-tile.hpp +1338 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn-vec.hpp +667 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn.cpp +225 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn.hpp +22 -0
- data/ext/sources/ggml/src/ggml-sycl/gated_delta_net.cpp +309 -0
- data/ext/sources/ggml/src/ggml-sycl/gated_delta_net.hpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/ggml-sycl.cpp +316 -51
- data/ext/sources/ggml/src/ggml-sycl/norm.cpp +65 -66
- data/ext/sources/ggml/src/ggml-sycl/outprod.cpp +3 -3
- data/ext/sources/ggml/src/ggml-sycl/presets.hpp +3 -0
- data/ext/sources/ggml/src/ggml-sycl/quants.hpp +1 -1
- data/ext/sources/ggml/src/ggml-sycl/rope.cpp +450 -287
- data/ext/sources/ggml/src/ggml-sycl/rope.hpp +6 -0
- data/ext/sources/ggml/src/ggml-sycl/softmax.cpp +6 -6
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq112-dv112.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq128-dv128.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq256-dv256.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq40-dv40.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq576-dv512.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq64-dv64.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq72-dv72.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq80-dv80.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq96-dv96.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-f16.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q8_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-f16.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q8_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-f16.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q8_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-f16.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q8_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-f16.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q8_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-f16.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q8_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +13 -0
- data/ext/sources/ggml/src/ggml-sycl/wkv.cpp +1 -1
- data/ext/sources/ggml/src/ggml-virtgpu/CMakeLists.txt +70 -0
- data/ext/sources/ggml/src/ggml-virtgpu/apir_cs_ggml-rpc-front.cpp +87 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/CMakeLists.txt +21 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/apir_cs_ggml-rpc-back.cpp +115 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-convert.h +13 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-backend.cpp +102 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer-type.cpp +105 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer.cpp +179 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-device.cpp +148 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched.cpp +51 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched.gen.h +73 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched.h +27 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-virgl-apir.h +32 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend.cpp +144 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/api_remoting.h +95 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_backend.gen.h +94 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_backend.h +50 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_cs.h +378 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_cs_ggml.h +232 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_cs_rpc.h +58 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-buffer-type.cpp +81 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-buffer.cpp +119 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-device.cpp +158 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-reg.cpp +213 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend.cpp +69 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-remoting.h +71 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggmlremoting_functions.yaml +166 -0
- data/ext/sources/ggml/src/ggml-virtgpu/include/apir_hw.h +9 -0
- data/ext/sources/ggml/src/ggml-virtgpu/regenerate_remoting.py +333 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-apir.h +15 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-backend.cpp +58 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-buffer-type.cpp +110 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-buffer.cpp +173 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-device.cpp +192 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-impl.h +36 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward.gen.h +53 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-shm.cpp +98 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-shm.h +23 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-utils.cpp +179 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-utils.h +86 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu.cpp +544 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu.h +117 -0
- data/ext/sources/ggml/src/ggml-vulkan/CMakeLists.txt +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/ggml-vulkan.cpp +1250 -465
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/acc.comp +16 -8
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/elu.comp +27 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +374 -170
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.glsl +66 -22
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +389 -201
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +106 -58
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_mask_opt.comp +162 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +9 -8
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gated_delta_net.comp +128 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/l2_norm.comp +12 -9
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.glsl +20 -17
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +11 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +8 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_id_funcs.glsl +3 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +5 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.glsl +3 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +2 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl +36 -63
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +7 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +7 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +7 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl +10 -5
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_vision.comp +7 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sgn.comp +21 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/ssm_conv.comp +16 -10
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +55 -35
- data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp +1314 -109
- data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu.cpp +1660 -1371
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/argmax.wgsl +72 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/argsort.wgsl +106 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/argsort_merge.wgsl +134 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary.wgsl +141 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +65 -72
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/concat.wgsl +75 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cpy.tmpl.wgsl +6 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cumsum.wgsl +66 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +40 -5
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn.wgsl +105 -60
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{get_rows.tmpl.wgsl → get_rows.wgsl} +53 -259
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{mul_mat.tmpl.wgsl → mul_mat.wgsl} +68 -257
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl +692 -23
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{mul_mat_reg_tile.tmpl.wgsl → mul_mat_reg_tile.wgsl} +28 -128
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{mul_mat_subgroup_matrix.tmpl.wgsl → mul_mat_subgroup_matrix.wgsl} +31 -137
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.wgsl +480 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/pad.wgsl +86 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/repeat.wgsl +67 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{scale.tmpl.wgsl → scale.wgsl} +9 -36
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +40 -12
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/sum_rows.wgsl +55 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/unary.wgsl +193 -0
- data/ext/sources/ggml/src/ggml-zdnn/ggml-zdnn.cpp +6 -1
- data/ext/sources/ggml/src/ggml-zendnn/CMakeLists.txt +31 -32
- data/ext/sources/ggml/src/ggml-zendnn/ggml-zendnn.cpp +9 -6
- data/ext/sources/ggml/src/ggml.c +167 -33
- data/ext/sources/ggml/src/gguf.cpp +229 -44
- data/ext/sources/src/whisper.cpp +6 -28
- data/sig/whisper.rbs +43 -2
- data/test/test_context_params.rb +82 -0
- data/test/test_token.rb +11 -0
- data/test/test_vad_context.rb +58 -8
- data/test/test_whisper.rb +20 -0
- data/whispercpp.gemspec +1 -1
- metadata +240 -28
- data/ext/sources/ggml/cmake/BuildTypes.cmake +0 -54
- data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm-ppc.h +0 -333
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-exp.c +0 -94
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-inverse.c +0 -72
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sigmoid.c +0 -49
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-utils.c +0 -1020
- data/ext/sources/ggml/src/ggml-hexagon/htp/ops-utils.h +0 -149
- data/ext/sources/ggml/src/ggml-hexagon/htp-utils.c +0 -454
- data/ext/sources/ggml/src/ggml-hexagon/htp-utils.h +0 -221
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/bin_op.tmpl.wgsl +0 -188
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary_head.tmpl +0 -45
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.tmpl.wgsl +0 -267
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.tmpl.wgsl +0 -112
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/unary_op.wgsl +0 -483
|
@@ -1,11 +1,11 @@
|
|
|
1
|
-
#include "llama
|
|
1
|
+
#include "llama.h"
|
|
2
2
|
#include "llama-impl.h"
|
|
3
3
|
#include "llama-model.h"
|
|
4
4
|
#include "llama-model-loader.h"
|
|
5
5
|
|
|
6
|
-
#include <algorithm>
|
|
7
6
|
#include <cmath>
|
|
8
7
|
#include <cstring>
|
|
8
|
+
#include <string>
|
|
9
9
|
#include <cinttypes>
|
|
10
10
|
#include <fstream>
|
|
11
11
|
#include <mutex>
|
|
@@ -13,10 +13,28 @@
|
|
|
13
13
|
#include <thread>
|
|
14
14
|
#include <unordered_map>
|
|
15
15
|
|
|
16
|
-
//
|
|
17
|
-
struct
|
|
16
|
+
// result of parsing --tensor-type option
|
|
17
|
+
// (changes to this struct must be reflected in tools/quantize/quantize.cpp)
|
|
18
|
+
struct tensor_type_option {
|
|
18
19
|
std::string name;
|
|
19
|
-
ggml_type
|
|
20
|
+
ggml_type type = GGML_TYPE_COUNT;
|
|
21
|
+
};
|
|
22
|
+
|
|
23
|
+
// tensor categorization - used to avoid repeated string matching in quantization logic.
|
|
24
|
+
// this is different from LLM_TN - we want broad categories, not specific tensor names per arch.
|
|
25
|
+
enum class tensor_category {
|
|
26
|
+
TOKEN_EMBD,
|
|
27
|
+
ATTENTION_Q,
|
|
28
|
+
ATTENTION_V,
|
|
29
|
+
ATTENTION_K,
|
|
30
|
+
ATTENTION_QKV,
|
|
31
|
+
ATTENTION_KV_B,
|
|
32
|
+
ATTENTION_OUTPUT,
|
|
33
|
+
FFN_UP,
|
|
34
|
+
FFN_GATE,
|
|
35
|
+
FFN_DOWN,
|
|
36
|
+
OUTPUT,
|
|
37
|
+
OTHER
|
|
20
38
|
};
|
|
21
39
|
|
|
22
40
|
static void zeros(std::ofstream & file, size_t n) {
|
|
@@ -54,7 +72,7 @@ static std::string remap_layer(const std::string & orig_name, const std::vector<
|
|
|
54
72
|
return orig_name;
|
|
55
73
|
}
|
|
56
74
|
|
|
57
|
-
static std::string remap_imatrix
|
|
75
|
+
static std::string remap_imatrix(const std::string & orig_name, const std::map<int, std::string> & mapped) {
|
|
58
76
|
if (mapped.empty()) {
|
|
59
77
|
return orig_name;
|
|
60
78
|
}
|
|
@@ -76,6 +94,73 @@ static std::string remap_imatrix (const std::string & orig_name, const std::map<
|
|
|
76
94
|
return orig_name;
|
|
77
95
|
}
|
|
78
96
|
|
|
97
|
+
//
|
|
98
|
+
// helper functions for tensor name matching
|
|
99
|
+
//
|
|
100
|
+
|
|
101
|
+
static bool tensor_name_match_token_embd(const char * tensor_name) {
|
|
102
|
+
return std::strcmp(tensor_name, "token_embd.weight") == 0 ||
|
|
103
|
+
std::strcmp(tensor_name, "per_layer_token_embd.weight") == 0;
|
|
104
|
+
}
|
|
105
|
+
|
|
106
|
+
static bool tensor_name_match_output_weight(const char * tensor_name) {
|
|
107
|
+
return std::strcmp(tensor_name, "output.weight") == 0;
|
|
108
|
+
}
|
|
109
|
+
|
|
110
|
+
//
|
|
111
|
+
// tensor categorization for quantization
|
|
112
|
+
//
|
|
113
|
+
// (this is different from LLM_TN - we want broad categories, not specific tensor names per arch)
|
|
114
|
+
//
|
|
115
|
+
|
|
116
|
+
static tensor_category tensor_get_category(const std::string & tensor_name) {
|
|
117
|
+
if (tensor_name_match_output_weight(tensor_name.c_str())) {
|
|
118
|
+
return tensor_category::OUTPUT;
|
|
119
|
+
}
|
|
120
|
+
if (tensor_name_match_token_embd(tensor_name.c_str())) {
|
|
121
|
+
return tensor_category::TOKEN_EMBD;
|
|
122
|
+
}
|
|
123
|
+
if (tensor_name.find("attn_qkv.weight") != std::string::npos) {
|
|
124
|
+
return tensor_category::ATTENTION_QKV;
|
|
125
|
+
}
|
|
126
|
+
if (tensor_name.find("attn_kv_b.weight") != std::string::npos) {
|
|
127
|
+
return tensor_category::ATTENTION_KV_B;
|
|
128
|
+
}
|
|
129
|
+
if (tensor_name.find("attn_v.weight") != std::string::npos) {
|
|
130
|
+
return tensor_category::ATTENTION_V;
|
|
131
|
+
}
|
|
132
|
+
if (tensor_name.find("attn_k.weight") != std::string::npos) {
|
|
133
|
+
return tensor_category::ATTENTION_K;
|
|
134
|
+
}
|
|
135
|
+
if (tensor_name.find("attn_q.weight") != std::string::npos) {
|
|
136
|
+
return tensor_category::ATTENTION_Q;
|
|
137
|
+
}
|
|
138
|
+
if (tensor_name.find("attn_output.weight") != std::string::npos) {
|
|
139
|
+
return tensor_category::ATTENTION_OUTPUT;
|
|
140
|
+
}
|
|
141
|
+
if (tensor_name.find("ffn_up") != std::string::npos) {
|
|
142
|
+
return tensor_category::FFN_UP;
|
|
143
|
+
}
|
|
144
|
+
if (tensor_name.find("ffn_gate") != std::string::npos) {
|
|
145
|
+
return tensor_category::FFN_GATE;
|
|
146
|
+
}
|
|
147
|
+
if (tensor_name.find("ffn_down") != std::string::npos) {
|
|
148
|
+
return tensor_category::FFN_DOWN;
|
|
149
|
+
}
|
|
150
|
+
return tensor_category::OTHER;
|
|
151
|
+
}
|
|
152
|
+
|
|
153
|
+
// check if category is for attention-v-like tensors (more sensitive to quantization)
|
|
154
|
+
static bool category_is_attn_v(tensor_category cat) {
|
|
155
|
+
return cat == tensor_category::ATTENTION_V ||
|
|
156
|
+
cat == tensor_category::ATTENTION_QKV ||
|
|
157
|
+
cat == tensor_category::ATTENTION_KV_B;
|
|
158
|
+
}
|
|
159
|
+
|
|
160
|
+
//
|
|
161
|
+
// quantization state
|
|
162
|
+
//
|
|
163
|
+
|
|
79
164
|
struct quantize_state_impl {
|
|
80
165
|
const llama_model & model;
|
|
81
166
|
const llama_model_quantize_params * params;
|
|
@@ -89,20 +174,42 @@ struct quantize_state_impl {
|
|
|
89
174
|
int i_ffn_gate = 0;
|
|
90
175
|
int i_ffn_up = 0;
|
|
91
176
|
|
|
92
|
-
int n_k_quantized = 0;
|
|
93
177
|
int n_fallback = 0;
|
|
94
178
|
|
|
95
179
|
bool has_imatrix = false;
|
|
96
180
|
|
|
97
|
-
// used to figure out if a model
|
|
98
|
-
bool
|
|
181
|
+
// used to figure out if a model has tied embeddings (tok_embd shares weights with output)
|
|
182
|
+
bool has_tied_embeddings = true; // assume tied until we see output.weight
|
|
183
|
+
|
|
184
|
+
// tensor type override patterns (compiled once, used twice)
|
|
185
|
+
std::vector<std::pair<std::regex, ggml_type>> tensor_type_patterns;
|
|
99
186
|
|
|
100
|
-
quantize_state_impl(const llama_model & model, const llama_model_quantize_params * params)
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
187
|
+
quantize_state_impl(const llama_model & model, const llama_model_quantize_params * params):
|
|
188
|
+
model(model), params(params)
|
|
189
|
+
{
|
|
190
|
+
// compile regex patterns once - they are expensive
|
|
191
|
+
if (params->tensor_types) {
|
|
192
|
+
const auto & tensor_types = *static_cast<const std::vector<tensor_type_option> *>(params->tensor_types);
|
|
193
|
+
for (const auto & [tname, qtype] : tensor_types) {
|
|
194
|
+
tensor_type_patterns.emplace_back(std::regex(tname), qtype);
|
|
195
|
+
}
|
|
196
|
+
}
|
|
197
|
+
}
|
|
104
198
|
};
|
|
105
199
|
|
|
200
|
+
// per-tensor metadata, computed in the preliminary loop and used in the main loop
|
|
201
|
+
struct tensor_metadata {
|
|
202
|
+
ggml_type target_type;
|
|
203
|
+
tensor_category category;
|
|
204
|
+
std::string remapped_imatrix_name;
|
|
205
|
+
bool allows_quantization;
|
|
206
|
+
bool requires_imatrix;
|
|
207
|
+
};
|
|
208
|
+
|
|
209
|
+
//
|
|
210
|
+
// dequantization
|
|
211
|
+
//
|
|
212
|
+
|
|
106
213
|
static void llama_tensor_dequantize_impl(
|
|
107
214
|
ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<std::thread> & workers,
|
|
108
215
|
const size_t nelements, const int nthread
|
|
@@ -175,12 +282,132 @@ static void llama_tensor_dequantize_impl(
|
|
|
175
282
|
workers.clear();
|
|
176
283
|
}
|
|
177
284
|
|
|
178
|
-
|
|
285
|
+
//
|
|
286
|
+
// do we allow this tensor to be quantized?
|
|
287
|
+
//
|
|
288
|
+
|
|
289
|
+
static bool tensor_allows_quantization(const llama_model_quantize_params * params, llm_arch arch, const ggml_tensor * tensor) {
|
|
290
|
+
// trivial checks first -- no string ops needed
|
|
291
|
+
if (params->only_copy) return false;
|
|
292
|
+
|
|
293
|
+
// quantize only 2D and 3D tensors (experts)
|
|
294
|
+
if (ggml_n_dims(tensor) < 2) return false;
|
|
295
|
+
|
|
296
|
+
const std::string name = ggml_get_name(tensor);
|
|
297
|
+
|
|
298
|
+
// This used to be a regex, but <regex> has an extreme cost to compile times.
|
|
299
|
+
bool quantize = name.rfind("weight") == name.size() - 6; // ends with 'weight'?
|
|
300
|
+
|
|
301
|
+
// do not quantize norm tensors
|
|
302
|
+
quantize &= name.find("_norm.weight") == std::string::npos;
|
|
303
|
+
|
|
304
|
+
quantize &= params->quantize_output_tensor || name != "output.weight";
|
|
305
|
+
|
|
306
|
+
// do not quantize expert gating tensors
|
|
307
|
+
// NOTE: can't use LLM_TN here because the layer number is not known
|
|
308
|
+
quantize &= name.find("ffn_gate_inp.weight") == std::string::npos;
|
|
309
|
+
|
|
310
|
+
// these are very small (e.g. 4x4)
|
|
311
|
+
quantize &= name.find("altup") == std::string::npos;
|
|
312
|
+
quantize &= name.find("laurel") == std::string::npos;
|
|
313
|
+
|
|
314
|
+
// these are not too big so keep them as it is
|
|
315
|
+
quantize &= name.find("per_layer_model_proj") == std::string::npos;
|
|
316
|
+
|
|
317
|
+
// do not quantize positional embeddings and token types (BERT)
|
|
318
|
+
quantize &= name != LLM_TN(arch)(LLM_TENSOR_POS_EMBD, "weight");
|
|
319
|
+
quantize &= name != LLM_TN(arch)(LLM_TENSOR_TOKEN_TYPES, "weight");
|
|
320
|
+
|
|
321
|
+
// do not quantize Mamba/Kimi's small conv1d weights
|
|
322
|
+
// NOTE: can't use LLM_TN here because the layer number is not known
|
|
323
|
+
quantize &= name.find("ssm_conv1d") == std::string::npos;
|
|
324
|
+
quantize &= name.find("shortconv.conv.weight") == std::string::npos;
|
|
325
|
+
|
|
326
|
+
// do not quantize RWKV's small yet 2D weights
|
|
327
|
+
quantize &= name.find("time_mix_first.weight") == std::string::npos;
|
|
328
|
+
quantize &= name.find("time_mix_w0.weight") == std::string::npos;
|
|
329
|
+
quantize &= name.find("time_mix_w1.weight") == std::string::npos;
|
|
330
|
+
quantize &= name.find("time_mix_w2.weight") == std::string::npos;
|
|
331
|
+
quantize &= name.find("time_mix_v0.weight") == std::string::npos;
|
|
332
|
+
quantize &= name.find("time_mix_v1.weight") == std::string::npos;
|
|
333
|
+
quantize &= name.find("time_mix_v2.weight") == std::string::npos;
|
|
334
|
+
quantize &= name.find("time_mix_a0.weight") == std::string::npos;
|
|
335
|
+
quantize &= name.find("time_mix_a1.weight") == std::string::npos;
|
|
336
|
+
quantize &= name.find("time_mix_a2.weight") == std::string::npos;
|
|
337
|
+
quantize &= name.find("time_mix_g1.weight") == std::string::npos;
|
|
338
|
+
quantize &= name.find("time_mix_g2.weight") == std::string::npos;
|
|
339
|
+
quantize &= name.find("time_mix_decay_w1.weight") == std::string::npos;
|
|
340
|
+
quantize &= name.find("time_mix_decay_w2.weight") == std::string::npos;
|
|
341
|
+
quantize &= name.find("time_mix_lerp_fused.weight") == std::string::npos;
|
|
342
|
+
|
|
343
|
+
// do not quantize relative position bias (T5)
|
|
344
|
+
quantize &= name.find("attn_rel_b.weight") == std::string::npos;
|
|
345
|
+
|
|
346
|
+
// do not quantize specific multimodal tensors
|
|
347
|
+
quantize &= name.find(".position_embd.") == std::string::npos;
|
|
348
|
+
|
|
349
|
+
return quantize;
|
|
350
|
+
}
|
|
351
|
+
|
|
352
|
+
//
|
|
353
|
+
// tensor type selection
|
|
354
|
+
//
|
|
355
|
+
|
|
356
|
+
// incompatible tensor shapes are handled here - fallback to a compatible type
|
|
357
|
+
static ggml_type tensor_type_fallback(quantize_state_impl & qs, const ggml_tensor * t, const ggml_type target_type) {
|
|
358
|
+
ggml_type return_type = target_type;
|
|
359
|
+
|
|
360
|
+
const int64_t ncols = t->ne[0];
|
|
361
|
+
const int64_t qk_k = ggml_blck_size(target_type);
|
|
362
|
+
|
|
363
|
+
if (ncols % qk_k != 0) { // this tensor's shape is incompatible with this quant
|
|
364
|
+
LLAMA_LOG_WARN("warning: %-36s - ncols %6" PRId64 " not divisible by %3" PRId64 " (required for type %7s) ",
|
|
365
|
+
t->name, ncols, qk_k, ggml_type_name(target_type));
|
|
366
|
+
++qs.n_fallback;
|
|
367
|
+
|
|
368
|
+
switch (target_type) {
|
|
369
|
+
// types on the left: block size 256
|
|
370
|
+
case GGML_TYPE_IQ1_S:
|
|
371
|
+
case GGML_TYPE_IQ1_M:
|
|
372
|
+
case GGML_TYPE_IQ2_XXS:
|
|
373
|
+
case GGML_TYPE_IQ2_XS:
|
|
374
|
+
case GGML_TYPE_IQ2_S:
|
|
375
|
+
case GGML_TYPE_IQ3_XXS:
|
|
376
|
+
case GGML_TYPE_IQ3_S: // types on the right: block size 32
|
|
377
|
+
case GGML_TYPE_IQ4_XS: return_type = GGML_TYPE_IQ4_NL; break;
|
|
378
|
+
case GGML_TYPE_Q2_K:
|
|
379
|
+
case GGML_TYPE_Q3_K:
|
|
380
|
+
case GGML_TYPE_TQ1_0:
|
|
381
|
+
case GGML_TYPE_TQ2_0: return_type = GGML_TYPE_Q4_0; break;
|
|
382
|
+
case GGML_TYPE_Q4_K: return_type = GGML_TYPE_Q5_0; break;
|
|
383
|
+
case GGML_TYPE_Q5_K: return_type = GGML_TYPE_Q5_1; break;
|
|
384
|
+
case GGML_TYPE_Q6_K: return_type = GGML_TYPE_Q8_0; break;
|
|
385
|
+
default:
|
|
386
|
+
throw std::runtime_error(format("no tensor type fallback is defined for type %s",
|
|
387
|
+
ggml_type_name(target_type)));
|
|
388
|
+
}
|
|
389
|
+
if (ncols % ggml_blck_size(return_type) != 0) {
|
|
390
|
+
//
|
|
391
|
+
// the fallback return type is still not compatible for this tensor!
|
|
392
|
+
//
|
|
393
|
+
// most likely, this tensor's first dimension is not divisible by 32.
|
|
394
|
+
// this is very rare. we can either abort the quantization, or
|
|
395
|
+
// fallback to F16 / F32.
|
|
396
|
+
//
|
|
397
|
+
LLAMA_LOG_WARN("(WARNING: must use F16 due to unusual shape) ");
|
|
398
|
+
return_type = GGML_TYPE_F16;
|
|
399
|
+
}
|
|
400
|
+
LLAMA_LOG_WARN("-> falling back to %7s\n", ggml_type_name(return_type));
|
|
401
|
+
}
|
|
402
|
+
return return_type;
|
|
403
|
+
}
|
|
404
|
+
|
|
405
|
+
// internal standard logic for selecting the target tensor type based on tensor category, ftype, and model arch
|
|
406
|
+
static ggml_type llama_tensor_get_type_impl(quantize_state_impl & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype, tensor_category category) {
|
|
179
407
|
const std::string name = ggml_get_name(tensor);
|
|
180
408
|
|
|
181
409
|
// TODO: avoid hardcoded tensor names - use the TN_* constants
|
|
182
410
|
const llm_arch arch = qs.model.arch;
|
|
183
|
-
const auto tn = LLM_TN(arch);
|
|
184
411
|
|
|
185
412
|
auto use_more_bits = [](int i_layer, int n_layers) -> bool {
|
|
186
413
|
return i_layer < n_layers/8 || i_layer >= 7*n_layers/8 || (i_layer - n_layers/8)%3 == 2;
|
|
@@ -204,7 +431,7 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
|
|
|
204
431
|
|
|
205
432
|
// for arches that share the same tensor between the token embeddings and the output, we quantize the token embeddings
|
|
206
433
|
// with the quantization of the output tensor
|
|
207
|
-
if (
|
|
434
|
+
if (category == tensor_category::OUTPUT || (qs.has_tied_embeddings && category == tensor_category::TOKEN_EMBD)) {
|
|
208
435
|
if (qs.params->output_tensor_type < GGML_TYPE_COUNT) {
|
|
209
436
|
new_type = qs.params->output_tensor_type;
|
|
210
437
|
} else {
|
|
@@ -234,7 +461,7 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
|
|
|
234
461
|
} else {
|
|
235
462
|
new_type = GGML_TYPE_Q8_0;
|
|
236
463
|
}
|
|
237
|
-
} else if (
|
|
464
|
+
} else if (category == tensor_category::TOKEN_EMBD) {
|
|
238
465
|
if (qs.params->token_embedding_type < GGML_TYPE_COUNT) {
|
|
239
466
|
new_type = qs.params->token_embedding_type;
|
|
240
467
|
} else {
|
|
@@ -254,21 +481,21 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
|
|
|
254
481
|
}
|
|
255
482
|
} else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
|
|
256
483
|
ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
|
|
257
|
-
if (
|
|
484
|
+
if (category_is_attn_v(category)) {
|
|
258
485
|
if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K;
|
|
259
486
|
else new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
|
|
260
487
|
++qs.i_attention_wv;
|
|
261
488
|
}
|
|
262
|
-
else if (qs.model.hparams.n_expert == 8 &&
|
|
489
|
+
else if (qs.model.hparams.n_expert == 8 && category == tensor_category::ATTENTION_K) {
|
|
263
490
|
new_type = GGML_TYPE_Q4_K;
|
|
264
491
|
}
|
|
265
|
-
else if (
|
|
492
|
+
else if (category == tensor_category::FFN_DOWN) {
|
|
266
493
|
if (qs.i_ffn_down < qs.n_ffn_down/8) {
|
|
267
494
|
new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
|
|
268
495
|
}
|
|
269
496
|
++qs.i_ffn_down;
|
|
270
497
|
}
|
|
271
|
-
else if (
|
|
498
|
+
else if (category == tensor_category::ATTENTION_OUTPUT) {
|
|
272
499
|
if (qs.model.hparams.n_expert == 8) {
|
|
273
500
|
new_type = GGML_TYPE_Q5_K;
|
|
274
501
|
} else {
|
|
@@ -276,7 +503,7 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
|
|
|
276
503
|
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_S;
|
|
277
504
|
}
|
|
278
505
|
}
|
|
279
|
-
} else if (
|
|
506
|
+
} else if (category_is_attn_v(category)) {
|
|
280
507
|
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
|
|
281
508
|
new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
|
|
282
509
|
}
|
|
@@ -314,7 +541,7 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
|
|
|
314
541
|
new_type = GGML_TYPE_Q8_0;
|
|
315
542
|
}
|
|
316
543
|
++qs.i_attention_wv;
|
|
317
|
-
} else if (
|
|
544
|
+
} else if (category == tensor_category::ATTENTION_K) {
|
|
318
545
|
if (qs.model.hparams.n_expert == 8) {
|
|
319
546
|
// for the 8-expert model, bumping this to Q8_0 trades just ~128MB
|
|
320
547
|
// TODO: explore better strategies
|
|
@@ -326,14 +553,14 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
|
|
|
326
553
|
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
|
|
327
554
|
new_type = GGML_TYPE_IQ2_S;
|
|
328
555
|
}
|
|
329
|
-
} else if (
|
|
556
|
+
} else if (category == tensor_category::ATTENTION_Q) {
|
|
330
557
|
if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
|
|
331
558
|
new_type = GGML_TYPE_IQ3_XXS;
|
|
332
559
|
}
|
|
333
560
|
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
|
|
334
561
|
new_type = GGML_TYPE_IQ2_S;
|
|
335
562
|
}
|
|
336
|
-
} else if (
|
|
563
|
+
} else if (category == tensor_category::FFN_DOWN) {
|
|
337
564
|
auto info = layer_info(qs.i_ffn_down, qs.n_ffn_down, name.c_str());
|
|
338
565
|
int i_layer = info.first, n_layer = info.second;
|
|
339
566
|
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
|
|
@@ -378,7 +605,7 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
|
|
|
378
605
|
new_type = ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ? GGML_TYPE_Q4_1 : GGML_TYPE_Q5_1;
|
|
379
606
|
}
|
|
380
607
|
++qs.i_ffn_down;
|
|
381
|
-
} else if (
|
|
608
|
+
} else if (category == tensor_category::ATTENTION_OUTPUT) {
|
|
382
609
|
if (arch != LLM_ARCH_FALCON) {
|
|
383
610
|
if (qs.model.hparams.n_expert == 8) {
|
|
384
611
|
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
|
|
@@ -398,14 +625,14 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
|
|
|
398
625
|
if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
|
|
399
626
|
}
|
|
400
627
|
}
|
|
401
|
-
else if (
|
|
628
|
+
else if (category == tensor_category::ATTENTION_QKV) {
|
|
402
629
|
if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
|
|
403
630
|
new_type = GGML_TYPE_Q4_K;
|
|
404
631
|
}
|
|
405
632
|
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
|
|
406
633
|
else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
|
|
407
634
|
}
|
|
408
|
-
else if (
|
|
635
|
+
else if (category == tensor_category::FFN_GATE) {
|
|
409
636
|
auto info = layer_info(qs.i_ffn_gate, qs.n_ffn_gate, name.c_str());
|
|
410
637
|
int i_layer = info.first, n_layer = info.second;
|
|
411
638
|
if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) {
|
|
@@ -413,7 +640,7 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
|
|
|
413
640
|
}
|
|
414
641
|
++qs.i_ffn_gate;
|
|
415
642
|
}
|
|
416
|
-
else if (
|
|
643
|
+
else if (category == tensor_category::FFN_UP) {
|
|
417
644
|
auto info = layer_info(qs.i_ffn_up, qs.n_ffn_up, name.c_str());
|
|
418
645
|
int i_layer = info.first, n_layer = info.second;
|
|
419
646
|
if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) {
|
|
@@ -422,60 +649,58 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
|
|
|
422
649
|
++qs.i_ffn_up;
|
|
423
650
|
}
|
|
424
651
|
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
// IK: let's remove this, else Q2_K is almost the same as Q3_K_S
|
|
428
|
-
//else if (name.find("ffn_gate") != std::string::npos || name.find("ffn_up") != std::string::npos) {
|
|
429
|
-
// if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
|
|
430
|
-
//}
|
|
431
|
-
// This can be used to reduce the size of the Q5_K_S model.
|
|
432
|
-
// The associated PPL increase is fully in line with the size reduction
|
|
433
|
-
//else {
|
|
434
|
-
// if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q4_K;
|
|
435
|
-
//}
|
|
436
|
-
bool convert_incompatible_tensor = false;
|
|
437
|
-
{
|
|
438
|
-
const int64_t nx = tensor->ne[0];
|
|
439
|
-
const int64_t ny = tensor->ne[1];
|
|
440
|
-
const int64_t qk_k = ggml_blck_size(new_type);
|
|
652
|
+
return new_type;
|
|
653
|
+
}
|
|
441
654
|
|
|
442
|
-
|
|
443
|
-
|
|
444
|
-
|
|
445
|
-
|
|
446
|
-
|
|
447
|
-
|
|
655
|
+
// outer wrapper: determine the ggml_type that this tensor should be quantized to
|
|
656
|
+
static ggml_type llama_tensor_get_type(quantize_state_impl & qs, const llama_model_quantize_params * params, const ggml_tensor * tensor, ggml_type default_type, const tensor_metadata & tm) {
|
|
657
|
+
if (!tensor_allows_quantization(params, qs.model.arch, tensor)) {
|
|
658
|
+
return tensor->type;
|
|
659
|
+
}
|
|
660
|
+
if (params->token_embedding_type < GGML_TYPE_COUNT && tm.category == tensor_category::TOKEN_EMBD) {
|
|
661
|
+
return params->token_embedding_type;
|
|
662
|
+
}
|
|
663
|
+
if (params->output_tensor_type < GGML_TYPE_COUNT && tm.category == tensor_category::OUTPUT) {
|
|
664
|
+
return params->output_tensor_type;
|
|
448
665
|
}
|
|
449
666
|
|
|
450
|
-
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
|
|
454
|
-
|
|
455
|
-
|
|
456
|
-
|
|
457
|
-
|
|
458
|
-
|
|
459
|
-
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
|
|
667
|
+
ggml_type new_type = default_type;
|
|
668
|
+
|
|
669
|
+
// get more optimal quantization type based on the tensor shape, layer, etc.
|
|
670
|
+
if (!params->pure && ggml_is_quantized(default_type)) {
|
|
671
|
+
// if the user provided tensor types - use those
|
|
672
|
+
bool manual = false;
|
|
673
|
+
if (!qs.tensor_type_patterns.empty()) {
|
|
674
|
+
const std::string tensor_name(tensor->name);
|
|
675
|
+
for (const auto & [pattern, qtype] : qs.tensor_type_patterns) {
|
|
676
|
+
if (std::regex_search(tensor_name, pattern)) {
|
|
677
|
+
if (qtype != new_type) {
|
|
678
|
+
LLAMA_LOG_WARN("%s: %-36s - applying manual override: %s -> %s\n",
|
|
679
|
+
__func__, tensor_name.c_str(), ggml_type_name(new_type), ggml_type_name(qtype));
|
|
680
|
+
new_type = qtype;
|
|
681
|
+
manual = true;
|
|
682
|
+
break;
|
|
683
|
+
}
|
|
684
|
+
}
|
|
685
|
+
}
|
|
468
686
|
}
|
|
469
|
-
|
|
470
|
-
|
|
687
|
+
|
|
688
|
+
// if not manual - use the standard logic for choosing the quantization type based on the selected mixture
|
|
689
|
+
if (!manual) {
|
|
690
|
+
new_type = llama_tensor_get_type_impl(qs, new_type, tensor, params->ftype, tm.category);
|
|
471
691
|
}
|
|
472
|
-
|
|
473
|
-
|
|
692
|
+
|
|
693
|
+
// incompatible tensor shapes are handled here - fallback to a compatible type
|
|
694
|
+
new_type = tensor_type_fallback(qs, tensor, new_type);
|
|
474
695
|
}
|
|
475
696
|
|
|
476
697
|
return new_type;
|
|
477
698
|
}
|
|
478
699
|
|
|
700
|
+
//
|
|
701
|
+
// quantization implementation
|
|
702
|
+
//
|
|
703
|
+
|
|
479
704
|
static size_t llama_tensor_quantize_impl(enum ggml_type new_type, const float * f32_data, void * new_data, const int64_t chunk_size, int64_t nrows, int64_t n_per_row, const float * imatrix, std::vector<std::thread> & workers, const int nthread) {
|
|
480
705
|
if (nthread < 2) {
|
|
481
706
|
// single-thread
|
|
@@ -530,50 +755,85 @@ static size_t llama_tensor_quantize_impl(enum ggml_type new_type, const float *
|
|
|
530
755
|
return new_size;
|
|
531
756
|
}
|
|
532
757
|
|
|
533
|
-
|
|
534
|
-
|
|
535
|
-
|
|
758
|
+
//
|
|
759
|
+
// imatrix requirement check
|
|
760
|
+
//
|
|
536
761
|
|
|
537
|
-
|
|
538
|
-
|
|
539
|
-
|
|
540
|
-
|
|
541
|
-
|
|
542
|
-
case
|
|
543
|
-
case
|
|
544
|
-
case
|
|
545
|
-
case
|
|
762
|
+
static bool tensor_requires_imatrix(const char * tensor_name, const ggml_type dst_type, const llama_ftype ftype) {
|
|
763
|
+
if (tensor_name_match_token_embd(tensor_name) || tensor_name_match_output_weight(tensor_name)) {
|
|
764
|
+
return false;
|
|
765
|
+
}
|
|
766
|
+
switch (dst_type) {
|
|
767
|
+
case GGML_TYPE_IQ3_XXS:
|
|
768
|
+
case GGML_TYPE_IQ2_XXS:
|
|
769
|
+
case GGML_TYPE_IQ2_XS:
|
|
770
|
+
case GGML_TYPE_IQ2_S:
|
|
771
|
+
case GGML_TYPE_IQ1_M:
|
|
772
|
+
case GGML_TYPE_IQ1_S:
|
|
773
|
+
return true;
|
|
774
|
+
case GGML_TYPE_Q2_K:
|
|
775
|
+
// as a general rule, the k-type quantizations don't require imatrix data.
|
|
776
|
+
// the only exception is Q2_K tensors that are part of a Q2_K_S file.
|
|
777
|
+
return ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S;
|
|
778
|
+
default:
|
|
779
|
+
return false;
|
|
780
|
+
}
|
|
781
|
+
}
|
|
546
782
|
|
|
547
|
-
|
|
783
|
+
//
|
|
784
|
+
// given a file type, get the default tensor type
|
|
785
|
+
//
|
|
786
|
+
|
|
787
|
+
static ggml_type llama_ftype_get_default_type(llama_ftype ftype) {
|
|
788
|
+
switch (ftype) {
|
|
789
|
+
case LLAMA_FTYPE_MOSTLY_Q4_0: return GGML_TYPE_Q4_0;
|
|
790
|
+
case LLAMA_FTYPE_MOSTLY_Q4_1: return GGML_TYPE_Q4_1;
|
|
791
|
+
case LLAMA_FTYPE_MOSTLY_Q5_0: return GGML_TYPE_Q5_0;
|
|
792
|
+
case LLAMA_FTYPE_MOSTLY_Q5_1: return GGML_TYPE_Q5_1;
|
|
793
|
+
case LLAMA_FTYPE_MOSTLY_Q8_0: return GGML_TYPE_Q8_0;
|
|
794
|
+
case LLAMA_FTYPE_MOSTLY_F16: return GGML_TYPE_F16;
|
|
795
|
+
case LLAMA_FTYPE_MOSTLY_BF16: return GGML_TYPE_BF16;
|
|
796
|
+
case LLAMA_FTYPE_ALL_F32: return GGML_TYPE_F32;
|
|
797
|
+
|
|
798
|
+
case LLAMA_FTYPE_MOSTLY_MXFP4_MOE: return GGML_TYPE_MXFP4;
|
|
548
799
|
|
|
549
800
|
// K-quants
|
|
550
801
|
case LLAMA_FTYPE_MOSTLY_Q2_K_S:
|
|
551
|
-
case LLAMA_FTYPE_MOSTLY_Q2_K:
|
|
552
|
-
case LLAMA_FTYPE_MOSTLY_IQ3_XS:
|
|
802
|
+
case LLAMA_FTYPE_MOSTLY_Q2_K: return GGML_TYPE_Q2_K;
|
|
803
|
+
case LLAMA_FTYPE_MOSTLY_IQ3_XS: return GGML_TYPE_IQ3_S;
|
|
553
804
|
case LLAMA_FTYPE_MOSTLY_Q3_K_S:
|
|
554
805
|
case LLAMA_FTYPE_MOSTLY_Q3_K_M:
|
|
555
|
-
case LLAMA_FTYPE_MOSTLY_Q3_K_L:
|
|
806
|
+
case LLAMA_FTYPE_MOSTLY_Q3_K_L: return GGML_TYPE_Q3_K;
|
|
556
807
|
case LLAMA_FTYPE_MOSTLY_Q4_K_S:
|
|
557
|
-
case LLAMA_FTYPE_MOSTLY_Q4_K_M:
|
|
808
|
+
case LLAMA_FTYPE_MOSTLY_Q4_K_M: return GGML_TYPE_Q4_K;
|
|
558
809
|
case LLAMA_FTYPE_MOSTLY_Q5_K_S:
|
|
559
|
-
case LLAMA_FTYPE_MOSTLY_Q5_K_M:
|
|
560
|
-
case LLAMA_FTYPE_MOSTLY_Q6_K:
|
|
561
|
-
case LLAMA_FTYPE_MOSTLY_TQ1_0:
|
|
562
|
-
case LLAMA_FTYPE_MOSTLY_TQ2_0:
|
|
563
|
-
case LLAMA_FTYPE_MOSTLY_IQ2_XXS:
|
|
564
|
-
case LLAMA_FTYPE_MOSTLY_IQ2_XS:
|
|
565
|
-
case LLAMA_FTYPE_MOSTLY_IQ2_S:
|
|
566
|
-
case LLAMA_FTYPE_MOSTLY_IQ2_M:
|
|
567
|
-
case LLAMA_FTYPE_MOSTLY_IQ3_XXS:
|
|
568
|
-
case LLAMA_FTYPE_MOSTLY_IQ1_S:
|
|
569
|
-
case LLAMA_FTYPE_MOSTLY_IQ1_M:
|
|
570
|
-
case LLAMA_FTYPE_MOSTLY_IQ4_NL:
|
|
571
|
-
case LLAMA_FTYPE_MOSTLY_IQ4_XS:
|
|
572
|
-
case LLAMA_FTYPE_MOSTLY_IQ3_S:
|
|
573
|
-
case LLAMA_FTYPE_MOSTLY_IQ3_M:
|
|
810
|
+
case LLAMA_FTYPE_MOSTLY_Q5_K_M: return GGML_TYPE_Q5_K;
|
|
811
|
+
case LLAMA_FTYPE_MOSTLY_Q6_K: return GGML_TYPE_Q6_K;
|
|
812
|
+
case LLAMA_FTYPE_MOSTLY_TQ1_0: return GGML_TYPE_TQ1_0;
|
|
813
|
+
case LLAMA_FTYPE_MOSTLY_TQ2_0: return GGML_TYPE_TQ2_0;
|
|
814
|
+
case LLAMA_FTYPE_MOSTLY_IQ2_XXS: return GGML_TYPE_IQ2_XXS;
|
|
815
|
+
case LLAMA_FTYPE_MOSTLY_IQ2_XS: return GGML_TYPE_IQ2_XS;
|
|
816
|
+
case LLAMA_FTYPE_MOSTLY_IQ2_S: return GGML_TYPE_IQ2_XS;
|
|
817
|
+
case LLAMA_FTYPE_MOSTLY_IQ2_M: return GGML_TYPE_IQ2_S;
|
|
818
|
+
case LLAMA_FTYPE_MOSTLY_IQ3_XXS: return GGML_TYPE_IQ3_XXS;
|
|
819
|
+
case LLAMA_FTYPE_MOSTLY_IQ1_S: return GGML_TYPE_IQ1_S;
|
|
820
|
+
case LLAMA_FTYPE_MOSTLY_IQ1_M: return GGML_TYPE_IQ1_M;
|
|
821
|
+
case LLAMA_FTYPE_MOSTLY_IQ4_NL: return GGML_TYPE_IQ4_NL;
|
|
822
|
+
case LLAMA_FTYPE_MOSTLY_IQ4_XS: return GGML_TYPE_IQ4_XS;
|
|
823
|
+
case LLAMA_FTYPE_MOSTLY_IQ3_S:
|
|
824
|
+
case LLAMA_FTYPE_MOSTLY_IQ3_M: return GGML_TYPE_IQ3_S;
|
|
574
825
|
|
|
575
826
|
default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
|
|
576
827
|
}
|
|
828
|
+
}
|
|
829
|
+
|
|
830
|
+
//
|
|
831
|
+
// main quantization driver
|
|
832
|
+
//
|
|
833
|
+
|
|
834
|
+
static void llama_model_quantize_impl(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
|
|
835
|
+
ggml_type default_type;
|
|
836
|
+
llama_ftype ftype = params->ftype;
|
|
577
837
|
|
|
578
838
|
int nthread = params->nthread;
|
|
579
839
|
|
|
@@ -581,6 +841,8 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
|
|
|
581
841
|
nthread = std::thread::hardware_concurrency();
|
|
582
842
|
}
|
|
583
843
|
|
|
844
|
+
default_type = llama_ftype_get_default_type(ftype);
|
|
845
|
+
|
|
584
846
|
// mmap consistently increases speed on Linux, and also increases speed on Windows with
|
|
585
847
|
// hot cache. It may cause a slowdown on macOS, possibly related to free memory.
|
|
586
848
|
#if defined(__linux__) || defined(_WIN32)
|
|
@@ -596,7 +858,8 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
|
|
|
596
858
|
}
|
|
597
859
|
|
|
598
860
|
std::vector<std::string> splits = {};
|
|
599
|
-
llama_model_loader ml(
|
|
861
|
+
llama_model_loader ml(/*metadata*/ nullptr, /*set_tensor_data*/ nullptr, /*set_tensor_data_ud*/ nullptr,
|
|
862
|
+
fname_inp, splits, use_mmap, /*use_direct_io*/ false, /*check_tensors*/ true, /*no_alloc*/ false, kv_overrides, nullptr);
|
|
600
863
|
ml.init_mappings(false); // no prefetching
|
|
601
864
|
|
|
602
865
|
llama_model model(llama_model_default_params());
|
|
@@ -614,7 +877,8 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
|
|
|
614
877
|
if (params->imatrix) {
|
|
615
878
|
imatrix_data = static_cast<const std::unordered_map<std::string, std::vector<float>>*>(params->imatrix);
|
|
616
879
|
if (imatrix_data) {
|
|
617
|
-
LLAMA_LOG_INFO("
|
|
880
|
+
LLAMA_LOG_INFO("\n%s: have importance matrix data with %d entries\n",
|
|
881
|
+
__func__, (int)imatrix_data->size());
|
|
618
882
|
qs.has_imatrix = true;
|
|
619
883
|
// check imatrix for nans or infs
|
|
620
884
|
for (const auto & kv : *imatrix_data) {
|
|
@@ -636,7 +900,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
|
|
|
636
900
|
}
|
|
637
901
|
|
|
638
902
|
// copy the KV pairs from the input file
|
|
639
|
-
gguf_set_kv (ctx_out.get(), ml.
|
|
903
|
+
gguf_set_kv (ctx_out.get(), ml.metadata);
|
|
640
904
|
gguf_set_val_u32(ctx_out.get(), "general.quantization_version", GGML_QNT_VERSION); // TODO: use LLM_KV
|
|
641
905
|
gguf_set_val_u32(ctx_out.get(), "general.file_type", ftype); // TODO: use LLM_KV
|
|
642
906
|
|
|
@@ -697,35 +961,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
|
|
|
697
961
|
});
|
|
698
962
|
}
|
|
699
963
|
|
|
700
|
-
for (const auto * it : tensors) {
|
|
701
|
-
const struct ggml_tensor * tensor = it->tensor;
|
|
702
|
-
|
|
703
|
-
const std::string name = ggml_get_name(tensor);
|
|
704
|
-
|
|
705
|
-
// TODO: avoid hardcoded tensor names - use the TN_* constants
|
|
706
|
-
if (name.find("attn_v.weight") != std::string::npos ||
|
|
707
|
-
name.find("attn_qkv.weight") != std::string::npos ||
|
|
708
|
-
name.find("attn_kv_b.weight")!= std::string::npos) {
|
|
709
|
-
++qs.n_attention_wv;
|
|
710
|
-
} else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
|
|
711
|
-
qs.has_output = true;
|
|
712
|
-
}
|
|
713
|
-
}
|
|
714
|
-
|
|
715
|
-
qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer;
|
|
716
|
-
|
|
717
|
-
size_t total_size_org = 0;
|
|
718
|
-
size_t total_size_new = 0;
|
|
719
|
-
|
|
720
|
-
std::vector<std::thread> workers;
|
|
721
|
-
workers.reserve(nthread);
|
|
722
|
-
|
|
723
964
|
int idx = 0;
|
|
724
|
-
|
|
725
|
-
std::vector<no_init<uint8_t>> read_data;
|
|
726
|
-
std::vector<no_init<uint8_t>> work;
|
|
727
|
-
std::vector<no_init<float>> f32_conv_buf;
|
|
728
|
-
|
|
729
965
|
uint16_t n_split = 1;
|
|
730
966
|
|
|
731
967
|
// Assume split index is continuous
|
|
@@ -737,14 +973,68 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
|
|
|
737
973
|
std::vector<gguf_context_ptr> ctx_outs(n_split);
|
|
738
974
|
ctx_outs[0] = std::move(ctx_out);
|
|
739
975
|
|
|
740
|
-
//
|
|
741
|
-
|
|
976
|
+
// compute tensor metadata once and cache it
|
|
977
|
+
std::vector<tensor_metadata> metadata(tensors.size());
|
|
978
|
+
|
|
979
|
+
// initialize quantization state before preliminary loop (counters for use_more_bits)
|
|
980
|
+
{
|
|
981
|
+
for (size_t i = 0; i < tensors.size(); ++i) {
|
|
982
|
+
const auto cat = tensor_get_category(tensors[i]->tensor->name);
|
|
983
|
+
if (category_is_attn_v(cat)) {
|
|
984
|
+
++qs.n_attention_wv;
|
|
985
|
+
}
|
|
986
|
+
if (cat == tensor_category::OUTPUT) {
|
|
987
|
+
qs.has_tied_embeddings = false;
|
|
988
|
+
}
|
|
989
|
+
metadata[i].category = cat; // save and re-use the category while we're at it
|
|
990
|
+
}
|
|
991
|
+
// these also need to be set to n_layer by default
|
|
992
|
+
qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)qs.model.hparams.n_layer;
|
|
993
|
+
}
|
|
994
|
+
|
|
995
|
+
// flag for --dry-run
|
|
996
|
+
bool will_require_imatrix = false;
|
|
997
|
+
|
|
998
|
+
//
|
|
999
|
+
// preliminary iteration over all weights
|
|
1000
|
+
//
|
|
1001
|
+
|
|
1002
|
+
for (size_t i = 0; i < tensors.size(); ++i) {
|
|
1003
|
+
const auto * it = tensors[i];
|
|
1004
|
+
const struct ggml_tensor * tensor = it->tensor;
|
|
1005
|
+
const std::string name = ggml_get_name(tensor);
|
|
1006
|
+
|
|
742
1007
|
uint16_t i_split = params->keep_split ? it->idx : 0;
|
|
743
|
-
ggml_tensor * tensor = it->tensor;
|
|
744
1008
|
if (!ctx_outs[i_split]) {
|
|
745
1009
|
ctx_outs[i_split].reset(gguf_init_empty());
|
|
746
1010
|
}
|
|
747
1011
|
gguf_add_tensor(ctx_outs[i_split].get(), tensor);
|
|
1012
|
+
|
|
1013
|
+
metadata[i].allows_quantization = tensor_allows_quantization(params, model.arch, tensor);
|
|
1014
|
+
|
|
1015
|
+
if (metadata[i].allows_quantization) {
|
|
1016
|
+
metadata[i].target_type = llama_tensor_get_type(qs, params, tensor, default_type, metadata[i]);
|
|
1017
|
+
} else {
|
|
1018
|
+
metadata[i].target_type = tensor->type;
|
|
1019
|
+
}
|
|
1020
|
+
|
|
1021
|
+
metadata[i].requires_imatrix = tensor_requires_imatrix(tensor->name, metadata[i].target_type, ftype);
|
|
1022
|
+
|
|
1023
|
+
if (params->imatrix) {
|
|
1024
|
+
metadata[i].remapped_imatrix_name = remap_imatrix(tensor->name, mapped);
|
|
1025
|
+
} else if (metadata[i].allows_quantization && metadata[i].requires_imatrix) {
|
|
1026
|
+
if (params->dry_run) {
|
|
1027
|
+
will_require_imatrix = true;
|
|
1028
|
+
} else {
|
|
1029
|
+
LLAMA_LOG_ERROR("\n============================================================================\n"
|
|
1030
|
+
" ERROR: this quantization requires an importance matrix!\n"
|
|
1031
|
+
" - offending tensor: %s\n"
|
|
1032
|
+
" - target type: %s\n"
|
|
1033
|
+
"============================================================================\n\n",
|
|
1034
|
+
name.c_str(), ggml_type_name(metadata[i].target_type));
|
|
1035
|
+
throw std::runtime_error("this quantization requires an imatrix!");
|
|
1036
|
+
}
|
|
1037
|
+
}
|
|
748
1038
|
}
|
|
749
1039
|
|
|
750
1040
|
// Set split info if needed
|
|
@@ -756,6 +1046,16 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
|
|
|
756
1046
|
}
|
|
757
1047
|
}
|
|
758
1048
|
|
|
1049
|
+
size_t total_size_org = 0;
|
|
1050
|
+
size_t total_size_new = 0;
|
|
1051
|
+
|
|
1052
|
+
std::vector<std::thread> workers;
|
|
1053
|
+
workers.reserve(nthread);
|
|
1054
|
+
|
|
1055
|
+
std::vector<no_init<uint8_t>> read_data;
|
|
1056
|
+
std::vector<no_init<uint8_t>> work;
|
|
1057
|
+
std::vector<no_init<float>> f32_conv_buf;
|
|
1058
|
+
|
|
759
1059
|
int cur_split = -1;
|
|
760
1060
|
std::ofstream fout;
|
|
761
1061
|
auto close_ofstream = [&]() {
|
|
@@ -785,251 +1085,182 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
|
|
|
785
1085
|
::zeros(fout, meta_size);
|
|
786
1086
|
};
|
|
787
1087
|
|
|
788
|
-
|
|
789
|
-
|
|
790
|
-
|
|
791
|
-
|
|
1088
|
+
// no output file for --dry-run
|
|
1089
|
+
if (!params->dry_run) {
|
|
1090
|
+
new_ofstream(0);
|
|
1091
|
+
}
|
|
1092
|
+
|
|
1093
|
+
//
|
|
1094
|
+
// main loop: iterate over all weights
|
|
1095
|
+
//
|
|
1096
|
+
|
|
1097
|
+
for (size_t i = 0; i < tensors.size(); ++i) {
|
|
1098
|
+
const auto & weight = *tensors[i];
|
|
1099
|
+
const auto & tm = metadata[i];
|
|
792
1100
|
ggml_tensor * tensor = weight.tensor;
|
|
793
|
-
|
|
1101
|
+
|
|
1102
|
+
if (!params->dry_run && (weight.idx != cur_split && params->keep_split)) {
|
|
794
1103
|
close_ofstream();
|
|
795
1104
|
new_ofstream(weight.idx);
|
|
796
1105
|
}
|
|
797
1106
|
|
|
798
1107
|
const std::string name = ggml_get_name(tensor);
|
|
1108
|
+
const size_t tensor_size = ggml_nbytes(tensor);
|
|
799
1109
|
|
|
800
|
-
if (!
|
|
801
|
-
if (
|
|
802
|
-
read_data.
|
|
1110
|
+
if (!params->dry_run) {
|
|
1111
|
+
if (!ml.use_mmap) {
|
|
1112
|
+
if (read_data.size() < tensor_size) {
|
|
1113
|
+
read_data.resize(tensor_size);
|
|
1114
|
+
}
|
|
1115
|
+
tensor->data = read_data.data();
|
|
803
1116
|
}
|
|
804
|
-
|
|
1117
|
+
ml.load_data_for(tensor);
|
|
805
1118
|
}
|
|
806
|
-
ml.load_data_for(tensor);
|
|
807
1119
|
|
|
808
|
-
LLAMA_LOG_INFO("[%4d/%4d]
|
|
1120
|
+
LLAMA_LOG_INFO("[%4d/%4d] %-36s - [%s], type = %6s, ",
|
|
809
1121
|
++idx, ml.n_tensors,
|
|
810
1122
|
ggml_get_name(tensor),
|
|
811
1123
|
llama_format_tensor_shape(tensor).c_str(),
|
|
812
1124
|
ggml_type_name(tensor->type));
|
|
813
1125
|
|
|
814
|
-
|
|
815
|
-
|
|
816
|
-
|
|
817
|
-
// quantize
|
|
818
|
-
|
|
819
|
-
|
|
820
|
-
|
|
821
|
-
quantize &= name.find("_norm.weight") == std::string::npos;
|
|
822
|
-
|
|
823
|
-
quantize &= params->quantize_output_tensor || name != "output.weight";
|
|
824
|
-
quantize &= !params->only_copy;
|
|
825
|
-
|
|
826
|
-
// do not quantize expert gating tensors
|
|
827
|
-
// NOTE: can't use LLM_TN here because the layer number is not known
|
|
828
|
-
quantize &= name.find("ffn_gate_inp.weight") == std::string::npos;
|
|
829
|
-
|
|
830
|
-
// these are very small (e.g. 4x4)
|
|
831
|
-
quantize &= name.find("altup") == std::string::npos;
|
|
832
|
-
quantize &= name.find("laurel") == std::string::npos;
|
|
833
|
-
|
|
834
|
-
// these are not too big so keep them as it is
|
|
835
|
-
quantize &= name.find("per_layer_model_proj") == std::string::npos;
|
|
836
|
-
|
|
837
|
-
// do not quantize positional embeddings and token types (BERT)
|
|
838
|
-
quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_POS_EMBD, "weight");
|
|
839
|
-
quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_TOKEN_TYPES, "weight");
|
|
840
|
-
|
|
841
|
-
// do not quantize Mamba's small yet 2D weights
|
|
842
|
-
// NOTE: can't use LLM_TN here because the layer number is not known
|
|
843
|
-
quantize &= name.find("ssm_conv1d.weight") == std::string::npos;
|
|
844
|
-
quantize &= name.find("shortconv.conv.weight") == std::string::npos;
|
|
845
|
-
|
|
846
|
-
// do not quantize RWKV's small yet 2D weights
|
|
847
|
-
quantize &= name.find("time_mix_first.weight") == std::string::npos;
|
|
848
|
-
quantize &= name.find("time_mix_w0.weight") == std::string::npos;
|
|
849
|
-
quantize &= name.find("time_mix_w1.weight") == std::string::npos;
|
|
850
|
-
quantize &= name.find("time_mix_w2.weight") == std::string::npos;
|
|
851
|
-
quantize &= name.find("time_mix_v0.weight") == std::string::npos;
|
|
852
|
-
quantize &= name.find("time_mix_v1.weight") == std::string::npos;
|
|
853
|
-
quantize &= name.find("time_mix_v2.weight") == std::string::npos;
|
|
854
|
-
quantize &= name.find("time_mix_a0.weight") == std::string::npos;
|
|
855
|
-
quantize &= name.find("time_mix_a1.weight") == std::string::npos;
|
|
856
|
-
quantize &= name.find("time_mix_a2.weight") == std::string::npos;
|
|
857
|
-
quantize &= name.find("time_mix_g1.weight") == std::string::npos;
|
|
858
|
-
quantize &= name.find("time_mix_g2.weight") == std::string::npos;
|
|
859
|
-
quantize &= name.find("time_mix_decay_w1.weight") == std::string::npos;
|
|
860
|
-
quantize &= name.find("time_mix_decay_w2.weight") == std::string::npos;
|
|
861
|
-
quantize &= name.find("time_mix_lerp_fused.weight") == std::string::npos;
|
|
862
|
-
|
|
863
|
-
// do not quantize relative position bias (T5)
|
|
864
|
-
quantize &= name.find("attn_rel_b.weight") == std::string::npos;
|
|
865
|
-
|
|
866
|
-
// do not quantize specific multimodal tensors
|
|
867
|
-
quantize &= name.find(".position_embd.") == std::string::npos;
|
|
868
|
-
|
|
869
|
-
ggml_type new_type;
|
|
1126
|
+
const ggml_type cur_type = tensor->type;
|
|
1127
|
+
const ggml_type new_type = tm.target_type;
|
|
1128
|
+
|
|
1129
|
+
// If we've decided to quantize to the same type the tensor is already
|
|
1130
|
+
// in then there's nothing to do.
|
|
1131
|
+
bool quantize = cur_type != new_type;
|
|
1132
|
+
|
|
870
1133
|
void * new_data;
|
|
871
1134
|
size_t new_size;
|
|
872
1135
|
|
|
873
|
-
if (
|
|
874
|
-
|
|
875
|
-
|
|
876
|
-
|
|
877
|
-
|
|
878
|
-
|
|
879
|
-
|
|
880
|
-
|
|
881
|
-
if (
|
|
882
|
-
|
|
883
|
-
const std::string tensor_name(tensor->name);
|
|
884
|
-
for (const auto & [tname, qtype] : tensor_types) {
|
|
885
|
-
if (std::regex pattern(tname); std::regex_search(tensor_name, pattern)) {
|
|
886
|
-
if (qtype != new_type) {
|
|
887
|
-
LLAMA_LOG_DEBUG("(overriding %s) ", ggml_type_name(new_type));
|
|
888
|
-
new_type = qtype; // if two or more types are specified for the same tensor, the last match wins
|
|
889
|
-
}
|
|
890
|
-
}
|
|
891
|
-
}
|
|
1136
|
+
if (params->dry_run) {
|
|
1137
|
+
// the --dry-run option calculates the final quantization size without quantizing
|
|
1138
|
+
if (quantize) {
|
|
1139
|
+
new_size = ggml_nrows(tensor) * ggml_row_size(new_type, tensor->ne[0]);
|
|
1140
|
+
LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB (%s)\n",
|
|
1141
|
+
tensor_size/1024.0/1024.0,
|
|
1142
|
+
new_size/1024.0/1024.0,
|
|
1143
|
+
ggml_type_name(new_type));
|
|
1144
|
+
if (!will_require_imatrix && tm.requires_imatrix) {
|
|
1145
|
+
will_require_imatrix = true;
|
|
892
1146
|
}
|
|
1147
|
+
} else {
|
|
1148
|
+
new_size = tensor_size;
|
|
1149
|
+
LLAMA_LOG_INFO("size = %8.3f MiB\n", new_size/1024.0/1024.0);
|
|
893
1150
|
}
|
|
894
|
-
|
|
895
|
-
|
|
896
|
-
|
|
897
|
-
if (params->output_tensor_type < GGML_TYPE_COUNT && strcmp(tensor->name, "output.weight") == 0) {
|
|
898
|
-
new_type = params->output_tensor_type;
|
|
899
|
-
}
|
|
900
|
-
|
|
901
|
-
// If we've decided to quantize to the same type the tensor is already
|
|
902
|
-
// in then there's nothing to do.
|
|
903
|
-
quantize = tensor->type != new_type;
|
|
904
|
-
}
|
|
905
|
-
|
|
906
|
-
if (!quantize) {
|
|
907
|
-
new_type = tensor->type;
|
|
908
|
-
new_data = tensor->data;
|
|
909
|
-
new_size = ggml_nbytes(tensor);
|
|
910
|
-
LLAMA_LOG_INFO("size = %8.3f MiB\n", ggml_nbytes(tensor)/1024.0/1024.0);
|
|
1151
|
+
total_size_org += tensor_size;
|
|
1152
|
+
total_size_new += new_size;
|
|
1153
|
+
continue;
|
|
911
1154
|
} else {
|
|
912
|
-
|
|
1155
|
+
// no --dry-run, perform quantization
|
|
1156
|
+
if (!quantize) {
|
|
1157
|
+
new_data = tensor->data;
|
|
1158
|
+
new_size = tensor_size;
|
|
1159
|
+
LLAMA_LOG_INFO("size = %8.3f MiB\n", tensor_size/1024.0/1024.0);
|
|
1160
|
+
} else {
|
|
1161
|
+
const int64_t nelements = ggml_nelements(tensor);
|
|
913
1162
|
|
|
914
|
-
|
|
915
|
-
|
|
916
|
-
|
|
917
|
-
|
|
918
|
-
|
|
919
|
-
} else {
|
|
920
|
-
if (it->second.size() == (size_t)tensor->ne[0]*tensor->ne[2]) {
|
|
921
|
-
imatrix = it->second.data();
|
|
1163
|
+
const float * imatrix = nullptr;
|
|
1164
|
+
if (imatrix_data) {
|
|
1165
|
+
auto it = imatrix_data->find(tm.remapped_imatrix_name);
|
|
1166
|
+
if (it == imatrix_data->end()) {
|
|
1167
|
+
LLAMA_LOG_INFO("\n====== %s: did not find weights for %s\n", __func__, tensor->name);
|
|
922
1168
|
} else {
|
|
923
|
-
|
|
924
|
-
|
|
925
|
-
|
|
926
|
-
|
|
927
|
-
|
|
928
|
-
|
|
929
|
-
|
|
930
|
-
|
|
931
|
-
|
|
932
|
-
|
|
1169
|
+
if (it->second.size() == (size_t)tensor->ne[0]*tensor->ne[2]) {
|
|
1170
|
+
imatrix = it->second.data();
|
|
1171
|
+
} else {
|
|
1172
|
+
LLAMA_LOG_INFO("\n====== %s: imatrix size %d is different from tensor size %d for %s\n", __func__,
|
|
1173
|
+
int(it->second.size()), int(tensor->ne[0]*tensor->ne[2]), tensor->name);
|
|
1174
|
+
|
|
1175
|
+
// this can happen when quantizing an old mixtral model with split tensors with a new incompatible imatrix
|
|
1176
|
+
// this is a significant error and it may be good idea to abort the process if this happens,
|
|
1177
|
+
// since many people will miss the error and not realize that most of the model is being quantized without an imatrix
|
|
1178
|
+
// tok_embd should be ignored in this case, since it always causes this warning
|
|
1179
|
+
if (!tensor_name_match_token_embd(tensor->name)) {
|
|
1180
|
+
throw std::runtime_error(format("imatrix size %d is different from tensor size %d for %s",
|
|
1181
|
+
int(it->second.size()), int(tensor->ne[0]*tensor->ne[2]), tensor->name));
|
|
1182
|
+
}
|
|
933
1183
|
}
|
|
934
1184
|
}
|
|
935
1185
|
}
|
|
936
|
-
|
|
937
|
-
|
|
938
|
-
|
|
939
|
-
|
|
940
|
-
|
|
941
|
-
|
|
942
|
-
|
|
943
|
-
LLAMA_LOG_ERROR("\n\n============================================================\n");
|
|
944
|
-
LLAMA_LOG_ERROR("Missing importance matrix for tensor %s in a very low-bit quantization\n", tensor->name);
|
|
945
|
-
LLAMA_LOG_ERROR("The result will be garbage, so bailing out\n");
|
|
946
|
-
LLAMA_LOG_ERROR("============================================================\n\n");
|
|
947
|
-
throw std::runtime_error(format("Missing importance matrix for tensor %s in a very low-bit quantization", tensor->name));
|
|
948
|
-
}
|
|
1186
|
+
if (!imatrix && tm.requires_imatrix) {
|
|
1187
|
+
LLAMA_LOG_ERROR("\n\n============================================================\n");
|
|
1188
|
+
LLAMA_LOG_ERROR("Missing importance matrix for tensor %s in a very low-bit quantization\n", tensor->name);
|
|
1189
|
+
LLAMA_LOG_ERROR("The result will be garbage, so bailing out\n");
|
|
1190
|
+
LLAMA_LOG_ERROR("============================================================\n\n");
|
|
1191
|
+
throw std::runtime_error(format("Missing importance matrix for tensor %s in a very low-bit quantization", tensor->name));
|
|
1192
|
+
}
|
|
949
1193
|
|
|
950
|
-
|
|
1194
|
+
float * f32_data;
|
|
951
1195
|
|
|
952
|
-
|
|
953
|
-
|
|
954
|
-
|
|
955
|
-
|
|
956
|
-
|
|
957
|
-
|
|
958
|
-
|
|
959
|
-
|
|
1196
|
+
if (tensor->type == GGML_TYPE_F32) {
|
|
1197
|
+
f32_data = (float *) tensor->data;
|
|
1198
|
+
} else if (ggml_is_quantized(tensor->type) && !params->allow_requantize) {
|
|
1199
|
+
throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor->type)));
|
|
1200
|
+
} else {
|
|
1201
|
+
llama_tensor_dequantize_impl(tensor, f32_conv_buf, workers, nelements, nthread);
|
|
1202
|
+
f32_data = (float *) f32_conv_buf.data();
|
|
1203
|
+
}
|
|
960
1204
|
|
|
961
|
-
|
|
962
|
-
|
|
1205
|
+
LLAMA_LOG_INFO("converting to %s .. ", ggml_type_name(new_type));
|
|
1206
|
+
fflush(stdout);
|
|
963
1207
|
|
|
964
|
-
|
|
965
|
-
|
|
966
|
-
}
|
|
967
|
-
new_data = work.data();
|
|
968
|
-
|
|
969
|
-
const int64_t n_per_row = tensor->ne[0];
|
|
970
|
-
const int64_t nrows = tensor->ne[1];
|
|
971
|
-
|
|
972
|
-
static const int64_t min_chunk_size = 32 * 512;
|
|
973
|
-
const int64_t chunk_size = (n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row));
|
|
974
|
-
|
|
975
|
-
const int64_t nelements_matrix = tensor->ne[0] * tensor->ne[1];
|
|
976
|
-
const int64_t nchunk = (nelements_matrix + chunk_size - 1)/chunk_size;
|
|
977
|
-
const int64_t nthread_use = nthread > 1 ? std::max((int64_t)1, std::min((int64_t)nthread, nchunk)) : 1;
|
|
978
|
-
|
|
979
|
-
// quantize each expert separately since they have different importance matrices
|
|
980
|
-
new_size = 0;
|
|
981
|
-
for (int64_t i03 = 0; i03 < tensor->ne[2]; ++i03) {
|
|
982
|
-
const float * f32_data_03 = f32_data + i03 * nelements_matrix;
|
|
983
|
-
void * new_data_03 = (char *)new_data + ggml_row_size(new_type, n_per_row) * i03 * nrows;
|
|
984
|
-
const float * imatrix_03 = imatrix ? imatrix + i03 * n_per_row : nullptr;
|
|
985
|
-
|
|
986
|
-
new_size += llama_tensor_quantize_impl(new_type, f32_data_03, new_data_03, chunk_size, nrows, n_per_row, imatrix_03, workers, nthread_use);
|
|
987
|
-
|
|
988
|
-
// TODO: temporary sanity check that the F16 -> MXFP4 is lossless
|
|
989
|
-
#if 0
|
|
990
|
-
if (new_type == GGML_TYPE_MXFP4) {
|
|
991
|
-
auto * x = f32_data_03;
|
|
992
|
-
|
|
993
|
-
//LLAMA_LOG_INFO("nrows = %d, n_per_row = %d\n", nrows, n_per_row);
|
|
994
|
-
std::vector<float> deq(nrows*n_per_row);
|
|
995
|
-
const ggml_type_traits * qtype = ggml_get_type_traits(new_type);
|
|
996
|
-
qtype->to_float(new_data_03, deq.data(), deq.size());
|
|
997
|
-
|
|
998
|
-
double err = 0.0f;
|
|
999
|
-
for (int i = 0; i < (int) deq.size(); ++i) {
|
|
1000
|
-
err += fabsf(deq[i] - x[i]);
|
|
1001
|
-
//if (fabsf(deq[i] - x[i]) > 0.00001 && i < 256) {
|
|
1002
|
-
if (deq[i] != x[i]) {
|
|
1003
|
-
LLAMA_LOG_INFO("deq[%d] = %f, x[%d] = %f\n", i, deq[i], i, x[i]);
|
|
1004
|
-
}
|
|
1005
|
-
}
|
|
1006
|
-
//LLAMA_LOG_INFO("err = %f\n", err);
|
|
1007
|
-
GGML_ASSERT(err == 0.00000);
|
|
1208
|
+
if (work.size() < (size_t)nelements * 4) {
|
|
1209
|
+
work.resize(nelements * 4); // upper bound on size
|
|
1008
1210
|
}
|
|
1009
|
-
|
|
1010
|
-
|
|
1011
|
-
|
|
1012
|
-
|
|
1013
|
-
total_size_org += ggml_nbytes(tensor);
|
|
1014
|
-
total_size_new += new_size;
|
|
1211
|
+
new_data = work.data();
|
|
1212
|
+
|
|
1213
|
+
const int64_t n_per_row = tensor->ne[0];
|
|
1214
|
+
const int64_t nrows = tensor->ne[1];
|
|
1015
1215
|
|
|
1016
|
-
|
|
1017
|
-
|
|
1018
|
-
GGML_ASSERT(gguf_get_tensor_size(ctx_outs[cur_split].get(), gguf_find_tensor(ctx_outs[cur_split].get(), name.c_str())) == new_size);
|
|
1019
|
-
gguf_set_tensor_data(ctx_outs[cur_split].get(), name.c_str(), new_data);
|
|
1216
|
+
static const int64_t min_chunk_size = 32 * 512;
|
|
1217
|
+
const int64_t chunk_size = (n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row));
|
|
1020
1218
|
|
|
1021
|
-
|
|
1022
|
-
|
|
1023
|
-
|
|
1219
|
+
const int64_t nelements_matrix = tensor->ne[0] * tensor->ne[1];
|
|
1220
|
+
const int64_t nchunk = (nelements_matrix + chunk_size - 1)/chunk_size;
|
|
1221
|
+
const int64_t nthread_use = nthread > 1 ? std::max((int64_t)1, std::min((int64_t)nthread, nchunk)) : 1;
|
|
1222
|
+
|
|
1223
|
+
// quantize each expert separately since they have different importance matrices
|
|
1224
|
+
new_size = 0;
|
|
1225
|
+
for (int64_t i03 = 0; i03 < tensor->ne[2]; ++i03) {
|
|
1226
|
+
const float * f32_data_03 = f32_data + i03 * nelements_matrix;
|
|
1227
|
+
void * new_data_03 = (char *)new_data + ggml_row_size(new_type, n_per_row) * i03 * nrows;
|
|
1228
|
+
const float * imatrix_03 = imatrix ? imatrix + i03 * n_per_row : nullptr;
|
|
1229
|
+
|
|
1230
|
+
new_size += llama_tensor_quantize_impl(new_type, f32_data_03, new_data_03, chunk_size, nrows, n_per_row, imatrix_03, workers, nthread_use);
|
|
1231
|
+
}
|
|
1232
|
+
LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB\n", tensor_size/1024.0/1024.0, new_size/1024.0/1024.0);
|
|
1233
|
+
}
|
|
1234
|
+
total_size_org += tensor_size;
|
|
1235
|
+
total_size_new += new_size;
|
|
1236
|
+
|
|
1237
|
+
// update the gguf meta data as we go
|
|
1238
|
+
gguf_set_tensor_type(ctx_outs[cur_split].get(), name.c_str(), new_type);
|
|
1239
|
+
GGML_ASSERT(gguf_get_tensor_size(ctx_outs[cur_split].get(), gguf_find_tensor(ctx_outs[cur_split].get(), name.c_str())) == new_size);
|
|
1240
|
+
gguf_set_tensor_data(ctx_outs[cur_split].get(), name.c_str(), new_data);
|
|
1241
|
+
|
|
1242
|
+
// write tensor data + padding
|
|
1243
|
+
fout.write((const char *) new_data, new_size);
|
|
1244
|
+
zeros(fout, GGML_PAD(new_size, align) - new_size);
|
|
1245
|
+
} // no --dry-run
|
|
1246
|
+
} // main loop
|
|
1247
|
+
|
|
1248
|
+
if (!params->dry_run) {
|
|
1249
|
+
close_ofstream();
|
|
1024
1250
|
}
|
|
1025
|
-
close_ofstream();
|
|
1026
1251
|
|
|
1027
|
-
LLAMA_LOG_INFO("%s: model size = %8.2f MiB\n", __func__, total_size_org/1024.0/1024.0);
|
|
1028
|
-
LLAMA_LOG_INFO("%s: quant size = %8.2f MiB\n", __func__, total_size_new/1024.0/1024.0);
|
|
1252
|
+
LLAMA_LOG_INFO("%s: model size = %8.2f MiB (%.2f BPW)\n", __func__, total_size_org/1024.0/1024.0, total_size_org*8.0/ml.n_elements);
|
|
1253
|
+
LLAMA_LOG_INFO("%s: quant size = %8.2f MiB (%.2f BPW)\n", __func__, total_size_new/1024.0/1024.0, total_size_new*8.0/ml.n_elements);
|
|
1254
|
+
|
|
1255
|
+
if (!params->imatrix && params->dry_run && will_require_imatrix) {
|
|
1256
|
+
LLAMA_LOG_WARN("%s: WARNING: dry run completed successfully, but actually completing this quantization will require an imatrix!\n",
|
|
1257
|
+
__func__
|
|
1258
|
+
);
|
|
1259
|
+
}
|
|
1029
1260
|
|
|
1030
1261
|
if (qs.n_fallback > 0) {
|
|
1031
1262
|
LLAMA_LOG_WARN("%s: WARNING: %d of %d tensor(s) required fallback quantization\n",
|
|
1032
|
-
__func__, qs.n_fallback,
|
|
1263
|
+
__func__, qs.n_fallback, ml.n_tensors);
|
|
1033
1264
|
}
|
|
1034
1265
|
}
|
|
1035
1266
|
|
|
@@ -1048,6 +1279,7 @@ llama_model_quantize_params llama_model_quantize_default_params() {
|
|
|
1048
1279
|
/*.only_copy =*/ false,
|
|
1049
1280
|
/*.pure =*/ false,
|
|
1050
1281
|
/*.keep_split =*/ false,
|
|
1282
|
+
/*.dry_run =*/ false,
|
|
1051
1283
|
/*.imatrix =*/ nullptr,
|
|
1052
1284
|
/*.kv_overrides =*/ nullptr,
|
|
1053
1285
|
/*.tensor_type =*/ nullptr,
|