whispercpp 1.3.5 → 1.3.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/LICENSE +1 -1
- data/README.md +99 -2
- data/ext/extconf.rb +1 -0
- data/ext/ruby_whisper.c +20 -4
- data/ext/ruby_whisper.h +30 -2
- data/ext/ruby_whisper_context.c +216 -124
- data/ext/ruby_whisper_context_params.c +163 -0
- data/ext/ruby_whisper_model.c +0 -1
- data/ext/ruby_whisper_params.c +0 -1
- data/ext/ruby_whisper_segment.c +0 -1
- data/ext/ruby_whisper_token.c +29 -9
- data/ext/ruby_whisper_transcribe.cpp +4 -1
- data/ext/ruby_whisper_vad_context.c +48 -1
- data/ext/ruby_whisper_vad_context_detect.cpp +6 -5
- data/ext/ruby_whisper_vad_params.c +0 -1
- data/ext/ruby_whisper_vad_segment.c +0 -1
- data/ext/ruby_whisper_vad_segments.c +0 -1
- data/ext/sources/CMakeLists.txt +1 -1
- data/ext/sources/bindings/javascript/package.json +1 -1
- data/ext/sources/cmake/whisper-config.cmake.in +5 -40
- data/ext/sources/examples/bench/bench.cpp +23 -18
- data/ext/sources/examples/cli/cli.cpp +8 -0
- data/ext/sources/examples/common-ggml.cpp +2 -0
- data/ext/sources/examples/miniaudio.h +4507 -2131
- data/ext/sources/examples/server/server.cpp +18 -4
- data/ext/sources/examples/talk-llama/CMakeLists.txt +3 -2
- data/ext/sources/examples/talk-llama/llama-adapter.cpp +7 -13
- data/ext/sources/examples/talk-llama/llama-adapter.h +4 -3
- data/ext/sources/examples/talk-llama/llama-arch.cpp +335 -17
- data/ext/sources/examples/talk-llama/llama-arch.h +42 -0
- data/ext/sources/examples/talk-llama/llama-batch.cpp +3 -1
- data/ext/sources/examples/talk-llama/llama-chat.cpp +21 -1
- data/ext/sources/examples/talk-llama/llama-chat.h +1 -0
- data/ext/sources/examples/talk-llama/llama-context.cpp +508 -520
- data/ext/sources/examples/talk-llama/llama-context.h +27 -28
- data/ext/sources/examples/talk-llama/llama-cparams.h +5 -0
- data/ext/sources/examples/talk-llama/llama-ext.h +12 -0
- data/ext/sources/examples/talk-llama/llama-grammar.cpp +8 -8
- data/ext/sources/examples/talk-llama/llama-graph.cpp +583 -130
- data/ext/sources/examples/talk-llama/llama-graph.h +131 -10
- data/ext/sources/examples/talk-llama/llama-hparams.cpp +57 -40
- data/ext/sources/examples/talk-llama/llama-hparams.h +79 -10
- data/ext/sources/examples/talk-llama/llama-impl.cpp +4 -4
- data/ext/sources/examples/talk-llama/llama-impl.h +13 -1
- data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.cpp +3 -1
- data/ext/sources/examples/talk-llama/llama-kv-cache.cpp +274 -89
- data/ext/sources/examples/talk-llama/llama-kv-cache.h +2 -3
- data/ext/sources/examples/talk-llama/llama-memory-hybrid-iswa.cpp +275 -0
- data/ext/sources/examples/talk-llama/llama-memory-hybrid-iswa.h +140 -0
- data/ext/sources/examples/talk-llama/llama-memory-recurrent.cpp +11 -13
- data/ext/sources/examples/talk-llama/llama-mmap.cpp +28 -11
- data/ext/sources/examples/talk-llama/llama-model-loader.cpp +527 -119
- data/ext/sources/examples/talk-llama/llama-model-loader.h +35 -5
- data/ext/sources/examples/talk-llama/llama-model-saver.cpp +60 -46
- data/ext/sources/examples/talk-llama/llama-model-saver.h +5 -2
- data/ext/sources/examples/talk-llama/llama-model.cpp +1365 -647
- data/ext/sources/examples/talk-llama/llama-model.h +72 -19
- data/ext/sources/examples/talk-llama/llama-quant.cpp +578 -346
- data/ext/sources/examples/talk-llama/{llama-sampling.cpp → llama-sampler.cpp} +190 -76
- data/ext/sources/examples/talk-llama/{llama-sampling.h → llama-sampler.h} +0 -2
- data/ext/sources/examples/talk-llama/llama-vocab.cpp +118 -48
- data/ext/sources/examples/talk-llama/llama-vocab.h +5 -0
- data/ext/sources/examples/talk-llama/llama.cpp +76 -22
- data/ext/sources/examples/talk-llama/llama.h +63 -30
- data/ext/sources/examples/talk-llama/models/afmoe.cpp +2 -3
- data/ext/sources/examples/talk-llama/models/apertus.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/arcee.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/arctic.cpp +4 -5
- data/ext/sources/examples/talk-llama/models/baichuan.cpp +4 -3
- data/ext/sources/examples/talk-llama/models/bailingmoe.cpp +1 -2
- data/ext/sources/examples/talk-llama/models/bailingmoe2.cpp +3 -5
- data/ext/sources/examples/talk-llama/models/bert.cpp +13 -7
- data/ext/sources/examples/talk-llama/models/bitnet.cpp +9 -24
- data/ext/sources/examples/talk-llama/models/bloom.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/chameleon.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/chatglm.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/codeshell.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/cogvlm.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/cohere2-iswa.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/command-r.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/dbrx.cpp +4 -5
- data/ext/sources/examples/talk-llama/models/deci.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/deepseek.cpp +4 -6
- data/ext/sources/examples/talk-llama/models/deepseek2.cpp +24 -21
- data/ext/sources/examples/talk-llama/models/delta-net-base.cpp +445 -0
- data/ext/sources/examples/talk-llama/models/dots1.cpp +4 -6
- data/ext/sources/examples/talk-llama/models/dream.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/ernie4-5-moe.cpp +4 -6
- data/ext/sources/examples/talk-llama/models/ernie4-5.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/eurobert.cpp +97 -0
- data/ext/sources/examples/talk-llama/models/exaone-moe.cpp +145 -0
- data/ext/sources/examples/talk-llama/models/exaone.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/exaone4.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/falcon-h1.cpp +2 -4
- data/ext/sources/examples/talk-llama/models/falcon.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/gemma-embedding.cpp +1 -1
- data/ext/sources/examples/talk-llama/models/gemma.cpp +1 -1
- data/ext/sources/examples/talk-llama/models/gemma2-iswa.cpp +1 -1
- data/ext/sources/examples/talk-llama/models/gemma3.cpp +1 -1
- data/ext/sources/examples/talk-llama/models/gemma3n-iswa.cpp +7 -7
- data/ext/sources/examples/talk-llama/models/glm4-moe.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/glm4.cpp +14 -7
- data/ext/sources/examples/talk-llama/models/gpt2.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/gptneox.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/granite-hybrid.cpp +4 -5
- data/ext/sources/examples/talk-llama/models/granite.cpp +4 -5
- data/ext/sources/examples/talk-llama/models/grok.cpp +4 -4
- data/ext/sources/examples/talk-llama/models/grovemoe.cpp +5 -7
- data/ext/sources/examples/talk-llama/models/hunyuan-dense.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/hunyuan-moe.cpp +4 -5
- data/ext/sources/examples/talk-llama/models/internlm2.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/jais.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/jais2.cpp +123 -0
- data/ext/sources/examples/talk-llama/models/jamba.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/kimi-linear.cpp +381 -0
- data/ext/sources/examples/talk-llama/models/lfm2.cpp +145 -124
- data/ext/sources/examples/talk-llama/models/llada-moe.cpp +4 -4
- data/ext/sources/examples/talk-llama/models/llada.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/llama-iswa.cpp +4 -4
- data/ext/sources/examples/talk-llama/models/llama.cpp +18 -11
- data/ext/sources/examples/talk-llama/models/maincoder.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/{graph-context-mamba.cpp → mamba-base.cpp} +9 -3
- data/ext/sources/examples/talk-llama/models/mamba.cpp +1 -2
- data/ext/sources/examples/talk-llama/models/mimo2-iswa.cpp +11 -5
- data/ext/sources/examples/talk-llama/models/minicpm3.cpp +14 -13
- data/ext/sources/examples/talk-llama/models/minimax-m2.cpp +4 -5
- data/ext/sources/examples/talk-llama/models/mistral3.cpp +4 -4
- data/ext/sources/examples/talk-llama/models/models.h +181 -46
- data/ext/sources/examples/talk-llama/models/modern-bert.cpp +2 -9
- data/ext/sources/examples/talk-llama/models/mpt.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/nemotron-h.cpp +26 -14
- data/ext/sources/examples/talk-llama/models/nemotron.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/neo-bert.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/olmo.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/olmo2.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/olmoe.cpp +4 -4
- data/ext/sources/examples/talk-llama/models/openai-moe-iswa.cpp +1 -1
- data/ext/sources/examples/talk-llama/models/openelm.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/orion.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/paddleocr.cpp +122 -0
- data/ext/sources/examples/talk-llama/models/pangu-embedded.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/phi2.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/phi3.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/plamo.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/plamo2.cpp +9 -5
- data/ext/sources/examples/talk-llama/models/plamo3.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/plm.cpp +15 -14
- data/ext/sources/examples/talk-llama/models/qwen.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/qwen2.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/qwen2moe.cpp +4 -4
- data/ext/sources/examples/talk-llama/models/qwen2vl.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/qwen3.cpp +12 -9
- data/ext/sources/examples/talk-llama/models/qwen35.cpp +381 -0
- data/ext/sources/examples/talk-llama/models/qwen35moe.cpp +422 -0
- data/ext/sources/examples/talk-llama/models/qwen3moe.cpp +15 -8
- data/ext/sources/examples/talk-llama/models/qwen3next.cpp +84 -432
- data/ext/sources/examples/talk-llama/models/qwen3vl-moe.cpp +9 -18
- data/ext/sources/examples/talk-llama/models/qwen3vl.cpp +8 -17
- data/ext/sources/examples/talk-llama/models/refact.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/rnd1.cpp +4 -4
- data/ext/sources/examples/talk-llama/models/rwkv6-base.cpp +2 -0
- data/ext/sources/examples/talk-llama/models/rwkv7-base.cpp +2 -0
- data/ext/sources/examples/talk-llama/models/seed-oss.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/smallthinker.cpp +4 -4
- data/ext/sources/examples/talk-llama/models/smollm3.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/stablelm.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/starcoder.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/starcoder2.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/step35-iswa.cpp +165 -0
- data/ext/sources/examples/talk-llama/models/t5-dec.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/t5-enc.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/xverse.cpp +3 -3
- data/ext/sources/examples/talk-llama/unicode.cpp +21 -65
- data/ext/sources/ggml/CMakeLists.txt +9 -3
- data/ext/sources/ggml/include/ggml-backend.h +1 -1
- data/ext/sources/ggml/include/ggml-cann.h +1 -1
- data/ext/sources/ggml/include/ggml-cpu.h +5 -0
- data/ext/sources/ggml/include/ggml-openvino.h +37 -0
- data/ext/sources/ggml/include/ggml-opt.h +1 -1
- data/ext/sources/ggml/include/ggml-rpc.h +6 -1
- data/ext/sources/ggml/include/ggml-virtgpu.h +14 -0
- data/ext/sources/ggml/include/ggml.h +56 -9
- data/ext/sources/ggml/src/CMakeLists.txt +3 -0
- data/ext/sources/ggml/src/ggml-alloc.c +4 -9
- data/ext/sources/ggml/src/ggml-backend-dl.cpp +48 -0
- data/ext/sources/ggml/src/ggml-backend-dl.h +45 -0
- data/ext/sources/ggml/src/ggml-backend-reg.cpp +28 -86
- data/ext/sources/ggml/src/ggml-backend.cpp +5 -2
- data/ext/sources/ggml/src/ggml-blas/CMakeLists.txt +1 -1
- data/ext/sources/ggml/src/ggml-blas/ggml-blas.cpp +6 -2
- data/ext/sources/ggml/src/ggml-cann/acl_tensor.cpp +1 -1
- data/ext/sources/ggml/src/ggml-cann/acl_tensor.h +1 -1
- data/ext/sources/ggml/src/ggml-cann/aclnn_ops.cpp +348 -189
- data/ext/sources/ggml/src/ggml-cann/aclnn_ops.h +40 -85
- data/ext/sources/ggml/src/ggml-cann/common.h +3 -4
- data/ext/sources/ggml/src/ggml-cann/ggml-cann.cpp +44 -62
- data/ext/sources/ggml/src/ggml-common.h +11 -0
- data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +16 -11
- data/ext/sources/ggml/src/ggml-cpu/amx/amx.cpp +42 -19
- data/ext/sources/ggml/src/ggml-cpu/amx/common.h +34 -10
- data/ext/sources/ggml/src/ggml-cpu/amx/mmq.cpp +85 -85
- data/ext/sources/ggml/src/ggml-cpu/arch/arm/quants.c +85 -1
- data/ext/sources/ggml/src/ggml-cpu/arch/arm/repack.cpp +2744 -548
- data/ext/sources/ggml/src/ggml-cpu/arch/riscv/quants.c +1653 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/riscv/repack.cpp +1391 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/s390/quants.c +8 -10
- data/ext/sources/ggml/src/ggml-cpu/arch/x86/quants.c +9 -9
- data/ext/sources/ggml/src/ggml-cpu/arch/x86/repack.cpp +118 -18
- data/ext/sources/ggml/src/ggml-cpu/arch-fallback.h +107 -26
- data/ext/sources/ggml/src/ggml-cpu/binary-ops.cpp +2 -6
- data/ext/sources/ggml/src/ggml-cpu/common.h +8 -0
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-impl.h +3 -0
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +59 -12
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.cpp +15 -0
- data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.cpp +21 -20
- data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +965 -252
- data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +584 -197
- data/ext/sources/ggml/src/ggml-cpu/ops.cpp +903 -188
- data/ext/sources/ggml/src/ggml-cpu/ops.h +1 -0
- data/ext/sources/ggml/src/ggml-cpu/quants.c +40 -0
- data/ext/sources/ggml/src/ggml-cpu/quants.h +3 -0
- data/ext/sources/ggml/src/ggml-cpu/repack.cpp +2890 -679
- data/ext/sources/ggml/src/ggml-cpu/repack.h +119 -8
- data/ext/sources/ggml/src/ggml-cpu/simd-gemm.h +136 -0
- data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +111 -3
- data/ext/sources/ggml/src/ggml-cpu/unary-ops.cpp +1 -1
- data/ext/sources/ggml/src/ggml-cpu/vec.cpp +17 -0
- data/ext/sources/ggml/src/ggml-cuda/CMakeLists.txt +1 -1
- data/ext/sources/ggml/src/ggml-cuda/argsort.cu +19 -10
- data/ext/sources/ggml/src/ggml-cuda/binbcast.cu +32 -30
- data/ext/sources/ggml/src/ggml-cuda/common.cuh +134 -18
- data/ext/sources/ggml/src/ggml-cuda/convert.cu +41 -27
- data/ext/sources/ggml/src/ggml-cuda/cpy.cu +6 -3
- data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +78 -64
- data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +384 -143
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cuh +36 -22
- data/ext/sources/ggml/src/ggml-cuda/fattn-vec.cuh +3 -3
- data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +26 -5
- data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cuh +1 -1
- data/ext/sources/ggml/src/ggml-cuda/fattn.cu +127 -12
- data/ext/sources/ggml/src/ggml-cuda/gated_delta_net.cu +263 -0
- data/ext/sources/ggml/src/ggml-cuda/gated_delta_net.cuh +4 -0
- data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +595 -200
- data/ext/sources/ggml/src/ggml-cuda/mean.cu +9 -8
- data/ext/sources/ggml/src/ggml-cuda/mma.cuh +173 -6
- data/ext/sources/ggml/src/ggml-cuda/mmf.cu +30 -10
- data/ext/sources/ggml/src/ggml-cuda/mmf.cuh +158 -85
- data/ext/sources/ggml/src/ggml-cuda/mmq.cuh +34 -22
- data/ext/sources/ggml/src/ggml-cuda/mmvf.cu +127 -67
- data/ext/sources/ggml/src/ggml-cuda/mmvf.cuh +2 -0
- data/ext/sources/ggml/src/ggml-cuda/mmvq.cu +157 -65
- data/ext/sources/ggml/src/ggml-cuda/mmvq.cuh +1 -0
- data/ext/sources/ggml/src/ggml-cuda/norm.cu +18 -76
- data/ext/sources/ggml/src/ggml-cuda/pad.cu +13 -10
- data/ext/sources/ggml/src/ggml-cuda/quantize.cu +1 -1
- data/ext/sources/ggml/src/ggml-cuda/reduce_rows.cuh +2 -16
- data/ext/sources/ggml/src/ggml-cuda/rope.cu +233 -133
- data/ext/sources/ggml/src/ggml-cuda/softmax.cu +8 -83
- data/ext/sources/ggml/src/ggml-cuda/solve_tri.cu +1 -1
- data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cu +56 -32
- data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cuh +1 -1
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_32.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_4.cu +1 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_32.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_4.cu +1 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_4.cu +1 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_4.cu +1 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +3 -3
- data/ext/sources/ggml/src/ggml-cuda/top-k.cu +0 -1
- data/ext/sources/ggml/src/ggml-cuda/topk-moe.cu +199 -135
- data/ext/sources/ggml/src/ggml-cuda/topk-moe.cuh +20 -14
- data/ext/sources/ggml/src/ggml-cuda/unary.cu +55 -0
- data/ext/sources/ggml/src/ggml-cuda/unary.cuh +2 -0
- data/ext/sources/ggml/src/ggml-cuda/vecdotq.cuh +31 -17
- data/ext/sources/ggml/src/ggml-cuda/vendors/hip.h +10 -0
- data/ext/sources/ggml/src/ggml-hexagon/CMakeLists.txt +82 -45
- data/ext/sources/ggml/src/ggml-hexagon/ggml-hexagon.cpp +334 -160
- data/ext/sources/ggml/src/ggml-hexagon/htp/CMakeLists.txt +7 -5
- data/ext/sources/ggml/src/ggml-hexagon/htp/act-ops.c +328 -197
- data/ext/sources/ggml/src/ggml-hexagon/htp/argsort-ops.c +281 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/binary-ops.c +765 -234
- data/ext/sources/ggml/src/ggml-hexagon/htp/cpy-ops.c +252 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/flash-attn-ops.c +412 -265
- data/ext/sources/ggml/src/ggml-hexagon/htp/get-rows-ops.c +23 -23
- data/ext/sources/ggml/src/ggml-hexagon/htp/{htp-dma.c → hex-dma.c} +1 -1
- data/ext/sources/ggml/src/ggml-hexagon/htp/{htp-dma.h → hex-dma.h} +28 -3
- data/ext/sources/ggml/src/ggml-hexagon/htp/hex-dump.h +77 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hex-fastdiv.h +37 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hex-utils.h +51 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ctx.h +1 -1
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp-msg.h +27 -37
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ops.h +6 -35
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-arith.h +443 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-base.h +240 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-copy.h +245 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-div.h +251 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-dump.h +129 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-exp.h +215 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-floor.h +100 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-inverse.h +210 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-reduce.h +296 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-scale.h +133 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sigmoid.h +141 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sqrt.h +126 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-types.h +36 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-utils.h +20 -1347
- data/ext/sources/ggml/src/ggml-hexagon/htp/main.c +211 -13
- data/ext/sources/ggml/src/ggml-hexagon/htp/matmul-ops.c +1119 -952
- data/ext/sources/ggml/src/ggml-hexagon/htp/rope-ops.c +254 -244
- data/ext/sources/ggml/src/ggml-hexagon/htp/set-rows-ops.c +36 -36
- data/ext/sources/ggml/src/ggml-hexagon/htp/softmax-ops.c +155 -138
- data/ext/sources/ggml/src/ggml-hexagon/htp/ssm-conv.c +339 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/sum-rows-ops.c +128 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/unary-ops.c +209 -114
- data/ext/sources/ggml/src/ggml-hexagon/htp/worker-pool.c +1 -5
- data/ext/sources/ggml/src/ggml-hexagon/htp-drv.cpp +418 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp-drv.h +121 -0
- data/ext/sources/ggml/src/ggml-hexagon/libdl.h +79 -0
- data/ext/sources/ggml/src/ggml-hexagon/libggml-htp.inf +38 -0
- data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +6 -0
- data/ext/sources/ggml/src/ggml-impl.h +62 -0
- data/ext/sources/ggml/src/ggml-metal/CMakeLists.txt +10 -10
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-common.cpp +13 -2
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.h +8 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.m +147 -17
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.cpp +274 -73
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.h +22 -4
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.m +102 -36
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +174 -23
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.cpp +580 -280
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.h +5 -4
- data/ext/sources/ggml/src/ggml-metal/ggml-metal.cpp +320 -107
- data/ext/sources/ggml/src/ggml-metal/ggml-metal.metal +1068 -825
- data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +19 -1
- data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +3108 -636
- data/ext/sources/ggml/src/ggml-opencl/kernels/concat.cl +41 -99
- data/ext/sources/ggml/src/ggml-opencl/kernels/cpy.cl +45 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/cumsum.cl +139 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/cvt.cl +204 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/diag.cl +27 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/exp.cl +125 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/expm1.cl +87 -56
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q4_1_f32.cl +132 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_general_q8_0_f32.cl +195 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q4_1_f32.cl +283 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/l2_norm.cl +71 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mean.cl +114 -13
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q4_0_f32_l4_lm.cl +163 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q4_1_f32_l4_lm.cl +165 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q6_k_f32_l4_lm.cl +158 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_8x4.cl +129 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32.cl +219 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32_flat.cl +229 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_k_f32.cl +180 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/{mul_mv_q6_k.cl → mul_mv_q6_k_f32.cl} +4 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32_flat.cl +194 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/neg.cl +125 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/repeat.cl +31 -32
- data/ext/sources/ggml/src/ggml-opencl/kernels/scale.cl +14 -4
- data/ext/sources/ggml/src/ggml-opencl/kernels/softplus.cl +88 -60
- data/ext/sources/ggml/src/ggml-opencl/kernels/solve_tri.cl +51 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/sum_rows.cl +114 -13
- data/ext/sources/ggml/src/ggml-opencl/kernels/tanh.cl +94 -48
- data/ext/sources/ggml/src/ggml-opencl/kernels/transpose.cl +26 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/tri.cl +32 -0
- data/ext/sources/ggml/src/ggml-openvino/.clang-format +154 -0
- data/ext/sources/ggml/src/ggml-openvino/CMakeLists.txt +22 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-decoder.cpp +975 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-decoder.h +294 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-openvino-extra.cpp +373 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-openvino-extra.h +182 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-openvino.cpp +1110 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-quants.cpp +884 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-quants.h +153 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/decoder.h +74 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/frontend.cpp +27 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/frontend.h +23 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/input_model.cpp +17 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/input_model.h +29 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/node_context.h +112 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/cont.cpp +48 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/cpy.cpp +21 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp +90 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/get_rows.cpp +69 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/glu_geglu.cpp +61 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp +62 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/mulmat.cpp +90 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/permute.cpp +102 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/reshape.cpp +83 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp +46 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/rope.cpp +123 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/scale.cpp +41 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/set_rows.cpp +76 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/softmax.cpp +89 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/transpose.cpp +23 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/unary_silu.cpp +27 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/view.cpp +53 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op_table.cpp +46 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op_table.h +39 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.cpp +123 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.h +17 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.cpp +60 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.h +17 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/mark_decompression_convert_constant_folding.h +29 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/squeeze_matmul.cpp +58 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/squeeze_matmul.h +17 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/translate_session.cpp +293 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/translate_session.h +28 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/utils.cpp +226 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/utils.h +85 -0
- data/ext/sources/ggml/src/ggml-openvino/utils.cpp +823 -0
- data/ext/sources/ggml/src/ggml-openvino/utils.h +123 -0
- data/ext/sources/ggml/src/ggml-quants.c +96 -5
- data/ext/sources/ggml/src/ggml-quants.h +3 -0
- data/ext/sources/ggml/src/ggml-sycl/CMakeLists.txt +15 -88
- data/ext/sources/ggml/src/ggml-sycl/add-id.cpp +5 -1
- data/ext/sources/ggml/src/ggml-sycl/backend.hpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/binbcast.cpp +21 -20
- data/ext/sources/ggml/src/ggml-sycl/common.hpp +315 -10
- data/ext/sources/ggml/src/ggml-sycl/convert.cpp +69 -1
- data/ext/sources/ggml/src/ggml-sycl/convert.hpp +22 -1
- data/ext/sources/ggml/src/ggml-sycl/count-equal.cpp +1 -1
- data/ext/sources/ggml/src/ggml-sycl/dpct/helper.hpp +791 -47
- data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +78 -68
- data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +2 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn-common.hpp +1179 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn-tile.cpp +55 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn-tile.hpp +1338 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn-vec.hpp +667 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn.cpp +225 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn.hpp +22 -0
- data/ext/sources/ggml/src/ggml-sycl/gated_delta_net.cpp +309 -0
- data/ext/sources/ggml/src/ggml-sycl/gated_delta_net.hpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/ggml-sycl.cpp +316 -51
- data/ext/sources/ggml/src/ggml-sycl/norm.cpp +65 -66
- data/ext/sources/ggml/src/ggml-sycl/outprod.cpp +3 -3
- data/ext/sources/ggml/src/ggml-sycl/presets.hpp +3 -0
- data/ext/sources/ggml/src/ggml-sycl/quants.hpp +1 -1
- data/ext/sources/ggml/src/ggml-sycl/rope.cpp +450 -287
- data/ext/sources/ggml/src/ggml-sycl/rope.hpp +6 -0
- data/ext/sources/ggml/src/ggml-sycl/softmax.cpp +6 -6
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq112-dv112.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq128-dv128.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq256-dv256.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq40-dv40.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq576-dv512.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq64-dv64.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq72-dv72.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq80-dv80.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq96-dv96.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-f16.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q8_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-f16.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q8_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-f16.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q8_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-f16.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q8_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-f16.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q8_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-f16.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q8_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +13 -0
- data/ext/sources/ggml/src/ggml-sycl/wkv.cpp +1 -1
- data/ext/sources/ggml/src/ggml-virtgpu/CMakeLists.txt +70 -0
- data/ext/sources/ggml/src/ggml-virtgpu/apir_cs_ggml-rpc-front.cpp +87 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/CMakeLists.txt +21 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/apir_cs_ggml-rpc-back.cpp +115 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-convert.h +13 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-backend.cpp +102 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer-type.cpp +105 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer.cpp +179 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-device.cpp +148 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched.cpp +51 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched.gen.h +73 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched.h +27 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-virgl-apir.h +32 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend.cpp +144 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/api_remoting.h +95 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_backend.gen.h +94 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_backend.h +50 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_cs.h +378 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_cs_ggml.h +232 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_cs_rpc.h +58 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-buffer-type.cpp +81 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-buffer.cpp +119 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-device.cpp +158 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-reg.cpp +213 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend.cpp +69 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-remoting.h +71 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggmlremoting_functions.yaml +166 -0
- data/ext/sources/ggml/src/ggml-virtgpu/include/apir_hw.h +9 -0
- data/ext/sources/ggml/src/ggml-virtgpu/regenerate_remoting.py +333 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-apir.h +15 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-backend.cpp +58 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-buffer-type.cpp +110 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-buffer.cpp +173 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-device.cpp +192 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-impl.h +36 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward.gen.h +53 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-shm.cpp +98 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-shm.h +23 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-utils.cpp +179 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-utils.h +86 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu.cpp +544 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu.h +117 -0
- data/ext/sources/ggml/src/ggml-vulkan/CMakeLists.txt +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/ggml-vulkan.cpp +1250 -465
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/acc.comp +16 -8
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/elu.comp +27 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +374 -170
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.glsl +66 -22
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +389 -201
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +106 -58
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_mask_opt.comp +162 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +9 -8
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gated_delta_net.comp +128 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/l2_norm.comp +12 -9
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.glsl +20 -17
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +11 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +8 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_id_funcs.glsl +3 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +5 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.glsl +3 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +2 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl +36 -63
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +7 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +7 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +7 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl +10 -5
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_vision.comp +7 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sgn.comp +21 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/ssm_conv.comp +16 -10
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +55 -35
- data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp +1314 -109
- data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu.cpp +1660 -1371
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/argmax.wgsl +72 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/argsort.wgsl +106 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/argsort_merge.wgsl +134 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary.wgsl +141 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +65 -72
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/concat.wgsl +75 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cpy.tmpl.wgsl +6 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cumsum.wgsl +66 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +40 -5
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn.wgsl +105 -60
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{get_rows.tmpl.wgsl → get_rows.wgsl} +53 -259
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{mul_mat.tmpl.wgsl → mul_mat.wgsl} +68 -257
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl +692 -23
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{mul_mat_reg_tile.tmpl.wgsl → mul_mat_reg_tile.wgsl} +28 -128
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{mul_mat_subgroup_matrix.tmpl.wgsl → mul_mat_subgroup_matrix.wgsl} +31 -137
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.wgsl +480 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/pad.wgsl +86 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/repeat.wgsl +67 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{scale.tmpl.wgsl → scale.wgsl} +9 -36
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +40 -12
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/sum_rows.wgsl +55 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/unary.wgsl +193 -0
- data/ext/sources/ggml/src/ggml-zdnn/ggml-zdnn.cpp +6 -1
- data/ext/sources/ggml/src/ggml-zendnn/CMakeLists.txt +31 -32
- data/ext/sources/ggml/src/ggml-zendnn/ggml-zendnn.cpp +9 -6
- data/ext/sources/ggml/src/ggml.c +167 -33
- data/ext/sources/ggml/src/gguf.cpp +229 -44
- data/ext/sources/src/whisper.cpp +6 -28
- data/sig/whisper.rbs +43 -2
- data/test/test_context_params.rb +82 -0
- data/test/test_token.rb +11 -0
- data/test/test_vad_context.rb +58 -8
- data/test/test_whisper.rb +20 -0
- data/whispercpp.gemspec +1 -1
- metadata +240 -28
- data/ext/sources/ggml/cmake/BuildTypes.cmake +0 -54
- data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm-ppc.h +0 -333
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-exp.c +0 -94
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-inverse.c +0 -72
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sigmoid.c +0 -49
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-utils.c +0 -1020
- data/ext/sources/ggml/src/ggml-hexagon/htp/ops-utils.h +0 -149
- data/ext/sources/ggml/src/ggml-hexagon/htp-utils.c +0 -454
- data/ext/sources/ggml/src/ggml-hexagon/htp-utils.h +0 -221
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/bin_op.tmpl.wgsl +0 -188
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary_head.tmpl +0 -45
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.tmpl.wgsl +0 -267
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.tmpl.wgsl +0 -112
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/unary_op.wgsl +0 -483
|
@@ -48,6 +48,90 @@ static inline int nearest_int(float fval) {
|
|
|
48
48
|
|
|
49
49
|
extern "C" {
|
|
50
50
|
|
|
51
|
+
#if defined __riscv_zvfh
|
|
52
|
+
void ggml_quantize_mat_q8_0_4x1_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
|
|
53
|
+
assert(QK8_0 == 32);
|
|
54
|
+
assert(k % QK8_0 == 0);
|
|
55
|
+
const int nb = k / QK8_0;
|
|
56
|
+
|
|
57
|
+
block_q8_0x4 * GGML_RESTRICT y = (block_q8_0x4 *) vy;
|
|
58
|
+
|
|
59
|
+
// scalar
|
|
60
|
+
const int blck_size_interleave = 1;
|
|
61
|
+
float srcv[4][QK8_0];
|
|
62
|
+
float id[4];
|
|
63
|
+
|
|
64
|
+
for (int i = 0; i < nb; i++) {
|
|
65
|
+
for (int row_iter = 0; row_iter < 4; row_iter++) {
|
|
66
|
+
float amax = 0.0f; // absolute max
|
|
67
|
+
|
|
68
|
+
for (int j = 0; j < QK8_0; j++) {
|
|
69
|
+
srcv[row_iter][j] = x[row_iter * k + i * QK8_0 + j];
|
|
70
|
+
amax = MAX(amax, fabsf(srcv[row_iter][j]));
|
|
71
|
+
}
|
|
72
|
+
|
|
73
|
+
const float d = amax / ((1 << 7) - 1);
|
|
74
|
+
id[row_iter] = d ? 1.0f / d : 0.0f;
|
|
75
|
+
|
|
76
|
+
y[i].d[row_iter] = GGML_CPU_FP32_TO_FP16(d);
|
|
77
|
+
}
|
|
78
|
+
|
|
79
|
+
for (int j = 0; j < QK8_0 * 4; j++) {
|
|
80
|
+
int src_offset = (j / (4 * blck_size_interleave)) * blck_size_interleave;
|
|
81
|
+
int src_id = (j % (4 * blck_size_interleave)) / blck_size_interleave;
|
|
82
|
+
src_offset += (j % blck_size_interleave);
|
|
83
|
+
|
|
84
|
+
float x0 = srcv[src_id][src_offset] * id[src_id];
|
|
85
|
+
y[i].qs[j] = roundf(x0);
|
|
86
|
+
}
|
|
87
|
+
}
|
|
88
|
+
}
|
|
89
|
+
|
|
90
|
+
void ggml_quantize_mat_q8_K_4x1_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
|
|
91
|
+
assert(QK_K == 256);
|
|
92
|
+
assert(k % QK_K == 0);
|
|
93
|
+
const int nb = k / QK_K;
|
|
94
|
+
|
|
95
|
+
block_q8_Kx4 * GGML_RESTRICT y = (block_q8_Kx4 *) vy;
|
|
96
|
+
|
|
97
|
+
const int blck_size_interleave = 1;
|
|
98
|
+
float srcv[4][QK_K];
|
|
99
|
+
float iscale[4];
|
|
100
|
+
|
|
101
|
+
for (int i = 0; i < nb; i++) {
|
|
102
|
+
for (int row_iter = 0; row_iter < 4; row_iter++) {
|
|
103
|
+
float amax = 0.0f; // absolute max
|
|
104
|
+
float max = 0;
|
|
105
|
+
|
|
106
|
+
for (int j = 0; j < QK_K; j++) {
|
|
107
|
+
srcv[row_iter][j] = x[row_iter * k + i * QK_K + j];
|
|
108
|
+
// Update the maximum value of the corresponding super block
|
|
109
|
+
if(amax < fabsf(srcv[row_iter][j])) {
|
|
110
|
+
amax = fabsf(srcv[row_iter][j]);
|
|
111
|
+
max = srcv[row_iter][j];
|
|
112
|
+
}
|
|
113
|
+
}
|
|
114
|
+
|
|
115
|
+
iscale[row_iter] = amax ? -127.f/max : 0;
|
|
116
|
+
y[i].d[row_iter] = amax ? 1/iscale[row_iter] : 0;
|
|
117
|
+
}
|
|
118
|
+
|
|
119
|
+
for (int j = 0; j < QK_K / 4; j++) {
|
|
120
|
+
y[i].bsums[j] = 0;
|
|
121
|
+
}
|
|
122
|
+
for (int j = 0; j < QK_K * 4; j++) {
|
|
123
|
+
int src_id = j % 4;
|
|
124
|
+
int src_offset = j / 4;
|
|
125
|
+
int index = ((j >> 6) << 2) + (j & 3);
|
|
126
|
+
|
|
127
|
+
float x0 = srcv[src_id][src_offset] * iscale[src_id];
|
|
128
|
+
y[i].qs[j] = nearest_int(x0);
|
|
129
|
+
y[i].bsums[index] += y[i].qs[j];
|
|
130
|
+
}
|
|
131
|
+
}
|
|
132
|
+
}
|
|
133
|
+
#endif
|
|
134
|
+
|
|
51
135
|
void ggml_quantize_mat_q8_0_4x4_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
|
|
52
136
|
assert(QK8_0 == 32);
|
|
53
137
|
assert(k % QK8_0 == 0);
|
|
@@ -124,7 +208,6 @@ void ggml_quantize_mat_q8_0_4x8_generic(const float * GGML_RESTRICT x, void * GG
|
|
|
124
208
|
}
|
|
125
209
|
}
|
|
126
210
|
|
|
127
|
-
|
|
128
211
|
void ggml_quantize_mat_q8_K_4x4_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
|
|
129
212
|
assert(QK_K == 256);
|
|
130
213
|
assert(k % QK_K == 0);
|
|
@@ -256,192 +339,289 @@ template <> void ggml_quantize_mat_t<8, GGML_TYPE_Q8_K>(const float * GGML_RESTR
|
|
|
256
339
|
ggml_quantize_mat_q8_K_4x8(x, vy, n_per_row);
|
|
257
340
|
}
|
|
258
341
|
|
|
259
|
-
|
|
342
|
+
#if defined __riscv_zvfh
|
|
343
|
+
template <> void ggml_quantize_mat_t<1, GGML_TYPE_Q8_0>(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t nrow, int64_t n_per_row) {
|
|
344
|
+
assert(nrow == 4);
|
|
345
|
+
UNUSED(nrow);
|
|
346
|
+
ggml_quantize_mat_q8_0_4x1(x, vy, n_per_row);
|
|
347
|
+
}
|
|
260
348
|
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
349
|
+
template <> void ggml_quantize_mat_t<1, GGML_TYPE_Q8_K>(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t nrow, int64_t n_per_row) {
|
|
350
|
+
assert(nrow == 4);
|
|
351
|
+
UNUSED(nrow);
|
|
352
|
+
ggml_quantize_mat_q8_K_4x1(x, vy, n_per_row);
|
|
353
|
+
}
|
|
354
|
+
#endif
|
|
355
|
+
|
|
356
|
+
template <int M, int N>
|
|
357
|
+
static void ggml_gemv_q6_K_NxM_q8_K_generic_impl(int n,
|
|
358
|
+
float * GGML_RESTRICT s,
|
|
359
|
+
size_t bs,
|
|
360
|
+
const void * GGML_RESTRICT vx,
|
|
361
|
+
const void * GGML_RESTRICT vy,
|
|
362
|
+
int nr,
|
|
363
|
+
int nc) {
|
|
364
|
+
constexpr int blocklen = M;
|
|
365
|
+
constexpr int ncols_interleaved = N;
|
|
366
|
+
const int qk = QK_K;
|
|
367
|
+
const int nb = n / qk;
|
|
368
|
+
const int blocks_per_half = 64 / blocklen;
|
|
266
369
|
|
|
267
|
-
assert(nr == 1);
|
|
268
370
|
assert(n % qk == 0);
|
|
269
371
|
assert(nc % ncols_interleaved == 0);
|
|
270
372
|
|
|
271
|
-
UNUSED(s);
|
|
272
373
|
UNUSED(bs);
|
|
273
|
-
UNUSED(vx);
|
|
274
|
-
UNUSED(vy);
|
|
275
374
|
UNUSED(nr);
|
|
276
|
-
UNUSED(nc);
|
|
277
|
-
UNUSED(nb);
|
|
278
|
-
UNUSED(ncols_interleaved);
|
|
279
|
-
UNUSED(blocklen);
|
|
280
375
|
|
|
281
|
-
float sumf[
|
|
282
|
-
int sumi;
|
|
376
|
+
float sumf[8];
|
|
283
377
|
|
|
284
|
-
const
|
|
378
|
+
const block_q8_K * a_ptr = (const block_q8_K *) vy;
|
|
285
379
|
for (int x = 0; x < nc / ncols_interleaved; x++) {
|
|
286
|
-
const
|
|
380
|
+
const block_q6_Kx8 * b_ptr = (const block_q6_Kx8 *) vx + (x * nb);
|
|
381
|
+
|
|
382
|
+
for (int j = 0; j < ncols_interleaved; j++) {
|
|
383
|
+
sumf[j] = 0.0f;
|
|
384
|
+
}
|
|
287
385
|
|
|
288
|
-
for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
|
|
289
386
|
for (int l = 0; l < nb; l++) {
|
|
290
387
|
for (int k = 0; k < (qk / (2 * blocklen)); k++) {
|
|
388
|
+
const int base_l = (k / blocks_per_half) * 128 + (k % blocks_per_half) * blocklen;
|
|
389
|
+
const int base_h = base_l + 64;
|
|
390
|
+
|
|
391
|
+
const int scale_idx_l = base_l / 16;
|
|
392
|
+
const int scale_idx_h = base_h / 16;
|
|
393
|
+
|
|
394
|
+
const int qh_shift_l = ((base_l % 128) / 32) * 2;
|
|
395
|
+
const int qh_shift_h = ((base_h % 128) / 32) * 2;
|
|
396
|
+
|
|
397
|
+
const int qh_half_l = (base_l / 128) * 32;
|
|
398
|
+
const int qh_half_h = (base_h / 128) * 32;
|
|
399
|
+
|
|
291
400
|
for (int j = 0; j < ncols_interleaved; j++) {
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
401
|
+
const int8_t scale_l = b_ptr[l].scales[scale_idx_l * ncols_interleaved + j];
|
|
402
|
+
const int8_t scale_h = b_ptr[l].scales[scale_idx_h * ncols_interleaved + j];
|
|
403
|
+
|
|
404
|
+
int sumi_l = 0;
|
|
405
|
+
int sumi_h = 0;
|
|
406
|
+
|
|
407
|
+
for (int i = 0; i < blocklen; i++) {
|
|
408
|
+
const int ql_pos = k * ncols_interleaved * blocklen + j * blocklen + i;
|
|
409
|
+
const int l_4 = b_ptr[l].ql[ql_pos] & 0xF;
|
|
410
|
+
const int hi_4 = (b_ptr[l].ql[ql_pos] >> 4) & 0xF;
|
|
411
|
+
|
|
412
|
+
const int qh_idx_l = qh_half_l + ((base_l + i) % 32);
|
|
413
|
+
const int qh_chunk_l = qh_idx_l / blocklen;
|
|
414
|
+
const int qh_pos_l = qh_idx_l % blocklen;
|
|
415
|
+
const int qh_offset_l = qh_chunk_l * (blocklen * ncols_interleaved) + j * blocklen + qh_pos_l;
|
|
416
|
+
const int hi_2_l = (b_ptr[l].qh[qh_offset_l] >> qh_shift_l) & 0x3;
|
|
417
|
+
|
|
418
|
+
const int qh_idx_h = qh_half_h + ((base_h + i) % 32);
|
|
419
|
+
const int qh_chunk_h = qh_idx_h / blocklen;
|
|
420
|
+
const int qh_pos_h = qh_idx_h % blocklen;
|
|
421
|
+
const int qh_offset_h = qh_chunk_h * (blocklen * ncols_interleaved) + j * blocklen + qh_pos_h;
|
|
422
|
+
const int hi_2_h = (b_ptr[l].qh[qh_offset_h] >> qh_shift_h) & 0x3;
|
|
423
|
+
|
|
424
|
+
const int q_l = ((hi_2_l << 4) | l_4) - 32;
|
|
425
|
+
const int q_h = ((hi_2_h << 4) | hi_4) - 32;
|
|
426
|
+
|
|
427
|
+
const int8_t a_l = a_ptr[l].qs[base_l + i];
|
|
428
|
+
const int8_t a_h = a_ptr[l].qs[base_h + i];
|
|
429
|
+
|
|
430
|
+
sumi_l += q_l * a_l;
|
|
431
|
+
sumi_h += q_h * a_h;
|
|
297
432
|
}
|
|
298
|
-
|
|
433
|
+
|
|
434
|
+
sumf[j] +=
|
|
435
|
+
(sumi_l * scale_l + sumi_h * scale_h) * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d;
|
|
299
436
|
}
|
|
300
437
|
}
|
|
301
438
|
}
|
|
302
|
-
|
|
439
|
+
|
|
440
|
+
for (int j = 0; j < ncols_interleaved; j++) {
|
|
441
|
+
s[x * ncols_interleaved + j] = sumf[j];
|
|
442
|
+
}
|
|
303
443
|
}
|
|
304
444
|
}
|
|
305
445
|
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
446
|
+
template <int M, int N>
|
|
447
|
+
static void ggml_gemm_q6_K_NxM_q8_K_generic_impl(int n,
|
|
448
|
+
float * GGML_RESTRICT s,
|
|
449
|
+
size_t bs,
|
|
450
|
+
const void * GGML_RESTRICT vx,
|
|
451
|
+
const void * GGML_RESTRICT vy,
|
|
452
|
+
int nr,
|
|
453
|
+
int nc) {
|
|
454
|
+
constexpr int blocklen = M;
|
|
455
|
+
constexpr int ncols_interleaved = N;
|
|
456
|
+
const int qk = QK_K;
|
|
457
|
+
const int nb = n / qk;
|
|
458
|
+
const int blocks_per_half = 64 / blocklen;
|
|
459
|
+
const int q8_half_stride = 512;
|
|
460
|
+
const int q8_low_high_step = 256;
|
|
311
461
|
|
|
312
|
-
assert
|
|
313
|
-
assert
|
|
462
|
+
assert(n % qk == 0);
|
|
463
|
+
assert(nr % 4 == 0);
|
|
464
|
+
assert(nc % ncols_interleaved == 0);
|
|
314
465
|
|
|
315
|
-
UNUSED(s);
|
|
316
466
|
UNUSED(bs);
|
|
317
|
-
UNUSED(vx);
|
|
318
|
-
UNUSED(vy);
|
|
319
|
-
UNUSED(nr);
|
|
320
|
-
UNUSED(nc);
|
|
321
|
-
UNUSED(nb);
|
|
322
|
-
UNUSED(ncols_interleaved);
|
|
323
|
-
UNUSED(blocklen);
|
|
324
467
|
|
|
325
|
-
float sumf[4];
|
|
326
|
-
int sumi;
|
|
468
|
+
float sumf[4][8];
|
|
327
469
|
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
470
|
+
for (int y = 0; y < nr / 4; y++) {
|
|
471
|
+
const block_q8_Kx4 * a_ptr = (const block_q8_Kx4 *) vy + (y * nb);
|
|
472
|
+
for (int x = 0; x < nc / ncols_interleaved; x++) {
|
|
473
|
+
const block_q6_Kx8 * b_ptr = (const block_q6_Kx8 *) vx + (x * nb);
|
|
331
474
|
|
|
332
|
-
|
|
333
|
-
for (int l = 0; l < nb; l++) {
|
|
334
|
-
for (int k = 0; k < (qk / (2 * blocklen)); k++) {
|
|
475
|
+
for (int m = 0; m < 4; m++) {
|
|
335
476
|
for (int j = 0; j < ncols_interleaved; j++) {
|
|
336
|
-
|
|
337
|
-
for (int i = 0; i < blocklen; ++i) {
|
|
338
|
-
const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
|
|
339
|
-
const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
|
|
340
|
-
sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
|
|
341
|
-
}
|
|
342
|
-
sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
|
|
477
|
+
sumf[m][j] = 0.0f;
|
|
343
478
|
}
|
|
344
479
|
}
|
|
345
|
-
}
|
|
346
|
-
for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
|
|
347
|
-
}
|
|
348
|
-
}
|
|
349
480
|
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
const int blocklen = 8;
|
|
481
|
+
for (int l = 0; l < nb; l++) {
|
|
482
|
+
for (int k = 0; k < (qk / (2 * blocklen)); k++) {
|
|
483
|
+
const int base_l = (k / blocks_per_half) * 128 + (k % blocks_per_half) * blocklen;
|
|
484
|
+
const int base_h = base_l + 64;
|
|
355
485
|
|
|
356
|
-
|
|
357
|
-
|
|
486
|
+
const int scale_idx_l = base_l / 16;
|
|
487
|
+
const int scale_idx_h = base_h / 16;
|
|
358
488
|
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
UNUSED(vx);
|
|
362
|
-
UNUSED(vy);
|
|
363
|
-
UNUSED(nr);
|
|
364
|
-
UNUSED(nc);
|
|
365
|
-
UNUSED(nb);
|
|
366
|
-
UNUSED(ncols_interleaved);
|
|
367
|
-
UNUSED(blocklen);
|
|
489
|
+
const int qh_shift_l = ((base_l % 128) / 32) * 2;
|
|
490
|
+
const int qh_shift_h = ((base_h % 128) / 32) * 2;
|
|
368
491
|
|
|
369
|
-
|
|
370
|
-
|
|
492
|
+
const int qh_half_l = (base_l / 128) * 32;
|
|
493
|
+
const int qh_half_h = (base_h / 128) * 32;
|
|
371
494
|
|
|
372
|
-
|
|
373
|
-
for (int x = 0; x < nc / ncols_interleaved; x++) {
|
|
374
|
-
const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb);
|
|
495
|
+
const int q8_base = (k / blocks_per_half) * q8_half_stride + (k % blocks_per_half) * (blocklen * 4);
|
|
375
496
|
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
|
|
497
|
+
for (int m = 0; m < 4; m++) {
|
|
498
|
+
for (int j = 0; j < ncols_interleaved; j++) {
|
|
499
|
+
const int8_t scale_l = b_ptr[l].scales[scale_idx_l * ncols_interleaved + j];
|
|
500
|
+
const int8_t scale_h = b_ptr[l].scales[scale_idx_h * ncols_interleaved + j];
|
|
501
|
+
|
|
502
|
+
int sumi_l = 0;
|
|
503
|
+
int sumi_h = 0;
|
|
504
|
+
|
|
505
|
+
for (int i = 0; i < blocklen; i++) {
|
|
506
|
+
const int ql_pos = k * ncols_interleaved * blocklen + j * blocklen + i;
|
|
507
|
+
const int l_4 = b_ptr[l].ql[ql_pos] & 0xF;
|
|
508
|
+
const int hi_4 = (b_ptr[l].ql[ql_pos] >> 4) & 0xF;
|
|
509
|
+
|
|
510
|
+
const int qh_idx_l = qh_half_l + ((base_l + i) % 32);
|
|
511
|
+
const int qh_chunk_l = qh_idx_l / blocklen;
|
|
512
|
+
const int qh_pos_l = qh_idx_l % blocklen;
|
|
513
|
+
const int qh_offset_l =
|
|
514
|
+
qh_chunk_l * (blocklen * ncols_interleaved) + j * blocklen + qh_pos_l;
|
|
515
|
+
const int hi_2_l = (b_ptr[l].qh[qh_offset_l] >> qh_shift_l) & 0x3;
|
|
516
|
+
|
|
517
|
+
const int qh_idx_h = qh_half_h + ((base_h + i) % 32);
|
|
518
|
+
const int qh_chunk_h = qh_idx_h / blocklen;
|
|
519
|
+
const int qh_pos_h = qh_idx_h % blocklen;
|
|
520
|
+
const int qh_offset_h =
|
|
521
|
+
qh_chunk_h * (blocklen * ncols_interleaved) + j * blocklen + qh_pos_h;
|
|
522
|
+
const int hi_2_h = (b_ptr[l].qh[qh_offset_h] >> qh_shift_h) & 0x3;
|
|
523
|
+
|
|
524
|
+
const int q_l = ((hi_2_l << 4) | l_4) - 32;
|
|
525
|
+
const int q_h = ((hi_2_h << 4) | hi_4) - 32;
|
|
526
|
+
|
|
527
|
+
const int8_t q8_l = a_ptr[l].qs[q8_base + m * blocklen + i];
|
|
528
|
+
const int8_t q8_h = a_ptr[l].qs[q8_base + m * blocklen + i + q8_low_high_step];
|
|
529
|
+
|
|
530
|
+
sumi_l += q_l * q8_l;
|
|
531
|
+
sumi_h += q_h * q8_h;
|
|
532
|
+
}
|
|
533
|
+
|
|
534
|
+
sumf[m][j] += (sumi_l * scale_l + sumi_h * scale_h) * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) *
|
|
535
|
+
a_ptr[l].d[m];
|
|
536
|
+
}
|
|
385
537
|
}
|
|
386
|
-
|
|
538
|
+
}
|
|
539
|
+
}
|
|
540
|
+
|
|
541
|
+
for (int m = 0; m < 4; m++) {
|
|
542
|
+
for (int j = 0; j < ncols_interleaved; j++) {
|
|
543
|
+
s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
|
|
387
544
|
}
|
|
388
545
|
}
|
|
389
546
|
}
|
|
390
|
-
for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
|
|
391
547
|
}
|
|
392
548
|
}
|
|
393
549
|
|
|
394
|
-
|
|
395
|
-
|
|
396
|
-
|
|
397
|
-
|
|
398
|
-
|
|
399
|
-
|
|
400
|
-
|
|
401
|
-
|
|
550
|
+
template <int M, int N>
|
|
551
|
+
static void ggml_gemv_q5_K_NxM_q8_K_generic_impl(int n,
|
|
552
|
+
float * GGML_RESTRICT s,
|
|
553
|
+
size_t bs,
|
|
554
|
+
const void * GGML_RESTRICT vx,
|
|
555
|
+
const void * GGML_RESTRICT vy,
|
|
556
|
+
int nr,
|
|
557
|
+
int nc) {
|
|
558
|
+
constexpr int blocklen = M;
|
|
559
|
+
constexpr int ncols_interleaved = N;
|
|
560
|
+
const int qk = QK_K;
|
|
561
|
+
const int nb = n / qk;
|
|
562
|
+
static const uint32_t kmask1 = 0x3f3f3f3f;
|
|
563
|
+
static const uint32_t kmask2 = 0x0f0f0f0f;
|
|
564
|
+
static const uint32_t kmask3 = 0x03030303;
|
|
402
565
|
|
|
403
|
-
assert
|
|
404
|
-
assert
|
|
566
|
+
assert(n % qk == 0);
|
|
567
|
+
assert(nc % ncols_interleaved == 0);
|
|
405
568
|
|
|
406
569
|
UNUSED(bs);
|
|
407
570
|
UNUSED(nr);
|
|
408
571
|
|
|
409
|
-
float
|
|
410
|
-
float
|
|
572
|
+
float sumf[ncols_interleaved];
|
|
573
|
+
float sum_minf[ncols_interleaved];
|
|
411
574
|
uint32_t utmp[32];
|
|
412
|
-
int
|
|
413
|
-
int
|
|
414
|
-
int
|
|
575
|
+
int sumi1;
|
|
576
|
+
int sumi2;
|
|
577
|
+
int sumi;
|
|
415
578
|
|
|
416
579
|
const block_q8_K * a_ptr = (const block_q8_K *) vy;
|
|
417
580
|
for (int x = 0; x < nc / ncols_interleaved; x++) {
|
|
418
|
-
const
|
|
581
|
+
const block_q5_Kx8 * b_ptr = (const block_q5_Kx8 *) vx + (x * nb);
|
|
419
582
|
|
|
420
583
|
for (int j = 0; j < ncols_interleaved; j++) {
|
|
421
|
-
sumf[j]
|
|
584
|
+
sumf[j] = 0.0;
|
|
422
585
|
sum_minf[j] = 0.0;
|
|
423
586
|
}
|
|
424
587
|
for (int l = 0; l < nb; l++) {
|
|
425
588
|
for (int sb = 0; sb < 8; sb++) {
|
|
426
|
-
memcpy(utmp + sb * 4, b_ptr[l].scales + sb *
|
|
427
|
-
utmp[sb * 4 + 3]
|
|
589
|
+
memcpy(utmp + sb * 4, b_ptr[l].scales + sb * K_SCALE_SIZE, K_SCALE_SIZE);
|
|
590
|
+
utmp[sb * 4 + 3] = ((utmp[sb * 4 + 2] >> 4) & kmask2) | (((utmp[sb * 4 + 1] >> 6) & kmask3) << 4);
|
|
428
591
|
const uint32_t uaux_0 = utmp[sb * 4 + 1] & kmask1;
|
|
429
|
-
utmp[sb * 4 + 1]
|
|
430
|
-
utmp[sb * 4 + 2]
|
|
592
|
+
utmp[sb * 4 + 1] = (utmp[sb * 4 + 2] & kmask2) | (((utmp[sb * 4 + 0] >> 6) & kmask3) << 4);
|
|
593
|
+
utmp[sb * 4 + 2] = uaux_0;
|
|
431
594
|
utmp[sb * 4 + 0] &= kmask1;
|
|
432
595
|
}
|
|
433
596
|
for (int k = 0; k < (qk / (2 * blocklen)); k++) {
|
|
434
|
-
|
|
435
|
-
uint8_t *
|
|
597
|
+
constexpr int scale_stride = 32;
|
|
598
|
+
uint8_t * scales_0 = (uint8_t *) utmp + (k / (32 / blocklen)) * scale_stride;
|
|
599
|
+
uint8_t * scales_1 = (uint8_t *) utmp + (k / (32 / blocklen)) * scale_stride + 16;
|
|
600
|
+
|
|
601
|
+
const int qh_shift = (k / (32 / blocklen)) * 2;
|
|
436
602
|
for (int j = 0; j < ncols_interleaved; j++) {
|
|
437
603
|
sumi1 = 0;
|
|
438
604
|
sumi2 = 0;
|
|
439
|
-
sumi
|
|
605
|
+
sumi = 0;
|
|
440
606
|
for (int i = 0; i < blocklen; ++i) {
|
|
441
|
-
const int
|
|
442
|
-
|
|
443
|
-
|
|
444
|
-
|
|
607
|
+
const int b_qs_offset = k * ncols_interleaved * blocklen + j * blocklen + i;
|
|
608
|
+
|
|
609
|
+
const int qh_idx = (k * blocklen + i) % 32;
|
|
610
|
+
const int qh_chunk = qh_idx / blocklen;
|
|
611
|
+
const int qh_pos = qh_idx % blocklen;
|
|
612
|
+
const int b_qh_offset = qh_chunk * (blocklen * ncols_interleaved) + j * blocklen + qh_pos;
|
|
613
|
+
|
|
614
|
+
const uint8_t qh_val = b_ptr[l].qh[b_qh_offset];
|
|
615
|
+
const uint8_t h0 = (qh_val >> qh_shift) & 1;
|
|
616
|
+
const uint8_t h1 = (qh_val >> (qh_shift + 1)) & 1;
|
|
617
|
+
|
|
618
|
+
const int v0 = (int8_t) ((b_ptr[l].qs[b_qs_offset] & 0xF) | (h0 << 4));
|
|
619
|
+
const int v1 = (int8_t) ((b_ptr[l].qs[b_qs_offset] >> 4) | (h1 << 4));
|
|
620
|
+
|
|
621
|
+
const int q8_offset = (k / (32 / blocklen)) * 64 + (k % (32 / blocklen)) * blocklen + i;
|
|
622
|
+
|
|
623
|
+
sumi1 = (v0 * a_ptr[l].qs[q8_offset]);
|
|
624
|
+
sumi2 = (v1 * a_ptr[l].qs[q8_offset + 32]);
|
|
445
625
|
sumi1 = sumi1 * scales_0[j];
|
|
446
626
|
sumi2 = sumi2 * scales_1[j];
|
|
447
627
|
sumi += sumi1 + sumi2;
|
|
@@ -452,7 +632,8 @@ void ggml_gemv_q4_K_8x4_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
|
|
|
452
632
|
for (int sb = 0; sb < 8; sb++) {
|
|
453
633
|
uint8_t * mins = (uint8_t *) utmp + 8 + sb * 16;
|
|
454
634
|
for (int j = 0; j < ncols_interleaved; j++) {
|
|
455
|
-
sum_minf[j] += mins[j] * (a_ptr[l].bsums[sb * 2] + a_ptr[l].bsums[sb * 2 + 1]) *
|
|
635
|
+
sum_minf[j] += mins[j] * (a_ptr[l].bsums[sb * 2] + a_ptr[l].bsums[sb * 2 + 1]) *
|
|
636
|
+
GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d;
|
|
456
637
|
}
|
|
457
638
|
}
|
|
458
639
|
}
|
|
@@ -462,17 +643,123 @@ void ggml_gemv_q4_K_8x4_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
|
|
|
462
643
|
}
|
|
463
644
|
}
|
|
464
645
|
|
|
465
|
-
|
|
466
|
-
|
|
646
|
+
template <int M, int N>
|
|
647
|
+
static void ggml_gemm_q5_K_NxM_q8_K_generic_impl(int n,
|
|
648
|
+
float * GGML_RESTRICT s,
|
|
649
|
+
size_t bs,
|
|
650
|
+
const void * GGML_RESTRICT vx,
|
|
651
|
+
const void * GGML_RESTRICT vy,
|
|
652
|
+
int nr,
|
|
653
|
+
int nc) {
|
|
654
|
+
constexpr int blocklen = M;
|
|
655
|
+
constexpr int ncols_interleaved = N;
|
|
656
|
+
const int qk = QK_K;
|
|
657
|
+
const int nb = n / qk;
|
|
658
|
+
static const uint32_t kmask1 = 0x3f3f3f3f;
|
|
659
|
+
static const uint32_t kmask2 = 0x0f0f0f0f;
|
|
660
|
+
static const uint32_t kmask3 = 0x03030303;
|
|
661
|
+
|
|
662
|
+
assert(n % qk == 0);
|
|
663
|
+
assert(nr % 4 == 0);
|
|
664
|
+
assert(nc % ncols_interleaved == 0);
|
|
665
|
+
|
|
666
|
+
float sumf[4][ncols_interleaved];
|
|
667
|
+
float sum_minf[4][ncols_interleaved];
|
|
668
|
+
uint32_t utmp[32];
|
|
669
|
+
int sumi1;
|
|
670
|
+
int sumi2;
|
|
671
|
+
int sumi;
|
|
672
|
+
|
|
673
|
+
for (int y = 0; y < nr / 4; y++) {
|
|
674
|
+
const block_q8_Kx4 * a_ptr = (const block_q8_Kx4 *) vy + (y * nb);
|
|
675
|
+
for (int x = 0; x < nc / ncols_interleaved; x++) {
|
|
676
|
+
const block_q5_Kx8 * b_ptr = (const block_q5_Kx8 *) vx + (x * nb);
|
|
677
|
+
for (int m = 0; m < 4; m++) {
|
|
678
|
+
for (int j = 0; j < ncols_interleaved; j++) {
|
|
679
|
+
sumf[m][j] = 0.0;
|
|
680
|
+
sum_minf[m][j] = 0.0;
|
|
681
|
+
}
|
|
682
|
+
}
|
|
683
|
+
for (int l = 0; l < nb; l++) {
|
|
684
|
+
for (int sb = 0; sb < 8; sb++) {
|
|
685
|
+
memcpy(utmp + sb * 4, b_ptr[l].scales + sb * K_SCALE_SIZE, K_SCALE_SIZE);
|
|
686
|
+
utmp[sb * 4 + 3] = ((utmp[sb * 4 + 2] >> 4) & kmask2) | (((utmp[sb * 4 + 1] >> 6) & kmask3) << 4);
|
|
687
|
+
const uint32_t uaux_0 = utmp[sb * 4 + 1] & kmask1;
|
|
688
|
+
utmp[sb * 4 + 1] = (utmp[sb * 4 + 2] & kmask2) | (((utmp[sb * 4 + 0] >> 6) & kmask3) << 4);
|
|
689
|
+
utmp[sb * 4 + 2] = uaux_0;
|
|
690
|
+
utmp[sb * 4 + 0] &= kmask1;
|
|
691
|
+
}
|
|
692
|
+
for (int k = 0; k < (qk / (2 * blocklen)); k++) {
|
|
693
|
+
constexpr int scale_stride = 32;
|
|
694
|
+
uint8_t * scales_0 = (uint8_t *) utmp + (k / (32 / blocklen)) * scale_stride;
|
|
695
|
+
uint8_t * scales_1 = (uint8_t *) utmp + (k / (32 / blocklen)) * scale_stride + 16;
|
|
696
|
+
|
|
697
|
+
const int qh_shift = (k / (32 / blocklen)) * 2;
|
|
698
|
+
for (int m = 0; m < 4; m++) {
|
|
699
|
+
for (int j = 0; j < ncols_interleaved; j++) {
|
|
700
|
+
sumi1 = 0;
|
|
701
|
+
sumi2 = 0;
|
|
702
|
+
sumi = 0;
|
|
703
|
+
for (int i = 0; i < blocklen; ++i) {
|
|
704
|
+
const int b_qs_offset = k * ncols_interleaved * blocklen + j * blocklen + i;
|
|
705
|
+
|
|
706
|
+
const int qh_idx = (k * blocklen + i) % 32;
|
|
707
|
+
const int qh_chunk = qh_idx / blocklen;
|
|
708
|
+
const int qh_pos = qh_idx % blocklen;
|
|
709
|
+
const int b_qh_offset =
|
|
710
|
+
qh_chunk * (blocklen * ncols_interleaved) + j * blocklen + qh_pos;
|
|
711
|
+
|
|
712
|
+
const uint8_t qh_val = b_ptr[l].qh[b_qh_offset];
|
|
713
|
+
const uint8_t h0 = (qh_val >> qh_shift) & 1;
|
|
714
|
+
const uint8_t h1 = (qh_val >> (qh_shift + 1)) & 1;
|
|
715
|
+
|
|
716
|
+
const int v0 = (int8_t) ((b_ptr[l].qs[b_qs_offset] & 0xF) | (h0 << 4));
|
|
717
|
+
const int v1 = (int8_t) ((b_ptr[l].qs[b_qs_offset] >> 4) | (h1 << 4));
|
|
718
|
+
|
|
719
|
+
const int q8_offset = (k / (32 / blocklen)) * 256 +
|
|
720
|
+
(k % (32 / blocklen)) * 4 * blocklen + m * blocklen + i;
|
|
721
|
+
|
|
722
|
+
sumi1 = (v0 * a_ptr[l].qs[q8_offset]);
|
|
723
|
+
sumi2 = (v1 * a_ptr[l].qs[q8_offset + 128]);
|
|
724
|
+
sumi1 = sumi1 * scales_0[j];
|
|
725
|
+
sumi2 = sumi2 * scales_1[j];
|
|
726
|
+
sumi += sumi1 + sumi2;
|
|
727
|
+
}
|
|
728
|
+
sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d[m];
|
|
729
|
+
}
|
|
730
|
+
}
|
|
731
|
+
}
|
|
732
|
+
for (int sb = 0; sb < 8; sb++) {
|
|
733
|
+
uint8_t * mins = (uint8_t *) utmp + 8 + sb * 16;
|
|
734
|
+
for (int m = 0; m < 4; m++) {
|
|
735
|
+
const int16_t * bsums = a_ptr[l].bsums + (sb * 8) + (m * 4) - ((sb % 2) * 6);
|
|
736
|
+
for (int j = 0; j < ncols_interleaved; j++) {
|
|
737
|
+
sum_minf[m][j] += mins[j] * (bsums[0] + bsums[1]) *
|
|
738
|
+
GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d[m];
|
|
739
|
+
}
|
|
740
|
+
}
|
|
741
|
+
}
|
|
742
|
+
}
|
|
743
|
+
for (int m = 0; m < 4; m++) {
|
|
744
|
+
for (int j = 0; j < ncols_interleaved; j++) {
|
|
745
|
+
s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j] - sum_minf[m][j];
|
|
746
|
+
}
|
|
747
|
+
}
|
|
748
|
+
}
|
|
749
|
+
}
|
|
750
|
+
}
|
|
751
|
+
|
|
752
|
+
extern "C" {
|
|
753
|
+
|
|
754
|
+
void ggml_gemv_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
|
755
|
+
const int qk = QK8_0;
|
|
467
756
|
const int nb = n / qk;
|
|
468
|
-
const int ncols_interleaved =
|
|
469
|
-
const int blocklen =
|
|
470
|
-
static const uint32_t kmask1 = 0x3f3f3f3f;
|
|
471
|
-
static const uint32_t kmask2 = 0x0f0f0f0f;
|
|
472
|
-
static const uint32_t kmask3 = 0x03030303;
|
|
757
|
+
const int ncols_interleaved = 4;
|
|
758
|
+
const int blocklen = 4;
|
|
473
759
|
|
|
474
|
-
assert
|
|
475
|
-
assert
|
|
760
|
+
assert(nr == 1);
|
|
761
|
+
assert(n % qk == 0);
|
|
762
|
+
assert(nc % ncols_interleaved == 0);
|
|
476
763
|
|
|
477
764
|
UNUSED(s);
|
|
478
765
|
UNUSED(bs);
|
|
@@ -484,66 +771,35 @@ void ggml_gemv_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
|
|
|
484
771
|
UNUSED(ncols_interleaved);
|
|
485
772
|
UNUSED(blocklen);
|
|
486
773
|
|
|
487
|
-
float sumf[
|
|
488
|
-
float sum_minf[8];
|
|
489
|
-
uint32_t utmp[32];
|
|
490
|
-
int sumi1;
|
|
491
|
-
int sumi2;
|
|
774
|
+
float sumf[4];
|
|
492
775
|
int sumi;
|
|
493
776
|
|
|
494
|
-
const
|
|
777
|
+
const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
|
|
495
778
|
for (int x = 0; x < nc / ncols_interleaved; x++) {
|
|
496
|
-
const
|
|
779
|
+
const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb);
|
|
497
780
|
|
|
498
|
-
for (int j = 0; j < ncols_interleaved; j++)
|
|
499
|
-
sumf[j] = 0.0;
|
|
500
|
-
sum_minf[j] = 0.0;
|
|
501
|
-
}
|
|
781
|
+
for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
|
|
502
782
|
for (int l = 0; l < nb; l++) {
|
|
503
|
-
for (int sb = 0; sb < 8; sb++) {
|
|
504
|
-
memcpy(utmp + sb * 4, b_ptr[l].scales + sb * 12, 12);
|
|
505
|
-
utmp[sb * 4 + 3] = ((utmp[sb * 4 + 2] >> 4) & kmask2) | (((utmp[sb * 4 + 1] >> 6) & kmask3) << 4);
|
|
506
|
-
const uint32_t uaux_0 = utmp[sb * 4 + 1] & kmask1;
|
|
507
|
-
utmp[sb * 4 + 1] = (utmp[sb * 4 + 2] & kmask2) | (((utmp[sb * 4 + 0] >> 6) & kmask3) << 4);
|
|
508
|
-
utmp[sb * 4 + 2] = uaux_0;
|
|
509
|
-
utmp[sb * 4 + 0] &= kmask1;
|
|
510
|
-
}
|
|
511
783
|
for (int k = 0; k < (qk / (2 * blocklen)); k++) {
|
|
512
|
-
uint8_t *scales_0 = (uint8_t*) utmp + (k / 4) * 32;
|
|
513
|
-
uint8_t *scales_1 = (uint8_t*) utmp + (k / 4) * 32 + 16;
|
|
514
784
|
for (int j = 0; j < ncols_interleaved; j++) {
|
|
515
|
-
sumi1 = 0;
|
|
516
|
-
sumi2 = 0;
|
|
517
785
|
sumi = 0;
|
|
518
786
|
for (int i = 0; i < blocklen; ++i) {
|
|
519
|
-
const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i]
|
|
520
|
-
const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i]
|
|
521
|
-
|
|
522
|
-
sumi2 = (v1 * a_ptr[l].qs[(k >> 2) * 64 + (k % 4) * blocklen + i + 32]);
|
|
523
|
-
sumi1 = sumi1 * scales_0[j];
|
|
524
|
-
sumi2 = sumi2 * scales_1[j];
|
|
525
|
-
sumi += sumi1 + sumi2;
|
|
787
|
+
const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
|
|
788
|
+
const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
|
|
789
|
+
sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
|
|
526
790
|
}
|
|
527
|
-
sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d;
|
|
528
|
-
}
|
|
529
|
-
}
|
|
530
|
-
for (int sb = 0; sb < 8; sb++) {
|
|
531
|
-
uint8_t *mins = (uint8_t*) utmp + 8 + sb * 16;
|
|
532
|
-
for (int j = 0; j < ncols_interleaved; j++) {
|
|
533
|
-
sum_minf[j] += mins[j] * (a_ptr[l].bsums[sb * 2] + a_ptr[l].bsums[sb * 2 + 1]) * GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d;
|
|
791
|
+
sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
|
|
534
792
|
}
|
|
535
793
|
}
|
|
536
794
|
}
|
|
537
|
-
for (int j = 0; j < ncols_interleaved; j++)
|
|
538
|
-
s[x * ncols_interleaved + j] = sumf[j] - sum_minf[j];
|
|
539
|
-
}
|
|
795
|
+
for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
|
|
540
796
|
}
|
|
541
797
|
}
|
|
542
798
|
|
|
543
|
-
void
|
|
544
|
-
const int qk =
|
|
799
|
+
void ggml_gemv_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
|
800
|
+
const int qk = QK8_0;
|
|
545
801
|
const int nb = n / qk;
|
|
546
|
-
const int ncols_interleaved =
|
|
802
|
+
const int ncols_interleaved = 4;
|
|
547
803
|
const int blocklen = 8;
|
|
548
804
|
|
|
549
805
|
assert (n % qk == 0);
|
|
@@ -559,82 +815,56 @@ void ggml_gemv_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
|
|
|
559
815
|
UNUSED(ncols_interleaved);
|
|
560
816
|
UNUSED(blocklen);
|
|
561
817
|
|
|
562
|
-
float sumf[
|
|
563
|
-
float sum_minf[8];
|
|
564
|
-
int sumi1,sumi2,sumi3,sumi4;
|
|
818
|
+
float sumf[4];
|
|
565
819
|
int sumi;
|
|
566
820
|
|
|
567
|
-
const
|
|
568
|
-
for(int x = 0; x < nc / ncols_interleaved; x++) {
|
|
569
|
-
const
|
|
570
|
-
|
|
571
|
-
|
|
572
|
-
sum_minf[j] = 0.0;
|
|
573
|
-
}
|
|
821
|
+
const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
|
|
822
|
+
for (int x = 0; x < nc / ncols_interleaved; x++) {
|
|
823
|
+
const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb);
|
|
824
|
+
|
|
825
|
+
for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
|
|
574
826
|
for (int l = 0; l < nb; l++) {
|
|
575
|
-
for (int k = 0; k < (qk / (
|
|
576
|
-
const uint8_t *scales_0 = b_ptr[l].scales + (k / 4) * 64 ;
|
|
577
|
-
const uint8_t *scales_1 = b_ptr[l].scales + (k / 4) * 64 + 16;
|
|
578
|
-
const uint8_t *scales_2 = b_ptr[l].scales + (k / 4) * 64 + 32;
|
|
579
|
-
const uint8_t *scales_3 = b_ptr[l].scales + (k / 4) * 64 + 48;
|
|
827
|
+
for (int k = 0; k < (qk / (2 * blocklen)); k++) {
|
|
580
828
|
for (int j = 0; j < ncols_interleaved; j++) {
|
|
581
|
-
sumi1 = 0;
|
|
582
|
-
sumi2 = 0;
|
|
583
|
-
sumi3 = 0;
|
|
584
|
-
sumi4 = 0;
|
|
585
829
|
sumi = 0;
|
|
586
|
-
int
|
|
587
|
-
|
|
588
|
-
const int
|
|
589
|
-
|
|
590
|
-
const int v2 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4 ) & 3);
|
|
591
|
-
const int v3 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 6 ) & 3);
|
|
592
|
-
sumi1 = (v0 * a_ptr[l].qs[(k >> 2) * 128 + (k % 4) * blocklen + i]);
|
|
593
|
-
sumi2 = (v1 * a_ptr[l].qs[(k >> 2) * 128 + (k % 4) * blocklen + i + 32]);
|
|
594
|
-
sumi3 = (v2 * a_ptr[l].qs[(k >> 2) * 128 + (k % 4) * blocklen + i + 64]);
|
|
595
|
-
sumi4 = (v3 * a_ptr[l].qs[(k >> 2) * 128 + (k % 4) * blocklen + i + 96]);
|
|
596
|
-
|
|
597
|
-
sumi1 = sumi1 * (scales_0[offset] & 0xF);
|
|
598
|
-
sumi2 = sumi2 * (scales_1[offset] & 0xF);
|
|
599
|
-
sumi3 = sumi3 * (scales_2[offset] & 0xF);
|
|
600
|
-
sumi4 = sumi4 * (scales_3[offset] & 0xF);
|
|
601
|
-
sumi += sumi1 + sumi2 + sumi3 + sumi4;
|
|
830
|
+
for (int i = 0; i < blocklen; ++i) {
|
|
831
|
+
const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
|
|
832
|
+
const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
|
|
833
|
+
sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
|
|
602
834
|
}
|
|
603
|
-
sumf[j] += sumi *
|
|
604
|
-
}
|
|
605
|
-
}
|
|
606
|
-
for(int sb = 0; sb < 8; sb++) {
|
|
607
|
-
const uint8_t *mins = b_ptr[l].scales + sb * 16;
|
|
608
|
-
for(int j = 0; j < ncols_interleaved; j++){
|
|
609
|
-
sum_minf[j] += ((mins[j * 2] >> 4) * a_ptr[l].bsums[sb * 2] + (mins[(j * 2)+ 1] >> 4) * a_ptr[l].bsums[sb * 2 + 1]) * GGML_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d;
|
|
835
|
+
sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
|
|
610
836
|
}
|
|
611
837
|
}
|
|
612
838
|
}
|
|
613
|
-
for (int j = 0; j < ncols_interleaved; j++)
|
|
614
|
-
s[x * ncols_interleaved + j] = sumf[j] - sum_minf[j];
|
|
615
|
-
}
|
|
839
|
+
for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
|
|
616
840
|
}
|
|
617
841
|
}
|
|
618
842
|
|
|
619
|
-
void
|
|
843
|
+
void ggml_gemv_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
|
620
844
|
const int qk = QK8_0;
|
|
621
845
|
const int nb = n / qk;
|
|
622
|
-
const int ncols_interleaved =
|
|
623
|
-
const int blocklen =
|
|
846
|
+
const int ncols_interleaved = 8;
|
|
847
|
+
const int blocklen = 8;
|
|
624
848
|
|
|
625
|
-
assert(
|
|
626
|
-
assert(
|
|
627
|
-
assert(nc % ncols_interleaved == 0);
|
|
849
|
+
assert (n % qk == 0);
|
|
850
|
+
assert (nc % ncols_interleaved == 0);
|
|
628
851
|
|
|
852
|
+
UNUSED(s);
|
|
629
853
|
UNUSED(bs);
|
|
854
|
+
UNUSED(vx);
|
|
855
|
+
UNUSED(vy);
|
|
630
856
|
UNUSED(nr);
|
|
857
|
+
UNUSED(nc);
|
|
858
|
+
UNUSED(nb);
|
|
859
|
+
UNUSED(ncols_interleaved);
|
|
860
|
+
UNUSED(blocklen);
|
|
631
861
|
|
|
632
|
-
float sumf[
|
|
862
|
+
float sumf[8];
|
|
633
863
|
int sumi;
|
|
634
864
|
|
|
635
865
|
const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
|
|
636
866
|
for (int x = 0; x < nc / ncols_interleaved; x++) {
|
|
637
|
-
const
|
|
867
|
+
const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb);
|
|
638
868
|
|
|
639
869
|
for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
|
|
640
870
|
for (int l = 0; l < nb; l++) {
|
|
@@ -642,9 +872,9 @@ void ggml_gemv_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs
|
|
|
642
872
|
for (int j = 0; j < ncols_interleaved; j++) {
|
|
643
873
|
sumi = 0;
|
|
644
874
|
for (int i = 0; i < blocklen; ++i) {
|
|
645
|
-
const int v0 =
|
|
646
|
-
const int v1 =
|
|
647
|
-
sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2]));
|
|
875
|
+
const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
|
|
876
|
+
const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
|
|
877
|
+
sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
|
|
648
878
|
}
|
|
649
879
|
sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
|
|
650
880
|
}
|
|
@@ -654,139 +884,1210 @@ void ggml_gemv_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs
|
|
|
654
884
|
}
|
|
655
885
|
}
|
|
656
886
|
|
|
657
|
-
void
|
|
658
|
-
const int qk =
|
|
887
|
+
void ggml_gemv_q4_K_8x4_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
|
888
|
+
const int qk = QK_K;
|
|
659
889
|
const int nb = n / qk;
|
|
660
890
|
const int ncols_interleaved = 8;
|
|
661
|
-
const int blocklen =
|
|
891
|
+
const int blocklen = 4;
|
|
892
|
+
static const uint32_t kmask1 = 0x3f3f3f3f;
|
|
893
|
+
static const uint32_t kmask2 = 0x0f0f0f0f;
|
|
894
|
+
static const uint32_t kmask3 = 0x03030303;
|
|
662
895
|
|
|
663
|
-
assert(
|
|
664
|
-
assert(
|
|
665
|
-
assert(nc % ncols_interleaved == 0);
|
|
896
|
+
assert (n % qk == 0);
|
|
897
|
+
assert (nc % ncols_interleaved == 0);
|
|
666
898
|
|
|
667
899
|
UNUSED(bs);
|
|
668
900
|
UNUSED(nr);
|
|
669
901
|
|
|
670
902
|
float sumf[8];
|
|
903
|
+
float sum_minf[8];
|
|
904
|
+
uint32_t utmp[32];
|
|
905
|
+
int sumi1;
|
|
906
|
+
int sumi2;
|
|
671
907
|
int sumi;
|
|
672
908
|
|
|
673
|
-
const
|
|
909
|
+
const block_q8_K * a_ptr = (const block_q8_K *) vy;
|
|
674
910
|
for (int x = 0; x < nc / ncols_interleaved; x++) {
|
|
675
|
-
const
|
|
911
|
+
const block_q4_Kx8 * b_ptr = (const block_q4_Kx8 *) vx + (x * nb);
|
|
676
912
|
|
|
677
|
-
for (int j = 0; j < ncols_interleaved; j++)
|
|
913
|
+
for (int j = 0; j < ncols_interleaved; j++) {
|
|
914
|
+
sumf[j] = 0.0;
|
|
915
|
+
sum_minf[j] = 0.0;
|
|
916
|
+
}
|
|
678
917
|
for (int l = 0; l < nb; l++) {
|
|
918
|
+
for (int sb = 0; sb < 8; sb++) {
|
|
919
|
+
memcpy(utmp + sb * 4, b_ptr[l].scales + sb * 12, 12);
|
|
920
|
+
utmp[sb * 4 + 3] = ((utmp[sb * 4 + 2] >> 4) & kmask2) | (((utmp[sb * 4 + 1] >> 6) & kmask3) << 4);
|
|
921
|
+
const uint32_t uaux_0 = utmp[sb * 4 + 1] & kmask1;
|
|
922
|
+
utmp[sb * 4 + 1] = (utmp[sb * 4 + 2] & kmask2) | (((utmp[sb * 4 + 0] >> 6) & kmask3) << 4);
|
|
923
|
+
utmp[sb * 4 + 2] = uaux_0;
|
|
924
|
+
utmp[sb * 4 + 0] &= kmask1;
|
|
925
|
+
}
|
|
679
926
|
for (int k = 0; k < (qk / (2 * blocklen)); k++) {
|
|
927
|
+
uint8_t * scales_0 = (uint8_t *) utmp + (k / 8) * 32;
|
|
928
|
+
uint8_t * scales_1 = (uint8_t *) utmp + (k / 8) * 32 + 16;
|
|
680
929
|
for (int j = 0; j < ncols_interleaved; j++) {
|
|
681
|
-
|
|
930
|
+
sumi1 = 0;
|
|
931
|
+
sumi2 = 0;
|
|
932
|
+
sumi = 0;
|
|
933
|
+
for (int i = 0; i < blocklen; ++i) {
|
|
934
|
+
const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF);
|
|
935
|
+
const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4);
|
|
936
|
+
sumi1 = (v0 * a_ptr[l].qs[(k / 8) * 64 + (k % 8) * blocklen + i]);
|
|
937
|
+
sumi2 = (v1 * a_ptr[l].qs[(k / 8) * 64 + (k % 8) * blocklen + i + 32]);
|
|
938
|
+
sumi1 = sumi1 * scales_0[j];
|
|
939
|
+
sumi2 = sumi2 * scales_1[j];
|
|
940
|
+
sumi += sumi1 + sumi2;
|
|
941
|
+
}
|
|
942
|
+
sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d;
|
|
943
|
+
}
|
|
944
|
+
}
|
|
945
|
+
for (int sb = 0; sb < 8; sb++) {
|
|
946
|
+
uint8_t * mins = (uint8_t *) utmp + 8 + sb * 16;
|
|
947
|
+
for (int j = 0; j < ncols_interleaved; j++) {
|
|
948
|
+
sum_minf[j] += mins[j] * (a_ptr[l].bsums[sb * 2] + a_ptr[l].bsums[sb * 2 + 1]) * GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d;
|
|
949
|
+
}
|
|
950
|
+
}
|
|
951
|
+
}
|
|
952
|
+
for (int j = 0; j < ncols_interleaved; j++) {
|
|
953
|
+
s[x * ncols_interleaved + j] = sumf[j] - sum_minf[j];
|
|
954
|
+
}
|
|
955
|
+
}
|
|
956
|
+
}
|
|
957
|
+
|
|
958
|
+
void ggml_gemv_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
|
959
|
+
const int qk = QK_K;
|
|
960
|
+
const int nb = n / qk;
|
|
961
|
+
const int ncols_interleaved = 8;
|
|
962
|
+
const int blocklen = 8;
|
|
963
|
+
static const uint32_t kmask1 = 0x3f3f3f3f;
|
|
964
|
+
static const uint32_t kmask2 = 0x0f0f0f0f;
|
|
965
|
+
static const uint32_t kmask3 = 0x03030303;
|
|
966
|
+
|
|
967
|
+
assert (n % qk == 0);
|
|
968
|
+
assert (nc % ncols_interleaved == 0);
|
|
969
|
+
|
|
970
|
+
UNUSED(bs);
|
|
971
|
+
UNUSED(nr);
|
|
972
|
+
|
|
973
|
+
float sumf[8];
|
|
974
|
+
float sum_minf[8];
|
|
975
|
+
uint32_t utmp[32];
|
|
976
|
+
int sumi1;
|
|
977
|
+
int sumi2;
|
|
978
|
+
int sumi;
|
|
979
|
+
|
|
980
|
+
const block_q8_K * a_ptr = (const block_q8_K *) vy;
|
|
981
|
+
for (int x = 0; x < nc / ncols_interleaved; x++) {
|
|
982
|
+
const block_q4_Kx8 * b_ptr = (const block_q4_Kx8 *) vx + (x * nb);
|
|
983
|
+
|
|
984
|
+
for (int j = 0; j < ncols_interleaved; j++) {
|
|
985
|
+
sumf[j] = 0.0;
|
|
986
|
+
sum_minf[j] = 0.0;
|
|
987
|
+
}
|
|
988
|
+
for (int l = 0; l < nb; l++) {
|
|
989
|
+
for (int sb = 0; sb < 8; sb++) {
|
|
990
|
+
memcpy(utmp + sb * 4, b_ptr[l].scales + sb * 12, 12);
|
|
991
|
+
utmp[sb * 4 + 3] = ((utmp[sb * 4 + 2] >> 4) & kmask2) | (((utmp[sb * 4 + 1] >> 6) & kmask3) << 4);
|
|
992
|
+
const uint32_t uaux_0 = utmp[sb * 4 + 1] & kmask1;
|
|
993
|
+
utmp[sb * 4 + 1] = (utmp[sb * 4 + 2] & kmask2) | (((utmp[sb * 4 + 0] >> 6) & kmask3) << 4);
|
|
994
|
+
utmp[sb * 4 + 2] = uaux_0;
|
|
995
|
+
utmp[sb * 4 + 0] &= kmask1;
|
|
996
|
+
}
|
|
997
|
+
for (int k = 0; k < (qk / (2 * blocklen)); k++) {
|
|
998
|
+
uint8_t *scales_0 = (uint8_t*) utmp + (k / 4) * 32;
|
|
999
|
+
uint8_t *scales_1 = (uint8_t*) utmp + (k / 4) * 32 + 16;
|
|
1000
|
+
for (int j = 0; j < ncols_interleaved; j++) {
|
|
1001
|
+
sumi1 = 0;
|
|
1002
|
+
sumi2 = 0;
|
|
1003
|
+
sumi = 0;
|
|
1004
|
+
for (int i = 0; i < blocklen; ++i) {
|
|
1005
|
+
const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF);
|
|
1006
|
+
const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4);
|
|
1007
|
+
sumi1 = (v0 * a_ptr[l].qs[(k >> 2) * 64 + (k % 4) * blocklen + i]);
|
|
1008
|
+
sumi2 = (v1 * a_ptr[l].qs[(k >> 2) * 64 + (k % 4) * blocklen + i + 32]);
|
|
1009
|
+
sumi1 = sumi1 * scales_0[j];
|
|
1010
|
+
sumi2 = sumi2 * scales_1[j];
|
|
1011
|
+
sumi += sumi1 + sumi2;
|
|
1012
|
+
}
|
|
1013
|
+
sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d;
|
|
1014
|
+
}
|
|
1015
|
+
}
|
|
1016
|
+
for (int sb = 0; sb < 8; sb++) {
|
|
1017
|
+
uint8_t *mins = (uint8_t*) utmp + 8 + sb * 16;
|
|
1018
|
+
for (int j = 0; j < ncols_interleaved; j++) {
|
|
1019
|
+
sum_minf[j] += mins[j] * (a_ptr[l].bsums[sb * 2] + a_ptr[l].bsums[sb * 2 + 1]) * GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d;
|
|
1020
|
+
}
|
|
1021
|
+
}
|
|
1022
|
+
}
|
|
1023
|
+
for (int j = 0; j < ncols_interleaved; j++) {
|
|
1024
|
+
s[x * ncols_interleaved + j] = sumf[j] - sum_minf[j];
|
|
1025
|
+
}
|
|
1026
|
+
}
|
|
1027
|
+
}
|
|
1028
|
+
|
|
1029
|
+
void ggml_gemv_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
|
1030
|
+
const int qk = QK_K;
|
|
1031
|
+
const int nb = n / qk;
|
|
1032
|
+
const int ncols_interleaved = 8;
|
|
1033
|
+
const int blocklen = 8;
|
|
1034
|
+
|
|
1035
|
+
assert (n % qk == 0);
|
|
1036
|
+
assert (nc % ncols_interleaved == 0);
|
|
1037
|
+
|
|
1038
|
+
UNUSED(s);
|
|
1039
|
+
UNUSED(bs);
|
|
1040
|
+
UNUSED(vx);
|
|
1041
|
+
UNUSED(vy);
|
|
1042
|
+
UNUSED(nr);
|
|
1043
|
+
UNUSED(nc);
|
|
1044
|
+
UNUSED(nb);
|
|
1045
|
+
UNUSED(ncols_interleaved);
|
|
1046
|
+
UNUSED(blocklen);
|
|
1047
|
+
|
|
1048
|
+
float sumf[8];
|
|
1049
|
+
float sum_minf[8];
|
|
1050
|
+
int sumi1,sumi2,sumi3,sumi4;
|
|
1051
|
+
int sumi;
|
|
1052
|
+
|
|
1053
|
+
const block_q8_K * a_ptr = (const block_q8_K *)vy;
|
|
1054
|
+
for(int x = 0; x < nc / ncols_interleaved; x++) {
|
|
1055
|
+
const block_q2_Kx8 * b_ptr = (const block_q2_Kx8 *) vx + (x * nb);
|
|
1056
|
+
for (int j = 0; j < ncols_interleaved; j++) {
|
|
1057
|
+
sumf[j] = 0.0;
|
|
1058
|
+
sum_minf[j] = 0.0;
|
|
1059
|
+
}
|
|
1060
|
+
for (int l = 0; l < nb; l++) {
|
|
1061
|
+
for (int k = 0; k < (qk / (4 * blocklen)); k++) {
|
|
1062
|
+
const uint8_t *scales_0 = b_ptr[l].scales + (k / 4) * 64 ;
|
|
1063
|
+
const uint8_t *scales_1 = b_ptr[l].scales + (k / 4) * 64 + 16;
|
|
1064
|
+
const uint8_t *scales_2 = b_ptr[l].scales + (k / 4) * 64 + 32;
|
|
1065
|
+
const uint8_t *scales_3 = b_ptr[l].scales + (k / 4) * 64 + 48;
|
|
1066
|
+
for (int j = 0; j < ncols_interleaved; j++) {
|
|
1067
|
+
sumi1 = 0;
|
|
1068
|
+
sumi2 = 0;
|
|
1069
|
+
sumi3 = 0;
|
|
1070
|
+
sumi4 = 0;
|
|
1071
|
+
sumi = 0;
|
|
1072
|
+
int offset = ((k / 2) % 2) + j * 2;
|
|
1073
|
+
for (int i = 0; i < blocklen; ++i){
|
|
1074
|
+
const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 3);
|
|
1075
|
+
const int v1 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 2 ) & 3);
|
|
1076
|
+
const int v2 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4 ) & 3);
|
|
1077
|
+
const int v3 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 6 ) & 3);
|
|
1078
|
+
sumi1 = (v0 * a_ptr[l].qs[(k >> 2) * 128 + (k % 4) * blocklen + i]);
|
|
1079
|
+
sumi2 = (v1 * a_ptr[l].qs[(k >> 2) * 128 + (k % 4) * blocklen + i + 32]);
|
|
1080
|
+
sumi3 = (v2 * a_ptr[l].qs[(k >> 2) * 128 + (k % 4) * blocklen + i + 64]);
|
|
1081
|
+
sumi4 = (v3 * a_ptr[l].qs[(k >> 2) * 128 + (k % 4) * blocklen + i + 96]);
|
|
1082
|
+
|
|
1083
|
+
sumi1 = sumi1 * (scales_0[offset] & 0xF);
|
|
1084
|
+
sumi2 = sumi2 * (scales_1[offset] & 0xF);
|
|
1085
|
+
sumi3 = sumi3 * (scales_2[offset] & 0xF);
|
|
1086
|
+
sumi4 = sumi4 * (scales_3[offset] & 0xF);
|
|
1087
|
+
sumi += sumi1 + sumi2 + sumi3 + sumi4;
|
|
1088
|
+
}
|
|
1089
|
+
sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d;
|
|
1090
|
+
}
|
|
1091
|
+
}
|
|
1092
|
+
for(int sb = 0; sb < 8; sb++) {
|
|
1093
|
+
const uint8_t *mins = b_ptr[l].scales + sb * 16;
|
|
1094
|
+
for(int j = 0; j < ncols_interleaved; j++){
|
|
1095
|
+
sum_minf[j] += ((mins[j * 2] >> 4) * a_ptr[l].bsums[sb * 2] + (mins[(j * 2)+ 1] >> 4) * a_ptr[l].bsums[sb * 2 + 1]) * GGML_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d;
|
|
1096
|
+
}
|
|
1097
|
+
}
|
|
1098
|
+
}
|
|
1099
|
+
for (int j = 0; j < ncols_interleaved; j++) {
|
|
1100
|
+
s[x * ncols_interleaved + j] = sumf[j] - sum_minf[j];
|
|
1101
|
+
}
|
|
1102
|
+
}
|
|
1103
|
+
}
|
|
1104
|
+
|
|
1105
|
+
void ggml_gemv_q5_K_8x4_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
|
1106
|
+
ggml_gemv_q5_K_NxM_q8_K_generic_impl<4, 8>(n, s, bs, vx, vy, nr, nc);
|
|
1107
|
+
}
|
|
1108
|
+
|
|
1109
|
+
void ggml_gemv_q5_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
|
1110
|
+
ggml_gemv_q5_K_NxM_q8_K_generic_impl<8, 8>(n, s, bs, vx, vy, nr, nc);
|
|
1111
|
+
}
|
|
1112
|
+
|
|
1113
|
+
|
|
1114
|
+
void ggml_gemv_q6_K_8x4_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
|
1115
|
+
ggml_gemv_q6_K_NxM_q8_K_generic_impl<4, 8>(n, s, bs, vx, vy, nr, nc);
|
|
1116
|
+
}
|
|
1117
|
+
|
|
1118
|
+
void ggml_gemv_q6_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
|
1119
|
+
ggml_gemv_q6_K_NxM_q8_K_generic_impl<8, 8>(n, s, bs, vx, vy, nr, nc);
|
|
1120
|
+
}
|
|
1121
|
+
|
|
1122
|
+
void ggml_gemv_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
|
1123
|
+
const int qk = QK8_0;
|
|
1124
|
+
const int nb = n / qk;
|
|
1125
|
+
const int ncols_interleaved = 4;
|
|
1126
|
+
const int blocklen = 4;
|
|
1127
|
+
|
|
1128
|
+
assert(nr == 1);
|
|
1129
|
+
assert(n % qk == 0);
|
|
1130
|
+
assert(nc % ncols_interleaved == 0);
|
|
1131
|
+
|
|
1132
|
+
UNUSED(bs);
|
|
1133
|
+
UNUSED(nr);
|
|
1134
|
+
|
|
1135
|
+
float sumf[4];
|
|
1136
|
+
int sumi;
|
|
1137
|
+
|
|
1138
|
+
const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
|
|
1139
|
+
for (int x = 0; x < nc / ncols_interleaved; x++) {
|
|
1140
|
+
const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb);
|
|
1141
|
+
|
|
1142
|
+
for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
|
|
1143
|
+
for (int l = 0; l < nb; l++) {
|
|
1144
|
+
for (int k = 0; k < (qk / (2 * blocklen)); k++) {
|
|
1145
|
+
for (int j = 0; j < ncols_interleaved; j++) {
|
|
1146
|
+
sumi = 0;
|
|
1147
|
+
for (int i = 0; i < blocklen; ++i) {
|
|
1148
|
+
const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
|
|
1149
|
+
const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
|
|
1150
|
+
sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2]));
|
|
1151
|
+
}
|
|
1152
|
+
sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
|
|
1153
|
+
}
|
|
1154
|
+
}
|
|
1155
|
+
}
|
|
1156
|
+
for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
|
|
1157
|
+
}
|
|
1158
|
+
}
|
|
1159
|
+
|
|
1160
|
+
void ggml_gemv_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
|
1161
|
+
const int qk = QK8_0;
|
|
1162
|
+
const int nb = n / qk;
|
|
1163
|
+
const int ncols_interleaved = 8;
|
|
1164
|
+
const int blocklen = 8;
|
|
1165
|
+
|
|
1166
|
+
assert(nr == 1);
|
|
1167
|
+
assert(n % qk == 0);
|
|
1168
|
+
assert(nc % ncols_interleaved == 0);
|
|
1169
|
+
|
|
1170
|
+
UNUSED(bs);
|
|
1171
|
+
UNUSED(nr);
|
|
1172
|
+
|
|
1173
|
+
float sumf[8];
|
|
1174
|
+
int sumi;
|
|
1175
|
+
|
|
1176
|
+
const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
|
|
1177
|
+
for (int x = 0; x < nc / ncols_interleaved; x++) {
|
|
1178
|
+
const block_iq4_nlx8 * b_ptr = (const block_iq4_nlx8 *) vx + (x * nb);
|
|
1179
|
+
|
|
1180
|
+
for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
|
|
1181
|
+
for (int l = 0; l < nb; l++) {
|
|
1182
|
+
for (int k = 0; k < (qk / (2 * blocklen)); k++) {
|
|
1183
|
+
for (int j = 0; j < ncols_interleaved; j++) {
|
|
1184
|
+
sumi = 0;
|
|
682
1185
|
for (int i = 0; i < blocklen; ++i) {
|
|
683
1186
|
const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
|
|
684
1187
|
const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
|
|
685
1188
|
sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2]));
|
|
686
1189
|
}
|
|
687
|
-
sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
|
|
1190
|
+
sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
|
|
1191
|
+
}
|
|
1192
|
+
}
|
|
1193
|
+
}
|
|
1194
|
+
for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
|
|
1195
|
+
}
|
|
1196
|
+
}
|
|
1197
|
+
|
|
1198
|
+
void ggml_gemv_mxfp4_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
|
1199
|
+
const int qk = QK8_0;
|
|
1200
|
+
const int nb = n / qk;
|
|
1201
|
+
const int ncols_interleaved = 4;
|
|
1202
|
+
const int blocklen = 4;
|
|
1203
|
+
|
|
1204
|
+
assert(nr == 1);
|
|
1205
|
+
assert(n % qk == 0);
|
|
1206
|
+
assert(nc % ncols_interleaved == 0);
|
|
1207
|
+
|
|
1208
|
+
UNUSED(bs);
|
|
1209
|
+
UNUSED(nr);
|
|
1210
|
+
|
|
1211
|
+
float sumf[4];
|
|
1212
|
+
int sumi;
|
|
1213
|
+
|
|
1214
|
+
const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
|
|
1215
|
+
for (int x = 0; x < nc / ncols_interleaved; x++) {
|
|
1216
|
+
const block_mxfp4x4 * b_ptr = (const block_mxfp4x4 *) vx + (x * nb);
|
|
1217
|
+
|
|
1218
|
+
for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
|
|
1219
|
+
for (int l = 0; l < nb; l++) {
|
|
1220
|
+
for (int k = 0; k < (qk / (2 * blocklen)); k++) {
|
|
1221
|
+
for (int j = 0; j < ncols_interleaved; j++) {
|
|
1222
|
+
sumi = 0;
|
|
1223
|
+
for (int i = 0; i < blocklen; ++i) {
|
|
1224
|
+
const int v0 = kvalues_mxfp4[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
|
|
1225
|
+
const int v1 = kvalues_mxfp4[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
|
|
1226
|
+
sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2]));
|
|
1227
|
+
}
|
|
1228
|
+
sumf[j] += sumi * GGML_CPU_E8M0_TO_FP32_HALF(b_ptr[l].e[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
|
|
1229
|
+
}
|
|
1230
|
+
}
|
|
1231
|
+
}
|
|
1232
|
+
for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
|
|
1233
|
+
}
|
|
1234
|
+
}
|
|
1235
|
+
|
|
1236
|
+
void ggml_gemv_mxfp4_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
|
1237
|
+
const int qk = QK8_0;
|
|
1238
|
+
const int nb = n / qk;
|
|
1239
|
+
const int ncols_interleaved = 8;
|
|
1240
|
+
const int blocklen = 8;
|
|
1241
|
+
|
|
1242
|
+
assert(nr == 1);
|
|
1243
|
+
assert(n % qk == 0);
|
|
1244
|
+
assert(nc % ncols_interleaved == 0);
|
|
1245
|
+
|
|
1246
|
+
UNUSED(bs);
|
|
1247
|
+
UNUSED(nr);
|
|
1248
|
+
|
|
1249
|
+
float sumf[8];
|
|
1250
|
+
int sumi;
|
|
1251
|
+
|
|
1252
|
+
const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
|
|
1253
|
+
for (int x = 0; x < nc / ncols_interleaved; x++) {
|
|
1254
|
+
const block_mxfp4x8 * b_ptr = (const block_mxfp4x8 *) vx + (x * nb);
|
|
1255
|
+
|
|
1256
|
+
for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
|
|
1257
|
+
for (int l = 0; l < nb; l++) {
|
|
1258
|
+
for (int k = 0; k < (qk / (2 * blocklen)); k++) {
|
|
1259
|
+
for (int j = 0; j < ncols_interleaved; j++) {
|
|
1260
|
+
sumi = 0;
|
|
1261
|
+
for (int i = 0; i < blocklen; ++i) {
|
|
1262
|
+
const int v0 = kvalues_mxfp4[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
|
|
1263
|
+
const int v1 = kvalues_mxfp4[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
|
|
1264
|
+
sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2]));
|
|
1265
|
+
}
|
|
1266
|
+
sumf[j] += sumi * GGML_CPU_E8M0_TO_FP32_HALF(b_ptr[l].e[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
|
|
1267
|
+
}
|
|
1268
|
+
}
|
|
1269
|
+
}
|
|
1270
|
+
for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
|
|
1271
|
+
}
|
|
1272
|
+
}
|
|
1273
|
+
|
|
1274
|
+
void ggml_gemv_q8_0_4x4_q8_0_generic(int n,
|
|
1275
|
+
float * GGML_RESTRICT s,
|
|
1276
|
+
size_t bs,
|
|
1277
|
+
const void * GGML_RESTRICT vx,
|
|
1278
|
+
const void * GGML_RESTRICT vy,
|
|
1279
|
+
int nr,
|
|
1280
|
+
int nc) {
|
|
1281
|
+
const int qk = QK8_0;
|
|
1282
|
+
const int nb = n / qk;
|
|
1283
|
+
const int ncols_interleaved = 4;
|
|
1284
|
+
const int blocklen = 4;
|
|
1285
|
+
|
|
1286
|
+
assert(nr == 1);
|
|
1287
|
+
assert(n % qk == 0);
|
|
1288
|
+
assert(nc % ncols_interleaved == 0);
|
|
1289
|
+
|
|
1290
|
+
UNUSED(bs);
|
|
1291
|
+
UNUSED(nr);
|
|
1292
|
+
|
|
1293
|
+
float sumf[4];
|
|
1294
|
+
int sumi;
|
|
1295
|
+
|
|
1296
|
+
const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
|
|
1297
|
+
for (int x = 0; x < nc / ncols_interleaved; x++) {
|
|
1298
|
+
const block_q8_0x4 * b_ptr = (const block_q8_0x4 *) vx + (x * nb);
|
|
1299
|
+
|
|
1300
|
+
for (int j = 0; j < ncols_interleaved; j++) {
|
|
1301
|
+
sumf[j] = 0.0;
|
|
1302
|
+
}
|
|
1303
|
+
for (int l = 0; l < nb; l++) {
|
|
1304
|
+
for (int k = 0; k < (qk / blocklen); k++) {
|
|
1305
|
+
for (int j = 0; j < ncols_interleaved; j++) {
|
|
1306
|
+
sumi = 0;
|
|
1307
|
+
for (int i = 0; i < blocklen; ++i) {
|
|
1308
|
+
const int v0 = b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i];
|
|
1309
|
+
sumi += v0 * a_ptr[l].qs[k * blocklen + i];
|
|
1310
|
+
}
|
|
1311
|
+
sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
|
|
1312
|
+
}
|
|
1313
|
+
}
|
|
1314
|
+
}
|
|
1315
|
+
for (int j = 0; j < ncols_interleaved; j++) {
|
|
1316
|
+
s[x * ncols_interleaved + j] = sumf[j];
|
|
1317
|
+
}
|
|
1318
|
+
}
|
|
1319
|
+
}
|
|
1320
|
+
|
|
1321
|
+
void ggml_gemv_q8_0_4x8_q8_0_generic(int n,
|
|
1322
|
+
float * GGML_RESTRICT s,
|
|
1323
|
+
size_t bs,
|
|
1324
|
+
const void * GGML_RESTRICT vx,
|
|
1325
|
+
const void * GGML_RESTRICT vy,
|
|
1326
|
+
int nr,
|
|
1327
|
+
int nc) {
|
|
1328
|
+
const int qk = QK8_0;
|
|
1329
|
+
const int nb = n / qk;
|
|
1330
|
+
const int ncols_interleaved = 4;
|
|
1331
|
+
const int blocklen = 8;
|
|
1332
|
+
|
|
1333
|
+
assert(nr == 1);
|
|
1334
|
+
assert(n % qk == 0);
|
|
1335
|
+
assert(nc % ncols_interleaved == 0);
|
|
1336
|
+
|
|
1337
|
+
UNUSED(bs);
|
|
1338
|
+
UNUSED(nr);
|
|
1339
|
+
|
|
1340
|
+
float sumf[4];
|
|
1341
|
+
int sumi;
|
|
1342
|
+
|
|
1343
|
+
const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
|
|
1344
|
+
for (int x = 0; x < nc / ncols_interleaved; x++) {
|
|
1345
|
+
const block_q8_0x4 * b_ptr = (const block_q8_0x4 *) vx + (x * nb);
|
|
1346
|
+
|
|
1347
|
+
for (int j = 0; j < ncols_interleaved; j++) {
|
|
1348
|
+
sumf[j] = 0.0;
|
|
1349
|
+
}
|
|
1350
|
+
for (int l = 0; l < nb; l++) {
|
|
1351
|
+
for (int k = 0; k < (qk / blocklen); k++) {
|
|
1352
|
+
for (int j = 0; j < ncols_interleaved; j++) {
|
|
1353
|
+
sumi = 0;
|
|
1354
|
+
for (int i = 0; i < blocklen; ++i) {
|
|
1355
|
+
const int v0 = b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i];
|
|
1356
|
+
sumi += v0 * a_ptr[l].qs[k * blocklen + i];
|
|
1357
|
+
}
|
|
1358
|
+
sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
|
|
1359
|
+
}
|
|
1360
|
+
}
|
|
1361
|
+
}
|
|
1362
|
+
for (int j = 0; j < ncols_interleaved; j++) {
|
|
1363
|
+
s[x * ncols_interleaved + j] = sumf[j];
|
|
1364
|
+
}
|
|
1365
|
+
}
|
|
1366
|
+
}
|
|
1367
|
+
|
|
1368
|
+
#if defined __riscv_zvfh
|
|
1369
|
+
// Scalar GEMV kernel for 4-bit packed data read from `vx` as block_q4_0x16
// (16 interleaved columns, 1-byte interleave) against one row read from `vy`
// as block_q8_0. Writes `nc` floats to `s`.
//
//   n   - row length in elements; must be a multiple of QK8_0 (asserted)
//   s   - output, nc floats
//   bs  - not used by this kernel
//   vx  - nc / 16 tiles, each made of n / QK8_0 block_q4_0x16 entries
//   vy  - n / QK8_0 block_q8_0 entries
//   nr  - not used by this kernel
//   nc  - number of output columns; must be a multiple of 16 (asserted)
void ggml_gemv_q4_0_16x1_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
    const int qk                = QK8_0;
    const int nb                = n / qk;
    const int ncols_interleaved = 16;
    const int blocklen          = 1;

    assert (n % qk == 0);
    assert (nc % ncols_interleaved == 0);

    UNUSED(bs);
    UNUSED(nr);

    const block_q8_0 * a_ptr = (const block_q8_0 *) vy;

    for (int tile = 0; tile < nc / ncols_interleaved; tile++) {
        const block_q4_0x16 * b_ptr = (const block_q4_0x16 *) vx + (tile * nb);

        float acc[16] = { 0 };

        for (int blk = 0; blk < nb; blk++) {
            for (int grp = 0; grp < (qk / (2 * blocklen)); grp++) {
                const int a_base = grp * blocklen;
                for (int col = 0; col < ncols_interleaved; col++) {
                    const int b_base = (grp * ncols_interleaved + col) * blocklen;
                    int dot = 0;
                    for (int i = 0; i < blocklen; ++i) {
                        const int packed = b_ptr[blk].qs[b_base + i];
                        // Each nibble is placed in the top four bits of an int8_t, so both
                        // values carry a factor of 16 that the final >> 4 removes again.
                        const int lo = (int8_t) (packed << 4);
                        const int hi = (int8_t) (packed & 0xF0);
                        dot += ((lo * a_ptr[blk].qs[a_base + i]) + (hi * a_ptr[blk].qs[a_base + i + qk / 2])) >> 4;
                    }
                    acc[col] += dot * GGML_CPU_FP16_TO_FP32(b_ptr[blk].d[col]) * GGML_CPU_FP16_TO_FP32(a_ptr[blk].d);
                }
            }
        }

        for (int col = 0; col < ncols_interleaved; col++) {
            s[tile * ncols_interleaved + col] = acc[col];
        }
    }
}
|
|
1412
|
+
|
|
1413
|
+
// Scalar GEMV kernel for data read from `vx` as block_q4_Kx16 (16 interleaved
// columns) against one row read from `vy` as block_q8_K. For every column the
// result is (scaled dot product) - (min correction), written to `s`.
//
//   n   - row length in elements; must be a multiple of QK_K (asserted)
//   s   - output, nc floats
//   bs  - not used by this kernel
//   vx  - nc / 16 tiles, each made of n / QK_K block_q4_Kx16 entries
//   vy  - n / QK_K block_q8_K entries
//   nr  - not used by this kernel
//   nc  - number of output columns; must be a multiple of 16 (asserted)
void ggml_gemv_q4_K_16x1_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
    const int qk                = QK_K;
    const int nb                = n / qk;
    const int ncols_interleaved = 16;

    assert (n % qk == 0);
    assert (nc % ncols_interleaved == 0);

    UNUSED(bs);
    UNUSED(nr);

    const block_q8_K * a_ptr = (const block_q8_K *) vy;

    for (int tile = 0; tile < nc / ncols_interleaved; tile++) {
        const block_q4_Kx16 * b_ptr = (const block_q4_Kx16 *) vx + (tile * nb);

        float acc[16]     = { 0 };
        float acc_min[16] = { 0 };

        for (int blk = 0; blk < nb; blk++) {
            // 6-bit scale / min per (sub-block, column), rebuilt from the packed bytes.
            uint8_t sub_scales[128];
            uint8_t sub_mins[128];

            // Bytes 0..127 carry the low four bits: scale in the low nibble, min in the high nibble.
            for (int i = 0; i < 128; i++) {
                const int packed = b_ptr[blk].scales[i];
                sub_scales[i] = packed & 0x0F;
                sub_mins[i]   = packed >> 4;
            }
            // Bytes 128..191 carry the two high bits for entries i and i + 64.
            for (int i = 0; i < 64; i++) {
                const int high = b_ptr[blk].scales[128 + i];
                sub_scales[i]      |= (high & 0x03) << 4;
                sub_mins[i]        |= (high & 0x0C) << 2;
                sub_scales[i + 64] |= (high & 0x30);
                sub_mins[i + 64]   |= (high & 0xC0) >> 2;
            }

            // Min correction: each of the 8 sub-blocks pairs with two consecutive bsums entries.
            for (int sb = 0; sb < 8; sb++) {
                const uint8_t * min = &sub_mins[sb * 16];
                for (int col = 0; col < ncols_interleaved; col++) {
                    acc_min[col] += min[col] * (a_ptr[blk].bsums[sb * 2] + a_ptr[blk].bsums[sb * 2 + 1]) * GGML_CPU_FP16_TO_FP32(b_ptr[blk].dmin[col]) * a_ptr[blk].d;
                }
            }

            // Main product: low nibbles use sub-block sb's scales, high nibbles use sub-block sb + 1's.
            for (int sb = 0; sb < 8; sb += 2) {
                const uint8_t * scales_lo = &sub_scales[sb * 16];
                const uint8_t * scales_hi = &sub_scales[(sb + 1) * 16];
                for (int i = 0; i < QK4_0; i++) {
                    for (int col = 0; col < ncols_interleaved; col++) {
                        const int packed = b_ptr[blk].qs[sb * 256 + i * 16 + col];
                        const int v0     = (int8_t) (packed & 0xF);
                        const int v1     = (int8_t) (packed >> 4);
                        const int part_lo = (v0 * a_ptr[blk].qs[sb * 32 + i]) * scales_lo[col];
                        const int part_hi = (v1 * a_ptr[blk].qs[sb * 32 + 32 + i]) * scales_hi[col];
                        const int sumi    = part_lo + part_hi;
                        acc[col] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[blk].d[col]) * a_ptr[blk].d;
                    }
                }
            }
        }

        for (int col = 0; col < ncols_interleaved; col++) {
            s[tile * ncols_interleaved + col] = acc[col] - acc_min[col];
        }
    }
}
|
|
1485
|
+
|
|
1486
|
+
// Scalar GEMV kernel for data read from `vx` as block_iq4_nlx16 (16
// interleaved columns, 1-byte interleave) against one row read from `vy` as
// block_q8_0. Each 4-bit code is mapped through the kvalues_iq4nl table.
//
//   n   - row length in elements; must be a multiple of QK8_0 (asserted)
//   s   - output, nc floats
//   bs  - not used by this kernel
//   vx  - nc / 16 tiles, each made of n / QK8_0 block_iq4_nlx16 entries
//   vy  - n / QK8_0 block_q8_0 entries
//   nr  - must be 1 (asserted)
//   nc  - number of output columns; must be a multiple of 16 (asserted)
void ggml_gemv_iq4_nl_16x1_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
    const int qk                = QK8_0;
    const int nb                = n / qk;
    const int ncols_interleaved = 16;
    const int blocklen          = 1;

    assert(nr == 1);
    assert(n % qk == 0);
    assert(nc % ncols_interleaved == 0);

    UNUSED(bs);
    UNUSED(nr);

    const block_q8_0 * a_ptr = (const block_q8_0 *) vy;

    for (int tile = 0; tile < nc / ncols_interleaved; tile++) {
        const block_iq4_nlx16 * b_ptr = (const block_iq4_nlx16 *) vx + (tile * nb);

        float acc[16] = { 0 };

        for (int blk = 0; blk < nb; blk++) {
            for (int grp = 0; grp < (qk / (2 * blocklen)); grp++) {
                const int a_base = grp * blocklen;
                for (int col = 0; col < ncols_interleaved; col++) {
                    const int b_base = (grp * ncols_interleaved + col) * blocklen;
                    int dot = 0;
                    for (int i = 0; i < blocklen; ++i) {
                        const int packed = b_ptr[blk].qs[b_base + i];
                        // Low nibble pairs with element a_base + i, high nibble with the
                        // element qk / 2 positions later.
                        const int lo = kvalues_iq4nl[packed & 0x0F];
                        const int hi = kvalues_iq4nl[packed >> 4];
                        dot += ((lo * a_ptr[blk].qs[a_base + i]) + (hi * a_ptr[blk].qs[a_base + i + qk / 2]));
                    }
                    acc[col] += dot * GGML_CPU_FP16_TO_FP32(b_ptr[blk].d[col]) * GGML_CPU_FP16_TO_FP32(a_ptr[blk].d);
                }
            }
        }

        for (int col = 0; col < ncols_interleaved; col++) {
            s[tile * ncols_interleaved + col] = acc[col];
        }
    }
}
|
|
1523
|
+
|
|
1524
|
+
// Scalar GEMV kernel for data read from `vx` as block_q8_0x16 (16 interleaved
// columns, 1-byte interleave) against one row read from `vy` as block_q8_0.
//
//   n   - row length in elements; must be a multiple of QK8_0 (asserted)
//   s   - output, nc floats
//   bs  - not used by this kernel
//   vx  - nc / 16 tiles, each made of n / QK8_0 block_q8_0x16 entries
//   vy  - n / QK8_0 block_q8_0 entries
//   nr  - must be 1 (asserted)
//   nc  - number of output columns; must be a multiple of 16 (asserted)
void ggml_gemv_q8_0_16x1_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
    const int qk                = QK8_0;
    const int nb                = n / qk;
    const int ncols_interleaved = 16;
    const int blocklen          = 1;

    assert(nr == 1);
    assert(n % qk == 0);
    assert(nc % ncols_interleaved == 0);

    UNUSED(bs);
    UNUSED(nr);

    const block_q8_0 * a_ptr = (const block_q8_0 *) vy;

    for (int tile = 0; tile < nc / ncols_interleaved; tile++) {
        const block_q8_0x16 * b_ptr = (const block_q8_0x16 *) vx + (tile * nb);

        float acc[16] = { 0 };

        for (int blk = 0; blk < nb; blk++) {
            for (int grp = 0; grp < (qk / blocklen); grp++) {
                const int a_base = grp * blocklen;
                for (int col = 0; col < ncols_interleaved; col++) {
                    const int b_base = (grp * ncols_interleaved + col) * blocklen;
                    int dot = 0;
                    for (int i = 0; i < blocklen; ++i) {
                        const int w = b_ptr[blk].qs[b_base + i];
                        dot += w * a_ptr[blk].qs[a_base + i];
                    }
                    acc[col] += dot * GGML_CPU_FP16_TO_FP32(b_ptr[blk].d[col]) * GGML_CPU_FP16_TO_FP32(a_ptr[blk].d);
                }
            }
        }

        for (int col = 0; col < ncols_interleaved; col++) {
            s[tile * ncols_interleaved + col] = acc[col];
        }
    }
}
|
|
1564
|
+
|
|
1565
|
+
// Scalar GEMV kernel for 2-bit packed data read from `vx` as block_q2_Kx16
// (16 interleaved columns) against one row read from `vy` as block_q8_K.
// Per 256-element block and column it accumulates
//     isum  = sum(q8 * q2 * sub-block scale)
//     summs = sum(bsum * sub-block min)
// and adds  isum * (d_lhs * d_rhs) - summs * (d_lhs * dmin_rhs)  to the output.
//
//   n   - row length in elements; must be a multiple of QK_K (asserted)
//   s   - output, nc floats
//   bs  - not used by this kernel
//   vx  - nc / 16 tiles, each made of n / QK_K block_q2_Kx16 entries
//   vy  - n / QK_K block_q8_K entries
//   nr  - must be 1 (asserted)
//   nc  - number of output columns; must be a multiple of 16 (asserted)
void ggml_gemv_q2_K_16x1_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
    assert(n % QK_K == 0);
    assert(nr == 1);
    assert(nc % 16 == 0);

    UNUSED(bs);
    // `nr` is only referenced by the assert above, which compiles away under
    // NDEBUG; mark it used so release builds do not see an unused parameter.
    UNUSED(nr);

    const int nb = n / QK_K;
    const block_q2_Kx16 * x = (const block_q2_Kx16 *)vx;
    const block_q8_K * y = (const block_q8_K *)vy;

    // Layout: Even-Low(0,2,4,6), Odd-Low(1,3,5,7), Even-High(8...), Odd-High(9...)
    // sb_perm[sb] is the 16-byte row of the scales array that belongs to sub-block sb.
    const int sb_perm[16] = {
        0, 4, 1, 5, 2, 6, 3, 7, // 0-7
        8, 12, 9, 13, 10, 14, 11, 15 // 8-15
    };

    for (int col_tile = 0; col_tile < nc; col_tile += 16) {
        const block_q2_Kx16 * x_ptr = x + (col_tile / 16) * nb;
        const block_q8_K * y_ptr = y;

        float sumf[16] = {0};

        // Loop over K-blocks
        for (int k_block = 0; k_block < nb; ++k_block) {
            int32_t isum[16] = {0};
            int32_t summs[16] = {0};

            const uint8_t * qs_rhs = x_ptr[k_block].qs;
            const uint8_t * sc_rhs = x_ptr[k_block].scales;
            const int8_t * qs_lhs = y_ptr[k_block].qs;
            const int16_t * bs_lhs = y_ptr[k_block].bsums;

            // Iterate over sub-blocks 0..15
            for (int sb = 0; sb < 16; ++sb) {
                // Correction Term
                int16_t bsum = bs_lhs[sb];
                int scale_offset = sb_perm[sb] * 16;

                for (int col = 0; col < 16; ++col) {
                    uint8_t sc_val = sc_rhs[scale_offset + col];
                    summs[col] += bsum * (sc_val >> 4); // Min is high 4 bits
                }

                // Main Dot Product
                // Calculate base offsets for Q2 unpacking based on SB:
                // sub-blocks 0..7 read byte rows 0..31, sub-blocks 8..15 read rows 32..63;
                // even sub-blocks take the first 16 rows of their half, odd ones the second 16.
                int byte_base;
                if (sb < 8) byte_base = (sb % 2 == 0) ? 0 : 16;
                else        byte_base = (sb % 2 == 0) ? 32 : 48;

                // Which 2-bit field of the byte holds this sub-block's value.
                int shift = ((sb / 2) % 4) * 2;

                for (int col = 0; col < 16; ++col) {
                    uint8_t sc_val = sc_rhs[scale_offset + col];
                    int32_t d_sb = sc_val & 0xF; // Scale is low 4 bits

                    // Process 16 elements (l=0..15)
                    for (int l = 0; l < 16; ++l) {
                        // Q2: Interleaved by column. Byte `l` contains 4 k-values.
                        int qs_idx = (byte_base + l) * 16 + col;
                        uint8_t q2_val = (qs_rhs[qs_idx] >> shift) & 3;

                        // Q8: Linear access
                        int k = sb * 16 + l;
                        int8_t q8_val = qs_lhs[k];

                        isum[col] += q8_val * q2_val * d_sb;
                    }
                }
            }

            // Finalize K-Block
            for (int col = 0; col < 16; ++col) {
                float d_lhs = y_ptr[k_block].d;
                float d_rhs = GGML_FP16_TO_FP32(x_ptr[k_block].d[col]);
                float dm_rhs = GGML_FP16_TO_FP32(x_ptr[k_block].dmin[col]);

                float d_all = d_lhs * d_rhs;
                float d_min = d_lhs * dm_rhs;

                sumf[col] += (isum[col] * d_all) - (summs[col] * d_min);
            }
        }

        for (int col = 0; col < 16; ++col) {
            s[col_tile + col] = sumf[col];
        }
    }
}
|
|
1654
|
+
#endif
|
|
1655
|
+
|
|
1656
|
+
// Scalar GEMM kernel: 4-row tiles read from `vy` (as block_q8_0x4) times
// 4-column tiles read from `vx` (as block_q4_0x4, 4-byte interleave groups).
// Output element (row, col) is stored at s[row * bs + col].
//
//   n   - shared dimension in elements; must be a multiple of QK8_0 (asserted)
//   s   - output matrix
//   bs  - distance, in floats, between consecutive output rows
//   vx  - nc / 4 tiles, each made of n / QK8_0 block_q4_0x4 entries
//   vy  - nr / 4 tiles, each made of n / QK8_0 block_q8_0x4 entries
//   nr  - number of output rows; must be a multiple of 4 (asserted)
//   nc  - number of output columns; must be a multiple of 4 (asserted)
void ggml_gemm_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
    const int qk                = QK8_0;
    const int nb                = n / qk;
    const int ncols_interleaved = 4;
    const int blocklen          = 4;

    assert (n % qk == 0);
    assert (nr % 4 == 0);
    assert (nc % ncols_interleaved == 0);

    for (int row_tile = 0; row_tile < nr / 4; row_tile++) {
        const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (row_tile * nb);

        for (int col_tile = 0; col_tile < nc / ncols_interleaved; col_tile++) {
            const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (col_tile * nb);

            float acc[4][4] = { { 0 } };

            for (int blk = 0; blk < nb; blk++) {
                for (int grp = 0; grp < (qk / (2 * blocklen)); grp++) {
                    for (int m = 0; m < 4; m++) {
                        const int a_base = grp * 4 * blocklen + m * blocklen;
                        for (int j = 0; j < ncols_interleaved; j++) {
                            const int b_base = (grp * ncols_interleaved + j) * blocklen;
                            int dot = 0;
                            for (int i = 0; i < blocklen; ++i) {
                                const int packed = b_ptr[blk].qs[b_base + i];
                                // Nibbles are kept in the top four bits of an int8_t (x16);
                                // the >> 4 on the pair sum removes that factor.
                                const int lo = (int8_t) (packed << 4);
                                const int hi = (int8_t) (packed & 0xF0);
                                dot += ((lo * a_ptr[blk].qs[a_base + i]) +
                                        (hi * a_ptr[blk].qs[a_base + i + qk / 2 * 4])) >> 4;
                            }
                            acc[m][j] += dot * GGML_CPU_FP16_TO_FP32(b_ptr[blk].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[blk].d[m]);
                        }
                    }
                }
            }

            for (int m = 0; m < 4; m++) {
                for (int j = 0; j < ncols_interleaved; j++) {
                    s[(row_tile * 4 + m) * bs + col_tile * ncols_interleaved + j] = acc[m][j];
                }
            }
        }
    }
}
|
|
1711
|
+
|
|
1712
|
+
// Scalar GEMM kernel: 4-row tiles read from `vy` (as block_q8_0x4) times
// 4-column tiles read from `vx` (as block_q4_0x4, 8-byte interleave groups).
// Output element (row, col) is stored at s[row * bs + col].
//
//   n   - shared dimension in elements; must be a multiple of QK8_0 (asserted)
//   s   - output matrix
//   bs  - distance, in floats, between consecutive output rows
//   vx  - nc / 4 tiles, each made of n / QK8_0 block_q4_0x4 entries
//   vy  - nr / 4 tiles, each made of n / QK8_0 block_q8_0x4 entries
//   nr  - number of output rows; must be a multiple of 4 (asserted)
//   nc  - number of output columns; must be a multiple of 4 (asserted)
void ggml_gemm_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
    const int qk                = QK8_0;
    const int nb                = n / qk;
    const int ncols_interleaved = 4;
    const int blocklen          = 8;

    assert (n % qk == 0);
    assert (nr % 4 == 0);
    assert (nc % ncols_interleaved == 0);

    for (int row_tile = 0; row_tile < nr / 4; row_tile++) {
        const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (row_tile * nb);

        for (int col_tile = 0; col_tile < nc / ncols_interleaved; col_tile++) {
            const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (col_tile * nb);

            float acc[4][4] = { { 0 } };

            for (int blk = 0; blk < nb; blk++) {
                for (int grp = 0; grp < (qk / (2 * blocklen)); grp++) {
                    for (int m = 0; m < 4; m++) {
                        const int a_base = grp * 4 * blocklen + m * blocklen;
                        for (int j = 0; j < ncols_interleaved; j++) {
                            const int b_base = (grp * ncols_interleaved + j) * blocklen;
                            int dot = 0;
                            for (int i = 0; i < blocklen; ++i) {
                                const int packed = b_ptr[blk].qs[b_base + i];
                                // Nibbles are kept in the top four bits of an int8_t (x16);
                                // the >> 4 on the pair sum removes that factor.
                                const int lo = (int8_t) (packed << 4);
                                const int hi = (int8_t) (packed & 0xF0);
                                dot += ((lo * a_ptr[blk].qs[a_base + i]) +
                                        (hi * a_ptr[blk].qs[a_base + i + qk / 2 * 4])) >> 4;
                            }
                            acc[m][j] += dot * GGML_CPU_FP16_TO_FP32(b_ptr[blk].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[blk].d[m]);
                        }
                    }
                }
            }

            for (int m = 0; m < 4; m++) {
                for (int j = 0; j < ncols_interleaved; j++) {
                    s[(row_tile * 4 + m) * bs + col_tile * ncols_interleaved + j] = acc[m][j];
                }
            }
        }
    }
}
|
|
1765
|
+
|
|
1766
|
+
// Scalar GEMM kernel: 4-row tiles read from `vy` (as block_q8_0x4) times
// 8-column tiles read from `vx` (as block_q4_0x8, 8-byte interleave groups).
// Output element (row, col) is stored at s[row * bs + col].
//
//   n   - shared dimension in elements; must be a multiple of QK8_0 (asserted)
//   s   - output matrix
//   bs  - distance, in floats, between consecutive output rows
//   vx  - nc / 8 tiles, each made of n / QK8_0 block_q4_0x8 entries
//   vy  - nr / 4 tiles, each made of n / QK8_0 block_q8_0x4 entries
//   nr  - number of output rows; must be a multiple of 4 (asserted)
//   nc  - number of output columns; must be a multiple of 8 (asserted)
void ggml_gemm_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
    const int qk                = QK8_0;
    const int nb                = n / qk;
    const int ncols_interleaved = 8;
    const int blocklen          = 8;

    assert (n % qk == 0);
    assert (nr % 4 == 0);
    assert (nc % ncols_interleaved == 0);

    for (int row_tile = 0; row_tile < nr / 4; row_tile++) {
        const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (row_tile * nb);

        for (int col_tile = 0; col_tile < nc / ncols_interleaved; col_tile++) {
            const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (col_tile * nb);

            float acc[4][8] = { { 0 } };

            for (int blk = 0; blk < nb; blk++) {
                for (int grp = 0; grp < (qk / (2 * blocklen)); grp++) {
                    for (int m = 0; m < 4; m++) {
                        const int a_base = grp * 4 * blocklen + m * blocklen;
                        for (int j = 0; j < ncols_interleaved; j++) {
                            const int b_base = (grp * ncols_interleaved + j) * blocklen;
                            int dot = 0;
                            for (int i = 0; i < blocklen; ++i) {
                                const int packed = b_ptr[blk].qs[b_base + i];
                                // Nibbles are kept in the top four bits of an int8_t (x16);
                                // the >> 4 on the pair sum removes that factor.
                                const int lo = (int8_t) (packed << 4);
                                const int hi = (int8_t) (packed & 0xF0);
                                dot += ((lo * a_ptr[blk].qs[a_base + i]) +
                                        (hi * a_ptr[blk].qs[a_base + i + qk / 2 * 4])) >> 4;
                            }
                            acc[m][j] += dot * GGML_CPU_FP16_TO_FP32(b_ptr[blk].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[blk].d[m]);
                        }
                    }
                }
            }

            for (int m = 0; m < 4; m++) {
                for (int j = 0; j < ncols_interleaved; j++) {
                    s[(row_tile * 4 + m) * bs + col_tile * ncols_interleaved + j] = acc[m][j];
                }
            }
        }
    }
}
|
|
1819
|
+
|
|
1820
|
+
// Scalar GEMM kernel: 4-row tiles read from `vy` (as block_q8_Kx4) times
// 8-column tiles read from `vx` (as block_q4_Kx8, 4-byte interleave groups).
// Output element (row, col) = scaled dot product - min correction, stored at
// s[row * bs + col].
//
//   n   - shared dimension in elements; must be a multiple of QK_K (asserted)
//   s   - output matrix
//   bs  - distance, in floats, between consecutive output rows
//   vx  - nc / 8 tiles, each made of n / QK_K block_q4_Kx8 entries
//   vy  - nr / 4 tiles, each made of n / QK_K block_q8_Kx4 entries
//   nr  - number of output rows; must be a multiple of 4 (asserted)
//   nc  - number of output columns; must be a multiple of 8 (asserted)
void ggml_gemm_q4_K_8x4_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
    const int qk                = QK_K;
    const int nb                = n / qk;
    const int ncols_interleaved = 8;
    const int blocklen          = 4;
    static const uint32_t kmask1 = 0x3f3f3f3f;
    static const uint32_t kmask2 = 0x0f0f0f0f;
    static const uint32_t kmask3 = 0x03030303;

    assert (n % qk == 0);
    assert (nr % 4 == 0);
    assert (nc % ncols_interleaved == 0);

    // Unpacked 6-bit scales/mins: 16 bytes per sub-block (8 scales followed by 8 mins).
    uint32_t utmp[32];

    for (int row_tile = 0; row_tile < nr / 4; row_tile++) {
        const block_q8_Kx4 * a_ptr = (const block_q8_Kx4 *) vy + (row_tile * nb);

        for (int col_tile = 0; col_tile < nc / ncols_interleaved; col_tile++) {
            const block_q4_Kx8 * b_ptr = (const block_q4_Kx8 *) vx + (col_tile * nb);

            float acc[4][8]     = { { 0 } };
            float acc_min[4][8] = { { 0 } };

            for (int blk = 0; blk < nb; blk++) {
                // Expand each 12-byte packed scale group into 16 bytes. The statement
                // order matters: every word is read before it is overwritten.
                for (int sb = 0; sb < 8; sb++) {
                    uint32_t * u = utmp + sb * 4;
                    memcpy(u, b_ptr[blk].scales + sb * 12, 12);
                    u[3] = ((u[2] >> 4) & kmask2) | (((u[1] >> 6) & kmask3) << 4);
                    const uint32_t uaux_0 = u[1] & kmask1;
                    u[1] = (u[2] & kmask2) | (((u[0] >> 6) & kmask3) << 4);
                    u[2] = uaux_0;
                    u[0] &= kmask1;
                }

                for (int grp = 0; grp < (qk / (2 * blocklen)); grp++) {
                    // Eight consecutive groups share one pair of sub-block scale rows.
                    const uint8_t * scales_0 = (const uint8_t *) utmp + (grp / 8) * 32;
                    const uint8_t * scales_1 = (const uint8_t *) utmp + (grp / 8) * 32 + 16;
                    for (int m = 0; m < 4; m++) {
                        const int a_base = (grp / 8) * 256 + (grp % 8) * 4 * blocklen + m * blocklen;
                        for (int j = 0; j < ncols_interleaved; j++) {
                            const int b_base = (grp * ncols_interleaved + j) * blocklen;
                            int sumi = 0;
                            for (int i = 0; i < blocklen; ++i) {
                                const int packed = b_ptr[blk].qs[b_base + i];
                                const int v0     = (int8_t) (packed & 0xF);
                                const int v1     = (int8_t) (packed >> 4);
                                const int part_lo = (v0 * a_ptr[blk].qs[a_base + i]) * scales_0[j];
                                const int part_hi = (v1 * a_ptr[blk].qs[a_base + i + 128]) * scales_1[j];
                                sumi += part_lo + part_hi;
                            }
                            acc[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[blk].d[j]) * a_ptr[blk].d[m];
                        }
                    }
                }

                for (int sb = 0; sb < 8; sb++) {
                    const uint8_t * mins = (const uint8_t *) utmp + 8 + sb * 16;
                    for (int m = 0; m < 4; m++) {
                        // Index arithmetic selects the two bsums entries of row m for sub-block sb.
                        const int16_t * bsums = a_ptr[blk].bsums + (sb * 8) + (m * 4) - ((sb % 2) * 6);
                        for (int j = 0; j < ncols_interleaved; j++) {
                            acc_min[m][j] += mins[j] * (bsums[0] + bsums[1]) * GGML_CPU_FP16_TO_FP32(b_ptr[blk].dmin[j]) * a_ptr[blk].d[m];
                        }
                    }
                }
            }

            for (int m = 0; m < 4; m++) {
                for (int j = 0; j < ncols_interleaved; j++) {
                    s[(row_tile * 4 + m) * bs + col_tile * ncols_interleaved + j] = acc[m][j] - acc_min[m][j];
                }
            }
        }
    }
}
|
|
1902
|
+
|
|
1903
|
+
// Scalar GEMM kernel: 4-row tiles read from `vy` (as block_q8_Kx4) times
// 8-column tiles read from `vx` (as block_q4_Kx8, 8-byte interleave groups).
// Output element (row, col) = scaled dot product - min correction, stored at
// s[row * bs + col].
//
//   n   - shared dimension in elements; must be a multiple of QK_K (asserted)
//   s   - output matrix
//   bs  - distance, in floats, between consecutive output rows
//   vx  - nc / 8 tiles, each made of n / QK_K block_q4_Kx8 entries
//   vy  - nr / 4 tiles, each made of n / QK_K block_q8_Kx4 entries
//   nr  - number of output rows; must be a multiple of 4 (asserted)
//   nc  - number of output columns; must be a multiple of 8 (asserted)
void ggml_gemm_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
    const int qk                = QK_K;
    const int nb                = n / qk;
    const int ncols_interleaved = 8;
    const int blocklen          = 8;
    static const uint32_t kmask1 = 0x3f3f3f3f;
    static const uint32_t kmask2 = 0x0f0f0f0f;
    static const uint32_t kmask3 = 0x03030303;

    assert (n % qk == 0);
    assert (nr % 4 == 0);
    assert (nc % ncols_interleaved == 0);

    // Unpacked 6-bit scales/mins: 16 bytes per sub-block (8 scales followed by 8 mins).
    uint32_t utmp[32];

    for (int row_tile = 0; row_tile < nr / 4; row_tile++) {
        const block_q8_Kx4 * a_ptr = (const block_q8_Kx4 *) vy + (row_tile * nb);

        for (int col_tile = 0; col_tile < nc / ncols_interleaved; col_tile++) {
            const block_q4_Kx8 * b_ptr = (const block_q4_Kx8 *) vx + (col_tile * nb);

            float acc[4][8]     = { { 0 } };
            float acc_min[4][8] = { { 0 } };

            for (int blk = 0; blk < nb; blk++) {
                // Expand each 12-byte packed scale group into 16 bytes. The statement
                // order matters: every word is read before it is overwritten.
                for (int sb = 0; sb < 8; sb++) {
                    uint32_t * u = utmp + sb * 4;
                    memcpy(u, b_ptr[blk].scales + sb * 12, 12);
                    u[3] = ((u[2] >> 4) & kmask2) | (((u[1] >> 6) & kmask3) << 4);
                    const uint32_t uaux_0 = u[1] & kmask1;
                    u[1] = (u[2] & kmask2) | (((u[0] >> 6) & kmask3) << 4);
                    u[2] = uaux_0;
                    u[0] &= kmask1;
                }

                for (int grp = 0; grp < (qk / (2 * blocklen)); grp++) {
                    // Four consecutive groups share one pair of sub-block scale rows.
                    const uint8_t * scales_0 = (const uint8_t *) utmp + (grp / 4) * 32;
                    const uint8_t * scales_1 = (const uint8_t *) utmp + (grp / 4) * 32 + 16;
                    for (int m = 0; m < 4; m++) {
                        // grp is never negative, so grp / 4 equals the original grp >> 2.
                        const int a_base = (grp / 4) * 256 + (grp % 4) * 4 * blocklen + m * blocklen;
                        for (int j = 0; j < ncols_interleaved; j++) {
                            const int b_base = (grp * ncols_interleaved + j) * blocklen;
                            int sumi = 0;
                            for (int i = 0; i < blocklen; ++i) {
                                const int packed = b_ptr[blk].qs[b_base + i];
                                const int v0     = (int8_t) (packed & 0xF);
                                const int v1     = (int8_t) (packed >> 4);
                                const int part_lo = (v0 * a_ptr[blk].qs[a_base + i]) * scales_0[j];
                                const int part_hi = (v1 * a_ptr[blk].qs[a_base + i + 128]) * scales_1[j];
                                sumi += part_lo + part_hi;
                            }
                            acc[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[blk].d[j]) * a_ptr[blk].d[m];
                        }
                    }
                }

                for (int sb = 0; sb < 8; sb++) {
                    const uint8_t * mins = (const uint8_t *) utmp + 8 + sb * 16;
                    for (int m = 0; m < 4; m++) {
                        // Index arithmetic selects the two bsums entries of row m for sub-block sb.
                        const int16_t * bsums = a_ptr[blk].bsums + (sb * 8) + (m * 4) - ((sb % 2) * 6);
                        for (int j = 0; j < ncols_interleaved; j++) {
                            acc_min[m][j] += mins[j] * (bsums[0] + bsums[1]) * GGML_CPU_FP16_TO_FP32(b_ptr[blk].dmin[j]) * a_ptr[blk].d[m];
                        }
                    }
                }
            }

            for (int m = 0; m < 4; m++) {
                for (int j = 0; j < ncols_interleaved; j++) {
                    s[(row_tile * 4 + m) * bs + col_tile * ncols_interleaved + j] = acc[m][j] - acc_min[m][j];
                }
            }
        }
    }
}
|
|
694
1983
|
|
|
695
|
-
void
|
|
696
|
-
|
|
697
|
-
|
|
698
|
-
|
|
699
|
-
|
|
700
|
-
int nr,
|
|
701
|
-
int nc) {
|
|
702
|
-
const int qk = QK8_0;
|
|
703
|
-
const int nb = n / qk;
|
|
704
|
-
const int ncols_interleaved = 4;
|
|
705
|
-
const int blocklen = 4;
|
|
1984
|
+
void ggml_gemm_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
|
1985
|
+
const int qk = QK_K;
|
|
1986
|
+
const int nb = n / qk;
|
|
1987
|
+
const int ncols_interleaved = 8;
|
|
1988
|
+
const int blocklen = 8;
|
|
706
1989
|
|
|
707
|
-
assert(
|
|
708
|
-
assert(
|
|
709
|
-
assert(nc % ncols_interleaved == 0);
|
|
1990
|
+
assert (n % qk == 0);
|
|
1991
|
+
assert (nr % 4 == 0);
|
|
1992
|
+
assert (nc % ncols_interleaved == 0);
|
|
710
1993
|
|
|
1994
|
+
UNUSED(s);
|
|
711
1995
|
UNUSED(bs);
|
|
1996
|
+
UNUSED(vx);
|
|
1997
|
+
UNUSED(vy);
|
|
712
1998
|
UNUSED(nr);
|
|
1999
|
+
UNUSED(nc);
|
|
2000
|
+
UNUSED(nb);
|
|
2001
|
+
UNUSED(ncols_interleaved);
|
|
2002
|
+
UNUSED(blocklen);
|
|
713
2003
|
|
|
714
|
-
float sumf[4];
|
|
715
|
-
|
|
716
|
-
|
|
717
|
-
|
|
718
|
-
for (int x = 0; x < nc / ncols_interleaved; x++) {
|
|
719
|
-
const block_q8_0x4 * b_ptr = (const block_q8_0x4 *) vx + (x * nb);
|
|
2004
|
+
float sumf[4][8];
|
|
2005
|
+
float sum_minf[4][8];
|
|
2006
|
+
int sumi1, sumi2, sumi3, sumi4;
|
|
2007
|
+
int sumi;
|
|
720
2008
|
|
|
721
|
-
|
|
722
|
-
|
|
723
|
-
|
|
724
|
-
|
|
725
|
-
for (int
|
|
2009
|
+
for (int y = 0; y < nr / 4; y++) {
|
|
2010
|
+
const block_q8_Kx4 * a_ptr = (const block_q8_Kx4 *) vy + (y * nb);
|
|
2011
|
+
for (int x = 0; x < nc / ncols_interleaved; x++) {
|
|
2012
|
+
const block_q2_Kx8 * b_ptr = (const block_q2_Kx8 *) vx + (x * nb);
|
|
2013
|
+
for (int m = 0; m < 4; m++) {
|
|
726
2014
|
for (int j = 0; j < ncols_interleaved; j++) {
|
|
727
|
-
|
|
728
|
-
|
|
729
|
-
|
|
730
|
-
|
|
2015
|
+
sumf[m][j] = 0.0;
|
|
2016
|
+
sum_minf[m][j] = 0.0;
|
|
2017
|
+
}
|
|
2018
|
+
}
|
|
2019
|
+
for (int l = 0; l < nb; l++) {
|
|
2020
|
+
for (int k = 0; k < (qk / (4 * blocklen)); k++) {
|
|
2021
|
+
|
|
2022
|
+
const uint8_t *scales_0 = b_ptr[l].scales + (k / 4) * 64 ;
|
|
2023
|
+
const uint8_t *scales_1 = b_ptr[l].scales + (k / 4) * 64 + 16;
|
|
2024
|
+
const uint8_t *scales_2 = b_ptr[l].scales + (k / 4) * 64 + 32;
|
|
2025
|
+
const uint8_t *scales_3 = b_ptr[l].scales + (k / 4) * 64 + 48;
|
|
2026
|
+
for (int m = 0; m < 4; m++) {
|
|
2027
|
+
for (int j = 0; j < ncols_interleaved; j++) {
|
|
2028
|
+
sumi1 = 0;
|
|
2029
|
+
sumi2 = 0;
|
|
2030
|
+
sumi3 = 0;
|
|
2031
|
+
sumi4 = 0;
|
|
2032
|
+
sumi = 0;
|
|
2033
|
+
int offset = ((k / 2) % 2) + j * 2;
|
|
2034
|
+
for (int i = 0; i < blocklen; ++i){
|
|
2035
|
+
const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 3);
|
|
2036
|
+
const int v1 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 2 ) & 3);
|
|
2037
|
+
const int v2 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4 ) & 3);
|
|
2038
|
+
const int v3 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 6 ) & 3);
|
|
2039
|
+
sumi1 = (v0 * a_ptr[l].qs[(k >> 2) * 512 + (k % 4) * 4 * blocklen + m * blocklen + i]);
|
|
2040
|
+
sumi2 = (v1 * a_ptr[l].qs[(k >> 2) * 512 + (k % 4) * 4 * blocklen + m * blocklen + i + 128]);
|
|
2041
|
+
sumi3 = (v2 * a_ptr[l].qs[(k >> 2) * 512 + (k % 4) * 4 * blocklen + m * blocklen + i + 256]);
|
|
2042
|
+
sumi4 = (v3 * a_ptr[l].qs[(k >> 2) * 512 + (k % 4) * 4 * blocklen + m * blocklen + i + 384]);
|
|
2043
|
+
sumi1 = sumi1 * (scales_0[offset] & 0xF);
|
|
2044
|
+
sumi2 = sumi2 * (scales_1[offset] & 0xF);
|
|
2045
|
+
sumi3 = sumi3 * (scales_2[offset] & 0xF);
|
|
2046
|
+
sumi4 = sumi4 * (scales_3[offset] & 0xF);
|
|
2047
|
+
sumi += sumi1 + sumi2 + sumi3 + sumi4;
|
|
2048
|
+
}
|
|
2049
|
+
sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d[m];
|
|
2050
|
+
}
|
|
2051
|
+
}
|
|
2052
|
+
}
|
|
2053
|
+
for(int sb = 0; sb < 8; sb++) {
|
|
2054
|
+
const uint8_t *mins = b_ptr[l].scales + sb * 16;
|
|
2055
|
+
for(int m = 0; m < 4; m++) {
|
|
2056
|
+
const int16_t *bsums = a_ptr[l].bsums + (sb * 8) + (m * 4) - ((sb % 2) * 6);
|
|
2057
|
+
for(int j = 0; j < ncols_interleaved; j++) {
|
|
2058
|
+
int mins_prod = ((mins[j * 2] >> 4) * bsums[0] + (mins[(j * 2)+ 1] >> 4) * bsums[1]);
|
|
2059
|
+
sum_minf[m][j] += (mins_prod) * GGML_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d[m];
|
|
2060
|
+
}
|
|
731
2061
|
}
|
|
732
|
-
sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
|
|
733
2062
|
}
|
|
734
2063
|
}
|
|
735
|
-
|
|
736
|
-
|
|
737
|
-
|
|
2064
|
+
|
|
2065
|
+
for (int m = 0; m < 4; m++) {
|
|
2066
|
+
for (int j = 0; j < ncols_interleaved; j++) {
|
|
2067
|
+
s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j] - sum_minf[m][j];
|
|
2068
|
+
}
|
|
2069
|
+
}
|
|
738
2070
|
}
|
|
739
2071
|
}
|
|
740
2072
|
}
|
|
741
2073
|
|
|
742
|
-
void
|
|
743
|
-
|
|
744
|
-
|
|
745
|
-
const void * GGML_RESTRICT vx,
|
|
746
|
-
const void * GGML_RESTRICT vy,
|
|
747
|
-
int nr,
|
|
748
|
-
int nc) {
|
|
749
|
-
const int qk = QK8_0;
|
|
750
|
-
const int nb = n / qk;
|
|
751
|
-
const int ncols_interleaved = 4;
|
|
752
|
-
const int blocklen = 8;
|
|
753
|
-
|
|
754
|
-
assert(nr == 1);
|
|
755
|
-
assert(n % qk == 0);
|
|
756
|
-
assert(nc % ncols_interleaved == 0);
|
|
757
|
-
|
|
758
|
-
UNUSED(bs);
|
|
759
|
-
UNUSED(nr);
|
|
2074
|
+
void ggml_gemm_q5_K_8x4_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
|
2075
|
+
ggml_gemm_q5_K_NxM_q8_K_generic_impl<4, 8>(n, s, bs, vx, vy, nr, nc);
|
|
2076
|
+
}
|
|
760
2077
|
|
|
761
|
-
|
|
762
|
-
|
|
2078
|
+
void ggml_gemm_q5_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
|
2079
|
+
ggml_gemm_q5_K_NxM_q8_K_generic_impl<8, 8>(n, s, bs, vx, vy, nr, nc);
|
|
2080
|
+
}
|
|
763
2081
|
|
|
764
|
-
|
|
765
|
-
|
|
766
|
-
|
|
2082
|
+
void ggml_gemm_q6_K_8x4_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
|
2083
|
+
ggml_gemm_q6_K_NxM_q8_K_generic_impl<4, 8>(n, s, bs, vx, vy, nr, nc);
|
|
2084
|
+
}
|
|
767
2085
|
|
|
768
|
-
|
|
769
|
-
|
|
770
|
-
}
|
|
771
|
-
for (int l = 0; l < nb; l++) {
|
|
772
|
-
for (int k = 0; k < (qk / blocklen); k++) {
|
|
773
|
-
for (int j = 0; j < ncols_interleaved; j++) {
|
|
774
|
-
sumi = 0;
|
|
775
|
-
for (int i = 0; i < blocklen; ++i) {
|
|
776
|
-
const int v0 = b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i];
|
|
777
|
-
sumi += v0 * a_ptr[l].qs[k * blocklen + i];
|
|
778
|
-
}
|
|
779
|
-
sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
|
|
780
|
-
}
|
|
781
|
-
}
|
|
782
|
-
}
|
|
783
|
-
for (int j = 0; j < ncols_interleaved; j++) {
|
|
784
|
-
s[x * ncols_interleaved + j] = sumf[j];
|
|
785
|
-
}
|
|
786
|
-
}
|
|
2086
|
+
void ggml_gemm_q6_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
|
2087
|
+
ggml_gemm_q6_K_NxM_q8_K_generic_impl<8, 8>(n, s, bs, vx, vy, nr, nc);
|
|
787
2088
|
}
|
|
788
2089
|
|
|
789
|
-
void
|
|
2090
|
+
void ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
|
790
2091
|
const int qk = QK8_0;
|
|
791
2092
|
const int nb = n / qk;
|
|
792
2093
|
const int ncols_interleaved = 4;
|
|
@@ -813,7 +2114,7 @@ void ggml_gemm_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
|
|
|
813
2114
|
for (int y = 0; y < nr / 4; y++) {
|
|
814
2115
|
const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
|
|
815
2116
|
for (int x = 0; x < nc / ncols_interleaved; x++) {
|
|
816
|
-
const
|
|
2117
|
+
const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb);
|
|
817
2118
|
for (int m = 0; m < 4; m++) {
|
|
818
2119
|
for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
|
|
819
2120
|
}
|
|
@@ -823,10 +2124,10 @@ void ggml_gemm_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
|
|
|
823
2124
|
for (int j = 0; j < ncols_interleaved; j++) {
|
|
824
2125
|
sumi = 0;
|
|
825
2126
|
for (int i = 0; i < blocklen; ++i) {
|
|
826
|
-
const int v0 =
|
|
827
|
-
const int v1 =
|
|
2127
|
+
const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
|
|
2128
|
+
const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
|
|
828
2129
|
sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
|
|
829
|
-
(v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4]))
|
|
2130
|
+
(v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4]));
|
|
830
2131
|
}
|
|
831
2132
|
sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
|
|
832
2133
|
}
|
|
@@ -842,33 +2143,23 @@ void ggml_gemm_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
|
|
|
842
2143
|
}
|
|
843
2144
|
}
|
|
844
2145
|
|
|
845
|
-
void
|
|
2146
|
+
void ggml_gemm_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
|
846
2147
|
const int qk = QK8_0;
|
|
847
2148
|
const int nb = n / qk;
|
|
848
|
-
const int ncols_interleaved =
|
|
2149
|
+
const int ncols_interleaved = 8;
|
|
849
2150
|
const int blocklen = 8;
|
|
850
2151
|
|
|
851
|
-
assert
|
|
852
|
-
assert
|
|
853
|
-
assert
|
|
854
|
-
|
|
855
|
-
UNUSED(s);
|
|
856
|
-
UNUSED(bs);
|
|
857
|
-
UNUSED(vx);
|
|
858
|
-
UNUSED(vy);
|
|
859
|
-
UNUSED(nr);
|
|
860
|
-
UNUSED(nc);
|
|
861
|
-
UNUSED(nb);
|
|
862
|
-
UNUSED(ncols_interleaved);
|
|
863
|
-
UNUSED(blocklen);
|
|
2152
|
+
assert(n % qk == 0);
|
|
2153
|
+
assert(nr % 4 == 0);
|
|
2154
|
+
assert(nc % ncols_interleaved == 0);
|
|
864
2155
|
|
|
865
|
-
float sumf[4][
|
|
2156
|
+
float sumf[4][8];
|
|
866
2157
|
int sumi;
|
|
867
2158
|
|
|
868
2159
|
for (int y = 0; y < nr / 4; y++) {
|
|
869
2160
|
const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
|
|
870
2161
|
for (int x = 0; x < nc / ncols_interleaved; x++) {
|
|
871
|
-
const
|
|
2162
|
+
const block_iq4_nlx8 * b_ptr = (const block_iq4_nlx8 *) vx + (x * nb);
|
|
872
2163
|
for (int m = 0; m < 4; m++) {
|
|
873
2164
|
for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
|
|
874
2165
|
}
|
|
@@ -878,10 +2169,10 @@ void ggml_gemm_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
|
|
|
878
2169
|
for (int j = 0; j < ncols_interleaved; j++) {
|
|
879
2170
|
sumi = 0;
|
|
880
2171
|
for (int i = 0; i < blocklen; ++i) {
|
|
881
|
-
const int v0 =
|
|
882
|
-
const int v1 =
|
|
2172
|
+
const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
|
|
2173
|
+
const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
|
|
883
2174
|
sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
|
|
884
|
-
|
|
2175
|
+
(v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4]));
|
|
885
2176
|
}
|
|
886
2177
|
sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
|
|
887
2178
|
}
|
|
@@ -896,25 +2187,59 @@ void ggml_gemm_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
|
|
|
896
2187
|
}
|
|
897
2188
|
}
|
|
898
2189
|
|
|
899
|
-
void
|
|
2190
|
+
void ggml_gemm_mxfp4_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
|
900
2191
|
const int qk = QK8_0;
|
|
901
2192
|
const int nb = n / qk;
|
|
902
|
-
const int ncols_interleaved =
|
|
903
|
-
const int blocklen =
|
|
2193
|
+
const int ncols_interleaved = 4;
|
|
2194
|
+
const int blocklen = 4;
|
|
904
2195
|
|
|
905
|
-
assert
|
|
906
|
-
assert
|
|
907
|
-
assert
|
|
2196
|
+
assert(n % qk == 0);
|
|
2197
|
+
assert(nr % 4 == 0);
|
|
2198
|
+
assert(nc % ncols_interleaved == 0);
|
|
908
2199
|
|
|
909
|
-
|
|
910
|
-
|
|
911
|
-
|
|
912
|
-
|
|
913
|
-
|
|
914
|
-
|
|
915
|
-
|
|
916
|
-
|
|
917
|
-
|
|
2200
|
+
float sumf[4][4];
|
|
2201
|
+
int sumi;
|
|
2202
|
+
|
|
2203
|
+
for (int y = 0; y < nr / 4; y++) {
|
|
2204
|
+
const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
|
|
2205
|
+
for (int x = 0; x < nc / ncols_interleaved; x++) {
|
|
2206
|
+
const block_mxfp4x4 * b_ptr = (const block_mxfp4x4 *) vx + (x * nb);
|
|
2207
|
+
for (int m = 0; m < 4; m++) {
|
|
2208
|
+
for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
|
|
2209
|
+
}
|
|
2210
|
+
for (int l = 0; l < nb; l++) {
|
|
2211
|
+
for (int k = 0; k < (qk / (2 * blocklen)); k++) {
|
|
2212
|
+
for (int m = 0; m < 4; m++) {
|
|
2213
|
+
for (int j = 0; j < ncols_interleaved; j++) {
|
|
2214
|
+
sumi = 0;
|
|
2215
|
+
for (int i = 0; i < blocklen; ++i) {
|
|
2216
|
+
const int v0 = kvalues_mxfp4[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
|
|
2217
|
+
const int v1 = kvalues_mxfp4[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
|
|
2218
|
+
sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
|
|
2219
|
+
(v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4]));
|
|
2220
|
+
}
|
|
2221
|
+
sumf[m][j] += sumi * GGML_CPU_E8M0_TO_FP32_HALF(b_ptr[l].e[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
|
|
2222
|
+
}
|
|
2223
|
+
}
|
|
2224
|
+
}
|
|
2225
|
+
}
|
|
2226
|
+
for (int m = 0; m < 4; m++) {
|
|
2227
|
+
for (int j = 0; j < ncols_interleaved; j++)
|
|
2228
|
+
s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
|
|
2229
|
+
}
|
|
2230
|
+
}
|
|
2231
|
+
}
|
|
2232
|
+
}
|
|
2233
|
+
|
|
2234
|
+
void ggml_gemm_mxfp4_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
|
2235
|
+
const int qk = QK8_0;
|
|
2236
|
+
const int nb = n / qk;
|
|
2237
|
+
const int ncols_interleaved = 8;
|
|
2238
|
+
const int blocklen = 8;
|
|
2239
|
+
|
|
2240
|
+
assert(n % qk == 0);
|
|
2241
|
+
assert(nr % 4 == 0);
|
|
2242
|
+
assert(nc % ncols_interleaved == 0);
|
|
918
2243
|
|
|
919
2244
|
float sumf[4][8];
|
|
920
2245
|
int sumi;
|
|
@@ -922,7 +2247,7 @@ void ggml_gemm_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
|
|
|
922
2247
|
for (int y = 0; y < nr / 4; y++) {
|
|
923
2248
|
const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
|
|
924
2249
|
for (int x = 0; x < nc / ncols_interleaved; x++) {
|
|
925
|
-
const
|
|
2250
|
+
const block_mxfp4x8 * b_ptr = (const block_mxfp4x8 *) vx + (x * nb);
|
|
926
2251
|
for (int m = 0; m < 4; m++) {
|
|
927
2252
|
for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
|
|
928
2253
|
}
|
|
@@ -932,12 +2257,12 @@ void ggml_gemm_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
|
|
|
932
2257
|
for (int j = 0; j < ncols_interleaved; j++) {
|
|
933
2258
|
sumi = 0;
|
|
934
2259
|
for (int i = 0; i < blocklen; ++i) {
|
|
935
|
-
const int v0 =
|
|
936
|
-
const int v1 =
|
|
2260
|
+
const int v0 = kvalues_mxfp4[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
|
|
2261
|
+
const int v1 = kvalues_mxfp4[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
|
|
937
2262
|
sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
|
|
938
|
-
(v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4]))
|
|
2263
|
+
(v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4]));
|
|
939
2264
|
}
|
|
940
|
-
sumf[m][j] += sumi *
|
|
2265
|
+
sumf[m][j] += sumi * GGML_CPU_E8M0_TO_FP32_HALF(b_ptr[l].e[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
|
|
941
2266
|
}
|
|
942
2267
|
}
|
|
943
2268
|
}
|
|
@@ -950,183 +2275,118 @@ void ggml_gemm_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
|
|
|
950
2275
|
}
|
|
951
2276
|
}
|
|
952
2277
|
|
|
953
|
-
void
|
|
954
|
-
|
|
955
|
-
|
|
956
|
-
|
|
957
|
-
|
|
958
|
-
|
|
959
|
-
|
|
960
|
-
|
|
961
|
-
|
|
962
|
-
|
|
963
|
-
|
|
964
|
-
assert (nc % ncols_interleaved == 0);
|
|
2278
|
+
void ggml_gemm_q8_0_4x4_q8_0_generic(int n,
|
|
2279
|
+
float * GGML_RESTRICT s,
|
|
2280
|
+
size_t bs,
|
|
2281
|
+
const void * GGML_RESTRICT vx,
|
|
2282
|
+
const void * GGML_RESTRICT vy,
|
|
2283
|
+
int nr,
|
|
2284
|
+
int nc) {
|
|
2285
|
+
const int qk = QK8_0;
|
|
2286
|
+
const int nb = n / qk;
|
|
2287
|
+
const int ncols_interleaved = 4;
|
|
2288
|
+
const int blocklen = 4;
|
|
965
2289
|
|
|
966
|
-
|
|
967
|
-
|
|
968
|
-
|
|
2290
|
+
assert(n % qk == 0);
|
|
2291
|
+
assert(nr % 4 == 0);
|
|
2292
|
+
assert(nc % ncols_interleaved == 0);
|
|
969
2293
|
|
|
970
|
-
float sumf[4][
|
|
971
|
-
|
|
972
|
-
uint32_t utmp[32];
|
|
973
|
-
int sumi1;
|
|
974
|
-
int sumi2;
|
|
975
|
-
int sumi;
|
|
2294
|
+
float sumf[4][4];
|
|
2295
|
+
int sumi;
|
|
976
2296
|
|
|
977
2297
|
for (int y = 0; y < nr / 4; y++) {
|
|
978
|
-
const
|
|
2298
|
+
const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
|
|
979
2299
|
for (int x = 0; x < nc / ncols_interleaved; x++) {
|
|
980
|
-
const
|
|
2300
|
+
const block_q8_0x4 * b_ptr = (const block_q8_0x4 *) vx + (x * nb);
|
|
981
2301
|
for (int m = 0; m < 4; m++) {
|
|
982
2302
|
for (int j = 0; j < ncols_interleaved; j++) {
|
|
983
2303
|
sumf[m][j] = 0.0;
|
|
984
|
-
sum_minf[m][j] = 0.0;
|
|
985
2304
|
}
|
|
986
2305
|
}
|
|
987
2306
|
for (int l = 0; l < nb; l++) {
|
|
988
|
-
for (int
|
|
989
|
-
memcpy(utmp + sb * 4, b_ptr[l].scales + sb * 12, 12);
|
|
990
|
-
utmp[sb * 4 + 3] = ((utmp[sb * 4 + 2] >> 4) & kmask2) | (((utmp[sb * 4 + 1] >> 6) & kmask3) << 4);
|
|
991
|
-
const uint32_t uaux_0 = utmp[sb * 4 + 1] & kmask1;
|
|
992
|
-
utmp[sb * 4 + 1] = (utmp[sb * 4 + 2] & kmask2) | (((utmp[sb * 4 + 0] >> 6) & kmask3) << 4);
|
|
993
|
-
utmp[sb * 4 + 2] = uaux_0;
|
|
994
|
-
utmp[sb * 4 + 0] &= kmask1;
|
|
995
|
-
}
|
|
996
|
-
for (int k = 0; k < (qk / (2 * blocklen)); k++) {
|
|
997
|
-
uint8_t * scales_0 = (uint8_t *) utmp + (k / 8) * 32;
|
|
998
|
-
uint8_t * scales_1 = (uint8_t *) utmp + (k / 8) * 32 + 16;
|
|
2307
|
+
for (int k = 0; k < (qk / blocklen); k++) {
|
|
999
2308
|
for (int m = 0; m < 4; m++) {
|
|
1000
2309
|
for (int j = 0; j < ncols_interleaved; j++) {
|
|
1001
|
-
sumi1 = 0;
|
|
1002
|
-
sumi2 = 0;
|
|
1003
2310
|
sumi = 0;
|
|
1004
2311
|
for (int i = 0; i < blocklen; ++i) {
|
|
1005
|
-
const int v0 =
|
|
1006
|
-
|
|
1007
|
-
sumi1 = (v0 * a_ptr[l].qs[(k / 8) * 256 + (k % 8) * 4 * blocklen + m * blocklen + i]);
|
|
1008
|
-
sumi2 = (v1 * a_ptr[l].qs[(k / 8) * 256 + (k % 8) * 4 * blocklen + m * blocklen + i + 128]);
|
|
1009
|
-
sumi1 = sumi1 * scales_0[j];
|
|
1010
|
-
sumi2 = sumi2 * scales_1[j];
|
|
1011
|
-
sumi += sumi1 + sumi2;
|
|
2312
|
+
const int v0 = b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i];
|
|
2313
|
+
sumi += v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i];
|
|
1012
2314
|
}
|
|
1013
|
-
sumf[m][j] +=
|
|
1014
|
-
|
|
1015
|
-
}
|
|
1016
|
-
}
|
|
1017
|
-
for (int sb = 0; sb < 8; sb++) {
|
|
1018
|
-
uint8_t * mins = (uint8_t *) utmp + 8 + sb * 16;
|
|
1019
|
-
for(int m = 0; m < 4; m++) {
|
|
1020
|
-
const int16_t * bsums = a_ptr[l].bsums + (sb * 8) + (m * 4) - ((sb % 2) * 6);
|
|
1021
|
-
for(int j = 0; j < ncols_interleaved; j++) {
|
|
1022
|
-
sum_minf[m][j] += mins[j] * (bsums[0] + bsums[1]) * GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d[m];
|
|
2315
|
+
sumf[m][j] +=
|
|
2316
|
+
sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
|
|
1023
2317
|
}
|
|
1024
2318
|
}
|
|
1025
2319
|
}
|
|
1026
2320
|
}
|
|
1027
2321
|
for (int m = 0; m < 4; m++) {
|
|
1028
2322
|
for (int j = 0; j < ncols_interleaved; j++) {
|
|
1029
|
-
s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j]
|
|
2323
|
+
s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
|
|
1030
2324
|
}
|
|
1031
2325
|
}
|
|
1032
2326
|
}
|
|
1033
2327
|
}
|
|
1034
2328
|
}
|
|
1035
2329
|
|
|
1036
|
-
void ggml_gemm_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
|
1037
|
-
const int qk = QK_K;
|
|
1038
|
-
const int nb = n / qk;
|
|
1039
|
-
const int ncols_interleaved = 8;
|
|
1040
|
-
const int blocklen = 8;
|
|
1041
|
-
static const uint32_t kmask1 = 0x3f3f3f3f;
|
|
1042
|
-
static const uint32_t kmask2 = 0x0f0f0f0f;
|
|
1043
|
-
static const uint32_t kmask3 = 0x03030303;
|
|
1044
2330
|
|
|
1045
|
-
assert (n % qk == 0);
|
|
1046
|
-
assert (nr % 4 == 0);
|
|
1047
|
-
assert (nc % ncols_interleaved == 0);
|
|
1048
2331
|
|
|
1049
|
-
|
|
1050
|
-
|
|
1051
|
-
|
|
1052
|
-
|
|
1053
|
-
|
|
1054
|
-
|
|
1055
|
-
|
|
1056
|
-
|
|
1057
|
-
|
|
2332
|
+
void ggml_gemm_q8_0_4x8_q8_0_generic(int n,
|
|
2333
|
+
float * GGML_RESTRICT s,
|
|
2334
|
+
size_t bs,
|
|
2335
|
+
const void * GGML_RESTRICT vx,
|
|
2336
|
+
const void * GGML_RESTRICT vy,
|
|
2337
|
+
int nr,
|
|
2338
|
+
int nc) {
|
|
2339
|
+
const int qk = QK8_0;
|
|
2340
|
+
const int nb = n / qk;
|
|
2341
|
+
const int ncols_interleaved = 4;
|
|
2342
|
+
const int blocklen = 8;
|
|
1058
2343
|
|
|
1059
|
-
|
|
1060
|
-
|
|
1061
|
-
|
|
1062
|
-
|
|
1063
|
-
|
|
1064
|
-
int
|
|
2344
|
+
assert(n % qk == 0);
|
|
2345
|
+
assert(nr % 4 == 0);
|
|
2346
|
+
assert(nc % ncols_interleaved == 0);
|
|
2347
|
+
|
|
2348
|
+
float sumf[4][4];
|
|
2349
|
+
int sumi;
|
|
1065
2350
|
|
|
1066
2351
|
for (int y = 0; y < nr / 4; y++) {
|
|
1067
|
-
const
|
|
2352
|
+
const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
|
|
1068
2353
|
for (int x = 0; x < nc / ncols_interleaved; x++) {
|
|
1069
|
-
const
|
|
2354
|
+
const block_q8_0x4 * b_ptr = (const block_q8_0x4 *) vx + (x * nb);
|
|
1070
2355
|
for (int m = 0; m < 4; m++) {
|
|
1071
2356
|
for (int j = 0; j < ncols_interleaved; j++) {
|
|
1072
2357
|
sumf[m][j] = 0.0;
|
|
1073
|
-
sum_minf[m][j] = 0.0;
|
|
1074
2358
|
}
|
|
1075
2359
|
}
|
|
1076
2360
|
for (int l = 0; l < nb; l++) {
|
|
1077
|
-
for (int
|
|
1078
|
-
memcpy(utmp + sb * 4, b_ptr[l].scales + sb * 12, 12);
|
|
1079
|
-
utmp[sb * 4 + 3] = ((utmp[sb * 4 + 2] >> 4) & kmask2) | (((utmp[sb * 4 + 1] >> 6) & kmask3) << 4);
|
|
1080
|
-
const uint32_t uaux_0 = utmp[sb * 4 + 1] & kmask1;
|
|
1081
|
-
utmp[sb * 4 + 1] = (utmp[sb * 4 + 2] & kmask2) | (((utmp[sb * 4 + 0] >> 6) & kmask3) << 4);
|
|
1082
|
-
utmp[sb * 4 + 2] = uaux_0;
|
|
1083
|
-
utmp[sb * 4 + 0] &= kmask1;
|
|
1084
|
-
}
|
|
1085
|
-
for (int k = 0; k < (qk / (2 * blocklen)); k++) {
|
|
1086
|
-
uint8_t *scales_0 = (uint8_t*) utmp + (k / 4) * 32;
|
|
1087
|
-
uint8_t *scales_1 = (uint8_t*) utmp + (k / 4) * 32 + 16;
|
|
2361
|
+
for (int k = 0; k < (qk / blocklen); k++) {
|
|
1088
2362
|
for (int m = 0; m < 4; m++) {
|
|
1089
2363
|
for (int j = 0; j < ncols_interleaved; j++) {
|
|
1090
|
-
sumi1 = 0;
|
|
1091
|
-
sumi2 = 0;
|
|
1092
2364
|
sumi = 0;
|
|
1093
2365
|
for (int i = 0; i < blocklen; ++i) {
|
|
1094
|
-
const int v0 =
|
|
1095
|
-
|
|
1096
|
-
sumi1 = (v0 * a_ptr[l].qs[(k >> 2) * 256 + (k % 4) * 4 * blocklen + m * blocklen + i]);
|
|
1097
|
-
sumi2 = (v1 * a_ptr[l].qs[(k >> 2) * 256 + (k % 4) * 4 * blocklen + m * blocklen + i + 128]);
|
|
1098
|
-
sumi1 = sumi1 * scales_0[j];
|
|
1099
|
-
sumi2 = sumi2 * scales_1[j];
|
|
1100
|
-
sumi += sumi1 + sumi2;
|
|
2366
|
+
const int v0 = b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i];
|
|
2367
|
+
sumi += v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i];
|
|
1101
2368
|
}
|
|
1102
|
-
sumf[m][j] +=
|
|
1103
|
-
|
|
1104
|
-
}
|
|
1105
|
-
}
|
|
1106
|
-
for (int sb = 0; sb < 8; sb++) {
|
|
1107
|
-
uint8_t *mins = (uint8_t*) utmp + 8 + sb * 16;
|
|
1108
|
-
for(int m = 0; m < 4; m++) {
|
|
1109
|
-
const int16_t *bsums = a_ptr[l].bsums + (sb * 8) + (m * 4) - ((sb % 2) * 6);
|
|
1110
|
-
for(int j = 0; j < ncols_interleaved; j++) {
|
|
1111
|
-
sum_minf[m][j] += mins[j] * (bsums[0] + bsums[1]) * GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d[m];
|
|
2369
|
+
sumf[m][j] +=
|
|
2370
|
+
sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
|
|
1112
2371
|
}
|
|
1113
2372
|
}
|
|
1114
2373
|
}
|
|
1115
2374
|
}
|
|
1116
2375
|
for (int m = 0; m < 4; m++) {
|
|
1117
2376
|
for (int j = 0; j < ncols_interleaved; j++) {
|
|
1118
|
-
s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j]
|
|
2377
|
+
s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
|
|
1119
2378
|
}
|
|
1120
2379
|
}
|
|
1121
2380
|
}
|
|
1122
2381
|
}
|
|
1123
2382
|
}
|
|
1124
2383
|
|
|
1125
|
-
|
|
1126
|
-
|
|
2384
|
+
#if defined __riscv_zvfh
|
|
2385
|
+
void ggml_gemm_q4_0_16x1_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
|
2386
|
+
const int qk = QK8_0;
|
|
1127
2387
|
const int nb = n / qk;
|
|
1128
|
-
const int ncols_interleaved =
|
|
1129
|
-
const int blocklen =
|
|
2388
|
+
const int ncols_interleaved = 16;
|
|
2389
|
+
const int blocklen = 1;
|
|
1130
2390
|
|
|
1131
2391
|
assert (n % qk == 0);
|
|
1132
2392
|
assert (nr % 4 == 0);
|
|
@@ -1142,82 +2402,45 @@ void ggml_gemm_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
|
|
|
1142
2402
|
UNUSED(ncols_interleaved);
|
|
1143
2403
|
UNUSED(blocklen);
|
|
1144
2404
|
|
|
1145
|
-
float sumf[4][
|
|
1146
|
-
float sum_minf[4][8];
|
|
1147
|
-
int sumi1, sumi2, sumi3, sumi4;
|
|
2405
|
+
float sumf[4][16];
|
|
1148
2406
|
int sumi;
|
|
1149
2407
|
|
|
1150
2408
|
for (int y = 0; y < nr / 4; y++) {
|
|
1151
|
-
const
|
|
2409
|
+
const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
|
|
1152
2410
|
for (int x = 0; x < nc / ncols_interleaved; x++) {
|
|
1153
|
-
const
|
|
2411
|
+
const block_q4_0x16 * b_ptr = (const block_q4_0x16 *) vx + (x * nb);
|
|
1154
2412
|
for (int m = 0; m < 4; m++) {
|
|
1155
|
-
for (int j = 0; j < ncols_interleaved; j++)
|
|
1156
|
-
sumf[m][j] = 0.0;
|
|
1157
|
-
sum_minf[m][j] = 0.0;
|
|
1158
|
-
}
|
|
2413
|
+
for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
|
|
1159
2414
|
}
|
|
1160
2415
|
for (int l = 0; l < nb; l++) {
|
|
1161
|
-
for (int k = 0; k < (qk / (
|
|
1162
|
-
|
|
1163
|
-
const uint8_t *scales_0 = b_ptr[l].scales + (k / 4) * 64 ;
|
|
1164
|
-
const uint8_t *scales_1 = b_ptr[l].scales + (k / 4) * 64 + 16;
|
|
1165
|
-
const uint8_t *scales_2 = b_ptr[l].scales + (k / 4) * 64 + 32;
|
|
1166
|
-
const uint8_t *scales_3 = b_ptr[l].scales + (k / 4) * 64 + 48;
|
|
2416
|
+
for (int k = 0; k < (qk / (2 * blocklen)); k++) {
|
|
1167
2417
|
for (int m = 0; m < 4; m++) {
|
|
1168
2418
|
for (int j = 0; j < ncols_interleaved; j++) {
|
|
1169
|
-
sumi1 = 0;
|
|
1170
|
-
sumi2 = 0;
|
|
1171
|
-
sumi3 = 0;
|
|
1172
|
-
sumi4 = 0;
|
|
1173
2419
|
sumi = 0;
|
|
1174
|
-
int
|
|
1175
|
-
|
|
1176
|
-
const int
|
|
1177
|
-
|
|
1178
|
-
|
|
1179
|
-
const int v3 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 6 ) & 3);
|
|
1180
|
-
sumi1 = (v0 * a_ptr[l].qs[(k >> 2) * 512 + (k % 4) * 4 * blocklen + m * blocklen + i]);
|
|
1181
|
-
sumi2 = (v1 * a_ptr[l].qs[(k >> 2) * 512 + (k % 4) * 4 * blocklen + m * blocklen + i + 128]);
|
|
1182
|
-
sumi3 = (v2 * a_ptr[l].qs[(k >> 2) * 512 + (k % 4) * 4 * blocklen + m * blocklen + i + 256]);
|
|
1183
|
-
sumi4 = (v3 * a_ptr[l].qs[(k >> 2) * 512 + (k % 4) * 4 * blocklen + m * blocklen + i + 384]);
|
|
1184
|
-
sumi1 = sumi1 * (scales_0[offset] & 0xF);
|
|
1185
|
-
sumi2 = sumi2 * (scales_1[offset] & 0xF);
|
|
1186
|
-
sumi3 = sumi3 * (scales_2[offset] & 0xF);
|
|
1187
|
-
sumi4 = sumi4 * (scales_3[offset] & 0xF);
|
|
1188
|
-
sumi += sumi1 + sumi2 + sumi3 + sumi4;
|
|
2420
|
+
for (int i = 0; i < blocklen; ++i) {
|
|
2421
|
+
const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
|
|
2422
|
+
const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
|
|
2423
|
+
sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
|
|
2424
|
+
(v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
|
|
1189
2425
|
}
|
|
1190
|
-
sumf[m][j] += sumi *
|
|
1191
|
-
}
|
|
1192
|
-
}
|
|
1193
|
-
}
|
|
1194
|
-
for(int sb = 0; sb < 8; sb++) {
|
|
1195
|
-
const uint8_t *mins = b_ptr[l].scales + sb * 16;
|
|
1196
|
-
for(int m = 0; m < 4; m++) {
|
|
1197
|
-
const int16_t *bsums = a_ptr[l].bsums + (sb * 8) + (m * 4) - ((sb % 2) * 6);
|
|
1198
|
-
for(int j = 0; j < ncols_interleaved; j++) {
|
|
1199
|
-
int mins_prod = ((mins[j * 2] >> 4) * bsums[0] + (mins[(j * 2)+ 1] >> 4) * bsums[1]);
|
|
1200
|
-
sum_minf[m][j] += (mins_prod) * GGML_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d[m];
|
|
2426
|
+
sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
|
|
1201
2427
|
}
|
|
1202
2428
|
}
|
|
1203
2429
|
}
|
|
1204
2430
|
}
|
|
1205
|
-
|
|
1206
2431
|
for (int m = 0; m < 4; m++) {
|
|
1207
|
-
for (int j = 0; j < ncols_interleaved; j++)
|
|
1208
|
-
s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j]
|
|
1209
|
-
}
|
|
2432
|
+
for (int j = 0; j < ncols_interleaved; j++)
|
|
2433
|
+
s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
|
|
1210
2434
|
}
|
|
1211
2435
|
}
|
|
1212
2436
|
}
|
|
1213
2437
|
}
|
|
1214
2438
|
|
|
1215
|
-
|
|
1216
|
-
|
|
1217
|
-
const int qk = QK8_0;
|
|
2439
|
+
void ggml_gemm_q4_K_16x1_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
|
2440
|
+
const int qk = QK_K;
|
|
1218
2441
|
const int nb = n / qk;
|
|
1219
|
-
const int ncols_interleaved =
|
|
1220
|
-
const int blocklen =
|
|
2442
|
+
const int ncols_interleaved = 16;
|
|
2443
|
+
const int blocklen = 1;
|
|
1221
2444
|
|
|
1222
2445
|
assert (n % qk == 0);
|
|
1223
2446
|
assert (nr % 4 == 0);
|
|
@@ -1233,59 +2456,97 @@ void ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs
|
|
|
1233
2456
|
UNUSED(ncols_interleaved);
|
|
1234
2457
|
UNUSED(blocklen);
|
|
1235
2458
|
|
|
1236
|
-
|
|
1237
|
-
|
|
1238
|
-
|
|
2459
|
+
float sumf[4][16];
|
|
2460
|
+
float sum_minf[4][16];
|
|
2461
|
+
uint8_t scales[128];
|
|
2462
|
+
uint8_t mins[128];
|
|
2463
|
+
int sumi1;
|
|
2464
|
+
int sumi2;
|
|
2465
|
+
int sumi;
|
|
2466
|
+
|
|
2467
|
+
for (int y = 0; y < nr / 4; y++) {
|
|
2468
|
+
const block_q8_Kx4 * a_ptr = (const block_q8_Kx4 *) vy + (y * nb);
|
|
2469
|
+
for (int x = 0; x < nc / ncols_interleaved; x++) {
|
|
2470
|
+
const block_q4_Kx16 * b_ptr = (const block_q4_Kx16 *) vx + (x * nb);
|
|
2471
|
+
for (int m = 0; m < 4; m++) {
|
|
2472
|
+
for (int j = 0; j < ncols_interleaved; j++) {
|
|
2473
|
+
sumf[m][j] = 0.0;
|
|
2474
|
+
sum_minf[m][j] = 0.0;
|
|
2475
|
+
}
|
|
2476
|
+
}
|
|
2477
|
+
for (int l = 0; l < nb; l++) {
|
|
2478
|
+
for (int i = 0; i < 128; i++) {
|
|
2479
|
+
scales[i] = b_ptr[l].scales[i] & 0x0F;
|
|
2480
|
+
mins[i] = b_ptr[l].scales[i] >> 4;
|
|
2481
|
+
}
|
|
2482
|
+
for (int i = 0; i < 64; i++) {
|
|
2483
|
+
scales[i] |= (b_ptr[l].scales[128 + i] & 0x03) << 4;
|
|
2484
|
+
mins[i] |= (b_ptr[l].scales[128 + i] & 0x0C) << 2;
|
|
2485
|
+
scales[i + 64] |= (b_ptr[l].scales[128 + i] & 0x30);
|
|
2486
|
+
mins[i + 64] |= (b_ptr[l].scales[128 + i] & 0xC0) >> 2;
|
|
2487
|
+
}
|
|
1239
2488
|
|
|
1240
|
-
|
|
1241
|
-
|
|
1242
|
-
|
|
1243
|
-
|
|
1244
|
-
|
|
1245
|
-
|
|
2489
|
+
for (int sb = 0; sb < 8; sb++) {
|
|
2490
|
+
uint8_t *min = &mins[sb * 16];
|
|
2491
|
+
for(int m = 0; m < 4; m++) {
|
|
2492
|
+
const int16_t bsums = a_ptr[l].bsums[sb * 8 + m] + a_ptr[l].bsums[sb * 8 + m + 4];
|
|
2493
|
+
for(int j = 0; j < ncols_interleaved; j++) {
|
|
2494
|
+
sum_minf[m][j] += min[j] * bsums * GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d[m];
|
|
2495
|
+
}
|
|
2496
|
+
}
|
|
1246
2497
|
}
|
|
1247
|
-
|
|
1248
|
-
|
|
2498
|
+
|
|
2499
|
+
for (int sb = 0; sb < 8; sb += 2) {
|
|
2500
|
+
uint8_t *scales_0 = &scales[sb * 16];
|
|
2501
|
+
uint8_t *scales_1 = &scales[(sb + 1) * 16];
|
|
2502
|
+
|
|
2503
|
+
for (int i = 0; i < QK4_0; i++) {
|
|
1249
2504
|
for (int m = 0; m < 4; m++) {
|
|
1250
2505
|
for (int j = 0; j < ncols_interleaved; j++) {
|
|
2506
|
+
sumi1 = 0;
|
|
2507
|
+
sumi2 = 0;
|
|
1251
2508
|
sumi = 0;
|
|
1252
|
-
|
|
1253
|
-
|
|
1254
|
-
|
|
1255
|
-
|
|
1256
|
-
|
|
1257
|
-
|
|
1258
|
-
|
|
2509
|
+
|
|
2510
|
+
const int v0 = (int8_t) (b_ptr[l].qs[sb * 256 + i * 16 + j] & 0xF);
|
|
2511
|
+
const int v1 = (int8_t) (b_ptr[l].qs[sb * 256 + i * 16 + j] >> 4);
|
|
2512
|
+
sumi1 = (v0 * a_ptr[l].qs[sb * 4 * 32 + i * 4 + m]);
|
|
2513
|
+
sumi2 = (v1 * a_ptr[l].qs[sb * 4 * 32 + 32 * 4 + i * 4 + m]);
|
|
2514
|
+
sumi1 = sumi1 * scales_0[j];
|
|
2515
|
+
sumi2 = sumi2 * scales_1[j];
|
|
2516
|
+
sumi += sumi1 + sumi2;
|
|
2517
|
+
|
|
2518
|
+
sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d[m];
|
|
1259
2519
|
}
|
|
1260
2520
|
}
|
|
1261
2521
|
}
|
|
1262
2522
|
}
|
|
1263
|
-
|
|
1264
|
-
|
|
1265
|
-
|
|
2523
|
+
}
|
|
2524
|
+
for (int m = 0; m < 4; m++) {
|
|
2525
|
+
for (int j = 0; j < ncols_interleaved; j++) {
|
|
2526
|
+
s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j] - sum_minf[m][j];
|
|
1266
2527
|
}
|
|
1267
2528
|
}
|
|
1268
2529
|
}
|
|
1269
2530
|
}
|
|
1270
2531
|
}
|
|
1271
2532
|
|
|
1272
|
-
void
|
|
2533
|
+
void ggml_gemm_iq4_nl_16x1_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
|
1273
2534
|
const int qk = QK8_0;
|
|
1274
2535
|
const int nb = n / qk;
|
|
1275
|
-
const int ncols_interleaved =
|
|
1276
|
-
const int blocklen =
|
|
2536
|
+
const int ncols_interleaved = 16;
|
|
2537
|
+
const int blocklen = 1;
|
|
1277
2538
|
|
|
1278
2539
|
assert(n % qk == 0);
|
|
1279
2540
|
assert(nr % 4 == 0);
|
|
1280
2541
|
assert(nc % ncols_interleaved == 0);
|
|
1281
2542
|
|
|
1282
|
-
float sumf[4][
|
|
2543
|
+
float sumf[4][16];
|
|
1283
2544
|
int sumi;
|
|
1284
2545
|
|
|
1285
2546
|
for (int y = 0; y < nr / 4; y++) {
|
|
1286
2547
|
const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
|
|
1287
2548
|
for (int x = 0; x < nc / ncols_interleaved; x++) {
|
|
1288
|
-
const
|
|
2549
|
+
const block_iq4_nlx16 * b_ptr = (const block_iq4_nlx16 *) vx + (x * nb);
|
|
1289
2550
|
for (int m = 0; m < 4; m++) {
|
|
1290
2551
|
for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
|
|
1291
2552
|
}
|
|
@@ -1298,7 +2559,7 @@ void ggml_gemm_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs
|
|
|
1298
2559
|
const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
|
|
1299
2560
|
const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
|
|
1300
2561
|
sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
|
|
1301
|
-
(v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4]));
|
|
2562
|
+
(v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + (qk / 2) * 4]));
|
|
1302
2563
|
}
|
|
1303
2564
|
sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
|
|
1304
2565
|
}
|
|
@@ -1313,29 +2574,23 @@ void ggml_gemm_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs
|
|
|
1313
2574
|
}
|
|
1314
2575
|
}
|
|
1315
2576
|
|
|
1316
|
-
void
|
|
1317
|
-
float * GGML_RESTRICT s,
|
|
1318
|
-
size_t bs,
|
|
1319
|
-
const void * GGML_RESTRICT vx,
|
|
1320
|
-
const void * GGML_RESTRICT vy,
|
|
1321
|
-
int nr,
|
|
1322
|
-
int nc) {
|
|
2577
|
+
void ggml_gemm_q8_0_16x1_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
|
1323
2578
|
const int qk = QK8_0;
|
|
1324
2579
|
const int nb = n / qk;
|
|
1325
|
-
const int ncols_interleaved =
|
|
1326
|
-
const int blocklen =
|
|
2580
|
+
const int ncols_interleaved = 16;
|
|
2581
|
+
const int blocklen = 1;
|
|
1327
2582
|
|
|
1328
2583
|
assert(n % qk == 0);
|
|
1329
2584
|
assert(nr % 4 == 0);
|
|
1330
2585
|
assert(nc % ncols_interleaved == 0);
|
|
1331
2586
|
|
|
1332
|
-
float sumf[4][
|
|
2587
|
+
float sumf[4][16];
|
|
1333
2588
|
int sumi;
|
|
1334
2589
|
|
|
1335
2590
|
for (int y = 0; y < nr / 4; y++) {
|
|
1336
2591
|
const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
|
|
1337
2592
|
for (int x = 0; x < nc / ncols_interleaved; x++) {
|
|
1338
|
-
const
|
|
2593
|
+
const block_q8_0x16 * b_ptr = (const block_q8_0x16 *) vx + (x * nb);
|
|
1339
2594
|
for (int m = 0; m < 4; m++) {
|
|
1340
2595
|
for (int j = 0; j < ncols_interleaved; j++) {
|
|
1341
2596
|
sumf[m][j] = 0.0;
|
|
@@ -1365,57 +2620,102 @@ void ggml_gemm_q8_0_4x4_q8_0_generic(int n,
|
|
|
1365
2620
|
}
|
|
1366
2621
|
}
|
|
1367
2622
|
|
|
1368
|
-
void ggml_gemm_q8_0_4x8_q8_0_generic(int n,
|
|
1369
|
-
float * GGML_RESTRICT s,
|
|
1370
|
-
size_t bs,
|
|
1371
|
-
const void * GGML_RESTRICT vx,
|
|
1372
|
-
const void * GGML_RESTRICT vy,
|
|
1373
|
-
int nr,
|
|
1374
|
-
int nc) {
|
|
1375
|
-
const int qk = QK8_0;
|
|
1376
|
-
const int nb = n / qk;
|
|
1377
|
-
const int ncols_interleaved = 4;
|
|
1378
|
-
const int blocklen = 8;
|
|
1379
2623
|
|
|
1380
|
-
|
|
2624
|
+
void ggml_gemm_q2_K_16x1_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
|
2625
|
+
assert(n % QK_K == 0);
|
|
1381
2626
|
assert(nr % 4 == 0);
|
|
1382
|
-
assert(nc %
|
|
2627
|
+
assert(nc % 16 == 0);
|
|
2628
|
+
const int nb = n / QK_K;
|
|
2629
|
+
const block_q2_Kx16 * x = (const block_q2_Kx16 *)vx;
|
|
2630
|
+
const block_q8_Kx4 * y = (const block_q8_Kx4 *)vy;
|
|
2631
|
+
|
|
2632
|
+
const int sb_perm[16] = {
|
|
2633
|
+
0, 4, 1, 5, 2, 6, 3, 7,
|
|
2634
|
+
8, 12, 9, 13, 10, 14, 11, 15
|
|
2635
|
+
};
|
|
1383
2636
|
|
|
1384
|
-
|
|
1385
|
-
int
|
|
2637
|
+
// Iterate Rows in tiles of 4
|
|
2638
|
+
for (int row_tile = 0; row_tile < nr; row_tile += 4) {
|
|
2639
|
+
// Iterate Columns in tiles of 16
|
|
2640
|
+
for (int col_tile = 0; col_tile < nc; col_tile += 16) {
|
|
2641
|
+
|
|
2642
|
+
const block_q2_Kx16 * x_ptr = x + (col_tile / 16) * nb;
|
|
2643
|
+
const block_q8_Kx4 * y_ptr = y + (row_tile / 4) * nb;
|
|
2644
|
+
|
|
2645
|
+
float sumf[4][16];
|
|
2646
|
+
memset(sumf, 0, sizeof(sumf));
|
|
2647
|
+
|
|
2648
|
+
for (int k_block = 0; k_block < nb; ++k_block) {
|
|
2649
|
+
int32_t isum[4][16];
|
|
2650
|
+
int32_t summs[4][16];
|
|
2651
|
+
memset(isum, 0, sizeof(isum));
|
|
2652
|
+
memset(summs, 0, sizeof(summs));
|
|
2653
|
+
|
|
2654
|
+
const uint8_t * qs_rhs = x_ptr[k_block].qs;
|
|
2655
|
+
const uint8_t * sc_rhs = x_ptr[k_block].scales;
|
|
2656
|
+
const int8_t * qs_lhs = y_ptr[k_block].qs;
|
|
2657
|
+
const int16_t * bs_lhs = y_ptr[k_block].bsums;
|
|
2658
|
+
|
|
2659
|
+
for (int sb = 0; sb < 16; ++sb) {
|
|
2660
|
+
int scale_offset = sb_perm[sb] * 16;
|
|
2661
|
+
|
|
2662
|
+
int byte_base;
|
|
2663
|
+
if (sb < 8) byte_base = (sb % 2 == 0) ? 0 : 16;
|
|
2664
|
+
else byte_base = (sb % 2 == 0) ? 32 : 48;
|
|
2665
|
+
int shift = ((sb / 2) % 4) * 2;
|
|
2666
|
+
|
|
2667
|
+
for (int col = 0; col < 16; ++col) {
|
|
2668
|
+
uint8_t sc_val = sc_rhs[scale_offset + col];
|
|
2669
|
+
int32_t d_sb = sc_val & 0xF;
|
|
2670
|
+
int32_t m_sb = sc_val >> 4;
|
|
2671
|
+
|
|
2672
|
+
// Correction Term
|
|
2673
|
+
for (int r = 0; r < 4; ++r) {
|
|
2674
|
+
int bsum_idx = (sb / 4) * 16 + r * 4 + (sb % 4);
|
|
2675
|
+
summs[r][col] += bs_lhs[bsum_idx] * m_sb;
|
|
2676
|
+
}
|
|
1386
2677
|
|
|
1387
|
-
|
|
1388
|
-
|
|
1389
|
-
|
|
1390
|
-
|
|
1391
|
-
|
|
1392
|
-
|
|
1393
|
-
|
|
1394
|
-
|
|
1395
|
-
|
|
1396
|
-
|
|
1397
|
-
|
|
1398
|
-
|
|
1399
|
-
|
|
1400
|
-
sumi = 0;
|
|
1401
|
-
for (int i = 0; i < blocklen; ++i) {
|
|
1402
|
-
const int v0 = b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i];
|
|
1403
|
-
sumi += v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i];
|
|
2678
|
+
// Main Dot Product
|
|
2679
|
+
for (int l = 0; l < 16; ++l) {
|
|
2680
|
+
int qs_idx = (byte_base + l) * 16 + col;
|
|
2681
|
+
uint8_t q2_val = (qs_rhs[qs_idx] >> shift) & 3;
|
|
2682
|
+
|
|
2683
|
+
// Calculate Q8 index for this specific k and row
|
|
2684
|
+
int k = sb * 16 + l;
|
|
2685
|
+
int q8_idx = (k / 4) * 16 + (k % 4);
|
|
2686
|
+
|
|
2687
|
+
for (int r = 0; r < 4; ++r) {
|
|
2688
|
+
// Add r*4 to jump to the correct row within the 4x4 chunk
|
|
2689
|
+
int8_t q8_val = qs_lhs[q8_idx + r * 4];
|
|
2690
|
+
isum[r][col] += q8_val * q2_val * d_sb;
|
|
1404
2691
|
}
|
|
1405
|
-
sumf[m][j] +=
|
|
1406
|
-
sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
|
|
1407
2692
|
}
|
|
1408
2693
|
}
|
|
1409
2694
|
}
|
|
2695
|
+
|
|
2696
|
+
// Finalize K-Block
|
|
2697
|
+
for (int col = 0; col < 16; ++col) {
|
|
2698
|
+
float d_rhs = GGML_FP16_TO_FP32(x_ptr[k_block].d[col]);
|
|
2699
|
+
float dm_rhs = GGML_FP16_TO_FP32(x_ptr[k_block].dmin[col]);
|
|
2700
|
+
|
|
2701
|
+
for (int r = 0; r < 4; ++r) {
|
|
2702
|
+
float d_lhs = y_ptr[k_block].d[r];
|
|
2703
|
+
float d_all = d_lhs * d_rhs;
|
|
2704
|
+
float d_min = d_lhs * dm_rhs;
|
|
2705
|
+
sumf[r][col] += (isum[r][col] * d_all) - (summs[r][col] * d_min);
|
|
2706
|
+
}
|
|
2707
|
+
}
|
|
1410
2708
|
}
|
|
1411
|
-
|
|
1412
|
-
|
|
1413
|
-
|
|
2709
|
+
|
|
2710
|
+
for (int r = 0; r < 4; ++r) {
|
|
2711
|
+
for (int col = 0; col < 16; ++col) {
|
|
2712
|
+
s[(row_tile + r) * bs + (col_tile + col)] = sumf[r][col];
|
|
1414
2713
|
}
|
|
1415
2714
|
}
|
|
1416
2715
|
}
|
|
1417
2716
|
}
|
|
1418
2717
|
}
|
|
2718
|
+
#endif
|
|
1419
2719
|
|
|
1420
2720
|
} // extern "C"
|
|
1421
2721
|
|
|
@@ -1498,16 +2798,212 @@ static block_q4_0x8 make_block_q4_0x8(block_q4_0 * in, unsigned int blck_size_in
|
|
|
1498
2798
|
|
|
1499
2799
|
uint64_t elems;
|
|
1500
2800
|
memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t));
|
|
1501
|
-
elems ^= xor_mask;
|
|
2801
|
+
elems ^= xor_mask;
|
|
2802
|
+
memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t));
|
|
2803
|
+
}
|
|
2804
|
+
|
|
2805
|
+
return out;
|
|
2806
|
+
}
|
|
2807
|
+
|
|
2808
|
+
static block_q4_0x16 make_block_q4_0x16(block_q4_0 * in, unsigned int blck_size_interleave) {
|
|
2809
|
+
block_q4_0x16 out;
|
|
2810
|
+
|
|
2811
|
+
for (int i = 0; i < 16; i++) {
|
|
2812
|
+
out.d[i] = in[i].d;
|
|
2813
|
+
}
|
|
2814
|
+
|
|
2815
|
+
const int end = QK4_0 * 8 / blck_size_interleave;
|
|
2816
|
+
|
|
2817
|
+
if (blck_size_interleave == 1) {
|
|
2818
|
+
const uint8_t xor_mask = 0x88;
|
|
2819
|
+
for (int i = 0; i < end; ++i) {
|
|
2820
|
+
int src_id = i % 16;
|
|
2821
|
+
int src_offset = i / 16;
|
|
2822
|
+
int dst_offset = i;
|
|
2823
|
+
|
|
2824
|
+
out.qs[dst_offset] = in[src_id].qs[src_offset] ^ xor_mask;
|
|
2825
|
+
}
|
|
2826
|
+
} else {
|
|
2827
|
+
GGML_ASSERT(false);
|
|
2828
|
+
}
|
|
2829
|
+
|
|
2830
|
+
return out;
|
|
2831
|
+
}
|
|
2832
|
+
|
|
2833
|
+
static block_q4_Kx8 make_block_q4_Kx8(block_q4_K * in, unsigned int blck_size_interleave) {
|
|
2834
|
+
block_q4_Kx8 out;
|
|
2835
|
+
//Delta(scale) and dmin values of the eight Q4_K structures are copied onto the output interleaved structure
|
|
2836
|
+
for (int i = 0; i < 8; i++) {
|
|
2837
|
+
out.d[i] = in[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.d;
|
|
2838
|
+
}
|
|
2839
|
+
|
|
2840
|
+
for (int i = 0; i < 8; i++) {
|
|
2841
|
+
out.dmin[i] = in[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.dmin;
|
|
2842
|
+
}
|
|
2843
|
+
|
|
2844
|
+
const int end = QK_K * 4 / blck_size_interleave;
|
|
2845
|
+
|
|
2846
|
+
// Interleave Q4_K quants by taking 8 bytes at a time
|
|
2847
|
+
for (int i = 0; i < end; ++i) {
|
|
2848
|
+
int src_id = i % 8;
|
|
2849
|
+
int src_offset = (i / 8) * blck_size_interleave;
|
|
2850
|
+
int dst_offset = i * blck_size_interleave;
|
|
2851
|
+
|
|
2852
|
+
// buffer large enough for the max interleave block size (8 bytes)
|
|
2853
|
+
uint64_t elems;
|
|
2854
|
+
memcpy(&elems, &in[src_id].qs[src_offset], blck_size_interleave);
|
|
2855
|
+
memcpy(&out.qs[dst_offset], &elems, blck_size_interleave);
|
|
2856
|
+
}
|
|
2857
|
+
|
|
2858
|
+
// The below logic is designed so as to unpack and rearrange scales and mins values in Q4_K
|
|
2859
|
+
// Currently the Q4_K structure has 8 scales and 8 mins packed in 12 bytes ( 6 bits for each value)
|
|
2860
|
+
// The output Q4_Kx8 structure has 96 bytes
|
|
2861
|
+
// Every 12 byte is packed such that it contains scales and mins for corresponding sub blocks from Q4_K structure
|
|
2862
|
+
// For eg - First 12 bytes contains 8 scales and 8 mins - each of first sub block from different Q4_K structures
|
|
2863
|
+
uint8_t s[8], m[8];
|
|
2864
|
+
|
|
2865
|
+
for (int i = 0; i < 4; i++) {
|
|
2866
|
+
for (int j = 0; j < 8; j++) {
|
|
2867
|
+
s[j] = in[j].scales[i] & 63;
|
|
2868
|
+
m[j] = in[j].scales[i + 4] & 63;
|
|
2869
|
+
}
|
|
2870
|
+
|
|
2871
|
+
out.scales[i * 12] = (s[0] & 63) + ((s[4] & 48) << 2);
|
|
2872
|
+
out.scales[i * 12 + 1] = (s[1] & 63) + ((s[5] & 48) << 2);
|
|
2873
|
+
out.scales[i * 12 + 2] = (s[2] & 63) + ((s[6] & 48) << 2);
|
|
2874
|
+
out.scales[i * 12 + 3] = (s[3] & 63) + ((s[7] & 48) << 2);
|
|
2875
|
+
out.scales[i * 12 + 4] = (m[0] & 63) + ((m[4] & 48) << 2);
|
|
2876
|
+
out.scales[i * 12 + 5] = (m[1] & 63) + ((m[5] & 48) << 2);
|
|
2877
|
+
out.scales[i * 12 + 6] = (m[2] & 63) + ((m[6] & 48) << 2);
|
|
2878
|
+
out.scales[i * 12 + 7] = (m[3] & 63) + ((m[7] & 48) << 2);
|
|
2879
|
+
out.scales[i * 12 + 8] = (s[4] & 15) + ((m[4] & 15) << 4);
|
|
2880
|
+
out.scales[i * 12 + 9] = (s[5] & 15) + ((m[5] & 15) << 4);
|
|
2881
|
+
out.scales[i * 12 + 10] = (s[6] & 15) + ((m[6] & 15) << 4);
|
|
2882
|
+
out.scales[i * 12 + 11] = (s[7] & 15) + ((m[7] & 15) << 4);
|
|
2883
|
+
|
|
2884
|
+
}
|
|
2885
|
+
|
|
2886
|
+
for (int i = 0; i < 4; i++) {
|
|
2887
|
+
for (int j = 0; j < 8; j++) {
|
|
2888
|
+
s[j] = ((in[j].scales[i] & 192) >> 2) | (in[j].scales[i+8] & 15);
|
|
2889
|
+
m[j] = ((in[j].scales[i + 4] & 192) >> 2) | ((in[j].scales[i+8] & 240) >> 4);
|
|
2890
|
+
}
|
|
2891
|
+
|
|
2892
|
+
out.scales[i * 12 + 48] = (s[0] & 63) + ((s[4] & 48) << 2);
|
|
2893
|
+
out.scales[i * 12 + 49] = (s[1] & 63) + ((s[5] & 48) << 2);
|
|
2894
|
+
out.scales[i * 12 + 50] = (s[2] & 63) + ((s[6] & 48) << 2);
|
|
2895
|
+
out.scales[i * 12 + 51] = (s[3] & 63) + ((s[7] & 48) << 2);
|
|
2896
|
+
out.scales[i * 12 + 52] = (m[0] & 63) + ((m[4] & 48) << 2);
|
|
2897
|
+
out.scales[i * 12 + 53] = (m[1] & 63) + ((m[5] & 48) << 2);
|
|
2898
|
+
out.scales[i * 12 + 54] = (m[2] & 63) + ((m[6] & 48) << 2);
|
|
2899
|
+
out.scales[i * 12 + 55] = (m[3] & 63) + ((m[7] & 48) << 2);
|
|
2900
|
+
out.scales[i * 12 + 56] = (s[4] & 15) + ((m[4] & 15) << 4);
|
|
2901
|
+
out.scales[i * 12 + 57] = (s[5] & 15) + ((m[5] & 15) << 4);
|
|
2902
|
+
out.scales[i * 12 + 58] = (s[6] & 15) + ((m[6] & 15) << 4);
|
|
2903
|
+
out.scales[i * 12 + 59] = (s[7] & 15) + ((m[7] & 15) << 4);
|
|
2904
|
+
|
|
2905
|
+
}
|
|
2906
|
+
|
|
2907
|
+
return out;
|
|
2908
|
+
}
|
|
2909
|
+
|
|
2910
|
+
static block_q4_Kx16 make_block_q4_Kx16(block_q4_K * in, unsigned int blck_size_interleave) {
|
|
2911
|
+
block_q4_Kx16 out;
|
|
2912
|
+
//Delta(scale) and dmin values of the 16 Q4_K structures are copied onto the output interleaved structure
|
|
2913
|
+
for (int i = 0; i < 16; i++) {
|
|
2914
|
+
out.d[i] = in[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.d;
|
|
2915
|
+
}
|
|
2916
|
+
|
|
2917
|
+
for (int i = 0; i < 16; i++) {
|
|
2918
|
+
out.dmin[i] = in[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.dmin;
|
|
2919
|
+
}
|
|
2920
|
+
|
|
2921
|
+
const int end = QK_K * 8 / blck_size_interleave;
|
|
2922
|
+
|
|
2923
|
+
if (blck_size_interleave == 1) {
|
|
2924
|
+
for (int i = 0; i < end; ++i) {
|
|
2925
|
+
int src_id = i % 16;
|
|
2926
|
+
int src_offset = i / 16;
|
|
2927
|
+
int dst_offset = i;
|
|
2928
|
+
|
|
2929
|
+
out.qs[dst_offset] = in[src_id].qs[src_offset];
|
|
2930
|
+
}
|
|
2931
|
+
|
|
2932
|
+
// RVV repacking.
|
|
2933
|
+
//
|
|
2934
|
+
// Extract sums and mins for all 8 sub-blocks for each block of Q4_K.
|
|
2935
|
+
uint8_t s[128], m[128];
|
|
2936
|
+
for (int i = 0; i < 4; i++) {
|
|
2937
|
+
for (int j = 0; j < 16; j++) {
|
|
2938
|
+
s[i * 16 + j] = in[j].scales[i] & 63;
|
|
2939
|
+
m[i * 16 + j] = in[j].scales[i + 4] & 63;
|
|
2940
|
+
}
|
|
2941
|
+
}
|
|
2942
|
+
for (int i = 0; i < 4; i++) {
|
|
2943
|
+
for (int j = 0; j < 16; j++) {
|
|
2944
|
+
s[64 + i * 16 + j] = ((in[j].scales[i] & 192) >> 2) | (in[j].scales[i+8] & 15);
|
|
2945
|
+
m[64 + i * 16 + j] = ((in[j].scales[i + 4] & 192) >> 2) | ((in[j].scales[i+8] & 240) >> 4);
|
|
2946
|
+
}
|
|
2947
|
+
}
|
|
2948
|
+
|
|
2949
|
+
for (int i = 0; i < 128; i++) {
|
|
2950
|
+
out.scales[i] = (s[i] & 15) | ((m[i] & 15) << 4);
|
|
2951
|
+
}
|
|
2952
|
+
for (int i = 0; i < 64; i++) {
|
|
2953
|
+
out.scales[128 + i] = ((s[i] & 48) >> 4) | ((m[i] & 48) >> 2) | (s[64 + i] & 48) | ((m[64 + i] & 48) << 2);
|
|
2954
|
+
}
|
|
2955
|
+
} else {
|
|
2956
|
+
GGML_ASSERT(false);
|
|
2957
|
+
}
|
|
2958
|
+
|
|
2959
|
+
return out;
|
|
2960
|
+
}
|
|
2961
|
+
|
|
2962
|
+
static block_q2_Kx8 make_block_q2_Kx8(block_q2_K * in, unsigned int blck_size_interleave) {
|
|
2963
|
+
block_q2_Kx8 out;
|
|
2964
|
+
|
|
2965
|
+
// Delta(scale) and dmin values of the eight Q2_K structures are copied onto the output interleaved structure
|
|
2966
|
+
for (int i = 0; i < 8; i++) {
|
|
2967
|
+
out.d[i] = in[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.d;
|
|
2968
|
+
}
|
|
2969
|
+
|
|
2970
|
+
for (int i = 0; i < 8; i++) {
|
|
2971
|
+
out.dmin[i] = in[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.dmin;
|
|
2972
|
+
}
|
|
2973
|
+
|
|
2974
|
+
const int end = QK_K * 2 / blck_size_interleave;
|
|
2975
|
+
|
|
2976
|
+
// Interleave Q2_K quants by taking 8 bytes at a time
|
|
2977
|
+
for (int i = 0; i < end; ++i) {
|
|
2978
|
+
int src_id = i % 8;
|
|
2979
|
+
int src_offset = (i / 8) * blck_size_interleave;
|
|
2980
|
+
int dst_offset = i * blck_size_interleave;
|
|
2981
|
+
|
|
2982
|
+
uint64_t elems;
|
|
2983
|
+
memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t));
|
|
1502
2984
|
memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t));
|
|
1503
2985
|
}
|
|
1504
2986
|
|
|
2987
|
+
// The below logic is designed so as to unpack and rearrange scales and mins values in Q2_K
|
|
2988
|
+
// Currently the Q2_K structure has 16 scales and 16 mins packed in 16 bytes ( 4 bits for each value)
|
|
2989
|
+
// The output Q2_Kx8 structure has 128 bytes for storing scales and mins
|
|
2990
|
+
// Every 16 byte is packed such that it contains scales and mins for corresponding sub blocks from Q2_K structure
|
|
2991
|
+
// For eg - First 16 bytes contains 16 scales and 16 mins - each of first and second sub blocks from different Q2_K structures
|
|
2992
|
+
|
|
2993
|
+
for (int i = 0; i < 128; i++) {
|
|
2994
|
+
// Index for selecting which q2k super block
|
|
2995
|
+
int src1 = (i % 16) / 2;
|
|
2996
|
+
// Index for selecting scale
|
|
2997
|
+
int src2 = ((i / 16) * 2) + (i % 2);
|
|
2998
|
+
|
|
2999
|
+
out.scales[i] = in[src1].scales[src2];
|
|
3000
|
+
}
|
|
1505
3001
|
return out;
|
|
1506
3002
|
}
|
|
1507
3003
|
|
|
1508
|
-
static
|
|
1509
|
-
|
|
1510
|
-
//Delta(scale) and dmin values of the eight
|
|
3004
|
+
static block_q5_Kx8 make_block_q5_Kx8(block_q5_K * in, unsigned int blck_size_interleave) {
|
|
3005
|
+
block_q5_Kx8 out;
|
|
3006
|
+
//Delta(scale) and dmin values of the eight Q5_K structures are copied onto the output interleaved structure
|
|
1511
3007
|
for (int i = 0; i < 8; i++) {
|
|
1512
3008
|
out.d[i] = in[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.d;
|
|
1513
3009
|
}
|
|
@@ -1518,22 +3014,33 @@ static block_q4_Kx8 make_block_q4_Kx8(block_q4_K * in, unsigned int blck_size_in
|
|
|
1518
3014
|
|
|
1519
3015
|
const int end = QK_K * 4 / blck_size_interleave;
|
|
1520
3016
|
|
|
1521
|
-
// Interleave
|
|
3017
|
+
// Interleave Q5_K quants by taking blck_size_interleave bytes at a time
|
|
1522
3018
|
for (int i = 0; i < end; ++i) {
|
|
1523
|
-
int src_id
|
|
3019
|
+
int src_id = i % 8;
|
|
1524
3020
|
int src_offset = (i / 8) * blck_size_interleave;
|
|
1525
3021
|
int dst_offset = i * blck_size_interleave;
|
|
1526
3022
|
|
|
1527
|
-
|
|
1528
|
-
memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t));
|
|
1529
|
-
memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t));
|
|
3023
|
+
memcpy(&out.qs[dst_offset], &in[src_id].qs[src_offset], blck_size_interleave);
|
|
1530
3024
|
}
|
|
1531
3025
|
|
|
1532
|
-
//
|
|
1533
|
-
//
|
|
1534
|
-
//
|
|
1535
|
-
//
|
|
1536
|
-
|
|
3026
|
+
// Repeat for high bits with the same chunk size, since
|
|
3027
|
+
// the high bits are interleaved in Q5_K and the index is
|
|
3028
|
+
// qh_idx = (qs_idx % 32);
|
|
3029
|
+
// qh_val = qh[qh_idx] >> (qs_idx / 32);
|
|
3030
|
+
for (int i = 0; i < end / 4; ++i) {
|
|
3031
|
+
int src_id = i % 8;
|
|
3032
|
+
int src_offset = (i / 8) * blck_size_interleave;
|
|
3033
|
+
int dst_offset = i * blck_size_interleave;
|
|
3034
|
+
|
|
3035
|
+
memcpy(&out.qh[dst_offset], &in[src_id].qh[src_offset], blck_size_interleave);
|
|
3036
|
+
}
|
|
3037
|
+
|
|
3038
|
+
// The below logic is copied over from Q4_K
|
|
3039
|
+
// The point is to unpack all the scales and mins for each sub block every time we load 12 bytes.
|
|
3040
|
+
// Currently the Q5_K structure has 8 scales and 8 mins packed in 12 bytes ( 6 bits for each value)
|
|
3041
|
+
// The output Q5_Kx8 structure has 96 bytes
|
|
3042
|
+
// Every 12 byte is packed such that it contains scales and mins for corresponding sub blocks from Q5_K structure
|
|
3043
|
+
// For eg - First 12 bytes contains 8 scales and 8 mins - each of first sub block from different Q5_K structures
|
|
1537
3044
|
uint8_t s[8], m[8];
|
|
1538
3045
|
|
|
1539
3046
|
for (int i = 0; i < 4; i++) {
|
|
@@ -1554,13 +3061,12 @@ static block_q4_Kx8 make_block_q4_Kx8(block_q4_K * in, unsigned int blck_size_in
|
|
|
1554
3061
|
out.scales[i * 12 + 9] = (s[5] & 15) + ((m[5] & 15) << 4);
|
|
1555
3062
|
out.scales[i * 12 + 10] = (s[6] & 15) + ((m[6] & 15) << 4);
|
|
1556
3063
|
out.scales[i * 12 + 11] = (s[7] & 15) + ((m[7] & 15) << 4);
|
|
1557
|
-
|
|
1558
3064
|
}
|
|
1559
3065
|
|
|
1560
3066
|
for (int i = 0; i < 4; i++) {
|
|
1561
3067
|
for (int j = 0; j < 8; j++) {
|
|
1562
|
-
s[j] = ((in[j].scales[i] & 192) >> 2) | (in[j].scales[i+8] & 15);
|
|
1563
|
-
m[j] = ((in[j].scales[i + 4] & 192) >> 2) | ((in[j].scales[i+8] & 240) >> 4);
|
|
3068
|
+
s[j] = ((in[j].scales[i] & 192) >> 2) | (in[j].scales[i + 8] & 15);
|
|
3069
|
+
m[j] = ((in[j].scales[i + 4] & 192) >> 2) | ((in[j].scales[i + 8] & 240) >> 4);
|
|
1564
3070
|
}
|
|
1565
3071
|
|
|
1566
3072
|
out.scales[i * 12 + 48] = (s[0] & 63) + ((s[4] & 48) << 2);
|
|
@@ -1575,54 +3081,117 @@ static block_q4_Kx8 make_block_q4_Kx8(block_q4_K * in, unsigned int blck_size_in
|
|
|
1575
3081
|
out.scales[i * 12 + 57] = (s[5] & 15) + ((m[5] & 15) << 4);
|
|
1576
3082
|
out.scales[i * 12 + 58] = (s[6] & 15) + ((m[6] & 15) << 4);
|
|
1577
3083
|
out.scales[i * 12 + 59] = (s[7] & 15) + ((m[7] & 15) << 4);
|
|
1578
|
-
|
|
1579
3084
|
}
|
|
1580
3085
|
|
|
1581
3086
|
return out;
|
|
1582
3087
|
}
|
|
1583
3088
|
|
|
1584
|
-
static
|
|
1585
|
-
|
|
3089
|
+
static block_q6_Kx8 make_block_q6_Kx8(block_q6_K * in, unsigned int blck_size_interleave) {
|
|
3090
|
+
block_q6_Kx8 out;
|
|
3091
|
+
constexpr int n_blocks = 8; // Kx8
|
|
3092
|
+
for (int i = 0; i < n_blocks; i++) {
|
|
3093
|
+
out.d[i] = in[i].d;
|
|
3094
|
+
}
|
|
1586
3095
|
|
|
1587
|
-
|
|
1588
|
-
|
|
1589
|
-
|
|
3096
|
+
const int end_ls = QK_K * 4 / blck_size_interleave;
|
|
3097
|
+
// Interleave Q6_K quants by taking blck_size_interleave bytes at a time
|
|
3098
|
+
for (int i = 0; i < end_ls; ++i) {
|
|
3099
|
+
int src_id = i % n_blocks;
|
|
3100
|
+
int src_offset = (i / n_blocks) * blck_size_interleave;
|
|
3101
|
+
int dst_offset = i * blck_size_interleave;
|
|
3102
|
+
|
|
3103
|
+
uint64_t elem_ls;
|
|
3104
|
+
memcpy(&elem_ls, &in[src_id].ql[src_offset], blck_size_interleave);
|
|
3105
|
+
memcpy(&out.ql[dst_offset], &elem_ls, blck_size_interleave);
|
|
1590
3106
|
}
|
|
1591
3107
|
|
|
1592
|
-
|
|
3108
|
+
// Interleave high bits using same chunk size as low bits
|
|
3109
|
+
const int end_hs = end_ls / 2;
|
|
3110
|
+
for (int i = 0; i < end_hs; ++i) {
|
|
3111
|
+
int src_id = i % n_blocks;
|
|
3112
|
+
int src_offset = (i / n_blocks) * blck_size_interleave;
|
|
3113
|
+
int dst_offset = i * blck_size_interleave;
|
|
3114
|
+
|
|
3115
|
+
uint64_t elem_hs;
|
|
3116
|
+
memcpy(&elem_hs, &in[src_id].qh[src_offset], blck_size_interleave);
|
|
3117
|
+
memcpy(&out.qh[dst_offset], &elem_hs, blck_size_interleave);
|
|
3118
|
+
}
|
|
3119
|
+
|
|
3120
|
+
// The below logic is designed so as to unpack and rearrange scales in Q6_K
|
|
3121
|
+
// The output Q6_Kx8 structure interleaves the 8 bit scales in the same fashion as the quants
|
|
3122
|
+
// Q6_K structure has an 8-bit scale per 16 elements -> 16 scales
|
|
3123
|
+
// scales: [0 bl0 0 bl1 ... 0 bl7][1 bl0 ... 1 bl7] ... [15 bl0 ... 15 bl7] (bl = block)
|
|
3124
|
+
constexpr int n_scales = QK_K / 16;
|
|
3125
|
+
|
|
3126
|
+
for (int i = 0; i < n_blocks; i++) {
|
|
3127
|
+
for (int j = 0; j < n_scales; j++) {
|
|
3128
|
+
out.scales[j * n_blocks + i] = in[i].scales[j];
|
|
3129
|
+
}
|
|
3130
|
+
}
|
|
3131
|
+
|
|
3132
|
+
return out;
|
|
3133
|
+
}
|
|
3134
|
+
|
|
3135
|
+
static block_q2_Kx16 make_block_q2_Kx16(const block_q2_K * in, unsigned int blck_size_interleave) {
|
|
3136
|
+
block_q2_Kx16 out;
|
|
3137
|
+
constexpr int N_COLS = 16;
|
|
3138
|
+
|
|
3139
|
+
// 1. Copy Super-Scales (d) and Super-Mins (dmin)
|
|
3140
|
+
for (int i = 0; i < N_COLS; i++) {
|
|
3141
|
+
out.d[i] = in[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.d;
|
|
1593
3142
|
out.dmin[i] = in[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.dmin;
|
|
1594
3143
|
}
|
|
1595
3144
|
|
|
1596
|
-
|
|
3145
|
+
// 2. Interleave Q2_K Data
|
|
3146
|
+
const int bytes_per_col = 64;
|
|
3147
|
+
const int total_bytes = N_COLS * bytes_per_col;
|
|
3148
|
+
const int end = total_bytes / blck_size_interleave;
|
|
1597
3149
|
|
|
1598
|
-
// Interleave Q2_K quants by taking 8 bytes at a time
|
|
1599
3150
|
for (int i = 0; i < end; ++i) {
|
|
1600
|
-
int
|
|
1601
|
-
int src_offset = (i /
|
|
3151
|
+
int src_col_id = i % N_COLS;
|
|
3152
|
+
int src_offset = (i / N_COLS) * blck_size_interleave;
|
|
1602
3153
|
int dst_offset = i * blck_size_interleave;
|
|
1603
|
-
|
|
1604
|
-
uint64_t elems;
|
|
1605
|
-
memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t));
|
|
1606
|
-
memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t));
|
|
3154
|
+
memcpy(&out.qs[dst_offset], &in[src_col_id].qs[src_offset], blck_size_interleave);
|
|
1607
3155
|
}
|
|
1608
3156
|
|
|
1609
|
-
//
|
|
1610
|
-
|
|
1611
|
-
// The output Q2_Kx8 structure has 128 bytes for storing scales and mins
|
|
1612
|
-
// Every 16 byte is packed such that it contains scales and mins for corresponding sub blocks from Q2_K structure
|
|
1613
|
-
// For eg - First 16 bytes contains 16 scales and 16 mins - each of first and second sub blocks from different Q2_K structures
|
|
3157
|
+
// 3. Repack Scales into the Optimized "Sequential-Parallel" Layout
|
|
3158
|
+
int out_idx = 0;
|
|
1614
3159
|
|
|
1615
|
-
|
|
3160
|
+
// Arrays define the sub-block order for each group
|
|
3161
|
+
const int even_low_sbs[] = {0, 2, 4, 6};
|
|
3162
|
+
const int odd_low_sbs[] = {1, 3, 5, 7};
|
|
3163
|
+
const int even_high_sbs[] = {8, 10, 12, 14};
|
|
3164
|
+
const int odd_high_sbs[] = {9, 11, 13, 15};
|
|
1616
3165
|
|
|
1617
|
-
|
|
1618
|
-
|
|
1619
|
-
|
|
1620
|
-
|
|
3166
|
+
// Pack Group 1: Even-Low
|
|
3167
|
+
for (int sb : even_low_sbs) {
|
|
3168
|
+
for (int col = 0; col < N_COLS; col++) {
|
|
3169
|
+
out.scales[out_idx++] = in[col].scales[sb];
|
|
3170
|
+
}
|
|
3171
|
+
}
|
|
1621
3172
|
|
|
1622
|
-
|
|
3173
|
+
// Pack Group 2: Odd-Low
|
|
3174
|
+
for (int sb : odd_low_sbs) {
|
|
3175
|
+
for (int col = 0; col < N_COLS; col++) {
|
|
3176
|
+
out.scales[out_idx++] = in[col].scales[sb];
|
|
3177
|
+
}
|
|
3178
|
+
}
|
|
3179
|
+
|
|
3180
|
+
// Pack Group 3: Even-High
|
|
3181
|
+
for (int sb : even_high_sbs) {
|
|
3182
|
+
for (int col = 0; col < N_COLS; col++) {
|
|
3183
|
+
out.scales[out_idx++] = in[col].scales[sb];
|
|
3184
|
+
}
|
|
1623
3185
|
}
|
|
1624
|
-
return out;
|
|
1625
3186
|
|
|
3187
|
+
// Pack Group 4: Odd-High
|
|
3188
|
+
for (int sb : odd_high_sbs) {
|
|
3189
|
+
for (int col = 0; col < N_COLS; col++) {
|
|
3190
|
+
out.scales[out_idx++] = in[col].scales[sb];
|
|
3191
|
+
}
|
|
3192
|
+
}
|
|
3193
|
+
|
|
3194
|
+
return out;
|
|
1626
3195
|
}
|
|
1627
3196
|
|
|
1628
3197
|
static int repack_q4_0_to_q4_0_4_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
|
|
@@ -1687,6 +3256,36 @@ static int repack_q4_K_to_q4_K_8_bl(struct ggml_tensor * t, int interleave_block
|
|
|
1687
3256
|
GGML_UNUSED(data_size);
|
|
1688
3257
|
}
|
|
1689
3258
|
|
|
3259
|
+
static int repack_q4_K_to_q4_K_16_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
|
|
3260
|
+
GGML_ASSERT(t->type == GGML_TYPE_Q4_K);
|
|
3261
|
+
constexpr int nrows_interleaved = 16;
|
|
3262
|
+
|
|
3263
|
+
block_q4_Kx16 * dst = (block_q4_Kx16*)t->data;
|
|
3264
|
+
const block_q4_K * src = (const block_q4_K*) data;
|
|
3265
|
+
block_q4_K dst_tmp[16];
|
|
3266
|
+
int nrow = ggml_nrows(t);
|
|
3267
|
+
int nblocks = t->ne[0] / QK_K;
|
|
3268
|
+
|
|
3269
|
+
GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q4_K));
|
|
3270
|
+
|
|
3271
|
+
if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
|
|
3272
|
+
return -1;
|
|
3273
|
+
}
|
|
3274
|
+
|
|
3275
|
+
for (int b = 0; b < nrow; b += nrows_interleaved) {
|
|
3276
|
+
for (int64_t x = 0; x < nblocks; x++) {
|
|
3277
|
+
for (int i = 0; i < nrows_interleaved; i++ ) {
|
|
3278
|
+
dst_tmp[i] = src[x + i * nblocks];
|
|
3279
|
+
}
|
|
3280
|
+
*dst++ = make_block_q4_Kx16(dst_tmp, interleave_block);
|
|
3281
|
+
}
|
|
3282
|
+
src += nrows_interleaved * nblocks;
|
|
3283
|
+
}
|
|
3284
|
+
return 0;
|
|
3285
|
+
|
|
3286
|
+
GGML_UNUSED(data_size);
|
|
3287
|
+
}
|
|
3288
|
+
|
|
1690
3289
|
static int repack_q2_K_to_q2_K_8_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
|
|
1691
3290
|
GGML_ASSERT(t->type == GGML_TYPE_Q2_K);
|
|
1692
3291
|
GGML_ASSERT(interleave_block == 8);
|
|
@@ -1706,7 +3305,7 @@ static int repack_q2_K_to_q2_K_8_bl(struct ggml_tensor * t, int interleave_block
|
|
|
1706
3305
|
|
|
1707
3306
|
for (int b = 0; b < nrow; b += nrows_interleaved) {
|
|
1708
3307
|
for (int64_t x = 0; x < nblocks; x++) {
|
|
1709
|
-
for (int i
|
|
3308
|
+
for (int i = 0; i < nrows_interleaved; i++) {
|
|
1710
3309
|
dst_tmp[i] = src[x + i * nblocks];
|
|
1711
3310
|
}
|
|
1712
3311
|
*dst++ = make_block_q2_Kx8(dst_tmp, interleave_block);
|
|
@@ -1718,6 +3317,132 @@ static int repack_q2_K_to_q2_K_8_bl(struct ggml_tensor * t, int interleave_block
|
|
|
1718
3317
|
GGML_UNUSED(data_size);
|
|
1719
3318
|
}
|
|
1720
3319
|
|
|
3320
|
+
static int repack_q2_K_to_q2_K_16_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
|
|
3321
|
+
GGML_ASSERT(t->type == GGML_TYPE_Q2_K);
|
|
3322
|
+
constexpr int nrows_interleaved = 16;
|
|
3323
|
+
|
|
3324
|
+
block_q2_Kx16 * dst = (block_q2_Kx16*)t->data;
|
|
3325
|
+
const block_q2_K * src = (const block_q2_K*) data;
|
|
3326
|
+
|
|
3327
|
+
block_q2_K dst_tmp[nrows_interleaved];
|
|
3328
|
+
|
|
3329
|
+
int nrow = ggml_nrows(t);
|
|
3330
|
+
int nblocks = t->ne[0] / QK_K;
|
|
3331
|
+
|
|
3332
|
+
GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q2_K));
|
|
3333
|
+
|
|
3334
|
+
if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
|
|
3335
|
+
return -1;
|
|
3336
|
+
}
|
|
3337
|
+
|
|
3338
|
+
for (int b = 0; b < nrow; b += nrows_interleaved) {
|
|
3339
|
+
for (int64_t x = 0; x < nblocks; x++) {
|
|
3340
|
+
// This loop gathers 16 separate blocks (one from each column)
|
|
3341
|
+
// that correspond to the same K-dimension chunk.
|
|
3342
|
+
for (int i = 0; i < nrows_interleaved; i++ ) {
|
|
3343
|
+
dst_tmp[i] = src[x + i * nblocks];
|
|
3344
|
+
}
|
|
3345
|
+
|
|
3346
|
+
*dst++ = make_block_q2_Kx16(dst_tmp, interleave_block);
|
|
3347
|
+
}
|
|
3348
|
+
src += nrows_interleaved * nblocks;
|
|
3349
|
+
}
|
|
3350
|
+
return 0;
|
|
3351
|
+
|
|
3352
|
+
GGML_UNUSED(data_size);
|
|
3353
|
+
}
|
|
3354
|
+
|
|
3355
|
+
static int repack_q4_0_to_q4_0_16_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
|
|
3356
|
+
GGML_ASSERT(t->type == GGML_TYPE_Q4_0);
|
|
3357
|
+
constexpr int nrows_interleaved = 16;
|
|
3358
|
+
|
|
3359
|
+
block_q4_0x16 * dst = (block_q4_0x16*)t->data;
|
|
3360
|
+
const block_q4_0 * src = (const block_q4_0*) data;
|
|
3361
|
+
block_q4_0 dst_tmp[16];
|
|
3362
|
+
int nrow = ggml_nrows(t);
|
|
3363
|
+
int nblocks = t->ne[0] / QK4_0;
|
|
3364
|
+
|
|
3365
|
+
GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q4_0));
|
|
3366
|
+
|
|
3367
|
+
if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
|
|
3368
|
+
return -1;
|
|
3369
|
+
}
|
|
3370
|
+
|
|
3371
|
+
for (int b = 0; b < nrow; b += nrows_interleaved) {
|
|
3372
|
+
for (int64_t x = 0; x < nblocks; x++) {
|
|
3373
|
+
for (int i = 0; i < nrows_interleaved; i++ ) {
|
|
3374
|
+
dst_tmp[i] = src[x + i * nblocks];
|
|
3375
|
+
}
|
|
3376
|
+
*dst++ = make_block_q4_0x16(dst_tmp, interleave_block);
|
|
3377
|
+
}
|
|
3378
|
+
src += nrows_interleaved * nblocks;
|
|
3379
|
+
}
|
|
3380
|
+
return 0;
|
|
3381
|
+
|
|
3382
|
+
GGML_UNUSED(data_size);
|
|
3383
|
+
}
|
|
3384
|
+
|
|
3385
|
+
static int repack_q5_K_to_q5_K_8_bl(struct ggml_tensor * t,
|
|
3386
|
+
int interleave_block,
|
|
3387
|
+
const void * GGML_RESTRICT data,
|
|
3388
|
+
size_t data_size) {
|
|
3389
|
+
GGML_ASSERT(t->type == GGML_TYPE_Q5_K);
|
|
3390
|
+
GGML_ASSERT(interleave_block == 4 || interleave_block == 8);
|
|
3391
|
+
constexpr int nrows_interleaved = 8;
|
|
3392
|
+
|
|
3393
|
+
block_q5_Kx8 * dst = (block_q5_Kx8 *) t->data;
|
|
3394
|
+
const block_q5_K * src = (const block_q5_K *) data;
|
|
3395
|
+
block_q5_K dst_tmp[8];
|
|
3396
|
+
int nrow = ggml_nrows(t);
|
|
3397
|
+
int nblocks = t->ne[0] / QK_K;
|
|
3398
|
+
|
|
3399
|
+
GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q5_K));
|
|
3400
|
+
|
|
3401
|
+
if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
|
|
3402
|
+
return -1;
|
|
3403
|
+
}
|
|
3404
|
+
|
|
3405
|
+
for (int b = 0; b < nrow; b += nrows_interleaved) {
|
|
3406
|
+
for (int64_t x = 0; x < nblocks; x++) {
|
|
3407
|
+
for (int i = 0; i < nrows_interleaved; i++) {
|
|
3408
|
+
dst_tmp[i] = src[x + i * nblocks];
|
|
3409
|
+
}
|
|
3410
|
+
*dst++ = make_block_q5_Kx8(dst_tmp, interleave_block);
|
|
3411
|
+
}
|
|
3412
|
+
src += nrows_interleaved * nblocks;
|
|
3413
|
+
}
|
|
3414
|
+
return 0;
|
|
3415
|
+
}
|
|
3416
|
+
|
|
3417
|
+
static int repack_q6_K_to_q6_K_8_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
|
|
3418
|
+
GGML_ASSERT(t->type == GGML_TYPE_Q6_K);
|
|
3419
|
+
GGML_ASSERT(interleave_block == 4 || interleave_block == 8);
|
|
3420
|
+
constexpr int nrows_interleaved = 8;
|
|
3421
|
+
|
|
3422
|
+
block_q6_Kx8 * dst = (block_q6_Kx8 *)t->data;
|
|
3423
|
+
const block_q6_K * src = (const block_q6_K *) data;
|
|
3424
|
+
block_q6_K dst_tmp[8];
|
|
3425
|
+
int nrow = ggml_nrows(t);
|
|
3426
|
+
int nblocks = t->ne[0] / QK_K;
|
|
3427
|
+
|
|
3428
|
+
GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q6_K));
|
|
3429
|
+
|
|
3430
|
+
if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
|
|
3431
|
+
return -1;
|
|
3432
|
+
}
|
|
3433
|
+
|
|
3434
|
+
for (int b = 0; b < nrow; b += nrows_interleaved) {
|
|
3435
|
+
for (int64_t x = 0; x < nblocks; x++) {
|
|
3436
|
+
for (int i = 0; i < nrows_interleaved; i++) {
|
|
3437
|
+
dst_tmp[i] = src[x + i * nblocks];
|
|
3438
|
+
}
|
|
3439
|
+
*dst++ = make_block_q6_Kx8(dst_tmp, interleave_block);
|
|
3440
|
+
}
|
|
3441
|
+
src += nrows_interleaved * nblocks;
|
|
3442
|
+
}
|
|
3443
|
+
return 0;
|
|
3444
|
+
}
|
|
3445
|
+
|
|
1721
3446
|
static int repack_q4_0_to_q4_0_8_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
|
|
1722
3447
|
GGML_ASSERT(t->type == GGML_TYPE_Q4_0);
|
|
1723
3448
|
GGML_ASSERT(interleave_block == 8);
|
|
@@ -1757,9 +3482,63 @@ static int repack_q8_0_to_q8_0_4_bl(struct ggml_tensor * t,
|
|
|
1757
3482
|
GGML_ASSERT(interleave_block == 4 || interleave_block == 8);
|
|
1758
3483
|
constexpr int nrows_interleaved = 4;
|
|
1759
3484
|
|
|
1760
|
-
block_q8_0x4 * dst = (block_q8_0x4 *) t->data;
|
|
3485
|
+
block_q8_0x4 * dst = (block_q8_0x4 *) t->data;
|
|
3486
|
+
const block_q8_0 * src = (const block_q8_0 *) data;
|
|
3487
|
+
block_q8_0 dst_tmp[4];
|
|
3488
|
+
int nrow = ggml_nrows(t);
|
|
3489
|
+
int nblocks = t->ne[0] / QK8_0;
|
|
3490
|
+
|
|
3491
|
+
GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q8_0));
|
|
3492
|
+
|
|
3493
|
+
if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
|
|
3494
|
+
return -1;
|
|
3495
|
+
}
|
|
3496
|
+
|
|
3497
|
+
for (int b = 0; b < nrow; b += nrows_interleaved) {
|
|
3498
|
+
for (int64_t x = 0; x < nblocks; x++) {
|
|
3499
|
+
for (int i = 0; i < nrows_interleaved; i++) {
|
|
3500
|
+
dst_tmp[i] = src[x + i * nblocks];
|
|
3501
|
+
}
|
|
3502
|
+
*dst++ = make_block_q8_0x4(dst_tmp, interleave_block);
|
|
3503
|
+
}
|
|
3504
|
+
src += nrows_interleaved * nblocks;
|
|
3505
|
+
}
|
|
3506
|
+
return 0;
|
|
3507
|
+
}
|
|
3508
|
+
|
|
3509
|
+
static block_q8_0x16 make_block_q8_0x16(block_q8_0 * in, unsigned int blck_size_interleave) {
|
|
3510
|
+
block_q8_0x16 out;
|
|
3511
|
+
|
|
3512
|
+
for (int i = 0; i < 16; i++) {
|
|
3513
|
+
out.d[i] = in[i].d;
|
|
3514
|
+
}
|
|
3515
|
+
|
|
3516
|
+
const int end = QK8_0 * 16 / blck_size_interleave;
|
|
3517
|
+
|
|
3518
|
+
if (blck_size_interleave == 1) {
|
|
3519
|
+
for (int i = 0; i < end; ++i) {
|
|
3520
|
+
int src_id = i % 16;
|
|
3521
|
+
int src_offset = i / 16;
|
|
3522
|
+
int dst_offset = i;
|
|
3523
|
+
out.qs[dst_offset] = in[src_id].qs[src_offset];
|
|
3524
|
+
}
|
|
3525
|
+
} else {
|
|
3526
|
+
GGML_ASSERT(false);
|
|
3527
|
+
}
|
|
3528
|
+
|
|
3529
|
+
return out;
|
|
3530
|
+
}
|
|
3531
|
+
|
|
3532
|
+
static int repack_q8_0_to_q8_0_16_bl(struct ggml_tensor * t,
|
|
3533
|
+
int interleave_block,
|
|
3534
|
+
const void * GGML_RESTRICT data,
|
|
3535
|
+
size_t data_size) {
|
|
3536
|
+
GGML_ASSERT(t->type == GGML_TYPE_Q8_0);
|
|
3537
|
+
constexpr int nrows_interleaved = 16;
|
|
3538
|
+
|
|
3539
|
+
block_q8_0x16 * dst = (block_q8_0x16 *) t->data;
|
|
1761
3540
|
const block_q8_0 * src = (const block_q8_0 *) data;
|
|
1762
|
-
block_q8_0 dst_tmp[
|
|
3541
|
+
block_q8_0 dst_tmp[16];
|
|
1763
3542
|
int nrow = ggml_nrows(t);
|
|
1764
3543
|
int nblocks = t->ne[0] / QK8_0;
|
|
1765
3544
|
|
|
@@ -1774,7 +3553,7 @@ static int repack_q8_0_to_q8_0_4_bl(struct ggml_tensor * t,
|
|
|
1774
3553
|
for (int i = 0; i < nrows_interleaved; i++) {
|
|
1775
3554
|
dst_tmp[i] = src[x + i * nblocks];
|
|
1776
3555
|
}
|
|
1777
|
-
*dst++ =
|
|
3556
|
+
*dst++ = make_block_q8_0x16(dst_tmp, interleave_block);
|
|
1778
3557
|
}
|
|
1779
3558
|
src += nrows_interleaved * nblocks;
|
|
1780
3559
|
}
|
|
@@ -1906,6 +3685,177 @@ static int repack_iq4_nl_to_iq4_nl_8_bl(struct ggml_tensor * t, int interleave_b
|
|
|
1906
3685
|
GGML_UNUSED(data_size);
|
|
1907
3686
|
}
|
|
1908
3687
|
|
|
3688
|
+
static block_iq4_nlx16 make_block_iq4_nlx16(block_iq4_nl * in, unsigned int blck_size_interleave) {
|
|
3689
|
+
block_iq4_nlx16 out;
|
|
3690
|
+
|
|
3691
|
+
for (int i = 0; i < 16; i++) {
|
|
3692
|
+
out.d[i] = in[i].d;
|
|
3693
|
+
}
|
|
3694
|
+
|
|
3695
|
+
const int end = QK4_NL * 8 / blck_size_interleave;
|
|
3696
|
+
|
|
3697
|
+
if (blck_size_interleave == 1) {
|
|
3698
|
+
for (int i = 0; i < end; ++i) {
|
|
3699
|
+
int src_id = i % 16;
|
|
3700
|
+
int src_offset = i / 16;
|
|
3701
|
+
int dst_offset = i;
|
|
3702
|
+
|
|
3703
|
+
out.qs[dst_offset] = in[src_id].qs[src_offset];
|
|
3704
|
+
}
|
|
3705
|
+
} else {
|
|
3706
|
+
GGML_ASSERT(false);
|
|
3707
|
+
}
|
|
3708
|
+
|
|
3709
|
+
return out;
|
|
3710
|
+
}
|
|
3711
|
+
|
|
3712
|
+
static int repack_iq4_nl_to_iq4_nl_16_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
|
|
3713
|
+
GGML_ASSERT(t->type == GGML_TYPE_IQ4_NL);
|
|
3714
|
+
GGML_ASSERT(interleave_block == 1);
|
|
3715
|
+
|
|
3716
|
+
const block_iq4_nl * src = (const block_iq4_nl *)data;
|
|
3717
|
+
block_iq4_nlx16 * dst = ( block_iq4_nlx16 *)t->data;
|
|
3718
|
+
|
|
3719
|
+
block_iq4_nl dst_tmp[16];
|
|
3720
|
+
|
|
3721
|
+
int nrow = ggml_nrows(t);
|
|
3722
|
+
int nrows_interleaved = 16;
|
|
3723
|
+
int nblocks = t->ne[0] / QK4_NL;
|
|
3724
|
+
|
|
3725
|
+
GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_iq4_nl));
|
|
3726
|
+
|
|
3727
|
+
if (t->ne[1] % nrows_interleaved != 0) {
|
|
3728
|
+
return -1;
|
|
3729
|
+
}
|
|
3730
|
+
|
|
3731
|
+
for (int b = 0; b < nrow; b += nrows_interleaved) {
|
|
3732
|
+
for (int64_t x = 0; x < nblocks; x++) {
|
|
3733
|
+
for (int i = 0; i < nrows_interleaved; i++) {
|
|
3734
|
+
dst_tmp[i] = src[x + i * nblocks];
|
|
3735
|
+
}
|
|
3736
|
+
*dst++ = make_block_iq4_nlx16(dst_tmp, interleave_block);
|
|
3737
|
+
}
|
|
3738
|
+
src += nrows_interleaved * nblocks;
|
|
3739
|
+
}
|
|
3740
|
+
return 0;
|
|
3741
|
+
|
|
3742
|
+
GGML_UNUSED(data_size);
|
|
3743
|
+
}
|
|
3744
|
+
|
|
3745
|
+
static block_mxfp4x4 make_block_mxfp4x4(block_mxfp4 * in, unsigned int blck_size_interleave) {
|
|
3746
|
+
block_mxfp4x4 out;
|
|
3747
|
+
|
|
3748
|
+
for (int i = 0; i < 4; i++) {
|
|
3749
|
+
out.e[i] = in[i].e;
|
|
3750
|
+
}
|
|
3751
|
+
|
|
3752
|
+
const int end = QK_MXFP4 * 2 / blck_size_interleave;
|
|
3753
|
+
|
|
3754
|
+
if (blck_size_interleave == 4) {
|
|
3755
|
+
for (int i = 0; i < end; ++i) {
|
|
3756
|
+
int src_id = i % 4;
|
|
3757
|
+
int src_offset = (i / 4) * blck_size_interleave;
|
|
3758
|
+
int dst_offset = i * blck_size_interleave;
|
|
3759
|
+
|
|
3760
|
+
memcpy(&out.qs[dst_offset], &in[src_id].qs[src_offset], sizeof(uint32_t));
|
|
3761
|
+
}
|
|
3762
|
+
} else {
|
|
3763
|
+
GGML_ASSERT(false);
|
|
3764
|
+
}
|
|
3765
|
+
|
|
3766
|
+
return out;
|
|
3767
|
+
}
|
|
3768
|
+
|
|
3769
|
+
static int repack_mxfp4_to_mxfp4_4_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
|
|
3770
|
+
GGML_ASSERT(t->type == GGML_TYPE_MXFP4);
|
|
3771
|
+
GGML_ASSERT(interleave_block == 4);
|
|
3772
|
+
|
|
3773
|
+
const block_mxfp4 * src = (const block_mxfp4 *)data;
|
|
3774
|
+
block_mxfp4x4 * dst = ( block_mxfp4x4 *)t->data;
|
|
3775
|
+
|
|
3776
|
+
block_mxfp4 dst_tmp[4];
|
|
3777
|
+
|
|
3778
|
+
int nrow = ggml_nrows(t);
|
|
3779
|
+
int nrows_interleaved = 4;
|
|
3780
|
+
int nblocks = t->ne[0] / QK_MXFP4;
|
|
3781
|
+
|
|
3782
|
+
GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_mxfp4));
|
|
3783
|
+
|
|
3784
|
+
if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
|
|
3785
|
+
return -1;
|
|
3786
|
+
}
|
|
3787
|
+
|
|
3788
|
+
for (int b = 0; b < nrow; b += nrows_interleaved) {
|
|
3789
|
+
for (int64_t x = 0; x < nblocks; x++) {
|
|
3790
|
+
for (int i = 0; i < nrows_interleaved; i++) {
|
|
3791
|
+
dst_tmp[i] = src[x + i * nblocks];
|
|
3792
|
+
}
|
|
3793
|
+
*dst++ = make_block_mxfp4x4(dst_tmp, interleave_block);
|
|
3794
|
+
}
|
|
3795
|
+
src += nrows_interleaved * nblocks;
|
|
3796
|
+
}
|
|
3797
|
+
return 0;
|
|
3798
|
+
|
|
3799
|
+
GGML_UNUSED(data_size);
|
|
3800
|
+
}
|
|
3801
|
+
|
|
3802
|
+
static block_mxfp4x8 make_block_mxfp4x8(block_mxfp4 * in, unsigned int blck_size_interleave) {
|
|
3803
|
+
block_mxfp4x8 out;
|
|
3804
|
+
|
|
3805
|
+
for (int i = 0; i < 8; i++) {
|
|
3806
|
+
out.e[i] = in[i].e;
|
|
3807
|
+
}
|
|
3808
|
+
|
|
3809
|
+
const int end = QK_MXFP4 * 4 / blck_size_interleave;
|
|
3810
|
+
|
|
3811
|
+
if (blck_size_interleave == 8) {
|
|
3812
|
+
for (int i = 0; i < end; ++i) {
|
|
3813
|
+
int src_id = i % 8;
|
|
3814
|
+
int src_offset = (i / 8) * blck_size_interleave;
|
|
3815
|
+
int dst_offset = i * blck_size_interleave;
|
|
3816
|
+
|
|
3817
|
+
memcpy(&out.qs[dst_offset], &in[src_id].qs[src_offset], sizeof(uint64_t));
|
|
3818
|
+
}
|
|
3819
|
+
} else {
|
|
3820
|
+
GGML_ASSERT(false);
|
|
3821
|
+
}
|
|
3822
|
+
|
|
3823
|
+
return out;
|
|
3824
|
+
}
|
|
3825
|
+
|
|
3826
|
+
static int repack_mxfp4_to_mxfp4_8_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
|
|
3827
|
+
GGML_ASSERT(t->type == GGML_TYPE_MXFP4);
|
|
3828
|
+
GGML_ASSERT(interleave_block == 8);
|
|
3829
|
+
|
|
3830
|
+
const block_mxfp4 * src = (const block_mxfp4 *)data;
|
|
3831
|
+
block_mxfp4x8 * dst = ( block_mxfp4x8 *)t->data;
|
|
3832
|
+
|
|
3833
|
+
block_mxfp4 dst_tmp[8];
|
|
3834
|
+
|
|
3835
|
+
int nrow = ggml_nrows(t);
|
|
3836
|
+
int nrows_interleaved = 8;
|
|
3837
|
+
int nblocks = t->ne[0] / QK_MXFP4;
|
|
3838
|
+
|
|
3839
|
+
GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_mxfp4));
|
|
3840
|
+
|
|
3841
|
+
if (t->ne[1] % nrows_interleaved != 0) {
|
|
3842
|
+
return -1;
|
|
3843
|
+
}
|
|
3844
|
+
|
|
3845
|
+
for (int b = 0; b < nrow; b += nrows_interleaved) {
|
|
3846
|
+
for (int64_t x = 0; x < nblocks; x++) {
|
|
3847
|
+
for (int i = 0; i < nrows_interleaved; i++) {
|
|
3848
|
+
dst_tmp[i] = src[x + i * nblocks];
|
|
3849
|
+
}
|
|
3850
|
+
*dst++ = make_block_mxfp4x8(dst_tmp, interleave_block);
|
|
3851
|
+
}
|
|
3852
|
+
src += nrows_interleaved * nblocks;
|
|
3853
|
+
}
|
|
3854
|
+
return 0;
|
|
3855
|
+
|
|
3856
|
+
GGML_UNUSED(data_size);
|
|
3857
|
+
}
|
|
3858
|
+
|
|
1909
3859
|
namespace ggml::cpu::repack {
|
|
1910
3860
|
// repack
|
|
1911
3861
|
template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS>
|
|
@@ -1936,6 +3886,22 @@ template <> int repack<block_q2_K, 8, 8>(struct ggml_tensor * t, const void * da
|
|
|
1936
3886
|
return repack_q2_K_to_q2_K_8_bl(t, 8, data, data_size);
|
|
1937
3887
|
}
|
|
1938
3888
|
|
|
3889
|
+
template <> int repack<block_q5_K, 4, 8>(struct ggml_tensor * t, const void * data, size_t data_size) {
|
|
3890
|
+
return repack_q5_K_to_q5_K_8_bl(t, 4, data, data_size);
|
|
3891
|
+
}
|
|
3892
|
+
|
|
3893
|
+
template <> int repack<block_q5_K, 8, 8>(struct ggml_tensor * t, const void * data, size_t data_size) {
|
|
3894
|
+
return repack_q5_K_to_q5_K_8_bl(t, 8, data, data_size);
|
|
3895
|
+
}
|
|
3896
|
+
|
|
3897
|
+
template <> int repack<block_q6_K, 4, 8>(struct ggml_tensor * t, const void * data, size_t data_size) {
|
|
3898
|
+
return repack_q6_K_to_q6_K_8_bl(t, 4, data, data_size);
|
|
3899
|
+
}
|
|
3900
|
+
|
|
3901
|
+
template <> int repack<block_q6_K, 8, 8>(struct ggml_tensor * t, const void * data, size_t data_size) {
|
|
3902
|
+
return repack_q6_K_to_q6_K_8_bl(t, 8, data, data_size);
|
|
3903
|
+
}
|
|
3904
|
+
|
|
1939
3905
|
template <> int repack<block_iq4_nl, 4, 4>(struct ggml_tensor * t, const void * data, size_t data_size) {
|
|
1940
3906
|
return repack_iq4_nl_to_iq4_nl_4_bl(t, 4, data, data_size);
|
|
1941
3907
|
}
|
|
@@ -1949,6 +3915,14 @@ template <> int repack<block_iq4_nl, 8, 8>(struct ggml_tensor * t, const void *
|
|
|
1949
3915
|
return repack_iq4_nl_to_iq4_nl_8_bl(t, 8, data, data_size);
|
|
1950
3916
|
}
|
|
1951
3917
|
|
|
3918
|
+
template <> int repack<block_mxfp4, 4, 4>(struct ggml_tensor * t, const void * data, size_t data_size) {
|
|
3919
|
+
return repack_mxfp4_to_mxfp4_4_bl(t, 4, data, data_size);
|
|
3920
|
+
}
|
|
3921
|
+
|
|
3922
|
+
template <> int repack<block_mxfp4, 8, 8>(struct ggml_tensor * t, const void * data, size_t data_size) {
|
|
3923
|
+
return repack_mxfp4_to_mxfp4_8_bl(t, 8, data, data_size);
|
|
3924
|
+
}
|
|
3925
|
+
|
|
1952
3926
|
template <> int repack<block_q8_0, 4, 4>(struct ggml_tensor * t, const void * data, size_t data_size) {
|
|
1953
3927
|
return repack_q8_0_to_q8_0_4_bl(t, 4, data, data_size);
|
|
1954
3928
|
}
|
|
@@ -1957,6 +3931,28 @@ template <> int repack<block_q8_0, 8, 4>(struct ggml_tensor * t, const void * da
|
|
|
1957
3931
|
return repack_q8_0_to_q8_0_4_bl(t, 8, data, data_size);
|
|
1958
3932
|
}
|
|
1959
3933
|
|
|
3934
|
+
#if defined __riscv_zvfh
|
|
3935
|
+
template <> int repack<block_q4_0, 1, 16>(struct ggml_tensor * t, const void * data, size_t data_size) {
|
|
3936
|
+
return repack_q4_0_to_q4_0_16_bl(t, 1, data, data_size);
|
|
3937
|
+
}
|
|
3938
|
+
|
|
3939
|
+
template <> int repack<block_q4_K, 1, 16>(struct ggml_tensor * t, const void * data, size_t data_size) {
|
|
3940
|
+
return repack_q4_K_to_q4_K_16_bl(t, 1, data, data_size);
|
|
3941
|
+
}
|
|
3942
|
+
|
|
3943
|
+
template <> int repack<block_iq4_nl, 1, 16>(struct ggml_tensor * t, const void * data, size_t data_size) {
|
|
3944
|
+
return repack_iq4_nl_to_iq4_nl_16_bl(t, 1, data, data_size);
|
|
3945
|
+
}
|
|
3946
|
+
|
|
3947
|
+
template <> int repack<block_q8_0, 1, 16>(struct ggml_tensor * t, const void * data, size_t data_size) {
|
|
3948
|
+
return repack_q8_0_to_q8_0_16_bl(t, 1, data, data_size);
|
|
3949
|
+
}
|
|
3950
|
+
|
|
3951
|
+
template <> int repack<block_q2_K, 1, 16>(struct ggml_tensor * t, const void * data, size_t data_size) {
|
|
3952
|
+
return repack_q2_K_to_q2_K_16_bl(t, 1, data, data_size);
|
|
3953
|
+
}
|
|
3954
|
+
#endif
|
|
3955
|
+
|
|
1960
3956
|
// gemv
|
|
1961
3957
|
template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PARAM_TYPE>
|
|
1962
3958
|
void gemv(int, float *, size_t, const void *, const void *, int, int);
|
|
@@ -1973,6 +3969,17 @@ template <> void gemv<block_q4_0, 8, 8, GGML_TYPE_Q8_0>(int n, float * s, size_t
|
|
|
1973
3969
|
ggml_gemv_q4_0_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
|
|
1974
3970
|
}
|
|
1975
3971
|
|
|
3972
|
+
template <>
|
|
3973
|
+
void gemv<block_q2_K, 8, 8, GGML_TYPE_Q8_K>(int n,
|
|
3974
|
+
float * s,
|
|
3975
|
+
size_t bs,
|
|
3976
|
+
const void * vx,
|
|
3977
|
+
const void * vy,
|
|
3978
|
+
int nr,
|
|
3979
|
+
int nc) {
|
|
3980
|
+
ggml_gemv_q2_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
|
|
3981
|
+
}
|
|
3982
|
+
|
|
1976
3983
|
template <> void gemv<block_q4_K, 4, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
|
|
1977
3984
|
ggml_gemv_q4_K_8x4_q8_K(n, s, bs, vx, vy, nr, nc);
|
|
1978
3985
|
}
|
|
@@ -1981,8 +3988,20 @@ template <> void gemv<block_q4_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t
|
|
|
1981
3988
|
ggml_gemv_q4_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
|
|
1982
3989
|
}
|
|
1983
3990
|
|
|
1984
|
-
template <> void gemv<
|
|
1985
|
-
|
|
3991
|
+
template <> void gemv<block_q5_K, 4, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
|
|
3992
|
+
ggml_gemv_q5_K_8x4_q8_K(n, s, bs, vx, vy, nr, nc);
|
|
3993
|
+
}
|
|
3994
|
+
|
|
3995
|
+
template <> void gemv<block_q5_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
|
|
3996
|
+
ggml_gemv_q5_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
|
|
3997
|
+
}
|
|
3998
|
+
|
|
3999
|
+
template <> void gemv<block_q6_K, 4, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
|
|
4000
|
+
ggml_gemv_q6_K_8x4_q8_K(n, s, bs, vx, vy, nr, nc);
|
|
4001
|
+
}
|
|
4002
|
+
|
|
4003
|
+
template <> void gemv<block_q6_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
|
|
4004
|
+
ggml_gemv_q6_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
|
|
1986
4005
|
}
|
|
1987
4006
|
|
|
1988
4007
|
template <> void gemv<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
|
|
@@ -1993,6 +4012,14 @@ template <> void gemv<block_iq4_nl, 8, 8, GGML_TYPE_Q8_0>(int n, float * s, size
|
|
|
1993
4012
|
ggml_gemv_iq4_nl_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
|
|
1994
4013
|
}
|
|
1995
4014
|
|
|
4015
|
+
template <> void gemv<block_mxfp4, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
|
|
4016
|
+
ggml_gemv_mxfp4_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
|
|
4017
|
+
}
|
|
4018
|
+
|
|
4019
|
+
template <> void gemv<block_mxfp4, 8, 8, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
|
|
4020
|
+
ggml_gemv_mxfp4_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
|
|
4021
|
+
}
|
|
4022
|
+
|
|
1996
4023
|
template <> void gemv<block_q8_0, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
|
|
1997
4024
|
ggml_gemv_q8_0_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
|
|
1998
4025
|
}
|
|
@@ -2001,6 +4028,28 @@ template <> void gemv<block_q8_0, 8, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t
|
|
|
2001
4028
|
ggml_gemv_q8_0_4x8_q8_0(n, s, bs, vx, vy, nr, nc);
|
|
2002
4029
|
}
|
|
2003
4030
|
|
|
4031
|
+
#if defined __riscv_zvfh
|
|
4032
|
+
template <> void gemv<block_q4_0, 1, 16, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
|
|
4033
|
+
ggml_gemv_q4_0_16x1_q8_0(n, s, bs, vx, vy, nr, nc);
|
|
4034
|
+
}
|
|
4035
|
+
|
|
4036
|
+
template <> void gemv<block_q4_K, 1, 16, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
|
|
4037
|
+
ggml_gemv_q4_K_16x1_q8_K(n, s, bs, vx, vy, nr, nc);
|
|
4038
|
+
}
|
|
4039
|
+
|
|
4040
|
+
template <> void gemv<block_iq4_nl, 1, 16, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
|
|
4041
|
+
ggml_gemv_iq4_nl_16x1_q8_0(n, s, bs, vx, vy, nr, nc);
|
|
4042
|
+
}
|
|
4043
|
+
|
|
4044
|
+
template <> void gemv<block_q8_0, 1, 16, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
|
|
4045
|
+
ggml_gemv_q8_0_16x1_q8_0(n, s, bs, vx, vy, nr, nc);
|
|
4046
|
+
}
|
|
4047
|
+
|
|
4048
|
+
template <> void gemv<block_q2_K, 1, 16, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
|
|
4049
|
+
ggml_gemv_q2_K_16x1_q8_K(n, s, bs, vx, vy, nr, nc);
|
|
4050
|
+
}
|
|
4051
|
+
#endif
|
|
4052
|
+
|
|
2004
4053
|
// gemm
|
|
2005
4054
|
template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PARAM_TYPE>
|
|
2006
4055
|
void gemm(int, float *, size_t, const void *, const void *, int, int);
|
|
@@ -2013,20 +4062,43 @@ template <> void gemm<block_q4_0, 8, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t
|
|
|
2013
4062
|
ggml_gemm_q4_0_4x8_q8_0(n, s, bs, vx, vy, nr, nc);
|
|
2014
4063
|
}
|
|
2015
4064
|
|
|
2016
|
-
template <>
|
|
2017
|
-
|
|
4065
|
+
template <>
|
|
4066
|
+
void gemm<block_q4_0, 8, 8, GGML_TYPE_Q8_0>(int n,
|
|
4067
|
+
float * s,
|
|
4068
|
+
size_t bs,
|
|
4069
|
+
const void * vx,
|
|
4070
|
+
const void * vy,
|
|
4071
|
+
int nr,
|
|
4072
|
+
int nc) {
|
|
4073
|
+
ggml_gemm_q4_0_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
|
|
2018
4074
|
}
|
|
2019
4075
|
|
|
2020
|
-
template <> void gemm<
|
|
2021
|
-
|
|
4076
|
+
template <> void gemm<block_q2_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
|
|
4077
|
+
ggml_gemm_q2_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
|
|
4078
|
+
}
|
|
4079
|
+
|
|
4080
|
+
template <> void gemm<block_q4_K, 4, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
|
|
4081
|
+
ggml_gemm_q4_K_8x4_q8_K(n, s, bs, vx, vy, nr, nc);
|
|
2022
4082
|
}
|
|
2023
4083
|
|
|
2024
4084
|
template <> void gemm<block_q4_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
|
|
2025
4085
|
ggml_gemm_q4_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
|
|
2026
4086
|
}
|
|
2027
4087
|
|
|
2028
|
-
template <> void gemm<
|
|
2029
|
-
|
|
4088
|
+
template <> void gemm<block_q5_K, 4, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
|
|
4089
|
+
ggml_gemm_q5_K_8x4_q8_K(n, s, bs, vx, vy, nr, nc);
|
|
4090
|
+
}
|
|
4091
|
+
|
|
4092
|
+
template <> void gemm<block_q5_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
|
|
4093
|
+
ggml_gemm_q5_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
|
|
4094
|
+
}
|
|
4095
|
+
|
|
4096
|
+
template <> void gemm<block_q6_K, 4, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
|
|
4097
|
+
ggml_gemm_q6_K_8x4_q8_K(n, s, bs, vx, vy, nr, nc);
|
|
4098
|
+
}
|
|
4099
|
+
|
|
4100
|
+
template <> void gemm<block_q6_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
|
|
4101
|
+
ggml_gemm_q6_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
|
|
2030
4102
|
}
|
|
2031
4103
|
|
|
2032
4104
|
template <> void gemm<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
|
|
@@ -2037,6 +4109,14 @@ template <> void gemm<block_iq4_nl, 8, 8, GGML_TYPE_Q8_0>(int n, float * s, size
|
|
|
2037
4109
|
ggml_gemm_iq4_nl_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
|
|
2038
4110
|
}
|
|
2039
4111
|
|
|
4112
|
+
template <> void gemm<block_mxfp4, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
|
|
4113
|
+
ggml_gemm_mxfp4_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
|
|
4114
|
+
}
|
|
4115
|
+
|
|
4116
|
+
template <> void gemm<block_mxfp4, 8, 8, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
|
|
4117
|
+
ggml_gemm_mxfp4_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
|
|
4118
|
+
}
|
|
4119
|
+
|
|
2040
4120
|
template <> void gemm<block_q8_0, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
|
|
2041
4121
|
ggml_gemm_q8_0_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
|
|
2042
4122
|
}
|
|
@@ -2045,6 +4125,28 @@ template <> void gemm<block_q8_0, 8, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t
|
|
|
2045
4125
|
ggml_gemm_q8_0_4x8_q8_0(n, s, bs, vx, vy, nr, nc);
|
|
2046
4126
|
}
|
|
2047
4127
|
|
|
4128
|
+
#if defined __riscv_zvfh
|
|
4129
|
+
template <> void gemm<block_q4_0, 1, 16, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
|
|
4130
|
+
ggml_gemm_q4_0_16x1_q8_0(n, s, bs, vx, vy, nr, nc);
|
|
4131
|
+
}
|
|
4132
|
+
|
|
4133
|
+
template <> void gemm<block_q4_K, 1, 16, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
|
|
4134
|
+
ggml_gemm_q4_K_16x1_q8_K(n, s, bs, vx, vy, nr, nc);
|
|
4135
|
+
}
|
|
4136
|
+
|
|
4137
|
+
template <> void gemm<block_iq4_nl, 1, 16, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
|
|
4138
|
+
ggml_gemm_iq4_nl_16x1_q8_0(n, s, bs, vx, vy, nr, nc);
|
|
4139
|
+
}
|
|
4140
|
+
|
|
4141
|
+
template <> void gemm<block_q8_0, 1, 16, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
|
|
4142
|
+
ggml_gemm_q8_0_16x1_q8_0(n, s, bs, vx, vy, nr, nc);
|
|
4143
|
+
}
|
|
4144
|
+
|
|
4145
|
+
template <> void gemm<block_q2_K, 1, 16, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
|
|
4146
|
+
ggml_gemm_q2_K_16x1_q8_K(n, s, bs, vx, vy, nr, nc);
|
|
4147
|
+
}
|
|
4148
|
+
#endif
|
|
4149
|
+
|
|
2048
4150
|
class tensor_traits_base : public ggml::cpu::tensor_traits {
|
|
2049
4151
|
public:
|
|
2050
4152
|
virtual int repack(struct ggml_tensor * t, const void * data, size_t data_size) = 0;
|
|
@@ -2063,7 +4165,7 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PAR
|
|
|
2063
4165
|
case GGML_OP_MUL_MAT_ID:
|
|
2064
4166
|
{
|
|
2065
4167
|
size = ggml_row_size(PARAM_TYPE, ggml_nelements(op->src[1]));
|
|
2066
|
-
size = GGML_PAD(size, sizeof(int64_t)); // + padding for next
|
|
4168
|
+
size = GGML_PAD(size, sizeof(int64_t)); // + padding for next block.
|
|
2067
4169
|
|
|
2068
4170
|
const int64_t ne02 = op->src[0]->ne[2]; // n_as, n_expert
|
|
2069
4171
|
const int64_t ne12 = op->src[1]->ne[2]; // n_tokens
|
|
@@ -2328,7 +4430,7 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PAR
|
|
|
2328
4430
|
auto * wdata = (char *)params->wdata;
|
|
2329
4431
|
auto * wdata_src1_end = (char *)wdata + GGML_PAD(nbw3, sizeof(int64_t));
|
|
2330
4432
|
|
|
2331
|
-
// total of [n_as][ne12 + 1]
|
|
4433
|
+
// total of [n_as][ne12 + 1] elements of type mmid_row_mapping (2*int32_t = int64_t)
|
|
2332
4434
|
auto * matrix_row_counts = (int64_t *) (wdata_src1_end); // [n_as]
|
|
2333
4435
|
struct mmid_row_mapping * matrix_rows = (struct mmid_row_mapping *) (matrix_row_counts + n_as); // [n_as][ne12]
|
|
2334
4436
|
|
|
@@ -2393,20 +4495,19 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PAR
|
|
|
2393
4495
|
for (int ir1 = 0; ir1 < nr1; ir1++) {
|
|
2394
4496
|
struct mmid_row_mapping row_mapping = MMID_MATRIX_ROW(cur_a, ir1);
|
|
2395
4497
|
|
|
2396
|
-
const int id = row_mapping.i1;
|
|
4498
|
+
const int id = row_mapping.i1; // selected expert index
|
|
2397
4499
|
|
|
2398
4500
|
const int64_t i11 = id % ne11;
|
|
2399
|
-
const int64_t i12 = row_mapping.i2;
|
|
4501
|
+
const int64_t i12 = row_mapping.i2; // row index in src1
|
|
2400
4502
|
|
|
2401
|
-
const int64_t i1 = id;
|
|
2402
|
-
const int64_t i2 = i12;
|
|
4503
|
+
const int64_t i1 = id; // selected expert index
|
|
4504
|
+
const int64_t i2 = i12; // row
|
|
2403
4505
|
|
|
2404
4506
|
const auto * src1_col = (const char *) wdata + (i11 * nbw1 + i12 * nbw2);
|
|
2405
4507
|
|
|
2406
|
-
gemv<BLOC_TYPE, INTER_SIZE, NB_COLS, PARAM_TYPE>(
|
|
2407
|
-
|
|
2408
|
-
|
|
2409
|
-
src1_col, 1, src0_cur_end - src0_cur_start);
|
|
4508
|
+
gemv<BLOC_TYPE, INTER_SIZE, NB_COLS, PARAM_TYPE>(
|
|
4509
|
+
ne00, (float *) ((char *) dst->data + (i1 * nb1 + i2 * nb2)) + src0_cur_start, ne01,
|
|
4510
|
+
src0_cur + src0_cur_start * nb01, src1_col, 1, src0_cur_end - src0_cur_start);
|
|
2410
4511
|
}
|
|
2411
4512
|
}
|
|
2412
4513
|
#undef MMID_MATRIX_ROW
|
|
@@ -2422,7 +4523,6 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PAR
|
|
|
2422
4523
|
} // namespace ggml::cpu::repack
|
|
2423
4524
|
|
|
2424
4525
|
static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(const struct ggml_tensor * cur) {
|
|
2425
|
-
|
|
2426
4526
|
// instance for Q4
|
|
2427
4527
|
static const ggml::cpu::repack::tensor_traits<block_q4_0, 4, 4, GGML_TYPE_Q8_0> q4_0_4x4_q8_0;
|
|
2428
4528
|
static const ggml::cpu::repack::tensor_traits<block_q4_0, 8, 4, GGML_TYPE_Q8_0> q4_0_4x8_q8_0;
|
|
@@ -2432,6 +4532,14 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons
|
|
|
2432
4532
|
static const ggml::cpu::repack::tensor_traits<block_q4_K, 4, 8, GGML_TYPE_Q8_K> q4_K_8x4_q8_K;
|
|
2433
4533
|
static const ggml::cpu::repack::tensor_traits<block_q4_K, 8, 8, GGML_TYPE_Q8_K> q4_K_8x8_q8_K;
|
|
2434
4534
|
|
|
4535
|
+
// instance for Q5_K
|
|
4536
|
+
static const ggml::cpu::repack::tensor_traits<block_q5_K, 4, 8, GGML_TYPE_Q8_K> q5_K_8x4_q8_K;
|
|
4537
|
+
static const ggml::cpu::repack::tensor_traits<block_q5_K, 8, 8, GGML_TYPE_Q8_K> q5_K_8x8_q8_K;
|
|
4538
|
+
|
|
4539
|
+
// instance for Q6_K
|
|
4540
|
+
static const ggml::cpu::repack::tensor_traits<block_q6_K, 4, 8, GGML_TYPE_Q8_K> q6_K_8x4_q8_K;
|
|
4541
|
+
static const ggml::cpu::repack::tensor_traits<block_q6_K, 8, 8, GGML_TYPE_Q8_K> q6_K_8x8_q8_K;
|
|
4542
|
+
|
|
2435
4543
|
// instance for Q2
|
|
2436
4544
|
static const ggml::cpu::repack::tensor_traits<block_q2_K, 8, 8, GGML_TYPE_Q8_K> q2_K_8x8_q8_K;
|
|
2437
4545
|
|
|
@@ -2439,13 +4547,28 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons
|
|
|
2439
4547
|
static const ggml::cpu::repack::tensor_traits<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0> iq4_nl_4x4_q8_0;
|
|
2440
4548
|
static const ggml::cpu::repack::tensor_traits<block_iq4_nl, 8, 8, GGML_TYPE_Q8_0> iq4_nl_8x8_q8_0;
|
|
2441
4549
|
|
|
4550
|
+
// instance for MXFP4
|
|
4551
|
+
static const ggml::cpu::repack::tensor_traits<block_mxfp4, 4, 4, GGML_TYPE_Q8_0> mxfp4_4x4_q8_0;
|
|
4552
|
+
static const ggml::cpu::repack::tensor_traits<block_mxfp4, 8, 8, GGML_TYPE_Q8_0> mxfp4_8x8_q8_0;
|
|
4553
|
+
|
|
2442
4554
|
// instance for Q8_0
|
|
2443
4555
|
static const ggml::cpu::repack::tensor_traits<block_q8_0, 4, 4, GGML_TYPE_Q8_0> q8_0_4x4_q8_0;
|
|
2444
4556
|
static const ggml::cpu::repack::tensor_traits<block_q8_0, 8, 4, GGML_TYPE_Q8_0> q8_0_4x8_q8_0;
|
|
2445
4557
|
|
|
4558
|
+
// instances for RISC-V
|
|
4559
|
+
//
|
|
4560
|
+
// These implement outer-product style matrix multiplication kernels with
|
|
4561
|
+
// an interleave of 1.
|
|
4562
|
+
#if defined __riscv_zvfh
|
|
4563
|
+
static const ggml::cpu::repack::tensor_traits<block_q4_0, 1, 16, GGML_TYPE_Q8_0> q4_0_16x1_q8_0;
|
|
4564
|
+
static const ggml::cpu::repack::tensor_traits<block_q4_K, 1, 16, GGML_TYPE_Q8_K> q4_K_16x1_q8_K;
|
|
4565
|
+
static const ggml::cpu::repack::tensor_traits<block_iq4_nl, 1, 16, GGML_TYPE_Q8_0> iq4_nl_16x1_q8_0;
|
|
4566
|
+
static const ggml::cpu::repack::tensor_traits<block_q8_0, 1, 16, GGML_TYPE_Q8_0> q8_0_16x1_q8_0;
|
|
4567
|
+
static const ggml::cpu::repack::tensor_traits<block_q2_K, 1, 16, GGML_TYPE_Q8_K> q2_K_16x1_q8_K;
|
|
4568
|
+
#endif
|
|
4569
|
+
|
|
2446
4570
|
if (cur->type == GGML_TYPE_Q4_0) {
|
|
2447
|
-
if (ggml_cpu_has_avx2() || (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0)
|
|
2448
|
-
|| (ggml_cpu_has_riscv_v() && (ggml_cpu_get_rvv_vlen() >= QK4_0))) {
|
|
4571
|
+
if (ggml_cpu_has_avx2() || (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0)) {
|
|
2449
4572
|
if (cur->ne[1] % 8 == 0) {
|
|
2450
4573
|
return &q4_0_8x8_q8_0;
|
|
2451
4574
|
}
|
|
@@ -2460,6 +4583,17 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons
|
|
|
2460
4583
|
return &q4_0_4x4_q8_0;
|
|
2461
4584
|
}
|
|
2462
4585
|
}
|
|
4586
|
+
if (ggml_cpu_has_riscv_v()) {
|
|
4587
|
+
#if defined __riscv_zvfh
|
|
4588
|
+
switch (__riscv_vlenb() * 8) {
|
|
4589
|
+
case 128: { break; } // TODO
|
|
4590
|
+
case 256: { if (cur->ne[1] % 16 == 0) { return &q4_0_16x1_q8_0; } break; }
|
|
4591
|
+
case 512: { break; } // TODO
|
|
4592
|
+
case 1024: { break; } // TODO
|
|
4593
|
+
default: { return nullptr; }
|
|
4594
|
+
}
|
|
4595
|
+
#endif
|
|
4596
|
+
}
|
|
2463
4597
|
} else if (cur->type == GGML_TYPE_Q4_K) {
|
|
2464
4598
|
if (ggml_cpu_has_avx2()) {
|
|
2465
4599
|
if (cur->ne[1] % 8 == 0) {
|
|
@@ -2476,12 +4610,56 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons
|
|
|
2476
4610
|
return &q4_K_8x4_q8_K;
|
|
2477
4611
|
}
|
|
2478
4612
|
}
|
|
4613
|
+
if (ggml_cpu_has_riscv_v()) {
|
|
4614
|
+
#if defined __riscv_zvfh
|
|
4615
|
+
switch (__riscv_vlenb() * 8) {
|
|
4616
|
+
case 128: { break; } // TODO
|
|
4617
|
+
case 256: { if (cur->ne[1] % 16 == 0) { return &q4_K_16x1_q8_K; } break; }
|
|
4618
|
+
case 512: { break; } // TODO
|
|
4619
|
+
case 1024: { break; } // TODO
|
|
4620
|
+
default: { return nullptr; }
|
|
4621
|
+
}
|
|
4622
|
+
#endif
|
|
4623
|
+
}
|
|
2479
4624
|
} else if (cur->type == GGML_TYPE_Q2_K) {
|
|
2480
4625
|
if (ggml_cpu_has_avx512()) {
|
|
2481
4626
|
if (cur->ne[1] % 8 == 0) {
|
|
2482
4627
|
return &q2_K_8x8_q8_K;
|
|
2483
4628
|
}
|
|
2484
4629
|
}
|
|
4630
|
+
if (ggml_cpu_has_riscv_v()) {
|
|
4631
|
+
#if defined __riscv_zvfh
|
|
4632
|
+
switch (__riscv_vlenb() * 8) {
|
|
4633
|
+
case 128: { break; } // TODO
|
|
4634
|
+
case 256: { if (cur->ne[1] % 16 == 0) { return &q2_K_16x1_q8_K; } break; }
|
|
4635
|
+
case 512: { break; } // TODO
|
|
4636
|
+
case 1024: { break; } // TODO
|
|
4637
|
+
default: { return nullptr; }
|
|
4638
|
+
}
|
|
4639
|
+
#endif
|
|
4640
|
+
}
|
|
4641
|
+
} else if (cur->type == GGML_TYPE_Q5_K) {
|
|
4642
|
+
if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
|
|
4643
|
+
if (cur->ne[1] % 8 == 0) {
|
|
4644
|
+
return &q5_K_8x8_q8_K;
|
|
4645
|
+
}
|
|
4646
|
+
}
|
|
4647
|
+
if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
|
|
4648
|
+
if (cur->ne[1] % 8 == 0) {
|
|
4649
|
+
return &q5_K_8x4_q8_K;
|
|
4650
|
+
}
|
|
4651
|
+
}
|
|
4652
|
+
} else if (cur->type == GGML_TYPE_Q6_K) {
|
|
4653
|
+
if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
|
|
4654
|
+
if (cur->ne[1] % 8 == 0) {
|
|
4655
|
+
return &q6_K_8x8_q8_K;
|
|
4656
|
+
}
|
|
4657
|
+
}
|
|
4658
|
+
if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
|
|
4659
|
+
if (cur->ne[1] % 8 == 0) {
|
|
4660
|
+
return &q6_K_8x4_q8_K;
|
|
4661
|
+
}
|
|
4662
|
+
}
|
|
2485
4663
|
} else if (cur->type == GGML_TYPE_IQ4_NL) {
|
|
2486
4664
|
if (ggml_cpu_has_avx2()) {
|
|
2487
4665
|
if (cur->ne[1] % 8 == 0) {
|
|
@@ -2493,6 +4671,28 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons
|
|
|
2493
4671
|
return &iq4_nl_4x4_q8_0;
|
|
2494
4672
|
}
|
|
2495
4673
|
}
|
|
4674
|
+
if (ggml_cpu_has_riscv_v()) {
|
|
4675
|
+
#if defined __riscv_zvfh
|
|
4676
|
+
switch (__riscv_vlenb() * 8) {
|
|
4677
|
+
case 128: { break; } // TODO
|
|
4678
|
+
case 256: { if (cur->ne[1] % 16 == 0) { return &iq4_nl_16x1_q8_0; } break; }
|
|
4679
|
+
case 512: { break; } // TODO
|
|
4680
|
+
case 1024: { break; } // TODO
|
|
4681
|
+
default: { return nullptr; }
|
|
4682
|
+
}
|
|
4683
|
+
#endif
|
|
4684
|
+
}
|
|
4685
|
+
} else if (cur->type == GGML_TYPE_MXFP4) {
|
|
4686
|
+
if (ggml_cpu_has_avx2()) {
|
|
4687
|
+
if (cur->ne[1] % 8 == 0) {
|
|
4688
|
+
return &mxfp4_8x8_q8_0;
|
|
4689
|
+
}
|
|
4690
|
+
}
|
|
4691
|
+
if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
|
|
4692
|
+
if (cur->ne[1] % 4 == 0) {
|
|
4693
|
+
return &mxfp4_4x4_q8_0;
|
|
4694
|
+
}
|
|
4695
|
+
}
|
|
2496
4696
|
} else if (cur->type == GGML_TYPE_Q8_0) {
|
|
2497
4697
|
if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
|
|
2498
4698
|
if (cur->ne[1] % 4 == 0) {
|
|
@@ -2504,6 +4704,17 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons
|
|
|
2504
4704
|
return &q8_0_4x4_q8_0;
|
|
2505
4705
|
}
|
|
2506
4706
|
}
|
|
4707
|
+
if (ggml_cpu_has_riscv_v()) {
|
|
4708
|
+
#if defined __riscv_zvfh
|
|
4709
|
+
switch (__riscv_vlenb() * 8) {
|
|
4710
|
+
case 128: { break; } // TODO
|
|
4711
|
+
case 256: { if (cur->ne[1] % 16 == 0) { return &q8_0_16x1_q8_0; } break; }
|
|
4712
|
+
case 512: { break; } // TODO
|
|
4713
|
+
case 1024: { break; } // TODO
|
|
4714
|
+
default: { return nullptr; }
|
|
4715
|
+
}
|
|
4716
|
+
#endif
|
|
4717
|
+
}
|
|
2507
4718
|
}
|
|
2508
4719
|
|
|
2509
4720
|
return nullptr;
|