whispercpp 1.3.3 → 1.3.4
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/ext/ruby_whisper_params.c +55 -25
- data/ext/sources/CMakeLists.txt +1 -1
- data/ext/sources/bindings/javascript/package.json +1 -1
- data/ext/sources/build-xcframework.sh +24 -0
- data/ext/sources/examples/CMakeLists.txt +1 -0
- data/ext/sources/examples/addon.node/addon.cpp +19 -19
- data/ext/sources/examples/addon.node/index.js +7 -5
- data/ext/sources/examples/bench/bench.cpp +26 -16
- data/ext/sources/examples/bench.wasm/index-tmpl.html +10 -9
- data/ext/sources/examples/cli/cli.cpp +4 -2
- data/ext/sources/examples/command/command.cpp +26 -24
- data/ext/sources/examples/command.wasm/index-tmpl.html +5 -4
- data/ext/sources/examples/common-ggml.cpp +2 -0
- data/ext/sources/examples/lsp/lsp.cpp +19 -17
- data/ext/sources/examples/server/server.cpp +24 -13
- data/ext/sources/examples/server.py +6 -1
- data/ext/sources/examples/stream/stream.cpp +4 -2
- data/ext/sources/examples/stream.wasm/emscripten.cpp +6 -6
- data/ext/sources/examples/stream.wasm/index-tmpl.html +82 -5
- data/ext/sources/examples/talk-llama/CMakeLists.txt +2 -2
- data/ext/sources/examples/talk-llama/llama-adapter.cpp +101 -4
- data/ext/sources/examples/talk-llama/llama-adapter.h +6 -0
- data/ext/sources/examples/talk-llama/llama-arch.cpp +588 -15
- data/ext/sources/examples/talk-llama/llama-arch.h +58 -1
- data/ext/sources/examples/talk-llama/llama-batch.cpp +103 -71
- data/ext/sources/examples/talk-llama/llama-batch.h +31 -18
- data/ext/sources/examples/talk-llama/llama-chat.cpp +120 -5
- data/ext/sources/examples/talk-llama/llama-chat.h +7 -0
- data/ext/sources/examples/talk-llama/llama-context.cpp +460 -357
- data/ext/sources/examples/talk-llama/llama-context.h +44 -29
- data/ext/sources/examples/talk-llama/llama-cparams.h +4 -4
- data/ext/sources/examples/talk-llama/llama-graph.cpp +543 -271
- data/ext/sources/examples/talk-llama/llama-graph.h +278 -168
- data/ext/sources/examples/talk-llama/llama-hparams.cpp +118 -4
- data/ext/sources/examples/talk-llama/llama-hparams.h +61 -15
- data/ext/sources/examples/talk-llama/llama-impl.h +2 -0
- data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.cpp +326 -0
- data/ext/sources/examples/talk-llama/{llama-kv-cache-unified-iswa.h → llama-kv-cache-iswa.h} +38 -29
- data/ext/sources/examples/talk-llama/llama-kv-cache.cpp +2020 -0
- data/ext/sources/examples/talk-llama/llama-kv-cache.h +358 -27
- data/ext/sources/examples/talk-llama/llama-kv-cells.h +80 -28
- data/ext/sources/examples/talk-llama/llama-memory-hybrid.cpp +56 -36
- data/ext/sources/examples/talk-llama/llama-memory-hybrid.h +30 -29
- data/ext/sources/examples/talk-llama/llama-memory-recurrent.cpp +48 -19
- data/ext/sources/examples/talk-llama/llama-memory-recurrent.h +13 -14
- data/ext/sources/examples/talk-llama/llama-memory.h +16 -10
- data/ext/sources/examples/talk-llama/llama-model-loader.cpp +2 -0
- data/ext/sources/examples/talk-llama/llama-model-loader.h +3 -2
- data/ext/sources/examples/talk-llama/llama-model.cpp +7165 -2336
- data/ext/sources/examples/talk-llama/llama-model.h +60 -9
- data/ext/sources/examples/talk-llama/llama-quant.cpp +48 -10
- data/ext/sources/examples/talk-llama/llama-sampling.cpp +226 -126
- data/ext/sources/examples/talk-llama/llama-vocab.cpp +440 -13
- data/ext/sources/examples/talk-llama/llama-vocab.h +45 -0
- data/ext/sources/examples/talk-llama/llama.cpp +65 -10
- data/ext/sources/examples/talk-llama/llama.h +95 -177
- data/ext/sources/examples/talk-llama/talk-llama.cpp +9 -6
- data/ext/sources/examples/talk-llama/unicode.cpp +207 -0
- data/ext/sources/examples/talk-llama/unicode.h +45 -0
- data/ext/sources/examples/wchess/wchess.cmd/wchess.cmd.cpp +4 -2
- data/ext/sources/examples/whisper.wasm/index-tmpl.html +17 -16
- data/ext/sources/ggml/CMakeLists.txt +59 -31
- data/ext/sources/ggml/cmake/ggml-config.cmake.in +132 -93
- data/ext/sources/ggml/include/ggml-backend.h +17 -1
- data/ext/sources/ggml/include/ggml-cpu.h +1 -1
- data/ext/sources/ggml/include/ggml-metal.h +1 -6
- data/ext/sources/ggml/include/ggml-opt.h +25 -6
- data/ext/sources/ggml/include/ggml-webgpu.h +19 -0
- data/ext/sources/ggml/include/ggml-zdnn.h +17 -0
- data/ext/sources/ggml/include/ggml.h +221 -16
- data/ext/sources/ggml/src/CMakeLists.txt +17 -2
- data/ext/sources/ggml/src/ggml-alloc.c +265 -141
- data/ext/sources/ggml/src/ggml-backend-impl.h +4 -1
- data/ext/sources/ggml/src/ggml-backend-reg.cpp +30 -13
- data/ext/sources/ggml/src/ggml-backend.cpp +221 -38
- data/ext/sources/ggml/src/ggml-blas/CMakeLists.txt +1 -1
- data/ext/sources/ggml/src/ggml-blas/ggml-blas.cpp +5 -4
- data/ext/sources/ggml/src/ggml-cann/CMakeLists.txt +14 -0
- data/ext/sources/ggml/src/ggml-cann/acl_tensor.cpp +3 -1
- data/ext/sources/ggml/src/ggml-cann/aclnn_ops.cpp +903 -717
- data/ext/sources/ggml/src/ggml-cann/aclnn_ops.h +143 -25
- data/ext/sources/ggml/src/ggml-cann/common.h +143 -1
- data/ext/sources/ggml/src/ggml-cann/ggml-cann.cpp +488 -69
- data/ext/sources/ggml/src/ggml-common.h +17 -0
- data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +40 -18
- data/ext/sources/ggml/src/ggml-cpu/amx/amx.cpp +4 -2
- data/ext/sources/ggml/src/ggml-cpu/arch/arm/quants.c +132 -596
- data/ext/sources/ggml/src/ggml-cpu/arch/arm/repack.cpp +14 -286
- data/ext/sources/ggml/src/ggml-cpu/arch/loongarch/quants.c +103 -582
- data/ext/sources/ggml/src/ggml-cpu/arch/powerpc/quants.c +162 -589
- data/ext/sources/ggml/src/ggml-cpu/arch/riscv/quants.c +265 -437
- data/ext/sources/ggml/src/ggml-cpu/arch/riscv/repack.cpp +3 -58
- data/ext/sources/ggml/src/ggml-cpu/arch/s390/quants.c +521 -353
- data/ext/sources/ggml/src/ggml-cpu/arch/wasm/quants.c +54 -314
- data/ext/sources/ggml/src/ggml-cpu/arch/x86/quants.c +184 -675
- data/ext/sources/ggml/src/ggml-cpu/arch/x86/repack.cpp +4679 -1657
- data/ext/sources/ggml/src/ggml-cpu/arch-fallback.h +32 -2
- data/ext/sources/ggml/src/ggml-cpu/common.h +14 -0
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-impl.h +13 -6
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +70 -42
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.cpp +35 -28
- data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.cpp +152 -18
- data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.h +7 -1
- data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +227 -97
- data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +474 -1116
- data/ext/sources/ggml/src/ggml-cpu/ops.cpp +1587 -1177
- data/ext/sources/ggml/src/ggml-cpu/ops.h +5 -8
- data/ext/sources/ggml/src/ggml-cpu/quants.c +35 -0
- data/ext/sources/ggml/src/ggml-cpu/quants.h +8 -0
- data/ext/sources/ggml/src/ggml-cpu/repack.cpp +458 -47
- data/ext/sources/ggml/src/ggml-cpu/repack.h +22 -0
- data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +89 -60
- data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.cpp +1024 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.h +13 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp +3196 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/ime_kernels.h +26 -0
- data/ext/sources/ggml/src/ggml-cpu/traits.cpp +2 -2
- data/ext/sources/ggml/src/ggml-cpu/traits.h +1 -1
- data/ext/sources/ggml/src/ggml-cpu/vec.cpp +170 -26
- data/ext/sources/ggml/src/ggml-cpu/vec.h +506 -63
- data/ext/sources/ggml/src/ggml-cuda/CMakeLists.txt +20 -16
- data/ext/sources/ggml/src/ggml-cuda/add-id.cu +58 -0
- data/ext/sources/ggml/src/ggml-cuda/add-id.cuh +3 -0
- data/ext/sources/ggml/src/ggml-cuda/binbcast.cu +330 -191
- data/ext/sources/ggml/src/ggml-cuda/binbcast.cuh +2 -0
- data/ext/sources/ggml/src/ggml-cuda/common.cuh +250 -63
- data/ext/sources/ggml/src/ggml-cuda/conv-transpose-1d.cu +1 -4
- data/ext/sources/ggml/src/ggml-cuda/conv2d.cu +166 -0
- data/ext/sources/ggml/src/ggml-cuda/conv2d.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/convert.cu +95 -22
- data/ext/sources/ggml/src/ggml-cuda/convert.cuh +15 -0
- data/ext/sources/ggml/src/ggml-cuda/cpy-utils.cuh +217 -0
- data/ext/sources/ggml/src/ggml-cuda/cpy.cu +64 -307
- data/ext/sources/ggml/src/ggml-cuda/cross-entropy-loss.cu +2 -14
- data/ext/sources/ggml/src/ggml-cuda/dequantize.cuh +14 -40
- data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +498 -367
- data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +137 -91
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cu +755 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cuh +3 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn-vec.cuh +593 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +86 -50
- data/ext/sources/ggml/src/ggml-cuda/fattn.cu +185 -198
- data/ext/sources/ggml/src/ggml-cuda/fattn.cuh +2 -0
- data/ext/sources/ggml/src/ggml-cuda/getrows.cu +50 -39
- data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +379 -107
- data/ext/sources/ggml/src/ggml-cuda/im2col.cu +196 -35
- data/ext/sources/ggml/src/ggml-cuda/im2col.cuh +1 -0
- data/ext/sources/ggml/src/ggml-cuda/mean.cu +56 -2
- data/ext/sources/ggml/src/ggml-cuda/mma.cuh +198 -45
- data/ext/sources/ggml/src/ggml-cuda/mmf.cu +123 -0
- data/ext/sources/ggml/src/ggml-cuda/mmf.cuh +496 -0
- data/ext/sources/ggml/src/ggml-cuda/mmq.cu +206 -57
- data/ext/sources/ggml/src/ggml-cuda/mmq.cuh +1262 -721
- data/ext/sources/ggml/src/ggml-cuda/{mmv.cu → mmvf.cu} +53 -53
- data/ext/sources/ggml/src/ggml-cuda/{mmv.cuh → mmvf.cuh} +3 -3
- data/ext/sources/ggml/src/ggml-cuda/mmvq.cu +64 -73
- data/ext/sources/ggml/src/ggml-cuda/norm.cu +284 -12
- data/ext/sources/ggml/src/ggml-cuda/norm.cuh +7 -0
- data/ext/sources/ggml/src/ggml-cuda/opt-step-sgd.cu +49 -0
- data/ext/sources/ggml/src/ggml-cuda/opt-step-sgd.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/pad.cu +46 -23
- data/ext/sources/ggml/src/ggml-cuda/pad_reflect_1d.cu +91 -0
- data/ext/sources/ggml/src/ggml-cuda/pad_reflect_1d.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/quantize.cu +12 -10
- data/ext/sources/ggml/src/ggml-cuda/reduce_rows.cuh +53 -0
- data/ext/sources/ggml/src/ggml-cuda/roll.cu +67 -0
- data/ext/sources/ggml/src/ggml-cuda/roll.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/rope.cu +21 -27
- data/ext/sources/ggml/src/ggml-cuda/scale.cu +14 -11
- data/ext/sources/ggml/src/ggml-cuda/set-rows.cu +276 -0
- data/ext/sources/ggml/src/ggml-cuda/set-rows.cuh +7 -0
- data/ext/sources/ggml/src/ggml-cuda/softcap.cu +34 -0
- data/ext/sources/ggml/src/ggml-cuda/softcap.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/softmax.cu +126 -59
- data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cu +10 -2
- data/ext/sources/ggml/src/ggml-cuda/ssm-scan.cu +322 -100
- data/ext/sources/ggml/src/ggml-cuda/sum.cu +6 -10
- data/ext/sources/ggml/src/ggml-cuda/sumrows.cu +21 -4
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-f16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q4_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q4_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q5_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q5_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q8_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-f16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q5_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q5_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q8_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-f16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q4_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q4_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q5_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q5_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q8_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-f16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q4_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q4_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q5_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q5_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q8_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-f16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q4_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q4_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q5_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q5_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q8_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-f16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q4_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q4_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q5_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q5_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q8_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +21 -18
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_10.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_11.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_12.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_13.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_14.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_15.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_16.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_2.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_3.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_4.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_5.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_6.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_7.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_8.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_9.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-mxfp4.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/topk-moe.cu +259 -0
- data/ext/sources/ggml/src/ggml-cuda/topk-moe.cuh +14 -0
- data/ext/sources/ggml/src/ggml-cuda/tsembd.cu +3 -3
- data/ext/sources/ggml/src/ggml-cuda/unary.cu +90 -0
- data/ext/sources/ggml/src/ggml-cuda/unary.cuh +8 -0
- data/ext/sources/ggml/src/ggml-cuda/upscale.cu +92 -6
- data/ext/sources/ggml/src/ggml-cuda/vecdotq.cuh +110 -22
- data/ext/sources/ggml/src/ggml-cuda/vendors/cuda.h +4 -0
- data/ext/sources/ggml/src/ggml-cuda/vendors/hip.h +58 -36
- data/ext/sources/ggml/src/ggml-cuda/vendors/musa.h +4 -3
- data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +10 -2
- data/ext/sources/ggml/src/ggml-impl.h +119 -9
- data/ext/sources/ggml/src/ggml-metal/CMakeLists.txt +10 -7
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-common.cpp +446 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-common.h +52 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.h +33 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.m +600 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.cpp +1376 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.h +226 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.m +1308 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +136 -63
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.cpp +3158 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.h +82 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal.cpp +718 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal.metal +2854 -1503
- data/ext/sources/ggml/src/ggml-musa/CMakeLists.txt +18 -8
- data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +18 -0
- data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +2510 -242
- data/ext/sources/ggml/src/ggml-opencl/kernels/add.cl +107 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/add_id.cl +42 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/conv2d.cl +185 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/conv2d_f16_f32.cl +176 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/cvt.cl +84 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/div.cl +66 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/flash_attn_f16.cl +370 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/flash_attn_f32.cl +370 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/flash_attn_f32_f16.cl +373 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gelu.cl +27 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/glu.cl +177 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/group_norm.cl +49 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/im2col_f16.cl +1 -1
- data/ext/sources/ggml/src/ggml-opencl/kernels/im2col_f32.cl +1 -1
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul.cl +73 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mat_f16_f32.cl +130 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_l4_lm.cl +132 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_f32_f32_l4_lm.cl +133 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32.cl +189 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32_flat.cl +176 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32.cl +140 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32_flat.cl +222 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32.cl +144 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32_flat.cl +167 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32.cl +125 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32_flat.cl +202 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/norm.cl +80 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/rms_norm.cl +79 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/scale.cl +3 -2
- data/ext/sources/ggml/src/ggml-opencl/kernels/set_rows.cl +189 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_4_f16.cl +34 -13
- data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_4_f32.cl +34 -13
- data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_f16.cl +34 -13
- data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_f32.cl +34 -13
- data/ext/sources/ggml/src/ggml-opencl/kernels/sub.cl +66 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/transpose.cl +20 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/tsembd.cl +2 -2
- data/ext/sources/ggml/src/ggml-opencl/kernels/upscale.cl +2 -3
- data/ext/sources/ggml/src/ggml-opt.cpp +97 -41
- data/ext/sources/ggml/src/ggml-quants.c +111 -16
- data/ext/sources/ggml/src/ggml-quants.h +6 -0
- data/ext/sources/ggml/src/ggml-rpc/ggml-rpc.cpp +67 -47
- data/ext/sources/ggml/src/ggml-sycl/backend.hpp +2 -0
- data/ext/sources/ggml/src/ggml-sycl/binbcast.cpp +15 -5
- data/ext/sources/ggml/src/ggml-sycl/binbcast.hpp +6 -0
- data/ext/sources/ggml/src/ggml-sycl/concat.cpp +25 -16
- data/ext/sources/ggml/src/ggml-sycl/conv.cpp +10 -4
- data/ext/sources/ggml/src/ggml-sycl/convert.cpp +166 -99
- data/ext/sources/ggml/src/ggml-sycl/cpy.cpp +72 -306
- data/ext/sources/ggml/src/ggml-sycl/cpy.hpp +213 -1
- data/ext/sources/ggml/src/ggml-sycl/dmmv.cpp +67 -49
- data/ext/sources/ggml/src/ggml-sycl/dpct/helper.hpp +1 -31
- data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +79 -29
- data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +2 -0
- data/ext/sources/ggml/src/ggml-sycl/gemm.hpp +14 -26
- data/ext/sources/ggml/src/ggml-sycl/getrows.cpp +9 -6
- data/ext/sources/ggml/src/ggml-sycl/ggml-sycl.cpp +328 -323
- data/ext/sources/ggml/src/ggml-sycl/gla.cpp +2 -2
- data/ext/sources/ggml/src/ggml-sycl/im2col.cpp +2 -2
- data/ext/sources/ggml/src/ggml-sycl/mmq.cpp +80 -60
- data/ext/sources/ggml/src/ggml-sycl/mmvq.cpp +201 -132
- data/ext/sources/ggml/src/ggml-sycl/norm.cpp +74 -55
- data/ext/sources/ggml/src/ggml-sycl/quantize.hpp +133 -0
- data/ext/sources/ggml/src/ggml-sycl/quants.hpp +8 -9
- data/ext/sources/ggml/src/ggml-sycl/rope.cpp +35 -42
- data/ext/sources/ggml/src/ggml-sycl/set_rows.cpp +234 -0
- data/ext/sources/ggml/src/ggml-sycl/set_rows.hpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/softmax.cpp +3 -3
- data/ext/sources/ggml/src/ggml-sycl/tsembd.cpp +12 -6
- data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +2 -6
- data/ext/sources/ggml/src/ggml-sycl/wkv.cpp +16 -12
- data/ext/sources/ggml/src/ggml-vulkan/ggml-vulkan.cpp +3492 -883
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add.comp +41 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add_id.comp +42 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argmax.comp +13 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp +39 -29
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +349 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +66 -12
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.comp +154 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.comp +21 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_s.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xxs.comp +2 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_s.comp +6 -5
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_xxs.comp +4 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_mxfp4.comp +32 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q2_k.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q3_k.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_k.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_k.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q6_k.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/exp.comp +21 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +69 -24
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp +60 -20
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +98 -42
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +64 -27
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +74 -13
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu_erf.comp +27 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu_quick.comp +11 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gelu_erf.comp +39 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/generic_binary_head.comp +4 -17
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/get_rows.comp +19 -10
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/get_rows_quant.comp +25 -15
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.comp +4 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/hardsigmoid.comp +22 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/hardswish.comp +22 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +18 -14
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col_3d.comp +126 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.comp +65 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_nc.comp +11 -7
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp +140 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +144 -531
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +206 -38
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.comp +556 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +12 -5
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.comp +15 -9
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp +111 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_sgd.comp +22 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/pad.comp +24 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/quantize_q8_1.comp +53 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +55 -11
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_partials.comp +65 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/roll.comp +46 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.comp +1 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +7 -9
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +7 -9
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +7 -9
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rte.comp +5 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +29 -7
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_back.comp +4 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sqrt.comp +17 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.comp +38 -5
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/swiglu_oai.comp +14 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/timestep_embedding.comp +4 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/types.comp +101 -9
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +69 -5
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/utils.comp +25 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +335 -77
- data/ext/sources/ggml/src/ggml-webgpu/CMakeLists.txt +54 -0
- data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu.cpp +1558 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/add.tmpl.wgsl +44 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/add_in_place.tmpl.wgsl +41 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary_head.tmpl +45 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +930 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl +60 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +124 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/get_rows.tmpl.wgsl +874 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl +40 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul.tmpl.wgsl +44 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_in_place.tmpl.wgsl +41 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl +907 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm.wgsl +57 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm_in_place.wgsl +48 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +81 -0
- data/ext/sources/ggml/src/ggml-zdnn/CMakeLists.txt +36 -0
- data/ext/sources/ggml/src/ggml-zdnn/common.hpp +59 -0
- data/ext/sources/ggml/src/ggml-zdnn/ggml-zdnn.cpp +628 -0
- data/ext/sources/ggml/src/ggml-zdnn/mmf.cpp +80 -0
- data/ext/sources/ggml/src/ggml-zdnn/mmf.hpp +12 -0
- data/ext/sources/ggml/src/ggml-zdnn/utils.cpp +79 -0
- data/ext/sources/ggml/src/ggml-zdnn/utils.hpp +19 -0
- data/ext/sources/ggml/src/ggml.c +478 -98
- data/ext/sources/ggml/src/gguf.cpp +8 -1
- data/ext/sources/src/whisper.cpp +23 -46
- data/ext/sources/tests/CMakeLists.txt +8 -1
- data/ext/sources/tests/test-vad-full.cpp +3 -3
- data/ext/sources/tests/test-vad.cpp +2 -2
- data/lib/whisper/model/uri.rb +1 -1
- data/sig/whisper.rbs +7 -0
- data/test/test_params.rb +8 -0
- data/test/test_whisper.rb +1 -1
- data/whispercpp.gemspec +1 -1
- metadata +164 -157
- data/ext/sources/examples/talk-llama/llama-kv-cache-unified-iswa.cpp +0 -279
- data/ext/sources/examples/talk-llama/llama-kv-cache-unified.cpp +0 -1841
- data/ext/sources/examples/talk-llama/llama-kv-cache-unified.h +0 -303
- data/ext/sources/ggml/include/ggml-kompute.h +0 -50
- data/ext/sources/ggml/src/ggml-amx/CMakeLists.txt +0 -107
- data/ext/sources/ggml/src/ggml-amx/common.h +0 -94
- data/ext/sources/ggml/src/ggml-amx/ggml-amx.cpp +0 -446
- data/ext/sources/ggml/src/ggml-amx/mmq.cpp +0 -2510
- data/ext/sources/ggml/src/ggml-amx/mmq.h +0 -17
- data/ext/sources/ggml/src/ggml-cann/kernels/CMakeLists.txt +0 -30
- data/ext/sources/ggml/src/ggml-cann/kernels/ascendc_kernels.h +0 -19
- data/ext/sources/ggml/src/ggml-cann/kernels/dup.cpp +0 -234
- data/ext/sources/ggml/src/ggml-cann/kernels/get_row_f16.cpp +0 -197
- data/ext/sources/ggml/src/ggml-cann/kernels/get_row_f32.cpp +0 -190
- data/ext/sources/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -204
- data/ext/sources/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -191
- data/ext/sources/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -218
- data/ext/sources/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -216
- data/ext/sources/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -295
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f16.cu +0 -357
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f16.cuh +0 -3
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f32.cu +0 -365
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f32.cuh +0 -3
- data/ext/sources/ggml/src/ggml-cuda/fattn-vec-f16.cuh +0 -482
- data/ext/sources/ggml/src/ggml-cuda/fattn-vec-f32.cuh +0 -472
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-kompute/CMakeLists.txt +0 -166
- data/ext/sources/ggml/src/ggml-kompute/ggml-kompute.cpp +0 -2251
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/common.comp +0 -112
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_add.comp +0 -58
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_addrow.comp +0 -25
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f16.comp +0 -52
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f32.comp +0 -52
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f16.comp +0 -52
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f32.comp +0 -52
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_diagmask.comp +0 -30
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_gelu.comp +0 -22
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows.comp +0 -17
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f16.comp +0 -31
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f32.comp +0 -31
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_0.comp +0 -38
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_1.comp +0 -39
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q6_k.comp +0 -44
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul.comp +0 -52
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_f16.comp +0 -69
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_mat_f32.comp +0 -51
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_0.comp +0 -33
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_1.comp +0 -35
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_k.comp +0 -140
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q6_k.comp +0 -106
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q8_0.comp +0 -73
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n.comp +0 -52
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n_pre.comp +0 -28
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_norm.comp +0 -84
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_relu.comp +0 -21
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rmsnorm.comp +0 -53
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f16.comp +0 -52
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f32.comp +0 -52
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f16.comp +0 -52
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f32.comp +0 -52
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_scale.comp +0 -19
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_scale_8.comp +0 -23
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_silu.comp +0 -22
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_softmax.comp +0 -72
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/rope_common.comp +0 -71
- data/ext/sources/ggml/src/ggml-metal/ggml-metal.m +0 -6280
@@ -23,6 +23,27 @@
|
|
23
23
|
|
24
24
|
#define UNUSED GGML_UNUSED
|
25
25
|
|
26
|
+
#if defined(__VXE__) || defined(__VXE2__)
|
27
|
+
#define B1(c,s,n) 0x ## n ## c , 0x ## n ## s
|
28
|
+
#define B2(c,s,n) B1(c,s,n ## c), B1(c,s,n ## s)
|
29
|
+
#define B3(c,s,n) B2(c,s,n ## c), B2(c,s,n ## s)
|
30
|
+
#define B4(c,s,n) B3(c,s,n ## c), B3(c,s,n ## s)
|
31
|
+
#define B5(c,s,n) B4(c,s,n ## c), B4(c,s,n ## s)
|
32
|
+
#define B6(c,s,n) B5(c,s,n ## c), B5(c,s,n ## s)
|
33
|
+
#define B7(c,s,n) B6(c,s,n ## c), B6(c,s,n ## s)
|
34
|
+
#define B8(c,s ) B7(c,s, c), B7(c,s, s)
|
35
|
+
|
36
|
+
// precomputed tables for expanding 8bits to 8 bytes:
|
37
|
+
static const __attribute__((aligned(16))) uint64_t table_b2b_0[1 << 8] = { B8(00, 10) }; // ( b ) << 4
|
38
|
+
static const __attribute__((aligned(16))) uint64_t table_b2b_1[1 << 8] = { B8(10, 00) }; // (!b) << 4
|
39
|
+
|
40
|
+
// permute mask for byteswapping
|
41
|
+
static const uint8x16_t v_kperm = (const uint8x16_t){
|
42
|
+
7, 6, 5, 4, 3, 2, 1, 0,
|
43
|
+
15, 14, 13, 12, 11, 10, 9, 8
|
44
|
+
};
|
45
|
+
#endif
|
46
|
+
|
26
47
|
void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
|
27
48
|
assert(QK8_0 == 32);
|
28
49
|
assert(k % QK8_0 == 0);
|
@@ -32,9 +53,9 @@ void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
|
|
32
53
|
|
33
54
|
#if defined(__VXE__) || defined(__VXE2__)
|
34
55
|
for (int i = 0; i < nb; i++) {
|
35
|
-
|
36
|
-
|
37
|
-
|
56
|
+
float32x4_t srcv [8];
|
57
|
+
float32x4_t asrcv[8];
|
58
|
+
float32x4_t amaxv[8];
|
38
59
|
|
39
60
|
for (int j = 0; j < 8; j++) srcv[j] = vec_xl(0, x + i*32 + 4*j);
|
40
61
|
for (int j = 0; j < 8; j++) asrcv[j] = vec_abs(srcv[j]);
|
@@ -53,8 +74,9 @@ void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
|
|
53
74
|
y[i].d = GGML_CPU_FP32_TO_FP16(d);
|
54
75
|
|
55
76
|
for (int j = 0; j < 8; j++) {
|
56
|
-
const
|
57
|
-
|
77
|
+
const float32x4_t v = vec_mul(srcv[j], vec_splats(id));
|
78
|
+
/* Uses non-default rounding for vec_signed or vec_round */
|
79
|
+
const int32x4_t vi = vec_signed(__builtin_s390_vfisb(v, 4, 1));
|
58
80
|
|
59
81
|
y[i].qs[4*j + 0] = vec_extract(vi, 0);
|
60
82
|
y[i].qs[4*j + 1] = vec_extract(vi, 1);
|
@@ -77,9 +99,9 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
|
|
77
99
|
|
78
100
|
#if defined(__VXE__) || defined(__VXE2__)
|
79
101
|
for (int i = 0; i < nb; i++) {
|
80
|
-
|
81
|
-
|
82
|
-
|
102
|
+
float32x4_t srcv [8];
|
103
|
+
float32x4_t asrcv[8];
|
104
|
+
float32x4_t amaxv[8];
|
83
105
|
|
84
106
|
for (int j = 0; j < 8; j++) srcv[j] = vec_xl(0, x + i*32 + 4*j);
|
85
107
|
for (int j = 0; j < 8; j++) asrcv[j] = vec_abs(srcv[j]);
|
@@ -97,11 +119,12 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
|
|
97
119
|
|
98
120
|
y[i].d = GGML_CPU_FP32_TO_FP16(d);
|
99
121
|
|
100
|
-
|
122
|
+
int32x4_t acc = vec_splats(0);
|
101
123
|
|
102
124
|
for (int j = 0; j < 8; j++) {
|
103
|
-
const
|
104
|
-
|
125
|
+
const float32x4_t v = vec_mul(srcv[j], vec_splats(id));
|
126
|
+
/* Uses non-default rounding for vec_signed or vec_round */
|
127
|
+
const int32x4_t vi = vec_signed(__builtin_s390_vfisb(v, 4, 1));
|
105
128
|
|
106
129
|
y[i].qs[4*j + 0] = vec_extract(vi, 0);
|
107
130
|
y[i].qs[4*j + 1] = vec_extract(vi, 1);
|
@@ -141,55 +164,45 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
141
164
|
float sumf = 0;
|
142
165
|
|
143
166
|
#if defined(__VXE__) || defined(__VXE2__)
|
144
|
-
|
167
|
+
float32x4_t acc = vec_splats(0.0f);
|
145
168
|
|
146
|
-
const
|
147
|
-
const
|
169
|
+
const uint8x16_t v_m = vec_splats((const uint8_t)0x0F);
|
170
|
+
const int8x16_t v_s = vec_splats( (const int8_t)0x08);
|
148
171
|
|
149
172
|
for (; ib < nb; ++ib) {
|
150
|
-
const
|
151
|
-
const
|
152
|
-
const
|
173
|
+
const uint8x16_t v_x = vec_xl(0, x[ib].qs);
|
174
|
+
const int8x16_t v_xl = (const int8x16_t)(v_x & v_m);
|
175
|
+
const int8x16_t v_xh = (const int8x16_t)(v_x >> 4);
|
153
176
|
|
154
|
-
const
|
155
|
-
const
|
177
|
+
const int8x16_t v_xls = vec_sub(v_xl, v_s);
|
178
|
+
const int8x16_t v_xhs = vec_sub(v_xh, v_s);
|
156
179
|
|
157
|
-
const
|
158
|
-
const
|
180
|
+
const int8x16_t v_yl = vec_xl(0 , y[ib].qs);
|
181
|
+
const int8x16_t v_yh = vec_xl(QK8_0/2, y[ib].qs);
|
159
182
|
|
160
|
-
const
|
161
|
-
const
|
162
|
-
const
|
163
|
-
const
|
183
|
+
const int16x8_t v_xylso = vec_mulo(v_xls, v_yl);
|
184
|
+
const int16x8_t v_xylse = vec_mule(v_xls, v_yl);
|
185
|
+
const int16x8_t v_xyhso = vec_mulo(v_xhs, v_yh);
|
186
|
+
const int16x8_t v_xyhse = vec_mule(v_xhs, v_yh);
|
164
187
|
|
165
|
-
|
188
|
+
int16x8_t v_xy_ = v_xylso + v_xylse + v_xyhso + v_xyhse; v_xy_ += vec_reve(v_xy_);
|
166
189
|
|
167
|
-
const
|
168
|
-
const
|
190
|
+
const float32x4_t v_xy = vec_float(vec_unpackh(v_xy_));
|
191
|
+
const float32x4_t v_d = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d));
|
169
192
|
|
170
193
|
acc = vec_madd(v_xy, v_d, acc);
|
171
194
|
}
|
172
195
|
|
173
|
-
sumf = acc
|
174
|
-
|
175
|
-
#endif
|
176
|
-
for (; ib < nb; ++ib) {
|
177
|
-
int sumi0 = 0;
|
178
|
-
int sumi1 = 0;
|
179
|
-
|
180
|
-
for (int j = 0; j < qk/2; ++j) {
|
181
|
-
const int v0 = (x[ib].qs[j] & 0x0F) - 8;
|
182
|
-
const int v1 = (x[ib].qs[j] >> 4) - 8;
|
183
|
-
|
184
|
-
sumi0 += (v0 * y[ib].qs[j]);
|
185
|
-
sumi1 += (v1 * y[ib].qs[j + qk/2]);
|
186
|
-
}
|
187
|
-
|
188
|
-
int sumi = sumi0 + sumi1;
|
189
|
-
sumf += sumi*GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d);
|
190
|
-
}
|
191
|
-
|
196
|
+
sumf = vec_hsum_f32x4(acc);
|
192
197
|
*s = sumf;
|
198
|
+
#else
|
199
|
+
UNUSED(nb);
|
200
|
+
UNUSED(x);
|
201
|
+
UNUSED(y);
|
202
|
+
UNUSED(ib);
|
203
|
+
UNUSED(sumf);
|
204
|
+
ggml_vec_dot_q4_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
|
205
|
+
#endif
|
193
206
|
}
|
194
207
|
|
195
208
|
void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
@@ -237,26 +250,406 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
237
250
|
acc = vec_madd(v_xy, v_d, acc);
|
238
251
|
}
|
239
252
|
|
240
|
-
sumf = acc
|
253
|
+
sumf = vec_hsum_f32x4(acc) + summs;
|
254
|
+
*s = sumf;
|
255
|
+
#else
|
256
|
+
UNUSED(nb);
|
257
|
+
UNUSED(x);
|
258
|
+
UNUSED(y);
|
259
|
+
UNUSED(ib);
|
260
|
+
UNUSED(sumf);
|
261
|
+
ggml_vec_dot_q4_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc);
|
262
|
+
#endif
|
263
|
+
}
|
264
|
+
|
265
|
+
void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
266
|
+
assert(nrc == 1);
|
267
|
+
UNUSED(nrc);
|
268
|
+
UNUSED(bx);
|
269
|
+
UNUSED(by);
|
270
|
+
UNUSED(bs);
|
271
|
+
assert(n % QK_MXFP4 == 0);
|
272
|
+
static_assert(QK_MXFP4 == QK8_0, "QK_MXFP4 and QK8_0 must be the same");
|
273
|
+
|
274
|
+
const int qk = QK_MXFP4;
|
275
|
+
const int nb = n / qk;
|
276
|
+
|
277
|
+
const block_mxfp4 * GGML_RESTRICT x = vx;
|
278
|
+
const block_q8_0 * GGML_RESTRICT y = vy;
|
279
|
+
|
280
|
+
int ib = 0;
|
281
|
+
float sumf = 0.0f;
|
282
|
+
|
283
|
+
#if defined(__VXE__) || defined(__VXE2__)
|
284
|
+
const int8x16_t v_k = vec_xl(0, kvalues_mxfp4);
|
285
|
+
const uint8x16_t v_m = vec_splats((const uint8_t)0x0F);
|
286
|
+
|
287
|
+
float32x4_t v_acc = vec_splats(0.0f);
|
288
|
+
|
289
|
+
#pragma GCC unroll 8
|
290
|
+
for (; ib + 1 < nb; ib += 2) {
|
291
|
+
const block_mxfp4 * GGML_RESTRICT x0 = &x[ib + 0];
|
292
|
+
const block_mxfp4 * GGML_RESTRICT x1 = &x[ib + 1];
|
293
|
+
const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0];
|
294
|
+
const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1];
|
295
|
+
|
296
|
+
const uint8x16_t v_x0 = vec_xl(0, x0->qs);
|
297
|
+
const uint8x16_t v_x1 = vec_xl(0, x1->qs);
|
298
|
+
|
299
|
+
int8x16_t v_x0l = (int8x16_t)vec_and(v_x0, v_m);
|
300
|
+
int8x16_t v_x0h = (int8x16_t)vec_sr(v_x0, 4);
|
301
|
+
int8x16_t v_x1l = (int8x16_t)vec_and(v_x1, v_m);
|
302
|
+
int8x16_t v_x1h = (int8x16_t)vec_sr(v_x1, 4);
|
303
|
+
|
304
|
+
v_x0l = vec_perm(v_k, v_k, (uchar8x16_t)v_x0l);
|
305
|
+
v_x0h = vec_perm(v_k, v_k, (uchar8x16_t)v_x0h);
|
306
|
+
v_x1l = vec_perm(v_k, v_k, (uchar8x16_t)v_x1l);
|
307
|
+
v_x1h = vec_perm(v_k, v_k, (uchar8x16_t)v_x1h);
|
308
|
+
|
309
|
+
const int8x16_t v_y0l = vec_xl(0, y0->qs);
|
310
|
+
const int8x16_t v_y0h = vec_xl(QK8_0/2, y0->qs);
|
311
|
+
const int8x16_t v_y1l = vec_xl(0, y1->qs);
|
312
|
+
const int8x16_t v_y1h = vec_xl(QK8_0/2, y1->qs);
|
313
|
+
|
314
|
+
const int32x4_t v_xy0 = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_x0l, v_y0l), v_x0h, v_y0h);
|
315
|
+
const int32x4_t v_xy1 = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_x1l, v_y1l), v_x1h, v_y1h);
|
316
|
+
|
317
|
+
const float32x4_t v_xy0f = vec_float(v_xy0);
|
318
|
+
const float32x4_t v_xy1f = vec_float(v_xy1);
|
319
|
+
|
320
|
+
const float32x4_t v_d0 = vec_splats(GGML_E8M0_TO_FP32_HALF(x0->e) * GGML_CPU_FP16_TO_FP32(y0->d));
|
321
|
+
const float32x4_t v_d1 = vec_splats(GGML_E8M0_TO_FP32_HALF(x1->e) * GGML_CPU_FP16_TO_FP32(y1->d));
|
322
|
+
|
323
|
+
v_acc = vec_madd(v_xy0f, v_d0, v_acc);
|
324
|
+
v_acc = vec_madd(v_xy1f, v_d1, v_acc);
|
325
|
+
}
|
326
|
+
|
327
|
+
for (; ib < nb; ++ib) {
|
328
|
+
const block_mxfp4 * GGML_RESTRICT x0 = &x[ib + 0];
|
329
|
+
const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0];
|
330
|
+
|
331
|
+
const uint8x16_t v_x = vec_xl(0, x0->qs);
|
332
|
+
|
333
|
+
int8x16_t v_xl = (int8x16_t)vec_and(v_x, v_m);
|
334
|
+
int8x16_t v_xh = (int8x16_t)vec_sr(v_x, 4);
|
335
|
+
|
336
|
+
v_xl = vec_perm(v_k, v_k, (uchar8x16_t)v_xl);
|
337
|
+
v_xh = vec_perm(v_k, v_k, (uchar8x16_t)v_xh);
|
338
|
+
|
339
|
+
const int8x16_t v_yl = vec_xl(0, y0->qs);
|
340
|
+
const int8x16_t v_yh = vec_xl(QK8_0/2, y0->qs);
|
241
341
|
|
342
|
+
const int32x4_t v_xy = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_xl, v_yl), v_xh, v_yh);
|
343
|
+
const float32x4_t v_xyf = vec_float(v_xy);
|
344
|
+
|
345
|
+
const float32x4_t v_d = vec_splats(GGML_E8M0_TO_FP32_HALF(x0->e) * GGML_CPU_FP16_TO_FP32(y0->d));
|
346
|
+
v_acc = vec_madd(v_xyf, v_d, v_acc);
|
347
|
+
}
|
348
|
+
|
349
|
+
sumf = vec_hsum_f32x4(v_acc);
|
350
|
+
*s = sumf;
|
351
|
+
#else
|
352
|
+
UNUSED(x);
|
353
|
+
UNUSED(y);
|
354
|
+
UNUSED(ib);
|
355
|
+
UNUSED(sumf);
|
356
|
+
ggml_vec_dot_mxfp4_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
|
242
357
|
#endif
|
358
|
+
}
|
359
|
+
|
360
|
+
void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
361
|
+
const int qk = QK8_0;
|
362
|
+
const int nb = n / qk;
|
363
|
+
|
364
|
+
assert(n % qk == 0);
|
365
|
+
assert(qk == QK5_0);
|
366
|
+
assert(nrc == 1);
|
367
|
+
UNUSED(nrc);
|
368
|
+
UNUSED(bx);
|
369
|
+
UNUSED(by);
|
370
|
+
UNUSED(bs);
|
371
|
+
|
372
|
+
const block_q5_0 * GGML_RESTRICT x = vx;
|
373
|
+
const block_q8_0 * GGML_RESTRICT y = vy;
|
374
|
+
|
375
|
+
int ib = 0;
|
376
|
+
float sumf = 0.0f;
|
377
|
+
|
378
|
+
#if defined(__VXE__) || defined(__VXE2__)
|
379
|
+
float32x4_t v_sum0 = vec_splats(0.0f);
|
380
|
+
float32x4_t v_sum1 = vec_splats(0.0f);
|
381
|
+
|
382
|
+
uint32_t qh0, qh1;
|
383
|
+
uint64_t tmp0[4], tmp1[4];
|
384
|
+
|
385
|
+
const uint8x16_t v_m = vec_splats((uint8_t)0x0F);
|
386
|
+
|
387
|
+
#pragma GCC unroll 4
|
388
|
+
for (; ib + 1 < nb; ib += 2) {
|
389
|
+
const block_q5_0 * GGML_RESTRICT x0 = &x[ib + 0];
|
390
|
+
const block_q5_0 * GGML_RESTRICT x1 = &x[ib + 1];
|
391
|
+
const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0];
|
392
|
+
const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1];
|
393
|
+
|
394
|
+
memcpy(&qh0, x0->qh, sizeof(qh0));
|
395
|
+
memcpy(&qh1, x1->qh, sizeof(qh1));
|
396
|
+
|
397
|
+
tmp0[0] = table_b2b_1[(qh0 >> 0) & 0xFF];
|
398
|
+
tmp0[1] = table_b2b_1[(qh0 >> 8) & 0xFF];
|
399
|
+
tmp0[2] = table_b2b_1[(qh0 >> 16) & 0xFF];
|
400
|
+
tmp0[3] = table_b2b_1[(qh0 >> 24) ];
|
401
|
+
|
402
|
+
tmp1[0] = table_b2b_1[(qh1 >> 0) & 0xFF];
|
403
|
+
tmp1[1] = table_b2b_1[(qh1 >> 8) & 0xFF];
|
404
|
+
tmp1[2] = table_b2b_1[(qh1 >> 16) & 0xFF];
|
405
|
+
tmp1[3] = table_b2b_1[(qh1 >> 24) ];
|
406
|
+
|
407
|
+
int8x16_t v_qh0l = vec_xl(0, (const int8_t *)(tmp0 + 0));
|
408
|
+
int8x16_t v_qh0h = vec_xl(0, (const int8_t *)(tmp0 + 2));
|
409
|
+
int8x16_t v_qh1l = vec_xl(0, (const int8_t *)(tmp1 + 0));
|
410
|
+
int8x16_t v_qh1h = vec_xl(0, (const int8_t *)(tmp1 + 2));
|
411
|
+
|
412
|
+
// required for fixing the byteorder
|
413
|
+
v_qh0l = vec_perm(v_qh0l, v_qh0l, v_kperm);
|
414
|
+
v_qh0h = vec_perm(v_qh0h, v_qh0h, v_kperm);
|
415
|
+
v_qh1l = vec_perm(v_qh1l, v_qh1l, v_kperm);
|
416
|
+
v_qh1h = vec_perm(v_qh1h, v_qh1h, v_kperm);
|
417
|
+
|
418
|
+
const uint8x16_t v_x0 = vec_xl(0, (const uint8_t *)x0->qs);
|
419
|
+
const uint8x16_t v_x1 = vec_xl(0, (const uint8_t *)x1->qs);
|
420
|
+
|
421
|
+
int8x16_t v_x0l = (int8x16_t)vec_and(v_x0, v_m);
|
422
|
+
int8x16_t v_x0h = (int8x16_t)vec_sr(v_x0, 4);
|
423
|
+
int8x16_t v_x1l = (int8x16_t)vec_and(v_x1, v_m);
|
424
|
+
int8x16_t v_x1h = (int8x16_t)vec_sr(v_x1, 4);
|
425
|
+
|
426
|
+
const int8x16_t v_x0lf = vec_sub(v_x0l, v_qh0l);
|
427
|
+
const int8x16_t v_x0hf = vec_sub(v_x0h, v_qh0h);
|
428
|
+
const int8x16_t v_x1lf = vec_sub(v_x1l, v_qh1l);
|
429
|
+
const int8x16_t v_x1hf = vec_sub(v_x1h, v_qh1h);
|
430
|
+
|
431
|
+
const int8x16_t v_y0l = vec_xl(0, (const int8_t *)y0->qs);
|
432
|
+
const int8x16_t v_y0h = vec_xl(QK8_0/2, (const int8_t *)y0->qs);
|
433
|
+
const int8x16_t v_y1l = vec_xl(0, (const int8_t *)y1->qs);
|
434
|
+
const int8x16_t v_y1h = vec_xl(QK8_0/2, (const int8_t *)y1->qs);
|
435
|
+
|
436
|
+
const int32x4_t v_xy0 = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_x0lf, v_y0l), v_x0hf, v_y0h);
|
437
|
+
const int32x4_t v_xy1 = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_x1lf, v_y1l), v_x1hf, v_y1h);
|
438
|
+
|
439
|
+
const float32x4_t v_xy0f = vec_float(v_xy0);
|
440
|
+
const float32x4_t v_xy1f = vec_float(v_xy1);
|
441
|
+
|
442
|
+
const float32x4_t v_d0 = vec_splats(GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d));
|
443
|
+
const float32x4_t v_d1 = vec_splats(GGML_CPU_FP16_TO_FP32(x1->d) * GGML_CPU_FP16_TO_FP32(y1->d));
|
444
|
+
|
445
|
+
v_sum0 = vec_madd(v_xy0f, v_d0, v_sum0);
|
446
|
+
v_sum1 = vec_madd(v_xy1f, v_d1, v_sum1);
|
447
|
+
}
|
448
|
+
|
449
|
+
sumf += vec_hsum_f32x4(v_sum0) + vec_hsum_f32x4(v_sum1);
|
450
|
+
|
451
|
+
#pragma GCC unroll 4
|
243
452
|
for (; ib < nb; ++ib) {
|
244
|
-
|
245
|
-
|
453
|
+
const block_q5_0 * GGML_RESTRICT x0 = &x[ib];
|
454
|
+
const block_q8_0 * GGML_RESTRICT y0 = &y[ib];
|
246
455
|
|
247
|
-
|
248
|
-
|
249
|
-
const int v1 = (x[ib].qs[j] >> 4);
|
456
|
+
uint32_t qh;
|
457
|
+
memcpy(&qh, x0->qh, sizeof(qh));
|
250
458
|
|
251
|
-
|
252
|
-
|
253
|
-
|
459
|
+
uint64_t tmp[4];
|
460
|
+
tmp[0] = table_b2b_1[(qh >> 0) & 0xFF];
|
461
|
+
tmp[1] = table_b2b_1[(qh >> 8) & 0xFF];
|
462
|
+
tmp[2] = table_b2b_1[(qh >> 16) & 0xFF];
|
463
|
+
tmp[3] = table_b2b_1[(qh >> 24) ];
|
464
|
+
|
465
|
+
int8x16_t v_qhl = vec_xl(0, (const int8_t *)(tmp + 0));
|
466
|
+
int8x16_t v_qhh = vec_xl(0, (const int8_t *)(tmp + 2));
|
467
|
+
|
468
|
+
// required for fixing the byteorder
|
469
|
+
v_qhl = vec_perm(v_qhl, v_qhl, v_kperm);
|
470
|
+
v_qhh = vec_perm(v_qhh, v_qhh, v_kperm);
|
471
|
+
|
472
|
+
const uint8x16_t v_x = vec_xl(0, (const uint8_t *)x0->qs);
|
473
|
+
int8x16_t v_xl = (int8x16_t)vec_and(v_x, v_m);
|
474
|
+
int8x16_t v_xh = (int8x16_t)vec_sr(v_x, 4);
|
475
|
+
|
476
|
+
const int8x16_t v_xlf = vec_sub(v_xl, v_qhl);
|
477
|
+
const int8x16_t v_xhf = vec_sub(v_xh, v_qhh);
|
478
|
+
|
479
|
+
const int8x16_t v_yl = vec_xl(0, (const int8_t *)y0->qs);
|
480
|
+
const int8x16_t v_yh = vec_xl(QK8_0/2, (const int8_t *)y0->qs);
|
254
481
|
|
255
|
-
|
256
|
-
|
482
|
+
const int32x4_t v_xy = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_xlf, v_yl), v_xhf, v_yh);
|
483
|
+
const float32x4_t v_xyf = vec_float(v_xy);
|
484
|
+
|
485
|
+
const float32x4_t v_d = vec_splats(GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d));
|
486
|
+
const float32x4_t v_acc = vec_madd(v_xyf, v_d, vec_splats(0.0f));
|
487
|
+
|
488
|
+
sumf += vec_hsum_f32x4(v_acc);
|
257
489
|
}
|
258
490
|
|
259
491
|
*s = sumf;
|
492
|
+
#else
|
493
|
+
UNUSED(nb);
|
494
|
+
UNUSED(x);
|
495
|
+
UNUSED(y);
|
496
|
+
UNUSED(ib);
|
497
|
+
UNUSED(sumf);
|
498
|
+
ggml_vec_dot_q5_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
|
499
|
+
#endif
|
500
|
+
}
|
501
|
+
|
502
|
+
void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
503
|
+
const int qk = QK8_1;
|
504
|
+
const int nb = n / qk;
|
505
|
+
|
506
|
+
assert(n % qk == 0);
|
507
|
+
assert(qk == QK5_1);
|
508
|
+
assert(nrc == 1);
|
509
|
+
UNUSED(nrc);
|
510
|
+
UNUSED(bx);
|
511
|
+
UNUSED(by);
|
512
|
+
UNUSED(bs);
|
513
|
+
|
514
|
+
const block_q5_1 * GGML_RESTRICT x = vx;
|
515
|
+
const block_q8_1 * GGML_RESTRICT y = vy;
|
516
|
+
|
517
|
+
int ib = 0;
|
518
|
+
float sumf = 0.0f;
|
519
|
+
|
520
|
+
#if defined(__VXE__) || defined(__VXE2__)
|
521
|
+
float32x4_t v_sum0 = vec_splats(0.0f);
|
522
|
+
float32x4_t v_sum1 = vec_splats(0.0f);
|
523
|
+
|
524
|
+
float summs0 = 0.0f;
|
525
|
+
float summs1 = 0.0f;
|
526
|
+
|
527
|
+
uint32_t qh0;
|
528
|
+
uint32_t qh1;
|
529
|
+
|
530
|
+
uint64_t tmp0[4];
|
531
|
+
uint64_t tmp1[4];
|
532
|
+
|
533
|
+
const uint8x16_t v_m = vec_splats((uint8_t)0x0F);
|
534
|
+
|
535
|
+
#pragma GCC unroll 4
|
536
|
+
for (; ib + 1 < nb; ib += 2) {
|
537
|
+
const block_q5_1 * GGML_RESTRICT x0 = &x[ib + 0];
|
538
|
+
const block_q5_1 * GGML_RESTRICT x1 = &x[ib + 1];
|
539
|
+
const block_q8_1 * GGML_RESTRICT y0 = &y[ib + 0];
|
540
|
+
const block_q8_1 * GGML_RESTRICT y1 = &y[ib + 1];
|
541
|
+
|
542
|
+
summs0 += GGML_CPU_FP16_TO_FP32(x0->m) * GGML_CPU_FP16_TO_FP32(y0->s);
|
543
|
+
summs1 += GGML_CPU_FP16_TO_FP32(x1->m) * GGML_CPU_FP16_TO_FP32(y1->s);
|
544
|
+
|
545
|
+
memcpy(&qh0, x0->qh, sizeof(qh0));
|
546
|
+
memcpy(&qh1, x1->qh, sizeof(qh1));
|
547
|
+
|
548
|
+
tmp0[0] = table_b2b_0[(qh0 >> 0) & 0xFF];
|
549
|
+
tmp0[1] = table_b2b_0[(qh0 >> 8) & 0xFF];
|
550
|
+
tmp0[2] = table_b2b_0[(qh0 >> 16) & 0xFF];
|
551
|
+
tmp0[3] = table_b2b_0[(qh0 >> 24) ];
|
552
|
+
|
553
|
+
tmp1[0] = table_b2b_0[(qh1 >> 0) & 0xFF];
|
554
|
+
tmp1[1] = table_b2b_0[(qh1 >> 8) & 0xFF];
|
555
|
+
tmp1[2] = table_b2b_0[(qh1 >> 16) & 0xFF];
|
556
|
+
tmp1[3] = table_b2b_0[(qh1 >> 24) ];
|
557
|
+
|
558
|
+
int8x16_t v_qh0l = vec_xl(0, (const int8_t *)(tmp0 + 0));
|
559
|
+
int8x16_t v_qh0h = vec_xl(0, (const int8_t *)(tmp0 + 2));
|
560
|
+
int8x16_t v_qh1l = vec_xl(0, (const int8_t *)(tmp1 + 0));
|
561
|
+
int8x16_t v_qh1h = vec_xl(0, (const int8_t *)(tmp1 + 2));
|
562
|
+
|
563
|
+
// required for fixing the byteorder
|
564
|
+
v_qh0l = vec_perm(v_qh0l, v_qh0l, v_kperm);
|
565
|
+
v_qh0h = vec_perm(v_qh0h, v_qh0h, v_kperm);
|
566
|
+
v_qh1l = vec_perm(v_qh1l, v_qh1l, v_kperm);
|
567
|
+
v_qh1h = vec_perm(v_qh1h, v_qh1h, v_kperm);
|
568
|
+
|
569
|
+
const uint8x16_t v_x0 = vec_xl(0, x0->qs);
|
570
|
+
const uint8x16_t v_x1 = vec_xl(0, x1->qs);
|
571
|
+
|
572
|
+
const int8x16_t v_x0l = (int8x16_t)vec_and(v_x0, v_m);
|
573
|
+
const int8x16_t v_x0h = (int8x16_t)vec_sr(v_x0, 4);
|
574
|
+
const int8x16_t v_x1l = (int8x16_t)vec_and(v_x1, v_m);
|
575
|
+
const int8x16_t v_x1h = (int8x16_t)vec_sr(v_x1, 4);
|
576
|
+
|
577
|
+
const int8x16_t v_x0lf = vec_or(v_x0l, v_qh0l);
|
578
|
+
const int8x16_t v_x0hf = vec_or(v_x0h, v_qh0h);
|
579
|
+
const int8x16_t v_x1lf = vec_or(v_x1l, v_qh1l);
|
580
|
+
const int8x16_t v_x1hf = vec_or(v_x1h, v_qh1h);
|
581
|
+
|
582
|
+
const int8x16_t v_y0l = vec_xl(0 , y0->qs);
|
583
|
+
const int8x16_t v_y0h = vec_xl(QK8_1/2, y0->qs);
|
584
|
+
const int8x16_t v_y1l = vec_xl(0 , y1->qs);
|
585
|
+
const int8x16_t v_y1h = vec_xl(QK8_1/2, y1->qs);
|
586
|
+
|
587
|
+
const int32x4_t v_xy0 = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_x0lf, v_y0l), v_x0hf, v_y0h);
|
588
|
+
const int32x4_t v_xy1 = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_x1lf, v_y1l), v_x1hf, v_y1h);
|
589
|
+
|
590
|
+
const float32x4_t v_xy0f = vec_float(v_xy0);
|
591
|
+
const float32x4_t v_xy1f = vec_float(v_xy1);
|
592
|
+
|
593
|
+
const float32x4_t v_d0 = vec_splats(GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d));
|
594
|
+
const float32x4_t v_d1 = vec_splats(GGML_CPU_FP16_TO_FP32(x1->d) * GGML_CPU_FP16_TO_FP32(y1->d));
|
595
|
+
|
596
|
+
v_sum0 = vec_madd(v_xy0f, v_d0, v_sum0);
|
597
|
+
v_sum1 = vec_madd(v_xy1f, v_d1, v_sum1);
|
598
|
+
}
|
599
|
+
|
600
|
+
sumf += vec_hsum_f32x4(v_sum0) + vec_hsum_f32x4(v_sum1) + summs0 + summs1;
|
601
|
+
|
602
|
+
#pragma GCC unroll 4
|
603
|
+
for (; ib < nb; ++ib) {
|
604
|
+
const block_q5_1 * GGML_RESTRICT x0 = &x[ib];
|
605
|
+
const block_q8_1 * GGML_RESTRICT y0 = &y[ib];
|
606
|
+
|
607
|
+
float summs = GGML_CPU_FP16_TO_FP32(x0->m) * GGML_CPU_FP16_TO_FP32(y0->s);
|
608
|
+
|
609
|
+
uint32_t qh;
|
610
|
+
memcpy(&qh, x0->qh, sizeof(qh));
|
611
|
+
|
612
|
+
uint64_t tmp[4];
|
613
|
+
tmp[0] = table_b2b_0[(qh >> 0) & 0xFF];
|
614
|
+
tmp[1] = table_b2b_0[(qh >> 8) & 0xFF];
|
615
|
+
tmp[2] = table_b2b_0[(qh >> 16) & 0xFF];
|
616
|
+
tmp[3] = table_b2b_0[(qh >> 24) ];
|
617
|
+
|
618
|
+
int8x16_t v_qhl = vec_xl(0, (const int8_t *)(tmp + 0));
|
619
|
+
int8x16_t v_qhh = vec_xl(0, (const int8_t *)(tmp + 2));
|
620
|
+
|
621
|
+
// required for fixing the byteorder
|
622
|
+
v_qhl = vec_perm(v_qhl, v_qhl, v_kperm);
|
623
|
+
v_qhh = vec_perm(v_qhh, v_qhh, v_kperm);
|
624
|
+
|
625
|
+
const uint8x16_t v_x = vec_xl(0, x0->qs);
|
626
|
+
const int8x16_t v_xl = (int8x16_t)vec_and(v_x, v_m);
|
627
|
+
const int8x16_t v_xh = (int8x16_t)vec_sr(v_x, 4);
|
628
|
+
|
629
|
+
const int8x16_t v_xlf = vec_or(v_xl, v_qhl);
|
630
|
+
const int8x16_t v_xhf = vec_or(v_xh, v_qhh);
|
631
|
+
|
632
|
+
const int8x16_t v_yl = vec_xl(0 , y0->qs);
|
633
|
+
const int8x16_t v_yh = vec_xl(QK8_1/2, y0->qs);
|
634
|
+
|
635
|
+
const int32x4_t v_xy = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_xlf, v_yl), v_xhf, v_yh);
|
636
|
+
const float32x4_t v_xyf = vec_float(v_xy);
|
637
|
+
|
638
|
+
const float32x4_t v_d = vec_splats(GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d));
|
639
|
+
const float32x4_t v_acc = vec_madd(v_xyf, v_d, vec_splats(0.0f)); // zero addend: v_acc must not be read in its own initializer (indeterminate value)
|
640
|
+
|
641
|
+
sumf += vec_hsum_f32x4(v_acc) + summs;
|
642
|
+
}
|
643
|
+
|
644
|
+
*s = sumf;
|
645
|
+
#else
|
646
|
+
UNUSED(nb);
|
647
|
+
UNUSED(x);
|
648
|
+
UNUSED(y);
|
649
|
+
UNUSED(ib);
|
650
|
+
UNUSED(sumf);
|
651
|
+
ggml_vec_dot_q5_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc);
|
652
|
+
#endif
|
260
653
|
}
|
261
654
|
|
262
655
|
void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
@@ -277,7 +670,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
277
670
|
float sumf = 0;
|
278
671
|
|
279
672
|
#if defined(__VXE__) || defined(__VXE2__)
|
280
|
-
|
673
|
+
float32x4_t acc = vec_splats(0.0f);
|
281
674
|
|
282
675
|
#pragma GCC unroll 8
|
283
676
|
for (; ib < nb; ++ib) {
|
@@ -296,20 +689,17 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
296
689
|
acc = vec_madd(v_xy, v_d, acc);
|
297
690
|
}
|
298
691
|
|
299
|
-
sumf = acc
|
300
|
-
|
301
|
-
#endif
|
302
|
-
for (; ib < nb; ++ib) {
|
303
|
-
int sumi = 0;
|
304
|
-
|
305
|
-
for (int j = 0; j < qk; j++) {
|
306
|
-
sumi += x[ib].qs[j]*y[ib].qs[j];
|
307
|
-
}
|
308
|
-
|
309
|
-
sumf += sumi*(GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d));
|
310
|
-
}
|
692
|
+
sumf = vec_hsum_f32x4(acc);
|
311
693
|
|
312
694
|
*s = sumf;
|
695
|
+
#else
|
696
|
+
UNUSED(nb);
|
697
|
+
UNUSED(x);
|
698
|
+
UNUSED(y);
|
699
|
+
UNUSED(ib);
|
700
|
+
UNUSED(sumf);
|
701
|
+
ggml_vec_dot_q8_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
|
702
|
+
#endif
|
313
703
|
}
|
314
704
|
|
315
705
|
void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
@@ -343,7 +733,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
343
733
|
uint8x16_t q3h[4];
|
344
734
|
uint8x16_t q3b[2];
|
345
735
|
int8x16_t q3bytes[4];
|
346
|
-
int8x16_t q8bytes[
|
736
|
+
int8x16_t q8bytes[8];
|
347
737
|
uint8x16_t qhbits[2];
|
348
738
|
|
349
739
|
float sum = 0;
|
@@ -423,10 +813,10 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
423
813
|
isum2 = ggml_vec_dot(v_z, q3bytes[2], q8bytes[6]);
|
424
814
|
isum3 = ggml_vec_dot(v_z, q3bytes[3], q8bytes[7]);
|
425
815
|
|
426
|
-
isum += (isum0
|
427
|
-
isum += (isum1
|
428
|
-
isum += (isum2
|
429
|
-
isum += (isum3
|
816
|
+
isum += vec_hsum_i32x4(isum0) * scale[0];
|
817
|
+
isum += vec_hsum_i32x4(isum1) * scale[1];
|
818
|
+
isum += vec_hsum_i32x4(isum2) * scale[2];
|
819
|
+
isum += vec_hsum_i32x4(isum3) * scale[3];
|
430
820
|
|
431
821
|
scale += 4;
|
432
822
|
|
@@ -442,70 +832,13 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
442
832
|
*s = sum;
|
443
833
|
|
444
834
|
#else
|
445
|
-
|
446
|
-
|
447
|
-
|
448
|
-
|
449
|
-
|
450
|
-
|
451
|
-
// write vectorized versions for AVX, ARM_NEON, etc.
|
452
|
-
|
453
|
-
int8_t aux8[QK_K];
|
454
|
-
int16_t aux16[8];
|
455
|
-
float sums [8];
|
456
|
-
int32_t aux32[8];
|
457
|
-
memset(sums, 0, 8*sizeof(float));
|
458
|
-
|
459
|
-
uint32_t auxs[4];
|
460
|
-
const int8_t * scales = (const int8_t*)auxs;
|
461
|
-
|
462
|
-
float sumf = 0;
|
463
|
-
for (int i = 0; i < nb; ++i) {
|
464
|
-
const uint8_t * GGML_RESTRICT q3 = x[i].qs;
|
465
|
-
const uint8_t * GGML_RESTRICT hm = x[i].hmask;
|
466
|
-
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
467
|
-
memset(aux32, 0, 8*sizeof(int32_t));
|
468
|
-
int8_t * GGML_RESTRICT a = aux8;
|
469
|
-
uint8_t m = 1;
|
470
|
-
for (int j = 0; j < QK_K; j += 128) {
|
471
|
-
for (int l = 0; l < 32; ++l) a[l] = q3[l] & 3;
|
472
|
-
for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
|
473
|
-
a += 32; m <<= 1;
|
474
|
-
for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 2) & 3;
|
475
|
-
for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
|
476
|
-
a += 32; m <<= 1;
|
477
|
-
for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 4) & 3;
|
478
|
-
for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
|
479
|
-
a += 32; m <<= 1;
|
480
|
-
for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 6) & 3;
|
481
|
-
for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
|
482
|
-
a += 32; m <<= 1;
|
483
|
-
q3 += 32;
|
484
|
-
}
|
485
|
-
a = aux8;
|
486
|
-
|
487
|
-
memcpy(auxs, x[i].scales, 12);
|
488
|
-
uint32_t tmp = auxs[2];
|
489
|
-
auxs[2] = ((auxs[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4);
|
490
|
-
auxs[3] = ((auxs[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4);
|
491
|
-
auxs[0] = (auxs[0] & kmask2) | (((tmp >> 0) & kmask1) << 4);
|
492
|
-
auxs[1] = (auxs[1] & kmask2) | (((tmp >> 2) & kmask1) << 4);
|
493
|
-
for (int j = 0; j < QK_K/16; ++j) {
|
494
|
-
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
495
|
-
for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
|
496
|
-
q8 += 8; a += 8;
|
497
|
-
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
498
|
-
for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
|
499
|
-
q8 += 8; a += 8;
|
500
|
-
}
|
501
|
-
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
502
|
-
for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
|
503
|
-
}
|
504
|
-
for (int l = 0; l < 8; ++l) sumf += sums[l];
|
505
|
-
*s = sumf;
|
506
|
-
|
835
|
+
UNUSED(kmask1);
|
836
|
+
UNUSED(kmask2);
|
837
|
+
UNUSED(x);
|
838
|
+
UNUSED(y);
|
839
|
+
UNUSED(nb);
|
840
|
+
ggml_vec_dot_q3_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
507
841
|
#endif
|
508
|
-
|
509
842
|
}
|
510
843
|
|
511
844
|
void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
@@ -581,7 +914,7 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
581
914
|
v_xl[1] = (int8x16_t)vec_and(v_x[1], v_lm);
|
582
915
|
|
583
916
|
const int32x4_t p1 = ggml_vec_dot(ggml_vec_dot(v_z, v_xl[0], v_y[0]), v_xl[1], v_y[1]);
|
584
|
-
sumi1 += (p1
|
917
|
+
sumi1 += vec_hsum_i32x4(p1) * scales[2*j+0];
|
585
918
|
|
586
919
|
v_y[0] = vec_xl(0 , y0);
|
587
920
|
v_y[1] = vec_xl(16, y0);
|
@@ -591,7 +924,7 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
591
924
|
v_xl[1] = (int8x16_t)vec_sr(v_x[1], 4);
|
592
925
|
|
593
926
|
const int32x4_t p2 = ggml_vec_dot(ggml_vec_dot(v_z, v_xl[0], v_y[0]), v_xl[1], v_y[1]);
|
594
|
-
sumi2 += (p2
|
927
|
+
sumi2 += vec_hsum_i32x4(p2) * scales[2*j+1];
|
595
928
|
}
|
596
929
|
|
597
930
|
sumf += d * (sumi1 + sumi2);
|
@@ -600,61 +933,14 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
600
933
|
*s = sumf;
|
601
934
|
|
602
935
|
#else
|
603
|
-
|
604
|
-
|
605
|
-
|
606
|
-
|
607
|
-
|
608
|
-
|
609
|
-
|
610
|
-
|
611
|
-
memset(sums, 0, 8*sizeof(float));
|
612
|
-
|
613
|
-
float sumf = 0;
|
614
|
-
for (int i = 0; i < nb; ++i) {
|
615
|
-
const uint8_t * GGML_RESTRICT q4 = x[i].qs;
|
616
|
-
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
617
|
-
memset(aux32, 0, 8*sizeof(int32_t));
|
618
|
-
int8_t * GGML_RESTRICT a = aux8;
|
619
|
-
for (int j = 0; j < QK_K/64; ++j) {
|
620
|
-
for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
|
621
|
-
a += 32;
|
622
|
-
for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4);
|
623
|
-
a += 32; q4 += 32;
|
624
|
-
}
|
625
|
-
memcpy(utmp, x[i].scales, 12);
|
626
|
-
utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
|
627
|
-
const uint32_t uaux = utmp[1] & kmask1;
|
628
|
-
utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
|
629
|
-
utmp[2] = uaux;
|
630
|
-
utmp[0] &= kmask1;
|
631
|
-
|
632
|
-
int sumi = 0;
|
633
|
-
for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2];
|
634
|
-
a = aux8;
|
635
|
-
int is = 0;
|
636
|
-
for (int j = 0; j < QK_K/32; ++j) {
|
637
|
-
int32_t scale = scales[is++];
|
638
|
-
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
639
|
-
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
640
|
-
q8 += 8; a += 8;
|
641
|
-
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
642
|
-
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
643
|
-
q8 += 8; a += 8;
|
644
|
-
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
645
|
-
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
646
|
-
q8 += 8; a += 8;
|
647
|
-
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
648
|
-
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
649
|
-
q8 += 8; a += 8;
|
650
|
-
}
|
651
|
-
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
652
|
-
for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
|
653
|
-
const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
|
654
|
-
sumf -= dmin * sumi;
|
655
|
-
}
|
656
|
-
for (int l = 0; l < 8; ++l) sumf += sums[l];
|
657
|
-
*s = sumf;
|
936
|
+
UNUSED(x);
|
937
|
+
UNUSED(y);
|
938
|
+
UNUSED(nb);
|
939
|
+
UNUSED(kmask1);
|
940
|
+
UNUSED(kmask2);
|
941
|
+
UNUSED(kmask3);
|
942
|
+
UNUSED(utmp);
|
943
|
+
ggml_vec_dot_q4_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
658
944
|
#endif
|
659
945
|
}
|
660
946
|
|
@@ -720,7 +1006,7 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
720
1006
|
const int32x4_t v_minsho = vec_mulo(v_ysums, v_minsh);
|
721
1007
|
const int32x4_t v_minshe = vec_mule(v_ysums, v_minsh);
|
722
1008
|
const int32x4_t v_mins = vec_add(v_minsho, v_minshe);
|
723
|
-
const int32_t mins = v_mins
|
1009
|
+
const int32_t mins = vec_hsum_i32x4(v_mins);
|
724
1010
|
|
725
1011
|
const uint8_t * scales = (const uint8_t *)utmp;
|
726
1012
|
const uint8_t * GGML_RESTRICT x0l = x[i].qs;
|
@@ -757,8 +1043,8 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
757
1043
|
int32x4_t sumi0 = ggml_vec_dot(ggml_vec_dot(v_z, q5b[0], v_y[0]), q5b[1], v_y[1]);
|
758
1044
|
int32x4_t sumi1 = ggml_vec_dot(ggml_vec_dot(v_z, q5b[2], v_y[2]), q5b[3], v_y[3]);
|
759
1045
|
|
760
|
-
sumi += (sumi0
|
761
|
-
sumi += (sumi1
|
1046
|
+
sumi += vec_hsum_i32x4(sumi0) * *scales++;
|
1047
|
+
sumi += vec_hsum_i32x4(sumi1) * *scales++;
|
762
1048
|
}
|
763
1049
|
|
764
1050
|
sumf += d * sumi - dmin * mins;
|
@@ -767,66 +1053,14 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
767
1053
|
*s = sumf;
|
768
1054
|
|
769
1055
|
#else
|
770
|
-
|
771
|
-
|
772
|
-
|
773
|
-
|
774
|
-
|
775
|
-
|
776
|
-
|
777
|
-
|
778
|
-
memset(sums, 0, 8*sizeof(float));
|
779
|
-
|
780
|
-
float sumf = 0;
|
781
|
-
for (int i = 0; i < nb; ++i) {
|
782
|
-
const uint8_t * GGML_RESTRICT q4 = x[i].qs;
|
783
|
-
const uint8_t * GGML_RESTRICT hm = x[i].qh;
|
784
|
-
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
785
|
-
memset(aux32, 0, 8*sizeof(int32_t));
|
786
|
-
int8_t * GGML_RESTRICT a = aux8;
|
787
|
-
uint8_t m = 1;
|
788
|
-
for (int j = 0; j < QK_K/64; ++j) {
|
789
|
-
for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
|
790
|
-
for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0);
|
791
|
-
a += 32; m <<= 1;
|
792
|
-
for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4);
|
793
|
-
for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0);
|
794
|
-
a += 32; m <<= 1;
|
795
|
-
q4 += 32;
|
796
|
-
}
|
797
|
-
memcpy(utmp, x[i].scales, 12);
|
798
|
-
utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
|
799
|
-
const uint32_t uaux = utmp[1] & kmask1;
|
800
|
-
utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
|
801
|
-
utmp[2] = uaux;
|
802
|
-
utmp[0] &= kmask1;
|
803
|
-
|
804
|
-
int sumi = 0;
|
805
|
-
for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2];
|
806
|
-
a = aux8;
|
807
|
-
int is = 0;
|
808
|
-
for (int j = 0; j < QK_K/32; ++j) {
|
809
|
-
int32_t scale = scales[is++];
|
810
|
-
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
811
|
-
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
812
|
-
q8 += 8; a += 8;
|
813
|
-
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
814
|
-
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
815
|
-
q8 += 8; a += 8;
|
816
|
-
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
817
|
-
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
818
|
-
q8 += 8; a += 8;
|
819
|
-
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
820
|
-
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
821
|
-
q8 += 8; a += 8;
|
822
|
-
}
|
823
|
-
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
824
|
-
for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
|
825
|
-
const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
|
826
|
-
sumf -= dmin * sumi;
|
827
|
-
}
|
828
|
-
for (int l = 0; l < 8; ++l) sumf += sums[l];
|
829
|
-
*s = sumf;
|
1056
|
+
UNUSED(x);
|
1057
|
+
UNUSED(y);
|
1058
|
+
UNUSED(nb);
|
1059
|
+
UNUSED(kmask1);
|
1060
|
+
UNUSED(kmask2);
|
1061
|
+
UNUSED(kmask3);
|
1062
|
+
UNUSED(utmp);
|
1063
|
+
ggml_vec_dot_q5_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
830
1064
|
#endif
|
831
1065
|
}
|
832
1066
|
|
@@ -881,7 +1115,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
881
1115
|
const int32x4_t v_minshe = vec_mule(v_ysumsh, v_scaleh);
|
882
1116
|
const int32x4_t v_mins = v_minslo + v_minsle + v_minsho + v_minshe;
|
883
1117
|
|
884
|
-
const int32_t mins = v_mins
|
1118
|
+
const int32_t mins = vec_hsum_i32x4(v_mins);
|
885
1119
|
|
886
1120
|
int32_t isum = 0;
|
887
1121
|
for (int j = 0; j < QK_K/128; ++j) {
|
@@ -921,10 +1155,10 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
921
1155
|
int32x4_t summs2 = ggml_vec_dot(v_z, q6b[2], v_y[2]);
|
922
1156
|
int32x4_t summs3 = ggml_vec_dot(v_z, q6b[3], v_y[3]);
|
923
1157
|
|
924
|
-
isum += (summs0
|
925
|
-
(summs1
|
926
|
-
(summs2
|
927
|
-
(summs3
|
1158
|
+
isum += vec_hsum_i32x4(summs0) * scale[0] +
|
1159
|
+
vec_hsum_i32x4(summs1) * scale[1] +
|
1160
|
+
vec_hsum_i32x4(summs2) * scale[2] +
|
1161
|
+
vec_hsum_i32x4(summs3) * scale[3];
|
928
1162
|
|
929
1163
|
scale += 4;
|
930
1164
|
|
@@ -955,10 +1189,10 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
955
1189
|
summs2 = ggml_vec_dot(v_z, q6b[2], v_y[2]);
|
956
1190
|
summs3 = ggml_vec_dot(v_z, q6b[3], v_y[3]);
|
957
1191
|
|
958
|
-
isum += (summs0
|
959
|
-
(summs1
|
960
|
-
(summs2
|
961
|
-
(summs3
|
1192
|
+
isum += vec_hsum_i32x4(summs0) * scale[0] +
|
1193
|
+
vec_hsum_i32x4(summs1) * scale[1] +
|
1194
|
+
vec_hsum_i32x4(summs2) * scale[2] +
|
1195
|
+
vec_hsum_i32x4(summs3) * scale[3];
|
962
1196
|
|
963
1197
|
scale += 4;
|
964
1198
|
}
|
@@ -969,47 +1203,10 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
969
1203
|
*s = sum;
|
970
1204
|
|
971
1205
|
#else
|
972
|
-
|
973
|
-
|
974
|
-
|
975
|
-
|
976
|
-
int32_t aux32[8];
|
977
|
-
memset(sums, 0, 8*sizeof(float));
|
978
|
-
|
979
|
-
float sumf = 0;
|
980
|
-
for (int i = 0; i < nb; ++i) {
|
981
|
-
const uint8_t * GGML_RESTRICT q4 = x[i].ql;
|
982
|
-
const uint8_t * GGML_RESTRICT qh = x[i].qh;
|
983
|
-
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
984
|
-
memset(aux32, 0, 8*sizeof(int32_t));
|
985
|
-
int8_t * GGML_RESTRICT a = aux8;
|
986
|
-
for (int j = 0; j < QK_K; j += 128) {
|
987
|
-
for (int l = 0; l < 32; ++l) {
|
988
|
-
a[l + 0] = (int8_t)((q4[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
|
989
|
-
a[l + 32] = (int8_t)((q4[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
|
990
|
-
a[l + 64] = (int8_t)((q4[l + 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32;
|
991
|
-
a[l + 96] = (int8_t)((q4[l + 32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32;
|
992
|
-
}
|
993
|
-
a += 128;
|
994
|
-
q4 += 64;
|
995
|
-
qh += 32;
|
996
|
-
}
|
997
|
-
a = aux8;
|
998
|
-
int is = 0;
|
999
|
-
for (int j = 0; j < QK_K/16; ++j) {
|
1000
|
-
int scale = x[i].scales[is++];
|
1001
|
-
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
1002
|
-
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
1003
|
-
q8 += 8; a += 8;
|
1004
|
-
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
1005
|
-
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
1006
|
-
q8 += 8; a += 8;
|
1007
|
-
}
|
1008
|
-
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
1009
|
-
for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
|
1010
|
-
}
|
1011
|
-
for (int l = 0; l < 8; ++l) sumf += sums[l];
|
1012
|
-
*s = sumf;
|
1206
|
+
UNUSED(x);
|
1207
|
+
UNUSED(y);
|
1208
|
+
UNUSED(nb);
|
1209
|
+
ggml_vec_dot_q6_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
1013
1210
|
#endif
|
1014
1211
|
}
|
1015
1212
|
|
@@ -1183,20 +1380,18 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const v
|
|
1183
1380
|
const int8x16_t v_yh = vec_xl(QK8_0/2, y0->qs);
|
1184
1381
|
const int32x4_t v_xy = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_xl, v_yl), v_xh, v_yh);
|
1185
1382
|
|
1186
|
-
sumf += GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d) * (v_xy
|
1383
|
+
sumf += GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d) * vec_hsum_i32x4(v_xy);
|
1187
1384
|
}
|
1188
1385
|
|
1189
|
-
#endif
|
1190
|
-
for (; ib < nb; ++ib) {
|
1191
|
-
const float d = GGML_CPU_FP16_TO_FP32(y[ib].d)*GGML_CPU_FP16_TO_FP32(x[ib].d);
|
1192
|
-
int sumi1 = 0, sumi2 = 0;
|
1193
|
-
for (int j = 0; j < QK4_NL/2; ++j) {
|
1194
|
-
sumi1 += y[ib].qs[j+ 0] * kvalues_iq4nl[x[ib].qs[j] & 0xf];
|
1195
|
-
sumi2 += y[ib].qs[j+QK4_NL/2] * kvalues_iq4nl[x[ib].qs[j] >> 4];
|
1196
|
-
}
|
1197
|
-
sumf += d * (sumi1 + sumi2);
|
1198
|
-
}
|
1199
1386
|
*s = sumf;
|
1387
|
+
#else
|
1388
|
+
UNUSED(x);
|
1389
|
+
UNUSED(y);
|
1390
|
+
UNUSED(nb);
|
1391
|
+
UNUSED(ib);
|
1392
|
+
UNUSED(sumf);
|
1393
|
+
ggml_vec_dot_iq4_nl_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
|
1394
|
+
#endif
|
1200
1395
|
}
|
1201
1396
|
|
1202
1397
|
void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
@@ -1254,8 +1449,8 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
|
|
1254
1449
|
|
1255
1450
|
h >>= 4;
|
1256
1451
|
|
1257
|
-
sumi1 += (vsumi0
|
1258
|
-
sumi2 += (vsumi1
|
1452
|
+
sumi1 += vec_hsum_i32x4(vsumi0) * ls1;
|
1453
|
+
sumi2 += vec_hsum_i32x4(vsumi1) * ls2;
|
1259
1454
|
}
|
1260
1455
|
|
1261
1456
|
sumf += GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d * (sumi1 + sumi2);
|
@@ -1264,37 +1459,10 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
|
|
1264
1459
|
*s = sumf;
|
1265
1460
|
|
1266
1461
|
#else
|
1267
|
-
|
1268
|
-
|
1269
|
-
|
1270
|
-
|
1271
|
-
const uint8_t * qs = x[ibl].qs;
|
1272
|
-
const int8_t * q8 = y[ibl].qs;
|
1273
|
-
for (int ib = 0; ib < QK_K/32; ib += 2) {
|
1274
|
-
const uint8_t ls1 = (x[ibl].scales_l[ib/2] & 0xf) | ((h << 4) & 0x30);
|
1275
|
-
const uint8_t ls2 = (x[ibl].scales_l[ib/2] >> 4) | ((h << 2) & 0x30);
|
1276
|
-
h >>= 4;
|
1277
|
-
const float d1 = d4d8*(ls1 - 32);
|
1278
|
-
const float d2 = d4d8*(ls2 - 32);
|
1279
|
-
int sumi1 = 0, sumi2 = 0;
|
1280
|
-
for (int j = 0; j < 16; ++j) {
|
1281
|
-
sumi1 += q8[j+ 0] * kvalues_iq4nl[qs[j] & 0xf];
|
1282
|
-
sumi2 += q8[j+16] * kvalues_iq4nl[qs[j] >> 4];
|
1283
|
-
}
|
1284
|
-
sumf += d1 * (sumi1 + sumi2);
|
1285
|
-
qs += 16;
|
1286
|
-
q8 += 32;
|
1287
|
-
sumi1 = sumi2 = 0;
|
1288
|
-
for (int j = 0; j < 16; ++j) {
|
1289
|
-
sumi1 += q8[j+ 0] * kvalues_iq4nl[qs[j] & 0xf];
|
1290
|
-
sumi2 += q8[j+16] * kvalues_iq4nl[qs[j] >> 4];
|
1291
|
-
}
|
1292
|
-
sumf += d2 * (sumi1 + sumi2);
|
1293
|
-
qs += 16;
|
1294
|
-
q8 += 32;
|
1295
|
-
}
|
1296
|
-
}
|
1297
|
-
*s = sumf;
|
1462
|
+
UNUSED(x);
|
1463
|
+
UNUSED(y);
|
1464
|
+
UNUSED(nb);
|
1465
|
+
ggml_vec_dot_iq4_xs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
1298
1466
|
#endif
|
1299
1467
|
}
|
1300
1468
|
|