whispercpp 1.3.3 → 1.3.5
This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +60 -43
- data/ext/extconf.rb +2 -2
- data/ext/ruby_whisper.c +14 -2
- data/ext/ruby_whisper.h +39 -0
- data/ext/ruby_whisper_context.c +22 -22
- data/ext/ruby_whisper_model.c +12 -12
- data/ext/ruby_whisper_params.c +79 -25
- data/ext/ruby_whisper_segment.c +84 -19
- data/ext/ruby_whisper_token.c +351 -0
- data/ext/ruby_whisper_transcribe.cpp +1 -1
- data/ext/ruby_whisper_vad_context.c +75 -0
- data/ext/ruby_whisper_vad_context_detect.cpp +50 -0
- data/ext/ruby_whisper_vad_segment.c +139 -0
- data/ext/ruby_whisper_vad_segments.c +106 -0
- data/ext/sources/CMakeLists.txt +4 -1
- data/ext/sources/bindings/javascript/package.json +1 -1
- data/ext/sources/cmake/arm64-apple-clang.cmake +16 -0
- data/ext/sources/cmake/arm64-windows-llvm.cmake +16 -0
- data/ext/sources/cmake/riscv64-spacemit-linux-gnu-gcc.cmake +29 -0
- data/ext/sources/cmake/x64-windows-llvm.cmake +5 -0
- data/ext/sources/examples/CMakeLists.txt +1 -0
- data/ext/sources/examples/addon.node/addon.cpp +19 -19
- data/ext/sources/examples/addon.node/index.js +7 -5
- data/ext/sources/examples/addon.node/vad-example.js +2 -2
- data/ext/sources/examples/bench/bench.cpp +26 -16
- data/ext/sources/examples/bench.wasm/index-tmpl.html +10 -9
- data/ext/sources/examples/cli/cli.cpp +122 -111
- data/ext/sources/examples/command/command.cpp +26 -24
- data/ext/sources/examples/command.wasm/index-tmpl.html +5 -4
- data/ext/sources/examples/common-ggml.cpp +2 -0
- data/ext/sources/examples/lsp/CMakeLists.txt +2 -1
- data/ext/sources/examples/lsp/lsp.cpp +19 -17
- data/ext/sources/examples/quantize/CMakeLists.txt +2 -1
- data/ext/sources/examples/server/server.cpp +34 -24
- data/ext/sources/examples/server.py +6 -1
- data/ext/sources/examples/stream/stream.cpp +4 -2
- data/ext/sources/examples/stream.wasm/emscripten.cpp +6 -6
- data/ext/sources/examples/stream.wasm/index-tmpl.html +82 -5
- data/ext/sources/examples/talk-llama/CMakeLists.txt +7 -3
- data/ext/sources/examples/talk-llama/llama-adapter.cpp +113 -7
- data/ext/sources/examples/talk-llama/llama-adapter.h +13 -1
- data/ext/sources/examples/talk-llama/llama-arch.cpp +2136 -1491
- data/ext/sources/examples/talk-llama/llama-arch.h +125 -3
- data/ext/sources/examples/talk-llama/llama-batch.cpp +174 -100
- data/ext/sources/examples/talk-llama/llama-batch.h +46 -20
- data/ext/sources/examples/talk-llama/llama-chat.cpp +199 -8
- data/ext/sources/examples/talk-llama/llama-chat.h +11 -0
- data/ext/sources/examples/talk-llama/llama-context.cpp +1213 -413
- data/ext/sources/examples/talk-llama/llama-context.h +99 -36
- data/ext/sources/examples/talk-llama/llama-cparams.h +5 -4
- data/ext/sources/examples/talk-llama/llama-grammar.cpp +288 -53
- data/ext/sources/examples/talk-llama/llama-grammar.h +22 -1
- data/ext/sources/examples/talk-llama/llama-graph.cpp +883 -294
- data/ext/sources/examples/talk-llama/llama-graph.h +361 -161
- data/ext/sources/examples/talk-llama/llama-hparams.cpp +144 -6
- data/ext/sources/examples/talk-llama/llama-hparams.h +100 -23
- data/ext/sources/examples/talk-llama/llama-impl.cpp +7 -3
- data/ext/sources/examples/talk-llama/llama-impl.h +3 -1
- data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.cpp +328 -0
- data/ext/sources/examples/talk-llama/{llama-kv-cache-unified-iswa.h → llama-kv-cache-iswa.h} +38 -29
- data/ext/sources/examples/talk-llama/llama-kv-cache.cpp +2100 -0
- data/ext/sources/examples/talk-llama/llama-kv-cache.h +373 -27
- data/ext/sources/examples/talk-llama/llama-kv-cells.h +124 -30
- data/ext/sources/examples/talk-llama/llama-memory-hybrid.cpp +63 -41
- data/ext/sources/examples/talk-llama/llama-memory-hybrid.h +30 -29
- data/ext/sources/examples/talk-llama/llama-memory-recurrent.cpp +77 -35
- data/ext/sources/examples/talk-llama/llama-memory-recurrent.h +15 -16
- data/ext/sources/examples/talk-llama/llama-memory.h +16 -10
- data/ext/sources/examples/talk-llama/llama-mmap.cpp +172 -37
- data/ext/sources/examples/talk-llama/llama-mmap.h +8 -3
- data/ext/sources/examples/talk-llama/llama-model-loader.cpp +93 -9
- data/ext/sources/examples/talk-llama/llama-model-loader.h +9 -2
- data/ext/sources/examples/talk-llama/llama-model-saver.cpp +3 -0
- data/ext/sources/examples/talk-llama/llama-model.cpp +3369 -10145
- data/ext/sources/examples/talk-llama/llama-model.h +104 -12
- data/ext/sources/examples/talk-llama/llama-quant.cpp +53 -30
- data/ext/sources/examples/talk-llama/llama-sampling.cpp +1520 -324
- data/ext/sources/examples/talk-llama/llama-sampling.h +19 -7
- data/ext/sources/examples/talk-llama/llama-vocab.cpp +562 -39
- data/ext/sources/examples/talk-llama/llama-vocab.h +50 -0
- data/ext/sources/examples/talk-llama/llama.cpp +794 -12
- data/ext/sources/examples/talk-llama/llama.h +246 -190
- data/ext/sources/examples/talk-llama/models/afmoe.cpp +191 -0
- data/ext/sources/examples/talk-llama/models/apertus.cpp +125 -0
- data/ext/sources/examples/talk-llama/models/arcee.cpp +135 -0
- data/ext/sources/examples/talk-llama/models/arctic.cpp +138 -0
- data/ext/sources/examples/talk-llama/models/arwkv7.cpp +86 -0
- data/ext/sources/examples/talk-llama/models/baichuan.cpp +122 -0
- data/ext/sources/examples/talk-llama/models/bailingmoe.cpp +144 -0
- data/ext/sources/examples/talk-llama/models/bailingmoe2.cpp +135 -0
- data/ext/sources/examples/talk-llama/models/bert.cpp +178 -0
- data/ext/sources/examples/talk-llama/models/bitnet.cpp +160 -0
- data/ext/sources/examples/talk-llama/models/bloom.cpp +101 -0
- data/ext/sources/examples/talk-llama/models/chameleon.cpp +178 -0
- data/ext/sources/examples/talk-llama/models/chatglm.cpp +132 -0
- data/ext/sources/examples/talk-llama/models/codeshell.cpp +111 -0
- data/ext/sources/examples/talk-llama/models/cogvlm.cpp +102 -0
- data/ext/sources/examples/talk-llama/models/cohere2-iswa.cpp +134 -0
- data/ext/sources/examples/talk-llama/models/command-r.cpp +122 -0
- data/ext/sources/examples/talk-llama/models/dbrx.cpp +123 -0
- data/ext/sources/examples/talk-llama/models/deci.cpp +135 -0
- data/ext/sources/examples/talk-llama/models/deepseek.cpp +144 -0
- data/ext/sources/examples/talk-llama/models/deepseek2.cpp +259 -0
- data/ext/sources/examples/talk-llama/models/dots1.cpp +134 -0
- data/ext/sources/examples/talk-llama/models/dream.cpp +105 -0
- data/ext/sources/examples/talk-llama/models/ernie4-5-moe.cpp +150 -0
- data/ext/sources/examples/talk-llama/models/ernie4-5.cpp +110 -0
- data/ext/sources/examples/talk-llama/models/exaone.cpp +114 -0
- data/ext/sources/examples/talk-llama/models/exaone4.cpp +123 -0
- data/ext/sources/examples/talk-llama/models/falcon-h1.cpp +113 -0
- data/ext/sources/examples/talk-llama/models/falcon.cpp +120 -0
- data/ext/sources/examples/talk-llama/models/gemma-embedding.cpp +116 -0
- data/ext/sources/examples/talk-llama/models/gemma.cpp +112 -0
- data/ext/sources/examples/talk-llama/models/gemma2-iswa.cpp +128 -0
- data/ext/sources/examples/talk-llama/models/gemma3.cpp +155 -0
- data/ext/sources/examples/talk-llama/models/gemma3n-iswa.cpp +384 -0
- data/ext/sources/examples/talk-llama/models/glm4-moe.cpp +170 -0
- data/ext/sources/examples/talk-llama/models/glm4.cpp +150 -0
- data/ext/sources/examples/talk-llama/models/gpt2.cpp +105 -0
- data/ext/sources/examples/talk-llama/models/gptneox.cpp +144 -0
- data/ext/sources/examples/talk-llama/models/granite-hybrid.cpp +196 -0
- data/ext/sources/examples/talk-llama/models/granite.cpp +211 -0
- data/ext/sources/examples/talk-llama/models/graph-context-mamba.cpp +283 -0
- data/ext/sources/examples/talk-llama/models/grok.cpp +159 -0
- data/ext/sources/examples/talk-llama/models/grovemoe.cpp +141 -0
- data/ext/sources/examples/talk-llama/models/hunyuan-dense.cpp +132 -0
- data/ext/sources/examples/talk-llama/models/hunyuan-moe.cpp +154 -0
- data/ext/sources/examples/talk-llama/models/internlm2.cpp +120 -0
- data/ext/sources/examples/talk-llama/models/jais.cpp +86 -0
- data/ext/sources/examples/talk-llama/models/jamba.cpp +106 -0
- data/ext/sources/examples/talk-llama/models/lfm2.cpp +175 -0
- data/ext/sources/examples/talk-llama/models/llada-moe.cpp +122 -0
- data/ext/sources/examples/talk-llama/models/llada.cpp +99 -0
- data/ext/sources/examples/talk-llama/models/llama-iswa.cpp +178 -0
- data/ext/sources/examples/talk-llama/models/llama.cpp +168 -0
- data/ext/sources/examples/talk-llama/models/maincoder.cpp +117 -0
- data/ext/sources/examples/talk-llama/models/mamba.cpp +55 -0
- data/ext/sources/examples/talk-llama/models/mimo2-iswa.cpp +123 -0
- data/ext/sources/examples/talk-llama/models/minicpm3.cpp +199 -0
- data/ext/sources/examples/talk-llama/models/minimax-m2.cpp +124 -0
- data/ext/sources/examples/talk-llama/models/mistral3.cpp +160 -0
- data/ext/sources/examples/talk-llama/models/models.h +569 -0
- data/ext/sources/examples/talk-llama/models/modern-bert.cpp +116 -0
- data/ext/sources/examples/talk-llama/models/mpt.cpp +126 -0
- data/ext/sources/examples/talk-llama/models/nemotron-h.cpp +150 -0
- data/ext/sources/examples/talk-llama/models/nemotron.cpp +122 -0
- data/ext/sources/examples/talk-llama/models/neo-bert.cpp +104 -0
- data/ext/sources/examples/talk-llama/models/olmo.cpp +121 -0
- data/ext/sources/examples/talk-llama/models/olmo2.cpp +150 -0
- data/ext/sources/examples/talk-llama/models/olmoe.cpp +124 -0
- data/ext/sources/examples/talk-llama/models/openai-moe-iswa.cpp +127 -0
- data/ext/sources/examples/talk-llama/models/openelm.cpp +124 -0
- data/ext/sources/examples/talk-llama/models/orion.cpp +123 -0
- data/ext/sources/examples/talk-llama/models/pangu-embedded.cpp +121 -0
- data/ext/sources/examples/talk-llama/models/phi2.cpp +121 -0
- data/ext/sources/examples/talk-llama/models/phi3.cpp +152 -0
- data/ext/sources/examples/talk-llama/models/plamo.cpp +110 -0
- data/ext/sources/examples/talk-llama/models/plamo2.cpp +316 -0
- data/ext/sources/examples/talk-llama/models/plamo3.cpp +128 -0
- data/ext/sources/examples/talk-llama/models/plm.cpp +168 -0
- data/ext/sources/examples/talk-llama/models/qwen.cpp +108 -0
- data/ext/sources/examples/talk-llama/models/qwen2.cpp +126 -0
- data/ext/sources/examples/talk-llama/models/qwen2moe.cpp +151 -0
- data/ext/sources/examples/talk-llama/models/qwen2vl.cpp +117 -0
- data/ext/sources/examples/talk-llama/models/qwen3.cpp +117 -0
- data/ext/sources/examples/talk-llama/models/qwen3moe.cpp +124 -0
- data/ext/sources/examples/talk-llama/models/qwen3next.cpp +873 -0
- data/ext/sources/examples/talk-llama/models/qwen3vl-moe.cpp +149 -0
- data/ext/sources/examples/talk-llama/models/qwen3vl.cpp +141 -0
- data/ext/sources/examples/talk-llama/models/refact.cpp +94 -0
- data/ext/sources/examples/talk-llama/models/rnd1.cpp +126 -0
- data/ext/sources/examples/talk-llama/models/rwkv6-base.cpp +162 -0
- data/ext/sources/examples/talk-llama/models/rwkv6.cpp +94 -0
- data/ext/sources/examples/talk-llama/models/rwkv6qwen2.cpp +86 -0
- data/ext/sources/examples/talk-llama/models/rwkv7-base.cpp +135 -0
- data/ext/sources/examples/talk-llama/models/rwkv7.cpp +90 -0
- data/ext/sources/examples/talk-llama/models/seed-oss.cpp +124 -0
- data/ext/sources/examples/talk-llama/models/smallthinker.cpp +126 -0
- data/ext/sources/examples/talk-llama/models/smollm3.cpp +128 -0
- data/ext/sources/examples/talk-llama/models/stablelm.cpp +146 -0
- data/ext/sources/examples/talk-llama/models/starcoder.cpp +100 -0
- data/ext/sources/examples/talk-llama/models/starcoder2.cpp +121 -0
- data/ext/sources/examples/talk-llama/models/t5-dec.cpp +166 -0
- data/ext/sources/examples/talk-llama/models/t5-enc.cpp +96 -0
- data/ext/sources/examples/talk-llama/models/wavtokenizer-dec.cpp +149 -0
- data/ext/sources/examples/talk-llama/models/xverse.cpp +108 -0
- data/ext/sources/examples/talk-llama/talk-llama.cpp +9 -6
- data/ext/sources/examples/talk-llama/unicode.cpp +309 -16
- data/ext/sources/examples/talk-llama/unicode.h +45 -0
- data/ext/sources/examples/vad-speech-segments/CMakeLists.txt +1 -1
- data/ext/sources/examples/wchess/wchess.cmd/wchess.cmd.cpp +4 -2
- data/ext/sources/examples/whisper.wasm/index-tmpl.html +18 -17
- data/ext/sources/ggml/CMakeLists.txt +135 -79
- data/ext/sources/ggml/cmake/ggml-config.cmake.in +132 -93
- data/ext/sources/ggml/include/ggml-alloc.h +9 -0
- data/ext/sources/ggml/include/ggml-backend.h +21 -2
- data/ext/sources/ggml/include/ggml-cpu.h +2 -1
- data/ext/sources/ggml/include/ggml-hexagon.h +19 -0
- data/ext/sources/ggml/include/ggml-metal.h +1 -6
- data/ext/sources/ggml/include/ggml-opt.h +25 -6
- data/ext/sources/ggml/include/ggml-rpc.h +8 -11
- data/ext/sources/ggml/include/ggml-webgpu.h +19 -0
- data/ext/sources/ggml/include/ggml-zdnn.h +17 -0
- data/ext/sources/ggml/include/ggml-zendnn.h +22 -0
- data/ext/sources/ggml/include/ggml.h +406 -23
- data/ext/sources/ggml/src/CMakeLists.txt +99 -13
- data/ext/sources/ggml/src/ggml-alloc.c +368 -161
- data/ext/sources/ggml/src/ggml-backend-impl.h +5 -5
- data/ext/sources/ggml/src/ggml-backend-reg.cpp +55 -14
- data/ext/sources/ggml/src/ggml-backend.cpp +290 -57
- data/ext/sources/ggml/src/ggml-blas/CMakeLists.txt +17 -3
- data/ext/sources/ggml/src/ggml-blas/ggml-blas.cpp +10 -13
- data/ext/sources/ggml/src/ggml-cann/CMakeLists.txt +14 -0
- data/ext/sources/ggml/src/ggml-cann/acl_tensor.cpp +59 -45
- data/ext/sources/ggml/src/ggml-cann/acl_tensor.h +138 -47
- data/ext/sources/ggml/src/ggml-cann/aclnn_ops.cpp +2586 -1917
- data/ext/sources/ggml/src/ggml-cann/aclnn_ops.h +348 -309
- data/ext/sources/ggml/src/ggml-cann/common.h +350 -133
- data/ext/sources/ggml/src/ggml-cann/ggml-cann.cpp +894 -625
- data/ext/sources/ggml/src/ggml-common.h +17 -0
- data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +167 -75
- data/ext/sources/ggml/src/ggml-cpu/amx/amx.cpp +5 -2
- data/ext/sources/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +4 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/arm/quants.c +560 -622
- data/ext/sources/ggml/src/ggml-cpu/arch/arm/repack.cpp +1002 -270
- data/ext/sources/ggml/src/ggml-cpu/arch/loongarch/quants.c +107 -587
- data/ext/sources/ggml/src/ggml-cpu/arch/powerpc/quants.c +162 -589
- data/ext/sources/ggml/src/ggml-cpu/arch/riscv/cpu-feats.cpp +38 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/riscv/quants.c +373 -486
- data/ext/sources/ggml/src/ggml-cpu/arch/riscv/repack.cpp +3 -58
- data/ext/sources/ggml/src/ggml-cpu/arch/s390/cpu-feats.cpp +50 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/s390/quants.c +521 -353
- data/ext/sources/ggml/src/ggml-cpu/arch/wasm/quants.c +54 -314
- data/ext/sources/ggml/src/ggml-cpu/arch/x86/quants.c +184 -675
- data/ext/sources/ggml/src/ggml-cpu/arch/x86/repack.cpp +4682 -1660
- data/ext/sources/ggml/src/ggml-cpu/arch-fallback.h +82 -4
- data/ext/sources/ggml/src/ggml-cpu/common.h +14 -0
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-impl.h +18 -9
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +263 -111
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.cpp +39 -28
- data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.cpp +683 -82
- data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.h +38 -43
- data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +435 -119
- data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm-ppc.h +333 -0
- data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1234 -1182
- data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.h +6 -0
- data/ext/sources/ggml/src/ggml-cpu/ops.cpp +2167 -1480
- data/ext/sources/ggml/src/ggml-cpu/ops.h +10 -12
- data/ext/sources/ggml/src/ggml-cpu/quants.c +35 -0
- data/ext/sources/ggml/src/ggml-cpu/quants.h +8 -0
- data/ext/sources/ggml/src/ggml-cpu/repack.cpp +1132 -81
- data/ext/sources/ggml/src/ggml-cpu/repack.h +36 -0
- data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +120 -93
- data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.cpp +1025 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.h +13 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp +3196 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/ime_kernels.h +26 -0
- data/ext/sources/ggml/src/ggml-cpu/traits.cpp +2 -2
- data/ext/sources/ggml/src/ggml-cpu/traits.h +1 -1
- data/ext/sources/ggml/src/ggml-cpu/unary-ops.cpp +151 -0
- data/ext/sources/ggml/src/ggml-cpu/unary-ops.h +7 -0
- data/ext/sources/ggml/src/ggml-cpu/vec.cpp +294 -27
- data/ext/sources/ggml/src/ggml-cpu/vec.h +606 -48
- data/ext/sources/ggml/src/ggml-cuda/CMakeLists.txt +92 -17
- data/ext/sources/ggml/src/ggml-cuda/add-id.cu +58 -0
- data/ext/sources/ggml/src/ggml-cuda/add-id.cuh +3 -0
- data/ext/sources/ggml/src/ggml-cuda/argmax.cu +2 -2
- data/ext/sources/ggml/src/ggml-cuda/argsort.cu +123 -6
- data/ext/sources/ggml/src/ggml-cuda/argsort.cuh +16 -0
- data/ext/sources/ggml/src/ggml-cuda/binbcast.cu +330 -191
- data/ext/sources/ggml/src/ggml-cuda/binbcast.cuh +2 -0
- data/ext/sources/ggml/src/ggml-cuda/common.cuh +588 -128
- data/ext/sources/ggml/src/ggml-cuda/conv-transpose-1d.cu +1 -4
- data/ext/sources/ggml/src/ggml-cuda/conv2d.cu +166 -0
- data/ext/sources/ggml/src/ggml-cuda/conv2d.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/convert.cu +95 -22
- data/ext/sources/ggml/src/ggml-cuda/convert.cuh +25 -0
- data/ext/sources/ggml/src/ggml-cuda/cpy-utils.cuh +217 -0
- data/ext/sources/ggml/src/ggml-cuda/cpy.cu +335 -485
- data/ext/sources/ggml/src/ggml-cuda/cpy.cuh +1 -5
- data/ext/sources/ggml/src/ggml-cuda/cross-entropy-loss.cu +2 -14
- data/ext/sources/ggml/src/ggml-cuda/cumsum.cu +307 -0
- data/ext/sources/ggml/src/ggml-cuda/cumsum.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/dequantize.cuh +14 -40
- data/ext/sources/ggml/src/ggml-cuda/diag.cu +77 -0
- data/ext/sources/ggml/src/ggml-cuda/diag.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +519 -378
- data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +750 -637
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cu +49 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cuh +1244 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn-vec.cuh +586 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +98 -61
- data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cuh +48 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn.cu +230 -197
- data/ext/sources/ggml/src/ggml-cuda/fattn.cuh +2 -0
- data/ext/sources/ggml/src/ggml-cuda/fill.cu +37 -0
- data/ext/sources/ggml/src/ggml-cuda/fill.cuh +3 -0
- data/ext/sources/ggml/src/ggml-cuda/getrows.cu +50 -39
- data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +1557 -294
- data/ext/sources/ggml/src/ggml-cuda/im2col.cu +196 -35
- data/ext/sources/ggml/src/ggml-cuda/im2col.cuh +1 -0
- data/ext/sources/ggml/src/ggml-cuda/mean.cu +57 -2
- data/ext/sources/ggml/src/ggml-cuda/mma.cuh +915 -69
- data/ext/sources/ggml/src/ggml-cuda/mmf.cu +171 -0
- data/ext/sources/ggml/src/ggml-cuda/mmf.cuh +835 -0
- data/ext/sources/ggml/src/ggml-cuda/mmid.cu +164 -0
- data/ext/sources/ggml/src/ggml-cuda/mmid.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/mmq.cu +109 -67
- data/ext/sources/ggml/src/ggml-cuda/mmq.cuh +1601 -733
- data/ext/sources/ggml/src/ggml-cuda/mmvf.cu +802 -0
- data/ext/sources/ggml/src/ggml-cuda/mmvf.cuh +12 -0
- data/ext/sources/ggml/src/ggml-cuda/mmvq.cu +286 -149
- data/ext/sources/ggml/src/ggml-cuda/mmvq.cuh +1 -1
- data/ext/sources/ggml/src/ggml-cuda/norm.cu +284 -12
- data/ext/sources/ggml/src/ggml-cuda/norm.cuh +7 -0
- data/ext/sources/ggml/src/ggml-cuda/opt-step-sgd.cu +49 -0
- data/ext/sources/ggml/src/ggml-cuda/opt-step-sgd.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/pad.cu +86 -32
- data/ext/sources/ggml/src/ggml-cuda/pad_reflect_1d.cu +91 -0
- data/ext/sources/ggml/src/ggml-cuda/pad_reflect_1d.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/quantize.cu +163 -10
- data/ext/sources/ggml/src/ggml-cuda/quantize.cuh +14 -0
- data/ext/sources/ggml/src/ggml-cuda/reduce_rows.cuh +53 -0
- data/ext/sources/ggml/src/ggml-cuda/roll.cu +67 -0
- data/ext/sources/ggml/src/ggml-cuda/roll.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/rope.cu +207 -98
- data/ext/sources/ggml/src/ggml-cuda/rope.cuh +2 -0
- data/ext/sources/ggml/src/ggml-cuda/scale.cu +14 -11
- data/ext/sources/ggml/src/ggml-cuda/set-rows.cu +330 -0
- data/ext/sources/ggml/src/ggml-cuda/set-rows.cuh +7 -0
- data/ext/sources/ggml/src/ggml-cuda/set.cu +39 -0
- data/ext/sources/ggml/src/ggml-cuda/set.cuh +7 -0
- data/ext/sources/ggml/src/ggml-cuda/softcap.cu +34 -0
- data/ext/sources/ggml/src/ggml-cuda/softcap.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/softmax.cu +325 -61
- data/ext/sources/ggml/src/ggml-cuda/solve_tri.cu +275 -0
- data/ext/sources/ggml/src/ggml-cuda/solve_tri.cuh +3 -0
- data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cu +14 -12
- data/ext/sources/ggml/src/ggml-cuda/ssm-scan.cu +291 -104
- data/ext/sources/ggml/src/ggml-cuda/sum.cu +6 -10
- data/ext/sources/ggml/src/ggml-cuda/sumrows.cu +21 -4
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq112-dv112.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq128-dv128.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq256-dv256.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq40-dv40.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq576-dv512.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq64-dv64.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq72-dv72.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq80-dv80.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq96-dv96.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-f16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q4_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q4_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q5_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q5_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q8_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-f16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q5_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q5_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q8_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-f16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q4_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q4_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q5_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q5_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q8_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-f16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q4_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q4_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q5_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q5_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q8_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-f16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q4_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q4_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q5_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q5_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q8_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-f16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q4_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q4_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q5_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q5_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q8_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +40 -19
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_10.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_11.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_12.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_13.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_14.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_15.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_16.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_2.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_3.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_4.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_5.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_6.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_7.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_8.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_9.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-mxfp4.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/top-k.cu +96 -0
- data/ext/sources/ggml/src/ggml-cuda/top-k.cuh +3 -0
- data/ext/sources/ggml/src/ggml-cuda/topk-moe.cu +351 -0
- data/ext/sources/ggml/src/ggml-cuda/topk-moe.cuh +21 -0
- data/ext/sources/ggml/src/ggml-cuda/tri.cu +136 -0
- data/ext/sources/ggml/src/ggml-cuda/tri.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/tsembd.cu +3 -3
- data/ext/sources/ggml/src/ggml-cuda/unary.cu +189 -5
- data/ext/sources/ggml/src/ggml-cuda/unary.cuh +44 -0
- data/ext/sources/ggml/src/ggml-cuda/upscale.cu +248 -6
- data/ext/sources/ggml/src/ggml-cuda/vecdotq.cuh +110 -22
- data/ext/sources/ggml/src/ggml-cuda/vendors/cuda.h +8 -0
- data/ext/sources/ggml/src/ggml-cuda/vendors/hip.h +70 -37
- data/ext/sources/ggml/src/ggml-cuda/vendors/musa.h +10 -3
- data/ext/sources/ggml/src/ggml-hexagon/CMakeLists.txt +80 -0
- data/ext/sources/ggml/src/ggml-hexagon/ggml-hexagon.cpp +3151 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/CMakeLists.txt +44 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/act-ops.c +682 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/binary-ops.c +360 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/cmake-toolchain.cmake +157 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/flash-attn-ops.c +566 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/get-rows-ops.c +112 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ctx.h +35 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp-dma.c +63 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp-dma.h +157 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp-msg.h +165 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ops.h +92 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp_iface.idl +16 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-exp.c +94 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-inverse.c +72 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sigmoid.c +49 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-utils.c +1020 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-utils.h +1353 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/main.c +1001 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/matmul-ops.c +2503 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/ops-utils.h +149 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/rope-ops.c +487 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/set-rows-ops.c +168 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/softmax-ops.c +402 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/unary-ops.c +287 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/worker-pool.c +297 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/worker-pool.h +57 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp-utils.c +454 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp-utils.h +221 -0
- data/ext/sources/ggml/src/ggml-hexagon/op-desc.h +153 -0
- data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +16 -13
- data/ext/sources/ggml/src/ggml-impl.h +186 -15
- data/ext/sources/ggml/src/ggml-metal/CMakeLists.txt +10 -7
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-common.cpp +446 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-common.h +52 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.h +33 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.m +609 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.cpp +1743 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.h +273 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.m +1686 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +356 -61
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.cpp +4161 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.h +94 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal.cpp +724 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal.metal +4495 -1876
- data/ext/sources/ggml/src/ggml-musa/CMakeLists.txt +21 -9
- data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +29 -0
- data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +4005 -427
- data/ext/sources/ggml/src/ggml-opencl/kernels/add.cl +107 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/add_id.cl +42 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/conv2d.cl +185 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/conv2d_f16_f32.cl +176 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/cvt.cl +147 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/div.cl +66 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/expm1.cl +82 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/fill.cl +17 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/flash_attn_f16.cl +370 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/flash_attn_f32.cl +371 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/flash_attn_f32_f16.cl +373 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gelu.cl +27 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_mxfp4_f32.cl +162 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_mxfp4_f32.cl +156 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/get_rows.cl +36 -12
- data/ext/sources/ggml/src/ggml-opencl/kernels/glu.cl +177 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/group_norm.cl +49 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/im2col_f16.cl +1 -1
- data/ext/sources/ggml/src/ggml-opencl/kernels/im2col_f32.cl +1 -1
- data/ext/sources/ggml/src/ggml-opencl/kernels/mean.cl +39 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul.cl +73 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mat_f16_f32.cl +130 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_kq_kqv.cl +273 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_l4_lm.cl +146 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_f32_f32_l4_lm.cl +147 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_l4_lm.cl +154 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32.cl +189 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32_flat.cl +176 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32.cl +140 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32_flat.cl +222 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32.cl +144 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32_flat.cl +167 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32.cl +125 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32_flat.cl +202 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/norm.cl +80 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/pad.cl +29 -20
- data/ext/sources/ggml/src/ggml-opencl/kernels/rms_norm.cl +94 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/rope.cl +50 -24
- data/ext/sources/ggml/src/ggml-opencl/kernels/scale.cl +3 -2
- data/ext/sources/ggml/src/ggml-opencl/kernels/set_rows.cl +208 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_4_f16.cl +34 -13
- data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_4_f32.cl +34 -13
- data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_f16.cl +34 -13
- data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_f32.cl +34 -13
- data/ext/sources/ggml/src/ggml-opencl/kernels/softplus.cl +88 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/sqr.cl +53 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/sqrt.cl +53 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/ssm_conv.cl +77 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/sub.cl +66 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/transpose.cl +33 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/tsembd.cl +2 -2
- data/ext/sources/ggml/src/ggml-opencl/kernels/upscale.cl +2 -3
- data/ext/sources/ggml/src/ggml-opt.cpp +97 -41
- data/ext/sources/ggml/src/ggml-quants.c +111 -16
- data/ext/sources/ggml/src/ggml-quants.h +6 -0
- data/ext/sources/ggml/src/ggml-rpc/ggml-rpc.cpp +497 -195
- data/ext/sources/ggml/src/ggml-sycl/CMakeLists.txt +48 -3
- data/ext/sources/ggml/src/ggml-sycl/add-id.cpp +77 -0
- data/ext/sources/ggml/src/ggml-sycl/add-id.hpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/backend.hpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/binbcast.cpp +6 -5
- data/ext/sources/ggml/src/ggml-sycl/common.hpp +117 -15
- data/ext/sources/ggml/src/ggml-sycl/concat.cpp +50 -30
- data/ext/sources/ggml/src/ggml-sycl/conv.cpp +10 -4
- data/ext/sources/ggml/src/ggml-sycl/convert.cpp +200 -99
- data/ext/sources/ggml/src/ggml-sycl/count-equal.cpp +79 -0
- data/ext/sources/ggml/src/ggml-sycl/count-equal.hpp +9 -0
- data/ext/sources/ggml/src/ggml-sycl/cpy.cpp +72 -309
- data/ext/sources/ggml/src/ggml-sycl/cpy.hpp +213 -1
- data/ext/sources/ggml/src/ggml-sycl/dequantize.hpp +18 -0
- data/ext/sources/ggml/src/ggml-sycl/dmmv.cpp +67 -49
- data/ext/sources/ggml/src/ggml-sycl/dpct/helper.hpp +77 -34
- data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +397 -314
- data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +12 -2
- data/ext/sources/ggml/src/ggml-sycl/gemm.hpp +14 -26
- data/ext/sources/ggml/src/ggml-sycl/getrows.cpp +9 -6
- data/ext/sources/ggml/src/ggml-sycl/ggml-sycl.cpp +643 -413
- data/ext/sources/ggml/src/ggml-sycl/gla.cpp +2 -2
- data/ext/sources/ggml/src/ggml-sycl/im2col.cpp +2 -2
- data/ext/sources/ggml/src/ggml-sycl/mmq.cpp +80 -60
- data/ext/sources/ggml/src/ggml-sycl/mmvq.cpp +223 -132
- data/ext/sources/ggml/src/ggml-sycl/norm.cpp +230 -55
- data/ext/sources/ggml/src/ggml-sycl/norm.hpp +2 -0
- data/ext/sources/ggml/src/ggml-sycl/pad.cpp +97 -0
- data/ext/sources/ggml/src/ggml-sycl/pad.hpp +24 -0
- data/ext/sources/ggml/src/ggml-sycl/pad_reflect_1d.cpp +100 -0
- data/ext/sources/ggml/src/ggml-sycl/pad_reflect_1d.hpp +10 -0
- data/ext/sources/ggml/src/ggml-sycl/presets.hpp +2 -0
- data/ext/sources/ggml/src/ggml-sycl/quantize.hpp +133 -0
- data/ext/sources/ggml/src/ggml-sycl/quants.hpp +8 -9
- data/ext/sources/ggml/src/ggml-sycl/repeat_back.cpp +76 -0
- data/ext/sources/ggml/src/ggml-sycl/repeat_back.hpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/roll.cpp +122 -0
- data/ext/sources/ggml/src/ggml-sycl/roll.hpp +20 -0
- data/ext/sources/ggml/src/ggml-sycl/rope.cpp +65 -59
- data/ext/sources/ggml/src/ggml-sycl/set.cpp +73 -0
- data/ext/sources/ggml/src/ggml-sycl/set.hpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/set_rows.cpp +234 -0
- data/ext/sources/ggml/src/ggml-sycl/set_rows.hpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/softmax.cpp +330 -165
- data/ext/sources/ggml/src/ggml-sycl/softmax.hpp +4 -0
- data/ext/sources/ggml/src/ggml-sycl/ssm_conv.cpp +127 -0
- data/ext/sources/ggml/src/ggml-sycl/ssm_conv.hpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/tsembd.cpp +12 -6
- data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +60 -6
- data/ext/sources/ggml/src/ggml-sycl/wkv.cpp +16 -12
- data/ext/sources/ggml/src/ggml-vulkan/CMakeLists.txt +38 -18
- data/ext/sources/ggml/src/ggml-vulkan/ggml-vulkan.cpp +7398 -2635
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/abs.comp +21 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/acc.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add.comp +43 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add1.comp +28 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add_id.comp +42 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/arange.comp +20 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argmax.comp +15 -6
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp +56 -39
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argsort_large.comp +114 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/ceil.comp +22 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/clamp.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/concat.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/contig_copy.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_dw.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +347 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +5 -5
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +67 -13
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_transpose.comp +67 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/cos.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/count_equal.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/count_experts.comp +51 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/cumsum.comp +83 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/cumsum_multipass1.comp +60 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/cumsum_multipass2.comp +66 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_f32.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{dequant_funcs.comp → dequant_funcs.glsl} +158 -16
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{dequant_funcs_cm2.comp → dequant_funcs_cm2.glsl} +38 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{dequant_head.comp → dequant_head.glsl} +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_m.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_s.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_s.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xs.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xxs.comp +3 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_s.comp +7 -6
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_xxs.comp +5 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_nl.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_xs.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_mxfp4.comp +32 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q2_k.comp +4 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q3_k.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_0.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_1.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_k.comp +4 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_0.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_1.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_k.comp +4 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q6_k.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q8_0.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/diag.comp +29 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/diag_mask_inf.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/div.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/exp.comp +21 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/fill.comp +19 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +103 -36
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.glsl +220 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +139 -45
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +113 -38
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +75 -14
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/floor.comp +22 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu_erf.comp +27 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu_quick.comp +11 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gelu.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gelu_erf.comp +39 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gelu_quick.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{generic_binary_head.comp → generic_binary_head.glsl} +19 -17
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{generic_head.comp → generic_head.glsl} +2 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{generic_unary_head.comp → generic_unary_head.glsl} +7 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/get_rows.comp +21 -12
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/get_rows_quant.comp +28 -18
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{glu_head.comp → glu_head.glsl} +4 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/group_norm.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/hardsigmoid.comp +22 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/hardswish.comp +22 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +33 -17
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col_3d.comp +125 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/l2_norm.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/leaky_relu.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/log.comp +18 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.glsl +227 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iface.glsl +35 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_m.comp +71 -21
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_s.comp +41 -25
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_s.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xs.comp +44 -26
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xxs.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_s.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_xxs.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_nc.comp +20 -14
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_p021.comp +9 -7
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp +4 -6
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q3_k.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp +4 -6
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp +4 -6
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp +143 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq_funcs.glsl +494 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +144 -556
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +230 -51
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.glsl +566 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_id_funcs.glsl +72 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +90 -223
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.glsl +454 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_shmem_types.glsl +78 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp +195 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/neg.comp +20 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/norm.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_adamw.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_sgd.comp +22 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/pad.comp +41 -5
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/pool2d.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/quantize_q8_1.comp +59 -9
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/reglu.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/relu.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/repeat.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/repeat_back.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +104 -14
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_back.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_partials.comp +65 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/roll.comp +46 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl +234 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.glsl +20 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +6 -52
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +6 -35
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +6 -35
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl +28 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_vision.comp +6 -39
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/round.comp +29 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rte.glsl +5 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp +3 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sigmoid.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/silu.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/silu_back.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sin.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +30 -8
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_back.comp +6 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large1.comp +62 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large2.comp +79 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large3.comp +65 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large_common.glsl +53 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/softplus.comp +23 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/solve_tri.comp +81 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sqrt.comp +17 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/square.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/ssm_conv.comp +44 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/ssm_scan.comp +124 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/step.comp +22 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sub.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.comp +16 -6
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.glsl +25 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/swiglu.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/swiglu_oai.comp +14 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/tanh.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/timestep_embedding.comp +5 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/topk_argsort.comp +118 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/topk_moe.comp +213 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/topk_nary_search.comp +246 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/tri.comp +43 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/trunc.comp +22 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{types.comp → types.glsl} +435 -24
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +148 -6
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/utils.glsl +25 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +619 -177
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/xielu.comp +35 -0
- data/ext/sources/ggml/src/ggml-webgpu/CMakeLists.txt +80 -0
- data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp +169 -0
- data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu.cpp +3087 -0
- data/ext/sources/ggml/src/ggml-webgpu/pre_wgsl.hpp +778 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/bin_op.tmpl.wgsl +188 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary_head.tmpl +45 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +930 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cpy.tmpl.wgsl +101 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +147 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn.wgsl +591 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/get_rows.tmpl.wgsl +874 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/glu.tmpl.wgsl +323 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl +40 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl +907 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl +97 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.tmpl.wgsl +247 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_subgroup_matrix.tmpl.wgsl +302 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.tmpl.wgsl +267 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm.wgsl +123 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rope.tmpl.wgsl +295 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/scale.tmpl.wgsl +90 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.tmpl.wgsl +112 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +81 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/soft_max.tmpl.wgsl +345 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/unary_op.wgsl +483 -0
- data/ext/sources/ggml/src/ggml-zdnn/CMakeLists.txt +36 -0
- data/ext/sources/ggml/src/ggml-zdnn/common.hpp +59 -0
- data/ext/sources/ggml/src/ggml-zdnn/ggml-zdnn.cpp +628 -0
- data/ext/sources/ggml/src/ggml-zdnn/mmf.cpp +80 -0
- data/ext/sources/ggml/src/ggml-zdnn/mmf.hpp +12 -0
- data/ext/sources/ggml/src/ggml-zdnn/utils.cpp +79 -0
- data/ext/sources/ggml/src/ggml-zdnn/utils.hpp +19 -0
- data/ext/sources/ggml/src/ggml-zendnn/CMakeLists.txt +92 -0
- data/ext/sources/ggml/src/ggml-zendnn/ggml-zendnn.cpp +466 -0
- data/ext/sources/ggml/src/ggml.c +901 -129
- data/ext/sources/ggml/src/gguf.cpp +8 -1
- data/ext/sources/include/whisper.h +1 -0
- data/ext/sources/src/CMakeLists.txt +3 -1
- data/ext/sources/src/whisper.cpp +124 -81
- data/ext/sources/tests/CMakeLists.txt +8 -1
- data/ext/sources/tests/test-vad-full.cpp +7 -5
- data/ext/sources/tests/test-vad.cpp +3 -3
- data/extsources.rb +1 -0
- data/lib/whisper/model/uri.rb +17 -18
- data/sig/whisper.rbs +126 -2
- data/test/test_params.rb +24 -8
- data/test/test_segment.rb +0 -1
- data/test/test_token.rb +70 -0
- data/test/test_vad.rb +1 -1
- data/test/test_vad_context.rb +50 -0
- data/test/test_vad_segment.rb +19 -0
- data/test/test_vad_segments.rb +16 -0
- data/test/test_whisper.rb +8 -1
- data/whispercpp.gemspec +1 -1
- metadata +439 -179
- data/ext/sources/build-xcframework.sh +0 -547
- data/ext/sources/examples/talk-llama/llama-kv-cache-unified-iswa.cpp +0 -279
- data/ext/sources/examples/talk-llama/llama-kv-cache-unified.cpp +0 -1841
- data/ext/sources/examples/talk-llama/llama-kv-cache-unified.h +0 -303
- data/ext/sources/ggml/include/ggml-kompute.h +0 -50
- data/ext/sources/ggml/src/ggml-amx/CMakeLists.txt +0 -107
- data/ext/sources/ggml/src/ggml-amx/common.h +0 -94
- data/ext/sources/ggml/src/ggml-amx/ggml-amx.cpp +0 -446
- data/ext/sources/ggml/src/ggml-amx/mmq.cpp +0 -2510
- data/ext/sources/ggml/src/ggml-amx/mmq.h +0 -17
- data/ext/sources/ggml/src/ggml-cann/Doxyfile +0 -2579
- data/ext/sources/ggml/src/ggml-cann/kernels/CMakeLists.txt +0 -30
- data/ext/sources/ggml/src/ggml-cann/kernels/ascendc_kernels.h +0 -19
- data/ext/sources/ggml/src/ggml-cann/kernels/dup.cpp +0 -234
- data/ext/sources/ggml/src/ggml-cann/kernels/get_row_f16.cpp +0 -197
- data/ext/sources/ggml/src/ggml-cann/kernels/get_row_f32.cpp +0 -190
- data/ext/sources/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -204
- data/ext/sources/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -191
- data/ext/sources/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -218
- data/ext/sources/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -216
- data/ext/sources/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -295
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f16.cu +0 -357
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f16.cuh +0 -3
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f32.cu +0 -365
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f32.cuh +0 -3
- data/ext/sources/ggml/src/ggml-cuda/fattn-vec-f16.cuh +0 -482
- data/ext/sources/ggml/src/ggml-cuda/fattn-vec-f32.cuh +0 -472
- data/ext/sources/ggml/src/ggml-cuda/mmv.cu +0 -506
- data/ext/sources/ggml/src/ggml-cuda/mmv.cuh +0 -11
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-kompute/CMakeLists.txt +0 -166
- data/ext/sources/ggml/src/ggml-kompute/ggml-kompute.cpp +0 -2251
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/common.comp +0 -112
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_add.comp +0 -58
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_addrow.comp +0 -25
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f16.comp +0 -52
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f32.comp +0 -52
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f16.comp +0 -52
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f32.comp +0 -52
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_diagmask.comp +0 -30
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_gelu.comp +0 -22
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows.comp +0 -17
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f16.comp +0 -31
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f32.comp +0 -31
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_0.comp +0 -38
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_1.comp +0 -39
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q6_k.comp +0 -44
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul.comp +0 -52
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_f16.comp +0 -69
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_mat_f32.comp +0 -51
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_0.comp +0 -33
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_1.comp +0 -35
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_k.comp +0 -140
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q6_k.comp +0 -106
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q8_0.comp +0 -73
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n.comp +0 -52
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n_pre.comp +0 -28
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_norm.comp +0 -84
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_relu.comp +0 -21
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rmsnorm.comp +0 -53
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f16.comp +0 -52
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f32.comp +0 -52
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f16.comp +0 -52
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f32.comp +0 -52
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_scale.comp +0 -19
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_scale_8.comp +0 -23
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_silu.comp +0 -22
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_softmax.comp +0 -72
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/rope_common.comp +0 -71
- data/ext/sources/ggml/src/ggml-metal/ggml-metal.m +0 -6280
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp +0 -162
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.comp +0 -118
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.comp +0 -99
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.comp +0 -58
- /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{test_bfloat16_support.comp → feature-tests/bfloat16.comp} +0 -0
- /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{test_coopmat_support.comp → feature-tests/coopmat.comp} +0 -0
- /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{test_coopmat2_support.comp → feature-tests/coopmat2.comp} +0 -0
- /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{test_integer_dot_support.comp → feature-tests/integer_dot.comp} +0 -0
- /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{glu_main.comp → glu_main.glsl} +0 -0
|
@@ -14,7 +14,6 @@
|
|
|
14
14
|
#include <cmath>
|
|
15
15
|
#include <cstring>
|
|
16
16
|
#include <cassert>
|
|
17
|
-
#include <cstdlib> // for qsort
|
|
18
17
|
#include <cstdio> // for GGML_ASSERT
|
|
19
18
|
|
|
20
19
|
#include "repack.h"
|
|
@@ -125,6 +124,58 @@ void ggml_quantize_mat_q8_0_4x8_generic(const float * GGML_RESTRICT x, void * GG
|
|
|
125
124
|
}
|
|
126
125
|
}
|
|
127
126
|
|
|
127
|
+
|
|
128
|
+
void ggml_quantize_mat_q8_K_4x4_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
|
|
129
|
+
assert(QK_K == 256);
|
|
130
|
+
assert(k % QK_K == 0);
|
|
131
|
+
const int nb = k / QK_K;
|
|
132
|
+
|
|
133
|
+
block_q8_Kx4 * GGML_RESTRICT y = (block_q8_Kx4 *) vy;
|
|
134
|
+
|
|
135
|
+
// scalar
|
|
136
|
+
const int blck_size_interleave = 4;
|
|
137
|
+
float srcv[4][QK_K];
|
|
138
|
+
float iscale[4];
|
|
139
|
+
|
|
140
|
+
for (int i = 0; i < nb; i++) {
|
|
141
|
+
for (int row_iter = 0; row_iter < 4; row_iter++) {
|
|
142
|
+
float amax = 0.0f; // absolute max
|
|
143
|
+
float max = 0;
|
|
144
|
+
|
|
145
|
+
for (int j = 0; j < QK_K; j++) {
|
|
146
|
+
srcv[row_iter][j] = x[row_iter * k + i * QK_K + j];
|
|
147
|
+
// Update the maximum value of the corresponding super block
|
|
148
|
+
if(amax < fabsf(srcv[row_iter][j])) {
|
|
149
|
+
amax = fabsf(srcv[row_iter][j]);
|
|
150
|
+
max = srcv[row_iter][j];
|
|
151
|
+
}
|
|
152
|
+
}
|
|
153
|
+
|
|
154
|
+
iscale[row_iter] = amax ? -127.f/max : 0;
|
|
155
|
+
|
|
156
|
+
y[i].d[row_iter] = amax ? 1/iscale[row_iter] : 0;
|
|
157
|
+
}
|
|
158
|
+
|
|
159
|
+
for (int j = 0; j < QK_K / 4; j++) {
|
|
160
|
+
y[i].bsums[j] = 0;
|
|
161
|
+
}
|
|
162
|
+
|
|
163
|
+
// Quants values are interleaved in sequence of four bytes from corresponding super blocks
|
|
164
|
+
// Bsums values are interleaved in sequence of four bsums from each super block taken for interleaving
|
|
165
|
+
// i.e first four bsums from the first super block, followed by first four bsums from second super block and so on
|
|
166
|
+
for (int j = 0; j < QK_K * 4; j++) {
|
|
167
|
+
int src_offset = (j / (4 * blck_size_interleave)) * blck_size_interleave;
|
|
168
|
+
int src_id = (j % (4 * blck_size_interleave)) / blck_size_interleave;
|
|
169
|
+
src_offset += (j % blck_size_interleave);
|
|
170
|
+
int index = (((j & 15) >> 2) << 2) + ((j >> 8) << 4) + ((j >> 6) & 3);
|
|
171
|
+
|
|
172
|
+
float x0 = srcv[src_id][src_offset] * iscale[src_id];
|
|
173
|
+
y[i].qs[j] = nearest_int(x0);
|
|
174
|
+
y[i].bsums[index] += y[i].qs[j];
|
|
175
|
+
}
|
|
176
|
+
}
|
|
177
|
+
}
|
|
178
|
+
|
|
128
179
|
void ggml_quantize_mat_q8_K_4x8_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
|
|
129
180
|
assert(QK_K == 256);
|
|
130
181
|
assert(k % QK_K == 0);
|
|
@@ -193,6 +244,12 @@ template <> void ggml_quantize_mat_t<8, GGML_TYPE_Q8_0>(const float * GGML_RESTR
|
|
|
193
244
|
ggml_quantize_mat_q8_0_4x8(x, vy, n_per_row);
|
|
194
245
|
}
|
|
195
246
|
|
|
247
|
+
template <> void ggml_quantize_mat_t<4, GGML_TYPE_Q8_K>(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t nrow, int64_t n_per_row) {
|
|
248
|
+
assert(nrow == 4);
|
|
249
|
+
UNUSED(nrow);
|
|
250
|
+
ggml_quantize_mat_q8_K_4x4(x, vy, n_per_row);
|
|
251
|
+
}
|
|
252
|
+
|
|
196
253
|
template <> void ggml_quantize_mat_t<8, GGML_TYPE_Q8_K>(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t nrow, int64_t n_per_row) {
|
|
197
254
|
assert(nrow == 4);
|
|
198
255
|
UNUSED(nrow);
|
|
@@ -207,8 +264,9 @@ void ggml_gemv_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
|
|
|
207
264
|
const int ncols_interleaved = 4;
|
|
208
265
|
const int blocklen = 4;
|
|
209
266
|
|
|
210
|
-
assert
|
|
211
|
-
assert
|
|
267
|
+
assert(nr == 1);
|
|
268
|
+
assert(n % qk == 0);
|
|
269
|
+
assert(nc % ncols_interleaved == 0);
|
|
212
270
|
|
|
213
271
|
UNUSED(s);
|
|
214
272
|
UNUSED(bs);
|
|
@@ -308,29 +366,98 @@ void ggml_gemv_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
|
|
|
308
366
|
UNUSED(ncols_interleaved);
|
|
309
367
|
UNUSED(blocklen);
|
|
310
368
|
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
int sumi;
|
|
369
|
+
float sumf[8];
|
|
370
|
+
int sumi;
|
|
314
371
|
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
372
|
+
const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
|
|
373
|
+
for (int x = 0; x < nc / ncols_interleaved; x++) {
|
|
374
|
+
const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb);
|
|
318
375
|
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
|
|
376
|
+
for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
|
|
377
|
+
for (int l = 0; l < nb; l++) {
|
|
378
|
+
for (int k = 0; k < (qk / (2 * blocklen)); k++) {
|
|
379
|
+
for (int j = 0; j < ncols_interleaved; j++) {
|
|
380
|
+
sumi = 0;
|
|
381
|
+
for (int i = 0; i < blocklen; ++i) {
|
|
382
|
+
const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
|
|
383
|
+
const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
|
|
384
|
+
sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
|
|
385
|
+
}
|
|
386
|
+
sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
|
|
387
|
+
}
|
|
388
|
+
}
|
|
389
|
+
}
|
|
390
|
+
for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
|
|
391
|
+
}
|
|
392
|
+
}
|
|
393
|
+
|
|
394
|
+
void ggml_gemv_q4_K_8x4_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
|
395
|
+
const int qk = QK_K;
|
|
396
|
+
const int nb = n / qk;
|
|
397
|
+
const int ncols_interleaved = 8;
|
|
398
|
+
const int blocklen = 4;
|
|
399
|
+
static const uint32_t kmask1 = 0x3f3f3f3f;
|
|
400
|
+
static const uint32_t kmask2 = 0x0f0f0f0f;
|
|
401
|
+
static const uint32_t kmask3 = 0x03030303;
|
|
402
|
+
|
|
403
|
+
assert (n % qk == 0);
|
|
404
|
+
assert (nc % ncols_interleaved == 0);
|
|
405
|
+
|
|
406
|
+
UNUSED(bs);
|
|
407
|
+
UNUSED(nr);
|
|
408
|
+
|
|
409
|
+
float sumf[8];
|
|
410
|
+
float sum_minf[8];
|
|
411
|
+
uint32_t utmp[32];
|
|
412
|
+
int sumi1;
|
|
413
|
+
int sumi2;
|
|
414
|
+
int sumi;
|
|
415
|
+
|
|
416
|
+
const block_q8_K * a_ptr = (const block_q8_K *) vy;
|
|
417
|
+
for (int x = 0; x < nc / ncols_interleaved; x++) {
|
|
418
|
+
const block_q4_Kx8 * b_ptr = (const block_q4_Kx8 *) vx + (x * nb);
|
|
419
|
+
|
|
420
|
+
for (int j = 0; j < ncols_interleaved; j++) {
|
|
421
|
+
sumf[j] = 0.0;
|
|
422
|
+
sum_minf[j] = 0.0;
|
|
423
|
+
}
|
|
424
|
+
for (int l = 0; l < nb; l++) {
|
|
425
|
+
for (int sb = 0; sb < 8; sb++) {
|
|
426
|
+
memcpy(utmp + sb * 4, b_ptr[l].scales + sb * 12, 12);
|
|
427
|
+
utmp[sb * 4 + 3] = ((utmp[sb * 4 + 2] >> 4) & kmask2) | (((utmp[sb * 4 + 1] >> 6) & kmask3) << 4);
|
|
428
|
+
const uint32_t uaux_0 = utmp[sb * 4 + 1] & kmask1;
|
|
429
|
+
utmp[sb * 4 + 1] = (utmp[sb * 4 + 2] & kmask2) | (((utmp[sb * 4 + 0] >> 6) & kmask3) << 4);
|
|
430
|
+
utmp[sb * 4 + 2] = uaux_0;
|
|
431
|
+
utmp[sb * 4 + 0] &= kmask1;
|
|
432
|
+
}
|
|
433
|
+
for (int k = 0; k < (qk / (2 * blocklen)); k++) {
|
|
434
|
+
uint8_t * scales_0 = (uint8_t *) utmp + (k / 8) * 32;
|
|
435
|
+
uint8_t * scales_1 = (uint8_t *) utmp + (k / 8) * 32 + 16;
|
|
436
|
+
for (int j = 0; j < ncols_interleaved; j++) {
|
|
437
|
+
sumi1 = 0;
|
|
438
|
+
sumi2 = 0;
|
|
439
|
+
sumi = 0;
|
|
440
|
+
for (int i = 0; i < blocklen; ++i) {
|
|
441
|
+
const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF);
|
|
442
|
+
const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4);
|
|
443
|
+
sumi1 = (v0 * a_ptr[l].qs[(k / 8) * 64 + (k % 8) * blocklen + i]);
|
|
444
|
+
sumi2 = (v1 * a_ptr[l].qs[(k / 8) * 64 + (k % 8) * blocklen + i + 32]);
|
|
445
|
+
sumi1 = sumi1 * scales_0[j];
|
|
446
|
+
sumi2 = sumi2 * scales_1[j];
|
|
447
|
+
sumi += sumi1 + sumi2;
|
|
330
448
|
}
|
|
449
|
+
sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d;
|
|
450
|
+
}
|
|
451
|
+
}
|
|
452
|
+
for (int sb = 0; sb < 8; sb++) {
|
|
453
|
+
uint8_t * mins = (uint8_t *) utmp + 8 + sb * 16;
|
|
454
|
+
for (int j = 0; j < ncols_interleaved; j++) {
|
|
455
|
+
sum_minf[j] += mins[j] * (a_ptr[l].bsums[sb * 2] + a_ptr[l].bsums[sb * 2 + 1]) * GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d;
|
|
331
456
|
}
|
|
332
457
|
}
|
|
333
|
-
|
|
458
|
+
}
|
|
459
|
+
for (int j = 0; j < ncols_interleaved; j++) {
|
|
460
|
+
s[x * ncols_interleaved + j] = sumf[j] - sum_minf[j];
|
|
334
461
|
}
|
|
335
462
|
}
|
|
336
463
|
}
|
|
@@ -413,11 +540,11 @@ void ggml_gemv_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
|
|
|
413
540
|
}
|
|
414
541
|
}
|
|
415
542
|
|
|
416
|
-
void
|
|
417
|
-
const int qk =
|
|
543
|
+
void ggml_gemv_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
|
544
|
+
const int qk = QK_K;
|
|
418
545
|
const int nb = n / qk;
|
|
419
|
-
const int ncols_interleaved =
|
|
420
|
-
const int blocklen =
|
|
546
|
+
const int ncols_interleaved = 8;
|
|
547
|
+
const int blocklen = 8;
|
|
421
548
|
|
|
422
549
|
assert (n % qk == 0);
|
|
423
550
|
assert (nc % ncols_interleaved == 0);
|
|
@@ -432,29 +559,229 @@ void ggml_gemv_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs
|
|
|
432
559
|
UNUSED(ncols_interleaved);
|
|
433
560
|
UNUSED(blocklen);
|
|
434
561
|
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
|
|
562
|
+
float sumf[8];
|
|
563
|
+
float sum_minf[8];
|
|
564
|
+
int sumi1,sumi2,sumi3,sumi4;
|
|
565
|
+
int sumi;
|
|
438
566
|
|
|
439
|
-
|
|
440
|
-
|
|
441
|
-
|
|
567
|
+
const block_q8_K * a_ptr = (const block_q8_K *)vy;
|
|
568
|
+
for(int x = 0; x < nc / ncols_interleaved; x++) {
|
|
569
|
+
const block_q2_Kx8 * b_ptr = (const block_q2_Kx8 *) vx + (x * nb);
|
|
570
|
+
for (int j = 0; j < ncols_interleaved; j++) {
|
|
571
|
+
sumf[j] = 0.0;
|
|
572
|
+
sum_minf[j] = 0.0;
|
|
573
|
+
}
|
|
574
|
+
for (int l = 0; l < nb; l++) {
|
|
575
|
+
for (int k = 0; k < (qk / (4 * blocklen)); k++) {
|
|
576
|
+
const uint8_t *scales_0 = b_ptr[l].scales + (k / 4) * 64 ;
|
|
577
|
+
const uint8_t *scales_1 = b_ptr[l].scales + (k / 4) * 64 + 16;
|
|
578
|
+
const uint8_t *scales_2 = b_ptr[l].scales + (k / 4) * 64 + 32;
|
|
579
|
+
const uint8_t *scales_3 = b_ptr[l].scales + (k / 4) * 64 + 48;
|
|
580
|
+
for (int j = 0; j < ncols_interleaved; j++) {
|
|
581
|
+
sumi1 = 0;
|
|
582
|
+
sumi2 = 0;
|
|
583
|
+
sumi3 = 0;
|
|
584
|
+
sumi4 = 0;
|
|
585
|
+
sumi = 0;
|
|
586
|
+
int offset = ((k / 2) % 2) + j * 2;
|
|
587
|
+
for (int i = 0; i < blocklen; ++i){
|
|
588
|
+
const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 3);
|
|
589
|
+
const int v1 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 2 ) & 3);
|
|
590
|
+
const int v2 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4 ) & 3);
|
|
591
|
+
const int v3 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 6 ) & 3);
|
|
592
|
+
sumi1 = (v0 * a_ptr[l].qs[(k >> 2) * 128 + (k % 4) * blocklen + i]);
|
|
593
|
+
sumi2 = (v1 * a_ptr[l].qs[(k >> 2) * 128 + (k % 4) * blocklen + i + 32]);
|
|
594
|
+
sumi3 = (v2 * a_ptr[l].qs[(k >> 2) * 128 + (k % 4) * blocklen + i + 64]);
|
|
595
|
+
sumi4 = (v3 * a_ptr[l].qs[(k >> 2) * 128 + (k % 4) * blocklen + i + 96]);
|
|
596
|
+
|
|
597
|
+
sumi1 = sumi1 * (scales_0[offset] & 0xF);
|
|
598
|
+
sumi2 = sumi2 * (scales_1[offset] & 0xF);
|
|
599
|
+
sumi3 = sumi3 * (scales_2[offset] & 0xF);
|
|
600
|
+
sumi4 = sumi4 * (scales_3[offset] & 0xF);
|
|
601
|
+
sumi += sumi1 + sumi2 + sumi3 + sumi4;
|
|
602
|
+
}
|
|
603
|
+
sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d;
|
|
604
|
+
}
|
|
605
|
+
}
|
|
606
|
+
for(int sb = 0; sb < 8; sb++) {
|
|
607
|
+
const uint8_t *mins = b_ptr[l].scales + sb * 16;
|
|
608
|
+
for(int j = 0; j < ncols_interleaved; j++){
|
|
609
|
+
sum_minf[j] += ((mins[j * 2] >> 4) * a_ptr[l].bsums[sb * 2] + (mins[(j * 2)+ 1] >> 4) * a_ptr[l].bsums[sb * 2 + 1]) * GGML_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d;
|
|
610
|
+
}
|
|
611
|
+
}
|
|
612
|
+
}
|
|
613
|
+
for (int j = 0; j < ncols_interleaved; j++) {
|
|
614
|
+
s[x * ncols_interleaved + j] = sumf[j] - sum_minf[j];
|
|
615
|
+
}
|
|
616
|
+
}
|
|
617
|
+
}
|
|
442
618
|
|
|
443
|
-
|
|
444
|
-
|
|
445
|
-
|
|
446
|
-
|
|
447
|
-
|
|
448
|
-
|
|
449
|
-
|
|
450
|
-
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
|
|
619
|
+
void ggml_gemv_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
|
620
|
+
const int qk = QK8_0;
|
|
621
|
+
const int nb = n / qk;
|
|
622
|
+
const int ncols_interleaved = 4;
|
|
623
|
+
const int blocklen = 4;
|
|
624
|
+
|
|
625
|
+
assert(nr == 1);
|
|
626
|
+
assert(n % qk == 0);
|
|
627
|
+
assert(nc % ncols_interleaved == 0);
|
|
628
|
+
|
|
629
|
+
UNUSED(bs);
|
|
630
|
+
UNUSED(nr);
|
|
631
|
+
|
|
632
|
+
float sumf[4];
|
|
633
|
+
int sumi;
|
|
634
|
+
|
|
635
|
+
const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
|
|
636
|
+
for (int x = 0; x < nc / ncols_interleaved; x++) {
|
|
637
|
+
const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb);
|
|
638
|
+
|
|
639
|
+
for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
|
|
640
|
+
for (int l = 0; l < nb; l++) {
|
|
641
|
+
for (int k = 0; k < (qk / (2 * blocklen)); k++) {
|
|
642
|
+
for (int j = 0; j < ncols_interleaved; j++) {
|
|
643
|
+
sumi = 0;
|
|
644
|
+
for (int i = 0; i < blocklen; ++i) {
|
|
645
|
+
const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
|
|
646
|
+
const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
|
|
647
|
+
sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2]));
|
|
648
|
+
}
|
|
649
|
+
sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
|
|
650
|
+
}
|
|
651
|
+
}
|
|
652
|
+
}
|
|
653
|
+
for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
|
|
654
|
+
}
|
|
655
|
+
}
|
|
656
|
+
|
|
657
|
+
void ggml_gemv_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
|
658
|
+
const int qk = QK8_0;
|
|
659
|
+
const int nb = n / qk;
|
|
660
|
+
const int ncols_interleaved = 8;
|
|
661
|
+
const int blocklen = 8;
|
|
662
|
+
|
|
663
|
+
assert(nr == 1);
|
|
664
|
+
assert(n % qk == 0);
|
|
665
|
+
assert(nc % ncols_interleaved == 0);
|
|
666
|
+
|
|
667
|
+
UNUSED(bs);
|
|
668
|
+
UNUSED(nr);
|
|
669
|
+
|
|
670
|
+
float sumf[8];
|
|
671
|
+
int sumi;
|
|
672
|
+
|
|
673
|
+
const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
|
|
674
|
+
for (int x = 0; x < nc / ncols_interleaved; x++) {
|
|
675
|
+
const block_iq4_nlx8 * b_ptr = (const block_iq4_nlx8 *) vx + (x * nb);
|
|
676
|
+
|
|
677
|
+
for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
|
|
678
|
+
for (int l = 0; l < nb; l++) {
|
|
679
|
+
for (int k = 0; k < (qk / (2 * blocklen)); k++) {
|
|
680
|
+
for (int j = 0; j < ncols_interleaved; j++) {
|
|
681
|
+
sumi = 0;
|
|
682
|
+
for (int i = 0; i < blocklen; ++i) {
|
|
683
|
+
const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
|
|
684
|
+
const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
|
|
685
|
+
sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2]));
|
|
686
|
+
}
|
|
687
|
+
sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
|
|
688
|
+
}
|
|
689
|
+
}
|
|
690
|
+
}
|
|
691
|
+
for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
|
|
692
|
+
}
|
|
693
|
+
}
|
|
694
|
+
|
|
695
|
+
void ggml_gemv_q8_0_4x4_q8_0_generic(int n,
|
|
696
|
+
float * GGML_RESTRICT s,
|
|
697
|
+
size_t bs,
|
|
698
|
+
const void * GGML_RESTRICT vx,
|
|
699
|
+
const void * GGML_RESTRICT vy,
|
|
700
|
+
int nr,
|
|
701
|
+
int nc) {
|
|
702
|
+
const int qk = QK8_0;
|
|
703
|
+
const int nb = n / qk;
|
|
704
|
+
const int ncols_interleaved = 4;
|
|
705
|
+
const int blocklen = 4;
|
|
706
|
+
|
|
707
|
+
assert(nr == 1);
|
|
708
|
+
assert(n % qk == 0);
|
|
709
|
+
assert(nc % ncols_interleaved == 0);
|
|
710
|
+
|
|
711
|
+
UNUSED(bs);
|
|
712
|
+
UNUSED(nr);
|
|
713
|
+
|
|
714
|
+
float sumf[4];
|
|
715
|
+
int sumi;
|
|
716
|
+
|
|
717
|
+
const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
|
|
718
|
+
for (int x = 0; x < nc / ncols_interleaved; x++) {
|
|
719
|
+
const block_q8_0x4 * b_ptr = (const block_q8_0x4 *) vx + (x * nb);
|
|
720
|
+
|
|
721
|
+
for (int j = 0; j < ncols_interleaved; j++) {
|
|
722
|
+
sumf[j] = 0.0;
|
|
723
|
+
}
|
|
724
|
+
for (int l = 0; l < nb; l++) {
|
|
725
|
+
for (int k = 0; k < (qk / blocklen); k++) {
|
|
726
|
+
for (int j = 0; j < ncols_interleaved; j++) {
|
|
727
|
+
sumi = 0;
|
|
728
|
+
for (int i = 0; i < blocklen; ++i) {
|
|
729
|
+
const int v0 = b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i];
|
|
730
|
+
sumi += v0 * a_ptr[l].qs[k * blocklen + i];
|
|
731
|
+
}
|
|
732
|
+
sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
|
|
733
|
+
}
|
|
734
|
+
}
|
|
735
|
+
}
|
|
736
|
+
for (int j = 0; j < ncols_interleaved; j++) {
|
|
737
|
+
s[x * ncols_interleaved + j] = sumf[j];
|
|
738
|
+
}
|
|
739
|
+
}
|
|
740
|
+
}
|
|
741
|
+
|
|
742
|
+
void ggml_gemv_q8_0_4x8_q8_0_generic(int n,
|
|
743
|
+
float * GGML_RESTRICT s,
|
|
744
|
+
size_t bs,
|
|
745
|
+
const void * GGML_RESTRICT vx,
|
|
746
|
+
const void * GGML_RESTRICT vy,
|
|
747
|
+
int nr,
|
|
748
|
+
int nc) {
|
|
749
|
+
const int qk = QK8_0;
|
|
750
|
+
const int nb = n / qk;
|
|
751
|
+
const int ncols_interleaved = 4;
|
|
752
|
+
const int blocklen = 8;
|
|
753
|
+
|
|
754
|
+
assert(nr == 1);
|
|
755
|
+
assert(n % qk == 0);
|
|
756
|
+
assert(nc % ncols_interleaved == 0);
|
|
757
|
+
|
|
758
|
+
UNUSED(bs);
|
|
759
|
+
UNUSED(nr);
|
|
760
|
+
|
|
761
|
+
float sumf[4];
|
|
762
|
+
int sumi;
|
|
763
|
+
|
|
764
|
+
const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
|
|
765
|
+
for (int x = 0; x < nc / ncols_interleaved; x++) {
|
|
766
|
+
const block_q8_0x4 * b_ptr = (const block_q8_0x4 *) vx + (x * nb);
|
|
767
|
+
|
|
768
|
+
for (int j = 0; j < ncols_interleaved; j++) {
|
|
769
|
+
sumf[j] = 0.0;
|
|
770
|
+
}
|
|
771
|
+
for (int l = 0; l < nb; l++) {
|
|
772
|
+
for (int k = 0; k < (qk / blocklen); k++) {
|
|
773
|
+
for (int j = 0; j < ncols_interleaved; j++) {
|
|
774
|
+
sumi = 0;
|
|
775
|
+
for (int i = 0; i < blocklen; ++i) {
|
|
776
|
+
const int v0 = b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i];
|
|
777
|
+
sumi += v0 * a_ptr[l].qs[k * blocklen + i];
|
|
454
778
|
}
|
|
779
|
+
sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
|
|
455
780
|
}
|
|
456
781
|
}
|
|
457
|
-
|
|
782
|
+
}
|
|
783
|
+
for (int j = 0; j < ncols_interleaved; j++) {
|
|
784
|
+
s[x * ncols_interleaved + j] = sumf[j];
|
|
458
785
|
}
|
|
459
786
|
}
|
|
460
787
|
}
|
|
@@ -623,6 +950,89 @@ void ggml_gemm_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
|
|
|
623
950
|
}
|
|
624
951
|
}
|
|
625
952
|
|
|
953
|
+
void ggml_gemm_q4_K_8x4_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
|
954
|
+
const int qk = QK_K;
|
|
955
|
+
const int nb = n / qk;
|
|
956
|
+
const int ncols_interleaved = 8;
|
|
957
|
+
const int blocklen = 4;
|
|
958
|
+
static const uint32_t kmask1 = 0x3f3f3f3f;
|
|
959
|
+
static const uint32_t kmask2 = 0x0f0f0f0f;
|
|
960
|
+
static const uint32_t kmask3 = 0x03030303;
|
|
961
|
+
|
|
962
|
+
assert (n % qk == 0);
|
|
963
|
+
assert (nr % 4 == 0);
|
|
964
|
+
assert (nc % ncols_interleaved == 0);
|
|
965
|
+
|
|
966
|
+
UNUSED(nb);
|
|
967
|
+
UNUSED(ncols_interleaved);
|
|
968
|
+
UNUSED(blocklen);
|
|
969
|
+
|
|
970
|
+
float sumf[4][8];
|
|
971
|
+
float sum_minf[4][8];
|
|
972
|
+
uint32_t utmp[32];
|
|
973
|
+
int sumi1;
|
|
974
|
+
int sumi2;
|
|
975
|
+
int sumi;
|
|
976
|
+
|
|
977
|
+
for (int y = 0; y < nr / 4; y++) {
|
|
978
|
+
const block_q8_Kx4 * a_ptr = (const block_q8_Kx4 *) vy + (y * nb);
|
|
979
|
+
for (int x = 0; x < nc / ncols_interleaved; x++) {
|
|
980
|
+
const block_q4_Kx8 * b_ptr = (const block_q4_Kx8 *) vx + (x * nb);
|
|
981
|
+
for (int m = 0; m < 4; m++) {
|
|
982
|
+
for (int j = 0; j < ncols_interleaved; j++) {
|
|
983
|
+
sumf[m][j] = 0.0;
|
|
984
|
+
sum_minf[m][j] = 0.0;
|
|
985
|
+
}
|
|
986
|
+
}
|
|
987
|
+
for (int l = 0; l < nb; l++) {
|
|
988
|
+
for (int sb = 0; sb < 8; sb++) {
|
|
989
|
+
memcpy(utmp + sb * 4, b_ptr[l].scales + sb * 12, 12);
|
|
990
|
+
utmp[sb * 4 + 3] = ((utmp[sb * 4 + 2] >> 4) & kmask2) | (((utmp[sb * 4 + 1] >> 6) & kmask3) << 4);
|
|
991
|
+
const uint32_t uaux_0 = utmp[sb * 4 + 1] & kmask1;
|
|
992
|
+
utmp[sb * 4 + 1] = (utmp[sb * 4 + 2] & kmask2) | (((utmp[sb * 4 + 0] >> 6) & kmask3) << 4);
|
|
993
|
+
utmp[sb * 4 + 2] = uaux_0;
|
|
994
|
+
utmp[sb * 4 + 0] &= kmask1;
|
|
995
|
+
}
|
|
996
|
+
for (int k = 0; k < (qk / (2 * blocklen)); k++) {
|
|
997
|
+
uint8_t * scales_0 = (uint8_t *) utmp + (k / 8) * 32;
|
|
998
|
+
uint8_t * scales_1 = (uint8_t *) utmp + (k / 8) * 32 + 16;
|
|
999
|
+
for (int m = 0; m < 4; m++) {
|
|
1000
|
+
for (int j = 0; j < ncols_interleaved; j++) {
|
|
1001
|
+
sumi1 = 0;
|
|
1002
|
+
sumi2 = 0;
|
|
1003
|
+
sumi = 0;
|
|
1004
|
+
for (int i = 0; i < blocklen; ++i) {
|
|
1005
|
+
const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF);
|
|
1006
|
+
const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4);
|
|
1007
|
+
sumi1 = (v0 * a_ptr[l].qs[(k / 8) * 256 + (k % 8) * 4 * blocklen + m * blocklen + i]);
|
|
1008
|
+
sumi2 = (v1 * a_ptr[l].qs[(k / 8) * 256 + (k % 8) * 4 * blocklen + m * blocklen + i + 128]);
|
|
1009
|
+
sumi1 = sumi1 * scales_0[j];
|
|
1010
|
+
sumi2 = sumi2 * scales_1[j];
|
|
1011
|
+
sumi += sumi1 + sumi2;
|
|
1012
|
+
}
|
|
1013
|
+
sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d[m];
|
|
1014
|
+
}
|
|
1015
|
+
}
|
|
1016
|
+
}
|
|
1017
|
+
for (int sb = 0; sb < 8; sb++) {
|
|
1018
|
+
uint8_t * mins = (uint8_t *) utmp + 8 + sb * 16;
|
|
1019
|
+
for(int m = 0; m < 4; m++) {
|
|
1020
|
+
const int16_t * bsums = a_ptr[l].bsums + (sb * 8) + (m * 4) - ((sb % 2) * 6);
|
|
1021
|
+
for(int j = 0; j < ncols_interleaved; j++) {
|
|
1022
|
+
sum_minf[m][j] += mins[j] * (bsums[0] + bsums[1]) * GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d[m];
|
|
1023
|
+
}
|
|
1024
|
+
}
|
|
1025
|
+
}
|
|
1026
|
+
}
|
|
1027
|
+
for (int m = 0; m < 4; m++) {
|
|
1028
|
+
for (int j = 0; j < ncols_interleaved; j++) {
|
|
1029
|
+
s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j] - sum_minf[m][j];
|
|
1030
|
+
}
|
|
1031
|
+
}
|
|
1032
|
+
}
|
|
1033
|
+
}
|
|
1034
|
+
}
|
|
1035
|
+
|
|
626
1036
|
void ggml_gemm_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
|
627
1037
|
const int qk = QK_K;
|
|
628
1038
|
const int nb = n / qk;
|
|
@@ -712,6 +1122,97 @@ void ggml_gemm_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
|
|
|
712
1122
|
}
|
|
713
1123
|
}
|
|
714
1124
|
|
|
1125
|
+
void ggml_gemm_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
|
1126
|
+
const int qk = QK_K;
|
|
1127
|
+
const int nb = n / qk;
|
|
1128
|
+
const int ncols_interleaved = 8;
|
|
1129
|
+
const int blocklen = 8;
|
|
1130
|
+
|
|
1131
|
+
assert (n % qk == 0);
|
|
1132
|
+
assert (nr % 4 == 0);
|
|
1133
|
+
assert (nc % ncols_interleaved == 0);
|
|
1134
|
+
|
|
1135
|
+
UNUSED(s);
|
|
1136
|
+
UNUSED(bs);
|
|
1137
|
+
UNUSED(vx);
|
|
1138
|
+
UNUSED(vy);
|
|
1139
|
+
UNUSED(nr);
|
|
1140
|
+
UNUSED(nc);
|
|
1141
|
+
UNUSED(nb);
|
|
1142
|
+
UNUSED(ncols_interleaved);
|
|
1143
|
+
UNUSED(blocklen);
|
|
1144
|
+
|
|
1145
|
+
float sumf[4][8];
|
|
1146
|
+
float sum_minf[4][8];
|
|
1147
|
+
int sumi1, sumi2, sumi3, sumi4;
|
|
1148
|
+
int sumi;
|
|
1149
|
+
|
|
1150
|
+
for (int y = 0; y < nr / 4; y++) {
|
|
1151
|
+
const block_q8_Kx4 * a_ptr = (const block_q8_Kx4 *) vy + (y * nb);
|
|
1152
|
+
for (int x = 0; x < nc / ncols_interleaved; x++) {
|
|
1153
|
+
const block_q2_Kx8 * b_ptr = (const block_q2_Kx8 *) vx + (x * nb);
|
|
1154
|
+
for (int m = 0; m < 4; m++) {
|
|
1155
|
+
for (int j = 0; j < ncols_interleaved; j++) {
|
|
1156
|
+
sumf[m][j] = 0.0;
|
|
1157
|
+
sum_minf[m][j] = 0.0;
|
|
1158
|
+
}
|
|
1159
|
+
}
|
|
1160
|
+
for (int l = 0; l < nb; l++) {
|
|
1161
|
+
for (int k = 0; k < (qk / (4 * blocklen)); k++) {
|
|
1162
|
+
|
|
1163
|
+
const uint8_t *scales_0 = b_ptr[l].scales + (k / 4) * 64 ;
|
|
1164
|
+
const uint8_t *scales_1 = b_ptr[l].scales + (k / 4) * 64 + 16;
|
|
1165
|
+
const uint8_t *scales_2 = b_ptr[l].scales + (k / 4) * 64 + 32;
|
|
1166
|
+
const uint8_t *scales_3 = b_ptr[l].scales + (k / 4) * 64 + 48;
|
|
1167
|
+
for (int m = 0; m < 4; m++) {
|
|
1168
|
+
for (int j = 0; j < ncols_interleaved; j++) {
|
|
1169
|
+
sumi1 = 0;
|
|
1170
|
+
sumi2 = 0;
|
|
1171
|
+
sumi3 = 0;
|
|
1172
|
+
sumi4 = 0;
|
|
1173
|
+
sumi = 0;
|
|
1174
|
+
int offset = ((k / 2) % 2) + j * 2;
|
|
1175
|
+
for (int i = 0; i < blocklen; ++i){
|
|
1176
|
+
const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 3);
|
|
1177
|
+
const int v1 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 2 ) & 3);
|
|
1178
|
+
const int v2 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4 ) & 3);
|
|
1179
|
+
const int v3 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 6 ) & 3);
|
|
1180
|
+
sumi1 = (v0 * a_ptr[l].qs[(k >> 2) * 512 + (k % 4) * 4 * blocklen + m * blocklen + i]);
|
|
1181
|
+
sumi2 = (v1 * a_ptr[l].qs[(k >> 2) * 512 + (k % 4) * 4 * blocklen + m * blocklen + i + 128]);
|
|
1182
|
+
sumi3 = (v2 * a_ptr[l].qs[(k >> 2) * 512 + (k % 4) * 4 * blocklen + m * blocklen + i + 256]);
|
|
1183
|
+
sumi4 = (v3 * a_ptr[l].qs[(k >> 2) * 512 + (k % 4) * 4 * blocklen + m * blocklen + i + 384]);
|
|
1184
|
+
sumi1 = sumi1 * (scales_0[offset] & 0xF);
|
|
1185
|
+
sumi2 = sumi2 * (scales_1[offset] & 0xF);
|
|
1186
|
+
sumi3 = sumi3 * (scales_2[offset] & 0xF);
|
|
1187
|
+
sumi4 = sumi4 * (scales_3[offset] & 0xF);
|
|
1188
|
+
sumi += sumi1 + sumi2 + sumi3 + sumi4;
|
|
1189
|
+
}
|
|
1190
|
+
sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d[m];
|
|
1191
|
+
}
|
|
1192
|
+
}
|
|
1193
|
+
}
|
|
1194
|
+
for(int sb = 0; sb < 8; sb++) {
|
|
1195
|
+
const uint8_t *mins = b_ptr[l].scales + sb * 16;
|
|
1196
|
+
for(int m = 0; m < 4; m++) {
|
|
1197
|
+
const int16_t *bsums = a_ptr[l].bsums + (sb * 8) + (m * 4) - ((sb % 2) * 6);
|
|
1198
|
+
for(int j = 0; j < ncols_interleaved; j++) {
|
|
1199
|
+
int mins_prod = ((mins[j * 2] >> 4) * bsums[0] + (mins[(j * 2)+ 1] >> 4) * bsums[1]);
|
|
1200
|
+
sum_minf[m][j] += (mins_prod) * GGML_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d[m];
|
|
1201
|
+
}
|
|
1202
|
+
}
|
|
1203
|
+
}
|
|
1204
|
+
}
|
|
1205
|
+
|
|
1206
|
+
for (int m = 0; m < 4; m++) {
|
|
1207
|
+
for (int j = 0; j < ncols_interleaved; j++) {
|
|
1208
|
+
s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j] - sum_minf[m][j];
|
|
1209
|
+
}
|
|
1210
|
+
}
|
|
1211
|
+
}
|
|
1212
|
+
}
|
|
1213
|
+
}
|
|
1214
|
+
|
|
1215
|
+
|
|
715
1216
|
void ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
|
716
1217
|
const int qk = QK8_0;
|
|
717
1218
|
const int nb = n / qk;
|
|
@@ -759,9 +1260,157 @@ void ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs
|
|
|
759
1260
|
}
|
|
760
1261
|
}
|
|
761
1262
|
}
|
|
762
|
-
for (int m = 0; m < 4; m++) {
|
|
763
|
-
for (int j = 0; j < ncols_interleaved; j++)
|
|
764
|
-
s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
|
|
1263
|
+
for (int m = 0; m < 4; m++) {
|
|
1264
|
+
for (int j = 0; j < ncols_interleaved; j++)
|
|
1265
|
+
s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
|
|
1266
|
+
}
|
|
1267
|
+
}
|
|
1268
|
+
}
|
|
1269
|
+
}
|
|
1270
|
+
}
|
|
1271
|
+
|
|
1272
|
+
void ggml_gemm_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
|
1273
|
+
const int qk = QK8_0;
|
|
1274
|
+
const int nb = n / qk;
|
|
1275
|
+
const int ncols_interleaved = 8;
|
|
1276
|
+
const int blocklen = 8;
|
|
1277
|
+
|
|
1278
|
+
assert(n % qk == 0);
|
|
1279
|
+
assert(nr % 4 == 0);
|
|
1280
|
+
assert(nc % ncols_interleaved == 0);
|
|
1281
|
+
|
|
1282
|
+
float sumf[4][8];
|
|
1283
|
+
int sumi;
|
|
1284
|
+
|
|
1285
|
+
for (int y = 0; y < nr / 4; y++) {
|
|
1286
|
+
const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
|
|
1287
|
+
for (int x = 0; x < nc / ncols_interleaved; x++) {
|
|
1288
|
+
const block_iq4_nlx8 * b_ptr = (const block_iq4_nlx8 *) vx + (x * nb);
|
|
1289
|
+
for (int m = 0; m < 4; m++) {
|
|
1290
|
+
for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
|
|
1291
|
+
}
|
|
1292
|
+
for (int l = 0; l < nb; l++) {
|
|
1293
|
+
for (int k = 0; k < (qk / (2 * blocklen)); k++) {
|
|
1294
|
+
for (int m = 0; m < 4; m++) {
|
|
1295
|
+
for (int j = 0; j < ncols_interleaved; j++) {
|
|
1296
|
+
sumi = 0;
|
|
1297
|
+
for (int i = 0; i < blocklen; ++i) {
|
|
1298
|
+
const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
|
|
1299
|
+
const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
|
|
1300
|
+
sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
|
|
1301
|
+
(v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4]));
|
|
1302
|
+
}
|
|
1303
|
+
sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
|
|
1304
|
+
}
|
|
1305
|
+
}
|
|
1306
|
+
}
|
|
1307
|
+
}
|
|
1308
|
+
for (int m = 0; m < 4; m++) {
|
|
1309
|
+
for (int j = 0; j < ncols_interleaved; j++)
|
|
1310
|
+
s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
|
|
1311
|
+
}
|
|
1312
|
+
}
|
|
1313
|
+
}
|
|
1314
|
+
}
|
|
1315
|
+
|
|
1316
|
+
void ggml_gemm_q8_0_4x4_q8_0_generic(int n,
|
|
1317
|
+
float * GGML_RESTRICT s,
|
|
1318
|
+
size_t bs,
|
|
1319
|
+
const void * GGML_RESTRICT vx,
|
|
1320
|
+
const void * GGML_RESTRICT vy,
|
|
1321
|
+
int nr,
|
|
1322
|
+
int nc) {
|
|
1323
|
+
const int qk = QK8_0;
|
|
1324
|
+
const int nb = n / qk;
|
|
1325
|
+
const int ncols_interleaved = 4;
|
|
1326
|
+
const int blocklen = 4;
|
|
1327
|
+
|
|
1328
|
+
assert(n % qk == 0);
|
|
1329
|
+
assert(nr % 4 == 0);
|
|
1330
|
+
assert(nc % ncols_interleaved == 0);
|
|
1331
|
+
|
|
1332
|
+
float sumf[4][4];
|
|
1333
|
+
int sumi;
|
|
1334
|
+
|
|
1335
|
+
for (int y = 0; y < nr / 4; y++) {
|
|
1336
|
+
const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
|
|
1337
|
+
for (int x = 0; x < nc / ncols_interleaved; x++) {
|
|
1338
|
+
const block_q8_0x4 * b_ptr = (const block_q8_0x4 *) vx + (x * nb);
|
|
1339
|
+
for (int m = 0; m < 4; m++) {
|
|
1340
|
+
for (int j = 0; j < ncols_interleaved; j++) {
|
|
1341
|
+
sumf[m][j] = 0.0;
|
|
1342
|
+
}
|
|
1343
|
+
}
|
|
1344
|
+
for (int l = 0; l < nb; l++) {
|
|
1345
|
+
for (int k = 0; k < (qk / blocklen); k++) {
|
|
1346
|
+
for (int m = 0; m < 4; m++) {
|
|
1347
|
+
for (int j = 0; j < ncols_interleaved; j++) {
|
|
1348
|
+
sumi = 0;
|
|
1349
|
+
for (int i = 0; i < blocklen; ++i) {
|
|
1350
|
+
const int v0 = b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i];
|
|
1351
|
+
sumi += v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i];
|
|
1352
|
+
}
|
|
1353
|
+
sumf[m][j] +=
|
|
1354
|
+
sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
|
|
1355
|
+
}
|
|
1356
|
+
}
|
|
1357
|
+
}
|
|
1358
|
+
}
|
|
1359
|
+
for (int m = 0; m < 4; m++) {
|
|
1360
|
+
for (int j = 0; j < ncols_interleaved; j++) {
|
|
1361
|
+
s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
|
|
1362
|
+
}
|
|
1363
|
+
}
|
|
1364
|
+
}
|
|
1365
|
+
}
|
|
1366
|
+
}
|
|
1367
|
+
|
|
1368
|
+
void ggml_gemm_q8_0_4x8_q8_0_generic(int n,
|
|
1369
|
+
float * GGML_RESTRICT s,
|
|
1370
|
+
size_t bs,
|
|
1371
|
+
const void * GGML_RESTRICT vx,
|
|
1372
|
+
const void * GGML_RESTRICT vy,
|
|
1373
|
+
int nr,
|
|
1374
|
+
int nc) {
|
|
1375
|
+
const int qk = QK8_0;
|
|
1376
|
+
const int nb = n / qk;
|
|
1377
|
+
const int ncols_interleaved = 4;
|
|
1378
|
+
const int blocklen = 8;
|
|
1379
|
+
|
|
1380
|
+
assert(n % qk == 0);
|
|
1381
|
+
assert(nr % 4 == 0);
|
|
1382
|
+
assert(nc % ncols_interleaved == 0);
|
|
1383
|
+
|
|
1384
|
+
float sumf[4][4];
|
|
1385
|
+
int sumi;
|
|
1386
|
+
|
|
1387
|
+
for (int y = 0; y < nr / 4; y++) {
|
|
1388
|
+
const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
|
|
1389
|
+
for (int x = 0; x < nc / ncols_interleaved; x++) {
|
|
1390
|
+
const block_q8_0x4 * b_ptr = (const block_q8_0x4 *) vx + (x * nb);
|
|
1391
|
+
for (int m = 0; m < 4; m++) {
|
|
1392
|
+
for (int j = 0; j < ncols_interleaved; j++) {
|
|
1393
|
+
sumf[m][j] = 0.0;
|
|
1394
|
+
}
|
|
1395
|
+
}
|
|
1396
|
+
for (int l = 0; l < nb; l++) {
|
|
1397
|
+
for (int k = 0; k < (qk / blocklen); k++) {
|
|
1398
|
+
for (int m = 0; m < 4; m++) {
|
|
1399
|
+
for (int j = 0; j < ncols_interleaved; j++) {
|
|
1400
|
+
sumi = 0;
|
|
1401
|
+
for (int i = 0; i < blocklen; ++i) {
|
|
1402
|
+
const int v0 = b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i];
|
|
1403
|
+
sumi += v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i];
|
|
1404
|
+
}
|
|
1405
|
+
sumf[m][j] +=
|
|
1406
|
+
sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
|
|
1407
|
+
}
|
|
1408
|
+
}
|
|
1409
|
+
}
|
|
1410
|
+
}
|
|
1411
|
+
for (int m = 0; m < 4; m++) {
|
|
1412
|
+
for (int j = 0; j < ncols_interleaved; j++) {
|
|
1413
|
+
s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
|
|
765
1414
|
}
|
|
766
1415
|
}
|
|
767
1416
|
}
|
|
@@ -770,6 +1419,23 @@ void ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs
|
|
|
770
1419
|
|
|
771
1420
|
} // extern "C"
|
|
772
1421
|
|
|
1422
|
+
// Packs four block_q8_0 blocks (in[0..3]) into a single block_q8_0x4.
// The four deltas are copied as-is; the quants are interleaved round-robin
// across the four source blocks, blck_size_interleave bytes at a time.
static block_q8_0x4 make_block_q8_0x4(block_q8_0 * in, unsigned int blck_size_interleave) {
    block_q8_0x4 out;

    int lane = 0;
    while (lane < 4) {
        out.d[lane] = in[lane].d;
        ++lane;
    }

    const int n_chunks = QK8_0 * 4 / blck_size_interleave;
    for (int chunk = 0; chunk < n_chunks; ++chunk) {
        const int which_block = chunk % 4;                             // source block for this chunk
        const int read_at     = (chunk / 4) * blck_size_interleave;    // offset inside that block's qs
        const int write_at    = chunk * blck_size_interleave;          // offset inside out.qs
        memcpy(out.qs + write_at, in[which_block].qs + read_at, blck_size_interleave);
    }

    return out;
}
|
|
1438
|
+
|
|
773
1439
|
static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int blck_size_interleave) {
|
|
774
1440
|
block_q4_0x4 out;
|
|
775
1441
|
|
|
@@ -915,6 +1581,50 @@ static block_q4_Kx8 make_block_q4_Kx8(block_q4_K * in, unsigned int blck_size_in
|
|
|
915
1581
|
return out;
|
|
916
1582
|
}
|
|
917
1583
|
|
|
1584
|
+
// Packs eight block_q2_K blocks (in[0..7]) into a single block_q2_Kx8.
//
//   in                    source blocks; entries 0..7 are read
//   blck_size_interleave  number of quant bytes taken from one source block
//                         per interleave step
//
// The quant copy moves exactly blck_size_interleave bytes per step, so the
// copy width always agrees with the stride used for the source/destination
// offsets (for the value 8 this is the same 8-byte copy as before).
static block_q2_Kx8 make_block_q2_Kx8(block_q2_K * in, unsigned int blck_size_interleave) {
    block_q2_Kx8 out;

    // Delta (scale) and dmin values of the eight Q2_K structures are copied
    // onto the output interleaved structure.
    for (int i = 0; i < 8; i++) {
        out.d[i] = in[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.d;
    }

    for (int i = 0; i < 8; i++) {
        out.dmin[i] = in[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.dmin;
    }

    const int end = QK_K * 2 / blck_size_interleave;

    // Interleave Q2_K quants round-robin over the eight source blocks,
    // blck_size_interleave bytes at a time.
    for (int i = 0; i < end; ++i) {
        int src_id     = i % 8;
        int src_offset = (i / 8) * blck_size_interleave;
        int dst_offset = i * blck_size_interleave;

        memcpy(&out.qs[dst_offset], &in[src_id].qs[src_offset], blck_size_interleave);
    }

    // Rearrange the scales/mins bytes.
    // Each Q2_K block carries 16 bytes, each packing a 4-bit scale and a 4-bit min.
    // The output Q2_Kx8 structure has 128 bytes for storing scales and mins.
    // Every 16-byte group of the output holds the bytes of two consecutive
    // sub-blocks from each of the eight source blocks: e.g. the first 16 bytes
    // hold scales[0] and scales[1] of in[0], then of in[1], ... up to in[7].
    for (int i = 0; i < 128; i++) {
        // Index for selecting which Q2_K super block
        int src1 = (i % 16) / 2;
        // Index for selecting the scale byte inside that block
        int src2 = ((i / 16) * 2) + (i % 2);

        out.scales[i] = in[src1].scales[src2];
    }

    return out;
}
|
|
1627
|
+
|
|
918
1628
|
static int repack_q4_0_to_q4_0_4_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
|
|
919
1629
|
GGML_ASSERT(t->type == GGML_TYPE_Q4_0);
|
|
920
1630
|
GGML_ASSERT(interleave_block == 4 || interleave_block == 8);
|
|
@@ -945,9 +1655,10 @@ static int repack_q4_0_to_q4_0_4_bl(struct ggml_tensor * t, int interleave_block
|
|
|
945
1655
|
|
|
946
1656
|
GGML_UNUSED(data_size);
|
|
947
1657
|
}
|
|
1658
|
+
|
|
948
1659
|
static int repack_q4_K_to_q4_K_8_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
|
|
949
1660
|
GGML_ASSERT(t->type == GGML_TYPE_Q4_K);
|
|
950
|
-
GGML_ASSERT(interleave_block == 8);
|
|
1661
|
+
GGML_ASSERT(interleave_block == 8 || interleave_block == 4);
|
|
951
1662
|
constexpr int nrows_interleaved = 8;
|
|
952
1663
|
|
|
953
1664
|
block_q4_Kx8 * dst = (block_q4_Kx8*)t->data;
|
|
@@ -976,6 +1687,37 @@ static int repack_q4_K_to_q4_K_8_bl(struct ggml_tensor * t, int interleave_block
|
|
|
976
1687
|
GGML_UNUSED(data_size);
|
|
977
1688
|
}
|
|
978
1689
|
|
|
1690
|
+
static int repack_q2_K_to_q2_K_8_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
|
|
1691
|
+
GGML_ASSERT(t->type == GGML_TYPE_Q2_K);
|
|
1692
|
+
GGML_ASSERT(interleave_block == 8);
|
|
1693
|
+
constexpr int nrows_interleaved = 8;
|
|
1694
|
+
|
|
1695
|
+
block_q2_Kx8 * dst = (block_q2_Kx8*)t->data;
|
|
1696
|
+
const block_q2_K * src = (const block_q2_K*) data;
|
|
1697
|
+
block_q2_K dst_tmp[8];
|
|
1698
|
+
int nrow = ggml_nrows(t);
|
|
1699
|
+
int nblocks = t->ne[0] / QK_K;
|
|
1700
|
+
|
|
1701
|
+
GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q2_K));
|
|
1702
|
+
|
|
1703
|
+
if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
|
|
1704
|
+
return -1;
|
|
1705
|
+
}
|
|
1706
|
+
|
|
1707
|
+
for (int b = 0; b < nrow; b += nrows_interleaved) {
|
|
1708
|
+
for (int64_t x = 0; x < nblocks; x++) {
|
|
1709
|
+
for (int i = 0; i < nrows_interleaved; i++ ) {
|
|
1710
|
+
dst_tmp[i] = src[x + i * nblocks];
|
|
1711
|
+
}
|
|
1712
|
+
*dst++ = make_block_q2_Kx8(dst_tmp, interleave_block);
|
|
1713
|
+
}
|
|
1714
|
+
src += nrows_interleaved * nblocks;
|
|
1715
|
+
}
|
|
1716
|
+
return 0;
|
|
1717
|
+
|
|
1718
|
+
GGML_UNUSED(data_size);
|
|
1719
|
+
}
|
|
1720
|
+
|
|
979
1721
|
static int repack_q4_0_to_q4_0_8_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
|
|
980
1722
|
GGML_ASSERT(t->type == GGML_TYPE_Q4_0);
|
|
981
1723
|
GGML_ASSERT(interleave_block == 8);
|
|
@@ -1007,6 +1749,38 @@ static int repack_q4_0_to_q4_0_8_bl(struct ggml_tensor * t, int interleave_block
|
|
|
1007
1749
|
GGML_UNUSED(data_size);
|
|
1008
1750
|
}
|
|
1009
1751
|
|
|
1752
|
+
static int repack_q8_0_to_q8_0_4_bl(struct ggml_tensor * t,
|
|
1753
|
+
int interleave_block,
|
|
1754
|
+
const void * GGML_RESTRICT data,
|
|
1755
|
+
size_t data_size) {
|
|
1756
|
+
GGML_ASSERT(t->type == GGML_TYPE_Q8_0);
|
|
1757
|
+
GGML_ASSERT(interleave_block == 4 || interleave_block == 8);
|
|
1758
|
+
constexpr int nrows_interleaved = 4;
|
|
1759
|
+
|
|
1760
|
+
block_q8_0x4 * dst = (block_q8_0x4 *) t->data;
|
|
1761
|
+
const block_q8_0 * src = (const block_q8_0 *) data;
|
|
1762
|
+
block_q8_0 dst_tmp[4];
|
|
1763
|
+
int nrow = ggml_nrows(t);
|
|
1764
|
+
int nblocks = t->ne[0] / QK8_0;
|
|
1765
|
+
|
|
1766
|
+
GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q8_0));
|
|
1767
|
+
|
|
1768
|
+
if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
|
|
1769
|
+
return -1;
|
|
1770
|
+
}
|
|
1771
|
+
|
|
1772
|
+
for (int b = 0; b < nrow; b += nrows_interleaved) {
|
|
1773
|
+
for (int64_t x = 0; x < nblocks; x++) {
|
|
1774
|
+
for (int i = 0; i < nrows_interleaved; i++) {
|
|
1775
|
+
dst_tmp[i] = src[x + i * nblocks];
|
|
1776
|
+
}
|
|
1777
|
+
*dst++ = make_block_q8_0x4(dst_tmp, interleave_block);
|
|
1778
|
+
}
|
|
1779
|
+
src += nrows_interleaved * nblocks;
|
|
1780
|
+
}
|
|
1781
|
+
return 0;
|
|
1782
|
+
}
|
|
1783
|
+
|
|
1010
1784
|
static block_iq4_nlx4 make_block_iq4_nlx4(block_iq4_nl * in, unsigned int blck_size_interleave) {
|
|
1011
1785
|
block_iq4_nlx4 out;
|
|
1012
1786
|
|
|
@@ -1044,15 +1818,16 @@ static block_iq4_nlx4 make_block_iq4_nlx4(block_iq4_nl * in, unsigned int blck_s
|
|
|
1044
1818
|
|
|
1045
1819
|
static int repack_iq4_nl_to_iq4_nl_4_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
|
|
1046
1820
|
GGML_ASSERT(t->type == GGML_TYPE_IQ4_NL);
|
|
1047
|
-
//GGML_ASSERT(interleave_block == 4 || interleave_block == 8);
|
|
1048
1821
|
GGML_ASSERT(interleave_block == 4);
|
|
1049
1822
|
|
|
1050
|
-
|
|
1051
|
-
|
|
1823
|
+
const block_iq4_nl * src = (const block_iq4_nl *)data;
|
|
1824
|
+
block_iq4_nlx4 * dst = ( block_iq4_nlx4 *)t->data;
|
|
1825
|
+
|
|
1052
1826
|
block_iq4_nl dst_tmp[4];
|
|
1827
|
+
|
|
1053
1828
|
int nrow = ggml_nrows(t);
|
|
1054
1829
|
int nrows_interleaved = 4;
|
|
1055
|
-
int nblocks = t->ne[0] /
|
|
1830
|
+
int nblocks = t->ne[0] / QK4_NL;
|
|
1056
1831
|
|
|
1057
1832
|
GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_iq4_nl));
|
|
1058
1833
|
|
|
@@ -1074,6 +1849,63 @@ static int repack_iq4_nl_to_iq4_nl_4_bl(struct ggml_tensor * t, int interleave_b
|
|
|
1074
1849
|
GGML_UNUSED(data_size);
|
|
1075
1850
|
}
|
|
1076
1851
|
|
|
1852
|
+
// Packs eight block_iq4_nl blocks (in[0..7]) into a single block_iq4_nlx8.
// The eight deltas are copied as-is; the quants are interleaved round-robin
// across the source blocks in 8-byte pieces. Only blck_size_interleave == 8
// is implemented; any other value trips GGML_ASSERT(false).
static block_iq4_nlx8 make_block_iq4_nlx8(block_iq4_nl * in, unsigned int blck_size_interleave) {
    block_iq4_nlx8 out;

    int lane = 0;
    while (lane < 8) {
        out.d[lane] = in[lane].d;
        ++lane;
    }

    const int end = QK4_NL * 4 / blck_size_interleave;

    if (blck_size_interleave != 8) {
        GGML_ASSERT(false);
    } else {
        for (int chunk = 0; chunk < end; ++chunk) {
            const int which_block = chunk % 8;                            // source block for this chunk
            const int read_at     = (chunk / 8) * blck_size_interleave;   // offset inside that block's qs
            const int write_at    = chunk * blck_size_interleave;         // offset inside out.qs

            memcpy(out.qs + write_at, in[which_block].qs + read_at, sizeof(uint64_t));
        }
    }

    return out;
}
|
|
1875
|
+
|
|
1876
|
+
static int repack_iq4_nl_to_iq4_nl_8_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
|
|
1877
|
+
GGML_ASSERT(t->type == GGML_TYPE_IQ4_NL);
|
|
1878
|
+
GGML_ASSERT(interleave_block == 8);
|
|
1879
|
+
|
|
1880
|
+
const block_iq4_nl * src = (const block_iq4_nl *)data;
|
|
1881
|
+
block_iq4_nlx8 * dst = ( block_iq4_nlx8 *)t->data;
|
|
1882
|
+
|
|
1883
|
+
block_iq4_nl dst_tmp[8];
|
|
1884
|
+
|
|
1885
|
+
int nrow = ggml_nrows(t);
|
|
1886
|
+
int nrows_interleaved = 8;
|
|
1887
|
+
int nblocks = t->ne[0] / QK4_NL;
|
|
1888
|
+
|
|
1889
|
+
GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_iq4_nl));
|
|
1890
|
+
|
|
1891
|
+
if (t->ne[1] % nrows_interleaved != 0) {
|
|
1892
|
+
return -1;
|
|
1893
|
+
}
|
|
1894
|
+
|
|
1895
|
+
for (int b = 0; b < nrow; b += nrows_interleaved) {
|
|
1896
|
+
for (int64_t x = 0; x < nblocks; x++) {
|
|
1897
|
+
for (int i = 0; i < nrows_interleaved; i++) {
|
|
1898
|
+
dst_tmp[i] = src[x + i * nblocks];
|
|
1899
|
+
}
|
|
1900
|
+
*dst++ = make_block_iq4_nlx8(dst_tmp, interleave_block);
|
|
1901
|
+
}
|
|
1902
|
+
src += nrows_interleaved * nblocks;
|
|
1903
|
+
}
|
|
1904
|
+
return 0;
|
|
1905
|
+
|
|
1906
|
+
GGML_UNUSED(data_size);
|
|
1907
|
+
}
|
|
1908
|
+
|
|
1077
1909
|
namespace ggml::cpu::repack {
|
|
1078
1910
|
// repack
|
|
1079
1911
|
template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS>
|
|
@@ -1096,6 +1928,14 @@ template <> int repack<block_q4_K, 8, 8>(struct ggml_tensor * t, const void * da
|
|
|
1096
1928
|
return repack_q4_K_to_q4_K_8_bl(t, 8, data, data_size);
|
|
1097
1929
|
}
|
|
1098
1930
|
|
|
1931
|
+
// Q4_K, interleave size 4, 8 columns: forwards to the Q4_K x8 repacker with a 4-byte interleave.
template <>
int repack<block_q4_K, 4, 8>(struct ggml_tensor * t, const void * data, size_t data_size) {
    return repack_q4_K_to_q4_K_8_bl(t, 4, data, data_size);
}

// Q2_K, interleave size 8, 8 columns: forwards to the Q2_K x8 repacker.
template <>
int repack<block_q2_K, 8, 8>(struct ggml_tensor * t, const void * data, size_t data_size) {
    return repack_q2_K_to_q2_K_8_bl(t, 8, data, data_size);
}

// IQ4_NL, interleave size 4, 4 columns: forwards to the IQ4_NL x4 repacker.
template <>
int repack<block_iq4_nl, 4, 4>(struct ggml_tensor * t, const void * data, size_t data_size) {
    return repack_iq4_nl_to_iq4_nl_4_bl(t, 4, data, data_size);
}
|
|
@@ -1105,6 +1945,18 @@ template <> int repack<block_iq4_nl, 4, 4>(struct ggml_tensor * t, const void *
|
|
|
1105
1945
|
// return repack_iq4_nl_to_iq4_nl_4_bl(t, 8, data, data_size);
|
|
1106
1946
|
//}
|
|
1107
1947
|
|
|
1948
|
+
// IQ4_NL, interleave size 8, 8 columns: forwards to the IQ4_NL x8 repacker.
template <>
int repack<block_iq4_nl, 8, 8>(struct ggml_tensor * t, const void * data, size_t data_size) {
    return repack_iq4_nl_to_iq4_nl_8_bl(t, 8, data, data_size);
}

// Q8_0, interleave size 4, 4 columns: forwards to the Q8_0 x4 repacker with a 4-byte interleave.
template <>
int repack<block_q8_0, 4, 4>(struct ggml_tensor * t, const void * data, size_t data_size) {
    return repack_q8_0_to_q8_0_4_bl(t, 4, data, data_size);
}

// Q8_0, interleave size 8, 4 columns: forwards to the Q8_0 x4 repacker with an 8-byte interleave.
template <>
int repack<block_q8_0, 8, 4>(struct ggml_tensor * t, const void * data, size_t data_size) {
    return repack_q8_0_to_q8_0_4_bl(t, 8, data, data_size);
}
|
|
1959
|
+
|
|
1108
1960
|
// gemv
|
|
1109
1961
|
template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PARAM_TYPE>
|
|
1110
1962
|
void gemv(int, float *, size_t, const void *, const void *, int, int);
|
|
@@ -1121,14 +1973,34 @@ template <> void gemv<block_q4_0, 8, 8, GGML_TYPE_Q8_0>(int n, float * s, size_t
|
|
|
1121
1973
|
ggml_gemv_q4_0_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
|
|
1122
1974
|
}
|
|
1123
1975
|
|
|
1976
|
+
// Each specialization below binds one <block type, interleave size, columns,
// activation type> combination to the matching ggml_gemv_* kernel and passes
// all arguments through unchanged.

// Q4_K, interleave 4, 8 columns, Q8_K activations.
template <>
void gemv<block_q4_K, 4, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
    ggml_gemv_q4_K_8x4_q8_K(n, s, bs, vx, vy, nr, nc);
}

// Q4_K, interleave 8, 8 columns, Q8_K activations.
template <>
void gemv<block_q4_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
    ggml_gemv_q4_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
}

// Q2_K, interleave 8, 8 columns, Q8_K activations.
template <>
void gemv<block_q2_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
    ggml_gemv_q2_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
}

// IQ4_NL, interleave 4, 4 columns, Q8_0 activations.
template <>
void gemv<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
    ggml_gemv_iq4_nl_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
}

// IQ4_NL, interleave 8, 8 columns, Q8_0 activations.
template <>
void gemv<block_iq4_nl, 8, 8, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
    ggml_gemv_iq4_nl_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
}

// Q8_0, interleave 4, 4 columns, Q8_0 activations.
template <>
void gemv<block_q8_0, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
    ggml_gemv_q8_0_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
}

// Q8_0, interleave 8, 4 columns, Q8_0 activations.
template <>
void gemv<block_q8_0, 8, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
    ggml_gemv_q8_0_4x8_q8_0(n, s, bs, vx, vy, nr, nc);
}
|
|
2003
|
+
|
|
1132
2004
|
// gemm
|
|
1133
2005
|
template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PARAM_TYPE>
|
|
1134
2006
|
void gemm(int, float *, size_t, const void *, const void *, int, int);
|
|
@@ -1141,6 +2013,10 @@ template <> void gemm<block_q4_0, 8, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t
|
|
|
1141
2013
|
ggml_gemm_q4_0_4x8_q8_0(n, s, bs, vx, vy, nr, nc);
|
|
1142
2014
|
}
|
|
1143
2015
|
|
|
2016
|
+
// Q4_K, interleave 4, 8 columns, Q8_K activations: forwards to ggml_gemm_q4_K_8x4_q8_K.
template <>
void gemm<block_q4_K, 4, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
    ggml_gemm_q4_K_8x4_q8_K(n, s, bs, vx, vy, nr, nc);
}

// Q4_0, interleave 8, 8 columns, Q8_0 activations: forwards to ggml_gemm_q4_0_8x8_q8_0.
template <>
void gemm<block_q4_0, 8, 8, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
    ggml_gemm_q4_0_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
}
|
|
@@ -1149,10 +2025,26 @@ template <> void gemm<block_q4_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t
|
|
|
1149
2025
|
ggml_gemm_q4_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
|
|
1150
2026
|
}
|
|
1151
2027
|
|
|
2028
|
+
// Each specialization below binds one <block type, interleave size, columns,
// activation type> combination to the matching ggml_gemm_* kernel and passes
// all arguments through unchanged.

// Q2_K, interleave 8, 8 columns, Q8_K activations.
template <>
void gemm<block_q2_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
    ggml_gemm_q2_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
}

// IQ4_NL, interleave 4, 4 columns, Q8_0 activations.
template <>
void gemm<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
    ggml_gemm_iq4_nl_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
}

// IQ4_NL, interleave 8, 8 columns, Q8_0 activations.
template <>
void gemm<block_iq4_nl, 8, 8, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
    ggml_gemm_iq4_nl_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
}

// Q8_0, interleave 4, 4 columns, Q8_0 activations.
template <>
void gemm<block_q8_0, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
    ggml_gemm_q8_0_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
}

// Q8_0, interleave 8, 4 columns, Q8_0 activations.
template <>
void gemm<block_q8_0, 8, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
    ggml_gemm_q8_0_4x8_q8_0(n, s, bs, vx, vy, nr, nc);
}
|
|
2047
|
+
|
|
1156
2048
|
class tensor_traits_base : public ggml::cpu::tensor_traits {
|
|
1157
2049
|
public:
|
|
1158
2050
|
virtual int repack(struct ggml_tensor * t, const void * data, size_t data_size) = 0;
|
|
@@ -1204,6 +2096,55 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PAR
|
|
|
1204
2096
|
return false;
|
|
1205
2097
|
}
|
|
1206
2098
|
|
|
2099
|
+
// Computes one tile of the mul_mat result held in op.
//
//   params                 compute params; params->wdata holds the src1 rows
//                          already converted to PARAM_TYPE, params->wsize its size
//   op                     the mul_mat node; op->src[0] / op->src[1] are the
//                          operands and op itself is the destination
//   src0_start, src0_end   half-open range of src0 rows (output columns) to produce
//   src1_start, src1_end   half-open range of flattened src1 rows to consume
//
// Rows are processed with gemm in multiples of four; whatever is left over
// (fewer than four rows) goes through gemv one row at a time.
void forward_mul_mat_one_chunk(ggml_compute_params * params,
                               ggml_tensor *         op,
                               int64_t               src0_start,
                               int64_t               src0_end,
                               int64_t               src1_start,
                               int64_t               src1_end) {
    // NOTE(review): GGML_TENSOR_BINARY_OP_LOCALS presumably derives the
    // neXX / nbXX locals used below from variables named src0, src1 and dst,
    // so these three names are kept as-is — confirm against the macro.
    ggml_tensor *       dst  = op;
    const ggml_tensor * src0 = op->src[0];
    const ggml_tensor * src1 = op->src[1];

    GGML_TENSOR_BINARY_OP_LOCALS

    // Byte size of one converted src1 row inside wdata.
    const size_t src1_col_stride = ggml_row_size(PARAM_TYPE, ne10);

    GGML_ASSERT(ne03 == 1 && ne13 == 1);
    GGML_ASSERT(ne12 % ne02 == 0);

    // Split the flattened src1 start index into a plane index (i12) and a
    // row index inside that plane (i11).
    const int64_t i12 = src1_start / ne1;
    const int64_t i11 = src1_start - i12 * ne1;

    // Determine batch index: src0 plane used for this src1 plane
    // (ne12 / ne02 consecutive src1 planes map onto one src0 plane).
    const int64_t i02 = i12 / (ne12 / ne02);

    const char * src0_ptr = (const char *) src0->data + i02 * nb02;
    const char * src1_ptr = (const char *) params->wdata + (i11 + i12 * ne11) * src1_col_stride;
    char *       dst_ptr  = (char *) dst->data + (i11 * nb1 + i12 * nb2);

    const int64_t nrows = src1_end - src1_start;
    const int64_t ncols = src0_end - src0_start;

    // The rows we are about to read must lie inside the work buffer.
    GGML_ASSERT(src1_ptr + src1_col_stride * nrows <= (const char *) params->wdata + params->wsize);

    const int64_t gemm_rows  = nrows - (nrows % 4);           // largest multiple of 4 not above nrows
    const char *  src0_chunk = src0_ptr + src0_start * nb01;  // first src0 row of this tile

    // If there are more than three rows in src1, use gemm; otherwise, use gemv.
    if (nrows > 3) {
        gemm<BLOC_TYPE, INTER_SIZE, NB_COLS, PARAM_TYPE>(ne00, (float *) (dst_ptr) + src0_start, nb1 / nb0,
                                                         src0_chunk, src1_ptr,
                                                         gemm_rows, ncols);
    }

    // Remaining rows (nrows % 4 of them) are handled one by one.
    for (int iter = gemm_rows; iter < nrows; iter++) {
        gemv<BLOC_TYPE, INTER_SIZE, NB_COLS, PARAM_TYPE>(ne00, (float *) (dst_ptr + (iter * nb1)) + src0_start,
                                                         ne01, src0_chunk,
                                                         src1_ptr + (src1_col_stride * iter), 1 /* nrows */, ncols);
    }
}
|
|
2147
|
+
|
|
1207
2148
|
void forward_mul_mat(ggml_compute_params * params, ggml_tensor * op) {
|
|
1208
2149
|
const ggml_tensor * src0 = op->src[0];
|
|
1209
2150
|
const ggml_tensor * src1 = op->src[1];
|
|
@@ -1225,6 +2166,12 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PAR
|
|
|
1225
2166
|
GGML_ASSERT(nb1 <= nb2);
|
|
1226
2167
|
GGML_ASSERT(nb2 <= nb3);
|
|
1227
2168
|
|
|
2169
|
+
// TODO: General batched mul mat for 4D tensors
|
|
2170
|
+
// Currently only supports 3D tensors
|
|
2171
|
+
GGML_ASSERT(ne03 == 1);
|
|
2172
|
+
GGML_ASSERT(ne13 == 1);
|
|
2173
|
+
GGML_ASSERT(ne3 == 1);
|
|
2174
|
+
|
|
1228
2175
|
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
|
1229
2176
|
|
|
1230
2177
|
GGML_ASSERT(ggml_n_dims(op->src[0]) == 2);
|
|
@@ -1232,46 +2179,102 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PAR
|
|
|
1232
2179
|
|
|
1233
2180
|
char * wdata = static_cast<char *>(params->wdata);
|
|
1234
2181
|
const size_t nbw1 = ggml_row_size(PARAM_TYPE, ne10);
|
|
2182
|
+
const size_t nbw2 = nbw1 * ne11;
|
|
1235
2183
|
|
|
1236
|
-
assert(params->wsize >=
|
|
2184
|
+
assert(params->wsize >= nbw2 * ne12);
|
|
1237
2185
|
|
|
1238
2186
|
const ggml_from_float_t from_float = ggml_get_type_traits_cpu(PARAM_TYPE)->from_float;
|
|
1239
2187
|
|
|
1240
|
-
|
|
1241
|
-
|
|
1242
|
-
|
|
1243
|
-
|
|
2188
|
+
// INFO: Quantization is done in planes to avoid extra complexity in chunking.
|
|
2189
|
+
// Flattening dimensions not multiple of INTER_SIZE would require extra handling depending on how
|
|
2190
|
+
// the planes are broadcast.
|
|
2191
|
+
for (int64_t i12 = 0; i12 < ne12; i12++) {
|
|
2192
|
+
char * data_ptr = (char *) src1->data + i12 * nb12;
|
|
2193
|
+
char * wdata_ptr = wdata + i12 * nbw2;
|
|
2194
|
+
|
|
2195
|
+
for (int64_t i11 = ith * 4; i11 < ne11 - ne11 % 4; i11 += nth * 4) {
|
|
2196
|
+
ggml_quantize_mat_t<INTER_SIZE, PARAM_TYPE>((float *) (data_ptr + i11 * nb11),
|
|
2197
|
+
(void *) (wdata_ptr + i11 * nbw1), 4, ne10);
|
|
2198
|
+
}
|
|
1244
2199
|
|
|
1245
|
-
|
|
1246
|
-
|
|
1247
|
-
|
|
2200
|
+
const int64_t i11_processed = ne11 - ne11 % 4;
|
|
2201
|
+
for (int64_t i11 = i11_processed + ith; i11 < ne11; i11 += nth) {
|
|
2202
|
+
from_float((float *) (data_ptr + i11 * nb11), (void *) (wdata_ptr + i11 * nbw1), ne10);
|
|
2203
|
+
}
|
|
1248
2204
|
}
|
|
1249
2205
|
|
|
1250
|
-
|
|
2206
|
+
// disable for NUMA
|
|
2207
|
+
const bool disable_chunking = ggml_is_numa();
|
|
1251
2208
|
|
|
1252
|
-
|
|
1253
|
-
const
|
|
1254
|
-
|
|
1255
|
-
|
|
1256
|
-
|
|
1257
|
-
|
|
1258
|
-
|
|
1259
|
-
|
|
2209
|
+
// 4x chunks per thread
|
|
2210
|
+
const int64_t nr0 = ggml_nrows(op->src[0]);
|
|
2211
|
+
|
|
2212
|
+
int nth_scaled = nth * 4;
|
|
2213
|
+
int64_t chunk_size0 = (nr0 + nth_scaled - 1) / nth_scaled;
|
|
2214
|
+
int64_t nchunk0 = (nr0 + chunk_size0 - 1) / chunk_size0;
|
|
2215
|
+
|
|
2216
|
+
// src1 is chunked only by full planes.
|
|
2217
|
+
// When we flatten we need to address dimensions not multiple of the q8 INTER_SIZE
|
|
2218
|
+
// to route them through GEMV.
|
|
2219
|
+
// nchunk1 = ne12 also avoids messing the chunking for models with no 3d tensors
|
|
2220
|
+
// to avoid affecting their performance
|
|
2221
|
+
int64_t nchunk1 = ne12;
|
|
2222
|
+
|
|
2223
|
+
// Ensure minimum chunk size to avoid alignment issues with high thread counts
|
|
2224
|
+
// Minimum chunk size should be at least NB_COLS to prevent overlapping chunks after alignment
|
|
2225
|
+
const int64_t min_chunk_size = NB_COLS;
|
|
2226
|
+
if (nchunk0 > 0 && (nr0 / nchunk0) < min_chunk_size && nr0 >= min_chunk_size) {
|
|
2227
|
+
nchunk0 = (nr0 + min_chunk_size - 1) / min_chunk_size;
|
|
1260
2228
|
}
|
|
1261
2229
|
|
|
1262
|
-
|
|
1263
|
-
if
|
|
1264
|
-
|
|
1265
|
-
|
|
1266
|
-
|
|
1267
|
-
|
|
2230
|
+
int64_t dr0 = (nr0 + nchunk0 - 1) / nchunk0;
|
|
2231
|
+
// Only increase nchunk0 to nth if it won't make chunks too small
|
|
2232
|
+
if (nth == 1 || ((nchunk0 < nth || disable_chunking) && (nr0 + nth - 1) / nth >= min_chunk_size)) {
|
|
2233
|
+
nchunk0 = nth;
|
|
2234
|
+
dr0 = (nr0 + nchunk0 - 1) / nchunk0;
|
|
2235
|
+
}
|
|
2236
|
+
|
|
2237
|
+
// Ensure nchunk doesn't exceed the number of rows divided by minimum chunk size
|
|
2238
|
+
// This prevents creating too many tiny chunks that could overlap after alignment
|
|
2239
|
+
const int64_t max_nchunk = (nr0 + min_chunk_size - 1) / min_chunk_size;
|
|
2240
|
+
nchunk0 = MIN(nchunk0, max_nchunk);
|
|
2241
|
+
|
|
2242
|
+
if (ith == 0) {
|
|
2243
|
+
// Every thread starts at ith, so the first unprocessed chunk is nth. This saves a bit of coordination right at the start.
|
|
2244
|
+
ggml_threadpool_chunk_set(params->threadpool, nth);
|
|
1268
2245
|
}
|
|
1269
|
-
|
|
1270
|
-
|
|
1271
|
-
|
|
1272
|
-
|
|
1273
|
-
|
|
1274
|
-
|
|
2246
|
+
|
|
2247
|
+
ggml_barrier(params->threadpool);
|
|
2248
|
+
|
|
2249
|
+
// The first chunk comes from our thread_id, the rest will get auto-assigned.
|
|
2250
|
+
int current_chunk = ith;
|
|
2251
|
+
|
|
2252
|
+
while (current_chunk < nchunk0 * nchunk1) {
|
|
2253
|
+
const int64_t ith0 = current_chunk % nchunk0;
|
|
2254
|
+
const int64_t ith1 = current_chunk / nchunk0;
|
|
2255
|
+
|
|
2256
|
+
int64_t src0_start = dr0 * ith0;
|
|
2257
|
+
int64_t src0_end = MIN(src0_start + dr0, nr0);
|
|
2258
|
+
|
|
2259
|
+
// full-plane range for src1
|
|
2260
|
+
int64_t src1_start = ith1 * ne11;
|
|
2261
|
+
int64_t src1_end = (ith1 + 1) * ne11;
|
|
2262
|
+
|
|
2263
|
+
// Align boundaries to NB_COLS - round up to ensure all data is included
|
|
2264
|
+
// The chunk size limiting above ensures chunks are large enough to prevent overlaps
|
|
2265
|
+
src0_start = (src0_start % NB_COLS) ? src0_start + NB_COLS - (src0_start % NB_COLS) : src0_start;
|
|
2266
|
+
src0_end = (src0_end % NB_COLS) ? src0_end + NB_COLS - (src0_end % NB_COLS) : src0_end;
|
|
2267
|
+
src0_end = MIN(src0_end, ne01);
|
|
2268
|
+
|
|
2269
|
+
// Make sure current plane is the last one before exiting
|
|
2270
|
+
if (src0_start >= src0_end) {
|
|
2271
|
+
current_chunk = ggml_threadpool_chunk_add(params->threadpool, 1);
|
|
2272
|
+
continue;
|
|
2273
|
+
}
|
|
2274
|
+
|
|
2275
|
+
forward_mul_mat_one_chunk(params, dst, src0_start, src0_end, src1_start, src1_end);
|
|
2276
|
+
|
|
2277
|
+
current_chunk = ggml_threadpool_chunk_add(params->threadpool, 1);
|
|
1275
2278
|
}
|
|
1276
2279
|
}
|
|
1277
2280
|
|
|
@@ -1376,8 +2379,12 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PAR
|
|
|
1376
2379
|
int64_t src0_cur_start = (ith * ne01) / nth;
|
|
1377
2380
|
int64_t src0_cur_end = ((ith + 1) * ne01) / nth;
|
|
1378
2381
|
|
|
2382
|
+
// Align boundaries to NB_COLS - round up to ensure all data is included
|
|
1379
2383
|
src0_cur_start = (src0_cur_start % NB_COLS) ? src0_cur_start + NB_COLS - (src0_cur_start % NB_COLS) : src0_cur_start;
|
|
1380
2384
|
src0_cur_end = (src0_cur_end % NB_COLS) ? src0_cur_end + NB_COLS - (src0_cur_end % NB_COLS) : src0_cur_end;
|
|
2385
|
+
if (src0_cur_end > ne01) {
|
|
2386
|
+
src0_cur_end = ne01;
|
|
2387
|
+
}
|
|
1381
2388
|
|
|
1382
2389
|
if (src0_cur_start >= src0_cur_end) {
|
|
1383
2390
|
return;
|
|
@@ -1420,13 +2427,25 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons
|
|
|
1420
2427
|
static const ggml::cpu::repack::tensor_traits<block_q4_0, 4, 4, GGML_TYPE_Q8_0> q4_0_4x4_q8_0;
|
|
1421
2428
|
static const ggml::cpu::repack::tensor_traits<block_q4_0, 8, 4, GGML_TYPE_Q8_0> q4_0_4x8_q8_0;
|
|
1422
2429
|
static const ggml::cpu::repack::tensor_traits<block_q4_0, 8, 8, GGML_TYPE_Q8_0> q4_0_8x8_q8_0;
|
|
2430
|
+
|
|
2431
|
+
// instance for Q4_K
|
|
2432
|
+
static const ggml::cpu::repack::tensor_traits<block_q4_K, 4, 8, GGML_TYPE_Q8_K> q4_K_8x4_q8_K;
|
|
1423
2433
|
static const ggml::cpu::repack::tensor_traits<block_q4_K, 8, 8, GGML_TYPE_Q8_K> q4_K_8x8_q8_K;
|
|
1424
2434
|
|
|
2435
|
+
// instance for Q2
|
|
2436
|
+
static const ggml::cpu::repack::tensor_traits<block_q2_K, 8, 8, GGML_TYPE_Q8_K> q2_K_8x8_q8_K;
|
|
2437
|
+
|
|
1425
2438
|
// instance for IQ4
|
|
1426
2439
|
static const ggml::cpu::repack::tensor_traits<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0> iq4_nl_4x4_q8_0;
|
|
2440
|
+
static const ggml::cpu::repack::tensor_traits<block_iq4_nl, 8, 8, GGML_TYPE_Q8_0> iq4_nl_8x8_q8_0;
|
|
2441
|
+
|
|
2442
|
+
// instance for Q8_0
|
|
2443
|
+
static const ggml::cpu::repack::tensor_traits<block_q8_0, 4, 4, GGML_TYPE_Q8_0> q8_0_4x4_q8_0;
|
|
2444
|
+
static const ggml::cpu::repack::tensor_traits<block_q8_0, 8, 4, GGML_TYPE_Q8_0> q8_0_4x8_q8_0;
|
|
1427
2445
|
|
|
1428
2446
|
if (cur->type == GGML_TYPE_Q4_0) {
|
|
1429
|
-
if (ggml_cpu_has_avx2() || (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0)
|
|
2447
|
+
if (ggml_cpu_has_avx2() || (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0)
|
|
2448
|
+
|| (ggml_cpu_has_riscv_v() && (ggml_cpu_get_rvv_vlen() >= QK4_0))) {
|
|
1430
2449
|
if (cur->ne[1] % 8 == 0) {
|
|
1431
2450
|
return &q4_0_8x8_q8_0;
|
|
1432
2451
|
}
|
|
@@ -1447,12 +2466,44 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons
|
|
|
1447
2466
|
return &q4_K_8x8_q8_K;
|
|
1448
2467
|
}
|
|
1449
2468
|
}
|
|
2469
|
+
if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
|
|
2470
|
+
if (cur->ne[1] % 8 == 0) {
|
|
2471
|
+
return &q4_K_8x8_q8_K;
|
|
2472
|
+
}
|
|
2473
|
+
}
|
|
2474
|
+
if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
|
|
2475
|
+
if (cur->ne[1] % 8 == 0) {
|
|
2476
|
+
return &q4_K_8x4_q8_K;
|
|
2477
|
+
}
|
|
2478
|
+
}
|
|
2479
|
+
} else if (cur->type == GGML_TYPE_Q2_K) {
|
|
2480
|
+
if (ggml_cpu_has_avx512()) {
|
|
2481
|
+
if (cur->ne[1] % 8 == 0) {
|
|
2482
|
+
return &q2_K_8x8_q8_K;
|
|
2483
|
+
}
|
|
2484
|
+
}
|
|
1450
2485
|
} else if (cur->type == GGML_TYPE_IQ4_NL) {
|
|
2486
|
+
if (ggml_cpu_has_avx2()) {
|
|
2487
|
+
if (cur->ne[1] % 8 == 0) {
|
|
2488
|
+
return &iq4_nl_8x8_q8_0;
|
|
2489
|
+
}
|
|
2490
|
+
}
|
|
1451
2491
|
if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
|
|
1452
2492
|
if (cur->ne[1] % 4 == 0) {
|
|
1453
2493
|
return &iq4_nl_4x4_q8_0;
|
|
1454
2494
|
}
|
|
1455
2495
|
}
|
|
2496
|
+
} else if (cur->type == GGML_TYPE_Q8_0) {
|
|
2497
|
+
if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
|
|
2498
|
+
if (cur->ne[1] % 4 == 0) {
|
|
2499
|
+
return &q8_0_4x8_q8_0;
|
|
2500
|
+
}
|
|
2501
|
+
}
|
|
2502
|
+
if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
|
|
2503
|
+
if (cur->ne[1] % 4 == 0) {
|
|
2504
|
+
return &q8_0_4x4_q8_0;
|
|
2505
|
+
}
|
|
2506
|
+
}
|
|
1456
2507
|
}
|
|
1457
2508
|
|
|
1458
2509
|
return nullptr;
|