whispercpp 1.3.3 → 1.3.4
This diff shows the contents of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/ext/ruby_whisper_params.c +55 -25
- data/ext/sources/CMakeLists.txt +1 -1
- data/ext/sources/bindings/javascript/package.json +1 -1
- data/ext/sources/build-xcframework.sh +24 -0
- data/ext/sources/examples/CMakeLists.txt +1 -0
- data/ext/sources/examples/addon.node/addon.cpp +19 -19
- data/ext/sources/examples/addon.node/index.js +7 -5
- data/ext/sources/examples/bench/bench.cpp +26 -16
- data/ext/sources/examples/bench.wasm/index-tmpl.html +10 -9
- data/ext/sources/examples/cli/cli.cpp +4 -2
- data/ext/sources/examples/command/command.cpp +26 -24
- data/ext/sources/examples/command.wasm/index-tmpl.html +5 -4
- data/ext/sources/examples/common-ggml.cpp +2 -0
- data/ext/sources/examples/lsp/lsp.cpp +19 -17
- data/ext/sources/examples/server/server.cpp +24 -13
- data/ext/sources/examples/server.py +6 -1
- data/ext/sources/examples/stream/stream.cpp +4 -2
- data/ext/sources/examples/stream.wasm/emscripten.cpp +6 -6
- data/ext/sources/examples/stream.wasm/index-tmpl.html +82 -5
- data/ext/sources/examples/talk-llama/CMakeLists.txt +2 -2
- data/ext/sources/examples/talk-llama/llama-adapter.cpp +101 -4
- data/ext/sources/examples/talk-llama/llama-adapter.h +6 -0
- data/ext/sources/examples/talk-llama/llama-arch.cpp +588 -15
- data/ext/sources/examples/talk-llama/llama-arch.h +58 -1
- data/ext/sources/examples/talk-llama/llama-batch.cpp +103 -71
- data/ext/sources/examples/talk-llama/llama-batch.h +31 -18
- data/ext/sources/examples/talk-llama/llama-chat.cpp +120 -5
- data/ext/sources/examples/talk-llama/llama-chat.h +7 -0
- data/ext/sources/examples/talk-llama/llama-context.cpp +460 -357
- data/ext/sources/examples/talk-llama/llama-context.h +44 -29
- data/ext/sources/examples/talk-llama/llama-cparams.h +4 -4
- data/ext/sources/examples/talk-llama/llama-graph.cpp +543 -271
- data/ext/sources/examples/talk-llama/llama-graph.h +278 -168
- data/ext/sources/examples/talk-llama/llama-hparams.cpp +118 -4
- data/ext/sources/examples/talk-llama/llama-hparams.h +61 -15
- data/ext/sources/examples/talk-llama/llama-impl.h +2 -0
- data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.cpp +326 -0
- data/ext/sources/examples/talk-llama/{llama-kv-cache-unified-iswa.h → llama-kv-cache-iswa.h} +38 -29
- data/ext/sources/examples/talk-llama/llama-kv-cache.cpp +2020 -0
- data/ext/sources/examples/talk-llama/llama-kv-cache.h +358 -27
- data/ext/sources/examples/talk-llama/llama-kv-cells.h +80 -28
- data/ext/sources/examples/talk-llama/llama-memory-hybrid.cpp +56 -36
- data/ext/sources/examples/talk-llama/llama-memory-hybrid.h +30 -29
- data/ext/sources/examples/talk-llama/llama-memory-recurrent.cpp +48 -19
- data/ext/sources/examples/talk-llama/llama-memory-recurrent.h +13 -14
- data/ext/sources/examples/talk-llama/llama-memory.h +16 -10
- data/ext/sources/examples/talk-llama/llama-model-loader.cpp +2 -0
- data/ext/sources/examples/talk-llama/llama-model-loader.h +3 -2
- data/ext/sources/examples/talk-llama/llama-model.cpp +7165 -2336
- data/ext/sources/examples/talk-llama/llama-model.h +60 -9
- data/ext/sources/examples/talk-llama/llama-quant.cpp +48 -10
- data/ext/sources/examples/talk-llama/llama-sampling.cpp +226 -126
- data/ext/sources/examples/talk-llama/llama-vocab.cpp +440 -13
- data/ext/sources/examples/talk-llama/llama-vocab.h +45 -0
- data/ext/sources/examples/talk-llama/llama.cpp +65 -10
- data/ext/sources/examples/talk-llama/llama.h +95 -177
- data/ext/sources/examples/talk-llama/talk-llama.cpp +9 -6
- data/ext/sources/examples/talk-llama/unicode.cpp +207 -0
- data/ext/sources/examples/talk-llama/unicode.h +45 -0
- data/ext/sources/examples/wchess/wchess.cmd/wchess.cmd.cpp +4 -2
- data/ext/sources/examples/whisper.wasm/index-tmpl.html +17 -16
- data/ext/sources/ggml/CMakeLists.txt +59 -31
- data/ext/sources/ggml/cmake/ggml-config.cmake.in +132 -93
- data/ext/sources/ggml/include/ggml-backend.h +17 -1
- data/ext/sources/ggml/include/ggml-cpu.h +1 -1
- data/ext/sources/ggml/include/ggml-metal.h +1 -6
- data/ext/sources/ggml/include/ggml-opt.h +25 -6
- data/ext/sources/ggml/include/ggml-webgpu.h +19 -0
- data/ext/sources/ggml/include/ggml-zdnn.h +17 -0
- data/ext/sources/ggml/include/ggml.h +221 -16
- data/ext/sources/ggml/src/CMakeLists.txt +17 -2
- data/ext/sources/ggml/src/ggml-alloc.c +265 -141
- data/ext/sources/ggml/src/ggml-backend-impl.h +4 -1
- data/ext/sources/ggml/src/ggml-backend-reg.cpp +30 -13
- data/ext/sources/ggml/src/ggml-backend.cpp +221 -38
- data/ext/sources/ggml/src/ggml-blas/CMakeLists.txt +1 -1
- data/ext/sources/ggml/src/ggml-blas/ggml-blas.cpp +5 -4
- data/ext/sources/ggml/src/ggml-cann/CMakeLists.txt +14 -0
- data/ext/sources/ggml/src/ggml-cann/acl_tensor.cpp +3 -1
- data/ext/sources/ggml/src/ggml-cann/aclnn_ops.cpp +903 -717
- data/ext/sources/ggml/src/ggml-cann/aclnn_ops.h +143 -25
- data/ext/sources/ggml/src/ggml-cann/common.h +143 -1
- data/ext/sources/ggml/src/ggml-cann/ggml-cann.cpp +488 -69
- data/ext/sources/ggml/src/ggml-common.h +17 -0
- data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +40 -18
- data/ext/sources/ggml/src/ggml-cpu/amx/amx.cpp +4 -2
- data/ext/sources/ggml/src/ggml-cpu/arch/arm/quants.c +132 -596
- data/ext/sources/ggml/src/ggml-cpu/arch/arm/repack.cpp +14 -286
- data/ext/sources/ggml/src/ggml-cpu/arch/loongarch/quants.c +103 -582
- data/ext/sources/ggml/src/ggml-cpu/arch/powerpc/quants.c +162 -589
- data/ext/sources/ggml/src/ggml-cpu/arch/riscv/quants.c +265 -437
- data/ext/sources/ggml/src/ggml-cpu/arch/riscv/repack.cpp +3 -58
- data/ext/sources/ggml/src/ggml-cpu/arch/s390/quants.c +521 -353
- data/ext/sources/ggml/src/ggml-cpu/arch/wasm/quants.c +54 -314
- data/ext/sources/ggml/src/ggml-cpu/arch/x86/quants.c +184 -675
- data/ext/sources/ggml/src/ggml-cpu/arch/x86/repack.cpp +4679 -1657
- data/ext/sources/ggml/src/ggml-cpu/arch-fallback.h +32 -2
- data/ext/sources/ggml/src/ggml-cpu/common.h +14 -0
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-impl.h +13 -6
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +70 -42
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.cpp +35 -28
- data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.cpp +152 -18
- data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.h +7 -1
- data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +227 -97
- data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +474 -1116
- data/ext/sources/ggml/src/ggml-cpu/ops.cpp +1587 -1177
- data/ext/sources/ggml/src/ggml-cpu/ops.h +5 -8
- data/ext/sources/ggml/src/ggml-cpu/quants.c +35 -0
- data/ext/sources/ggml/src/ggml-cpu/quants.h +8 -0
- data/ext/sources/ggml/src/ggml-cpu/repack.cpp +458 -47
- data/ext/sources/ggml/src/ggml-cpu/repack.h +22 -0
- data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +89 -60
- data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.cpp +1024 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.h +13 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp +3196 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/ime_kernels.h +26 -0
- data/ext/sources/ggml/src/ggml-cpu/traits.cpp +2 -2
- data/ext/sources/ggml/src/ggml-cpu/traits.h +1 -1
- data/ext/sources/ggml/src/ggml-cpu/vec.cpp +170 -26
- data/ext/sources/ggml/src/ggml-cpu/vec.h +506 -63
- data/ext/sources/ggml/src/ggml-cuda/CMakeLists.txt +20 -16
- data/ext/sources/ggml/src/ggml-cuda/add-id.cu +58 -0
- data/ext/sources/ggml/src/ggml-cuda/add-id.cuh +3 -0
- data/ext/sources/ggml/src/ggml-cuda/binbcast.cu +330 -191
- data/ext/sources/ggml/src/ggml-cuda/binbcast.cuh +2 -0
- data/ext/sources/ggml/src/ggml-cuda/common.cuh +250 -63
- data/ext/sources/ggml/src/ggml-cuda/conv-transpose-1d.cu +1 -4
- data/ext/sources/ggml/src/ggml-cuda/conv2d.cu +166 -0
- data/ext/sources/ggml/src/ggml-cuda/conv2d.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/convert.cu +95 -22
- data/ext/sources/ggml/src/ggml-cuda/convert.cuh +15 -0
- data/ext/sources/ggml/src/ggml-cuda/cpy-utils.cuh +217 -0
- data/ext/sources/ggml/src/ggml-cuda/cpy.cu +64 -307
- data/ext/sources/ggml/src/ggml-cuda/cross-entropy-loss.cu +2 -14
- data/ext/sources/ggml/src/ggml-cuda/dequantize.cuh +14 -40
- data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +498 -367
- data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +137 -91
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cu +755 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cuh +3 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn-vec.cuh +593 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +86 -50
- data/ext/sources/ggml/src/ggml-cuda/fattn.cu +185 -198
- data/ext/sources/ggml/src/ggml-cuda/fattn.cuh +2 -0
- data/ext/sources/ggml/src/ggml-cuda/getrows.cu +50 -39
- data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +379 -107
- data/ext/sources/ggml/src/ggml-cuda/im2col.cu +196 -35
- data/ext/sources/ggml/src/ggml-cuda/im2col.cuh +1 -0
- data/ext/sources/ggml/src/ggml-cuda/mean.cu +56 -2
- data/ext/sources/ggml/src/ggml-cuda/mma.cuh +198 -45
- data/ext/sources/ggml/src/ggml-cuda/mmf.cu +123 -0
- data/ext/sources/ggml/src/ggml-cuda/mmf.cuh +496 -0
- data/ext/sources/ggml/src/ggml-cuda/mmq.cu +206 -57
- data/ext/sources/ggml/src/ggml-cuda/mmq.cuh +1262 -721
- data/ext/sources/ggml/src/ggml-cuda/{mmv.cu → mmvf.cu} +53 -53
- data/ext/sources/ggml/src/ggml-cuda/{mmv.cuh → mmvf.cuh} +3 -3
- data/ext/sources/ggml/src/ggml-cuda/mmvq.cu +64 -73
- data/ext/sources/ggml/src/ggml-cuda/norm.cu +284 -12
- data/ext/sources/ggml/src/ggml-cuda/norm.cuh +7 -0
- data/ext/sources/ggml/src/ggml-cuda/opt-step-sgd.cu +49 -0
- data/ext/sources/ggml/src/ggml-cuda/opt-step-sgd.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/pad.cu +46 -23
- data/ext/sources/ggml/src/ggml-cuda/pad_reflect_1d.cu +91 -0
- data/ext/sources/ggml/src/ggml-cuda/pad_reflect_1d.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/quantize.cu +12 -10
- data/ext/sources/ggml/src/ggml-cuda/reduce_rows.cuh +53 -0
- data/ext/sources/ggml/src/ggml-cuda/roll.cu +67 -0
- data/ext/sources/ggml/src/ggml-cuda/roll.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/rope.cu +21 -27
- data/ext/sources/ggml/src/ggml-cuda/scale.cu +14 -11
- data/ext/sources/ggml/src/ggml-cuda/set-rows.cu +276 -0
- data/ext/sources/ggml/src/ggml-cuda/set-rows.cuh +7 -0
- data/ext/sources/ggml/src/ggml-cuda/softcap.cu +34 -0
- data/ext/sources/ggml/src/ggml-cuda/softcap.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/softmax.cu +126 -59
- data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cu +10 -2
- data/ext/sources/ggml/src/ggml-cuda/ssm-scan.cu +322 -100
- data/ext/sources/ggml/src/ggml-cuda/sum.cu +6 -10
- data/ext/sources/ggml/src/ggml-cuda/sumrows.cu +21 -4
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-f16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q4_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q4_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q5_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q5_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q8_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-f16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q5_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q5_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q8_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-f16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q4_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q4_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q5_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q5_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q8_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-f16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q4_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q4_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q5_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q5_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q8_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-f16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q4_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q4_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q5_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q5_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q8_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-f16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q4_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q4_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q5_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q5_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q8_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +21 -18
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_10.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_11.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_12.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_13.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_14.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_15.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_16.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_2.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_3.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_4.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_5.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_6.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_7.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_8.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_9.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-mxfp4.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/topk-moe.cu +259 -0
- data/ext/sources/ggml/src/ggml-cuda/topk-moe.cuh +14 -0
- data/ext/sources/ggml/src/ggml-cuda/tsembd.cu +3 -3
- data/ext/sources/ggml/src/ggml-cuda/unary.cu +90 -0
- data/ext/sources/ggml/src/ggml-cuda/unary.cuh +8 -0
- data/ext/sources/ggml/src/ggml-cuda/upscale.cu +92 -6
- data/ext/sources/ggml/src/ggml-cuda/vecdotq.cuh +110 -22
- data/ext/sources/ggml/src/ggml-cuda/vendors/cuda.h +4 -0
- data/ext/sources/ggml/src/ggml-cuda/vendors/hip.h +58 -36
- data/ext/sources/ggml/src/ggml-cuda/vendors/musa.h +4 -3
- data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +10 -2
- data/ext/sources/ggml/src/ggml-impl.h +119 -9
- data/ext/sources/ggml/src/ggml-metal/CMakeLists.txt +10 -7
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-common.cpp +446 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-common.h +52 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.h +33 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.m +600 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.cpp +1376 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.h +226 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.m +1308 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +136 -63
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.cpp +3158 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.h +82 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal.cpp +718 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal.metal +2854 -1503
- data/ext/sources/ggml/src/ggml-musa/CMakeLists.txt +18 -8
- data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +18 -0
- data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +2510 -242
- data/ext/sources/ggml/src/ggml-opencl/kernels/add.cl +107 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/add_id.cl +42 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/conv2d.cl +185 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/conv2d_f16_f32.cl +176 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/cvt.cl +84 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/div.cl +66 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/flash_attn_f16.cl +370 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/flash_attn_f32.cl +370 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/flash_attn_f32_f16.cl +373 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gelu.cl +27 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/glu.cl +177 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/group_norm.cl +49 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/im2col_f16.cl +1 -1
- data/ext/sources/ggml/src/ggml-opencl/kernels/im2col_f32.cl +1 -1
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul.cl +73 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mat_f16_f32.cl +130 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_l4_lm.cl +132 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_f32_f32_l4_lm.cl +133 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32.cl +189 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32_flat.cl +176 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32.cl +140 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32_flat.cl +222 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32.cl +144 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32_flat.cl +167 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32.cl +125 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32_flat.cl +202 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/norm.cl +80 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/rms_norm.cl +79 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/scale.cl +3 -2
- data/ext/sources/ggml/src/ggml-opencl/kernels/set_rows.cl +189 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_4_f16.cl +34 -13
- data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_4_f32.cl +34 -13
- data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_f16.cl +34 -13
- data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_f32.cl +34 -13
- data/ext/sources/ggml/src/ggml-opencl/kernels/sub.cl +66 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/transpose.cl +20 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/tsembd.cl +2 -2
- data/ext/sources/ggml/src/ggml-opencl/kernels/upscale.cl +2 -3
- data/ext/sources/ggml/src/ggml-opt.cpp +97 -41
- data/ext/sources/ggml/src/ggml-quants.c +111 -16
- data/ext/sources/ggml/src/ggml-quants.h +6 -0
- data/ext/sources/ggml/src/ggml-rpc/ggml-rpc.cpp +67 -47
- data/ext/sources/ggml/src/ggml-sycl/backend.hpp +2 -0
- data/ext/sources/ggml/src/ggml-sycl/binbcast.cpp +15 -5
- data/ext/sources/ggml/src/ggml-sycl/binbcast.hpp +6 -0
- data/ext/sources/ggml/src/ggml-sycl/concat.cpp +25 -16
- data/ext/sources/ggml/src/ggml-sycl/conv.cpp +10 -4
- data/ext/sources/ggml/src/ggml-sycl/convert.cpp +166 -99
- data/ext/sources/ggml/src/ggml-sycl/cpy.cpp +72 -306
- data/ext/sources/ggml/src/ggml-sycl/cpy.hpp +213 -1
- data/ext/sources/ggml/src/ggml-sycl/dmmv.cpp +67 -49
- data/ext/sources/ggml/src/ggml-sycl/dpct/helper.hpp +1 -31
- data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +79 -29
- data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +2 -0
- data/ext/sources/ggml/src/ggml-sycl/gemm.hpp +14 -26
- data/ext/sources/ggml/src/ggml-sycl/getrows.cpp +9 -6
- data/ext/sources/ggml/src/ggml-sycl/ggml-sycl.cpp +328 -323
- data/ext/sources/ggml/src/ggml-sycl/gla.cpp +2 -2
- data/ext/sources/ggml/src/ggml-sycl/im2col.cpp +2 -2
- data/ext/sources/ggml/src/ggml-sycl/mmq.cpp +80 -60
- data/ext/sources/ggml/src/ggml-sycl/mmvq.cpp +201 -132
- data/ext/sources/ggml/src/ggml-sycl/norm.cpp +74 -55
- data/ext/sources/ggml/src/ggml-sycl/quantize.hpp +133 -0
- data/ext/sources/ggml/src/ggml-sycl/quants.hpp +8 -9
- data/ext/sources/ggml/src/ggml-sycl/rope.cpp +35 -42
- data/ext/sources/ggml/src/ggml-sycl/set_rows.cpp +234 -0
- data/ext/sources/ggml/src/ggml-sycl/set_rows.hpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/softmax.cpp +3 -3
- data/ext/sources/ggml/src/ggml-sycl/tsembd.cpp +12 -6
- data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +2 -6
- data/ext/sources/ggml/src/ggml-sycl/wkv.cpp +16 -12
- data/ext/sources/ggml/src/ggml-vulkan/ggml-vulkan.cpp +3492 -883
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add.comp +41 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add_id.comp +42 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argmax.comp +13 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp +39 -29
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +349 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +66 -12
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.comp +154 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.comp +21 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_s.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xxs.comp +2 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_s.comp +6 -5
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_xxs.comp +4 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_mxfp4.comp +32 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q2_k.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q3_k.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_k.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_k.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q6_k.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/exp.comp +21 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +69 -24
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp +60 -20
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +98 -42
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +64 -27
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +74 -13
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu_erf.comp +27 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu_quick.comp +11 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gelu_erf.comp +39 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/generic_binary_head.comp +4 -17
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/get_rows.comp +19 -10
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/get_rows_quant.comp +25 -15
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.comp +4 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/hardsigmoid.comp +22 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/hardswish.comp +22 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +18 -14
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col_3d.comp +126 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.comp +65 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_nc.comp +11 -7
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp +140 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +144 -531
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +206 -38
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.comp +556 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +12 -5
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.comp +15 -9
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp +111 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_sgd.comp +22 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/pad.comp +24 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/quantize_q8_1.comp +53 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +55 -11
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_partials.comp +65 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/roll.comp +46 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.comp +1 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +7 -9
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +7 -9
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +7 -9
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rte.comp +5 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +29 -7
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_back.comp +4 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sqrt.comp +17 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.comp +38 -5
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/swiglu_oai.comp +14 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/timestep_embedding.comp +4 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/types.comp +101 -9
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +69 -5
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/utils.comp +25 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +335 -77
- data/ext/sources/ggml/src/ggml-webgpu/CMakeLists.txt +54 -0
- data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu.cpp +1558 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/add.tmpl.wgsl +44 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/add_in_place.tmpl.wgsl +41 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary_head.tmpl +45 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +930 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl +60 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +124 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/get_rows.tmpl.wgsl +874 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl +40 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul.tmpl.wgsl +44 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_in_place.tmpl.wgsl +41 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl +907 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm.wgsl +57 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm_in_place.wgsl +48 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +81 -0
- data/ext/sources/ggml/src/ggml-zdnn/CMakeLists.txt +36 -0
- data/ext/sources/ggml/src/ggml-zdnn/common.hpp +59 -0
- data/ext/sources/ggml/src/ggml-zdnn/ggml-zdnn.cpp +628 -0
- data/ext/sources/ggml/src/ggml-zdnn/mmf.cpp +80 -0
- data/ext/sources/ggml/src/ggml-zdnn/mmf.hpp +12 -0
- data/ext/sources/ggml/src/ggml-zdnn/utils.cpp +79 -0
- data/ext/sources/ggml/src/ggml-zdnn/utils.hpp +19 -0
- data/ext/sources/ggml/src/ggml.c +478 -98
- data/ext/sources/ggml/src/gguf.cpp +8 -1
- data/ext/sources/src/whisper.cpp +23 -46
- data/ext/sources/tests/CMakeLists.txt +8 -1
- data/ext/sources/tests/test-vad-full.cpp +3 -3
- data/ext/sources/tests/test-vad.cpp +2 -2
- data/lib/whisper/model/uri.rb +1 -1
- data/sig/whisper.rbs +7 -0
- data/test/test_params.rb +8 -0
- data/test/test_whisper.rb +1 -1
- data/whispercpp.gemspec +1 -1
- metadata +164 -157
- data/ext/sources/examples/talk-llama/llama-kv-cache-unified-iswa.cpp +0 -279
- data/ext/sources/examples/talk-llama/llama-kv-cache-unified.cpp +0 -1841
- data/ext/sources/examples/talk-llama/llama-kv-cache-unified.h +0 -303
- data/ext/sources/ggml/include/ggml-kompute.h +0 -50
- data/ext/sources/ggml/src/ggml-amx/CMakeLists.txt +0 -107
- data/ext/sources/ggml/src/ggml-amx/common.h +0 -94
- data/ext/sources/ggml/src/ggml-amx/ggml-amx.cpp +0 -446
- data/ext/sources/ggml/src/ggml-amx/mmq.cpp +0 -2510
- data/ext/sources/ggml/src/ggml-amx/mmq.h +0 -17
- data/ext/sources/ggml/src/ggml-cann/kernels/CMakeLists.txt +0 -30
- data/ext/sources/ggml/src/ggml-cann/kernels/ascendc_kernels.h +0 -19
- data/ext/sources/ggml/src/ggml-cann/kernels/dup.cpp +0 -234
- data/ext/sources/ggml/src/ggml-cann/kernels/get_row_f16.cpp +0 -197
- data/ext/sources/ggml/src/ggml-cann/kernels/get_row_f32.cpp +0 -190
- data/ext/sources/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -204
- data/ext/sources/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -191
- data/ext/sources/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -218
- data/ext/sources/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -216
- data/ext/sources/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -295
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f16.cu +0 -357
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f16.cuh +0 -3
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f32.cu +0 -365
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f32.cuh +0 -3
- data/ext/sources/ggml/src/ggml-cuda/fattn-vec-f16.cuh +0 -482
- data/ext/sources/ggml/src/ggml-cuda/fattn-vec-f32.cuh +0 -472
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-kompute/CMakeLists.txt +0 -166
- data/ext/sources/ggml/src/ggml-kompute/ggml-kompute.cpp +0 -2251
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/common.comp +0 -112
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_add.comp +0 -58
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_addrow.comp +0 -25
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f16.comp +0 -52
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f32.comp +0 -52
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f16.comp +0 -52
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f32.comp +0 -52
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_diagmask.comp +0 -30
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_gelu.comp +0 -22
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows.comp +0 -17
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f16.comp +0 -31
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f32.comp +0 -31
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_0.comp +0 -38
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_1.comp +0 -39
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q6_k.comp +0 -44
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul.comp +0 -52
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_f16.comp +0 -69
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_mat_f32.comp +0 -51
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_0.comp +0 -33
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_1.comp +0 -35
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_k.comp +0 -140
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q6_k.comp +0 -106
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q8_0.comp +0 -73
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n.comp +0 -52
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n_pre.comp +0 -28
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_norm.comp +0 -84
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_relu.comp +0 -21
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rmsnorm.comp +0 -53
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f16.comp +0 -52
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f32.comp +0 -52
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f16.comp +0 -52
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f32.comp +0 -52
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_scale.comp +0 -19
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_scale_8.comp +0 -23
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_silu.comp +0 -22
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_softmax.comp +0 -72
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/rope_common.comp +0 -71
- data/ext/sources/ggml/src/ggml-metal/ggml-metal.m +0 -6280
--- a/data/ext/sources/examples/talk-llama/llama-hparams.cpp
+++ b/data/ext/sources/examples/talk-llama/llama-hparams.cpp
@@ -1,10 +1,17 @@
 #include "llama-hparams.h"
 
 #include "ggml.h"
+#include <cassert>
 
-void llama_hparams::set_swa_pattern(uint32_t n_pattern) {
-    for (uint32_t il = 0; il < n_layer; ++il) {
-        swa_layers[il] = n_pattern == 0 || (il % n_pattern < (n_pattern - 1));
+void llama_hparams::set_swa_pattern(uint32_t n_pattern, bool dense_first) {
+    if (dense_first) {
+        for (uint32_t il = 0; il < n_layer; ++il) {
+            swa_layers[il] = n_pattern == 0 || (il % n_pattern != 0);
+        }
+    } else {
+        for (uint32_t il = 0; il < n_layer; ++il) {
+            swa_layers[il] = n_pattern == 0 || (il % n_pattern < (n_pattern - 1));
+        }
     }
 }
 
@@ -65,15 +72,61 @@ uint32_t llama_hparams::n_embd_v_gqa(uint32_t il) const {
     return n_embd_head_v * n_head_kv;
 }
 
+bool llama_hparams::is_n_embd_k_gqa_variable() const {
+    const uint32_t val = n_embd_k_gqa();
+    for (uint32_t il = 0; il < n_layer; ++il) {
+        if (val != n_embd_k_gqa(il)) {
+            return true;
+        }
+    }
+
+    return false;
+}
+
+bool llama_hparams::is_n_embd_v_gqa_variable() const {
+    const uint32_t val = n_embd_v_gqa();
+    for (uint32_t il = 0; il < n_layer; ++il) {
+        if (val != n_embd_v_gqa(il)) {
+            return true;
+        }
+    }
+
+    return false;
+}
+
+uint32_t llama_hparams::n_embd_k_gqa_max() const {
+    uint32_t val = n_embd_k_gqa();
+    for (uint32_t il = 0; il < n_layer; ++il) {
+        val = std::max(val, n_embd_k_gqa(il));
+    }
+
+    return val;
+}
+
+uint32_t llama_hparams::n_embd_v_gqa_max() const {
+    uint32_t val = n_embd_v_gqa();
+    for (uint32_t il = 0; il < n_layer; ++il) {
+        val = std::max(val, n_embd_v_gqa(il));
+    }
+
+    return val;
+}
+
 uint32_t llama_hparams::n_embd_r() const {
     if (wkv_head_size != 0) {
         // for RWKV models
         return token_shift_count * n_embd;
     }
 
+    if (n_shortconv_l_cache != 0) {
+        // for LFM2 models
+        return n_embd * (n_shortconv_l_cache - 1);
+    }
+
     // TODO: maybe support other convolution strides than 1
     // NOTE: since the first column of the conv_state is shifted out each time, it's not actually needed
-    return (ssm_d_conv > 0 ? ssm_d_conv - 1 : 0) * ssm_d_inner;
+    // Corresponds to Mamba's conv_states size
+    return (ssm_d_conv > 0 ? ssm_d_conv - 1 : 0) * (ssm_d_inner + 2*ssm_n_group*ssm_d_state);
 }
 
 uint32_t llama_hparams::n_embd_s() const {
@@ -101,3 +154,64 @@ bool llama_hparams::is_swa(uint32_t il) const {
 
     GGML_ABORT("fatal error");
 }
+
+bool llama_hparams::has_kv(uint32_t il) const {
+    if (n_layer_kv_from_start >= 0) {
+        if (il < (uint32_t) n_layer_kv_from_start) {
+            return true;
+        }
+
+        return false;
+    }
+
+    // by default, all layers have kv
+    return true;
+}
+
+uint32_t llama_hparams::n_layer_kv() const {
+    uint32_t res = 0;
+
+    for (uint32_t il = 0; il < n_layer; ++il) {
+        if (has_kv(il)) {
+            res++;
+        }
+    }
+
+    return res;
+}
+
+bool llama_hparams::is_masked_swa(uint32_t n_swa, llama_swa_type swa_type, llama_pos p0, llama_pos p1) {
+    assert(p0 >= 0 && p1 >= 0);
+
+    switch (swa_type) {
+        case LLAMA_SWA_TYPE_NONE:
+            {
+            } break;
+        case LLAMA_SWA_TYPE_STANDARD:
+            {
+                if (p1 - p0 >= (int32_t) n_swa) {
+                    return true;
+                }
+            } break;
+        case LLAMA_SWA_TYPE_CHUNKED:
+            {
+                const llama_pos pos_chunk_start = (p1 / n_swa) * n_swa;
+
+                if (p0 < pos_chunk_start) {
+                    return true;
+                }
+            } break;
+        case LLAMA_SWA_TYPE_SYMMETRIC:
+            {
+                const int32_t half_n_swa = (int32_t) n_swa / 2;
+                const int32_t pos_diff = p1 - p0;
+
+                // Mask if outside the symmetric window
+                if (pos_diff < -half_n_swa || pos_diff > half_n_swa) {
+                    return true;
+                }
+            } break;
+    }
+
+    return false;
+}
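The `is_masked_swa` helper added above centralizes the masking rules for the sliding-window attention variants: standard windows mask anything `n_swa` or more tokens behind the query, chunked windows mask anything before the current `n_swa`-aligned chunk, and symmetric windows mask anything more than `n_swa/2` tokens away in either direction. Below is a minimal self-contained sketch of the same rules; the `swa_type` enum and `is_masked` function are illustrative stand-ins, not the library's API.

```cpp
#include <cassert>
#include <cstdint>

// Illustrative stand-ins for llama.cpp's llama_swa_type / is_masked_swa.
enum swa_type { SWA_NONE, SWA_STANDARD, SWA_CHUNKED, SWA_SYMMETRIC };

// Returns true when KV position p0 is masked out for query position p1.
static bool is_masked(uint32_t n_swa, swa_type type, int32_t p0, int32_t p1) {
    assert(p0 >= 0 && p1 >= 0);
    switch (type) {
        case SWA_NONE:      return false;
        case SWA_STANDARD:  return p1 - p0 >= (int32_t) n_swa;
        case SWA_CHUNKED:   return p0 < (p1 / (int32_t) n_swa) * (int32_t) n_swa;
        case SWA_SYMMETRIC: {
            const int32_t half = (int32_t) n_swa / 2;
            return p1 - p0 < -half || p1 - p0 > half;
        }
    }
    return false;
}

int main() {
    assert(!is_masked(4, SWA_STANDARD,  7, 10)); // 3 tokens back: inside the window
    assert( is_masked(4, SWA_STANDARD,  6, 10)); // 4 tokens back: masked
    assert( is_masked(4, SWA_CHUNKED,   7, 10)); // chunk of p1=10 starts at 8, so 7 is masked
    assert(!is_masked(4, SWA_SYMMETRIC, 12, 10)); // within +/- 2 of the query
    assert( is_masked(4, SWA_SYMMETRIC, 13, 10)); // 3 ahead: outside the symmetric window
}
```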
--- a/data/ext/sources/examples/talk-llama/llama-hparams.h
+++ b/data/ext/sources/examples/talk-llama/llama-hparams.h
@@ -6,18 +6,20 @@
 
 // bump if necessary
 #define LLAMA_MAX_LAYERS  512
-#define LLAMA_MAX_EXPERTS 256 // DeepSeekV3
+#define LLAMA_MAX_EXPERTS 384 // Kimi-K2
 
 enum llama_expert_gating_func_type {
-    LLAMA_EXPERT_GATING_FUNC_TYPE_NONE    = 0,
-    LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX = 1,
-    LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID = 2,
+    LLAMA_EXPERT_GATING_FUNC_TYPE_NONE           = 0,
+    LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX        = 1,
+    LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID        = 2,
+    LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX_WEIGHT = 3, // applied to the router weights instead of the logits
 };
 
 enum llama_swa_type {
-    LLAMA_SWA_TYPE_NONE     = 0,
-    LLAMA_SWA_TYPE_STANDARD = 1,
-    LLAMA_SWA_TYPE_CHUNKED  = 2,
+    LLAMA_SWA_TYPE_NONE      = 0,
+    LLAMA_SWA_TYPE_STANDARD  = 1,
+    LLAMA_SWA_TYPE_CHUNKED   = 2,
+    LLAMA_SWA_TYPE_SYMMETRIC = 3,
 };
 
 struct llama_hparams_posnet {
@@ -40,6 +42,7 @@ struct llama_hparams {
     uint32_t n_embd;
     uint32_t n_embd_features = 0;
     uint32_t n_layer;
+    int32_t  n_layer_kv_from_start = -1; // if non-negative, the first n_layer_kv_from_start layers have KV cache
     uint32_t n_rot;
     uint32_t n_embd_head_k; // dimension of keys (d_k). d_q is assumed to be the same, but there are n_head q heads, and only n_head_kv k-v heads
     uint32_t n_embd_head_v; // dimension of values (d_v) aka n_embd_head
@@ -55,6 +58,8 @@ struct llama_hparams {
     struct llama_hparams_posnet   posnet;
     struct llama_hparams_convnext convnext;
 
+    uint32_t n_shortconv_l_cache = 0;
+
     std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_arr;
     std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_kv_arr;
     std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr;
@@ -64,20 +69,25 @@ struct llama_hparams {
     uint32_t n_lora_kv        = 0;
     uint32_t n_ff_exp         = 0;
     uint32_t n_ff_shexp       = 0;
+    uint32_t n_ff_chexp       = 0;
     uint32_t n_expert_shared  = 0;
     uint32_t n_norm_groups    = 0;
+    uint32_t n_group_experts  = 0;
 
-    float expert_weights_scale = 0.0;
+    float expert_group_scale   = 0.05f;
+    float expert_weights_scale = 0.0f;
     bool  expert_weights_norm  = false;
     uint32_t expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_NONE;
     uint32_t moe_every_n_layers = 0;
+    uint32_t nextn_predict_layers = 0;
 
     float f_norm_eps;
     float f_norm_rms_eps;
     float f_norm_group_eps;
 
-    float f_attn_logit_softcapping  = 50.0f;
-    float f_final_logit_softcapping = 30.0f;
+    float f_attn_logit_softcapping   = 50.0f;
+    float f_router_logit_softcapping = 30.0f;
+    float f_final_logit_softcapping  = 30.0f;
 
     // for RWKV
     uint32_t rescale_every_n_layers = 0;
@@ -96,7 +106,12 @@ struct llama_hparams {
     float    rope_freq_scale_train;
     float    rope_freq_scale_train_swa;
     uint32_t n_ctx_orig_yarn;
-    float    rope_yarn_log_mul;
+    float    rope_yarn_log_mul = 0.0f;
+
+    float yarn_ext_factor  = -1.0f;
+    float yarn_attn_factor =  1.0f;
+    float yarn_beta_fast   = 32.0f;
+    float yarn_beta_slow   =  1.0f;
 
     std::array<int, 4> rope_sections;
 
@@ -114,6 +129,7 @@ struct llama_hparams {
     uint32_t ssm_d_inner = 0;
     uint32_t ssm_d_state = 0;
     uint32_t ssm_dt_rank = 0;
+    uint32_t ssm_n_group = 0;
 
     // for hybrid state space models
     std::array<bool, LLAMA_MAX_LAYERS> recurrent_layer_arr;
@@ -129,15 +145,19 @@ struct llama_hparams {
     float f_embedding_scale = 0.0f;
     float f_attention_scale = 0.0f;
 
+    // grok-2
+    float    f_attn_out_scale = 0.0f;
+    uint32_t attn_temp_length = 0;
+
     bool causal_attn   = true;
     bool use_alibi     = false;
     bool attn_soft_cap = false;
-    bool use_kq_norm   = true;
+    bool use_kq_norm   = false;
 
     // for Classifiers
     uint32_t n_cls_out = 1;
 
-    // llama4
+    // llama4 smallthinker
    uint32_t n_moe_layer_step        = 0;
     uint32_t n_no_rope_layer_step    = 4;
     uint32_t n_attn_temp_floor_scale = 8192;
@@ -152,15 +172,17 @@ struct llama_hparams {
     // needed by encoder-decoder models (e.g. T5, FLAN-T5)
     // ref: https://github.com/ggerganov/llama.cpp/pull/8141
     llama_token dec_start_token_id = LLAMA_TOKEN_NULL;
+    uint32_t dec_n_layer = 0;
 
     enum llama_pooling_type      pooling_type            = LLAMA_POOLING_TYPE_NONE;
     enum llama_rope_type         rope_type               = LLAMA_ROPE_TYPE_NONE;
     enum llama_rope_scaling_type rope_scaling_type_train = LLAMA_ROPE_SCALING_TYPE_NONE;
 
     // this value n_pattern means that every nth layer is dense (i.e. non-SWA)
+    // dense_first means whether the pattern is start with a dense layer
     // note that if n_pattern == 0, all layers are SWA
     //           if n_pattern == 1, all layers are dense
-    // example: n_pattern = 3
+    // example 1: n_pattern = 3, dense_first = false
     //   il == 0: swa
     //   il == 1: swa
     //   il == 2: dense
@@ -169,7 +191,13 @@ struct llama_hparams {
     //   il == 5: dense
     //   il == 6: swa
     //   etc ...
-    void set_swa_pattern(uint32_t n_pattern);
+    // example 2: n_pattern = 2, dense_first = true
+    //   il == 0: dense
+    //   il == 1: swa
+    //   il == 2: dense
+    //   il == 3: swa
+    //   etc ...
+    void set_swa_pattern(uint32_t n_pattern, bool dense_first = false);
 
     // return true if one of the layers is SWA
     bool is_swa_any() const;
@@ -188,6 +216,14 @@ struct llama_hparams {
     // dimension of value embeddings across all k-v heads
     uint32_t n_embd_v_gqa(uint32_t il = 0) const;
 
+    // true if any layer has a different n_embd_k_gqa/n_embd_v_gqa
+    bool is_n_embd_k_gqa_variable() const;
+    bool is_n_embd_v_gqa_variable() const;
+
+    // return the maximum n_embd_k_gqa/n_embd_v_gqa across all layers
+    uint32_t n_embd_k_gqa_max() const;
+    uint32_t n_embd_v_gqa_max() const;
+
     // dimension of the rolling state embeddings
     // corresponds to Mamba's conv_states size or RWKV's token_shift states size
     uint32_t n_embd_r() const;
@@ -201,6 +237,16 @@ struct llama_hparams {
     uint32_t n_pos_per_embd() const;
 
     bool is_swa(uint32_t il) const;
+
+    bool has_kv(uint32_t il) const;
+
+    // number of layers for which has_kv() returns true
+    uint32_t n_layer_kv() const;
+
+    // note that this function uses different SWA parameters from those in the hparams
+    // TODO: think of a better place for this function
+    // TODO: pack the SWA params in a struct?
+    static bool is_masked_swa(uint32_t n_swa, llama_swa_type swa_type, llama_pos p0, llama_pos p1);
 };
 
 static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");
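To make the new `dense_first` flag concrete, here is a small standalone sketch reproducing the two layouts described in the header comment above; the `swa_pattern` helper is illustrative, not part of the library.

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

// Illustrative re-implementation of the swa_layers assignment in set_swa_pattern.
static std::vector<bool> swa_pattern(uint32_t n_layer, uint32_t n_pattern, bool dense_first) {
    std::vector<bool> swa_layers(n_layer);
    for (uint32_t il = 0; il < n_layer; ++il) {
        swa_layers[il] = dense_first
            ? (n_pattern == 0 || (il % n_pattern != 0))               // dense layer when il % n_pattern == 0
            : (n_pattern == 0 || (il % n_pattern < (n_pattern - 1))); // dense layer when il % n_pattern == n_pattern - 1
    }
    return swa_layers;
}

int main() {
    // example 1: n_pattern = 3, dense_first = false -> swa swa dense swa swa dense
    for (bool swa : swa_pattern(6, 3, false)) printf("%s ", swa ? "swa" : "dense");
    printf("\n");
    // example 2: n_pattern = 2, dense_first = true -> dense swa dense swa dense swa
    for (bool swa : swa_pattern(6, 2, true)) printf("%s ", swa ? "swa" : "dense");
    printf("\n");
}
```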
--- a/data/ext/sources/examples/talk-llama/llama-impl.h
+++ b/data/ext/sources/examples/talk-llama/llama-impl.h
@@ -59,3 +59,5 @@ std::string llama_format_tensor_shape(const std::vector<int64_t> & ne);
 std::string llama_format_tensor_shape(const struct ggml_tensor * t);
 
 std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i);
+
+#define LLAMA_TENSOR_NAME_FATTN "__fattn__"
data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.cpp
@@ -0,0 +1,326 @@
+#include "llama-kv-cache-iswa.h"
+
+#include "llama-impl.h"
+#include "llama-batch.h"
+#include "llama-model.h"
+
+#include <algorithm>
+#include <cassert>
+
+//
+// llama_kv_cache_iswa
+//
+
+llama_kv_cache_iswa::llama_kv_cache_iswa(
+        const llama_model & model,
+        ggml_type type_k,
+        ggml_type type_v,
+        bool v_trans,
+        bool offload,
+        bool swa_full,
+        bool unified,
+        uint32_t kv_size,
+        uint32_t n_seq_max,
+        uint32_t n_ubatch,
+        uint32_t n_pad,
+        const layer_filter_cb & filter,
+        const layer_reuse_cb & reuse) : hparams(model.hparams), unified(unified) {
+
+    // chain filters
+    const layer_filter_cb filter_base = [&](int32_t il) {
+        if (filter && !filter(il)) {
+            return false;
+        }
+
+        return !model.hparams.is_swa(il);
+    };
+
+    const layer_filter_cb filter_swa = [&](int32_t il) {
+        if (filter && !filter(il)) {
+            return false;
+        }
+
+        return model.hparams.is_swa(il);
+    };
+
+    const uint32_t size_base = kv_size;
+
+    uint32_t size_swa = std::min(size_base, GGML_PAD(hparams.n_swa*(unified ? n_seq_max : 1) + n_ubatch, n_pad));
+
+    // when using full-size SWA cache, we set the SWA cache size to be equal to the base cache size
+    if (swa_full) {
+        LLAMA_LOG_WARN("%s: using full-size SWA cache (ref: %s)\n",
+                __func__, "https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055");
+
+        size_swa = size_base;
+    }
+
+    LLAMA_LOG_INFO("%s: creating non-SWA KV cache, size = %u cells\n", __func__, size_base);
+
+    kv_base = std::make_unique<llama_kv_cache>(
+            model, type_k, type_v,
+            v_trans, offload, unified, size_base, n_seq_max, n_pad,
+            0, LLAMA_SWA_TYPE_NONE, filter_base, reuse);
+
+    LLAMA_LOG_INFO("%s: creating SWA KV cache, size = %u cells\n", __func__, size_swa);
+
+    kv_swa = std::make_unique<llama_kv_cache>(
+            model, type_k, type_v,
+            v_trans, offload, unified, size_swa, n_seq_max, n_pad,
+            hparams.n_swa, hparams.swa_type, filter_swa, reuse);
+}
+
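Editorial note on the sizing above: the SWA cache only has to hold the last n_swa positions for each tracked sequence plus room for one micro-batch, rounded up to the padding granularity. Plugging in hypothetical numbers:

    // hypothetical values: n_swa = 1000, unified cache, n_seq_max = 2,
    // n_ubatch = 512, n_pad = 256, kv_size = 8192
    //   n_swa*n_seq_max + n_ubatch = 1000*2 + 512 = 2512
    //   GGML_PAD(2512, 256)        = 2560           (round up to a multiple of n_pad)
    //   size_swa = min(8192, 2560) = 2560 cells     (vs. the full 8192 when swa_full is set)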
+void llama_kv_cache_iswa::clear(bool data) {
+    kv_base->clear(data);
+    kv_swa ->clear(data);
+}
+
+bool llama_kv_cache_iswa::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
+    bool res = true;
+
+    res = res & kv_base->seq_rm(seq_id, p0, p1);
+    res = res & kv_swa ->seq_rm(seq_id, p0, p1);
+
+    return res;
+}
+
+void llama_kv_cache_iswa::seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) {
+    kv_base->seq_cp(seq_id_src, seq_id_dst, p0, p1);
+    kv_swa ->seq_cp(seq_id_src, seq_id_dst, p0, p1);
+}
+
+void llama_kv_cache_iswa::seq_keep(llama_seq_id seq_id) {
+    kv_base->seq_keep(seq_id);
+    kv_swa ->seq_keep(seq_id);
+}
+
+void llama_kv_cache_iswa::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos shift) {
+    kv_base->seq_add(seq_id, p0, p1, shift);
+    kv_swa ->seq_add(seq_id, p0, p1, shift);
+}
+
+void llama_kv_cache_iswa::seq_div(llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) {
+    kv_base->seq_div(seq_id, p0, p1, d);
+    kv_swa ->seq_div(seq_id, p0, p1, d);
+}
+
+llama_pos llama_kv_cache_iswa::seq_pos_min(llama_seq_id seq_id) const {
+    // the base cache is a superset of the SWA cache, so we can just check the SWA cache
+    return kv_swa->seq_pos_min(seq_id);
+}
+
+llama_pos llama_kv_cache_iswa::seq_pos_max(llama_seq_id seq_id) const {
+    return kv_swa->seq_pos_max(seq_id);
+}
+
+std::map<ggml_backend_buffer_type_t, size_t> llama_kv_cache_iswa::memory_breakdown() const {
+    std::map<ggml_backend_buffer_type_t, size_t> mb = kv_base->memory_breakdown();
+    for (const auto & buft_size : kv_swa->memory_breakdown()) {
+        mb[buft_size.first] += buft_size.second;
+    }
+    return mb;
+}
+
+llama_memory_context_ptr llama_kv_cache_iswa::init_batch(llama_batch_allocr & balloc, uint32_t n_ubatch, bool embd_all) {
+    GGML_UNUSED(embd_all);
+
+    // first try simple split
+    do {
+        if (!unified) {
+            // requires equal splits, so we skip the simple split
+            break;
+        }
+
+        balloc.split_reset();
+
+        std::vector<llama_ubatch> ubatches;
+        while (true) {
+            auto ubatch = balloc.split_simple(n_ubatch);
+
+            if (ubatch.n_tokens == 0) {
+                break;
+            }
+
+            ubatches.push_back(std::move(ubatch)); // NOLINT
+        }
+
+        if (balloc.get_n_used() < balloc.get_n_tokens()) {
+            // failed to find a suitable split
+            break;
+        }
+
+        auto sinfos_base = kv_base->prepare(ubatches);
+        if (sinfos_base.empty()) {
+            break;
+        }
+
+        auto sinfos_swa = kv_swa->prepare(ubatches);
+        if (sinfos_swa.empty()) {
+            break;
+        }
+
+        assert(sinfos_base.size() == sinfos_swa.size());
+
+        return std::make_unique<llama_kv_cache_iswa_context>(
+                this, std::move(sinfos_base), std::move(sinfos_swa), std::move(ubatches));
+    } while (false);
+
+    // if it fails, try equal split
+    do {
+        balloc.split_reset();
+
+        std::vector<llama_ubatch> ubatches;
+        while (true) {
+            auto ubatch = balloc.split_equal(n_ubatch, !unified);
+
+            if (ubatch.n_tokens == 0) {
+                break;
+            }
+
+            ubatches.push_back(std::move(ubatch)); // NOLINT
+        }
+
+        if (balloc.get_n_used() < balloc.get_n_tokens()) {
+            // failed to find a suitable split
+            break;
+        }
+
+        auto sinfos_base = kv_base->prepare(ubatches);
+        if (sinfos_base.empty()) {
+            break;
+        }
+
+        auto sinfos_swa = kv_swa->prepare(ubatches);
+        if (sinfos_swa.empty()) {
+            break;
+        }
+
+        assert(sinfos_base.size() == sinfos_swa.size());
+
+        return std::make_unique<llama_kv_cache_iswa_context>(
+                this, std::move(sinfos_base), std::move(sinfos_swa), std::move(ubatches));
+    } while (false);
+
+    // TODO: if we fail again, we should attempt different splitting strategies
+    // but to do that properly, we first have to refactor the batches to be more flexible
+
+    return std::make_unique<llama_kv_cache_iswa_context>(LLAMA_MEMORY_STATUS_FAILED_PREPARE);
+}
+
+llama_memory_context_ptr llama_kv_cache_iswa::init_full() {
+    return std::make_unique<llama_kv_cache_iswa_context>(this);
+}
+
+llama_memory_context_ptr llama_kv_cache_iswa::init_update(llama_context * lctx, bool optimize) {
+    return std::make_unique<llama_kv_cache_iswa_context>(this, lctx, optimize);
+}
+
+bool llama_kv_cache_iswa::get_can_shift() const {
+    return kv_base->get_size() == kv_swa->get_size();
+}
+
+void llama_kv_cache_iswa::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const {
+    if ((flags & LLAMA_STATE_SEQ_FLAGS_SWA_ONLY) == 0) {
+        kv_base->state_write(io, seq_id, flags);
+    }
+
+    kv_swa->state_write(io, seq_id, flags);
+}
+
+void llama_kv_cache_iswa::state_read(llama_io_read_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) {
+    if ((flags & LLAMA_STATE_SEQ_FLAGS_SWA_ONLY) == 0) {
+        kv_base->state_read(io, seq_id, flags);
+    }
+
+    kv_swa->state_read(io, seq_id, flags);
+}
+
+llama_kv_cache * llama_kv_cache_iswa::get_base() const {
+    return kv_base.get();
+}
+
+llama_kv_cache * llama_kv_cache_iswa::get_swa() const {
+    return kv_swa.get();
+}
+
+//
+// llama_kv_cache_iswa_context
+//
+
+llama_kv_cache_iswa_context::llama_kv_cache_iswa_context(llama_memory_status status) : status(status) {}
+
+llama_kv_cache_iswa_context::llama_kv_cache_iswa_context(
+        llama_kv_cache_iswa * kv) :
+    ctx_base(kv->get_base()->init_full()),
+    ctx_swa (kv->get_swa ()->init_full()),
+    status(llama_memory_status_combine(ctx_base->get_status(), ctx_swa->get_status())) {
+}
+
+llama_kv_cache_iswa_context::llama_kv_cache_iswa_context(
+        llama_kv_cache_iswa * kv,
+        llama_context * lctx,
+        bool optimize) :
+    ctx_base(kv->get_base()->init_update(lctx, optimize)),
+    ctx_swa (kv->get_swa ()->init_update(lctx, optimize)),
+    status(llama_memory_status_combine(ctx_base->get_status(), ctx_swa->get_status())) {
+}
+
+llama_kv_cache_iswa_context::llama_kv_cache_iswa_context(
+        llama_kv_cache_iswa * kv,
+        slot_info_vec_t sinfos_base,
+        slot_info_vec_t sinfos_swa,
+        std::vector<llama_ubatch> ubatches) :
+    ubatches(std::move(ubatches)),
+    // note: here we copy the ubatches. not sure if this is ideal
+    ctx_base(new llama_kv_cache_context(kv->get_base(), std::move(sinfos_base), this->ubatches)),
+    ctx_swa (new llama_kv_cache_context(kv->get_swa (), std::move(sinfos_swa), this->ubatches)),
+    status(llama_memory_status_combine(ctx_base->get_status(), ctx_swa->get_status())) {
+}
+
+llama_kv_cache_iswa_context::~llama_kv_cache_iswa_context() = default;
+
+bool llama_kv_cache_iswa_context::next() {
+    assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
+
+    ctx_base->next();
+    ctx_swa ->next();
+
+    if (++i_next >= ubatches.size()) {
+        return false;
+    }
+
+    return true;
+}
+
+bool llama_kv_cache_iswa_context::apply() {
+    assert(!llama_memory_status_is_fail(status));
+
+    bool res = true;
+
+    res = res & ctx_base->apply();
+    res = res & ctx_swa ->apply();
+
+    return res;
+}
+
+llama_memory_status llama_kv_cache_iswa_context::get_status() const {
+    return status;
+}
+
+const llama_ubatch & llama_kv_cache_iswa_context::get_ubatch() const {
+    assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
+
+    return ubatches[i_next];
+}
+
+const llama_kv_cache_context * llama_kv_cache_iswa_context::get_base() const {
+    assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
+
+    return static_cast<const llama_kv_cache_context *>(ctx_base.get());
+}
+
+const llama_kv_cache_context * llama_kv_cache_iswa_context::get_swa() const {
+    assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
+
+    return static_cast<const llama_kv_cache_context *>(ctx_swa.get());
+}
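Editorial note: taken together, the context object advances the base and SWA sub-caches in lock-step via next()/apply(). A hedged sketch of how a caller might drive it, assuming a context obtained from init_batch() and a placeholder for the actual graph computation:

    // hypothetical driver loop
    llama_memory_context_ptr mctx = kv->init_batch(balloc, n_ubatch, /*embd_all=*/false);
    if (mctx->get_status() == LLAMA_MEMORY_STATUS_SUCCESS) {
        do {
            if (!mctx->apply()) {       // place the current ubatch in both caches
                break;                  // handle failure
            }
            // ... run the graph on mctx->get_ubatch() ...
        } while (mctx->next());         // step base and SWA contexts together
    }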