whispercpp 1.3.3 → 1.3.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/ext/ruby_whisper_params.c +55 -25
- data/ext/sources/CMakeLists.txt +1 -1
- data/ext/sources/bindings/javascript/package.json +1 -1
- data/ext/sources/build-xcframework.sh +24 -0
- data/ext/sources/examples/CMakeLists.txt +1 -0
- data/ext/sources/examples/addon.node/addon.cpp +19 -19
- data/ext/sources/examples/addon.node/index.js +7 -5
- data/ext/sources/examples/bench/bench.cpp +26 -16
- data/ext/sources/examples/bench.wasm/index-tmpl.html +10 -9
- data/ext/sources/examples/cli/cli.cpp +4 -2
- data/ext/sources/examples/command/command.cpp +26 -24
- data/ext/sources/examples/command.wasm/index-tmpl.html +5 -4
- data/ext/sources/examples/common-ggml.cpp +2 -0
- data/ext/sources/examples/lsp/lsp.cpp +19 -17
- data/ext/sources/examples/server/server.cpp +24 -13
- data/ext/sources/examples/server.py +6 -1
- data/ext/sources/examples/stream/stream.cpp +4 -2
- data/ext/sources/examples/stream.wasm/emscripten.cpp +6 -6
- data/ext/sources/examples/stream.wasm/index-tmpl.html +82 -5
- data/ext/sources/examples/talk-llama/CMakeLists.txt +2 -2
- data/ext/sources/examples/talk-llama/llama-adapter.cpp +101 -4
- data/ext/sources/examples/talk-llama/llama-adapter.h +6 -0
- data/ext/sources/examples/talk-llama/llama-arch.cpp +588 -15
- data/ext/sources/examples/talk-llama/llama-arch.h +58 -1
- data/ext/sources/examples/talk-llama/llama-batch.cpp +103 -71
- data/ext/sources/examples/talk-llama/llama-batch.h +31 -18
- data/ext/sources/examples/talk-llama/llama-chat.cpp +120 -5
- data/ext/sources/examples/talk-llama/llama-chat.h +7 -0
- data/ext/sources/examples/talk-llama/llama-context.cpp +460 -357
- data/ext/sources/examples/talk-llama/llama-context.h +44 -29
- data/ext/sources/examples/talk-llama/llama-cparams.h +4 -4
- data/ext/sources/examples/talk-llama/llama-graph.cpp +543 -271
- data/ext/sources/examples/talk-llama/llama-graph.h +278 -168
- data/ext/sources/examples/talk-llama/llama-hparams.cpp +118 -4
- data/ext/sources/examples/talk-llama/llama-hparams.h +61 -15
- data/ext/sources/examples/talk-llama/llama-impl.h +2 -0
- data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.cpp +326 -0
- data/ext/sources/examples/talk-llama/{llama-kv-cache-unified-iswa.h → llama-kv-cache-iswa.h} +38 -29
- data/ext/sources/examples/talk-llama/llama-kv-cache.cpp +2020 -0
- data/ext/sources/examples/talk-llama/llama-kv-cache.h +358 -27
- data/ext/sources/examples/talk-llama/llama-kv-cells.h +80 -28
- data/ext/sources/examples/talk-llama/llama-memory-hybrid.cpp +56 -36
- data/ext/sources/examples/talk-llama/llama-memory-hybrid.h +30 -29
- data/ext/sources/examples/talk-llama/llama-memory-recurrent.cpp +48 -19
- data/ext/sources/examples/talk-llama/llama-memory-recurrent.h +13 -14
- data/ext/sources/examples/talk-llama/llama-memory.h +16 -10
- data/ext/sources/examples/talk-llama/llama-model-loader.cpp +2 -0
- data/ext/sources/examples/talk-llama/llama-model-loader.h +3 -2
- data/ext/sources/examples/talk-llama/llama-model.cpp +7165 -2336
- data/ext/sources/examples/talk-llama/llama-model.h +60 -9
- data/ext/sources/examples/talk-llama/llama-quant.cpp +48 -10
- data/ext/sources/examples/talk-llama/llama-sampling.cpp +226 -126
- data/ext/sources/examples/talk-llama/llama-vocab.cpp +440 -13
- data/ext/sources/examples/talk-llama/llama-vocab.h +45 -0
- data/ext/sources/examples/talk-llama/llama.cpp +65 -10
- data/ext/sources/examples/talk-llama/llama.h +95 -177
- data/ext/sources/examples/talk-llama/talk-llama.cpp +9 -6
- data/ext/sources/examples/talk-llama/unicode.cpp +207 -0
- data/ext/sources/examples/talk-llama/unicode.h +45 -0
- data/ext/sources/examples/wchess/wchess.cmd/wchess.cmd.cpp +4 -2
- data/ext/sources/examples/whisper.wasm/index-tmpl.html +17 -16
- data/ext/sources/ggml/CMakeLists.txt +59 -31
- data/ext/sources/ggml/cmake/ggml-config.cmake.in +132 -93
- data/ext/sources/ggml/include/ggml-backend.h +17 -1
- data/ext/sources/ggml/include/ggml-cpu.h +1 -1
- data/ext/sources/ggml/include/ggml-metal.h +1 -6
- data/ext/sources/ggml/include/ggml-opt.h +25 -6
- data/ext/sources/ggml/include/ggml-webgpu.h +19 -0
- data/ext/sources/ggml/include/ggml-zdnn.h +17 -0
- data/ext/sources/ggml/include/ggml.h +221 -16
- data/ext/sources/ggml/src/CMakeLists.txt +17 -2
- data/ext/sources/ggml/src/ggml-alloc.c +265 -141
- data/ext/sources/ggml/src/ggml-backend-impl.h +4 -1
- data/ext/sources/ggml/src/ggml-backend-reg.cpp +30 -13
- data/ext/sources/ggml/src/ggml-backend.cpp +221 -38
- data/ext/sources/ggml/src/ggml-blas/CMakeLists.txt +1 -1
- data/ext/sources/ggml/src/ggml-blas/ggml-blas.cpp +5 -4
- data/ext/sources/ggml/src/ggml-cann/CMakeLists.txt +14 -0
- data/ext/sources/ggml/src/ggml-cann/acl_tensor.cpp +3 -1
- data/ext/sources/ggml/src/ggml-cann/aclnn_ops.cpp +903 -717
- data/ext/sources/ggml/src/ggml-cann/aclnn_ops.h +143 -25
- data/ext/sources/ggml/src/ggml-cann/common.h +143 -1
- data/ext/sources/ggml/src/ggml-cann/ggml-cann.cpp +488 -69
- data/ext/sources/ggml/src/ggml-common.h +17 -0
- data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +40 -18
- data/ext/sources/ggml/src/ggml-cpu/amx/amx.cpp +4 -2
- data/ext/sources/ggml/src/ggml-cpu/arch/arm/quants.c +132 -596
- data/ext/sources/ggml/src/ggml-cpu/arch/arm/repack.cpp +14 -286
- data/ext/sources/ggml/src/ggml-cpu/arch/loongarch/quants.c +103 -582
- data/ext/sources/ggml/src/ggml-cpu/arch/powerpc/quants.c +162 -589
- data/ext/sources/ggml/src/ggml-cpu/arch/riscv/quants.c +265 -437
- data/ext/sources/ggml/src/ggml-cpu/arch/riscv/repack.cpp +3 -58
- data/ext/sources/ggml/src/ggml-cpu/arch/s390/quants.c +521 -353
- data/ext/sources/ggml/src/ggml-cpu/arch/wasm/quants.c +54 -314
- data/ext/sources/ggml/src/ggml-cpu/arch/x86/quants.c +184 -675
- data/ext/sources/ggml/src/ggml-cpu/arch/x86/repack.cpp +4679 -1657
- data/ext/sources/ggml/src/ggml-cpu/arch-fallback.h +32 -2
- data/ext/sources/ggml/src/ggml-cpu/common.h +14 -0
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-impl.h +13 -6
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +70 -42
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.cpp +35 -28
- data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.cpp +152 -18
- data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.h +7 -1
- data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +227 -97
- data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +474 -1116
- data/ext/sources/ggml/src/ggml-cpu/ops.cpp +1587 -1177
- data/ext/sources/ggml/src/ggml-cpu/ops.h +5 -8
- data/ext/sources/ggml/src/ggml-cpu/quants.c +35 -0
- data/ext/sources/ggml/src/ggml-cpu/quants.h +8 -0
- data/ext/sources/ggml/src/ggml-cpu/repack.cpp +458 -47
- data/ext/sources/ggml/src/ggml-cpu/repack.h +22 -0
- data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +89 -60
- data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.cpp +1024 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.h +13 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp +3196 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/ime_kernels.h +26 -0
- data/ext/sources/ggml/src/ggml-cpu/traits.cpp +2 -2
- data/ext/sources/ggml/src/ggml-cpu/traits.h +1 -1
- data/ext/sources/ggml/src/ggml-cpu/vec.cpp +170 -26
- data/ext/sources/ggml/src/ggml-cpu/vec.h +506 -63
- data/ext/sources/ggml/src/ggml-cuda/CMakeLists.txt +20 -16
- data/ext/sources/ggml/src/ggml-cuda/add-id.cu +58 -0
- data/ext/sources/ggml/src/ggml-cuda/add-id.cuh +3 -0
- data/ext/sources/ggml/src/ggml-cuda/binbcast.cu +330 -191
- data/ext/sources/ggml/src/ggml-cuda/binbcast.cuh +2 -0
- data/ext/sources/ggml/src/ggml-cuda/common.cuh +250 -63
- data/ext/sources/ggml/src/ggml-cuda/conv-transpose-1d.cu +1 -4
- data/ext/sources/ggml/src/ggml-cuda/conv2d.cu +166 -0
- data/ext/sources/ggml/src/ggml-cuda/conv2d.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/convert.cu +95 -22
- data/ext/sources/ggml/src/ggml-cuda/convert.cuh +15 -0
- data/ext/sources/ggml/src/ggml-cuda/cpy-utils.cuh +217 -0
- data/ext/sources/ggml/src/ggml-cuda/cpy.cu +64 -307
- data/ext/sources/ggml/src/ggml-cuda/cross-entropy-loss.cu +2 -14
- data/ext/sources/ggml/src/ggml-cuda/dequantize.cuh +14 -40
- data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +498 -367
- data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +137 -91
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cu +755 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cuh +3 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn-vec.cuh +593 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +86 -50
- data/ext/sources/ggml/src/ggml-cuda/fattn.cu +185 -198
- data/ext/sources/ggml/src/ggml-cuda/fattn.cuh +2 -0
- data/ext/sources/ggml/src/ggml-cuda/getrows.cu +50 -39
- data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +379 -107
- data/ext/sources/ggml/src/ggml-cuda/im2col.cu +196 -35
- data/ext/sources/ggml/src/ggml-cuda/im2col.cuh +1 -0
- data/ext/sources/ggml/src/ggml-cuda/mean.cu +56 -2
- data/ext/sources/ggml/src/ggml-cuda/mma.cuh +198 -45
- data/ext/sources/ggml/src/ggml-cuda/mmf.cu +123 -0
- data/ext/sources/ggml/src/ggml-cuda/mmf.cuh +496 -0
- data/ext/sources/ggml/src/ggml-cuda/mmq.cu +206 -57
- data/ext/sources/ggml/src/ggml-cuda/mmq.cuh +1262 -721
- data/ext/sources/ggml/src/ggml-cuda/{mmv.cu → mmvf.cu} +53 -53
- data/ext/sources/ggml/src/ggml-cuda/{mmv.cuh → mmvf.cuh} +3 -3
- data/ext/sources/ggml/src/ggml-cuda/mmvq.cu +64 -73
- data/ext/sources/ggml/src/ggml-cuda/norm.cu +284 -12
- data/ext/sources/ggml/src/ggml-cuda/norm.cuh +7 -0
- data/ext/sources/ggml/src/ggml-cuda/opt-step-sgd.cu +49 -0
- data/ext/sources/ggml/src/ggml-cuda/opt-step-sgd.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/pad.cu +46 -23
- data/ext/sources/ggml/src/ggml-cuda/pad_reflect_1d.cu +91 -0
- data/ext/sources/ggml/src/ggml-cuda/pad_reflect_1d.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/quantize.cu +12 -10
- data/ext/sources/ggml/src/ggml-cuda/reduce_rows.cuh +53 -0
- data/ext/sources/ggml/src/ggml-cuda/roll.cu +67 -0
- data/ext/sources/ggml/src/ggml-cuda/roll.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/rope.cu +21 -27
- data/ext/sources/ggml/src/ggml-cuda/scale.cu +14 -11
- data/ext/sources/ggml/src/ggml-cuda/set-rows.cu +276 -0
- data/ext/sources/ggml/src/ggml-cuda/set-rows.cuh +7 -0
- data/ext/sources/ggml/src/ggml-cuda/softcap.cu +34 -0
- data/ext/sources/ggml/src/ggml-cuda/softcap.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/softmax.cu +126 -59
- data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cu +10 -2
- data/ext/sources/ggml/src/ggml-cuda/ssm-scan.cu +322 -100
- data/ext/sources/ggml/src/ggml-cuda/sum.cu +6 -10
- data/ext/sources/ggml/src/ggml-cuda/sumrows.cu +21 -4
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-f16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q4_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q4_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q5_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q5_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q8_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-f16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q5_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q5_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q8_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-f16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q4_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q4_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q5_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q5_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q8_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-f16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q4_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q4_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q5_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q5_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q8_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-f16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q4_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q4_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q5_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q5_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q8_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-f16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q4_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q4_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q5_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q5_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q8_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +21 -18
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_10.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_11.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_12.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_13.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_14.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_15.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_16.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_2.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_3.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_4.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_5.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_6.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_7.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_8.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_9.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-mxfp4.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/topk-moe.cu +259 -0
- data/ext/sources/ggml/src/ggml-cuda/topk-moe.cuh +14 -0
- data/ext/sources/ggml/src/ggml-cuda/tsembd.cu +3 -3
- data/ext/sources/ggml/src/ggml-cuda/unary.cu +90 -0
- data/ext/sources/ggml/src/ggml-cuda/unary.cuh +8 -0
- data/ext/sources/ggml/src/ggml-cuda/upscale.cu +92 -6
- data/ext/sources/ggml/src/ggml-cuda/vecdotq.cuh +110 -22
- data/ext/sources/ggml/src/ggml-cuda/vendors/cuda.h +4 -0
- data/ext/sources/ggml/src/ggml-cuda/vendors/hip.h +58 -36
- data/ext/sources/ggml/src/ggml-cuda/vendors/musa.h +4 -3
- data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +10 -2
- data/ext/sources/ggml/src/ggml-impl.h +119 -9
- data/ext/sources/ggml/src/ggml-metal/CMakeLists.txt +10 -7
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-common.cpp +446 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-common.h +52 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.h +33 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.m +600 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.cpp +1376 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.h +226 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.m +1308 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +136 -63
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.cpp +3158 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.h +82 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal.cpp +718 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal.metal +2854 -1503
- data/ext/sources/ggml/src/ggml-musa/CMakeLists.txt +18 -8
- data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +18 -0
- data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +2510 -242
- data/ext/sources/ggml/src/ggml-opencl/kernels/add.cl +107 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/add_id.cl +42 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/conv2d.cl +185 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/conv2d_f16_f32.cl +176 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/cvt.cl +84 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/div.cl +66 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/flash_attn_f16.cl +370 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/flash_attn_f32.cl +370 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/flash_attn_f32_f16.cl +373 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gelu.cl +27 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/glu.cl +177 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/group_norm.cl +49 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/im2col_f16.cl +1 -1
- data/ext/sources/ggml/src/ggml-opencl/kernels/im2col_f32.cl +1 -1
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul.cl +73 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mat_f16_f32.cl +130 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_l4_lm.cl +132 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_f32_f32_l4_lm.cl +133 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32.cl +189 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32_flat.cl +176 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32.cl +140 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32_flat.cl +222 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32.cl +144 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32_flat.cl +167 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32.cl +125 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32_flat.cl +202 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/norm.cl +80 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/rms_norm.cl +79 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/scale.cl +3 -2
- data/ext/sources/ggml/src/ggml-opencl/kernels/set_rows.cl +189 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_4_f16.cl +34 -13
- data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_4_f32.cl +34 -13
- data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_f16.cl +34 -13
- data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_f32.cl +34 -13
- data/ext/sources/ggml/src/ggml-opencl/kernels/sub.cl +66 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/transpose.cl +20 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/tsembd.cl +2 -2
- data/ext/sources/ggml/src/ggml-opencl/kernels/upscale.cl +2 -3
- data/ext/sources/ggml/src/ggml-opt.cpp +97 -41
- data/ext/sources/ggml/src/ggml-quants.c +111 -16
- data/ext/sources/ggml/src/ggml-quants.h +6 -0
- data/ext/sources/ggml/src/ggml-rpc/ggml-rpc.cpp +67 -47
- data/ext/sources/ggml/src/ggml-sycl/backend.hpp +2 -0
- data/ext/sources/ggml/src/ggml-sycl/binbcast.cpp +15 -5
- data/ext/sources/ggml/src/ggml-sycl/binbcast.hpp +6 -0
- data/ext/sources/ggml/src/ggml-sycl/concat.cpp +25 -16
- data/ext/sources/ggml/src/ggml-sycl/conv.cpp +10 -4
- data/ext/sources/ggml/src/ggml-sycl/convert.cpp +166 -99
- data/ext/sources/ggml/src/ggml-sycl/cpy.cpp +72 -306
- data/ext/sources/ggml/src/ggml-sycl/cpy.hpp +213 -1
- data/ext/sources/ggml/src/ggml-sycl/dmmv.cpp +67 -49
- data/ext/sources/ggml/src/ggml-sycl/dpct/helper.hpp +1 -31
- data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +79 -29
- data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +2 -0
- data/ext/sources/ggml/src/ggml-sycl/gemm.hpp +14 -26
- data/ext/sources/ggml/src/ggml-sycl/getrows.cpp +9 -6
- data/ext/sources/ggml/src/ggml-sycl/ggml-sycl.cpp +328 -323
- data/ext/sources/ggml/src/ggml-sycl/gla.cpp +2 -2
- data/ext/sources/ggml/src/ggml-sycl/im2col.cpp +2 -2
- data/ext/sources/ggml/src/ggml-sycl/mmq.cpp +80 -60
- data/ext/sources/ggml/src/ggml-sycl/mmvq.cpp +201 -132
- data/ext/sources/ggml/src/ggml-sycl/norm.cpp +74 -55
- data/ext/sources/ggml/src/ggml-sycl/quantize.hpp +133 -0
- data/ext/sources/ggml/src/ggml-sycl/quants.hpp +8 -9
- data/ext/sources/ggml/src/ggml-sycl/rope.cpp +35 -42
- data/ext/sources/ggml/src/ggml-sycl/set_rows.cpp +234 -0
- data/ext/sources/ggml/src/ggml-sycl/set_rows.hpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/softmax.cpp +3 -3
- data/ext/sources/ggml/src/ggml-sycl/tsembd.cpp +12 -6
- data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +2 -6
- data/ext/sources/ggml/src/ggml-sycl/wkv.cpp +16 -12
- data/ext/sources/ggml/src/ggml-vulkan/ggml-vulkan.cpp +3492 -883
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add.comp +41 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add_id.comp +42 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argmax.comp +13 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp +39 -29
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +349 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +66 -12
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.comp +154 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.comp +21 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_s.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xxs.comp +2 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_s.comp +6 -5
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_xxs.comp +4 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_mxfp4.comp +32 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q2_k.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q3_k.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_k.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_k.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q6_k.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/exp.comp +21 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +69 -24
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp +60 -20
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +98 -42
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +64 -27
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +74 -13
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu_erf.comp +27 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu_quick.comp +11 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gelu_erf.comp +39 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/generic_binary_head.comp +4 -17
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/get_rows.comp +19 -10
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/get_rows_quant.comp +25 -15
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.comp +4 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/hardsigmoid.comp +22 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/hardswish.comp +22 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +18 -14
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col_3d.comp +126 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.comp +65 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_nc.comp +11 -7
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp +140 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +144 -531
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +206 -38
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.comp +556 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +12 -5
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.comp +15 -9
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp +111 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_sgd.comp +22 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/pad.comp +24 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/quantize_q8_1.comp +53 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +55 -11
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_partials.comp +65 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/roll.comp +46 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.comp +1 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +7 -9
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +7 -9
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +7 -9
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rte.comp +5 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +29 -7
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_back.comp +4 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sqrt.comp +17 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.comp +38 -5
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/swiglu_oai.comp +14 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/timestep_embedding.comp +4 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/types.comp +101 -9
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +69 -5
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/utils.comp +25 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +335 -77
- data/ext/sources/ggml/src/ggml-webgpu/CMakeLists.txt +54 -0
- data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu.cpp +1558 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/add.tmpl.wgsl +44 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/add_in_place.tmpl.wgsl +41 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary_head.tmpl +45 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +930 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl +60 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +124 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/get_rows.tmpl.wgsl +874 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl +40 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul.tmpl.wgsl +44 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_in_place.tmpl.wgsl +41 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl +907 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm.wgsl +57 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm_in_place.wgsl +48 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +81 -0
- data/ext/sources/ggml/src/ggml-zdnn/CMakeLists.txt +36 -0
- data/ext/sources/ggml/src/ggml-zdnn/common.hpp +59 -0
- data/ext/sources/ggml/src/ggml-zdnn/ggml-zdnn.cpp +628 -0
- data/ext/sources/ggml/src/ggml-zdnn/mmf.cpp +80 -0
- data/ext/sources/ggml/src/ggml-zdnn/mmf.hpp +12 -0
- data/ext/sources/ggml/src/ggml-zdnn/utils.cpp +79 -0
- data/ext/sources/ggml/src/ggml-zdnn/utils.hpp +19 -0
- data/ext/sources/ggml/src/ggml.c +478 -98
- data/ext/sources/ggml/src/gguf.cpp +8 -1
- data/ext/sources/src/whisper.cpp +23 -46
- data/ext/sources/tests/CMakeLists.txt +8 -1
- data/ext/sources/tests/test-vad-full.cpp +3 -3
- data/ext/sources/tests/test-vad.cpp +2 -2
- data/lib/whisper/model/uri.rb +1 -1
- data/sig/whisper.rbs +7 -0
- data/test/test_params.rb +8 -0
- data/test/test_whisper.rb +1 -1
- data/whispercpp.gemspec +1 -1
- metadata +164 -157
- data/ext/sources/examples/talk-llama/llama-kv-cache-unified-iswa.cpp +0 -279
- data/ext/sources/examples/talk-llama/llama-kv-cache-unified.cpp +0 -1841
- data/ext/sources/examples/talk-llama/llama-kv-cache-unified.h +0 -303
- data/ext/sources/ggml/include/ggml-kompute.h +0 -50
- data/ext/sources/ggml/src/ggml-amx/CMakeLists.txt +0 -107
- data/ext/sources/ggml/src/ggml-amx/common.h +0 -94
- data/ext/sources/ggml/src/ggml-amx/ggml-amx.cpp +0 -446
- data/ext/sources/ggml/src/ggml-amx/mmq.cpp +0 -2510
- data/ext/sources/ggml/src/ggml-amx/mmq.h +0 -17
- data/ext/sources/ggml/src/ggml-cann/kernels/CMakeLists.txt +0 -30
- data/ext/sources/ggml/src/ggml-cann/kernels/ascendc_kernels.h +0 -19
- data/ext/sources/ggml/src/ggml-cann/kernels/dup.cpp +0 -234
- data/ext/sources/ggml/src/ggml-cann/kernels/get_row_f16.cpp +0 -197
- data/ext/sources/ggml/src/ggml-cann/kernels/get_row_f32.cpp +0 -190
- data/ext/sources/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -204
- data/ext/sources/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -191
- data/ext/sources/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -218
- data/ext/sources/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -216
- data/ext/sources/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -295
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f16.cu +0 -357
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f16.cuh +0 -3
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f32.cu +0 -365
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f32.cuh +0 -3
- data/ext/sources/ggml/src/ggml-cuda/fattn-vec-f16.cuh +0 -482
- data/ext/sources/ggml/src/ggml-cuda/fattn-vec-f32.cuh +0 -472
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-kompute/CMakeLists.txt +0 -166
- data/ext/sources/ggml/src/ggml-kompute/ggml-kompute.cpp +0 -2251
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/common.comp +0 -112
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_add.comp +0 -58
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_addrow.comp +0 -25
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f16.comp +0 -52
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f32.comp +0 -52
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f16.comp +0 -52
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f32.comp +0 -52
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_diagmask.comp +0 -30
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_gelu.comp +0 -22
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows.comp +0 -17
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f16.comp +0 -31
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f32.comp +0 -31
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_0.comp +0 -38
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_1.comp +0 -39
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q6_k.comp +0 -44
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul.comp +0 -52
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_f16.comp +0 -69
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_mat_f32.comp +0 -51
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_0.comp +0 -33
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_1.comp +0 -35
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_k.comp +0 -140
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q6_k.comp +0 -106
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q8_0.comp +0 -73
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n.comp +0 -52
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n_pre.comp +0 -28
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_norm.comp +0 -84
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_relu.comp +0 -21
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rmsnorm.comp +0 -53
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f16.comp +0 -52
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f32.comp +0 -52
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f16.comp +0 -52
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f32.comp +0 -52
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_scale.comp +0 -19
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_scale_8.comp +0 -23
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_silu.comp +0 -22
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_softmax.comp +0 -72
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/rope_common.comp +0 -71
- data/ext/sources/ggml/src/ggml-metal/ggml-metal.m +0 -6280
data/ext/sources/ggml/src/ggml.c
CHANGED
@@ -202,19 +202,34 @@ void ggml_print_backtrace(void) {
|
|
202
202
|
}
|
203
203
|
#endif
|
204
204
|
|
205
|
+
static ggml_abort_callback_t g_abort_callback = NULL;
|
206
|
+
|
207
|
+
// Set the abort callback (passing null will restore original abort functionality: printing a message to stdout)
|
208
|
+
GGML_API ggml_abort_callback_t ggml_set_abort_callback(ggml_abort_callback_t callback) {
|
209
|
+
ggml_abort_callback_t ret_val = g_abort_callback;
|
210
|
+
g_abort_callback = callback;
|
211
|
+
return ret_val;
|
212
|
+
}
|
213
|
+
|
205
214
|
void ggml_abort(const char * file, int line, const char * fmt, ...) {
|
206
215
|
fflush(stdout);
|
207
216
|
|
208
|
-
|
217
|
+
char message[2048];
|
218
|
+
int offset = snprintf(message, sizeof(message), "%s:%d: ", file, line);
|
209
219
|
|
210
220
|
va_list args;
|
211
221
|
va_start(args, fmt);
|
212
|
-
|
222
|
+
vsnprintf(message + offset, sizeof(message) - offset, fmt, args);
|
213
223
|
va_end(args);
|
214
224
|
|
215
|
-
|
225
|
+
if (g_abort_callback) {
|
226
|
+
g_abort_callback(message);
|
227
|
+
} else {
|
228
|
+
// default: print error and backtrace to stderr
|
229
|
+
fprintf(stderr, "%s\n", message);
|
230
|
+
ggml_print_backtrace();
|
231
|
+
}
|
216
232
|
|
217
|
-
ggml_print_backtrace();
|
218
233
|
abort();
|
219
234
|
}
|
220
235
|
|
@@ -458,6 +473,14 @@ bool ggml_guid_matches(ggml_guid_t guid_a, ggml_guid_t guid_b) {
|
|
458
473
|
return memcmp(guid_a, guid_b, sizeof(ggml_guid)) == 0;
|
459
474
|
}
|
460
475
|
|
476
|
+
const char * ggml_version(void) {
|
477
|
+
return GGML_VERSION;
|
478
|
+
}
|
479
|
+
|
480
|
+
const char * ggml_commit(void) {
|
481
|
+
return GGML_COMMIT;
|
482
|
+
}
|
483
|
+
|
461
484
|
//
|
462
485
|
// timing
|
463
486
|
//
|
@@ -559,9 +582,6 @@ FILE * ggml_fopen(const char * fname, const char * mode) {
|
|
559
582
|
#endif
|
560
583
|
|
561
584
|
}
|
562
|
-
static void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * GGML_RESTRICT x, size_t bx, const float * GGML_RESTRICT y, size_t by, int nrc);
|
563
|
-
static void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * GGML_RESTRICT x, size_t bx, ggml_fp16_t * GGML_RESTRICT y, size_t by, int nrc);
|
564
|
-
static void ggml_vec_dot_bf16(int n, float * GGML_RESTRICT s, size_t bs, ggml_bf16_t * GGML_RESTRICT x, size_t bx, ggml_bf16_t * GGML_RESTRICT y, size_t by, int nrc);
|
565
585
|
|
566
586
|
static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
|
567
587
|
[GGML_TYPE_I8] = {
|
@@ -667,6 +687,14 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
|
|
667
687
|
.is_quantized = true,
|
668
688
|
.from_float_ref = (ggml_from_float_t) quantize_row_q8_1_ref,
|
669
689
|
},
|
690
|
+
[GGML_TYPE_MXFP4] = {
|
691
|
+
.type_name = "mxfp4",
|
692
|
+
.blck_size = QK_MXFP4,
|
693
|
+
.type_size = sizeof(block_mxfp4),
|
694
|
+
.is_quantized = true,
|
695
|
+
.to_float = (ggml_to_float_t) dequantize_row_mxfp4,
|
696
|
+
.from_float_ref = (ggml_from_float_t)quantize_row_mxfp4_ref,
|
697
|
+
},
|
670
698
|
[GGML_TYPE_Q2_K] = {
|
671
699
|
.type_name = "q2_K",
|
672
700
|
.blck_size = QK_K,
|
@@ -894,6 +922,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
|
|
894
922
|
|
895
923
|
"DUP",
|
896
924
|
"ADD",
|
925
|
+
"ADD_ID",
|
897
926
|
"ADD1",
|
898
927
|
"ACC",
|
899
928
|
"SUB",
|
@@ -945,7 +974,9 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
|
|
945
974
|
"CONV_TRANSPOSE_1D",
|
946
975
|
"IM2COL",
|
947
976
|
"IM2COL_BACK",
|
977
|
+
"IM2COL_3D",
|
948
978
|
"CONV_2D",
|
979
|
+
"CONV_3D",
|
949
980
|
"CONV_2D_DW",
|
950
981
|
"CONV_TRANSPOSE_2D",
|
951
982
|
"POOL_1D",
|
@@ -983,17 +1014,19 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
|
|
983
1014
|
"CROSS_ENTROPY_LOSS",
|
984
1015
|
"CROSS_ENTROPY_LOSS_BACK",
|
985
1016
|
"OPT_STEP_ADAMW",
|
1017
|
+
"OPT_STEP_SGD",
|
986
1018
|
|
987
1019
|
"GLU",
|
988
1020
|
};
|
989
1021
|
|
990
|
-
static_assert(GGML_OP_COUNT == 86, "GGML_OP_COUNT != 86");
|
1022
|
+
static_assert(GGML_OP_COUNT == 90, "GGML_OP_COUNT != 90");
|
991
1023
|
|
992
1024
|
static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
993
1025
|
"none",
|
994
1026
|
|
995
1027
|
"x",
|
996
1028
|
"x+y",
|
1029
|
+
"x[i]+y",
|
997
1030
|
"x+y",
|
998
1031
|
"view(x,nb,offset)+=y->x",
|
999
1032
|
"x-y",
|
@@ -1045,7 +1078,9 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
|
1045
1078
|
"conv_transpose_1d(x)",
|
1046
1079
|
"im2col(x)",
|
1047
1080
|
"im2col_back(x)",
|
1081
|
+
"im2col_3d(x)",
|
1048
1082
|
"conv_2d(x)",
|
1083
|
+
"conv_3d(x)",
|
1049
1084
|
"conv_2d_dw(x)",
|
1050
1085
|
"conv_transpose_2d(x)",
|
1051
1086
|
"pool_1d(x)",
|
@@ -1083,15 +1118,15 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
|
1083
1118
|
"cross_entropy_loss(x,y)",
|
1084
1119
|
"cross_entropy_loss_back(x,y)",
|
1085
1120
|
"adamw(x)",
|
1121
|
+
"sgd(x)",
|
1086
1122
|
|
1087
1123
|
"glu(x)",
|
1088
1124
|
};
|
1089
1125
|
|
1090
|
-
static_assert(GGML_OP_COUNT == 86, "GGML_OP_COUNT != 86");
|
1126
|
+
static_assert(GGML_OP_COUNT == 90, "GGML_OP_COUNT != 90");
|
1091
1127
|
|
1092
1128
|
static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
|
1093
1129
|
|
1094
|
-
|
1095
1130
|
static const char * GGML_UNARY_OP_NAME[GGML_UNARY_OP_COUNT] = {
|
1096
1131
|
"ABS",
|
1097
1132
|
"SGN",
|
@@ -1117,9 +1152,12 @@ static const char * GGML_GLU_OP_NAME[GGML_GLU_OP_COUNT] = {
|
|
1117
1152
|
"REGLU",
|
1118
1153
|
"GEGLU",
|
1119
1154
|
"SWIGLU",
|
1155
|
+
"SWIGLU_OAI",
|
1156
|
+
"GEGLU_ERF",
|
1157
|
+
"GEGLU_QUICK",
|
1120
1158
|
};
|
1121
1159
|
|
1122
|
-
static_assert(GGML_GLU_OP_COUNT == 3, "GGML_GLU_OP_COUNT != 3");
|
1160
|
+
static_assert(GGML_GLU_OP_COUNT == 6, "GGML_GLU_OP_COUNT != 6");
|
1123
1161
|
|
1124
1162
|
|
1125
1163
|
static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
|
@@ -1287,6 +1325,7 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
|
|
1287
1325
|
case GGML_FTYPE_MOSTLY_Q5_0: wtype = GGML_TYPE_Q5_0; break;
|
1288
1326
|
case GGML_FTYPE_MOSTLY_Q5_1: wtype = GGML_TYPE_Q5_1; break;
|
1289
1327
|
case GGML_FTYPE_MOSTLY_Q8_0: wtype = GGML_TYPE_Q8_0; break;
|
1328
|
+
case GGML_FTYPE_MOSTLY_MXFP4: wtype = GGML_TYPE_MXFP4; break;
|
1290
1329
|
case GGML_FTYPE_MOSTLY_Q2_K: wtype = GGML_TYPE_Q2_K; break;
|
1291
1330
|
case GGML_FTYPE_MOSTLY_Q3_K: wtype = GGML_TYPE_Q3_K; break;
|
1292
1331
|
case GGML_FTYPE_MOSTLY_Q4_K: wtype = GGML_TYPE_Q4_K; break;
|
@@ -1937,6 +1976,27 @@ struct ggml_tensor * ggml_add_cast(
|
|
1937
1976
|
return ggml_add_cast_impl(ctx, a, b, type);
|
1938
1977
|
}
|
1939
1978
|
|
1979
|
+
struct ggml_tensor * ggml_add_id(
|
1980
|
+
struct ggml_context * ctx,
|
1981
|
+
struct ggml_tensor * a,
|
1982
|
+
struct ggml_tensor * b,
|
1983
|
+
struct ggml_tensor * ids) {
|
1984
|
+
|
1985
|
+
GGML_ASSERT(a->ne[0] == b->ne[0]);
|
1986
|
+
GGML_ASSERT(a->ne[1] == ids->ne[0]);
|
1987
|
+
GGML_ASSERT(a->ne[2] == ids->ne[1]);
|
1988
|
+
GGML_ASSERT(ids->type == GGML_TYPE_I32);
|
1989
|
+
|
1990
|
+
struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
|
1991
|
+
|
1992
|
+
result->op = GGML_OP_ADD_ID;
|
1993
|
+
result->src[0] = a;
|
1994
|
+
result->src[1] = b;
|
1995
|
+
result->src[2] = ids;
|
1996
|
+
|
1997
|
+
return result;
|
1998
|
+
}
|
1999
|
+
|
1940
2000
|
// ggml_add1
|
1941
2001
|
|
1942
2002
|
static struct ggml_tensor * ggml_add1_impl(
|
@@ -2745,6 +2805,61 @@ struct ggml_tensor * ggml_swiglu_split(
|
|
2745
2805
|
return ggml_glu_impl(ctx, a, b, GGML_GLU_OP_SWIGLU, false);
|
2746
2806
|
}
|
2747
2807
|
|
2808
|
+
// ggml_geglu_erf
|
2809
|
+
|
2810
|
+
struct ggml_tensor * ggml_geglu_erf(
|
2811
|
+
struct ggml_context * ctx,
|
2812
|
+
struct ggml_tensor * a) {
|
2813
|
+
return ggml_glu_impl(ctx, a, NULL, GGML_GLU_OP_GEGLU_ERF, false);
|
2814
|
+
}
|
2815
|
+
|
2816
|
+
struct ggml_tensor * ggml_geglu_erf_swapped(
|
2817
|
+
struct ggml_context * ctx,
|
2818
|
+
struct ggml_tensor * a) {
|
2819
|
+
return ggml_glu_impl(ctx, a, NULL, GGML_GLU_OP_GEGLU_ERF, true);
|
2820
|
+
}
|
2821
|
+
|
2822
|
+
struct ggml_tensor * ggml_geglu_erf_split(
|
2823
|
+
struct ggml_context * ctx,
|
2824
|
+
struct ggml_tensor * a,
|
2825
|
+
struct ggml_tensor * b) {
|
2826
|
+
return ggml_glu_impl(ctx, a, b, GGML_GLU_OP_GEGLU_ERF, false);
|
2827
|
+
}
|
2828
|
+
|
2829
|
+
// ggml_geglu_quick
|
2830
|
+
|
2831
|
+
struct ggml_tensor * ggml_geglu_quick(
|
2832
|
+
struct ggml_context * ctx,
|
2833
|
+
struct ggml_tensor * a) {
|
2834
|
+
return ggml_glu_impl(ctx, a, NULL, GGML_GLU_OP_GEGLU_QUICK, false);
|
2835
|
+
}
|
2836
|
+
|
2837
|
+
struct ggml_tensor * ggml_geglu_quick_swapped(
|
2838
|
+
struct ggml_context * ctx,
|
2839
|
+
struct ggml_tensor * a) {
|
2840
|
+
return ggml_glu_impl(ctx, a, NULL, GGML_GLU_OP_GEGLU_QUICK, true);
|
2841
|
+
}
|
2842
|
+
|
2843
|
+
struct ggml_tensor * ggml_geglu_quick_split(
|
2844
|
+
struct ggml_context * ctx,
|
2845
|
+
struct ggml_tensor * a,
|
2846
|
+
struct ggml_tensor * b) {
|
2847
|
+
return ggml_glu_impl(ctx, a, b, GGML_GLU_OP_GEGLU_QUICK, false);
|
2848
|
+
}
|
2849
|
+
|
2850
|
+
struct ggml_tensor * ggml_swiglu_oai(
|
2851
|
+
struct ggml_context * ctx,
|
2852
|
+
struct ggml_tensor * a,
|
2853
|
+
struct ggml_tensor * b,
|
2854
|
+
float alpha,
|
2855
|
+
float limit) {
|
2856
|
+
struct ggml_tensor * result = ggml_glu_impl(ctx, a, b, GGML_GLU_OP_SWIGLU_OAI, false);
|
2857
|
+
ggml_set_op_params_f32(result, 2, alpha);
|
2858
|
+
ggml_set_op_params_f32(result, 3, limit);
|
2859
|
+
|
2860
|
+
return result;
|
2861
|
+
}
|
2862
|
+
|
2748
2863
|
// ggml_norm
|
2749
2864
|
|
2750
2865
|
static struct ggml_tensor * ggml_norm_impl(
|
@@ -3002,12 +3117,14 @@ static struct ggml_tensor * ggml_scale_impl(
|
|
3002
3117
|
struct ggml_context * ctx,
|
3003
3118
|
struct ggml_tensor * a,
|
3004
3119
|
float s,
|
3120
|
+
float b,
|
3005
3121
|
bool inplace) {
|
3006
3122
|
GGML_ASSERT(ggml_is_padded_1d(a));
|
3007
3123
|
|
3008
3124
|
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
|
3009
3125
|
|
3010
|
-
|
3126
|
+
float params[2] = { s, b };
|
3127
|
+
ggml_set_op_params(result, &params, sizeof(params));
|
3011
3128
|
|
3012
3129
|
result->op = GGML_OP_SCALE;
|
3013
3130
|
result->src[0] = a;
|
@@ -3019,14 +3136,30 @@ struct ggml_tensor * ggml_scale(
|
|
3019
3136
|
struct ggml_context * ctx,
|
3020
3137
|
struct ggml_tensor * a,
|
3021
3138
|
float s) {
|
3022
|
-
return ggml_scale_impl(ctx, a, s, false);
|
3139
|
+
return ggml_scale_impl(ctx, a, s, 0.0, false);
|
3023
3140
|
}
|
3024
3141
|
|
3025
3142
|
struct ggml_tensor * ggml_scale_inplace(
|
3026
3143
|
struct ggml_context * ctx,
|
3027
3144
|
struct ggml_tensor * a,
|
3028
3145
|
float s) {
|
3029
|
-
return ggml_scale_impl(ctx, a, s, true);
|
3146
|
+
return ggml_scale_impl(ctx, a, s, 0.0, true);
|
3147
|
+
}
|
3148
|
+
|
3149
|
+
struct ggml_tensor * ggml_scale_bias(
|
3150
|
+
struct ggml_context * ctx,
|
3151
|
+
struct ggml_tensor * a,
|
3152
|
+
float s,
|
3153
|
+
float b) {
|
3154
|
+
return ggml_scale_impl(ctx, a, s, b, false);
|
3155
|
+
}
|
3156
|
+
|
3157
|
+
struct ggml_tensor * ggml_scale_bias_inplace(
|
3158
|
+
struct ggml_context * ctx,
|
3159
|
+
struct ggml_tensor * a,
|
3160
|
+
float s,
|
3161
|
+
float b) {
|
3162
|
+
return ggml_scale_impl(ctx, a, s, b, true);
|
3030
3163
|
}
|
3031
3164
|
|
3032
3165
|
// ggml_set
|
@@ -3490,6 +3623,7 @@ struct ggml_tensor * ggml_get_rows(
|
|
3490
3623
|
struct ggml_tensor * a,
|
3491
3624
|
struct ggml_tensor * b) {
|
3492
3625
|
GGML_ASSERT(a->ne[2] == b->ne[1]);
|
3626
|
+
GGML_ASSERT(a->ne[3] == b->ne[2]);
|
3493
3627
|
GGML_ASSERT(b->ne[3] == 1);
|
3494
3628
|
GGML_ASSERT(b->type == GGML_TYPE_I32);
|
3495
3629
|
|
@@ -3543,7 +3677,7 @@ struct ggml_tensor * ggml_set_rows(
|
|
3543
3677
|
GGML_ASSERT(b->ne[3] % c->ne[2] == 0);
|
3544
3678
|
GGML_ASSERT(c->ne[3] == 1);
|
3545
3679
|
GGML_ASSERT(b->type == GGML_TYPE_F32);
|
3546
|
-
GGML_ASSERT(c->type == GGML_TYPE_I64);
|
3680
|
+
GGML_ASSERT(c->type == GGML_TYPE_I64 || c->type == GGML_TYPE_I32);
|
3547
3681
|
|
3548
3682
|
GGML_ASSERT(ggml_is_contiguous_rows(a));
|
3549
3683
|
GGML_ASSERT(ggml_is_contiguous_rows(b));
|
@@ -3553,6 +3687,7 @@ struct ggml_tensor * ggml_set_rows(
|
|
3553
3687
|
result->op = GGML_OP_SET_ROWS;
|
3554
3688
|
result->src[0] = b;
|
3555
3689
|
result->src[1] = c;
|
3690
|
+
result->src[2] = a; // note: order is weird due to legacy reasons (https://github.com/ggml-org/llama.cpp/pull/16063#discussion_r2385795931)
|
3556
3691
|
|
3557
3692
|
return result;
|
3558
3693
|
}
|
@@ -3651,9 +3786,10 @@ static struct ggml_tensor * ggml_soft_max_impl(
|
|
3651
3786
|
if (mask) {
|
3652
3787
|
GGML_ASSERT(mask->type == GGML_TYPE_F16 || mask->type == GGML_TYPE_F32);
|
3653
3788
|
GGML_ASSERT(ggml_is_contiguous(mask));
|
3654
|
-
GGML_ASSERT(ggml_is_matrix(mask));
|
3655
3789
|
GGML_ASSERT(mask->ne[0] == a->ne[0]);
|
3656
3790
|
GGML_ASSERT(mask->ne[1] >= a->ne[1]);
|
3791
|
+
GGML_ASSERT(a->ne[2]%mask->ne[2] == 0);
|
3792
|
+
GGML_ASSERT(a->ne[3]%mask->ne[3] == 0);
|
3657
3793
|
}
|
3658
3794
|
|
3659
3795
|
if (max_bias > 0.0f) {
|
@@ -3693,6 +3829,22 @@ struct ggml_tensor * ggml_soft_max_ext(
|
|
3693
3829
|
return ggml_soft_max_impl(ctx, a, mask, scale, max_bias, false);
|
3694
3830
|
}
|
3695
3831
|
|
3832
|
+
void ggml_soft_max_add_sinks(
|
3833
|
+
struct ggml_tensor * a,
|
3834
|
+
struct ggml_tensor * sinks) {
|
3835
|
+
if (!sinks) {
|
3836
|
+
a->src[2] = NULL;
|
3837
|
+
return;
|
3838
|
+
}
|
3839
|
+
|
3840
|
+
GGML_ASSERT(a->op == GGML_OP_SOFT_MAX);
|
3841
|
+
GGML_ASSERT(a->src[2] == NULL);
|
3842
|
+
GGML_ASSERT(a->src[0]->ne[2] == sinks->ne[0]);
|
3843
|
+
GGML_ASSERT(sinks->type == GGML_TYPE_F32);
|
3844
|
+
|
3845
|
+
a->src[2] = sinks;
|
3846
|
+
}
|
3847
|
+
|
3696
3848
|
// ggml_soft_max_ext_back
|
3697
3849
|
|
3698
3850
|
static struct ggml_tensor * ggml_soft_max_ext_back_impl(
|
@@ -3740,6 +3892,7 @@ static struct ggml_tensor * ggml_rope_impl(
|
|
3740
3892
|
struct ggml_tensor * b,
|
3741
3893
|
struct ggml_tensor * c,
|
3742
3894
|
int n_dims,
|
3895
|
+
int sections[GGML_MROPE_SECTIONS],
|
3743
3896
|
int mode,
|
3744
3897
|
int n_ctx_orig,
|
3745
3898
|
float freq_base,
|
@@ -3753,15 +3906,19 @@ static struct ggml_tensor * ggml_rope_impl(
|
|
3753
3906
|
|
3754
3907
|
GGML_ASSERT(ggml_is_vector(b));
|
3755
3908
|
GGML_ASSERT(b->type == GGML_TYPE_I32);
|
3756
|
-
|
3909
|
+
|
3910
|
+
bool mrope_used = mode & GGML_ROPE_TYPE_MROPE;
|
3911
|
+
if (mrope_used) {
|
3912
|
+
GGML_ASSERT(a->ne[2] * 4 == b->ne[0]); // mrope expecting 4 position ids per token
|
3913
|
+
} else {
|
3914
|
+
GGML_ASSERT(a->ne[2] == b->ne[0]);
|
3915
|
+
}
|
3757
3916
|
|
3758
3917
|
if (c) {
|
3759
3918
|
GGML_ASSERT(c->type == GGML_TYPE_F32);
|
3760
3919
|
GGML_ASSERT(c->ne[0] >= n_dims / 2);
|
3761
3920
|
}
|
3762
3921
|
|
3763
|
-
int sections[4] = {0, 0, 0, 0};
|
3764
|
-
|
3765
3922
|
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
|
3766
3923
|
|
3767
3924
|
int32_t params[15] = { /*n_past*/ 0, n_dims, mode, /*n_ctx*/ 0, n_ctx_orig };
|
@@ -3771,7 +3928,11 @@ static struct ggml_tensor * ggml_rope_impl(
|
|
3771
3928
|
memcpy(params + 8, &attn_factor, sizeof(float));
|
3772
3929
|
memcpy(params + 9, &beta_fast, sizeof(float));
|
3773
3930
|
memcpy(params + 10, &beta_slow, sizeof(float));
|
3774
|
-
|
3931
|
+
if (mrope_used && sections) {
|
3932
|
+
memcpy(params + 11, sections, sizeof(int32_t) * GGML_MROPE_SECTIONS);
|
3933
|
+
} else {
|
3934
|
+
memset(params + 11, 0, sizeof(int32_t) * GGML_MROPE_SECTIONS);
|
3935
|
+
}
|
3775
3936
|
ggml_set_op_params(result, params, sizeof(params));
|
3776
3937
|
|
3777
3938
|
result->op = GGML_OP_ROPE;
|
@@ -3789,7 +3950,7 @@ struct ggml_tensor * ggml_rope(
|
|
3789
3950
|
int n_dims,
|
3790
3951
|
int mode) {
|
3791
3952
|
return ggml_rope_impl(
|
3792
|
-
ctx, a, b, NULL, n_dims, mode, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, false
|
3953
|
+
ctx, a, b, NULL, n_dims, NULL, mode, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, false
|
3793
3954
|
);
|
3794
3955
|
}
|
3795
3956
|
|
@@ -3799,7 +3960,7 @@ struct ggml_tensor * ggml_rope_multi(
|
|
3799
3960
|
struct ggml_tensor * b,
|
3800
3961
|
struct ggml_tensor * c,
|
3801
3962
|
int n_dims,
|
3802
|
-
int sections[
|
3963
|
+
int sections[GGML_MROPE_SECTIONS],
|
3803
3964
|
int mode,
|
3804
3965
|
int n_ctx_orig,
|
3805
3966
|
float freq_base,
|
@@ -3808,36 +3969,31 @@ struct ggml_tensor * ggml_rope_multi(
|
|
3808
3969
|
float attn_factor,
|
3809
3970
|
float beta_fast,
|
3810
3971
|
float beta_slow) {
|
3811
|
-
|
3812
|
-
|
3813
|
-
|
3814
|
-
|
3815
|
-
|
3816
|
-
GGML_ASSERT(a->ne[2] * 4 == b->ne[0]); // mrope expecting 4 position ids per token
|
3817
|
-
|
3818
|
-
if (c) {
|
3819
|
-
GGML_ASSERT(c->type == GGML_TYPE_F32);
|
3820
|
-
GGML_ASSERT(c->ne[0] >= n_dims / 2);
|
3821
|
-
}
|
3822
|
-
|
3823
|
-
struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
|
3824
|
-
|
3825
|
-
int32_t params[11 + 4] = { /*n_past*/ 0, n_dims, mode, /*n_ctx*/ 0, n_ctx_orig };
|
3826
|
-
memcpy(params + 5, &freq_base, sizeof(float));
|
3827
|
-
memcpy(params + 6, &freq_scale, sizeof(float));
|
3828
|
-
memcpy(params + 7, &ext_factor, sizeof(float));
|
3829
|
-
memcpy(params + 8, &attn_factor, sizeof(float));
|
3830
|
-
memcpy(params + 9, &beta_fast, sizeof(float));
|
3831
|
-
memcpy(params + 10, &beta_slow, sizeof(float));
|
3832
|
-
memcpy(&params[11], sections, sizeof(int)*4);
|
3833
|
-
ggml_set_op_params(result, params, sizeof(params));
|
3834
|
-
|
3835
|
-
result->op = GGML_OP_ROPE;
|
3836
|
-
result->src[0] = a;
|
3837
|
-
result->src[1] = b;
|
3838
|
-
result->src[2] = c;
|
3972
|
+
return ggml_rope_impl(
|
3973
|
+
ctx, a, b, c, n_dims, sections, mode, n_ctx_orig, freq_base, freq_scale,
|
3974
|
+
ext_factor, attn_factor, beta_fast, beta_slow, false
|
3975
|
+
);
|
3976
|
+
}
|
3839
3977
|
|
3840
|
-
|
3978
|
+
struct ggml_tensor * ggml_rope_multi_inplace(
|
3979
|
+
struct ggml_context * ctx,
|
3980
|
+
struct ggml_tensor * a,
|
3981
|
+
struct ggml_tensor * b,
|
3982
|
+
struct ggml_tensor * c,
|
3983
|
+
int n_dims,
|
3984
|
+
int sections[GGML_MROPE_SECTIONS],
|
3985
|
+
int mode,
|
3986
|
+
int n_ctx_orig,
|
3987
|
+
float freq_base,
|
3988
|
+
float freq_scale,
|
3989
|
+
float ext_factor,
|
3990
|
+
float attn_factor,
|
3991
|
+
float beta_fast,
|
3992
|
+
float beta_slow) {
|
3993
|
+
return ggml_rope_impl(
|
3994
|
+
ctx, a, b, c, n_dims, sections, mode, n_ctx_orig, freq_base, freq_scale,
|
3995
|
+
ext_factor, attn_factor, beta_fast, beta_slow, true
|
3996
|
+
);
|
3841
3997
|
}
|
3842
3998
|
|
3843
3999
|
struct ggml_tensor * ggml_rope_inplace(
|
@@ -3847,7 +4003,7 @@ struct ggml_tensor * ggml_rope_inplace(
|
|
3847
4003
|
int n_dims,
|
3848
4004
|
int mode) {
|
3849
4005
|
return ggml_rope_impl(
|
3850
|
-
ctx, a, b, NULL, n_dims, mode, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, true
|
4006
|
+
ctx, a, b, NULL, n_dims, NULL, mode, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, true
|
3851
4007
|
);
|
3852
4008
|
}
|
3853
4009
|
|
@@ -3866,7 +4022,7 @@ struct ggml_tensor * ggml_rope_ext(
|
|
3866
4022
|
float beta_fast,
|
3867
4023
|
float beta_slow) {
|
3868
4024
|
return ggml_rope_impl(
|
3869
|
-
ctx, a, b, c, n_dims, mode, n_ctx_orig, freq_base, freq_scale,
|
4025
|
+
ctx, a, b, c, n_dims, NULL, mode, n_ctx_orig, freq_base, freq_scale,
|
3870
4026
|
ext_factor, attn_factor, beta_fast, beta_slow, false
|
3871
4027
|
);
|
3872
4028
|
}
|
@@ -3886,7 +4042,7 @@ struct ggml_tensor * ggml_rope_ext_inplace(
|
|
3886
4042
|
float beta_fast,
|
3887
4043
|
float beta_slow) {
|
3888
4044
|
return ggml_rope_impl(
|
3889
|
-
ctx, a, b, c, n_dims, mode, n_ctx_orig, freq_base, freq_scale,
|
4045
|
+
ctx, a, b, c, n_dims, NULL, mode, n_ctx_orig, freq_base, freq_scale,
|
3890
4046
|
ext_factor, attn_factor, beta_fast, beta_slow, true
|
3891
4047
|
);
|
3892
4048
|
}
|
@@ -3905,7 +4061,7 @@ struct ggml_tensor * ggml_rope_custom(
|
|
3905
4061
|
float beta_fast,
|
3906
4062
|
float beta_slow) {
|
3907
4063
|
return ggml_rope_impl(
|
3908
|
-
ctx, a, b, NULL, n_dims, mode, n_ctx_orig, freq_base, freq_scale,
|
4064
|
+
ctx, a, b, NULL, n_dims, NULL, mode, n_ctx_orig, freq_base, freq_scale,
|
3909
4065
|
ext_factor, attn_factor, beta_fast, beta_slow, false
|
3910
4066
|
);
|
3911
4067
|
}
|
@@ -3924,7 +4080,7 @@ struct ggml_tensor * ggml_rope_custom_inplace(
|
|
3924
4080
|
float beta_fast,
|
3925
4081
|
float beta_slow) {
|
3926
4082
|
return ggml_rope_impl(
|
3927
|
-
ctx, a, b, NULL, n_dims, mode, n_ctx_orig, freq_base, freq_scale,
|
4083
|
+
ctx, a, b, NULL, n_dims, NULL, mode, n_ctx_orig, freq_base, freq_scale,
|
3928
4084
|
ext_factor, attn_factor, beta_fast, beta_slow, true
|
3929
4085
|
);
|
3930
4086
|
}
|
@@ -4122,14 +4278,13 @@ struct ggml_tensor * ggml_conv_1d_dw(
|
|
4122
4278
|
int s0,
|
4123
4279
|
int p0,
|
4124
4280
|
int d0) {
|
4125
|
-
struct ggml_tensor * new_a = ggml_reshape_4d(ctx, a, a->ne[0], 1, a->ne[1], a->ne[2]);
|
4126
4281
|
struct ggml_tensor * new_b = ggml_reshape_4d(ctx, b, b->ne[0], 1, b->ne[1], b->ne[2]);
|
4127
4282
|
|
4128
|
-
struct ggml_tensor * im2col = ggml_im2col(ctx,
|
4283
|
+
struct ggml_tensor * im2col = ggml_im2col(ctx, a, new_b, s0, 0, p0, 0, d0, 0, false, GGML_TYPE_F16);
|
4129
4284
|
|
4130
4285
|
struct ggml_tensor * result = ggml_mul_mat(ctx, im2col, a);
|
4131
4286
|
|
4132
|
-
result = ggml_reshape_3d(ctx, result,
|
4287
|
+
result = ggml_reshape_3d(ctx, result, result->ne[0], result->ne[2], 1);
|
4133
4288
|
|
4134
4289
|
return result;
|
4135
4290
|
}
|
@@ -4210,6 +4365,91 @@ struct ggml_tensor * ggml_conv_2d(
|
|
4210
4365
|
return result;
|
4211
4366
|
}
|
4212
4367
|
|
4368
|
+
// a: [OC*IC, KD, KH, KW]
|
4369
|
+
// b: [N*IC, ID, IH, IW]
|
4370
|
+
// result: [N*OD, OH, OW, IC * KD * KH * KW]
|
4371
|
+
struct ggml_tensor * ggml_im2col_3d(
|
4372
|
+
struct ggml_context * ctx,
|
4373
|
+
struct ggml_tensor * a,
|
4374
|
+
struct ggml_tensor * b,
|
4375
|
+
int64_t IC,
|
4376
|
+
int s0, // stride width
|
4377
|
+
int s1, // stride height
|
4378
|
+
int s2, // stride depth
|
4379
|
+
int p0, // padding width
|
4380
|
+
int p1, // padding height
|
4381
|
+
int p2, // padding depth
|
4382
|
+
int d0, // dilation width
|
4383
|
+
int d1, // dilation height
|
4384
|
+
int d2, // dilation depth
|
4385
|
+
enum ggml_type dst_type) {
|
4386
|
+
const int64_t N = b->ne[3] / IC;
|
4387
|
+
const int64_t ID = b->ne[2];
|
4388
|
+
const int64_t IH = b->ne[1];
|
4389
|
+
const int64_t IW = b->ne[0];
|
4390
|
+
|
4391
|
+
const int64_t OC = a->ne[3] / IC;
|
4392
|
+
UNUSED(OC);
|
4393
|
+
const int64_t KD = a->ne[2];
|
4394
|
+
const int64_t KH = a->ne[1];
|
4395
|
+
const int64_t KW = a->ne[0];
|
4396
|
+
const int64_t OD = ggml_calc_conv_output_size(ID, KD, s2, p2, d2);
|
4397
|
+
const int64_t OH = ggml_calc_conv_output_size(IH, KH, s1, p1, d1);
|
4398
|
+
const int64_t OW = ggml_calc_conv_output_size(IW, KW, s0, p0, d0);
|
4399
|
+
|
4400
|
+
GGML_ASSERT((OD > 0) && "b too small compared to a");
|
4401
|
+
GGML_ASSERT((OH > 0) && "b too small compared to a");
|
4402
|
+
GGML_ASSERT((OW > 0) && "b too small compared to a");
|
4403
|
+
|
4404
|
+
|
4405
|
+
const int64_t ne[4] = {KW*KH*KD*IC, OW, OH, OD*N};
|
4406
|
+
|
4407
|
+
struct ggml_tensor * result = ggml_new_tensor(ctx, dst_type, 4, ne);
|
4408
|
+
int32_t params[] = { s0, s1, s2, p0, p1, p2, d0, d1, d2, (int32_t)IC};
|
4409
|
+
ggml_set_op_params(result, params, sizeof(params));
|
4410
|
+
|
4411
|
+
result->op = GGML_OP_IM2COL_3D;
|
4412
|
+
result->src[0] = a;
|
4413
|
+
result->src[1] = b;
|
4414
|
+
|
4415
|
+
return result;
|
4416
|
+
}
|
4417
|
+
|
4418
|
+
// a: [OC*IC, KD, KH, KW]
|
4419
|
+
// b: [N*IC, ID, IH, IW]
|
4420
|
+
// result: [N*OC, OD, OH, OW]
|
4421
|
+
struct ggml_tensor * ggml_conv_3d(
|
4422
|
+
struct ggml_context * ctx,
|
4423
|
+
struct ggml_tensor * a,
|
4424
|
+
struct ggml_tensor * b,
|
4425
|
+
int64_t IC,
|
4426
|
+
int s0, // stride width
|
4427
|
+
int s1, // stride height
|
4428
|
+
int s2, // stride depth
|
4429
|
+
int p0, // padding width
|
4430
|
+
int p1, // padding height
|
4431
|
+
int p2, // padding depth
|
4432
|
+
int d0, // dilation width
|
4433
|
+
int d1, // dilation height
|
4434
|
+
int d2 // dilation depth
|
4435
|
+
) {
|
4436
|
+
struct ggml_tensor * im2col = ggml_im2col_3d(ctx, a, b, IC, s0, s1, s2, p0, p1, p2, d0, d1, d2, a->type); // [N*OD, OH, OW, IC * KD * KH * KW]
|
4437
|
+
|
4438
|
+
int64_t OC = a->ne[3] / IC;
|
4439
|
+
int64_t N = b->ne[3] / IC;
|
4440
|
+
struct ggml_tensor * result =
|
4441
|
+
ggml_mul_mat(ctx,
|
4442
|
+
ggml_reshape_2d(ctx, im2col, im2col->ne[0], im2col->ne[3] * im2col->ne[2] * im2col->ne[1]), // [N*OD, OH, OW, IC * KD * KH * KW] => [N*OD*OH*OW, IC * KD * KH * KW]
|
4443
|
+
ggml_reshape_2d(ctx, a, (a->ne[0] * a->ne[1] * a->ne[2] * IC), OC)); // [OC*IC, KD, KH, KW] => [OC, IC * KD * KH * KW]
|
4444
|
+
|
4445
|
+
int64_t OD = im2col->ne[3] / N;
|
4446
|
+
result = ggml_reshape_4d(ctx, result, im2col->ne[1]*im2col->ne[2], OD, N, OC); // [OC, N*OD*OH*OW] => [OC, N, OD, OH*OW]
|
4447
|
+
result = ggml_cont(ctx, ggml_permute(ctx, result, 0, 1, 3, 2)); // [N, OC, OD, OH*OW]
|
4448
|
+
result = ggml_reshape_4d(ctx, result, im2col->ne[1], im2col->ne[2], OD, OC * N); // [N*OC, OD, OH, OW]
|
4449
|
+
|
4450
|
+
return result;
|
4451
|
+
}
|
4452
|
+
|
4213
4453
|
// ggml_conv_2d_sk_p0
|
4214
4454
|
|
4215
4455
|
struct ggml_tensor * ggml_conv_2d_sk_p0(
|
@@ -4331,6 +4571,56 @@ struct ggml_tensor * ggml_conv_2d_direct(
|
|
4331
4571
|
return result;
|
4332
4572
|
}
|
4333
4573
|
|
4574
|
+
// ggml_conv_3d_direct
|
4575
|
+
|
4576
|
+
struct ggml_tensor * ggml_conv_3d_direct(
|
4577
|
+
struct ggml_context * ctx,
|
4578
|
+
struct ggml_tensor * a,
|
4579
|
+
struct ggml_tensor * b,
|
4580
|
+
int s0,
|
4581
|
+
int s1,
|
4582
|
+
int s2,
|
4583
|
+
int p0,
|
4584
|
+
int p1,
|
4585
|
+
int p2,
|
4586
|
+
int d0,
|
4587
|
+
int d1,
|
4588
|
+
int d2,
|
4589
|
+
int c,
|
4590
|
+
int n,
|
4591
|
+
int oc) {
|
4592
|
+
|
4593
|
+
GGML_ASSERT(a->ne[3] == (int64_t) c * oc);
|
4594
|
+
GGML_ASSERT(b->ne[3] == (int64_t) c * n);
|
4595
|
+
|
4596
|
+
int64_t ne[4];
|
4597
|
+
ne[0] = ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0);
|
4598
|
+
ne[1] = ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1);
|
4599
|
+
ne[2] = ggml_calc_conv_output_size(b->ne[2], a->ne[2], s2, p2, d2);
|
4600
|
+
ne[3] = (int64_t) oc * n;
|
4601
|
+
|
4602
|
+
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
|
4603
|
+
|
4604
|
+
ggml_set_op_params_i32(result, 0, s0);
|
4605
|
+
ggml_set_op_params_i32(result, 1, s1);
|
4606
|
+
ggml_set_op_params_i32(result, 2, s2);
|
4607
|
+
ggml_set_op_params_i32(result, 3, p0);
|
4608
|
+
ggml_set_op_params_i32(result, 4, p1);
|
4609
|
+
ggml_set_op_params_i32(result, 5, p2);
|
4610
|
+
ggml_set_op_params_i32(result, 6, d0);
|
4611
|
+
ggml_set_op_params_i32(result, 7, d1);
|
4612
|
+
ggml_set_op_params_i32(result, 8, d2);
|
4613
|
+
ggml_set_op_params_i32(result, 9, c);
|
4614
|
+
ggml_set_op_params_i32(result, 10, n);
|
4615
|
+
ggml_set_op_params_i32(result, 11, oc);
|
4616
|
+
|
4617
|
+
result->op = GGML_OP_CONV_3D;
|
4618
|
+
result->src[0] = a;
|
4619
|
+
result->src[1] = b;
|
4620
|
+
|
4621
|
+
return result;
|
4622
|
+
}
|
4623
|
+
|
4334
4624
|
// ggml_conv_transpose_2d_p0
|
4335
4625
|
|
4336
4626
|
static int64_t ggml_calc_conv_transpose_output_size(int64_t ins, int64_t ks, int s, int p) {
|
@@ -4509,11 +4799,36 @@ struct ggml_tensor * ggml_pad(
|
|
4509
4799
|
int p1,
|
4510
4800
|
int p2,
|
4511
4801
|
int p3) {
|
4802
|
+
return ggml_pad_ext(ctx, a, 0, p0, 0, p1, 0, p2, 0, p3);
|
4803
|
+
}
|
4804
|
+
|
4805
|
+
struct ggml_tensor * ggml_pad_ext(
|
4806
|
+
struct ggml_context * ctx,
|
4807
|
+
struct ggml_tensor * a,
|
4808
|
+
int lp0,
|
4809
|
+
int rp0,
|
4810
|
+
int lp1,
|
4811
|
+
int rp1,
|
4812
|
+
int lp2,
|
4813
|
+
int rp2,
|
4814
|
+
int lp3,
|
4815
|
+
int rp3
|
4816
|
+
) {
|
4512
4817
|
struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type,
|
4513
|
-
a->ne[0] +
|
4514
|
-
a->ne[1] +
|
4515
|
-
a->ne[2] +
|
4516
|
-
a->ne[3] +
|
4818
|
+
a->ne[0] + lp0 + rp0,
|
4819
|
+
a->ne[1] + lp1 + rp1,
|
4820
|
+
a->ne[2] + lp2 + rp2,
|
4821
|
+
a->ne[3] + lp3 + rp3);
|
4822
|
+
|
4823
|
+
ggml_set_op_params_i32(result, 0, lp0);
|
4824
|
+
ggml_set_op_params_i32(result, 1, rp0);
|
4825
|
+
ggml_set_op_params_i32(result, 2, lp1);
|
4826
|
+
ggml_set_op_params_i32(result, 3, rp1);
|
4827
|
+
ggml_set_op_params_i32(result, 4, lp2);
|
4828
|
+
ggml_set_op_params_i32(result, 5, rp2);
|
4829
|
+
ggml_set_op_params_i32(result, 6, lp3);
|
4830
|
+
ggml_set_op_params_i32(result, 7, rp3);
|
4831
|
+
|
4517
4832
|
|
4518
4833
|
result->op = GGML_OP_PAD;
|
4519
4834
|
result->src[0] = a;
|
@@ -4609,12 +4924,8 @@ struct ggml_tensor * ggml_timestep_embedding(
|
|
4609
4924
|
struct ggml_tensor * timesteps,
|
4610
4925
|
int dim,
|
4611
4926
|
int max_period) {
|
4612
|
-
int actual_dim = dim;
|
4613
|
-
if (dim % 2 != 0) {
|
4614
|
-
actual_dim = dim + 1;
|
4615
|
-
}
|
4616
4927
|
|
4617
|
-
struct ggml_tensor * result = ggml_new_tensor_2d(ctx, GGML_TYPE_F32,
|
4928
|
+
struct ggml_tensor * result = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, dim, timesteps->ne[0]);
|
4618
4929
|
|
4619
4930
|
ggml_set_op_params_i32(result, 0, dim);
|
4620
4931
|
ggml_set_op_params_i32(result, 1, max_period);
|
@@ -4674,13 +4985,17 @@ struct ggml_tensor * ggml_flash_attn_ext(
|
|
4674
4985
|
GGML_ASSERT(ggml_can_mul_mat(k, q));
|
4675
4986
|
// TODO: check if vT can be multiplied by (k*qT)
|
4676
4987
|
|
4988
|
+
GGML_ASSERT(q->ne[3] == k->ne[3]);
|
4989
|
+
GGML_ASSERT(q->ne[3] == v->ne[3]);
|
4990
|
+
|
4677
4991
|
if (mask) {
|
4678
4992
|
GGML_ASSERT(ggml_is_contiguous(mask));
|
4679
|
-
GGML_ASSERT(mask->ne[2] == 1);
|
4680
|
-
GGML_ASSERT(mask->ne[3] == 1);
|
4681
4993
|
GGML_ASSERT(mask->ne[1] >= GGML_PAD(q->ne[1], GGML_KQ_MASK_PAD) &&
|
4682
4994
|
"the Flash-Attention kernel requires the mask to be padded to GGML_KQ_MASK_PAD and at least n_queries big");
|
4683
4995
|
//GGML_ASSERT(ggml_can_repeat_rows(mask, qk));
|
4996
|
+
|
4997
|
+
GGML_ASSERT(q->ne[2] % mask->ne[2] == 0);
|
4998
|
+
GGML_ASSERT(q->ne[3] % mask->ne[3] == 0);
|
4684
4999
|
}
|
4685
5000
|
|
4686
5001
|
if (max_bias > 0.0f) {
|
@@ -4722,6 +5037,22 @@ enum ggml_prec ggml_flash_attn_ext_get_prec(
|
|
4722
5037
|
return (enum ggml_prec) prec_i32;
|
4723
5038
|
}
|
4724
5039
|
|
5040
|
+
void ggml_flash_attn_ext_add_sinks(
|
5041
|
+
struct ggml_tensor * a,
|
5042
|
+
struct ggml_tensor * sinks) {
|
5043
|
+
if (!sinks) {
|
5044
|
+
a->src[4] = NULL;
|
5045
|
+
return;
|
5046
|
+
}
|
5047
|
+
|
5048
|
+
GGML_ASSERT(a->op == GGML_OP_FLASH_ATTN_EXT);
|
5049
|
+
GGML_ASSERT(a->src[4] == NULL);
|
5050
|
+
GGML_ASSERT(a->src[0]->ne[2] == sinks->ne[0]);
|
5051
|
+
GGML_ASSERT(sinks->type == GGML_TYPE_F32);
|
5052
|
+
|
5053
|
+
a->src[4] = sinks;
|
5054
|
+
}
|
5055
|
+
|
4725
5056
|
// ggml_flash_attn_back
|
4726
5057
|
|
4727
5058
|
struct ggml_tensor * ggml_flash_attn_back(
|
@@ -4808,7 +5139,6 @@ struct ggml_tensor * ggml_ssm_conv(
|
|
4808
5139
|
const int64_t n_s = sx->ne[2];
|
4809
5140
|
|
4810
5141
|
// TODO: maybe support other strides than 1?
|
4811
|
-
// FIXME: this is always true?
|
4812
5142
|
GGML_ASSERT(sx->ne[0] == d_conv - 1 + n_t);
|
4813
5143
|
GGML_ASSERT(sx->ne[1] == d_inner);
|
4814
5144
|
GGML_ASSERT(n_t >= 0);
|
@@ -4831,36 +5161,49 @@ struct ggml_tensor * ggml_ssm_scan(
|
|
4831
5161
|
struct ggml_tensor * dt,
|
4832
5162
|
struct ggml_tensor * A,
|
4833
5163
|
struct ggml_tensor * B,
|
4834
|
-
struct ggml_tensor * C
|
5164
|
+
struct ggml_tensor * C,
|
5165
|
+
struct ggml_tensor * ids) {
|
4835
5166
|
GGML_ASSERT(ggml_is_contiguous(s));
|
4836
|
-
GGML_ASSERT(ggml_is_contiguous(x));
|
4837
5167
|
GGML_ASSERT(ggml_is_contiguous(dt));
|
4838
5168
|
GGML_ASSERT(ggml_is_contiguous(A));
|
4839
|
-
GGML_ASSERT(
|
4840
|
-
GGML_ASSERT(ggml_is_3d(B));
|
4841
|
-
GGML_ASSERT(ggml_is_3d(s));
|
5169
|
+
GGML_ASSERT(x->nb[0] == ggml_type_size(x->type));
|
4842
5170
|
GGML_ASSERT(B->nb[0] == ggml_type_size(B->type));
|
4843
5171
|
GGML_ASSERT(C->nb[0] == ggml_type_size(C->type));
|
4844
|
-
GGML_ASSERT(
|
5172
|
+
GGML_ASSERT(x->nb[1] == x->ne[0]*x->nb[0]);
|
5173
|
+
GGML_ASSERT(B->nb[1] == B->ne[0]*B->nb[0]);
|
5174
|
+
GGML_ASSERT(C->nb[1] == C->ne[0]*C->nb[0]);
|
4845
5175
|
GGML_ASSERT(ggml_are_same_shape(B, C));
|
5176
|
+
GGML_ASSERT(ids->type == GGML_TYPE_I32);
|
4846
5177
|
|
4847
5178
|
{
|
4848
5179
|
const int64_t d_state = s->ne[0];
|
4849
|
-
const int64_t
|
4850
|
-
const int64_t
|
4851
|
-
const int64_t
|
4852
|
-
|
4853
|
-
|
4854
|
-
GGML_ASSERT(
|
4855
|
-
GGML_ASSERT(
|
4856
|
-
GGML_ASSERT(
|
5180
|
+
const int64_t head_dim = x->ne[0];
|
5181
|
+
const int64_t n_head = x->ne[1];
|
5182
|
+
const int64_t n_seq_tokens = x->ne[2];
|
5183
|
+
const int64_t n_seqs = x->ne[3];
|
5184
|
+
|
5185
|
+
GGML_ASSERT(dt->ne[0] == n_head);
|
5186
|
+
GGML_ASSERT(dt->ne[1] == n_seq_tokens);
|
5187
|
+
GGML_ASSERT(dt->ne[2] == n_seqs);
|
5188
|
+
GGML_ASSERT(ggml_is_3d(dt));
|
5189
|
+
GGML_ASSERT(s->ne[1] == head_dim);
|
5190
|
+
GGML_ASSERT(s->ne[2] == n_head);
|
4857
5191
|
GGML_ASSERT(B->ne[0] == d_state);
|
4858
|
-
GGML_ASSERT(B->ne[
|
4859
|
-
GGML_ASSERT(B->ne[
|
5192
|
+
GGML_ASSERT(B->ne[2] == n_seq_tokens);
|
5193
|
+
GGML_ASSERT(B->ne[3] == n_seqs);
|
5194
|
+
GGML_ASSERT(ids->ne[0] == n_seqs);
|
5195
|
+
GGML_ASSERT(ggml_is_vector(ids));
|
5196
|
+
GGML_ASSERT(A->ne[1] == n_head);
|
5197
|
+
GGML_ASSERT(ggml_is_matrix(A));
|
5198
|
+
|
5199
|
+
if (A->ne[0] != 1) {
|
5200
|
+
// Mamba-1 has more granular decay factors
|
5201
|
+
GGML_ASSERT(A->ne[0] == d_state);
|
5202
|
+
}
|
4860
5203
|
}
|
4861
5204
|
|
4862
5205
|
// concatenated y + ssm_states
|
4863
|
-
struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, ggml_nelements(x) +
|
5206
|
+
struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, ggml_nelements(x) + s->ne[0]*s->ne[1]*s->ne[2]*ids->ne[0]);
|
4864
5207
|
|
4865
5208
|
result->op = GGML_OP_SSM_SCAN;
|
4866
5209
|
result->src[0] = s;
|
@@ -4869,6 +5212,7 @@ struct ggml_tensor * ggml_ssm_scan(
|
|
4869
5212
|
result->src[3] = A;
|
4870
5213
|
result->src[4] = B;
|
4871
5214
|
result->src[5] = C;
|
5215
|
+
result->src[6] = ids;
|
4872
5216
|
|
4873
5217
|
return result;
|
4874
5218
|
}
|
@@ -5424,6 +5768,28 @@ struct ggml_tensor * ggml_opt_step_adamw(
|
|
5424
5768
|
return result;
|
5425
5769
|
}
|
5426
5770
|
|
5771
|
+
// opt_step_sgd
|
5772
|
+
|
5773
|
+
struct ggml_tensor * ggml_opt_step_sgd(
|
5774
|
+
struct ggml_context * ctx,
|
5775
|
+
struct ggml_tensor * a,
|
5776
|
+
struct ggml_tensor * grad,
|
5777
|
+
struct ggml_tensor * params) {
|
5778
|
+
GGML_ASSERT(a->flags & GGML_TENSOR_FLAG_PARAM);
|
5779
|
+
GGML_ASSERT(ggml_are_same_shape(a, grad));
|
5780
|
+
GGML_ASSERT(params->type == GGML_TYPE_F32);
|
5781
|
+
GGML_ASSERT(ggml_nelements(params) == 2);
|
5782
|
+
|
5783
|
+
struct ggml_tensor * result = ggml_view_tensor(ctx, a);
|
5784
|
+
|
5785
|
+
result->op = GGML_OP_OPT_STEP_SGD;
|
5786
|
+
result->src[0] = a;
|
5787
|
+
result->src[1] = grad;
|
5788
|
+
result->src[2] = params;
|
5789
|
+
|
5790
|
+
return result;
|
5791
|
+
}
|
5792
|
+
|
5427
5793
|
////////////////////////////////////////////////////////////////////////////////
|
5428
5794
|
|
5429
5795
|
struct ggml_hash_set ggml_hash_set_new(size_t size) {
|
@@ -5692,7 +6058,7 @@ static void ggml_compute_backward(
|
|
5692
6058
|
} break;
|
5693
6059
|
case GGML_OP_MEAN: {
|
5694
6060
|
if (src0_needs_grads) {
|
5695
|
-
ggml_add1_or_set(ctx, cgraph, isrc0, ggml_scale_impl(ctx, grad, 1.0f/src0->ne[0], false));
|
6061
|
+
ggml_add1_or_set(ctx, cgraph, isrc0, ggml_scale_impl(ctx, grad, 1.0f/src0->ne[0], 0.0, false));
|
5696
6062
|
}
|
5697
6063
|
} break;
|
5698
6064
|
case GGML_OP_REPEAT: {
|
@@ -5769,7 +6135,7 @@ static void ggml_compute_backward(
|
|
5769
6135
|
if (src0_needs_grads) {
|
5770
6136
|
float s;
|
5771
6137
|
memcpy(&s, tensor->op_params, sizeof(float));
|
5772
|
-
ggml_add_or_set(ctx, cgraph, isrc0, ggml_scale_impl(ctx, grad, s, false));
|
6138
|
+
ggml_add_or_set(ctx, cgraph, isrc0, ggml_scale_impl(ctx, grad, s, 0.0, false));
|
5773
6139
|
}
|
5774
6140
|
} break;
|
5775
6141
|
case GGML_OP_SET: {
|
@@ -6009,13 +6375,28 @@ static void ggml_compute_backward(
|
|
6009
6375
|
}
|
6010
6376
|
GGML_ASSERT(!src1_needs_grads && "backward pass for labels not implemented");
|
6011
6377
|
} break;
|
6378
|
+
case GGML_OP_GLU: {
|
6379
|
+
switch (ggml_get_glu_op(tensor)) {
|
6380
|
+
case GGML_GLU_OP_SWIGLU: {
|
6381
|
+
if (src0_needs_grads) {
|
6382
|
+
GGML_ASSERT(src1 && "backward pass only implemented for split swiglu");
|
6383
|
+
ggml_add_or_set(ctx, cgraph, isrc0, ggml_silu_back(ctx, ggml_mul(ctx, grad, src1), src0));
|
6384
|
+
}
|
6385
|
+
if (src1_needs_grads) {
|
6386
|
+
ggml_add_or_set(ctx, cgraph, isrc1, ggml_mul(ctx, ggml_silu(ctx, src0), grad));
|
6387
|
+
}
|
6388
|
+
} break;
|
6389
|
+
default: {
|
6390
|
+
GGML_ABORT("unsupported glu op for backward pass: %s", ggml_glu_op_name(ggml_get_glu_op(tensor)));
|
6391
|
+
} //break;
|
6392
|
+
}
|
6393
|
+
} break;
|
6012
6394
|
case GGML_OP_NONE: {
|
6013
6395
|
// noop
|
6014
6396
|
} break;
|
6015
6397
|
case GGML_OP_COUNT:
|
6016
6398
|
default: {
|
6017
|
-
|
6018
|
-
GGML_ABORT("fatal error");
|
6399
|
+
GGML_ABORT("%s: unsupported ggml op for backward pass: %s\n", __func__, ggml_op_name(tensor->op));
|
6019
6400
|
} //break;
|
6020
6401
|
}
|
6021
6402
|
|
@@ -6522,20 +6903,18 @@ static struct ggml_tensor * ggml_graph_get_parent(const struct ggml_cgraph * cgr
|
|
6522
6903
|
static void ggml_graph_dump_dot_node_edge(FILE * fp, const struct ggml_cgraph * gb, struct ggml_tensor * node, struct ggml_tensor * parent, const char * label) {
|
6523
6904
|
struct ggml_tensor * gparent = ggml_graph_get_parent(gb, node);
|
6524
6905
|
struct ggml_tensor * gparent0 = ggml_graph_get_parent(gb, parent);
|
6525
|
-
fprintf(fp, " \"%p\"
|
6906
|
+
fprintf(fp, " \"%p\" -> \"%p\" [ arrowhead = %s; style = %s; label = \"%s\"; ]\n",
|
6526
6907
|
gparent0 ? (void *) gparent0 : (void *) parent,
|
6527
|
-
gparent0 ? "g" : "x",
|
6528
6908
|
gparent ? (void *) gparent : (void *) node,
|
6529
|
-
gparent ? "g" : "x",
|
6530
6909
|
gparent ? "empty" : "vee",
|
6531
6910
|
gparent ? "dashed" : "solid",
|
6532
6911
|
label);
|
6533
6912
|
}
|
6534
6913
|
|
6535
6914
|
static void ggml_graph_dump_dot_leaf_edge(FILE * fp, struct ggml_tensor * node, struct ggml_tensor * parent, const char * label) {
|
6536
|
-
fprintf(fp, " \"%p\"
|
6537
|
-
(void *) parent,
|
6538
|
-
(void *) node,
|
6915
|
+
fprintf(fp, " \"%p\" -> \"%p\" [ label = \"%s\"; ]\n",
|
6916
|
+
(void *) parent,
|
6917
|
+
(void *) node,
|
6539
6918
|
label);
|
6540
6919
|
}
|
6541
6920
|
|
@@ -6756,6 +7135,7 @@ size_t ggml_quantize_chunk(
|
|
6756
7135
|
case GGML_TYPE_Q5_0: result = quantize_q5_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
6757
7136
|
case GGML_TYPE_Q5_1: result = quantize_q5_1(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
6758
7137
|
case GGML_TYPE_Q8_0: result = quantize_q8_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
7138
|
+
case GGML_TYPE_MXFP4: result = quantize_mxfp4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
6759
7139
|
case GGML_TYPE_Q2_K: result = quantize_q2_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
6760
7140
|
case GGML_TYPE_Q3_K: result = quantize_q3_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
6761
7141
|
case GGML_TYPE_Q4_K: result = quantize_q4_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|