whispercpp 1.3.3 → 1.3.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/ext/ruby_whisper_params.c +55 -25
- data/ext/sources/CMakeLists.txt +1 -1
- data/ext/sources/bindings/javascript/package.json +1 -1
- data/ext/sources/build-xcframework.sh +24 -0
- data/ext/sources/examples/CMakeLists.txt +1 -0
- data/ext/sources/examples/addon.node/addon.cpp +19 -19
- data/ext/sources/examples/addon.node/index.js +7 -5
- data/ext/sources/examples/bench/bench.cpp +26 -16
- data/ext/sources/examples/bench.wasm/index-tmpl.html +10 -9
- data/ext/sources/examples/cli/cli.cpp +4 -2
- data/ext/sources/examples/command/command.cpp +26 -24
- data/ext/sources/examples/command.wasm/index-tmpl.html +5 -4
- data/ext/sources/examples/common-ggml.cpp +2 -0
- data/ext/sources/examples/lsp/lsp.cpp +19 -17
- data/ext/sources/examples/server/server.cpp +24 -13
- data/ext/sources/examples/server.py +6 -1
- data/ext/sources/examples/stream/stream.cpp +4 -2
- data/ext/sources/examples/stream.wasm/emscripten.cpp +6 -6
- data/ext/sources/examples/stream.wasm/index-tmpl.html +82 -5
- data/ext/sources/examples/talk-llama/CMakeLists.txt +2 -2
- data/ext/sources/examples/talk-llama/llama-adapter.cpp +101 -4
- data/ext/sources/examples/talk-llama/llama-adapter.h +6 -0
- data/ext/sources/examples/talk-llama/llama-arch.cpp +588 -15
- data/ext/sources/examples/talk-llama/llama-arch.h +58 -1
- data/ext/sources/examples/talk-llama/llama-batch.cpp +103 -71
- data/ext/sources/examples/talk-llama/llama-batch.h +31 -18
- data/ext/sources/examples/talk-llama/llama-chat.cpp +120 -5
- data/ext/sources/examples/talk-llama/llama-chat.h +7 -0
- data/ext/sources/examples/talk-llama/llama-context.cpp +460 -357
- data/ext/sources/examples/talk-llama/llama-context.h +44 -29
- data/ext/sources/examples/talk-llama/llama-cparams.h +4 -4
- data/ext/sources/examples/talk-llama/llama-graph.cpp +543 -271
- data/ext/sources/examples/talk-llama/llama-graph.h +278 -168
- data/ext/sources/examples/talk-llama/llama-hparams.cpp +118 -4
- data/ext/sources/examples/talk-llama/llama-hparams.h +61 -15
- data/ext/sources/examples/talk-llama/llama-impl.h +2 -0
- data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.cpp +326 -0
- data/ext/sources/examples/talk-llama/{llama-kv-cache-unified-iswa.h → llama-kv-cache-iswa.h} +38 -29
- data/ext/sources/examples/talk-llama/llama-kv-cache.cpp +2020 -0
- data/ext/sources/examples/talk-llama/llama-kv-cache.h +358 -27
- data/ext/sources/examples/talk-llama/llama-kv-cells.h +80 -28
- data/ext/sources/examples/talk-llama/llama-memory-hybrid.cpp +56 -36
- data/ext/sources/examples/talk-llama/llama-memory-hybrid.h +30 -29
- data/ext/sources/examples/talk-llama/llama-memory-recurrent.cpp +48 -19
- data/ext/sources/examples/talk-llama/llama-memory-recurrent.h +13 -14
- data/ext/sources/examples/talk-llama/llama-memory.h +16 -10
- data/ext/sources/examples/talk-llama/llama-model-loader.cpp +2 -0
- data/ext/sources/examples/talk-llama/llama-model-loader.h +3 -2
- data/ext/sources/examples/talk-llama/llama-model.cpp +7165 -2336
- data/ext/sources/examples/talk-llama/llama-model.h +60 -9
- data/ext/sources/examples/talk-llama/llama-quant.cpp +48 -10
- data/ext/sources/examples/talk-llama/llama-sampling.cpp +226 -126
- data/ext/sources/examples/talk-llama/llama-vocab.cpp +440 -13
- data/ext/sources/examples/talk-llama/llama-vocab.h +45 -0
- data/ext/sources/examples/talk-llama/llama.cpp +65 -10
- data/ext/sources/examples/talk-llama/llama.h +95 -177
- data/ext/sources/examples/talk-llama/talk-llama.cpp +9 -6
- data/ext/sources/examples/talk-llama/unicode.cpp +207 -0
- data/ext/sources/examples/talk-llama/unicode.h +45 -0
- data/ext/sources/examples/wchess/wchess.cmd/wchess.cmd.cpp +4 -2
- data/ext/sources/examples/whisper.wasm/index-tmpl.html +17 -16
- data/ext/sources/ggml/CMakeLists.txt +59 -31
- data/ext/sources/ggml/cmake/ggml-config.cmake.in +132 -93
- data/ext/sources/ggml/include/ggml-backend.h +17 -1
- data/ext/sources/ggml/include/ggml-cpu.h +1 -1
- data/ext/sources/ggml/include/ggml-metal.h +1 -6
- data/ext/sources/ggml/include/ggml-opt.h +25 -6
- data/ext/sources/ggml/include/ggml-webgpu.h +19 -0
- data/ext/sources/ggml/include/ggml-zdnn.h +17 -0
- data/ext/sources/ggml/include/ggml.h +221 -16
- data/ext/sources/ggml/src/CMakeLists.txt +17 -2
- data/ext/sources/ggml/src/ggml-alloc.c +265 -141
- data/ext/sources/ggml/src/ggml-backend-impl.h +4 -1
- data/ext/sources/ggml/src/ggml-backend-reg.cpp +30 -13
- data/ext/sources/ggml/src/ggml-backend.cpp +221 -38
- data/ext/sources/ggml/src/ggml-blas/CMakeLists.txt +1 -1
- data/ext/sources/ggml/src/ggml-blas/ggml-blas.cpp +5 -4
- data/ext/sources/ggml/src/ggml-cann/CMakeLists.txt +14 -0
- data/ext/sources/ggml/src/ggml-cann/acl_tensor.cpp +3 -1
- data/ext/sources/ggml/src/ggml-cann/aclnn_ops.cpp +903 -717
- data/ext/sources/ggml/src/ggml-cann/aclnn_ops.h +143 -25
- data/ext/sources/ggml/src/ggml-cann/common.h +143 -1
- data/ext/sources/ggml/src/ggml-cann/ggml-cann.cpp +488 -69
- data/ext/sources/ggml/src/ggml-common.h +17 -0
- data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +40 -18
- data/ext/sources/ggml/src/ggml-cpu/amx/amx.cpp +4 -2
- data/ext/sources/ggml/src/ggml-cpu/arch/arm/quants.c +132 -596
- data/ext/sources/ggml/src/ggml-cpu/arch/arm/repack.cpp +14 -286
- data/ext/sources/ggml/src/ggml-cpu/arch/loongarch/quants.c +103 -582
- data/ext/sources/ggml/src/ggml-cpu/arch/powerpc/quants.c +162 -589
- data/ext/sources/ggml/src/ggml-cpu/arch/riscv/quants.c +265 -437
- data/ext/sources/ggml/src/ggml-cpu/arch/riscv/repack.cpp +3 -58
- data/ext/sources/ggml/src/ggml-cpu/arch/s390/quants.c +521 -353
- data/ext/sources/ggml/src/ggml-cpu/arch/wasm/quants.c +54 -314
- data/ext/sources/ggml/src/ggml-cpu/arch/x86/quants.c +184 -675
- data/ext/sources/ggml/src/ggml-cpu/arch/x86/repack.cpp +4679 -1657
- data/ext/sources/ggml/src/ggml-cpu/arch-fallback.h +32 -2
- data/ext/sources/ggml/src/ggml-cpu/common.h +14 -0
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-impl.h +13 -6
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +70 -42
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.cpp +35 -28
- data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.cpp +152 -18
- data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.h +7 -1
- data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +227 -97
- data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +474 -1116
- data/ext/sources/ggml/src/ggml-cpu/ops.cpp +1587 -1177
- data/ext/sources/ggml/src/ggml-cpu/ops.h +5 -8
- data/ext/sources/ggml/src/ggml-cpu/quants.c +35 -0
- data/ext/sources/ggml/src/ggml-cpu/quants.h +8 -0
- data/ext/sources/ggml/src/ggml-cpu/repack.cpp +458 -47
- data/ext/sources/ggml/src/ggml-cpu/repack.h +22 -0
- data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +89 -60
- data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.cpp +1024 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.h +13 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp +3196 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/ime_kernels.h +26 -0
- data/ext/sources/ggml/src/ggml-cpu/traits.cpp +2 -2
- data/ext/sources/ggml/src/ggml-cpu/traits.h +1 -1
- data/ext/sources/ggml/src/ggml-cpu/vec.cpp +170 -26
- data/ext/sources/ggml/src/ggml-cpu/vec.h +506 -63
- data/ext/sources/ggml/src/ggml-cuda/CMakeLists.txt +20 -16
- data/ext/sources/ggml/src/ggml-cuda/add-id.cu +58 -0
- data/ext/sources/ggml/src/ggml-cuda/add-id.cuh +3 -0
- data/ext/sources/ggml/src/ggml-cuda/binbcast.cu +330 -191
- data/ext/sources/ggml/src/ggml-cuda/binbcast.cuh +2 -0
- data/ext/sources/ggml/src/ggml-cuda/common.cuh +250 -63
- data/ext/sources/ggml/src/ggml-cuda/conv-transpose-1d.cu +1 -4
- data/ext/sources/ggml/src/ggml-cuda/conv2d.cu +166 -0
- data/ext/sources/ggml/src/ggml-cuda/conv2d.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/convert.cu +95 -22
- data/ext/sources/ggml/src/ggml-cuda/convert.cuh +15 -0
- data/ext/sources/ggml/src/ggml-cuda/cpy-utils.cuh +217 -0
- data/ext/sources/ggml/src/ggml-cuda/cpy.cu +64 -307
- data/ext/sources/ggml/src/ggml-cuda/cross-entropy-loss.cu +2 -14
- data/ext/sources/ggml/src/ggml-cuda/dequantize.cuh +14 -40
- data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +498 -367
- data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +137 -91
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cu +755 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cuh +3 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn-vec.cuh +593 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +86 -50
- data/ext/sources/ggml/src/ggml-cuda/fattn.cu +185 -198
- data/ext/sources/ggml/src/ggml-cuda/fattn.cuh +2 -0
- data/ext/sources/ggml/src/ggml-cuda/getrows.cu +50 -39
- data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +379 -107
- data/ext/sources/ggml/src/ggml-cuda/im2col.cu +196 -35
- data/ext/sources/ggml/src/ggml-cuda/im2col.cuh +1 -0
- data/ext/sources/ggml/src/ggml-cuda/mean.cu +56 -2
- data/ext/sources/ggml/src/ggml-cuda/mma.cuh +198 -45
- data/ext/sources/ggml/src/ggml-cuda/mmf.cu +123 -0
- data/ext/sources/ggml/src/ggml-cuda/mmf.cuh +496 -0
- data/ext/sources/ggml/src/ggml-cuda/mmq.cu +206 -57
- data/ext/sources/ggml/src/ggml-cuda/mmq.cuh +1262 -721
- data/ext/sources/ggml/src/ggml-cuda/{mmv.cu → mmvf.cu} +53 -53
- data/ext/sources/ggml/src/ggml-cuda/{mmv.cuh → mmvf.cuh} +3 -3
- data/ext/sources/ggml/src/ggml-cuda/mmvq.cu +64 -73
- data/ext/sources/ggml/src/ggml-cuda/norm.cu +284 -12
- data/ext/sources/ggml/src/ggml-cuda/norm.cuh +7 -0
- data/ext/sources/ggml/src/ggml-cuda/opt-step-sgd.cu +49 -0
- data/ext/sources/ggml/src/ggml-cuda/opt-step-sgd.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/pad.cu +46 -23
- data/ext/sources/ggml/src/ggml-cuda/pad_reflect_1d.cu +91 -0
- data/ext/sources/ggml/src/ggml-cuda/pad_reflect_1d.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/quantize.cu +12 -10
- data/ext/sources/ggml/src/ggml-cuda/reduce_rows.cuh +53 -0
- data/ext/sources/ggml/src/ggml-cuda/roll.cu +67 -0
- data/ext/sources/ggml/src/ggml-cuda/roll.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/rope.cu +21 -27
- data/ext/sources/ggml/src/ggml-cuda/scale.cu +14 -11
- data/ext/sources/ggml/src/ggml-cuda/set-rows.cu +276 -0
- data/ext/sources/ggml/src/ggml-cuda/set-rows.cuh +7 -0
- data/ext/sources/ggml/src/ggml-cuda/softcap.cu +34 -0
- data/ext/sources/ggml/src/ggml-cuda/softcap.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/softmax.cu +126 -59
- data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cu +10 -2
- data/ext/sources/ggml/src/ggml-cuda/ssm-scan.cu +322 -100
- data/ext/sources/ggml/src/ggml-cuda/sum.cu +6 -10
- data/ext/sources/ggml/src/ggml-cuda/sumrows.cu +21 -4
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-f16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q4_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q4_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q5_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q5_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q8_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-f16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q5_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q5_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q8_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-f16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q4_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q4_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q5_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q5_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q8_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-f16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q4_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q4_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q5_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q5_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q8_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-f16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q4_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q4_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q5_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q5_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q8_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-f16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q4_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q4_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q5_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q5_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q8_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +21 -18
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_10.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_11.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_12.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_13.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_14.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_15.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_16.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_2.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_3.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_4.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_5.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_6.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_7.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_8.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_9.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-mxfp4.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/topk-moe.cu +259 -0
- data/ext/sources/ggml/src/ggml-cuda/topk-moe.cuh +14 -0
- data/ext/sources/ggml/src/ggml-cuda/tsembd.cu +3 -3
- data/ext/sources/ggml/src/ggml-cuda/unary.cu +90 -0
- data/ext/sources/ggml/src/ggml-cuda/unary.cuh +8 -0
- data/ext/sources/ggml/src/ggml-cuda/upscale.cu +92 -6
- data/ext/sources/ggml/src/ggml-cuda/vecdotq.cuh +110 -22
- data/ext/sources/ggml/src/ggml-cuda/vendors/cuda.h +4 -0
- data/ext/sources/ggml/src/ggml-cuda/vendors/hip.h +58 -36
- data/ext/sources/ggml/src/ggml-cuda/vendors/musa.h +4 -3
- data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +10 -2
- data/ext/sources/ggml/src/ggml-impl.h +119 -9
- data/ext/sources/ggml/src/ggml-metal/CMakeLists.txt +10 -7
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-common.cpp +446 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-common.h +52 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.h +33 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.m +600 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.cpp +1376 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.h +226 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.m +1308 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +136 -63
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.cpp +3158 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.h +82 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal.cpp +718 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal.metal +2854 -1503
- data/ext/sources/ggml/src/ggml-musa/CMakeLists.txt +18 -8
- data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +18 -0
- data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +2510 -242
- data/ext/sources/ggml/src/ggml-opencl/kernels/add.cl +107 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/add_id.cl +42 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/conv2d.cl +185 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/conv2d_f16_f32.cl +176 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/cvt.cl +84 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/div.cl +66 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/flash_attn_f16.cl +370 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/flash_attn_f32.cl +370 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/flash_attn_f32_f16.cl +373 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gelu.cl +27 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/glu.cl +177 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/group_norm.cl +49 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/im2col_f16.cl +1 -1
- data/ext/sources/ggml/src/ggml-opencl/kernels/im2col_f32.cl +1 -1
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul.cl +73 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mat_f16_f32.cl +130 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_l4_lm.cl +132 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_f32_f32_l4_lm.cl +133 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32.cl +189 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32_flat.cl +176 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32.cl +140 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32_flat.cl +222 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32.cl +144 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32_flat.cl +167 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32.cl +125 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32_flat.cl +202 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/norm.cl +80 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/rms_norm.cl +79 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/scale.cl +3 -2
- data/ext/sources/ggml/src/ggml-opencl/kernels/set_rows.cl +189 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_4_f16.cl +34 -13
- data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_4_f32.cl +34 -13
- data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_f16.cl +34 -13
- data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_f32.cl +34 -13
- data/ext/sources/ggml/src/ggml-opencl/kernels/sub.cl +66 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/transpose.cl +20 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/tsembd.cl +2 -2
- data/ext/sources/ggml/src/ggml-opencl/kernels/upscale.cl +2 -3
- data/ext/sources/ggml/src/ggml-opt.cpp +97 -41
- data/ext/sources/ggml/src/ggml-quants.c +111 -16
- data/ext/sources/ggml/src/ggml-quants.h +6 -0
- data/ext/sources/ggml/src/ggml-rpc/ggml-rpc.cpp +67 -47
- data/ext/sources/ggml/src/ggml-sycl/backend.hpp +2 -0
- data/ext/sources/ggml/src/ggml-sycl/binbcast.cpp +15 -5
- data/ext/sources/ggml/src/ggml-sycl/binbcast.hpp +6 -0
- data/ext/sources/ggml/src/ggml-sycl/concat.cpp +25 -16
- data/ext/sources/ggml/src/ggml-sycl/conv.cpp +10 -4
- data/ext/sources/ggml/src/ggml-sycl/convert.cpp +166 -99
- data/ext/sources/ggml/src/ggml-sycl/cpy.cpp +72 -306
- data/ext/sources/ggml/src/ggml-sycl/cpy.hpp +213 -1
- data/ext/sources/ggml/src/ggml-sycl/dmmv.cpp +67 -49
- data/ext/sources/ggml/src/ggml-sycl/dpct/helper.hpp +1 -31
- data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +79 -29
- data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +2 -0
- data/ext/sources/ggml/src/ggml-sycl/gemm.hpp +14 -26
- data/ext/sources/ggml/src/ggml-sycl/getrows.cpp +9 -6
- data/ext/sources/ggml/src/ggml-sycl/ggml-sycl.cpp +328 -323
- data/ext/sources/ggml/src/ggml-sycl/gla.cpp +2 -2
- data/ext/sources/ggml/src/ggml-sycl/im2col.cpp +2 -2
- data/ext/sources/ggml/src/ggml-sycl/mmq.cpp +80 -60
- data/ext/sources/ggml/src/ggml-sycl/mmvq.cpp +201 -132
- data/ext/sources/ggml/src/ggml-sycl/norm.cpp +74 -55
- data/ext/sources/ggml/src/ggml-sycl/quantize.hpp +133 -0
- data/ext/sources/ggml/src/ggml-sycl/quants.hpp +8 -9
- data/ext/sources/ggml/src/ggml-sycl/rope.cpp +35 -42
- data/ext/sources/ggml/src/ggml-sycl/set_rows.cpp +234 -0
- data/ext/sources/ggml/src/ggml-sycl/set_rows.hpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/softmax.cpp +3 -3
- data/ext/sources/ggml/src/ggml-sycl/tsembd.cpp +12 -6
- data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +2 -6
- data/ext/sources/ggml/src/ggml-sycl/wkv.cpp +16 -12
- data/ext/sources/ggml/src/ggml-vulkan/ggml-vulkan.cpp +3492 -883
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add.comp +41 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add_id.comp +42 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argmax.comp +13 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp +39 -29
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +349 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +66 -12
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.comp +154 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.comp +21 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_s.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xxs.comp +2 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_s.comp +6 -5
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_xxs.comp +4 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_mxfp4.comp +32 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q2_k.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q3_k.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_k.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_k.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q6_k.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/exp.comp +21 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +69 -24
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp +60 -20
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +98 -42
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +64 -27
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +74 -13
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu_erf.comp +27 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu_quick.comp +11 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gelu_erf.comp +39 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/generic_binary_head.comp +4 -17
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/get_rows.comp +19 -10
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/get_rows_quant.comp +25 -15
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.comp +4 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/hardsigmoid.comp +22 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/hardswish.comp +22 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +18 -14
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col_3d.comp +126 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.comp +65 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_nc.comp +11 -7
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp +140 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +144 -531
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +206 -38
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.comp +556 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +12 -5
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.comp +15 -9
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp +111 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_sgd.comp +22 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/pad.comp +24 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/quantize_q8_1.comp +53 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +55 -11
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_partials.comp +65 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/roll.comp +46 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.comp +1 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +7 -9
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +7 -9
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +7 -9
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rte.comp +5 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +29 -7
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_back.comp +4 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sqrt.comp +17 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.comp +38 -5
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/swiglu_oai.comp +14 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/timestep_embedding.comp +4 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/types.comp +101 -9
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +69 -5
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/utils.comp +25 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +335 -77
- data/ext/sources/ggml/src/ggml-webgpu/CMakeLists.txt +54 -0
- data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu.cpp +1558 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/add.tmpl.wgsl +44 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/add_in_place.tmpl.wgsl +41 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary_head.tmpl +45 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +930 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl +60 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +124 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/get_rows.tmpl.wgsl +874 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl +40 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul.tmpl.wgsl +44 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_in_place.tmpl.wgsl +41 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl +907 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm.wgsl +57 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm_in_place.wgsl +48 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +81 -0
- data/ext/sources/ggml/src/ggml-zdnn/CMakeLists.txt +36 -0
- data/ext/sources/ggml/src/ggml-zdnn/common.hpp +59 -0
- data/ext/sources/ggml/src/ggml-zdnn/ggml-zdnn.cpp +628 -0
- data/ext/sources/ggml/src/ggml-zdnn/mmf.cpp +80 -0
- data/ext/sources/ggml/src/ggml-zdnn/mmf.hpp +12 -0
- data/ext/sources/ggml/src/ggml-zdnn/utils.cpp +79 -0
- data/ext/sources/ggml/src/ggml-zdnn/utils.hpp +19 -0
- data/ext/sources/ggml/src/ggml.c +478 -98
- data/ext/sources/ggml/src/gguf.cpp +8 -1
- data/ext/sources/src/whisper.cpp +23 -46
- data/ext/sources/tests/CMakeLists.txt +8 -1
- data/ext/sources/tests/test-vad-full.cpp +3 -3
- data/ext/sources/tests/test-vad.cpp +2 -2
- data/lib/whisper/model/uri.rb +1 -1
- data/sig/whisper.rbs +7 -0
- data/test/test_params.rb +8 -0
- data/test/test_whisper.rb +1 -1
- data/whispercpp.gemspec +1 -1
- metadata +164 -157
- data/ext/sources/examples/talk-llama/llama-kv-cache-unified-iswa.cpp +0 -279
- data/ext/sources/examples/talk-llama/llama-kv-cache-unified.cpp +0 -1841
- data/ext/sources/examples/talk-llama/llama-kv-cache-unified.h +0 -303
- data/ext/sources/ggml/include/ggml-kompute.h +0 -50
- data/ext/sources/ggml/src/ggml-amx/CMakeLists.txt +0 -107
- data/ext/sources/ggml/src/ggml-amx/common.h +0 -94
- data/ext/sources/ggml/src/ggml-amx/ggml-amx.cpp +0 -446
- data/ext/sources/ggml/src/ggml-amx/mmq.cpp +0 -2510
- data/ext/sources/ggml/src/ggml-amx/mmq.h +0 -17
- data/ext/sources/ggml/src/ggml-cann/kernels/CMakeLists.txt +0 -30
- data/ext/sources/ggml/src/ggml-cann/kernels/ascendc_kernels.h +0 -19
- data/ext/sources/ggml/src/ggml-cann/kernels/dup.cpp +0 -234
- data/ext/sources/ggml/src/ggml-cann/kernels/get_row_f16.cpp +0 -197
- data/ext/sources/ggml/src/ggml-cann/kernels/get_row_f32.cpp +0 -190
- data/ext/sources/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -204
- data/ext/sources/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -191
- data/ext/sources/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -218
- data/ext/sources/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -216
- data/ext/sources/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -295
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f16.cu +0 -357
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f16.cuh +0 -3
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f32.cu +0 -365
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f32.cuh +0 -3
- data/ext/sources/ggml/src/ggml-cuda/fattn-vec-f16.cuh +0 -482
- data/ext/sources/ggml/src/ggml-cuda/fattn-vec-f32.cuh +0 -472
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-kompute/CMakeLists.txt +0 -166
- data/ext/sources/ggml/src/ggml-kompute/ggml-kompute.cpp +0 -2251
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/common.comp +0 -112
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_add.comp +0 -58
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_addrow.comp +0 -25
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f16.comp +0 -52
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f32.comp +0 -52
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f16.comp +0 -52
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f32.comp +0 -52
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_diagmask.comp +0 -30
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_gelu.comp +0 -22
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows.comp +0 -17
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f16.comp +0 -31
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f32.comp +0 -31
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_0.comp +0 -38
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_1.comp +0 -39
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q6_k.comp +0 -44
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul.comp +0 -52
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_f16.comp +0 -69
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_mat_f32.comp +0 -51
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_0.comp +0 -33
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_1.comp +0 -35
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_k.comp +0 -140
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q6_k.comp +0 -106
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q8_0.comp +0 -73
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n.comp +0 -52
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n_pre.comp +0 -28
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_norm.comp +0 -84
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_relu.comp +0 -21
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rmsnorm.comp +0 -53
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f16.comp +0 -52
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f32.comp +0 -52
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f16.comp +0 -52
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f32.comp +0 -52
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_scale.comp +0 -19
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_scale_8.comp +0 -23
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_silu.comp +0 -22
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_softmax.comp +0 -72
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/rope_common.comp +0 -71
- data/ext/sources/ggml/src/ggml-metal/ggml-metal.m +0 -6280
data/ext/sources/examples/talk-llama/llama-vocab.h

@@ -6,6 +6,50 @@
 #include <vector>
 #include <memory>
 
+// pre-tokenization types
+enum llama_vocab_pre_type {
+    LLAMA_VOCAB_PRE_TYPE_DEFAULT        = 0,
+    LLAMA_VOCAB_PRE_TYPE_LLAMA3         = 1,
+    LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM   = 2,
+    LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER = 3,
+    LLAMA_VOCAB_PRE_TYPE_FALCON         = 4,
+    LLAMA_VOCAB_PRE_TYPE_MPT            = 5,
+    LLAMA_VOCAB_PRE_TYPE_STARCODER      = 6,
+    LLAMA_VOCAB_PRE_TYPE_GPT2           = 7,
+    LLAMA_VOCAB_PRE_TYPE_REFACT         = 8,
+    LLAMA_VOCAB_PRE_TYPE_COMMAND_R      = 9,
+    LLAMA_VOCAB_PRE_TYPE_STABLELM2      = 10,
+    LLAMA_VOCAB_PRE_TYPE_QWEN2          = 11,
+    LLAMA_VOCAB_PRE_TYPE_OLMO           = 12,
+    LLAMA_VOCAB_PRE_TYPE_DBRX           = 13,
+    LLAMA_VOCAB_PRE_TYPE_SMAUG          = 14,
+    LLAMA_VOCAB_PRE_TYPE_PORO           = 15,
+    LLAMA_VOCAB_PRE_TYPE_CHATGLM3       = 16,
+    LLAMA_VOCAB_PRE_TYPE_CHATGLM4       = 17,
+    LLAMA_VOCAB_PRE_TYPE_VIKING         = 18,
+    LLAMA_VOCAB_PRE_TYPE_JAIS           = 19,
+    LLAMA_VOCAB_PRE_TYPE_TEKKEN         = 20,
+    LLAMA_VOCAB_PRE_TYPE_SMOLLM         = 21,
+    LLAMA_VOCAB_PRE_TYPE_CODESHELL      = 22,
+    LLAMA_VOCAB_PRE_TYPE_BLOOM          = 23,
+    LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH   = 24,
+    LLAMA_VOCAB_PRE_TYPE_EXAONE         = 25,
+    LLAMA_VOCAB_PRE_TYPE_CHAMELEON      = 26,
+    LLAMA_VOCAB_PRE_TYPE_MINERVA        = 27,
+    LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM  = 28,
+    LLAMA_VOCAB_PRE_TYPE_GPT4O          = 29,
+    LLAMA_VOCAB_PRE_TYPE_SUPERBPE       = 30,
+    LLAMA_VOCAB_PRE_TYPE_TRILLION       = 31,
+    LLAMA_VOCAB_PRE_TYPE_BAILINGMOE     = 32,
+    LLAMA_VOCAB_PRE_TYPE_LLAMA4         = 33,
+    LLAMA_VOCAB_PRE_TYPE_PIXTRAL        = 34,
+    LLAMA_VOCAB_PRE_TYPE_SEED_CODER     = 35,
+    LLAMA_VOCAB_PRE_TYPE_HUNYUAN        = 36,
+    LLAMA_VOCAB_PRE_TYPE_KIMI_K2        = 37,
+    LLAMA_VOCAB_PRE_TYPE_HUNYUAN_DENSE  = 38,
+    LLAMA_VOCAB_PRE_TYPE_GROK_2         = 39,
+};
+
 struct LLM_KV;
 struct llama_model_loader;
 
@@ -59,6 +103,7 @@ struct llama_vocab {
     llama_token token_sep() const;
     llama_token token_nl () const;
     llama_token token_pad() const;
+    llama_token token_mask() const;
 
     llama_token token_prefix() const;
     llama_token token_middle() const;
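
Note: the new token_mask() accessor mirrors the existing token_sep()/token_nl()/token_pad() getters on struct llama_vocab. A minimal sketch of a call site (hypothetical, since this is an internal C++ API; assumes a llama_vocab instance named vocab and the usual LLAMA_TOKEN_NULL sentinel):

    // token_mask() returns the vocabulary's mask token id (e.g. [MASK] in
    // BERT-style WPM vocabularies); LLAMA_TOKEN_NULL means the model has none
    const llama_token mask_id = vocab.token_mask();
    if (mask_id != LLAMA_TOKEN_NULL) {
        // the model defines a mask token, so MLM-style infilling is possible
    }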
data/ext/sources/examples/talk-llama/llama.cpp

@@ -25,6 +25,18 @@
 // interface implementation
 //
 
+const char * llama_flash_attn_type_name(enum llama_flash_attn_type flash_attn_type) {
+    switch (flash_attn_type) {
+        case LLAMA_FLASH_ATTN_TYPE_AUTO:
+            return "auto";
+        case LLAMA_FLASH_ATTN_TYPE_DISABLED:
+            return "disabled";
+        case LLAMA_FLASH_ATTN_TYPE_ENABLED:
+            return "enabled";
+    }
+    GGML_ABORT("fatal error");
+}
+
 struct llama_sampler_chain_params llama_sampler_chain_default_params() {
     struct llama_sampler_chain_params result = {
         /*.no_perf =*/ true,

@@ -47,6 +59,7 @@ bool llama_supports_mlock(void) {
 
 bool llama_supports_gpu_offload(void) {
     return ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU) != nullptr ||
+           ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_IGPU) != nullptr ||
            llama_supports_rpc();
 }
 

@@ -71,7 +84,9 @@ void llama_numa_init(enum ggml_numa_strategy numa) {
         GGML_ASSERT(dev && "CPU backend is not loaded");
         auto * reg = ggml_backend_dev_backend_reg(dev);
         auto * numa_init_fn = (decltype(ggml_numa_init) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_numa_init");
-        numa_init_fn(numa);
+        if (numa_init_fn) {
+            numa_init_fn(numa);
+        }
     }
 }
 

@@ -170,8 +185,13 @@ static struct llama_model * llama_model_load_from_file_impl(
             model->devices.push_back(*dev);
         }
     } else {
+        // default device selection
+
+        // build list of available devices
+        std::vector<ggml_backend_dev_t> gpus;
+        std::vector<ggml_backend_dev_t> igpus;
         std::vector<ggml_backend_dev_t> rpc_servers;
-
+
         for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
             ggml_backend_dev_t dev = ggml_backend_dev_get(i);
             switch (ggml_backend_dev_type(dev)) {

@@ -180,19 +200,51 @@
                     // skip CPU backends since they are handled separately
                     break;
 
-                case GGML_BACKEND_DEVICE_TYPE_GPU:
+                case GGML_BACKEND_DEVICE_TYPE_GPU: {
                     ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
                     if (ggml_backend_reg_name(reg) == std::string("RPC")) {
                         rpc_servers.push_back(dev);
                     } else {
-                        model->devices.push_back(dev);
+                        // check if there is already a GPU with the same device id
+                        ggml_backend_dev_props props;
+                        ggml_backend_dev_get_props(dev, &props);
+                        auto it = std::find_if(gpus.begin(), gpus.end(), [&props](ggml_backend_dev_t d) {
+                            ggml_backend_dev_props d_props;
+                            ggml_backend_dev_get_props(d, &d_props);
+                            if (props.device_id && d_props.device_id) {
+                                return strcmp(props.device_id, d_props.device_id) == 0;
+                            }
+                            return false;
+                        });
+
+                        if (it != gpus.end()) {
+                            LLAMA_LOG_INFO("%s: skipping device %s (%s) with id %s - already using device %s (%s) with the same id\n",
+                                           __func__,
+                                           ggml_backend_dev_name(dev), ggml_backend_dev_description(dev),
+                                           props.device_id ? props.device_id : "unknown id",
+                                           ggml_backend_dev_name(*it), ggml_backend_dev_description(*it));
+                        } else {
+                            gpus.push_back(dev);
+                        }
                     }
                     break;
+                }
+
+                case GGML_BACKEND_DEVICE_TYPE_IGPU:
+                    igpus.push_back(dev);
+                    break;
             }
         }
-
-        // add RPC servers at the front of the list
-        model->devices.insert(model->devices.begin(), rpc_servers.begin(), rpc_servers.end());
+
+        // add RPC servers at the front of the list to minimize network transfers
+        model->devices.insert(model->devices.begin(), rpc_servers.begin(), rpc_servers.end());
+
+        // add GPUs
+        model->devices.insert(model->devices.end(), gpus.begin(), gpus.end());
+
+        // add integrated GPUs only if no other devices were found
+        if (model->devices.empty()) {
+            model->devices.insert(model->devices.end(), igpus.begin(), igpus.end());
         }
     }
 

@@ -213,9 +265,12 @@ static struct llama_model * llama_model_load_from_file_impl(
     }
 
     for (auto * dev : model->devices) {
-        size_t free, total; // NOLINT
-        ggml_backend_dev_memory(dev, &free, &total);
-        LLAMA_LOG_INFO("%s: using device %s (%s) - %zu MiB free\n", __func__, ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), free/1024/1024);
+        ggml_backend_dev_props props;
+        ggml_backend_dev_get_props(dev, &props);
+        LLAMA_LOG_INFO("%s: using device %s (%s) (%s) - %zu MiB free\n", __func__,
+                ggml_backend_dev_name(dev), ggml_backend_dev_description(dev),
+                props.device_id ? props.device_id : "unknown id",
+                props.memory_free/1024/1024);
     }
 
     const int status = llama_model_load(path_model, splits, *model, params);
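
Note: llama_flash_attn_type_name() shown above is also exported in the public header (see the llama.h hunks below). A minimal sketch exercising it, assuming the updated llama.h is on the include path; the expected strings follow directly from the switch in the implementation:

    #include <cstdio>
    #include "llama.h"

    int main() {
        const llama_flash_attn_type types[] = {
            LLAMA_FLASH_ATTN_TYPE_AUTO,     // -1
            LLAMA_FLASH_ATTN_TYPE_DISABLED, //  0
            LLAMA_FLASH_ATTN_TYPE_ENABLED,  //  1
        };
        for (auto t : types) {
            // prints: auto, disabled, enabled
            std::printf("%d -> %s\n", (int) t, llama_flash_attn_type_name(t));
        }
        return 0;
    }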
@@ -64,59 +64,18 @@ extern "C" {
|
|
64
64
|
|
65
65
|
typedef struct llama_memory_i * llama_memory_t;
|
66
66
|
|
67
|
-
struct llama_kv_cache; // DEPRECATED (use llama_memory instead)
|
68
|
-
|
69
67
|
typedef int32_t llama_pos;
|
70
68
|
typedef int32_t llama_token;
|
71
69
|
typedef int32_t llama_seq_id;
|
72
70
|
|
73
71
|
enum llama_vocab_type {
|
74
|
-
LLAMA_VOCAB_TYPE_NONE
|
75
|
-
LLAMA_VOCAB_TYPE_SPM
|
76
|
-
LLAMA_VOCAB_TYPE_BPE
|
77
|
-
LLAMA_VOCAB_TYPE_WPM
|
78
|
-
LLAMA_VOCAB_TYPE_UGM
|
79
|
-
LLAMA_VOCAB_TYPE_RWKV
|
80
|
-
|
81
|
-
|
82
|
-
// pre-tokenization types
|
83
|
-
enum llama_vocab_pre_type {
|
84
|
-
LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0,
|
85
|
-
LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1,
|
86
|
-
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM = 2,
|
87
|
-
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER = 3,
|
88
|
-
LLAMA_VOCAB_PRE_TYPE_FALCON = 4,
|
89
|
-
LLAMA_VOCAB_PRE_TYPE_MPT = 5,
|
90
|
-
LLAMA_VOCAB_PRE_TYPE_STARCODER = 6,
|
91
|
-
LLAMA_VOCAB_PRE_TYPE_GPT2 = 7,
|
92
|
-
LLAMA_VOCAB_PRE_TYPE_REFACT = 8,
|
93
|
-
LLAMA_VOCAB_PRE_TYPE_COMMAND_R = 9,
|
94
|
-
LLAMA_VOCAB_PRE_TYPE_STABLELM2 = 10,
|
95
|
-
LLAMA_VOCAB_PRE_TYPE_QWEN2 = 11,
|
96
|
-
LLAMA_VOCAB_PRE_TYPE_OLMO = 12,
|
97
|
-
LLAMA_VOCAB_PRE_TYPE_DBRX = 13,
|
98
|
-
LLAMA_VOCAB_PRE_TYPE_SMAUG = 14,
|
99
|
-
LLAMA_VOCAB_PRE_TYPE_PORO = 15,
|
100
|
-
LLAMA_VOCAB_PRE_TYPE_CHATGLM3 = 16,
|
101
|
-
LLAMA_VOCAB_PRE_TYPE_CHATGLM4 = 17,
|
102
|
-
LLAMA_VOCAB_PRE_TYPE_VIKING = 18,
|
103
|
-
LLAMA_VOCAB_PRE_TYPE_JAIS = 19,
|
104
|
-
LLAMA_VOCAB_PRE_TYPE_TEKKEN = 20,
|
105
|
-
LLAMA_VOCAB_PRE_TYPE_SMOLLM = 21,
|
106
|
-
LLAMA_VOCAB_PRE_TYPE_CODESHELL = 22,
|
107
|
-
LLAMA_VOCAB_PRE_TYPE_BLOOM = 23,
|
108
|
-
LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH = 24,
|
109
|
-
LLAMA_VOCAB_PRE_TYPE_EXAONE = 25,
|
110
|
-
LLAMA_VOCAB_PRE_TYPE_CHAMELEON = 26,
|
111
|
-
LLAMA_VOCAB_PRE_TYPE_MINERVA = 27,
|
112
|
-
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM = 28,
|
113
|
-
LLAMA_VOCAB_PRE_TYPE_GPT4O = 29,
|
114
|
-
LLAMA_VOCAB_PRE_TYPE_SUPERBPE = 30,
|
115
|
-
LLAMA_VOCAB_PRE_TYPE_TRILLION = 31,
|
116
|
-
LLAMA_VOCAB_PRE_TYPE_BAILINGMOE = 32,
|
117
|
-
LLAMA_VOCAB_PRE_TYPE_LLAMA4 = 33,
|
118
|
-
LLAMA_VOCAB_PRE_TYPE_PIXTRAL = 34,
|
119
|
-
LLAMA_VOCAB_PRE_TYPE_SEED_CODER = 35,
|
72
|
+
LLAMA_VOCAB_TYPE_NONE = 0, // For models without vocab
|
73
|
+
LLAMA_VOCAB_TYPE_SPM = 1, // LLaMA tokenizer based on byte-level BPE with byte fallback
|
74
|
+
LLAMA_VOCAB_TYPE_BPE = 2, // GPT-2 tokenizer based on byte-level BPE
|
75
|
+
LLAMA_VOCAB_TYPE_WPM = 3, // BERT tokenizer based on WordPiece
|
76
|
+
LLAMA_VOCAB_TYPE_UGM = 4, // T5 tokenizer based on Unigram
|
77
|
+
LLAMA_VOCAB_TYPE_RWKV = 5, // RWKV tokenizer based on greedy tokenization
|
78
|
+
LLAMA_VOCAB_TYPE_PLAMO2 = 6, // PLaMo-2 tokenizer based on Aho-Corasick with dynamic programming
|
120
79
|
};
|
121
80
|
|
122
81
|
enum llama_rope_type {
|
@@ -191,6 +150,7 @@ extern "C" {
|
|
191
150
|
//LLAMA_FTYPE_MOSTLY_Q4_0_8_8 = 35, // removed from gguf files, use Q4_0 and runtime repack
|
192
151
|
LLAMA_FTYPE_MOSTLY_TQ1_0 = 36, // except 1d tensors
|
193
152
|
LLAMA_FTYPE_MOSTLY_TQ2_0 = 37, // except 1d tensors
|
153
|
+
LLAMA_FTYPE_MOSTLY_MXFP4_MOE = 38, // except 1d tensors
|
194
154
|
|
195
155
|
LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
|
196
156
|
};
|
@@ -219,6 +179,14 @@ extern "C" {
|
|
219
179
|
LLAMA_ATTENTION_TYPE_NON_CAUSAL = 1,
|
220
180
|
};
|
221
181
|
|
182
|
+
enum llama_flash_attn_type {
|
183
|
+
LLAMA_FLASH_ATTN_TYPE_AUTO = -1,
|
184
|
+
LLAMA_FLASH_ATTN_TYPE_DISABLED = 0,
|
185
|
+
LLAMA_FLASH_ATTN_TYPE_ENABLED = 1,
|
186
|
+
};
|
187
|
+
|
188
|
+
LLAMA_API const char * llama_flash_attn_type_name(enum llama_flash_attn_type flash_attn_type);
|
189
|
+
|
222
190
|
enum llama_split_mode {
|
223
191
|
LLAMA_SPLIT_MODE_NONE = 0, // single GPU
|
224
192
|
LLAMA_SPLIT_MODE_LAYER = 1, // split layers and KV across GPUs
|
@@ -238,7 +206,7 @@ extern "C" {
|
|
238
206
|
llama_token_data * data;
|
239
207
|
size_t size;
|
240
208
|
int64_t selected; // this is the index in the data array (i.e. not the token id)
|
241
|
-
bool sorted;
|
209
|
+
bool sorted; // note: do not assume the data is sorted - always check this flag
|
242
210
|
} llama_token_data_array;
|
243
211
|
|
244
212
|
typedef bool (*llama_progress_callback)(float progress, void * user_data);
|
@@ -323,10 +291,11 @@ extern "C" {
|
|
323
291
|
const struct llama_model_kv_override * kv_overrides;
|
324
292
|
|
325
293
|
// Keep the booleans together to avoid misalignment during copy-by-value.
|
326
|
-
bool vocab_only;
|
327
|
-
bool use_mmap;
|
328
|
-
bool use_mlock;
|
329
|
-
bool check_tensors;
|
294
|
+
bool vocab_only; // only load the vocabulary, no weights
|
295
|
+
bool use_mmap; // use mmap if possible
|
296
|
+
bool use_mlock; // force system to keep model in RAM
|
297
|
+
bool check_tensors; // validate model tensor data
|
298
|
+
bool use_extra_bufts; // use extra buffer types (used for weight repacking)
|
330
299
|
};
|
331
300
|
|
332
301
|
// NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations
|
@@ -342,6 +311,7 @@ extern "C" {
|
|
342
311
|
enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
|
343
312
|
enum llama_pooling_type pooling_type; // whether to pool (sum) embedding results by sequence id
|
344
313
|
enum llama_attention_type attention_type; // attention type to use for embeddings
|
314
|
+
enum llama_flash_attn_type flash_attn_type; // when to enable Flash Attention
|
345
315
|
|
346
316
|
// ref: https://github.com/ggml-org/llama.cpp/pull/2054
|
347
317
|
float rope_freq_base; // RoPE base frequency, 0 = from model
|
@@ -351,7 +321,7 @@ extern "C" {
         float yarn_beta_fast;   // YaRN low correction dim
         float yarn_beta_slow;   // YaRN high correction dim
         uint32_t yarn_orig_ctx; // YaRN original context size
-        float defrag_thold;     // defragment the KV cache if holes/size > thold, <= 0 disabled (default)
+        float defrag_thold;     // [DEPRECATED] defragment the KV cache if holes/size > thold, <= 0 disabled (default)

         ggml_backend_sched_eval_callback cb_eval;
         void * cb_eval_user_data;
@@ -368,12 +338,14 @@ extern "C" {
         // Keep the booleans together and at the end of the struct to avoid misalignment during copy-by-value.
         bool embeddings;  // if true, extract embeddings (together with logits)
         bool offload_kqv; // offload the KQV ops (including the KV cache) to GPU
-        bool flash_attn;  // use flash attention [EXPERIMENTAL]
         bool no_perf;     // measure performance timings
         bool op_offload;  // offload host tensor operations to device
         bool swa_full;    // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
                           // NOTE: setting to false when n_seq_max > 1 can cause bad performance in some cases
                           // ref: https://github.com/ggml-org/llama.cpp/pull/13845#issuecomment-2924800573
+        bool kv_unified;  // use a unified buffer across the input sequences when computing the attention
+                          // try to disable when n_seq_max > 1 for improved performance when the sequences do not share a large prefix
+                          // ref: https://github.com/ggml-org/llama.cpp/pull/14363
     };

     // model quantization parameters
@@ -503,8 +475,6 @@ extern "C" {
     LLAMA_API llama_memory_t llama_get_memory (const struct llama_context * ctx);
     LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx); // TODO: rename to llama_get_pooling_type

-    DEPRECATED(LLAMA_API struct llama_kv_cache * llama_get_kv_self(struct llama_context * ctx), "use llama_get_memory instead");
-
     LLAMA_API const struct llama_vocab * llama_model_get_vocab(const struct llama_model * model);
     LLAMA_API enum llama_rope_type llama_model_rope_type(const struct llama_model * model);

@@ -573,6 +543,9 @@ extern "C" {
     // Returns true if the model is recurrent (like Mamba, RWKV, etc.)
     LLAMA_API bool llama_model_is_recurrent(const struct llama_model * model);

+    // Returns true if the model is diffusion-based (like LLaDA, Dream, etc.)
+    LLAMA_API bool llama_model_is_diffusion(const struct llama_model * model);
+
     // Returns 0 on success
     LLAMA_API uint32_t llama_model_quantize(
             const char * fname_inp,
@@ -588,10 +561,32 @@ extern "C" {
             struct llama_model * model,
             const char * path_lora);

+    // Functions to access the adapter's GGUF metadata scalar values
+    // - The functions return the length of the string on success, or -1 on failure
+    // - The output string is always null-terminated and cleared on failure
+    // - When retrieving a string, an extra byte must be allocated to account for the null terminator
+    // - GGUF array values are not supported by these functions
+
+    // Get metadata value as a string by key name
+    LLAMA_API int32_t llama_adapter_meta_val_str(const struct llama_adapter_lora * adapter, const char * key, char * buf, size_t buf_size);
+
+    // Get the number of metadata key/value pairs
+    LLAMA_API int32_t llama_adapter_meta_count(const struct llama_adapter_lora * adapter);
+
+    // Get metadata key name by index
+    LLAMA_API int32_t llama_adapter_meta_key_by_index(const struct llama_adapter_lora * adapter, int32_t i, char * buf, size_t buf_size);
+
+    // Get metadata value as a string by index
+    LLAMA_API int32_t llama_adapter_meta_val_str_by_index(const struct llama_adapter_lora * adapter, int32_t i, char * buf, size_t buf_size);
+
     // Manually free a LoRA adapter
     // Note: loaded adapters will be free when the associated model is deleted
     LLAMA_API void llama_adapter_lora_free(struct llama_adapter_lora * adapter);

+    // Get the invocation tokens if the current lora is an alora
+    LLAMA_API uint64_t llama_adapter_get_alora_n_invocation_tokens(const struct llama_adapter_lora * adapter);
+    LLAMA_API const llama_token * llama_adapter_get_alora_invocation_tokens  (const struct llama_adapter_lora * adapter);
+
     // The following functions operate on a llama_context, hence the naming: llama_verb_...

     // Add a loaded LoRA adapter to given context
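A sketch of walking the adapter metadata with the new functions, following the buffer rules in the comment block. The fixed 256-byte buffers are an arbitrary choice (the return value is the full length, so callers can resize and retry), and the adapter handle is assumed to come from `llama_adapter_lora_init()`, which already exists in this header:

```cpp
#include "llama.h"
#include <cstdio>

// Sketch: dump all GGUF metadata key/value pairs of a loaded LoRA adapter.
static void dump_adapter_meta(const struct llama_adapter_lora * adapter) {
    const int32_t n = llama_adapter_meta_count(adapter);
    for (int32_t i = 0; i < n; ++i) {
        char key[256];
        char val[256];
        if (llama_adapter_meta_key_by_index    (adapter, i, key, sizeof(key)) < 0 ||
            llama_adapter_meta_val_str_by_index(adapter, i, val, sizeof(val)) < 0) {
            continue; // -1 signals failure; the output buffers are cleared in that case
        }
        printf("%s = %s\n", key, val);
    }
}
```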
@@ -698,111 +693,6 @@ extern "C" {
     // Check if the memory supports shifting
     LLAMA_API bool llama_memory_can_shift(llama_memory_t mem);

-    //
-    // KV cache for self-attention (TODO: deprecate in favor of llama_memory)
-    //
-
-    // Returns the number of tokens in the KV cache (slow, use only for debug)
-    // If a KV cell has multiple sequences assigned to it, it will be counted multiple times
-    DEPRECATED(LLAMA_API int32_t llama_kv_self_n_tokens(const struct llama_context * ctx),
-        "Use llama_kv_self_seq_pos_max() and llama_kv_self_seq_pos_min() instead (https://github.com/ggml-org/llama.cpp/issues/13793)");
-
-    // Returns the number of used KV cells (i.e. have at least one sequence assigned to them)
-    DEPRECATED(LLAMA_API int32_t llama_kv_self_used_cells(const struct llama_context * ctx),
-        "Use llama_kv_self_seq_pos_max() and llama_kv_self_seq_pos_min() instead (https://github.com/ggml-org/llama.cpp/issues/13793)");
-
-    // Clear the KV cache - both cell info is erased and KV data is zeroed
-    DEPRECATED(LLAMA_API void llama_kv_self_clear(
-            struct llama_context * ctx),
-        "Use llama_memory_clear() instead");
-
-    // Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
-    // Returns false if a partial sequence cannot be removed. Removing a whole sequence never fails
-    //   seq_id < 0 : match any sequence
-    //   p0 < 0     : [0,  p1]
-    //   p1 < 0     : [p0, inf)
-    DEPRECATED(LLAMA_API bool llama_kv_self_seq_rm(
-            struct llama_context * ctx,
-            llama_seq_id seq_id,
-            llama_pos p0,
-            llama_pos p1),
-        "Use llama_memory_seq_rm() instead");
-
-    // Copy all tokens that belong to the specified sequence to another sequence
-    // Note that this does not allocate extra KV cache memory - it simply assigns the tokens to the new sequence
-    //   p0 < 0 : [0,  p1]
-    //   p1 < 0 : [p0, inf)
-    DEPRECATED(LLAMA_API void llama_kv_self_seq_cp(
-            struct llama_context * ctx,
-            llama_seq_id seq_id_src,
-            llama_seq_id seq_id_dst,
-            llama_pos p0,
-            llama_pos p1),
-        "Use llama_memory_seq_cp() instead");
-
-    // Removes all tokens that do not belong to the specified sequence
-    DEPRECATED(LLAMA_API void llama_kv_self_seq_keep(
-            struct llama_context * ctx,
-            llama_seq_id seq_id),
-        "Use llama_memory_seq_keep() instead");
-
-    // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
-    // If the KV cache is RoPEd, the KV data is updated accordingly:
-    //   - lazily on next llama_decode()
-    //   p0 < 0 : [0,  p1]
-    //   p1 < 0 : [p0, inf)
-    DEPRECATED(LLAMA_API void llama_kv_self_seq_add(
-            struct llama_context * ctx,
-            llama_seq_id seq_id,
-            llama_pos p0,
-            llama_pos p1,
-            llama_pos delta),
-        "Use llama_memory_seq_add() instead");
-
-    // Integer division of the positions by factor of `d > 1`
-    // If the KV cache is RoPEd, the KV data is updated accordingly:
-    //   - lazily on next llama_decode()
-    //   p0 < 0 : [0,  p1]
-    //   p1 < 0 : [p0, inf)
-    DEPRECATED(void llama_kv_self_seq_div(
-            struct llama_context * ctx,
-            llama_seq_id seq_id,
-            llama_pos p0,
-            llama_pos p1,
-            int d),
-        "Use llama_memory_seq_div() instead");
-
-    // Returns the smallest position present in the KV cache for the specified sequence
-    // This is typically non-zero only for SWA caches
-    // Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the KV cache
-    // Return -1 if the sequence is empty
-    DEPRECATED(LLAMA_API llama_pos llama_kv_self_seq_pos_min(
-            struct llama_context * ctx,
-            llama_seq_id seq_id),
-        "Use llama_memory_seq_pos_min() instead");
-
-    // Returns the largest position present in the KV cache for the specified sequence
-    // Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the KV cache
-    // Return -1 if the sequence is empty
-    DEPRECATED(LLAMA_API llama_pos llama_kv_self_seq_pos_max(
-            struct llama_context * ctx,
-            llama_seq_id seq_id),
-        "Use llama_memory_seq_pos_max() instead");
-
-    // Defragment the KV cache
-    // This will be applied:
-    //   - lazily on next llama_decode()
-    DEPRECATED(LLAMA_API void llama_kv_self_defrag(struct llama_context * ctx),
-        "simply remove this call, the context will automatically decide when to do a defragmentation based on 'defrag_thold'");
-
-    // Check if the context supports KV cache shifting
-    DEPRECATED(LLAMA_API bool llama_kv_self_can_shift(const struct llama_context * ctx),
-        "use llama_memory_can_shift() instead");
-
-    // Apply the KV cache updates (such as K-shifts, defragmentation, etc.)
-    DEPRECATED(LLAMA_API void llama_kv_self_update(struct llama_context * ctx),
-        "simply remove this call, updates are applied lazily on the next llama_decode()");
-
     //
     // State / sessions
     //
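Callers still on the removed `llama_kv_self_*` entry points now have to go through `llama_get_memory()`. A migration sketch, assuming the `llama_memory_*` signatures match upstream llama.cpp (in particular the extra `data` flag on `llama_memory_clear()`):

```cpp
#include "llama.h"

// Sketch: the llama_kv_self_* calls removed above, rewritten against the
// llama_memory_* API that replaces them.
static void prune_sequence(struct llama_context * ctx, llama_seq_id seq_id, llama_pos keep_up_to) {
    llama_memory_t mem = llama_get_memory(ctx);

    // was: llama_kv_self_seq_rm(ctx, seq_id, keep_up_to, -1);
    // drops everything from position keep_up_to onwards (p1 < 0 means [p0, inf))
    llama_memory_seq_rm(mem, seq_id, keep_up_to, -1);

    // was: llama_kv_self_seq_pos_max(ctx, seq_id);
    const llama_pos pos_max = llama_memory_seq_pos_max(mem, seq_id);
    (void) pos_max;

    // was: llama_kv_self_clear(ctx); the bool also zeroes the KV data
    // (assumed to match the upstream llama_memory_clear(mem, data) signature)
    llama_memory_clear(mem, true);
}
```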
@@ -901,6 +791,29 @@ extern "C" {
             size_t n_token_capacity,
             size_t * n_token_count_out);

+#define LLAMA_STATE_SEQ_FLAGS_SWA_ONLY 1
+
+    typedef uint32_t llama_state_seq_flags;
+
+    LLAMA_API size_t llama_state_seq_get_size_ext(
+            struct llama_context * ctx,
+            llama_seq_id seq_id,
+            llama_state_seq_flags flags);
+
+    LLAMA_API size_t llama_state_seq_get_data_ext(
+            struct llama_context * ctx,
+            uint8_t * dst,
+            size_t size,
+            llama_seq_id seq_id,
+            llama_state_seq_flags flags);
+
+    LLAMA_API size_t llama_state_seq_set_data_ext(
+            struct llama_context * ctx,
+            const uint8_t * src,
+            size_t size,
+            llama_seq_id dest_seq_id,
+            llama_state_seq_flags flags);
+
     //
     // Decoding
     //
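A sketch of the save/restore round trip these `_ext` variants enable. Flags value 0 is assumed to mean a full snapshot, while `LLAMA_STATE_SEQ_FLAGS_SWA_ONLY` would restrict the copy to the SWA portion of the cache:

```cpp
#include "llama.h"
#include <vector>

// Sketch: snapshot one sequence's state and restore it into another sequence.
static bool copy_seq_state(struct llama_context * ctx, llama_seq_id src, llama_seq_id dst) {
    const llama_state_seq_flags flags = 0; // full snapshot

    const size_t n = llama_state_seq_get_size_ext(ctx, src, flags);
    if (n == 0) {
        return false; // nothing to copy
    }

    std::vector<uint8_t> buf(n);
    if (llama_state_seq_get_data_ext(ctx, buf.data(), buf.size(), src, flags) == 0) {
        return false; // a zero return is assumed to signal failure
    }

    return llama_state_seq_set_data_ext(ctx, buf.data(), buf.size(), dst, flags) != 0;
}
```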
@@ -992,6 +905,7 @@ extern "C" {
     // in the order they have appeared in the batch.
     // Rows: number of tokens for which llama_batch.logits[i] != 0
     // Cols: n_vocab
+    // TODO: deprecate in favor of llama_get_logits_ith() (ref: https://github.com/ggml-org/llama.cpp/pull/14853#issuecomment-3113143522)
     LLAMA_API float * llama_get_logits(struct llama_context * ctx);

     // Logits for the ith token. For positive indices, Equivalent to:
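Following the TODO, a sketch of the per-token accessor that is meant to replace whole-buffer reads; the negative-index convention ("-1 is the last output") follows the `llama_get_logits_ith()` comment retained below:

```cpp
#include "llama.h"

// Sketch: read one logit from the last output row instead of
// indexing into the full llama_get_logits() buffer.
static float logit_of(struct llama_context * ctx, llama_token tok) {
    const float * logits = llama_get_logits_ith(ctx, -1); // -1 = last output
    return logits[tok]; // row length is n_vocab, indexed by token id
}
```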
@@ -1006,6 +920,7 @@ extern "C" {
     // in the order they have appeared in the batch.
     // shape: [n_outputs*n_embd]
     // Otherwise, returns NULL.
+    // TODO: deprecate in favor of llama_get_embeddings_ith() (ref: https://github.com/ggml-org/llama.cpp/pull/14853#issuecomment-3113143522)
     LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);

     // Get the embeddings for the ith token. For positive indices, Equivalent to:
@@ -1044,6 +959,7 @@ extern "C" {
     LLAMA_API llama_token llama_vocab_sep (const struct llama_vocab * vocab); // sentence separator
     LLAMA_API llama_token llama_vocab_nl  (const struct llama_vocab * vocab); // next-line
     LLAMA_API llama_token llama_vocab_pad (const struct llama_vocab * vocab); // padding
+    LLAMA_API llama_token llama_vocab_mask(const struct llama_vocab * vocab); // mask

     LLAMA_API bool llama_vocab_get_add_bos(const struct llama_vocab * vocab);
     LLAMA_API bool llama_vocab_get_add_eos(const struct llama_vocab * vocab);
@@ -1244,11 +1160,6 @@ extern "C" {
     LLAMA_API struct llama_sampler * llama_sampler_init_greedy(void);
     LLAMA_API struct llama_sampler * llama_sampler_init_dist  (uint32_t seed);

-    /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
-    /// NOTE: Avoid using on the full vocabulary as the sorting can become slow. For example, apply top-k or top-p sampling first.
-    DEPRECATED(LLAMA_API struct llama_sampler * llama_sampler_init_softmax (void),
-        "will be removed in the future (see https://github.com/ggml-org/llama.cpp/pull/9896#discussion_r1800920915)");
-
     /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
     /// Setting k <= 0 makes this a noop
     LLAMA_API struct llama_sampler * llama_sampler_init_top_k (int32_t k);
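With `llama_sampler_init_softmax()` removed, chains that relied on it can let the `dist` sampler normalize the probabilities itself. A sketch using only samplers declared in this header (k = 40 and the seed handling are arbitrary choices):

```cpp
#include "llama.h"

// Sketch: a sampler chain without the removed softmax sampler.
// top-k followed by dist covers the old top-k -> softmax -> sample flow,
// since dist normalizes the remaining candidates before sampling.
static struct llama_sampler * make_sampler(uint32_t seed) {
    struct llama_sampler * chain = llama_sampler_chain_init(llama_sampler_chain_default_params());

    llama_sampler_chain_add(chain, llama_sampler_init_top_k(40));  // narrow the candidates first
    llama_sampler_chain_add(chain, llama_sampler_init_dist(seed)); // then sample from the distribution

    return chain; // free later with llama_sampler_free(chain)
}
```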
@@ -1418,23 +1329,25 @@ extern "C" {
     //
     // Performance utils
     //
-    // NOTE: Used by llama.cpp examples, avoid using in third-party apps. Instead, do your own performance measurements.
+    // NOTE: Used by llama.cpp examples/tools, avoid using in third-party apps. Instead, do your own performance measurements.
     //

     struct llama_perf_context_data {
-        double t_start_ms;
-        double t_load_ms;
-        double t_p_eval_ms;
-        double t_eval_ms;
-
-        int32_t n_p_eval;
-        int32_t n_eval;
+        // ms == milliseconds
+        double t_start_ms;  // absolute start time
+        double t_load_ms;   // time needed for loading the model
+        double t_p_eval_ms; // time needed for processing the prompt
+        double t_eval_ms;   // time needed for generating tokens
+
+        int32_t n_p_eval;   // number of prompt tokens
+        int32_t n_eval;     // number of generated tokens
+        int32_t n_reused;   // number of times a ggml compute graph had been reused
     };

     struct llama_perf_sampler_data {
-        double t_sample_ms;
+        double t_sample_ms; // time needed for sampling in ms

-        int32_t n_sample;
+        int32_t n_sample;   // number of sampled tokens
     };

     LLAMA_API struct llama_perf_context_data llama_perf_context (const struct llama_context * ctx);
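The newly documented fields make throughput math straightforward; for illustration:

```cpp
#include "llama.h"
#include <cstdio>

// Sketch: derive tokens/second from the perf counters.
static void print_throughput(const struct llama_context * ctx) {
    const struct llama_perf_context_data d = llama_perf_context(ctx);

    if (d.t_p_eval_ms > 0.0) {
        printf("prompt:   %.2f tok/s\n", 1e3 * d.n_p_eval / d.t_p_eval_ms);
    }
    if (d.t_eval_ms > 0.0) {
        printf("generate: %.2f tok/s\n", 1e3 * d.n_eval / d.t_eval_ms);
    }
    printf("graph reuses: %d\n", d.n_reused);
}
```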
@@ -1446,6 +1359,9 @@ extern "C" {
     LLAMA_API void llama_perf_sampler_print(const struct llama_sampler * chain);
     LLAMA_API void llama_perf_sampler_reset(      struct llama_sampler * chain);

+    // print a breakdown of per-device memory use via LLAMA_LOG:
+    LLAMA_API void llama_memory_breakdown_print(const struct llama_context * ctx);
+
     //
     // training
     //
@@ -1464,6 +1380,8 @@ extern "C" {

         ggml_opt_get_optimizer_params get_opt_pars; // callback for calculating optimizer parameters
         void * get_opt_pars_ud;                     // userdata for calculating optimizer parameters
+
+        enum ggml_opt_optimizer_type optimizer_type;
     };

     LLAMA_API void llama_opt_init(struct llama_context * lctx, struct llama_model * model, struct llama_opt_params lopt_params);
@@ -66,7 +66,7 @@ struct whisper_params {
    float top_p = 0.80f;
    float min_p = 0.01f;
    float temp  = 0.30f;
-
+
    float vad_thold  = 0.6f;
    float freq_thold = 100.0f;

@@ -76,7 +76,7 @@ struct whisper_params {
    bool no_timestamps  = true;
    bool verbose_prompt = false;
    bool use_gpu        = true;
-    bool flash_attn     = false;
+    bool flash_attn     = true;

    std::string person   = "Georgi";
    std::string bot_name = "LLaMA";
@@ -122,6 +122,7 @@ static bool whisper_params_parse(int argc, char ** argv, whisper_params & params
    else if (arg == "-vp"  || arg == "--verbose-prompt") { params.verbose_prompt = true;  }
    else if (arg == "-ng"  || arg == "--no-gpu")         { params.use_gpu        = false; }
    else if (arg == "-fa"  || arg == "--flash-attn")     { params.flash_attn     = true;  }
+    else if (arg == "-nfa" || arg == "--no-flash-attn")  { params.flash_attn     = false; }
    else if (arg == "-p"   || arg == "--person")         { params.person         = argv[++i]; }
    else if (arg == "-bn"  || arg == "--bot-name")       { params.bot_name       = argv[++i]; }
    else if (arg == "--session")                         { params.path_session   = argv[++i]; }
@@ -175,7 +176,8 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
    fprintf(stderr, "  -pe,      --print-energy   [%-7s] print sound energy (for debugging)\n", params.print_energy ? "true" : "false");
    fprintf(stderr, "  -vp,      --verbose-prompt [%-7s] print prompt at start\n",              params.verbose_prompt ? "true" : "false");
    fprintf(stderr, "  -ng,      --no-gpu         [%-7s] disable GPU\n",                        params.use_gpu ? "false" : "true");
-    fprintf(stderr, "  -fa,      --flash-attn     [%-7s] flash attention\n",                    params.flash_attn ? "true" : "false");
+    fprintf(stderr, "  -fa,      --flash-attn     [%-7s] enable flash attention\n",             params.flash_attn ? "true" : "false");
+    fprintf(stderr, "  -nfa,     --no-flash-attn  [%-7s] disable flash attention\n",            params.flash_attn ? "false" : "true");
    fprintf(stderr, "  -p NAME,  --person NAME    [%-7s] person name (for prompt selection)\n", params.person.c_str());
    fprintf(stderr, "  -bn NAME, --bot-name NAME  [%-7s] bot name (to display)\n",              params.bot_name.c_str());
    fprintf(stderr, "  -w TEXT,  --wake-command T [%-7s] wake-up command to listen for\n",      params.wake_cmd.c_str());
@@ -340,9 +342,10 @@ int main(int argc, char ** argv) {
    llama_context_params lcparams = llama_context_default_params();

    // tune these to your liking
-    lcparams.n_ctx      = 2048;
-    lcparams.n_threads  = params.n_threads;
-    lcparams.flash_attn = params.flash_attn;
+    lcparams.n_ctx      = 2048;
+    lcparams.n_threads  = params.n_threads;
+
+    lcparams.flash_attn_type = params.flash_attn ? LLAMA_FLASH_ATTN_TYPE_AUTO : LLAMA_FLASH_ATTN_TYPE_DISABLED;

    struct llama_context * ctx_llama = llama_init_from_model(model_llama, lcparams);
