whispercpp 1.3.2 → 1.3.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +6 -3
- data/README.md +71 -14
- data/Rakefile +20 -7
- data/ext/.gitignore +4 -6
- data/ext/dependencies.rb +36 -24
- data/ext/extconf.rb +1 -1
- data/ext/options.rb +48 -184
- data/ext/ruby_whisper.c +18 -0
- data/ext/ruby_whisper_context.c +43 -12
- data/ext/ruby_whisper_model.c +1 -1
- data/ext/ruby_whisper_params.c +59 -27
- data/ext/ruby_whisper_segment.c +81 -4
- data/ext/ruby_whisper_transcribe.cpp +13 -7
- data/ext/ruby_whisper_vad_params.c +1 -1
- data/ext/sources/CMakeLists.txt +5 -1
- data/ext/sources/bindings/javascript/package.json +1 -1
- data/ext/sources/build-xcframework.sh +24 -0
- data/ext/sources/examples/CMakeLists.txt +1 -0
- data/ext/sources/examples/addon.node/__test__/whisper.spec.js +120 -24
- data/ext/sources/examples/addon.node/addon.cpp +154 -35
- data/ext/sources/examples/addon.node/index.js +10 -5
- data/ext/sources/examples/addon.node/vad-example.js +132 -0
- data/ext/sources/examples/bench/bench.cpp +29 -18
- data/ext/sources/examples/bench.wasm/index-tmpl.html +10 -9
- data/ext/sources/examples/cli/cli.cpp +7 -4
- data/ext/sources/examples/command/command.cpp +58 -32
- data/ext/sources/examples/command.wasm/index-tmpl.html +5 -4
- data/ext/sources/examples/common-ggml.cpp +2 -0
- data/ext/sources/examples/common-whisper.cpp +14 -7
- data/ext/sources/examples/lsp/lsp.cpp +21 -17
- data/ext/sources/examples/quantize/quantize.cpp +3 -0
- data/ext/sources/examples/server/CMakeLists.txt +3 -0
- data/ext/sources/examples/server/server.cpp +193 -35
- data/ext/sources/examples/server.py +6 -1
- data/ext/sources/examples/stream/stream.cpp +10 -2
- data/ext/sources/examples/stream.wasm/emscripten.cpp +6 -6
- data/ext/sources/examples/stream.wasm/index-tmpl.html +82 -5
- data/ext/sources/examples/talk-llama/CMakeLists.txt +3 -0
- data/ext/sources/examples/talk-llama/llama-adapter.cpp +101 -4
- data/ext/sources/examples/talk-llama/llama-adapter.h +6 -0
- data/ext/sources/examples/talk-llama/llama-arch.cpp +756 -15
- data/ext/sources/examples/talk-llama/llama-arch.h +85 -1
- data/ext/sources/examples/talk-llama/llama-batch.cpp +773 -272
- data/ext/sources/examples/talk-llama/llama-batch.h +126 -55
- data/ext/sources/examples/talk-llama/llama-chat.cpp +150 -13
- data/ext/sources/examples/talk-llama/llama-chat.h +8 -0
- data/ext/sources/examples/talk-llama/llama-context.cpp +814 -542
- data/ext/sources/examples/talk-llama/llama-context.h +68 -32
- data/ext/sources/examples/talk-llama/llama-cparams.cpp +1 -1
- data/ext/sources/examples/talk-llama/llama-cparams.h +4 -4
- data/ext/sources/examples/talk-llama/llama-graph.cpp +787 -440
- data/ext/sources/examples/talk-llama/llama-graph.h +333 -153
- data/ext/sources/examples/talk-llama/llama-hparams.cpp +128 -6
- data/ext/sources/examples/talk-llama/llama-hparams.h +80 -17
- data/ext/sources/examples/talk-llama/llama-impl.h +2 -0
- data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.cpp +326 -0
- data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.h +137 -0
- data/ext/sources/examples/talk-llama/llama-kv-cache.cpp +1248 -1967
- data/ext/sources/examples/talk-llama/llama-kv-cache.h +218 -345
- data/ext/sources/examples/talk-llama/llama-kv-cells.h +164 -52
- data/ext/sources/examples/talk-llama/llama-memory-hybrid.cpp +266 -0
- data/ext/sources/examples/talk-llama/llama-memory-hybrid.h +139 -0
- data/ext/sources/examples/talk-llama/llama-memory-recurrent.cpp +1154 -0
- data/ext/sources/examples/talk-llama/llama-memory-recurrent.h +182 -0
- data/ext/sources/examples/talk-llama/llama-memory.cpp +58 -0
- data/ext/sources/examples/talk-llama/llama-memory.h +94 -4
- data/ext/sources/examples/talk-llama/llama-mmap.cpp +1 -1
- data/ext/sources/examples/talk-llama/llama-model-loader.cpp +44 -17
- data/ext/sources/examples/talk-llama/llama-model-loader.h +3 -2
- data/ext/sources/examples/talk-llama/llama-model-saver.cpp +1 -0
- data/ext/sources/examples/talk-llama/llama-model.cpp +11377 -5248
- data/ext/sources/examples/talk-llama/llama-model.h +87 -9
- data/ext/sources/examples/talk-llama/llama-quant.cpp +137 -16
- data/ext/sources/examples/talk-llama/llama-sampling.cpp +226 -126
- data/ext/sources/examples/talk-llama/llama-vocab.cpp +502 -38
- data/ext/sources/examples/talk-llama/llama-vocab.h +46 -0
- data/ext/sources/examples/talk-llama/llama.cpp +76 -17
- data/ext/sources/examples/talk-llama/llama.h +176 -151
- data/ext/sources/examples/talk-llama/talk-llama.cpp +11 -6
- data/ext/sources/examples/talk-llama/unicode.cpp +212 -0
- data/ext/sources/examples/talk-llama/unicode.h +45 -0
- data/ext/sources/examples/vad-speech-segments/speech.cpp +6 -0
- data/ext/sources/examples/wchess/wchess.cmd/wchess.cmd.cpp +6 -2
- data/ext/sources/examples/whisper.wasm/index-tmpl.html +17 -16
- data/ext/sources/ggml/CMakeLists.txt +106 -33
- data/ext/sources/ggml/cmake/common.cmake +24 -0
- data/ext/sources/ggml/cmake/ggml-config.cmake.in +132 -93
- data/ext/sources/ggml/include/ggml-backend.h +18 -2
- data/ext/sources/ggml/include/ggml-cpu.h +2 -0
- data/ext/sources/ggml/include/ggml-metal.h +1 -6
- data/ext/sources/ggml/include/ggml-opt.h +25 -6
- data/ext/sources/ggml/include/ggml-webgpu.h +19 -0
- data/ext/sources/ggml/include/ggml-zdnn.h +17 -0
- data/ext/sources/ggml/include/ggml.h +365 -21
- data/ext/sources/ggml/src/CMakeLists.txt +98 -25
- data/ext/sources/ggml/src/ggml-alloc.c +265 -141
- data/ext/sources/ggml/src/ggml-backend-impl.h +4 -1
- data/ext/sources/ggml/src/ggml-backend-reg.cpp +35 -13
- data/ext/sources/ggml/src/ggml-backend.cpp +266 -60
- data/ext/sources/ggml/src/ggml-blas/CMakeLists.txt +4 -4
- data/ext/sources/ggml/src/ggml-blas/ggml-blas.cpp +5 -4
- data/ext/sources/ggml/src/ggml-cann/CMakeLists.txt +15 -0
- data/ext/sources/ggml/src/ggml-cann/acl_tensor.cpp +3 -1
- data/ext/sources/ggml/src/ggml-cann/aclnn_ops.cpp +903 -717
- data/ext/sources/ggml/src/ggml-cann/aclnn_ops.h +143 -25
- data/ext/sources/ggml/src/ggml-cann/common.h +149 -2
- data/ext/sources/ggml/src/ggml-cann/ggml-cann.cpp +521 -78
- data/ext/sources/ggml/src/ggml-common.h +21 -0
- data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +165 -50
- data/ext/sources/ggml/src/ggml-cpu/amx/amx.cpp +5 -3
- data/ext/sources/ggml/src/ggml-cpu/amx/mmq.cpp +11 -10
- data/ext/sources/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/arm/quants.c +3650 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/arm/repack.cpp +1891 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/loongarch/quants.c +2160 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/powerpc/quants.c +2305 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/riscv/quants.c +1897 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/riscv/repack.cpp +342 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/s390/quants.c +1468 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/wasm/quants.c +1221 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/x86/quants.c +3820 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/x86/repack.cpp +6307 -0
- data/ext/sources/ggml/src/ggml-cpu/arch-fallback.h +214 -0
- data/ext/sources/ggml/src/ggml-cpu/common.h +18 -3
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-impl.h +23 -7
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +179 -110
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.cpp +44 -33
- data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-hbm.cpp → hbm.cpp} +1 -1
- data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.cpp +152 -18
- data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.h +7 -1
- data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +228 -98
- data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +532 -1124
- data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.h +5 -0
- data/ext/sources/ggml/src/ggml-cpu/ops.cpp +3374 -2081
- data/ext/sources/ggml/src/ggml-cpu/ops.h +13 -8
- data/ext/sources/ggml/src/ggml-cpu/quants.c +1193 -0
- data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-quants.h → quants.h} +34 -0
- data/ext/sources/ggml/src/ggml-cpu/repack.cpp +1982 -0
- data/ext/sources/ggml/src/ggml-cpu/repack.h +120 -0
- data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +367 -46
- data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.cpp +1024 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.h +13 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp +3196 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/ime_kernels.h +26 -0
- data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-traits.cpp → traits.cpp} +3 -3
- data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-traits.h → traits.h} +1 -1
- data/ext/sources/ggml/src/ggml-cpu/vec.cpp +272 -35
- data/ext/sources/ggml/src/ggml-cpu/vec.h +794 -142
- data/ext/sources/ggml/src/ggml-cuda/CMakeLists.txt +20 -16
- data/ext/sources/ggml/src/ggml-cuda/add-id.cu +58 -0
- data/ext/sources/ggml/src/ggml-cuda/add-id.cuh +3 -0
- data/ext/sources/ggml/src/ggml-cuda/binbcast.cu +330 -191
- data/ext/sources/ggml/src/ggml-cuda/binbcast.cuh +2 -0
- data/ext/sources/ggml/src/ggml-cuda/common.cuh +291 -81
- data/ext/sources/ggml/src/ggml-cuda/conv-transpose-1d.cu +1 -4
- data/ext/sources/ggml/src/ggml-cuda/conv2d-dw.cu +161 -0
- data/ext/sources/ggml/src/ggml-cuda/conv2d-dw.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/conv2d-transpose.cu +91 -0
- data/ext/sources/ggml/src/ggml-cuda/conv2d-transpose.cuh +4 -0
- data/ext/sources/ggml/src/ggml-cuda/conv2d.cu +166 -0
- data/ext/sources/ggml/src/ggml-cuda/conv2d.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/convert.cu +117 -22
- data/ext/sources/ggml/src/ggml-cuda/convert.cuh +20 -0
- data/ext/sources/ggml/src/ggml-cuda/cpy-utils.cuh +217 -0
- data/ext/sources/ggml/src/ggml-cuda/cpy.cu +64 -307
- data/ext/sources/ggml/src/ggml-cuda/cross-entropy-loss.cu +2 -14
- data/ext/sources/ggml/src/ggml-cuda/dequantize.cuh +14 -40
- data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +499 -368
- data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +142 -93
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cu +755 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cuh +3 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn-vec.cuh +593 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +90 -50
- data/ext/sources/ggml/src/ggml-cuda/fattn.cu +185 -198
- data/ext/sources/ggml/src/ggml-cuda/fattn.cuh +2 -0
- data/ext/sources/ggml/src/ggml-cuda/getrows.cu +50 -39
- data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +636 -222
- data/ext/sources/ggml/src/ggml-cuda/im2col.cu +196 -35
- data/ext/sources/ggml/src/ggml-cuda/im2col.cuh +1 -0
- data/ext/sources/ggml/src/ggml-cuda/mean.cu +73 -0
- data/ext/sources/ggml/src/ggml-cuda/mean.cuh +3 -0
- data/ext/sources/ggml/src/ggml-cuda/mma.cuh +198 -45
- data/ext/sources/ggml/src/ggml-cuda/mmf.cu +123 -0
- data/ext/sources/ggml/src/ggml-cuda/mmf.cuh +496 -0
- data/ext/sources/ggml/src/ggml-cuda/mmq.cu +206 -57
- data/ext/sources/ggml/src/ggml-cuda/mmq.cuh +1262 -721
- data/ext/sources/ggml/src/ggml-cuda/mmvf.cu +506 -0
- data/ext/sources/ggml/src/ggml-cuda/{mmv.cuh → mmvf.cuh} +4 -5
- data/ext/sources/ggml/src/ggml-cuda/mmvq.cu +64 -73
- data/ext/sources/ggml/src/ggml-cuda/norm.cu +284 -12
- data/ext/sources/ggml/src/ggml-cuda/norm.cuh +7 -0
- data/ext/sources/ggml/src/ggml-cuda/opt-step-sgd.cu +49 -0
- data/ext/sources/ggml/src/ggml-cuda/opt-step-sgd.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/pad.cu +46 -23
- data/ext/sources/ggml/src/ggml-cuda/pad_reflect_1d.cu +91 -0
- data/ext/sources/ggml/src/ggml-cuda/pad_reflect_1d.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/quantize.cu +12 -10
- data/ext/sources/ggml/src/ggml-cuda/reduce_rows.cuh +53 -0
- data/ext/sources/ggml/src/ggml-cuda/roll.cu +67 -0
- data/ext/sources/ggml/src/ggml-cuda/roll.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/rope.cu +21 -27
- data/ext/sources/ggml/src/ggml-cuda/scale.cu +14 -11
- data/ext/sources/ggml/src/ggml-cuda/set-rows.cu +276 -0
- data/ext/sources/ggml/src/ggml-cuda/set-rows.cuh +7 -0
- data/ext/sources/ggml/src/ggml-cuda/softcap.cu +34 -0
- data/ext/sources/ggml/src/ggml-cuda/softcap.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/softmax.cu +126 -59
- data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cu +10 -2
- data/ext/sources/ggml/src/ggml-cuda/ssm-scan.cu +322 -98
- data/ext/sources/ggml/src/ggml-cuda/sum.cu +6 -10
- data/ext/sources/ggml/src/ggml-cuda/sumrows.cu +23 -19
- data/ext/sources/ggml/src/ggml-cuda/sumrows.cuh +0 -1
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-f16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q4_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q4_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q5_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q5_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q8_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-f16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q5_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q5_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q8_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-f16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q4_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q4_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q5_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q5_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q8_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-f16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q4_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q4_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q5_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q5_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q8_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-f16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q4_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q4_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q5_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q5_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q8_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-f16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q4_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q4_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q5_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q5_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q8_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +21 -18
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_10.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_11.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_12.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_13.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_14.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_15.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_16.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_2.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_3.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_4.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_5.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_6.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_7.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_8.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_9.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-mxfp4.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/topk-moe.cu +259 -0
- data/ext/sources/ggml/src/ggml-cuda/topk-moe.cuh +14 -0
- data/ext/sources/ggml/src/ggml-cuda/tsembd.cu +3 -3
- data/ext/sources/ggml/src/ggml-cuda/unary.cu +179 -0
- data/ext/sources/ggml/src/ggml-cuda/unary.cuh +15 -0
- data/ext/sources/ggml/src/ggml-cuda/upscale.cu +92 -6
- data/ext/sources/ggml/src/ggml-cuda/vecdotq.cuh +110 -22
- data/ext/sources/ggml/src/ggml-cuda/vendors/cuda.h +4 -0
- data/ext/sources/ggml/src/ggml-cuda/vendors/hip.h +58 -36
- data/ext/sources/ggml/src/ggml-cuda/vendors/musa.h +4 -3
- data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +14 -2
- data/ext/sources/ggml/src/ggml-impl.h +229 -175
- data/ext/sources/ggml/src/ggml-metal/CMakeLists.txt +21 -17
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-common.cpp +446 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-common.h +52 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.h +33 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.m +600 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.cpp +1376 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.h +226 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.m +1308 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +163 -63
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.cpp +3158 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.h +82 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal.cpp +718 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal.metal +3208 -1575
- data/ext/sources/ggml/src/ggml-musa/CMakeLists.txt +18 -8
- data/ext/sources/ggml/src/ggml-musa/mudnn.cuh +2 -2
- data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +32 -0
- data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +4430 -792
- data/ext/sources/ggml/src/ggml-opencl/kernels/add.cl +107 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/add_id.cl +42 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/argsort.cl +86 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/concat.cl +109 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/conv2d.cl +185 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/conv2d_f16_f32.cl +176 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/cvt.cl +84 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/div.cl +138 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/flash_attn_f16.cl +370 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/flash_attn_f32.cl +370 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/flash_attn_f32_f16.cl +373 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gelu.cl +27 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/glu.cl +378 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/group_norm.cl +121 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/im2col_f16.cl +1 -1
- data/ext/sources/ggml/src/ggml-opencl/kernels/im2col_f32.cl +1 -1
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul.cl +73 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mat_f16_f32.cl +130 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_l4_lm.cl +132 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_f32_f32_l4_lm.cl +133 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32.cl +189 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32_flat.cl +176 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl +283 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32.cl +140 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32_flat.cl +222 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32.cl +144 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32_flat.cl +167 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32.cl +125 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32_flat.cl +202 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/norm.cl +80 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/pad.cl +30 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/repeat.cl +39 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/rms_norm.cl +79 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/scale.cl +3 -2
- data/ext/sources/ggml/src/ggml-opencl/kernels/set_rows.cl +189 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/sigmoid.cl +29 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_4_f16.cl +34 -13
- data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_4_f32.cl +34 -13
- data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_f16.cl +34 -13
- data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_f32.cl +34 -13
- data/ext/sources/ggml/src/ggml-opencl/kernels/sub.cl +138 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/sum_rows.cl +39 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/tanh.cl +63 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/transpose.cl +20 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/tsembd.cl +48 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/upscale.cl +120 -0
- data/ext/sources/ggml/src/ggml-opt.cpp +97 -41
- data/ext/sources/ggml/src/ggml-quants.c +117 -24
- data/ext/sources/ggml/src/ggml-quants.h +6 -0
- data/ext/sources/ggml/src/ggml-rpc/ggml-rpc.cpp +85 -62
- data/ext/sources/ggml/src/ggml-sycl/CMakeLists.txt +3 -3
- data/ext/sources/ggml/src/ggml-sycl/backend.hpp +2 -0
- data/ext/sources/ggml/src/ggml-sycl/binbcast.cpp +9 -0
- data/ext/sources/ggml/src/ggml-sycl/binbcast.hpp +6 -0
- data/ext/sources/ggml/src/ggml-sycl/common.hpp +20 -48
- data/ext/sources/ggml/src/ggml-sycl/concat.cpp +13 -17
- data/ext/sources/ggml/src/ggml-sycl/convert.cpp +21 -2
- data/ext/sources/ggml/src/ggml-sycl/cpy.cpp +116 -211
- data/ext/sources/ggml/src/ggml-sycl/cpy.hpp +213 -1
- data/ext/sources/ggml/src/ggml-sycl/dequantize.hpp +32 -0
- data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +700 -1041
- data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +20 -9
- data/ext/sources/ggml/src/ggml-sycl/gemm.hpp +17 -26
- data/ext/sources/ggml/src/ggml-sycl/getrows.cpp +2 -96
- data/ext/sources/ggml/src/ggml-sycl/ggml-sycl.cpp +393 -250
- data/ext/sources/ggml/src/ggml-sycl/im2col.cpp +1 -1
- data/ext/sources/ggml/src/ggml-sycl/mmvq.cpp +32 -8
- data/ext/sources/ggml/src/ggml-sycl/quantize.hpp +133 -0
- data/ext/sources/ggml/src/ggml-sycl/quants.hpp +38 -11
- data/ext/sources/ggml/src/ggml-sycl/rope.cpp +125 -21
- data/ext/sources/ggml/src/ggml-sycl/set_rows.cpp +234 -0
- data/ext/sources/ggml/src/ggml-sycl/set_rows.hpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/sycl_hw.cpp +3 -1
- data/ext/sources/ggml/src/ggml-sycl/sycl_hw.hpp +3 -0
- data/ext/sources/ggml/src/ggml-sycl/tsembd.cpp +4 -3
- data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +105 -17
- data/ext/sources/ggml/src/ggml-vulkan/CMakeLists.txt +36 -32
- data/ext/sources/ggml/src/ggml-vulkan/ggml-vulkan.cpp +4198 -1145
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -12
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add.comp +41 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add_id.comp +42 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argmax.comp +13 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp +39 -29
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +349 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +98 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +66 -12
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.comp +154 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.comp +21 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_s.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xxs.comp +2 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_s.comp +6 -5
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_xxs.comp +4 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_mxfp4.comp +32 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q2_k.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q3_k.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_k.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_k.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q6_k.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/exp.comp +21 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +69 -24
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp +60 -20
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +98 -42
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +64 -27
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +74 -13
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu.comp +13 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu_erf.comp +27 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu_quick.comp +11 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gelu_erf.comp +39 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/generic_binary_head.comp +4 -17
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/get_rows.comp +19 -10
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/get_rows_quant.comp +25 -15
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.comp +19 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_main.comp +29 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/hardsigmoid.comp +22 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/hardswish.comp +22 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +18 -14
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col_3d.comp +126 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.comp +65 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_nc.comp +11 -7
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp +140 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +144 -531
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +206 -38
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.comp +556 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +12 -5
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.comp +15 -9
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp +111 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_sgd.comp +22 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/pad.comp +24 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/quantize_q8_1.comp +53 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/reglu.comp +9 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +64 -11
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_partials.comp +65 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/roll.comp +46 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.comp +1 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +7 -9
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +7 -9
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +7 -9
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rte.comp +5 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +29 -7
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_back.comp +4 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sqrt.comp +17 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.comp +38 -5
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/swiglu.comp +9 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/swiglu_oai.comp +14 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/timestep_embedding.comp +4 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/types.comp +101 -9
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +69 -5
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/utils.comp +25 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +338 -71
- data/ext/sources/ggml/src/ggml-webgpu/CMakeLists.txt +54 -0
- data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu.cpp +1558 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/add.tmpl.wgsl +44 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/add_in_place.tmpl.wgsl +41 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary_head.tmpl +45 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +930 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl +60 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +124 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/get_rows.tmpl.wgsl +874 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl +40 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul.tmpl.wgsl +44 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_in_place.tmpl.wgsl +41 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl +907 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm.wgsl +57 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm_in_place.wgsl +48 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +81 -0
- data/ext/sources/ggml/src/ggml-zdnn/CMakeLists.txt +36 -0
- data/ext/sources/ggml/src/ggml-zdnn/common.hpp +59 -0
- data/ext/sources/ggml/src/ggml-zdnn/ggml-zdnn.cpp +628 -0
- data/ext/sources/ggml/src/ggml-zdnn/mmf.cpp +80 -0
- data/ext/sources/ggml/src/ggml-zdnn/mmf.hpp +12 -0
- data/ext/sources/ggml/src/ggml-zdnn/utils.cpp +79 -0
- data/ext/sources/ggml/src/ggml-zdnn/utils.hpp +19 -0
- data/ext/sources/ggml/src/ggml.c +802 -142
- data/ext/sources/ggml/src/ggml.cpp +26 -0
- data/ext/sources/ggml/src/gguf.cpp +32 -4
- data/ext/sources/include/whisper.h +2 -0
- data/ext/sources/src/CMakeLists.txt +2 -0
- data/ext/sources/src/coreml/whisper-compat.h +10 -0
- data/ext/sources/src/coreml/whisper-compat.m +35 -0
- data/ext/sources/src/coreml/whisper-decoder-impl.m +1 -0
- data/ext/sources/src/coreml/whisper-encoder-impl.m +1 -0
- data/ext/sources/src/whisper.cpp +241 -215
- data/ext/sources/tests/CMakeLists.txt +8 -1
- data/ext/sources/tests/test-vad-full.cpp +3 -3
- data/ext/sources/tests/test-vad.cpp +2 -2
- data/extsources.rb +15 -9
- data/lib/whisper/context.rb +15 -0
- data/lib/whisper/model/uri.rb +57 -2
- data/lib/whisper/segment.rb +58 -0
- data/sig/whisper.rbs +75 -38
- data/{tests → test}/helper.rb +1 -12
- data/{tests → test}/test_model.rb +9 -0
- data/test/test_package.rb +51 -0
- data/{tests → test}/test_params.rb +8 -0
- data/test/test_segment.rb +146 -0
- data/{tests → test}/test_whisper.rb +70 -0
- data/whispercpp.gemspec +2 -3
- metadata +246 -191
- data/ext/sources/.dockerignore +0 -3
- data/ext/sources/.github/workflows/bindings-ruby.yml +0 -21
- data/ext/sources/ci/run.sh +0 -336
- data/ext/sources/close-issue.yml +0 -28
- data/ext/sources/ggml/include/ggml-kompute.h +0 -50
- data/ext/sources/ggml/src/ggml-amx/CMakeLists.txt +0 -107
- data/ext/sources/ggml/src/ggml-amx/common.h +0 -94
- data/ext/sources/ggml/src/ggml-amx/ggml-amx.cpp +0 -446
- data/ext/sources/ggml/src/ggml-amx/mmq.cpp +0 -2510
- data/ext/sources/ggml/src/ggml-amx/mmq.h +0 -17
- data/ext/sources/ggml/src/ggml-cann/kernels/CMakeLists.txt +0 -30
- data/ext/sources/ggml/src/ggml-cann/kernels/ascendc_kernels.h +0 -19
- data/ext/sources/ggml/src/ggml-cann/kernels/dup.cpp +0 -234
- data/ext/sources/ggml/src/ggml-cann/kernels/get_row_f16.cpp +0 -197
- data/ext/sources/ggml/src/ggml-cann/kernels/get_row_f32.cpp +0 -190
- data/ext/sources/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -204
- data/ext/sources/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -191
- data/ext/sources/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -218
- data/ext/sources/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -216
- data/ext/sources/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -295
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +0 -6431
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +0 -8
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-quants.c +0 -13747
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f16.cu +0 -357
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f16.cuh +0 -3
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f32.cu +0 -365
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f32.cuh +0 -3
- data/ext/sources/ggml/src/ggml-cuda/fattn-vec-f16.cuh +0 -482
- data/ext/sources/ggml/src/ggml-cuda/fattn-vec-f32.cuh +0 -472
- data/ext/sources/ggml/src/ggml-cuda/mmv.cu +0 -336
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-kompute/CMakeLists.txt +0 -166
- data/ext/sources/ggml/src/ggml-kompute/ggml-kompute.cpp +0 -2251
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/common.comp +0 -112
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_add.comp +0 -58
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_addrow.comp +0 -25
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f16.comp +0 -52
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f32.comp +0 -52
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f16.comp +0 -52
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f32.comp +0 -52
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_diagmask.comp +0 -30
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_gelu.comp +0 -22
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows.comp +0 -17
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f16.comp +0 -31
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f32.comp +0 -31
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_0.comp +0 -38
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_1.comp +0 -39
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q6_k.comp +0 -44
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul.comp +0 -52
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_f16.comp +0 -69
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_mat_f32.comp +0 -51
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_0.comp +0 -33
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_1.comp +0 -35
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_k.comp +0 -140
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q6_k.comp +0 -106
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q8_0.comp +0 -73
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n.comp +0 -52
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n_pre.comp +0 -28
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_norm.comp +0 -84
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_relu.comp +0 -21
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rmsnorm.comp +0 -53
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f16.comp +0 -52
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f32.comp +0 -52
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f16.comp +0 -52
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f32.comp +0 -52
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_scale.comp +0 -19
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_scale_8.comp +0 -23
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_silu.comp +0 -22
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_softmax.comp +0 -72
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/rope_common.comp +0 -71
- data/ext/sources/ggml/src/ggml-metal/ggml-metal.m +0 -5998
- data/tests/test_package.rb +0 -46
- data/tests/test_segment.rb +0 -74
- /data/ext/sources/ggml/src/ggml-cpu/{cpu-feats-x86.cpp → arch/x86/cpu-feats.cpp} +0 -0
- /data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-hbm.h → hbm.h} +0 -0
- /data/{tests → test}/jfk_reader/.gitignore +0 -0
- /data/{tests → test}/jfk_reader/extconf.rb +0 -0
- /data/{tests → test}/jfk_reader/jfk_reader.c +0 -0
- /data/{tests → test}/test_callback.rb +0 -0
- /data/{tests → test}/test_error.rb +0 -0
- /data/{tests → test}/test_vad.rb +0 -0
- /data/{tests → test}/test_vad_params.rb +0 -0
What follows is the rendered diff for data/ext/sources/examples/talk-llama/llama.h, the bundled llama.cpp public header, where most of the user-visible API changes land:

```diff
@@ -61,59 +61,21 @@ extern "C" {
     struct llama_model;
     struct llama_context;
     struct llama_sampler;
-
+
+    typedef struct llama_memory_i * llama_memory_t;
 
     typedef int32_t llama_pos;
     typedef int32_t llama_token;
     typedef int32_t llama_seq_id;
 
     enum llama_vocab_type {
-        LLAMA_VOCAB_TYPE_NONE
-        LLAMA_VOCAB_TYPE_SPM
-        LLAMA_VOCAB_TYPE_BPE
-        LLAMA_VOCAB_TYPE_WPM
-        LLAMA_VOCAB_TYPE_UGM
-        LLAMA_VOCAB_TYPE_RWKV
-
-
-    // pre-tokenization types
-    enum llama_vocab_pre_type {
-        LLAMA_VOCAB_PRE_TYPE_DEFAULT        = 0,
-        LLAMA_VOCAB_PRE_TYPE_LLAMA3         = 1,
-        LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM   = 2,
-        LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER = 3,
-        LLAMA_VOCAB_PRE_TYPE_FALCON         = 4,
-        LLAMA_VOCAB_PRE_TYPE_MPT            = 5,
-        LLAMA_VOCAB_PRE_TYPE_STARCODER      = 6,
-        LLAMA_VOCAB_PRE_TYPE_GPT2           = 7,
-        LLAMA_VOCAB_PRE_TYPE_REFACT         = 8,
-        LLAMA_VOCAB_PRE_TYPE_COMMAND_R      = 9,
-        LLAMA_VOCAB_PRE_TYPE_STABLELM2      = 10,
-        LLAMA_VOCAB_PRE_TYPE_QWEN2          = 11,
-        LLAMA_VOCAB_PRE_TYPE_OLMO           = 12,
-        LLAMA_VOCAB_PRE_TYPE_DBRX           = 13,
-        LLAMA_VOCAB_PRE_TYPE_SMAUG          = 14,
-        LLAMA_VOCAB_PRE_TYPE_PORO           = 15,
-        LLAMA_VOCAB_PRE_TYPE_CHATGLM3       = 16,
-        LLAMA_VOCAB_PRE_TYPE_CHATGLM4       = 17,
-        LLAMA_VOCAB_PRE_TYPE_VIKING         = 18,
-        LLAMA_VOCAB_PRE_TYPE_JAIS           = 19,
-        LLAMA_VOCAB_PRE_TYPE_TEKKEN         = 20,
-        LLAMA_VOCAB_PRE_TYPE_SMOLLM         = 21,
-        LLAMA_VOCAB_PRE_TYPE_CODESHELL      = 22,
-        LLAMA_VOCAB_PRE_TYPE_BLOOM          = 23,
-        LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH   = 24,
-        LLAMA_VOCAB_PRE_TYPE_EXAONE         = 25,
-        LLAMA_VOCAB_PRE_TYPE_CHAMELEON      = 26,
-        LLAMA_VOCAB_PRE_TYPE_MINERVA        = 27,
-        LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM  = 28,
-        LLAMA_VOCAB_PRE_TYPE_GPT4O          = 29,
-        LLAMA_VOCAB_PRE_TYPE_SUPERBPE       = 30,
-        LLAMA_VOCAB_PRE_TYPE_TRILLION       = 31,
-        LLAMA_VOCAB_PRE_TYPE_BAILINGMOE     = 32,
-        LLAMA_VOCAB_PRE_TYPE_LLAMA4         = 33,
-        LLAMA_VOCAB_PRE_TYPE_PIXTRAL        = 34,
-        LLAMA_VOCAB_PRE_TYPE_SEED_CODER     = 35,
+        LLAMA_VOCAB_TYPE_NONE   = 0, // For models without vocab
+        LLAMA_VOCAB_TYPE_SPM    = 1, // LLaMA tokenizer based on byte-level BPE with byte fallback
+        LLAMA_VOCAB_TYPE_BPE    = 2, // GPT-2 tokenizer based on byte-level BPE
+        LLAMA_VOCAB_TYPE_WPM    = 3, // BERT tokenizer based on WordPiece
+        LLAMA_VOCAB_TYPE_UGM    = 4, // T5 tokenizer based on Unigram
+        LLAMA_VOCAB_TYPE_RWKV   = 5, // RWKV tokenizer based on greedy tokenization
+        LLAMA_VOCAB_TYPE_PLAMO2 = 6, // PLaMo-2 tokenizer based on Aho-Corasick with dynamic programming
     };
 
     enum llama_rope_type {
```
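The `struct llama_kv_cache` handle that 1.3.2 exposed is gone; contexts now hand back the opaque `llama_memory_t` introduced above. A minimal sketch of the new handle in use, assuming `llama_memory_clear(llama_memory_t, bool)` from the same header revision; "model.gguf" is a placeholder path:

```c
#include "llama.h"

int main(void) {
    // placeholder path; any GGUF model works here
    struct llama_model * model = llama_model_load_from_file("model.gguf", llama_model_default_params());
    if (model == NULL) return 1;

    struct llama_context * ctx = llama_init_from_model(model, llama_context_default_params());
    if (ctx == NULL) { llama_model_free(model); return 1; }

    // 1.3.4: fetch the opaque memory handle instead of a llama_kv_cache pointer
    llama_memory_t mem = llama_get_memory(ctx);

    // clear cached state between unrelated prompts; passing true also clears
    // the data buffers together with the metadata (assumed from this header revision)
    llama_memory_clear(mem, true);

    llama_free(ctx);
    llama_model_free(model);
    return 0;
}
```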
```diff
@@ -188,6 +150,7 @@ extern "C" {
         //LLAMA_FTYPE_MOSTLY_Q4_0_8_8 = 35, // removed from gguf files, use Q4_0 and runtime repack
         LLAMA_FTYPE_MOSTLY_TQ1_0 = 36, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_TQ2_0 = 37, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_MXFP4_MOE = 38, // except 1d tensors
 
         LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
     };
@@ -216,6 +179,14 @@ extern "C" {
         LLAMA_ATTENTION_TYPE_NON_CAUSAL = 1,
     };
 
+    enum llama_flash_attn_type {
+        LLAMA_FLASH_ATTN_TYPE_AUTO     = -1,
+        LLAMA_FLASH_ATTN_TYPE_DISABLED = 0,
+        LLAMA_FLASH_ATTN_TYPE_ENABLED  = 1,
+    };
+
+    LLAMA_API const char * llama_flash_attn_type_name(enum llama_flash_attn_type flash_attn_type);
+
     enum llama_split_mode {
         LLAMA_SPLIT_MODE_NONE  = 0, // single GPU
         LLAMA_SPLIT_MODE_LAYER = 1, // split layers and KV across GPUs
```
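The boolean `flash_attn` context flag (dropped in a later hunk) becomes this tri-state enum. A short sketch of how a caller might opt in; the helper name and the choice of `LLAMA_FLASH_ATTN_TYPE_AUTO` are illustrative:

```c
#include "llama.h"
#include <stdio.h>

struct llama_context * make_context(struct llama_model * model) {
    struct llama_context_params cparams = llama_context_default_params();

    // the tri-state replaces the old `bool flash_attn`: AUTO defers to the backend
    cparams.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_AUTO;

    printf("flash attention: %s\n", llama_flash_attn_type_name(cparams.flash_attn_type));

    return llama_init_from_model(model, cparams);
}
```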
```diff
@@ -235,23 +206,26 @@ extern "C" {
         llama_token_data * data;
         size_t size;
         int64_t selected;  // this is the index in the data array (i.e. not the token id)
-        bool sorted;
+        bool sorted;       // note: do not assume the data is sorted - always check this flag
     } llama_token_data_array;
 
     typedef bool (*llama_progress_callback)(float progress, void * user_data);
 
-    // Input data for llama_decode
+    // Input data for llama_encode/llama_decode
     // A llama_batch object can contain input about one or many sequences
     // The provided arrays (i.e. token, embd, pos, etc.) must have size of n_tokens
     //
     // - token  : the token ids of the input (used when embd is NULL)
     // - embd   : token embeddings (i.e. float vector of size n_embd) (used when token is NULL)
     // - pos    : the positions of the respective token in the sequence
-    //            (if set to NULL, the token position will be tracked automatically by llama_decode)
+    //            (if set to NULL, the token position will be tracked automatically by llama_encode/llama_decode)
     // - seq_id : the sequence to which the respective token belongs
     //            (if set to NULL, the sequence ID will be assumed to be 0)
     // - logits : if zero, the logits (and/or the embeddings) for the respective token will not be output
-    //            (if set to NULL
+    //            (if set to NULL:
+    //             - if embeddings: all tokens are output
+    //             - if not:        only the last token is output
+    //            )
     //
     typedef struct llama_batch {
         int32_t n_tokens;
@@ -261,7 +235,7 @@ extern "C" {
         llama_pos    *  pos;
         int32_t      *  n_seq_id;
         llama_seq_id ** seq_id;
-        int8_t       *  logits;
+        int8_t       *  logits; // TODO: rename this to "output"
     } llama_batch;
 
     enum llama_model_kv_override_type {
```
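The expanded comment block pins down the output rules for the `logits` (soon "output") flags. A sketch of the common single-sequence case, requesting output only at the final position; the helper name is illustrative:

```c
#include "llama.h"

// decode a tokenized prompt, asking for logits only at the last token
int32_t decode_prompt(struct llama_context * ctx, const llama_token * tokens, int32_t n) {
    struct llama_batch batch = llama_batch_init(n, /*embd =*/ 0, /*n_seq_max =*/ 1);

    for (int32_t i = 0; i < n; i++) {
        batch.token   [i]    = tokens[i];
        batch.pos     [i]    = i;            // explicit positions
        batch.n_seq_id[i]    = 1;
        batch.seq_id  [i][0] = 0;            // everything belongs to sequence 0
        batch.logits  [i]    = (i == n - 1); // zero = no output for this token
    }
    batch.n_tokens = n;

    const int32_t ret = llama_decode(ctx, batch);
    llama_batch_free(batch);
    return ret;
}
```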
```diff
@@ -317,10 +291,11 @@ extern "C" {
         const struct llama_model_kv_override * kv_overrides;
 
         // Keep the booleans together to avoid misalignment during copy-by-value.
-        bool vocab_only;
-        bool use_mmap;
-        bool use_mlock;
-        bool check_tensors;
+        bool vocab_only;      // only load the vocabulary, no weights
+        bool use_mmap;        // use mmap if possible
+        bool use_mlock;       // force system to keep model in RAM
+        bool check_tensors;   // validate model tensor data
+        bool use_extra_bufts; // use extra buffer types (used for weight repacking)
     };
 
     // NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations
```
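With the fields now documented, the vocab-only path is worth a sketch: it loads the tokenizer without the weights, which is handy for tokenization tooling. The helper name is illustrative:

```c
#include "llama.h"

// load just the vocabulary, e.g. for a standalone tokenizer
struct llama_model * load_vocab_only(const char * path) {
    struct llama_model_params mparams = llama_model_default_params();
    mparams.vocab_only = true; // only load the vocabulary, no weights
    mparams.use_mmap   = true; // map the file rather than reading it in
    // use_extra_bufts (new in 1.3.4) controls the weight-repacking buffer types;
    // the default from llama_model_default_params is left untouched here
    return llama_model_load_from_file(path, mparams);
}
```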
```diff
@@ -336,6 +311,7 @@ extern "C" {
         enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
         enum llama_pooling_type      pooling_type;      // whether to pool (sum) embedding results by sequence id
         enum llama_attention_type    attention_type;    // attention type to use for embeddings
+        enum llama_flash_attn_type   flash_attn_type;   // when to enable Flash Attention
 
         // ref: https://github.com/ggml-org/llama.cpp/pull/2054
         float rope_freq_base; // RoPE base frequency, 0 = from model
@@ -345,7 +321,7 @@ extern "C" {
         float    yarn_beta_fast; // YaRN low correction dim
         float    yarn_beta_slow; // YaRN high correction dim
         uint32_t yarn_orig_ctx;  // YaRN original context size
-        float    defrag_thold;   // defragment the KV cache if holes/size > thold, <= 0 disabled (default)
+        float    defrag_thold;   // [DEPRECATED] defragment the KV cache if holes/size > thold, <= 0 disabled (default)
 
         ggml_backend_sched_eval_callback cb_eval;
         void * cb_eval_user_data;
@@ -362,10 +338,14 @@ extern "C" {
         // Keep the booleans together and at the end of the struct to avoid misalignment during copy-by-value.
         bool embeddings;  // if true, extract embeddings (together with logits)
         bool offload_kqv; // offload the KQV ops (including the KV cache) to GPU
-        bool flash_attn;  // use flash attention [EXPERIMENTAL]
         bool no_perf;     // measure performance timings
         bool op_offload;  // offload host tensor operations to device
         bool swa_full;    // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
+                          // NOTE: setting to false when n_seq_max > 1 can cause bad performance in some cases
+                          // ref: https://github.com/ggml-org/llama.cpp/pull/13845#issuecomment-2924800573
+        bool kv_unified;  // use a unified buffer across the input sequences when computing the attention
+                          // try to disable when n_seq_max > 1 for improved performance when the sequences do not share a large prefix
+                          // ref: https://github.com/ggml-org/llama.cpp/pull/14363
     };
 
     // model quantization parameters
```
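The two notes above both concern multi-sequence setups. One way to read them together, sketched under the assumption that the sequences share little prefix; the values and helper name are illustrative:

```c
#include "llama.h"

struct llama_context_params parallel_params(void) {
    struct llama_context_params p = llama_context_default_params();
    p.n_seq_max  = 8;     // several independent client slots
    p.swa_full   = true;  // per the NOTE: false with n_seq_max > 1 can hurt performance
    p.kv_unified = false; // per the comment: prefer split buffers when sequences share little prefix
    return p;
}
```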
@@ -382,6 +362,7 @@ extern "C" {
         void * imatrix;      // pointer to importance matrix data
         void * kv_overrides; // pointer to vector containing overrides
         void * tensor_types; // pointer to vector containing tensor types
+        void * prune_layers; // pointer to vector containing layer indices to prune
     } llama_model_quantize_params;

     typedef struct llama_logit_bias {
@@ -491,7 +472,7 @@ extern "C" {
     DEPRECATED(LLAMA_API int32_t llama_n_vocab (const struct llama_vocab * vocab), "use llama_vocab_n_tokens instead");

     LLAMA_API const struct llama_model * llama_get_model   (const struct llama_context * ctx);
-    LLAMA_API struct llama_kv_cache *    llama_get_kv_self (      struct llama_context * ctx);
+    LLAMA_API llama_memory_t             llama_get_memory  (const struct llama_context * ctx);
     LLAMA_API enum llama_pooling_type    llama_pooling_type(const struct llama_context * ctx); // TODO: rename to llama_get_pooling_type

     LLAMA_API const struct llama_vocab * llama_model_get_vocab(const struct llama_model * model);
@@ -502,10 +483,18 @@ extern "C" {
     LLAMA_API int32_t llama_model_n_layer  (const struct llama_model * model);
     LLAMA_API int32_t llama_model_n_head   (const struct llama_model * model);
     LLAMA_API int32_t llama_model_n_head_kv(const struct llama_model * model);
+    LLAMA_API int32_t llama_model_n_swa    (const struct llama_model * model);

     // Get the model's RoPE frequency scaling factor
     LLAMA_API float llama_model_rope_freq_scale_train(const struct llama_model * model);

+    // Returns the number of classifier outputs (only valid for classifier models)
+    // Undefined behavior for non-classifier models
+    LLAMA_API uint32_t llama_model_n_cls_out(const struct llama_model * model);
+
+    // Returns label of classifier output by index (<n_cls_out). Returns nullptr if no label provided
+    LLAMA_API const char * llama_model_cls_label(const struct llama_model * model, uint32_t i);
+
     LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_vocab * vocab);

     LLAMA_API int32_t llama_vocab_n_tokens(const struct llama_vocab * vocab);
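Note: a short sketch of how the two new classifier accessors might be used together, per the comments above (the helper is illustrative; `model` is assumed to point at a classifier model):

    #include <stdio.h>
    #include "llama.h"

    // Enumerate classifier outputs; undefined behavior for non-classifier models.
    static void print_cls_labels(const struct llama_model * model) {
        const uint32_t n_out = llama_model_n_cls_out(model);
        for (uint32_t i = 0; i < n_out; i++) {
            const char * label = llama_model_cls_label(model, i); // may be NULL
            printf("class %u: %s\n", i, label ? label : "(no label)");
        }
    }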
@@ -554,6 +543,9 @@ extern "C" {
     // Returns true if the model is recurrent (like Mamba, RWKV, etc.)
     LLAMA_API bool llama_model_is_recurrent(const struct llama_model * model);

+    // Returns true if the model is diffusion-based (like LLaDA, Dream, etc.)
+    LLAMA_API bool llama_model_is_diffusion(const struct llama_model * model);
+
     // Returns 0 on success
     LLAMA_API uint32_t llama_model_quantize(
             const char * fname_inp,
@@ -569,10 +561,32 @@ extern "C" {
             struct llama_model * model,
             const char * path_lora);

+    // Functions to access the adapter's GGUF metadata scalar values
+    //  - The functions return the length of the string on success, or -1 on failure
+    //  - The output string is always null-terminated and cleared on failure
+    //  - When retrieving a string, an extra byte must be allocated to account for the null terminator
+    //  - GGUF array values are not supported by these functions
+
+    // Get metadata value as a string by key name
+    LLAMA_API int32_t llama_adapter_meta_val_str(const struct llama_adapter_lora * adapter, const char * key, char * buf, size_t buf_size);
+
+    // Get the number of metadata key/value pairs
+    LLAMA_API int32_t llama_adapter_meta_count(const struct llama_adapter_lora * adapter);
+
+    // Get metadata key name by index
+    LLAMA_API int32_t llama_adapter_meta_key_by_index(const struct llama_adapter_lora * adapter, int32_t i, char * buf, size_t buf_size);
+
+    // Get metadata value as a string by index
+    LLAMA_API int32_t llama_adapter_meta_val_str_by_index(const struct llama_adapter_lora * adapter, int32_t i, char * buf, size_t buf_size);
+
     // Manually free a LoRA adapter
     // Note: loaded adapters will be free when the associated model is deleted
     LLAMA_API void llama_adapter_lora_free(struct llama_adapter_lora * adapter);

+    // Get the invocation tokens if the current lora is an alora
+    LLAMA_API uint64_t llama_adapter_get_alora_n_invocation_tokens(const struct llama_adapter_lora * adapter);
+    LLAMA_API const llama_token * llama_adapter_get_alora_invocation_tokens (const struct llama_adapter_lora * adapter);
+
     // The following functions operate on a llama_context, hence the naming: llama_verb_...

     // Add a loaded LoRA adapter to given context
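Note: a minimal sketch of iterating the new adapter metadata accessors declared above (the helper and buffer sizes are illustrative; per the comments, the functions null-terminate and return -1 on failure):

    #include <stdio.h>
    #include "llama.h"

    // Dump all scalar GGUF metadata of a loaded LoRA adapter.
    static void dump_adapter_meta(const struct llama_adapter_lora * adapter) {
        const int32_t n = llama_adapter_meta_count(adapter);
        for (int32_t i = 0; i < n; i++) {
            char key[128], val[512];
            if (llama_adapter_meta_key_by_index    (adapter, i, key, sizeof(key)) < 0) continue;
            if (llama_adapter_meta_val_str_by_index(adapter, i, val, sizeof(val)) < 0) continue;
            printf("%s = %s\n", key, val);
        }
    }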
@@ -606,106 +620,85 @@ extern "C" {
             int32_t   il_end);

     //
-    // KV cache
+    // Memory
     //

-    // Returns the number of tokens in the KV cache (slow, use only for debug)
-    // If a KV cell has multiple sequences assigned to it, it will be counted multiple times
-    DEPRECATED(LLAMA_API int32_t llama_kv_self_n_tokens(const struct llama_context * ctx),
-        "Use llama_kv_self_seq_pos_max() and llama_kv_self_seq_pos_min() instead (https://github.com/ggml-org/llama.cpp/issues/13793)");
-
-    // Returns the number of used KV cells (i.e. have at least one sequence assigned to them)
-    DEPRECATED(LLAMA_API int32_t llama_kv_self_used_cells(const struct llama_context * ctx),
-        "Use llama_kv_self_seq_pos_max() and llama_kv_self_seq_pos_min() instead (https://github.com/ggml-org/llama.cpp/issues/13793)");
-
-    // Clear the KV cache - both cell info is erased and KV data is zeroed
-    LLAMA_API void llama_kv_self_clear(
-            struct llama_context * ctx);
+    // Clear the memory contents
+    // If data == true, the data buffers will also be cleared together with the metadata
+    LLAMA_API void llama_memory_clear(
+            llama_memory_t mem,
+                      bool data);

     // Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
     // Returns false if a partial sequence cannot be removed. Removing a whole sequence never fails
     // seq_id < 0 : match any sequence
     // p0 < 0     : [0,  p1]
     // p1 < 0     : [p0, inf)
-    LLAMA_API bool llama_kv_self_seq_rm(
-            struct llama_context * ctx,
-                    llama_seq_id   seq_id,
-                       llama_pos   p0,
-                       llama_pos   p1);
+    LLAMA_API bool llama_memory_seq_rm(
+            llama_memory_t mem,
+              llama_seq_id seq_id,
+                 llama_pos p0,
+                 llama_pos p1);

     // Copy all tokens that belong to the specified sequence to another sequence
-    // Note that this does not allocate extra KV cache memory - it simply assigns the tokens to the new sequence
     // p0 < 0 : [0,  p1]
     // p1 < 0 : [p0, inf)
-    LLAMA_API void llama_kv_self_seq_cp(
-            struct llama_context * ctx,
-                    llama_seq_id   seq_id_src,
-                    llama_seq_id   seq_id_dst,
-                       llama_pos   p0,
-                       llama_pos   p1);
+    LLAMA_API void llama_memory_seq_cp(
+            llama_memory_t mem,
+              llama_seq_id seq_id_src,
+              llama_seq_id seq_id_dst,
+                 llama_pos p0,
+                 llama_pos p1);

     // Removes all tokens that do not belong to the specified sequence
-    LLAMA_API void llama_kv_self_seq_keep(
-            struct llama_context * ctx,
-                    llama_seq_id   seq_id);
+    LLAMA_API void llama_memory_seq_keep(
+            llama_memory_t mem,
+              llama_seq_id seq_id);

     // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
-    // If the KV cache is RoPEd, the KV data is updated accordingly:
-    //   - lazily on next llama_decode()
-    //   - explicitly with llama_kv_self_update()
     // p0 < 0 : [0,  p1]
     // p1 < 0 : [p0, inf)
-    LLAMA_API void llama_kv_self_seq_add(
-            struct llama_context * ctx,
-                    llama_seq_id   seq_id,
-                       llama_pos   p0,
-                       llama_pos   p1,
-                       llama_pos   delta);
+    LLAMA_API void llama_memory_seq_add(
+            llama_memory_t mem,
+              llama_seq_id seq_id,
+                 llama_pos p0,
+                 llama_pos p1,
+                 llama_pos delta);

     // Integer division of the positions by factor of `d > 1`
-    // If the KV cache is RoPEd, the KV data is updated accordingly:
-    //   - lazily on next llama_decode()
-    //   - explicitly with llama_kv_self_update()
     // p0 < 0 : [0,  p1]
     // p1 < 0 : [p0, inf)
-    LLAMA_API void llama_kv_self_seq_div(
-            struct llama_context * ctx,
-                    llama_seq_id   seq_id,
-                       llama_pos   p0,
-                       llama_pos   p1,
-                             int   d);
-
-    // Returns the smallest position present in the KV cache for the specified sequence
+    LLAMA_API void llama_memory_seq_div(
+            llama_memory_t mem,
+              llama_seq_id seq_id,
+                 llama_pos p0,
+                 llama_pos p1,
+                       int d);
+
+    // Returns the smallest position present in the memory for the specified sequence
     // This is typically non-zero only for SWA caches
+    // Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the memory
     // Return -1 if the sequence is empty
-    LLAMA_API llama_pos llama_kv_self_seq_pos_min(
-            struct llama_context * ctx,
-                    llama_seq_id   seq_id);
+    LLAMA_API llama_pos llama_memory_seq_pos_min(
+            llama_memory_t mem,
+              llama_seq_id seq_id);

-    // Returns the largest position present in the KV cache for the specified sequence
+    // Returns the largest position present in the memory for the specified sequence
+    // Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the memory
     // Return -1 if the sequence is empty
-    LLAMA_API llama_pos llama_kv_self_seq_pos_max(
-            struct llama_context * ctx,
-                    llama_seq_id   seq_id);
-
-    // Defragment the KV cache
-    // This will be applied:
-    //   - lazily on next llama_decode()
-    //   - explicitly with llama_kv_self_update()
-    LLAMA_API void llama_kv_self_defrag(struct llama_context * ctx);
+    LLAMA_API llama_pos llama_memory_seq_pos_max(
+            llama_memory_t mem,
+              llama_seq_id seq_id);

-    // Check if the context supports KV cache shifting
-    LLAMA_API bool llama_kv_self_can_shift(const struct llama_context * ctx);
-
-    // Apply the KV cache updates (such as K-shifts, defragmentation, etc.)
-    LLAMA_API void llama_kv_self_update(struct llama_context * ctx);
+    // Check if the memory supports shifting
+    LLAMA_API bool llama_memory_can_shift(llama_memory_t mem);

     //
     // State / sessions
     //

     // Returns the *actual* size in bytes of the state
-    // (logits, embedding and kv_cache)
+    // (logits, embedding and memory)
     // Only use when saving the state, not when restoring it, otherwise the size may be too small.
     LLAMA_API size_t llama_state_get_size(struct llama_context * ctx);
     LLAMA_API DEPRECATED(size_t llama_get_state_size(struct llama_context * ctx),
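Note: the removed `llama_kv_self_*` calls map directly onto the new `llama_memory_*` API, with the memory handle obtained from the context. A minimal sketch of the new flow, using only functions declared in the hunk above (the helper name is illustrative):

    #include "llama.h"

    // Roll a sequence back so that only positions [0, keep_up_to) remain.
    static void rollback_seq(struct llama_context * ctx, llama_seq_id seq, llama_pos keep_up_to) {
        llama_memory_t mem = llama_get_memory(ctx);
        llama_memory_seq_rm(mem, seq, keep_up_to, -1); // p1 < 0 means [p0, inf)
        // all positions in [pos_min, pos_max] are guaranteed to remain present
        const llama_pos pos_min = llama_memory_seq_pos_min(mem, seq);
        const llama_pos pos_max = llama_memory_seq_pos_max(mem, seq);
        (void) pos_min; (void) pos_max;
    }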
@@ -761,12 +754,12 @@ extern "C" {
                            size_t   n_token_count),
         "use llama_state_save_file instead");

-    // Get the exact size needed to copy the KV cache of a single sequence
+    // Get the exact size needed to copy the state of a single sequence
     LLAMA_API size_t llama_state_seq_get_size(
             struct llama_context * ctx,
                     llama_seq_id   seq_id);

-    // Copy the KV cache of a single sequence into the specified buffer
+    // Copy the state of a single sequence into the specified buffer
     LLAMA_API size_t llama_state_seq_get_data(
             struct llama_context * ctx,
                          uint8_t * dst,
@@ -798,6 +791,29 @@ extern "C" {
                            size_t   n_token_capacity,
                            size_t * n_token_count_out);

+#define LLAMA_STATE_SEQ_FLAGS_SWA_ONLY 1
+
+    typedef uint32_t llama_state_seq_flags;
+
+    LLAMA_API size_t llama_state_seq_get_size_ext(
+            struct llama_context * ctx,
+                    llama_seq_id   seq_id,
+           llama_state_seq_flags   flags);
+
+    LLAMA_API size_t llama_state_seq_get_data_ext(
+            struct llama_context * ctx,
+                         uint8_t * dst,
+                          size_t   size,
+                    llama_seq_id   seq_id,
+           llama_state_seq_flags   flags);
+
+    LLAMA_API size_t llama_state_seq_set_data_ext(
+            struct llama_context * ctx,
+                   const uint8_t * src,
+                          size_t   size,
+                    llama_seq_id   dest_seq_id,
+           llama_state_seq_flags   flags);
+
     //
     // Decoding
     //
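Note: a minimal sketch of snapshotting one sequence with the new `_ext` variants above (the helper is illustrative, and whether the SWA-only flag fits a given use case is an assumption here):

    #include <stdint.h>
    #include <stdlib.h>
    #include "llama.h"

    // Snapshot a single sequence; returns the number of bytes written, 0 on failure.
    static size_t snapshot_seq(struct llama_context * ctx, llama_seq_id seq, uint8_t ** out) {
        const llama_state_seq_flags flags = LLAMA_STATE_SEQ_FLAGS_SWA_ONLY;
        const size_t size = llama_state_seq_get_size_ext(ctx, seq, flags);
        *out = malloc(size);
        if (*out == NULL) {
            return 0;
        }
        return llama_state_seq_get_data_ext(ctx, *out, size, seq, flags);
    }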
@@ -832,21 +848,23 @@ extern "C" {
     // For encode-decoder contexts, processes the batch using the encoder.
     // Can store the encoder output internally for later use by the decoder's cross-attention layers.
     //   0 - success
-    // < 0 - error. the KV cache state is restored to the state before this call
+    // < 0 - error. the memory state is restored to the state before this call
     LLAMA_API int32_t llama_encode(
             struct llama_context * ctx,
               struct llama_batch   batch);

     // Process a batch of tokens.
-    // Requires KV cache.
+    // Requires the context to have a memory.
     // For encode-decoder contexts, processes the batch using the decoder.
     // Positive return values does not mean a fatal error, but rather a warning.
-    // Upon non-zero return values, the KV cache state is restored to the state before this call
+    // Upon fatal-error or abort, the ubatches that managed to be been processed will remain in the memory state of the context
+    // To handle this correctly, query the memory state using llama_memory_seq_pos_min() and llama_memory_seq_pos_max()
+    // Upon other return values, the memory state is restored to the state before this call
     //   0 - success
     //   1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
-    //   2 - aborted
+    //   2 - aborted (processed ubatches will remain in the context's memory)
    //   -1 - invalid input batch
-    // < -1 - error
+    // < -1 - fatal error (processed ubatches will remain in the context's memory)
     LLAMA_API int32_t llama_decode(
             struct llama_context * ctx,
             struct llama_batch   batch);
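Note: a short sketch of acting on the documented `llama_decode()` return codes above (the wrapper is illustrative):

    #include "llama.h"

    static int32_t decode_checked(struct llama_context * ctx, struct llama_batch batch) {
        const int32_t ret = llama_decode(ctx, batch);
        if (ret == 1) {
            // no KV slot found: reduce the batch size or enlarge the context
        } else if (ret == 2 || ret < -1) {
            // aborted or fatal: processed ubatches remain in the context's
            // memory - inspect llama_memory_seq_pos_min()/_pos_max()
        }
        return ret;
    }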
@@ -862,8 +880,8 @@ extern "C" {
     // Get the number of threads used for prompt and batch processing (multiple token).
     LLAMA_API int32_t llama_n_threads_batch(struct llama_context * ctx);

-    // Set whether the model is in embeddings mode or not
-    // If true, embeddings will be returned but logits will not
+    // Set whether the context outputs embeddings or not
+    // TODO: rename to avoid confusion with llama_get_embeddings()
     LLAMA_API void llama_set_embeddings(struct llama_context * ctx, bool embeddings);

     // Set whether to use causal attention or not
@@ -887,6 +905,7 @@ extern "C" {
     // in the order they have appeared in the batch.
     // Rows: number of tokens for which llama_batch.logits[i] != 0
     // Cols: n_vocab
+    // TODO: deprecate in favor of llama_get_logits_ith() (ref: https://github.com/ggml-org/llama.cpp/pull/14853#issuecomment-3113143522)
     LLAMA_API float * llama_get_logits(struct llama_context * ctx);

     // Logits for the ith token. For positive indices, Equivalent to:
@@ -901,6 +920,7 @@ extern "C" {
     // in the order they have appeared in the batch.
     // shape: [n_outputs*n_embd]
     // Otherwise, returns NULL.
+    // TODO: deprecate in favor of llama_get_embeddings_ith() (ref: https://github.com/ggml-org/llama.cpp/pull/14853#issuecomment-3113143522)
     LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);

     // Get the embeddings for the ith token. For positive indices, Equivalent to:
@@ -912,7 +932,7 @@ extern "C" {

     // Get the embeddings for a sequence id
     // Returns NULL if pooling_type is LLAMA_POOLING_TYPE_NONE
-    // when pooling_type == LLAMA_POOLING_TYPE_RANK, returns float[1] with the rank of the sequence
+    // when pooling_type == LLAMA_POOLING_TYPE_RANK, returns float[n_cls_out] with the rank(s) of the sequence
     // otherwise: float[n_embd] (1-dimensional)
     LLAMA_API float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id);

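Note: a sketch tying this change to `llama_model_n_cls_out()` from earlier in the diff; it assumes the context was created with `pooling_type == LLAMA_POOLING_TYPE_RANK`, and the helper is illustrative:

    #include <stdio.h>
    #include "llama.h"

    // Print the per-class scores of a RANK-pooled sequence.
    static void print_ranks(struct llama_context * ctx, const struct llama_model * model, llama_seq_id seq) {
        const float * out = llama_get_embeddings_seq(ctx, seq);
        if (out == NULL) {
            return; // e.g. pooling_type == LLAMA_POOLING_TYPE_NONE
        }
        const uint32_t n = llama_model_n_cls_out(model); // RANK pooling returns n_cls_out values
        for (uint32_t i = 0; i < n; i++) {
            printf("class %u: %f\n", i, out[i]);
        }
    }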
@@ -939,9 +959,11 @@ extern "C" {
     LLAMA_API llama_token llama_vocab_sep (const struct llama_vocab * vocab); // sentence separator
     LLAMA_API llama_token llama_vocab_nl  (const struct llama_vocab * vocab); // next-line
     LLAMA_API llama_token llama_vocab_pad (const struct llama_vocab * vocab); // padding
+    LLAMA_API llama_token llama_vocab_mask(const struct llama_vocab * vocab); // mask

     LLAMA_API bool llama_vocab_get_add_bos(const struct llama_vocab * vocab);
     LLAMA_API bool llama_vocab_get_add_eos(const struct llama_vocab * vocab);
+    LLAMA_API bool llama_vocab_get_add_sep(const struct llama_vocab * vocab);

     LLAMA_API llama_token llama_vocab_fim_pre(const struct llama_vocab * vocab);
     LLAMA_API llama_token llama_vocab_fim_suf(const struct llama_vocab * vocab);
@@ -985,6 +1007,7 @@ extern "C" {
     /// @param tokens The tokens pointer must be large enough to hold the resulting tokens.
     /// @return Returns the number of tokens on success, no more than n_tokens_max
     /// @return Returns a negative number on failure - the number of tokens that would have been returned
+    /// @return Returns INT32_MIN on overflow (e.g., tokenization result size exceeds int32_t limit)
     /// @param add_special Allow to add BOS and EOS tokens if model is configured to do so.
     /// @param parse_special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated
     ///                      as plaintext. Does not insert a leading space.
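Note: a sketch of handling all documented `llama_tokenize()` return values, including the new INT32_MIN overflow case; the `llama_tokenize()` signature is taken from llama.h outside this hunk, and the wrapper is illustrative:

    #include <stdint.h>
    #include <string.h>
    #include "llama.h"

    static int32_t tokenize_checked(const struct llama_vocab * vocab, const char * text,
                                    llama_token * tokens, int32_t n_tokens_max) {
        const int32_t n = llama_tokenize(vocab, text, (int32_t) strlen(text),
                                         tokens, n_tokens_max,
                                         /*add_special=*/true, /*parse_special=*/false);
        if (n == INT32_MIN) {
            return INT32_MIN; // result does not fit in int32_t at all
        }
        if (n < 0) {
            return n;         // buffer too small: -n tokens would have been produced
        }
        return n;             // number of tokens written
    }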
@@ -1137,11 +1160,6 @@ extern "C" {
     LLAMA_API struct llama_sampler * llama_sampler_init_greedy(void);
     LLAMA_API struct llama_sampler * llama_sampler_init_dist  (uint32_t seed);

-    /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
-    /// NOTE: Avoid using on the full vocabulary as the sorting can become slow. For example, apply top-k or top-p sampling first.
-    DEPRECATED(LLAMA_API struct llama_sampler * llama_sampler_init_softmax (void),
-        "will be removed in the future (see https://github.com/ggml-org/llama.cpp/pull/9896#discussion_r1800920915)");
-
     /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
     /// Setting k <= 0 makes this a noop
     LLAMA_API struct llama_sampler * llama_sampler_init_top_k (int32_t k);
@@ -1311,23 +1329,25 @@ extern "C" {
     //
     // Performance utils
     //
-    // NOTE: Used by llama.cpp examples, avoid using in third-party apps. Instead, do your own performance measurements.
+    // NOTE: Used by llama.cpp examples/tools, avoid using in third-party apps. Instead, do your own performance measurements.
     //

     struct llama_perf_context_data {
-        double t_start_ms;
-        double t_load_ms;
-        double t_p_eval_ms;
-        double t_eval_ms;
-
-        int32_t n_p_eval;
-        int32_t n_eval;
+        // ms == milliseconds
+        double t_start_ms;  // absolute start time
+        double t_load_ms;   // time needed for loading the model
+        double t_p_eval_ms; // time needed for processing the prompt
+        double t_eval_ms;   // time needed for generating tokens
+
+        int32_t n_p_eval;   // number of prompt tokens
+        int32_t n_eval;     // number of generated tokens
+        int32_t n_reused;   // number of times a ggml compute graph had been reused
     };

     struct llama_perf_sampler_data {
-        double t_sample_ms;
+        double t_sample_ms; // time needed for sampling in ms

-        int32_t n_sample;
+        int32_t n_sample;   // number of sampled tokens
     };

     LLAMA_API struct llama_perf_context_data llama_perf_context      (const struct llama_context * ctx);
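Note: a sketch of deriving throughput from the newly documented counters (illustrative only; the header's own NOTE discourages relying on these in third-party apps):

    #include <stdio.h>
    #include "llama.h"

    static void print_speed(const struct llama_context * ctx) {
        const struct llama_perf_context_data d = llama_perf_context(ctx);
        if (d.t_eval_ms > 0.0) {
            printf("gen: %.2f tok/s over %d tokens\n", 1e3 * d.n_eval / d.t_eval_ms, d.n_eval);
        }
        printf("graph reuses: %d\n", d.n_reused);
    }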
@@ -1339,6 +1359,9 @@ extern "C" {
     LLAMA_API void llama_perf_sampler_print(const struct llama_sampler * chain);
     LLAMA_API void llama_perf_sampler_reset(      struct llama_sampler * chain);

+    // print a breakdown of per-device memory use via LLAMA_LOG:
+    LLAMA_API void llama_memory_breakdown_print(const struct llama_context * ctx);
+
     //
     // training
     //
@@ -1357,6 +1380,8 @@ extern "C" {

         ggml_opt_get_optimizer_params get_opt_pars; // callback for calculating optimizer parameters
         void * get_opt_pars_ud;                     // userdata for calculating optimizer parameters
+
+        enum ggml_opt_optimizer_type optimizer_type;
     };

     LLAMA_API void llama_opt_init(struct llama_context * lctx, struct llama_model * model, struct llama_opt_params lopt_params);
|