whispercpp 1.3.0 → 1.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +6 -0
- data/LICENSE +1 -1
- data/README.md +216 -424
- data/Rakefile +79 -11
- data/ext/.gitignore +11 -0
- data/ext/dependencies.rb +61 -0
- data/ext/extconf.rb +18 -26
- data/ext/options.rb +221 -0
- data/ext/ruby_whisper.c +159 -0
- data/ext/ruby_whisper.h +27 -2
- data/ext/ruby_whisper_context.c +641 -0
- data/ext/ruby_whisper_error.c +52 -0
- data/ext/ruby_whisper_model.c +232 -0
- data/ext/ruby_whisper_params.c +1301 -0
- data/ext/ruby_whisper_segment.c +143 -0
- data/ext/ruby_whisper_transcribe.cpp +87 -0
- data/ext/ruby_whisper_vad_params.c +288 -0
- data/ext/sources/.dockerignore +3 -0
- data/ext/sources/.github/workflows/bindings-ruby.yml +21 -0
- data/ext/sources/CMakeGraphVizOptions.cmake +8 -0
- data/ext/sources/CMakeLists.txt +251 -0
- data/ext/sources/bindings/javascript/CMakeLists.txt +41 -0
- data/ext/sources/bindings/javascript/emscripten.cpp +93 -0
- data/ext/sources/bindings/javascript/libwhisper.worker.js +1 -0
- data/ext/sources/bindings/javascript/package-tmpl.json +26 -0
- data/ext/sources/bindings/javascript/package.json +26 -0
- data/ext/sources/bindings/javascript/whisper.js +19 -0
- data/ext/sources/build-xcframework.sh +547 -0
- data/ext/sources/ci/run.sh +336 -0
- data/ext/sources/close-issue.yml +28 -0
- data/ext/sources/cmake/DefaultTargetOptions.cmake +16 -0
- data/ext/sources/cmake/FindFFmpeg.cmake +163 -0
- data/ext/sources/cmake/build-info.cmake +60 -0
- data/ext/sources/cmake/git-vars.cmake +22 -0
- data/ext/sources/cmake/whisper-config.cmake.in +65 -0
- data/ext/sources/cmake/whisper.pc.in +10 -0
- data/ext/sources/examples/CMakeLists.txt +124 -0
- data/ext/sources/examples/addon.node/CMakeLists.txt +31 -0
- data/ext/sources/examples/addon.node/__test__/whisper.spec.js +37 -0
- data/ext/sources/examples/addon.node/addon.cpp +438 -0
- data/ext/sources/examples/addon.node/index.js +54 -0
- data/ext/sources/examples/addon.node/package.json +16 -0
- data/ext/sources/examples/bench/CMakeLists.txt +8 -0
- data/ext/sources/examples/bench/bench.cpp +175 -0
- data/ext/sources/examples/bench.wasm/CMakeLists.txt +49 -0
- data/ext/sources/examples/bench.wasm/emscripten.cpp +87 -0
- data/ext/sources/examples/bench.wasm/index-tmpl.html +284 -0
- data/ext/sources/examples/cli/CMakeLists.txt +8 -0
- data/ext/sources/examples/cli/cli.cpp +1294 -0
- data/ext/sources/examples/coi-serviceworker.js +146 -0
- data/ext/sources/examples/command/CMakeLists.txt +10 -0
- data/ext/sources/examples/command/command.cpp +776 -0
- data/ext/sources/examples/command/commands.txt +9 -0
- data/ext/sources/examples/command.wasm/CMakeLists.txt +50 -0
- data/ext/sources/examples/command.wasm/emscripten.cpp +327 -0
- data/ext/sources/examples/command.wasm/index-tmpl.html +414 -0
- data/ext/sources/examples/common-ggml.cpp +238 -0
- data/ext/sources/examples/common-ggml.h +18 -0
- data/ext/sources/examples/common-sdl.cpp +227 -0
- data/ext/sources/examples/common-sdl.h +49 -0
- data/ext/sources/examples/common-whisper.cpp +168 -0
- data/ext/sources/examples/common-whisper.h +24 -0
- data/ext/sources/examples/common.cpp +675 -0
- data/ext/sources/examples/common.h +322 -0
- data/ext/sources/examples/deprecation-warning/CMakeLists.txt +6 -0
- data/ext/sources/examples/deprecation-warning/deprecation-warning.cpp +38 -0
- data/ext/sources/examples/ffmpeg-transcode.cpp +368 -0
- data/ext/sources/examples/generate-karaoke.sh +57 -0
- data/ext/sources/examples/grammar-parser.cpp +423 -0
- data/ext/sources/examples/grammar-parser.h +29 -0
- data/ext/sources/examples/helpers.js +191 -0
- data/ext/sources/examples/json.hpp +24596 -0
- data/ext/sources/examples/livestream.sh +112 -0
- data/ext/sources/examples/lsp/CMakeLists.txt +9 -0
- data/ext/sources/examples/lsp/lsp.cpp +467 -0
- data/ext/sources/examples/lsp/whisper.vim +362 -0
- data/ext/sources/examples/miniaudio.h +93468 -0
- data/ext/sources/examples/python/test_whisper_processor.py +7 -0
- data/ext/sources/examples/python/whisper_processor.py +54 -0
- data/ext/sources/examples/quantize/CMakeLists.txt +6 -0
- data/ext/sources/examples/quantize/quantize.cpp +223 -0
- data/ext/sources/examples/server/CMakeLists.txt +12 -0
- data/ext/sources/examples/server/bench.js +29 -0
- data/ext/sources/examples/server/httplib.h +10497 -0
- data/ext/sources/examples/server/server.cpp +1091 -0
- data/ext/sources/examples/server.py +115 -0
- data/ext/sources/examples/stb_vorbis.c +5584 -0
- data/ext/sources/examples/stream/CMakeLists.txt +10 -0
- data/ext/sources/examples/stream/stream.cpp +429 -0
- data/ext/sources/examples/stream.wasm/CMakeLists.txt +49 -0
- data/ext/sources/examples/stream.wasm/emscripten.cpp +216 -0
- data/ext/sources/examples/stream.wasm/index-tmpl.html +414 -0
- data/ext/sources/examples/sycl/CMakeLists.txt +9 -0
- data/ext/sources/examples/sycl/build.sh +22 -0
- data/ext/sources/examples/sycl/ls-sycl-device.cpp +11 -0
- data/ext/sources/examples/sycl/run-whisper.sh +17 -0
- data/ext/sources/examples/talk-llama/CMakeLists.txt +40 -0
- data/ext/sources/examples/talk-llama/eleven-labs.py +80 -0
- data/ext/sources/examples/talk-llama/llama-adapter.cpp +388 -0
- data/ext/sources/examples/talk-llama/llama-adapter.h +76 -0
- data/ext/sources/examples/talk-llama/llama-arch.cpp +1746 -0
- data/ext/sources/examples/talk-llama/llama-arch.h +437 -0
- data/ext/sources/examples/talk-llama/llama-batch.cpp +374 -0
- data/ext/sources/examples/talk-llama/llama-batch.h +89 -0
- data/ext/sources/examples/talk-llama/llama-chat.cpp +663 -0
- data/ext/sources/examples/talk-llama/llama-chat.h +58 -0
- data/ext/sources/examples/talk-llama/llama-context.cpp +2676 -0
- data/ext/sources/examples/talk-llama/llama-context.h +276 -0
- data/ext/sources/examples/talk-llama/llama-cparams.cpp +5 -0
- data/ext/sources/examples/talk-llama/llama-cparams.h +41 -0
- data/ext/sources/examples/talk-llama/llama-grammar.cpp +1229 -0
- data/ext/sources/examples/talk-llama/llama-grammar.h +173 -0
- data/ext/sources/examples/talk-llama/llama-graph.cpp +1618 -0
- data/ext/sources/examples/talk-llama/llama-graph.h +640 -0
- data/ext/sources/examples/talk-llama/llama-hparams.cpp +95 -0
- data/ext/sources/examples/talk-llama/llama-hparams.h +190 -0
- data/ext/sources/examples/talk-llama/llama-impl.cpp +167 -0
- data/ext/sources/examples/talk-llama/llama-impl.h +61 -0
- data/ext/sources/examples/talk-llama/llama-io.cpp +15 -0
- data/ext/sources/examples/talk-llama/llama-io.h +35 -0
- data/ext/sources/examples/talk-llama/llama-kv-cache.cpp +2739 -0
- data/ext/sources/examples/talk-llama/llama-kv-cache.h +502 -0
- data/ext/sources/examples/talk-llama/llama-kv-cells.h +379 -0
- data/ext/sources/examples/talk-llama/llama-memory.cpp +1 -0
- data/ext/sources/examples/talk-llama/llama-memory.h +32 -0
- data/ext/sources/examples/talk-llama/llama-mmap.cpp +600 -0
- data/ext/sources/examples/talk-llama/llama-mmap.h +68 -0
- data/ext/sources/examples/talk-llama/llama-model-loader.cpp +1138 -0
- data/ext/sources/examples/talk-llama/llama-model-loader.h +169 -0
- data/ext/sources/examples/talk-llama/llama-model-saver.cpp +281 -0
- data/ext/sources/examples/talk-llama/llama-model-saver.h +37 -0
- data/ext/sources/examples/talk-llama/llama-model.cpp +13814 -0
- data/ext/sources/examples/talk-llama/llama-model.h +425 -0
- data/ext/sources/examples/talk-llama/llama-quant.cpp +966 -0
- data/ext/sources/examples/talk-llama/llama-quant.h +1 -0
- data/ext/sources/examples/talk-llama/llama-sampling.cpp +2575 -0
- data/ext/sources/examples/talk-llama/llama-sampling.h +32 -0
- data/ext/sources/examples/talk-llama/llama-vocab.cpp +3340 -0
- data/ext/sources/examples/talk-llama/llama-vocab.h +131 -0
- data/ext/sources/examples/talk-llama/llama.cpp +354 -0
- data/ext/sources/examples/talk-llama/llama.h +1377 -0
- data/ext/sources/examples/talk-llama/prompts/talk-alpaca.txt +23 -0
- data/ext/sources/examples/talk-llama/speak +40 -0
- data/ext/sources/examples/talk-llama/speak.bat +1 -0
- data/ext/sources/examples/talk-llama/speak.ps1 +14 -0
- data/ext/sources/examples/talk-llama/talk-llama.cpp +808 -0
- data/ext/sources/examples/talk-llama/unicode-data.cpp +7034 -0
- data/ext/sources/examples/talk-llama/unicode-data.h +20 -0
- data/ext/sources/examples/talk-llama/unicode.cpp +849 -0
- data/ext/sources/examples/talk-llama/unicode.h +66 -0
- data/ext/sources/examples/vad-speech-segments/CMakeLists.txt +8 -0
- data/ext/sources/examples/vad-speech-segments/speech.cpp +143 -0
- data/ext/sources/examples/wchess/CMakeLists.txt +10 -0
- data/ext/sources/examples/wchess/libwchess/CMakeLists.txt +19 -0
- data/ext/sources/examples/wchess/libwchess/Chessboard.cpp +803 -0
- data/ext/sources/examples/wchess/libwchess/Chessboard.h +33 -0
- data/ext/sources/examples/wchess/libwchess/WChess.cpp +193 -0
- data/ext/sources/examples/wchess/libwchess/WChess.h +63 -0
- data/ext/sources/examples/wchess/libwchess/test-chessboard.cpp +117 -0
- data/ext/sources/examples/wchess/wchess.cmd/CMakeLists.txt +8 -0
- data/ext/sources/examples/wchess/wchess.cmd/wchess.cmd.cpp +249 -0
- data/ext/sources/examples/whisper.wasm/CMakeLists.txt +50 -0
- data/ext/sources/examples/whisper.wasm/emscripten.cpp +118 -0
- data/ext/sources/examples/whisper.wasm/index-tmpl.html +658 -0
- data/ext/sources/ggml/CMakeLists.txt +390 -0
- data/ext/sources/ggml/cmake/BuildTypes.cmake +54 -0
- data/ext/sources/ggml/cmake/GitVars.cmake +22 -0
- data/ext/sources/ggml/cmake/common.cmake +26 -0
- data/ext/sources/ggml/cmake/ggml-config.cmake.in +152 -0
- data/ext/sources/ggml/include/ggml-alloc.h +76 -0
- data/ext/sources/ggml/include/ggml-backend.h +354 -0
- data/ext/sources/ggml/include/ggml-blas.h +25 -0
- data/ext/sources/ggml/include/ggml-cann.h +123 -0
- data/ext/sources/ggml/include/ggml-cpp.h +39 -0
- data/ext/sources/ggml/include/ggml-cpu.h +143 -0
- data/ext/sources/ggml/include/ggml-cuda.h +47 -0
- data/ext/sources/ggml/include/ggml-kompute.h +50 -0
- data/ext/sources/ggml/include/ggml-metal.h +66 -0
- data/ext/sources/ggml/include/ggml-opencl.h +26 -0
- data/ext/sources/ggml/include/ggml-opt.h +237 -0
- data/ext/sources/ggml/include/ggml-rpc.h +33 -0
- data/ext/sources/ggml/include/ggml-sycl.h +49 -0
- data/ext/sources/ggml/include/ggml-vulkan.h +29 -0
- data/ext/{ggml.h → sources/ggml/include/ggml.h} +621 -821
- data/ext/sources/ggml/include/gguf.h +202 -0
- data/ext/sources/ggml/src/CMakeLists.txt +346 -0
- data/ext/sources/ggml/src/ggml-alloc.c +1042 -0
- data/ext/sources/ggml/src/ggml-amx/CMakeLists.txt +107 -0
- data/ext/sources/ggml/src/ggml-amx/common.h +94 -0
- data/ext/sources/ggml/src/ggml-amx/ggml-amx.cpp +446 -0
- data/ext/sources/ggml/src/ggml-amx/mmq.cpp +2510 -0
- data/ext/sources/ggml/src/ggml-amx/mmq.h +17 -0
- data/ext/sources/ggml/src/ggml-backend-impl.h +255 -0
- data/ext/sources/ggml/src/ggml-backend-reg.cpp +586 -0
- data/ext/sources/ggml/src/ggml-backend.cpp +2011 -0
- data/ext/sources/ggml/src/ggml-blas/CMakeLists.txt +87 -0
- data/ext/sources/ggml/src/ggml-blas/ggml-blas.cpp +517 -0
- data/ext/sources/ggml/src/ggml-cann/CMakeLists.txt +74 -0
- data/ext/sources/ggml/src/ggml-cann/Doxyfile +2579 -0
- data/ext/sources/ggml/src/ggml-cann/acl_tensor.cpp +181 -0
- data/ext/sources/ggml/src/ggml-cann/acl_tensor.h +258 -0
- data/ext/sources/ggml/src/ggml-cann/aclnn_ops.cpp +3193 -0
- data/ext/sources/ggml/src/ggml-cann/aclnn_ops.h +1125 -0
- data/ext/sources/ggml/src/ggml-cann/common.h +420 -0
- data/ext/sources/ggml/src/ggml-cann/ggml-cann.cpp +2606 -0
- data/ext/sources/ggml/src/ggml-cann/kernels/CMakeLists.txt +30 -0
- data/ext/sources/ggml/src/ggml-cann/kernels/ascendc_kernels.h +19 -0
- data/ext/sources/ggml/src/ggml-cann/kernels/dup.cpp +234 -0
- data/ext/sources/ggml/src/ggml-cann/kernels/get_row_f16.cpp +197 -0
- data/ext/sources/ggml/src/ggml-cann/kernels/get_row_f32.cpp +190 -0
- data/ext/sources/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +204 -0
- data/ext/sources/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +191 -0
- data/ext/sources/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +218 -0
- data/ext/sources/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +216 -0
- data/ext/sources/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +295 -0
- data/ext/sources/ggml/src/ggml-common.h +1857 -0
- data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +504 -0
- data/ext/sources/ggml/src/ggml-cpu/amx/amx.cpp +221 -0
- data/ext/sources/ggml/src/ggml-cpu/amx/amx.h +8 -0
- data/ext/sources/ggml/src/ggml-cpu/amx/common.h +91 -0
- data/ext/sources/ggml/src/ggml-cpu/amx/mmq.cpp +2511 -0
- data/ext/sources/ggml/src/ggml-cpu/amx/mmq.h +10 -0
- data/ext/sources/ggml/src/ggml-cpu/binary-ops.cpp +158 -0
- data/ext/sources/ggml/src/ggml-cpu/binary-ops.h +16 -0
- data/ext/sources/ggml/src/ggml-cpu/cmake/FindSIMD.cmake +100 -0
- data/ext/sources/ggml/src/ggml-cpu/common.h +72 -0
- data/ext/sources/ggml/src/ggml-cpu/cpu-feats-x86.cpp +327 -0
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +6431 -0
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +8 -0
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-hbm.cpp +55 -0
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-hbm.h +8 -0
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-impl.h +508 -0
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-quants.c +13747 -0
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-traits.cpp +36 -0
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-traits.h +38 -0
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +3510 -0
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.cpp +671 -0
- data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.cpp +337 -0
- data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.h +95 -0
- data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +482 -0
- data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.h +17 -0
- data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +3544 -0
- data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.h +14 -0
- data/ext/sources/ggml/src/ggml-cpu/ops.cpp +8903 -0
- data/ext/sources/ggml/src/ggml-cpu/ops.h +110 -0
- data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +892 -0
- data/ext/sources/ggml/src/ggml-cpu/unary-ops.cpp +186 -0
- data/ext/sources/ggml/src/ggml-cpu/unary-ops.h +28 -0
- data/ext/sources/ggml/src/ggml-cpu/vec.cpp +252 -0
- data/ext/sources/ggml/src/ggml-cpu/vec.h +818 -0
- data/ext/sources/ggml/src/ggml-cuda/CMakeLists.txt +184 -0
- data/ext/sources/ggml/src/ggml-cuda/acc.cu +61 -0
- data/ext/sources/ggml/src/ggml-cuda/acc.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/arange.cu +34 -0
- data/ext/sources/ggml/src/ggml-cuda/arange.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/argmax.cu +91 -0
- data/ext/sources/ggml/src/ggml-cuda/argmax.cuh +3 -0
- data/ext/sources/ggml/src/ggml-cuda/argsort.cu +104 -0
- data/ext/sources/ggml/src/ggml-cuda/argsort.cuh +3 -0
- data/ext/sources/ggml/src/ggml-cuda/binbcast.cu +363 -0
- data/ext/sources/ggml/src/ggml-cuda/binbcast.cuh +9 -0
- data/ext/sources/ggml/src/ggml-cuda/clamp.cu +45 -0
- data/ext/sources/ggml/src/ggml-cuda/clamp.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/common.cuh +828 -0
- data/ext/sources/ggml/src/ggml-cuda/concat.cu +221 -0
- data/ext/sources/ggml/src/ggml-cuda/concat.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/conv-transpose-1d.cu +89 -0
- data/ext/sources/ggml/src/ggml-cuda/conv-transpose-1d.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/convert.cu +730 -0
- data/ext/sources/ggml/src/ggml-cuda/convert.cuh +26 -0
- data/ext/sources/ggml/src/ggml-cuda/count-equal.cu +64 -0
- data/ext/sources/ggml/src/ggml-cuda/count-equal.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/cp-async.cuh +57 -0
- data/ext/sources/ggml/src/ggml-cuda/cpy.cu +705 -0
- data/ext/sources/ggml/src/ggml-cuda/cpy.cuh +11 -0
- data/ext/sources/ggml/src/ggml-cuda/cross-entropy-loss.cu +189 -0
- data/ext/sources/ggml/src/ggml-cuda/cross-entropy-loss.cuh +7 -0
- data/ext/sources/ggml/src/ggml-cuda/dequantize.cuh +103 -0
- data/ext/sources/ggml/src/ggml-cuda/diagmask.cu +40 -0
- data/ext/sources/ggml/src/ggml-cuda/diagmask.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +881 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +1471 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f16.cu +357 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f16.cuh +3 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f32.cu +365 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f32.cuh +3 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn-vec-f16.cuh +482 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn-vec-f32.cuh +472 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +634 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cuh +3 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn.cu +346 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn.cuh +3 -0
- data/ext/sources/ggml/src/ggml-cuda/getrows.cu +275 -0
- data/ext/sources/ggml/src/ggml-cuda/getrows.cuh +15 -0
- data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +3505 -0
- data/ext/sources/ggml/src/ggml-cuda/gla.cu +93 -0
- data/ext/sources/ggml/src/ggml-cuda/gla.cuh +3 -0
- data/ext/sources/ggml/src/ggml-cuda/im2col.cu +103 -0
- data/ext/sources/ggml/src/ggml-cuda/im2col.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/mma.cuh +396 -0
- data/ext/sources/ggml/src/ggml-cuda/mmq.cu +324 -0
- data/ext/sources/ggml/src/ggml-cuda/mmq.cuh +3217 -0
- data/ext/sources/ggml/src/ggml-cuda/mmv.cu +336 -0
- data/ext/sources/ggml/src/ggml-cuda/mmv.cuh +12 -0
- data/ext/sources/ggml/src/ggml-cuda/mmvq.cu +595 -0
- data/ext/sources/ggml/src/ggml-cuda/mmvq.cuh +12 -0
- data/ext/sources/ggml/src/ggml-cuda/norm.cu +458 -0
- data/ext/sources/ggml/src/ggml-cuda/norm.cuh +11 -0
- data/ext/sources/ggml/src/ggml-cuda/opt-step-adamw.cu +78 -0
- data/ext/sources/ggml/src/ggml-cuda/opt-step-adamw.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/out-prod.cu +68 -0
- data/ext/sources/ggml/src/ggml-cuda/out-prod.cuh +3 -0
- data/ext/sources/ggml/src/ggml-cuda/pad.cu +49 -0
- data/ext/sources/ggml/src/ggml-cuda/pad.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/pool2d.cu +94 -0
- data/ext/sources/ggml/src/ggml-cuda/pool2d.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/quantize.cu +190 -0
- data/ext/sources/ggml/src/ggml-cuda/quantize.cuh +27 -0
- data/ext/sources/ggml/src/ggml-cuda/rope.cu +456 -0
- data/ext/sources/ggml/src/ggml-cuda/rope.cuh +7 -0
- data/ext/sources/ggml/src/ggml-cuda/scale.cu +31 -0
- data/ext/sources/ggml/src/ggml-cuda/scale.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/softmax.cu +283 -0
- data/ext/sources/ggml/src/ggml-cuda/softmax.cuh +7 -0
- data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cu +148 -0
- data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cuh +3 -0
- data/ext/sources/ggml/src/ggml-cuda/ssm-scan.cu +153 -0
- data/ext/sources/ggml/src/ggml-cuda/ssm-scan.cuh +3 -0
- data/ext/sources/ggml/src/ggml-cuda/sum.cu +45 -0
- data/ext/sources/ggml/src/ggml-cuda/sum.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/sumrows.cu +39 -0
- data/ext/sources/ggml/src/ggml-cuda/sumrows.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_16.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_8.cu +10 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_1.cu +10 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_2.cu +10 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_4.cu +10 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_16.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_4.cu +10 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_8.cu +10 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_32-ncols2_1.cu +10 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_32-ncols2_2.cu +10 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_16.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_2.cu +10 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_4.cu +10 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_8.cu +10 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_64-ncols2_1.cu +10 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_1.cu +10 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_2.cu +10 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_4.cu +10 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_8.cu +10 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +78 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-iq1_s.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_s.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_xs.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_xxs.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-iq3_s.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-iq3_xxs.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-iq4_nl.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-iq4_xs.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q2_k.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q3_k.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q4_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q4_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q4_k.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q5_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q5_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q5_k.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q6_k.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q8_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/tsembd.cu +47 -0
- data/ext/sources/ggml/src/ggml-cuda/tsembd.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/unary.cu +289 -0
- data/ext/sources/ggml/src/ggml-cuda/unary.cuh +59 -0
- data/ext/sources/ggml/src/ggml-cuda/upscale.cu +51 -0
- data/ext/sources/ggml/src/ggml-cuda/upscale.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/vecdotq.cuh +1135 -0
- data/ext/sources/ggml/src/ggml-cuda/vendors/cuda.h +15 -0
- data/ext/sources/ggml/src/ggml-cuda/vendors/hip.h +243 -0
- data/ext/sources/ggml/src/ggml-cuda/vendors/musa.h +140 -0
- data/ext/sources/ggml/src/ggml-cuda/wkv.cu +199 -0
- data/ext/sources/ggml/src/ggml-cuda/wkv.cuh +7 -0
- data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +131 -0
- data/ext/sources/ggml/src/ggml-impl.h +601 -0
- data/ext/sources/ggml/src/ggml-kompute/CMakeLists.txt +166 -0
- data/ext/sources/ggml/src/ggml-kompute/ggml-kompute.cpp +2251 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/common.comp +112 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_add.comp +58 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_addrow.comp +25 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f16.comp +52 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f32.comp +52 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f16.comp +52 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f32.comp +52 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_diagmask.comp +30 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_gelu.comp +22 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows.comp +17 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f16.comp +31 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f32.comp +31 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_0.comp +38 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_1.comp +39 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q6_k.comp +44 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul.comp +52 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_f16.comp +69 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_mat_f32.comp +51 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_0.comp +33 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_1.comp +35 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_k.comp +140 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q6_k.comp +106 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q8_0.comp +73 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n.comp +52 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n_pre.comp +28 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_norm.comp +84 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_relu.comp +21 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rmsnorm.comp +53 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f16.comp +52 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f32.comp +52 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f16.comp +52 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f32.comp +52 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_scale.comp +19 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_scale_8.comp +23 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_silu.comp +22 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_softmax.comp +72 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/rope_common.comp +71 -0
- data/ext/sources/ggml/src/ggml-metal/CMakeLists.txt +120 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +622 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal.m +5998 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal.metal +7089 -0
- data/ext/sources/ggml/src/ggml-musa/CMakeLists.txt +113 -0
- data/ext/sources/ggml/src/ggml-musa/mudnn.cu +112 -0
- data/ext/sources/ggml/src/ggml-musa/mudnn.cuh +12 -0
- data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +96 -0
- data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +5124 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/add.cl +83 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/clamp.cl +20 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/cpy.cl +184 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/cvt.cl +118 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/diag_mask_inf.cl +58 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/embed_kernel.py +26 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gelu.cl +62 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle.cl +268 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_general.cl +274 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/get_rows.cl +163 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/im2col_f16.cl +57 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/im2col_f32.cl +57 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul.cl +79 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mat_Ab_Bi_8x4.cl +139 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_f16_f16.cl +118 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32.cl +118 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32_1row.cl +94 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32_l4.cl +84 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_f32_f32.cl +118 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32.cl +192 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_1d_16x_flat.cl +307 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_1d_8x_flat.cl +265 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_8x_flat.cl +272 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_v.cl +254 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q6_k.cl +190 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/norm.cl +81 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/relu.cl +16 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/rms_norm.cl +96 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/rope.cl +721 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/scale.cl +16 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/silu.cl +30 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_4_f16.cl +87 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_4_f32.cl +87 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_f16.cl +86 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_f32.cl +86 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/transpose.cl +84 -0
- data/ext/sources/ggml/src/ggml-opt.cpp +1037 -0
- data/ext/sources/ggml/src/ggml-quants.c +5232 -0
- data/ext/sources/ggml/src/ggml-quants.h +100 -0
- data/ext/sources/ggml/src/ggml-rpc/CMakeLists.txt +9 -0
- data/ext/sources/ggml/src/ggml-rpc/ggml-rpc.cpp +1813 -0
- data/ext/sources/ggml/src/ggml-sycl/CMakeLists.txt +189 -0
- data/ext/sources/ggml/src/ggml-sycl/backend.hpp +37 -0
- data/ext/sources/ggml/src/ggml-sycl/binbcast.cpp +345 -0
- data/ext/sources/ggml/src/ggml-sycl/binbcast.hpp +39 -0
- data/ext/sources/ggml/src/ggml-sycl/common.cpp +83 -0
- data/ext/sources/ggml/src/ggml-sycl/common.hpp +589 -0
- data/ext/sources/ggml/src/ggml-sycl/concat.cpp +195 -0
- data/ext/sources/ggml/src/ggml-sycl/concat.hpp +20 -0
- data/ext/sources/ggml/src/ggml-sycl/conv.cpp +101 -0
- data/ext/sources/ggml/src/ggml-sycl/conv.hpp +20 -0
- data/ext/sources/ggml/src/ggml-sycl/convert.cpp +623 -0
- data/ext/sources/ggml/src/ggml-sycl/convert.hpp +34 -0
- data/ext/sources/ggml/src/ggml-sycl/cpy.cpp +700 -0
- data/ext/sources/ggml/src/ggml-sycl/cpy.hpp +11 -0
- data/ext/sources/ggml/src/ggml-sycl/dequantize.hpp +791 -0
- data/ext/sources/ggml/src/ggml-sycl/dmmv.cpp +1162 -0
- data/ext/sources/ggml/src/ggml-sycl/dmmv.hpp +27 -0
- data/ext/sources/ggml/src/ggml-sycl/dpct/helper.hpp +2957 -0
- data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +1511 -0
- data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +75 -0
- data/ext/sources/ggml/src/ggml-sycl/gemm.hpp +99 -0
- data/ext/sources/ggml/src/ggml-sycl/getrows.cpp +309 -0
- data/ext/sources/ggml/src/ggml-sycl/getrows.hpp +20 -0
- data/ext/sources/ggml/src/ggml-sycl/ggml-sycl.cpp +4493 -0
- data/ext/sources/ggml/src/ggml-sycl/gla.cpp +106 -0
- data/ext/sources/ggml/src/ggml-sycl/gla.hpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/im2col.cpp +136 -0
- data/ext/sources/ggml/src/ggml-sycl/im2col.hpp +21 -0
- data/ext/sources/ggml/src/ggml-sycl/mmq.cpp +3030 -0
- data/ext/sources/ggml/src/ggml-sycl/mmq.hpp +33 -0
- data/ext/sources/ggml/src/ggml-sycl/mmvq.cpp +1110 -0
- data/ext/sources/ggml/src/ggml-sycl/mmvq.hpp +27 -0
- data/ext/sources/ggml/src/ggml-sycl/norm.cpp +501 -0
- data/ext/sources/ggml/src/ggml-sycl/norm.hpp +26 -0
- data/ext/sources/ggml/src/ggml-sycl/outprod.cpp +47 -0
- data/ext/sources/ggml/src/ggml-sycl/outprod.hpp +10 -0
- data/ext/sources/ggml/src/ggml-sycl/presets.hpp +74 -0
- data/ext/sources/ggml/src/ggml-sycl/quants.hpp +83 -0
- data/ext/sources/ggml/src/ggml-sycl/rope.cpp +361 -0
- data/ext/sources/ggml/src/ggml-sycl/rope.hpp +20 -0
- data/ext/sources/ggml/src/ggml-sycl/softmax.cpp +261 -0
- data/ext/sources/ggml/src/ggml-sycl/softmax.hpp +20 -0
- data/ext/sources/ggml/src/ggml-sycl/sycl_hw.cpp +13 -0
- data/ext/sources/ggml/src/ggml-sycl/sycl_hw.hpp +23 -0
- data/ext/sources/ggml/src/ggml-sycl/tsembd.cpp +72 -0
- data/ext/sources/ggml/src/ggml-sycl/tsembd.hpp +20 -0
- data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +1215 -0
- data/ext/sources/ggml/src/ggml-sycl/wkv.cpp +293 -0
- data/ext/sources/ggml/src/ggml-sycl/wkv.hpp +10 -0
- data/ext/sources/ggml/src/ggml-threading.cpp +12 -0
- data/ext/sources/ggml/src/ggml-threading.h +14 -0
- data/ext/sources/ggml/src/ggml-vulkan/CMakeLists.txt +196 -0
- data/ext/sources/ggml/src/ggml-vulkan/cmake/host-toolchain.cmake.in +15 -0
- data/ext/sources/ggml/src/ggml-vulkan/ggml-vulkan.cpp +10700 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +39 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/acc.comp +29 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add.comp +29 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argmax.comp +51 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp +69 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/clamp.comp +17 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/concat.comp +41 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/contig_copy.comp +49 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_dw.comp +105 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy.comp +23 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +51 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +242 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/cos.comp +17 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/count_equal.comp +31 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_f32.comp +20 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.comp +462 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.comp +699 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_head.comp +13 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_m.comp +42 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_s.comp +35 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_s.comp +44 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xs.comp +43 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xxs.comp +48 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_s.comp +39 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_xxs.comp +49 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_nl.comp +32 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_xs.comp +34 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q2_k.comp +34 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q3_k.comp +42 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_0.comp +30 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_1.comp +32 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_k.comp +68 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_0.comp +34 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_1.comp +35 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_k.comp +70 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q6_k.comp +33 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q8_0.comp +31 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/diag_mask_inf.comp +34 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/div.comp +27 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +337 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp +162 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +360 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +267 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +59 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gelu.comp +25 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gelu_quick.comp +23 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/generic_binary_head.comp +64 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/generic_head.comp +9 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/generic_unary_head.comp +76 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/get_rows.comp +33 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/get_rows_quant.comp +41 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/group_norm.comp +66 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +100 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/l2_norm.comp +41 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/leaky_relu.comp +22 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul.comp +27 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_split_k_reduce.comp +48 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp +169 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.comp +118 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_m.comp +82 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_s.comp +79 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_s.comp +90 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xs.comp +87 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xxs.comp +87 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_s.comp +90 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_xxs.comp +88 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_nc.comp +118 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_p021.comp +154 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp +130 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q3_k.comp +132 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp +136 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp +167 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp +130 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +868 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +441 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +442 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.comp +99 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/norm.comp +44 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_adamw.comp +42 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/pad.comp +28 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/pool2d.comp +74 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/quantize_q8_1.comp +77 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/relu.comp +21 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/repeat.comp +26 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/repeat_back.comp +37 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +52 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_back.comp +55 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.comp +58 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +60 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +43 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +43 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_vision.comp +47 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp +24 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sigmoid.comp +20 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/silu.comp +22 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/silu_back.comp +26 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sin.comp +17 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +173 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_back.comp +50 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/square.comp +17 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sub.comp +29 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.comp +37 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/tanh.comp +20 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/test_bfloat16_support.comp +7 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/test_coopmat2_support.comp +7 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/test_coopmat_support.comp +7 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/test_integer_dot_support.comp +7 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/timestep_embedding.comp +41 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/types.comp +1373 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +36 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +751 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/wkv6.comp +87 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/wkv7.comp +91 -0
- data/ext/sources/ggml/src/ggml.c +6550 -0
- data/ext/sources/ggml/src/gguf.cpp +1330 -0
- data/ext/{whisper.h → sources/include/whisper.h} +91 -24
- data/ext/sources/src/CMakeLists.txt +143 -0
- data/ext/sources/src/coreml/whisper-decoder-impl.h +158 -0
- data/ext/sources/src/coreml/whisper-decoder-impl.m +226 -0
- data/ext/sources/src/coreml/whisper-encoder-impl.h +154 -0
- data/ext/sources/src/coreml/whisper-encoder-impl.m +222 -0
- data/ext/sources/src/coreml/whisper-encoder.h +26 -0
- data/ext/sources/src/coreml/whisper-encoder.mm +73 -0
- data/ext/sources/src/openvino/whisper-openvino-encoder.cpp +108 -0
- data/ext/sources/src/openvino/whisper-openvino-encoder.h +31 -0
- data/ext/sources/src/whisper-arch.h +197 -0
- data/ext/{whisper.cpp → sources/src/whisper.cpp} +2535 -835
- data/ext/sources/tests/CMakeLists.txt +105 -0
- data/ext/sources/tests/earnings21/eval.mk +58 -0
- data/ext/sources/tests/earnings21/eval.py +68 -0
- data/ext/sources/tests/earnings21/normalizers/__init__.py +2 -0
- data/ext/sources/tests/earnings21/normalizers/basic.py +80 -0
- data/ext/sources/tests/earnings21/normalizers/english.json +1741 -0
- data/ext/sources/tests/earnings21/normalizers/english.py +550 -0
- data/ext/sources/tests/earnings21/requirements.txt +6 -0
- data/ext/sources/tests/en-0-ref.txt +1 -0
- data/ext/sources/tests/en-1-ref.txt +1 -0
- data/ext/sources/tests/en-2-ref.txt +1 -0
- data/ext/sources/tests/es-0-ref.txt +1 -0
- data/ext/sources/tests/librispeech/eval.mk +39 -0
- data/ext/sources/tests/librispeech/eval.py +47 -0
- data/ext/sources/tests/librispeech/normalizers/__init__.py +2 -0
- data/ext/sources/tests/librispeech/normalizers/basic.py +80 -0
- data/ext/sources/tests/librispeech/normalizers/english.json +1741 -0
- data/ext/sources/tests/librispeech/normalizers/english.py +550 -0
- data/ext/sources/tests/librispeech/requirements.txt +6 -0
- data/ext/sources/tests/run-tests.sh +130 -0
- data/ext/sources/tests/test-c.c +3 -0
- data/ext/sources/tests/test-vad-full.cpp +54 -0
- data/ext/sources/tests/test-vad.cpp +83 -0
- data/ext/sources/tests/test-whisper.js +58 -0
- data/extsources.rb +34 -0
- data/lib/whisper/model/uri.rb +178 -0
- data/sig/whisper.rbs +480 -0
- data/tests/helper.rb +35 -0
- data/tests/jfk_reader/.gitignore +5 -0
- data/tests/jfk_reader/extconf.rb +3 -0
- data/tests/jfk_reader/jfk_reader.c +68 -0
- data/tests/test_callback.rb +202 -0
- data/tests/test_error.rb +20 -0
- data/tests/test_model.rb +109 -0
- data/tests/test_package.rb +46 -0
- data/tests/test_params.rb +297 -0
- data/tests/test_segment.rb +74 -0
- data/tests/test_vad.rb +19 -0
- data/tests/test_vad_params.rb +103 -0
- data/tests/test_whisper.rb +212 -124
- data/whispercpp.gemspec +37 -0
- metadata +794 -13
- data/ext/dr_wav.h +0 -6434
- data/ext/ggml.c +0 -21755
- data/ext/ruby_whisper.cpp +0 -426
@@ -0,0 +1,721 @@
|
|
1
|
+
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
2
|
+
|
3
|
+
//------------------------------------------------------------------------------
|
4
|
+
// kernel_rope
|
5
|
+
//------------------------------------------------------------------------------
|
6
|
+
float rope_yarn_ramp(float low, float high, int i0) {
|
7
|
+
const float y = (i0 / 2 - low) / max(0.001f, high - low);
|
8
|
+
return 1.0f - min(1.0f, max(0.0f, y));
|
9
|
+
}
|
10
|
+
|
11
|
+
// YaRN algorithm based on LlamaYaRNScaledRotaryEmbedding.py from https://github.com/jquesnelle/yarn
|
12
|
+
// MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng.
|
13
|
+
float2 rope_yarn(
|
14
|
+
float theta_extrap, float freq_scale, float2 corr_dims, int i0, float ext_factor, float mscale
|
15
|
+
) {
|
16
|
+
// Get n-d rotational scaling corrected for extrapolation
|
17
|
+
float theta_interp = freq_scale * theta_extrap;
|
18
|
+
float theta = theta_interp;
|
19
|
+
if (ext_factor != 0.0f) {
|
20
|
+
float ramp_mix = rope_yarn_ramp(corr_dims.s0, corr_dims.s1, i0) * ext_factor;
|
21
|
+
theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix;
|
22
|
+
|
23
|
+
// Get n-d magnitude scaling corrected for interpolation
|
24
|
+
mscale *= 1.0f + 0.1f * log(1.0f / freq_scale);
|
25
|
+
}
|
26
|
+
return (float2)(cos(theta) * mscale, sin(theta) * mscale);
|
27
|
+
}
|
28
|
+
|
29
|
+
// Apparently solving `n_rot = 2pi * x * base^((2 * max_pos_emb) / n_dims)` for x, we get
|
30
|
+
// `corr_fac(n_rot) = n_dims * log(max_pos_emb / (n_rot * 2pi)) / (2 * log(base))`
|
31
|
+
float rope_yarn_corr_factor(int n_dims, int n_ctx_orig, float n_rot, float base) {
|
32
|
+
return n_dims * log(n_ctx_orig / (n_rot * 2 * M_PI_F)) / (2 * log(base));
|
33
|
+
}
|
34
|
+
|
35
|
+
float2 rope_yarn_corr_dims(
|
36
|
+
int n_dims, int n_ctx_orig, float freq_base, float beta_fast, float beta_slow
|
37
|
+
) {
|
38
|
+
// start and end correction dims
|
39
|
+
return (float2)(
|
40
|
+
max(0.0f, floor(rope_yarn_corr_factor(n_dims, n_ctx_orig, beta_fast, freq_base))),
|
41
|
+
min(n_dims - 1.0f, ceil(rope_yarn_corr_factor(n_dims, n_ctx_orig, beta_slow, freq_base)))
|
42
|
+
);
|
43
|
+
}
|
44
|
+
|
45
|
+
kernel void kernel_rope_norm_f32(
|
46
|
+
global void * src0,
|
47
|
+
ulong offset0,
|
48
|
+
global int * src1,
|
49
|
+
ulong offset1,
|
50
|
+
global float * src2,
|
51
|
+
ulong offset2,
|
52
|
+
global float * dst,
|
53
|
+
ulong offsetd,
|
54
|
+
int ne00,
|
55
|
+
int ne01,
|
56
|
+
int ne02,
|
57
|
+
int ne03,
|
58
|
+
ulong nb00,
|
59
|
+
ulong nb01,
|
60
|
+
ulong nb02,
|
61
|
+
ulong nb03,
|
62
|
+
int ne0,
|
63
|
+
int ne1,
|
64
|
+
int ne2,
|
65
|
+
int ne3,
|
66
|
+
ulong nb0,
|
67
|
+
ulong nb1,
|
68
|
+
ulong nb2,
|
69
|
+
ulong nb3,
|
70
|
+
int n_past,
|
71
|
+
int n_dims,
|
72
|
+
int n_ctx_orig,
|
73
|
+
float freq_base,
|
74
|
+
float freq_scale,
|
75
|
+
float ext_factor,
|
76
|
+
float attn_factor,
|
77
|
+
float beta_fast,
|
78
|
+
float beta_slow
|
79
|
+
) {
|
80
|
+
src0 = (global void*)((global char*)src0 + offset0);
|
81
|
+
src1 = (global int*)((global char*)src1 + offset1);
|
82
|
+
src2 = (global float*)((global char*)src2 + offset2);
|
83
|
+
dst = (global float*)((global char*)dst + offsetd);
|
84
|
+
|
85
|
+
int i3 = get_group_id(2);
|
86
|
+
int i2 = get_group_id(1);
|
87
|
+
int i1 = get_group_id(0);
|
88
|
+
|
89
|
+
float2 corr_dims = rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow);
|
90
|
+
|
91
|
+
global int * pos = src1;
|
92
|
+
|
93
|
+
float theta_base = (float) pos[i2];
|
94
|
+
float inv_ndims = -1.f/n_dims;
|
95
|
+
|
96
|
+
for (int i0 = 2*get_local_id(0); i0 < ne0; i0 += 2*get_local_size(0)) {
|
97
|
+
if (i0 < n_dims) {
|
98
|
+
int ic = i0/2;
|
99
|
+
|
100
|
+
float theta = theta_base * pow(freq_base, inv_ndims*i0);
|
101
|
+
|
102
|
+
float freq_factor = src2 != src0 ? src2[ic] : 1.0f;
|
103
|
+
|
104
|
+
float2 cos_sin_theta = rope_yarn(theta/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor);
|
105
|
+
|
106
|
+
global float * src = (global float *)((global char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
|
107
|
+
global float * dst_data = (global float *)((global char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
108
|
+
|
109
|
+
float x0 = src[0];
|
110
|
+
float x1 = src[1];
|
111
|
+
|
112
|
+
dst_data[0] = x0*cos_sin_theta.s0 - x1*cos_sin_theta.s1;
|
113
|
+
dst_data[1] = x0*cos_sin_theta.s1 + x1*cos_sin_theta.s0;
|
114
|
+
} else {
|
115
|
+
global float * src = (global float *)((global char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
|
116
|
+
global float * dst_data = (global float *)((global char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
117
|
+
|
118
|
+
dst_data[0] = src[0];
|
119
|
+
dst_data[1] = src[1];
|
120
|
+
}
|
121
|
+
}
|
122
|
+
}
|
123
|
+
|
124
|
+
kernel void kernel_rope_norm_f16(
|
125
|
+
global void * src0,
|
126
|
+
ulong offset0,
|
127
|
+
global int * src1,
|
128
|
+
ulong offset1,
|
129
|
+
global float * src2,
|
130
|
+
ulong offset2,
|
131
|
+
global float * dst,
|
132
|
+
ulong offsetd,
|
133
|
+
int ne00,
|
134
|
+
int ne01,
|
135
|
+
int ne02,
|
136
|
+
int ne03,
|
137
|
+
ulong nb00,
|
138
|
+
ulong nb01,
|
139
|
+
ulong nb02,
|
140
|
+
ulong nb03,
|
141
|
+
int ne0,
|
142
|
+
int ne1,
|
143
|
+
int ne2,
|
144
|
+
int ne3,
|
145
|
+
ulong nb0,
|
146
|
+
ulong nb1,
|
147
|
+
ulong nb2,
|
148
|
+
ulong nb3,
|
149
|
+
int n_past,
|
150
|
+
int n_dims,
|
151
|
+
int n_ctx_orig,
|
152
|
+
float freq_base,
|
153
|
+
float freq_scale,
|
154
|
+
float ext_factor,
|
155
|
+
float attn_factor,
|
156
|
+
float beta_fast,
|
157
|
+
float beta_slow
|
158
|
+
) {
|
159
|
+
src0 = (global void*)((global char*)src0 + offset0);
|
160
|
+
src1 = (global int*)((global char*)src1 + offset1);
|
161
|
+
src2 = (global float*)((global char*)src2 + offset2);
|
162
|
+
dst = (global float*)((global char*)dst + offsetd);
|
163
|
+
|
164
|
+
int i3 = get_group_id(2);
|
165
|
+
int i2 = get_group_id(1);
|
166
|
+
int i1 = get_group_id(0);
|
167
|
+
|
168
|
+
float2 corr_dims = rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow);
|
169
|
+
|
170
|
+
global int * pos = src1;
|
171
|
+
|
172
|
+
float theta_base = (float) pos[i2];
|
173
|
+
float inv_ndims = -1.f/n_dims;
|
174
|
+
|
175
|
+
for (int i0 = 2*get_local_id(0); i0 < ne0; i0 += 2*get_local_size(0)) {
|
176
|
+
if (i0 < n_dims) {
|
177
|
+
int ic = i0/2;
|
178
|
+
|
179
|
+
float theta = theta_base * pow(freq_base, inv_ndims*i0);
|
180
|
+
|
181
|
+
float freq_factor = src2 != src0 ? src2[ic] : 1.0f;
|
182
|
+
|
183
|
+
float2 cos_sin_theta = rope_yarn(theta/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor);
|
184
|
+
|
185
|
+
global half * src = (global half *)((global char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
|
186
|
+
global half * dst_data = (global half *)((global char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
187
|
+
|
188
|
+
float x0 = src[0];
|
189
|
+
float x1 = src[1];
|
190
|
+
|
191
|
+
dst_data[0] = x0*cos_sin_theta.s0 - x1*cos_sin_theta.s1;
|
192
|
+
dst_data[1] = x0*cos_sin_theta.s1 + x1*cos_sin_theta.s0;
|
193
|
+
} else {
|
194
|
+
global half * src = (global half *)((global char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
|
195
|
+
global half * dst_data = (global half *)((global char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
196
|
+
|
197
|
+
dst_data[0] = src[0];
|
198
|
+
dst_data[1] = src[1];
|
199
|
+
}
|
200
|
+
}
|
201
|
+
}
|
202
|
+
|
203
|
+
kernel void kernel_rope_neox_f32(
|
204
|
+
global void * src0,
|
205
|
+
ulong offset0,
|
206
|
+
global int * src1,
|
207
|
+
ulong offset1,
|
208
|
+
global float * src2,
|
209
|
+
ulong offset2,
|
210
|
+
global float * dst,
|
211
|
+
ulong offsetd,
|
212
|
+
int ne00,
|
213
|
+
int ne01,
|
214
|
+
int ne02,
|
215
|
+
int ne03,
|
216
|
+
ulong nb00,
|
217
|
+
ulong nb01,
|
218
|
+
ulong nb02,
|
219
|
+
ulong nb03,
|
220
|
+
int ne0,
|
221
|
+
int ne1,
|
222
|
+
int ne2,
|
223
|
+
int ne3,
|
224
|
+
ulong nb0,
|
225
|
+
ulong nb1,
|
226
|
+
ulong nb2,
|
227
|
+
ulong nb3,
|
228
|
+
int n_past,
|
229
|
+
int n_dims,
|
230
|
+
int n_ctx_orig,
|
231
|
+
float freq_base,
|
232
|
+
float freq_scale,
|
233
|
+
float ext_factor,
|
234
|
+
float attn_factor,
|
235
|
+
float beta_fast,
|
236
|
+
float beta_slow
|
237
|
+
) {
|
238
|
+
src0 = (global void*)((global char*)src0 + offset0);
|
239
|
+
src1 = (global int*)((global char*)src1 + offset1);
|
240
|
+
src2 = (global float*)((global char*)src2 + offset2);
|
241
|
+
dst = (global float*)((global char*)dst + offsetd);
|
242
|
+
|
243
|
+
int i3 = get_group_id(2);
|
244
|
+
int i2 = get_group_id(1);
|
245
|
+
int i1 = get_group_id(0);
|
246
|
+
|
247
|
+
float2 corr_dims = rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow);
|
248
|
+
|
249
|
+
global int * pos = src1;
|
250
|
+
|
251
|
+
float theta_base = (float) pos[i2];
|
252
|
+
float inv_ndims = -1.f/n_dims;
|
253
|
+
|
254
|
+
for (int i0 = 2*get_local_id(0); i0 < ne0; i0 += 2*get_local_size(0)) {
|
255
|
+
if (i0 < n_dims) {
|
256
|
+
int ic = i0/2;
|
257
|
+
|
258
|
+
const float theta = theta_base * pow(freq_base, inv_ndims*i0);
|
259
|
+
|
260
|
+
const float freq_factor = src2 != src0 ? src2[ic] : 1.0f;
|
261
|
+
|
262
|
+
float2 cos_sin_theta = rope_yarn(theta/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor);
|
263
|
+
|
264
|
+
global float * src = (global float *)((global char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00);
|
265
|
+
global float * dst_data = (global float *)((global char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0);
|
266
|
+
|
267
|
+
const float x0 = src[0];
|
268
|
+
const float x1 = src[n_dims/2];
|
269
|
+
|
270
|
+
dst_data[0] = x0*cos_sin_theta.s0 - x1*cos_sin_theta.s1;
|
271
|
+
dst_data[n_dims/2] = x0*cos_sin_theta.s1 + x1*cos_sin_theta.s0;
|
272
|
+
} else {
|
273
|
+
global float * const src = (global float *)((global char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
|
274
|
+
global float * dst_data = (global float *)((global char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
275
|
+
|
276
|
+
dst_data[0] = src[0];
|
277
|
+
dst_data[1] = src[1];
|
278
|
+
}
|
279
|
+
}
|
280
|
+
}
|
281
|
+
|
282
|
+
kernel void kernel_rope_neox_f16(
|
283
|
+
global void * src0,
|
284
|
+
ulong offset0,
|
285
|
+
global int * src1,
|
286
|
+
ulong offset1,
|
287
|
+
global float * src2,
|
288
|
+
ulong offset2,
|
289
|
+
global float * dst,
|
290
|
+
ulong offsetd,
|
291
|
+
int ne00,
|
292
|
+
int ne01,
|
293
|
+
int ne02,
|
294
|
+
int ne03,
|
295
|
+
ulong nb00,
|
296
|
+
ulong nb01,
|
297
|
+
ulong nb02,
|
298
|
+
ulong nb03,
|
299
|
+
int ne0,
|
300
|
+
int ne1,
|
301
|
+
int ne2,
|
302
|
+
int ne3,
|
303
|
+
ulong nb0,
|
304
|
+
ulong nb1,
|
305
|
+
ulong nb2,
|
306
|
+
ulong nb3,
|
307
|
+
int n_past,
|
308
|
+
int n_dims,
|
309
|
+
int n_ctx_orig,
|
310
|
+
float freq_base,
|
311
|
+
float freq_scale,
|
312
|
+
float ext_factor,
|
313
|
+
float attn_factor,
|
314
|
+
float beta_fast,
|
315
|
+
float beta_slow
|
316
|
+
) {
|
317
|
+
src0 = (global void*)((global char*)src0 + offset0);
|
318
|
+
src1 = (global int*)((global char*)src1 + offset1);
|
319
|
+
src2 = (global float*)((global char*)src2 + offset2);
|
320
|
+
dst = (global float*)((global char*)dst + offsetd);
|
321
|
+
|
322
|
+
int i3 = get_group_id(2);
|
323
|
+
int i2 = get_group_id(1);
|
324
|
+
int i1 = get_group_id(0);
|
325
|
+
|
326
|
+
float2 corr_dims = rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow);
|
327
|
+
|
328
|
+
global int * pos = src1;
|
329
|
+
|
330
|
+
float theta_base = (float) pos[i2];
|
331
|
+
float inv_ndims = -1.f/n_dims;
|
332
|
+
|
333
|
+
for (int i0 = 2*get_local_id(0); i0 < ne0; i0 += 2*get_local_size(0)) {
|
334
|
+
if (i0 < n_dims) {
|
335
|
+
int ic = i0/2;
|
336
|
+
|
337
|
+
const float theta = theta_base * pow(freq_base, inv_ndims*i0);
|
338
|
+
|
339
|
+
const float freq_factor = src2 != src0 ? src2[ic] : 1.0f;
|
340
|
+
|
341
|
+
float2 cos_sin_theta = rope_yarn(theta/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor);
|
342
|
+
|
343
|
+
global half * src = (global half *)((global char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00);
|
344
|
+
global half * dst_data = (global half *)((global char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0);
|
345
|
+
|
346
|
+
const float x0 = src[0];
|
347
|
+
const float x1 = src[n_dims/2];
|
348
|
+
|
349
|
+
dst_data[0] = x0*cos_sin_theta.s0 - x1*cos_sin_theta.s1;
|
350
|
+
dst_data[n_dims/2] = x0*cos_sin_theta.s1 + x1*cos_sin_theta.s0;
|
351
|
+
} else {
|
352
|
+
global half * const src = (global half *)((global char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
|
353
|
+
global half * dst_data = (global half *)((global char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
354
|
+
|
355
|
+
dst_data[0] = src[0];
|
356
|
+
dst_data[1] = src[1];
|
357
|
+
}
|
358
|
+
}
|
359
|
+
}
|
360
|
+
|
361
|
+
kernel void kernel_rope_multi_f32(
|
362
|
+
global void * src0,
|
363
|
+
ulong offset0,
|
364
|
+
global int * src1,
|
365
|
+
ulong offset1,
|
366
|
+
global float * src2,
|
367
|
+
ulong offset2,
|
368
|
+
global float * dst,
|
369
|
+
ulong offsetd,
|
370
|
+
int ne00,
|
371
|
+
int ne01,
|
372
|
+
int ne02,
|
373
|
+
int ne03,
|
374
|
+
ulong nb00,
|
375
|
+
ulong nb01,
|
376
|
+
ulong nb02,
|
377
|
+
ulong nb03,
|
378
|
+
int ne0,
|
379
|
+
int ne1,
|
380
|
+
int ne2,
|
381
|
+
int ne3,
|
382
|
+
ulong nb0,
|
383
|
+
ulong nb1,
|
384
|
+
ulong nb2,
|
385
|
+
ulong nb3,
|
386
|
+
int n_past,
|
387
|
+
int n_dims,
|
388
|
+
int n_ctx_orig,
|
389
|
+
float freq_base,
|
390
|
+
float freq_scale,
|
391
|
+
float ext_factor,
|
392
|
+
float attn_factor,
|
393
|
+
float beta_fast,
|
394
|
+
float beta_slow,
|
395
|
+
int4 sections
|
396
|
+
) {
|
397
|
+
src0 = (global void*)((global char*)src0 + offset0);
|
398
|
+
src1 = (global int*)((global char*)src1 + offset1);
|
399
|
+
src2 = (global float*)((global char*)src2 + offset2);
|
400
|
+
dst = (global float*)((global char*)dst + offsetd);
|
401
|
+
|
402
|
+
int i3 = get_group_id(2);
|
403
|
+
int i2 = get_group_id(1);
|
404
|
+
int i1 = get_group_id(0);
|
405
|
+
|
406
|
+
float2 corr_dims = rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow);
|
407
|
+
|
408
|
+
global int * pos = src1;
|
409
|
+
|
410
|
+
const int sect_dims = sections.s0 + sections.s1 + sections.s2 + sections.s3;
|
411
|
+
const int sec_w = sections.s1 + sections.s0;
|
412
|
+
|
413
|
+
float inv_ndims = -1.f/n_dims;
|
414
|
+
|
415
|
+
for (int i0 = 2*get_local_id(0); i0 < ne0; i0 += 2*get_local_size(0)) {
|
416
|
+
if (i0 < n_dims) {
|
417
|
+
int ic = i0/2;
|
418
|
+
|
419
|
+
const int sector = (i0 / 2) % sect_dims;
|
420
|
+
float theta_base = 0.0f;
|
421
|
+
|
422
|
+
if (sector < sections.s0) {
|
423
|
+
theta_base = pos[i2];
|
424
|
+
}
|
425
|
+
else if (sector >= sections.s0 && sector < sec_w) {
|
426
|
+
theta_base = pos[i2 + ne2 * 1];
|
427
|
+
}
|
428
|
+
else if (sector >= sec_w && sector < sec_w + sections.s2) {
|
429
|
+
theta_base = pos[i2 + ne2 * 2];
|
430
|
+
}
|
431
|
+
else if (sector >= sec_w + sections.s2) {
|
432
|
+
theta_base = pos[i2 + ne2 * 3];
|
433
|
+
}
|
434
|
+
|
435
|
+
const float theta = theta_base * pow(freq_base, inv_ndims*i0);
|
436
|
+
|
437
|
+
const float freq_factor = src2 != src0 ? src2[ic] : 1.0f;
|
438
|
+
|
439
|
+
float2 cos_sin_theta = rope_yarn(theta/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor);
|
440
|
+
|
441
|
+
global float * src = (global float *)((global char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00);
|
442
|
+
global float * dst_data = (global float *)((global char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0);
|
443
|
+
|
444
|
+
const float x0 = src[0];
|
445
|
+
const float x1 = src[n_dims/2];
|
446
|
+
|
447
|
+
dst_data[0] = x0*cos_sin_theta.s0 - x1*cos_sin_theta.s1;
|
448
|
+
dst_data[n_dims/2] = x0*cos_sin_theta.s1 + x1*cos_sin_theta.s0;
|
449
|
+
} else {
|
450
|
+
global float * const src = (global float *)((global char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
|
451
|
+
global float * dst_data = (global float *)((global char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
452
|
+
|
453
|
+
dst_data[0] = src[0];
|
454
|
+
dst_data[1] = src[1];
|
455
|
+
}
|
456
|
+
}
|
457
|
+
}
|
458
|
+
|
459
|
+
kernel void kernel_rope_multi_f16(
|
460
|
+
global void * src0,
|
461
|
+
ulong offset0,
|
462
|
+
global int * src1,
|
463
|
+
ulong offset1,
|
464
|
+
global float * src2,
|
465
|
+
ulong offset2,
|
466
|
+
global half * dst,
|
467
|
+
ulong offsetd,
|
468
|
+
int ne00,
|
469
|
+
int ne01,
|
470
|
+
int ne02,
|
471
|
+
int ne03,
|
472
|
+
ulong nb00,
|
473
|
+
ulong nb01,
|
474
|
+
ulong nb02,
|
475
|
+
ulong nb03,
|
476
|
+
int ne0,
|
477
|
+
int ne1,
|
478
|
+
int ne2,
|
479
|
+
int ne3,
|
480
|
+
ulong nb0,
|
481
|
+
ulong nb1,
|
482
|
+
ulong nb2,
|
483
|
+
ulong nb3,
|
484
|
+
int n_past,
|
485
|
+
int n_dims,
|
486
|
+
int n_ctx_orig,
|
487
|
+
float freq_base,
|
488
|
+
float freq_scale,
|
489
|
+
float ext_factor,
|
490
|
+
float attn_factor,
|
491
|
+
float beta_fast,
|
492
|
+
float beta_slow,
|
493
|
+
int4 sections
|
494
|
+
) {
|
495
|
+
src0 = (global void*)((global char*)src0 + offset0);
|
496
|
+
src1 = (global int*)((global char*)src1 + offset1);
|
497
|
+
src2 = (global float*)((global char*)src2 + offset2);
|
498
|
+
dst = (global float*)((global char*)dst + offsetd);
|
499
|
+
|
500
|
+
int i3 = get_group_id(2);
|
501
|
+
int i2 = get_group_id(1);
|
502
|
+
int i1 = get_group_id(0);
|
503
|
+
|
504
|
+
float2 corr_dims = rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow);
|
505
|
+
|
506
|
+
global int * pos = src1;
|
507
|
+
|
508
|
+
const int sect_dims = sections.s0 + sections.s1 + sections.s2 + sections.s3;
|
509
|
+
const int sec_w = sections.s1 + sections.s0;
|
510
|
+
|
511
|
+
float inv_ndims = -1.f/n_dims;
|
512
|
+
|
513
|
+
for (int i0 = 2*get_local_id(0); i0 < ne0; i0 += 2*get_local_size(0)) {
|
514
|
+
if (i0 < n_dims) {
|
515
|
+
int ic = i0/2;
|
516
|
+
|
517
|
+
const int sector = (i0 / 2) % sect_dims;
|
518
|
+
float theta_base = 0.0f;
|
519
|
+
|
520
|
+
if (sector < sections.s0) {
|
521
|
+
theta_base = pos[i2];
|
522
|
+
}
|
523
|
+
else if (sector >= sections.s0 && sector < sec_w) {
|
524
|
+
theta_base = pos[i2 + ne2 * 1];
|
525
|
+
}
|
526
|
+
else if (sector >= sec_w && sector < sec_w + sections.s2) {
|
527
|
+
theta_base = pos[i2 + ne2 * 2];
|
528
|
+
}
|
529
|
+
else if (sector >= sec_w + sections.s2) {
|
530
|
+
theta_base = pos[i2 + ne2 * 3];
|
531
|
+
}
|
532
|
+
|
533
|
+
const float theta = theta_base * pow(freq_base, inv_ndims*i0);
|
534
|
+
|
535
|
+
const float freq_factor = src2 != src0 ? src2[ic] : 1.0f;
|
536
|
+
|
537
|
+
float2 cos_sin_theta = rope_yarn(theta/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor);
|
538
|
+
|
539
|
+
global half * src = (global half *)((global char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00);
|
540
|
+
global half * dst_data = (global half *)((global char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0);
|
541
|
+
|
542
|
+
const float x0 = src[0];
|
543
|
+
const float x1 = src[n_dims/2];
|
544
|
+
|
545
|
+
dst_data[0] = x0*cos_sin_theta.s0 - x1*cos_sin_theta.s1;
|
546
|
+
dst_data[n_dims/2] = x0*cos_sin_theta.s1 + x1*cos_sin_theta.s0;
|
547
|
+
} else {
|
548
|
+
global half * const src = (global half *)((global char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
|
549
|
+
global half * dst_data = (global half *)((global char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
550
|
+
|
551
|
+
dst_data[0] = src[0];
|
552
|
+
dst_data[1] = src[1];
|
553
|
+
}
|
554
|
+
}
|
555
|
+
}
|
556
|
+
|
557
|
+
kernel void kernel_rope_vision_f32(
|
558
|
+
global void * src0,
|
559
|
+
ulong offset0,
|
560
|
+
global int * src1,
|
561
|
+
ulong offset1,
|
562
|
+
global float * src2,
|
563
|
+
ulong offset2,
|
564
|
+
global float * dst,
|
565
|
+
ulong offsetd,
|
566
|
+
int ne00,
|
567
|
+
int ne01,
|
568
|
+
int ne02,
|
569
|
+
int ne03,
|
570
|
+
ulong nb00,
|
571
|
+
ulong nb01,
|
572
|
+
ulong nb02,
|
573
|
+
ulong nb03,
|
574
|
+
int ne0,
|
575
|
+
int ne1,
|
576
|
+
int ne2,
|
577
|
+
int ne3,
|
578
|
+
ulong nb0,
|
579
|
+
ulong nb1,
|
580
|
+
ulong nb2,
|
581
|
+
ulong nb3,
|
582
|
+
int n_past,
|
583
|
+
int n_dims,
|
584
|
+
int n_ctx_orig,
|
585
|
+
float freq_base,
|
586
|
+
float freq_scale,
|
587
|
+
float ext_factor,
|
588
|
+
float attn_factor,
|
589
|
+
float beta_fast,
|
590
|
+
float beta_slow,
|
591
|
+
int4 sections
|
592
|
+
) {
|
593
|
+
src0 = (global void*)((global char*)src0 + offset0);
|
594
|
+
src1 = (global int*)((global char*)src1 + offset1);
|
595
|
+
src2 = (global float*)((global char*)src2 + offset2);
|
596
|
+
dst = (global float*)((global char*)dst + offsetd);
|
597
|
+
|
598
|
+
int i3 = get_group_id(2);
|
599
|
+
int i2 = get_group_id(1);
|
600
|
+
int i1 = get_group_id(0);
|
601
|
+
|
602
|
+
float2 corr_dims = rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow);
|
603
|
+
|
604
|
+
global int * pos = src1;
|
605
|
+
|
606
|
+
const int sect_dims = sections.s0 + sections.s1;
|
607
|
+
const int sec_w = sections.s1 + sections.s0;
|
608
|
+
|
609
|
+
float inv_ndims = -1.f/n_dims;
|
610
|
+
|
611
|
+
for (int i0 = 2*get_local_id(0); i0 < ne0; i0 += 2*get_local_size(0)) {
|
612
|
+
int ic = i0/2;
|
613
|
+
|
614
|
+
const int sector = (i0/2) % sect_dims;
|
615
|
+
float theta_base = 0.0f;
|
616
|
+
|
617
|
+
if (sector < sections.s0) {
|
618
|
+
const int p = sector;
|
619
|
+
theta_base = pos[i2] * pow(freq_base, inv_ndims*2.0f*p);
|
620
|
+
} else if (sector >= sections.s0 && sector < sec_w) {
|
621
|
+
const int p = sector - sections.s0;
|
622
|
+
theta_base = pos[i2 + ne2] * pow(freq_base, inv_ndims*2.0f*p);
|
623
|
+
}
|
624
|
+
|
625
|
+
const float freq_factor = src2 != src0 ? src2[ic] : 1.0f;
|
626
|
+
|
627
|
+
float2 cos_sin_theta = rope_yarn(theta_base/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor);
|
628
|
+
|
629
|
+
global float * src = (global float *)((global char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00);
|
630
|
+
global float * dst_data = (global float *)((global char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0);
|
631
|
+
|
632
|
+
const float x0 = src[0];
|
633
|
+
const float x1 = src[n_dims];
|
634
|
+
|
635
|
+
dst_data[0] = x0*cos_sin_theta.s0 - x1*cos_sin_theta.s1;
|
636
|
+
dst_data[n_dims] = x0*cos_sin_theta.s1 + x1*cos_sin_theta.s0;
|
637
|
+
}
|
638
|
+
}
|
639
|
+
|
640
|
+
kernel void kernel_rope_vision_f16(
|
641
|
+
global void * src0,
|
642
|
+
ulong offset0,
|
643
|
+
global int * src1,
|
644
|
+
ulong offset1,
|
645
|
+
global float * src2,
|
646
|
+
ulong offset2,
|
647
|
+
global half * dst,
|
648
|
+
ulong offsetd,
|
649
|
+
int ne00,
|
650
|
+
int ne01,
|
651
|
+
int ne02,
|
652
|
+
int ne03,
|
653
|
+
ulong nb00,
|
654
|
+
ulong nb01,
|
655
|
+
ulong nb02,
|
656
|
+
ulong nb03,
|
657
|
+
int ne0,
|
658
|
+
int ne1,
|
659
|
+
int ne2,
|
660
|
+
int ne3,
|
661
|
+
ulong nb0,
|
662
|
+
ulong nb1,
|
663
|
+
ulong nb2,
|
664
|
+
ulong nb3,
|
665
|
+
int n_past,
|
666
|
+
int n_dims,
|
667
|
+
int n_ctx_orig,
|
668
|
+
float freq_base,
|
669
|
+
float freq_scale,
|
670
|
+
float ext_factor,
|
671
|
+
float attn_factor,
|
672
|
+
float beta_fast,
|
673
|
+
float beta_slow,
|
674
|
+
int4 sections
|
675
|
+
) {
|
676
|
+
src0 = (global void*)((global char*)src0 + offset0);
|
677
|
+
src1 = (global int*)((global char*)src1 + offset1);
|
678
|
+
src2 = (global float*)((global char*)src2 + offset2);
|
679
|
+
dst = (global float*)((global char*)dst + offsetd);
|
680
|
+
|
681
|
+
int i3 = get_group_id(2);
|
682
|
+
int i2 = get_group_id(1);
|
683
|
+
int i1 = get_group_id(0);
|
684
|
+
|
685
|
+
float2 corr_dims = rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow);
|
686
|
+
|
687
|
+
global int * pos = src1;
|
688
|
+
|
689
|
+
const int sect_dims = sections.s0 + sections.s1;
|
690
|
+
const int sec_w = sections.s1 + sections.s0;
|
691
|
+
|
692
|
+
float inv_ndims = -1.f/n_dims;
|
693
|
+
|
694
|
+
for (int i0 = 2*get_local_id(0); i0 < ne0; i0 += 2*get_local_size(0)) {
|
695
|
+
int ic = i0/2;
|
696
|
+
|
697
|
+
const int sector = (i0/2) % sect_dims;
|
698
|
+
float theta_base = 0.0f;
|
699
|
+
|
700
|
+
if (sector < sections.s0) {
|
701
|
+
const int p = sector;
|
702
|
+
theta_base = pos[i2] * pow(freq_base, inv_ndims*2.0f*p);
|
703
|
+
} else if (sector >= sections.s0 && sector < sec_w) {
|
704
|
+
const int p = sector - sections.s0;
|
705
|
+
theta_base = pos[i2 + ne2] * pow(freq_base, inv_ndims*2.0f*p);
|
706
|
+
}
|
707
|
+
|
708
|
+
const float freq_factor = src2 != src0 ? src2[ic] : 1.0f;
|
709
|
+
|
710
|
+
float2 cos_sin_theta = rope_yarn(theta_base/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor);
|
711
|
+
|
712
|
+
global half * src = (global half *)((global char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00);
|
713
|
+
global half * dst_data = (global half *)((global char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0);
|
714
|
+
|
715
|
+
const float x0 = src[0];
|
716
|
+
const float x1 = src[n_dims];
|
717
|
+
|
718
|
+
dst_data[0] = x0*cos_sin_theta.s0 - x1*cos_sin_theta.s1;
|
719
|
+
dst_data[n_dims] = x0*cos_sin_theta.s1 + x1*cos_sin_theta.s0;
|
720
|
+
}
|
721
|
+
}
|