whispercpp 1.3.0 → 1.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +6 -0
- data/LICENSE +1 -1
- data/README.md +216 -424
- data/Rakefile +79 -11
- data/ext/.gitignore +11 -0
- data/ext/dependencies.rb +61 -0
- data/ext/extconf.rb +18 -26
- data/ext/options.rb +221 -0
- data/ext/ruby_whisper.c +159 -0
- data/ext/ruby_whisper.h +27 -2
- data/ext/ruby_whisper_context.c +641 -0
- data/ext/ruby_whisper_error.c +52 -0
- data/ext/ruby_whisper_model.c +232 -0
- data/ext/ruby_whisper_params.c +1301 -0
- data/ext/ruby_whisper_segment.c +143 -0
- data/ext/ruby_whisper_transcribe.cpp +87 -0
- data/ext/ruby_whisper_vad_params.c +288 -0
- data/ext/sources/.dockerignore +3 -0
- data/ext/sources/.github/workflows/bindings-ruby.yml +21 -0
- data/ext/sources/CMakeGraphVizOptions.cmake +8 -0
- data/ext/sources/CMakeLists.txt +251 -0
- data/ext/sources/bindings/javascript/CMakeLists.txt +41 -0
- data/ext/sources/bindings/javascript/emscripten.cpp +93 -0
- data/ext/sources/bindings/javascript/libwhisper.worker.js +1 -0
- data/ext/sources/bindings/javascript/package-tmpl.json +26 -0
- data/ext/sources/bindings/javascript/package.json +26 -0
- data/ext/sources/bindings/javascript/whisper.js +19 -0
- data/ext/sources/build-xcframework.sh +547 -0
- data/ext/sources/ci/run.sh +336 -0
- data/ext/sources/close-issue.yml +28 -0
- data/ext/sources/cmake/DefaultTargetOptions.cmake +16 -0
- data/ext/sources/cmake/FindFFmpeg.cmake +163 -0
- data/ext/sources/cmake/build-info.cmake +60 -0
- data/ext/sources/cmake/git-vars.cmake +22 -0
- data/ext/sources/cmake/whisper-config.cmake.in +65 -0
- data/ext/sources/cmake/whisper.pc.in +10 -0
- data/ext/sources/examples/CMakeLists.txt +124 -0
- data/ext/sources/examples/addon.node/CMakeLists.txt +31 -0
- data/ext/sources/examples/addon.node/__test__/whisper.spec.js +37 -0
- data/ext/sources/examples/addon.node/addon.cpp +438 -0
- data/ext/sources/examples/addon.node/index.js +54 -0
- data/ext/sources/examples/addon.node/package.json +16 -0
- data/ext/sources/examples/bench/CMakeLists.txt +8 -0
- data/ext/sources/examples/bench/bench.cpp +175 -0
- data/ext/sources/examples/bench.wasm/CMakeLists.txt +49 -0
- data/ext/sources/examples/bench.wasm/emscripten.cpp +87 -0
- data/ext/sources/examples/bench.wasm/index-tmpl.html +284 -0
- data/ext/sources/examples/cli/CMakeLists.txt +8 -0
- data/ext/sources/examples/cli/cli.cpp +1294 -0
- data/ext/sources/examples/coi-serviceworker.js +146 -0
- data/ext/sources/examples/command/CMakeLists.txt +10 -0
- data/ext/sources/examples/command/command.cpp +776 -0
- data/ext/sources/examples/command/commands.txt +9 -0
- data/ext/sources/examples/command.wasm/CMakeLists.txt +50 -0
- data/ext/sources/examples/command.wasm/emscripten.cpp +327 -0
- data/ext/sources/examples/command.wasm/index-tmpl.html +414 -0
- data/ext/sources/examples/common-ggml.cpp +238 -0
- data/ext/sources/examples/common-ggml.h +18 -0
- data/ext/sources/examples/common-sdl.cpp +227 -0
- data/ext/sources/examples/common-sdl.h +49 -0
- data/ext/sources/examples/common-whisper.cpp +168 -0
- data/ext/sources/examples/common-whisper.h +24 -0
- data/ext/sources/examples/common.cpp +675 -0
- data/ext/sources/examples/common.h +322 -0
- data/ext/sources/examples/deprecation-warning/CMakeLists.txt +6 -0
- data/ext/sources/examples/deprecation-warning/deprecation-warning.cpp +38 -0
- data/ext/sources/examples/ffmpeg-transcode.cpp +368 -0
- data/ext/sources/examples/generate-karaoke.sh +57 -0
- data/ext/sources/examples/grammar-parser.cpp +423 -0
- data/ext/sources/examples/grammar-parser.h +29 -0
- data/ext/sources/examples/helpers.js +191 -0
- data/ext/sources/examples/json.hpp +24596 -0
- data/ext/sources/examples/livestream.sh +112 -0
- data/ext/sources/examples/lsp/CMakeLists.txt +9 -0
- data/ext/sources/examples/lsp/lsp.cpp +467 -0
- data/ext/sources/examples/lsp/whisper.vim +362 -0
- data/ext/sources/examples/miniaudio.h +93468 -0
- data/ext/sources/examples/python/test_whisper_processor.py +7 -0
- data/ext/sources/examples/python/whisper_processor.py +54 -0
- data/ext/sources/examples/quantize/CMakeLists.txt +6 -0
- data/ext/sources/examples/quantize/quantize.cpp +223 -0
- data/ext/sources/examples/server/CMakeLists.txt +12 -0
- data/ext/sources/examples/server/bench.js +29 -0
- data/ext/sources/examples/server/httplib.h +10497 -0
- data/ext/sources/examples/server/server.cpp +1091 -0
- data/ext/sources/examples/server.py +115 -0
- data/ext/sources/examples/stb_vorbis.c +5584 -0
- data/ext/sources/examples/stream/CMakeLists.txt +10 -0
- data/ext/sources/examples/stream/stream.cpp +429 -0
- data/ext/sources/examples/stream.wasm/CMakeLists.txt +49 -0
- data/ext/sources/examples/stream.wasm/emscripten.cpp +216 -0
- data/ext/sources/examples/stream.wasm/index-tmpl.html +414 -0
- data/ext/sources/examples/sycl/CMakeLists.txt +9 -0
- data/ext/sources/examples/sycl/build.sh +22 -0
- data/ext/sources/examples/sycl/ls-sycl-device.cpp +11 -0
- data/ext/sources/examples/sycl/run-whisper.sh +17 -0
- data/ext/sources/examples/talk-llama/CMakeLists.txt +40 -0
- data/ext/sources/examples/talk-llama/eleven-labs.py +80 -0
- data/ext/sources/examples/talk-llama/llama-adapter.cpp +388 -0
- data/ext/sources/examples/talk-llama/llama-adapter.h +76 -0
- data/ext/sources/examples/talk-llama/llama-arch.cpp +1746 -0
- data/ext/sources/examples/talk-llama/llama-arch.h +437 -0
- data/ext/sources/examples/talk-llama/llama-batch.cpp +374 -0
- data/ext/sources/examples/talk-llama/llama-batch.h +89 -0
- data/ext/sources/examples/talk-llama/llama-chat.cpp +663 -0
- data/ext/sources/examples/talk-llama/llama-chat.h +58 -0
- data/ext/sources/examples/talk-llama/llama-context.cpp +2676 -0
- data/ext/sources/examples/talk-llama/llama-context.h +276 -0
- data/ext/sources/examples/talk-llama/llama-cparams.cpp +5 -0
- data/ext/sources/examples/talk-llama/llama-cparams.h +41 -0
- data/ext/sources/examples/talk-llama/llama-grammar.cpp +1229 -0
- data/ext/sources/examples/talk-llama/llama-grammar.h +173 -0
- data/ext/sources/examples/talk-llama/llama-graph.cpp +1618 -0
- data/ext/sources/examples/talk-llama/llama-graph.h +640 -0
- data/ext/sources/examples/talk-llama/llama-hparams.cpp +95 -0
- data/ext/sources/examples/talk-llama/llama-hparams.h +190 -0
- data/ext/sources/examples/talk-llama/llama-impl.cpp +167 -0
- data/ext/sources/examples/talk-llama/llama-impl.h +61 -0
- data/ext/sources/examples/talk-llama/llama-io.cpp +15 -0
- data/ext/sources/examples/talk-llama/llama-io.h +35 -0
- data/ext/sources/examples/talk-llama/llama-kv-cache.cpp +2739 -0
- data/ext/sources/examples/talk-llama/llama-kv-cache.h +502 -0
- data/ext/sources/examples/talk-llama/llama-kv-cells.h +379 -0
- data/ext/sources/examples/talk-llama/llama-memory.cpp +1 -0
- data/ext/sources/examples/talk-llama/llama-memory.h +32 -0
- data/ext/sources/examples/talk-llama/llama-mmap.cpp +600 -0
- data/ext/sources/examples/talk-llama/llama-mmap.h +68 -0
- data/ext/sources/examples/talk-llama/llama-model-loader.cpp +1138 -0
- data/ext/sources/examples/talk-llama/llama-model-loader.h +169 -0
- data/ext/sources/examples/talk-llama/llama-model-saver.cpp +281 -0
- data/ext/sources/examples/talk-llama/llama-model-saver.h +37 -0
- data/ext/sources/examples/talk-llama/llama-model.cpp +13814 -0
- data/ext/sources/examples/talk-llama/llama-model.h +425 -0
- data/ext/sources/examples/talk-llama/llama-quant.cpp +966 -0
- data/ext/sources/examples/talk-llama/llama-quant.h +1 -0
- data/ext/sources/examples/talk-llama/llama-sampling.cpp +2575 -0
- data/ext/sources/examples/talk-llama/llama-sampling.h +32 -0
- data/ext/sources/examples/talk-llama/llama-vocab.cpp +3340 -0
- data/ext/sources/examples/talk-llama/llama-vocab.h +131 -0
- data/ext/sources/examples/talk-llama/llama.cpp +354 -0
- data/ext/sources/examples/talk-llama/llama.h +1377 -0
- data/ext/sources/examples/talk-llama/prompts/talk-alpaca.txt +23 -0
- data/ext/sources/examples/talk-llama/speak +40 -0
- data/ext/sources/examples/talk-llama/speak.bat +1 -0
- data/ext/sources/examples/talk-llama/speak.ps1 +14 -0
- data/ext/sources/examples/talk-llama/talk-llama.cpp +808 -0
- data/ext/sources/examples/talk-llama/unicode-data.cpp +7034 -0
- data/ext/sources/examples/talk-llama/unicode-data.h +20 -0
- data/ext/sources/examples/talk-llama/unicode.cpp +849 -0
- data/ext/sources/examples/talk-llama/unicode.h +66 -0
- data/ext/sources/examples/vad-speech-segments/CMakeLists.txt +8 -0
- data/ext/sources/examples/vad-speech-segments/speech.cpp +143 -0
- data/ext/sources/examples/wchess/CMakeLists.txt +10 -0
- data/ext/sources/examples/wchess/libwchess/CMakeLists.txt +19 -0
- data/ext/sources/examples/wchess/libwchess/Chessboard.cpp +803 -0
- data/ext/sources/examples/wchess/libwchess/Chessboard.h +33 -0
- data/ext/sources/examples/wchess/libwchess/WChess.cpp +193 -0
- data/ext/sources/examples/wchess/libwchess/WChess.h +63 -0
- data/ext/sources/examples/wchess/libwchess/test-chessboard.cpp +117 -0
- data/ext/sources/examples/wchess/wchess.cmd/CMakeLists.txt +8 -0
- data/ext/sources/examples/wchess/wchess.cmd/wchess.cmd.cpp +249 -0
- data/ext/sources/examples/whisper.wasm/CMakeLists.txt +50 -0
- data/ext/sources/examples/whisper.wasm/emscripten.cpp +118 -0
- data/ext/sources/examples/whisper.wasm/index-tmpl.html +658 -0
- data/ext/sources/ggml/CMakeLists.txt +390 -0
- data/ext/sources/ggml/cmake/BuildTypes.cmake +54 -0
- data/ext/sources/ggml/cmake/GitVars.cmake +22 -0
- data/ext/sources/ggml/cmake/common.cmake +26 -0
- data/ext/sources/ggml/cmake/ggml-config.cmake.in +152 -0
- data/ext/sources/ggml/include/ggml-alloc.h +76 -0
- data/ext/sources/ggml/include/ggml-backend.h +354 -0
- data/ext/sources/ggml/include/ggml-blas.h +25 -0
- data/ext/sources/ggml/include/ggml-cann.h +123 -0
- data/ext/sources/ggml/include/ggml-cpp.h +39 -0
- data/ext/sources/ggml/include/ggml-cpu.h +143 -0
- data/ext/sources/ggml/include/ggml-cuda.h +47 -0
- data/ext/sources/ggml/include/ggml-kompute.h +50 -0
- data/ext/sources/ggml/include/ggml-metal.h +66 -0
- data/ext/sources/ggml/include/ggml-opencl.h +26 -0
- data/ext/sources/ggml/include/ggml-opt.h +237 -0
- data/ext/sources/ggml/include/ggml-rpc.h +33 -0
- data/ext/sources/ggml/include/ggml-sycl.h +49 -0
- data/ext/sources/ggml/include/ggml-vulkan.h +29 -0
- data/ext/{ggml.h → sources/ggml/include/ggml.h} +621 -821
- data/ext/sources/ggml/include/gguf.h +202 -0
- data/ext/sources/ggml/src/CMakeLists.txt +346 -0
- data/ext/sources/ggml/src/ggml-alloc.c +1042 -0
- data/ext/sources/ggml/src/ggml-amx/CMakeLists.txt +107 -0
- data/ext/sources/ggml/src/ggml-amx/common.h +94 -0
- data/ext/sources/ggml/src/ggml-amx/ggml-amx.cpp +446 -0
- data/ext/sources/ggml/src/ggml-amx/mmq.cpp +2510 -0
- data/ext/sources/ggml/src/ggml-amx/mmq.h +17 -0
- data/ext/sources/ggml/src/ggml-backend-impl.h +255 -0
- data/ext/sources/ggml/src/ggml-backend-reg.cpp +586 -0
- data/ext/sources/ggml/src/ggml-backend.cpp +2011 -0
- data/ext/sources/ggml/src/ggml-blas/CMakeLists.txt +87 -0
- data/ext/sources/ggml/src/ggml-blas/ggml-blas.cpp +517 -0
- data/ext/sources/ggml/src/ggml-cann/CMakeLists.txt +74 -0
- data/ext/sources/ggml/src/ggml-cann/Doxyfile +2579 -0
- data/ext/sources/ggml/src/ggml-cann/acl_tensor.cpp +181 -0
- data/ext/sources/ggml/src/ggml-cann/acl_tensor.h +258 -0
- data/ext/sources/ggml/src/ggml-cann/aclnn_ops.cpp +3193 -0
- data/ext/sources/ggml/src/ggml-cann/aclnn_ops.h +1125 -0
- data/ext/sources/ggml/src/ggml-cann/common.h +420 -0
- data/ext/sources/ggml/src/ggml-cann/ggml-cann.cpp +2606 -0
- data/ext/sources/ggml/src/ggml-cann/kernels/CMakeLists.txt +30 -0
- data/ext/sources/ggml/src/ggml-cann/kernels/ascendc_kernels.h +19 -0
- data/ext/sources/ggml/src/ggml-cann/kernels/dup.cpp +234 -0
- data/ext/sources/ggml/src/ggml-cann/kernels/get_row_f16.cpp +197 -0
- data/ext/sources/ggml/src/ggml-cann/kernels/get_row_f32.cpp +190 -0
- data/ext/sources/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +204 -0
- data/ext/sources/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +191 -0
- data/ext/sources/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +218 -0
- data/ext/sources/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +216 -0
- data/ext/sources/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +295 -0
- data/ext/sources/ggml/src/ggml-common.h +1857 -0
- data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +504 -0
- data/ext/sources/ggml/src/ggml-cpu/amx/amx.cpp +221 -0
- data/ext/sources/ggml/src/ggml-cpu/amx/amx.h +8 -0
- data/ext/sources/ggml/src/ggml-cpu/amx/common.h +91 -0
- data/ext/sources/ggml/src/ggml-cpu/amx/mmq.cpp +2511 -0
- data/ext/sources/ggml/src/ggml-cpu/amx/mmq.h +10 -0
- data/ext/sources/ggml/src/ggml-cpu/binary-ops.cpp +158 -0
- data/ext/sources/ggml/src/ggml-cpu/binary-ops.h +16 -0
- data/ext/sources/ggml/src/ggml-cpu/cmake/FindSIMD.cmake +100 -0
- data/ext/sources/ggml/src/ggml-cpu/common.h +72 -0
- data/ext/sources/ggml/src/ggml-cpu/cpu-feats-x86.cpp +327 -0
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +6431 -0
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +8 -0
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-hbm.cpp +55 -0
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-hbm.h +8 -0
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-impl.h +508 -0
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-quants.c +13747 -0
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-traits.cpp +36 -0
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-traits.h +38 -0
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +3510 -0
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.cpp +671 -0
- data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.cpp +337 -0
- data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.h +95 -0
- data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +482 -0
- data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.h +17 -0
- data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +3544 -0
- data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.h +14 -0
- data/ext/sources/ggml/src/ggml-cpu/ops.cpp +8903 -0
- data/ext/sources/ggml/src/ggml-cpu/ops.h +110 -0
- data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +892 -0
- data/ext/sources/ggml/src/ggml-cpu/unary-ops.cpp +186 -0
- data/ext/sources/ggml/src/ggml-cpu/unary-ops.h +28 -0
- data/ext/sources/ggml/src/ggml-cpu/vec.cpp +252 -0
- data/ext/sources/ggml/src/ggml-cpu/vec.h +818 -0
- data/ext/sources/ggml/src/ggml-cuda/CMakeLists.txt +184 -0
- data/ext/sources/ggml/src/ggml-cuda/acc.cu +61 -0
- data/ext/sources/ggml/src/ggml-cuda/acc.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/arange.cu +34 -0
- data/ext/sources/ggml/src/ggml-cuda/arange.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/argmax.cu +91 -0
- data/ext/sources/ggml/src/ggml-cuda/argmax.cuh +3 -0
- data/ext/sources/ggml/src/ggml-cuda/argsort.cu +104 -0
- data/ext/sources/ggml/src/ggml-cuda/argsort.cuh +3 -0
- data/ext/sources/ggml/src/ggml-cuda/binbcast.cu +363 -0
- data/ext/sources/ggml/src/ggml-cuda/binbcast.cuh +9 -0
- data/ext/sources/ggml/src/ggml-cuda/clamp.cu +45 -0
- data/ext/sources/ggml/src/ggml-cuda/clamp.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/common.cuh +828 -0
- data/ext/sources/ggml/src/ggml-cuda/concat.cu +221 -0
- data/ext/sources/ggml/src/ggml-cuda/concat.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/conv-transpose-1d.cu +89 -0
- data/ext/sources/ggml/src/ggml-cuda/conv-transpose-1d.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/convert.cu +730 -0
- data/ext/sources/ggml/src/ggml-cuda/convert.cuh +26 -0
- data/ext/sources/ggml/src/ggml-cuda/count-equal.cu +64 -0
- data/ext/sources/ggml/src/ggml-cuda/count-equal.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/cp-async.cuh +57 -0
- data/ext/sources/ggml/src/ggml-cuda/cpy.cu +705 -0
- data/ext/sources/ggml/src/ggml-cuda/cpy.cuh +11 -0
- data/ext/sources/ggml/src/ggml-cuda/cross-entropy-loss.cu +189 -0
- data/ext/sources/ggml/src/ggml-cuda/cross-entropy-loss.cuh +7 -0
- data/ext/sources/ggml/src/ggml-cuda/dequantize.cuh +103 -0
- data/ext/sources/ggml/src/ggml-cuda/diagmask.cu +40 -0
- data/ext/sources/ggml/src/ggml-cuda/diagmask.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +881 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +1471 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f16.cu +357 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f16.cuh +3 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f32.cu +365 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f32.cuh +3 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn-vec-f16.cuh +482 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn-vec-f32.cuh +472 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +634 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cuh +3 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn.cu +346 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn.cuh +3 -0
- data/ext/sources/ggml/src/ggml-cuda/getrows.cu +275 -0
- data/ext/sources/ggml/src/ggml-cuda/getrows.cuh +15 -0
- data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +3505 -0
- data/ext/sources/ggml/src/ggml-cuda/gla.cu +93 -0
- data/ext/sources/ggml/src/ggml-cuda/gla.cuh +3 -0
- data/ext/sources/ggml/src/ggml-cuda/im2col.cu +103 -0
- data/ext/sources/ggml/src/ggml-cuda/im2col.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/mma.cuh +396 -0
- data/ext/sources/ggml/src/ggml-cuda/mmq.cu +324 -0
- data/ext/sources/ggml/src/ggml-cuda/mmq.cuh +3217 -0
- data/ext/sources/ggml/src/ggml-cuda/mmv.cu +336 -0
- data/ext/sources/ggml/src/ggml-cuda/mmv.cuh +12 -0
- data/ext/sources/ggml/src/ggml-cuda/mmvq.cu +595 -0
- data/ext/sources/ggml/src/ggml-cuda/mmvq.cuh +12 -0
- data/ext/sources/ggml/src/ggml-cuda/norm.cu +458 -0
- data/ext/sources/ggml/src/ggml-cuda/norm.cuh +11 -0
- data/ext/sources/ggml/src/ggml-cuda/opt-step-adamw.cu +78 -0
- data/ext/sources/ggml/src/ggml-cuda/opt-step-adamw.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/out-prod.cu +68 -0
- data/ext/sources/ggml/src/ggml-cuda/out-prod.cuh +3 -0
- data/ext/sources/ggml/src/ggml-cuda/pad.cu +49 -0
- data/ext/sources/ggml/src/ggml-cuda/pad.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/pool2d.cu +94 -0
- data/ext/sources/ggml/src/ggml-cuda/pool2d.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/quantize.cu +190 -0
- data/ext/sources/ggml/src/ggml-cuda/quantize.cuh +27 -0
- data/ext/sources/ggml/src/ggml-cuda/rope.cu +456 -0
- data/ext/sources/ggml/src/ggml-cuda/rope.cuh +7 -0
- data/ext/sources/ggml/src/ggml-cuda/scale.cu +31 -0
- data/ext/sources/ggml/src/ggml-cuda/scale.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/softmax.cu +283 -0
- data/ext/sources/ggml/src/ggml-cuda/softmax.cuh +7 -0
- data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cu +148 -0
- data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cuh +3 -0
- data/ext/sources/ggml/src/ggml-cuda/ssm-scan.cu +153 -0
- data/ext/sources/ggml/src/ggml-cuda/ssm-scan.cuh +3 -0
- data/ext/sources/ggml/src/ggml-cuda/sum.cu +45 -0
- data/ext/sources/ggml/src/ggml-cuda/sum.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/sumrows.cu +39 -0
- data/ext/sources/ggml/src/ggml-cuda/sumrows.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_16.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_8.cu +10 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_1.cu +10 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_2.cu +10 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_4.cu +10 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_16.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_4.cu +10 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_8.cu +10 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_32-ncols2_1.cu +10 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_32-ncols2_2.cu +10 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_16.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_2.cu +10 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_4.cu +10 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_8.cu +10 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_64-ncols2_1.cu +10 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_1.cu +10 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_2.cu +10 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_4.cu +10 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_8.cu +10 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +78 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-iq1_s.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_s.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_xs.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_xxs.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-iq3_s.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-iq3_xxs.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-iq4_nl.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-iq4_xs.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q2_k.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q3_k.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q4_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q4_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q4_k.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q5_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q5_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q5_k.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q6_k.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q8_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/tsembd.cu +47 -0
- data/ext/sources/ggml/src/ggml-cuda/tsembd.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/unary.cu +289 -0
- data/ext/sources/ggml/src/ggml-cuda/unary.cuh +59 -0
- data/ext/sources/ggml/src/ggml-cuda/upscale.cu +51 -0
- data/ext/sources/ggml/src/ggml-cuda/upscale.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/vecdotq.cuh +1135 -0
- data/ext/sources/ggml/src/ggml-cuda/vendors/cuda.h +15 -0
- data/ext/sources/ggml/src/ggml-cuda/vendors/hip.h +243 -0
- data/ext/sources/ggml/src/ggml-cuda/vendors/musa.h +140 -0
- data/ext/sources/ggml/src/ggml-cuda/wkv.cu +199 -0
- data/ext/sources/ggml/src/ggml-cuda/wkv.cuh +7 -0
- data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +131 -0
- data/ext/sources/ggml/src/ggml-impl.h +601 -0
- data/ext/sources/ggml/src/ggml-kompute/CMakeLists.txt +166 -0
- data/ext/sources/ggml/src/ggml-kompute/ggml-kompute.cpp +2251 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/common.comp +112 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_add.comp +58 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_addrow.comp +25 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f16.comp +52 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f32.comp +52 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f16.comp +52 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f32.comp +52 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_diagmask.comp +30 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_gelu.comp +22 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows.comp +17 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f16.comp +31 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f32.comp +31 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_0.comp +38 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_1.comp +39 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q6_k.comp +44 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul.comp +52 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_f16.comp +69 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_mat_f32.comp +51 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_0.comp +33 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_1.comp +35 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_k.comp +140 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q6_k.comp +106 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q8_0.comp +73 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n.comp +52 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n_pre.comp +28 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_norm.comp +84 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_relu.comp +21 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rmsnorm.comp +53 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f16.comp +52 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f32.comp +52 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f16.comp +52 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f32.comp +52 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_scale.comp +19 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_scale_8.comp +23 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_silu.comp +22 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_softmax.comp +72 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/rope_common.comp +71 -0
- data/ext/sources/ggml/src/ggml-metal/CMakeLists.txt +120 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +622 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal.m +5998 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal.metal +7089 -0
- data/ext/sources/ggml/src/ggml-musa/CMakeLists.txt +113 -0
- data/ext/sources/ggml/src/ggml-musa/mudnn.cu +112 -0
- data/ext/sources/ggml/src/ggml-musa/mudnn.cuh +12 -0
- data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +96 -0
- data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +5124 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/add.cl +83 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/clamp.cl +20 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/cpy.cl +184 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/cvt.cl +118 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/diag_mask_inf.cl +58 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/embed_kernel.py +26 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gelu.cl +62 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle.cl +268 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_general.cl +274 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/get_rows.cl +163 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/im2col_f16.cl +57 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/im2col_f32.cl +57 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul.cl +79 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mat_Ab_Bi_8x4.cl +139 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_f16_f16.cl +118 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32.cl +118 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32_1row.cl +94 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32_l4.cl +84 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_f32_f32.cl +118 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32.cl +192 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_1d_16x_flat.cl +307 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_1d_8x_flat.cl +265 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_8x_flat.cl +272 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_v.cl +254 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q6_k.cl +190 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/norm.cl +81 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/relu.cl +16 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/rms_norm.cl +96 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/rope.cl +721 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/scale.cl +16 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/silu.cl +30 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_4_f16.cl +87 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_4_f32.cl +87 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_f16.cl +86 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_f32.cl +86 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/transpose.cl +84 -0
- data/ext/sources/ggml/src/ggml-opt.cpp +1037 -0
- data/ext/sources/ggml/src/ggml-quants.c +5232 -0
- data/ext/sources/ggml/src/ggml-quants.h +100 -0
- data/ext/sources/ggml/src/ggml-rpc/CMakeLists.txt +9 -0
- data/ext/sources/ggml/src/ggml-rpc/ggml-rpc.cpp +1813 -0
- data/ext/sources/ggml/src/ggml-sycl/CMakeLists.txt +189 -0
- data/ext/sources/ggml/src/ggml-sycl/backend.hpp +37 -0
- data/ext/sources/ggml/src/ggml-sycl/binbcast.cpp +345 -0
- data/ext/sources/ggml/src/ggml-sycl/binbcast.hpp +39 -0
- data/ext/sources/ggml/src/ggml-sycl/common.cpp +83 -0
- data/ext/sources/ggml/src/ggml-sycl/common.hpp +589 -0
- data/ext/sources/ggml/src/ggml-sycl/concat.cpp +195 -0
- data/ext/sources/ggml/src/ggml-sycl/concat.hpp +20 -0
- data/ext/sources/ggml/src/ggml-sycl/conv.cpp +101 -0
- data/ext/sources/ggml/src/ggml-sycl/conv.hpp +20 -0
- data/ext/sources/ggml/src/ggml-sycl/convert.cpp +623 -0
- data/ext/sources/ggml/src/ggml-sycl/convert.hpp +34 -0
- data/ext/sources/ggml/src/ggml-sycl/cpy.cpp +700 -0
- data/ext/sources/ggml/src/ggml-sycl/cpy.hpp +11 -0
- data/ext/sources/ggml/src/ggml-sycl/dequantize.hpp +791 -0
- data/ext/sources/ggml/src/ggml-sycl/dmmv.cpp +1162 -0
- data/ext/sources/ggml/src/ggml-sycl/dmmv.hpp +27 -0
- data/ext/sources/ggml/src/ggml-sycl/dpct/helper.hpp +2957 -0
- data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +1511 -0
- data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +75 -0
- data/ext/sources/ggml/src/ggml-sycl/gemm.hpp +99 -0
- data/ext/sources/ggml/src/ggml-sycl/getrows.cpp +309 -0
- data/ext/sources/ggml/src/ggml-sycl/getrows.hpp +20 -0
- data/ext/sources/ggml/src/ggml-sycl/ggml-sycl.cpp +4493 -0
- data/ext/sources/ggml/src/ggml-sycl/gla.cpp +106 -0
- data/ext/sources/ggml/src/ggml-sycl/gla.hpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/im2col.cpp +136 -0
- data/ext/sources/ggml/src/ggml-sycl/im2col.hpp +21 -0
- data/ext/sources/ggml/src/ggml-sycl/mmq.cpp +3030 -0
- data/ext/sources/ggml/src/ggml-sycl/mmq.hpp +33 -0
- data/ext/sources/ggml/src/ggml-sycl/mmvq.cpp +1110 -0
- data/ext/sources/ggml/src/ggml-sycl/mmvq.hpp +27 -0
- data/ext/sources/ggml/src/ggml-sycl/norm.cpp +501 -0
- data/ext/sources/ggml/src/ggml-sycl/norm.hpp +26 -0
- data/ext/sources/ggml/src/ggml-sycl/outprod.cpp +47 -0
- data/ext/sources/ggml/src/ggml-sycl/outprod.hpp +10 -0
- data/ext/sources/ggml/src/ggml-sycl/presets.hpp +74 -0
- data/ext/sources/ggml/src/ggml-sycl/quants.hpp +83 -0
- data/ext/sources/ggml/src/ggml-sycl/rope.cpp +361 -0
- data/ext/sources/ggml/src/ggml-sycl/rope.hpp +20 -0
- data/ext/sources/ggml/src/ggml-sycl/softmax.cpp +261 -0
- data/ext/sources/ggml/src/ggml-sycl/softmax.hpp +20 -0
- data/ext/sources/ggml/src/ggml-sycl/sycl_hw.cpp +13 -0
- data/ext/sources/ggml/src/ggml-sycl/sycl_hw.hpp +23 -0
- data/ext/sources/ggml/src/ggml-sycl/tsembd.cpp +72 -0
- data/ext/sources/ggml/src/ggml-sycl/tsembd.hpp +20 -0
- data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +1215 -0
- data/ext/sources/ggml/src/ggml-sycl/wkv.cpp +293 -0
- data/ext/sources/ggml/src/ggml-sycl/wkv.hpp +10 -0
- data/ext/sources/ggml/src/ggml-threading.cpp +12 -0
- data/ext/sources/ggml/src/ggml-threading.h +14 -0
- data/ext/sources/ggml/src/ggml-vulkan/CMakeLists.txt +196 -0
- data/ext/sources/ggml/src/ggml-vulkan/cmake/host-toolchain.cmake.in +15 -0
- data/ext/sources/ggml/src/ggml-vulkan/ggml-vulkan.cpp +10700 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +39 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/acc.comp +29 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add.comp +29 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argmax.comp +51 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp +69 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/clamp.comp +17 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/concat.comp +41 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/contig_copy.comp +49 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_dw.comp +105 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy.comp +23 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +51 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +242 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/cos.comp +17 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/count_equal.comp +31 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_f32.comp +20 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.comp +462 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.comp +699 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_head.comp +13 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_m.comp +42 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_s.comp +35 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_s.comp +44 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xs.comp +43 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xxs.comp +48 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_s.comp +39 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_xxs.comp +49 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_nl.comp +32 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_xs.comp +34 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q2_k.comp +34 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q3_k.comp +42 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_0.comp +30 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_1.comp +32 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_k.comp +68 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_0.comp +34 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_1.comp +35 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_k.comp +70 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q6_k.comp +33 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q8_0.comp +31 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/diag_mask_inf.comp +34 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/div.comp +27 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +337 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp +162 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +360 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +267 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +59 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gelu.comp +25 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gelu_quick.comp +23 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/generic_binary_head.comp +64 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/generic_head.comp +9 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/generic_unary_head.comp +76 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/get_rows.comp +33 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/get_rows_quant.comp +41 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/group_norm.comp +66 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +100 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/l2_norm.comp +41 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/leaky_relu.comp +22 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul.comp +27 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_split_k_reduce.comp +48 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp +169 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.comp +118 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_m.comp +82 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_s.comp +79 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_s.comp +90 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xs.comp +87 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xxs.comp +87 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_s.comp +90 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_xxs.comp +88 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_nc.comp +118 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_p021.comp +154 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp +130 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q3_k.comp +132 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp +136 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp +167 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp +130 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +868 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +441 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +442 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.comp +99 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/norm.comp +44 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_adamw.comp +42 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/pad.comp +28 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/pool2d.comp +74 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/quantize_q8_1.comp +77 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/relu.comp +21 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/repeat.comp +26 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/repeat_back.comp +37 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +52 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_back.comp +55 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.comp +58 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +60 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +43 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +43 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_vision.comp +47 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp +24 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sigmoid.comp +20 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/silu.comp +22 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/silu_back.comp +26 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sin.comp +17 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +173 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_back.comp +50 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/square.comp +17 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sub.comp +29 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.comp +37 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/tanh.comp +20 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/test_bfloat16_support.comp +7 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/test_coopmat2_support.comp +7 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/test_coopmat_support.comp +7 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/test_integer_dot_support.comp +7 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/timestep_embedding.comp +41 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/types.comp +1373 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +36 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +751 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/wkv6.comp +87 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/wkv7.comp +91 -0
- data/ext/sources/ggml/src/ggml.c +6550 -0
- data/ext/sources/ggml/src/gguf.cpp +1330 -0
- data/ext/{whisper.h → sources/include/whisper.h} +91 -24
- data/ext/sources/src/CMakeLists.txt +143 -0
- data/ext/sources/src/coreml/whisper-decoder-impl.h +158 -0
- data/ext/sources/src/coreml/whisper-decoder-impl.m +226 -0
- data/ext/sources/src/coreml/whisper-encoder-impl.h +154 -0
- data/ext/sources/src/coreml/whisper-encoder-impl.m +222 -0
- data/ext/sources/src/coreml/whisper-encoder.h +26 -0
- data/ext/sources/src/coreml/whisper-encoder.mm +73 -0
- data/ext/sources/src/openvino/whisper-openvino-encoder.cpp +108 -0
- data/ext/sources/src/openvino/whisper-openvino-encoder.h +31 -0
- data/ext/sources/src/whisper-arch.h +197 -0
- data/ext/{whisper.cpp → sources/src/whisper.cpp} +2535 -835
- data/ext/sources/tests/CMakeLists.txt +105 -0
- data/ext/sources/tests/earnings21/eval.mk +58 -0
- data/ext/sources/tests/earnings21/eval.py +68 -0
- data/ext/sources/tests/earnings21/normalizers/__init__.py +2 -0
- data/ext/sources/tests/earnings21/normalizers/basic.py +80 -0
- data/ext/sources/tests/earnings21/normalizers/english.json +1741 -0
- data/ext/sources/tests/earnings21/normalizers/english.py +550 -0
- data/ext/sources/tests/earnings21/requirements.txt +6 -0
- data/ext/sources/tests/en-0-ref.txt +1 -0
- data/ext/sources/tests/en-1-ref.txt +1 -0
- data/ext/sources/tests/en-2-ref.txt +1 -0
- data/ext/sources/tests/es-0-ref.txt +1 -0
- data/ext/sources/tests/librispeech/eval.mk +39 -0
- data/ext/sources/tests/librispeech/eval.py +47 -0
- data/ext/sources/tests/librispeech/normalizers/__init__.py +2 -0
- data/ext/sources/tests/librispeech/normalizers/basic.py +80 -0
- data/ext/sources/tests/librispeech/normalizers/english.json +1741 -0
- data/ext/sources/tests/librispeech/normalizers/english.py +550 -0
- data/ext/sources/tests/librispeech/requirements.txt +6 -0
- data/ext/sources/tests/run-tests.sh +130 -0
- data/ext/sources/tests/test-c.c +3 -0
- data/ext/sources/tests/test-vad-full.cpp +54 -0
- data/ext/sources/tests/test-vad.cpp +83 -0
- data/ext/sources/tests/test-whisper.js +58 -0
- data/extsources.rb +34 -0
- data/lib/whisper/model/uri.rb +178 -0
- data/sig/whisper.rbs +480 -0
- data/tests/helper.rb +35 -0
- data/tests/jfk_reader/.gitignore +5 -0
- data/tests/jfk_reader/extconf.rb +3 -0
- data/tests/jfk_reader/jfk_reader.c +68 -0
- data/tests/test_callback.rb +202 -0
- data/tests/test_error.rb +20 -0
- data/tests/test_model.rb +109 -0
- data/tests/test_package.rb +46 -0
- data/tests/test_params.rb +297 -0
- data/tests/test_segment.rb +74 -0
- data/tests/test_vad.rb +19 -0
- data/tests/test_vad_params.rb +103 -0
- data/tests/test_whisper.rb +212 -124
- data/whispercpp.gemspec +37 -0
- metadata +794 -13
- data/ext/dr_wav.h +0 -6434
- data/ext/ggml.c +0 -21755
- data/ext/ruby_whisper.cpp +0 -426
@@ -0,0 +1,118 @@
|
|
1
|
+
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
2
|
+
|
3
|
+
#ifdef cl_intel_subgroups
|
4
|
+
#pragma OPENCL EXTENSION cl_intel_subgroups : enable
|
5
|
+
#else
|
6
|
+
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
|
7
|
+
#endif
|
8
|
+
|
9
|
+
#ifdef cl_intel_required_subgroup_size
|
10
|
+
#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
|
11
|
+
#define INTEL_GPU 1
|
12
|
+
#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
|
13
|
+
#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
|
14
|
+
#elif defined(cl_qcom_reqd_sub_group_size)
|
15
|
+
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
|
16
|
+
#define ADRENO_GPU 1
|
17
|
+
#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
|
18
|
+
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
|
19
|
+
#endif
|
20
|
+
|
21
|
+
#define N_F16_F32 4
|
22
|
+
|
23
|
+
#ifdef ADRENO_GPU
|
24
|
+
REQD_SUBGROUP_SIZE_64
|
25
|
+
#endif
|
26
|
+
kernel void kernel_mul_mat_f16_f32(
|
27
|
+
global char * src0,
|
28
|
+
ulong offset0,
|
29
|
+
global char * src1,
|
30
|
+
ulong offset1,
|
31
|
+
global float * dst,
|
32
|
+
ulong offsetd,
|
33
|
+
int ne00,
|
34
|
+
int ne01,
|
35
|
+
int ne02,
|
36
|
+
ulong nb00,
|
37
|
+
ulong nb01,
|
38
|
+
ulong nb02,
|
39
|
+
ulong nb03,
|
40
|
+
int ne10,
|
41
|
+
int ne11,
|
42
|
+
int ne12,
|
43
|
+
ulong nb10,
|
44
|
+
ulong nb11,
|
45
|
+
ulong nb12,
|
46
|
+
ulong nb13,
|
47
|
+
int ne0,
|
48
|
+
int ne1,
|
49
|
+
int r2,
|
50
|
+
int r3
|
51
|
+
) {
|
52
|
+
src0 = (global char*)((global char*)src0 + offset0);
|
53
|
+
src1 = (global char*)((global char*)src1 + offset1);
|
54
|
+
dst = (global float*)((global char*)dst + offsetd);
|
55
|
+
|
56
|
+
int r0 = get_group_id(0);
|
57
|
+
int rb = get_group_id(1)*N_F16_F32;
|
58
|
+
int im = get_group_id(2);
|
59
|
+
|
60
|
+
int i12 = im%ne12;
|
61
|
+
int i13 = im/ne12;
|
62
|
+
|
63
|
+
ulong offset_src0 = r0*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03;
|
64
|
+
|
65
|
+
global half * x = (global half *) (src0 + offset_src0);
|
66
|
+
|
67
|
+
if (ne00 < 128) {
|
68
|
+
for (int row = 0; row < N_F16_F32; ++row) {
|
69
|
+
int r1 = rb + row;
|
70
|
+
if (r1 >= ne11) {
|
71
|
+
break;
|
72
|
+
}
|
73
|
+
|
74
|
+
ulong offset_src1 = r1*nb11 + (i12 )*nb12 + (i13 )*nb13;
|
75
|
+
|
76
|
+
global float * y = (global float *) (src1 + offset_src1);
|
77
|
+
|
78
|
+
float sumf = 0;
|
79
|
+
for (int i = get_sub_group_local_id(); i < ne00; i += get_max_sub_group_size()) {
|
80
|
+
sumf += convert_float(x[i]) * y[i];
|
81
|
+
}
|
82
|
+
|
83
|
+
float all_sum = sub_group_reduce_add(sumf);
|
84
|
+
if (get_sub_group_local_id() == 0) {
|
85
|
+
dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
|
86
|
+
}
|
87
|
+
}
|
88
|
+
} else {
|
89
|
+
global half4 * x4 = (global half4 *)x;
|
90
|
+
for (int row = 0; row < N_F16_F32; ++row) {
|
91
|
+
int r1 = rb + row;
|
92
|
+
if (r1 >= ne11) {
|
93
|
+
break;
|
94
|
+
}
|
95
|
+
|
96
|
+
ulong offset_src1 = r1*nb11 + (i12 )*nb12 + (i13 )*nb13;
|
97
|
+
|
98
|
+
global float * y = (global float *) (src1 + offset_src1);
|
99
|
+
global float4 * y4 = (global float4 *) y;
|
100
|
+
|
101
|
+
float sumf = 0;
|
102
|
+
for (int i = get_sub_group_local_id(); i < ne00/4; i += get_max_sub_group_size()) {
|
103
|
+
sumf += convert_float(x4[i].s0) * y4[i].s0;
|
104
|
+
sumf += convert_float(x4[i].s1) * y4[i].s1;
|
105
|
+
sumf += convert_float(x4[i].s2) * y4[i].s2;
|
106
|
+
sumf += convert_float(x4[i].s3) * y4[i].s3;
|
107
|
+
}
|
108
|
+
|
109
|
+
float all_sum = sub_group_reduce_add(sumf);
|
110
|
+
if (get_sub_group_local_id() == 0) {
|
111
|
+
for (int i = 4*(ne00/4); i < ne00; ++i) {
|
112
|
+
all_sum += (float) x[i] * y[i];
|
113
|
+
}
|
114
|
+
dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
|
115
|
+
}
|
116
|
+
}
|
117
|
+
}
|
118
|
+
}
|
@@ -0,0 +1,94 @@
|
|
1
|
+
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
2
|
+
|
3
|
+
#ifdef cl_intel_subgroups
|
4
|
+
#pragma OPENCL EXTENSION cl_intel_subgroups : enable
|
5
|
+
#else
|
6
|
+
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
|
7
|
+
#endif
|
8
|
+
|
9
|
+
#ifdef cl_intel_required_subgroup_size
|
10
|
+
#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
|
11
|
+
#define INTEL_GPU 1
|
12
|
+
#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
|
13
|
+
#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
|
14
|
+
#elif defined(cl_qcom_reqd_sub_group_size)
|
15
|
+
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
|
16
|
+
#define ADRENO_GPU 1
|
17
|
+
#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
|
18
|
+
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
|
19
|
+
#endif
|
20
|
+
|
21
|
+
#ifdef ADRENO_GPU
|
22
|
+
REQD_SUBGROUP_SIZE_64
|
23
|
+
#endif
|
24
|
+
kernel void kernel_mul_mat_f16_f32_1row(
|
25
|
+
global char * src0,
|
26
|
+
ulong offset0,
|
27
|
+
global char * src1,
|
28
|
+
ulong offset1,
|
29
|
+
global float * dst,
|
30
|
+
ulong offsetd,
|
31
|
+
int ne00,
|
32
|
+
int ne01,
|
33
|
+
int ne02,
|
34
|
+
ulong nb00,
|
35
|
+
ulong nb01,
|
36
|
+
ulong nb02,
|
37
|
+
ulong nb03,
|
38
|
+
int ne10,
|
39
|
+
int ne11,
|
40
|
+
int ne12,
|
41
|
+
ulong nb10,
|
42
|
+
ulong nb11,
|
43
|
+
ulong nb12,
|
44
|
+
ulong nb13,
|
45
|
+
int ne0,
|
46
|
+
int ne1,
|
47
|
+
int r2,
|
48
|
+
int r3
|
49
|
+
) {
|
50
|
+
src0 = (global char*)((global char*)src0 + offset0);
|
51
|
+
src1 = (global char*)((global char*)src1 + offset1);
|
52
|
+
dst = (global float*)((global char*)dst + offsetd);
|
53
|
+
|
54
|
+
int r0 = get_group_id(0);
|
55
|
+
int r1 = get_group_id(1);
|
56
|
+
int im = get_group_id(2);
|
57
|
+
|
58
|
+
int i12 = im%ne12;
|
59
|
+
int i13 = im/ne12;
|
60
|
+
|
61
|
+
ulong offset_src0 = r0*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03;
|
62
|
+
ulong offset_src1 = r1*nb11 + (i12 )*nb12 + (i13 )*nb13;
|
63
|
+
|
64
|
+
global half * x = (global half *) (src0 + offset_src0);
|
65
|
+
global float * y = (global float *) (src1 + offset_src1);
|
66
|
+
|
67
|
+
float sumf = 0;
|
68
|
+
if (ne00 < 128) {
|
69
|
+
for (int i = get_sub_group_local_id(); i < ne00; i += get_max_sub_group_size()) {
|
70
|
+
sumf += (float) x[i] * (float) y[i];
|
71
|
+
}
|
72
|
+
float all_sum = sub_group_reduce_add(sumf);
|
73
|
+
if (get_sub_group_local_id() == 0) {
|
74
|
+
dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
|
75
|
+
}
|
76
|
+
} else {
|
77
|
+
global half4 * x4 = (global half4 *) x;
|
78
|
+
global float4 * y4 = (global float4 *) y;
|
79
|
+
for (int i = get_sub_group_local_id(); i < ne00/4; i += get_max_sub_group_size()) {
|
80
|
+
sumf += (float) x4[i].s0 * y4[i].s0;
|
81
|
+
sumf += (float) x4[i].s1 * y4[i].s1;
|
82
|
+
sumf += (float) x4[i].s2 * y4[i].s2;
|
83
|
+
sumf += (float) x4[i].s3 * y4[i].s3;
|
84
|
+
}
|
85
|
+
float all_sum = sub_group_reduce_add(sumf);
|
86
|
+
if (get_sub_group_local_id() == 0) {
|
87
|
+
for (int i = 4*(ne00/4); i < ne00; ++i) {
|
88
|
+
all_sum += (float) x[i] * y[i];
|
89
|
+
}
|
90
|
+
dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
|
91
|
+
}
|
92
|
+
}
|
93
|
+
|
94
|
+
}
|
@@ -0,0 +1,84 @@
|
|
1
|
+
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
2
|
+
|
3
|
+
#ifdef cl_intel_subgroups
|
4
|
+
#pragma OPENCL EXTENSION cl_intel_subgroups : enable
|
5
|
+
#else
|
6
|
+
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
|
7
|
+
#endif
|
8
|
+
|
9
|
+
#ifdef cl_intel_required_subgroup_size
|
10
|
+
#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
|
11
|
+
#define INTEL_GPU 1
|
12
|
+
#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
|
13
|
+
#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
|
14
|
+
#elif defined(cl_qcom_reqd_sub_group_size)
|
15
|
+
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
|
16
|
+
#define ADRENO_GPU 1
|
17
|
+
#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
|
18
|
+
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
|
19
|
+
#endif
|
20
|
+
|
21
|
+
// Assumes row size (ne00) is a multiple of 4
|
22
|
+
#ifdef ADRENO_GPU
|
23
|
+
REQD_SUBGROUP_SIZE_64
|
24
|
+
#endif
|
25
|
+
kernel void kernel_mul_mat_f16_f32_l4(
|
26
|
+
global char * src0,
|
27
|
+
ulong offset0,
|
28
|
+
global char * src1,
|
29
|
+
ulong offset1,
|
30
|
+
global float * dst,
|
31
|
+
ulong offsetd,
|
32
|
+
int ne00,
|
33
|
+
int ne01,
|
34
|
+
int ne02,
|
35
|
+
ulong nb00,
|
36
|
+
ulong nb01,
|
37
|
+
ulong nb02,
|
38
|
+
ulong nb03,
|
39
|
+
int ne10,
|
40
|
+
int ne11,
|
41
|
+
int ne12,
|
42
|
+
ulong nb10,
|
43
|
+
ulong nb11,
|
44
|
+
ulong nb12,
|
45
|
+
ulong nb13,
|
46
|
+
int ne0,
|
47
|
+
int ne1,
|
48
|
+
int r2,
|
49
|
+
int r3
|
50
|
+
) {
|
51
|
+
src0 = (global char*)((global char*)src0 + offset0);
|
52
|
+
src1 = (global char*)((global char*)src1 + offset1);
|
53
|
+
dst = (global float*)((global char*)dst + offsetd);
|
54
|
+
|
55
|
+
int nrows = ne11;
|
56
|
+
int r0 = get_group_id(0);
|
57
|
+
int im = get_group_id(2);
|
58
|
+
|
59
|
+
int i12 = im%ne12;
|
60
|
+
int i13 = im/ne12;
|
61
|
+
|
62
|
+
ulong offset_src0 = r0*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03;
|
63
|
+
|
64
|
+
global half4 * x4 = (global half4 *) (src0 + offset_src0);
|
65
|
+
|
66
|
+
for (int r1 = 0; r1 < nrows; ++r1) {
|
67
|
+
ulong offset_src1 = r1*nb11 + (i12 )*nb12 + (i13 )*nb13;
|
68
|
+
|
69
|
+
global float4 * y4 = (global float4 *) (src1 + offset_src1);
|
70
|
+
|
71
|
+
float sumf = 0;
|
72
|
+
for (int i = get_sub_group_local_id(); i < ne00/4; i += get_max_sub_group_size()) {
|
73
|
+
sumf += convert_float(x4[i].s0) * y4[i].s0;
|
74
|
+
sumf += convert_float(x4[i].s1) * y4[i].s1;
|
75
|
+
sumf += convert_float(x4[i].s2) * y4[i].s2;
|
76
|
+
sumf += convert_float(x4[i].s3) * y4[i].s3;
|
77
|
+
}
|
78
|
+
|
79
|
+
float all_sum = sub_group_reduce_add(sumf);
|
80
|
+
if (get_sub_group_local_id() == 0) {
|
81
|
+
dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
|
82
|
+
}
|
83
|
+
}
|
84
|
+
}
|
@@ -0,0 +1,118 @@
|
|
1
|
+
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
2
|
+
|
3
|
+
#ifdef cl_intel_subgroups
|
4
|
+
#pragma OPENCL EXTENSION cl_intel_subgroups : enable
|
5
|
+
#else
|
6
|
+
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
|
7
|
+
#endif
|
8
|
+
|
9
|
+
#ifdef cl_intel_required_subgroup_size
|
10
|
+
#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
|
11
|
+
#define INTEL_GPU 1
|
12
|
+
#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
|
13
|
+
#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
|
14
|
+
#elif defined(cl_qcom_reqd_sub_group_size)
|
15
|
+
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
|
16
|
+
#define ADRENO_GPU 1
|
17
|
+
#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
|
18
|
+
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
|
19
|
+
#endif
|
20
|
+
|
21
|
+
#define N_F32_F32 4
|
22
|
+
|
23
|
+
#ifdef ADRENO_GPU
|
24
|
+
REQD_SUBGROUP_SIZE_64
|
25
|
+
#endif
|
26
|
+
kernel void kernel_mul_mat_f32_f32(
|
27
|
+
global char * src0,
|
28
|
+
ulong offset0,
|
29
|
+
global char * src1,
|
30
|
+
ulong offset1,
|
31
|
+
global float * dst,
|
32
|
+
ulong offsetd,
|
33
|
+
int ne00,
|
34
|
+
int ne01,
|
35
|
+
int ne02,
|
36
|
+
ulong nb00,
|
37
|
+
ulong nb01,
|
38
|
+
ulong nb02,
|
39
|
+
ulong nb03,
|
40
|
+
int ne10,
|
41
|
+
int ne11,
|
42
|
+
int ne12,
|
43
|
+
ulong nb10,
|
44
|
+
ulong nb11,
|
45
|
+
ulong nb12,
|
46
|
+
ulong nb13,
|
47
|
+
int ne0,
|
48
|
+
int ne1,
|
49
|
+
int r2,
|
50
|
+
int r3
|
51
|
+
) {
|
52
|
+
src0 = (global char*)((global char*)src0 + offset0);
|
53
|
+
src1 = (global char*)((global char*)src1 + offset1);
|
54
|
+
dst = (global float*)((global char*)dst + offsetd);
|
55
|
+
|
56
|
+
int r0 = get_group_id(0);
|
57
|
+
int rb = get_group_id(1)*N_F32_F32;
|
58
|
+
int im = get_group_id(2);
|
59
|
+
|
60
|
+
int i12 = im%ne12;
|
61
|
+
int i13 = im/ne12;
|
62
|
+
|
63
|
+
ulong offset_src0 = r0*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03;
|
64
|
+
|
65
|
+
global float * x = (global float *) (src0 + offset_src0);
|
66
|
+
|
67
|
+
if (ne00 < 128) {
|
68
|
+
for (int row = 0; row < N_F32_F32; ++row) {
|
69
|
+
int r1 = rb + row;
|
70
|
+
if (r1 >= ne11) {
|
71
|
+
break;
|
72
|
+
}
|
73
|
+
|
74
|
+
ulong offset_src1 = r1*nb11 + (i12 )*nb12 + (i13 )*nb13;
|
75
|
+
|
76
|
+
global float * y = (global float *) (src1 + offset_src1);
|
77
|
+
|
78
|
+
float sumf = 0;
|
79
|
+
for (int i = get_sub_group_local_id(); i < ne00; i += get_max_sub_group_size()) {
|
80
|
+
sumf += (float) x[i] * (float) y[i];
|
81
|
+
}
|
82
|
+
|
83
|
+
float all_sum = sub_group_reduce_add(sumf);
|
84
|
+
if (get_sub_group_local_id() == 0) {
|
85
|
+
dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
|
86
|
+
}
|
87
|
+
}
|
88
|
+
} else {
|
89
|
+
global float4 * x4 = (global float4 *)x;
|
90
|
+
for (int row = 0; row < N_F32_F32; ++row) {
|
91
|
+
int r1 = rb + row;
|
92
|
+
if (r1 >= ne11) {
|
93
|
+
break;
|
94
|
+
}
|
95
|
+
|
96
|
+
ulong offset_src1 = r1*nb11 + (i12 )*nb12 + (i13 )*nb13;
|
97
|
+
|
98
|
+
global float * y = (global float *) (src1 + offset_src1);
|
99
|
+
global float4 * y4 = (global float4 *) y;
|
100
|
+
|
101
|
+
float sumf = 0;
|
102
|
+
for (int i = get_sub_group_local_id(); i < ne00/4; i += get_max_sub_group_size()) {
|
103
|
+
sumf += (float) x4[i].s0 * y4[i].s0;
|
104
|
+
sumf += (float) x4[i].s1 * y4[i].s1;
|
105
|
+
sumf += (float) x4[i].s2 * y4[i].s2;
|
106
|
+
sumf += (float) x4[i].s3 * y4[i].s3;
|
107
|
+
}
|
108
|
+
|
109
|
+
float all_sum = sub_group_reduce_add(sumf);
|
110
|
+
if (get_sub_group_local_id() == 0) {
|
111
|
+
for (int i = 4*(ne00/4); i < ne00; ++i) {
|
112
|
+
all_sum += (float) x[i] * y[i];
|
113
|
+
}
|
114
|
+
dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
|
115
|
+
}
|
116
|
+
}
|
117
|
+
}
|
118
|
+
}
|
@@ -0,0 +1,192 @@
|
|
1
|
+
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
2
|
+
|
3
|
+
#ifdef cl_intel_subgroups
|
4
|
+
#pragma OPENCL EXTENSION cl_intel_subgroups : enable
|
5
|
+
#else
|
6
|
+
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
|
7
|
+
#endif
|
8
|
+
|
9
|
+
#ifdef cl_intel_required_subgroup_size
|
10
|
+
#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
|
11
|
+
#define INTEL_GPU 1
|
12
|
+
#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
|
13
|
+
#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
|
14
|
+
#elif defined(cl_qcom_reqd_sub_group_size)
|
15
|
+
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
|
16
|
+
#define ADRENO_GPU 1
|
17
|
+
#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
|
18
|
+
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
|
19
|
+
#endif
|
20
|
+
|
21
|
+
#define QK4_0 32
|
22
|
+
#define QR4_0 2
|
23
|
+
#define QK4_1 32
|
24
|
+
#define QR4_1 2
|
25
|
+
#define QK5_0 32
|
26
|
+
#define QR5_0 2
|
27
|
+
#define QK5_1 32
|
28
|
+
#define QR5_1 2
|
29
|
+
#define QK8_0 32
|
30
|
+
#define QR8_0 1
|
31
|
+
#define QK_K 256
|
32
|
+
#define K_QUANTS_PER_ITERATION 2
|
33
|
+
|
34
|
+
typedef char int8_t;
|
35
|
+
typedef uchar uint8_t;
|
36
|
+
typedef short int16_t;
|
37
|
+
typedef ushort uint16_t;
|
38
|
+
typedef int int32_t;
|
39
|
+
typedef uint uint32_t;
|
40
|
+
|
41
|
+
//------------------------------------------------------------------------------
|
42
|
+
// block_q4_0
|
43
|
+
//------------------------------------------------------------------------------
|
44
|
+
struct block_q4_0
|
45
|
+
{
|
46
|
+
half d;
|
47
|
+
uint8_t qs[QK4_0 / 2];
|
48
|
+
};
|
49
|
+
|
50
|
+
//------------------------------------------------------------------------------
|
51
|
+
// mul_vec_q_n_f32
|
52
|
+
//------------------------------------------------------------------------------
|
53
|
+
// function for calculate inner product between half a q4_0 block and 16 floats (yl), sumy is SUM(yl[i])
|
54
|
+
// il indicates where the q4 quants begin (0 or QK4_0/4)
|
55
|
+
// we assume that the yl's have been multiplied with the appropriate scale factor
|
56
|
+
// that corresponds to the missing bit shifts (1, 1/16, 1/256, 1/4096)
|
57
|
+
inline float block_q_4_0_dot_y(
|
58
|
+
global struct block_q4_0 * qb_curr,
|
59
|
+
float sumy,
|
60
|
+
private float * yl,
|
61
|
+
int il
|
62
|
+
) {
|
63
|
+
float d = qb_curr->d;
|
64
|
+
float2 acc = 0.f;
|
65
|
+
global ushort * qs = ((global ushort *)qb_curr + 1 + il/2);
|
66
|
+
for (int i = 0; i < 8; i+=2) {
|
67
|
+
acc.s0 += yl[i + 0] * (qs[i / 2] & 0x000F)
|
68
|
+
+ yl[i + 1] * (qs[i / 2] & 0x0F00);
|
69
|
+
acc.s1 += yl[i + 8] * (qs[i / 2] & 0x00F0)
|
70
|
+
+ yl[i + 9] * (qs[i / 2] & 0xF000);
|
71
|
+
}
|
72
|
+
return d * (sumy * -8.f + acc.s0 + acc.s1);
|
73
|
+
}
|
74
|
+
|
75
|
+
#ifdef INTEL_GPU
|
76
|
+
#define N_DST 4 // each SIMD group works on 4 rows
|
77
|
+
#define N_SIMDGROUP 1 // number of SIMD groups in a thread group
|
78
|
+
#define N_SIMDWIDTH 16 // assuming SIMD group size is 16
|
79
|
+
#elif defined (ADRENO_GPU)
|
80
|
+
#define N_DST 4
|
81
|
+
#define N_SIMDGROUP 1
|
82
|
+
#define N_SIMDWIDTH 64
|
83
|
+
#endif
|
84
|
+
|
85
|
+
inline void mul_vec_q_n_f32(
|
86
|
+
global void * src0,
|
87
|
+
global float * src1,
|
88
|
+
global float * dst,
|
89
|
+
int ne00,
|
90
|
+
int ne01,
|
91
|
+
int ne02,
|
92
|
+
int ne10,
|
93
|
+
int ne12,
|
94
|
+
int ne0,
|
95
|
+
int ne1,
|
96
|
+
int r2,
|
97
|
+
int r3
|
98
|
+
) {
|
99
|
+
|
100
|
+
const ulong nb = ne00/QK4_0;
|
101
|
+
|
102
|
+
int r0 = get_group_id(0);
|
103
|
+
int r1 = get_group_id(1);
|
104
|
+
int im = get_group_id(2);
|
105
|
+
|
106
|
+
// (r0 * N_SIMDGROUP + get_sub_group_id()) is essenatially the linear global
|
107
|
+
// id of a SIMD group in the grid.
|
108
|
+
int first_row = (r0 * N_SIMDGROUP + get_sub_group_id()) * N_DST;
|
109
|
+
|
110
|
+
int i12 = im%ne12;
|
111
|
+
int i13 = im/ne12;
|
112
|
+
|
113
|
+
ulong offset0 = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
|
114
|
+
|
115
|
+
global struct block_q4_0 * x = (global struct block_q4_0 *) src0 + offset0;
|
116
|
+
global float * y = (global float *) src1 + r1*ne10 + im*ne00*ne1;
|
117
|
+
|
118
|
+
float yl[16]; // src1 vector cache
|
119
|
+
float sumf[N_DST]={0.f};
|
120
|
+
|
121
|
+
int ix = get_sub_group_local_id()/2;
|
122
|
+
int il = 8*(get_sub_group_local_id()%2);
|
123
|
+
|
124
|
+
global float * yb = y + ix * QK4_0 + il;
|
125
|
+
|
126
|
+
// each thread in a SIMD group deals with half a block.
|
127
|
+
for (int ib = ix; ib < nb; ib += N_SIMDWIDTH/2) {
|
128
|
+
float sumy = 0;
|
129
|
+
for (int i = 0; i < 8; i += 2) {
|
130
|
+
sumy += yb[i] + yb[i+1];
|
131
|
+
yl[i+0] = yb[i+ 0];
|
132
|
+
yl[i+1] = yb[i+ 1]/256.f;
|
133
|
+
sumy += yb[i+16] + yb[i+17];
|
134
|
+
yl[i+8] = yb[i+16]/16.f;
|
135
|
+
yl[i+9] = yb[i+17]/4096.f;
|
136
|
+
}
|
137
|
+
|
138
|
+
for (int row = 0; row < N_DST; row++) {
|
139
|
+
sumf[row] += block_q_4_0_dot_y(x+ib+row*nb, sumy, yl, il);
|
140
|
+
}
|
141
|
+
|
142
|
+
// One thread in a SIMD group (i.e., subgroup) handles a half block,
|
143
|
+
// hence then entire SIMD group handles SIMDWIDTH/2 blocks.
|
144
|
+
// y points to the activation matrix (of type float). Therefore for
|
145
|
+
// one thread, the # of blocks y should advance is SIMDWIDTH/2 (because
|
146
|
+
// SIMDWIDTH/2 blocks are processed by a SIMD group) - in terms of
|
147
|
+
// floats, it is QK4_0 * (SIMDWIDTH/2), where QK4_0 is the block size.
|
148
|
+
yb += QK4_0 * (N_SIMDWIDTH/2);
|
149
|
+
}
|
150
|
+
|
151
|
+
// The above does not work for Adreno - it produces incorrect results for
|
152
|
+
// row = 1, 2, 3 and only row = 0 gives the correct result.
|
153
|
+
// If N_DST is changed, the below array must be initialized accordingly.
|
154
|
+
// This also seems to perform better on Intel.
|
155
|
+
float tot[N_DST] = {
|
156
|
+
sub_group_reduce_add(sumf[0]), sub_group_reduce_add(sumf[1]),
|
157
|
+
sub_group_reduce_add(sumf[2]), sub_group_reduce_add(sumf[3])};
|
158
|
+
for (int row = 0; row < N_DST; ++row) {
|
159
|
+
if (get_sub_group_local_id() == 0 && first_row + row < ne01) {
|
160
|
+
dst[r1*ne0 + im*ne0*ne1 + first_row + row] = tot[row];
|
161
|
+
}
|
162
|
+
}
|
163
|
+
}
|
164
|
+
|
165
|
+
#ifdef INTEL_GPU
|
166
|
+
REQD_SUBGROUP_SIZE_16
|
167
|
+
#elif defined (ADRENO_GPU)
|
168
|
+
REQD_SUBGROUP_SIZE_64
|
169
|
+
#endif
|
170
|
+
kernel void kernel_mul_mat_q4_0_f32(
|
171
|
+
global void * src0,
|
172
|
+
ulong offset0,
|
173
|
+
global float * src1,
|
174
|
+
ulong offset1,
|
175
|
+
global float * dst,
|
176
|
+
ulong offsetd,
|
177
|
+
int ne00,
|
178
|
+
int ne01,
|
179
|
+
int ne02,
|
180
|
+
int ne10,
|
181
|
+
int ne12,
|
182
|
+
int ne0,
|
183
|
+
int ne1,
|
184
|
+
int r2,
|
185
|
+
int r3
|
186
|
+
) {
|
187
|
+
src0 = (global void*)((global char*)src0 + offset0);
|
188
|
+
src1 = (global float*)((global char*)src1 + offset1);
|
189
|
+
dst = (global float*)((global char*)dst + offsetd);
|
190
|
+
|
191
|
+
mul_vec_q_n_f32(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3);
|
192
|
+
}
|