whispercpp 1.3.1 → 1.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +4 -3
- data/README.md +92 -31
- data/Rakefile +26 -7
- data/ext/.gitignore +5 -7
- data/ext/dependencies.rb +61 -0
- data/ext/extconf.rb +21 -198
- data/ext/options.rb +221 -0
- data/ext/ruby_whisper.c +159 -0
- data/ext/ruby_whisper.h +17 -2
- data/ext/ruby_whisper_context.c +641 -0
- data/ext/ruby_whisper_error.c +52 -0
- data/ext/ruby_whisper_model.c +232 -0
- data/ext/ruby_whisper_params.c +1301 -0
- data/ext/ruby_whisper_segment.c +143 -0
- data/ext/ruby_whisper_transcribe.cpp +87 -0
- data/ext/ruby_whisper_vad_params.c +288 -0
- data/ext/sources/.dockerignore +3 -0
- data/ext/sources/.github/workflows/bindings-ruby.yml +21 -0
- data/ext/sources/CMakeGraphVizOptions.cmake +8 -0
- data/ext/sources/CMakeLists.txt +251 -0
- data/ext/sources/bindings/javascript/CMakeLists.txt +41 -0
- data/ext/sources/bindings/javascript/emscripten.cpp +93 -0
- data/ext/sources/bindings/javascript/libwhisper.worker.js +1 -0
- data/ext/sources/bindings/javascript/package-tmpl.json +26 -0
- data/ext/sources/bindings/javascript/package.json +26 -0
- data/ext/sources/bindings/javascript/whisper.js +19 -0
- data/ext/sources/build-xcframework.sh +547 -0
- data/ext/sources/ci/run.sh +336 -0
- data/ext/sources/close-issue.yml +28 -0
- data/ext/sources/cmake/DefaultTargetOptions.cmake +16 -0
- data/ext/sources/cmake/FindFFmpeg.cmake +163 -0
- data/ext/sources/cmake/build-info.cmake +60 -0
- data/ext/sources/cmake/git-vars.cmake +22 -0
- data/ext/sources/cmake/whisper-config.cmake.in +65 -0
- data/ext/sources/cmake/whisper.pc.in +10 -0
- data/ext/sources/examples/CMakeLists.txt +124 -0
- data/ext/sources/examples/addon.node/CMakeLists.txt +31 -0
- data/ext/sources/examples/addon.node/__test__/whisper.spec.js +37 -0
- data/ext/sources/examples/addon.node/addon.cpp +438 -0
- data/ext/sources/examples/addon.node/index.js +54 -0
- data/ext/sources/examples/addon.node/package.json +16 -0
- data/ext/sources/examples/bench/CMakeLists.txt +8 -0
- data/ext/sources/examples/bench/bench.cpp +175 -0
- data/ext/sources/examples/bench.wasm/CMakeLists.txt +49 -0
- data/ext/sources/examples/bench.wasm/emscripten.cpp +87 -0
- data/ext/sources/examples/bench.wasm/index-tmpl.html +284 -0
- data/ext/sources/examples/cli/CMakeLists.txt +8 -0
- data/ext/sources/examples/cli/cli.cpp +1294 -0
- data/ext/sources/examples/coi-serviceworker.js +146 -0
- data/ext/sources/examples/command/CMakeLists.txt +10 -0
- data/ext/sources/examples/command/command.cpp +776 -0
- data/ext/sources/examples/command/commands.txt +9 -0
- data/ext/sources/examples/command.wasm/CMakeLists.txt +50 -0
- data/ext/sources/examples/command.wasm/emscripten.cpp +327 -0
- data/ext/sources/examples/command.wasm/index-tmpl.html +414 -0
- data/ext/sources/examples/common-ggml.cpp +238 -0
- data/ext/sources/examples/common-ggml.h +18 -0
- data/ext/sources/examples/common-sdl.cpp +227 -0
- data/ext/sources/examples/common-sdl.h +49 -0
- data/ext/sources/examples/common-whisper.cpp +168 -0
- data/ext/sources/examples/common-whisper.h +24 -0
- data/ext/sources/examples/common.cpp +675 -0
- data/ext/sources/examples/common.h +322 -0
- data/ext/sources/examples/deprecation-warning/CMakeLists.txt +6 -0
- data/ext/sources/examples/deprecation-warning/deprecation-warning.cpp +38 -0
- data/ext/sources/examples/ffmpeg-transcode.cpp +368 -0
- data/ext/sources/examples/generate-karaoke.sh +57 -0
- data/ext/sources/examples/grammar-parser.cpp +423 -0
- data/ext/sources/examples/grammar-parser.h +29 -0
- data/ext/sources/examples/helpers.js +191 -0
- data/ext/sources/examples/json.hpp +24596 -0
- data/ext/sources/examples/livestream.sh +112 -0
- data/ext/sources/examples/lsp/CMakeLists.txt +9 -0
- data/ext/sources/examples/lsp/lsp.cpp +467 -0
- data/ext/sources/examples/lsp/whisper.vim +362 -0
- data/ext/sources/examples/miniaudio.h +93468 -0
- data/ext/sources/examples/python/test_whisper_processor.py +7 -0
- data/ext/sources/examples/python/whisper_processor.py +54 -0
- data/ext/sources/examples/quantize/CMakeLists.txt +6 -0
- data/ext/sources/examples/quantize/quantize.cpp +223 -0
- data/ext/sources/examples/server/CMakeLists.txt +12 -0
- data/ext/sources/examples/server/bench.js +29 -0
- data/ext/sources/examples/server/httplib.h +10497 -0
- data/ext/sources/examples/server/server.cpp +1091 -0
- data/ext/sources/examples/server.py +115 -0
- data/ext/sources/examples/stb_vorbis.c +5584 -0
- data/ext/sources/examples/stream/CMakeLists.txt +10 -0
- data/ext/sources/examples/stream/stream.cpp +429 -0
- data/ext/sources/examples/stream.wasm/CMakeLists.txt +49 -0
- data/ext/sources/examples/stream.wasm/emscripten.cpp +216 -0
- data/ext/sources/examples/stream.wasm/index-tmpl.html +414 -0
- data/ext/sources/examples/sycl/CMakeLists.txt +9 -0
- data/ext/sources/examples/sycl/build.sh +22 -0
- data/ext/sources/examples/sycl/ls-sycl-device.cpp +11 -0
- data/ext/sources/examples/sycl/run-whisper.sh +17 -0
- data/ext/sources/examples/talk-llama/CMakeLists.txt +40 -0
- data/ext/sources/examples/talk-llama/eleven-labs.py +80 -0
- data/ext/sources/examples/talk-llama/llama-adapter.cpp +388 -0
- data/ext/sources/examples/talk-llama/llama-adapter.h +76 -0
- data/ext/sources/examples/talk-llama/llama-arch.cpp +1746 -0
- data/ext/sources/examples/talk-llama/llama-arch.h +437 -0
- data/ext/sources/examples/talk-llama/llama-batch.cpp +374 -0
- data/ext/sources/examples/talk-llama/llama-batch.h +89 -0
- data/ext/sources/examples/talk-llama/llama-chat.cpp +663 -0
- data/ext/sources/examples/talk-llama/llama-chat.h +58 -0
- data/ext/sources/examples/talk-llama/llama-context.cpp +2676 -0
- data/ext/sources/examples/talk-llama/llama-context.h +276 -0
- data/ext/sources/examples/talk-llama/llama-cparams.cpp +5 -0
- data/ext/sources/examples/talk-llama/llama-cparams.h +41 -0
- data/ext/sources/examples/talk-llama/llama-grammar.cpp +1229 -0
- data/ext/sources/examples/talk-llama/llama-grammar.h +173 -0
- data/ext/sources/examples/talk-llama/llama-graph.cpp +1618 -0
- data/ext/sources/examples/talk-llama/llama-graph.h +640 -0
- data/ext/sources/examples/talk-llama/llama-hparams.cpp +95 -0
- data/ext/sources/examples/talk-llama/llama-hparams.h +190 -0
- data/ext/sources/examples/talk-llama/llama-impl.cpp +167 -0
- data/ext/sources/examples/talk-llama/llama-impl.h +61 -0
- data/ext/sources/examples/talk-llama/llama-io.cpp +15 -0
- data/ext/sources/examples/talk-llama/llama-io.h +35 -0
- data/ext/sources/examples/talk-llama/llama-kv-cache.cpp +2739 -0
- data/ext/sources/examples/talk-llama/llama-kv-cache.h +502 -0
- data/ext/sources/examples/talk-llama/llama-kv-cells.h +379 -0
- data/ext/sources/examples/talk-llama/llama-memory.cpp +1 -0
- data/ext/sources/examples/talk-llama/llama-memory.h +32 -0
- data/ext/sources/examples/talk-llama/llama-mmap.cpp +600 -0
- data/ext/sources/examples/talk-llama/llama-mmap.h +68 -0
- data/ext/sources/examples/talk-llama/llama-model-loader.cpp +1138 -0
- data/ext/sources/examples/talk-llama/llama-model-loader.h +169 -0
- data/ext/sources/examples/talk-llama/llama-model-saver.cpp +281 -0
- data/ext/sources/examples/talk-llama/llama-model-saver.h +37 -0
- data/ext/sources/examples/talk-llama/llama-model.cpp +13814 -0
- data/ext/sources/examples/talk-llama/llama-model.h +425 -0
- data/ext/sources/examples/talk-llama/llama-quant.cpp +966 -0
- data/ext/sources/examples/talk-llama/llama-quant.h +1 -0
- data/ext/sources/examples/talk-llama/llama-sampling.cpp +2575 -0
- data/ext/sources/examples/talk-llama/llama-sampling.h +32 -0
- data/ext/sources/examples/talk-llama/llama-vocab.cpp +3340 -0
- data/ext/sources/examples/talk-llama/llama-vocab.h +131 -0
- data/ext/sources/examples/talk-llama/llama.cpp +354 -0
- data/ext/sources/examples/talk-llama/llama.h +1377 -0
- data/ext/sources/examples/talk-llama/prompts/talk-alpaca.txt +23 -0
- data/ext/sources/examples/talk-llama/speak +40 -0
- data/ext/sources/examples/talk-llama/speak.bat +1 -0
- data/ext/sources/examples/talk-llama/speak.ps1 +14 -0
- data/ext/sources/examples/talk-llama/talk-llama.cpp +808 -0
- data/ext/sources/examples/talk-llama/unicode-data.cpp +7034 -0
- data/ext/sources/examples/talk-llama/unicode-data.h +20 -0
- data/ext/sources/examples/talk-llama/unicode.cpp +849 -0
- data/ext/sources/examples/talk-llama/unicode.h +66 -0
- data/ext/sources/examples/vad-speech-segments/CMakeLists.txt +8 -0
- data/ext/sources/examples/vad-speech-segments/speech.cpp +143 -0
- data/ext/sources/examples/wchess/CMakeLists.txt +10 -0
- data/ext/sources/examples/wchess/libwchess/CMakeLists.txt +19 -0
- data/ext/sources/examples/wchess/libwchess/Chessboard.cpp +803 -0
- data/ext/sources/examples/wchess/libwchess/Chessboard.h +33 -0
- data/ext/sources/examples/wchess/libwchess/WChess.cpp +193 -0
- data/ext/sources/examples/wchess/libwchess/WChess.h +63 -0
- data/ext/sources/examples/wchess/libwchess/test-chessboard.cpp +117 -0
- data/ext/sources/examples/wchess/wchess.cmd/CMakeLists.txt +8 -0
- data/ext/sources/examples/wchess/wchess.cmd/wchess.cmd.cpp +249 -0
- data/ext/sources/examples/whisper.wasm/CMakeLists.txt +50 -0
- data/ext/sources/examples/whisper.wasm/emscripten.cpp +118 -0
- data/ext/sources/examples/whisper.wasm/index-tmpl.html +658 -0
- data/ext/sources/ggml/CMakeLists.txt +390 -0
- data/ext/sources/ggml/cmake/BuildTypes.cmake +54 -0
- data/ext/sources/ggml/cmake/GitVars.cmake +22 -0
- data/ext/sources/ggml/cmake/common.cmake +26 -0
- data/ext/sources/ggml/cmake/ggml-config.cmake.in +152 -0
- data/ext/{ggml → sources/ggml}/include/ggml-alloc.h +1 -1
- data/ext/{ggml → sources/ggml}/include/ggml-backend.h +9 -7
- data/ext/{ggml → sources/ggml}/include/ggml-cpp.h +2 -1
- data/ext/{ggml → sources/ggml}/include/ggml-cpu.h +9 -1
- data/ext/{ggml → sources/ggml}/include/ggml-metal.h +1 -1
- data/ext/{ggml → sources/ggml}/include/ggml-opt.h +49 -28
- data/ext/{ggml → sources/ggml}/include/ggml-rpc.h +6 -1
- data/ext/{ggml → sources/ggml}/include/ggml-vulkan.h +0 -2
- data/ext/{ggml → sources/ggml}/include/ggml.h +182 -265
- data/ext/sources/ggml/include/gguf.h +202 -0
- data/ext/sources/ggml/src/CMakeLists.txt +346 -0
- data/ext/{ggml → sources/ggml}/src/ggml-alloc.c +34 -29
- data/ext/sources/ggml/src/ggml-amx/CMakeLists.txt +107 -0
- data/ext/{ggml → sources/ggml}/src/ggml-backend-impl.h +1 -2
- data/ext/{ggml → sources/ggml}/src/ggml-backend-reg.cpp +87 -53
- data/ext/{ggml → sources/ggml}/src/ggml-backend.cpp +26 -14
- data/ext/sources/ggml/src/ggml-blas/CMakeLists.txt +87 -0
- data/ext/sources/ggml/src/ggml-cann/CMakeLists.txt +74 -0
- data/ext/sources/ggml/src/ggml-cann/Doxyfile +2579 -0
- data/ext/{ggml → sources/ggml}/src/ggml-cann/acl_tensor.cpp +10 -4
- data/ext/{ggml → sources/ggml}/src/ggml-cann/acl_tensor.h +5 -5
- data/ext/{ggml → sources/ggml}/src/ggml-cann/aclnn_ops.cpp +1272 -1506
- data/ext/sources/ggml/src/ggml-cann/aclnn_ops.h +1125 -0
- data/ext/{ggml → sources/ggml}/src/ggml-cann/common.h +135 -1
- data/ext/{ggml → sources/ggml}/src/ggml-cann/ggml-cann.cpp +564 -146
- data/ext/sources/ggml/src/ggml-cann/kernels/CMakeLists.txt +30 -0
- data/ext/{ggml → sources/ggml}/src/ggml-cann/kernels/dup.cpp +3 -5
- data/ext/{ggml → sources/ggml}/src/ggml-common.h +12 -8
- data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +504 -0
- data/ext/{ggml → sources/ggml}/src/ggml-cpu/amx/amx.cpp +2 -1
- data/ext/sources/ggml/src/ggml-cpu/binary-ops.cpp +158 -0
- data/ext/sources/ggml/src/ggml-cpu/binary-ops.h +16 -0
- data/ext/sources/ggml/src/ggml-cpu/cmake/FindSIMD.cmake +100 -0
- data/ext/sources/ggml/src/ggml-cpu/common.h +72 -0
- data/ext/{ggml → sources/ggml}/src/ggml-cpu/cpu-feats-x86.cpp +5 -1
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +6431 -0
- data/ext/{ggml → sources/ggml}/src/ggml-cpu/ggml-cpu-impl.h +163 -41
- data/ext/{ggml → sources/ggml}/src/ggml-cpu/ggml-cpu-quants.c +4029 -1117
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +3510 -0
- data/ext/{ggml → sources/ggml}/src/ggml-cpu/ggml-cpu.cpp +67 -18
- data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.cpp +337 -0
- data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.h +95 -0
- data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +482 -0
- data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.h +17 -0
- data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +3544 -0
- data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.h +14 -0
- data/ext/sources/ggml/src/ggml-cpu/ops.cpp +8903 -0
- data/ext/sources/ggml/src/ggml-cpu/ops.h +110 -0
- data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +892 -0
- data/ext/sources/ggml/src/ggml-cpu/unary-ops.cpp +186 -0
- data/ext/sources/ggml/src/ggml-cpu/unary-ops.h +28 -0
- data/ext/sources/ggml/src/ggml-cpu/vec.cpp +252 -0
- data/ext/sources/ggml/src/ggml-cpu/vec.h +818 -0
- data/ext/sources/ggml/src/ggml-cuda/CMakeLists.txt +184 -0
- data/ext/sources/ggml/src/ggml-cuda/acc.cu +61 -0
- data/ext/sources/ggml/src/ggml-cuda/acc.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/arange.cu +34 -0
- data/ext/sources/ggml/src/ggml-cuda/arange.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/argmax.cu +91 -0
- data/ext/sources/ggml/src/ggml-cuda/argmax.cuh +3 -0
- data/ext/sources/ggml/src/ggml-cuda/argsort.cu +104 -0
- data/ext/sources/ggml/src/ggml-cuda/argsort.cuh +3 -0
- data/ext/sources/ggml/src/ggml-cuda/binbcast.cu +363 -0
- data/ext/sources/ggml/src/ggml-cuda/binbcast.cuh +9 -0
- data/ext/sources/ggml/src/ggml-cuda/clamp.cu +45 -0
- data/ext/sources/ggml/src/ggml-cuda/clamp.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/common.cuh +828 -0
- data/ext/sources/ggml/src/ggml-cuda/concat.cu +221 -0
- data/ext/sources/ggml/src/ggml-cuda/concat.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/conv-transpose-1d.cu +89 -0
- data/ext/sources/ggml/src/ggml-cuda/conv-transpose-1d.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/convert.cu +730 -0
- data/ext/sources/ggml/src/ggml-cuda/convert.cuh +26 -0
- data/ext/sources/ggml/src/ggml-cuda/count-equal.cu +64 -0
- data/ext/sources/ggml/src/ggml-cuda/count-equal.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/cp-async.cuh +57 -0
- data/ext/sources/ggml/src/ggml-cuda/cpy.cu +705 -0
- data/ext/sources/ggml/src/ggml-cuda/cpy.cuh +11 -0
- data/ext/sources/ggml/src/ggml-cuda/cross-entropy-loss.cu +189 -0
- data/ext/sources/ggml/src/ggml-cuda/cross-entropy-loss.cuh +7 -0
- data/ext/sources/ggml/src/ggml-cuda/dequantize.cuh +103 -0
- data/ext/sources/ggml/src/ggml-cuda/diagmask.cu +40 -0
- data/ext/sources/ggml/src/ggml-cuda/diagmask.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +881 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +1471 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f16.cu +357 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f16.cuh +3 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f32.cu +365 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f32.cuh +3 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn-vec-f16.cuh +482 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn-vec-f32.cuh +472 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +634 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cuh +3 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn.cu +346 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn.cuh +3 -0
- data/ext/sources/ggml/src/ggml-cuda/getrows.cu +275 -0
- data/ext/sources/ggml/src/ggml-cuda/getrows.cuh +15 -0
- data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +3505 -0
- data/ext/sources/ggml/src/ggml-cuda/gla.cu +93 -0
- data/ext/sources/ggml/src/ggml-cuda/gla.cuh +3 -0
- data/ext/sources/ggml/src/ggml-cuda/im2col.cu +103 -0
- data/ext/sources/ggml/src/ggml-cuda/im2col.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/mma.cuh +396 -0
- data/ext/sources/ggml/src/ggml-cuda/mmq.cu +324 -0
- data/ext/sources/ggml/src/ggml-cuda/mmq.cuh +3217 -0
- data/ext/sources/ggml/src/ggml-cuda/mmv.cu +336 -0
- data/ext/sources/ggml/src/ggml-cuda/mmv.cuh +12 -0
- data/ext/sources/ggml/src/ggml-cuda/mmvq.cu +595 -0
- data/ext/sources/ggml/src/ggml-cuda/mmvq.cuh +12 -0
- data/ext/sources/ggml/src/ggml-cuda/norm.cu +458 -0
- data/ext/sources/ggml/src/ggml-cuda/norm.cuh +11 -0
- data/ext/sources/ggml/src/ggml-cuda/opt-step-adamw.cu +78 -0
- data/ext/sources/ggml/src/ggml-cuda/opt-step-adamw.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/out-prod.cu +68 -0
- data/ext/sources/ggml/src/ggml-cuda/out-prod.cuh +3 -0
- data/ext/sources/ggml/src/ggml-cuda/pad.cu +49 -0
- data/ext/sources/ggml/src/ggml-cuda/pad.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/pool2d.cu +94 -0
- data/ext/sources/ggml/src/ggml-cuda/pool2d.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/quantize.cu +190 -0
- data/ext/sources/ggml/src/ggml-cuda/quantize.cuh +27 -0
- data/ext/sources/ggml/src/ggml-cuda/rope.cu +456 -0
- data/ext/sources/ggml/src/ggml-cuda/rope.cuh +7 -0
- data/ext/sources/ggml/src/ggml-cuda/scale.cu +31 -0
- data/ext/sources/ggml/src/ggml-cuda/scale.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/softmax.cu +283 -0
- data/ext/sources/ggml/src/ggml-cuda/softmax.cuh +7 -0
- data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cu +148 -0
- data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cuh +3 -0
- data/ext/sources/ggml/src/ggml-cuda/ssm-scan.cu +153 -0
- data/ext/sources/ggml/src/ggml-cuda/ssm-scan.cuh +3 -0
- data/ext/sources/ggml/src/ggml-cuda/sum.cu +45 -0
- data/ext/sources/ggml/src/ggml-cuda/sum.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/sumrows.cu +39 -0
- data/ext/sources/ggml/src/ggml-cuda/sumrows.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_16.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_8.cu +10 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_1.cu +10 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_2.cu +10 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_4.cu +10 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_16.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_4.cu +10 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_8.cu +10 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_32-ncols2_1.cu +10 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_32-ncols2_2.cu +10 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_16.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_2.cu +10 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_4.cu +10 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_8.cu +10 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_64-ncols2_1.cu +10 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_1.cu +10 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_2.cu +10 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_4.cu +10 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_8.cu +10 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +78 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-iq1_s.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_s.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_xs.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_xxs.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-iq3_s.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-iq3_xxs.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-iq4_nl.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-iq4_xs.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q2_k.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q3_k.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q4_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q4_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q4_k.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q5_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q5_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q5_k.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q6_k.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q8_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/tsembd.cu +47 -0
- data/ext/sources/ggml/src/ggml-cuda/tsembd.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/unary.cu +289 -0
- data/ext/sources/ggml/src/ggml-cuda/unary.cuh +59 -0
- data/ext/sources/ggml/src/ggml-cuda/upscale.cu +51 -0
- data/ext/sources/ggml/src/ggml-cuda/upscale.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/vecdotq.cuh +1135 -0
- data/ext/{ggml → sources/ggml}/src/ggml-cuda/vendors/cuda.h +1 -0
- data/ext/{ggml → sources/ggml}/src/ggml-cuda/vendors/hip.h +57 -0
- data/ext/{ggml → sources/ggml}/src/ggml-cuda/vendors/musa.h +7 -1
- data/ext/sources/ggml/src/ggml-cuda/wkv.cu +199 -0
- data/ext/sources/ggml/src/ggml-cuda/wkv.cuh +7 -0
- data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +131 -0
- data/ext/{ggml → sources/ggml}/src/ggml-impl.h +64 -19
- data/ext/sources/ggml/src/ggml-kompute/CMakeLists.txt +166 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/common.comp +112 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_add.comp +58 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_addrow.comp +25 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f16.comp +52 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f32.comp +52 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f16.comp +52 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f32.comp +52 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_diagmask.comp +30 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_gelu.comp +22 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows.comp +17 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f16.comp +31 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f32.comp +31 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_0.comp +38 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_1.comp +39 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q6_k.comp +44 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul.comp +52 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_f16.comp +69 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_mat_f32.comp +51 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_0.comp +33 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_1.comp +35 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_k.comp +140 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q6_k.comp +106 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q8_0.comp +73 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n.comp +52 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n_pre.comp +28 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_norm.comp +84 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_relu.comp +21 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rmsnorm.comp +53 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f16.comp +52 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f32.comp +52 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f16.comp +52 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f32.comp +52 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_scale.comp +19 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_scale_8.comp +23 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_silu.comp +22 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_softmax.comp +72 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/rope_common.comp +71 -0
- data/ext/sources/ggml/src/ggml-metal/CMakeLists.txt +120 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +622 -0
- data/ext/{ggml → sources/ggml}/src/ggml-metal/ggml-metal.m +2178 -1064
- data/ext/{ggml → sources/ggml}/src/ggml-metal/ggml-metal.metal +1575 -1218
- data/ext/sources/ggml/src/ggml-musa/CMakeLists.txt +113 -0
- data/ext/sources/ggml/src/ggml-musa/mudnn.cu +112 -0
- data/ext/sources/ggml/src/ggml-musa/mudnn.cuh +12 -0
- data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +96 -0
- data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +5124 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/add.cl +83 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/clamp.cl +20 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/cpy.cl +184 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/cvt.cl +118 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/diag_mask_inf.cl +58 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/embed_kernel.py +26 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gelu.cl +62 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle.cl +268 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_general.cl +274 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/get_rows.cl +163 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/im2col_f16.cl +57 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/im2col_f32.cl +57 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul.cl +79 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mat_Ab_Bi_8x4.cl +139 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_f16_f16.cl +118 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32.cl +118 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32_1row.cl +94 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32_l4.cl +84 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_f32_f32.cl +118 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32.cl +192 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_1d_16x_flat.cl +307 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_1d_8x_flat.cl +265 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_8x_flat.cl +272 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_v.cl +254 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q6_k.cl +190 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/norm.cl +81 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/relu.cl +16 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/rms_norm.cl +96 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/rope.cl +721 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/scale.cl +16 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/silu.cl +30 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_4_f16.cl +87 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_4_f32.cl +87 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_f16.cl +86 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_f32.cl +86 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/transpose.cl +84 -0
- data/ext/{ggml → sources/ggml}/src/ggml-opt.cpp +373 -190
- data/ext/{ggml → sources/ggml}/src/ggml-quants.c +114 -120
- data/ext/sources/ggml/src/ggml-rpc/CMakeLists.txt +9 -0
- data/ext/{ggml → sources/ggml}/src/ggml-rpc/ggml-rpc.cpp +480 -73
- data/ext/sources/ggml/src/ggml-sycl/CMakeLists.txt +189 -0
- data/ext/sources/ggml/src/ggml-sycl/backend.hpp +37 -0
- data/ext/sources/ggml/src/ggml-sycl/binbcast.cpp +345 -0
- data/ext/sources/ggml/src/ggml-sycl/binbcast.hpp +39 -0
- data/ext/{ggml → sources/ggml}/src/ggml-sycl/common.cpp +20 -32
- data/ext/sources/ggml/src/ggml-sycl/common.hpp +589 -0
- data/ext/{ggml → sources/ggml}/src/ggml-sycl/concat.cpp +32 -33
- data/ext/sources/ggml/src/ggml-sycl/concat.hpp +20 -0
- data/ext/{ggml → sources/ggml}/src/ggml-sycl/conv.cpp +4 -2
- data/ext/sources/ggml/src/ggml-sycl/conv.hpp +20 -0
- data/ext/{ggml → sources/ggml}/src/ggml-sycl/convert.cpp +104 -28
- data/ext/sources/ggml/src/ggml-sycl/convert.hpp +34 -0
- data/ext/sources/ggml/src/ggml-sycl/cpy.cpp +700 -0
- data/ext/sources/ggml/src/ggml-sycl/cpy.hpp +11 -0
- data/ext/sources/ggml/src/ggml-sycl/dequantize.hpp +791 -0
- data/ext/{ggml → sources/ggml}/src/ggml-sycl/dmmv.cpp +156 -17
- data/ext/sources/ggml/src/ggml-sycl/dmmv.hpp +27 -0
- data/ext/sources/ggml/src/ggml-sycl/dpct/helper.hpp +2957 -0
- data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +1511 -0
- data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +75 -0
- data/ext/sources/ggml/src/ggml-sycl/gemm.hpp +99 -0
- data/ext/sources/ggml/src/ggml-sycl/getrows.cpp +309 -0
- data/ext/sources/ggml/src/ggml-sycl/getrows.hpp +20 -0
- data/ext/{ggml → sources/ggml}/src/ggml-sycl/ggml-sycl.cpp +1004 -1240
- data/ext/sources/ggml/src/ggml-sycl/gla.cpp +106 -0
- data/ext/sources/ggml/src/ggml-sycl/gla.hpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/im2col.cpp +136 -0
- data/ext/sources/ggml/src/ggml-sycl/im2col.hpp +21 -0
- data/ext/{ggml → sources/ggml}/src/ggml-sycl/mmq.cpp +0 -1
- data/ext/sources/ggml/src/ggml-sycl/mmq.hpp +33 -0
- data/ext/{ggml → sources/ggml}/src/ggml-sycl/mmvq.cpp +261 -166
- data/ext/sources/ggml/src/ggml-sycl/mmvq.hpp +27 -0
- data/ext/{ggml → sources/ggml}/src/ggml-sycl/norm.cpp +204 -81
- data/ext/sources/ggml/src/ggml-sycl/norm.hpp +26 -0
- data/ext/{ggml → sources/ggml}/src/ggml-sycl/outprod.cpp +8 -17
- data/ext/sources/ggml/src/ggml-sycl/outprod.hpp +10 -0
- data/ext/sources/ggml/src/ggml-sycl/presets.hpp +74 -0
- data/ext/sources/ggml/src/ggml-sycl/quants.hpp +83 -0
- data/ext/sources/ggml/src/ggml-sycl/rope.cpp +361 -0
- data/ext/sources/ggml/src/ggml-sycl/rope.hpp +20 -0
- data/ext/{ggml → sources/ggml}/src/ggml-sycl/softmax.cpp +35 -25
- data/ext/sources/ggml/src/ggml-sycl/softmax.hpp +20 -0
- data/ext/sources/ggml/src/ggml-sycl/sycl_hw.cpp +13 -0
- data/ext/sources/ggml/src/ggml-sycl/sycl_hw.hpp +23 -0
- data/ext/{ggml → sources/ggml}/src/ggml-sycl/tsembd.cpp +3 -3
- data/ext/sources/ggml/src/ggml-sycl/tsembd.hpp +20 -0
- data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +1215 -0
- data/ext/sources/ggml/src/ggml-sycl/wkv.cpp +293 -0
- data/ext/sources/ggml/src/ggml-sycl/wkv.hpp +10 -0
- data/ext/sources/ggml/src/ggml-vulkan/CMakeLists.txt +196 -0
- data/ext/sources/ggml/src/ggml-vulkan/cmake/host-toolchain.cmake.in +15 -0
- data/ext/{ggml → sources/ggml}/src/ggml-vulkan/ggml-vulkan.cpp +3130 -1087
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +39 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/acc.comp +29 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add.comp +29 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argmax.comp +51 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp +69 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/clamp.comp +17 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/concat.comp +41 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/contig_copy.comp +49 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_dw.comp +105 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy.comp +23 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +51 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +242 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/cos.comp +17 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/count_equal.comp +31 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_f32.comp +20 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.comp +462 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.comp +699 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_head.comp +13 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_m.comp +42 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_s.comp +35 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_s.comp +44 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xs.comp +43 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xxs.comp +48 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_s.comp +39 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_xxs.comp +49 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_nl.comp +32 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_xs.comp +34 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q2_k.comp +34 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q3_k.comp +42 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_0.comp +30 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_1.comp +32 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_k.comp +68 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_0.comp +34 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_1.comp +35 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_k.comp +70 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q6_k.comp +33 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q8_0.comp +31 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/diag_mask_inf.comp +34 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/div.comp +27 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +337 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp +162 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +360 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +267 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +59 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gelu.comp +25 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gelu_quick.comp +23 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/generic_binary_head.comp +64 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/generic_head.comp +9 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/generic_unary_head.comp +76 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/get_rows.comp +33 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/get_rows_quant.comp +41 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/group_norm.comp +66 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +100 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/l2_norm.comp +41 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/leaky_relu.comp +22 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul.comp +27 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_split_k_reduce.comp +48 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp +169 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.comp +118 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_m.comp +82 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_s.comp +79 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_s.comp +90 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xs.comp +87 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xxs.comp +87 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_s.comp +90 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_xxs.comp +88 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_nc.comp +118 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_p021.comp +154 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp +130 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q3_k.comp +132 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp +136 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp +167 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp +130 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +868 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +441 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +442 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.comp +99 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/norm.comp +44 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_adamw.comp +42 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/pad.comp +28 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/pool2d.comp +74 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/quantize_q8_1.comp +77 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/relu.comp +21 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/repeat.comp +26 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/repeat_back.comp +37 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +52 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_back.comp +55 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.comp +58 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +60 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +43 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +43 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_vision.comp +47 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp +24 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sigmoid.comp +20 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/silu.comp +22 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/silu_back.comp +26 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sin.comp +17 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +173 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_back.comp +50 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/square.comp +17 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sub.comp +29 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.comp +37 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/tanh.comp +20 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/test_bfloat16_support.comp +7 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/test_coopmat2_support.comp +7 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/test_coopmat_support.comp +7 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/test_integer_dot_support.comp +7 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/timestep_embedding.comp +41 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/types.comp +1373 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +36 -0
- data/ext/{ggml → sources/ggml}/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +193 -35
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/wkv6.comp +87 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/wkv7.comp +91 -0
- data/ext/{ggml → sources/ggml}/src/ggml.c +676 -1820
- data/ext/sources/ggml/src/gguf.cpp +1330 -0
- data/ext/{include → sources/include}/whisper.h +68 -2
- data/ext/sources/src/CMakeLists.txt +143 -0
- data/ext/{src → sources/src}/coreml/whisper-decoder-impl.h +27 -15
- data/ext/{src → sources/src}/coreml/whisper-decoder-impl.m +35 -10
- data/ext/{src → sources/src}/coreml/whisper-encoder-impl.h +21 -9
- data/ext/{src → sources/src}/coreml/whisper-encoder-impl.m +28 -3
- data/ext/sources/src/coreml/whisper-encoder.mm +73 -0
- data/ext/sources/src/whisper-arch.h +197 -0
- data/ext/{src → sources/src}/whisper.cpp +1905 -374
- data/ext/sources/tests/CMakeLists.txt +105 -0
- data/ext/sources/tests/earnings21/eval.mk +58 -0
- data/ext/sources/tests/earnings21/eval.py +68 -0
- data/ext/sources/tests/earnings21/normalizers/__init__.py +2 -0
- data/ext/sources/tests/earnings21/normalizers/basic.py +80 -0
- data/ext/sources/tests/earnings21/normalizers/english.json +1741 -0
- data/ext/sources/tests/earnings21/normalizers/english.py +550 -0
- data/ext/sources/tests/earnings21/requirements.txt +6 -0
- data/ext/sources/tests/en-0-ref.txt +1 -0
- data/ext/sources/tests/en-1-ref.txt +1 -0
- data/ext/sources/tests/en-2-ref.txt +1 -0
- data/ext/sources/tests/es-0-ref.txt +1 -0
- data/ext/sources/tests/librispeech/eval.mk +39 -0
- data/ext/sources/tests/librispeech/eval.py +47 -0
- data/ext/sources/tests/librispeech/normalizers/__init__.py +2 -0
- data/ext/sources/tests/librispeech/normalizers/basic.py +80 -0
- data/ext/sources/tests/librispeech/normalizers/english.json +1741 -0
- data/ext/sources/tests/librispeech/normalizers/english.py +550 -0
- data/ext/sources/tests/librispeech/requirements.txt +6 -0
- data/ext/sources/tests/run-tests.sh +130 -0
- data/ext/sources/tests/test-c.c +3 -0
- data/ext/sources/tests/test-vad-full.cpp +54 -0
- data/ext/sources/tests/test-vad.cpp +83 -0
- data/ext/sources/tests/test-whisper.js +58 -0
- data/extsources.rb +33 -5
- data/lib/whisper/model/uri.rb +149 -128
- data/sig/whisper.rbs +480 -0
- data/tests/helper.rb +28 -0
- data/tests/test_callback.rb +45 -3
- data/tests/test_error.rb +2 -2
- data/tests/test_model.rb +38 -0
- data/tests/test_package.rb +18 -3
- data/tests/test_params.rb +145 -8
- data/tests/test_segment.rb +10 -19
- data/tests/test_vad.rb +19 -0
- data/tests/test_vad_params.rb +103 -0
- data/tests/test_whisper.rb +37 -37
- data/whispercpp.gemspec +5 -4
- metadata +766 -111
- data/ext/cpu.mk +0 -9
- data/ext/examples/dr_wav.h +0 -8815
- data/ext/ggml/src/ggml-cann/aclnn_ops.h +0 -592
- data/ext/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +0 -4262
- data/ext/ggml/src/ggml-cpu/ggml-cpu.c +0 -14123
- data/ext/ggml/src/ggml-cpu/llamafile/sgemm.cpp +0 -1884
- data/ext/ggml/src/ggml-cpu/llamafile/sgemm.h +0 -14
- data/ext/ggml/src/ggml-metal/ggml-metal-impl.h +0 -288
- data/ext/ggml/src/ggml-sycl/element_wise.cpp +0 -1030
- data/ext/ggml/src/ggml-sycl/im2col.cpp +0 -126
- data/ext/ggml/src/ggml-sycl/rope.cpp +0 -276
- data/ext/ggml/src/ggml-sycl/wkv6.cpp +0 -141
- data/ext/metal-embed.mk +0 -17
- data/ext/metal.mk +0 -6
- data/ext/ruby_whisper.cpp +0 -1909
- data/ext/scripts/get-flags.mk +0 -38
- data/lib/whisper.rb +0 -2
- /data/ext/{ggml → sources/ggml}/include/ggml-blas.h +0 -0
- /data/ext/{ggml → sources/ggml}/include/ggml-cann.h +0 -0
- /data/ext/{ggml → sources/ggml}/include/ggml-cuda.h +0 -0
- /data/ext/{ggml → sources/ggml}/include/ggml-kompute.h +0 -0
- /data/ext/{ggml → sources/ggml}/include/ggml-opencl.h +0 -0
- /data/ext/{ggml → sources/ggml}/include/ggml-sycl.h +0 -0
- /data/ext/{ggml → sources/ggml}/src/ggml-amx/common.h +0 -0
- /data/ext/{ggml → sources/ggml}/src/ggml-amx/ggml-amx.cpp +0 -0
- /data/ext/{ggml → sources/ggml}/src/ggml-amx/mmq.cpp +0 -0
- /data/ext/{ggml → sources/ggml}/src/ggml-amx/mmq.h +0 -0
- /data/ext/{ggml → sources/ggml}/src/ggml-blas/ggml-blas.cpp +0 -0
- /data/ext/{ggml → sources/ggml}/src/ggml-cann/kernels/ascendc_kernels.h +0 -0
- /data/ext/{ggml → sources/ggml}/src/ggml-cann/kernels/get_row_f16.cpp +0 -0
- /data/ext/{ggml → sources/ggml}/src/ggml-cann/kernels/get_row_f32.cpp +0 -0
- /data/ext/{ggml → sources/ggml}/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -0
- /data/ext/{ggml → sources/ggml}/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -0
- /data/ext/{ggml → sources/ggml}/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -0
- /data/ext/{ggml → sources/ggml}/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -0
- /data/ext/{ggml → sources/ggml}/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -0
- /data/ext/{ggml → sources/ggml}/src/ggml-cpu/amx/amx.h +0 -0
- /data/ext/{ggml → sources/ggml}/src/ggml-cpu/amx/common.h +0 -0
- /data/ext/{ggml → sources/ggml}/src/ggml-cpu/amx/mmq.cpp +0 -0
- /data/ext/{ggml → sources/ggml}/src/ggml-cpu/amx/mmq.h +0 -0
- /data/ext/{ggml → sources/ggml}/src/ggml-cpu/ggml-cpu-aarch64.h +0 -0
- /data/ext/{ggml → sources/ggml}/src/ggml-cpu/ggml-cpu-hbm.cpp +0 -0
- /data/ext/{ggml → sources/ggml}/src/ggml-cpu/ggml-cpu-hbm.h +0 -0
- /data/ext/{ggml → sources/ggml}/src/ggml-cpu/ggml-cpu-quants.h +0 -0
- /data/ext/{ggml → sources/ggml}/src/ggml-cpu/ggml-cpu-traits.cpp +0 -0
- /data/ext/{ggml → sources/ggml}/src/ggml-cpu/ggml-cpu-traits.h +0 -0
- /data/ext/{ggml → sources/ggml}/src/ggml-kompute/ggml-kompute.cpp +0 -0
- /data/ext/{ggml → sources/ggml}/src/ggml-quants.h +0 -0
- /data/ext/{ggml → sources/ggml}/src/ggml-threading.cpp +0 -0
- /data/ext/{ggml → sources/ggml}/src/ggml-threading.h +0 -0
- /data/ext/{src → sources/src}/coreml/whisper-encoder.h +0 -0
- /data/ext/{src → sources/src}/openvino/whisper-openvino-encoder.cpp +0 -0
- /data/ext/{src → sources/src}/openvino/whisper-openvino-encoder.h +0 -0
@@ -0,0 +1,892 @@
|
|
1
|
+
#pragma once
|
2
|
+
|
3
|
+
#include "ggml-cpu-impl.h"
|
4
|
+
|
5
|
+
//
|
6
|
+
// simd mappings
|
7
|
+
//
|
8
|
+
|
9
|
+
// we define a common set of C macros which map to specific intrinsics based on the current architecture
|
10
|
+
// we then implement the fundamental computation operations below using only these macros
|
11
|
+
// adding support for a new architecture requires defining the corresponding SIMD macros
|
12
|
+
//
|
13
|
+
// GGML_F32_STEP / GGML_F16_STEP
|
14
|
+
// number of elements to process in a single step
|
15
|
+
//
|
16
|
+
// GGML_F32_EPR / GGML_F16_EPR
|
17
|
+
// number of elements to fit in a single register
|
18
|
+
//
|
19
|
+
|
20
|
+
#if defined(__ARM_NEON) && defined(__ARM_FEATURE_FMA)
|
21
|
+
|
22
|
+
#define GGML_SIMD
|
23
|
+
|
24
|
+
// F32 NEON
|
25
|
+
|
26
|
+
#define GGML_F32_STEP 16
|
27
|
+
#define GGML_F32_EPR 4
|
28
|
+
|
29
|
+
#define GGML_F32x4 float32x4_t
|
30
|
+
#define GGML_F32x4_ZERO vdupq_n_f32(0.0f)
|
31
|
+
#define GGML_F32x4_SET1(x) vdupq_n_f32(x)
|
32
|
+
#define GGML_F32x4_LOAD vld1q_f32
|
33
|
+
#define GGML_F32x4_STORE vst1q_f32
|
34
|
+
// FMA(a, b, c) yields a + b*c: the accumulator is the first argument, which is
// also the operand order vfmaq_f32 expects, so the arguments pass straight through
#define GGML_F32x4_FMA(a, b, c) vfmaq_f32(a, b, c)
|
35
|
+
#define GGML_F32x4_ADD vaddq_f32
|
36
|
+
#define GGML_F32x4_MUL vmulq_f32
|
37
|
+
#define GGML_F32x4_REDUCE_ONE(x) vaddvq_f32(x)
|
38
|
+
// Reduces the array of accumulator registers x to the scalar res.
//   res - lvalue that receives the sum, converted to ggml_float
//   x   - array of float32x4_t accumulators; it is modified in place
// Three pairwise-halving rounds are performed, starting at offset GGML_F32_ARR/2:
// each round adds the upper half of the live range onto the lower half. The lanes
// of x[0] are then summed with GGML_F32x4_REDUCE_ONE.
// The body is wrapped in do { } while (0) so that the macro followed by ';' is a
// single statement and can be used in an unbraced if/else, matching the other
// *_REDUCE macros in this file.
#define GGML_F32x4_REDUCE(res, x)                       \
do {                                                    \
    int offset = GGML_F32_ARR >> 1;                     \
    for (int i = 0; i < offset; ++i) {                  \
        (x)[i] = vaddq_f32((x)[i], (x)[offset+i]);      \
    }                                                   \
    offset >>= 1;                                       \
    for (int i = 0; i < offset; ++i) {                  \
        (x)[i] = vaddq_f32((x)[i], (x)[offset+i]);      \
    }                                                   \
    offset >>= 1;                                       \
    for (int i = 0; i < offset; ++i) {                  \
        (x)[i] = vaddq_f32((x)[i], (x)[offset+i]);      \
    }                                                   \
    (res) = (ggml_float) GGML_F32x4_REDUCE_ONE((x)[0]); \
} while (0)
|
54
|
+
|
55
|
+
#define GGML_F32_VEC GGML_F32x4
|
56
|
+
#define GGML_F32_VEC_ZERO GGML_F32x4_ZERO
|
57
|
+
#define GGML_F32_VEC_SET1 GGML_F32x4_SET1
|
58
|
+
#define GGML_F32_VEC_LOAD GGML_F32x4_LOAD
|
59
|
+
#define GGML_F32_VEC_STORE GGML_F32x4_STORE
|
60
|
+
#define GGML_F32_VEC_FMA GGML_F32x4_FMA
|
61
|
+
#define GGML_F32_VEC_ADD GGML_F32x4_ADD
|
62
|
+
#define GGML_F32_VEC_MUL GGML_F32x4_MUL
|
63
|
+
#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
|
64
|
+
|
65
|
+
// F16 NEON
|
66
|
+
|
67
|
+
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
|
68
|
+
#define GGML_F16_STEP 32
|
69
|
+
#define GGML_F16_EPR 8
|
70
|
+
|
71
|
+
#define GGML_F16x8 float16x8_t
|
72
|
+
#define GGML_F16x8_ZERO vdupq_n_f16(0.0f)
|
73
|
+
#define GGML_F16x8_SET1(x) vdupq_n_f16(x)
|
74
|
+
#define GGML_F16x8_LOAD(x) vld1q_f16((const __fp16 *)(x))
|
75
|
+
#define GGML_F16x8_STORE vst1q_f16
|
76
|
+
#define GGML_F16x8_FMA(a, b, c) vfmaq_f16(a, b, c)
|
77
|
+
#define GGML_F16x8_ADD vaddq_f16
|
78
|
+
#define GGML_F16x8_MUL vmulq_f16
|
79
|
+
#define GGML_F16x8_REDUCE(res, x) \
|
80
|
+
do { \
|
81
|
+
int offset = GGML_F16_ARR >> 1; \
|
82
|
+
for (int i = 0; i < offset; ++i) { \
|
83
|
+
(x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \
|
84
|
+
} \
|
85
|
+
offset >>= 1; \
|
86
|
+
for (int i = 0; i < offset; ++i) { \
|
87
|
+
(x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \
|
88
|
+
} \
|
89
|
+
offset >>= 1; \
|
90
|
+
for (int i = 0; i < offset; ++i) { \
|
91
|
+
(x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \
|
92
|
+
} \
|
93
|
+
const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 ((x)[0])); \
|
94
|
+
const float32x4_t t1 = vcvt_f32_f16(vget_high_f16((x)[0])); \
|
95
|
+
(res) = (ggml_float) vaddvq_f32(vaddq_f32(t0, t1)); \
|
96
|
+
} while (0)
|
97
|
+
|
98
|
+
#define GGML_F16_VEC GGML_F16x8
|
99
|
+
#define GGML_F16_VEC_ZERO GGML_F16x8_ZERO
|
100
|
+
#define GGML_F16_VEC_SET1 GGML_F16x8_SET1
|
101
|
+
#define GGML_F16_VEC_LOAD(p, i) GGML_F16x8_LOAD(p)
|
102
|
+
#define GGML_F16_VEC_STORE(p, r, i) GGML_F16x8_STORE((__fp16 *)(p), (r)[i])
|
103
|
+
#define GGML_F16_VEC_FMA GGML_F16x8_FMA
|
104
|
+
#define GGML_F16_VEC_ADD GGML_F16x8_ADD
|
105
|
+
#define GGML_F16_VEC_MUL GGML_F16x8_MUL
|
106
|
+
#define GGML_F16_VEC_REDUCE GGML_F16x8_REDUCE
|
107
|
+
#else
|
108
|
+
// if FP16 vector arithmetic is not supported, we use FP32 instead
|
109
|
+
// and take advantage of the vcvt_ functions to convert to/from FP16
|
110
|
+
|
111
|
+
#define GGML_F16_STEP 16
|
112
|
+
#define GGML_F16_EPR 4
|
113
|
+
|
114
|
+
#define GGML_F32Cx4 float32x4_t
|
115
|
+
#define GGML_F32Cx4_ZERO vdupq_n_f32(0.0f)
|
116
|
+
#define GGML_F32Cx4_SET1(x) vdupq_n_f32(x)
|
117
|
+
#define GGML_F32Cx4_LOAD(x) vcvt_f32_f16(vld1_f16((const __fp16 *)(x)))
|
118
|
+
#define GGML_F32Cx4_STORE(x, y) vst1_f16(x, vcvt_f16_f32(y))
|
119
|
+
#define GGML_F32Cx4_FMA(a, b, c) vfmaq_f32(a, b, c)
|
120
|
+
#define GGML_F32Cx4_ADD vaddq_f32
|
121
|
+
#define GGML_F32Cx4_MUL vmulq_f32
|
122
|
+
#define GGML_F32Cx4_REDUCE GGML_F32x4_REDUCE
|
123
|
+
|
124
|
+
#define GGML_F16_VEC GGML_F32Cx4
|
125
|
+
#define GGML_F16_VEC_ZERO GGML_F32Cx4_ZERO
|
126
|
+
#define GGML_F16_VEC_SET1 GGML_F32Cx4_SET1
|
127
|
+
#define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx4_LOAD(p)
|
128
|
+
// Stores register i of the accumulator array r to the half-precision buffer p.
// r is parenthesized before indexing so that an expression argument (e.g. base + k)
// is indexed as a whole, the same way the FP16-arithmetic variant of this macro does.
#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE((__fp16 *)(p), (r)[i])
|
129
|
+
#define GGML_F16_VEC_FMA GGML_F32Cx4_FMA
|
130
|
+
#define GGML_F16_VEC_ADD GGML_F32Cx4_ADD
|
131
|
+
#define GGML_F16_VEC_MUL GGML_F32Cx4_MUL
|
132
|
+
#define GGML_F16_VEC_REDUCE GGML_F32Cx4_REDUCE
|
133
|
+
#endif
|
134
|
+
|
135
|
+
#elif defined(__AVX512F__)
|
136
|
+
|
137
|
+
#define GGML_SIMD
|
138
|
+
|
139
|
+
// F32 AVX512
|
140
|
+
|
141
|
+
#define GGML_F32_STEP 64
|
142
|
+
#define GGML_F32_EPR 16
|
143
|
+
|
144
|
+
#define GGML_F32x16 __m512
|
145
|
+
#define GGML_F32x16_ZERO _mm512_setzero_ps()
|
146
|
+
#define GGML_F32x16_SET1(x) _mm512_set1_ps(x)
|
147
|
+
#define GGML_F32x16_LOAD _mm512_loadu_ps
|
148
|
+
#define GGML_F32x16_STORE _mm512_storeu_ps
|
149
|
+
// _mm512_fmadd_ps is defined in AVX512F so no guard is required
|
150
|
+
// FMA(a, b, c) yields a + b*c: the accumulator is the first macro argument, whereas
// _mm512_fmadd_ps takes the addend as its last operand, hence the reordering
#define GGML_F32x16_FMA(a, b, c) _mm512_fmadd_ps(b, c, a)
|
151
|
+
#define GGML_F32x16_ADD _mm512_add_ps
|
152
|
+
#define GGML_F32x16_MUL _mm512_mul_ps
|
153
|
+
// Reduces the array of accumulator registers x to the scalar res.
//   res - lvalue that receives the sum, converted to ggml_float
//   x   - array of __m512 accumulators; it is modified in place
// Three pairwise-halving rounds are performed, starting at offset GGML_F32_ARR/2:
// each round adds the upper half of the live range onto the lower half. The lanes
// of x[0] are then summed with _mm512_reduce_add_ps.
// Both macro arguments are parenthesized so that expression arguments
// (e.g. acc + k) expand correctly.
#define GGML_F32x16_REDUCE(res, x)                          \
do {                                                        \
    int offset = GGML_F32_ARR >> 1;                         \
    for (int i = 0; i < offset; ++i) {                      \
        (x)[i] = _mm512_add_ps((x)[i], (x)[offset+i]);      \
    }                                                       \
    offset >>= 1;                                           \
    for (int i = 0; i < offset; ++i) {                      \
        (x)[i] = _mm512_add_ps((x)[i], (x)[offset+i]);      \
    }                                                       \
    offset >>= 1;                                           \
    for (int i = 0; i < offset; ++i) {                      \
        (x)[i] = _mm512_add_ps((x)[i], (x)[offset+i]);      \
    }                                                       \
    (res) = (ggml_float) _mm512_reduce_add_ps((x)[0]);      \
} while (0)
|
169
|
+
|
170
|
+
// TODO: is this optimal ?
|
171
|
+
|
172
|
+
#define GGML_F32_VEC GGML_F32x16
|
173
|
+
#define GGML_F32_VEC_ZERO GGML_F32x16_ZERO
|
174
|
+
#define GGML_F32_VEC_SET1 GGML_F32x16_SET1
|
175
|
+
#define GGML_F32_VEC_LOAD GGML_F32x16_LOAD
|
176
|
+
#define GGML_F32_VEC_STORE GGML_F32x16_STORE
|
177
|
+
#define GGML_F32_VEC_FMA GGML_F32x16_FMA
|
178
|
+
#define GGML_F32_VEC_ADD GGML_F32x16_ADD
|
179
|
+
#define GGML_F32_VEC_MUL GGML_F32x16_MUL
|
180
|
+
#define GGML_F32_VEC_REDUCE GGML_F32x16_REDUCE
|
181
|
+
|
182
|
+
// F16 AVX512
|
183
|
+
|
184
|
+
// F16 AVX
|
185
|
+
|
186
|
+
#define GGML_F16_STEP 64
|
187
|
+
#define GGML_F16_EPR 16
|
188
|
+
|
189
|
+
// AVX512 has an FP16 extension (AVX512_FP16), but I don't have it on my machine, so I use FP32 instead
|
190
|
+
|
191
|
+
#define GGML_F32Cx16 __m512
|
192
|
+
#define GGML_F32Cx16_ZERO _mm512_setzero_ps()
|
193
|
+
#define GGML_F32Cx16_SET1(x) _mm512_set1_ps(x)
|
194
|
+
|
195
|
+
// unlike _mm256_cvt intrinsics that require F16C, _mm512_cvt is defined in AVX512F
|
196
|
+
// so F16C guard isn't required
|
197
|
+
#define GGML_F32Cx16_LOAD(x) _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)(x)))
|
198
|
+
// Converts the 16 floats in y to half precision and writes the resulting 256 bits
// to x with an unaligned store. The immediate 0 passed to _mm512_cvtps_ph is the
// rounding control (0 is _MM_FROUND_TO_NEAREST_INT, i.e. round to nearest).
#define GGML_F32Cx16_STORE(x, y) _mm256_storeu_si256((__m256i *)(x), _mm512_cvtps_ph(y, 0))
|
199
|
+
|
200
|
+
#define GGML_F32Cx16_FMA(a, b, c) _mm512_fmadd_ps(b, c, a)
|
201
|
+
#define GGML_F32Cx16_ADD _mm512_add_ps
|
202
|
+
#define GGML_F32Cx16_MUL _mm512_mul_ps
|
203
|
+
// Reduces the array of F16-as-F32 accumulator registers x to the scalar res.
//   res - lvalue that receives the sum, converted to ggml_float
//   x   - array of __m512 accumulators; it is modified in place
// Three pairwise-halving rounds are performed, then the lanes of x[0] are summed
// with _mm512_reduce_add_ps.
// NOTE(review): the register count used here is GGML_F32_ARR, not GGML_F16_ARR.
// This branch sets STEP 64 / EPR 16 for both F32 and F16, so the two counts
// presumably coincide - confirm against the GGML_*_ARR definitions.
// Both macro arguments are parenthesized so that expression arguments
// (e.g. acc + k) expand correctly.
#define GGML_F32Cx16_REDUCE(res, x)                         \
do {                                                        \
    int offset = GGML_F32_ARR >> 1;                         \
    for (int i = 0; i < offset; ++i) {                      \
        (x)[i] = _mm512_add_ps((x)[i], (x)[offset+i]);      \
    }                                                       \
    offset >>= 1;                                           \
    for (int i = 0; i < offset; ++i) {                      \
        (x)[i] = _mm512_add_ps((x)[i], (x)[offset+i]);      \
    }                                                       \
    offset >>= 1;                                           \
    for (int i = 0; i < offset; ++i) {                      \
        (x)[i] = _mm512_add_ps((x)[i], (x)[offset+i]);      \
    }                                                       \
    (res) = (ggml_float) _mm512_reduce_add_ps((x)[0]);      \
} while (0)
|
219
|
+
|
220
|
+
#define GGML_F16_VEC GGML_F32Cx16
|
221
|
+
#define GGML_F16_VEC_ZERO GGML_F32Cx16_ZERO
|
222
|
+
#define GGML_F16_VEC_SET1 GGML_F32Cx16_SET1
|
223
|
+
#define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx16_LOAD(p)
|
224
|
+
#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx16_STORE(p, r[i])
|
225
|
+
#define GGML_F16_VEC_FMA GGML_F32Cx16_FMA
|
226
|
+
#define GGML_F16_VEC_ADD GGML_F32Cx16_ADD
|
227
|
+
#define GGML_F16_VEC_MUL GGML_F32Cx16_MUL
|
228
|
+
|
229
|
+
#define GGML_F16_VEC_REDUCE GGML_F32Cx16_REDUCE
|
230
|
+
#elif defined(__AVX__)
|
231
|
+
|
232
|
+
#define GGML_SIMD
|
233
|
+
|
234
|
+
// F32 AVX
|
235
|
+
|
236
|
+
#define GGML_F32_STEP 32
|
237
|
+
#define GGML_F32_EPR 8
|
238
|
+
|
239
|
+
#define GGML_F32x8 __m256
|
240
|
+
#define GGML_F32x8_ZERO _mm256_setzero_ps()
|
241
|
+
#define GGML_F32x8_SET1(x) _mm256_set1_ps(x)
|
242
|
+
#define GGML_F32x8_LOAD _mm256_loadu_ps
|
243
|
+
#define GGML_F32x8_STORE _mm256_storeu_ps
|
244
|
+
#if defined(__FMA__)
|
245
|
+
#define GGML_F32x8_FMA(a, b, c) _mm256_fmadd_ps(b, c, a)
|
246
|
+
#else
|
247
|
+
#define GGML_F32x8_FMA(a, b, c) _mm256_add_ps(_mm256_mul_ps(b, c), a)
|
248
|
+
#endif
|
249
|
+
#define GGML_F32x8_ADD _mm256_add_ps
|
250
|
+
#define GGML_F32x8_MUL _mm256_mul_ps
|
251
|
+
#define GGML_F32x8_REDUCE(res, x) \
|
252
|
+
do { \
|
253
|
+
int offset = GGML_F32_ARR >> 1; \
|
254
|
+
for (int i = 0; i < offset; ++i) { \
|
255
|
+
x[i] = _mm256_add_ps(x[i], x[offset+i]); \
|
256
|
+
} \
|
257
|
+
offset >>= 1; \
|
258
|
+
for (int i = 0; i < offset; ++i) { \
|
259
|
+
x[i] = _mm256_add_ps(x[i], x[offset+i]); \
|
260
|
+
} \
|
261
|
+
offset >>= 1; \
|
262
|
+
for (int i = 0; i < offset; ++i) { \
|
263
|
+
x[i] = _mm256_add_ps(x[i], x[offset+i]); \
|
264
|
+
} \
|
265
|
+
const __m128 t0 = _mm_add_ps(_mm256_castps256_ps128(x[0]), \
|
266
|
+
_mm256_extractf128_ps(x[0], 1)); \
|
267
|
+
const __m128 t1 = _mm_hadd_ps(t0, t0); \
|
268
|
+
res = (ggml_float) _mm_cvtss_f32(_mm_hadd_ps(t1, t1)); \
|
269
|
+
} while (0)
|
270
|
+
// TODO: is this optimal ?
|
271
|
+
|
272
|
+
#define GGML_F32_VEC GGML_F32x8
|
273
|
+
#define GGML_F32_VEC_ZERO GGML_F32x8_ZERO
|
274
|
+
#define GGML_F32_VEC_SET1 GGML_F32x8_SET1
|
275
|
+
#define GGML_F32_VEC_LOAD GGML_F32x8_LOAD
|
276
|
+
#define GGML_F32_VEC_STORE GGML_F32x8_STORE
|
277
|
+
#define GGML_F32_VEC_FMA GGML_F32x8_FMA
|
278
|
+
#define GGML_F32_VEC_ADD GGML_F32x8_ADD
|
279
|
+
#define GGML_F32_VEC_MUL GGML_F32x8_MUL
|
280
|
+
#define GGML_F32_VEC_REDUCE GGML_F32x8_REDUCE
|
281
|
+
|
282
|
+
// F16 AVX
|
283
|
+
|
284
|
+
#define GGML_F16_STEP 32
|
285
|
+
#define GGML_F16_EPR 8
|
286
|
+
|
287
|
+
// F16 arithmetic is not supported by AVX, so we use F32 instead
|
288
|
+
|
289
|
+
#define GGML_F32Cx8 __m256
|
290
|
+
#define GGML_F32Cx8_ZERO _mm256_setzero_ps()
|
291
|
+
#define GGML_F32Cx8_SET1(x) _mm256_set1_ps(x)
|
292
|
+
|
293
|
+
#if defined(__F16C__)
|
294
|
+
// the _mm256_cvt intrinsics require F16C
|
295
|
+
#define GGML_F32Cx8_LOAD(x) _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)(x)))
|
296
|
+
#define GGML_F32Cx8_STORE(x, y) _mm_storeu_si128((__m128i *)(x), _mm256_cvtps_ph(y, 0))
|
297
|
+
#else
|
298
|
+
static inline __m256 __avx_f32cx8_load(const ggml_fp16_t * x) {
|
299
|
+
float tmp[8];
|
300
|
+
|
301
|
+
for (int i = 0; i < 8; i++) {
|
302
|
+
tmp[i] = GGML_FP16_TO_FP32(x[i]);
|
303
|
+
}
|
304
|
+
|
305
|
+
return _mm256_loadu_ps(tmp);
|
306
|
+
}
|
307
|
+
static inline void __avx_f32cx8_store(ggml_fp16_t *x, __m256 y) {
|
308
|
+
float arr[8];
|
309
|
+
|
310
|
+
_mm256_storeu_ps(arr, y);
|
311
|
+
|
312
|
+
for (int i = 0; i < 8; i++)
|
313
|
+
x[i] = GGML_FP32_TO_FP16(arr[i]);
|
314
|
+
}
|
315
|
+
#define GGML_F32Cx8_LOAD(x) __avx_f32cx8_load(x)
|
316
|
+
#define GGML_F32Cx8_STORE(x, y) __avx_f32cx8_store(x, y)
|
317
|
+
#endif
|
318
|
+
|
319
|
+
#define GGML_F32Cx8_FMA GGML_F32x8_FMA
|
320
|
+
#define GGML_F32Cx8_ADD _mm256_add_ps
|
321
|
+
#define GGML_F32Cx8_MUL _mm256_mul_ps
|
322
|
+
#define GGML_F32Cx8_REDUCE GGML_F32x8_REDUCE
|
323
|
+
|
324
|
+
#define GGML_F16_VEC GGML_F32Cx8
|
325
|
+
#define GGML_F16_VEC_ZERO GGML_F32Cx8_ZERO
|
326
|
+
#define GGML_F16_VEC_SET1 GGML_F32Cx8_SET1
|
327
|
+
#define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx8_LOAD(p)
|
328
|
+
#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx8_STORE(p, r[i])
|
329
|
+
#define GGML_F16_VEC_FMA GGML_F32Cx8_FMA
|
330
|
+
#define GGML_F16_VEC_ADD GGML_F32Cx8_ADD
|
331
|
+
#define GGML_F16_VEC_MUL GGML_F32Cx8_MUL
|
332
|
+
#define GGML_F16_VEC_REDUCE GGML_F32Cx8_REDUCE
|
333
|
+
|
334
|
+
#elif defined(__POWER9_VECTOR__)
|
335
|
+
|
336
|
+
#define GGML_SIMD
|
337
|
+
|
338
|
+
// F32 POWER9
|
339
|
+
|
340
|
+
#define GGML_F32_STEP 32
|
341
|
+
#define GGML_F32_EPR 4
|
342
|
+
|
343
|
+
#define GGML_F32x4 vector float
|
344
|
+
#define GGML_F32x4_ZERO {0.0f}
|
345
|
+
#define GGML_F32x4_SET1 vec_splats
|
346
|
+
#define GGML_F32x4_LOAD(p) vec_xl(0, p)
|
347
|
+
#define GGML_F32x4_STORE(p, r) vec_xst(r, 0, p)
|
348
|
+
#define GGML_F32x4_FMA(a, b, c) vec_madd(b, c, a)
|
349
|
+
#define GGML_F32x4_ADD vec_add
|
350
|
+
#define GGML_F32x4_MUL vec_mul
|
351
|
+
#define GGML_F32x4_REDUCE(res, x) \
|
352
|
+
{ \
|
353
|
+
int offset = GGML_F32_ARR >> 1; \
|
354
|
+
for (int i = 0; i < offset; ++i) { \
|
355
|
+
x[i] = vec_add(x[i], x[offset+i]); \
|
356
|
+
} \
|
357
|
+
offset >>= 1; \
|
358
|
+
for (int i = 0; i < offset; ++i) { \
|
359
|
+
x[i] = vec_add(x[i], x[offset+i]); \
|
360
|
+
} \
|
361
|
+
offset >>= 1; \
|
362
|
+
for (int i = 0; i < offset; ++i) { \
|
363
|
+
x[i] = vec_add(x[i], x[offset+i]); \
|
364
|
+
} \
|
365
|
+
res = vec_extract(x[0], 0) + \
|
366
|
+
vec_extract(x[0], 1) + \
|
367
|
+
vec_extract(x[0], 2) + \
|
368
|
+
vec_extract(x[0], 3); \
|
369
|
+
}
|
370
|
+
|
371
|
+
#define GGML_F32_VEC GGML_F32x4
|
372
|
+
#define GGML_F32_VEC_ZERO GGML_F32x4_ZERO
|
373
|
+
#define GGML_F32_VEC_SET1 GGML_F32x4_SET1
|
374
|
+
#define GGML_F32_VEC_LOAD GGML_F32x4_LOAD
|
375
|
+
#define GGML_F32_VEC_STORE GGML_F32x4_STORE
|
376
|
+
#define GGML_F32_VEC_FMA GGML_F32x4_FMA
|
377
|
+
#define GGML_F32_VEC_ADD GGML_F32x4_ADD
|
378
|
+
#define GGML_F32_VEC_MUL GGML_F32x4_MUL
|
379
|
+
#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
|
380
|
+
|
381
|
+
// F16 POWER9
|
382
|
+
#define GGML_F16_STEP GGML_F32_STEP
|
383
|
+
#define GGML_F16_EPR GGML_F32_EPR
|
384
|
+
#define GGML_F16_VEC GGML_F32x4
|
385
|
+
#define GGML_F16_VEC_ZERO GGML_F32x4_ZERO
|
386
|
+
#define GGML_F16_VEC_SET1 GGML_F32x4_SET1
|
387
|
+
#define GGML_F16_VEC_FMA GGML_F32x4_FMA
|
388
|
+
#define GGML_F16_VEC_ADD GGML_F32x4_ADD
|
389
|
+
#define GGML_F16_VEC_MUL GGML_F32x4_MUL
|
390
|
+
#define GGML_F16_VEC_REDUCE GGML_F32x4_REDUCE
|
391
|
+
// Use vec_xl, not vec_ld, in case the load address is not aligned.
|
392
|
+
// Picks which load to widen to fp32 from the parity of the register index `i`:
// odd indices load from `p - GGML_F16_EPR` and use vec_extract_fp32_from_shorth,
// even indices load from `p` and use vec_extract_fp32_from_shortl.
// The whole expansion and both arguments are parenthesized so the conditional
// expression stays intact when the macro is used inside a larger expression or
// is passed a compound argument.
#define GGML_F16_VEC_LOAD(p, i) (((i) & 0x1) ?                       \
    vec_extract_fp32_from_shorth(vec_xl(0, (p) - GGML_F16_EPR)) :    \
    vec_extract_fp32_from_shortl(vec_xl(0, (p))))
|
395
|
+
static inline unsigned char ggml_endian_byte(int i) {
|
396
|
+
uint16_t tmp_val = 1;
|
397
|
+
return ((unsigned char *)&tmp_val)[i];
|
398
|
+
}
|
399
|
+
#define GGML_ENDIAN_BYTE(i) ggml_endian_byte(i)
|
400
|
+
// Packs two float registers into one fp16 vector and stores it.
// Only odd register indices `i` store anything: the registers
// r[i - GGML_ENDIAN_BYTE(1)] and r[i - GGML_ENDIAN_BYTE(0)] are packed with
// vec_pack_to_short_fp32 and written to `p - GGML_F16_EPR`. Even indices are a
// no-op.
// Wrapped in do { } while (0) so the macro expands to exactly one statement and
// cannot capture an `else` that follows the call site.
#define GGML_F16_VEC_STORE(p, r, i)                                            \
    do {                                                                       \
        if ((i) & 0x1) {                                                       \
            vec_xst(vec_pack_to_short_fp32((r)[(i) - GGML_ENDIAN_BYTE(1)],     \
                                           (r)[(i) - GGML_ENDIAN_BYTE(0)]),    \
                    0, (p) - GGML_F16_EPR);                                    \
        }                                                                      \
    } while (0)
|
405
|
+
|
406
|
+
#elif defined(__wasm_simd128__)
|
407
|
+
|
408
|
+
#define GGML_SIMD
|
409
|
+
|
410
|
+
// F32 WASM
|
411
|
+
|
412
|
+
#define GGML_F32_STEP 16
|
413
|
+
#define GGML_F32_EPR 4
|
414
|
+
|
415
|
+
#define GGML_F32x4 v128_t
|
416
|
+
#define GGML_F32x4_ZERO wasm_f32x4_splat(0.0f)
|
417
|
+
#define GGML_F32x4_SET1(x) wasm_f32x4_splat(x)
|
418
|
+
#define GGML_F32x4_LOAD wasm_v128_load
|
419
|
+
#define GGML_F32x4_STORE wasm_v128_store
|
420
|
+
#define GGML_F32x4_FMA(a, b, c) wasm_f32x4_add(wasm_f32x4_mul(b, c), a)
|
421
|
+
#define GGML_F32x4_ADD wasm_f32x4_add
|
422
|
+
#define GGML_F32x4_MUL wasm_f32x4_mul
|
423
|
+
#define GGML_F32x4_REDUCE(res, x) \
|
424
|
+
{ \
|
425
|
+
int offset = GGML_F32_ARR >> 1; \
|
426
|
+
for (int i = 0; i < offset; ++i) { \
|
427
|
+
x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
|
428
|
+
} \
|
429
|
+
offset >>= 1; \
|
430
|
+
for (int i = 0; i < offset; ++i) { \
|
431
|
+
x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
|
432
|
+
} \
|
433
|
+
offset >>= 1; \
|
434
|
+
for (int i = 0; i < offset; ++i) { \
|
435
|
+
x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
|
436
|
+
} \
|
437
|
+
res = wasm_f32x4_extract_lane(x[0], 0) + \
|
438
|
+
wasm_f32x4_extract_lane(x[0], 1) + \
|
439
|
+
wasm_f32x4_extract_lane(x[0], 2) + \
|
440
|
+
wasm_f32x4_extract_lane(x[0], 3); \
|
441
|
+
}
|
442
|
+
|
443
|
+
#define GGML_F32_VEC GGML_F32x4
|
444
|
+
#define GGML_F32_VEC_ZERO GGML_F32x4_ZERO
|
445
|
+
#define GGML_F32_VEC_SET1 GGML_F32x4_SET1
|
446
|
+
#define GGML_F32_VEC_LOAD GGML_F32x4_LOAD
|
447
|
+
#define GGML_F32_VEC_STORE GGML_F32x4_STORE
|
448
|
+
#define GGML_F32_VEC_FMA GGML_F32x4_FMA
|
449
|
+
#define GGML_F32_VEC_ADD GGML_F32x4_ADD
|
450
|
+
#define GGML_F32_VEC_MUL GGML_F32x4_MUL
|
451
|
+
#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
|
452
|
+
|
453
|
+
// F16 WASM
|
454
|
+
|
455
|
+
#define GGML_F16_STEP 16
|
456
|
+
#define GGML_F16_EPR 4
|
457
|
+
|
458
|
+
inline static v128_t __wasm_f16x4_load(const ggml_fp16_t * p) {
|
459
|
+
float tmp[4];
|
460
|
+
|
461
|
+
tmp[0] = GGML_FP16_TO_FP32(p[0]);
|
462
|
+
tmp[1] = GGML_FP16_TO_FP32(p[1]);
|
463
|
+
tmp[2] = GGML_FP16_TO_FP32(p[2]);
|
464
|
+
tmp[3] = GGML_FP16_TO_FP32(p[3]);
|
465
|
+
|
466
|
+
return wasm_v128_load(tmp);
|
467
|
+
}
|
468
|
+
|
469
|
+
inline static void __wasm_f16x4_store(ggml_fp16_t * p, v128_t x) {
|
470
|
+
float tmp[4];
|
471
|
+
|
472
|
+
wasm_v128_store(tmp, x);
|
473
|
+
|
474
|
+
p[0] = GGML_FP32_TO_FP16(tmp[0]);
|
475
|
+
p[1] = GGML_FP32_TO_FP16(tmp[1]);
|
476
|
+
p[2] = GGML_FP32_TO_FP16(tmp[2]);
|
477
|
+
p[3] = GGML_FP32_TO_FP16(tmp[3]);
|
478
|
+
}
|
479
|
+
|
480
|
+
#define GGML_F16x4 v128_t
|
481
|
+
#define GGML_F16x4_ZERO wasm_f32x4_splat(0.0f)
|
482
|
+
#define GGML_F16x4_SET1(x) wasm_f32x4_splat(x)
|
483
|
+
#define GGML_F16x4_LOAD(x) __wasm_f16x4_load(x)
|
484
|
+
#define GGML_F16x4_STORE(x, y) __wasm_f16x4_store(x, y)
|
485
|
+
#define GGML_F16x4_FMA GGML_F32x4_FMA
|
486
|
+
#define GGML_F16x4_ADD wasm_f32x4_add
|
487
|
+
#define GGML_F16x4_MUL wasm_f32x4_mul
|
488
|
+
#define GGML_F16x4_REDUCE(res, x) \
|
489
|
+
{ \
|
490
|
+
int offset = GGML_F16_ARR >> 1; \
|
491
|
+
for (int i = 0; i < offset; ++i) { \
|
492
|
+
x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
|
493
|
+
} \
|
494
|
+
offset >>= 1; \
|
495
|
+
for (int i = 0; i < offset; ++i) { \
|
496
|
+
x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
|
497
|
+
} \
|
498
|
+
offset >>= 1; \
|
499
|
+
for (int i = 0; i < offset; ++i) { \
|
500
|
+
x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
|
501
|
+
} \
|
502
|
+
res = (ggml_float) (wasm_f32x4_extract_lane(x[0], 0) + \
|
503
|
+
wasm_f32x4_extract_lane(x[0], 1) + \
|
504
|
+
wasm_f32x4_extract_lane(x[0], 2) + \
|
505
|
+
wasm_f32x4_extract_lane(x[0], 3)); \
|
506
|
+
}
|
507
|
+
|
508
|
+
#define GGML_F16_VEC GGML_F16x4
|
509
|
+
#define GGML_F16_VEC_ZERO GGML_F16x4_ZERO
|
510
|
+
#define GGML_F16_VEC_SET1 GGML_F16x4_SET1
|
511
|
+
#define GGML_F16_VEC_LOAD(p, i) GGML_F16x4_LOAD(p)
|
512
|
+
#define GGML_F16_VEC_STORE(p, r, i) GGML_F16x4_STORE(p, r[i])
|
513
|
+
#define GGML_F16_VEC_FMA GGML_F16x4_FMA
|
514
|
+
#define GGML_F16_VEC_ADD GGML_F16x4_ADD
|
515
|
+
#define GGML_F16_VEC_MUL GGML_F16x4_MUL
|
516
|
+
#define GGML_F16_VEC_REDUCE GGML_F16x4_REDUCE
|
517
|
+
|
518
|
+
#elif defined(__SSE3__)
|
519
|
+
|
520
|
+
#define GGML_SIMD
|
521
|
+
|
522
|
+
// F32 SSE
|
523
|
+
|
524
|
+
#define GGML_F32_STEP 32
|
525
|
+
#define GGML_F32_EPR 4
|
526
|
+
|
527
|
+
#define GGML_F32x4 __m128
|
528
|
+
#define GGML_F32x4_ZERO _mm_setzero_ps()
|
529
|
+
#define GGML_F32x4_SET1(x) _mm_set1_ps(x)
|
530
|
+
#define GGML_F32x4_LOAD _mm_loadu_ps
|
531
|
+
#define GGML_F32x4_STORE _mm_storeu_ps
|
532
|
+
#if defined(__FMA__)
|
533
|
+
// TODO: Does this work?
|
534
|
+
#define GGML_F32x4_FMA(a, b, c) _mm_fmadd_ps(b, c, a)
|
535
|
+
#else
|
536
|
+
#define GGML_F32x4_FMA(a, b, c) _mm_add_ps(_mm_mul_ps(b, c), a)
|
537
|
+
#endif
|
538
|
+
#define GGML_F32x4_ADD _mm_add_ps
|
539
|
+
#define GGML_F32x4_MUL _mm_mul_ps
|
540
|
+
#define GGML_F32x4_REDUCE(res, x) \
|
541
|
+
{ \
|
542
|
+
int offset = GGML_F32_ARR >> 1; \
|
543
|
+
for (int i = 0; i < offset; ++i) { \
|
544
|
+
x[i] = _mm_add_ps(x[i], x[offset+i]); \
|
545
|
+
} \
|
546
|
+
offset >>= 1; \
|
547
|
+
for (int i = 0; i < offset; ++i) { \
|
548
|
+
x[i] = _mm_add_ps(x[i], x[offset+i]); \
|
549
|
+
} \
|
550
|
+
offset >>= 1; \
|
551
|
+
for (int i = 0; i < offset; ++i) { \
|
552
|
+
x[i] = _mm_add_ps(x[i], x[offset+i]); \
|
553
|
+
} \
|
554
|
+
const __m128 t0 = _mm_hadd_ps(x[0], x[0]); \
|
555
|
+
res = (ggml_float) _mm_cvtss_f32(_mm_hadd_ps(t0, t0)); \
|
556
|
+
}
|
557
|
+
// TODO: is this optimal ?
|
558
|
+
|
559
|
+
#define GGML_F32_VEC GGML_F32x4
|
560
|
+
#define GGML_F32_VEC_ZERO GGML_F32x4_ZERO
|
561
|
+
#define GGML_F32_VEC_SET1 GGML_F32x4_SET1
|
562
|
+
#define GGML_F32_VEC_LOAD GGML_F32x4_LOAD
|
563
|
+
#define GGML_F32_VEC_STORE GGML_F32x4_STORE
|
564
|
+
#define GGML_F32_VEC_FMA GGML_F32x4_FMA
|
565
|
+
#define GGML_F32_VEC_ADD GGML_F32x4_ADD
|
566
|
+
#define GGML_F32_VEC_MUL GGML_F32x4_MUL
|
567
|
+
#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
|
568
|
+
|
569
|
+
// F16 SSE
|
570
|
+
|
571
|
+
#define GGML_F16_STEP 32
|
572
|
+
#define GGML_F16_EPR 4
|
573
|
+
|
574
|
+
static inline __m128 __sse_f16x4_load(const ggml_fp16_t * x) {
|
575
|
+
float tmp[4];
|
576
|
+
|
577
|
+
tmp[0] = GGML_FP16_TO_FP32(x[0]);
|
578
|
+
tmp[1] = GGML_FP16_TO_FP32(x[1]);
|
579
|
+
tmp[2] = GGML_FP16_TO_FP32(x[2]);
|
580
|
+
tmp[3] = GGML_FP16_TO_FP32(x[3]);
|
581
|
+
|
582
|
+
return _mm_loadu_ps(tmp);
|
583
|
+
}
|
584
|
+
|
585
|
+
static inline void __sse_f16x4_store(ggml_fp16_t * x, __m128 y) {
|
586
|
+
float arr[4];
|
587
|
+
|
588
|
+
_mm_storeu_ps(arr, y);
|
589
|
+
|
590
|
+
x[0] = GGML_FP32_TO_FP16(arr[0]);
|
591
|
+
x[1] = GGML_FP32_TO_FP16(arr[1]);
|
592
|
+
x[2] = GGML_FP32_TO_FP16(arr[2]);
|
593
|
+
x[3] = GGML_FP32_TO_FP16(arr[3]);
|
594
|
+
}
|
595
|
+
|
596
|
+
#define GGML_F32Cx4 __m128
|
597
|
+
#define GGML_F32Cx4_ZERO _mm_setzero_ps()
|
598
|
+
#define GGML_F32Cx4_SET1(x) _mm_set1_ps(x)
|
599
|
+
#define GGML_F32Cx4_LOAD(x) __sse_f16x4_load(x)
|
600
|
+
#define GGML_F32Cx4_STORE(x, y) __sse_f16x4_store(x, y)
|
601
|
+
#define GGML_F32Cx4_FMA GGML_F32x4_FMA
|
602
|
+
#define GGML_F32Cx4_ADD _mm_add_ps
|
603
|
+
#define GGML_F32Cx4_MUL _mm_mul_ps
|
604
|
+
#define GGML_F32Cx4_REDUCE GGML_F32x4_REDUCE
|
605
|
+
|
606
|
+
#define GGML_F16_VEC GGML_F32Cx4
|
607
|
+
#define GGML_F16_VEC_ZERO GGML_F32Cx4_ZERO
|
608
|
+
#define GGML_F16_VEC_SET1 GGML_F32Cx4_SET1
|
609
|
+
#define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx4_LOAD(p)
|
610
|
+
#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE(p, r[i])
|
611
|
+
#define GGML_F16_VEC_FMA GGML_F32Cx4_FMA
|
612
|
+
#define GGML_F16_VEC_ADD GGML_F32Cx4_ADD
|
613
|
+
#define GGML_F16_VEC_MUL GGML_F32Cx4_MUL
|
614
|
+
#define GGML_F16_VEC_REDUCE GGML_F32Cx4_REDUCE
|
615
|
+
|
616
|
+
#elif defined(__loongarch_asx)
|
617
|
+
|
618
|
+
#define GGML_SIMD
|
619
|
+
|
620
|
+
// F32 LASX
|
621
|
+
#define GGML_F32_STEP 32
|
622
|
+
#define GGML_F32_EPR 8
|
623
|
+
|
624
|
+
#define GGML_F32x8 __m256
|
625
|
+
#define GGML_F32x8_ZERO (__m256)__lasx_xvldi(0)
|
626
|
+
#define GGML_F32x8_SET1(x) (__m256)__lasx_xvreplfr2vr_s((x))
|
627
|
+
#define GGML_F32x8_LOAD(x) (__m256)__lasx_xvld((x), 0)
|
628
|
+
#define GGML_F32x8_STORE(x,y) __lasx_xvst((y), (x), 0)
|
629
|
+
#define GGML_F32x8_FMA(a, b, c) __lasx_xvfmadd_s(b, c, a)
|
630
|
+
#define GGML_F32x8_ADD __lasx_xvfadd_s
|
631
|
+
#define GGML_F32x8_MUL __lasx_xvfmul_s
|
632
|
+
#define GGML_F32x8_REDUCE(res, x) \
|
633
|
+
do { \
|
634
|
+
int offset = GGML_F32_ARR >> 1; \
|
635
|
+
for (int i = 0; i < offset; ++i) { \
|
636
|
+
x[i] = __lasx_xvfadd_s(x[i], x[offset+i]); \
|
637
|
+
} \
|
638
|
+
offset >>= 1; \
|
639
|
+
for (int i = 0; i < offset; ++i) { \
|
640
|
+
x[i] = __lasx_xvfadd_s(x[i], x[offset+i]); \
|
641
|
+
} \
|
642
|
+
offset >>= 1; \
|
643
|
+
for (int i = 0; i < offset; ++i) { \
|
644
|
+
x[i] = __lasx_xvfadd_s(x[i], x[offset+i]); \
|
645
|
+
} \
|
646
|
+
float *tmp_p = (float *)&x[0]; \
|
647
|
+
res = tmp_p[0] + tmp_p[1] + tmp_p[2] + tmp_p[3] + tmp_p[4] + tmp_p[5] + tmp_p[6] + tmp_p[7]; \
|
648
|
+
} while (0)
|
649
|
+
// TODO: is this optimal ?
|
650
|
+
|
651
|
+
#define GGML_F32_VEC GGML_F32x8
|
652
|
+
#define GGML_F32_VEC_ZERO GGML_F32x8_ZERO
|
653
|
+
#define GGML_F32_VEC_SET1 GGML_F32x8_SET1
|
654
|
+
#define GGML_F32_VEC_LOAD GGML_F32x8_LOAD
|
655
|
+
#define GGML_F32_VEC_STORE GGML_F32x8_STORE
|
656
|
+
#define GGML_F32_VEC_FMA GGML_F32x8_FMA
|
657
|
+
#define GGML_F32_VEC_ADD GGML_F32x8_ADD
|
658
|
+
#define GGML_F32_VEC_MUL GGML_F32x8_MUL
|
659
|
+
#define GGML_F32_VEC_REDUCE GGML_F32x8_REDUCE
|
660
|
+
|
661
|
+
// F16 LASX
|
662
|
+
|
663
|
+
#define GGML_F16_STEP 32
|
664
|
+
#define GGML_F16_EPR 8
|
665
|
+
|
666
|
+
// F16 arithmetic is not supported by LASX, so we use F32 instead
|
667
|
+
|
668
|
+
#define GGML_F32Cx8 __m256
|
669
|
+
#define GGML_F32Cx8_ZERO (__m256)__lasx_xvldi(0)
|
670
|
+
// Set every lane to the float `x`. Uses the same float replicate helper as
// GGML_F32x8_SET1 so the value is not routed through an integer parameter
// (which would truncate fractional values).
#define GGML_F32Cx8_SET1(x) (__m256)__lasx_xvreplfr2vr_s((x))
|
671
|
+
|
672
|
+
static inline __m256 __lasx_f32cx8_load(const ggml_fp16_t * x) {
|
673
|
+
__m256i a;
|
674
|
+
memcpy(&a, x, sizeof(ggml_fp16_t) * 8);
|
675
|
+
a = __lasx_xvpermi_d(a, 0 | (1 << 4));
|
676
|
+
return __lasx_xvfcvtl_s_h(a);
|
677
|
+
}
|
678
|
+
|
679
|
+
static inline void __lasx_f32cx8_store(ggml_fp16_t * x, __m256 y) {
|
680
|
+
__m256i a = __lasx_xvfcvt_h_s(y, y);
|
681
|
+
a = __lasx_xvpermi_d(a, 0 | (2 << 2));
|
682
|
+
memcpy(x, &a, sizeof(ggml_fp16_t) * 8);
|
683
|
+
}
|
684
|
+
#define GGML_F32Cx8_LOAD(x) __lasx_f32cx8_load(x)
|
685
|
+
#define GGML_F32Cx8_STORE(x, y) __lasx_f32cx8_store(x, y)
|
686
|
+
|
687
|
+
#define GGML_F32Cx8_FMA GGML_F32x8_FMA
|
688
|
+
#define GGML_F32Cx8_ADD __lasx_xvfadd_s
|
689
|
+
#define GGML_F32Cx8_MUL __lasx_xvfmul_s
|
690
|
+
#define GGML_F32Cx8_REDUCE GGML_F32x8_REDUCE
|
691
|
+
|
692
|
+
#define GGML_F16_VEC GGML_F32Cx8
|
693
|
+
#define GGML_F16_VEC_ZERO GGML_F32Cx8_ZERO
|
694
|
+
#define GGML_F16_VEC_SET1 GGML_F32Cx8_SET1
|
695
|
+
#define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx8_LOAD(p)
|
696
|
+
#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx8_STORE(p, r[i])
|
697
|
+
#define GGML_F16_VEC_FMA GGML_F32Cx8_FMA
|
698
|
+
#define GGML_F16_VEC_ADD GGML_F32Cx8_ADD
|
699
|
+
#define GGML_F16_VEC_MUL GGML_F32Cx8_MUL
|
700
|
+
#define GGML_F16_VEC_REDUCE GGML_F32Cx8_REDUCE
|
701
|
+
|
702
|
+
#elif defined(__loongarch_sx)
|
703
|
+
|
704
|
+
#define GGML_SIMD
|
705
|
+
|
706
|
+
// F32 LSX
|
707
|
+
|
708
|
+
#define GGML_F32_STEP 32
|
709
|
+
#define GGML_F32_EPR 4
|
710
|
+
|
711
|
+
#define GGML_F32x4 __m128
|
712
|
+
#define GGML_F32x4_ZERO __lsx_vldi(0)
|
713
|
+
// Broadcast the float `x` into all four lanes of a __m128.
// Built with a vector compound literal so the value stays a float in every
// lane. `x` is expanded four times, so pass a side-effect-free expression.
#define GGML_F32x4_SET1(x) ((__m128){(x), (x), (x), (x)})
|
714
|
+
#define GGML_F32x4_LOAD(x) __lsx_vld((x), 0)
|
715
|
+
// Store the vector `y` at address `x` (offset 0).
// Macro parameters must be plain identifiers; parentheses belong only in the
// expansion.
#define GGML_F32x4_STORE(x, y) __lsx_vst((y), (x), 0)
|
716
|
+
#define GGML_F32x4_FMA(a, b, c) __lsx_vfmadd_s(b, c, a)
|
717
|
+
#define GGML_F32x4_ADD __lsx_vfadd_s
|
718
|
+
#define GGML_F32x4_MUL __lsx_vfmul_s
|
719
|
+
// Reduce the GGML_F32_ARR accumulator vectors in `x` to a single scalar `res`.
// Three halving passes fold the upper half of the array onto the lower half
// (the passes after offset reaches 0 do nothing), leaving the combined vector
// in x[0]. Its four float lanes are then summed as floats, the same way the
// LASX reduce does, so the result is a floating-point sum rather than an
// integer lane extract converted to ggml_float.
// `x` is modified in place.
#define GGML_F32x4_REDUCE(res, x)                                        \
{                                                                        \
    int offset = GGML_F32_ARR >> 1;                                      \
    for (int i = 0; i < offset; ++i) {                                   \
        x[i] = __lsx_vfadd_s(x[i], x[offset + i]);                       \
    }                                                                    \
    offset >>= 1;                                                        \
    for (int i = 0; i < offset; ++i) {                                   \
        x[i] = __lsx_vfadd_s(x[i], x[offset + i]);                       \
    }                                                                    \
    offset >>= 1;                                                        \
    for (int i = 0; i < offset; ++i) {                                   \
        x[i] = __lsx_vfadd_s(x[i], x[offset + i]);                       \
    }                                                                    \
    const float *tmp_p = (const float *)&x[0];                           \
    res = (ggml_float) (tmp_p[0] + tmp_p[1] + tmp_p[2] + tmp_p[3]);      \
}
|
742
|
+
|
743
|
+
#define GGML_F32_VEC GGML_F32x4
|
744
|
+
#define GGML_F32_VEC_ZERO GGML_F32x4_ZERO
|
745
|
+
#define GGML_F32_VEC_SET1 GGML_F32x4_SET1
|
746
|
+
#define GGML_F32_VEC_LOAD GGML_F32x4_LOAD
|
747
|
+
#define GGML_F32_VEC_STORE GGML_F32x4_STORE
|
748
|
+
#define GGML_F32_VEC_FMA GGML_F32x4_FMA
|
749
|
+
#define GGML_F32_VEC_ADD GGML_F32x4_ADD
|
750
|
+
#define GGML_F32_VEC_MUL GGML_F32x4_MUL
|
751
|
+
#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
|
752
|
+
|
753
|
+
// F16 LSX
|
754
|
+
|
755
|
+
#define GGML_F16_STEP 32
|
756
|
+
#define GGML_F16_EPR 4
|
757
|
+
|
758
|
+
static inline __m128 __lsx_f16x4_load(const ggml_fp16_t * x) {
|
759
|
+
float tmp[4];
|
760
|
+
|
761
|
+
tmp[0] = GGML_FP16_TO_FP32(x[0]);
|
762
|
+
tmp[1] = GGML_FP16_TO_FP32(x[1]);
|
763
|
+
tmp[2] = GGML_FP16_TO_FP32(x[2]);
|
764
|
+
tmp[3] = GGML_FP16_TO_FP32(x[3]);
|
765
|
+
|
766
|
+
return __lsx_vld(tmp, 0);
|
767
|
+
}
|
768
|
+
|
769
|
+
static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) {
|
770
|
+
float arr[4];
|
771
|
+
|
772
|
+
__lsx_vst(y, arr, 0);
|
773
|
+
|
774
|
+
x[0] = GGML_FP32_TO_FP16(arr[0]);
|
775
|
+
x[1] = GGML_FP32_TO_FP16(arr[1]);
|
776
|
+
x[2] = GGML_FP32_TO_FP16(arr[2]);
|
777
|
+
x[3] = GGML_FP32_TO_FP16(arr[3]);
|
778
|
+
}
|
779
|
+
|
780
|
+
#define GGML_F32Cx4 __m128
|
781
|
+
#define GGML_F32Cx4_ZERO __lsx_vldi(0)
|
782
|
+
// Broadcast the float `x` into all four lanes of a __m128.
// Built with a vector compound literal so the value stays a float in every
// lane. `x` is expanded four times, so pass a side-effect-free expression.
#define GGML_F32Cx4_SET1(x) ((__m128){(x), (x), (x), (x)})
|
783
|
+
#define GGML_F32Cx4_LOAD(x) __lsx_f16x4_load(x)
|
784
|
+
#define GGML_F32Cx4_STORE(x, y) __lsx_f16x4_store(x, y)
|
785
|
+
#define GGML_F32Cx4_FMA GGML_F32x4_FMA
|
786
|
+
#define GGML_F32Cx4_ADD __lsx_vfadd_s
|
787
|
+
#define GGML_F32Cx4_MUL __lsx_vfmul_s
|
788
|
+
#define GGML_F32Cx4_REDUCE GGML_F32x4_REDUCE
|
789
|
+
|
790
|
+
#define GGML_F16_VEC GGML_F32Cx4
|
791
|
+
#define GGML_F16_VEC_ZERO GGML_F32Cx4_ZERO
|
792
|
+
#define GGML_F16_VEC_SET1 GGML_F32Cx4_SET1
|
793
|
+
#define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx4_LOAD(p)
|
794
|
+
#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE(p, r[i])
|
795
|
+
#define GGML_F16_VEC_FMA GGML_F32Cx4_FMA
|
796
|
+
#define GGML_F16_VEC_ADD GGML_F32Cx4_ADD
|
797
|
+
#define GGML_F16_VEC_MUL GGML_F32Cx4_MUL
|
798
|
+
#define GGML_F16_VEC_REDUCE GGML_F32Cx4_REDUCE
|
799
|
+
|
800
|
+
#elif defined(__VXE__) || defined(__VXE2__)
|
801
|
+
|
802
|
+
#define GGML_SIMD
|
803
|
+
|
804
|
+
// F32 s390x
|
805
|
+
|
806
|
+
#define GGML_F32_STEP 32
|
807
|
+
#define GGML_F32_EPR 4
|
808
|
+
|
809
|
+
#define GGML_F32x4 __vector float
|
810
|
+
#define GGML_F32x4_ZERO vec_splats(0.0f)
|
811
|
+
#define GGML_F32x4_SET1 vec_splats
|
812
|
+
#define GGML_F32x4_LOAD(p) vec_xl(0, p)
|
813
|
+
#define GGML_F32x4_STORE(p, r) vec_xst(r, 0, p)
|
814
|
+
#define GGML_F32x4_FMA(a, b, c) vec_madd(b, c, a)
|
815
|
+
#define GGML_F32x4_ADD vec_add
|
816
|
+
#define GGML_F32x4_MUL vec_mul
|
817
|
+
#define GGML_F32x4_REDUCE(res, x) \
|
818
|
+
{ \
|
819
|
+
int offset = GGML_F32_ARR >> 1; \
|
820
|
+
for (int i = 0; i < offset; ++i) { \
|
821
|
+
x[i] = vec_add(x[i], x[offset + i]); \
|
822
|
+
} \
|
823
|
+
offset >>= 1; \
|
824
|
+
for (int i = 0; i < offset; ++i) { \
|
825
|
+
x[i] = vec_add(x[i], x[offset + i]); \
|
826
|
+
} \
|
827
|
+
offset >>= 1; \
|
828
|
+
for (int i = 0; i < offset; ++i) { \
|
829
|
+
x[i] = vec_add(x[i], x[offset + i]); \
|
830
|
+
} \
|
831
|
+
res = vec_extract(x[0], 0) + \
|
832
|
+
vec_extract(x[0], 1) + \
|
833
|
+
vec_extract(x[0], 2) + \
|
834
|
+
vec_extract(x[0], 3); \
|
835
|
+
}
|
836
|
+
|
837
|
+
#define GGML_F32_VEC GGML_F32x4
|
838
|
+
#define GGML_F32_VEC_ZERO GGML_F32x4_ZERO
|
839
|
+
#define GGML_F32_VEC_SET1 GGML_F32x4_SET1
|
840
|
+
#define GGML_F32_VEC_LOAD GGML_F32x4_LOAD
|
841
|
+
#define GGML_F32_VEC_STORE GGML_F32x4_STORE
|
842
|
+
#define GGML_F32_VEC_FMA GGML_F32x4_FMA
|
843
|
+
#define GGML_F32_VEC_ADD GGML_F32x4_ADD
|
844
|
+
#define GGML_F32_VEC_MUL GGML_F32x4_MUL
|
845
|
+
#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
|
846
|
+
|
847
|
+
// F16 s390x
|
848
|
+
#define GGML_F16_STEP GGML_F32_STEP
|
849
|
+
#define GGML_F16_EPR GGML_F32_EPR
|
850
|
+
|
851
|
+
static inline __vector float __lzs_f16cx4_load(const ggml_fp16_t * x) {
|
852
|
+
float tmp[4];
|
853
|
+
|
854
|
+
for (int i = 0; i < 4; i++) {
|
855
|
+
tmp[i] = GGML_FP16_TO_FP32(x[i]);
|
856
|
+
}
|
857
|
+
|
858
|
+
// note: keep type-cast here to prevent compiler bugs
|
859
|
+
// see: https://github.com/ggml-org/llama.cpp/issues/12846
|
860
|
+
return vec_xl(0, (const float *)(tmp));
|
861
|
+
}
|
862
|
+
|
863
|
+
static inline void __lzs_f16cx4_store(ggml_fp16_t * x, __vector float y) {
|
864
|
+
float arr[4];
|
865
|
+
|
866
|
+
// note: keep type-cast here to prevent compiler bugs
|
867
|
+
// see: https://github.com/ggml-org/llama.cpp/issues/12846
|
868
|
+
vec_xst(y, 0, (float *)(arr));
|
869
|
+
|
870
|
+
for (int i = 0; i < 4; i++) {
|
871
|
+
x[i] = GGML_FP32_TO_FP16(arr[i]);
|
872
|
+
}
|
873
|
+
}
|
874
|
+
|
875
|
+
#define GGML_F16_VEC GGML_F32x4
|
876
|
+
#define GGML_F16_VEC_ZERO GGML_F32x4_ZERO
|
877
|
+
#define GGML_F16_VEC_SET1 GGML_F32x4_SET1
|
878
|
+
#define GGML_F16_VEC_LOAD(p, i) __lzs_f16cx4_load(p)
|
879
|
+
#define GGML_F16_VEC_STORE(p, r, i) __lzs_f16cx4_store(p, r[i])
|
880
|
+
#define GGML_F16_VEC_FMA GGML_F32x4_FMA
|
881
|
+
#define GGML_F16_VEC_ADD GGML_F32x4_ADD
|
882
|
+
#define GGML_F16_VEC_MUL GGML_F32x4_MUL
|
883
|
+
#define GGML_F16_VEC_REDUCE GGML_F32x4_REDUCE
|
884
|
+
|
885
|
+
#endif
|
886
|
+
|
887
|
+
// GGML_F32_ARR / GGML_F16_ARR
|
888
|
+
// number of registers to use per step
|
889
|
+
#ifdef GGML_SIMD
|
890
|
+
#define GGML_F32_ARR (GGML_F32_STEP/GGML_F32_EPR)
|
891
|
+
#define GGML_F16_ARR (GGML_F16_STEP/GGML_F16_EPR)
|
892
|
+
#endif
|