whispercpp 1.3.2 → 1.3.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +6 -3
- data/README.md +71 -14
- data/Rakefile +20 -7
- data/ext/.gitignore +4 -6
- data/ext/dependencies.rb +36 -24
- data/ext/extconf.rb +1 -1
- data/ext/options.rb +48 -184
- data/ext/ruby_whisper.c +18 -0
- data/ext/ruby_whisper_context.c +43 -12
- data/ext/ruby_whisper_model.c +1 -1
- data/ext/ruby_whisper_params.c +59 -27
- data/ext/ruby_whisper_segment.c +81 -4
- data/ext/ruby_whisper_transcribe.cpp +13 -7
- data/ext/ruby_whisper_vad_params.c +1 -1
- data/ext/sources/CMakeLists.txt +5 -1
- data/ext/sources/bindings/javascript/package.json +1 -1
- data/ext/sources/build-xcframework.sh +24 -0
- data/ext/sources/examples/CMakeLists.txt +1 -0
- data/ext/sources/examples/addon.node/__test__/whisper.spec.js +120 -24
- data/ext/sources/examples/addon.node/addon.cpp +154 -35
- data/ext/sources/examples/addon.node/index.js +10 -5
- data/ext/sources/examples/addon.node/vad-example.js +132 -0
- data/ext/sources/examples/bench/bench.cpp +29 -18
- data/ext/sources/examples/bench.wasm/index-tmpl.html +10 -9
- data/ext/sources/examples/cli/cli.cpp +7 -4
- data/ext/sources/examples/command/command.cpp +58 -32
- data/ext/sources/examples/command.wasm/index-tmpl.html +5 -4
- data/ext/sources/examples/common-ggml.cpp +2 -0
- data/ext/sources/examples/common-whisper.cpp +14 -7
- data/ext/sources/examples/lsp/lsp.cpp +21 -17
- data/ext/sources/examples/quantize/quantize.cpp +3 -0
- data/ext/sources/examples/server/CMakeLists.txt +3 -0
- data/ext/sources/examples/server/server.cpp +193 -35
- data/ext/sources/examples/server.py +6 -1
- data/ext/sources/examples/stream/stream.cpp +10 -2
- data/ext/sources/examples/stream.wasm/emscripten.cpp +6 -6
- data/ext/sources/examples/stream.wasm/index-tmpl.html +82 -5
- data/ext/sources/examples/talk-llama/CMakeLists.txt +3 -0
- data/ext/sources/examples/talk-llama/llama-adapter.cpp +101 -4
- data/ext/sources/examples/talk-llama/llama-adapter.h +6 -0
- data/ext/sources/examples/talk-llama/llama-arch.cpp +756 -15
- data/ext/sources/examples/talk-llama/llama-arch.h +85 -1
- data/ext/sources/examples/talk-llama/llama-batch.cpp +773 -272
- data/ext/sources/examples/talk-llama/llama-batch.h +126 -55
- data/ext/sources/examples/talk-llama/llama-chat.cpp +150 -13
- data/ext/sources/examples/talk-llama/llama-chat.h +8 -0
- data/ext/sources/examples/talk-llama/llama-context.cpp +814 -542
- data/ext/sources/examples/talk-llama/llama-context.h +68 -32
- data/ext/sources/examples/talk-llama/llama-cparams.cpp +1 -1
- data/ext/sources/examples/talk-llama/llama-cparams.h +4 -4
- data/ext/sources/examples/talk-llama/llama-graph.cpp +787 -440
- data/ext/sources/examples/talk-llama/llama-graph.h +333 -153
- data/ext/sources/examples/talk-llama/llama-hparams.cpp +128 -6
- data/ext/sources/examples/talk-llama/llama-hparams.h +80 -17
- data/ext/sources/examples/talk-llama/llama-impl.h +2 -0
- data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.cpp +326 -0
- data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.h +137 -0
- data/ext/sources/examples/talk-llama/llama-kv-cache.cpp +1248 -1967
- data/ext/sources/examples/talk-llama/llama-kv-cache.h +218 -345
- data/ext/sources/examples/talk-llama/llama-kv-cells.h +164 -52
- data/ext/sources/examples/talk-llama/llama-memory-hybrid.cpp +266 -0
- data/ext/sources/examples/talk-llama/llama-memory-hybrid.h +139 -0
- data/ext/sources/examples/talk-llama/llama-memory-recurrent.cpp +1154 -0
- data/ext/sources/examples/talk-llama/llama-memory-recurrent.h +182 -0
- data/ext/sources/examples/talk-llama/llama-memory.cpp +58 -0
- data/ext/sources/examples/talk-llama/llama-memory.h +94 -4
- data/ext/sources/examples/talk-llama/llama-mmap.cpp +1 -1
- data/ext/sources/examples/talk-llama/llama-model-loader.cpp +44 -17
- data/ext/sources/examples/talk-llama/llama-model-loader.h +3 -2
- data/ext/sources/examples/talk-llama/llama-model-saver.cpp +1 -0
- data/ext/sources/examples/talk-llama/llama-model.cpp +11377 -5248
- data/ext/sources/examples/talk-llama/llama-model.h +87 -9
- data/ext/sources/examples/talk-llama/llama-quant.cpp +137 -16
- data/ext/sources/examples/talk-llama/llama-sampling.cpp +226 -126
- data/ext/sources/examples/talk-llama/llama-vocab.cpp +502 -38
- data/ext/sources/examples/talk-llama/llama-vocab.h +46 -0
- data/ext/sources/examples/talk-llama/llama.cpp +76 -17
- data/ext/sources/examples/talk-llama/llama.h +176 -151
- data/ext/sources/examples/talk-llama/talk-llama.cpp +11 -6
- data/ext/sources/examples/talk-llama/unicode.cpp +212 -0
- data/ext/sources/examples/talk-llama/unicode.h +45 -0
- data/ext/sources/examples/vad-speech-segments/speech.cpp +6 -0
- data/ext/sources/examples/wchess/wchess.cmd/wchess.cmd.cpp +6 -2
- data/ext/sources/examples/whisper.wasm/index-tmpl.html +17 -16
- data/ext/sources/ggml/CMakeLists.txt +106 -33
- data/ext/sources/ggml/cmake/common.cmake +24 -0
- data/ext/sources/ggml/cmake/ggml-config.cmake.in +132 -93
- data/ext/sources/ggml/include/ggml-backend.h +18 -2
- data/ext/sources/ggml/include/ggml-cpu.h +2 -0
- data/ext/sources/ggml/include/ggml-metal.h +1 -6
- data/ext/sources/ggml/include/ggml-opt.h +25 -6
- data/ext/sources/ggml/include/ggml-webgpu.h +19 -0
- data/ext/sources/ggml/include/ggml-zdnn.h +17 -0
- data/ext/sources/ggml/include/ggml.h +365 -21
- data/ext/sources/ggml/src/CMakeLists.txt +98 -25
- data/ext/sources/ggml/src/ggml-alloc.c +265 -141
- data/ext/sources/ggml/src/ggml-backend-impl.h +4 -1
- data/ext/sources/ggml/src/ggml-backend-reg.cpp +35 -13
- data/ext/sources/ggml/src/ggml-backend.cpp +266 -60
- data/ext/sources/ggml/src/ggml-blas/CMakeLists.txt +4 -4
- data/ext/sources/ggml/src/ggml-blas/ggml-blas.cpp +5 -4
- data/ext/sources/ggml/src/ggml-cann/CMakeLists.txt +15 -0
- data/ext/sources/ggml/src/ggml-cann/acl_tensor.cpp +3 -1
- data/ext/sources/ggml/src/ggml-cann/aclnn_ops.cpp +903 -717
- data/ext/sources/ggml/src/ggml-cann/aclnn_ops.h +143 -25
- data/ext/sources/ggml/src/ggml-cann/common.h +149 -2
- data/ext/sources/ggml/src/ggml-cann/ggml-cann.cpp +521 -78
- data/ext/sources/ggml/src/ggml-common.h +21 -0
- data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +165 -50
- data/ext/sources/ggml/src/ggml-cpu/amx/amx.cpp +5 -3
- data/ext/sources/ggml/src/ggml-cpu/amx/mmq.cpp +11 -10
- data/ext/sources/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/arm/quants.c +3650 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/arm/repack.cpp +1891 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/loongarch/quants.c +2160 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/powerpc/quants.c +2305 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/riscv/quants.c +1897 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/riscv/repack.cpp +342 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/s390/quants.c +1468 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/wasm/quants.c +1221 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/x86/quants.c +3820 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/x86/repack.cpp +6307 -0
- data/ext/sources/ggml/src/ggml-cpu/arch-fallback.h +214 -0
- data/ext/sources/ggml/src/ggml-cpu/common.h +18 -3
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-impl.h +23 -7
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +179 -110
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.cpp +44 -33
- data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-hbm.cpp → hbm.cpp} +1 -1
- data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.cpp +152 -18
- data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.h +7 -1
- data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +228 -98
- data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +532 -1124
- data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.h +5 -0
- data/ext/sources/ggml/src/ggml-cpu/ops.cpp +3374 -2081
- data/ext/sources/ggml/src/ggml-cpu/ops.h +13 -8
- data/ext/sources/ggml/src/ggml-cpu/quants.c +1193 -0
- data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-quants.h → quants.h} +34 -0
- data/ext/sources/ggml/src/ggml-cpu/repack.cpp +1982 -0
- data/ext/sources/ggml/src/ggml-cpu/repack.h +120 -0
- data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +367 -46
- data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.cpp +1024 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.h +13 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp +3196 -0
- data/ext/sources/ggml/src/ggml-cpu/spacemit/ime_kernels.h +26 -0
- data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-traits.cpp → traits.cpp} +3 -3
- data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-traits.h → traits.h} +1 -1
- data/ext/sources/ggml/src/ggml-cpu/vec.cpp +272 -35
- data/ext/sources/ggml/src/ggml-cpu/vec.h +794 -142
- data/ext/sources/ggml/src/ggml-cuda/CMakeLists.txt +20 -16
- data/ext/sources/ggml/src/ggml-cuda/add-id.cu +58 -0
- data/ext/sources/ggml/src/ggml-cuda/add-id.cuh +3 -0
- data/ext/sources/ggml/src/ggml-cuda/binbcast.cu +330 -191
- data/ext/sources/ggml/src/ggml-cuda/binbcast.cuh +2 -0
- data/ext/sources/ggml/src/ggml-cuda/common.cuh +291 -81
- data/ext/sources/ggml/src/ggml-cuda/conv-transpose-1d.cu +1 -4
- data/ext/sources/ggml/src/ggml-cuda/conv2d-dw.cu +161 -0
- data/ext/sources/ggml/src/ggml-cuda/conv2d-dw.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/conv2d-transpose.cu +91 -0
- data/ext/sources/ggml/src/ggml-cuda/conv2d-transpose.cuh +4 -0
- data/ext/sources/ggml/src/ggml-cuda/conv2d.cu +166 -0
- data/ext/sources/ggml/src/ggml-cuda/conv2d.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/convert.cu +117 -22
- data/ext/sources/ggml/src/ggml-cuda/convert.cuh +20 -0
- data/ext/sources/ggml/src/ggml-cuda/cpy-utils.cuh +217 -0
- data/ext/sources/ggml/src/ggml-cuda/cpy.cu +64 -307
- data/ext/sources/ggml/src/ggml-cuda/cross-entropy-loss.cu +2 -14
- data/ext/sources/ggml/src/ggml-cuda/dequantize.cuh +14 -40
- data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +499 -368
- data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +142 -93
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cu +755 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cuh +3 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn-vec.cuh +593 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +90 -50
- data/ext/sources/ggml/src/ggml-cuda/fattn.cu +185 -198
- data/ext/sources/ggml/src/ggml-cuda/fattn.cuh +2 -0
- data/ext/sources/ggml/src/ggml-cuda/getrows.cu +50 -39
- data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +636 -222
- data/ext/sources/ggml/src/ggml-cuda/im2col.cu +196 -35
- data/ext/sources/ggml/src/ggml-cuda/im2col.cuh +1 -0
- data/ext/sources/ggml/src/ggml-cuda/mean.cu +73 -0
- data/ext/sources/ggml/src/ggml-cuda/mean.cuh +3 -0
- data/ext/sources/ggml/src/ggml-cuda/mma.cuh +198 -45
- data/ext/sources/ggml/src/ggml-cuda/mmf.cu +123 -0
- data/ext/sources/ggml/src/ggml-cuda/mmf.cuh +496 -0
- data/ext/sources/ggml/src/ggml-cuda/mmq.cu +206 -57
- data/ext/sources/ggml/src/ggml-cuda/mmq.cuh +1262 -721
- data/ext/sources/ggml/src/ggml-cuda/mmvf.cu +506 -0
- data/ext/sources/ggml/src/ggml-cuda/{mmv.cuh → mmvf.cuh} +4 -5
- data/ext/sources/ggml/src/ggml-cuda/mmvq.cu +64 -73
- data/ext/sources/ggml/src/ggml-cuda/norm.cu +284 -12
- data/ext/sources/ggml/src/ggml-cuda/norm.cuh +7 -0
- data/ext/sources/ggml/src/ggml-cuda/opt-step-sgd.cu +49 -0
- data/ext/sources/ggml/src/ggml-cuda/opt-step-sgd.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/pad.cu +46 -23
- data/ext/sources/ggml/src/ggml-cuda/pad_reflect_1d.cu +91 -0
- data/ext/sources/ggml/src/ggml-cuda/pad_reflect_1d.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/quantize.cu +12 -10
- data/ext/sources/ggml/src/ggml-cuda/reduce_rows.cuh +53 -0
- data/ext/sources/ggml/src/ggml-cuda/roll.cu +67 -0
- data/ext/sources/ggml/src/ggml-cuda/roll.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/rope.cu +21 -27
- data/ext/sources/ggml/src/ggml-cuda/scale.cu +14 -11
- data/ext/sources/ggml/src/ggml-cuda/set-rows.cu +276 -0
- data/ext/sources/ggml/src/ggml-cuda/set-rows.cuh +7 -0
- data/ext/sources/ggml/src/ggml-cuda/softcap.cu +34 -0
- data/ext/sources/ggml/src/ggml-cuda/softcap.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/softmax.cu +126 -59
- data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cu +10 -2
- data/ext/sources/ggml/src/ggml-cuda/ssm-scan.cu +322 -98
- data/ext/sources/ggml/src/ggml-cuda/sum.cu +6 -10
- data/ext/sources/ggml/src/ggml-cuda/sumrows.cu +23 -19
- data/ext/sources/ggml/src/ggml-cuda/sumrows.cuh +0 -1
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-f16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q4_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q4_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q5_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q5_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q8_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-f16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q5_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q5_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q8_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-f16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q4_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q4_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q5_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q5_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q8_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-f16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q4_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q4_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q5_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q5_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q8_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-f16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q4_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q4_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q5_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q5_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q8_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-f16.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q4_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q4_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q5_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q5_1.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q8_0.cu +7 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +21 -18
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_10.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_11.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_12.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_13.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_14.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_15.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_16.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_2.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_3.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_4.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_5.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_6.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_7.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_8.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_9.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-mxfp4.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/topk-moe.cu +259 -0
- data/ext/sources/ggml/src/ggml-cuda/topk-moe.cuh +14 -0
- data/ext/sources/ggml/src/ggml-cuda/tsembd.cu +3 -3
- data/ext/sources/ggml/src/ggml-cuda/unary.cu +179 -0
- data/ext/sources/ggml/src/ggml-cuda/unary.cuh +15 -0
- data/ext/sources/ggml/src/ggml-cuda/upscale.cu +92 -6
- data/ext/sources/ggml/src/ggml-cuda/vecdotq.cuh +110 -22
- data/ext/sources/ggml/src/ggml-cuda/vendors/cuda.h +4 -0
- data/ext/sources/ggml/src/ggml-cuda/vendors/hip.h +58 -36
- data/ext/sources/ggml/src/ggml-cuda/vendors/musa.h +4 -3
- data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +14 -2
- data/ext/sources/ggml/src/ggml-impl.h +229 -175
- data/ext/sources/ggml/src/ggml-metal/CMakeLists.txt +21 -17
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-common.cpp +446 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-common.h +52 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.h +33 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.m +600 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.cpp +1376 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.h +226 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.m +1308 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +163 -63
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.cpp +3158 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.h +82 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal.cpp +718 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal.metal +3208 -1575
- data/ext/sources/ggml/src/ggml-musa/CMakeLists.txt +18 -8
- data/ext/sources/ggml/src/ggml-musa/mudnn.cuh +2 -2
- data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +32 -0
- data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +4430 -792
- data/ext/sources/ggml/src/ggml-opencl/kernels/add.cl +107 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/add_id.cl +42 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/argsort.cl +86 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/concat.cl +109 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/conv2d.cl +185 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/conv2d_f16_f32.cl +176 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/cvt.cl +84 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/div.cl +138 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/flash_attn_f16.cl +370 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/flash_attn_f32.cl +370 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/flash_attn_f32_f16.cl +373 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gelu.cl +27 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/glu.cl +378 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/group_norm.cl +121 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/im2col_f16.cl +1 -1
- data/ext/sources/ggml/src/ggml-opencl/kernels/im2col_f32.cl +1 -1
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul.cl +73 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mat_f16_f32.cl +130 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_l4_lm.cl +132 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_f32_f32_l4_lm.cl +133 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32.cl +189 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32_flat.cl +176 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl +283 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32.cl +140 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32_flat.cl +222 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32.cl +144 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32_flat.cl +167 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32.cl +125 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32_flat.cl +202 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/norm.cl +80 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/pad.cl +30 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/repeat.cl +39 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/rms_norm.cl +79 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/scale.cl +3 -2
- data/ext/sources/ggml/src/ggml-opencl/kernels/set_rows.cl +189 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/sigmoid.cl +29 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_4_f16.cl +34 -13
- data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_4_f32.cl +34 -13
- data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_f16.cl +34 -13
- data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_f32.cl +34 -13
- data/ext/sources/ggml/src/ggml-opencl/kernels/sub.cl +138 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/sum_rows.cl +39 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/tanh.cl +63 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/transpose.cl +20 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/tsembd.cl +48 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/upscale.cl +120 -0
- data/ext/sources/ggml/src/ggml-opt.cpp +97 -41
- data/ext/sources/ggml/src/ggml-quants.c +117 -24
- data/ext/sources/ggml/src/ggml-quants.h +6 -0
- data/ext/sources/ggml/src/ggml-rpc/ggml-rpc.cpp +85 -62
- data/ext/sources/ggml/src/ggml-sycl/CMakeLists.txt +3 -3
- data/ext/sources/ggml/src/ggml-sycl/backend.hpp +2 -0
- data/ext/sources/ggml/src/ggml-sycl/binbcast.cpp +9 -0
- data/ext/sources/ggml/src/ggml-sycl/binbcast.hpp +6 -0
- data/ext/sources/ggml/src/ggml-sycl/common.hpp +20 -48
- data/ext/sources/ggml/src/ggml-sycl/concat.cpp +13 -17
- data/ext/sources/ggml/src/ggml-sycl/convert.cpp +21 -2
- data/ext/sources/ggml/src/ggml-sycl/cpy.cpp +116 -211
- data/ext/sources/ggml/src/ggml-sycl/cpy.hpp +213 -1
- data/ext/sources/ggml/src/ggml-sycl/dequantize.hpp +32 -0
- data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +700 -1041
- data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +20 -9
- data/ext/sources/ggml/src/ggml-sycl/gemm.hpp +17 -26
- data/ext/sources/ggml/src/ggml-sycl/getrows.cpp +2 -96
- data/ext/sources/ggml/src/ggml-sycl/ggml-sycl.cpp +393 -250
- data/ext/sources/ggml/src/ggml-sycl/im2col.cpp +1 -1
- data/ext/sources/ggml/src/ggml-sycl/mmvq.cpp +32 -8
- data/ext/sources/ggml/src/ggml-sycl/quantize.hpp +133 -0
- data/ext/sources/ggml/src/ggml-sycl/quants.hpp +38 -11
- data/ext/sources/ggml/src/ggml-sycl/rope.cpp +125 -21
- data/ext/sources/ggml/src/ggml-sycl/set_rows.cpp +234 -0
- data/ext/sources/ggml/src/ggml-sycl/set_rows.hpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/sycl_hw.cpp +3 -1
- data/ext/sources/ggml/src/ggml-sycl/sycl_hw.hpp +3 -0
- data/ext/sources/ggml/src/ggml-sycl/tsembd.cpp +4 -3
- data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +105 -17
- data/ext/sources/ggml/src/ggml-vulkan/CMakeLists.txt +36 -32
- data/ext/sources/ggml/src/ggml-vulkan/ggml-vulkan.cpp +4198 -1145
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -12
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add.comp +41 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add_id.comp +42 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argmax.comp +13 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp +39 -29
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +349 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +98 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +66 -12
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.comp +154 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.comp +21 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_s.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xxs.comp +2 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_s.comp +6 -5
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_xxs.comp +4 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_mxfp4.comp +32 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q2_k.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q3_k.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_k.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_k.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q6_k.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/exp.comp +21 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +69 -24
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp +60 -20
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +98 -42
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +64 -27
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +74 -13
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu.comp +13 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu_erf.comp +27 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu_quick.comp +11 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gelu_erf.comp +39 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/generic_binary_head.comp +4 -17
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/get_rows.comp +19 -10
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/get_rows_quant.comp +25 -15
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.comp +19 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_main.comp +29 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/hardsigmoid.comp +22 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/hardswish.comp +22 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +18 -14
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col_3d.comp +126 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.comp +65 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_nc.comp +11 -7
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp +140 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +144 -531
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +206 -38
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.comp +556 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +12 -5
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.comp +15 -9
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp +111 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_sgd.comp +22 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/pad.comp +24 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/quantize_q8_1.comp +53 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/reglu.comp +9 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +64 -11
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_partials.comp +65 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/roll.comp +46 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.comp +1 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +7 -9
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +7 -9
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +7 -9
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rte.comp +5 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +29 -7
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_back.comp +4 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sqrt.comp +17 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.comp +38 -5
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/swiglu.comp +9 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/swiglu_oai.comp +14 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/timestep_embedding.comp +4 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/types.comp +101 -9
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +69 -5
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/utils.comp +25 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +338 -71
- data/ext/sources/ggml/src/ggml-webgpu/CMakeLists.txt +54 -0
- data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu.cpp +1558 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/add.tmpl.wgsl +44 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/add_in_place.tmpl.wgsl +41 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary_head.tmpl +45 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +930 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl +60 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +124 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/get_rows.tmpl.wgsl +874 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl +40 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul.tmpl.wgsl +44 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_in_place.tmpl.wgsl +41 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl +907 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm.wgsl +57 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm_in_place.wgsl +48 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +81 -0
- data/ext/sources/ggml/src/ggml-zdnn/CMakeLists.txt +36 -0
- data/ext/sources/ggml/src/ggml-zdnn/common.hpp +59 -0
- data/ext/sources/ggml/src/ggml-zdnn/ggml-zdnn.cpp +628 -0
- data/ext/sources/ggml/src/ggml-zdnn/mmf.cpp +80 -0
- data/ext/sources/ggml/src/ggml-zdnn/mmf.hpp +12 -0
- data/ext/sources/ggml/src/ggml-zdnn/utils.cpp +79 -0
- data/ext/sources/ggml/src/ggml-zdnn/utils.hpp +19 -0
- data/ext/sources/ggml/src/ggml.c +802 -142
- data/ext/sources/ggml/src/ggml.cpp +26 -0
- data/ext/sources/ggml/src/gguf.cpp +32 -4
- data/ext/sources/include/whisper.h +2 -0
- data/ext/sources/src/CMakeLists.txt +2 -0
- data/ext/sources/src/coreml/whisper-compat.h +10 -0
- data/ext/sources/src/coreml/whisper-compat.m +35 -0
- data/ext/sources/src/coreml/whisper-decoder-impl.m +1 -0
- data/ext/sources/src/coreml/whisper-encoder-impl.m +1 -0
- data/ext/sources/src/whisper.cpp +241 -215
- data/ext/sources/tests/CMakeLists.txt +8 -1
- data/ext/sources/tests/test-vad-full.cpp +3 -3
- data/ext/sources/tests/test-vad.cpp +2 -2
- data/extsources.rb +15 -9
- data/lib/whisper/context.rb +15 -0
- data/lib/whisper/model/uri.rb +57 -2
- data/lib/whisper/segment.rb +58 -0
- data/sig/whisper.rbs +75 -38
- data/{tests → test}/helper.rb +1 -12
- data/{tests → test}/test_model.rb +9 -0
- data/test/test_package.rb +51 -0
- data/{tests → test}/test_params.rb +8 -0
- data/test/test_segment.rb +146 -0
- data/{tests → test}/test_whisper.rb +70 -0
- data/whispercpp.gemspec +2 -3
- metadata +246 -191
- data/ext/sources/.dockerignore +0 -3
- data/ext/sources/.github/workflows/bindings-ruby.yml +0 -21
- data/ext/sources/ci/run.sh +0 -336
- data/ext/sources/close-issue.yml +0 -28
- data/ext/sources/ggml/include/ggml-kompute.h +0 -50
- data/ext/sources/ggml/src/ggml-amx/CMakeLists.txt +0 -107
- data/ext/sources/ggml/src/ggml-amx/common.h +0 -94
- data/ext/sources/ggml/src/ggml-amx/ggml-amx.cpp +0 -446
- data/ext/sources/ggml/src/ggml-amx/mmq.cpp +0 -2510
- data/ext/sources/ggml/src/ggml-amx/mmq.h +0 -17
- data/ext/sources/ggml/src/ggml-cann/kernels/CMakeLists.txt +0 -30
- data/ext/sources/ggml/src/ggml-cann/kernels/ascendc_kernels.h +0 -19
- data/ext/sources/ggml/src/ggml-cann/kernels/dup.cpp +0 -234
- data/ext/sources/ggml/src/ggml-cann/kernels/get_row_f16.cpp +0 -197
- data/ext/sources/ggml/src/ggml-cann/kernels/get_row_f32.cpp +0 -190
- data/ext/sources/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -204
- data/ext/sources/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -191
- data/ext/sources/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -218
- data/ext/sources/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -216
- data/ext/sources/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -295
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +0 -6431
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +0 -8
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-quants.c +0 -13747
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f16.cu +0 -357
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f16.cuh +0 -3
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f32.cu +0 -365
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f32.cuh +0 -3
- data/ext/sources/ggml/src/ggml-cuda/fattn-vec-f16.cuh +0 -482
- data/ext/sources/ggml/src/ggml-cuda/fattn-vec-f32.cuh +0 -472
- data/ext/sources/ggml/src/ggml-cuda/mmv.cu +0 -336
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +0 -5
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +0 -5
- data/ext/sources/ggml/src/ggml-kompute/CMakeLists.txt +0 -166
- data/ext/sources/ggml/src/ggml-kompute/ggml-kompute.cpp +0 -2251
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/common.comp +0 -112
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_add.comp +0 -58
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_addrow.comp +0 -25
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f16.comp +0 -52
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f32.comp +0 -52
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f16.comp +0 -52
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f32.comp +0 -52
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_diagmask.comp +0 -30
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_gelu.comp +0 -22
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows.comp +0 -17
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f16.comp +0 -31
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f32.comp +0 -31
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_0.comp +0 -38
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_1.comp +0 -39
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q6_k.comp +0 -44
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul.comp +0 -52
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_f16.comp +0 -69
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_mat_f32.comp +0 -51
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_0.comp +0 -33
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_1.comp +0 -35
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_k.comp +0 -140
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q6_k.comp +0 -106
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q8_0.comp +0 -73
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n.comp +0 -52
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n_pre.comp +0 -28
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_norm.comp +0 -84
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_relu.comp +0 -21
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rmsnorm.comp +0 -53
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f16.comp +0 -52
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f32.comp +0 -52
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f16.comp +0 -52
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f32.comp +0 -52
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_scale.comp +0 -19
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_scale_8.comp +0 -23
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_silu.comp +0 -22
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_softmax.comp +0 -72
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/rope_common.comp +0 -71
- data/ext/sources/ggml/src/ggml-metal/ggml-metal.m +0 -5998
- data/tests/test_package.rb +0 -46
- data/tests/test_segment.rb +0 -74
- /data/ext/sources/ggml/src/ggml-cpu/{cpu-feats-x86.cpp → arch/x86/cpu-feats.cpp} +0 -0
- /data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-hbm.h → hbm.h} +0 -0
- /data/{tests → test}/jfk_reader/.gitignore +0 -0
- /data/{tests → test}/jfk_reader/extconf.rb +0 -0
- /data/{tests → test}/jfk_reader/jfk_reader.c +0 -0
- /data/{tests → test}/test_callback.rb +0 -0
- /data/{tests → test}/test_error.rb +0 -0
- /data/{tests → test}/test_vad.rb +0 -0
- /data/{tests → test}/test_vad_params.rb +0 -0
@@ -5,6 +5,7 @@
|
|
5
5
|
#include "ggml-impl.h"
|
6
6
|
#include "simd-mappings.h"
|
7
7
|
#include "ggml.h"
|
8
|
+
#include "ggml-cpu.h"
|
8
9
|
|
9
10
|
#if defined(GGML_USE_ACCELERATE)
|
10
11
|
#include <Accelerate/Accelerate.h>
|
@@ -54,10 +55,25 @@ inline static void ggml_vec_cpy_i32(const int n, int32_t * y, const int32_t * x)
|
|
54
55
|
|
55
56
|
inline static void ggml_vec_set_f16(const int n, ggml_fp16_t * x, const ggml_fp16_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
|
56
57
|
inline static void ggml_vec_set_bf16(const int n, ggml_bf16_t * x, const ggml_bf16_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
|
57
|
-
|
58
|
+
|
59
|
+
inline static void ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) {
|
60
|
+
int i = 0;
|
61
|
+
#if defined(__AVX2__)
|
62
|
+
for (; i + 7 < n; i += 8) {
|
63
|
+
__m256 vx = _mm256_loadu_ps(x + i);
|
64
|
+
__m256 vy = _mm256_loadu_ps(y + i);
|
65
|
+
__m256 vz = _mm256_add_ps(vx, vy);
|
66
|
+
_mm256_storeu_ps(z + i, vz);
|
67
|
+
}
|
68
|
+
#endif
|
69
|
+
for (; i < n; ++i) {
|
70
|
+
z[i] = x[i] + y[i];
|
71
|
+
}
|
72
|
+
}
|
73
|
+
|
58
74
|
inline static void ggml_vec_add_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) {
|
59
75
|
for (int i = 0; i < n; ++i) {
|
60
|
-
z[i] =
|
76
|
+
z[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(x[i]) + GGML_CPU_FP16_TO_FP32(y[i]));
|
61
77
|
}
|
62
78
|
}
|
63
79
|
inline static void ggml_vec_add1_f32(const int n, float * z, const float * x, const float v) { for (int i = 0; i < n; ++i) z[i] = x[i] + v; }
|
@@ -66,7 +82,7 @@ inline static void ggml_vec_acc1_f32(const int n, float * y, const float v)
|
|
66
82
|
inline static void ggml_vec_sub_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] - y[i]; }
|
67
83
|
inline static void ggml_vec_sub_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) {
|
68
84
|
for (int i = 0; i < n; ++i) {
|
69
|
-
z[i] =
|
85
|
+
z[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(x[i]) - GGML_CPU_FP16_TO_FP32(y[i]));
|
70
86
|
}
|
71
87
|
}
|
72
88
|
inline static void ggml_vec_set_f32 (const int n, float * x, const float v) { for (int i = 0; i < n; ++i) x[i] = v; }
|
@@ -74,20 +90,20 @@ inline static void ggml_vec_cpy_f32 (const int n, float * y, const float * x)
|
|
74
90
|
inline static void ggml_vec_neg_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = -x[i]; }
|
75
91
|
inline static void ggml_vec_neg_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
|
76
92
|
for (int i = 0; i < n; ++i) {
|
77
|
-
y[i] =
|
93
|
+
y[i] = GGML_CPU_FP32_TO_FP16(-GGML_CPU_FP16_TO_FP32(x[i]));
|
78
94
|
}
|
79
95
|
}
|
80
96
|
|
81
97
|
inline static void ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]*y[i]; }
|
82
98
|
inline static void ggml_vec_mul_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) {
|
83
99
|
for (int i = 0; i < n; ++i) {
|
84
|
-
z[i] =
|
100
|
+
z[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(x[i]) * GGML_CPU_FP16_TO_FP32(y[i]));
|
85
101
|
}
|
86
102
|
}
|
87
103
|
inline static void ggml_vec_div_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]/y[i]; }
|
88
104
|
inline static void ggml_vec_div_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) {
|
89
105
|
for (int i = 0; i < n; ++i) {
|
90
|
-
z[i] =
|
106
|
+
z[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(x[i]) / GGML_CPU_FP16_TO_FP32(y[i]));
|
91
107
|
}
|
92
108
|
}
|
93
109
|
|
@@ -103,40 +119,153 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GG
|
|
103
119
|
}
|
104
120
|
|
105
121
|
#if defined(GGML_SIMD)
|
106
|
-
|
122
|
+
#if defined(__ARM_FEATURE_SVE)
|
123
|
+
|
124
|
+
const int sve_register_length = svcntb() * 8;
|
125
|
+
const int ggml_f16_epr = sve_register_length / 16; // running when 16
|
126
|
+
const int ggml_f16_step = 8 * ggml_f16_epr; // choose 8 SVE registers
|
127
|
+
|
128
|
+
const int np = (n & ~(ggml_f16_step - 1));
|
129
|
+
|
130
|
+
svfloat16_t sum_00 = svdup_n_f16(0.0f);
|
131
|
+
svfloat16_t sum_01 = svdup_n_f16(0.0f);
|
132
|
+
svfloat16_t sum_02 = svdup_n_f16(0.0f);
|
133
|
+
svfloat16_t sum_03 = svdup_n_f16(0.0f);
|
134
|
+
|
135
|
+
svfloat16_t sum_10 = svdup_n_f16(0.0f);
|
136
|
+
svfloat16_t sum_11 = svdup_n_f16(0.0f);
|
137
|
+
svfloat16_t sum_12 = svdup_n_f16(0.0f);
|
138
|
+
svfloat16_t sum_13 = svdup_n_f16(0.0f);
|
139
|
+
|
140
|
+
svfloat16_t ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8;
|
141
|
+
svfloat16_t ay1, ay2, ay3, ay4, ay5, ay6, ay7, ay8;
|
142
|
+
|
143
|
+
for (int i = 0; i < np; i += ggml_f16_step) {
|
144
|
+
ay1 = GGML_F16x_VEC_LOAD(y + i + 0 * ggml_f16_epr, 0); // 8 elements
|
145
|
+
|
146
|
+
ax1 = GGML_F16x_VEC_LOAD(x[0] + i + 0*ggml_f16_epr, 0); // 8 elemnst
|
147
|
+
sum_00 = GGML_F16x_VEC_FMA(sum_00, ax1, ay1); // sum_00 = sum_00+ax1*ay1
|
148
|
+
ax1 = GGML_F16x_VEC_LOAD(x[1] + i + 0*ggml_f16_epr, 0); // 8 elements
|
149
|
+
sum_10 = GGML_F16x_VEC_FMA(sum_10, ax1, ay1);
|
150
|
+
|
151
|
+
ay2 = GGML_F16x_VEC_LOAD(y + i + 1 * ggml_f16_epr, 1); // next 8 elements
|
152
|
+
|
153
|
+
ax2 = GGML_F16x_VEC_LOAD(x[0] + i + 1*ggml_f16_epr, 1); // next 8 ekements
|
154
|
+
sum_01 = GGML_F16x_VEC_FMA(sum_01, ax2, ay2);
|
155
|
+
ax2 = GGML_F16x_VEC_LOAD(x[1] + i + 1*ggml_f16_epr, 1);
|
156
|
+
sum_11 = GGML_F16x_VEC_FMA(sum_11, ax2, ay2);
|
107
157
|
|
108
|
-
|
158
|
+
ay3 = GGML_F16x_VEC_LOAD(y + i + 2 * ggml_f16_epr, 2);
|
109
159
|
|
110
|
-
|
111
|
-
|
160
|
+
+            ax3 = GGML_F16x_VEC_LOAD(x[0] + i + 2*ggml_f16_epr, 2);
+            sum_02 = GGML_F16x_VEC_FMA(sum_02, ax3, ay3);
+            ax1 = GGML_F16x_VEC_LOAD(x[1] + i + 2*ggml_f16_epr, 2);
+            sum_12 = GGML_F16x_VEC_FMA(sum_12, ax3, ay3);

-
-        for (int j = 0; j < GGML_F16_ARR; j++) {
-            ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
+            ay4 = GGML_F16x_VEC_LOAD(y + i + 3 * ggml_f16_epr, 3);

-
-
+            ax4 = GGML_F16x_VEC_LOAD(x[0] + i + 3*ggml_f16_epr, 3);
+            sum_03 = GGML_F16x_VEC_FMA(sum_03, ax4, ay4);
+            ax4 = GGML_F16x_VEC_LOAD(x[1] + i + 3*ggml_f16_epr, 3);
+            sum_13 = GGML_F16x_VEC_FMA(sum_13, ax4, ay4);

-
+            ay5 = GGML_F16x_VEC_LOAD(y + i + 4 * ggml_f16_epr, 4);
+
+            ax5 = GGML_F16x_VEC_LOAD(x[0] + i + 4*ggml_f16_epr, 4);
+
+            sum_00 = GGML_F16x_VEC_FMA(sum_00, ax5, ay5);
+            ax5 = GGML_F16x_VEC_LOAD(x[1] + i + 4*ggml_f16_epr, 4);
+            sum_10 = GGML_F16x_VEC_FMA(sum_10, ax5, ay5);
+
+            ay6 = GGML_F16x_VEC_LOAD(y + i + 5 * ggml_f16_epr, 5);
+
+            ax6 = GGML_F16x_VEC_LOAD(x[0] + i + 5*ggml_f16_epr, 5);
+
+            sum_01 = GGML_F16x_VEC_FMA(sum_01, ax6, ay6);
+            ax6 = GGML_F16x_VEC_LOAD(x[1] + i + 5*ggml_f16_epr, 5);
+            sum_11 = GGML_F16x_VEC_FMA(sum_11, ax6, ay6);
+
+            ay7 = GGML_F16x_VEC_LOAD(y + i + 6 * ggml_f16_epr, 6);
+
+            ax7 = GGML_F16x_VEC_LOAD(x[0] + i + 6*ggml_f16_epr, 6);
+
+            sum_02 = GGML_F16x_VEC_FMA(sum_02, ax7, ay7);
+            ax7 = GGML_F16x_VEC_LOAD(x[1] + i + 6*ggml_f16_epr, 6);
+            sum_12 = GGML_F16x_VEC_FMA(sum_12, ax7, ay7);
+
+            ay8 = GGML_F16x_VEC_LOAD(y + i + 7 * ggml_f16_epr, 7);
+
+            ax8 = GGML_F16x_VEC_LOAD(x[0] + i + 7*ggml_f16_epr, 7);
+
+            sum_03 = GGML_F16x_VEC_FMA(sum_03, ax8, ay8);
+            ax8 = GGML_F16x_VEC_LOAD(x[1] + i + 7*ggml_f16_epr, 7);
+            sum_13 = GGML_F16x_VEC_FMA(sum_13, ax8, ay8);
+        }
+
+        const int np2 = (n & ~(ggml_f16_epr - 1));
+        for (int k = np; k < np2; k += ggml_f16_epr) {
+            svfloat16_t ry = GGML_F16x_VEC_LOAD(y + k, 0);
+
+            svfloat16_t rx = GGML_F16x_VEC_LOAD(x[0] + k, 0);
+            sum_00 = GGML_F16x_VEC_FMA(sum_00, rx, ry);
+            rx = GGML_F16x_VEC_LOAD(x[1] + k, 0);
+            sum_10 = GGML_F16x_VEC_FMA(sum_10, rx, ry);
+        }
+
+        if (np2 < n) {
+            svbool_t pg = svwhilelt_b16(np2, n);
+            svfloat16_t hx_0 = svld1_f16(pg, (const __fp16 *)(x[0] + np2));
+            svfloat16_t hx_1 = svld1_f16(pg, (const __fp16 *)(x[1] + np2));
+            svfloat16_t hy = svld1_f16(pg, (const __fp16 *)(y + np2));
+
+            sum_00 = svmad_f16_x(pg, hx_0, hy, sum_00);
+            sum_10 = svmad_f16_x(pg, hx_1, hy, sum_10);
+        }
+        GGML_F16x_VEC_REDUCE(sumf[0], sum_00, sum_01, sum_02, sum_03);
+        GGML_F16x_VEC_REDUCE(sumf[1], sum_10, sum_11, sum_12, sum_13);
+    #elif defined(__riscv_v_intrinsic)
+        // todo: RVV impl
+        for (int i = 0; i < n; ++i) {
+            for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) {
+                sumf[j] += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[j][i])*GGML_CPU_FP16_TO_FP32(y[i]));
+            }
+        }
+    #else
+        const int np = (n & ~(GGML_F16_STEP - 1));
+
+        GGML_F16_VEC sum[GGML_VEC_DOT_UNROLL][GGML_F16_ARR] = { { GGML_F16_VEC_ZERO } };
+
+        GGML_F16_VEC ax[GGML_F16_ARR];
+        GGML_F16_VEC ay[GGML_F16_ARR];
+
+        for (int i = 0; i < np; i += GGML_F16_STEP) {
+            for (int j = 0; j < GGML_F16_ARR; j++) {
+                ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
+
+                for (int k = 0; k < GGML_VEC_DOT_UNROLL; ++k) {
+                    ax[j] = GGML_F16_VEC_LOAD(x[k] + i + j*GGML_F16_EPR, j);
+
+                    sum[k][j] = GGML_F16_VEC_FMA(sum[k][j], ax[j], ay[j]);
+                }
             }
         }
-    }

-
-
-
-
+        // reduce sum0..sum3 to sum0
+        for (int k = 0; k < GGML_VEC_DOT_UNROLL; ++k) {
+            GGML_F16_VEC_REDUCE(sumf[k], sum[k]);
+        }

-
-
-
-
+        // leftovers
+        for (int i = np; i < n; ++i) {
+            for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) {
+                sumf[j] += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[j][i])*GGML_CPU_FP16_TO_FP32(y[i]));
+            }
         }
-
+    #endif
 #else
     for (int i = 0; i < n; ++i) {
         for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) {
-            sumf[j] += (ggml_float)(
+            sumf[j] += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[j][i])*GGML_CPU_FP16_TO_FP32(y[i]));
         }
     }
 #endif
@@ -148,27 +277,116 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GG

 inline static void ggml_vec_mad_f32(const int n, float * GGML_RESTRICT y, const float * GGML_RESTRICT x, const float v) {
 #if defined(GGML_SIMD)
-
+    #if defined(__ARM_FEATURE_SVE)
+
+        const int sve_register_length = ggml_cpu_get_sve_cnt() * 8;
+        const int ggml_f32_epr = sve_register_length / 32;//8;//svcntw(); // SVE128:4, SVE256:8, SVE512:16
+        const int ggml_f32_step = 8 * ggml_f32_epr; // choose 8 SVE registers
+        GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);
+
+        const int np = (n & ~(ggml_f32_step - 1));
+        svfloat32_t ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8;
+        svfloat32_t ay1, ay2, ay3, ay4, ay5, ay6, ay7, ay8;
+        for (int i = 0; i < np; i += ggml_f32_step) {
+
+            ax1 = GGML_F32_VEC_LOAD(x + i);
+            ay1 = GGML_F32_VEC_LOAD(y + i);
+            ay1 = GGML_F32_VEC_FMA(ay1, ax1, vx);
+
+            GGML_F32_VEC_STORE(y + i, ay1);
+
+            ax2 = GGML_F32_VEC_LOAD(x + i + 1*ggml_f32_epr);
+            ay2 = GGML_F32_VEC_LOAD(y + i + 1*ggml_f32_epr);
+            ay2 = GGML_F32_VEC_FMA(ay2, ax2, vx);

-
+            GGML_F32_VEC_STORE(y + i + 1*ggml_f32_epr, ay2);

-
-
+            ax3 = GGML_F32_VEC_LOAD(x + i + 2*ggml_f32_epr);
+            ay3 = GGML_F32_VEC_LOAD(y + i + 2*ggml_f32_epr);
+            ay3 = GGML_F32_VEC_FMA(ay3, ax3, vx);

-
-        for (int j = 0; j < GGML_F32_ARR; j++) {
-            ax[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
-            ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
-            ay[j] = GGML_F32_VEC_FMA(ay[j], ax[j], vx);
+            GGML_F32_VEC_STORE(y + i + 2*ggml_f32_epr, ay3);

-
+            ax4 = GGML_F32_VEC_LOAD(x + i + 3*ggml_f32_epr);
+            ay4 = GGML_F32_VEC_LOAD(y + i + 3*ggml_f32_epr);
+            ay4 = GGML_F32_VEC_FMA(ay4, ax4, vx);
+
+            GGML_F32_VEC_STORE(y + i + 3*ggml_f32_epr, ay4);
+
+            ax5 = GGML_F32_VEC_LOAD(x + i + 4*ggml_f32_epr);
+            ay5 = GGML_F32_VEC_LOAD(y + i + 4*ggml_f32_epr);
+            ay5 = GGML_F32_VEC_FMA(ay5, ax5, vx);
+
+            GGML_F32_VEC_STORE(y + i + 4*ggml_f32_epr, ay5);
+
+            ax6 = GGML_F32_VEC_LOAD(x + i + 5*ggml_f32_epr);
+            ay6 = GGML_F32_VEC_LOAD(y + i + 5*ggml_f32_epr);
+            ay6 = GGML_F32_VEC_FMA(ay6, ax6, vx);
+
+            GGML_F32_VEC_STORE(y + i + 5*ggml_f32_epr, ay6);
+
+            ax7 = GGML_F32_VEC_LOAD(x + i + 6*ggml_f32_epr);
+            ay7 = GGML_F32_VEC_LOAD(y + i + 6*ggml_f32_epr);
+            ay7 = GGML_F32_VEC_FMA(ay7, ax7, vx);
+
+            GGML_F32_VEC_STORE(y + i + 6*ggml_f32_epr, ay7);
+
+            ax8 = GGML_F32_VEC_LOAD(x + i + 7*ggml_f32_epr);
+            ay8 = GGML_F32_VEC_LOAD(y + i + 7*ggml_f32_epr);
+            ay8 = GGML_F32_VEC_FMA(ay8, ax8, vx);
+
+            GGML_F32_VEC_STORE(y + i + 7*ggml_f32_epr, ay8);
         }
-
+        // leftovers
+        // Since 8 unrolls are done in above loop, leftovers lie in range [0, ggml_f32_step] which is handled in below loop
+        const int np2 = (n & ~(ggml_f32_epr - 1));
+        for (int i = np; i < np2; i += ggml_f32_epr) {
+            ax1 = GGML_F32_VEC_LOAD(x + i);
+            ay1 = GGML_F32_VEC_LOAD(y + i);
+            ay1 = GGML_F32_VEC_FMA(ay1, ax1, vx);
+
+            GGML_F32_VEC_STORE(y + i, ay1);
+        }
+        // maximum number of leftover elements will be less that ggml_f32_epr. Apply predicated svmad on available elements only
+        if (np2 < n) {
+            svbool_t pg =svwhilelt_b32(np2, n);
+            ax1 = svld1_f32(pg, x + np2);
+            ay1 = svld1_f32(pg, y + np2);
+            ay1 = svmad_f32_m(pg, ax1, vx, ay1);
+
+            svst1_f32(pg, y + np2, ay1);
+        }
+    #elif defined(__riscv_v_intrinsic)
+        for (int i = 0, avl; i < n; i += avl) {
+            avl = __riscv_vsetvl_e32m8(n - i);
+            vfloat32m8_t ax = __riscv_vle32_v_f32m8(&x[i], avl);
+            vfloat32m8_t ay = __riscv_vle32_v_f32m8(&y[i], avl);
+            vfloat32m8_t ny = __riscv_vfmadd_vf_f32m8(ax, v, ay, avl);
+            __riscv_vse32_v_f32m8(&y[i], ny, avl);
+        }
+    #else
+        const int np = (n & ~(GGML_F32_STEP - 1));

-
-
-
-
+        GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);
+
+        GGML_F32_VEC ax[GGML_F32_ARR];
+        GGML_F32_VEC ay[GGML_F32_ARR];
+
+        for (int i = 0; i < np; i += GGML_F32_STEP) {
+            for (int j = 0; j < GGML_F32_ARR; j++) {
+                ax[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
+                ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
+                ay[j] = GGML_F32_VEC_FMA(ay[j], ax[j], vx);
+
+                GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
+            }
+        }
+
+        // leftovers
+        for (int i = np; i < n; ++i) {
+            y[i] += x[i]*v;
+        }
+    #endif
 #else
     // scalar
     for (int i = 0; i < n; ++i) {
@@ -179,31 +397,116 @@ inline static void ggml_vec_mad_f32(const int n, float * GGML_RESTRICT y, const

 inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * GGML_RESTRICT y, const ggml_fp16_t * GGML_RESTRICT x, const float v) {
 #if defined(GGML_SIMD)
-
+    #if defined(__ARM_FEATURE_SVE)
+        const int sve_register_length = svcntb() * 8;
+        const int ggml_f16_epr = sve_register_length / 16;
+        const int ggml_f16_step = 8 * ggml_f16_epr;
+
+        GGML_F16x_VEC vx = GGML_F16x_VEC_SET1(v);
+
+        const int np= (n & ~(ggml_f16_step - 1));
+
+        svfloat16_t ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8;
+        svfloat16_t ay1, ay2, ay3, ay4, ay5, ay6, ay7, ay8;
+        for (int i = 0; i < np; i += ggml_f16_step) {
+            ax1 = GGML_F16x_VEC_LOAD(x + i + 0 * ggml_f16_epr, 0);
+            ay1 = GGML_F16x_VEC_LOAD(y + i + 0 * ggml_f16_epr, 0);
+            ay1 = GGML_F16x_VEC_FMA(ay1, ax1, vx);
+
+            GGML_F16x_VEC_STORE(y + i + 0 * ggml_f16_epr, ay1, 0);
+
+            ax2 = GGML_F16x_VEC_LOAD(x + i + 1 * ggml_f16_epr, 1);
+            ay2 = GGML_F16x_VEC_LOAD(y + i + 1 * ggml_f16_epr, 1);
+            ay2 = GGML_F16x_VEC_FMA(ay2, ax2, vx);
+
+            GGML_F16x_VEC_STORE(y + i + 1 * ggml_f16_epr, ay2, 1);

-
+            ax3 = GGML_F16x_VEC_LOAD(x + i + 2 * ggml_f16_epr, 2);
+            ay3 = GGML_F16x_VEC_LOAD(y + i + 2 * ggml_f16_epr, 2);
+            ay3 = GGML_F16x_VEC_FMA(ay3, ax3, vx);

-
-    GGML_F16_VEC ay[GGML_F16_ARR];
+            GGML_F16x_VEC_STORE(y + i + 2 * ggml_f16_epr, ay3, 2);

-
-
-
-            ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
-            ay[j] = GGML_F16_VEC_FMA(ay[j], ax[j], vx);
+            ax4 = GGML_F16x_VEC_LOAD(x + i + 3 * ggml_f16_epr, 3);
+            ay4 = GGML_F16x_VEC_LOAD(y + i + 3 * ggml_f16_epr, 3);
+            ay4 = GGML_F16x_VEC_FMA(ay4, ax4, vx);

-
+            GGML_F16x_VEC_STORE(y + i + 3 * ggml_f16_epr, ay4, 3);
+
+            ax5 = GGML_F16x_VEC_LOAD(x + i + 4 * ggml_f16_epr, 4);
+            ay5 = GGML_F16x_VEC_LOAD(y + i + 4 * ggml_f16_epr, 4);
+            ay5 = GGML_F16x_VEC_FMA(ay5, ax5, vx);
+
+            GGML_F16x_VEC_STORE(y + i + 4 * ggml_f16_epr, ay5, 4);
+
+            ax6 = GGML_F16x_VEC_LOAD(x + i + 5 * ggml_f16_epr, 5);
+            ay6 = GGML_F16x_VEC_LOAD(y + i + 5 * ggml_f16_epr, 5);
+            ay6 = GGML_F16x_VEC_FMA(ay6, ax6, vx);
+
+            GGML_F16x_VEC_STORE(y + i + 5 * ggml_f16_epr, ay6, 5);
+
+            ax7 = GGML_F16x_VEC_LOAD(x + i + 6 * ggml_f16_epr, 6);
+            ay7 = GGML_F16x_VEC_LOAD(y + i + 6 * ggml_f16_epr, 6);
+            ay7 = GGML_F16x_VEC_FMA(ay7, ax7, vx);
+
+            GGML_F16x_VEC_STORE(y + i + 6 * ggml_f16_epr, ay7, 6);
+
+            ax8 = GGML_F16x_VEC_LOAD(x + i + 7 * ggml_f16_epr, 7);
+            ay8 = GGML_F16x_VEC_LOAD(y + i + 7 * ggml_f16_epr, 7);
+            ay8 = GGML_F16x_VEC_FMA(ay8, ax8, vx);
+
+            GGML_F16x_VEC_STORE(y + i + 7 * ggml_f16_epr, ay8, 7);
         }
-
+        const int np2 = (n & ~(ggml_f16_epr - 1));
+        for (int k = np; k < np2; k += ggml_f16_epr) {
+            svfloat16_t rx = GGML_F16x_VEC_LOAD(x + k, 0);
+            svfloat16_t ry = GGML_F16x_VEC_LOAD(y + k, 0);
+            ry = GGML_F16x_VEC_FMA(ry, rx, vx);

-
-
-
-
+            GGML_F16x_VEC_STORE(y + k, ry, 0);
+        }
+
+        if (np2 < n) {
+            svbool_t pg = svwhilelt_b16(np2, n);
+            svfloat16_t hx = svld1_f16(pg, (const __fp16 *)(x + np2));
+            svfloat16_t hy = svld1_f16(pg, (const __fp16 *)(y + np2));
+            hy = svmad_f16_x(pg, hx, vx, hy);
+            svst1_f16(pg, (__fp16 *)(y + np2), hy);
+        }
+
+    #elif defined(__riscv_v_intrinsic)
+        // todo: RVV impl
+        // scalar
+        for (int i = 0; i < n; ++i) {
+            y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i]) + GGML_CPU_FP16_TO_FP32(x[i])*v);
+        }
+    #else
+        const int np = (n & ~(GGML_F16_STEP - 1));
+
+        GGML_F16_VEC vx = GGML_F16_VEC_SET1(v);
+
+        GGML_F16_VEC ax[GGML_F16_ARR];
+        GGML_F16_VEC ay[GGML_F16_ARR];
+
+        for (int i = 0; i < np; i += GGML_F16_STEP) {
+            for (int j = 0; j < GGML_F16_ARR; j++) {
+                ax[j] = GGML_F16_VEC_LOAD(x + i + j*GGML_F16_EPR, j);
+                ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
+                ay[j] = GGML_F16_VEC_FMA(ay[j], ax[j], vx);
+
+                GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay, j);
+            }
+        }
+
+        // leftovers
+        for (int i = np; i < n; ++i) {
+            y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i]) + GGML_CPU_FP16_TO_FP32(x[i])*v);
+        }
+    #endif
 #else
     // scalar
     for (int i = 0; i < n; ++i) {
-        y[i] =
+        y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i]) + GGML_CPU_FP16_TO_FP32(x[i])*v);
     }
 #endif
 }
@@ -220,36 +523,55 @@ inline static void ggml_vec_mad_f32_unroll(const int n, const int xs, const int
 }

 #if defined(GGML_SIMD)
-
+    #if defined(__ARM_FEATURE_SVE)
+        // scalar Route to scalar implementation //TODO: Write SVE code
+        for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
+            for (int i = 0; i < n; ++i) {
+                y[i] += x[k][i]*v[k][0];
+            }
+        }
+    #elif defined(__riscv_v_intrinsic)
+        for (int i = 0, avl; i < n; i += avl) {
+            avl = __riscv_vsetvl_e32m8(n - i);
+            vfloat32m8_t ay = __riscv_vle32_v_f32m8(&y[i], avl);
+            for (int k = 0; k < GGML_VEC_MAD_UNROLL; k++) {
+                vfloat32m8_t ax = __riscv_vle32_v_f32m8(&x[k][i], avl);
+                ay = __riscv_vfmadd_vf_f32m8(ax, v[k][0], ay, avl);
+            }
+            __riscv_vse32_v_f32m8(&y[i], ay, avl);
+        }
+    #else
+        const int np = (n & ~(GGML_F32_STEP - 1));

-
+        GGML_F32_VEC vx[GGML_VEC_MAD_UNROLL];

-
-
-
+        for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
+            vx[k] = GGML_F32_VEC_SET1(v[k][0]);
+        }

-
-
+        GGML_F32_VEC ax[GGML_VEC_MAD_UNROLL][GGML_F32_ARR];
+        GGML_F32_VEC ay[GGML_F32_ARR];

-
-
-
+        for (int i = 0; i < np; i += GGML_F32_STEP) {
+            for (int j = 0; j < GGML_F32_ARR; j++) {
+                ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);

-
-
-
-
+                for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
+                    ax[k][j] = GGML_F32_VEC_LOAD(x[k] + i + j*GGML_F32_EPR);
+                    ay[j] = GGML_F32_VEC_FMA(ay[j], ax[k][j], vx[k]);
+                }

-
+                GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
+            }
         }
-    }

-
-
-
-
+        // leftovers
+        for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
+            for (int i = np; i < n; ++i) {
+                y[i] += x[k][i]*v[k][0];
+            }
         }
-
+    #endif
 #else
     // scalar
     for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
@@ -260,30 +582,112 @@ inline static void ggml_vec_mad_f32_unroll(const int n, const int xs, const int
 #endif
 }

+inline static void ggml_vec_mad1_f32(const int n, float * y, const float * x, const float s, const float b) {
+#if defined(GGML_USE_ACCELERATE)
+    vDSP_vsmsa(x, 1, &s, &b, y, 1, n);
+#elif defined(GGML_SIMD)
+    #if defined(__ARM_FEATURE_SVE)
+        // scalar ; TODO: Write SVE code
+        for (int i = 0; i < n; ++i) {
+            y[i] = x[i]*s + b;
+        }
+    #elif defined(__riscv_v_intrinsic)
+        for (int i = 0, avl; i < n; i += avl) {
+            avl = __riscv_vsetvl_e32m8(n - i);
+            vfloat32m8_t ax = __riscv_vle32_v_f32m8(&x[i], avl);
+            vfloat32m8_t vb = __riscv_vfmv_v_f_f32m8(b, avl);
+            vfloat32m8_t ny = __riscv_vfmadd_vf_f32m8(ax, s, vb, avl);
+            __riscv_vse32_v_f32m8(&y[i], ny, avl);
+        }
+    #else
+        const int np = (n & ~(GGML_F32_STEP - 1));
+
+        GGML_F32_VEC vs = GGML_F32_VEC_SET1(s);
+        GGML_F32_VEC vb = GGML_F32_VEC_SET1(b);
+
+        GGML_F32_VEC ay[GGML_F32_ARR];
+
+        for (int i = 0; i < np; i += GGML_F32_STEP) {
+            for (int j = 0; j < GGML_F32_ARR; j++) {
+                ay[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
+                ay[j] = GGML_F32_VEC_FMA(vb, ay[j], vs);
+
+                GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
+            }
+        }
+
+        // leftovers
+        for (int i = np; i < n; ++i) {
+            y[i] = x[i]*s + b;
+        }
+    #endif
+#else
+    // scalar
+    for (int i = 0; i < n; ++i) {
+        y[i] = x[i]*s + b;
+    }
+#endif
+}
+
 //inline static void ggml_vec_scale_f32(const int n, float * y, const float v) { for (int i = 0; i < n; ++i) y[i] *= v; }
 inline static void ggml_vec_scale_f32(const int n, float * y, const float v) {
 #if defined(GGML_USE_ACCELERATE)
     vDSP_vsmul(y, 1, &v, y, 1, n);
 #elif defined(GGML_SIMD)
-
+    #if defined(__ARM_FEATURE_SVE)
+        const int sve_register_length = ggml_cpu_get_sve_cnt() * 8;
+        const int ggml_f32_epr = sve_register_length / 32;//8;//svcntw(); // SVE128:4, SVE256:8, SVE512:16
+        const int ggml_f32_step = 2 * ggml_f32_epr;
+
+        GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);
+        const int np = (n & ~(ggml_f32_step - 1));
+        svfloat32_t ay1;
+        svfloat32_t ay2;
+        for (int i = 0; i < np; i += ggml_f32_step) {
+            ay1 = GGML_F32_VEC_LOAD(y + i);
+            ay1 = GGML_F32_VEC_MUL(ay1, vx);
+            GGML_F32_VEC_STORE(y + i, ay1);
+
+            ay2 = GGML_F32_VEC_LOAD(y + i + 1*ggml_f32_epr);
+            ay2 = GGML_F32_VEC_MUL(ay2, vx);
+            GGML_F32_VEC_STORE(y + i + 1*ggml_f32_epr, ay2);
+        }
+        // leftovers
+        // maximum number of leftover elements will be less that ggml_f32_epr. Apply predicated svmad on available elements only
+        if (np < n) {
+            svbool_t pg = svwhilelt_b32(np, n);
+            ay1 = svld1_f32(pg, y + np);
+            ay1 = svmul_f32_m(pg, ay1, vx);
+            svst1_f32(pg, y + np, ay1);
+        }
+    #elif defined(__riscv_v_intrinsic)
+        for (int i = 0, avl; i < n; i += avl) {
+            avl = __riscv_vsetvl_e32m8(n - i);
+            vfloat32m8_t ay = __riscv_vle32_v_f32m8(&y[i], avl);
+            vfloat32m8_t ny = __riscv_vfmul_vf_f32m8(ay, v, avl);
+            __riscv_vse32_v_f32m8(&y[i], ny, avl);
+        }
+    #else
+        const int np = (n & ~(GGML_F32_STEP - 1));

-
+        GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);

-
+        GGML_F32_VEC ay[GGML_F32_ARR];

-
-
-
-
+        for (int i = 0; i < np; i += GGML_F32_STEP) {
+            for (int j = 0; j < GGML_F32_ARR; j++) {
+                ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
+                ay[j] = GGML_F32_VEC_MUL(ay[j], vx);

-
+                GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
+            }
         }
-    }

-
-
-
-
+        // leftovers
+        for (int i = np; i < n; ++i) {
+            y[i] *= v;
+        }
+    #endif
 #else
     // scalar
     for (int i = 0; i < n; ++i) {
@@ -294,29 +698,63 @@ inline static void ggml_vec_scale_f32(const int n, float * y, const float v) {

 inline static void ggml_vec_scale_f16(const int n, ggml_fp16_t * y, const float v) {
 #if defined(GGML_SIMD)
-
+    #if defined(__ARM_FEATURE_SVE)
+        const int sve_register_length = svcntb() * 8;
+        const int ggml_f16_epr = sve_register_length / 16;
+        const int ggml_f16_step = 2 * ggml_f16_epr;
+
+        GGML_F16x_VEC vx = GGML_F16x_VEC_SET1(v);
+        const int np = (n & ~(ggml_f16_step - 1));
+        svfloat16_t ay1, ay2;
+
+        for (int i = 0; i < np; i += ggml_f16_step) {
+            ay1 = GGML_F16x_VEC_LOAD(y + i + 0*ggml_f16_epr, 0);
+            ay1 = GGML_F16x_VEC_MUL(ay1, vx);
+            GGML_F16x_VEC_STORE(y + i + 0*ggml_f16_epr, ay1, 0);
+
+            ay2 = GGML_F16x_VEC_LOAD(y + i + 1*ggml_f16_epr, 1);
+            ay2 = GGML_F16x_VEC_MUL(ay2, vx);
+            GGML_F16x_VEC_STORE(y + i + 1*ggml_f16_epr, ay2, 1);
+        }
+        // leftovers
+        // maximum number of leftover elements will be less that ggmlF_16x_epr. Apply predicated svmad on available elements only
+        if (np < n) {
+            svbool_t pg = svwhilelt_b16(np, n);
+            svfloat16_t hy = svld1_f16(pg, (__fp16 *)(y + np));
+            svfloat16_t out = svmul_f16_m(pg, hy, vx);
+            svst1_f16(pg, (__fp16 *)(y + np), out);
+        }
+    #elif defined(__riscv_v_intrinsic)
+        // todo: RVV impl
+        // scalar
+        for (int i = 0; i < n; ++i) {
+            y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i])*v);
+        }
+    #else
+        const int np = (n & ~(GGML_F16_STEP - 1));

-
+        GGML_F16_VEC vx = GGML_F16_VEC_SET1(v);

-
+        GGML_F16_VEC ay[GGML_F16_ARR];

-
-
-
-
+        for (int i = 0; i < np; i += GGML_F16_STEP) {
+            for (int j = 0; j < GGML_F16_ARR; j++) {
+                ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
+                ay[j] = GGML_F16_VEC_MUL(ay[j], vx);

-
+                GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay, j);
+            }
         }
-    }

-
-
-
-
+        // leftovers
+        for (int i = np; i < n; ++i) {
+            y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i])*v);
+        }
+    #endif
 #else
     // scalar
     for (int i = 0; i < n; ++i) {
-        y[i] =
+        y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i])*v);
     }
 #endif
 }
@@ -325,103 +763,103 @@ inline static void ggml_vec_norm_f32 (const int n, float * s, const float * x) {
 inline static void ggml_vec_sqr_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]*x[i]; }
 inline static void ggml_vec_sqr_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
-        float v =
-        y[i] =
+        float v = GGML_CPU_FP16_TO_FP32(x[i]);
+        y[i] = GGML_CPU_FP32_TO_FP16(v*v);
     }
 }
 inline static void ggml_vec_sqrt_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sqrtf(x[i]); }
 inline static void ggml_vec_sqrt_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
-        y[i] =
+        y[i] = GGML_CPU_FP32_TO_FP16(sqrtf(GGML_CPU_FP16_TO_FP32(x[i])));
     }
 }
 inline static void ggml_vec_log_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = logf(x[i]); }
 inline static void ggml_vec_log_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
-        y[i] =
+        y[i] = GGML_CPU_FP32_TO_FP16(logf(GGML_CPU_FP16_TO_FP32(x[i])));
     }
 }
 inline static void ggml_vec_sin_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sinf(x[i]); }
 inline static void ggml_vec_sin_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
-        y[i] =
+        y[i] = GGML_CPU_FP32_TO_FP16(sinf(GGML_CPU_FP16_TO_FP32(x[i])));
     }
 }
 inline static void ggml_vec_cos_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = cosf(x[i]); }
 inline static void ggml_vec_cos_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
-        y[i] =
+        y[i] = GGML_CPU_FP32_TO_FP16(cosf(GGML_CPU_FP16_TO_FP32(x[i])));
     }
 }
 inline static void ggml_vec_abs_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fabsf(x[i]); }
 inline static void ggml_vec_abs_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
-        y[i] =
+        y[i] = GGML_CPU_FP32_TO_FP16(fabsf(GGML_CPU_FP16_TO_FP32(x[i])));
     }
 }
 inline static void ggml_vec_sgn_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : ((x[i] < 0.f) ? -1.f : 0.f); }
 inline static void ggml_vec_sgn_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
-        float v =
-        y[i] =
+        float v = GGML_CPU_FP16_TO_FP32(x[i]);
+        y[i] = GGML_CPU_FP32_TO_FP16((v > 0.f) ? 1.f : ((v < 0.f) ? -1.f : 0.f));
     }
 }
 inline static void ggml_vec_step_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : 0.f; }
 inline static void ggml_vec_step_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
-        y[i] =
+        y[i] = GGML_CPU_FP32_TO_FP16((GGML_CPU_FP16_TO_FP32(x[i]) > 0.f) ? 1.f : 0.f);
     }
 }
 inline static void ggml_vec_tanh_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = tanhf(x[i]); }
 inline static void ggml_vec_tanh_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
-        y[i] =
+        y[i] = GGML_CPU_FP32_TO_FP16(tanhf(GGML_CPU_FP16_TO_FP32(x[i])));
     }
 }
 inline static void ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expm1f(x[i]); }
 inline static void ggml_vec_elu_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
-        y[i] =
+        y[i] = GGML_CPU_FP32_TO_FP16(expm1f(GGML_CPU_FP16_TO_FP32(x[i])));
     }
 }
 inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }
 inline static void ggml_vec_relu_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
-        float v =
-        y[i] =
+        float v = GGML_CPU_FP16_TO_FP32(x[i]);
+        y[i] = GGML_CPU_FP32_TO_FP16((v > 0.f) ? v : 0.f);
     }
 }
 inline static void ggml_vec_leaky_relu_f32 (const int n, float * y, const float * x, const float ns) { for (int i = 0; i < n; ++i) y[i] = ((x[i] > 0.f) ? x[i] : 0.f) + ns * ((x[i] < 0.0f) ? x[i] : 0.f); }
 inline static void ggml_vec_leaky_relu_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const float ns) {
     for (int i = 0; i < n; ++i) {
-        float v =
-        y[i] =
+        float v = GGML_CPU_FP16_TO_FP32(x[i]);
+        y[i] = GGML_CPU_FP32_TO_FP16(((v > 0.f) ? v : 0.f) + ns * ((v < 0.0f) ? v : 0.f));
     }
 }
 inline static void ggml_vec_sigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = 1.f / (1.f + expf(-x[i])); }
 inline static void ggml_vec_sigmoid_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
-        y[i] =
+        y[i] = GGML_CPU_FP32_TO_FP16(1.f / (1.f + expf(-GGML_CPU_FP16_TO_FP32(x[i]))));
     }
 }
 // TODO: optimize performance
 inline static void ggml_vec_hardswish_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i] * fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); }
 inline static void ggml_vec_hardswish_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
-        float v =
-        y[i] =
+        float v = GGML_CPU_FP16_TO_FP32(x[i]);
+        y[i] = GGML_CPU_FP32_TO_FP16(v * fminf(1.0f, fmaxf(0.0f, (v + 3.0f) / 6.0f)));
     }
 }
 inline static void ggml_vec_hardsigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); }
 inline static void ggml_vec_hardsigmoid_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
-        y[i] =
+        y[i] = GGML_CPU_FP32_TO_FP16(fminf(1.0f, fmaxf(0.0f, (GGML_CPU_FP16_TO_FP32(x[i]) + 3.0f) / 6.0f)));
     }
 }
 inline static void ggml_vec_exp_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = expf(x[i]); }
 inline static void ggml_vec_exp_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
-        y[i] =
+        y[i] = GGML_CPU_FP32_TO_FP16(expf(GGML_CPU_FP16_TO_FP32(x[i])));
     }
 }

@@ -443,9 +881,9 @@ inline static void ggml_vec_gelu_f16(const int n, ggml_fp16_t * y, const ggml_fp

 inline static void ggml_vec_gelu_erf_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
-        float xi =
+        float xi = GGML_CPU_FP16_TO_FP32(x[i]);
         float res = 0.5f*xi*(1.0f + erff(xi*SQRT_2_INV));
-        y[i] =
+        y[i] = GGML_CPU_FP32_TO_FP16(res);
     }
 }

@@ -458,9 +896,9 @@ inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) {
         } else if (x[i] >= 10.0f) {
             y[i] = x[i];
         } else {
-            ggml_fp16_t fp16 =
+            ggml_fp16_t fp16 = GGML_CPU_FP32_TO_FP16(x[i]);
             memcpy(&t, &fp16, sizeof(uint16_t));
-            y[i] =
+            y[i] = GGML_CPU_FP16_TO_FP32(ggml_table_gelu_f16[t]);
         }
     }
 }
@@ -494,9 +932,9 @@ inline static float ggml_gelu_quick_f32(float x) {
 inline static void ggml_vec_gelu_quick_f32(const int n, float * y, const float * x) {
     uint16_t t;
     for (int i = 0; i < n; ++i) {
-        ggml_fp16_t fp16 =
+        ggml_fp16_t fp16 = GGML_CPU_FP32_TO_FP16(x[i]);
         memcpy(&t, &fp16, sizeof(uint16_t));
-        y[i] =
+        y[i] = GGML_CPU_FP16_TO_FP32(ggml_table_gelu_quick_f16[t]);
     }
 }
 #else
@@ -509,8 +947,8 @@ inline static void ggml_vec_gelu_quick_f32(const int n, float * y, const float *

 inline static void ggml_vec_gelu_quick_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
-        float v =
-        y[i] =
+        float v = GGML_CPU_FP16_TO_FP32(x[i]);
+        y[i] = GGML_CPU_FP32_TO_FP16(v*(1.0f/(1.0f+expf(GELU_QUICK_COEF*v))));
     }
 }

@@ -519,8 +957,8 @@ inline static float ggml_silu_f32(float x) {
     return x/(1.0f + expf(-x));
 }
 inline static ggml_fp16_t ggml_silu_f16(ggml_fp16_t x) {
-    float v =
-    return
+    float v = GGML_CPU_FP16_TO_FP32(x);
+    return GGML_CPU_FP32_TO_FP16(v/(1.0f + expf(-v)));
 }

 #if __FINITE_MATH_ONLY__
@@ -528,7 +966,75 @@ inline static ggml_fp16_t ggml_silu_f16(ggml_fp16_t x) {
 #error "ref: https://github.com/ggml-org/llama.cpp/pull/7154#issuecomment-2143844461"
 #endif

-
+/* Below function was borrowed from the GitHub repository:
+   https://github.com/openvinotoolkit/openvino/blob/master/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/common.hpp */
+#if defined(__ARM_FEATURE_SVE) && defined(__aarch64__)
+inline static svfloat32_t exp_ps_sve(svbool_t pg, svfloat32_t src) {
+    // Constants
+    const svfloat32_t log2_e = svdup_n_f32(1.4426950409f);
+    const svfloat32_t ln2 = svdup_n_f32(0.6931473921f);
+    const svfloat32_t half_ln2_sq = svdup_n_f32(0.2413862043f);
+    const svuint32_t not_mask17 = svdup_n_u32(~((1u << 17) - 1));
+    const svfloat32_t one = svdup_n_f32(1.0f);
+    const svfloat32_t inactive1 = svdup_n_f32(0.0f);
+    const svint32_t inactive2 = svdup_n_s32(0);
+
+    // Algorithm starts here
+    svfloat32_t t0 = svmul_f32_m(pg, src, log2_e); // y = x * log2(e)
+    svfloat32_t t1 = svrintm_f32_m(inactive1, pg, t0); // rount to int (float)
+    svint32_t t2 = svcvt_s32_f32_m(inactive2, pg, t1); // n
+
+    t1 = svsub_f32_m(pg, t0, t1); // a = y - floor(y)
+    t1 = svadd_f32_m(pg, t1, one); // b = a + 1
+
+    svuint32_t t3 = svlsr_n_u32_m(pg, svreinterpret_u32_f32(t1), 17); // v = b >> 17 (u32)
+    svfloat32_t t4 = svexpa_f32(t3); // c = fexpa(v)
+    t4 = svscale_f32_m(pg, t4, t2); // fexpa(v) * 2^(n)
+
+    // and_(t2.d, t1.d, not_mask17.d)
+    svfloat32_t t5 = svreinterpret_f32_u32(svand_u32_m(pg, svreinterpret_u32_f32(t1), not_mask17));
+    t5 = svsub_f32_m(pg, t1, t5); // z
+    t0 = svmla_f32_m(pg, ln2, t5, half_ln2_sq); // ln2 + half_ln2_sq * z
+    t0 = svmla_f32_m(pg, one, t5, t0); // 1 + (ln2 * z) + (half_ln2_sq * z * z)
+    t0 = svmul_f32_m(pg, t0, t4); // Final result
+
+    return t0;
+}
+#endif
+
+#if defined(__ARM_FEATURE_SVE) && defined(__aarch64__)
+
+inline static svfloat32_t ggml_v_expf(svbool_t pg, svfloat32_t x) {
+    const svfloat32_t r = svdup_n_f32_x(pg, 0x1.8p23f);
+    const svfloat32_t z = svmla_n_f32_x(pg, r, x, 0x1.715476p+0f);
+    const svfloat32_t n = svsub_f32_x(pg, z, r);
+    const svfloat32_t b = svmls_n_f32_x(pg, svmls_n_f32_x(pg, x, n, 0x1.62e4p-1f), n, 0x1.7f7d1cp-20f);
+    const svuint32_t e = svlsl_n_u32_x(pg, svreinterpret_u32_f32(z), 23);
+    const svfloat32_t k = svreinterpret_f32_u32(svadd_u32_x(pg, e, svreinterpret_u32_f32(svdup_n_f32_x(pg, 1))));
+    const svbool_t c = svacgt_n_f32(pg, n, 126);
+    const svfloat32_t u = svmul_f32_x(pg, b, b);
+    const svfloat32_t j = svmla_f32_x(pg,
+        svmul_n_f32_x(pg, b, 0x1.ffffecp-1f),
+        svmla_f32_x(pg, svmla_f32_x(pg, svdup_n_f32_x(pg, 0x1.fffdb6p-2f), svdup_n_f32_x(pg, 0x1.555e66p-3f), b),
+            svmla_f32_x(pg, svdup_n_f32_x(pg, 0x1.573e2ep-5f), svdup_n_f32_x(pg, 0x1.0e4020p-7f), b), u), u);
+    const svuint32_t d = svdup_n_u32_z(svcmple_n_f32(pg, n, 0.0), 0x82000000);
+    const svfloat32_t s1 = svreinterpret_f32_u32(svadd_n_u32_x(pg, d, 0x7f000000));
+    const svfloat32_t s2 = svreinterpret_f32_u32(svsub_u32_x(pg, e, d));
+    return svsel_f32(svacgt_f32(pg, n, svdup_n_f32_x(pg, 192)), svmul_f32_x(pg, s1, s1),
+        svsel_f32(c, svmul_f32_x(pg, svmla_f32_x(pg, s2, s2, j), s1), svmla_f32_x(pg, k, k, j)));
+}
+
+// computes silu x/(1+exp(-x)) in single precision vector
+inline static svfloat32_t ggml_v_silu(svbool_t pg, svfloat32_t x) {
+    const svfloat32_t one = svdup_n_f32_x(pg, 1.0f);
+    const svfloat32_t zero = svdup_n_f32_x(pg, 0.0f);
+    const svfloat32_t neg_x = svsub_f32_x(pg, zero, x);
+    const svfloat32_t exp_neg_x = ggml_v_expf(pg, neg_x);
+    const svfloat32_t one_plus_exp_neg_x = svadd_f32_x(pg, one, exp_neg_x);
+    return svdiv_f32_x(pg, x, one_plus_exp_neg_x);
+}
+
+#elif defined(__ARM_NEON) && defined(__aarch64__)

 // adapted from arm limited optimized routine
 // the maximum error is 1.45358 plus 0.5 ulps
@@ -719,7 +1225,59 @@ inline static __m128 ggml_v_silu(__m128 x) {
     return _mm_div_ps(x, one_plus_exp_neg_x);
 }

-#
+#elif defined(__riscv_v_intrinsic)
+
+// adapted from arm limited optimized routine
+// the maximum error is 1.45358 plus 0.5 ulps
+// numbers above 88.38 will flush to infinity
+// numbers beneath -103.97 will flush to zero
+inline static vfloat32m2_t ggml_v_expf_m2(vfloat32m2_t x, int vl) {
+    const vfloat32m2_t r = __riscv_vfmv_v_f_f32m2(0x1.8p23f, vl);
+#ifdef __riscv_xtheadvector
+    // workaround for compiler bug (gcc 14.3.0: Error: unrecognized opcode `th.vmv1r.v v2,v4')
+    vfloat32m2_t z = __riscv_vfadd_vf_f32m2(r, 0.0f, vl);
+    z = __riscv_vfmacc_vf_f32m2(z, 0x1.715476p+0f, x, vl);
+#else
+    const vfloat32m2_t z = __riscv_vfmacc_vf_f32m2(r, 0x1.715476p+0f, x, vl);
+#endif
+    const vfloat32m2_t n = __riscv_vfsub_vv_f32m2(z, r, vl);
+    const vfloat32m2_t b = __riscv_vfnmsac_vf_f32m2(__riscv_vfnmsac_vf_f32m2(x, 0x1.62e4p-1f, n, vl),
+                                                    0x1.7f7d1cp-20f, n, vl);
+    const vuint32m2_t e = __riscv_vsll_vx_u32m2(__riscv_vreinterpret_v_f32m2_u32m2(z), 23, vl);
+    const vfloat32m2_t k = __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vadd_vx_u32m2(e, 0x3f800000, vl)); // 1.0f
+    const vbool16_t c = __riscv_vmfgt_vf_f32m2_b16(__riscv_vfabs_v_f32m2(n, vl), 126.0f, vl);
+    const vfloat32m2_t u = __riscv_vfmul_vv_f32m2(b, b, vl);
+    const vfloat32m2_t j = __riscv_vfmacc_vv_f32m2(
+        __riscv_vfmul_vf_f32m2(b, 0x1.ffffecp-1f, vl),
+        __riscv_vfmacc_vv_f32m2(
+            __riscv_vfmacc_vf_f32m2(__riscv_vfmv_v_f_f32m2(0x1.fffdb6p-2f, vl), 0x1.555e66p-3f, b, vl),
+            __riscv_vfmacc_vf_f32m2(__riscv_vfmv_v_f_f32m2(0x1.573e2ep-5f, vl), 0x1.0e4020p-7f, b, vl),
+            u, vl), u, vl);
+    if (!__riscv_vcpop_m_b16(c, vl))
+        return __riscv_vfmacc_vv_f32m2(k, j, k, vl);
+    const vbool16_t dm = __riscv_vmfle_vf_f32m2_b16(n, 0.0f, vl);
+    const vuint32m2_t d = __riscv_vmerge_vxm_u32m2(__riscv_vmv_v_x_u32m2(0, vl), 0x82000000, dm, vl);
+    const vfloat32m2_t s1 = __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vadd_vx_u32m2(d, 0x7f000000, vl));
+    const vfloat32m2_t s2 = __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vsub_vv_u32m2(e, d, vl));
+    const vfloat32m2_t r1 = __riscv_vmerge_vvm_f32m2(
+        __riscv_vfmacc_vv_f32m2(k, k, j, vl),
+        __riscv_vfmul_vv_f32m2(__riscv_vfmacc_vv_f32m2(s2, s2, j, vl), s1, vl),
+        c, vl);
+    return __riscv_vmerge_vvm_f32m2(
+        r1, __riscv_vfmul_vv_f32m2(s1, s1, vl),
+        __riscv_vmfgt_vf_f32m2_b16(__riscv_vfabs_v_f32m2(n, vl), 192.0f, vl),
+        vl);
+}
+
+// computes silu x/(1+exp(-x)) in single precision vector
+inline static vfloat32m2_t ggml_v_silu_m2(vfloat32m2_t x, int vl) {
+    const vfloat32m2_t neg_x = __riscv_vfneg_v_f32m2(x, vl);
+    const vfloat32m2_t exp_neg_x = ggml_v_expf_m2(neg_x, vl);
+    const vfloat32m2_t one_plus_exp_neg_x = __riscv_vfadd_vf_f32m2(exp_neg_x, 1.0f, vl);
+    return __riscv_vfdiv_vv_f32m2(x, one_plus_exp_neg_x, vl);
+}
+
+#endif // __ARM_NEON / __AVX2__ / __SSE2__ / __riscv_v_intrinsic

 inline static void ggml_vec_silu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
@@ -733,9 +1291,9 @@ inline static float ggml_silu_backward_f32(float x, float dy) {
 }

 inline static ggml_fp16_t ggml_silu_backward_f16(ggml_fp16_t x, ggml_fp16_t dy) {
-    const float v =
+    const float v = GGML_CPU_FP16_TO_FP32(x);
     const float s = 1.0f/(1.0f + expf(-v));
-    return
+    return GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(dy)*s*(1.0f + v*(1.0f - s)));
 }

 inline static void ggml_vec_silu_backward_f32(const int n, float * dx, const float * x, const float * dy) {
@@ -750,6 +1308,100 @@ inline static void ggml_vec_silu_backward_f16(const int n, ggml_fp16_t * dx, con
     }
 }

+inline static void ggml_vec_reglu_f32 (const int n, float * y, const float * x, const float * g) {
+    for (int i = 0; i < n; ++i) {
+        y[i] = (x[i] > 0.f) ? x[i] * g[i] : 0.f;
+    }
+}
+
+inline static void ggml_vec_reglu_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) {
+    for (int i = 0; i < n; ++i) {
+        float v = GGML_CPU_FP16_TO_FP32(x[i]);
+        y[i] = GGML_CPU_FP32_TO_FP16((v > 0.f) ? v * GGML_CPU_FP16_TO_FP32(g[i]) : 0.f);
+    }
+}
+
+#ifdef GGML_GELU_FP16
+inline static void ggml_vec_geglu_f32(const int n, float * y, const float * x, const float * g) {
+    uint16_t t;
+    for (int i = 0; i < n; ++i) {
+        if (x[i] <= -10.0f) {
+            y[i] = 0.0f;
+        } else if (x[i] >= 10.0f) {
+            y[i] = x[i] * g[i];
+        } else {
+            ggml_fp16_t fp16 = GGML_CPU_FP32_TO_FP16(x[i]);
+            memcpy(&t, &fp16, sizeof(uint16_t));
+            y[i] = GGML_CPU_FP16_TO_FP32(ggml_table_gelu_f16[t]) * g[i];
+        }
+    }
+}
+#else
+inline static void ggml_vec_geglu_f32(const int n, float * y, const float * x, const float * g) {
+    for (int i = 0; i < n; ++i) {
+        y[i] = ggml_gelu_f32(x[i]) * g[i];
+    }
+}
+#endif
+
+inline static void ggml_vec_geglu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) {
+    const uint16_t * i16 = (const uint16_t *) x;
+    for (int i = 0; i < n; ++i) {
+        float v = GGML_CPU_FP16_TO_FP32(g[i]);
+        y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(ggml_table_gelu_f16[i16[i]]) * v);
+    }
+}
+
+void ggml_vec_swiglu_f32(const int n, float * y, const float * x, const float * g);
+
+inline static void ggml_vec_swiglu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) {
+    for (int i = 0; i < n; ++i) {
+        float xi = GGML_CPU_FP16_TO_FP32(x[i]);
+        float gi = GGML_CPU_FP16_TO_FP32(g[i]);
+        y[i] = GGML_CPU_FP32_TO_FP16((xi/(1.0f + expf(-xi))) * gi);
+    }
+}
+
+inline static void ggml_vec_geglu_erf_f32(const int n, float * y, const float * x, const float * g) {
+    for (int i = 0; i < n; ++i) {
+        float xi = x[i];
+        y[i] = 0.5f * xi * (1.0f + erff(xi*SQRT_2_INV)) * g[i];
+    }
+}
+
+inline static void ggml_vec_geglu_erf_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) {
+    for (int i = 0; i < n; ++i) {
+        float xi = GGML_CPU_FP16_TO_FP32(x[i]);
+        float gi = GGML_CPU_FP16_TO_FP32(g[i]);
+        y[i] = GGML_CPU_FP32_TO_FP16(0.5f * xi * (1.0f + erff(xi*SQRT_2_INV)) * gi);
+    }
+}
+
+#ifdef GGML_GELU_QUICK_FP16
+inline static void ggml_vec_geglu_quick_f32(const int n, float * y, const float * x, const float * g) {
+    uint16_t t;
+    for (int i = 0; i < n; ++i) {
+        ggml_fp16_t fp16 = GGML_CPU_FP32_TO_FP16(x[i]);
+        memcpy(&t, &fp16, sizeof(uint16_t));
+        y[i] = GGML_CPU_FP16_TO_FP32(ggml_table_gelu_quick_f16[t]) * g[i];
+    }
+}
+#else
+inline static void ggml_vec_geglu_quick_f32(const int n, float * y, const float * x, const float * g) {
+    for (int i = 0; i < n; ++i) {
+        y[i] = ggml_gelu_quick_f32(x[i]) * g[i];
+    }
+}
+#endif
+
+inline static void ggml_vec_geglu_quick_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) {
+    const uint16_t * i16 = (const uint16_t *) x;
+    for (int i = 0; i < n; ++i) {
+        float v = GGML_CPU_FP16_TO_FP32(g[i]);
+        y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(ggml_table_gelu_quick_f16[i16[i]]) * v);
+    }
+}
+
 inline static void ggml_vec_sum_f32(const int n, float * s, const float * x) {
 #ifndef GGML_USE_ACCELERATE
     ggml_float sum = 0.0;
@@ -773,7 +1425,7 @@ inline static void ggml_vec_sum_f32_ggf(const int n, ggml_float * s, const float
 inline static void ggml_vec_sum_f16_ggf(const int n, float * s, const ggml_fp16_t * x) {
     float sum = 0.0f;
     for (int i = 0; i < n; ++i) {
-        sum +=
+        sum += GGML_CPU_FP16_TO_FP32(x[i]);
     }
     *s = sum;
 }