whispercpp 1.3.3 → 1.3.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (963)
  1. checksums.yaml +4 -4
  2. data/README.md +60 -43
  3. data/ext/extconf.rb +2 -2
  4. data/ext/ruby_whisper.c +14 -2
  5. data/ext/ruby_whisper.h +39 -0
  6. data/ext/ruby_whisper_context.c +22 -22
  7. data/ext/ruby_whisper_model.c +12 -12
  8. data/ext/ruby_whisper_params.c +79 -25
  9. data/ext/ruby_whisper_segment.c +84 -19
  10. data/ext/ruby_whisper_token.c +351 -0
  11. data/ext/ruby_whisper_transcribe.cpp +1 -1
  12. data/ext/ruby_whisper_vad_context.c +75 -0
  13. data/ext/ruby_whisper_vad_context_detect.cpp +50 -0
  14. data/ext/ruby_whisper_vad_segment.c +139 -0
  15. data/ext/ruby_whisper_vad_segments.c +106 -0
  16. data/ext/sources/CMakeLists.txt +4 -1
  17. data/ext/sources/bindings/javascript/package.json +1 -1
  18. data/ext/sources/cmake/arm64-apple-clang.cmake +16 -0
  19. data/ext/sources/cmake/arm64-windows-llvm.cmake +16 -0
  20. data/ext/sources/cmake/riscv64-spacemit-linux-gnu-gcc.cmake +29 -0
  21. data/ext/sources/cmake/x64-windows-llvm.cmake +5 -0
  22. data/ext/sources/examples/CMakeLists.txt +1 -0
  23. data/ext/sources/examples/addon.node/addon.cpp +19 -19
  24. data/ext/sources/examples/addon.node/index.js +7 -5
  25. data/ext/sources/examples/addon.node/vad-example.js +2 -2
  26. data/ext/sources/examples/bench/bench.cpp +26 -16
  27. data/ext/sources/examples/bench.wasm/index-tmpl.html +10 -9
  28. data/ext/sources/examples/cli/cli.cpp +122 -111
  29. data/ext/sources/examples/command/command.cpp +26 -24
  30. data/ext/sources/examples/command.wasm/index-tmpl.html +5 -4
  31. data/ext/sources/examples/common-ggml.cpp +2 -0
  32. data/ext/sources/examples/lsp/CMakeLists.txt +2 -1
  33. data/ext/sources/examples/lsp/lsp.cpp +19 -17
  34. data/ext/sources/examples/quantize/CMakeLists.txt +2 -1
  35. data/ext/sources/examples/server/server.cpp +34 -24
  36. data/ext/sources/examples/server.py +6 -1
  37. data/ext/sources/examples/stream/stream.cpp +4 -2
  38. data/ext/sources/examples/stream.wasm/emscripten.cpp +6 -6
  39. data/ext/sources/examples/stream.wasm/index-tmpl.html +82 -5
  40. data/ext/sources/examples/talk-llama/CMakeLists.txt +7 -3
  41. data/ext/sources/examples/talk-llama/llama-adapter.cpp +113 -7
  42. data/ext/sources/examples/talk-llama/llama-adapter.h +13 -1
  43. data/ext/sources/examples/talk-llama/llama-arch.cpp +2136 -1491
  44. data/ext/sources/examples/talk-llama/llama-arch.h +125 -3
  45. data/ext/sources/examples/talk-llama/llama-batch.cpp +174 -100
  46. data/ext/sources/examples/talk-llama/llama-batch.h +46 -20
  47. data/ext/sources/examples/talk-llama/llama-chat.cpp +199 -8
  48. data/ext/sources/examples/talk-llama/llama-chat.h +11 -0
  49. data/ext/sources/examples/talk-llama/llama-context.cpp +1213 -413
  50. data/ext/sources/examples/talk-llama/llama-context.h +99 -36
  51. data/ext/sources/examples/talk-llama/llama-cparams.h +5 -4
  52. data/ext/sources/examples/talk-llama/llama-grammar.cpp +288 -53
  53. data/ext/sources/examples/talk-llama/llama-grammar.h +22 -1
  54. data/ext/sources/examples/talk-llama/llama-graph.cpp +883 -294
  55. data/ext/sources/examples/talk-llama/llama-graph.h +361 -161
  56. data/ext/sources/examples/talk-llama/llama-hparams.cpp +144 -6
  57. data/ext/sources/examples/talk-llama/llama-hparams.h +100 -23
  58. data/ext/sources/examples/talk-llama/llama-impl.cpp +7 -3
  59. data/ext/sources/examples/talk-llama/llama-impl.h +3 -1
  60. data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.cpp +328 -0
  61. data/ext/sources/examples/talk-llama/{llama-kv-cache-unified-iswa.h → llama-kv-cache-iswa.h} +38 -29
  62. data/ext/sources/examples/talk-llama/llama-kv-cache.cpp +2100 -0
  63. data/ext/sources/examples/talk-llama/llama-kv-cache.h +373 -27
  64. data/ext/sources/examples/talk-llama/llama-kv-cells.h +124 -30
  65. data/ext/sources/examples/talk-llama/llama-memory-hybrid.cpp +63 -41
  66. data/ext/sources/examples/talk-llama/llama-memory-hybrid.h +30 -29
  67. data/ext/sources/examples/talk-llama/llama-memory-recurrent.cpp +77 -35
  68. data/ext/sources/examples/talk-llama/llama-memory-recurrent.h +15 -16
  69. data/ext/sources/examples/talk-llama/llama-memory.h +16 -10
  70. data/ext/sources/examples/talk-llama/llama-mmap.cpp +172 -37
  71. data/ext/sources/examples/talk-llama/llama-mmap.h +8 -3
  72. data/ext/sources/examples/talk-llama/llama-model-loader.cpp +93 -9
  73. data/ext/sources/examples/talk-llama/llama-model-loader.h +9 -2
  74. data/ext/sources/examples/talk-llama/llama-model-saver.cpp +3 -0
  75. data/ext/sources/examples/talk-llama/llama-model.cpp +3369 -10145
  76. data/ext/sources/examples/talk-llama/llama-model.h +104 -12
  77. data/ext/sources/examples/talk-llama/llama-quant.cpp +53 -30
  78. data/ext/sources/examples/talk-llama/llama-sampling.cpp +1520 -324
  79. data/ext/sources/examples/talk-llama/llama-sampling.h +19 -7
  80. data/ext/sources/examples/talk-llama/llama-vocab.cpp +562 -39
  81. data/ext/sources/examples/talk-llama/llama-vocab.h +50 -0
  82. data/ext/sources/examples/talk-llama/llama.cpp +794 -12
  83. data/ext/sources/examples/talk-llama/llama.h +246 -190
  84. data/ext/sources/examples/talk-llama/models/afmoe.cpp +191 -0
  85. data/ext/sources/examples/talk-llama/models/apertus.cpp +125 -0
  86. data/ext/sources/examples/talk-llama/models/arcee.cpp +135 -0
  87. data/ext/sources/examples/talk-llama/models/arctic.cpp +138 -0
  88. data/ext/sources/examples/talk-llama/models/arwkv7.cpp +86 -0
  89. data/ext/sources/examples/talk-llama/models/baichuan.cpp +122 -0
  90. data/ext/sources/examples/talk-llama/models/bailingmoe.cpp +144 -0
  91. data/ext/sources/examples/talk-llama/models/bailingmoe2.cpp +135 -0
  92. data/ext/sources/examples/talk-llama/models/bert.cpp +178 -0
  93. data/ext/sources/examples/talk-llama/models/bitnet.cpp +160 -0
  94. data/ext/sources/examples/talk-llama/models/bloom.cpp +101 -0
  95. data/ext/sources/examples/talk-llama/models/chameleon.cpp +178 -0
  96. data/ext/sources/examples/talk-llama/models/chatglm.cpp +132 -0
  97. data/ext/sources/examples/talk-llama/models/codeshell.cpp +111 -0
  98. data/ext/sources/examples/talk-llama/models/cogvlm.cpp +102 -0
  99. data/ext/sources/examples/talk-llama/models/cohere2-iswa.cpp +134 -0
  100. data/ext/sources/examples/talk-llama/models/command-r.cpp +122 -0
  101. data/ext/sources/examples/talk-llama/models/dbrx.cpp +123 -0
  102. data/ext/sources/examples/talk-llama/models/deci.cpp +135 -0
  103. data/ext/sources/examples/talk-llama/models/deepseek.cpp +144 -0
  104. data/ext/sources/examples/talk-llama/models/deepseek2.cpp +259 -0
  105. data/ext/sources/examples/talk-llama/models/dots1.cpp +134 -0
  106. data/ext/sources/examples/talk-llama/models/dream.cpp +105 -0
  107. data/ext/sources/examples/talk-llama/models/ernie4-5-moe.cpp +150 -0
  108. data/ext/sources/examples/talk-llama/models/ernie4-5.cpp +110 -0
  109. data/ext/sources/examples/talk-llama/models/exaone.cpp +114 -0
  110. data/ext/sources/examples/talk-llama/models/exaone4.cpp +123 -0
  111. data/ext/sources/examples/talk-llama/models/falcon-h1.cpp +113 -0
  112. data/ext/sources/examples/talk-llama/models/falcon.cpp +120 -0
  113. data/ext/sources/examples/talk-llama/models/gemma-embedding.cpp +116 -0
  114. data/ext/sources/examples/talk-llama/models/gemma.cpp +112 -0
  115. data/ext/sources/examples/talk-llama/models/gemma2-iswa.cpp +128 -0
  116. data/ext/sources/examples/talk-llama/models/gemma3.cpp +155 -0
  117. data/ext/sources/examples/talk-llama/models/gemma3n-iswa.cpp +384 -0
  118. data/ext/sources/examples/talk-llama/models/glm4-moe.cpp +170 -0
  119. data/ext/sources/examples/talk-llama/models/glm4.cpp +150 -0
  120. data/ext/sources/examples/talk-llama/models/gpt2.cpp +105 -0
  121. data/ext/sources/examples/talk-llama/models/gptneox.cpp +144 -0
  122. data/ext/sources/examples/talk-llama/models/granite-hybrid.cpp +196 -0
  123. data/ext/sources/examples/talk-llama/models/granite.cpp +211 -0
  124. data/ext/sources/examples/talk-llama/models/graph-context-mamba.cpp +283 -0
  125. data/ext/sources/examples/talk-llama/models/grok.cpp +159 -0
  126. data/ext/sources/examples/talk-llama/models/grovemoe.cpp +141 -0
  127. data/ext/sources/examples/talk-llama/models/hunyuan-dense.cpp +132 -0
  128. data/ext/sources/examples/talk-llama/models/hunyuan-moe.cpp +154 -0
  129. data/ext/sources/examples/talk-llama/models/internlm2.cpp +120 -0
  130. data/ext/sources/examples/talk-llama/models/jais.cpp +86 -0
  131. data/ext/sources/examples/talk-llama/models/jamba.cpp +106 -0
  132. data/ext/sources/examples/talk-llama/models/lfm2.cpp +175 -0
  133. data/ext/sources/examples/talk-llama/models/llada-moe.cpp +122 -0
  134. data/ext/sources/examples/talk-llama/models/llada.cpp +99 -0
  135. data/ext/sources/examples/talk-llama/models/llama-iswa.cpp +178 -0
  136. data/ext/sources/examples/talk-llama/models/llama.cpp +168 -0
  137. data/ext/sources/examples/talk-llama/models/maincoder.cpp +117 -0
  138. data/ext/sources/examples/talk-llama/models/mamba.cpp +55 -0
  139. data/ext/sources/examples/talk-llama/models/mimo2-iswa.cpp +123 -0
  140. data/ext/sources/examples/talk-llama/models/minicpm3.cpp +199 -0
  141. data/ext/sources/examples/talk-llama/models/minimax-m2.cpp +124 -0
  142. data/ext/sources/examples/talk-llama/models/mistral3.cpp +160 -0
  143. data/ext/sources/examples/talk-llama/models/models.h +569 -0
  144. data/ext/sources/examples/talk-llama/models/modern-bert.cpp +116 -0
  145. data/ext/sources/examples/talk-llama/models/mpt.cpp +126 -0
  146. data/ext/sources/examples/talk-llama/models/nemotron-h.cpp +150 -0
  147. data/ext/sources/examples/talk-llama/models/nemotron.cpp +122 -0
  148. data/ext/sources/examples/talk-llama/models/neo-bert.cpp +104 -0
  149. data/ext/sources/examples/talk-llama/models/olmo.cpp +121 -0
  150. data/ext/sources/examples/talk-llama/models/olmo2.cpp +150 -0
  151. data/ext/sources/examples/talk-llama/models/olmoe.cpp +124 -0
  152. data/ext/sources/examples/talk-llama/models/openai-moe-iswa.cpp +127 -0
  153. data/ext/sources/examples/talk-llama/models/openelm.cpp +124 -0
  154. data/ext/sources/examples/talk-llama/models/orion.cpp +123 -0
  155. data/ext/sources/examples/talk-llama/models/pangu-embedded.cpp +121 -0
  156. data/ext/sources/examples/talk-llama/models/phi2.cpp +121 -0
  157. data/ext/sources/examples/talk-llama/models/phi3.cpp +152 -0
  158. data/ext/sources/examples/talk-llama/models/plamo.cpp +110 -0
  159. data/ext/sources/examples/talk-llama/models/plamo2.cpp +316 -0
  160. data/ext/sources/examples/talk-llama/models/plamo3.cpp +128 -0
  161. data/ext/sources/examples/talk-llama/models/plm.cpp +168 -0
  162. data/ext/sources/examples/talk-llama/models/qwen.cpp +108 -0
  163. data/ext/sources/examples/talk-llama/models/qwen2.cpp +126 -0
  164. data/ext/sources/examples/talk-llama/models/qwen2moe.cpp +151 -0
  165. data/ext/sources/examples/talk-llama/models/qwen2vl.cpp +117 -0
  166. data/ext/sources/examples/talk-llama/models/qwen3.cpp +117 -0
  167. data/ext/sources/examples/talk-llama/models/qwen3moe.cpp +124 -0
  168. data/ext/sources/examples/talk-llama/models/qwen3next.cpp +873 -0
  169. data/ext/sources/examples/talk-llama/models/qwen3vl-moe.cpp +149 -0
  170. data/ext/sources/examples/talk-llama/models/qwen3vl.cpp +141 -0
  171. data/ext/sources/examples/talk-llama/models/refact.cpp +94 -0
  172. data/ext/sources/examples/talk-llama/models/rnd1.cpp +126 -0
  173. data/ext/sources/examples/talk-llama/models/rwkv6-base.cpp +162 -0
  174. data/ext/sources/examples/talk-llama/models/rwkv6.cpp +94 -0
  175. data/ext/sources/examples/talk-llama/models/rwkv6qwen2.cpp +86 -0
  176. data/ext/sources/examples/talk-llama/models/rwkv7-base.cpp +135 -0
  177. data/ext/sources/examples/talk-llama/models/rwkv7.cpp +90 -0
  178. data/ext/sources/examples/talk-llama/models/seed-oss.cpp +124 -0
  179. data/ext/sources/examples/talk-llama/models/smallthinker.cpp +126 -0
  180. data/ext/sources/examples/talk-llama/models/smollm3.cpp +128 -0
  181. data/ext/sources/examples/talk-llama/models/stablelm.cpp +146 -0
  182. data/ext/sources/examples/talk-llama/models/starcoder.cpp +100 -0
  183. data/ext/sources/examples/talk-llama/models/starcoder2.cpp +121 -0
  184. data/ext/sources/examples/talk-llama/models/t5-dec.cpp +166 -0
  185. data/ext/sources/examples/talk-llama/models/t5-enc.cpp +96 -0
  186. data/ext/sources/examples/talk-llama/models/wavtokenizer-dec.cpp +149 -0
  187. data/ext/sources/examples/talk-llama/models/xverse.cpp +108 -0
  188. data/ext/sources/examples/talk-llama/talk-llama.cpp +9 -6
  189. data/ext/sources/examples/talk-llama/unicode.cpp +309 -16
  190. data/ext/sources/examples/talk-llama/unicode.h +45 -0
  191. data/ext/sources/examples/vad-speech-segments/CMakeLists.txt +1 -1
  192. data/ext/sources/examples/wchess/wchess.cmd/wchess.cmd.cpp +4 -2
  193. data/ext/sources/examples/whisper.wasm/index-tmpl.html +18 -17
  194. data/ext/sources/ggml/CMakeLists.txt +135 -79
  195. data/ext/sources/ggml/cmake/ggml-config.cmake.in +132 -93
  196. data/ext/sources/ggml/include/ggml-alloc.h +9 -0
  197. data/ext/sources/ggml/include/ggml-backend.h +21 -2
  198. data/ext/sources/ggml/include/ggml-cpu.h +2 -1
  199. data/ext/sources/ggml/include/ggml-hexagon.h +19 -0
  200. data/ext/sources/ggml/include/ggml-metal.h +1 -6
  201. data/ext/sources/ggml/include/ggml-opt.h +25 -6
  202. data/ext/sources/ggml/include/ggml-rpc.h +8 -11
  203. data/ext/sources/ggml/include/ggml-webgpu.h +19 -0
  204. data/ext/sources/ggml/include/ggml-zdnn.h +17 -0
  205. data/ext/sources/ggml/include/ggml-zendnn.h +22 -0
  206. data/ext/sources/ggml/include/ggml.h +406 -23
  207. data/ext/sources/ggml/src/CMakeLists.txt +99 -13
  208. data/ext/sources/ggml/src/ggml-alloc.c +368 -161
  209. data/ext/sources/ggml/src/ggml-backend-impl.h +5 -5
  210. data/ext/sources/ggml/src/ggml-backend-reg.cpp +55 -14
  211. data/ext/sources/ggml/src/ggml-backend.cpp +290 -57
  212. data/ext/sources/ggml/src/ggml-blas/CMakeLists.txt +17 -3
  213. data/ext/sources/ggml/src/ggml-blas/ggml-blas.cpp +10 -13
  214. data/ext/sources/ggml/src/ggml-cann/CMakeLists.txt +14 -0
  215. data/ext/sources/ggml/src/ggml-cann/acl_tensor.cpp +59 -45
  216. data/ext/sources/ggml/src/ggml-cann/acl_tensor.h +138 -47
  217. data/ext/sources/ggml/src/ggml-cann/aclnn_ops.cpp +2586 -1917
  218. data/ext/sources/ggml/src/ggml-cann/aclnn_ops.h +348 -309
  219. data/ext/sources/ggml/src/ggml-cann/common.h +350 -133
  220. data/ext/sources/ggml/src/ggml-cann/ggml-cann.cpp +894 -625
  221. data/ext/sources/ggml/src/ggml-common.h +17 -0
  222. data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +167 -75
  223. data/ext/sources/ggml/src/ggml-cpu/amx/amx.cpp +5 -2
  224. data/ext/sources/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +4 -0
  225. data/ext/sources/ggml/src/ggml-cpu/arch/arm/quants.c +560 -622
  226. data/ext/sources/ggml/src/ggml-cpu/arch/arm/repack.cpp +1002 -270
  227. data/ext/sources/ggml/src/ggml-cpu/arch/loongarch/quants.c +107 -587
  228. data/ext/sources/ggml/src/ggml-cpu/arch/powerpc/quants.c +162 -589
  229. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/cpu-feats.cpp +38 -0
  230. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/quants.c +373 -486
  231. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/repack.cpp +3 -58
  232. data/ext/sources/ggml/src/ggml-cpu/arch/s390/cpu-feats.cpp +50 -0
  233. data/ext/sources/ggml/src/ggml-cpu/arch/s390/quants.c +521 -353
  234. data/ext/sources/ggml/src/ggml-cpu/arch/wasm/quants.c +54 -314
  235. data/ext/sources/ggml/src/ggml-cpu/arch/x86/quants.c +184 -675
  236. data/ext/sources/ggml/src/ggml-cpu/arch/x86/repack.cpp +4682 -1660
  237. data/ext/sources/ggml/src/ggml-cpu/arch-fallback.h +82 -4
  238. data/ext/sources/ggml/src/ggml-cpu/common.h +14 -0
  239. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-impl.h +18 -9
  240. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +263 -111
  241. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.cpp +39 -28
  242. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.cpp +683 -82
  243. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.h +38 -43
  244. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +435 -119
  245. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm-ppc.h +333 -0
  246. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1234 -1182
  247. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.h +6 -0
  248. data/ext/sources/ggml/src/ggml-cpu/ops.cpp +2167 -1480
  249. data/ext/sources/ggml/src/ggml-cpu/ops.h +10 -12
  250. data/ext/sources/ggml/src/ggml-cpu/quants.c +35 -0
  251. data/ext/sources/ggml/src/ggml-cpu/quants.h +8 -0
  252. data/ext/sources/ggml/src/ggml-cpu/repack.cpp +1132 -81
  253. data/ext/sources/ggml/src/ggml-cpu/repack.h +36 -0
  254. data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +120 -93
  255. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.cpp +1025 -0
  256. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.h +13 -0
  257. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp +3196 -0
  258. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime_kernels.h +26 -0
  259. data/ext/sources/ggml/src/ggml-cpu/traits.cpp +2 -2
  260. data/ext/sources/ggml/src/ggml-cpu/traits.h +1 -1
  261. data/ext/sources/ggml/src/ggml-cpu/unary-ops.cpp +151 -0
  262. data/ext/sources/ggml/src/ggml-cpu/unary-ops.h +7 -0
  263. data/ext/sources/ggml/src/ggml-cpu/vec.cpp +294 -27
  264. data/ext/sources/ggml/src/ggml-cpu/vec.h +606 -48
  265. data/ext/sources/ggml/src/ggml-cuda/CMakeLists.txt +92 -17
  266. data/ext/sources/ggml/src/ggml-cuda/add-id.cu +58 -0
  267. data/ext/sources/ggml/src/ggml-cuda/add-id.cuh +3 -0
  268. data/ext/sources/ggml/src/ggml-cuda/argmax.cu +2 -2
  269. data/ext/sources/ggml/src/ggml-cuda/argsort.cu +123 -6
  270. data/ext/sources/ggml/src/ggml-cuda/argsort.cuh +16 -0
  271. data/ext/sources/ggml/src/ggml-cuda/binbcast.cu +330 -191
  272. data/ext/sources/ggml/src/ggml-cuda/binbcast.cuh +2 -0
  273. data/ext/sources/ggml/src/ggml-cuda/common.cuh +588 -128
  274. data/ext/sources/ggml/src/ggml-cuda/conv-transpose-1d.cu +1 -4
  275. data/ext/sources/ggml/src/ggml-cuda/conv2d.cu +166 -0
  276. data/ext/sources/ggml/src/ggml-cuda/conv2d.cuh +5 -0
  277. data/ext/sources/ggml/src/ggml-cuda/convert.cu +95 -22
  278. data/ext/sources/ggml/src/ggml-cuda/convert.cuh +25 -0
  279. data/ext/sources/ggml/src/ggml-cuda/cpy-utils.cuh +217 -0
  280. data/ext/sources/ggml/src/ggml-cuda/cpy.cu +335 -485
  281. data/ext/sources/ggml/src/ggml-cuda/cpy.cuh +1 -5
  282. data/ext/sources/ggml/src/ggml-cuda/cross-entropy-loss.cu +2 -14
  283. data/ext/sources/ggml/src/ggml-cuda/cumsum.cu +307 -0
  284. data/ext/sources/ggml/src/ggml-cuda/cumsum.cuh +5 -0
  285. data/ext/sources/ggml/src/ggml-cuda/dequantize.cuh +14 -40
  286. data/ext/sources/ggml/src/ggml-cuda/diag.cu +77 -0
  287. data/ext/sources/ggml/src/ggml-cuda/diag.cuh +5 -0
  288. data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +519 -378
  289. data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +750 -637
  290. data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cu +49 -0
  291. data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cuh +1244 -0
  292. data/ext/sources/ggml/src/ggml-cuda/fattn-vec.cuh +586 -0
  293. data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +98 -61
  294. data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cuh +48 -0
  295. data/ext/sources/ggml/src/ggml-cuda/fattn.cu +230 -197
  296. data/ext/sources/ggml/src/ggml-cuda/fattn.cuh +2 -0
  297. data/ext/sources/ggml/src/ggml-cuda/fill.cu +37 -0
  298. data/ext/sources/ggml/src/ggml-cuda/fill.cuh +3 -0
  299. data/ext/sources/ggml/src/ggml-cuda/getrows.cu +50 -39
  300. data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +1557 -294
  301. data/ext/sources/ggml/src/ggml-cuda/im2col.cu +196 -35
  302. data/ext/sources/ggml/src/ggml-cuda/im2col.cuh +1 -0
  303. data/ext/sources/ggml/src/ggml-cuda/mean.cu +57 -2
  304. data/ext/sources/ggml/src/ggml-cuda/mma.cuh +915 -69
  305. data/ext/sources/ggml/src/ggml-cuda/mmf.cu +171 -0
  306. data/ext/sources/ggml/src/ggml-cuda/mmf.cuh +835 -0
  307. data/ext/sources/ggml/src/ggml-cuda/mmid.cu +164 -0
  308. data/ext/sources/ggml/src/ggml-cuda/mmid.cuh +5 -0
  309. data/ext/sources/ggml/src/ggml-cuda/mmq.cu +109 -67
  310. data/ext/sources/ggml/src/ggml-cuda/mmq.cuh +1601 -733
  311. data/ext/sources/ggml/src/ggml-cuda/mmvf.cu +802 -0
  312. data/ext/sources/ggml/src/ggml-cuda/mmvf.cuh +12 -0
  313. data/ext/sources/ggml/src/ggml-cuda/mmvq.cu +286 -149
  314. data/ext/sources/ggml/src/ggml-cuda/mmvq.cuh +1 -1
  315. data/ext/sources/ggml/src/ggml-cuda/norm.cu +284 -12
  316. data/ext/sources/ggml/src/ggml-cuda/norm.cuh +7 -0
  317. data/ext/sources/ggml/src/ggml-cuda/opt-step-sgd.cu +49 -0
  318. data/ext/sources/ggml/src/ggml-cuda/opt-step-sgd.cuh +5 -0
  319. data/ext/sources/ggml/src/ggml-cuda/pad.cu +86 -32
  320. data/ext/sources/ggml/src/ggml-cuda/pad_reflect_1d.cu +91 -0
  321. data/ext/sources/ggml/src/ggml-cuda/pad_reflect_1d.cuh +5 -0
  322. data/ext/sources/ggml/src/ggml-cuda/quantize.cu +163 -10
  323. data/ext/sources/ggml/src/ggml-cuda/quantize.cuh +14 -0
  324. data/ext/sources/ggml/src/ggml-cuda/reduce_rows.cuh +53 -0
  325. data/ext/sources/ggml/src/ggml-cuda/roll.cu +67 -0
  326. data/ext/sources/ggml/src/ggml-cuda/roll.cuh +5 -0
  327. data/ext/sources/ggml/src/ggml-cuda/rope.cu +207 -98
  328. data/ext/sources/ggml/src/ggml-cuda/rope.cuh +2 -0
  329. data/ext/sources/ggml/src/ggml-cuda/scale.cu +14 -11
  330. data/ext/sources/ggml/src/ggml-cuda/set-rows.cu +330 -0
  331. data/ext/sources/ggml/src/ggml-cuda/set-rows.cuh +7 -0
  332. data/ext/sources/ggml/src/ggml-cuda/set.cu +39 -0
  333. data/ext/sources/ggml/src/ggml-cuda/set.cuh +7 -0
  334. data/ext/sources/ggml/src/ggml-cuda/softcap.cu +34 -0
  335. data/ext/sources/ggml/src/ggml-cuda/softcap.cuh +5 -0
  336. data/ext/sources/ggml/src/ggml-cuda/softmax.cu +325 -61
  337. data/ext/sources/ggml/src/ggml-cuda/solve_tri.cu +275 -0
  338. data/ext/sources/ggml/src/ggml-cuda/solve_tri.cuh +3 -0
  339. data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cu +14 -12
  340. data/ext/sources/ggml/src/ggml-cuda/ssm-scan.cu +291 -104
  341. data/ext/sources/ggml/src/ggml-cuda/sum.cu +6 -10
  342. data/ext/sources/ggml/src/ggml-cuda/sumrows.cu +21 -4
  343. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq112-dv112.cu +5 -0
  344. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq128-dv128.cu +5 -0
  345. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq256-dv256.cu +5 -0
  346. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq40-dv40.cu +5 -0
  347. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq576-dv512.cu +5 -0
  348. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq64-dv64.cu +5 -0
  349. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq72-dv72.cu +5 -0
  350. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq80-dv80.cu +5 -0
  351. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq96-dv96.cu +5 -0
  352. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-f16.cu +7 -0
  353. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q4_0.cu +7 -0
  354. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q4_1.cu +7 -0
  355. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q5_0.cu +7 -0
  356. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q5_1.cu +7 -0
  357. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q8_0.cu +7 -0
  358. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-f16.cu +7 -0
  359. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_0.cu +7 -0
  360. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_1.cu +7 -0
  361. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q5_0.cu +7 -0
  362. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q5_1.cu +7 -0
  363. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q8_0.cu +7 -0
  364. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-f16.cu +7 -0
  365. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q4_0.cu +7 -0
  366. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q4_1.cu +7 -0
  367. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q5_0.cu +7 -0
  368. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q5_1.cu +7 -0
  369. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q8_0.cu +7 -0
  370. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-f16.cu +7 -0
  371. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q4_0.cu +7 -0
  372. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q4_1.cu +7 -0
  373. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q5_0.cu +7 -0
  374. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q5_1.cu +7 -0
  375. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q8_0.cu +7 -0
  376. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-f16.cu +7 -0
  377. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q4_0.cu +7 -0
  378. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q4_1.cu +7 -0
  379. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q5_0.cu +7 -0
  380. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q5_1.cu +7 -0
  381. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q8_0.cu +7 -0
  382. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-f16.cu +7 -0
  383. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q4_0.cu +7 -0
  384. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q4_1.cu +7 -0
  385. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q5_0.cu +7 -0
  386. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q5_1.cu +7 -0
  387. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q8_0.cu +7 -0
  388. data/ext/sources/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +40 -19
  389. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_1.cu +5 -0
  390. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_10.cu +5 -0
  391. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_11.cu +5 -0
  392. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_12.cu +5 -0
  393. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_13.cu +5 -0
  394. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_14.cu +5 -0
  395. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_15.cu +5 -0
  396. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_16.cu +5 -0
  397. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_2.cu +5 -0
  398. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_3.cu +5 -0
  399. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_4.cu +5 -0
  400. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_5.cu +5 -0
  401. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_6.cu +5 -0
  402. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_7.cu +5 -0
  403. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_8.cu +5 -0
  404. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_9.cu +5 -0
  405. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-mxfp4.cu +5 -0
  406. data/ext/sources/ggml/src/ggml-cuda/top-k.cu +96 -0
  407. data/ext/sources/ggml/src/ggml-cuda/top-k.cuh +3 -0
  408. data/ext/sources/ggml/src/ggml-cuda/topk-moe.cu +351 -0
  409. data/ext/sources/ggml/src/ggml-cuda/topk-moe.cuh +21 -0
  410. data/ext/sources/ggml/src/ggml-cuda/tri.cu +136 -0
  411. data/ext/sources/ggml/src/ggml-cuda/tri.cuh +5 -0
  412. data/ext/sources/ggml/src/ggml-cuda/tsembd.cu +3 -3
  413. data/ext/sources/ggml/src/ggml-cuda/unary.cu +189 -5
  414. data/ext/sources/ggml/src/ggml-cuda/unary.cuh +44 -0
  415. data/ext/sources/ggml/src/ggml-cuda/upscale.cu +248 -6
  416. data/ext/sources/ggml/src/ggml-cuda/vecdotq.cuh +110 -22
  417. data/ext/sources/ggml/src/ggml-cuda/vendors/cuda.h +8 -0
  418. data/ext/sources/ggml/src/ggml-cuda/vendors/hip.h +70 -37
  419. data/ext/sources/ggml/src/ggml-cuda/vendors/musa.h +10 -3
  420. data/ext/sources/ggml/src/ggml-hexagon/CMakeLists.txt +80 -0
  421. data/ext/sources/ggml/src/ggml-hexagon/ggml-hexagon.cpp +3151 -0
  422. data/ext/sources/ggml/src/ggml-hexagon/htp/CMakeLists.txt +44 -0
  423. data/ext/sources/ggml/src/ggml-hexagon/htp/act-ops.c +682 -0
  424. data/ext/sources/ggml/src/ggml-hexagon/htp/binary-ops.c +360 -0
  425. data/ext/sources/ggml/src/ggml-hexagon/htp/cmake-toolchain.cmake +157 -0
  426. data/ext/sources/ggml/src/ggml-hexagon/htp/flash-attn-ops.c +566 -0
  427. data/ext/sources/ggml/src/ggml-hexagon/htp/get-rows-ops.c +112 -0
  428. data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ctx.h +35 -0
  429. data/ext/sources/ggml/src/ggml-hexagon/htp/htp-dma.c +63 -0
  430. data/ext/sources/ggml/src/ggml-hexagon/htp/htp-dma.h +157 -0
  431. data/ext/sources/ggml/src/ggml-hexagon/htp/htp-msg.h +165 -0
  432. data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ops.h +92 -0
  433. data/ext/sources/ggml/src/ggml-hexagon/htp/htp_iface.idl +16 -0
  434. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-exp.c +94 -0
  435. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-inverse.c +72 -0
  436. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sigmoid.c +49 -0
  437. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-utils.c +1020 -0
  438. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-utils.h +1353 -0
  439. data/ext/sources/ggml/src/ggml-hexagon/htp/main.c +1001 -0
  440. data/ext/sources/ggml/src/ggml-hexagon/htp/matmul-ops.c +2503 -0
  441. data/ext/sources/ggml/src/ggml-hexagon/htp/ops-utils.h +149 -0
  442. data/ext/sources/ggml/src/ggml-hexagon/htp/rope-ops.c +487 -0
  443. data/ext/sources/ggml/src/ggml-hexagon/htp/set-rows-ops.c +168 -0
  444. data/ext/sources/ggml/src/ggml-hexagon/htp/softmax-ops.c +402 -0
  445. data/ext/sources/ggml/src/ggml-hexagon/htp/unary-ops.c +287 -0
  446. data/ext/sources/ggml/src/ggml-hexagon/htp/worker-pool.c +297 -0
  447. data/ext/sources/ggml/src/ggml-hexagon/htp/worker-pool.h +57 -0
  448. data/ext/sources/ggml/src/ggml-hexagon/htp-utils.c +454 -0
  449. data/ext/sources/ggml/src/ggml-hexagon/htp-utils.h +221 -0
  450. data/ext/sources/ggml/src/ggml-hexagon/op-desc.h +153 -0
  451. data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +16 -13
  452. data/ext/sources/ggml/src/ggml-impl.h +186 -15
  453. data/ext/sources/ggml/src/ggml-metal/CMakeLists.txt +10 -7
  454. data/ext/sources/ggml/src/ggml-metal/ggml-metal-common.cpp +446 -0
  455. data/ext/sources/ggml/src/ggml-metal/ggml-metal-common.h +52 -0
  456. data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.h +33 -0
  457. data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.m +609 -0
  458. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.cpp +1743 -0
  459. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.h +273 -0
  460. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.m +1686 -0
  461. data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +356 -61
  462. data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.cpp +4161 -0
  463. data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.h +94 -0
  464. data/ext/sources/ggml/src/ggml-metal/ggml-metal.cpp +724 -0
  465. data/ext/sources/ggml/src/ggml-metal/ggml-metal.metal +4495 -1876
  466. data/ext/sources/ggml/src/ggml-musa/CMakeLists.txt +21 -9
  467. data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +29 -0
  468. data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +4005 -427
  469. data/ext/sources/ggml/src/ggml-opencl/kernels/add.cl +107 -0
  470. data/ext/sources/ggml/src/ggml-opencl/kernels/add_id.cl +42 -0
  471. data/ext/sources/ggml/src/ggml-opencl/kernels/conv2d.cl +185 -0
  472. data/ext/sources/ggml/src/ggml-opencl/kernels/conv2d_f16_f32.cl +176 -0
  473. data/ext/sources/ggml/src/ggml-opencl/kernels/cvt.cl +147 -0
  474. data/ext/sources/ggml/src/ggml-opencl/kernels/div.cl +66 -0
  475. data/ext/sources/ggml/src/ggml-opencl/kernels/expm1.cl +82 -0
  476. data/ext/sources/ggml/src/ggml-opencl/kernels/fill.cl +17 -0
  477. data/ext/sources/ggml/src/ggml-opencl/kernels/flash_attn_f16.cl +370 -0
  478. data/ext/sources/ggml/src/ggml-opencl/kernels/flash_attn_f32.cl +371 -0
  479. data/ext/sources/ggml/src/ggml-opencl/kernels/flash_attn_f32_f16.cl +373 -0
  480. data/ext/sources/ggml/src/ggml-opencl/kernels/gelu.cl +27 -0
  481. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_mxfp4_f32.cl +162 -0
  482. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_mxfp4_f32.cl +156 -0
  483. data/ext/sources/ggml/src/ggml-opencl/kernels/get_rows.cl +36 -12
  484. data/ext/sources/ggml/src/ggml-opencl/kernels/glu.cl +177 -0
  485. data/ext/sources/ggml/src/ggml-opencl/kernels/group_norm.cl +49 -0
  486. data/ext/sources/ggml/src/ggml-opencl/kernels/im2col_f16.cl +1 -1
  487. data/ext/sources/ggml/src/ggml-opencl/kernels/im2col_f32.cl +1 -1
  488. data/ext/sources/ggml/src/ggml-opencl/kernels/mean.cl +39 -0
  489. data/ext/sources/ggml/src/ggml-opencl/kernels/mul.cl +73 -0
  490. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mat_f16_f32.cl +130 -0
  491. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_kq_kqv.cl +273 -0
  492. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_l4_lm.cl +146 -0
  493. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_f32_f32_l4_lm.cl +147 -0
  494. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_l4_lm.cl +154 -0
  495. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32.cl +189 -0
  496. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32_flat.cl +176 -0
  497. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32.cl +140 -0
  498. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32_flat.cl +222 -0
  499. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32.cl +144 -0
  500. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32_flat.cl +167 -0
  501. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32.cl +125 -0
  502. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32_flat.cl +202 -0
  503. data/ext/sources/ggml/src/ggml-opencl/kernels/norm.cl +80 -0
  504. data/ext/sources/ggml/src/ggml-opencl/kernels/pad.cl +29 -20
  505. data/ext/sources/ggml/src/ggml-opencl/kernels/rms_norm.cl +94 -0
  506. data/ext/sources/ggml/src/ggml-opencl/kernels/rope.cl +50 -24
  507. data/ext/sources/ggml/src/ggml-opencl/kernels/scale.cl +3 -2
  508. data/ext/sources/ggml/src/ggml-opencl/kernels/set_rows.cl +208 -0
  509. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_4_f16.cl +34 -13
  510. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_4_f32.cl +34 -13
  511. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_f16.cl +34 -13
  512. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_f32.cl +34 -13
  513. data/ext/sources/ggml/src/ggml-opencl/kernels/softplus.cl +88 -0
  514. data/ext/sources/ggml/src/ggml-opencl/kernels/sqr.cl +53 -0
  515. data/ext/sources/ggml/src/ggml-opencl/kernels/sqrt.cl +53 -0
  516. data/ext/sources/ggml/src/ggml-opencl/kernels/ssm_conv.cl +77 -0
  517. data/ext/sources/ggml/src/ggml-opencl/kernels/sub.cl +66 -0
  518. data/ext/sources/ggml/src/ggml-opencl/kernels/transpose.cl +33 -0
  519. data/ext/sources/ggml/src/ggml-opencl/kernels/tsembd.cl +2 -2
  520. data/ext/sources/ggml/src/ggml-opencl/kernels/upscale.cl +2 -3
  521. data/ext/sources/ggml/src/ggml-opt.cpp +97 -41
  522. data/ext/sources/ggml/src/ggml-quants.c +111 -16
  523. data/ext/sources/ggml/src/ggml-quants.h +6 -0
  524. data/ext/sources/ggml/src/ggml-rpc/ggml-rpc.cpp +497 -195
  525. data/ext/sources/ggml/src/ggml-sycl/CMakeLists.txt +48 -3
  526. data/ext/sources/ggml/src/ggml-sycl/add-id.cpp +77 -0
  527. data/ext/sources/ggml/src/ggml-sycl/add-id.hpp +8 -0
  528. data/ext/sources/ggml/src/ggml-sycl/backend.hpp +8 -0
  529. data/ext/sources/ggml/src/ggml-sycl/binbcast.cpp +6 -5
  530. data/ext/sources/ggml/src/ggml-sycl/common.hpp +117 -15
  531. data/ext/sources/ggml/src/ggml-sycl/concat.cpp +50 -30
  532. data/ext/sources/ggml/src/ggml-sycl/conv.cpp +10 -4
  533. data/ext/sources/ggml/src/ggml-sycl/convert.cpp +200 -99
  534. data/ext/sources/ggml/src/ggml-sycl/count-equal.cpp +79 -0
  535. data/ext/sources/ggml/src/ggml-sycl/count-equal.hpp +9 -0
  536. data/ext/sources/ggml/src/ggml-sycl/cpy.cpp +72 -309
  537. data/ext/sources/ggml/src/ggml-sycl/cpy.hpp +213 -1
  538. data/ext/sources/ggml/src/ggml-sycl/dequantize.hpp +18 -0
  539. data/ext/sources/ggml/src/ggml-sycl/dmmv.cpp +67 -49
  540. data/ext/sources/ggml/src/ggml-sycl/dpct/helper.hpp +77 -34
  541. data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +397 -314
  542. data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +12 -2
  543. data/ext/sources/ggml/src/ggml-sycl/gemm.hpp +14 -26
  544. data/ext/sources/ggml/src/ggml-sycl/getrows.cpp +9 -6
  545. data/ext/sources/ggml/src/ggml-sycl/ggml-sycl.cpp +643 -413
  546. data/ext/sources/ggml/src/ggml-sycl/gla.cpp +2 -2
  547. data/ext/sources/ggml/src/ggml-sycl/im2col.cpp +2 -2
  548. data/ext/sources/ggml/src/ggml-sycl/mmq.cpp +80 -60
  549. data/ext/sources/ggml/src/ggml-sycl/mmvq.cpp +223 -132
  550. data/ext/sources/ggml/src/ggml-sycl/norm.cpp +230 -55
  551. data/ext/sources/ggml/src/ggml-sycl/norm.hpp +2 -0
  552. data/ext/sources/ggml/src/ggml-sycl/pad.cpp +97 -0
  553. data/ext/sources/ggml/src/ggml-sycl/pad.hpp +24 -0
  554. data/ext/sources/ggml/src/ggml-sycl/pad_reflect_1d.cpp +100 -0
  555. data/ext/sources/ggml/src/ggml-sycl/pad_reflect_1d.hpp +10 -0
  556. data/ext/sources/ggml/src/ggml-sycl/presets.hpp +2 -0
  557. data/ext/sources/ggml/src/ggml-sycl/quantize.hpp +133 -0
  558. data/ext/sources/ggml/src/ggml-sycl/quants.hpp +8 -9
  559. data/ext/sources/ggml/src/ggml-sycl/repeat_back.cpp +76 -0
  560. data/ext/sources/ggml/src/ggml-sycl/repeat_back.hpp +8 -0
  561. data/ext/sources/ggml/src/ggml-sycl/roll.cpp +122 -0
  562. data/ext/sources/ggml/src/ggml-sycl/roll.hpp +20 -0
  563. data/ext/sources/ggml/src/ggml-sycl/rope.cpp +65 -59
  564. data/ext/sources/ggml/src/ggml-sycl/set.cpp +73 -0
  565. data/ext/sources/ggml/src/ggml-sycl/set.hpp +5 -0
  566. data/ext/sources/ggml/src/ggml-sycl/set_rows.cpp +234 -0
  567. data/ext/sources/ggml/src/ggml-sycl/set_rows.hpp +8 -0
  568. data/ext/sources/ggml/src/ggml-sycl/softmax.cpp +330 -165
  569. data/ext/sources/ggml/src/ggml-sycl/softmax.hpp +4 -0
  570. data/ext/sources/ggml/src/ggml-sycl/ssm_conv.cpp +127 -0
  571. data/ext/sources/ggml/src/ggml-sycl/ssm_conv.hpp +5 -0
  572. data/ext/sources/ggml/src/ggml-sycl/tsembd.cpp +12 -6
  573. data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +60 -6
  574. data/ext/sources/ggml/src/ggml-sycl/wkv.cpp +16 -12
  575. data/ext/sources/ggml/src/ggml-vulkan/CMakeLists.txt +38 -18
  576. data/ext/sources/ggml/src/ggml-vulkan/ggml-vulkan.cpp +7398 -2635
  577. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/abs.comp +21 -0
  578. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/acc.comp +2 -2
  579. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add.comp +43 -3
  580. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add1.comp +28 -0
  581. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add_id.comp +42 -0
  582. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/arange.comp +20 -0
  583. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argmax.comp +15 -6
  584. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp +56 -39
  585. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argsort_large.comp +114 -0
  586. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/ceil.comp +22 -0
  587. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/clamp.comp +2 -2
  588. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/concat.comp +2 -2
  589. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/contig_copy.comp +2 -2
  590. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_dw.comp +1 -1
  591. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +347 -0
  592. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +1 -1
  593. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy.comp +2 -2
  594. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +5 -5
  595. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +67 -13
  596. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_transpose.comp +67 -0
  597. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/cos.comp +2 -2
  598. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/count_equal.comp +2 -2
  599. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/count_experts.comp +51 -0
  600. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/cumsum.comp +83 -0
  601. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/cumsum_multipass1.comp +60 -0
  602. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/cumsum_multipass2.comp +66 -0
  603. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_f32.comp +1 -1
  604. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{dequant_funcs.comp → dequant_funcs.glsl} +158 -16
  605. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{dequant_funcs_cm2.comp → dequant_funcs_cm2.glsl} +38 -3
  606. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{dequant_head.comp → dequant_head.glsl} +1 -1
  607. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_m.comp +1 -1
  608. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_s.comp +1 -1
  609. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_s.comp +2 -2
  610. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xs.comp +1 -1
  611. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xxs.comp +3 -2
  612. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_s.comp +7 -6
  613. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_xxs.comp +5 -3
  614. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_nl.comp +1 -1
  615. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_xs.comp +1 -1
  616. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_mxfp4.comp +32 -0
  617. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q2_k.comp +4 -4
  618. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q3_k.comp +2 -2
  619. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_0.comp +1 -1
  620. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_1.comp +1 -1
  621. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_k.comp +4 -4
  622. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_0.comp +1 -1
  623. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_1.comp +1 -1
  624. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_k.comp +4 -4
  625. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q6_k.comp +2 -2
  626. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q8_0.comp +1 -1
  627. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/diag.comp +29 -0
  628. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/diag_mask_inf.comp +1 -1
  629. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/div.comp +2 -2
  630. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/exp.comp +21 -0
  631. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/fill.comp +19 -0
  632. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +103 -36
  633. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.glsl +220 -0
  634. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +139 -45
  635. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +113 -38
  636. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +75 -14
  637. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/floor.comp +22 -0
  638. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu.comp +2 -2
  639. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu_erf.comp +27 -0
  640. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu_quick.comp +11 -0
  641. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gelu.comp +2 -2
  642. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gelu_erf.comp +39 -0
  643. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gelu_quick.comp +2 -2
  644. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{generic_binary_head.comp → generic_binary_head.glsl} +19 -17
  645. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{generic_head.comp → generic_head.glsl} +2 -0
  646. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{generic_unary_head.comp → generic_unary_head.glsl} +7 -0
  647. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/get_rows.comp +21 -12
  648. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/get_rows_quant.comp +28 -18
  649. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{glu_head.comp → glu_head.glsl} +4 -0
  650. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/group_norm.comp +2 -2
  651. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/hardsigmoid.comp +22 -0
  652. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/hardswish.comp +22 -0
  653. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +33 -17
  654. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col_3d.comp +125 -0
  655. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/l2_norm.comp +2 -2
  656. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/leaky_relu.comp +2 -2
  657. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/log.comp +18 -0
  658. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul.comp +2 -2
  659. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp +2 -2
  660. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.glsl +227 -0
  661. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iface.glsl +35 -0
  662. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_m.comp +71 -21
  663. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_s.comp +41 -25
  664. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_s.comp +2 -2
  665. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xs.comp +44 -26
  666. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xxs.comp +2 -2
  667. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_s.comp +2 -2
  668. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_xxs.comp +2 -2
  669. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_nc.comp +20 -14
  670. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_p021.comp +9 -7
  671. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp +4 -6
  672. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q3_k.comp +2 -2
  673. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp +4 -6
  674. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp +4 -6
  675. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp +2 -2
  676. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp +143 -0
  677. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq_funcs.glsl +494 -0
  678. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +144 -556
  679. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +230 -51
  680. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.glsl +566 -0
  681. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_id_funcs.glsl +72 -0
  682. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +90 -223
  683. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.glsl +454 -0
  684. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_shmem_types.glsl +78 -0
  685. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp +195 -0
  686. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/neg.comp +20 -0
  687. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/norm.comp +2 -2
  688. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_adamw.comp +2 -2
  689. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_sgd.comp +22 -0
  690. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/pad.comp +41 -5
  691. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/pool2d.comp +1 -1
  692. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/quantize_q8_1.comp +59 -9
  693. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/reglu.comp +2 -2
  694. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/relu.comp +2 -2
  695. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/repeat.comp +2 -2
  696. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/repeat_back.comp +2 -2
  697. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +104 -14
  698. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_back.comp +2 -2
  699. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_partials.comp +65 -0
  700. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/roll.comp +46 -0
  701. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl +234 -0
  702. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.glsl +20 -0
  703. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +6 -52
  704. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +6 -35
  705. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +6 -35
  706. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl +28 -0
  707. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_vision.comp +6 -39
  708. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/round.comp +29 -0
  709. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rte.glsl +5 -0
  710. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp +3 -3
  711. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sigmoid.comp +2 -2
  712. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/silu.comp +2 -2
  713. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/silu_back.comp +2 -2
  714. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sin.comp +2 -2
  715. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +30 -8
  716. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_back.comp +6 -2
  717. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large1.comp +62 -0
  718. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large2.comp +79 -0
  719. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large3.comp +65 -0
  720. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large_common.glsl +53 -0
  721. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/softplus.comp +23 -0
  722. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/solve_tri.comp +81 -0
  723. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sqrt.comp +17 -0
  724. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/square.comp +2 -2
  725. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/ssm_conv.comp +44 -0
  726. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/ssm_scan.comp +124 -0
  727. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/step.comp +22 -0
  728. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sub.comp +2 -2
  729. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.comp +16 -6
  730. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.glsl +25 -0
  731. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/swiglu.comp +2 -2
  732. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/swiglu_oai.comp +14 -0
  733. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/tanh.comp +2 -2
  734. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/timestep_embedding.comp +5 -4
  735. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/topk_argsort.comp +118 -0
  736. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/topk_moe.comp +213 -0
  737. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/topk_nary_search.comp +246 -0
  738. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/tri.comp +43 -0
  739. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/trunc.comp +22 -0
  740. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{types.comp → types.glsl} +435 -24
  741. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +148 -6
  742. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/utils.glsl +25 -0
  743. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +619 -177
  744. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/xielu.comp +35 -0
  745. data/ext/sources/ggml/src/ggml-webgpu/CMakeLists.txt +80 -0
  746. data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp +169 -0
  747. data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu.cpp +3087 -0
  748. data/ext/sources/ggml/src/ggml-webgpu/pre_wgsl.hpp +778 -0
  749. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/bin_op.tmpl.wgsl +188 -0
  750. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary_head.tmpl +45 -0
  751. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +930 -0
  752. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cpy.tmpl.wgsl +101 -0
  753. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +147 -0
  754. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn.wgsl +591 -0
  755. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/get_rows.tmpl.wgsl +874 -0
  756. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/glu.tmpl.wgsl +323 -0
  757. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl +40 -0
  758. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl +907 -0
  759. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl +97 -0
  760. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.tmpl.wgsl +247 -0
  761. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_subgroup_matrix.tmpl.wgsl +302 -0
  762. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.tmpl.wgsl +267 -0
  763. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm.wgsl +123 -0
  764. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rope.tmpl.wgsl +295 -0
  765. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/scale.tmpl.wgsl +90 -0
  766. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.tmpl.wgsl +112 -0
  767. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +81 -0
  768. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/soft_max.tmpl.wgsl +345 -0
  769. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/unary_op.wgsl +483 -0
  770. data/ext/sources/ggml/src/ggml-zdnn/CMakeLists.txt +36 -0
  771. data/ext/sources/ggml/src/ggml-zdnn/common.hpp +59 -0
  772. data/ext/sources/ggml/src/ggml-zdnn/ggml-zdnn.cpp +628 -0
  773. data/ext/sources/ggml/src/ggml-zdnn/mmf.cpp +80 -0
  774. data/ext/sources/ggml/src/ggml-zdnn/mmf.hpp +12 -0
  775. data/ext/sources/ggml/src/ggml-zdnn/utils.cpp +79 -0
  776. data/ext/sources/ggml/src/ggml-zdnn/utils.hpp +19 -0
  777. data/ext/sources/ggml/src/ggml-zendnn/CMakeLists.txt +92 -0
  778. data/ext/sources/ggml/src/ggml-zendnn/ggml-zendnn.cpp +466 -0
  779. data/ext/sources/ggml/src/ggml.c +901 -129
  780. data/ext/sources/ggml/src/gguf.cpp +8 -1
  781. data/ext/sources/include/whisper.h +1 -0
  782. data/ext/sources/src/CMakeLists.txt +3 -1
  783. data/ext/sources/src/whisper.cpp +124 -81
  784. data/ext/sources/tests/CMakeLists.txt +8 -1
  785. data/ext/sources/tests/test-vad-full.cpp +7 -5
  786. data/ext/sources/tests/test-vad.cpp +3 -3
  787. data/extsources.rb +1 -0
  788. data/lib/whisper/model/uri.rb +17 -18
  789. data/sig/whisper.rbs +126 -2
  790. data/test/test_params.rb +24 -8
  791. data/test/test_segment.rb +0 -1
  792. data/test/test_token.rb +70 -0
  793. data/test/test_vad.rb +1 -1
  794. data/test/test_vad_context.rb +50 -0
  795. data/test/test_vad_segment.rb +19 -0
  796. data/test/test_vad_segments.rb +16 -0
  797. data/test/test_whisper.rb +8 -1
  798. data/whispercpp.gemspec +1 -1
  799. metadata +439 -179
  800. data/ext/sources/build-xcframework.sh +0 -547
  801. data/ext/sources/examples/talk-llama/llama-kv-cache-unified-iswa.cpp +0 -279
  802. data/ext/sources/examples/talk-llama/llama-kv-cache-unified.cpp +0 -1841
  803. data/ext/sources/examples/talk-llama/llama-kv-cache-unified.h +0 -303
  804. data/ext/sources/ggml/include/ggml-kompute.h +0 -50
  805. data/ext/sources/ggml/src/ggml-amx/CMakeLists.txt +0 -107
  806. data/ext/sources/ggml/src/ggml-amx/common.h +0 -94
  807. data/ext/sources/ggml/src/ggml-amx/ggml-amx.cpp +0 -446
  808. data/ext/sources/ggml/src/ggml-amx/mmq.cpp +0 -2510
  809. data/ext/sources/ggml/src/ggml-amx/mmq.h +0 -17
  810. data/ext/sources/ggml/src/ggml-cann/Doxyfile +0 -2579
  811. data/ext/sources/ggml/src/ggml-cann/kernels/CMakeLists.txt +0 -30
  812. data/ext/sources/ggml/src/ggml-cann/kernels/ascendc_kernels.h +0 -19
  813. data/ext/sources/ggml/src/ggml-cann/kernels/dup.cpp +0 -234
  814. data/ext/sources/ggml/src/ggml-cann/kernels/get_row_f16.cpp +0 -197
  815. data/ext/sources/ggml/src/ggml-cann/kernels/get_row_f32.cpp +0 -190
  816. data/ext/sources/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -204
  817. data/ext/sources/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -191
  818. data/ext/sources/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -218
  819. data/ext/sources/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -216
  820. data/ext/sources/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -295
  821. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f16.cu +0 -357
  822. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f16.cuh +0 -3
  823. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f32.cu +0 -365
  824. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f32.cuh +0 -3
  825. data/ext/sources/ggml/src/ggml-cuda/fattn-vec-f16.cuh +0 -482
  826. data/ext/sources/ggml/src/ggml-cuda/fattn-vec-f32.cuh +0 -472
  827. data/ext/sources/ggml/src/ggml-cuda/mmv.cu +0 -506
  828. data/ext/sources/ggml/src/ggml-cuda/mmv.cuh +0 -11
  829. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +0 -5
  830. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +0 -5
  831. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +0 -5
  832. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +0 -5
  833. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +0 -5
  834. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +0 -5
  835. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +0 -5
  836. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +0 -5
  837. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +0 -5
  838. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +0 -5
  839. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +0 -5
  840. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +0 -5
  841. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +0 -5
  842. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +0 -5
  843. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +0 -5
  844. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +0 -5
  845. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +0 -5
  846. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +0 -5
  847. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +0 -5
  848. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +0 -5
  849. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +0 -5
  850. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +0 -5
  851. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +0 -5
  852. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +0 -5
  853. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +0 -5
  854. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +0 -5
  855. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +0 -5
  856. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +0 -5
  857. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +0 -5
  858. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +0 -5
  859. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +0 -5
  860. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +0 -5
  861. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +0 -5
  862. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +0 -5
  863. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +0 -5
  864. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +0 -5
  865. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +0 -5
  866. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +0 -5
  867. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +0 -5
  868. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +0 -5
  869. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +0 -5
  870. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +0 -5
  871. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +0 -5
  872. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +0 -5
  873. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +0 -5
  874. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +0 -5
  875. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +0 -5
  876. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +0 -5
  877. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +0 -5
  878. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +0 -5
  879. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +0 -5
  880. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +0 -5
  881. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +0 -5
  882. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +0 -5
  883. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +0 -5
  884. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +0 -5
  885. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +0 -5
  886. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +0 -5
  887. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +0 -5
  888. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +0 -5
  889. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +0 -5
  890. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +0 -5
  891. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +0 -5
  892. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +0 -5
  893. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +0 -5
  894. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +0 -5
  895. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +0 -5
  896. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +0 -5
  897. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +0 -5
  898. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +0 -5
  899. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +0 -5
  900. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +0 -5
  901. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +0 -5
  902. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +0 -5
  903. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +0 -5
  904. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +0 -5
  905. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +0 -5
  906. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +0 -5
  907. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +0 -5
  908. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +0 -5
  909. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +0 -5
  910. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +0 -5
  911. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +0 -5
  912. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +0 -5
  913. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +0 -5
  914. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +0 -5
  915. data/ext/sources/ggml/src/ggml-kompute/CMakeLists.txt +0 -166
  916. data/ext/sources/ggml/src/ggml-kompute/ggml-kompute.cpp +0 -2251
  917. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/common.comp +0 -112
  918. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_add.comp +0 -58
  919. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_addrow.comp +0 -25
  920. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f16.comp +0 -52
  921. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f32.comp +0 -52
  922. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f16.comp +0 -52
  923. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f32.comp +0 -52
  924. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_diagmask.comp +0 -30
  925. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_gelu.comp +0 -22
  926. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows.comp +0 -17
  927. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f16.comp +0 -31
  928. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f32.comp +0 -31
  929. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_0.comp +0 -38
  930. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_1.comp +0 -39
  931. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q6_k.comp +0 -44
  932. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul.comp +0 -52
  933. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_f16.comp +0 -69
  934. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_mat_f32.comp +0 -51
  935. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_0.comp +0 -33
  936. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_1.comp +0 -35
  937. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_k.comp +0 -140
  938. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q6_k.comp +0 -106
  939. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q8_0.comp +0 -73
  940. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n.comp +0 -52
  941. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n_pre.comp +0 -28
  942. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_norm.comp +0 -84
  943. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_relu.comp +0 -21
  944. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rmsnorm.comp +0 -53
  945. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f16.comp +0 -52
  946. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f32.comp +0 -52
  947. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f16.comp +0 -52
  948. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f32.comp +0 -52
  949. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_scale.comp +0 -19
  950. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_scale_8.comp +0 -23
  951. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_silu.comp +0 -22
  952. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_softmax.comp +0 -72
  953. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/rope_common.comp +0 -71
  954. data/ext/sources/ggml/src/ggml-metal/ggml-metal.m +0 -6280
  955. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp +0 -162
  956. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.comp +0 -118
  957. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.comp +0 -99
  958. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.comp +0 -58
  959. /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{test_bfloat16_support.comp → feature-tests/bfloat16.comp} +0 -0
  960. /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{test_coopmat_support.comp → feature-tests/coopmat.comp} +0 -0
  961. /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{test_coopmat2_support.comp → feature-tests/coopmat2.comp} +0 -0
  962. /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{test_integer_dot_support.comp → feature-tests/integer_dot.comp} +0 -0
  963. /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{glu_main.comp → glu_main.glsl} +0 -0
@@ -0,0 +1,1353 @@
1
+ #ifndef HVX_UTILS_H
2
+ #define HVX_UTILS_H
3
+
4
+ #include "ops-utils.h"
5
+
6
+ #include <stdbool.h>
7
+ #include <stdint.h>
8
+
9
+ #define SIZEOF_FP32 (4)
10
+ #define SIZEOF_FP16 (2)
11
+ #define VLEN (128)
12
+ #define VLEN_FP32 (VLEN / SIZEOF_FP32)
13
+ #define VLEN_FP16 (VLEN / SIZEOF_FP16)
14
+
15
+ typedef union {
16
+ HVX_Vector v;
17
+ uint8_t b[VLEN];
18
+ uint16_t h[VLEN_FP16];
19
+ uint32_t w[VLEN_FP32];
20
+ __fp16 fp16[VLEN_FP16];
21
+ float fp32[VLEN_FP32];
22
+ } __attribute__((aligned(VLEN), packed)) HVX_VectorAlias;
23
+
24
+ /* Q6_Vsf_equals_Vw is only available on v73+.*/
25
+ #if __HVX_ARCH__ < 73
26
+ static inline HVX_Vector int32_to_qfloat(HVX_Vector const in)
27
+ {
28
+ HVX_Vector const vzero = Q6_V_vzero();
29
+ HVX_VectorPred is_zero = Q6_Q_vcmp_eq_VwVw(in, vzero);
30
+ HVX_Vector lshift = Q6_Vw_vnormamt_Vw(in);
31
+ HVX_Vector normalized = Q6_Vw_vasl_VwVw(in, lshift);
32
+ HVX_Vector vexp = Q6_Vw_vsub_VwVw(Q6_V_vsplat_R(0x7f + 30), lshift);
33
+ HVX_Vector mant = Q6_V_vand_VV(Q6_V_vsplat_R(0xFFFFFF00), normalized);
34
+ HVX_Vector ret = Q6_V_vmux_QVV(is_zero, vzero, Q6_Vw_vadd_VwVw(mant, vexp));
35
+ return ret;
36
+ }
37
+
38
+ static inline HVX_Vector Q6_Vsf_equals_Vw(HVX_Vector const in)
39
+ {
40
+ return Q6_Vsf_equals_Vqf32(int32_to_qfloat(in));
41
+ }
42
+ #endif
43
+
44
+ static inline HVX_Vector hvx_vec_splat_fp32(float v) {
45
+ union {
46
+ float f;
47
+ uint32_t i;
48
+ } fp32 = { .f = v };
49
+
50
+ return Q6_V_vsplat_R(fp32.i);
51
+ }
52
+
53
+ static inline HVX_Vector hvx_vec_splat_fp16(float v) {
54
+ union {
55
+ __fp16 f;
56
+ uint16_t i;
57
+ } fp16 = { .f = v };
58
+
59
+ return Q6_Vh_vsplat_R(fp16.i);
60
+ }
61
+
62
+ static inline void hvx_vec_store_u(void * addr, uint32_t n, HVX_Vector v) {
63
+ // Rotate as needed.
64
+ v = Q6_V_vlalign_VVR(v, v, (size_t) addr);
65
+
66
+ uint32_t left_off = (size_t) addr & 127;
67
+ uint32_t right_off = left_off + n;
68
+
69
+ HVX_VectorPred ql_not = Q6_Q_vsetq_R((size_t) addr);
70
+ HVX_VectorPred qr = Q6_Q_vsetq2_R(right_off);
71
+
72
+ if (right_off > 128) {
73
+ Q6_vmem_QRIV(qr, (HVX_Vector *) addr + 1, v);
74
+ // all 1's
75
+ qr = Q6_Q_vcmp_eq_VbVb(v, v);
76
+ }
77
+
78
+ ql_not = Q6_Q_or_QQn(ql_not, qr);
79
+ Q6_vmem_QnRIV(ql_not, (HVX_Vector *) addr, v);
80
+ }
81
+
82
+ static inline void hvx_vec_store_a(void * ptr, size_t n, HVX_Vector v) {
83
+ assert((unsigned long) ptr % 128 == 0);
84
+
85
+ HVX_VectorPred ql_not = Q6_Q_vsetq_R((size_t) ptr);
86
+ HVX_VectorPred qr = Q6_Q_vsetq2_R(n);
87
+ ql_not = Q6_Q_or_QQn(ql_not, qr);
88
+ Q6_vmem_QnRIV(ql_not, (HVX_Vector *) ptr, v);
89
+ }
90
+
91
+ static inline HVX_Vector hvx_vec_repl4(HVX_Vector v) {
92
+ // vdelta control to replicate first 4 bytes across all elements
93
+ static const uint8_t __attribute__((aligned(128))) repl[128] = {
94
+ 0x00, 0x00, 0x00, 0x00, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04,
95
+ 0x10, 0x10, 0x10, 0x10, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04,
96
+ 0x20, 0x20, 0x20, 0x20, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04,
97
+ 0x10, 0x10, 0x10, 0x10, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04,
98
+ 0x40, 0x40, 0x40, 0x40, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04,
99
+ 0x10, 0x10, 0x10, 0x10, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04,
100
+ 0x20, 0x20, 0x20, 0x20, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04,
101
+ 0x10, 0x10, 0x10, 0x10, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04,
102
+ };
103
+
104
+ HVX_Vector ctrl = *(HVX_Vector *) repl;
105
+ return Q6_V_vdelta_VV(v, ctrl);
106
+ }
107
+
108
+ // copy n fp16 elements : source and destination are aligned to HVX Vector (128)
109
+ static inline void hvx_copy_fp16_aa(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) {
110
+ HVX_Vector * restrict vdst = (HVX_Vector *) dst;
111
+ HVX_Vector * restrict vsrc = (HVX_Vector *) src;
112
+
113
+ assert((unsigned long) dst % 128 == 0);
114
+ assert((unsigned long) src % 128 == 0);
115
+
116
+ uint32_t nvec = n / 64;
117
+ uint32_t nloe = n % 64;
118
+
119
+ uint32_t i = 0;
120
+
121
+ #pragma unroll(4)
122
+ for (; i < nvec; i++) {
123
+ HVX_Vector v = vsrc[i];
124
+ vdst[i] = v;
125
+ }
126
+
127
+ if (nloe) {
128
+ HVX_Vector v = vsrc[i];
129
+ hvx_vec_store_u((void *) &vdst[i], nloe * sizeof(__fp16), v);
130
+ }
131
+ }
132
+
133
+ // copy n fp16 elements : source is aligned, destination is potentially unaligned
134
+ static inline void hvx_copy_fp16_ua(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) {
135
+ HVX_UVector * restrict vdst = (HVX_UVector *) dst;
136
+ HVX_Vector * restrict vsrc = (HVX_Vector *) src;
137
+
138
+ assert((unsigned long) src % 128 == 0);
139
+
140
+ uint32_t nvec = n / 64;
141
+ uint32_t nloe = n % 64;
142
+
143
+ uint32_t i = 0;
144
+
145
+ #pragma unroll(4)
146
+ for (; i < nvec; i++) {
147
+ HVX_Vector v = vsrc[i];
148
+ vdst[i] = v;
149
+ }
150
+
151
+ if (nloe) {
152
+ HVX_Vector v = vsrc[i];
153
+ hvx_vec_store_u((void *) &vdst[i], nloe * sizeof(__fp16), v);
154
+ }
155
+ }
156
+
157
+ // copy n fp16 elements : source is potentially unaligned, destination is aligned
158
+ static inline void hvx_copy_fp16_au(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) {
159
+ HVX_Vector * restrict vdst = (HVX_Vector *) dst;
160
+ HVX_UVector * restrict vsrc = (HVX_UVector *) src;
161
+
162
+ assert((unsigned long) dst % 128 == 0);
163
+
164
+ uint32_t nvec = n / 64;
165
+ uint32_t nloe = n % 64;
166
+
167
+ uint32_t i = 0;
168
+
169
+ #pragma unroll(4)
170
+ for (; i < nvec; i++) {
171
+ HVX_Vector v = vsrc[i];
172
+ vdst[i] = v;
173
+ }
174
+
175
+ if (nloe) {
176
+ HVX_Vector v = vsrc[i];
177
+ hvx_vec_store_u((void *) &vdst[i], nloe * sizeof(__fp16), v);
178
+ }
179
+ }
180
+
181
+ // copy n fp32 elements : source and destination are aligned to HVX Vector (128)
182
+ static inline void hvx_copy_fp32_aa(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) {
183
+ HVX_Vector * restrict vdst = (HVX_Vector *) dst;
184
+ HVX_Vector * restrict vsrc = (HVX_Vector *) src;
185
+
186
+ assert((unsigned long) dst % 128 == 0);
187
+ assert((unsigned long) src % 128 == 0);
188
+
189
+ uint32_t nvec = n / 32;
190
+ uint32_t nloe = n % 32;
191
+
192
+ uint32_t i = 0;
193
+
194
+ #pragma unroll(4)
195
+ for (; i < nvec; i++) {
196
+ HVX_Vector v = vsrc[i];
197
+ vdst[i] = v;
198
+ }
199
+
200
+ if (nloe) {
201
+ HVX_Vector v = vsrc[i];
202
+ hvx_vec_store_u((void *) &vdst[i], nloe * sizeof(float), v);
203
+ }
204
+ }
205
+
206
+ // copy n fp32 elements : source is aligned, destination is unaligned
207
+ static inline void hvx_copy_fp32_ua(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) {
208
+ HVX_UVector * restrict vdst = (HVX_UVector *) dst;
209
+ HVX_Vector * restrict vsrc = (HVX_Vector *) src;
210
+
211
+ assert((unsigned long) src % 128 == 0);
212
+
213
+ uint32_t nvec = n / 32;
214
+ uint32_t nloe = n % 32;
215
+
216
+ uint32_t i = 0;
217
+
218
+ #pragma unroll(4)
219
+ for (; i < nvec; i++) {
220
+ HVX_Vector v = vsrc[i];
221
+ vdst[i] = v;
222
+ }
223
+
224
+ if (nloe) {
225
+ HVX_Vector v = vsrc[i];
226
+ hvx_vec_store_u((void *) &vdst[i], nloe * sizeof(float), v);
227
+ }
228
+ }
229
+
230
+ // copy n fp32 elements : source is unaligned, destination is aligned
231
+ static inline void hvx_copy_fp32_au(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) {
232
+ HVX_Vector * restrict vdst = (HVX_Vector *) dst;
233
+ HVX_UVector * restrict vsrc = (HVX_UVector *) src;
234
+
235
+ assert((unsigned long) dst % 128 == 0);
236
+
237
+ uint32_t nvec = n / 32;
238
+ uint32_t nloe = n % 32;
239
+
240
+ uint32_t i = 0;
241
+
242
+ #pragma unroll(4)
243
+ for (; i < nvec; i++) {
244
+ HVX_Vector v = vsrc[i];
245
+ vdst[i] = v;
246
+ }
247
+
248
+ if (nloe) {
249
+ HVX_Vector v = vsrc[i];
250
+ hvx_vec_store_u((void *) &vdst[i], nloe * sizeof(float), v);
251
+ }
252
+ }
253
+
254
+ // copy n fp32 elements : source is unaligned, destination unaligned
255
+ static inline void hvx_copy_fp32_uu(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) {
256
+ HVX_UVector * restrict vdst = (HVX_UVector *) dst;
257
+ HVX_UVector * restrict vsrc = (HVX_UVector *) src;
258
+
259
+ assert((unsigned long) dst % 128 == 0);
260
+
261
+ uint32_t nvec = n / 32;
262
+ uint32_t nloe = n % 32;
263
+
264
+ uint32_t i = 0;
265
+
266
+ #pragma unroll(4)
267
+ for (; i < nvec; i++) {
268
+ HVX_Vector v = vsrc[i];
269
+ vdst[i] = v;
270
+ }
271
+
272
+ if (nloe) {
273
+ HVX_Vector v = vsrc[i];
274
+ hvx_vec_store_u((void *) &vdst[i], nloe * sizeof(float), v);
275
+ }
276
+ }
277
+
278
+ // copy/convert n fp32 elements into n fp16 elements : source is unaligned, destination is unaligned
279
+ static inline void hvx_copy_fp16_fp32_uu(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) {
280
+ HVX_UVector * restrict vdst = (HVX_UVector *) dst; // fp16
281
+ HVX_UVector * restrict vsrc = (HVX_UVector *) src; // fp32
282
+
283
+ const HVX_Vector zero = Q6_V_vsplat_R(0);
284
+
285
+ uint32_t nvec = n / 64;
286
+ uint32_t nloe = n % 64;
287
+
288
+ uint32_t i = 0;
289
+
290
+ #pragma unroll(4)
291
+ for (; i < nvec; i++) {
292
+ // Load y (fp32) and convert into fp16
293
+ HVX_Vector s0_qf = Q6_Vqf32_vsub_VsfVsf(vsrc[i*2+0], zero); // 32 elements
294
+ HVX_Vector s1_qf = Q6_Vqf32_vsub_VsfVsf(vsrc[i*2+1], zero); // 32 elements
295
+ HVX_Vector s_hf = Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(s1_qf, s0_qf));
296
+ vdst[i] = Q6_Vh_vdeal_Vh(s_hf);
297
+ }
298
+
299
+ if (nloe) {
300
+ // Load y (fp32) and convert into fp16
301
+ HVX_Vector s0_qf = Q6_Vqf32_vsub_VsfVsf(vsrc[i*2+0], zero); // 32 elements
302
+ HVX_Vector s1_qf = Q6_Vqf32_vsub_VsfVsf(vsrc[i*2+1], zero); // 32 elements
303
+ HVX_Vector s_hf = Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(s1_qf, s0_qf));
304
+ hvx_vec_store_u((void *) &vdst[i], nloe * sizeof(__fp16), Q6_Vh_vdeal_Vh(s_hf));
305
+ }
306
+ }
307
+
308
+ // copy/convert n fp32 elements into n fp16 elements : source is aligned, destination is unaligned
309
+ static inline void hvx_copy_fp16_fp32_ua(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) {
310
+ HVX_UVector * restrict vdst = (HVX_UVector *) dst; // fp16
311
+ HVX_Vector * restrict vsrc = (HVX_Vector *) src; // fp32
312
+
313
+ const HVX_Vector zero = Q6_V_vsplat_R(0);
314
+
315
+ uint32_t nvec = n / 64;
316
+ uint32_t nloe = n % 64;
317
+
318
+ uint32_t i = 0;
319
+
320
+ #pragma unroll(4)
321
+ for (; i < nvec; i++) {
322
+ // Load y (fp32) and convert into fp16
323
+ HVX_Vector s0_qf = Q6_Vqf32_vsub_VsfVsf(vsrc[i*2+0], zero); // 32 elements
324
+ HVX_Vector s1_qf = Q6_Vqf32_vsub_VsfVsf(vsrc[i*2+1], zero); // 32 elements
325
+ HVX_Vector s_hf = Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(s1_qf, s0_qf));
326
+ vdst[i] = Q6_Vh_vdeal_Vh(s_hf);
327
+ }
328
+
329
+ if (nloe) {
330
+ // Load y (fp32) and convert into fp16
331
+ HVX_Vector s0_qf = Q6_Vqf32_vsub_VsfVsf(vsrc[i*2+0], zero); // 32 elements
332
+ HVX_Vector s1_qf = Q6_Vqf32_vsub_VsfVsf(vsrc[i*2+1], zero); // 32 elements
333
+ HVX_Vector s_hf = Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(s1_qf, s0_qf));
334
+ hvx_vec_store_u((void *) &vdst[i], nloe * sizeof(__fp16), Q6_Vh_vdeal_Vh(s_hf));
335
+ }
336
+ }
337
+
338
+ // copy/convert n fp32 elements into n fp16 elements : source is unaligned, destination is aligned
339
+ static inline void hvx_copy_fp16_fp32_au(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) {
340
+ HVX_Vector * restrict vdst = (HVX_Vector *) dst; // fp16
341
+ HVX_UVector * restrict vsrc = (HVX_UVector *) src; // fp32
342
+
343
+ const HVX_Vector zero = Q6_V_vsplat_R(0);
344
+
345
+ uint32_t nvec = n / 64;
346
+ uint32_t nloe = n % 64;
347
+
348
+ uint32_t i = 0;
349
+
350
+ #pragma unroll(4)
351
+ for (; i < nvec; i++) {
352
+ // Load y (fp32) and convert into fp16
353
+ HVX_Vector s0_qf = Q6_Vqf32_vsub_VsfVsf(vsrc[i*2+0], zero); // 32 elements
354
+ HVX_Vector s1_qf = Q6_Vqf32_vsub_VsfVsf(vsrc[i*2+1], zero); // 32 elements
355
+ HVX_Vector s_hf = Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(s1_qf, s0_qf));
356
+ vdst[i] = Q6_Vh_vdeal_Vh(s_hf);
357
+ }
358
+
359
+ if (nloe) {
360
+ // Load y (fp32) and convert into fp16
361
+ HVX_Vector s0_qf = Q6_Vqf32_vsub_VsfVsf(vsrc[i*2+0], zero); // 32 elements
362
+ HVX_Vector s1_qf = Q6_Vqf32_vsub_VsfVsf(vsrc[i*2+1], zero); // 32 elements
363
+ HVX_Vector s_hf = Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(s1_qf, s0_qf));
364
+ hvx_vec_store_u((void *) &vdst[i], nloe * sizeof(__fp16), Q6_Vh_vdeal_Vh(s_hf));
365
+ }
366
+ }
367
+
368
+ // bcast 1 fp32 element from source to n fp32 elements in destination : destination is aligned
369
+ static inline void hvx_bcast_fp32_a(uint8_t * restrict dst, float elem, uint32_t n) {
370
+ HVX_Vector * restrict vdst = (HVX_Vector *) dst;
371
+
372
+ HVX_Vector velem = hvx_vec_splat_fp32(elem);
373
+
374
+ assert((unsigned long) dst % 128 == 0);
375
+
376
+ uint32_t nvec = n / 32;
377
+ uint32_t nloe = n % 32;
378
+
379
+ uint32_t i = 0;
380
+
381
+ #pragma unroll(4)
382
+ for (; i < nvec; i++) {
383
+ vdst[i] = velem;
384
+ }
385
+
386
+ if (nloe) {
387
+ hvx_vec_store_u((void *) &vdst[i], nloe * sizeof(float), velem);
388
+ }
389
+ }
390
+
391
+
392
+ /* Return whether 'n' elements from vector are in the one chunk of 'chunk_size'. */
393
+ static __attribute__((always_inline)) int32_t is_in_one_chunk(void * addr, uint32_t n, uint32_t chunk_size) {
394
+ uint32_t left_off = (size_t) addr & (chunk_size - 1);
395
+ uint32_t right_off = left_off + n;
396
+ return right_off <= chunk_size;
397
+ }
398
+
399
+ static void hvx_vec_dump_fp16_n(char * pref, HVX_Vector v, uint32_t n) {
400
+ HVX_VectorAlias u = { .v = v };
401
+
402
+ const uint32_t n0 = n / 16;
403
+ const uint32_t n1 = n % 16;
404
+ int i = 0;
405
+ for (; i < n0; i++) {
406
+ htp_dump_fp16_line(pref, u.fp16 + (16 * i), 16);
407
+ }
408
+ if (n1) {
409
+ htp_dump_fp16_line(pref, u.fp16 + (16 * i), n1);
410
+ }
411
+ }
412
+
413
+ static void hvx_vec_dump_fp16(char * pref, HVX_Vector v) {
414
+ hvx_vec_dump_fp16_n(pref, v, 64);
415
+ }
416
+
417
+ static void hvx_vec_dump_fp32_n(char * pref, HVX_Vector v, uint32_t n) {
418
+ union {
419
+ HVX_Vector v;
420
+ float d[32];
421
+ } u = { .v = v };
422
+
423
+ const uint32_t n0 = n / 16;
424
+ const uint32_t n1 = n % 16;
425
+ int i = 0;
426
+ for (; i < n0; i++) {
427
+ htp_dump_fp32_line(pref, u.d + (16 * i), 16);
428
+ }
429
+ if (n1) {
430
+ htp_dump_fp32_line(pref, u.d + (16 * i), n1);
431
+ }
432
+ }
433
+
434
+ static void hvx_vec_dump_fp32_hmt(char * pref, HVX_Vector v) {
435
+ union {
436
+ HVX_Vector v;
437
+ float d[32];
438
+ } u = { .v = v };
439
+
440
+ FARF(HIGH, "%s: %.6f %.6f %.6f %.6f ... %.6f %.6f %.6f %.6f ... %.6f %.6f %.6f %.6f\n", pref, u.d[0], u.d[1],
441
+ u.d[2], u.d[3], u.d[12], u.d[13], u.d[14], u.d[15], u.d[28], u.d[29], u.d[30], u.d[31]);
442
+ }
443
+
444
+ static void hvx_vec_dump_fp32(char * pref, HVX_Vector v) {
445
+ hvx_vec_dump_fp32_n(pref, v, 32);
446
+ }
447
+
448
+ static void hvx_vec_dump_int32(char * pref, HVX_Vector v) {
449
+ union {
450
+ HVX_Vector v;
451
+ int32_t d[32];
452
+ } u = { .v = v };
453
+
454
+ for (int i = 0; i < 32 / 16; i++) {
455
+ htp_dump_int32_line(pref, u.d + (16 * i), 16);
456
+ }
457
+ }
458
+
459
+ static void hvx_vec_dump_int32_hmt(char * pref, HVX_Vector v) {
460
+ union {
461
+ HVX_Vector v;
462
+ int32_t d[32];
463
+ } u = { .v = v };
464
+
465
+ FARF(HIGH, "%s: %d %d %d %d ... %d %d %d %d ... %d %d %d %d\n", pref, u.d[0], u.d[1], u.d[2], u.d[3], u.d[12],
466
+ u.d[13], u.d[14], u.d[15], u.d[28], u.d[29], u.d[30], u.d[31]);
467
+ }
468
+
469
+ static void hvx_vec_dump_int8_hmt(char * pref, HVX_Vector v) {
470
+ union {
471
+ HVX_Vector v;
472
+ int8_t d[128];
473
+ } u = { .v = v };
474
+
475
+ FARF(HIGH, "%s: %d %d %d %d ... %d %d %d %d ... %d %d %d %d\n", pref, u.d[0], u.d[1], u.d[2], u.d[3], u.d[60],
476
+ u.d[61], u.d[62], u.d[63], u.d[124], u.d[125], u.d[126], u.d[127]);
477
+ }
478
+
479
+ static void hvx_vec_dump_int8(char * pref, HVX_Vector v) {
480
+ union {
481
+ HVX_Vector v;
482
+ int8_t d[128];
483
+ } u = { .v = v };
484
+
485
+ for (int i = 0; i < 128 / 16; i++) {
486
+ htp_dump_int8_line(pref, u.d + (16 * i), 16);
487
+ }
488
+ }
489
+
490
+ static void hvx_vec_dump_uint8(char * pref, HVX_Vector v) {
491
+ union {
492
+ HVX_Vector v;
493
+ uint8_t d[128];
494
+ } u = { .v = v };
495
+
496
+ for (int i = 0; i < 128 / 16; i++) {
497
+ htp_dump_uint8_line(pref, u.d + (16 * i), 16);
498
+ }
499
+ }
500
+
501
+ static bool hvx_vec_eq(HVX_Vector v0, HVX_Vector v1, size_t n) {
502
+ typedef union {
503
+ HVX_Vector v;
504
+ int8_t d[128];
505
+ } U;
506
+
507
+ U u0 = { .v = v0 };
508
+ U u1 = { .v = v1 };
509
+
510
+ for (int i = 0; i < n; i++) {
511
+ if (u0.d[i] != u1.d[i]) {
512
+ return false;
513
+ }
514
+ }
515
+
516
+ return true;
517
+ }
518
+
519
+ static inline float hvx_vec_get_fp32(HVX_Vector v) {
520
+ float __attribute__((aligned(128))) x;
521
+ hvx_vec_store_a(&x, 4, v);
522
+ return x;
523
+ }
524
+
525
+ static inline HVX_Vector hvx_vec_int32_reduce_sum_n(HVX_Vector in, unsigned int n) {
526
+ unsigned int total = n * 4; // total vec nbytes
527
+ unsigned int width = 4; // int32
528
+
529
+ HVX_Vector sum = in, sum_t;
530
+ while (width < total) {
531
+ sum_t = Q6_V_vror_VR(sum, width); // rotate right
532
+ sum = Q6_Vw_vadd_VwVw(sum_t, sum); // elementwise sum
533
+ width = width << 1;
534
+ }
535
+ return sum;
536
+ }
537
+
538
+ static inline HVX_Vector hvx_vec_int32_reduce_sum(HVX_Vector in) {
539
+ return hvx_vec_int32_reduce_sum_n(in, 32);
540
+ }
541
+
542
+ static inline HVX_Vector hvx_vec_qf32_reduce_sum_n(HVX_Vector in, unsigned int n) {
543
+ unsigned int total = n * 4; // total vec nbytes
544
+ unsigned int width = 4; // fp32 nbytes
545
+
546
+ HVX_Vector sum = in, sum_t;
547
+ while (width < total) {
548
+ sum_t = Q6_V_vror_VR(Q6_Vsf_equals_Vqf32(sum), width); // rotate right
549
+ sum = Q6_Vqf32_vadd_Vqf32Vsf(sum, sum_t); // elementwise sum
550
+ width = width << 1;
551
+ }
552
+ return sum;
553
+ }
554
+
555
+ static inline HVX_Vector hvx_vec_qf32_reduce_sum(HVX_Vector in) {
556
+ return hvx_vec_qf32_reduce_sum_n(in, 32);
557
+ }
558
+
559
+ static inline HVX_Vector hvx_vec_fp32_reduce_sum_n(HVX_Vector in, unsigned int n) {
560
+ unsigned int total = n * 4; // total vec nbytes
561
+ unsigned int width = 4; // fp32 nbytes
562
+
563
+ HVX_Vector sum = in, sum_t;
564
+ while (width < total) {
565
+ sum_t = Q6_V_vror_VR(sum, width); // rotate right
566
+ sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(sum, sum_t)); // elementwise sum
567
+ width = width << 1;
568
+ }
569
+ return sum;
570
+ }
571
+
572
+ static inline HVX_Vector hvx_vec_fp32_reduce_sum(HVX_Vector in) {
573
+ return hvx_vec_fp32_reduce_sum_n(in, 32);
574
+ }
575
+
576
+ static inline HVX_Vector hvx_vec_reduce_max_fp16(HVX_Vector in) {
577
+ unsigned total = 128; // total vec nbytes
578
+ unsigned width = 2; // fp16 nbytes
579
+
580
+ HVX_Vector _max = in, _max_t;
581
+ while (width < total) {
582
+ _max_t = Q6_V_vror_VR(_max, width); // rotate right
583
+ _max = Q6_Vhf_vmax_VhfVhf(_max_t, _max); // elementwise max
584
+ width = width << 1;
585
+ }
586
+
587
+ return _max;
588
+ }
589
+
590
+ static inline HVX_Vector hvx_vec_reduce_max2_fp16(HVX_Vector in, HVX_Vector _max) {
591
+ unsigned total = 128; // total vec nbytes
592
+ unsigned width = 2; // fp16 nbytes
593
+
594
+ HVX_Vector _max_t;
595
+
596
+ _max = Q6_Vhf_vmax_VhfVhf(in, _max);
597
+ while (width < total) {
598
+ _max_t = Q6_V_vror_VR(_max, width); // rotate right
599
+ _max = Q6_Vhf_vmax_VhfVhf(_max_t, _max); // elementwise max
600
+ width = width << 1;
601
+ }
602
+
603
+ return _max;
604
+ }
605
+
606
+ static inline HVX_Vector hvx_vec_reduce_max_fp32(HVX_Vector in) {
607
+ unsigned total = 128; // total vec nbytes
608
+ unsigned width = 4; // fp32 nbytes
609
+
610
+ HVX_Vector _max = in, _max_t;
611
+ while (width < total) {
612
+ _max_t = Q6_V_vror_VR(_max, width); // rotate right
613
+ _max = Q6_Vsf_vmax_VsfVsf(_max_t, _max); // elementwise max
614
+ width = width << 1;
615
+ }
616
+
617
+ return _max;
618
+ }
619
+
620
+ static inline HVX_Vector hvx_vec_reduce_max2_fp32(HVX_Vector in, HVX_Vector _max) {
621
+ unsigned total = 128; // total vec nbytes
622
+ unsigned width = 4; // fp32 nbytes
623
+
624
+ HVX_Vector _max_t;
625
+
626
+ _max = Q6_Vsf_vmax_VsfVsf(in, _max);
627
+ while (width < total) {
628
+ _max_t = Q6_V_vror_VR(_max, width); // rotate right
629
+ _max = Q6_Vsf_vmax_VsfVsf(_max_t, _max); // elementwise max
630
+ width = width << 1;
631
+ }
632
+
633
+ return _max;
634
+ }
635
+
636
+ static inline HVX_Vector hvx_vec_abs_fp16(HVX_Vector v) {
637
+ // abs by clearing the fp16 sign bit
638
+ HVX_Vector mask = Q6_Vh_vsplat_R(0x7fff);
639
+ return Q6_V_vand_VV(v, mask);
640
+ }
641
+
642
+ static inline HVX_Vector hvx_vec_neg_fp16(HVX_Vector v) {
643
+ // neg by setting the fp16 sign bit
644
+ HVX_Vector mask = Q6_Vh_vsplat_R(0x8000);
645
+ return Q6_V_vxor_VV(v, mask);
646
+ }
647
+
648
+ static inline HVX_Vector hvx_vec_abs_fp32(HVX_Vector v) {
649
+ // abs by clearing the fp32 sign bit
650
+ HVX_Vector mask = Q6_V_vsplat_R(0x7fffffff);
651
+ return Q6_V_vand_VV(v, mask);
652
+ }
653
+
654
+ static inline HVX_Vector hvx_vec_neg_fp32(HVX_Vector v) {
655
+ #if __HVX_ARCH__ > 75
656
+ return Q6_Vsf_vfneg_Vsf(v);
657
+ #else
658
+ // neg by setting the fp32 sign bit
659
+ HVX_Vector mask = Q6_V_vsplat_R(0x80000000);
660
+ return Q6_V_vxor_VV(v, mask);
661
+ #endif // __HVX_ARCH__ > 75
662
+ }
663
+
664
// ====================================================
// FUNCTION: 1/(x+1) y(0) = 1, y(0.5) = 0.6667, y(1) = 0.5
// Order:3; continuity: True; Ends forced: True
// Mode: unsigned; Result fractional bits: 14
// Peak Error: 1.1295e-04 Rms Error: 2.8410e-05 Mean Error: 1.1370e-05
// 32769 -32706 31252 -10589
// 32590 -30635 22793 -4493
// 32066 -27505 16481 -2348
// 31205 -24054 11849 -1306

// Piecewise-cubic approximation of 1/(x+1) for x in [0,1).
// The 64-bit immediates pack the per-segment 16-bit coefficients listed in
// the table above; vlut4 selects the segment constant and each vmpa/vmps
// step is one saturating Horner iteration.
static inline HVX_Vector hvx_vec_recip_xp1_O3_unsigned(HVX_Vector vx) {
    // input is 0..0xffff representing 0.0 .. 1.0
    HVX_Vector p;
    p = Q6_Vh_vlut4_VuhPh(vx, 0xFAE6F6D4EE73D6A3ull);            // per-segment c3
    p = Q6_Vh_vmpa_VhVhVuhPuh_sat(p, vx, 0x2E49406159097A14ull); // p = p*x + c2
    p = Q6_Vh_vmps_VhVhVuhPuh_sat(p, vx, 0x5DF66B7177AB7FC2ull); // p = p*x - c1
    p = Q6_Vh_vmpa_VhVhVuhPuh_sat(p, vx, 0x79E57D427F4E8001ull); // p = p*x + c0
    return p; // signed result, 14 fractional bits
}
683
+
684
// Find reciprocal of fp16.
// (1) first, convert to fp32, multiplying by 1.0; this is done to
// handle denormals. Ignoring sign and zero, result should be at
// least 5.9604645e-08 (32-bit code 0x33800000) and at most 131008 (0x47ffe000)
// (exponent in range [103,143])
// (2) extract the mantissa into 16-bit unsigned; find reciprocal using a fitted poly
// (3) put this, along with '253-exp' (exp from (1)) together to make an qf32
// (4) convert that to fp16
// (5) put sign back in. Also, if the original value (w/o sign) was <0x81, replace
//     the result with the max value.
static inline HVX_Vector hvx_vec_inverse_fp16(HVX_Vector vals) {
    HVX_Vector em_mask = Q6_Vh_vsplat_R(0x7FFF);
    HVX_Vector avals = Q6_V_vand_VV(vals, em_mask);          // |vals|: exponent+mantissa
    HVX_VectorPred is_neg = Q6_Q_vcmp_gt_VhVh(avals, vals);  // lanes whose sign bit was set
    // is too small to 1/x ? for 'standard' fp16, this would be 0x101
    HVX_VectorPred is_small = Q6_Q_vcmp_gt_VhVh(Q6_Vh_vsplat_R(0x101), avals);

    HVX_VectorPair to_qf32 = Q6_Wqf32_vmpy_VhfVhf(avals, Q6_Vh_vsplat_R(0x3C00)); // *1.0
    HVX_Vector to_f32_0 = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(to_qf32));
    HVX_Vector to_f32_1 = Q6_Vsf_equals_Vqf32(Q6_V_hi_W(to_qf32));

    // bits 22..13 contain the mantissa now (w/o hidden bit); move to bit 14..5 of a 16-bit vector
    HVX_Vector mant_u16 = Q6_Vh_vshuffo_VhVh(Q6_Vw_vasl_VwR(to_f32_1, 9), Q6_Vw_vasl_VwR(to_f32_0, 9));
    // likewise extract the upper 16 from each, containing the exponents in range 103..142
    HVX_Vector exp_u16 = Q6_Vh_vshuffo_VhVh(to_f32_1, to_f32_0);
    // Get exponent in IEEE 32-bit representation
    exp_u16 = Q6_Vuh_vlsr_VuhR(exp_u16, 7);

    // so, mant_u16 contains an unbiased mantissa in upper 10 bits of each u16 lane
    // We can consider it to be x-1.0, with 16 fractional bits, where 'x' is in range [1.0,2.0)
    // Use poly to transform to 1/x, with 14 fractional bits
    //
    HVX_Vector rm = hvx_vec_recip_xp1_O3_unsigned(mant_u16);

    HVX_Vector vcl0 = Q6_Vuh_vcl0_Vuh(rm); // count leading zeros

    // Get mantissa for 16-bit representation (normalize, drop to 10 bits)
    HVX_Vector mant_recip = Q6_V_vand_VV(Q6_Vh_vasr_VhR(Q6_Vh_vasl_VhVh(rm, vcl0), 5), Q6_Vh_vsplat_R(0x03FF));

    // Compute Reciprocal Exponent
    HVX_Vector exp_recip =
        Q6_Vh_vsub_VhVh(Q6_Vh_vsub_VhVh(Q6_Vh_vsplat_R(254), exp_u16), Q6_Vh_vsub_VhVh(vcl0, Q6_Vh_vsplat_R(1)));
    // Convert it for 16-bit representation (rebias from 127 to fp16's 15)
    exp_recip = Q6_Vh_vadd_VhVh_sat(Q6_Vh_vsub_VhVh(exp_recip, Q6_Vh_vsplat_R(127)), Q6_Vh_vsplat_R(15));
    exp_recip = Q6_Vh_vasl_VhR(exp_recip, 10);

    // Merge exponent and mantissa for reciprocal
    HVX_Vector recip = Q6_V_vor_VV(exp_recip, mant_recip);
    // map 'small' inputs to standard largest value 0x7bff
    recip = Q6_V_vmux_QVV(is_small, Q6_Vh_vsplat_R(0x7bff), recip);
    // add sign back
    recip = Q6_V_vandor_VQR(recip, is_neg, 0x80008000);
    return recip;
}
738
+
739
+ #define IEEE_VSF_EXPLEN (8)
740
+ #define IEEE_VSF_EXPBIAS (127)
741
+ #define IEEE_VSF_EXPMASK (0xFF)
742
+ #define IEEE_VSF_MANTLEN (23)
743
+ #define IEEE_VSF_MANTMASK (0x7FFFFF)
744
+ #define IEEE_VSF_MIMPMASK (0x800000)
745
+
746
// Truncate fp32 lanes toward zero; the result lanes hold int32 values
// (not fp32). Lanes with |x| < 1.0 (negative unbiased exponent) become 0.
// NOTE(review): uses the toolchain's native vector operators (>>, &=, -)
// on HVX_Vector — assumes the Hexagon compiler's vector-extension support.
static inline HVX_Vector hvx_vec_truncate_fp32(HVX_Vector in_vec) {
    HVX_Vector mask_mant_v = Q6_V_vsplat_R(IEEE_VSF_MANTMASK);
    HVX_Vector mask_impl_v = Q6_V_vsplat_R(IEEE_VSF_MIMPMASK);
    HVX_Vector const_zero_v = Q6_V_vzero();

    HVX_VectorPred q_negative = Q6_Q_vcmp_gt_VwVw(const_zero_v, in_vec);

    // unbiased exponent = ((bits >> 23) & 0xff) - 127
    HVX_Vector expval_v = in_vec >> IEEE_VSF_MANTLEN;
    expval_v &= IEEE_VSF_EXPMASK;
    expval_v -= IEEE_VSF_EXPBIAS;

    // negative exp == fractional value
    HVX_VectorPred q_negexp = Q6_Q_vcmp_gt_VwVw(const_zero_v, expval_v);

    HVX_Vector rshift_v = IEEE_VSF_MANTLEN - expval_v; // fractional bits - exp shift

    HVX_Vector mant_v = in_vec & mask_mant_v;               // obtain mantissa
    HVX_Vector vout = Q6_Vw_vadd_VwVw(mant_v, mask_impl_v); // add implicit 1.0

    vout = Q6_Vw_vasr_VwVw(vout, rshift_v);             // shift to obtain truncated integer
    vout = Q6_V_vmux_QVV(q_negexp, const_zero_v, vout); // expval<0 -> 0

    HVX_Vector neg_vout = -vout;

    vout = Q6_V_vmux_QVV(q_negative, neg_vout, vout); // handle negatives

    return (vout);
}
774
+
775
// floor() for fp32 lanes; the result stays fp32 (rounds toward -infinity).
static inline HVX_Vector hvx_vec_floor_fp32(HVX_Vector in_vec) {
    HVX_Vector mask_mant_v = Q6_V_vsplat_R(IEEE_VSF_MANTMASK);
    HVX_Vector mask_impl_v = Q6_V_vsplat_R(IEEE_VSF_MIMPMASK);
    HVX_Vector const_mnlen_v = Q6_V_vsplat_R(IEEE_VSF_MANTLEN);
    HVX_Vector const_zero_v = Q6_V_vzero();
    HVX_Vector const_negone_v = Q6_V_vsplat_R(0xbf800000); // -1 IEEE vsf

    HVX_VectorPred q_negative = Q6_Q_vcmp_gt_VwVw(const_zero_v, in_vec);

    // unbiased exponent = ((bits >> 23) & 0xff) - 127
    HVX_Vector expval_v = in_vec >> IEEE_VSF_MANTLEN;
    expval_v &= IEEE_VSF_EXPMASK;
    expval_v -= IEEE_VSF_EXPBIAS;

    HVX_VectorPred q_negexp = Q6_Q_vcmp_gt_VwVw(const_zero_v, expval_v);
    HVX_VectorPred q_expltmn = Q6_Q_vcmp_gt_VwVw(const_mnlen_v, expval_v);
    HVX_VectorPred q_negexp_pos = Q6_Q_vcmp_gtand_QVwVw(q_negexp, in_vec, const_zero_v);
    HVX_VectorPred q_negexp_neg = Q6_Q_vcmp_gtand_QVwVw(q_negexp, const_zero_v, in_vec);

    // if expval < 0 (q_negexp)           // <0, floor is 0
    //   if vin > 0
    //     floor = 0
    //   if vin < 0
    //     floor = -1
    // if expval < mant_len (q_expltmn)   // >0, but fraction may exist
    //   get sign (q_negative)
    //   mask >> expval                   // fraction bits to mask off
    //   vout = ~(mask)                   // apply mask to remove fraction
    //   if (qneg)                        // negative floor is one less (more, sign bit for neg)
    //     vout += ((impl_mask) >> expval)
    //   if (mask && vin)
    //     vout = vin
    // else                               // already an integer
    //   ;                                // no change
    // compute floor
    mask_mant_v >>= expval_v;
    HVX_Vector neg_addin_v = mask_impl_v >> expval_v;
    HVX_Vector vout_neg_addin = Q6_Vw_vadd_VwVw(in_vec, neg_addin_v);
    HVX_Vector vout = Q6_V_vmux_QVV(q_negative, vout_neg_addin, in_vec);

    HVX_Vector mask_chk_v = Q6_V_vand_VV(in_vec, mask_mant_v); // chk if bits set
    HVX_VectorPred q_integral = Q6_Q_vcmp_eq_VwVw(const_zero_v, mask_chk_v);

    HVX_Vector not_mask_v = Q6_V_vnot_V(mask_mant_v);       // frac bits to clear
    HVX_Vector vfrfloor_v = Q6_V_vand_VV(vout, not_mask_v); // clear frac bits

    vout = in_vec;
    vout = Q6_V_vmux_QVV(q_expltmn, vfrfloor_v, vout);      // expval<mant
    vout = Q6_V_vmux_QVV(q_integral, in_vec, vout);         // integral values
    vout = Q6_V_vmux_QVV(q_negexp_pos, const_zero_v, vout); // expval<0 x>0 -> 0
    vout = Q6_V_vmux_QVV(q_negexp_neg, const_negone_v, vout); // expval<0 x<0 -> -1

    return vout;
}
829
+
830
// Convert fp16 lanes to int16 with round-to-nearest and saturation.
static inline HVX_Vector hvx_vec_i16_from_hf_rnd_sat(HVX_Vector vin) {
    // This looks complicated.
    // Ideally should just be Q6_Vh_equals_Vhf(vin)
    // but that instruction does not do proper rounding.

    // convert to qf32, multiplying by 1.0 in the process.
    HVX_VectorPair v32 = Q6_Wqf32_vmpy_VhfVhf(vin, Q6_Vh_vsplat_R(0x3C00));

    // 'in-range' values are +/- 32752.
    // add 192K to it, convert to sf
    HVX_Vector v192K = Q6_V_vsplat_R(0x48400000);
    HVX_Vector vsf_0 = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(Q6_V_lo_W(v32), v192K));
    HVX_Vector vsf_1 = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(Q6_V_hi_W(v32), v192K));

    // for in-range cases, result is {163858... 229360} so the exponent is always 144.
    // if we extract bits 21..0 as a signed quantity, and round 6 bits off, that will be the answer.
    // Start by <<10 to get the final 'sign' bit in bit 15...
    vsf_0 = Q6_Vw_vasl_VwR(vsf_0, 10);
    vsf_1 = Q6_Vw_vasl_VwR(vsf_1, 10);

    // now round down to 16 (vround also saturates while narrowing)
    return Q6_Vh_vround_VwVw_sat(vsf_1, vsf_0);
}
853
+
854
// Reciprocal 1/x for fp32 lanes via Newton-Raphson: seed with the bit-trick
// estimate (0x7EEEEBB3 - bits(x)) and refine three times with r = r*(2 - r*x).
// NOTE(review): no special handling of x == 0 / inf / NaN here — callers are
// expected to keep the input in a sane range.
static inline HVX_Vector hvx_vec_inverse_fp32(HVX_Vector v_sf) {
    HVX_Vector inv_aprox_sf = Q6_V_vsplat_R(0x7EEEEBB3);
    HVX_Vector two_sf = hvx_vec_splat_fp32(2.0);

    // First approximation (integer subtract in the fp32 bit domain)
    HVX_Vector i_sf = Q6_Vw_vsub_VwVw(inv_aprox_sf, v_sf);

    HVX_Vector r_qf;

    // Refine: three Newton-Raphson iterations, r = r * (2 - r*x)
    r_qf = Q6_Vqf32_vmpy_VsfVsf(
        i_sf, Q6_Vsf_equals_Vqf32(Q6_Vqf32_vsub_VsfVsf(two_sf, Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(i_sf, v_sf)))));
    r_qf = Q6_Vqf32_vmpy_Vqf32Vqf32(
        r_qf, Q6_Vqf32_vsub_VsfVsf(two_sf, Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(r_qf), v_sf))));
    r_qf = Q6_Vqf32_vmpy_Vqf32Vqf32(
        r_qf, Q6_Vqf32_vsub_VsfVsf(two_sf, Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(r_qf), v_sf))));

    return Q6_Vsf_equals_Vqf32(r_qf);
}
873
+
874
+ #define FAST_SIGMOID_LOG2F (0x3fb8aa3b) // 1.442695022
875
+ #define FAST_SIGMOID_C1 (0x3d009076) // 0.03138777
876
+ #define FAST_SIGMOID_C2 (0x3e8d74bd) // 0.276281267
877
+ #define FAST_SIGMOID_C3 (0x3f000000) // 0.5
878
+
879
// Fast sigmoid approximation for fp32 lanes: 1/(1+exp(-x)) computed via a
// base-2 exponential split (scale by log2(e) and 0.5, split into integer and
// fractional parts, small polynomial, exponent re-insertion) followed by
// hvx_vec_inverse_fp32.
// NOTE(review): no range guard here — large |x| can overflow the exponent
// arithmetic; use hvx_vec_fast_sigmoid_fp32_guard for saturating behavior.
static inline HVX_Vector hvx_vec_fast_sigmoid_fp32(HVX_Vector v) {
    // t = x * log2(e) * 0.5
    v = Q6_Vqf32_vmpy_VsfVsf(v, Q6_V_vsplat_R(FAST_SIGMOID_LOG2F));
    v = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(v), Q6_V_vsplat_R(FAST_SIGMOID_C3));

    // Split t into integer part (in_int) and fractional remainder (x)
    HVX_Vector in_int = hvx_vec_truncate_fp32(Q6_Vsf_equals_Vqf32(v));
    HVX_Vector x = Q6_Vqf32_vsub_Vqf32Vsf(v, Q6_Vsf_equals_Vw(in_int));
    HVX_Vector xx = Q6_Vqf32_vmpy_Vqf32Vqf32(x, x);

    // Even part of the polynomial in x
    HVX_Vector v1 = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(xx), Q6_V_vsplat_R(FAST_SIGMOID_C2));
    v1 = Q6_Vqf32_vadd_Vqf32Vsf(v1, Q6_V_vsplat_R(FAST_SIGMOID_LOG2F));

    // Odd part of the polynomial in x
    HVX_Vector v2 = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(x), Q6_V_vsplat_R(FAST_SIGMOID_C1));
    v2 = Q6_Vqf32_vmpy_Vqf32Vqf32(v2, xx);
    v2 = Q6_Vqf32_vadd_Vqf32Vqf32(v2, x);

    // Combine, then scale v3's IEEE exponent by the integer part (2^in_int)
    HVX_Vector v3 = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vqf32(v2, v1));
    // NOTE(review): v3_exponent is computed but never used below — looks like
    // leftover overflow-check scaffolding; confirm before removing.
    HVX_Vector v3_exponent = Q6_Vw_vasl_VwR(v3, 1);
    v3_exponent = Q6_Vuw_vlsr_VuwR(v3_exponent, 24);
    v3_exponent = Q6_Vw_vadd_VwVw(in_int, v3_exponent);
    v3 = Q6_Vw_vaslacc_VwVwR(v3, in_int, 24);

    HVX_Vector v4 = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vsub_Vqf32Vqf32(v2, v1));
    HVX_Vector v5 = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vsub_VsfVsf(v3, v4));

    // sigmoid = v3 / v5
    HVX_Vector res = hvx_vec_inverse_fp32(v5);
    res = Q6_Vqf32_vmpy_VsfVsf(v3, res);

    return Q6_Vsf_equals_Vqf32(res);
}
908
+
909
+ #define EXP_COEFF_5 (0x39506967) // 0.000198757 = 1/(7!)
910
+ #define EXP_COEFF_4 (0x3AB743CE) // 0.0013982 = 1/(6!)
911
+ #define EXP_COEFF_3 (0x3C088908) // 0.00833345 = 1/(5!)
912
+ #define EXP_COEFF_2 (0x3D2AA9C1) // 0.416658 = 1/(4!)
913
+ #define EXP_COEFF_1 (0x3E2AAAAA) // 0.16666667 = 1/(3!)
914
+ #define EXP_COEFF_0 (0x3F000000) // 0.5 = 1/(2!)
915
+ #define EXP_LOGN2 (0x3F317218) // ln(2) = 0.6931471805
916
+ #define EXP_LOG2E (0x3FB8AA3B) // log2(e) = 1/ln(2) = 1.4426950408
917
+ #define EXP_ONE (0x3f800000) // 1.0
918
+ #define EXP_RANGE_R (0x41a00000) // 20.0
919
+ #define EXP_RANGE_L (0xc1a00000) // -20.0
920
+
921
+ static inline HVX_Vector hvx_vec_exp_fp32(HVX_Vector in_vec) {
922
+ HVX_Vector z_qf32_v;
923
+ HVX_Vector x_v;
924
+ HVX_Vector x_qf32_v;
925
+ HVX_Vector y_v;
926
+ HVX_Vector k_v;
927
+ HVX_Vector f_v;
928
+ HVX_Vector epsilon_v;
929
+ HVX_Vector log2e = Q6_V_vsplat_R(EXP_LOG2E);
930
+ HVX_Vector logn2 = Q6_V_vsplat_R(EXP_LOGN2);
931
+ HVX_Vector E_const;
932
+ HVX_Vector zero_v = Q6_V_vzero();
933
+
934
+ // exp(x) is approximated as follows:
935
+ // f = floor(x/ln(2)) = floor(x*log2(e))
936
+ // epsilon = x - f*ln(2)
937
+ // exp(x) = exp(epsilon+f*ln(2))
938
+ // = exp(epsilon)*exp(f*ln(2))
939
+ // = exp(epsilon)*2^f
940
+ //
941
+ // Since epsilon is close to zero, it can be approximated with its Taylor series:
942
+ // exp(x) ~= 1+x+x^2/2!+x^3/3!+...+x^n/n!+...
943
+ // Preserving the first eight elements, we get:
944
+ // exp(x) ~= 1+x+e0*x^2+e1*x^3+e2*x^4+e3*x^5+e4*x^6+e5*x^7
945
+ // = 1+x+(E0+(E1+(E2+(E3+(E4+E5*x)*x)*x)*x)*x)*x^2
946
+
947
+ HVX_Vector temp_v = in_vec;
948
+
949
+ // Clamp inputs to (-20.0, 20.0)
950
+ HVX_VectorPred pred_cap_right = Q6_Q_vcmp_gt_VsfVsf(in_vec, Q6_V_vsplat_R(EXP_RANGE_R));
951
+ HVX_VectorPred pred_cap_left = Q6_Q_vcmp_gt_VsfVsf(Q6_V_vsplat_R(EXP_RANGE_L), in_vec);
952
+
953
+ in_vec = Q6_V_vmux_QVV(pred_cap_right, Q6_V_vsplat_R(EXP_RANGE_R), temp_v);
954
+ in_vec = Q6_V_vmux_QVV(pred_cap_left, Q6_V_vsplat_R(EXP_RANGE_L), temp_v);
955
+
956
+ epsilon_v = Q6_Vqf32_vmpy_VsfVsf(log2e, in_vec);
957
+ epsilon_v = Q6_Vsf_equals_Vqf32(epsilon_v);
958
+
959
+ // f_v is the floating point result and k_v is the integer result
960
+ f_v = hvx_vec_floor_fp32(epsilon_v);
961
+ k_v = hvx_vec_truncate_fp32(f_v);
962
+
963
+ x_qf32_v = Q6_Vqf32_vadd_VsfVsf(in_vec, zero_v);
964
+
965
+ // x = x - f_v * logn2;
966
+ epsilon_v = Q6_Vqf32_vmpy_VsfVsf(f_v, logn2);
967
+ x_qf32_v = Q6_Vqf32_vsub_Vqf32Vqf32(x_qf32_v, epsilon_v);
968
+ // normalize before every QFloat's vmpy
969
+ x_qf32_v = Q6_Vqf32_vadd_Vqf32Vsf(x_qf32_v, zero_v);
970
+
971
+ // z = x * x;
972
+ z_qf32_v = Q6_Vqf32_vmpy_Vqf32Vqf32(x_qf32_v, x_qf32_v);
973
+ z_qf32_v = Q6_Vqf32_vadd_Vqf32Vsf(z_qf32_v, zero_v);
974
+
975
+ x_v = Q6_Vsf_equals_Vqf32(x_qf32_v);
976
+
977
+ // y = E4 + E5 * x;
978
+ E_const = Q6_V_vsplat_R(EXP_COEFF_5);
979
+ y_v = Q6_Vqf32_vmpy_VsfVsf(E_const, x_v);
980
+ E_const = Q6_V_vsplat_R(EXP_COEFF_4);
981
+ y_v = Q6_Vqf32_vadd_Vqf32Vsf(y_v, E_const);
982
+ y_v = Q6_Vqf32_vadd_Vqf32Vsf(y_v, zero_v);
983
+
984
+ // y = E3 + y * x;
985
+ E_const = Q6_V_vsplat_R(EXP_COEFF_3);
986
+ y_v = Q6_Vqf32_vmpy_Vqf32Vqf32(y_v, x_qf32_v);
987
+ y_v = Q6_Vqf32_vadd_Vqf32Vsf(y_v, E_const);
988
+ y_v = Q6_Vqf32_vadd_Vqf32Vsf(y_v, zero_v);
989
+
990
+ // y = E2 + y * x;
991
+ E_const = Q6_V_vsplat_R(EXP_COEFF_2);
992
+ y_v = Q6_Vqf32_vmpy_Vqf32Vqf32(y_v, x_qf32_v);
993
+ y_v = Q6_Vqf32_vadd_Vqf32Vsf(y_v, E_const);
994
+ y_v = Q6_Vqf32_vadd_Vqf32Vsf(y_v, zero_v);
995
+
996
+ // y = E1 + y * x;
997
+ E_const = Q6_V_vsplat_R(EXP_COEFF_1);
998
+ y_v = Q6_Vqf32_vmpy_Vqf32Vqf32(y_v, x_qf32_v);
999
+ y_v = Q6_Vqf32_vadd_Vqf32Vsf(y_v, E_const);
1000
+ y_v = Q6_Vqf32_vadd_Vqf32Vsf(y_v, zero_v);
1001
+
1002
+ // y = E0 + y * x;
1003
+ E_const = Q6_V_vsplat_R(EXP_COEFF_0);
1004
+ y_v = Q6_Vqf32_vmpy_Vqf32Vqf32(y_v, x_qf32_v);
1005
+ y_v = Q6_Vqf32_vadd_Vqf32Vsf(y_v, E_const);
1006
+ y_v = Q6_Vqf32_vadd_Vqf32Vsf(y_v, zero_v);
1007
+
1008
+ // y = x + y * z;
1009
+ y_v = Q6_Vqf32_vmpy_Vqf32Vqf32(y_v, z_qf32_v);
1010
+ y_v = Q6_Vqf32_vadd_Vqf32Vqf32(y_v, x_qf32_v);
1011
+ y_v = Q6_Vqf32_vadd_Vqf32Vsf(y_v, zero_v);
1012
+
1013
+ // y = y + 1.0;
1014
+ y_v = Q6_Vqf32_vadd_Vqf32Vsf(y_v, Q6_V_vsplat_R(EXP_ONE));
1015
+
1016
+ // insert exponents
1017
+ // y = ldexpf(y, k);
1018
+ // y_v += k_v; // qf32
1019
+ // modify exponent
1020
+
1021
+ y_v = Q6_Vsf_equals_Vqf32(y_v);
1022
+
1023
+ // add k_v to the exponent of y_v
1024
+ HVX_Vector y_v_exponent = Q6_Vw_vasl_VwR(y_v, 1);
1025
+
1026
+ y_v_exponent = Q6_Vuw_vlsr_VuwR(y_v_exponent, IEEE_VSF_MANTLEN + 1);
1027
+ y_v_exponent = Q6_Vw_vadd_VwVw(k_v, y_v_exponent);
1028
+
1029
+ // exponent cannot be negative; if overflow is detected, result is set to zero
1030
+ HVX_VectorPred qy_v_negative_exponent = Q6_Q_vcmp_gt_VwVw(zero_v, y_v_exponent);
1031
+
1032
+ y_v = Q6_Vw_vaslacc_VwVwR(y_v, k_v, IEEE_VSF_MANTLEN);
1033
+
1034
+ y_v = Q6_V_vmux_QVV(qy_v_negative_exponent, zero_v, y_v);
1035
+
1036
+ return y_v;
1037
+ }
1038
+
1039
+ #define RSQRT_CONST 0x5f3759df // Constant for fast inverse square root calculation
1040
+ #define RSQRT_ONE_HALF 0x3f000000 // 0.5
1041
+ #define RSQRT_THREE_HALVES 0x3fc00000 // 1.5
1042
+
1043
// Fast inverse square root: classic bit-trick seed plus three
// Newton-Raphson refinement steps, all in qf32 arithmetic.
static inline HVX_Vector hvx_vec_rsqrt_fp32(HVX_Vector in_vec) {
    // Algorithm:
    //   x2 = input*0.5
    //   y = * (long *) &input
    //   y = 0x5f3759df - (y>>1)
    //   y = y*(threehalfs - x2*y*y)     (applied three times)
    HVX_Vector rsqrtconst = Q6_V_vsplat_R(RSQRT_CONST);
    HVX_Vector onehalf = Q6_V_vsplat_R(RSQRT_ONE_HALF);
    HVX_Vector threehalfs = Q6_V_vsplat_R(RSQRT_THREE_HALVES);

    HVX_Vector x2, y, ypower2, temp;

    x2 = Q6_Vqf32_vmpy_VsfVsf(in_vec, onehalf);
    x2 = Q6_Vqf32_vadd_Vqf32Vsf(x2, Q6_V_vzero());

    // Integer bit-trick seed: y = 0x5f3759df - (bits >> 1)
    y = Q6_Vw_vasr_VwR(in_vec, 1);
    y = Q6_Vw_vsub_VwVw(rsqrtconst, y);

    // 1st iteration (y holds sf bits here)
    ypower2 = Q6_Vqf32_vmpy_VsfVsf(y, y);
    ypower2 = Q6_Vqf32_vadd_Vqf32Vsf(ypower2, Q6_V_vzero());
    temp = Q6_Vqf32_vmpy_Vqf32Vqf32(x2, ypower2);
    temp = Q6_Vqf32_vsub_VsfVsf(threehalfs, Q6_Vsf_equals_Vqf32(temp));
    temp = Q6_Vqf32_vmpy_VsfVsf(y, Q6_Vsf_equals_Vqf32(temp));

    // 2nd iteration (y now in qf32)
    y = Q6_Vqf32_vadd_Vqf32Vsf(temp, Q6_V_vzero());
    ypower2 = Q6_Vqf32_vmpy_Vqf32Vqf32(y, y);
    ypower2 = Q6_Vqf32_vadd_Vqf32Vsf(ypower2, Q6_V_vzero());
    temp = Q6_Vqf32_vmpy_Vqf32Vqf32(x2, ypower2);
    temp = Q6_Vqf32_vsub_VsfVsf(threehalfs, Q6_Vsf_equals_Vqf32(temp));
    temp = Q6_Vqf32_vmpy_Vqf32Vqf32(y, temp);

    // 3rd iteration
    y = Q6_Vqf32_vadd_Vqf32Vsf(temp, Q6_V_vzero());
    ypower2 = Q6_Vqf32_vmpy_Vqf32Vqf32(y, y);
    ypower2 = Q6_Vqf32_vadd_Vqf32Vsf(ypower2, Q6_V_vzero());
    temp = Q6_Vqf32_vmpy_Vqf32Vqf32(x2, ypower2);
    temp = Q6_Vqf32_vsub_VsfVsf(threehalfs, Q6_Vsf_equals_Vqf32(temp));
    temp = Q6_Vqf32_vmpy_Vqf32Vqf32(y, temp);

    return Q6_Vsf_equals_Vqf32(temp);
}
1087
+
1088
+ static inline HVX_Vector hvx_vec_fast_sigmoid_fp32_guard(HVX_Vector v,
1089
+ HVX_Vector one,
1090
+ HVX_Vector max_exp,
1091
+ HVX_Vector min_exp) {
1092
+ const HVX_VectorPred pred_max = Q6_Q_vcmp_gt_VsfVsf(max_exp, v);
1093
+ const HVX_VectorPred pred_min = Q6_Q_vcmp_gt_VsfVsf(v, min_exp);
1094
+
1095
+ HVX_Vector out = hvx_vec_fast_sigmoid_fp32(v);
1096
+ out = Q6_V_vmux_QVV(pred_max, out, one);
1097
+ return Q6_V_vmux_QVV(pred_min, out, Q6_V_vzero());
1098
+ }
1099
+
1100
+ static inline HVX_Vector hvx_vec_tanh_fp32(HVX_Vector x) {
1101
+ // tanh(x) = 2 * sigmoid(2x) - 1
1102
+ HVX_Vector two = hvx_vec_splat_fp32(2.0f);
1103
+ HVX_Vector one = hvx_vec_splat_fp32(1.0f);
1104
+ HVX_Vector x2 = Q6_Vqf32_vmpy_VsfVsf(x, two);
1105
+
1106
+ static const float kMinExp = -87.f; // 0
1107
+ static const float kMaxExp = 87.f; // 1
1108
+ HVX_Vector max_exp = hvx_vec_splat_fp32(kMaxExp);
1109
+ HVX_Vector min_exp = hvx_vec_splat_fp32(kMinExp);
1110
+
1111
+ HVX_Vector sig2x = hvx_vec_fast_sigmoid_fp32_guard(Q6_Vsf_equals_Vqf32(x2), one, max_exp, min_exp);
1112
+
1113
+ HVX_Vector res = Q6_Vqf32_vmpy_VsfVsf(sig2x, two);
1114
+ res = Q6_Vqf32_vsub_Vqf32Vsf(res, one);
1115
+ return Q6_Vsf_equals_Vqf32(res);
1116
+ }
1117
+
1118
// Apply the guarded fast sigmoid over a contiguous fp32 buffer.
// src is accessed via aligned HVX_Vector loads — assumes 128-byte alignment;
// `num_elems >> 5` matches VLEN_FP32 == 32 floats per 128-byte vector.
// NOTE(review): the tail path loads one full unaligned vector past the last
// `remaining` elements and relies on hvx_vec_store_u for the partial store —
// assumes over-read beyond the buffer is safe (common HTP convention).
static inline void hvx_fast_sigmoid_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems) {
    int step_of_1 = num_elems >> 5;                      // full vectors
    int remaining = num_elems - step_of_1 * VLEN_FP32;   // tail elements

    const HVX_Vector * restrict v_src = (HVX_Vector *) src;
    HVX_Vector * restrict v_dst = (HVX_Vector *) dst;

    static const float kMinExp = -87.f; // sigmoid saturates to 0 below this
    static const float kMaxExp = 87.f;  // sigmoid saturates to 1 above this

    const HVX_Vector one = hvx_vec_splat_fp32(1.f);
    const HVX_Vector max_exp = hvx_vec_splat_fp32(kMaxExp);
    const HVX_Vector min_exp = hvx_vec_splat_fp32(kMinExp);

#pragma unroll(4)
    for (int i = 0; i < step_of_1; i++) {
        v_dst[i] = hvx_vec_fast_sigmoid_fp32_guard(v_src[i], one, max_exp, min_exp);
    }

    if (remaining > 0) {
        const float * srcf = ((const float *) src) + step_of_1* VLEN_FP32;
        float * dstf = (float *) dst + step_of_1*VLEN_FP32;

        HVX_Vector in = *(HVX_UVector *) srcf;
        HVX_Vector out = hvx_vec_fast_sigmoid_fp32_guard(in, one, max_exp, min_exp);
        hvx_vec_store_u((void *) dstf, remaining * SIZEOF_FP32, out);
    }
}
1146
+
1147
// Guarded fast sigmoid over a contiguous fp32 buffer, tolerant of an
// unaligned src: a software-pipelined valign stream (slinep/slinec) realigns
// the input; dst is written via unaligned HVX_UVector stores, and the tail
// goes through hvx_vec_store_u.
static inline void hvx_sigmoid_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems){
    int step_of_1 = num_elems >> 5; // divby 32, because 32 float = 128 bytes per HVX vector
    int leftover = num_elems - (step_of_1 * VLEN_FP32);

    int32_t leftover_size = leftover * sizeof(float);

    static const float kMinExp = -87.f; // sigmoid saturates to 0 below this
    static const float kMaxExp = 87.f;  // sigmoid saturates to 1 above this

    const HVX_Vector one = hvx_vec_splat_fp32(1.f);
    const HVX_Vector max_exp = hvx_vec_splat_fp32(kMaxExp);
    const HVX_Vector min_exp = hvx_vec_splat_fp32(kMinExp);

    const float *input = (float *)src;
    float *output = (float *)dst;

    HVX_Vector * input_v_ptr = (HVX_Vector *) input;
    HVX_UVector * output_v_ptr = (HVX_UVector *) output;

    HVX_Vector slinep; // previous raw vector
    HVX_Vector slinec; // current raw vector
    HVX_Vector sline;  // realigned vector (valign of the two)

    // Prime the pipeline with the first (aligned-down) vector.
    slinep = *input_v_ptr++;
#pragma unroll(4)
    for (int i = step_of_1 - 1; i > 0; i--) {
        slinec = *input_v_ptr++;
        // valign by the low bits of the src address stitches the misaligned data
        sline = Q6_V_valign_VVR(slinec, slinep, (size_t) input);
        *((HVX_UVector *) (output_v_ptr++)) = hvx_vec_fast_sigmoid_fp32_guard(sline, one, max_exp, min_exp);
        /* Prepare slinep for next iteration */
        slinep = slinec;
    }

    // Last full vector: only fetch another raw vector if it is actually needed.
    if (step_of_1 > 0) {
        slinec = htp_is_aligned(input_v_ptr, 128) && leftover == 0 ? slinep : *input_v_ptr++;
        sline = Q6_V_valign_VVR(slinec, slinep, (size_t) input);
        *((HVX_UVector *) (output_v_ptr++)) = hvx_vec_fast_sigmoid_fp32_guard(sline, one, max_exp, min_exp);
        ; // NOTE(review): stray empty statement, harmless

        slinep = slinec;
    }
    // Tail: avoid reading past the end when the remainder fits in the chunk.
    if (leftover > 0) {
        slinec = (is_in_one_chunk(input_v_ptr, leftover_size, 128) ? slinep : *input_v_ptr++);

        sline = Q6_V_valign_VVR(slinec, slinep, (size_t) input);

        HVX_Vector sout = hvx_vec_fast_sigmoid_fp32_guard(sline, one, max_exp, min_exp);
        hvx_vec_store_u(output_v_ptr, leftover_size, sout);
    }
}
1197
+
1198
+ static inline void hvx_scale_f32_aa(uint8_t * restrict dst, const uint8_t * restrict src, const int n, const float scale) {
1199
+ int nvec = n / VLEN_FP32;
1200
+ int nloe = n % VLEN_FP32;
1201
+
1202
+ HVX_Vector vs = hvx_vec_splat_fp32(scale);
1203
+
1204
+ HVX_Vector * vsrc = (HVX_Vector *) src;
1205
+ HVX_Vector * vdst = (HVX_Vector *) dst;
1206
+
1207
+ uint32_t i = 0;
1208
+
1209
+ #pragma unroll(4)
1210
+ for (i = 0; i < nvec; ++i) {
1211
+ HVX_Vector v = Q6_Vqf32_vmpy_VsfVsf(vsrc[i], vs);
1212
+ vdst[i] = Q6_Vsf_equals_Vqf32(v);
1213
+ }
1214
+
1215
+ if (nloe) {
1216
+ HVX_Vector v = Q6_Vqf32_vmpy_VsfVsf(vsrc[i], vs);
1217
+ hvx_vec_store_u((void *) &vdst[i], nloe * 4, Q6_Vsf_equals_Vqf32(v));
1218
+ }
1219
+ }
1220
+
1221
+ static inline void hvx_scale_f32_uu(uint8_t * restrict dst, const uint8_t * restrict src, const int n, const float scale) {
1222
+ int nvec = n / VLEN_FP32;
1223
+ int nloe = n % VLEN_FP32;
1224
+
1225
+ HVX_Vector vs = hvx_vec_splat_fp32(scale);
1226
+
1227
+ HVX_UVector * vsrc = (HVX_UVector *) src;
1228
+ HVX_UVector * vdst = (HVX_UVector *) dst;
1229
+
1230
+ uint32_t i = 0;
1231
+
1232
+ #pragma unroll(4)
1233
+ for (i = 0; i < nvec; ++i) {
1234
+ HVX_Vector v = Q6_Vqf32_vmpy_VsfVsf(vsrc[i], vs);
1235
+ vdst[i] = Q6_Vsf_equals_Vqf32(v);
1236
+ }
1237
+
1238
+ if (nloe) {
1239
+ HVX_Vector v = Q6_Vqf32_vmpy_VsfVsf(vsrc[i], vs);
1240
+ hvx_vec_store_u((void *) &vdst[i], nloe * 4, Q6_Vsf_equals_Vqf32(v));
1241
+ }
1242
+ }
1243
+
1244
+ static inline void hvx_scale_f32(uint8_t * restrict dst, const uint8_t * restrict src, const int n, const float scale) {
1245
+ if (htp_is_aligned((void *) src, VLEN) && htp_is_aligned((void *) dst, VLEN)) {
1246
+ hvx_scale_f32_aa(dst, src, n, scale);
1247
+ } else {
1248
+ hvx_scale_f32_uu(dst, src, n, scale);
1249
+ }
1250
+ }
1251
+
1252
+ static inline void hvx_scale_offset_f32_aa(uint8_t * restrict dst, const uint8_t * restrict src, const int n, const float scale, const float offset) {
1253
+ int nvec = n / VLEN_FP32;
1254
+ int nloe = n % VLEN_FP32;
1255
+
1256
+ HVX_Vector vs = hvx_vec_splat_fp32(scale);
1257
+ HVX_Vector vo = hvx_vec_splat_fp32(offset);
1258
+
1259
+ HVX_Vector * vsrc = (HVX_Vector *) src;
1260
+ HVX_Vector * vdst = (HVX_Vector *) dst;
1261
+
1262
+ uint32_t i = 0;
1263
+
1264
+ #pragma unroll(4)
1265
+ for (i = 0; i < nvec; ++i) {
1266
+ HVX_Vector v = Q6_Vqf32_vadd_Vqf32Vsf(Q6_Vqf32_vmpy_VsfVsf(vsrc[i], vs), vo);
1267
+ vdst[i] = Q6_Vsf_equals_Vqf32(v);
1268
+ }
1269
+
1270
+ if (nloe) {
1271
+ HVX_Vector v = Q6_Vqf32_vadd_Vqf32Vsf(Q6_Vqf32_vmpy_VsfVsf(vsrc[i], vs), vo);
1272
+ hvx_vec_store_u((void *) &vdst[i], nloe * 4, Q6_Vsf_equals_Vqf32(v));
1273
+ }
1274
+ }
1275
+
1276
+ static inline void hvx_scale_offset_f32_uu(uint8_t * restrict dst, const uint8_t * restrict src, const int n, const float scale, const float offset) {
1277
+ int nvec = n / VLEN_FP32;
1278
+ int nloe = n % VLEN_FP32;
1279
+
1280
+ HVX_Vector vs = hvx_vec_splat_fp32(scale);
1281
+ HVX_Vector vo = hvx_vec_splat_fp32(offset);
1282
+
1283
+ HVX_UVector * vsrc = (HVX_UVector *) src;
1284
+ HVX_UVector * vdst = (HVX_UVector *) dst;
1285
+
1286
+ uint32_t i = 0;
1287
+
1288
+ #pragma unroll(4)
1289
+ for (i = 0; i < nvec; ++i) {
1290
+ HVX_Vector v = Q6_Vqf32_vadd_Vqf32Vsf(Q6_Vqf32_vmpy_VsfVsf(vsrc[i], vs), vo);
1291
+ vdst[i] = Q6_Vsf_equals_Vqf32(v);
1292
+ }
1293
+
1294
+ if (nloe) {
1295
+ HVX_Vector v = Q6_Vqf32_vadd_Vqf32Vsf(Q6_Vqf32_vmpy_VsfVsf(vsrc[i], vs), vo);
1296
+ hvx_vec_store_u((void *) &vdst[i], nloe * 4, Q6_Vsf_equals_Vqf32(v));
1297
+ }
1298
+ }
1299
+
1300
+ static inline void hvx_scale_offset_f32(uint8_t * restrict dst, const uint8_t * restrict src, const int n, const float scale, const float offset) {
1301
+ if (htp_is_aligned((void *) src, VLEN) && htp_is_aligned((void *) dst, VLEN)) {
1302
+ hvx_scale_offset_f32_aa(dst, src, n, scale, offset);
1303
+ } else {
1304
+ hvx_scale_offset_f32_uu(dst, src, n, scale, offset);
1305
+ }
1306
+ }
1307
+
1308
+ float hvx_sum_of_squares_f32(const uint8_t * restrict src, const int num_elems);
1309
+ void hvx_mul_f32(const uint8_t * restrict src0,
1310
+ const uint8_t * restrict src1,
1311
+ uint8_t * restrict dst,
1312
+ const int num_elems);
1313
+ void hvx_mul_f32_opt(const uint8_t * restrict src0,
1314
+ const uint8_t * restrict src1,
1315
+ uint8_t * restrict dst,
1316
+ const int num_elems);
1317
+ void hvx_mul_mul_f32_opt(const uint8_t * restrict src0,
1318
+ const uint8_t * restrict src1,
1319
+ const uint8_t * restrict src2,
1320
+ uint8_t * restrict dst,
1321
+ const int num_elems);
1322
+ void hvx_mul_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * restrict dst, const int num_elems);
1323
+ void hvx_add_f32(const uint8_t * restrict src0,
1324
+ const uint8_t * restrict src1,
1325
+ uint8_t * restrict dst,
1326
+ const int num_elems);
1327
+ void hvx_add_f32_opt(const uint8_t * restrict src0,
1328
+ const uint8_t * restrict src1,
1329
+ uint8_t * restrict dst,
1330
+ const int num_elems);
1331
+ void hvx_add_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * restrict dst, const int num_elems);
1332
+ void hvx_sub_f32(const uint8_t * restrict src0,
1333
+ const uint8_t * restrict src1,
1334
+ uint8_t * restrict dst,
1335
+ const int num_elems);
1336
+ void hvx_sub_f32_opt(const uint8_t * restrict src0,
1337
+ const uint8_t * restrict src1,
1338
+ uint8_t * restrict dst,
1339
+ const int num_elems);
1340
+ void hvx_sub_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * restrict dst, const int num_elems);
1341
+ void hvx_inverse_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems);
1342
+ void hvx_sigmoid_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems);
1343
+ void hvx_exp_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems, bool negate);
1344
+ float hvx_self_max_f32(const uint8_t * restrict src, const int num_elems);
1345
+ float hvx_self_sum_f32(const uint8_t * restrict src, const int num_elems);
1346
+ void hvx_min_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * restrict dst, const int num_elems);
1347
+ void hvx_clamp_scalar_f32(const uint8_t * restrict src,
1348
+ const float limit_left,
1349
+ const float limit_right,
1350
+ uint8_t * restrict dst,
1351
+ const int num_elems);
1352
+
1353
+ #endif /* HVX_UTILS_H */