whispercpp 1.3.4 → 1.3.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (891)
  1. checksums.yaml +4 -4
  2. data/LICENSE +1 -1
  3. data/README.md +158 -44
  4. data/ext/extconf.rb +3 -2
  5. data/ext/ruby_whisper.c +34 -6
  6. data/ext/ruby_whisper.h +67 -0
  7. data/ext/ruby_whisper_context.c +236 -144
  8. data/ext/ruby_whisper_context_params.c +163 -0
  9. data/ext/ruby_whisper_model.c +12 -13
  10. data/ext/ruby_whisper_params.c +47 -24
  11. data/ext/ruby_whisper_segment.c +84 -20
  12. data/ext/ruby_whisper_token.c +371 -0
  13. data/ext/ruby_whisper_transcribe.cpp +5 -2
  14. data/ext/ruby_whisper_vad_context.c +122 -0
  15. data/ext/ruby_whisper_vad_context_detect.cpp +51 -0
  16. data/ext/ruby_whisper_vad_params.c +0 -1
  17. data/ext/ruby_whisper_vad_segment.c +138 -0
  18. data/ext/ruby_whisper_vad_segments.c +105 -0
  19. data/ext/sources/CMakeLists.txt +4 -1
  20. data/ext/sources/bindings/javascript/package.json +1 -1
  21. data/ext/sources/cmake/arm64-apple-clang.cmake +16 -0
  22. data/ext/sources/cmake/arm64-windows-llvm.cmake +16 -0
  23. data/ext/sources/cmake/riscv64-spacemit-linux-gnu-gcc.cmake +29 -0
  24. data/ext/sources/cmake/whisper-config.cmake.in +5 -40
  25. data/ext/sources/cmake/x64-windows-llvm.cmake +5 -0
  26. data/ext/sources/examples/addon.node/vad-example.js +2 -2
  27. data/ext/sources/examples/bench/bench.cpp +23 -18
  28. data/ext/sources/examples/cli/cli.cpp +129 -112
  29. data/ext/sources/examples/common-ggml.cpp +2 -0
  30. data/ext/sources/examples/lsp/CMakeLists.txt +2 -1
  31. data/ext/sources/examples/miniaudio.h +4507 -2131
  32. data/ext/sources/examples/quantize/CMakeLists.txt +2 -1
  33. data/ext/sources/examples/server/server.cpp +28 -15
  34. data/ext/sources/examples/talk-llama/CMakeLists.txt +8 -3
  35. data/ext/sources/examples/talk-llama/llama-adapter.cpp +5 -2
  36. data/ext/sources/examples/talk-llama/llama-adapter.h +7 -0
  37. data/ext/sources/examples/talk-llama/llama-arch.cpp +2378 -1988
  38. data/ext/sources/examples/talk-llama/llama-arch.h +109 -2
  39. data/ext/sources/examples/talk-llama/llama-batch.cpp +78 -34
  40. data/ext/sources/examples/talk-llama/llama-batch.h +17 -4
  41. data/ext/sources/examples/talk-llama/llama-chat.cpp +100 -4
  42. data/ext/sources/examples/talk-llama/llama-chat.h +5 -0
  43. data/ext/sources/examples/talk-llama/llama-context.cpp +1088 -403
  44. data/ext/sources/examples/talk-llama/llama-context.h +70 -23
  45. data/ext/sources/examples/talk-llama/llama-cparams.h +6 -0
  46. data/ext/sources/examples/talk-llama/llama-ext.h +12 -0
  47. data/ext/sources/examples/talk-llama/llama-grammar.cpp +295 -60
  48. data/ext/sources/examples/talk-llama/llama-grammar.h +22 -1
  49. data/ext/sources/examples/talk-llama/llama-graph.cpp +925 -155
  50. data/ext/sources/examples/talk-llama/llama-graph.h +234 -23
  51. data/ext/sources/examples/talk-llama/llama-hparams.cpp +79 -38
  52. data/ext/sources/examples/talk-llama/llama-hparams.h +118 -18
  53. data/ext/sources/examples/talk-llama/llama-impl.cpp +11 -7
  54. data/ext/sources/examples/talk-llama/llama-impl.h +14 -2
  55. data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.cpp +8 -4
  56. data/ext/sources/examples/talk-llama/llama-kv-cache.cpp +405 -140
  57. data/ext/sources/examples/talk-llama/llama-kv-cache.h +24 -10
  58. data/ext/sources/examples/talk-llama/llama-kv-cells.h +44 -2
  59. data/ext/sources/examples/talk-llama/llama-memory-hybrid-iswa.cpp +275 -0
  60. data/ext/sources/examples/talk-llama/llama-memory-hybrid-iswa.h +140 -0
  61. data/ext/sources/examples/talk-llama/llama-memory-hybrid.cpp +12 -10
  62. data/ext/sources/examples/talk-llama/llama-memory-recurrent.cpp +42 -31
  63. data/ext/sources/examples/talk-llama/llama-memory-recurrent.h +2 -2
  64. data/ext/sources/examples/talk-llama/llama-mmap.cpp +197 -45
  65. data/ext/sources/examples/talk-llama/llama-mmap.h +8 -3
  66. data/ext/sources/examples/talk-llama/llama-model-loader.cpp +606 -116
  67. data/ext/sources/examples/talk-llama/llama-model-loader.h +41 -5
  68. data/ext/sources/examples/talk-llama/llama-model-saver.cpp +61 -44
  69. data/ext/sources/examples/talk-llama/llama-model-saver.h +5 -2
  70. data/ext/sources/examples/talk-llama/llama-model.cpp +2756 -13643
  71. data/ext/sources/examples/talk-llama/llama-model.h +112 -18
  72. data/ext/sources/examples/talk-llama/llama-quant.cpp +582 -365
  73. data/ext/sources/examples/talk-llama/{llama-sampling.cpp → llama-sampler.cpp} +1409 -199
  74. data/ext/sources/examples/talk-llama/llama-sampler.h +42 -0
  75. data/ext/sources/examples/talk-llama/llama-vocab.cpp +248 -82
  76. data/ext/sources/examples/talk-llama/llama-vocab.h +50 -40
  77. data/ext/sources/examples/talk-llama/llama.cpp +802 -21
  78. data/ext/sources/examples/talk-llama/llama.h +210 -39
  79. data/ext/sources/examples/talk-llama/models/afmoe.cpp +190 -0
  80. data/ext/sources/examples/talk-llama/models/apertus.cpp +125 -0
  81. data/ext/sources/examples/talk-llama/models/arcee.cpp +135 -0
  82. data/ext/sources/examples/talk-llama/models/arctic.cpp +137 -0
  83. data/ext/sources/examples/talk-llama/models/arwkv7.cpp +86 -0
  84. data/ext/sources/examples/talk-llama/models/baichuan.cpp +123 -0
  85. data/ext/sources/examples/talk-llama/models/bailingmoe.cpp +143 -0
  86. data/ext/sources/examples/talk-llama/models/bailingmoe2.cpp +133 -0
  87. data/ext/sources/examples/talk-llama/models/bert.cpp +184 -0
  88. data/ext/sources/examples/talk-llama/models/bitnet.cpp +145 -0
  89. data/ext/sources/examples/talk-llama/models/bloom.cpp +101 -0
  90. data/ext/sources/examples/talk-llama/models/chameleon.cpp +178 -0
  91. data/ext/sources/examples/talk-llama/models/chatglm.cpp +132 -0
  92. data/ext/sources/examples/talk-llama/models/codeshell.cpp +111 -0
  93. data/ext/sources/examples/talk-llama/models/cogvlm.cpp +102 -0
  94. data/ext/sources/examples/talk-llama/models/cohere2-iswa.cpp +134 -0
  95. data/ext/sources/examples/talk-llama/models/command-r.cpp +122 -0
  96. data/ext/sources/examples/talk-llama/models/dbrx.cpp +122 -0
  97. data/ext/sources/examples/talk-llama/models/deci.cpp +135 -0
  98. data/ext/sources/examples/talk-llama/models/deepseek.cpp +142 -0
  99. data/ext/sources/examples/talk-llama/models/deepseek2.cpp +262 -0
  100. data/ext/sources/examples/talk-llama/models/delta-net-base.cpp +445 -0
  101. data/ext/sources/examples/talk-llama/models/dots1.cpp +132 -0
  102. data/ext/sources/examples/talk-llama/models/dream.cpp +105 -0
  103. data/ext/sources/examples/talk-llama/models/ernie4-5-moe.cpp +148 -0
  104. data/ext/sources/examples/talk-llama/models/ernie4-5.cpp +110 -0
  105. data/ext/sources/examples/talk-llama/models/eurobert.cpp +97 -0
  106. data/ext/sources/examples/talk-llama/models/exaone-moe.cpp +145 -0
  107. data/ext/sources/examples/talk-llama/models/exaone.cpp +114 -0
  108. data/ext/sources/examples/talk-llama/models/exaone4.cpp +123 -0
  109. data/ext/sources/examples/talk-llama/models/falcon-h1.cpp +111 -0
  110. data/ext/sources/examples/talk-llama/models/falcon.cpp +120 -0
  111. data/ext/sources/examples/talk-llama/models/gemma-embedding.cpp +116 -0
  112. data/ext/sources/examples/talk-llama/models/gemma.cpp +112 -0
  113. data/ext/sources/examples/talk-llama/models/gemma2-iswa.cpp +128 -0
  114. data/ext/sources/examples/talk-llama/models/gemma3.cpp +155 -0
  115. data/ext/sources/examples/talk-llama/models/gemma3n-iswa.cpp +384 -0
  116. data/ext/sources/examples/talk-llama/models/glm4-moe.cpp +170 -0
  117. data/ext/sources/examples/talk-llama/models/glm4.cpp +157 -0
  118. data/ext/sources/examples/talk-llama/models/gpt2.cpp +105 -0
  119. data/ext/sources/examples/talk-llama/models/gptneox.cpp +144 -0
  120. data/ext/sources/examples/talk-llama/models/granite-hybrid.cpp +195 -0
  121. data/ext/sources/examples/talk-llama/models/granite.cpp +210 -0
  122. data/ext/sources/examples/talk-llama/models/grok.cpp +159 -0
  123. data/ext/sources/examples/talk-llama/models/grovemoe.cpp +139 -0
  124. data/ext/sources/examples/talk-llama/models/hunyuan-dense.cpp +132 -0
  125. data/ext/sources/examples/talk-llama/models/hunyuan-moe.cpp +153 -0
  126. data/ext/sources/examples/talk-llama/models/internlm2.cpp +120 -0
  127. data/ext/sources/examples/talk-llama/models/jais.cpp +86 -0
  128. data/ext/sources/examples/talk-llama/models/jais2.cpp +123 -0
  129. data/ext/sources/examples/talk-llama/models/jamba.cpp +106 -0
  130. data/ext/sources/examples/talk-llama/models/kimi-linear.cpp +381 -0
  131. data/ext/sources/examples/talk-llama/models/lfm2.cpp +196 -0
  132. data/ext/sources/examples/talk-llama/models/llada-moe.cpp +122 -0
  133. data/ext/sources/examples/talk-llama/models/llada.cpp +99 -0
  134. data/ext/sources/examples/talk-llama/models/llama-iswa.cpp +178 -0
  135. data/ext/sources/examples/talk-llama/models/llama.cpp +175 -0
  136. data/ext/sources/examples/talk-llama/models/maincoder.cpp +117 -0
  137. data/ext/sources/examples/talk-llama/models/mamba-base.cpp +289 -0
  138. data/ext/sources/examples/talk-llama/models/mamba.cpp +54 -0
  139. data/ext/sources/examples/talk-llama/models/mimo2-iswa.cpp +129 -0
  140. data/ext/sources/examples/talk-llama/models/minicpm3.cpp +200 -0
  141. data/ext/sources/examples/talk-llama/models/minimax-m2.cpp +123 -0
  142. data/ext/sources/examples/talk-llama/models/mistral3.cpp +160 -0
  143. data/ext/sources/examples/talk-llama/models/models.h +704 -0
  144. data/ext/sources/examples/talk-llama/models/modern-bert.cpp +109 -0
  145. data/ext/sources/examples/talk-llama/models/mpt.cpp +126 -0
  146. data/ext/sources/examples/talk-llama/models/nemotron-h.cpp +162 -0
  147. data/ext/sources/examples/talk-llama/models/nemotron.cpp +122 -0
  148. data/ext/sources/examples/talk-llama/models/neo-bert.cpp +104 -0
  149. data/ext/sources/examples/talk-llama/models/olmo.cpp +121 -0
  150. data/ext/sources/examples/talk-llama/models/olmo2.cpp +150 -0
  151. data/ext/sources/examples/talk-llama/models/olmoe.cpp +124 -0
  152. data/ext/sources/examples/talk-llama/models/openai-moe-iswa.cpp +127 -0
  153. data/ext/sources/examples/talk-llama/models/openelm.cpp +124 -0
  154. data/ext/sources/examples/talk-llama/models/orion.cpp +123 -0
  155. data/ext/sources/examples/talk-llama/models/paddleocr.cpp +122 -0
  156. data/ext/sources/examples/talk-llama/models/pangu-embedded.cpp +121 -0
  157. data/ext/sources/examples/talk-llama/models/phi2.cpp +121 -0
  158. data/ext/sources/examples/talk-llama/models/phi3.cpp +152 -0
  159. data/ext/sources/examples/talk-llama/models/plamo.cpp +110 -0
  160. data/ext/sources/examples/talk-llama/models/plamo2.cpp +320 -0
  161. data/ext/sources/examples/talk-llama/models/plamo3.cpp +128 -0
  162. data/ext/sources/examples/talk-llama/models/plm.cpp +169 -0
  163. data/ext/sources/examples/talk-llama/models/qwen.cpp +108 -0
  164. data/ext/sources/examples/talk-llama/models/qwen2.cpp +126 -0
  165. data/ext/sources/examples/talk-llama/models/qwen2moe.cpp +151 -0
  166. data/ext/sources/examples/talk-llama/models/qwen2vl.cpp +117 -0
  167. data/ext/sources/examples/talk-llama/models/qwen3.cpp +120 -0
  168. data/ext/sources/examples/talk-llama/models/qwen35.cpp +381 -0
  169. data/ext/sources/examples/talk-llama/models/qwen35moe.cpp +422 -0
  170. data/ext/sources/examples/talk-llama/models/qwen3moe.cpp +131 -0
  171. data/ext/sources/examples/talk-llama/models/qwen3next.cpp +525 -0
  172. data/ext/sources/examples/talk-llama/models/qwen3vl-moe.cpp +140 -0
  173. data/ext/sources/examples/talk-llama/models/qwen3vl.cpp +132 -0
  174. data/ext/sources/examples/talk-llama/models/refact.cpp +94 -0
  175. data/ext/sources/examples/talk-llama/models/rnd1.cpp +126 -0
  176. data/ext/sources/examples/talk-llama/models/rwkv6-base.cpp +164 -0
  177. data/ext/sources/examples/talk-llama/models/rwkv6.cpp +94 -0
  178. data/ext/sources/examples/talk-llama/models/rwkv6qwen2.cpp +86 -0
  179. data/ext/sources/examples/talk-llama/models/rwkv7-base.cpp +137 -0
  180. data/ext/sources/examples/talk-llama/models/rwkv7.cpp +90 -0
  181. data/ext/sources/examples/talk-llama/models/seed-oss.cpp +124 -0
  182. data/ext/sources/examples/talk-llama/models/smallthinker.cpp +126 -0
  183. data/ext/sources/examples/talk-llama/models/smollm3.cpp +128 -0
  184. data/ext/sources/examples/talk-llama/models/stablelm.cpp +146 -0
  185. data/ext/sources/examples/talk-llama/models/starcoder.cpp +100 -0
  186. data/ext/sources/examples/talk-llama/models/starcoder2.cpp +121 -0
  187. data/ext/sources/examples/talk-llama/models/step35-iswa.cpp +165 -0
  188. data/ext/sources/examples/talk-llama/models/t5-dec.cpp +166 -0
  189. data/ext/sources/examples/talk-llama/models/t5-enc.cpp +96 -0
  190. data/ext/sources/examples/talk-llama/models/wavtokenizer-dec.cpp +149 -0
  191. data/ext/sources/examples/talk-llama/models/xverse.cpp +108 -0
  192. data/ext/sources/examples/talk-llama/unicode.cpp +121 -79
  193. data/ext/sources/examples/vad-speech-segments/CMakeLists.txt +1 -1
  194. data/ext/sources/examples/whisper.wasm/index-tmpl.html +1 -1
  195. data/ext/sources/ggml/CMakeLists.txt +90 -56
  196. data/ext/sources/ggml/include/ggml-alloc.h +9 -0
  197. data/ext/sources/ggml/include/ggml-backend.h +5 -2
  198. data/ext/sources/ggml/include/ggml-cann.h +1 -1
  199. data/ext/sources/ggml/include/ggml-cpu.h +6 -0
  200. data/ext/sources/ggml/include/ggml-hexagon.h +19 -0
  201. data/ext/sources/ggml/include/ggml-openvino.h +37 -0
  202. data/ext/sources/ggml/include/ggml-opt.h +1 -1
  203. data/ext/sources/ggml/include/ggml-rpc.h +14 -12
  204. data/ext/sources/ggml/include/ggml-virtgpu.h +14 -0
  205. data/ext/sources/ggml/include/ggml-zendnn.h +22 -0
  206. data/ext/sources/ggml/include/ggml.h +246 -21
  207. data/ext/sources/ggml/src/CMakeLists.txt +85 -11
  208. data/ext/sources/ggml/src/ggml-alloc.c +128 -50
  209. data/ext/sources/ggml/src/ggml-backend-dl.cpp +48 -0
  210. data/ext/sources/ggml/src/ggml-backend-dl.h +45 -0
  211. data/ext/sources/ggml/src/ggml-backend-impl.h +1 -4
  212. data/ext/sources/ggml/src/ggml-backend-reg.cpp +54 -88
  213. data/ext/sources/ggml/src/ggml-backend.cpp +76 -23
  214. data/ext/sources/ggml/src/ggml-blas/CMakeLists.txt +18 -4
  215. data/ext/sources/ggml/src/ggml-blas/ggml-blas.cpp +11 -11
  216. data/ext/sources/ggml/src/ggml-cann/acl_tensor.cpp +58 -46
  217. data/ext/sources/ggml/src/ggml-cann/acl_tensor.h +139 -48
  218. data/ext/sources/ggml/src/ggml-cann/aclnn_ops.cpp +2427 -1785
  219. data/ext/sources/ggml/src/ggml-cann/aclnn_ops.h +238 -362
  220. data/ext/sources/ggml/src/ggml-cann/common.h +285 -211
  221. data/ext/sources/ggml/src/ggml-cann/ggml-cann.cpp +663 -831
  222. data/ext/sources/ggml/src/ggml-common.h +11 -0
  223. data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +170 -95
  224. data/ext/sources/ggml/src/ggml-cpu/amx/amx.cpp +42 -18
  225. data/ext/sources/ggml/src/ggml-cpu/amx/common.h +34 -10
  226. data/ext/sources/ggml/src/ggml-cpu/amx/mmq.cpp +85 -85
  227. data/ext/sources/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +4 -0
  228. data/ext/sources/ggml/src/ggml-cpu/arch/arm/quants.c +513 -27
  229. data/ext/sources/ggml/src/ggml-cpu/arch/arm/repack.cpp +4192 -992
  230. data/ext/sources/ggml/src/ggml-cpu/arch/loongarch/quants.c +4 -5
  231. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/cpu-feats.cpp +38 -0
  232. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/quants.c +1761 -49
  233. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/repack.cpp +1391 -0
  234. data/ext/sources/ggml/src/ggml-cpu/arch/s390/cpu-feats.cpp +50 -0
  235. data/ext/sources/ggml/src/ggml-cpu/arch/s390/quants.c +8 -10
  236. data/ext/sources/ggml/src/ggml-cpu/arch/x86/quants.c +9 -9
  237. data/ext/sources/ggml/src/ggml-cpu/arch/x86/repack.cpp +124 -24
  238. data/ext/sources/ggml/src/ggml-cpu/arch-fallback.h +157 -28
  239. data/ext/sources/ggml/src/ggml-cpu/binary-ops.cpp +2 -6
  240. data/ext/sources/ggml/src/ggml-cpu/common.h +8 -0
  241. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-impl.h +8 -3
  242. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +251 -80
  243. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.cpp +19 -0
  244. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.cpp +587 -119
  245. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.h +33 -44
  246. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +1093 -194
  247. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1284 -203
  248. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.h +6 -0
  249. data/ext/sources/ggml/src/ggml-cpu/ops.cpp +1519 -527
  250. data/ext/sources/ggml/src/ggml-cpu/ops.h +6 -4
  251. data/ext/sources/ggml/src/ggml-cpu/quants.c +40 -0
  252. data/ext/sources/ggml/src/ggml-cpu/quants.h +3 -0
  253. data/ext/sources/ggml/src/ggml-cpu/repack.cpp +3632 -781
  254. data/ext/sources/ggml/src/ggml-cpu/repack.h +129 -4
  255. data/ext/sources/ggml/src/ggml-cpu/simd-gemm.h +136 -0
  256. data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +152 -46
  257. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.cpp +3 -2
  258. data/ext/sources/ggml/src/ggml-cpu/unary-ops.cpp +152 -1
  259. data/ext/sources/ggml/src/ggml-cpu/unary-ops.h +7 -0
  260. data/ext/sources/ggml/src/ggml-cpu/vec.cpp +140 -0
  261. data/ext/sources/ggml/src/ggml-cpu/vec.h +261 -146
  262. data/ext/sources/ggml/src/ggml-cuda/CMakeLists.txt +72 -1
  263. data/ext/sources/ggml/src/ggml-cuda/argmax.cu +2 -2
  264. data/ext/sources/ggml/src/ggml-cuda/argsort.cu +132 -6
  265. data/ext/sources/ggml/src/ggml-cuda/argsort.cuh +16 -0
  266. data/ext/sources/ggml/src/ggml-cuda/binbcast.cu +33 -31
  267. data/ext/sources/ggml/src/ggml-cuda/common.cuh +474 -85
  268. data/ext/sources/ggml/src/ggml-cuda/convert.cu +41 -27
  269. data/ext/sources/ggml/src/ggml-cuda/convert.cuh +10 -0
  270. data/ext/sources/ggml/src/ggml-cuda/cpy-utils.cuh +1 -1
  271. data/ext/sources/ggml/src/ggml-cuda/cpy.cu +342 -246
  272. data/ext/sources/ggml/src/ggml-cuda/cpy.cuh +1 -5
  273. data/ext/sources/ggml/src/ggml-cuda/cumsum.cu +307 -0
  274. data/ext/sources/ggml/src/ggml-cuda/cumsum.cuh +5 -0
  275. data/ext/sources/ggml/src/ggml-cuda/diag.cu +77 -0
  276. data/ext/sources/ggml/src/ggml-cuda/diag.cuh +5 -0
  277. data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +98 -74
  278. data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +973 -665
  279. data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cu +35 -741
  280. data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cuh +1255 -0
  281. data/ext/sources/ggml/src/ggml-cuda/fattn-vec.cuh +33 -40
  282. data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +40 -18
  283. data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cuh +48 -0
  284. data/ext/sources/ggml/src/ggml-cuda/fattn.cu +206 -45
  285. data/ext/sources/ggml/src/ggml-cuda/fill.cu +37 -0
  286. data/ext/sources/ggml/src/ggml-cuda/fill.cuh +3 -0
  287. data/ext/sources/ggml/src/ggml-cuda/gated_delta_net.cu +263 -0
  288. data/ext/sources/ggml/src/ggml-cuda/gated_delta_net.cuh +4 -0
  289. data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +1688 -302
  290. data/ext/sources/ggml/src/ggml-cuda/mean.cu +12 -10
  291. data/ext/sources/ggml/src/ggml-cuda/mma.cuh +908 -48
  292. data/ext/sources/ggml/src/ggml-cuda/mmf.cu +88 -20
  293. data/ext/sources/ggml/src/ggml-cuda/mmf.cuh +502 -90
  294. data/ext/sources/ggml/src/ggml-cuda/mmid.cu +164 -0
  295. data/ext/sources/ggml/src/ggml-cuda/mmid.cuh +5 -0
  296. data/ext/sources/ggml/src/ggml-cuda/mmq.cu +69 -176
  297. data/ext/sources/ggml/src/ggml-cuda/mmq.cuh +532 -193
  298. data/ext/sources/ggml/src/ggml-cuda/mmvf.cu +460 -104
  299. data/ext/sources/ggml/src/ggml-cuda/mmvf.cuh +5 -2
  300. data/ext/sources/ggml/src/ggml-cuda/mmvq.cu +360 -122
  301. data/ext/sources/ggml/src/ggml-cuda/mmvq.cuh +2 -1
  302. data/ext/sources/ggml/src/ggml-cuda/norm.cu +18 -76
  303. data/ext/sources/ggml/src/ggml-cuda/pad.cu +73 -39
  304. data/ext/sources/ggml/src/ggml-cuda/quantize.cu +152 -1
  305. data/ext/sources/ggml/src/ggml-cuda/quantize.cuh +14 -0
  306. data/ext/sources/ggml/src/ggml-cuda/reduce_rows.cuh +2 -16
  307. data/ext/sources/ggml/src/ggml-cuda/rope.cu +364 -149
  308. data/ext/sources/ggml/src/ggml-cuda/rope.cuh +2 -0
  309. data/ext/sources/ggml/src/ggml-cuda/set-rows.cu +101 -47
  310. data/ext/sources/ggml/src/ggml-cuda/set.cu +39 -0
  311. data/ext/sources/ggml/src/ggml-cuda/set.cuh +7 -0
  312. data/ext/sources/ggml/src/ggml-cuda/softmax.cu +163 -41
  313. data/ext/sources/ggml/src/ggml-cuda/solve_tri.cu +275 -0
  314. data/ext/sources/ggml/src/ggml-cuda/solve_tri.cuh +3 -0
  315. data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cu +68 -50
  316. data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cuh +1 -1
  317. data/ext/sources/ggml/src/ggml-cuda/ssm-scan.cu +49 -84
  318. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_32.cu +5 -0
  319. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_4.cu +1 -0
  320. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_32.cu +5 -0
  321. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_4.cu +1 -0
  322. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_4.cu +1 -0
  323. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_4.cu +1 -0
  324. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq112-dv112.cu +5 -0
  325. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq128-dv128.cu +5 -0
  326. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq256-dv256.cu +5 -0
  327. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq40-dv40.cu +5 -0
  328. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq576-dv512.cu +5 -0
  329. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq64-dv64.cu +5 -0
  330. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq72-dv72.cu +5 -0
  331. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq80-dv80.cu +5 -0
  332. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq96-dv96.cu +5 -0
  333. data/ext/sources/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +22 -4
  334. data/ext/sources/ggml/src/ggml-cuda/top-k.cu +95 -0
  335. data/ext/sources/ggml/src/ggml-cuda/top-k.cuh +3 -0
  336. data/ext/sources/ggml/src/ggml-cuda/topk-moe.cu +275 -119
  337. data/ext/sources/ggml/src/ggml-cuda/topk-moe.cuh +20 -7
  338. data/ext/sources/ggml/src/ggml-cuda/tri.cu +136 -0
  339. data/ext/sources/ggml/src/ggml-cuda/tri.cuh +5 -0
  340. data/ext/sources/ggml/src/ggml-cuda/unary.cu +160 -11
  341. data/ext/sources/ggml/src/ggml-cuda/unary.cuh +38 -0
  342. data/ext/sources/ggml/src/ggml-cuda/upscale.cu +163 -7
  343. data/ext/sources/ggml/src/ggml-cuda/vecdotq.cuh +31 -17
  344. data/ext/sources/ggml/src/ggml-cuda/vendors/cuda.h +4 -0
  345. data/ext/sources/ggml/src/ggml-cuda/vendors/hip.h +22 -1
  346. data/ext/sources/ggml/src/ggml-cuda/vendors/musa.h +6 -0
  347. data/ext/sources/ggml/src/ggml-hexagon/CMakeLists.txt +117 -0
  348. data/ext/sources/ggml/src/ggml-hexagon/ggml-hexagon.cpp +3325 -0
  349. data/ext/sources/ggml/src/ggml-hexagon/htp/CMakeLists.txt +46 -0
  350. data/ext/sources/ggml/src/ggml-hexagon/htp/act-ops.c +813 -0
  351. data/ext/sources/ggml/src/ggml-hexagon/htp/argsort-ops.c +281 -0
  352. data/ext/sources/ggml/src/ggml-hexagon/htp/binary-ops.c +891 -0
  353. data/ext/sources/ggml/src/ggml-hexagon/htp/cmake-toolchain.cmake +157 -0
  354. data/ext/sources/ggml/src/ggml-hexagon/htp/cpy-ops.c +252 -0
  355. data/ext/sources/ggml/src/ggml-hexagon/htp/flash-attn-ops.c +713 -0
  356. data/ext/sources/ggml/src/ggml-hexagon/htp/get-rows-ops.c +112 -0
  357. data/ext/sources/ggml/src/ggml-hexagon/htp/hex-dma.c +63 -0
  358. data/ext/sources/ggml/src/ggml-hexagon/htp/hex-dma.h +182 -0
  359. data/ext/sources/ggml/src/ggml-hexagon/htp/hex-dump.h +77 -0
  360. data/ext/sources/ggml/src/ggml-hexagon/htp/hex-fastdiv.h +37 -0
  361. data/ext/sources/ggml/src/ggml-hexagon/htp/hex-utils.h +51 -0
  362. data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ctx.h +35 -0
  363. data/ext/sources/ggml/src/ggml-hexagon/htp/htp-msg.h +155 -0
  364. data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ops.h +63 -0
  365. data/ext/sources/ggml/src/ggml-hexagon/htp/htp_iface.idl +16 -0
  366. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-arith.h +443 -0
  367. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-base.h +240 -0
  368. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-copy.h +245 -0
  369. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-div.h +251 -0
  370. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-dump.h +129 -0
  371. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-exp.h +215 -0
  372. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-floor.h +100 -0
  373. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-inverse.h +210 -0
  374. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-reduce.h +296 -0
  375. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-scale.h +133 -0
  376. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sigmoid.h +141 -0
  377. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sqrt.h +126 -0
  378. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-types.h +36 -0
  379. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-utils.h +26 -0
  380. data/ext/sources/ggml/src/ggml-hexagon/htp/main.c +1199 -0
  381. data/ext/sources/ggml/src/ggml-hexagon/htp/matmul-ops.c +2670 -0
  382. data/ext/sources/ggml/src/ggml-hexagon/htp/rope-ops.c +497 -0
  383. data/ext/sources/ggml/src/ggml-hexagon/htp/set-rows-ops.c +168 -0
  384. data/ext/sources/ggml/src/ggml-hexagon/htp/softmax-ops.c +419 -0
  385. data/ext/sources/ggml/src/ggml-hexagon/htp/ssm-conv.c +339 -0
  386. data/ext/sources/ggml/src/ggml-hexagon/htp/sum-rows-ops.c +128 -0
  387. data/ext/sources/ggml/src/ggml-hexagon/htp/unary-ops.c +382 -0
  388. data/ext/sources/ggml/src/ggml-hexagon/htp/worker-pool.c +293 -0
  389. data/ext/sources/ggml/src/ggml-hexagon/htp/worker-pool.h +57 -0
  390. data/ext/sources/ggml/src/ggml-hexagon/htp-drv.cpp +418 -0
  391. data/ext/sources/ggml/src/ggml-hexagon/htp-drv.h +121 -0
  392. data/ext/sources/ggml/src/ggml-hexagon/libdl.h +79 -0
  393. data/ext/sources/ggml/src/ggml-hexagon/libggml-htp.inf +38 -0
  394. data/ext/sources/ggml/src/ggml-hexagon/op-desc.h +153 -0
  395. data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +14 -13
  396. data/ext/sources/ggml/src/ggml-impl.h +129 -6
  397. data/ext/sources/ggml/src/ggml-metal/CMakeLists.txt +10 -10
  398. data/ext/sources/ggml/src/ggml-metal/ggml-metal-common.cpp +15 -4
  399. data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.h +8 -0
  400. data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.m +173 -34
  401. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.cpp +912 -344
  402. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.h +124 -59
  403. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.m +588 -144
  404. data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +396 -23
  405. data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.cpp +1724 -421
  406. data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.h +16 -3
  407. data/ext/sources/ggml/src/ggml-metal/ggml-metal.cpp +333 -114
  408. data/ext/sources/ggml/src/ggml-metal/ggml-metal.metal +3050 -1539
  409. data/ext/sources/ggml/src/ggml-musa/CMakeLists.txt +3 -1
  410. data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +30 -1
  411. data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +4279 -497
  412. data/ext/sources/ggml/src/ggml-opencl/kernels/concat.cl +41 -99
  413. data/ext/sources/ggml/src/ggml-opencl/kernels/cpy.cl +45 -0
  414. data/ext/sources/ggml/src/ggml-opencl/kernels/cumsum.cl +139 -0
  415. data/ext/sources/ggml/src/ggml-opencl/kernels/cvt.cl +267 -0
  416. data/ext/sources/ggml/src/ggml-opencl/kernels/diag.cl +27 -0
  417. data/ext/sources/ggml/src/ggml-opencl/kernels/exp.cl +125 -0
  418. data/ext/sources/ggml/src/ggml-opencl/kernels/expm1.cl +113 -0
  419. data/ext/sources/ggml/src/ggml-opencl/kernels/fill.cl +17 -0
  420. data/ext/sources/ggml/src/ggml-opencl/kernels/flash_attn_f32.cl +4 -3
  421. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_mxfp4_f32.cl +162 -0
  422. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q4_1_f32.cl +132 -0
  423. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_mxfp4_f32.cl +156 -0
  424. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_general_q8_0_f32.cl +195 -0
  425. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q4_1_f32.cl +283 -0
  426. data/ext/sources/ggml/src/ggml-opencl/kernels/get_rows.cl +36 -12
  427. data/ext/sources/ggml/src/ggml-opencl/kernels/l2_norm.cl +71 -0
  428. data/ext/sources/ggml/src/ggml-opencl/kernels/mean.cl +140 -0
  429. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_kq_kqv.cl +273 -0
  430. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_l4_lm.cl +24 -10
  431. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_f32_f32_l4_lm.cl +24 -10
  432. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q4_0_f32_l4_lm.cl +163 -0
  433. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q4_1_f32_l4_lm.cl +165 -0
  434. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q6_k_f32_l4_lm.cl +158 -0
  435. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_8x4.cl +129 -0
  436. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_l4_lm.cl +154 -0
  437. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32.cl +219 -0
  438. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32_flat.cl +229 -0
  439. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_k_f32.cl +180 -0
  440. data/ext/sources/ggml/src/ggml-opencl/kernels/{mul_mv_q6_k.cl → mul_mv_q6_k_f32.cl} +4 -0
  441. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32_flat.cl +194 -0
  442. data/ext/sources/ggml/src/ggml-opencl/kernels/neg.cl +125 -0
  443. data/ext/sources/ggml/src/ggml-opencl/kernels/pad.cl +29 -20
  444. data/ext/sources/ggml/src/ggml-opencl/kernels/repeat.cl +31 -32
  445. data/ext/sources/ggml/src/ggml-opencl/kernels/rms_norm.cl +25 -10
  446. data/ext/sources/ggml/src/ggml-opencl/kernels/rope.cl +50 -24
  447. data/ext/sources/ggml/src/ggml-opencl/kernels/scale.cl +14 -4
  448. data/ext/sources/ggml/src/ggml-opencl/kernels/set_rows.cl +35 -16
  449. data/ext/sources/ggml/src/ggml-opencl/kernels/softplus.cl +116 -0
  450. data/ext/sources/ggml/src/ggml-opencl/kernels/solve_tri.cl +51 -0
  451. data/ext/sources/ggml/src/ggml-opencl/kernels/sqr.cl +53 -0
  452. data/ext/sources/ggml/src/ggml-opencl/kernels/sqrt.cl +53 -0
  453. data/ext/sources/ggml/src/ggml-opencl/kernels/ssm_conv.cl +77 -0
  454. data/ext/sources/ggml/src/ggml-opencl/kernels/sum_rows.cl +114 -13
  455. data/ext/sources/ggml/src/ggml-opencl/kernels/tanh.cl +94 -48
  456. data/ext/sources/ggml/src/ggml-opencl/kernels/transpose.cl +39 -0
  457. data/ext/sources/ggml/src/ggml-opencl/kernels/tri.cl +32 -0
  458. data/ext/sources/ggml/src/ggml-openvino/.clang-format +154 -0
  459. data/ext/sources/ggml/src/ggml-openvino/CMakeLists.txt +22 -0
  460. data/ext/sources/ggml/src/ggml-openvino/ggml-decoder.cpp +975 -0
  461. data/ext/sources/ggml/src/ggml-openvino/ggml-decoder.h +294 -0
  462. data/ext/sources/ggml/src/ggml-openvino/ggml-openvino-extra.cpp +373 -0
  463. data/ext/sources/ggml/src/ggml-openvino/ggml-openvino-extra.h +182 -0
  464. data/ext/sources/ggml/src/ggml-openvino/ggml-openvino.cpp +1110 -0
  465. data/ext/sources/ggml/src/ggml-openvino/ggml-quants.cpp +884 -0
  466. data/ext/sources/ggml/src/ggml-openvino/ggml-quants.h +153 -0
  467. data/ext/sources/ggml/src/ggml-openvino/openvino/decoder.h +74 -0
  468. data/ext/sources/ggml/src/ggml-openvino/openvino/frontend.cpp +27 -0
  469. data/ext/sources/ggml/src/ggml-openvino/openvino/frontend.h +23 -0
  470. data/ext/sources/ggml/src/ggml-openvino/openvino/input_model.cpp +17 -0
  471. data/ext/sources/ggml/src/ggml-openvino/openvino/input_model.h +29 -0
  472. data/ext/sources/ggml/src/ggml-openvino/openvino/node_context.h +112 -0
  473. data/ext/sources/ggml/src/ggml-openvino/openvino/op/cont.cpp +48 -0
  474. data/ext/sources/ggml/src/ggml-openvino/openvino/op/cpy.cpp +21 -0
  475. data/ext/sources/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp +90 -0
  476. data/ext/sources/ggml/src/ggml-openvino/openvino/op/get_rows.cpp +69 -0
  477. data/ext/sources/ggml/src/ggml-openvino/openvino/op/glu_geglu.cpp +61 -0
  478. data/ext/sources/ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp +62 -0
  479. data/ext/sources/ggml/src/ggml-openvino/openvino/op/mulmat.cpp +90 -0
  480. data/ext/sources/ggml/src/ggml-openvino/openvino/op/permute.cpp +102 -0
  481. data/ext/sources/ggml/src/ggml-openvino/openvino/op/reshape.cpp +83 -0
  482. data/ext/sources/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp +46 -0
  483. data/ext/sources/ggml/src/ggml-openvino/openvino/op/rope.cpp +123 -0
  484. data/ext/sources/ggml/src/ggml-openvino/openvino/op/scale.cpp +41 -0
  485. data/ext/sources/ggml/src/ggml-openvino/openvino/op/set_rows.cpp +76 -0
  486. data/ext/sources/ggml/src/ggml-openvino/openvino/op/softmax.cpp +89 -0
  487. data/ext/sources/ggml/src/ggml-openvino/openvino/op/transpose.cpp +23 -0
  488. data/ext/sources/ggml/src/ggml-openvino/openvino/op/unary_silu.cpp +27 -0
  489. data/ext/sources/ggml/src/ggml-openvino/openvino/op/view.cpp +53 -0
  490. data/ext/sources/ggml/src/ggml-openvino/openvino/op_table.cpp +46 -0
  491. data/ext/sources/ggml/src/ggml-openvino/openvino/op_table.h +39 -0
  492. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.cpp +123 -0
  493. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.h +17 -0
  494. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.cpp +60 -0
  495. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.h +17 -0
  496. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/mark_decompression_convert_constant_folding.h +29 -0
  497. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/squeeze_matmul.cpp +58 -0
  498. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/squeeze_matmul.h +17 -0
  499. data/ext/sources/ggml/src/ggml-openvino/openvino/translate_session.cpp +293 -0
  500. data/ext/sources/ggml/src/ggml-openvino/openvino/translate_session.h +28 -0
  501. data/ext/sources/ggml/src/ggml-openvino/openvino/utils.cpp +226 -0
  502. data/ext/sources/ggml/src/ggml-openvino/openvino/utils.h +85 -0
  503. data/ext/sources/ggml/src/ggml-openvino/utils.cpp +823 -0
  504. data/ext/sources/ggml/src/ggml-openvino/utils.h +123 -0
  505. data/ext/sources/ggml/src/ggml-quants.c +96 -5
  506. data/ext/sources/ggml/src/ggml-quants.h +3 -0
  507. data/ext/sources/ggml/src/ggml-rpc/ggml-rpc.cpp +438 -156
  508. data/ext/sources/ggml/src/ggml-sycl/CMakeLists.txt +59 -87
  509. data/ext/sources/ggml/src/ggml-sycl/add-id.cpp +81 -0
  510. data/ext/sources/ggml/src/ggml-sycl/add-id.hpp +8 -0
  511. data/ext/sources/ggml/src/ggml-sycl/backend.hpp +7 -0
  512. data/ext/sources/ggml/src/ggml-sycl/binbcast.cpp +21 -29
  513. data/ext/sources/ggml/src/ggml-sycl/binbcast.hpp +0 -6
  514. data/ext/sources/ggml/src/ggml-sycl/common.hpp +427 -20
  515. data/ext/sources/ggml/src/ggml-sycl/concat.cpp +55 -44
  516. data/ext/sources/ggml/src/ggml-sycl/convert.cpp +103 -1
  517. data/ext/sources/ggml/src/ggml-sycl/convert.hpp +22 -1
  518. data/ext/sources/ggml/src/ggml-sycl/count-equal.cpp +79 -0
  519. data/ext/sources/ggml/src/ggml-sycl/count-equal.hpp +9 -0
  520. data/ext/sources/ggml/src/ggml-sycl/cpy.cpp +0 -3
  521. data/ext/sources/ggml/src/ggml-sycl/dequantize.hpp +18 -0
  522. data/ext/sources/ggml/src/ggml-sycl/dpct/helper.hpp +867 -50
  523. data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +401 -358
  524. data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +12 -2
  525. data/ext/sources/ggml/src/ggml-sycl/fattn-common.hpp +1179 -0
  526. data/ext/sources/ggml/src/ggml-sycl/fattn-tile.cpp +55 -0
  527. data/ext/sources/ggml/src/ggml-sycl/fattn-tile.hpp +1338 -0
  528. data/ext/sources/ggml/src/ggml-sycl/fattn-vec.hpp +667 -0
  529. data/ext/sources/ggml/src/ggml-sycl/fattn.cpp +225 -0
  530. data/ext/sources/ggml/src/ggml-sycl/fattn.hpp +22 -0
  531. data/ext/sources/ggml/src/ggml-sycl/gated_delta_net.cpp +309 -0
  532. data/ext/sources/ggml/src/ggml-sycl/gated_delta_net.hpp +8 -0
  533. data/ext/sources/ggml/src/ggml-sycl/ggml-sycl.cpp +645 -155
  534. data/ext/sources/ggml/src/ggml-sycl/mmvq.cpp +22 -0
  535. data/ext/sources/ggml/src/ggml-sycl/norm.cpp +221 -66
  536. data/ext/sources/ggml/src/ggml-sycl/norm.hpp +2 -0
  537. data/ext/sources/ggml/src/ggml-sycl/outprod.cpp +3 -3
  538. data/ext/sources/ggml/src/ggml-sycl/pad.cpp +97 -0
  539. data/ext/sources/ggml/src/ggml-sycl/pad.hpp +24 -0
  540. data/ext/sources/ggml/src/ggml-sycl/pad_reflect_1d.cpp +100 -0
  541. data/ext/sources/ggml/src/ggml-sycl/pad_reflect_1d.hpp +10 -0
  542. data/ext/sources/ggml/src/ggml-sycl/presets.hpp +5 -0
  543. data/ext/sources/ggml/src/ggml-sycl/quants.hpp +1 -1
  544. data/ext/sources/ggml/src/ggml-sycl/repeat_back.cpp +76 -0
  545. data/ext/sources/ggml/src/ggml-sycl/repeat_back.hpp +8 -0
  546. data/ext/sources/ggml/src/ggml-sycl/roll.cpp +122 -0
  547. data/ext/sources/ggml/src/ggml-sycl/roll.hpp +20 -0
  548. data/ext/sources/ggml/src/ggml-sycl/rope.cpp +457 -281
  549. data/ext/sources/ggml/src/ggml-sycl/rope.hpp +6 -0
  550. data/ext/sources/ggml/src/ggml-sycl/set.cpp +73 -0
  551. data/ext/sources/ggml/src/ggml-sycl/set.hpp +5 -0
  552. data/ext/sources/ggml/src/ggml-sycl/softmax.cpp +327 -162
  553. data/ext/sources/ggml/src/ggml-sycl/softmax.hpp +4 -0
  554. data/ext/sources/ggml/src/ggml-sycl/ssm_conv.cpp +127 -0
  555. data/ext/sources/ggml/src/ggml-sycl/ssm_conv.hpp +5 -0
  556. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq112-dv112.cpp +5 -0
  557. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq128-dv128.cpp +5 -0
  558. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq256-dv256.cpp +5 -0
  559. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq40-dv40.cpp +5 -0
  560. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq576-dv512.cpp +5 -0
  561. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq64-dv64.cpp +5 -0
  562. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq72-dv72.cpp +5 -0
  563. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq80-dv80.cpp +5 -0
  564. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq96-dv96.cpp +5 -0
  565. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-f16.cpp +7 -0
  566. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_0.cpp +7 -0
  567. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_1.cpp +7 -0
  568. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_0.cpp +7 -0
  569. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_1.cpp +7 -0
  570. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q8_0.cpp +7 -0
  571. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-f16.cpp +7 -0
  572. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_0.cpp +7 -0
  573. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_1.cpp +7 -0
  574. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_0.cpp +7 -0
  575. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_1.cpp +7 -0
  576. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q8_0.cpp +7 -0
  577. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-f16.cpp +7 -0
  578. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_0.cpp +7 -0
  579. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_1.cpp +7 -0
  580. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_0.cpp +7 -0
  581. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_1.cpp +7 -0
  582. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q8_0.cpp +7 -0
  583. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-f16.cpp +7 -0
  584. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_0.cpp +7 -0
  585. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_1.cpp +7 -0
  586. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_0.cpp +7 -0
  587. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_1.cpp +7 -0
  588. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q8_0.cpp +7 -0
  589. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-f16.cpp +7 -0
  590. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_0.cpp +7 -0
  591. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_1.cpp +7 -0
  592. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_0.cpp +7 -0
  593. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_1.cpp +7 -0
  594. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q8_0.cpp +7 -0
  595. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-f16.cpp +7 -0
  596. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_0.cpp +7 -0
  597. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_1.cpp +7 -0
  598. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_0.cpp +7 -0
  599. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_1.cpp +7 -0
  600. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q8_0.cpp +7 -0
  601. data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +71 -0
  602. data/ext/sources/ggml/src/ggml-sycl/wkv.cpp +1 -1
  603. data/ext/sources/ggml/src/ggml-virtgpu/CMakeLists.txt +70 -0
  604. data/ext/sources/ggml/src/ggml-virtgpu/apir_cs_ggml-rpc-front.cpp +87 -0
  605. data/ext/sources/ggml/src/ggml-virtgpu/backend/CMakeLists.txt +21 -0
  606. data/ext/sources/ggml/src/ggml-virtgpu/backend/apir_cs_ggml-rpc-back.cpp +115 -0
  607. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-convert.h +13 -0
  608. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-backend.cpp +102 -0
  609. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer-type.cpp +105 -0
  610. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer.cpp +179 -0
  611. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-device.cpp +148 -0
  612. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched.cpp +51 -0
  613. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched.gen.h +73 -0
  614. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched.h +27 -0
  615. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-virgl-apir.h +32 -0
  616. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend.cpp +144 -0
  617. data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/api_remoting.h +95 -0
  618. data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_backend.gen.h +94 -0
  619. data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_backend.h +50 -0
  620. data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_cs.h +378 -0
  621. data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_cs_ggml.h +232 -0
  622. data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_cs_rpc.h +58 -0
  623. data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-buffer-type.cpp +81 -0
  624. data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-buffer.cpp +119 -0
  625. data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-device.cpp +158 -0
  626. data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-reg.cpp +213 -0
  627. data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend.cpp +69 -0
  628. data/ext/sources/ggml/src/ggml-virtgpu/ggml-remoting.h +71 -0
  629. data/ext/sources/ggml/src/ggml-virtgpu/ggmlremoting_functions.yaml +166 -0
  630. data/ext/sources/ggml/src/ggml-virtgpu/include/apir_hw.h +9 -0
  631. data/ext/sources/ggml/src/ggml-virtgpu/regenerate_remoting.py +333 -0
  632. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-apir.h +15 -0
  633. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-backend.cpp +58 -0
  634. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-buffer-type.cpp +110 -0
  635. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-buffer.cpp +173 -0
  636. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-device.cpp +192 -0
  637. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-impl.h +36 -0
  638. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward.gen.h +53 -0
  639. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-shm.cpp +98 -0
  640. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-shm.h +23 -0
  641. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-utils.cpp +179 -0
  642. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-utils.h +86 -0
  643. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu.cpp +544 -0
  644. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu.h +117 -0
  645. data/ext/sources/ggml/src/ggml-vulkan/CMakeLists.txt +39 -19
  646. data/ext/sources/ggml/src/ggml-vulkan/ggml-vulkan.cpp +5994 -3055
  647. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/abs.comp +21 -0
  648. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/acc.comp +18 -10
  649. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add.comp +2 -2
  650. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add1.comp +28 -0
  651. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add_id.comp +1 -1
  652. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/arange.comp +20 -0
  653. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argmax.comp +2 -2
  654. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp +33 -26
  655. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argsort_large.comp +114 -0
  656. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/ceil.comp +22 -0
  657. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/clamp.comp +2 -2
  658. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/concat.comp +2 -2
  659. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/contig_copy.comp +2 -2
  660. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_dw.comp +1 -1
  661. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +47 -49
  662. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +1 -1
  663. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy.comp +2 -2
  664. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +3 -3
  665. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +4 -4
  666. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_transpose.comp +67 -0
  667. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/cos.comp +2 -2
  668. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/count_equal.comp +2 -2
  669. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/count_experts.comp +51 -0
  670. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/cumsum.comp +83 -0
  671. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/cumsum_multipass1.comp +60 -0
  672. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/cumsum_multipass2.comp +66 -0
  673. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_f32.comp +1 -1
  674. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{dequant_funcs.comp → dequant_funcs.glsl} +9 -21
  675. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{dequant_funcs_cm2.comp → dequant_funcs_cm2.glsl} +18 -4
  676. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{dequant_head.comp → dequant_head.glsl} +1 -1
  677. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_m.comp +1 -1
  678. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_s.comp +1 -1
  679. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_s.comp +1 -1
  680. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xs.comp +1 -1
  681. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xxs.comp +1 -1
  682. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_s.comp +1 -1
  683. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_xxs.comp +1 -1
  684. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_nl.comp +1 -1
  685. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_xs.comp +1 -1
  686. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_mxfp4.comp +3 -3
  687. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q2_k.comp +3 -3
  688. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q3_k.comp +1 -1
  689. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_0.comp +1 -1
  690. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_1.comp +1 -1
  691. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_k.comp +3 -3
  692. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_0.comp +1 -1
  693. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_1.comp +1 -1
  694. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_k.comp +3 -3
  695. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q6_k.comp +1 -1
  696. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q8_0.comp +1 -1
  697. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/diag.comp +29 -0
  698. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/diag_mask_inf.comp +1 -1
  699. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/div.comp +2 -2
  700. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/elu.comp +27 -0
  701. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/exp.comp +3 -3
  702. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/fill.comp +19 -0
  703. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +386 -160
  704. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{flash_attn_base.comp → flash_attn_base.glsl} +82 -20
  705. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +400 -174
  706. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +123 -37
  707. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_mask_opt.comp +162 -0
  708. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +10 -9
  709. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/floor.comp +22 -0
  710. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gated_delta_net.comp +128 -0
  711. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu.comp +2 -2
  712. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu_erf.comp +2 -2
  713. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu_quick.comp +2 -2
  714. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gelu.comp +2 -2
  715. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gelu_erf.comp +2 -2
  716. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gelu_quick.comp +2 -2
  717. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{generic_binary_head.comp → generic_binary_head.glsl} +17 -2
  718. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{generic_head.comp → generic_head.glsl} +2 -0
  719. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{generic_unary_head.comp → generic_unary_head.glsl} +7 -0
  720. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/get_rows.comp +4 -4
  721. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/get_rows_quant.comp +3 -3
  722. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{glu_head.comp → glu_head.glsl} +1 -1
  723. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/group_norm.comp +2 -2
  724. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/hardsigmoid.comp +2 -2
  725. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/hardswish.comp +2 -2
  726. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +19 -7
  727. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col_3d.comp +2 -3
  728. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/l2_norm.comp +13 -10
  729. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/leaky_relu.comp +2 -2
  730. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/log.comp +18 -0
  731. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul.comp +2 -2
  732. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp +2 -2
  733. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{mul_mat_vec_base.comp → mul_mat_vec_base.glsl} +77 -29
  734. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iface.glsl +35 -0
  735. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_m.comp +71 -21
  736. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_s.comp +41 -25
  737. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_s.comp +2 -2
  738. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xs.comp +44 -26
  739. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xxs.comp +2 -2
  740. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_s.comp +2 -2
  741. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_xxs.comp +2 -2
  742. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_nc.comp +9 -7
  743. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_p021.comp +9 -7
  744. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp +4 -6
  745. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q3_k.comp +2 -2
  746. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp +4 -6
  747. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp +4 -6
  748. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp +2 -2
  749. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp +39 -36
  750. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq_funcs.glsl +494 -0
  751. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +88 -105
  752. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +41 -26
  753. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{mul_mm_funcs.comp → mul_mm_funcs.glsl} +69 -59
  754. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_id_funcs.glsl +74 -0
  755. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +92 -230
  756. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.glsl +454 -0
  757. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_shmem_types.glsl +78 -0
  758. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp +97 -13
  759. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/neg.comp +20 -0
  760. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/norm.comp +2 -2
  761. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_adamw.comp +2 -2
  762. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_sgd.comp +1 -1
  763. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/pad.comp +21 -6
  764. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/pool2d.comp +1 -1
  765. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/quantize_q8_1.comp +10 -10
  766. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/reglu.comp +2 -2
  767. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/relu.comp +2 -2
  768. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/repeat.comp +2 -2
  769. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/repeat_back.comp +2 -2
  770. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +49 -4
  771. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_back.comp +2 -2
  772. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_partials.comp +2 -2
  773. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/roll.comp +2 -2
  774. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl +207 -0
  775. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.glsl +20 -0
  776. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +8 -49
  777. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +8 -32
  778. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +8 -32
  779. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl +33 -0
  780. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_vision.comp +8 -38
  781. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/round.comp +29 -0
  782. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp +2 -2
  783. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sgn.comp +21 -0
  784. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sigmoid.comp +2 -2
  785. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/silu.comp +2 -2
  786. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/silu_back.comp +2 -2
  787. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sin.comp +2 -2
  788. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +1 -1
  789. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_back.comp +2 -2
  790. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large1.comp +62 -0
  791. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large2.comp +79 -0
  792. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large3.comp +65 -0
  793. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large_common.glsl +53 -0
  794. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/softplus.comp +23 -0
  795. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/solve_tri.comp +81 -0
  796. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sqrt.comp +2 -2
  797. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/square.comp +2 -2
  798. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/ssm_conv.comp +50 -0
  799. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/ssm_scan.comp +124 -0
  800. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/step.comp +22 -0
  801. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sub.comp +2 -2
  802. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.comp +2 -25
  803. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.glsl +25 -0
  804. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/swiglu.comp +2 -2
  805. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/swiglu_oai.comp +2 -2
  806. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/tanh.comp +2 -2
  807. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/timestep_embedding.comp +1 -1
  808. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/topk_argsort.comp +118 -0
  809. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/topk_moe.comp +213 -0
  810. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/topk_nary_search.comp +246 -0
  811. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/tri.comp +43 -0
  812. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/trunc.comp +22 -0
  813. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{types.comp → types.glsl} +345 -26
  814. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +90 -12
  815. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +384 -180
  816. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/xielu.comp +35 -0
  817. data/ext/sources/ggml/src/ggml-webgpu/CMakeLists.txt +28 -2
  818. data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp +1374 -0
  819. data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu.cpp +2544 -726
  820. data/ext/sources/ggml/src/ggml-webgpu/pre_wgsl.hpp +778 -0
  821. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/argmax.wgsl +72 -0
  822. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/argsort.wgsl +106 -0
  823. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/argsort_merge.wgsl +134 -0
  824. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary.wgsl +141 -0
  825. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +65 -72
  826. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/concat.wgsl +75 -0
  827. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cpy.tmpl.wgsl +107 -0
  828. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cumsum.wgsl +66 -0
  829. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +73 -15
  830. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn.wgsl +636 -0
  831. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{get_rows.tmpl.wgsl → get_rows.wgsl} +53 -259
  832. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/glu.tmpl.wgsl +323 -0
  833. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{mul_mat.tmpl.wgsl → mul_mat.wgsl} +72 -261
  834. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl +766 -0
  835. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.wgsl +147 -0
  836. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_subgroup_matrix.wgsl +196 -0
  837. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.wgsl +480 -0
  838. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/pad.wgsl +86 -0
  839. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/repeat.wgsl +67 -0
  840. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm.wgsl +83 -17
  841. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rope.tmpl.wgsl +295 -0
  842. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/scale.wgsl +63 -0
  843. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +40 -12
  844. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/soft_max.tmpl.wgsl +345 -0
  845. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/sum_rows.wgsl +55 -0
  846. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/unary.wgsl +193 -0
  847. data/ext/sources/ggml/src/ggml-zdnn/ggml-zdnn.cpp +6 -1
  848. data/ext/sources/ggml/src/ggml-zendnn/CMakeLists.txt +91 -0
  849. data/ext/sources/ggml/src/ggml-zendnn/ggml-zendnn.cpp +469 -0
  850. data/ext/sources/ggml/src/ggml.c +590 -64
  851. data/ext/sources/ggml/src/gguf.cpp +229 -44
  852. data/ext/sources/include/whisper.h +1 -0
  853. data/ext/sources/src/CMakeLists.txt +3 -1
  854. data/ext/sources/src/whisper.cpp +106 -62
  855. data/ext/sources/tests/CMakeLists.txt +2 -2
  856. data/ext/sources/tests/test-vad-full.cpp +4 -2
  857. data/ext/sources/tests/test-vad.cpp +1 -1
  858. data/extsources.rb +1 -0
  859. data/lib/whisper/model/uri.rb +17 -18
  860. data/sig/whisper.rbs +162 -4
  861. data/test/test_context_params.rb +82 -0
  862. data/test/test_params.rb +16 -8
  863. data/test/test_segment.rb +0 -1
  864. data/test/test_token.rb +81 -0
  865. data/test/test_vad.rb +1 -1
  866. data/test/test_vad_context.rb +100 -0
  867. data/test/test_vad_segment.rb +19 -0
  868. data/test/test_vad_segments.rb +16 -0
  869. data/test/test_whisper.rb +27 -0
  870. data/whispercpp.gemspec +1 -1
  871. metadata +502 -37
  872. data/ext/sources/build-xcframework.sh +0 -571
  873. data/ext/sources/examples/talk-llama/llama-sampling.h +0 -32
  874. data/ext/sources/ggml/cmake/BuildTypes.cmake +0 -54
  875. data/ext/sources/ggml/src/ggml-cann/Doxyfile +0 -2579
  876. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.comp +0 -105
  877. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.comp +0 -55
  878. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/add.tmpl.wgsl +0 -44
  879. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/add_in_place.tmpl.wgsl +0 -41
  880. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary_head.tmpl +0 -45
  881. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl +0 -60
  882. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul.tmpl.wgsl +0 -44
  883. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_in_place.tmpl.wgsl +0 -41
  884. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm_in_place.wgsl +0 -48
  885. /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{test_bfloat16_support.comp → feature-tests/bfloat16.comp} +0 -0
  886. /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{test_coopmat_support.comp → feature-tests/coopmat.comp} +0 -0
  887. /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{test_coopmat2_support.comp → feature-tests/coopmat2.comp} +0 -0
  888. /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{test_integer_dot_support.comp → feature-tests/integer_dot.comp} +0 -0
  889. /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{glu_main.comp → glu_main.glsl} +0 -0
  890. /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{rte.comp → rte.glsl} +0 -0
  891. /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{utils.comp → utils.glsl} +0 -0
@@ -0,0 +1,1199 @@
1
+ #pragma clang diagnostic ignored "-Wgnu-zero-variadic-macro-arguments"
2
+ #pragma clang diagnostic ignored "-Wunused-function"
3
+
4
+ #include <HAP_farf.h>
5
+ #include <HAP_perf.h>
6
+ #include <AEEStdErr.h>
7
+ #include <dspqueue.h>
8
+ #include <HAP_compute_res.h>
9
+ #include <HAP_etm_config.h>
10
+ #include <HAP_mem.h>
11
+ #include <HAP_power.h>
12
+ #include <HAP_ps.h>
13
+ #include <qurt.h>
14
+ #include <qurt_thread.h>
15
+ #include <remote.h>
16
+ #include <string.h>
17
+
18
+ #include "hex-dma.h"
19
+ #include "hex-utils.h"
20
+
21
+ #define GGML_COMMON_DECL_C
22
+ #include "ggml-common.h"
23
+ #include "htp-ctx.h"
24
+ #include "htp-msg.h"
25
+ #include "htp-ops.h"
26
+ #include "worker-pool.h"
27
+
28
+ AEEResult htp_iface_open(const char * uri, remote_handle64 * handle) {
29
+ struct htp_context * ctx;
30
+ int err = 0;
31
+
32
+ ctx = calloc(1, sizeof(*ctx));
33
+ if (ctx == NULL) {
34
+ return AEE_ENOMEMORY;
35
+ }
36
+
37
+ // Use the context structure as a handle
38
+ *handle = (remote_handle64) ctx;
39
+
40
+ // Enable FARF logs
41
+ HAP_setFARFRuntimeLoggingParams(0xffff, NULL, 0);
42
+
43
+ // Set client class
44
+ {
45
+ HAP_power_request_t request;
46
+ memset(&request, 0, sizeof(HAP_power_request_t));
47
+ request.type = HAP_power_set_apptype;
48
+ request.apptype = HAP_POWER_COMPUTE_CLIENT_CLASS;
49
+
50
+ if ((err = HAP_power_set((void *) ctx, &request)) != 0) {
51
+ return err;
52
+ }
53
+ }
54
+
55
+ {
56
+ HAP_power_request_t request;
57
+ memset(&request, 0, sizeof(request));
58
+
59
+ request.type = HAP_power_set_DCVS_v3;
60
+ request.dcvs_v3.set_dcvs_enable = TRUE;
61
+ request.dcvs_v3.dcvs_enable = TRUE;
62
+ request.dcvs_v3.dcvs_option = HAP_DCVS_V2_PERFORMANCE_MODE;
63
+ request.dcvs_v3.set_bus_params = TRUE;
64
+ request.dcvs_v3.bus_params.min_corner = HAP_DCVS_VCORNER_MAX;
65
+ request.dcvs_v3.bus_params.max_corner = HAP_DCVS_VCORNER_MAX;
66
+ request.dcvs_v3.bus_params.target_corner = HAP_DCVS_VCORNER_MAX;
67
+ request.dcvs_v3.set_core_params = TRUE;
68
+ request.dcvs_v3.core_params.min_corner = HAP_DCVS_VCORNER_MAX;
69
+ request.dcvs_v3.core_params.max_corner = HAP_DCVS_VCORNER_MAX;
70
+ request.dcvs_v3.core_params.target_corner = HAP_DCVS_VCORNER_MAX;
71
+ request.dcvs_v3.set_sleep_disable = TRUE;
72
+ request.dcvs_v3.sleep_disable = TRUE;
73
+ if ((err = HAP_power_set((void *) ctx, &request)) != 0) {
74
+ return err;
75
+ }
76
+
77
+ memset(&request, 0, sizeof(request));
78
+ request.type = HAP_power_set_HVX;
79
+ request.hvx.power_up = TRUE;
80
+ if ((err = HAP_power_set((void *) ctx, &request)) != 0) {
81
+ return err;
82
+ }
83
+ }
84
+
85
+ {
86
+ // Power on HMX
87
+ HAP_power_request_t request;
88
+ memset(&request, 0, sizeof(HAP_power_request_t));
89
+ request.type = HAP_power_set_HMX;
90
+ request.hmx.power_up = TRUE;
91
+ FARF(ALWAYS, "Powering HMX on\n");
92
+ err = HAP_power_set((void *) &ctx, &request);
93
+ if (err != AEE_SUCCESS) {
94
+ FARF(ERROR, "Error powering on HMX.");
95
+ return err;
96
+ }
97
+ }
98
+
99
+ return AEE_SUCCESS;
100
+ }
101
+
102
+ AEEResult htp_iface_close(remote_handle64 handle) {
103
+ struct htp_context * ctx = (struct htp_context *) handle;
104
+
105
+ if (!ctx) {
106
+ return AEE_EBADPARM;
107
+ }
108
+
109
+ if (ctx->queue) {
110
+ FARF(ERROR, "Closing handle with queue still open");
111
+ return AEE_EITEMBUSY;
112
+ }
113
+
114
+ free(ctx);
115
+ return AEE_SUCCESS;
116
+ }
117
+
118
+ AEEResult htp_iface_enable_etm(remote_handle64 handle) {
119
+ int err = HAP_user_etm_enable();
120
+ if (err) {
121
+ if (err == AEE_EVERSIONNOTSUPPORT) {
122
+ FARF(ERROR, "API HAP_user_etm_enable is not supported\n");
123
+ } else {
124
+ FARF(ERROR, "Error executing HAP_user_etm_enable with error code : 0x%x\n", err);
125
+ }
126
+ }
127
+ return err;
128
+ }
129
+
130
+ AEEResult htp_iface_disable_etm(remote_handle64 handle) {
131
+ int err = HAP_user_etm_disable();
132
+ if (err) {
133
+ if (err == AEE_EVERSIONNOTSUPPORT) {
134
+ FARF(ERROR, "API HAP_user_etm_disable is not supported\n");
135
+ } else {
136
+ FARF(ERROR, "Error executing HAP_user_etm_disable with error code : 0x%x\n", err);
137
+ }
138
+ }
139
+ return err;
140
+ }
141
+
142
+ static int vtcm_acquire(struct htp_context * ctx) {
143
+ int err;
144
+ if (!ctx->vtcm_valid) {
145
+ // Temporarily bump thread priority to make sure it's higher than other sessions.
146
+ // This way the resource manager will notify the other thread to release VTCM.
147
+ // Note that we need to reaquire VTCM at normal priority for this to work next time.
148
+ qurt_thread_set_priority(qurt_thread_get_id(), ctx->thread_prio - 10);
149
+ err = HAP_compute_res_acquire_cached(ctx->vtcm_rctx, 1000000);
150
+ if (err != 0) {
151
+ FARF(ERROR, "Failed to acquire VTCM: 0x%08x", (unsigned)err);
152
+ abort();
153
+ }
154
+ HAP_compute_res_release_cached(ctx->vtcm_rctx);
155
+ qurt_thread_set_priority(qurt_thread_get_id(), ctx->thread_prio);
156
+
157
+ err = HAP_compute_res_acquire_cached(ctx->vtcm_rctx, 1000000);
158
+ if (err != 0) {
159
+ FARF(ERROR, "Failed to acquire VTCM: 0x%08x", (unsigned)err);
160
+ abort();
161
+ }
162
+ ctx->vtcm_valid = true;
163
+ }
164
+
165
+ ctx->vtcm_inuse = true;
166
+ return 0;
167
+ }
168
+
169
+ static int vtcm_release(struct htp_context * ctx) {
170
+ ctx->vtcm_inuse = false;
171
+
172
+ if (ctx->vtcm_valid && ctx->vtcm_needs_release) {
173
+ ctx->vtcm_valid = false;
174
+ ctx->vtcm_needs_release = false;
175
+ HAP_compute_res_release_cached(ctx->vtcm_rctx);
176
+ }
177
+
178
+ return 0;
179
+ }
180
+
181
+ static int vtcm_release_callback(unsigned int rctx, void * state) {
182
+ struct htp_context * ctx = (struct htp_context *) state;
183
+
184
+ if (!ctx || ctx->vtcm_rctx != rctx) {
185
+ return AEE_EBADPARM;
186
+ }
187
+
188
+ // If VTCM is not inuse (not processing Ops) release it right here
189
+ // otherwise we'll release it once we're done with the current Op.
190
+
191
+ if (ctx->vtcm_inuse) {
192
+ ctx->vtcm_needs_release = true;
193
+ return 0;
194
+ }
195
+
196
+ ctx->vtcm_valid = false;
197
+ HAP_compute_res_release_cached(ctx->vtcm_rctx);
198
+
199
+ return 0;
200
+ }
201
+
202
+ static int vtcm_alloc(struct htp_context * ctx) {
203
+ unsigned int vtcm_size = 8 * 1024 * 1024; // 8MB default
204
+ HAP_compute_res_query_VTCM(0, &vtcm_size, NULL, NULL, NULL);
205
+
206
+ compute_res_attr_t attr;
207
+ HAP_compute_res_attr_init(&attr);
208
+ HAP_compute_res_attr_set_serialize(&attr, 0);
209
+ HAP_compute_res_attr_set_cache_mode(&attr, 1);
210
+ HAP_compute_res_attr_set_vtcm_param_v2(&attr, vtcm_size, 0, vtcm_size);
211
+ HAP_compute_res_attr_set_release_callback(&attr, vtcm_release_callback, (void *) ctx);
212
+ HAP_compute_res_attr_set_hmx_param(&attr, 1);
213
+
214
+ // Allocate VTCM for scratch pads
215
+ uint32_t rctx = HAP_compute_res_acquire(&attr, 1000000 /* timeout */);
216
+ if (!rctx) {
217
+ FARF(ERROR, "failed to allocate %zu bytes VTCM\n", ctx->vtcm_size);
218
+ return AEE_ENOMEMORY;
219
+ }
220
+
221
+ void * vtcm_ptr;
222
+ if (HAP_compute_res_attr_get_vtcm_ptr_v2(&attr, &vtcm_ptr, &vtcm_size) != 0) {
223
+ HAP_compute_res_release(rctx);
224
+ FARF(ERROR, "failed to allocate %zu bytes VTCM (new)\n", ctx->vtcm_size);
225
+ return AEE_ENOMEMORY;
226
+ }
227
+
228
+ ctx->vtcm_base = (uint8_t *) vtcm_ptr;
229
+ ctx->vtcm_size = vtcm_size;
230
+ ctx->vtcm_rctx = rctx;
231
+ ctx->vtcm_valid = false;
232
+ ctx->vtcm_inuse = false;
233
+ ctx->vtcm_needs_release = false;
234
+
235
+ return 0;
236
+ }
237
+
238
+ static void vtcm_free(struct htp_context * ctx) {
239
+ if (ctx->vtcm_rctx) {
240
+ HAP_compute_res_release(ctx->vtcm_rctx);
241
+ ctx->vtcm_base = 0;
242
+ ctx->vtcm_rctx = 0;
243
+ }
244
+ }
245
+
246
+ static void htp_packet_callback(dspqueue_t queue, int error, void * context);
247
+ static void htp_error_callback(dspqueue_t queue, int error, void * context);
248
+
249
+ AEEResult htp_iface_start(remote_handle64 handle, uint32 sess_id, uint64 dsp_queue_id, uint32 n_hvx) {
250
+ struct htp_context * ctx = (struct htp_context *) handle;
251
+
252
+ if (!ctx) {
253
+ return AEE_EBADPARM;
254
+ }
255
+
256
+ if (ctx->queue) {
257
+ FARF(ERROR, "Queue already open");
258
+ return AEE_EITEMBUSY;
259
+ }
260
+
261
+ // Import queue created on the CPU
262
+ int err = dspqueue_import(dsp_queue_id, // Queue ID from dspqueue_export
263
+ htp_packet_callback, // Packet callback
264
+ htp_error_callback, // Error callback; no errors expected on the DSP
265
+ (void *) ctx, // Callback context
266
+ &ctx->queue);
267
+
268
+ if (err) {
269
+ FARF(ERROR, "Queue import failed with 0x%08x", (unsigned) err);
270
+ return err;
271
+ }
272
+
273
+ ctx->thread_id = qurt_thread_get_id();
274
+ ctx->thread_prio = qurt_thread_get_priority(ctx->thread_id);
275
+
276
+ // allocate VTCM
277
+ err = vtcm_alloc(ctx);
278
+ if (err != AEE_SUCCESS) {
279
+ FARF(ERROR, "Unable to allocate VTCM");
280
+ return AEE_ENOMEMORY;
281
+ }
282
+
283
+ qurt_sysenv_max_hthreads_t hw_threads;
284
+ qurt_sysenv_get_max_hw_threads(&hw_threads);
285
+ uint32_t hw_nhvx = (qurt_hvx_get_units() >> 8) & 0xFF;
286
+
287
+ if (n_hvx == 0) {
288
+ n_hvx = hw_nhvx;
289
+ }
290
+ if (n_hvx > hw_threads.max_hthreads) {
291
+ n_hvx = hw_threads.max_hthreads;
292
+ }
293
+ if (n_hvx > HTP_MAX_NTHREADS) {
294
+ n_hvx = HTP_MAX_NTHREADS;
295
+ }
296
+
297
+ ctx->n_threads = n_hvx;
298
+ for (int i = 0; i < ctx->n_threads; i++) {
299
+ // see discussion https://github.com/ggml-org/llama.cpp/pull/18151#discussion_r2632388541
300
+ ctx->dma[i] = dma_queue_create(64);
301
+ }
302
+
303
+ // init worker pool
304
+ err = worker_pool_init(&ctx->worker_pool, n_hvx);
305
+ if (err != AEE_SUCCESS) {
306
+ FARF(ERROR, "Unable to create worker pool");
307
+ return err;
308
+ }
309
+
310
+ FARF(HIGH, "session %u started: n-hvx %u vtcm-size %zu vtcm-rctx %u n-threads %u thread-id %d thread-prio %d \n",
311
+ sess_id, hw_nhvx, ctx->vtcm_size, ctx->vtcm_rctx, ctx->n_threads, ctx->thread_id, ctx->thread_prio);
312
+
313
+ return AEE_SUCCESS;
314
+ }
315
+
316
+ AEEResult htp_iface_stop(remote_handle64 handle) {
317
+ struct htp_context * ctx = (struct htp_context *) handle;
318
+ if (!ctx) {
319
+ return AEE_EBADPARM;
320
+ }
321
+
322
+ if (!ctx->queue) {
323
+ FARF(ERROR, "Queue not open");
324
+ return AEE_EBADSTATE;
325
+ }
326
+
327
+ // Close queue. dspqueue_close() will also wait for callbacks to finish.
328
+ int err = dspqueue_close(ctx->queue);
329
+ ctx->queue = NULL;
330
+ if (err != 0) {
331
+ FARF(ERROR, "Queue close failed with 0x%08x", (unsigned) err);
332
+ return err;
333
+ }
334
+
335
+ if (ctx->worker_pool) {
336
+ // Release worker pool
337
+ worker_pool_release(&ctx->worker_pool);
338
+ }
339
+
340
+ for (int i = 0; i < ctx->n_threads; i++) {
341
+ dma_queue_delete(ctx->dma[i]);
342
+ }
343
+
344
+ vtcm_free(ctx);
345
+
346
+ return AEE_SUCCESS;
347
+ }
348
+
349
+ static void htp_error_callback(dspqueue_t queue, int error, void * context) {
350
+ // No errors expected on the DSP.
351
+ FARF(ERROR, "Error callback: 0x%08x", (unsigned) error);
352
+ }
353
+
354
// Per-request profiling counters. profile_start() stores the starting
// readings; profile_stop() replaces them with the elapsed amounts.
struct profile_data {
    uint64_t usecs;   // qtimer count at start, elapsed microseconds after stop
    uint64_t cycles;  // hex_get_cycles() reading, then the delta
    uint64_t pkts;    // hex_get_pktcnt() reading, then the delta
};
359
+
360
+ static inline void profile_start(struct profile_data * d) {
361
+ d->usecs = HAP_perf_get_qtimer_count();
362
+ d->cycles = hex_get_cycles();
363
+ d->pkts = hex_get_pktcnt();
364
+ }
365
+
366
+ static inline void profile_stop(struct profile_data * d) {
367
+ d->usecs = HAP_perf_qtimer_count_to_us(HAP_perf_get_qtimer_count() - d->usecs);
368
+ d->cycles = hex_get_cycles() - d->cycles;
369
+ d->pkts = hex_get_pktcnt() - d->pkts;
370
+ }
371
+
372
+ static int send_htp_rsp(struct htp_context * c,
373
+ uint32_t op,
374
+ uint32_t status,
375
+ struct dspqueue_buffer * bufs,
376
+ size_t n_bufs,
377
+ struct profile_data * prof) {
378
+ // Prep response struct
379
+ struct htp_general_rsp rsp;
380
+ rsp.op = op;
381
+ rsp.status = status;
382
+ rsp.prof_usecs = prof->usecs;
383
+ rsp.prof_cycles = prof->cycles;
384
+ rsp.prof_pkts = prof->pkts;
385
+
386
+ int err = dspqueue_write(c->queue,
387
+ 0, // Flags
388
+ n_bufs,
389
+ bufs, // Buffer references
390
+ sizeof(rsp),
391
+ (const uint8_t *) &rsp, // Message
392
+ DSPQUEUE_TIMEOUT_NONE);
393
+
394
+ if (err != 0) {
395
+ FARF(ERROR, "dspqueue_write failed: 0x%08x", (unsigned) err);
396
+ }
397
+
398
+ return err;
399
+ }
400
+
401
+ static void proc_matmul_req(struct htp_context * ctx,
402
+ struct htp_general_req * req,
403
+ struct dspqueue_buffer * bufs,
404
+ size_t n_bufs) {
405
+ struct dspqueue_buffer rsp_bufs[1];
406
+
407
+ // We had written to the output buffer, we'd also need to flush it
408
+ rsp_bufs[0].fd = bufs[2].fd;
409
+ rsp_bufs[0].ptr = bufs[2].ptr;
410
+ rsp_bufs[0].size = bufs[2].size;
411
+ rsp_bufs[0].offset = bufs[2].offset;
412
+ rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP
413
+ DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU
414
+
415
+ // Setup Op context
416
+ struct htp_ops_context octx = { 0 };
417
+ octx.ctx = ctx;
418
+ octx.src0 = req->src0;
419
+ octx.src1 = req->src1;
420
+ octx.dst = req->dst;
421
+ octx.flags = req->flags;
422
+ octx.op = req->op;
423
+
424
+ // Update data pointers
425
+ octx.src0.data = (uint32_t) bufs[0].ptr;
426
+ octx.src1.data = (uint32_t) bufs[1].ptr;
427
+ octx.dst.data = (uint32_t) bufs[2].ptr;
428
+ octx.n_threads = ctx->n_threads;
429
+
430
+ struct profile_data prof;
431
+ profile_start(&prof);
432
+
433
+ uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR;
434
+ if (vtcm_acquire(ctx) == AEE_SUCCESS) {
435
+ rsp_status = op_matmul(&octx);
436
+ vtcm_release(ctx);
437
+ }
438
+
439
+ profile_stop(&prof);
440
+ send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 1, &prof);
441
+ }
442
+
443
+ static void proc_argsort_req(struct htp_context * ctx, struct htp_general_req * req, struct dspqueue_buffer * bufs) {
444
+ struct dspqueue_buffer rsp_bufs[1];
445
+
446
+ // We had written to the output buffer, we'd also need to flush it
447
+ rsp_bufs[0].fd = bufs[1].fd;
448
+ rsp_bufs[0].ptr = bufs[1].ptr;
449
+ rsp_bufs[0].offset = bufs[1].offset;
450
+ rsp_bufs[0].size = bufs[1].size;
451
+ rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP
452
+ DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU
453
+
454
+ // Setup Op context
455
+ struct htp_ops_context octx = { 0 };
456
+ octx.ctx = ctx;
457
+ octx.src0 = req->src0;
458
+ octx.dst = req->dst;
459
+ octx.flags = req->flags;
460
+ octx.op = req->op;
461
+
462
+ memcpy(octx.op_params, req->op_params, sizeof(octx.op_params));
463
+
464
+ // Update data pointers
465
+ octx.src0.data = (uint32_t) bufs[0].ptr;
466
+ octx.dst.data = (uint32_t) bufs[1].ptr;
467
+ octx.n_threads = ctx->n_threads;
468
+
469
+ struct profile_data prof;
470
+ profile_start(&prof);
471
+
472
+ uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR;
473
+ if (vtcm_acquire(ctx) == AEE_SUCCESS) {
474
+ rsp_status = op_argsort(&octx);
475
+ vtcm_release(ctx);
476
+ }
477
+
478
+ profile_stop(&prof);
479
+ send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 1, &prof);
480
+ }
481
+
482
+ static void proc_cpy_req(struct htp_context * ctx, struct htp_general_req * req, struct dspqueue_buffer * bufs) {
483
+ struct dspqueue_buffer rsp_bufs[1];
484
+
485
+ // We had written to the output buffer, we'd also need to flush it
486
+ rsp_bufs[0].fd = bufs[1].fd;
487
+ rsp_bufs[0].ptr = bufs[1].ptr;
488
+ rsp_bufs[0].offset = bufs[1].offset;
489
+ rsp_bufs[0].size = bufs[1].size;
490
+ rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP
491
+ DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU
492
+
493
+ // Setup Op context
494
+ struct htp_ops_context octx = { 0 };
495
+ octx.ctx = ctx;
496
+ octx.src0 = req->src0;
497
+ octx.dst = req->dst;
498
+ octx.flags = req->flags;
499
+ octx.op = req->op;
500
+
501
+ // Update data pointers
502
+ octx.src0.data = (uint32_t) bufs[0].ptr;
503
+ octx.dst.data = (uint32_t) bufs[1].ptr;
504
+ octx.n_threads = ctx->n_threads;
505
+
506
+ struct profile_data prof;
507
+ profile_start(&prof);
508
+
509
+ uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR;
510
+ if (vtcm_acquire(ctx) == AEE_SUCCESS) {
511
+ rsp_status = op_cpy(&octx);
512
+ vtcm_release(ctx);
513
+ }
514
+
515
+ profile_stop(&prof);
516
+ send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 1, &prof);
517
+ }
518
+
519
+ static void proc_get_rows_req(struct htp_context * ctx, struct htp_general_req * req, struct dspqueue_buffer * bufs) {
520
+ struct dspqueue_buffer rsp_bufs[1];
521
+
522
+ // We had written to the output buffer, we'd also need to flush it
523
+ rsp_bufs[0].fd = bufs[2].fd;
524
+ rsp_bufs[0].ptr = bufs[2].ptr;
525
+ rsp_bufs[0].offset = bufs[2].offset;
526
+ rsp_bufs[0].size = bufs[2].size;
527
+ rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP
528
+ DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU
529
+
530
+ // Setup Op context
531
+ struct htp_ops_context octx = { 0 };
532
+ octx.ctx = ctx;
533
+ octx.src0 = req->src0;
534
+ octx.src1 = req->src1;
535
+ octx.dst = req->dst;
536
+ octx.flags = req->flags;
537
+ octx.op = req->op;
538
+
539
+ // Update data pointers
540
+ octx.src0.data = (uint32_t) bufs[0].ptr;
541
+ octx.src1.data = (uint32_t) bufs[1].ptr;
542
+ octx.dst.data = (uint32_t) bufs[2].ptr;
543
+ octx.n_threads = ctx->n_threads;
544
+
545
+ struct profile_data prof;
546
+ profile_start(&prof);
547
+
548
+ uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR;
549
+ if (vtcm_acquire(ctx) == AEE_SUCCESS) {
550
+ rsp_status = op_get_rows(&octx);
551
+ vtcm_release(ctx);
552
+ }
553
+
554
+ profile_stop(&prof);
555
+ send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 1, &prof);
556
+ }
557
+
558
+ static void proc_matmul_id_req(struct htp_context * ctx,
559
+ struct htp_general_req * req,
560
+ struct dspqueue_buffer * bufs,
561
+ size_t n_bufs) {
562
+ struct dspqueue_buffer rsp_bufs[1];
563
+
564
+ // We had written to the output buffer, we'd also need to flush it
565
+ rsp_bufs[0].fd = bufs[3].fd;
566
+ rsp_bufs[0].ptr = bufs[3].ptr;
567
+ rsp_bufs[0].size = bufs[3].size;
568
+ rsp_bufs[0].offset = bufs[3].offset;
569
+ rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP
570
+ DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU
571
+
572
+ // Setup Op context
573
+ struct htp_ops_context octx = { 0 };
574
+ octx.ctx = ctx;
575
+ octx.src0 = req->src0;
576
+ octx.src1 = req->src1;
577
+ octx.src2 = req->src2;
578
+ octx.dst = req->dst;
579
+ octx.flags = req->flags;
580
+ octx.op = req->op;
581
+
582
+ // Update data pointers
583
+ octx.src0.data = (uint32_t) bufs[0].ptr;
584
+ octx.src1.data = (uint32_t) bufs[1].ptr;
585
+ octx.src2.data = (uint32_t) bufs[2].ptr;
586
+ octx.dst.data = (uint32_t) bufs[3].ptr;
587
+ octx.n_threads = ctx->n_threads;
588
+
589
+ struct profile_data prof;
590
+ profile_start(&prof);
591
+
592
+ uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR;
593
+ if (vtcm_acquire(ctx) == AEE_SUCCESS) {
594
+ rsp_status = op_matmul_id(&octx);
595
+ vtcm_release(ctx);
596
+ }
597
+
598
+ profile_stop(&prof);
599
+ send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 1, &prof);
600
+ }
601
+
602
+ static void proc_binary_req(struct htp_context * ctx, struct htp_general_req * req, struct dspqueue_buffer * bufs) {
603
+ struct dspqueue_buffer rsp_bufs[1];
604
+
605
+ // We had written to the output buffer, we'd also need to flush it
606
+ rsp_bufs[0].fd = bufs[2].fd;
607
+ rsp_bufs[0].ptr = bufs[2].ptr;
608
+ rsp_bufs[0].offset = bufs[2].offset;
609
+ rsp_bufs[0].size = bufs[2].size;
610
+ rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP
611
+ DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU
612
+
613
+ // Setup Op context
614
+ struct htp_ops_context octx = { 0 };
615
+ octx.ctx = ctx;
616
+ octx.src0 = req->src0;
617
+ octx.src1 = req->src1;
618
+ octx.dst = req->dst;
619
+ octx.flags = req->flags;
620
+ octx.op = req->op;
621
+
622
+ // Update data pointers
623
+ octx.src0.data = (uint32_t) bufs[0].ptr;
624
+ octx.src1.data = (uint32_t) bufs[1].ptr;
625
+ octx.dst.data = (uint32_t) bufs[2].ptr;
626
+ octx.n_threads = ctx->n_threads;
627
+
628
+ struct profile_data prof;
629
+ profile_start(&prof);
630
+
631
+ uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR;
632
+ if (vtcm_acquire(ctx) == AEE_SUCCESS) {
633
+ rsp_status = op_binary(&octx);
634
+ vtcm_release(ctx);
635
+ }
636
+
637
+ profile_stop(&prof);
638
+ send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 1, &prof);
639
+ }
640
+
641
+ static void proc_add_id_req(struct htp_context * ctx, struct htp_general_req * req, struct dspqueue_buffer * bufs) {
642
+ struct dspqueue_buffer rsp_bufs[1];
643
+
644
+ // We had written to the output buffer, we'd also need to flush it
645
+ rsp_bufs[0].fd = bufs[3].fd;
646
+ rsp_bufs[0].ptr = bufs[3].ptr;
647
+ rsp_bufs[0].offset = bufs[3].offset;
648
+ rsp_bufs[0].size = bufs[3].size;
649
+ rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP
650
+ DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU
651
+
652
+ // Setup Op context
653
+ struct htp_ops_context octx = { 0 };
654
+ octx.ctx = ctx;
655
+ octx.src0 = req->src0;
656
+ octx.src1 = req->src1;
657
+ octx.src2 = req->src2;
658
+ octx.dst = req->dst;
659
+ octx.flags = req->flags;
660
+ octx.op = req->op;
661
+
662
+ // Update data pointers
663
+ octx.src0.data = (uint32_t) bufs[0].ptr;
664
+ octx.src1.data = (uint32_t) bufs[1].ptr;
665
+ octx.src2.data = (uint32_t) bufs[2].ptr;
666
+ octx.dst.data = (uint32_t) bufs[3].ptr;
667
+ octx.n_threads = ctx->n_threads;
668
+
669
+ struct profile_data prof;
670
+ profile_start(&prof);
671
+
672
+ uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR;
673
+ if (vtcm_acquire(ctx) == AEE_SUCCESS) {
674
+ rsp_status = op_binary(&octx);
675
+ vtcm_release(ctx);
676
+ }
677
+
678
+ profile_stop(&prof);
679
+ send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 1, &prof);
680
+ }
681
+
682
+ static void proc_unary_req(struct htp_context * ctx, struct htp_general_req * req, struct dspqueue_buffer * bufs) {
683
+ struct dspqueue_buffer rsp_bufs[HTP_MAX_PACKET_BUFFERS];
684
+
685
+ // We had written to the output buffer, we'd also need to flush it
686
+ rsp_bufs[0].fd = bufs[1].fd;
687
+ rsp_bufs[0].ptr = bufs[1].ptr;
688
+ rsp_bufs[0].offset = bufs[1].offset;
689
+ rsp_bufs[0].size = bufs[1].size;
690
+ rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP
691
+ DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU
692
+
693
+ // Setup Op context
694
+ struct htp_ops_context octx = { 0 };
695
+ octx.ctx = ctx;
696
+ octx.src0 = req->src0;
697
+ octx.dst = req->dst;
698
+ octx.flags = req->flags;
699
+ octx.op = req->op;
700
+
701
+ memcpy(octx.op_params, req->op_params, sizeof(octx.op_params));
702
+
703
+ // Update data pointers
704
+ octx.src0.data = (uint32_t) bufs[0].ptr;
705
+ octx.dst.data = (uint32_t) bufs[1].ptr;
706
+ octx.n_threads = ctx->n_threads;
707
+
708
+ struct profile_data prof;
709
+ profile_start(&prof);
710
+
711
+ uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR;
712
+ if (vtcm_acquire(ctx) == AEE_SUCCESS) {
713
+ rsp_status = op_unary(&octx);
714
+ vtcm_release(ctx);
715
+ }
716
+
717
+ profile_stop(&prof);
718
+ send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 1, &prof);
719
+ }
720
+
721
+ static void proc_sum_rows_req(struct htp_context * ctx, struct htp_general_req * req, struct dspqueue_buffer * bufs) {
722
+ struct dspqueue_buffer rsp_bufs[HTP_MAX_PACKET_BUFFERS];
723
+
724
+ // We had written to the output buffer, we'd also need to flush it
725
+ rsp_bufs[0].fd = bufs[1].fd;
726
+ rsp_bufs[0].ptr = bufs[1].ptr;
727
+ rsp_bufs[0].offset = bufs[1].offset;
728
+ rsp_bufs[0].size = bufs[1].size;
729
+ rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP
730
+ DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU
731
+
732
+ // Setup Op context
733
+ struct htp_ops_context octx = { 0 };
734
+ octx.ctx = ctx;
735
+ octx.src0 = req->src0;
736
+ octx.dst = req->dst;
737
+ octx.flags = req->flags;
738
+ octx.op = req->op;
739
+
740
+ memcpy(octx.op_params, req->op_params, sizeof(octx.op_params));
741
+
742
+ // Update data pointers
743
+ octx.src0.data = (uint32_t) bufs[0].ptr;
744
+ octx.dst.data = (uint32_t) bufs[1].ptr;
745
+ octx.n_threads = ctx->n_threads;
746
+
747
+ struct profile_data prof;
748
+ profile_start(&prof);
749
+
750
+ uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR;
751
+ if (vtcm_acquire(ctx) == AEE_SUCCESS) {
752
+ rsp_status = op_sum_rows(&octx);
753
+ vtcm_release(ctx);
754
+ }
755
+
756
+ profile_stop(&prof);
757
+ send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 1, &prof);
758
+ }
759
+
760
+ static void proc_ssm_conv_req(struct htp_context * ctx, struct htp_general_req * req, struct dspqueue_buffer * bufs) {
761
+ struct dspqueue_buffer rsp_bufs[HTP_MAX_PACKET_BUFFERS];
762
+
763
+ // We've written to the output buffer, we'd also need to flush it
764
+ rsp_bufs[0].fd = bufs[2].fd;
765
+ rsp_bufs[0].ptr = bufs[2].ptr;
766
+ rsp_bufs[0].offset = bufs[2].offset;
767
+ rsp_bufs[0].size = bufs[2].size;
768
+ rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP
769
+ DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU
770
+
771
+ // Setup OP context
772
+ struct htp_ops_context octx = { 0 };
773
+ octx.ctx = ctx;
774
+ octx.src0 = req->src0;
775
+ octx.src1 = req->src1;
776
+ octx.dst = req->dst;
777
+ octx.flags = req->flags;
778
+ octx.op = req->op;
779
+
780
+ memcpy(octx.op_params, req->op_params, sizeof(octx.op_params));
781
+
782
+ // Update data pointers
783
+ octx.src0.data = (uint32_t) bufs[0].ptr;
784
+ octx.src1.data = (uint32_t) bufs[1].ptr;
785
+ octx.dst.data = (uint32_t) bufs[2].ptr;
786
+ octx.n_threads = ctx->n_threads;
787
+
788
+ struct profile_data prof;
789
+ profile_start(&prof);
790
+
791
+ uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR;
792
+ if (vtcm_acquire(ctx) == AEE_SUCCESS) {
793
+ rsp_status = op_ssm_conv(&octx);
794
+ vtcm_release(ctx);
795
+ }
796
+
797
+ profile_stop(&prof);
798
+ send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 1, &prof);
799
+ }
800
+
801
+ static void proc_activations_req(struct htp_context * ctx,
802
+ struct htp_general_req * req,
803
+ struct dspqueue_buffer * bufs,
804
+ uint32_t n_bufs) {
805
+ struct dspqueue_buffer rsp_bufs[HTP_MAX_PACKET_BUFFERS];
806
+
807
+ int write_idx = (n_bufs == 3) ? 2 : 1;
808
+
809
+ // We had written to the output buffer, we'd also need to flush it
810
+ rsp_bufs[0].fd = bufs[write_idx].fd;
811
+ rsp_bufs[0].ptr = bufs[write_idx].ptr;
812
+ rsp_bufs[0].offset = bufs[write_idx].offset;
813
+ rsp_bufs[0].size = bufs[write_idx].size;
814
+ rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP
815
+ DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU
816
+
817
+ // Setup Op context
818
+ struct htp_ops_context octx = { 0 };
819
+ octx.ctx = ctx;
820
+ octx.src0 = req->src0;
821
+ if (3 == n_bufs) {
822
+ octx.src1 = req->src1;
823
+ }
824
+ octx.dst = req->dst;
825
+ octx.flags = req->flags;
826
+ octx.op = req->op;
827
+
828
+ memcpy(octx.op_params, req->op_params, sizeof(octx.op_params));
829
+
830
+ // Update data pointers
831
+ octx.src0.data = (uint32_t) bufs[0].ptr;
832
+ if (3 == n_bufs) {
833
+ octx.src1.data = (uint32_t) bufs[1].ptr;
834
+ octx.dst.data = (uint32_t) bufs[2].ptr;
835
+ } else {
836
+ octx.dst.data = (uint32_t) bufs[1].ptr;
837
+ }
838
+ octx.n_threads = ctx->n_threads;
839
+
840
+ struct profile_data prof;
841
+ profile_start(&prof);
842
+
843
+ uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR;
844
+ if (vtcm_acquire(ctx) == AEE_SUCCESS) {
845
+ if (octx.op == HTP_OP_SOFTMAX) {
846
+ rsp_status = op_softmax(&octx);
847
+ } else {
848
+ rsp_status = op_activations(&octx);
849
+ }
850
+ vtcm_release(ctx);
851
+ }
852
+
853
+ profile_stop(&prof);
854
+ send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 1, &prof);
855
+ }
856
+
857
+ static void proc_rope_req(struct htp_context * ctx,
858
+ struct htp_general_req * req,
859
+ struct dspqueue_buffer * bufs,
860
+ uint32_t n_bufs) {
861
+ struct dspqueue_buffer rsp_bufs[HTP_MAX_PACKET_BUFFERS];
862
+
863
+ int write_idx = n_bufs - 1;
864
+
865
+ // We had written to the output buffer, we'd also need to flush it
866
+ rsp_bufs[0].fd = bufs[write_idx].fd;
867
+ rsp_bufs[0].ptr = bufs[write_idx].ptr;
868
+ rsp_bufs[0].offset = bufs[write_idx].offset;
869
+ rsp_bufs[0].size = bufs[write_idx].size;
870
+ rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP
871
+ DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU
872
+
873
+ // Setup Op context
874
+ struct htp_ops_context octx = { 0 };
875
+ octx.ctx = ctx;
876
+ octx.src0 = req->src0;
877
+ octx.src1 = req->src1;
878
+ if (4 == n_bufs) {
879
+ octx.src2 = req->src2;
880
+ }
881
+ octx.dst = req->dst;
882
+ octx.flags = req->flags;
883
+ octx.op = req->op;
884
+
885
+ memcpy(octx.op_params, req->op_params, sizeof(octx.op_params));
886
+
887
+ // Update data pointers
888
+ octx.src0.data = (uint32_t) bufs[0].ptr;
889
+ octx.src1.data = (uint32_t) bufs[1].ptr;
890
+ if (4 == n_bufs) {
891
+ octx.src2.data = (uint32_t) bufs[2].ptr;
892
+ octx.dst.data = (uint32_t) bufs[3].ptr;
893
+ } else {
894
+ octx.dst.data = (uint32_t) bufs[2].ptr;
895
+ }
896
+ octx.n_threads = ctx->n_threads;
897
+
898
+ struct profile_data prof;
899
+ profile_start(&prof);
900
+
901
+ uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR;
902
+ if (vtcm_acquire(ctx) == AEE_SUCCESS) {
903
+ rsp_status = op_rope(&octx);
904
+ vtcm_release(ctx);
905
+ }
906
+
907
+ profile_stop(&prof);
908
+ send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 1, &prof);
909
+ }
910
+
911
+ static void proc_set_rows_req(struct htp_context * ctx, struct htp_general_req * req, struct dspqueue_buffer * bufs) {
912
+ struct dspqueue_buffer rsp_bufs[1];
913
+
914
+ // We had written to the output buffer, we'd also need to flush it
915
+ rsp_bufs[0].fd = bufs[2].fd;
916
+ rsp_bufs[0].ptr = bufs[2].ptr;
917
+ rsp_bufs[0].offset = bufs[2].offset;
918
+ rsp_bufs[0].size = bufs[2].size;
919
+ rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP
920
+ DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU
921
+
922
+ // Setup Op context
923
+ struct htp_ops_context octx = { 0 };
924
+ octx.ctx = ctx;
925
+ octx.src0 = req->src0;
926
+ octx.src1 = req->src1;
927
+ octx.dst = req->dst;
928
+ octx.flags = req->flags;
929
+ octx.op = req->op;
930
+
931
+ // Update data pointers
932
+ octx.src0.data = (uint32_t) bufs[0].ptr;
933
+ octx.src1.data = (uint32_t) bufs[1].ptr;
934
+ octx.dst.data = (uint32_t) bufs[2].ptr;
935
+ octx.n_threads = ctx->n_threads;
936
+
937
+ struct profile_data prof;
938
+ profile_start(&prof);
939
+
940
+ uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR;
941
+ if (vtcm_acquire(ctx) == AEE_SUCCESS) {
942
+ rsp_status = op_set_rows(&octx);
943
+ vtcm_release(ctx);
944
+ }
945
+
946
+ profile_stop(&prof);
947
+ send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 1, &prof);
948
+ }
949
+
950
+ static void proc_flash_attn_ext_req(struct htp_context * ctx,
951
+ struct htp_general_req * req,
952
+ struct dspqueue_buffer * bufs,
953
+ uint32_t n_bufs) {
954
+ // Setup Op context
955
+ struct htp_ops_context octx;
956
+ memset(&octx, 0, sizeof(octx));
957
+
958
+ octx.ctx = ctx;
959
+ octx.n_threads = ctx->n_threads;
960
+
961
+ octx.src0 = req->src0;
962
+ octx.src1 = req->src1;
963
+ octx.src2 = req->src2;
964
+ octx.src3 = req->src3;
965
+ octx.src4 = req->src4;
966
+ octx.dst = req->dst;
967
+ octx.flags = req->flags;
968
+ octx.op = req->op;
969
+
970
+ memcpy(octx.op_params, req->op_params, sizeof(octx.op_params));
971
+
972
+ // Update data pointers
973
+ octx.src0.data = (uint32_t) bufs[0].ptr;
974
+ octx.src1.data = (uint32_t) bufs[1].ptr;
975
+ octx.src2.data = (uint32_t) bufs[2].ptr;
976
+
977
+ int last_buf = 3;
978
+
979
+ if (octx.src3.ne[0]) {
980
+ octx.src3.data = (uint32_t) bufs[last_buf++].ptr; // mask is valid
981
+ }
982
+
983
+ if (octx.src4.ne[0]) {
984
+ octx.src4.data = (uint32_t) bufs[last_buf++].ptr; // sinks is valid
985
+ }
986
+
987
+ octx.dst.data = (uint32_t) bufs[last_buf].ptr;
988
+
989
+ struct profile_data prof;
990
+ profile_start(&prof);
991
+
992
+ uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR;
993
+ if (vtcm_acquire(ctx) == AEE_SUCCESS) {
994
+ rsp_status = op_flash_attn_ext(&octx);
995
+ vtcm_release(ctx);
996
+ }
997
+
998
+ profile_stop(&prof);
999
+
1000
+ struct dspqueue_buffer rsp_buf = bufs[last_buf];
1001
+ rsp_buf.flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP
1002
+ DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU
1003
+
1004
+ send_htp_rsp(ctx, req->op, rsp_status, &bufs[last_buf], 1, &prof);
1005
+ }
1006
+
1007
+ static void htp_packet_callback(dspqueue_t queue, int error, void * context) {
1008
+ struct htp_context * ctx = (struct htp_context *) context;
1009
+
1010
+ // Repeatedly read packets from the queue until it's empty. We don't
1011
+ // necessarily get a separate callback for each packet, and new packets
1012
+ // may arrive while we're processing the previous one. This ensures we
1013
+ // keep the DSP busy as much as possible and avoid waiting for the CPU.
1014
+
1015
+ while (1) {
1016
+ struct htp_general_req req;
1017
+ uint32_t req_size;
1018
+
1019
+ struct dspqueue_buffer bufs[HTP_MAX_PACKET_BUFFERS];
1020
+ uint32_t n_bufs;
1021
+ uint32_t flags;
1022
+
1023
+ // Read packet from queue
1024
+ int err = dspqueue_read_noblock(queue, &flags,
1025
+ HTP_MAX_PACKET_BUFFERS, // Maximum number of buffer references
1026
+ &n_bufs, // Number of buffer references
1027
+ bufs, // Buffer references
1028
+ sizeof(req), // Max message length
1029
+ &req_size, // Message length
1030
+ (uint8_t *) &req); // Message
1031
+
1032
+ if (err == AEE_EWOULDBLOCK) {
1033
+ // Consumed all packets available for now
1034
+ return;
1035
+ }
1036
+
1037
+ if (err != 0) {
1038
+ FARF(ERROR, "dspqueue_read_noblock failed: 0x%08x", (unsigned) err);
1039
+ return;
1040
+ }
1041
+
1042
+ if (req_size != sizeof(req)) {
1043
+ FARF(ERROR, "Invalid request size");
1044
+ continue;
1045
+ }
1046
+
1047
+ if (req.flags & HTP_OPFLAGS_EARLY_WAKEUP) {
1048
+ // Host wants early notification
1049
+ dspqueue_write_early_wakeup_noblock(ctx->queue, 10, 0);
1050
+ }
1051
+
1052
+ // Process packet based on its message type
1053
+ switch (req.op) {
1054
+ case HTP_OP_MUL_MAT:
1055
+ if (n_bufs != 3) {
1056
+ FARF(ERROR, "Bad matmul-req buffer list");
1057
+ continue;
1058
+ }
1059
+ proc_matmul_req(ctx, &req, bufs, n_bufs);
1060
+ break;
1061
+
1062
+ case HTP_OP_MUL_MAT_ID:
1063
+ if (n_bufs != 4) {
1064
+ FARF(ERROR, "Bad matmul-id-req buffer list");
1065
+ continue;
1066
+ }
1067
+ proc_matmul_id_req(ctx, &req, bufs, n_bufs);
1068
+ break;
1069
+
1070
+ case HTP_OP_MUL:
1071
+ case HTP_OP_ADD:
1072
+ case HTP_OP_SUB:
1073
+ case HTP_OP_DIV:
1074
+ if (n_bufs != 3) {
1075
+ FARF(ERROR, "Bad binary-req buffer list");
1076
+ continue;
1077
+ }
1078
+ proc_binary_req(ctx, &req, bufs);
1079
+ break;
1080
+
1081
+ case HTP_OP_RMS_NORM:
1082
+ case HTP_OP_SCALE:
1083
+ if (n_bufs != 2) {
1084
+ FARF(ERROR, "Bad unary-req buffer list");
1085
+ continue;
1086
+ }
1087
+
1088
+ proc_unary_req(ctx, &req, bufs);
1089
+ break;
1090
+
1091
+ case HTP_OP_SQR:
1092
+ case HTP_OP_SQRT:
1093
+ if (n_bufs != 2) {
1094
+ FARF(ERROR, "Bad unary-req buffer list");
1095
+ continue;
1096
+ }
1097
+
1098
+ proc_unary_req(ctx, &req, bufs);
1099
+ break;
1100
+
1101
+ case HTP_OP_SUM_ROWS:
1102
+ if (n_bufs != 2) {
1103
+ FARF(ERROR, "Bad unary-req buffer list");
1104
+ continue;
1105
+ }
1106
+
1107
+ proc_sum_rows_req(ctx, &req, bufs);
1108
+ break;
1109
+
1110
+ case HTP_OP_UNARY_SILU:
1111
+ case HTP_OP_UNARY_GELU:
1112
+ if (n_bufs != 2) {
1113
+ FARF(ERROR, "Bad act-req buffer list");
1114
+ continue;
1115
+ }
1116
+ proc_activations_req(ctx, &req, bufs, n_bufs);
1117
+ break;
1118
+
1119
+ case HTP_OP_GLU_SWIGLU:
1120
+ case HTP_OP_GLU_SWIGLU_OAI:
1121
+ case HTP_OP_SOFTMAX:
1122
+ case HTP_OP_GLU_GEGLU:
1123
+ if ((n_bufs != 2) && (n_bufs != 3)) {
1124
+ FARF(ERROR, "Bad act-req buffer list");
1125
+ continue;
1126
+ }
1127
+ proc_activations_req(ctx, &req, bufs, n_bufs);
1128
+ break;
1129
+
1130
+ case HTP_OP_ADD_ID:
1131
+ if (n_bufs != 4) {
1132
+ FARF(ERROR, "Bad add-id-req buffer list");
1133
+ continue;
1134
+ }
1135
+ proc_add_id_req(ctx, &req, bufs);
1136
+ break;
1137
+
1138
+ case HTP_OP_ROPE:
1139
+ if ((n_bufs != 3) && (n_bufs != 4)) {
1140
+ FARF(ERROR, "Bad rope-req buffer list");
1141
+ continue;
1142
+ }
1143
+ proc_rope_req(ctx, &req, bufs, n_bufs);
1144
+ break;
1145
+
1146
+ case HTP_OP_FLASH_ATTN_EXT:
1147
+ if (!(n_bufs >= 4 && n_bufs <= 6)) {
1148
+ FARF(ERROR, "Bad flash-attn-ext-req buffer list");
1149
+ continue;
1150
+ }
1151
+ proc_flash_attn_ext_req(ctx, &req, bufs, n_bufs);
1152
+ break;
1153
+
1154
+ case HTP_OP_SET_ROWS:
1155
+ if (n_bufs != 3) {
1156
+ FARF(ERROR, "Bad set-rows-req buffer list");
1157
+ continue;
1158
+ }
1159
+ proc_set_rows_req(ctx, &req, bufs);
1160
+ break;
1161
+
1162
+ case HTP_OP_GET_ROWS:
1163
+ if (n_bufs != 3) {
1164
+ FARF(ERROR, "Bad get-rows-req buffer list");
1165
+ continue;
1166
+ }
1167
+ proc_get_rows_req(ctx, &req, bufs);
1168
+ break;
1169
+
1170
+ case HTP_OP_CPY:
1171
+ if (n_bufs != 2) {
1172
+ FARF(ERROR, "Bad cpy-req buffer list");
1173
+ continue;
1174
+ }
1175
+ proc_cpy_req(ctx, &req, bufs);
1176
+ break;
1177
+
1178
+ case HTP_OP_ARGSORT:
1179
+ if (n_bufs != 2) {
1180
+ FARF(ERROR, "Bad argsort-req buffer list");
1181
+ continue;
1182
+ }
1183
+ proc_argsort_req(ctx, &req, bufs);
1184
+ break;
1185
+
1186
+ case HTP_OP_SSM_CONV:
1187
+ if (n_bufs != 3) {
1188
+ FARF(ERROR, "Bad ssm-conv-req buffer list");
1189
+ continue;
1190
+ }
1191
+ proc_ssm_conv_req(ctx, &req, bufs);
1192
+ break;
1193
+
1194
+ default:
1195
+ FARF(ERROR, "Unknown Op %u", req.op);
1196
+ break;
1197
+ }
1198
+ }
1199
+ }