whispercpp 1.3.4 → 1.3.5

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (630)
  1. checksums.yaml +4 -4
  2. data/README.md +60 -43
  3. data/ext/extconf.rb +2 -2
  4. data/ext/ruby_whisper.c +14 -2
  5. data/ext/ruby_whisper.h +39 -0
  6. data/ext/ruby_whisper_context.c +22 -22
  7. data/ext/ruby_whisper_model.c +12 -12
  8. data/ext/ruby_whisper_params.c +47 -23
  9. data/ext/ruby_whisper_segment.c +84 -19
  10. data/ext/ruby_whisper_token.c +351 -0
  11. data/ext/ruby_whisper_transcribe.cpp +1 -1
  12. data/ext/ruby_whisper_vad_context.c +75 -0
  13. data/ext/ruby_whisper_vad_context_detect.cpp +50 -0
  14. data/ext/ruby_whisper_vad_segment.c +139 -0
  15. data/ext/ruby_whisper_vad_segments.c +106 -0
  16. data/ext/sources/CMakeLists.txt +4 -1
  17. data/ext/sources/bindings/javascript/package.json +1 -1
  18. data/ext/sources/cmake/arm64-apple-clang.cmake +16 -0
  19. data/ext/sources/cmake/arm64-windows-llvm.cmake +16 -0
  20. data/ext/sources/cmake/riscv64-spacemit-linux-gnu-gcc.cmake +29 -0
  21. data/ext/sources/cmake/x64-windows-llvm.cmake +5 -0
  22. data/ext/sources/examples/addon.node/vad-example.js +2 -2
  23. data/ext/sources/examples/cli/cli.cpp +121 -112
  24. data/ext/sources/examples/lsp/CMakeLists.txt +2 -1
  25. data/ext/sources/examples/quantize/CMakeLists.txt +2 -1
  26. data/ext/sources/examples/server/server.cpp +10 -11
  27. data/ext/sources/examples/talk-llama/CMakeLists.txt +5 -1
  28. data/ext/sources/examples/talk-llama/llama-adapter.cpp +12 -3
  29. data/ext/sources/examples/talk-llama/llama-adapter.h +7 -1
  30. data/ext/sources/examples/talk-llama/llama-arch.cpp +2046 -1974
  31. data/ext/sources/examples/talk-llama/llama-arch.h +67 -2
  32. data/ext/sources/examples/talk-llama/llama-batch.cpp +75 -33
  33. data/ext/sources/examples/talk-llama/llama-batch.h +17 -4
  34. data/ext/sources/examples/talk-llama/llama-chat.cpp +79 -3
  35. data/ext/sources/examples/talk-llama/llama-chat.h +4 -0
  36. data/ext/sources/examples/talk-llama/llama-context.cpp +775 -78
  37. data/ext/sources/examples/talk-llama/llama-context.h +57 -9
  38. data/ext/sources/examples/talk-llama/llama-cparams.h +1 -0
  39. data/ext/sources/examples/talk-llama/llama-grammar.cpp +288 -53
  40. data/ext/sources/examples/talk-llama/llama-grammar.h +22 -1
  41. data/ext/sources/examples/talk-llama/llama-graph.cpp +381 -64
  42. data/ext/sources/examples/talk-llama/llama-graph.h +103 -13
  43. data/ext/sources/examples/talk-llama/llama-hparams.cpp +26 -2
  44. data/ext/sources/examples/talk-llama/llama-hparams.h +41 -10
  45. data/ext/sources/examples/talk-llama/llama-impl.cpp +7 -3
  46. data/ext/sources/examples/talk-llama/llama-impl.h +1 -1
  47. data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.cpp +5 -3
  48. data/ext/sources/examples/talk-llama/llama-kv-cache.cpp +145 -65
  49. data/ext/sources/examples/talk-llama/llama-kv-cache.h +22 -7
  50. data/ext/sources/examples/talk-llama/llama-kv-cells.h +44 -2
  51. data/ext/sources/examples/talk-llama/llama-memory-hybrid.cpp +12 -10
  52. data/ext/sources/examples/talk-llama/llama-memory-recurrent.cpp +32 -19
  53. data/ext/sources/examples/talk-llama/llama-memory-recurrent.h +2 -2
  54. data/ext/sources/examples/talk-llama/llama-mmap.cpp +172 -37
  55. data/ext/sources/examples/talk-llama/llama-mmap.h +8 -3
  56. data/ext/sources/examples/talk-llama/llama-model-loader.cpp +91 -9
  57. data/ext/sources/examples/talk-llama/llama-model-loader.h +6 -0
  58. data/ext/sources/examples/talk-llama/llama-model-saver.cpp +3 -0
  59. data/ext/sources/examples/talk-llama/llama-model.cpp +1529 -13134
  60. data/ext/sources/examples/talk-llama/llama-model.h +44 -3
  61. data/ext/sources/examples/talk-llama/llama-quant.cpp +8 -23
  62. data/ext/sources/examples/talk-llama/llama-sampling.cpp +1294 -198
  63. data/ext/sources/examples/talk-llama/llama-sampling.h +19 -7
  64. data/ext/sources/examples/talk-llama/llama-vocab.cpp +133 -37
  65. data/ext/sources/examples/talk-llama/llama-vocab.h +45 -40
  66. data/ext/sources/examples/talk-llama/llama.cpp +729 -2
  67. data/ext/sources/examples/talk-llama/llama.h +152 -14
  68. data/ext/sources/examples/talk-llama/models/afmoe.cpp +191 -0
  69. data/ext/sources/examples/talk-llama/models/apertus.cpp +125 -0
  70. data/ext/sources/examples/talk-llama/models/arcee.cpp +135 -0
  71. data/ext/sources/examples/talk-llama/models/arctic.cpp +138 -0
  72. data/ext/sources/examples/talk-llama/models/arwkv7.cpp +86 -0
  73. data/ext/sources/examples/talk-llama/models/baichuan.cpp +122 -0
  74. data/ext/sources/examples/talk-llama/models/bailingmoe.cpp +144 -0
  75. data/ext/sources/examples/talk-llama/models/bailingmoe2.cpp +135 -0
  76. data/ext/sources/examples/talk-llama/models/bert.cpp +178 -0
  77. data/ext/sources/examples/talk-llama/models/bitnet.cpp +160 -0
  78. data/ext/sources/examples/talk-llama/models/bloom.cpp +101 -0
  79. data/ext/sources/examples/talk-llama/models/chameleon.cpp +178 -0
  80. data/ext/sources/examples/talk-llama/models/chatglm.cpp +132 -0
  81. data/ext/sources/examples/talk-llama/models/codeshell.cpp +111 -0
  82. data/ext/sources/examples/talk-llama/models/cogvlm.cpp +102 -0
  83. data/ext/sources/examples/talk-llama/models/cohere2-iswa.cpp +134 -0
  84. data/ext/sources/examples/talk-llama/models/command-r.cpp +122 -0
  85. data/ext/sources/examples/talk-llama/models/dbrx.cpp +123 -0
  86. data/ext/sources/examples/talk-llama/models/deci.cpp +135 -0
  87. data/ext/sources/examples/talk-llama/models/deepseek.cpp +144 -0
  88. data/ext/sources/examples/talk-llama/models/deepseek2.cpp +259 -0
  89. data/ext/sources/examples/talk-llama/models/dots1.cpp +134 -0
  90. data/ext/sources/examples/talk-llama/models/dream.cpp +105 -0
  91. data/ext/sources/examples/talk-llama/models/ernie4-5-moe.cpp +150 -0
  92. data/ext/sources/examples/talk-llama/models/ernie4-5.cpp +110 -0
  93. data/ext/sources/examples/talk-llama/models/exaone.cpp +114 -0
  94. data/ext/sources/examples/talk-llama/models/exaone4.cpp +123 -0
  95. data/ext/sources/examples/talk-llama/models/falcon-h1.cpp +113 -0
  96. data/ext/sources/examples/talk-llama/models/falcon.cpp +120 -0
  97. data/ext/sources/examples/talk-llama/models/gemma-embedding.cpp +116 -0
  98. data/ext/sources/examples/talk-llama/models/gemma.cpp +112 -0
  99. data/ext/sources/examples/talk-llama/models/gemma2-iswa.cpp +128 -0
  100. data/ext/sources/examples/talk-llama/models/gemma3.cpp +155 -0
  101. data/ext/sources/examples/talk-llama/models/gemma3n-iswa.cpp +384 -0
  102. data/ext/sources/examples/talk-llama/models/glm4-moe.cpp +170 -0
  103. data/ext/sources/examples/talk-llama/models/glm4.cpp +150 -0
  104. data/ext/sources/examples/talk-llama/models/gpt2.cpp +105 -0
  105. data/ext/sources/examples/talk-llama/models/gptneox.cpp +144 -0
  106. data/ext/sources/examples/talk-llama/models/granite-hybrid.cpp +196 -0
  107. data/ext/sources/examples/talk-llama/models/granite.cpp +211 -0
  108. data/ext/sources/examples/talk-llama/models/graph-context-mamba.cpp +283 -0
  109. data/ext/sources/examples/talk-llama/models/grok.cpp +159 -0
  110. data/ext/sources/examples/talk-llama/models/grovemoe.cpp +141 -0
  111. data/ext/sources/examples/talk-llama/models/hunyuan-dense.cpp +132 -0
  112. data/ext/sources/examples/talk-llama/models/hunyuan-moe.cpp +154 -0
  113. data/ext/sources/examples/talk-llama/models/internlm2.cpp +120 -0
  114. data/ext/sources/examples/talk-llama/models/jais.cpp +86 -0
  115. data/ext/sources/examples/talk-llama/models/jamba.cpp +106 -0
  116. data/ext/sources/examples/talk-llama/models/lfm2.cpp +175 -0
  117. data/ext/sources/examples/talk-llama/models/llada-moe.cpp +122 -0
  118. data/ext/sources/examples/talk-llama/models/llada.cpp +99 -0
  119. data/ext/sources/examples/talk-llama/models/llama-iswa.cpp +178 -0
  120. data/ext/sources/examples/talk-llama/models/llama.cpp +168 -0
  121. data/ext/sources/examples/talk-llama/models/maincoder.cpp +117 -0
  122. data/ext/sources/examples/talk-llama/models/mamba.cpp +55 -0
  123. data/ext/sources/examples/talk-llama/models/mimo2-iswa.cpp +123 -0
  124. data/ext/sources/examples/talk-llama/models/minicpm3.cpp +199 -0
  125. data/ext/sources/examples/talk-llama/models/minimax-m2.cpp +124 -0
  126. data/ext/sources/examples/talk-llama/models/mistral3.cpp +160 -0
  127. data/ext/sources/examples/talk-llama/models/models.h +569 -0
  128. data/ext/sources/examples/talk-llama/models/modern-bert.cpp +116 -0
  129. data/ext/sources/examples/talk-llama/models/mpt.cpp +126 -0
  130. data/ext/sources/examples/talk-llama/models/nemotron-h.cpp +150 -0
  131. data/ext/sources/examples/talk-llama/models/nemotron.cpp +122 -0
  132. data/ext/sources/examples/talk-llama/models/neo-bert.cpp +104 -0
  133. data/ext/sources/examples/talk-llama/models/olmo.cpp +121 -0
  134. data/ext/sources/examples/talk-llama/models/olmo2.cpp +150 -0
  135. data/ext/sources/examples/talk-llama/models/olmoe.cpp +124 -0
  136. data/ext/sources/examples/talk-llama/models/openai-moe-iswa.cpp +127 -0
  137. data/ext/sources/examples/talk-llama/models/openelm.cpp +124 -0
  138. data/ext/sources/examples/talk-llama/models/orion.cpp +123 -0
  139. data/ext/sources/examples/talk-llama/models/pangu-embedded.cpp +121 -0
  140. data/ext/sources/examples/talk-llama/models/phi2.cpp +121 -0
  141. data/ext/sources/examples/talk-llama/models/phi3.cpp +152 -0
  142. data/ext/sources/examples/talk-llama/models/plamo.cpp +110 -0
  143. data/ext/sources/examples/talk-llama/models/plamo2.cpp +316 -0
  144. data/ext/sources/examples/talk-llama/models/plamo3.cpp +128 -0
  145. data/ext/sources/examples/talk-llama/models/plm.cpp +168 -0
  146. data/ext/sources/examples/talk-llama/models/qwen.cpp +108 -0
  147. data/ext/sources/examples/talk-llama/models/qwen2.cpp +126 -0
  148. data/ext/sources/examples/talk-llama/models/qwen2moe.cpp +151 -0
  149. data/ext/sources/examples/talk-llama/models/qwen2vl.cpp +117 -0
  150. data/ext/sources/examples/talk-llama/models/qwen3.cpp +117 -0
  151. data/ext/sources/examples/talk-llama/models/qwen3moe.cpp +124 -0
  152. data/ext/sources/examples/talk-llama/models/qwen3next.cpp +873 -0
  153. data/ext/sources/examples/talk-llama/models/qwen3vl-moe.cpp +149 -0
  154. data/ext/sources/examples/talk-llama/models/qwen3vl.cpp +141 -0
  155. data/ext/sources/examples/talk-llama/models/refact.cpp +94 -0
  156. data/ext/sources/examples/talk-llama/models/rnd1.cpp +126 -0
  157. data/ext/sources/examples/talk-llama/models/rwkv6-base.cpp +162 -0
  158. data/ext/sources/examples/talk-llama/models/rwkv6.cpp +94 -0
  159. data/ext/sources/examples/talk-llama/models/rwkv6qwen2.cpp +86 -0
  160. data/ext/sources/examples/talk-llama/models/rwkv7-base.cpp +135 -0
  161. data/ext/sources/examples/talk-llama/models/rwkv7.cpp +90 -0
  162. data/ext/sources/examples/talk-llama/models/seed-oss.cpp +124 -0
  163. data/ext/sources/examples/talk-llama/models/smallthinker.cpp +126 -0
  164. data/ext/sources/examples/talk-llama/models/smollm3.cpp +128 -0
  165. data/ext/sources/examples/talk-llama/models/stablelm.cpp +146 -0
  166. data/ext/sources/examples/talk-llama/models/starcoder.cpp +100 -0
  167. data/ext/sources/examples/talk-llama/models/starcoder2.cpp +121 -0
  168. data/ext/sources/examples/talk-llama/models/t5-dec.cpp +166 -0
  169. data/ext/sources/examples/talk-llama/models/t5-enc.cpp +96 -0
  170. data/ext/sources/examples/talk-llama/models/wavtokenizer-dec.cpp +149 -0
  171. data/ext/sources/examples/talk-llama/models/xverse.cpp +108 -0
  172. data/ext/sources/examples/talk-llama/unicode.cpp +102 -16
  173. data/ext/sources/examples/vad-speech-segments/CMakeLists.txt +1 -1
  174. data/ext/sources/examples/whisper.wasm/index-tmpl.html +1 -1
  175. data/ext/sources/ggml/CMakeLists.txt +82 -54
  176. data/ext/sources/ggml/include/ggml-alloc.h +9 -0
  177. data/ext/sources/ggml/include/ggml-backend.h +4 -1
  178. data/ext/sources/ggml/include/ggml-cpu.h +1 -0
  179. data/ext/sources/ggml/include/ggml-hexagon.h +19 -0
  180. data/ext/sources/ggml/include/ggml-rpc.h +8 -11
  181. data/ext/sources/ggml/include/ggml-zendnn.h +22 -0
  182. data/ext/sources/ggml/include/ggml.h +190 -12
  183. data/ext/sources/ggml/src/CMakeLists.txt +82 -11
  184. data/ext/sources/ggml/src/ggml-alloc.c +124 -41
  185. data/ext/sources/ggml/src/ggml-backend-impl.h +1 -4
  186. data/ext/sources/ggml/src/ggml-backend-reg.cpp +27 -3
  187. data/ext/sources/ggml/src/ggml-backend.cpp +71 -21
  188. data/ext/sources/ggml/src/ggml-blas/CMakeLists.txt +17 -3
  189. data/ext/sources/ggml/src/ggml-blas/ggml-blas.cpp +5 -9
  190. data/ext/sources/ggml/src/ggml-cann/acl_tensor.cpp +57 -45
  191. data/ext/sources/ggml/src/ggml-cann/acl_tensor.h +138 -47
  192. data/ext/sources/ggml/src/ggml-cann/aclnn_ops.cpp +2179 -1696
  193. data/ext/sources/ggml/src/ggml-cann/aclnn_ops.h +238 -317
  194. data/ext/sources/ggml/src/ggml-cann/common.h +283 -208
  195. data/ext/sources/ggml/src/ggml-cann/ggml-cann.cpp +626 -776
  196. data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +156 -86
  197. data/ext/sources/ggml/src/ggml-cpu/amx/amx.cpp +1 -0
  198. data/ext/sources/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +4 -0
  199. data/ext/sources/ggml/src/ggml-cpu/arch/arm/quants.c +428 -26
  200. data/ext/sources/ggml/src/ggml-cpu/arch/arm/repack.cpp +1004 -0
  201. data/ext/sources/ggml/src/ggml-cpu/arch/loongarch/quants.c +4 -5
  202. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/cpu-feats.cpp +38 -0
  203. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/quants.c +108 -49
  204. data/ext/sources/ggml/src/ggml-cpu/arch/s390/cpu-feats.cpp +50 -0
  205. data/ext/sources/ggml/src/ggml-cpu/arch/x86/repack.cpp +6 -6
  206. data/ext/sources/ggml/src/ggml-cpu/arch-fallback.h +50 -2
  207. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-impl.h +5 -3
  208. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +195 -71
  209. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.cpp +4 -0
  210. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.cpp +573 -106
  211. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.h +33 -44
  212. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +298 -112
  213. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm-ppc.h +333 -0
  214. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +819 -125
  215. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.h +6 -0
  216. data/ext/sources/ggml/src/ggml-cpu/ops.cpp +708 -431
  217. data/ext/sources/ggml/src/ggml-cpu/ops.h +5 -4
  218. data/ext/sources/ggml/src/ggml-cpu/repack.cpp +671 -31
  219. data/ext/sources/ggml/src/ggml-cpu/repack.h +14 -0
  220. data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +41 -43
  221. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.cpp +3 -2
  222. data/ext/sources/ggml/src/ggml-cpu/unary-ops.cpp +151 -0
  223. data/ext/sources/ggml/src/ggml-cpu/unary-ops.h +7 -0
  224. data/ext/sources/ggml/src/ggml-cpu/vec.cpp +124 -1
  225. data/ext/sources/ggml/src/ggml-cpu/vec.h +261 -146
  226. data/ext/sources/ggml/src/ggml-cuda/CMakeLists.txt +72 -1
  227. data/ext/sources/ggml/src/ggml-cuda/argmax.cu +2 -2
  228. data/ext/sources/ggml/src/ggml-cuda/argsort.cu +123 -6
  229. data/ext/sources/ggml/src/ggml-cuda/argsort.cuh +16 -0
  230. data/ext/sources/ggml/src/ggml-cuda/binbcast.cu +1 -1
  231. data/ext/sources/ggml/src/ggml-cuda/common.cuh +353 -80
  232. data/ext/sources/ggml/src/ggml-cuda/convert.cuh +10 -0
  233. data/ext/sources/ggml/src/ggml-cuda/cpy-utils.cuh +1 -1
  234. data/ext/sources/ggml/src/ggml-cuda/cpy.cu +339 -246
  235. data/ext/sources/ggml/src/ggml-cuda/cpy.cuh +1 -5
  236. data/ext/sources/ggml/src/ggml-cuda/cumsum.cu +307 -0
  237. data/ext/sources/ggml/src/ggml-cuda/cumsum.cuh +5 -0
  238. data/ext/sources/ggml/src/ggml-cuda/diag.cu +77 -0
  239. data/ext/sources/ggml/src/ggml-cuda/diag.cuh +5 -0
  240. data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +31 -21
  241. data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +663 -596
  242. data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cu +35 -741
  243. data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cuh +1241 -0
  244. data/ext/sources/ggml/src/ggml-cuda/fattn-vec.cuh +30 -37
  245. data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +14 -13
  246. data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cuh +48 -0
  247. data/ext/sources/ggml/src/ggml-cuda/fattn.cu +83 -37
  248. data/ext/sources/ggml/src/ggml-cuda/fill.cu +37 -0
  249. data/ext/sources/ggml/src/ggml-cuda/fill.cuh +3 -0
  250. data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +1155 -164
  251. data/ext/sources/ggml/src/ggml-cuda/mean.cu +5 -4
  252. data/ext/sources/ggml/src/ggml-cuda/mma.cuh +741 -48
  253. data/ext/sources/ggml/src/ggml-cuda/mmf.cu +60 -12
  254. data/ext/sources/ggml/src/ggml-cuda/mmf.cuh +381 -42
  255. data/ext/sources/ggml/src/ggml-cuda/mmid.cu +164 -0
  256. data/ext/sources/ggml/src/ggml-cuda/mmid.cuh +5 -0
  257. data/ext/sources/ggml/src/ggml-cuda/mmq.cu +69 -176
  258. data/ext/sources/ggml/src/ggml-cuda/mmq.cuh +498 -171
  259. data/ext/sources/ggml/src/ggml-cuda/mmvf.cu +375 -79
  260. data/ext/sources/ggml/src/ggml-cuda/mmvf.cuh +3 -2
  261. data/ext/sources/ggml/src/ggml-cuda/mmvq.cu +241 -95
  262. data/ext/sources/ggml/src/ggml-cuda/mmvq.cuh +1 -1
  263. data/ext/sources/ggml/src/ggml-cuda/pad.cu +64 -33
  264. data/ext/sources/ggml/src/ggml-cuda/quantize.cu +151 -0
  265. data/ext/sources/ggml/src/ggml-cuda/quantize.cuh +14 -0
  266. data/ext/sources/ggml/src/ggml-cuda/rope.cu +192 -77
  267. data/ext/sources/ggml/src/ggml-cuda/rope.cuh +2 -0
  268. data/ext/sources/ggml/src/ggml-cuda/set-rows.cu +101 -47
  269. data/ext/sources/ggml/src/ggml-cuda/set.cu +39 -0
  270. data/ext/sources/ggml/src/ggml-cuda/set.cuh +7 -0
  271. data/ext/sources/ggml/src/ggml-cuda/softmax.cu +203 -6
  272. data/ext/sources/ggml/src/ggml-cuda/solve_tri.cu +275 -0
  273. data/ext/sources/ggml/src/ggml-cuda/solve_tri.cuh +3 -0
  274. data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cu +14 -20
  275. data/ext/sources/ggml/src/ggml-cuda/ssm-scan.cu +49 -84
  276. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq112-dv112.cu +5 -0
  277. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq128-dv128.cu +5 -0
  278. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq256-dv256.cu +5 -0
  279. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq40-dv40.cu +5 -0
  280. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq576-dv512.cu +5 -0
  281. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq64-dv64.cu +5 -0
  282. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq72-dv72.cu +5 -0
  283. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq80-dv80.cu +5 -0
  284. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq96-dv96.cu +5 -0
  285. data/ext/sources/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +19 -1
  286. data/ext/sources/ggml/src/ggml-cuda/top-k.cu +96 -0
  287. data/ext/sources/ggml/src/ggml-cuda/top-k.cuh +3 -0
  288. data/ext/sources/ggml/src/ggml-cuda/topk-moe.cu +168 -76
  289. data/ext/sources/ggml/src/ggml-cuda/topk-moe.cuh +11 -4
  290. data/ext/sources/ggml/src/ggml-cuda/tri.cu +136 -0
  291. data/ext/sources/ggml/src/ggml-cuda/tri.cuh +5 -0
  292. data/ext/sources/ggml/src/ggml-cuda/unary.cu +105 -11
  293. data/ext/sources/ggml/src/ggml-cuda/unary.cuh +36 -0
  294. data/ext/sources/ggml/src/ggml-cuda/upscale.cu +163 -7
  295. data/ext/sources/ggml/src/ggml-cuda/vendors/cuda.h +4 -0
  296. data/ext/sources/ggml/src/ggml-cuda/vendors/hip.h +12 -1
  297. data/ext/sources/ggml/src/ggml-cuda/vendors/musa.h +6 -0
  298. data/ext/sources/ggml/src/ggml-hexagon/CMakeLists.txt +80 -0
  299. data/ext/sources/ggml/src/ggml-hexagon/ggml-hexagon.cpp +3151 -0
  300. data/ext/sources/ggml/src/ggml-hexagon/htp/CMakeLists.txt +44 -0
  301. data/ext/sources/ggml/src/ggml-hexagon/htp/act-ops.c +682 -0
  302. data/ext/sources/ggml/src/ggml-hexagon/htp/binary-ops.c +360 -0
  303. data/ext/sources/ggml/src/ggml-hexagon/htp/cmake-toolchain.cmake +157 -0
  304. data/ext/sources/ggml/src/ggml-hexagon/htp/flash-attn-ops.c +566 -0
  305. data/ext/sources/ggml/src/ggml-hexagon/htp/get-rows-ops.c +112 -0
  306. data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ctx.h +35 -0
  307. data/ext/sources/ggml/src/ggml-hexagon/htp/htp-dma.c +63 -0
  308. data/ext/sources/ggml/src/ggml-hexagon/htp/htp-dma.h +157 -0
  309. data/ext/sources/ggml/src/ggml-hexagon/htp/htp-msg.h +165 -0
  310. data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ops.h +92 -0
  311. data/ext/sources/ggml/src/ggml-hexagon/htp/htp_iface.idl +16 -0
  312. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-exp.c +94 -0
  313. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-inverse.c +72 -0
  314. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sigmoid.c +49 -0
  315. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-utils.c +1020 -0
  316. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-utils.h +1353 -0
  317. data/ext/sources/ggml/src/ggml-hexagon/htp/main.c +1001 -0
  318. data/ext/sources/ggml/src/ggml-hexagon/htp/matmul-ops.c +2503 -0
  319. data/ext/sources/ggml/src/ggml-hexagon/htp/ops-utils.h +149 -0
  320. data/ext/sources/ggml/src/ggml-hexagon/htp/rope-ops.c +487 -0
  321. data/ext/sources/ggml/src/ggml-hexagon/htp/set-rows-ops.c +168 -0
  322. data/ext/sources/ggml/src/ggml-hexagon/htp/softmax-ops.c +402 -0
  323. data/ext/sources/ggml/src/ggml-hexagon/htp/unary-ops.c +287 -0
  324. data/ext/sources/ggml/src/ggml-hexagon/htp/worker-pool.c +297 -0
  325. data/ext/sources/ggml/src/ggml-hexagon/htp/worker-pool.h +57 -0
  326. data/ext/sources/ggml/src/ggml-hexagon/htp-utils.c +454 -0
  327. data/ext/sources/ggml/src/ggml-hexagon/htp-utils.h +221 -0
  328. data/ext/sources/ggml/src/ggml-hexagon/op-desc.h +153 -0
  329. data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +8 -13
  330. data/ext/sources/ggml/src/ggml-impl.h +67 -6
  331. data/ext/sources/ggml/src/ggml-metal/ggml-metal-common.cpp +2 -2
  332. data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.m +29 -20
  333. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.cpp +652 -285
  334. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.h +103 -56
  335. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.m +496 -118
  336. data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +231 -9
  337. data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.cpp +1227 -224
  338. data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.h +12 -0
  339. data/ext/sources/ggml/src/ggml-metal/ggml-metal.cpp +14 -8
  340. data/ext/sources/ggml/src/ggml-metal/ggml-metal.metal +1972 -704
  341. data/ext/sources/ggml/src/ggml-musa/CMakeLists.txt +3 -1
  342. data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +11 -0
  343. data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +1430 -120
  344. data/ext/sources/ggml/src/ggml-opencl/kernels/cvt.cl +63 -0
  345. data/ext/sources/ggml/src/ggml-opencl/kernels/expm1.cl +82 -0
  346. data/ext/sources/ggml/src/ggml-opencl/kernels/fill.cl +17 -0
  347. data/ext/sources/ggml/src/ggml-opencl/kernels/flash_attn_f32.cl +4 -3
  348. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_mxfp4_f32.cl +162 -0
  349. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_mxfp4_f32.cl +156 -0
  350. data/ext/sources/ggml/src/ggml-opencl/kernels/get_rows.cl +36 -12
  351. data/ext/sources/ggml/src/ggml-opencl/kernels/mean.cl +39 -0
  352. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_kq_kqv.cl +273 -0
  353. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_l4_lm.cl +24 -10
  354. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_f32_f32_l4_lm.cl +24 -10
  355. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_l4_lm.cl +154 -0
  356. data/ext/sources/ggml/src/ggml-opencl/kernels/pad.cl +29 -20
  357. data/ext/sources/ggml/src/ggml-opencl/kernels/rms_norm.cl +25 -10
  358. data/ext/sources/ggml/src/ggml-opencl/kernels/rope.cl +50 -24
  359. data/ext/sources/ggml/src/ggml-opencl/kernels/set_rows.cl +35 -16
  360. data/ext/sources/ggml/src/ggml-opencl/kernels/softplus.cl +88 -0
  361. data/ext/sources/ggml/src/ggml-opencl/kernels/sqr.cl +53 -0
  362. data/ext/sources/ggml/src/ggml-opencl/kernels/sqrt.cl +53 -0
  363. data/ext/sources/ggml/src/ggml-opencl/kernels/ssm_conv.cl +77 -0
  364. data/ext/sources/ggml/src/ggml-opencl/kernels/transpose.cl +13 -0
  365. data/ext/sources/ggml/src/ggml-rpc/ggml-rpc.cpp +438 -156
  366. data/ext/sources/ggml/src/ggml-sycl/CMakeLists.txt +48 -3
  367. data/ext/sources/ggml/src/ggml-sycl/add-id.cpp +77 -0
  368. data/ext/sources/ggml/src/ggml-sycl/add-id.hpp +8 -0
  369. data/ext/sources/ggml/src/ggml-sycl/backend.hpp +6 -0
  370. data/ext/sources/ggml/src/ggml-sycl/binbcast.cpp +0 -9
  371. data/ext/sources/ggml/src/ggml-sycl/binbcast.hpp +0 -6
  372. data/ext/sources/ggml/src/ggml-sycl/common.hpp +117 -15
  373. data/ext/sources/ggml/src/ggml-sycl/concat.cpp +55 -44
  374. data/ext/sources/ggml/src/ggml-sycl/convert.cpp +34 -0
  375. data/ext/sources/ggml/src/ggml-sycl/count-equal.cpp +79 -0
  376. data/ext/sources/ggml/src/ggml-sycl/count-equal.hpp +9 -0
  377. data/ext/sources/ggml/src/ggml-sycl/cpy.cpp +0 -3
  378. data/ext/sources/ggml/src/ggml-sycl/dequantize.hpp +18 -0
  379. data/ext/sources/ggml/src/ggml-sycl/dpct/helper.hpp +76 -3
  380. data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +333 -300
  381. data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +10 -2
  382. data/ext/sources/ggml/src/ggml-sycl/ggml-sycl.cpp +335 -110
  383. data/ext/sources/ggml/src/ggml-sycl/mmvq.cpp +22 -0
  384. data/ext/sources/ggml/src/ggml-sycl/norm.cpp +156 -0
  385. data/ext/sources/ggml/src/ggml-sycl/norm.hpp +2 -0
  386. data/ext/sources/ggml/src/ggml-sycl/pad.cpp +97 -0
  387. data/ext/sources/ggml/src/ggml-sycl/pad.hpp +24 -0
  388. data/ext/sources/ggml/src/ggml-sycl/pad_reflect_1d.cpp +100 -0
  389. data/ext/sources/ggml/src/ggml-sycl/pad_reflect_1d.hpp +10 -0
  390. data/ext/sources/ggml/src/ggml-sycl/presets.hpp +2 -0
  391. data/ext/sources/ggml/src/ggml-sycl/repeat_back.cpp +76 -0
  392. data/ext/sources/ggml/src/ggml-sycl/repeat_back.hpp +8 -0
  393. data/ext/sources/ggml/src/ggml-sycl/roll.cpp +122 -0
  394. data/ext/sources/ggml/src/ggml-sycl/roll.hpp +20 -0
  395. data/ext/sources/ggml/src/ggml-sycl/rope.cpp +30 -17
  396. data/ext/sources/ggml/src/ggml-sycl/set.cpp +73 -0
  397. data/ext/sources/ggml/src/ggml-sycl/set.hpp +5 -0
  398. data/ext/sources/ggml/src/ggml-sycl/softmax.cpp +327 -162
  399. data/ext/sources/ggml/src/ggml-sycl/softmax.hpp +4 -0
  400. data/ext/sources/ggml/src/ggml-sycl/ssm_conv.cpp +127 -0
  401. data/ext/sources/ggml/src/ggml-sycl/ssm_conv.hpp +5 -0
  402. data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +58 -0
  403. data/ext/sources/ggml/src/ggml-vulkan/CMakeLists.txt +38 -18
  404. data/ext/sources/ggml/src/ggml-vulkan/ggml-vulkan.cpp +5013 -2859
  405. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/abs.comp +21 -0
  406. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/acc.comp +2 -2
  407. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add.comp +2 -2
  408. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add1.comp +28 -0
  409. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add_id.comp +1 -1
  410. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/arange.comp +20 -0
  411. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argmax.comp +2 -2
  412. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp +33 -26
  413. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argsort_large.comp +114 -0
  414. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/ceil.comp +22 -0
  415. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/clamp.comp +2 -2
  416. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/concat.comp +2 -2
  417. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/contig_copy.comp +2 -2
  418. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_dw.comp +1 -1
  419. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +47 -49
  420. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +1 -1
  421. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy.comp +2 -2
  422. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +3 -3
  423. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +4 -4
  424. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_transpose.comp +67 -0
  425. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/cos.comp +2 -2
  426. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/count_equal.comp +2 -2
  427. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/count_experts.comp +51 -0
  428. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/cumsum.comp +83 -0
  429. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/cumsum_multipass1.comp +60 -0
  430. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/cumsum_multipass2.comp +66 -0
  431. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_f32.comp +1 -1
  432. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{dequant_funcs.comp → dequant_funcs.glsl} +9 -21
  433. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{dequant_funcs_cm2.comp → dequant_funcs_cm2.glsl} +18 -4
  434. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{dequant_head.comp → dequant_head.glsl} +1 -1
  435. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_m.comp +1 -1
  436. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_s.comp +1 -1
  437. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_s.comp +1 -1
  438. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xs.comp +1 -1
  439. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xxs.comp +1 -1
  440. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_s.comp +1 -1
  441. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_xxs.comp +1 -1
  442. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_nl.comp +1 -1
  443. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_xs.comp +1 -1
  444. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_mxfp4.comp +3 -3
  445. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q2_k.comp +3 -3
  446. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q3_k.comp +1 -1
  447. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_0.comp +1 -1
  448. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_1.comp +1 -1
  449. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_k.comp +3 -3
  450. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_0.comp +1 -1
  451. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_1.comp +1 -1
  452. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_k.comp +3 -3
  453. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q6_k.comp +1 -1
  454. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q8_0.comp +1 -1
  455. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/diag.comp +29 -0
  456. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/diag_mask_inf.comp +1 -1
  457. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/div.comp +2 -2
  458. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/exp.comp +3 -3
  459. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/fill.comp +19 -0
  460. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +39 -17
  461. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{flash_attn_base.comp → flash_attn_base.glsl} +19 -1
  462. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +45 -7
  463. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +50 -12
  464. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +1 -1
  465. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/floor.comp +22 -0
  466. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu.comp +2 -2
  467. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu_erf.comp +2 -2
  468. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu_quick.comp +2 -2
  469. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gelu.comp +2 -2
  470. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gelu_erf.comp +2 -2
  471. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gelu_quick.comp +2 -2
  472. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{generic_binary_head.comp → generic_binary_head.glsl} +17 -2
  473. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{generic_head.comp → generic_head.glsl} +2 -0
  474. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{generic_unary_head.comp → generic_unary_head.glsl} +7 -0
  475. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/get_rows.comp +4 -4
  476. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/get_rows_quant.comp +3 -3
  477. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{glu_head.comp → glu_head.glsl} +1 -1
  478. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/group_norm.comp +2 -2
  479. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/hardsigmoid.comp +2 -2
  480. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/hardswish.comp +2 -2
  481. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +19 -7
  482. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col_3d.comp +2 -3
  483. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/l2_norm.comp +2 -2
  484. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/leaky_relu.comp +2 -2
  485. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/log.comp +18 -0
  486. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul.comp +2 -2
  487. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp +2 -2
  488. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{mul_mat_vec_base.comp → mul_mat_vec_base.glsl} +70 -25
  489. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iface.glsl +35 -0
  490. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_m.comp +71 -21
  491. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_s.comp +41 -25
  492. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_s.comp +2 -2
  493. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xs.comp +44 -26
  494. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xxs.comp +2 -2
  495. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_s.comp +2 -2
  496. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_xxs.comp +2 -2
  497. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_nc.comp +9 -7
  498. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_p021.comp +9 -7
  499. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp +4 -6
  500. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q3_k.comp +2 -2
  501. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp +4 -6
  502. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp +4 -6
  503. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp +2 -2
  504. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp +39 -36
  505. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq_funcs.glsl +494 -0
  506. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +78 -103
  507. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +34 -23
  508. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{mul_mm_funcs.comp → mul_mm_funcs.glsl} +69 -59
  509. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_id_funcs.glsl +72 -0
  510. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +88 -228
  511. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.glsl +454 -0
  512. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_shmem_types.glsl +78 -0
  513. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp +97 -13
  514. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/neg.comp +20 -0
  515. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/norm.comp +2 -2
  516. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_adamw.comp +2 -2
  517. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_sgd.comp +1 -1
  518. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/pad.comp +21 -6
  519. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/pool2d.comp +1 -1
  520. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/quantize_q8_1.comp +10 -10
  521. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/reglu.comp +2 -2
  522. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/relu.comp +2 -2
  523. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/repeat.comp +2 -2
  524. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/repeat_back.comp +2 -2
  525. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +50 -4
  526. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_back.comp +2 -2
  527. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_partials.comp +2 -2
  528. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/roll.comp +2 -2
  529. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl +234 -0
  530. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.glsl +20 -0
  531. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +6 -50
  532. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +6 -33
  533. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +6 -33
  534. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl +28 -0
  535. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_vision.comp +6 -39
  536. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/round.comp +29 -0
  537. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp +2 -2
  538. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sigmoid.comp +2 -2
  539. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/silu.comp +2 -2
  540. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/silu_back.comp +2 -2
  541. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sin.comp +2 -2
  542. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +1 -1
  543. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_back.comp +2 -2
  544. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large1.comp +62 -0
  545. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large2.comp +79 -0
  546. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large3.comp +65 -0
  547. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large_common.glsl +53 -0
  548. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/softplus.comp +23 -0
  549. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/solve_tri.comp +81 -0
  550. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sqrt.comp +2 -2
  551. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/square.comp +2 -2
  552. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/ssm_conv.comp +44 -0
  553. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/ssm_scan.comp +124 -0
  554. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/step.comp +22 -0
  555. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sub.comp +2 -2
  556. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.comp +2 -25
  557. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.glsl +25 -0
  558. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/swiglu.comp +2 -2
  559. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/swiglu_oai.comp +2 -2
  560. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/tanh.comp +2 -2
  561. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/timestep_embedding.comp +1 -1
  562. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/topk_argsort.comp +118 -0
  563. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/topk_moe.comp +213 -0
  564. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/topk_nary_search.comp +246 -0
  565. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/tri.comp +43 -0
  566. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/trunc.comp +22 -0
  567. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{types.comp → types.glsl} +345 -26
  568. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +90 -12
  569. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +335 -151
  570. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/xielu.comp +35 -0
  571. data/ext/sources/ggml/src/ggml-webgpu/CMakeLists.txt +28 -2
  572. data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp +169 -0
  573. data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu.cpp +1964 -435
  574. data/ext/sources/ggml/src/ggml-webgpu/pre_wgsl.hpp +778 -0
  575. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/bin_op.tmpl.wgsl +188 -0
  576. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cpy.tmpl.wgsl +101 -0
  577. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +33 -10
  578. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn.wgsl +591 -0
  579. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/get_rows.tmpl.wgsl +1 -1
  580. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/glu.tmpl.wgsl +323 -0
  581. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl +6 -6
  582. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl +97 -0
  583. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.tmpl.wgsl +247 -0
  584. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_subgroup_matrix.tmpl.wgsl +302 -0
  585. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.tmpl.wgsl +267 -0
  586. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm.wgsl +83 -17
  587. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rope.tmpl.wgsl +295 -0
  588. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/scale.tmpl.wgsl +90 -0
  589. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.tmpl.wgsl +112 -0
  590. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/soft_max.tmpl.wgsl +345 -0
  591. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/unary_op.wgsl +483 -0
  592. data/ext/sources/ggml/src/ggml-zendnn/CMakeLists.txt +92 -0
  593. data/ext/sources/ggml/src/ggml-zendnn/ggml-zendnn.cpp +466 -0
  594. data/ext/sources/ggml/src/ggml.c +425 -33
  595. data/ext/sources/include/whisper.h +1 -0
  596. data/ext/sources/src/CMakeLists.txt +3 -1
  597. data/ext/sources/src/whisper.cpp +101 -35
  598. data/ext/sources/tests/CMakeLists.txt +2 -2
  599. data/ext/sources/tests/test-vad-full.cpp +4 -2
  600. data/ext/sources/tests/test-vad.cpp +1 -1
  601. data/extsources.rb +1 -0
  602. data/lib/whisper/model/uri.rb +17 -18
  603. data/sig/whisper.rbs +119 -2
  604. data/test/test_params.rb +16 -8
  605. data/test/test_segment.rb +0 -1
  606. data/test/test_token.rb +70 -0
  607. data/test/test_vad.rb +1 -1
  608. data/test/test_vad_context.rb +50 -0
  609. data/test/test_vad_segment.rb +19 -0
  610. data/test/test_vad_segments.rb +16 -0
  611. data/test/test_whisper.rb +7 -0
  612. data/whispercpp.gemspec +1 -1
  613. metadata +287 -34
  614. data/ext/sources/build-xcframework.sh +0 -571
  615. data/ext/sources/ggml/src/ggml-cann/Doxyfile +0 -2579
  616. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.comp +0 -105
  617. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.comp +0 -55
  618. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/add.tmpl.wgsl +0 -44
  619. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/add_in_place.tmpl.wgsl +0 -41
  620. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl +0 -60
  621. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul.tmpl.wgsl +0 -44
  622. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_in_place.tmpl.wgsl +0 -41
  623. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm_in_place.wgsl +0 -48
  624. /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{test_bfloat16_support.comp → feature-tests/bfloat16.comp} +0 -0
  625. /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{test_coopmat_support.comp → feature-tests/coopmat.comp} +0 -0
  626. /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{test_coopmat2_support.comp → feature-tests/coopmat2.comp} +0 -0
  627. /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{test_integer_dot_support.comp → feature-tests/integer_dot.comp} +0 -0
  628. /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{glu_main.comp → glu_main.glsl} +0 -0
  629. /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{rte.comp → rte.glsl} +0 -0
  630. /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{utils.comp → utils.glsl} +0 -0
@@ -0,0 +1,1001 @@
1
+ #pragma clang diagnostic ignored "-Wgnu-zero-variadic-macro-arguments"
2
+ #pragma clang diagnostic ignored "-Wunused-function"
3
+
4
+ #define FARF_ERROR 1
5
+ #define FARF_HIGH 1
6
+ #define FARF_MEDIUM 0
7
+ #define FARF_LOW 0
8
+ #include <AEEStdErr.h>
9
+ #include <dspqueue.h>
10
+ #include <HAP_compute_res.h>
11
+ #include <HAP_etm_config.h>
12
+ #include <HAP_farf.h>
13
+ #include <HAP_mem.h>
14
+ #include <HAP_perf.h>
15
+ #include <HAP_power.h>
16
+ #include <HAP_ps.h>
17
+ #include <qurt.h>
18
+ #include <qurt_thread.h>
19
+ #include <remote.h>
20
+ #include <string.h>
21
+
22
+ #define GGML_COMMON_DECL_C
23
+ #include "ggml-common.h"
24
+ #include "htp-ctx.h"
25
+ #include "htp-dma.h"
26
+ #include "htp-msg.h"
27
+ #include "htp-ops.h"
28
+ #include "ops-utils.h"
29
+ #include "worker-pool.h"
30
+
31
+ AEEResult htp_iface_open(const char * uri, remote_handle64 * handle) {
32
+ struct htp_context * ctx;
33
+ int err = 0;
34
+
35
+ ctx = calloc(1, sizeof(*ctx));
36
+ if (ctx == NULL) {
37
+ return AEE_ENOMEMORY;
38
+ }
39
+
40
+ // Use the context structure as a handle
41
+ *handle = (remote_handle64) ctx;
42
+
43
+ // Enable FARF logs
44
+ HAP_setFARFRuntimeLoggingParams(0xffff, NULL, 0);
45
+
46
+ // Set client class
47
+ {
48
+ HAP_power_request_t request;
49
+ memset(&request, 0, sizeof(HAP_power_request_t));
50
+ request.type = HAP_power_set_apptype;
51
+ request.apptype = HAP_POWER_COMPUTE_CLIENT_CLASS;
52
+
53
+ if ((err = HAP_power_set((void *) ctx, &request)) != 0) {
54
+ return err;
55
+ }
56
+ }
57
+
58
+ {
59
+ HAP_power_request_t request;
60
+ memset(&request, 0, sizeof(request));
61
+
62
+ request.type = HAP_power_set_DCVS_v3;
63
+ request.dcvs_v3.set_dcvs_enable = TRUE;
64
+ request.dcvs_v3.dcvs_enable = TRUE;
65
+ request.dcvs_v3.dcvs_option = HAP_DCVS_V2_PERFORMANCE_MODE;
66
+ request.dcvs_v3.set_bus_params = TRUE;
67
+ request.dcvs_v3.bus_params.min_corner = HAP_DCVS_VCORNER_MAX;
68
+ request.dcvs_v3.bus_params.max_corner = HAP_DCVS_VCORNER_MAX;
69
+ request.dcvs_v3.bus_params.target_corner = HAP_DCVS_VCORNER_MAX;
70
+ request.dcvs_v3.set_core_params = TRUE;
71
+ request.dcvs_v3.core_params.min_corner = HAP_DCVS_VCORNER_MAX;
72
+ request.dcvs_v3.core_params.max_corner = HAP_DCVS_VCORNER_MAX;
73
+ request.dcvs_v3.core_params.target_corner = HAP_DCVS_VCORNER_MAX;
74
+ request.dcvs_v3.set_sleep_disable = TRUE;
75
+ request.dcvs_v3.sleep_disable = TRUE;
76
+ if ((err = HAP_power_set((void *) ctx, &request)) != 0) {
77
+ return err;
78
+ }
79
+
80
+ memset(&request, 0, sizeof(request));
81
+ request.type = HAP_power_set_HVX;
82
+ request.hvx.power_up = TRUE;
83
+ if ((err = HAP_power_set((void *) ctx, &request)) != 0) {
84
+ return err;
85
+ }
86
+ }
87
+
88
+ {
89
+ // Power on HMX
90
+ HAP_power_request_t request;
91
+ memset(&request, 0, sizeof(HAP_power_request_t));
92
+ request.type = HAP_power_set_HMX;
93
+ request.hmx.power_up = TRUE;
94
+ FARF(ALWAYS, "Powering HMX on\n");
95
+ err = HAP_power_set((void *) &ctx, &request);
96
+ if (err != AEE_SUCCESS) {
97
+ FARF(ERROR, "Error powering on HMX.");
98
+ return err;
99
+ }
100
+ }
101
+
102
+ return AEE_SUCCESS;
103
+ }
104
+
105
+ AEEResult htp_iface_close(remote_handle64 handle) {
106
+ struct htp_context * ctx = (struct htp_context *) handle;
107
+
108
+ if (!ctx) {
109
+ return AEE_EBADPARM;
110
+ }
111
+
112
+ if (ctx->queue) {
113
+ FARF(ERROR, "Closing handle with queue still open");
114
+ return AEE_EITEMBUSY;
115
+ }
116
+
117
+ free(ctx);
118
+ return AEE_SUCCESS;
119
+ }
120
+
121
+ AEEResult htp_iface_enable_etm(remote_handle64 handle) {
122
+ int err = HAP_user_etm_enable();
123
+ if (err) {
124
+ if (err == AEE_EVERSIONNOTSUPPORT) {
125
+ FARF(ERROR, "API HAP_user_etm_enable is not supported\n");
126
+ } else {
127
+ FARF(ERROR, "Error executing HAP_user_etm_enable with error code : 0x%x\n", err);
128
+ }
129
+ }
130
+ return err;
131
+ }
132
+
133
+ AEEResult htp_iface_disable_etm(remote_handle64 handle) {
134
+ int err = HAP_user_etm_disable();
135
+ if (err) {
136
+ if (err == AEE_EVERSIONNOTSUPPORT) {
137
+ FARF(ERROR, "API HAP_user_etm_disable is not supported\n");
138
+ } else {
139
+ FARF(ERROR, "Error executing HAP_user_etm_disable with error code : 0x%x\n", err);
140
+ }
141
+ }
142
+ return err;
143
+ }
144
+
145
+ static int vtcm_acquire(struct htp_context * ctx) {
146
+ int err;
147
+ if (!ctx->vtcm_valid) {
148
+ // Temporarily bump thread priority to make sure it's higher than other sessions.
149
+ // This way the resource manager will notify the other thread to release VTCM.
150
+ // Note that we need to reaquire VTCM at normal priority for this to work next time.
151
+ qurt_thread_set_priority(qurt_thread_get_id(), ctx->thread_prio - 10);
152
+ err = HAP_compute_res_acquire_cached(ctx->vtcm_rctx, 1000000);
153
+ if (err != 0) {
154
+ FARF(ERROR, "Failed to acquire VTCM: 0x%08x", (unsigned)err);
155
+ abort();
156
+ }
157
+ HAP_compute_res_release_cached(ctx->vtcm_rctx);
158
+ qurt_thread_set_priority(qurt_thread_get_id(), ctx->thread_prio);
159
+
160
+ err = HAP_compute_res_acquire_cached(ctx->vtcm_rctx, 1000000);
161
+ if (err != 0) {
162
+ FARF(ERROR, "Failed to acquire VTCM: 0x%08x", (unsigned)err);
163
+ abort();
164
+ }
165
+ ctx->vtcm_valid = true;
166
+ }
167
+
168
+ ctx->vtcm_inuse = true;
169
+ return 0;
170
+ }
171
+
172
+ static int vtcm_release(struct htp_context * ctx) {
173
+ ctx->vtcm_inuse = false;
174
+
175
+ if (ctx->vtcm_valid && ctx->vtcm_needs_release) {
176
+ ctx->vtcm_valid = false;
177
+ ctx->vtcm_needs_release = false;
178
+ HAP_compute_res_release_cached(ctx->vtcm_rctx);
179
+ }
180
+
181
+ return 0;
182
+ }
183
+
184
+ static int vtcm_release_callback(unsigned int rctx, void * state) {
185
+ struct htp_context * ctx = (struct htp_context *) state;
186
+
187
+ if (!ctx || ctx->vtcm_rctx != rctx) {
188
+ return AEE_EBADPARM;
189
+ }
190
+
191
+ // If VTCM is not inuse (not processing Ops) release it right here
192
+ // otherwise we'll release it once we're done with the current Op.
193
+
194
+ if (ctx->vtcm_inuse) {
195
+ ctx->vtcm_needs_release = false;
196
+ return 0;
197
+ }
198
+
199
+ ctx->vtcm_valid = false;
200
+ HAP_compute_res_release_cached(ctx->vtcm_rctx);
201
+
202
+ return 0;
203
+ }
204
+
205
+ static int vtcm_alloc(struct htp_context * ctx) {
206
+ unsigned int vtcm_size = 8 * 1024 * 1024; // 8MB default
207
+ HAP_compute_res_query_VTCM(0, &vtcm_size, NULL, NULL, NULL);
208
+
209
+ compute_res_attr_t attr;
210
+ HAP_compute_res_attr_init(&attr);
211
+ HAP_compute_res_attr_set_serialize(&attr, 0);
212
+ HAP_compute_res_attr_set_cache_mode(&attr, 1);
213
+ HAP_compute_res_attr_set_vtcm_param_v2(&attr, vtcm_size, 0, vtcm_size);
214
+ HAP_compute_res_attr_set_release_callback(&attr, vtcm_release_callback, (void *) ctx);
215
+ HAP_compute_res_attr_set_hmx_param(&attr, 1);
216
+
217
+ // Allocate VTCM for scratch pads
218
+ uint32_t rctx = HAP_compute_res_acquire(&attr, 1000000 /* timeout */);
219
+ if (!rctx) {
220
+ FARF(ERROR, "failed to allocate %zu bytes VTCM\n", ctx->vtcm_size);
221
+ return AEE_ENOMEMORY;
222
+ }
223
+
224
+ void * vtcm_ptr;
225
+ if (HAP_compute_res_attr_get_vtcm_ptr_v2(&attr, &vtcm_ptr, &vtcm_size) != 0) {
226
+ HAP_compute_res_release(rctx);
227
+ FARF(ERROR, "failed to allocate %zu bytes VTCM (new)\n", ctx->vtcm_size);
228
+ return AEE_ENOMEMORY;
229
+ }
230
+
231
+ ctx->vtcm_base = (uint8_t *) vtcm_ptr;
232
+ ctx->vtcm_size = vtcm_size;
233
+ ctx->vtcm_rctx = rctx;
234
+ ctx->vtcm_valid = false;
235
+ ctx->vtcm_inuse = false;
236
+ ctx->vtcm_needs_release = false;
237
+
238
+ return 0;
239
+ }
240
+
241
+ static void vtcm_free(struct htp_context * ctx) {
242
+ if (ctx->vtcm_rctx) {
243
+ HAP_compute_res_release(ctx->vtcm_rctx);
244
+ ctx->vtcm_base = 0;
245
+ ctx->vtcm_rctx = 0;
246
+ }
247
+ }
248
+
249
+ static void htp_packet_callback(dspqueue_t queue, int error, void * context);
250
+ static void htp_error_callback(dspqueue_t queue, int error, void * context);
251
+
252
+ AEEResult htp_iface_start(remote_handle64 handle, uint32 sess_id, uint64 dsp_queue_id, uint32 n_hvx) {
253
+ struct htp_context * ctx = (struct htp_context *) handle;
254
+
255
+ if (!ctx) {
256
+ return AEE_EBADPARM;
257
+ }
258
+
259
+ if (ctx->queue) {
260
+ FARF(ERROR, "Queue already open");
261
+ return AEE_EITEMBUSY;
262
+ }
263
+
264
+ // Import queue created on the CPU
265
+ int err = dspqueue_import(dsp_queue_id, // Queue ID from dspqueue_export
266
+ htp_packet_callback, // Packet callback
267
+ htp_error_callback, // Error callback; no errors expected on the DSP
268
+ (void *) ctx, // Callback context
269
+ &ctx->queue);
270
+
271
+ if (err) {
272
+ FARF(ERROR, "Queue import failed with 0x%08x", (unsigned) err);
273
+ return err;
274
+ }
275
+
276
+ ctx->thread_id = qurt_thread_get_id();
277
+ ctx->thread_prio = qurt_thread_get_priority(ctx->thread_id);
278
+
279
+ // allocate VTCM
280
+ err = vtcm_alloc(ctx);
281
+ if (err != AEE_SUCCESS) {
282
+ FARF(ERROR, "Unable to allocate VTCM");
283
+ return AEE_ENOMEMORY;
284
+ }
285
+
286
+ qurt_sysenv_max_hthreads_t hw_threads;
287
+ qurt_sysenv_get_max_hw_threads(&hw_threads);
288
+ uint32_t hw_nhvx = (qurt_hvx_get_units() >> 8) & 0xFF;
289
+
290
+ if (n_hvx == 0) {
291
+ n_hvx = hw_nhvx;
292
+ }
293
+ if (n_hvx > hw_threads.max_hthreads) {
294
+ n_hvx = hw_threads.max_hthreads;
295
+ }
296
+ if (n_hvx > HTP_MAX_NTHREADS) {
297
+ n_hvx = HTP_MAX_NTHREADS;
298
+ }
299
+
300
+ ctx->n_threads = n_hvx;
301
+ for (int i = 0; i < ctx->n_threads; i++) {
302
+ // see discussion https://github.com/ggml-org/llama.cpp/pull/18151#discussion_r2632388541
303
+ ctx->dma[i] = dma_queue_create(64);
304
+ }
305
+
306
+ // init worker pool
307
+ err = worker_pool_init(&ctx->worker_pool, n_hvx);
308
+ if (err != AEE_SUCCESS) {
309
+ FARF(ERROR, "Unable to create worker pool");
310
+ return err;
311
+ }
312
+
313
+ FARF(HIGH, "session %u started: n-hvx %u vtcm-size %zu vtcm-rctx %u n-threads %u thread-id %d thread-prio %d \n",
314
+ sess_id, hw_nhvx, ctx->vtcm_size, ctx->vtcm_rctx, ctx->n_threads, ctx->thread_id, ctx->thread_prio);
315
+
316
+ return AEE_SUCCESS;
317
+ }
318
+
319
+ AEEResult htp_iface_stop(remote_handle64 handle) {
320
+ struct htp_context * ctx = (struct htp_context *) handle;
321
+ if (!ctx) {
322
+ return AEE_EBADPARM;
323
+ }
324
+
325
+ if (!ctx->queue) {
326
+ FARF(ERROR, "Queue not open");
327
+ return AEE_EBADSTATE;
328
+ }
329
+
330
+ // Close queue. dspqueue_close() will also wait for callbacks to finish.
331
+ int err = dspqueue_close(ctx->queue);
332
+ ctx->queue = NULL;
333
+ if (err != 0) {
334
+ FARF(ERROR, "Queue close failed with 0x%08x", (unsigned) err);
335
+ return err;
336
+ }
337
+
338
+ if (ctx->worker_pool) {
339
+ // Release worker pool
340
+ worker_pool_release(&ctx->worker_pool);
341
+ }
342
+
343
+ for (int i = 0; i < ctx->n_threads; i++) {
344
+ dma_queue_delete(ctx->dma[i]);
345
+ }
346
+
347
+ vtcm_free(ctx);
348
+
349
+ return AEE_SUCCESS;
350
+ }
351
+
352
+ static void htp_error_callback(dspqueue_t queue, int error, void * context) {
353
+ // No errors expected on the DSP.
354
+ FARF(ERROR, "Error callback: 0x%08x", (unsigned) error);
355
+ }
356
+
357
// Per-Op profiling counters.
// profile_start() stores the starting readings in these fields and
// profile_stop() overwrites them with the deltas since that start.
struct profile_data {
    uint64_t usecs;   // start: raw qtimer count; after stop: elapsed microseconds
    uint64_t cycles;  // start: htp_get_cycles() reading; after stop: difference
    uint64_t pkts;    // start: htp_get_pktcnt() reading; after stop: difference
};
362
+
363
+ static inline void profile_start(struct profile_data * d) {
364
+ d->usecs = HAP_perf_get_qtimer_count();
365
+ d->cycles = htp_get_cycles();
366
+ d->pkts = htp_get_pktcnt();
367
+ }
368
+
369
+ static inline void profile_stop(struct profile_data * d) {
370
+ d->usecs = HAP_perf_qtimer_count_to_us(HAP_perf_get_qtimer_count() - d->usecs);
371
+ d->cycles = htp_get_cycles() - d->cycles;
372
+ d->pkts = htp_get_pktcnt() - d->pkts;
373
+ }
374
+
375
+ static int send_htp_rsp(struct htp_context * c,
376
+ uint32_t op,
377
+ uint32_t status,
378
+ struct dspqueue_buffer * bufs,
379
+ size_t n_bufs,
380
+ struct profile_data * prof) {
381
+ // Prep response struct
382
+ struct htp_general_rsp rsp;
383
+ rsp.op = op;
384
+ rsp.status = status;
385
+ rsp.prof_usecs = prof->usecs;
386
+ rsp.prof_cycles = prof->cycles;
387
+ rsp.prof_pkts = prof->pkts;
388
+
389
+ int err = dspqueue_write(c->queue,
390
+ 0, // Flags
391
+ n_bufs,
392
+ bufs, // Buffer references
393
+ sizeof(rsp),
394
+ (const uint8_t *) &rsp, // Message
395
+ DSPQUEUE_TIMEOUT_NONE);
396
+
397
+ if (err != 0) {
398
+ FARF(ERROR, "dspqueue_write failed: 0x%08x", (unsigned) err);
399
+ }
400
+
401
+ return err;
402
+ }
403
+
404
+ static void proc_matmul_req(struct htp_context * ctx,
405
+ struct htp_general_req * req,
406
+ struct dspqueue_buffer * bufs,
407
+ size_t n_bufs) {
408
+ struct dspqueue_buffer rsp_bufs[1];
409
+
410
+ // We had written to the output buffer, we'd also need to flush it
411
+ rsp_bufs[0].fd = bufs[2].fd;
412
+ rsp_bufs[0].ptr = bufs[2].ptr;
413
+ rsp_bufs[0].size = bufs[2].size;
414
+ rsp_bufs[0].offset = bufs[2].offset;
415
+ rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP
416
+ DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU
417
+
418
+ // Setup Op context
419
+ struct htp_ops_context octx = { 0 };
420
+ octx.ctx = ctx;
421
+ octx.src0 = req->src0;
422
+ octx.src1 = req->src1;
423
+ octx.dst = req->dst;
424
+ octx.flags = req->flags;
425
+ octx.op = req->op;
426
+
427
+ // Update data pointers
428
+ octx.src0.data = (uint32_t) bufs[0].ptr;
429
+ octx.src1.data = (uint32_t) bufs[1].ptr;
430
+ octx.dst.data = (uint32_t) bufs[2].ptr;
431
+ octx.n_threads = ctx->n_threads;
432
+
433
+ struct profile_data prof;
434
+ profile_start(&prof);
435
+
436
+ uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR;
437
+ if (vtcm_acquire(ctx) == AEE_SUCCESS) {
438
+ rsp_status = op_matmul(&octx);
439
+ vtcm_release(ctx);
440
+ }
441
+
442
+ profile_stop(&prof);
443
+ send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 1, &prof);
444
+ }
445
+
446
+ static void proc_get_rows_req(struct htp_context * ctx, struct htp_general_req * req, struct dspqueue_buffer * bufs) {
447
+ struct dspqueue_buffer rsp_bufs[1];
448
+
449
+ // We had written to the output buffer, we'd also need to flush it
450
+ rsp_bufs[0].fd = bufs[2].fd;
451
+ rsp_bufs[0].ptr = bufs[2].ptr;
452
+ rsp_bufs[0].offset = bufs[2].offset;
453
+ rsp_bufs[0].size = bufs[2].size;
454
+ rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP
455
+ DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU
456
+
457
+ // Setup Op context
458
+ struct htp_ops_context octx = { 0 };
459
+ octx.ctx = ctx;
460
+ octx.src0 = req->src0;
461
+ octx.src1 = req->src1;
462
+ octx.dst = req->dst;
463
+ octx.flags = req->flags;
464
+ octx.op = req->op;
465
+
466
+ // Update data pointers
467
+ octx.src0.data = (uint32_t) bufs[0].ptr;
468
+ octx.src1.data = (uint32_t) bufs[1].ptr;
469
+ octx.dst.data = (uint32_t) bufs[2].ptr;
470
+ octx.n_threads = ctx->n_threads;
471
+
472
+ struct profile_data prof;
473
+ profile_start(&prof);
474
+
475
+ uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR;
476
+ if (vtcm_acquire(ctx) == AEE_SUCCESS) {
477
+ rsp_status = op_get_rows(&octx);
478
+ vtcm_release(ctx);
479
+ }
480
+
481
+ profile_stop(&prof);
482
+ send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 1, &prof);
483
+ }
484
+
485
+ static void proc_matmul_id_req(struct htp_context * ctx,
486
+ struct htp_general_req * req,
487
+ struct dspqueue_buffer * bufs,
488
+ size_t n_bufs) {
489
+ struct dspqueue_buffer rsp_bufs[1];
490
+
491
+ // We had written to the output buffer, we'd also need to flush it
492
+ rsp_bufs[0].fd = bufs[3].fd;
493
+ rsp_bufs[0].ptr = bufs[3].ptr;
494
+ rsp_bufs[0].size = bufs[3].size;
495
+ rsp_bufs[0].offset = bufs[3].offset;
496
+ rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP
497
+ DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU
498
+
499
+ // Setup Op context
500
+ struct htp_ops_context octx = { 0 };
501
+ octx.ctx = ctx;
502
+ octx.src0 = req->src0;
503
+ octx.src1 = req->src1;
504
+ octx.src2 = req->src2;
505
+ octx.dst = req->dst;
506
+ octx.flags = req->flags;
507
+ octx.op = req->op;
508
+
509
+ // Update data pointers
510
+ octx.src0.data = (uint32_t) bufs[0].ptr;
511
+ octx.src1.data = (uint32_t) bufs[1].ptr;
512
+ octx.src2.data = (uint32_t) bufs[2].ptr;
513
+ octx.dst.data = (uint32_t) bufs[3].ptr;
514
+ octx.n_threads = ctx->n_threads;
515
+
516
+ struct profile_data prof;
517
+ profile_start(&prof);
518
+
519
+ uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR;
520
+ if (vtcm_acquire(ctx) == AEE_SUCCESS) {
521
+ rsp_status = op_matmul_id(&octx);
522
+ vtcm_release(ctx);
523
+ }
524
+
525
+ profile_stop(&prof);
526
+ send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 1, &prof);
527
+ }
528
+
529
+ static void proc_binary_req(struct htp_context * ctx, struct htp_general_req * req, struct dspqueue_buffer * bufs) {
530
+ struct dspqueue_buffer rsp_bufs[1];
531
+
532
+ // We had written to the output buffer, we'd also need to flush it
533
+ rsp_bufs[0].fd = bufs[2].fd;
534
+ rsp_bufs[0].ptr = bufs[2].ptr;
535
+ rsp_bufs[0].offset = bufs[2].offset;
536
+ rsp_bufs[0].size = bufs[2].size;
537
+ rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP
538
+ DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU
539
+
540
+ // Setup Op context
541
+ struct htp_ops_context octx = { 0 };
542
+ octx.ctx = ctx;
543
+ octx.src0 = req->src0;
544
+ octx.src1 = req->src1;
545
+ octx.dst = req->dst;
546
+ octx.flags = req->flags;
547
+ octx.op = req->op;
548
+
549
+ // Update data pointers
550
+ octx.src0.data = (uint32_t) bufs[0].ptr;
551
+ octx.src1.data = (uint32_t) bufs[1].ptr;
552
+ octx.dst.data = (uint32_t) bufs[2].ptr;
553
+ octx.n_threads = ctx->n_threads;
554
+
555
+ struct profile_data prof;
556
+ profile_start(&prof);
557
+
558
+ uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR;
559
+ if (vtcm_acquire(ctx) == AEE_SUCCESS) {
560
+ rsp_status = op_binary(&octx);
561
+ vtcm_release(ctx);
562
+ }
563
+
564
+ profile_stop(&prof);
565
+ send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 1, &prof);
566
+ }
567
+
568
+ static void proc_add_id_req(struct htp_context * ctx, struct htp_general_req * req, struct dspqueue_buffer * bufs) {
569
+ struct dspqueue_buffer rsp_bufs[1];
570
+
571
+ // We had written to the output buffer, we'd also need to flush it
572
+ rsp_bufs[0].fd = bufs[3].fd;
573
+ rsp_bufs[0].ptr = bufs[3].ptr;
574
+ rsp_bufs[0].offset = bufs[3].offset;
575
+ rsp_bufs[0].size = bufs[3].size;
576
+ rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP
577
+ DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU
578
+
579
+ // Setup Op context
580
+ struct htp_ops_context octx = { 0 };
581
+ octx.ctx = ctx;
582
+ octx.src0 = req->src0;
583
+ octx.src1 = req->src1;
584
+ octx.src2 = req->src2;
585
+ octx.dst = req->dst;
586
+ octx.flags = req->flags;
587
+ octx.op = req->op;
588
+
589
+ // Update data pointers
590
+ octx.src0.data = (uint32_t) bufs[0].ptr;
591
+ octx.src1.data = (uint32_t) bufs[1].ptr;
592
+ octx.src2.data = (uint32_t) bufs[2].ptr;
593
+ octx.dst.data = (uint32_t) bufs[3].ptr;
594
+ octx.n_threads = ctx->n_threads;
595
+
596
+ struct profile_data prof;
597
+ profile_start(&prof);
598
+
599
+ uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR;
600
+ if (vtcm_acquire(ctx) == AEE_SUCCESS) {
601
+ rsp_status = op_binary(&octx);
602
+ vtcm_release(ctx);
603
+ }
604
+
605
+ profile_stop(&prof);
606
+ send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 1, &prof);
607
+ }
608
+
609
+ static void proc_unary_req(struct htp_context * ctx, struct htp_general_req * req, struct dspqueue_buffer * bufs) {
610
+ struct dspqueue_buffer rsp_bufs[HTP_MAX_PACKET_BUFFERS];
611
+
612
+ // We had written to the output buffer, we'd also need to flush it
613
+ rsp_bufs[0].fd = bufs[1].fd;
614
+ rsp_bufs[0].ptr = bufs[1].ptr;
615
+ rsp_bufs[0].offset = bufs[1].offset;
616
+ rsp_bufs[0].size = bufs[1].size;
617
+ rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP
618
+ DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU
619
+
620
+ // Setup Op context
621
+ struct htp_ops_context octx = { 0 };
622
+ octx.ctx = ctx;
623
+ octx.src0 = req->src0;
624
+ octx.dst = req->dst;
625
+ octx.flags = req->flags;
626
+ octx.op = req->op;
627
+
628
+ memcpy(octx.op_params, req->op_params, sizeof(octx.op_params));
629
+
630
+ // Update data pointers
631
+ octx.src0.data = (uint32_t) bufs[0].ptr;
632
+ octx.dst.data = (uint32_t) bufs[1].ptr;
633
+ octx.n_threads = ctx->n_threads;
634
+
635
+ struct profile_data prof;
636
+ profile_start(&prof);
637
+
638
+ uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR;
639
+ if (vtcm_acquire(ctx) == AEE_SUCCESS) {
640
+ rsp_status = op_unary(&octx);
641
+ vtcm_release(ctx);
642
+ }
643
+
644
+ profile_stop(&prof);
645
+ send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 1, &prof);
646
+ }
647
+
648
+ static void proc_activations_req(struct htp_context * ctx,
649
+ struct htp_general_req * req,
650
+ struct dspqueue_buffer * bufs,
651
+ uint32_t n_bufs) {
652
+ struct dspqueue_buffer rsp_bufs[HTP_MAX_PACKET_BUFFERS];
653
+
654
+ int write_idx = (n_bufs == 3) ? 2 : 1;
655
+
656
+ // We had written to the output buffer, we'd also need to flush it
657
+ rsp_bufs[0].fd = bufs[write_idx].fd;
658
+ rsp_bufs[0].ptr = bufs[write_idx].ptr;
659
+ rsp_bufs[0].offset = bufs[write_idx].offset;
660
+ rsp_bufs[0].size = bufs[write_idx].size;
661
+ rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP
662
+ DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU
663
+
664
+ // Setup Op context
665
+ struct htp_ops_context octx = { 0 };
666
+ octx.ctx = ctx;
667
+ octx.src0 = req->src0;
668
+ if (3 == n_bufs) {
669
+ octx.src1 = req->src1;
670
+ }
671
+ octx.dst = req->dst;
672
+ octx.flags = req->flags;
673
+ octx.op = req->op;
674
+
675
+ memcpy(octx.op_params, req->op_params, sizeof(octx.op_params));
676
+
677
+ // Update data pointers
678
+ octx.src0.data = (uint32_t) bufs[0].ptr;
679
+ if (3 == n_bufs) {
680
+ octx.src1.data = (uint32_t) bufs[1].ptr;
681
+ octx.dst.data = (uint32_t) bufs[2].ptr;
682
+ } else {
683
+ octx.dst.data = (uint32_t) bufs[1].ptr;
684
+ }
685
+ octx.n_threads = ctx->n_threads;
686
+
687
+ struct profile_data prof;
688
+ profile_start(&prof);
689
+
690
+ uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR;
691
+ if (vtcm_acquire(ctx) == AEE_SUCCESS) {
692
+ if (octx.op == HTP_OP_SOFTMAX) {
693
+ rsp_status = op_softmax(&octx);
694
+ } else {
695
+ rsp_status = op_activations(&octx);
696
+ }
697
+ vtcm_release(ctx);
698
+ }
699
+
700
+ profile_stop(&prof);
701
+ send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 1, &prof);
702
+ }
703
+
704
+ static void proc_rope_req(struct htp_context * ctx,
705
+ struct htp_general_req * req,
706
+ struct dspqueue_buffer * bufs,
707
+ uint32_t n_bufs) {
708
+ struct dspqueue_buffer rsp_bufs[HTP_MAX_PACKET_BUFFERS];
709
+
710
+ int write_idx = n_bufs - 1;
711
+
712
+ // We had written to the output buffer, we'd also need to flush it
713
+ rsp_bufs[0].fd = bufs[write_idx].fd;
714
+ rsp_bufs[0].ptr = bufs[write_idx].ptr;
715
+ rsp_bufs[0].offset = bufs[write_idx].offset;
716
+ rsp_bufs[0].size = bufs[write_idx].size;
717
+ rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP
718
+ DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU
719
+
720
+ // Setup Op context
721
+ struct htp_ops_context octx = { 0 };
722
+ octx.ctx = ctx;
723
+ octx.src0 = req->src0;
724
+ octx.src1 = req->src1;
725
+ if (4 == n_bufs) {
726
+ octx.src2 = req->src2;
727
+ }
728
+ octx.dst = req->dst;
729
+ octx.flags = req->flags;
730
+ octx.op = req->op;
731
+
732
+ memcpy(octx.op_params, req->op_params, sizeof(octx.op_params));
733
+
734
+ // Update data pointers
735
+ octx.src0.data = (uint32_t) bufs[0].ptr;
736
+ octx.src1.data = (uint32_t) bufs[1].ptr;
737
+ if (4 == n_bufs) {
738
+ octx.src2.data = (uint32_t) bufs[2].ptr;
739
+ octx.dst.data = (uint32_t) bufs[3].ptr;
740
+ } else {
741
+ octx.dst.data = (uint32_t) bufs[2].ptr;
742
+ }
743
+ octx.n_threads = ctx->n_threads;
744
+
745
+ struct profile_data prof;
746
+ profile_start(&prof);
747
+
748
+ uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR;
749
+ if (vtcm_acquire(ctx) == AEE_SUCCESS) {
750
+ rsp_status = op_rope(&octx);
751
+ vtcm_release(ctx);
752
+ }
753
+
754
+ profile_stop(&prof);
755
+ send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 1, &prof);
756
+ }
757
+
758
+ static void proc_set_rows_req(struct htp_context * ctx, struct htp_general_req * req, struct dspqueue_buffer * bufs) {
759
+ struct dspqueue_buffer rsp_bufs[1];
760
+
761
+ // We had written to the output buffer, we'd also need to flush it
762
+ rsp_bufs[0].fd = bufs[2].fd;
763
+ rsp_bufs[0].ptr = bufs[2].ptr;
764
+ rsp_bufs[0].offset = bufs[2].offset;
765
+ rsp_bufs[0].size = bufs[2].size;
766
+ rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP
767
+ DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU
768
+
769
+ // Setup Op context
770
+ struct htp_ops_context octx = { 0 };
771
+ octx.ctx = ctx;
772
+ octx.src0 = req->src0;
773
+ octx.src1 = req->src1;
774
+ octx.dst = req->dst;
775
+ octx.flags = req->flags;
776
+ octx.op = req->op;
777
+
778
+ // Update data pointers
779
+ octx.src0.data = (uint32_t) bufs[0].ptr;
780
+ octx.src1.data = (uint32_t) bufs[1].ptr;
781
+ octx.dst.data = (uint32_t) bufs[2].ptr;
782
+ octx.n_threads = ctx->n_threads;
783
+
784
+ struct profile_data prof;
785
+ profile_start(&prof);
786
+
787
+ uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR;
788
+ if (vtcm_acquire(ctx) == AEE_SUCCESS) {
789
+ rsp_status = op_set_rows(&octx);
790
+ vtcm_release(ctx);
791
+ }
792
+
793
+ profile_stop(&prof);
794
+ send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 1, &prof);
795
+ }
796
+
797
+ static void proc_flash_attn_ext_req(struct htp_context * ctx,
798
+ struct htp_general_req * req,
799
+ struct dspqueue_buffer * bufs,
800
+ uint32_t n_bufs) {
801
+ // Setup Op context
802
+ struct htp_ops_context octx;
803
+ memset(&octx, 0, sizeof(octx));
804
+
805
+ octx.ctx = ctx;
806
+ octx.n_threads = ctx->n_threads;
807
+
808
+ octx.src0 = req->src0;
809
+ octx.src1 = req->src1;
810
+ octx.src2 = req->src2;
811
+ octx.src3 = req->src3;
812
+ octx.src4 = req->src4;
813
+ octx.dst = req->dst;
814
+ octx.flags = req->flags;
815
+ octx.op = req->op;
816
+
817
+ memcpy(octx.op_params, req->op_params, sizeof(octx.op_params));
818
+
819
+ // Update data pointers
820
+ octx.src0.data = (uint32_t) bufs[0].ptr;
821
+ octx.src1.data = (uint32_t) bufs[1].ptr;
822
+ octx.src2.data = (uint32_t) bufs[2].ptr;
823
+
824
+ int last_buf = 3;
825
+
826
+ if (octx.src3.ne[0]) {
827
+ octx.src3.data = (uint32_t) bufs[last_buf++].ptr; // mask is valid
828
+ }
829
+
830
+ if (octx.src4.ne[0]) {
831
+ octx.src4.data = (uint32_t) bufs[last_buf++].ptr; // sinks is valid
832
+ }
833
+
834
+ octx.dst.data = (uint32_t) bufs[last_buf].ptr;
835
+
836
+ struct profile_data prof;
837
+ profile_start(&prof);
838
+
839
+ uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR;
840
+ if (vtcm_acquire(ctx) == AEE_SUCCESS) {
841
+ rsp_status = op_flash_attn_ext(&octx);
842
+ vtcm_release(ctx);
843
+ }
844
+
845
+ profile_stop(&prof);
846
+
847
+ struct dspqueue_buffer rsp_buf = bufs[last_buf];
848
+ rsp_buf.flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP
849
+ DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU
850
+
851
+ send_htp_rsp(ctx, req->op, rsp_status, &bufs[last_buf], 1, &prof);
852
+ }
853
+
854
+ static void htp_packet_callback(dspqueue_t queue, int error, void * context) {
855
+ struct htp_context * ctx = (struct htp_context *) context;
856
+
857
+ // Repeatedly read packets from the queue until it's empty. We don't
858
+ // necessarily get a separate callback for each packet, and new packets
859
+ // may arrive while we're processing the previous one. This ensures we
860
+ // keep the DSP busy as much as possible and avoid waiting for the CPU.
861
+
862
+ while (1) {
863
+ struct htp_general_req req;
864
+ uint32_t req_size;
865
+
866
+ struct dspqueue_buffer bufs[HTP_MAX_PACKET_BUFFERS];
867
+ uint32_t n_bufs;
868
+ uint32_t flags;
869
+
870
+ // Read packet from queue
871
+ int err = dspqueue_read_noblock(queue, &flags,
872
+ HTP_MAX_PACKET_BUFFERS, // Maximum number of buffer references
873
+ &n_bufs, // Number of buffer references
874
+ bufs, // Buffer references
875
+ sizeof(req), // Max message length
876
+ &req_size, // Message length
877
+ (uint8_t *) &req); // Message
878
+
879
+ if (err == AEE_EWOULDBLOCK) {
880
+ // Consumed all packets available for now
881
+ return;
882
+ }
883
+
884
+ if (err != 0) {
885
+ FARF(ERROR, "dspqueue_read_noblock failed: 0x%08x", (unsigned) err);
886
+ return;
887
+ }
888
+
889
+ if (req_size != sizeof(req)) {
890
+ FARF(ERROR, "Invalid request size");
891
+ continue;
892
+ }
893
+
894
+ if (req.flags & HTP_OPFLAGS_EARLY_WAKEUP) {
895
+ // Host wants early notification
896
+ dspqueue_write_early_wakeup_noblock(ctx->queue, 10, 0);
897
+ }
898
+
899
+ // Process packet based on its message type
900
+ switch (req.op) {
901
+ case HTP_OP_MUL_MAT:
902
+ if (n_bufs != 3) {
903
+ FARF(ERROR, "Bad matmul-req buffer list");
904
+ continue;
905
+ }
906
+ proc_matmul_req(ctx, &req, bufs, n_bufs);
907
+ break;
908
+
909
+ case HTP_OP_MUL_MAT_ID:
910
+ if (n_bufs != 4) {
911
+ FARF(ERROR, "Bad matmul-id-req buffer list");
912
+ continue;
913
+ }
914
+ proc_matmul_id_req(ctx, &req, bufs, n_bufs);
915
+ break;
916
+
917
+ case HTP_OP_MUL:
918
+ case HTP_OP_ADD:
919
+ case HTP_OP_SUB:
920
+ if (n_bufs != 3) {
921
+ FARF(ERROR, "Bad binary-req buffer list");
922
+ continue;
923
+ }
924
+ proc_binary_req(ctx, &req, bufs);
925
+ break;
926
+
927
+ case HTP_OP_RMS_NORM:
928
+ case HTP_OP_SCALE:
929
+ if (n_bufs != 2) {
930
+ FARF(ERROR, "Bad unary-req buffer list");
931
+ continue;
932
+ }
933
+
934
+ proc_unary_req(ctx, &req, bufs);
935
+ break;
936
+
937
+ case HTP_OP_UNARY_SILU:
938
+ case HTP_OP_UNARY_GELU:
939
+ if (n_bufs != 2) {
940
+ FARF(ERROR, "Bad act-req buffer list");
941
+ continue;
942
+ }
943
+ proc_activations_req(ctx, &req, bufs, n_bufs);
944
+ break;
945
+
946
+ case HTP_OP_GLU_SWIGLU:
947
+ case HTP_OP_GLU_SWIGLU_OAI:
948
+ case HTP_OP_SOFTMAX:
949
+ if ((n_bufs != 2) && (n_bufs != 3)) {
950
+ FARF(ERROR, "Bad act-req buffer list");
951
+ continue;
952
+ }
953
+ proc_activations_req(ctx, &req, bufs, n_bufs);
954
+ break;
955
+
956
+ case HTP_OP_ADD_ID:
957
+ if (n_bufs != 4) {
958
+ FARF(ERROR, "Bad add-id-req buffer list");
959
+ continue;
960
+ }
961
+ proc_add_id_req(ctx, &req, bufs);
962
+ break;
963
+
964
+ case HTP_OP_ROPE:
965
+ if ((n_bufs != 3) && (n_bufs != 4)) {
966
+ FARF(ERROR, "Bad rope-req buffer list");
967
+ continue;
968
+ }
969
+ proc_rope_req(ctx, &req, bufs, n_bufs);
970
+ break;
971
+
972
+ case HTP_OP_FLASH_ATTN_EXT:
973
+ if (!(n_bufs >= 4 && n_bufs <= 6)) {
974
+ FARF(ERROR, "Bad flash-attn-ext-req buffer list");
975
+ continue;
976
+ }
977
+ proc_flash_attn_ext_req(ctx, &req, bufs, n_bufs);
978
+ break;
979
+
980
+ case HTP_OP_SET_ROWS:
981
+ if (n_bufs != 3) {
982
+ FARF(ERROR, "Bad set-rows-req buffer list");
983
+ continue;
984
+ }
985
+ proc_set_rows_req(ctx, &req, bufs);
986
+ break;
987
+
988
+ case HTP_OP_GET_ROWS:
989
+ if (n_bufs != 3) {
990
+ FARF(ERROR, "Bad get-rows-req buffer list");
991
+ continue;
992
+ }
993
+ proc_get_rows_req(ctx, &req, bufs);
994
+ break;
995
+
996
+ default:
997
+ FARF(ERROR, "Unknown Op %u", req.op);
998
+ break;
999
+ }
1000
+ }
1001
+ }