whispercpp 1.3.6 → 1.3.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (828):
  1. checksums.yaml +4 -4
  2. data/.document +3 -0
  3. data/.rdoc_options +2 -0
  4. data/README.md +38 -5
  5. data/Rakefile +18 -3
  6. data/ext/dependencies.rb +10 -4
  7. data/ext/dependencies_for_windows.rb +17 -0
  8. data/ext/extconf.rb +20 -8
  9. data/ext/options.rb +54 -14
  10. data/ext/options_for_windows.rb +51 -0
  11. data/ext/ruby_whisper.c +36 -42
  12. data/ext/ruby_whisper.h +135 -0
  13. data/ext/ruby_whisper_context.c +107 -28
  14. data/ext/ruby_whisper_log_queue.c +180 -0
  15. data/ext/ruby_whisper_log_settable.h +47 -0
  16. data/ext/ruby_whisper_parakeet.c +49 -0
  17. data/ext/ruby_whisper_parakeet_context.c +304 -0
  18. data/ext/ruby_whisper_parakeet_context_params.c +117 -0
  19. data/ext/ruby_whisper_parakeet_model.c +84 -0
  20. data/ext/ruby_whisper_parakeet_params.c +548 -0
  21. data/ext/ruby_whisper_parakeet_segment.c +157 -0
  22. data/ext/ruby_whisper_parakeet_token.c +188 -0
  23. data/ext/ruby_whisper_parakeet_transcribe.cpp +58 -0
  24. data/ext/ruby_whisper_params.c +256 -65
  25. data/ext/ruby_whisper_segment.c +6 -6
  26. data/ext/ruby_whisper_transcribe.cpp +42 -15
  27. data/ext/sources/CMakeLists.txt +41 -3
  28. data/ext/sources/CMakePresets.json +95 -0
  29. data/ext/sources/cmake/parakeet-config.cmake.in +30 -0
  30. data/ext/sources/cmake/parakeet.pc.in +10 -0
  31. data/ext/sources/cmake/whisper.pc.in +1 -1
  32. data/ext/sources/examples/CMakeLists.txt +4 -2
  33. data/ext/sources/examples/bench/bench.cpp +1 -1
  34. data/ext/sources/examples/cli/cli.cpp +43 -9
  35. data/ext/sources/examples/common-ggml.cpp +2 -0
  36. data/ext/sources/examples/common-whisper.cpp +139 -67
  37. data/ext/sources/examples/common-whisper.h +11 -0
  38. data/ext/sources/examples/ffmpeg-transcode.cpp +211 -341
  39. data/ext/sources/examples/parakeet-cli/CMakeLists.txt +8 -0
  40. data/ext/sources/examples/parakeet-cli/parakeet-cli.cpp +243 -0
  41. data/ext/sources/examples/parakeet-quantize/CMakeLists.txt +7 -0
  42. data/ext/sources/examples/parakeet-quantize/parakeet-quantize.cpp +230 -0
  43. data/ext/sources/examples/server/server.cpp +199 -163
  44. data/ext/sources/ggml/CMakeLists.txt +21 -13
  45. data/ext/sources/ggml/cmake/FindNCCL.cmake +36 -0
  46. data/ext/sources/ggml/cmake/ggml-config.cmake.in +12 -2
  47. data/ext/sources/ggml/include/ggml-alloc.h +1 -0
  48. data/ext/sources/ggml/include/ggml-backend.h +72 -10
  49. data/ext/sources/ggml/include/ggml-cuda.h +3 -0
  50. data/ext/sources/ggml/include/ggml-rpc.h +3 -3
  51. data/ext/sources/ggml/include/ggml.h +101 -9
  52. data/ext/sources/ggml/include/gguf.h +10 -2
  53. data/ext/sources/ggml/src/CMakeLists.txt +22 -5
  54. data/ext/sources/ggml/src/ggml-alloc.c +5 -1
  55. data/ext/sources/ggml/src/ggml-backend-impl.h +22 -2
  56. data/ext/sources/ggml/src/ggml-backend-meta.cpp +2263 -0
  57. data/ext/sources/ggml/src/ggml-backend-reg.cpp +12 -0
  58. data/ext/sources/ggml/src/ggml-backend.cpp +110 -9
  59. data/ext/sources/ggml/src/ggml-blas/ggml-blas.cpp +4 -0
  60. data/ext/sources/ggml/src/ggml-cann/aclnn_ops.cpp +672 -257
  61. data/ext/sources/ggml/src/ggml-cann/aclnn_ops.h +71 -0
  62. data/ext/sources/ggml/src/ggml-cann/common.h +20 -10
  63. data/ext/sources/ggml/src/ggml-cann/ggml-cann.cpp +211 -30
  64. data/ext/sources/ggml/src/ggml-common.h +11 -0
  65. data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +58 -29
  66. data/ext/sources/ggml/src/ggml-cpu/amx/amx.cpp +2 -0
  67. data/ext/sources/ggml/src/ggml-cpu/amx/mmq.cpp +16 -16
  68. data/ext/sources/ggml/src/ggml-cpu/arch/arm/quants.c +116 -7
  69. data/ext/sources/ggml/src/ggml-cpu/arch/arm/repack.cpp +65 -0
  70. data/ext/sources/ggml/src/ggml-cpu/arch/loongarch/quants.c +151 -1
  71. data/ext/sources/ggml/src/ggml-cpu/arch/powerpc/quants.c +0 -1
  72. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/quants.c +4279 -1292
  73. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/repack.cpp +5 -35
  74. data/ext/sources/ggml/src/ggml-cpu/arch/s390/quants.c +0 -1
  75. data/ext/sources/ggml/src/ggml-cpu/arch/wasm/quants.c +72 -1
  76. data/ext/sources/ggml/src/ggml-cpu/arch/x86/quants.c +177 -27
  77. data/ext/sources/ggml/src/ggml-cpu/arch/x86/repack.cpp +1 -1
  78. data/ext/sources/ggml/src/ggml-cpu/arch-fallback.h +5 -0
  79. data/ext/sources/ggml/src/ggml-cpu/cmake/FindSMTIME.cmake +32 -0
  80. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-impl.h +10 -0
  81. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +95 -5
  82. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.cpp +2 -0
  83. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +146 -134
  84. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +88 -70
  85. data/ext/sources/ggml/src/ggml-cpu/ops.cpp +372 -73
  86. data/ext/sources/ggml/src/ggml-cpu/ops.h +3 -0
  87. data/ext/sources/ggml/src/ggml-cpu/quants.c +55 -0
  88. data/ext/sources/ggml/src/ggml-cpu/quants.h +3 -0
  89. data/ext/sources/ggml/src/ggml-cpu/repack.cpp +3 -0
  90. data/ext/sources/ggml/src/ggml-cpu/simd-gemm.h +90 -0
  91. data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +3 -16
  92. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.cpp +1402 -687
  93. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.h +8 -0
  94. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp +597 -2766
  95. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime2_kernels.cpp +5768 -0
  96. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime_env.cpp +320 -0
  97. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime_env.h +55 -0
  98. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime_kernels.h +182 -19
  99. data/ext/sources/ggml/src/ggml-cpu/spacemit/repack.cpp +1795 -0
  100. data/ext/sources/ggml/src/ggml-cpu/spacemit/repack.h +14 -0
  101. data/ext/sources/ggml/src/ggml-cpu/spacemit/rvv_kernels.cpp +3178 -0
  102. data/ext/sources/ggml/src/ggml-cpu/spacemit/rvv_kernels.h +95 -0
  103. data/ext/sources/ggml/src/ggml-cpu/spacemit/spine_barrier.h +34 -0
  104. data/ext/sources/ggml/src/ggml-cpu/spacemit/spine_mem_pool.cpp +760 -0
  105. data/ext/sources/ggml/src/ggml-cpu/spacemit/spine_mem_pool.h +32 -0
  106. data/ext/sources/ggml/src/ggml-cpu/spacemit/spine_tcm.h +409 -0
  107. data/ext/sources/ggml/src/ggml-cpu/vec.cpp +37 -53
  108. data/ext/sources/ggml/src/ggml-cpu/vec.h +225 -240
  109. data/ext/sources/ggml/src/ggml-cuda/CMakeLists.txt +17 -7
  110. data/ext/sources/ggml/src/ggml-cuda/allreduce.cu +971 -0
  111. data/ext/sources/ggml/src/ggml-cuda/allreduce.cuh +29 -0
  112. data/ext/sources/ggml/src/ggml-cuda/argsort.cu +62 -26
  113. data/ext/sources/ggml/src/ggml-cuda/binbcast.cu +44 -18
  114. data/ext/sources/ggml/src/ggml-cuda/binbcast.cuh +1 -0
  115. data/ext/sources/ggml/src/ggml-cuda/common.cuh +242 -28
  116. data/ext/sources/ggml/src/ggml-cuda/concat.cu +120 -114
  117. data/ext/sources/ggml/src/ggml-cuda/conv2d-transpose.cu +45 -21
  118. data/ext/sources/ggml/src/ggml-cuda/conv2d-transpose.cuh +1 -0
  119. data/ext/sources/ggml/src/ggml-cuda/convert.cu +53 -0
  120. data/ext/sources/ggml/src/ggml-cuda/convert.cuh +10 -0
  121. data/ext/sources/ggml/src/ggml-cuda/cpy.cu +14 -6
  122. data/ext/sources/ggml/src/ggml-cuda/dequantize.cuh +22 -0
  123. data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +278 -44
  124. data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +331 -130
  125. data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cu +12 -0
  126. data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cuh +126 -27
  127. data/ext/sources/ggml/src/ggml-cuda/fattn-vec.cuh +40 -15
  128. data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +18 -9
  129. data/ext/sources/ggml/src/ggml-cuda/fattn.cu +152 -49
  130. data/ext/sources/ggml/src/ggml-cuda/fattn.cuh +2 -0
  131. data/ext/sources/ggml/src/ggml-cuda/fwht.cu +101 -0
  132. data/ext/sources/ggml/src/ggml-cuda/fwht.cuh +4 -0
  133. data/ext/sources/ggml/src/ggml-cuda/gated_delta_net.cu +84 -35
  134. data/ext/sources/ggml/src/ggml-cuda/getrows.cu +34 -12
  135. data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +1069 -609
  136. data/ext/sources/ggml/src/ggml-cuda/im2col.cu +32 -29
  137. data/ext/sources/ggml/src/ggml-cuda/mean.cu +4 -2
  138. data/ext/sources/ggml/src/ggml-cuda/mma.cuh +242 -195
  139. data/ext/sources/ggml/src/ggml-cuda/mmf.cuh +3 -3
  140. data/ext/sources/ggml/src/ggml-cuda/mmq.cu +18 -12
  141. data/ext/sources/ggml/src/ggml-cuda/mmq.cuh +502 -423
  142. data/ext/sources/ggml/src/ggml-cuda/mmvf.cu +19 -12
  143. data/ext/sources/ggml/src/ggml-cuda/mmvq.cu +485 -57
  144. data/ext/sources/ggml/src/ggml-cuda/mmvq.cuh +6 -1
  145. data/ext/sources/ggml/src/ggml-cuda/norm.cu +36 -10
  146. data/ext/sources/ggml/src/ggml-cuda/out-prod.cu +23 -7
  147. data/ext/sources/ggml/src/ggml-cuda/quantize.cu +133 -26
  148. data/ext/sources/ggml/src/ggml-cuda/quantize.cuh +1 -1
  149. data/ext/sources/ggml/src/ggml-cuda/reduce_rows.cuh +5 -1
  150. data/ext/sources/ggml/src/ggml-cuda/rope.cu +11 -4
  151. data/ext/sources/ggml/src/ggml-cuda/scale.cu +4 -1
  152. data/ext/sources/ggml/src/ggml-cuda/set-rows.cu +14 -6
  153. data/ext/sources/ggml/src/ggml-cuda/snake.cu +72 -0
  154. data/ext/sources/ggml/src/ggml-cuda/snake.cuh +8 -0
  155. data/ext/sources/ggml/src/ggml-cuda/softcap.cu +4 -1
  156. data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cu +45 -13
  157. data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cuh +1 -1
  158. data/ext/sources/ggml/src/ggml-cuda/ssm-scan.cu +40 -18
  159. data/ext/sources/ggml/src/ggml-cuda/sumrows.cu +8 -4
  160. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_16.cu +1 -0
  161. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_32.cu +1 -0
  162. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_8.cu +2 -0
  163. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_4.cu +1 -0
  164. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_16.cu +1 -0
  165. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_32.cu +1 -0
  166. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_4.cu +1 -0
  167. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_8.cu +2 -0
  168. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_16.cu +1 -0
  169. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_4.cu +1 -0
  170. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_8.cu +2 -0
  171. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_4.cu +1 -0
  172. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_8.cu +2 -0
  173. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq192-dv128.cu +5 -0
  174. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq320-dv256.cu +5 -0
  175. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq512-dv512.cu +5 -0
  176. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-bf16.cu +7 -0
  177. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-f16.cu +7 -0
  178. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q4_0.cu +7 -0
  179. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q4_1.cu +7 -0
  180. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q5_0.cu +7 -0
  181. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q5_1.cu +7 -0
  182. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q8_0.cu +7 -0
  183. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-bf16.cu +7 -0
  184. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-bf16.cu +7 -0
  185. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-bf16.cu +7 -0
  186. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-bf16.cu +7 -0
  187. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-bf16.cu +7 -0
  188. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-bf16.cu +7 -0
  189. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-nvfp4.cu +5 -0
  190. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q1_0.cu +5 -0
  191. data/ext/sources/ggml/src/ggml-cuda/top-k.cu +5 -4
  192. data/ext/sources/ggml/src/ggml-cuda/topk-moe.cu +26 -23
  193. data/ext/sources/ggml/src/ggml-cuda/unary.cu +31 -2
  194. data/ext/sources/ggml/src/ggml-cuda/unary.cuh +2 -0
  195. data/ext/sources/ggml/src/ggml-cuda/vecdotq.cuh +80 -0
  196. data/ext/sources/ggml/src/ggml-cuda/vendors/cuda.h +7 -2
  197. data/ext/sources/ggml/src/ggml-cuda/vendors/hip.h +22 -4
  198. data/ext/sources/ggml/src/ggml-cuda/vendors/musa.h +3 -0
  199. data/ext/sources/ggml/src/ggml-hexagon/CMakeLists.txt +2 -1
  200. data/ext/sources/ggml/src/ggml-hexagon/ggml-hexagon.cpp +1428 -743
  201. data/ext/sources/ggml/src/ggml-hexagon/htp/CMakeLists.txt +45 -7
  202. data/ext/sources/ggml/src/ggml-hexagon/htp/act-ops.c +53 -84
  203. data/ext/sources/ggml/src/ggml-hexagon/htp/argsort-ops.c +25 -12
  204. data/ext/sources/ggml/src/ggml-hexagon/htp/binary-ops.c +165 -184
  205. data/ext/sources/ggml/src/ggml-hexagon/htp/cmake-toolchain.cmake +5 -5
  206. data/ext/sources/ggml/src/ggml-hexagon/htp/concat-ops.c +277 -0
  207. data/ext/sources/ggml/src/ggml-hexagon/htp/cpy-ops.c +170 -127
  208. data/ext/sources/ggml/src/ggml-hexagon/htp/cumsum-ops.c +270 -0
  209. data/ext/sources/ggml/src/ggml-hexagon/htp/diag-ops.c +216 -0
  210. data/ext/sources/ggml/src/ggml-hexagon/htp/fill-ops.c +123 -0
  211. data/ext/sources/ggml/src/ggml-hexagon/htp/flash-attn-ops.c +125 -97
  212. data/ext/sources/ggml/src/ggml-hexagon/htp/gated-delta-net-ops.c +1148 -0
  213. data/ext/sources/ggml/src/ggml-hexagon/htp/get-rows-ops.c +148 -42
  214. data/ext/sources/ggml/src/ggml-hexagon/htp/hex-dma.c +2 -2
  215. data/ext/sources/ggml/src/ggml-hexagon/htp/hex-dma.h +252 -62
  216. data/ext/sources/ggml/src/ggml-hexagon/htp/hex-dump.h +9 -0
  217. data/ext/sources/ggml/src/ggml-hexagon/htp/hex-utils.h +87 -1
  218. data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-flash-attn-ops.c +1878 -0
  219. data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c +2066 -0
  220. data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-ops.c +6 -0
  221. data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-ops.h +88 -0
  222. data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-profile.h +34 -0
  223. data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-queue.c +158 -0
  224. data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-queue.h +134 -0
  225. data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-utils.h +200 -0
  226. data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ctx.h +96 -13
  227. data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ops.h +182 -57
  228. data/ext/sources/ggml/src/ggml-hexagon/htp/htp_iface.idl +9 -3
  229. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-base.h +71 -3
  230. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-copy.h +27 -10
  231. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-div.h +63 -23
  232. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-exp.h +9 -8
  233. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-flash-attn.h +47 -0
  234. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-log.h +65 -0
  235. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-pow.h +42 -0
  236. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-repl.h +74 -0
  237. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sigmoid.h +1 -0
  238. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sin-cos.h +90 -0
  239. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-utils.h +5 -8
  240. data/ext/sources/ggml/src/ggml-hexagon/htp/main.c +529 -815
  241. data/ext/sources/ggml/src/ggml-hexagon/htp/matmul-ops.c +2522 -234
  242. data/ext/sources/ggml/src/ggml-hexagon/htp/pad-ops.c +547 -0
  243. data/ext/sources/ggml/src/ggml-hexagon/htp/repeat-ops.c +148 -0
  244. data/ext/sources/ggml/src/ggml-hexagon/htp/rope-ops.c +291 -95
  245. data/ext/sources/ggml/src/ggml-hexagon/htp/set-rows-ops.c +59 -37
  246. data/ext/sources/ggml/src/ggml-hexagon/htp/softmax-ops.c +121 -133
  247. data/ext/sources/ggml/src/ggml-hexagon/htp/solve-tri-ops.c +267 -0
  248. data/ext/sources/ggml/src/ggml-hexagon/htp/ssm-conv.c +244 -151
  249. data/ext/sources/ggml/src/ggml-hexagon/htp/sum-rows-ops.c +6 -6
  250. data/ext/sources/ggml/src/ggml-hexagon/htp/unary-ops.c +719 -45
  251. data/ext/sources/ggml/src/ggml-hexagon/htp/vtcm-utils.h +16 -0
  252. data/ext/sources/ggml/src/ggml-hexagon/htp-opnode.h +272 -0
  253. data/ext/sources/ggml/src/ggml-hexagon/libggml-htp.inf +3 -1
  254. data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +22 -9
  255. data/ext/sources/ggml/src/ggml-impl.h +6 -1
  256. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.cpp +138 -13
  257. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.h +32 -1
  258. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.m +164 -28
  259. data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +80 -0
  260. data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.cpp +190 -19
  261. data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.h +2 -0
  262. data/ext/sources/ggml/src/ggml-metal/ggml-metal.cpp +39 -26
  263. data/ext/sources/ggml/src/ggml-metal/ggml-metal.metal +823 -322
  264. data/ext/sources/ggml/src/ggml-musa/CMakeLists.txt +5 -6
  265. data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +54 -5
  266. data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +12248 -5907
  267. data/ext/sources/ggml/src/ggml-opencl/kernels/concat.cl +67 -0
  268. data/ext/sources/ggml/src/ggml-opencl/kernels/cpy.cl +59 -0
  269. data/ext/sources/ggml/src/ggml-opencl/kernels/cvt.cl +1819 -112
  270. data/ext/sources/ggml/src/ggml-opencl/kernels/gated_delta_net.cl +249 -0
  271. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_mxfp4_f32_ns.cl +306 -0
  272. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q4_0_f32_ns.cl +256 -0
  273. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q4_1_f32_ns.cl +258 -0
  274. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q4_k_f32_ns.cl +283 -0
  275. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q5_0_f32_ns.cl +260 -0
  276. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q5_1_f32_ns.cl +262 -0
  277. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q5_k_f32_ns.cl +288 -0
  278. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q6_k_f32_ns.cl +267 -0
  279. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_iq4_nl_f32.cl +150 -0
  280. data/ext/sources/ggml/src/ggml-opencl/kernels/{mul_mat_Ab_Bi_8x4.cl → gemm_noshuffle_q4_0_f32.cl} +1 -1
  281. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q4_k_f32.cl +172 -0
  282. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q5_0_f32.cl +131 -0
  283. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q5_1_f32.cl +134 -0
  284. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q5_k_f32.cl +176 -0
  285. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q6_k_f32.cl +140 -0
  286. data/ext/sources/ggml/src/ggml-opencl/kernels/{mul_mm_q8_0_f32_8x4.cl → gemm_noshuffle_q8_0_f32.cl} +1 -1
  287. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_xmem_f16_f32_os8.cl +233 -0
  288. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_mxfp4_f32_ns.cl +165 -0
  289. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q4_0_f32_ns.cl +120 -0
  290. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q4_1_f32_ns.cl +123 -0
  291. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q4_k_f32_ns.cl +155 -0
  292. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q5_0_f32_ns.cl +123 -0
  293. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q5_1_f32_ns.cl +125 -0
  294. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q5_k_f32_ns.cl +160 -0
  295. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q6_k_f32_ns.cl +141 -0
  296. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_iq4_nl_f32.cl +302 -0
  297. data/ext/sources/ggml/src/ggml-opencl/kernels/{gemv_noshuffle_general.cl → gemv_noshuffle_q4_0_f32.cl} +5 -5
  298. data/ext/sources/ggml/src/ggml-opencl/kernels/{gemv_noshuffle.cl → gemv_noshuffle_q4_0_f32_spec.cl} +5 -5
  299. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q4_k_f32.cl +318 -0
  300. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q5_0_f32.cl +291 -0
  301. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q5_1_f32.cl +294 -0
  302. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q5_k_f32.cl +326 -0
  303. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q6_k_f32.cl +293 -0
  304. data/ext/sources/ggml/src/ggml-opencl/kernels/get_rows.cl +15 -9
  305. data/ext/sources/ggml/src/ggml-opencl/kernels/moe_reorder_b.cl +30 -0
  306. data/ext/sources/ggml/src/ggml-opencl/kernels/moe_sort_by_expert.cl +82 -0
  307. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_iq4_nl_f32_l4_lm.cl +171 -0
  308. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q4_k_f32_l4_lm.cl +179 -0
  309. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q5_0_f32_l4_lm.cl +173 -0
  310. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q5_1_f32_l4_lm.cl +175 -0
  311. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q5_k_f32_l4_lm.cl +192 -0
  312. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_iq4_nl_f32.cl +164 -0
  313. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_iq4_nl_f32_flat.cl +202 -0
  314. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_k_f32_flat.cl +196 -0
  315. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_0_f32.cl +241 -0
  316. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_0_f32_flat.cl +243 -0
  317. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_1_f32.cl +243 -0
  318. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_1_f32_flat.cl +247 -0
  319. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_k_f32.cl +187 -0
  320. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_k_f32_flat.cl +203 -0
  321. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32_flat.cl +48 -64
  322. data/ext/sources/ggml/src/ggml-openvino/ggml-decoder.cpp +15 -5
  323. data/ext/sources/ggml/src/ggml-openvino/ggml-openvino-extra.cpp +18 -11
  324. data/ext/sources/ggml/src/ggml-openvino/ggml-openvino.cpp +35 -13
  325. data/ext/sources/ggml/src/ggml-openvino/ggml-quants.cpp +264 -192
  326. data/ext/sources/ggml/src/ggml-openvino/openvino/op/rope.cpp +33 -7
  327. data/ext/sources/ggml/src/ggml-openvino/openvino/op/unary_gelu.cpp +25 -0
  328. data/ext/sources/ggml/src/ggml-openvino/openvino/op_table.cpp +1 -0
  329. data/ext/sources/ggml/src/ggml-openvino/openvino/op_table.h +1 -0
  330. data/ext/sources/ggml/src/ggml-openvino/openvino/rt_info/weightless_caching_attributes.hpp +41 -0
  331. data/ext/sources/ggml/src/ggml-openvino/openvino/translate_session.cpp +27 -3
  332. data/ext/sources/ggml/src/ggml-openvino/openvino/utils.cpp +67 -36
  333. data/ext/sources/ggml/src/ggml-openvino/openvino/utils.h +1 -0
  334. data/ext/sources/ggml/src/ggml-openvino/utils.cpp +101 -44
  335. data/ext/sources/ggml/src/ggml-openvino/utils.h +23 -3
  336. data/ext/sources/ggml/src/ggml-opt.cpp +1 -0
  337. data/ext/sources/ggml/src/ggml-quants.c +289 -114
  338. data/ext/sources/ggml/src/ggml-quants.h +3 -0
  339. data/ext/sources/ggml/src/ggml-rpc/CMakeLists.txt +24 -0
  340. data/ext/sources/ggml/src/ggml-rpc/ggml-rpc.cpp +167 -311
  341. data/ext/sources/ggml/src/ggml-rpc/transport.cpp +683 -0
  342. data/ext/sources/ggml/src/ggml-rpc/transport.h +34 -0
  343. data/ext/sources/ggml/src/ggml-sycl/CMakeLists.txt +50 -4
  344. data/ext/sources/ggml/src/ggml-sycl/add-id.cpp +1 -1
  345. data/ext/sources/ggml/src/ggml-sycl/backend.hpp +3 -1
  346. data/ext/sources/ggml/src/ggml-sycl/common.cpp +74 -2
  347. data/ext/sources/ggml/src/ggml-sycl/common.hpp +41 -1
  348. data/ext/sources/ggml/src/ggml-sycl/convert.cpp +115 -13
  349. data/ext/sources/ggml/src/ggml-sycl/convert.hpp +9 -0
  350. data/ext/sources/ggml/src/ggml-sycl/cumsum.cpp +148 -0
  351. data/ext/sources/ggml/src/ggml-sycl/cumsum.hpp +5 -0
  352. data/ext/sources/ggml/src/ggml-sycl/dequantize.hpp +663 -0
  353. data/ext/sources/ggml/src/ggml-sycl/diag.cpp +67 -0
  354. data/ext/sources/ggml/src/ggml-sycl/diag.hpp +5 -0
  355. data/ext/sources/ggml/src/ggml-sycl/dmmv.cpp +586 -6
  356. data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +1 -90
  357. data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +0 -2
  358. data/ext/sources/ggml/src/ggml-sycl/fattn-buffers.cpp +56 -0
  359. data/ext/sources/ggml/src/ggml-sycl/fattn-buffers.hpp +63 -0
  360. data/ext/sources/ggml/src/ggml-sycl/fattn-common.hpp +7 -5
  361. data/ext/sources/ggml/src/ggml-sycl/fattn-tile.cpp +4 -0
  362. data/ext/sources/ggml/src/ggml-sycl/fattn-tile.hpp +76 -168
  363. data/ext/sources/ggml/src/ggml-sycl/fattn-vec.hpp +7 -0
  364. data/ext/sources/ggml/src/ggml-sycl/fattn.cpp +3 -1
  365. data/ext/sources/ggml/src/ggml-sycl/fill.cpp +55 -0
  366. data/ext/sources/ggml/src/ggml-sycl/fill.hpp +5 -0
  367. data/ext/sources/ggml/src/ggml-sycl/gated_delta_net.cpp +69 -31
  368. data/ext/sources/ggml/src/ggml-sycl/gated_delta_net.hpp +1 -0
  369. data/ext/sources/ggml/src/ggml-sycl/gemm.hpp +3 -0
  370. data/ext/sources/ggml/src/ggml-sycl/getrows.cpp +79 -3
  371. data/ext/sources/ggml/src/ggml-sycl/ggml-sycl.cpp +823 -190
  372. data/ext/sources/ggml/src/ggml-sycl/im2col.cpp +353 -89
  373. data/ext/sources/ggml/src/ggml-sycl/im2col.hpp +5 -3
  374. data/ext/sources/ggml/src/ggml-sycl/mmvq.cpp +1344 -26
  375. data/ext/sources/ggml/src/ggml-sycl/mmvq.hpp +16 -0
  376. data/ext/sources/ggml/src/ggml-sycl/pad.cpp +27 -27
  377. data/ext/sources/ggml/src/ggml-sycl/quants.hpp +71 -0
  378. data/ext/sources/ggml/src/ggml-sycl/set_rows.cpp +7 -1
  379. data/ext/sources/ggml/src/ggml-sycl/solve_tri.cpp +172 -0
  380. data/ext/sources/ggml/src/ggml-sycl/solve_tri.hpp +8 -0
  381. data/ext/sources/ggml/src/ggml-sycl/ssm_conv.cpp +6 -1
  382. data/ext/sources/ggml/src/ggml-sycl/ssm_scan.cpp +156 -0
  383. data/ext/sources/ggml/src/ggml-sycl/ssm_scan.hpp +5 -0
  384. data/ext/sources/ggml/src/ggml-sycl/sycl_hw.cpp +62 -10
  385. data/ext/sources/ggml/src/ggml-sycl/sycl_hw.hpp +18 -6
  386. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq512-dv512.cpp +6 -0
  387. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-f16.cpp +1 -0
  388. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_0.cpp +1 -0
  389. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_1.cpp +1 -0
  390. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_0.cpp +1 -0
  391. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_1.cpp +1 -0
  392. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q8_0.cpp +1 -0
  393. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-f16.cpp +1 -0
  394. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_0.cpp +1 -0
  395. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_1.cpp +1 -0
  396. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_0.cpp +1 -0
  397. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_1.cpp +1 -0
  398. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q8_0.cpp +1 -0
  399. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-f16.cpp +1 -0
  400. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_0.cpp +1 -0
  401. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_1.cpp +1 -0
  402. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_0.cpp +1 -0
  403. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_1.cpp +1 -0
  404. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q8_0.cpp +1 -0
  405. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-f16.cpp +1 -0
  406. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_0.cpp +1 -0
  407. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_1.cpp +1 -0
  408. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_0.cpp +1 -0
  409. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_1.cpp +1 -0
  410. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q8_0.cpp +1 -0
  411. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-f16.cpp +1 -0
  412. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_0.cpp +1 -0
  413. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_1.cpp +1 -0
  414. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_0.cpp +1 -0
  415. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_1.cpp +1 -0
  416. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q8_0.cpp +1 -0
  417. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-f16.cpp +1 -0
  418. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_0.cpp +1 -0
  419. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_1.cpp +1 -0
  420. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_0.cpp +1 -0
  421. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_1.cpp +1 -0
  422. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q8_0.cpp +1 -0
  423. data/ext/sources/ggml/src/ggml-sycl/type.hpp +112 -0
  424. data/ext/sources/ggml/src/ggml-sycl/upscale.cpp +410 -0
  425. data/ext/sources/ggml/src/ggml-sycl/upscale.hpp +9 -0
  426. data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +215 -53
  427. data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-buffer.cpp +4 -0
  428. data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-device.cpp +2 -0
  429. data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend.cpp +2 -0
  430. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-shm.cpp +1 -0
  431. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu.cpp +1 -0
  432. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu.h +0 -2
  433. data/ext/sources/ggml/src/ggml-vulkan/CMakeLists.txt +11 -0
  434. data/ext/sources/ggml/src/ggml-vulkan/ggml-vulkan.cpp +2060 -535
  435. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -0
  436. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/contig_copy.comp +6 -2
  437. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +146 -13
  438. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy.comp +3 -1
  439. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +1 -1
  440. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +25 -1
  441. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.glsl +88 -0
  442. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.glsl +643 -1
  443. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_nvfp4.comp +32 -0
  444. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q1_0.comp +29 -0
  445. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/diag.comp +0 -1
  446. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dot_product_funcs.glsl +27 -0
  447. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/exp.comp +0 -1
  448. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/feature-tests/coopmat2_decode_vector.comp +7 -0
  449. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +197 -48
  450. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.glsl +60 -59
  451. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +115 -113
  452. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +122 -31
  453. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_dequant.glsl +131 -0
  454. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_mmq_funcs.glsl +203 -0
  455. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/fwht.comp +115 -0
  456. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gated_delta_net.comp +125 -64
  457. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/generic_binary_head.glsl +0 -1
  458. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.glsl +10 -1
  459. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_main.glsl +16 -6
  460. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +76 -54
  461. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col_3d.comp +0 -1
  462. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/log.comp +0 -1
  463. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp +122 -27
  464. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iface.glsl +6 -6
  465. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp +1 -1
  466. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp +1 -1
  467. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp +1 -1
  468. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp +1 -0
  469. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq_funcs.glsl +88 -55
  470. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +11 -17
  471. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +43 -10
  472. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.glsl +159 -125
  473. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.glsl +8 -8
  474. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_shmem_types.glsl +24 -9
  475. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp +0 -1
  476. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl +5 -2
  477. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.glsl +0 -1
  478. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl +3 -2
  479. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/snake.comp +49 -0
  480. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/ssm_conv.comp +11 -1
  481. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/tri.comp +0 -1
  482. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/types.glsl +79 -2
  483. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +171 -147
  484. data/ext/sources/ggml/src/ggml-webgpu/CMakeLists.txt +5 -2
  485. data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp +2202 -283
  486. data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu.cpp +2610 -1403
  487. data/ext/sources/ggml/src/ggml-webgpu/pre_wgsl.hpp +37 -7
  488. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/add_id.wgsl +64 -0
  489. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary.wgsl +8 -7
  490. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +76 -95
  491. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/concat.wgsl +19 -1
  492. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/conv2d.wgsl +165 -0
  493. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{cpy.tmpl.wgsl → cpy.wgsl} +25 -50
  494. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn.wgsl +107 -184
  495. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_quant_staging.tmpl +124 -0
  496. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_tile.wgsl +397 -0
  497. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_vec_blk.wgsl +101 -0
  498. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_vec_reduce.wgsl +84 -0
  499. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_vec_split.wgsl +619 -0
  500. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/gated_delta_net.wgsl +149 -0
  501. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/get_rows.wgsl +183 -78
  502. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/glu.wgsl +155 -0
  503. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/im2col.wgsl +101 -0
  504. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl +655 -495
  505. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_id.wgsl +195 -0
  506. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_id_gather.wgsl +52 -0
  507. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_id_vec.wgsl +154 -0
  508. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.wgsl +8 -6
  509. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_subgroup_matrix.wgsl +5 -1
  510. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.wgsl +80 -409
  511. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec_acc.tmpl +1432 -0
  512. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec_q_acc.tmpl +303 -0
  513. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/quant_inner_loops.tmpl +21 -0
  514. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/quantize_q8.wgsl +173 -0
  515. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm_mul.wgsl +152 -0
  516. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{rope.tmpl.wgsl → rope.wgsl} +71 -142
  517. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/row_norm.wgsl +153 -0
  518. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/scale.wgsl +6 -4
  519. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set.wgsl +109 -0
  520. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +2 -3
  521. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows_quant.wgsl +224 -0
  522. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{soft_max.tmpl.wgsl → soft_max.wgsl} +106 -206
  523. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/solve_tri.wgsl +121 -0
  524. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/ssm_conv.wgsl +65 -0
  525. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/ssm_scan.wgsl +193 -0
  526. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/unary.wgsl +68 -48
  527. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/upscale.wgsl +240 -0
  528. data/ext/sources/ggml/src/ggml-zdnn/ggml-zdnn.cpp +18 -14
  529. data/ext/sources/ggml/src/ggml-zendnn/CMakeLists.txt +1 -1
  530. data/ext/sources/ggml/src/ggml-zendnn/ggml-zendnn.cpp +244 -10
  531. data/ext/sources/ggml/src/ggml.c +110 -28
  532. data/ext/sources/ggml/src/gguf.cpp +173 -28
  533. data/ext/sources/include/parakeet.h +342 -0
  534. data/ext/sources/include/whisper.h +10 -0
  535. data/ext/sources/media/matmul.png +0 -0
  536. data/ext/sources/src/CMakeLists.txt +23 -0
  537. data/ext/sources/src/parakeet-arch.h +188 -0
  538. data/ext/sources/src/parakeet.cpp +3838 -0
  539. data/ext/sources/src/whisper.cpp +56 -12
  540. data/extsources.rb +26 -10
  541. data/lib/whisper/log_settable.rb +36 -0
  542. data/lib/whisper/model/uri.rb +13 -1
  543. data/lib/whisper/output.rb +74 -0
  544. data/sig/whisper.rbs +411 -62
  545. data/test/helper.rb +2 -0
  546. data/test/jfk_reader/jfk_reader.c +50 -7
  547. data/test/test_callback.rb +1 -0
  548. data/test/test_package.rb +6 -5
  549. data/test/test_parakeet.rb +28 -0
  550. data/test/test_parakeet_callback.rb +107 -0
  551. data/test/test_parakeet_context.rb +116 -0
  552. data/test/test_parakeet_context_params.rb +24 -0
  553. data/test/test_parakeet_model.rb +21 -0
  554. data/test/test_parakeet_params.rb +78 -0
  555. data/test/test_parakeet_segment.rb +42 -0
  556. data/test/test_parakeet_token.rb +73 -0
  557. data/test/test_params.rb +2 -0
  558. data/test/test_vad_segment.rb +1 -1
  559. data/test/test_whisper.rb +24 -6
  560. data/whispercpp.gemspec +2 -2
  561. metadata +215 -281
  562. data/ext/sources/bindings/javascript/CMakeLists.txt +0 -41
  563. data/ext/sources/bindings/javascript/emscripten.cpp +0 -93
  564. data/ext/sources/bindings/javascript/libwhisper.worker.js +0 -1
  565. data/ext/sources/bindings/javascript/package.json +0 -26
  566. data/ext/sources/bindings/javascript/whisper.js +0 -19
  567. data/ext/sources/examples/addon.node/CMakeLists.txt +0 -31
  568. data/ext/sources/examples/addon.node/__test__/whisper.spec.js +0 -133
  569. data/ext/sources/examples/addon.node/addon.cpp +0 -557
  570. data/ext/sources/examples/addon.node/index.js +0 -59
  571. data/ext/sources/examples/addon.node/package.json +0 -16
  572. data/ext/sources/examples/addon.node/vad-example.js +0 -132
  573. data/ext/sources/examples/bench.wasm/CMakeLists.txt +0 -49
  574. data/ext/sources/examples/bench.wasm/emscripten.cpp +0 -87
  575. data/ext/sources/examples/bench.wasm/index-tmpl.html +0 -285
  576. data/ext/sources/examples/coi-serviceworker.js +0 -146
  577. data/ext/sources/examples/command/CMakeLists.txt +0 -10
  578. data/ext/sources/examples/command/command.cpp +0 -802
  579. data/ext/sources/examples/command/commands.txt +0 -9
  580. data/ext/sources/examples/command.wasm/CMakeLists.txt +0 -50
  581. data/ext/sources/examples/command.wasm/emscripten.cpp +0 -327
  582. data/ext/sources/examples/command.wasm/index-tmpl.html +0 -415
  583. data/ext/sources/examples/generate-karaoke.sh +0 -57
  584. data/ext/sources/examples/helpers.js +0 -191
  585. data/ext/sources/examples/livestream.sh +0 -112
  586. data/ext/sources/examples/lsp/CMakeLists.txt +0 -10
  587. data/ext/sources/examples/lsp/lsp.cpp +0 -471
  588. data/ext/sources/examples/lsp/whisper.vim +0 -362
  589. data/ext/sources/examples/python/test_whisper_processor.py +0 -7
  590. data/ext/sources/examples/python/whisper_processor.py +0 -54
  591. data/ext/sources/examples/server/bench.js +0 -29
  592. data/ext/sources/examples/server.py +0 -120
  593. data/ext/sources/examples/stream/CMakeLists.txt +0 -10
  594. data/ext/sources/examples/stream/stream.cpp +0 -437
  595. data/ext/sources/examples/stream.wasm/CMakeLists.txt +0 -49
  596. data/ext/sources/examples/stream.wasm/emscripten.cpp +0 -216
  597. data/ext/sources/examples/stream.wasm/index-tmpl.html +0 -491
  598. data/ext/sources/examples/sycl/CMakeLists.txt +0 -9
  599. data/ext/sources/examples/sycl/build.sh +0 -22
  600. data/ext/sources/examples/sycl/ls-sycl-device.cpp +0 -11
  601. data/ext/sources/examples/sycl/run-whisper.sh +0 -17
  602. data/ext/sources/examples/talk-llama/CMakeLists.txt +0 -48
  603. data/ext/sources/examples/talk-llama/eleven-labs.py +0 -80
  604. data/ext/sources/examples/talk-llama/llama-adapter.cpp +0 -488
  605. data/ext/sources/examples/talk-llama/llama-adapter.h +0 -89
  606. data/ext/sources/examples/talk-llama/llama-arch.cpp +0 -2877
  607. data/ext/sources/examples/talk-llama/llama-arch.h +0 -628
  608. data/ext/sources/examples/talk-llama/llama-batch.cpp +0 -919
  609. data/ext/sources/examples/talk-llama/llama-batch.h +0 -173
  610. data/ext/sources/examples/talk-llama/llama-chat.cpp +0 -896
  611. data/ext/sources/examples/talk-llama/llama-chat.h +0 -71
  612. data/ext/sources/examples/talk-llama/llama-context.cpp +0 -3633
  613. data/ext/sources/examples/talk-llama/llama-context.h +0 -359
  614. data/ext/sources/examples/talk-llama/llama-cparams.cpp +0 -5
  615. data/ext/sources/examples/talk-llama/llama-cparams.h +0 -47
  616. data/ext/sources/examples/talk-llama/llama-ext.h +0 -12
  617. data/ext/sources/examples/talk-llama/llama-grammar.cpp +0 -1464
  618. data/ext/sources/examples/talk-llama/llama-grammar.h +0 -194
  619. data/ext/sources/examples/talk-llama/llama-graph.cpp +0 -2735
  620. data/ext/sources/examples/talk-llama/llama-graph.h +0 -1031
  621. data/ext/sources/examples/talk-llama/llama-hparams.cpp +0 -258
  622. data/ext/sources/examples/talk-llama/llama-hparams.h +0 -353
  623. data/ext/sources/examples/talk-llama/llama-impl.cpp +0 -171
  624. data/ext/sources/examples/talk-llama/llama-impl.h +0 -75
  625. data/ext/sources/examples/talk-llama/llama-io.cpp +0 -15
  626. data/ext/sources/examples/talk-llama/llama-io.h +0 -35
  627. data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.cpp +0 -330
  628. data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.h +0 -137
  629. data/ext/sources/examples/talk-llama/llama-kv-cache.cpp +0 -2285
  630. data/ext/sources/examples/talk-llama/llama-kv-cache.h +0 -389
  631. data/ext/sources/examples/talk-llama/llama-kv-cells.h +0 -533
  632. data/ext/sources/examples/talk-llama/llama-memory-hybrid-iswa.cpp +0 -275
  633. data/ext/sources/examples/talk-llama/llama-memory-hybrid-iswa.h +0 -140
  634. data/ext/sources/examples/talk-llama/llama-memory-hybrid.cpp +0 -268
  635. data/ext/sources/examples/talk-llama/llama-memory-hybrid.h +0 -139
  636. data/ext/sources/examples/talk-llama/llama-memory-recurrent.cpp +0 -1165
  637. data/ext/sources/examples/talk-llama/llama-memory-recurrent.h +0 -182
  638. data/ext/sources/examples/talk-llama/llama-memory.cpp +0 -59
  639. data/ext/sources/examples/talk-llama/llama-memory.h +0 -122
  640. data/ext/sources/examples/talk-llama/llama-mmap.cpp +0 -752
  641. data/ext/sources/examples/talk-llama/llama-mmap.h +0 -73
  642. data/ext/sources/examples/talk-llama/llama-model-loader.cpp +0 -1655
  643. data/ext/sources/examples/talk-llama/llama-model-loader.h +0 -206
  644. data/ext/sources/examples/talk-llama/llama-model-saver.cpp +0 -299
  645. data/ext/sources/examples/talk-llama/llama-model-saver.h +0 -40
  646. data/ext/sources/examples/talk-llama/llama-model.cpp +0 -9056
  647. data/ext/sources/examples/talk-llama/llama-model.h +0 -597
  648. data/ext/sources/examples/talk-llama/llama-quant.cpp +0 -1304
  649. data/ext/sources/examples/talk-llama/llama-quant.h +0 -1
  650. data/ext/sources/examples/talk-llama/llama-sampler.cpp +0 -3885
  651. data/ext/sources/examples/talk-llama/llama-sampler.h +0 -42
  652. data/ext/sources/examples/talk-llama/llama-vocab.cpp +0 -3970
  653. data/ext/sources/examples/talk-llama/llama-vocab.h +0 -187
  654. data/ext/sources/examples/talk-llama/llama.cpp +0 -1194
  655. data/ext/sources/examples/talk-llama/llama.h +0 -1573
  656. data/ext/sources/examples/talk-llama/models/afmoe.cpp +0 -190
  657. data/ext/sources/examples/talk-llama/models/apertus.cpp +0 -125
  658. data/ext/sources/examples/talk-llama/models/arcee.cpp +0 -135
  659. data/ext/sources/examples/talk-llama/models/arctic.cpp +0 -137
  660. data/ext/sources/examples/talk-llama/models/arwkv7.cpp +0 -86
  661. data/ext/sources/examples/talk-llama/models/baichuan.cpp +0 -123
  662. data/ext/sources/examples/talk-llama/models/bailingmoe.cpp +0 -143
  663. data/ext/sources/examples/talk-llama/models/bailingmoe2.cpp +0 -133
  664. data/ext/sources/examples/talk-llama/models/bert.cpp +0 -184
  665. data/ext/sources/examples/talk-llama/models/bitnet.cpp +0 -145
  666. data/ext/sources/examples/talk-llama/models/bloom.cpp +0 -101
  667. data/ext/sources/examples/talk-llama/models/chameleon.cpp +0 -178
  668. data/ext/sources/examples/talk-llama/models/chatglm.cpp +0 -132
  669. data/ext/sources/examples/talk-llama/models/codeshell.cpp +0 -111
  670. data/ext/sources/examples/talk-llama/models/cogvlm.cpp +0 -102
  671. data/ext/sources/examples/talk-llama/models/cohere2-iswa.cpp +0 -134
  672. data/ext/sources/examples/talk-llama/models/command-r.cpp +0 -122
  673. data/ext/sources/examples/talk-llama/models/dbrx.cpp +0 -122
  674. data/ext/sources/examples/talk-llama/models/deci.cpp +0 -135
  675. data/ext/sources/examples/talk-llama/models/deepseek.cpp +0 -142
  676. data/ext/sources/examples/talk-llama/models/deepseek2.cpp +0 -262
  677. data/ext/sources/examples/talk-llama/models/delta-net-base.cpp +0 -445
  678. data/ext/sources/examples/talk-llama/models/dots1.cpp +0 -132
  679. data/ext/sources/examples/talk-llama/models/dream.cpp +0 -105
  680. data/ext/sources/examples/talk-llama/models/ernie4-5-moe.cpp +0 -148
  681. data/ext/sources/examples/talk-llama/models/ernie4-5.cpp +0 -110
  682. data/ext/sources/examples/talk-llama/models/eurobert.cpp +0 -97
  683. data/ext/sources/examples/talk-llama/models/exaone-moe.cpp +0 -145
  684. data/ext/sources/examples/talk-llama/models/exaone.cpp +0 -114
  685. data/ext/sources/examples/talk-llama/models/exaone4.cpp +0 -123
  686. data/ext/sources/examples/talk-llama/models/falcon-h1.cpp +0 -111
  687. data/ext/sources/examples/talk-llama/models/falcon.cpp +0 -120
  688. data/ext/sources/examples/talk-llama/models/gemma-embedding.cpp +0 -116
  689. data/ext/sources/examples/talk-llama/models/gemma.cpp +0 -112
  690. data/ext/sources/examples/talk-llama/models/gemma2-iswa.cpp +0 -128
  691. data/ext/sources/examples/talk-llama/models/gemma3.cpp +0 -155
  692. data/ext/sources/examples/talk-llama/models/gemma3n-iswa.cpp +0 -384
  693. data/ext/sources/examples/talk-llama/models/glm4-moe.cpp +0 -170
  694. data/ext/sources/examples/talk-llama/models/glm4.cpp +0 -157
  695. data/ext/sources/examples/talk-llama/models/gpt2.cpp +0 -105
  696. data/ext/sources/examples/talk-llama/models/gptneox.cpp +0 -144
  697. data/ext/sources/examples/talk-llama/models/granite-hybrid.cpp +0 -195
  698. data/ext/sources/examples/talk-llama/models/granite.cpp +0 -210
  699. data/ext/sources/examples/talk-llama/models/grok.cpp +0 -159
  700. data/ext/sources/examples/talk-llama/models/grovemoe.cpp +0 -139
  701. data/ext/sources/examples/talk-llama/models/hunyuan-dense.cpp +0 -132
  702. data/ext/sources/examples/talk-llama/models/hunyuan-moe.cpp +0 -153
  703. data/ext/sources/examples/talk-llama/models/internlm2.cpp +0 -120
  704. data/ext/sources/examples/talk-llama/models/jais.cpp +0 -86
  705. data/ext/sources/examples/talk-llama/models/jais2.cpp +0 -123
  706. data/ext/sources/examples/talk-llama/models/jamba.cpp +0 -106
  707. data/ext/sources/examples/talk-llama/models/kimi-linear.cpp +0 -381
  708. data/ext/sources/examples/talk-llama/models/lfm2.cpp +0 -196
  709. data/ext/sources/examples/talk-llama/models/llada-moe.cpp +0 -122
  710. data/ext/sources/examples/talk-llama/models/llada.cpp +0 -99
  711. data/ext/sources/examples/talk-llama/models/llama-iswa.cpp +0 -178
  712. data/ext/sources/examples/talk-llama/models/llama.cpp +0 -175
  713. data/ext/sources/examples/talk-llama/models/maincoder.cpp +0 -117
  714. data/ext/sources/examples/talk-llama/models/mamba-base.cpp +0 -289
  715. data/ext/sources/examples/talk-llama/models/mamba.cpp +0 -54
  716. data/ext/sources/examples/talk-llama/models/mimo2-iswa.cpp +0 -129
  717. data/ext/sources/examples/talk-llama/models/minicpm3.cpp +0 -200
  718. data/ext/sources/examples/talk-llama/models/minimax-m2.cpp +0 -123
  719. data/ext/sources/examples/talk-llama/models/mistral3.cpp +0 -160
  720. data/ext/sources/examples/talk-llama/models/models.h +0 -704
  721. data/ext/sources/examples/talk-llama/models/modern-bert.cpp +0 -109
  722. data/ext/sources/examples/talk-llama/models/mpt.cpp +0 -126
  723. data/ext/sources/examples/talk-llama/models/nemotron-h.cpp +0 -162
  724. data/ext/sources/examples/talk-llama/models/nemotron.cpp +0 -122
  725. data/ext/sources/examples/talk-llama/models/neo-bert.cpp +0 -104
  726. data/ext/sources/examples/talk-llama/models/olmo.cpp +0 -121
  727. data/ext/sources/examples/talk-llama/models/olmo2.cpp +0 -150
  728. data/ext/sources/examples/talk-llama/models/olmoe.cpp +0 -124
  729. data/ext/sources/examples/talk-llama/models/openai-moe-iswa.cpp +0 -127
  730. data/ext/sources/examples/talk-llama/models/openelm.cpp +0 -124
  731. data/ext/sources/examples/talk-llama/models/orion.cpp +0 -123
  732. data/ext/sources/examples/talk-llama/models/paddleocr.cpp +0 -122
  733. data/ext/sources/examples/talk-llama/models/pangu-embedded.cpp +0 -121
  734. data/ext/sources/examples/talk-llama/models/phi2.cpp +0 -121
  735. data/ext/sources/examples/talk-llama/models/phi3.cpp +0 -152
  736. data/ext/sources/examples/talk-llama/models/plamo.cpp +0 -110
  737. data/ext/sources/examples/talk-llama/models/plamo2.cpp +0 -320
  738. data/ext/sources/examples/talk-llama/models/plamo3.cpp +0 -128
  739. data/ext/sources/examples/talk-llama/models/plm.cpp +0 -169
  740. data/ext/sources/examples/talk-llama/models/qwen.cpp +0 -108
  741. data/ext/sources/examples/talk-llama/models/qwen2.cpp +0 -126
  742. data/ext/sources/examples/talk-llama/models/qwen2moe.cpp +0 -151
  743. data/ext/sources/examples/talk-llama/models/qwen2vl.cpp +0 -117
  744. data/ext/sources/examples/talk-llama/models/qwen3.cpp +0 -120
  745. data/ext/sources/examples/talk-llama/models/qwen35.cpp +0 -381
  746. data/ext/sources/examples/talk-llama/models/qwen35moe.cpp +0 -422
  747. data/ext/sources/examples/talk-llama/models/qwen3moe.cpp +0 -131
  748. data/ext/sources/examples/talk-llama/models/qwen3next.cpp +0 -525
  749. data/ext/sources/examples/talk-llama/models/qwen3vl-moe.cpp +0 -140
  750. data/ext/sources/examples/talk-llama/models/qwen3vl.cpp +0 -132
  751. data/ext/sources/examples/talk-llama/models/refact.cpp +0 -94
  752. data/ext/sources/examples/talk-llama/models/rnd1.cpp +0 -126
  753. data/ext/sources/examples/talk-llama/models/rwkv6-base.cpp +0 -164
  754. data/ext/sources/examples/talk-llama/models/rwkv6.cpp +0 -94
  755. data/ext/sources/examples/talk-llama/models/rwkv6qwen2.cpp +0 -86
  756. data/ext/sources/examples/talk-llama/models/rwkv7-base.cpp +0 -137
  757. data/ext/sources/examples/talk-llama/models/rwkv7.cpp +0 -90
  758. data/ext/sources/examples/talk-llama/models/seed-oss.cpp +0 -124
  759. data/ext/sources/examples/talk-llama/models/smallthinker.cpp +0 -126
  760. data/ext/sources/examples/talk-llama/models/smollm3.cpp +0 -128
  761. data/ext/sources/examples/talk-llama/models/stablelm.cpp +0 -146
  762. data/ext/sources/examples/talk-llama/models/starcoder.cpp +0 -100
  763. data/ext/sources/examples/talk-llama/models/starcoder2.cpp +0 -121
  764. data/ext/sources/examples/talk-llama/models/step35-iswa.cpp +0 -165
  765. data/ext/sources/examples/talk-llama/models/t5-dec.cpp +0 -166
  766. data/ext/sources/examples/talk-llama/models/t5-enc.cpp +0 -96
  767. data/ext/sources/examples/talk-llama/models/wavtokenizer-dec.cpp +0 -149
  768. data/ext/sources/examples/talk-llama/models/xverse.cpp +0 -108
  769. data/ext/sources/examples/talk-llama/prompts/talk-alpaca.txt +0 -23
  770. data/ext/sources/examples/talk-llama/speak +0 -40
  771. data/ext/sources/examples/talk-llama/speak.bat +0 -1
  772. data/ext/sources/examples/talk-llama/speak.ps1 +0 -14
  773. data/ext/sources/examples/talk-llama/talk-llama.cpp +0 -813
  774. data/ext/sources/examples/talk-llama/unicode-data.cpp +0 -7034
  775. data/ext/sources/examples/talk-llama/unicode-data.h +0 -20
  776. data/ext/sources/examples/talk-llama/unicode.cpp +0 -1103
  777. data/ext/sources/examples/talk-llama/unicode.h +0 -111
  778. data/ext/sources/examples/wchess/CMakeLists.txt +0 -10
  779. data/ext/sources/examples/wchess/libwchess/CMakeLists.txt +0 -19
  780. data/ext/sources/examples/wchess/libwchess/Chessboard.cpp +0 -803
  781. data/ext/sources/examples/wchess/libwchess/Chessboard.h +0 -33
  782. data/ext/sources/examples/wchess/libwchess/WChess.cpp +0 -193
  783. data/ext/sources/examples/wchess/libwchess/WChess.h +0 -63
  784. data/ext/sources/examples/wchess/libwchess/test-chessboard.cpp +0 -117
  785. data/ext/sources/examples/wchess/wchess.cmd/CMakeLists.txt +0 -8
  786. data/ext/sources/examples/wchess/wchess.cmd/wchess.cmd.cpp +0 -253
  787. data/ext/sources/examples/whisper.wasm/CMakeLists.txt +0 -50
  788. data/ext/sources/examples/whisper.wasm/emscripten.cpp +0 -118
  789. data/ext/sources/examples/whisper.wasm/index-tmpl.html +0 -659
  790. data/ext/sources/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +0 -99
  791. data/ext/sources/ggml/src/ggml-hexagon/htp/htp-msg.h +0 -155
  792. data/ext/sources/ggml/src/ggml-hexagon/op-desc.h +0 -153
  793. data/ext/sources/ggml/src/ggml-opencl/kernels/embed_kernel.py +0 -26
  794. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.cpp +0 -123
  795. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.h +0 -17
  796. data/ext/sources/ggml/src/ggml-virtgpu/regenerate_remoting.py +0 -333
  797. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rte.glsl +0 -5
  798. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +0 -182
  799. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/glu.tmpl.wgsl +0 -323
  800. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl +0 -718
  801. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm.wgsl +0 -123
  802. data/ext/sources/tests/CMakeLists.txt +0 -112
  803. data/ext/sources/tests/earnings21/eval.mk +0 -58
  804. data/ext/sources/tests/earnings21/eval.py +0 -68
  805. data/ext/sources/tests/earnings21/normalizers/__init__.py +0 -2
  806. data/ext/sources/tests/earnings21/normalizers/basic.py +0 -80
  807. data/ext/sources/tests/earnings21/normalizers/english.json +0 -1741
  808. data/ext/sources/tests/earnings21/normalizers/english.py +0 -550
  809. data/ext/sources/tests/earnings21/requirements.txt +0 -6
  810. data/ext/sources/tests/en-0-ref.txt +0 -1
  811. data/ext/sources/tests/en-1-ref.txt +0 -1
  812. data/ext/sources/tests/en-2-ref.txt +0 -1
  813. data/ext/sources/tests/es-0-ref.txt +0 -1
  814. data/ext/sources/tests/librispeech/eval.mk +0 -39
  815. data/ext/sources/tests/librispeech/eval.py +0 -47
  816. data/ext/sources/tests/librispeech/normalizers/__init__.py +0 -2
  817. data/ext/sources/tests/librispeech/normalizers/basic.py +0 -80
  818. data/ext/sources/tests/librispeech/normalizers/english.json +0 -1741
  819. data/ext/sources/tests/librispeech/normalizers/english.py +0 -550
  820. data/ext/sources/tests/librispeech/requirements.txt +0 -6
  821. data/ext/sources/tests/run-tests.sh +0 -130
  822. data/ext/sources/tests/test-c.c +0 -3
  823. data/ext/sources/tests/test-vad-full.cpp +0 -56
  824. data/ext/sources/tests/test-vad.cpp +0 -83
  825. data/ext/sources/tests/test-whisper.js +0 -58
  826. data/lib/whisper/context.rb +0 -15
  827. data/lib/whisper/segment.rb +0 -58
  828. /data/ext/sources/ggml/src/ggml-opencl/kernels/{gemv_noshuffle_general_q8_0_f32.cl → gemv_noshuffle_q8_0_f32.cl} +0 -0
@@ -1,5 +1,7 @@
1
1
  #pragma clang diagnostic ignored "-Wgnu-zero-variadic-macro-arguments"
2
2
  #pragma clang diagnostic ignored "-Wunused-function"
3
+ #pragma clang diagnostic ignored "-Wunused-variable"
4
+ #pragma clang diagnostic ignored "-Wunused-but-set-variable"
3
5
 
4
6
  #include <HAP_farf.h>
5
7
  #include <HAP_perf.h>
@@ -10,19 +12,23 @@
10
12
  #include <HAP_mem.h>
11
13
  #include <HAP_power.h>
12
14
  #include <HAP_ps.h>
15
+ #include <HAP_dcvs.h>
13
16
  #include <qurt.h>
14
17
  #include <qurt_thread.h>
18
+ #include <qurt_memory.h>
15
19
  #include <remote.h>
16
20
  #include <string.h>
17
21
 
18
- #include "hex-dma.h"
19
22
  #include "hex-utils.h"
23
+ #include "hex-dma.h"
24
+ #include "hmx-queue.h"
20
25
 
21
26
  #define GGML_COMMON_DECL_C
22
27
  #include "ggml-common.h"
23
28
  #include "htp-ctx.h"
24
- #include "htp-msg.h"
25
29
  #include "htp-ops.h"
30
+ #include "htp-ops.h"
31
+ #include "htp_iface.h"
26
32
  #include "worker-pool.h"
27
33
 
28
34
  AEEResult htp_iface_open(const char * uri, remote_handle64 * handle) {
@@ -34,7 +40,7 @@ AEEResult htp_iface_open(const char * uri, remote_handle64 * handle) {
34
40
  return AEE_ENOMEMORY;
35
41
  }
36
42
 
37
- // Use the context structure as a handle
43
+ // Use the context structure as the handle
38
44
  *handle = (remote_handle64) ctx;
39
45
 
40
46
  // Enable FARF logs
@@ -58,8 +64,7 @@ AEEResult htp_iface_open(const char * uri, remote_handle64 * handle) {
58
64
 
59
65
  request.type = HAP_power_set_DCVS_v3;
60
66
  request.dcvs_v3.set_dcvs_enable = TRUE;
61
- request.dcvs_v3.dcvs_enable = TRUE;
62
- request.dcvs_v3.dcvs_option = HAP_DCVS_V2_PERFORMANCE_MODE;
67
+ request.dcvs_v3.dcvs_enable = FALSE;
63
68
  request.dcvs_v3.set_bus_params = TRUE;
64
69
  request.dcvs_v3.bus_params.min_corner = HAP_DCVS_VCORNER_MAX;
65
70
  request.dcvs_v3.bus_params.max_corner = HAP_DCVS_VCORNER_MAX;
@@ -70,6 +75,10 @@ AEEResult htp_iface_open(const char * uri, remote_handle64 * handle) {
70
75
  request.dcvs_v3.core_params.target_corner = HAP_DCVS_VCORNER_MAX;
71
76
  request.dcvs_v3.set_sleep_disable = TRUE;
72
77
  request.dcvs_v3.sleep_disable = TRUE;
78
+
79
+ #if (__HEXAGON_ARCH__ >= 79)
80
+ HAP_set_dcvs_v3_protected_bus_corners(&request, 1);
81
+ #endif
73
82
  if ((err = HAP_power_set((void *) ctx, &request)) != 0) {
74
83
  return err;
75
84
  }
@@ -82,6 +91,27 @@ AEEResult htp_iface_open(const char * uri, remote_handle64 * handle) {
82
91
  }
83
92
  }
84
93
 
94
+ #if __HVX_ARCH__ >= 75
95
+ {
96
+ // Power on HMX and set HMX clock
97
+ HAP_power_request_t request;
98
+ memset(&request, 0, sizeof(HAP_power_request_t));
99
+ request.type = HAP_power_set_HMX_v2;
100
+ request.hmx_v2.set_power = TRUE;
101
+ request.hmx_v2.power_up = TRUE;
102
+ request.hmx_v2.set_clock = TRUE;
103
+ request.hmx_v2.target_corner = HAP_DCVS_EXP_VCORNER_MAX;
104
+ request.hmx_v2.min_corner = HAP_DCVS_EXP_VCORNER_MAX;
105
+ request.hmx_v2.max_corner = HAP_DCVS_EXP_VCORNER_MAX;
106
+ request.hmx_v2.perf_mode = HAP_CLK_PERF_HIGH;
107
+ FARF(ALWAYS, "Setting HMX clock\n");
108
+ err = HAP_power_set((void *) ctx, &request);
109
+ if (err != AEE_SUCCESS) {
110
+ FARF(ERROR, "ggml-hex: error setting HMX clock.");
111
+ return err;
112
+ }
113
+ }
114
+ #else
85
115
  {
86
116
  // Power on HMX
87
117
  HAP_power_request_t request;
@@ -89,12 +119,61 @@ AEEResult htp_iface_open(const char * uri, remote_handle64 * handle) {
89
119
  request.type = HAP_power_set_HMX;
90
120
  request.hmx.power_up = TRUE;
91
121
  FARF(ALWAYS, "Powering HMX on\n");
92
- err = HAP_power_set((void *) &ctx, &request);
122
+ err = HAP_power_set((void *) ctx, &request);
93
123
  if (err != AEE_SUCCESS) {
94
- FARF(ERROR, "Error powering on HMX.");
124
+ FARF(ERROR, "ggml-hex: error powering on HMX.");
95
125
  return err;
96
126
  }
97
127
  }
128
+ #endif
129
+
130
+ return AEE_SUCCESS;
131
+ }
132
+
133
+ AEEResult htp_iface_etm(remote_handle64 handle, uint32_t enable) {
134
+ int err = enable ? HAP_user_etm_enable() : HAP_user_etm_disable();
135
+ if (err) {
136
+ if (err == AEE_EVERSIONNOTSUPPORT) {
137
+ FARF(ERROR, "API HAP_user_etm_enable/disable is not supported\n");
138
+ } else {
139
+ FARF(ERROR, "Error executing HAP_user_etm_enable/disable with error code : 0x%x\n", err);
140
+ }
141
+ }
142
+ return err;
143
+ }
144
+
145
+ AEEResult htp_iface_profiler(remote_handle64 handle, uint32_t mode, const htp_iface_pmu_conf* pmu_conf) {
146
+ struct htp_context * ctx = (struct htp_context *) handle;
147
+ if (!ctx) {
148
+ return AEE_EBADPARM;
149
+ }
150
+
151
+ if (mode == HTP_PROF_PMU) {
152
+ const uint32_t* events = pmu_conf->events;
153
+
154
+ // Pack 4 event IDs (low 8 bits) into each 32-bit config register
155
+ uint32_t evtcfg = 0, evtcfg1 = 0, cfg = 0, i = 0;
156
+ for (; i < HEX_NUM_PMU_COUNTERS/2; i++) {
157
+ evtcfg |= ((events[i + 0] & 0xFF) << (i * 8));
158
+ evtcfg1 |= ((events[i + 4] & 0xFF) << (i * 8));
159
+ }
160
+
161
+ // For events >255 pack high 2 bits of all 8 event IDs into cfg register
162
+ // 2 bits per counter: bits [1:0] for counter 0, [3:2] for counter 1, etc.
163
+ for (i = 0; i < HEX_NUM_PMU_COUNTERS; i++) {
164
+ cfg |= (((events[i] >> 8) & 3) << (i * 2));
165
+ }
166
+
167
+ FARF(ALWAYS, "Configuring PMU registers: evtcfg = 0x%x, evtcfg1 = 0x%x, pmucfg = 0x%x", evtcfg, evtcfg1, cfg);
168
+
169
+ // Configure PMU registers
170
+ qurt_pmu_set(QURT_PMUCFG, cfg);
171
+ qurt_pmu_set(QURT_PMUEVTCFG, evtcfg);
172
+ qurt_pmu_set(QURT_PMUEVTCFG1, evtcfg1);
173
+ qurt_pmu_enable(1);
174
+ }
175
+
176
+ ctx->profiler = mode;
98
177
 
99
178
  return AEE_SUCCESS;
100
179
  }
@@ -111,91 +190,128 @@ AEEResult htp_iface_close(remote_handle64 handle) {
111
190
  return AEE_EITEMBUSY;
112
191
  }
113
192
 
193
+ // release the mmaps (if any)
194
+ for (uint32_t i=0; i<HTP_MAX_MMAPS; i++) {
195
+ if (ctx->mmap[i].size) {
196
+ #if __HVX_ARCH__ > 73
197
+ HAP_munmap2((void *) ctx->mmap[i].base, ctx->mmap[i].size);
198
+ #else
199
+ HAP_munmap((void *) ctx->mmap[i].base, ctx->mmap[i].size);
200
+ #endif
201
+ ctx->mmap[i].size = 0;
202
+ ctx->mmap[i].base = NULL;
203
+ ctx->mmap[i].fd = -1;
204
+ }
205
+ }
206
+
207
+ if (ctx->profiler) {
208
+ qurt_pmu_enable(1);
209
+ }
210
+
211
+ if (ctx->etm) {
212
+ HAP_user_etm_disable();
213
+ }
214
+
114
215
  free(ctx);
115
216
  return AEE_SUCCESS;
116
217
  }
117
218
 
118
- AEEResult htp_iface_enable_etm(remote_handle64 handle) {
119
- int err = HAP_user_etm_enable();
120
- if (err) {
121
- if (err == AEE_EVERSIONNOTSUPPORT) {
122
- FARF(ERROR, "API HAP_user_etm_enable is not supported\n");
123
- } else {
124
- FARF(ERROR, "Error executing HAP_user_etm_enable with error code : 0x%x\n", err);
219
+ AEEResult htp_iface_mmap(remote_handle64 handle, uint32_t fd, uint32_t size) {
220
+ struct htp_context * ctx = (struct htp_context *) handle;
221
+ if (!ctx) {
222
+ return AEE_EBADPARM;
223
+ }
224
+
225
+ // See if we already have this mapping
226
+ for (uint32_t i=0; i<HTP_MAX_MMAPS; i++) {
227
+ struct htp_mmap *m = &ctx->mmap[i];
228
+ if (m->fd == fd) {
229
+ return AEE_SUCCESS;
125
230
  }
126
231
  }
127
- return err;
232
+
233
+ // Add new mapping
234
+ for (uint32_t i=0; i<HTP_MAX_MMAPS; i++) {
235
+ struct htp_mmap *m = &ctx->mmap[i];
236
+ if (!m->size) {
237
+ FARF(HIGH, "mmap : fd %u size %u", fd, size);
238
+ #if __HVX_ARCH__ > 73
239
+ void *va = HAP_mmap2(NULL, size, HAP_PROT_READ | HAP_PROT_WRITE, 0, fd, 0);
240
+ #else
241
+ if (size > HTP_MMAP_MAX_VMEM) { // HAP_mmap has a size limit of 2GB
242
+ FARF(ERROR, "mmap failed : size %u exceeds 2GB limit for HAP_mmap", (uint32_t) size);
243
+ abort(); // can't do much else at this point
244
+ }
245
+
246
+ void *va = HAP_mmap(NULL, size, HAP_PROT_READ | HAP_PROT_WRITE, 0, fd, 0);
247
+ #endif
248
+ if (va == (void*)-1) {
249
+ FARF(ERROR, "mmap failed : va %p fd %u size %u", va, fd, (uint32_t) size);
250
+ return AEE_EFAILED;
251
+ }
252
+
253
+ m->base = (uint64_t) va;
254
+ m->fd = fd;
255
+ m->size = size;
256
+
257
+ return AEE_SUCCESS;
258
+ }
259
+ }
260
+
261
+ return AEE_ENOMEMORY;
128
262
  }
129
263
 
130
- AEEResult htp_iface_disable_etm(remote_handle64 handle) {
131
- int err = HAP_user_etm_disable();
132
- if (err) {
133
- if (err == AEE_EVERSIONNOTSUPPORT) {
134
- FARF(ERROR, "API HAP_user_etm_disable is not supported\n");
135
- } else {
136
- FARF(ERROR, "Error executing HAP_user_etm_disable with error code : 0x%x\n", err);
264
+ AEEResult htp_iface_munmap(remote_handle64 handle, uint32 fd) {
265
+ struct htp_context * ctx = (struct htp_context *) handle;
266
+ if (!ctx) {
267
+ return AEE_EBADPARM;
268
+ }
269
+
270
+ for (uint32_t i=0; i<HTP_MAX_MMAPS; i++) {
271
+ struct htp_mmap *m = &ctx->mmap[i];
272
+ if (fd < 0 || m->fd == fd) {
273
+ FARF(HIGH, "unmmap : base %p fd %u size %u", (void*) m->base, m->fd, (uint32_t) m->size);
274
+ #if __HVX_ARCH__ > 73
275
+ HAP_munmap2((void *) m->base, m->size);
276
+ #else
277
+ HAP_munmap((void *) m->base, m->size);
278
+ #endif
279
+ m->size = 0;
280
+ m->base = NULL;
281
+ m->fd = -1;
137
282
  }
138
283
  }
139
- return err;
284
+
285
+ return AEE_SUCCESS;
140
286
  }
141
287
 
142
- static int vtcm_acquire(struct htp_context * ctx) {
143
- int err;
288
+ static void vtcm_acquire(struct htp_context * ctx) {
144
289
  if (!ctx->vtcm_valid) {
145
- // Temporarily bump thread priority to make sure it's higher than other sessions.
146
- // This way the resource manager will notify the other thread to release VTCM.
147
- // Note that we need to reaquire VTCM at normal priority for this to work next time.
148
- qurt_thread_set_priority(qurt_thread_get_id(), ctx->thread_prio - 10);
149
- err = HAP_compute_res_acquire_cached(ctx->vtcm_rctx, 1000000);
290
+ int err = HAP_compute_res_acquire_cached(ctx->vtcm_rctx, 1000000u);
150
291
  if (err != 0) {
151
- FARF(ERROR, "Failed to acquire VTCM: 0x%08x", (unsigned)err);
292
+ FARF(ERROR, "ggml-hex: failed to acquire VTCM: 0x%08x", (unsigned)err);
152
293
  abort();
153
294
  }
154
- HAP_compute_res_release_cached(ctx->vtcm_rctx);
155
- qurt_thread_set_priority(qurt_thread_get_id(), ctx->thread_prio);
156
295
 
157
- err = HAP_compute_res_acquire_cached(ctx->vtcm_rctx, 1000000);
158
- if (err != 0) {
159
- FARF(ERROR, "Failed to acquire VTCM: 0x%08x", (unsigned)err);
160
- abort();
161
- }
296
+ ctx->vtcm_needs_release = false;
162
297
  ctx->vtcm_valid = true;
163
- }
164
298
 
165
- ctx->vtcm_inuse = true;
166
- return 0;
299
+ // Drop the priority to make sure we get the release callback from other GGML-HTP and QNN-HTP sessions
300
+ HAP_compute_res_update_priority(ctx->vtcm_rctx, ctx->thread_prio + 10);
301
+ }
167
302
  }
168
303
 
169
- static int vtcm_release(struct htp_context * ctx) {
170
- ctx->vtcm_inuse = false;
171
-
172
- if (ctx->vtcm_valid && ctx->vtcm_needs_release) {
304
+ static void vtcm_release(struct htp_context * ctx) {
305
+ if (ctx->vtcm_valid) {
173
306
  ctx->vtcm_valid = false;
174
307
  ctx->vtcm_needs_release = false;
175
308
  HAP_compute_res_release_cached(ctx->vtcm_rctx);
176
309
  }
177
-
178
- return 0;
179
310
  }
180
311
 
181
312
  static int vtcm_release_callback(unsigned int rctx, void * state) {
182
313
  struct htp_context * ctx = (struct htp_context *) state;
183
-
184
- if (!ctx || ctx->vtcm_rctx != rctx) {
185
- return AEE_EBADPARM;
186
- }
187
-
188
- // If VTCM is not inuse (not processing Ops) release it right here
189
- // otherwise we'll release it once we're done with the current Op.
190
-
191
- if (ctx->vtcm_inuse) {
192
- ctx->vtcm_needs_release = true;
193
- return 0;
194
- }
195
-
196
- ctx->vtcm_valid = false;
197
- HAP_compute_res_release_cached(ctx->vtcm_rctx);
198
-
314
+ ctx->vtcm_needs_release = true;
199
315
  return 0;
200
316
  }
201
317
 
@@ -207,7 +323,7 @@ static int vtcm_alloc(struct htp_context * ctx) {
207
323
  HAP_compute_res_attr_init(&attr);
208
324
  HAP_compute_res_attr_set_serialize(&attr, 0);
209
325
  HAP_compute_res_attr_set_cache_mode(&attr, 1);
210
- HAP_compute_res_attr_set_vtcm_param_v2(&attr, vtcm_size, 0, vtcm_size);
326
+ HAP_compute_res_attr_set_vtcm_param_v2(&attr, vtcm_size, vtcm_size, vtcm_size); // single page
211
327
  HAP_compute_res_attr_set_release_callback(&attr, vtcm_release_callback, (void *) ctx);
212
328
  HAP_compute_res_attr_set_hmx_param(&attr, 1);
213
329
 
@@ -229,7 +345,6 @@ static int vtcm_alloc(struct htp_context * ctx) {
229
345
  ctx->vtcm_size = vtcm_size;
230
346
  ctx->vtcm_rctx = rctx;
231
347
  ctx->vtcm_valid = false;
232
- ctx->vtcm_inuse = false;
233
348
  ctx->vtcm_needs_release = false;
234
349
 
235
350
  return 0;
@@ -246,7 +361,7 @@ static void vtcm_free(struct htp_context * ctx) {
246
361
  static void htp_packet_callback(dspqueue_t queue, int error, void * context);
247
362
  static void htp_error_callback(dspqueue_t queue, int error, void * context);
248
363
 
249
- AEEResult htp_iface_start(remote_handle64 handle, uint32 sess_id, uint64 dsp_queue_id, uint32 n_hvx) {
364
+ AEEResult htp_iface_start(remote_handle64 handle, uint32 sess_id, uint64 dsp_queue_id, uint32 n_hvx, uint32 use_hmx, uint64_t max_vmem) {
250
365
  struct htp_context * ctx = (struct htp_context *) handle;
251
366
 
252
367
  if (!ctx) {
@@ -264,12 +379,12 @@ AEEResult htp_iface_start(remote_handle64 handle, uint32 sess_id, uint64 dsp_que
264
379
  htp_error_callback, // Error callback; no errors expected on the DSP
265
380
  (void *) ctx, // Callback context
266
381
  &ctx->queue);
267
-
268
382
  if (err) {
269
383
  FARF(ERROR, "Queue import failed with 0x%08x", (unsigned) err);
270
384
  return err;
271
385
  }
272
386
 
387
+ ctx->max_vmem = max_vmem;
273
388
  ctx->thread_id = qurt_thread_get_id();
274
389
  ctx->thread_prio = qurt_thread_get_priority(ctx->thread_id);
275
390
 
@@ -280,6 +395,19 @@ AEEResult htp_iface_start(remote_handle64 handle, uint32 sess_id, uint64 dsp_que
280
395
  return AEE_ENOMEMORY;
281
396
  }
282
397
 
398
+ #ifdef HTP_HAS_HMX
399
+ ctx->hmx_enabled = use_hmx;
400
+ ctx->hmx_queue = NULL;
401
+ if (use_hmx) {
402
+ ctx->hmx_queue = hmx_queue_create(16, ctx->vtcm_rctx);
403
+ if (!ctx->hmx_queue) {
404
+ FARF(ERROR, "hmx-queue-create failed");
405
+ ctx->hmx_enabled = false;
406
+ }
407
+ }
408
+ FARF(HIGH, "HMX %s (use_hmx=%d)", ctx->hmx_enabled ? "enabled" : "disabled", use_hmx);
409
+ #endif
410
+
283
411
  qurt_sysenv_max_hthreads_t hw_threads;
284
412
  qurt_sysenv_get_max_hw_threads(&hw_threads);
285
413
  uint32_t hw_nhvx = (qurt_hvx_get_units() >> 8) & 0xFF;
@@ -296,14 +424,21 @@ AEEResult htp_iface_start(remote_handle64 handle, uint32 sess_id, uint64 dsp_que
296
424
 
297
425
  ctx->n_threads = n_hvx;
298
426
  for (int i = 0; i < ctx->n_threads; i++) {
299
- // see discussion https://github.com/ggml-org/llama.cpp/pull/18151#discussion_r2632388541
300
- ctx->dma[i] = dma_queue_create(64);
427
+ ctx->dma[i] = dma_queue_create(256); // queue depth
301
428
  }
302
429
 
430
+ ctx->ddr_spad_size = 512 * 1024; // 512 KB
431
+ ctx->ddr_spad_base = memalign(128, ctx->ddr_spad_size);
432
+
303
433
  // init worker pool
304
434
  err = worker_pool_init(&ctx->worker_pool, n_hvx);
305
435
  if (err != AEE_SUCCESS) {
306
436
  FARF(ERROR, "Unable to create worker pool");
437
+ if (ctx->ddr_spad_base) {
438
+ free(ctx->ddr_spad_base);
439
+ ctx->ddr_spad_base = NULL;
440
+ ctx->ddr_spad_size = 0;
441
+ }
307
442
  return err;
308
443
  }
309
444
 
@@ -341,8 +476,22 @@ AEEResult htp_iface_stop(remote_handle64 handle) {
341
476
  dma_queue_delete(ctx->dma[i]);
342
477
  }
343
478
 
479
+ #ifdef HTP_HAS_HMX
480
+ if (ctx->hmx_queue) {
481
+ hmx_queue_delete(ctx->hmx_queue);
482
+ ctx->hmx_queue = NULL;
483
+ }
484
+ ctx->hmx_enabled = false;
485
+ #endif
486
+
344
487
  vtcm_free(ctx);
345
488
 
489
+ if (ctx->ddr_spad_base) {
490
+ free(ctx->ddr_spad_base);
491
+ ctx->ddr_spad_base = NULL;
492
+ ctx->ddr_spad_size = 0;
493
+ }
494
+
346
495
  return AEE_SUCCESS;
347
496
  }
348
497
 
@@ -354,846 +503,411 @@ static void htp_error_callback(dspqueue_t queue, int error, void * context) {
354
503
  struct profile_data {
355
504
  uint64_t usecs;
356
505
  uint64_t cycles;
357
- uint64_t pkts;
506
+ uint32_t pmu_counters[HEX_NUM_PMU_COUNTERS];
358
507
  };
359
508
 
360
- static inline void profile_start(struct profile_data * d) {
361
- d->usecs = HAP_perf_get_qtimer_count();
362
- d->cycles = hex_get_cycles();
363
- d->pkts = hex_get_pktcnt();
509
+ static inline void profile_start(uint32_t mode, struct profile_data * d) {
510
+ switch (mode) {
511
+ case HTP_PROF_PMU:
512
+ hex_get_pmu(d->pmu_counters);
513
+ // fallthrough
514
+ case HTP_PROF_BASIC:
515
+ d->usecs = HAP_perf_get_qtimer_count();
516
+ d->cycles = hex_get_cycles();
517
+ break;
518
+ default:
519
+ break;
520
+ }
364
521
  }
365
522
 
366
- static inline void profile_stop(struct profile_data * d) {
367
- d->usecs = HAP_perf_qtimer_count_to_us(HAP_perf_get_qtimer_count() - d->usecs);
368
- d->cycles = hex_get_cycles() - d->cycles;
369
- d->pkts = hex_get_pktcnt() - d->pkts;
523
+ static inline void profile_stop(uint32_t mode, struct profile_data * d) {
524
+ uint32_t pmu_counters[HEX_NUM_PMU_COUNTERS];
525
+ switch (mode) {
526
+ case HTP_PROF_PMU:
527
+ hex_get_pmu(pmu_counters);
528
+ for (int i = 0; i < HEX_NUM_PMU_COUNTERS; i++) {
529
+ d->pmu_counters[i] = pmu_counters[i] - d->pmu_counters[i];
530
+ }
531
+ // fallthrough
532
+ case HTP_PROF_BASIC:
533
+ d->usecs = HAP_perf_qtimer_count_to_us(HAP_perf_get_qtimer_count() - d->usecs);
534
+ d->cycles = hex_get_cycles() - d->cycles;
535
+ break;
536
+ default:
537
+ break;
538
+ }
370
539
  }
371
540
 
372
- static int send_htp_rsp(struct htp_context * c,
373
- uint32_t op,
374
- uint32_t status,
375
- struct dspqueue_buffer * bufs,
376
- size_t n_bufs,
377
- struct profile_data * prof) {
378
- // Prep response struct
379
- struct htp_general_rsp rsp;
380
- rsp.op = op;
381
- rsp.status = status;
382
- rsp.prof_usecs = prof->usecs;
383
- rsp.prof_cycles = prof->cycles;
384
- rsp.prof_pkts = prof->pkts;
385
-
386
- int err = dspqueue_write(c->queue,
387
- 0, // Flags
388
- n_bufs,
389
- bufs, // Buffer references
390
- sizeof(rsp),
391
- (const uint8_t *) &rsp, // Message
392
- DSPQUEUE_TIMEOUT_NONE);
541
+ static int execute_op(struct htp_ops_context * octx) {
542
+ switch (octx->op) {
543
+ case HTP_OP_MUL_MAT:
544
+ return op_matmul(octx);
393
545
 
394
- if (err != 0) {
395
- FARF(ERROR, "dspqueue_write failed: 0x%08x", (unsigned) err);
396
- }
546
+ case HTP_OP_MUL_MAT_ID:
547
+ return op_matmul_id(octx);
397
548
 
398
- return err;
399
- }
549
+ case HTP_OP_MUL:
550
+ case HTP_OP_ADD:
551
+ case HTP_OP_SUB:
552
+ case HTP_OP_DIV:
553
+ case HTP_OP_ADD_ID:
554
+ return op_binary(octx);
400
555
 
401
- static void proc_matmul_req(struct htp_context * ctx,
402
- struct htp_general_req * req,
403
- struct dspqueue_buffer * bufs,
404
- size_t n_bufs) {
405
- struct dspqueue_buffer rsp_bufs[1];
406
-
407
- // We had written to the output buffer, we'd also need to flush it
408
- rsp_bufs[0].fd = bufs[2].fd;
409
- rsp_bufs[0].ptr = bufs[2].ptr;
410
- rsp_bufs[0].size = bufs[2].size;
411
- rsp_bufs[0].offset = bufs[2].offset;
412
- rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP
413
- DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU
414
-
415
- // Setup Op context
416
- struct htp_ops_context octx = { 0 };
417
- octx.ctx = ctx;
418
- octx.src0 = req->src0;
419
- octx.src1 = req->src1;
420
- octx.dst = req->dst;
421
- octx.flags = req->flags;
422
- octx.op = req->op;
423
-
424
- // Update data pointers
425
- octx.src0.data = (uint32_t) bufs[0].ptr;
426
- octx.src1.data = (uint32_t) bufs[1].ptr;
427
- octx.dst.data = (uint32_t) bufs[2].ptr;
428
- octx.n_threads = ctx->n_threads;
429
-
430
- struct profile_data prof;
431
- profile_start(&prof);
432
-
433
- uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR;
434
- if (vtcm_acquire(ctx) == AEE_SUCCESS) {
435
- rsp_status = op_matmul(&octx);
436
- vtcm_release(ctx);
437
- }
556
+ case HTP_OP_NORM:
557
+ case HTP_OP_RMS_NORM:
558
+ case HTP_OP_RMS_NORM_MUL:
559
+ case HTP_OP_SCALE:
560
+ case HTP_OP_SQR:
561
+ case HTP_OP_SQRT:
562
+ case HTP_OP_UNARY_SOFTPLUS:
563
+ case HTP_OP_UNARY_SIGMOID:
564
+ case HTP_OP_UNARY_NEG:
565
+ case HTP_OP_UNARY_EXP:
566
+ case HTP_OP_UNARY_TANH:
567
+ case HTP_OP_L2_NORM:
568
+ return op_unary(octx);
438
569
 
439
- profile_stop(&prof);
440
- send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 1, &prof);
441
- }
570
+ case HTP_OP_UNARY_SILU:
571
+ case HTP_OP_UNARY_GELU:
572
+ case HTP_OP_GLU_SWIGLU:
573
+ case HTP_OP_GLU_SWIGLU_OAI:
574
+ case HTP_OP_GLU_GEGLU:
575
+ return op_activations(octx);
442
576
 
443
- static void proc_argsort_req(struct htp_context * ctx, struct htp_general_req * req, struct dspqueue_buffer * bufs) {
444
- struct dspqueue_buffer rsp_bufs[1];
445
-
446
- // We had written to the output buffer, we'd also need to flush it
447
- rsp_bufs[0].fd = bufs[1].fd;
448
- rsp_bufs[0].ptr = bufs[1].ptr;
449
- rsp_bufs[0].offset = bufs[1].offset;
450
- rsp_bufs[0].size = bufs[1].size;
451
- rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP
452
- DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU
453
-
454
- // Setup Op context
455
- struct htp_ops_context octx = { 0 };
456
- octx.ctx = ctx;
457
- octx.src0 = req->src0;
458
- octx.dst = req->dst;
459
- octx.flags = req->flags;
460
- octx.op = req->op;
461
-
462
- memcpy(octx.op_params, req->op_params, sizeof(octx.op_params));
463
-
464
- // Update data pointers
465
- octx.src0.data = (uint32_t) bufs[0].ptr;
466
- octx.dst.data = (uint32_t) bufs[1].ptr;
467
- octx.n_threads = ctx->n_threads;
468
-
469
- struct profile_data prof;
470
- profile_start(&prof);
471
-
472
- uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR;
473
- if (vtcm_acquire(ctx) == AEE_SUCCESS) {
474
- rsp_status = op_argsort(&octx);
475
- vtcm_release(ctx);
476
- }
577
+ case HTP_OP_SOFTMAX:
578
+ return op_softmax(octx);
477
579
 
478
- profile_stop(&prof);
479
- send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 1, &prof);
480
- }
580
+ case HTP_OP_ROPE:
581
+ return op_rope(octx);
481
582
 
482
- static void proc_cpy_req(struct htp_context * ctx, struct htp_general_req * req, struct dspqueue_buffer * bufs) {
483
- struct dspqueue_buffer rsp_bufs[1];
484
-
485
- // We had written to the output buffer, we'd also need to flush it
486
- rsp_bufs[0].fd = bufs[1].fd;
487
- rsp_bufs[0].ptr = bufs[1].ptr;
488
- rsp_bufs[0].offset = bufs[1].offset;
489
- rsp_bufs[0].size = bufs[1].size;
490
- rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP
491
- DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU
492
-
493
- // Setup Op context
494
- struct htp_ops_context octx = { 0 };
495
- octx.ctx = ctx;
496
- octx.src0 = req->src0;
497
- octx.dst = req->dst;
498
- octx.flags = req->flags;
499
- octx.op = req->op;
500
-
501
- // Update data pointers
502
- octx.src0.data = (uint32_t) bufs[0].ptr;
503
- octx.dst.data = (uint32_t) bufs[1].ptr;
504
- octx.n_threads = ctx->n_threads;
505
-
506
- struct profile_data prof;
507
- profile_start(&prof);
508
-
509
- uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR;
510
- if (vtcm_acquire(ctx) == AEE_SUCCESS) {
511
- rsp_status = op_cpy(&octx);
512
- vtcm_release(ctx);
513
- }
583
+ case HTP_OP_FLASH_ATTN_EXT:
584
+ return op_flash_attn_ext(octx);
514
585
 
515
- profile_stop(&prof);
516
- send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 1, &prof);
517
- }
586
+ case HTP_OP_SET_ROWS:
587
+ return op_set_rows(octx);
518
588
 
519
- static void proc_get_rows_req(struct htp_context * ctx, struct htp_general_req * req, struct dspqueue_buffer * bufs) {
520
- struct dspqueue_buffer rsp_bufs[1];
521
-
522
- // We had written to the output buffer, we'd also need to flush it
523
- rsp_bufs[0].fd = bufs[2].fd;
524
- rsp_bufs[0].ptr = bufs[2].ptr;
525
- rsp_bufs[0].offset = bufs[2].offset;
526
- rsp_bufs[0].size = bufs[2].size;
527
- rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP
528
- DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU
529
-
530
- // Setup Op context
531
- struct htp_ops_context octx = { 0 };
532
- octx.ctx = ctx;
533
- octx.src0 = req->src0;
534
- octx.src1 = req->src1;
535
- octx.dst = req->dst;
536
- octx.flags = req->flags;
537
- octx.op = req->op;
538
-
539
- // Update data pointers
540
- octx.src0.data = (uint32_t) bufs[0].ptr;
541
- octx.src1.data = (uint32_t) bufs[1].ptr;
542
- octx.dst.data = (uint32_t) bufs[2].ptr;
543
- octx.n_threads = ctx->n_threads;
544
-
545
- struct profile_data prof;
546
- profile_start(&prof);
547
-
548
- uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR;
549
- if (vtcm_acquire(ctx) == AEE_SUCCESS) {
550
- rsp_status = op_get_rows(&octx);
551
- vtcm_release(ctx);
552
- }
589
+ case HTP_OP_GET_ROWS:
590
+ return op_get_rows(octx);
553
591
 
554
- profile_stop(&prof);
555
- send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 1, &prof);
556
- }
592
+ case HTP_OP_SUM_ROWS:
593
+ return op_sum_rows(octx);
557
594
 
558
- static void proc_matmul_id_req(struct htp_context * ctx,
559
- struct htp_general_req * req,
560
- struct dspqueue_buffer * bufs,
561
- size_t n_bufs) {
562
- struct dspqueue_buffer rsp_bufs[1];
563
-
564
- // We had written to the output buffer, we'd also need to flush it
565
- rsp_bufs[0].fd = bufs[3].fd;
566
- rsp_bufs[0].ptr = bufs[3].ptr;
567
- rsp_bufs[0].size = bufs[3].size;
568
- rsp_bufs[0].offset = bufs[3].offset;
569
- rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP
570
- DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU
571
-
572
- // Setup Op context
573
- struct htp_ops_context octx = { 0 };
574
- octx.ctx = ctx;
575
- octx.src0 = req->src0;
576
- octx.src1 = req->src1;
577
- octx.src2 = req->src2;
578
- octx.dst = req->dst;
579
- octx.flags = req->flags;
580
- octx.op = req->op;
581
-
582
- // Update data pointers
583
- octx.src0.data = (uint32_t) bufs[0].ptr;
584
- octx.src1.data = (uint32_t) bufs[1].ptr;
585
- octx.src2.data = (uint32_t) bufs[2].ptr;
586
- octx.dst.data = (uint32_t) bufs[3].ptr;
587
- octx.n_threads = ctx->n_threads;
588
-
589
- struct profile_data prof;
590
- profile_start(&prof);
591
-
592
- uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR;
593
- if (vtcm_acquire(ctx) == AEE_SUCCESS) {
594
- rsp_status = op_matmul_id(&octx);
595
- vtcm_release(ctx);
596
- }
595
+ case HTP_OP_CPY:
596
+ return op_cpy(octx);
597
597
 
598
- profile_stop(&prof);
599
- send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 1, &prof);
600
- }
598
+ case HTP_OP_REPEAT:
599
+ return op_repeat(octx);
601
600
 
602
- static void proc_binary_req(struct htp_context * ctx, struct htp_general_req * req, struct dspqueue_buffer * bufs) {
603
- struct dspqueue_buffer rsp_bufs[1];
604
-
605
- // We had written to the output buffer, we'd also need to flush it
606
- rsp_bufs[0].fd = bufs[2].fd;
607
- rsp_bufs[0].ptr = bufs[2].ptr;
608
- rsp_bufs[0].offset = bufs[2].offset;
609
- rsp_bufs[0].size = bufs[2].size;
610
- rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP
611
- DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU
612
-
613
- // Setup Op context
614
- struct htp_ops_context octx = { 0 };
615
- octx.ctx = ctx;
616
- octx.src0 = req->src0;
617
- octx.src1 = req->src1;
618
- octx.dst = req->dst;
619
- octx.flags = req->flags;
620
- octx.op = req->op;
621
-
622
- // Update data pointers
623
- octx.src0.data = (uint32_t) bufs[0].ptr;
624
- octx.src1.data = (uint32_t) bufs[1].ptr;
625
- octx.dst.data = (uint32_t) bufs[2].ptr;
626
- octx.n_threads = ctx->n_threads;
627
-
628
- struct profile_data prof;
629
- profile_start(&prof);
630
-
631
- uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR;
632
- if (vtcm_acquire(ctx) == AEE_SUCCESS) {
633
- rsp_status = op_binary(&octx);
634
- vtcm_release(ctx);
635
- }
601
+ case HTP_OP_ARGSORT:
602
+ return op_argsort(octx);
636
603
 
637
- profile_stop(&prof);
638
- send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 1, &prof);
639
- }
604
+ case HTP_OP_SSM_CONV:
605
+ return op_ssm_conv(octx);
640
606
 
641
- static void proc_add_id_req(struct htp_context * ctx, struct htp_general_req * req, struct dspqueue_buffer * bufs) {
642
- struct dspqueue_buffer rsp_bufs[1];
643
-
644
- // We had written to the output buffer, we'd also need to flush it
645
- rsp_bufs[0].fd = bufs[3].fd;
646
- rsp_bufs[0].ptr = bufs[3].ptr;
647
- rsp_bufs[0].offset = bufs[3].offset;
648
- rsp_bufs[0].size = bufs[3].size;
649
- rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP
650
- DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU
651
-
652
- // Setup Op context
653
- struct htp_ops_context octx = { 0 };
654
- octx.ctx = ctx;
655
- octx.src0 = req->src0;
656
- octx.src1 = req->src1;
657
- octx.src2 = req->src2;
658
- octx.dst = req->dst;
659
- octx.flags = req->flags;
660
- octx.op = req->op;
661
-
662
- // Update data pointers
663
- octx.src0.data = (uint32_t) bufs[0].ptr;
664
- octx.src1.data = (uint32_t) bufs[1].ptr;
665
- octx.src2.data = (uint32_t) bufs[2].ptr;
666
- octx.dst.data = (uint32_t) bufs[3].ptr;
667
- octx.n_threads = ctx->n_threads;
668
-
669
- struct profile_data prof;
670
- profile_start(&prof);
671
-
672
- uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR;
673
- if (vtcm_acquire(ctx) == AEE_SUCCESS) {
674
- rsp_status = op_binary(&octx);
675
- vtcm_release(ctx);
676
- }
607
+ case HTP_OP_CUMSUM:
608
+ return op_cumsum(octx);
677
609
 
678
- profile_stop(&prof);
679
- send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 1, &prof);
680
- }
610
+ case HTP_OP_FILL:
611
+ return op_fill(octx);
681
612
 
682
- static void proc_unary_req(struct htp_context * ctx, struct htp_general_req * req, struct dspqueue_buffer * bufs) {
683
- struct dspqueue_buffer rsp_bufs[HTP_MAX_PACKET_BUFFERS];
684
-
685
- // We had written to the output buffer, we'd also need to flush it
686
- rsp_bufs[0].fd = bufs[1].fd;
687
- rsp_bufs[0].ptr = bufs[1].ptr;
688
- rsp_bufs[0].offset = bufs[1].offset;
689
- rsp_bufs[0].size = bufs[1].size;
690
- rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP
691
- DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU
692
-
693
- // Setup Op context
694
- struct htp_ops_context octx = { 0 };
695
- octx.ctx = ctx;
696
- octx.src0 = req->src0;
697
- octx.dst = req->dst;
698
- octx.flags = req->flags;
699
- octx.op = req->op;
700
-
701
- memcpy(octx.op_params, req->op_params, sizeof(octx.op_params));
702
-
703
- // Update data pointers
704
- octx.src0.data = (uint32_t) bufs[0].ptr;
705
- octx.dst.data = (uint32_t) bufs[1].ptr;
706
- octx.n_threads = ctx->n_threads;
707
-
708
- struct profile_data prof;
709
- profile_start(&prof);
710
-
711
- uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR;
712
- if (vtcm_acquire(ctx) == AEE_SUCCESS) {
713
- rsp_status = op_unary(&octx);
714
- vtcm_release(ctx);
715
- }
613
+ case HTP_OP_DIAG:
614
+ return op_diag(octx);
716
615
 
717
- profile_stop(&prof);
718
- send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 1, &prof);
719
- }
616
+ case HTP_OP_SOLVE_TRI:
617
+ return op_solve_tri(octx);
618
+
619
+ case HTP_OP_PAD:
620
+ return op_pad(octx);
621
+
622
+ case HTP_OP_CONCAT:
623
+ return op_concat(octx);
720
624
 
721
- static void proc_sum_rows_req(struct htp_context * ctx, struct htp_general_req * req, struct dspqueue_buffer * bufs) {
722
- struct dspqueue_buffer rsp_bufs[HTP_MAX_PACKET_BUFFERS];
723
-
724
- // We had written to the output buffer, we'd also need to flush it
725
- rsp_bufs[0].fd = bufs[1].fd;
726
- rsp_bufs[0].ptr = bufs[1].ptr;
727
- rsp_bufs[0].offset = bufs[1].offset;
728
- rsp_bufs[0].size = bufs[1].size;
729
- rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP
730
- DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU
731
-
732
- // Setup Op context
733
- struct htp_ops_context octx = { 0 };
734
- octx.ctx = ctx;
735
- octx.src0 = req->src0;
736
- octx.dst = req->dst;
737
- octx.flags = req->flags;
738
- octx.op = req->op;
739
-
740
- memcpy(octx.op_params, req->op_params, sizeof(octx.op_params));
741
-
742
- // Update data pointers
743
- octx.src0.data = (uint32_t) bufs[0].ptr;
744
- octx.dst.data = (uint32_t) bufs[1].ptr;
745
- octx.n_threads = ctx->n_threads;
746
-
747
- struct profile_data prof;
748
- profile_start(&prof);
749
-
750
- uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR;
751
- if (vtcm_acquire(ctx) == AEE_SUCCESS) {
752
- rsp_status = op_sum_rows(&octx);
753
- vtcm_release(ctx);
625
+ case HTP_OP_GATED_DELTA_NET:
626
+ return op_gated_delta_net(octx);
627
+
628
+ case HTP_OP_TRI:
629
+ return op_tri(octx);
630
+
631
+ case HTP_OP_INVALID:
632
+ break;
633
+
634
+ // No default to catch missing cases
754
635
  }
755
636
 
756
- profile_stop(&prof);
757
- send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 1, &prof);
637
+ FARF(ERROR, "Unknown Op %u", octx->op);
638
+ return -1;
758
639
  }
759
640
 
760
- static void proc_ssm_conv_req(struct htp_context * ctx, struct htp_general_req * req, struct dspqueue_buffer * bufs) {
761
- struct dspqueue_buffer rsp_bufs[HTP_MAX_PACKET_BUFFERS];
762
-
763
- // We've written to the output buffer, we'd also need to flush it
764
- rsp_bufs[0].fd = bufs[2].fd;
765
- rsp_bufs[0].ptr = bufs[2].ptr;
766
- rsp_bufs[0].offset = bufs[2].offset;
767
- rsp_bufs[0].size = bufs[2].size;
768
- rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP
769
- DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU
770
-
771
- // Setup OP context
772
- struct htp_ops_context octx = { 0 };
773
- octx.ctx = ctx;
774
- octx.src0 = req->src0;
775
- octx.src1 = req->src1;
776
- octx.dst = req->dst;
777
- octx.flags = req->flags;
778
- octx.op = req->op;
779
-
780
- memcpy(octx.op_params, req->op_params, sizeof(octx.op_params));
781
-
782
- // Update data pointers
783
- octx.src0.data = (uint32_t) bufs[0].ptr;
784
- octx.src1.data = (uint32_t) bufs[1].ptr;
785
- octx.dst.data = (uint32_t) bufs[2].ptr;
786
- octx.n_threads = ctx->n_threads;
787
-
788
- struct profile_data prof;
789
- profile_start(&prof);
790
-
791
- uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR;
792
- if (vtcm_acquire(ctx) == AEE_SUCCESS) {
793
- rsp_status = op_ssm_conv(&octx);
794
- vtcm_release(ctx);
641
+ static inline bool reuse_buf(struct htp_context *ctx, uint32_t *m_reuse, struct htp_buf_desc *b) {
642
+ b->base = NULL;
643
+
644
+ for (uint32_t i=0; i<HTP_MAX_MMAPS; i++) {
645
+ struct htp_mmap *m = ctx->mmap + i;
646
+ if (m->size && m->fd == b->fd) {
647
+ b->base = m->base;
648
+ *m_reuse |= (1 << i);
649
+ return true;
650
+ }
795
651
  }
796
652
 
797
- profile_stop(&prof);
798
- send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 1, &prof);
653
+ return false;
799
654
  }
800
655
 
801
- static void proc_activations_req(struct htp_context * ctx,
802
- struct htp_general_req * req,
803
- struct dspqueue_buffer * bufs,
804
- uint32_t n_bufs) {
805
- struct dspqueue_buffer rsp_bufs[HTP_MAX_PACKET_BUFFERS];
806
-
807
- int write_idx = (n_bufs == 3) ? 2 : 1;
808
-
809
- // We had written to the output buffer, we'd also need to flush it
810
- rsp_bufs[0].fd = bufs[write_idx].fd;
811
- rsp_bufs[0].ptr = bufs[write_idx].ptr;
812
- rsp_bufs[0].offset = bufs[write_idx].offset;
813
- rsp_bufs[0].size = bufs[write_idx].size;
814
- rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP
815
- DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU
816
-
817
- // Setup Op context
818
- struct htp_ops_context octx = { 0 };
819
- octx.ctx = ctx;
820
- octx.src0 = req->src0;
821
- if (3 == n_bufs) {
822
- octx.src1 = req->src1;
823
- }
824
- octx.dst = req->dst;
825
- octx.flags = req->flags;
826
- octx.op = req->op;
827
-
828
- memcpy(octx.op_params, req->op_params, sizeof(octx.op_params));
829
-
830
- // Update data pointers
831
- octx.src0.data = (uint32_t) bufs[0].ptr;
832
- if (3 == n_bufs) {
833
- octx.src1.data = (uint32_t) bufs[1].ptr;
834
- octx.dst.data = (uint32_t) bufs[2].ptr;
835
- } else {
836
- octx.dst.data = (uint32_t) bufs[1].ptr;
656
+ static inline void drop_mmap(struct htp_context *ctx, struct htp_mmap *m) {
657
+ if (m->size) {
658
+ FARF(HIGH, "unmap : fd %u base %p size %u", m->fd, (void*) m->base, (uint32_t) m->size);
659
+ #if __HVX_ARCH__ > 73
660
+ HAP_munmap2((void *) m->base, m->size);
661
+ #else
662
+ HAP_munmap((void *) m->base, m->size);
663
+ #endif
664
+ m->size = 0;
665
+ m->base = 0;
666
+ m->fd = -1;
837
667
  }
838
- octx.n_threads = ctx->n_threads;
839
-
840
- struct profile_data prof;
841
- profile_start(&prof);
668
+ }
842
669
 
843
- uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR;
844
- if (vtcm_acquire(ctx) == AEE_SUCCESS) {
845
- if (octx.op == HTP_OP_SOFTMAX) {
846
- rsp_status = op_softmax(&octx);
847
- } else {
848
- rsp_status = op_activations(&octx);
670
+ static inline void mmap_buf(struct htp_context *ctx, struct htp_buf_desc *b) {
671
+ if (b->base) return; // already mapped
672
+
673
+ // find unused mapping
674
+ for (uint32_t i=0; i < HTP_MAX_MMAPS; i++) {
675
+ struct htp_mmap *m = &ctx->mmap[i];
676
+ if (!m->size) {
677
+ #if __HVX_ARCH__ > 73
678
+ void *va = HAP_mmap2(NULL, b->size, HAP_PROT_READ | HAP_PROT_WRITE, 0, b->fd, 0);
679
+ #else
680
+ if (b->size > HTP_MMAP_MAX_VMEM) { // HAP_mmap has a size limit of 2GB
681
+ FARF(ERROR, "mmap failed : size %u exceeds 2GB limit for HAP_mmap", (uint32_t) b->size);
682
+ abort(); // can't do much else at this point
683
+ }
684
+
685
+ void *va = HAP_mmap(NULL, b->size, HAP_PROT_READ | HAP_PROT_WRITE, 0, b->fd, 0);
686
+ #endif
687
+ if (va == (void*)-1) {
688
+ FARF(ERROR, "mmap failed : va %p fd %u size %u", va, b->fd, (uint32_t) b->size);
689
+ abort(); // can't do much else at this point
690
+ }
691
+
692
+ m->base = b->base = (uint64_t) va;
693
+ m->fd = b->fd;
694
+ m->size = b->size;
695
+
696
+ FARF(HIGH, "mmap : fd %u base %p size %u", m->fd, (void*) m->base, (uint32_t) m->size);
697
+ return;
849
698
  }
850
- vtcm_release(ctx);
851
699
  }
852
-
853
- profile_stop(&prof);
854
- send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 1, &prof);
855
700
  }
856
701
 
857
- static void proc_rope_req(struct htp_context * ctx,
858
- struct htp_general_req * req,
859
- struct dspqueue_buffer * bufs,
860
- uint32_t n_bufs) {
861
- struct dspqueue_buffer rsp_bufs[HTP_MAX_PACKET_BUFFERS];
862
-
863
- int write_idx = n_bufs - 1;
864
-
865
- // We had written to the output buffer, we'd also need to flush it
866
- rsp_bufs[0].fd = bufs[write_idx].fd;
867
- rsp_bufs[0].ptr = bufs[write_idx].ptr;
868
- rsp_bufs[0].offset = bufs[write_idx].offset;
869
- rsp_bufs[0].size = bufs[write_idx].size;
870
- rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP
871
- DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU
872
-
873
- // Setup Op context
874
- struct htp_ops_context octx = { 0 };
875
- octx.ctx = ctx;
876
- octx.src0 = req->src0;
877
- octx.src1 = req->src1;
878
- if (4 == n_bufs) {
879
- octx.src2 = req->src2;
880
- }
881
- octx.dst = req->dst;
882
- octx.flags = req->flags;
883
- octx.op = req->op;
884
-
885
- memcpy(octx.op_params, req->op_params, sizeof(octx.op_params));
886
-
887
- // Update data pointers
888
- octx.src0.data = (uint32_t) bufs[0].ptr;
889
- octx.src1.data = (uint32_t) bufs[1].ptr;
890
- if (4 == n_bufs) {
891
- octx.src2.data = (uint32_t) bufs[2].ptr;
892
- octx.dst.data = (uint32_t) bufs[3].ptr;
893
- } else {
894
- octx.dst.data = (uint32_t) bufs[2].ptr;
702
+ static void prep_op_bufs(struct htp_context *ctx, struct htp_buf_desc *bufs, uint32_t n_bufs) {
703
+ uint32_t m_reuse = 0; // mmap reuse mask (index from ctx->mmap array)
704
+ uint32_t b_reuse = 0; // buf reuse count
705
+
706
+ uint64_t m_vmem = 0; // mapped vmem
707
+ uint64_t e_vmem = 0; // extra vmem
708
+
709
+ // See what we can reuse
710
+ for (uint32_t i=0; i < n_bufs; i++) {
711
+ struct htp_buf_desc *b = bufs + i;
712
+ if (reuse_buf(ctx, &m_reuse, b)) { b_reuse++; } else { e_vmem += b->size; }
713
+ FARF(HIGH, "prep-buf #%u : pass0 fd %u base %p size %u flags 0x%x", i, b->fd, (void*) b->base, (uint32_t) b->size, b->flags);
895
714
  }
896
- octx.n_threads = ctx->n_threads;
897
715
 
898
- struct profile_data prof;
899
- profile_start(&prof);
716
+ if (b_reuse == n_bufs) return; // all bufs reuse existing mappings
900
717
 
901
- uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR;
902
- if (vtcm_acquire(ctx) == AEE_SUCCESS) {
903
- rsp_status = op_rope(&octx);
904
- vtcm_release(ctx);
718
+ // See how much vmem we have mmaped right now
719
+ for (uint32_t i=0; i<HTP_MAX_MMAPS; i++) { m_vmem += ctx->mmap[i].size; }
720
+
721
+ FARF(HIGH, "prep-bufs : pass1 mmap-vmem %zu extra-vmem %zu max-vmem %zu : n-bufs %u b-reuse %u",
722
+ (size_t) m_vmem, (size_t) e_vmem, (size_t) ctx->max_vmem, n_bufs, b_reuse);
723
+
724
+ if ((m_vmem + e_vmem) > ctx->max_vmem) {
725
+ // Drop unused mappings
726
+ for (uint32_t i=0; i < HTP_MAX_MMAPS; i++) {
727
+ bool used = m_reuse & (1<<i);
728
+ if (!used) { drop_mmap(ctx, ctx->mmap + i); }
729
+ }
905
730
  }
906
731
 
907
- profile_stop(&prof);
908
- send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 1, &prof);
732
+ // Create missing mappings
733
+ for (uint32_t i=0; i < n_bufs; i++) {
734
+ struct htp_buf_desc *b = bufs + i;
735
+ mmap_buf(ctx, b);
736
+ FARF(HIGH, "prep-buf #%u : pass1 fd %u base %p size %u flags 0x%x", i, b->fd, (void*) b->base, (uint32_t) b->size, b->flags);
737
+ }
909
738
  }
910
739
 
911
- static void proc_set_rows_req(struct htp_context * ctx, struct htp_general_req * req, struct dspqueue_buffer * bufs) {
912
- struct dspqueue_buffer rsp_bufs[1];
913
-
914
- // We had written to the output buffer, we'd also need to flush it
915
- rsp_bufs[0].fd = bufs[2].fd;
916
- rsp_bufs[0].ptr = bufs[2].ptr;
917
- rsp_bufs[0].offset = bufs[2].offset;
918
- rsp_bufs[0].size = bufs[2].size;
919
- rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP
920
- DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU
921
-
922
- // Setup Op context
923
- struct htp_ops_context octx = { 0 };
924
- octx.ctx = ctx;
925
- octx.src0 = req->src0;
926
- octx.src1 = req->src1;
927
- octx.dst = req->dst;
928
- octx.flags = req->flags;
929
- octx.op = req->op;
930
-
931
- // Update data pointers
932
- octx.src0.data = (uint32_t) bufs[0].ptr;
933
- octx.src1.data = (uint32_t) bufs[1].ptr;
934
- octx.dst.data = (uint32_t) bufs[2].ptr;
935
- octx.n_threads = ctx->n_threads;
936
-
937
- struct profile_data prof;
938
- profile_start(&prof);
939
-
940
- uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR;
941
- if (vtcm_acquire(ctx) == AEE_SUCCESS) {
942
- rsp_status = op_set_rows(&octx);
943
- vtcm_release(ctx);
944
- }
740
+ static void prep_tensor(struct htp_context *ctx, struct htp_buf_desc *bufs, uint32_t idx, struct htp_tensor *t) {
741
+ uint32_t offset = t->data;
742
+ uint32_t size = t->size;
743
+ uint32_t bi = t->bi;
744
+
745
+ t->data = bufs[bi].base + offset; // update data to the actual pointer
945
746
 
946
- profile_stop(&prof);
947
- send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 1, &prof);
747
+ FARF(HIGH, "prep-tensor #%u: bi %u offset %u size %u data %p : %u:%u:%u:%u", idx, t->bi, offset, t->size, (void*) t->data,
748
+ t->ne[0], t->ne[1], t->ne[3], t->ne[3]);
948
749
  }
949
750
 
950
- static void proc_flash_attn_ext_req(struct htp_context * ctx,
951
- struct htp_general_req * req,
952
- struct dspqueue_buffer * bufs,
953
- uint32_t n_bufs) {
954
- // Setup Op context
955
- struct htp_ops_context octx;
956
- memset(&octx, 0, sizeof(octx));
957
-
958
- octx.ctx = ctx;
959
- octx.n_threads = ctx->n_threads;
960
-
961
- octx.src0 = req->src0;
962
- octx.src1 = req->src1;
963
- octx.src2 = req->src2;
964
- octx.src3 = req->src3;
965
- octx.src4 = req->src4;
966
- octx.dst = req->dst;
967
- octx.flags = req->flags;
968
- octx.op = req->op;
969
-
970
- memcpy(octx.op_params, req->op_params, sizeof(octx.op_params));
971
-
972
- // Update data pointers
973
- octx.src0.data = (uint32_t) bufs[0].ptr;
974
- octx.src1.data = (uint32_t) bufs[1].ptr;
975
- octx.src2.data = (uint32_t) bufs[2].ptr;
976
-
977
- int last_buf = 3;
978
-
979
- if (octx.src3.ne[0]) {
980
- octx.src3.data = (uint32_t) bufs[last_buf++].ptr; // mask is valid
751
+ static void prep_tensors(struct htp_context *ctx, struct htp_buf_desc *bufs, struct htp_tensor *tens, uint32_t n_tens) {
752
+ for (uint32_t i=0; i < n_tens; i++) {
753
+ prep_tensor(ctx, bufs, i, tens + i);
981
754
  }
755
+ }
982
756
 
983
- if (octx.src4.ne[0]) {
984
- octx.src4.data = (uint32_t) bufs[last_buf++].ptr; // sinks is valid
985
- }
757
+ static void proc_op_req(struct htp_ops_context * octx, struct htp_tensor *tens, uint32_t idx, struct htp_op_desc * op) {
758
+ memcpy(octx->op_params, op->params, sizeof(octx->op_params));
759
+ octx->flags = op->flags;
760
+ octx->op = op->opcode;
986
761
 
987
- octx.dst.data = (uint32_t) bufs[last_buf].ptr;
762
+ FARF(HIGH, "proc-op #%u: opcode %u flags 0x%x", idx, octx->op, octx->flags);
988
763
 
989
- struct profile_data prof;
990
- profile_start(&prof);
764
+ // Prep input tensors
765
+ for (uint32_t i=0; i<HTP_OP_MAX_INPUTS; i++) {
766
+ struct htp_tensor *src = op->src[i] == 0xffff ? NULL : tens + op->src[i];
991
767
 
992
- uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR;
993
- if (vtcm_acquire(ctx) == AEE_SUCCESS) {
994
- rsp_status = op_flash_attn_ext(&octx);
995
- vtcm_release(ctx);
768
+ octx->src[i] = src;
769
+ if (!src) continue;
770
+
771
+ if (!(src->flags & HTP_TENSOR_FLUSHED) && (src->flags & HTP_TENSOR_COMPUTE)) {
772
+ // flush compute buffers on input
773
+ hex_l2flush((void *) src->data, src->size);
774
+ }
775
+
776
+ FARF(HIGH, "prep-src #%u: data %p size %u : %u:%u:%u:%u", op->src[i], (void*) src->data, src->size,
777
+ src->ne[0], src->ne[1], src->ne[3], src->ne[3]);
996
778
  }
997
779
 
998
- profile_stop(&prof);
780
+ // Prep output tensor
781
+ struct htp_tensor *dst = tens + op->dst;
782
+
783
+ octx->dst = dst;
784
+
785
+ FARF(HIGH, "prep-dst #%u: data %p size %u : %u:%u:%u:%u", op->dst, (void*) dst->data, dst->size,
786
+ dst->ne[0], dst->ne[1], dst->ne[3], dst->ne[3]);
787
+
788
+ (void) execute_op(octx);
999
789
 
1000
- struct dspqueue_buffer rsp_buf = bufs[last_buf];
1001
- rsp_buf.flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP
1002
- DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU
790
+ // flush buffers on output
791
+ hex_l2flush((void *) dst->data, dst->size);
792
+ dst->flags |= HTP_TENSOR_FLUSHED;
1003
793
 
1004
- send_htp_rsp(ctx, req->op, rsp_status, &bufs[last_buf], 1, &prof);
794
+ FARF(HIGH, "post-dst #%u: data %p size %u : %u:%u:%u:%u", op->dst, (void*) dst->data, dst->size,
795
+ dst->ne[0], dst->ne[1], dst->ne[3], dst->ne[3]);
1005
796
  }
1006
797
 
798
+ #define DSPQUEUE_POLL_TIMEOUT_USEC 100
799
+ #define DSPQUEUE_POLL_COUNT 100
800
+
1007
801
  static void htp_packet_callback(dspqueue_t queue, int error, void * context) {
1008
802
  struct htp_context * ctx = (struct htp_context *) context;
1009
803
 
1010
- // Repeatedly read packets from the queue until it's empty. We don't
1011
- // necessarily get a separate callback for each packet, and new packets
1012
- // may arrive while we're processing the previous one. This ensures we
1013
- // keep the DSP busy as much as possible and avoid waiting for the CPU.
804
+ int err;
805
+
806
+ uint32_t poll_count = DSPQUEUE_POLL_COUNT;
1014
807
 
1015
- while (1) {
1016
- struct htp_general_req req;
1017
- uint32_t req_size;
808
+ vtcm_acquire(ctx);
1018
809
 
1019
- struct dspqueue_buffer bufs[HTP_MAX_PACKET_BUFFERS];
1020
- uint32_t n_bufs;
1021
- uint32_t flags;
810
+ while (!ctx->vtcm_needs_release) {
811
+ struct htp_opbatch_req req;
812
+ uint32_t r_size = sizeof(req);
1022
813
 
1023
- // Read packet from queue
1024
- int err = dspqueue_read_noblock(queue, &flags,
1025
- HTP_MAX_PACKET_BUFFERS, // Maximum number of buffer references
1026
- &n_bufs, // Number of buffer references
1027
- bufs, // Buffer references
1028
- sizeof(req), // Max message length
1029
- &req_size, // Message length
1030
- (uint8_t *) &req); // Message
814
+ struct dspqueue_buffer dbuf;
815
+ uint32_t n_dbufs = 1;
816
+ uint32_t flags = 0;
1031
817
 
818
+ err = dspqueue_read_noblock(queue, &flags, n_dbufs, &n_dbufs, &dbuf, r_size, &r_size, (uint8_t *) &req);
1032
819
  if (err == AEE_EWOULDBLOCK) {
1033
- // Consumed all packets available for now
1034
- return;
820
+ if (--poll_count) {
821
+ qurt_sleep(DSPQUEUE_POLL_TIMEOUT_USEC);
822
+ continue;
823
+ }
824
+ break;
1035
825
  }
1036
826
 
1037
827
  if (err != 0) {
1038
828
  FARF(ERROR, "dspqueue_read_noblock failed: 0x%08x", (unsigned) err);
1039
- return;
829
+ break;
1040
830
  }
1041
831
 
1042
- if (req_size != sizeof(req)) {
1043
- FARF(ERROR, "Invalid request size");
832
+ if (r_size < sizeof(req) || n_dbufs != 1) {
833
+ FARF(ERROR, "invalid request : size %u n-dbufs %u", r_size, n_dbufs);
1044
834
  continue;
1045
835
  }
1046
836
 
1047
- if (req.flags & HTP_OPFLAGS_EARLY_WAKEUP) {
1048
- // Host wants early notification
1049
- dspqueue_write_early_wakeup_noblock(ctx->queue, 10, 0);
1050
- }
837
+ // Reset poll count for valid requests
838
+ poll_count = DSPQUEUE_POLL_COUNT;
1051
839
 
1052
- // Process packet based on its message type
1053
- switch (req.op) {
1054
- case HTP_OP_MUL_MAT:
1055
- if (n_bufs != 3) {
1056
- FARF(ERROR, "Bad matmul-req buffer list");
1057
- continue;
1058
- }
1059
- proc_matmul_req(ctx, &req, bufs, n_bufs);
1060
- break;
840
+ const uint32_t n_bufs = req.n_bufs;
841
+ const uint32_t n_tens = req.n_tensors;
842
+ const uint32_t n_ops = req.n_ops;
1061
843
 
1062
- case HTP_OP_MUL_MAT_ID:
1063
- if (n_bufs != 4) {
1064
- FARF(ERROR, "Bad matmul-id-req buffer list");
1065
- continue;
1066
- }
1067
- proc_matmul_id_req(ctx, &req, bufs, n_bufs);
1068
- break;
1069
-
1070
- case HTP_OP_MUL:
1071
- case HTP_OP_ADD:
1072
- case HTP_OP_SUB:
1073
- case HTP_OP_DIV:
1074
- if (n_bufs != 3) {
1075
- FARF(ERROR, "Bad binary-req buffer list");
1076
- continue;
1077
- }
1078
- proc_binary_req(ctx, &req, bufs);
1079
- break;
1080
-
1081
- case HTP_OP_RMS_NORM:
1082
- case HTP_OP_SCALE:
1083
- if (n_bufs != 2) {
1084
- FARF(ERROR, "Bad unary-req buffer list");
1085
- continue;
1086
- }
844
+ const uint32_t b_size = sizeof(struct htp_buf_desc) * n_bufs;
845
+ const uint32_t t_size = sizeof(struct htp_tensor) * n_tens;
846
+ const uint32_t o_size = sizeof(struct htp_op_desc) * n_ops;
847
+ const uint32_t p_size = sizeof(struct htp_prof_desc) * n_ops;
1087
848
 
1088
- proc_unary_req(ctx, &req, bufs);
1089
- break;
849
+ if (dbuf.size < b_size + t_size + o_size + p_size) {
850
+ FARF(ERROR, "invalid opbatch memory block size %u", dbuf.size);
851
+ break;
852
+ }
1090
853
 
1091
- case HTP_OP_SQR:
1092
- case HTP_OP_SQRT:
1093
- if (n_bufs != 2) {
1094
- FARF(ERROR, "Bad unary-req buffer list");
1095
- continue;
1096
- }
854
+ FARF(HIGH, "processing opbatch #%u: n-bufs %u n-tensors %u n-ops %u : m-size %u b-size %u t-size %u o-size %u", req.id,
855
+ n_bufs, n_tens, n_ops, dbuf.size, b_size, t_size, o_size);
1097
856
 
1098
- proc_unary_req(ctx, &req, bufs);
1099
- break;
857
+ // Setup descriptor pointers
858
+ uint8_t * m_ptr = dbuf.ptr;
859
+ struct htp_buf_desc* bufs = (struct htp_buf_desc*) m_ptr; m_ptr += b_size;
860
+ struct htp_tensor* tens = (struct htp_tensor*) m_ptr; m_ptr += t_size;
861
+ struct htp_op_desc* ops = (struct htp_op_desc*) m_ptr; m_ptr += o_size;
862
+ struct htp_prof_desc* pds = (struct htp_prof_desc*) m_ptr;
1100
863
 
1101
- case HTP_OP_SUM_ROWS:
1102
- if (n_bufs != 2) {
1103
- FARF(ERROR, "Bad unary-req buffer list");
1104
- continue;
1105
- }
864
+ prep_op_bufs(ctx, bufs, n_bufs);
865
+ prep_tensors(ctx, bufs, tens, n_tens);
1106
866
 
1107
- proc_sum_rows_req(ctx, &req, bufs);
1108
- break;
867
+ struct htp_ops_context *octx = &ctx->octx;
868
+ memset(octx, 0, sizeof(*octx));
869
+ octx->n_threads = ctx->n_threads;
870
+ octx->ctx = ctx;
1109
871
 
1110
- case HTP_OP_UNARY_SILU:
1111
- case HTP_OP_UNARY_GELU:
1112
- if (n_bufs != 2) {
1113
- FARF(ERROR, "Bad act-req buffer list");
1114
- continue;
1115
- }
1116
- proc_activations_req(ctx, &req, bufs, n_bufs);
1117
- break;
1118
-
1119
- case HTP_OP_GLU_SWIGLU:
1120
- case HTP_OP_GLU_SWIGLU_OAI:
1121
- case HTP_OP_SOFTMAX:
1122
- case HTP_OP_GLU_GEGLU:
1123
- if ((n_bufs != 2) && (n_bufs != 3)) {
1124
- FARF(ERROR, "Bad act-req buffer list");
1125
- continue;
1126
- }
1127
- proc_activations_req(ctx, &req, bufs, n_bufs);
1128
- break;
872
+ for (uint32_t i=0; i < n_ops; i++) {
873
+ struct profile_data prof;
1129
874
 
1130
- case HTP_OP_ADD_ID:
1131
- if (n_bufs != 4) {
1132
- FARF(ERROR, "Bad add-id-req buffer list");
1133
- continue;
1134
- }
1135
- proc_add_id_req(ctx, &req, bufs);
1136
- break;
875
+ if (i == (n_ops-1)) {
876
+ // wake up the host before starting the last op
877
+ dspqueue_write_early_wakeup_noblock(queue, 0, 0);
878
+ }
1137
879
 
1138
- case HTP_OP_ROPE:
1139
- if ((n_bufs != 3) && (n_bufs != 4)) {
1140
- FARF(ERROR, "Bad rope-req buffer list");
1141
- continue;
1142
- }
1143
- proc_rope_req(ctx, &req, bufs, n_bufs);
1144
- break;
880
+ profile_start(ctx->profiler, &prof);
1145
881
 
1146
- case HTP_OP_FLASH_ATTN_EXT:
1147
- if (!(n_bufs >= 4 && n_bufs <= 6)) {
1148
- FARF(ERROR, "Bad flash-attn-ext-req buffer list");
1149
- continue;
1150
- }
1151
- proc_flash_attn_ext_req(ctx, &req, bufs, n_bufs);
1152
- break;
882
+ proc_op_req(octx, tens, i, &ops[i]);
1153
883
 
1154
- case HTP_OP_SET_ROWS:
1155
- if (n_bufs != 3) {
1156
- FARF(ERROR, "Bad set-rows-req buffer list");
1157
- continue;
1158
- }
1159
- proc_set_rows_req(ctx, &req, bufs);
1160
- break;
1161
-
1162
- case HTP_OP_GET_ROWS:
1163
- if (n_bufs != 3) {
1164
- FARF(ERROR, "Bad get-rows-req buffer list");
1165
- continue;
1166
- }
1167
- proc_get_rows_req(ctx, &req, bufs);
1168
- break;
884
+ profile_stop(ctx->profiler, &prof);
1169
885
 
1170
- case HTP_OP_CPY:
1171
- if (n_bufs != 2) {
1172
- FARF(ERROR, "Bad cpy-req buffer list");
1173
- continue;
886
+ if (ctx->profiler) {
887
+ pds[i].opcode = ops[i].opcode;
888
+ pds[i].usecs = prof.usecs;
889
+ pds[i].cycles = prof.cycles;
890
+ for (int j = 0; j < HEX_NUM_PMU_COUNTERS; j++) {
891
+ pds[i].pmu[j] = prof.pmu_counters[j];
1174
892
  }
1175
- proc_cpy_req(ctx, &req, bufs);
1176
- break;
893
+ }
894
+ }
1177
895
 
1178
- case HTP_OP_ARGSORT:
1179
- if (n_bufs != 2) {
1180
- FARF(ERROR, "Bad argsort-req buffer list");
1181
- continue;
1182
- }
1183
- proc_argsort_req(ctx, &req, bufs);
1184
- break;
896
+ struct htp_opbatch_rsp rsp;
897
+ rsp.id = req.id;
898
+ rsp.status = HTP_STATUS_OK;
899
+ rsp.n_bufs = n_bufs;
900
+ rsp.n_tensors = n_tens;
901
+ rsp.n_ops = n_ops;
1185
902
 
1186
- case HTP_OP_SSM_CONV:
1187
- if (n_bufs != 3) {
1188
- FARF(ERROR, "Bad ssm-conv-req buffer list");
1189
- continue;
1190
- }
1191
- proc_ssm_conv_req(ctx, &req, bufs);
1192
- break;
903
+ dbuf.flags = DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT;
1193
904
 
1194
- default:
1195
- FARF(ERROR, "Unknown Op %u", req.op);
1196
- break;
905
+ err = dspqueue_write(queue, 0, 1, &dbuf, sizeof(rsp), (const uint8_t *) &rsp, DSPQUEUE_TIMEOUT_NONE);
906
+ if (err != 0) {
907
+ FARF(ERROR, "dspqueue_write failed: 0x%08x", (unsigned) err);
908
+ break;
1197
909
  }
1198
910
  }
911
+
912
+ vtcm_release(ctx);
1199
913
  }