whispercpp 1.3.5 → 1.3.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (1017)
  1. checksums.yaml +4 -4
  2. data/.document +3 -0
  3. data/.rdoc_options +2 -0
  4. data/LICENSE +1 -1
  5. data/README.md +133 -3
  6. data/Rakefile +18 -3
  7. data/ext/dependencies.rb +10 -4
  8. data/ext/dependencies_for_windows.rb +17 -0
  9. data/ext/extconf.rb +20 -7
  10. data/ext/options.rb +54 -14
  11. data/ext/options_for_windows.rb +51 -0
  12. data/ext/ruby_whisper.c +56 -46
  13. data/ext/ruby_whisper.h +165 -2
  14. data/ext/ruby_whisper_context.c +297 -126
  15. data/ext/ruby_whisper_context_params.c +163 -0
  16. data/ext/ruby_whisper_log_queue.c +180 -0
  17. data/ext/ruby_whisper_log_settable.h +47 -0
  18. data/ext/ruby_whisper_model.c +0 -1
  19. data/ext/ruby_whisper_parakeet.c +49 -0
  20. data/ext/ruby_whisper_parakeet_context.c +304 -0
  21. data/ext/ruby_whisper_parakeet_context_params.c +117 -0
  22. data/ext/ruby_whisper_parakeet_model.c +84 -0
  23. data/ext/ruby_whisper_parakeet_params.c +548 -0
  24. data/ext/ruby_whisper_parakeet_segment.c +157 -0
  25. data/ext/ruby_whisper_parakeet_token.c +188 -0
  26. data/ext/ruby_whisper_parakeet_transcribe.cpp +58 -0
  27. data/ext/ruby_whisper_params.c +256 -66
  28. data/ext/ruby_whisper_segment.c +6 -7
  29. data/ext/ruby_whisper_token.c +29 -9
  30. data/ext/ruby_whisper_transcribe.cpp +46 -16
  31. data/ext/ruby_whisper_vad_context.c +48 -1
  32. data/ext/ruby_whisper_vad_context_detect.cpp +6 -5
  33. data/ext/ruby_whisper_vad_params.c +0 -1
  34. data/ext/ruby_whisper_vad_segment.c +0 -1
  35. data/ext/ruby_whisper_vad_segments.c +0 -1
  36. data/ext/sources/CMakeLists.txt +41 -3
  37. data/ext/sources/CMakePresets.json +95 -0
  38. data/ext/sources/cmake/parakeet-config.cmake.in +30 -0
  39. data/ext/sources/cmake/parakeet.pc.in +10 -0
  40. data/ext/sources/cmake/whisper-config.cmake.in +5 -40
  41. data/ext/sources/cmake/whisper.pc.in +1 -1
  42. data/ext/sources/examples/CMakeLists.txt +4 -2
  43. data/ext/sources/examples/bench/bench.cpp +24 -19
  44. data/ext/sources/examples/cli/cli.cpp +51 -9
  45. data/ext/sources/examples/common-ggml.cpp +4 -0
  46. data/ext/sources/examples/common-whisper.cpp +139 -67
  47. data/ext/sources/examples/common-whisper.h +11 -0
  48. data/ext/sources/examples/ffmpeg-transcode.cpp +211 -341
  49. data/ext/sources/examples/miniaudio.h +4507 -2131
  50. data/ext/sources/examples/parakeet-cli/CMakeLists.txt +8 -0
  51. data/ext/sources/examples/parakeet-cli/parakeet-cli.cpp +243 -0
  52. data/ext/sources/examples/parakeet-quantize/CMakeLists.txt +7 -0
  53. data/ext/sources/examples/parakeet-quantize/parakeet-quantize.cpp +230 -0
  54. data/ext/sources/examples/server/server.cpp +213 -163
  55. data/ext/sources/ggml/CMakeLists.txt +29 -15
  56. data/ext/sources/ggml/cmake/FindNCCL.cmake +36 -0
  57. data/ext/sources/ggml/cmake/ggml-config.cmake.in +12 -2
  58. data/ext/sources/ggml/include/ggml-alloc.h +1 -0
  59. data/ext/sources/ggml/include/ggml-backend.h +73 -11
  60. data/ext/sources/ggml/include/ggml-cann.h +1 -1
  61. data/ext/sources/ggml/include/ggml-cpu.h +5 -0
  62. data/ext/sources/ggml/include/ggml-cuda.h +3 -0
  63. data/ext/sources/ggml/include/ggml-openvino.h +37 -0
  64. data/ext/sources/ggml/include/ggml-opt.h +1 -1
  65. data/ext/sources/ggml/include/ggml-rpc.h +8 -3
  66. data/ext/sources/ggml/include/ggml-virtgpu.h +14 -0
  67. data/ext/sources/ggml/include/ggml.h +155 -16
  68. data/ext/sources/ggml/include/gguf.h +10 -2
  69. data/ext/sources/ggml/src/CMakeLists.txt +25 -5
  70. data/ext/sources/ggml/src/ggml-alloc.c +9 -10
  71. data/ext/sources/ggml/src/ggml-backend-dl.cpp +48 -0
  72. data/ext/sources/ggml/src/ggml-backend-dl.h +45 -0
  73. data/ext/sources/ggml/src/ggml-backend-impl.h +22 -2
  74. data/ext/sources/ggml/src/ggml-backend-meta.cpp +2263 -0
  75. data/ext/sources/ggml/src/ggml-backend-reg.cpp +40 -86
  76. data/ext/sources/ggml/src/ggml-backend.cpp +114 -10
  77. data/ext/sources/ggml/src/ggml-blas/CMakeLists.txt +1 -1
  78. data/ext/sources/ggml/src/ggml-blas/ggml-blas.cpp +10 -2
  79. data/ext/sources/ggml/src/ggml-cann/acl_tensor.cpp +1 -1
  80. data/ext/sources/ggml/src/ggml-cann/acl_tensor.h +1 -1
  81. data/ext/sources/ggml/src/ggml-cann/aclnn_ops.cpp +1016 -442
  82. data/ext/sources/ggml/src/ggml-cann/aclnn_ops.h +111 -85
  83. data/ext/sources/ggml/src/ggml-cann/common.h +23 -14
  84. data/ext/sources/ggml/src/ggml-cann/ggml-cann.cpp +255 -92
  85. data/ext/sources/ggml/src/ggml-common.h +22 -0
  86. data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +68 -34
  87. data/ext/sources/ggml/src/ggml-cpu/amx/amx.cpp +44 -19
  88. data/ext/sources/ggml/src/ggml-cpu/amx/common.h +34 -10
  89. data/ext/sources/ggml/src/ggml-cpu/amx/mmq.cpp +101 -101
  90. data/ext/sources/ggml/src/ggml-cpu/arch/arm/quants.c +194 -1
  91. data/ext/sources/ggml/src/ggml-cpu/arch/arm/repack.cpp +2874 -613
  92. data/ext/sources/ggml/src/ggml-cpu/arch/loongarch/quants.c +151 -1
  93. data/ext/sources/ggml/src/ggml-cpu/arch/powerpc/quants.c +0 -1
  94. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/quants.c +5480 -840
  95. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/repack.cpp +1361 -0
  96. data/ext/sources/ggml/src/ggml-cpu/arch/s390/quants.c +8 -11
  97. data/ext/sources/ggml/src/ggml-cpu/arch/wasm/quants.c +72 -1
  98. data/ext/sources/ggml/src/ggml-cpu/arch/x86/quants.c +186 -36
  99. data/ext/sources/ggml/src/ggml-cpu/arch/x86/repack.cpp +119 -19
  100. data/ext/sources/ggml/src/ggml-cpu/arch-fallback.h +112 -26
  101. data/ext/sources/ggml/src/ggml-cpu/binary-ops.cpp +2 -6
  102. data/ext/sources/ggml/src/ggml-cpu/cmake/FindSMTIME.cmake +32 -0
  103. data/ext/sources/ggml/src/ggml-cpu/common.h +8 -0
  104. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-impl.h +13 -0
  105. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +153 -16
  106. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.cpp +17 -0
  107. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.cpp +21 -20
  108. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +976 -251
  109. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +671 -266
  110. data/ext/sources/ggml/src/ggml-cpu/ops.cpp +1277 -263
  111. data/ext/sources/ggml/src/ggml-cpu/ops.h +4 -0
  112. data/ext/sources/ggml/src/ggml-cpu/quants.c +95 -0
  113. data/ext/sources/ggml/src/ggml-cpu/quants.h +6 -0
  114. data/ext/sources/ggml/src/ggml-cpu/repack.cpp +2893 -679
  115. data/ext/sources/ggml/src/ggml-cpu/repack.h +119 -8
  116. data/ext/sources/ggml/src/ggml-cpu/simd-gemm.h +226 -0
  117. data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +114 -19
  118. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.cpp +1402 -687
  119. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.h +8 -0
  120. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp +597 -2766
  121. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime2_kernels.cpp +5768 -0
  122. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime_env.cpp +320 -0
  123. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime_env.h +55 -0
  124. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime_kernels.h +182 -19
  125. data/ext/sources/ggml/src/ggml-cpu/spacemit/repack.cpp +1795 -0
  126. data/ext/sources/ggml/src/ggml-cpu/spacemit/repack.h +14 -0
  127. data/ext/sources/ggml/src/ggml-cpu/spacemit/rvv_kernels.cpp +3178 -0
  128. data/ext/sources/ggml/src/ggml-cpu/spacemit/rvv_kernels.h +95 -0
  129. data/ext/sources/ggml/src/ggml-cpu/spacemit/spine_barrier.h +34 -0
  130. data/ext/sources/ggml/src/ggml-cpu/spacemit/spine_mem_pool.cpp +760 -0
  131. data/ext/sources/ggml/src/ggml-cpu/spacemit/spine_mem_pool.h +32 -0
  132. data/ext/sources/ggml/src/ggml-cpu/spacemit/spine_tcm.h +409 -0
  133. data/ext/sources/ggml/src/ggml-cpu/unary-ops.cpp +1 -1
  134. data/ext/sources/ggml/src/ggml-cpu/vec.cpp +54 -53
  135. data/ext/sources/ggml/src/ggml-cpu/vec.h +225 -240
  136. data/ext/sources/ggml/src/ggml-cuda/CMakeLists.txt +18 -8
  137. data/ext/sources/ggml/src/ggml-cuda/allreduce.cu +971 -0
  138. data/ext/sources/ggml/src/ggml-cuda/allreduce.cuh +29 -0
  139. data/ext/sources/ggml/src/ggml-cuda/argsort.cu +73 -28
  140. data/ext/sources/ggml/src/ggml-cuda/binbcast.cu +69 -41
  141. data/ext/sources/ggml/src/ggml-cuda/binbcast.cuh +1 -0
  142. data/ext/sources/ggml/src/ggml-cuda/common.cuh +359 -29
  143. data/ext/sources/ggml/src/ggml-cuda/concat.cu +120 -114
  144. data/ext/sources/ggml/src/ggml-cuda/conv2d-transpose.cu +45 -21
  145. data/ext/sources/ggml/src/ggml-cuda/conv2d-transpose.cuh +1 -0
  146. data/ext/sources/ggml/src/ggml-cuda/convert.cu +94 -27
  147. data/ext/sources/ggml/src/ggml-cuda/convert.cuh +10 -0
  148. data/ext/sources/ggml/src/ggml-cuda/cpy.cu +20 -9
  149. data/ext/sources/ggml/src/ggml-cuda/dequantize.cuh +22 -0
  150. data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +333 -85
  151. data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +632 -190
  152. data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cu +12 -0
  153. data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cuh +162 -49
  154. data/ext/sources/ggml/src/ggml-cuda/fattn-vec.cuh +43 -18
  155. data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +44 -14
  156. data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cuh +1 -1
  157. data/ext/sources/ggml/src/ggml-cuda/fattn.cu +241 -23
  158. data/ext/sources/ggml/src/ggml-cuda/fattn.cuh +2 -0
  159. data/ext/sources/ggml/src/ggml-cuda/fwht.cu +101 -0
  160. data/ext/sources/ggml/src/ggml-cuda/fwht.cuh +4 -0
  161. data/ext/sources/ggml/src/ggml-cuda/gated_delta_net.cu +312 -0
  162. data/ext/sources/ggml/src/ggml-cuda/gated_delta_net.cuh +4 -0
  163. data/ext/sources/ggml/src/ggml-cuda/getrows.cu +34 -12
  164. data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +1454 -599
  165. data/ext/sources/ggml/src/ggml-cuda/im2col.cu +32 -29
  166. data/ext/sources/ggml/src/ggml-cuda/mean.cu +13 -10
  167. data/ext/sources/ggml/src/ggml-cuda/mma.cuh +397 -183
  168. data/ext/sources/ggml/src/ggml-cuda/mmf.cu +30 -10
  169. data/ext/sources/ggml/src/ggml-cuda/mmf.cuh +161 -88
  170. data/ext/sources/ggml/src/ggml-cuda/mmq.cu +18 -12
  171. data/ext/sources/ggml/src/ggml-cuda/mmq.cuh +522 -431
  172. data/ext/sources/ggml/src/ggml-cuda/mmvf.cu +139 -72
  173. data/ext/sources/ggml/src/ggml-cuda/mmvf.cuh +2 -0
  174. data/ext/sources/ggml/src/ggml-cuda/mmvq.cu +608 -88
  175. data/ext/sources/ggml/src/ggml-cuda/mmvq.cuh +6 -0
  176. data/ext/sources/ggml/src/ggml-cuda/norm.cu +47 -79
  177. data/ext/sources/ggml/src/ggml-cuda/out-prod.cu +23 -7
  178. data/ext/sources/ggml/src/ggml-cuda/pad.cu +13 -10
  179. data/ext/sources/ggml/src/ggml-cuda/quantize.cu +134 -27
  180. data/ext/sources/ggml/src/ggml-cuda/quantize.cuh +1 -1
  181. data/ext/sources/ggml/src/ggml-cuda/reduce_rows.cuh +7 -17
  182. data/ext/sources/ggml/src/ggml-cuda/rope.cu +244 -137
  183. data/ext/sources/ggml/src/ggml-cuda/scale.cu +4 -1
  184. data/ext/sources/ggml/src/ggml-cuda/set-rows.cu +14 -6
  185. data/ext/sources/ggml/src/ggml-cuda/snake.cu +72 -0
  186. data/ext/sources/ggml/src/ggml-cuda/snake.cuh +8 -0
  187. data/ext/sources/ggml/src/ggml-cuda/softcap.cu +4 -1
  188. data/ext/sources/ggml/src/ggml-cuda/softmax.cu +8 -83
  189. data/ext/sources/ggml/src/ggml-cuda/solve_tri.cu +1 -1
  190. data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cu +96 -40
  191. data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cuh +1 -1
  192. data/ext/sources/ggml/src/ggml-cuda/ssm-scan.cu +40 -18
  193. data/ext/sources/ggml/src/ggml-cuda/sumrows.cu +8 -4
  194. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_16.cu +1 -0
  195. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_32.cu +6 -0
  196. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_8.cu +2 -0
  197. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_4.cu +2 -0
  198. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_16.cu +1 -0
  199. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_32.cu +6 -0
  200. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_4.cu +2 -0
  201. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_8.cu +2 -0
  202. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_16.cu +1 -0
  203. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_4.cu +2 -0
  204. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_8.cu +2 -0
  205. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_4.cu +2 -0
  206. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_8.cu +2 -0
  207. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq192-dv128.cu +5 -0
  208. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq320-dv256.cu +5 -0
  209. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq512-dv512.cu +5 -0
  210. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-bf16.cu +7 -0
  211. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-f16.cu +7 -0
  212. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q4_0.cu +7 -0
  213. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q4_1.cu +7 -0
  214. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q5_0.cu +7 -0
  215. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q5_1.cu +7 -0
  216. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q8_0.cu +7 -0
  217. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-bf16.cu +7 -0
  218. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-bf16.cu +7 -0
  219. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-bf16.cu +7 -0
  220. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-bf16.cu +7 -0
  221. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-bf16.cu +7 -0
  222. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-bf16.cu +7 -0
  223. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-nvfp4.cu +5 -0
  224. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q1_0.cu +5 -0
  225. data/ext/sources/ggml/src/ggml-cuda/top-k.cu +5 -5
  226. data/ext/sources/ggml/src/ggml-cuda/topk-moe.cu +202 -135
  227. data/ext/sources/ggml/src/ggml-cuda/topk-moe.cuh +20 -14
  228. data/ext/sources/ggml/src/ggml-cuda/unary.cu +86 -2
  229. data/ext/sources/ggml/src/ggml-cuda/unary.cuh +4 -0
  230. data/ext/sources/ggml/src/ggml-cuda/vecdotq.cuh +111 -17
  231. data/ext/sources/ggml/src/ggml-cuda/vendors/cuda.h +7 -2
  232. data/ext/sources/ggml/src/ggml-cuda/vendors/hip.h +30 -2
  233. data/ext/sources/ggml/src/ggml-cuda/vendors/musa.h +3 -0
  234. data/ext/sources/ggml/src/ggml-hexagon/CMakeLists.txt +84 -46
  235. data/ext/sources/ggml/src/ggml-hexagon/ggml-hexagon.cpp +1612 -753
  236. data/ext/sources/ggml/src/ggml-hexagon/htp/CMakeLists.txt +51 -11
  237. data/ext/sources/ggml/src/ggml-hexagon/htp/act-ops.c +361 -261
  238. data/ext/sources/ggml/src/ggml-hexagon/htp/argsort-ops.c +294 -0
  239. data/ext/sources/ggml/src/ggml-hexagon/htp/binary-ops.c +753 -241
  240. data/ext/sources/ggml/src/ggml-hexagon/htp/cmake-toolchain.cmake +5 -5
  241. data/ext/sources/ggml/src/ggml-hexagon/htp/concat-ops.c +277 -0
  242. data/ext/sources/ggml/src/ggml-hexagon/htp/cpy-ops.c +295 -0
  243. data/ext/sources/ggml/src/ggml-hexagon/htp/cumsum-ops.c +270 -0
  244. data/ext/sources/ggml/src/ggml-hexagon/htp/diag-ops.c +216 -0
  245. data/ext/sources/ggml/src/ggml-hexagon/htp/fill-ops.c +123 -0
  246. data/ext/sources/ggml/src/ggml-hexagon/htp/flash-attn-ops.c +471 -296
  247. data/ext/sources/ggml/src/ggml-hexagon/htp/gated-delta-net-ops.c +1148 -0
  248. data/ext/sources/ggml/src/ggml-hexagon/htp/get-rows-ops.c +159 -53
  249. data/ext/sources/ggml/src/ggml-hexagon/htp/{htp-dma.c → hex-dma.c} +3 -3
  250. data/ext/sources/ggml/src/ggml-hexagon/htp/hex-dma.h +372 -0
  251. data/ext/sources/ggml/src/ggml-hexagon/htp/hex-dump.h +86 -0
  252. data/ext/sources/ggml/src/ggml-hexagon/htp/hex-fastdiv.h +37 -0
  253. data/ext/sources/ggml/src/ggml-hexagon/htp/hex-utils.h +137 -0
  254. data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-flash-attn-ops.c +1878 -0
  255. data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c +2066 -0
  256. data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-ops.c +6 -0
  257. data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-ops.h +88 -0
  258. data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-profile.h +34 -0
  259. data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-queue.c +158 -0
  260. data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-queue.h +134 -0
  261. data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-utils.h +200 -0
  262. data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ctx.h +97 -14
  263. data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ops.h +163 -67
  264. data/ext/sources/ggml/src/ggml-hexagon/htp/htp_iface.idl +9 -3
  265. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-arith.h +443 -0
  266. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-base.h +308 -0
  267. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-copy.h +262 -0
  268. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-div.h +291 -0
  269. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-dump.h +129 -0
  270. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-exp.h +216 -0
  271. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-flash-attn.h +47 -0
  272. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-floor.h +100 -0
  273. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-inverse.h +210 -0
  274. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-log.h +65 -0
  275. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-pow.h +42 -0
  276. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-reduce.h +296 -0
  277. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-repl.h +74 -0
  278. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-scale.h +133 -0
  279. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sigmoid.h +142 -0
  280. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sin-cos.h +90 -0
  281. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sqrt.h +126 -0
  282. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-types.h +36 -0
  283. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-utils.h +18 -1348
  284. data/ext/sources/ggml/src/ggml-hexagon/htp/main.c +547 -635
  285. data/ext/sources/ggml/src/ggml-hexagon/htp/matmul-ops.c +3556 -1101
  286. data/ext/sources/ggml/src/ggml-hexagon/htp/pad-ops.c +547 -0
  287. data/ext/sources/ggml/src/ggml-hexagon/htp/repeat-ops.c +148 -0
  288. data/ext/sources/ggml/src/ggml-hexagon/htp/rope-ops.c +475 -269
  289. data/ext/sources/ggml/src/ggml-hexagon/htp/set-rows-ops.c +94 -72
  290. data/ext/sources/ggml/src/ggml-hexagon/htp/softmax-ops.c +222 -217
  291. data/ext/sources/ggml/src/ggml-hexagon/htp/solve-tri-ops.c +267 -0
  292. data/ext/sources/ggml/src/ggml-hexagon/htp/ssm-conv.c +432 -0
  293. data/ext/sources/ggml/src/ggml-hexagon/htp/sum-rows-ops.c +128 -0
  294. data/ext/sources/ggml/src/ggml-hexagon/htp/unary-ops.c +886 -117
  295. data/ext/sources/ggml/src/ggml-hexagon/htp/vtcm-utils.h +16 -0
  296. data/ext/sources/ggml/src/ggml-hexagon/htp/worker-pool.c +1 -5
  297. data/ext/sources/ggml/src/ggml-hexagon/htp-drv.cpp +418 -0
  298. data/ext/sources/ggml/src/ggml-hexagon/htp-drv.h +121 -0
  299. data/ext/sources/ggml/src/ggml-hexagon/htp-opnode.h +272 -0
  300. data/ext/sources/ggml/src/ggml-hexagon/libdl.h +79 -0
  301. data/ext/sources/ggml/src/ggml-hexagon/libggml-htp.inf +40 -0
  302. data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +28 -9
  303. data/ext/sources/ggml/src/ggml-impl.h +68 -1
  304. data/ext/sources/ggml/src/ggml-metal/CMakeLists.txt +10 -10
  305. data/ext/sources/ggml/src/ggml-metal/ggml-metal-common.cpp +13 -2
  306. data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.h +8 -0
  307. data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.m +147 -17
  308. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.cpp +409 -83
  309. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.h +54 -5
  310. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.m +254 -52
  311. data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +254 -23
  312. data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.cpp +756 -285
  313. data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.h +7 -4
  314. data/ext/sources/ggml/src/ggml-metal/ggml-metal.cpp +359 -133
  315. data/ext/sources/ggml/src/ggml-metal/ggml-metal.metal +1867 -1123
  316. data/ext/sources/ggml/src/ggml-musa/CMakeLists.txt +5 -6
  317. data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +71 -4
  318. data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +14127 -5314
  319. data/ext/sources/ggml/src/ggml-opencl/kernels/concat.cl +97 -88
  320. data/ext/sources/ggml/src/ggml-opencl/kernels/cpy.cl +104 -0
  321. data/ext/sources/ggml/src/ggml-opencl/kernels/cumsum.cl +139 -0
  322. data/ext/sources/ggml/src/ggml-opencl/kernels/cvt.cl +1978 -67
  323. data/ext/sources/ggml/src/ggml-opencl/kernels/diag.cl +27 -0
  324. data/ext/sources/ggml/src/ggml-opencl/kernels/exp.cl +125 -0
  325. data/ext/sources/ggml/src/ggml-opencl/kernels/expm1.cl +87 -56
  326. data/ext/sources/ggml/src/ggml-opencl/kernels/gated_delta_net.cl +249 -0
  327. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_mxfp4_f32_ns.cl +306 -0
  328. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q4_0_f32_ns.cl +256 -0
  329. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q4_1_f32_ns.cl +258 -0
  330. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q4_k_f32_ns.cl +283 -0
  331. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q5_0_f32_ns.cl +260 -0
  332. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q5_1_f32_ns.cl +262 -0
  333. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q5_k_f32_ns.cl +288 -0
  334. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q6_k_f32_ns.cl +267 -0
  335. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_iq4_nl_f32.cl +150 -0
  336. data/ext/sources/ggml/src/ggml-opencl/kernels/{mul_mat_Ab_Bi_8x4.cl → gemm_noshuffle_q4_0_f32.cl} +1 -1
  337. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q4_1_f32.cl +132 -0
  338. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q4_k_f32.cl +172 -0
  339. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q5_0_f32.cl +131 -0
  340. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q5_1_f32.cl +134 -0
  341. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q5_k_f32.cl +176 -0
  342. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q6_k_f32.cl +140 -0
  343. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q8_0_f32.cl +129 -0
  344. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_xmem_f16_f32_os8.cl +233 -0
  345. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_mxfp4_f32_ns.cl +165 -0
  346. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q4_0_f32_ns.cl +120 -0
  347. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q4_1_f32_ns.cl +123 -0
  348. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q4_k_f32_ns.cl +155 -0
  349. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q5_0_f32_ns.cl +123 -0
  350. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q5_1_f32_ns.cl +125 -0
  351. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q5_k_f32_ns.cl +160 -0
  352. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q6_k_f32_ns.cl +141 -0
  353. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_iq4_nl_f32.cl +302 -0
  354. data/ext/sources/ggml/src/ggml-opencl/kernels/{gemv_noshuffle_general.cl → gemv_noshuffle_q4_0_f32.cl} +5 -5
  355. data/ext/sources/ggml/src/ggml-opencl/kernels/{gemv_noshuffle.cl → gemv_noshuffle_q4_0_f32_spec.cl} +5 -5
  356. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q4_1_f32.cl +283 -0
  357. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q4_k_f32.cl +318 -0
  358. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q5_0_f32.cl +291 -0
  359. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q5_1_f32.cl +294 -0
  360. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q5_k_f32.cl +326 -0
  361. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q6_k_f32.cl +293 -0
  362. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q8_0_f32.cl +195 -0
  363. data/ext/sources/ggml/src/ggml-opencl/kernels/get_rows.cl +15 -9
  364. data/ext/sources/ggml/src/ggml-opencl/kernels/l2_norm.cl +71 -0
  365. data/ext/sources/ggml/src/ggml-opencl/kernels/mean.cl +114 -13
  366. data/ext/sources/ggml/src/ggml-opencl/kernels/moe_reorder_b.cl +30 -0
  367. data/ext/sources/ggml/src/ggml-opencl/kernels/moe_sort_by_expert.cl +82 -0
  368. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_iq4_nl_f32_l4_lm.cl +171 -0
  369. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q4_0_f32_l4_lm.cl +163 -0
  370. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q4_1_f32_l4_lm.cl +165 -0
  371. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q4_k_f32_l4_lm.cl +179 -0
  372. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q5_0_f32_l4_lm.cl +173 -0
  373. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q5_1_f32_l4_lm.cl +175 -0
  374. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q5_k_f32_l4_lm.cl +192 -0
  375. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q6_k_f32_l4_lm.cl +158 -0
  376. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_iq4_nl_f32.cl +164 -0
  377. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_iq4_nl_f32_flat.cl +202 -0
  378. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32.cl +219 -0
  379. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32_flat.cl +229 -0
  380. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_k_f32.cl +180 -0
  381. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_k_f32_flat.cl +196 -0
  382. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_0_f32.cl +241 -0
  383. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_0_f32_flat.cl +243 -0
  384. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_1_f32.cl +243 -0
  385. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_1_f32_flat.cl +247 -0
  386. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_k_f32.cl +187 -0
  387. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_k_f32_flat.cl +203 -0
  388. data/ext/sources/ggml/src/ggml-opencl/kernels/{mul_mv_q6_k.cl → mul_mv_q6_k_f32.cl} +4 -0
  389. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32_flat.cl +178 -0
  390. data/ext/sources/ggml/src/ggml-opencl/kernels/neg.cl +125 -0
  391. data/ext/sources/ggml/src/ggml-opencl/kernels/repeat.cl +31 -32
  392. data/ext/sources/ggml/src/ggml-opencl/kernels/scale.cl +14 -4
  393. data/ext/sources/ggml/src/ggml-opencl/kernels/softplus.cl +88 -60
  394. data/ext/sources/ggml/src/ggml-opencl/kernels/solve_tri.cl +51 -0
  395. data/ext/sources/ggml/src/ggml-opencl/kernels/sum_rows.cl +114 -13
  396. data/ext/sources/ggml/src/ggml-opencl/kernels/tanh.cl +94 -48
  397. data/ext/sources/ggml/src/ggml-opencl/kernels/transpose.cl +26 -0
  398. data/ext/sources/ggml/src/ggml-opencl/kernels/tri.cl +32 -0
  399. data/ext/sources/ggml/src/ggml-openvino/.clang-format +154 -0
  400. data/ext/sources/ggml/src/ggml-openvino/CMakeLists.txt +22 -0
  401. data/ext/sources/ggml/src/ggml-openvino/ggml-decoder.cpp +985 -0
  402. data/ext/sources/ggml/src/ggml-openvino/ggml-decoder.h +294 -0
  403. data/ext/sources/ggml/src/ggml-openvino/ggml-openvino-extra.cpp +380 -0
  404. data/ext/sources/ggml/src/ggml-openvino/ggml-openvino-extra.h +182 -0
  405. data/ext/sources/ggml/src/ggml-openvino/ggml-openvino.cpp +1132 -0
  406. data/ext/sources/ggml/src/ggml-openvino/ggml-quants.cpp +956 -0
  407. data/ext/sources/ggml/src/ggml-openvino/ggml-quants.h +153 -0
  408. data/ext/sources/ggml/src/ggml-openvino/openvino/decoder.h +74 -0
  409. data/ext/sources/ggml/src/ggml-openvino/openvino/frontend.cpp +27 -0
  410. data/ext/sources/ggml/src/ggml-openvino/openvino/frontend.h +23 -0
  411. data/ext/sources/ggml/src/ggml-openvino/openvino/input_model.cpp +17 -0
  412. data/ext/sources/ggml/src/ggml-openvino/openvino/input_model.h +29 -0
  413. data/ext/sources/ggml/src/ggml-openvino/openvino/node_context.h +112 -0
  414. data/ext/sources/ggml/src/ggml-openvino/openvino/op/cont.cpp +48 -0
  415. data/ext/sources/ggml/src/ggml-openvino/openvino/op/cpy.cpp +21 -0
  416. data/ext/sources/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp +90 -0
  417. data/ext/sources/ggml/src/ggml-openvino/openvino/op/get_rows.cpp +69 -0
  418. data/ext/sources/ggml/src/ggml-openvino/openvino/op/glu_geglu.cpp +61 -0
  419. data/ext/sources/ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp +62 -0
  420. data/ext/sources/ggml/src/ggml-openvino/openvino/op/mulmat.cpp +90 -0
  421. data/ext/sources/ggml/src/ggml-openvino/openvino/op/permute.cpp +102 -0
  422. data/ext/sources/ggml/src/ggml-openvino/openvino/op/reshape.cpp +83 -0
  423. data/ext/sources/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp +46 -0
  424. data/ext/sources/ggml/src/ggml-openvino/openvino/op/rope.cpp +149 -0
  425. data/ext/sources/ggml/src/ggml-openvino/openvino/op/scale.cpp +41 -0
  426. data/ext/sources/ggml/src/ggml-openvino/openvino/op/set_rows.cpp +76 -0
  427. data/ext/sources/ggml/src/ggml-openvino/openvino/op/softmax.cpp +89 -0
  428. data/ext/sources/ggml/src/ggml-openvino/openvino/op/transpose.cpp +23 -0
  429. data/ext/sources/ggml/src/ggml-openvino/openvino/op/unary_gelu.cpp +25 -0
  430. data/ext/sources/ggml/src/ggml-openvino/openvino/op/unary_silu.cpp +27 -0
  431. data/ext/sources/ggml/src/ggml-openvino/openvino/op/view.cpp +53 -0
  432. data/ext/sources/ggml/src/ggml-openvino/openvino/op_table.cpp +47 -0
  433. data/ext/sources/ggml/src/ggml-openvino/openvino/op_table.h +40 -0
  434. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.cpp +60 -0
  435. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.h +17 -0
  436. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/mark_decompression_convert_constant_folding.h +29 -0
  437. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/squeeze_matmul.cpp +58 -0
  438. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/squeeze_matmul.h +17 -0
  439. data/ext/sources/ggml/src/ggml-openvino/openvino/rt_info/weightless_caching_attributes.hpp +41 -0
  440. data/ext/sources/ggml/src/ggml-openvino/openvino/translate_session.cpp +317 -0
  441. data/ext/sources/ggml/src/ggml-openvino/openvino/translate_session.h +28 -0
  442. data/ext/sources/ggml/src/ggml-openvino/openvino/utils.cpp +257 -0
  443. data/ext/sources/ggml/src/ggml-openvino/openvino/utils.h +86 -0
  444. data/ext/sources/ggml/src/ggml-openvino/utils.cpp +880 -0
  445. data/ext/sources/ggml/src/ggml-openvino/utils.h +143 -0
  446. data/ext/sources/ggml/src/ggml-opt.cpp +1 -0
  447. data/ext/sources/ggml/src/ggml-quants.c +385 -119
  448. data/ext/sources/ggml/src/ggml-quants.h +6 -0
  449. data/ext/sources/ggml/src/ggml-rpc/CMakeLists.txt +24 -0
  450. data/ext/sources/ggml/src/ggml-rpc/ggml-rpc.cpp +167 -311
  451. data/ext/sources/ggml/src/ggml-rpc/transport.cpp +683 -0
  452. data/ext/sources/ggml/src/ggml-rpc/transport.h +34 -0
  453. data/ext/sources/ggml/src/ggml-sycl/CMakeLists.txt +64 -91
  454. data/ext/sources/ggml/src/ggml-sycl/add-id.cpp +5 -1
  455. data/ext/sources/ggml/src/ggml-sycl/backend.hpp +4 -1
  456. data/ext/sources/ggml/src/ggml-sycl/binbcast.cpp +21 -20
  457. data/ext/sources/ggml/src/ggml-sycl/common.cpp +74 -2
  458. data/ext/sources/ggml/src/ggml-sycl/common.hpp +356 -11
  459. data/ext/sources/ggml/src/ggml-sycl/convert.cpp +184 -14
  460. data/ext/sources/ggml/src/ggml-sycl/convert.hpp +31 -1
  461. data/ext/sources/ggml/src/ggml-sycl/count-equal.cpp +1 -1
  462. data/ext/sources/ggml/src/ggml-sycl/cumsum.cpp +148 -0
  463. data/ext/sources/ggml/src/ggml-sycl/cumsum.hpp +5 -0
  464. data/ext/sources/ggml/src/ggml-sycl/dequantize.hpp +663 -0
  465. data/ext/sources/ggml/src/ggml-sycl/diag.cpp +67 -0
  466. data/ext/sources/ggml/src/ggml-sycl/diag.hpp +5 -0
  467. data/ext/sources/ggml/src/ggml-sycl/dmmv.cpp +586 -6
  468. data/ext/sources/ggml/src/ggml-sycl/dpct/helper.hpp +791 -47
  469. data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +77 -156
  470. data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +2 -2
  471. data/ext/sources/ggml/src/ggml-sycl/fattn-buffers.cpp +56 -0
  472. data/ext/sources/ggml/src/ggml-sycl/fattn-buffers.hpp +63 -0
  473. data/ext/sources/ggml/src/ggml-sycl/fattn-common.hpp +1181 -0
  474. data/ext/sources/ggml/src/ggml-sycl/fattn-tile.cpp +59 -0
  475. data/ext/sources/ggml/src/ggml-sycl/fattn-tile.hpp +1246 -0
  476. data/ext/sources/ggml/src/ggml-sycl/fattn-vec.hpp +674 -0
  477. data/ext/sources/ggml/src/ggml-sycl/fattn.cpp +227 -0
  478. data/ext/sources/ggml/src/ggml-sycl/fattn.hpp +22 -0
  479. data/ext/sources/ggml/src/ggml-sycl/fill.cpp +55 -0
  480. data/ext/sources/ggml/src/ggml-sycl/fill.hpp +5 -0
  481. data/ext/sources/ggml/src/ggml-sycl/gated_delta_net.cpp +347 -0
  482. data/ext/sources/ggml/src/ggml-sycl/gated_delta_net.hpp +9 -0
  483. data/ext/sources/ggml/src/ggml-sycl/gemm.hpp +3 -0
  484. data/ext/sources/ggml/src/ggml-sycl/getrows.cpp +79 -3
  485. data/ext/sources/ggml/src/ggml-sycl/ggml-sycl.cpp +1134 -236
  486. data/ext/sources/ggml/src/ggml-sycl/im2col.cpp +353 -89
  487. data/ext/sources/ggml/src/ggml-sycl/im2col.hpp +5 -3
  488. data/ext/sources/ggml/src/ggml-sycl/mmvq.cpp +1344 -26
  489. data/ext/sources/ggml/src/ggml-sycl/mmvq.hpp +16 -0
  490. data/ext/sources/ggml/src/ggml-sycl/norm.cpp +65 -66
  491. data/ext/sources/ggml/src/ggml-sycl/outprod.cpp +3 -3
  492. data/ext/sources/ggml/src/ggml-sycl/pad.cpp +27 -27
  493. data/ext/sources/ggml/src/ggml-sycl/presets.hpp +3 -0
  494. data/ext/sources/ggml/src/ggml-sycl/quants.hpp +72 -1
  495. data/ext/sources/ggml/src/ggml-sycl/rope.cpp +450 -287
  496. data/ext/sources/ggml/src/ggml-sycl/rope.hpp +6 -0
  497. data/ext/sources/ggml/src/ggml-sycl/set_rows.cpp +7 -1
  498. data/ext/sources/ggml/src/ggml-sycl/softmax.cpp +6 -6
  499. data/ext/sources/ggml/src/ggml-sycl/solve_tri.cpp +172 -0
  500. data/ext/sources/ggml/src/ggml-sycl/solve_tri.hpp +8 -0
  501. data/ext/sources/ggml/src/ggml-sycl/ssm_conv.cpp +6 -1
  502. data/ext/sources/ggml/src/ggml-sycl/ssm_scan.cpp +156 -0
  503. data/ext/sources/ggml/src/ggml-sycl/ssm_scan.hpp +5 -0
  504. data/ext/sources/ggml/src/ggml-sycl/sycl_hw.cpp +62 -10
  505. data/ext/sources/ggml/src/ggml-sycl/sycl_hw.hpp +18 -6
  506. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq112-dv112.cpp +5 -0
  507. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq128-dv128.cpp +5 -0
  508. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq256-dv256.cpp +5 -0
  509. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq40-dv40.cpp +5 -0
  510. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq512-dv512.cpp +6 -0
  511. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq576-dv512.cpp +5 -0
  512. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq64-dv64.cpp +5 -0
  513. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq72-dv72.cpp +5 -0
  514. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq80-dv80.cpp +5 -0
  515. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq96-dv96.cpp +5 -0
  516. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-f16.cpp +8 -0
  517. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_0.cpp +8 -0
  518. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_1.cpp +8 -0
  519. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_0.cpp +8 -0
  520. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_1.cpp +8 -0
  521. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q8_0.cpp +8 -0
  522. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-f16.cpp +8 -0
  523. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_0.cpp +8 -0
  524. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_1.cpp +8 -0
  525. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_0.cpp +8 -0
  526. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_1.cpp +8 -0
  527. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q8_0.cpp +8 -0
  528. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-f16.cpp +8 -0
  529. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_0.cpp +8 -0
  530. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_1.cpp +8 -0
  531. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_0.cpp +8 -0
  532. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_1.cpp +8 -0
  533. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q8_0.cpp +8 -0
  534. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-f16.cpp +8 -0
  535. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_0.cpp +8 -0
  536. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_1.cpp +8 -0
  537. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_0.cpp +8 -0
  538. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_1.cpp +8 -0
  539. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q8_0.cpp +8 -0
  540. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-f16.cpp +8 -0
  541. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_0.cpp +8 -0
  542. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_1.cpp +8 -0
  543. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_0.cpp +8 -0
  544. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_1.cpp +8 -0
  545. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q8_0.cpp +8 -0
  546. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-f16.cpp +8 -0
  547. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_0.cpp +8 -0
  548. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_1.cpp +8 -0
  549. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_0.cpp +8 -0
  550. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_1.cpp +8 -0
  551. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q8_0.cpp +8 -0
  552. data/ext/sources/ggml/src/ggml-sycl/type.hpp +112 -0
  553. data/ext/sources/ggml/src/ggml-sycl/upscale.cpp +410 -0
  554. data/ext/sources/ggml/src/ggml-sycl/upscale.hpp +9 -0
  555. data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +228 -53
  556. data/ext/sources/ggml/src/ggml-sycl/wkv.cpp +1 -1
  557. data/ext/sources/ggml/src/ggml-virtgpu/CMakeLists.txt +70 -0
  558. data/ext/sources/ggml/src/ggml-virtgpu/apir_cs_ggml-rpc-front.cpp +87 -0
  559. data/ext/sources/ggml/src/ggml-virtgpu/backend/CMakeLists.txt +21 -0
  560. data/ext/sources/ggml/src/ggml-virtgpu/backend/apir_cs_ggml-rpc-back.cpp +115 -0
  561. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-convert.h +13 -0
  562. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-backend.cpp +102 -0
  563. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer-type.cpp +105 -0
  564. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer.cpp +179 -0
  565. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-device.cpp +148 -0
  566. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched.cpp +51 -0
  567. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched.gen.h +73 -0
  568. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched.h +27 -0
  569. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-virgl-apir.h +32 -0
  570. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend.cpp +144 -0
  571. data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/api_remoting.h +95 -0
  572. data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_backend.gen.h +94 -0
  573. data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_backend.h +50 -0
  574. data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_cs.h +378 -0
  575. data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_cs_ggml.h +232 -0
  576. data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_cs_rpc.h +58 -0
  577. data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-buffer-type.cpp +81 -0
  578. data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-buffer.cpp +123 -0
  579. data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-device.cpp +160 -0
  580. data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-reg.cpp +213 -0
  581. data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend.cpp +71 -0
  582. data/ext/sources/ggml/src/ggml-virtgpu/ggml-remoting.h +71 -0
  583. data/ext/sources/ggml/src/ggml-virtgpu/ggmlremoting_functions.yaml +166 -0
  584. data/ext/sources/ggml/src/ggml-virtgpu/include/apir_hw.h +9 -0
  585. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-apir.h +15 -0
  586. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-backend.cpp +58 -0
  587. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-buffer-type.cpp +110 -0
  588. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-buffer.cpp +173 -0
  589. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-device.cpp +192 -0
  590. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-impl.h +36 -0
  591. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward.gen.h +53 -0
  592. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-shm.cpp +99 -0
  593. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-shm.h +23 -0
  594. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-utils.cpp +179 -0
  595. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-utils.h +86 -0
  596. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu.cpp +545 -0
  597. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu.h +115 -0
  598. data/ext/sources/ggml/src/ggml-vulkan/CMakeLists.txt +12 -1
  599. data/ext/sources/ggml/src/ggml-vulkan/ggml-vulkan.cpp +3250 -940
  600. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -0
  601. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/acc.comp +16 -8
  602. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/contig_copy.comp +6 -2
  603. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +146 -13
  604. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy.comp +3 -1
  605. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +1 -1
  606. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +25 -1
  607. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.glsl +88 -0
  608. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.glsl +643 -1
  609. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_nvfp4.comp +32 -0
  610. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q1_0.comp +29 -0
  611. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/diag.comp +0 -1
  612. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dot_product_funcs.glsl +27 -0
  613. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/elu.comp +27 -0
  614. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/exp.comp +0 -1
  615. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/feature-tests/coopmat2_decode_vector.comp +7 -0
  616. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +533 -180
  617. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.glsl +113 -68
  618. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +412 -222
  619. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +222 -83
  620. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_dequant.glsl +131 -0
  621. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_mask_opt.comp +162 -0
  622. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_mmq_funcs.glsl +203 -0
  623. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +9 -8
  624. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/fwht.comp +115 -0
  625. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gated_delta_net.comp +189 -0
  626. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/generic_binary_head.glsl +0 -1
  627. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.glsl +10 -1
  628. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_main.glsl +16 -6
  629. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +76 -54
  630. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col_3d.comp +0 -1
  631. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/l2_norm.comp +12 -9
  632. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/log.comp +0 -1
  633. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp +122 -27
  634. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.glsl +20 -17
  635. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iface.glsl +6 -6
  636. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp +1 -1
  637. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp +1 -1
  638. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp +1 -1
  639. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp +1 -0
  640. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq_funcs.glsl +88 -55
  641. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +22 -20
  642. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +51 -14
  643. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.glsl +159 -125
  644. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_id_funcs.glsl +3 -1
  645. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +5 -3
  646. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.glsl +8 -8
  647. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_shmem_types.glsl +24 -9
  648. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp +0 -1
  649. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +2 -3
  650. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl +39 -63
  651. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.glsl +0 -1
  652. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +7 -4
  653. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +7 -4
  654. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +7 -4
  655. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl +13 -7
  656. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_vision.comp +7 -4
  657. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sgn.comp +21 -0
  658. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/snake.comp +49 -0
  659. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/ssm_conv.comp +27 -11
  660. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/tri.comp +0 -1
  661. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/types.glsl +79 -2
  662. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +193 -149
  663. data/ext/sources/ggml/src/ggml-webgpu/CMakeLists.txt +5 -2
  664. data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp +3221 -97
  665. data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu.cpp +3493 -1997
  666. data/ext/sources/ggml/src/ggml-webgpu/pre_wgsl.hpp +37 -7
  667. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/add_id.wgsl +64 -0
  668. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/argmax.wgsl +72 -0
  669. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/argsort.wgsl +106 -0
  670. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/argsort_merge.wgsl +134 -0
  671. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary.wgsl +142 -0
  672. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +115 -141
  673. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/concat.wgsl +93 -0
  674. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/conv2d.wgsl +165 -0
  675. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{cpy.tmpl.wgsl → cpy.wgsl} +25 -44
  676. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cumsum.wgsl +66 -0
  677. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn.wgsl +198 -230
  678. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_quant_staging.tmpl +124 -0
  679. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_tile.wgsl +397 -0
  680. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_vec_blk.wgsl +101 -0
  681. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_vec_reduce.wgsl +84 -0
  682. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_vec_split.wgsl +619 -0
  683. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/gated_delta_net.wgsl +149 -0
  684. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{get_rows.tmpl.wgsl → get_rows.wgsl} +234 -335
  685. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/glu.wgsl +155 -0
  686. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/im2col.wgsl +101 -0
  687. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl +871 -42
  688. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_id.wgsl +195 -0
  689. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_id_gather.wgsl +52 -0
  690. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_id_vec.wgsl +154 -0
  691. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.wgsl +149 -0
  692. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{mul_mat_subgroup_matrix.tmpl.wgsl → mul_mat_subgroup_matrix.wgsl} +36 -138
  693. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.wgsl +151 -0
  694. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec_acc.tmpl +1432 -0
  695. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec_q_acc.tmpl +303 -0
  696. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/pad.wgsl +86 -0
  697. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/quant_inner_loops.tmpl +21 -0
  698. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/quantize_q8.wgsl +173 -0
  699. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/repeat.wgsl +67 -0
  700. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm_mul.wgsl +152 -0
  701. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{rope.tmpl.wgsl → rope.wgsl} +71 -142
  702. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/row_norm.wgsl +153 -0
  703. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{scale.tmpl.wgsl → scale.wgsl} +15 -40
  704. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set.wgsl +109 -0
  705. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +39 -12
  706. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows_quant.wgsl +224 -0
  707. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{soft_max.tmpl.wgsl → soft_max.wgsl} +106 -206
  708. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/solve_tri.wgsl +121 -0
  709. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/ssm_conv.wgsl +65 -0
  710. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/ssm_scan.wgsl +193 -0
  711. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/sum_rows.wgsl +55 -0
  712. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/unary.wgsl +213 -0
  713. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/upscale.wgsl +240 -0
  714. data/ext/sources/ggml/src/ggml-zdnn/ggml-zdnn.cpp +24 -15
  715. data/ext/sources/ggml/src/ggml-zendnn/CMakeLists.txt +31 -32
  716. data/ext/sources/ggml/src/ggml-zendnn/ggml-zendnn.cpp +253 -16
  717. data/ext/sources/ggml/src/ggml.c +268 -52
  718. data/ext/sources/ggml/src/gguf.cpp +377 -47
  719. data/ext/sources/include/parakeet.h +342 -0
  720. data/ext/sources/include/whisper.h +10 -0
  721. data/ext/sources/media/matmul.png +0 -0
  722. data/ext/sources/src/CMakeLists.txt +23 -0
  723. data/ext/sources/src/parakeet-arch.h +188 -0
  724. data/ext/sources/src/parakeet.cpp +3838 -0
  725. data/ext/sources/src/whisper.cpp +62 -40
  726. data/extsources.rb +26 -10
  727. data/lib/whisper/log_settable.rb +36 -0
  728. data/lib/whisper/model/uri.rb +13 -1
  729. data/lib/whisper/output.rb +74 -0
  730. data/sig/whisper.rbs +445 -55
  731. data/test/helper.rb +2 -0
  732. data/test/jfk_reader/jfk_reader.c +50 -7
  733. data/test/test_callback.rb +1 -0
  734. data/test/test_context_params.rb +82 -0
  735. data/test/test_package.rb +6 -5
  736. data/test/test_parakeet.rb +28 -0
  737. data/test/test_parakeet_callback.rb +107 -0
  738. data/test/test_parakeet_context.rb +116 -0
  739. data/test/test_parakeet_context_params.rb +24 -0
  740. data/test/test_parakeet_model.rb +21 -0
  741. data/test/test_parakeet_params.rb +78 -0
  742. data/test/test_parakeet_segment.rb +42 -0
  743. data/test/test_parakeet_token.rb +73 -0
  744. data/test/test_params.rb +2 -0
  745. data/test/test_token.rb +11 -0
  746. data/test/test_vad_context.rb +58 -8
  747. data/test/test_vad_segment.rb +1 -1
  748. data/test/test_whisper.rb +44 -6
  749. data/whispercpp.gemspec +2 -2
  750. metadata +426 -280
  751. data/ext/sources/bindings/javascript/CMakeLists.txt +0 -41
  752. data/ext/sources/bindings/javascript/emscripten.cpp +0 -93
  753. data/ext/sources/bindings/javascript/libwhisper.worker.js +0 -1
  754. data/ext/sources/bindings/javascript/package.json +0 -26
  755. data/ext/sources/bindings/javascript/whisper.js +0 -19
  756. data/ext/sources/examples/addon.node/CMakeLists.txt +0 -31
  757. data/ext/sources/examples/addon.node/__test__/whisper.spec.js +0 -133
  758. data/ext/sources/examples/addon.node/addon.cpp +0 -557
  759. data/ext/sources/examples/addon.node/index.js +0 -59
  760. data/ext/sources/examples/addon.node/package.json +0 -16
  761. data/ext/sources/examples/addon.node/vad-example.js +0 -132
  762. data/ext/sources/examples/bench.wasm/CMakeLists.txt +0 -49
  763. data/ext/sources/examples/bench.wasm/emscripten.cpp +0 -87
  764. data/ext/sources/examples/bench.wasm/index-tmpl.html +0 -285
  765. data/ext/sources/examples/coi-serviceworker.js +0 -146
  766. data/ext/sources/examples/command/CMakeLists.txt +0 -10
  767. data/ext/sources/examples/command/command.cpp +0 -802
  768. data/ext/sources/examples/command/commands.txt +0 -9
  769. data/ext/sources/examples/command.wasm/CMakeLists.txt +0 -50
  770. data/ext/sources/examples/command.wasm/emscripten.cpp +0 -327
  771. data/ext/sources/examples/command.wasm/index-tmpl.html +0 -415
  772. data/ext/sources/examples/generate-karaoke.sh +0 -57
  773. data/ext/sources/examples/helpers.js +0 -191
  774. data/ext/sources/examples/livestream.sh +0 -112
  775. data/ext/sources/examples/lsp/CMakeLists.txt +0 -10
  776. data/ext/sources/examples/lsp/lsp.cpp +0 -471
  777. data/ext/sources/examples/lsp/whisper.vim +0 -362
  778. data/ext/sources/examples/python/test_whisper_processor.py +0 -7
  779. data/ext/sources/examples/python/whisper_processor.py +0 -54
  780. data/ext/sources/examples/server/bench.js +0 -29
  781. data/ext/sources/examples/server.py +0 -120
  782. data/ext/sources/examples/stream/CMakeLists.txt +0 -10
  783. data/ext/sources/examples/stream/stream.cpp +0 -437
  784. data/ext/sources/examples/stream.wasm/CMakeLists.txt +0 -49
  785. data/ext/sources/examples/stream.wasm/emscripten.cpp +0 -216
  786. data/ext/sources/examples/stream.wasm/index-tmpl.html +0 -491
  787. data/ext/sources/examples/sycl/CMakeLists.txt +0 -9
  788. data/ext/sources/examples/sycl/build.sh +0 -22
  789. data/ext/sources/examples/sycl/ls-sycl-device.cpp +0 -11
  790. data/ext/sources/examples/sycl/run-whisper.sh +0 -17
  791. data/ext/sources/examples/talk-llama/CMakeLists.txt +0 -47
  792. data/ext/sources/examples/talk-llama/eleven-labs.py +0 -80
  793. data/ext/sources/examples/talk-llama/llama-adapter.cpp +0 -494
  794. data/ext/sources/examples/talk-llama/llama-adapter.h +0 -88
  795. data/ext/sources/examples/talk-llama/llama-arch.cpp +0 -2559
  796. data/ext/sources/examples/talk-llama/llama-arch.h +0 -586
  797. data/ext/sources/examples/talk-llama/llama-batch.cpp +0 -917
  798. data/ext/sources/examples/talk-llama/llama-batch.h +0 -173
  799. data/ext/sources/examples/talk-llama/llama-chat.cpp +0 -876
  800. data/ext/sources/examples/talk-llama/llama-chat.h +0 -70
  801. data/ext/sources/examples/talk-llama/llama-context.cpp +0 -3645
  802. data/ext/sources/examples/talk-llama/llama-context.h +0 -360
  803. data/ext/sources/examples/talk-llama/llama-cparams.cpp +0 -5
  804. data/ext/sources/examples/talk-llama/llama-cparams.h +0 -42
  805. data/ext/sources/examples/talk-llama/llama-grammar.cpp +0 -1464
  806. data/ext/sources/examples/talk-llama/llama-grammar.h +0 -194
  807. data/ext/sources/examples/talk-llama/llama-graph.cpp +0 -2282
  808. data/ext/sources/examples/talk-llama/llama-graph.h +0 -910
  809. data/ext/sources/examples/talk-llama/llama-hparams.cpp +0 -241
  810. data/ext/sources/examples/talk-llama/llama-hparams.h +0 -284
  811. data/ext/sources/examples/talk-llama/llama-impl.cpp +0 -171
  812. data/ext/sources/examples/talk-llama/llama-impl.h +0 -63
  813. data/ext/sources/examples/talk-llama/llama-io.cpp +0 -15
  814. data/ext/sources/examples/talk-llama/llama-io.h +0 -35
  815. data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.cpp +0 -328
  816. data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.h +0 -137
  817. data/ext/sources/examples/talk-llama/llama-kv-cache.cpp +0 -2100
  818. data/ext/sources/examples/talk-llama/llama-kv-cache.h +0 -390
  819. data/ext/sources/examples/talk-llama/llama-kv-cells.h +0 -533
  820. data/ext/sources/examples/talk-llama/llama-memory-hybrid.cpp +0 -268
  821. data/ext/sources/examples/talk-llama/llama-memory-hybrid.h +0 -139
  822. data/ext/sources/examples/talk-llama/llama-memory-recurrent.cpp +0 -1167
  823. data/ext/sources/examples/talk-llama/llama-memory-recurrent.h +0 -182
  824. data/ext/sources/examples/talk-llama/llama-memory.cpp +0 -59
  825. data/ext/sources/examples/talk-llama/llama-memory.h +0 -122
  826. data/ext/sources/examples/talk-llama/llama-mmap.cpp +0 -735
  827. data/ext/sources/examples/talk-llama/llama-mmap.h +0 -73
  828. data/ext/sources/examples/talk-llama/llama-model-loader.cpp +0 -1247
  829. data/ext/sources/examples/talk-llama/llama-model-loader.h +0 -176
  830. data/ext/sources/examples/talk-llama/llama-model-saver.cpp +0 -285
  831. data/ext/sources/examples/talk-llama/llama-model-saver.h +0 -37
  832. data/ext/sources/examples/talk-llama/llama-model.cpp +0 -8338
  833. data/ext/sources/examples/talk-llama/llama-model.h +0 -544
  834. data/ext/sources/examples/talk-llama/llama-quant.cpp +0 -1072
  835. data/ext/sources/examples/talk-llama/llama-quant.h +0 -1
  836. data/ext/sources/examples/talk-llama/llama-sampling.cpp +0 -3771
  837. data/ext/sources/examples/talk-llama/llama-sampling.h +0 -44
  838. data/ext/sources/examples/talk-llama/llama-vocab.cpp +0 -3900
  839. data/ext/sources/examples/talk-llama/llama-vocab.h +0 -182
  840. data/ext/sources/examples/talk-llama/llama.cpp +0 -1140
  841. data/ext/sources/examples/talk-llama/llama.h +0 -1540
  842. data/ext/sources/examples/talk-llama/models/afmoe.cpp +0 -191
  843. data/ext/sources/examples/talk-llama/models/apertus.cpp +0 -125
  844. data/ext/sources/examples/talk-llama/models/arcee.cpp +0 -135
  845. data/ext/sources/examples/talk-llama/models/arctic.cpp +0 -138
  846. data/ext/sources/examples/talk-llama/models/arwkv7.cpp +0 -86
  847. data/ext/sources/examples/talk-llama/models/baichuan.cpp +0 -122
  848. data/ext/sources/examples/talk-llama/models/bailingmoe.cpp +0 -144
  849. data/ext/sources/examples/talk-llama/models/bailingmoe2.cpp +0 -135
  850. data/ext/sources/examples/talk-llama/models/bert.cpp +0 -178
  851. data/ext/sources/examples/talk-llama/models/bitnet.cpp +0 -160
  852. data/ext/sources/examples/talk-llama/models/bloom.cpp +0 -101
  853. data/ext/sources/examples/talk-llama/models/chameleon.cpp +0 -178
  854. data/ext/sources/examples/talk-llama/models/chatglm.cpp +0 -132
  855. data/ext/sources/examples/talk-llama/models/codeshell.cpp +0 -111
  856. data/ext/sources/examples/talk-llama/models/cogvlm.cpp +0 -102
  857. data/ext/sources/examples/talk-llama/models/cohere2-iswa.cpp +0 -134
  858. data/ext/sources/examples/talk-llama/models/command-r.cpp +0 -122
  859. data/ext/sources/examples/talk-llama/models/dbrx.cpp +0 -123
  860. data/ext/sources/examples/talk-llama/models/deci.cpp +0 -135
  861. data/ext/sources/examples/talk-llama/models/deepseek.cpp +0 -144
  862. data/ext/sources/examples/talk-llama/models/deepseek2.cpp +0 -259
  863. data/ext/sources/examples/talk-llama/models/dots1.cpp +0 -134
  864. data/ext/sources/examples/talk-llama/models/dream.cpp +0 -105
  865. data/ext/sources/examples/talk-llama/models/ernie4-5-moe.cpp +0 -150
  866. data/ext/sources/examples/talk-llama/models/ernie4-5.cpp +0 -110
  867. data/ext/sources/examples/talk-llama/models/exaone.cpp +0 -114
  868. data/ext/sources/examples/talk-llama/models/exaone4.cpp +0 -123
  869. data/ext/sources/examples/talk-llama/models/falcon-h1.cpp +0 -113
  870. data/ext/sources/examples/talk-llama/models/falcon.cpp +0 -120
  871. data/ext/sources/examples/talk-llama/models/gemma-embedding.cpp +0 -116
  872. data/ext/sources/examples/talk-llama/models/gemma.cpp +0 -112
  873. data/ext/sources/examples/talk-llama/models/gemma2-iswa.cpp +0 -128
  874. data/ext/sources/examples/talk-llama/models/gemma3.cpp +0 -155
  875. data/ext/sources/examples/talk-llama/models/gemma3n-iswa.cpp +0 -384
  876. data/ext/sources/examples/talk-llama/models/glm4-moe.cpp +0 -170
  877. data/ext/sources/examples/talk-llama/models/glm4.cpp +0 -150
  878. data/ext/sources/examples/talk-llama/models/gpt2.cpp +0 -105
  879. data/ext/sources/examples/talk-llama/models/gptneox.cpp +0 -144
  880. data/ext/sources/examples/talk-llama/models/granite-hybrid.cpp +0 -196
  881. data/ext/sources/examples/talk-llama/models/granite.cpp +0 -211
  882. data/ext/sources/examples/talk-llama/models/graph-context-mamba.cpp +0 -283
  883. data/ext/sources/examples/talk-llama/models/grok.cpp +0 -159
  884. data/ext/sources/examples/talk-llama/models/grovemoe.cpp +0 -141
  885. data/ext/sources/examples/talk-llama/models/hunyuan-dense.cpp +0 -132
  886. data/ext/sources/examples/talk-llama/models/hunyuan-moe.cpp +0 -154
  887. data/ext/sources/examples/talk-llama/models/internlm2.cpp +0 -120
  888. data/ext/sources/examples/talk-llama/models/jais.cpp +0 -86
  889. data/ext/sources/examples/talk-llama/models/jamba.cpp +0 -106
  890. data/ext/sources/examples/talk-llama/models/lfm2.cpp +0 -175
  891. data/ext/sources/examples/talk-llama/models/llada-moe.cpp +0 -122
  892. data/ext/sources/examples/talk-llama/models/llada.cpp +0 -99
  893. data/ext/sources/examples/talk-llama/models/llama-iswa.cpp +0 -178
  894. data/ext/sources/examples/talk-llama/models/llama.cpp +0 -168
  895. data/ext/sources/examples/talk-llama/models/maincoder.cpp +0 -117
  896. data/ext/sources/examples/talk-llama/models/mamba.cpp +0 -55
  897. data/ext/sources/examples/talk-llama/models/mimo2-iswa.cpp +0 -123
  898. data/ext/sources/examples/talk-llama/models/minicpm3.cpp +0 -199
  899. data/ext/sources/examples/talk-llama/models/minimax-m2.cpp +0 -124
  900. data/ext/sources/examples/talk-llama/models/mistral3.cpp +0 -160
  901. data/ext/sources/examples/talk-llama/models/models.h +0 -569
  902. data/ext/sources/examples/talk-llama/models/modern-bert.cpp +0 -116
  903. data/ext/sources/examples/talk-llama/models/mpt.cpp +0 -126
  904. data/ext/sources/examples/talk-llama/models/nemotron-h.cpp +0 -150
  905. data/ext/sources/examples/talk-llama/models/nemotron.cpp +0 -122
  906. data/ext/sources/examples/talk-llama/models/neo-bert.cpp +0 -104
  907. data/ext/sources/examples/talk-llama/models/olmo.cpp +0 -121
  908. data/ext/sources/examples/talk-llama/models/olmo2.cpp +0 -150
  909. data/ext/sources/examples/talk-llama/models/olmoe.cpp +0 -124
  910. data/ext/sources/examples/talk-llama/models/openai-moe-iswa.cpp +0 -127
  911. data/ext/sources/examples/talk-llama/models/openelm.cpp +0 -124
  912. data/ext/sources/examples/talk-llama/models/orion.cpp +0 -123
  913. data/ext/sources/examples/talk-llama/models/pangu-embedded.cpp +0 -121
  914. data/ext/sources/examples/talk-llama/models/phi2.cpp +0 -121
  915. data/ext/sources/examples/talk-llama/models/phi3.cpp +0 -152
  916. data/ext/sources/examples/talk-llama/models/plamo.cpp +0 -110
  917. data/ext/sources/examples/talk-llama/models/plamo2.cpp +0 -316
  918. data/ext/sources/examples/talk-llama/models/plamo3.cpp +0 -128
  919. data/ext/sources/examples/talk-llama/models/plm.cpp +0 -168
  920. data/ext/sources/examples/talk-llama/models/qwen.cpp +0 -108
  921. data/ext/sources/examples/talk-llama/models/qwen2.cpp +0 -126
  922. data/ext/sources/examples/talk-llama/models/qwen2moe.cpp +0 -151
  923. data/ext/sources/examples/talk-llama/models/qwen2vl.cpp +0 -117
  924. data/ext/sources/examples/talk-llama/models/qwen3.cpp +0 -117
  925. data/ext/sources/examples/talk-llama/models/qwen3moe.cpp +0 -124
  926. data/ext/sources/examples/talk-llama/models/qwen3next.cpp +0 -873
  927. data/ext/sources/examples/talk-llama/models/qwen3vl-moe.cpp +0 -149
  928. data/ext/sources/examples/talk-llama/models/qwen3vl.cpp +0 -141
  929. data/ext/sources/examples/talk-llama/models/refact.cpp +0 -94
  930. data/ext/sources/examples/talk-llama/models/rnd1.cpp +0 -126
  931. data/ext/sources/examples/talk-llama/models/rwkv6-base.cpp +0 -162
  932. data/ext/sources/examples/talk-llama/models/rwkv6.cpp +0 -94
  933. data/ext/sources/examples/talk-llama/models/rwkv6qwen2.cpp +0 -86
  934. data/ext/sources/examples/talk-llama/models/rwkv7-base.cpp +0 -135
  935. data/ext/sources/examples/talk-llama/models/rwkv7.cpp +0 -90
  936. data/ext/sources/examples/talk-llama/models/seed-oss.cpp +0 -124
  937. data/ext/sources/examples/talk-llama/models/smallthinker.cpp +0 -126
  938. data/ext/sources/examples/talk-llama/models/smollm3.cpp +0 -128
  939. data/ext/sources/examples/talk-llama/models/stablelm.cpp +0 -146
  940. data/ext/sources/examples/talk-llama/models/starcoder.cpp +0 -100
  941. data/ext/sources/examples/talk-llama/models/starcoder2.cpp +0 -121
  942. data/ext/sources/examples/talk-llama/models/t5-dec.cpp +0 -166
  943. data/ext/sources/examples/talk-llama/models/t5-enc.cpp +0 -96
  944. data/ext/sources/examples/talk-llama/models/wavtokenizer-dec.cpp +0 -149
  945. data/ext/sources/examples/talk-llama/models/xverse.cpp +0 -108
  946. data/ext/sources/examples/talk-llama/prompts/talk-alpaca.txt +0 -23
  947. data/ext/sources/examples/talk-llama/speak +0 -40
  948. data/ext/sources/examples/talk-llama/speak.bat +0 -1
  949. data/ext/sources/examples/talk-llama/speak.ps1 +0 -14
  950. data/ext/sources/examples/talk-llama/talk-llama.cpp +0 -813
  951. data/ext/sources/examples/talk-llama/unicode-data.cpp +0 -7034
  952. data/ext/sources/examples/talk-llama/unicode-data.h +0 -20
  953. data/ext/sources/examples/talk-llama/unicode.cpp +0 -1147
  954. data/ext/sources/examples/talk-llama/unicode.h +0 -111
  955. data/ext/sources/examples/wchess/CMakeLists.txt +0 -10
  956. data/ext/sources/examples/wchess/libwchess/CMakeLists.txt +0 -19
  957. data/ext/sources/examples/wchess/libwchess/Chessboard.cpp +0 -803
  958. data/ext/sources/examples/wchess/libwchess/Chessboard.h +0 -33
  959. data/ext/sources/examples/wchess/libwchess/WChess.cpp +0 -193
  960. data/ext/sources/examples/wchess/libwchess/WChess.h +0 -63
  961. data/ext/sources/examples/wchess/libwchess/test-chessboard.cpp +0 -117
  962. data/ext/sources/examples/wchess/wchess.cmd/CMakeLists.txt +0 -8
  963. data/ext/sources/examples/wchess/wchess.cmd/wchess.cmd.cpp +0 -253
  964. data/ext/sources/examples/whisper.wasm/CMakeLists.txt +0 -50
  965. data/ext/sources/examples/whisper.wasm/emscripten.cpp +0 -118
  966. data/ext/sources/examples/whisper.wasm/index-tmpl.html +0 -659
  967. data/ext/sources/ggml/cmake/BuildTypes.cmake +0 -54
  968. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm-ppc.h +0 -333
  969. data/ext/sources/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +0 -99
  970. data/ext/sources/ggml/src/ggml-hexagon/htp/htp-dma.h +0 -157
  971. data/ext/sources/ggml/src/ggml-hexagon/htp/htp-msg.h +0 -165
  972. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-exp.c +0 -94
  973. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-inverse.c +0 -72
  974. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sigmoid.c +0 -49
  975. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-utils.c +0 -1020
  976. data/ext/sources/ggml/src/ggml-hexagon/htp/ops-utils.h +0 -149
  977. data/ext/sources/ggml/src/ggml-hexagon/htp-utils.c +0 -454
  978. data/ext/sources/ggml/src/ggml-hexagon/htp-utils.h +0 -221
  979. data/ext/sources/ggml/src/ggml-hexagon/op-desc.h +0 -153
  980. data/ext/sources/ggml/src/ggml-opencl/kernels/embed_kernel.py +0 -26
  981. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rte.glsl +0 -5
  982. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/bin_op.tmpl.wgsl +0 -188
  983. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary_head.tmpl +0 -45
  984. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +0 -147
  985. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/glu.tmpl.wgsl +0 -323
  986. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl +0 -907
  987. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.tmpl.wgsl +0 -247
  988. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.tmpl.wgsl +0 -267
  989. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm.wgsl +0 -123
  990. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.tmpl.wgsl +0 -112
  991. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/unary_op.wgsl +0 -483
  992. data/ext/sources/tests/CMakeLists.txt +0 -112
  993. data/ext/sources/tests/earnings21/eval.mk +0 -58
  994. data/ext/sources/tests/earnings21/eval.py +0 -68
  995. data/ext/sources/tests/earnings21/normalizers/__init__.py +0 -2
  996. data/ext/sources/tests/earnings21/normalizers/basic.py +0 -80
  997. data/ext/sources/tests/earnings21/normalizers/english.json +0 -1741
  998. data/ext/sources/tests/earnings21/normalizers/english.py +0 -550
  999. data/ext/sources/tests/earnings21/requirements.txt +0 -6
  1000. data/ext/sources/tests/en-0-ref.txt +0 -1
  1001. data/ext/sources/tests/en-1-ref.txt +0 -1
  1002. data/ext/sources/tests/en-2-ref.txt +0 -1
  1003. data/ext/sources/tests/es-0-ref.txt +0 -1
  1004. data/ext/sources/tests/librispeech/eval.mk +0 -39
  1005. data/ext/sources/tests/librispeech/eval.py +0 -47
  1006. data/ext/sources/tests/librispeech/normalizers/__init__.py +0 -2
  1007. data/ext/sources/tests/librispeech/normalizers/basic.py +0 -80
  1008. data/ext/sources/tests/librispeech/normalizers/english.json +0 -1741
  1009. data/ext/sources/tests/librispeech/normalizers/english.py +0 -550
  1010. data/ext/sources/tests/librispeech/requirements.txt +0 -6
  1011. data/ext/sources/tests/run-tests.sh +0 -130
  1012. data/ext/sources/tests/test-c.c +0 -3
  1013. data/ext/sources/tests/test-vad-full.cpp +0 -56
  1014. data/ext/sources/tests/test-vad.cpp +0 -83
  1015. data/ext/sources/tests/test-whisper.js +0 -58
  1016. data/lib/whisper/context.rb +0 -15
  1017. data/lib/whisper/segment.rb +0 -58
@@ -1,31 +1,34 @@
1
1
  #pragma clang diagnostic ignored "-Wgnu-zero-variadic-macro-arguments"
2
2
  #pragma clang diagnostic ignored "-Wunused-function"
3
+ #pragma clang diagnostic ignored "-Wunused-variable"
4
+ #pragma clang diagnostic ignored "-Wunused-but-set-variable"
3
5
 
4
- #define FARF_ERROR 1
5
- #define FARF_HIGH 1
6
- #define FARF_MEDIUM 0
7
- #define FARF_LOW 0
6
+ #include <HAP_farf.h>
7
+ #include <HAP_perf.h>
8
8
  #include <AEEStdErr.h>
9
9
  #include <dspqueue.h>
10
10
  #include <HAP_compute_res.h>
11
11
  #include <HAP_etm_config.h>
12
- #include <HAP_farf.h>
13
12
  #include <HAP_mem.h>
14
- #include <HAP_perf.h>
15
13
  #include <HAP_power.h>
16
14
  #include <HAP_ps.h>
15
+ #include <HAP_dcvs.h>
17
16
  #include <qurt.h>
18
17
  #include <qurt_thread.h>
18
+ #include <qurt_memory.h>
19
19
  #include <remote.h>
20
20
  #include <string.h>
21
21
 
22
+ #include "hex-utils.h"
23
+ #include "hex-dma.h"
24
+ #include "hmx-queue.h"
25
+
22
26
  #define GGML_COMMON_DECL_C
23
27
  #include "ggml-common.h"
24
28
  #include "htp-ctx.h"
25
- #include "htp-dma.h"
26
- #include "htp-msg.h"
27
29
  #include "htp-ops.h"
28
- #include "ops-utils.h"
30
+ #include "htp-ops.h"
31
+ #include "htp_iface.h"
29
32
  #include "worker-pool.h"
30
33
 
31
34
  AEEResult htp_iface_open(const char * uri, remote_handle64 * handle) {
@@ -37,7 +40,7 @@ AEEResult htp_iface_open(const char * uri, remote_handle64 * handle) {
37
40
  return AEE_ENOMEMORY;
38
41
  }
39
42
 
40
- // Use the context structure as a handle
43
+ // Use the context structure as the handle
41
44
  *handle = (remote_handle64) ctx;
42
45
 
43
46
  // Enable FARF logs
@@ -61,8 +64,7 @@ AEEResult htp_iface_open(const char * uri, remote_handle64 * handle) {
61
64
 
62
65
  request.type = HAP_power_set_DCVS_v3;
63
66
  request.dcvs_v3.set_dcvs_enable = TRUE;
64
- request.dcvs_v3.dcvs_enable = TRUE;
65
- request.dcvs_v3.dcvs_option = HAP_DCVS_V2_PERFORMANCE_MODE;
67
+ request.dcvs_v3.dcvs_enable = FALSE;
66
68
  request.dcvs_v3.set_bus_params = TRUE;
67
69
  request.dcvs_v3.bus_params.min_corner = HAP_DCVS_VCORNER_MAX;
68
70
  request.dcvs_v3.bus_params.max_corner = HAP_DCVS_VCORNER_MAX;
@@ -73,6 +75,10 @@ AEEResult htp_iface_open(const char * uri, remote_handle64 * handle) {
73
75
  request.dcvs_v3.core_params.target_corner = HAP_DCVS_VCORNER_MAX;
74
76
  request.dcvs_v3.set_sleep_disable = TRUE;
75
77
  request.dcvs_v3.sleep_disable = TRUE;
78
+
79
+ #if (__HEXAGON_ARCH__ >= 79)
80
+ HAP_set_dcvs_v3_protected_bus_corners(&request, 1);
81
+ #endif
76
82
  if ((err = HAP_power_set((void *) ctx, &request)) != 0) {
77
83
  return err;
78
84
  }
@@ -85,6 +91,27 @@ AEEResult htp_iface_open(const char * uri, remote_handle64 * handle) {
85
91
  }
86
92
  }
87
93
 
94
+ #if __HVX_ARCH__ >= 75
95
+ {
96
+ // Power on HMX and set HMX clock
97
+ HAP_power_request_t request;
98
+ memset(&request, 0, sizeof(HAP_power_request_t));
99
+ request.type = HAP_power_set_HMX_v2;
100
+ request.hmx_v2.set_power = TRUE;
101
+ request.hmx_v2.power_up = TRUE;
102
+ request.hmx_v2.set_clock = TRUE;
103
+ request.hmx_v2.target_corner = HAP_DCVS_EXP_VCORNER_MAX;
104
+ request.hmx_v2.min_corner = HAP_DCVS_EXP_VCORNER_MAX;
105
+ request.hmx_v2.max_corner = HAP_DCVS_EXP_VCORNER_MAX;
106
+ request.hmx_v2.perf_mode = HAP_CLK_PERF_HIGH;
107
+ FARF(ALWAYS, "Setting HMX clock\n");
108
+ err = HAP_power_set((void *) ctx, &request);
109
+ if (err != AEE_SUCCESS) {
110
+ FARF(ERROR, "ggml-hex: error setting HMX clock.");
111
+ return err;
112
+ }
113
+ }
114
+ #else
88
115
  {
89
116
  // Power on HMX
90
117
  HAP_power_request_t request;
@@ -92,12 +119,61 @@ AEEResult htp_iface_open(const char * uri, remote_handle64 * handle) {
92
119
  request.type = HAP_power_set_HMX;
93
120
  request.hmx.power_up = TRUE;
94
121
  FARF(ALWAYS, "Powering HMX on\n");
95
- err = HAP_power_set((void *) &ctx, &request);
122
+ err = HAP_power_set((void *) ctx, &request);
96
123
  if (err != AEE_SUCCESS) {
97
- FARF(ERROR, "Error powering on HMX.");
124
+ FARF(ERROR, "ggml-hex: error powering on HMX.");
98
125
  return err;
99
126
  }
100
127
  }
128
+ #endif
129
+
130
+ return AEE_SUCCESS;
131
+ }
132
+
133
+ AEEResult htp_iface_etm(remote_handle64 handle, uint32_t enable) {
134
+ int err = enable ? HAP_user_etm_enable() : HAP_user_etm_disable();
135
+ if (err) {
136
+ if (err == AEE_EVERSIONNOTSUPPORT) {
137
+ FARF(ERROR, "API HAP_user_etm_enable/disable is not supported\n");
138
+ } else {
139
+ FARF(ERROR, "Error executing HAP_user_etm_enable/disable with error code : 0x%x\n", err);
140
+ }
141
+ }
142
+ return err;
143
+ }
144
+
145
+ AEEResult htp_iface_profiler(remote_handle64 handle, uint32_t mode, const htp_iface_pmu_conf* pmu_conf) {
146
+ struct htp_context * ctx = (struct htp_context *) handle;
147
+ if (!ctx) {
148
+ return AEE_EBADPARM;
149
+ }
150
+
151
+ if (mode == HTP_PROF_PMU) {
152
+ const uint32_t* events = pmu_conf->events;
153
+
154
+ // Pack 4 event IDs (low 8 bits) into each 32-bit config register
155
+ uint32_t evtcfg = 0, evtcfg1 = 0, cfg = 0, i = 0;
156
+ for (; i < HEX_NUM_PMU_COUNTERS/2; i++) {
157
+ evtcfg |= ((events[i + 0] & 0xFF) << (i * 8));
158
+ evtcfg1 |= ((events[i + 4] & 0xFF) << (i * 8));
159
+ }
160
+
161
+ // For events >255 pack high 2 bits of all 8 event IDs into cfg register
162
+ // 2 bits per counter: bits [1:0] for counter 0, [3:2] for counter 1, etc.
163
+ for (i = 0; i < HEX_NUM_PMU_COUNTERS; i++) {
164
+ cfg |= (((events[i] >> 8) & 3) << (i * 2));
165
+ }
166
+
167
+ FARF(ALWAYS, "Configuring PMU registers: evtcfg = 0x%x, evtcfg1 = 0x%x, pmucfg = 0x%x", evtcfg, evtcfg1, cfg);
168
+
169
+ // Configure PMU registers
170
+ qurt_pmu_set(QURT_PMUCFG, cfg);
171
+ qurt_pmu_set(QURT_PMUEVTCFG, evtcfg);
172
+ qurt_pmu_set(QURT_PMUEVTCFG1, evtcfg1);
173
+ qurt_pmu_enable(1);
174
+ }
175
+
176
+ ctx->profiler = mode;
101
177
 
102
178
  return AEE_SUCCESS;
103
179
  }
@@ -114,91 +190,128 @@ AEEResult htp_iface_close(remote_handle64 handle) {
114
190
  return AEE_EITEMBUSY;
115
191
  }
116
192
 
193
+ // release the mmaps (if any)
194
+ for (uint32_t i=0; i<HTP_MAX_MMAPS; i++) {
195
+ if (ctx->mmap[i].size) {
196
+ #if __HVX_ARCH__ > 73
197
+ HAP_munmap2((void *) ctx->mmap[i].base, ctx->mmap[i].size);
198
+ #else
199
+ HAP_munmap((void *) ctx->mmap[i].base, ctx->mmap[i].size);
200
+ #endif
201
+ ctx->mmap[i].size = 0;
202
+ ctx->mmap[i].base = NULL;
203
+ ctx->mmap[i].fd = -1;
204
+ }
205
+ }
206
+
207
+ if (ctx->profiler) {
208
+ qurt_pmu_enable(1);
209
+ }
210
+
211
+ if (ctx->etm) {
212
+ HAP_user_etm_disable();
213
+ }
214
+
117
215
  free(ctx);
118
216
  return AEE_SUCCESS;
119
217
  }
120
218
 
121
- AEEResult htp_iface_enable_etm(remote_handle64 handle) {
122
- int err = HAP_user_etm_enable();
123
- if (err) {
124
- if (err == AEE_EVERSIONNOTSUPPORT) {
125
- FARF(ERROR, "API HAP_user_etm_enable is not supported\n");
126
- } else {
127
- FARF(ERROR, "Error executing HAP_user_etm_enable with error code : 0x%x\n", err);
219
+ AEEResult htp_iface_mmap(remote_handle64 handle, uint32_t fd, uint32_t size) {
220
+ struct htp_context * ctx = (struct htp_context *) handle;
221
+ if (!ctx) {
222
+ return AEE_EBADPARM;
223
+ }
224
+
225
+ // See if we already have this mapping
226
+ for (uint32_t i=0; i<HTP_MAX_MMAPS; i++) {
227
+ struct htp_mmap *m = &ctx->mmap[i];
228
+ if (m->fd == fd) {
229
+ return AEE_SUCCESS;
128
230
  }
129
231
  }
130
- return err;
232
+
233
+ // Add new mapping
234
+ for (uint32_t i=0; i<HTP_MAX_MMAPS; i++) {
235
+ struct htp_mmap *m = &ctx->mmap[i];
236
+ if (!m->size) {
237
+ FARF(HIGH, "mmap : fd %u size %u", fd, size);
238
+ #if __HVX_ARCH__ > 73
239
+ void *va = HAP_mmap2(NULL, size, HAP_PROT_READ | HAP_PROT_WRITE, 0, fd, 0);
240
+ #else
241
+ if (size > HTP_MMAP_MAX_VMEM) { // HAP_mmap has a size limit of 2GB
242
+ FARF(ERROR, "mmap failed : size %u exceeds 2GB limit for HAP_mmap", (uint32_t) size);
243
+ abort(); // can't do much else at this point
244
+ }
245
+
246
+ void *va = HAP_mmap(NULL, size, HAP_PROT_READ | HAP_PROT_WRITE, 0, fd, 0);
247
+ #endif
248
+ if (va == (void*)-1) {
249
+ FARF(ERROR, "mmap failed : va %p fd %u size %u", va, fd, (uint32_t) size);
250
+ return AEE_EFAILED;
251
+ }
252
+
253
+ m->base = (uint64_t) va;
254
+ m->fd = fd;
255
+ m->size = size;
256
+
257
+ return AEE_SUCCESS;
258
+ }
259
+ }
260
+
261
+ return AEE_ENOMEMORY;
131
262
  }
132
263
 
133
- AEEResult htp_iface_disable_etm(remote_handle64 handle) {
134
- int err = HAP_user_etm_disable();
135
- if (err) {
136
- if (err == AEE_EVERSIONNOTSUPPORT) {
137
- FARF(ERROR, "API HAP_user_etm_disable is not supported\n");
138
- } else {
139
- FARF(ERROR, "Error executing HAP_user_etm_disable with error code : 0x%x\n", err);
264
+ AEEResult htp_iface_munmap(remote_handle64 handle, uint32 fd) {
265
+ struct htp_context * ctx = (struct htp_context *) handle;
266
+ if (!ctx) {
267
+ return AEE_EBADPARM;
268
+ }
269
+
270
+ for (uint32_t i=0; i<HTP_MAX_MMAPS; i++) {
271
+ struct htp_mmap *m = &ctx->mmap[i];
272
+ if (fd < 0 || m->fd == fd) {
273
+ FARF(HIGH, "unmmap : base %p fd %u size %u", (void*) m->base, m->fd, (uint32_t) m->size);
274
+ #if __HVX_ARCH__ > 73
275
+ HAP_munmap2((void *) m->base, m->size);
276
+ #else
277
+ HAP_munmap((void *) m->base, m->size);
278
+ #endif
279
+ m->size = 0;
280
+ m->base = NULL;
281
+ m->fd = -1;
140
282
  }
141
283
  }
142
- return err;
284
+
285
+ return AEE_SUCCESS;
143
286
  }
144
287
 
145
- static int vtcm_acquire(struct htp_context * ctx) {
146
- int err;
288
+ static void vtcm_acquire(struct htp_context * ctx) {
147
289
  if (!ctx->vtcm_valid) {
148
- // Temporarily bump thread priority to make sure it's higher than other sessions.
149
- // This way the resource manager will notify the other thread to release VTCM.
150
- // Note that we need to reaquire VTCM at normal priority for this to work next time.
151
- qurt_thread_set_priority(qurt_thread_get_id(), ctx->thread_prio - 10);
152
- err = HAP_compute_res_acquire_cached(ctx->vtcm_rctx, 1000000);
290
+ int err = HAP_compute_res_acquire_cached(ctx->vtcm_rctx, 1000000u);
153
291
  if (err != 0) {
154
- FARF(ERROR, "Failed to acquire VTCM: 0x%08x", (unsigned)err);
292
+ FARF(ERROR, "ggml-hex: failed to acquire VTCM: 0x%08x", (unsigned)err);
155
293
  abort();
156
294
  }
157
- HAP_compute_res_release_cached(ctx->vtcm_rctx);
158
- qurt_thread_set_priority(qurt_thread_get_id(), ctx->thread_prio);
159
295
 
160
- err = HAP_compute_res_acquire_cached(ctx->vtcm_rctx, 1000000);
161
- if (err != 0) {
162
- FARF(ERROR, "Failed to acquire VTCM: 0x%08x", (unsigned)err);
163
- abort();
164
- }
296
+ ctx->vtcm_needs_release = false;
165
297
  ctx->vtcm_valid = true;
166
- }
167
298
 
168
- ctx->vtcm_inuse = true;
169
- return 0;
299
+ // Drop the priority to make sure we get the release callback from other GGML-HTP and QNN-HTP sessions
300
+ HAP_compute_res_update_priority(ctx->vtcm_rctx, ctx->thread_prio + 10);
301
+ }
170
302
  }
171
303
 
172
- static int vtcm_release(struct htp_context * ctx) {
173
- ctx->vtcm_inuse = false;
174
-
175
- if (ctx->vtcm_valid && ctx->vtcm_needs_release) {
304
+ static void vtcm_release(struct htp_context * ctx) {
305
+ if (ctx->vtcm_valid) {
176
306
  ctx->vtcm_valid = false;
177
307
  ctx->vtcm_needs_release = false;
178
308
  HAP_compute_res_release_cached(ctx->vtcm_rctx);
179
309
  }
180
-
181
- return 0;
182
310
  }
183
311
 
184
312
  static int vtcm_release_callback(unsigned int rctx, void * state) {
185
313
  struct htp_context * ctx = (struct htp_context *) state;
186
-
187
- if (!ctx || ctx->vtcm_rctx != rctx) {
188
- return AEE_EBADPARM;
189
- }
190
-
191
- // If VTCM is not inuse (not processing Ops) release it right here
192
- // otherwise we'll release it once we're done with the current Op.
193
-
194
- if (ctx->vtcm_inuse) {
195
- ctx->vtcm_needs_release = false;
196
- return 0;
197
- }
198
-
199
- ctx->vtcm_valid = false;
200
- HAP_compute_res_release_cached(ctx->vtcm_rctx);
201
-
314
+ ctx->vtcm_needs_release = true;
202
315
  return 0;
203
316
  }
204
317
 
@@ -210,7 +323,7 @@ static int vtcm_alloc(struct htp_context * ctx) {
210
323
  HAP_compute_res_attr_init(&attr);
211
324
  HAP_compute_res_attr_set_serialize(&attr, 0);
212
325
  HAP_compute_res_attr_set_cache_mode(&attr, 1);
213
- HAP_compute_res_attr_set_vtcm_param_v2(&attr, vtcm_size, 0, vtcm_size);
326
+ HAP_compute_res_attr_set_vtcm_param_v2(&attr, vtcm_size, vtcm_size, vtcm_size); // single page
214
327
  HAP_compute_res_attr_set_release_callback(&attr, vtcm_release_callback, (void *) ctx);
215
328
  HAP_compute_res_attr_set_hmx_param(&attr, 1);
216
329
 
@@ -232,7 +345,6 @@ static int vtcm_alloc(struct htp_context * ctx) {
232
345
  ctx->vtcm_size = vtcm_size;
233
346
  ctx->vtcm_rctx = rctx;
234
347
  ctx->vtcm_valid = false;
235
- ctx->vtcm_inuse = false;
236
348
  ctx->vtcm_needs_release = false;
237
349
 
238
350
  return 0;
@@ -249,7 +361,7 @@ static void vtcm_free(struct htp_context * ctx) {
249
361
  static void htp_packet_callback(dspqueue_t queue, int error, void * context);
250
362
  static void htp_error_callback(dspqueue_t queue, int error, void * context);
251
363
 
252
- AEEResult htp_iface_start(remote_handle64 handle, uint32 sess_id, uint64 dsp_queue_id, uint32 n_hvx) {
364
+ AEEResult htp_iface_start(remote_handle64 handle, uint32 sess_id, uint64 dsp_queue_id, uint32 n_hvx, uint32 use_hmx, uint64_t max_vmem) {
253
365
  struct htp_context * ctx = (struct htp_context *) handle;
254
366
 
255
367
  if (!ctx) {
@@ -267,12 +379,12 @@ AEEResult htp_iface_start(remote_handle64 handle, uint32 sess_id, uint64 dsp_que
267
379
  htp_error_callback, // Error callback; no errors expected on the DSP
268
380
  (void *) ctx, // Callback context
269
381
  &ctx->queue);
270
-
271
382
  if (err) {
272
383
  FARF(ERROR, "Queue import failed with 0x%08x", (unsigned) err);
273
384
  return err;
274
385
  }
275
386
 
387
+ ctx->max_vmem = max_vmem;
276
388
  ctx->thread_id = qurt_thread_get_id();
277
389
  ctx->thread_prio = qurt_thread_get_priority(ctx->thread_id);
278
390
 
@@ -283,6 +395,19 @@ AEEResult htp_iface_start(remote_handle64 handle, uint32 sess_id, uint64 dsp_que
283
395
  return AEE_ENOMEMORY;
284
396
  }
285
397
 
398
+ #ifdef HTP_HAS_HMX
399
+ ctx->hmx_enabled = use_hmx;
400
+ ctx->hmx_queue = NULL;
401
+ if (use_hmx) {
402
+ ctx->hmx_queue = hmx_queue_create(16, ctx->vtcm_rctx);
403
+ if (!ctx->hmx_queue) {
404
+ FARF(ERROR, "hmx-queue-create failed");
405
+ ctx->hmx_enabled = false;
406
+ }
407
+ }
408
+ FARF(HIGH, "HMX %s (use_hmx=%d)", ctx->hmx_enabled ? "enabled" : "disabled", use_hmx);
409
+ #endif
410
+
286
411
  qurt_sysenv_max_hthreads_t hw_threads;
287
412
  qurt_sysenv_get_max_hw_threads(&hw_threads);
288
413
  uint32_t hw_nhvx = (qurt_hvx_get_units() >> 8) & 0xFF;
@@ -299,14 +424,21 @@ AEEResult htp_iface_start(remote_handle64 handle, uint32 sess_id, uint64 dsp_que
299
424
 
300
425
  ctx->n_threads = n_hvx;
301
426
  for (int i = 0; i < ctx->n_threads; i++) {
302
- // see discussion https://github.com/ggml-org/llama.cpp/pull/18151#discussion_r2632388541
303
- ctx->dma[i] = dma_queue_create(64);
427
+ ctx->dma[i] = dma_queue_create(256); // queue depth
304
428
  }
305
429
 
430
+ ctx->ddr_spad_size = 512 * 1024; // 512 KB
431
+ ctx->ddr_spad_base = memalign(128, ctx->ddr_spad_size);
432
+
306
433
  // init worker pool
307
434
  err = worker_pool_init(&ctx->worker_pool, n_hvx);
308
435
  if (err != AEE_SUCCESS) {
309
436
  FARF(ERROR, "Unable to create worker pool");
437
+ if (ctx->ddr_spad_base) {
438
+ free(ctx->ddr_spad_base);
439
+ ctx->ddr_spad_base = NULL;
440
+ ctx->ddr_spad_size = 0;
441
+ }
310
442
  return err;
311
443
  }
312
444
 
@@ -344,8 +476,22 @@ AEEResult htp_iface_stop(remote_handle64 handle) {
344
476
  dma_queue_delete(ctx->dma[i]);
345
477
  }
346
478
 
479
+ #ifdef HTP_HAS_HMX
480
+ if (ctx->hmx_queue) {
481
+ hmx_queue_delete(ctx->hmx_queue);
482
+ ctx->hmx_queue = NULL;
483
+ }
484
+ ctx->hmx_enabled = false;
485
+ #endif
486
+
347
487
  vtcm_free(ctx);
348
488
 
489
+ if (ctx->ddr_spad_base) {
490
+ free(ctx->ddr_spad_base);
491
+ ctx->ddr_spad_base = NULL;
492
+ ctx->ddr_spad_size = 0;
493
+ }
494
+
349
495
  return AEE_SUCCESS;
350
496
  }
351
497
 
@@ -357,645 +503,411 @@ static void htp_error_callback(dspqueue_t queue, int error, void * context) {
357
503
  struct profile_data {
358
504
  uint64_t usecs;
359
505
  uint64_t cycles;
360
- uint64_t pkts;
506
+ uint32_t pmu_counters[HEX_NUM_PMU_COUNTERS];
361
507
  };
362
508
 
363
- static inline void profile_start(struct profile_data * d) {
364
- d->usecs = HAP_perf_get_qtimer_count();
365
- d->cycles = htp_get_cycles();
366
- d->pkts = htp_get_pktcnt();
509
+ static inline void profile_start(uint32_t mode, struct profile_data * d) {
510
+ switch (mode) {
511
+ case HTP_PROF_PMU:
512
+ hex_get_pmu(d->pmu_counters);
513
+ // fallthrough
514
+ case HTP_PROF_BASIC:
515
+ d->usecs = HAP_perf_get_qtimer_count();
516
+ d->cycles = hex_get_cycles();
517
+ break;
518
+ default:
519
+ break;
520
+ }
367
521
  }
368
522
 
369
- static inline void profile_stop(struct profile_data * d) {
370
- d->usecs = HAP_perf_qtimer_count_to_us(HAP_perf_get_qtimer_count() - d->usecs);
371
- d->cycles = htp_get_cycles() - d->cycles;
372
- d->pkts = htp_get_pktcnt() - d->pkts;
523
+ static inline void profile_stop(uint32_t mode, struct profile_data * d) {
524
+ uint32_t pmu_counters[HEX_NUM_PMU_COUNTERS];
525
+ switch (mode) {
526
+ case HTP_PROF_PMU:
527
+ hex_get_pmu(pmu_counters);
528
+ for (int i = 0; i < HEX_NUM_PMU_COUNTERS; i++) {
529
+ d->pmu_counters[i] = pmu_counters[i] - d->pmu_counters[i];
530
+ }
531
+ // fallthrough
532
+ case HTP_PROF_BASIC:
533
+ d->usecs = HAP_perf_qtimer_count_to_us(HAP_perf_get_qtimer_count() - d->usecs);
534
+ d->cycles = hex_get_cycles() - d->cycles;
535
+ break;
536
+ default:
537
+ break;
538
+ }
373
539
  }
374
540
 
375
- static int send_htp_rsp(struct htp_context * c,
376
- uint32_t op,
377
- uint32_t status,
378
- struct dspqueue_buffer * bufs,
379
- size_t n_bufs,
380
- struct profile_data * prof) {
381
- // Prep response struct
382
- struct htp_general_rsp rsp;
383
- rsp.op = op;
384
- rsp.status = status;
385
- rsp.prof_usecs = prof->usecs;
386
- rsp.prof_cycles = prof->cycles;
387
- rsp.prof_pkts = prof->pkts;
388
-
389
- int err = dspqueue_write(c->queue,
390
- 0, // Flags
391
- n_bufs,
392
- bufs, // Buffer references
393
- sizeof(rsp),
394
- (const uint8_t *) &rsp, // Message
395
- DSPQUEUE_TIMEOUT_NONE);
541
+ static int execute_op(struct htp_ops_context * octx) {
542
+ switch (octx->op) {
543
+ case HTP_OP_MUL_MAT:
544
+ return op_matmul(octx);
396
545
 
397
- if (err != 0) {
398
- FARF(ERROR, "dspqueue_write failed: 0x%08x", (unsigned) err);
399
- }
546
+ case HTP_OP_MUL_MAT_ID:
547
+ return op_matmul_id(octx);
400
548
 
401
- return err;
402
- }
549
+ case HTP_OP_MUL:
550
+ case HTP_OP_ADD:
551
+ case HTP_OP_SUB:
552
+ case HTP_OP_DIV:
553
+ case HTP_OP_ADD_ID:
554
+ return op_binary(octx);
403
555
 
404
- static void proc_matmul_req(struct htp_context * ctx,
405
- struct htp_general_req * req,
406
- struct dspqueue_buffer * bufs,
407
- size_t n_bufs) {
408
- struct dspqueue_buffer rsp_bufs[1];
409
-
410
- // We had written to the output buffer, we'd also need to flush it
411
- rsp_bufs[0].fd = bufs[2].fd;
412
- rsp_bufs[0].ptr = bufs[2].ptr;
413
- rsp_bufs[0].size = bufs[2].size;
414
- rsp_bufs[0].offset = bufs[2].offset;
415
- rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP
416
- DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU
417
-
418
- // Setup Op context
419
- struct htp_ops_context octx = { 0 };
420
- octx.ctx = ctx;
421
- octx.src0 = req->src0;
422
- octx.src1 = req->src1;
423
- octx.dst = req->dst;
424
- octx.flags = req->flags;
425
- octx.op = req->op;
426
-
427
- // Update data pointers
428
- octx.src0.data = (uint32_t) bufs[0].ptr;
429
- octx.src1.data = (uint32_t) bufs[1].ptr;
430
- octx.dst.data = (uint32_t) bufs[2].ptr;
431
- octx.n_threads = ctx->n_threads;
432
-
433
- struct profile_data prof;
434
- profile_start(&prof);
435
-
436
- uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR;
437
- if (vtcm_acquire(ctx) == AEE_SUCCESS) {
438
- rsp_status = op_matmul(&octx);
439
- vtcm_release(ctx);
440
- }
441
-
442
- profile_stop(&prof);
443
- send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 1, &prof);
444
- }
556
+ case HTP_OP_NORM:
557
+ case HTP_OP_RMS_NORM:
558
+ case HTP_OP_RMS_NORM_MUL:
559
+ case HTP_OP_SCALE:
560
+ case HTP_OP_SQR:
561
+ case HTP_OP_SQRT:
562
+ case HTP_OP_UNARY_SOFTPLUS:
563
+ case HTP_OP_UNARY_SIGMOID:
564
+ case HTP_OP_UNARY_NEG:
565
+ case HTP_OP_UNARY_EXP:
566
+ case HTP_OP_UNARY_TANH:
567
+ case HTP_OP_L2_NORM:
568
+ return op_unary(octx);
445
569
 
446
- static void proc_get_rows_req(struct htp_context * ctx, struct htp_general_req * req, struct dspqueue_buffer * bufs) {
447
- struct dspqueue_buffer rsp_bufs[1];
448
-
449
- // We had written to the output buffer, we'd also need to flush it
450
- rsp_bufs[0].fd = bufs[2].fd;
451
- rsp_bufs[0].ptr = bufs[2].ptr;
452
- rsp_bufs[0].offset = bufs[2].offset;
453
- rsp_bufs[0].size = bufs[2].size;
454
- rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP
455
- DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU
456
-
457
- // Setup Op context
458
- struct htp_ops_context octx = { 0 };
459
- octx.ctx = ctx;
460
- octx.src0 = req->src0;
461
- octx.src1 = req->src1;
462
- octx.dst = req->dst;
463
- octx.flags = req->flags;
464
- octx.op = req->op;
465
-
466
- // Update data pointers
467
- octx.src0.data = (uint32_t) bufs[0].ptr;
468
- octx.src1.data = (uint32_t) bufs[1].ptr;
469
- octx.dst.data = (uint32_t) bufs[2].ptr;
470
- octx.n_threads = ctx->n_threads;
471
-
472
- struct profile_data prof;
473
- profile_start(&prof);
474
-
475
- uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR;
476
- if (vtcm_acquire(ctx) == AEE_SUCCESS) {
477
- rsp_status = op_get_rows(&octx);
478
- vtcm_release(ctx);
479
- }
480
-
481
- profile_stop(&prof);
482
- send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 1, &prof);
483
- }
570
+ case HTP_OP_UNARY_SILU:
571
+ case HTP_OP_UNARY_GELU:
572
+ case HTP_OP_GLU_SWIGLU:
573
+ case HTP_OP_GLU_SWIGLU_OAI:
574
+ case HTP_OP_GLU_GEGLU:
575
+ return op_activations(octx);
484
576
 
485
- static void proc_matmul_id_req(struct htp_context * ctx,
486
- struct htp_general_req * req,
487
- struct dspqueue_buffer * bufs,
488
- size_t n_bufs) {
489
- struct dspqueue_buffer rsp_bufs[1];
490
-
491
- // We had written to the output buffer, we'd also need to flush it
492
- rsp_bufs[0].fd = bufs[3].fd;
493
- rsp_bufs[0].ptr = bufs[3].ptr;
494
- rsp_bufs[0].size = bufs[3].size;
495
- rsp_bufs[0].offset = bufs[3].offset;
496
- rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP
497
- DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU
498
-
499
- // Setup Op context
500
- struct htp_ops_context octx = { 0 };
501
- octx.ctx = ctx;
502
- octx.src0 = req->src0;
503
- octx.src1 = req->src1;
504
- octx.src2 = req->src2;
505
- octx.dst = req->dst;
506
- octx.flags = req->flags;
507
- octx.op = req->op;
508
-
509
- // Update data pointers
510
- octx.src0.data = (uint32_t) bufs[0].ptr;
511
- octx.src1.data = (uint32_t) bufs[1].ptr;
512
- octx.src2.data = (uint32_t) bufs[2].ptr;
513
- octx.dst.data = (uint32_t) bufs[3].ptr;
514
- octx.n_threads = ctx->n_threads;
515
-
516
- struct profile_data prof;
517
- profile_start(&prof);
518
-
519
- uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR;
520
- if (vtcm_acquire(ctx) == AEE_SUCCESS) {
521
- rsp_status = op_matmul_id(&octx);
522
- vtcm_release(ctx);
523
- }
524
-
525
- profile_stop(&prof);
526
- send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 1, &prof);
527
- }
577
+ case HTP_OP_SOFTMAX:
578
+ return op_softmax(octx);
528
579
 
529
- static void proc_binary_req(struct htp_context * ctx, struct htp_general_req * req, struct dspqueue_buffer * bufs) {
530
- struct dspqueue_buffer rsp_bufs[1];
531
-
532
- // We had written to the output buffer, we'd also need to flush it
533
- rsp_bufs[0].fd = bufs[2].fd;
534
- rsp_bufs[0].ptr = bufs[2].ptr;
535
- rsp_bufs[0].offset = bufs[2].offset;
536
- rsp_bufs[0].size = bufs[2].size;
537
- rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP
538
- DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU
539
-
540
- // Setup Op context
541
- struct htp_ops_context octx = { 0 };
542
- octx.ctx = ctx;
543
- octx.src0 = req->src0;
544
- octx.src1 = req->src1;
545
- octx.dst = req->dst;
546
- octx.flags = req->flags;
547
- octx.op = req->op;
548
-
549
- // Update data pointers
550
- octx.src0.data = (uint32_t) bufs[0].ptr;
551
- octx.src1.data = (uint32_t) bufs[1].ptr;
552
- octx.dst.data = (uint32_t) bufs[2].ptr;
553
- octx.n_threads = ctx->n_threads;
554
-
555
- struct profile_data prof;
556
- profile_start(&prof);
557
-
558
- uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR;
559
- if (vtcm_acquire(ctx) == AEE_SUCCESS) {
560
- rsp_status = op_binary(&octx);
561
- vtcm_release(ctx);
562
- }
563
-
564
- profile_stop(&prof);
565
- send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 1, &prof);
566
- }
580
+ case HTP_OP_ROPE:
581
+ return op_rope(octx);
567
582
 
568
- static void proc_add_id_req(struct htp_context * ctx, struct htp_general_req * req, struct dspqueue_buffer * bufs) {
569
- struct dspqueue_buffer rsp_bufs[1];
570
-
571
- // We had written to the output buffer, we'd also need to flush it
572
- rsp_bufs[0].fd = bufs[3].fd;
573
- rsp_bufs[0].ptr = bufs[3].ptr;
574
- rsp_bufs[0].offset = bufs[3].offset;
575
- rsp_bufs[0].size = bufs[3].size;
576
- rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP
577
- DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU
578
-
579
- // Setup Op context
580
- struct htp_ops_context octx = { 0 };
581
- octx.ctx = ctx;
582
- octx.src0 = req->src0;
583
- octx.src1 = req->src1;
584
- octx.src2 = req->src2;
585
- octx.dst = req->dst;
586
- octx.flags = req->flags;
587
- octx.op = req->op;
588
-
589
- // Update data pointers
590
- octx.src0.data = (uint32_t) bufs[0].ptr;
591
- octx.src1.data = (uint32_t) bufs[1].ptr;
592
- octx.src2.data = (uint32_t) bufs[2].ptr;
593
- octx.dst.data = (uint32_t) bufs[3].ptr;
594
- octx.n_threads = ctx->n_threads;
595
-
596
- struct profile_data prof;
597
- profile_start(&prof);
598
-
599
- uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR;
600
- if (vtcm_acquire(ctx) == AEE_SUCCESS) {
601
- rsp_status = op_binary(&octx);
602
- vtcm_release(ctx);
603
- }
604
-
605
- profile_stop(&prof);
606
- send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 1, &prof);
607
- }
583
+ case HTP_OP_FLASH_ATTN_EXT:
584
+ return op_flash_attn_ext(octx);
585
+
586
+ case HTP_OP_SET_ROWS:
587
+ return op_set_rows(octx);
588
+
589
+ case HTP_OP_GET_ROWS:
590
+ return op_get_rows(octx);
591
+
592
+ case HTP_OP_SUM_ROWS:
593
+ return op_sum_rows(octx);
594
+
595
+ case HTP_OP_CPY:
596
+ return op_cpy(octx);
597
+
598
+ case HTP_OP_REPEAT:
599
+ return op_repeat(octx);
608
600
 
609
- static void proc_unary_req(struct htp_context * ctx, struct htp_general_req * req, struct dspqueue_buffer * bufs) {
610
- struct dspqueue_buffer rsp_bufs[HTP_MAX_PACKET_BUFFERS];
601
+ case HTP_OP_ARGSORT:
602
+ return op_argsort(octx);
611
603
 
612
- // We had written to the output buffer, we'd also need to flush it
613
- rsp_bufs[0].fd = bufs[1].fd;
614
- rsp_bufs[0].ptr = bufs[1].ptr;
615
- rsp_bufs[0].offset = bufs[1].offset;
616
- rsp_bufs[0].size = bufs[1].size;
617
- rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP
618
- DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU
604
+ case HTP_OP_SSM_CONV:
605
+ return op_ssm_conv(octx);
619
606
 
620
- // Setup Op context
621
- struct htp_ops_context octx = { 0 };
622
- octx.ctx = ctx;
623
- octx.src0 = req->src0;
624
- octx.dst = req->dst;
625
- octx.flags = req->flags;
626
- octx.op = req->op;
607
+ case HTP_OP_CUMSUM:
608
+ return op_cumsum(octx);
627
609
 
628
- memcpy(octx.op_params, req->op_params, sizeof(octx.op_params));
610
+ case HTP_OP_FILL:
611
+ return op_fill(octx);
629
612
 
630
- // Update data pointers
631
- octx.src0.data = (uint32_t) bufs[0].ptr;
632
- octx.dst.data = (uint32_t) bufs[1].ptr;
633
- octx.n_threads = ctx->n_threads;
613
+ case HTP_OP_DIAG:
614
+ return op_diag(octx);
634
615
 
635
- struct profile_data prof;
636
- profile_start(&prof);
616
+ case HTP_OP_SOLVE_TRI:
617
+ return op_solve_tri(octx);
637
618
 
638
- uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR;
639
- if (vtcm_acquire(ctx) == AEE_SUCCESS) {
640
- rsp_status = op_unary(&octx);
641
- vtcm_release(ctx);
619
+ case HTP_OP_PAD:
620
+ return op_pad(octx);
621
+
622
+ case HTP_OP_CONCAT:
623
+ return op_concat(octx);
624
+
625
+ case HTP_OP_GATED_DELTA_NET:
626
+ return op_gated_delta_net(octx);
627
+
628
+ case HTP_OP_TRI:
629
+ return op_tri(octx);
630
+
631
+ case HTP_OP_INVALID:
632
+ break;
633
+
634
+ // No default to catch missing cases
642
635
  }
643
636
 
644
- profile_stop(&prof);
645
- send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 1, &prof);
637
+ FARF(ERROR, "Unknown Op %u", octx->op);
638
+ return -1;
646
639
  }
647
640
 
648
- static void proc_activations_req(struct htp_context * ctx,
649
- struct htp_general_req * req,
650
- struct dspqueue_buffer * bufs,
651
- uint32_t n_bufs) {
652
- struct dspqueue_buffer rsp_bufs[HTP_MAX_PACKET_BUFFERS];
653
-
654
- int write_idx = (n_bufs == 3) ? 2 : 1;
655
-
656
- // We had written to the output buffer, we'd also need to flush it
657
- rsp_bufs[0].fd = bufs[write_idx].fd;
658
- rsp_bufs[0].ptr = bufs[write_idx].ptr;
659
- rsp_bufs[0].offset = bufs[write_idx].offset;
660
- rsp_bufs[0].size = bufs[write_idx].size;
661
- rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP
662
- DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU
663
-
664
- // Setup Op context
665
- struct htp_ops_context octx = { 0 };
666
- octx.ctx = ctx;
667
- octx.src0 = req->src0;
668
- if (3 == n_bufs) {
669
- octx.src1 = req->src1;
670
- }
671
- octx.dst = req->dst;
672
- octx.flags = req->flags;
673
- octx.op = req->op;
674
-
675
- memcpy(octx.op_params, req->op_params, sizeof(octx.op_params));
676
-
677
- // Update data pointers
678
- octx.src0.data = (uint32_t) bufs[0].ptr;
679
- if (3 == n_bufs) {
680
- octx.src1.data = (uint32_t) bufs[1].ptr;
681
- octx.dst.data = (uint32_t) bufs[2].ptr;
682
- } else {
683
- octx.dst.data = (uint32_t) bufs[1].ptr;
684
- }
685
- octx.n_threads = ctx->n_threads;
686
-
687
- struct profile_data prof;
688
- profile_start(&prof);
689
-
690
- uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR;
691
- if (vtcm_acquire(ctx) == AEE_SUCCESS) {
692
- if (octx.op == HTP_OP_SOFTMAX) {
693
- rsp_status = op_softmax(&octx);
694
- } else {
695
- rsp_status = op_activations(&octx);
641
+ static inline bool reuse_buf(struct htp_context *ctx, uint32_t *m_reuse, struct htp_buf_desc *b) {
642
+ b->base = NULL;
643
+
644
+ for (uint32_t i=0; i<HTP_MAX_MMAPS; i++) {
645
+ struct htp_mmap *m = ctx->mmap + i;
646
+ if (m->size && m->fd == b->fd) {
647
+ b->base = m->base;
648
+ *m_reuse |= (1 << i);
649
+ return true;
696
650
  }
697
- vtcm_release(ctx);
698
651
  }
699
652
 
700
- profile_stop(&prof);
701
- send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 1, &prof);
653
+ return false;
702
654
  }
703
655
 
704
- static void proc_rope_req(struct htp_context * ctx,
705
- struct htp_general_req * req,
706
- struct dspqueue_buffer * bufs,
707
- uint32_t n_bufs) {
708
- struct dspqueue_buffer rsp_bufs[HTP_MAX_PACKET_BUFFERS];
709
-
710
- int write_idx = n_bufs - 1;
711
-
712
- // We had written to the output buffer, we'd also need to flush it
713
- rsp_bufs[0].fd = bufs[write_idx].fd;
714
- rsp_bufs[0].ptr = bufs[write_idx].ptr;
715
- rsp_bufs[0].offset = bufs[write_idx].offset;
716
- rsp_bufs[0].size = bufs[write_idx].size;
717
- rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP
718
- DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU
719
-
720
- // Setup Op context
721
- struct htp_ops_context octx = { 0 };
722
- octx.ctx = ctx;
723
- octx.src0 = req->src0;
724
- octx.src1 = req->src1;
725
- if (4 == n_bufs) {
726
- octx.src2 = req->src2;
727
- }
728
- octx.dst = req->dst;
729
- octx.flags = req->flags;
730
- octx.op = req->op;
731
-
732
- memcpy(octx.op_params, req->op_params, sizeof(octx.op_params));
733
-
734
- // Update data pointers
735
- octx.src0.data = (uint32_t) bufs[0].ptr;
736
- octx.src1.data = (uint32_t) bufs[1].ptr;
737
- if (4 == n_bufs) {
738
- octx.src2.data = (uint32_t) bufs[2].ptr;
739
- octx.dst.data = (uint32_t) bufs[3].ptr;
740
- } else {
741
- octx.dst.data = (uint32_t) bufs[2].ptr;
742
- }
743
- octx.n_threads = ctx->n_threads;
744
-
745
- struct profile_data prof;
746
- profile_start(&prof);
747
-
748
- uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR;
749
- if (vtcm_acquire(ctx) == AEE_SUCCESS) {
750
- rsp_status = op_rope(&octx);
751
- vtcm_release(ctx);
752
- }
753
-
754
- profile_stop(&prof);
755
- send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 1, &prof);
656
+ static inline void drop_mmap(struct htp_context *ctx, struct htp_mmap *m) {
657
+ if (m->size) {
658
+ FARF(HIGH, "unmap : fd %u base %p size %u", m->fd, (void*) m->base, (uint32_t) m->size);
659
+ #if __HVX_ARCH__ > 73
660
+ HAP_munmap2((void *) m->base, m->size);
661
+ #else
662
+ HAP_munmap((void *) m->base, m->size);
663
+ #endif
664
+ m->size = 0;
665
+ m->base = 0;
666
+ m->fd = -1;
667
+ }
756
668
  }
757
669
 
758
- static void proc_set_rows_req(struct htp_context * ctx, struct htp_general_req * req, struct dspqueue_buffer * bufs) {
759
- struct dspqueue_buffer rsp_bufs[1];
760
-
761
- // We had written to the output buffer, we'd also need to flush it
762
- rsp_bufs[0].fd = bufs[2].fd;
763
- rsp_bufs[0].ptr = bufs[2].ptr;
764
- rsp_bufs[0].offset = bufs[2].offset;
765
- rsp_bufs[0].size = bufs[2].size;
766
- rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP
767
- DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU
768
-
769
- // Setup Op context
770
- struct htp_ops_context octx = { 0 };
771
- octx.ctx = ctx;
772
- octx.src0 = req->src0;
773
- octx.src1 = req->src1;
774
- octx.dst = req->dst;
775
- octx.flags = req->flags;
776
- octx.op = req->op;
777
-
778
- // Update data pointers
779
- octx.src0.data = (uint32_t) bufs[0].ptr;
780
- octx.src1.data = (uint32_t) bufs[1].ptr;
781
- octx.dst.data = (uint32_t) bufs[2].ptr;
782
- octx.n_threads = ctx->n_threads;
783
-
784
- struct profile_data prof;
785
- profile_start(&prof);
786
-
787
- uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR;
788
- if (vtcm_acquire(ctx) == AEE_SUCCESS) {
789
- rsp_status = op_set_rows(&octx);
790
- vtcm_release(ctx);
791
- }
792
-
793
- profile_stop(&prof);
794
- send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 1, &prof);
670
+ static inline void mmap_buf(struct htp_context *ctx, struct htp_buf_desc *b) {
671
+ if (b->base) return; // already mapped
672
+
673
+ // find unused mapping
674
+ for (uint32_t i=0; i < HTP_MAX_MMAPS; i++) {
675
+ struct htp_mmap *m = &ctx->mmap[i];
676
+ if (!m->size) {
677
+ #if __HVX_ARCH__ > 73
678
+ void *va = HAP_mmap2(NULL, b->size, HAP_PROT_READ | HAP_PROT_WRITE, 0, b->fd, 0);
679
+ #else
680
+ if (b->size > HTP_MMAP_MAX_VMEM) { // HAP_mmap has a size limit of 2GB
681
+ FARF(ERROR, "mmap failed : size %u exceeds 2GB limit for HAP_mmap", (uint32_t) b->size);
682
+ abort(); // can't do much else at this point
683
+ }
684
+
685
+ void *va = HAP_mmap(NULL, b->size, HAP_PROT_READ | HAP_PROT_WRITE, 0, b->fd, 0);
686
+ #endif
687
+ if (va == (void*)-1) {
688
+ FARF(ERROR, "mmap failed : va %p fd %u size %u", va, b->fd, (uint32_t) b->size);
689
+ abort(); // can't do much else at this point
690
+ }
691
+
692
+ m->base = b->base = (uint64_t) va;
693
+ m->fd = b->fd;
694
+ m->size = b->size;
695
+
696
+ FARF(HIGH, "mmap : fd %u base %p size %u", m->fd, (void*) m->base, (uint32_t) m->size);
697
+ return;
698
+ }
699
+ }
795
700
  }
796
701
 
797
- static void proc_flash_attn_ext_req(struct htp_context * ctx,
798
- struct htp_general_req * req,
799
- struct dspqueue_buffer * bufs,
800
- uint32_t n_bufs) {
801
- // Setup Op context
802
- struct htp_ops_context octx;
803
- memset(&octx, 0, sizeof(octx));
702
+ static void prep_op_bufs(struct htp_context *ctx, struct htp_buf_desc *bufs, uint32_t n_bufs) {
703
+ uint32_t m_reuse = 0; // mmap reuse mask (index from ctx->mmap array)
704
+ uint32_t b_reuse = 0; // buf reuse count
804
705
 
805
- octx.ctx = ctx;
806
- octx.n_threads = ctx->n_threads;
706
+ uint64_t m_vmem = 0; // mapped vmem
707
+ uint64_t e_vmem = 0; // extra vmem
807
708
 
808
- octx.src0 = req->src0;
809
- octx.src1 = req->src1;
810
- octx.src2 = req->src2;
811
- octx.src3 = req->src3;
812
- octx.src4 = req->src4;
813
- octx.dst = req->dst;
814
- octx.flags = req->flags;
815
- octx.op = req->op;
709
+ // See what we can reuse
710
+ for (uint32_t i=0; i < n_bufs; i++) {
711
+ struct htp_buf_desc *b = bufs + i;
712
+ if (reuse_buf(ctx, &m_reuse, b)) { b_reuse++; } else { e_vmem += b->size; }
713
+ FARF(HIGH, "prep-buf #%u : pass0 fd %u base %p size %u flags 0x%x", i, b->fd, (void*) b->base, (uint32_t) b->size, b->flags);
714
+ }
816
715
 
817
- memcpy(octx.op_params, req->op_params, sizeof(octx.op_params));
716
+ if (b_reuse == n_bufs) return; // all bufs reuse existing mappings
818
717
 
819
- // Update data pointers
820
- octx.src0.data = (uint32_t) bufs[0].ptr;
821
- octx.src1.data = (uint32_t) bufs[1].ptr;
822
- octx.src2.data = (uint32_t) bufs[2].ptr;
718
+ // See how much vmem we have mmaped right now
719
+ for (uint32_t i=0; i<HTP_MAX_MMAPS; i++) { m_vmem += ctx->mmap[i].size; }
823
720
 
824
- int last_buf = 3;
721
+ FARF(HIGH, "prep-bufs : pass1 mmap-vmem %zu extra-vmem %zu max-vmem %zu : n-bufs %u b-reuse %u",
722
+ (size_t) m_vmem, (size_t) e_vmem, (size_t) ctx->max_vmem, n_bufs, b_reuse);
825
723
 
826
- if (octx.src3.ne[0]) {
827
- octx.src3.data = (uint32_t) bufs[last_buf++].ptr; // mask is valid
724
+ if ((m_vmem + e_vmem) > ctx->max_vmem) {
725
+ // Drop unused mappings
726
+ for (uint32_t i=0; i < HTP_MAX_MMAPS; i++) {
727
+ bool used = m_reuse & (1<<i);
728
+ if (!used) { drop_mmap(ctx, ctx->mmap + i); }
729
+ }
828
730
  }
829
731
 
830
- if (octx.src4.ne[0]) {
831
- octx.src4.data = (uint32_t) bufs[last_buf++].ptr; // sinks is valid
732
+ // Create missing mappings
733
+ for (uint32_t i=0; i < n_bufs; i++) {
734
+ struct htp_buf_desc *b = bufs + i;
735
+ mmap_buf(ctx, b);
736
+ FARF(HIGH, "prep-buf #%u : pass1 fd %u base %p size %u flags 0x%x", i, b->fd, (void*) b->base, (uint32_t) b->size, b->flags);
832
737
  }
738
+ }
739
+
740
+ static void prep_tensor(struct htp_context *ctx, struct htp_buf_desc *bufs, uint32_t idx, struct htp_tensor *t) {
741
+ uint32_t offset = t->data;
742
+ uint32_t size = t->size;
743
+ uint32_t bi = t->bi;
833
744
 
834
- octx.dst.data = (uint32_t) bufs[last_buf].ptr;
745
+ t->data = bufs[bi].base + offset; // update data to the actual pointer
835
746
 
836
- struct profile_data prof;
837
- profile_start(&prof);
747
+ FARF(HIGH, "prep-tensor #%u: bi %u offset %u size %u data %p : %u:%u:%u:%u", idx, t->bi, offset, t->size, (void*) t->data,
748
+ t->ne[0], t->ne[1], t->ne[3], t->ne[3]);
749
+ }
838
750
 
839
- uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR;
840
- if (vtcm_acquire(ctx) == AEE_SUCCESS) {
841
- rsp_status = op_flash_attn_ext(&octx);
842
- vtcm_release(ctx);
751
+ static void prep_tensors(struct htp_context *ctx, struct htp_buf_desc *bufs, struct htp_tensor *tens, uint32_t n_tens) {
752
+ for (uint32_t i=0; i < n_tens; i++) {
753
+ prep_tensor(ctx, bufs, i, tens + i);
843
754
  }
755
+ }
756
+
757
+ static void proc_op_req(struct htp_ops_context * octx, struct htp_tensor *tens, uint32_t idx, struct htp_op_desc * op) {
758
+ memcpy(octx->op_params, op->params, sizeof(octx->op_params));
759
+ octx->flags = op->flags;
760
+ octx->op = op->opcode;
761
+
762
+ FARF(HIGH, "proc-op #%u: opcode %u flags 0x%x", idx, octx->op, octx->flags);
844
763
 
845
- profile_stop(&prof);
764
+ // Prep input tensors
765
+ for (uint32_t i=0; i<HTP_OP_MAX_INPUTS; i++) {
766
+ struct htp_tensor *src = op->src[i] == 0xffff ? NULL : tens + op->src[i];
846
767
 
847
- struct dspqueue_buffer rsp_buf = bufs[last_buf];
848
- rsp_buf.flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP
849
- DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU
768
+ octx->src[i] = src;
769
+ if (!src) continue;
770
+
771
+ if (!(src->flags & HTP_TENSOR_FLUSHED) && (src->flags & HTP_TENSOR_COMPUTE)) {
772
+ // flush compute buffers on input
773
+ hex_l2flush((void *) src->data, src->size);
774
+ }
850
775
 
851
- send_htp_rsp(ctx, req->op, rsp_status, &bufs[last_buf], 1, &prof);
776
+ FARF(HIGH, "prep-src #%u: data %p size %u : %u:%u:%u:%u", op->src[i], (void*) src->data, src->size,
777
+ src->ne[0], src->ne[1], src->ne[3], src->ne[3]);
778
+ }
779
+
780
+ // Prep output tensor
781
+ struct htp_tensor *dst = tens + op->dst;
782
+
783
+ octx->dst = dst;
784
+
785
+ FARF(HIGH, "prep-dst #%u: data %p size %u : %u:%u:%u:%u", op->dst, (void*) dst->data, dst->size,
786
+ dst->ne[0], dst->ne[1], dst->ne[3], dst->ne[3]);
787
+
788
+ (void) execute_op(octx);
789
+
790
+ // flush buffers on output
791
+ hex_l2flush((void *) dst->data, dst->size);
792
+ dst->flags |= HTP_TENSOR_FLUSHED;
793
+
794
+ FARF(HIGH, "post-dst #%u: data %p size %u : %u:%u:%u:%u", op->dst, (void*) dst->data, dst->size,
795
+ dst->ne[0], dst->ne[1], dst->ne[3], dst->ne[3]);
852
796
  }
853
797
 
798
+ #define DSPQUEUE_POLL_TIMEOUT_USEC 100
799
+ #define DSPQUEUE_POLL_COUNT 100
800
+
854
801
  static void htp_packet_callback(dspqueue_t queue, int error, void * context) {
855
802
  struct htp_context * ctx = (struct htp_context *) context;
856
803
 
857
- // Repeatedly read packets from the queue until it's empty. We don't
858
- // necessarily get a separate callback for each packet, and new packets
859
- // may arrive while we're processing the previous one. This ensures we
860
- // keep the DSP busy as much as possible and avoid waiting for the CPU.
804
+ int err;
805
+
806
+ uint32_t poll_count = DSPQUEUE_POLL_COUNT;
861
807
 
862
- while (1) {
863
- struct htp_general_req req;
864
- uint32_t req_size;
808
+ vtcm_acquire(ctx);
865
809
 
866
- struct dspqueue_buffer bufs[HTP_MAX_PACKET_BUFFERS];
867
- uint32_t n_bufs;
868
- uint32_t flags;
810
+ while (!ctx->vtcm_needs_release) {
811
+ struct htp_opbatch_req req;
812
+ uint32_t r_size = sizeof(req);
869
813
 
870
- // Read packet from queue
871
- int err = dspqueue_read_noblock(queue, &flags,
872
- HTP_MAX_PACKET_BUFFERS, // Maximum number of buffer references
873
- &n_bufs, // Number of buffer references
874
- bufs, // Buffer references
875
- sizeof(req), // Max message length
876
- &req_size, // Message length
877
- (uint8_t *) &req); // Message
814
+ struct dspqueue_buffer dbuf;
815
+ uint32_t n_dbufs = 1;
816
+ uint32_t flags = 0;
878
817
 
818
+ err = dspqueue_read_noblock(queue, &flags, n_dbufs, &n_dbufs, &dbuf, r_size, &r_size, (uint8_t *) &req);
879
819
  if (err == AEE_EWOULDBLOCK) {
880
- // Consumed all packets available for now
881
- return;
820
+ if (--poll_count) {
821
+ qurt_sleep(DSPQUEUE_POLL_TIMEOUT_USEC);
822
+ continue;
823
+ }
824
+ break;
882
825
  }
883
826
 
884
827
  if (err != 0) {
885
828
  FARF(ERROR, "dspqueue_read_noblock failed: 0x%08x", (unsigned) err);
886
- return;
829
+ break;
887
830
  }
888
831
 
889
- if (req_size != sizeof(req)) {
890
- FARF(ERROR, "Invalid request size");
832
+ if (r_size < sizeof(req) || n_dbufs != 1) {
833
+ FARF(ERROR, "invalid request : size %u n-dbufs %u", r_size, n_dbufs);
891
834
  continue;
892
835
  }
893
836
 
894
- if (req.flags & HTP_OPFLAGS_EARLY_WAKEUP) {
895
- // Host wants early notification
896
- dspqueue_write_early_wakeup_noblock(ctx->queue, 10, 0);
837
+ // Reset poll count for valid requests
838
+ poll_count = DSPQUEUE_POLL_COUNT;
839
+
840
+ const uint32_t n_bufs = req.n_bufs;
841
+ const uint32_t n_tens = req.n_tensors;
842
+ const uint32_t n_ops = req.n_ops;
843
+
844
+ const uint32_t b_size = sizeof(struct htp_buf_desc) * n_bufs;
845
+ const uint32_t t_size = sizeof(struct htp_tensor) * n_tens;
846
+ const uint32_t o_size = sizeof(struct htp_op_desc) * n_ops;
847
+ const uint32_t p_size = sizeof(struct htp_prof_desc) * n_ops;
848
+
849
+ if (dbuf.size < b_size + t_size + o_size + p_size) {
850
+ FARF(ERROR, "invalid opbatch memory block size %u", dbuf.size);
851
+ break;
897
852
  }
898
853
 
899
- // Process packet based on its message type
900
- switch (req.op) {
901
- case HTP_OP_MUL_MAT:
902
- if (n_bufs != 3) {
903
- FARF(ERROR, "Bad matmul-req buffer list");
904
- continue;
905
- }
906
- proc_matmul_req(ctx, &req, bufs, n_bufs);
907
- break;
854
+ FARF(HIGH, "processing opbatch #%u: n-bufs %u n-tensors %u n-ops %u : m-size %u b-size %u t-size %u o-size %u", req.id,
855
+ n_bufs, n_tens, n_ops, dbuf.size, b_size, t_size, o_size);
908
856
 
909
- case HTP_OP_MUL_MAT_ID:
910
- if (n_bufs != 4) {
911
- FARF(ERROR, "Bad matmul-id-req buffer list");
912
- continue;
913
- }
914
- proc_matmul_id_req(ctx, &req, bufs, n_bufs);
915
- break;
916
-
917
- case HTP_OP_MUL:
918
- case HTP_OP_ADD:
919
- case HTP_OP_SUB:
920
- if (n_bufs != 3) {
921
- FARF(ERROR, "Bad binary-req buffer list");
922
- continue;
923
- }
924
- proc_binary_req(ctx, &req, bufs);
925
- break;
926
-
927
- case HTP_OP_RMS_NORM:
928
- case HTP_OP_SCALE:
929
- if (n_bufs != 2) {
930
- FARF(ERROR, "Bad unary-req buffer list");
931
- continue;
932
- }
857
+ // Setup descriptor pointers
858
+ uint8_t * m_ptr = dbuf.ptr;
859
+ struct htp_buf_desc* bufs = (struct htp_buf_desc*) m_ptr; m_ptr += b_size;
860
+ struct htp_tensor* tens = (struct htp_tensor*) m_ptr; m_ptr += t_size;
861
+ struct htp_op_desc* ops = (struct htp_op_desc*) m_ptr; m_ptr += o_size;
862
+ struct htp_prof_desc* pds = (struct htp_prof_desc*) m_ptr;
933
863
 
934
- proc_unary_req(ctx, &req, bufs);
935
- break;
864
+ prep_op_bufs(ctx, bufs, n_bufs);
865
+ prep_tensors(ctx, bufs, tens, n_tens);
936
866
 
937
- case HTP_OP_UNARY_SILU:
938
- case HTP_OP_UNARY_GELU:
939
- if (n_bufs != 2) {
940
- FARF(ERROR, "Bad act-req buffer list");
941
- continue;
942
- }
943
- proc_activations_req(ctx, &req, bufs, n_bufs);
944
- break;
945
-
946
- case HTP_OP_GLU_SWIGLU:
947
- case HTP_OP_GLU_SWIGLU_OAI:
948
- case HTP_OP_SOFTMAX:
949
- if ((n_bufs != 2) && (n_bufs != 3)) {
950
- FARF(ERROR, "Bad act-req buffer list");
951
- continue;
952
- }
953
- proc_activations_req(ctx, &req, bufs, n_bufs);
954
- break;
867
+ struct htp_ops_context *octx = &ctx->octx;
868
+ memset(octx, 0, sizeof(*octx));
869
+ octx->n_threads = ctx->n_threads;
870
+ octx->ctx = ctx;
955
871
 
956
- case HTP_OP_ADD_ID:
957
- if (n_bufs != 4) {
958
- FARF(ERROR, "Bad add-id-req buffer list");
959
- continue;
960
- }
961
- proc_add_id_req(ctx, &req, bufs);
962
- break;
872
+ for (uint32_t i=0; i < n_ops; i++) {
873
+ struct profile_data prof;
963
874
 
964
- case HTP_OP_ROPE:
965
- if ((n_bufs != 3) && (n_bufs != 4)) {
966
- FARF(ERROR, "Bad rope-req buffer list");
967
- continue;
968
- }
969
- proc_rope_req(ctx, &req, bufs, n_bufs);
970
- break;
875
+ if (i == (n_ops-1)) {
876
+ // wake up the host before starting the last op
877
+ dspqueue_write_early_wakeup_noblock(queue, 0, 0);
878
+ }
971
879
 
972
- case HTP_OP_FLASH_ATTN_EXT:
973
- if (!(n_bufs >= 4 && n_bufs <= 6)) {
974
- FARF(ERROR, "Bad flash-attn-ext-req buffer list");
975
- continue;
976
- }
977
- proc_flash_attn_ext_req(ctx, &req, bufs, n_bufs);
978
- break;
880
+ profile_start(ctx->profiler, &prof);
979
881
 
980
- case HTP_OP_SET_ROWS:
981
- if (n_bufs != 3) {
982
- FARF(ERROR, "Bad set-rows-req buffer list");
983
- continue;
984
- }
985
- proc_set_rows_req(ctx, &req, bufs);
986
- break;
882
+ proc_op_req(octx, tens, i, &ops[i]);
883
+
884
+ profile_stop(ctx->profiler, &prof);
987
885
 
988
- case HTP_OP_GET_ROWS:
989
- if (n_bufs != 3) {
990
- FARF(ERROR, "Bad get-rows-req buffer list");
991
- continue;
886
+ if (ctx->profiler) {
887
+ pds[i].opcode = ops[i].opcode;
888
+ pds[i].usecs = prof.usecs;
889
+ pds[i].cycles = prof.cycles;
890
+ for (int j = 0; j < HEX_NUM_PMU_COUNTERS; j++) {
891
+ pds[i].pmu[j] = prof.pmu_counters[j];
992
892
  }
993
- proc_get_rows_req(ctx, &req, bufs);
994
- break;
893
+ }
894
+ }
895
+
896
+ struct htp_opbatch_rsp rsp;
897
+ rsp.id = req.id;
898
+ rsp.status = HTP_STATUS_OK;
899
+ rsp.n_bufs = n_bufs;
900
+ rsp.n_tensors = n_tens;
901
+ rsp.n_ops = n_ops;
902
+
903
+ dbuf.flags = DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT;
995
904
 
996
- default:
997
- FARF(ERROR, "Unknown Op %u", req.op);
998
- break;
905
+ err = dspqueue_write(queue, 0, 1, &dbuf, sizeof(rsp), (const uint8_t *) &rsp, DSPQUEUE_TIMEOUT_NONE);
906
+ if (err != 0) {
907
+ FARF(ERROR, "dspqueue_write failed: 0x%08x", (unsigned) err);
908
+ break;
999
909
  }
1000
910
  }
911
+
912
+ vtcm_release(ctx);
1001
913
  }