whispercpp 1.3.5 → 1.3.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (1017)
  1. checksums.yaml +4 -4
  2. data/.document +3 -0
  3. data/.rdoc_options +2 -0
  4. data/LICENSE +1 -1
  5. data/README.md +133 -3
  6. data/Rakefile +18 -3
  7. data/ext/dependencies.rb +10 -4
  8. data/ext/dependencies_for_windows.rb +17 -0
  9. data/ext/extconf.rb +20 -7
  10. data/ext/options.rb +54 -14
  11. data/ext/options_for_windows.rb +51 -0
  12. data/ext/ruby_whisper.c +56 -46
  13. data/ext/ruby_whisper.h +165 -2
  14. data/ext/ruby_whisper_context.c +297 -126
  15. data/ext/ruby_whisper_context_params.c +163 -0
  16. data/ext/ruby_whisper_log_queue.c +180 -0
  17. data/ext/ruby_whisper_log_settable.h +47 -0
  18. data/ext/ruby_whisper_model.c +0 -1
  19. data/ext/ruby_whisper_parakeet.c +49 -0
  20. data/ext/ruby_whisper_parakeet_context.c +304 -0
  21. data/ext/ruby_whisper_parakeet_context_params.c +117 -0
  22. data/ext/ruby_whisper_parakeet_model.c +84 -0
  23. data/ext/ruby_whisper_parakeet_params.c +548 -0
  24. data/ext/ruby_whisper_parakeet_segment.c +157 -0
  25. data/ext/ruby_whisper_parakeet_token.c +188 -0
  26. data/ext/ruby_whisper_parakeet_transcribe.cpp +58 -0
  27. data/ext/ruby_whisper_params.c +256 -66
  28. data/ext/ruby_whisper_segment.c +6 -7
  29. data/ext/ruby_whisper_token.c +29 -9
  30. data/ext/ruby_whisper_transcribe.cpp +46 -16
  31. data/ext/ruby_whisper_vad_context.c +48 -1
  32. data/ext/ruby_whisper_vad_context_detect.cpp +6 -5
  33. data/ext/ruby_whisper_vad_params.c +0 -1
  34. data/ext/ruby_whisper_vad_segment.c +0 -1
  35. data/ext/ruby_whisper_vad_segments.c +0 -1
  36. data/ext/sources/CMakeLists.txt +41 -3
  37. data/ext/sources/CMakePresets.json +95 -0
  38. data/ext/sources/cmake/parakeet-config.cmake.in +30 -0
  39. data/ext/sources/cmake/parakeet.pc.in +10 -0
  40. data/ext/sources/cmake/whisper-config.cmake.in +5 -40
  41. data/ext/sources/cmake/whisper.pc.in +1 -1
  42. data/ext/sources/examples/CMakeLists.txt +4 -2
  43. data/ext/sources/examples/bench/bench.cpp +24 -19
  44. data/ext/sources/examples/cli/cli.cpp +51 -9
  45. data/ext/sources/examples/common-ggml.cpp +4 -0
  46. data/ext/sources/examples/common-whisper.cpp +139 -67
  47. data/ext/sources/examples/common-whisper.h +11 -0
  48. data/ext/sources/examples/ffmpeg-transcode.cpp +211 -341
  49. data/ext/sources/examples/miniaudio.h +4507 -2131
  50. data/ext/sources/examples/parakeet-cli/CMakeLists.txt +8 -0
  51. data/ext/sources/examples/parakeet-cli/parakeet-cli.cpp +243 -0
  52. data/ext/sources/examples/parakeet-quantize/CMakeLists.txt +7 -0
  53. data/ext/sources/examples/parakeet-quantize/parakeet-quantize.cpp +230 -0
  54. data/ext/sources/examples/server/server.cpp +213 -163
  55. data/ext/sources/ggml/CMakeLists.txt +29 -15
  56. data/ext/sources/ggml/cmake/FindNCCL.cmake +36 -0
  57. data/ext/sources/ggml/cmake/ggml-config.cmake.in +12 -2
  58. data/ext/sources/ggml/include/ggml-alloc.h +1 -0
  59. data/ext/sources/ggml/include/ggml-backend.h +73 -11
  60. data/ext/sources/ggml/include/ggml-cann.h +1 -1
  61. data/ext/sources/ggml/include/ggml-cpu.h +5 -0
  62. data/ext/sources/ggml/include/ggml-cuda.h +3 -0
  63. data/ext/sources/ggml/include/ggml-openvino.h +37 -0
  64. data/ext/sources/ggml/include/ggml-opt.h +1 -1
  65. data/ext/sources/ggml/include/ggml-rpc.h +8 -3
  66. data/ext/sources/ggml/include/ggml-virtgpu.h +14 -0
  67. data/ext/sources/ggml/include/ggml.h +155 -16
  68. data/ext/sources/ggml/include/gguf.h +10 -2
  69. data/ext/sources/ggml/src/CMakeLists.txt +25 -5
  70. data/ext/sources/ggml/src/ggml-alloc.c +9 -10
  71. data/ext/sources/ggml/src/ggml-backend-dl.cpp +48 -0
  72. data/ext/sources/ggml/src/ggml-backend-dl.h +45 -0
  73. data/ext/sources/ggml/src/ggml-backend-impl.h +22 -2
  74. data/ext/sources/ggml/src/ggml-backend-meta.cpp +2263 -0
  75. data/ext/sources/ggml/src/ggml-backend-reg.cpp +40 -86
  76. data/ext/sources/ggml/src/ggml-backend.cpp +114 -10
  77. data/ext/sources/ggml/src/ggml-blas/CMakeLists.txt +1 -1
  78. data/ext/sources/ggml/src/ggml-blas/ggml-blas.cpp +10 -2
  79. data/ext/sources/ggml/src/ggml-cann/acl_tensor.cpp +1 -1
  80. data/ext/sources/ggml/src/ggml-cann/acl_tensor.h +1 -1
  81. data/ext/sources/ggml/src/ggml-cann/aclnn_ops.cpp +1016 -442
  82. data/ext/sources/ggml/src/ggml-cann/aclnn_ops.h +111 -85
  83. data/ext/sources/ggml/src/ggml-cann/common.h +23 -14
  84. data/ext/sources/ggml/src/ggml-cann/ggml-cann.cpp +255 -92
  85. data/ext/sources/ggml/src/ggml-common.h +22 -0
  86. data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +68 -34
  87. data/ext/sources/ggml/src/ggml-cpu/amx/amx.cpp +44 -19
  88. data/ext/sources/ggml/src/ggml-cpu/amx/common.h +34 -10
  89. data/ext/sources/ggml/src/ggml-cpu/amx/mmq.cpp +101 -101
  90. data/ext/sources/ggml/src/ggml-cpu/arch/arm/quants.c +194 -1
  91. data/ext/sources/ggml/src/ggml-cpu/arch/arm/repack.cpp +2874 -613
  92. data/ext/sources/ggml/src/ggml-cpu/arch/loongarch/quants.c +151 -1
  93. data/ext/sources/ggml/src/ggml-cpu/arch/powerpc/quants.c +0 -1
  94. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/quants.c +5480 -840
  95. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/repack.cpp +1361 -0
  96. data/ext/sources/ggml/src/ggml-cpu/arch/s390/quants.c +8 -11
  97. data/ext/sources/ggml/src/ggml-cpu/arch/wasm/quants.c +72 -1
  98. data/ext/sources/ggml/src/ggml-cpu/arch/x86/quants.c +186 -36
  99. data/ext/sources/ggml/src/ggml-cpu/arch/x86/repack.cpp +119 -19
  100. data/ext/sources/ggml/src/ggml-cpu/arch-fallback.h +112 -26
  101. data/ext/sources/ggml/src/ggml-cpu/binary-ops.cpp +2 -6
  102. data/ext/sources/ggml/src/ggml-cpu/cmake/FindSMTIME.cmake +32 -0
  103. data/ext/sources/ggml/src/ggml-cpu/common.h +8 -0
  104. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-impl.h +13 -0
  105. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +153 -16
  106. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.cpp +17 -0
  107. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.cpp +21 -20
  108. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +976 -251
  109. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +671 -266
  110. data/ext/sources/ggml/src/ggml-cpu/ops.cpp +1277 -263
  111. data/ext/sources/ggml/src/ggml-cpu/ops.h +4 -0
  112. data/ext/sources/ggml/src/ggml-cpu/quants.c +95 -0
  113. data/ext/sources/ggml/src/ggml-cpu/quants.h +6 -0
  114. data/ext/sources/ggml/src/ggml-cpu/repack.cpp +2893 -679
  115. data/ext/sources/ggml/src/ggml-cpu/repack.h +119 -8
  116. data/ext/sources/ggml/src/ggml-cpu/simd-gemm.h +226 -0
  117. data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +114 -19
  118. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.cpp +1402 -687
  119. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.h +8 -0
  120. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp +597 -2766
  121. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime2_kernels.cpp +5768 -0
  122. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime_env.cpp +320 -0
  123. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime_env.h +55 -0
  124. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime_kernels.h +182 -19
  125. data/ext/sources/ggml/src/ggml-cpu/spacemit/repack.cpp +1795 -0
  126. data/ext/sources/ggml/src/ggml-cpu/spacemit/repack.h +14 -0
  127. data/ext/sources/ggml/src/ggml-cpu/spacemit/rvv_kernels.cpp +3178 -0
  128. data/ext/sources/ggml/src/ggml-cpu/spacemit/rvv_kernels.h +95 -0
  129. data/ext/sources/ggml/src/ggml-cpu/spacemit/spine_barrier.h +34 -0
  130. data/ext/sources/ggml/src/ggml-cpu/spacemit/spine_mem_pool.cpp +760 -0
  131. data/ext/sources/ggml/src/ggml-cpu/spacemit/spine_mem_pool.h +32 -0
  132. data/ext/sources/ggml/src/ggml-cpu/spacemit/spine_tcm.h +409 -0
  133. data/ext/sources/ggml/src/ggml-cpu/unary-ops.cpp +1 -1
  134. data/ext/sources/ggml/src/ggml-cpu/vec.cpp +54 -53
  135. data/ext/sources/ggml/src/ggml-cpu/vec.h +225 -240
  136. data/ext/sources/ggml/src/ggml-cuda/CMakeLists.txt +18 -8
  137. data/ext/sources/ggml/src/ggml-cuda/allreduce.cu +971 -0
  138. data/ext/sources/ggml/src/ggml-cuda/allreduce.cuh +29 -0
  139. data/ext/sources/ggml/src/ggml-cuda/argsort.cu +73 -28
  140. data/ext/sources/ggml/src/ggml-cuda/binbcast.cu +69 -41
  141. data/ext/sources/ggml/src/ggml-cuda/binbcast.cuh +1 -0
  142. data/ext/sources/ggml/src/ggml-cuda/common.cuh +359 -29
  143. data/ext/sources/ggml/src/ggml-cuda/concat.cu +120 -114
  144. data/ext/sources/ggml/src/ggml-cuda/conv2d-transpose.cu +45 -21
  145. data/ext/sources/ggml/src/ggml-cuda/conv2d-transpose.cuh +1 -0
  146. data/ext/sources/ggml/src/ggml-cuda/convert.cu +94 -27
  147. data/ext/sources/ggml/src/ggml-cuda/convert.cuh +10 -0
  148. data/ext/sources/ggml/src/ggml-cuda/cpy.cu +20 -9
  149. data/ext/sources/ggml/src/ggml-cuda/dequantize.cuh +22 -0
  150. data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +333 -85
  151. data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +632 -190
  152. data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cu +12 -0
  153. data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cuh +162 -49
  154. data/ext/sources/ggml/src/ggml-cuda/fattn-vec.cuh +43 -18
  155. data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +44 -14
  156. data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cuh +1 -1
  157. data/ext/sources/ggml/src/ggml-cuda/fattn.cu +241 -23
  158. data/ext/sources/ggml/src/ggml-cuda/fattn.cuh +2 -0
  159. data/ext/sources/ggml/src/ggml-cuda/fwht.cu +101 -0
  160. data/ext/sources/ggml/src/ggml-cuda/fwht.cuh +4 -0
  161. data/ext/sources/ggml/src/ggml-cuda/gated_delta_net.cu +312 -0
  162. data/ext/sources/ggml/src/ggml-cuda/gated_delta_net.cuh +4 -0
  163. data/ext/sources/ggml/src/ggml-cuda/getrows.cu +34 -12
  164. data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +1454 -599
  165. data/ext/sources/ggml/src/ggml-cuda/im2col.cu +32 -29
  166. data/ext/sources/ggml/src/ggml-cuda/mean.cu +13 -10
  167. data/ext/sources/ggml/src/ggml-cuda/mma.cuh +397 -183
  168. data/ext/sources/ggml/src/ggml-cuda/mmf.cu +30 -10
  169. data/ext/sources/ggml/src/ggml-cuda/mmf.cuh +161 -88
  170. data/ext/sources/ggml/src/ggml-cuda/mmq.cu +18 -12
  171. data/ext/sources/ggml/src/ggml-cuda/mmq.cuh +522 -431
  172. data/ext/sources/ggml/src/ggml-cuda/mmvf.cu +139 -72
  173. data/ext/sources/ggml/src/ggml-cuda/mmvf.cuh +2 -0
  174. data/ext/sources/ggml/src/ggml-cuda/mmvq.cu +608 -88
  175. data/ext/sources/ggml/src/ggml-cuda/mmvq.cuh +6 -0
  176. data/ext/sources/ggml/src/ggml-cuda/norm.cu +47 -79
  177. data/ext/sources/ggml/src/ggml-cuda/out-prod.cu +23 -7
  178. data/ext/sources/ggml/src/ggml-cuda/pad.cu +13 -10
  179. data/ext/sources/ggml/src/ggml-cuda/quantize.cu +134 -27
  180. data/ext/sources/ggml/src/ggml-cuda/quantize.cuh +1 -1
  181. data/ext/sources/ggml/src/ggml-cuda/reduce_rows.cuh +7 -17
  182. data/ext/sources/ggml/src/ggml-cuda/rope.cu +244 -137
  183. data/ext/sources/ggml/src/ggml-cuda/scale.cu +4 -1
  184. data/ext/sources/ggml/src/ggml-cuda/set-rows.cu +14 -6
  185. data/ext/sources/ggml/src/ggml-cuda/snake.cu +72 -0
  186. data/ext/sources/ggml/src/ggml-cuda/snake.cuh +8 -0
  187. data/ext/sources/ggml/src/ggml-cuda/softcap.cu +4 -1
  188. data/ext/sources/ggml/src/ggml-cuda/softmax.cu +8 -83
  189. data/ext/sources/ggml/src/ggml-cuda/solve_tri.cu +1 -1
  190. data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cu +96 -40
  191. data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cuh +1 -1
  192. data/ext/sources/ggml/src/ggml-cuda/ssm-scan.cu +40 -18
  193. data/ext/sources/ggml/src/ggml-cuda/sumrows.cu +8 -4
  194. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_16.cu +1 -0
  195. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_32.cu +6 -0
  196. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_8.cu +2 -0
  197. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_4.cu +2 -0
  198. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_16.cu +1 -0
  199. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_32.cu +6 -0
  200. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_4.cu +2 -0
  201. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_8.cu +2 -0
  202. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_16.cu +1 -0
  203. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_4.cu +2 -0
  204. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_8.cu +2 -0
  205. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_4.cu +2 -0
  206. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_8.cu +2 -0
  207. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq192-dv128.cu +5 -0
  208. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq320-dv256.cu +5 -0
  209. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq512-dv512.cu +5 -0
  210. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-bf16.cu +7 -0
  211. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-f16.cu +7 -0
  212. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q4_0.cu +7 -0
  213. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q4_1.cu +7 -0
  214. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q5_0.cu +7 -0
  215. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q5_1.cu +7 -0
  216. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q8_0.cu +7 -0
  217. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-bf16.cu +7 -0
  218. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-bf16.cu +7 -0
  219. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-bf16.cu +7 -0
  220. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-bf16.cu +7 -0
  221. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-bf16.cu +7 -0
  222. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-bf16.cu +7 -0
  223. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-nvfp4.cu +5 -0
  224. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q1_0.cu +5 -0
  225. data/ext/sources/ggml/src/ggml-cuda/top-k.cu +5 -5
  226. data/ext/sources/ggml/src/ggml-cuda/topk-moe.cu +202 -135
  227. data/ext/sources/ggml/src/ggml-cuda/topk-moe.cuh +20 -14
  228. data/ext/sources/ggml/src/ggml-cuda/unary.cu +86 -2
  229. data/ext/sources/ggml/src/ggml-cuda/unary.cuh +4 -0
  230. data/ext/sources/ggml/src/ggml-cuda/vecdotq.cuh +111 -17
  231. data/ext/sources/ggml/src/ggml-cuda/vendors/cuda.h +7 -2
  232. data/ext/sources/ggml/src/ggml-cuda/vendors/hip.h +30 -2
  233. data/ext/sources/ggml/src/ggml-cuda/vendors/musa.h +3 -0
  234. data/ext/sources/ggml/src/ggml-hexagon/CMakeLists.txt +84 -46
  235. data/ext/sources/ggml/src/ggml-hexagon/ggml-hexagon.cpp +1612 -753
  236. data/ext/sources/ggml/src/ggml-hexagon/htp/CMakeLists.txt +51 -11
  237. data/ext/sources/ggml/src/ggml-hexagon/htp/act-ops.c +361 -261
  238. data/ext/sources/ggml/src/ggml-hexagon/htp/argsort-ops.c +294 -0
  239. data/ext/sources/ggml/src/ggml-hexagon/htp/binary-ops.c +753 -241
  240. data/ext/sources/ggml/src/ggml-hexagon/htp/cmake-toolchain.cmake +5 -5
  241. data/ext/sources/ggml/src/ggml-hexagon/htp/concat-ops.c +277 -0
  242. data/ext/sources/ggml/src/ggml-hexagon/htp/cpy-ops.c +295 -0
  243. data/ext/sources/ggml/src/ggml-hexagon/htp/cumsum-ops.c +270 -0
  244. data/ext/sources/ggml/src/ggml-hexagon/htp/diag-ops.c +216 -0
  245. data/ext/sources/ggml/src/ggml-hexagon/htp/fill-ops.c +123 -0
  246. data/ext/sources/ggml/src/ggml-hexagon/htp/flash-attn-ops.c +471 -296
  247. data/ext/sources/ggml/src/ggml-hexagon/htp/gated-delta-net-ops.c +1148 -0
  248. data/ext/sources/ggml/src/ggml-hexagon/htp/get-rows-ops.c +159 -53
  249. data/ext/sources/ggml/src/ggml-hexagon/htp/{htp-dma.c → hex-dma.c} +3 -3
  250. data/ext/sources/ggml/src/ggml-hexagon/htp/hex-dma.h +372 -0
  251. data/ext/sources/ggml/src/ggml-hexagon/htp/hex-dump.h +86 -0
  252. data/ext/sources/ggml/src/ggml-hexagon/htp/hex-fastdiv.h +37 -0
  253. data/ext/sources/ggml/src/ggml-hexagon/htp/hex-utils.h +137 -0
  254. data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-flash-attn-ops.c +1878 -0
  255. data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c +2066 -0
  256. data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-ops.c +6 -0
  257. data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-ops.h +88 -0
  258. data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-profile.h +34 -0
  259. data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-queue.c +158 -0
  260. data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-queue.h +134 -0
  261. data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-utils.h +200 -0
  262. data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ctx.h +97 -14
  263. data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ops.h +163 -67
  264. data/ext/sources/ggml/src/ggml-hexagon/htp/htp_iface.idl +9 -3
  265. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-arith.h +443 -0
  266. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-base.h +308 -0
  267. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-copy.h +262 -0
  268. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-div.h +291 -0
  269. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-dump.h +129 -0
  270. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-exp.h +216 -0
  271. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-flash-attn.h +47 -0
  272. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-floor.h +100 -0
  273. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-inverse.h +210 -0
  274. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-log.h +65 -0
  275. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-pow.h +42 -0
  276. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-reduce.h +296 -0
  277. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-repl.h +74 -0
  278. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-scale.h +133 -0
  279. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sigmoid.h +142 -0
  280. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sin-cos.h +90 -0
  281. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sqrt.h +126 -0
  282. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-types.h +36 -0
  283. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-utils.h +18 -1348
  284. data/ext/sources/ggml/src/ggml-hexagon/htp/main.c +547 -635
  285. data/ext/sources/ggml/src/ggml-hexagon/htp/matmul-ops.c +3556 -1101
  286. data/ext/sources/ggml/src/ggml-hexagon/htp/pad-ops.c +547 -0
  287. data/ext/sources/ggml/src/ggml-hexagon/htp/repeat-ops.c +148 -0
  288. data/ext/sources/ggml/src/ggml-hexagon/htp/rope-ops.c +475 -269
  289. data/ext/sources/ggml/src/ggml-hexagon/htp/set-rows-ops.c +94 -72
  290. data/ext/sources/ggml/src/ggml-hexagon/htp/softmax-ops.c +222 -217
  291. data/ext/sources/ggml/src/ggml-hexagon/htp/solve-tri-ops.c +267 -0
  292. data/ext/sources/ggml/src/ggml-hexagon/htp/ssm-conv.c +432 -0
  293. data/ext/sources/ggml/src/ggml-hexagon/htp/sum-rows-ops.c +128 -0
  294. data/ext/sources/ggml/src/ggml-hexagon/htp/unary-ops.c +886 -117
  295. data/ext/sources/ggml/src/ggml-hexagon/htp/vtcm-utils.h +16 -0
  296. data/ext/sources/ggml/src/ggml-hexagon/htp/worker-pool.c +1 -5
  297. data/ext/sources/ggml/src/ggml-hexagon/htp-drv.cpp +418 -0
  298. data/ext/sources/ggml/src/ggml-hexagon/htp-drv.h +121 -0
  299. data/ext/sources/ggml/src/ggml-hexagon/htp-opnode.h +272 -0
  300. data/ext/sources/ggml/src/ggml-hexagon/libdl.h +79 -0
  301. data/ext/sources/ggml/src/ggml-hexagon/libggml-htp.inf +40 -0
  302. data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +28 -9
  303. data/ext/sources/ggml/src/ggml-impl.h +68 -1
  304. data/ext/sources/ggml/src/ggml-metal/CMakeLists.txt +10 -10
  305. data/ext/sources/ggml/src/ggml-metal/ggml-metal-common.cpp +13 -2
  306. data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.h +8 -0
  307. data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.m +147 -17
  308. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.cpp +409 -83
  309. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.h +54 -5
  310. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.m +254 -52
  311. data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +254 -23
  312. data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.cpp +756 -285
  313. data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.h +7 -4
  314. data/ext/sources/ggml/src/ggml-metal/ggml-metal.cpp +359 -133
  315. data/ext/sources/ggml/src/ggml-metal/ggml-metal.metal +1867 -1123
  316. data/ext/sources/ggml/src/ggml-musa/CMakeLists.txt +5 -6
  317. data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +71 -4
  318. data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +14127 -5314
  319. data/ext/sources/ggml/src/ggml-opencl/kernels/concat.cl +97 -88
  320. data/ext/sources/ggml/src/ggml-opencl/kernels/cpy.cl +104 -0
  321. data/ext/sources/ggml/src/ggml-opencl/kernels/cumsum.cl +139 -0
  322. data/ext/sources/ggml/src/ggml-opencl/kernels/cvt.cl +1978 -67
  323. data/ext/sources/ggml/src/ggml-opencl/kernels/diag.cl +27 -0
  324. data/ext/sources/ggml/src/ggml-opencl/kernels/exp.cl +125 -0
  325. data/ext/sources/ggml/src/ggml-opencl/kernels/expm1.cl +87 -56
  326. data/ext/sources/ggml/src/ggml-opencl/kernels/gated_delta_net.cl +249 -0
  327. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_mxfp4_f32_ns.cl +306 -0
  328. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q4_0_f32_ns.cl +256 -0
  329. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q4_1_f32_ns.cl +258 -0
  330. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q4_k_f32_ns.cl +283 -0
  331. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q5_0_f32_ns.cl +260 -0
  332. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q5_1_f32_ns.cl +262 -0
  333. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q5_k_f32_ns.cl +288 -0
  334. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q6_k_f32_ns.cl +267 -0
  335. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_iq4_nl_f32.cl +150 -0
  336. data/ext/sources/ggml/src/ggml-opencl/kernels/{mul_mat_Ab_Bi_8x4.cl → gemm_noshuffle_q4_0_f32.cl} +1 -1
  337. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q4_1_f32.cl +132 -0
  338. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q4_k_f32.cl +172 -0
  339. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q5_0_f32.cl +131 -0
  340. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q5_1_f32.cl +134 -0
  341. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q5_k_f32.cl +176 -0
  342. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q6_k_f32.cl +140 -0
  343. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q8_0_f32.cl +129 -0
  344. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_xmem_f16_f32_os8.cl +233 -0
  345. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_mxfp4_f32_ns.cl +165 -0
  346. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q4_0_f32_ns.cl +120 -0
  347. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q4_1_f32_ns.cl +123 -0
  348. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q4_k_f32_ns.cl +155 -0
  349. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q5_0_f32_ns.cl +123 -0
  350. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q5_1_f32_ns.cl +125 -0
  351. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q5_k_f32_ns.cl +160 -0
  352. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q6_k_f32_ns.cl +141 -0
  353. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_iq4_nl_f32.cl +302 -0
  354. data/ext/sources/ggml/src/ggml-opencl/kernels/{gemv_noshuffle_general.cl → gemv_noshuffle_q4_0_f32.cl} +5 -5
  355. data/ext/sources/ggml/src/ggml-opencl/kernels/{gemv_noshuffle.cl → gemv_noshuffle_q4_0_f32_spec.cl} +5 -5
  356. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q4_1_f32.cl +283 -0
  357. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q4_k_f32.cl +318 -0
  358. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q5_0_f32.cl +291 -0
  359. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q5_1_f32.cl +294 -0
  360. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q5_k_f32.cl +326 -0
  361. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q6_k_f32.cl +293 -0
  362. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q8_0_f32.cl +195 -0
  363. data/ext/sources/ggml/src/ggml-opencl/kernels/get_rows.cl +15 -9
  364. data/ext/sources/ggml/src/ggml-opencl/kernels/l2_norm.cl +71 -0
  365. data/ext/sources/ggml/src/ggml-opencl/kernels/mean.cl +114 -13
  366. data/ext/sources/ggml/src/ggml-opencl/kernels/moe_reorder_b.cl +30 -0
  367. data/ext/sources/ggml/src/ggml-opencl/kernels/moe_sort_by_expert.cl +82 -0
  368. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_iq4_nl_f32_l4_lm.cl +171 -0
  369. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q4_0_f32_l4_lm.cl +163 -0
  370. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q4_1_f32_l4_lm.cl +165 -0
  371. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q4_k_f32_l4_lm.cl +179 -0
  372. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q5_0_f32_l4_lm.cl +173 -0
  373. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q5_1_f32_l4_lm.cl +175 -0
  374. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q5_k_f32_l4_lm.cl +192 -0
  375. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q6_k_f32_l4_lm.cl +158 -0
  376. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_iq4_nl_f32.cl +164 -0
  377. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_iq4_nl_f32_flat.cl +202 -0
  378. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32.cl +219 -0
  379. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32_flat.cl +229 -0
  380. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_k_f32.cl +180 -0
  381. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_k_f32_flat.cl +196 -0
  382. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_0_f32.cl +241 -0
  383. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_0_f32_flat.cl +243 -0
  384. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_1_f32.cl +243 -0
  385. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_1_f32_flat.cl +247 -0
  386. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_k_f32.cl +187 -0
  387. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_k_f32_flat.cl +203 -0
  388. data/ext/sources/ggml/src/ggml-opencl/kernels/{mul_mv_q6_k.cl → mul_mv_q6_k_f32.cl} +4 -0
  389. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32_flat.cl +178 -0
  390. data/ext/sources/ggml/src/ggml-opencl/kernels/neg.cl +125 -0
  391. data/ext/sources/ggml/src/ggml-opencl/kernels/repeat.cl +31 -32
  392. data/ext/sources/ggml/src/ggml-opencl/kernels/scale.cl +14 -4
  393. data/ext/sources/ggml/src/ggml-opencl/kernels/softplus.cl +88 -60
  394. data/ext/sources/ggml/src/ggml-opencl/kernels/solve_tri.cl +51 -0
  395. data/ext/sources/ggml/src/ggml-opencl/kernels/sum_rows.cl +114 -13
  396. data/ext/sources/ggml/src/ggml-opencl/kernels/tanh.cl +94 -48
  397. data/ext/sources/ggml/src/ggml-opencl/kernels/transpose.cl +26 -0
  398. data/ext/sources/ggml/src/ggml-opencl/kernels/tri.cl +32 -0
  399. data/ext/sources/ggml/src/ggml-openvino/.clang-format +154 -0
  400. data/ext/sources/ggml/src/ggml-openvino/CMakeLists.txt +22 -0
  401. data/ext/sources/ggml/src/ggml-openvino/ggml-decoder.cpp +985 -0
  402. data/ext/sources/ggml/src/ggml-openvino/ggml-decoder.h +294 -0
  403. data/ext/sources/ggml/src/ggml-openvino/ggml-openvino-extra.cpp +380 -0
  404. data/ext/sources/ggml/src/ggml-openvino/ggml-openvino-extra.h +182 -0
  405. data/ext/sources/ggml/src/ggml-openvino/ggml-openvino.cpp +1132 -0
  406. data/ext/sources/ggml/src/ggml-openvino/ggml-quants.cpp +956 -0
  407. data/ext/sources/ggml/src/ggml-openvino/ggml-quants.h +153 -0
  408. data/ext/sources/ggml/src/ggml-openvino/openvino/decoder.h +74 -0
  409. data/ext/sources/ggml/src/ggml-openvino/openvino/frontend.cpp +27 -0
  410. data/ext/sources/ggml/src/ggml-openvino/openvino/frontend.h +23 -0
  411. data/ext/sources/ggml/src/ggml-openvino/openvino/input_model.cpp +17 -0
  412. data/ext/sources/ggml/src/ggml-openvino/openvino/input_model.h +29 -0
  413. data/ext/sources/ggml/src/ggml-openvino/openvino/node_context.h +112 -0
  414. data/ext/sources/ggml/src/ggml-openvino/openvino/op/cont.cpp +48 -0
  415. data/ext/sources/ggml/src/ggml-openvino/openvino/op/cpy.cpp +21 -0
  416. data/ext/sources/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp +90 -0
  417. data/ext/sources/ggml/src/ggml-openvino/openvino/op/get_rows.cpp +69 -0
  418. data/ext/sources/ggml/src/ggml-openvino/openvino/op/glu_geglu.cpp +61 -0
  419. data/ext/sources/ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp +62 -0
  420. data/ext/sources/ggml/src/ggml-openvino/openvino/op/mulmat.cpp +90 -0
  421. data/ext/sources/ggml/src/ggml-openvino/openvino/op/permute.cpp +102 -0
  422. data/ext/sources/ggml/src/ggml-openvino/openvino/op/reshape.cpp +83 -0
  423. data/ext/sources/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp +46 -0
  424. data/ext/sources/ggml/src/ggml-openvino/openvino/op/rope.cpp +149 -0
  425. data/ext/sources/ggml/src/ggml-openvino/openvino/op/scale.cpp +41 -0
  426. data/ext/sources/ggml/src/ggml-openvino/openvino/op/set_rows.cpp +76 -0
  427. data/ext/sources/ggml/src/ggml-openvino/openvino/op/softmax.cpp +89 -0
  428. data/ext/sources/ggml/src/ggml-openvino/openvino/op/transpose.cpp +23 -0
  429. data/ext/sources/ggml/src/ggml-openvino/openvino/op/unary_gelu.cpp +25 -0
  430. data/ext/sources/ggml/src/ggml-openvino/openvino/op/unary_silu.cpp +27 -0
  431. data/ext/sources/ggml/src/ggml-openvino/openvino/op/view.cpp +53 -0
  432. data/ext/sources/ggml/src/ggml-openvino/openvino/op_table.cpp +47 -0
  433. data/ext/sources/ggml/src/ggml-openvino/openvino/op_table.h +40 -0
  434. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.cpp +60 -0
  435. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.h +17 -0
  436. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/mark_decompression_convert_constant_folding.h +29 -0
  437. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/squeeze_matmul.cpp +58 -0
  438. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/squeeze_matmul.h +17 -0
  439. data/ext/sources/ggml/src/ggml-openvino/openvino/rt_info/weightless_caching_attributes.hpp +41 -0
  440. data/ext/sources/ggml/src/ggml-openvino/openvino/translate_session.cpp +317 -0
  441. data/ext/sources/ggml/src/ggml-openvino/openvino/translate_session.h +28 -0
  442. data/ext/sources/ggml/src/ggml-openvino/openvino/utils.cpp +257 -0
  443. data/ext/sources/ggml/src/ggml-openvino/openvino/utils.h +86 -0
  444. data/ext/sources/ggml/src/ggml-openvino/utils.cpp +880 -0
  445. data/ext/sources/ggml/src/ggml-openvino/utils.h +143 -0
  446. data/ext/sources/ggml/src/ggml-opt.cpp +1 -0
  447. data/ext/sources/ggml/src/ggml-quants.c +385 -119
  448. data/ext/sources/ggml/src/ggml-quants.h +6 -0
  449. data/ext/sources/ggml/src/ggml-rpc/CMakeLists.txt +24 -0
  450. data/ext/sources/ggml/src/ggml-rpc/ggml-rpc.cpp +167 -311
  451. data/ext/sources/ggml/src/ggml-rpc/transport.cpp +683 -0
  452. data/ext/sources/ggml/src/ggml-rpc/transport.h +34 -0
  453. data/ext/sources/ggml/src/ggml-sycl/CMakeLists.txt +64 -91
  454. data/ext/sources/ggml/src/ggml-sycl/add-id.cpp +5 -1
  455. data/ext/sources/ggml/src/ggml-sycl/backend.hpp +4 -1
  456. data/ext/sources/ggml/src/ggml-sycl/binbcast.cpp +21 -20
  457. data/ext/sources/ggml/src/ggml-sycl/common.cpp +74 -2
  458. data/ext/sources/ggml/src/ggml-sycl/common.hpp +356 -11
  459. data/ext/sources/ggml/src/ggml-sycl/convert.cpp +184 -14
  460. data/ext/sources/ggml/src/ggml-sycl/convert.hpp +31 -1
  461. data/ext/sources/ggml/src/ggml-sycl/count-equal.cpp +1 -1
  462. data/ext/sources/ggml/src/ggml-sycl/cumsum.cpp +148 -0
  463. data/ext/sources/ggml/src/ggml-sycl/cumsum.hpp +5 -0
  464. data/ext/sources/ggml/src/ggml-sycl/dequantize.hpp +663 -0
  465. data/ext/sources/ggml/src/ggml-sycl/diag.cpp +67 -0
  466. data/ext/sources/ggml/src/ggml-sycl/diag.hpp +5 -0
  467. data/ext/sources/ggml/src/ggml-sycl/dmmv.cpp +586 -6
  468. data/ext/sources/ggml/src/ggml-sycl/dpct/helper.hpp +791 -47
  469. data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +77 -156
  470. data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +2 -2
  471. data/ext/sources/ggml/src/ggml-sycl/fattn-buffers.cpp +56 -0
  472. data/ext/sources/ggml/src/ggml-sycl/fattn-buffers.hpp +63 -0
  473. data/ext/sources/ggml/src/ggml-sycl/fattn-common.hpp +1181 -0
  474. data/ext/sources/ggml/src/ggml-sycl/fattn-tile.cpp +59 -0
  475. data/ext/sources/ggml/src/ggml-sycl/fattn-tile.hpp +1246 -0
  476. data/ext/sources/ggml/src/ggml-sycl/fattn-vec.hpp +674 -0
  477. data/ext/sources/ggml/src/ggml-sycl/fattn.cpp +227 -0
  478. data/ext/sources/ggml/src/ggml-sycl/fattn.hpp +22 -0
  479. data/ext/sources/ggml/src/ggml-sycl/fill.cpp +55 -0
  480. data/ext/sources/ggml/src/ggml-sycl/fill.hpp +5 -0
  481. data/ext/sources/ggml/src/ggml-sycl/gated_delta_net.cpp +347 -0
  482. data/ext/sources/ggml/src/ggml-sycl/gated_delta_net.hpp +9 -0
  483. data/ext/sources/ggml/src/ggml-sycl/gemm.hpp +3 -0
  484. data/ext/sources/ggml/src/ggml-sycl/getrows.cpp +79 -3
  485. data/ext/sources/ggml/src/ggml-sycl/ggml-sycl.cpp +1134 -236
  486. data/ext/sources/ggml/src/ggml-sycl/im2col.cpp +353 -89
  487. data/ext/sources/ggml/src/ggml-sycl/im2col.hpp +5 -3
  488. data/ext/sources/ggml/src/ggml-sycl/mmvq.cpp +1344 -26
  489. data/ext/sources/ggml/src/ggml-sycl/mmvq.hpp +16 -0
  490. data/ext/sources/ggml/src/ggml-sycl/norm.cpp +65 -66
  491. data/ext/sources/ggml/src/ggml-sycl/outprod.cpp +3 -3
  492. data/ext/sources/ggml/src/ggml-sycl/pad.cpp +27 -27
  493. data/ext/sources/ggml/src/ggml-sycl/presets.hpp +3 -0
  494. data/ext/sources/ggml/src/ggml-sycl/quants.hpp +72 -1
  495. data/ext/sources/ggml/src/ggml-sycl/rope.cpp +450 -287
  496. data/ext/sources/ggml/src/ggml-sycl/rope.hpp +6 -0
  497. data/ext/sources/ggml/src/ggml-sycl/set_rows.cpp +7 -1
  498. data/ext/sources/ggml/src/ggml-sycl/softmax.cpp +6 -6
  499. data/ext/sources/ggml/src/ggml-sycl/solve_tri.cpp +172 -0
  500. data/ext/sources/ggml/src/ggml-sycl/solve_tri.hpp +8 -0
  501. data/ext/sources/ggml/src/ggml-sycl/ssm_conv.cpp +6 -1
  502. data/ext/sources/ggml/src/ggml-sycl/ssm_scan.cpp +156 -0
  503. data/ext/sources/ggml/src/ggml-sycl/ssm_scan.hpp +5 -0
  504. data/ext/sources/ggml/src/ggml-sycl/sycl_hw.cpp +62 -10
  505. data/ext/sources/ggml/src/ggml-sycl/sycl_hw.hpp +18 -6
  506. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq112-dv112.cpp +5 -0
  507. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq128-dv128.cpp +5 -0
  508. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq256-dv256.cpp +5 -0
  509. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq40-dv40.cpp +5 -0
  510. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq512-dv512.cpp +6 -0
  511. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq576-dv512.cpp +5 -0
  512. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq64-dv64.cpp +5 -0
  513. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq72-dv72.cpp +5 -0
  514. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq80-dv80.cpp +5 -0
  515. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq96-dv96.cpp +5 -0
  516. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-f16.cpp +8 -0
  517. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_0.cpp +8 -0
  518. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_1.cpp +8 -0
  519. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_0.cpp +8 -0
  520. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_1.cpp +8 -0
  521. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q8_0.cpp +8 -0
  522. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-f16.cpp +8 -0
  523. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_0.cpp +8 -0
  524. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_1.cpp +8 -0
  525. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_0.cpp +8 -0
  526. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_1.cpp +8 -0
  527. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q8_0.cpp +8 -0
  528. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-f16.cpp +8 -0
  529. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_0.cpp +8 -0
  530. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_1.cpp +8 -0
  531. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_0.cpp +8 -0
  532. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_1.cpp +8 -0
  533. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q8_0.cpp +8 -0
  534. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-f16.cpp +8 -0
  535. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_0.cpp +8 -0
  536. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_1.cpp +8 -0
  537. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_0.cpp +8 -0
  538. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_1.cpp +8 -0
  539. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q8_0.cpp +8 -0
  540. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-f16.cpp +8 -0
  541. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_0.cpp +8 -0
  542. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_1.cpp +8 -0
  543. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_0.cpp +8 -0
  544. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_1.cpp +8 -0
  545. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q8_0.cpp +8 -0
  546. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-f16.cpp +8 -0
  547. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_0.cpp +8 -0
  548. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_1.cpp +8 -0
  549. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_0.cpp +8 -0
  550. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_1.cpp +8 -0
  551. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q8_0.cpp +8 -0
  552. data/ext/sources/ggml/src/ggml-sycl/type.hpp +112 -0
  553. data/ext/sources/ggml/src/ggml-sycl/upscale.cpp +410 -0
  554. data/ext/sources/ggml/src/ggml-sycl/upscale.hpp +9 -0
  555. data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +228 -53
  556. data/ext/sources/ggml/src/ggml-sycl/wkv.cpp +1 -1
  557. data/ext/sources/ggml/src/ggml-virtgpu/CMakeLists.txt +70 -0
  558. data/ext/sources/ggml/src/ggml-virtgpu/apir_cs_ggml-rpc-front.cpp +87 -0
  559. data/ext/sources/ggml/src/ggml-virtgpu/backend/CMakeLists.txt +21 -0
  560. data/ext/sources/ggml/src/ggml-virtgpu/backend/apir_cs_ggml-rpc-back.cpp +115 -0
  561. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-convert.h +13 -0
  562. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-backend.cpp +102 -0
  563. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer-type.cpp +105 -0
  564. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer.cpp +179 -0
  565. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-device.cpp +148 -0
  566. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched.cpp +51 -0
  567. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched.gen.h +73 -0
  568. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched.h +27 -0
  569. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-virgl-apir.h +32 -0
  570. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend.cpp +144 -0
  571. data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/api_remoting.h +95 -0
  572. data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_backend.gen.h +94 -0
  573. data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_backend.h +50 -0
  574. data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_cs.h +378 -0
  575. data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_cs_ggml.h +232 -0
  576. data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_cs_rpc.h +58 -0
  577. data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-buffer-type.cpp +81 -0
  578. data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-buffer.cpp +123 -0
  579. data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-device.cpp +160 -0
  580. data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-reg.cpp +213 -0
  581. data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend.cpp +71 -0
  582. data/ext/sources/ggml/src/ggml-virtgpu/ggml-remoting.h +71 -0
  583. data/ext/sources/ggml/src/ggml-virtgpu/ggmlremoting_functions.yaml +166 -0
  584. data/ext/sources/ggml/src/ggml-virtgpu/include/apir_hw.h +9 -0
  585. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-apir.h +15 -0
  586. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-backend.cpp +58 -0
  587. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-buffer-type.cpp +110 -0
  588. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-buffer.cpp +173 -0
  589. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-device.cpp +192 -0
  590. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-impl.h +36 -0
  591. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward.gen.h +53 -0
  592. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-shm.cpp +99 -0
  593. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-shm.h +23 -0
  594. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-utils.cpp +179 -0
  595. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-utils.h +86 -0
  596. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu.cpp +545 -0
  597. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu.h +115 -0
  598. data/ext/sources/ggml/src/ggml-vulkan/CMakeLists.txt +12 -1
  599. data/ext/sources/ggml/src/ggml-vulkan/ggml-vulkan.cpp +3250 -940
  600. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -0
  601. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/acc.comp +16 -8
  602. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/contig_copy.comp +6 -2
  603. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +146 -13
  604. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy.comp +3 -1
  605. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +1 -1
  606. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +25 -1
  607. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.glsl +88 -0
  608. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.glsl +643 -1
  609. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_nvfp4.comp +32 -0
  610. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q1_0.comp +29 -0
  611. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/diag.comp +0 -1
  612. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dot_product_funcs.glsl +27 -0
  613. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/elu.comp +27 -0
  614. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/exp.comp +0 -1
  615. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/feature-tests/coopmat2_decode_vector.comp +7 -0
  616. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +533 -180
  617. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.glsl +113 -68
  618. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +412 -222
  619. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +222 -83
  620. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_dequant.glsl +131 -0
  621. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_mask_opt.comp +162 -0
  622. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_mmq_funcs.glsl +203 -0
  623. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +9 -8
  624. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/fwht.comp +115 -0
  625. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gated_delta_net.comp +189 -0
  626. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/generic_binary_head.glsl +0 -1
  627. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.glsl +10 -1
  628. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_main.glsl +16 -6
  629. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +76 -54
  630. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col_3d.comp +0 -1
  631. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/l2_norm.comp +12 -9
  632. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/log.comp +0 -1
  633. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp +122 -27
  634. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.glsl +20 -17
  635. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iface.glsl +6 -6
  636. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp +1 -1
  637. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp +1 -1
  638. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp +1 -1
  639. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp +1 -0
  640. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq_funcs.glsl +88 -55
  641. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +22 -20
  642. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +51 -14
  643. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.glsl +159 -125
  644. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_id_funcs.glsl +3 -1
  645. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +5 -3
  646. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.glsl +8 -8
  647. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_shmem_types.glsl +24 -9
  648. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp +0 -1
  649. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +2 -3
  650. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl +39 -63
  651. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.glsl +0 -1
  652. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +7 -4
  653. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +7 -4
  654. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +7 -4
  655. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl +13 -7
  656. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_vision.comp +7 -4
  657. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sgn.comp +21 -0
  658. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/snake.comp +49 -0
  659. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/ssm_conv.comp +27 -11
  660. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/tri.comp +0 -1
  661. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/types.glsl +79 -2
  662. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +193 -149
  663. data/ext/sources/ggml/src/ggml-webgpu/CMakeLists.txt +5 -2
  664. data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp +3221 -97
  665. data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu.cpp +3493 -1997
  666. data/ext/sources/ggml/src/ggml-webgpu/pre_wgsl.hpp +37 -7
  667. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/add_id.wgsl +64 -0
  668. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/argmax.wgsl +72 -0
  669. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/argsort.wgsl +106 -0
  670. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/argsort_merge.wgsl +134 -0
  671. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary.wgsl +142 -0
  672. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +115 -141
  673. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/concat.wgsl +93 -0
  674. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/conv2d.wgsl +165 -0
  675. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{cpy.tmpl.wgsl → cpy.wgsl} +25 -44
  676. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cumsum.wgsl +66 -0
  677. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn.wgsl +198 -230
  678. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_quant_staging.tmpl +124 -0
  679. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_tile.wgsl +397 -0
  680. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_vec_blk.wgsl +101 -0
  681. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_vec_reduce.wgsl +84 -0
  682. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_vec_split.wgsl +619 -0
  683. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/gated_delta_net.wgsl +149 -0
  684. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{get_rows.tmpl.wgsl → get_rows.wgsl} +234 -335
  685. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/glu.wgsl +155 -0
  686. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/im2col.wgsl +101 -0
  687. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl +871 -42
  688. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_id.wgsl +195 -0
  689. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_id_gather.wgsl +52 -0
  690. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_id_vec.wgsl +154 -0
  691. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.wgsl +149 -0
  692. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{mul_mat_subgroup_matrix.tmpl.wgsl → mul_mat_subgroup_matrix.wgsl} +36 -138
  693. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.wgsl +151 -0
  694. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec_acc.tmpl +1432 -0
  695. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec_q_acc.tmpl +303 -0
  696. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/pad.wgsl +86 -0
  697. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/quant_inner_loops.tmpl +21 -0
  698. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/quantize_q8.wgsl +173 -0
  699. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/repeat.wgsl +67 -0
  700. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm_mul.wgsl +152 -0
  701. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{rope.tmpl.wgsl → rope.wgsl} +71 -142
  702. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/row_norm.wgsl +153 -0
  703. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{scale.tmpl.wgsl → scale.wgsl} +15 -40
  704. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set.wgsl +109 -0
  705. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +39 -12
  706. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows_quant.wgsl +224 -0
  707. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{soft_max.tmpl.wgsl → soft_max.wgsl} +106 -206
  708. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/solve_tri.wgsl +121 -0
  709. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/ssm_conv.wgsl +65 -0
  710. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/ssm_scan.wgsl +193 -0
  711. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/sum_rows.wgsl +55 -0
  712. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/unary.wgsl +213 -0
  713. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/upscale.wgsl +240 -0
  714. data/ext/sources/ggml/src/ggml-zdnn/ggml-zdnn.cpp +24 -15
  715. data/ext/sources/ggml/src/ggml-zendnn/CMakeLists.txt +31 -32
  716. data/ext/sources/ggml/src/ggml-zendnn/ggml-zendnn.cpp +253 -16
  717. data/ext/sources/ggml/src/ggml.c +268 -52
  718. data/ext/sources/ggml/src/gguf.cpp +377 -47
  719. data/ext/sources/include/parakeet.h +342 -0
  720. data/ext/sources/include/whisper.h +10 -0
  721. data/ext/sources/media/matmul.png +0 -0
  722. data/ext/sources/src/CMakeLists.txt +23 -0
  723. data/ext/sources/src/parakeet-arch.h +188 -0
  724. data/ext/sources/src/parakeet.cpp +3838 -0
  725. data/ext/sources/src/whisper.cpp +62 -40
  726. data/extsources.rb +26 -10
  727. data/lib/whisper/log_settable.rb +36 -0
  728. data/lib/whisper/model/uri.rb +13 -1
  729. data/lib/whisper/output.rb +74 -0
  730. data/sig/whisper.rbs +445 -55
  731. data/test/helper.rb +2 -0
  732. data/test/jfk_reader/jfk_reader.c +50 -7
  733. data/test/test_callback.rb +1 -0
  734. data/test/test_context_params.rb +82 -0
  735. data/test/test_package.rb +6 -5
  736. data/test/test_parakeet.rb +28 -0
  737. data/test/test_parakeet_callback.rb +107 -0
  738. data/test/test_parakeet_context.rb +116 -0
  739. data/test/test_parakeet_context_params.rb +24 -0
  740. data/test/test_parakeet_model.rb +21 -0
  741. data/test/test_parakeet_params.rb +78 -0
  742. data/test/test_parakeet_segment.rb +42 -0
  743. data/test/test_parakeet_token.rb +73 -0
  744. data/test/test_params.rb +2 -0
  745. data/test/test_token.rb +11 -0
  746. data/test/test_vad_context.rb +58 -8
  747. data/test/test_vad_segment.rb +1 -1
  748. data/test/test_whisper.rb +44 -6
  749. data/whispercpp.gemspec +2 -2
  750. metadata +426 -280
  751. data/ext/sources/bindings/javascript/CMakeLists.txt +0 -41
  752. data/ext/sources/bindings/javascript/emscripten.cpp +0 -93
  753. data/ext/sources/bindings/javascript/libwhisper.worker.js +0 -1
  754. data/ext/sources/bindings/javascript/package.json +0 -26
  755. data/ext/sources/bindings/javascript/whisper.js +0 -19
  756. data/ext/sources/examples/addon.node/CMakeLists.txt +0 -31
  757. data/ext/sources/examples/addon.node/__test__/whisper.spec.js +0 -133
  758. data/ext/sources/examples/addon.node/addon.cpp +0 -557
  759. data/ext/sources/examples/addon.node/index.js +0 -59
  760. data/ext/sources/examples/addon.node/package.json +0 -16
  761. data/ext/sources/examples/addon.node/vad-example.js +0 -132
  762. data/ext/sources/examples/bench.wasm/CMakeLists.txt +0 -49
  763. data/ext/sources/examples/bench.wasm/emscripten.cpp +0 -87
  764. data/ext/sources/examples/bench.wasm/index-tmpl.html +0 -285
  765. data/ext/sources/examples/coi-serviceworker.js +0 -146
  766. data/ext/sources/examples/command/CMakeLists.txt +0 -10
  767. data/ext/sources/examples/command/command.cpp +0 -802
  768. data/ext/sources/examples/command/commands.txt +0 -9
  769. data/ext/sources/examples/command.wasm/CMakeLists.txt +0 -50
  770. data/ext/sources/examples/command.wasm/emscripten.cpp +0 -327
  771. data/ext/sources/examples/command.wasm/index-tmpl.html +0 -415
  772. data/ext/sources/examples/generate-karaoke.sh +0 -57
  773. data/ext/sources/examples/helpers.js +0 -191
  774. data/ext/sources/examples/livestream.sh +0 -112
  775. data/ext/sources/examples/lsp/CMakeLists.txt +0 -10
  776. data/ext/sources/examples/lsp/lsp.cpp +0 -471
  777. data/ext/sources/examples/lsp/whisper.vim +0 -362
  778. data/ext/sources/examples/python/test_whisper_processor.py +0 -7
  779. data/ext/sources/examples/python/whisper_processor.py +0 -54
  780. data/ext/sources/examples/server/bench.js +0 -29
  781. data/ext/sources/examples/server.py +0 -120
  782. data/ext/sources/examples/stream/CMakeLists.txt +0 -10
  783. data/ext/sources/examples/stream/stream.cpp +0 -437
  784. data/ext/sources/examples/stream.wasm/CMakeLists.txt +0 -49
  785. data/ext/sources/examples/stream.wasm/emscripten.cpp +0 -216
  786. data/ext/sources/examples/stream.wasm/index-tmpl.html +0 -491
  787. data/ext/sources/examples/sycl/CMakeLists.txt +0 -9
  788. data/ext/sources/examples/sycl/build.sh +0 -22
  789. data/ext/sources/examples/sycl/ls-sycl-device.cpp +0 -11
  790. data/ext/sources/examples/sycl/run-whisper.sh +0 -17
  791. data/ext/sources/examples/talk-llama/CMakeLists.txt +0 -47
  792. data/ext/sources/examples/talk-llama/eleven-labs.py +0 -80
  793. data/ext/sources/examples/talk-llama/llama-adapter.cpp +0 -494
  794. data/ext/sources/examples/talk-llama/llama-adapter.h +0 -88
  795. data/ext/sources/examples/talk-llama/llama-arch.cpp +0 -2559
  796. data/ext/sources/examples/talk-llama/llama-arch.h +0 -586
  797. data/ext/sources/examples/talk-llama/llama-batch.cpp +0 -917
  798. data/ext/sources/examples/talk-llama/llama-batch.h +0 -173
  799. data/ext/sources/examples/talk-llama/llama-chat.cpp +0 -876
  800. data/ext/sources/examples/talk-llama/llama-chat.h +0 -70
  801. data/ext/sources/examples/talk-llama/llama-context.cpp +0 -3645
  802. data/ext/sources/examples/talk-llama/llama-context.h +0 -360
  803. data/ext/sources/examples/talk-llama/llama-cparams.cpp +0 -5
  804. data/ext/sources/examples/talk-llama/llama-cparams.h +0 -42
  805. data/ext/sources/examples/talk-llama/llama-grammar.cpp +0 -1464
  806. data/ext/sources/examples/talk-llama/llama-grammar.h +0 -194
  807. data/ext/sources/examples/talk-llama/llama-graph.cpp +0 -2282
  808. data/ext/sources/examples/talk-llama/llama-graph.h +0 -910
  809. data/ext/sources/examples/talk-llama/llama-hparams.cpp +0 -241
  810. data/ext/sources/examples/talk-llama/llama-hparams.h +0 -284
  811. data/ext/sources/examples/talk-llama/llama-impl.cpp +0 -171
  812. data/ext/sources/examples/talk-llama/llama-impl.h +0 -63
  813. data/ext/sources/examples/talk-llama/llama-io.cpp +0 -15
  814. data/ext/sources/examples/talk-llama/llama-io.h +0 -35
  815. data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.cpp +0 -328
  816. data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.h +0 -137
  817. data/ext/sources/examples/talk-llama/llama-kv-cache.cpp +0 -2100
  818. data/ext/sources/examples/talk-llama/llama-kv-cache.h +0 -390
  819. data/ext/sources/examples/talk-llama/llama-kv-cells.h +0 -533
  820. data/ext/sources/examples/talk-llama/llama-memory-hybrid.cpp +0 -268
  821. data/ext/sources/examples/talk-llama/llama-memory-hybrid.h +0 -139
  822. data/ext/sources/examples/talk-llama/llama-memory-recurrent.cpp +0 -1167
  823. data/ext/sources/examples/talk-llama/llama-memory-recurrent.h +0 -182
  824. data/ext/sources/examples/talk-llama/llama-memory.cpp +0 -59
  825. data/ext/sources/examples/talk-llama/llama-memory.h +0 -122
  826. data/ext/sources/examples/talk-llama/llama-mmap.cpp +0 -735
  827. data/ext/sources/examples/talk-llama/llama-mmap.h +0 -73
  828. data/ext/sources/examples/talk-llama/llama-model-loader.cpp +0 -1247
  829. data/ext/sources/examples/talk-llama/llama-model-loader.h +0 -176
  830. data/ext/sources/examples/talk-llama/llama-model-saver.cpp +0 -285
  831. data/ext/sources/examples/talk-llama/llama-model-saver.h +0 -37
  832. data/ext/sources/examples/talk-llama/llama-model.cpp +0 -8338
  833. data/ext/sources/examples/talk-llama/llama-model.h +0 -544
  834. data/ext/sources/examples/talk-llama/llama-quant.cpp +0 -1072
  835. data/ext/sources/examples/talk-llama/llama-quant.h +0 -1
  836. data/ext/sources/examples/talk-llama/llama-sampling.cpp +0 -3771
  837. data/ext/sources/examples/talk-llama/llama-sampling.h +0 -44
  838. data/ext/sources/examples/talk-llama/llama-vocab.cpp +0 -3900
  839. data/ext/sources/examples/talk-llama/llama-vocab.h +0 -182
  840. data/ext/sources/examples/talk-llama/llama.cpp +0 -1140
  841. data/ext/sources/examples/talk-llama/llama.h +0 -1540
  842. data/ext/sources/examples/talk-llama/models/afmoe.cpp +0 -191
  843. data/ext/sources/examples/talk-llama/models/apertus.cpp +0 -125
  844. data/ext/sources/examples/talk-llama/models/arcee.cpp +0 -135
  845. data/ext/sources/examples/talk-llama/models/arctic.cpp +0 -138
  846. data/ext/sources/examples/talk-llama/models/arwkv7.cpp +0 -86
  847. data/ext/sources/examples/talk-llama/models/baichuan.cpp +0 -122
  848. data/ext/sources/examples/talk-llama/models/bailingmoe.cpp +0 -144
  849. data/ext/sources/examples/talk-llama/models/bailingmoe2.cpp +0 -135
  850. data/ext/sources/examples/talk-llama/models/bert.cpp +0 -178
  851. data/ext/sources/examples/talk-llama/models/bitnet.cpp +0 -160
  852. data/ext/sources/examples/talk-llama/models/bloom.cpp +0 -101
  853. data/ext/sources/examples/talk-llama/models/chameleon.cpp +0 -178
  854. data/ext/sources/examples/talk-llama/models/chatglm.cpp +0 -132
  855. data/ext/sources/examples/talk-llama/models/codeshell.cpp +0 -111
  856. data/ext/sources/examples/talk-llama/models/cogvlm.cpp +0 -102
  857. data/ext/sources/examples/talk-llama/models/cohere2-iswa.cpp +0 -134
  858. data/ext/sources/examples/talk-llama/models/command-r.cpp +0 -122
  859. data/ext/sources/examples/talk-llama/models/dbrx.cpp +0 -123
  860. data/ext/sources/examples/talk-llama/models/deci.cpp +0 -135
  861. data/ext/sources/examples/talk-llama/models/deepseek.cpp +0 -144
  862. data/ext/sources/examples/talk-llama/models/deepseek2.cpp +0 -259
  863. data/ext/sources/examples/talk-llama/models/dots1.cpp +0 -134
  864. data/ext/sources/examples/talk-llama/models/dream.cpp +0 -105
  865. data/ext/sources/examples/talk-llama/models/ernie4-5-moe.cpp +0 -150
  866. data/ext/sources/examples/talk-llama/models/ernie4-5.cpp +0 -110
  867. data/ext/sources/examples/talk-llama/models/exaone.cpp +0 -114
  868. data/ext/sources/examples/talk-llama/models/exaone4.cpp +0 -123
  869. data/ext/sources/examples/talk-llama/models/falcon-h1.cpp +0 -113
  870. data/ext/sources/examples/talk-llama/models/falcon.cpp +0 -120
  871. data/ext/sources/examples/talk-llama/models/gemma-embedding.cpp +0 -116
  872. data/ext/sources/examples/talk-llama/models/gemma.cpp +0 -112
  873. data/ext/sources/examples/talk-llama/models/gemma2-iswa.cpp +0 -128
  874. data/ext/sources/examples/talk-llama/models/gemma3.cpp +0 -155
  875. data/ext/sources/examples/talk-llama/models/gemma3n-iswa.cpp +0 -384
  876. data/ext/sources/examples/talk-llama/models/glm4-moe.cpp +0 -170
  877. data/ext/sources/examples/talk-llama/models/glm4.cpp +0 -150
  878. data/ext/sources/examples/talk-llama/models/gpt2.cpp +0 -105
  879. data/ext/sources/examples/talk-llama/models/gptneox.cpp +0 -144
  880. data/ext/sources/examples/talk-llama/models/granite-hybrid.cpp +0 -196
  881. data/ext/sources/examples/talk-llama/models/granite.cpp +0 -211
  882. data/ext/sources/examples/talk-llama/models/graph-context-mamba.cpp +0 -283
  883. data/ext/sources/examples/talk-llama/models/grok.cpp +0 -159
  884. data/ext/sources/examples/talk-llama/models/grovemoe.cpp +0 -141
  885. data/ext/sources/examples/talk-llama/models/hunyuan-dense.cpp +0 -132
  886. data/ext/sources/examples/talk-llama/models/hunyuan-moe.cpp +0 -154
  887. data/ext/sources/examples/talk-llama/models/internlm2.cpp +0 -120
  888. data/ext/sources/examples/talk-llama/models/jais.cpp +0 -86
  889. data/ext/sources/examples/talk-llama/models/jamba.cpp +0 -106
  890. data/ext/sources/examples/talk-llama/models/lfm2.cpp +0 -175
  891. data/ext/sources/examples/talk-llama/models/llada-moe.cpp +0 -122
  892. data/ext/sources/examples/talk-llama/models/llada.cpp +0 -99
  893. data/ext/sources/examples/talk-llama/models/llama-iswa.cpp +0 -178
  894. data/ext/sources/examples/talk-llama/models/llama.cpp +0 -168
  895. data/ext/sources/examples/talk-llama/models/maincoder.cpp +0 -117
  896. data/ext/sources/examples/talk-llama/models/mamba.cpp +0 -55
  897. data/ext/sources/examples/talk-llama/models/mimo2-iswa.cpp +0 -123
  898. data/ext/sources/examples/talk-llama/models/minicpm3.cpp +0 -199
  899. data/ext/sources/examples/talk-llama/models/minimax-m2.cpp +0 -124
  900. data/ext/sources/examples/talk-llama/models/mistral3.cpp +0 -160
  901. data/ext/sources/examples/talk-llama/models/models.h +0 -569
  902. data/ext/sources/examples/talk-llama/models/modern-bert.cpp +0 -116
  903. data/ext/sources/examples/talk-llama/models/mpt.cpp +0 -126
  904. data/ext/sources/examples/talk-llama/models/nemotron-h.cpp +0 -150
  905. data/ext/sources/examples/talk-llama/models/nemotron.cpp +0 -122
  906. data/ext/sources/examples/talk-llama/models/neo-bert.cpp +0 -104
  907. data/ext/sources/examples/talk-llama/models/olmo.cpp +0 -121
  908. data/ext/sources/examples/talk-llama/models/olmo2.cpp +0 -150
  909. data/ext/sources/examples/talk-llama/models/olmoe.cpp +0 -124
  910. data/ext/sources/examples/talk-llama/models/openai-moe-iswa.cpp +0 -127
  911. data/ext/sources/examples/talk-llama/models/openelm.cpp +0 -124
  912. data/ext/sources/examples/talk-llama/models/orion.cpp +0 -123
  913. data/ext/sources/examples/talk-llama/models/pangu-embedded.cpp +0 -121
  914. data/ext/sources/examples/talk-llama/models/phi2.cpp +0 -121
  915. data/ext/sources/examples/talk-llama/models/phi3.cpp +0 -152
  916. data/ext/sources/examples/talk-llama/models/plamo.cpp +0 -110
  917. data/ext/sources/examples/talk-llama/models/plamo2.cpp +0 -316
  918. data/ext/sources/examples/talk-llama/models/plamo3.cpp +0 -128
  919. data/ext/sources/examples/talk-llama/models/plm.cpp +0 -168
  920. data/ext/sources/examples/talk-llama/models/qwen.cpp +0 -108
  921. data/ext/sources/examples/talk-llama/models/qwen2.cpp +0 -126
  922. data/ext/sources/examples/talk-llama/models/qwen2moe.cpp +0 -151
  923. data/ext/sources/examples/talk-llama/models/qwen2vl.cpp +0 -117
  924. data/ext/sources/examples/talk-llama/models/qwen3.cpp +0 -117
  925. data/ext/sources/examples/talk-llama/models/qwen3moe.cpp +0 -124
  926. data/ext/sources/examples/talk-llama/models/qwen3next.cpp +0 -873
  927. data/ext/sources/examples/talk-llama/models/qwen3vl-moe.cpp +0 -149
  928. data/ext/sources/examples/talk-llama/models/qwen3vl.cpp +0 -141
  929. data/ext/sources/examples/talk-llama/models/refact.cpp +0 -94
  930. data/ext/sources/examples/talk-llama/models/rnd1.cpp +0 -126
  931. data/ext/sources/examples/talk-llama/models/rwkv6-base.cpp +0 -162
  932. data/ext/sources/examples/talk-llama/models/rwkv6.cpp +0 -94
  933. data/ext/sources/examples/talk-llama/models/rwkv6qwen2.cpp +0 -86
  934. data/ext/sources/examples/talk-llama/models/rwkv7-base.cpp +0 -135
  935. data/ext/sources/examples/talk-llama/models/rwkv7.cpp +0 -90
  936. data/ext/sources/examples/talk-llama/models/seed-oss.cpp +0 -124
  937. data/ext/sources/examples/talk-llama/models/smallthinker.cpp +0 -126
  938. data/ext/sources/examples/talk-llama/models/smollm3.cpp +0 -128
  939. data/ext/sources/examples/talk-llama/models/stablelm.cpp +0 -146
  940. data/ext/sources/examples/talk-llama/models/starcoder.cpp +0 -100
  941. data/ext/sources/examples/talk-llama/models/starcoder2.cpp +0 -121
  942. data/ext/sources/examples/talk-llama/models/t5-dec.cpp +0 -166
  943. data/ext/sources/examples/talk-llama/models/t5-enc.cpp +0 -96
  944. data/ext/sources/examples/talk-llama/models/wavtokenizer-dec.cpp +0 -149
  945. data/ext/sources/examples/talk-llama/models/xverse.cpp +0 -108
  946. data/ext/sources/examples/talk-llama/prompts/talk-alpaca.txt +0 -23
  947. data/ext/sources/examples/talk-llama/speak +0 -40
  948. data/ext/sources/examples/talk-llama/speak.bat +0 -1
  949. data/ext/sources/examples/talk-llama/speak.ps1 +0 -14
  950. data/ext/sources/examples/talk-llama/talk-llama.cpp +0 -813
  951. data/ext/sources/examples/talk-llama/unicode-data.cpp +0 -7034
  952. data/ext/sources/examples/talk-llama/unicode-data.h +0 -20
  953. data/ext/sources/examples/talk-llama/unicode.cpp +0 -1147
  954. data/ext/sources/examples/talk-llama/unicode.h +0 -111
  955. data/ext/sources/examples/wchess/CMakeLists.txt +0 -10
  956. data/ext/sources/examples/wchess/libwchess/CMakeLists.txt +0 -19
  957. data/ext/sources/examples/wchess/libwchess/Chessboard.cpp +0 -803
  958. data/ext/sources/examples/wchess/libwchess/Chessboard.h +0 -33
  959. data/ext/sources/examples/wchess/libwchess/WChess.cpp +0 -193
  960. data/ext/sources/examples/wchess/libwchess/WChess.h +0 -63
  961. data/ext/sources/examples/wchess/libwchess/test-chessboard.cpp +0 -117
  962. data/ext/sources/examples/wchess/wchess.cmd/CMakeLists.txt +0 -8
  963. data/ext/sources/examples/wchess/wchess.cmd/wchess.cmd.cpp +0 -253
  964. data/ext/sources/examples/whisper.wasm/CMakeLists.txt +0 -50
  965. data/ext/sources/examples/whisper.wasm/emscripten.cpp +0 -118
  966. data/ext/sources/examples/whisper.wasm/index-tmpl.html +0 -659
  967. data/ext/sources/ggml/cmake/BuildTypes.cmake +0 -54
  968. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm-ppc.h +0 -333
  969. data/ext/sources/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +0 -99
  970. data/ext/sources/ggml/src/ggml-hexagon/htp/htp-dma.h +0 -157
  971. data/ext/sources/ggml/src/ggml-hexagon/htp/htp-msg.h +0 -165
  972. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-exp.c +0 -94
  973. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-inverse.c +0 -72
  974. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sigmoid.c +0 -49
  975. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-utils.c +0 -1020
  976. data/ext/sources/ggml/src/ggml-hexagon/htp/ops-utils.h +0 -149
  977. data/ext/sources/ggml/src/ggml-hexagon/htp-utils.c +0 -454
  978. data/ext/sources/ggml/src/ggml-hexagon/htp-utils.h +0 -221
  979. data/ext/sources/ggml/src/ggml-hexagon/op-desc.h +0 -153
  980. data/ext/sources/ggml/src/ggml-opencl/kernels/embed_kernel.py +0 -26
  981. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rte.glsl +0 -5
  982. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/bin_op.tmpl.wgsl +0 -188
  983. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary_head.tmpl +0 -45
  984. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +0 -147
  985. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/glu.tmpl.wgsl +0 -323
  986. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl +0 -907
  987. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.tmpl.wgsl +0 -247
  988. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.tmpl.wgsl +0 -267
  989. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm.wgsl +0 -123
  990. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.tmpl.wgsl +0 -112
  991. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/unary_op.wgsl +0 -483
  992. data/ext/sources/tests/CMakeLists.txt +0 -112
  993. data/ext/sources/tests/earnings21/eval.mk +0 -58
  994. data/ext/sources/tests/earnings21/eval.py +0 -68
  995. data/ext/sources/tests/earnings21/normalizers/__init__.py +0 -2
  996. data/ext/sources/tests/earnings21/normalizers/basic.py +0 -80
  997. data/ext/sources/tests/earnings21/normalizers/english.json +0 -1741
  998. data/ext/sources/tests/earnings21/normalizers/english.py +0 -550
  999. data/ext/sources/tests/earnings21/requirements.txt +0 -6
  1000. data/ext/sources/tests/en-0-ref.txt +0 -1
  1001. data/ext/sources/tests/en-1-ref.txt +0 -1
  1002. data/ext/sources/tests/en-2-ref.txt +0 -1
  1003. data/ext/sources/tests/es-0-ref.txt +0 -1
  1004. data/ext/sources/tests/librispeech/eval.mk +0 -39
  1005. data/ext/sources/tests/librispeech/eval.py +0 -47
  1006. data/ext/sources/tests/librispeech/normalizers/__init__.py +0 -2
  1007. data/ext/sources/tests/librispeech/normalizers/basic.py +0 -80
  1008. data/ext/sources/tests/librispeech/normalizers/english.json +0 -1741
  1009. data/ext/sources/tests/librispeech/normalizers/english.py +0 -550
  1010. data/ext/sources/tests/librispeech/requirements.txt +0 -6
  1011. data/ext/sources/tests/run-tests.sh +0 -130
  1012. data/ext/sources/tests/test-c.c +0 -3
  1013. data/ext/sources/tests/test-vad-full.cpp +0 -56
  1014. data/ext/sources/tests/test-vad.cpp +0 -83
  1015. data/ext/sources/tests/test-whisper.js +0 -58
  1016. data/lib/whisper/context.rb +0 -15
  1017. data/lib/whisper/segment.rb +0 -58
@@ -7,16 +7,20 @@
7
7
 
8
8
  #include <atomic>
9
9
  #include <chrono>
10
- #include <cstddef>
11
10
  #include <mutex>
11
+ #include <thread>
12
+ #include <cstddef>
12
13
  #include <stdexcept>
13
14
  #include <string>
15
+ #include <sstream>
16
+ #include <iomanip>
17
+ #include <unordered_set>
18
+ #include <unordered_map>
19
+ #include <regex>
20
+ #include <queue>
14
21
 
15
22
  #ifdef _WIN32
16
23
  # include <sal.h>
17
- # ifndef _WINDOWS
18
- # define _WINDOWS
19
- # endif
20
24
  #else
21
25
  # include <semaphore.h>
22
26
  # include <unistd.h>
@@ -25,8 +29,6 @@
25
29
  #pragma clang diagnostic ignored "-Wnested-anon-types"
26
30
  #pragma clang diagnostic ignored "-Wgnu-anonymous-struct"
27
31
 
28
- #include "htp-utils.h"
29
-
30
32
  #include <AEEStdErr.h>
31
33
  #include <dspqueue.h>
32
34
  #include <rpcmem.h>
@@ -37,22 +39,38 @@
37
39
  #include "ggml-hexagon.h"
38
40
  #include "ggml-impl.h"
39
41
  #include "ggml-quants.h"
40
- #include "op-desc.h"
41
- #include "htp-msg.h"
42
+ #include "htp-opnode.h"
43
+ #include "htp-ops.h"
42
44
  #include "htp_iface.h"
43
-
44
- static size_t opt_ndev = 1;
45
- static size_t opt_nhvx = 0; // use all
46
- static int opt_arch = 0; // autodetect
47
- static int opt_etm = 0;
48
- static int opt_verbose = 0;
49
- static int opt_profile = 0;
50
- static int opt_hostbuf = 1;
51
- static int opt_experimental = 0;
45
+ #include "htp-drv.h"
46
+
47
+ using intvec = std::vector<int>;
48
+ using uintvec = std::vector<unsigned int>;
49
+ using u32vec = std::vector<uint32_t>;
50
+
51
+ static int opt_arch = 0; // autodetect
52
+ static size_t opt_ndev = 1;
53
+ static size_t opt_nhvx = 0; // use all
54
+ static int opt_use_hmx = 1; // when set, enable HMX; when 0, use HVX only
55
+ static size_t opt_vmem = HTP_OP_MAX_VMEM_DEFAULT; // max available va space for buffer mappings
56
+ static size_t opt_mbuf = 1ul * 1024 * 1024 * 1024; // max buffer size
57
+ static int opt_etm = 0;
58
+ static int opt_verbose = 0;
59
+ static int opt_profile = 0; // profiling mode (0-disabled, 1-basic, 2-pmu)
60
+ static int opt_hostbuf = 1; // hostbuf ON by default
61
+
62
+ // Default PMU events, if profiling with PMU (mode=2) is enabled
63
+ // See https://docs.qualcomm.com/doc/80-N2040-60/topic/pmu-events.html
64
+ // https://docs.qualcomm.com/doc/80-N2040-61/topic/hvx-pmu-events.html
65
+ static u32vec opt_pmu_evt { 0x3, 0x111, 0x100, 0x105, 0x240, 0x256, 0x7D, 0x8C };
52
66
 
53
67
  // Enable all stages by default
54
- static int opt_opmask = HTP_OPMASK_QUEUE | HTP_OPMASK_QUANTIZE | HTP_OPMASK_COMPUTE;
55
- static int opt_opsync = 0; // synchronous ops
68
+ static int opt_opstage = HTP_OPSTAGE_QUEUE | HTP_OPSTAGE_COMPUTE;
69
+ static int opt_opbatch = 1024; // max number of ops in a batch
70
+ static int opt_opqueue = 16; // max number of pending batches
71
+ static int opt_oppoll = 0; // polling for batch completions
72
+
73
+ static std::regex* opt_opfilter = NULL; // regex of ops to not claim
56
74
 
57
75
  #define HEX_VERBOSE(...) \
58
76
  if (opt_verbose) GGML_LOG_DEBUG(__VA_ARGS__)
@@ -84,47 +102,45 @@ static const char * status_to_str(uint32_t status) {
84
102
 
85
103
  // ** debug helpers
86
104
 
87
- static void ggml_hexagon_dump_op_exec(const std::string &sess_name, const ggml_tensor * op, const uint32_t req_flags) {
105
+ static void ggml_hexagon_dump_op_exec(const std::string &sess_name, const htp_opnode & node, const uint32_t req_flags) {
88
106
  if (!opt_verbose) return;
89
107
 
90
- op_desc desc(op);
108
+ htp_opformat fmt(node);
91
109
  GGML_LOG_DEBUG("ggml-hex: %s execute-op %s: %s : %s : %s : %s : %s : flags 0x%x\n", sess_name.c_str(),
92
- ggml_op_name(op->op), desc.names, desc.dims, desc.types, desc.strides, desc.buffs, req_flags);
110
+ node.op_name().c_str(), fmt.names, fmt.dims, fmt.types, fmt.strides, fmt.buffs, req_flags);
93
111
  }
94
112
 
95
113
  static void ggml_hexagon_dump_op_supp(const std::string &sess_name, const struct ggml_tensor * op, bool supp) {
96
114
  if (!opt_verbose) return;
97
115
 
98
- op_desc desc(op);
99
- GGML_LOG_DEBUG("ggml-hex: %s supports-op %s : %s : %s : %s : %s : %s : %s\n", sess_name.c_str(),
100
- ggml_op_name(op->op), desc.names, desc.dims, desc.types, desc.strides, desc.buffs, supp ? "yes" : "no");
116
+ htp_opformat fmt(htp_opformat(htp_opnode{const_cast<ggml_tensor*>(op), {}, HTP_OP_INVALID}));
117
+ GGML_LOG_DEBUG("ggml-hex: %s supports-op %s: %s : %s : %s : %s : %s : %s\n", sess_name.c_str(),
118
+ ggml_op_desc(op), fmt.names, fmt.dims, fmt.types, fmt.strides, fmt.buffs, supp ? "yes" : "no");
101
119
  }
102
120
 
103
- static void ggml_hexagon_dump_op_prof(const std::string &sess_name, const ggml_tensor * op,
104
- uint32_t op_usec, uint32_t op_cycles, uint32_t op_pkts, uint64_t call_usec) {
121
+ static void ggml_hexagon_dump_op_prof(const std::string &sess_name, const htp_opnode & node,
122
+ uint32_t op_usec, uint32_t op_cycles, const uint32_t pmu[]) {
105
123
  if (!opt_profile) return;
106
124
 
107
- op_desc desc(op);
108
- GGML_LOG_DEBUG("ggml-hex: %s profile-op %s: %s : %s : %s : %s : %s : op-usec %u op-cycles %u op-pkts %u (%f) call-usec %llu\n", sess_name.c_str(),
109
- ggml_op_name(op->op), desc.names, desc.dims, desc.types, desc.strides, desc.buffs,
110
- op_usec, op_cycles, op_pkts, (float) op_cycles / op_pkts, (unsigned long long) call_usec);
125
+ char pmu_str[256] = "";
126
+ if (opt_profile > 1) {
127
+ static_assert(HTP_PROF_PMU_NCNT == 8, "current implementation assumes 8 PMU counters");
128
+ sprintf(pmu_str, " pmu [%u,%u,%u,%u,%u,%u,%u,%u]",
129
+ pmu[0], pmu[1], pmu[2], pmu[3], pmu[4], pmu[5], pmu[6], pmu[7]);
130
+ }
131
+
132
+ htp_opformat fmt(node);
133
+ GGML_LOG_DEBUG("ggml-hex: %s profile-op %s: %s : %s : %s : %s : usec %u cycles %u%s\n", sess_name.c_str(),
134
+ node.op_name().c_str(), fmt.names, fmt.dims, fmt.types, fmt.strides, op_usec, op_cycles, pmu_str);
111
135
  }
112
136
 
113
137
  // ** backend sessions
114
138
 
115
- struct ggml_hexagon_session {
116
- ggml_hexagon_session(int dev_id, ggml_backend_dev_t dev) noexcept(false);
117
- ~ggml_hexagon_session() noexcept(true);
118
-
119
- void allocate(int dev_id) noexcept(false);
120
- void release() noexcept(true);
121
-
122
- void enqueue(struct htp_general_req &req, struct dspqueue_buffer *bufs, uint32_t n_bufs, bool sync = false);
123
- void flush();
124
-
125
- ggml_backend_buffer_type buffer_type = {};
126
- ggml_backend_buffer_type repack_buffer_type = {};
139
+ struct ggml_hexagon_opbatch;
140
+ struct ggml_hexagon_opqueue;
141
+ struct htp_opnode;
127
142
 
143
+ struct ggml_hexagon_session {
128
144
  std::string name;
129
145
  remote_handle64 handle;
130
146
  dspqueue_t queue;
@@ -136,87 +152,28 @@ struct ggml_hexagon_session {
136
152
  bool valid_handle;
137
153
  bool valid_queue;
138
154
  bool valid_iface;
139
- std::atomic<int> op_pending;
140
- uint32_t prof_usecs;
141
- uint32_t prof_cycles;
142
- uint32_t prof_pkts;
143
- };
144
-
145
- void ggml_hexagon_session::enqueue(struct htp_general_req &req, struct dspqueue_buffer *bufs, uint32_t n_bufs, bool sync) {
146
- // Bump pending flag (cleared in the session::flush once we get the responce)
147
- this->op_pending++; // atomic inc
148
-
149
- int err = dspqueue_write(this->queue,
150
- 0, // flags - the framework will autoset this
151
- n_bufs, // number of buffers
152
- bufs, // buffer references
153
- sizeof(req),
154
- (const uint8_t *) &req, // Message
155
- 1000000 // Timeout
156
- );
157
-
158
- if (err != 0) {
159
- GGML_ABORT("ggml-hex: %s dspqueue_write failed: 0x%08x\n", this->name.c_str(), (unsigned) err);
160
- }
161
-
162
- if (sync) {
163
- flush();
164
- }
165
- }
166
-
167
- // Flush HTP response queue i.e wait for all outstanding requests to complete
168
- void ggml_hexagon_session::flush() {
169
- dspqueue_t q = this->queue;
170
-
171
- // Repeatedly read packets from the queue until it's empty. We don't
172
- // necessarily get a separate callback for each packet, and new packets
173
- // may arrive while we're processing the previous one.
174
-
175
- while (this->op_pending) {
176
- struct htp_general_rsp rsp;
177
- uint32_t rsp_size;
178
- uint32_t flags;
179
-
180
- struct dspqueue_buffer bufs[HTP_MAX_PACKET_BUFFERS];
181
- uint32_t n_bufs;
182
155
 
183
- // Read response packet from queue
184
- int err = dspqueue_read(q, &flags,
185
- HTP_MAX_PACKET_BUFFERS, // Maximum number of buffer references
186
- &n_bufs, // Number of buffer references
187
- bufs, // Buffer references
188
- sizeof(rsp), // Max message length
189
- &rsp_size, // Message length
190
- (uint8_t *) &rsp,
191
- 1000000); // Timeout
156
+ std::atomic<int> op_pending;
157
+ ggml_hexagon_opbatch* op_batch;
158
+ ggml_hexagon_opqueue* op_queue;
192
159
 
193
- if (err == AEE_EEXPIRED) {
194
- // TODO: might need to bail out if the HTP is stuck on something
195
- continue;
196
- }
160
+ ggml_backend_buffer_type buffer_type = {};
161
+ ggml_backend_buffer_type repack_buffer_type = {};
197
162
 
198
- if (err != 0) {
199
- GGML_ABORT("ggml-hex: dspqueue_read failed: 0x%08x\n", (unsigned) err);
200
- }
163
+ ggml_hexagon_session(int dev_id, ggml_backend_dev_t dev) noexcept(false);
164
+ ~ggml_hexagon_session() noexcept(true);
201
165
 
202
- // Basic sanity checks
203
- if (rsp_size != sizeof(rsp)) {
204
- GGML_ABORT("ggml-hex: dspcall : bad response (size)\n");
205
- }
166
+ const char* c_name() const { return name.c_str(); }
206
167
 
207
- if (rsp.status != HTP_STATUS_OK) {
208
- GGML_LOG_ERROR("ggml-hex: dspcall : dsp-rsp: %s\n", status_to_str(rsp.status));
209
- // TODO: handle errors
210
- }
168
+ void allocate(int dev_id) noexcept(false);
169
+ void release() noexcept(true);
211
170
 
212
- // TODO: update profiling implementation, currently only works for opt_opsync mode
213
- this->prof_usecs = rsp.prof_usecs;
214
- this->prof_cycles = rsp.prof_cycles;
215
- this->prof_pkts = rsp.prof_pkts;
171
+ void enqueue_op(const htp_opnode & node);
172
+ void flush(bool all = true);
216
173
 
217
- this->op_pending--; // atomic dec
218
- }
219
- }
174
+ void flush_pending(bool all = false);
175
+ void flush_batch();
176
+ };
220
177
 
221
178
  // ** backend buffers
222
179
 
@@ -230,88 +187,94 @@ struct ggml_backend_hexagon_buffer_type_context {
230
187
  std::string name;
231
188
  };
232
189
 
233
- struct ggml_backend_hexagon_buffer_context {
234
- bool mmap_to(ggml_hexagon_session * s) {
235
- HEX_VERBOSE("ggml-hex: %s mmaping buffer: base %p domain-id %d session-id %d size %zu fd %d repack %d\n",
236
- s->name.c_str(), (void *) this->base, s->domain_id, s->session_id, this->size, this->fd,
237
- (int) this->repack);
190
+ struct ggml_hexagon_shared_buffer {
191
+ ggml_hexagon_session * sess;
192
+ uint8_t * base;
193
+ size_t size;
194
+ int fd;
195
+ bool mapped;
196
+ bool pinned;
197
+
198
+ void mmap() {
199
+ fastrpc_map_flags flags = this->pinned ? FASTRPC_MAP_FD : FASTRPC_MAP_FD_DELAYED;
238
200
 
239
- int err = fastrpc_mmap(s->domain_id, this->fd, (void *) this->base, 0, this->size, FASTRPC_MAP_FD);
201
+ int err = fastrpc_mmap(sess->domain_id, this->fd, (void *) this->base, 0, this->size, flags);
240
202
  if (err != 0) {
241
- GGML_LOG_ERROR("ggml-hex: buffer mapping failed : domain_id %d size %zu fd %d error 0x%08x\n",
242
- s->domain_id, this->size, this->fd, (unsigned) err);
243
- return false;
203
+ GGML_LOG_ERROR("ggml-hex: %s buffer mapping failed : domain_id %d size %zu fd %d error 0x%08x\n", sess->c_name(),
204
+ sess->domain_id, this->size, this->fd, (unsigned) err);
205
+ throw std::runtime_error("ggml-hex: fastrpc_mmap failed (see log for details)");
244
206
  }
245
207
 
246
- return true;
247
- }
208
+ HEX_VERBOSE("ggml-hex: %s mapped buffer: base %p size %zu fd %d pinned %u\n",
209
+ sess->c_name(), (void *) this->base, this->size, this->fd, pinned);
248
210
 
249
- bool mmap() {
250
- if (this->mapped) {
251
- return true;
252
- }
253
- if (!mmap_to(this->sess)) {
254
- return false;
255
- }
256
211
  this->mapped = true;
257
- return true;
258
212
  }
259
213
 
260
- void munmap() {
261
- if (!this->mapped) {
262
- return;
214
+ void unmap() {
215
+ if (!this->mapped) return;
216
+
217
+ if (!this->pinned) {
218
+ // HTP might still hold a reference, tell it drop it
219
+ htp_iface_munmap(sess->handle, this->fd);
263
220
  }
264
221
 
265
- fastrpc_munmap(this->sess->domain_id, this->fd, this->base, this->size);
222
+ fastrpc_munmap(sess->domain_id, this->fd, (void *) this->base, this->size);
223
+
224
+ HEX_VERBOSE("ggml-hex: %s unmapped buffer: base %p size %zu fd %d\n", sess->c_name(),
225
+ (void *) this->base, size, this->fd);
226
+
266
227
  this->mapped = false;
228
+ this->fd = -1;
267
229
  }
268
230
 
269
- ggml_backend_hexagon_buffer_context(ggml_hexagon_session * sess, size_t size, bool repack) {
270
- size += 4 * 1024; // extra page for padding
271
-
272
- if (rpcmem_alloc2) {
273
- this->base = (uint8_t *) rpcmem_alloc2(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS | RPCMEM_HEAP_NOREG, size);
274
- } else {
275
- GGML_LOG_INFO("ggml-hex: %s rpcmem_alloc2 not found, falling back to rpcmem_alloc\n", sess->name.c_str());
276
- this->base = (uint8_t *) rpcmem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS | RPCMEM_HEAP_NOREG, size);
277
- }
231
+ void alloc(size_t size) {
232
+ if (this->base) return;
278
233
 
234
+ this->base = (uint8_t *) rpcmem_alloc2(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS, size);
279
235
  if (!this->base) {
280
- GGML_LOG_ERROR("ggml-hex: %s failed to allocate buffer : size %zu\n", sess->name.c_str(), size);
236
+ GGML_LOG_ERROR("ggml-hex: %s failed to allocate buffer : size %zu\n", sess->c_name(), size);
281
237
  throw std::runtime_error("ggml-hex: rpcmem_alloc failed (see log for details)");
282
238
  }
283
239
 
284
240
  this->fd = rpcmem_to_fd(this->base);
285
241
  if (this->fd < 0) {
286
- GGML_LOG_ERROR("ggml-hex: %s failed to get FD for buffer %p\n", sess->name.c_str(), (void *) this->base);
287
- rpcmem_free(this->base);
288
- this->base = NULL;
242
+ GGML_LOG_ERROR("ggml-hex: %s failed to get FD for buffer %p\n", sess->c_name(), (void *) this->base);
289
243
  throw std::runtime_error("ggml-hex: rpcmem_to_fd failed (see log for details)");
290
244
  }
245
+ this->size = size;
246
+
247
+ HEX_VERBOSE("ggml-hex: %s allocated buffer: base %p size %zu fd %d pinned %d\n", sess->c_name(),
248
+ (void *) this->base, this->size, this->fd, (int) pinned);
249
+ mmap();
250
+ }
251
+
252
+ void free() {
253
+ if (!this->base) return;
254
+
255
+ unmap();
256
+ rpcmem_free(this->base);
257
+
258
+ HEX_VERBOSE("ggml-hex: %s freed buffer: base %p size %zu fd %d\n", sess->c_name(),
259
+ (void *) this->base, size, this->fd);
291
260
 
292
- HEX_VERBOSE("ggml-hex: %s allocated buffer: base %p size %zu fd %d repack %d\n", sess->name.c_str(),
293
- (void *) this->base, size, this->fd, (int) repack);
261
+ this->base = NULL;
262
+ }
294
263
 
264
+ ggml_hexagon_shared_buffer(ggml_hexagon_session * sess, size_t size, bool pinned = false) {
295
265
  this->sess = sess;
296
- this->size = size;
266
+ this->size = 0;
267
+ this->base = nullptr;
268
+ this->fd = -1;
297
269
  this->mapped = false;
298
- this->repack = repack;
299
- }
270
+ this->pinned = pinned;
300
271
 
301
- ~ggml_backend_hexagon_buffer_context() {
302
- munmap();
303
- if (this->base) {
304
- rpcmem_free(this->base);
305
- this->base = NULL;
306
- }
272
+ alloc(size);
307
273
  }
308
274
 
309
- ggml_hexagon_session * sess; // primary session
310
- uint8_t * base;
311
- size_t size;
312
- int fd;
313
- bool mapped; // mmap is done
314
- bool repack; // repacked buffer
275
+ ~ggml_hexagon_shared_buffer() {
276
+ free();
277
+ }
315
278
  };
316
279
 
317
280
  static ggml_hexagon_session * ggml_backend_hexagon_buffer_get_sess(ggml_backend_buffer_t buffer) {
@@ -319,30 +282,26 @@ static ggml_hexagon_session * ggml_backend_hexagon_buffer_get_sess(ggml_backend_
319
282
  }
320
283
 
321
284
  static void ggml_backend_hexagon_buffer_free_buffer(ggml_backend_buffer_t buffer) {
322
- auto ctx = static_cast<ggml_backend_hexagon_buffer_context *>(buffer->context);
323
- delete ctx;
285
+ auto sbuf = static_cast<ggml_hexagon_shared_buffer *>(buffer->context);
286
+ delete sbuf;
324
287
  }
325
288
 
326
289
  static void * ggml_backend_hexagon_buffer_get_base(ggml_backend_buffer_t buffer) {
327
- auto ctx = static_cast<ggml_backend_hexagon_buffer_context *>(buffer->context);
328
- return ctx->base;
290
+ auto sbuf = static_cast<ggml_hexagon_shared_buffer *>(buffer->context);
291
+ return sbuf->base;
329
292
  }
330
293
 
331
294
  static enum ggml_status ggml_backend_hexagon_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
332
- auto ctx = static_cast<ggml_backend_hexagon_buffer_context *>(buffer->context);
333
- auto sess = ctx->sess;
295
+ auto sbuf = static_cast<ggml_hexagon_shared_buffer *>(buffer->context);
296
+ auto sess = sbuf->sess;
334
297
 
335
- HEX_VERBOSE("ggml-hex: %s init-tensor %s : base %p data %p nbytes %zu usage %d repack %d\n", sess->name.c_str(),
336
- tensor->name, (void *) ctx->base, tensor->data, ggml_nbytes(tensor), (int) buffer->usage,
337
- (int) ctx->repack);
298
+ HEX_VERBOSE("ggml-hex: %s init-tensor %s : base %p data %p nbytes %zu usage %d\n", sess->c_name(),
299
+ tensor->name, (void *) sbuf->base, tensor->data, ggml_nbytes(tensor), (int) buffer->usage);
338
300
 
339
301
  if (tensor->view_src != NULL && tensor->view_offs == 0) {
340
- ; // nothing to do for the view
341
- } else {
342
- if (!ctx->mapped) {
343
- ctx->mmap();
344
- }
302
+ return GGML_STATUS_SUCCESS; // nothing to do for the view
345
303
  }
304
+
346
305
  return GGML_STATUS_SUCCESS;
347
306
  }
348
307
 
@@ -412,6 +371,7 @@ static void pack_q4_0_quants(block_q4_0 * x, const uint8_t * qs, unsigned int bi
412
371
  static void repack_row_q4x4x2(uint8_t * y, const block_q4_0 * x, int64_t k) {
413
372
  static const int qk = QK_Q4_0x4x2;
414
373
  const int nb = (k + qk - 1) / qk; // number of blocks (padded)
374
+ const int nloe = k % qk; // leftovers
415
375
 
416
376
  const int dblk_size = 8 * 2; // 8x __fp16
417
377
  const int qblk_size = qk / 2; // int4
@@ -445,15 +405,17 @@ static void repack_row_q4x4x2(uint8_t * y, const block_q4_0 * x, int64_t k) {
445
405
  unpack_q4_0_quants(qs, &x[i * 8 + 6], 6);
446
406
  unpack_q4_0_quants(qs, &x[i * 8 + 7], 7);
447
407
 
408
+ bool partial = (nloe && i == nb-1);
409
+
448
410
  uint8_t * q = y_q + (i * qblk_size);
449
411
  for (int j = 0; j < qk / 2; j++) {
450
- q[j] = (qs[j + 128] << 4) | qs[j];
412
+ q[j] = partial ? (qs[j*2+1] << 4) | qs[j*2+0] : (qs[j+128] << 4) | qs[j+000];
451
413
  }
452
414
  }
453
415
 
454
416
  // Repack the scales
455
417
  // Note: Do not combine with the loop above. For tensor sizes not multiple of 256 (QK_Q4_0x4x2)
456
- // the last block is truncated and overriden by the scales.
418
+ // the last block is truncated and overridden by the scales.
457
419
  for (int i = 0; i < nb; i++) {
458
420
  // Repack the scales
459
421
  ggml_half * d = (ggml_half *) (y_d + i * dblk_size);
@@ -467,7 +429,7 @@ static void repack_row_q4x4x2(uint8_t * y, const block_q4_0 * x, int64_t k) {
467
429
  d[7] = x[i * 8 + 7].d;
468
430
  }
469
431
 
470
- if (opt_verbose > 1) {
432
+ if (opt_verbose > 2) {
471
433
  for (int i = 0; i < nb; i++) {
472
434
  dump_packed_block_q4x4x2(y, i, k);
473
435
  }
@@ -477,6 +439,7 @@ static void repack_row_q4x4x2(uint8_t * y, const block_q4_0 * x, int64_t k) {
477
439
  static void unpack_row_q4x4x2(block_q4_0 * x, const uint8_t * y, int64_t k) {
478
440
  static const int qk = QK_Q4_0x4x2;
479
441
  const int nb = (k + qk - 1) / qk; // number of blocks (padded)
442
+ const int nloe = k % qk; // leftovers
480
443
 
481
444
  const int dblk_size = 8 * 2; // 8x __fp16
482
445
  const int qblk_size = qk / 2; // int4
@@ -485,7 +448,7 @@ static void unpack_row_q4x4x2(block_q4_0 * x, const uint8_t * y, int64_t k) {
485
448
  const uint8_t * y_q = y + 0; // quants first
486
449
  const uint8_t * y_d = y + qrow_size; // then scales
487
450
 
488
- if (opt_verbose > 1) {
451
+ if (opt_verbose > 2) {
489
452
  for (int i = 0; i < nb; i++) {
490
453
  dump_packed_block_q4x4x2(y, i, k);
491
454
  }
@@ -495,10 +458,17 @@ static void unpack_row_q4x4x2(block_q4_0 * x, const uint8_t * y, int64_t k) {
495
458
  for (int i = 0; i < nb; i++) {
496
459
  uint8_t qs[QK_Q4_0x4x2]; // unpacked quants
497
460
 
461
+ bool partial = (nloe && i == nb-1);
462
+
498
463
  const uint8_t * q = y_q + (i * qblk_size);
499
464
  for (int j = 0; j < qk / 2; j++) {
500
- qs[j] = q[j] & 0xf;
501
- qs[j + 128] = q[j] >> 4;
465
+ if (partial) {
466
+ qs[j*2+0] = q[j] & 0xf;
467
+ qs[j*2+1] = q[j] >> 4;
468
+ } else {
469
+ qs[j+000] = q[j] & 0xf;
470
+ qs[j+128] = q[j] >> 4;
471
+ }
502
472
  }
503
473
 
504
474
  pack_q4_0_quants(&x[i * 8 + 0], qs, 0);
@@ -513,7 +483,7 @@ static void unpack_row_q4x4x2(block_q4_0 * x, const uint8_t * y, int64_t k) {
513
483
 
514
484
  // Repack the scales
515
485
  // Note: Do not combine with the loop above. For tensor sizes not multiple of 256 (QK_Q4_0x4x2)
516
- // the last block is truncated and overriden by the scales.
486
+ // the last block is truncated and overridden by the scales.
517
487
  for (int i = 0; i < nb; i++) {
518
488
  // Unpack the scales
519
489
  const ggml_half * d = (const ggml_half *) (y_d + i * dblk_size);
@@ -562,7 +532,7 @@ static void init_row_q4x4x2(block_q4_0 * x, int64_t k) {
562
532
 
563
533
  // Init the scales
564
534
  // Note: Do not combine with the loop above. For tensor sizes not multiple of 256 (QK_Q4_0x4x2)
565
- // the last block is truncated and overriden by the scales.
535
+ // the last block is truncated and overridden by the scales.
566
536
  for (int i = 0; i < nb; i++) {
567
537
  // Unpack the scales
568
538
  x[i * 8 + 0].d = 0;
@@ -582,7 +552,7 @@ static void repack_q4_0_q4x4x2(ggml_tensor * t, const void * data, size_t size)
582
552
 
583
553
  size_t row_size = ggml_row_size(t->type, t->ne[0]);
584
554
  size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_Q4_0x4x2)); // extra elements for the pad
585
- size_t row_size_rp = row_size * 2; // extra space for tmp pad (if any)
555
+ size_t row_size_rp = row_size_pd; // scratch must hold one full padded tile (qblk_size/2 quants + scales)
586
556
 
587
557
  // Ensure we don't try to read more data than is available in the source buffer 'data'
588
558
  // or write more than the tensor can hold.
@@ -643,7 +613,7 @@ static void repack_q4x4x2_q4_0(void * data, const ggml_tensor * t, size_t size)
643
613
 
644
614
  size_t row_size = ggml_row_size(t->type, t->ne[0]);
645
615
  size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_Q4_0x4x2)); // extra elements for the pad
646
- size_t row_size_rp = row_size * 2; // extra space for tmp pad (if any)
616
+ size_t row_size_rp = row_size_pd; // scratch must hold one full padded tile (qblk_size/2 quants + scales)
647
617
 
648
618
  // Ensure we don't try to copy more data than the tensor actually contains.
649
619
  const size_t total_tensor_size = (size_t)nrows * row_size;
@@ -692,6 +662,239 @@ static void repack_q4x4x2_q4_0(void * data, const ggml_tensor * t, size_t size)
692
662
  ggml_aligned_free(buf_rp, row_size_rp);
693
663
  }
694
664
 
665
+ static void unpack_q4_1_quants(uint8_t * qs, const block_q4_1 * x, unsigned int bi) {
666
+ static const int qk = QK4_1;
667
+
668
+ for (unsigned int i = 0; i < qk / 2; ++i) {
669
+ const int x0 = (x->qs[i] & 0x0F);
670
+ const int x1 = (x->qs[i] >> 4);
671
+ qs[bi * qk + i + 0] = x0;
672
+ qs[bi * qk + i + qk / 2] = x1;
673
+ }
674
+ }
675
+
676
+ static void pack_q4_1_quants(block_q4_1 * x, const uint8_t * qs, unsigned int bi) {
677
+ static const int qk = QK4_1;
678
+
679
+ for (unsigned int i = 0; i < qk / 2; ++i) {
680
+ const uint8_t x0 = qs[bi * qk + i + 0];
681
+ const uint8_t x1 = qs[bi * qk + i + qk / 2];
682
+ x->qs[i] = x0 | (x1 << 4);
683
+ }
684
+ }
685
+
686
+ static void repack_row_q4_1x4x2(uint8_t * y, const block_q4_1 * x, int64_t k) {
687
+ static const int qk = QK_Q4_0x4x2;
688
+ const int nb = (k + qk - 1) / qk; // number of blocks (padded)
689
+ const int nloe = k % qk; // leftovers
690
+
691
+ const int dblk_size = 8 * 4; // 8x (d, m) __fp16 = 32 bytes
692
+ const int qblk_size = qk / 2; // int4 = 128 bytes
693
+ const int qrow_size = k / 2; // int4 (not padded to blocks)
694
+
695
+ uint8_t * y_q = y + 0; // quants first
696
+ uint8_t * y_d = y + qrow_size; // then scales/offsets
697
+
698
+ // Repack the quants
699
+ for (int i = 0; i < nb; i++) {
700
+ uint8_t qs[QK_Q4_0x4x2]; // unpacked quants
701
+ unpack_q4_1_quants(qs, &x[i * 8 + 0], 0);
702
+ unpack_q4_1_quants(qs, &x[i * 8 + 1], 1);
703
+ unpack_q4_1_quants(qs, &x[i * 8 + 2], 2);
704
+ unpack_q4_1_quants(qs, &x[i * 8 + 3], 3);
705
+ unpack_q4_1_quants(qs, &x[i * 8 + 4], 4);
706
+ unpack_q4_1_quants(qs, &x[i * 8 + 5], 5);
707
+ unpack_q4_1_quants(qs, &x[i * 8 + 6], 6);
708
+ unpack_q4_1_quants(qs, &x[i * 8 + 7], 7);
709
+
710
+ bool partial = (nloe && i == nb-1);
711
+
712
+ uint8_t * q = y_q + (i * qblk_size);
713
+ for (int j = 0; j < qk / 2; j++) {
714
+ q[j] = partial ? (qs[j*2+1] << 4) | qs[j*2+0] : (qs[j+128] << 4) | qs[j+000];
715
+ }
716
+ }
717
+
718
+ // Repack the scales and offsets
719
+ for (int i = 0; i < nb; i++) {
720
+ ggml_half * d_m = (ggml_half *) (y_d + i * dblk_size);
721
+ for (int j = 0; j < 8; j++) {
722
+ d_m[j * 2 + 0] = x[i * 8 + j].d;
723
+ d_m[j * 2 + 1] = x[i * 8 + j].m;
724
+ }
725
+ }
726
+ }
727
+
728
+ static void unpack_row_q4_1x4x2(block_q4_1 * x, const uint8_t * y, int64_t k) {
729
+ static const int qk = QK_Q4_0x4x2;
730
+ const int nb = (k + qk - 1) / qk; // number of blocks (padded)
731
+ const int nloe = k % qk; // leftovers
732
+
733
+ const int dblk_size = 8 * 4; // 8x (d, m) __fp16 = 32 bytes
734
+ const int qblk_size = qk / 2; // int4 = 128 bytes
735
+ const int qrow_size = k / 2; // int4 (not padded to blocks)
736
+
737
+ const uint8_t * y_q = y + 0; // quants first
738
+ const uint8_t * y_d = y + qrow_size; // then scales/offsets
739
+
740
+ // Unpack the quants
741
+ for (int i = 0; i < nb; i++) {
742
+ uint8_t qs[QK_Q4_0x4x2];
743
+ bool partial = (nloe && i == nb-1);
744
+
745
+ const uint8_t * q = y_q + (i * qblk_size);
746
+ for (int j = 0; j < qk / 2; j++) {
747
+ if (partial) {
748
+ qs[j*2+0] = q[j] & 0x0F;
749
+ qs[j*2+1] = q[j] >> 4;
750
+ } else {
751
+ qs[j+000] = q[j] & 0x0F;
752
+ qs[j+128] = q[j] >> 4;
753
+ }
754
+ }
755
+
756
+ pack_q4_1_quants(&x[i * 8 + 0], qs, 0);
757
+ pack_q4_1_quants(&x[i * 8 + 1], qs, 1);
758
+ pack_q4_1_quants(&x[i * 8 + 2], qs, 2);
759
+ pack_q4_1_quants(&x[i * 8 + 3], qs, 3);
760
+ pack_q4_1_quants(&x[i * 8 + 4], qs, 4);
761
+ pack_q4_1_quants(&x[i * 8 + 5], qs, 5);
762
+ pack_q4_1_quants(&x[i * 8 + 6], qs, 6);
763
+ pack_q4_1_quants(&x[i * 8 + 7], qs, 7);
764
+ }
765
+
766
+ // Unpack the scales and offsets
767
+ for (int i = 0; i < nb; i++) {
768
+ const ggml_half * d_m = (const ggml_half *) (y_d + i * dblk_size);
769
+ for (int j = 0; j < 8; j++) {
770
+ x[i * 8 + j].d = d_m[j * 2 + 0];
771
+ x[i * 8 + j].m = d_m[j * 2 + 1];
772
+ }
773
+ }
774
+ }
775
+
776
+ static void init_row_q4_1x4x2(block_q4_1 * x, int64_t k) {
777
+ static const int qk = QK_Q4_0x4x2;
778
+ const int nb = (k + qk - 1) / qk; // number of blocks (padded)
779
+
780
+ uint8_t qs[QK_Q4_0x4x2]; // unpacked quants
781
+ memset(qs, 0, sizeof(qs));
782
+
783
+ for (int i = 0; i < nb; i++) {
784
+ pack_q4_1_quants(&x[i * 8 + 0], qs, 0);
785
+ pack_q4_1_quants(&x[i * 8 + 1], qs, 1);
786
+ pack_q4_1_quants(&x[i * 8 + 2], qs, 2);
787
+ pack_q4_1_quants(&x[i * 8 + 3], qs, 3);
788
+ pack_q4_1_quants(&x[i * 8 + 4], qs, 4);
789
+ pack_q4_1_quants(&x[i * 8 + 5], qs, 5);
790
+ pack_q4_1_quants(&x[i * 8 + 6], qs, 6);
791
+ pack_q4_1_quants(&x[i * 8 + 7], qs, 7);
792
+ }
793
+
794
+ for (int i = 0; i < nb; i++) {
795
+ for (int j = 0; j < 8; j++) {
796
+ x[i * 8 + j].d = 0;
797
+ x[i * 8 + j].m = 0;
798
+ }
799
+ }
800
+ }
801
+
802
+ static void repack_q4_1_q4x4x2(ggml_tensor * t, const void * data, size_t size) {
803
+ int64_t nrows = ggml_nrows(t);
804
+
805
+ size_t row_size = ggml_row_size(t->type, t->ne[0]);
806
+ size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_Q4_0x4x2));
807
+ size_t row_size_rp = row_size_pd; // scratch must hold one full padded tile (qblk_size/2 quants + scales)
808
+
809
+ const size_t total_tensor_size = (size_t)nrows * row_size;
810
+ const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size;
811
+
812
+ const int64_t n_full_rows = n_bytes_to_copy / row_size;
813
+ const size_t n_rem_bytes = n_bytes_to_copy % row_size;
814
+
815
+ void * buf_pd = ggml_aligned_malloc(row_size_pd);
816
+ GGML_ASSERT(buf_pd != NULL);
817
+
818
+ void * buf_rp = ggml_aligned_malloc(row_size_rp);
819
+ GGML_ASSERT(buf_rp != NULL);
820
+
821
+ HEX_VERBOSE("ggml-hex: repack-q4_1-q4x4x2 %s : data %p size %zu dims %ldx%ld row-size %zu\n", t->name, data, size,
822
+ t->ne[0], nrows, row_size);
823
+
824
+ init_row_q4_1x4x2((block_q4_1 *) buf_pd, t->ne[0]);
825
+
826
+ for (int64_t i = 0; i < n_full_rows; i++) {
827
+ const uint8_t * src = (const uint8_t *) data + (i * row_size);
828
+ uint8_t * dst = (uint8_t *) t->data + (i * row_size);
829
+
830
+ memcpy(buf_pd, src, row_size);
831
+ repack_row_q4_1x4x2((uint8_t *) buf_rp, (const block_q4_1 *) buf_pd, t->ne[0]);
832
+ memcpy(dst, buf_rp, row_size);
833
+ }
834
+
835
+ if (n_rem_bytes > 0) {
836
+ const int64_t i = n_full_rows;
837
+ const uint8_t * src = (const uint8_t *) data + (i * row_size);
838
+ uint8_t * dst = (uint8_t *) t->data + (i * row_size);
839
+
840
+ init_row_q4_1x4x2((block_q4_1 *) buf_pd, t->ne[0]);
841
+ memcpy(buf_pd, src, n_rem_bytes);
842
+ repack_row_q4_1x4x2((uint8_t *) buf_rp, (const block_q4_1 *) buf_pd, t->ne[0]);
843
+ memcpy(dst, buf_rp, n_rem_bytes);
844
+ }
845
+
846
+ ggml_aligned_free(buf_pd, row_size_pd);
847
+ ggml_aligned_free(buf_rp, row_size_rp);
848
+ }
849
+
850
+ static void repack_q4x4x2_q4_1(void * data, const ggml_tensor * t, size_t size) {
851
+ int64_t nrows = ggml_nrows(t);
852
+
853
+ size_t row_size = ggml_row_size(t->type, t->ne[0]);
854
+ size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_Q4_0x4x2));
855
+ size_t row_size_rp = row_size_pd; // scratch must hold one full padded tile (qblk_size/2 quants + scales)
856
+
857
+ const size_t total_tensor_size = (size_t)nrows * row_size;
858
+ const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size;
859
+
860
+ const int64_t n_full_rows = n_bytes_to_copy / row_size;
861
+ const size_t n_rem_bytes = n_bytes_to_copy % row_size;
862
+
863
+ void * buf_pd = ggml_aligned_malloc(row_size_pd);
864
+ GGML_ASSERT(buf_pd != NULL);
865
+
866
+ void * buf_rp = ggml_aligned_malloc(row_size_rp);
867
+ GGML_ASSERT(buf_rp != NULL);
868
+
869
+ HEX_VERBOSE("ggml-hex: repack-q4x4x2-q4_1 %s : data %p size %zu dims %ldx%ld row-size %zu\n", t->name, data, size,
870
+ t->ne[0], nrows, row_size);
871
+
872
+ memset(buf_rp, 0, row_size_rp); // clear-out padded buffer to make sure the tail is all zeros
873
+
874
+ for (int64_t i = 0; i < n_full_rows; i++) {
875
+ const uint8_t * src = (const uint8_t *) t->data + (i * row_size);
876
+ uint8_t * dst = (uint8_t *) data + (i * row_size);
877
+
878
+ memcpy(buf_rp, src, row_size);
879
+ unpack_row_q4_1x4x2((block_q4_1 *) buf_pd, (const uint8_t *) buf_rp, t->ne[0]);
880
+ memcpy(dst, buf_pd, row_size);
881
+ }
882
+
883
+ if (n_rem_bytes > 0) {
884
+ const int64_t i = n_full_rows;
885
+ const uint8_t * src = (const uint8_t *) t->data + (i * row_size);
886
+ uint8_t * dst = (uint8_t *) data + (i * row_size);
887
+
888
+ // We still need to read and unpack the entire source row because quantization is block-based.
889
+ memcpy(buf_rp, src, row_size);
890
+ unpack_row_q4_1x4x2((block_q4_1 *) buf_pd, (const uint8_t *) buf_rp, t->ne[0]);
891
+ memcpy(dst, buf_pd, n_rem_bytes);
892
+ }
893
+
894
+ ggml_aligned_free(buf_pd, row_size_pd);
895
+ ggml_aligned_free(buf_rp, row_size_rp);
896
+ }
897
+
695
898
  // ======== Q8x4x2 ====================
696
899
  static void dump_block_q8_0(const block_q8_0 * b, int i) {
697
900
  HEX_VERBOSE("ggml-hex: repack q8_0 %d: %d %d %d %d ... %d %d %d %d : %.6f\n", i, b->qs[0], b->qs[1], b->qs[2],
@@ -780,7 +983,7 @@ static void repack_row_q8x4x2(uint8_t * y, const block_q8_0 * x, int64_t k) {
780
983
 
781
984
  // Repack the scales
782
985
  // Note: Do not combine with the loop above. For tensor sizes not multiple of 256 (QK_Q4_0x4x2)
783
- // the last block is truncated and overriden by the scales.
986
+ // the last block is truncated and overridden by the scales.
784
987
  for (int i = 0; i < nb; i++) {
785
988
  // Repack the scales
786
989
  ggml_half * d = (ggml_half *) (y_d + i * dblk_size);
@@ -794,7 +997,7 @@ static void repack_row_q8x4x2(uint8_t * y, const block_q8_0 * x, int64_t k) {
794
997
  d[7] = x[i * 8 + 7].d;
795
998
  }
796
999
 
797
- if (opt_verbose > 1) {
1000
+ if (opt_verbose > 2) {
798
1001
  for (int i = 0; i < nb; i++) {
799
1002
  dump_packed_block_q8x4x2(y, i, k);
800
1003
  }
@@ -812,7 +1015,7 @@ static void unpack_row_q8x4x2(block_q8_0 * x, const uint8_t * y, int64_t k) {
812
1015
  const uint8_t * y_q = y + 0; // quants first
813
1016
  const uint8_t * y_d = y + qrow_size; // then scales
814
1017
 
815
- if (opt_verbose > 1) {
1018
+ if (opt_verbose > 2) {
816
1019
  for (int i = 0; i < nb; i++) {
817
1020
  dump_packed_block_q8x4x2(y, i, k);
818
1021
  }
@@ -839,7 +1042,7 @@ static void unpack_row_q8x4x2(block_q8_0 * x, const uint8_t * y, int64_t k) {
839
1042
 
840
1043
  // Repack the scales
841
1044
  // Note: Do not combine with the loop above. For tensor sizes not multiple of 256 (QK_Q4_0x4x2)
842
- // the last block is truncated and overriden by the scales.
1045
+ // the last block is truncated and overridden by the scales.
843
1046
  for (int i = 0; i < nb; i++) {
844
1047
  // Unpack the scales
845
1048
  const ggml_half * d = (const ggml_half *) (y_d + i * dblk_size);
@@ -888,7 +1091,7 @@ static void init_row_q8x4x2(block_q8_0 * x, int64_t k) {
888
1091
 
889
1092
  // Init the scales
890
1093
  // Note: Do not combine with the loop above. For tensor sizes not multiple of 256 (QK_Q8_0x4x2)
891
- // the last block is truncated and overriden by the scales.
1094
+ // the last block is truncated and overridden by the scales.
892
1095
  for (int i = 0; i < nb; i++) {
893
1096
  // Unpack the scales
894
1097
  x[i * 8 + 0].d = 0;
@@ -908,7 +1111,7 @@ static void repack_q8_0_q8x4x2(ggml_tensor * t, const void * data, size_t size)
908
1111
 
909
1112
  size_t row_size = ggml_row_size(t->type, t->ne[0]);
910
1113
  size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_Q8_0x4x2)); // extra elements for the pad
911
- size_t row_size_rp = row_size * 2; // extra space for tmp pad (if any)
1114
+ size_t row_size_rp = row_size_pd; // scratch must hold one full padded tile (qblk_size quants + scales)
912
1115
 
913
1116
  // Ensure we don't try to read more data than is available in the source buffer 'data'
914
1117
  // or write more than the tensor can hold.
@@ -969,7 +1172,7 @@ static void repack_q8x4x2_q8_0(void * data, const ggml_tensor * t, size_t size)
969
1172
 
970
1173
  size_t row_size = ggml_row_size(t->type, t->ne[0]);
971
1174
  size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_Q8_0x4x2)); // extra elements for the pad
972
- size_t row_size_rp = row_size * 2; // extra space for tmp pad (if any)
1175
+ size_t row_size_rp = row_size_pd; // scratch must hold one full padded tile (qblk_size quants + scales)
973
1176
 
974
1177
  // Ensure we don't try to copy more data than the tensor actually contains.
975
1178
  const size_t total_tensor_size = (size_t)nrows * row_size;
@@ -1088,6 +1291,7 @@ static void pack_mxfp4_quants(block_mxfp4 * x, const uint8_t * qs, unsigned int
1088
1291
  static void repack_row_mxfp4x4x2(uint8_t * y, const block_mxfp4 * x, int64_t k) {
1089
1292
  static const int qk = QK_MXFP4x4x2;
1090
1293
  const int nb = (k + qk - 1) / qk; // number of blocks (padded)
1294
+ const int nloe = k % qk; // leftovers
1091
1295
 
1092
1296
  const int eblk_size = 8 * 1; // 8x E8M0
1093
1297
  const int qblk_size = qk / 2; // int4
@@ -1122,15 +1326,17 @@ static void repack_row_mxfp4x4x2(uint8_t * y, const block_mxfp4 * x, int64_t k)
1122
1326
  unpack_mxfp4_quants(qs, &x[i * 8 + 6], 6);
1123
1327
  unpack_mxfp4_quants(qs, &x[i * 8 + 7], 7);
1124
1328
 
1329
+ bool partial = (nloe && i == nb-1);
1330
+
1125
1331
  uint8_t * q = y_q + (i * qblk_size);
1126
1332
  for (int j = 0; j < qk / 2; j++) {
1127
- q[j] = (qs[j + 128] << 4) | qs[j];
1333
+ q[j] = partial ? (qs[j*2+1] << 4) | qs[j*2+0] : (qs[j+128] << 4) | qs[j+000];
1128
1334
  }
1129
1335
  }
1130
1336
 
1131
1337
  // Repack the scales
1132
1338
  // Note: Do not combine with the loop above. For tensor sizes not multiple of 256 (QK_MXFP4x4x2)
1133
- // the last block is truncated and overriden by the scales.
1339
+ // the last block is truncated and overridden by the scales.
1134
1340
  for (int i = 0; i < nb; i++) {
1135
1341
  // Repack the scales
1136
1342
  uint8_t * e = (uint8_t *) (y_e + i * eblk_size);
@@ -1144,7 +1350,7 @@ static void repack_row_mxfp4x4x2(uint8_t * y, const block_mxfp4 * x, int64_t k)
1144
1350
  e[7] = x[i * 8 + 7].e;
1145
1351
  }
1146
1352
 
1147
- if (opt_verbose > 1) {
1353
+ if (opt_verbose > 2) {
1148
1354
  for (int i = 0; i < nb; i++) {
1149
1355
  dump_packed_block_mxfp4x4x2(y, i, k);
1150
1356
  }
@@ -1154,6 +1360,7 @@ static void repack_row_mxfp4x4x2(uint8_t * y, const block_mxfp4 * x, int64_t k)
1154
1360
  static void unpack_row_mxfp4x4x2(block_mxfp4 * x, const uint8_t * y, int64_t k) {
1155
1361
  static const int qk = QK_MXFP4x4x2;
1156
1362
  const int nb = (k + qk - 1) / qk; // number of blocks (padded)
1363
+ const int nloe = k % qk; // leftovers
1157
1364
 
1158
1365
  const int eblk_size = 8 * 1; // 8x E8M0
1159
1366
  const int qblk_size = qk / 2; // int4
@@ -1162,7 +1369,7 @@ static void unpack_row_mxfp4x4x2(block_mxfp4 * x, const uint8_t * y, int64_t k)
1162
1369
  const uint8_t * y_q = y + 0; // quants first
1163
1370
  const uint8_t * y_e = y + qrow_size; // then scales
1164
1371
 
1165
- if (opt_verbose > 1) {
1372
+ if (opt_verbose > 2) {
1166
1373
  for (int i = 0; i < nb; i++) {
1167
1374
  dump_packed_block_mxfp4x4x2(y, i, k);
1168
1375
  }
@@ -1172,10 +1379,17 @@ static void unpack_row_mxfp4x4x2(block_mxfp4 * x, const uint8_t * y, int64_t k)
1172
1379
  for (int i = 0; i < nb; i++) {
1173
1380
  uint8_t qs[QK_MXFP4x4x2]; // unpacked quants
1174
1381
 
1382
+ bool partial = (nloe && i == nb-1);
1383
+
1175
1384
  const uint8_t * q = y_q + (i * qblk_size);
1176
1385
  for (int j = 0; j < qk / 2; j++) {
1177
- qs[j] = q[j] & 0xf;
1178
- qs[j + 128] = q[j] >> 4;
1386
+ if (partial) {
1387
+ qs[j*2+0] = q[j] & 0xf;
1388
+ qs[j*2+1] = q[j] >> 4;
1389
+ } else {
1390
+ qs[j+000] = q[j] & 0xf;
1391
+ qs[j+128] = q[j] >> 4;
1392
+ }
1179
1393
  }
1180
1394
 
1181
1395
  pack_mxfp4_quants(&x[i * 8 + 0], qs, 0);
@@ -1190,7 +1404,7 @@ static void unpack_row_mxfp4x4x2(block_mxfp4 * x, const uint8_t * y, int64_t k)
1190
1404
 
1191
1405
  // Repack the scales
1192
1406
  // Note: Do not combine with the loop above. For tensor sizes not multiple of 256 (QK_MXFP4_0x4x2)
1193
- // the last block is truncated and overriden by the scales.
1407
+ // the last block is truncated and overridden by the scales.
1194
1408
  for (int i = 0; i < nb; i++) {
1195
1409
  // Unpack the scales
1196
1410
  const uint8_t * e = (const uint8_t *) (y_e + i * eblk_size);
@@ -1239,7 +1453,7 @@ static void init_row_mxfp4x4x2(block_mxfp4 * x, int64_t k) {
1239
1453
 
1240
1454
  // Init the scales
1241
1455
  // Note: Do not combine with the loop above. For tensor sizes not multiple of 256 (QK_MXFP4x4x2)
1242
- // the last block is truncated and overriden by the scales.
1456
+ // the last block is truncated and overridden by the scales.
1243
1457
  for (int i = 0; i < nb; i++) {
1244
1458
  // Unpack the scales
1245
1459
  x[i * 8 + 0].e = 0;
@@ -1259,7 +1473,7 @@ static void repack_mxfp4_mxfp4x4x2(ggml_tensor * t, const void * data, size_t si
1259
1473
 
1260
1474
  size_t row_size = ggml_row_size(t->type, t->ne[0]);
1261
1475
  size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_MXFP4x4x2)); // extra elements for the pad
1262
- size_t row_size_rp = row_size * 2; // extra space for tmp pad (if any)
1476
+ size_t row_size_rp = row_size_pd; // scratch must hold one full padded tile (qblk_size/2 quants + scales)
1263
1477
 
1264
1478
  // Ensure we don't try to read more data than is available in the source buffer 'data'
1265
1479
  // or write more than the tensor can hold.
@@ -1320,7 +1534,7 @@ static void repack_mxfp4x4x2_mxfp4(void * data, const ggml_tensor * t, size_t si
1320
1534
 
1321
1535
  size_t row_size = ggml_row_size(t->type, t->ne[0]);
1322
1536
  size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_MXFP4x4x2)); // extra elements for the pad
1323
- size_t row_size_rp = row_size * 2; // extra space for tmp pad (if any)
1537
+ size_t row_size_rp = row_size_pd; // scratch must hold one full padded tile (qblk_size/2 quants + scales)
1324
1538
 
1325
1539
  // Ensure we don't try to copy more data than the tensor actually contains.
1326
1540
  const size_t total_tensor_size = (size_t)nrows * row_size;
@@ -1374,11 +1588,10 @@ static void ggml_backend_hexagon_buffer_set_tensor(ggml_backend_buffer_t buffer,
1374
1588
  const void * data,
1375
1589
  size_t offset,
1376
1590
  size_t size) {
1377
- auto ctx = (ggml_backend_hexagon_buffer_context *) buffer->context;
1378
- auto sess = ctx->sess;
1591
+ auto sbuf = (ggml_hexagon_shared_buffer *) buffer->context;
1592
+ auto sess = sbuf->sess;
1379
1593
 
1380
- HEX_VERBOSE("ggml-hex: %s set-tensor %s : data %p offset %zu size %zu\n", sess->name.c_str(), tensor->name, data,
1381
- offset, size);
1594
+ HEX_VERBOSE("ggml-hex: %s set-tensor %s : data %p offset %zu size %zu\n", sess->c_name(), tensor->name, data, offset, size);
1382
1595
 
1383
1596
  switch (tensor->type) {
1384
1597
  case GGML_TYPE_Q4_0:
@@ -1387,10 +1600,23 @@ static void ggml_backend_hexagon_buffer_set_tensor(ggml_backend_buffer_t buffer,
1387
1600
  repack_q4_0_q4x4x2(tensor, data, size);
1388
1601
  break;
1389
1602
 
1390
- case GGML_TYPE_Q8_0:
1603
+ case GGML_TYPE_Q4_1:
1391
1604
  GGML_ASSERT(offset == 0);
1392
1605
  GGML_ASSERT(offset + size <= ggml_nbytes(tensor));
1393
- repack_q8_0_q8x4x2(tensor, data, size);
1606
+ repack_q4_1_q4x4x2(tensor, data, size);
1607
+ break;
1608
+
1609
+ case GGML_TYPE_Q8_0:
1610
+ GGML_ASSERT(offset == 0);
1611
+ GGML_ASSERT(offset + size <= ggml_nbytes(tensor));
1612
+ repack_q8_0_q8x4x2(tensor, data, size);
1613
+ break;
1614
+
1615
+ case GGML_TYPE_IQ4_NL:
1616
+ GGML_ASSERT(offset == 0);
1617
+ GGML_ASSERT(offset + size <= ggml_nbytes(tensor));
1618
+ // IQ4_NL has identical block layout to Q4_0 (ggml_half d + uint8_t qs[16])
1619
+ repack_q4_0_q4x4x2(tensor, data, size);
1394
1620
  break;
1395
1621
 
1396
1622
  case GGML_TYPE_MXFP4:
@@ -1410,11 +1636,10 @@ static void ggml_backend_hexagon_buffer_get_tensor(ggml_backend_buffer_t buffer,
1410
1636
  void * data,
1411
1637
  size_t offset,
1412
1638
  size_t size) {
1413
- auto ctx = (ggml_backend_hexagon_buffer_context *) buffer->context;
1414
- auto sess = ctx->sess;
1639
+ auto sbuf = (ggml_hexagon_shared_buffer *) buffer->context;
1640
+ auto sess = sbuf->sess;
1415
1641
 
1416
- HEX_VERBOSE("ggml-hex: %s get-tensor %s : data %p offset %zu size %zu\n", sess->name.c_str(), tensor->name, data,
1417
- offset, size);
1642
+ HEX_VERBOSE("ggml-hex: %s get-tensor %s : data %p offset %zu size %zu\n", sess->c_name(), tensor->name, data, offset, size);
1418
1643
 
1419
1644
  switch (tensor->type) {
1420
1645
  case GGML_TYPE_Q4_0:
@@ -1423,12 +1648,24 @@ static void ggml_backend_hexagon_buffer_get_tensor(ggml_backend_buffer_t buffer,
1423
1648
  repack_q4x4x2_q4_0(data, tensor, size);
1424
1649
  break;
1425
1650
 
1651
+ case GGML_TYPE_Q4_1:
1652
+ GGML_ASSERT(offset == 0);
1653
+ GGML_ASSERT(offset + size <= ggml_nbytes(tensor));
1654
+ repack_q4x4x2_q4_1(data, tensor, size);
1655
+ break;
1656
+
1426
1657
  case GGML_TYPE_Q8_0:
1427
1658
  GGML_ASSERT(offset == 0);
1428
1659
  GGML_ASSERT(offset + size <= ggml_nbytes(tensor));
1429
1660
  repack_q8x4x2_q8_0(data, tensor, size);
1430
1661
  break;
1431
1662
 
1663
+ case GGML_TYPE_IQ4_NL:
1664
+ GGML_ASSERT(offset == 0);
1665
+ GGML_ASSERT(offset + size <= ggml_nbytes(tensor));
1666
+ repack_q4x4x2_q4_0(data, tensor, size);
1667
+ break;
1668
+
1432
1669
  case GGML_TYPE_MXFP4:
1433
1670
  GGML_ASSERT(offset == 0);
1434
1671
  GGML_ASSERT(offset + size <= ggml_nbytes(tensor));
@@ -1452,10 +1689,10 @@ static bool ggml_backend_hexagon_buffer_cpy_tensor(ggml_backend_buffer_t bu
1452
1689
  }
1453
1690
 
1454
1691
  static void ggml_backend_hexagon_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
1455
- auto ctx = (ggml_backend_hexagon_buffer_context *) buffer->context;
1456
- auto sess = ctx->sess;
1457
- HEX_VERBOSE("ggml-hex: %s clear-buff base %p size %zu\n", sess->name.c_str(), (void *) ctx->base, ctx->size);
1458
- memset(ctx->base, value, ctx->size);
1692
+ auto sbuf = (ggml_hexagon_shared_buffer *) buffer->context;
1693
+ auto sess = sbuf->sess;
1694
+ HEX_VERBOSE("ggml-hex: %s clear-buff base %p size %zu\n", sess->c_name(), (void *) sbuf->base, sbuf->size);
1695
+ memset(sbuf->base, value, sbuf->size);
1459
1696
  }
1460
1697
 
1461
1698
  static ggml_backend_buffer_i ggml_backend_hexagon_buffer_interface = {
@@ -1465,6 +1702,8 @@ static ggml_backend_buffer_i ggml_backend_hexagon_buffer_interface = {
1465
1702
  /* .memset_tensor = */ NULL,
1466
1703
  /* .set_tensor = */ ggml_backend_hexagon_buffer_set_tensor,
1467
1704
  /* .get_tensor = */ ggml_backend_hexagon_buffer_get_tensor,
1705
+ /* .set_tensor_2d = */ NULL,
1706
+ /* .get_tensor_2d = */ NULL,
1468
1707
  /* .cpy_tensor = */ ggml_backend_hexagon_buffer_cpy_tensor,
1469
1708
  /* .clear = */ ggml_backend_hexagon_buffer_clear,
1470
1709
  /* .reset = */ NULL,
@@ -1480,10 +1719,11 @@ static ggml_backend_buffer_t ggml_backend_hexagon_buffer_type_alloc_buffer(
1480
1719
  ggml_backend_buffer_type_t buffer_type, size_t size) {
1481
1720
  auto sess = static_cast<ggml_backend_hexagon_buffer_type_context *>(buffer_type->context)->sess;
1482
1721
  try {
1483
- ggml_backend_hexagon_buffer_context * ctx = new ggml_backend_hexagon_buffer_context(sess, size, false /*repack*/);
1484
- return ggml_backend_buffer_init(buffer_type, ggml_backend_hexagon_buffer_interface, ctx, size);
1722
+ size += 4 * 1024; // guard page
1723
+ ggml_hexagon_shared_buffer * sbuf = new ggml_hexagon_shared_buffer(sess, size);
1724
+ return ggml_backend_buffer_init(buffer_type, ggml_backend_hexagon_buffer_interface, sbuf, size);
1485
1725
  } catch (const std::exception & exc) {
1486
- GGML_LOG_ERROR("ggml-hex: %s failed to allocate buffer context: %s\n", sess->name.c_str(), exc.what());
1726
+ GGML_LOG_ERROR("ggml-hex: %s failed to allocate buffer context (host): %s\n", sess->c_name(), exc.what());
1487
1727
  return nullptr;
1488
1728
  }
1489
1729
  }
@@ -1492,10 +1732,11 @@ static ggml_backend_buffer_t ggml_backend_hexagon_repack_buffer_type_alloc_buffe
1492
1732
  ggml_backend_buffer_type_t buffer_type, size_t size) {
1493
1733
  auto sess = static_cast<ggml_backend_hexagon_buffer_type_context *>(buffer_type->context)->sess;
1494
1734
  try {
1495
- ggml_backend_hexagon_buffer_context * ctx = new ggml_backend_hexagon_buffer_context(sess, size, true /*repack*/);
1496
- return ggml_backend_buffer_init(buffer_type, ggml_backend_hexagon_buffer_interface, ctx, size);
1735
+ size += 4 * 1024; // guard page
1736
+ ggml_hexagon_shared_buffer * sbuf = new ggml_hexagon_shared_buffer(sess, size);
1737
+ return ggml_backend_buffer_init(buffer_type, ggml_backend_hexagon_buffer_interface, sbuf, size);
1497
1738
  } catch (const std::exception & exc) {
1498
- GGML_LOG_ERROR("ggml-hex: %s failed to allocate buffer context: %s\n", sess->name.c_str(), exc.what());
1739
+ GGML_LOG_ERROR("ggml-hex: %s failed to allocate buffer context (repack): %s\n", sess->c_name(), exc.what());
1499
1740
  return nullptr;
1500
1741
  }
1501
1742
  }
@@ -1510,7 +1751,7 @@ static size_t ggml_backend_hexagon_buffer_type_get_alloc_size(ggml_backend_buffe
1510
1751
  }
1511
1752
 
1512
1753
  static size_t ggml_backend_hexagon_buffer_type_get_max_size(ggml_backend_buffer_type_t buffer_type) {
1513
- return 1 * 1024 * 1024 * 1024; // 1GB per buffer
1754
+ return opt_mbuf; // typically 1GB per buffer
1514
1755
  GGML_UNUSED(buffer_type);
1515
1756
  }
1516
1757
 
@@ -1542,6 +1783,448 @@ static ggml_backend_buffer_type_i ggml_backend_hexagon_repack_buffer_type_interf
1542
1783
  /* .is_host = */ ggml_backend_hexagon_repack_buffer_type_is_host,
1543
1784
  };
1544
1785
 
1786
+ struct ggml_hexagon_opbatch {
1787
+ ggml_hexagon_session* sess;
1788
+
1789
+ std::vector<htp_opnode> ops; // htp_opnode of ops
1790
+
1791
+ std::vector<htp_buf_desc> h_bufs; // htp buffer descriptors
1792
+ std::vector<htp_tensor> h_tens; // htp tensor descriptors
1793
+ std::vector<htp_op_desc> h_ops; // htp op descriptors
1794
+
1795
+ std::unordered_map<int, int> b_map; // buffer fd to index
1796
+ std::unordered_map<const ggml_tensor*, int> t_map; // tensor ptr to index
1797
+ std::unordered_multimap<void*, int> d_map; // tensor data to index
1798
+
1799
+ unsigned int n_bufs; // num buffers in the batch
1800
+ unsigned int n_tens; // num tensors ...
1801
+ unsigned int n_ops; // num ops ...
1802
+ size_t b_vmem; // sum of all buffer sizes
1803
+
1804
+ unsigned int n_bufs_max;
1805
+ unsigned int n_tens_max;
1806
+ unsigned int n_ops_max;
1807
+ size_t b_vmem_max;
1808
+
1809
+ void reset() {
1810
+ n_bufs = 0;
1811
+ n_tens = 0;
1812
+ n_ops = 0;
1813
+ b_vmem = 0;
1814
+
1815
+ b_map.clear();
1816
+ t_map.clear();
1817
+ d_map.clear();
1818
+ }
1819
+
1820
+ ggml_hexagon_opbatch(ggml_hexagon_session *sess, size_t batch_size, size_t max_vmem) {
1821
+ this->sess = sess;
1822
+
1823
+ n_bufs_max = HTP_OP_MAX_BUFS;
1824
+ n_ops_max = batch_size;
1825
+ n_tens_max = n_ops_max + n_ops_max * HTP_OP_MAX_INPUTS;
1826
+
1827
+ b_vmem_max = max_vmem;
1828
+
1829
+ ops.resize(n_ops_max);
1830
+
1831
+ h_bufs.resize(n_bufs_max);
1832
+ h_tens.resize(n_tens_max);
1833
+ h_ops.resize(n_ops_max);
1834
+
1835
+ b_map.reserve(n_bufs_max);
1836
+ t_map.reserve(n_tens_max);
1837
+ d_map.reserve(n_tens_max);
1838
+
1839
+ GGML_LOG_INFO("ggml-hex: %s op batching: n-bufs %u n-tensors %u n-ops %u vmem %zu\n",
1840
+ sess->c_name(), n_bufs_max, n_tens_max, n_ops_max, b_vmem_max);
1841
+
1842
+ reset();
1843
+ }
1844
+
1845
+ bool empty() const { return n_ops == 0; }
1846
+
1847
+ // add buffer and return its index
1848
+ int add_buffer(ggml_hexagon_shared_buffer * sbuf) {
1849
+ // Lookup by fd
1850
+ auto it = b_map.find(sbuf->fd);
1851
+ if (it != b_map.end()) { return it->second; }
1852
+
1853
+ // Add new buffer to the batch
1854
+ int bi = n_bufs++;
1855
+ GGML_ASSERT(n_bufs < HTP_OP_MAX_BUFS);
1856
+
1857
+ b_map.insert({sbuf->fd, bi});
1858
+
1859
+ htp_buf_desc &b = h_bufs[bi];
1860
+ b.base = (uint64_t) sbuf->base;
1861
+ b.fd = sbuf->fd;
1862
+ b.size = sbuf->size;
1863
+
1864
+ b_vmem += b.size;
1865
+
1866
+ HEX_VERBOSE("ggml-hex: add-buffer #%u : fd %d base %p size %zu : vmem %zu\n", bi, b.fd, (void*) sbuf->base, (size_t) b.size, b_vmem);
1867
+
1868
+ return bi;
1869
+ }
1870
+
1871
+ bool same_shape(const htp_tensor * h, const ggml_tensor * t) const {
1872
+ return (h->ne[0] == t->ne[0]) && (h->ne[1] == t->ne[1]) && (h->ne[2] == t->ne[2]) && (h->ne[3] == t->ne[3]) &&
1873
+ (h->nb[0] == t->nb[0]) && (h->nb[1] == t->nb[1]) && (h->nb[2] == t->nb[2]) && (h->nb[3] == t->nb[3]);
1874
+ }
1875
+
1876
+ // add tensor and return its index
1877
+ int add_tensor(const ggml_tensor * t) {
1878
+ auto sbuf = static_cast<ggml_hexagon_shared_buffer *>(t->buffer->context);
1879
+
1880
+ // First lookup by tensor data
1881
+ auto range = d_map.equal_range(t->data);
1882
+ for (auto it = range.first; it != range.second; ++it) {
1883
+ htp_tensor * h = &h_tens[it->second];
1884
+ if (same_shape(h, t)) { return it->second; }
1885
+ }
1886
+
1887
+ // Lookup by tensor ptr
1888
+ auto it = t_map.find(t);
1889
+ if (it != t_map.end()) { return it->second; }
1890
+
1891
+ // Add new tensor to the batch
1892
+ int ti = n_tens++;
1893
+ GGML_ASSERT(n_tens <= n_tens_max);
1894
+
1895
+ t_map.insert({t, ti});
1896
+ d_map.insert({t->data, ti});
1897
+
1898
+ uint64_t t_offset = (uint8_t *) t->data - sbuf->base;
1899
+ size_t t_size = ggml_nbytes(t);
1900
+
1901
+ htp_tensor &h = h_tens[ti];
1902
+ h.bi = add_buffer(sbuf);
1903
+ h.data = t_offset;
1904
+ h.size = t_size;
1905
+ h.type = t->type;
1906
+ h.ne[0] = t->ne[0]; h.ne[1] = t->ne[1]; h.ne[2] = t->ne[2]; h.ne[3] = t->ne[3];
1907
+ h.nb[0] = t->nb[0]; h.nb[1] = t->nb[1]; h.nb[2] = t->nb[2]; h.nb[3] = t->nb[3];
1908
+
1909
+ h.flags = 0;
1910
+ if (ggml_backend_buffer_get_usage(t->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE) {
1911
+ h.flags |= HTP_TENSOR_COMPUTE;
1912
+ }
1913
+
1914
+ HEX_VERBOSE("ggml-hex: add-tensor #%u %s : bi %d data %p offset %zu size %zu flags 0x%x : %zu:%zu:%zu:%zu\n",
1915
+ ti, t->name, h.bi, (void*) t->data, (size_t) t_offset, t_size, h.flags,
1916
+ (size_t) t->ne[0], (size_t) t->ne[1], (size_t) t->ne[2], (size_t) t->ne[3]);
1917
+
1918
+ return ti;
1919
+ }
1920
+
1921
+ bool fit_op(const htp_opnode & node) const { // Dry-run check: true when `node` can be appended to this batch without exceeding the op, buffer, tensor or vmem limits. Does not modify the batch.
1922
+ if (n_ops >= n_ops_max ) return false; // no free op slot left
1923
+
1924
+ // check how much extras we will need: count the buffers, mapped bytes and tensor slots that adding `node` would newly require
1925
+ size_t extra_bufs = 0;
1926
+ size_t extra_vmem = 0;
1927
+ size_t extra_tens = 0;
1928
+
1929
+ auto fit_tensor = [&](const ggml_tensor *t) { // accumulates the extra_* counters for one tensor; null tensors are ignored
1930
+ if (!t) return;
1931
+ if (!t_map.count(t)) { // tensor not registered in this batch yet
1932
+ extra_tens++;
1933
+
1934
+ auto sbuf = static_cast<ggml_hexagon_shared_buffer *>(t->buffer->context);
1935
+ if (!b_map.count(sbuf->fd)) { // backing buffer (keyed by fd) not registered yet
1936
+ extra_vmem += sbuf->size; // NOTE(review): b_map is not updated during this check, so a new buffer shared by several new tensors of the same op is counted once per tensor (over-estimate, i.e. conservative)
1937
+ extra_bufs += 1;
1938
+ }
1939
+ }
1940
+ };
1941
+
1942
+ for (const auto * src : node.get_inputs()) {
1943
+ fit_tensor(src);
1944
+ }
1945
+ fit_tensor(node.dst());
1946
+
1947
+ if ((extra_bufs + n_bufs) > n_bufs_max) return false;
1948
+ if ((extra_tens + n_tens) > n_tens_max) return false;
1949
+ if ((extra_vmem + b_vmem) > b_vmem_max) return false;
1950
+
1951
+ return true;
1952
+ }
1953
+
1954
+ // Appends `node` to the batch: stores the host-side node, fills its htp_op_desc and registers its tensors.
+ // assumes that fit_op() was called first and returned true
1955
+ void add_op(const htp_opnode & node) {
1956
+ // Add new op
1957
+
1958
+ unsigned int n = n_ops++; // slot index of the new op
1959
+ GGML_ASSERT(n_ops <= n_ops_max);
1960
+
1961
+ ops[n] = node; // keep the host-side node alongside its descriptor
1962
+
1963
+ htp_op_desc &o = h_ops[n];
1964
+ memcpy(&o.params, &node.node->op_params, sizeof(node.node->op_params)); // op params are copied verbatim
1965
+ o.opcode = node.opcode;
1966
+ o.flags = 0;
1967
+
1968
+ if (!(opt_opstage & HTP_OPSTAGE_COMPUTE)) { // compute stage not enabled in opt_opstage: flag the op to skip compute
1969
+ o.flags |= HTP_OPFLAGS_SKIP_COMPUTE;
1970
+ }
1971
+
1972
+ ggml_hexagon_dump_op_exec(sess->c_name(), node, o.flags);
1973
+
1974
+ auto inputs = node.get_inputs();
1975
+ for (unsigned int i=0; i < HTP_OP_MAX_INPUTS; i++) { // every source slot is written; missing or null inputs get the 0xffff marker
1976
+ o.src[i] = (i < inputs.size() && inputs[i]) ? add_tensor(inputs[i]) : 0xffff;
1977
+ }
1978
+ o.dst = add_tensor(node.dst()); // batch-local tensor index of the destination
1979
+ }
1980
+ };
1981
+
1982
+ struct ggml_hexagon_opqueue { // Ring of `depth` fixed-size blocks inside one shared buffer; each block carries the descriptors of one op batch. NOTE(review): owns shm_buf through a raw pointer and a destructor while copy operations stay implicitly available.
1983
+ // Shared buffer for storing batches (allocated as shm_blk_size * depth bytes in the constructor)
1984
+ ggml_hexagon_shared_buffer *shm_buf;
1985
+ size_t shm_blk_size; // bytes reserved per batch block
1986
+
1987
+ using opvec = std::vector<htp_opnode>;
1988
+
1989
+ std::queue<unsigned int> done; // completed batch ids; seeded with every id in the constructor, so it doubles as the free list used by push()
1990
+ std::vector<opvec> op_cache; // per batch op cache (host-side copy of the ops, read back in pop() for profiling)
1991
+ std::vector<uint64_t> start_usec; // per batch start time (ggml_time_us() taken in push())
1992
+
1993
+ ggml_hexagon_opqueue(ggml_hexagon_session *sess, size_t batch_size, size_t depth) { // batch_size: max ops per batch; depth: number of batch blocks
1994
+ size_t n_bufs = HTP_OP_MAX_BUFS;
1995
+ size_t n_ops = batch_size;
1996
+ size_t n_tensors = n_ops + n_ops * HTP_OP_MAX_INPUTS; // worst case per op: one tensor plus HTP_OP_MAX_INPUTS more
1997
+
1998
+ shm_blk_size = sizeof(htp_buf_desc) * n_bufs +
1999
+ sizeof(htp_tensor) * n_tensors +
2000
+ sizeof(htp_op_desc) * n_ops +
2001
+ sizeof(htp_prof_desc) * n_ops;
2002
+
2003
+ shm_buf = new ggml_hexagon_shared_buffer(sess, shm_blk_size * depth, true /* pinned */);
2004
+
2005
+ op_cache.resize(depth);
2006
+ start_usec.resize(depth, 0);
2007
+
2008
+ // init done queue: every block id starts out available
2009
+ for (unsigned int i = 0; i < depth; i++) { done.push(i); }
2010
+
2011
+ if (opt_verbose) {
2012
+ GGML_LOG_INFO("ggml-hex: %s allocated op-queue : batch-size %zu depth %zu shm-size %zu shm-block-size %zu\n",
2013
+ sess->c_name(), batch_size, depth, shm_buf->size, shm_blk_size);
2014
+ }
2015
+ }
2016
+
2017
+ ~ggml_hexagon_opqueue() {
2018
+ delete shm_buf;
2019
+ }
2020
+
2021
+ // push new batch: claims a free block, fills `req` and `dbuf` for it, copies the batch descriptors into the block and resets `op_batch`.
+ // Returns false, leaving everything untouched, when no block is free.
2022
+ bool push(htp_opbatch_req& req, dspqueue_buffer& dbuf, ggml_hexagon_opbatch* op_batch) {
2023
+ static_assert(sizeof(htp_opbatch_req) % 8 == 0, "sizeof(htp_opbatch_req) must be multiple of 8");
2024
+ static_assert(sizeof(htp_opbatch_rsp) % 8 == 0, "sizeof(htp_opbatch_rsp) must be multiple of 8");
2025
+ static_assert(sizeof(htp_buf_desc) % 8 == 0, "sizeof(htp_buf_desc) must be multiple of 8");
2026
+ static_assert(sizeof(htp_tensor) % 8 == 0, "sizeof(htp_tensor) must be multiple of 8");
2027
+ static_assert(sizeof(htp_op_desc) % 8 == 0, "sizeof(htp_op_desc) must be multiple of 8");
2028
+ static_assert(sizeof(htp_prof_desc) % 8 == 0, "sizeof(htp_prof_desc) must be multiple of 8");
2029
+
2030
+ if (done.empty()) { return false; }
2031
+
2032
+ req.id = done.front(); done.pop(); // batch id
2033
+ req.n_bufs = op_batch->n_bufs;
2034
+ req.n_tensors = op_batch->n_tens;
2035
+ req.n_ops = op_batch->n_ops;
2036
+
2037
+ op_cache[req.id] = op_batch->ops; // copy taken before op_batch->reset() below
2038
+ start_usec[req.id] = ggml_time_us();
2039
+
2040
+ const size_t b_size = sizeof(htp_buf_desc) * req.n_bufs;
2041
+ const size_t t_size = sizeof(htp_tensor) * req.n_tensors;
2042
+ const size_t o_size = sizeof(htp_op_desc) * req.n_ops;
2043
+ const size_t p_size = sizeof(htp_prof_desc) * req.n_ops; // counted in dbuf.size but never written by this function
2044
+
2045
+ dbuf.ptr = shm_buf->base + (req.id * shm_blk_size); // start of the block that belongs to this batch id
2046
+ dbuf.fd = shm_buf->fd;
2047
+ dbuf.flags = DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT;
2048
+ dbuf.offset = (uint8_t*) dbuf.ptr - (uint8_t*) shm_buf->base;
2049
+ dbuf.size = b_size + t_size + o_size + p_size;
2050
+
2051
+ GGML_ASSERT(dbuf.size <= shm_blk_size);
2052
+
2053
+ uint8_t * m_ptr = (uint8_t*) dbuf.ptr; // block layout: buffer descs, then tensor descs, then op descs (profile descs follow, see pop())
2054
+ uint8_t * b_ptr = m_ptr; m_ptr += b_size;
2055
+ uint8_t * t_ptr = m_ptr; m_ptr += t_size;
2056
+ uint8_t * o_ptr = m_ptr;
2057
+
2058
+ memcpy(b_ptr, (void *) op_batch->h_bufs.data(), b_size);
2059
+ memcpy(t_ptr, (void *) op_batch->h_tens.data(), t_size);
2060
+ memcpy(o_ptr, (void *) op_batch->h_ops.data(), o_size);
2061
+
2062
+ HEX_VERBOSE("ggml-hex: %s op-queue push batch #%u : n-bufs %u n-tensors %u n-ops %u vmem %zu : b-size %zu t-size %zu o-size %zu m-size %zu\n",
2063
+ shm_buf->sess->c_name(), req.id, req.n_bufs, req.n_tensors, req.n_ops, op_batch->b_vmem,
2064
+ b_size, t_size, o_size, (size_t) dbuf.size);
2065
+
2066
+ op_batch->reset(); // the caller's batch is handed back reset; the dump below reads the copies in the shared block instead
2067
+
2068
+ if (opt_verbose > 1) {
2069
+ htp_buf_desc *b = (htp_buf_desc*) b_ptr;
2070
+ for (unsigned int i=0; i < req.n_bufs; i++) {
2071
+ GGML_LOG_DEBUG("ggml-hex: %s htp-buf #%u : fd %d base %p size %zu\n", shm_buf->sess->c_name(), i,
2072
+ b[i].fd, (void *) b[i].base, (size_t) b[i].size);
2073
+ }
2074
+ htp_tensor *t = (htp_tensor*) t_ptr;
2075
+ for (unsigned int i=0; i < req.n_tensors; i++) {
2076
+ GGML_LOG_DEBUG("ggml-hex: %s htp-tensor #%u : bi %u offset %u size %u : %zu:%zu:%zu:%zu\n",
2077
+ shm_buf->sess->c_name(), i, t[i].bi, t[i].data, t[i].size,
2078
+ (size_t) t[i].ne[0], (size_t) t[i].ne[1], (size_t) t[i].ne[2], (size_t) t[i].ne[3]);
2079
+ }
2080
+ }
2081
+
2082
+ return true;
2083
+ }
2084
+
2085
+ void pop(htp_opbatch_rsp rsp, dspqueue_buffer dbuf) { // Retires the batch named by rsp.id: returns its block to `done` and, when profiling is on, logs the per-op profile records found in the block.
2086
+ GGML_ASSERT(rsp.id < op_cache.size());
2087
+
2088
+ done.push(rsp.id); // block becomes reusable by the next push()
2089
+
2090
+ const size_t b_size = sizeof(htp_buf_desc) * rsp.n_bufs;
2091
+ const size_t t_size = sizeof(htp_tensor) * rsp.n_tensors;
2092
+ const size_t o_size = sizeof(htp_op_desc) * rsp.n_ops;
2093
+ const size_t p_size = sizeof(htp_prof_desc) * rsp.n_ops;
2094
+
2095
+ const size_t m_size = b_size + t_size + o_size + p_size;
2096
+ GGML_ASSERT(m_size <= shm_blk_size);
2097
+
2098
+ HEX_VERBOSE("ggml-hex: %s op-queue pop batch #%u : n-bufs %u n-tensors %u n-ops %u : m-size %zu b-size %zu t-size %zu o-size %zu\n",
2099
+ shm_buf->sess->c_name(), rsp.id, rsp.n_bufs, rsp.n_tensors, rsp.n_ops,
2100
+ (size_t) dbuf.size, b_size, t_size, o_size); // NOTE(review): the value printed as "m-size" is dbuf.size, not the m_size computed above
2101
+
2102
+ uint8_t * m_ptr = (uint8_t*) dbuf.ptr;
2103
+ uint8_t * p_ptr = m_ptr + (b_size + t_size + o_size); // profile records sit right after the op descriptors
2104
+
2105
+ if (opt_profile && rsp.n_ops > 0) {
2106
+ auto & ops = op_cache[rsp.id];
2107
+
2108
+ uint64_t batch_usec = ggml_time_us() - start_usec[rsp.id]; // host-side time elapsed since push()
2109
+ uint32_t htp_usec = 0; // sum of the per-op usecs fields
2110
+
2111
+ GGML_ASSERT(rsp.n_ops <= ops.size());
2112
+
2113
+ const htp_prof_desc * pd = (const htp_prof_desc *) p_ptr;
2114
+ for (uint32_t i = 0; i < rsp.n_ops; i++) {
2115
+ htp_usec += pd[i].usecs;
2116
+ ggml_hexagon_dump_op_prof(shm_buf->sess->name, ops[i], pd[i].usecs, pd[i].cycles, pd[i].pmu); // NOTE(review): passes sess->name where the other calls in this struct pass c_name(); confirm the parameter type expected here
2117
+ }
2118
+
2119
+ GGML_LOG_DEBUG("ggml-hex: %s profile-batch n-ops %u batch-dur-usec %lld htp-ops-usec %u\n",
2120
+ shm_buf->sess->c_name(), rsp.n_ops, (long long) batch_usec, htp_usec);
2121
+ }
2122
+ }
2123
+ };
2124
+
2125
+ // Flush HTP response queue i.e wait for all outstanding requests to complete
+ // With all == false the loop stops after the first response that was processed; it returns immediately when nothing is pending.
2126
+ void ggml_hexagon_session::flush_pending(bool all) {
2127
+ while (this->op_pending) {
2128
+ struct htp_opbatch_rsp rsp;
2129
+ uint32_t rsp_size;
2130
+ uint32_t flags;
2131
+
2132
+ struct dspqueue_buffer dbuf;
2133
+ uint32_t n_dbufs;
2134
+
2135
+ // Read response packet from queue (timeout 0 when opt_oppoll is set, DSPQUEUE_TIMEOUT otherwise)
2136
+ const uint32_t timeo = opt_oppoll ? 0 : DSPQUEUE_TIMEOUT;
2137
+ int err = dspqueue_read(this->queue, &flags, 1, &n_dbufs, &dbuf, sizeof(rsp), &rsp_size, (uint8_t *) &rsp, timeo);
2138
+ if (err == AEE_EEXPIRED) { // read timed out without a packet: try again
2139
+ continue;
2140
+ }
2141
+
2142
+ if (err != 0) {
2143
+ GGML_ABORT("ggml-hex: dspqueue_read failed: 0x%08x\n", (unsigned) err);
2144
+ }
2145
+
2146
+ // Basic sanity checks: the message must be exactly one htp_opbatch_rsp accompanied by exactly one buffer
2147
+ if (rsp_size != sizeof(rsp) || n_dbufs != 1) {
2148
+ GGML_ABORT("ggml-hex: %s dspcall : bad response : size %u dspbufs %u\n", this->c_name(), rsp_size, n_dbufs);
2149
+ }
2150
+
2151
+ if (rsp.status != HTP_STATUS_OK) { // a failed batch is only logged; processing continues as if it had succeeded
2152
+ GGML_LOG_ERROR("ggml-hex: %s dspcall : dsp-rsp: %s\n", this->c_name(), status_to_str(rsp.status));
2153
+ // TODO: handle errors
2154
+ }
2155
+
2156
+ op_queue->pop(rsp, dbuf); // releases the batch block and logs profile data when enabled
2157
+
2158
+ this->op_pending--; // atomic dec
2159
+
2160
+ if (!all) break;
2161
+ }
2162
+ }
2163
+
2164
+ void ggml_hexagon_session::flush_batch() { // Submits the currently accumulated op batch, if any, to the DSP queue; aborts when the queue write fails.
2165
+ if (op_batch->empty()) { return; }
2166
+
2167
+ htp_opbatch_req req {};
2168
+ dspqueue_buffer dbuf{};
2169
+
2170
+ if (!op_queue->push(req, dbuf, op_batch)) { // no free batch block: retire one pending response, then retry
2171
+ flush_pending(false);
2172
+ op_queue->push(req, dbuf, op_batch); // NOTE(review): the result of this retry is not checked; if it returned false, req/dbuf would still be zero-initialized when written below
2173
+ }
2174
+
2175
+ // Bump pending flag (cleared in the session::flush once we get the response)
2176
+ this->op_pending++; // atomic inc
2177
+
2178
+ HEX_VERBOSE("ggml-hex: %s queue-opbatch: %p size %u\n", this->c_name(), dbuf.ptr, dbuf.size);
2179
+
2180
+ int err = dspqueue_write(this->queue, 0, 1, &dbuf, sizeof(req), (const uint8_t*) &req, DSPQUEUE_TIMEOUT);
2181
+ if (err != 0) {
2182
+ GGML_ABORT("ggml-hex: %s dspqueue_write failed: 0x%08x\n", this->c_name(), (unsigned) err);
2183
+ }
2184
+ }
2185
+
2186
+ void ggml_hexagon_session::enqueue_op(const htp_opnode & node) { // Adds `node` to the current batch, submitting the batch first when the node does not fit.
2187
+ if (!op_batch->fit_op(node)) {
2188
+ flush_batch();
2189
+ }
2190
+ op_batch->add_op(node); // NOTE(review): fit_op() is not re-checked after the flush; presumably a single op always fits a freshly reset batch (confirm)
2191
+ }
2192
+
2193
+ // Flush HTP response queue i.e wait for all outstanding requests to complete
+ // Submits the partially filled batch first, then drains responses (all of them, or just one when all == false).
2194
+ void ggml_hexagon_session::flush(bool all) {
2195
+ flush_batch();
2196
+ flush_pending(all);
2197
+ }
2198
+
2199
+ static size_t ggml_hexagon_measure_max_vmem(ggml_hexagon_session *sess) { // Returns the number of bytes that could be allocated as pinned shared buffers before an allocation failed, minus one step.
2200
+ // Allocate a bunch pinned buffers till failure.
2201
+ // This is kind of expensive but handy for figuring out exactly how much we can mmap on a specific device.
2202
+ // Typically we're going to allocate all/most of these buffers anyway for the model weights.
2203
+
2204
+ std::vector<ggml_hexagon_shared_buffer *> sbufs;
2205
+
2206
+ const size_t MiB = 1024 * 1024;
2207
+ const size_t GiB = MiB * 1024;
2208
+
2209
+ size_t vmem = 0; // bytes allocated successfully so far
2210
+ size_t step = 256u * MiB; // probe granularity after the initial three 1 GiB buffers
2211
+
2212
+ try { // the loop below has no exit condition of its own: it ends only when something inside throws, presumably the buffer constructor on allocation failure
2213
+ sbufs.push_back(new ggml_hexagon_shared_buffer(sess, GiB, true)); vmem += GiB;
2214
+ sbufs.push_back(new ggml_hexagon_shared_buffer(sess, GiB, true)); vmem += GiB;
2215
+ sbufs.push_back(new ggml_hexagon_shared_buffer(sess, GiB, true)); vmem += GiB;
2216
+
2217
+ while (1) {
2218
+ sbufs.push_back(new ggml_hexagon_shared_buffer(sess, step, true));
2219
+ vmem += step;
2220
+ }
2221
+ } catch (...) { }
2222
+
2223
+ for (auto b : sbufs) { delete b; } // release every probe buffer before returning
2224
+
2225
+ return vmem - step; // backoff to account for overhead from internal mappings. NOTE(review): if the very first allocation throws, vmem is still 0 and this unsigned subtraction wraps around to a huge value
2226
+ }
2227
+
1545
2228
  void ggml_hexagon_session::allocate(int dev_id) noexcept(false) {
1546
2229
  this->valid_session = false;
1547
2230
  this->valid_handle = false;
@@ -1554,11 +2237,8 @@ void ggml_hexagon_session::allocate(int dev_id) noexcept(false) {
1554
2237
  this->name = std::string("HTP") + std::to_string(dev_id);
1555
2238
 
1556
2239
  this->op_pending = 0;
1557
- this->prof_usecs = 0;
1558
- this->prof_cycles = 0;
1559
- this->prof_pkts = 0;
1560
2240
 
1561
- GGML_LOG_INFO("ggml-hex: allocating new session: %s\n", this->name.c_str());
2241
+ GGML_LOG_DEBUG("ggml-hex: %s allocating new session\n", this->name.c_str());
1562
2242
 
1563
2243
  domain * my_domain = get_domain(this->domain_id);
1564
2244
  if (my_domain == NULL) {
@@ -1634,9 +2314,6 @@ void ggml_hexagon_session::allocate(int dev_id) noexcept(false) {
1634
2314
 
1635
2315
  this->valid_handle = true;
1636
2316
 
1637
- GGML_LOG_INFO("ggml-hex: new session: %s : session-id %d domain-id %d uri %s handle 0x%lx\n", this->name.c_str(),
1638
- this->session_id, this->domain_id, session_uri, (unsigned long) this->handle);
1639
-
1640
2317
  // Enable FastRPC QoS mode
1641
2318
  {
1642
2319
  struct remote_rpc_control_latency l;
@@ -1648,11 +2325,17 @@ void ggml_hexagon_session::allocate(int dev_id) noexcept(false) {
1648
2325
  }
1649
2326
  }
1650
2327
 
2328
+ GGML_LOG_INFO("ggml-hex: %s new session : session-id %d domain-id %d uri %s handle 0x%lx\n", this->c_name(),
2329
+ this->session_id, this->domain_id, session_uri, (unsigned long) this->handle);
2330
+
2331
+ const size_t req_q_size = (sizeof(htp_opbatch_req) * opt_opqueue * 2) + 1024;
2332
+ const size_t rsp_q_size = (sizeof(htp_opbatch_rsp) * opt_opqueue * 2) + 1024;
2333
+
1651
2334
  // Now let's setup the DSP queue
1652
2335
  err = dspqueue_create(this->domain_id,
1653
2336
  0, // Flags
1654
- 128 * 1024, // Request queue size (in bytes)
1655
- 64 * 1024, // Response queue size (in bytes)
2337
+ req_q_size, // Request queue size (in bytes)
2338
+ rsp_q_size, // Response queue size (in bytes)
1656
2339
  nullptr, // Read packet callback (we handle reads explicitly)
1657
2340
  nullptr, // Error callback (we handle errors during reads)
1658
2341
  (void *) this, // Callback context
@@ -1672,18 +2355,36 @@ void ggml_hexagon_session::allocate(int dev_id) noexcept(false) {
1672
2355
  }
1673
2356
 
1674
2357
  if (opt_etm) {
1675
- err = htp_iface_enable_etm(this->handle);
2358
+ err = htp_iface_etm(this->handle, 1);
1676
2359
  if (err != 0) {
1677
2360
  GGML_LOG_ERROR("ggml-hex: failed to enable ETM tracing: 0x%08x\n", (unsigned) err);
1678
2361
  }
1679
2362
  }
1680
2363
 
1681
- // Start the DSP-side service. We need to pass the queue ID to the
1682
- // DSP in a FastRPC call; the DSP side will import the queue and start
1683
- // listening for packets in a callback.
1684
- err = htp_iface_start(this->handle, dev_id, this->queue_id, opt_nhvx);
2364
+ if (opt_profile) {
2365
+ htp_iface_pmu_conf pmu_conf{};
2366
+ std::copy(opt_pmu_evt.begin(), opt_pmu_evt.end(), pmu_conf.events);
2367
+
2368
+ err = htp_iface_profiler(this->handle, opt_profile, &pmu_conf);
2369
+ if (err != 0) {
2370
+ GGML_LOG_ERROR("ggml-hex: failed to enable profiling: 0x%08x\n", (unsigned) err);
2371
+ }
2372
+ }
2373
+
2374
+ // Allocate buffers and state for op batching
2375
+ this->op_queue = new ggml_hexagon_opqueue(this, opt_opbatch, opt_opqueue);
2376
+
2377
+ if (!opt_vmem) {
2378
+ opt_vmem = ggml_hexagon_measure_max_vmem(this);
2379
+ GGML_LOG_INFO("ggml-hex: %s measured max vmem %zu\n", this->c_name(), opt_vmem);
2380
+ }
2381
+
2382
+ this->op_batch = new ggml_hexagon_opbatch(this, opt_opbatch, opt_vmem);
2383
+
2384
+ // Start dspqueue/opbatch processing
2385
+ err = htp_iface_start(this->handle, dev_id, this->queue_id, opt_nhvx, opt_use_hmx, opt_vmem);
1685
2386
  if (err != 0) {
1686
- GGML_LOG_ERROR("ggml-hex: failed to start session: 0x%08x\n", (unsigned) err);
2387
+ GGML_LOG_ERROR("ggml-hex: %s failed to start session: 0x%08x\n", this->c_name(), (unsigned) err);
1687
2388
  throw std::runtime_error("ggml-hex: iface start failed (see log for details)");
1688
2389
  }
1689
2390
  this->valid_iface = true;
@@ -1694,21 +2395,32 @@ void ggml_hexagon_session::release() noexcept(true) {
1694
2395
 
1695
2396
  int err;
1696
2397
 
1697
- // Stop the DSP-side service and close the queue
1698
2398
  if (this->valid_iface) {
2399
+ // Stop dspqueue/opbatch processing
1699
2400
  err = htp_iface_stop(this->handle);
1700
2401
  if (err != 0) {
1701
2402
  GGML_ABORT("ggml-hex: htp_iface_stop failed: 0x%08x\n", (unsigned) err);
1702
2403
  }
1703
2404
  }
1704
2405
 
2406
+ delete this->op_batch;
2407
+ delete this->op_queue;
2408
+
1705
2409
  if (opt_etm) {
1706
- err = htp_iface_disable_etm(this->handle);
2410
+ err = htp_iface_etm(this->handle, 0);
1707
2411
  if (err != 0) {
1708
2412
  GGML_LOG_ERROR("ggml-hex: warn : failed to disable ETM tracing: 0x%08x\n", (unsigned) err);
1709
2413
  }
1710
2414
  }
1711
2415
 
2416
+ if (opt_profile) {
2417
+ htp_iface_pmu_conf pmu_conf{};
2418
+ err = htp_iface_profiler(this->handle, 0, &pmu_conf);
2419
+ if (err != 0) {
2420
+ GGML_LOG_ERROR("ggml-hex: warn : failed to disable profiling: 0x%08x\n", (unsigned) err);
2421
+ }
2422
+ }
2423
+
1712
2424
  if (this->valid_queue) {
1713
2425
  err = dspqueue_close(queue);
1714
2426
  if (err != 0) {
@@ -1725,6 +2437,9 @@ ggml_hexagon_session::ggml_hexagon_session(int dev_id, ggml_backend_dev_t dev) n
1725
2437
  buffer_type.device = dev;
1726
2438
  repack_buffer_type.device = dev;
1727
2439
 
2440
+ op_batch = nullptr;
2441
+ op_queue = nullptr;
2442
+
1728
2443
  try {
1729
2444
  allocate(dev_id);
1730
2445
 
@@ -1753,24 +2468,10 @@ static bool ggml_backend_buffer_is_hexagon(const struct ggml_backend_buffer * b)
1753
2468
  }
1754
2469
 
1755
2470
  static inline bool ggml_backend_buffer_is_hexagon_repack(const struct ggml_backend_buffer * b) {
1756
- return b->buft->iface.alloc_buffer == ggml_backend_hexagon_repack_buffer_type_alloc_buffer;
1757
- }
1758
-
1759
- static bool hex_supported_dims2(const struct ggml_tensor * x, const struct ggml_tensor * y) {
1760
- if (x->ne[0] != y->ne[0]) {
1761
- return false;
1762
- }
1763
- if (x->ne[1] != y->ne[1]) {
1764
- return false;
1765
- }
1766
- if (x->ne[2] != y->ne[2]) {
1767
- return false;
2471
+ if (!opt_hostbuf) {
2472
+ return ggml_backend_buffer_is_hexagon(b);
1768
2473
  }
1769
- if (x->ne[3] != y->ne[3]) {
1770
- return false;
1771
- }
1772
-
1773
- return true;
2474
+ return b->buft->iface.alloc_buffer == ggml_backend_hexagon_repack_buffer_type_alloc_buffer;
1774
2475
  }
1775
2476
 
1776
2477
  static bool ggml_hexagon_supported_flash_attn_ext(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
@@ -1801,44 +2502,64 @@ static bool ggml_hexagon_supported_flash_attn_ext(const struct ggml_hexagon_sess
1801
2502
  return false;
1802
2503
  }
1803
2504
 
1804
- return opt_experimental;
1805
- }
2505
+ if (dst->ne[3] != 1) {
2506
+ return false;
2507
+ }
1806
2508
 
1807
- static bool hex_supported_src0_type(ggml_type t) {
1808
- return t == GGML_TYPE_F32;
2509
+ return true;
1809
2510
  }
1810
2511
 
1811
- static bool hex_supported_src1_type(ggml_type t) {
1812
- return t == GGML_TYPE_F32;
1813
- }
2512
+ static bool ggml_hexagon_supported_gated_delta_net(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
2513
+ const struct ggml_tensor * q = op->src[0];
2514
+ const struct ggml_tensor * k = op->src[1];
2515
+ const struct ggml_tensor * v = op->src[2];
2516
+ const struct ggml_tensor * g = op->src[3];
2517
+ const struct ggml_tensor * beta = op->src[4];
2518
+ const struct ggml_tensor * state = op->src[5];
2519
+ const struct ggml_tensor * dst = op;
1814
2520
 
1815
- static bool hex_supported_src2_type(ggml_type t) {
1816
- return t == GGML_TYPE_F32;
1817
- }
2521
+ if (!q || !k || !v || !g || !beta || !state) {
2522
+ return false;
2523
+ }
1818
2524
 
1819
- static bool hex_supported_src1_type2(ggml_type t) {
1820
- return t == GGML_TYPE_F16;
1821
- }
2525
+ if (q->type != GGML_TYPE_F32 || k->type != GGML_TYPE_F32 || v->type != GGML_TYPE_F32 ||
2526
+ g->type != GGML_TYPE_F32 || beta->type != GGML_TYPE_F32 || state->type != GGML_TYPE_F32 ||
2527
+ dst->type != GGML_TYPE_F32) {
2528
+ return false;
2529
+ }
1822
2530
 
1823
- static bool hex_supported_src1_type3(ggml_type t) {
1824
- return t == GGML_TYPE_I32;
1825
- }
2531
+ if (!ggml_is_contiguous_rows(q) || !ggml_is_contiguous_rows(k) || !ggml_is_contiguous_rows(v) ||
2532
+ !ggml_is_contiguous(g) || !ggml_is_contiguous(beta) || !ggml_is_contiguous(state) ||
2533
+ !ggml_is_contiguous(dst)) {
2534
+ return false;
2535
+ }
1826
2536
 
1827
- static bool hex_supported_dst_type(ggml_type t) {
1828
- return t == GGML_TYPE_F32;
1829
- }
2537
+ const int64_t S_v = v->ne[0];
2538
+ const int64_t H = v->ne[1];
2539
+ const int64_t n_tokens = v->ne[2];
2540
+ const int64_t n_seqs = v->ne[3];
2541
+ const int64_t K = ggml_get_op_params_i32(op, 0);
1830
2542
 
1831
- static bool hex_supported_dims(const struct ggml_tensor * x, const struct ggml_tensor * y) {
1832
- // TODO: support broadcast for ne[2 and 3]
1833
- if (x->ne[0] != y->ne[0]) {
2543
+ if (S_v <= 0 || S_v > 128 || H <= 0 || n_tokens <= 0 || n_seqs <= 0) {
2544
+ return false;
2545
+ }
2546
+ if (q->ne[0] != S_v || k->ne[0] != S_v || q->ne[1] <= 0 || k->ne[1] <= 0 ||
2547
+ q->ne[2] != n_tokens || k->ne[2] != n_tokens || q->ne[3] <= 0 || k->ne[3] <= 0 ||
2548
+ (n_seqs % q->ne[3]) != 0 || (n_seqs % k->ne[3]) != 0) {
1834
2549
  return false;
1835
2550
  }
1836
- if (x->ne[2] != y->ne[2]) {
2551
+ if ((g->ne[0] != 1 && g->ne[0] != S_v) || beta->ne[0] != 1) {
1837
2552
  return false;
1838
2553
  }
1839
- if (x->ne[3] != y->ne[3]) {
2554
+ // state holds s0 only [S_v, S_v, H, n_seqs]; K is op param 0.
2555
+ if (ggml_nelements(state) != S_v * S_v * H * n_seqs) {
1840
2556
  return false;
1841
2557
  }
2558
+ if (dst->ne[0] != S_v * H || dst->ne[1] != n_tokens * n_seqs + S_v * n_seqs * K) {
2559
+ return false;
2560
+ }
2561
+
2562
+ GGML_UNUSED(sess);
1842
2563
  return true;
1843
2564
  }
1844
2565
 
@@ -1856,18 +2577,20 @@ static bool ggml_hexagon_supported_mul_mat(const struct ggml_hexagon_session * s
1856
2577
 
1857
2578
  switch (src0->type) {
1858
2579
  case GGML_TYPE_Q4_0:
2580
+ case GGML_TYPE_Q4_1:
1859
2581
  case GGML_TYPE_Q8_0:
2582
+ case GGML_TYPE_IQ4_NL:
1860
2583
  case GGML_TYPE_MXFP4:
1861
2584
  if (src0->ne[0] % 32) {
1862
2585
  return false;
1863
2586
  }
1864
2587
 
1865
- if (src0->ne[1] > 16 * 1024) {
2588
+ if (ggml_nrows(src0) > 16 * 1024) {
1866
2589
  return false; // typically the lm-head which would be too large for VTCM
1867
2590
  }
1868
2591
 
1869
- if ((src1->ne[2] != 1 || src1->ne[3] != 1)) {
1870
- return false;
2592
+ if (ggml_nrows(src1) > 1024 || src1->ne[2] != 1 || src1->ne[3] != 1) {
2593
+ return false; // no huge batches or broadcasting (for now)
1871
2594
  }
1872
2595
 
1873
2596
  // src0 (weights) must be repacked
@@ -1881,6 +2604,30 @@ static bool ggml_hexagon_supported_mul_mat(const struct ggml_hexagon_session * s
1881
2604
  GGML_LOG_DEBUG("ggml_hexagon_supported_mul_mat: permuted F16 src0 not supported\n");
1882
2605
  return false;
1883
2606
  }
2607
+ if (src1->ne[2] < src0->ne[2] || src1->ne[3] < src0->ne[3]) {
2608
+ GGML_LOG_DEBUG("ggml_hexagon_supported_mul_mat: src1 broadcasting not supported\n");
2609
+ return false;
2610
+ }
2611
+ if (ggml_nrows(src1) > 1024) {
2612
+ return false; // no huge batches (for now)
2613
+ }
2614
+ break;
2615
+
2616
+ case GGML_TYPE_F32:
2617
+ if (src1->type != GGML_TYPE_F32) {
2618
+ return false;
2619
+ }
2620
+ if (src0->nb[1] < src0->nb[0]) {
2621
+ GGML_LOG_DEBUG("ggml_hexagon_supported_mul_mat: permuted F32 src0 not supported\n");
2622
+ return false;
2623
+ }
2624
+ if (src1->ne[2] < src0->ne[2] || src1->ne[3] < src0->ne[3]) {
2625
+ GGML_LOG_DEBUG("ggml_hexagon_supported_mul_mat: src1 broadcasting not supported\n");
2626
+ return false;
2627
+ }
2628
+ if (ggml_nrows(src1) > 1024) {
2629
+ return false; // no huge batches (for now)
2630
+ }
1884
2631
  break;
1885
2632
 
1886
2633
  default:
@@ -1902,7 +2649,9 @@ static bool ggml_hexagon_supported_mul_mat_id(const struct ggml_hexagon_session
1902
2649
 
1903
2650
  switch (src0->type) {
1904
2651
  case GGML_TYPE_Q4_0:
2652
+ case GGML_TYPE_Q4_1:
1905
2653
  case GGML_TYPE_Q8_0:
2654
+ case GGML_TYPE_IQ4_NL:
1906
2655
  case GGML_TYPE_MXFP4:
1907
2656
  if ((src0->ne[0] % 32)) {
1908
2657
  return false;
@@ -1926,24 +2675,30 @@ static bool ggml_hexagon_supported_binary(const struct ggml_hexagon_session * se
1926
2675
  const struct ggml_tensor * src1 = op->src[1];
1927
2676
  const struct ggml_tensor * dst = op;
1928
2677
 
1929
- if (!hex_supported_src0_type(src0->type)) {
1930
- return false;
1931
- }
1932
- if (!hex_supported_src1_type(src1->type)) {
1933
- return false;
2678
+ if (src0->type == GGML_TYPE_F32) {
2679
+ if (src1->type != GGML_TYPE_F32) {
2680
+ return false;
2681
+ }
2682
+ if (dst->type != GGML_TYPE_F32) {
2683
+ return false;
2684
+ }
1934
2685
  }
1935
- if (!hex_supported_dst_type(dst->type)) {
1936
- return false;
2686
+ else if (src0->type == GGML_TYPE_F16) {
2687
+ if (src1->type != GGML_TYPE_F16) {
2688
+ return false;
2689
+ }
2690
+ if (dst->type != GGML_TYPE_F16) {
2691
+ return false;
2692
+ }
1937
2693
  }
1938
- if (!hex_supported_dims2(src0, dst)) {
2694
+ else {
1939
2695
  return false;
1940
2696
  }
1941
- if (!ggml_can_repeat(src1, src0)) {
2697
+
2698
+ if (!ggml_are_same_shape(src0, dst)) {
1942
2699
  return false;
1943
2700
  }
1944
-
1945
- // TODO: add support for non-contigiuos tensors
1946
- if (!ggml_is_contiguous(src0) || !ggml_is_contiguous(src1) || !ggml_is_contiguous(dst)) {
2701
+ if (!ggml_can_repeat(src1, src0) || ggml_is_permuted(src1)) {
1947
2702
  return false;
1948
2703
  }
1949
2704
 
@@ -1955,16 +2710,16 @@ static bool ggml_hexagon_supported_add_id(const struct ggml_hexagon_session * se
1955
2710
  const struct ggml_tensor * src1 = op->src[1];
1956
2711
  const struct ggml_tensor * dst = op;
1957
2712
 
1958
- if (!hex_supported_src0_type(src0->type)) {
2713
+ if (src0->type != GGML_TYPE_F32) {
1959
2714
  return false;
1960
2715
  }
1961
- if (!hex_supported_src1_type(src1->type)) {
2716
+ if (src1->type != GGML_TYPE_F32) {
1962
2717
  return false;
1963
2718
  }
1964
- if (!hex_supported_dst_type(dst->type)) {
2719
+ if (dst->type != GGML_TYPE_F32) {
1965
2720
  return false;
1966
2721
  }
1967
- if (!hex_supported_dims2(src0, dst)) {
2722
+ if (!ggml_are_same_shape(src0, dst)) {
1968
2723
  return false;
1969
2724
  }
1970
2725
 
@@ -1980,13 +2735,32 @@ static bool ggml_hexagon_supported_unary(const struct ggml_hexagon_session * ses
1980
2735
  const struct ggml_tensor * src0 = op->src[0];
1981
2736
  const struct ggml_tensor * dst = op;
1982
2737
 
1983
- if (!hex_supported_src0_type(src0->type)) {
2738
+ if (src0->type != GGML_TYPE_F32) {
2739
+ return false;
2740
+ }
2741
+ if (dst->type != GGML_TYPE_F32) {
2742
+ return false;
2743
+ }
2744
+ if (!ggml_are_same_shape(src0, dst)) {
2745
+ return false;
2746
+ }
2747
+
2748
+ // dst must be contiguous; src0 may be non-contiguous
2749
+ if (!ggml_is_contiguous(dst)) {
1984
2750
  return false;
1985
2751
  }
1986
- if (!hex_supported_dst_type(dst->type)) {
2752
+
2753
+ return true;
2754
+ }
2755
+
2756
+ static bool ggml_hexagon_supported_sum_rows(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
2757
+ const struct ggml_tensor * src0 = op->src[0];
2758
+ const struct ggml_tensor * dst = op;
2759
+
2760
+ if (src0->type != GGML_TYPE_F32) {
1987
2761
  return false;
1988
2762
  }
1989
- if (!hex_supported_dims2(src0, dst)) {
2763
+ if (dst->type != GGML_TYPE_F32) {
1990
2764
  return false;
1991
2765
  }
1992
2766
 
@@ -2004,10 +2778,10 @@ static bool ggml_hexagon_supported_activations(const struct ggml_hexagon_session
2004
2778
  const struct ggml_tensor * src1 = op->src[1];
2005
2779
  const struct ggml_tensor * dst = op;
2006
2780
 
2007
- if (!hex_supported_src0_type(src0->type)) {
2781
+ if (src0->type != GGML_TYPE_F32) {
2008
2782
  return false;
2009
2783
  }
2010
- if (!hex_supported_dst_type(dst->type)) {
2784
+ if (dst->type != GGML_TYPE_F32) {
2011
2785
  return false;
2012
2786
  }
2013
2787
 
@@ -2016,10 +2790,10 @@ static bool ggml_hexagon_supported_activations(const struct ggml_hexagon_session
2016
2790
  }
2017
2791
 
2018
2792
  if (src1) {
2019
- if (!hex_supported_src1_type(src1->type)) {
2793
+ if (src1->type != GGML_TYPE_F32) {
2020
2794
  return false;
2021
2795
  }
2022
- if (!hex_supported_dims2(src0, src1)) {
2796
+ if (!ggml_are_same_shape(src0, src1)) {
2023
2797
  return false;
2024
2798
  }
2025
2799
  if (!ggml_is_contiguous(src1)) {
@@ -2040,15 +2814,15 @@ static bool ggml_hexagon_supported_softmax(const struct ggml_hexagon_session * s
2040
2814
  return false; // FIXME: add support for sinks
2041
2815
  }
2042
2816
 
2043
- if (!hex_supported_src0_type(src0->type)) {
2817
+ if (src0->type != GGML_TYPE_F32) {
2044
2818
  return false;
2045
2819
  }
2046
- if (!hex_supported_dst_type(dst->type)) {
2820
+ if (dst->type != GGML_TYPE_F32) {
2047
2821
  return false;
2048
2822
  }
2049
2823
 
2050
2824
  if (src1) {
2051
- if (!hex_supported_src1_type(src1->type) && !hex_supported_src1_type2(src1->type)) {
2825
+ if (src1->type != GGML_TYPE_F32 && src1->type != GGML_TYPE_F16) {
2052
2826
  return false;
2053
2827
  }
2054
2828
  if (src0->ne[0] != src1->ne[0]) {
@@ -2075,6 +2849,23 @@ static bool ggml_hexagon_supported_softmax(const struct ggml_hexagon_session * s
2075
2849
  }
2076
2850
  }
2077
2851
 
2852
+ // Reject non-HVX-aligned sizes when ne[0] > HVX_F32_LANES
2853
+ // The HVX softmax implementation has issues with tail handling for larger non-aligned sizes
2854
+ // Small sizes (ne[0] <= 32) work correctly with tail-only processing
2855
+ const int64_t ne0 = src0->ne[0];
2856
+ if (ne0 > 32 && (ne0 & (32 - 1)) != 0) {
2857
+ return false;
2858
+ }
2859
+
2860
+ // HVX vector size constraints for softmax
2861
+ #define SOFTMAX_MAX_ROW_SIZE 131072 // 128K elements max for numerical precision
2862
+
2863
+ // Reject very large row sizes to avoid numerical precision issues
2864
+ // Softmax accumulation over many elements can lead to precision loss
2865
+ if (ne0 > SOFTMAX_MAX_ROW_SIZE) {
2866
+ return false;
2867
+ }
2868
+
2078
2869
  return true;
2079
2870
  }
2080
2871
 
@@ -2118,12 +2909,32 @@ static bool ggml_hexagon_supported_get_rows(const struct ggml_hexagon_session *
2118
2909
  return true;
2119
2910
  }
2120
2911
 
2912
+ static bool ggml_hexagon_supported_argsort(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) { // Accepts the op when src0 is F32, dst is I32 and src0 rows have at most 16*1024 elements. `sess` is not referenced in the body.
2913
+ const struct ggml_tensor * src0 = op->src[0]; // values
2914
+ const struct ggml_tensor * dst = op; // indices
2915
+
2916
+ if (src0->type != GGML_TYPE_F32) {
2917
+ return false;
2918
+ }
2919
+
2920
+ if (dst->type != GGML_TYPE_I32) {
2921
+ return false;
2922
+ }
2923
+
2924
+ if (src0->ne[0] > (16*1024)) {
2925
+ // reject tensors with huge rows for now (ne[0] is the row length checked here)
2926
+ return false;
2927
+ }
2928
+
2929
+ return true;
2930
+ }
2931
+
2121
2932
  static bool ggml_hexagon_supported_rope(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
2122
2933
  const int32_t * op_params = &op->op_params[0];
2123
2934
 
2124
2935
  int mode = op_params[2];
2125
2936
 
2126
- if ((mode & GGML_ROPE_TYPE_MROPE) || (mode & GGML_ROPE_TYPE_VISION)) {
2937
+ if (mode == GGML_ROPE_TYPE_VISION) {
2127
2938
  return false;
2128
2939
  }
2129
2940
  if (mode & 1) {
@@ -2135,17 +2946,17 @@ static bool ggml_hexagon_supported_rope(const struct ggml_hexagon_session * sess
2135
2946
  const struct ggml_tensor * src2 = op->src[2];
2136
2947
  const struct ggml_tensor * dst = op;
2137
2948
 
2138
- if (!hex_supported_src0_type(src0->type)) {
2949
+ if (src0->type != GGML_TYPE_F32) {
2139
2950
  return false; // FIXME: add support for GGML_TYPE_F16 for src0
2140
2951
  }
2141
- if (!hex_supported_dst_type(dst->type)) {
2952
+ if (dst->type != GGML_TYPE_F32) {
2142
2953
  return false;
2143
2954
  }
2144
- if (!hex_supported_src1_type3(src1->type)) {
2955
+ if (src1->type != GGML_TYPE_I32) {
2145
2956
  return false;
2146
2957
  }
2147
2958
  if (src2) {
2148
- if (!hex_supported_src2_type(src2->type)) {
2959
+ if (src2->type != GGML_TYPE_F32) {
2149
2960
  return false;
2150
2961
  }
2151
2962
  int n_dims = op_params[1];
@@ -2168,277 +2979,147 @@ static bool ggml_hexagon_supported_rope(const struct ggml_hexagon_session * sess
2168
2979
  return true;
2169
2980
  }
2170
2981
 
2171
- enum dspqbuf_type {
2172
- DSPQBUF_TYPE_DSP_WRITE_CPU_READ = 0,
2173
- DSPQBUF_TYPE_CPU_WRITE_DSP_READ,
2174
- DSPQBUF_TYPE_CONSTANT,
2175
- };
2176
-
2177
- static void dspqbuf_dump(dspqueue_buffer * d, const struct ggml_tensor * t, dspqbuf_type type) {
2178
- if (opt_verbose < 2) return;
2179
-
2180
- auto buf = static_cast<ggml_backend_hexagon_buffer_context *>(t->buffer->context);
2181
- auto sess = buf->sess;
2182
-
2183
- GGML_LOG_DEBUG("ggml-hex: %s dspqbuf : %s base-addr %p base-size %zu data %p offset %u size %u\n", sess->name.c_str(),
2184
- t->name, (void *) buf->base, buf->size, (void *) d->ptr, (unsigned int) d->offset,
2185
- (unsigned int) d->size);
2186
- }
2187
-
2188
- // Init hexagon tensor from GGML tensor and Hexagon buffer
2189
- static void htp_req_tensor_init(htp_tensor * h, const ggml_tensor * t) {
2190
- h->data = 0; // updated by the receiver
2191
- h->type = t->type;
2192
- h->ne[0] = t->ne[0];
2193
- h->ne[1] = t->ne[1];
2194
- h->ne[2] = t->ne[2];
2195
- h->ne[3] = t->ne[3];
2196
- h->nb[0] = t->nb[0];
2197
- h->nb[1] = t->nb[1];
2198
- h->nb[2] = t->nb[2];
2199
- h->nb[3] = t->nb[3];
2200
- }
2982
+ static bool ggml_hexagon_supported_ssm_conv(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
2983
+ const struct ggml_tensor * src0 = op->src[0];
2984
+ const struct ggml_tensor * src1 = op->src[1];
2985
+ const struct ggml_tensor * dst = op;
2201
2986
 
2202
- static size_t htp_req_buff_init(htp_tensor *h, dspqueue_buffer * d, const ggml_tensor * t, dspqbuf_type type) {
2203
- if (!t) {
2204
- return 0;
2987
+ // Only support FP32 for now
2988
+ if (src0->type != GGML_TYPE_F32 || src1->type != GGML_TYPE_F32 || dst->type != GGML_TYPE_F32) {
2989
+ return false;
2205
2990
  }
2206
2991
 
2207
- auto buf = static_cast<ggml_backend_hexagon_buffer_context *>(t->buffer->context);
2992
+ // Check IO tensor shapes and dims
2993
+ if (src0->ne[3] != 1 || src1->ne[2] != 1 || src1->ne[3] != 1 || dst->ne[3] != 1) {
2994
+ return false; // src0 should be effectively 3D
2995
+ }
2208
2996
 
2209
- memset(d, 0, sizeof(*d));
2210
- d->fd = buf->fd;
2211
- d->ptr = t->data;
2212
- d->offset = (uint8_t *) t->data - buf->base;
2213
- d->size = ggml_nbytes(t);
2997
+ const int d_conv = src1->ne[0];
2998
+ const int d_inner = src0->ne[1];
2999
+ const int n_t = dst->ne[1];
3000
+ const int n_s = dst->ne[2];
2214
3001
 
2215
- if (!d->size) {
2216
- // Some requests contain srcs where ggml_nbytes() returns 0 but the rest of the op is non-empty
2217
- d->size = 64;
3002
+ if (src0->ne[0] != d_conv - 1 + n_t || src0->ne[1] != d_inner || src0->ne[2] != n_s) {
3003
+ return false;
2218
3004
  }
2219
-
2220
- switch (type) {
2221
- case DSPQBUF_TYPE_DSP_WRITE_CPU_READ:
2222
- // Flush CPU
2223
- d->flags = DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER;
2224
- break;
2225
- case DSPQBUF_TYPE_CPU_WRITE_DSP_READ:
2226
- // Flush CPU, Invalidate DSP
2227
- d->flags = DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT;
2228
- break;
2229
- default:
2230
- // Constant buffer, no cache maintenance
2231
- d->flags = 0;
2232
- break;
3005
+ if (src1->ne[0] != d_conv || src1->ne[1] != d_inner) {
3006
+ return false;
2233
3007
  }
2234
-
2235
- htp_req_tensor_init(h, t);
2236
-
2237
- dspqbuf_dump(d, t, type);
2238
-
2239
- return 1;
2240
- }
2241
-
2242
- typedef size_t (*htp_req_init_func_t)(htp_general_req * req, dspqueue_buffer * bufs, const ggml_tensor * op);
2243
-
2244
- template <htp_req_init_func_t _init_req_func>
2245
- static inline void ggml_hexagon_dispatch_op(ggml_hexagon_session *sess, const struct ggml_tensor * op, uint32_t flags) {
2246
- uint64_t t = ggml_time_us();
2247
-
2248
- // Construct HTP request
2249
- htp_general_req req;
2250
- memset(&req, 0, sizeof(req));
2251
-
2252
- req.flags = flags;
2253
- if (!(opt_opmask & HTP_OPMASK_QUANTIZE)) {
2254
- req.flags |= HTP_OPFLAGS_SKIP_QUANTIZE;
3008
+ if (dst->ne[0] != d_inner || dst->ne[1] != n_t || dst->ne[2] != n_s) {
3009
+ return false;
2255
3010
  }
2256
- if (!(opt_opmask & HTP_OPMASK_COMPUTE)) {
2257
- req.flags |= HTP_OPFLAGS_SKIP_COMPUTE;
3011
+ if (src0->nb[0] != sizeof(float) || src1->nb[0] != sizeof(float) || dst->nb[0] != sizeof(float)) {
3012
+ return false;
2258
3013
  }
2259
-
2260
- ggml_hexagon_dump_op_exec(sess->name, op, req.flags);
2261
-
2262
- if ((opt_opmask & HTP_OPMASK_QUEUE)) {
2263
- dspqueue_buffer bufs[HTP_MAX_PACKET_BUFFERS];
2264
- size_t n_bufs = _init_req_func(&req, bufs, op);
2265
- sess->enqueue(req, bufs, n_bufs, opt_opsync);
3014
+ if (src0->nb[1] != src0->ne[0] * sizeof(float) || src1->nb[1] != src1->ne[0] * sizeof(float)) {
3015
+ return false;
2266
3016
  }
2267
3017
 
2268
- t = ggml_time_us() - t;
2269
-
2270
- ggml_hexagon_dump_op_prof(sess->name, op, sess->prof_usecs, sess->prof_cycles, sess->prof_pkts, t);
3018
+ return true;
2271
3019
  }
2272
3020
 
2273
- template <bool _is_src0_constant>
2274
- static inline size_t init_binary_req(htp_general_req * req, dspqueue_buffer * bufs, const ggml_tensor * t) {
2275
- switch (t->op) {
2276
- case GGML_OP_MUL_MAT:
2277
- req->op = HTP_OP_MUL_MAT;
2278
- break;
2279
- case GGML_OP_MUL:
2280
- req->op = HTP_OP_MUL;
2281
- break;
2282
- case GGML_OP_ADD:
2283
- req->op = HTP_OP_ADD;
2284
- break;
2285
- case GGML_OP_SUB:
2286
- req->op = HTP_OP_SUB;
2287
- break;
2288
- default:
2289
- GGML_ABORT("ggml-hex: binary : unsupported op: %d\n", t->op);
2290
- break;
2291
- }
2292
-
2293
- // src0: Weights (mulmat) or First Operand (binary op).
2294
- // If constant (e.g. weights), no cache management is needed.
2295
- // src1: Input Activations (mulmat) or Second Operand (binary op).
3021
+ static bool ggml_hexagon_supported_pad(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
3022
+ const struct ggml_tensor * src0 = op->src[0];
3023
+ const struct ggml_tensor * dst = op;
2296
3024
 
2297
- size_t n_bufs = 0;
2298
- n_bufs += htp_req_buff_init(&req->src0, &bufs[n_bufs], t->src[0], _is_src0_constant ? DSPQBUF_TYPE_CONSTANT : DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
2299
- n_bufs += htp_req_buff_init(&req->src1, &bufs[n_bufs], t->src[1], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
2300
- n_bufs += htp_req_buff_init(&req->dst, &bufs[n_bufs], t, DSPQBUF_TYPE_DSP_WRITE_CPU_READ);
3025
+ if (src0->type != GGML_TYPE_F32 || dst->type != GGML_TYPE_F32) {
3026
+ return false;
3027
+ }
2301
3028
 
2302
- return n_bufs;
3029
+ GGML_UNUSED(sess);
3030
+ return true;
2303
3031
  }
2304
3032
 
2305
- static inline size_t init_get_rows_req(htp_general_req * req, dspqueue_buffer * bufs, const ggml_tensor * t) {
2306
- req->op = HTP_OP_GET_ROWS;
2307
-
2308
- size_t n_bufs = 0;
2309
- n_bufs += htp_req_buff_init(&req->src0, &bufs[n_bufs], t->src[0], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
2310
- n_bufs += htp_req_buff_init(&req->src1, &bufs[n_bufs], t->src[1], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
2311
- n_bufs += htp_req_buff_init(&req->dst, &bufs[n_bufs], t, DSPQBUF_TYPE_DSP_WRITE_CPU_READ);
2312
-
2313
- return n_bufs;
2314
- }
3033
+ static bool ggml_hexagon_supported_cumsum(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
3034
+ const struct ggml_tensor * src0 = op->src[0];
3035
+ const struct ggml_tensor * dst = op;
2315
3036
 
2316
- template <bool _is_src0_constant>
2317
- static inline size_t init_binary_id_req(htp_general_req * req, dspqueue_buffer * bufs, const ggml_tensor * t) {
2318
- switch (t->op) {
2319
- case GGML_OP_MUL_MAT_ID:
2320
- req->op = HTP_OP_MUL_MAT_ID;
2321
- break;
2322
- case GGML_OP_ADD_ID:
2323
- req->op = HTP_OP_ADD_ID;
2324
- break;
2325
- default:
2326
- GGML_ABORT("ggml-hex: unsupported op: %d\n", t->op);
3037
+ if (src0->type != GGML_TYPE_F32 || dst->type != GGML_TYPE_F32) {
3038
+ return false;
2327
3039
  }
2328
3040
 
2329
- // src0: Weights (mulmat) or Input Activations (other op).
2330
- // If constant, no cache management is needed.
2331
- // src1: Input Activations (mulmat) or Second Operand (binary op).
2332
- // src2: Expert IDs (mulmat) or Activated Experts (other op).
2333
-
2334
- size_t n_bufs = 0;
2335
- n_bufs += htp_req_buff_init(&req->src0, &bufs[n_bufs], t->src[0], _is_src0_constant ? DSPQBUF_TYPE_CONSTANT : DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
2336
- n_bufs += htp_req_buff_init(&req->src1, &bufs[n_bufs], t->src[1], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
2337
- n_bufs += htp_req_buff_init(&req->src2, &bufs[n_bufs], t->src[2], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
2338
- n_bufs += htp_req_buff_init(&req->dst, &bufs[n_bufs], t, DSPQBUF_TYPE_DSP_WRITE_CPU_READ);
3041
+ if (!ggml_is_contiguous(src0) || !ggml_is_contiguous(dst)) {
3042
+ return false;
3043
+ }
2339
3044
 
2340
- return n_bufs;
3045
+ GGML_UNUSED(sess);
3046
+ return true;
2341
3047
  }
2342
3048
 
2343
- static inline size_t init_set_rows_req(htp_general_req * req, dspqueue_buffer * bufs, const ggml_tensor * t) {
2344
- req->op = HTP_OP_SET_ROWS;
2345
-
2346
- size_t n_bufs = 0;
2347
- n_bufs += htp_req_buff_init(&req->src0, &bufs[n_bufs], t->src[0], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
2348
- n_bufs += htp_req_buff_init(&req->src1, &bufs[n_bufs], t->src[1], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
2349
- n_bufs += htp_req_buff_init(&req->dst, &bufs[n_bufs], t, DSPQBUF_TYPE_DSP_WRITE_CPU_READ);
3049
+ static bool ggml_hexagon_supported_diag(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
3050
+ const struct ggml_tensor * src0 = op->src[0];
3051
+ const struct ggml_tensor * dst = op;
2350
3052
 
2351
- return n_bufs;
2352
- }
3053
+ // diag only supports F32 currently
3054
+ if (src0->type != GGML_TYPE_F32 || dst->type != GGML_TYPE_F32) {
3055
+ return false;
3056
+ }
2353
3057
 
2354
- static inline size_t init_unary_req(htp_general_req * req, dspqueue_buffer * bufs, const ggml_tensor * t) {
2355
- memcpy(&req->op_params, &t->op_params, sizeof(t->op_params));
3058
+ // Input must have ne[1] == 1 (vector input)
3059
+ if (src0->ne[1] != 1) {
3060
+ return false;
3061
+ }
2356
3062
 
2357
- bool supported = false;
3063
+ // Output must be square in first two dimensions
3064
+ if (dst->ne[0] != dst->ne[1] || dst->ne[0] != src0->ne[0]) {
3065
+ return false;
3066
+ }
2358
3067
 
2359
- switch (t->op) {
2360
- case GGML_OP_RMS_NORM:
2361
- req->op = HTP_OP_RMS_NORM;
2362
- supported = true;
2363
- break;
3068
+ GGML_UNUSED(sess);
3069
+ return true;
3070
+ }
2364
3071
 
2365
- case GGML_OP_SCALE:
2366
- req->op = HTP_OP_SCALE;
2367
- supported = true;
2368
- break;
3072
+ static bool ggml_hexagon_supported_solve_tri(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
3073
+ const struct ggml_tensor * src0 = op->src[0]; // A
3074
+ const struct ggml_tensor * src1 = op->src[1]; // B
3075
+ const struct ggml_tensor * dst = op; // X
2369
3076
 
2370
- case GGML_OP_UNARY:
2371
- if (ggml_get_unary_op(t) == GGML_UNARY_OP_SILU) {
2372
- req->op = HTP_OP_UNARY_SILU;
2373
- supported = true;
2374
- } else if (ggml_get_unary_op(t) == GGML_UNARY_OP_GELU) {
2375
- req->op = HTP_OP_UNARY_GELU;
2376
- supported = true;
2377
- }
2378
- break;
3077
+ if (!src0 || !src1) {
3078
+ return false;
3079
+ }
2379
3080
 
2380
- case GGML_OP_GLU:
2381
- if (ggml_get_glu_op(t) == GGML_GLU_OP_SWIGLU) {
2382
- req->op = HTP_OP_GLU_SWIGLU;
2383
- supported = true;
2384
- } else if (ggml_get_glu_op(t) == GGML_GLU_OP_SWIGLU_OAI) {
2385
- req->op = HTP_OP_GLU_SWIGLU_OAI;
2386
- supported = true;
2387
- }
2388
- break;
3081
+ if (src0->type != GGML_TYPE_F32 || src1->type != GGML_TYPE_F32 || dst->type != GGML_TYPE_F32) {
3082
+ return false;
3083
+ }
2389
3084
 
2390
- case GGML_OP_SOFT_MAX:
2391
- req->op = HTP_OP_SOFTMAX;
2392
- supported = true;
2393
- break;
3085
+ if (src0->ne[0] != src0->ne[1]) {
3086
+ return false;
3087
+ }
2394
3088
 
2395
- default:
2396
- break;
3089
+ if (src0->ne[1] != src1->ne[1]) {
3090
+ return false;
2397
3091
  }
2398
3092
 
2399
- if (!supported) {
2400
- GGML_ABORT("ggml-hex: unary : unsupported op: %d\n", t->op);
3093
+ if (src0->ne[2] != src1->ne[2] || src0->ne[3] != src1->ne[3]) {
3094
+ return false;
2401
3095
  }
2402
3096
 
2403
- size_t n_bufs = 0;
2404
- n_bufs += htp_req_buff_init(&req->src0, &bufs[n_bufs], t->src[0], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
2405
- n_bufs += htp_req_buff_init(&req->src1, &bufs[n_bufs], t->src[1], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
2406
- n_bufs += htp_req_buff_init(&req->dst, &bufs[n_bufs], t, DSPQBUF_TYPE_DSP_WRITE_CPU_READ);
3097
+ if (dst->ne[0] != src1->ne[0] || dst->ne[1] != src1->ne[1] || dst->ne[2] != src1->ne[2] || dst->ne[3] != src1->ne[3]) {
3098
+ return false;
3099
+ }
2407
3100
 
2408
- return n_bufs;
3101
+ GGML_UNUSED(sess);
3102
+ return true;
2409
3103
  }
2410
3104
 
2411
- static inline size_t init_rope_req(htp_general_req * req, dspqueue_buffer * bufs, const ggml_tensor * t) {
2412
- memcpy(&req->op_params, &t->op_params, sizeof(t->op_params));
2413
- req->op = HTP_OP_ROPE;
3105
+ static bool ggml_hexagon_supported_tri(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
2414
3106
 
2415
- size_t n_bufs = 0;
2416
- n_bufs += htp_req_buff_init(&req->src0, &bufs[n_bufs], t->src[0], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
2417
- n_bufs += htp_req_buff_init(&req->src1, &bufs[n_bufs], t->src[1], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
2418
- n_bufs += htp_req_buff_init(&req->src2, &bufs[n_bufs], t->src[2], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
2419
- n_bufs += htp_req_buff_init(&req->dst, &bufs[n_bufs], t, DSPQBUF_TYPE_DSP_WRITE_CPU_READ);
2420
-
2421
- return n_bufs;
2422
- }
3107
+ const struct ggml_tensor * src0 = op->src[0];
3108
+ const struct ggml_tensor * dst = op;
2423
3109
 
2424
- static inline size_t init_flash_attn_ext_req(htp_general_req * req, dspqueue_buffer * bufs, const ggml_tensor * t) {
2425
- memcpy(&req->op_params, &t->op_params, sizeof(t->op_params));
2426
- req->op = HTP_OP_FLASH_ATTN_EXT;
3110
+ if (src0->type != GGML_TYPE_F32) { return false; }
3111
+ if (dst->type != GGML_TYPE_F32) { return false; }
3112
+ if (!ggml_are_same_shape(src0, dst)) { return false; }
3113
+ if (!ggml_is_contiguous(src0) || !ggml_is_contiguous(dst)) { return false; }
2427
3114
 
2428
- size_t n_bufs = 0;
2429
- n_bufs += htp_req_buff_init(&req->src0, &bufs[n_bufs], t->src[0], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
2430
- n_bufs += htp_req_buff_init(&req->src1, &bufs[n_bufs], t->src[1], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
2431
- n_bufs += htp_req_buff_init(&req->src2, &bufs[n_bufs], t->src[2], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
2432
- n_bufs += htp_req_buff_init(&req->src3, &bufs[n_bufs], t->src[3], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
2433
- n_bufs += htp_req_buff_init(&req->src4, &bufs[n_bufs], t->src[4], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
2434
- n_bufs += htp_req_buff_init(&req->dst, &bufs[n_bufs], t, DSPQBUF_TYPE_DSP_WRITE_CPU_READ);
3115
+ return true;
2435
3116
 
2436
- return n_bufs;
3117
+ GGML_UNUSED(sess);
2437
3118
  }
2438
3119
 
2439
3120
  static const char * ggml_backend_hexagon_name(ggml_backend_t backend) {
2440
3121
  auto sess = static_cast<ggml_hexagon_session *>(backend->context);
2441
- return sess->name.c_str();
3122
+ return sess->c_name();
2442
3123
  }
2443
3124
 
2444
3125
  static void ggml_backend_hexagon_free(ggml_backend_t backend) {
@@ -2447,118 +3128,118 @@ static void ggml_backend_hexagon_free(ggml_backend_t backend) {
2447
3128
  delete backend;
2448
3129
  }
2449
3130
 
2450
- static inline bool op_reuse_src1(const ggml_tensor * op1, const ggml_tensor * op0) {
2451
- return (op0 && op0->src[1] == op1->src[1] && ggml_is_quantized(op0->src[0]->type) && ggml_is_quantized(op1->src[1]->type));
2452
- }
3131
+ static htp_op_code op_remap_to_htp(const ggml_tensor * t) {
3132
+ switch (t->op) {
3133
+ case GGML_OP_FLASH_ATTN_EXT: return HTP_OP_FLASH_ATTN_EXT;
3134
+ case GGML_OP_MUL_MAT: return HTP_OP_MUL_MAT;
3135
+ case GGML_OP_MUL_MAT_ID: return HTP_OP_MUL_MAT_ID;
3136
+ case GGML_OP_MUL: return HTP_OP_MUL;
3137
+ case GGML_OP_ADD: return HTP_OP_ADD;
3138
+ case GGML_OP_ADD_ID: return HTP_OP_ADD_ID;
3139
+ case GGML_OP_SUB: return HTP_OP_SUB;
3140
+ case GGML_OP_DIV: return HTP_OP_DIV;
3141
+ case GGML_OP_CPY: return HTP_OP_CPY;
3142
+ case GGML_OP_CONT: return HTP_OP_CPY;
3143
+ case GGML_OP_GET_ROWS: return HTP_OP_GET_ROWS;
3144
+ case GGML_OP_SET_ROWS: return HTP_OP_SET_ROWS;
3145
+ case GGML_OP_SUM_ROWS: return HTP_OP_SUM_ROWS;
3146
+ case GGML_OP_ARGSORT: return HTP_OP_ARGSORT;
3147
+ case GGML_OP_NORM: return HTP_OP_NORM;
3148
+ case GGML_OP_L2_NORM: return HTP_OP_L2_NORM;
3149
+ case GGML_OP_RMS_NORM: return HTP_OP_RMS_NORM;
3150
+ case GGML_OP_CONCAT: return HTP_OP_CONCAT;
3151
+ case GGML_OP_SCALE: return HTP_OP_SCALE;
3152
+ case GGML_OP_SQR: return HTP_OP_SQR;
3153
+ case GGML_OP_SQRT: return HTP_OP_SQRT;
3154
+ case GGML_OP_SOFT_MAX: return HTP_OP_SOFTMAX;
3155
+ case GGML_OP_SSM_CONV: return HTP_OP_SSM_CONV;
3156
+ case GGML_OP_GATED_DELTA_NET: return HTP_OP_GATED_DELTA_NET;
3157
+ case GGML_OP_ROPE: return HTP_OP_ROPE;
3158
+ case GGML_OP_REPEAT: return HTP_OP_REPEAT;
3159
+ case GGML_OP_CUMSUM: return HTP_OP_CUMSUM;
3160
+ case GGML_OP_FILL: return HTP_OP_FILL;
3161
+ case GGML_OP_DIAG: return HTP_OP_DIAG;
3162
+ case GGML_OP_SOLVE_TRI: return HTP_OP_SOLVE_TRI;
3163
+ case GGML_OP_TRI: return HTP_OP_TRI;
3164
+ case GGML_OP_PAD: return HTP_OP_PAD;
2453
3165
 
2454
- static inline bool is_compute_op(ggml_tensor *node)
2455
- {
2456
- return !(ggml_op_is_empty(node->op) || ggml_is_empty(node));
2457
- }
3166
+ case GGML_OP_UNARY:
3167
+ switch (ggml_get_unary_op(t)) {
3168
+ case GGML_UNARY_OP_SILU: return HTP_OP_UNARY_SILU;
3169
+ case GGML_UNARY_OP_GELU: return HTP_OP_UNARY_GELU;
3170
+ case GGML_UNARY_OP_GELU_QUICK: return HTP_OP_UNARY_GELU;
3171
+ case GGML_UNARY_OP_SIGMOID: return HTP_OP_UNARY_SIGMOID;
3172
+ case GGML_UNARY_OP_NEG: return HTP_OP_UNARY_NEG;
3173
+ case GGML_UNARY_OP_EXP: return HTP_OP_UNARY_EXP;
3174
+ case GGML_UNARY_OP_SOFTPLUS: return HTP_OP_UNARY_SOFTPLUS;
3175
+ case GGML_UNARY_OP_TANH: return HTP_OP_UNARY_TANH;
3176
+ default:
3177
+ break;
3178
+ }
3179
+ break;
2458
3180
 
2459
- // scan the graph and figure out last compute op index
2460
- static inline int last_compute_op(ggml_cgraph * graph) {
2461
- int last = 0;
2462
- for (int i = 0; i < graph->n_nodes; ++i) {
2463
- if (is_compute_op(graph->nodes[i])) {
2464
- last = i;
2465
- }
3181
+ case GGML_OP_GLU:
3182
+ switch (ggml_get_glu_op(t)) {
3183
+ case GGML_GLU_OP_SWIGLU: return HTP_OP_GLU_SWIGLU;
3184
+ case GGML_GLU_OP_SWIGLU_OAI: return HTP_OP_GLU_SWIGLU_OAI;
3185
+ case GGML_GLU_OP_GEGLU: return HTP_OP_GLU_GEGLU;
3186
+ default: break;
3187
+ }
3188
+ break;
3189
+
3190
+ default:
3191
+ GGML_ABORT("\nggml-hex: graph-compute %s is not supported\n", ggml_op_desc(t));
2466
3192
  }
3193
+ return HTP_OP_INVALID;
3194
+ }
2467
3195
 
2468
- return last;
3196
+ static inline bool op_is_compute(ggml_tensor *node)
3197
+ {
3198
+ return !ggml_op_is_empty(node->op) && !ggml_is_empty(node) && (node->flags & GGML_TENSOR_FLAG_COMPUTE);
2469
3199
  }
2470
3200
 
2471
3201
  static ggml_status ggml_backend_hexagon_graph_compute(ggml_backend_t backend, ggml_cgraph * graph) {
2472
3202
  auto sess = static_cast<ggml_hexagon_session *>(backend->context);
2473
3203
 
2474
- HEX_VERBOSE("ggml-hex: %s graph-compute n_nodes %d\n", sess->name.c_str(), graph->n_nodes);
2475
-
2476
- const int last = last_compute_op(graph);
3204
+ HEX_VERBOSE("ggml-hex: %s graph-compute n_nodes %d\n", sess->c_name(), graph->n_nodes);
2477
3205
 
2478
- const struct ggml_tensor * prev_quant_op = nullptr; // prev executed op with quantizer
3206
+ std::vector<htp_opnode> nodes;
3207
+ nodes.reserve(graph->n_nodes);
2479
3208
 
3209
+ // Fusion
2480
3210
  for (int i = 0; i < graph->n_nodes; ++i) {
2481
- ggml_tensor * node = graph->nodes[i];
2482
-
2483
- if (!is_compute_op(node)) {
3211
+ ggml_tensor * n = graph->nodes[i];
3212
+ if (!op_is_compute(n)) {
2484
3213
  continue;
2485
3214
  }
2486
3215
 
2487
- uint32_t flags = 0;
3216
+ ggml_tensor * next_node = (i + 1 < graph->n_nodes) ? graph->nodes[i + 1] : nullptr;
2488
3217
 
2489
- // skip quantizer if src1 is reused
2490
- if (op_reuse_src1(node, prev_quant_op)) {
2491
- flags |= HTP_OPFLAGS_SKIP_QUANTIZE;
2492
- }
3218
+ htp_opnode node = {
3219
+ /*.node =*/ n,
3220
+ /*.fused =*/ {},
3221
+ /*.opcode =*/ HTP_OP_INVALID
3222
+ };
2493
3223
 
2494
- // ask for early notification for the last Op
2495
- if (i == last) {
2496
- flags |= HTP_OPFLAGS_EARLY_WAKEUP;
3224
+ if (n->op == GGML_OP_RMS_NORM && next_node) {
3225
+ if (next_node->op == GGML_OP_MUL && op_is_compute(next_node) && ggml_can_fuse(graph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL })) {
3226
+ node.add_fused(next_node);
3227
+ node.opcode = HTP_OP_RMS_NORM_MUL;
3228
+ i++; // skip the fused MUL node
3229
+ }
2497
3230
  }
2498
3231
 
2499
- switch (node->op) {
2500
- case GGML_OP_MUL_MAT:
2501
- if (ggml_is_quantized(node->src[0]->type)) {
2502
- ggml_hexagon_dispatch_op<init_binary_req<true>>(sess, node, flags);
2503
- } else {
2504
- ggml_hexagon_dispatch_op<init_binary_req<false>>(sess, node, flags);
2505
- }
2506
- prev_quant_op = node;
2507
- break;
2508
- case GGML_OP_MUL_MAT_ID:
2509
- if (ggml_is_quantized(node->src[0]->type)) {
2510
- ggml_hexagon_dispatch_op<init_binary_id_req<true>>(sess, node, flags);
2511
- } else {
2512
- ggml_hexagon_dispatch_op<init_binary_id_req<false>>(sess, node, flags);
2513
- }
2514
- prev_quant_op = node;
2515
- break;
2516
- case GGML_OP_MUL:
2517
- case GGML_OP_ADD:
2518
- case GGML_OP_SUB:
2519
- ggml_hexagon_dispatch_op<init_binary_req<false>>(sess, node, flags);
2520
- break;
2521
- case GGML_OP_ADD_ID:
2522
- ggml_hexagon_dispatch_op<init_binary_id_req<false>>(sess, node, flags);
2523
- break;
2524
- case GGML_OP_RMS_NORM:
2525
- case GGML_OP_SCALE:
2526
- ggml_hexagon_dispatch_op<init_unary_req>(sess, node, flags);
2527
- break;
2528
- case GGML_OP_UNARY:
2529
- if ((ggml_get_unary_op(node) == GGML_UNARY_OP_SILU) ||
2530
- (ggml_get_unary_op(node) == GGML_UNARY_OP_GELU)) {
2531
- ggml_hexagon_dispatch_op<init_unary_req>(sess, node, flags);
2532
- }
2533
- break;
2534
- case GGML_OP_GLU:
2535
- if ((ggml_get_glu_op(node) == GGML_GLU_OP_SWIGLU) ||
2536
- (ggml_get_glu_op(node) == GGML_GLU_OP_SWIGLU_OAI)) {
2537
- ggml_hexagon_dispatch_op<init_unary_req>(sess, node, flags);
2538
- }
2539
- break;
2540
- case GGML_OP_SOFT_MAX:
2541
- ggml_hexagon_dispatch_op<init_unary_req>(sess, node, flags);
2542
- break;
2543
-
2544
- case GGML_OP_ROPE:
2545
- ggml_hexagon_dispatch_op<init_rope_req>(sess, node, flags);
2546
- break;
2547
-
2548
- case GGML_OP_FLASH_ATTN_EXT:
2549
- ggml_hexagon_dispatch_op<init_flash_attn_ext_req>(sess, node, flags);
2550
- break;
2551
-
2552
- case GGML_OP_SET_ROWS:
2553
- ggml_hexagon_dispatch_op<init_set_rows_req>(sess, node, flags);
2554
- break;
3232
+ if (node.opcode == HTP_OP_INVALID) {
3233
+ node.opcode = op_remap_to_htp(n);
3234
+ }
2555
3235
 
2556
- case GGML_OP_GET_ROWS:
2557
- ggml_hexagon_dispatch_op<init_get_rows_req>(sess, node, flags);
2558
- break;
3236
+ nodes.push_back(std::move(node));
3237
+ }
2559
3238
 
2560
- default:
2561
- GGML_ABORT("\nggml-hex: graph-compute %s is not supported\n", ggml_op_desc(node));
3239
+ // Queue and execute
3240
+ if (opt_opstage & HTP_OPSTAGE_QUEUE) {
3241
+ for (const auto & node : nodes) {
3242
+ sess->enqueue_op(node);
2562
3243
  }
2563
3244
  }
2564
3245
 
@@ -2571,57 +3252,13 @@ static ggml_status ggml_backend_hexagon_graph_compute(ggml_backend_t backend, gg
2571
3252
  static void ggml_backend_hexagon_synchronize(ggml_backend_t backend) {
2572
3253
  auto sess = static_cast<ggml_hexagon_session *>(backend->context);
2573
3254
 
2574
- HEX_VERBOSE("ggml-hex: %s synchronize\n", sess->name.c_str());
3255
+ HEX_VERBOSE("ggml-hex: %s synchronize\n", sess->c_name());
2575
3256
 
2576
3257
  // Wait until all pending ops complete
2577
3258
  sess->flush();
2578
3259
  }
2579
3260
 
2580
- struct node_info {
2581
- ggml_tensor * node;
2582
-
2583
- std::vector<ggml_tensor *> fused;
2584
-
2585
- ggml_op op() const {
2586
- return node->op;
2587
- }
2588
-
2589
- const ggml_tensor * dst() const {
2590
- return fused.empty() ? node : fused.back();
2591
- }
2592
-
2593
- const ggml_tensor * src0() const {
2594
- return node->src[0];
2595
- }
2596
-
2597
- const ggml_tensor * src1() const {
2598
- return node->src[1];
2599
- }
2600
-
2601
- bool is_empty() const {
2602
- return ggml_op_is_empty(node->op);
2603
- }
2604
-
2605
- void add_fused(ggml_tensor * t) {
2606
- fused.push_back(t);
2607
- }
2608
-
2609
- bool stackable() const {
2610
- switch (this->op()) {
2611
- case GGML_OP_MUL_MAT:
2612
- case GGML_OP_MUL_MAT_ID:
2613
- return ggml_is_quantized(this->src0()->type);
2614
- default:
2615
- return false;
2616
- }
2617
- }
2618
-
2619
- bool same_input(const node_info& n) const {
2620
- return n.src1() == this->src1();
2621
- }
2622
- };
2623
-
2624
- static std::vector<int> ggml_hexagon_graph_optimize_reorder(const std::vector<node_info> & nodes) {
3261
+ static std::vector<int> ggml_hexagon_graph_optimize_reorder(const std::vector<htp_opnode> & nodes) {
2625
3262
  const int n = nodes.size();
2626
3263
 
2627
3264
  std::vector<int> res;
@@ -2632,7 +3269,7 @@ static std::vector<int> ggml_hexagon_graph_optimize_reorder(const std::vector<no
2632
3269
  // The main goal here is to stack the MUL_MAT ops with the same src1 input.
2633
3270
  // This allows use to reuse dynamically quantized src1 in VTCM.
2634
3271
 
2635
- // TODO: the current version might do incorrect reodering in cases where quantized src0
3272
+ // TODO: the current version might do incorrect reordering in cases where quantized src0
2636
3273
  // input is an output of another Op.
2637
3274
 
2638
3275
  for (int i0 = 0; i0 < n; i0++) {
@@ -2649,7 +3286,7 @@ static std::vector<int> ggml_hexagon_graph_optimize_reorder(const std::vector<no
2649
3286
  }
2650
3287
 
2651
3288
  // that many nodes forward to search for stackable nodes that can reuse VTCM
2652
- constexpr int N_FORWARD = 8;
3289
+ constexpr int N_FORWARD = 16;
2653
3290
 
2654
3291
  for (int i1 = i0 + 1; i1 < i0 + N_FORWARD && i1 < n; i1++) {
2655
3292
  if (used[i1]) {
@@ -2675,14 +3312,14 @@ static void ggml_backend_hexagon_graph_optimize(ggml_backend_t backend, ggml_cgr
2675
3312
 
2676
3313
  enum ggml_op ops[MAX_FUSE];
2677
3314
 
2678
- std::vector<node_info> nodes;
3315
+ std::vector<htp_opnode> nodes;
2679
3316
  nodes.reserve(gf->n_nodes);
2680
3317
 
2681
3318
  // fuse nodes:
2682
3319
  // we don't want to make reorders that break fusing, so we first pack all fusable tensors
2683
3320
  // and perform the reorder over the fused nodes. after the reorder is done, we unfuse
2684
3321
  for (int i = 0; i < n; i++) {
2685
- node_info node = {
3322
+ htp_opnode node = {
2686
3323
  /*.node =*/gf->nodes[i],
2687
3324
  /*.fused =*/{},
2688
3325
  };
@@ -2749,6 +3386,8 @@ static struct ggml_backend_i hexagon_backend_i = {
2749
3386
  /* .free = */ ggml_backend_hexagon_free,
2750
3387
  /* .set_tensor_async = */ NULL,
2751
3388
  /* .get_tensor_async = */ NULL,
3389
+ /* .set_tensor_2d_async = */ NULL,
3390
+ /* .get_tensor_2d_async = */ NULL,
2752
3391
  /* .cpy_tensor_async = */ NULL,
2753
3392
  /* .synchronize = */ ggml_backend_hexagon_synchronize,
2754
3393
  /* .graph_plan_create = */ NULL,
@@ -2788,7 +3427,7 @@ static ggml_backend_t ggml_backend_hexagon_device_init(ggml_backend_dev_t dev, c
2788
3427
 
2789
3428
  static const char * ggml_backend_hexagon_device_get_name(ggml_backend_dev_t dev) {
2790
3429
  auto sess = static_cast<ggml_hexagon_session *>(dev->context);
2791
- return sess->name.c_str();
3430
+ return sess->c_name();
2792
3431
 
2793
3432
  GGML_UNUSED(dev);
2794
3433
  }
@@ -2799,8 +3438,7 @@ static const char * ggml_backend_hexagon_device_get_description(ggml_backend_dev
2799
3438
  }
2800
3439
 
2801
3440
  static void ggml_backend_hexagon_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
2802
- // ~2GB per session for now
2803
- *free = 2ULL * 1024 * 1024 * 1024;
3441
+ *free = 0;
2804
3442
  *total = *free;
2805
3443
 
2806
3444
  GGML_UNUSED(dev);
@@ -2858,9 +3496,98 @@ static bool ggml_hexagon_supported_buffers(ggml_hexagon_session *sess, const str
2858
3496
  return true;
2859
3497
  }
2860
3498
 
3499
+ static bool ggml_hexagon_supported_cpy(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
3500
+ const struct ggml_tensor * src0 = op->src[0];
3501
+ const struct ggml_tensor * dst = op;
3502
+
3503
+ // for now we can do f32 -> f16 and f16 -> f32 (without reshaping)
3504
+ if (src0->type != GGML_TYPE_F32 && src0->type != GGML_TYPE_F16) return false;
3505
+ if ( dst->type != GGML_TYPE_F32 && dst->type != GGML_TYPE_F16) return false;
3506
+
3507
+ const bool sametype = (src0->type == dst->type);
3508
+ const bool transposed = ggml_is_transposed(src0) || ggml_is_transposed(dst);
3509
+ const bool sameshape = !transposed && ggml_are_same_shape(src0, dst);
3510
+
3511
+ // can handle any shape and any same-type (pretty slow if reshaping is required)
3512
+ if (sametype) return true;
3513
+
3514
+ // cannot handle re-shaping and type conversion at the same time
3515
+ if (!sameshape) return false;
3516
+
3517
+ return true;
3518
+ }
3519
+
3520
+ static bool ggml_hexagon_supported_cont(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
3521
+ GGML_UNUSED(sess);
3522
+ const struct ggml_tensor * src0 = op->src[0];
3523
+
3524
+ // CONT is same-type only, supports f32 and f16
3525
+ if (src0->type != GGML_TYPE_F32 && src0->type != GGML_TYPE_F16) return false;
3526
+
3527
+ return true;
3528
+ }
3529
+
3530
+ static bool ggml_hexagon_supported_repeat(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
3531
+ GGML_UNUSED(sess);
3532
+ const struct ggml_tensor * src0 = op->src[0];
3533
+ const struct ggml_tensor * dst = op;
3534
+
3535
+ // Support f32 and f16
3536
+ if (src0->type != GGML_TYPE_F32 && src0->type != GGML_TYPE_F16) return false;
3537
+
3538
+ // src and dst must be the same type
3539
+ if (src0->type != dst->type) return false;
3540
+
3541
+ // dst dims must be multiples of src dims
3542
+ if (dst->ne[0] % src0->ne[0] != 0) return false;
3543
+ if (dst->ne[1] % src0->ne[1] != 0) return false;
3544
+ if (dst->ne[2] % src0->ne[2] != 0) return false;
3545
+ if (dst->ne[3] % src0->ne[3] != 0) return false;
3546
+
3547
+ // require contiguous tensors (no transposition)
3548
+ if (ggml_is_transposed(src0) || ggml_is_transposed(dst)) return false;
3549
+
3550
+ return true;
3551
+ }
3552
+
3553
+ static bool ggml_hexagon_supported_concat(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
3554
+ int dim = ((const int32_t *) op->op_params)[0];
3555
+ if (dim < 0 || dim >= GGML_MAX_DIMS) {
3556
+ return false;
3557
+ }
3558
+
3559
+ for (int i = 0; i < GGML_MAX_SRC; ++i) {
3560
+ const struct ggml_tensor * src = op->src[i];
3561
+ if (!src) {
3562
+ continue;
3563
+ }
3564
+ if (src->type != GGML_TYPE_F32 && src->type != GGML_TYPE_I32 && src->type != GGML_TYPE_F16) {
3565
+ return false;
3566
+ }
3567
+ }
3568
+
3569
+ return true;
3570
+ }
3571
+
3572
+ static bool ggml_hexagon_supported_fill(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
3573
+ const struct ggml_tensor * dst = op;
3574
+
3575
+ if (dst->type != GGML_TYPE_F32 && dst->type != GGML_TYPE_F16) {
3576
+ return false;
3577
+ }
3578
+
3579
+ GGML_UNUSED(sess);
3580
+ return true;
3581
+ }
3582
+
2861
3583
  static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
2862
3584
  auto sess = static_cast<ggml_hexagon_session *>(dev->context);
2863
3585
 
3586
+ // reject ops that match the filter
3587
+ if (opt_opfilter && std::regex_match(ggml_op_desc(op), *opt_opfilter)) {
3588
+ return false;
3589
+ }
3590
+
2864
3591
  // all srcs & dsts must be mapped to the same session
2865
3592
  if (!ggml_hexagon_supported_buffers(sess, op)) {
2866
3593
  ggml_hexagon_dump_op_supp(sess->name, op, false);
@@ -2877,6 +3604,13 @@ static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, cons
2877
3604
  supp = true;
2878
3605
  break;
2879
3606
 
3607
+ case GGML_OP_MUL:
3608
+ case GGML_OP_ADD:
3609
+ case GGML_OP_SUB:
3610
+ case GGML_OP_DIV:
3611
+ supp = ggml_hexagon_supported_binary(sess, op);
3612
+ break;
3613
+
2880
3614
  case GGML_OP_MUL_MAT:
2881
3615
  supp = ggml_hexagon_supported_mul_mat(sess, op);
2882
3616
  break;
@@ -2885,41 +3619,61 @@ static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, cons
2885
3619
  supp = ggml_hexagon_supported_mul_mat_id(sess, op);
2886
3620
  break;
2887
3621
 
2888
- case GGML_OP_MUL:
2889
- case GGML_OP_ADD:
2890
- case GGML_OP_SUB:
2891
- supp = ggml_hexagon_supported_binary(sess, op);
2892
- break;
2893
-
2894
3622
  case GGML_OP_ADD_ID:
2895
3623
  supp = ggml_hexagon_supported_add_id(sess, op);
2896
3624
  break;
2897
3625
 
3626
+ case GGML_OP_NORM:
3627
+ case GGML_OP_L2_NORM:
2898
3628
  case GGML_OP_RMS_NORM:
2899
3629
  case GGML_OP_SCALE:
2900
3630
  supp = ggml_hexagon_supported_unary(sess, op);
2901
3631
  break;
2902
3632
 
3633
+ case GGML_OP_SQR:
3634
+ case GGML_OP_SQRT:
3635
+ supp = ggml_hexagon_supported_unary(sess, op);
3636
+ break;
3637
+
3638
+ case GGML_OP_SUM_ROWS:
3639
+ supp = ggml_hexagon_supported_sum_rows(sess, op);
3640
+ break;
3641
+
2903
3642
  case GGML_OP_SOFT_MAX:
2904
3643
  supp = ggml_hexagon_supported_softmax(sess, op);
2905
3644
  break;
2906
3645
 
2907
3646
  case GGML_OP_UNARY:
2908
- {
2909
- const auto unary_op = ggml_get_unary_op(op);
2910
- if (unary_op == GGML_UNARY_OP_SILU || unary_op == GGML_UNARY_OP_GELU) {
3647
+ switch (ggml_get_unary_op(op)) {
3648
+ case GGML_UNARY_OP_NEG:
3649
+ case GGML_UNARY_OP_EXP:
3650
+ case GGML_UNARY_OP_SIGMOID:
3651
+ case GGML_UNARY_OP_SOFTPLUS:
3652
+ case GGML_UNARY_OP_TANH:
3653
+ supp = ggml_hexagon_supported_unary(sess, op);
3654
+ break;
3655
+ case GGML_UNARY_OP_SILU:
3656
+ case GGML_UNARY_OP_GELU:
3657
+ case GGML_UNARY_OP_GELU_QUICK:
2911
3658
  supp = ggml_hexagon_supported_activations(sess, op);
2912
- }
2913
- break;
3659
+ break;
3660
+ default:
3661
+ break;
2914
3662
  }
3663
+ break;
3664
+
2915
3665
  case GGML_OP_GLU:
2916
- {
2917
- const auto glu_op = ggml_get_glu_op(op);
2918
- if ((glu_op == GGML_GLU_OP_SWIGLU) || (glu_op == GGML_GLU_OP_SWIGLU_OAI)) {
3666
+ switch (ggml_get_glu_op(op)) {
3667
+ case GGML_GLU_OP_SWIGLU:
3668
+ case GGML_GLU_OP_SWIGLU_OAI:
3669
+ case GGML_GLU_OP_GEGLU:
2919
3670
  supp = ggml_hexagon_supported_activations(sess, op);
2920
- }
2921
- break;
3671
+ break;
3672
+ default:
3673
+ break;
2922
3674
  }
3675
+ break;
3676
+
2923
3677
  case GGML_OP_ROPE:
2924
3678
  supp = ggml_hexagon_supported_rope(sess, op);
2925
3679
  break;
@@ -2936,6 +3690,58 @@ static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, cons
2936
3690
  supp = ggml_hexagon_supported_get_rows(sess, op);
2937
3691
  break;
2938
3692
 
3693
+ case GGML_OP_CPY:
3694
+ supp = ggml_hexagon_supported_cpy(sess, op);
3695
+ break;
3696
+
3697
+ case GGML_OP_CONT:
3698
+ supp = ggml_hexagon_supported_cont(sess, op);
3699
+ break;
3700
+
3701
+ case GGML_OP_REPEAT:
3702
+ supp = ggml_hexagon_supported_repeat(sess, op);
3703
+ break;
3704
+
3705
+ case GGML_OP_ARGSORT:
3706
+ supp = ggml_hexagon_supported_argsort(sess, op);
3707
+ break;
3708
+
3709
+ case GGML_OP_SSM_CONV:
3710
+ supp = ggml_hexagon_supported_ssm_conv(sess, op);
3711
+ break;
3712
+
3713
+ case GGML_OP_GATED_DELTA_NET:
3714
+ supp = ggml_hexagon_supported_gated_delta_net(sess, op);
3715
+ break;
3716
+
3717
+ case GGML_OP_CUMSUM:
3718
+ supp = ggml_hexagon_supported_cumsum(sess, op);
3719
+ break;
3720
+
3721
+ case GGML_OP_CONCAT:
3722
+ supp = ggml_hexagon_supported_concat(sess, op);
3723
+ break;
3724
+
3725
+ case GGML_OP_FILL:
3726
+ supp = ggml_hexagon_supported_fill(sess, op);
3727
+ break;
3728
+
3729
+ case GGML_OP_DIAG:
3730
+ supp = ggml_hexagon_supported_diag(sess, op);
3731
+ break;
3732
+
3733
+ case GGML_OP_SOLVE_TRI:
3734
+ supp = ggml_hexagon_supported_solve_tri(sess, op);
3735
+ break;
3736
+
3737
+ case GGML_OP_TRI:
3738
+ supp = ggml_hexagon_supported_tri(sess, op);
3739
+ break;
3740
+
3741
+ case GGML_OP_PAD:
3742
+ supp = ggml_hexagon_supported_pad(sess, op);
3743
+ break;
3744
+
2939
3745
  default:
2940
3746
  break;
2941
3747
  }
@@ -3002,19 +3808,6 @@ struct ggml_hexagon_registry {
3002
3808
  ggml_hexagon_registry::ggml_hexagon_registry(ggml_backend_reg_t reg) {
3003
3809
  GGML_LOG_INFO("ggml-hex: Hexagon backend (experimental) : allocating new registry : ndev %zu\n", opt_ndev);
3004
3810
 
3005
- if (!opt_arch) {
3006
- int err = get_hex_arch_ver(CDSP_DOMAIN_ID, &opt_arch);
3007
- if (err != 0) {
3008
- GGML_LOG_ERROR("ggml-hex: failed to query HTP version (err %d) defaulting to v73\n", err);
3009
- opt_arch = 73;
3010
- }
3011
- }
3012
-
3013
- if (opt_arch < 75) {
3014
- opt_ndev = 1;
3015
- GGML_LOG_WARN("ggml-hex: forcing ndev to 1 for SoCs archs lower than v75.\n");
3016
- }
3017
-
3018
3811
  GGML_LOG_INFO("ggml-hex: Hexagon Arch version v%d\n", opt_arch);
3019
3812
 
3020
3813
  // Create devices / sessions
@@ -3061,7 +3854,7 @@ static ggml_backend_dev_t ggml_backend_hexagon_reg_get_device(ggml_backend_reg_t
3061
3854
  }
3062
3855
 
3063
3856
  static void * ggml_backend_hexagon_get_proc_address(ggml_backend_reg_t reg, const char * name) {
3064
- if (strcmp(name, "ggml_backend_dev_get_extra_bufts") == 0) {
3857
+ if (strcmp(name, "ggml_backend_dev_get_extra_bufts") == 0 && opt_hostbuf) {
3065
3858
  ggml_backend_dev_get_extra_bufts_t fct = ggml_backend_hexagon_device_get_extra_buffers_type;
3066
3859
  return (void *) fct;
3067
3860
  }
@@ -3069,56 +3862,117 @@ static void * ggml_backend_hexagon_get_proc_address(ggml_backend_reg_t reg, cons
3069
3862
  return NULL;
3070
3863
  }
3071
3864
 
3865
// Parses a comma-separated list of unsigned integers into a vector of T.
//
// Each token is parsed with base auto-detection (decimal, 0x-prefixed hex,
// 0-prefixed octal), the same way std::stoul(t, nullptr, 0) would, but
// without throwing: the input typically comes from an environment variable,
// so malformed text must not terminate the process with an uncaught exception.
//
// Returns an empty vector when `str` is null or when any token contains no
// parsable number (e.g. "abc", ",1", "1,,2"), so callers can treat an empty
// result as "garbage input".
template<typename T> std::vector<T> str_to_vec(const char* str) {
    std::vector<T> v;
    if (str == nullptr) {
        return v;
    }

    std::stringstream ss(str);
    std::string t;

    while (std::getline(ss, t, ',')) {
        char * end = nullptr;
        const unsigned long val = strtoul(t.c_str(), &end, 0);
        if (end == t.c_str()) {
            // no digits consumed: reject the whole list rather than guess
            return {};
        }
        v.push_back(static_cast<T>(val));
    }

    return v;
}
3876
+
3877
// Formats the elements of `v` as a comma-separated string in the given BASE
// (8, 10 or 16), with the base prefix shown ("0x..." for hex).
//
// An empty vector yields an empty string. The separator is written between
// elements instead of trimming a trailing comma afterwards, because calling
// pop_back() on an empty string is undefined behaviour.
template<typename T, int BASE=10> std::string vec_to_str(const std::vector<T> & v) {
    std::stringstream ss;
    ss << std::setbase(BASE) << std::showbase;

    const char * sep = "";
    for (const auto & elem : v) {
        ss << sep << elem;
        sep = ",";
    }

    return ss.str();
}
3884
+
3072
3885
  static void ggml_hexagon_init(ggml_backend_reg * reg) {
3073
3886
  // Basic sanity checks to make sure definitions match
3074
3887
  static_assert((unsigned int) HTP_TYPE_Q4_0 == (unsigned int) GGML_TYPE_Q4_0,
3075
3888
  "please update hexagon_type to match ggml_type");
3889
+ static_assert((unsigned int) HTP_TYPE_Q4_1 == (unsigned int) GGML_TYPE_Q4_1,
3890
+ "please update hexagon_type to match ggml_type");
3076
3891
  static_assert((unsigned int) HTP_TYPE_Q8_0 == (unsigned int) GGML_TYPE_Q8_0,
3077
3892
  "please update hexagon_type to match ggml_type");
3078
3893
  static_assert((unsigned int) HTP_TYPE_MXFP4 == (unsigned int) GGML_TYPE_MXFP4,
3079
3894
  "please update hexagon_type to match ggml_type");
3895
+ static_assert((unsigned int) HTP_TYPE_IQ4_NL == (unsigned int) GGML_TYPE_IQ4_NL,
3896
+ "please update hexagon_type to match ggml_type");
3897
+
3898
+ const char * str_verbose = getenv("GGML_HEXAGON_VERBOSE");
3899
+ const char * str_hostbuf = getenv("GGML_HEXAGON_HOSTBUF");
3900
+ const char * str_opstage = getenv("GGML_HEXAGON_OPSTAGE");
3901
+ const char * str_opbatch = getenv("GGML_HEXAGON_OPBATCH");
3902
+ const char * str_opqueue = getenv("GGML_HEXAGON_OPQUEUE");
3903
+ const char * str_oppoll = getenv("GGML_HEXAGON_OPPOLL");
3904
+ const char * str_opfilter = getenv("GGML_HEXAGON_OPFILTER");
3905
+ const char * str_profile = getenv("GGML_HEXAGON_PROFILE");
3906
+ const char * str_etm = getenv("GGML_HEXAGON_ETM");
3907
+ const char * str_nhvx = getenv("GGML_HEXAGON_NHVX");
3908
+ const char * str_use_hmx = getenv("GGML_HEXAGON_USE_HMX");
3909
+ const char * str_ndev = getenv("GGML_HEXAGON_NDEV");
3910
+ const char * str_arch = getenv("GGML_HEXAGON_ARCH");
3911
+ const char * str_vmem = getenv("GGML_HEXAGON_VMEM");
3912
+ const char * str_mbuf = getenv("GGML_HEXAGON_MBUF");
3913
+
3914
+ // Init Arch first since it affects other defaults
3915
+ if (!str_arch) {
3916
+ int err = get_hex_arch_ver(CDSP_DOMAIN_ID, &opt_arch);
3917
+ if (err != 0) {
3918
+ GGML_LOG_ERROR("ggml-hex: failed to query HTP version (err %d) defaulting to v73\n", err);
3919
+ opt_arch = 73;
3920
+ }
3921
+ } else {
3922
+ if (str_arch[0] == 'v' || str_arch[0] == 'V') {
3923
+ str_arch++;
3924
+ }
3925
+ opt_arch = strtoul(str_arch, NULL, 0);
3926
+ }
3080
3927
 
3081
- const char * str_verbose = getenv("GGML_HEXAGON_VERBOSE");
3082
- const char * str_hostbuf = getenv("GGML_HEXAGON_HOSTBUF");
3928
+ size_t MiB = 1024 * 1024;
3083
3929
 
3084
- opt_verbose = str_verbose ? atoi(str_verbose) : 0;
3085
- opt_profile = getenv("GGML_HEXAGON_PROFILE") != nullptr;
3086
- opt_etm = getenv("GGML_HEXAGON_ETM") != nullptr;
3087
- opt_experimental = getenv("GGML_HEXAGON_EXPERIMENTAL") != nullptr;
3930
+ // Update vmem default
3931
+ opt_vmem = opt_arch >= 75 ? HTP_OP_MAX_VMEM_DEFAULT : 3000 * MiB;
3088
3932
 
3089
- const char * str_opmask = getenv("GGML_HEXAGON_OPMASK");
3090
- if (str_opmask != nullptr) {
3091
- opt_opmask = strtoul(str_opmask, NULL, 0);
3092
- }
3093
- opt_opsync = getenv("GGML_HEXAGON_OPSYNC") != nullptr;
3933
+ auto RE_ICASE = std::regex_constants::icase;
3094
3934
 
3095
- const char * str_ndev = getenv("GGML_HEXAGON_NDEV");
3096
- if (str_ndev) {
3097
- opt_ndev = strtoul(str_ndev, NULL, 0);
3098
- if (opt_ndev > GGML_HEXAGON_MAX_SESSIONS) {
3099
- opt_ndev = GGML_HEXAGON_MAX_SESSIONS;
3100
- }
3101
- }
3935
+ opt_opfilter = str_opfilter ? new std::regex(str_opfilter, RE_ICASE) : NULL;
3936
+ opt_verbose = str_verbose ? atoi(str_verbose) : 0;
3937
+ opt_hostbuf = str_hostbuf ? atoi(str_hostbuf) : opt_hostbuf;
3938
+ opt_opstage = str_opstage ? strtoul(str_opstage, NULL, 0) : opt_opstage;
3939
+ opt_opbatch = str_opbatch ? strtoul(str_opbatch, NULL, 0) : opt_opbatch;
3940
+ opt_opqueue = str_opqueue ? strtoul(str_opqueue, NULL, 0) : opt_opqueue;
3941
+ opt_oppoll = str_oppoll ? strtoul(str_oppoll, NULL, 0) : opt_oppoll;
3942
+ opt_profile = str_profile ? atoi(str_profile) : 0;
3943
+ opt_etm = str_etm ? atoi(str_etm) : 0;
3944
+ opt_nhvx = str_nhvx ? strtoul(str_nhvx, NULL, 0) : opt_nhvx;
3945
+ opt_use_hmx = str_use_hmx ? atoi(str_use_hmx) : opt_use_hmx;
3946
+ opt_ndev = str_ndev ? strtoul(str_ndev, NULL, 0) : opt_ndev;
3947
+ opt_hostbuf = str_hostbuf ? atoi(str_hostbuf) : opt_hostbuf;
3948
+ opt_mbuf = str_mbuf ? strtoul(str_mbuf, NULL, 0) * MiB : opt_mbuf;
3949
+ opt_vmem = str_vmem ? strtoul(str_vmem, NULL, 0) * MiB : opt_vmem;
3102
3950
 
3103
- const char * str_nhvx = getenv("GGML_HEXAGON_NHVX");
3104
- if (str_nhvx) {
3105
- opt_nhvx = strtoul(str_nhvx, NULL, 0);
3951
+ if (opt_ndev > GGML_HEXAGON_MAX_SESSIONS) {
3952
+ opt_ndev = GGML_HEXAGON_MAX_SESSIONS;
3106
3953
  }
3107
3954
 
3108
- const char * str_arch = getenv("GGML_HEXAGON_ARCH");
3109
- if (str_arch) {
3110
- if (str_arch[0] == 'v') {
3111
- str_arch++;
3112
- }
3113
- opt_arch = strtoul(str_arch, NULL, 0);
3955
+ #if defined(__ANDROID__)
3956
+ if (opt_arch < 75) {
3957
+ opt_ndev = 1;
3958
+ GGML_LOG_WARN("ggml-hex: forcing ndev to 1 for SoCs archs lower than v75.\n");
3114
3959
  }
3960
+ #endif
3115
3961
 
3116
- opt_hostbuf = str_hostbuf ? atoi(str_hostbuf) : 1;
3962
+ if (str_profile) {
3963
+ opt_pmu_evt = [&]() -> std::vector<uint32_t> {
3964
+ auto v = str_to_vec<uint32_t>(str_profile);
3965
+ switch (v.size()) {
3966
+ case 1: opt_profile = v[0]; return opt_pmu_evt; // mode with default pmu events
3967
+ case 8: opt_profile = 2; return v; // mode with custom pmu events
3968
+ default: opt_profile = 0; return {}; // garbage input
3969
+ }}();
3970
+ if (opt_profile == 1) opt_pmu_evt = {};
3971
+ GGML_LOG_INFO("ggml-hex: Profiling mode %u : pmu-evt [ %s ]\n", opt_profile,
3972
+ vec_to_str<uint32_t, 16>(opt_pmu_evt).c_str());
3973
+ }
3117
3974
 
3118
3975
  reg->context = new ggml_hexagon_registry(reg);
3119
-
3120
- HEX_VERBOSE("ggml-hex: size-of-general-req %zu size-of-general-rsp %zu\n", sizeof(struct htp_general_req),
3121
- sizeof(struct htp_general_rsp));
3122
3976
  }
3123
3977
 
3124
3978
  static const struct ggml_backend_reg_i ggml_backend_hexagon_reg_i = {
@@ -3139,6 +3993,11 @@ ggml_backend_reg_t ggml_backend_hexagon_reg(void) {
3139
3993
  static std::mutex mutex;
3140
3994
  std::lock_guard<std::mutex> lock(mutex);
3141
3995
  if (!initialized) {
3996
+ auto nErr = htpdrv_init();
3997
+ if (nErr != AEE_SUCCESS) {
3998
+ return NULL;
3999
+ }
4000
+
3142
4001
  ggml_hexagon_init(&reg);
3143
4002
  }
3144
4003