whispercpp 1.3.6 → 1.3.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (828)
  1. checksums.yaml +4 -4
  2. data/.document +3 -0
  3. data/.rdoc_options +2 -0
  4. data/README.md +38 -5
  5. data/Rakefile +18 -3
  6. data/ext/dependencies.rb +10 -4
  7. data/ext/dependencies_for_windows.rb +17 -0
  8. data/ext/extconf.rb +20 -8
  9. data/ext/options.rb +54 -14
  10. data/ext/options_for_windows.rb +51 -0
  11. data/ext/ruby_whisper.c +36 -42
  12. data/ext/ruby_whisper.h +135 -0
  13. data/ext/ruby_whisper_context.c +107 -28
  14. data/ext/ruby_whisper_log_queue.c +180 -0
  15. data/ext/ruby_whisper_log_settable.h +47 -0
  16. data/ext/ruby_whisper_parakeet.c +49 -0
  17. data/ext/ruby_whisper_parakeet_context.c +304 -0
  18. data/ext/ruby_whisper_parakeet_context_params.c +117 -0
  19. data/ext/ruby_whisper_parakeet_model.c +84 -0
  20. data/ext/ruby_whisper_parakeet_params.c +548 -0
  21. data/ext/ruby_whisper_parakeet_segment.c +157 -0
  22. data/ext/ruby_whisper_parakeet_token.c +188 -0
  23. data/ext/ruby_whisper_parakeet_transcribe.cpp +58 -0
  24. data/ext/ruby_whisper_params.c +256 -65
  25. data/ext/ruby_whisper_segment.c +6 -6
  26. data/ext/ruby_whisper_transcribe.cpp +42 -15
  27. data/ext/sources/CMakeLists.txt +41 -3
  28. data/ext/sources/CMakePresets.json +95 -0
  29. data/ext/sources/cmake/parakeet-config.cmake.in +30 -0
  30. data/ext/sources/cmake/parakeet.pc.in +10 -0
  31. data/ext/sources/cmake/whisper.pc.in +1 -1
  32. data/ext/sources/examples/CMakeLists.txt +4 -2
  33. data/ext/sources/examples/bench/bench.cpp +1 -1
  34. data/ext/sources/examples/cli/cli.cpp +43 -9
  35. data/ext/sources/examples/common-ggml.cpp +2 -0
  36. data/ext/sources/examples/common-whisper.cpp +139 -67
  37. data/ext/sources/examples/common-whisper.h +11 -0
  38. data/ext/sources/examples/ffmpeg-transcode.cpp +211 -341
  39. data/ext/sources/examples/parakeet-cli/CMakeLists.txt +8 -0
  40. data/ext/sources/examples/parakeet-cli/parakeet-cli.cpp +243 -0
  41. data/ext/sources/examples/parakeet-quantize/CMakeLists.txt +7 -0
  42. data/ext/sources/examples/parakeet-quantize/parakeet-quantize.cpp +230 -0
  43. data/ext/sources/examples/server/server.cpp +199 -163
  44. data/ext/sources/ggml/CMakeLists.txt +21 -13
  45. data/ext/sources/ggml/cmake/FindNCCL.cmake +36 -0
  46. data/ext/sources/ggml/cmake/ggml-config.cmake.in +12 -2
  47. data/ext/sources/ggml/include/ggml-alloc.h +1 -0
  48. data/ext/sources/ggml/include/ggml-backend.h +72 -10
  49. data/ext/sources/ggml/include/ggml-cuda.h +3 -0
  50. data/ext/sources/ggml/include/ggml-rpc.h +3 -3
  51. data/ext/sources/ggml/include/ggml.h +101 -9
  52. data/ext/sources/ggml/include/gguf.h +10 -2
  53. data/ext/sources/ggml/src/CMakeLists.txt +22 -5
  54. data/ext/sources/ggml/src/ggml-alloc.c +5 -1
  55. data/ext/sources/ggml/src/ggml-backend-impl.h +22 -2
  56. data/ext/sources/ggml/src/ggml-backend-meta.cpp +2263 -0
  57. data/ext/sources/ggml/src/ggml-backend-reg.cpp +12 -0
  58. data/ext/sources/ggml/src/ggml-backend.cpp +110 -9
  59. data/ext/sources/ggml/src/ggml-blas/ggml-blas.cpp +4 -0
  60. data/ext/sources/ggml/src/ggml-cann/aclnn_ops.cpp +672 -257
  61. data/ext/sources/ggml/src/ggml-cann/aclnn_ops.h +71 -0
  62. data/ext/sources/ggml/src/ggml-cann/common.h +20 -10
  63. data/ext/sources/ggml/src/ggml-cann/ggml-cann.cpp +211 -30
  64. data/ext/sources/ggml/src/ggml-common.h +11 -0
  65. data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +58 -29
  66. data/ext/sources/ggml/src/ggml-cpu/amx/amx.cpp +2 -0
  67. data/ext/sources/ggml/src/ggml-cpu/amx/mmq.cpp +16 -16
  68. data/ext/sources/ggml/src/ggml-cpu/arch/arm/quants.c +116 -7
  69. data/ext/sources/ggml/src/ggml-cpu/arch/arm/repack.cpp +65 -0
  70. data/ext/sources/ggml/src/ggml-cpu/arch/loongarch/quants.c +151 -1
  71. data/ext/sources/ggml/src/ggml-cpu/arch/powerpc/quants.c +0 -1
  72. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/quants.c +4279 -1292
  73. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/repack.cpp +5 -35
  74. data/ext/sources/ggml/src/ggml-cpu/arch/s390/quants.c +0 -1
  75. data/ext/sources/ggml/src/ggml-cpu/arch/wasm/quants.c +72 -1
  76. data/ext/sources/ggml/src/ggml-cpu/arch/x86/quants.c +177 -27
  77. data/ext/sources/ggml/src/ggml-cpu/arch/x86/repack.cpp +1 -1
  78. data/ext/sources/ggml/src/ggml-cpu/arch-fallback.h +5 -0
  79. data/ext/sources/ggml/src/ggml-cpu/cmake/FindSMTIME.cmake +32 -0
  80. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-impl.h +10 -0
  81. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +95 -5
  82. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.cpp +2 -0
  83. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +146 -134
  84. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +88 -70
  85. data/ext/sources/ggml/src/ggml-cpu/ops.cpp +372 -73
  86. data/ext/sources/ggml/src/ggml-cpu/ops.h +3 -0
  87. data/ext/sources/ggml/src/ggml-cpu/quants.c +55 -0
  88. data/ext/sources/ggml/src/ggml-cpu/quants.h +3 -0
  89. data/ext/sources/ggml/src/ggml-cpu/repack.cpp +3 -0
  90. data/ext/sources/ggml/src/ggml-cpu/simd-gemm.h +90 -0
  91. data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +3 -16
  92. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.cpp +1402 -687
  93. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.h +8 -0
  94. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp +597 -2766
  95. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime2_kernels.cpp +5768 -0
  96. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime_env.cpp +320 -0
  97. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime_env.h +55 -0
  98. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime_kernels.h +182 -19
  99. data/ext/sources/ggml/src/ggml-cpu/spacemit/repack.cpp +1795 -0
  100. data/ext/sources/ggml/src/ggml-cpu/spacemit/repack.h +14 -0
  101. data/ext/sources/ggml/src/ggml-cpu/spacemit/rvv_kernels.cpp +3178 -0
  102. data/ext/sources/ggml/src/ggml-cpu/spacemit/rvv_kernels.h +95 -0
  103. data/ext/sources/ggml/src/ggml-cpu/spacemit/spine_barrier.h +34 -0
  104. data/ext/sources/ggml/src/ggml-cpu/spacemit/spine_mem_pool.cpp +760 -0
  105. data/ext/sources/ggml/src/ggml-cpu/spacemit/spine_mem_pool.h +32 -0
  106. data/ext/sources/ggml/src/ggml-cpu/spacemit/spine_tcm.h +409 -0
  107. data/ext/sources/ggml/src/ggml-cpu/vec.cpp +37 -53
  108. data/ext/sources/ggml/src/ggml-cpu/vec.h +225 -240
  109. data/ext/sources/ggml/src/ggml-cuda/CMakeLists.txt +17 -7
  110. data/ext/sources/ggml/src/ggml-cuda/allreduce.cu +971 -0
  111. data/ext/sources/ggml/src/ggml-cuda/allreduce.cuh +29 -0
  112. data/ext/sources/ggml/src/ggml-cuda/argsort.cu +62 -26
  113. data/ext/sources/ggml/src/ggml-cuda/binbcast.cu +44 -18
  114. data/ext/sources/ggml/src/ggml-cuda/binbcast.cuh +1 -0
  115. data/ext/sources/ggml/src/ggml-cuda/common.cuh +242 -28
  116. data/ext/sources/ggml/src/ggml-cuda/concat.cu +120 -114
  117. data/ext/sources/ggml/src/ggml-cuda/conv2d-transpose.cu +45 -21
  118. data/ext/sources/ggml/src/ggml-cuda/conv2d-transpose.cuh +1 -0
  119. data/ext/sources/ggml/src/ggml-cuda/convert.cu +53 -0
  120. data/ext/sources/ggml/src/ggml-cuda/convert.cuh +10 -0
  121. data/ext/sources/ggml/src/ggml-cuda/cpy.cu +14 -6
  122. data/ext/sources/ggml/src/ggml-cuda/dequantize.cuh +22 -0
  123. data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +278 -44
  124. data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +331 -130
  125. data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cu +12 -0
  126. data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cuh +126 -27
  127. data/ext/sources/ggml/src/ggml-cuda/fattn-vec.cuh +40 -15
  128. data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +18 -9
  129. data/ext/sources/ggml/src/ggml-cuda/fattn.cu +152 -49
  130. data/ext/sources/ggml/src/ggml-cuda/fattn.cuh +2 -0
  131. data/ext/sources/ggml/src/ggml-cuda/fwht.cu +101 -0
  132. data/ext/sources/ggml/src/ggml-cuda/fwht.cuh +4 -0
  133. data/ext/sources/ggml/src/ggml-cuda/gated_delta_net.cu +84 -35
  134. data/ext/sources/ggml/src/ggml-cuda/getrows.cu +34 -12
  135. data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +1069 -609
  136. data/ext/sources/ggml/src/ggml-cuda/im2col.cu +32 -29
  137. data/ext/sources/ggml/src/ggml-cuda/mean.cu +4 -2
  138. data/ext/sources/ggml/src/ggml-cuda/mma.cuh +242 -195
  139. data/ext/sources/ggml/src/ggml-cuda/mmf.cuh +3 -3
  140. data/ext/sources/ggml/src/ggml-cuda/mmq.cu +18 -12
  141. data/ext/sources/ggml/src/ggml-cuda/mmq.cuh +502 -423
  142. data/ext/sources/ggml/src/ggml-cuda/mmvf.cu +19 -12
  143. data/ext/sources/ggml/src/ggml-cuda/mmvq.cu +485 -57
  144. data/ext/sources/ggml/src/ggml-cuda/mmvq.cuh +6 -1
  145. data/ext/sources/ggml/src/ggml-cuda/norm.cu +36 -10
  146. data/ext/sources/ggml/src/ggml-cuda/out-prod.cu +23 -7
  147. data/ext/sources/ggml/src/ggml-cuda/quantize.cu +133 -26
  148. data/ext/sources/ggml/src/ggml-cuda/quantize.cuh +1 -1
  149. data/ext/sources/ggml/src/ggml-cuda/reduce_rows.cuh +5 -1
  150. data/ext/sources/ggml/src/ggml-cuda/rope.cu +11 -4
  151. data/ext/sources/ggml/src/ggml-cuda/scale.cu +4 -1
  152. data/ext/sources/ggml/src/ggml-cuda/set-rows.cu +14 -6
  153. data/ext/sources/ggml/src/ggml-cuda/snake.cu +72 -0
  154. data/ext/sources/ggml/src/ggml-cuda/snake.cuh +8 -0
  155. data/ext/sources/ggml/src/ggml-cuda/softcap.cu +4 -1
  156. data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cu +45 -13
  157. data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cuh +1 -1
  158. data/ext/sources/ggml/src/ggml-cuda/ssm-scan.cu +40 -18
  159. data/ext/sources/ggml/src/ggml-cuda/sumrows.cu +8 -4
  160. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_16.cu +1 -0
  161. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_32.cu +1 -0
  162. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_8.cu +2 -0
  163. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_4.cu +1 -0
  164. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_16.cu +1 -0
  165. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_32.cu +1 -0
  166. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_4.cu +1 -0
  167. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_8.cu +2 -0
  168. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_16.cu +1 -0
  169. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_4.cu +1 -0
  170. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_8.cu +2 -0
  171. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_4.cu +1 -0
  172. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_8.cu +2 -0
  173. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq192-dv128.cu +5 -0
  174. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq320-dv256.cu +5 -0
  175. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq512-dv512.cu +5 -0
  176. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-bf16.cu +7 -0
  177. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-f16.cu +7 -0
  178. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q4_0.cu +7 -0
  179. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q4_1.cu +7 -0
  180. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q5_0.cu +7 -0
  181. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q5_1.cu +7 -0
  182. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q8_0.cu +7 -0
  183. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-bf16.cu +7 -0
  184. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-bf16.cu +7 -0
  185. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-bf16.cu +7 -0
  186. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-bf16.cu +7 -0
  187. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-bf16.cu +7 -0
  188. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-bf16.cu +7 -0
  189. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-nvfp4.cu +5 -0
  190. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q1_0.cu +5 -0
  191. data/ext/sources/ggml/src/ggml-cuda/top-k.cu +5 -4
  192. data/ext/sources/ggml/src/ggml-cuda/topk-moe.cu +26 -23
  193. data/ext/sources/ggml/src/ggml-cuda/unary.cu +31 -2
  194. data/ext/sources/ggml/src/ggml-cuda/unary.cuh +2 -0
  195. data/ext/sources/ggml/src/ggml-cuda/vecdotq.cuh +80 -0
  196. data/ext/sources/ggml/src/ggml-cuda/vendors/cuda.h +7 -2
  197. data/ext/sources/ggml/src/ggml-cuda/vendors/hip.h +22 -4
  198. data/ext/sources/ggml/src/ggml-cuda/vendors/musa.h +3 -0
  199. data/ext/sources/ggml/src/ggml-hexagon/CMakeLists.txt +2 -1
  200. data/ext/sources/ggml/src/ggml-hexagon/ggml-hexagon.cpp +1428 -743
  201. data/ext/sources/ggml/src/ggml-hexagon/htp/CMakeLists.txt +45 -7
  202. data/ext/sources/ggml/src/ggml-hexagon/htp/act-ops.c +53 -84
  203. data/ext/sources/ggml/src/ggml-hexagon/htp/argsort-ops.c +25 -12
  204. data/ext/sources/ggml/src/ggml-hexagon/htp/binary-ops.c +165 -184
  205. data/ext/sources/ggml/src/ggml-hexagon/htp/cmake-toolchain.cmake +5 -5
  206. data/ext/sources/ggml/src/ggml-hexagon/htp/concat-ops.c +277 -0
  207. data/ext/sources/ggml/src/ggml-hexagon/htp/cpy-ops.c +170 -127
  208. data/ext/sources/ggml/src/ggml-hexagon/htp/cumsum-ops.c +270 -0
  209. data/ext/sources/ggml/src/ggml-hexagon/htp/diag-ops.c +216 -0
  210. data/ext/sources/ggml/src/ggml-hexagon/htp/fill-ops.c +123 -0
  211. data/ext/sources/ggml/src/ggml-hexagon/htp/flash-attn-ops.c +125 -97
  212. data/ext/sources/ggml/src/ggml-hexagon/htp/gated-delta-net-ops.c +1148 -0
  213. data/ext/sources/ggml/src/ggml-hexagon/htp/get-rows-ops.c +148 -42
  214. data/ext/sources/ggml/src/ggml-hexagon/htp/hex-dma.c +2 -2
  215. data/ext/sources/ggml/src/ggml-hexagon/htp/hex-dma.h +252 -62
  216. data/ext/sources/ggml/src/ggml-hexagon/htp/hex-dump.h +9 -0
  217. data/ext/sources/ggml/src/ggml-hexagon/htp/hex-utils.h +87 -1
  218. data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-flash-attn-ops.c +1878 -0
  219. data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c +2066 -0
  220. data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-ops.c +6 -0
  221. data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-ops.h +88 -0
  222. data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-profile.h +34 -0
  223. data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-queue.c +158 -0
  224. data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-queue.h +134 -0
  225. data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-utils.h +200 -0
  226. data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ctx.h +96 -13
  227. data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ops.h +182 -57
  228. data/ext/sources/ggml/src/ggml-hexagon/htp/htp_iface.idl +9 -3
  229. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-base.h +71 -3
  230. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-copy.h +27 -10
  231. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-div.h +63 -23
  232. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-exp.h +9 -8
  233. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-flash-attn.h +47 -0
  234. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-log.h +65 -0
  235. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-pow.h +42 -0
  236. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-repl.h +74 -0
  237. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sigmoid.h +1 -0
  238. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sin-cos.h +90 -0
  239. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-utils.h +5 -8
  240. data/ext/sources/ggml/src/ggml-hexagon/htp/main.c +529 -815
  241. data/ext/sources/ggml/src/ggml-hexagon/htp/matmul-ops.c +2522 -234
  242. data/ext/sources/ggml/src/ggml-hexagon/htp/pad-ops.c +547 -0
  243. data/ext/sources/ggml/src/ggml-hexagon/htp/repeat-ops.c +148 -0
  244. data/ext/sources/ggml/src/ggml-hexagon/htp/rope-ops.c +291 -95
  245. data/ext/sources/ggml/src/ggml-hexagon/htp/set-rows-ops.c +59 -37
  246. data/ext/sources/ggml/src/ggml-hexagon/htp/softmax-ops.c +121 -133
  247. data/ext/sources/ggml/src/ggml-hexagon/htp/solve-tri-ops.c +267 -0
  248. data/ext/sources/ggml/src/ggml-hexagon/htp/ssm-conv.c +244 -151
  249. data/ext/sources/ggml/src/ggml-hexagon/htp/sum-rows-ops.c +6 -6
  250. data/ext/sources/ggml/src/ggml-hexagon/htp/unary-ops.c +719 -45
  251. data/ext/sources/ggml/src/ggml-hexagon/htp/vtcm-utils.h +16 -0
  252. data/ext/sources/ggml/src/ggml-hexagon/htp-opnode.h +272 -0
  253. data/ext/sources/ggml/src/ggml-hexagon/libggml-htp.inf +3 -1
  254. data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +22 -9
  255. data/ext/sources/ggml/src/ggml-impl.h +6 -1
  256. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.cpp +138 -13
  257. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.h +32 -1
  258. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.m +164 -28
  259. data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +80 -0
  260. data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.cpp +190 -19
  261. data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.h +2 -0
  262. data/ext/sources/ggml/src/ggml-metal/ggml-metal.cpp +39 -26
  263. data/ext/sources/ggml/src/ggml-metal/ggml-metal.metal +823 -322
  264. data/ext/sources/ggml/src/ggml-musa/CMakeLists.txt +5 -6
  265. data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +54 -5
  266. data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +12248 -5907
  267. data/ext/sources/ggml/src/ggml-opencl/kernels/concat.cl +67 -0
  268. data/ext/sources/ggml/src/ggml-opencl/kernels/cpy.cl +59 -0
  269. data/ext/sources/ggml/src/ggml-opencl/kernels/cvt.cl +1819 -112
  270. data/ext/sources/ggml/src/ggml-opencl/kernels/gated_delta_net.cl +249 -0
  271. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_mxfp4_f32_ns.cl +306 -0
  272. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q4_0_f32_ns.cl +256 -0
  273. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q4_1_f32_ns.cl +258 -0
  274. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q4_k_f32_ns.cl +283 -0
  275. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q5_0_f32_ns.cl +260 -0
  276. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q5_1_f32_ns.cl +262 -0
  277. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q5_k_f32_ns.cl +288 -0
  278. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q6_k_f32_ns.cl +267 -0
  279. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_iq4_nl_f32.cl +150 -0
  280. data/ext/sources/ggml/src/ggml-opencl/kernels/{mul_mat_Ab_Bi_8x4.cl → gemm_noshuffle_q4_0_f32.cl} +1 -1
  281. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q4_k_f32.cl +172 -0
  282. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q5_0_f32.cl +131 -0
  283. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q5_1_f32.cl +134 -0
  284. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q5_k_f32.cl +176 -0
  285. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q6_k_f32.cl +140 -0
  286. data/ext/sources/ggml/src/ggml-opencl/kernels/{mul_mm_q8_0_f32_8x4.cl → gemm_noshuffle_q8_0_f32.cl} +1 -1
  287. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_xmem_f16_f32_os8.cl +233 -0
  288. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_mxfp4_f32_ns.cl +165 -0
  289. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q4_0_f32_ns.cl +120 -0
  290. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q4_1_f32_ns.cl +123 -0
  291. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q4_k_f32_ns.cl +155 -0
  292. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q5_0_f32_ns.cl +123 -0
  293. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q5_1_f32_ns.cl +125 -0
  294. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q5_k_f32_ns.cl +160 -0
  295. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q6_k_f32_ns.cl +141 -0
  296. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_iq4_nl_f32.cl +302 -0
  297. data/ext/sources/ggml/src/ggml-opencl/kernels/{gemv_noshuffle_general.cl → gemv_noshuffle_q4_0_f32.cl} +5 -5
  298. data/ext/sources/ggml/src/ggml-opencl/kernels/{gemv_noshuffle.cl → gemv_noshuffle_q4_0_f32_spec.cl} +5 -5
  299. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q4_k_f32.cl +318 -0
  300. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q5_0_f32.cl +291 -0
  301. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q5_1_f32.cl +294 -0
  302. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q5_k_f32.cl +326 -0
  303. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q6_k_f32.cl +293 -0
  304. data/ext/sources/ggml/src/ggml-opencl/kernels/get_rows.cl +15 -9
  305. data/ext/sources/ggml/src/ggml-opencl/kernels/moe_reorder_b.cl +30 -0
  306. data/ext/sources/ggml/src/ggml-opencl/kernels/moe_sort_by_expert.cl +82 -0
  307. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_iq4_nl_f32_l4_lm.cl +171 -0
  308. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q4_k_f32_l4_lm.cl +179 -0
  309. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q5_0_f32_l4_lm.cl +173 -0
  310. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q5_1_f32_l4_lm.cl +175 -0
  311. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q5_k_f32_l4_lm.cl +192 -0
  312. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_iq4_nl_f32.cl +164 -0
  313. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_iq4_nl_f32_flat.cl +202 -0
  314. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_k_f32_flat.cl +196 -0
  315. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_0_f32.cl +241 -0
  316. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_0_f32_flat.cl +243 -0
  317. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_1_f32.cl +243 -0
  318. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_1_f32_flat.cl +247 -0
  319. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_k_f32.cl +187 -0
  320. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_k_f32_flat.cl +203 -0
  321. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32_flat.cl +48 -64
  322. data/ext/sources/ggml/src/ggml-openvino/ggml-decoder.cpp +15 -5
  323. data/ext/sources/ggml/src/ggml-openvino/ggml-openvino-extra.cpp +18 -11
  324. data/ext/sources/ggml/src/ggml-openvino/ggml-openvino.cpp +35 -13
  325. data/ext/sources/ggml/src/ggml-openvino/ggml-quants.cpp +264 -192
  326. data/ext/sources/ggml/src/ggml-openvino/openvino/op/rope.cpp +33 -7
  327. data/ext/sources/ggml/src/ggml-openvino/openvino/op/unary_gelu.cpp +25 -0
  328. data/ext/sources/ggml/src/ggml-openvino/openvino/op_table.cpp +1 -0
  329. data/ext/sources/ggml/src/ggml-openvino/openvino/op_table.h +1 -0
  330. data/ext/sources/ggml/src/ggml-openvino/openvino/rt_info/weightless_caching_attributes.hpp +41 -0
  331. data/ext/sources/ggml/src/ggml-openvino/openvino/translate_session.cpp +27 -3
  332. data/ext/sources/ggml/src/ggml-openvino/openvino/utils.cpp +67 -36
  333. data/ext/sources/ggml/src/ggml-openvino/openvino/utils.h +1 -0
  334. data/ext/sources/ggml/src/ggml-openvino/utils.cpp +101 -44
  335. data/ext/sources/ggml/src/ggml-openvino/utils.h +23 -3
  336. data/ext/sources/ggml/src/ggml-opt.cpp +1 -0
  337. data/ext/sources/ggml/src/ggml-quants.c +289 -114
  338. data/ext/sources/ggml/src/ggml-quants.h +3 -0
  339. data/ext/sources/ggml/src/ggml-rpc/CMakeLists.txt +24 -0
  340. data/ext/sources/ggml/src/ggml-rpc/ggml-rpc.cpp +167 -311
  341. data/ext/sources/ggml/src/ggml-rpc/transport.cpp +683 -0
  342. data/ext/sources/ggml/src/ggml-rpc/transport.h +34 -0
  343. data/ext/sources/ggml/src/ggml-sycl/CMakeLists.txt +50 -4
  344. data/ext/sources/ggml/src/ggml-sycl/add-id.cpp +1 -1
  345. data/ext/sources/ggml/src/ggml-sycl/backend.hpp +3 -1
  346. data/ext/sources/ggml/src/ggml-sycl/common.cpp +74 -2
  347. data/ext/sources/ggml/src/ggml-sycl/common.hpp +41 -1
  348. data/ext/sources/ggml/src/ggml-sycl/convert.cpp +115 -13
  349. data/ext/sources/ggml/src/ggml-sycl/convert.hpp +9 -0
  350. data/ext/sources/ggml/src/ggml-sycl/cumsum.cpp +148 -0
  351. data/ext/sources/ggml/src/ggml-sycl/cumsum.hpp +5 -0
  352. data/ext/sources/ggml/src/ggml-sycl/dequantize.hpp +663 -0
  353. data/ext/sources/ggml/src/ggml-sycl/diag.cpp +67 -0
  354. data/ext/sources/ggml/src/ggml-sycl/diag.hpp +5 -0
  355. data/ext/sources/ggml/src/ggml-sycl/dmmv.cpp +586 -6
  356. data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +1 -90
  357. data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +0 -2
  358. data/ext/sources/ggml/src/ggml-sycl/fattn-buffers.cpp +56 -0
  359. data/ext/sources/ggml/src/ggml-sycl/fattn-buffers.hpp +63 -0
  360. data/ext/sources/ggml/src/ggml-sycl/fattn-common.hpp +7 -5
  361. data/ext/sources/ggml/src/ggml-sycl/fattn-tile.cpp +4 -0
  362. data/ext/sources/ggml/src/ggml-sycl/fattn-tile.hpp +76 -168
  363. data/ext/sources/ggml/src/ggml-sycl/fattn-vec.hpp +7 -0
  364. data/ext/sources/ggml/src/ggml-sycl/fattn.cpp +3 -1
  365. data/ext/sources/ggml/src/ggml-sycl/fill.cpp +55 -0
  366. data/ext/sources/ggml/src/ggml-sycl/fill.hpp +5 -0
  367. data/ext/sources/ggml/src/ggml-sycl/gated_delta_net.cpp +69 -31
  368. data/ext/sources/ggml/src/ggml-sycl/gated_delta_net.hpp +1 -0
  369. data/ext/sources/ggml/src/ggml-sycl/gemm.hpp +3 -0
  370. data/ext/sources/ggml/src/ggml-sycl/getrows.cpp +79 -3
  371. data/ext/sources/ggml/src/ggml-sycl/ggml-sycl.cpp +823 -190
  372. data/ext/sources/ggml/src/ggml-sycl/im2col.cpp +353 -89
  373. data/ext/sources/ggml/src/ggml-sycl/im2col.hpp +5 -3
  374. data/ext/sources/ggml/src/ggml-sycl/mmvq.cpp +1344 -26
  375. data/ext/sources/ggml/src/ggml-sycl/mmvq.hpp +16 -0
  376. data/ext/sources/ggml/src/ggml-sycl/pad.cpp +27 -27
  377. data/ext/sources/ggml/src/ggml-sycl/quants.hpp +71 -0
  378. data/ext/sources/ggml/src/ggml-sycl/set_rows.cpp +7 -1
  379. data/ext/sources/ggml/src/ggml-sycl/solve_tri.cpp +172 -0
  380. data/ext/sources/ggml/src/ggml-sycl/solve_tri.hpp +8 -0
  381. data/ext/sources/ggml/src/ggml-sycl/ssm_conv.cpp +6 -1
  382. data/ext/sources/ggml/src/ggml-sycl/ssm_scan.cpp +156 -0
  383. data/ext/sources/ggml/src/ggml-sycl/ssm_scan.hpp +5 -0
  384. data/ext/sources/ggml/src/ggml-sycl/sycl_hw.cpp +62 -10
  385. data/ext/sources/ggml/src/ggml-sycl/sycl_hw.hpp +18 -6
  386. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq512-dv512.cpp +6 -0
  387. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-f16.cpp +1 -0
  388. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_0.cpp +1 -0
  389. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_1.cpp +1 -0
  390. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_0.cpp +1 -0
  391. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_1.cpp +1 -0
  392. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q8_0.cpp +1 -0
  393. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-f16.cpp +1 -0
  394. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_0.cpp +1 -0
  395. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_1.cpp +1 -0
  396. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_0.cpp +1 -0
  397. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_1.cpp +1 -0
  398. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q8_0.cpp +1 -0
  399. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-f16.cpp +1 -0
  400. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_0.cpp +1 -0
  401. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_1.cpp +1 -0
  402. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_0.cpp +1 -0
  403. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_1.cpp +1 -0
  404. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q8_0.cpp +1 -0
  405. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-f16.cpp +1 -0
  406. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_0.cpp +1 -0
  407. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_1.cpp +1 -0
  408. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_0.cpp +1 -0
  409. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_1.cpp +1 -0
  410. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q8_0.cpp +1 -0
  411. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-f16.cpp +1 -0
  412. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_0.cpp +1 -0
  413. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_1.cpp +1 -0
  414. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_0.cpp +1 -0
  415. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_1.cpp +1 -0
  416. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q8_0.cpp +1 -0
  417. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-f16.cpp +1 -0
  418. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_0.cpp +1 -0
  419. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_1.cpp +1 -0
  420. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_0.cpp +1 -0
  421. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_1.cpp +1 -0
  422. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q8_0.cpp +1 -0
  423. data/ext/sources/ggml/src/ggml-sycl/type.hpp +112 -0
  424. data/ext/sources/ggml/src/ggml-sycl/upscale.cpp +410 -0
  425. data/ext/sources/ggml/src/ggml-sycl/upscale.hpp +9 -0
  426. data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +215 -53
  427. data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-buffer.cpp +4 -0
  428. data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-device.cpp +2 -0
  429. data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend.cpp +2 -0
  430. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-shm.cpp +1 -0
  431. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu.cpp +1 -0
  432. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu.h +0 -2
  433. data/ext/sources/ggml/src/ggml-vulkan/CMakeLists.txt +11 -0
  434. data/ext/sources/ggml/src/ggml-vulkan/ggml-vulkan.cpp +2060 -535
  435. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -0
  436. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/contig_copy.comp +6 -2
  437. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +146 -13
  438. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy.comp +3 -1
  439. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +1 -1
  440. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +25 -1
  441. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.glsl +88 -0
  442. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.glsl +643 -1
  443. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_nvfp4.comp +32 -0
  444. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q1_0.comp +29 -0
  445. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/diag.comp +0 -1
  446. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dot_product_funcs.glsl +27 -0
  447. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/exp.comp +0 -1
  448. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/feature-tests/coopmat2_decode_vector.comp +7 -0
  449. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +197 -48
  450. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.glsl +60 -59
  451. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +115 -113
  452. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +122 -31
  453. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_dequant.glsl +131 -0
  454. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_mmq_funcs.glsl +203 -0
  455. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/fwht.comp +115 -0
  456. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gated_delta_net.comp +125 -64
  457. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/generic_binary_head.glsl +0 -1
  458. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.glsl +10 -1
  459. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_main.glsl +16 -6
  460. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +76 -54
  461. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col_3d.comp +0 -1
  462. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/log.comp +0 -1
  463. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp +122 -27
  464. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iface.glsl +6 -6
  465. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp +1 -1
  466. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp +1 -1
  467. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp +1 -1
  468. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp +1 -0
  469. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq_funcs.glsl +88 -55
  470. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +11 -17
  471. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +43 -10
  472. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.glsl +159 -125
  473. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.glsl +8 -8
  474. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_shmem_types.glsl +24 -9
  475. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp +0 -1
  476. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl +5 -2
  477. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.glsl +0 -1
  478. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl +3 -2
  479. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/snake.comp +49 -0
  480. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/ssm_conv.comp +11 -1
  481. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/tri.comp +0 -1
  482. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/types.glsl +79 -2
  483. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +171 -147
  484. data/ext/sources/ggml/src/ggml-webgpu/CMakeLists.txt +5 -2
  485. data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp +2202 -283
  486. data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu.cpp +2610 -1403
  487. data/ext/sources/ggml/src/ggml-webgpu/pre_wgsl.hpp +37 -7
  488. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/add_id.wgsl +64 -0
  489. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary.wgsl +8 -7
  490. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +76 -95
  491. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/concat.wgsl +19 -1
  492. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/conv2d.wgsl +165 -0
  493. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{cpy.tmpl.wgsl → cpy.wgsl} +25 -50
  494. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn.wgsl +107 -184
  495. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_quant_staging.tmpl +124 -0
  496. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_tile.wgsl +397 -0
  497. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_vec_blk.wgsl +101 -0
  498. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_vec_reduce.wgsl +84 -0
  499. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_vec_split.wgsl +619 -0
  500. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/gated_delta_net.wgsl +149 -0
  501. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/get_rows.wgsl +183 -78
  502. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/glu.wgsl +155 -0
  503. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/im2col.wgsl +101 -0
  504. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl +655 -495
  505. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_id.wgsl +195 -0
  506. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_id_gather.wgsl +52 -0
  507. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_id_vec.wgsl +154 -0
  508. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.wgsl +8 -6
  509. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_subgroup_matrix.wgsl +5 -1
  510. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.wgsl +80 -409
  511. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec_acc.tmpl +1432 -0
  512. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec_q_acc.tmpl +303 -0
  513. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/quant_inner_loops.tmpl +21 -0
  514. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/quantize_q8.wgsl +173 -0
  515. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm_mul.wgsl +152 -0
  516. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{rope.tmpl.wgsl → rope.wgsl} +71 -142
  517. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/row_norm.wgsl +153 -0
  518. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/scale.wgsl +6 -4
  519. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set.wgsl +109 -0
  520. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +2 -3
  521. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows_quant.wgsl +224 -0
  522. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{soft_max.tmpl.wgsl → soft_max.wgsl} +106 -206
  523. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/solve_tri.wgsl +121 -0
  524. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/ssm_conv.wgsl +65 -0
  525. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/ssm_scan.wgsl +193 -0
  526. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/unary.wgsl +68 -48
  527. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/upscale.wgsl +240 -0
  528. data/ext/sources/ggml/src/ggml-zdnn/ggml-zdnn.cpp +18 -14
  529. data/ext/sources/ggml/src/ggml-zendnn/CMakeLists.txt +1 -1
  530. data/ext/sources/ggml/src/ggml-zendnn/ggml-zendnn.cpp +244 -10
  531. data/ext/sources/ggml/src/ggml.c +110 -28
  532. data/ext/sources/ggml/src/gguf.cpp +173 -28
  533. data/ext/sources/include/parakeet.h +342 -0
  534. data/ext/sources/include/whisper.h +10 -0
  535. data/ext/sources/media/matmul.png +0 -0
  536. data/ext/sources/src/CMakeLists.txt +23 -0
  537. data/ext/sources/src/parakeet-arch.h +188 -0
  538. data/ext/sources/src/parakeet.cpp +3838 -0
  539. data/ext/sources/src/whisper.cpp +56 -12
  540. data/extsources.rb +26 -10
  541. data/lib/whisper/log_settable.rb +36 -0
  542. data/lib/whisper/model/uri.rb +13 -1
  543. data/lib/whisper/output.rb +74 -0
  544. data/sig/whisper.rbs +411 -62
  545. data/test/helper.rb +2 -0
  546. data/test/jfk_reader/jfk_reader.c +50 -7
  547. data/test/test_callback.rb +1 -0
  548. data/test/test_package.rb +6 -5
  549. data/test/test_parakeet.rb +28 -0
  550. data/test/test_parakeet_callback.rb +107 -0
  551. data/test/test_parakeet_context.rb +116 -0
  552. data/test/test_parakeet_context_params.rb +24 -0
  553. data/test/test_parakeet_model.rb +21 -0
  554. data/test/test_parakeet_params.rb +78 -0
  555. data/test/test_parakeet_segment.rb +42 -0
  556. data/test/test_parakeet_token.rb +73 -0
  557. data/test/test_params.rb +2 -0
  558. data/test/test_vad_segment.rb +1 -1
  559. data/test/test_whisper.rb +24 -6
  560. data/whispercpp.gemspec +2 -2
  561. metadata +215 -281
  562. data/ext/sources/bindings/javascript/CMakeLists.txt +0 -41
  563. data/ext/sources/bindings/javascript/emscripten.cpp +0 -93
  564. data/ext/sources/bindings/javascript/libwhisper.worker.js +0 -1
  565. data/ext/sources/bindings/javascript/package.json +0 -26
  566. data/ext/sources/bindings/javascript/whisper.js +0 -19
  567. data/ext/sources/examples/addon.node/CMakeLists.txt +0 -31
  568. data/ext/sources/examples/addon.node/__test__/whisper.spec.js +0 -133
  569. data/ext/sources/examples/addon.node/addon.cpp +0 -557
  570. data/ext/sources/examples/addon.node/index.js +0 -59
  571. data/ext/sources/examples/addon.node/package.json +0 -16
  572. data/ext/sources/examples/addon.node/vad-example.js +0 -132
  573. data/ext/sources/examples/bench.wasm/CMakeLists.txt +0 -49
  574. data/ext/sources/examples/bench.wasm/emscripten.cpp +0 -87
  575. data/ext/sources/examples/bench.wasm/index-tmpl.html +0 -285
  576. data/ext/sources/examples/coi-serviceworker.js +0 -146
  577. data/ext/sources/examples/command/CMakeLists.txt +0 -10
  578. data/ext/sources/examples/command/command.cpp +0 -802
  579. data/ext/sources/examples/command/commands.txt +0 -9
  580. data/ext/sources/examples/command.wasm/CMakeLists.txt +0 -50
  581. data/ext/sources/examples/command.wasm/emscripten.cpp +0 -327
  582. data/ext/sources/examples/command.wasm/index-tmpl.html +0 -415
  583. data/ext/sources/examples/generate-karaoke.sh +0 -57
  584. data/ext/sources/examples/helpers.js +0 -191
  585. data/ext/sources/examples/livestream.sh +0 -112
  586. data/ext/sources/examples/lsp/CMakeLists.txt +0 -10
  587. data/ext/sources/examples/lsp/lsp.cpp +0 -471
  588. data/ext/sources/examples/lsp/whisper.vim +0 -362
  589. data/ext/sources/examples/python/test_whisper_processor.py +0 -7
  590. data/ext/sources/examples/python/whisper_processor.py +0 -54
  591. data/ext/sources/examples/server/bench.js +0 -29
  592. data/ext/sources/examples/server.py +0 -120
  593. data/ext/sources/examples/stream/CMakeLists.txt +0 -10
  594. data/ext/sources/examples/stream/stream.cpp +0 -437
  595. data/ext/sources/examples/stream.wasm/CMakeLists.txt +0 -49
  596. data/ext/sources/examples/stream.wasm/emscripten.cpp +0 -216
  597. data/ext/sources/examples/stream.wasm/index-tmpl.html +0 -491
  598. data/ext/sources/examples/sycl/CMakeLists.txt +0 -9
  599. data/ext/sources/examples/sycl/build.sh +0 -22
  600. data/ext/sources/examples/sycl/ls-sycl-device.cpp +0 -11
  601. data/ext/sources/examples/sycl/run-whisper.sh +0 -17
  602. data/ext/sources/examples/talk-llama/CMakeLists.txt +0 -48
  603. data/ext/sources/examples/talk-llama/eleven-labs.py +0 -80
  604. data/ext/sources/examples/talk-llama/llama-adapter.cpp +0 -488
  605. data/ext/sources/examples/talk-llama/llama-adapter.h +0 -89
  606. data/ext/sources/examples/talk-llama/llama-arch.cpp +0 -2877
  607. data/ext/sources/examples/talk-llama/llama-arch.h +0 -628
  608. data/ext/sources/examples/talk-llama/llama-batch.cpp +0 -919
  609. data/ext/sources/examples/talk-llama/llama-batch.h +0 -173
  610. data/ext/sources/examples/talk-llama/llama-chat.cpp +0 -896
  611. data/ext/sources/examples/talk-llama/llama-chat.h +0 -71
  612. data/ext/sources/examples/talk-llama/llama-context.cpp +0 -3633
  613. data/ext/sources/examples/talk-llama/llama-context.h +0 -359
  614. data/ext/sources/examples/talk-llama/llama-cparams.cpp +0 -5
  615. data/ext/sources/examples/talk-llama/llama-cparams.h +0 -47
  616. data/ext/sources/examples/talk-llama/llama-ext.h +0 -12
  617. data/ext/sources/examples/talk-llama/llama-grammar.cpp +0 -1464
  618. data/ext/sources/examples/talk-llama/llama-grammar.h +0 -194
  619. data/ext/sources/examples/talk-llama/llama-graph.cpp +0 -2735
  620. data/ext/sources/examples/talk-llama/llama-graph.h +0 -1031
  621. data/ext/sources/examples/talk-llama/llama-hparams.cpp +0 -258
  622. data/ext/sources/examples/talk-llama/llama-hparams.h +0 -353
  623. data/ext/sources/examples/talk-llama/llama-impl.cpp +0 -171
  624. data/ext/sources/examples/talk-llama/llama-impl.h +0 -75
  625. data/ext/sources/examples/talk-llama/llama-io.cpp +0 -15
  626. data/ext/sources/examples/talk-llama/llama-io.h +0 -35
  627. data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.cpp +0 -330
  628. data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.h +0 -137
  629. data/ext/sources/examples/talk-llama/llama-kv-cache.cpp +0 -2285
  630. data/ext/sources/examples/talk-llama/llama-kv-cache.h +0 -389
  631. data/ext/sources/examples/talk-llama/llama-kv-cells.h +0 -533
  632. data/ext/sources/examples/talk-llama/llama-memory-hybrid-iswa.cpp +0 -275
  633. data/ext/sources/examples/talk-llama/llama-memory-hybrid-iswa.h +0 -140
  634. data/ext/sources/examples/talk-llama/llama-memory-hybrid.cpp +0 -268
  635. data/ext/sources/examples/talk-llama/llama-memory-hybrid.h +0 -139
  636. data/ext/sources/examples/talk-llama/llama-memory-recurrent.cpp +0 -1165
  637. data/ext/sources/examples/talk-llama/llama-memory-recurrent.h +0 -182
  638. data/ext/sources/examples/talk-llama/llama-memory.cpp +0 -59
  639. data/ext/sources/examples/talk-llama/llama-memory.h +0 -122
  640. data/ext/sources/examples/talk-llama/llama-mmap.cpp +0 -752
  641. data/ext/sources/examples/talk-llama/llama-mmap.h +0 -73
  642. data/ext/sources/examples/talk-llama/llama-model-loader.cpp +0 -1655
  643. data/ext/sources/examples/talk-llama/llama-model-loader.h +0 -206
  644. data/ext/sources/examples/talk-llama/llama-model-saver.cpp +0 -299
  645. data/ext/sources/examples/talk-llama/llama-model-saver.h +0 -40
  646. data/ext/sources/examples/talk-llama/llama-model.cpp +0 -9056
  647. data/ext/sources/examples/talk-llama/llama-model.h +0 -597
  648. data/ext/sources/examples/talk-llama/llama-quant.cpp +0 -1304
  649. data/ext/sources/examples/talk-llama/llama-quant.h +0 -1
  650. data/ext/sources/examples/talk-llama/llama-sampler.cpp +0 -3885
  651. data/ext/sources/examples/talk-llama/llama-sampler.h +0 -42
  652. data/ext/sources/examples/talk-llama/llama-vocab.cpp +0 -3970
  653. data/ext/sources/examples/talk-llama/llama-vocab.h +0 -187
  654. data/ext/sources/examples/talk-llama/llama.cpp +0 -1194
  655. data/ext/sources/examples/talk-llama/llama.h +0 -1573
  656. data/ext/sources/examples/talk-llama/models/afmoe.cpp +0 -190
  657. data/ext/sources/examples/talk-llama/models/apertus.cpp +0 -125
  658. data/ext/sources/examples/talk-llama/models/arcee.cpp +0 -135
  659. data/ext/sources/examples/talk-llama/models/arctic.cpp +0 -137
  660. data/ext/sources/examples/talk-llama/models/arwkv7.cpp +0 -86
  661. data/ext/sources/examples/talk-llama/models/baichuan.cpp +0 -123
  662. data/ext/sources/examples/talk-llama/models/bailingmoe.cpp +0 -143
  663. data/ext/sources/examples/talk-llama/models/bailingmoe2.cpp +0 -133
  664. data/ext/sources/examples/talk-llama/models/bert.cpp +0 -184
  665. data/ext/sources/examples/talk-llama/models/bitnet.cpp +0 -145
  666. data/ext/sources/examples/talk-llama/models/bloom.cpp +0 -101
  667. data/ext/sources/examples/talk-llama/models/chameleon.cpp +0 -178
  668. data/ext/sources/examples/talk-llama/models/chatglm.cpp +0 -132
  669. data/ext/sources/examples/talk-llama/models/codeshell.cpp +0 -111
  670. data/ext/sources/examples/talk-llama/models/cogvlm.cpp +0 -102
  671. data/ext/sources/examples/talk-llama/models/cohere2-iswa.cpp +0 -134
  672. data/ext/sources/examples/talk-llama/models/command-r.cpp +0 -122
  673. data/ext/sources/examples/talk-llama/models/dbrx.cpp +0 -122
  674. data/ext/sources/examples/talk-llama/models/deci.cpp +0 -135
  675. data/ext/sources/examples/talk-llama/models/deepseek.cpp +0 -142
  676. data/ext/sources/examples/talk-llama/models/deepseek2.cpp +0 -262
  677. data/ext/sources/examples/talk-llama/models/delta-net-base.cpp +0 -445
  678. data/ext/sources/examples/talk-llama/models/dots1.cpp +0 -132
  679. data/ext/sources/examples/talk-llama/models/dream.cpp +0 -105
  680. data/ext/sources/examples/talk-llama/models/ernie4-5-moe.cpp +0 -148
  681. data/ext/sources/examples/talk-llama/models/ernie4-5.cpp +0 -110
  682. data/ext/sources/examples/talk-llama/models/eurobert.cpp +0 -97
  683. data/ext/sources/examples/talk-llama/models/exaone-moe.cpp +0 -145
  684. data/ext/sources/examples/talk-llama/models/exaone.cpp +0 -114
  685. data/ext/sources/examples/talk-llama/models/exaone4.cpp +0 -123
  686. data/ext/sources/examples/talk-llama/models/falcon-h1.cpp +0 -111
  687. data/ext/sources/examples/talk-llama/models/falcon.cpp +0 -120
  688. data/ext/sources/examples/talk-llama/models/gemma-embedding.cpp +0 -116
  689. data/ext/sources/examples/talk-llama/models/gemma.cpp +0 -112
  690. data/ext/sources/examples/talk-llama/models/gemma2-iswa.cpp +0 -128
  691. data/ext/sources/examples/talk-llama/models/gemma3.cpp +0 -155
  692. data/ext/sources/examples/talk-llama/models/gemma3n-iswa.cpp +0 -384
  693. data/ext/sources/examples/talk-llama/models/glm4-moe.cpp +0 -170
  694. data/ext/sources/examples/talk-llama/models/glm4.cpp +0 -157
  695. data/ext/sources/examples/talk-llama/models/gpt2.cpp +0 -105
  696. data/ext/sources/examples/talk-llama/models/gptneox.cpp +0 -144
  697. data/ext/sources/examples/talk-llama/models/granite-hybrid.cpp +0 -195
  698. data/ext/sources/examples/talk-llama/models/granite.cpp +0 -210
  699. data/ext/sources/examples/talk-llama/models/grok.cpp +0 -159
  700. data/ext/sources/examples/talk-llama/models/grovemoe.cpp +0 -139
  701. data/ext/sources/examples/talk-llama/models/hunyuan-dense.cpp +0 -132
  702. data/ext/sources/examples/talk-llama/models/hunyuan-moe.cpp +0 -153
  703. data/ext/sources/examples/talk-llama/models/internlm2.cpp +0 -120
  704. data/ext/sources/examples/talk-llama/models/jais.cpp +0 -86
  705. data/ext/sources/examples/talk-llama/models/jais2.cpp +0 -123
  706. data/ext/sources/examples/talk-llama/models/jamba.cpp +0 -106
  707. data/ext/sources/examples/talk-llama/models/kimi-linear.cpp +0 -381
  708. data/ext/sources/examples/talk-llama/models/lfm2.cpp +0 -196
  709. data/ext/sources/examples/talk-llama/models/llada-moe.cpp +0 -122
  710. data/ext/sources/examples/talk-llama/models/llada.cpp +0 -99
  711. data/ext/sources/examples/talk-llama/models/llama-iswa.cpp +0 -178
  712. data/ext/sources/examples/talk-llama/models/llama.cpp +0 -175
  713. data/ext/sources/examples/talk-llama/models/maincoder.cpp +0 -117
  714. data/ext/sources/examples/talk-llama/models/mamba-base.cpp +0 -289
  715. data/ext/sources/examples/talk-llama/models/mamba.cpp +0 -54
  716. data/ext/sources/examples/talk-llama/models/mimo2-iswa.cpp +0 -129
  717. data/ext/sources/examples/talk-llama/models/minicpm3.cpp +0 -200
  718. data/ext/sources/examples/talk-llama/models/minimax-m2.cpp +0 -123
  719. data/ext/sources/examples/talk-llama/models/mistral3.cpp +0 -160
  720. data/ext/sources/examples/talk-llama/models/models.h +0 -704
  721. data/ext/sources/examples/talk-llama/models/modern-bert.cpp +0 -109
  722. data/ext/sources/examples/talk-llama/models/mpt.cpp +0 -126
  723. data/ext/sources/examples/talk-llama/models/nemotron-h.cpp +0 -162
  724. data/ext/sources/examples/talk-llama/models/nemotron.cpp +0 -122
  725. data/ext/sources/examples/talk-llama/models/neo-bert.cpp +0 -104
  726. data/ext/sources/examples/talk-llama/models/olmo.cpp +0 -121
  727. data/ext/sources/examples/talk-llama/models/olmo2.cpp +0 -150
  728. data/ext/sources/examples/talk-llama/models/olmoe.cpp +0 -124
  729. data/ext/sources/examples/talk-llama/models/openai-moe-iswa.cpp +0 -127
  730. data/ext/sources/examples/talk-llama/models/openelm.cpp +0 -124
  731. data/ext/sources/examples/talk-llama/models/orion.cpp +0 -123
  732. data/ext/sources/examples/talk-llama/models/paddleocr.cpp +0 -122
  733. data/ext/sources/examples/talk-llama/models/pangu-embedded.cpp +0 -121
  734. data/ext/sources/examples/talk-llama/models/phi2.cpp +0 -121
  735. data/ext/sources/examples/talk-llama/models/phi3.cpp +0 -152
  736. data/ext/sources/examples/talk-llama/models/plamo.cpp +0 -110
  737. data/ext/sources/examples/talk-llama/models/plamo2.cpp +0 -320
  738. data/ext/sources/examples/talk-llama/models/plamo3.cpp +0 -128
  739. data/ext/sources/examples/talk-llama/models/plm.cpp +0 -169
  740. data/ext/sources/examples/talk-llama/models/qwen.cpp +0 -108
  741. data/ext/sources/examples/talk-llama/models/qwen2.cpp +0 -126
  742. data/ext/sources/examples/talk-llama/models/qwen2moe.cpp +0 -151
  743. data/ext/sources/examples/talk-llama/models/qwen2vl.cpp +0 -117
  744. data/ext/sources/examples/talk-llama/models/qwen3.cpp +0 -120
  745. data/ext/sources/examples/talk-llama/models/qwen35.cpp +0 -381
  746. data/ext/sources/examples/talk-llama/models/qwen35moe.cpp +0 -422
  747. data/ext/sources/examples/talk-llama/models/qwen3moe.cpp +0 -131
  748. data/ext/sources/examples/talk-llama/models/qwen3next.cpp +0 -525
  749. data/ext/sources/examples/talk-llama/models/qwen3vl-moe.cpp +0 -140
  750. data/ext/sources/examples/talk-llama/models/qwen3vl.cpp +0 -132
  751. data/ext/sources/examples/talk-llama/models/refact.cpp +0 -94
  752. data/ext/sources/examples/talk-llama/models/rnd1.cpp +0 -126
  753. data/ext/sources/examples/talk-llama/models/rwkv6-base.cpp +0 -164
  754. data/ext/sources/examples/talk-llama/models/rwkv6.cpp +0 -94
  755. data/ext/sources/examples/talk-llama/models/rwkv6qwen2.cpp +0 -86
  756. data/ext/sources/examples/talk-llama/models/rwkv7-base.cpp +0 -137
  757. data/ext/sources/examples/talk-llama/models/rwkv7.cpp +0 -90
  758. data/ext/sources/examples/talk-llama/models/seed-oss.cpp +0 -124
  759. data/ext/sources/examples/talk-llama/models/smallthinker.cpp +0 -126
  760. data/ext/sources/examples/talk-llama/models/smollm3.cpp +0 -128
  761. data/ext/sources/examples/talk-llama/models/stablelm.cpp +0 -146
  762. data/ext/sources/examples/talk-llama/models/starcoder.cpp +0 -100
  763. data/ext/sources/examples/talk-llama/models/starcoder2.cpp +0 -121
  764. data/ext/sources/examples/talk-llama/models/step35-iswa.cpp +0 -165
  765. data/ext/sources/examples/talk-llama/models/t5-dec.cpp +0 -166
  766. data/ext/sources/examples/talk-llama/models/t5-enc.cpp +0 -96
  767. data/ext/sources/examples/talk-llama/models/wavtokenizer-dec.cpp +0 -149
  768. data/ext/sources/examples/talk-llama/models/xverse.cpp +0 -108
  769. data/ext/sources/examples/talk-llama/prompts/talk-alpaca.txt +0 -23
  770. data/ext/sources/examples/talk-llama/speak +0 -40
  771. data/ext/sources/examples/talk-llama/speak.bat +0 -1
  772. data/ext/sources/examples/talk-llama/speak.ps1 +0 -14
  773. data/ext/sources/examples/talk-llama/talk-llama.cpp +0 -813
  774. data/ext/sources/examples/talk-llama/unicode-data.cpp +0 -7034
  775. data/ext/sources/examples/talk-llama/unicode-data.h +0 -20
  776. data/ext/sources/examples/talk-llama/unicode.cpp +0 -1103
  777. data/ext/sources/examples/talk-llama/unicode.h +0 -111
  778. data/ext/sources/examples/wchess/CMakeLists.txt +0 -10
  779. data/ext/sources/examples/wchess/libwchess/CMakeLists.txt +0 -19
  780. data/ext/sources/examples/wchess/libwchess/Chessboard.cpp +0 -803
  781. data/ext/sources/examples/wchess/libwchess/Chessboard.h +0 -33
  782. data/ext/sources/examples/wchess/libwchess/WChess.cpp +0 -193
  783. data/ext/sources/examples/wchess/libwchess/WChess.h +0 -63
  784. data/ext/sources/examples/wchess/libwchess/test-chessboard.cpp +0 -117
  785. data/ext/sources/examples/wchess/wchess.cmd/CMakeLists.txt +0 -8
  786. data/ext/sources/examples/wchess/wchess.cmd/wchess.cmd.cpp +0 -253
  787. data/ext/sources/examples/whisper.wasm/CMakeLists.txt +0 -50
  788. data/ext/sources/examples/whisper.wasm/emscripten.cpp +0 -118
  789. data/ext/sources/examples/whisper.wasm/index-tmpl.html +0 -659
  790. data/ext/sources/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +0 -99
  791. data/ext/sources/ggml/src/ggml-hexagon/htp/htp-msg.h +0 -155
  792. data/ext/sources/ggml/src/ggml-hexagon/op-desc.h +0 -153
  793. data/ext/sources/ggml/src/ggml-opencl/kernels/embed_kernel.py +0 -26
  794. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.cpp +0 -123
  795. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.h +0 -17
  796. data/ext/sources/ggml/src/ggml-virtgpu/regenerate_remoting.py +0 -333
  797. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rte.glsl +0 -5
  798. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +0 -182
  799. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/glu.tmpl.wgsl +0 -323
  800. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl +0 -718
  801. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm.wgsl +0 -123
  802. data/ext/sources/tests/CMakeLists.txt +0 -112
  803. data/ext/sources/tests/earnings21/eval.mk +0 -58
  804. data/ext/sources/tests/earnings21/eval.py +0 -68
  805. data/ext/sources/tests/earnings21/normalizers/__init__.py +0 -2
  806. data/ext/sources/tests/earnings21/normalizers/basic.py +0 -80
  807. data/ext/sources/tests/earnings21/normalizers/english.json +0 -1741
  808. data/ext/sources/tests/earnings21/normalizers/english.py +0 -550
  809. data/ext/sources/tests/earnings21/requirements.txt +0 -6
  810. data/ext/sources/tests/en-0-ref.txt +0 -1
  811. data/ext/sources/tests/en-1-ref.txt +0 -1
  812. data/ext/sources/tests/en-2-ref.txt +0 -1
  813. data/ext/sources/tests/es-0-ref.txt +0 -1
  814. data/ext/sources/tests/librispeech/eval.mk +0 -39
  815. data/ext/sources/tests/librispeech/eval.py +0 -47
  816. data/ext/sources/tests/librispeech/normalizers/__init__.py +0 -2
  817. data/ext/sources/tests/librispeech/normalizers/basic.py +0 -80
  818. data/ext/sources/tests/librispeech/normalizers/english.json +0 -1741
  819. data/ext/sources/tests/librispeech/normalizers/english.py +0 -550
  820. data/ext/sources/tests/librispeech/requirements.txt +0 -6
  821. data/ext/sources/tests/run-tests.sh +0 -130
  822. data/ext/sources/tests/test-c.c +0 -3
  823. data/ext/sources/tests/test-vad-full.cpp +0 -56
  824. data/ext/sources/tests/test-vad.cpp +0 -83
  825. data/ext/sources/tests/test-whisper.js +0 -58
  826. data/lib/whisper/context.rb +0 -15
  827. data/lib/whisper/segment.rb +0 -58
  828. /data/ext/sources/ggml/src/ggml-opencl/kernels/{gemv_noshuffle_general_q8_0_f32.cl → gemv_noshuffle_q8_0_f32.cl} +0 -0
@@ -7,10 +7,17 @@
7
7
 
8
8
  #include <atomic>
9
9
  #include <chrono>
10
- #include <cstddef>
11
10
  #include <mutex>
11
+ #include <thread>
12
+ #include <cstddef>
12
13
  #include <stdexcept>
13
14
  #include <string>
15
+ #include <sstream>
16
+ #include <iomanip>
17
+ #include <unordered_set>
18
+ #include <unordered_map>
19
+ #include <regex>
20
+ #include <queue>
14
21
 
15
22
  #ifdef _WIN32
16
23
  # include <sal.h>
@@ -32,23 +39,38 @@
32
39
  #include "ggml-hexagon.h"
33
40
  #include "ggml-impl.h"
34
41
  #include "ggml-quants.h"
35
- #include "op-desc.h"
36
- #include "htp-msg.h"
42
+ #include "htp-opnode.h"
43
+ #include "htp-ops.h"
37
44
  #include "htp_iface.h"
38
45
  #include "htp-drv.h"
39
46
 
40
- static size_t opt_ndev = 1;
41
- static size_t opt_nhvx = 0; // use all
42
- static int opt_arch = 0; // autodetect
43
- static int opt_etm = 0;
44
- static int opt_verbose = 0;
45
- static int opt_profile = 0;
46
- static int opt_hostbuf = 1; // hostbuf ON by default
47
- static int opt_experimental = 0;
47
+ using intvec = std::vector<int>;
48
+ using uintvec = std::vector<unsigned int>;
49
+ using u32vec = std::vector<uint32_t>;
50
+
51
+ static int opt_arch = 0; // autodetect
52
+ static size_t opt_ndev = 1;
53
+ static size_t opt_nhvx = 0; // use all
54
+ static int opt_use_hmx = 1; // when set, enable HMX; when 0, use HVX only
55
+ static size_t opt_vmem = HTP_OP_MAX_VMEM_DEFAULT; // max available va space for buffer mappings
56
+ static size_t opt_mbuf = 1ul * 1024 * 1024 * 1024; // max buffer size
57
+ static int opt_etm = 0;
58
+ static int opt_verbose = 0;
59
+ static int opt_profile = 0; // profiling mode (0-disabled, 1-basic, 2-pmu)
60
+ static int opt_hostbuf = 1; // hostbuf ON by default
61
+
62
+ // Default PMU events, if profiling with PMU (mode=2) is enabled
63
+ // See https://docs.qualcomm.com/doc/80-N2040-60/topic/pmu-events.html
64
+ // https://docs.qualcomm.com/doc/80-N2040-61/topic/hvx-pmu-events.html
65
+ static u32vec opt_pmu_evt { 0x3, 0x111, 0x100, 0x105, 0x240, 0x256, 0x7D, 0x8C };
48
66
 
49
67
  // Enable all stages by default
50
- static int opt_opmask = HTP_OPMASK_QUEUE | HTP_OPMASK_QUANTIZE | HTP_OPMASK_COMPUTE;
51
- static int opt_opsync = 0; // synchronous ops
68
+ static int opt_opstage = HTP_OPSTAGE_QUEUE | HTP_OPSTAGE_COMPUTE;
69
+ static int opt_opbatch = 1024; // max number of ops in a batch
70
+ static int opt_opqueue = 16; // max number of pending batches
71
+ static int opt_oppoll = 0; // polling for batch completions
72
+
73
+ static std::regex* opt_opfilter = NULL; // regex of ops to not claim
52
74
 
53
75
  #define HEX_VERBOSE(...) \
54
76
  if (opt_verbose) GGML_LOG_DEBUG(__VA_ARGS__)
@@ -80,47 +102,45 @@ static const char * status_to_str(uint32_t status) {
80
102
 
81
103
  // ** debug helpers
82
104
 
83
- static void ggml_hexagon_dump_op_exec(const std::string &sess_name, const ggml_tensor * op, const uint32_t req_flags) {
105
+ static void ggml_hexagon_dump_op_exec(const std::string &sess_name, const htp_opnode & node, const uint32_t req_flags) {
84
106
  if (!opt_verbose) return;
85
107
 
86
- op_desc desc(op);
108
+ htp_opformat fmt(node);
87
109
  GGML_LOG_DEBUG("ggml-hex: %s execute-op %s: %s : %s : %s : %s : %s : flags 0x%x\n", sess_name.c_str(),
88
- ggml_op_name(op->op), desc.names, desc.dims, desc.types, desc.strides, desc.buffs, req_flags);
110
+ node.op_name().c_str(), fmt.names, fmt.dims, fmt.types, fmt.strides, fmt.buffs, req_flags);
89
111
  }
90
112
 
91
113
  static void ggml_hexagon_dump_op_supp(const std::string &sess_name, const struct ggml_tensor * op, bool supp) {
92
114
  if (!opt_verbose) return;
93
115
 
94
- op_desc desc(op);
95
- GGML_LOG_DEBUG("ggml-hex: %s supports-op %s : %s : %s : %s : %s : %s : %s\n", sess_name.c_str(),
96
- ggml_op_name(op->op), desc.names, desc.dims, desc.types, desc.strides, desc.buffs, supp ? "yes" : "no");
116
+ htp_opformat fmt(htp_opformat(htp_opnode{const_cast<ggml_tensor*>(op), {}, HTP_OP_INVALID}));
117
+ GGML_LOG_DEBUG("ggml-hex: %s supports-op %s: %s : %s : %s : %s : %s : %s\n", sess_name.c_str(),
118
+ ggml_op_desc(op), fmt.names, fmt.dims, fmt.types, fmt.strides, fmt.buffs, supp ? "yes" : "no");
97
119
  }
98
120
 
99
- static void ggml_hexagon_dump_op_prof(const std::string &sess_name, const ggml_tensor * op,
100
- uint32_t op_usec, uint32_t op_cycles, uint32_t op_pkts, uint64_t call_usec) {
121
+ static void ggml_hexagon_dump_op_prof(const std::string &sess_name, const htp_opnode & node,
122
+ uint32_t op_usec, uint32_t op_cycles, const uint32_t pmu[]) {
101
123
  if (!opt_profile) return;
102
124
 
103
- op_desc desc(op);
104
- GGML_LOG_DEBUG("ggml-hex: %s profile-op %s: %s : %s : %s : %s : %s : op-usec %u op-cycles %u op-pkts %u (%f) call-usec %llu\n", sess_name.c_str(),
105
- ggml_op_name(op->op), desc.names, desc.dims, desc.types, desc.strides, desc.buffs,
106
- op_usec, op_cycles, op_pkts, (float) op_cycles / op_pkts, (unsigned long long) call_usec);
125
+ char pmu_str[256] = "";
126
+ if (opt_profile > 1) {
127
+ static_assert(HTP_PROF_PMU_NCNT == 8, "current implementation assumes 8 PMU counters");
128
+ sprintf(pmu_str, " pmu [%u,%u,%u,%u,%u,%u,%u,%u]",
129
+ pmu[0], pmu[1], pmu[2], pmu[3], pmu[4], pmu[5], pmu[6], pmu[7]);
130
+ }
131
+
132
+ htp_opformat fmt(node);
133
+ GGML_LOG_DEBUG("ggml-hex: %s profile-op %s: %s : %s : %s : %s : usec %u cycles %u%s\n", sess_name.c_str(),
134
+ node.op_name().c_str(), fmt.names, fmt.dims, fmt.types, fmt.strides, op_usec, op_cycles, pmu_str);
107
135
  }
108
136
 
109
137
  // ** backend sessions
110
138
 
111
- struct ggml_hexagon_session {
112
- ggml_hexagon_session(int dev_id, ggml_backend_dev_t dev) noexcept(false);
113
- ~ggml_hexagon_session() noexcept(true);
114
-
115
- void allocate(int dev_id) noexcept(false);
116
- void release() noexcept(true);
117
-
118
- void enqueue(struct htp_general_req &req, struct dspqueue_buffer *bufs, uint32_t n_bufs, bool sync = false);
119
- void flush();
120
-
121
- ggml_backend_buffer_type buffer_type = {};
122
- ggml_backend_buffer_type repack_buffer_type = {};
139
+ struct ggml_hexagon_opbatch;
140
+ struct ggml_hexagon_opqueue;
141
+ struct htp_opnode;
123
142
 
143
+ struct ggml_hexagon_session {
124
144
  std::string name;
125
145
  remote_handle64 handle;
126
146
  dspqueue_t queue;
@@ -132,87 +152,28 @@ struct ggml_hexagon_session {
132
152
  bool valid_handle;
133
153
  bool valid_queue;
134
154
  bool valid_iface;
135
- std::atomic<int> op_pending;
136
- uint32_t prof_usecs;
137
- uint32_t prof_cycles;
138
- uint32_t prof_pkts;
139
- };
140
-
141
- void ggml_hexagon_session::enqueue(struct htp_general_req &req, struct dspqueue_buffer *bufs, uint32_t n_bufs, bool sync) {
142
- // Bump pending flag (cleared in the session::flush once we get the response)
143
- this->op_pending++; // atomic inc
144
-
145
- int err = dspqueue_write(this->queue,
146
- 0, // flags - the framework will autoset this
147
- n_bufs, // number of buffers
148
- bufs, // buffer references
149
- sizeof(req), // Message length
150
- (const uint8_t *) &req, // Message
151
- DSPQUEUE_TIMEOUT // Timeout
152
- );
153
-
154
- if (err != 0) {
155
- GGML_ABORT("ggml-hex: %s dspqueue_write failed: 0x%08x\n", this->name.c_str(), (unsigned) err);
156
- }
157
-
158
- if (sync) {
159
- flush();
160
- }
161
- }
162
-
163
- // Flush HTP response queue i.e wait for all outstanding requests to complete
164
- void ggml_hexagon_session::flush() {
165
- dspqueue_t q = this->queue;
166
-
167
- // Repeatedly read packets from the queue until it's empty. We don't
168
- // necessarily get a separate callback for each packet, and new packets
169
- // may arrive while we're processing the previous one.
170
-
171
- while (this->op_pending) {
172
- struct htp_general_rsp rsp;
173
- uint32_t rsp_size;
174
- uint32_t flags;
175
155
 
176
- struct dspqueue_buffer bufs[HTP_MAX_PACKET_BUFFERS];
177
- uint32_t n_bufs;
178
-
179
- // Read response packet from queue
180
- int err = dspqueue_read(q, &flags,
181
- HTP_MAX_PACKET_BUFFERS, // Maximum number of buffer references
182
- &n_bufs, // Number of buffer references
183
- bufs, // Buffer references
184
- sizeof(rsp), // Max message length
185
- &rsp_size, // Message length
186
- (uint8_t *) &rsp, // Message
187
- DSPQUEUE_TIMEOUT); // Timeout
156
+ std::atomic<int> op_pending;
157
+ ggml_hexagon_opbatch* op_batch;
158
+ ggml_hexagon_opqueue* op_queue;
188
159
 
189
- if (err == AEE_EEXPIRED) {
190
- // TODO: might need to bail out if the HTP is stuck on something
191
- continue;
192
- }
160
+ ggml_backend_buffer_type buffer_type = {};
161
+ ggml_backend_buffer_type repack_buffer_type = {};
193
162
 
194
- if (err != 0) {
195
- GGML_ABORT("ggml-hex: dspqueue_read failed: 0x%08x\n", (unsigned) err);
196
- }
163
+ ggml_hexagon_session(int dev_id, ggml_backend_dev_t dev) noexcept(false);
164
+ ~ggml_hexagon_session() noexcept(true);
197
165
 
198
- // Basic sanity checks
199
- if (rsp_size != sizeof(rsp)) {
200
- GGML_ABORT("ggml-hex: dspcall : bad response (size)\n");
201
- }
166
+ const char* c_name() const { return name.c_str(); }
202
167
 
203
- if (rsp.status != HTP_STATUS_OK) {
204
- GGML_LOG_ERROR("ggml-hex: dspcall : dsp-rsp: %s\n", status_to_str(rsp.status));
205
- // TODO: handle errors
206
- }
168
+ void allocate(int dev_id) noexcept(false);
169
+ void release() noexcept(true);
207
170
 
208
- // TODO: update profiling implementation, currently only works for opt_opsync mode
209
- this->prof_usecs = rsp.prof_usecs;
210
- this->prof_cycles = rsp.prof_cycles;
211
- this->prof_pkts = rsp.prof_pkts;
171
+ void enqueue_op(const htp_opnode & node);
172
+ void flush(bool all = true);
212
173
 
213
- this->op_pending--; // atomic dec
214
- }
215
- }
174
+ void flush_pending(bool all = false);
175
+ void flush_batch();
176
+ };
216
177
 
217
178
  // ** backend buffers
218
179
 
@@ -226,82 +187,94 @@ struct ggml_backend_hexagon_buffer_type_context {
226
187
  std::string name;
227
188
  };
228
189
 
229
- struct ggml_backend_hexagon_buffer_context {
230
- bool mmap_to(ggml_hexagon_session * s) {
231
- HEX_VERBOSE("ggml-hex: %s mmaping buffer: base %p domain-id %d session-id %d size %zu fd %d repack %d\n",
232
- s->name.c_str(), (void *) this->base, s->domain_id, s->session_id, this->size, this->fd,
233
- (int) this->repack);
190
+ struct ggml_hexagon_shared_buffer {
191
+ ggml_hexagon_session * sess;
192
+ uint8_t * base;
193
+ size_t size;
194
+ int fd;
195
+ bool mapped;
196
+ bool pinned;
197
+
198
+ void mmap() {
199
+ fastrpc_map_flags flags = this->pinned ? FASTRPC_MAP_FD : FASTRPC_MAP_FD_DELAYED;
234
200
 
235
- int err = fastrpc_mmap(s->domain_id, this->fd, (void *) this->base, 0, this->size, FASTRPC_MAP_FD);
201
+ int err = fastrpc_mmap(sess->domain_id, this->fd, (void *) this->base, 0, this->size, flags);
236
202
  if (err != 0) {
237
- GGML_LOG_ERROR("ggml-hex: buffer mapping failed : domain_id %d size %zu fd %d error 0x%08x\n",
238
- s->domain_id, this->size, this->fd, (unsigned) err);
239
- return false;
203
+ GGML_LOG_ERROR("ggml-hex: %s buffer mapping failed : domain_id %d size %zu fd %d error 0x%08x\n", sess->c_name(),
204
+ sess->domain_id, this->size, this->fd, (unsigned) err);
205
+ throw std::runtime_error("ggml-hex: fastrpc_mmap failed (see log for details)");
240
206
  }
241
207
 
242
- return true;
243
- }
208
+ HEX_VERBOSE("ggml-hex: %s mapped buffer: base %p size %zu fd %d pinned %u\n",
209
+ sess->c_name(), (void *) this->base, this->size, this->fd, pinned);
244
210
 
245
- bool mmap() {
246
- if (this->mapped) {
247
- return true;
248
- }
249
- if (!mmap_to(this->sess)) {
250
- return false;
251
- }
252
211
  this->mapped = true;
253
- return true;
254
212
  }
255
213
 
256
- void munmap() {
257
- if (!this->mapped) {
258
- return;
214
+ void unmap() {
215
+ if (!this->mapped) return;
216
+
217
+ if (!this->pinned) {
218
+ // HTP might still hold a reference, tell it drop it
219
+ htp_iface_munmap(sess->handle, this->fd);
259
220
  }
260
221
 
261
- fastrpc_munmap(this->sess->domain_id, this->fd, this->base, this->size);
222
+ fastrpc_munmap(sess->domain_id, this->fd, (void *) this->base, this->size);
223
+
224
+ HEX_VERBOSE("ggml-hex: %s unmapped buffer: base %p size %zu fd %d\n", sess->c_name(),
225
+ (void *) this->base, size, this->fd);
226
+
262
227
  this->mapped = false;
228
+ this->fd = -1;
263
229
  }
264
230
 
265
- ggml_backend_hexagon_buffer_context(ggml_hexagon_session * sess, size_t size, bool repack) {
266
- size += 4 * 1024; // extra page for padding
231
+ void alloc(size_t size) {
232
+ if (this->base) return;
267
233
 
268
- this->base = (uint8_t *) rpcmem_alloc2(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS | RPCMEM_HEAP_NOREG, size);
234
+ this->base = (uint8_t *) rpcmem_alloc2(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS, size);
269
235
  if (!this->base) {
270
- GGML_LOG_ERROR("ggml-hex: %s failed to allocate buffer : size %zu\n", sess->name.c_str(), size);
236
+ GGML_LOG_ERROR("ggml-hex: %s failed to allocate buffer : size %zu\n", sess->c_name(), size);
271
237
  throw std::runtime_error("ggml-hex: rpcmem_alloc failed (see log for details)");
272
238
  }
273
239
 
274
240
  this->fd = rpcmem_to_fd(this->base);
275
241
  if (this->fd < 0) {
276
- GGML_LOG_ERROR("ggml-hex: %s failed to get FD for buffer %p\n", sess->name.c_str(), (void *) this->base);
277
- rpcmem_free(this->base);
278
- this->base = NULL;
242
+ GGML_LOG_ERROR("ggml-hex: %s failed to get FD for buffer %p\n", sess->c_name(), (void *) this->base);
279
243
  throw std::runtime_error("ggml-hex: rpcmem_to_fd failed (see log for details)");
280
244
  }
245
+ this->size = size;
281
246
 
282
- HEX_VERBOSE("ggml-hex: %s allocated buffer: base %p size %zu fd %d repack %d\n", sess->name.c_str(),
283
- (void *) this->base, size, this->fd, (int) repack);
247
+ HEX_VERBOSE("ggml-hex: %s allocated buffer: base %p size %zu fd %d pinned %d\n", sess->c_name(),
248
+ (void *) this->base, this->size, this->fd, (int) pinned);
249
+ mmap();
250
+ }
251
+
252
+ void free() {
253
+ if (!this->base) return;
254
+
255
+ unmap();
256
+ rpcmem_free(this->base);
257
+
258
+ HEX_VERBOSE("ggml-hex: %s freed buffer: base %p size %zu fd %d\n", sess->c_name(),
259
+ (void *) this->base, size, this->fd);
284
260
 
261
+ this->base = NULL;
262
+ }
263
+
264
+ ggml_hexagon_shared_buffer(ggml_hexagon_session * sess, size_t size, bool pinned = false) {
285
265
  this->sess = sess;
286
- this->size = size;
266
+ this->size = 0;
267
+ this->base = nullptr;
268
+ this->fd = -1;
287
269
  this->mapped = false;
288
- this->repack = repack;
289
- }
270
+ this->pinned = pinned;
290
271
 
291
- ~ggml_backend_hexagon_buffer_context() {
292
- munmap();
293
- if (this->base) {
294
- rpcmem_free(this->base);
295
- this->base = NULL;
296
- }
272
+ alloc(size);
297
273
  }
298
274
 
299
- ggml_hexagon_session * sess; // primary session
300
- uint8_t * base;
301
- size_t size;
302
- int fd;
303
- bool mapped; // mmap is done
304
- bool repack; // repacked buffer
275
+ ~ggml_hexagon_shared_buffer() {
276
+ free();
277
+ }
305
278
  };
306
279
 
307
280
  static ggml_hexagon_session * ggml_backend_hexagon_buffer_get_sess(ggml_backend_buffer_t buffer) {
@@ -309,30 +282,26 @@ static ggml_hexagon_session * ggml_backend_hexagon_buffer_get_sess(ggml_backend_
309
282
  }
310
283
 
311
284
  static void ggml_backend_hexagon_buffer_free_buffer(ggml_backend_buffer_t buffer) {
312
- auto ctx = static_cast<ggml_backend_hexagon_buffer_context *>(buffer->context);
313
- delete ctx;
285
+ auto sbuf = static_cast<ggml_hexagon_shared_buffer *>(buffer->context);
286
+ delete sbuf;
314
287
  }
315
288
 
316
289
  static void * ggml_backend_hexagon_buffer_get_base(ggml_backend_buffer_t buffer) {
317
- auto ctx = static_cast<ggml_backend_hexagon_buffer_context *>(buffer->context);
318
- return ctx->base;
290
+ auto sbuf = static_cast<ggml_hexagon_shared_buffer *>(buffer->context);
291
+ return sbuf->base;
319
292
  }
320
293
 
321
294
  static enum ggml_status ggml_backend_hexagon_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
322
- auto ctx = static_cast<ggml_backend_hexagon_buffer_context *>(buffer->context);
323
- auto sess = ctx->sess;
295
+ auto sbuf = static_cast<ggml_hexagon_shared_buffer *>(buffer->context);
296
+ auto sess = sbuf->sess;
324
297
 
325
- HEX_VERBOSE("ggml-hex: %s init-tensor %s : base %p data %p nbytes %zu usage %d repack %d\n", sess->name.c_str(),
326
- tensor->name, (void *) ctx->base, tensor->data, ggml_nbytes(tensor), (int) buffer->usage,
327
- (int) ctx->repack);
298
+ HEX_VERBOSE("ggml-hex: %s init-tensor %s : base %p data %p nbytes %zu usage %d\n", sess->c_name(),
299
+ tensor->name, (void *) sbuf->base, tensor->data, ggml_nbytes(tensor), (int) buffer->usage);
328
300
 
329
301
  if (tensor->view_src != NULL && tensor->view_offs == 0) {
330
- ; // nothing to do for the view
331
- } else {
332
- if (!ctx->mapped) {
333
- ctx->mmap();
334
- }
302
+ return GGML_STATUS_SUCCESS; // nothing to do for the view
335
303
  }
304
+
336
305
  return GGML_STATUS_SUCCESS;
337
306
  }
338
307
 
@@ -460,7 +429,7 @@ static void repack_row_q4x4x2(uint8_t * y, const block_q4_0 * x, int64_t k) {
460
429
  d[7] = x[i * 8 + 7].d;
461
430
  }
462
431
 
463
- if (opt_verbose > 1) {
432
+ if (opt_verbose > 2) {
464
433
  for (int i = 0; i < nb; i++) {
465
434
  dump_packed_block_q4x4x2(y, i, k);
466
435
  }
@@ -479,7 +448,7 @@ static void unpack_row_q4x4x2(block_q4_0 * x, const uint8_t * y, int64_t k) {
479
448
  const uint8_t * y_q = y + 0; // quants first
480
449
  const uint8_t * y_d = y + qrow_size; // then scales
481
450
 
482
- if (opt_verbose > 1) {
451
+ if (opt_verbose > 2) {
483
452
  for (int i = 0; i < nb; i++) {
484
453
  dump_packed_block_q4x4x2(y, i, k);
485
454
  }
@@ -583,7 +552,7 @@ static void repack_q4_0_q4x4x2(ggml_tensor * t, const void * data, size_t size)
583
552
 
584
553
  size_t row_size = ggml_row_size(t->type, t->ne[0]);
585
554
  size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_Q4_0x4x2)); // extra elements for the pad
586
- size_t row_size_rp = row_size * 2; // extra space for tmp pad (if any)
555
+ size_t row_size_rp = row_size_pd; // scratch must hold one full padded tile (qblk_size/2 quants + scales)
587
556
 
588
557
  // Ensure we don't try to read more data than is available in the source buffer 'data'
589
558
  // or write more than the tensor can hold.
@@ -644,7 +613,7 @@ static void repack_q4x4x2_q4_0(void * data, const ggml_tensor * t, size_t size)
644
613
 
645
614
  size_t row_size = ggml_row_size(t->type, t->ne[0]);
646
615
  size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_Q4_0x4x2)); // extra elements for the pad
647
- size_t row_size_rp = row_size * 2; // extra space for tmp pad (if any)
616
+ size_t row_size_rp = row_size_pd; // scratch must hold one full padded tile (qblk_size/2 quants + scales)
648
617
 
649
618
  // Ensure we don't try to copy more data than the tensor actually contains.
650
619
  const size_t total_tensor_size = (size_t)nrows * row_size;
@@ -693,6 +662,239 @@ static void repack_q4x4x2_q4_0(void * data, const ggml_tensor * t, size_t size)
693
662
  ggml_aligned_free(buf_rp, row_size_rp);
694
663
  }
695
664
 
665
+ static void unpack_q4_1_quants(uint8_t * qs, const block_q4_1 * x, unsigned int bi) {
666
+ static const int qk = QK4_1;
667
+
668
+ for (unsigned int i = 0; i < qk / 2; ++i) {
669
+ const int x0 = (x->qs[i] & 0x0F);
670
+ const int x1 = (x->qs[i] >> 4);
671
+ qs[bi * qk + i + 0] = x0;
672
+ qs[bi * qk + i + qk / 2] = x1;
673
+ }
674
+ }
675
+
676
+ static void pack_q4_1_quants(block_q4_1 * x, const uint8_t * qs, unsigned int bi) {
677
+ static const int qk = QK4_1;
678
+
679
+ for (unsigned int i = 0; i < qk / 2; ++i) {
680
+ const uint8_t x0 = qs[bi * qk + i + 0];
681
+ const uint8_t x1 = qs[bi * qk + i + qk / 2];
682
+ x->qs[i] = x0 | (x1 << 4);
683
+ }
684
+ }
685
+
686
+ static void repack_row_q4_1x4x2(uint8_t * y, const block_q4_1 * x, int64_t k) {
687
+ static const int qk = QK_Q4_0x4x2;
688
+ const int nb = (k + qk - 1) / qk; // number of blocks (padded)
689
+ const int nloe = k % qk; // leftovers
690
+
691
+ const int dblk_size = 8 * 4; // 8x (d, m) __fp16 = 32 bytes
692
+ const int qblk_size = qk / 2; // int4 = 128 bytes
693
+ const int qrow_size = k / 2; // int4 (not padded to blocks)
694
+
695
+ uint8_t * y_q = y + 0; // quants first
696
+ uint8_t * y_d = y + qrow_size; // then scales/offsets
697
+
698
+ // Repack the quants
699
+ for (int i = 0; i < nb; i++) {
700
+ uint8_t qs[QK_Q4_0x4x2]; // unpacked quants
701
+ unpack_q4_1_quants(qs, &x[i * 8 + 0], 0);
702
+ unpack_q4_1_quants(qs, &x[i * 8 + 1], 1);
703
+ unpack_q4_1_quants(qs, &x[i * 8 + 2], 2);
704
+ unpack_q4_1_quants(qs, &x[i * 8 + 3], 3);
705
+ unpack_q4_1_quants(qs, &x[i * 8 + 4], 4);
706
+ unpack_q4_1_quants(qs, &x[i * 8 + 5], 5);
707
+ unpack_q4_1_quants(qs, &x[i * 8 + 6], 6);
708
+ unpack_q4_1_quants(qs, &x[i * 8 + 7], 7);
709
+
710
+ bool partial = (nloe && i == nb-1);
711
+
712
+ uint8_t * q = y_q + (i * qblk_size);
713
+ for (int j = 0; j < qk / 2; j++) {
714
+ q[j] = partial ? (qs[j*2+1] << 4) | qs[j*2+0] : (qs[j+128] << 4) | qs[j+000];
715
+ }
716
+ }
717
+
718
+ // Repack the scales and offsets
719
+ for (int i = 0; i < nb; i++) {
720
+ ggml_half * d_m = (ggml_half *) (y_d + i * dblk_size);
721
+ for (int j = 0; j < 8; j++) {
722
+ d_m[j * 2 + 0] = x[i * 8 + j].d;
723
+ d_m[j * 2 + 1] = x[i * 8 + j].m;
724
+ }
725
+ }
726
+ }
727
+
728
+ static void unpack_row_q4_1x4x2(block_q4_1 * x, const uint8_t * y, int64_t k) {
729
+ static const int qk = QK_Q4_0x4x2;
730
+ const int nb = (k + qk - 1) / qk; // number of blocks (padded)
731
+ const int nloe = k % qk; // leftovers
732
+
733
+ const int dblk_size = 8 * 4; // 8x (d, m) __fp16 = 32 bytes
734
+ const int qblk_size = qk / 2; // int4 = 128 bytes
735
+ const int qrow_size = k / 2; // int4 (not padded to blocks)
736
+
737
+ const uint8_t * y_q = y + 0; // quants first
738
+ const uint8_t * y_d = y + qrow_size; // then scales/offsets
739
+
740
+ // Unpack the quants
741
+ for (int i = 0; i < nb; i++) {
742
+ uint8_t qs[QK_Q4_0x4x2];
743
+ bool partial = (nloe && i == nb-1);
744
+
745
+ const uint8_t * q = y_q + (i * qblk_size);
746
+ for (int j = 0; j < qk / 2; j++) {
747
+ if (partial) {
748
+ qs[j*2+0] = q[j] & 0x0F;
749
+ qs[j*2+1] = q[j] >> 4;
750
+ } else {
751
+ qs[j+000] = q[j] & 0x0F;
752
+ qs[j+128] = q[j] >> 4;
753
+ }
754
+ }
755
+
756
+ pack_q4_1_quants(&x[i * 8 + 0], qs, 0);
757
+ pack_q4_1_quants(&x[i * 8 + 1], qs, 1);
758
+ pack_q4_1_quants(&x[i * 8 + 2], qs, 2);
759
+ pack_q4_1_quants(&x[i * 8 + 3], qs, 3);
760
+ pack_q4_1_quants(&x[i * 8 + 4], qs, 4);
761
+ pack_q4_1_quants(&x[i * 8 + 5], qs, 5);
762
+ pack_q4_1_quants(&x[i * 8 + 6], qs, 6);
763
+ pack_q4_1_quants(&x[i * 8 + 7], qs, 7);
764
+ }
765
+
766
+ // Unpack the scales and offsets
767
+ for (int i = 0; i < nb; i++) {
768
+ const ggml_half * d_m = (const ggml_half *) (y_d + i * dblk_size);
769
+ for (int j = 0; j < 8; j++) {
770
+ x[i * 8 + j].d = d_m[j * 2 + 0];
771
+ x[i * 8 + j].m = d_m[j * 2 + 1];
772
+ }
773
+ }
774
+ }
775
+
776
+ static void init_row_q4_1x4x2(block_q4_1 * x, int64_t k) {
777
+ static const int qk = QK_Q4_0x4x2;
778
+ const int nb = (k + qk - 1) / qk; // number of blocks (padded)
779
+
780
+ uint8_t qs[QK_Q4_0x4x2]; // unpacked quants
781
+ memset(qs, 0, sizeof(qs));
782
+
783
+ for (int i = 0; i < nb; i++) {
784
+ pack_q4_1_quants(&x[i * 8 + 0], qs, 0);
785
+ pack_q4_1_quants(&x[i * 8 + 1], qs, 1);
786
+ pack_q4_1_quants(&x[i * 8 + 2], qs, 2);
787
+ pack_q4_1_quants(&x[i * 8 + 3], qs, 3);
788
+ pack_q4_1_quants(&x[i * 8 + 4], qs, 4);
789
+ pack_q4_1_quants(&x[i * 8 + 5], qs, 5);
790
+ pack_q4_1_quants(&x[i * 8 + 6], qs, 6);
791
+ pack_q4_1_quants(&x[i * 8 + 7], qs, 7);
792
+ }
793
+
794
+ for (int i = 0; i < nb; i++) {
795
+ for (int j = 0; j < 8; j++) {
796
+ x[i * 8 + j].d = 0;
797
+ x[i * 8 + j].m = 0;
798
+ }
799
+ }
800
+ }
801
+
802
+ static void repack_q4_1_q4x4x2(ggml_tensor * t, const void * data, size_t size) {
803
+ int64_t nrows = ggml_nrows(t);
804
+
805
+ size_t row_size = ggml_row_size(t->type, t->ne[0]);
806
+ size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_Q4_0x4x2));
807
+ size_t row_size_rp = row_size_pd; // scratch must hold one full padded tile (qblk_size/2 quants + scales)
808
+
809
+ const size_t total_tensor_size = (size_t)nrows * row_size;
810
+ const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size;
811
+
812
+ const int64_t n_full_rows = n_bytes_to_copy / row_size;
813
+ const size_t n_rem_bytes = n_bytes_to_copy % row_size;
814
+
815
+ void * buf_pd = ggml_aligned_malloc(row_size_pd);
816
+ GGML_ASSERT(buf_pd != NULL);
817
+
818
+ void * buf_rp = ggml_aligned_malloc(row_size_rp);
819
+ GGML_ASSERT(buf_rp != NULL);
820
+
821
+ HEX_VERBOSE("ggml-hex: repack-q4_1-q4x4x2 %s : data %p size %zu dims %ldx%ld row-size %zu\n", t->name, data, size,
822
+ t->ne[0], nrows, row_size);
823
+
824
+ init_row_q4_1x4x2((block_q4_1 *) buf_pd, t->ne[0]);
825
+
826
+ for (int64_t i = 0; i < n_full_rows; i++) {
827
+ const uint8_t * src = (const uint8_t *) data + (i * row_size);
828
+ uint8_t * dst = (uint8_t *) t->data + (i * row_size);
829
+
830
+ memcpy(buf_pd, src, row_size);
831
+ repack_row_q4_1x4x2((uint8_t *) buf_rp, (const block_q4_1 *) buf_pd, t->ne[0]);
832
+ memcpy(dst, buf_rp, row_size);
833
+ }
834
+
835
+ if (n_rem_bytes > 0) {
836
+ const int64_t i = n_full_rows;
837
+ const uint8_t * src = (const uint8_t *) data + (i * row_size);
838
+ uint8_t * dst = (uint8_t *) t->data + (i * row_size);
839
+
840
+ init_row_q4_1x4x2((block_q4_1 *) buf_pd, t->ne[0]);
841
+ memcpy(buf_pd, src, n_rem_bytes);
842
+ repack_row_q4_1x4x2((uint8_t *) buf_rp, (const block_q4_1 *) buf_pd, t->ne[0]);
843
+ memcpy(dst, buf_rp, n_rem_bytes);
844
+ }
845
+
846
+ ggml_aligned_free(buf_pd, row_size_pd);
847
+ ggml_aligned_free(buf_rp, row_size_rp);
848
+ }
849
+
850
+ static void repack_q4x4x2_q4_1(void * data, const ggml_tensor * t, size_t size) {
851
+ int64_t nrows = ggml_nrows(t);
852
+
853
+ size_t row_size = ggml_row_size(t->type, t->ne[0]);
854
+ size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_Q4_0x4x2));
855
+ size_t row_size_rp = row_size_pd; // scratch must hold one full padded tile (qblk_size/2 quants + scales)
856
+
857
+ const size_t total_tensor_size = (size_t)nrows * row_size;
858
+ const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size;
859
+
860
+ const int64_t n_full_rows = n_bytes_to_copy / row_size;
861
+ const size_t n_rem_bytes = n_bytes_to_copy % row_size;
862
+
863
+ void * buf_pd = ggml_aligned_malloc(row_size_pd);
864
+ GGML_ASSERT(buf_pd != NULL);
865
+
866
+ void * buf_rp = ggml_aligned_malloc(row_size_rp);
867
+ GGML_ASSERT(buf_rp != NULL);
868
+
869
+ HEX_VERBOSE("ggml-hex: repack-q4x4x2-q4_1 %s : data %p size %zu dims %ldx%ld row-size %zu\n", t->name, data, size,
870
+ t->ne[0], nrows, row_size);
871
+
872
+ memset(buf_rp, 0, row_size_rp); // clear-out padded buffer to make sure the tail is all zeros
873
+
874
+ for (int64_t i = 0; i < n_full_rows; i++) {
875
+ const uint8_t * src = (const uint8_t *) t->data + (i * row_size);
876
+ uint8_t * dst = (uint8_t *) data + (i * row_size);
877
+
878
+ memcpy(buf_rp, src, row_size);
879
+ unpack_row_q4_1x4x2((block_q4_1 *) buf_pd, (const uint8_t *) buf_rp, t->ne[0]);
880
+ memcpy(dst, buf_pd, row_size);
881
+ }
882
+
883
+ if (n_rem_bytes > 0) {
884
+ const int64_t i = n_full_rows;
885
+ const uint8_t * src = (const uint8_t *) t->data + (i * row_size);
886
+ uint8_t * dst = (uint8_t *) data + (i * row_size);
887
+
888
+ // We still need to read and unpack the entire source row because quantization is block-based.
889
+ memcpy(buf_rp, src, row_size);
890
+ unpack_row_q4_1x4x2((block_q4_1 *) buf_pd, (const uint8_t *) buf_rp, t->ne[0]);
891
+ memcpy(dst, buf_pd, n_rem_bytes);
892
+ }
893
+
894
+ ggml_aligned_free(buf_pd, row_size_pd);
895
+ ggml_aligned_free(buf_rp, row_size_rp);
896
+ }
897
+
696
898
  // ======== Q8x4x2 ====================
697
899
  static void dump_block_q8_0(const block_q8_0 * b, int i) {
698
900
  HEX_VERBOSE("ggml-hex: repack q8_0 %d: %d %d %d %d ... %d %d %d %d : %.6f\n", i, b->qs[0], b->qs[1], b->qs[2],
@@ -795,7 +997,7 @@ static void repack_row_q8x4x2(uint8_t * y, const block_q8_0 * x, int64_t k) {
795
997
  d[7] = x[i * 8 + 7].d;
796
998
  }
797
999
 
798
- if (opt_verbose > 1) {
1000
+ if (opt_verbose > 2) {
799
1001
  for (int i = 0; i < nb; i++) {
800
1002
  dump_packed_block_q8x4x2(y, i, k);
801
1003
  }
@@ -813,7 +1015,7 @@ static void unpack_row_q8x4x2(block_q8_0 * x, const uint8_t * y, int64_t k) {
813
1015
  const uint8_t * y_q = y + 0; // quants first
814
1016
  const uint8_t * y_d = y + qrow_size; // then scales
815
1017
 
816
- if (opt_verbose > 1) {
1018
+ if (opt_verbose > 2) {
817
1019
  for (int i = 0; i < nb; i++) {
818
1020
  dump_packed_block_q8x4x2(y, i, k);
819
1021
  }
@@ -909,7 +1111,7 @@ static void repack_q8_0_q8x4x2(ggml_tensor * t, const void * data, size_t size)
909
1111
 
910
1112
  size_t row_size = ggml_row_size(t->type, t->ne[0]);
911
1113
  size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_Q8_0x4x2)); // extra elements for the pad
912
- size_t row_size_rp = row_size * 2; // extra space for tmp pad (if any)
1114
+ size_t row_size_rp = row_size_pd; // scratch must hold one full padded tile (qblk_size quants + scales)
913
1115
 
914
1116
  // Ensure we don't try to read more data than is available in the source buffer 'data'
915
1117
  // or write more than the tensor can hold.
@@ -970,7 +1172,7 @@ static void repack_q8x4x2_q8_0(void * data, const ggml_tensor * t, size_t size)
970
1172
 
971
1173
  size_t row_size = ggml_row_size(t->type, t->ne[0]);
972
1174
  size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_Q8_0x4x2)); // extra elements for the pad
973
- size_t row_size_rp = row_size * 2; // extra space for tmp pad (if any)
1175
+ size_t row_size_rp = row_size_pd; // scratch must hold one full padded tile (qblk_size quants + scales)
974
1176
 
975
1177
  // Ensure we don't try to copy more data than the tensor actually contains.
976
1178
  const size_t total_tensor_size = (size_t)nrows * row_size;
@@ -1148,7 +1350,7 @@ static void repack_row_mxfp4x4x2(uint8_t * y, const block_mxfp4 * x, int64_t k)
1148
1350
  e[7] = x[i * 8 + 7].e;
1149
1351
  }
1150
1352
 
1151
- if (opt_verbose > 1) {
1353
+ if (opt_verbose > 2) {
1152
1354
  for (int i = 0; i < nb; i++) {
1153
1355
  dump_packed_block_mxfp4x4x2(y, i, k);
1154
1356
  }
@@ -1167,7 +1369,7 @@ static void unpack_row_mxfp4x4x2(block_mxfp4 * x, const uint8_t * y, int64_t k)
1167
1369
  const uint8_t * y_q = y + 0; // quants first
1168
1370
  const uint8_t * y_e = y + qrow_size; // then scales
1169
1371
 
1170
- if (opt_verbose > 1) {
1372
+ if (opt_verbose > 2) {
1171
1373
  for (int i = 0; i < nb; i++) {
1172
1374
  dump_packed_block_mxfp4x4x2(y, i, k);
1173
1375
  }
@@ -1271,7 +1473,7 @@ static void repack_mxfp4_mxfp4x4x2(ggml_tensor * t, const void * data, size_t si
1271
1473
 
1272
1474
  size_t row_size = ggml_row_size(t->type, t->ne[0]);
1273
1475
  size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_MXFP4x4x2)); // extra elements for the pad
1274
- size_t row_size_rp = row_size * 2; // extra space for tmp pad (if any)
1476
+ size_t row_size_rp = row_size_pd; // scratch must hold one full padded tile (qblk_size/2 quants + scales)
1275
1477
 
1276
1478
  // Ensure we don't try to read more data than is available in the source buffer 'data'
1277
1479
  // or write more than the tensor can hold.
@@ -1332,7 +1534,7 @@ static void repack_mxfp4x4x2_mxfp4(void * data, const ggml_tensor * t, size_t si
1332
1534
 
1333
1535
  size_t row_size = ggml_row_size(t->type, t->ne[0]);
1334
1536
  size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_MXFP4x4x2)); // extra elements for the pad
1335
- size_t row_size_rp = row_size * 2; // extra space for tmp pad (if any)
1537
+ size_t row_size_rp = row_size_pd; // scratch must hold one full padded tile (qblk_size/2 quants + scales)
1336
1538
 
1337
1539
  // Ensure we don't try to copy more data than the tensor actually contains.
1338
1540
  const size_t total_tensor_size = (size_t)nrows * row_size;
@@ -1386,11 +1588,10 @@ static void ggml_backend_hexagon_buffer_set_tensor(ggml_backend_buffer_t buffer,
1386
1588
  const void * data,
1387
1589
  size_t offset,
1388
1590
  size_t size) {
1389
- auto ctx = (ggml_backend_hexagon_buffer_context *) buffer->context;
1390
- auto sess = ctx->sess;
1591
+ auto sbuf = (ggml_hexagon_shared_buffer *) buffer->context;
1592
+ auto sess = sbuf->sess;
1391
1593
 
1392
- HEX_VERBOSE("ggml-hex: %s set-tensor %s : data %p offset %zu size %zu\n", sess->name.c_str(), tensor->name, data,
1393
- offset, size);
1594
+ HEX_VERBOSE("ggml-hex: %s set-tensor %s : data %p offset %zu size %zu\n", sess->c_name(), tensor->name, data, offset, size);
1394
1595
 
1395
1596
  switch (tensor->type) {
1396
1597
  case GGML_TYPE_Q4_0:
@@ -1399,10 +1600,23 @@ static void ggml_backend_hexagon_buffer_set_tensor(ggml_backend_buffer_t buffer,
1399
1600
  repack_q4_0_q4x4x2(tensor, data, size);
1400
1601
  break;
1401
1602
 
1402
- case GGML_TYPE_Q8_0:
1603
+ case GGML_TYPE_Q4_1:
1403
1604
  GGML_ASSERT(offset == 0);
1404
1605
  GGML_ASSERT(offset + size <= ggml_nbytes(tensor));
1405
- repack_q8_0_q8x4x2(tensor, data, size);
1606
+ repack_q4_1_q4x4x2(tensor, data, size);
1607
+ break;
1608
+
1609
+ case GGML_TYPE_Q8_0:
1610
+ GGML_ASSERT(offset == 0);
1611
+ GGML_ASSERT(offset + size <= ggml_nbytes(tensor));
1612
+ repack_q8_0_q8x4x2(tensor, data, size);
1613
+ break;
1614
+
1615
+ case GGML_TYPE_IQ4_NL:
1616
+ GGML_ASSERT(offset == 0);
1617
+ GGML_ASSERT(offset + size <= ggml_nbytes(tensor));
1618
+ // IQ4_NL has identical block layout to Q4_0 (ggml_half d + uint8_t qs[16])
1619
+ repack_q4_0_q4x4x2(tensor, data, size);
1406
1620
  break;
1407
1621
 
1408
1622
  case GGML_TYPE_MXFP4:
@@ -1422,11 +1636,10 @@ static void ggml_backend_hexagon_buffer_get_tensor(ggml_backend_buffer_t buffer,
1422
1636
  void * data,
1423
1637
  size_t offset,
1424
1638
  size_t size) {
1425
- auto ctx = (ggml_backend_hexagon_buffer_context *) buffer->context;
1426
- auto sess = ctx->sess;
1639
+ auto sbuf = (ggml_hexagon_shared_buffer *) buffer->context;
1640
+ auto sess = sbuf->sess;
1427
1641
 
1428
- HEX_VERBOSE("ggml-hex: %s get-tensor %s : data %p offset %zu size %zu\n", sess->name.c_str(), tensor->name, data,
1429
- offset, size);
1642
+ HEX_VERBOSE("ggml-hex: %s get-tensor %s : data %p offset %zu size %zu\n", sess->c_name(), tensor->name, data, offset, size);
1430
1643
 
1431
1644
  switch (tensor->type) {
1432
1645
  case GGML_TYPE_Q4_0:
@@ -1435,12 +1648,24 @@ static void ggml_backend_hexagon_buffer_get_tensor(ggml_backend_buffer_t buffer,
1435
1648
  repack_q4x4x2_q4_0(data, tensor, size);
1436
1649
  break;
1437
1650
 
1651
+ case GGML_TYPE_Q4_1:
1652
+ GGML_ASSERT(offset == 0);
1653
+ GGML_ASSERT(offset + size <= ggml_nbytes(tensor));
1654
+ repack_q4x4x2_q4_1(data, tensor, size);
1655
+ break;
1656
+
1438
1657
  case GGML_TYPE_Q8_0:
1439
1658
  GGML_ASSERT(offset == 0);
1440
1659
  GGML_ASSERT(offset + size <= ggml_nbytes(tensor));
1441
1660
  repack_q8x4x2_q8_0(data, tensor, size);
1442
1661
  break;
1443
1662
 
1663
+ case GGML_TYPE_IQ4_NL:
1664
+ GGML_ASSERT(offset == 0);
1665
+ GGML_ASSERT(offset + size <= ggml_nbytes(tensor));
1666
+ repack_q4x4x2_q4_0(data, tensor, size);
1667
+ break;
1668
+
1444
1669
  case GGML_TYPE_MXFP4:
1445
1670
  GGML_ASSERT(offset == 0);
1446
1671
  GGML_ASSERT(offset + size <= ggml_nbytes(tensor));
@@ -1464,10 +1689,10 @@ static bool ggml_backend_hexagon_buffer_cpy_tensor(ggml_backend_buffer_t bu
1464
1689
  }
1465
1690
 
1466
1691
  static void ggml_backend_hexagon_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
1467
- auto ctx = (ggml_backend_hexagon_buffer_context *) buffer->context;
1468
- auto sess = ctx->sess;
1469
- HEX_VERBOSE("ggml-hex: %s clear-buff base %p size %zu\n", sess->name.c_str(), (void *) ctx->base, ctx->size);
1470
- memset(ctx->base, value, ctx->size);
1692
+ auto sbuf = (ggml_hexagon_shared_buffer *) buffer->context;
1693
+ auto sess = sbuf->sess;
1694
+ HEX_VERBOSE("ggml-hex: %s clear-buff base %p size %zu\n", sess->c_name(), (void *) sbuf->base, sbuf->size);
1695
+ memset(sbuf->base, value, sbuf->size);
1471
1696
  }
1472
1697
 
1473
1698
  static ggml_backend_buffer_i ggml_backend_hexagon_buffer_interface = {
@@ -1477,6 +1702,8 @@ static ggml_backend_buffer_i ggml_backend_hexagon_buffer_interface = {
1477
1702
  /* .memset_tensor = */ NULL,
1478
1703
  /* .set_tensor = */ ggml_backend_hexagon_buffer_set_tensor,
1479
1704
  /* .get_tensor = */ ggml_backend_hexagon_buffer_get_tensor,
1705
+ /* .set_tensor_2d = */ NULL,
1706
+ /* .get_tensor_2d = */ NULL,
1480
1707
  /* .cpy_tensor = */ ggml_backend_hexagon_buffer_cpy_tensor,
1481
1708
  /* .clear = */ ggml_backend_hexagon_buffer_clear,
1482
1709
  /* .reset = */ NULL,
@@ -1492,10 +1719,11 @@ static ggml_backend_buffer_t ggml_backend_hexagon_buffer_type_alloc_buffer(
1492
1719
  ggml_backend_buffer_type_t buffer_type, size_t size) {
1493
1720
  auto sess = static_cast<ggml_backend_hexagon_buffer_type_context *>(buffer_type->context)->sess;
1494
1721
  try {
1495
- ggml_backend_hexagon_buffer_context * ctx = new ggml_backend_hexagon_buffer_context(sess, size, false /*repack*/);
1496
- return ggml_backend_buffer_init(buffer_type, ggml_backend_hexagon_buffer_interface, ctx, size);
1722
+ size += 4 * 1024; // guard page
1723
+ ggml_hexagon_shared_buffer * sbuf = new ggml_hexagon_shared_buffer(sess, size);
1724
+ return ggml_backend_buffer_init(buffer_type, ggml_backend_hexagon_buffer_interface, sbuf, size);
1497
1725
  } catch (const std::exception & exc) {
1498
- GGML_LOG_ERROR("ggml-hex: %s failed to allocate buffer context: %s\n", sess->name.c_str(), exc.what());
1726
+ GGML_LOG_ERROR("ggml-hex: %s failed to allocate buffer context (host): %s\n", sess->c_name(), exc.what());
1499
1727
  return nullptr;
1500
1728
  }
1501
1729
  }
@@ -1504,10 +1732,11 @@ static ggml_backend_buffer_t ggml_backend_hexagon_repack_buffer_type_alloc_buffe
1504
1732
  ggml_backend_buffer_type_t buffer_type, size_t size) {
1505
1733
  auto sess = static_cast<ggml_backend_hexagon_buffer_type_context *>(buffer_type->context)->sess;
1506
1734
  try {
1507
- ggml_backend_hexagon_buffer_context * ctx = new ggml_backend_hexagon_buffer_context(sess, size, true /*repack*/);
1508
- return ggml_backend_buffer_init(buffer_type, ggml_backend_hexagon_buffer_interface, ctx, size);
1735
+ size += 4 * 1024; // guard page
1736
+ ggml_hexagon_shared_buffer * sbuf = new ggml_hexagon_shared_buffer(sess, size);
1737
+ return ggml_backend_buffer_init(buffer_type, ggml_backend_hexagon_buffer_interface, sbuf, size);
1509
1738
  } catch (const std::exception & exc) {
1510
- GGML_LOG_ERROR("ggml-hex: %s failed to allocate buffer context: %s\n", sess->name.c_str(), exc.what());
1739
+ GGML_LOG_ERROR("ggml-hex: %s failed to allocate buffer context (repack): %s\n", sess->c_name(), exc.what());
1511
1740
  return nullptr;
1512
1741
  }
1513
1742
  }
@@ -1522,7 +1751,7 @@ static size_t ggml_backend_hexagon_buffer_type_get_alloc_size(ggml_backend_buffe
1522
1751
  }
1523
1752
 
1524
1753
  static size_t ggml_backend_hexagon_buffer_type_get_max_size(ggml_backend_buffer_type_t buffer_type) {
1525
- return 1 * 1024 * 1024 * 1024; // 1GB per buffer
1754
+ return opt_mbuf; // typically 1GB per buffer
1526
1755
  GGML_UNUSED(buffer_type);
1527
1756
  }
1528
1757
 
@@ -1554,6 +1783,448 @@ static ggml_backend_buffer_type_i ggml_backend_hexagon_repack_buffer_type_interf
1554
1783
  /* .is_host = */ ggml_backend_hexagon_repack_buffer_type_is_host,
1555
1784
  };
1556
1785
 
1786
+ struct ggml_hexagon_opbatch {
1787
+ ggml_hexagon_session* sess;
1788
+
1789
+ std::vector<htp_opnode> ops; // htp_opnode of ops
1790
+
1791
+ std::vector<htp_buf_desc> h_bufs; // htp buffer descriptors
1792
+ std::vector<htp_tensor> h_tens; // htp tensor descriptors
1793
+ std::vector<htp_op_desc> h_ops; // htp op descriptors
1794
+
1795
+ std::unordered_map<int, int> b_map; // buffer fd to index
1796
+ std::unordered_map<const ggml_tensor*, int> t_map; // tensor ptr to index
1797
+ std::unordered_multimap<void*, int> d_map; // tensor data to index
1798
+
1799
+ unsigned int n_bufs; // num buffers in the batch
1800
+ unsigned int n_tens; // num tensors ...
1801
+ unsigned int n_ops; // num ops ...
1802
+ size_t b_vmem; // sum of all buffer sizes
1803
+
1804
+ unsigned int n_bufs_max;
1805
+ unsigned int n_tens_max;
1806
+ unsigned int n_ops_max;
1807
+ size_t b_vmem_max;
1808
+
1809
+ void reset() {
1810
+ n_bufs = 0;
1811
+ n_tens = 0;
1812
+ n_ops = 0;
1813
+ b_vmem = 0;
1814
+
1815
+ b_map.clear();
1816
+ t_map.clear();
1817
+ d_map.clear();
1818
+ }
1819
+
1820
+ ggml_hexagon_opbatch(ggml_hexagon_session *sess, size_t batch_size, size_t max_vmem) {
1821
+ this->sess = sess;
1822
+
1823
+ n_bufs_max = HTP_OP_MAX_BUFS;
1824
+ n_ops_max = batch_size;
1825
+ n_tens_max = n_ops_max + n_ops_max * HTP_OP_MAX_INPUTS;
1826
+
1827
+ b_vmem_max = max_vmem;
1828
+
1829
+ ops.resize(n_ops_max);
1830
+
1831
+ h_bufs.resize(n_bufs_max);
1832
+ h_tens.resize(n_tens_max);
1833
+ h_ops.resize(n_ops_max);
1834
+
1835
+ b_map.reserve(n_bufs_max);
1836
+ t_map.reserve(n_tens_max);
1837
+ d_map.reserve(n_tens_max);
1838
+
1839
+ GGML_LOG_INFO("ggml-hex: %s op batching: n-bufs %u n-tensors %u n-ops %u vmem %zu\n",
1840
+ sess->c_name(), n_bufs_max, n_tens_max, n_ops_max, b_vmem_max);
1841
+
1842
+ reset();
1843
+ }
1844
+
1845
+ bool empty() const { return n_ops == 0; }
1846
+
1847
+ // add buffer and return its index
1848
+ int add_buffer(ggml_hexagon_shared_buffer * sbuf) {
1849
+ // Lookup by fd
1850
+ auto it = b_map.find(sbuf->fd);
1851
+ if (it != b_map.end()) { return it->second; }
1852
+
1853
+ // Add new buffer to the batch
1854
+ int bi = n_bufs++;
1855
+ GGML_ASSERT(n_bufs < HTP_OP_MAX_BUFS);
1856
+
1857
+ b_map.insert({sbuf->fd, bi});
1858
+
1859
+ htp_buf_desc &b = h_bufs[bi];
1860
+ b.base = (uint64_t) sbuf->base;
1861
+ b.fd = sbuf->fd;
1862
+ b.size = sbuf->size;
1863
+
1864
+ b_vmem += b.size;
1865
+
1866
+ HEX_VERBOSE("ggml-hex: add-buffer #%u : fd %d base %p size %zu : vmem %zu\n", bi, b.fd, (void*) sbuf->base, (size_t) b.size, b_vmem);
1867
+
1868
+ return bi;
1869
+ }
1870
+
1871
+ bool same_shape(const htp_tensor * h, const ggml_tensor * t) const {
1872
+ return (h->ne[0] == t->ne[0]) && (h->ne[1] == t->ne[1]) && (h->ne[2] == t->ne[2]) && (h->ne[3] == t->ne[3]) &&
1873
+ (h->nb[0] == t->nb[0]) && (h->nb[1] == t->nb[1]) && (h->nb[2] == t->nb[2]) && (h->nb[3] == t->nb[3]);
1874
+ }
1875
+
1876
+ // add tensor and return its index
1877
+ int add_tensor(const ggml_tensor * t) {
1878
+ auto sbuf = static_cast<ggml_hexagon_shared_buffer *>(t->buffer->context);
1879
+
1880
+ // First lookup by tensor data
1881
+ auto range = d_map.equal_range(t->data);
1882
+ for (auto it = range.first; it != range.second; ++it) {
1883
+ htp_tensor * h = &h_tens[it->second];
1884
+ if (same_shape(h, t)) { return it->second; }
1885
+ }
1886
+
1887
+ // Lookup by tensor ptr
1888
+ auto it = t_map.find(t);
1889
+ if (it != t_map.end()) { return it->second; }
1890
+
1891
+ // Add new tensor to the batch
1892
+ int ti = n_tens++;
1893
+ GGML_ASSERT(n_tens <= n_tens_max);
1894
+
1895
+ t_map.insert({t, ti});
1896
+ d_map.insert({t->data, ti});
1897
+
1898
+ uint64_t t_offset = (uint8_t *) t->data - sbuf->base;
1899
+ size_t t_size = ggml_nbytes(t);
1900
+
1901
+ htp_tensor &h = h_tens[ti];
1902
+ h.bi = add_buffer(sbuf);
1903
+ h.data = t_offset;
1904
+ h.size = t_size;
1905
+ h.type = t->type;
1906
+ h.ne[0] = t->ne[0]; h.ne[1] = t->ne[1]; h.ne[2] = t->ne[2]; h.ne[3] = t->ne[3];
1907
+ h.nb[0] = t->nb[0]; h.nb[1] = t->nb[1]; h.nb[2] = t->nb[2]; h.nb[3] = t->nb[3];
1908
+
1909
+ h.flags = 0;
1910
+ if (ggml_backend_buffer_get_usage(t->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE) {
1911
+ h.flags |= HTP_TENSOR_COMPUTE;
1912
+ }
1913
+
1914
+ HEX_VERBOSE("ggml-hex: add-tensor #%u %s : bi %d data %p offset %zu size %zu flags 0x%x : %zu:%zu:%zu:%zu\n",
1915
+ ti, t->name, h.bi, (void*) t->data, (size_t) t_offset, t_size, h.flags,
1916
+ (size_t) t->ne[0], (size_t) t->ne[1], (size_t) t->ne[2], (size_t) t->ne[3]);
1917
+
1918
+ return ti;
1919
+ }
1920
+
1921
+ bool fit_op(const htp_opnode & node) const {
1922
+ if (n_ops >= n_ops_max ) return false;
1923
+
1924
+ // check how much extras we will need
1925
+ size_t extra_bufs = 0;
1926
+ size_t extra_vmem = 0;
1927
+ size_t extra_tens = 0;
1928
+
1929
+ auto fit_tensor = [&](const ggml_tensor *t) {
1930
+ if (!t) return;
1931
+ if (!t_map.count(t)) {
1932
+ extra_tens++;
1933
+
1934
+ auto sbuf = static_cast<ggml_hexagon_shared_buffer *>(t->buffer->context);
1935
+ if (!b_map.count(sbuf->fd)) {
1936
+ extra_vmem += sbuf->size;
1937
+ extra_bufs += 1;
1938
+ }
1939
+ }
1940
+ };
1941
+
1942
+ for (const auto * src : node.get_inputs()) {
1943
+ fit_tensor(src);
1944
+ }
1945
+ fit_tensor(node.dst());
1946
+
1947
+ if ((extra_bufs + n_bufs) > n_bufs_max) return false;
1948
+ if ((extra_tens + n_tens) > n_tens_max) return false;
1949
+ if ((extra_vmem + b_vmem) > b_vmem_max) return false;
1950
+
1951
+ return true;
1952
+ }
1953
+
1954
+ // assumes that fit_op() was called first and returned true
1955
+ void add_op(const htp_opnode & node) {
1956
+ // Add new op
1957
+
1958
+ unsigned int n = n_ops++;
1959
+ GGML_ASSERT(n_ops <= n_ops_max);
1960
+
1961
+ ops[n] = node;
1962
+
1963
+ htp_op_desc &o = h_ops[n];
1964
+ memcpy(&o.params, &node.node->op_params, sizeof(node.node->op_params));
1965
+ o.opcode = node.opcode;
1966
+ o.flags = 0;
1967
+
1968
+ if (!(opt_opstage & HTP_OPSTAGE_COMPUTE)) {
1969
+ o.flags |= HTP_OPFLAGS_SKIP_COMPUTE;
1970
+ }
1971
+
1972
+ ggml_hexagon_dump_op_exec(sess->c_name(), node, o.flags);
1973
+
1974
+ auto inputs = node.get_inputs();
1975
+ for (unsigned int i=0; i < HTP_OP_MAX_INPUTS; i++) {
1976
+ o.src[i] = (i < inputs.size() && inputs[i]) ? add_tensor(inputs[i]) : 0xffff;
1977
+ }
1978
+ o.dst = add_tensor(node.dst());
1979
+ }
1980
+ };
1981
+
1982
+ struct ggml_hexagon_opqueue {
1983
+ // Shared buffer for storing batches
1984
+ ggml_hexagon_shared_buffer *shm_buf;
1985
+ size_t shm_blk_size;
1986
+
1987
+ using opvec = std::vector<htp_opnode>;
1988
+
1989
+ std::queue<unsigned int> done; // completed batch ids
1990
+ std::vector<opvec> op_cache; // per batch op cache
1991
+ std::vector<uint64_t> start_usec; // per batch start time
1992
+
1993
+ ggml_hexagon_opqueue(ggml_hexagon_session *sess, size_t batch_size, size_t depth) {
1994
+ size_t n_bufs = HTP_OP_MAX_BUFS;
1995
+ size_t n_ops = batch_size;
1996
+ size_t n_tensors = n_ops + n_ops * HTP_OP_MAX_INPUTS;
1997
+
1998
+ shm_blk_size = sizeof(htp_buf_desc) * n_bufs +
1999
+ sizeof(htp_tensor) * n_tensors +
2000
+ sizeof(htp_op_desc) * n_ops +
2001
+ sizeof(htp_prof_desc) * n_ops;
2002
+
2003
+ shm_buf = new ggml_hexagon_shared_buffer(sess, shm_blk_size * depth, true /* pinned */);
2004
+
2005
+ op_cache.resize(depth);
2006
+ start_usec.resize(depth, 0);
2007
+
2008
+ // init done queue
2009
+ for (unsigned int i = 0; i < depth; i++) { done.push(i); }
2010
+
2011
+ if (opt_verbose) {
2012
+ GGML_LOG_INFO("ggml-hex: %s allocated op-queue : batch-size %zu depth %zu shm-size %zu shm-block-size %zu\n",
2013
+ sess->c_name(), batch_size, depth, shm_buf->size, shm_blk_size);
2014
+ }
2015
+ }
2016
+
2017
+ ~ggml_hexagon_opqueue() {
2018
+ delete shm_buf;
2019
+ }
2020
+
2021
+ // push new batch
2022
+ bool push(htp_opbatch_req& req, dspqueue_buffer& dbuf, ggml_hexagon_opbatch* op_batch) {
2023
+ static_assert(sizeof(htp_opbatch_req) % 8 == 0, "sizeof(htp_opbatch_req) must be multiple of 8");
2024
+ static_assert(sizeof(htp_opbatch_rsp) % 8 == 0, "sizeof(htp_opbatch_rsp) must be multiple of 8");
2025
+ static_assert(sizeof(htp_buf_desc) % 8 == 0, "sizeof(htp_buf_desc) must be multiple of 8");
2026
+ static_assert(sizeof(htp_tensor) % 8 == 0, "sizeof(htp_tensor) must be multiple of 8");
2027
+ static_assert(sizeof(htp_op_desc) % 8 == 0, "sizeof(htp_op_desc) must be multiple of 8");
2028
+ static_assert(sizeof(htp_prof_desc) % 8 == 0, "sizeof(htp_prof_desc) must be multiple of 8");
2029
+
2030
+ if (done.empty()) { return false; }
2031
+
2032
+ req.id = done.front(); done.pop(); // batch id
2033
+ req.n_bufs = op_batch->n_bufs;
2034
+ req.n_tensors = op_batch->n_tens;
2035
+ req.n_ops = op_batch->n_ops;
2036
+
2037
+ op_cache[req.id] = op_batch->ops;
2038
+ start_usec[req.id] = ggml_time_us();
2039
+
2040
+ const size_t b_size = sizeof(htp_buf_desc) * req.n_bufs;
2041
+ const size_t t_size = sizeof(htp_tensor) * req.n_tensors;
2042
+ const size_t o_size = sizeof(htp_op_desc) * req.n_ops;
2043
+ const size_t p_size = sizeof(htp_prof_desc) * req.n_ops;
2044
+
2045
+ dbuf.ptr = shm_buf->base + (req.id * shm_blk_size);
2046
+ dbuf.fd = shm_buf->fd;
2047
+ dbuf.flags = DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT;
2048
+ dbuf.offset = (uint8_t*) dbuf.ptr - (uint8_t*) shm_buf->base;
2049
+ dbuf.size = b_size + t_size + o_size + p_size;
2050
+
2051
+ GGML_ASSERT(dbuf.size <= shm_blk_size);
2052
+
2053
+ uint8_t * m_ptr = (uint8_t*) dbuf.ptr;
2054
+ uint8_t * b_ptr = m_ptr; m_ptr += b_size;
2055
+ uint8_t * t_ptr = m_ptr; m_ptr += t_size;
2056
+ uint8_t * o_ptr = m_ptr;
2057
+
2058
+ memcpy(b_ptr, (void *) op_batch->h_bufs.data(), b_size);
2059
+ memcpy(t_ptr, (void *) op_batch->h_tens.data(), t_size);
2060
+ memcpy(o_ptr, (void *) op_batch->h_ops.data(), o_size);
2061
+
2062
+ HEX_VERBOSE("ggml-hex: %s op-queue push batch #%u : n-bufs %u n-tensors %u n-ops %u vmem %zu : b-size %zu t-size %zu o-size %zu m-size %zu\n",
2063
+ shm_buf->sess->c_name(), req.id, req.n_bufs, req.n_tensors, req.n_ops, op_batch->b_vmem,
2064
+ b_size, t_size, o_size, (size_t) dbuf.size);
2065
+
2066
+ op_batch->reset();
2067
+
2068
+ if (opt_verbose > 1) {
2069
+ htp_buf_desc *b = (htp_buf_desc*) b_ptr;
2070
+ for (unsigned int i=0; i < req.n_bufs; i++) {
2071
+ GGML_LOG_DEBUG("ggml-hex: %s htp-buf #%u : fd %d base %p size %zu\n", shm_buf->sess->c_name(), i,
2072
+ b[i].fd, (void *) b[i].base, (size_t) b[i].size);
2073
+ }
2074
+ htp_tensor *t = (htp_tensor*) t_ptr;
2075
+ for (unsigned int i=0; i < req.n_tensors; i++) {
2076
+ GGML_LOG_DEBUG("ggml-hex: %s htp-tensor #%u : bi %u offset %u size %u : %zu:%zu:%zu:%zu\n",
2077
+ shm_buf->sess->c_name(), i, t[i].bi, t[i].data, t[i].size,
2078
+ (size_t) t[i].ne[0], (size_t) t[i].ne[1], (size_t) t[i].ne[2], (size_t) t[i].ne[3]);
2079
+ }
2080
+ }
2081
+
2082
+ return true;
2083
+ }
2084
+
2085
+ void pop(htp_opbatch_rsp rsp, dspqueue_buffer dbuf) {
2086
+ GGML_ASSERT(rsp.id < op_cache.size());
2087
+
2088
+ done.push(rsp.id);
2089
+
2090
+ const size_t b_size = sizeof(htp_buf_desc) * rsp.n_bufs;
2091
+ const size_t t_size = sizeof(htp_tensor) * rsp.n_tensors;
2092
+ const size_t o_size = sizeof(htp_op_desc) * rsp.n_ops;
2093
+ const size_t p_size = sizeof(htp_prof_desc) * rsp.n_ops;
2094
+
2095
+ const size_t m_size = b_size + t_size + o_size + p_size;
2096
+ GGML_ASSERT(m_size <= shm_blk_size);
2097
+
2098
+ HEX_VERBOSE("ggml-hex: %s op-queue pop batch #%u : n-bufs %u n-tensors %u n-ops %u : m-size %zu b-size %zu t-size %zu o-size %zu\n",
2099
+ shm_buf->sess->c_name(), rsp.id, rsp.n_bufs, rsp.n_tensors, rsp.n_ops,
2100
+ (size_t) dbuf.size, b_size, t_size, o_size);
2101
+
2102
+ uint8_t * m_ptr = (uint8_t*) dbuf.ptr;
2103
+ uint8_t * p_ptr = m_ptr + (b_size + t_size + o_size);
2104
+
2105
+ if (opt_profile && rsp.n_ops > 0) {
2106
+ auto & ops = op_cache[rsp.id];
2107
+
2108
+ uint64_t batch_usec = ggml_time_us() - start_usec[rsp.id];
2109
+ uint32_t htp_usec = 0;
2110
+
2111
+ GGML_ASSERT(rsp.n_ops <= ops.size());
2112
+
2113
+ const htp_prof_desc * pd = (const htp_prof_desc *) p_ptr;
2114
+ for (uint32_t i = 0; i < rsp.n_ops; i++) {
2115
+ htp_usec += pd[i].usecs;
2116
+ ggml_hexagon_dump_op_prof(shm_buf->sess->name, ops[i], pd[i].usecs, pd[i].cycles, pd[i].pmu);
2117
+ }
2118
+
2119
+ GGML_LOG_DEBUG("ggml-hex: %s profile-batch n-ops %u batch-dur-usec %lld htp-ops-usec %u\n",
2120
+ shm_buf->sess->c_name(), rsp.n_ops, (long long) batch_usec, htp_usec);
2121
+ }
2122
+ }
2123
+ };
2124
+
2125
+ // Flush HTP response queue i.e wait for all outstanding requests to complete
2126
+ void ggml_hexagon_session::flush_pending(bool all) {
2127
+ while (this->op_pending) {
2128
+ struct htp_opbatch_rsp rsp;
2129
+ uint32_t rsp_size;
2130
+ uint32_t flags;
2131
+
2132
+ struct dspqueue_buffer dbuf;
2133
+ uint32_t n_dbufs;
2134
+
2135
+ // Read response packet from queue
2136
+ const uint32_t timeo = opt_oppoll ? 0 : DSPQUEUE_TIMEOUT;
2137
+ int err = dspqueue_read(this->queue, &flags, 1, &n_dbufs, &dbuf, sizeof(rsp), &rsp_size, (uint8_t *) &rsp, timeo);
2138
+ if (err == AEE_EEXPIRED) {
2139
+ continue;
2140
+ }
2141
+
2142
+ if (err != 0) {
2143
+ GGML_ABORT("ggml-hex: dspqueue_read failed: 0x%08x\n", (unsigned) err);
2144
+ }
2145
+
2146
+ // Basic sanity checks
2147
+ if (rsp_size != sizeof(rsp) || n_dbufs != 1) {
2148
+ GGML_ABORT("ggml-hex: %s dspcall : bad response : size %u dspbufs %u\n", this->c_name(), rsp_size, n_dbufs);
2149
+ }
2150
+
2151
+ if (rsp.status != HTP_STATUS_OK) {
2152
+ GGML_LOG_ERROR("ggml-hex: %s dspcall : dsp-rsp: %s\n", this->c_name(), status_to_str(rsp.status));
2153
+ // TODO: handle errors
2154
+ }
2155
+
2156
+ op_queue->pop(rsp, dbuf);
2157
+
2158
+ this->op_pending--; // atomic dec
2159
+
2160
+ if (!all) break;
2161
+ }
2162
+ }
2163
+
2164
+ void ggml_hexagon_session::flush_batch() {
2165
+ if (op_batch->empty()) { return; }
2166
+
2167
+ htp_opbatch_req req {};
2168
+ dspqueue_buffer dbuf{};
2169
+
2170
+ if (!op_queue->push(req, dbuf, op_batch)) {
2171
+ flush_pending(false);
2172
+ op_queue->push(req, dbuf, op_batch);
2173
+ }
2174
+
2175
+ // Bump pending flag (cleared in the session::flush once we get the response)
2176
+ this->op_pending++; // atomic inc
2177
+
2178
+ HEX_VERBOSE("ggml-hex: %s queue-opbatch: %p size %u\n", this->c_name(), dbuf.ptr, dbuf.size);
2179
+
2180
+ int err = dspqueue_write(this->queue, 0, 1, &dbuf, sizeof(req), (const uint8_t*) &req, DSPQUEUE_TIMEOUT);
2181
+ if (err != 0) {
2182
+ GGML_ABORT("ggml-hex: %s dspqueue_write failed: 0x%08x\n", this->c_name(), (unsigned) err);
2183
+ }
2184
+ }
2185
+
2186
+ void ggml_hexagon_session::enqueue_op(const htp_opnode & node) {
2187
+ if (!op_batch->fit_op(node)) {
2188
+ flush_batch();
2189
+ }
2190
+ op_batch->add_op(node);
2191
+ }
2192
+
2193
+ // Flush HTP response queue i.e wait for all outstanding requests to complete
2194
+ void ggml_hexagon_session::flush(bool all) {
2195
+ flush_batch();
2196
+ flush_pending(all);
2197
+ }
2198
+
2199
+ static size_t ggml_hexagon_measure_max_vmem(ggml_hexagon_session *sess) {
2200
+ // Allocate a bunch pinned buffers till failure.
2201
+ // This is kind of expensive but handy for figuring out exactly how much we can mmap on a specific device.
2202
+ // Typically we're going to allocate all/most of these buffers anyway for the model weights.
2203
+
2204
+ std::vector<ggml_hexagon_shared_buffer *> sbufs;
2205
+
2206
+ const size_t MiB = 1024 * 1024;
2207
+ const size_t GiB = MiB * 1024;
2208
+
2209
+ size_t vmem = 0;
2210
+ size_t step = 256u * MiB;
2211
+
2212
+ try {
2213
+ sbufs.push_back(new ggml_hexagon_shared_buffer(sess, GiB, true)); vmem += GiB;
2214
+ sbufs.push_back(new ggml_hexagon_shared_buffer(sess, GiB, true)); vmem += GiB;
2215
+ sbufs.push_back(new ggml_hexagon_shared_buffer(sess, GiB, true)); vmem += GiB;
2216
+
2217
+ while (1) {
2218
+ sbufs.push_back(new ggml_hexagon_shared_buffer(sess, step, true));
2219
+ vmem += step;
2220
+ }
2221
+ } catch (...) { }
2222
+
2223
+ for (auto b : sbufs) { delete b; }
2224
+
2225
+ return vmem - step; // backoff to account for overhead from internal mappings
2226
+ }
2227
+
1557
2228
  void ggml_hexagon_session::allocate(int dev_id) noexcept(false) {
1558
2229
  this->valid_session = false;
1559
2230
  this->valid_handle = false;
@@ -1566,11 +2237,8 @@ void ggml_hexagon_session::allocate(int dev_id) noexcept(false) {
1566
2237
  this->name = std::string("HTP") + std::to_string(dev_id);
1567
2238
 
1568
2239
  this->op_pending = 0;
1569
- this->prof_usecs = 0;
1570
- this->prof_cycles = 0;
1571
- this->prof_pkts = 0;
1572
2240
 
1573
- GGML_LOG_INFO("ggml-hex: allocating new session: %s\n", this->name.c_str());
2241
+ GGML_LOG_DEBUG("ggml-hex: %s allocating new session\n", this->name.c_str());
1574
2242
 
1575
2243
  domain * my_domain = get_domain(this->domain_id);
1576
2244
  if (my_domain == NULL) {
@@ -1646,9 +2314,6 @@ void ggml_hexagon_session::allocate(int dev_id) noexcept(false) {
1646
2314
 
1647
2315
  this->valid_handle = true;
1648
2316
 
1649
- GGML_LOG_INFO("ggml-hex: new session: %s : session-id %d domain-id %d uri %s handle 0x%lx\n", this->name.c_str(),
1650
- this->session_id, this->domain_id, session_uri, (unsigned long) this->handle);
1651
-
1652
2317
  // Enable FastRPC QoS mode
1653
2318
  {
1654
2319
  struct remote_rpc_control_latency l;
@@ -1660,11 +2325,17 @@ void ggml_hexagon_session::allocate(int dev_id) noexcept(false) {
1660
2325
  }
1661
2326
  }
1662
2327
 
2328
+ GGML_LOG_INFO("ggml-hex: %s new session : session-id %d domain-id %d uri %s handle 0x%lx\n", this->c_name(),
2329
+ this->session_id, this->domain_id, session_uri, (unsigned long) this->handle);
2330
+
2331
+ const size_t req_q_size = (sizeof(htp_opbatch_req) * opt_opqueue * 2) + 1024;
2332
+ const size_t rsp_q_size = (sizeof(htp_opbatch_rsp) * opt_opqueue * 2) + 1024;
2333
+
1663
2334
  // Now let's setup the DSP queue
1664
2335
  err = dspqueue_create(this->domain_id,
1665
2336
  0, // Flags
1666
- 128 * 1024, // Request queue size (in bytes)
1667
- 64 * 1024, // Response queue size (in bytes)
2337
+ req_q_size, // Request queue size (in bytes)
2338
+ rsp_q_size, // Response queue size (in bytes)
1668
2339
  nullptr, // Read packet callback (we handle reads explicitly)
1669
2340
  nullptr, // Error callback (we handle errors during reads)
1670
2341
  (void *) this, // Callback context
@@ -1684,18 +2355,36 @@ void ggml_hexagon_session::allocate(int dev_id) noexcept(false) {
1684
2355
  }
1685
2356
 
1686
2357
  if (opt_etm) {
1687
- err = htp_iface_enable_etm(this->handle);
2358
+ err = htp_iface_etm(this->handle, 1);
1688
2359
  if (err != 0) {
1689
2360
  GGML_LOG_ERROR("ggml-hex: failed to enable ETM tracing: 0x%08x\n", (unsigned) err);
1690
2361
  }
1691
2362
  }
1692
2363
 
1693
- // Start the DSP-side service. We need to pass the queue ID to the
1694
- // DSP in a FastRPC call; the DSP side will import the queue and start
1695
- // listening for packets in a callback.
1696
- err = htp_iface_start(this->handle, dev_id, this->queue_id, opt_nhvx);
2364
+ if (opt_profile) {
2365
+ htp_iface_pmu_conf pmu_conf{};
2366
+ std::copy(opt_pmu_evt.begin(), opt_pmu_evt.end(), pmu_conf.events);
2367
+
2368
+ err = htp_iface_profiler(this->handle, opt_profile, &pmu_conf);
2369
+ if (err != 0) {
2370
+ GGML_LOG_ERROR("ggml-hex: failed to enable profiling: 0x%08x\n", (unsigned) err);
2371
+ }
2372
+ }
2373
+
2374
+ // Allocate buffers and state for op batching
2375
+ this->op_queue = new ggml_hexagon_opqueue(this, opt_opbatch, opt_opqueue);
2376
+
2377
+ if (!opt_vmem) {
2378
+ opt_vmem = ggml_hexagon_measure_max_vmem(this);
2379
+ GGML_LOG_INFO("ggml-hex: %s measured max vmem %zu\n", this->c_name(), opt_vmem);
2380
+ }
2381
+
2382
+ this->op_batch = new ggml_hexagon_opbatch(this, opt_opbatch, opt_vmem);
2383
+
2384
+ // Start dspqueue/opbatch processing
2385
+ err = htp_iface_start(this->handle, dev_id, this->queue_id, opt_nhvx, opt_use_hmx, opt_vmem);
1697
2386
  if (err != 0) {
1698
- GGML_LOG_ERROR("ggml-hex: failed to start session: 0x%08x\n", (unsigned) err);
2387
+ GGML_LOG_ERROR("ggml-hex: %s failed to start session: 0x%08x\n", this->c_name(), (unsigned) err);
1699
2388
  throw std::runtime_error("ggml-hex: iface start failed (see log for details)");
1700
2389
  }
1701
2390
  this->valid_iface = true;
@@ -1706,21 +2395,32 @@ void ggml_hexagon_session::release() noexcept(true) {
1706
2395
 
1707
2396
  int err;
1708
2397
 
1709
- // Stop the DSP-side service and close the queue
1710
2398
  if (this->valid_iface) {
2399
+ // Stop dspqueue/opbatch processing
1711
2400
  err = htp_iface_stop(this->handle);
1712
2401
  if (err != 0) {
1713
2402
  GGML_ABORT("ggml-hex: htp_iface_stop failed: 0x%08x\n", (unsigned) err);
1714
2403
  }
1715
2404
  }
1716
2405
 
2406
+ delete this->op_batch;
2407
+ delete this->op_queue;
2408
+
1717
2409
  if (opt_etm) {
1718
- err = htp_iface_disable_etm(this->handle);
2410
+ err = htp_iface_etm(this->handle, 0);
1719
2411
  if (err != 0) {
1720
2412
  GGML_LOG_ERROR("ggml-hex: warn : failed to disable ETM tracing: 0x%08x\n", (unsigned) err);
1721
2413
  }
1722
2414
  }
1723
2415
 
2416
+ if (opt_profile) {
2417
+ htp_iface_pmu_conf pmu_conf{};
2418
+ err = htp_iface_profiler(this->handle, 0, &pmu_conf);
2419
+ if (err != 0) {
2420
+ GGML_LOG_ERROR("ggml-hex: warn : failed to disable profiling: 0x%08x\n", (unsigned) err);
2421
+ }
2422
+ }
2423
+
1724
2424
  if (this->valid_queue) {
1725
2425
  err = dspqueue_close(queue);
1726
2426
  if (err != 0) {
@@ -1737,6 +2437,9 @@ ggml_hexagon_session::ggml_hexagon_session(int dev_id, ggml_backend_dev_t dev) n
1737
2437
  buffer_type.device = dev;
1738
2438
  repack_buffer_type.device = dev;
1739
2439
 
2440
+ op_batch = nullptr;
2441
+ op_queue = nullptr;
2442
+
1740
2443
  try {
1741
2444
  allocate(dev_id);
1742
2445
 
@@ -1799,9 +2502,66 @@ static bool ggml_hexagon_supported_flash_attn_ext(const struct ggml_hexagon_sess
1799
2502
  return false;
1800
2503
  }
1801
2504
 
1802
- return opt_experimental;
2505
+ if (dst->ne[3] != 1) {
2506
+ return false;
2507
+ }
2508
+
2509
+ return true;
1803
2510
  }
1804
2511
 
2512
+ static bool ggml_hexagon_supported_gated_delta_net(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
2513
+ const struct ggml_tensor * q = op->src[0];
2514
+ const struct ggml_tensor * k = op->src[1];
2515
+ const struct ggml_tensor * v = op->src[2];
2516
+ const struct ggml_tensor * g = op->src[3];
2517
+ const struct ggml_tensor * beta = op->src[4];
2518
+ const struct ggml_tensor * state = op->src[5];
2519
+ const struct ggml_tensor * dst = op;
2520
+
2521
+ if (!q || !k || !v || !g || !beta || !state) {
2522
+ return false;
2523
+ }
2524
+
2525
+ if (q->type != GGML_TYPE_F32 || k->type != GGML_TYPE_F32 || v->type != GGML_TYPE_F32 ||
2526
+ g->type != GGML_TYPE_F32 || beta->type != GGML_TYPE_F32 || state->type != GGML_TYPE_F32 ||
2527
+ dst->type != GGML_TYPE_F32) {
2528
+ return false;
2529
+ }
2530
+
2531
+ if (!ggml_is_contiguous_rows(q) || !ggml_is_contiguous_rows(k) || !ggml_is_contiguous_rows(v) ||
2532
+ !ggml_is_contiguous(g) || !ggml_is_contiguous(beta) || !ggml_is_contiguous(state) ||
2533
+ !ggml_is_contiguous(dst)) {
2534
+ return false;
2535
+ }
2536
+
2537
+ const int64_t S_v = v->ne[0];
2538
+ const int64_t H = v->ne[1];
2539
+ const int64_t n_tokens = v->ne[2];
2540
+ const int64_t n_seqs = v->ne[3];
2541
+ const int64_t K = ggml_get_op_params_i32(op, 0);
2542
+
2543
+ if (S_v <= 0 || S_v > 128 || H <= 0 || n_tokens <= 0 || n_seqs <= 0) {
2544
+ return false;
2545
+ }
2546
+ if (q->ne[0] != S_v || k->ne[0] != S_v || q->ne[1] <= 0 || k->ne[1] <= 0 ||
2547
+ q->ne[2] != n_tokens || k->ne[2] != n_tokens || q->ne[3] <= 0 || k->ne[3] <= 0 ||
2548
+ (n_seqs % q->ne[3]) != 0 || (n_seqs % k->ne[3]) != 0) {
2549
+ return false;
2550
+ }
2551
+ if ((g->ne[0] != 1 && g->ne[0] != S_v) || beta->ne[0] != 1) {
2552
+ return false;
2553
+ }
2554
+ // state holds s0 only [S_v, S_v, H, n_seqs]; K is op param 0.
2555
+ if (ggml_nelements(state) != S_v * S_v * H * n_seqs) {
2556
+ return false;
2557
+ }
2558
+ if (dst->ne[0] != S_v * H || dst->ne[1] != n_tokens * n_seqs + S_v * n_seqs * K) {
2559
+ return false;
2560
+ }
2561
+
2562
+ GGML_UNUSED(sess);
2563
+ return true;
2564
+ }
1805
2565
 
1806
2566
  static bool ggml_hexagon_supported_mul_mat(const struct ggml_hexagon_session * sess, const struct ggml_tensor * dst) {
1807
2567
  const struct ggml_tensor * src0 = dst->src[0];
@@ -1817,7 +2577,9 @@ static bool ggml_hexagon_supported_mul_mat(const struct ggml_hexagon_session * s
1817
2577
 
1818
2578
  switch (src0->type) {
1819
2579
  case GGML_TYPE_Q4_0:
2580
+ case GGML_TYPE_Q4_1:
1820
2581
  case GGML_TYPE_Q8_0:
2582
+ case GGML_TYPE_IQ4_NL:
1821
2583
  case GGML_TYPE_MXFP4:
1822
2584
  if (src0->ne[0] % 32) {
1823
2585
  return false;
@@ -1842,6 +2604,27 @@ static bool ggml_hexagon_supported_mul_mat(const struct ggml_hexagon_session * s
1842
2604
  GGML_LOG_DEBUG("ggml_hexagon_supported_mul_mat: permuted F16 src0 not supported\n");
1843
2605
  return false;
1844
2606
  }
2607
+ if (src1->ne[2] < src0->ne[2] || src1->ne[3] < src0->ne[3]) {
2608
+ GGML_LOG_DEBUG("ggml_hexagon_supported_mul_mat: src1 broadcasting not supported\n");
2609
+ return false;
2610
+ }
2611
+ if (ggml_nrows(src1) > 1024) {
2612
+ return false; // no huge batches (for now)
2613
+ }
2614
+ break;
2615
+
2616
+ case GGML_TYPE_F32:
2617
+ if (src1->type != GGML_TYPE_F32) {
2618
+ return false;
2619
+ }
2620
+ if (src0->nb[1] < src0->nb[0]) {
2621
+ GGML_LOG_DEBUG("ggml_hexagon_supported_mul_mat: permuted F32 src0 not supported\n");
2622
+ return false;
2623
+ }
2624
+ if (src1->ne[2] < src0->ne[2] || src1->ne[3] < src0->ne[3]) {
2625
+ GGML_LOG_DEBUG("ggml_hexagon_supported_mul_mat: src1 broadcasting not supported\n");
2626
+ return false;
2627
+ }
1845
2628
  if (ggml_nrows(src1) > 1024) {
1846
2629
  return false; // no huge batches (for now)
1847
2630
  }
@@ -1866,7 +2649,9 @@ static bool ggml_hexagon_supported_mul_mat_id(const struct ggml_hexagon_session
1866
2649
 
1867
2650
  switch (src0->type) {
1868
2651
  case GGML_TYPE_Q4_0:
2652
+ case GGML_TYPE_Q4_1:
1869
2653
  case GGML_TYPE_Q8_0:
2654
+ case GGML_TYPE_IQ4_NL:
1870
2655
  case GGML_TYPE_MXFP4:
1871
2656
  if ((src0->ne[0] % 32)) {
1872
2657
  return false;
@@ -1960,8 +2745,8 @@ static bool ggml_hexagon_supported_unary(const struct ggml_hexagon_session * ses
1960
2745
  return false;
1961
2746
  }
1962
2747
 
1963
- // TODO: add support for non-contigiuos tensors
1964
- if (!ggml_is_contiguous(src0) || !ggml_is_contiguous(dst)) {
2748
+ // dst must be contiguous; src0 may be non-contiguous
2749
+ if (!ggml_is_contiguous(dst)) {
1965
2750
  return false;
1966
2751
  }
1967
2752
 
@@ -2064,8 +2849,25 @@ static bool ggml_hexagon_supported_softmax(const struct ggml_hexagon_session * s
2064
2849
  }
2065
2850
  }
2066
2851
 
2067
- return true;
2068
- }
2852
+ // Reject non-HVX-aligned sizes when ne[0] > HVX_F32_LANES
2853
+ // The HVX softmax implementation has issues with tail handling for larger non-aligned sizes
2854
+ // Small sizes (ne[0] <= 32) work correctly with tail-only processing
2855
+ const int64_t ne0 = src0->ne[0];
2856
+ if (ne0 > 32 && (ne0 & (32 - 1)) != 0) {
2857
+ return false;
2858
+ }
2859
+
2860
+ // HVX vector size constraints for softmax
2861
+ #define SOFTMAX_MAX_ROW_SIZE 131072 // 128K elements max for numerical precision
2862
+
2863
+ // Reject very large row sizes to avoid numerical precision issues
2864
+ // Softmax accumulation over many elements can lead to precision loss
2865
+ if (ne0 > SOFTMAX_MAX_ROW_SIZE) {
2866
+ return false;
2867
+ }
2868
+
2869
+ return true;
2870
+ }
2069
2871
 
2070
2872
  static bool ggml_hexagon_supported_set_rows(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
2071
2873
  const struct ggml_tensor * src0 = op->src[0]; // values
@@ -2132,7 +2934,7 @@ static bool ggml_hexagon_supported_rope(const struct ggml_hexagon_session * sess
2132
2934
 
2133
2935
  int mode = op_params[2];
2134
2936
 
2135
- if ((mode & GGML_ROPE_TYPE_MROPE) || (mode & GGML_ROPE_TYPE_VISION)) {
2937
+ if (mode == GGML_ROPE_TYPE_VISION) {
2136
2938
  return false;
2137
2939
  }
2138
2940
  if (mode & 1) {
@@ -2206,486 +3008,238 @@ static bool ggml_hexagon_supported_ssm_conv(const struct ggml_hexagon_session *
2206
3008
  if (dst->ne[0] != d_inner || dst->ne[1] != n_t || dst->ne[2] != n_s) {
2207
3009
  return false;
2208
3010
  }
2209
-
2210
- // TODO: add support for non-contiguous tensors
2211
- if (!ggml_is_contiguous(src0) || !ggml_is_contiguous(src1) || !ggml_is_contiguous(dst)) {
3011
+ if (src0->nb[0] != sizeof(float) || src1->nb[0] != sizeof(float) || dst->nb[0] != sizeof(float)) {
3012
+ return false;
3013
+ }
3014
+ if (src0->nb[1] != src0->ne[0] * sizeof(float) || src1->nb[1] != src1->ne[0] * sizeof(float)) {
2212
3015
  return false;
2213
3016
  }
2214
3017
 
2215
3018
  return true;
2216
3019
  }
2217
3020
 
2218
- enum dspqbuf_type {
2219
- DSPQBUF_TYPE_DSP_WRITE_CPU_READ = 0,
2220
- DSPQBUF_TYPE_CPU_WRITE_DSP_READ,
2221
- DSPQBUF_TYPE_CONSTANT,
2222
- };
2223
-
2224
- static void dspqbuf_dump(dspqueue_buffer * d, const struct ggml_tensor * t, dspqbuf_type type) {
2225
- if (opt_verbose < 2) return;
2226
-
2227
- auto buf = static_cast<ggml_backend_hexagon_buffer_context *>(t->buffer->context);
2228
- auto sess = buf->sess;
2229
-
2230
- GGML_LOG_DEBUG("ggml-hex: %s dspqbuf : %s base-addr %p base-size %zu data %p offset %u size %u\n", sess->name.c_str(),
2231
- t->name, (void *) buf->base, buf->size, (void *) d->ptr, (unsigned int) d->offset,
2232
- (unsigned int) d->size);
2233
- }
2234
-
2235
- // Init hexagon tensor from GGML tensor and Hexagon buffer
2236
- static void htp_req_tensor_init(htp_tensor * h, const ggml_tensor * t) {
2237
- h->data = 0; // updated by the receiver
2238
- h->type = t->type;
2239
- h->ne[0] = t->ne[0];
2240
- h->ne[1] = t->ne[1];
2241
- h->ne[2] = t->ne[2];
2242
- h->ne[3] = t->ne[3];
2243
- h->nb[0] = t->nb[0];
2244
- h->nb[1] = t->nb[1];
2245
- h->nb[2] = t->nb[2];
2246
- h->nb[3] = t->nb[3];
2247
- }
3021
+ static bool ggml_hexagon_supported_pad(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
3022
+ const struct ggml_tensor * src0 = op->src[0];
3023
+ const struct ggml_tensor * dst = op;
2248
3024
 
2249
- static size_t htp_req_buff_init(htp_tensor *h, dspqueue_buffer * d, const ggml_tensor * t, dspqbuf_type type) {
2250
- if (!t) {
2251
- return 0;
3025
+ if (src0->type != GGML_TYPE_F32 || dst->type != GGML_TYPE_F32) {
3026
+ return false;
2252
3027
  }
2253
3028
 
2254
- auto buf = static_cast<ggml_backend_hexagon_buffer_context *>(t->buffer->context);
3029
+ GGML_UNUSED(sess);
3030
+ return true;
3031
+ }
2255
3032
 
2256
- memset(d, 0, sizeof(*d));
2257
- d->fd = buf->fd;
2258
- d->ptr = t->data;
2259
- d->offset = (uint8_t *) t->data - buf->base;
2260
- d->size = ggml_nbytes(t);
3033
+ static bool ggml_hexagon_supported_cumsum(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
3034
+ const struct ggml_tensor * src0 = op->src[0];
3035
+ const struct ggml_tensor * dst = op;
2261
3036
 
2262
- if (!d->size) {
2263
- // Some requests contain srcs where ggml_nbytes() returns 0 but the rest of the op is non-empty
2264
- d->size = 64;
3037
+ if (src0->type != GGML_TYPE_F32 || dst->type != GGML_TYPE_F32) {
3038
+ return false;
2265
3039
  }
2266
3040
 
2267
- switch (type) {
2268
- case DSPQBUF_TYPE_DSP_WRITE_CPU_READ:
2269
- // Flush CPU
2270
- d->flags = DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER;
2271
- break;
2272
- case DSPQBUF_TYPE_CPU_WRITE_DSP_READ:
2273
- // Flush CPU, Invalidate DSP
2274
- d->flags = DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT;
2275
- break;
2276
- default:
2277
- // Constant buffer, no cache maintenance
2278
- d->flags = 0;
2279
- break;
3041
+ if (!ggml_is_contiguous(src0) || !ggml_is_contiguous(dst)) {
3042
+ return false;
2280
3043
  }
2281
3044
 
2282
- htp_req_tensor_init(h, t);
2283
-
2284
- dspqbuf_dump(d, t, type);
2285
-
2286
- return 1;
3045
+ GGML_UNUSED(sess);
3046
+ return true;
2287
3047
  }
2288
3048
 
2289
- typedef size_t (*htp_req_init_func_t)(htp_general_req * req, dspqueue_buffer * bufs, const ggml_tensor * op);
2290
-
2291
- template <htp_req_init_func_t _init_req_func>
2292
- static inline void ggml_hexagon_dispatch_op(ggml_hexagon_session *sess, const struct ggml_tensor * op, uint32_t flags) {
2293
- uint64_t t = ggml_time_us();
2294
-
2295
- // Construct HTP request
2296
- htp_general_req req;
2297
- memset(&req, 0, sizeof(req));
3049
+ static bool ggml_hexagon_supported_diag(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
3050
+ const struct ggml_tensor * src0 = op->src[0];
3051
+ const struct ggml_tensor * dst = op;
2298
3052
 
2299
- req.flags = flags;
2300
- if (!(opt_opmask & HTP_OPMASK_QUANTIZE)) {
2301
- req.flags |= HTP_OPFLAGS_SKIP_QUANTIZE;
2302
- }
2303
- if (!(opt_opmask & HTP_OPMASK_COMPUTE)) {
2304
- req.flags |= HTP_OPFLAGS_SKIP_COMPUTE;
3053
+ // diag only supports F32 currently
3054
+ if (src0->type != GGML_TYPE_F32 || dst->type != GGML_TYPE_F32) {
3055
+ return false;
2305
3056
  }
2306
3057
 
2307
- ggml_hexagon_dump_op_exec(sess->name, op, req.flags);
2308
-
2309
- if ((opt_opmask & HTP_OPMASK_QUEUE)) {
2310
- dspqueue_buffer bufs[HTP_MAX_PACKET_BUFFERS];
2311
- size_t n_bufs = _init_req_func(&req, bufs, op);
2312
- sess->enqueue(req, bufs, n_bufs, opt_opsync);
3058
+ // Input must have ne[1] == 1 (vector input)
3059
+ if (src0->ne[1] != 1) {
3060
+ return false;
2313
3061
  }
2314
3062
 
2315
- t = ggml_time_us() - t;
3063
+ // Output must be square in first two dimensions
3064
+ if (dst->ne[0] != dst->ne[1] || dst->ne[0] != src0->ne[0]) {
3065
+ return false;
3066
+ }
2316
3067
 
2317
- ggml_hexagon_dump_op_prof(sess->name, op, sess->prof_usecs, sess->prof_cycles, sess->prof_pkts, t);
3068
+ GGML_UNUSED(sess);
3069
+ return true;
2318
3070
  }
2319
3071
 
2320
- template <bool _is_src0_constant>
2321
- static inline size_t init_binary_req(htp_general_req * req, dspqueue_buffer * bufs, const ggml_tensor * t) {
2322
- switch (t->op) {
2323
- case GGML_OP_MUL_MAT:
2324
- req->op = HTP_OP_MUL_MAT;
2325
- break;
2326
- case GGML_OP_MUL:
2327
- req->op = HTP_OP_MUL;
2328
- break;
2329
- case GGML_OP_ADD:
2330
- req->op = HTP_OP_ADD;
2331
- break;
2332
- case GGML_OP_SUB:
2333
- req->op = HTP_OP_SUB;
2334
- break;
2335
- case GGML_OP_DIV:
2336
- req->op = HTP_OP_DIV;
2337
- break;
2338
- default:
2339
- GGML_ABORT("ggml-hex: binary : unsupported op: %d\n", t->op);
2340
- break;
3072
+ static bool ggml_hexagon_supported_solve_tri(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
3073
+ const struct ggml_tensor * src0 = op->src[0]; // A
3074
+ const struct ggml_tensor * src1 = op->src[1]; // B
3075
+ const struct ggml_tensor * dst = op; // X
3076
+
3077
+ if (!src0 || !src1) {
3078
+ return false;
2341
3079
  }
2342
3080
 
2343
- // src0: Weights (mulmat) or First Operand (binary op).
2344
- // If constant (e.g. weights), no cache management is needed.
2345
- // src1: Input Activations (mulmat) or Second Operand (binary op).
3081
+ if (src0->type != GGML_TYPE_F32 || src1->type != GGML_TYPE_F32 || dst->type != GGML_TYPE_F32) {
3082
+ return false;
3083
+ }
2346
3084
 
2347
- size_t n_bufs = 0;
2348
- n_bufs += htp_req_buff_init(&req->src0, &bufs[n_bufs], t->src[0], _is_src0_constant ? DSPQBUF_TYPE_CONSTANT : DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
2349
- n_bufs += htp_req_buff_init(&req->src1, &bufs[n_bufs], t->src[1], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
2350
- n_bufs += htp_req_buff_init(&req->dst, &bufs[n_bufs], t, DSPQBUF_TYPE_DSP_WRITE_CPU_READ);
3085
+ if (src0->ne[0] != src0->ne[1]) {
3086
+ return false;
3087
+ }
2351
3088
 
2352
- return n_bufs;
2353
- }
3089
+ if (src0->ne[1] != src1->ne[1]) {
3090
+ return false;
3091
+ }
2354
3092
 
2355
- static inline size_t init_cpy_req(htp_general_req * req, dspqueue_buffer * bufs, const ggml_tensor * t) {
2356
- req->op = HTP_OP_CPY;
3093
+ if (src0->ne[2] != src1->ne[2] || src0->ne[3] != src1->ne[3]) {
3094
+ return false;
3095
+ }
2357
3096
 
2358
- size_t n_bufs = 0;
2359
- n_bufs += htp_req_buff_init(&req->src0, &bufs[n_bufs], t->src[0], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
2360
- n_bufs += htp_req_buff_init(&req->dst, &bufs[n_bufs], t, DSPQBUF_TYPE_DSP_WRITE_CPU_READ);
3097
+ if (dst->ne[0] != src1->ne[0] || dst->ne[1] != src1->ne[1] || dst->ne[2] != src1->ne[2] || dst->ne[3] != src1->ne[3]) {
3098
+ return false;
3099
+ }
2361
3100
 
2362
- return n_bufs;
3101
+ GGML_UNUSED(sess);
3102
+ return true;
2363
3103
  }
2364
3104
 
2365
- static inline size_t init_get_rows_req(htp_general_req * req, dspqueue_buffer * bufs, const ggml_tensor * t) {
2366
- req->op = HTP_OP_GET_ROWS;
2367
-
2368
- size_t n_bufs = 0;
2369
- n_bufs += htp_req_buff_init(&req->src0, &bufs[n_bufs], t->src[0], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
2370
- n_bufs += htp_req_buff_init(&req->src1, &bufs[n_bufs], t->src[1], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
2371
- n_bufs += htp_req_buff_init(&req->dst, &bufs[n_bufs], t, DSPQBUF_TYPE_DSP_WRITE_CPU_READ);
3105
+ static bool ggml_hexagon_supported_tri(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
2372
3106
 
2373
- return n_bufs;
2374
- }
3107
+ const struct ggml_tensor * src0 = op->src[0];
3108
+ const struct ggml_tensor * dst = op;
2375
3109
 
2376
- static inline size_t init_argsort_req(htp_general_req * req, dspqueue_buffer * bufs, const ggml_tensor * t) {
2377
- req->op = HTP_OP_ARGSORT;
2378
- memcpy(&req->op_params, &t->op_params, sizeof(t->op_params));
3110
+ if (src0->type != GGML_TYPE_F32) { return false; }
3111
+ if (dst->type != GGML_TYPE_F32) { return false; }
3112
+ if (!ggml_are_same_shape(src0, dst)) { return false; }
3113
+ if (!ggml_is_contiguous(src0) || !ggml_is_contiguous(dst)) { return false; }
2379
3114
 
2380
- size_t n_bufs = 0;
2381
- n_bufs += htp_req_buff_init(&req->src0, &bufs[n_bufs], t->src[0], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
2382
- n_bufs += htp_req_buff_init(&req->dst, &bufs[n_bufs], t, DSPQBUF_TYPE_DSP_WRITE_CPU_READ);
3115
+ return true;
2383
3116
 
2384
- return n_bufs;
3117
+ GGML_UNUSED(sess);
2385
3118
  }
2386
3119
 
2387
- template <bool _is_src0_constant>
2388
- static inline size_t init_binary_id_req(htp_general_req * req, dspqueue_buffer * bufs, const ggml_tensor * t) {
2389
- switch (t->op) {
2390
- case GGML_OP_MUL_MAT_ID:
2391
- req->op = HTP_OP_MUL_MAT_ID;
2392
- break;
2393
- case GGML_OP_ADD_ID:
2394
- req->op = HTP_OP_ADD_ID;
2395
- break;
2396
- default:
2397
- GGML_ABORT("ggml-hex: unsupported op: %d\n", t->op);
2398
- }
2399
-
2400
- // src0: Weights (mulmat) or Input Activations (other op).
2401
- // If constant, no cache management is needed.
2402
- // src1: Input Activations (mulmat) or Second Operand (binary op).
2403
- // src2: Expert IDs (mulmat) or Activated Experts (other op).
2404
-
2405
- size_t n_bufs = 0;
2406
- n_bufs += htp_req_buff_init(&req->src0, &bufs[n_bufs], t->src[0], _is_src0_constant ? DSPQBUF_TYPE_CONSTANT : DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
2407
- n_bufs += htp_req_buff_init(&req->src1, &bufs[n_bufs], t->src[1], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
2408
- n_bufs += htp_req_buff_init(&req->src2, &bufs[n_bufs], t->src[2], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
2409
- n_bufs += htp_req_buff_init(&req->dst, &bufs[n_bufs], t, DSPQBUF_TYPE_DSP_WRITE_CPU_READ);
2410
-
2411
- return n_bufs;
3120
+ static const char * ggml_backend_hexagon_name(ggml_backend_t backend) {
3121
+ auto sess = static_cast<ggml_hexagon_session *>(backend->context);
3122
+ return sess->c_name();
2412
3123
  }
2413
3124
 
2414
- static inline size_t init_set_rows_req(htp_general_req * req, dspqueue_buffer * bufs, const ggml_tensor * t) {
2415
- req->op = HTP_OP_SET_ROWS;
2416
-
2417
- size_t n_bufs = 0;
2418
- n_bufs += htp_req_buff_init(&req->src0, &bufs[n_bufs], t->src[0], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
2419
- n_bufs += htp_req_buff_init(&req->src1, &bufs[n_bufs], t->src[1], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
2420
- n_bufs += htp_req_buff_init(&req->dst, &bufs[n_bufs], t, DSPQBUF_TYPE_DSP_WRITE_CPU_READ);
2421
-
2422
- return n_bufs;
3125
+ static void ggml_backend_hexagon_free(ggml_backend_t backend) {
3126
+ // we just need to delete the backend here
3127
+ // the sessions are allocated & freed as part of the registry
3128
+ delete backend;
2423
3129
  }
2424
3130
 
2425
- static inline size_t init_unary_req(htp_general_req * req, dspqueue_buffer * bufs, const ggml_tensor * t) {
2426
- memcpy(&req->op_params, &t->op_params, sizeof(t->op_params));
2427
-
2428
- bool supported = false;
2429
-
3131
+ static htp_op_code op_remap_to_htp(const ggml_tensor * t) {
2430
3132
  switch (t->op) {
2431
- case GGML_OP_RMS_NORM:
2432
- req->op = HTP_OP_RMS_NORM;
2433
- supported = true;
2434
- break;
2435
-
2436
- case GGML_OP_SCALE:
2437
- req->op = HTP_OP_SCALE;
2438
- supported = true;
2439
- break;
2440
-
2441
- case GGML_OP_SQR:
2442
- req->op = HTP_OP_SQR;
2443
- supported = true;
2444
- break;
2445
-
2446
- case GGML_OP_SQRT:
2447
- req->op = HTP_OP_SQRT;
2448
- supported = true;
2449
- break;
3133
+ case GGML_OP_FLASH_ATTN_EXT: return HTP_OP_FLASH_ATTN_EXT;
3134
+ case GGML_OP_MUL_MAT: return HTP_OP_MUL_MAT;
3135
+ case GGML_OP_MUL_MAT_ID: return HTP_OP_MUL_MAT_ID;
3136
+ case GGML_OP_MUL: return HTP_OP_MUL;
3137
+ case GGML_OP_ADD: return HTP_OP_ADD;
3138
+ case GGML_OP_ADD_ID: return HTP_OP_ADD_ID;
3139
+ case GGML_OP_SUB: return HTP_OP_SUB;
3140
+ case GGML_OP_DIV: return HTP_OP_DIV;
3141
+ case GGML_OP_CPY: return HTP_OP_CPY;
3142
+ case GGML_OP_CONT: return HTP_OP_CPY;
3143
+ case GGML_OP_GET_ROWS: return HTP_OP_GET_ROWS;
3144
+ case GGML_OP_SET_ROWS: return HTP_OP_SET_ROWS;
3145
+ case GGML_OP_SUM_ROWS: return HTP_OP_SUM_ROWS;
3146
+ case GGML_OP_ARGSORT: return HTP_OP_ARGSORT;
3147
+ case GGML_OP_NORM: return HTP_OP_NORM;
3148
+ case GGML_OP_L2_NORM: return HTP_OP_L2_NORM;
3149
+ case GGML_OP_RMS_NORM: return HTP_OP_RMS_NORM;
3150
+ case GGML_OP_CONCAT: return HTP_OP_CONCAT;
3151
+ case GGML_OP_SCALE: return HTP_OP_SCALE;
3152
+ case GGML_OP_SQR: return HTP_OP_SQR;
3153
+ case GGML_OP_SQRT: return HTP_OP_SQRT;
3154
+ case GGML_OP_SOFT_MAX: return HTP_OP_SOFTMAX;
3155
+ case GGML_OP_SSM_CONV: return HTP_OP_SSM_CONV;
3156
+ case GGML_OP_GATED_DELTA_NET: return HTP_OP_GATED_DELTA_NET;
3157
+ case GGML_OP_ROPE: return HTP_OP_ROPE;
3158
+ case GGML_OP_REPEAT: return HTP_OP_REPEAT;
3159
+ case GGML_OP_CUMSUM: return HTP_OP_CUMSUM;
3160
+ case GGML_OP_FILL: return HTP_OP_FILL;
3161
+ case GGML_OP_DIAG: return HTP_OP_DIAG;
3162
+ case GGML_OP_SOLVE_TRI: return HTP_OP_SOLVE_TRI;
3163
+ case GGML_OP_TRI: return HTP_OP_TRI;
3164
+ case GGML_OP_PAD: return HTP_OP_PAD;
2450
3165
 
2451
3166
  case GGML_OP_UNARY:
2452
- if (ggml_get_unary_op(t) == GGML_UNARY_OP_SILU) {
2453
- req->op = HTP_OP_UNARY_SILU;
2454
- supported = true;
2455
- } else if (ggml_get_unary_op(t) == GGML_UNARY_OP_GELU) {
2456
- req->op = HTP_OP_UNARY_GELU;
2457
- supported = true;
3167
+ switch (ggml_get_unary_op(t)) {
3168
+ case GGML_UNARY_OP_SILU: return HTP_OP_UNARY_SILU;
3169
+ case GGML_UNARY_OP_GELU: return HTP_OP_UNARY_GELU;
3170
+ case GGML_UNARY_OP_GELU_QUICK: return HTP_OP_UNARY_GELU;
3171
+ case GGML_UNARY_OP_SIGMOID: return HTP_OP_UNARY_SIGMOID;
3172
+ case GGML_UNARY_OP_NEG: return HTP_OP_UNARY_NEG;
3173
+ case GGML_UNARY_OP_EXP: return HTP_OP_UNARY_EXP;
3174
+ case GGML_UNARY_OP_SOFTPLUS: return HTP_OP_UNARY_SOFTPLUS;
3175
+ case GGML_UNARY_OP_TANH: return HTP_OP_UNARY_TANH;
3176
+ default:
3177
+ break;
2458
3178
  }
2459
3179
  break;
2460
3180
 
2461
3181
  case GGML_OP_GLU:
2462
- if (ggml_get_glu_op(t) == GGML_GLU_OP_SWIGLU) {
2463
- req->op = HTP_OP_GLU_SWIGLU;
2464
- supported = true;
2465
- } else if (ggml_get_glu_op(t) == GGML_GLU_OP_SWIGLU_OAI) {
2466
- req->op = HTP_OP_GLU_SWIGLU_OAI;
2467
- supported = true;
2468
- } else if (ggml_get_glu_op(t) == GGML_GLU_OP_GEGLU) {
2469
- req->op = HTP_OP_GLU_GEGLU;
2470
- supported = true;
3182
+ switch (ggml_get_glu_op(t)) {
3183
+ case GGML_GLU_OP_SWIGLU: return HTP_OP_GLU_SWIGLU;
3184
+ case GGML_GLU_OP_SWIGLU_OAI: return HTP_OP_GLU_SWIGLU_OAI;
3185
+ case GGML_GLU_OP_GEGLU: return HTP_OP_GLU_GEGLU;
3186
+ default: break;
2471
3187
  }
2472
3188
  break;
2473
3189
 
2474
- case GGML_OP_SOFT_MAX:
2475
- req->op = HTP_OP_SOFTMAX;
2476
- supported = true;
2477
- break;
2478
-
2479
3190
  default:
2480
- break;
3191
+ GGML_ABORT("\nggml-hex: graph-compute %s is not supported\n", ggml_op_desc(t));
2481
3192
  }
2482
-
2483
- if (!supported) {
2484
- GGML_ABORT("ggml-hex: unary : unsupported op: %d\n", t->op);
2485
- }
2486
-
2487
- size_t n_bufs = 0;
2488
- n_bufs += htp_req_buff_init(&req->src0, &bufs[n_bufs], t->src[0], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
2489
- n_bufs += htp_req_buff_init(&req->src1, &bufs[n_bufs], t->src[1], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
2490
- n_bufs += htp_req_buff_init(&req->dst, &bufs[n_bufs], t, DSPQBUF_TYPE_DSP_WRITE_CPU_READ);
2491
-
2492
- return n_bufs;
2493
- }
2494
-
2495
- static inline size_t init_sum_rows_req(htp_general_req * req, dspqueue_buffer * bufs, const ggml_tensor * t) {
2496
- memcpy(&req->op_params, &t->op_params, sizeof(t->op_params));
2497
- req->op = HTP_OP_SUM_ROWS;
2498
-
2499
- size_t n_bufs = 0;
2500
- n_bufs += htp_req_buff_init(&req->src0, &bufs[n_bufs], t->src[0], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
2501
- n_bufs += htp_req_buff_init(&req->dst, &bufs[n_bufs], t, DSPQBUF_TYPE_DSP_WRITE_CPU_READ);
2502
-
2503
- return n_bufs;
3193
+ return HTP_OP_INVALID;
2504
3194
  }
2505
3195
 
2506
- static inline size_t init_rope_req(htp_general_req * req, dspqueue_buffer * bufs, const ggml_tensor * t) {
2507
- memcpy(&req->op_params, &t->op_params, sizeof(t->op_params));
2508
- req->op = HTP_OP_ROPE;
2509
-
2510
- size_t n_bufs = 0;
2511
- n_bufs += htp_req_buff_init(&req->src0, &bufs[n_bufs], t->src[0], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
2512
- n_bufs += htp_req_buff_init(&req->src1, &bufs[n_bufs], t->src[1], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
2513
- n_bufs += htp_req_buff_init(&req->src2, &bufs[n_bufs], t->src[2], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
2514
- n_bufs += htp_req_buff_init(&req->dst, &bufs[n_bufs], t, DSPQBUF_TYPE_DSP_WRITE_CPU_READ);
2515
-
2516
- return n_bufs;
2517
- }
2518
-
2519
- static inline size_t init_flash_attn_ext_req(htp_general_req * req, dspqueue_buffer * bufs, const ggml_tensor * t) {
2520
- memcpy(&req->op_params, &t->op_params, sizeof(t->op_params));
2521
- req->op = HTP_OP_FLASH_ATTN_EXT;
2522
-
2523
- size_t n_bufs = 0;
2524
- n_bufs += htp_req_buff_init(&req->src0, &bufs[n_bufs], t->src[0], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
2525
- n_bufs += htp_req_buff_init(&req->src1, &bufs[n_bufs], t->src[1], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
2526
- n_bufs += htp_req_buff_init(&req->src2, &bufs[n_bufs], t->src[2], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
2527
- n_bufs += htp_req_buff_init(&req->src3, &bufs[n_bufs], t->src[3], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
2528
- n_bufs += htp_req_buff_init(&req->src4, &bufs[n_bufs], t->src[4], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
2529
- n_bufs += htp_req_buff_init(&req->dst, &bufs[n_bufs], t, DSPQBUF_TYPE_DSP_WRITE_CPU_READ);
2530
-
2531
- return n_bufs;
2532
- }
2533
-
2534
- static inline size_t init_ssm_conv_req(htp_general_req * req, dspqueue_buffer * bufs, const ggml_tensor * t) {
2535
- req->op = HTP_OP_SSM_CONV;
2536
-
2537
- size_t n_bufs = 0;
2538
- n_bufs += htp_req_buff_init(&req->src0, &bufs[n_bufs], t->src[0], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
2539
- n_bufs += htp_req_buff_init(&req->src1, &bufs[n_bufs], t->src[1], DSPQBUF_TYPE_CONSTANT);
2540
- n_bufs += htp_req_buff_init(&req->dst, &bufs[n_bufs], t, DSPQBUF_TYPE_DSP_WRITE_CPU_READ);
2541
-
2542
- return n_bufs;
2543
- }
2544
-
2545
- static const char * ggml_backend_hexagon_name(ggml_backend_t backend) {
2546
- auto sess = static_cast<ggml_hexagon_session *>(backend->context);
2547
- return sess->name.c_str();
2548
- }
2549
-
2550
- static void ggml_backend_hexagon_free(ggml_backend_t backend) {
2551
- // we just need to delete the backend here
2552
- // the sessions are allocated & freed as part of the registry
2553
- delete backend;
2554
- }
2555
-
2556
- static inline bool op_reuse_src1(const ggml_tensor * op1, const ggml_tensor * op0) {
2557
- return (op0 && op0->src[1] == op1->src[1] && ggml_is_quantized(op0->src[0]->type));
2558
- }
2559
-
2560
- static inline bool is_compute_op(ggml_tensor *node)
3196
+ static inline bool op_is_compute(ggml_tensor *node)
2561
3197
  {
2562
3198
  return !ggml_op_is_empty(node->op) && !ggml_is_empty(node) && (node->flags & GGML_TENSOR_FLAG_COMPUTE);
2563
3199
  }
2564
3200
 
2565
- // scan the graph and figure out last compute op index
2566
- static inline int last_compute_op(ggml_cgraph * graph) {
2567
- int last = 0;
2568
- for (int i = 0; i < graph->n_nodes; ++i) {
2569
- if (is_compute_op(graph->nodes[i])) {
2570
- last = i;
2571
- }
2572
- }
2573
-
2574
- return last;
2575
- }
2576
-
2577
3201
  static ggml_status ggml_backend_hexagon_graph_compute(ggml_backend_t backend, ggml_cgraph * graph) {
2578
3202
  auto sess = static_cast<ggml_hexagon_session *>(backend->context);
2579
3203
 
2580
- HEX_VERBOSE("ggml-hex: %s graph-compute n_nodes %d\n", sess->name.c_str(), graph->n_nodes);
2581
-
2582
- const int last = last_compute_op(graph);
3204
+ HEX_VERBOSE("ggml-hex: %s graph-compute n_nodes %d\n", sess->c_name(), graph->n_nodes);
2583
3205
 
2584
- const struct ggml_tensor * prev_op = nullptr; // prev executed op
3206
+ std::vector<htp_opnode> nodes;
3207
+ nodes.reserve(graph->n_nodes);
2585
3208
 
3209
+ // Fusion
2586
3210
  for (int i = 0; i < graph->n_nodes; ++i) {
2587
- ggml_tensor * node = graph->nodes[i];
2588
-
2589
- if (!is_compute_op(node)) {
3211
+ ggml_tensor * n = graph->nodes[i];
3212
+ if (!op_is_compute(n)) {
2590
3213
  continue;
2591
3214
  }
2592
3215
 
2593
- uint32_t flags = 0;
2594
-
2595
- // skip quantizer if src1 is reused
2596
- if (op_reuse_src1(node, prev_op)) {
2597
- flags |= HTP_OPFLAGS_SKIP_QUANTIZE;
2598
- }
3216
+ ggml_tensor * next_node = (i + 1 < graph->n_nodes) ? graph->nodes[i + 1] : nullptr;
2599
3217
 
2600
- prev_op = node;
3218
+ htp_opnode node = {
3219
+ /*.node =*/ n,
3220
+ /*.fused =*/ {},
3221
+ /*.opcode =*/ HTP_OP_INVALID
3222
+ };
2601
3223
 
2602
- // ask for early notification for the last Op
2603
- if (i == last) {
2604
- flags |= HTP_OPFLAGS_EARLY_WAKEUP;
3224
+ if (n->op == GGML_OP_RMS_NORM && next_node) {
3225
+ if (next_node->op == GGML_OP_MUL && op_is_compute(next_node) && ggml_can_fuse(graph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL })) {
3226
+ node.add_fused(next_node);
3227
+ node.opcode = HTP_OP_RMS_NORM_MUL;
3228
+ i++; // skip the fused MUL node
3229
+ }
2605
3230
  }
2606
3231
 
2607
- switch (node->op) {
2608
- case GGML_OP_MUL_MAT:
2609
- if (ggml_is_quantized(node->src[0]->type)) {
2610
- ggml_hexagon_dispatch_op<init_binary_req<true>>(sess, node, flags);
2611
- } else {
2612
- ggml_hexagon_dispatch_op<init_binary_req<false>>(sess, node, flags);
2613
- }
2614
- break;
2615
- case GGML_OP_MUL_MAT_ID:
2616
- if (ggml_is_quantized(node->src[0]->type)) {
2617
- ggml_hexagon_dispatch_op<init_binary_id_req<true>>(sess, node, flags);
2618
- } else {
2619
- ggml_hexagon_dispatch_op<init_binary_id_req<false>>(sess, node, flags);
2620
- }
2621
- break;
2622
- case GGML_OP_MUL:
2623
- case GGML_OP_ADD:
2624
- case GGML_OP_SUB:
2625
- case GGML_OP_DIV:
2626
- ggml_hexagon_dispatch_op<init_binary_req<false>>(sess, node, flags);
2627
- break;
2628
- case GGML_OP_ADD_ID:
2629
- ggml_hexagon_dispatch_op<init_binary_id_req<false>>(sess, node, flags);
2630
- break;
2631
- case GGML_OP_RMS_NORM:
2632
- case GGML_OP_SCALE:
2633
- ggml_hexagon_dispatch_op<init_unary_req>(sess, node, flags);
2634
- break;
2635
- case GGML_OP_SQR:
2636
- case GGML_OP_SQRT:
2637
- ggml_hexagon_dispatch_op<init_unary_req>(sess, node, flags);
2638
- break;
2639
- case GGML_OP_SUM_ROWS:
2640
- ggml_hexagon_dispatch_op<init_sum_rows_req>(sess, node, flags);
2641
- break;
2642
- case GGML_OP_UNARY:
2643
- if ((ggml_get_unary_op(node) == GGML_UNARY_OP_SILU) ||
2644
- (ggml_get_unary_op(node) == GGML_UNARY_OP_GELU)) {
2645
- ggml_hexagon_dispatch_op<init_unary_req>(sess, node, flags);
2646
- }
2647
- break;
2648
- case GGML_OP_GLU:
2649
- if ((ggml_get_glu_op(node) == GGML_GLU_OP_SWIGLU) ||
2650
- (ggml_get_glu_op(node) == GGML_GLU_OP_SWIGLU_OAI) ||
2651
- (ggml_get_glu_op(node) == GGML_GLU_OP_GEGLU)) {
2652
- ggml_hexagon_dispatch_op<init_unary_req>(sess, node, flags);
2653
- }
2654
- break;
2655
- case GGML_OP_SOFT_MAX:
2656
- ggml_hexagon_dispatch_op<init_unary_req>(sess, node, flags);
2657
- break;
2658
-
2659
- case GGML_OP_ROPE:
2660
- ggml_hexagon_dispatch_op<init_rope_req>(sess, node, flags);
2661
- break;
2662
-
2663
- case GGML_OP_FLASH_ATTN_EXT:
2664
- ggml_hexagon_dispatch_op<init_flash_attn_ext_req>(sess, node, flags);
2665
- break;
2666
-
2667
- case GGML_OP_SET_ROWS:
2668
- ggml_hexagon_dispatch_op<init_set_rows_req>(sess, node, flags);
2669
- break;
2670
-
2671
- case GGML_OP_GET_ROWS:
2672
- ggml_hexagon_dispatch_op<init_get_rows_req>(sess, node, flags);
2673
- break;
2674
-
2675
- case GGML_OP_CPY:
2676
- ggml_hexagon_dispatch_op<init_cpy_req>(sess, node, flags);
2677
- break;
2678
-
2679
- case GGML_OP_ARGSORT:
2680
- ggml_hexagon_dispatch_op<init_argsort_req>(sess, node, flags);
2681
- break;
3232
+ if (node.opcode == HTP_OP_INVALID) {
3233
+ node.opcode = op_remap_to_htp(n);
3234
+ }
2682
3235
 
2683
- case GGML_OP_SSM_CONV:
2684
- ggml_hexagon_dispatch_op<init_ssm_conv_req>(sess, node, flags);
2685
- break;
3236
+ nodes.push_back(std::move(node));
3237
+ }
2686
3238
 
2687
- default:
2688
- GGML_ABORT("\nggml-hex: graph-compute %s is not supported\n", ggml_op_desc(node));
3239
+ // Queue and execute
3240
+ if (opt_opstage & HTP_OPSTAGE_QUEUE) {
3241
+ for (const auto & node : nodes) {
3242
+ sess->enqueue_op(node);
2689
3243
  }
2690
3244
  }
2691
3245
 
@@ -2698,57 +3252,13 @@ static ggml_status ggml_backend_hexagon_graph_compute(ggml_backend_t backend, gg
2698
3252
  static void ggml_backend_hexagon_synchronize(ggml_backend_t backend) {
2699
3253
  auto sess = static_cast<ggml_hexagon_session *>(backend->context);
2700
3254
 
2701
- HEX_VERBOSE("ggml-hex: %s synchronize\n", sess->name.c_str());
3255
+ HEX_VERBOSE("ggml-hex: %s synchronize\n", sess->c_name());
2702
3256
 
2703
3257
  // Wait until all pending ops complete
2704
3258
  sess->flush();
2705
3259
  }
2706
3260
 
2707
- struct node_info {
2708
- ggml_tensor * node;
2709
-
2710
- std::vector<ggml_tensor *> fused;
2711
-
2712
- ggml_op op() const {
2713
- return node->op;
2714
- }
2715
-
2716
- const ggml_tensor * dst() const {
2717
- return fused.empty() ? node : fused.back();
2718
- }
2719
-
2720
- const ggml_tensor * src0() const {
2721
- return node->src[0];
2722
- }
2723
-
2724
- const ggml_tensor * src1() const {
2725
- return node->src[1];
2726
- }
2727
-
2728
- bool is_empty() const {
2729
- return ggml_op_is_empty(node->op);
2730
- }
2731
-
2732
- void add_fused(ggml_tensor * t) {
2733
- fused.push_back(t);
2734
- }
2735
-
2736
- bool stackable() const {
2737
- switch (this->op()) {
2738
- case GGML_OP_MUL_MAT:
2739
- case GGML_OP_MUL_MAT_ID:
2740
- return ggml_is_quantized(this->src0()->type);
2741
- default:
2742
- return false;
2743
- }
2744
- }
2745
-
2746
- bool same_input(const node_info& n) const {
2747
- return n.src1() == this->src1();
2748
- }
2749
- };
2750
-
2751
- static std::vector<int> ggml_hexagon_graph_optimize_reorder(const std::vector<node_info> & nodes) {
3261
+ static std::vector<int> ggml_hexagon_graph_optimize_reorder(const std::vector<htp_opnode> & nodes) {
2752
3262
  const int n = nodes.size();
2753
3263
 
2754
3264
  std::vector<int> res;
@@ -2802,14 +3312,14 @@ static void ggml_backend_hexagon_graph_optimize(ggml_backend_t backend, ggml_cgr
2802
3312
 
2803
3313
  enum ggml_op ops[MAX_FUSE];
2804
3314
 
2805
- std::vector<node_info> nodes;
3315
+ std::vector<htp_opnode> nodes;
2806
3316
  nodes.reserve(gf->n_nodes);
2807
3317
 
2808
3318
  // fuse nodes:
2809
3319
  // we don't want to make reorders that break fusing, so we first pack all fusable tensors
2810
3320
  // and perform the reorder over the fused nodes. after the reorder is done, we unfuse
2811
3321
  for (int i = 0; i < n; i++) {
2812
- node_info node = {
3322
+ htp_opnode node = {
2813
3323
  /*.node =*/gf->nodes[i],
2814
3324
  /*.fused =*/{},
2815
3325
  };
@@ -2876,6 +3386,8 @@ static struct ggml_backend_i hexagon_backend_i = {
2876
3386
  /* .free = */ ggml_backend_hexagon_free,
2877
3387
  /* .set_tensor_async = */ NULL,
2878
3388
  /* .get_tensor_async = */ NULL,
3389
+ /* .set_tensor_2d_async = */ NULL,
3390
+ /* .get_tensor_2d_async = */ NULL,
2879
3391
  /* .cpy_tensor_async = */ NULL,
2880
3392
  /* .synchronize = */ ggml_backend_hexagon_synchronize,
2881
3393
  /* .graph_plan_create = */ NULL,
@@ -2915,7 +3427,7 @@ static ggml_backend_t ggml_backend_hexagon_device_init(ggml_backend_dev_t dev, c
2915
3427
 
2916
3428
  static const char * ggml_backend_hexagon_device_get_name(ggml_backend_dev_t dev) {
2917
3429
  auto sess = static_cast<ggml_hexagon_session *>(dev->context);
2918
- return sess->name.c_str();
3430
+ return sess->c_name();
2919
3431
 
2920
3432
  GGML_UNUSED(dev);
2921
3433
  }
@@ -2926,8 +3438,7 @@ static const char * ggml_backend_hexagon_device_get_description(ggml_backend_dev
2926
3438
  }
2927
3439
 
2928
3440
  static void ggml_backend_hexagon_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
2929
- // ~2GB per session for now
2930
- *free = 2ULL * 1024 * 1024 * 1024;
3441
+ *free = 0;
2931
3442
  *total = *free;
2932
3443
 
2933
3444
  GGML_UNUSED(dev);
@@ -3006,9 +3517,77 @@ static bool ggml_hexagon_supported_cpy(const struct ggml_hexagon_session * sess,
3006
3517
  return true;
3007
3518
  }
3008
3519
 
3520
+ static bool ggml_hexagon_supported_cont(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
3521
+ GGML_UNUSED(sess);
3522
+ const struct ggml_tensor * src0 = op->src[0];
3523
+
3524
+ // CONT is same-type only, supports f32 and f16
3525
+ if (src0->type != GGML_TYPE_F32 && src0->type != GGML_TYPE_F16) return false;
3526
+
3527
+ return true;
3528
+ }
3529
+
3530
+ static bool ggml_hexagon_supported_repeat(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
3531
+ GGML_UNUSED(sess);
3532
+ const struct ggml_tensor * src0 = op->src[0];
3533
+ const struct ggml_tensor * dst = op;
3534
+
3535
+ // Support f32 and f16
3536
+ if (src0->type != GGML_TYPE_F32 && src0->type != GGML_TYPE_F16) return false;
3537
+
3538
+ // src and dst must be the same type
3539
+ if (src0->type != dst->type) return false;
3540
+
3541
+ // dst dims must be multiples of src dims
3542
+ if (dst->ne[0] % src0->ne[0] != 0) return false;
3543
+ if (dst->ne[1] % src0->ne[1] != 0) return false;
3544
+ if (dst->ne[2] % src0->ne[2] != 0) return false;
3545
+ if (dst->ne[3] % src0->ne[3] != 0) return false;
3546
+
3547
+ // require contiguous tensors (no transposition)
3548
+ if (ggml_is_transposed(src0) || ggml_is_transposed(dst)) return false;
3549
+
3550
+ return true;
3551
+ }
3552
+
3553
+ static bool ggml_hexagon_supported_concat(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
3554
+ int dim = ((const int32_t *) op->op_params)[0];
3555
+ if (dim < 0 || dim >= GGML_MAX_DIMS) {
3556
+ return false;
3557
+ }
3558
+
3559
+ for (int i = 0; i < GGML_MAX_SRC; ++i) {
3560
+ const struct ggml_tensor * src = op->src[i];
3561
+ if (!src) {
3562
+ continue;
3563
+ }
3564
+ if (src->type != GGML_TYPE_F32 && src->type != GGML_TYPE_I32 && src->type != GGML_TYPE_F16) {
3565
+ return false;
3566
+ }
3567
+ }
3568
+
3569
+ return true;
3570
+ }
3571
+
3572
+ static bool ggml_hexagon_supported_fill(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
3573
+ const struct ggml_tensor * dst = op;
3574
+
3575
+ if (dst->type != GGML_TYPE_F32 && dst->type != GGML_TYPE_F16) {
3576
+ return false;
3577
+ }
3578
+
3579
+ GGML_UNUSED(sess);
3580
+ return true;
3581
+ }
3582
+
3009
3583
  static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
3010
3584
  auto sess = static_cast<ggml_hexagon_session *>(dev->context);
3011
3585
 
3586
+ // reject ops that match the filter
3587
+ if (opt_opfilter && std::regex_match(ggml_op_desc(op), *opt_opfilter)) {
3588
+ return false;
3589
+ }
3590
+
3012
3591
  // all srcs & dsts must be mapped to the same session
3013
3592
  if (!ggml_hexagon_supported_buffers(sess, op)) {
3014
3593
  ggml_hexagon_dump_op_supp(sess->name, op, false);
@@ -3025,6 +3604,13 @@ static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, cons
3025
3604
  supp = true;
3026
3605
  break;
3027
3606
 
3607
+ case GGML_OP_MUL:
3608
+ case GGML_OP_ADD:
3609
+ case GGML_OP_SUB:
3610
+ case GGML_OP_DIV:
3611
+ supp = ggml_hexagon_supported_binary(sess, op);
3612
+ break;
3613
+
3028
3614
  case GGML_OP_MUL_MAT:
3029
3615
  supp = ggml_hexagon_supported_mul_mat(sess, op);
3030
3616
  break;
@@ -3033,17 +3619,12 @@ static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, cons
3033
3619
  supp = ggml_hexagon_supported_mul_mat_id(sess, op);
3034
3620
  break;
3035
3621
 
3036
- case GGML_OP_MUL:
3037
- case GGML_OP_ADD:
3038
- case GGML_OP_SUB:
3039
- case GGML_OP_DIV:
3040
- supp = ggml_hexagon_supported_binary(sess, op);
3041
- break;
3042
-
3043
3622
  case GGML_OP_ADD_ID:
3044
3623
  supp = ggml_hexagon_supported_add_id(sess, op);
3045
3624
  break;
3046
3625
 
3626
+ case GGML_OP_NORM:
3627
+ case GGML_OP_L2_NORM:
3047
3628
  case GGML_OP_RMS_NORM:
3048
3629
  case GGML_OP_SCALE:
3049
3630
  supp = ggml_hexagon_supported_unary(sess, op);
@@ -3063,21 +3644,36 @@ static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, cons
3063
3644
  break;
3064
3645
 
3065
3646
  case GGML_OP_UNARY:
3066
- {
3067
- const auto unary_op = ggml_get_unary_op(op);
3068
- if (unary_op == GGML_UNARY_OP_SILU || unary_op == GGML_UNARY_OP_GELU) {
3647
+ switch (ggml_get_unary_op(op)) {
3648
+ case GGML_UNARY_OP_NEG:
3649
+ case GGML_UNARY_OP_EXP:
3650
+ case GGML_UNARY_OP_SIGMOID:
3651
+ case GGML_UNARY_OP_SOFTPLUS:
3652
+ case GGML_UNARY_OP_TANH:
3653
+ supp = ggml_hexagon_supported_unary(sess, op);
3654
+ break;
3655
+ case GGML_UNARY_OP_SILU:
3656
+ case GGML_UNARY_OP_GELU:
3657
+ case GGML_UNARY_OP_GELU_QUICK:
3069
3658
  supp = ggml_hexagon_supported_activations(sess, op);
3070
- }
3071
- break;
3659
+ break;
3660
+ default:
3661
+ break;
3072
3662
  }
3663
+ break;
3664
+
3073
3665
  case GGML_OP_GLU:
3074
- {
3075
- const auto glu_op = ggml_get_glu_op(op);
3076
- if ((glu_op == GGML_GLU_OP_SWIGLU) || (glu_op == GGML_GLU_OP_SWIGLU_OAI) || (glu_op == GGML_GLU_OP_GEGLU)) {
3666
+ switch (ggml_get_glu_op(op)) {
3667
+ case GGML_GLU_OP_SWIGLU:
3668
+ case GGML_GLU_OP_SWIGLU_OAI:
3669
+ case GGML_GLU_OP_GEGLU:
3077
3670
  supp = ggml_hexagon_supported_activations(sess, op);
3078
- }
3079
- break;
3671
+ break;
3672
+ default:
3673
+ break;
3080
3674
  }
3675
+ break;
3676
+
3081
3677
  case GGML_OP_ROPE:
3082
3678
  supp = ggml_hexagon_supported_rope(sess, op);
3083
3679
  break;
@@ -3098,6 +3694,14 @@ static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, cons
3098
3694
  supp = ggml_hexagon_supported_cpy(sess, op);
3099
3695
  break;
3100
3696
 
3697
+ case GGML_OP_CONT:
3698
+ supp = ggml_hexagon_supported_cont(sess, op);
3699
+ break;
3700
+
3701
+ case GGML_OP_REPEAT:
3702
+ supp = ggml_hexagon_supported_repeat(sess, op);
3703
+ break;
3704
+
3101
3705
  case GGML_OP_ARGSORT:
3102
3706
  supp = ggml_hexagon_supported_argsort(sess, op);
3103
3707
  break;
@@ -3106,6 +3710,38 @@ static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, cons
3106
3710
  supp = ggml_hexagon_supported_ssm_conv(sess, op);
3107
3711
  break;
3108
3712
 
3713
+ case GGML_OP_GATED_DELTA_NET:
3714
+ supp = ggml_hexagon_supported_gated_delta_net(sess, op);
3715
+ break;
3716
+
3717
+ case GGML_OP_CUMSUM:
3718
+ supp = ggml_hexagon_supported_cumsum(sess, op);
3719
+ break;
3720
+
3721
+ case GGML_OP_CONCAT:
3722
+ supp = ggml_hexagon_supported_concat(sess, op);
3723
+ break;
3724
+
3725
+ case GGML_OP_FILL:
3726
+ supp = ggml_hexagon_supported_fill(sess, op);
3727
+ break;
3728
+
3729
+ case GGML_OP_DIAG:
3730
+ supp = ggml_hexagon_supported_diag(sess, op);
3731
+ break;
3732
+
3733
+ case GGML_OP_SOLVE_TRI:
3734
+ supp = ggml_hexagon_supported_solve_tri(sess, op);
3735
+ break;
3736
+
3737
+ case GGML_OP_TRI:
3738
+ supp = ggml_hexagon_supported_tri(sess, op);
3739
+ break;
3740
+
3741
+ case GGML_OP_PAD:
3742
+ supp = ggml_hexagon_supported_pad(sess, op);
3743
+ break;
3744
+
3109
3745
  default:
3110
3746
  break;
3111
3747
  }
@@ -3172,21 +3808,6 @@ struct ggml_hexagon_registry {
3172
3808
  ggml_hexagon_registry::ggml_hexagon_registry(ggml_backend_reg_t reg) {
3173
3809
  GGML_LOG_INFO("ggml-hex: Hexagon backend (experimental) : allocating new registry : ndev %zu\n", opt_ndev);
3174
3810
 
3175
- if (!opt_arch) {
3176
- int err = get_hex_arch_ver(CDSP_DOMAIN_ID, &opt_arch);
3177
- if (err != 0) {
3178
- GGML_LOG_ERROR("ggml-hex: failed to query HTP version (err %d) defaulting to v73\n", err);
3179
- opt_arch = 73;
3180
- }
3181
- }
3182
-
3183
- #if defined(__ANDROID__)
3184
- if (opt_arch < 75) {
3185
- opt_ndev = 1;
3186
- GGML_LOG_WARN("ggml-hex: forcing ndev to 1 for SoCs archs lower than v75.\n");
3187
- }
3188
- #endif
3189
-
3190
3811
  GGML_LOG_INFO("ggml-hex: Hexagon Arch version v%d\n", opt_arch);
3191
3812
 
3192
3813
  // Create devices / sessions
@@ -3241,53 +3862,117 @@ static void * ggml_backend_hexagon_get_proc_address(ggml_backend_reg_t reg, cons
3241
3862
  return NULL;
3242
3863
  }
3243
3864
 
3865
// Parses a comma-separated list of unsigned integers into a vector of T.
//
// str : list such as "1,0x10,010"; each token is converted with base
//       auto-detection (decimal, 0x-prefixed hex, 0-prefixed octal).
//       A null pointer is treated like an empty list.
// Returns the parsed values in order. If any token cannot be converted (empty
// token, non-numeric text, value out of range for unsigned long) the whole
// input is considered malformed and an empty vector is returned, so callers can
// reject it by size instead of having to deal with an exception.
template<typename T> std::vector<T> str_to_vec(const char* str) {
    std::vector<T> values;
    if (str == nullptr) {
        return values;
    }

    std::stringstream ss(str);
    std::string       token;

    try {
        while (std::getline(ss, token, ',')) {
            values.push_back(static_cast<T>(std::stoul(token, nullptr, 0)));
        }
    } catch (const std::exception &) {
        // std::stoul reports bad tokens via std::invalid_argument / std::out_of_range
        return {};
    }

    return values;
}
3876
+
3877
// Formats the elements of `v` as a comma-separated list.
//
// BASE : numeric base passed to std::setbase (8, 10 or 16); std::showbase is
//        enabled, so hex values carry a "0x" prefix and octal values a "0".
// v    : values to print, in order
// Returns the joined string; an empty vector yields an empty string.
template<typename T, int BASE=10> std::string vec_to_str(const std::vector<T> & v) {
    std::stringstream ss;
    ss << std::setbase(BASE) << std::showbase;

    // Emit the separator *before* every element but the first. This avoids
    // having to strip a trailing comma, which is undefined for an empty list.
    const char * sep = "";
    for (const auto & item : v) {
        ss << sep << item;
        sep = ",";
    }

    return ss.str();
}
+ }
3884
+
3244
3885
  static void ggml_hexagon_init(ggml_backend_reg * reg) {
3245
3886
  // Basic sanity checks to make sure definitions match
3246
3887
  static_assert((unsigned int) HTP_TYPE_Q4_0 == (unsigned int) GGML_TYPE_Q4_0,
3247
3888
  "please update hexagon_type to match ggml_type");
3889
+ static_assert((unsigned int) HTP_TYPE_Q4_1 == (unsigned int) GGML_TYPE_Q4_1,
3890
+ "please update hexagon_type to match ggml_type");
3248
3891
  static_assert((unsigned int) HTP_TYPE_Q8_0 == (unsigned int) GGML_TYPE_Q8_0,
3249
3892
  "please update hexagon_type to match ggml_type");
3250
3893
  static_assert((unsigned int) HTP_TYPE_MXFP4 == (unsigned int) GGML_TYPE_MXFP4,
3251
3894
  "please update hexagon_type to match ggml_type");
3895
+ static_assert((unsigned int) HTP_TYPE_IQ4_NL == (unsigned int) GGML_TYPE_IQ4_NL,
3896
+ "please update hexagon_type to match ggml_type");
3252
3897
 
3253
- const char * str_experimental = getenv("GGML_HEXAGON_EXPERIMENTAL");
3254
- const char * str_verbose = getenv("GGML_HEXAGON_VERBOSE");
3255
- const char * str_hostbuf = getenv("GGML_HEXAGON_HOSTBUF");
3256
- const char * str_opmask = getenv("GGML_HEXAGON_OPMASK");
3257
- const char * str_opsync = getenv("GGML_HEXAGON_OPSYNC");
3258
- const char * str_profile = getenv("GGML_HEXAGON_PROFILE");
3259
- const char * str_etm = getenv("GGML_HEXAGON_ETM");
3260
- const char * str_nhvx = getenv("GGML_HEXAGON_NHVX");
3261
- const char * str_ndev = getenv("GGML_HEXAGON_NDEV");
3262
- const char * str_arch = getenv("GGML_HEXAGON_ARCH");
3263
-
3264
- opt_experimental = str_experimental ? atoi(str_experimental) : 0;
3265
- opt_verbose = str_verbose ? atoi(str_verbose) : 0;
3266
- opt_hostbuf = str_hostbuf ? atoi(str_hostbuf) : opt_hostbuf;
3267
- opt_opmask = str_opmask ? strtoul(str_opmask, NULL, 0) : opt_opmask;
3268
- opt_opsync = str_opsync ? atoi(str_opsync) : 0;
3269
- opt_profile = str_profile ? atoi(str_profile) : 0;
3270
- opt_etm = str_etm ? atoi(str_etm) : 0;
3271
- opt_nhvx = str_nhvx ? strtoul(str_nhvx, NULL, 0) : opt_nhvx;
3272
- opt_ndev = str_ndev ? strtoul(str_ndev, NULL, 0) : opt_ndev;
3898
+ const char * str_verbose = getenv("GGML_HEXAGON_VERBOSE");
3899
+ const char * str_hostbuf = getenv("GGML_HEXAGON_HOSTBUF");
3900
+ const char * str_opstage = getenv("GGML_HEXAGON_OPSTAGE");
3901
+ const char * str_opbatch = getenv("GGML_HEXAGON_OPBATCH");
3902
+ const char * str_opqueue = getenv("GGML_HEXAGON_OPQUEUE");
3903
+ const char * str_oppoll = getenv("GGML_HEXAGON_OPPOLL");
3904
+ const char * str_opfilter = getenv("GGML_HEXAGON_OPFILTER");
3905
+ const char * str_profile = getenv("GGML_HEXAGON_PROFILE");
3906
+ const char * str_etm = getenv("GGML_HEXAGON_ETM");
3907
+ const char * str_nhvx = getenv("GGML_HEXAGON_NHVX");
3908
+ const char * str_use_hmx = getenv("GGML_HEXAGON_USE_HMX");
3909
+ const char * str_ndev = getenv("GGML_HEXAGON_NDEV");
3910
+ const char * str_arch = getenv("GGML_HEXAGON_ARCH");
3911
+ const char * str_vmem = getenv("GGML_HEXAGON_VMEM");
3912
+ const char * str_mbuf = getenv("GGML_HEXAGON_MBUF");
3913
+
3914
+ // Init Arch first since it affects other defaults
3915
+ if (!str_arch) {
3916
+ int err = get_hex_arch_ver(CDSP_DOMAIN_ID, &opt_arch);
3917
+ if (err != 0) {
3918
+ GGML_LOG_ERROR("ggml-hex: failed to query HTP version (err %d) defaulting to v73\n", err);
3919
+ opt_arch = 73;
3920
+ }
3921
+ } else {
3922
+ if (str_arch[0] == 'v' || str_arch[0] == 'V') {
3923
+ str_arch++;
3924
+ }
3925
+ opt_arch = strtoul(str_arch, NULL, 0);
3926
+ }
3927
+
3928
+ size_t MiB = 1024 * 1024;
3929
+
3930
+ // Update vmem default
3931
+ opt_vmem = opt_arch >= 75 ? HTP_OP_MAX_VMEM_DEFAULT : 3000 * MiB;
3932
+
3933
+ auto RE_ICASE = std::regex_constants::icase;
3934
+
3935
+ opt_opfilter = str_opfilter ? new std::regex(str_opfilter, RE_ICASE) : NULL;
3936
+ opt_verbose = str_verbose ? atoi(str_verbose) : 0;
3937
+ opt_hostbuf = str_hostbuf ? atoi(str_hostbuf) : opt_hostbuf;
3938
+ opt_opstage = str_opstage ? strtoul(str_opstage, NULL, 0) : opt_opstage;
3939
+ opt_opbatch = str_opbatch ? strtoul(str_opbatch, NULL, 0) : opt_opbatch;
3940
+ opt_opqueue = str_opqueue ? strtoul(str_opqueue, NULL, 0) : opt_opqueue;
3941
+ opt_oppoll = str_oppoll ? strtoul(str_oppoll, NULL, 0) : opt_oppoll;
3942
+ opt_profile = str_profile ? atoi(str_profile) : 0;
3943
+ opt_etm = str_etm ? atoi(str_etm) : 0;
3944
+ opt_nhvx = str_nhvx ? strtoul(str_nhvx, NULL, 0) : opt_nhvx;
3945
+ opt_use_hmx = str_use_hmx ? atoi(str_use_hmx) : opt_use_hmx;
3946
+ opt_ndev = str_ndev ? strtoul(str_ndev, NULL, 0) : opt_ndev;
3947
+ opt_hostbuf = str_hostbuf ? atoi(str_hostbuf) : opt_hostbuf;
3948
+ opt_mbuf = str_mbuf ? strtoul(str_mbuf, NULL, 0) * MiB : opt_mbuf;
3949
+ opt_vmem = str_vmem ? strtoul(str_vmem, NULL, 0) * MiB : opt_vmem;
3273
3950
 
3274
3951
  if (opt_ndev > GGML_HEXAGON_MAX_SESSIONS) {
3275
3952
  opt_ndev = GGML_HEXAGON_MAX_SESSIONS;
3276
3953
  }
3277
3954
 
3278
- if (str_arch) {
3279
- if (str_arch[0] == 'v') {
3280
- str_arch++;
3281
- }
3282
- opt_arch = strtoul(str_arch, NULL, 0);
3955
+ #if defined(__ANDROID__)
3956
+ if (opt_arch < 75) {
3957
+ opt_ndev = 1;
3958
+ GGML_LOG_WARN("ggml-hex: forcing ndev to 1 for SoCs archs lower than v75.\n");
3283
3959
  }
3960
+ #endif
3284
3961
 
3285
- opt_hostbuf = str_hostbuf ? atoi(str_hostbuf) : 1;
3962
+ if (str_profile) {
3963
+ opt_pmu_evt = [&]() -> std::vector<uint32_t> {
3964
+ auto v = str_to_vec<uint32_t>(str_profile);
3965
+ switch (v.size()) {
3966
+ case 1: opt_profile = v[0]; return opt_pmu_evt; // mode with default pmu events
3967
+ case 8: opt_profile = 2; return v; // mode with custom pmu events
3968
+ default: opt_profile = 0; return {}; // garbage input
3969
+ }}();
3970
+ if (opt_profile == 1) opt_pmu_evt = {};
3971
+ GGML_LOG_INFO("ggml-hex: Profiling mode %u : pmu-evt [ %s ]\n", opt_profile,
3972
+ vec_to_str<uint32_t, 16>(opt_pmu_evt).c_str());
3973
+ }
3286
3974
 
3287
3975
  reg->context = new ggml_hexagon_registry(reg);
3288
-
3289
- HEX_VERBOSE("ggml-hex: size-of-general-req %zu size-of-general-rsp %zu\n", sizeof(struct htp_general_req),
3290
- sizeof(struct htp_general_rsp));
3291
3976
  }
3292
3977
 
3293
3978
  static const struct ggml_backend_reg_i ggml_backend_hexagon_reg_i = {