whispercpp 1.3.5 → 1.3.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (610)
  1. checksums.yaml +4 -4
  2. data/LICENSE +1 -1
  3. data/README.md +99 -2
  4. data/ext/extconf.rb +1 -0
  5. data/ext/ruby_whisper.c +20 -4
  6. data/ext/ruby_whisper.h +30 -2
  7. data/ext/ruby_whisper_context.c +216 -124
  8. data/ext/ruby_whisper_context_params.c +163 -0
  9. data/ext/ruby_whisper_model.c +0 -1
  10. data/ext/ruby_whisper_params.c +0 -1
  11. data/ext/ruby_whisper_segment.c +0 -1
  12. data/ext/ruby_whisper_token.c +29 -9
  13. data/ext/ruby_whisper_transcribe.cpp +4 -1
  14. data/ext/ruby_whisper_vad_context.c +48 -1
  15. data/ext/ruby_whisper_vad_context_detect.cpp +6 -5
  16. data/ext/ruby_whisper_vad_params.c +0 -1
  17. data/ext/ruby_whisper_vad_segment.c +0 -1
  18. data/ext/ruby_whisper_vad_segments.c +0 -1
  19. data/ext/sources/CMakeLists.txt +1 -1
  20. data/ext/sources/bindings/javascript/package.json +1 -1
  21. data/ext/sources/cmake/whisper-config.cmake.in +5 -40
  22. data/ext/sources/examples/bench/bench.cpp +23 -18
  23. data/ext/sources/examples/cli/cli.cpp +8 -0
  24. data/ext/sources/examples/common-ggml.cpp +2 -0
  25. data/ext/sources/examples/miniaudio.h +4507 -2131
  26. data/ext/sources/examples/server/server.cpp +18 -4
  27. data/ext/sources/examples/talk-llama/CMakeLists.txt +3 -2
  28. data/ext/sources/examples/talk-llama/llama-adapter.cpp +7 -13
  29. data/ext/sources/examples/talk-llama/llama-adapter.h +4 -3
  30. data/ext/sources/examples/talk-llama/llama-arch.cpp +335 -17
  31. data/ext/sources/examples/talk-llama/llama-arch.h +42 -0
  32. data/ext/sources/examples/talk-llama/llama-batch.cpp +3 -1
  33. data/ext/sources/examples/talk-llama/llama-chat.cpp +21 -1
  34. data/ext/sources/examples/talk-llama/llama-chat.h +1 -0
  35. data/ext/sources/examples/talk-llama/llama-context.cpp +508 -520
  36. data/ext/sources/examples/talk-llama/llama-context.h +27 -28
  37. data/ext/sources/examples/talk-llama/llama-cparams.h +5 -0
  38. data/ext/sources/examples/talk-llama/llama-ext.h +12 -0
  39. data/ext/sources/examples/talk-llama/llama-grammar.cpp +8 -8
  40. data/ext/sources/examples/talk-llama/llama-graph.cpp +583 -130
  41. data/ext/sources/examples/talk-llama/llama-graph.h +131 -10
  42. data/ext/sources/examples/talk-llama/llama-hparams.cpp +57 -40
  43. data/ext/sources/examples/talk-llama/llama-hparams.h +79 -10
  44. data/ext/sources/examples/talk-llama/llama-impl.cpp +4 -4
  45. data/ext/sources/examples/talk-llama/llama-impl.h +13 -1
  46. data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.cpp +3 -1
  47. data/ext/sources/examples/talk-llama/llama-kv-cache.cpp +274 -89
  48. data/ext/sources/examples/talk-llama/llama-kv-cache.h +2 -3
  49. data/ext/sources/examples/talk-llama/llama-memory-hybrid-iswa.cpp +275 -0
  50. data/ext/sources/examples/talk-llama/llama-memory-hybrid-iswa.h +140 -0
  51. data/ext/sources/examples/talk-llama/llama-memory-recurrent.cpp +11 -13
  52. data/ext/sources/examples/talk-llama/llama-mmap.cpp +28 -11
  53. data/ext/sources/examples/talk-llama/llama-model-loader.cpp +527 -119
  54. data/ext/sources/examples/talk-llama/llama-model-loader.h +35 -5
  55. data/ext/sources/examples/talk-llama/llama-model-saver.cpp +60 -46
  56. data/ext/sources/examples/talk-llama/llama-model-saver.h +5 -2
  57. data/ext/sources/examples/talk-llama/llama-model.cpp +1365 -647
  58. data/ext/sources/examples/talk-llama/llama-model.h +72 -19
  59. data/ext/sources/examples/talk-llama/llama-quant.cpp +578 -346
  60. data/ext/sources/examples/talk-llama/{llama-sampling.cpp → llama-sampler.cpp} +190 -76
  61. data/ext/sources/examples/talk-llama/{llama-sampling.h → llama-sampler.h} +0 -2
  62. data/ext/sources/examples/talk-llama/llama-vocab.cpp +118 -48
  63. data/ext/sources/examples/talk-llama/llama-vocab.h +5 -0
  64. data/ext/sources/examples/talk-llama/llama.cpp +76 -22
  65. data/ext/sources/examples/talk-llama/llama.h +63 -30
  66. data/ext/sources/examples/talk-llama/models/afmoe.cpp +2 -3
  67. data/ext/sources/examples/talk-llama/models/apertus.cpp +3 -3
  68. data/ext/sources/examples/talk-llama/models/arcee.cpp +3 -3
  69. data/ext/sources/examples/talk-llama/models/arctic.cpp +4 -5
  70. data/ext/sources/examples/talk-llama/models/baichuan.cpp +4 -3
  71. data/ext/sources/examples/talk-llama/models/bailingmoe.cpp +1 -2
  72. data/ext/sources/examples/talk-llama/models/bailingmoe2.cpp +3 -5
  73. data/ext/sources/examples/talk-llama/models/bert.cpp +13 -7
  74. data/ext/sources/examples/talk-llama/models/bitnet.cpp +9 -24
  75. data/ext/sources/examples/talk-llama/models/bloom.cpp +2 -2
  76. data/ext/sources/examples/talk-llama/models/chameleon.cpp +3 -3
  77. data/ext/sources/examples/talk-llama/models/chatglm.cpp +2 -2
  78. data/ext/sources/examples/talk-llama/models/codeshell.cpp +3 -3
  79. data/ext/sources/examples/talk-llama/models/cogvlm.cpp +3 -3
  80. data/ext/sources/examples/talk-llama/models/cohere2-iswa.cpp +2 -2
  81. data/ext/sources/examples/talk-llama/models/command-r.cpp +2 -2
  82. data/ext/sources/examples/talk-llama/models/dbrx.cpp +4 -5
  83. data/ext/sources/examples/talk-llama/models/deci.cpp +3 -3
  84. data/ext/sources/examples/talk-llama/models/deepseek.cpp +4 -6
  85. data/ext/sources/examples/talk-llama/models/deepseek2.cpp +24 -21
  86. data/ext/sources/examples/talk-llama/models/delta-net-base.cpp +445 -0
  87. data/ext/sources/examples/talk-llama/models/dots1.cpp +4 -6
  88. data/ext/sources/examples/talk-llama/models/dream.cpp +3 -3
  89. data/ext/sources/examples/talk-llama/models/ernie4-5-moe.cpp +4 -6
  90. data/ext/sources/examples/talk-llama/models/ernie4-5.cpp +3 -3
  91. data/ext/sources/examples/talk-llama/models/eurobert.cpp +97 -0
  92. data/ext/sources/examples/talk-llama/models/exaone-moe.cpp +145 -0
  93. data/ext/sources/examples/talk-llama/models/exaone.cpp +3 -3
  94. data/ext/sources/examples/talk-llama/models/exaone4.cpp +3 -3
  95. data/ext/sources/examples/talk-llama/models/falcon-h1.cpp +2 -4
  96. data/ext/sources/examples/talk-llama/models/falcon.cpp +3 -3
  97. data/ext/sources/examples/talk-llama/models/gemma-embedding.cpp +1 -1
  98. data/ext/sources/examples/talk-llama/models/gemma.cpp +1 -1
  99. data/ext/sources/examples/talk-llama/models/gemma2-iswa.cpp +1 -1
  100. data/ext/sources/examples/talk-llama/models/gemma3.cpp +1 -1
  101. data/ext/sources/examples/talk-llama/models/gemma3n-iswa.cpp +7 -7
  102. data/ext/sources/examples/talk-llama/models/glm4-moe.cpp +3 -3
  103. data/ext/sources/examples/talk-llama/models/glm4.cpp +14 -7
  104. data/ext/sources/examples/talk-llama/models/gpt2.cpp +2 -2
  105. data/ext/sources/examples/talk-llama/models/gptneox.cpp +2 -2
  106. data/ext/sources/examples/talk-llama/models/granite-hybrid.cpp +4 -5
  107. data/ext/sources/examples/talk-llama/models/granite.cpp +4 -5
  108. data/ext/sources/examples/talk-llama/models/grok.cpp +4 -4
  109. data/ext/sources/examples/talk-llama/models/grovemoe.cpp +5 -7
  110. data/ext/sources/examples/talk-llama/models/hunyuan-dense.cpp +3 -3
  111. data/ext/sources/examples/talk-llama/models/hunyuan-moe.cpp +4 -5
  112. data/ext/sources/examples/talk-llama/models/internlm2.cpp +3 -3
  113. data/ext/sources/examples/talk-llama/models/jais.cpp +2 -2
  114. data/ext/sources/examples/talk-llama/models/jais2.cpp +123 -0
  115. data/ext/sources/examples/talk-llama/models/jamba.cpp +3 -3
  116. data/ext/sources/examples/talk-llama/models/kimi-linear.cpp +381 -0
  117. data/ext/sources/examples/talk-llama/models/lfm2.cpp +145 -124
  118. data/ext/sources/examples/talk-llama/models/llada-moe.cpp +4 -4
  119. data/ext/sources/examples/talk-llama/models/llada.cpp +3 -3
  120. data/ext/sources/examples/talk-llama/models/llama-iswa.cpp +4 -4
  121. data/ext/sources/examples/talk-llama/models/llama.cpp +18 -11
  122. data/ext/sources/examples/talk-llama/models/maincoder.cpp +3 -3
  123. data/ext/sources/examples/talk-llama/models/{graph-context-mamba.cpp → mamba-base.cpp} +9 -3
  124. data/ext/sources/examples/talk-llama/models/mamba.cpp +1 -2
  125. data/ext/sources/examples/talk-llama/models/mimo2-iswa.cpp +11 -5
  126. data/ext/sources/examples/talk-llama/models/minicpm3.cpp +14 -13
  127. data/ext/sources/examples/talk-llama/models/minimax-m2.cpp +4 -5
  128. data/ext/sources/examples/talk-llama/models/mistral3.cpp +4 -4
  129. data/ext/sources/examples/talk-llama/models/models.h +181 -46
  130. data/ext/sources/examples/talk-llama/models/modern-bert.cpp +2 -9
  131. data/ext/sources/examples/talk-llama/models/mpt.cpp +2 -2
  132. data/ext/sources/examples/talk-llama/models/nemotron-h.cpp +26 -14
  133. data/ext/sources/examples/talk-llama/models/nemotron.cpp +3 -3
  134. data/ext/sources/examples/talk-llama/models/neo-bert.cpp +2 -2
  135. data/ext/sources/examples/talk-llama/models/olmo.cpp +3 -3
  136. data/ext/sources/examples/talk-llama/models/olmo2.cpp +3 -3
  137. data/ext/sources/examples/talk-llama/models/olmoe.cpp +4 -4
  138. data/ext/sources/examples/talk-llama/models/openai-moe-iswa.cpp +1 -1
  139. data/ext/sources/examples/talk-llama/models/openelm.cpp +3 -3
  140. data/ext/sources/examples/talk-llama/models/orion.cpp +3 -3
  141. data/ext/sources/examples/talk-llama/models/paddleocr.cpp +122 -0
  142. data/ext/sources/examples/talk-llama/models/pangu-embedded.cpp +3 -3
  143. data/ext/sources/examples/talk-llama/models/phi2.cpp +2 -2
  144. data/ext/sources/examples/talk-llama/models/phi3.cpp +3 -3
  145. data/ext/sources/examples/talk-llama/models/plamo.cpp +3 -3
  146. data/ext/sources/examples/talk-llama/models/plamo2.cpp +9 -5
  147. data/ext/sources/examples/talk-llama/models/plamo3.cpp +2 -2
  148. data/ext/sources/examples/talk-llama/models/plm.cpp +15 -14
  149. data/ext/sources/examples/talk-llama/models/qwen.cpp +2 -2
  150. data/ext/sources/examples/talk-llama/models/qwen2.cpp +3 -3
  151. data/ext/sources/examples/talk-llama/models/qwen2moe.cpp +4 -4
  152. data/ext/sources/examples/talk-llama/models/qwen2vl.cpp +3 -3
  153. data/ext/sources/examples/talk-llama/models/qwen3.cpp +12 -9
  154. data/ext/sources/examples/talk-llama/models/qwen35.cpp +381 -0
  155. data/ext/sources/examples/talk-llama/models/qwen35moe.cpp +422 -0
  156. data/ext/sources/examples/talk-llama/models/qwen3moe.cpp +15 -8
  157. data/ext/sources/examples/talk-llama/models/qwen3next.cpp +84 -432
  158. data/ext/sources/examples/talk-llama/models/qwen3vl-moe.cpp +9 -18
  159. data/ext/sources/examples/talk-llama/models/qwen3vl.cpp +8 -17
  160. data/ext/sources/examples/talk-llama/models/refact.cpp +2 -2
  161. data/ext/sources/examples/talk-llama/models/rnd1.cpp +4 -4
  162. data/ext/sources/examples/talk-llama/models/rwkv6-base.cpp +2 -0
  163. data/ext/sources/examples/talk-llama/models/rwkv7-base.cpp +2 -0
  164. data/ext/sources/examples/talk-llama/models/seed-oss.cpp +3 -3
  165. data/ext/sources/examples/talk-llama/models/smallthinker.cpp +4 -4
  166. data/ext/sources/examples/talk-llama/models/smollm3.cpp +3 -3
  167. data/ext/sources/examples/talk-llama/models/stablelm.cpp +2 -2
  168. data/ext/sources/examples/talk-llama/models/starcoder.cpp +2 -2
  169. data/ext/sources/examples/talk-llama/models/starcoder2.cpp +3 -3
  170. data/ext/sources/examples/talk-llama/models/step35-iswa.cpp +165 -0
  171. data/ext/sources/examples/talk-llama/models/t5-dec.cpp +2 -2
  172. data/ext/sources/examples/talk-llama/models/t5-enc.cpp +2 -2
  173. data/ext/sources/examples/talk-llama/models/xverse.cpp +3 -3
  174. data/ext/sources/examples/talk-llama/unicode.cpp +21 -65
  175. data/ext/sources/ggml/CMakeLists.txt +9 -3
  176. data/ext/sources/ggml/include/ggml-backend.h +1 -1
  177. data/ext/sources/ggml/include/ggml-cann.h +1 -1
  178. data/ext/sources/ggml/include/ggml-cpu.h +5 -0
  179. data/ext/sources/ggml/include/ggml-openvino.h +37 -0
  180. data/ext/sources/ggml/include/ggml-opt.h +1 -1
  181. data/ext/sources/ggml/include/ggml-rpc.h +6 -1
  182. data/ext/sources/ggml/include/ggml-virtgpu.h +14 -0
  183. data/ext/sources/ggml/include/ggml.h +56 -9
  184. data/ext/sources/ggml/src/CMakeLists.txt +3 -0
  185. data/ext/sources/ggml/src/ggml-alloc.c +4 -9
  186. data/ext/sources/ggml/src/ggml-backend-dl.cpp +48 -0
  187. data/ext/sources/ggml/src/ggml-backend-dl.h +45 -0
  188. data/ext/sources/ggml/src/ggml-backend-reg.cpp +28 -86
  189. data/ext/sources/ggml/src/ggml-backend.cpp +5 -2
  190. data/ext/sources/ggml/src/ggml-blas/CMakeLists.txt +1 -1
  191. data/ext/sources/ggml/src/ggml-blas/ggml-blas.cpp +6 -2
  192. data/ext/sources/ggml/src/ggml-cann/acl_tensor.cpp +1 -1
  193. data/ext/sources/ggml/src/ggml-cann/acl_tensor.h +1 -1
  194. data/ext/sources/ggml/src/ggml-cann/aclnn_ops.cpp +348 -189
  195. data/ext/sources/ggml/src/ggml-cann/aclnn_ops.h +40 -85
  196. data/ext/sources/ggml/src/ggml-cann/common.h +3 -4
  197. data/ext/sources/ggml/src/ggml-cann/ggml-cann.cpp +44 -62
  198. data/ext/sources/ggml/src/ggml-common.h +11 -0
  199. data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +16 -11
  200. data/ext/sources/ggml/src/ggml-cpu/amx/amx.cpp +42 -19
  201. data/ext/sources/ggml/src/ggml-cpu/amx/common.h +34 -10
  202. data/ext/sources/ggml/src/ggml-cpu/amx/mmq.cpp +85 -85
  203. data/ext/sources/ggml/src/ggml-cpu/arch/arm/quants.c +85 -1
  204. data/ext/sources/ggml/src/ggml-cpu/arch/arm/repack.cpp +2744 -548
  205. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/quants.c +1653 -0
  206. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/repack.cpp +1391 -0
  207. data/ext/sources/ggml/src/ggml-cpu/arch/s390/quants.c +8 -10
  208. data/ext/sources/ggml/src/ggml-cpu/arch/x86/quants.c +9 -9
  209. data/ext/sources/ggml/src/ggml-cpu/arch/x86/repack.cpp +118 -18
  210. data/ext/sources/ggml/src/ggml-cpu/arch-fallback.h +107 -26
  211. data/ext/sources/ggml/src/ggml-cpu/binary-ops.cpp +2 -6
  212. data/ext/sources/ggml/src/ggml-cpu/common.h +8 -0
  213. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-impl.h +3 -0
  214. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +59 -12
  215. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.cpp +15 -0
  216. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.cpp +21 -20
  217. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +965 -252
  218. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +584 -197
  219. data/ext/sources/ggml/src/ggml-cpu/ops.cpp +903 -188
  220. data/ext/sources/ggml/src/ggml-cpu/ops.h +1 -0
  221. data/ext/sources/ggml/src/ggml-cpu/quants.c +40 -0
  222. data/ext/sources/ggml/src/ggml-cpu/quants.h +3 -0
  223. data/ext/sources/ggml/src/ggml-cpu/repack.cpp +2890 -679
  224. data/ext/sources/ggml/src/ggml-cpu/repack.h +119 -8
  225. data/ext/sources/ggml/src/ggml-cpu/simd-gemm.h +136 -0
  226. data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +111 -3
  227. data/ext/sources/ggml/src/ggml-cpu/unary-ops.cpp +1 -1
  228. data/ext/sources/ggml/src/ggml-cpu/vec.cpp +17 -0
  229. data/ext/sources/ggml/src/ggml-cuda/CMakeLists.txt +1 -1
  230. data/ext/sources/ggml/src/ggml-cuda/argsort.cu +19 -10
  231. data/ext/sources/ggml/src/ggml-cuda/binbcast.cu +32 -30
  232. data/ext/sources/ggml/src/ggml-cuda/common.cuh +134 -18
  233. data/ext/sources/ggml/src/ggml-cuda/convert.cu +41 -27
  234. data/ext/sources/ggml/src/ggml-cuda/cpy.cu +6 -3
  235. data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +78 -64
  236. data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +384 -143
  237. data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cuh +36 -22
  238. data/ext/sources/ggml/src/ggml-cuda/fattn-vec.cuh +3 -3
  239. data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +26 -5
  240. data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cuh +1 -1
  241. data/ext/sources/ggml/src/ggml-cuda/fattn.cu +127 -12
  242. data/ext/sources/ggml/src/ggml-cuda/gated_delta_net.cu +263 -0
  243. data/ext/sources/ggml/src/ggml-cuda/gated_delta_net.cuh +4 -0
  244. data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +595 -200
  245. data/ext/sources/ggml/src/ggml-cuda/mean.cu +9 -8
  246. data/ext/sources/ggml/src/ggml-cuda/mma.cuh +173 -6
  247. data/ext/sources/ggml/src/ggml-cuda/mmf.cu +30 -10
  248. data/ext/sources/ggml/src/ggml-cuda/mmf.cuh +158 -85
  249. data/ext/sources/ggml/src/ggml-cuda/mmq.cuh +34 -22
  250. data/ext/sources/ggml/src/ggml-cuda/mmvf.cu +127 -67
  251. data/ext/sources/ggml/src/ggml-cuda/mmvf.cuh +2 -0
  252. data/ext/sources/ggml/src/ggml-cuda/mmvq.cu +157 -65
  253. data/ext/sources/ggml/src/ggml-cuda/mmvq.cuh +1 -0
  254. data/ext/sources/ggml/src/ggml-cuda/norm.cu +18 -76
  255. data/ext/sources/ggml/src/ggml-cuda/pad.cu +13 -10
  256. data/ext/sources/ggml/src/ggml-cuda/quantize.cu +1 -1
  257. data/ext/sources/ggml/src/ggml-cuda/reduce_rows.cuh +2 -16
  258. data/ext/sources/ggml/src/ggml-cuda/rope.cu +233 -133
  259. data/ext/sources/ggml/src/ggml-cuda/softmax.cu +8 -83
  260. data/ext/sources/ggml/src/ggml-cuda/solve_tri.cu +1 -1
  261. data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cu +56 -32
  262. data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cuh +1 -1
  263. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_32.cu +5 -0
  264. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_4.cu +1 -0
  265. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_32.cu +5 -0
  266. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_4.cu +1 -0
  267. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_4.cu +1 -0
  268. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_4.cu +1 -0
  269. data/ext/sources/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +3 -3
  270. data/ext/sources/ggml/src/ggml-cuda/top-k.cu +0 -1
  271. data/ext/sources/ggml/src/ggml-cuda/topk-moe.cu +199 -135
  272. data/ext/sources/ggml/src/ggml-cuda/topk-moe.cuh +20 -14
  273. data/ext/sources/ggml/src/ggml-cuda/unary.cu +55 -0
  274. data/ext/sources/ggml/src/ggml-cuda/unary.cuh +2 -0
  275. data/ext/sources/ggml/src/ggml-cuda/vecdotq.cuh +31 -17
  276. data/ext/sources/ggml/src/ggml-cuda/vendors/hip.h +10 -0
  277. data/ext/sources/ggml/src/ggml-hexagon/CMakeLists.txt +82 -45
  278. data/ext/sources/ggml/src/ggml-hexagon/ggml-hexagon.cpp +334 -160
  279. data/ext/sources/ggml/src/ggml-hexagon/htp/CMakeLists.txt +7 -5
  280. data/ext/sources/ggml/src/ggml-hexagon/htp/act-ops.c +328 -197
  281. data/ext/sources/ggml/src/ggml-hexagon/htp/argsort-ops.c +281 -0
  282. data/ext/sources/ggml/src/ggml-hexagon/htp/binary-ops.c +765 -234
  283. data/ext/sources/ggml/src/ggml-hexagon/htp/cpy-ops.c +252 -0
  284. data/ext/sources/ggml/src/ggml-hexagon/htp/flash-attn-ops.c +412 -265
  285. data/ext/sources/ggml/src/ggml-hexagon/htp/get-rows-ops.c +23 -23
  286. data/ext/sources/ggml/src/ggml-hexagon/htp/{htp-dma.c → hex-dma.c} +1 -1
  287. data/ext/sources/ggml/src/ggml-hexagon/htp/{htp-dma.h → hex-dma.h} +28 -3
  288. data/ext/sources/ggml/src/ggml-hexagon/htp/hex-dump.h +77 -0
  289. data/ext/sources/ggml/src/ggml-hexagon/htp/hex-fastdiv.h +37 -0
  290. data/ext/sources/ggml/src/ggml-hexagon/htp/hex-utils.h +51 -0
  291. data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ctx.h +1 -1
  292. data/ext/sources/ggml/src/ggml-hexagon/htp/htp-msg.h +27 -37
  293. data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ops.h +6 -35
  294. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-arith.h +443 -0
  295. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-base.h +240 -0
  296. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-copy.h +245 -0
  297. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-div.h +251 -0
  298. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-dump.h +129 -0
  299. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-exp.h +215 -0
  300. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-floor.h +100 -0
  301. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-inverse.h +210 -0
  302. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-reduce.h +296 -0
  303. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-scale.h +133 -0
  304. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sigmoid.h +141 -0
  305. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sqrt.h +126 -0
  306. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-types.h +36 -0
  307. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-utils.h +20 -1347
  308. data/ext/sources/ggml/src/ggml-hexagon/htp/main.c +211 -13
  309. data/ext/sources/ggml/src/ggml-hexagon/htp/matmul-ops.c +1119 -952
  310. data/ext/sources/ggml/src/ggml-hexagon/htp/rope-ops.c +254 -244
  311. data/ext/sources/ggml/src/ggml-hexagon/htp/set-rows-ops.c +36 -36
  312. data/ext/sources/ggml/src/ggml-hexagon/htp/softmax-ops.c +155 -138
  313. data/ext/sources/ggml/src/ggml-hexagon/htp/ssm-conv.c +339 -0
  314. data/ext/sources/ggml/src/ggml-hexagon/htp/sum-rows-ops.c +128 -0
  315. data/ext/sources/ggml/src/ggml-hexagon/htp/unary-ops.c +209 -114
  316. data/ext/sources/ggml/src/ggml-hexagon/htp/worker-pool.c +1 -5
  317. data/ext/sources/ggml/src/ggml-hexagon/htp-drv.cpp +418 -0
  318. data/ext/sources/ggml/src/ggml-hexagon/htp-drv.h +121 -0
  319. data/ext/sources/ggml/src/ggml-hexagon/libdl.h +79 -0
  320. data/ext/sources/ggml/src/ggml-hexagon/libggml-htp.inf +38 -0
  321. data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +6 -0
  322. data/ext/sources/ggml/src/ggml-impl.h +62 -0
  323. data/ext/sources/ggml/src/ggml-metal/CMakeLists.txt +10 -10
  324. data/ext/sources/ggml/src/ggml-metal/ggml-metal-common.cpp +13 -2
  325. data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.h +8 -0
  326. data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.m +147 -17
  327. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.cpp +274 -73
  328. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.h +22 -4
  329. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.m +102 -36
  330. data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +174 -23
  331. data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.cpp +580 -280
  332. data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.h +5 -4
  333. data/ext/sources/ggml/src/ggml-metal/ggml-metal.cpp +320 -107
  334. data/ext/sources/ggml/src/ggml-metal/ggml-metal.metal +1068 -825
  335. data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +19 -1
  336. data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +3108 -636
  337. data/ext/sources/ggml/src/ggml-opencl/kernels/concat.cl +41 -99
  338. data/ext/sources/ggml/src/ggml-opencl/kernels/cpy.cl +45 -0
  339. data/ext/sources/ggml/src/ggml-opencl/kernels/cumsum.cl +139 -0
  340. data/ext/sources/ggml/src/ggml-opencl/kernels/cvt.cl +204 -0
  341. data/ext/sources/ggml/src/ggml-opencl/kernels/diag.cl +27 -0
  342. data/ext/sources/ggml/src/ggml-opencl/kernels/exp.cl +125 -0
  343. data/ext/sources/ggml/src/ggml-opencl/kernels/expm1.cl +87 -56
  344. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q4_1_f32.cl +132 -0
  345. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_general_q8_0_f32.cl +195 -0
  346. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q4_1_f32.cl +283 -0
  347. data/ext/sources/ggml/src/ggml-opencl/kernels/l2_norm.cl +71 -0
  348. data/ext/sources/ggml/src/ggml-opencl/kernels/mean.cl +114 -13
  349. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q4_0_f32_l4_lm.cl +163 -0
  350. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q4_1_f32_l4_lm.cl +165 -0
  351. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q6_k_f32_l4_lm.cl +158 -0
  352. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_8x4.cl +129 -0
  353. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32.cl +219 -0
  354. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32_flat.cl +229 -0
  355. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_k_f32.cl +180 -0
  356. data/ext/sources/ggml/src/ggml-opencl/kernels/{mul_mv_q6_k.cl → mul_mv_q6_k_f32.cl} +4 -0
  357. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32_flat.cl +194 -0
  358. data/ext/sources/ggml/src/ggml-opencl/kernels/neg.cl +125 -0
  359. data/ext/sources/ggml/src/ggml-opencl/kernels/repeat.cl +31 -32
  360. data/ext/sources/ggml/src/ggml-opencl/kernels/scale.cl +14 -4
  361. data/ext/sources/ggml/src/ggml-opencl/kernels/softplus.cl +88 -60
  362. data/ext/sources/ggml/src/ggml-opencl/kernels/solve_tri.cl +51 -0
  363. data/ext/sources/ggml/src/ggml-opencl/kernels/sum_rows.cl +114 -13
  364. data/ext/sources/ggml/src/ggml-opencl/kernels/tanh.cl +94 -48
  365. data/ext/sources/ggml/src/ggml-opencl/kernels/transpose.cl +26 -0
  366. data/ext/sources/ggml/src/ggml-opencl/kernels/tri.cl +32 -0
  367. data/ext/sources/ggml/src/ggml-openvino/.clang-format +154 -0
  368. data/ext/sources/ggml/src/ggml-openvino/CMakeLists.txt +22 -0
  369. data/ext/sources/ggml/src/ggml-openvino/ggml-decoder.cpp +975 -0
  370. data/ext/sources/ggml/src/ggml-openvino/ggml-decoder.h +294 -0
  371. data/ext/sources/ggml/src/ggml-openvino/ggml-openvino-extra.cpp +373 -0
  372. data/ext/sources/ggml/src/ggml-openvino/ggml-openvino-extra.h +182 -0
  373. data/ext/sources/ggml/src/ggml-openvino/ggml-openvino.cpp +1110 -0
  374. data/ext/sources/ggml/src/ggml-openvino/ggml-quants.cpp +884 -0
  375. data/ext/sources/ggml/src/ggml-openvino/ggml-quants.h +153 -0
  376. data/ext/sources/ggml/src/ggml-openvino/openvino/decoder.h +74 -0
  377. data/ext/sources/ggml/src/ggml-openvino/openvino/frontend.cpp +27 -0
  378. data/ext/sources/ggml/src/ggml-openvino/openvino/frontend.h +23 -0
  379. data/ext/sources/ggml/src/ggml-openvino/openvino/input_model.cpp +17 -0
  380. data/ext/sources/ggml/src/ggml-openvino/openvino/input_model.h +29 -0
  381. data/ext/sources/ggml/src/ggml-openvino/openvino/node_context.h +112 -0
  382. data/ext/sources/ggml/src/ggml-openvino/openvino/op/cont.cpp +48 -0
  383. data/ext/sources/ggml/src/ggml-openvino/openvino/op/cpy.cpp +21 -0
  384. data/ext/sources/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp +90 -0
  385. data/ext/sources/ggml/src/ggml-openvino/openvino/op/get_rows.cpp +69 -0
  386. data/ext/sources/ggml/src/ggml-openvino/openvino/op/glu_geglu.cpp +61 -0
  387. data/ext/sources/ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp +62 -0
  388. data/ext/sources/ggml/src/ggml-openvino/openvino/op/mulmat.cpp +90 -0
  389. data/ext/sources/ggml/src/ggml-openvino/openvino/op/permute.cpp +102 -0
  390. data/ext/sources/ggml/src/ggml-openvino/openvino/op/reshape.cpp +83 -0
  391. data/ext/sources/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp +46 -0
  392. data/ext/sources/ggml/src/ggml-openvino/openvino/op/rope.cpp +123 -0
  393. data/ext/sources/ggml/src/ggml-openvino/openvino/op/scale.cpp +41 -0
  394. data/ext/sources/ggml/src/ggml-openvino/openvino/op/set_rows.cpp +76 -0
  395. data/ext/sources/ggml/src/ggml-openvino/openvino/op/softmax.cpp +89 -0
  396. data/ext/sources/ggml/src/ggml-openvino/openvino/op/transpose.cpp +23 -0
  397. data/ext/sources/ggml/src/ggml-openvino/openvino/op/unary_silu.cpp +27 -0
  398. data/ext/sources/ggml/src/ggml-openvino/openvino/op/view.cpp +53 -0
  399. data/ext/sources/ggml/src/ggml-openvino/openvino/op_table.cpp +46 -0
  400. data/ext/sources/ggml/src/ggml-openvino/openvino/op_table.h +39 -0
  401. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.cpp +123 -0
  402. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.h +17 -0
  403. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.cpp +60 -0
  404. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.h +17 -0
  405. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/mark_decompression_convert_constant_folding.h +29 -0
  406. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/squeeze_matmul.cpp +58 -0
  407. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/squeeze_matmul.h +17 -0
  408. data/ext/sources/ggml/src/ggml-openvino/openvino/translate_session.cpp +293 -0
  409. data/ext/sources/ggml/src/ggml-openvino/openvino/translate_session.h +28 -0
  410. data/ext/sources/ggml/src/ggml-openvino/openvino/utils.cpp +226 -0
  411. data/ext/sources/ggml/src/ggml-openvino/openvino/utils.h +85 -0
  412. data/ext/sources/ggml/src/ggml-openvino/utils.cpp +823 -0
  413. data/ext/sources/ggml/src/ggml-openvino/utils.h +123 -0
  414. data/ext/sources/ggml/src/ggml-quants.c +96 -5
  415. data/ext/sources/ggml/src/ggml-quants.h +3 -0
  416. data/ext/sources/ggml/src/ggml-sycl/CMakeLists.txt +15 -88
  417. data/ext/sources/ggml/src/ggml-sycl/add-id.cpp +5 -1
  418. data/ext/sources/ggml/src/ggml-sycl/backend.hpp +1 -0
  419. data/ext/sources/ggml/src/ggml-sycl/binbcast.cpp +21 -20
  420. data/ext/sources/ggml/src/ggml-sycl/common.hpp +315 -10
  421. data/ext/sources/ggml/src/ggml-sycl/convert.cpp +69 -1
  422. data/ext/sources/ggml/src/ggml-sycl/convert.hpp +22 -1
  423. data/ext/sources/ggml/src/ggml-sycl/count-equal.cpp +1 -1
  424. data/ext/sources/ggml/src/ggml-sycl/dpct/helper.hpp +791 -47
  425. data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +78 -68
  426. data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +2 -0
  427. data/ext/sources/ggml/src/ggml-sycl/fattn-common.hpp +1179 -0
  428. data/ext/sources/ggml/src/ggml-sycl/fattn-tile.cpp +55 -0
  429. data/ext/sources/ggml/src/ggml-sycl/fattn-tile.hpp +1338 -0
  430. data/ext/sources/ggml/src/ggml-sycl/fattn-vec.hpp +667 -0
  431. data/ext/sources/ggml/src/ggml-sycl/fattn.cpp +225 -0
  432. data/ext/sources/ggml/src/ggml-sycl/fattn.hpp +22 -0
  433. data/ext/sources/ggml/src/ggml-sycl/gated_delta_net.cpp +309 -0
  434. data/ext/sources/ggml/src/ggml-sycl/gated_delta_net.hpp +8 -0
  435. data/ext/sources/ggml/src/ggml-sycl/ggml-sycl.cpp +316 -51
  436. data/ext/sources/ggml/src/ggml-sycl/norm.cpp +65 -66
  437. data/ext/sources/ggml/src/ggml-sycl/outprod.cpp +3 -3
  438. data/ext/sources/ggml/src/ggml-sycl/presets.hpp +3 -0
  439. data/ext/sources/ggml/src/ggml-sycl/quants.hpp +1 -1
  440. data/ext/sources/ggml/src/ggml-sycl/rope.cpp +450 -287
  441. data/ext/sources/ggml/src/ggml-sycl/rope.hpp +6 -0
  442. data/ext/sources/ggml/src/ggml-sycl/softmax.cpp +6 -6
  443. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq112-dv112.cpp +5 -0
  444. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq128-dv128.cpp +5 -0
  445. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq256-dv256.cpp +5 -0
  446. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq40-dv40.cpp +5 -0
  447. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq576-dv512.cpp +5 -0
  448. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq64-dv64.cpp +5 -0
  449. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq72-dv72.cpp +5 -0
  450. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq80-dv80.cpp +5 -0
  451. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq96-dv96.cpp +5 -0
  452. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-f16.cpp +7 -0
  453. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_0.cpp +7 -0
  454. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_1.cpp +7 -0
  455. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_0.cpp +7 -0
  456. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_1.cpp +7 -0
  457. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q8_0.cpp +7 -0
  458. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-f16.cpp +7 -0
  459. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_0.cpp +7 -0
  460. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_1.cpp +7 -0
  461. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_0.cpp +7 -0
  462. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_1.cpp +7 -0
  463. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q8_0.cpp +7 -0
  464. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-f16.cpp +7 -0
  465. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_0.cpp +7 -0
  466. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_1.cpp +7 -0
  467. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_0.cpp +7 -0
  468. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_1.cpp +7 -0
  469. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q8_0.cpp +7 -0
  470. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-f16.cpp +7 -0
  471. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_0.cpp +7 -0
  472. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_1.cpp +7 -0
  473. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_0.cpp +7 -0
  474. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_1.cpp +7 -0
  475. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q8_0.cpp +7 -0
  476. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-f16.cpp +7 -0
  477. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_0.cpp +7 -0
  478. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_1.cpp +7 -0
  479. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_0.cpp +7 -0
  480. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_1.cpp +7 -0
  481. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q8_0.cpp +7 -0
  482. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-f16.cpp +7 -0
  483. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_0.cpp +7 -0
  484. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_1.cpp +7 -0
  485. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_0.cpp +7 -0
  486. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_1.cpp +7 -0
  487. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q8_0.cpp +7 -0
  488. data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +13 -0
  489. data/ext/sources/ggml/src/ggml-sycl/wkv.cpp +1 -1
  490. data/ext/sources/ggml/src/ggml-virtgpu/CMakeLists.txt +70 -0
  491. data/ext/sources/ggml/src/ggml-virtgpu/apir_cs_ggml-rpc-front.cpp +87 -0
  492. data/ext/sources/ggml/src/ggml-virtgpu/backend/CMakeLists.txt +21 -0
  493. data/ext/sources/ggml/src/ggml-virtgpu/backend/apir_cs_ggml-rpc-back.cpp +115 -0
  494. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-convert.h +13 -0
  495. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-backend.cpp +102 -0
  496. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer-type.cpp +105 -0
  497. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer.cpp +179 -0
  498. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-device.cpp +148 -0
  499. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched.cpp +51 -0
  500. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched.gen.h +73 -0
  501. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched.h +27 -0
  502. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-virgl-apir.h +32 -0
  503. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend.cpp +144 -0
  504. data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/api_remoting.h +95 -0
  505. data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_backend.gen.h +94 -0
  506. data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_backend.h +50 -0
  507. data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_cs.h +378 -0
  508. data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_cs_ggml.h +232 -0
  509. data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_cs_rpc.h +58 -0
  510. data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-buffer-type.cpp +81 -0
  511. data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-buffer.cpp +119 -0
  512. data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-device.cpp +158 -0
  513. data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-reg.cpp +213 -0
  514. data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend.cpp +69 -0
  515. data/ext/sources/ggml/src/ggml-virtgpu/ggml-remoting.h +71 -0
  516. data/ext/sources/ggml/src/ggml-virtgpu/ggmlremoting_functions.yaml +166 -0
  517. data/ext/sources/ggml/src/ggml-virtgpu/include/apir_hw.h +9 -0
  518. data/ext/sources/ggml/src/ggml-virtgpu/regenerate_remoting.py +333 -0
  519. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-apir.h +15 -0
  520. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-backend.cpp +58 -0
  521. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-buffer-type.cpp +110 -0
  522. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-buffer.cpp +173 -0
  523. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-device.cpp +192 -0
  524. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-impl.h +36 -0
  525. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward.gen.h +53 -0
  526. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-shm.cpp +98 -0
  527. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-shm.h +23 -0
  528. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-utils.cpp +179 -0
  529. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-utils.h +86 -0
  530. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu.cpp +544 -0
  531. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu.h +117 -0
  532. data/ext/sources/ggml/src/ggml-vulkan/CMakeLists.txt +1 -1
  533. data/ext/sources/ggml/src/ggml-vulkan/ggml-vulkan.cpp +1250 -465
  534. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/acc.comp +16 -8
  535. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/elu.comp +27 -0
  536. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +374 -170
  537. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.glsl +66 -22
  538. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +389 -201
  539. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +106 -58
  540. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_mask_opt.comp +162 -0
  541. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +9 -8
  542. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gated_delta_net.comp +128 -0
  543. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/l2_norm.comp +12 -9
  544. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.glsl +20 -17
  545. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +11 -3
  546. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +8 -4
  547. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_id_funcs.glsl +3 -1
  548. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +5 -3
  549. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.glsl +3 -3
  550. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +2 -3
  551. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl +36 -63
  552. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +7 -4
  553. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +7 -4
  554. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +7 -4
  555. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl +10 -5
  556. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_vision.comp +7 -4
  557. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sgn.comp +21 -0
  558. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/ssm_conv.comp +16 -10
  559. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +55 -35
  560. data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp +1314 -109
  561. data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu.cpp +1660 -1371
  562. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/argmax.wgsl +72 -0
  563. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/argsort.wgsl +106 -0
  564. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/argsort_merge.wgsl +134 -0
  565. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary.wgsl +141 -0
  566. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +65 -72
  567. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/concat.wgsl +75 -0
  568. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cpy.tmpl.wgsl +6 -0
  569. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cumsum.wgsl +66 -0
  570. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +40 -5
  571. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn.wgsl +105 -60
  572. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{get_rows.tmpl.wgsl → get_rows.wgsl} +53 -259
  573. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{mul_mat.tmpl.wgsl → mul_mat.wgsl} +68 -257
  574. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl +692 -23
  575. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{mul_mat_reg_tile.tmpl.wgsl → mul_mat_reg_tile.wgsl} +28 -128
  576. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{mul_mat_subgroup_matrix.tmpl.wgsl → mul_mat_subgroup_matrix.wgsl} +31 -137
  577. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.wgsl +480 -0
  578. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/pad.wgsl +86 -0
  579. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/repeat.wgsl +67 -0
  580. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{scale.tmpl.wgsl → scale.wgsl} +9 -36
  581. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +40 -12
  582. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/sum_rows.wgsl +55 -0
  583. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/unary.wgsl +193 -0
  584. data/ext/sources/ggml/src/ggml-zdnn/ggml-zdnn.cpp +6 -1
  585. data/ext/sources/ggml/src/ggml-zendnn/CMakeLists.txt +31 -32
  586. data/ext/sources/ggml/src/ggml-zendnn/ggml-zendnn.cpp +9 -6
  587. data/ext/sources/ggml/src/ggml.c +167 -33
  588. data/ext/sources/ggml/src/gguf.cpp +229 -44
  589. data/ext/sources/src/whisper.cpp +6 -28
  590. data/sig/whisper.rbs +43 -2
  591. data/test/test_context_params.rb +82 -0
  592. data/test/test_token.rb +11 -0
  593. data/test/test_vad_context.rb +58 -8
  594. data/test/test_whisper.rb +20 -0
  595. data/whispercpp.gemspec +1 -1
  596. metadata +240 -28
  597. data/ext/sources/ggml/cmake/BuildTypes.cmake +0 -54
  598. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm-ppc.h +0 -333
  599. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-exp.c +0 -94
  600. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-inverse.c +0 -72
  601. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sigmoid.c +0 -49
  602. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-utils.c +0 -1020
  603. data/ext/sources/ggml/src/ggml-hexagon/htp/ops-utils.h +0 -149
  604. data/ext/sources/ggml/src/ggml-hexagon/htp-utils.c +0 -454
  605. data/ext/sources/ggml/src/ggml-hexagon/htp-utils.h +0 -221
  606. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/bin_op.tmpl.wgsl +0 -188
  607. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary_head.tmpl +0 -45
  608. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.tmpl.wgsl +0 -267
  609. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.tmpl.wgsl +0 -112
  610. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/unary_op.wgsl +0 -483
@@ -14,9 +14,6 @@
14
14
 
15
15
  #ifdef _WIN32
16
16
  # include <sal.h>
17
- # ifndef _WINDOWS
18
- # define _WINDOWS
19
- # endif
20
17
  #else
21
18
  # include <semaphore.h>
22
19
  # include <unistd.h>
@@ -25,8 +22,6 @@
25
22
  #pragma clang diagnostic ignored "-Wnested-anon-types"
26
23
  #pragma clang diagnostic ignored "-Wgnu-anonymous-struct"
27
24
 
28
- #include "htp-utils.h"
29
-
30
25
  #include <AEEStdErr.h>
31
26
  #include <dspqueue.h>
32
27
  #include <rpcmem.h>
@@ -40,14 +35,15 @@
40
35
  #include "op-desc.h"
41
36
  #include "htp-msg.h"
42
37
  #include "htp_iface.h"
38
+ #include "htp-drv.h"
43
39
 
44
40
  static size_t opt_ndev = 1;
45
- static size_t opt_nhvx = 0; // use all
46
- static int opt_arch = 0; // autodetect
41
+ static size_t opt_nhvx = 0; // use all
42
+ static int opt_arch = 0; // autodetect
47
43
  static int opt_etm = 0;
48
44
  static int opt_verbose = 0;
49
45
  static int opt_profile = 0;
50
- static int opt_hostbuf = 1;
46
+ static int opt_hostbuf = 1; // hostbuf ON by default
51
47
  static int opt_experimental = 0;
52
48
 
53
49
  // Enable all stages by default
@@ -143,16 +139,16 @@ struct ggml_hexagon_session {
143
139
  };
144
140
 
145
141
  void ggml_hexagon_session::enqueue(struct htp_general_req &req, struct dspqueue_buffer *bufs, uint32_t n_bufs, bool sync) {
146
- // Bump pending flag (cleared in the session::flush once we get the responce)
142
+ // Bump pending flag (cleared in the session::flush once we get the response)
147
143
  this->op_pending++; // atomic inc
148
144
 
149
145
  int err = dspqueue_write(this->queue,
150
146
  0, // flags - the framework will autoset this
151
147
  n_bufs, // number of buffers
152
148
  bufs, // buffer references
153
- sizeof(req),
149
+ sizeof(req), // Message length
154
150
  (const uint8_t *) &req, // Message
155
- 1000000 // Timeout
151
+ DSPQUEUE_TIMEOUT // Timeout
156
152
  );
157
153
 
158
154
  if (err != 0) {
@@ -182,13 +178,13 @@ void ggml_hexagon_session::flush() {
182
178
 
183
179
  // Read response packet from queue
184
180
  int err = dspqueue_read(q, &flags,
185
- HTP_MAX_PACKET_BUFFERS, // Maximum number of buffer references
186
- &n_bufs, // Number of buffer references
187
- bufs, // Buffer references
188
- sizeof(rsp), // Max message length
189
- &rsp_size, // Message length
190
- (uint8_t *) &rsp,
191
- 1000000); // Timeout
181
+ HTP_MAX_PACKET_BUFFERS, // Maximum number of buffer references
182
+ &n_bufs, // Number of buffer references
183
+ bufs, // Buffer references
184
+ sizeof(rsp), // Max message length
185
+ &rsp_size, // Message length
186
+ (uint8_t *) &rsp, // Message
187
+ DSPQUEUE_TIMEOUT); // Timeout
192
188
 
193
189
  if (err == AEE_EEXPIRED) {
194
190
  // TODO: might need to bail out if the HTP is stuck on something
@@ -269,13 +265,7 @@ struct ggml_backend_hexagon_buffer_context {
269
265
  ggml_backend_hexagon_buffer_context(ggml_hexagon_session * sess, size_t size, bool repack) {
270
266
  size += 4 * 1024; // extra page for padding
271
267
 
272
- if (rpcmem_alloc2) {
273
- this->base = (uint8_t *) rpcmem_alloc2(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS | RPCMEM_HEAP_NOREG, size);
274
- } else {
275
- GGML_LOG_INFO("ggml-hex: %s rpcmem_alloc2 not found, falling back to rpcmem_alloc\n", sess->name.c_str());
276
- this->base = (uint8_t *) rpcmem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS | RPCMEM_HEAP_NOREG, size);
277
- }
278
-
268
+ this->base = (uint8_t *) rpcmem_alloc2(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS | RPCMEM_HEAP_NOREG, size);
279
269
  if (!this->base) {
280
270
  GGML_LOG_ERROR("ggml-hex: %s failed to allocate buffer : size %zu\n", sess->name.c_str(), size);
281
271
  throw std::runtime_error("ggml-hex: rpcmem_alloc failed (see log for details)");
@@ -412,6 +402,7 @@ static void pack_q4_0_quants(block_q4_0 * x, const uint8_t * qs, unsigned int bi
412
402
  static void repack_row_q4x4x2(uint8_t * y, const block_q4_0 * x, int64_t k) {
413
403
  static const int qk = QK_Q4_0x4x2;
414
404
  const int nb = (k + qk - 1) / qk; // number of blocks (padded)
405
+ const int nloe = k % qk; // leftovers
415
406
 
416
407
  const int dblk_size = 8 * 2; // 8x __fp16
417
408
  const int qblk_size = qk / 2; // int4
@@ -445,15 +436,17 @@ static void repack_row_q4x4x2(uint8_t * y, const block_q4_0 * x, int64_t k) {
445
436
  unpack_q4_0_quants(qs, &x[i * 8 + 6], 6);
446
437
  unpack_q4_0_quants(qs, &x[i * 8 + 7], 7);
447
438
 
439
+ bool partial = (nloe && i == nb-1);
440
+
448
441
  uint8_t * q = y_q + (i * qblk_size);
449
442
  for (int j = 0; j < qk / 2; j++) {
450
- q[j] = (qs[j + 128] << 4) | qs[j];
443
+ q[j] = partial ? (qs[j*2+1] << 4) | qs[j*2+0] : (qs[j+128] << 4) | qs[j+000];
451
444
  }
452
445
  }
453
446
 
454
447
  // Repack the scales
455
448
  // Note: Do not combine with the loop above. For tensor sizes not multiple of 256 (QK_Q4_0x4x2)
456
- // the last block is truncated and overriden by the scales.
449
+ // the last block is truncated and overridden by the scales.
457
450
  for (int i = 0; i < nb; i++) {
458
451
  // Repack the scales
459
452
  ggml_half * d = (ggml_half *) (y_d + i * dblk_size);
@@ -477,6 +470,7 @@ static void repack_row_q4x4x2(uint8_t * y, const block_q4_0 * x, int64_t k) {
477
470
  static void unpack_row_q4x4x2(block_q4_0 * x, const uint8_t * y, int64_t k) {
478
471
  static const int qk = QK_Q4_0x4x2;
479
472
  const int nb = (k + qk - 1) / qk; // number of blocks (padded)
473
+ const int nloe = k % qk; // leftovers
480
474
 
481
475
  const int dblk_size = 8 * 2; // 8x __fp16
482
476
  const int qblk_size = qk / 2; // int4
@@ -495,10 +489,17 @@ static void unpack_row_q4x4x2(block_q4_0 * x, const uint8_t * y, int64_t k) {
495
489
  for (int i = 0; i < nb; i++) {
496
490
  uint8_t qs[QK_Q4_0x4x2]; // unpacked quants
497
491
 
492
+ bool partial = (nloe && i == nb-1);
493
+
498
494
  const uint8_t * q = y_q + (i * qblk_size);
499
495
  for (int j = 0; j < qk / 2; j++) {
500
- qs[j] = q[j] & 0xf;
501
- qs[j + 128] = q[j] >> 4;
496
+ if (partial) {
497
+ qs[j*2+0] = q[j] & 0xf;
498
+ qs[j*2+1] = q[j] >> 4;
499
+ } else {
500
+ qs[j+000] = q[j] & 0xf;
501
+ qs[j+128] = q[j] >> 4;
502
+ }
502
503
  }
503
504
 
504
505
  pack_q4_0_quants(&x[i * 8 + 0], qs, 0);
@@ -513,7 +514,7 @@ static void unpack_row_q4x4x2(block_q4_0 * x, const uint8_t * y, int64_t k) {
513
514
 
514
515
  // Repack the scales
515
516
  // Note: Do not combine with the loop above. For tensor sizes not multiple of 256 (QK_Q4_0x4x2)
516
- // the last block is truncated and overriden by the scales.
517
+ // the last block is truncated and overridden by the scales.
517
518
  for (int i = 0; i < nb; i++) {
518
519
  // Unpack the scales
519
520
  const ggml_half * d = (const ggml_half *) (y_d + i * dblk_size);
@@ -562,7 +563,7 @@ static void init_row_q4x4x2(block_q4_0 * x, int64_t k) {
562
563
 
563
564
  // Init the scales
564
565
  // Note: Do not combine with the loop above. For tensor sizes not multiple of 256 (QK_Q4_0x4x2)
565
- // the last block is truncated and overriden by the scales.
566
+ // the last block is truncated and overridden by the scales.
566
567
  for (int i = 0; i < nb; i++) {
567
568
  // Unpack the scales
568
569
  x[i * 8 + 0].d = 0;
@@ -780,7 +781,7 @@ static void repack_row_q8x4x2(uint8_t * y, const block_q8_0 * x, int64_t k) {
780
781
 
781
782
  // Repack the scales
782
783
  // Note: Do not combine with the loop above. For tensor sizes not multiple of 256 (QK_Q4_0x4x2)
783
- // the last block is truncated and overriden by the scales.
784
+ // the last block is truncated and overridden by the scales.
784
785
  for (int i = 0; i < nb; i++) {
785
786
  // Repack the scales
786
787
  ggml_half * d = (ggml_half *) (y_d + i * dblk_size);
@@ -839,7 +840,7 @@ static void unpack_row_q8x4x2(block_q8_0 * x, const uint8_t * y, int64_t k) {
839
840
 
840
841
  // Repack the scales
841
842
  // Note: Do not combine with the loop above. For tensor sizes not multiple of 256 (QK_Q4_0x4x2)
842
- // the last block is truncated and overriden by the scales.
843
+ // the last block is truncated and overridden by the scales.
843
844
  for (int i = 0; i < nb; i++) {
844
845
  // Unpack the scales
845
846
  const ggml_half * d = (const ggml_half *) (y_d + i * dblk_size);
@@ -888,7 +889,7 @@ static void init_row_q8x4x2(block_q8_0 * x, int64_t k) {
888
889
 
889
890
  // Init the scales
890
891
  // Note: Do not combine with the loop above. For tensor sizes not multiple of 256 (QK_Q8_0x4x2)
891
- // the last block is truncated and overriden by the scales.
892
+ // the last block is truncated and overridden by the scales.
892
893
  for (int i = 0; i < nb; i++) {
893
894
  // Unpack the scales
894
895
  x[i * 8 + 0].d = 0;
@@ -1088,6 +1089,7 @@ static void pack_mxfp4_quants(block_mxfp4 * x, const uint8_t * qs, unsigned int
1088
1089
  static void repack_row_mxfp4x4x2(uint8_t * y, const block_mxfp4 * x, int64_t k) {
1089
1090
  static const int qk = QK_MXFP4x4x2;
1090
1091
  const int nb = (k + qk - 1) / qk; // number of blocks (padded)
1092
+ const int nloe = k % qk; // leftovers
1091
1093
 
1092
1094
  const int eblk_size = 8 * 1; // 8x E8M0
1093
1095
  const int qblk_size = qk / 2; // int4
@@ -1122,15 +1124,17 @@ static void repack_row_mxfp4x4x2(uint8_t * y, const block_mxfp4 * x, int64_t k)
1122
1124
  unpack_mxfp4_quants(qs, &x[i * 8 + 6], 6);
1123
1125
  unpack_mxfp4_quants(qs, &x[i * 8 + 7], 7);
1124
1126
 
1127
+ bool partial = (nloe && i == nb-1);
1128
+
1125
1129
  uint8_t * q = y_q + (i * qblk_size);
1126
1130
  for (int j = 0; j < qk / 2; j++) {
1127
- q[j] = (qs[j + 128] << 4) | qs[j];
1131
+ q[j] = partial ? (qs[j*2+1] << 4) | qs[j*2+0] : (qs[j+128] << 4) | qs[j+000];
1128
1132
  }
1129
1133
  }
1130
1134
 
1131
1135
  // Repack the scales
1132
1136
  // Note: Do not combine with the loop above. For tensor sizes not multiple of 256 (QK_MXFP4x4x2)
1133
- // the last block is truncated and overriden by the scales.
1137
+ // the last block is truncated and overridden by the scales.
1134
1138
  for (int i = 0; i < nb; i++) {
1135
1139
  // Repack the scales
1136
1140
  uint8_t * e = (uint8_t *) (y_e + i * eblk_size);
@@ -1154,6 +1158,7 @@ static void repack_row_mxfp4x4x2(uint8_t * y, const block_mxfp4 * x, int64_t k)
1154
1158
  static void unpack_row_mxfp4x4x2(block_mxfp4 * x, const uint8_t * y, int64_t k) {
1155
1159
  static const int qk = QK_MXFP4x4x2;
1156
1160
  const int nb = (k + qk - 1) / qk; // number of blocks (padded)
1161
+ const int nloe = k % qk; // leftovers
1157
1162
 
1158
1163
  const int eblk_size = 8 * 1; // 8x E8M0
1159
1164
  const int qblk_size = qk / 2; // int4
@@ -1172,10 +1177,17 @@ static void unpack_row_mxfp4x4x2(block_mxfp4 * x, const uint8_t * y, int64_t k)
1172
1177
  for (int i = 0; i < nb; i++) {
1173
1178
  uint8_t qs[QK_MXFP4x4x2]; // unpacked quants
1174
1179
 
1180
+ bool partial = (nloe && i == nb-1);
1181
+
1175
1182
  const uint8_t * q = y_q + (i * qblk_size);
1176
1183
  for (int j = 0; j < qk / 2; j++) {
1177
- qs[j] = q[j] & 0xf;
1178
- qs[j + 128] = q[j] >> 4;
1184
+ if (partial) {
1185
+ qs[j*2+0] = q[j] & 0xf;
1186
+ qs[j*2+1] = q[j] >> 4;
1187
+ } else {
1188
+ qs[j+000] = q[j] & 0xf;
1189
+ qs[j+128] = q[j] >> 4;
1190
+ }
1179
1191
  }
1180
1192
 
1181
1193
  pack_mxfp4_quants(&x[i * 8 + 0], qs, 0);
@@ -1190,7 +1202,7 @@ static void unpack_row_mxfp4x4x2(block_mxfp4 * x, const uint8_t * y, int64_t k)
1190
1202
 
1191
1203
  // Repack the scales
1192
1204
  // Note: Do not combine with the loop above. For tensor sizes not multiple of 256 (QK_MXFP4_0x4x2)
1193
- // the last block is truncated and overriden by the scales.
1205
+ // the last block is truncated and overridden by the scales.
1194
1206
  for (int i = 0; i < nb; i++) {
1195
1207
  // Unpack the scales
1196
1208
  const uint8_t * e = (const uint8_t *) (y_e + i * eblk_size);
@@ -1239,7 +1251,7 @@ static void init_row_mxfp4x4x2(block_mxfp4 * x, int64_t k) {
1239
1251
 
1240
1252
  // Init the scales
1241
1253
  // Note: Do not combine with the loop above. For tensor sizes not multiple of 256 (QK_MXFP4x4x2)
1242
- // the last block is truncated and overriden by the scales.
1254
+ // the last block is truncated and overridden by the scales.
1243
1255
  for (int i = 0; i < nb; i++) {
1244
1256
  // Unpack the scales
1245
1257
  x[i * 8 + 0].e = 0;
@@ -1753,24 +1765,10 @@ static bool ggml_backend_buffer_is_hexagon(const struct ggml_backend_buffer * b)
1753
1765
  }
1754
1766
 
1755
1767
  static inline bool ggml_backend_buffer_is_hexagon_repack(const struct ggml_backend_buffer * b) {
1756
- return b->buft->iface.alloc_buffer == ggml_backend_hexagon_repack_buffer_type_alloc_buffer;
1757
- }
1758
-
1759
- static bool hex_supported_dims2(const struct ggml_tensor * x, const struct ggml_tensor * y) {
1760
- if (x->ne[0] != y->ne[0]) {
1761
- return false;
1768
+ if (!opt_hostbuf) {
1769
+ return ggml_backend_buffer_is_hexagon(b);
1762
1770
  }
1763
- if (x->ne[1] != y->ne[1]) {
1764
- return false;
1765
- }
1766
- if (x->ne[2] != y->ne[2]) {
1767
- return false;
1768
- }
1769
- if (x->ne[3] != y->ne[3]) {
1770
- return false;
1771
- }
1772
-
1773
- return true;
1771
+ return b->buft->iface.alloc_buffer == ggml_backend_hexagon_repack_buffer_type_alloc_buffer;
1774
1772
  }
1775
1773
 
1776
1774
  static bool ggml_hexagon_supported_flash_attn_ext(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
@@ -1804,43 +1802,6 @@ static bool ggml_hexagon_supported_flash_attn_ext(const struct ggml_hexagon_sess
1804
1802
  return opt_experimental;
1805
1803
  }
1806
1804
 
1807
- static bool hex_supported_src0_type(ggml_type t) {
1808
- return t == GGML_TYPE_F32;
1809
- }
1810
-
1811
- static bool hex_supported_src1_type(ggml_type t) {
1812
- return t == GGML_TYPE_F32;
1813
- }
1814
-
1815
- static bool hex_supported_src2_type(ggml_type t) {
1816
- return t == GGML_TYPE_F32;
1817
- }
1818
-
1819
- static bool hex_supported_src1_type2(ggml_type t) {
1820
- return t == GGML_TYPE_F16;
1821
- }
1822
-
1823
- static bool hex_supported_src1_type3(ggml_type t) {
1824
- return t == GGML_TYPE_I32;
1825
- }
1826
-
1827
- static bool hex_supported_dst_type(ggml_type t) {
1828
- return t == GGML_TYPE_F32;
1829
- }
1830
-
1831
- static bool hex_supported_dims(const struct ggml_tensor * x, const struct ggml_tensor * y) {
1832
- // TODO: support broadcast for ne[2 and 3]
1833
- if (x->ne[0] != y->ne[0]) {
1834
- return false;
1835
- }
1836
- if (x->ne[2] != y->ne[2]) {
1837
- return false;
1838
- }
1839
- if (x->ne[3] != y->ne[3]) {
1840
- return false;
1841
- }
1842
- return true;
1843
- }
1844
1805
 
1845
1806
  static bool ggml_hexagon_supported_mul_mat(const struct ggml_hexagon_session * sess, const struct ggml_tensor * dst) {
1846
1807
  const struct ggml_tensor * src0 = dst->src[0];
@@ -1862,12 +1823,12 @@ static bool ggml_hexagon_supported_mul_mat(const struct ggml_hexagon_session * s
1862
1823
  return false;
1863
1824
  }
1864
1825
 
1865
- if (src0->ne[1] > 16 * 1024) {
1826
+ if (ggml_nrows(src0) > 16 * 1024) {
1866
1827
  return false; // typically the lm-head which would be too large for VTCM
1867
1828
  }
1868
1829
 
1869
- if ((src1->ne[2] != 1 || src1->ne[3] != 1)) {
1870
- return false;
1830
+ if (ggml_nrows(src1) > 1024 || src1->ne[2] != 1 || src1->ne[3] != 1) {
1831
+ return false; // no huge batches or broadcasting (for now)
1871
1832
  }
1872
1833
 
1873
1834
  // src0 (weights) must be repacked
@@ -1881,6 +1842,9 @@ static bool ggml_hexagon_supported_mul_mat(const struct ggml_hexagon_session * s
1881
1842
  GGML_LOG_DEBUG("ggml_hexagon_supported_mul_mat: permuted F16 src0 not supported\n");
1882
1843
  return false;
1883
1844
  }
1845
+ if (ggml_nrows(src1) > 1024) {
1846
+ return false; // no huge batches (for now)
1847
+ }
1884
1848
  break;
1885
1849
 
1886
1850
  default:
@@ -1926,24 +1890,30 @@ static bool ggml_hexagon_supported_binary(const struct ggml_hexagon_session * se
1926
1890
  const struct ggml_tensor * src1 = op->src[1];
1927
1891
  const struct ggml_tensor * dst = op;
1928
1892
 
1929
- if (!hex_supported_src0_type(src0->type)) {
1930
- return false;
1931
- }
1932
- if (!hex_supported_src1_type(src1->type)) {
1933
- return false;
1893
+ if (src0->type == GGML_TYPE_F32) {
1894
+ if (src1->type != GGML_TYPE_F32) {
1895
+ return false;
1896
+ }
1897
+ if (dst->type != GGML_TYPE_F32) {
1898
+ return false;
1899
+ }
1934
1900
  }
1935
- if (!hex_supported_dst_type(dst->type)) {
1936
- return false;
1901
+ else if (src0->type == GGML_TYPE_F16) {
1902
+ if (src1->type != GGML_TYPE_F16) {
1903
+ return false;
1904
+ }
1905
+ if (dst->type != GGML_TYPE_F16) {
1906
+ return false;
1907
+ }
1937
1908
  }
1938
- if (!hex_supported_dims2(src0, dst)) {
1909
+ else {
1939
1910
  return false;
1940
1911
  }
1941
- if (!ggml_can_repeat(src1, src0)) {
1912
+
1913
+ if (!ggml_are_same_shape(src0, dst)) {
1942
1914
  return false;
1943
1915
  }
1944
-
1945
- // TODO: add support for non-contigiuos tensors
1946
- if (!ggml_is_contiguous(src0) || !ggml_is_contiguous(src1) || !ggml_is_contiguous(dst)) {
1916
+ if (!ggml_can_repeat(src1, src0) || ggml_is_permuted(src1)) {
1947
1917
  return false;
1948
1918
  }
1949
1919
 
@@ -1955,16 +1925,16 @@ static bool ggml_hexagon_supported_add_id(const struct ggml_hexagon_session * se
1955
1925
  const struct ggml_tensor * src1 = op->src[1];
1956
1926
  const struct ggml_tensor * dst = op;
1957
1927
 
1958
- if (!hex_supported_src0_type(src0->type)) {
1928
+ if (src0->type != GGML_TYPE_F32) {
1959
1929
  return false;
1960
1930
  }
1961
- if (!hex_supported_src1_type(src1->type)) {
1931
+ if (src1->type != GGML_TYPE_F32) {
1962
1932
  return false;
1963
1933
  }
1964
- if (!hex_supported_dst_type(dst->type)) {
1934
+ if (dst->type != GGML_TYPE_F32) {
1965
1935
  return false;
1966
1936
  }
1967
- if (!hex_supported_dims2(src0, dst)) {
1937
+ if (!ggml_are_same_shape(src0, dst)) {
1968
1938
  return false;
1969
1939
  }
1970
1940
 
@@ -1980,13 +1950,32 @@ static bool ggml_hexagon_supported_unary(const struct ggml_hexagon_session * ses
1980
1950
  const struct ggml_tensor * src0 = op->src[0];
1981
1951
  const struct ggml_tensor * dst = op;
1982
1952
 
1983
- if (!hex_supported_src0_type(src0->type)) {
1953
+ if (src0->type != GGML_TYPE_F32) {
1954
+ return false;
1955
+ }
1956
+ if (dst->type != GGML_TYPE_F32) {
1957
+ return false;
1958
+ }
1959
+ if (!ggml_are_same_shape(src0, dst)) {
1960
+ return false;
1961
+ }
1962
+
1963
+ // TODO: add support for non-contigiuos tensors
1964
+ if (!ggml_is_contiguous(src0) || !ggml_is_contiguous(dst)) {
1984
1965
  return false;
1985
1966
  }
1986
- if (!hex_supported_dst_type(dst->type)) {
1967
+
1968
+ return true;
1969
+ }
1970
+
1971
+ static bool ggml_hexagon_supported_sum_rows(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
1972
+ const struct ggml_tensor * src0 = op->src[0];
1973
+ const struct ggml_tensor * dst = op;
1974
+
1975
+ if (src0->type != GGML_TYPE_F32) {
1987
1976
  return false;
1988
1977
  }
1989
- if (!hex_supported_dims2(src0, dst)) {
1978
+ if (dst->type != GGML_TYPE_F32) {
1990
1979
  return false;
1991
1980
  }
1992
1981
 
@@ -2004,10 +1993,10 @@ static bool ggml_hexagon_supported_activations(const struct ggml_hexagon_session
2004
1993
  const struct ggml_tensor * src1 = op->src[1];
2005
1994
  const struct ggml_tensor * dst = op;
2006
1995
 
2007
- if (!hex_supported_src0_type(src0->type)) {
1996
+ if (src0->type != GGML_TYPE_F32) {
2008
1997
  return false;
2009
1998
  }
2010
- if (!hex_supported_dst_type(dst->type)) {
1999
+ if (dst->type != GGML_TYPE_F32) {
2011
2000
  return false;
2012
2001
  }
2013
2002
 
@@ -2016,10 +2005,10 @@ static bool ggml_hexagon_supported_activations(const struct ggml_hexagon_session
2016
2005
  }
2017
2006
 
2018
2007
  if (src1) {
2019
- if (!hex_supported_src1_type(src1->type)) {
2008
+ if (src1->type != GGML_TYPE_F32) {
2020
2009
  return false;
2021
2010
  }
2022
- if (!hex_supported_dims2(src0, src1)) {
2011
+ if (!ggml_are_same_shape(src0, src1)) {
2023
2012
  return false;
2024
2013
  }
2025
2014
  if (!ggml_is_contiguous(src1)) {
@@ -2040,15 +2029,15 @@ static bool ggml_hexagon_supported_softmax(const struct ggml_hexagon_session * s
2040
2029
  return false; // FIXME: add support for sinks
2041
2030
  }
2042
2031
 
2043
- if (!hex_supported_src0_type(src0->type)) {
2032
+ if (src0->type != GGML_TYPE_F32) {
2044
2033
  return false;
2045
2034
  }
2046
- if (!hex_supported_dst_type(dst->type)) {
2035
+ if (dst->type != GGML_TYPE_F32) {
2047
2036
  return false;
2048
2037
  }
2049
2038
 
2050
2039
  if (src1) {
2051
- if (!hex_supported_src1_type(src1->type) && !hex_supported_src1_type2(src1->type)) {
2040
+ if (src1->type != GGML_TYPE_F32 && src1->type != GGML_TYPE_F16) {
2052
2041
  return false;
2053
2042
  }
2054
2043
  if (src0->ne[0] != src1->ne[0]) {
@@ -2118,6 +2107,26 @@ static bool ggml_hexagon_supported_get_rows(const struct ggml_hexagon_session *
2118
2107
  return true;
2119
2108
  }
2120
2109
 
2110
+ static bool ggml_hexagon_supported_argsort(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
2111
+ const struct ggml_tensor * src0 = op->src[0]; // values
2112
+ const struct ggml_tensor * dst = op; // indices
2113
+
2114
+ if (src0->type != GGML_TYPE_F32) {
2115
+ return false;
2116
+ }
2117
+
2118
+ if (dst->type != GGML_TYPE_I32) {
2119
+ return false;
2120
+ }
2121
+
2122
+ if (src0->ne[0] > (16*1024)) {
2123
+ // reject tensors with huge rows for now
2124
+ return false;
2125
+ }
2126
+
2127
+ return true;
2128
+ }
2129
+
2121
2130
  static bool ggml_hexagon_supported_rope(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
2122
2131
  const int32_t * op_params = &op->op_params[0];
2123
2132
 
@@ -2135,17 +2144,17 @@ static bool ggml_hexagon_supported_rope(const struct ggml_hexagon_session * sess
2135
2144
  const struct ggml_tensor * src2 = op->src[2];
2136
2145
  const struct ggml_tensor * dst = op;
2137
2146
 
2138
- if (!hex_supported_src0_type(src0->type)) {
2147
+ if (src0->type != GGML_TYPE_F32) {
2139
2148
  return false; // FIXME: add support for GGML_TYPE_F16 for src0
2140
2149
  }
2141
- if (!hex_supported_dst_type(dst->type)) {
2150
+ if (dst->type != GGML_TYPE_F32) {
2142
2151
  return false;
2143
2152
  }
2144
- if (!hex_supported_src1_type3(src1->type)) {
2153
+ if (src1->type != GGML_TYPE_I32) {
2145
2154
  return false;
2146
2155
  }
2147
2156
  if (src2) {
2148
- if (!hex_supported_src2_type(src2->type)) {
2157
+ if (src2->type != GGML_TYPE_F32) {
2149
2158
  return false;
2150
2159
  }
2151
2160
  int n_dims = op_params[1];
@@ -2168,6 +2177,44 @@ static bool ggml_hexagon_supported_rope(const struct ggml_hexagon_session * sess
2168
2177
  return true;
2169
2178
  }
2170
2179
 
2180
+ static bool ggml_hexagon_supported_ssm_conv(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
2181
+ const struct ggml_tensor * src0 = op->src[0];
2182
+ const struct ggml_tensor * src1 = op->src[1];
2183
+ const struct ggml_tensor * dst = op;
2184
+
2185
+ // Only support FP32 for now
2186
+ if (src0->type != GGML_TYPE_F32 || src1->type != GGML_TYPE_F32 || dst->type != GGML_TYPE_F32) {
2187
+ return false;
2188
+ }
2189
+
2190
+ // Check IO tensor shapes and dims
2191
+ if (src0->ne[3] != 1 || src1->ne[2] != 1 || src1->ne[3] != 1 || dst->ne[3] != 1) {
2192
+ return false; // src0 should be effectively 3D
2193
+ }
2194
+
2195
+ const int d_conv = src1->ne[0];
2196
+ const int d_inner = src0->ne[1];
2197
+ const int n_t = dst->ne[1];
2198
+ const int n_s = dst->ne[2];
2199
+
2200
+ if (src0->ne[0] != d_conv - 1 + n_t || src0->ne[1] != d_inner || src0->ne[2] != n_s) {
2201
+ return false;
2202
+ }
2203
+ if (src1->ne[0] != d_conv || src1->ne[1] != d_inner) {
2204
+ return false;
2205
+ }
2206
+ if (dst->ne[0] != d_inner || dst->ne[1] != n_t || dst->ne[2] != n_s) {
2207
+ return false;
2208
+ }
2209
+
2210
+ // TODO: add support for non-contiguous tensors
2211
+ if (!ggml_is_contiguous(src0) || !ggml_is_contiguous(src1) || !ggml_is_contiguous(dst)) {
2212
+ return false;
2213
+ }
2214
+
2215
+ return true;
2216
+ }
2217
+
2171
2218
  enum dspqbuf_type {
2172
2219
  DSPQBUF_TYPE_DSP_WRITE_CPU_READ = 0,
2173
2220
  DSPQBUF_TYPE_CPU_WRITE_DSP_READ,
@@ -2285,6 +2332,9 @@ static inline size_t init_binary_req(htp_general_req * req, dspqueue_buffer * bu
2285
2332
  case GGML_OP_SUB:
2286
2333
  req->op = HTP_OP_SUB;
2287
2334
  break;
2335
+ case GGML_OP_DIV:
2336
+ req->op = HTP_OP_DIV;
2337
+ break;
2288
2338
  default:
2289
2339
  GGML_ABORT("ggml-hex: binary : unsupported op: %d\n", t->op);
2290
2340
  break;
@@ -2302,6 +2352,16 @@ static inline size_t init_binary_req(htp_general_req * req, dspqueue_buffer * bu
2302
2352
  return n_bufs;
2303
2353
  }
2304
2354
 
2355
+ static inline size_t init_cpy_req(htp_general_req * req, dspqueue_buffer * bufs, const ggml_tensor * t) {
2356
+ req->op = HTP_OP_CPY;
2357
+
2358
+ size_t n_bufs = 0;
2359
+ n_bufs += htp_req_buff_init(&req->src0, &bufs[n_bufs], t->src[0], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
2360
+ n_bufs += htp_req_buff_init(&req->dst, &bufs[n_bufs], t, DSPQBUF_TYPE_DSP_WRITE_CPU_READ);
2361
+
2362
+ return n_bufs;
2363
+ }
2364
+
2305
2365
  static inline size_t init_get_rows_req(htp_general_req * req, dspqueue_buffer * bufs, const ggml_tensor * t) {
2306
2366
  req->op = HTP_OP_GET_ROWS;
2307
2367
 
@@ -2313,6 +2373,17 @@ static inline size_t init_get_rows_req(htp_general_req * req, dspqueue_buffer *
2313
2373
  return n_bufs;
2314
2374
  }
2315
2375
 
2376
+ static inline size_t init_argsort_req(htp_general_req * req, dspqueue_buffer * bufs, const ggml_tensor * t) {
2377
+ req->op = HTP_OP_ARGSORT;
2378
+ memcpy(&req->op_params, &t->op_params, sizeof(t->op_params));
2379
+
2380
+ size_t n_bufs = 0;
2381
+ n_bufs += htp_req_buff_init(&req->src0, &bufs[n_bufs], t->src[0], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
2382
+ n_bufs += htp_req_buff_init(&req->dst, &bufs[n_bufs], t, DSPQBUF_TYPE_DSP_WRITE_CPU_READ);
2383
+
2384
+ return n_bufs;
2385
+ }
2386
+
2316
2387
  template <bool _is_src0_constant>
2317
2388
  static inline size_t init_binary_id_req(htp_general_req * req, dspqueue_buffer * bufs, const ggml_tensor * t) {
2318
2389
  switch (t->op) {
@@ -2367,6 +2438,16 @@ static inline size_t init_unary_req(htp_general_req * req, dspqueue_buffer * buf
2367
2438
  supported = true;
2368
2439
  break;
2369
2440
 
2441
+ case GGML_OP_SQR:
2442
+ req->op = HTP_OP_SQR;
2443
+ supported = true;
2444
+ break;
2445
+
2446
+ case GGML_OP_SQRT:
2447
+ req->op = HTP_OP_SQRT;
2448
+ supported = true;
2449
+ break;
2450
+
2370
2451
  case GGML_OP_UNARY:
2371
2452
  if (ggml_get_unary_op(t) == GGML_UNARY_OP_SILU) {
2372
2453
  req->op = HTP_OP_UNARY_SILU;
@@ -2384,6 +2465,9 @@ static inline size_t init_unary_req(htp_general_req * req, dspqueue_buffer * buf
2384
2465
  } else if (ggml_get_glu_op(t) == GGML_GLU_OP_SWIGLU_OAI) {
2385
2466
  req->op = HTP_OP_GLU_SWIGLU_OAI;
2386
2467
  supported = true;
2468
+ } else if (ggml_get_glu_op(t) == GGML_GLU_OP_GEGLU) {
2469
+ req->op = HTP_OP_GLU_GEGLU;
2470
+ supported = true;
2387
2471
  }
2388
2472
  break;
2389
2473
 
@@ -2408,6 +2492,17 @@ static inline size_t init_unary_req(htp_general_req * req, dspqueue_buffer * buf
2408
2492
  return n_bufs;
2409
2493
  }
2410
2494
 
2495
+ static inline size_t init_sum_rows_req(htp_general_req * req, dspqueue_buffer * bufs, const ggml_tensor * t) {
2496
+ memcpy(&req->op_params, &t->op_params, sizeof(t->op_params));
2497
+ req->op = HTP_OP_SUM_ROWS;
2498
+
2499
+ size_t n_bufs = 0;
2500
+ n_bufs += htp_req_buff_init(&req->src0, &bufs[n_bufs], t->src[0], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
2501
+ n_bufs += htp_req_buff_init(&req->dst, &bufs[n_bufs], t, DSPQBUF_TYPE_DSP_WRITE_CPU_READ);
2502
+
2503
+ return n_bufs;
2504
+ }
2505
+
2411
2506
  static inline size_t init_rope_req(htp_general_req * req, dspqueue_buffer * bufs, const ggml_tensor * t) {
2412
2507
  memcpy(&req->op_params, &t->op_params, sizeof(t->op_params));
2413
2508
  req->op = HTP_OP_ROPE;
@@ -2436,6 +2531,17 @@ static inline size_t init_flash_attn_ext_req(htp_general_req * req, dspqueue_buf
2436
2531
  return n_bufs;
2437
2532
  }
2438
2533
 
2534
+ static inline size_t init_ssm_conv_req(htp_general_req * req, dspqueue_buffer * bufs, const ggml_tensor * t) {
2535
+ req->op = HTP_OP_SSM_CONV;
2536
+
2537
+ size_t n_bufs = 0;
2538
+ n_bufs += htp_req_buff_init(&req->src0, &bufs[n_bufs], t->src[0], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
2539
+ n_bufs += htp_req_buff_init(&req->src1, &bufs[n_bufs], t->src[1], DSPQBUF_TYPE_CONSTANT);
2540
+ n_bufs += htp_req_buff_init(&req->dst, &bufs[n_bufs], t, DSPQBUF_TYPE_DSP_WRITE_CPU_READ);
2541
+
2542
+ return n_bufs;
2543
+ }
2544
+
2439
2545
  static const char * ggml_backend_hexagon_name(ggml_backend_t backend) {
2440
2546
  auto sess = static_cast<ggml_hexagon_session *>(backend->context);
2441
2547
  return sess->name.c_str();
@@ -2448,12 +2554,12 @@ static void ggml_backend_hexagon_free(ggml_backend_t backend) {
2448
2554
  }
2449
2555
 
2450
2556
  static inline bool op_reuse_src1(const ggml_tensor * op1, const ggml_tensor * op0) {
2451
- return (op0 && op0->src[1] == op1->src[1] && ggml_is_quantized(op0->src[0]->type) && ggml_is_quantized(op1->src[1]->type));
2557
+ return (op0 && op0->src[1] == op1->src[1] && ggml_is_quantized(op0->src[0]->type));
2452
2558
  }
2453
2559
 
2454
2560
  static inline bool is_compute_op(ggml_tensor *node)
2455
2561
  {
2456
- return !(ggml_op_is_empty(node->op) || ggml_is_empty(node));
2562
+ return !ggml_op_is_empty(node->op) && !ggml_is_empty(node) && (node->flags & GGML_TENSOR_FLAG_COMPUTE);
2457
2563
  }
2458
2564
 
2459
2565
  // scan the graph and figure out last compute op index
@@ -2475,7 +2581,7 @@ static ggml_status ggml_backend_hexagon_graph_compute(ggml_backend_t backend, gg
2475
2581
 
2476
2582
  const int last = last_compute_op(graph);
2477
2583
 
2478
- const struct ggml_tensor * prev_quant_op = nullptr; // prev executed op with quantizer
2584
+ const struct ggml_tensor * prev_op = nullptr; // prev executed op
2479
2585
 
2480
2586
  for (int i = 0; i < graph->n_nodes; ++i) {
2481
2587
  ggml_tensor * node = graph->nodes[i];
@@ -2487,10 +2593,12 @@ static ggml_status ggml_backend_hexagon_graph_compute(ggml_backend_t backend, gg
2487
2593
  uint32_t flags = 0;
2488
2594
 
2489
2595
  // skip quantizer if src1 is reused
2490
- if (op_reuse_src1(node, prev_quant_op)) {
2596
+ if (op_reuse_src1(node, prev_op)) {
2491
2597
  flags |= HTP_OPFLAGS_SKIP_QUANTIZE;
2492
2598
  }
2493
2599
 
2600
+ prev_op = node;
2601
+
2494
2602
  // ask for early notification for the last Op
2495
2603
  if (i == last) {
2496
2604
  flags |= HTP_OPFLAGS_EARLY_WAKEUP;
@@ -2503,7 +2611,6 @@ static ggml_status ggml_backend_hexagon_graph_compute(ggml_backend_t backend, gg
2503
2611
  } else {
2504
2612
  ggml_hexagon_dispatch_op<init_binary_req<false>>(sess, node, flags);
2505
2613
  }
2506
- prev_quant_op = node;
2507
2614
  break;
2508
2615
  case GGML_OP_MUL_MAT_ID:
2509
2616
  if (ggml_is_quantized(node->src[0]->type)) {
@@ -2511,11 +2618,11 @@ static ggml_status ggml_backend_hexagon_graph_compute(ggml_backend_t backend, gg
2511
2618
  } else {
2512
2619
  ggml_hexagon_dispatch_op<init_binary_id_req<false>>(sess, node, flags);
2513
2620
  }
2514
- prev_quant_op = node;
2515
2621
  break;
2516
2622
  case GGML_OP_MUL:
2517
2623
  case GGML_OP_ADD:
2518
2624
  case GGML_OP_SUB:
2625
+ case GGML_OP_DIV:
2519
2626
  ggml_hexagon_dispatch_op<init_binary_req<false>>(sess, node, flags);
2520
2627
  break;
2521
2628
  case GGML_OP_ADD_ID:
@@ -2525,6 +2632,13 @@ static ggml_status ggml_backend_hexagon_graph_compute(ggml_backend_t backend, gg
2525
2632
  case GGML_OP_SCALE:
2526
2633
  ggml_hexagon_dispatch_op<init_unary_req>(sess, node, flags);
2527
2634
  break;
2635
+ case GGML_OP_SQR:
2636
+ case GGML_OP_SQRT:
2637
+ ggml_hexagon_dispatch_op<init_unary_req>(sess, node, flags);
2638
+ break;
2639
+ case GGML_OP_SUM_ROWS:
2640
+ ggml_hexagon_dispatch_op<init_sum_rows_req>(sess, node, flags);
2641
+ break;
2528
2642
  case GGML_OP_UNARY:
2529
2643
  if ((ggml_get_unary_op(node) == GGML_UNARY_OP_SILU) ||
2530
2644
  (ggml_get_unary_op(node) == GGML_UNARY_OP_GELU)) {
@@ -2533,7 +2647,8 @@ static ggml_status ggml_backend_hexagon_graph_compute(ggml_backend_t backend, gg
2533
2647
  break;
2534
2648
  case GGML_OP_GLU:
2535
2649
  if ((ggml_get_glu_op(node) == GGML_GLU_OP_SWIGLU) ||
2536
- (ggml_get_glu_op(node) == GGML_GLU_OP_SWIGLU_OAI)) {
2650
+ (ggml_get_glu_op(node) == GGML_GLU_OP_SWIGLU_OAI) ||
2651
+ (ggml_get_glu_op(node) == GGML_GLU_OP_GEGLU)) {
2537
2652
  ggml_hexagon_dispatch_op<init_unary_req>(sess, node, flags);
2538
2653
  }
2539
2654
  break;
@@ -2557,6 +2672,18 @@ static ggml_status ggml_backend_hexagon_graph_compute(ggml_backend_t backend, gg
2557
2672
  ggml_hexagon_dispatch_op<init_get_rows_req>(sess, node, flags);
2558
2673
  break;
2559
2674
 
2675
+ case GGML_OP_CPY:
2676
+ ggml_hexagon_dispatch_op<init_cpy_req>(sess, node, flags);
2677
+ break;
2678
+
2679
+ case GGML_OP_ARGSORT:
2680
+ ggml_hexagon_dispatch_op<init_argsort_req>(sess, node, flags);
2681
+ break;
2682
+
2683
+ case GGML_OP_SSM_CONV:
2684
+ ggml_hexagon_dispatch_op<init_ssm_conv_req>(sess, node, flags);
2685
+ break;
2686
+
2560
2687
  default:
2561
2688
  GGML_ABORT("\nggml-hex: graph-compute %s is not supported\n", ggml_op_desc(node));
2562
2689
  }
@@ -2632,7 +2759,7 @@ static std::vector<int> ggml_hexagon_graph_optimize_reorder(const std::vector<no
2632
2759
  // The main goal here is to stack the MUL_MAT ops with the same src1 input.
2633
2760
  // This allows use to reuse dynamically quantized src1 in VTCM.
2634
2761
 
2635
- // TODO: the current version might do incorrect reodering in cases where quantized src0
2762
+ // TODO: the current version might do incorrect reordering in cases where quantized src0
2636
2763
  // input is an output of another Op.
2637
2764
 
2638
2765
  for (int i0 = 0; i0 < n; i0++) {
@@ -2649,7 +2776,7 @@ static std::vector<int> ggml_hexagon_graph_optimize_reorder(const std::vector<no
2649
2776
  }
2650
2777
 
2651
2778
  // that many nodes forward to search for stackable nodes that can reuse VTCM
2652
- constexpr int N_FORWARD = 8;
2779
+ constexpr int N_FORWARD = 16;
2653
2780
 
2654
2781
  for (int i1 = i0 + 1; i1 < i0 + N_FORWARD && i1 < n; i1++) {
2655
2782
  if (used[i1]) {
@@ -2858,6 +2985,27 @@ static bool ggml_hexagon_supported_buffers(ggml_hexagon_session *sess, const str
2858
2985
  return true;
2859
2986
  }
2860
2987
 
2988
+ static bool ggml_hexagon_supported_cpy(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
2989
+ const struct ggml_tensor * src0 = op->src[0];
2990
+ const struct ggml_tensor * dst = op;
2991
+
2992
+ // for now we can do f32 -> f16 and f16 -> f32 (without reshaping)
2993
+ if (src0->type != GGML_TYPE_F32 && src0->type != GGML_TYPE_F16) return false;
2994
+ if ( dst->type != GGML_TYPE_F32 && dst->type != GGML_TYPE_F16) return false;
2995
+
2996
+ const bool sametype = (src0->type == dst->type);
2997
+ const bool transposed = ggml_is_transposed(src0) || ggml_is_transposed(dst);
2998
+ const bool sameshape = !transposed && ggml_are_same_shape(src0, dst);
2999
+
3000
+ // can handle any shape and any same-type (pretty slow if reshaping is required)
3001
+ if (sametype) return true;
3002
+
3003
+ // cannot handle re-shaping and type conversion at the same time
3004
+ if (!sameshape) return false;
3005
+
3006
+ return true;
3007
+ }
3008
+
2861
3009
  static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
2862
3010
  auto sess = static_cast<ggml_hexagon_session *>(dev->context);
2863
3011
 
@@ -2888,6 +3036,7 @@ static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, cons
2888
3036
  case GGML_OP_MUL:
2889
3037
  case GGML_OP_ADD:
2890
3038
  case GGML_OP_SUB:
3039
+ case GGML_OP_DIV:
2891
3040
  supp = ggml_hexagon_supported_binary(sess, op);
2892
3041
  break;
2893
3042
 
@@ -2900,6 +3049,15 @@ static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, cons
2900
3049
  supp = ggml_hexagon_supported_unary(sess, op);
2901
3050
  break;
2902
3051
 
3052
+ case GGML_OP_SQR:
3053
+ case GGML_OP_SQRT:
3054
+ supp = ggml_hexagon_supported_unary(sess, op);
3055
+ break;
3056
+
3057
+ case GGML_OP_SUM_ROWS:
3058
+ supp = ggml_hexagon_supported_sum_rows(sess, op);
3059
+ break;
3060
+
2903
3061
  case GGML_OP_SOFT_MAX:
2904
3062
  supp = ggml_hexagon_supported_softmax(sess, op);
2905
3063
  break;
@@ -2915,7 +3073,7 @@ static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, cons
2915
3073
  case GGML_OP_GLU:
2916
3074
  {
2917
3075
  const auto glu_op = ggml_get_glu_op(op);
2918
- if ((glu_op == GGML_GLU_OP_SWIGLU) || (glu_op == GGML_GLU_OP_SWIGLU_OAI)) {
3076
+ if ((glu_op == GGML_GLU_OP_SWIGLU) || (glu_op == GGML_GLU_OP_SWIGLU_OAI) || (glu_op == GGML_GLU_OP_GEGLU)) {
2919
3077
  supp = ggml_hexagon_supported_activations(sess, op);
2920
3078
  }
2921
3079
  break;
@@ -2936,6 +3094,18 @@ static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, cons
2936
3094
  supp = ggml_hexagon_supported_get_rows(sess, op);
2937
3095
  break;
2938
3096
 
3097
+ case GGML_OP_CPY:
3098
+ supp = ggml_hexagon_supported_cpy(sess, op);
3099
+ break;
3100
+
3101
+ case GGML_OP_ARGSORT:
3102
+ supp = ggml_hexagon_supported_argsort(sess, op);
3103
+ break;
3104
+
3105
+ case GGML_OP_SSM_CONV:
3106
+ supp = ggml_hexagon_supported_ssm_conv(sess, op);
3107
+ break;
3108
+
2939
3109
  default:
2940
3110
  break;
2941
3111
  }
@@ -3010,10 +3180,12 @@ ggml_hexagon_registry::ggml_hexagon_registry(ggml_backend_reg_t reg) {
3010
3180
  }
3011
3181
  }
3012
3182
 
3183
+ #if defined(__ANDROID__)
3013
3184
  if (opt_arch < 75) {
3014
3185
  opt_ndev = 1;
3015
3186
  GGML_LOG_WARN("ggml-hex: forcing ndev to 1 for SoCs archs lower than v75.\n");
3016
3187
  }
3188
+ #endif
3017
3189
 
3018
3190
  GGML_LOG_INFO("ggml-hex: Hexagon Arch version v%d\n", opt_arch);
3019
3191
 
@@ -3061,7 +3233,7 @@ static ggml_backend_dev_t ggml_backend_hexagon_reg_get_device(ggml_backend_reg_t
3061
3233
  }
3062
3234
 
3063
3235
  static void * ggml_backend_hexagon_get_proc_address(ggml_backend_reg_t reg, const char * name) {
3064
- if (strcmp(name, "ggml_backend_dev_get_extra_bufts") == 0) {
3236
+ if (strcmp(name, "ggml_backend_dev_get_extra_bufts") == 0 && opt_hostbuf) {
3065
3237
  ggml_backend_dev_get_extra_bufts_t fct = ggml_backend_hexagon_device_get_extra_buffers_type;
3066
3238
  return (void *) fct;
3067
3239
  }
@@ -3078,34 +3250,31 @@ static void ggml_hexagon_init(ggml_backend_reg * reg) {
3078
3250
  static_assert((unsigned int) HTP_TYPE_MXFP4 == (unsigned int) GGML_TYPE_MXFP4,
3079
3251
  "please update hexagon_type to match ggml_type");
3080
3252
 
3253
+ const char * str_experimental = getenv("GGML_HEXAGON_EXPERIMENTAL");
3081
3254
  const char * str_verbose = getenv("GGML_HEXAGON_VERBOSE");
3082
3255
  const char * str_hostbuf = getenv("GGML_HEXAGON_HOSTBUF");
3083
-
3256
+ const char * str_opmask = getenv("GGML_HEXAGON_OPMASK");
3257
+ const char * str_opsync = getenv("GGML_HEXAGON_OPSYNC");
3258
+ const char * str_profile = getenv("GGML_HEXAGON_PROFILE");
3259
+ const char * str_etm = getenv("GGML_HEXAGON_ETM");
3260
+ const char * str_nhvx = getenv("GGML_HEXAGON_NHVX");
3261
+ const char * str_ndev = getenv("GGML_HEXAGON_NDEV");
3262
+ const char * str_arch = getenv("GGML_HEXAGON_ARCH");
3263
+
3264
+ opt_experimental = str_experimental ? atoi(str_experimental) : 0;
3084
3265
  opt_verbose = str_verbose ? atoi(str_verbose) : 0;
3085
- opt_profile = getenv("GGML_HEXAGON_PROFILE") != nullptr;
3086
- opt_etm = getenv("GGML_HEXAGON_ETM") != nullptr;
3087
- opt_experimental = getenv("GGML_HEXAGON_EXPERIMENTAL") != nullptr;
3266
+ opt_hostbuf = str_hostbuf ? atoi(str_hostbuf) : opt_hostbuf;
3267
+ opt_opmask = str_opmask ? strtoul(str_opmask, NULL, 0) : opt_opmask;
3268
+ opt_opsync = str_opsync ? atoi(str_opsync) : 0;
3269
+ opt_profile = str_profile ? atoi(str_profile) : 0;
3270
+ opt_etm = str_etm ? atoi(str_etm) : 0;
3271
+ opt_nhvx = str_nhvx ? strtoul(str_nhvx, NULL, 0) : opt_nhvx;
3272
+ opt_ndev = str_ndev ? strtoul(str_ndev, NULL, 0) : opt_ndev;
3088
3273
 
3089
- const char * str_opmask = getenv("GGML_HEXAGON_OPMASK");
3090
- if (str_opmask != nullptr) {
3091
- opt_opmask = strtoul(str_opmask, NULL, 0);
3274
+ if (opt_ndev > GGML_HEXAGON_MAX_SESSIONS) {
3275
+ opt_ndev = GGML_HEXAGON_MAX_SESSIONS;
3092
3276
  }
3093
- opt_opsync = getenv("GGML_HEXAGON_OPSYNC") != nullptr;
3094
3277
 
3095
- const char * str_ndev = getenv("GGML_HEXAGON_NDEV");
3096
- if (str_ndev) {
3097
- opt_ndev = strtoul(str_ndev, NULL, 0);
3098
- if (opt_ndev > GGML_HEXAGON_MAX_SESSIONS) {
3099
- opt_ndev = GGML_HEXAGON_MAX_SESSIONS;
3100
- }
3101
- }
3102
-
3103
- const char * str_nhvx = getenv("GGML_HEXAGON_NHVX");
3104
- if (str_nhvx) {
3105
- opt_nhvx = strtoul(str_nhvx, NULL, 0);
3106
- }
3107
-
3108
- const char * str_arch = getenv("GGML_HEXAGON_ARCH");
3109
3278
  if (str_arch) {
3110
3279
  if (str_arch[0] == 'v') {
3111
3280
  str_arch++;
@@ -3139,6 +3308,11 @@ ggml_backend_reg_t ggml_backend_hexagon_reg(void) {
3139
3308
  static std::mutex mutex;
3140
3309
  std::lock_guard<std::mutex> lock(mutex);
3141
3310
  if (!initialized) {
3311
+ auto nErr = htpdrv_init();
3312
+ if (nErr != AEE_SUCCESS) {
3313
+ return NULL;
3314
+ }
3315
+
3142
3316
  ggml_hexagon_init(&reg);
3143
3317
  }
3144
3318