whispercpp 1.3.3 → 1.3.5

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (963)
  1. checksums.yaml +4 -4
  2. data/README.md +60 -43
  3. data/ext/extconf.rb +2 -2
  4. data/ext/ruby_whisper.c +14 -2
  5. data/ext/ruby_whisper.h +39 -0
  6. data/ext/ruby_whisper_context.c +22 -22
  7. data/ext/ruby_whisper_model.c +12 -12
  8. data/ext/ruby_whisper_params.c +79 -25
  9. data/ext/ruby_whisper_segment.c +84 -19
  10. data/ext/ruby_whisper_token.c +351 -0
  11. data/ext/ruby_whisper_transcribe.cpp +1 -1
  12. data/ext/ruby_whisper_vad_context.c +75 -0
  13. data/ext/ruby_whisper_vad_context_detect.cpp +50 -0
  14. data/ext/ruby_whisper_vad_segment.c +139 -0
  15. data/ext/ruby_whisper_vad_segments.c +106 -0
  16. data/ext/sources/CMakeLists.txt +4 -1
  17. data/ext/sources/bindings/javascript/package.json +1 -1
  18. data/ext/sources/cmake/arm64-apple-clang.cmake +16 -0
  19. data/ext/sources/cmake/arm64-windows-llvm.cmake +16 -0
  20. data/ext/sources/cmake/riscv64-spacemit-linux-gnu-gcc.cmake +29 -0
  21. data/ext/sources/cmake/x64-windows-llvm.cmake +5 -0
  22. data/ext/sources/examples/CMakeLists.txt +1 -0
  23. data/ext/sources/examples/addon.node/addon.cpp +19 -19
  24. data/ext/sources/examples/addon.node/index.js +7 -5
  25. data/ext/sources/examples/addon.node/vad-example.js +2 -2
  26. data/ext/sources/examples/bench/bench.cpp +26 -16
  27. data/ext/sources/examples/bench.wasm/index-tmpl.html +10 -9
  28. data/ext/sources/examples/cli/cli.cpp +122 -111
  29. data/ext/sources/examples/command/command.cpp +26 -24
  30. data/ext/sources/examples/command.wasm/index-tmpl.html +5 -4
  31. data/ext/sources/examples/common-ggml.cpp +2 -0
  32. data/ext/sources/examples/lsp/CMakeLists.txt +2 -1
  33. data/ext/sources/examples/lsp/lsp.cpp +19 -17
  34. data/ext/sources/examples/quantize/CMakeLists.txt +2 -1
  35. data/ext/sources/examples/server/server.cpp +34 -24
  36. data/ext/sources/examples/server.py +6 -1
  37. data/ext/sources/examples/stream/stream.cpp +4 -2
  38. data/ext/sources/examples/stream.wasm/emscripten.cpp +6 -6
  39. data/ext/sources/examples/stream.wasm/index-tmpl.html +82 -5
  40. data/ext/sources/examples/talk-llama/CMakeLists.txt +7 -3
  41. data/ext/sources/examples/talk-llama/llama-adapter.cpp +113 -7
  42. data/ext/sources/examples/talk-llama/llama-adapter.h +13 -1
  43. data/ext/sources/examples/talk-llama/llama-arch.cpp +2136 -1491
  44. data/ext/sources/examples/talk-llama/llama-arch.h +125 -3
  45. data/ext/sources/examples/talk-llama/llama-batch.cpp +174 -100
  46. data/ext/sources/examples/talk-llama/llama-batch.h +46 -20
  47. data/ext/sources/examples/talk-llama/llama-chat.cpp +199 -8
  48. data/ext/sources/examples/talk-llama/llama-chat.h +11 -0
  49. data/ext/sources/examples/talk-llama/llama-context.cpp +1213 -413
  50. data/ext/sources/examples/talk-llama/llama-context.h +99 -36
  51. data/ext/sources/examples/talk-llama/llama-cparams.h +5 -4
  52. data/ext/sources/examples/talk-llama/llama-grammar.cpp +288 -53
  53. data/ext/sources/examples/talk-llama/llama-grammar.h +22 -1
  54. data/ext/sources/examples/talk-llama/llama-graph.cpp +883 -294
  55. data/ext/sources/examples/talk-llama/llama-graph.h +361 -161
  56. data/ext/sources/examples/talk-llama/llama-hparams.cpp +144 -6
  57. data/ext/sources/examples/talk-llama/llama-hparams.h +100 -23
  58. data/ext/sources/examples/talk-llama/llama-impl.cpp +7 -3
  59. data/ext/sources/examples/talk-llama/llama-impl.h +3 -1
  60. data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.cpp +328 -0
  61. data/ext/sources/examples/talk-llama/{llama-kv-cache-unified-iswa.h → llama-kv-cache-iswa.h} +38 -29
  62. data/ext/sources/examples/talk-llama/llama-kv-cache.cpp +2100 -0
  63. data/ext/sources/examples/talk-llama/llama-kv-cache.h +373 -27
  64. data/ext/sources/examples/talk-llama/llama-kv-cells.h +124 -30
  65. data/ext/sources/examples/talk-llama/llama-memory-hybrid.cpp +63 -41
  66. data/ext/sources/examples/talk-llama/llama-memory-hybrid.h +30 -29
  67. data/ext/sources/examples/talk-llama/llama-memory-recurrent.cpp +77 -35
  68. data/ext/sources/examples/talk-llama/llama-memory-recurrent.h +15 -16
  69. data/ext/sources/examples/talk-llama/llama-memory.h +16 -10
  70. data/ext/sources/examples/talk-llama/llama-mmap.cpp +172 -37
  71. data/ext/sources/examples/talk-llama/llama-mmap.h +8 -3
  72. data/ext/sources/examples/talk-llama/llama-model-loader.cpp +93 -9
  73. data/ext/sources/examples/talk-llama/llama-model-loader.h +9 -2
  74. data/ext/sources/examples/talk-llama/llama-model-saver.cpp +3 -0
  75. data/ext/sources/examples/talk-llama/llama-model.cpp +3369 -10145
  76. data/ext/sources/examples/talk-llama/llama-model.h +104 -12
  77. data/ext/sources/examples/talk-llama/llama-quant.cpp +53 -30
  78. data/ext/sources/examples/talk-llama/llama-sampling.cpp +1520 -324
  79. data/ext/sources/examples/talk-llama/llama-sampling.h +19 -7
  80. data/ext/sources/examples/talk-llama/llama-vocab.cpp +562 -39
  81. data/ext/sources/examples/talk-llama/llama-vocab.h +50 -0
  82. data/ext/sources/examples/talk-llama/llama.cpp +794 -12
  83. data/ext/sources/examples/talk-llama/llama.h +246 -190
  84. data/ext/sources/examples/talk-llama/models/afmoe.cpp +191 -0
  85. data/ext/sources/examples/talk-llama/models/apertus.cpp +125 -0
  86. data/ext/sources/examples/talk-llama/models/arcee.cpp +135 -0
  87. data/ext/sources/examples/talk-llama/models/arctic.cpp +138 -0
  88. data/ext/sources/examples/talk-llama/models/arwkv7.cpp +86 -0
  89. data/ext/sources/examples/talk-llama/models/baichuan.cpp +122 -0
  90. data/ext/sources/examples/talk-llama/models/bailingmoe.cpp +144 -0
  91. data/ext/sources/examples/talk-llama/models/bailingmoe2.cpp +135 -0
  92. data/ext/sources/examples/talk-llama/models/bert.cpp +178 -0
  93. data/ext/sources/examples/talk-llama/models/bitnet.cpp +160 -0
  94. data/ext/sources/examples/talk-llama/models/bloom.cpp +101 -0
  95. data/ext/sources/examples/talk-llama/models/chameleon.cpp +178 -0
  96. data/ext/sources/examples/talk-llama/models/chatglm.cpp +132 -0
  97. data/ext/sources/examples/talk-llama/models/codeshell.cpp +111 -0
  98. data/ext/sources/examples/talk-llama/models/cogvlm.cpp +102 -0
  99. data/ext/sources/examples/talk-llama/models/cohere2-iswa.cpp +134 -0
  100. data/ext/sources/examples/talk-llama/models/command-r.cpp +122 -0
  101. data/ext/sources/examples/talk-llama/models/dbrx.cpp +123 -0
  102. data/ext/sources/examples/talk-llama/models/deci.cpp +135 -0
  103. data/ext/sources/examples/talk-llama/models/deepseek.cpp +144 -0
  104. data/ext/sources/examples/talk-llama/models/deepseek2.cpp +259 -0
  105. data/ext/sources/examples/talk-llama/models/dots1.cpp +134 -0
  106. data/ext/sources/examples/talk-llama/models/dream.cpp +105 -0
  107. data/ext/sources/examples/talk-llama/models/ernie4-5-moe.cpp +150 -0
  108. data/ext/sources/examples/talk-llama/models/ernie4-5.cpp +110 -0
  109. data/ext/sources/examples/talk-llama/models/exaone.cpp +114 -0
  110. data/ext/sources/examples/talk-llama/models/exaone4.cpp +123 -0
  111. data/ext/sources/examples/talk-llama/models/falcon-h1.cpp +113 -0
  112. data/ext/sources/examples/talk-llama/models/falcon.cpp +120 -0
  113. data/ext/sources/examples/talk-llama/models/gemma-embedding.cpp +116 -0
  114. data/ext/sources/examples/talk-llama/models/gemma.cpp +112 -0
  115. data/ext/sources/examples/talk-llama/models/gemma2-iswa.cpp +128 -0
  116. data/ext/sources/examples/talk-llama/models/gemma3.cpp +155 -0
  117. data/ext/sources/examples/talk-llama/models/gemma3n-iswa.cpp +384 -0
  118. data/ext/sources/examples/talk-llama/models/glm4-moe.cpp +170 -0
  119. data/ext/sources/examples/talk-llama/models/glm4.cpp +150 -0
  120. data/ext/sources/examples/talk-llama/models/gpt2.cpp +105 -0
  121. data/ext/sources/examples/talk-llama/models/gptneox.cpp +144 -0
  122. data/ext/sources/examples/talk-llama/models/granite-hybrid.cpp +196 -0
  123. data/ext/sources/examples/talk-llama/models/granite.cpp +211 -0
  124. data/ext/sources/examples/talk-llama/models/graph-context-mamba.cpp +283 -0
  125. data/ext/sources/examples/talk-llama/models/grok.cpp +159 -0
  126. data/ext/sources/examples/talk-llama/models/grovemoe.cpp +141 -0
  127. data/ext/sources/examples/talk-llama/models/hunyuan-dense.cpp +132 -0
  128. data/ext/sources/examples/talk-llama/models/hunyuan-moe.cpp +154 -0
  129. data/ext/sources/examples/talk-llama/models/internlm2.cpp +120 -0
  130. data/ext/sources/examples/talk-llama/models/jais.cpp +86 -0
  131. data/ext/sources/examples/talk-llama/models/jamba.cpp +106 -0
  132. data/ext/sources/examples/talk-llama/models/lfm2.cpp +175 -0
  133. data/ext/sources/examples/talk-llama/models/llada-moe.cpp +122 -0
  134. data/ext/sources/examples/talk-llama/models/llada.cpp +99 -0
  135. data/ext/sources/examples/talk-llama/models/llama-iswa.cpp +178 -0
  136. data/ext/sources/examples/talk-llama/models/llama.cpp +168 -0
  137. data/ext/sources/examples/talk-llama/models/maincoder.cpp +117 -0
  138. data/ext/sources/examples/talk-llama/models/mamba.cpp +55 -0
  139. data/ext/sources/examples/talk-llama/models/mimo2-iswa.cpp +123 -0
  140. data/ext/sources/examples/talk-llama/models/minicpm3.cpp +199 -0
  141. data/ext/sources/examples/talk-llama/models/minimax-m2.cpp +124 -0
  142. data/ext/sources/examples/talk-llama/models/mistral3.cpp +160 -0
  143. data/ext/sources/examples/talk-llama/models/models.h +569 -0
  144. data/ext/sources/examples/talk-llama/models/modern-bert.cpp +116 -0
  145. data/ext/sources/examples/talk-llama/models/mpt.cpp +126 -0
  146. data/ext/sources/examples/talk-llama/models/nemotron-h.cpp +150 -0
  147. data/ext/sources/examples/talk-llama/models/nemotron.cpp +122 -0
  148. data/ext/sources/examples/talk-llama/models/neo-bert.cpp +104 -0
  149. data/ext/sources/examples/talk-llama/models/olmo.cpp +121 -0
  150. data/ext/sources/examples/talk-llama/models/olmo2.cpp +150 -0
  151. data/ext/sources/examples/talk-llama/models/olmoe.cpp +124 -0
  152. data/ext/sources/examples/talk-llama/models/openai-moe-iswa.cpp +127 -0
  153. data/ext/sources/examples/talk-llama/models/openelm.cpp +124 -0
  154. data/ext/sources/examples/talk-llama/models/orion.cpp +123 -0
  155. data/ext/sources/examples/talk-llama/models/pangu-embedded.cpp +121 -0
  156. data/ext/sources/examples/talk-llama/models/phi2.cpp +121 -0
  157. data/ext/sources/examples/talk-llama/models/phi3.cpp +152 -0
  158. data/ext/sources/examples/talk-llama/models/plamo.cpp +110 -0
  159. data/ext/sources/examples/talk-llama/models/plamo2.cpp +316 -0
  160. data/ext/sources/examples/talk-llama/models/plamo3.cpp +128 -0
  161. data/ext/sources/examples/talk-llama/models/plm.cpp +168 -0
  162. data/ext/sources/examples/talk-llama/models/qwen.cpp +108 -0
  163. data/ext/sources/examples/talk-llama/models/qwen2.cpp +126 -0
  164. data/ext/sources/examples/talk-llama/models/qwen2moe.cpp +151 -0
  165. data/ext/sources/examples/talk-llama/models/qwen2vl.cpp +117 -0
  166. data/ext/sources/examples/talk-llama/models/qwen3.cpp +117 -0
  167. data/ext/sources/examples/talk-llama/models/qwen3moe.cpp +124 -0
  168. data/ext/sources/examples/talk-llama/models/qwen3next.cpp +873 -0
  169. data/ext/sources/examples/talk-llama/models/qwen3vl-moe.cpp +149 -0
  170. data/ext/sources/examples/talk-llama/models/qwen3vl.cpp +141 -0
  171. data/ext/sources/examples/talk-llama/models/refact.cpp +94 -0
  172. data/ext/sources/examples/talk-llama/models/rnd1.cpp +126 -0
  173. data/ext/sources/examples/talk-llama/models/rwkv6-base.cpp +162 -0
  174. data/ext/sources/examples/talk-llama/models/rwkv6.cpp +94 -0
  175. data/ext/sources/examples/talk-llama/models/rwkv6qwen2.cpp +86 -0
  176. data/ext/sources/examples/talk-llama/models/rwkv7-base.cpp +135 -0
  177. data/ext/sources/examples/talk-llama/models/rwkv7.cpp +90 -0
  178. data/ext/sources/examples/talk-llama/models/seed-oss.cpp +124 -0
  179. data/ext/sources/examples/talk-llama/models/smallthinker.cpp +126 -0
  180. data/ext/sources/examples/talk-llama/models/smollm3.cpp +128 -0
  181. data/ext/sources/examples/talk-llama/models/stablelm.cpp +146 -0
  182. data/ext/sources/examples/talk-llama/models/starcoder.cpp +100 -0
  183. data/ext/sources/examples/talk-llama/models/starcoder2.cpp +121 -0
  184. data/ext/sources/examples/talk-llama/models/t5-dec.cpp +166 -0
  185. data/ext/sources/examples/talk-llama/models/t5-enc.cpp +96 -0
  186. data/ext/sources/examples/talk-llama/models/wavtokenizer-dec.cpp +149 -0
  187. data/ext/sources/examples/talk-llama/models/xverse.cpp +108 -0
  188. data/ext/sources/examples/talk-llama/talk-llama.cpp +9 -6
  189. data/ext/sources/examples/talk-llama/unicode.cpp +309 -16
  190. data/ext/sources/examples/talk-llama/unicode.h +45 -0
  191. data/ext/sources/examples/vad-speech-segments/CMakeLists.txt +1 -1
  192. data/ext/sources/examples/wchess/wchess.cmd/wchess.cmd.cpp +4 -2
  193. data/ext/sources/examples/whisper.wasm/index-tmpl.html +18 -17
  194. data/ext/sources/ggml/CMakeLists.txt +135 -79
  195. data/ext/sources/ggml/cmake/ggml-config.cmake.in +132 -93
  196. data/ext/sources/ggml/include/ggml-alloc.h +9 -0
  197. data/ext/sources/ggml/include/ggml-backend.h +21 -2
  198. data/ext/sources/ggml/include/ggml-cpu.h +2 -1
  199. data/ext/sources/ggml/include/ggml-hexagon.h +19 -0
  200. data/ext/sources/ggml/include/ggml-metal.h +1 -6
  201. data/ext/sources/ggml/include/ggml-opt.h +25 -6
  202. data/ext/sources/ggml/include/ggml-rpc.h +8 -11
  203. data/ext/sources/ggml/include/ggml-webgpu.h +19 -0
  204. data/ext/sources/ggml/include/ggml-zdnn.h +17 -0
  205. data/ext/sources/ggml/include/ggml-zendnn.h +22 -0
  206. data/ext/sources/ggml/include/ggml.h +406 -23
  207. data/ext/sources/ggml/src/CMakeLists.txt +99 -13
  208. data/ext/sources/ggml/src/ggml-alloc.c +368 -161
  209. data/ext/sources/ggml/src/ggml-backend-impl.h +5 -5
  210. data/ext/sources/ggml/src/ggml-backend-reg.cpp +55 -14
  211. data/ext/sources/ggml/src/ggml-backend.cpp +290 -57
  212. data/ext/sources/ggml/src/ggml-blas/CMakeLists.txt +17 -3
  213. data/ext/sources/ggml/src/ggml-blas/ggml-blas.cpp +10 -13
  214. data/ext/sources/ggml/src/ggml-cann/CMakeLists.txt +14 -0
  215. data/ext/sources/ggml/src/ggml-cann/acl_tensor.cpp +59 -45
  216. data/ext/sources/ggml/src/ggml-cann/acl_tensor.h +138 -47
  217. data/ext/sources/ggml/src/ggml-cann/aclnn_ops.cpp +2586 -1917
  218. data/ext/sources/ggml/src/ggml-cann/aclnn_ops.h +348 -309
  219. data/ext/sources/ggml/src/ggml-cann/common.h +350 -133
  220. data/ext/sources/ggml/src/ggml-cann/ggml-cann.cpp +894 -625
  221. data/ext/sources/ggml/src/ggml-common.h +17 -0
  222. data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +167 -75
  223. data/ext/sources/ggml/src/ggml-cpu/amx/amx.cpp +5 -2
  224. data/ext/sources/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +4 -0
  225. data/ext/sources/ggml/src/ggml-cpu/arch/arm/quants.c +560 -622
  226. data/ext/sources/ggml/src/ggml-cpu/arch/arm/repack.cpp +1002 -270
  227. data/ext/sources/ggml/src/ggml-cpu/arch/loongarch/quants.c +107 -587
  228. data/ext/sources/ggml/src/ggml-cpu/arch/powerpc/quants.c +162 -589
  229. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/cpu-feats.cpp +38 -0
  230. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/quants.c +373 -486
  231. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/repack.cpp +3 -58
  232. data/ext/sources/ggml/src/ggml-cpu/arch/s390/cpu-feats.cpp +50 -0
  233. data/ext/sources/ggml/src/ggml-cpu/arch/s390/quants.c +521 -353
  234. data/ext/sources/ggml/src/ggml-cpu/arch/wasm/quants.c +54 -314
  235. data/ext/sources/ggml/src/ggml-cpu/arch/x86/quants.c +184 -675
  236. data/ext/sources/ggml/src/ggml-cpu/arch/x86/repack.cpp +4682 -1660
  237. data/ext/sources/ggml/src/ggml-cpu/arch-fallback.h +82 -4
  238. data/ext/sources/ggml/src/ggml-cpu/common.h +14 -0
  239. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-impl.h +18 -9
  240. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +263 -111
  241. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.cpp +39 -28
  242. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.cpp +683 -82
  243. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.h +38 -43
  244. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +435 -119
  245. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm-ppc.h +333 -0
  246. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1234 -1182
  247. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.h +6 -0
  248. data/ext/sources/ggml/src/ggml-cpu/ops.cpp +2167 -1480
  249. data/ext/sources/ggml/src/ggml-cpu/ops.h +10 -12
  250. data/ext/sources/ggml/src/ggml-cpu/quants.c +35 -0
  251. data/ext/sources/ggml/src/ggml-cpu/quants.h +8 -0
  252. data/ext/sources/ggml/src/ggml-cpu/repack.cpp +1132 -81
  253. data/ext/sources/ggml/src/ggml-cpu/repack.h +36 -0
  254. data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +120 -93
  255. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.cpp +1025 -0
  256. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.h +13 -0
  257. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp +3196 -0
  258. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime_kernels.h +26 -0
  259. data/ext/sources/ggml/src/ggml-cpu/traits.cpp +2 -2
  260. data/ext/sources/ggml/src/ggml-cpu/traits.h +1 -1
  261. data/ext/sources/ggml/src/ggml-cpu/unary-ops.cpp +151 -0
  262. data/ext/sources/ggml/src/ggml-cpu/unary-ops.h +7 -0
  263. data/ext/sources/ggml/src/ggml-cpu/vec.cpp +294 -27
  264. data/ext/sources/ggml/src/ggml-cpu/vec.h +606 -48
  265. data/ext/sources/ggml/src/ggml-cuda/CMakeLists.txt +92 -17
  266. data/ext/sources/ggml/src/ggml-cuda/add-id.cu +58 -0
  267. data/ext/sources/ggml/src/ggml-cuda/add-id.cuh +3 -0
  268. data/ext/sources/ggml/src/ggml-cuda/argmax.cu +2 -2
  269. data/ext/sources/ggml/src/ggml-cuda/argsort.cu +123 -6
  270. data/ext/sources/ggml/src/ggml-cuda/argsort.cuh +16 -0
  271. data/ext/sources/ggml/src/ggml-cuda/binbcast.cu +330 -191
  272. data/ext/sources/ggml/src/ggml-cuda/binbcast.cuh +2 -0
  273. data/ext/sources/ggml/src/ggml-cuda/common.cuh +588 -128
  274. data/ext/sources/ggml/src/ggml-cuda/conv-transpose-1d.cu +1 -4
  275. data/ext/sources/ggml/src/ggml-cuda/conv2d.cu +166 -0
  276. data/ext/sources/ggml/src/ggml-cuda/conv2d.cuh +5 -0
  277. data/ext/sources/ggml/src/ggml-cuda/convert.cu +95 -22
  278. data/ext/sources/ggml/src/ggml-cuda/convert.cuh +25 -0
  279. data/ext/sources/ggml/src/ggml-cuda/cpy-utils.cuh +217 -0
  280. data/ext/sources/ggml/src/ggml-cuda/cpy.cu +335 -485
  281. data/ext/sources/ggml/src/ggml-cuda/cpy.cuh +1 -5
  282. data/ext/sources/ggml/src/ggml-cuda/cross-entropy-loss.cu +2 -14
  283. data/ext/sources/ggml/src/ggml-cuda/cumsum.cu +307 -0
  284. data/ext/sources/ggml/src/ggml-cuda/cumsum.cuh +5 -0
  285. data/ext/sources/ggml/src/ggml-cuda/dequantize.cuh +14 -40
  286. data/ext/sources/ggml/src/ggml-cuda/diag.cu +77 -0
  287. data/ext/sources/ggml/src/ggml-cuda/diag.cuh +5 -0
  288. data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +519 -378
  289. data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +750 -637
  290. data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cu +49 -0
  291. data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cuh +1244 -0
  292. data/ext/sources/ggml/src/ggml-cuda/fattn-vec.cuh +586 -0
  293. data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +98 -61
  294. data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cuh +48 -0
  295. data/ext/sources/ggml/src/ggml-cuda/fattn.cu +230 -197
  296. data/ext/sources/ggml/src/ggml-cuda/fattn.cuh +2 -0
  297. data/ext/sources/ggml/src/ggml-cuda/fill.cu +37 -0
  298. data/ext/sources/ggml/src/ggml-cuda/fill.cuh +3 -0
  299. data/ext/sources/ggml/src/ggml-cuda/getrows.cu +50 -39
  300. data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +1557 -294
  301. data/ext/sources/ggml/src/ggml-cuda/im2col.cu +196 -35
  302. data/ext/sources/ggml/src/ggml-cuda/im2col.cuh +1 -0
  303. data/ext/sources/ggml/src/ggml-cuda/mean.cu +57 -2
  304. data/ext/sources/ggml/src/ggml-cuda/mma.cuh +915 -69
  305. data/ext/sources/ggml/src/ggml-cuda/mmf.cu +171 -0
  306. data/ext/sources/ggml/src/ggml-cuda/mmf.cuh +835 -0
  307. data/ext/sources/ggml/src/ggml-cuda/mmid.cu +164 -0
  308. data/ext/sources/ggml/src/ggml-cuda/mmid.cuh +5 -0
  309. data/ext/sources/ggml/src/ggml-cuda/mmq.cu +109 -67
  310. data/ext/sources/ggml/src/ggml-cuda/mmq.cuh +1601 -733
  311. data/ext/sources/ggml/src/ggml-cuda/mmvf.cu +802 -0
  312. data/ext/sources/ggml/src/ggml-cuda/mmvf.cuh +12 -0
  313. data/ext/sources/ggml/src/ggml-cuda/mmvq.cu +286 -149
  314. data/ext/sources/ggml/src/ggml-cuda/mmvq.cuh +1 -1
  315. data/ext/sources/ggml/src/ggml-cuda/norm.cu +284 -12
  316. data/ext/sources/ggml/src/ggml-cuda/norm.cuh +7 -0
  317. data/ext/sources/ggml/src/ggml-cuda/opt-step-sgd.cu +49 -0
  318. data/ext/sources/ggml/src/ggml-cuda/opt-step-sgd.cuh +5 -0
  319. data/ext/sources/ggml/src/ggml-cuda/pad.cu +86 -32
  320. data/ext/sources/ggml/src/ggml-cuda/pad_reflect_1d.cu +91 -0
  321. data/ext/sources/ggml/src/ggml-cuda/pad_reflect_1d.cuh +5 -0
  322. data/ext/sources/ggml/src/ggml-cuda/quantize.cu +163 -10
  323. data/ext/sources/ggml/src/ggml-cuda/quantize.cuh +14 -0
  324. data/ext/sources/ggml/src/ggml-cuda/reduce_rows.cuh +53 -0
  325. data/ext/sources/ggml/src/ggml-cuda/roll.cu +67 -0
  326. data/ext/sources/ggml/src/ggml-cuda/roll.cuh +5 -0
  327. data/ext/sources/ggml/src/ggml-cuda/rope.cu +207 -98
  328. data/ext/sources/ggml/src/ggml-cuda/rope.cuh +2 -0
  329. data/ext/sources/ggml/src/ggml-cuda/scale.cu +14 -11
  330. data/ext/sources/ggml/src/ggml-cuda/set-rows.cu +330 -0
  331. data/ext/sources/ggml/src/ggml-cuda/set-rows.cuh +7 -0
  332. data/ext/sources/ggml/src/ggml-cuda/set.cu +39 -0
  333. data/ext/sources/ggml/src/ggml-cuda/set.cuh +7 -0
  334. data/ext/sources/ggml/src/ggml-cuda/softcap.cu +34 -0
  335. data/ext/sources/ggml/src/ggml-cuda/softcap.cuh +5 -0
  336. data/ext/sources/ggml/src/ggml-cuda/softmax.cu +325 -61
  337. data/ext/sources/ggml/src/ggml-cuda/solve_tri.cu +275 -0
  338. data/ext/sources/ggml/src/ggml-cuda/solve_tri.cuh +3 -0
  339. data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cu +14 -12
  340. data/ext/sources/ggml/src/ggml-cuda/ssm-scan.cu +291 -104
  341. data/ext/sources/ggml/src/ggml-cuda/sum.cu +6 -10
  342. data/ext/sources/ggml/src/ggml-cuda/sumrows.cu +21 -4
  343. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq112-dv112.cu +5 -0
  344. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq128-dv128.cu +5 -0
  345. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq256-dv256.cu +5 -0
  346. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq40-dv40.cu +5 -0
  347. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq576-dv512.cu +5 -0
  348. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq64-dv64.cu +5 -0
  349. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq72-dv72.cu +5 -0
  350. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq80-dv80.cu +5 -0
  351. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq96-dv96.cu +5 -0
  352. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-f16.cu +7 -0
  353. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q4_0.cu +7 -0
  354. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q4_1.cu +7 -0
  355. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q5_0.cu +7 -0
  356. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q5_1.cu +7 -0
  357. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q8_0.cu +7 -0
  358. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-f16.cu +7 -0
  359. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_0.cu +7 -0
  360. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_1.cu +7 -0
  361. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q5_0.cu +7 -0
  362. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q5_1.cu +7 -0
  363. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q8_0.cu +7 -0
  364. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-f16.cu +7 -0
  365. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q4_0.cu +7 -0
  366. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q4_1.cu +7 -0
  367. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q5_0.cu +7 -0
  368. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q5_1.cu +7 -0
  369. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q8_0.cu +7 -0
  370. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-f16.cu +7 -0
  371. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q4_0.cu +7 -0
  372. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q4_1.cu +7 -0
  373. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q5_0.cu +7 -0
  374. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q5_1.cu +7 -0
  375. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q8_0.cu +7 -0
  376. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-f16.cu +7 -0
  377. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q4_0.cu +7 -0
  378. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q4_1.cu +7 -0
  379. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q5_0.cu +7 -0
  380. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q5_1.cu +7 -0
  381. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q8_0.cu +7 -0
  382. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-f16.cu +7 -0
  383. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q4_0.cu +7 -0
  384. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q4_1.cu +7 -0
  385. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q5_0.cu +7 -0
  386. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q5_1.cu +7 -0
  387. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q8_0.cu +7 -0
  388. data/ext/sources/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +40 -19
  389. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_1.cu +5 -0
  390. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_10.cu +5 -0
  391. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_11.cu +5 -0
  392. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_12.cu +5 -0
  393. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_13.cu +5 -0
  394. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_14.cu +5 -0
  395. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_15.cu +5 -0
  396. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_16.cu +5 -0
  397. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_2.cu +5 -0
  398. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_3.cu +5 -0
  399. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_4.cu +5 -0
  400. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_5.cu +5 -0
  401. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_6.cu +5 -0
  402. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_7.cu +5 -0
  403. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_8.cu +5 -0
  404. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_9.cu +5 -0
  405. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-mxfp4.cu +5 -0
  406. data/ext/sources/ggml/src/ggml-cuda/top-k.cu +96 -0
  407. data/ext/sources/ggml/src/ggml-cuda/top-k.cuh +3 -0
  408. data/ext/sources/ggml/src/ggml-cuda/topk-moe.cu +351 -0
  409. data/ext/sources/ggml/src/ggml-cuda/topk-moe.cuh +21 -0
  410. data/ext/sources/ggml/src/ggml-cuda/tri.cu +136 -0
  411. data/ext/sources/ggml/src/ggml-cuda/tri.cuh +5 -0
  412. data/ext/sources/ggml/src/ggml-cuda/tsembd.cu +3 -3
  413. data/ext/sources/ggml/src/ggml-cuda/unary.cu +189 -5
  414. data/ext/sources/ggml/src/ggml-cuda/unary.cuh +44 -0
  415. data/ext/sources/ggml/src/ggml-cuda/upscale.cu +248 -6
  416. data/ext/sources/ggml/src/ggml-cuda/vecdotq.cuh +110 -22
  417. data/ext/sources/ggml/src/ggml-cuda/vendors/cuda.h +8 -0
  418. data/ext/sources/ggml/src/ggml-cuda/vendors/hip.h +70 -37
  419. data/ext/sources/ggml/src/ggml-cuda/vendors/musa.h +10 -3
  420. data/ext/sources/ggml/src/ggml-hexagon/CMakeLists.txt +80 -0
  421. data/ext/sources/ggml/src/ggml-hexagon/ggml-hexagon.cpp +3151 -0
  422. data/ext/sources/ggml/src/ggml-hexagon/htp/CMakeLists.txt +44 -0
  423. data/ext/sources/ggml/src/ggml-hexagon/htp/act-ops.c +682 -0
  424. data/ext/sources/ggml/src/ggml-hexagon/htp/binary-ops.c +360 -0
  425. data/ext/sources/ggml/src/ggml-hexagon/htp/cmake-toolchain.cmake +157 -0
  426. data/ext/sources/ggml/src/ggml-hexagon/htp/flash-attn-ops.c +566 -0
  427. data/ext/sources/ggml/src/ggml-hexagon/htp/get-rows-ops.c +112 -0
  428. data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ctx.h +35 -0
  429. data/ext/sources/ggml/src/ggml-hexagon/htp/htp-dma.c +63 -0
  430. data/ext/sources/ggml/src/ggml-hexagon/htp/htp-dma.h +157 -0
  431. data/ext/sources/ggml/src/ggml-hexagon/htp/htp-msg.h +165 -0
  432. data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ops.h +92 -0
  433. data/ext/sources/ggml/src/ggml-hexagon/htp/htp_iface.idl +16 -0
  434. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-exp.c +94 -0
  435. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-inverse.c +72 -0
  436. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sigmoid.c +49 -0
  437. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-utils.c +1020 -0
  438. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-utils.h +1353 -0
  439. data/ext/sources/ggml/src/ggml-hexagon/htp/main.c +1001 -0
  440. data/ext/sources/ggml/src/ggml-hexagon/htp/matmul-ops.c +2503 -0
  441. data/ext/sources/ggml/src/ggml-hexagon/htp/ops-utils.h +149 -0
  442. data/ext/sources/ggml/src/ggml-hexagon/htp/rope-ops.c +487 -0
  443. data/ext/sources/ggml/src/ggml-hexagon/htp/set-rows-ops.c +168 -0
  444. data/ext/sources/ggml/src/ggml-hexagon/htp/softmax-ops.c +402 -0
  445. data/ext/sources/ggml/src/ggml-hexagon/htp/unary-ops.c +287 -0
  446. data/ext/sources/ggml/src/ggml-hexagon/htp/worker-pool.c +297 -0
  447. data/ext/sources/ggml/src/ggml-hexagon/htp/worker-pool.h +57 -0
  448. data/ext/sources/ggml/src/ggml-hexagon/htp-utils.c +454 -0
  449. data/ext/sources/ggml/src/ggml-hexagon/htp-utils.h +221 -0
  450. data/ext/sources/ggml/src/ggml-hexagon/op-desc.h +153 -0
  451. data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +16 -13
  452. data/ext/sources/ggml/src/ggml-impl.h +186 -15
  453. data/ext/sources/ggml/src/ggml-metal/CMakeLists.txt +10 -7
  454. data/ext/sources/ggml/src/ggml-metal/ggml-metal-common.cpp +446 -0
  455. data/ext/sources/ggml/src/ggml-metal/ggml-metal-common.h +52 -0
  456. data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.h +33 -0
  457. data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.m +609 -0
  458. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.cpp +1743 -0
  459. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.h +273 -0
  460. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.m +1686 -0
  461. data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +356 -61
  462. data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.cpp +4161 -0
  463. data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.h +94 -0
  464. data/ext/sources/ggml/src/ggml-metal/ggml-metal.cpp +724 -0
  465. data/ext/sources/ggml/src/ggml-metal/ggml-metal.metal +4495 -1876
  466. data/ext/sources/ggml/src/ggml-musa/CMakeLists.txt +21 -9
  467. data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +29 -0
  468. data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +4005 -427
  469. data/ext/sources/ggml/src/ggml-opencl/kernels/add.cl +107 -0
  470. data/ext/sources/ggml/src/ggml-opencl/kernels/add_id.cl +42 -0
  471. data/ext/sources/ggml/src/ggml-opencl/kernels/conv2d.cl +185 -0
  472. data/ext/sources/ggml/src/ggml-opencl/kernels/conv2d_f16_f32.cl +176 -0
  473. data/ext/sources/ggml/src/ggml-opencl/kernels/cvt.cl +147 -0
  474. data/ext/sources/ggml/src/ggml-opencl/kernels/div.cl +66 -0
  475. data/ext/sources/ggml/src/ggml-opencl/kernels/expm1.cl +82 -0
  476. data/ext/sources/ggml/src/ggml-opencl/kernels/fill.cl +17 -0
  477. data/ext/sources/ggml/src/ggml-opencl/kernels/flash_attn_f16.cl +370 -0
  478. data/ext/sources/ggml/src/ggml-opencl/kernels/flash_attn_f32.cl +371 -0
  479. data/ext/sources/ggml/src/ggml-opencl/kernels/flash_attn_f32_f16.cl +373 -0
  480. data/ext/sources/ggml/src/ggml-opencl/kernels/gelu.cl +27 -0
  481. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_mxfp4_f32.cl +162 -0
  482. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_mxfp4_f32.cl +156 -0
  483. data/ext/sources/ggml/src/ggml-opencl/kernels/get_rows.cl +36 -12
  484. data/ext/sources/ggml/src/ggml-opencl/kernels/glu.cl +177 -0
  485. data/ext/sources/ggml/src/ggml-opencl/kernels/group_norm.cl +49 -0
  486. data/ext/sources/ggml/src/ggml-opencl/kernels/im2col_f16.cl +1 -1
  487. data/ext/sources/ggml/src/ggml-opencl/kernels/im2col_f32.cl +1 -1
  488. data/ext/sources/ggml/src/ggml-opencl/kernels/mean.cl +39 -0
  489. data/ext/sources/ggml/src/ggml-opencl/kernels/mul.cl +73 -0
  490. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mat_f16_f32.cl +130 -0
  491. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_kq_kqv.cl +273 -0
  492. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_l4_lm.cl +146 -0
  493. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_f32_f32_l4_lm.cl +147 -0
  494. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_l4_lm.cl +154 -0
  495. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32.cl +189 -0
  496. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32_flat.cl +176 -0
  497. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32.cl +140 -0
  498. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32_flat.cl +222 -0
  499. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32.cl +144 -0
  500. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32_flat.cl +167 -0
  501. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32.cl +125 -0
  502. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32_flat.cl +202 -0
  503. data/ext/sources/ggml/src/ggml-opencl/kernels/norm.cl +80 -0
  504. data/ext/sources/ggml/src/ggml-opencl/kernels/pad.cl +29 -20
  505. data/ext/sources/ggml/src/ggml-opencl/kernels/rms_norm.cl +94 -0
  506. data/ext/sources/ggml/src/ggml-opencl/kernels/rope.cl +50 -24
  507. data/ext/sources/ggml/src/ggml-opencl/kernels/scale.cl +3 -2
  508. data/ext/sources/ggml/src/ggml-opencl/kernels/set_rows.cl +208 -0
  509. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_4_f16.cl +34 -13
  510. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_4_f32.cl +34 -13
  511. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_f16.cl +34 -13
  512. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_f32.cl +34 -13
  513. data/ext/sources/ggml/src/ggml-opencl/kernels/softplus.cl +88 -0
  514. data/ext/sources/ggml/src/ggml-opencl/kernels/sqr.cl +53 -0
  515. data/ext/sources/ggml/src/ggml-opencl/kernels/sqrt.cl +53 -0
  516. data/ext/sources/ggml/src/ggml-opencl/kernels/ssm_conv.cl +77 -0
  517. data/ext/sources/ggml/src/ggml-opencl/kernels/sub.cl +66 -0
  518. data/ext/sources/ggml/src/ggml-opencl/kernels/transpose.cl +33 -0
  519. data/ext/sources/ggml/src/ggml-opencl/kernels/tsembd.cl +2 -2
  520. data/ext/sources/ggml/src/ggml-opencl/kernels/upscale.cl +2 -3
  521. data/ext/sources/ggml/src/ggml-opt.cpp +97 -41
  522. data/ext/sources/ggml/src/ggml-quants.c +111 -16
  523. data/ext/sources/ggml/src/ggml-quants.h +6 -0
  524. data/ext/sources/ggml/src/ggml-rpc/ggml-rpc.cpp +497 -195
  525. data/ext/sources/ggml/src/ggml-sycl/CMakeLists.txt +48 -3
  526. data/ext/sources/ggml/src/ggml-sycl/add-id.cpp +77 -0
  527. data/ext/sources/ggml/src/ggml-sycl/add-id.hpp +8 -0
  528. data/ext/sources/ggml/src/ggml-sycl/backend.hpp +8 -0
  529. data/ext/sources/ggml/src/ggml-sycl/binbcast.cpp +6 -5
  530. data/ext/sources/ggml/src/ggml-sycl/common.hpp +117 -15
  531. data/ext/sources/ggml/src/ggml-sycl/concat.cpp +50 -30
  532. data/ext/sources/ggml/src/ggml-sycl/conv.cpp +10 -4
  533. data/ext/sources/ggml/src/ggml-sycl/convert.cpp +200 -99
  534. data/ext/sources/ggml/src/ggml-sycl/count-equal.cpp +79 -0
  535. data/ext/sources/ggml/src/ggml-sycl/count-equal.hpp +9 -0
  536. data/ext/sources/ggml/src/ggml-sycl/cpy.cpp +72 -309
  537. data/ext/sources/ggml/src/ggml-sycl/cpy.hpp +213 -1
  538. data/ext/sources/ggml/src/ggml-sycl/dequantize.hpp +18 -0
  539. data/ext/sources/ggml/src/ggml-sycl/dmmv.cpp +67 -49
  540. data/ext/sources/ggml/src/ggml-sycl/dpct/helper.hpp +77 -34
  541. data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +397 -314
  542. data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +12 -2
  543. data/ext/sources/ggml/src/ggml-sycl/gemm.hpp +14 -26
  544. data/ext/sources/ggml/src/ggml-sycl/getrows.cpp +9 -6
  545. data/ext/sources/ggml/src/ggml-sycl/ggml-sycl.cpp +643 -413
  546. data/ext/sources/ggml/src/ggml-sycl/gla.cpp +2 -2
  547. data/ext/sources/ggml/src/ggml-sycl/im2col.cpp +2 -2
  548. data/ext/sources/ggml/src/ggml-sycl/mmq.cpp +80 -60
  549. data/ext/sources/ggml/src/ggml-sycl/mmvq.cpp +223 -132
  550. data/ext/sources/ggml/src/ggml-sycl/norm.cpp +230 -55
  551. data/ext/sources/ggml/src/ggml-sycl/norm.hpp +2 -0
  552. data/ext/sources/ggml/src/ggml-sycl/pad.cpp +97 -0
  553. data/ext/sources/ggml/src/ggml-sycl/pad.hpp +24 -0
  554. data/ext/sources/ggml/src/ggml-sycl/pad_reflect_1d.cpp +100 -0
  555. data/ext/sources/ggml/src/ggml-sycl/pad_reflect_1d.hpp +10 -0
  556. data/ext/sources/ggml/src/ggml-sycl/presets.hpp +2 -0
  557. data/ext/sources/ggml/src/ggml-sycl/quantize.hpp +133 -0
  558. data/ext/sources/ggml/src/ggml-sycl/quants.hpp +8 -9
  559. data/ext/sources/ggml/src/ggml-sycl/repeat_back.cpp +76 -0
  560. data/ext/sources/ggml/src/ggml-sycl/repeat_back.hpp +8 -0
  561. data/ext/sources/ggml/src/ggml-sycl/roll.cpp +122 -0
  562. data/ext/sources/ggml/src/ggml-sycl/roll.hpp +20 -0
  563. data/ext/sources/ggml/src/ggml-sycl/rope.cpp +65 -59
  564. data/ext/sources/ggml/src/ggml-sycl/set.cpp +73 -0
  565. data/ext/sources/ggml/src/ggml-sycl/set.hpp +5 -0
  566. data/ext/sources/ggml/src/ggml-sycl/set_rows.cpp +234 -0
  567. data/ext/sources/ggml/src/ggml-sycl/set_rows.hpp +8 -0
  568. data/ext/sources/ggml/src/ggml-sycl/softmax.cpp +330 -165
  569. data/ext/sources/ggml/src/ggml-sycl/softmax.hpp +4 -0
  570. data/ext/sources/ggml/src/ggml-sycl/ssm_conv.cpp +127 -0
  571. data/ext/sources/ggml/src/ggml-sycl/ssm_conv.hpp +5 -0
  572. data/ext/sources/ggml/src/ggml-sycl/tsembd.cpp +12 -6
  573. data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +60 -6
  574. data/ext/sources/ggml/src/ggml-sycl/wkv.cpp +16 -12
  575. data/ext/sources/ggml/src/ggml-vulkan/CMakeLists.txt +38 -18
  576. data/ext/sources/ggml/src/ggml-vulkan/ggml-vulkan.cpp +7398 -2635
  577. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/abs.comp +21 -0
  578. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/acc.comp +2 -2
  579. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add.comp +43 -3
  580. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add1.comp +28 -0
  581. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add_id.comp +42 -0
  582. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/arange.comp +20 -0
  583. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argmax.comp +15 -6
  584. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp +56 -39
  585. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argsort_large.comp +114 -0
  586. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/ceil.comp +22 -0
  587. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/clamp.comp +2 -2
  588. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/concat.comp +2 -2
  589. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/contig_copy.comp +2 -2
  590. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_dw.comp +1 -1
  591. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +347 -0
  592. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +1 -1
  593. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy.comp +2 -2
  594. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +5 -5
  595. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +67 -13
  596. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_transpose.comp +67 -0
  597. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/cos.comp +2 -2
  598. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/count_equal.comp +2 -2
  599. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/count_experts.comp +51 -0
  600. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/cumsum.comp +83 -0
  601. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/cumsum_multipass1.comp +60 -0
  602. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/cumsum_multipass2.comp +66 -0
  603. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_f32.comp +1 -1
  604. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{dequant_funcs.comp → dequant_funcs.glsl} +158 -16
  605. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{dequant_funcs_cm2.comp → dequant_funcs_cm2.glsl} +38 -3
  606. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{dequant_head.comp → dequant_head.glsl} +1 -1
  607. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_m.comp +1 -1
  608. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_s.comp +1 -1
  609. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_s.comp +2 -2
  610. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xs.comp +1 -1
  611. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xxs.comp +3 -2
  612. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_s.comp +7 -6
  613. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_xxs.comp +5 -3
  614. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_nl.comp +1 -1
  615. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_xs.comp +1 -1
  616. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_mxfp4.comp +32 -0
  617. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q2_k.comp +4 -4
  618. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q3_k.comp +2 -2
  619. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_0.comp +1 -1
  620. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_1.comp +1 -1
  621. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_k.comp +4 -4
  622. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_0.comp +1 -1
  623. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_1.comp +1 -1
  624. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_k.comp +4 -4
  625. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q6_k.comp +2 -2
  626. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q8_0.comp +1 -1
  627. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/diag.comp +29 -0
  628. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/diag_mask_inf.comp +1 -1
  629. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/div.comp +2 -2
  630. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/exp.comp +21 -0
  631. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/fill.comp +19 -0
  632. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +103 -36
  633. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.glsl +220 -0
  634. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +139 -45
  635. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +113 -38
  636. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +75 -14
  637. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/floor.comp +22 -0
  638. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu.comp +2 -2
  639. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu_erf.comp +27 -0
  640. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu_quick.comp +11 -0
  641. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gelu.comp +2 -2
  642. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gelu_erf.comp +39 -0
  643. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gelu_quick.comp +2 -2
  644. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{generic_binary_head.comp → generic_binary_head.glsl} +19 -17
  645. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{generic_head.comp → generic_head.glsl} +2 -0
  646. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{generic_unary_head.comp → generic_unary_head.glsl} +7 -0
  647. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/get_rows.comp +21 -12
  648. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/get_rows_quant.comp +28 -18
  649. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{glu_head.comp → glu_head.glsl} +4 -0
  650. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/group_norm.comp +2 -2
  651. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/hardsigmoid.comp +22 -0
  652. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/hardswish.comp +22 -0
  653. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +33 -17
  654. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col_3d.comp +125 -0
  655. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/l2_norm.comp +2 -2
  656. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/leaky_relu.comp +2 -2
  657. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/log.comp +18 -0
  658. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul.comp +2 -2
  659. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp +2 -2
  660. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.glsl +227 -0
  661. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iface.glsl +35 -0
  662. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_m.comp +71 -21
  663. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_s.comp +41 -25
  664. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_s.comp +2 -2
  665. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xs.comp +44 -26
  666. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xxs.comp +2 -2
  667. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_s.comp +2 -2
  668. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_xxs.comp +2 -2
  669. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_nc.comp +20 -14
  670. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_p021.comp +9 -7
  671. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp +4 -6
  672. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q3_k.comp +2 -2
  673. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp +4 -6
  674. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp +4 -6
  675. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp +2 -2
  676. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp +143 -0
  677. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq_funcs.glsl +494 -0
  678. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +144 -556
  679. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +230 -51
  680. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.glsl +566 -0
  681. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_id_funcs.glsl +72 -0
  682. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +90 -223
  683. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.glsl +454 -0
  684. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_shmem_types.glsl +78 -0
  685. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp +195 -0
  686. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/neg.comp +20 -0
  687. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/norm.comp +2 -2
  688. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_adamw.comp +2 -2
  689. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_sgd.comp +22 -0
  690. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/pad.comp +41 -5
  691. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/pool2d.comp +1 -1
  692. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/quantize_q8_1.comp +59 -9
  693. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/reglu.comp +2 -2
  694. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/relu.comp +2 -2
  695. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/repeat.comp +2 -2
  696. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/repeat_back.comp +2 -2
  697. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +104 -14
  698. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_back.comp +2 -2
  699. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_partials.comp +65 -0
  700. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/roll.comp +46 -0
  701. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl +234 -0
  702. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.glsl +20 -0
  703. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +6 -52
  704. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +6 -35
  705. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +6 -35
  706. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl +28 -0
  707. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_vision.comp +6 -39
  708. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/round.comp +29 -0
  709. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rte.glsl +5 -0
  710. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp +3 -3
  711. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sigmoid.comp +2 -2
  712. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/silu.comp +2 -2
  713. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/silu_back.comp +2 -2
  714. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sin.comp +2 -2
  715. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +30 -8
  716. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_back.comp +6 -2
  717. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large1.comp +62 -0
  718. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large2.comp +79 -0
  719. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large3.comp +65 -0
  720. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large_common.glsl +53 -0
  721. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/softplus.comp +23 -0
  722. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/solve_tri.comp +81 -0
  723. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sqrt.comp +17 -0
  724. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/square.comp +2 -2
  725. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/ssm_conv.comp +44 -0
  726. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/ssm_scan.comp +124 -0
  727. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/step.comp +22 -0
  728. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sub.comp +2 -2
  729. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.comp +16 -6
  730. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.glsl +25 -0
  731. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/swiglu.comp +2 -2
  732. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/swiglu_oai.comp +14 -0
  733. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/tanh.comp +2 -2
  734. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/timestep_embedding.comp +5 -4
  735. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/topk_argsort.comp +118 -0
  736. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/topk_moe.comp +213 -0
  737. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/topk_nary_search.comp +246 -0
  738. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/tri.comp +43 -0
  739. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/trunc.comp +22 -0
  740. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{types.comp → types.glsl} +435 -24
  741. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +148 -6
  742. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/utils.glsl +25 -0
  743. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +619 -177
  744. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/xielu.comp +35 -0
  745. data/ext/sources/ggml/src/ggml-webgpu/CMakeLists.txt +80 -0
  746. data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp +169 -0
  747. data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu.cpp +3087 -0
  748. data/ext/sources/ggml/src/ggml-webgpu/pre_wgsl.hpp +778 -0
  749. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/bin_op.tmpl.wgsl +188 -0
  750. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary_head.tmpl +45 -0
  751. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +930 -0
  752. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cpy.tmpl.wgsl +101 -0
  753. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +147 -0
  754. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn.wgsl +591 -0
  755. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/get_rows.tmpl.wgsl +874 -0
  756. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/glu.tmpl.wgsl +323 -0
  757. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl +40 -0
  758. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl +907 -0
  759. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl +97 -0
  760. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.tmpl.wgsl +247 -0
  761. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_subgroup_matrix.tmpl.wgsl +302 -0
  762. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.tmpl.wgsl +267 -0
  763. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm.wgsl +123 -0
  764. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rope.tmpl.wgsl +295 -0
  765. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/scale.tmpl.wgsl +90 -0
  766. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.tmpl.wgsl +112 -0
  767. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +81 -0
  768. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/soft_max.tmpl.wgsl +345 -0
  769. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/unary_op.wgsl +483 -0
  770. data/ext/sources/ggml/src/ggml-zdnn/CMakeLists.txt +36 -0
  771. data/ext/sources/ggml/src/ggml-zdnn/common.hpp +59 -0
  772. data/ext/sources/ggml/src/ggml-zdnn/ggml-zdnn.cpp +628 -0
  773. data/ext/sources/ggml/src/ggml-zdnn/mmf.cpp +80 -0
  774. data/ext/sources/ggml/src/ggml-zdnn/mmf.hpp +12 -0
  775. data/ext/sources/ggml/src/ggml-zdnn/utils.cpp +79 -0
  776. data/ext/sources/ggml/src/ggml-zdnn/utils.hpp +19 -0
  777. data/ext/sources/ggml/src/ggml-zendnn/CMakeLists.txt +92 -0
  778. data/ext/sources/ggml/src/ggml-zendnn/ggml-zendnn.cpp +466 -0
  779. data/ext/sources/ggml/src/ggml.c +901 -129
  780. data/ext/sources/ggml/src/gguf.cpp +8 -1
  781. data/ext/sources/include/whisper.h +1 -0
  782. data/ext/sources/src/CMakeLists.txt +3 -1
  783. data/ext/sources/src/whisper.cpp +124 -81
  784. data/ext/sources/tests/CMakeLists.txt +8 -1
  785. data/ext/sources/tests/test-vad-full.cpp +7 -5
  786. data/ext/sources/tests/test-vad.cpp +3 -3
  787. data/extsources.rb +1 -0
  788. data/lib/whisper/model/uri.rb +17 -18
  789. data/sig/whisper.rbs +126 -2
  790. data/test/test_params.rb +24 -8
  791. data/test/test_segment.rb +0 -1
  792. data/test/test_token.rb +70 -0
  793. data/test/test_vad.rb +1 -1
  794. data/test/test_vad_context.rb +50 -0
  795. data/test/test_vad_segment.rb +19 -0
  796. data/test/test_vad_segments.rb +16 -0
  797. data/test/test_whisper.rb +8 -1
  798. data/whispercpp.gemspec +1 -1
  799. metadata +439 -179
  800. data/ext/sources/build-xcframework.sh +0 -547
  801. data/ext/sources/examples/talk-llama/llama-kv-cache-unified-iswa.cpp +0 -279
  802. data/ext/sources/examples/talk-llama/llama-kv-cache-unified.cpp +0 -1841
  803. data/ext/sources/examples/talk-llama/llama-kv-cache-unified.h +0 -303
  804. data/ext/sources/ggml/include/ggml-kompute.h +0 -50
  805. data/ext/sources/ggml/src/ggml-amx/CMakeLists.txt +0 -107
  806. data/ext/sources/ggml/src/ggml-amx/common.h +0 -94
  807. data/ext/sources/ggml/src/ggml-amx/ggml-amx.cpp +0 -446
  808. data/ext/sources/ggml/src/ggml-amx/mmq.cpp +0 -2510
  809. data/ext/sources/ggml/src/ggml-amx/mmq.h +0 -17
  810. data/ext/sources/ggml/src/ggml-cann/Doxyfile +0 -2579
  811. data/ext/sources/ggml/src/ggml-cann/kernels/CMakeLists.txt +0 -30
  812. data/ext/sources/ggml/src/ggml-cann/kernels/ascendc_kernels.h +0 -19
  813. data/ext/sources/ggml/src/ggml-cann/kernels/dup.cpp +0 -234
  814. data/ext/sources/ggml/src/ggml-cann/kernels/get_row_f16.cpp +0 -197
  815. data/ext/sources/ggml/src/ggml-cann/kernels/get_row_f32.cpp +0 -190
  816. data/ext/sources/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -204
  817. data/ext/sources/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -191
  818. data/ext/sources/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -218
  819. data/ext/sources/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -216
  820. data/ext/sources/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -295
  821. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f16.cu +0 -357
  822. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f16.cuh +0 -3
  823. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f32.cu +0 -365
  824. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f32.cuh +0 -3
  825. data/ext/sources/ggml/src/ggml-cuda/fattn-vec-f16.cuh +0 -482
  826. data/ext/sources/ggml/src/ggml-cuda/fattn-vec-f32.cuh +0 -472
  827. data/ext/sources/ggml/src/ggml-cuda/mmv.cu +0 -506
  828. data/ext/sources/ggml/src/ggml-cuda/mmv.cuh +0 -11
  829. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +0 -5
  830. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +0 -5
  831. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +0 -5
  832. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +0 -5
  833. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +0 -5
  834. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +0 -5
  835. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +0 -5
  836. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +0 -5
  837. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +0 -5
  838. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +0 -5
  839. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +0 -5
  840. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +0 -5
  841. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +0 -5
  842. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +0 -5
  843. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +0 -5
  844. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +0 -5
  845. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +0 -5
  846. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +0 -5
  847. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +0 -5
  848. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +0 -5
  849. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +0 -5
  850. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +0 -5
  851. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +0 -5
  852. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +0 -5
  853. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +0 -5
  854. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +0 -5
  855. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +0 -5
  856. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +0 -5
  857. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +0 -5
  858. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +0 -5
  859. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +0 -5
  860. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +0 -5
  861. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +0 -5
  862. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +0 -5
  863. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +0 -5
  864. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +0 -5
  865. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +0 -5
  866. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +0 -5
  867. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +0 -5
  868. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +0 -5
  869. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +0 -5
  870. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +0 -5
  871. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +0 -5
  872. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +0 -5
  873. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +0 -5
  874. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +0 -5
  875. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +0 -5
  876. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +0 -5
  877. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +0 -5
  878. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +0 -5
  879. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +0 -5
  880. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +0 -5
  881. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +0 -5
  882. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +0 -5
  883. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +0 -5
  884. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +0 -5
  885. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +0 -5
  886. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +0 -5
  887. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +0 -5
  888. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +0 -5
  889. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +0 -5
  890. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +0 -5
  891. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +0 -5
  892. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +0 -5
  893. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +0 -5
  894. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +0 -5
  895. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +0 -5
  896. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +0 -5
  897. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +0 -5
  898. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +0 -5
  899. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +0 -5
  900. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +0 -5
  901. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +0 -5
  902. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +0 -5
  903. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +0 -5
  904. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +0 -5
  905. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +0 -5
  906. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +0 -5
  907. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +0 -5
  908. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +0 -5
  909. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +0 -5
  910. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +0 -5
  911. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +0 -5
  912. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +0 -5
  913. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +0 -5
  914. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +0 -5
  915. data/ext/sources/ggml/src/ggml-kompute/CMakeLists.txt +0 -166
  916. data/ext/sources/ggml/src/ggml-kompute/ggml-kompute.cpp +0 -2251
  917. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/common.comp +0 -112
  918. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_add.comp +0 -58
  919. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_addrow.comp +0 -25
  920. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f16.comp +0 -52
  921. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f32.comp +0 -52
  922. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f16.comp +0 -52
  923. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f32.comp +0 -52
  924. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_diagmask.comp +0 -30
  925. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_gelu.comp +0 -22
  926. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows.comp +0 -17
  927. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f16.comp +0 -31
  928. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f32.comp +0 -31
  929. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_0.comp +0 -38
  930. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_1.comp +0 -39
  931. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q6_k.comp +0 -44
  932. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul.comp +0 -52
  933. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_f16.comp +0 -69
  934. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_mat_f32.comp +0 -51
  935. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_0.comp +0 -33
  936. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_1.comp +0 -35
  937. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_k.comp +0 -140
  938. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q6_k.comp +0 -106
  939. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q8_0.comp +0 -73
  940. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n.comp +0 -52
  941. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n_pre.comp +0 -28
  942. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_norm.comp +0 -84
  943. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_relu.comp +0 -21
  944. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rmsnorm.comp +0 -53
  945. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f16.comp +0 -52
  946. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f32.comp +0 -52
  947. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f16.comp +0 -52
  948. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f32.comp +0 -52
  949. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_scale.comp +0 -19
  950. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_scale_8.comp +0 -23
  951. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_silu.comp +0 -22
  952. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_softmax.comp +0 -72
  953. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/rope_common.comp +0 -71
  954. data/ext/sources/ggml/src/ggml-metal/ggml-metal.m +0 -6280
  955. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp +0 -162
  956. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.comp +0 -118
  957. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.comp +0 -99
  958. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.comp +0 -58
  959. /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{test_bfloat16_support.comp → feature-tests/bfloat16.comp} +0 -0
  960. /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{test_coopmat_support.comp → feature-tests/coopmat.comp} +0 -0
  961. /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{test_coopmat2_support.comp → feature-tests/coopmat2.comp} +0 -0
  962. /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{test_integer_dot_support.comp → feature-tests/integer_dot.comp} +0 -0
  963. /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{glu_main.comp → glu_main.glsl} +0 -0
@@ -23,30 +23,36 @@
23
23
  #ifndef CANN_ACLNN_OPS
24
24
  #define CANN_ACLNN_OPS
25
25
 
26
- #include <functional>
26
+ #include "acl_tensor.h"
27
+ #include "common.h"
28
+
27
29
  #include <aclnnop/aclnn_abs.h>
28
- #include <aclnnop/aclnn_neg.h>
29
- #include <aclnnop/aclnn_exp.h>
30
30
  #include <aclnnop/aclnn_arange.h>
31
31
  #include <aclnnop/aclnn_argsort.h>
32
32
  #include <aclnnop/aclnn_cat.h>
33
33
  #include <aclnnop/aclnn_clamp.h>
34
+ #include <aclnnop/aclnn_cos.h>
35
+ #include <aclnnop/aclnn_exp.h>
34
36
  #include <aclnnop/aclnn_gelu.h>
35
37
  #include <aclnnop/aclnn_gelu_v2.h>
36
- #include <aclnnop/aclnn_sigmoid.h>
37
38
  #include <aclnnop/aclnn_hardsigmoid.h>
38
39
  #include <aclnnop/aclnn_hardswish.h>
39
40
  #include <aclnnop/aclnn_leaky_relu.h>
41
+ #include <aclnnop/aclnn_log.h>
42
+ #include <aclnnop/aclnn_logsoftmax.h>
43
+ #include <aclnnop/aclnn_neg.h>
44
+ #include <aclnnop/aclnn_norm.h>
40
45
  #include <aclnnop/aclnn_relu.h>
46
+ #include <aclnnop/aclnn_sigmoid.h>
47
+ #include <aclnnop/aclnn_sign.h>
41
48
  #include <aclnnop/aclnn_silu.h>
42
- #include <aclnnop/aclnn_tanh.h>
43
- #include <aclnnop/aclnn_sqrt.h>
44
49
  #include <aclnnop/aclnn_sin.h>
45
- #include <aclnnop/aclnn_cos.h>
46
- #include <aclnnop/aclnn_log.h>
47
- #include <aclnnop/aclnn_sign.h>
48
- #include "acl_tensor.h"
49
- #include "common.h"
50
+ #include <aclnnop/aclnn_slice.h>
51
+ #include <aclnnop/aclnn_sqrt.h>
52
+ #include <aclnnop/aclnn_tanh.h>
53
+
54
+ #include <functional>
55
+ #include <unordered_set>
50
56
 
51
57
  /**
52
58
  * @brief Repeats a ggml tensor along each dimension to match the dimensions
@@ -61,7 +67,7 @@
61
67
  * @param dst The ggml tensor representing the destination, which op is
62
68
  * GGML_OP_REPEAT and specifies the desired dimensions.
63
69
  */
64
- void ggml_cann_repeat(ggml_backend_cann_context& ctx, ggml_tensor* dst);
70
+ void ggml_cann_repeat(ggml_backend_cann_context & ctx, ggml_tensor * dst);
65
71
 
66
72
  /**
67
73
  * @brief Applies the Leaky ReLU activation function to a tensor using the CANN
@@ -81,7 +87,7 @@ void ggml_cann_repeat(ggml_backend_cann_context& ctx, ggml_tensor* dst);
81
87
  * @param dst The destination tensor where the result of the Leaky ReLU
82
88
  * activation is stored, which op is `GGML_OP_LEAKY_RELU`
83
89
  */
84
- void ggml_cann_leaky_relu(ggml_backend_cann_context& ctx, ggml_tensor* dst);
90
+ void ggml_cann_leaky_relu(ggml_backend_cann_context & ctx, ggml_tensor * dst);
85
91
 
86
92
  /**
87
93
  * @brief Concatenates multiple tensors along a specified dimension using the
@@ -96,7 +102,7 @@ void ggml_cann_leaky_relu(ggml_backend_cann_context& ctx, ggml_tensor* dst);
96
102
  * @attention tensorList length should be 2 and the dimension used for concat
97
103
  * defaults to 1.
98
104
  */
99
- void ggml_cann_concat(ggml_backend_cann_context& ctx, ggml_tensor* dst);
105
+ void ggml_cann_concat(ggml_backend_cann_context & ctx, ggml_tensor * dst);
100
106
 
101
107
  /**
102
108
  * @brief Generates a sequence of evenly spaced values within a specified
@@ -112,7 +118,7 @@ void ggml_cann_concat(ggml_backend_cann_context& ctx, ggml_tensor* dst);
112
118
  * `start`, 'stop' and 'step' are in dst->op_params and dst->op is
113
119
  * `GGML_OP_ARANGE`.
114
120
  */
115
- void ggml_cann_arange(ggml_backend_cann_context& ctx, ggml_tensor* dst);
121
+ void ggml_cann_arange(ggml_backend_cann_context & ctx, ggml_tensor * dst);
116
122
 
117
123
  /**
118
124
  * @brief Applies a clamp operation to the elements of a ggml tensor using the
@@ -130,7 +136,7 @@ void ggml_cann_arange(ggml_backend_cann_context& ctx, ggml_tensor* dst);
130
136
  * @param dst The destination tensor where the clamped values will be stored.
131
137
  * dst->op is `GGML_OP_CLAMP`, `min` and `max` value is in dst->params.
132
138
  */
133
- void ggml_cann_clamp(ggml_backend_cann_context& ctx, ggml_tensor* dst);
139
+ void ggml_cann_clamp(ggml_backend_cann_context & ctx, ggml_tensor * dst);
134
140
 
135
141
  /**
136
142
  * @brief Scales the elements of a ggml tensor by a constant factor using the
@@ -147,7 +153,7 @@ void ggml_cann_clamp(ggml_backend_cann_context& ctx, ggml_tensor* dst);
147
153
  * @param dst The destination tensor where the scaled values will be stored.
148
154
  * dst->op is `GGML_OP_SCALE` and `scale` value is in dst->params.
149
155
  */
150
- void ggml_cann_scale(ggml_backend_cann_context& ctx, ggml_tensor* dst);
156
+ void ggml_cann_scale(ggml_backend_cann_context & ctx, ggml_tensor * dst);
151
157
 
152
158
  /**
153
159
  * @brief Sorts the elements of a ggml tensor and returns the indices that
@@ -162,7 +168,7 @@ void ggml_cann_scale(ggml_backend_cann_context& ctx, ggml_tensor* dst);
162
168
  * @param dst The destination tensor where the sorted indices will be stored.
163
169
  * dst->op is `GGML_OP_ARGSORT`.
164
170
  */
165
- void ggml_cann_argsort(ggml_backend_cann_context& ctx, ggml_tensor* dst);
171
+ void ggml_cann_argsort(ggml_backend_cann_context & ctx, ggml_tensor * dst);
166
172
 
167
173
  /**
168
174
  * @brief Computes the Layer Normalization for a ggml tensor using the CANN
@@ -184,7 +190,67 @@ void ggml_cann_argsort(ggml_backend_cann_context& ctx, ggml_tensor* dst);
184
190
  * @param dst The destination tensor where the normalized values will be stored.
185
191
  * @attention `Var` defaults to dst->ne[0].
186
192
  */
187
- void ggml_cann_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst);
193
+ void ggml_cann_norm(ggml_backend_cann_context & ctx, ggml_tensor * dst);
194
+
195
+ /**
196
+ * @brief Computes the L2 Normalization for a ggml tensor using the CANN
197
+ * backend.
198
+ *
199
+ * @details This function applies the L2 Normalization operation on the
200
+ * input tensor `src` and stores the result in the destination tensor
201
+ * `dst`. L2 Normalization scales the input tensor such that the
202
+ * L2 norm along the specified dimension equals 1. This operation
203
+ * is commonly used in neural networks for feature normalization
204
+ * and vector scaling.
205
+ * The operation is defined as:
206
+ * \f[
207
+ * \text{out} = \frac{x}{\sqrt{\sum{x^2}}}
208
+ * \f]
209
+ * The normalization is performed along the last dimension by default.
210
+ *
211
+ * @param ctx The CANN context used for operations.
212
+ * @param dst The destination tensor where the normalized values will be stored.
213
+ * @attention The normalization is performed along the last dimension of the
214
+ * input tensor by default.
215
+ */
216
+ void ggml_cann_l2_norm(ggml_backend_cann_context & ctx, ggml_tensor * dst);
217
+
218
+ /**
219
+ * @brief Computes the Cross Entropy Loss for a ggml tensor using the CANN
220
+ * backend.
221
+ *
222
+ * @details This function computes the cross entropy loss between the predicted
223
+ * logits and target probability distributions. The operation follows
224
+ * the same computation pattern as the CPU implementation:
225
+ * 1. Applies log_softmax to the logits along the class dimension
226
+ * 2. Element-wise multiplication with target distributions
227
+ * 3. Summation along the class dimension to get per-sample losses
228
+ * 4. Global summation and scaling by -1/nr to get final loss
229
+ *
230
+ * The computation can be expressed as:
231
+ * \f[
232
+ * \text{loss} = -\frac{1}{N} \sum_{i=1}^{N} \sum_{j=1}^{C} y_{ij} \cdot \log(\text{softmax}(x_{ij}))
233
+ * \f]
234
+ * where \f$N\f$ is the total number of samples, \f$C\f$ is the number
235
+ * of classes, \f$x\f$ are the logits, and \f$y\f$ are the target
236
+ * probability distributions.
237
+ *
238
+ * @param ctx The CANN context used for operations.
239
+ * @param dst The destination tensor where the computed loss will be stored.
240
+ * This should be a scalar tensor containing the final loss value.
241
+ *
242
+ * @note This implementation computes cross entropy between probability
243
+ * distributions, not the typical classification cross entropy that
244
+ * expects class indices as targets. Both input tensors (src0 and src1)
245
+ * should have the same shape and represent probability distributions
246
+ * over the class dimension.
247
+ * @note The function expects two source tensors:
248
+ * - dst->src[0]: Logits tensor (before softmax)
249
+ * - dst->src[1]: Target probability distributions tensor
250
+ * @note The computation is performed using CANN backend operators including
251
+ * LogSoftmax, Mul, ReduceSum, and Muls for the final scaling.
252
+ */
253
+ void ggml_cann_cross_entropy_loss(ggml_backend_cann_context & ctx, ggml_tensor * dst);
188
254
 
189
255
  /**
190
256
  * @brief Computes the Group Normalization for a ggml tensor using the CANN
@@ -208,7 +274,7 @@ void ggml_cann_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst);
208
274
  *
209
275
  * @attention eps defaults to 1e-6f.
210
276
  */
211
- void ggml_cann_group_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst);
277
+ void ggml_cann_group_norm(ggml_backend_cann_context & ctx, ggml_tensor * dst);
212
278
 
213
279
  /**
214
280
  * @brief Computes the accumulation of tensors using the CANN backend.
@@ -227,7 +293,7 @@ void ggml_cann_group_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst);
227
293
  * @param dst The destination tensor where the accumulated values will be stored.
228
294
  * `inplace` is in dst->params, and dst->op is `GGML_OP_ACC`.
229
295
  */
230
- void ggml_cann_acc(ggml_backend_cann_context& ctx, ggml_tensor* dst);
296
+ void ggml_cann_acc(ggml_backend_cann_context & ctx, ggml_tensor * dst);
231
297
 
232
298
  /**
233
299
  * @brief Computes the sum of elements along the last dimension of a ggml tensor
@@ -243,7 +309,7 @@ void ggml_cann_acc(ggml_backend_cann_context& ctx, ggml_tensor* dst);
243
309
  *
244
310
  * @attention `reduce_dims` defaults to 3, which means the last dimension.
245
311
  */
246
- void ggml_cann_sum_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst);
312
+ void ggml_cann_sum_rows(ggml_backend_cann_context & ctx, ggml_tensor * dst);
247
313
 
248
314
  /**
249
315
  * @brief Computes the sum of elements in a ggml tensor.
@@ -257,7 +323,7 @@ void ggml_cann_sum_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst);
257
323
  *
258
324
  */
259
325
 
260
- void ggml_cann_sum(ggml_backend_cann_context& ctx, ggml_tensor* dst);
326
+ void ggml_cann_sum(ggml_backend_cann_context & ctx, ggml_tensor * dst);
261
327
 
262
328
  /**
263
329
  * @brief Upsamples a ggml tensor using nearest neighbor interpolation using
@@ -273,8 +339,7 @@ void ggml_cann_sum(ggml_backend_cann_context& ctx, ggml_tensor* dst);
273
339
  * @param dst The destination tensor where the upsampled values will be stored.
274
340
  * dst->op is `GGML_OP_UPSCALE`.
275
341
  */
276
- void ggml_cann_upsample_nearest2d(ggml_backend_cann_context& ctx,
277
- ggml_tensor* dst);
342
+ void ggml_cann_upsample_nearest2d(ggml_backend_cann_context & ctx, ggml_tensor * dst);
278
343
 
279
344
  /**
280
345
  * @brief Pads a ggml tensor to match the dimensions of the destination tensor
@@ -289,7 +354,7 @@ void ggml_cann_upsample_nearest2d(ggml_backend_cann_context& ctx,
289
354
  * @param dst The destination tensor, which specifies the target dimensions for
290
355
  * padding. dst->op is `GGML_OP_PAD`.
291
356
  */
292
- void ggml_cann_pad(ggml_backend_cann_context& ctx, ggml_tensor* dst);
357
+ void ggml_cann_pad(ggml_backend_cann_context & ctx, ggml_tensor * dst);
293
358
 
294
359
  /**
295
360
  * @brief Executes a 2D pooling operation on a ggml tensor using the CANN
@@ -306,7 +371,7 @@ void ggml_cann_pad(ggml_backend_cann_context& ctx, ggml_tensor* dst);
306
371
  * @param dst The destination tensor on which the pooling operation is to be
307
372
  * performed. dst->op is `GGML_OP_POOL_2D`.
308
373
  */
309
- void ggml_cann_pool2d(ggml_backend_cann_context& ctx, ggml_tensor* dst);
374
+ void ggml_cann_pool2d(ggml_backend_cann_context & ctx, ggml_tensor * dst);
310
375
 
311
376
  /**
312
377
  * @brief Duplicates a ggml tensor using the CANN backend.
@@ -325,7 +390,7 @@ void ggml_cann_pool2d(ggml_backend_cann_context& ctx, ggml_tensor* dst);
325
390
  * different shape and dst is non-contiguous.
326
391
  * @note: This function needs to be simplified.
327
392
  */
328
- void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst);
393
+ void ggml_cann_dup(ggml_backend_cann_context & ctx, ggml_tensor * dst);
329
394
 
330
395
  /**
331
396
  * @brief Computes the Root Mean Square (RMS) normalization of a ggml tensor
@@ -347,7 +412,7 @@ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst);
347
412
  * @param dst The destination tensor where the normalized values will be stored.
348
413
  * dst->op is `GGML_OP_RMS_NORM`.
349
414
  */
350
- void ggml_cann_rms_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst);
415
+ void ggml_cann_rms_norm(ggml_backend_cann_context & ctx, ggml_tensor * dst);
351
416
 
352
417
  /**
353
418
  * @brief Applies a diagonal mask to the tensor with a specified value.
@@ -362,7 +427,7 @@ void ggml_cann_rms_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst);
362
427
  * `GGML_OP_DIAG_MASK`
363
428
  * @param value The value to use for masking.
364
429
  */
365
- void ggml_cann_diag_mask(ggml_backend_cann_context& ctx, ggml_tensor* dst, float value);
430
+ void ggml_cann_diag_mask(ggml_backend_cann_context & ctx, ggml_tensor * dst, float value);
366
431
 
367
432
  /**
368
433
  * @brief Performs an image-to-column transformation on the input tensor.
@@ -377,7 +442,7 @@ void ggml_cann_diag_mask(ggml_backend_cann_context& ctx, ggml_tensor* dst, float
377
442
  * @param dst The destination tensor that stores the result of the operation.
378
443
  * dst->op is `GGML_OP_IM2COL`.
379
444
  */
380
- void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst);
445
+ void ggml_cann_im2col(ggml_backend_cann_context & ctx, ggml_tensor * dst);
381
446
 
382
447
  /**
383
448
  * @brief Computes time step embeddings using sine and cosine functions.
@@ -391,10 +456,10 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst);
391
456
  * @param dst The destination tensor where the result of the embedding operation
392
457
  * will be stored. dst->op is `GGML_OP_TIMESTEP_EMBEDDING`.
393
458
  */
394
- void ggml_cann_timestep_embedding(ggml_backend_cann_context& ctx, ggml_tensor* dst);
459
+ void ggml_cann_timestep_embedding(ggml_backend_cann_context & ctx, ggml_tensor * dst);
395
460
 
396
461
  // @see ggml_cann_dup.
397
- void ggml_cann_cpy(ggml_backend_cann_context& ctx, ggml_tensor* dst);
462
+ void ggml_cann_cpy(ggml_backend_cann_context & ctx, ggml_tensor * dst);
398
463
 
399
464
  /**
400
465
  * @brief Computes the softmax activation with optional masking.
@@ -416,21 +481,31 @@ void ggml_cann_cpy(ggml_backend_cann_context& ctx, ggml_tensor* dst);
416
481
  * @param dst The destination tensor where the result will be stored. dst->op is
417
482
  * `GGML_OP_SOFTMAX`.
418
483
  */
419
- void ggml_cann_softmax(ggml_backend_cann_context& ctx, ggml_tensor* dst);
484
+ void ggml_cann_softmax(ggml_backend_cann_context & ctx, ggml_tensor * dst);
420
485
 
421
486
  /**
422
487
  * @brief Extracts specific rows from a tensor based on indices.
423
488
  *
424
489
  * @details This function retrieves rows from a source tensor src0 according to
425
490
  * the indices provided in another tensor src1 and stores the result in
426
- * a destination tensor (\p dst). It supports different data types
427
- * including F32, F16, Q4_0, and Q8_0.
491
+ * a destination tensor (\p dst).
428
492
  *
429
493
  * @param ctx The backend CANN context for executing operations.
430
494
  * @param dst The destination tensor where the extracted rows will be stored.
431
- * dst->op is `GGML_OP_GET_ROWS`.
432
495
  */
433
- void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst);
496
+ void ggml_cann_get_rows(ggml_backend_cann_context & ctx, ggml_tensor * dst);
497
+
498
+ /**
499
+ * @brief Writes specific rows into a tensor at positions specified by indices.
500
+ *
501
+ * @details This function copies rows from a source tensor into a destination
502
+ * tensor (\p dst) at the positions indicated by the indices in another
503
+ * tensor.
504
+ *
505
+ * @param ctx The backend CANN context for executing operations.
506
+ * @param dst The destination tensor where the specified rows will be updated.
507
+ */
508
+ void ggml_cann_set_rows(ggml_backend_cann_context & ctx, ggml_tensor * dst);
434
509
 
435
510
  /**
436
511
  * @brief Executes matrix multiplication for the given tensor.
@@ -443,7 +518,7 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst);
443
518
  * @param dst The destination tensor for storing the result of the matrix
444
519
  * multiplication. dst->op is `GGML_OP_MUL_MAT`.
445
520
  */
446
- void ggml_cann_mul_mat(ggml_backend_cann_context& ctx, ggml_tensor* dst);
521
+ void ggml_cann_mul_mat(ggml_backend_cann_context & ctx, ggml_tensor * dst);
447
522
 
448
523
  /**
449
524
  * @brief Applies Rotary Positional Embedding (RoPE) to the input tensor.
@@ -466,7 +541,7 @@ void ggml_cann_mul_mat(ggml_backend_cann_context& ctx, ggml_tensor* dst);
466
541
  * @note The function currently does not support cases where the freq_scale is
467
542
  * not equal to 1.
468
543
  */
469
- void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst);
544
+ void ggml_cann_rope(ggml_backend_cann_context & ctx, ggml_tensor * dst);
470
545
 
471
546
  /**
472
547
  * @brief Computes the index of the maximum value along the specified dimension
@@ -481,7 +556,7 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst);
481
556
  * @param dst The destination tensor where the indices of the maximum values will
482
557
  * be stored. dst->op is `GGML_OP_ARGMAX`.
483
558
  */
484
- void ggml_cann_argmax(ggml_backend_cann_context& ctx, ggml_tensor* dst);
559
+ void ggml_cann_argmax(ggml_backend_cann_context & ctx, ggml_tensor * dst);
485
560
 
486
561
  /**
487
562
  * @brief Adds two tensors element-wise and stores the result in a destination
@@ -498,8 +573,10 @@ void ggml_cann_argmax(ggml_backend_cann_context& ctx, ggml_tensor* dst);
498
573
  * @param acl_src1 The second source tensor.
499
574
  * @param acl_dst The destination tensor where the result will be stored.
500
575
  */
501
- void aclnn_add(ggml_backend_cann_context& ctx, aclTensor* acl_src0,
502
- aclTensor* acl_src1, aclTensor* acl_dst = nullptr);
576
+ void aclnn_add(ggml_backend_cann_context & ctx,
577
+ aclTensor * acl_src0,
578
+ aclTensor * acl_src1,
579
+ aclTensor * acl_dst = nullptr);
503
580
 
504
581
  /**
505
582
  * @brief Subtracts two tensors element-wise and stores the result in a destination
@@ -516,8 +593,10 @@ void aclnn_add(ggml_backend_cann_context& ctx, aclTensor* acl_src0,
516
593
  * @param acl_src1 The second source tensor.
517
594
  * @param acl_dst The destination tensor where the result will be stored.
518
595
  */
519
- void aclnn_sub(ggml_backend_cann_context& ctx, aclTensor* acl_src0,
520
- aclTensor* acl_src1, aclTensor* acl_dst = nullptr);
596
+ void aclnn_sub(ggml_backend_cann_context & ctx,
597
+ aclTensor * acl_src0,
598
+ aclTensor * acl_src1,
599
+ aclTensor * acl_dst = nullptr);
521
600
 
522
601
  /**
523
602
  * @brief Performs element-wise multiplication of two tensors and stores the
@@ -535,8 +614,10 @@ void aclnn_sub(ggml_backend_cann_context& ctx, aclTensor* acl_src0,
535
614
  * @param acl_other The second tensor for element-wise multiplication.
536
615
  * @param acl_dst The destination tensor where the result will be stored.
537
616
  */
538
- void aclnn_mul(ggml_backend_cann_context& ctx, aclTensor* acl_src,
539
- aclTensor* acl_other, aclTensor* acl_dst = nullptr);
617
+ void aclnn_mul(ggml_backend_cann_context & ctx,
618
+ aclTensor * acl_src,
619
+ aclTensor * acl_other,
620
+ aclTensor * acl_dst = nullptr);
540
621
 
541
622
  /**
542
623
  * @brief Matrix division, optionally in-place.
@@ -556,8 +637,10 @@ void aclnn_mul(ggml_backend_cann_context& ctx, aclTensor* acl_src,
556
637
  * @param inplace Flag indicating whether to perform the operation in-place on
557
638
  * `acl_src`.
558
639
  */
559
- void aclnn_div(ggml_backend_cann_context& ctx, aclTensor* acl_src,
560
- aclTensor* acl_other, aclTensor* acl_dst = nullptr);
640
+ void aclnn_div(ggml_backend_cann_context & ctx,
641
+ aclTensor * acl_src,
642
+ aclTensor * acl_other,
643
+ aclTensor * acl_dst = nullptr);
561
644
 
562
645
  /**
563
646
  * @brief Applies element-wise cosine function to the elements of a tensor.
@@ -573,8 +656,7 @@ void aclnn_div(ggml_backend_cann_context& ctx, aclTensor* acl_src,
573
656
  * @param acl_dst The destination tensor where the cosine results will be
574
657
  * stored.
575
658
  */
576
- void aclnn_cos(ggml_backend_cann_context& ctx, aclTensor* acl_src,
577
- aclTensor* acl_dst);
659
+ void aclnn_cos(ggml_backend_cann_context & ctx, aclTensor * acl_src, aclTensor * acl_dst);
578
660
 
579
661
  /**
580
662
  * @brief Applies element-wise sine function to the elements of a tensor.
@@ -591,8 +673,7 @@ void aclnn_cos(ggml_backend_cann_context& ctx, aclTensor* acl_src,
591
673
  * @param acl_src The source tensor on which the sine function will be applied.
592
674
  * @param acl_dst The destination tensor where the sine results will be stored.
593
675
  */
594
- void aclnn_sin(ggml_backend_cann_context& ctx, aclTensor* acl_src,
595
- aclTensor* acl_dst);
676
+ void aclnn_sin(ggml_backend_cann_context & ctx, aclTensor * acl_src, aclTensor * acl_dst);
596
677
 
597
678
  /**
598
679
  * @brief Prepares broadcast-compatible ACL tensors for two input tensors and one
@@ -610,8 +691,12 @@ void aclnn_sin(ggml_backend_cann_context& ctx, aclTensor* acl_src,
610
691
  * @param acl_src1 Output pointer to the created ACL tensor corresponding to src1.
611
692
  * @param acl_dst Output pointer to the created ACL tensor corresponding to dst.
612
693
  */
613
- void bcast_shape(ggml_tensor * src0, ggml_tensor * src1, ggml_tensor * dst,
614
- aclTensor ** acl_src0, aclTensor ** acl_src1, aclTensor ** acl_dst);
694
+ void bcast_shape(ggml_tensor * src0,
695
+ ggml_tensor * src1,
696
+ ggml_tensor * dst,
697
+ acl_tensor_ptr & acl_src0,
698
+ acl_tensor_ptr & acl_src1,
699
+ acl_tensor_ptr & acl_dst);
615
700
 
616
701
  /**
617
702
  * @brief Computes the 1D transposed convolution (deconvolution) of a ggml
@@ -626,7 +711,7 @@ void bcast_shape(ggml_tensor * src0, ggml_tensor * src1, ggml_tensor * dst,
626
711
  * @param dst The destination tensor where the transposed convolution result
627
712
  * will be stored. dst->op is `GGML_OP_CONV_TRANSPOSE_1D`.
628
713
  */
629
- void ggml_cann_conv_transpose_1d(ggml_backend_cann_context& ctx, ggml_tensor* dst);
714
+ void ggml_cann_conv_transpose_1d(ggml_backend_cann_context & ctx, ggml_tensor * dst);
630
715
 
631
716
  /**
632
717
  * @brief Applies the ELU (Exponential Linear Unit) activation to a ggml tensor
@@ -651,7 +736,7 @@ void ggml_cann_conv_transpose_1d(ggml_backend_cann_context& ctx, ggml_tensor* ds
651
736
  * @param dst The destination tensor where the ELU-activated result will be stored.
652
737
  * dst->op is expected to be `GGML_OP_ELU`.
653
738
  */
654
- void ggml_cann_elu(ggml_backend_cann_context& ctx, ggml_tensor* dst);
739
+ void ggml_cann_elu(ggml_backend_cann_context & ctx, ggml_tensor * dst);
655
740
 
656
741
  /**
657
742
  * @brief Computes the mean of a ggml tensor element-wise using the CANN backend.
@@ -666,7 +751,7 @@ void ggml_cann_elu(ggml_backend_cann_context& ctx, ggml_tensor* dst);
666
751
  * @param dst The destination tensor where the mean result will be stored.
667
752
  * dst->op is expected to be `GGML_OP_MEAN`.
668
753
  */
669
- void ggml_cann_mean(ggml_backend_cann_context& ctx, ggml_tensor* dst);
754
+ void ggml_cann_mean(ggml_backend_cann_context & ctx, ggml_tensor * dst);
670
755
 
671
756
  /**
672
757
  * @brief Applies 1D reflect padding to a ggml tensor using the CANN backend.
@@ -681,7 +766,7 @@ void ggml_cann_mean(ggml_backend_cann_context& ctx, ggml_tensor* dst);
681
766
  * @param dst The destination tensor where the padded result will be stored.
682
767
  * dst->op is expected to be `GGML_OP_PAD_REFLECT_1D`.
683
768
  */
684
- void ggml_cann_pad_reflect_1d(ggml_backend_cann_context& ctx, ggml_tensor* dst);
769
+ void ggml_cann_pad_reflect_1d(ggml_backend_cann_context & ctx, ggml_tensor * dst);
685
770
 
686
771
  /**
687
772
  * @brief Counts the number of equal elements in two ggml tensors using the CANN backend.
@@ -697,7 +782,7 @@ void ggml_cann_pad_reflect_1d(ggml_backend_cann_context& ctx, ggml_tensor* dst);
697
782
  * @param dst The destination tensor where the result will be stored.
698
783
  * dst->op is expected to be `GGML_OP_COUNT_EQUAL`.
699
784
  */
700
- void ggml_cann_count_equal(ggml_backend_cann_context& ctx, ggml_tensor* dst);
785
+ void ggml_cann_count_equal(ggml_backend_cann_context & ctx, ggml_tensor * dst);
701
786
 
702
787
  /**
703
788
  * @brief Applies the Step activation function to a ggml tensor using the CANN backend.
@@ -712,7 +797,7 @@ void ggml_cann_count_equal(ggml_backend_cann_context& ctx, ggml_tensor* dst);
712
797
  * @param dst The destination tensor where the result will be stored.
713
798
  * dst->op is expected to be `GGML_OP_STEP`.
714
799
  */
715
- void ggml_cann_step(ggml_backend_cann_context& ctx, ggml_tensor* dst);
800
+ void ggml_cann_step(ggml_backend_cann_context & ctx, ggml_tensor * dst);
716
801
 
717
802
  /**
718
803
  * @brief Performs the Flash Attention extended operator using the CANN backend.
@@ -727,59 +812,46 @@ void ggml_cann_step(ggml_backend_cann_context& ctx, ggml_tensor* dst);
727
812
  * @param dst The destination tensor where the result will be stored.
728
813
  * dst->op is expected to be `GGML_OP_FLASH_ATTN_EXT`.
729
814
  */
730
- void ggml_cann_flash_attn_ext(ggml_backend_cann_context& ctx, ggml_tensor* dst);
815
+ void ggml_cann_flash_attn_ext(ggml_backend_cann_context & ctx, ggml_tensor * dst);
731
816
 
732
817
  /**
733
818
  * @brief A generic wrapper for ACL resources with custom deleter support.
734
819
  */
735
- using any_acl_resource = std::unique_ptr<void, std::function<void(void*)>>;
820
+ using any_acl_resource = std::unique_ptr<void, std::function<void(void *)>>;
736
821
 
737
822
  /**
738
823
  * @brief Trait structure used to define how to destroy a given ACL resource type.
739
824
  *
740
825
  * @tparam T ACL resource type.
741
826
  */
742
- template<typename T>
743
- struct acl_resource_traits;
827
+ template <typename T> struct acl_resource_traits;
744
828
 
745
829
  /**
746
830
  * @brief Specialization for aclTensor, defines how to destroy an aclTensor resource.
747
831
  */
748
- template<>
749
- struct acl_resource_traits<aclTensor> {
750
- static void destroy(void* p) {
751
- ACL_CHECK(aclDestroyTensor(static_cast<aclTensor*>(p)));
752
- }
832
+ template <> struct acl_resource_traits<aclTensor> {
833
+ static void destroy(void * p) { ACL_CHECK(aclDestroyTensor(static_cast<aclTensor *>(p))); }
753
834
  };
754
835
 
755
836
  /**
756
837
  * @brief Specialization for aclIntArray, defines how to destroy an aclIntArray resource.
757
838
  */
758
- template<>
759
- struct acl_resource_traits<aclIntArray> {
760
- static void destroy(void* p) {
761
- ACL_CHECK(aclDestroyIntArray(static_cast<aclIntArray*>(p)));
762
- }
839
+ template <> struct acl_resource_traits<aclIntArray> {
840
+ static void destroy(void * p) { ACL_CHECK(aclDestroyIntArray(static_cast<aclIntArray *>(p))); }
763
841
  };
764
842
 
765
843
  /**
766
844
  * @brief Specialization for aclScalar, defines how to destroy an aclScalar resource.
767
845
  */
768
- template<>
769
- struct acl_resource_traits<aclScalar> {
770
- static void destroy(void* p) {
771
- ACL_CHECK(aclDestroyScalar(static_cast<aclScalar*>(p)));
772
- }
846
+ template <> struct acl_resource_traits<aclScalar> {
847
+ static void destroy(void * p) { ACL_CHECK(aclDestroyScalar(static_cast<aclScalar *>(p))); }
773
848
  };
774
849
 
775
850
  /**
776
851
  * @brief Specialization for aclTensorList, defines how to destroy an aclTensorList resource.
777
852
  */
778
- template<>
779
- struct acl_resource_traits<aclTensorList> {
780
- static void destroy(void* p) {
781
- ACL_CHECK(aclDestroyTensorList(static_cast<aclTensorList*>(p)));
782
- }
853
+ template <> struct acl_resource_traits<aclTensorList> {
854
+ static void destroy(void * p) { ACL_CHECK(aclDestroyTensorList(static_cast<aclTensorList *>(p))); }
783
855
  };
784
856
 
785
857
  /**
@@ -789,14 +861,8 @@ struct acl_resource_traits<aclTensorList> {
789
861
  * @param ptr Raw pointer to ACL resource.
790
862
  * @return any_acl_resource Smart pointer that handles destruction.
791
863
  */
792
- template<typename T>
793
- any_acl_resource make_acl_resource(T* ptr) {
794
- return any_acl_resource(
795
- static_cast<void*>(ptr),
796
- [](void* p) {
797
- acl_resource_traits<T>::destroy(p);
798
- }
799
- );
864
+ template <typename T> any_acl_resource make_acl_resource(T * ptr) {
865
+ return any_acl_resource(static_cast<void *>(ptr), [](void * p) { acl_resource_traits<T>::destroy(p); });
800
866
  }
801
867
 
802
868
  /**
@@ -806,89 +872,10 @@ any_acl_resource make_acl_resource(T* ptr) {
806
872
  * @param vec Target vector to hold ACL resources.
807
873
  * @param args Raw pointers to ACL resources.
808
874
  */
809
- template<typename... Args>
810
- void register_acl_resources(std::vector<any_acl_resource>& vec, Args*... args) {
875
+ template <typename... Args> void register_acl_resources(std::vector<any_acl_resource> & vec, Args *... args) {
811
876
  (vec.emplace_back(make_acl_resource(args)), ...);
812
877
  }
813
878
 
814
- /**
815
- * @brief Task class that wraps the execution of an aclnn function call.
816
- */
817
- class aclnn_task : public cann_task {
818
- public:
819
- aclnn_task(aclnn_func_t aclnn_func, void * workspace_addr,
820
- uint64_t workspace_size, aclOpExecutor * executor,
821
- aclrtStream stream) :
822
- aclnn_func_(aclnn_func),
823
- workspace_addr_(workspace_addr),
824
- workspace_size_(workspace_size),
825
- executor_(executor),
826
- stream_(stream) {}
827
- virtual void run_task() override {
828
- ACL_CHECK(aclnn_func_(workspace_addr_, workspace_size_, executor_, stream_));
829
- }
830
- private:
831
- aclnn_func_t aclnn_func_;
832
- void * workspace_addr_;
833
- uint64_t workspace_size_;
834
- aclOpExecutor * executor_;
835
- aclrtStream stream_;
836
- };
837
-
838
- /**
839
- * @brief Task class that releases ACL resources after usage.
840
- */
841
- class release_resource_task : public cann_task {
842
- public:
843
- release_resource_task(std::vector<any_acl_resource>&& resources){
844
- resource_ = std::move(resources);
845
- }
846
-
847
- virtual void run_task() override {
848
- resource_.clear();
849
- }
850
- private:
851
- std::vector<any_acl_resource> resource_;
852
- };
853
-
854
- /**
855
- * @brief Task class for performing asynchronous memory copy operations.
856
- */
857
- class async_memcpy_task : public cann_task {
858
- public:
859
- async_memcpy_task(void* dst, const void* src, size_t size,
860
- aclrtMemcpyKind kind, aclrtStream stream)
861
- : dst_(dst), src_(src), size_(size), kind_(kind), stream_(stream) {}
862
-
863
- virtual void run_task() override {
864
- ACL_CHECK(aclrtMemcpyAsync(dst_, size_, src_, size_, kind_, stream_));
865
- }
866
- private:
867
- void* dst_;
868
- const void* src_;
869
- size_t size_;
870
- aclrtMemcpyKind kind_;
871
- aclrtStream stream_;
872
- };
873
-
874
- /**
875
- * @brief Task class for performing asynchronous memory set operations.
876
- */
877
- class async_memset_task : public cann_task {
878
- public:
879
- async_memset_task(void* buffer, size_t size, int32_t value, aclrtStream stream)
880
- : buffer_(buffer), size_(size), value_(value), stream_(stream) {}
881
-
882
- virtual void run_task() override {
883
- ACL_CHECK(aclrtMemsetAsync(buffer_, size_, value_, size_, stream_));
884
- }
885
- private:
886
- void* buffer_;
887
- size_t size_;
888
- int32_t value_;
889
- aclrtStream stream_;
890
- };
891
-
892
879
  /**
893
880
  * @brief Launches an asynchronous task using the memory allocator.
894
881
  *
@@ -907,92 +894,20 @@ class async_memset_task : public cann_task {
907
894
  * same stream are executed in queue order.
908
895
  */
909
896
 
910
- #define GGML_CANN_CALL_ACLNN_OP(CTX, OP_NAME, ...) \
911
- do { \
912
- uint64_t workspaceSize = 0; \
913
- aclOpExecutor * executor; \
914
- void * workspaceAddr = nullptr; \
915
- ACL_CHECK(aclnn##OP_NAME##GetWorkspaceSize(__VA_ARGS__, &workspaceSize, &executor));\
916
- /* workspace should alloced in main thread to keep malloc order when using vmm. */ \
917
- if (workspaceSize > 0) { \
918
- ggml_cann_pool_alloc workspace_allocator(CTX.pool(), workspaceSize); \
919
- workspaceAddr = workspace_allocator.get(); \
920
- } \
921
- if (CTX.async_mode) { \
922
- auto task = \
923
- std::make_unique<aclnn_task>(aclnn##OP_NAME, workspaceAddr, workspaceSize, \
924
- executor, CTX.stream()); \
925
- CTX.task_queue.submit_task(std::move(task)); \
926
- } else { \
927
- ACL_CHECK(aclnn##OP_NAME(workspaceAddr, workspaceSize, executor, CTX.stream()));\
928
- } \
897
+ #define GGML_CANN_CALL_ACLNN_OP(CTX, OP_NAME, ...) \
898
+ do { \
899
+ uint64_t workspaceSize = 0; \
900
+ aclOpExecutor * executor; \
901
+ void * workspaceAddr = nullptr; \
902
+ ACL_CHECK(aclnn##OP_NAME##GetWorkspaceSize(__VA_ARGS__, &workspaceSize, &executor)); \
903
+ /* workspace should be allocated in the main thread to keep malloc order when using vmm. */ \
904
+ if (workspaceSize > 0) { \
905
+ ggml_cann_pool_alloc workspace_allocator(CTX.pool(), workspaceSize); \
906
+ workspaceAddr = workspace_allocator.get(); \
907
+ } \
908
+ ACL_CHECK(aclnn##OP_NAME(workspaceAddr, workspaceSize, executor, CTX.stream())); \
929
909
  } while (0)
930
910
 
931
- /**
932
- * @brief Registers and releases multiple ACL resources, optionally deferring the release
933
- * using a task.
934
- *
935
- * @tparam Args Types of the ACL resources.
936
- * @param ctx Backend context which manages task submission and async mode.
937
- * @param args Pointers to ACL resources to be released.
938
- */
939
- template <typename... Args>
940
- void ggml_cann_release_resources(ggml_backend_cann_context & ctx, Args &&... args) {
941
- std::vector<any_acl_resource> resources;
942
- register_acl_resources(resources, std::forward<Args>(args)...);
943
- if(ctx.async_mode) {
944
- auto task = std::make_unique<release_resource_task>(std::move(resources));
945
- ctx.task_queue.submit_task(std::move(task));
946
- }
947
- }
948
-
949
- /**
950
- * @brief Performs an asynchronous memory copy operation, optionally deferred via task submission.
951
- *
952
- * @param ctx Backend context containing stream and async configuration.
953
- * @param dst Destination memory address.
954
- * @param src Source memory address.
955
- * @param len Size of memory to copy (in bytes).
956
- * @param kind Type of memory copy (host-to-device, device-to-host, etc).
957
- */
958
- inline void ggml_cann_async_memcpy(ggml_backend_cann_context & ctx, void * dst,
959
- const void * src, size_t len, aclrtMemcpyKind kind) {
960
- if (ctx.async_mode) {
961
- auto task = std::make_unique<async_memcpy_task>(dst, const_cast<void *>(src), len, kind, ctx.stream());
962
- ctx.task_queue.submit_task(std::move(task));
963
- } else {
964
- ACL_CHECK(aclrtMemcpyAsync(dst, len, src, len, kind, ctx.stream()));
965
- }
966
- }
967
-
968
- inline void ggml_cann_async_memcpy(ggml_backend_cann_context * ctx, void * dst,
969
- const void * src, size_t len, aclrtMemcpyKind kind) {
970
- if (ctx->async_mode) {
971
- auto task = std::make_unique<async_memcpy_task>(dst, const_cast<void *>(src), len, kind, ctx->stream());
972
- ctx->task_queue.submit_task(std::move(task));
973
- } else {
974
- ACL_CHECK(aclrtMemcpyAsync(dst, len, src, len, kind, ctx->stream()));
975
- }
976
- }
977
-
978
- /**
979
- * @brief Performs an asynchronous memory set operation, optionally deferred via task submission.
980
- *
981
- * @param ctx Backend context containing stream and async configuration.
982
- * @param buffer Memory buffer to be set.
983
- * @param size Size of the memory buffer (in bytes).
984
- * @param value Value to set in the buffer.
985
- */
986
- inline void ggml_cann_async_memset(ggml_backend_cann_context & ctx, void * buffer,
987
- size_t size, int value) {
988
- if (ctx.async_mode) {
989
- auto task = std::make_unique<async_memset_task>(buffer, size, value, ctx.stream());
990
- ctx.task_queue.submit_task(std::move(task));
991
- } else {
992
- ACL_CHECK(aclrtMemsetAsync(buffer, size, value, size, ctx.stream()));
993
- }
994
- }
995
-
996
911
  /**
997
912
  * @brief Performs sparse expert-based matrix multiplication using the CANN backend.
998
913
  *
@@ -1018,7 +933,46 @@ inline void ggml_cann_async_memset(ggml_backend_cann_context & ctx, void * buffe
1018
933
  * @param dst The destination tensor where the expert-weighted token outputs are stored.
1019
934
  * Expected to be of shape [M, K, N, 1].
1020
935
  */
1021
- void ggml_cann_mul_mat_id(ggml_backend_cann_context& ctx, ggml_tensor* dst);
936
+ void ggml_cann_mul_mat_id(ggml_backend_cann_context & ctx, ggml_tensor * dst);
937
+
938
+ /**
939
+ * @brief Performs fused ADD + RMS_NORM operation using the CANN backend.
940
+ *
941
+ * This function fuses the ADD and RMS_NORM operations into a single kernel call
942
+ * for better performance. It first adds two input tensors (x1 + x2), then applies
943
+ * RMS normalization to the result.
944
+ *
945
+ * @param ctx The context for the CANN backend operations.
946
+ * @param add_node The ADD operation node, contains the two input tensors to be added.
947
+ * @param rms_norm_node The RMS_NORM operation node, contains the gamma weights
948
+ * and epsilon parameter.
949
+ */
950
+ void ggml_cann_op_add_rms_norm_fused(ggml_backend_cann_context & ctx, ggml_tensor * add_node, ggml_tensor * rms_norm_node);
951
+
952
+ /**
953
+ * @brief Check whether a tensor is a weight tensor for matrix multiplication.
954
+ *
955
+ * @details Checks whether the given tensor serves as weight parameters in matrix multiplication operations,
956
+ * typically within neural network layers. The function maintains a static set of canonical weight
957
+ * naming suffixes from Transformer-based architectures. Uses substring matching to identify weight
958
+ * tensors even with hierarchical naming patterns.
959
+ *
960
+ * @param tensor Pointer to the target ggml_tensor object (const-qualified).
961
+ */
962
+ static bool is_matmul_weight(const ggml_tensor * tensor) {
963
+ std::string name = ggml_get_name(tensor);
964
+ static const std::unordered_set<std::string> weight_suffixes{ "output.weight", "attn_q.weight",
965
+ "attn_k.weight", "attn_v.weight",
966
+ "attn_output.weight", "ffn_gate.weight",
967
+ "ffn_up.weight", "ffn_down.weight" };
968
+
969
+ for (const auto & suffix : weight_suffixes) {
970
+ if (name.find(suffix) != std::string::npos) {
971
+ return true;
972
+ }
973
+ }
974
+ return false;
975
+ }
1022
976
 
1023
977
  /**
1024
978
  * @brief Applies an element-wise operation to two input tensors using the CANN
@@ -1036,23 +990,17 @@ void ggml_cann_mul_mat_id(ggml_backend_cann_context& ctx, ggml_tensor* dst);
1036
990
  * @param ctx The CANN backend context used to manage execution and resources.
1037
991
  * @param dst The destination tensor.
1038
992
  */
1039
- template <auto binary_op>
1040
- void ggml_cann_binary_op(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
1041
- ggml_tensor* src0 = dst->src[0];
1042
- ggml_tensor* src1 = dst->src[1];
993
+ template <auto binary_op> void ggml_cann_binary_op(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
994
+ ggml_tensor * src0 = dst->src[0];
995
+ ggml_tensor * src1 = dst->src[1];
1043
996
 
1044
- aclTensor* acl_src0;
1045
- aclTensor* acl_src1;
1046
- aclTensor* acl_dst;
997
+ acl_tensor_ptr acl_src0, acl_src1, acl_dst;
1047
998
 
1048
999
  // Need bcast
1049
- bcast_shape(src0, src1, dst, &acl_src0, &acl_src1, &acl_dst);
1050
- binary_op(ctx, acl_src0, acl_src1, acl_dst);
1051
-
1052
- ggml_cann_release_resources(ctx, acl_src0, acl_src1, acl_dst);
1000
+ bcast_shape(src0, src1, dst, acl_src0, acl_src1, acl_dst);
1001
+ binary_op(ctx, acl_src0.get(), acl_src1.get(), acl_dst.get());
1053
1002
  }
1054
1003
 
1055
-
1056
1004
  /**
1057
1005
  * @brief Applies a unary operation to an input tensor using the CANN backend.
1058
1006
  *
@@ -1060,66 +1008,157 @@ void ggml_cann_binary_op(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
1060
1008
  * and stores the result in the destination tensor.
1061
1009
  *
1062
1010
  * @tparam unary_op A callable with the signature:
1063
- * void(ggml_backend_cann_context&, aclTensor*, aclTensor*)
1011
+ * void(ggml_backend_cann_context&, aclTensor *, aclTensor *)
1064
1012
  * where the first aclTensor is the source and the second is the destination.
1065
1013
  * @param ctx The CANN backend context for managing resources and execution.
1066
1014
  * @param dst The destination tensor. Its src[0] is treated as the input tensor.
1067
1015
  */
1068
- template <void unary_op(ggml_backend_cann_context&, aclTensor*, aclTensor*)>
1069
- void ggml_cann_unary_op(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
1070
- ggml_tensor* src = dst->src[0];
1016
+ template <void unary_op(ggml_backend_cann_context &, aclTensor *, aclTensor *)>
1017
+ void ggml_cann_op_unary(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
1018
+ ggml_tensor * src = dst->src[0];
1071
1019
 
1072
- aclTensor* acl_src = ggml_cann_create_tensor(src);
1073
- aclTensor* acl_dst = ggml_cann_create_tensor(dst);
1020
+ acl_tensor_ptr acl_src = ggml_cann_create_tensor(src);
1021
+ acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst);
1074
1022
 
1075
- unary_op(ctx, acl_src, acl_dst);
1076
- ggml_cann_release_resources(ctx, acl_src, acl_dst);
1023
+ unary_op(ctx, acl_src.get(), acl_dst.get());
1077
1024
  }
1078
1025
 
1079
1026
  /**
1080
- * @brief Applies a unary operation to a ggml tensor using the CANN backend.
1027
+ * @brief Applies a unary operation to a ggml tensor using the CANN backend.
1081
1028
  *
1082
- * @details This function performs a unary operation on the input tensor using
1083
- * a user-provided lambda or callable object `unary_op`, which accepts the CANN
1084
- * context and two ACL tensors (source and destination). Internally, this function
1085
- * creates ACL representations of the ggml tensors and invokes the unary operation.
1086
- * The result is stored in the destination tensor `dst`. This utility abstracts the
1087
- * common boilerplate of tensor conversion and cleanup when implementing unary ops.
1029
+ * @details This function applies a unary operation to the input tensor using
1030
+ * a user-provided lambda or callable `unary_op`. The lambda receives the
1031
+ * CANN backend context and two ACL tensors: the source and the destination.
1088
1032
  *
1089
- * @param unary_op A callable that performs the unary operation using CANN APIs.
1090
- * @param ctx The CANN context used for operations.
1091
- * @param dst The destination tensor where the result will be stored.
1092
- * The source tensor is retrieved from `dst->src[0]`.
1033
+ * Internally, this function handles the conversion from GGML tensors to ACL tensors,
1034
+ * calls the provided unary op, and manages resource cleanup. The input is assumed
1035
+ * to be `dst->src[0]`, and the result is written to `dst`.
1036
+ *
1037
+ * This utility simplifies writing unary op wrappers by abstracting tensor preparation.
1038
+ *
1039
+ * @param unary_op A callable that performs the unary operation using CANN ACL APIs.
1040
+ * @param ctx The CANN context for operation execution.
1041
+ * @param dst The destination ggml_tensor where the result will be stored.
1042
+ * The input tensor is assumed to be `dst->src[0]`.
1043
+ *
1044
+ * @see GGML_CANN_CALL_OP_UNARY
1045
+ */
1046
+ void ggml_cann_op_unary(std::function<void(ggml_backend_cann_context &, aclTensor *, aclTensor *)> unary_op,
1047
+ ggml_backend_cann_context & ctx,
1048
+ ggml_tensor * dst);
1049
+
1050
+ void ggml_cann_ssm_conv(ggml_backend_cann_context & ctx, ggml_tensor * dst);
1051
+
1052
+ /**
1053
+ * @brief Applies a gated (GLU-style) unary operation using the CANN backend.
1054
+ *
1055
+ * @details This function performs a gated activation such as GEGLU or ReGLU.
1056
+ * It supports two input modes:
1057
+ *
1058
+ * 1. **Dual input mode**: `dst->src[0]` and `dst->src[1]` are both valid tensors.
1059
+ * These are used directly as the value and gate tensors.
1060
+ *
1061
+ * 2. **Packed input mode**: Only `dst->src[0]` is valid, and it is assumed to
1062
+ * contain a concatenation of value and gate along the first dimension. This tensor
1063
+ * will be split into two equal halves to form the value and gate inputs.
1064
+ *
1065
+ * The function applies a user-provided unary operation (e.g., GELU) to the value tensor,
1066
+ * then multiplies the result in-place with the gate tensor:
1067
+ *
1068
+ * @code
1069
+ * dst = unary_op(value) * gate;
1070
+ * @endcode
1071
+ *
1072
+ * The `swapped` parameter (from `dst->op_params[1]`) allows flipping the
1073
+ * order of value/gate in the packed input case.
1074
+ *
1075
+ * @param unary_op A callable that performs the unary operation using CANN ACL APIs.
1076
+ * It receives (ctx, acl_value_tensor, acl_output_tensor).
1077
+ * @param ctx The CANN context used for execution.
1078
+ * @param dst The destination ggml_tensor. Source tensors are in `dst->src[0]` and optionally `src[1]`.
1079
+ *
1080
+ * @see GGML_CANN_CALL_OP_UNARY_GATED
1081
+ */
1082
+ void ggml_cann_op_unary_gated(std::function<void(ggml_backend_cann_context &, aclTensor *, aclTensor *)> unary_op,
1083
+ ggml_backend_cann_context & ctx,
1084
+ ggml_tensor * dst);
1085
+
1086
+ /**
1087
+ * @brief Helper macro to call a unary ACL operator via ggml_cann_op_unary.
1088
+ *
1089
+ * This macro wraps the specified ACLNN unary operator name into a lambda expression,
1090
+ * and passes it to `ggml_cann_op_unary`, which handles the common logic for executing
1091
+ * unary ops in the CANN backend.
1092
+ *
1093
+ * Internally, this macro expands to a lambda like:
1094
+ * @code
1095
+ * [](ggml_backend_cann_context& ctx, aclTensor* acl_src, aclTensor* acl_dst) {
1096
+ * GGML_CANN_CALL_ACLNN_OP(ctx, OP_NAME, acl_src, acl_dst);
1097
+ * };
1098
+ * @endcode
1099
+ *
1100
+ * This lambda is then passed to `ggml_cann_op_unary`, which applies the operation.
1101
+ *
1102
+ * @param OP_NAME The name of the ACL unary operator to invoke via GGML_CANN_CALL_ACLNN_OP.
1103
+ *
1104
+ * @see ggml_cann_op_unary
1105
+ * @see GGML_CANN_CALL_ACLNN_OP
1093
1106
  */
1094
- void ggml_cann_unary_op(
1095
- std::function<void(ggml_backend_cann_context&, aclTensor*, aclTensor*)> unary_op,
1096
- ggml_backend_cann_context& ctx, ggml_tensor* dst);
1107
+ #define GGML_CANN_CALL_OP_UNARY(OP_NAME) \
1108
+ do { \
1109
+ auto lambda = [](ggml_backend_cann_context & ctx, aclTensor * acl_src, aclTensor * acl_dst) { \
1110
+ GGML_CANN_CALL_ACLNN_OP(ctx, OP_NAME, acl_src, acl_dst); \
1111
+ }; \
1112
+ ggml_cann_op_unary(lambda, ctx, dst); \
1113
+ } while (0)
1097
1114
 
1098
1115
  /**
1099
- * @brief Helper macro to invoke a unary ACL operation using ggml_cann_unary_op.
1116
+ * @brief Helper macro to call a gated unary ACL operator via ggml_cann_op_unary_gated.
1100
1117
  *
1101
- * This macro defines an inline lambda wrapping a specific ACL operation name,
1102
- * and passes it to the templated ggml_cann_unary_op function. It simplifies
1103
- * calling unary ops by hiding the lambda boilerplate.
1118
+ * This macro wraps the specified ACLNN unary operator name into a lambda expression,
1119
+ * and passes it to `ggml_cann_op_unary_gated`, which handles the common logic for
1120
+ * executing gated unary ops in the CANN backend.
1104
1121
  *
1105
- * Internally, the lambda will call:
1122
+ * Internally, this macro expands to a lambda like:
1106
1123
  * @code
1107
- * GGML_CANN_CALL_ACLNN_OP(ctx, OP_NAME, acl_src, acl_dst);
1124
+ * [](ggml_backend_cann_context& ctx, aclTensor* acl_src, aclTensor* acl_dst) {
1125
+ * GGML_CANN_CALL_ACLNN_OP(ctx, OP_NAME, acl_src, acl_dst);
1126
+ * };
1108
1127
  * @endcode
1109
1128
  *
1129
+ * This lambda is then passed to `ggml_cann_op_unary_gated`, which applies the operation.
1130
+ *
1110
1131
  * @param OP_NAME The name of the ACL unary operator to invoke via GGML_CANN_CALL_ACLNN_OP.
1111
1132
  *
1112
- * @see ggml_cann_unary_op
1133
+ * @see ggml_cann_op_unary_gated
1113
1134
  * @see GGML_CANN_CALL_ACLNN_OP
1114
1135
  */
1115
- #define GGML_CANN_CALL_UNARY_OP(OP_NAME) \
1116
- do { \
1117
- auto lambda = [](ggml_backend_cann_context& ctx, \
1118
- aclTensor* acl_src, \
1119
- aclTensor* acl_dst) { \
1120
- GGML_CANN_CALL_ACLNN_OP(ctx, OP_NAME, acl_src, acl_dst); \
1121
- }; \
1122
- ggml_cann_unary_op(lambda, ctx, dst); \
1123
- } \
1124
- while (0)
1136
+ #define GGML_CANN_CALL_OP_UNARY_GATED(OP_NAME) \
1137
+ do { \
1138
+ auto lambda = [](ggml_backend_cann_context & ctx, aclTensor * acl_src, aclTensor * acl_dst) { \
1139
+ GGML_CANN_CALL_ACLNN_OP(ctx, OP_NAME, acl_src, acl_dst); \
1140
+ }; \
1141
+ ggml_cann_op_unary_gated(lambda, ctx, dst); \
1142
+ } while (0)
1143
+
1125
1144
  #endif // CANN_ACLNN_OPS
1145
+
1146
+ /**
1147
+ * @brief Performs outer product operation on two ggml tensors using the CANN backend.
1148
+ *
1149
+ * @details This function computes the outer product of two input tensors (src0 and src1)
1150
+ * and stores the result in the destination tensor. The outer product operation is defined as:
1151
+ * dst[i,j,k,l] = sum_m (src0[i,m,k,l] * src1[j,m,k,l])
1152
+ *
1153
+ * The function supports multiple data types including F32, F16. For floating-point
1154
+ * types, it uses batch matrix multiplication for efficient computation.
1155
+ *
1156
+ * The implementation handles 4D tensor broadcasting and batch processing automatically.
1157
+ *
1158
+ * @param ctx The CANN backend context for operation execution and memory management.
1159
+ * @param dst The destination ggml_tensor where the outer product result will be stored.
1160
+ * The input tensors are assumed to be `dst->src[0]` and `dst->src[1]`.
1161
+ *
1162
+ * @see GGML_CANN_CALL_ACLNN_OP for CANN operator invocation
1163
+ */
1164
+ void ggml_cann_out_prod(ggml_backend_cann_context & ctx, ggml_tensor * dst);