whispercpp 1.3.2 → 1.3.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (664)
  1. checksums.yaml +4 -4
  2. data/.gitignore +6 -3
  3. data/README.md +71 -14
  4. data/Rakefile +20 -7
  5. data/ext/.gitignore +4 -6
  6. data/ext/dependencies.rb +36 -24
  7. data/ext/extconf.rb +1 -1
  8. data/ext/options.rb +48 -184
  9. data/ext/ruby_whisper.c +18 -0
  10. data/ext/ruby_whisper_context.c +43 -12
  11. data/ext/ruby_whisper_model.c +1 -1
  12. data/ext/ruby_whisper_params.c +59 -27
  13. data/ext/ruby_whisper_segment.c +81 -4
  14. data/ext/ruby_whisper_transcribe.cpp +13 -7
  15. data/ext/ruby_whisper_vad_params.c +1 -1
  16. data/ext/sources/CMakeLists.txt +5 -1
  17. data/ext/sources/bindings/javascript/package.json +1 -1
  18. data/ext/sources/build-xcframework.sh +24 -0
  19. data/ext/sources/examples/CMakeLists.txt +1 -0
  20. data/ext/sources/examples/addon.node/__test__/whisper.spec.js +120 -24
  21. data/ext/sources/examples/addon.node/addon.cpp +154 -35
  22. data/ext/sources/examples/addon.node/index.js +10 -5
  23. data/ext/sources/examples/addon.node/vad-example.js +132 -0
  24. data/ext/sources/examples/bench/bench.cpp +29 -18
  25. data/ext/sources/examples/bench.wasm/index-tmpl.html +10 -9
  26. data/ext/sources/examples/cli/cli.cpp +7 -4
  27. data/ext/sources/examples/command/command.cpp +58 -32
  28. data/ext/sources/examples/command.wasm/index-tmpl.html +5 -4
  29. data/ext/sources/examples/common-ggml.cpp +2 -0
  30. data/ext/sources/examples/common-whisper.cpp +14 -7
  31. data/ext/sources/examples/lsp/lsp.cpp +21 -17
  32. data/ext/sources/examples/quantize/quantize.cpp +3 -0
  33. data/ext/sources/examples/server/CMakeLists.txt +3 -0
  34. data/ext/sources/examples/server/server.cpp +193 -35
  35. data/ext/sources/examples/server.py +6 -1
  36. data/ext/sources/examples/stream/stream.cpp +10 -2
  37. data/ext/sources/examples/stream.wasm/emscripten.cpp +6 -6
  38. data/ext/sources/examples/stream.wasm/index-tmpl.html +82 -5
  39. data/ext/sources/examples/talk-llama/CMakeLists.txt +3 -0
  40. data/ext/sources/examples/talk-llama/llama-adapter.cpp +101 -4
  41. data/ext/sources/examples/talk-llama/llama-adapter.h +6 -0
  42. data/ext/sources/examples/talk-llama/llama-arch.cpp +756 -15
  43. data/ext/sources/examples/talk-llama/llama-arch.h +85 -1
  44. data/ext/sources/examples/talk-llama/llama-batch.cpp +773 -272
  45. data/ext/sources/examples/talk-llama/llama-batch.h +126 -55
  46. data/ext/sources/examples/talk-llama/llama-chat.cpp +150 -13
  47. data/ext/sources/examples/talk-llama/llama-chat.h +8 -0
  48. data/ext/sources/examples/talk-llama/llama-context.cpp +814 -542
  49. data/ext/sources/examples/talk-llama/llama-context.h +68 -32
  50. data/ext/sources/examples/talk-llama/llama-cparams.cpp +1 -1
  51. data/ext/sources/examples/talk-llama/llama-cparams.h +4 -4
  52. data/ext/sources/examples/talk-llama/llama-graph.cpp +787 -440
  53. data/ext/sources/examples/talk-llama/llama-graph.h +333 -153
  54. data/ext/sources/examples/talk-llama/llama-hparams.cpp +128 -6
  55. data/ext/sources/examples/talk-llama/llama-hparams.h +80 -17
  56. data/ext/sources/examples/talk-llama/llama-impl.h +2 -0
  57. data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.cpp +326 -0
  58. data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.h +137 -0
  59. data/ext/sources/examples/talk-llama/llama-kv-cache.cpp +1248 -1967
  60. data/ext/sources/examples/talk-llama/llama-kv-cache.h +218 -345
  61. data/ext/sources/examples/talk-llama/llama-kv-cells.h +164 -52
  62. data/ext/sources/examples/talk-llama/llama-memory-hybrid.cpp +266 -0
  63. data/ext/sources/examples/talk-llama/llama-memory-hybrid.h +139 -0
  64. data/ext/sources/examples/talk-llama/llama-memory-recurrent.cpp +1154 -0
  65. data/ext/sources/examples/talk-llama/llama-memory-recurrent.h +182 -0
  66. data/ext/sources/examples/talk-llama/llama-memory.cpp +58 -0
  67. data/ext/sources/examples/talk-llama/llama-memory.h +94 -4
  68. data/ext/sources/examples/talk-llama/llama-mmap.cpp +1 -1
  69. data/ext/sources/examples/talk-llama/llama-model-loader.cpp +44 -17
  70. data/ext/sources/examples/talk-llama/llama-model-loader.h +3 -2
  71. data/ext/sources/examples/talk-llama/llama-model-saver.cpp +1 -0
  72. data/ext/sources/examples/talk-llama/llama-model.cpp +11377 -5248
  73. data/ext/sources/examples/talk-llama/llama-model.h +87 -9
  74. data/ext/sources/examples/talk-llama/llama-quant.cpp +137 -16
  75. data/ext/sources/examples/talk-llama/llama-sampling.cpp +226 -126
  76. data/ext/sources/examples/talk-llama/llama-vocab.cpp +502 -38
  77. data/ext/sources/examples/talk-llama/llama-vocab.h +46 -0
  78. data/ext/sources/examples/talk-llama/llama.cpp +76 -17
  79. data/ext/sources/examples/talk-llama/llama.h +176 -151
  80. data/ext/sources/examples/talk-llama/talk-llama.cpp +11 -6
  81. data/ext/sources/examples/talk-llama/unicode.cpp +212 -0
  82. data/ext/sources/examples/talk-llama/unicode.h +45 -0
  83. data/ext/sources/examples/vad-speech-segments/speech.cpp +6 -0
  84. data/ext/sources/examples/wchess/wchess.cmd/wchess.cmd.cpp +6 -2
  85. data/ext/sources/examples/whisper.wasm/index-tmpl.html +17 -16
  86. data/ext/sources/ggml/CMakeLists.txt +106 -33
  87. data/ext/sources/ggml/cmake/common.cmake +24 -0
  88. data/ext/sources/ggml/cmake/ggml-config.cmake.in +132 -93
  89. data/ext/sources/ggml/include/ggml-backend.h +18 -2
  90. data/ext/sources/ggml/include/ggml-cpu.h +2 -0
  91. data/ext/sources/ggml/include/ggml-metal.h +1 -6
  92. data/ext/sources/ggml/include/ggml-opt.h +25 -6
  93. data/ext/sources/ggml/include/ggml-webgpu.h +19 -0
  94. data/ext/sources/ggml/include/ggml-zdnn.h +17 -0
  95. data/ext/sources/ggml/include/ggml.h +365 -21
  96. data/ext/sources/ggml/src/CMakeLists.txt +98 -25
  97. data/ext/sources/ggml/src/ggml-alloc.c +265 -141
  98. data/ext/sources/ggml/src/ggml-backend-impl.h +4 -1
  99. data/ext/sources/ggml/src/ggml-backend-reg.cpp +35 -13
  100. data/ext/sources/ggml/src/ggml-backend.cpp +266 -60
  101. data/ext/sources/ggml/src/ggml-blas/CMakeLists.txt +4 -4
  102. data/ext/sources/ggml/src/ggml-blas/ggml-blas.cpp +5 -4
  103. data/ext/sources/ggml/src/ggml-cann/CMakeLists.txt +15 -0
  104. data/ext/sources/ggml/src/ggml-cann/acl_tensor.cpp +3 -1
  105. data/ext/sources/ggml/src/ggml-cann/aclnn_ops.cpp +903 -717
  106. data/ext/sources/ggml/src/ggml-cann/aclnn_ops.h +143 -25
  107. data/ext/sources/ggml/src/ggml-cann/common.h +149 -2
  108. data/ext/sources/ggml/src/ggml-cann/ggml-cann.cpp +521 -78
  109. data/ext/sources/ggml/src/ggml-common.h +21 -0
  110. data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +165 -50
  111. data/ext/sources/ggml/src/ggml-cpu/amx/amx.cpp +5 -3
  112. data/ext/sources/ggml/src/ggml-cpu/amx/mmq.cpp +11 -10
  113. data/ext/sources/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
  114. data/ext/sources/ggml/src/ggml-cpu/arch/arm/quants.c +3650 -0
  115. data/ext/sources/ggml/src/ggml-cpu/arch/arm/repack.cpp +1891 -0
  116. data/ext/sources/ggml/src/ggml-cpu/arch/loongarch/quants.c +2160 -0
  117. data/ext/sources/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
  118. data/ext/sources/ggml/src/ggml-cpu/arch/powerpc/quants.c +2305 -0
  119. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/quants.c +1897 -0
  120. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/repack.cpp +342 -0
  121. data/ext/sources/ggml/src/ggml-cpu/arch/s390/quants.c +1468 -0
  122. data/ext/sources/ggml/src/ggml-cpu/arch/wasm/quants.c +1221 -0
  123. data/ext/sources/ggml/src/ggml-cpu/arch/x86/quants.c +3820 -0
  124. data/ext/sources/ggml/src/ggml-cpu/arch/x86/repack.cpp +6307 -0
  125. data/ext/sources/ggml/src/ggml-cpu/arch-fallback.h +214 -0
  126. data/ext/sources/ggml/src/ggml-cpu/common.h +18 -3
  127. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-impl.h +23 -7
  128. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +179 -110
  129. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.cpp +44 -33
  130. data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-hbm.cpp → hbm.cpp} +1 -1
  131. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.cpp +152 -18
  132. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.h +7 -1
  133. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +228 -98
  134. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +532 -1124
  135. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.h +5 -0
  136. data/ext/sources/ggml/src/ggml-cpu/ops.cpp +3374 -2081
  137. data/ext/sources/ggml/src/ggml-cpu/ops.h +13 -8
  138. data/ext/sources/ggml/src/ggml-cpu/quants.c +1193 -0
  139. data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-quants.h → quants.h} +34 -0
  140. data/ext/sources/ggml/src/ggml-cpu/repack.cpp +1982 -0
  141. data/ext/sources/ggml/src/ggml-cpu/repack.h +120 -0
  142. data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +367 -46
  143. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.cpp +1024 -0
  144. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.h +13 -0
  145. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp +3196 -0
  146. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime_kernels.h +26 -0
  147. data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-traits.cpp → traits.cpp} +3 -3
  148. data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-traits.h → traits.h} +1 -1
  149. data/ext/sources/ggml/src/ggml-cpu/vec.cpp +272 -35
  150. data/ext/sources/ggml/src/ggml-cpu/vec.h +794 -142
  151. data/ext/sources/ggml/src/ggml-cuda/CMakeLists.txt +20 -16
  152. data/ext/sources/ggml/src/ggml-cuda/add-id.cu +58 -0
  153. data/ext/sources/ggml/src/ggml-cuda/add-id.cuh +3 -0
  154. data/ext/sources/ggml/src/ggml-cuda/binbcast.cu +330 -191
  155. data/ext/sources/ggml/src/ggml-cuda/binbcast.cuh +2 -0
  156. data/ext/sources/ggml/src/ggml-cuda/common.cuh +291 -81
  157. data/ext/sources/ggml/src/ggml-cuda/conv-transpose-1d.cu +1 -4
  158. data/ext/sources/ggml/src/ggml-cuda/conv2d-dw.cu +161 -0
  159. data/ext/sources/ggml/src/ggml-cuda/conv2d-dw.cuh +5 -0
  160. data/ext/sources/ggml/src/ggml-cuda/conv2d-transpose.cu +91 -0
  161. data/ext/sources/ggml/src/ggml-cuda/conv2d-transpose.cuh +4 -0
  162. data/ext/sources/ggml/src/ggml-cuda/conv2d.cu +166 -0
  163. data/ext/sources/ggml/src/ggml-cuda/conv2d.cuh +5 -0
  164. data/ext/sources/ggml/src/ggml-cuda/convert.cu +117 -22
  165. data/ext/sources/ggml/src/ggml-cuda/convert.cuh +20 -0
  166. data/ext/sources/ggml/src/ggml-cuda/cpy-utils.cuh +217 -0
  167. data/ext/sources/ggml/src/ggml-cuda/cpy.cu +64 -307
  168. data/ext/sources/ggml/src/ggml-cuda/cross-entropy-loss.cu +2 -14
  169. data/ext/sources/ggml/src/ggml-cuda/dequantize.cuh +14 -40
  170. data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +499 -368
  171. data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +142 -93
  172. data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cu +755 -0
  173. data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cuh +3 -0
  174. data/ext/sources/ggml/src/ggml-cuda/fattn-vec.cuh +593 -0
  175. data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +90 -50
  176. data/ext/sources/ggml/src/ggml-cuda/fattn.cu +185 -198
  177. data/ext/sources/ggml/src/ggml-cuda/fattn.cuh +2 -0
  178. data/ext/sources/ggml/src/ggml-cuda/getrows.cu +50 -39
  179. data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +636 -222
  180. data/ext/sources/ggml/src/ggml-cuda/im2col.cu +196 -35
  181. data/ext/sources/ggml/src/ggml-cuda/im2col.cuh +1 -0
  182. data/ext/sources/ggml/src/ggml-cuda/mean.cu +73 -0
  183. data/ext/sources/ggml/src/ggml-cuda/mean.cuh +3 -0
  184. data/ext/sources/ggml/src/ggml-cuda/mma.cuh +198 -45
  185. data/ext/sources/ggml/src/ggml-cuda/mmf.cu +123 -0
  186. data/ext/sources/ggml/src/ggml-cuda/mmf.cuh +496 -0
  187. data/ext/sources/ggml/src/ggml-cuda/mmq.cu +206 -57
  188. data/ext/sources/ggml/src/ggml-cuda/mmq.cuh +1262 -721
  189. data/ext/sources/ggml/src/ggml-cuda/mmvf.cu +506 -0
  190. data/ext/sources/ggml/src/ggml-cuda/{mmv.cuh → mmvf.cuh} +4 -5
  191. data/ext/sources/ggml/src/ggml-cuda/mmvq.cu +64 -73
  192. data/ext/sources/ggml/src/ggml-cuda/norm.cu +284 -12
  193. data/ext/sources/ggml/src/ggml-cuda/norm.cuh +7 -0
  194. data/ext/sources/ggml/src/ggml-cuda/opt-step-sgd.cu +49 -0
  195. data/ext/sources/ggml/src/ggml-cuda/opt-step-sgd.cuh +5 -0
  196. data/ext/sources/ggml/src/ggml-cuda/pad.cu +46 -23
  197. data/ext/sources/ggml/src/ggml-cuda/pad_reflect_1d.cu +91 -0
  198. data/ext/sources/ggml/src/ggml-cuda/pad_reflect_1d.cuh +5 -0
  199. data/ext/sources/ggml/src/ggml-cuda/quantize.cu +12 -10
  200. data/ext/sources/ggml/src/ggml-cuda/reduce_rows.cuh +53 -0
  201. data/ext/sources/ggml/src/ggml-cuda/roll.cu +67 -0
  202. data/ext/sources/ggml/src/ggml-cuda/roll.cuh +5 -0
  203. data/ext/sources/ggml/src/ggml-cuda/rope.cu +21 -27
  204. data/ext/sources/ggml/src/ggml-cuda/scale.cu +14 -11
  205. data/ext/sources/ggml/src/ggml-cuda/set-rows.cu +276 -0
  206. data/ext/sources/ggml/src/ggml-cuda/set-rows.cuh +7 -0
  207. data/ext/sources/ggml/src/ggml-cuda/softcap.cu +34 -0
  208. data/ext/sources/ggml/src/ggml-cuda/softcap.cuh +5 -0
  209. data/ext/sources/ggml/src/ggml-cuda/softmax.cu +126 -59
  210. data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cu +10 -2
  211. data/ext/sources/ggml/src/ggml-cuda/ssm-scan.cu +322 -98
  212. data/ext/sources/ggml/src/ggml-cuda/sum.cu +6 -10
  213. data/ext/sources/ggml/src/ggml-cuda/sumrows.cu +23 -19
  214. data/ext/sources/ggml/src/ggml-cuda/sumrows.cuh +0 -1
  215. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-f16.cu +7 -0
  216. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q4_0.cu +7 -0
  217. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q4_1.cu +7 -0
  218. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q5_0.cu +7 -0
  219. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q5_1.cu +7 -0
  220. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q8_0.cu +7 -0
  221. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-f16.cu +7 -0
  222. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_0.cu +7 -0
  223. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_1.cu +7 -0
  224. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q5_0.cu +7 -0
  225. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q5_1.cu +7 -0
  226. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q8_0.cu +7 -0
  227. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-f16.cu +7 -0
  228. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q4_0.cu +7 -0
  229. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q4_1.cu +7 -0
  230. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q5_0.cu +7 -0
  231. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q5_1.cu +7 -0
  232. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q8_0.cu +7 -0
  233. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-f16.cu +7 -0
  234. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q4_0.cu +7 -0
  235. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q4_1.cu +7 -0
  236. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q5_0.cu +7 -0
  237. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q5_1.cu +7 -0
  238. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q8_0.cu +7 -0
  239. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-f16.cu +7 -0
  240. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q4_0.cu +7 -0
  241. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q4_1.cu +7 -0
  242. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q5_0.cu +7 -0
  243. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q5_1.cu +7 -0
  244. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q8_0.cu +7 -0
  245. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-f16.cu +7 -0
  246. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q4_0.cu +7 -0
  247. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q4_1.cu +7 -0
  248. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q5_0.cu +7 -0
  249. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q5_1.cu +7 -0
  250. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q8_0.cu +7 -0
  251. data/ext/sources/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +21 -18
  252. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_1.cu +5 -0
  253. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_10.cu +5 -0
  254. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_11.cu +5 -0
  255. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_12.cu +5 -0
  256. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_13.cu +5 -0
  257. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_14.cu +5 -0
  258. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_15.cu +5 -0
  259. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_16.cu +5 -0
  260. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_2.cu +5 -0
  261. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_3.cu +5 -0
  262. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_4.cu +5 -0
  263. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_5.cu +5 -0
  264. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_6.cu +5 -0
  265. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_7.cu +5 -0
  266. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_8.cu +5 -0
  267. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_9.cu +5 -0
  268. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-mxfp4.cu +5 -0
  269. data/ext/sources/ggml/src/ggml-cuda/topk-moe.cu +259 -0
  270. data/ext/sources/ggml/src/ggml-cuda/topk-moe.cuh +14 -0
  271. data/ext/sources/ggml/src/ggml-cuda/tsembd.cu +3 -3
  272. data/ext/sources/ggml/src/ggml-cuda/unary.cu +179 -0
  273. data/ext/sources/ggml/src/ggml-cuda/unary.cuh +15 -0
  274. data/ext/sources/ggml/src/ggml-cuda/upscale.cu +92 -6
  275. data/ext/sources/ggml/src/ggml-cuda/vecdotq.cuh +110 -22
  276. data/ext/sources/ggml/src/ggml-cuda/vendors/cuda.h +4 -0
  277. data/ext/sources/ggml/src/ggml-cuda/vendors/hip.h +58 -36
  278. data/ext/sources/ggml/src/ggml-cuda/vendors/musa.h +4 -3
  279. data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +14 -2
  280. data/ext/sources/ggml/src/ggml-impl.h +229 -175
  281. data/ext/sources/ggml/src/ggml-metal/CMakeLists.txt +21 -17
  282. data/ext/sources/ggml/src/ggml-metal/ggml-metal-common.cpp +446 -0
  283. data/ext/sources/ggml/src/ggml-metal/ggml-metal-common.h +52 -0
  284. data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.h +33 -0
  285. data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.m +600 -0
  286. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.cpp +1376 -0
  287. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.h +226 -0
  288. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.m +1308 -0
  289. data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +163 -63
  290. data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.cpp +3158 -0
  291. data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.h +82 -0
  292. data/ext/sources/ggml/src/ggml-metal/ggml-metal.cpp +718 -0
  293. data/ext/sources/ggml/src/ggml-metal/ggml-metal.metal +3208 -1575
  294. data/ext/sources/ggml/src/ggml-musa/CMakeLists.txt +18 -8
  295. data/ext/sources/ggml/src/ggml-musa/mudnn.cuh +2 -2
  296. data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +32 -0
  297. data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +4430 -792
  298. data/ext/sources/ggml/src/ggml-opencl/kernels/add.cl +107 -0
  299. data/ext/sources/ggml/src/ggml-opencl/kernels/add_id.cl +42 -0
  300. data/ext/sources/ggml/src/ggml-opencl/kernels/argsort.cl +86 -0
  301. data/ext/sources/ggml/src/ggml-opencl/kernels/concat.cl +109 -0
  302. data/ext/sources/ggml/src/ggml-opencl/kernels/conv2d.cl +185 -0
  303. data/ext/sources/ggml/src/ggml-opencl/kernels/conv2d_f16_f32.cl +176 -0
  304. data/ext/sources/ggml/src/ggml-opencl/kernels/cvt.cl +84 -0
  305. data/ext/sources/ggml/src/ggml-opencl/kernels/div.cl +138 -0
  306. data/ext/sources/ggml/src/ggml-opencl/kernels/flash_attn_f16.cl +370 -0
  307. data/ext/sources/ggml/src/ggml-opencl/kernels/flash_attn_f32.cl +370 -0
  308. data/ext/sources/ggml/src/ggml-opencl/kernels/flash_attn_f32_f16.cl +373 -0
  309. data/ext/sources/ggml/src/ggml-opencl/kernels/gelu.cl +27 -0
  310. data/ext/sources/ggml/src/ggml-opencl/kernels/glu.cl +378 -0
  311. data/ext/sources/ggml/src/ggml-opencl/kernels/group_norm.cl +121 -0
  312. data/ext/sources/ggml/src/ggml-opencl/kernels/im2col_f16.cl +1 -1
  313. data/ext/sources/ggml/src/ggml-opencl/kernels/im2col_f32.cl +1 -1
  314. data/ext/sources/ggml/src/ggml-opencl/kernels/mul.cl +73 -0
  315. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mat_f16_f32.cl +130 -0
  316. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_l4_lm.cl +132 -0
  317. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_f32_f32_l4_lm.cl +133 -0
  318. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32.cl +189 -0
  319. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32_flat.cl +176 -0
  320. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl +283 -0
  321. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32.cl +140 -0
  322. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32_flat.cl +222 -0
  323. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32.cl +144 -0
  324. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32_flat.cl +167 -0
  325. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32.cl +125 -0
  326. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32_flat.cl +202 -0
  327. data/ext/sources/ggml/src/ggml-opencl/kernels/norm.cl +80 -0
  328. data/ext/sources/ggml/src/ggml-opencl/kernels/pad.cl +30 -0
  329. data/ext/sources/ggml/src/ggml-opencl/kernels/repeat.cl +39 -0
  330. data/ext/sources/ggml/src/ggml-opencl/kernels/rms_norm.cl +79 -0
  331. data/ext/sources/ggml/src/ggml-opencl/kernels/scale.cl +3 -2
  332. data/ext/sources/ggml/src/ggml-opencl/kernels/set_rows.cl +189 -0
  333. data/ext/sources/ggml/src/ggml-opencl/kernels/sigmoid.cl +29 -0
  334. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_4_f16.cl +34 -13
  335. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_4_f32.cl +34 -13
  336. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_f16.cl +34 -13
  337. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_f32.cl +34 -13
  338. data/ext/sources/ggml/src/ggml-opencl/kernels/sub.cl +138 -0
  339. data/ext/sources/ggml/src/ggml-opencl/kernels/sum_rows.cl +39 -0
  340. data/ext/sources/ggml/src/ggml-opencl/kernels/tanh.cl +63 -0
  341. data/ext/sources/ggml/src/ggml-opencl/kernels/transpose.cl +20 -0
  342. data/ext/sources/ggml/src/ggml-opencl/kernels/tsembd.cl +48 -0
  343. data/ext/sources/ggml/src/ggml-opencl/kernels/upscale.cl +120 -0
  344. data/ext/sources/ggml/src/ggml-opt.cpp +97 -41
  345. data/ext/sources/ggml/src/ggml-quants.c +117 -24
  346. data/ext/sources/ggml/src/ggml-quants.h +6 -0
  347. data/ext/sources/ggml/src/ggml-rpc/ggml-rpc.cpp +85 -62
  348. data/ext/sources/ggml/src/ggml-sycl/CMakeLists.txt +3 -3
  349. data/ext/sources/ggml/src/ggml-sycl/backend.hpp +2 -0
  350. data/ext/sources/ggml/src/ggml-sycl/binbcast.cpp +9 -0
  351. data/ext/sources/ggml/src/ggml-sycl/binbcast.hpp +6 -0
  352. data/ext/sources/ggml/src/ggml-sycl/common.hpp +20 -48
  353. data/ext/sources/ggml/src/ggml-sycl/concat.cpp +13 -17
  354. data/ext/sources/ggml/src/ggml-sycl/convert.cpp +21 -2
  355. data/ext/sources/ggml/src/ggml-sycl/cpy.cpp +116 -211
  356. data/ext/sources/ggml/src/ggml-sycl/cpy.hpp +213 -1
  357. data/ext/sources/ggml/src/ggml-sycl/dequantize.hpp +32 -0
  358. data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +700 -1041
  359. data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +20 -9
  360. data/ext/sources/ggml/src/ggml-sycl/gemm.hpp +17 -26
  361. data/ext/sources/ggml/src/ggml-sycl/getrows.cpp +2 -96
  362. data/ext/sources/ggml/src/ggml-sycl/ggml-sycl.cpp +393 -250
  363. data/ext/sources/ggml/src/ggml-sycl/im2col.cpp +1 -1
  364. data/ext/sources/ggml/src/ggml-sycl/mmvq.cpp +32 -8
  365. data/ext/sources/ggml/src/ggml-sycl/quantize.hpp +133 -0
  366. data/ext/sources/ggml/src/ggml-sycl/quants.hpp +38 -11
  367. data/ext/sources/ggml/src/ggml-sycl/rope.cpp +125 -21
  368. data/ext/sources/ggml/src/ggml-sycl/set_rows.cpp +234 -0
  369. data/ext/sources/ggml/src/ggml-sycl/set_rows.hpp +8 -0
  370. data/ext/sources/ggml/src/ggml-sycl/sycl_hw.cpp +3 -1
  371. data/ext/sources/ggml/src/ggml-sycl/sycl_hw.hpp +3 -0
  372. data/ext/sources/ggml/src/ggml-sycl/tsembd.cpp +4 -3
  373. data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +105 -17
  374. data/ext/sources/ggml/src/ggml-vulkan/CMakeLists.txt +36 -32
  375. data/ext/sources/ggml/src/ggml-vulkan/ggml-vulkan.cpp +4198 -1145
  376. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -12
  377. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add.comp +41 -1
  378. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add_id.comp +42 -0
  379. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argmax.comp +13 -4
  380. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp +39 -29
  381. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +349 -0
  382. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +98 -0
  383. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +2 -2
  384. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +66 -12
  385. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.comp +154 -0
  386. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.comp +21 -0
  387. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_s.comp +1 -1
  388. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xxs.comp +2 -1
  389. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_s.comp +6 -5
  390. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_xxs.comp +4 -2
  391. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_mxfp4.comp +32 -0
  392. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q2_k.comp +1 -1
  393. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q3_k.comp +1 -1
  394. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_k.comp +1 -1
  395. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_k.comp +1 -1
  396. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q6_k.comp +1 -1
  397. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/exp.comp +21 -0
  398. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +69 -24
  399. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp +60 -20
  400. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +98 -42
  401. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +64 -27
  402. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +74 -13
  403. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu.comp +13 -0
  404. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu_erf.comp +27 -0
  405. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu_quick.comp +11 -0
  406. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gelu_erf.comp +39 -0
  407. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/generic_binary_head.comp +4 -17
  408. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/get_rows.comp +19 -10
  409. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/get_rows_quant.comp +25 -15
  410. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.comp +19 -0
  411. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_main.comp +29 -0
  412. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/hardsigmoid.comp +22 -0
  413. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/hardswish.comp +22 -0
  414. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +18 -14
  415. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col_3d.comp +126 -0
  416. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.comp +65 -1
  417. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_nc.comp +11 -7
  418. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp +140 -0
  419. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +144 -531
  420. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +206 -38
  421. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.comp +556 -0
  422. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +12 -5
  423. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.comp +15 -9
  424. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp +111 -0
  425. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_sgd.comp +22 -0
  426. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/pad.comp +24 -3
  427. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/quantize_q8_1.comp +53 -3
  428. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/reglu.comp +9 -0
  429. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +64 -11
  430. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_partials.comp +65 -0
  431. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/roll.comp +46 -0
  432. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.comp +1 -4
  433. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +7 -9
  434. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +7 -9
  435. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +7 -9
  436. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rte.comp +5 -0
  437. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp +1 -1
  438. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +29 -7
  439. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_back.comp +4 -0
  440. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sqrt.comp +17 -0
  441. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.comp +38 -5
  442. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/swiglu.comp +9 -0
  443. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/swiglu_oai.comp +14 -0
  444. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/timestep_embedding.comp +4 -3
  445. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/types.comp +101 -9
  446. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +69 -5
  447. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/utils.comp +25 -0
  448. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +338 -71
  449. data/ext/sources/ggml/src/ggml-webgpu/CMakeLists.txt +54 -0
  450. data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu.cpp +1558 -0
  451. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/add.tmpl.wgsl +44 -0
  452. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/add_in_place.tmpl.wgsl +41 -0
  453. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary_head.tmpl +45 -0
  454. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +930 -0
  455. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl +60 -0
  456. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +124 -0
  457. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/get_rows.tmpl.wgsl +874 -0
  458. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl +40 -0
  459. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul.tmpl.wgsl +44 -0
  460. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_in_place.tmpl.wgsl +41 -0
  461. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl +907 -0
  462. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm.wgsl +57 -0
  463. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm_in_place.wgsl +48 -0
  464. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +81 -0
  465. data/ext/sources/ggml/src/ggml-zdnn/CMakeLists.txt +36 -0
  466. data/ext/sources/ggml/src/ggml-zdnn/common.hpp +59 -0
  467. data/ext/sources/ggml/src/ggml-zdnn/ggml-zdnn.cpp +628 -0
  468. data/ext/sources/ggml/src/ggml-zdnn/mmf.cpp +80 -0
  469. data/ext/sources/ggml/src/ggml-zdnn/mmf.hpp +12 -0
  470. data/ext/sources/ggml/src/ggml-zdnn/utils.cpp +79 -0
  471. data/ext/sources/ggml/src/ggml-zdnn/utils.hpp +19 -0
  472. data/ext/sources/ggml/src/ggml.c +802 -142
  473. data/ext/sources/ggml/src/ggml.cpp +26 -0
  474. data/ext/sources/ggml/src/gguf.cpp +32 -4
  475. data/ext/sources/include/whisper.h +2 -0
  476. data/ext/sources/src/CMakeLists.txt +2 -0
  477. data/ext/sources/src/coreml/whisper-compat.h +10 -0
  478. data/ext/sources/src/coreml/whisper-compat.m +35 -0
  479. data/ext/sources/src/coreml/whisper-decoder-impl.m +1 -0
  480. data/ext/sources/src/coreml/whisper-encoder-impl.m +1 -0
  481. data/ext/sources/src/whisper.cpp +241 -215
  482. data/ext/sources/tests/CMakeLists.txt +8 -1
  483. data/ext/sources/tests/test-vad-full.cpp +3 -3
  484. data/ext/sources/tests/test-vad.cpp +2 -2
  485. data/extsources.rb +15 -9
  486. data/lib/whisper/context.rb +15 -0
  487. data/lib/whisper/model/uri.rb +57 -2
  488. data/lib/whisper/segment.rb +58 -0
  489. data/sig/whisper.rbs +75 -38
  490. data/{tests → test}/helper.rb +1 -12
  491. data/{tests → test}/test_model.rb +9 -0
  492. data/test/test_package.rb +51 -0
  493. data/{tests → test}/test_params.rb +8 -0
  494. data/test/test_segment.rb +146 -0
  495. data/{tests → test}/test_whisper.rb +70 -0
  496. data/whispercpp.gemspec +2 -3
  497. metadata +246 -191
  498. data/ext/sources/.dockerignore +0 -3
  499. data/ext/sources/.github/workflows/bindings-ruby.yml +0 -21
  500. data/ext/sources/ci/run.sh +0 -336
  501. data/ext/sources/close-issue.yml +0 -28
  502. data/ext/sources/ggml/include/ggml-kompute.h +0 -50
  503. data/ext/sources/ggml/src/ggml-amx/CMakeLists.txt +0 -107
  504. data/ext/sources/ggml/src/ggml-amx/common.h +0 -94
  505. data/ext/sources/ggml/src/ggml-amx/ggml-amx.cpp +0 -446
  506. data/ext/sources/ggml/src/ggml-amx/mmq.cpp +0 -2510
  507. data/ext/sources/ggml/src/ggml-amx/mmq.h +0 -17
  508. data/ext/sources/ggml/src/ggml-cann/kernels/CMakeLists.txt +0 -30
  509. data/ext/sources/ggml/src/ggml-cann/kernels/ascendc_kernels.h +0 -19
  510. data/ext/sources/ggml/src/ggml-cann/kernels/dup.cpp +0 -234
  511. data/ext/sources/ggml/src/ggml-cann/kernels/get_row_f16.cpp +0 -197
  512. data/ext/sources/ggml/src/ggml-cann/kernels/get_row_f32.cpp +0 -190
  513. data/ext/sources/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -204
  514. data/ext/sources/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -191
  515. data/ext/sources/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -218
  516. data/ext/sources/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -216
  517. data/ext/sources/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -295
  518. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +0 -6431
  519. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +0 -8
  520. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-quants.c +0 -13747
  521. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f16.cu +0 -357
  522. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f16.cuh +0 -3
  523. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f32.cu +0 -365
  524. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f32.cuh +0 -3
  525. data/ext/sources/ggml/src/ggml-cuda/fattn-vec-f16.cuh +0 -482
  526. data/ext/sources/ggml/src/ggml-cuda/fattn-vec-f32.cuh +0 -472
  527. data/ext/sources/ggml/src/ggml-cuda/mmv.cu +0 -336
  528. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +0 -5
  529. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +0 -5
  530. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +0 -5
  531. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +0 -5
  532. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +0 -5
  533. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +0 -5
  534. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +0 -5
  535. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +0 -5
  536. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +0 -5
  537. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +0 -5
  538. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +0 -5
  539. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +0 -5
  540. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +0 -5
  541. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +0 -5
  542. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +0 -5
  543. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +0 -5
  544. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +0 -5
  545. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +0 -5
  546. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +0 -5
  547. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +0 -5
  548. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +0 -5
  549. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +0 -5
  550. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +0 -5
  551. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +0 -5
  552. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +0 -5
  553. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +0 -5
  554. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +0 -5
  555. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +0 -5
  556. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +0 -5
  557. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +0 -5
  558. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +0 -5
  559. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +0 -5
  560. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +0 -5
  561. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +0 -5
  562. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +0 -5
  563. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +0 -5
  564. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +0 -5
  565. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +0 -5
  566. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +0 -5
  567. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +0 -5
  568. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +0 -5
  569. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +0 -5
  570. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +0 -5
  571. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +0 -5
  572. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +0 -5
  573. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +0 -5
  574. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +0 -5
  575. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +0 -5
  576. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +0 -5
  577. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +0 -5
  578. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +0 -5
  579. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +0 -5
  580. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +0 -5
  581. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +0 -5
  582. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +0 -5
  583. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +0 -5
  584. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +0 -5
  585. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +0 -5
  586. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +0 -5
  587. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +0 -5
  588. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +0 -5
  589. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +0 -5
  590. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +0 -5
  591. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +0 -5
  592. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +0 -5
  593. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +0 -5
  594. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +0 -5
  595. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +0 -5
  596. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +0 -5
  597. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +0 -5
  598. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +0 -5
  599. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +0 -5
  600. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +0 -5
  601. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +0 -5
  602. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +0 -5
  603. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +0 -5
  604. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +0 -5
  605. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +0 -5
  606. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +0 -5
  607. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +0 -5
  608. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +0 -5
  609. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +0 -5
  610. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +0 -5
  611. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +0 -5
  612. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +0 -5
  613. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +0 -5
  614. data/ext/sources/ggml/src/ggml-kompute/CMakeLists.txt +0 -166
  615. data/ext/sources/ggml/src/ggml-kompute/ggml-kompute.cpp +0 -2251
  616. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/common.comp +0 -112
  617. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_add.comp +0 -58
  618. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_addrow.comp +0 -25
  619. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f16.comp +0 -52
  620. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f32.comp +0 -52
  621. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f16.comp +0 -52
  622. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f32.comp +0 -52
  623. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_diagmask.comp +0 -30
  624. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_gelu.comp +0 -22
  625. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows.comp +0 -17
  626. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f16.comp +0 -31
  627. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f32.comp +0 -31
  628. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_0.comp +0 -38
  629. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_1.comp +0 -39
  630. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q6_k.comp +0 -44
  631. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul.comp +0 -52
  632. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_f16.comp +0 -69
  633. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_mat_f32.comp +0 -51
  634. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_0.comp +0 -33
  635. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_1.comp +0 -35
  636. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_k.comp +0 -140
  637. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q6_k.comp +0 -106
  638. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q8_0.comp +0 -73
  639. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n.comp +0 -52
  640. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n_pre.comp +0 -28
  641. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_norm.comp +0 -84
  642. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_relu.comp +0 -21
  643. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rmsnorm.comp +0 -53
  644. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f16.comp +0 -52
  645. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f32.comp +0 -52
  646. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f16.comp +0 -52
  647. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f32.comp +0 -52
  648. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_scale.comp +0 -19
  649. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_scale_8.comp +0 -23
  650. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_silu.comp +0 -22
  651. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_softmax.comp +0 -72
  652. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/rope_common.comp +0 -71
  653. data/ext/sources/ggml/src/ggml-metal/ggml-metal.m +0 -5998
  654. data/tests/test_package.rb +0 -46
  655. data/tests/test_segment.rb +0 -74
  656. /data/ext/sources/ggml/src/ggml-cpu/{cpu-feats-x86.cpp → arch/x86/cpu-feats.cpp} +0 -0
  657. /data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-hbm.h → hbm.h} +0 -0
  658. /data/{tests → test}/jfk_reader/.gitignore +0 -0
  659. /data/{tests → test}/jfk_reader/extconf.rb +0 -0
  660. /data/{tests → test}/jfk_reader/jfk_reader.c +0 -0
  661. /data/{tests → test}/test_callback.rb +0 -0
  662. /data/{tests → test}/test_error.rb +0 -0
  663. /data/{tests → test}/test_vad.rb +0 -0
  664. /data/{tests → test}/test_vad_params.rb +0 -0
@@ -24,6 +24,7 @@
24
24
 
25
25
  #include <acl/acl.h>
26
26
  #include <stdarg.h>
27
+ #include <aclnnop/aclnn_trans_matmul_weight.h>
27
28
 
28
29
  #include <cmath>
29
30
  #include <cstdio>
@@ -31,6 +32,8 @@
31
32
  #include <mutex>
32
33
  #include <queue>
33
34
  #include <chrono>
35
+ #include <unordered_set>
36
+ #include <optional>
34
37
 
35
38
  #include "ggml-impl.h"
36
39
  #include "ggml-backend-impl.h"
@@ -72,13 +75,12 @@
72
75
  * @param device The device ID to set.
73
76
  */
74
77
  void ggml_cann_set_device(const int32_t device) {
75
- // TODO: uncomment these lines after empty context has fixed.
76
- // int current_device;
77
- // ACL_CHECK(aclrtGetDevice(&current_device));
78
+ int current_device = -1;
79
+ aclrtGetDevice(&current_device);
78
80
 
79
- // if (device == current_device) {
80
- // return;
81
- // }
81
+ if (device == current_device) {
82
+ return;
83
+ }
82
84
  ACL_CHECK(aclrtSetDevice(device));
83
85
  }
84
86
 
@@ -93,6 +95,44 @@ int32_t ggml_cann_get_device() {
93
95
  return id;
94
96
  }
95
97
 
98
+ /**
99
+ * @brief Get the value of the specified environment variable (name).
100
+ * if not empty, return a std::string object
101
+ */
102
+ std::optional<std::string> get_env(const std::string& name) {
103
+ const char* val = std::getenv(name.c_str());
104
+ if (!val) return std::nullopt;
105
+ std::string res = std::string(val);
106
+ std::transform(res.begin(), res.end(), res.begin(), ::tolower);
107
+ return res;
108
+ }
109
+
110
+ /**
111
+ * @brief Verify whether the environment variable is a valid value.
112
+ */
113
+ bool parse_bool(const std::string& value) {
114
+ std::unordered_set<std::string> valid_values = {"on", "1", "yes", "y", "enable", "true"};
115
+ return valid_values.find(value) != valid_values.end();
116
+ }
117
+
118
+ /**
119
+ * @brief Parse a string as an integer, returning 0 if invalid.
120
+ *
121
+ * This function attempts to convert the input string `value` to an `int`.
122
+ * If the string is not a valid integer or is out of the `int` range,
123
+ * it returns 0.
124
+ *
125
+ * @param value The string to parse.
126
+ * @return The parsed integer, or 0 if conversion fails.
127
+ */
128
+ int parse_integer(const std::string& value) {
129
+ try {
130
+ return std::stoi(value);
131
+ } catch (...) {
132
+ return 0;
133
+ }
134
+ }
135
+
96
136
  /**
97
137
  * @brief Initialize the CANN device information.
98
138
  *
@@ -214,7 +254,7 @@ struct ggml_cann_pool_buf_prio : public ggml_cann_pool {
214
254
  * @param device The device ID to associate with this buffer pool.
215
255
  */
216
256
  explicit ggml_cann_pool_buf_prio(int device) : device(device) {
217
- disable_clean = getenv("GGML_CANN_DISABLE_BUF_POOL_CLEAN") != nullptr;
257
+ disable_clean = parse_bool(get_env("GGML_CANN_DISABLE_BUF_POOL_CLEAN").value_or(""));
218
258
  }
219
259
 
220
260
  /**
@@ -410,7 +450,7 @@ struct ggml_cann_pool_buf : public ggml_cann_pool {
410
450
  * @param device The device ID to associate with this buffer pool.
411
451
  */
412
452
  explicit ggml_cann_pool_buf(int device) : device(device) {
413
- disable_clean = getenv("GGML_CANN_DISABLE_BUF_POOL_CLEAN") != nullptr;
453
+ disable_clean = parse_bool(get_env("GGML_CANN_DISABLE_BUF_POOL_CLEAN").value_or(""));
414
454
  }
415
455
 
416
456
  /**
@@ -731,16 +771,18 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
731
771
  */
732
772
  std::unique_ptr<ggml_cann_pool> ggml_backend_cann_context::new_pool_for_device(
733
773
  int device) {
734
- bool disable_vmm = (getenv("GGML_CANN_DISABLE_VMM_POOL") != nullptr);
735
- if (!disable_vmm && ggml_cann_info().devices[device].vmm) {
736
- GGML_LOG_INFO("%s: device %d use vmm pool\n", __func__, device);
737
- return std::unique_ptr<ggml_cann_pool>(new ggml_cann_pool_vmm(device));
738
- }
739
- bool enable_buf_prio = (getenv("GGML_CANN_ENABLE_BUF_PRIO_POOL") != nullptr);
740
- if (enable_buf_prio) {
774
+ std::string mem_pool_type = get_env("GGML_CANN_MEM_POOL").value_or("");
775
+
776
+ if (mem_pool_type == "prio") {
741
777
  GGML_LOG_INFO("%s: device %d use buffer pool with priority queue\n", __func__, device);
742
778
  return std::unique_ptr<ggml_cann_pool>(new ggml_cann_pool_buf_prio(device));
743
779
  }
780
+
781
+ if (ggml_cann_info().devices[device].vmm && mem_pool_type != "leg") {
782
+ GGML_LOG_INFO("%s: device %d use vmm pool\n", __func__, device);
783
+ return std::unique_ptr<ggml_cann_pool>(new ggml_cann_pool_vmm(device));
784
+ }
785
+
744
786
  GGML_LOG_INFO("%s: device %d use buffer pool\n", __func__, device);
745
787
  return std::unique_ptr<ggml_cann_pool>(new ggml_cann_pool_buf(device));
746
788
  }
@@ -1091,6 +1133,98 @@ static enum ggml_status ggml_backend_cann_buffer_init_tensor(
1091
1133
  return GGML_STATUS_SUCCESS;
1092
1134
  }
1093
1135
 
1136
+ /**
1137
+ * @brief Workspace for caching NZ buffers per device.
1138
+ *
1139
+ * This struct manages a device buffer used in NZ computations. It supports
1140
+ * allocation, reallocation, and clearing of cached memory. The struct is
1141
+ * designed to be used with a global array, one per device.
1142
+ */
1143
+ struct ggml_cann_nz_workspace {
1144
+ void* ptr; // Pointer to allocated device buffer
1145
+ size_t allocated; // Size of currently allocated buffer in bytes
1146
+
1147
+ /**
1148
+ * @brief Constructor. Initializes the workspace with no allocated memory.
1149
+ */
1150
+ ggml_cann_nz_workspace() : ptr(nullptr), allocated(0) {}
1151
+
1152
+ /**
1153
+ * @brief Free cached memory and reset the workspace.
1154
+ *
1155
+ * If a buffer has been allocated, this function releases it using
1156
+ * aclrtFree and resets internal state.
1157
+ */
1158
+ void clear() {
1159
+ if (ptr) {
1160
+ ACL_CHECK(aclrtFree(ptr));
1161
+ ptr = nullptr;
1162
+ allocated = 0;
1163
+ }
1164
+ }
1165
+
1166
+ /**
1167
+ * @brief Allocate or reallocate the workspace buffer.
1168
+ *
1169
+ * If the requested size is larger than the currently allocated size,
1170
+ * the old buffer will be freed and a new buffer of the requested size
1171
+ * will be allocated on the device.
1172
+ *
1173
+ * @param new_size Size in bytes to allocate for the workspace.
1174
+ */
1175
+ void realloc(size_t new_size) {
1176
+ if (new_size > allocated) {
1177
+ clear();
1178
+ ACL_CHECK(aclrtMalloc(&ptr, new_size, ACL_MEM_MALLOC_HUGE_FIRST));
1179
+ allocated = new_size;
1180
+ }
1181
+ }
1182
+
1183
+ /**
1184
+ * @brief Get the device buffer pointer.
1185
+ *
1186
+ * @return Pointer to the allocated buffer, or nullptr if not allocated.
1187
+ */
1188
+ void* get() const { return ptr; }
1189
+ };
1190
+
1191
+ /**
1192
+ * @brief Global array of NZ workspaces, one per device.
1193
+ */
1194
+ static ggml_cann_nz_workspace g_nz_workspaces[GGML_CANN_MAX_DEVICES];
1195
+
1196
+ /**
1197
+ * @brief Convert tensor weights to NZ format using Ascend CANN API.
1198
+ *
1199
+ * This function creates a transposed tensor descriptor and performs the
1200
+ * TransMatmulWeight operation. Converting tensor formats can significantly
1201
+ * improve performance on certain hardware.
1202
+ *
1203
+ * @param tensor Pointer to the input ggml_tensor containing the weights.
1204
+ * @param offset Byte offset within the tensor data buffer where weights start.
1205
+ * @param device device id.
1206
+ *
1207
+ * @note The workspace buffer used in this function is managed globally and reused
1208
+ * across calls. This reduces overhead from repeated memory allocation and deallocation.
1209
+ */
1210
+ static void weight_format_to_nz(ggml_tensor *tensor, size_t offset, int device) {
1211
+ aclTensor* weightTransposed = ggml_cann_create_tensor(tensor, tensor->ne,
1212
+ tensor->nb, 2, ACL_FORMAT_ND, offset);
1213
+ uint64_t workspaceSize = 0;
1214
+ aclOpExecutor *executor;
1215
+
1216
+ // TransMatmulWeight
1217
+ ACL_CHECK(aclnnTransMatmulWeightGetWorkspaceSize(weightTransposed,
1218
+ &workspaceSize, &executor));
1219
+ // Avoid frequent malloc/free of the workspace.
1220
+ g_nz_workspaces[device].realloc(workspaceSize);
1221
+
1222
+ void* g_nz_workspace = g_nz_workspaces[device].get();
1223
+
1224
+ ACL_CHECK(aclnnTransMatmulWeight(g_nz_workspace, workspaceSize, executor, nullptr));
1225
+ ACL_CHECK(aclDestroyTensor(weightTransposed));
1226
+ }
1227
+
1094
1228
  // TODO: need handle tensor which has paddings.
1095
1229
  /**
1096
1230
  * @brief Set tensor data in a CANN buffer.
@@ -1115,9 +1249,16 @@ static void ggml_backend_cann_buffer_set_tensor(
1115
1249
  // For acl, synchronous functions use this default stream.
1116
1250
  // Why aclrtSynchronizeDevice?
1117
1251
 
1252
+ // Only check env once.
1253
+ static bool weight_to_nz = parse_bool(get_env("GGML_CANN_WEIGHT_NZ").value_or("on"));
1118
1254
  if (!need_transform(tensor->type)) {
1119
1255
  ACL_CHECK(aclrtMemcpy((char *)tensor->data + offset, size, data, size,
1120
1256
  ACL_MEMCPY_HOST_TO_DEVICE));
1257
+ if (weight_to_nz && is_matmul_weight((const ggml_tensor*)tensor)) {
1258
+ GGML_ASSERT(tensor->ne[2] == 1);
1259
+ GGML_ASSERT(tensor->ne[3] == 1);
1260
+ weight_format_to_nz(tensor, offset, ctx->device);
1261
+ }
1121
1262
  } else {
1122
1263
  void *transform_buffer = malloc(size);
1123
1264
  ggml_backend_cann_transform(tensor, data, transform_buffer);
@@ -1192,6 +1333,10 @@ static bool ggml_backend_cann_buffer_cpy_tensor(
1192
1333
  ACL_MEMCPY_DEVICE_TO_DEVICE));
1193
1334
  return true;
1194
1335
  } else {
1336
+ #ifdef ASCEND_310P
1337
+ // TODO: Support 310p P2P copy
1338
+ return false;
1339
+ #endif
1195
1340
  // Different device but can access by peer.
1196
1341
  int32_t canAccessPeer = 0;
1197
1342
  ACL_CHECK(aclrtDeviceCanAccessPeer(&canAccessPeer, src_ctx->device,
@@ -1351,20 +1496,32 @@ static size_t ggml_backend_cann_buffer_type_get_alloc_size(
1351
1496
  size_t size = ggml_nbytes(tensor);
1352
1497
  int64_t ne0 = tensor->ne[0];
1353
1498
 
1499
+ // Only check env once.
1500
+ static bool weight_to_nz = parse_bool(get_env("GGML_CANN_WEIGHT_NZ").value_or("on"));
1501
+
1354
1502
  // last line must bigger than 32, because every single op deal at
1355
1503
  // least 32 bytes.
1356
1504
  // TODO: quantized type?
1357
1505
  // int64_t line_size = ne0 * ggml_element_size(tensor);
1358
1506
  // int64_t line_size_align_32 = (line_size + 31) & ~31;
1359
1507
  // size += (line_size_align_32 - line_size);
1360
-
1361
- // TODO: not support quantized yet.
1362
- // TODO: consider un-continue tensor.
1363
1508
  if (ggml_is_quantized(tensor->type)) {
1364
1509
  if (ne0 % MATRIX_ROW_PADDING != 0) {
1365
1510
  size += ggml_row_size(
1366
1511
  tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
1367
1512
  }
1513
+ } else if (weight_to_nz && is_matmul_weight((const ggml_tensor*)tensor)) {
1514
+ // NZ format weight are not support quantized yet.
1515
+ // If ND tensor transform to NZ, size may changed.
1516
+ int64_t shape[] = {tensor->ne[1], tensor->ne[0]};
1517
+ GGML_ASSERT(tensor->ne[2] == 1);
1518
+ GGML_ASSERT(tensor->ne[3] == 1);
1519
+ const aclIntArray *acl_shape = aclCreateIntArray(shape, 2);
1520
+ size_t new_size;
1521
+ ACL_CHECK(aclnnCalculateMatmulWeightSizeV2(acl_shape,
1522
+ ggml_cann_type_mapping(tensor->type), &new_size));
1523
+ ACL_CHECK(aclDestroyIntArray(acl_shape));
1524
+ size = std::max(size, new_size);
1368
1525
  }
1369
1526
 
1370
1527
  return size;
@@ -1570,6 +1727,9 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
1570
1727
  case GGML_OP_GET_ROWS:
1571
1728
  ggml_cann_get_rows(ctx, dst);
1572
1729
  break;
1730
+ case GGML_OP_SET_ROWS:
1731
+ ggml_cann_set_rows(ctx, dst);
1732
+ break;
1573
1733
  case GGML_OP_DUP:
1574
1734
  ggml_cann_dup(ctx, dst);
1575
1735
  break;
@@ -1592,16 +1752,18 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
1592
1752
  case GGML_OP_UNARY:
1593
1753
  switch (ggml_get_unary_op(dst)) {
1594
1754
  case GGML_UNARY_OP_ABS:
1595
- GGML_CANN_CALL_UNARY_OP(Abs);
1755
+ GGML_CANN_CALL_OP_UNARY(Abs);
1596
1756
  break;
1597
1757
  case GGML_UNARY_OP_NEG:
1598
- GGML_CANN_CALL_UNARY_OP(Neg);
1758
+ GGML_CANN_CALL_OP_UNARY(Neg);
1599
1759
  break;
1600
1760
  case GGML_UNARY_OP_GELU:
1601
- GGML_CANN_CALL_UNARY_OP(Gelu);
1761
+ case GGML_UNARY_OP_GELU_ERF:
1762
+ // aclnnGelu internally uses the erf-based approximation.
1763
+ GGML_CANN_CALL_OP_UNARY(Gelu);
1602
1764
  break;
1603
1765
  case GGML_UNARY_OP_SILU:
1604
- GGML_CANN_CALL_UNARY_OP(Silu);
1766
+ GGML_CANN_CALL_OP_UNARY(Silu);
1605
1767
  break;
1606
1768
  case GGML_UNARY_OP_GELU_QUICK: {
1607
1769
  auto lambda = [](ggml_backend_cann_context& ctx,
@@ -1609,31 +1771,31 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
1609
1771
  aclTensor* acl_dst) {
1610
1772
  GGML_CANN_CALL_ACLNN_OP(ctx, GeluV2, acl_src, 0, acl_dst);
1611
1773
  };
1612
- ggml_cann_unary_op(lambda, ctx, dst);
1774
+ ggml_cann_op_unary(lambda, ctx, dst);
1613
1775
  } break;
1614
1776
  case GGML_UNARY_OP_TANH:
1615
- GGML_CANN_CALL_UNARY_OP(Tanh);
1777
+ GGML_CANN_CALL_OP_UNARY(Tanh);
1616
1778
  break;
1617
1779
  case GGML_UNARY_OP_RELU:
1618
- GGML_CANN_CALL_UNARY_OP(Relu);
1780
+ GGML_CANN_CALL_OP_UNARY(Relu);
1619
1781
  break;
1620
1782
  case GGML_UNARY_OP_SIGMOID:
1621
- GGML_CANN_CALL_UNARY_OP(Sigmoid);
1783
+ GGML_CANN_CALL_OP_UNARY(Sigmoid);
1622
1784
  break;
1623
1785
  case GGML_UNARY_OP_HARDSIGMOID:
1624
- GGML_CANN_CALL_UNARY_OP(Hardsigmoid);
1786
+ GGML_CANN_CALL_OP_UNARY(Hardsigmoid);
1625
1787
  break;
1626
1788
  case GGML_UNARY_OP_HARDSWISH:
1627
- GGML_CANN_CALL_UNARY_OP(Hardswish);
1789
+ GGML_CANN_CALL_OP_UNARY(Hardswish);
1628
1790
  break;
1629
1791
  case GGML_UNARY_OP_EXP:
1630
- GGML_CANN_CALL_UNARY_OP(Exp);
1792
+ GGML_CANN_CALL_OP_UNARY(Exp);
1631
1793
  break;
1632
1794
  case GGML_UNARY_OP_ELU:
1633
1795
  ggml_cann_elu(ctx, dst);
1634
1796
  break;
1635
1797
  case GGML_UNARY_OP_SGN:
1636
- GGML_CANN_CALL_UNARY_OP(Sign);
1798
+ GGML_CANN_CALL_OP_UNARY(Sign);
1637
1799
  break;
1638
1800
  case GGML_UNARY_OP_STEP:
1639
1801
  ggml_cann_step(ctx, dst);
@@ -1642,6 +1804,31 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
1642
1804
  return false;
1643
1805
  }
1644
1806
  break;
1807
+ case GGML_OP_GLU:
1808
+ switch (ggml_get_glu_op(dst)) {
1809
+ case GGML_GLU_OP_REGLU:
1810
+ GGML_CANN_CALL_OP_UNARY_GATED(Relu);
1811
+ break;
1812
+ case GGML_GLU_OP_GEGLU:
1813
+ case GGML_GLU_OP_GEGLU_ERF:
1814
+ // aclnnGelu internally uses the erf-based approximation.
1815
+ GGML_CANN_CALL_OP_UNARY_GATED(Gelu);
1816
+ break;
1817
+ case GGML_GLU_OP_SWIGLU:
1818
+ GGML_CANN_CALL_OP_UNARY_GATED(Silu);
1819
+ break;
1820
+ case GGML_GLU_OP_GEGLU_QUICK: {
1821
+ auto lambda = [](ggml_backend_cann_context& ctx,
1822
+ aclTensor* acl_src,
1823
+ aclTensor* acl_dst) {
1824
+ GGML_CANN_CALL_ACLNN_OP(ctx, GeluV2, acl_src, 0, acl_dst);
1825
+ };
1826
+ ggml_cann_op_unary_gated(lambda, ctx, dst);
1827
+ } break;
1828
+ default:
1829
+ return false;
1830
+ }
1831
+ break;
1645
1832
  case GGML_OP_NORM:
1646
1833
  ggml_cann_norm(ctx, dst);
1647
1834
  break;
@@ -1684,7 +1871,7 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
1684
1871
  ggml_cann_binary_op<aclnn_mul>(ctx, dst);
1685
1872
  break;
1686
1873
  case GGML_OP_SQRT:
1687
- GGML_CANN_CALL_UNARY_OP(Sqrt);
1874
+ GGML_CANN_CALL_OP_UNARY(Sqrt);
1688
1875
  break;
1689
1876
  case GGML_OP_CLAMP:
1690
1877
  ggml_cann_clamp(ctx, dst);
@@ -1729,16 +1916,16 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
1729
1916
  ggml_cann_argmax(ctx, dst);
1730
1917
  break;
1731
1918
  case GGML_OP_COS:
1732
- ggml_cann_unary_op<aclnn_cos>(ctx, dst);
1919
+ ggml_cann_op_unary<aclnn_cos>(ctx, dst);
1733
1920
  break;
1734
1921
  case GGML_OP_SIN:
1735
- ggml_cann_unary_op<aclnn_sin>(ctx, dst);
1922
+ ggml_cann_op_unary<aclnn_sin>(ctx, dst);
1736
1923
  break;
1737
1924
  case GGML_OP_CONV_TRANSPOSE_1D:
1738
1925
  ggml_cann_conv_transpose_1d(ctx, dst);
1739
1926
  break;
1740
1927
  case GGML_OP_LOG:
1741
- GGML_CANN_CALL_UNARY_OP(Log);
1928
+ GGML_CANN_CALL_OP_UNARY(Log);
1742
1929
  break;
1743
1930
  case GGML_OP_MEAN:
1744
1931
  ggml_cann_mean(ctx, dst);
@@ -1871,6 +2058,8 @@ static bool ggml_backend_cann_cpy_tensor_async(
1871
2058
  GGML_ASSERT(ggml_backend_is_cann(backend_src) ||
1872
2059
  ggml_backend_is_cann(backend_dst));
1873
2060
 
2061
+ GGML_ASSERT(!is_matmul_weight((const ggml_tensor*)src));
2062
+
1874
2063
  if (!ggml_backend_buffer_is_cann(src->buffer) ||
1875
2064
  !ggml_backend_buffer_is_cann(dst->buffer)) {
1876
2065
  return false;
@@ -1887,7 +2076,14 @@ static bool ggml_backend_cann_cpy_tensor_async(
1887
2076
  (ggml_backend_cann_context*)backend_dst->context;
1888
2077
 
1889
2078
  size_t copy_size = ggml_nbytes(dst);
2079
+ if (copy_size == 0) {
2080
+ return true;
2081
+ }
1890
2082
  if (backend_src != backend_dst) {
2083
+ #ifdef ASCEND_310P
2084
+ // TODO: Support 310p P2P copy
2085
+ return false;
2086
+ #endif
1891
2087
  ggml_backend_cann_buffer_context* buf_ctx_src =
1892
2088
  (ggml_backend_cann_buffer_context*)buf_src->context;
1893
2089
  ggml_backend_cann_buffer_context* buf_ctx_dst =
@@ -1904,7 +2100,6 @@ static bool ggml_backend_cann_cpy_tensor_async(
1904
2100
  }
1905
2101
 
1906
2102
  // need open both directions for memcpyasync between devices.
1907
- ggml_cann_set_device(cann_ctx_dst->device);
1908
2103
  ACL_CHECK(aclrtDeviceEnablePeerAccess(cann_ctx_src->device, 0));
1909
2104
  ggml_cann_set_device(cann_ctx_src->device);
1910
2105
  ACL_CHECK(aclrtDeviceEnablePeerAccess(cann_ctx_dst->device, 0));
@@ -1914,9 +2109,17 @@ static bool ggml_backend_cann_cpy_tensor_async(
1914
2109
  ACL_CHECK(aclrtMemcpyAsync(dst->data, copy_size, src->data, copy_size,
1915
2110
  ACL_MEMCPY_DEVICE_TO_DEVICE,
1916
2111
  cann_ctx_src->stream()));
1917
-
1918
- //TODO: workaround for Event didn`t work here.
1919
- aclrtSynchronizeStream(cann_ctx_src->stream());
2112
+ // record event on src stream after the copy
2113
+ // TODO: this event is not effective with acl graph mode, change to use aclrtSynchronizeStream
2114
+ // if (!cann_ctx_src->copy_event) {
2115
+ // ACL_CHECK(aclrtCreateEventWithFlag(&cann_ctx_src->copy_event, ACL_EVENT_SYNC));
2116
+ // }
2117
+ // ACL_CHECK(aclrtRecordEvent(cann_ctx_src->copy_event, cann_ctx_src->stream()));
2118
+
2119
+ // // wait on dst stream for the copy to complete
2120
+ // ggml_cann_set_device(cann_ctx_dst->device);
2121
+ // ACL_CHECK(aclrtStreamWaitEvent(cann_ctx_dst->stream(), cann_ctx_src->copy_event));
2122
+ ACL_CHECK(aclrtSynchronizeStream(cann_ctx_src->stream()));
1920
2123
  } else {
1921
2124
  // src and dst are on the same backend
1922
2125
  ACL_CHECK(aclrtMemcpyAsync(dst->data, copy_size, src->data, copy_size,
@@ -1943,6 +2146,193 @@ static void ggml_backend_cann_synchronize(ggml_backend_t backend) {
1943
2146
  ACL_CHECK(aclrtSynchronizeStream(cann_ctx->stream()));
1944
2147
  }
1945
2148
 
2149
+ #ifdef USE_ACL_GRAPH
2150
+ /**
2151
+ * @brief Add a new CANN graph to the LRU cache by populating node properties from the ggml graph.
2152
+ *
2153
+ * This function creates a new ggml_cann_graph object and fills its node properties
2154
+ * (operation type, dimensions, strides, input sources, and operation parameters)
2155
+ * based on the current ggml computation graph.
2156
+ *
2157
+ * Each node in the ggml graph is mapped to a property entry in the new CANN graph:
2158
+ * - node address
2159
+ * - operation type
2160
+ * - shape (ne) and strides (nb)
2161
+ * - source tensor addresses
2162
+ * - operation parameters
2163
+ *
2164
+ * After initialization, the new graph is pushed into the LRU cache owned by the
2165
+ * CANN backend context. The cache takes ownership of the graph and manages its
2166
+ * lifetime (including deletion upon eviction).
2167
+ *
2168
+ * @param cann_ctx The CANN backend context containing the graph cache.
2169
+ * @param cgraph The current ggml computation graph.
2170
+ */
2171
+ static void add_lru_matched_graph_node_properties(
2172
+ ggml_backend_cann_context * cann_ctx,
2173
+ ggml_cgraph * cgraph) {
2174
+ // Create a new ggml_cann_graph object on the heap (its lifetime is managed by the cache).
2175
+ ggml_cann_graph * new_graph = new ggml_cann_graph();
2176
+ new_graph->ggml_graph_properties.resize(cgraph->n_nodes);
2177
+
2178
+ for (int node_idx = 0; node_idx < cgraph->n_nodes; ++node_idx) {
2179
+ ggml_tensor * node = cgraph->nodes[node_idx];
2180
+ auto & prop = new_graph->ggml_graph_properties[node_idx];
2181
+
2182
+ prop.node_address = node->data;
2183
+ prop.node_op = node->op;
2184
+
2185
+ std::copy_n(node->ne, GGML_MAX_DIMS, prop.ne);
2186
+ std::copy_n(node->nb, GGML_MAX_DIMS, prop.nb);
2187
+
2188
+ for (int src = 0; src < GGML_MAX_SRC; ++src) {
2189
+ prop.src_address[src] = node->src[src] ? node->src[src]->data : nullptr;
2190
+ }
2191
+
2192
+ memcpy(prop.op_params, node->op_params, GGML_MAX_OP_PARAMS);
2193
+ }
2194
+
2195
+ // Insert into the LRU cache (cache takes ownership and will delete it when evicted).
2196
+ cann_ctx->graph_lru_cache.push(new_graph);
2197
+ }
2198
+
2199
+ /**
2200
+ * @brief Check if a ggml tensor node matches a previously captured CANN graph node.
2201
+ *
2202
+ * This function compares all relevant fields (address, op type, shape, source inputs, op params)
2203
+ * to determine whether the current node matches a previously recorded version.
2204
+ *
2205
+ * @param node The current ggml tensor node.
2206
+ * @param graph_node_properties The stored properties of a CANN graph node.
2207
+ * @return true if all fields match (excluding GGML_OP_VIEW); false otherwise.
2208
+ */
2209
+ static bool ggml_graph_node_has_matching_properties(ggml_tensor * node, ggml_graph_node_properties * graph_node_properties) {
2210
+ if (node->data != graph_node_properties->node_address &&
2211
+ node->op != GGML_OP_VIEW) {
2212
+ return false;
2213
+ }
2214
+ if (node->op != graph_node_properties->node_op) {
2215
+ return false;
2216
+ }
2217
+ for (int i = 0; i < GGML_MAX_DIMS; i++) {
2218
+ if (node->ne[i] != graph_node_properties->ne[i]) {
2219
+ return false;
2220
+ }
2221
+ if (node->nb[i] != graph_node_properties->nb[i]) {
2222
+ return false;
2223
+ }
2224
+ }
2225
+ for (int i = 0; i < GGML_MAX_SRC; i++) {
2226
+ if (node->src[i] &&
2227
+ node->src[i]->data != graph_node_properties->src_address[i] &&
2228
+ node->op != GGML_OP_VIEW
2229
+ ) {
2230
+ return false;
2231
+ }
2232
+ }
2233
+ if (node->op == GGML_OP_SCALE &&
2234
+ memcmp(graph_node_properties->op_params, node->op_params, GGML_MAX_OP_PARAMS) != 0) {
2235
+ return false;
2236
+ }
2237
+ return true;
2238
+ }
2239
+
2240
+ /**
2241
+ * @brief Check whether there is a cached CANN graph that matches the current ggml graph.
2242
+ *
2243
+ * This function iterates through the cached CANN graphs stored in the LRU cache and
2244
+ * compares them against the given ggml computation graph. A match requires that the
2245
+ * number of nodes is the same and that each node’s properties (operation type,
2246
+ * dimensions, strides, inputs, and operation parameters) are identical.
2247
+ *
2248
+ * If a matching graph is found, it is promoted to the front of the LRU cache and the
2249
+ * function returns true. Otherwise, the function returns false, indicating that a new
2250
+ * CANN graph needs to be captured.
2251
+ *
2252
+ * @param cann_ctx The CANN backend context containing the graph cache.
2253
+ * @param cgraph The current ggml computation graph.
2254
+ * @return true if a matching cached graph exists; false otherwise.
2255
+ */
2256
+ static bool is_matched_graph(ggml_backend_cann_context * cann_ctx, ggml_cgraph * cgraph) {
2257
+ ggml_cann_graph_lru_cache &lru_cache = cann_ctx->graph_lru_cache;
2258
+ for (auto &graph_ptr : lru_cache.cache_list) {
2259
+ // Skip graphs with a different number of nodes.
2260
+ if (graph_ptr->ggml_graph_properties.size() != static_cast<size_t>(cgraph->n_nodes)) {
2261
+ continue;
2262
+ }
2263
+
2264
+ // Check if all nodes match.
2265
+ bool all_match = true;
2266
+ for (int i = 0; i < cgraph->n_nodes; ++i) {
2267
+ if (!ggml_graph_node_has_matching_properties(cgraph->nodes[i], &graph_ptr->ggml_graph_properties[i])) {
2268
+ all_match = false;
2269
+ break;
2270
+ }
2271
+ }
2272
+
2273
+ if (all_match) {
2274
+ // update cache_list && renturn graph_ptr
2275
+ lru_cache.move_to_front(graph_ptr);
2276
+ return true;
2277
+ }
2278
+ }
2279
+
2280
+ return false;
2281
+ }
2282
+ #endif // USE_ACL_GRAPH
2283
+
2284
+ /**
2285
+ * @brief Evaluate the computation graph and optionally capture or execute it using CANN graph API.
2286
+ *
2287
+ * If CANN graph execution is enabled and graph capture is required, this function begins
2288
+ * graph capture, runs the graph, ends capture, and stores the captured graph.
2289
+ *
2290
+ * Otherwise, it falls back to op-by-op execution using the CANN compute kernel dispatcher.
2291
+ *
2292
+ * @param cann_ctx The CANN backend context.
2293
+ * @param cgraph The ggml computation graph.
2294
+ * @param use_cann_graph Whether to use CANN graph execution.
2295
+ * @param cann_graph_update_required Whether graph capture is needed due to graph changes.
2296
+ */
2297
+ static void evaluate_and_capture_cann_graph(ggml_backend_cann_context * cann_ctx, ggml_cgraph * cgraph,
2298
+ bool & use_cann_graph, bool & cann_graph_update_required) {
2299
+ #ifdef USE_ACL_GRAPH
2300
+ ggml_cann_graph* matched_graph = cann_ctx->graph_lru_cache.cache_list.front();
2301
+ if (use_cann_graph && cann_graph_update_required) {
2302
+ ACL_CHECK(aclmdlRICaptureBegin(cann_ctx->stream(), ACL_MODEL_RI_CAPTURE_MODE_GLOBAL));
2303
+ }
2304
+ #endif // USE_ACL_GRAPH
2305
+ // Only perform the graph execution if CANN graphs are not enabled, or we are capturing the graph.
2306
+ // With the use of CANN graphs, the execution will be performed by the graph launch.
2307
+ if (!use_cann_graph || cann_graph_update_required) {
2308
+ for (int i = 0; i < cgraph->n_nodes; i++) {
2309
+ ggml_tensor * node = cgraph->nodes[i];
2310
+
2311
+ if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
2312
+ continue;
2313
+ }
2314
+
2315
+ bool ok = ggml_cann_compute_forward(*cann_ctx, node);
2316
+ if (!ok) {
2317
+ GGML_LOG_ERROR("%s: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
2318
+ }
2319
+ GGML_ASSERT(ok);
2320
+ }
2321
+ }
2322
+
2323
+ #ifdef USE_ACL_GRAPH
2324
+ if (use_cann_graph && cann_graph_update_required) { // End CANN graph capture
2325
+ ACL_CHECK(aclmdlRICaptureEnd(cann_ctx->stream(), &matched_graph->graph));
2326
+ }
2327
+
2328
+ if (use_cann_graph) {
2329
+ // Execute graph
2330
+ ACL_CHECK(aclmdlRIExecuteAsync(matched_graph->graph, cann_ctx->stream()));
2331
+ }
2332
+ #endif // USE_ACL_GRAPH
2333
+ }
2334
+
2335
+
1946
2336
  /**
1947
2337
  * @brief Computes a computational graph using a CANN backend.
1948
2338
  *
@@ -1959,24 +2349,53 @@ static enum ggml_status ggml_backend_cann_graph_compute(
1959
2349
  ggml_backend_t backend, ggml_cgraph* cgraph) {
1960
2350
  ggml_backend_cann_context* cann_ctx =
1961
2351
  (ggml_backend_cann_context*)backend->context;
1962
-
1963
2352
  ggml_cann_set_device(cann_ctx->device);
1964
-
1965
- for (int i = 0; i < cgraph->n_nodes; i++) {
1966
- ggml_tensor* node = cgraph->nodes[i];
1967
-
1968
- if (ggml_is_empty(node) || node->op == GGML_OP_NONE) {
1969
- continue;
2353
+ g_nz_workspaces[cann_ctx->device].clear();
2354
+
2355
+ // calculate rope cache for fist layer in current device.
2356
+ cann_ctx->rope_cache.cached = false;
2357
+
2358
+ #ifdef USE_ACL_GRAPH
2359
+ bool use_cann_graph = true;
2360
+ bool cann_graph_update_required = false;
2361
+
2362
+ static bool prefill_use_graph = parse_bool(get_env("GGML_CANN_PREFILL_USE_GRAPH").value_or(""));
2363
+ if (!prefill_use_graph) {
2364
+ // Do not use acl_graph for prefill.
2365
+ for (int i = 0; i < cgraph->n_nodes; i++) {
2366
+ ggml_tensor * node = cgraph->nodes[i];
2367
+ // TODO: Optimize here. Currently, we can only
2368
+ // get seq_len by FA's input.
2369
+ if (node->op == GGML_OP_FLASH_ATTN_EXT) {
2370
+ // Q -> src[0], shape: [B, S, N, D]
2371
+ use_cann_graph = (node->src[0]->ne[1] == 1);
2372
+ break;
2373
+ }
1970
2374
  }
2375
+ }
1971
2376
 
1972
- bool ok = ggml_cann_compute_forward(*cann_ctx, node);
2377
+ if (!cann_ctx->acl_graph_mode) {
2378
+ use_cann_graph = false;
2379
+ }
1973
2380
 
1974
- if (!ok) {
1975
- GGML_LOG_ERROR("%s: error: op not supported %s (%s)\n", __func__,
1976
- node->name, ggml_op_name(node->op));
2381
+ if (use_cann_graph) {
2382
+ // If no matching graph is found, the graph needs to be recaptured.
2383
+ cann_graph_update_required = !is_matched_graph(cann_ctx, cgraph);
2384
+ if (cann_graph_update_required) {
2385
+ // If no matching graph is found, add a new ACL graph.
2386
+ add_lru_matched_graph_node_properties(cann_ctx, cgraph);
1977
2387
  }
1978
- GGML_ASSERT(ok);
1979
2388
  }
2389
+ #else
2390
+ bool use_cann_graph = false;
2391
+ bool cann_graph_update_required = false;
2392
+ #endif // USE_ACL_GRAPH
2393
+ evaluate_and_capture_cann_graph(
2394
+ cann_ctx,
2395
+ cgraph,
2396
+ use_cann_graph,
2397
+ cann_graph_update_required
2398
+ );
1980
2399
 
1981
2400
  return GGML_STATUS_SUCCESS;
1982
2401
  }
@@ -2012,10 +2431,23 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
2012
2431
  case GGML_UNARY_OP_ELU:
2013
2432
  case GGML_UNARY_OP_SGN:
2014
2433
  case GGML_UNARY_OP_STEP:
2434
+ case GGML_UNARY_OP_GELU_ERF:
2015
2435
  return true;
2016
2436
  default:
2017
2437
  return false;
2018
2438
  }
2439
+ case GGML_OP_GLU:
2440
+ switch (ggml_get_glu_op(op)) {
2441
+ case GGML_GLU_OP_REGLU:
2442
+ case GGML_GLU_OP_GEGLU:
2443
+ case GGML_GLU_OP_SWIGLU:
2444
+ case GGML_GLU_OP_GEGLU_ERF:
2445
+ case GGML_GLU_OP_GEGLU_QUICK:
2446
+ return true;
2447
+ default:
2448
+ return false;
2449
+ }
2450
+ break;
2019
2451
  case GGML_OP_MUL_MAT: {
2020
2452
  switch (op->src[0]->type) {
2021
2453
  case GGML_TYPE_F16:
@@ -2024,7 +2456,7 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
2024
2456
  case GGML_TYPE_Q8_0:
2025
2457
  case GGML_TYPE_Q4_0:
2026
2458
  #ifdef ASCEND_310P
2027
- // Q4 && Q8 per group is not suppor on 310p device
2459
+ // Q4 && Q8 per group is not support on 310p device
2028
2460
  return false;
2029
2461
  #endif
2030
2462
  // only support contiguous for quantized types.
@@ -2042,7 +2474,7 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
2042
2474
  case GGML_TYPE_Q8_0:
2043
2475
  case GGML_TYPE_Q4_0:
2044
2476
  #ifdef ASCEND_310P
2045
- // Q4 && Q8 per group is not suppor on 310p device
2477
+ // Q4 && Q8 per group is not support on 310p device
2046
2478
  return false;
2047
2479
  #endif
2048
2480
  // only support contiguous for quantized types.
@@ -2062,6 +2494,15 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
2062
2494
  return false;
2063
2495
  }
2064
2496
  } break;
2497
+ case GGML_OP_SET_ROWS: {
2498
+ switch (op->type) {
2499
+ case GGML_TYPE_F32:
2500
+ case GGML_TYPE_F16:
2501
+ return true;
2502
+ default:
2503
+ return false;
2504
+ }
2505
+ } break;
2065
2506
  case GGML_OP_CPY: {
2066
2507
  ggml_tensor *src = op->src[0];
2067
2508
  if ((op->type != GGML_TYPE_F32 && op->type != GGML_TYPE_F16) ||
@@ -2070,12 +2511,6 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
2070
2511
  // only support F32 and F16.
2071
2512
  return false;
2072
2513
  }
2073
-
2074
- if (!ggml_are_same_shape(op, src) && !ggml_is_contiguous(op)) {
2075
- // unsupport dst is not contiguous.
2076
- return false;
2077
- }
2078
-
2079
2514
  return true;
2080
2515
  } break;
2081
2516
  case GGML_OP_CONT: {
@@ -2090,16 +2525,10 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
2090
2525
  }
2091
2526
  case GGML_OP_ROPE: {
2092
2527
  // TODO: with ops-test v == 1
2093
- float ext_factor = 0.0f;
2094
- memcpy(&ext_factor, (const float *) op->op_params + 7, sizeof(float));
2095
2528
  // TODO: n_dims <= ne0
2096
2529
  if (op->src[0]->ne[0] != op->op_params[1]) {
2097
2530
  return false;
2098
2531
  }
2099
- // TODO: ext_factor != 0
2100
- if (ext_factor != 0) {
2101
- return false;
2102
- }
2103
2532
 
2104
2533
  const int mode = ((const int32_t *) op->op_params)[2];
2105
2534
  if (mode & GGML_ROPE_TYPE_MROPE) {
@@ -2108,10 +2537,11 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
2108
2537
  if (mode & GGML_ROPE_TYPE_VISION) {
2109
2538
  return false;
2110
2539
  }
2111
-
2540
+ #ifdef ASCEND_310P
2112
2541
  if(!ggml_is_contiguous(op->src[0])){
2113
2542
  return false;
2114
2543
  }
2544
+ #endif
2115
2545
  return true;
2116
2546
  }
2117
2547
  case GGML_OP_UPSCALE: {
@@ -2141,8 +2571,8 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
2141
2571
  // value of paddingW should be at most half of kernelW
2142
2572
  return (p0 <= (k0 / 2)) && (p1 <= (k1 / 2));
2143
2573
  }
2144
- case GGML_OP_SUM:
2145
2574
  case GGML_OP_DUP:
2575
+ case GGML_OP_SUM:
2146
2576
  case GGML_OP_IM2COL:
2147
2577
  case GGML_OP_CONCAT:
2148
2578
  case GGML_OP_REPEAT:
@@ -2158,12 +2588,10 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
2158
2588
  case GGML_OP_MUL:
2159
2589
  case GGML_OP_DIV:
2160
2590
  case GGML_OP_RMS_NORM:
2161
- case GGML_OP_SCALE:
2162
2591
  case GGML_OP_SQR:
2163
2592
  case GGML_OP_SQRT:
2164
2593
  case GGML_OP_CLAMP:
2165
2594
  case GGML_OP_DIAG_MASK_INF:
2166
- case GGML_OP_SOFT_MAX:
2167
2595
  case GGML_OP_SUM_ROWS:
2168
2596
  case GGML_OP_ARGSORT:
2169
2597
  case GGML_OP_ACC:
@@ -2175,13 +2603,29 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
2175
2603
  case GGML_OP_ARGMAX:
2176
2604
  case GGML_OP_COS:
2177
2605
  case GGML_OP_SIN:
2178
- case GGML_OP_CONV_TRANSPOSE_1D:
2179
2606
  case GGML_OP_LOG:
2180
2607
  case GGML_OP_MEAN:
2181
2608
  case GGML_OP_PAD_REFLECT_1D:
2182
2609
  case GGML_OP_COUNT_EQUAL:
2183
2610
  return true;
2611
+ case GGML_OP_CONV_TRANSPOSE_1D:
2612
+ // TODO: ((weightL - 1) * dilationW - padLeft)=1336 should not be larger than 255.
2613
+ return (op->src[0]->ne[0] - 1) <= 255;
2614
+ case GGML_OP_SCALE:
2615
+ float bias;
2616
+ memcpy(&bias, (const float *)(op->op_params) + 1, sizeof(float));
2617
+ return bias == 0.0f; // TODO: support bias != 0.0f
2618
+ case GGML_OP_SOFT_MAX:
2619
+ // TODO: support attention sinks [TAG_ATTN_SINKS]
2620
+ if (op->src[2]) {
2621
+ return false;
2622
+ }
2623
+ return true;
2184
2624
  case GGML_OP_FLASH_ATTN_EXT:{
2625
+ #ifdef ASCEND_310P
2626
+ // FA not support on 310p device
2627
+ return false;
2628
+ #endif
2185
2629
  // derived from [ggml-cuda.cu]
2186
2630
  if(op->src[1]->type != GGML_TYPE_F16 || op->src[2]->type != GGML_TYPE_F16){
2187
2631
  return false;
@@ -2192,22 +2636,20 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
2192
2636
  if(op->type != GGML_TYPE_F16 && op->type != GGML_TYPE_F32 && op->type != GGML_TYPE_BF16){
2193
2637
  return false;
2194
2638
  }
2195
- if (op->src[1]->ne[0] != op->src[2]->ne[0]) {
2196
- // different head sizes of K and V are not supported yet
2197
- return false;
2198
- }
2199
- if (op->src[0]->ne[0] == 192) {
2639
+ // TODO: support attention sinks [TAG_ATTN_SINKS]
2640
+ if (op->src[4]) {
2200
2641
  return false;
2201
2642
  }
2202
- if (op->src[0]->ne[0] == 576) {
2203
- // DeepSeek MLA
2643
+ if (op->src[1]->ne[0] != op->src[2]->ne[0]) {
2644
+ // different head sizes of K and V are not supported yet
2204
2645
  return false;
2205
2646
  }
2206
- if (op->src[0]->ne[3] != 1) {
2647
+ if (op->src[0]->ne[0] % 16 != 0) {
2648
+ // TODO: padding to support
2207
2649
  return false;
2208
2650
  }
2209
2651
  float logitSoftcap = 0.0f;
2210
- memcpy(&logitSoftcap, (float*)op->op_params + 2, sizeof(float));
2652
+ memcpy(&logitSoftcap, (const float *)(op->op_params) + 2, sizeof(float));
2211
2653
  if(logitSoftcap != 0.0f) {
2212
2654
  return false;
2213
2655
  }
@@ -2314,6 +2756,7 @@ static const ggml_backend_i ggml_backend_cann_interface = {
2314
2756
  /* .graph_compute = */ ggml_backend_cann_graph_compute,
2315
2757
  /* .event_record = */ ggml_backend_cann_event_record,
2316
2758
  /* .event_wait = */ ggml_backend_cann_event_wait,
2759
+ /* .graph_optimize = */ NULL,
2317
2760
  };
2318
2761
 
2319
2762
  /**