whispercpp 1.3.3 → 1.3.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (963)
  1. checksums.yaml +4 -4
  2. data/README.md +60 -43
  3. data/ext/extconf.rb +2 -2
  4. data/ext/ruby_whisper.c +14 -2
  5. data/ext/ruby_whisper.h +39 -0
  6. data/ext/ruby_whisper_context.c +22 -22
  7. data/ext/ruby_whisper_model.c +12 -12
  8. data/ext/ruby_whisper_params.c +79 -25
  9. data/ext/ruby_whisper_segment.c +84 -19
  10. data/ext/ruby_whisper_token.c +351 -0
  11. data/ext/ruby_whisper_transcribe.cpp +1 -1
  12. data/ext/ruby_whisper_vad_context.c +75 -0
  13. data/ext/ruby_whisper_vad_context_detect.cpp +50 -0
  14. data/ext/ruby_whisper_vad_segment.c +139 -0
  15. data/ext/ruby_whisper_vad_segments.c +106 -0
  16. data/ext/sources/CMakeLists.txt +4 -1
  17. data/ext/sources/bindings/javascript/package.json +1 -1
  18. data/ext/sources/cmake/arm64-apple-clang.cmake +16 -0
  19. data/ext/sources/cmake/arm64-windows-llvm.cmake +16 -0
  20. data/ext/sources/cmake/riscv64-spacemit-linux-gnu-gcc.cmake +29 -0
  21. data/ext/sources/cmake/x64-windows-llvm.cmake +5 -0
  22. data/ext/sources/examples/CMakeLists.txt +1 -0
  23. data/ext/sources/examples/addon.node/addon.cpp +19 -19
  24. data/ext/sources/examples/addon.node/index.js +7 -5
  25. data/ext/sources/examples/addon.node/vad-example.js +2 -2
  26. data/ext/sources/examples/bench/bench.cpp +26 -16
  27. data/ext/sources/examples/bench.wasm/index-tmpl.html +10 -9
  28. data/ext/sources/examples/cli/cli.cpp +122 -111
  29. data/ext/sources/examples/command/command.cpp +26 -24
  30. data/ext/sources/examples/command.wasm/index-tmpl.html +5 -4
  31. data/ext/sources/examples/common-ggml.cpp +2 -0
  32. data/ext/sources/examples/lsp/CMakeLists.txt +2 -1
  33. data/ext/sources/examples/lsp/lsp.cpp +19 -17
  34. data/ext/sources/examples/quantize/CMakeLists.txt +2 -1
  35. data/ext/sources/examples/server/server.cpp +34 -24
  36. data/ext/sources/examples/server.py +6 -1
  37. data/ext/sources/examples/stream/stream.cpp +4 -2
  38. data/ext/sources/examples/stream.wasm/emscripten.cpp +6 -6
  39. data/ext/sources/examples/stream.wasm/index-tmpl.html +82 -5
  40. data/ext/sources/examples/talk-llama/CMakeLists.txt +7 -3
  41. data/ext/sources/examples/talk-llama/llama-adapter.cpp +113 -7
  42. data/ext/sources/examples/talk-llama/llama-adapter.h +13 -1
  43. data/ext/sources/examples/talk-llama/llama-arch.cpp +2136 -1491
  44. data/ext/sources/examples/talk-llama/llama-arch.h +125 -3
  45. data/ext/sources/examples/talk-llama/llama-batch.cpp +174 -100
  46. data/ext/sources/examples/talk-llama/llama-batch.h +46 -20
  47. data/ext/sources/examples/talk-llama/llama-chat.cpp +199 -8
  48. data/ext/sources/examples/talk-llama/llama-chat.h +11 -0
  49. data/ext/sources/examples/talk-llama/llama-context.cpp +1213 -413
  50. data/ext/sources/examples/talk-llama/llama-context.h +99 -36
  51. data/ext/sources/examples/talk-llama/llama-cparams.h +5 -4
  52. data/ext/sources/examples/talk-llama/llama-grammar.cpp +288 -53
  53. data/ext/sources/examples/talk-llama/llama-grammar.h +22 -1
  54. data/ext/sources/examples/talk-llama/llama-graph.cpp +883 -294
  55. data/ext/sources/examples/talk-llama/llama-graph.h +361 -161
  56. data/ext/sources/examples/talk-llama/llama-hparams.cpp +144 -6
  57. data/ext/sources/examples/talk-llama/llama-hparams.h +100 -23
  58. data/ext/sources/examples/talk-llama/llama-impl.cpp +7 -3
  59. data/ext/sources/examples/talk-llama/llama-impl.h +3 -1
  60. data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.cpp +328 -0
  61. data/ext/sources/examples/talk-llama/{llama-kv-cache-unified-iswa.h → llama-kv-cache-iswa.h} +38 -29
  62. data/ext/sources/examples/talk-llama/llama-kv-cache.cpp +2100 -0
  63. data/ext/sources/examples/talk-llama/llama-kv-cache.h +373 -27
  64. data/ext/sources/examples/talk-llama/llama-kv-cells.h +124 -30
  65. data/ext/sources/examples/talk-llama/llama-memory-hybrid.cpp +63 -41
  66. data/ext/sources/examples/talk-llama/llama-memory-hybrid.h +30 -29
  67. data/ext/sources/examples/talk-llama/llama-memory-recurrent.cpp +77 -35
  68. data/ext/sources/examples/talk-llama/llama-memory-recurrent.h +15 -16
  69. data/ext/sources/examples/talk-llama/llama-memory.h +16 -10
  70. data/ext/sources/examples/talk-llama/llama-mmap.cpp +172 -37
  71. data/ext/sources/examples/talk-llama/llama-mmap.h +8 -3
  72. data/ext/sources/examples/talk-llama/llama-model-loader.cpp +93 -9
  73. data/ext/sources/examples/talk-llama/llama-model-loader.h +9 -2
  74. data/ext/sources/examples/talk-llama/llama-model-saver.cpp +3 -0
  75. data/ext/sources/examples/talk-llama/llama-model.cpp +3369 -10145
  76. data/ext/sources/examples/talk-llama/llama-model.h +104 -12
  77. data/ext/sources/examples/talk-llama/llama-quant.cpp +53 -30
  78. data/ext/sources/examples/talk-llama/llama-sampling.cpp +1520 -324
  79. data/ext/sources/examples/talk-llama/llama-sampling.h +19 -7
  80. data/ext/sources/examples/talk-llama/llama-vocab.cpp +562 -39
  81. data/ext/sources/examples/talk-llama/llama-vocab.h +50 -0
  82. data/ext/sources/examples/talk-llama/llama.cpp +794 -12
  83. data/ext/sources/examples/talk-llama/llama.h +246 -190
  84. data/ext/sources/examples/talk-llama/models/afmoe.cpp +191 -0
  85. data/ext/sources/examples/talk-llama/models/apertus.cpp +125 -0
  86. data/ext/sources/examples/talk-llama/models/arcee.cpp +135 -0
  87. data/ext/sources/examples/talk-llama/models/arctic.cpp +138 -0
  88. data/ext/sources/examples/talk-llama/models/arwkv7.cpp +86 -0
  89. data/ext/sources/examples/talk-llama/models/baichuan.cpp +122 -0
  90. data/ext/sources/examples/talk-llama/models/bailingmoe.cpp +144 -0
  91. data/ext/sources/examples/talk-llama/models/bailingmoe2.cpp +135 -0
  92. data/ext/sources/examples/talk-llama/models/bert.cpp +178 -0
  93. data/ext/sources/examples/talk-llama/models/bitnet.cpp +160 -0
  94. data/ext/sources/examples/talk-llama/models/bloom.cpp +101 -0
  95. data/ext/sources/examples/talk-llama/models/chameleon.cpp +178 -0
  96. data/ext/sources/examples/talk-llama/models/chatglm.cpp +132 -0
  97. data/ext/sources/examples/talk-llama/models/codeshell.cpp +111 -0
  98. data/ext/sources/examples/talk-llama/models/cogvlm.cpp +102 -0
  99. data/ext/sources/examples/talk-llama/models/cohere2-iswa.cpp +134 -0
  100. data/ext/sources/examples/talk-llama/models/command-r.cpp +122 -0
  101. data/ext/sources/examples/talk-llama/models/dbrx.cpp +123 -0
  102. data/ext/sources/examples/talk-llama/models/deci.cpp +135 -0
  103. data/ext/sources/examples/talk-llama/models/deepseek.cpp +144 -0
  104. data/ext/sources/examples/talk-llama/models/deepseek2.cpp +259 -0
  105. data/ext/sources/examples/talk-llama/models/dots1.cpp +134 -0
  106. data/ext/sources/examples/talk-llama/models/dream.cpp +105 -0
  107. data/ext/sources/examples/talk-llama/models/ernie4-5-moe.cpp +150 -0
  108. data/ext/sources/examples/talk-llama/models/ernie4-5.cpp +110 -0
  109. data/ext/sources/examples/talk-llama/models/exaone.cpp +114 -0
  110. data/ext/sources/examples/talk-llama/models/exaone4.cpp +123 -0
  111. data/ext/sources/examples/talk-llama/models/falcon-h1.cpp +113 -0
  112. data/ext/sources/examples/talk-llama/models/falcon.cpp +120 -0
  113. data/ext/sources/examples/talk-llama/models/gemma-embedding.cpp +116 -0
  114. data/ext/sources/examples/talk-llama/models/gemma.cpp +112 -0
  115. data/ext/sources/examples/talk-llama/models/gemma2-iswa.cpp +128 -0
  116. data/ext/sources/examples/talk-llama/models/gemma3.cpp +155 -0
  117. data/ext/sources/examples/talk-llama/models/gemma3n-iswa.cpp +384 -0
  118. data/ext/sources/examples/talk-llama/models/glm4-moe.cpp +170 -0
  119. data/ext/sources/examples/talk-llama/models/glm4.cpp +150 -0
  120. data/ext/sources/examples/talk-llama/models/gpt2.cpp +105 -0
  121. data/ext/sources/examples/talk-llama/models/gptneox.cpp +144 -0
  122. data/ext/sources/examples/talk-llama/models/granite-hybrid.cpp +196 -0
  123. data/ext/sources/examples/talk-llama/models/granite.cpp +211 -0
  124. data/ext/sources/examples/talk-llama/models/graph-context-mamba.cpp +283 -0
  125. data/ext/sources/examples/talk-llama/models/grok.cpp +159 -0
  126. data/ext/sources/examples/talk-llama/models/grovemoe.cpp +141 -0
  127. data/ext/sources/examples/talk-llama/models/hunyuan-dense.cpp +132 -0
  128. data/ext/sources/examples/talk-llama/models/hunyuan-moe.cpp +154 -0
  129. data/ext/sources/examples/talk-llama/models/internlm2.cpp +120 -0
  130. data/ext/sources/examples/talk-llama/models/jais.cpp +86 -0
  131. data/ext/sources/examples/talk-llama/models/jamba.cpp +106 -0
  132. data/ext/sources/examples/talk-llama/models/lfm2.cpp +175 -0
  133. data/ext/sources/examples/talk-llama/models/llada-moe.cpp +122 -0
  134. data/ext/sources/examples/talk-llama/models/llada.cpp +99 -0
  135. data/ext/sources/examples/talk-llama/models/llama-iswa.cpp +178 -0
  136. data/ext/sources/examples/talk-llama/models/llama.cpp +168 -0
  137. data/ext/sources/examples/talk-llama/models/maincoder.cpp +117 -0
  138. data/ext/sources/examples/talk-llama/models/mamba.cpp +55 -0
  139. data/ext/sources/examples/talk-llama/models/mimo2-iswa.cpp +123 -0
  140. data/ext/sources/examples/talk-llama/models/minicpm3.cpp +199 -0
  141. data/ext/sources/examples/talk-llama/models/minimax-m2.cpp +124 -0
  142. data/ext/sources/examples/talk-llama/models/mistral3.cpp +160 -0
  143. data/ext/sources/examples/talk-llama/models/models.h +569 -0
  144. data/ext/sources/examples/talk-llama/models/modern-bert.cpp +116 -0
  145. data/ext/sources/examples/talk-llama/models/mpt.cpp +126 -0
  146. data/ext/sources/examples/talk-llama/models/nemotron-h.cpp +150 -0
  147. data/ext/sources/examples/talk-llama/models/nemotron.cpp +122 -0
  148. data/ext/sources/examples/talk-llama/models/neo-bert.cpp +104 -0
  149. data/ext/sources/examples/talk-llama/models/olmo.cpp +121 -0
  150. data/ext/sources/examples/talk-llama/models/olmo2.cpp +150 -0
  151. data/ext/sources/examples/talk-llama/models/olmoe.cpp +124 -0
  152. data/ext/sources/examples/talk-llama/models/openai-moe-iswa.cpp +127 -0
  153. data/ext/sources/examples/talk-llama/models/openelm.cpp +124 -0
  154. data/ext/sources/examples/talk-llama/models/orion.cpp +123 -0
  155. data/ext/sources/examples/talk-llama/models/pangu-embedded.cpp +121 -0
  156. data/ext/sources/examples/talk-llama/models/phi2.cpp +121 -0
  157. data/ext/sources/examples/talk-llama/models/phi3.cpp +152 -0
  158. data/ext/sources/examples/talk-llama/models/plamo.cpp +110 -0
  159. data/ext/sources/examples/talk-llama/models/plamo2.cpp +316 -0
  160. data/ext/sources/examples/talk-llama/models/plamo3.cpp +128 -0
  161. data/ext/sources/examples/talk-llama/models/plm.cpp +168 -0
  162. data/ext/sources/examples/talk-llama/models/qwen.cpp +108 -0
  163. data/ext/sources/examples/talk-llama/models/qwen2.cpp +126 -0
  164. data/ext/sources/examples/talk-llama/models/qwen2moe.cpp +151 -0
  165. data/ext/sources/examples/talk-llama/models/qwen2vl.cpp +117 -0
  166. data/ext/sources/examples/talk-llama/models/qwen3.cpp +117 -0
  167. data/ext/sources/examples/talk-llama/models/qwen3moe.cpp +124 -0
  168. data/ext/sources/examples/talk-llama/models/qwen3next.cpp +873 -0
  169. data/ext/sources/examples/talk-llama/models/qwen3vl-moe.cpp +149 -0
  170. data/ext/sources/examples/talk-llama/models/qwen3vl.cpp +141 -0
  171. data/ext/sources/examples/talk-llama/models/refact.cpp +94 -0
  172. data/ext/sources/examples/talk-llama/models/rnd1.cpp +126 -0
  173. data/ext/sources/examples/talk-llama/models/rwkv6-base.cpp +162 -0
  174. data/ext/sources/examples/talk-llama/models/rwkv6.cpp +94 -0
  175. data/ext/sources/examples/talk-llama/models/rwkv6qwen2.cpp +86 -0
  176. data/ext/sources/examples/talk-llama/models/rwkv7-base.cpp +135 -0
  177. data/ext/sources/examples/talk-llama/models/rwkv7.cpp +90 -0
  178. data/ext/sources/examples/talk-llama/models/seed-oss.cpp +124 -0
  179. data/ext/sources/examples/talk-llama/models/smallthinker.cpp +126 -0
  180. data/ext/sources/examples/talk-llama/models/smollm3.cpp +128 -0
  181. data/ext/sources/examples/talk-llama/models/stablelm.cpp +146 -0
  182. data/ext/sources/examples/talk-llama/models/starcoder.cpp +100 -0
  183. data/ext/sources/examples/talk-llama/models/starcoder2.cpp +121 -0
  184. data/ext/sources/examples/talk-llama/models/t5-dec.cpp +166 -0
  185. data/ext/sources/examples/talk-llama/models/t5-enc.cpp +96 -0
  186. data/ext/sources/examples/talk-llama/models/wavtokenizer-dec.cpp +149 -0
  187. data/ext/sources/examples/talk-llama/models/xverse.cpp +108 -0
  188. data/ext/sources/examples/talk-llama/talk-llama.cpp +9 -6
  189. data/ext/sources/examples/talk-llama/unicode.cpp +309 -16
  190. data/ext/sources/examples/talk-llama/unicode.h +45 -0
  191. data/ext/sources/examples/vad-speech-segments/CMakeLists.txt +1 -1
  192. data/ext/sources/examples/wchess/wchess.cmd/wchess.cmd.cpp +4 -2
  193. data/ext/sources/examples/whisper.wasm/index-tmpl.html +18 -17
  194. data/ext/sources/ggml/CMakeLists.txt +135 -79
  195. data/ext/sources/ggml/cmake/ggml-config.cmake.in +132 -93
  196. data/ext/sources/ggml/include/ggml-alloc.h +9 -0
  197. data/ext/sources/ggml/include/ggml-backend.h +21 -2
  198. data/ext/sources/ggml/include/ggml-cpu.h +2 -1
  199. data/ext/sources/ggml/include/ggml-hexagon.h +19 -0
  200. data/ext/sources/ggml/include/ggml-metal.h +1 -6
  201. data/ext/sources/ggml/include/ggml-opt.h +25 -6
  202. data/ext/sources/ggml/include/ggml-rpc.h +8 -11
  203. data/ext/sources/ggml/include/ggml-webgpu.h +19 -0
  204. data/ext/sources/ggml/include/ggml-zdnn.h +17 -0
  205. data/ext/sources/ggml/include/ggml-zendnn.h +22 -0
  206. data/ext/sources/ggml/include/ggml.h +406 -23
  207. data/ext/sources/ggml/src/CMakeLists.txt +99 -13
  208. data/ext/sources/ggml/src/ggml-alloc.c +368 -161
  209. data/ext/sources/ggml/src/ggml-backend-impl.h +5 -5
  210. data/ext/sources/ggml/src/ggml-backend-reg.cpp +55 -14
  211. data/ext/sources/ggml/src/ggml-backend.cpp +290 -57
  212. data/ext/sources/ggml/src/ggml-blas/CMakeLists.txt +17 -3
  213. data/ext/sources/ggml/src/ggml-blas/ggml-blas.cpp +10 -13
  214. data/ext/sources/ggml/src/ggml-cann/CMakeLists.txt +14 -0
  215. data/ext/sources/ggml/src/ggml-cann/acl_tensor.cpp +59 -45
  216. data/ext/sources/ggml/src/ggml-cann/acl_tensor.h +138 -47
  217. data/ext/sources/ggml/src/ggml-cann/aclnn_ops.cpp +2586 -1917
  218. data/ext/sources/ggml/src/ggml-cann/aclnn_ops.h +348 -309
  219. data/ext/sources/ggml/src/ggml-cann/common.h +350 -133
  220. data/ext/sources/ggml/src/ggml-cann/ggml-cann.cpp +894 -625
  221. data/ext/sources/ggml/src/ggml-common.h +17 -0
  222. data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +167 -75
  223. data/ext/sources/ggml/src/ggml-cpu/amx/amx.cpp +5 -2
  224. data/ext/sources/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +4 -0
  225. data/ext/sources/ggml/src/ggml-cpu/arch/arm/quants.c +560 -622
  226. data/ext/sources/ggml/src/ggml-cpu/arch/arm/repack.cpp +1002 -270
  227. data/ext/sources/ggml/src/ggml-cpu/arch/loongarch/quants.c +107 -587
  228. data/ext/sources/ggml/src/ggml-cpu/arch/powerpc/quants.c +162 -589
  229. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/cpu-feats.cpp +38 -0
  230. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/quants.c +373 -486
  231. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/repack.cpp +3 -58
  232. data/ext/sources/ggml/src/ggml-cpu/arch/s390/cpu-feats.cpp +50 -0
  233. data/ext/sources/ggml/src/ggml-cpu/arch/s390/quants.c +521 -353
  234. data/ext/sources/ggml/src/ggml-cpu/arch/wasm/quants.c +54 -314
  235. data/ext/sources/ggml/src/ggml-cpu/arch/x86/quants.c +184 -675
  236. data/ext/sources/ggml/src/ggml-cpu/arch/x86/repack.cpp +4682 -1660
  237. data/ext/sources/ggml/src/ggml-cpu/arch-fallback.h +82 -4
  238. data/ext/sources/ggml/src/ggml-cpu/common.h +14 -0
  239. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-impl.h +18 -9
  240. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +263 -111
  241. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.cpp +39 -28
  242. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.cpp +683 -82
  243. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.h +38 -43
  244. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +435 -119
  245. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm-ppc.h +333 -0
  246. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1234 -1182
  247. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.h +6 -0
  248. data/ext/sources/ggml/src/ggml-cpu/ops.cpp +2167 -1480
  249. data/ext/sources/ggml/src/ggml-cpu/ops.h +10 -12
  250. data/ext/sources/ggml/src/ggml-cpu/quants.c +35 -0
  251. data/ext/sources/ggml/src/ggml-cpu/quants.h +8 -0
  252. data/ext/sources/ggml/src/ggml-cpu/repack.cpp +1132 -81
  253. data/ext/sources/ggml/src/ggml-cpu/repack.h +36 -0
  254. data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +120 -93
  255. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.cpp +1025 -0
  256. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.h +13 -0
  257. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp +3196 -0
  258. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime_kernels.h +26 -0
  259. data/ext/sources/ggml/src/ggml-cpu/traits.cpp +2 -2
  260. data/ext/sources/ggml/src/ggml-cpu/traits.h +1 -1
  261. data/ext/sources/ggml/src/ggml-cpu/unary-ops.cpp +151 -0
  262. data/ext/sources/ggml/src/ggml-cpu/unary-ops.h +7 -0
  263. data/ext/sources/ggml/src/ggml-cpu/vec.cpp +294 -27
  264. data/ext/sources/ggml/src/ggml-cpu/vec.h +606 -48
  265. data/ext/sources/ggml/src/ggml-cuda/CMakeLists.txt +92 -17
  266. data/ext/sources/ggml/src/ggml-cuda/add-id.cu +58 -0
  267. data/ext/sources/ggml/src/ggml-cuda/add-id.cuh +3 -0
  268. data/ext/sources/ggml/src/ggml-cuda/argmax.cu +2 -2
  269. data/ext/sources/ggml/src/ggml-cuda/argsort.cu +123 -6
  270. data/ext/sources/ggml/src/ggml-cuda/argsort.cuh +16 -0
  271. data/ext/sources/ggml/src/ggml-cuda/binbcast.cu +330 -191
  272. data/ext/sources/ggml/src/ggml-cuda/binbcast.cuh +2 -0
  273. data/ext/sources/ggml/src/ggml-cuda/common.cuh +588 -128
  274. data/ext/sources/ggml/src/ggml-cuda/conv-transpose-1d.cu +1 -4
  275. data/ext/sources/ggml/src/ggml-cuda/conv2d.cu +166 -0
  276. data/ext/sources/ggml/src/ggml-cuda/conv2d.cuh +5 -0
  277. data/ext/sources/ggml/src/ggml-cuda/convert.cu +95 -22
  278. data/ext/sources/ggml/src/ggml-cuda/convert.cuh +25 -0
  279. data/ext/sources/ggml/src/ggml-cuda/cpy-utils.cuh +217 -0
  280. data/ext/sources/ggml/src/ggml-cuda/cpy.cu +335 -485
  281. data/ext/sources/ggml/src/ggml-cuda/cpy.cuh +1 -5
  282. data/ext/sources/ggml/src/ggml-cuda/cross-entropy-loss.cu +2 -14
  283. data/ext/sources/ggml/src/ggml-cuda/cumsum.cu +307 -0
  284. data/ext/sources/ggml/src/ggml-cuda/cumsum.cuh +5 -0
  285. data/ext/sources/ggml/src/ggml-cuda/dequantize.cuh +14 -40
  286. data/ext/sources/ggml/src/ggml-cuda/diag.cu +77 -0
  287. data/ext/sources/ggml/src/ggml-cuda/diag.cuh +5 -0
  288. data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +519 -378
  289. data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +750 -637
  290. data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cu +49 -0
  291. data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cuh +1244 -0
  292. data/ext/sources/ggml/src/ggml-cuda/fattn-vec.cuh +586 -0
  293. data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +98 -61
  294. data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cuh +48 -0
  295. data/ext/sources/ggml/src/ggml-cuda/fattn.cu +230 -197
  296. data/ext/sources/ggml/src/ggml-cuda/fattn.cuh +2 -0
  297. data/ext/sources/ggml/src/ggml-cuda/fill.cu +37 -0
  298. data/ext/sources/ggml/src/ggml-cuda/fill.cuh +3 -0
  299. data/ext/sources/ggml/src/ggml-cuda/getrows.cu +50 -39
  300. data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +1557 -294
  301. data/ext/sources/ggml/src/ggml-cuda/im2col.cu +196 -35
  302. data/ext/sources/ggml/src/ggml-cuda/im2col.cuh +1 -0
  303. data/ext/sources/ggml/src/ggml-cuda/mean.cu +57 -2
  304. data/ext/sources/ggml/src/ggml-cuda/mma.cuh +915 -69
  305. data/ext/sources/ggml/src/ggml-cuda/mmf.cu +171 -0
  306. data/ext/sources/ggml/src/ggml-cuda/mmf.cuh +835 -0
  307. data/ext/sources/ggml/src/ggml-cuda/mmid.cu +164 -0
  308. data/ext/sources/ggml/src/ggml-cuda/mmid.cuh +5 -0
  309. data/ext/sources/ggml/src/ggml-cuda/mmq.cu +109 -67
  310. data/ext/sources/ggml/src/ggml-cuda/mmq.cuh +1601 -733
  311. data/ext/sources/ggml/src/ggml-cuda/mmvf.cu +802 -0
  312. data/ext/sources/ggml/src/ggml-cuda/mmvf.cuh +12 -0
  313. data/ext/sources/ggml/src/ggml-cuda/mmvq.cu +286 -149
  314. data/ext/sources/ggml/src/ggml-cuda/mmvq.cuh +1 -1
  315. data/ext/sources/ggml/src/ggml-cuda/norm.cu +284 -12
  316. data/ext/sources/ggml/src/ggml-cuda/norm.cuh +7 -0
  317. data/ext/sources/ggml/src/ggml-cuda/opt-step-sgd.cu +49 -0
  318. data/ext/sources/ggml/src/ggml-cuda/opt-step-sgd.cuh +5 -0
  319. data/ext/sources/ggml/src/ggml-cuda/pad.cu +86 -32
  320. data/ext/sources/ggml/src/ggml-cuda/pad_reflect_1d.cu +91 -0
  321. data/ext/sources/ggml/src/ggml-cuda/pad_reflect_1d.cuh +5 -0
  322. data/ext/sources/ggml/src/ggml-cuda/quantize.cu +163 -10
  323. data/ext/sources/ggml/src/ggml-cuda/quantize.cuh +14 -0
  324. data/ext/sources/ggml/src/ggml-cuda/reduce_rows.cuh +53 -0
  325. data/ext/sources/ggml/src/ggml-cuda/roll.cu +67 -0
  326. data/ext/sources/ggml/src/ggml-cuda/roll.cuh +5 -0
  327. data/ext/sources/ggml/src/ggml-cuda/rope.cu +207 -98
  328. data/ext/sources/ggml/src/ggml-cuda/rope.cuh +2 -0
  329. data/ext/sources/ggml/src/ggml-cuda/scale.cu +14 -11
  330. data/ext/sources/ggml/src/ggml-cuda/set-rows.cu +330 -0
  331. data/ext/sources/ggml/src/ggml-cuda/set-rows.cuh +7 -0
  332. data/ext/sources/ggml/src/ggml-cuda/set.cu +39 -0
  333. data/ext/sources/ggml/src/ggml-cuda/set.cuh +7 -0
  334. data/ext/sources/ggml/src/ggml-cuda/softcap.cu +34 -0
  335. data/ext/sources/ggml/src/ggml-cuda/softcap.cuh +5 -0
  336. data/ext/sources/ggml/src/ggml-cuda/softmax.cu +325 -61
  337. data/ext/sources/ggml/src/ggml-cuda/solve_tri.cu +275 -0
  338. data/ext/sources/ggml/src/ggml-cuda/solve_tri.cuh +3 -0
  339. data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cu +14 -12
  340. data/ext/sources/ggml/src/ggml-cuda/ssm-scan.cu +291 -104
  341. data/ext/sources/ggml/src/ggml-cuda/sum.cu +6 -10
  342. data/ext/sources/ggml/src/ggml-cuda/sumrows.cu +21 -4
  343. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq112-dv112.cu +5 -0
  344. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq128-dv128.cu +5 -0
  345. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq256-dv256.cu +5 -0
  346. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq40-dv40.cu +5 -0
  347. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq576-dv512.cu +5 -0
  348. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq64-dv64.cu +5 -0
  349. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq72-dv72.cu +5 -0
  350. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq80-dv80.cu +5 -0
  351. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq96-dv96.cu +5 -0
  352. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-f16.cu +7 -0
  353. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q4_0.cu +7 -0
  354. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q4_1.cu +7 -0
  355. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q5_0.cu +7 -0
  356. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q5_1.cu +7 -0
  357. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q8_0.cu +7 -0
  358. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-f16.cu +7 -0
  359. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_0.cu +7 -0
  360. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_1.cu +7 -0
  361. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q5_0.cu +7 -0
  362. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q5_1.cu +7 -0
  363. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q8_0.cu +7 -0
  364. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-f16.cu +7 -0
  365. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q4_0.cu +7 -0
  366. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q4_1.cu +7 -0
  367. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q5_0.cu +7 -0
  368. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q5_1.cu +7 -0
  369. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q8_0.cu +7 -0
  370. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-f16.cu +7 -0
  371. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q4_0.cu +7 -0
  372. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q4_1.cu +7 -0
  373. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q5_0.cu +7 -0
  374. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q5_1.cu +7 -0
  375. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q8_0.cu +7 -0
  376. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-f16.cu +7 -0
  377. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q4_0.cu +7 -0
  378. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q4_1.cu +7 -0
  379. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q5_0.cu +7 -0
  380. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q5_1.cu +7 -0
  381. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q8_0.cu +7 -0
  382. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-f16.cu +7 -0
  383. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q4_0.cu +7 -0
  384. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q4_1.cu +7 -0
  385. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q5_0.cu +7 -0
  386. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q5_1.cu +7 -0
  387. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q8_0.cu +7 -0
  388. data/ext/sources/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +40 -19
  389. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_1.cu +5 -0
  390. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_10.cu +5 -0
  391. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_11.cu +5 -0
  392. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_12.cu +5 -0
  393. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_13.cu +5 -0
  394. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_14.cu +5 -0
  395. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_15.cu +5 -0
  396. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_16.cu +5 -0
  397. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_2.cu +5 -0
  398. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_3.cu +5 -0
  399. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_4.cu +5 -0
  400. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_5.cu +5 -0
  401. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_6.cu +5 -0
  402. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_7.cu +5 -0
  403. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_8.cu +5 -0
  404. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_9.cu +5 -0
  405. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-mxfp4.cu +5 -0
  406. data/ext/sources/ggml/src/ggml-cuda/top-k.cu +96 -0
  407. data/ext/sources/ggml/src/ggml-cuda/top-k.cuh +3 -0
  408. data/ext/sources/ggml/src/ggml-cuda/topk-moe.cu +351 -0
  409. data/ext/sources/ggml/src/ggml-cuda/topk-moe.cuh +21 -0
  410. data/ext/sources/ggml/src/ggml-cuda/tri.cu +136 -0
  411. data/ext/sources/ggml/src/ggml-cuda/tri.cuh +5 -0
  412. data/ext/sources/ggml/src/ggml-cuda/tsembd.cu +3 -3
  413. data/ext/sources/ggml/src/ggml-cuda/unary.cu +189 -5
  414. data/ext/sources/ggml/src/ggml-cuda/unary.cuh +44 -0
  415. data/ext/sources/ggml/src/ggml-cuda/upscale.cu +248 -6
  416. data/ext/sources/ggml/src/ggml-cuda/vecdotq.cuh +110 -22
  417. data/ext/sources/ggml/src/ggml-cuda/vendors/cuda.h +8 -0
  418. data/ext/sources/ggml/src/ggml-cuda/vendors/hip.h +70 -37
  419. data/ext/sources/ggml/src/ggml-cuda/vendors/musa.h +10 -3
  420. data/ext/sources/ggml/src/ggml-hexagon/CMakeLists.txt +80 -0
  421. data/ext/sources/ggml/src/ggml-hexagon/ggml-hexagon.cpp +3151 -0
  422. data/ext/sources/ggml/src/ggml-hexagon/htp/CMakeLists.txt +44 -0
  423. data/ext/sources/ggml/src/ggml-hexagon/htp/act-ops.c +682 -0
  424. data/ext/sources/ggml/src/ggml-hexagon/htp/binary-ops.c +360 -0
  425. data/ext/sources/ggml/src/ggml-hexagon/htp/cmake-toolchain.cmake +157 -0
  426. data/ext/sources/ggml/src/ggml-hexagon/htp/flash-attn-ops.c +566 -0
  427. data/ext/sources/ggml/src/ggml-hexagon/htp/get-rows-ops.c +112 -0
  428. data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ctx.h +35 -0
  429. data/ext/sources/ggml/src/ggml-hexagon/htp/htp-dma.c +63 -0
  430. data/ext/sources/ggml/src/ggml-hexagon/htp/htp-dma.h +157 -0
  431. data/ext/sources/ggml/src/ggml-hexagon/htp/htp-msg.h +165 -0
  432. data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ops.h +92 -0
  433. data/ext/sources/ggml/src/ggml-hexagon/htp/htp_iface.idl +16 -0
  434. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-exp.c +94 -0
  435. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-inverse.c +72 -0
  436. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sigmoid.c +49 -0
  437. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-utils.c +1020 -0
  438. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-utils.h +1353 -0
  439. data/ext/sources/ggml/src/ggml-hexagon/htp/main.c +1001 -0
  440. data/ext/sources/ggml/src/ggml-hexagon/htp/matmul-ops.c +2503 -0
  441. data/ext/sources/ggml/src/ggml-hexagon/htp/ops-utils.h +149 -0
  442. data/ext/sources/ggml/src/ggml-hexagon/htp/rope-ops.c +487 -0
  443. data/ext/sources/ggml/src/ggml-hexagon/htp/set-rows-ops.c +168 -0
  444. data/ext/sources/ggml/src/ggml-hexagon/htp/softmax-ops.c +402 -0
  445. data/ext/sources/ggml/src/ggml-hexagon/htp/unary-ops.c +287 -0
  446. data/ext/sources/ggml/src/ggml-hexagon/htp/worker-pool.c +297 -0
  447. data/ext/sources/ggml/src/ggml-hexagon/htp/worker-pool.h +57 -0
  448. data/ext/sources/ggml/src/ggml-hexagon/htp-utils.c +454 -0
  449. data/ext/sources/ggml/src/ggml-hexagon/htp-utils.h +221 -0
  450. data/ext/sources/ggml/src/ggml-hexagon/op-desc.h +153 -0
  451. data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +16 -13
  452. data/ext/sources/ggml/src/ggml-impl.h +186 -15
  453. data/ext/sources/ggml/src/ggml-metal/CMakeLists.txt +10 -7
  454. data/ext/sources/ggml/src/ggml-metal/ggml-metal-common.cpp +446 -0
  455. data/ext/sources/ggml/src/ggml-metal/ggml-metal-common.h +52 -0
  456. data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.h +33 -0
  457. data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.m +609 -0
  458. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.cpp +1743 -0
  459. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.h +273 -0
  460. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.m +1686 -0
  461. data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +356 -61
  462. data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.cpp +4161 -0
  463. data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.h +94 -0
  464. data/ext/sources/ggml/src/ggml-metal/ggml-metal.cpp +724 -0
  465. data/ext/sources/ggml/src/ggml-metal/ggml-metal.metal +4495 -1876
  466. data/ext/sources/ggml/src/ggml-musa/CMakeLists.txt +21 -9
  467. data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +29 -0
  468. data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +4005 -427
  469. data/ext/sources/ggml/src/ggml-opencl/kernels/add.cl +107 -0
  470. data/ext/sources/ggml/src/ggml-opencl/kernels/add_id.cl +42 -0
  471. data/ext/sources/ggml/src/ggml-opencl/kernels/conv2d.cl +185 -0
  472. data/ext/sources/ggml/src/ggml-opencl/kernels/conv2d_f16_f32.cl +176 -0
  473. data/ext/sources/ggml/src/ggml-opencl/kernels/cvt.cl +147 -0
  474. data/ext/sources/ggml/src/ggml-opencl/kernels/div.cl +66 -0
  475. data/ext/sources/ggml/src/ggml-opencl/kernels/expm1.cl +82 -0
  476. data/ext/sources/ggml/src/ggml-opencl/kernels/fill.cl +17 -0
  477. data/ext/sources/ggml/src/ggml-opencl/kernels/flash_attn_f16.cl +370 -0
  478. data/ext/sources/ggml/src/ggml-opencl/kernels/flash_attn_f32.cl +371 -0
  479. data/ext/sources/ggml/src/ggml-opencl/kernels/flash_attn_f32_f16.cl +373 -0
  480. data/ext/sources/ggml/src/ggml-opencl/kernels/gelu.cl +27 -0
  481. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_mxfp4_f32.cl +162 -0
  482. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_mxfp4_f32.cl +156 -0
  483. data/ext/sources/ggml/src/ggml-opencl/kernels/get_rows.cl +36 -12
  484. data/ext/sources/ggml/src/ggml-opencl/kernels/glu.cl +177 -0
  485. data/ext/sources/ggml/src/ggml-opencl/kernels/group_norm.cl +49 -0
  486. data/ext/sources/ggml/src/ggml-opencl/kernels/im2col_f16.cl +1 -1
  487. data/ext/sources/ggml/src/ggml-opencl/kernels/im2col_f32.cl +1 -1
  488. data/ext/sources/ggml/src/ggml-opencl/kernels/mean.cl +39 -0
  489. data/ext/sources/ggml/src/ggml-opencl/kernels/mul.cl +73 -0
  490. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mat_f16_f32.cl +130 -0
  491. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_kq_kqv.cl +273 -0
  492. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_l4_lm.cl +146 -0
  493. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_f32_f32_l4_lm.cl +147 -0
  494. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_l4_lm.cl +154 -0
  495. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32.cl +189 -0
  496. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32_flat.cl +176 -0
  497. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32.cl +140 -0
  498. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32_flat.cl +222 -0
  499. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32.cl +144 -0
  500. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32_flat.cl +167 -0
  501. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32.cl +125 -0
  502. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32_flat.cl +202 -0
  503. data/ext/sources/ggml/src/ggml-opencl/kernels/norm.cl +80 -0
  504. data/ext/sources/ggml/src/ggml-opencl/kernels/pad.cl +29 -20
  505. data/ext/sources/ggml/src/ggml-opencl/kernels/rms_norm.cl +94 -0
  506. data/ext/sources/ggml/src/ggml-opencl/kernels/rope.cl +50 -24
  507. data/ext/sources/ggml/src/ggml-opencl/kernels/scale.cl +3 -2
  508. data/ext/sources/ggml/src/ggml-opencl/kernels/set_rows.cl +208 -0
  509. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_4_f16.cl +34 -13
  510. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_4_f32.cl +34 -13
  511. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_f16.cl +34 -13
  512. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_f32.cl +34 -13
  513. data/ext/sources/ggml/src/ggml-opencl/kernels/softplus.cl +88 -0
  514. data/ext/sources/ggml/src/ggml-opencl/kernels/sqr.cl +53 -0
  515. data/ext/sources/ggml/src/ggml-opencl/kernels/sqrt.cl +53 -0
  516. data/ext/sources/ggml/src/ggml-opencl/kernels/ssm_conv.cl +77 -0
  517. data/ext/sources/ggml/src/ggml-opencl/kernels/sub.cl +66 -0
  518. data/ext/sources/ggml/src/ggml-opencl/kernels/transpose.cl +33 -0
  519. data/ext/sources/ggml/src/ggml-opencl/kernels/tsembd.cl +2 -2
  520. data/ext/sources/ggml/src/ggml-opencl/kernels/upscale.cl +2 -3
  521. data/ext/sources/ggml/src/ggml-opt.cpp +97 -41
  522. data/ext/sources/ggml/src/ggml-quants.c +111 -16
  523. data/ext/sources/ggml/src/ggml-quants.h +6 -0
  524. data/ext/sources/ggml/src/ggml-rpc/ggml-rpc.cpp +497 -195
  525. data/ext/sources/ggml/src/ggml-sycl/CMakeLists.txt +48 -3
  526. data/ext/sources/ggml/src/ggml-sycl/add-id.cpp +77 -0
  527. data/ext/sources/ggml/src/ggml-sycl/add-id.hpp +8 -0
  528. data/ext/sources/ggml/src/ggml-sycl/backend.hpp +8 -0
  529. data/ext/sources/ggml/src/ggml-sycl/binbcast.cpp +6 -5
  530. data/ext/sources/ggml/src/ggml-sycl/common.hpp +117 -15
  531. data/ext/sources/ggml/src/ggml-sycl/concat.cpp +50 -30
  532. data/ext/sources/ggml/src/ggml-sycl/conv.cpp +10 -4
  533. data/ext/sources/ggml/src/ggml-sycl/convert.cpp +200 -99
  534. data/ext/sources/ggml/src/ggml-sycl/count-equal.cpp +79 -0
  535. data/ext/sources/ggml/src/ggml-sycl/count-equal.hpp +9 -0
  536. data/ext/sources/ggml/src/ggml-sycl/cpy.cpp +72 -309
  537. data/ext/sources/ggml/src/ggml-sycl/cpy.hpp +213 -1
  538. data/ext/sources/ggml/src/ggml-sycl/dequantize.hpp +18 -0
  539. data/ext/sources/ggml/src/ggml-sycl/dmmv.cpp +67 -49
  540. data/ext/sources/ggml/src/ggml-sycl/dpct/helper.hpp +77 -34
  541. data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +397 -314
  542. data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +12 -2
  543. data/ext/sources/ggml/src/ggml-sycl/gemm.hpp +14 -26
  544. data/ext/sources/ggml/src/ggml-sycl/getrows.cpp +9 -6
  545. data/ext/sources/ggml/src/ggml-sycl/ggml-sycl.cpp +643 -413
  546. data/ext/sources/ggml/src/ggml-sycl/gla.cpp +2 -2
  547. data/ext/sources/ggml/src/ggml-sycl/im2col.cpp +2 -2
  548. data/ext/sources/ggml/src/ggml-sycl/mmq.cpp +80 -60
  549. data/ext/sources/ggml/src/ggml-sycl/mmvq.cpp +223 -132
  550. data/ext/sources/ggml/src/ggml-sycl/norm.cpp +230 -55
  551. data/ext/sources/ggml/src/ggml-sycl/norm.hpp +2 -0
  552. data/ext/sources/ggml/src/ggml-sycl/pad.cpp +97 -0
  553. data/ext/sources/ggml/src/ggml-sycl/pad.hpp +24 -0
  554. data/ext/sources/ggml/src/ggml-sycl/pad_reflect_1d.cpp +100 -0
  555. data/ext/sources/ggml/src/ggml-sycl/pad_reflect_1d.hpp +10 -0
  556. data/ext/sources/ggml/src/ggml-sycl/presets.hpp +2 -0
  557. data/ext/sources/ggml/src/ggml-sycl/quantize.hpp +133 -0
  558. data/ext/sources/ggml/src/ggml-sycl/quants.hpp +8 -9
  559. data/ext/sources/ggml/src/ggml-sycl/repeat_back.cpp +76 -0
  560. data/ext/sources/ggml/src/ggml-sycl/repeat_back.hpp +8 -0
  561. data/ext/sources/ggml/src/ggml-sycl/roll.cpp +122 -0
  562. data/ext/sources/ggml/src/ggml-sycl/roll.hpp +20 -0
  563. data/ext/sources/ggml/src/ggml-sycl/rope.cpp +65 -59
  564. data/ext/sources/ggml/src/ggml-sycl/set.cpp +73 -0
  565. data/ext/sources/ggml/src/ggml-sycl/set.hpp +5 -0
  566. data/ext/sources/ggml/src/ggml-sycl/set_rows.cpp +234 -0
  567. data/ext/sources/ggml/src/ggml-sycl/set_rows.hpp +8 -0
  568. data/ext/sources/ggml/src/ggml-sycl/softmax.cpp +330 -165
  569. data/ext/sources/ggml/src/ggml-sycl/softmax.hpp +4 -0
  570. data/ext/sources/ggml/src/ggml-sycl/ssm_conv.cpp +127 -0
  571. data/ext/sources/ggml/src/ggml-sycl/ssm_conv.hpp +5 -0
  572. data/ext/sources/ggml/src/ggml-sycl/tsembd.cpp +12 -6
  573. data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +60 -6
  574. data/ext/sources/ggml/src/ggml-sycl/wkv.cpp +16 -12
  575. data/ext/sources/ggml/src/ggml-vulkan/CMakeLists.txt +38 -18
  576. data/ext/sources/ggml/src/ggml-vulkan/ggml-vulkan.cpp +7398 -2635
  577. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/abs.comp +21 -0
  578. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/acc.comp +2 -2
  579. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add.comp +43 -3
  580. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add1.comp +28 -0
  581. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add_id.comp +42 -0
  582. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/arange.comp +20 -0
  583. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argmax.comp +15 -6
  584. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp +56 -39
  585. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argsort_large.comp +114 -0
  586. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/ceil.comp +22 -0
  587. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/clamp.comp +2 -2
  588. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/concat.comp +2 -2
  589. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/contig_copy.comp +2 -2
  590. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_dw.comp +1 -1
  591. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +347 -0
  592. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +1 -1
  593. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy.comp +2 -2
  594. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +5 -5
  595. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +67 -13
  596. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_transpose.comp +67 -0
  597. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/cos.comp +2 -2
  598. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/count_equal.comp +2 -2
  599. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/count_experts.comp +51 -0
  600. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/cumsum.comp +83 -0
  601. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/cumsum_multipass1.comp +60 -0
  602. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/cumsum_multipass2.comp +66 -0
  603. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_f32.comp +1 -1
  604. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{dequant_funcs.comp → dequant_funcs.glsl} +158 -16
  605. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{dequant_funcs_cm2.comp → dequant_funcs_cm2.glsl} +38 -3
  606. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{dequant_head.comp → dequant_head.glsl} +1 -1
  607. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_m.comp +1 -1
  608. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_s.comp +1 -1
  609. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_s.comp +2 -2
  610. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xs.comp +1 -1
  611. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xxs.comp +3 -2
  612. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_s.comp +7 -6
  613. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_xxs.comp +5 -3
  614. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_nl.comp +1 -1
  615. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_xs.comp +1 -1
  616. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_mxfp4.comp +32 -0
  617. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q2_k.comp +4 -4
  618. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q3_k.comp +2 -2
  619. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_0.comp +1 -1
  620. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_1.comp +1 -1
  621. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_k.comp +4 -4
  622. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_0.comp +1 -1
  623. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_1.comp +1 -1
  624. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_k.comp +4 -4
  625. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q6_k.comp +2 -2
  626. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q8_0.comp +1 -1
  627. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/diag.comp +29 -0
  628. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/diag_mask_inf.comp +1 -1
  629. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/div.comp +2 -2
  630. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/exp.comp +21 -0
  631. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/fill.comp +19 -0
  632. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +103 -36
  633. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.glsl +220 -0
  634. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +139 -45
  635. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +113 -38
  636. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +75 -14
  637. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/floor.comp +22 -0
  638. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu.comp +2 -2
  639. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu_erf.comp +27 -0
  640. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu_quick.comp +11 -0
  641. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gelu.comp +2 -2
  642. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gelu_erf.comp +39 -0
  643. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gelu_quick.comp +2 -2
  644. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{generic_binary_head.comp → generic_binary_head.glsl} +19 -17
  645. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{generic_head.comp → generic_head.glsl} +2 -0
  646. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{generic_unary_head.comp → generic_unary_head.glsl} +7 -0
  647. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/get_rows.comp +21 -12
  648. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/get_rows_quant.comp +28 -18
  649. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{glu_head.comp → glu_head.glsl} +4 -0
  650. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/group_norm.comp +2 -2
  651. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/hardsigmoid.comp +22 -0
  652. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/hardswish.comp +22 -0
  653. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +33 -17
  654. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col_3d.comp +125 -0
  655. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/l2_norm.comp +2 -2
  656. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/leaky_relu.comp +2 -2
  657. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/log.comp +18 -0
  658. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul.comp +2 -2
  659. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp +2 -2
  660. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.glsl +227 -0
  661. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iface.glsl +35 -0
  662. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_m.comp +71 -21
  663. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_s.comp +41 -25
  664. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_s.comp +2 -2
  665. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xs.comp +44 -26
  666. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xxs.comp +2 -2
  667. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_s.comp +2 -2
  668. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_xxs.comp +2 -2
  669. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_nc.comp +20 -14
  670. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_p021.comp +9 -7
  671. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp +4 -6
  672. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q3_k.comp +2 -2
  673. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp +4 -6
  674. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp +4 -6
  675. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp +2 -2
  676. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp +143 -0
  677. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq_funcs.glsl +494 -0
  678. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +144 -556
  679. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +230 -51
  680. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.glsl +566 -0
  681. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_id_funcs.glsl +72 -0
  682. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +90 -223
  683. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.glsl +454 -0
  684. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_shmem_types.glsl +78 -0
  685. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp +195 -0
  686. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/neg.comp +20 -0
  687. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/norm.comp +2 -2
  688. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_adamw.comp +2 -2
  689. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_sgd.comp +22 -0
  690. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/pad.comp +41 -5
  691. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/pool2d.comp +1 -1
  692. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/quantize_q8_1.comp +59 -9
  693. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/reglu.comp +2 -2
  694. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/relu.comp +2 -2
  695. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/repeat.comp +2 -2
  696. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/repeat_back.comp +2 -2
  697. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +104 -14
  698. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_back.comp +2 -2
  699. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_partials.comp +65 -0
  700. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/roll.comp +46 -0
  701. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl +234 -0
  702. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.glsl +20 -0
  703. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +6 -52
  704. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +6 -35
  705. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +6 -35
  706. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl +28 -0
  707. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_vision.comp +6 -39
  708. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/round.comp +29 -0
  709. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rte.glsl +5 -0
  710. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp +3 -3
  711. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sigmoid.comp +2 -2
  712. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/silu.comp +2 -2
  713. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/silu_back.comp +2 -2
  714. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sin.comp +2 -2
  715. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +30 -8
  716. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_back.comp +6 -2
  717. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large1.comp +62 -0
  718. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large2.comp +79 -0
  719. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large3.comp +65 -0
  720. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large_common.glsl +53 -0
  721. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/softplus.comp +23 -0
  722. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/solve_tri.comp +81 -0
  723. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sqrt.comp +17 -0
  724. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/square.comp +2 -2
  725. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/ssm_conv.comp +44 -0
  726. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/ssm_scan.comp +124 -0
  727. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/step.comp +22 -0
  728. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sub.comp +2 -2
  729. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.comp +16 -6
  730. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.glsl +25 -0
  731. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/swiglu.comp +2 -2
  732. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/swiglu_oai.comp +14 -0
  733. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/tanh.comp +2 -2
  734. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/timestep_embedding.comp +5 -4
  735. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/topk_argsort.comp +118 -0
  736. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/topk_moe.comp +213 -0
  737. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/topk_nary_search.comp +246 -0
  738. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/tri.comp +43 -0
  739. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/trunc.comp +22 -0
  740. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{types.comp → types.glsl} +435 -24
  741. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +148 -6
  742. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/utils.glsl +25 -0
  743. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +619 -177
  744. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/xielu.comp +35 -0
  745. data/ext/sources/ggml/src/ggml-webgpu/CMakeLists.txt +80 -0
  746. data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp +169 -0
  747. data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu.cpp +3087 -0
  748. data/ext/sources/ggml/src/ggml-webgpu/pre_wgsl.hpp +778 -0
  749. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/bin_op.tmpl.wgsl +188 -0
  750. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary_head.tmpl +45 -0
  751. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +930 -0
  752. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cpy.tmpl.wgsl +101 -0
  753. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +147 -0
  754. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn.wgsl +591 -0
  755. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/get_rows.tmpl.wgsl +874 -0
  756. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/glu.tmpl.wgsl +323 -0
  757. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl +40 -0
  758. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl +907 -0
  759. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl +97 -0
  760. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.tmpl.wgsl +247 -0
  761. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_subgroup_matrix.tmpl.wgsl +302 -0
  762. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.tmpl.wgsl +267 -0
  763. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm.wgsl +123 -0
  764. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rope.tmpl.wgsl +295 -0
  765. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/scale.tmpl.wgsl +90 -0
  766. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.tmpl.wgsl +112 -0
  767. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +81 -0
  768. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/soft_max.tmpl.wgsl +345 -0
  769. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/unary_op.wgsl +483 -0
  770. data/ext/sources/ggml/src/ggml-zdnn/CMakeLists.txt +36 -0
  771. data/ext/sources/ggml/src/ggml-zdnn/common.hpp +59 -0
  772. data/ext/sources/ggml/src/ggml-zdnn/ggml-zdnn.cpp +628 -0
  773. data/ext/sources/ggml/src/ggml-zdnn/mmf.cpp +80 -0
  774. data/ext/sources/ggml/src/ggml-zdnn/mmf.hpp +12 -0
  775. data/ext/sources/ggml/src/ggml-zdnn/utils.cpp +79 -0
  776. data/ext/sources/ggml/src/ggml-zdnn/utils.hpp +19 -0
  777. data/ext/sources/ggml/src/ggml-zendnn/CMakeLists.txt +92 -0
  778. data/ext/sources/ggml/src/ggml-zendnn/ggml-zendnn.cpp +466 -0
  779. data/ext/sources/ggml/src/ggml.c +901 -129
  780. data/ext/sources/ggml/src/gguf.cpp +8 -1
  781. data/ext/sources/include/whisper.h +1 -0
  782. data/ext/sources/src/CMakeLists.txt +3 -1
  783. data/ext/sources/src/whisper.cpp +124 -81
  784. data/ext/sources/tests/CMakeLists.txt +8 -1
  785. data/ext/sources/tests/test-vad-full.cpp +7 -5
  786. data/ext/sources/tests/test-vad.cpp +3 -3
  787. data/extsources.rb +1 -0
  788. data/lib/whisper/model/uri.rb +17 -18
  789. data/sig/whisper.rbs +126 -2
  790. data/test/test_params.rb +24 -8
  791. data/test/test_segment.rb +0 -1
  792. data/test/test_token.rb +70 -0
  793. data/test/test_vad.rb +1 -1
  794. data/test/test_vad_context.rb +50 -0
  795. data/test/test_vad_segment.rb +19 -0
  796. data/test/test_vad_segments.rb +16 -0
  797. data/test/test_whisper.rb +8 -1
  798. data/whispercpp.gemspec +1 -1
  799. metadata +439 -179
  800. data/ext/sources/build-xcframework.sh +0 -547
  801. data/ext/sources/examples/talk-llama/llama-kv-cache-unified-iswa.cpp +0 -279
  802. data/ext/sources/examples/talk-llama/llama-kv-cache-unified.cpp +0 -1841
  803. data/ext/sources/examples/talk-llama/llama-kv-cache-unified.h +0 -303
  804. data/ext/sources/ggml/include/ggml-kompute.h +0 -50
  805. data/ext/sources/ggml/src/ggml-amx/CMakeLists.txt +0 -107
  806. data/ext/sources/ggml/src/ggml-amx/common.h +0 -94
  807. data/ext/sources/ggml/src/ggml-amx/ggml-amx.cpp +0 -446
  808. data/ext/sources/ggml/src/ggml-amx/mmq.cpp +0 -2510
  809. data/ext/sources/ggml/src/ggml-amx/mmq.h +0 -17
  810. data/ext/sources/ggml/src/ggml-cann/Doxyfile +0 -2579
  811. data/ext/sources/ggml/src/ggml-cann/kernels/CMakeLists.txt +0 -30
  812. data/ext/sources/ggml/src/ggml-cann/kernels/ascendc_kernels.h +0 -19
  813. data/ext/sources/ggml/src/ggml-cann/kernels/dup.cpp +0 -234
  814. data/ext/sources/ggml/src/ggml-cann/kernels/get_row_f16.cpp +0 -197
  815. data/ext/sources/ggml/src/ggml-cann/kernels/get_row_f32.cpp +0 -190
  816. data/ext/sources/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -204
  817. data/ext/sources/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -191
  818. data/ext/sources/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -218
  819. data/ext/sources/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -216
  820. data/ext/sources/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -295
  821. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f16.cu +0 -357
  822. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f16.cuh +0 -3
  823. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f32.cu +0 -365
  824. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f32.cuh +0 -3
  825. data/ext/sources/ggml/src/ggml-cuda/fattn-vec-f16.cuh +0 -482
  826. data/ext/sources/ggml/src/ggml-cuda/fattn-vec-f32.cuh +0 -472
  827. data/ext/sources/ggml/src/ggml-cuda/mmv.cu +0 -506
  828. data/ext/sources/ggml/src/ggml-cuda/mmv.cuh +0 -11
  829. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +0 -5
  830. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +0 -5
  831. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +0 -5
  832. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +0 -5
  833. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +0 -5
  834. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +0 -5
  835. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +0 -5
  836. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +0 -5
  837. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +0 -5
  838. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +0 -5
  839. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +0 -5
  840. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +0 -5
  841. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +0 -5
  842. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +0 -5
  843. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +0 -5
  844. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +0 -5
  845. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +0 -5
  846. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +0 -5
  847. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +0 -5
  848. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +0 -5
  849. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +0 -5
  850. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +0 -5
  851. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +0 -5
  852. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +0 -5
  853. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +0 -5
  854. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +0 -5
  855. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +0 -5
  856. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +0 -5
  857. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +0 -5
  858. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +0 -5
  859. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +0 -5
  860. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +0 -5
  861. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +0 -5
  862. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +0 -5
  863. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +0 -5
  864. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +0 -5
  865. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +0 -5
  866. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +0 -5
  867. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +0 -5
  868. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +0 -5
  869. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +0 -5
  870. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +0 -5
  871. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +0 -5
  872. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +0 -5
  873. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +0 -5
  874. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +0 -5
  875. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +0 -5
  876. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +0 -5
  877. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +0 -5
  878. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +0 -5
  879. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +0 -5
  880. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +0 -5
  881. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +0 -5
  882. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +0 -5
  883. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +0 -5
  884. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +0 -5
  885. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +0 -5
  886. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +0 -5
  887. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +0 -5
  888. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +0 -5
  889. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +0 -5
  890. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +0 -5
  891. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +0 -5
  892. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +0 -5
  893. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +0 -5
  894. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +0 -5
  895. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +0 -5
  896. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +0 -5
  897. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +0 -5
  898. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +0 -5
  899. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +0 -5
  900. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +0 -5
  901. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +0 -5
  902. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +0 -5
  903. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +0 -5
  904. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +0 -5
  905. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +0 -5
  906. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +0 -5
  907. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +0 -5
  908. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +0 -5
  909. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +0 -5
  910. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +0 -5
  911. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +0 -5
  912. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +0 -5
  913. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +0 -5
  914. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +0 -5
  915. data/ext/sources/ggml/src/ggml-kompute/CMakeLists.txt +0 -166
  916. data/ext/sources/ggml/src/ggml-kompute/ggml-kompute.cpp +0 -2251
  917. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/common.comp +0 -112
  918. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_add.comp +0 -58
  919. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_addrow.comp +0 -25
  920. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f16.comp +0 -52
  921. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f32.comp +0 -52
  922. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f16.comp +0 -52
  923. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f32.comp +0 -52
  924. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_diagmask.comp +0 -30
  925. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_gelu.comp +0 -22
  926. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows.comp +0 -17
  927. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f16.comp +0 -31
  928. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f32.comp +0 -31
  929. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_0.comp +0 -38
  930. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_1.comp +0 -39
  931. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q6_k.comp +0 -44
  932. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul.comp +0 -52
  933. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_f16.comp +0 -69
  934. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_mat_f32.comp +0 -51
  935. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_0.comp +0 -33
  936. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_1.comp +0 -35
  937. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_k.comp +0 -140
  938. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q6_k.comp +0 -106
  939. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q8_0.comp +0 -73
  940. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n.comp +0 -52
  941. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n_pre.comp +0 -28
  942. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_norm.comp +0 -84
  943. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_relu.comp +0 -21
  944. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rmsnorm.comp +0 -53
  945. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f16.comp +0 -52
  946. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f32.comp +0 -52
  947. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f16.comp +0 -52
  948. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f32.comp +0 -52
  949. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_scale.comp +0 -19
  950. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_scale_8.comp +0 -23
  951. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_silu.comp +0 -22
  952. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_softmax.comp +0 -72
  953. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/rope_common.comp +0 -71
  954. data/ext/sources/ggml/src/ggml-metal/ggml-metal.m +0 -6280
  955. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp +0 -162
  956. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.comp +0 -118
  957. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.comp +0 -99
  958. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.comp +0 -58
  959. /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{test_bfloat16_support.comp → feature-tests/bfloat16.comp} +0 -0
  960. /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{test_coopmat_support.comp → feature-tests/coopmat.comp} +0 -0
  961. /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{test_coopmat2_support.comp → feature-tests/coopmat2.comp} +0 -0
  962. /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{test_integer_dot_support.comp → feature-tests/integer_dot.comp} +0 -0
  963. /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{glu_main.comp → glu_main.glsl} +0 -0
@@ -22,23 +22,24 @@
22
22
 
23
23
  #include "ggml-cann.h"
24
24
 
25
+ #include "ggml-backend-impl.h"
26
+ #include "ggml-cann/aclnn_ops.h"
27
+ #include "ggml-cann/common.h"
28
+ #include "ggml-impl.h"
29
+ #include "ggml.h"
30
+
25
31
  #include <acl/acl.h>
32
+ #include <aclnnop/aclnn_trans_matmul_weight.h>
26
33
  #include <stdarg.h>
27
34
 
35
+ #include <chrono>
28
36
  #include <cmath>
29
37
  #include <cstdio>
30
38
  #include <cstring>
31
39
  #include <mutex>
40
+ #include <optional>
32
41
  #include <queue>
33
- #include <chrono>
34
42
  #include <unordered_set>
35
- #include <optional>
36
-
37
- #include "ggml-impl.h"
38
- #include "ggml-backend-impl.h"
39
- #include "ggml-cann/aclnn_ops.h"
40
- #include "ggml-cann/common.h"
41
- #include "ggml.h"
42
43
 
43
44
  #define GGML_COMMON_DECL_C
44
45
 
@@ -55,33 +56,41 @@
55
56
  * @param line The line number where the error occurred.
56
57
  * @param msg The error message.
57
58
  */
58
- [[noreturn]] void ggml_cann_error(const char* stmt, const char* func,
59
- const char* file, int line, const char* msg) {
59
+ [[noreturn]] void ggml_cann_error(const char * stmt, const char * func, const char * file, int line, const char * msg) {
60
60
  int32_t id = -1;
61
61
  aclrtGetDevice(&id);
62
62
 
63
63
  GGML_LOG_ERROR("CANN error: %s\n", msg);
64
- GGML_LOG_ERROR(" current device: %d, in function %s at %s:%d\n", id, func,
65
- file, line);
64
+ GGML_LOG_ERROR(" current device: %d, in function %s at %s:%d\n", id, func, file, line);
66
65
  GGML_LOG_ERROR(" %s\n", stmt);
67
66
  // abort with GGML_ASSERT to get a stack trace
68
67
  GGML_ABORT("CANN error");
69
68
  }
70
69
 
70
+ // Thread-local variable to record the current device of this thread.
71
+ thread_local int g_current_cann_device = -1;
72
+
71
73
  /**
72
- * @brief Sets the device to be used by CANN.
74
+ * @brief Set the CANN device to be used.
73
75
  *
74
- * @param device The device ID to set.
76
+ * @param device The target device ID to set.
75
77
  */
76
78
  void ggml_cann_set_device(const int32_t device) {
77
- // TODO: uncomment these lines after empty context has fixed.
78
- // int current_device;
79
- // ACL_CHECK(aclrtGetDevice(&current_device));
79
+ // int current_device = -1;
80
+ // Note: In some CANN versions, if no device has been set yet,
81
+ // aclrtGetDevice(&current_device) may return 0 by default.
82
+ // aclrtGetDevice(&current_device);
83
+
84
+ // If the current device is already the target one, no need to switch.
85
+ if (device == g_current_cann_device) {
86
+ return;
87
+ }
80
88
 
81
- // if (device == current_device) {
82
- // return;
83
- // }
89
+ // Switch to the new device.
84
90
  ACL_CHECK(aclrtSetDevice(device));
91
+
92
+ // Update the global device record.
93
+ g_current_cann_device = device;
85
94
  }
86
95
 
87
96
  /**
@@ -96,12 +105,14 @@ int32_t ggml_cann_get_device() {
96
105
  }
97
106
 
98
107
  /**
99
- * @brief Get the value of the specified environment variable (name).
108
+ * @brief Get the value of the specified environment variable (name) as lowercase.
100
109
  * if not empty, return a std::string object
101
110
  */
102
- std::optional<std::string> get_env(const std::string& name) {
103
- const char* val = std::getenv(name.c_str());
104
- if (!val) return std::nullopt;
111
+ std::optional<std::string> get_env_as_lowercase(const std::string & name) {
112
+ const char * val = std::getenv(name.c_str());
113
+ if (!val) {
114
+ return std::nullopt;
115
+ }
105
116
  std::string res = std::string(val);
106
117
  std::transform(res.begin(), res.end(), res.begin(), ::tolower);
107
118
  return res;
@@ -110,11 +121,29 @@ std::optional<std::string> get_env(const std::string& name) {
110
121
  /**
111
122
  * @brief Verify whether the environment variable is a valid value.
112
123
  */
113
- bool parse_bool(const std::string& value) {
114
- std::unordered_set<std::string> valid_values = {"on", "1", "yes", "y", "enable", "true"};
124
+ bool parse_bool(const std::string & value) {
125
+ static const std::unordered_set<std::string> valid_values = { "on", "1", "yes", "y", "enable", "true" };
115
126
  return valid_values.find(value) != valid_values.end();
116
127
  }
117
128
 
129
+ /**
130
+ * @brief Parse a string as an integer, returning 0 if invalid.
131
+ *
132
+ * This function attempts to convert the input string `value` to an `int`.
133
+ * If the string is not a valid integer or is out of the `int` range,
134
+ * it returns 0.
135
+ *
136
+ * @param value The string to parse.
137
+ * @return The parsed integer, or 0 if conversion fails.
138
+ */
139
+ int parse_integer(const std::string & value) {
140
+ try {
141
+ return std::stoi(value);
142
+ } catch (...) {
143
+ return 0;
144
+ }
145
+ }
146
+
118
147
  /**
119
148
  * @brief Initialize the CANN device information.
120
149
  *
@@ -126,11 +155,10 @@ bool parse_bool(const std::string& value) {
126
155
  static ggml_cann_device_info ggml_cann_init() {
127
156
  ggml_cann_device_info info = {};
128
157
 
129
- aclError err = aclrtGetDeviceCount((uint32_t*)&info.device_count);
158
+ aclError err = aclrtGetDeviceCount((uint32_t *) &info.device_count);
130
159
 
131
160
  if (err != ACL_SUCCESS) {
132
- GGML_LOG_ERROR("%s: failed to initialize CANN: %s\n",
133
- __func__, aclGetRecentErrMsg());
161
+ GGML_LOG_ERROR("%s: failed to initialize CANN: %s\n", __func__, aclGetRecentErrMsg());
134
162
  return info;
135
163
  }
136
164
 
@@ -138,16 +166,15 @@ static ggml_cann_device_info ggml_cann_init() {
138
166
 
139
167
  for (int id = 0; id < info.device_count; ++id) {
140
168
  aclrtPhysicalMemProp prop = {};
141
- prop.handleType = ACL_MEM_HANDLE_TYPE_NONE;
142
- prop.allocationType = ACL_MEM_ALLOCATION_TYPE_PINNED;
143
- prop.memAttr = ACL_HBM_MEM_HUGE;
144
- prop.location.type = ACL_MEM_LOCATION_TYPE_DEVICE;
145
- prop.location.id = id;
146
- prop.reserve = 0;
147
- err = aclrtMemGetAllocationGranularity(
148
- &prop, ACL_RT_MEM_ALLOC_GRANULARITY_RECOMMENDED,
149
- &info.devices[id].vmm_granularity);
150
- info.devices[id].vmm = err == ACL_SUCCESS;
169
+ prop.handleType = ACL_MEM_HANDLE_TYPE_NONE;
170
+ prop.allocationType = ACL_MEM_ALLOCATION_TYPE_PINNED;
171
+ prop.memAttr = ACL_HBM_MEM_HUGE;
172
+ prop.location.type = ACL_MEM_LOCATION_TYPE_DEVICE;
173
+ prop.location.id = id;
174
+ prop.reserve = 0;
175
+ err = aclrtMemGetAllocationGranularity(&prop, ACL_RT_MEM_ALLOC_GRANULARITY_RECOMMENDED,
176
+ &info.devices[id].vmm_granularity);
177
+ info.devices[id].vmm = err == ACL_SUCCESS;
151
178
 
152
179
  size_t free, total;
153
180
  ggml_backend_cann_get_device_memory(id, &free, &total);
@@ -167,7 +194,7 @@ static ggml_cann_device_info ggml_cann_init() {
167
194
  *
168
195
  * @return A reference to the structure containing the device information.
169
196
  */
170
- const ggml_cann_device_info& ggml_cann_info() {
197
+ const ggml_cann_device_info & ggml_cann_info() {
171
198
  static ggml_cann_device_info info = ggml_cann_init();
172
199
  return info;
173
200
  }
@@ -187,7 +214,7 @@ struct ggml_cann_pool_buf_prio : public ggml_cann_pool {
187
214
  /**
188
215
  * @brief The minimum free margin for a buffer.
189
216
  */
190
- static const size_t min_free_margin = 1ull << 20; // 1MB
217
+ static const size_t min_free_margin = 1ull << 20; // 1MB
191
218
 
192
219
  /**
193
220
  * @brief The alignment for buffer allocation.
@@ -208,22 +235,18 @@ struct ggml_cann_pool_buf_prio : public ggml_cann_pool {
208
235
  * @brief Structure representing a CANN buffer.
209
236
  */
210
237
  struct ggml_cann_buffer {
211
- void* ptr = nullptr; ///< Pointer to the buffer.
212
- size_t size = 0; ///< Size of the buffer.
213
- std::chrono::steady_clock::time_point last_used; ///< Last used time.
238
+ void * ptr = nullptr; ///< Pointer to the buffer.
239
+ size_t size = 0; ///< Size of the buffer.
240
+ std::chrono::steady_clock::time_point last_used; ///< Last used time.
214
241
 
215
- bool operator>(const ggml_cann_buffer& other) const {
216
- return size > other.size;
217
- }
242
+ bool operator>(const ggml_cann_buffer & other) const { return size > other.size; }
218
243
  };
219
244
 
220
245
  /**
221
246
  * @brief Array of CANN buffers in the pool.
222
247
  */
223
- std::unordered_map<void*, size_t> buffer_pool;
224
- std::priority_queue<ggml_cann_buffer,
225
- std::vector<ggml_cann_buffer>,
226
- std::greater<>> free_buffers ;
248
+ std::unordered_map<void *, size_t> buffer_pool;
249
+ std::priority_queue<ggml_cann_buffer, std::vector<ggml_cann_buffer>, std::greater<>> free_buffers;
227
250
 
228
251
  /**
229
252
  * @brief Total size of all buffers in the pool.
@@ -236,7 +259,7 @@ struct ggml_cann_pool_buf_prio : public ggml_cann_pool {
236
259
  * @param device The device ID to associate with this buffer pool.
237
260
  */
238
261
  explicit ggml_cann_pool_buf_prio(int device) : device(device) {
239
- disable_clean = parse_bool(get_env("GGML_CANN_DISABLE_BUF_POOL_CLEAN").value_or(""));
262
+ disable_clean = parse_bool(get_env_as_lowercase("GGML_CANN_DISABLE_BUF_POOL_CLEAN").value_or(""));
240
263
  }
241
264
 
242
265
  /**
@@ -244,7 +267,7 @@ struct ggml_cann_pool_buf_prio : public ggml_cann_pool {
244
267
  */
245
268
  ~ggml_cann_pool_buf_prio() {
246
269
  ggml_cann_set_device(device);
247
- for (auto& [b_ptr, b_size] : buffer_pool) {
270
+ for (auto & [b_ptr, b_size] : buffer_pool) {
248
271
  aclrtFree(b_ptr);
249
272
  pool_size -= b_size;
250
273
  }
@@ -260,14 +283,14 @@ struct ggml_cann_pool_buf_prio : public ggml_cann_pool {
260
283
  * the allocated buffer.
261
284
  * @return A pointer to the allocated buffer.
262
285
  */
263
- void* alloc(size_t size, size_t* actual_size) override {
286
+ void * alloc(size_t size, size_t * actual_size) override {
264
287
  size = GGML_PAD(size, alignment);
265
288
  if (size == 0) {
266
289
  size = alignment;
267
290
  }
268
291
 
269
- void* ptr = nullptr;
270
- auto now = std::chrono::steady_clock::now();
292
+ void * ptr = nullptr;
293
+ auto now = std::chrono::steady_clock::now();
271
294
 
272
295
  std::vector<ggml_cann_buffer> free_buffers_rest;
273
296
  free_buffers_rest.reserve(free_buffers.size());
@@ -280,24 +303,22 @@ struct ggml_cann_pool_buf_prio : public ggml_cann_pool {
280
303
  const size_t margin = b.size - size;
281
304
  if (margin <= max_reuse_margin) {
282
305
  *actual_size = b.size;
283
- ptr = b.ptr;
306
+ ptr = b.ptr;
284
307
  #ifdef DEBUG_CANN_MALLOC
285
308
  GGML_LOG_INFO(
286
309
  "cann pool[%d]: reused %p, "
287
310
  "pool_size = %5u MB, "
288
311
  "size = %5u MB, "
289
312
  "margin = %5u MB\n",
290
- device, b.ptr,
291
- (uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576),
292
- (uint32_t)(GGML_PAD(size, 1048576) / 1048576),
293
- (uint32_t)(GGML_PAD(margin, 1048576) / 1048576));
313
+ device, b.ptr, (uint32_t) (GGML_PAD(pool_size, 1048576) / 1048576),
314
+ (uint32_t) (GGML_PAD(size, 1048576) / 1048576),
315
+ (uint32_t) (GGML_PAD(margin, 1048576) / 1048576));
294
316
  #endif
295
317
  break;
296
318
  }
297
319
  }
298
320
 
299
- bool should_clean = !disable_clean &&
300
- b.size > min_free_margin &&
321
+ bool should_clean = !disable_clean && b.size > min_free_margin &&
301
322
  std::chrono::duration_cast<std::chrono::milliseconds>(now - b.last_used).count() > 100;
302
323
  if (should_clean) {
303
324
  // free the buffer if the size is needed to be freed
@@ -309,20 +330,20 @@ struct ggml_cann_pool_buf_prio : public ggml_cann_pool {
309
330
  "cann pool[%d]: clean %p, "
310
331
  "pool_size = %5u MB, "
311
332
  "size = %5u MB\n",
312
- device, b.ptr,
313
- (uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576),
314
- (uint32_t)(GGML_PAD(b.size, 1048576) / 1048576));
333
+ device, b.ptr, (uint32_t) (GGML_PAD(pool_size, 1048576) / 1048576),
334
+ (uint32_t) (GGML_PAD(b.size, 1048576) / 1048576));
315
335
  #endif
316
336
  continue;
317
337
  }
318
338
  free_buffers_rest.push_back(b);
319
339
  }
320
- for (ggml_cann_buffer &b : free_buffers_rest) {
340
+ for (ggml_cann_buffer & b : free_buffers_rest) {
321
341
  free_buffers.push(std::move(b));
322
342
  }
323
343
 
324
344
  #ifdef DEBUG_CANN_MALLOC
325
- GGML_LOG_INFO("cann pool[%d] free pool_size = %5u MB\n\n", device, (uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576));
345
+ GGML_LOG_INFO("cann pool[%d] free pool_size = %5u MB\n\n", device,
346
+ (uint32_t) (GGML_PAD(pool_size, 1048576) / 1048576));
326
347
  #endif
327
348
  if (ptr != nullptr) {
328
349
  return ptr;
@@ -338,8 +359,8 @@ struct ggml_cann_pool_buf_prio : public ggml_cann_pool {
338
359
  "cann pool[%d]: allocate %p, "
339
360
  "pool_size = %5u MB, "
340
361
  "size = %5u MB\n",
341
- device, ptr, (uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576),
342
- (uint32_t)(GGML_PAD(size, 1048576) / 1048576));
362
+ device, ptr, (uint32_t) (GGML_PAD(pool_size, 1048576) / 1048576),
363
+ (uint32_t) (GGML_PAD(size, 1048576) / 1048576));
343
364
  #endif
344
365
  buffer_pool.emplace(ptr, size);
345
366
  return ptr;
@@ -351,7 +372,7 @@ struct ggml_cann_pool_buf_prio : public ggml_cann_pool {
351
372
  * @param ptr Pointer to the buffer to free.
352
373
  * @param size Size of the buffer to free.
353
374
  */
354
- void free(void* ptr, size_t size) override {
375
+ void free(void * ptr, size_t size) override {
355
376
  GGML_UNUSED(size);
356
377
  auto it = buffer_pool.find(ptr);
357
378
  if (it == buffer_pool.end()) {
@@ -359,13 +380,12 @@ struct ggml_cann_pool_buf_prio : public ggml_cann_pool {
359
380
  }
360
381
 
361
382
  auto now = std::chrono::steady_clock::now();
362
- free_buffers.emplace(ggml_cann_buffer{ptr, it->second, now});
383
+ free_buffers.emplace(ggml_cann_buffer{ ptr, it->second, now });
363
384
  #ifdef DEBUG_CANN_MALLOC
364
385
  GGML_LOG_INFO(
365
386
  "cann pool[%d]: return %p, "
366
387
  "pool_size = %5u MB\n",
367
- device, ptr,
368
- (uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576));
388
+ device, ptr, (uint32_t) (GGML_PAD(pool_size, 1048576) / 1048576));
369
389
  #endif
370
390
  }
371
391
  };
@@ -384,7 +404,7 @@ struct ggml_cann_pool_buf : public ggml_cann_pool {
384
404
  /**
385
405
  * @brief The minimum free margin for a buffer.
386
406
  */
387
- static const size_t min_free_margin = 1ull << 20; // 1MB
407
+ static const size_t min_free_margin = 1ull << 20; // 1MB
388
408
 
389
409
  /**
390
410
  * @brief The alignment for buffer allocation.
@@ -410,10 +430,10 @@ struct ggml_cann_pool_buf : public ggml_cann_pool {
410
430
  * @brief Structure representing a CANN buffer.
411
431
  */
412
432
  struct ggml_cann_buffer {
413
- void* ptr = nullptr; ///< Pointer to the buffer memory.
414
- size_t size = 0; ///< Size of the buffer.
415
- bool used = false; ///< Whether the buffer is currently in use.
416
- std::chrono::steady_clock::time_point last_used; ///< Last used time.
433
+ void * ptr = nullptr; ///< Pointer to the buffer memory.
434
+ size_t size = 0; ///< Size of the buffer.
435
+ bool used = false; ///< Whether the buffer is currently in use.
436
+ std::chrono::steady_clock::time_point last_used; ///< Last used time.
417
437
  };
418
438
 
419
439
  /**
@@ -432,7 +452,7 @@ struct ggml_cann_pool_buf : public ggml_cann_pool {
432
452
  * @param device The device ID to associate with this buffer pool.
433
453
  */
434
454
  explicit ggml_cann_pool_buf(int device) : device(device) {
435
- disable_clean = parse_bool(get_env("GGML_CANN_DISABLE_BUF_POOL_CLEAN").value_or(""));
455
+ disable_clean = parse_bool(get_env_as_lowercase("GGML_CANN_DISABLE_BUF_POOL_CLEAN").value_or(""));
436
456
  }
437
457
 
438
458
  /**
@@ -441,7 +461,7 @@ struct ggml_cann_pool_buf : public ggml_cann_pool {
441
461
  ~ggml_cann_pool_buf() {
442
462
  ggml_cann_set_device(device);
443
463
  for (int i = 0; i < MAX_BUFFERS; ++i) {
444
- ggml_cann_buffer& b = buffer_pool[i];
464
+ ggml_cann_buffer & b = buffer_pool[i];
445
465
  if (b.ptr != nullptr) {
446
466
  aclrtFree(b.ptr);
447
467
  pool_size -= b.size;
@@ -458,18 +478,18 @@ struct ggml_cann_pool_buf : public ggml_cann_pool {
458
478
  * the allocated buffer.
459
479
  * @return A pointer to the allocated buffer.
460
480
  */
461
- void* alloc(size_t size, size_t* actual_size) override {
481
+ void * alloc(size_t size, size_t * actual_size) override {
462
482
  size = GGML_PAD(size, alignment);
463
483
  if (size == 0) {
464
484
  size = alignment;
465
485
  }
466
486
 
467
- void* ptr = nullptr;
468
- auto now = std::chrono::steady_clock::now();
487
+ void * ptr = nullptr;
488
+ auto now = std::chrono::steady_clock::now();
469
489
 
470
490
  int i = 0;
471
491
  for (; i < MAX_BUFFERS; ++i) {
472
- ggml_cann_buffer& b = buffer_pool[i];
492
+ ggml_cann_buffer & b = buffer_pool[i];
473
493
  if (b.ptr == nullptr) {
474
494
  break;
475
495
  }
@@ -481,25 +501,23 @@ struct ggml_cann_pool_buf : public ggml_cann_pool {
481
501
  const size_t margin = b.size - size;
482
502
  if (margin <= max_reuse_margin) {
483
503
  *actual_size = b.size;
484
- b.used = true;
485
- ptr = b.ptr;
504
+ b.used = true;
505
+ ptr = b.ptr;
486
506
  #ifdef DEBUG_CANN_MALLOC
487
507
  GGML_LOG_INFO(
488
508
  "cann pool[%d]: reused %p, "
489
509
  "pool_size = %5u MB, "
490
510
  "size = %5u MB, "
491
511
  "margin = %5u MB\n",
492
- device, b.ptr,
493
- (uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576),
494
- (uint32_t)(GGML_PAD(size, 1048576) / 1048576),
495
- (uint32_t)(GGML_PAD(margin, 1048576) / 1048576));
512
+ device, b.ptr, (uint32_t) (GGML_PAD(pool_size, 1048576) / 1048576),
513
+ (uint32_t) (GGML_PAD(size, 1048576) / 1048576),
514
+ (uint32_t) (GGML_PAD(margin, 1048576) / 1048576));
496
515
  #endif
497
516
  break;
498
517
  }
499
518
  }
500
519
 
501
- bool should_clean = !disable_clean &&
502
- b.size > min_free_margin &&
520
+ bool should_clean = !disable_clean && b.size > min_free_margin &&
503
521
  std::chrono::duration_cast<std::chrono::milliseconds>(now - b.last_used).count() > 100;
504
522
  if (should_clean) {
505
523
  // free the buffer if the size is needed to be freed
@@ -510,9 +528,8 @@ struct ggml_cann_pool_buf : public ggml_cann_pool {
510
528
  "cann pool[%d]: clean %p, "
511
529
  "pool_size = %5u MB, "
512
530
  "size = %5u MB\n",
513
- device, b.ptr,
514
- (uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576),
515
- (uint32_t)(GGML_PAD(b.size, 1048576) / 1048576));
531
+ device, b.ptr, (uint32_t) (GGML_PAD(pool_size, 1048576) / 1048576),
532
+ (uint32_t) (GGML_PAD(b.size, 1048576) / 1048576));
516
533
  #endif
517
534
  b.ptr = nullptr;
518
535
  }
@@ -523,13 +540,13 @@ struct ggml_cann_pool_buf : public ggml_cann_pool {
523
540
 
524
541
  if (i < MAX_BUFFERS) {
525
542
  // allocate a new buffer if no buffer can be reused
526
- ggml_cann_buffer& b = buffer_pool[i];
543
+ ggml_cann_buffer & b = buffer_pool[i];
527
544
  ggml_cann_set_device(device);
528
545
  ACL_CHECK(aclrtMalloc(&b.ptr, size, ACL_MEM_MALLOC_HUGE_FIRST));
529
546
  pool_size += size;
530
547
  *actual_size = size;
531
- b.size = size;
532
- b.used = true;
548
+ b.size = size;
549
+ b.used = true;
533
550
  if (i >= MAX_BUFFERS - 8) {
534
551
  GGML_LOG_WARN("cann pool[%d]: slots almost full\n", device);
535
552
  }
@@ -538,9 +555,8 @@ struct ggml_cann_pool_buf : public ggml_cann_pool {
538
555
  "cann pool[%d]: allocate %p, "
539
556
  "pool_size = %5u MB, "
540
557
  "size = %5u MB\n",
541
- device, b.ptr,
542
- (uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576),
543
- (uint32_t)(GGML_PAD(b.size, 1048576) / 1048576));
558
+ device, b.ptr, (uint32_t) (GGML_PAD(pool_size, 1048576) / 1048576),
559
+ (uint32_t) (GGML_PAD(b.size, 1048576) / 1048576));
544
560
  #endif
545
561
  return b.ptr;
546
562
  }
@@ -554,21 +570,20 @@ struct ggml_cann_pool_buf : public ggml_cann_pool {
554
570
  * @param ptr Pointer to the buffer to free.
555
571
  * @param size Size of the buffer to free.
556
572
  */
557
- void free(void* ptr, size_t size) override {
573
+ void free(void * ptr, size_t size) override {
558
574
  GGML_UNUSED(size);
559
575
  for (int i = 0; i < MAX_BUFFERS; ++i) {
560
- ggml_cann_buffer& b = buffer_pool[i];
576
+ ggml_cann_buffer & b = buffer_pool[i];
561
577
  if (b.ptr != ptr) {
562
578
  continue;
563
579
  }
564
- b.used = false;
580
+ b.used = false;
565
581
  b.last_used = std::chrono::steady_clock::now();
566
582
  #ifdef DEBUG_CANN_MALLOC
567
583
  GGML_LOG_INFO(
568
584
  "cann pool[%d]: return %p, "
569
585
  "pool_size = %5u MB\n",
570
- device, b.ptr,
571
- (uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576));
586
+ device, b.ptr, (uint32_t) (GGML_PAD(pool_size, 1048576) / 1048576));
572
587
  #endif
573
588
  return;
574
589
  }
@@ -596,7 +611,7 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
596
611
  /**
597
612
  * @brief Pointer to the start of the virtual memory pool.
598
613
  */
599
- void* pool_addr = 0;
614
+ void * pool_addr = 0;
600
615
 
601
616
  /**
602
617
  * @brief Amount of virtual memory used in the pool.
@@ -621,7 +636,7 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
621
636
  /**
622
637
  * @brief Offsets for the mapped memory regions.
623
638
  */
624
- std::vector<void*> map_offsets;
639
+ std::vector<void *> map_offsets;
625
640
 
626
641
  /**
627
642
  * @brief Constructor to initialize the buffer pool with virtual memory for
@@ -629,11 +644,10 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
629
644
  *
630
645
  * @param device The device ID to associate with this buffer pool.
631
646
  */
632
- explicit ggml_cann_pool_vmm(int device)
633
- : device(device) {
634
- auto dev = ggml_cann_info().devices[device];
647
+ explicit ggml_cann_pool_vmm(int device) : device(device) {
648
+ auto dev = ggml_cann_info().devices[device];
635
649
  granularity = dev.vmm_granularity;
636
- max_size = dev.total_vram;
650
+ max_size = dev.total_vram;
637
651
  }
638
652
 
639
653
  /**
@@ -641,10 +655,10 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
641
655
  */
642
656
  ~ggml_cann_pool_vmm() {
643
657
  if (pool_addr != 0) {
644
- for (auto& offset : map_offsets) {
658
+ for (auto & offset : map_offsets) {
645
659
  ACL_CHECK(aclrtUnmapMem(offset));
646
660
  }
647
- for (auto& handle : handles) {
661
+ for (auto & handle : handles) {
648
662
  ACL_CHECK(aclrtFreePhysical(handle));
649
663
  }
650
664
  ACL_CHECK(aclrtReleaseMemAddress(pool_addr));
@@ -659,11 +673,11 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
659
673
  * the allocated buffer.
660
674
  * @return A pointer to the allocated buffer.
661
675
  */
662
- void* alloc(size_t size, size_t* actual_size) override {
676
+ void * alloc(size_t size, size_t * actual_size) override {
663
677
  // round up the allocation size to the alignment to ensure that all
664
678
  // allocations are aligned for all data types
665
679
  const size_t alignment = 128;
666
- size = GGML_PAD(size, alignment);
680
+ size = GGML_PAD(size, alignment);
667
681
  if (size == 0) {
668
682
  size = alignment;
669
683
  }
@@ -673,53 +687,51 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
673
687
  if (size > avail) {
674
688
  // round up to the next multiple of the granularity
675
689
  size_t reserve_size = size - avail;
676
- reserve_size = GGML_PAD(reserve_size, granularity);
690
+ reserve_size = GGML_PAD(reserve_size, granularity);
677
691
 
678
692
  GGML_ASSERT(pool_size + reserve_size <= max_size);
679
693
 
680
694
  // allocate more physical memory
681
695
  aclrtPhysicalMemProp prop = {};
682
- prop.handleType = ACL_MEM_HANDLE_TYPE_NONE;
683
- prop.allocationType = ACL_MEM_ALLOCATION_TYPE_PINNED;
684
- prop.memAttr = ACL_HBM_MEM_HUGE;
685
- prop.location.type = ACL_MEM_LOCATION_TYPE_DEVICE;
686
- prop.location.id = device;
687
- prop.reserve = 0;
696
+ prop.handleType = ACL_MEM_HANDLE_TYPE_NONE;
697
+ prop.allocationType = ACL_MEM_ALLOCATION_TYPE_PINNED;
698
+ prop.memAttr = ACL_HBM_MEM_HUGE;
699
+ prop.location.type = ACL_MEM_LOCATION_TYPE_DEVICE;
700
+ prop.location.id = device;
701
+ prop.reserve = 0;
688
702
  aclrtDrvMemHandle handle;
689
703
  ACL_CHECK(aclrtMallocPhysical(&handle, reserve_size, &prop, 0));
690
704
 
691
705
  // reserve virtual address space (if not already reserved)
692
706
  if (pool_addr == 0) {
693
- ACL_CHECK(aclrtReserveMemAddress(
694
- &pool_addr, max_size, 0, NULL, 1));
707
+ ACL_CHECK(aclrtReserveMemAddress(&pool_addr, max_size, 0, NULL, 1));
695
708
  }
696
709
 
697
710
  // map at the end of the pool
698
- ACL_CHECK(aclrtMapMem((char*)pool_addr + pool_size, reserve_size, 0,
699
- handle, 0));
711
+ ACL_CHECK(aclrtMapMem((char *) pool_addr + pool_size, reserve_size, 0, handle, 0));
700
712
 
701
713
  handles.push_back(handle);
702
- map_offsets.push_back((char*)pool_addr + pool_size);
714
+ map_offsets.push_back((char *) pool_addr + pool_size);
703
715
 
704
716
  // add to the pool
705
717
  pool_size += reserve_size;
706
718
 
707
719
  #ifdef DEBUG_CANN_MALLOC
708
- GGML_LOG_INFO("cann pool[%d]: size increased to %llu MB (reserved %llu MB)\n",
709
- device, (unsigned long long) (pool_size/1024/1024),
710
- (unsigned long long) (reserve_size/1024/1024));
720
+ GGML_LOG_INFO("cann pool[%d]: size increased to %llu MB (reserved %llu MB)\n", device,
721
+ (unsigned long long) (pool_size / 1024 / 1024),
722
+ (unsigned long long) (reserve_size / 1024 / 1024));
711
723
  #endif
712
724
  }
713
725
 
714
726
  GGML_ASSERT(pool_addr != 0);
715
727
 
716
- void* ptr = (void*)((char*)pool_addr + pool_used);
728
+ void * ptr = (void *) ((char *) pool_addr + pool_used);
717
729
  *actual_size = size;
718
730
  pool_used += size;
719
731
 
720
732
  #ifdef DEBUG_CANN_MALLOC
721
- GGML_LOG_INFO("cann pool[%d]: allocated %llu bytes at %llx\n", device,
722
- (unsigned long long)size, (unsigned long long)ptr);
733
+ GGML_LOG_INFO("cann pool[%d]: allocated %llu bytes at %llx\n", device, (unsigned long long) size,
734
+ (unsigned long long) ptr);
723
735
  #endif
724
736
  return ptr;
725
737
  }
@@ -730,16 +742,16 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
730
742
  * @param ptr Pointer to the buffer to free.
731
743
  * @param size Size of the buffer to free.
732
744
  */
733
- void free(void* ptr, size_t size) override {
745
+ void free(void * ptr, size_t size) override {
734
746
  #ifdef DEBUG_CANN_MALLOC
735
- GGML_LOG_INFO("cann pool[%d]: freed %llu bytes at %llx\n", device,
736
- (unsigned long long)size, (unsigned long long)ptr);
747
+ GGML_LOG_INFO("cann pool[%d]: freed %llu bytes at %llx\n", device, (unsigned long long) size,
748
+ (unsigned long long) ptr);
737
749
  #endif
738
750
 
739
751
  pool_used -= size;
740
752
 
741
753
  // all deallocations must be in reverse order of the allocations
742
- GGML_ASSERT(ptr == (void*)((char*)pool_addr + pool_used));
754
+ GGML_ASSERT(ptr == (void *) ((char *) pool_addr + pool_used));
743
755
  }
744
756
  };
745
757
 
@@ -751,9 +763,8 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
751
763
  * @param device The device ID for which to create the pool.
752
764
  * @return A unique pointer to the created CANN pool.
753
765
  */
754
- std::unique_ptr<ggml_cann_pool> ggml_backend_cann_context::new_pool_for_device(
755
- int device) {
756
- std::string mem_pool_type = get_env("GGML_CANN_MEM_POOL").value_or("");
766
+ std::unique_ptr<ggml_cann_pool> ggml_backend_cann_context::new_pool_for_device(int device) {
767
+ std::string mem_pool_type = get_env_as_lowercase("GGML_CANN_MEM_POOL").value_or("");
757
768
 
758
769
  if (mem_pool_type == "prio") {
759
770
  GGML_LOG_INFO("%s: device %d use buffer pool with priority queue\n", __func__, device);
@@ -777,9 +788,8 @@ std::unique_ptr<ggml_cann_pool> ggml_backend_cann_context::new_pool_for_device(
777
788
  * ID, device pointer, and a name derived from GGML_CANN_NAME and the device ID.
778
789
  */
779
790
  struct ggml_backend_cann_buffer_context {
780
- int32_t device; ///< The device ID associated with this buffer context.
781
- void* dev_ptr =
782
- nullptr; ///< Pointer to the device memory allocated for the buffer.
791
+ int32_t device; ///< The device ID associated with this buffer context.
792
+ void * dev_ptr = nullptr; ///< Pointer to the device memory allocated for the buffer.
783
793
 
784
794
  /**
785
795
  * @brief Constructor to initialize the CANN buffer context.
@@ -787,9 +797,7 @@ struct ggml_backend_cann_buffer_context {
787
797
  * @param device The device ID associated with this buffer context.
788
798
  * @param dev_ptr Pointer to the device memory allocated for the buffer.
789
799
  */
790
- ggml_backend_cann_buffer_context(int32_t device, void* dev_ptr)
791
- : device(device),
792
- dev_ptr(dev_ptr) {}
800
+ ggml_backend_cann_buffer_context(int32_t device, void * dev_ptr) : device(device), dev_ptr(dev_ptr) {}
793
801
 
794
802
  /**
795
803
  * @brief Destructor to free the device memory allocated for the buffer.
@@ -807,8 +815,8 @@ struct ggml_backend_cann_buffer_context {
807
815
  * @return true if the buffer is a CANN buffer, false otherwise.
808
816
  */
809
817
  static bool ggml_backend_buft_is_cann(ggml_backend_buffer_type_t buft);
810
- static bool ggml_backend_buffer_is_cann(
811
- ggml_backend_buffer_t buffer) {
818
+
819
+ static bool ggml_backend_buffer_is_cann(ggml_backend_buffer_t buffer) {
812
820
  return ggml_backend_buft_is_cann(buffer->buft);
813
821
  }
814
822
 
@@ -820,10 +828,8 @@ static bool ggml_backend_buffer_is_cann(
820
828
  *
821
829
  * @param buffer The CANN buffer to free.
822
830
  */
823
- static void ggml_backend_cann_buffer_free_buffer(
824
- ggml_backend_buffer_t buffer) {
825
- ggml_backend_cann_buffer_context* ctx =
826
- (ggml_backend_cann_buffer_context*)buffer->context;
831
+ static void ggml_backend_cann_buffer_free_buffer(ggml_backend_buffer_t buffer) {
832
+ ggml_backend_cann_buffer_context * ctx = (ggml_backend_cann_buffer_context *) buffer->context;
827
833
  delete ctx;
828
834
  }
829
835
 
@@ -836,10 +842,8 @@ static void ggml_backend_cann_buffer_free_buffer(
836
842
  * @param buffer The CANN buffer whose base pointer is to be retrieved.
837
843
  * @return A pointer to the base of the device memory allocated for the buffer.
838
844
  */
839
- static void* ggml_backend_cann_buffer_get_base(
840
- ggml_backend_buffer_t buffer) {
841
- ggml_backend_cann_buffer_context* ctx =
842
- (ggml_backend_cann_buffer_context*)buffer->context;
845
+ static void * ggml_backend_cann_buffer_get_base(ggml_backend_buffer_t buffer) {
846
+ ggml_backend_cann_buffer_context * ctx = (ggml_backend_cann_buffer_context *) buffer->context;
843
847
  return ctx->dev_ptr;
844
848
  }
845
849
 
@@ -856,21 +860,17 @@ static void* ggml_backend_cann_buffer_get_base(
856
860
  * @param dst Pointer to the destination buffer where transformed data will be
857
861
  * stored.
858
862
  */
859
- static void ggml_backend_cann_transform_q4_0(ggml_tensor* tensor,
860
- const void* src,
861
- void* dst) {
862
-
863
- int64_t n_elems = ggml_nelements(tensor);
864
- int64_t groups = n_elems / QK4_0;
865
- size_t quant_bytes = n_elems * sizeof(uint8_t) / 2;
863
+ static void ggml_backend_cann_transform_q4_0(ggml_tensor * tensor, const void * src, void * dst) {
864
+ int64_t n_elems = ggml_nelements(tensor);
865
+ int64_t groups = n_elems / QK4_0;
866
+ size_t quant_bytes = n_elems * sizeof(uint8_t) / 2;
866
867
 
867
- uint8_t* quant_offset = (uint8_t*)dst;
868
- uint16_t* scale_offset = (uint16_t*)((char*)dst + quant_bytes);
868
+ uint8_t * quant_offset = (uint8_t *) dst;
869
+ uint16_t * scale_offset = (uint16_t *) ((char *) dst + quant_bytes);
869
870
 
870
871
  for (int i = 0; i < groups; i++) {
871
- const block_q4_0* group =
872
- (const block_q4_0*)((const char*)src + i * sizeof(block_q4_0));
873
- *scale_offset = group->d;
872
+ const block_q4_0 * group = (const block_q4_0 *) ((const char *) src + i * sizeof(block_q4_0));
873
+ *scale_offset = group->d;
874
874
  scale_offset++;
875
875
 
876
876
  // 0-15
@@ -889,8 +889,7 @@ static void ggml_backend_cann_transform_q4_0(ggml_tensor* tensor,
889
889
  }
890
890
 
891
891
  // put (uint4b_t -8) into int4b_t
892
- for (quant_offset = (uint8_t*)dst;
893
- quant_offset < (uint8_t*)dst + quant_bytes; quant_offset++) {
892
+ for (quant_offset = (uint8_t *) dst; quant_offset < (uint8_t *) dst + quant_bytes; quant_offset++) {
894
893
  (*quant_offset) ^= 0x88;
895
894
  }
896
895
  }
@@ -908,29 +907,27 @@ static void ggml_backend_cann_transform_q4_0(ggml_tensor* tensor,
908
907
  * @param dst Pointer to the destination buffer where the Q4.0 formatted data
909
908
  * will be stored.
910
909
  */
911
- static void ggml_backend_cann_transform_back_q4_0(
912
- const ggml_tensor* tensor, void* src, void* dst) {
910
+ static void ggml_backend_cann_transform_back_q4_0(const ggml_tensor * tensor, void * src, void * dst) {
911
+ int64_t n_elems = ggml_nelements(tensor);
912
+ int64_t groups = n_elems / QK4_0;
913
+ size_t quant_bytes = n_elems * sizeof(uint8_t) / 2;
913
914
 
914
- int64_t n_elems = ggml_nelements(tensor);
915
- int64_t groups = n_elems / QK4_0;
916
- size_t quant_bytes = n_elems * sizeof(uint8_t) / 2;
915
+ uint8_t * quant_offset = (uint8_t *) src;
916
+ uint16_t * scale_offset = (uint16_t *) ((char *) src + quant_bytes);
917
917
 
918
- uint8_t* quant_offset = (uint8_t*)src;
919
- uint16_t* scale_offset = (uint16_t*)((char*)src + quant_bytes);
920
-
921
- for (; quant_offset < (uint8_t*)src + quant_bytes; quant_offset++) {
918
+ for (; quant_offset < (uint8_t *) src + quant_bytes; quant_offset++) {
922
919
  (*quant_offset) ^= 0x88;
923
920
  }
924
- quant_offset = (uint8_t*)src;
921
+ quant_offset = (uint8_t *) src;
925
922
 
926
923
  for (int i = 0; i < groups; i++) {
927
- block_q4_0* group = (block_q4_0*)((char*)dst + i * sizeof(block_q4_0));
928
- group->d = *scale_offset;
924
+ block_q4_0 * group = (block_q4_0 *) ((char *) dst + i * sizeof(block_q4_0));
925
+ group->d = *scale_offset;
929
926
  scale_offset++;
930
927
 
931
928
  // 0-15
932
929
  for (int j = 0; j < QK4_0 / 2; j += 2) {
933
- group->qs[j] = ((*quant_offset) & 0x0F);
930
+ group->qs[j] = ((*quant_offset) & 0x0F);
934
931
  group->qs[j + 1] = ((*quant_offset) >> 4);
935
932
  quant_offset++;
936
933
  }
@@ -957,20 +954,17 @@ static void ggml_backend_cann_transform_back_q4_0(
957
954
  * @param dst Pointer to the destination buffer where transformed data will be
958
955
  * stored.
959
956
  */
960
- static void ggml_backend_cann_transform_q8_0(ggml_tensor* tensor,
961
- const void* src,
962
- void* dst) {
963
- int64_t n_elems = ggml_nelements(tensor);
964
- int64_t groups = n_elems / QK8_0;
965
- size_t quant_bytes = n_elems * sizeof(uint8_t);
957
+ static void ggml_backend_cann_transform_q8_0(ggml_tensor * tensor, const void * src, void * dst) {
958
+ int64_t n_elems = ggml_nelements(tensor);
959
+ int64_t groups = n_elems / QK8_0;
960
+ size_t quant_bytes = n_elems * sizeof(uint8_t);
966
961
 
967
- uint8_t* quant_offset = (uint8_t*)dst;
968
- uint16_t* scale_offset = (uint16_t*)((char*)dst + quant_bytes);
962
+ uint8_t * quant_offset = (uint8_t *) dst;
963
+ uint16_t * scale_offset = (uint16_t *) ((char *) dst + quant_bytes);
969
964
 
970
965
  for (int i = 0; i < groups; i++) {
971
- const block_q8_0* group =
972
- (const block_q8_0*)((const char*)src + i * sizeof(block_q8_0));
973
- *scale_offset = group->d;
966
+ const block_q8_0 * group = (const block_q8_0 *) ((const char *) src + i * sizeof(block_q8_0));
967
+ *scale_offset = group->d;
974
968
  scale_offset++;
975
969
  size_t group_quant_size = QK8_0 * sizeof(uint8_t);
976
970
  memcpy(quant_offset, group->qs, group_quant_size);
@@ -991,19 +985,17 @@ static void ggml_backend_cann_transform_q8_0(ggml_tensor* tensor,
991
985
  * @param dst Pointer to the destination buffer where the Q8.0 formatted data
992
986
  * will be stored.
993
987
  */
994
- static void ggml_backend_cann_transform_back_q8_0(
995
- const ggml_tensor* tensor, const void* src, void* dst) {
996
- int64_t n_elems = ggml_nelements(tensor);
997
- int64_t groups = n_elems / QK8_0;
998
- size_t quant_bytes = n_elems * sizeof(uint8_t);
988
+ static void ggml_backend_cann_transform_back_q8_0(const ggml_tensor * tensor, const void * src, void * dst) {
989
+ int64_t n_elems = ggml_nelements(tensor);
990
+ int64_t groups = n_elems / QK8_0;
991
+ size_t quant_bytes = n_elems * sizeof(uint8_t);
999
992
 
1000
- const uint8_t* quant_offset = (const uint8_t*)src;
1001
- const uint16_t* scale_offset =
1002
- (const uint16_t*)((const char*)src + quant_bytes);
993
+ const uint8_t * quant_offset = (const uint8_t *) src;
994
+ const uint16_t * scale_offset = (const uint16_t *) ((const char *) src + quant_bytes);
1003
995
 
1004
996
  for (int i = 0; i < groups; i++) {
1005
- block_q8_0* group = (block_q8_0*)((char*)dst + i * sizeof(block_q8_0));
1006
- group->d = *scale_offset;
997
+ block_q8_0 * group = (block_q8_0 *) ((char *) dst + i * sizeof(block_q8_0));
998
+ group->d = *scale_offset;
1007
999
  scale_offset++;
1008
1000
  size_t group_quant_size = QK8_0 * sizeof(uint8_t);
1009
1001
  memcpy(group->qs, quant_offset, group_quant_size);
@@ -1023,8 +1015,7 @@ static void ggml_backend_cann_transform_back_q8_0(
1023
1015
  * @param dst Pointer to the destination buffer where transformed data will be
1024
1016
  * stored.
1025
1017
  */
1026
- static void ggml_backend_cann_transform(ggml_tensor* tensor,
1027
- const void* src, void* dst) {
1018
+ static void ggml_backend_cann_transform(ggml_tensor * tensor, const void * src, void * dst) {
1028
1019
  switch (tensor->type) {
1029
1020
  case GGML_TYPE_Q4_0:
1030
1021
  ggml_backend_cann_transform_q4_0(tensor, src, dst);
@@ -1049,8 +1040,7 @@ static void ggml_backend_cann_transform(ggml_tensor* tensor,
1049
1040
  * @param dst Pointer to the destination buffer where transformed tensor data
1050
1041
  * will be stored.
1051
1042
  */
1052
- static void ggml_backend_cann_transform_back(
1053
- const ggml_tensor* tensor, void* src, void* dst) {
1043
+ static void ggml_backend_cann_transform_back(const ggml_tensor * tensor, void * src, void * dst) {
1054
1044
  switch (tensor->type) {
1055
1045
  case GGML_TYPE_Q4_0:
1056
1046
  ggml_backend_cann_transform_back_q4_0(tensor, src, dst);
@@ -1091,8 +1081,7 @@ static bool need_transform(ggml_type type) {
1091
1081
  * @param buffer The CANN buffer from which to initialize the tensor.
1092
1082
  * @param tensor Pointer to the tensor to be initialized.
1093
1083
  */
1094
- static enum ggml_status ggml_backend_cann_buffer_init_tensor(
1095
- ggml_backend_buffer_t buffer, ggml_tensor* tensor) {
1084
+ static enum ggml_status ggml_backend_cann_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
1096
1085
  if (tensor->view_src != NULL && tensor->view_offs == 0) {
1097
1086
  GGML_ASSERT(tensor->view_src->buffer->buft == buffer->buft);
1098
1087
  return GGML_STATUS_SUCCESS;
@@ -1103,18 +1092,105 @@ static enum ggml_status ggml_backend_cann_buffer_init_tensor(
1103
1092
  if (ggml_is_quantized(tensor->type)) {
1104
1093
  // Initialize padding to 0 to avoid possible NaN values
1105
1094
  size_t original_size = ggml_nbytes(tensor);
1106
- size_t padded_size =
1107
- ggml_backend_buft_get_alloc_size(buffer->buft, tensor);
1095
+ size_t padded_size = ggml_backend_buft_get_alloc_size(buffer->buft, tensor);
1108
1096
 
1109
1097
  if (padded_size > original_size && tensor->view_src == nullptr) {
1110
1098
  size_t memset_size = padded_size - original_size;
1111
- ACL_CHECK(aclrtMemset((char*)tensor->data + original_size,
1112
- memset_size, 0, memset_size));
1099
+ ACL_CHECK(aclrtMemset((char *) tensor->data + original_size, memset_size, 0, memset_size));
1113
1100
  }
1114
1101
  }
1115
1102
  return GGML_STATUS_SUCCESS;
1116
1103
  }
1117
1104
 
1105
+ /**
1106
+ * @brief Workspace for caching NZ buffers per device.
1107
+ *
1108
+ * This struct manages a device buffer used in NZ computations. It supports
1109
+ * allocation, reallocation, and clearing of cached memory. The struct is
1110
+ * designed to be used with a global array, one per device.
1111
+ */
1112
+ struct ggml_cann_nz_workspace {
1113
+ void * ptr; // Pointer to allocated device buffer
1114
+ size_t allocated; // Size of currently allocated buffer in bytes
1115
+
1116
+ /**
1117
+ * @brief Constructor. Initializes the workspace with no allocated memory.
1118
+ */
1119
+ ggml_cann_nz_workspace() : ptr(nullptr), allocated(0) {}
1120
+
1121
+ /**
1122
+ * @brief Free cached memory and reset the workspace.
1123
+ *
1124
+ * If a buffer has been allocated, this function releases it using
1125
+ * aclrtFree and resets internal state.
1126
+ */
1127
+ void clear() {
1128
+ if (ptr) {
1129
+ ACL_CHECK(aclrtFree(ptr));
1130
+ ptr = nullptr;
1131
+ allocated = 0;
1132
+ }
1133
+ }
1134
+
1135
+ /**
1136
+ * @brief Allocate or reallocate the workspace buffer.
1137
+ *
1138
+ * If the requested size is larger than the currently allocated size,
1139
+ * the old buffer will be freed and a new buffer of the requested size
1140
+ * will be allocated on the device.
1141
+ *
1142
+ * @param new_size Size in bytes to allocate for the workspace.
1143
+ */
1144
+ void realloc(size_t new_size) {
1145
+ if (new_size > allocated) {
1146
+ clear();
1147
+ ACL_CHECK(aclrtMalloc(&ptr, new_size, ACL_MEM_MALLOC_HUGE_FIRST));
1148
+ allocated = new_size;
1149
+ }
1150
+ }
1151
+
1152
+ /**
1153
+ * @brief Get the device buffer pointer.
1154
+ *
1155
+ * @return Pointer to the allocated buffer, or nullptr if not allocated.
1156
+ */
1157
+ void * get() const { return ptr; }
1158
+ };
1159
+
1160
+ /**
1161
+ * @brief Global array of NZ workspaces, one per device.
1162
+ */
1163
+ static ggml_cann_nz_workspace g_nz_workspaces[GGML_CANN_MAX_DEVICES];
1164
+
1165
+ /**
1166
+ * @brief Convert tensor weights to NZ format using Ascend CANN API.
1167
+ *
1168
+ * This function creates a transposed tensor descriptor and performs the
1169
+ * TransMatmulWeight operation. Converting tensor formats can significantly
1170
+ * improve performance on certain hardware.
1171
+ *
1172
+ * @param tensor Pointer to the input ggml_tensor containing the weights.
1173
+ * @param offset Byte offset within the tensor data buffer where weights start.
1174
+ * @param device device id.
1175
+ *
1176
+ * @note The workspace buffer used in this function is managed globally and reused
1177
+ * across calls. This reduces overhead from repeated memory allocation and deallocation.
1178
+ */
1179
+ static void weight_format_to_nz(ggml_tensor * tensor, size_t offset, int device) {
1180
+ acl_tensor_ptr weightTransposed = ggml_cann_create_tensor(tensor, tensor->ne, tensor->nb, 2, ACL_FORMAT_ND, offset);
1181
+ uint64_t workspaceSize = 0;
1182
+ aclOpExecutor * executor;
1183
+
1184
+ // TransMatmulWeight
1185
+ ACL_CHECK(aclnnTransMatmulWeightGetWorkspaceSize(weightTransposed.get(), &workspaceSize, &executor));
1186
+ // Avoid frequent malloc/free of the workspace.
1187
+ g_nz_workspaces[device].realloc(workspaceSize);
1188
+
1189
+ void * g_nz_workspace = g_nz_workspaces[device].get();
1190
+
1191
+ ACL_CHECK(aclnnTransMatmulWeight(g_nz_workspace, workspaceSize, executor, nullptr));
1192
+ }
1193
+
1118
1194
  // TODO: need handle tensor which has paddings.
1119
1195
  /**
1120
1196
  * @brief Set tensor data in a CANN buffer.
@@ -1128,27 +1204,32 @@ static enum ggml_status ggml_backend_cann_buffer_init_tensor(
1128
1204
  * @param offset Offset in the source data from where to start copying.
1129
1205
  * @param size Size of the data to be copied, in bytes.
1130
1206
  */
1131
- static void ggml_backend_cann_buffer_set_tensor(
1132
- ggml_backend_buffer_t buffer, ggml_tensor *tensor, const void *data,
1133
- size_t offset, size_t size) {
1134
- ggml_backend_cann_buffer_context *ctx =
1135
- (ggml_backend_cann_buffer_context *)buffer->context;
1207
+ static void ggml_backend_cann_buffer_set_tensor(ggml_backend_buffer_t buffer,
1208
+ ggml_tensor * tensor,
1209
+ const void * data,
1210
+ size_t offset,
1211
+ size_t size) {
1212
+ ggml_backend_cann_buffer_context * ctx = (ggml_backend_cann_buffer_context *) buffer->context;
1136
1213
 
1137
1214
  ggml_cann_set_device(ctx->device);
1138
1215
  // TODO: refer to cann(#6017), it use thread's default stream.
1139
1216
  // For acl, synchronous functions use this default stream.
1140
1217
  // Why aclrtSynchronizeDevice?
1141
1218
 
1219
+ // Only check env once.
1220
+ static bool weight_to_nz = parse_bool(get_env_as_lowercase("GGML_CANN_WEIGHT_NZ").value_or("on"));
1142
1221
  if (!need_transform(tensor->type)) {
1143
- ACL_CHECK(aclrtMemcpy((char *)tensor->data + offset, size, data, size,
1144
- ACL_MEMCPY_HOST_TO_DEVICE));
1222
+ ACL_CHECK(aclrtMemcpy((char *) tensor->data + offset, size, data, size, ACL_MEMCPY_HOST_TO_DEVICE));
1223
+ if (weight_to_nz && is_matmul_weight((const ggml_tensor *) tensor)) {
1224
+ GGML_ASSERT(tensor->ne[2] == 1);
1225
+ GGML_ASSERT(tensor->ne[3] == 1);
1226
+ weight_format_to_nz(tensor, offset, ctx->device);
1227
+ }
1145
1228
  } else {
1146
- void *transform_buffer = malloc(size);
1229
+ void * transform_buffer = malloc(size);
1147
1230
  ggml_backend_cann_transform(tensor, data, transform_buffer);
1148
1231
 
1149
- ACL_CHECK(aclrtMemcpy((char *)tensor->data + offset, size,
1150
- transform_buffer, size,
1151
- ACL_MEMCPY_HOST_TO_DEVICE));
1232
+ ACL_CHECK(aclrtMemcpy((char *) tensor->data + offset, size, transform_buffer, size, ACL_MEMCPY_HOST_TO_DEVICE));
1152
1233
  free(transform_buffer);
1153
1234
  }
1154
1235
  }
@@ -1166,22 +1247,20 @@ static void ggml_backend_cann_buffer_set_tensor(
1166
1247
  * @param offset Offset in the destination buffer where to start copying.
1167
1248
  * @param size Size of the data to be copied, in bytes.
1168
1249
  */
1169
- static void ggml_backend_cann_buffer_get_tensor(
1170
- ggml_backend_buffer_t buffer, const ggml_tensor* tensor, void* data,
1171
- size_t offset, size_t size) {
1172
- ggml_backend_cann_buffer_context* ctx =
1173
- (ggml_backend_cann_buffer_context*)buffer->context;
1250
+ static void ggml_backend_cann_buffer_get_tensor(ggml_backend_buffer_t buffer,
1251
+ const ggml_tensor * tensor,
1252
+ void * data,
1253
+ size_t offset,
1254
+ size_t size) {
1255
+ ggml_backend_cann_buffer_context * ctx = (ggml_backend_cann_buffer_context *) buffer->context;
1174
1256
 
1175
1257
  ggml_cann_set_device(ctx->device);
1176
1258
 
1177
1259
  if (!need_transform(tensor->type)) {
1178
- ACL_CHECK(aclrtMemcpy(data, size, (char*)tensor->data + offset, size,
1179
- ACL_MEMCPY_DEVICE_TO_HOST));
1260
+ ACL_CHECK(aclrtMemcpy(data, size, (char *) tensor->data + offset, size, ACL_MEMCPY_DEVICE_TO_HOST));
1180
1261
  } else {
1181
- void* transform_buffer = malloc(size);
1182
- ACL_CHECK(aclrtMemcpy(transform_buffer, size,
1183
- (char*)tensor->data + offset, size,
1184
- ACL_MEMCPY_DEVICE_TO_HOST));
1262
+ void * transform_buffer = malloc(size);
1263
+ ACL_CHECK(aclrtMemcpy(transform_buffer, size, (char *) tensor->data + offset, size, ACL_MEMCPY_DEVICE_TO_HOST));
1185
1264
  ggml_backend_cann_transform_back(tensor, transform_buffer, data);
1186
1265
  free(transform_buffer);
1187
1266
  }
@@ -1200,31 +1279,31 @@ static void ggml_backend_cann_buffer_get_tensor(
1200
1279
  * @param dst Pointer to the destination tensor where the data will be copied.
1201
1280
  * @return true if the copy operation succeeded, false otherwise.
1202
1281
  */
1203
- static bool ggml_backend_cann_buffer_cpy_tensor(
1204
- ggml_backend_buffer_t buffer, const ggml_tensor* src, ggml_tensor* dst) {
1282
+ static bool ggml_backend_cann_buffer_cpy_tensor(ggml_backend_buffer_t buffer,
1283
+ const ggml_tensor * src,
1284
+ ggml_tensor * dst) {
1205
1285
  if (ggml_backend_buffer_is_cann(src->buffer)) {
1206
- ggml_backend_cann_buffer_context* src_ctx =
1207
- (ggml_backend_cann_buffer_context*)src->buffer->context;
1208
- ggml_backend_cann_buffer_context* dst_ctx =
1209
- (ggml_backend_cann_buffer_context*)buffer->context;
1286
+ ggml_backend_cann_buffer_context * src_ctx = (ggml_backend_cann_buffer_context *) src->buffer->context;
1287
+ ggml_backend_cann_buffer_context * dst_ctx = (ggml_backend_cann_buffer_context *) buffer->context;
1210
1288
 
1211
1289
  size_t memcpy_size = ggml_nbytes(src);
1212
1290
  // Same device.
1213
1291
  if (src_ctx->device == dst_ctx->device) {
1214
- ACL_CHECK(aclrtMemcpy((char*)dst->data, memcpy_size,
1215
- (const char*)src->data, memcpy_size,
1292
+ ACL_CHECK(aclrtMemcpy((char *) dst->data, memcpy_size, (const char *) src->data, memcpy_size,
1216
1293
  ACL_MEMCPY_DEVICE_TO_DEVICE));
1217
1294
  return true;
1218
1295
  } else {
1296
+ #ifdef ASCEND_310P
1297
+ // TODO: Support 310p P2P copy
1298
+ return false;
1299
+ #endif
1219
1300
  // Different device but can access by peer.
1220
1301
  int32_t canAccessPeer = 0;
1221
- ACL_CHECK(aclrtDeviceCanAccessPeer(&canAccessPeer, src_ctx->device,
1222
- dst_ctx->device));
1302
+ ACL_CHECK(aclrtDeviceCanAccessPeer(&canAccessPeer, src_ctx->device, dst_ctx->device));
1223
1303
  if (canAccessPeer) {
1224
1304
  ggml_cann_set_device(src_ctx->device);
1225
1305
  ACL_CHECK(aclrtDeviceEnablePeerAccess(dst_ctx->device, 0));
1226
- ACL_CHECK(aclrtMemcpy((char*)dst->data, memcpy_size,
1227
- (const char*)src->data, memcpy_size,
1306
+ ACL_CHECK(aclrtMemcpy((char *) dst->data, memcpy_size, (const char *) src->data, memcpy_size,
1228
1307
  ACL_MEMCPY_DEVICE_TO_DEVICE));
1229
1308
  return true;
1230
1309
  }
@@ -1242,10 +1321,8 @@ static bool ggml_backend_cann_buffer_cpy_tensor(
1242
1321
  * @param buffer The CANN buffer to be cleared.
1243
1322
  * @param value The value to which each byte in the buffer will be set.
1244
1323
  */
1245
- static void ggml_backend_cann_buffer_clear(
1246
- ggml_backend_buffer_t buffer, uint8_t value) {
1247
- ggml_backend_cann_buffer_context* ctx =
1248
- (ggml_backend_cann_buffer_context*)buffer->context;
1324
+ static void ggml_backend_cann_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
1325
+ ggml_backend_cann_buffer_context * ctx = (ggml_backend_cann_buffer_context *) buffer->context;
1249
1326
 
1250
1327
  ggml_cann_set_device(ctx->device);
1251
1328
  ACL_CHECK(aclrtMemset(ctx->dev_ptr, buffer->size, value, buffer->size));
@@ -1275,9 +1352,8 @@ static const ggml_backend_buffer_i ggml_backend_cann_buffer_interface = {
1275
1352
  * buffer type.
1276
1353
  */
1277
1354
  struct ggml_backend_cann_buffer_type_context {
1278
- int32_t
1279
- device; /**< Device identifier associated with the buffer context. */
1280
- std::string name; /**< Name associated with the buffer context. */
1355
+ int32_t device; /**< Device identifier associated with the buffer context. */
1356
+ std::string name; /**< Name associated with the buffer context. */
1281
1357
  };
1282
1358
 
1283
1359
  /**
@@ -1289,10 +1365,8 @@ struct ggml_backend_cann_buffer_type_context {
1289
1365
  * @param buft Pointer to the buffer type context.
1290
1366
  * @return Const pointer to the C-style string containing the name.
1291
1367
  */
1292
- static const char* ggml_backend_cann_buffer_type_name(
1293
- ggml_backend_buffer_type_t buft) {
1294
- ggml_backend_cann_buffer_type_context* buft_ctx =
1295
- (ggml_backend_cann_buffer_type_context*)buft->context;
1368
+ static const char * ggml_backend_cann_buffer_type_name(ggml_backend_buffer_type_t buft) {
1369
+ ggml_backend_cann_buffer_type_context * buft_ctx = (ggml_backend_cann_buffer_type_context *) buft->context;
1296
1370
 
1297
1371
  return buft_ctx->name.c_str();
1298
1372
  }
@@ -1307,34 +1381,27 @@ static const char* ggml_backend_cann_buffer_type_name(
1307
1381
  * @param size Size in bytes of the buffer to allocate.
1308
1382
  * @return Pointer to the allocated buffer, or nullptr if allocation fails.
1309
1383
  */
1310
- static ggml_backend_buffer_t
1311
- ggml_backend_cann_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
1312
- size_t size) {
1313
- ggml_backend_cann_buffer_type_context* buft_ctx =
1314
- (ggml_backend_cann_buffer_type_context*)buft->context;
1384
+ static ggml_backend_buffer_t ggml_backend_cann_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
1385
+ ggml_backend_cann_buffer_type_context * buft_ctx = (ggml_backend_cann_buffer_type_context *) buft->context;
1315
1386
 
1316
1387
  ggml_cann_set_device(buft_ctx->device);
1317
1388
 
1318
1389
  const size_t alignment = 128;
1319
- size = GGML_PAD(size, alignment);
1390
+ size = GGML_PAD(size, alignment);
1320
1391
  if (size == 0) {
1321
1392
  size = alignment;
1322
1393
  }
1323
- void* dev_ptr;
1394
+ void * dev_ptr;
1324
1395
  aclError err = aclrtMalloc(&dev_ptr, size, ACL_MEM_MALLOC_HUGE_FIRST);
1325
1396
  if (err != ACL_SUCCESS) {
1326
- GGML_LOG_ERROR(
1327
- "%s: allocating %.2f MiB on device %d: aclrtMalloc failed: %s\n",
1328
- __func__, size / 1024.0 / 1024.0, buft_ctx->device,
1329
- aclGetRecentErrMsg());
1397
+ GGML_LOG_ERROR("%s: allocating %.2f MiB on device %d: aclrtMalloc failed: %s\n", __func__,
1398
+ size / 1024.0 / 1024.0, buft_ctx->device, aclGetRecentErrMsg());
1330
1399
  return nullptr;
1331
1400
  }
1332
1401
 
1333
- ggml_backend_cann_buffer_context* ctx =
1334
- new ggml_backend_cann_buffer_context(buft_ctx->device, dev_ptr);
1402
+ ggml_backend_cann_buffer_context * ctx = new ggml_backend_cann_buffer_context(buft_ctx->device, dev_ptr);
1335
1403
 
1336
- return ggml_backend_buffer_init(buft, ggml_backend_cann_buffer_interface,
1337
- ctx, size);
1404
+ return ggml_backend_buffer_init(buft, ggml_backend_cann_buffer_interface, ctx, size);
1338
1405
  }
1339
1406
 
1340
1407
  /**
@@ -1349,8 +1416,7 @@ ggml_backend_cann_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
1349
1416
  * @return The alignment requirement in bytes (fixed at 128 bytes for CANN
1350
1417
  * buffers).
1351
1418
  */
1352
- static size_t ggml_backend_cann_buffer_type_get_alignment(
1353
- ggml_backend_buffer_type_t buft) {
1419
+ static size_t ggml_backend_cann_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
1354
1420
  return 128;
1355
1421
 
1356
1422
  GGML_UNUSED(buft);
@@ -1370,10 +1436,13 @@ static size_t ggml_backend_cann_buffer_type_get_alignment(
1370
1436
  * @return The total allocation size in bytes required for the tensor in the
1371
1437
  * CANN buffer.
1372
1438
  */
1373
- static size_t ggml_backend_cann_buffer_type_get_alloc_size(
1374
- ggml_backend_buffer_type_t buft, const ggml_tensor* tensor) {
1375
- size_t size = ggml_nbytes(tensor);
1376
- int64_t ne0 = tensor->ne[0];
1439
+ static size_t ggml_backend_cann_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft,
1440
+ const ggml_tensor * tensor) {
1441
+ size_t size = ggml_nbytes(tensor);
1442
+ int64_t ne0 = tensor->ne[0];
1443
+
1444
+ // Only check env once.
1445
+ static bool weight_to_nz = parse_bool(get_env_as_lowercase("GGML_CANN_WEIGHT_NZ").value_or("on"));
1377
1446
 
1378
1447
  // last line must bigger than 32, because every single op deal at
1379
1448
  // least 32 bytes.
@@ -1381,14 +1450,21 @@ static size_t ggml_backend_cann_buffer_type_get_alloc_size(
1381
1450
  // int64_t line_size = ne0 * ggml_element_size(tensor);
1382
1451
  // int64_t line_size_align_32 = (line_size + 31) & ~31;
1383
1452
  // size += (line_size_align_32 - line_size);
1384
-
1385
- // TODO: not support quantized yet.
1386
- // TODO: consider un-continue tensor.
1387
1453
  if (ggml_is_quantized(tensor->type)) {
1388
1454
  if (ne0 % MATRIX_ROW_PADDING != 0) {
1389
- size += ggml_row_size(
1390
- tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
1455
+ size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
1391
1456
  }
1457
+ } else if (weight_to_nz && is_matmul_weight((const ggml_tensor *) tensor)) {
1458
+ // NZ format weight are not support quantized yet.
1459
+ // If ND tensor transform to NZ, size may changed.
1460
+ int64_t shape[] = { tensor->ne[1], tensor->ne[0] };
1461
+ GGML_ASSERT(tensor->ne[2] == 1);
1462
+ GGML_ASSERT(tensor->ne[3] == 1);
1463
+ const aclIntArray * acl_shape = aclCreateIntArray(shape, 2);
1464
+ size_t new_size;
1465
+ ACL_CHECK(aclnnCalculateMatmulWeightSizeV2(acl_shape, ggml_cann_type_mapping(tensor->type), &new_size));
1466
+ ACL_CHECK(aclDestroyIntArray(acl_shape));
1467
+ size = std::max(size, new_size);
1392
1468
  }
1393
1469
 
1394
1470
  return size;
@@ -1427,17 +1503,15 @@ static const ggml_backend_buffer_type_i ggml_backend_cann_buffer_type_interface
1427
1503
  * @return A pointer to the buffer type interface for the specified device, or
1428
1504
  * nullptr if the device index is out of range.
1429
1505
  */
1430
- ggml_backend_buffer_type_t
1431
- ggml_backend_cann_buffer_type(int32_t device) {
1432
- static std::mutex mutex;
1506
+ ggml_backend_buffer_type_t ggml_backend_cann_buffer_type(int32_t device) {
1507
+ static std::mutex mutex;
1433
1508
  std::lock_guard<std::mutex> lock(mutex);
1434
1509
 
1435
1510
  if (device >= ggml_backend_cann_get_device_count()) {
1436
1511
  return nullptr;
1437
1512
  }
1438
1513
 
1439
- static ggml_backend_buffer_type
1440
- ggml_backend_cann_buffer_types[GGML_CANN_MAX_DEVICES];
1514
+ static ggml_backend_buffer_type ggml_backend_cann_buffer_types[GGML_CANN_MAX_DEVICES];
1441
1515
 
1442
1516
  static bool ggml_backend_cann_buffer_type_initialized = false;
1443
1517
 
@@ -1447,8 +1521,7 @@ ggml_backend_cann_buffer_type(int32_t device) {
1447
1521
  /* .iface = */ ggml_backend_cann_buffer_type_interface,
1448
1522
  /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cann_reg(), i),
1449
1523
  /* .context = */
1450
- new ggml_backend_cann_buffer_type_context{
1451
- i, "CANN" + std::to_string(i)},
1524
+ new ggml_backend_cann_buffer_type_context{ i, "CANN" + std::to_string(i) },
1452
1525
  };
1453
1526
  }
1454
1527
  ggml_backend_cann_buffer_type_initialized = true;
@@ -1512,16 +1585,16 @@ static void * ggml_cann_host_malloc(size_t size) {
1512
1585
  }
1513
1586
 
1514
1587
  const size_t alignment = 128;
1515
- size = GGML_PAD(size, alignment);
1588
+ size = GGML_PAD(size, alignment);
1516
1589
  if (size == 0) {
1517
1590
  size = alignment;
1518
1591
  }
1519
1592
 
1520
- void * hostPtr = nullptr;
1521
- aclError err = aclrtMallocHost((void **) &hostPtr, size);
1593
+ void * hostPtr = nullptr;
1594
+ aclError err = aclrtMallocHost((void **) &hostPtr, size);
1522
1595
  if (err != ACL_SUCCESS) {
1523
- GGML_LOG_WARN("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__,
1524
- size / 1024.0 / 1024.0, aclGetRecentErrMsg());
1596
+ GGML_LOG_WARN("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__, size / 1024.0 / 1024.0,
1597
+ aclGetRecentErrMsg());
1525
1598
  return nullptr;
1526
1599
  }
1527
1600
  return hostPtr;
@@ -1534,7 +1607,8 @@ static void * ggml_cann_host_malloc(size_t size) {
1534
1607
  * @param size Size in bytes of the host buffer to allocate.
1535
1608
  * @return Pointer to the allocated host buffer, or CPU buffer pointer if allocation fails.
1536
1609
  */
1537
- static ggml_backend_buffer_t ggml_backend_cann_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
1610
+ static ggml_backend_buffer_t ggml_backend_cann_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
1611
+ size_t size) {
1538
1612
  void * hostPtr = ggml_cann_host_malloc(size);
1539
1613
 
1540
1614
  if (hostPtr == nullptr) {
@@ -1543,8 +1617,8 @@ static ggml_backend_buffer_t ggml_backend_cann_host_buffer_type_alloc_buffer(ggm
1543
1617
  }
1544
1618
 
1545
1619
  ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(hostPtr, size);
1546
- buffer->buft = buft;
1547
- buffer->iface.free_buffer = ggml_backend_cann_host_buffer_free;
1620
+ buffer->buft = buft;
1621
+ buffer->iface.free_buffer = ggml_backend_cann_host_buffer_free;
1548
1622
 
1549
1623
  return buffer;
1550
1624
  }
@@ -1558,14 +1632,15 @@ static ggml_backend_buffer_t ggml_backend_cann_host_buffer_type_alloc_buffer(ggm
1558
1632
  ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type() {
1559
1633
  static struct ggml_backend_buffer_type ggml_backend_cann_buffer_type_host = {
1560
1634
  /* .iface = */ {
1561
- /* .get_name = */ ggml_backend_cann_host_buffer_type_name,
1562
- /* .alloc_buffer = */ ggml_backend_cann_host_buffer_type_alloc_buffer,
1563
- /* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
1564
- /* .get_max_size = */ NULL, // defaults to SIZE_MAX
1635
+ /* .get_name = */ ggml_backend_cann_host_buffer_type_name,
1636
+ /* .alloc_buffer = */ ggml_backend_cann_host_buffer_type_alloc_buffer,
1637
+ /* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
1638
+ /* .get_max_size = */ NULL, // defaults to SIZE_MAX
1565
1639
  /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
1566
- /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
1567
- },
1568
- /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cann_reg(), 0),
1640
+ /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
1641
+ },
1642
+ /* .device = */
1643
+ ggml_backend_reg_dev_get(ggml_backend_cann_reg(), 0),
1569
1644
  /* .context = */ nullptr,
1570
1645
  };
1571
1646
 
@@ -1585,8 +1660,7 @@ ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type() {
1585
1660
  * stored.
1586
1661
  * @return true if the computation was successful; false otherwise.
1587
1662
  */
1588
- static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
1589
- struct ggml_tensor* dst) {
1663
+ static bool ggml_cann_compute_forward(ggml_backend_cann_context & ctx, struct ggml_tensor * dst) {
1590
1664
  switch (dst->op) {
1591
1665
  case GGML_OP_REPEAT:
1592
1666
  ggml_cann_repeat(ctx, dst);
@@ -1594,6 +1668,9 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
1594
1668
  case GGML_OP_GET_ROWS:
1595
1669
  ggml_cann_get_rows(ctx, dst);
1596
1670
  break;
1671
+ case GGML_OP_SET_ROWS:
1672
+ ggml_cann_set_rows(ctx, dst);
1673
+ break;
1597
1674
  case GGML_OP_DUP:
1598
1675
  ggml_cann_dup(ctx, dst);
1599
1676
  break;
@@ -1616,48 +1693,50 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
1616
1693
  case GGML_OP_UNARY:
1617
1694
  switch (ggml_get_unary_op(dst)) {
1618
1695
  case GGML_UNARY_OP_ABS:
1619
- GGML_CANN_CALL_UNARY_OP(Abs);
1696
+ GGML_CANN_CALL_OP_UNARY(Abs);
1620
1697
  break;
1621
1698
  case GGML_UNARY_OP_NEG:
1622
- GGML_CANN_CALL_UNARY_OP(Neg);
1699
+ GGML_CANN_CALL_OP_UNARY(Neg);
1623
1700
  break;
1624
1701
  case GGML_UNARY_OP_GELU:
1625
- GGML_CANN_CALL_UNARY_OP(Gelu);
1702
+ case GGML_UNARY_OP_GELU_ERF:
1703
+ // aclnnGelu internally uses the erf-based approximation.
1704
+ GGML_CANN_CALL_OP_UNARY(Gelu);
1626
1705
  break;
1627
1706
  case GGML_UNARY_OP_SILU:
1628
- GGML_CANN_CALL_UNARY_OP(Silu);
1707
+ GGML_CANN_CALL_OP_UNARY(Silu);
1708
+ break;
1709
+ case GGML_UNARY_OP_GELU_QUICK:
1710
+ {
1711
+ auto lambda = [](ggml_backend_cann_context & ctx, aclTensor * acl_src, aclTensor * acl_dst) {
1712
+ GGML_CANN_CALL_ACLNN_OP(ctx, GeluV2, acl_src, 0, acl_dst);
1713
+ };
1714
+ ggml_cann_op_unary(lambda, ctx, dst);
1715
+ }
1629
1716
  break;
1630
- case GGML_UNARY_OP_GELU_QUICK: {
1631
- auto lambda = [](ggml_backend_cann_context& ctx,
1632
- aclTensor* acl_src,
1633
- aclTensor* acl_dst) {
1634
- GGML_CANN_CALL_ACLNN_OP(ctx, GeluV2, acl_src, 0, acl_dst);
1635
- };
1636
- ggml_cann_unary_op(lambda, ctx, dst);
1637
- } break;
1638
1717
  case GGML_UNARY_OP_TANH:
1639
- GGML_CANN_CALL_UNARY_OP(Tanh);
1718
+ GGML_CANN_CALL_OP_UNARY(Tanh);
1640
1719
  break;
1641
1720
  case GGML_UNARY_OP_RELU:
1642
- GGML_CANN_CALL_UNARY_OP(Relu);
1721
+ GGML_CANN_CALL_OP_UNARY(Relu);
1643
1722
  break;
1644
1723
  case GGML_UNARY_OP_SIGMOID:
1645
- GGML_CANN_CALL_UNARY_OP(Sigmoid);
1724
+ GGML_CANN_CALL_OP_UNARY(Sigmoid);
1646
1725
  break;
1647
1726
  case GGML_UNARY_OP_HARDSIGMOID:
1648
- GGML_CANN_CALL_UNARY_OP(Hardsigmoid);
1727
+ GGML_CANN_CALL_OP_UNARY(Hardsigmoid);
1649
1728
  break;
1650
1729
  case GGML_UNARY_OP_HARDSWISH:
1651
- GGML_CANN_CALL_UNARY_OP(Hardswish);
1730
+ GGML_CANN_CALL_OP_UNARY(Hardswish);
1652
1731
  break;
1653
1732
  case GGML_UNARY_OP_EXP:
1654
- GGML_CANN_CALL_UNARY_OP(Exp);
1733
+ GGML_CANN_CALL_OP_UNARY(Exp);
1655
1734
  break;
1656
1735
  case GGML_UNARY_OP_ELU:
1657
1736
  ggml_cann_elu(ctx, dst);
1658
1737
  break;
1659
1738
  case GGML_UNARY_OP_SGN:
1660
- GGML_CANN_CALL_UNARY_OP(Sign);
1739
+ GGML_CANN_CALL_OP_UNARY(Sign);
1661
1740
  break;
1662
1741
  case GGML_UNARY_OP_STEP:
1663
1742
  ggml_cann_step(ctx, dst);
@@ -1666,12 +1745,43 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
1666
1745
  return false;
1667
1746
  }
1668
1747
  break;
1748
+ case GGML_OP_GLU:
1749
+ switch (ggml_get_glu_op(dst)) {
1750
+ case GGML_GLU_OP_REGLU:
1751
+ GGML_CANN_CALL_OP_UNARY_GATED(Relu);
1752
+ break;
1753
+ case GGML_GLU_OP_GEGLU:
1754
+ case GGML_GLU_OP_GEGLU_ERF:
1755
+ // aclnnGelu internally uses the erf-based approximation.
1756
+ GGML_CANN_CALL_OP_UNARY_GATED(Gelu);
1757
+ break;
1758
+ case GGML_GLU_OP_SWIGLU:
1759
+ GGML_CANN_CALL_OP_UNARY_GATED(Silu);
1760
+ break;
1761
+ case GGML_GLU_OP_GEGLU_QUICK:
1762
+ {
1763
+ auto lambda = [](ggml_backend_cann_context & ctx, aclTensor * acl_src, aclTensor * acl_dst) {
1764
+ GGML_CANN_CALL_ACLNN_OP(ctx, GeluV2, acl_src, 0, acl_dst);
1765
+ };
1766
+ ggml_cann_op_unary_gated(lambda, ctx, dst);
1767
+ }
1768
+ break;
1769
+ default:
1770
+ return false;
1771
+ }
1772
+ break;
1669
1773
  case GGML_OP_NORM:
1670
1774
  ggml_cann_norm(ctx, dst);
1671
1775
  break;
1672
1776
  case GGML_OP_GROUP_NORM:
1673
1777
  ggml_cann_group_norm(ctx, dst);
1674
1778
  break;
1779
+ case GGML_OP_L2_NORM:
1780
+ ggml_cann_l2_norm(ctx, dst);
1781
+ break;
1782
+ case GGML_OP_CROSS_ENTROPY_LOSS:
1783
+ ggml_cann_cross_entropy_loss(ctx, dst);
1784
+ break;
1675
1785
  case GGML_OP_CONCAT:
1676
1786
  ggml_cann_concat(ctx, dst);
1677
1787
  break;
@@ -1708,7 +1818,7 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
1708
1818
  ggml_cann_binary_op<aclnn_mul>(ctx, dst);
1709
1819
  break;
1710
1820
  case GGML_OP_SQRT:
1711
- GGML_CANN_CALL_UNARY_OP(Sqrt);
1821
+ GGML_CANN_CALL_OP_UNARY(Sqrt);
1712
1822
  break;
1713
1823
  case GGML_OP_CLAMP:
1714
1824
  ggml_cann_clamp(ctx, dst);
@@ -1753,16 +1863,16 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
1753
1863
  ggml_cann_argmax(ctx, dst);
1754
1864
  break;
1755
1865
  case GGML_OP_COS:
1756
- ggml_cann_unary_op<aclnn_cos>(ctx, dst);
1866
+ ggml_cann_op_unary<aclnn_cos>(ctx, dst);
1757
1867
  break;
1758
1868
  case GGML_OP_SIN:
1759
- ggml_cann_unary_op<aclnn_sin>(ctx, dst);
1869
+ ggml_cann_op_unary<aclnn_sin>(ctx, dst);
1760
1870
  break;
1761
1871
  case GGML_OP_CONV_TRANSPOSE_1D:
1762
1872
  ggml_cann_conv_transpose_1d(ctx, dst);
1763
1873
  break;
1764
1874
  case GGML_OP_LOG:
1765
- GGML_CANN_CALL_UNARY_OP(Log);
1875
+ GGML_CANN_CALL_OP_UNARY(Log);
1766
1876
  break;
1767
1877
  case GGML_OP_MEAN:
1768
1878
  ggml_cann_mean(ctx, dst);
@@ -1776,6 +1886,12 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
1776
1886
  case GGML_OP_FLASH_ATTN_EXT:
1777
1887
  ggml_cann_flash_attn_ext(ctx, dst);
1778
1888
  break;
1889
+ case GGML_OP_OUT_PROD:
1890
+ ggml_cann_out_prod(ctx, dst);
1891
+ break;
1892
+ case GGML_OP_SSM_CONV:
1893
+ ggml_cann_ssm_conv(ctx, dst);
1894
+ break;
1779
1895
  default:
1780
1896
  return false;
1781
1897
  }
@@ -1793,9 +1909,8 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
1793
1909
  * @param backend Pointer to the CANN backend structure.
1794
1910
  * @return A pointer to a constant string representing the backend name.
1795
1911
  */
1796
- static const char* ggml_backend_cann_name(ggml_backend_t backend) {
1797
- ggml_backend_cann_context* cann_ctx =
1798
- (ggml_backend_cann_context*)backend->context;
1912
+ static const char * ggml_backend_cann_name(ggml_backend_t backend) {
1913
+ ggml_backend_cann_context * cann_ctx = (ggml_backend_cann_context *) backend->context;
1799
1914
 
1800
1915
  return cann_ctx->name.c_str();
1801
1916
  }
@@ -1809,8 +1924,7 @@ static const char* ggml_backend_cann_name(ggml_backend_t backend) {
1809
1924
  * @param backend Pointer to the CANN backend structure to be freed.
1810
1925
  */
1811
1926
  static void ggml_backend_cann_free(ggml_backend_t backend) {
1812
- ggml_backend_cann_context* cann_ctx =
1813
- (ggml_backend_cann_context*)backend->context;
1927
+ ggml_backend_cann_context * cann_ctx = (ggml_backend_cann_context *) backend->context;
1814
1928
  ACL_CHECK(aclrtSynchronizeDevice());
1815
1929
  ACL_CHECK(aclrtResetDevice(cann_ctx->device));
1816
1930
 
@@ -1818,7 +1932,6 @@ static void ggml_backend_cann_free(ggml_backend_t backend) {
1818
1932
  delete backend;
1819
1933
  }
1820
1934
 
1821
-
1822
1935
  /**
1823
1936
  * @brief Sets tensor data asynchronously in the CANN backend.
1824
1937
  *
@@ -1831,21 +1944,18 @@ static void ggml_backend_cann_free(ggml_backend_t backend) {
1831
1944
  * @param size Size of the data to copy in bytes.
1832
1945
  */
1833
1946
  static void ggml_backend_cann_set_tensor_async(ggml_backend_t backend,
1834
- ggml_tensor *tensor,
1835
- const void *data,
1836
- size_t offset,
1837
- size_t size) {
1838
- ggml_backend_cann_context *cann_ctx =
1839
- (ggml_backend_cann_context *)backend->context;
1840
- ggml_backend_buffer_t buf =
1841
- tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
1842
-
1843
- GGML_ASSERT(buf->buft == ggml_backend_cann_buffer_type(cann_ctx->device) &&
1844
- "unsupported buffer type");
1947
+ ggml_tensor * tensor,
1948
+ const void * data,
1949
+ size_t offset,
1950
+ size_t size) {
1951
+ ggml_backend_cann_context * cann_ctx = (ggml_backend_cann_context *) backend->context;
1952
+ ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
1953
+
1954
+ GGML_ASSERT(buf->buft == ggml_backend_cann_buffer_type(cann_ctx->device) && "unsupported buffer type");
1845
1955
  GGML_ASSERT(!ggml_is_quantized(tensor->type));
1846
1956
 
1847
- ggml_cann_async_memcpy(cann_ctx, (char *)tensor->data + offset, data, size,
1848
- ACL_MEMCPY_HOST_TO_DEVICE);
1957
+ ACL_CHECK(aclrtMemcpyAsync((char *) tensor->data + offset, size, data, size, ACL_MEMCPY_HOST_TO_DEVICE,
1958
+ cann_ctx->stream()));
1849
1959
  }
1850
1960
 
1851
1961
  /**
@@ -1859,21 +1969,19 @@ static void ggml_backend_cann_set_tensor_async(ggml_backend_t backend,
1859
1969
  * @param offset Offset in bytes within the host data.
1860
1970
  * @param size Size of the data to copy in bytes.
1861
1971
  */
1862
- static void ggml_backend_cann_get_tensor_async(
1863
- ggml_backend_t backend, const ggml_tensor *tensor, void *data,
1864
- size_t offset, size_t size) {
1865
- ggml_backend_cann_context *cann_ctx =
1866
- (ggml_backend_cann_context *)backend->context;
1867
- ggml_backend_buffer_t buf =
1868
- tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
1869
-
1870
- GGML_ASSERT(buf->buft == ggml_backend_cann_buffer_type(cann_ctx->device) &&
1871
- "unsupported buffer type");
1972
+ static void ggml_backend_cann_get_tensor_async(ggml_backend_t backend,
1973
+ const ggml_tensor * tensor,
1974
+ void * data,
1975
+ size_t offset,
1976
+ size_t size) {
1977
+ ggml_backend_cann_context * cann_ctx = (ggml_backend_cann_context *) backend->context;
1978
+ ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
1979
+
1980
+ GGML_ASSERT(buf->buft == ggml_backend_cann_buffer_type(cann_ctx->device) && "unsupported buffer type");
1872
1981
  GGML_ASSERT(!ggml_is_quantized(tensor->type));
1873
1982
 
1874
- ggml_cann_async_memcpy(cann_ctx, data, (char *)tensor->data + offset, size,
1875
- ACL_MEMCPY_DEVICE_TO_HOST);
1876
-
1983
+ ACL_CHECK(aclrtMemcpyAsync(data, size, (char *) tensor->data + offset, size, ACL_MEMCPY_DEVICE_TO_HOST,
1984
+ cann_ctx->stream()));
1877
1985
  }
1878
1986
 
1879
1987
  /**
@@ -1889,62 +1997,67 @@ static void ggml_backend_cann_get_tensor_async(
1889
1997
  * @param dst Pointer to the destination tensor to copy data to.
1890
1998
  * @return true if the copy operation succeeds, false otherwise.
1891
1999
  */
1892
- static bool ggml_backend_cann_cpy_tensor_async(
1893
- ggml_backend_t backend_src, ggml_backend_t backend_dst,
1894
- const ggml_tensor* src, ggml_tensor* dst) {
1895
- GGML_ASSERT(ggml_backend_is_cann(backend_src) ||
1896
- ggml_backend_is_cann(backend_dst));
2000
+ static bool ggml_backend_cann_cpy_tensor_async(ggml_backend_t backend_src,
2001
+ ggml_backend_t backend_dst,
2002
+ const ggml_tensor * src,
2003
+ ggml_tensor * dst) {
2004
+ GGML_ASSERT(ggml_backend_is_cann(backend_src) || ggml_backend_is_cann(backend_dst));
2005
+
2006
+ GGML_ASSERT(!is_matmul_weight((const ggml_tensor *) src));
1897
2007
 
1898
- if (!ggml_backend_buffer_is_cann(src->buffer) ||
1899
- !ggml_backend_buffer_is_cann(dst->buffer)) {
2008
+ if (!ggml_backend_buffer_is_cann(src->buffer) || !ggml_backend_buffer_is_cann(dst->buffer)) {
1900
2009
  return false;
1901
2010
  }
1902
2011
 
1903
- ggml_backend_buffer_t buf_src =
1904
- src->view_src ? src->view_src->buffer : src->buffer;
1905
- ggml_backend_buffer_t buf_dst =
1906
- dst->view_src ? dst->view_src->buffer : dst->buffer;
2012
+ ggml_backend_buffer_t buf_src = src->view_src ? src->view_src->buffer : src->buffer;
2013
+ ggml_backend_buffer_t buf_dst = dst->view_src ? dst->view_src->buffer : dst->buffer;
1907
2014
 
1908
- ggml_backend_cann_context* cann_ctx_src =
1909
- (ggml_backend_cann_context*)backend_src->context;
1910
- ggml_backend_cann_context* cann_ctx_dst =
1911
- (ggml_backend_cann_context*)backend_dst->context;
2015
+ ggml_backend_cann_context * cann_ctx_src = (ggml_backend_cann_context *) backend_src->context;
2016
+ ggml_backend_cann_context * cann_ctx_dst = (ggml_backend_cann_context *) backend_dst->context;
1912
2017
 
1913
2018
  size_t copy_size = ggml_nbytes(dst);
2019
+ if (copy_size == 0) {
2020
+ return true;
2021
+ }
1914
2022
  if (backend_src != backend_dst) {
1915
- ggml_backend_cann_buffer_context* buf_ctx_src =
1916
- (ggml_backend_cann_buffer_context*)buf_src->context;
1917
- ggml_backend_cann_buffer_context* buf_ctx_dst =
1918
- (ggml_backend_cann_buffer_context*)buf_dst->context;
2023
+ #ifdef ASCEND_310P
2024
+ // TODO: Support 310p P2P copy
2025
+ return false;
2026
+ #endif
2027
+ ggml_backend_cann_buffer_context * buf_ctx_src = (ggml_backend_cann_buffer_context *) buf_src->context;
2028
+ ggml_backend_cann_buffer_context * buf_ctx_dst = (ggml_backend_cann_buffer_context *) buf_dst->context;
1919
2029
 
1920
2030
  GGML_ASSERT(cann_ctx_src->device == buf_ctx_src->device);
1921
2031
  GGML_ASSERT(cann_ctx_dst->device == buf_ctx_dst->device);
1922
2032
 
1923
2033
  int32_t canAccessPeer = 0;
1924
- ACL_CHECK(aclrtDeviceCanAccessPeer(&canAccessPeer, cann_ctx_src->device,
1925
- cann_ctx_dst->device));
2034
+ ACL_CHECK(aclrtDeviceCanAccessPeer(&canAccessPeer, cann_ctx_src->device, cann_ctx_dst->device));
1926
2035
  if (!canAccessPeer) {
1927
2036
  return false;
1928
2037
  }
1929
2038
 
1930
2039
  // need open both directions for memcpyasync between devices.
1931
- ggml_cann_set_device(cann_ctx_dst->device);
1932
2040
  ACL_CHECK(aclrtDeviceEnablePeerAccess(cann_ctx_src->device, 0));
1933
2041
  ggml_cann_set_device(cann_ctx_src->device);
1934
2042
  ACL_CHECK(aclrtDeviceEnablePeerAccess(cann_ctx_dst->device, 0));
1935
2043
 
1936
2044
  // wait for task_queue empty to keep task order.
1937
- cann_ctx_src->task_queue.wait();
1938
- ACL_CHECK(aclrtMemcpyAsync(dst->data, copy_size, src->data, copy_size,
1939
- ACL_MEMCPY_DEVICE_TO_DEVICE,
2045
+ ACL_CHECK(aclrtMemcpyAsync(dst->data, copy_size, src->data, copy_size, ACL_MEMCPY_DEVICE_TO_DEVICE,
1940
2046
  cann_ctx_src->stream()));
1941
-
1942
- //TODO: workaround for Event didn`t work here.
1943
- aclrtSynchronizeStream(cann_ctx_src->stream());
2047
+ // record event on src stream after the copy
2048
+ // TODO: this event is not effective with acl graph mode, change to use aclrtSynchronizeStream
2049
+ // if (!cann_ctx_src->copy_event) {
2050
+ // ACL_CHECK(aclrtCreateEventWithFlag(&cann_ctx_src->copy_event, ACL_EVENT_SYNC));
2051
+ // }
2052
+ // ACL_CHECK(aclrtRecordEvent(cann_ctx_src->copy_event, cann_ctx_src->stream()));
2053
+
2054
+ // // wait on dst stream for the copy to complete
2055
+ // ggml_cann_set_device(cann_ctx_dst->device);
2056
+ // ACL_CHECK(aclrtStreamWaitEvent(cann_ctx_dst->stream(), cann_ctx_src->copy_event));
2057
+ ACL_CHECK(aclrtSynchronizeStream(cann_ctx_src->stream()));
1944
2058
  } else {
1945
2059
  // src and dst are on the same backend
1946
- ACL_CHECK(aclrtMemcpyAsync(dst->data, copy_size, src->data, copy_size,
1947
- ACL_MEMCPY_DEVICE_TO_DEVICE,
2060
+ ACL_CHECK(aclrtMemcpyAsync(dst->data, copy_size, src->data, copy_size, ACL_MEMCPY_DEVICE_TO_DEVICE,
1948
2061
  cann_ctx_dst->stream()));
1949
2062
  }
1950
2063
 
@@ -1960,13 +2073,110 @@ static bool ggml_backend_cann_cpy_tensor_async(
1960
2073
  * @param backend Pointer to the CANN backend structure to synchronize.
1961
2074
  */
1962
2075
  static void ggml_backend_cann_synchronize(ggml_backend_t backend) {
1963
- ggml_backend_cann_context* cann_ctx =
1964
- (ggml_backend_cann_context*)backend->context;
1965
- cann_ctx->task_queue.wait();
2076
+ ggml_backend_cann_context * cann_ctx = (ggml_backend_cann_context *) backend->context;
1966
2077
  ggml_cann_set_device(cann_ctx->device);
1967
2078
  ACL_CHECK(aclrtSynchronizeStream(cann_ctx->stream()));
1968
2079
  }
1969
2080
 
2081
+ /**
2082
+ * @brief Check if CANN backend can fuse the specified operation sequence
2083
+ *
2084
+ * This function determines whether an operation sequence starting from the specified node
2085
+ * can be fused into an optimized operation in the CANN backend. Operation fusion can reduce
2086
+ * memory access overhead and improve computational efficiency.
2087
+ *
2088
+ * @param cgraph Pointer to the computation graph
2089
+ * @param node_idx Index of the starting node in the computation graph
2090
+ * @param ops Sequence of operation types to check for fusion
2091
+ * @return true if the operations can be fused
2092
+ * @return false if the operations cannot be fused
2093
+ */
2094
+ static bool ggml_cann_can_fuse(const struct ggml_cgraph * cgraph,
2095
+ int node_idx,
2096
+ std::initializer_list<enum ggml_op> ops) {
2097
+ if (!ggml_can_fuse(cgraph, node_idx, ops)) {
2098
+ return false;
2099
+ }
2100
+
2101
+ // CANN backend supports fusing ADD + RMS_NORM operations
2102
+ if ((ops.size() == 2) && ops.begin()[0] == GGML_OP_ADD && ops.begin()[1] == GGML_OP_RMS_NORM) {
2103
+ ggml_tensor * add_node = cgraph->nodes[node_idx];
2104
+ // TODO: support broadcast for ADD + RMS_NORM
2105
+ if (add_node->src[0]->ne[0] != add_node->src[1]->ne[0] || add_node->src[0]->ne[1] != add_node->src[1]->ne[1] ||
2106
+ add_node->src[0]->ne[2] != add_node->src[1]->ne[2] || add_node->src[0]->ne[3] != add_node->src[1]->ne[3]) {
2107
+ return false;
2108
+ }
2109
+ return true;
2110
+ }
2111
+
2112
+ return false;
2113
+ }
2114
+
2115
+ /**
2116
+ * @brief Evaluate the computation graph and optionally capture or execute it using CANN graph API.
2117
+ *
2118
+ * If CANN graph execution is enabled and graph capture is required, this function begins
2119
+ * graph capture, runs the graph, ends capture, and stores the captured graph.
2120
+ *
2121
+ * Otherwise, it falls back to op-by-op execution using the CANN compute kernel dispatcher.
2122
+ *
2123
+ * @param cann_ctx The CANN backend context.
2124
+ * @param cgraph The ggml computation graph.
2125
+ * @param use_cann_graph Whether to use CANN graph execution.
2126
+ * @param cann_graph_capture_required Whether graph capture is needed due to graph changes.
2127
+ */
2128
+ static void evaluate_and_capture_cann_graph(ggml_backend_cann_context * cann_ctx,
2129
+ ggml_cgraph * cgraph,
2130
+ bool use_cann_graph,
2131
+ bool cann_graph_capture_required) {
2132
+ #ifdef USE_ACL_GRAPH
2133
+ if (use_cann_graph && cann_graph_capture_required) { // Begin CANN graph capture
2134
+ ACL_CHECK(aclmdlRICaptureBegin(cann_ctx->stream(), ACL_MODEL_RI_CAPTURE_MODE_GLOBAL));
2135
+ }
2136
+ #endif // USE_ACL_GRAPH
2137
+ // Only perform the graph execution if CANN graphs are not enabled, or we are capturing the graph.
2138
+ // With the use of CANN graphs, the execution will be performed by the graph launch.
2139
+ static bool opt_fusion = parse_bool(get_env_as_lowercase("GGML_CANN_OPERATOR_FUSION").value_or(""));
2140
+
2141
+ if (!use_cann_graph || cann_graph_capture_required) {
2142
+ for (int i = 0; i < cgraph->n_nodes; i++) {
2143
+ ggml_tensor * node = cgraph->nodes[i];
2144
+ if (opt_fusion) {
2145
+ if (ggml_cann_can_fuse(cgraph, i, { GGML_OP_ADD, GGML_OP_RMS_NORM })) {
2146
+ ggml_cann_op_add_rms_norm_fused(*cann_ctx, node, cgraph->nodes[i + 1]);
2147
+ i++;
2148
+ continue;
2149
+ }
2150
+ }
2151
+
2152
+ if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE ||
2153
+ node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
2154
+ continue;
2155
+ }
2156
+
2157
+ bool ok = ggml_cann_compute_forward(*cann_ctx, node);
2158
+ if (!ok) {
2159
+ GGML_LOG_ERROR("%s: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
2160
+ }
2161
+ GGML_ASSERT(ok);
2162
+ }
2163
+ }
2164
+
2165
+ #ifdef USE_ACL_GRAPH
2166
+ if (use_cann_graph) {
2167
+ GGML_ASSERT(!cann_ctx->graph_lru_cache.cache_list.empty());
2168
+ ggml_cann_graph * matched_graph = cann_ctx->graph_lru_cache.cache_list.front();
2169
+
2170
+ if (cann_graph_capture_required) { // End CANN graph capture
2171
+ ACL_CHECK(aclmdlRICaptureEnd(cann_ctx->stream(), &matched_graph->graph));
2172
+ }
2173
+
2174
+ // Execute CANN graph
2175
+ ACL_CHECK(aclmdlRIExecuteAsync(matched_graph->graph, cann_ctx->stream()));
2176
+ }
2177
+ #endif // USE_ACL_GRAPH
2178
+ }
2179
+
1970
2180
  /**
1971
2181
  * @brief Computes a computational graph using a CANN backend.
1972
2182
  *
@@ -1979,28 +2189,50 @@ static void ggml_backend_cann_synchronize(ggml_backend_t backend) {
1979
2189
  * @return enum ggml_status Returns GGML_STATUS_SUCCESS if computation
1980
2190
  * completes successfully, otherwise an appropriate error status.
1981
2191
  */
1982
- static enum ggml_status ggml_backend_cann_graph_compute(
1983
- ggml_backend_t backend, ggml_cgraph* cgraph) {
1984
- ggml_backend_cann_context* cann_ctx =
1985
- (ggml_backend_cann_context*)backend->context;
1986
-
2192
+ static enum ggml_status ggml_backend_cann_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
2193
+ ggml_backend_cann_context * cann_ctx = (ggml_backend_cann_context *) backend->context;
1987
2194
  ggml_cann_set_device(cann_ctx->device);
1988
-
1989
- for (int i = 0; i < cgraph->n_nodes; i++) {
1990
- ggml_tensor* node = cgraph->nodes[i];
1991
-
1992
- if (ggml_is_empty(node) || node->op == GGML_OP_NONE) {
1993
- continue;
2195
+ g_nz_workspaces[cann_ctx->device].clear();
2196
+
2197
+ // calculate rope cache for fist layer in current device.
2198
+ cann_ctx->rope_cache.cached = false;
2199
+
2200
+ bool graph_capture_required = false;
2201
+ #ifdef USE_ACL_GRAPH
2202
+ bool use_cann_graph = true;
2203
+
2204
+ static bool prefill_use_graph = parse_bool(get_env_as_lowercase("GGML_CANN_PREFILL_USE_GRAPH").value_or(""));
2205
+ if (!prefill_use_graph) {
2206
+ // Do not use acl_graph for prefill.
2207
+ for (int i = 0; i < cgraph->n_nodes; i++) {
2208
+ ggml_tensor * node = cgraph->nodes[i];
2209
+ // TODO: Optimize here. Currently, we can only
2210
+ // get seq_len by FA's input.
2211
+ if (node->op == GGML_OP_FLASH_ATTN_EXT) {
2212
+ // Q -> src[0], shape: [B, S, N, D]
2213
+ use_cann_graph = (node->src[0]->ne[1] == 1);
2214
+ break;
2215
+ }
1994
2216
  }
2217
+ }
1995
2218
 
1996
- bool ok = ggml_cann_compute_forward(*cann_ctx, node);
2219
+ if (!cann_ctx->acl_graph_mode) {
2220
+ use_cann_graph = false;
2221
+ }
1997
2222
 
1998
- if (!ok) {
1999
- GGML_LOG_ERROR("%s: error: op not supported %s (%s)\n", __func__,
2000
- node->name, ggml_op_name(node->op));
2223
+ if (use_cann_graph) {
2224
+ // If no matching graph is found, the graph needs to be recaptured.
2225
+ graph_capture_required = !cann_ctx->graph_lru_cache.find_and_move_to_front(cgraph);
2226
+ if (graph_capture_required) {
2227
+ // If no matching graph is found, add a new ACL graph.
2228
+ ggml_cann_graph * new_graph = ggml_cann_graph::create_from_cgraph(cgraph);
2229
+ cann_ctx->graph_lru_cache.push(new_graph);
2001
2230
  }
2002
- GGML_ASSERT(ok);
2003
2231
  }
2232
+ #else
2233
+ bool use_cann_graph = false;
2234
+ #endif // USE_ACL_GRAPH
2235
+ evaluate_and_capture_cann_graph(cann_ctx, cgraph, use_cann_graph, graph_capture_required);
2004
2236
 
2005
2237
  return GGML_STATUS_SUCCESS;
2006
2238
  }
@@ -2017,8 +2249,7 @@ static enum ggml_status ggml_backend_cann_graph_compute(
2017
2249
  * @return bool Returns true if the operation is supported by the backend,
2018
2250
  * otherwise false.
2019
2251
  */
2020
- static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
2021
- const ggml_tensor* op) {
2252
+ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
2022
2253
  switch (op->op) {
2023
2254
  case GGML_OP_UNARY:
2024
2255
  switch (ggml_get_unary_op(op)) {
@@ -2036,28 +2267,41 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
2036
2267
  case GGML_UNARY_OP_ELU:
2037
2268
  case GGML_UNARY_OP_SGN:
2038
2269
  case GGML_UNARY_OP_STEP:
2270
+ case GGML_UNARY_OP_GELU_ERF:
2039
2271
  return true;
2040
2272
  default:
2041
2273
  return false;
2042
2274
  }
2043
- case GGML_OP_MUL_MAT: {
2044
- switch (op->src[0]->type) {
2045
- case GGML_TYPE_F16:
2046
- case GGML_TYPE_F32:
2275
+ case GGML_OP_GLU:
2276
+ switch (ggml_get_glu_op(op)) {
2277
+ case GGML_GLU_OP_REGLU:
2278
+ case GGML_GLU_OP_GEGLU:
2279
+ case GGML_GLU_OP_SWIGLU:
2280
+ case GGML_GLU_OP_GEGLU_ERF:
2281
+ case GGML_GLU_OP_GEGLU_QUICK:
2047
2282
  return true;
2048
- case GGML_TYPE_Q8_0:
2049
- case GGML_TYPE_Q4_0:
2050
- #ifdef ASCEND_310P
2051
- // Q4 && Q8 per group is not suppor on 310p device
2052
- return false;
2053
- #endif
2054
- // only support contiguous for quantized types.
2055
- return ggml_is_contiguous(op->src[0]) &&
2056
- ggml_is_contiguous(op->src[1]);
2057
2283
  default:
2058
2284
  return false;
2059
2285
  }
2060
- }
2286
+ break;
2287
+ case GGML_OP_MUL_MAT:
2288
+ {
2289
+ switch (op->src[0]->type) {
2290
+ case GGML_TYPE_F16:
2291
+ case GGML_TYPE_F32:
2292
+ return true;
2293
+ case GGML_TYPE_Q8_0:
2294
+ case GGML_TYPE_Q4_0:
2295
+ #ifdef ASCEND_310P
2296
+ // Q4 && Q8 per group is not support on 310p device
2297
+ return false;
2298
+ #endif
2299
+ // only support contiguous for quantized types.
2300
+ return ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]);
2301
+ default:
2302
+ return false;
2303
+ }
2304
+ }
2061
2305
  case GGML_OP_MUL_MAT_ID:
2062
2306
  switch (op->src[0]->type) {
2063
2307
  case GGML_TYPE_F16:
@@ -2066,106 +2310,112 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
2066
2310
  case GGML_TYPE_Q8_0:
2067
2311
  case GGML_TYPE_Q4_0:
2068
2312
  #ifdef ASCEND_310P
2069
- // Q4 && Q8 per group is not suppor on 310p device
2313
+ // Q4 && Q8 per group is not support on 310p device
2070
2314
  return false;
2071
2315
  #endif
2072
2316
  // only support contiguous for quantized types.
2073
- return ggml_is_contiguous(op->src[0]) &&
2074
- ggml_is_contiguous(op->src[1]);
2317
+ return ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]);
2075
2318
  default:
2076
2319
  return false;
2077
2320
  }
2078
2321
  // embedding
2079
- case GGML_OP_GET_ROWS: {
2080
- switch (op->src[0]->type) {
2081
- case GGML_TYPE_F32:
2082
- case GGML_TYPE_F16:
2083
- case GGML_TYPE_Q8_0:
2084
- return true;
2085
- default:
2086
- return false;
2087
- }
2088
- } break;
2089
- case GGML_OP_CPY: {
2090
- ggml_tensor *src = op->src[0];
2091
- if ((op->type != GGML_TYPE_F32 && op->type != GGML_TYPE_F16) ||
2092
- (src->type != GGML_TYPE_F32 &&
2093
- src->type != GGML_TYPE_F16)) {
2094
- // only support F32 and F16.
2095
- return false;
2322
+ case GGML_OP_GET_ROWS:
2323
+ {
2324
+ switch (op->src[0]->type) {
2325
+ case GGML_TYPE_F32:
2326
+ case GGML_TYPE_F16:
2327
+ case GGML_TYPE_Q8_0:
2328
+ return true;
2329
+ default:
2330
+ return false;
2331
+ }
2096
2332
  }
2097
-
2098
- if (!ggml_are_same_shape(op, src) && !ggml_is_contiguous(op)) {
2099
- // unsupport dst is not contiguous.
2100
- return false;
2333
+ break;
2334
+ case GGML_OP_SET_ROWS:
2335
+ {
2336
+ switch (op->type) {
2337
+ case GGML_TYPE_F32:
2338
+ case GGML_TYPE_F16:
2339
+ return true;
2340
+ default:
2341
+ return false;
2342
+ }
2101
2343
  }
2102
-
2103
- return true;
2104
- } break;
2105
- case GGML_OP_CONT: {
2106
- // TODO: support GGML_TYPE_BF16
2107
- switch (op->src[0]->type) {
2108
- case GGML_TYPE_F32:
2109
- case GGML_TYPE_F16:
2110
- return true;
2111
- default:
2344
+ break;
2345
+ case GGML_OP_CPY:
2346
+ {
2347
+ ggml_tensor * src = op->src[0];
2348
+ if ((op->type != GGML_TYPE_F32 && op->type != GGML_TYPE_F16) ||
2349
+ (src->type != GGML_TYPE_F32 && src->type != GGML_TYPE_F16)) {
2350
+ // only support F32 and F16.
2112
2351
  return false;
2352
+ }
2353
+ return true;
2113
2354
  }
2114
- }
2115
- case GGML_OP_ROPE: {
2116
- // TODO: with ops-test v == 1
2117
- float ext_factor = 0.0f;
2118
- memcpy(&ext_factor, (const float *) op->op_params + 7, sizeof(float));
2119
- // TODO: n_dims <= ne0
2120
- if (op->src[0]->ne[0] != op->op_params[1]) {
2121
- return false;
2122
- }
2123
- // TODO: ext_factor != 0
2124
- if (ext_factor != 0) {
2125
- return false;
2126
- }
2127
-
2128
- const int mode = ((const int32_t *) op->op_params)[2];
2129
- if (mode & GGML_ROPE_TYPE_MROPE) {
2130
- return false;
2131
- }
2132
- if (mode & GGML_ROPE_TYPE_VISION) {
2133
- return false;
2134
- }
2135
-
2136
- if(!ggml_is_contiguous(op->src[0])){
2137
- return false;
2355
+ break;
2356
+ case GGML_OP_CONT:
2357
+ {
2358
+ // TODO: support GGML_TYPE_BF16
2359
+ switch (op->src[0]->type) {
2360
+ case GGML_TYPE_F32:
2361
+ case GGML_TYPE_F16:
2362
+ return true;
2363
+ default:
2364
+ return false;
2365
+ }
2138
2366
  }
2139
- return true;
2140
- }
2141
- case GGML_OP_UPSCALE: {
2142
- // aclnnUpsampleNearest2dGetWorkspaceSize not support
2143
- // selfDimN[2]/outDimN[2] or selfDimC[3]/outDimC[3] not equal
2144
- if (op->src[0]->ne[2] * op->ne[3] != op->src[0]->ne[3] * op->ne[2]) {
2145
- return false;
2367
+ case GGML_OP_ROPE:
2368
+ {
2369
+ if (op->src[0]->ne[0] > 896) {
2370
+ return false;
2371
+ }
2372
+ #ifdef ASCEND_310P
2373
+ // TODO: Support rope_dim < ne00(dim)
2374
+ if (op->src[0]->ne[0] != op->op_params[1]) {
2375
+ return false;
2376
+ }
2377
+ if (!ggml_is_contiguous(op->src[0])) {
2378
+ return false;
2379
+ }
2380
+ #endif
2381
+ return true;
2146
2382
  }
2147
- if (op->op_params[0] != GGML_SCALE_MODE_NEAREST) {
2148
- return false;
2383
+ case GGML_OP_UPSCALE:
2384
+ {
2385
+ // aclnnUpsampleNearest2dGetWorkspaceSize not support
2386
+ // selfDimN[2]/outDimN[2] or selfDimC[3]/outDimC[3] not equal
2387
+ if (op->src[0]->ne[2] * op->ne[3] != op->src[0]->ne[3] * op->ne[2]) {
2388
+ return false;
2389
+ }
2390
+ if (op->op_params[0] != GGML_SCALE_MODE_NEAREST) {
2391
+ return false;
2392
+ }
2393
+ if (op->op_params[0] & GGML_SCALE_FLAG_ANTIALIAS) {
2394
+ return false;
2395
+ }
2396
+ return true;
2149
2397
  }
2150
- return true;
2151
- }
2152
- case GGML_OP_POOL_2D: {
2153
- const int32_t * opts = (const int32_t *) op->op_params;
2398
+ case GGML_OP_POOL_2D:
2399
+ {
2400
+ const int32_t * opts = (const int32_t *) op->op_params;
2154
2401
  #ifdef ASCEND_310P
2155
- enum ggml_op_pool opt = static_cast<ggml_op_pool>(opts[0]);
2156
- if(opt == GGML_OP_POOL_MAX){
2157
- return false;
2158
- }
2402
+ enum ggml_op_pool opt = static_cast<ggml_op_pool>(opts[0]);
2403
+ if (opt == GGML_OP_POOL_MAX) {
2404
+ return false;
2405
+ }
2159
2406
  #endif
2160
- const int k0 = opts[1];
2161
- const int k1 = opts[2];
2162
- const int p0 = opts[5];
2163
- const int p1 = opts[6];
2164
- // value of paddingH should be at most half of kernelH
2165
- // value of paddingW should be at most half of kernelW
2166
- return (p0 <= (k0 / 2)) && (p1 <= (k1 / 2));
2167
- }
2407
+ const int k0 = opts[1];
2408
+ const int k1 = opts[2];
2409
+ const int p0 = opts[5];
2410
+ const int p1 = opts[6];
2411
+ // value of paddingH should be at most half of kernelH
2412
+ // value of paddingW should be at most half of kernelW
2413
+ return (p0 <= (k0 / 2)) && (p1 <= (k1 / 2));
2414
+ }
2168
2415
  case GGML_OP_SUM:
2416
+ return ggml_is_contiguous_rows(op->src[0]);
2417
+ case GGML_OP_L2_NORM:
2418
+ case GGML_OP_CROSS_ENTROPY_LOSS:
2169
2419
  case GGML_OP_DUP:
2170
2420
  case GGML_OP_IM2COL:
2171
2421
  case GGML_OP_CONCAT:
@@ -2182,61 +2432,93 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
2182
2432
  case GGML_OP_MUL:
2183
2433
  case GGML_OP_DIV:
2184
2434
  case GGML_OP_RMS_NORM:
2185
- case GGML_OP_SCALE:
2186
2435
  case GGML_OP_SQR:
2187
2436
  case GGML_OP_SQRT:
2188
2437
  case GGML_OP_CLAMP:
2189
2438
  case GGML_OP_DIAG_MASK_INF:
2190
- case GGML_OP_SOFT_MAX:
2191
2439
  case GGML_OP_SUM_ROWS:
2192
2440
  case GGML_OP_ARGSORT:
2193
2441
  case GGML_OP_ACC:
2194
2442
  case GGML_OP_GROUP_NORM:
2443
+ return true;
2195
2444
  case GGML_OP_PAD:
2445
+ // TODO: add circular padding support for cann, see https://github.com/ggml-org/llama.cpp/pull/16985
2446
+ return ggml_get_op_params_i32(op, 8) == 0;
2196
2447
  case GGML_OP_ARANGE:
2197
2448
  case GGML_OP_TIMESTEP_EMBEDDING:
2198
2449
  case GGML_OP_LEAKY_RELU:
2199
2450
  case GGML_OP_ARGMAX:
2200
2451
  case GGML_OP_COS:
2201
2452
  case GGML_OP_SIN:
2202
- case GGML_OP_CONV_TRANSPOSE_1D:
2203
2453
  case GGML_OP_LOG:
2204
2454
  case GGML_OP_MEAN:
2205
2455
  case GGML_OP_PAD_REFLECT_1D:
2206
2456
  case GGML_OP_COUNT_EQUAL:
2207
2457
  return true;
2208
- case GGML_OP_FLASH_ATTN_EXT:{
2209
- // derived from [ggml-cuda.cu]
2210
- if(op->src[1]->type != GGML_TYPE_F16 || op->src[2]->type != GGML_TYPE_F16){
2211
- return false;
2212
- }
2213
- if(op->src[1]->type != GGML_TYPE_F16 && op->src[1]->type != GGML_TYPE_F32 && op->src[1]->type != GGML_TYPE_BF16){
2214
- return false;
2215
- }
2216
- if(op->type != GGML_TYPE_F16 && op->type != GGML_TYPE_F32 && op->type != GGML_TYPE_BF16){
2217
- return false;
2218
- }
2219
- if (op->src[1]->ne[0] != op->src[2]->ne[0]) {
2220
- // different head sizes of K and V are not supported yet
2221
- return false;
2222
- }
2223
- if (op->src[0]->ne[0] == 192) {
2224
- return false;
2225
- }
2226
- if (op->src[0]->ne[0] == 576) {
2227
- // DeepSeek MLA
2458
+ case GGML_OP_OUT_PROD:
2459
+ {
2460
+ #ifdef ASCEND_310P
2461
+ // Ger is not supported on 310p device
2228
2462
  return false;
2463
+ #endif
2464
+ switch (op->src[0]->type) {
2465
+ case GGML_TYPE_F16:
2466
+ case GGML_TYPE_F32:
2467
+ return true;
2468
+ default:
2469
+ return false;
2470
+ }
2229
2471
  }
2230
- if (op->src[0]->ne[3] != 1) {
2472
+ case GGML_OP_CONV_TRANSPOSE_1D:
2473
+ return true;
2474
+ case GGML_OP_SCALE:
2475
+ float bias;
2476
+ memcpy(&bias, (const float *) (op->op_params) + 1, sizeof(float));
2477
+ return bias == 0.0f; // TODO: support bias != 0.0f
2478
+ case GGML_OP_SOFT_MAX:
2479
+ // TODO: support attention sinks [TAG_ATTN_SINKS]
2480
+ if (op->src[2]) {
2231
2481
  return false;
2232
2482
  }
2233
- float logitSoftcap = 0.0f;
2234
- memcpy(&logitSoftcap, (float*)op->op_params + 2, sizeof(float));
2235
- if(logitSoftcap != 0.0f) {
2483
+ return true;
2484
+ case GGML_OP_FLASH_ATTN_EXT:
2485
+ {
2486
+ #ifdef ASCEND_310P
2487
+ // FA not support on 310p device
2236
2488
  return false;
2489
+ #endif
2490
+ // derived from [ggml-cuda.cu]
2491
+ if (op->src[1]->type != GGML_TYPE_F16 || op->src[2]->type != GGML_TYPE_F16) {
2492
+ return false;
2493
+ }
2494
+ if (op->src[1]->type != GGML_TYPE_F16 && op->src[1]->type != GGML_TYPE_F32 &&
2495
+ op->src[1]->type != GGML_TYPE_BF16) {
2496
+ return false;
2497
+ }
2498
+ if (op->type != GGML_TYPE_F16 && op->type != GGML_TYPE_F32 && op->type != GGML_TYPE_BF16) {
2499
+ return false;
2500
+ }
2501
+ // TODO: support attention sinks [TAG_ATTN_SINKS]
2502
+ if (op->src[4]) {
2503
+ return false;
2504
+ }
2505
+ if (op->src[1]->ne[0] != op->src[2]->ne[0]) {
2506
+ // different head sizes of K and V are not supported yet
2507
+ return false;
2508
+ }
2509
+ if (op->src[0]->ne[0] % 16 != 0) {
2510
+ // TODO: padding to support
2511
+ return false;
2512
+ }
2513
+ float logitSoftcap = 0.0f;
2514
+ memcpy(&logitSoftcap, (const float *) (op->op_params) + 2, sizeof(float));
2515
+ if (logitSoftcap != 0.0f) {
2516
+ return false;
2517
+ }
2518
+ return true;
2237
2519
  }
2520
+ case GGML_OP_SSM_CONV:
2238
2521
  return true;
2239
- }
2240
2522
  default:
2241
2523
  return false;
2242
2524
  }
@@ -2259,28 +2541,6 @@ static bool ggml_backend_buft_is_cann(ggml_backend_buffer_type_t buft) {
2259
2541
  return buft->iface.get_name == ggml_backend_cann_buffer_type_name;
2260
2542
  }
2261
2543
 
2262
- /**
2263
- * @brief Determines if a tensor operation should be offloaded to the CANN
2264
- * backend.
2265
- *
2266
- * This function checks if a given tensor operation should be offloaded to the
2267
- * CANN backend based on the operation type and the size of the tensor. It
2268
- * returns true if the second dimension (ne[1]) of the tensor is greater than or
2269
- * equal to the minimum batch size and the operation is not GGML_OP_GET_ROWS.
2270
- *
2271
- * @param backend Pointer to the CANN backend.
2272
- * @param op Pointer to the tensor operation to check.
2273
- * @return bool Returns true if the operation should be offloaded, otherwise
2274
- * false.
2275
- */
2276
- static bool ggml_backend_cann_offload_op(ggml_backend_dev_t dev,
2277
- const ggml_tensor* op) {
2278
- const int min_batch_size = 32;
2279
- GGML_UNUSED(dev);
2280
-
2281
- return op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS;
2282
- }
2283
-
2284
2544
  /**
2285
2545
  * @brief Records an event on the CANN backend stream.
2286
2546
  *
@@ -2290,9 +2550,8 @@ static bool ggml_backend_cann_offload_op(ggml_backend_dev_t dev,
2290
2550
  * @param event Pointer to the event structure to be recorded.
2291
2551
  */
2292
2552
  static void ggml_backend_cann_event_record(ggml_backend_t backend, ggml_backend_event_t event) {
2293
- ggml_backend_cann_context* cann_ctx =
2294
- (ggml_backend_cann_context*)backend->context;
2295
- ACL_CHECK(aclrtRecordEvent((aclrtEvent)event->context, cann_ctx->stream()));
2553
+ ggml_backend_cann_context * cann_ctx = (ggml_backend_cann_context *) backend->context;
2554
+ ACL_CHECK(aclrtRecordEvent((aclrtEvent) event->context, cann_ctx->stream()));
2296
2555
  }
2297
2556
 
2298
2557
  /**
@@ -2305,13 +2564,10 @@ static void ggml_backend_cann_event_record(ggml_backend_t backend, ggml_backend_
2305
2564
  * @param event Pointer to the event structure that the backend needs to wait
2306
2565
  * for.
2307
2566
  */
2308
- static void ggml_backend_cann_event_wait(ggml_backend_t backend,
2309
- ggml_backend_event_t event) {
2310
- ggml_backend_cann_context* cann_ctx =
2311
- (ggml_backend_cann_context*)backend->context;
2567
+ static void ggml_backend_cann_event_wait(ggml_backend_t backend, ggml_backend_event_t event) {
2568
+ ggml_backend_cann_context * cann_ctx = (ggml_backend_cann_context *) backend->context;
2312
2569
  if (ggml_backend_is_cann(backend)) {
2313
- ACL_CHECK(aclrtStreamWaitEvent(cann_ctx->stream(),
2314
- (aclrtEvent)event->context));
2570
+ ACL_CHECK(aclrtStreamWaitEvent(cann_ctx->stream(), (aclrtEvent) event->context));
2315
2571
  } else {
2316
2572
  GGML_ABORT("fatal error");
2317
2573
  }
@@ -2338,6 +2594,7 @@ static const ggml_backend_i ggml_backend_cann_interface = {
2338
2594
  /* .graph_compute = */ ggml_backend_cann_graph_compute,
2339
2595
  /* .event_record = */ ggml_backend_cann_event_record,
2340
2596
  /* .event_wait = */ ggml_backend_cann_event_wait,
2597
+ /* .graph_optimize = */ NULL,
2341
2598
  };
2342
2599
 
2343
2600
  /**
@@ -2349,30 +2606,31 @@ static const ggml_backend_i ggml_backend_cann_interface = {
2349
2606
  * @return A pointer to the static GUID.
2350
2607
  */
2351
2608
  static ggml_guid_t ggml_backend_cann_guid() {
2352
- static ggml_guid guid = {0xa1, 0x94, 0xaf, 0xac, 0xbd, 0x4f, 0x47, 0x34,
2353
- 0xbe, 0x1a, 0x9e, 0x71, 0x1f, 0x9e, 0xed, 0x64};
2609
+ static ggml_guid guid = { 0xa1, 0x94, 0xaf, 0xac, 0xbd, 0x4f, 0x47, 0x34,
2610
+ 0xbe, 0x1a, 0x9e, 0x71, 0x1f, 0x9e, 0xed, 0x64 };
2354
2611
  return &guid;
2355
2612
  }
2356
2613
 
2357
2614
  // backend device
2358
2615
  struct ggml_backend_cann_device_context {
2359
- int device;
2616
+ int device;
2360
2617
  std::string name;
2361
2618
  std::string description;
2619
+ int op_offload_min_batch_size;
2362
2620
  };
2363
2621
 
2364
2622
  static const char * ggml_backend_cann_device_get_name(ggml_backend_dev_t dev) {
2365
- ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *)dev->context;
2623
+ ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *) dev->context;
2366
2624
  return ctx->name.c_str();
2367
2625
  }
2368
2626
 
2369
- static const char* ggml_backend_cann_device_get_description(ggml_backend_dev_t dev) {
2370
- ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *)dev->context;
2627
+ static const char * ggml_backend_cann_device_get_description(ggml_backend_dev_t dev) {
2628
+ ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *) dev->context;
2371
2629
  return ctx->description.c_str();
2372
2630
  }
2373
2631
 
2374
2632
  static void ggml_backend_cann_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
2375
- ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *)dev->context;
2633
+ ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *) dev->context;
2376
2634
  ggml_backend_cann_get_device_memory(ctx->device, free, total);
2377
2635
  }
2378
2636
 
@@ -2399,7 +2657,7 @@ static void ggml_backend_cann_device_get_props(ggml_backend_dev_t dev, ggml_back
2399
2657
 
2400
2658
  static ggml_backend_t ggml_backend_cann_device_init(ggml_backend_dev_t dev, const char * params) {
2401
2659
  GGML_UNUSED(params);
2402
- ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *)dev->context;
2660
+ ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *) dev->context;
2403
2661
  return ggml_backend_cann_init(ctx->device);
2404
2662
  }
2405
2663
 
@@ -2416,19 +2674,17 @@ static ggml_backend_t ggml_backend_cann_device_init(ggml_backend_dev_t dev, cons
2416
2674
  * @return bool Returns true if the CANN backend supports the buffer type,
2417
2675
  * otherwise false.
2418
2676
  */
2419
- static bool ggml_backend_cann_supports_buft(
2420
- ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
2677
+ static bool ggml_backend_cann_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
2421
2678
  if (ggml_backend_buft_is_cann(buft)) {
2422
- ggml_backend_cann_device_context * dev_ctx = (ggml_backend_cann_device_context *)dev->context;
2423
- ggml_backend_cann_buffer_type_context * buft_ctx =
2424
- (ggml_backend_cann_buffer_type_context *)buft->context;
2679
+ ggml_backend_cann_device_context * dev_ctx = (ggml_backend_cann_device_context *) dev->context;
2680
+ ggml_backend_cann_buffer_type_context * buft_ctx = (ggml_backend_cann_buffer_type_context *) buft->context;
2425
2681
  return buft_ctx->device == dev_ctx->device;
2426
2682
  }
2427
2683
  return false;
2428
2684
  }
2429
2685
 
2430
2686
  static ggml_backend_buffer_type_t ggml_backend_cann_device_get_buffer_type(ggml_backend_dev_t dev) {
2431
- ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *)dev->context;
2687
+ ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *) dev->context;
2432
2688
  return ggml_backend_cann_buffer_type(ctx->device);
2433
2689
  }
2434
2690
 
@@ -2437,6 +2693,26 @@ static ggml_backend_buffer_type_t ggml_backend_cann_device_get_host_buffer_type(
2437
2693
  return ggml_backend_cann_host_buffer_type();
2438
2694
  }
2439
2695
 
2696
+ /**
2697
+ * @brief Determines if a tensor operation should be offloaded to the CANN
2698
+ * backend.
2699
+ *
2700
+ * This function checks if a given tensor operation should be offloaded to the
2701
+ * CANN backend based on the operation type and the size of the tensor. It
2702
+ * returns true if the second dimension (ne[1]) of the tensor is greater than or
2703
+ * equal to the minimum batch size and the operation is not GGML_OP_GET_ROWS.
2704
+ *
2705
+ * @param backend Pointer to the CANN backend.
2706
+ * @param op Pointer to the tensor operation to check.
2707
+ * @return bool Returns true if the operation should be offloaded, otherwise
2708
+ * false.
2709
+ */
2710
+ static bool ggml_backend_cann_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
2711
+ ggml_backend_cann_device_context * dev_ctx = (ggml_backend_cann_device_context *)dev->context;
2712
+
2713
+ return op->ne[1] >= dev_ctx->op_offload_min_batch_size && op->op != GGML_OP_GET_ROWS;
2714
+ }
2715
+
2440
2716
  /**
2441
2717
  * @brief Creates a new event for the CANN backend device.
2442
2718
  *
@@ -2447,9 +2723,8 @@ static ggml_backend_buffer_type_t ggml_backend_cann_device_get_host_buffer_type(
2447
2723
  * @param backend Pointer to the CANN backend.
2448
2724
  * @return ggml_backend_event_t Returns a pointer to the new event structure.
2449
2725
  */
2450
- static ggml_backend_event_t ggml_backend_cann_device_event_new(
2451
- ggml_backend_dev_t dev) {
2452
- ggml_backend_cann_device_context * dev_ctx = (ggml_backend_cann_device_context *)dev->context;
2726
+ static ggml_backend_event_t ggml_backend_cann_device_event_new(ggml_backend_dev_t dev) {
2727
+ ggml_backend_cann_device_context * dev_ctx = (ggml_backend_cann_device_context *) dev->context;
2453
2728
 
2454
2729
  ggml_cann_set_device(dev_ctx->device);
2455
2730
 
@@ -2471,7 +2746,7 @@ static ggml_backend_event_t ggml_backend_cann_device_event_new(
2471
2746
  * @param event Pointer to the event structure to be freed.
2472
2747
  */
2473
2748
  static void ggml_backend_cann_device_event_free(ggml_backend_dev_t dev, ggml_backend_event_t event) {
2474
- ACL_CHECK(aclrtDestroyEvent((aclrtEvent)event->context));
2749
+ ACL_CHECK(aclrtDestroyEvent((aclrtEvent) event->context));
2475
2750
 
2476
2751
  delete event;
2477
2752
  GGML_UNUSED(dev);
@@ -2485,7 +2760,7 @@ static void ggml_backend_cann_device_event_free(ggml_backend_dev_t dev, ggml_bac
2485
2760
  * @param event Pointer to the event structure to be synchronized.
2486
2761
  */
2487
2762
  static void ggml_backend_cann_device_event_synchronize(ggml_backend_dev_t dev, ggml_backend_event_t event) {
2488
- ACL_CHECK(aclrtSynchronizeEvent((aclrtEvent)event->context));
2763
+ ACL_CHECK(aclrtSynchronizeEvent((aclrtEvent) event->context));
2489
2764
 
2490
2765
  GGML_UNUSED(dev);
2491
2766
  }
@@ -2496,10 +2771,10 @@ static const ggml_backend_device_i ggml_backend_cann_device_interface = {
2496
2771
  /* .get_memory = */ ggml_backend_cann_device_get_memory,
2497
2772
  /* .get_type = */ ggml_backend_cann_device_get_type,
2498
2773
  /* .get_props = */ ggml_backend_cann_device_get_props,
2499
- /* .init_backend = */ ggml_backend_cann_device_init, // called for every card
2774
+ /* .init_backend = */ ggml_backend_cann_device_init, // called for every card
2500
2775
  /* .get_buffer_type = */ ggml_backend_cann_device_get_buffer_type,
2501
2776
  /* .get_host_buffer_type = */ ggml_backend_cann_device_get_host_buffer_type,
2502
- /* .buffer_from_host_ptr = */ NULL, // not supported for CANN
2777
+ /* .buffer_from_host_ptr = */ NULL, // not supported for CANN
2503
2778
  /* .supports_op = */ ggml_backend_cann_supports_op,
2504
2779
  /* .supports_buft = */ ggml_backend_cann_supports_buft,
2505
2780
  /* .offload_op = */ ggml_backend_cann_offload_op,
@@ -2508,7 +2783,6 @@ static const ggml_backend_device_i ggml_backend_cann_device_interface = {
2508
2783
  /* .event_synchronize = */ ggml_backend_cann_device_event_synchronize,
2509
2784
  };
2510
2785
 
2511
-
2512
2786
  // backend reg
2513
2787
  struct ggml_backend_cann_reg_context {
2514
2788
  std::vector<ggml_backend_dev_t> devices;
@@ -2520,12 +2794,12 @@ static const char * ggml_backend_cann_reg_get_name(ggml_backend_reg_t reg) {
2520
2794
  }
2521
2795
 
2522
2796
  static size_t ggml_backend_cann_reg_get_device_count(ggml_backend_reg_t reg) {
2523
- ggml_backend_cann_reg_context * ctx = (ggml_backend_cann_reg_context *)reg->context;
2797
+ ggml_backend_cann_reg_context * ctx = (ggml_backend_cann_reg_context *) reg->context;
2524
2798
  return ctx->devices.size();
2525
2799
  }
2526
2800
 
2527
2801
  static ggml_backend_dev_t ggml_backend_cann_reg_get_device(ggml_backend_reg_t reg, size_t index) {
2528
- ggml_backend_cann_reg_context * ctx = (ggml_backend_cann_reg_context *)reg->context;
2802
+ ggml_backend_cann_reg_context * ctx = (ggml_backend_cann_reg_context *) reg->context;
2529
2803
  GGML_ASSERT(index < ctx->devices.size());
2530
2804
  return ctx->devices[index];
2531
2805
  }
@@ -2547,34 +2821,32 @@ static const ggml_backend_reg_i ggml_backend_cann_reg_interface = {
2547
2821
  // backend registry, called only once for cann backend
2548
2822
  ggml_backend_reg_t ggml_backend_cann_reg() {
2549
2823
  static ggml_backend_reg reg;
2550
- static bool initialized = false;
2824
+ static bool initialized = false;
2551
2825
 
2552
2826
  {
2553
- static std::mutex mutex;
2827
+ static std::mutex mutex;
2554
2828
  std::lock_guard<std::mutex> lock(mutex);
2555
2829
  if (!initialized) {
2556
2830
  aclInit(nullptr);
2557
2831
  ggml_backend_cann_reg_context * ctx = new ggml_backend_cann_reg_context;
2832
+ const int min_batch_size = getenv("GGML_OP_OFFLOAD_MIN_BATCH") ? atoi(getenv("GGML_OP_OFFLOAD_MIN_BATCH")) : 32;
2558
2833
 
2559
2834
  for (int i = 0; i < ggml_cann_info().device_count; i++) {
2560
- ggml_backend_cann_device_context* dev_ctx = new ggml_backend_cann_device_context();
2561
- dev_ctx->description = aclrtGetSocName();
2562
- dev_ctx->device = i;
2563
- dev_ctx->name = GGML_CANN_NAME + std::to_string(i);
2835
+ ggml_backend_cann_device_context * dev_ctx = new ggml_backend_cann_device_context();
2836
+ dev_ctx->description = aclrtGetSocName();
2837
+ dev_ctx->device = i;
2838
+ dev_ctx->name = GGML_CANN_NAME + std::to_string(i);
2839
+ dev_ctx->op_offload_min_batch_size = min_batch_size;
2564
2840
  ggml_cann_set_device(i);
2565
- ggml_backend_dev_t dev = new ggml_backend_device {
2566
- /* .iface = */ ggml_backend_cann_device_interface,
2567
- /* .reg = */ &reg,
2568
- /* .context = */ dev_ctx
2569
- };
2841
+ ggml_backend_dev_t dev = new ggml_backend_device{ /* .iface = */ ggml_backend_cann_device_interface,
2842
+ /* .reg = */ &reg,
2843
+ /* .context = */ dev_ctx };
2570
2844
  ctx->devices.push_back(dev);
2571
2845
  }
2572
2846
 
2573
- reg = ggml_backend_reg {
2574
- /* .api_version = */ GGML_BACKEND_API_VERSION,
2575
- /* .iface = */ ggml_backend_cann_reg_interface,
2576
- /* .context = */ ctx
2577
- };
2847
+ reg = ggml_backend_reg{ /* .api_version = */ GGML_BACKEND_API_VERSION,
2848
+ /* .iface = */ ggml_backend_cann_reg_interface,
2849
+ /* .context = */ ctx };
2578
2850
  }
2579
2851
 
2580
2852
  initialized = true;
@@ -2590,39 +2862,36 @@ ggml_backend_t ggml_backend_cann_init(int32_t device) {
2590
2862
  return nullptr;
2591
2863
  }
2592
2864
 
2593
- ggml_backend_cann_context* ctx = new ggml_backend_cann_context(device);
2865
+ ggml_backend_cann_context * ctx = new ggml_backend_cann_context(device);
2594
2866
  if (ctx == nullptr) {
2595
2867
  GGML_LOG_ERROR("%s: error: failed to allocate context\n", __func__);
2596
2868
  return nullptr;
2597
2869
  }
2598
2870
  ggml_cann_set_device(ctx->device);
2599
2871
  ggml_backend_t cann_backend =
2600
- new ggml_backend{/* .guid = */ ggml_backend_cann_guid(),
2601
- /* .interface = */ ggml_backend_cann_interface,
2602
- /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cann_reg(), device),
2603
- /* .context = */ ctx};
2872
+ new ggml_backend{ /* .guid = */ ggml_backend_cann_guid(),
2873
+ /* .interface = */ ggml_backend_cann_interface,
2874
+ /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cann_reg(), device),
2875
+ /* .context = */ ctx };
2604
2876
 
2605
2877
  return cann_backend;
2606
2878
  }
2607
2879
 
2608
2880
  bool ggml_backend_is_cann(ggml_backend_t backend) {
2609
- return backend != NULL &&
2610
- ggml_guid_matches(backend->guid, ggml_backend_cann_guid());
2881
+ return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_cann_guid());
2611
2882
  }
2612
2883
 
2613
2884
  int32_t ggml_backend_cann_get_device_count() {
2614
2885
  return ggml_cann_info().device_count;
2615
2886
  }
2616
2887
 
2617
- void ggml_backend_cann_get_device_description(
2618
- int32_t device, char* description, size_t description_size) {
2888
+ void ggml_backend_cann_get_device_description(int32_t device, char * description, size_t description_size) {
2619
2889
  ggml_cann_set_device(device);
2620
- const char* soc_name = aclrtGetSocName();
2890
+ const char * soc_name = aclrtGetSocName();
2621
2891
  snprintf(description, description_size, "%s", soc_name);
2622
2892
  }
2623
2893
 
2624
- void ggml_backend_cann_get_device_memory(int32_t device, size_t* free,
2625
- size_t* total) {
2894
+ void ggml_backend_cann_get_device_memory(int32_t device, size_t * free, size_t * total) {
2626
2895
  ggml_cann_set_device(device);
2627
2896
  ACL_CHECK(aclrtGetMemInfo(ACL_HBM_MEM, free, total));
2628
2897
  }