whispercpp 1.3.4 → 1.3.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (630)
  1. checksums.yaml +4 -4
  2. data/README.md +60 -43
  3. data/ext/extconf.rb +2 -2
  4. data/ext/ruby_whisper.c +14 -2
  5. data/ext/ruby_whisper.h +39 -0
  6. data/ext/ruby_whisper_context.c +22 -22
  7. data/ext/ruby_whisper_model.c +12 -12
  8. data/ext/ruby_whisper_params.c +47 -23
  9. data/ext/ruby_whisper_segment.c +84 -19
  10. data/ext/ruby_whisper_token.c +351 -0
  11. data/ext/ruby_whisper_transcribe.cpp +1 -1
  12. data/ext/ruby_whisper_vad_context.c +75 -0
  13. data/ext/ruby_whisper_vad_context_detect.cpp +50 -0
  14. data/ext/ruby_whisper_vad_segment.c +139 -0
  15. data/ext/ruby_whisper_vad_segments.c +106 -0
  16. data/ext/sources/CMakeLists.txt +4 -1
  17. data/ext/sources/bindings/javascript/package.json +1 -1
  18. data/ext/sources/cmake/arm64-apple-clang.cmake +16 -0
  19. data/ext/sources/cmake/arm64-windows-llvm.cmake +16 -0
  20. data/ext/sources/cmake/riscv64-spacemit-linux-gnu-gcc.cmake +29 -0
  21. data/ext/sources/cmake/x64-windows-llvm.cmake +5 -0
  22. data/ext/sources/examples/addon.node/vad-example.js +2 -2
  23. data/ext/sources/examples/cli/cli.cpp +121 -112
  24. data/ext/sources/examples/lsp/CMakeLists.txt +2 -1
  25. data/ext/sources/examples/quantize/CMakeLists.txt +2 -1
  26. data/ext/sources/examples/server/server.cpp +10 -11
  27. data/ext/sources/examples/talk-llama/CMakeLists.txt +5 -1
  28. data/ext/sources/examples/talk-llama/llama-adapter.cpp +12 -3
  29. data/ext/sources/examples/talk-llama/llama-adapter.h +7 -1
  30. data/ext/sources/examples/talk-llama/llama-arch.cpp +2046 -1974
  31. data/ext/sources/examples/talk-llama/llama-arch.h +67 -2
  32. data/ext/sources/examples/talk-llama/llama-batch.cpp +75 -33
  33. data/ext/sources/examples/talk-llama/llama-batch.h +17 -4
  34. data/ext/sources/examples/talk-llama/llama-chat.cpp +79 -3
  35. data/ext/sources/examples/talk-llama/llama-chat.h +4 -0
  36. data/ext/sources/examples/talk-llama/llama-context.cpp +775 -78
  37. data/ext/sources/examples/talk-llama/llama-context.h +57 -9
  38. data/ext/sources/examples/talk-llama/llama-cparams.h +1 -0
  39. data/ext/sources/examples/talk-llama/llama-grammar.cpp +288 -53
  40. data/ext/sources/examples/talk-llama/llama-grammar.h +22 -1
  41. data/ext/sources/examples/talk-llama/llama-graph.cpp +381 -64
  42. data/ext/sources/examples/talk-llama/llama-graph.h +103 -13
  43. data/ext/sources/examples/talk-llama/llama-hparams.cpp +26 -2
  44. data/ext/sources/examples/talk-llama/llama-hparams.h +41 -10
  45. data/ext/sources/examples/talk-llama/llama-impl.cpp +7 -3
  46. data/ext/sources/examples/talk-llama/llama-impl.h +1 -1
  47. data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.cpp +5 -3
  48. data/ext/sources/examples/talk-llama/llama-kv-cache.cpp +145 -65
  49. data/ext/sources/examples/talk-llama/llama-kv-cache.h +22 -7
  50. data/ext/sources/examples/talk-llama/llama-kv-cells.h +44 -2
  51. data/ext/sources/examples/talk-llama/llama-memory-hybrid.cpp +12 -10
  52. data/ext/sources/examples/talk-llama/llama-memory-recurrent.cpp +32 -19
  53. data/ext/sources/examples/talk-llama/llama-memory-recurrent.h +2 -2
  54. data/ext/sources/examples/talk-llama/llama-mmap.cpp +172 -37
  55. data/ext/sources/examples/talk-llama/llama-mmap.h +8 -3
  56. data/ext/sources/examples/talk-llama/llama-model-loader.cpp +91 -9
  57. data/ext/sources/examples/talk-llama/llama-model-loader.h +6 -0
  58. data/ext/sources/examples/talk-llama/llama-model-saver.cpp +3 -0
  59. data/ext/sources/examples/talk-llama/llama-model.cpp +1529 -13134
  60. data/ext/sources/examples/talk-llama/llama-model.h +44 -3
  61. data/ext/sources/examples/talk-llama/llama-quant.cpp +8 -23
  62. data/ext/sources/examples/talk-llama/llama-sampling.cpp +1294 -198
  63. data/ext/sources/examples/talk-llama/llama-sampling.h +19 -7
  64. data/ext/sources/examples/talk-llama/llama-vocab.cpp +133 -37
  65. data/ext/sources/examples/talk-llama/llama-vocab.h +45 -40
  66. data/ext/sources/examples/talk-llama/llama.cpp +729 -2
  67. data/ext/sources/examples/talk-llama/llama.h +152 -14
  68. data/ext/sources/examples/talk-llama/models/afmoe.cpp +191 -0
  69. data/ext/sources/examples/talk-llama/models/apertus.cpp +125 -0
  70. data/ext/sources/examples/talk-llama/models/arcee.cpp +135 -0
  71. data/ext/sources/examples/talk-llama/models/arctic.cpp +138 -0
  72. data/ext/sources/examples/talk-llama/models/arwkv7.cpp +86 -0
  73. data/ext/sources/examples/talk-llama/models/baichuan.cpp +122 -0
  74. data/ext/sources/examples/talk-llama/models/bailingmoe.cpp +144 -0
  75. data/ext/sources/examples/talk-llama/models/bailingmoe2.cpp +135 -0
  76. data/ext/sources/examples/talk-llama/models/bert.cpp +178 -0
  77. data/ext/sources/examples/talk-llama/models/bitnet.cpp +160 -0
  78. data/ext/sources/examples/talk-llama/models/bloom.cpp +101 -0
  79. data/ext/sources/examples/talk-llama/models/chameleon.cpp +178 -0
  80. data/ext/sources/examples/talk-llama/models/chatglm.cpp +132 -0
  81. data/ext/sources/examples/talk-llama/models/codeshell.cpp +111 -0
  82. data/ext/sources/examples/talk-llama/models/cogvlm.cpp +102 -0
  83. data/ext/sources/examples/talk-llama/models/cohere2-iswa.cpp +134 -0
  84. data/ext/sources/examples/talk-llama/models/command-r.cpp +122 -0
  85. data/ext/sources/examples/talk-llama/models/dbrx.cpp +123 -0
  86. data/ext/sources/examples/talk-llama/models/deci.cpp +135 -0
  87. data/ext/sources/examples/talk-llama/models/deepseek.cpp +144 -0
  88. data/ext/sources/examples/talk-llama/models/deepseek2.cpp +259 -0
  89. data/ext/sources/examples/talk-llama/models/dots1.cpp +134 -0
  90. data/ext/sources/examples/talk-llama/models/dream.cpp +105 -0
  91. data/ext/sources/examples/talk-llama/models/ernie4-5-moe.cpp +150 -0
  92. data/ext/sources/examples/talk-llama/models/ernie4-5.cpp +110 -0
  93. data/ext/sources/examples/talk-llama/models/exaone.cpp +114 -0
  94. data/ext/sources/examples/talk-llama/models/exaone4.cpp +123 -0
  95. data/ext/sources/examples/talk-llama/models/falcon-h1.cpp +113 -0
  96. data/ext/sources/examples/talk-llama/models/falcon.cpp +120 -0
  97. data/ext/sources/examples/talk-llama/models/gemma-embedding.cpp +116 -0
  98. data/ext/sources/examples/talk-llama/models/gemma.cpp +112 -0
  99. data/ext/sources/examples/talk-llama/models/gemma2-iswa.cpp +128 -0
  100. data/ext/sources/examples/talk-llama/models/gemma3.cpp +155 -0
  101. data/ext/sources/examples/talk-llama/models/gemma3n-iswa.cpp +384 -0
  102. data/ext/sources/examples/talk-llama/models/glm4-moe.cpp +170 -0
  103. data/ext/sources/examples/talk-llama/models/glm4.cpp +150 -0
  104. data/ext/sources/examples/talk-llama/models/gpt2.cpp +105 -0
  105. data/ext/sources/examples/talk-llama/models/gptneox.cpp +144 -0
  106. data/ext/sources/examples/talk-llama/models/granite-hybrid.cpp +196 -0
  107. data/ext/sources/examples/talk-llama/models/granite.cpp +211 -0
  108. data/ext/sources/examples/talk-llama/models/graph-context-mamba.cpp +283 -0
  109. data/ext/sources/examples/talk-llama/models/grok.cpp +159 -0
  110. data/ext/sources/examples/talk-llama/models/grovemoe.cpp +141 -0
  111. data/ext/sources/examples/talk-llama/models/hunyuan-dense.cpp +132 -0
  112. data/ext/sources/examples/talk-llama/models/hunyuan-moe.cpp +154 -0
  113. data/ext/sources/examples/talk-llama/models/internlm2.cpp +120 -0
  114. data/ext/sources/examples/talk-llama/models/jais.cpp +86 -0
  115. data/ext/sources/examples/talk-llama/models/jamba.cpp +106 -0
  116. data/ext/sources/examples/talk-llama/models/lfm2.cpp +175 -0
  117. data/ext/sources/examples/talk-llama/models/llada-moe.cpp +122 -0
  118. data/ext/sources/examples/talk-llama/models/llada.cpp +99 -0
  119. data/ext/sources/examples/talk-llama/models/llama-iswa.cpp +178 -0
  120. data/ext/sources/examples/talk-llama/models/llama.cpp +168 -0
  121. data/ext/sources/examples/talk-llama/models/maincoder.cpp +117 -0
  122. data/ext/sources/examples/talk-llama/models/mamba.cpp +55 -0
  123. data/ext/sources/examples/talk-llama/models/mimo2-iswa.cpp +123 -0
  124. data/ext/sources/examples/talk-llama/models/minicpm3.cpp +199 -0
  125. data/ext/sources/examples/talk-llama/models/minimax-m2.cpp +124 -0
  126. data/ext/sources/examples/talk-llama/models/mistral3.cpp +160 -0
  127. data/ext/sources/examples/talk-llama/models/models.h +569 -0
  128. data/ext/sources/examples/talk-llama/models/modern-bert.cpp +116 -0
  129. data/ext/sources/examples/talk-llama/models/mpt.cpp +126 -0
  130. data/ext/sources/examples/talk-llama/models/nemotron-h.cpp +150 -0
  131. data/ext/sources/examples/talk-llama/models/nemotron.cpp +122 -0
  132. data/ext/sources/examples/talk-llama/models/neo-bert.cpp +104 -0
  133. data/ext/sources/examples/talk-llama/models/olmo.cpp +121 -0
  134. data/ext/sources/examples/talk-llama/models/olmo2.cpp +150 -0
  135. data/ext/sources/examples/talk-llama/models/olmoe.cpp +124 -0
  136. data/ext/sources/examples/talk-llama/models/openai-moe-iswa.cpp +127 -0
  137. data/ext/sources/examples/talk-llama/models/openelm.cpp +124 -0
  138. data/ext/sources/examples/talk-llama/models/orion.cpp +123 -0
  139. data/ext/sources/examples/talk-llama/models/pangu-embedded.cpp +121 -0
  140. data/ext/sources/examples/talk-llama/models/phi2.cpp +121 -0
  141. data/ext/sources/examples/talk-llama/models/phi3.cpp +152 -0
  142. data/ext/sources/examples/talk-llama/models/plamo.cpp +110 -0
  143. data/ext/sources/examples/talk-llama/models/plamo2.cpp +316 -0
  144. data/ext/sources/examples/talk-llama/models/plamo3.cpp +128 -0
  145. data/ext/sources/examples/talk-llama/models/plm.cpp +168 -0
  146. data/ext/sources/examples/talk-llama/models/qwen.cpp +108 -0
  147. data/ext/sources/examples/talk-llama/models/qwen2.cpp +126 -0
  148. data/ext/sources/examples/talk-llama/models/qwen2moe.cpp +151 -0
  149. data/ext/sources/examples/talk-llama/models/qwen2vl.cpp +117 -0
  150. data/ext/sources/examples/talk-llama/models/qwen3.cpp +117 -0
  151. data/ext/sources/examples/talk-llama/models/qwen3moe.cpp +124 -0
  152. data/ext/sources/examples/talk-llama/models/qwen3next.cpp +873 -0
  153. data/ext/sources/examples/talk-llama/models/qwen3vl-moe.cpp +149 -0
  154. data/ext/sources/examples/talk-llama/models/qwen3vl.cpp +141 -0
  155. data/ext/sources/examples/talk-llama/models/refact.cpp +94 -0
  156. data/ext/sources/examples/talk-llama/models/rnd1.cpp +126 -0
  157. data/ext/sources/examples/talk-llama/models/rwkv6-base.cpp +162 -0
  158. data/ext/sources/examples/talk-llama/models/rwkv6.cpp +94 -0
  159. data/ext/sources/examples/talk-llama/models/rwkv6qwen2.cpp +86 -0
  160. data/ext/sources/examples/talk-llama/models/rwkv7-base.cpp +135 -0
  161. data/ext/sources/examples/talk-llama/models/rwkv7.cpp +90 -0
  162. data/ext/sources/examples/talk-llama/models/seed-oss.cpp +124 -0
  163. data/ext/sources/examples/talk-llama/models/smallthinker.cpp +126 -0
  164. data/ext/sources/examples/talk-llama/models/smollm3.cpp +128 -0
  165. data/ext/sources/examples/talk-llama/models/stablelm.cpp +146 -0
  166. data/ext/sources/examples/talk-llama/models/starcoder.cpp +100 -0
  167. data/ext/sources/examples/talk-llama/models/starcoder2.cpp +121 -0
  168. data/ext/sources/examples/talk-llama/models/t5-dec.cpp +166 -0
  169. data/ext/sources/examples/talk-llama/models/t5-enc.cpp +96 -0
  170. data/ext/sources/examples/talk-llama/models/wavtokenizer-dec.cpp +149 -0
  171. data/ext/sources/examples/talk-llama/models/xverse.cpp +108 -0
  172. data/ext/sources/examples/talk-llama/unicode.cpp +102 -16
  173. data/ext/sources/examples/vad-speech-segments/CMakeLists.txt +1 -1
  174. data/ext/sources/examples/whisper.wasm/index-tmpl.html +1 -1
  175. data/ext/sources/ggml/CMakeLists.txt +82 -54
  176. data/ext/sources/ggml/include/ggml-alloc.h +9 -0
  177. data/ext/sources/ggml/include/ggml-backend.h +4 -1
  178. data/ext/sources/ggml/include/ggml-cpu.h +1 -0
  179. data/ext/sources/ggml/include/ggml-hexagon.h +19 -0
  180. data/ext/sources/ggml/include/ggml-rpc.h +8 -11
  181. data/ext/sources/ggml/include/ggml-zendnn.h +22 -0
  182. data/ext/sources/ggml/include/ggml.h +190 -12
  183. data/ext/sources/ggml/src/CMakeLists.txt +82 -11
  184. data/ext/sources/ggml/src/ggml-alloc.c +124 -41
  185. data/ext/sources/ggml/src/ggml-backend-impl.h +1 -4
  186. data/ext/sources/ggml/src/ggml-backend-reg.cpp +27 -3
  187. data/ext/sources/ggml/src/ggml-backend.cpp +71 -21
  188. data/ext/sources/ggml/src/ggml-blas/CMakeLists.txt +17 -3
  189. data/ext/sources/ggml/src/ggml-blas/ggml-blas.cpp +5 -9
  190. data/ext/sources/ggml/src/ggml-cann/acl_tensor.cpp +57 -45
  191. data/ext/sources/ggml/src/ggml-cann/acl_tensor.h +138 -47
  192. data/ext/sources/ggml/src/ggml-cann/aclnn_ops.cpp +2179 -1696
  193. data/ext/sources/ggml/src/ggml-cann/aclnn_ops.h +238 -317
  194. data/ext/sources/ggml/src/ggml-cann/common.h +283 -208
  195. data/ext/sources/ggml/src/ggml-cann/ggml-cann.cpp +626 -776
  196. data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +156 -86
  197. data/ext/sources/ggml/src/ggml-cpu/amx/amx.cpp +1 -0
  198. data/ext/sources/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +4 -0
  199. data/ext/sources/ggml/src/ggml-cpu/arch/arm/quants.c +428 -26
  200. data/ext/sources/ggml/src/ggml-cpu/arch/arm/repack.cpp +1004 -0
  201. data/ext/sources/ggml/src/ggml-cpu/arch/loongarch/quants.c +4 -5
  202. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/cpu-feats.cpp +38 -0
  203. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/quants.c +108 -49
  204. data/ext/sources/ggml/src/ggml-cpu/arch/s390/cpu-feats.cpp +50 -0
  205. data/ext/sources/ggml/src/ggml-cpu/arch/x86/repack.cpp +6 -6
  206. data/ext/sources/ggml/src/ggml-cpu/arch-fallback.h +50 -2
  207. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-impl.h +5 -3
  208. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +195 -71
  209. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.cpp +4 -0
  210. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.cpp +573 -106
  211. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.h +33 -44
  212. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +298 -112
  213. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm-ppc.h +333 -0
  214. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +819 -125
  215. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.h +6 -0
  216. data/ext/sources/ggml/src/ggml-cpu/ops.cpp +708 -431
  217. data/ext/sources/ggml/src/ggml-cpu/ops.h +5 -4
  218. data/ext/sources/ggml/src/ggml-cpu/repack.cpp +671 -31
  219. data/ext/sources/ggml/src/ggml-cpu/repack.h +14 -0
  220. data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +41 -43
  221. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.cpp +3 -2
  222. data/ext/sources/ggml/src/ggml-cpu/unary-ops.cpp +151 -0
  223. data/ext/sources/ggml/src/ggml-cpu/unary-ops.h +7 -0
  224. data/ext/sources/ggml/src/ggml-cpu/vec.cpp +124 -1
  225. data/ext/sources/ggml/src/ggml-cpu/vec.h +261 -146
  226. data/ext/sources/ggml/src/ggml-cuda/CMakeLists.txt +72 -1
  227. data/ext/sources/ggml/src/ggml-cuda/argmax.cu +2 -2
  228. data/ext/sources/ggml/src/ggml-cuda/argsort.cu +123 -6
  229. data/ext/sources/ggml/src/ggml-cuda/argsort.cuh +16 -0
  230. data/ext/sources/ggml/src/ggml-cuda/binbcast.cu +1 -1
  231. data/ext/sources/ggml/src/ggml-cuda/common.cuh +353 -80
  232. data/ext/sources/ggml/src/ggml-cuda/convert.cuh +10 -0
  233. data/ext/sources/ggml/src/ggml-cuda/cpy-utils.cuh +1 -1
  234. data/ext/sources/ggml/src/ggml-cuda/cpy.cu +339 -246
  235. data/ext/sources/ggml/src/ggml-cuda/cpy.cuh +1 -5
  236. data/ext/sources/ggml/src/ggml-cuda/cumsum.cu +307 -0
  237. data/ext/sources/ggml/src/ggml-cuda/cumsum.cuh +5 -0
  238. data/ext/sources/ggml/src/ggml-cuda/diag.cu +77 -0
  239. data/ext/sources/ggml/src/ggml-cuda/diag.cuh +5 -0
  240. data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +31 -21
  241. data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +663 -596
  242. data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cu +35 -741
  243. data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cuh +1241 -0
  244. data/ext/sources/ggml/src/ggml-cuda/fattn-vec.cuh +30 -37
  245. data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +14 -13
  246. data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cuh +48 -0
  247. data/ext/sources/ggml/src/ggml-cuda/fattn.cu +83 -37
  248. data/ext/sources/ggml/src/ggml-cuda/fill.cu +37 -0
  249. data/ext/sources/ggml/src/ggml-cuda/fill.cuh +3 -0
  250. data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +1155 -164
  251. data/ext/sources/ggml/src/ggml-cuda/mean.cu +5 -4
  252. data/ext/sources/ggml/src/ggml-cuda/mma.cuh +741 -48
  253. data/ext/sources/ggml/src/ggml-cuda/mmf.cu +60 -12
  254. data/ext/sources/ggml/src/ggml-cuda/mmf.cuh +381 -42
  255. data/ext/sources/ggml/src/ggml-cuda/mmid.cu +164 -0
  256. data/ext/sources/ggml/src/ggml-cuda/mmid.cuh +5 -0
  257. data/ext/sources/ggml/src/ggml-cuda/mmq.cu +69 -176
  258. data/ext/sources/ggml/src/ggml-cuda/mmq.cuh +498 -171
  259. data/ext/sources/ggml/src/ggml-cuda/mmvf.cu +375 -79
  260. data/ext/sources/ggml/src/ggml-cuda/mmvf.cuh +3 -2
  261. data/ext/sources/ggml/src/ggml-cuda/mmvq.cu +241 -95
  262. data/ext/sources/ggml/src/ggml-cuda/mmvq.cuh +1 -1
  263. data/ext/sources/ggml/src/ggml-cuda/pad.cu +64 -33
  264. data/ext/sources/ggml/src/ggml-cuda/quantize.cu +151 -0
  265. data/ext/sources/ggml/src/ggml-cuda/quantize.cuh +14 -0
  266. data/ext/sources/ggml/src/ggml-cuda/rope.cu +192 -77
  267. data/ext/sources/ggml/src/ggml-cuda/rope.cuh +2 -0
  268. data/ext/sources/ggml/src/ggml-cuda/set-rows.cu +101 -47
  269. data/ext/sources/ggml/src/ggml-cuda/set.cu +39 -0
  270. data/ext/sources/ggml/src/ggml-cuda/set.cuh +7 -0
  271. data/ext/sources/ggml/src/ggml-cuda/softmax.cu +203 -6
  272. data/ext/sources/ggml/src/ggml-cuda/solve_tri.cu +275 -0
  273. data/ext/sources/ggml/src/ggml-cuda/solve_tri.cuh +3 -0
  274. data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cu +14 -20
  275. data/ext/sources/ggml/src/ggml-cuda/ssm-scan.cu +49 -84
  276. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq112-dv112.cu +5 -0
  277. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq128-dv128.cu +5 -0
  278. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq256-dv256.cu +5 -0
  279. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq40-dv40.cu +5 -0
  280. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq576-dv512.cu +5 -0
  281. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq64-dv64.cu +5 -0
  282. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq72-dv72.cu +5 -0
  283. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq80-dv80.cu +5 -0
  284. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq96-dv96.cu +5 -0
  285. data/ext/sources/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +19 -1
  286. data/ext/sources/ggml/src/ggml-cuda/top-k.cu +96 -0
  287. data/ext/sources/ggml/src/ggml-cuda/top-k.cuh +3 -0
  288. data/ext/sources/ggml/src/ggml-cuda/topk-moe.cu +168 -76
  289. data/ext/sources/ggml/src/ggml-cuda/topk-moe.cuh +11 -4
  290. data/ext/sources/ggml/src/ggml-cuda/tri.cu +136 -0
  291. data/ext/sources/ggml/src/ggml-cuda/tri.cuh +5 -0
  292. data/ext/sources/ggml/src/ggml-cuda/unary.cu +105 -11
  293. data/ext/sources/ggml/src/ggml-cuda/unary.cuh +36 -0
  294. data/ext/sources/ggml/src/ggml-cuda/upscale.cu +163 -7
  295. data/ext/sources/ggml/src/ggml-cuda/vendors/cuda.h +4 -0
  296. data/ext/sources/ggml/src/ggml-cuda/vendors/hip.h +12 -1
  297. data/ext/sources/ggml/src/ggml-cuda/vendors/musa.h +6 -0
  298. data/ext/sources/ggml/src/ggml-hexagon/CMakeLists.txt +80 -0
  299. data/ext/sources/ggml/src/ggml-hexagon/ggml-hexagon.cpp +3151 -0
  300. data/ext/sources/ggml/src/ggml-hexagon/htp/CMakeLists.txt +44 -0
  301. data/ext/sources/ggml/src/ggml-hexagon/htp/act-ops.c +682 -0
  302. data/ext/sources/ggml/src/ggml-hexagon/htp/binary-ops.c +360 -0
  303. data/ext/sources/ggml/src/ggml-hexagon/htp/cmake-toolchain.cmake +157 -0
  304. data/ext/sources/ggml/src/ggml-hexagon/htp/flash-attn-ops.c +566 -0
  305. data/ext/sources/ggml/src/ggml-hexagon/htp/get-rows-ops.c +112 -0
  306. data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ctx.h +35 -0
  307. data/ext/sources/ggml/src/ggml-hexagon/htp/htp-dma.c +63 -0
  308. data/ext/sources/ggml/src/ggml-hexagon/htp/htp-dma.h +157 -0
  309. data/ext/sources/ggml/src/ggml-hexagon/htp/htp-msg.h +165 -0
  310. data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ops.h +92 -0
  311. data/ext/sources/ggml/src/ggml-hexagon/htp/htp_iface.idl +16 -0
  312. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-exp.c +94 -0
  313. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-inverse.c +72 -0
  314. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sigmoid.c +49 -0
  315. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-utils.c +1020 -0
  316. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-utils.h +1353 -0
  317. data/ext/sources/ggml/src/ggml-hexagon/htp/main.c +1001 -0
  318. data/ext/sources/ggml/src/ggml-hexagon/htp/matmul-ops.c +2503 -0
  319. data/ext/sources/ggml/src/ggml-hexagon/htp/ops-utils.h +149 -0
  320. data/ext/sources/ggml/src/ggml-hexagon/htp/rope-ops.c +487 -0
  321. data/ext/sources/ggml/src/ggml-hexagon/htp/set-rows-ops.c +168 -0
  322. data/ext/sources/ggml/src/ggml-hexagon/htp/softmax-ops.c +402 -0
  323. data/ext/sources/ggml/src/ggml-hexagon/htp/unary-ops.c +287 -0
  324. data/ext/sources/ggml/src/ggml-hexagon/htp/worker-pool.c +297 -0
  325. data/ext/sources/ggml/src/ggml-hexagon/htp/worker-pool.h +57 -0
  326. data/ext/sources/ggml/src/ggml-hexagon/htp-utils.c +454 -0
  327. data/ext/sources/ggml/src/ggml-hexagon/htp-utils.h +221 -0
  328. data/ext/sources/ggml/src/ggml-hexagon/op-desc.h +153 -0
  329. data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +8 -13
  330. data/ext/sources/ggml/src/ggml-impl.h +67 -6
  331. data/ext/sources/ggml/src/ggml-metal/ggml-metal-common.cpp +2 -2
  332. data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.m +29 -20
  333. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.cpp +652 -285
  334. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.h +103 -56
  335. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.m +496 -118
  336. data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +231 -9
  337. data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.cpp +1227 -224
  338. data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.h +12 -0
  339. data/ext/sources/ggml/src/ggml-metal/ggml-metal.cpp +14 -8
  340. data/ext/sources/ggml/src/ggml-metal/ggml-metal.metal +1972 -704
  341. data/ext/sources/ggml/src/ggml-musa/CMakeLists.txt +3 -1
  342. data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +11 -0
  343. data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +1430 -120
  344. data/ext/sources/ggml/src/ggml-opencl/kernels/cvt.cl +63 -0
  345. data/ext/sources/ggml/src/ggml-opencl/kernels/expm1.cl +82 -0
  346. data/ext/sources/ggml/src/ggml-opencl/kernels/fill.cl +17 -0
  347. data/ext/sources/ggml/src/ggml-opencl/kernels/flash_attn_f32.cl +4 -3
  348. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_mxfp4_f32.cl +162 -0
  349. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_mxfp4_f32.cl +156 -0
  350. data/ext/sources/ggml/src/ggml-opencl/kernels/get_rows.cl +36 -12
  351. data/ext/sources/ggml/src/ggml-opencl/kernels/mean.cl +39 -0
  352. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_kq_kqv.cl +273 -0
  353. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_l4_lm.cl +24 -10
  354. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_f32_f32_l4_lm.cl +24 -10
  355. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_l4_lm.cl +154 -0
  356. data/ext/sources/ggml/src/ggml-opencl/kernels/pad.cl +29 -20
  357. data/ext/sources/ggml/src/ggml-opencl/kernels/rms_norm.cl +25 -10
  358. data/ext/sources/ggml/src/ggml-opencl/kernels/rope.cl +50 -24
  359. data/ext/sources/ggml/src/ggml-opencl/kernels/set_rows.cl +35 -16
  360. data/ext/sources/ggml/src/ggml-opencl/kernels/softplus.cl +88 -0
  361. data/ext/sources/ggml/src/ggml-opencl/kernels/sqr.cl +53 -0
  362. data/ext/sources/ggml/src/ggml-opencl/kernels/sqrt.cl +53 -0
  363. data/ext/sources/ggml/src/ggml-opencl/kernels/ssm_conv.cl +77 -0
  364. data/ext/sources/ggml/src/ggml-opencl/kernels/transpose.cl +13 -0
  365. data/ext/sources/ggml/src/ggml-rpc/ggml-rpc.cpp +438 -156
  366. data/ext/sources/ggml/src/ggml-sycl/CMakeLists.txt +48 -3
  367. data/ext/sources/ggml/src/ggml-sycl/add-id.cpp +77 -0
  368. data/ext/sources/ggml/src/ggml-sycl/add-id.hpp +8 -0
  369. data/ext/sources/ggml/src/ggml-sycl/backend.hpp +6 -0
  370. data/ext/sources/ggml/src/ggml-sycl/binbcast.cpp +0 -9
  371. data/ext/sources/ggml/src/ggml-sycl/binbcast.hpp +0 -6
  372. data/ext/sources/ggml/src/ggml-sycl/common.hpp +117 -15
  373. data/ext/sources/ggml/src/ggml-sycl/concat.cpp +55 -44
  374. data/ext/sources/ggml/src/ggml-sycl/convert.cpp +34 -0
  375. data/ext/sources/ggml/src/ggml-sycl/count-equal.cpp +79 -0
  376. data/ext/sources/ggml/src/ggml-sycl/count-equal.hpp +9 -0
  377. data/ext/sources/ggml/src/ggml-sycl/cpy.cpp +0 -3
  378. data/ext/sources/ggml/src/ggml-sycl/dequantize.hpp +18 -0
  379. data/ext/sources/ggml/src/ggml-sycl/dpct/helper.hpp +76 -3
  380. data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +333 -300
  381. data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +10 -2
  382. data/ext/sources/ggml/src/ggml-sycl/ggml-sycl.cpp +335 -110
  383. data/ext/sources/ggml/src/ggml-sycl/mmvq.cpp +22 -0
  384. data/ext/sources/ggml/src/ggml-sycl/norm.cpp +156 -0
  385. data/ext/sources/ggml/src/ggml-sycl/norm.hpp +2 -0
  386. data/ext/sources/ggml/src/ggml-sycl/pad.cpp +97 -0
  387. data/ext/sources/ggml/src/ggml-sycl/pad.hpp +24 -0
  388. data/ext/sources/ggml/src/ggml-sycl/pad_reflect_1d.cpp +100 -0
  389. data/ext/sources/ggml/src/ggml-sycl/pad_reflect_1d.hpp +10 -0
  390. data/ext/sources/ggml/src/ggml-sycl/presets.hpp +2 -0
  391. data/ext/sources/ggml/src/ggml-sycl/repeat_back.cpp +76 -0
  392. data/ext/sources/ggml/src/ggml-sycl/repeat_back.hpp +8 -0
  393. data/ext/sources/ggml/src/ggml-sycl/roll.cpp +122 -0
  394. data/ext/sources/ggml/src/ggml-sycl/roll.hpp +20 -0
  395. data/ext/sources/ggml/src/ggml-sycl/rope.cpp +30 -17
  396. data/ext/sources/ggml/src/ggml-sycl/set.cpp +73 -0
  397. data/ext/sources/ggml/src/ggml-sycl/set.hpp +5 -0
  398. data/ext/sources/ggml/src/ggml-sycl/softmax.cpp +327 -162
  399. data/ext/sources/ggml/src/ggml-sycl/softmax.hpp +4 -0
  400. data/ext/sources/ggml/src/ggml-sycl/ssm_conv.cpp +127 -0
  401. data/ext/sources/ggml/src/ggml-sycl/ssm_conv.hpp +5 -0
  402. data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +58 -0
  403. data/ext/sources/ggml/src/ggml-vulkan/CMakeLists.txt +38 -18
  404. data/ext/sources/ggml/src/ggml-vulkan/ggml-vulkan.cpp +5013 -2859
  405. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/abs.comp +21 -0
  406. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/acc.comp +2 -2
  407. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add.comp +2 -2
  408. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add1.comp +28 -0
  409. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add_id.comp +1 -1
  410. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/arange.comp +20 -0
  411. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argmax.comp +2 -2
  412. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp +33 -26
  413. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argsort_large.comp +114 -0
  414. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/ceil.comp +22 -0
  415. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/clamp.comp +2 -2
  416. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/concat.comp +2 -2
  417. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/contig_copy.comp +2 -2
  418. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_dw.comp +1 -1
  419. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +47 -49
  420. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +1 -1
  421. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy.comp +2 -2
  422. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +3 -3
  423. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +4 -4
  424. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_transpose.comp +67 -0
  425. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/cos.comp +2 -2
  426. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/count_equal.comp +2 -2
  427. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/count_experts.comp +51 -0
  428. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/cumsum.comp +83 -0
  429. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/cumsum_multipass1.comp +60 -0
  430. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/cumsum_multipass2.comp +66 -0
  431. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_f32.comp +1 -1
  432. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{dequant_funcs.comp → dequant_funcs.glsl} +9 -21
  433. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{dequant_funcs_cm2.comp → dequant_funcs_cm2.glsl} +18 -4
  434. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{dequant_head.comp → dequant_head.glsl} +1 -1
  435. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_m.comp +1 -1
  436. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_s.comp +1 -1
  437. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_s.comp +1 -1
  438. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xs.comp +1 -1
  439. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xxs.comp +1 -1
  440. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_s.comp +1 -1
  441. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_xxs.comp +1 -1
  442. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_nl.comp +1 -1
  443. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_xs.comp +1 -1
  444. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_mxfp4.comp +3 -3
  445. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q2_k.comp +3 -3
  446. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q3_k.comp +1 -1
  447. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_0.comp +1 -1
  448. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_1.comp +1 -1
  449. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_k.comp +3 -3
  450. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_0.comp +1 -1
  451. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_1.comp +1 -1
  452. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_k.comp +3 -3
  453. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q6_k.comp +1 -1
  454. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q8_0.comp +1 -1
  455. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/diag.comp +29 -0
  456. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/diag_mask_inf.comp +1 -1
  457. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/div.comp +2 -2
  458. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/exp.comp +3 -3
  459. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/fill.comp +19 -0
  460. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +39 -17
  461. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{flash_attn_base.comp → flash_attn_base.glsl} +19 -1
  462. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +45 -7
  463. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +50 -12
  464. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +1 -1
  465. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/floor.comp +22 -0
  466. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu.comp +2 -2
  467. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu_erf.comp +2 -2
  468. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu_quick.comp +2 -2
  469. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gelu.comp +2 -2
  470. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gelu_erf.comp +2 -2
  471. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gelu_quick.comp +2 -2
  472. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{generic_binary_head.comp → generic_binary_head.glsl} +17 -2
  473. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{generic_head.comp → generic_head.glsl} +2 -0
  474. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{generic_unary_head.comp → generic_unary_head.glsl} +7 -0
  475. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/get_rows.comp +4 -4
  476. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/get_rows_quant.comp +3 -3
  477. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{glu_head.comp → glu_head.glsl} +1 -1
  478. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/group_norm.comp +2 -2
  479. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/hardsigmoid.comp +2 -2
  480. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/hardswish.comp +2 -2
  481. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +19 -7
  482. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col_3d.comp +2 -3
  483. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/l2_norm.comp +2 -2
  484. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/leaky_relu.comp +2 -2
  485. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/log.comp +18 -0
  486. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul.comp +2 -2
  487. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp +2 -2
  488. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{mul_mat_vec_base.comp → mul_mat_vec_base.glsl} +70 -25
  489. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iface.glsl +35 -0
  490. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_m.comp +71 -21
  491. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_s.comp +41 -25
  492. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_s.comp +2 -2
  493. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xs.comp +44 -26
  494. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xxs.comp +2 -2
  495. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_s.comp +2 -2
  496. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_xxs.comp +2 -2
  497. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_nc.comp +9 -7
  498. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_p021.comp +9 -7
  499. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp +4 -6
  500. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q3_k.comp +2 -2
  501. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp +4 -6
  502. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp +4 -6
  503. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp +2 -2
  504. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp +39 -36
  505. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq_funcs.glsl +494 -0
  506. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +78 -103
  507. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +34 -23
  508. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{mul_mm_funcs.comp → mul_mm_funcs.glsl} +69 -59
  509. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_id_funcs.glsl +72 -0
  510. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +88 -228
  511. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.glsl +454 -0
  512. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_shmem_types.glsl +78 -0
  513. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp +97 -13
  514. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/neg.comp +20 -0
  515. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/norm.comp +2 -2
  516. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_adamw.comp +2 -2
  517. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_sgd.comp +1 -1
  518. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/pad.comp +21 -6
  519. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/pool2d.comp +1 -1
  520. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/quantize_q8_1.comp +10 -10
  521. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/reglu.comp +2 -2
  522. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/relu.comp +2 -2
  523. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/repeat.comp +2 -2
  524. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/repeat_back.comp +2 -2
  525. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +50 -4
  526. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_back.comp +2 -2
  527. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_partials.comp +2 -2
  528. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/roll.comp +2 -2
  529. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl +234 -0
  530. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.glsl +20 -0
  531. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +6 -50
  532. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +6 -33
  533. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +6 -33
  534. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl +28 -0
  535. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_vision.comp +6 -39
  536. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/round.comp +29 -0
  537. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp +2 -2
  538. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sigmoid.comp +2 -2
  539. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/silu.comp +2 -2
  540. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/silu_back.comp +2 -2
  541. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sin.comp +2 -2
  542. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +1 -1
  543. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_back.comp +2 -2
  544. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large1.comp +62 -0
  545. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large2.comp +79 -0
  546. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large3.comp +65 -0
  547. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large_common.glsl +53 -0
  548. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/softplus.comp +23 -0
  549. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/solve_tri.comp +81 -0
  550. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sqrt.comp +2 -2
  551. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/square.comp +2 -2
  552. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/ssm_conv.comp +44 -0
  553. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/ssm_scan.comp +124 -0
  554. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/step.comp +22 -0
  555. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sub.comp +2 -2
  556. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.comp +2 -25
  557. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.glsl +25 -0
  558. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/swiglu.comp +2 -2
  559. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/swiglu_oai.comp +2 -2
  560. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/tanh.comp +2 -2
  561. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/timestep_embedding.comp +1 -1
  562. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/topk_argsort.comp +118 -0
  563. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/topk_moe.comp +213 -0
  564. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/topk_nary_search.comp +246 -0
  565. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/tri.comp +43 -0
  566. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/trunc.comp +22 -0
  567. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{types.comp → types.glsl} +345 -26
  568. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +90 -12
  569. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +335 -151
  570. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/xielu.comp +35 -0
  571. data/ext/sources/ggml/src/ggml-webgpu/CMakeLists.txt +28 -2
  572. data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp +169 -0
  573. data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu.cpp +1964 -435
  574. data/ext/sources/ggml/src/ggml-webgpu/pre_wgsl.hpp +778 -0
  575. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/bin_op.tmpl.wgsl +188 -0
  576. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cpy.tmpl.wgsl +101 -0
  577. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +33 -10
  578. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn.wgsl +591 -0
  579. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/get_rows.tmpl.wgsl +1 -1
  580. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/glu.tmpl.wgsl +323 -0
  581. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl +6 -6
  582. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl +97 -0
  583. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.tmpl.wgsl +247 -0
  584. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_subgroup_matrix.tmpl.wgsl +302 -0
  585. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.tmpl.wgsl +267 -0
  586. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm.wgsl +83 -17
  587. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rope.tmpl.wgsl +295 -0
  588. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/scale.tmpl.wgsl +90 -0
  589. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.tmpl.wgsl +112 -0
  590. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/soft_max.tmpl.wgsl +345 -0
  591. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/unary_op.wgsl +483 -0
  592. data/ext/sources/ggml/src/ggml-zendnn/CMakeLists.txt +92 -0
  593. data/ext/sources/ggml/src/ggml-zendnn/ggml-zendnn.cpp +466 -0
  594. data/ext/sources/ggml/src/ggml.c +425 -33
  595. data/ext/sources/include/whisper.h +1 -0
  596. data/ext/sources/src/CMakeLists.txt +3 -1
  597. data/ext/sources/src/whisper.cpp +101 -35
  598. data/ext/sources/tests/CMakeLists.txt +2 -2
  599. data/ext/sources/tests/test-vad-full.cpp +4 -2
  600. data/ext/sources/tests/test-vad.cpp +1 -1
  601. data/extsources.rb +1 -0
  602. data/lib/whisper/model/uri.rb +17 -18
  603. data/sig/whisper.rbs +119 -2
  604. data/test/test_params.rb +16 -8
  605. data/test/test_segment.rb +0 -1
  606. data/test/test_token.rb +70 -0
  607. data/test/test_vad.rb +1 -1
  608. data/test/test_vad_context.rb +50 -0
  609. data/test/test_vad_segment.rb +19 -0
  610. data/test/test_vad_segments.rb +16 -0
  611. data/test/test_whisper.rb +7 -0
  612. data/whispercpp.gemspec +1 -1
  613. metadata +287 -34
  614. data/ext/sources/build-xcframework.sh +0 -571
  615. data/ext/sources/ggml/src/ggml-cann/Doxyfile +0 -2579
  616. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.comp +0 -105
  617. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.comp +0 -55
  618. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/add.tmpl.wgsl +0 -44
  619. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/add_in_place.tmpl.wgsl +0 -41
  620. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl +0 -60
  621. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul.tmpl.wgsl +0 -44
  622. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_in_place.tmpl.wgsl +0 -41
  623. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm_in_place.wgsl +0 -48
  624. /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{test_bfloat16_support.comp → feature-tests/bfloat16.comp} +0 -0
  625. /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{test_coopmat_support.comp → feature-tests/coopmat.comp} +0 -0
  626. /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{test_coopmat2_support.comp → feature-tests/coopmat2.comp} +0 -0
  627. /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{test_integer_dot_support.comp → feature-tests/integer_dot.comp} +0 -0
  628. /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{glu_main.comp → glu_main.glsl} +0 -0
  629. /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{rte.comp → rte.glsl} +0 -0
  630. /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{utils.comp → utils.glsl} +0 -0
@@ -23,28 +23,28 @@
23
23
  #ifndef CANN_COMMON_H
24
24
  #define CANN_COMMON_H
25
25
 
26
+ #include "../ggml-impl.h"
27
+ #include "../include/ggml-cann.h"
28
+ #include "../include/ggml.h"
29
+
26
30
  #include <acl/acl.h>
31
+ #include <unistd.h>
27
32
 
33
+ #include <atomic>
34
+ #include <condition_variable>
28
35
  #include <cstdio>
36
+ #include <functional>
29
37
  #include <iostream>
38
+ #include <list>
30
39
  #include <map>
31
40
  #include <memory>
32
- #include <string>
33
- #include <vector>
34
- #include <atomic>
35
- #include <condition_variable>
36
41
  #include <mutex>
37
- #include <thread>
38
- #include <unistd.h>
39
- #include <functional>
40
42
  #include <optional>
41
- #include <list>
42
-
43
- #include "../include/ggml-cann.h"
44
- #include "../include/ggml.h"
45
- #include "../ggml-impl.h"
43
+ #include <string>
44
+ #include <thread>
45
+ #include <vector>
46
46
 
47
- #define MATRIX_ROW_PADDING 512
47
+ #define MATRIX_ROW_PADDING 512
48
48
  #define GGML_CANN_MAX_STREAMS 8
49
49
 
50
50
  /**
@@ -56,8 +56,7 @@
56
56
  * @param line The line number at which the error occurred.
57
57
  * @param msg The error message.
58
58
  */
59
- [[noreturn]] void ggml_cann_error(const char* stmt, const char* func,
60
- const char* file, int line, const char* msg);
59
+ [[noreturn]] void ggml_cann_error(const char * stmt, const char * func, const char * file, int line, const char * msg);
61
60
 
62
61
  /**
63
62
  * @brief Checks the result of a CANN function call and invokes the error
@@ -89,25 +88,24 @@ struct ggml_cann_device_info {
89
88
  * @brief Information about a single CANN device.
90
89
  */
91
90
  struct cann_device_info {
92
- int cc; /**< Compute capability. */
91
+ int cc; /**< Compute capability. */
93
92
  size_t smpb; /**< Maximum shared memory per block. */
94
- bool vmm; /**< Virtual memory support. */
93
+ bool vmm; /**< Virtual memory support. */
95
94
  size_t vmm_granularity; /**< Granularity of virtual memory. */
96
95
  size_t total_vram; /**< Total video RAM available on the device. */
97
96
  };
98
97
 
99
- cann_device_info devices[GGML_CANN_MAX_DEVICES] =
100
- {}; /**< Array of CANN device information. */
98
+ cann_device_info devices[GGML_CANN_MAX_DEVICES] = {}; /**< Array of CANN device information. */
101
99
  };
102
100
 
103
- const ggml_cann_device_info& ggml_cann_info();
101
+ const ggml_cann_device_info & ggml_cann_info();
104
102
 
105
- void ggml_cann_set_device(int32_t device);
103
+ void ggml_cann_set_device(int32_t device);
106
104
  int32_t ggml_cann_get_device();
107
105
 
108
- std::optional<std::string> get_env(const std::string& name);
109
- bool parse_bool(const std::string& value);
110
- int parse_integer(const std::string& value);
106
+ std::optional<std::string> get_env_as_lowercase(const std::string & name);
107
+ bool parse_bool(const std::string & value);
108
+ int parse_integer(const std::string & value);
111
109
 
112
110
  /**
113
111
  * @brief Abstract base class for memory pools used by CANN.
@@ -126,7 +124,7 @@ struct ggml_cann_pool {
126
124
  * will be stored.
127
125
  * @return Pointer to the allocated memory block.
128
126
  */
129
- virtual void* alloc(size_t size, size_t* actual_size) = 0;
127
+ virtual void * alloc(size_t size, size_t * actual_size) = 0;
130
128
 
131
129
  /**
132
130
  * @brief Frees a previously allocated memory block.
@@ -136,16 +134,16 @@ struct ggml_cann_pool {
136
134
  * @note Note that all CANN operators are running async. Make sure memory is
137
135
  * still available until this operator has finished.
138
136
  */
139
- virtual void free(void* ptr, size_t size) = 0;
137
+ virtual void free(void * ptr, size_t size) = 0;
140
138
  };
141
139
 
142
140
  /**
143
141
  * @brief RAII wrapper for managing memory allocations from a CANN memory pool.
144
142
  */
145
143
  struct ggml_cann_pool_alloc {
146
- ggml_cann_pool* pool = nullptr; /**< Pointer to the memory pool. */
147
- void* ptr = nullptr; /**< Pointer to the allocated memory block. */
148
- size_t actual_size = 0; /**< Actual size of the allocated memory block. */
144
+ ggml_cann_pool * pool = nullptr; /**< Pointer to the memory pool. */
145
+ void * ptr = nullptr; /**< Pointer to the allocated memory block. */
146
+ size_t actual_size = 0; /**< Actual size of the allocated memory block. */
149
147
 
150
148
  /**
151
149
  * @brief Default constructor.
@@ -156,16 +154,14 @@ struct ggml_cann_pool_alloc {
156
154
  * @brief Constructor that initializes the memory pool.
157
155
  * @param pool Reference to the memory pool.
158
156
  */
159
- explicit ggml_cann_pool_alloc(ggml_cann_pool& pool) : pool(&pool) {}
157
+ explicit ggml_cann_pool_alloc(ggml_cann_pool & pool) : pool(&pool) {}
160
158
 
161
159
  /**
162
160
  * @brief Constructor that initializes the memory pool and allocates memory.
163
161
  * @param pool Reference to the memory pool.
164
162
  * @param size Size of the memory block to allocate.
165
163
  */
166
- ggml_cann_pool_alloc(ggml_cann_pool& pool, size_t size) : pool(&pool) {
167
- alloc(size);
168
- }
164
+ ggml_cann_pool_alloc(ggml_cann_pool & pool, size_t size) : pool(&pool) { alloc(size); }
169
165
 
170
166
  /**
171
167
  * @brief Destructor that frees the allocated memory block.
@@ -181,7 +177,7 @@ struct ggml_cann_pool_alloc {
181
177
  * @param size Size of the memory block to allocate.
182
178
  * @return Pointer to the allocated memory block.
183
179
  */
184
- void* alloc(size_t size) {
180
+ void * alloc(size_t size) {
185
181
  GGML_ASSERT(pool != nullptr);
186
182
  GGML_ASSERT(ptr == nullptr);
187
183
  ptr = pool->alloc(size, &this->actual_size);
@@ -194,7 +190,7 @@ struct ggml_cann_pool_alloc {
194
190
  * @param size Size of the memory block to allocate.
195
191
  * @return Pointer to the allocated memory block.
196
192
  */
197
- void* alloc(ggml_cann_pool& pool, size_t size) {
193
+ void * alloc(ggml_cann_pool & pool, size_t size) {
198
194
  this->pool = &pool;
199
195
  return alloc(size);
200
196
  }
@@ -203,162 +199,175 @@ struct ggml_cann_pool_alloc {
203
199
  * @brief Gets the pointer to the allocated memory block.
204
200
  * @return Pointer to the allocated memory block.
205
201
  */
206
- void* get() { return ptr; }
202
+ void * get() { return ptr; }
207
203
 
208
204
  // Deleted copy constructor
209
- ggml_cann_pool_alloc(const ggml_cann_pool_alloc&) = delete;
205
+ ggml_cann_pool_alloc(const ggml_cann_pool_alloc &) = delete;
210
206
 
211
207
  // Deleted move constructor
212
- ggml_cann_pool_alloc(ggml_cann_pool_alloc&&) = delete;
208
+ ggml_cann_pool_alloc(ggml_cann_pool_alloc &&) = delete;
213
209
 
214
210
  // Deleted copy assignment operator
215
- ggml_cann_pool_alloc& operator=(const ggml_cann_pool_alloc&) = delete;
211
+ ggml_cann_pool_alloc & operator=(const ggml_cann_pool_alloc &) = delete;
216
212
 
217
213
  // Deleted move assignment operator
218
- ggml_cann_pool_alloc& operator=(ggml_cann_pool_alloc&&) = delete;
214
+ ggml_cann_pool_alloc & operator=(ggml_cann_pool_alloc &&) = delete;
219
215
  };
220
216
 
221
- /**
222
- * @brief Function pointer type for ACLNN operator calls.
223
- */
224
- using aclnn_func_t = aclnnStatus (*)(void*, uint64_t, aclOpExecutor*, aclrtStream);
217
+ #ifdef USE_ACL_GRAPH
218
+ struct ggml_graph_node_properties {
219
+ // dst tensor
220
+ void * node_address;
221
+ int64_t ne[GGML_MAX_DIMS];
222
+ size_t nb[GGML_MAX_DIMS];
225
223
 
226
- /**
227
- * @brief Base class for all CANN tasks to be submitted to the task queue.
228
- *
229
- * Users should override the run_task() method with actual task logic.
230
- */
231
- class cann_task {
232
- public:
233
- virtual void run_task() {}
234
- };
224
+ // src tensor
225
+ void * src_address[GGML_MAX_SRC];
226
+ int64_t src_ne[GGML_MAX_SRC][GGML_MAX_DIMS];
227
+ size_t src_nb[GGML_MAX_SRC][GGML_MAX_DIMS];
235
228
 
236
- /**
237
- * @brief A lock-free ring-buffer based task queue for asynchronously executing cann_task instances.
238
- */
239
- class cann_task_queue {
240
- public:
241
- /**
242
- * @brief Constructs a task queue with a fixed power-of-two capacity for a specific device.
243
- *
244
- * @param capacity Queue capacity. Must be a power of 2.
245
- * @param device Target device ID (used for context setting).
246
- */
247
- explicit cann_task_queue(size_t capacity, int32_t device)
248
- : buffer_(capacity), capacity_(capacity), head_(0), tail_(0),
249
- running_(false), device_(device) {
250
- GGML_ASSERT((capacity & (capacity - 1)) == 0 && "capacity must be power of 2");
251
- mask_ = capacity_ - 1;
252
- }
229
+ // op
230
+ ggml_op node_op;
231
+ int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t)];
253
232
 
254
233
  /**
255
- * @brief Attempts to enqueue a task into the queue.
234
+ * @brief Check if a ggml tensor node matches this property set.
256
235
  *
257
- * @param item Unique pointer to the task.
258
- * @return true if the task was successfully enqueued, false if the queue was full.
236
+ * This function compares all relevant fields (address, op type, shape, source inputs, op params)
237
+ * to determine whether the current node matches these previously recorded properties.
238
+ *
239
+ * @param node The current ggml tensor node.
240
+ * @return true if all fields match (excluding GGML_OP_VIEW); false otherwise.
259
241
  */
260
- bool enqueue(std::unique_ptr<cann_task>&& item) {
261
- size_t next_tail = (tail_ + 1) & mask_;
262
-
263
- if (next_tail == head_) {
242
+ bool has_matching_properties(ggml_tensor * node) {
243
+ if (node->data != this->node_address && node->op != GGML_OP_VIEW) {
264
244
  return false;
265
245
  }
266
246
 
267
- buffer_[tail_] = std::move(item);
268
- std::atomic_thread_fence(std::memory_order_release);
269
- tail_ = next_tail;
270
-
271
- return true;
272
- }
247
+ if (node->op != this->node_op) {
248
+ return false;
249
+ }
273
250
 
274
- /**
275
- * @brief Submits a task to the queue, and starts the worker thread if not already running.
276
- *
277
- * @param task Task to be submitted.
278
- */
279
- void submit_task(std::unique_ptr<cann_task>&& task) {
280
- while(!enqueue(std::move(task))) {
281
- std::this_thread::yield();
282
- continue;
251
+ for (int i = 0; i < GGML_MAX_DIMS; i++) {
252
+ if (node->ne[i] != this->ne[i]) {
253
+ return false;
254
+ }
255
+ if (node->nb[i] != this->nb[i]) {
256
+ return false;
257
+ }
283
258
  }
284
259
 
285
- if (!running_) {
286
- running_ = true;
287
- thread_ = std::thread(&cann_task_queue::execute, this);
260
+ for (int i = 0; i < GGML_MAX_SRC; i++) {
261
+ if (node->src[i]) {
262
+ if (node->src[i]->data != this->src_address[i] && node->op != GGML_OP_VIEW) {
263
+ return false;
264
+ }
265
+
266
+ for (int d = 0; d < GGML_MAX_DIMS; d++) {
267
+ if (node->src[i]->ne[d] != this->src_ne[i][d]) {
268
+ return false;
269
+ }
270
+ if (node->src[i]->nb[d] != this->src_nb[i][d]) {
271
+ return false;
272
+ }
273
+ }
274
+ } else {
275
+ if (this->src_address[i] != nullptr) {
276
+ return false;
277
+ }
278
+ }
288
279
  }
289
280
 
281
+ if (node->op == GGML_OP_SCALE || node->op == GGML_OP_UNARY || node->op == GGML_OP_GLU) {
282
+ return memcmp(this->op_params, node->op_params, GGML_MAX_OP_PARAMS) == 0;
283
+ }
284
+ return true;
290
285
  }
286
+ };
291
287
 
292
- /**
293
- * @brief Waits until the queue is completely empty and no tasks are being processed.
294
- */
295
- void wait() {
296
- while (running_ && head_ != tail_) {
297
- std::this_thread::yield();
298
- continue;
288
+ struct ggml_cann_graph {
289
+ ~ggml_cann_graph() {
290
+ if (graph != nullptr) {
291
+ ACL_CHECK(aclmdlRIDestroy(graph));
299
292
  }
300
293
  }
301
294
 
295
+ aclmdlRI graph = nullptr;
296
+
297
+ std::vector<ggml_graph_node_properties> ggml_graph_properties;
298
+
302
299
  /**
303
- * @brief Stops the task queue and joins the worker thread.
300
+ * @brief Create a new CANN graph from a ggml computation graph.
301
+ *
302
+ * This function creates a new ggml_cann_graph object and fills its node properties
303
+ * (operation type, dimensions, strides, input sources, and operation parameters)
304
+ * based on the current ggml computation graph.
305
+ *
306
+ * Each node in the ggml graph is mapped to a property entry in the new CANN graph:
307
+ * - node address
308
+ * - operation type
309
+ * - shape (ne) and strides (nb)
310
+ * - source tensor addresses
311
+ * - operation parameters
312
+ *
313
+ * @param cgraph The current ggml computation graph.
314
+ * @return Pointer to the newly created ggml_cann_graph object.
304
315
  */
305
- void stop() {
306
- running_ = false;
307
- if (thread_.joinable()) {
308
- thread_.join();
316
+ static ggml_cann_graph * create_from_cgraph(ggml_cgraph * cgraph) {
317
+ ggml_cann_graph * new_graph = new ggml_cann_graph();
318
+ new_graph->ggml_graph_properties.resize(cgraph->n_nodes);
319
+
320
+ for (int node_idx = 0; node_idx < cgraph->n_nodes; ++node_idx) {
321
+ ggml_tensor * node = cgraph->nodes[node_idx];
322
+ auto & prop = new_graph->ggml_graph_properties[node_idx];
323
+
324
+ prop.node_address = node->data;
325
+ prop.node_op = node->op;
326
+
327
+ std::copy_n(node->ne, GGML_MAX_DIMS, prop.ne);
328
+ std::copy_n(node->nb, GGML_MAX_DIMS, prop.nb);
329
+
330
+ for (int src = 0; src < GGML_MAX_SRC; ++src) {
331
+ if (node->src[src]) {
332
+ prop.src_address[src] = node->src[src]->data;
333
+ std::copy_n(node->src[src]->ne, GGML_MAX_DIMS, prop.src_ne[src]);
334
+ std::copy_n(node->src[src]->nb, GGML_MAX_DIMS, prop.src_nb[src]);
335
+ } else {
336
+ prop.src_address[src] = nullptr;
337
+ std::fill_n(prop.src_ne[src], GGML_MAX_DIMS, 0);
338
+ std::fill_n(prop.src_nb[src], GGML_MAX_DIMS, 0);
339
+ }
340
+ }
341
+
342
+ memcpy(prop.op_params, node->op_params, GGML_MAX_OP_PARAMS);
309
343
  }
344
+
345
+ return new_graph;
310
346
  }
311
347
 
312
- private:
313
348
  /**
314
- * @brief Worker thread function that continuously dequeues and executes tasks.
349
+ * @brief Check whether this CANN graph matches the given ggml computation graph.
350
+ *
351
+ * This function compares the number of nodes and each node's properties
352
+ * (operation type, dimensions, strides, inputs, and operation parameters)
353
+ * to determine whether this CANN graph matches the given ggml graph.
354
+ *
355
+ * @param cgraph The current ggml computation graph.
356
+ * @return true if this CANN graph matches the ggml graph; false otherwise.
315
357
  */
316
- void execute() {
317
- ggml_cann_set_device(device_);
358
+ bool matches_cgraph(ggml_cgraph * cgraph) {
359
+ if (this->ggml_graph_properties.size() != static_cast<size_t>(cgraph->n_nodes)) {
360
+ return false;
361
+ }
318
362
 
319
- while (running_) {
320
- if(head_ == tail_) {
321
- std::this_thread::yield();
322
- continue;
363
+ for (int i = 0; i < cgraph->n_nodes; ++i) {
364
+ if (!this->ggml_graph_properties[i].has_matching_properties(cgraph->nodes[i])) {
365
+ return false;
323
366
  }
324
-
325
- std::atomic_thread_fence(std::memory_order_acquire);
326
- buffer_[head_]->run_task();
327
- buffer_[head_].reset();
328
- head_ = (head_ + 1) & mask_;
329
367
  }
330
- }
331
-
332
- std::vector<std::unique_ptr<cann_task>> buffer_;
333
- const size_t capacity_;
334
- size_t mask_;
335
- size_t head_;
336
- size_t tail_;
337
- bool running_;
338
- std::thread thread_;
339
- int32_t device_;
340
- };
341
368
 
342
- #ifdef USE_ACL_GRAPH
343
- struct ggml_graph_node_properties {
344
- void * node_address;
345
- ggml_op node_op;
346
- int64_t ne[GGML_MAX_DIMS];
347
- size_t nb[GGML_MAX_DIMS];
348
- void * src_address[GGML_MAX_SRC];
349
- int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t)];
350
- };
351
-
352
- struct ggml_cann_graph {
353
- ~ggml_cann_graph() {
354
- if (graph != nullptr) {
355
- ACL_CHECK(aclmdlRIDestroy(graph));
356
- }
369
+ return true;
357
370
  }
358
-
359
- aclmdlRI graph = nullptr;
360
-
361
- std::vector<ggml_graph_node_properties> ggml_graph_properties;
362
371
  };
363
372
 
364
373
  /**
@@ -369,13 +378,11 @@ struct ggml_cann_graph {
369
378
  * move existing graphs to the front (most recently used), and clear the cache.
370
379
  */
371
380
  struct ggml_cann_graph_lru_cache {
372
- size_t capacity; /**< Maximum number of graphs in the cache. */
381
+ size_t capacity; /**< Maximum number of graphs in the cache. */
373
382
 
374
- std::list<ggml_cann_graph*> cache_list; /**< List storing cached graphs as raw pointers. */
383
+ std::list<ggml_cann_graph *> cache_list; /**< List storing cached graphs as raw pointers. */
375
384
 
376
- ggml_cann_graph_lru_cache() {
377
- capacity = parse_integer(get_env("GGML_CANN_GRAPH_CACHE_CAPACITY").value_or("12"));
378
- }
385
+ ggml_cann_graph_lru_cache() { capacity = parse_integer(get_env("GGML_CANN_GRAPH_CACHE_CAPACITY").value_or("12")); }
379
386
 
380
387
  /**
381
388
  * @brief Push a new graph to the front of the cache.
@@ -383,24 +390,15 @@ struct ggml_cann_graph_lru_cache {
383
390
  * @param new_node Pointer to the new ggml_cann_graph to cache.
384
391
  * Ownership is transferred to the cache (cache will delete it).
385
392
  */
386
- void push(ggml_cann_graph* new_node) {
393
+ void push(ggml_cann_graph * new_node) {
387
394
  if (cache_list.size() >= capacity) {
388
- ggml_cann_graph* old = cache_list.back();
395
+ ggml_cann_graph * old = cache_list.back();
389
396
  cache_list.pop_back();
390
- delete old; // free the old graph
397
+ delete old; // free the old graph
391
398
  }
392
399
  cache_list.push_front(new_node);
393
400
  }
394
401
 
395
- /**
396
- * @brief Move an existing graph to the front of the cache.
397
- * @param node Pointer to the ggml_cann_graph to move.
398
- */
399
- void move_to_front(ggml_cann_graph* node) {
400
- cache_list.remove(node);
401
- cache_list.push_front(node);
402
- }
403
-
404
402
  /**
405
403
  * @brief Clear all graphs from the cache (also frees memory).
406
404
  */
@@ -414,92 +412,171 @@ struct ggml_cann_graph_lru_cache {
414
412
  /**
415
413
  * @brief Destructor that clears the cache and frees all cached graphs.
416
414
  */
417
- ~ggml_cann_graph_lru_cache() {
418
- clear();
415
+ ~ggml_cann_graph_lru_cache() { clear(); }
416
+
417
+ /**
418
+ * @brief Find a cached CANN graph that matches the given ggml graph and move it to front.
419
+ *
420
+ * This function iterates through the cached CANN graphs stored in the LRU cache and
421
+ * compares them against the given ggml computation graph. If a matching graph is found,
422
+ * it is promoted to the front of the LRU cache and the function returns true. Otherwise,
423
+ * the function returns false.
424
+ *
425
+ * @param cgraph The current ggml computation graph.
426
+ * @return true if found; false otherwise.
427
+ */
428
+ bool find_and_move_to_front(ggml_cgraph * cgraph) {
429
+ for (auto & graph_ptr : this->cache_list) {
430
+ if (graph_ptr->matches_cgraph(cgraph)) {
431
+ cache_list.remove(graph_ptr);
432
+ cache_list.push_front(graph_ptr);
433
+ return true;
434
+ }
435
+ }
436
+ return false;
419
437
  }
420
438
  };
421
439
  #endif // USE_ACL_GRAPH
422
440
 
423
441
  struct ggml_cann_rope_cache {
424
442
  ~ggml_cann_rope_cache() {
425
- if(theta_scale_cache != nullptr) {
443
+ if (theta_scale_cache) {
426
444
  ACL_CHECK(aclrtFree(theta_scale_cache));
427
445
  }
428
- if(sin_cache != nullptr) {
446
+ if (sin_cache) {
429
447
  ACL_CHECK(aclrtFree(sin_cache));
430
448
  }
431
- if(cos_cache != nullptr) {
449
+ if (cos_cache) {
432
450
  ACL_CHECK(aclrtFree(cos_cache));
433
451
  }
452
+ if (position_select_index) {
453
+ ACL_CHECK(aclrtFree(position_select_index));
454
+ }
455
+ if (theta_scale_exp_host) {
456
+ free(theta_scale_exp_host);
457
+ }
458
+ if (position_select_index_host) {
459
+ free(position_select_index_host);
460
+ }
461
+ if (yarn_ramp_cache) {
462
+ ACL_CHECK(aclrtFree(yarn_ramp_cache));
463
+ }
464
+ }
465
+
466
+ bool equal(int64_t theta_scale_length,
467
+ int64_t position_length,
468
+ float ext_factor,
469
+ float theta_scale,
470
+ float freq_scale,
471
+ float attn_factor,
472
+ bool is_neox,
473
+ bool indep_sects,
474
+ bool mrope_used,
475
+ bool is_imrope,
476
+ int sections[4]) {
477
+ return this->theta_scale_length == theta_scale_length && this->position_length == position_length &&
478
+ this->ext_factor == ext_factor && this->theta_scale == theta_scale && this->freq_scale == freq_scale &&
479
+ this->attn_factor == attn_factor && this->is_neox == is_neox && this->indep_sects == indep_sects &&
480
+ this->mrope_used == mrope_used && this->is_imrope == is_imrope && this->sections[0] == sections[0] &&
481
+ this->sections[1] == sections[1] && this->sections[2] == sections[2] && this->sections[3] == sections[3];
482
+ }
483
+
484
+ void set(int64_t theta_scale_length,
485
+ int64_t position_length,
486
+ float ext_factor,
487
+ float theta_scale,
488
+ float freq_scale,
489
+ float attn_factor,
490
+ bool is_neox,
491
+ bool indep_sects,
492
+ bool mrope_used,
493
+ bool is_imrope,
494
+ int sections[4]) {
495
+ this->theta_scale_length = theta_scale_length;
496
+ this->position_length = position_length;
497
+ this->ext_factor = ext_factor;
498
+ this->theta_scale = theta_scale;
499
+ this->freq_scale = freq_scale;
500
+ this->attn_factor = attn_factor;
501
+ this->is_neox = is_neox;
502
+ this->indep_sects = indep_sects;
503
+ this->mrope_used = mrope_used;
504
+ this->is_imrope = is_imrope;
505
+ this->sections[0] = sections[0];
506
+ this->sections[1] = sections[1];
507
+ this->sections[2] = sections[2];
508
+ this->sections[3] = sections[3];
434
509
  }
435
510
 
436
- void* theta_scale_cache = nullptr;
437
- int64_t theta_scale_length = 0;
511
+ // Memory caches, prepared before inference.
512
+ void * theta_scale_cache = nullptr;
513
+ float * theta_scale_exp_host = nullptr;
514
+ int * position_select_index_host = nullptr;
515
+ void * position_select_index = nullptr;
516
+ void * yarn_ramp_cache = nullptr;
438
517
  // sin/cos cache, used only to accelerate first layer on each device
439
- void* sin_cache = nullptr;
440
- void* cos_cache = nullptr;
441
- int64_t position_length = 0;
518
+ void * sin_cache = nullptr;
519
+ void * cos_cache = nullptr;
442
520
  // Properties to check before reusing the sincos cache
443
- bool cached = false;
444
- float ext_factor = 0.0f;
445
- float theta_scale = 0.0f;
446
- float freq_scale = 0.0f;
447
- float attn_factor = 0.0f;
448
- bool is_neox = false;
521
+ int64_t theta_scale_length = 0;
522
+ int64_t position_length = 0;
523
+ bool cached = false;
524
+ float ext_factor = 0.0f;
525
+ float theta_scale = 0.0f;
526
+ float freq_scale = 0.0f;
527
+ float attn_factor = 0.0f;
528
+ bool is_neox = false;
529
+ bool indep_sects = false;
530
+ bool mrope_used = false;
531
+ int sections[4] = { 0, 0, 0, 0 };
532
+ bool is_imrope = false;
449
533
  };
450
534
 
451
535
  struct ggml_cann_tensor_cache {
452
536
  ~ggml_cann_tensor_cache() {
453
- if(cache != nullptr) {
537
+ if (cache != nullptr) {
454
538
  ACL_CHECK(aclrtFree(cache));
455
539
  }
456
540
  }
457
541
 
458
- void* cache = nullptr;
459
- int64_t size = 0;
542
+ void * cache = nullptr;
543
+ int64_t size = 0;
460
544
  };
461
545
 
462
546
  /**
463
547
  * @brief Context for managing CANN backend operations.
464
548
  */
465
549
  struct ggml_backend_cann_context {
466
- int32_t device; /**< Device ID. */
467
- std::string name; /**< Name of the device. */
468
- std::string description; /**< Description of the device. */
469
- aclrtEvent copy_event = nullptr; /**< Event for managing copy operations. */
550
+ int32_t device; /**< Device ID. */
551
+ std::string name; /**< Name of the device. */
552
+ std::string description; /**< Description of the device. */
553
+ aclrtEvent copy_event = nullptr; /**< Event for managing copy operations. */
470
554
  #ifdef USE_ACL_GRAPH
471
555
  /// Cached CANN ACL graph used for executing the current ggml computation graph.
472
556
  ggml_cann_graph_lru_cache graph_lru_cache;
473
- bool acl_graph_mode = true;
557
+ bool acl_graph_mode = true;
474
558
  #endif
475
- cann_task_queue task_queue;
476
- bool async_mode;
559
+ bool async_mode;
477
560
  // Rope Cache
478
- ggml_cann_rope_cache rope_cache;
561
+ ggml_cann_rope_cache rope_cache;
479
562
  // Constant Pool
480
563
  ggml_cann_tensor_cache rms_norm_one_tensor_cache;
481
564
  ggml_cann_tensor_cache rms_norm_zero_tensor_cache;
482
565
 
483
- aclrtStream streams[GGML_CANN_MAX_STREAMS] = {nullptr}; /**< Array of streams for the device. */
566
+ aclrtStream streams[GGML_CANN_MAX_STREAMS] = { nullptr }; /**< Array of streams for the device. */
484
567
 
485
568
  /**
486
569
  * @brief Constructor for initializing the context with a given device.
487
570
  * @param device Device ID.
488
571
  */
489
- explicit ggml_backend_cann_context(int device)
490
- : device(device), name("CANN" + std::to_string(device)), task_queue(1024, device) {
572
+ explicit ggml_backend_cann_context(int device) : device(device), name("CANN" + std::to_string(device)) {
491
573
  ggml_cann_set_device(device);
492
574
  description = aclrtGetSocName();
493
575
 
494
- async_mode = parse_bool(get_env("GGML_CANN_ASYNC_MODE").value_or(""));
495
- GGML_LOG_INFO("%s: device %d async operator submission is %s\n", __func__,
496
- device, async_mode ? "ON" : "OFF");
497
576
  #ifdef USE_ACL_GRAPH
498
577
  acl_graph_mode = parse_bool(get_env("GGML_CANN_ACL_GRAPH").value_or("on"));
499
- GGML_LOG_INFO("%s: device %d execution mode is %s (%s)\n",
500
- __func__, device,
501
- acl_graph_mode ? "GRAPH" : "EAGER",
502
- acl_graph_mode ? "acl graph enabled" : "acl graph disabled");
578
+ GGML_LOG_INFO("%s: device %d execution mode is %s (%s)\n", __func__, device, acl_graph_mode ? "GRAPH" : "EAGER",
579
+ acl_graph_mode ? "acl graph enabled" : "acl graph disabled");
503
580
  #endif
504
581
  }
505
582
 
@@ -508,7 +585,6 @@ struct ggml_backend_cann_context {
508
585
  */
509
586
  ~ggml_backend_cann_context() {
510
587
  ggml_cann_set_device(device);
511
- task_queue.stop();
512
588
  if (copy_event != nullptr) {
513
589
  ACL_CHECK(aclrtDestroyEvent(copy_event));
514
590
  }
@@ -542,8 +618,7 @@ struct ggml_backend_cann_context {
542
618
  aclrtStream stream() { return stream(0); }
543
619
 
544
620
  // TODO: each stream should have a memory pool.
545
- std::unique_ptr<ggml_cann_pool>
546
- mem_pool; /**< Memory pool for the device. */
621
+ std::unique_ptr<ggml_cann_pool> mem_pool; /**< Memory pool for the device. */
547
622
 
548
623
  /**
549
624
  * @brief Create a new memory pool for a given device.
@@ -556,7 +631,7 @@ struct ggml_backend_cann_context {
556
631
  * @brief Get or create the memory pool for the context.
557
632
  * @return Reference to the memory pool.
558
633
  */
559
- ggml_cann_pool& pool() {
634
+ ggml_cann_pool & pool() {
560
635
  if (mem_pool == nullptr) {
561
636
  mem_pool = new_pool_for_device(device);
562
637
  }