whispercpp 1.3.1 → 1.3.3

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
Files changed (857)
  1. checksums.yaml +4 -4
  2. data/.gitignore +7 -3
  3. data/README.md +161 -43
  4. data/Rakefile +45 -13
  5. data/ext/.gitignore +4 -8
  6. data/ext/dependencies.rb +73 -0
  7. data/ext/extconf.rb +21 -198
  8. data/ext/options.rb +85 -0
  9. data/ext/ruby_whisper.c +177 -0
  10. data/ext/ruby_whisper.h +17 -2
  11. data/ext/ruby_whisper_context.c +672 -0
  12. data/ext/ruby_whisper_error.c +52 -0
  13. data/ext/ruby_whisper_model.c +232 -0
  14. data/ext/ruby_whisper_params.c +1303 -0
  15. data/ext/ruby_whisper_segment.c +220 -0
  16. data/ext/ruby_whisper_transcribe.cpp +93 -0
  17. data/ext/ruby_whisper_vad_params.c +288 -0
  18. data/ext/sources/CMakeGraphVizOptions.cmake +8 -0
  19. data/ext/sources/CMakeLists.txt +255 -0
  20. data/ext/sources/bindings/javascript/CMakeLists.txt +41 -0
  21. data/ext/sources/bindings/javascript/emscripten.cpp +93 -0
  22. data/ext/sources/bindings/javascript/libwhisper.worker.js +1 -0
  23. data/ext/sources/bindings/javascript/package-tmpl.json +26 -0
  24. data/ext/sources/bindings/javascript/package.json +26 -0
  25. data/ext/sources/bindings/javascript/whisper.js +19 -0
  26. data/ext/sources/build-xcframework.sh +547 -0
  27. data/ext/sources/cmake/DefaultTargetOptions.cmake +16 -0
  28. data/ext/sources/cmake/FindFFmpeg.cmake +163 -0
  29. data/ext/sources/cmake/build-info.cmake +60 -0
  30. data/ext/sources/cmake/git-vars.cmake +22 -0
  31. data/ext/sources/cmake/whisper-config.cmake.in +65 -0
  32. data/ext/sources/cmake/whisper.pc.in +10 -0
  33. data/ext/sources/examples/CMakeLists.txt +124 -0
  34. data/ext/sources/examples/addon.node/CMakeLists.txt +31 -0
  35. data/ext/sources/examples/addon.node/__test__/whisper.spec.js +133 -0
  36. data/ext/sources/examples/addon.node/addon.cpp +557 -0
  37. data/ext/sources/examples/addon.node/index.js +57 -0
  38. data/ext/sources/examples/addon.node/package.json +16 -0
  39. data/ext/sources/examples/addon.node/vad-example.js +132 -0
  40. data/ext/sources/examples/bench/CMakeLists.txt +8 -0
  41. data/ext/sources/examples/bench/bench.cpp +176 -0
  42. data/ext/sources/examples/bench.wasm/CMakeLists.txt +49 -0
  43. data/ext/sources/examples/bench.wasm/emscripten.cpp +87 -0
  44. data/ext/sources/examples/bench.wasm/index-tmpl.html +284 -0
  45. data/ext/sources/examples/cli/CMakeLists.txt +8 -0
  46. data/ext/sources/examples/cli/cli.cpp +1295 -0
  47. data/ext/sources/examples/coi-serviceworker.js +146 -0
  48. data/ext/sources/examples/command/CMakeLists.txt +10 -0
  49. data/ext/sources/examples/command/command.cpp +800 -0
  50. data/ext/sources/examples/command/commands.txt +9 -0
  51. data/ext/sources/examples/command.wasm/CMakeLists.txt +50 -0
  52. data/ext/sources/examples/command.wasm/emscripten.cpp +327 -0
  53. data/ext/sources/examples/command.wasm/index-tmpl.html +414 -0
  54. data/ext/sources/examples/common-ggml.cpp +238 -0
  55. data/ext/sources/examples/common-ggml.h +18 -0
  56. data/ext/sources/examples/common-sdl.cpp +227 -0
  57. data/ext/sources/examples/common-sdl.h +49 -0
  58. data/ext/sources/examples/common-whisper.cpp +175 -0
  59. data/ext/sources/examples/common-whisper.h +24 -0
  60. data/ext/sources/examples/common.cpp +675 -0
  61. data/ext/sources/examples/common.h +322 -0
  62. data/ext/sources/examples/deprecation-warning/CMakeLists.txt +6 -0
  63. data/ext/sources/examples/deprecation-warning/deprecation-warning.cpp +38 -0
  64. data/ext/sources/examples/ffmpeg-transcode.cpp +368 -0
  65. data/ext/sources/examples/generate-karaoke.sh +57 -0
  66. data/ext/sources/examples/grammar-parser.cpp +423 -0
  67. data/ext/sources/examples/grammar-parser.h +29 -0
  68. data/ext/sources/examples/helpers.js +191 -0
  69. data/ext/sources/examples/json.hpp +24596 -0
  70. data/ext/sources/examples/livestream.sh +112 -0
  71. data/ext/sources/examples/lsp/CMakeLists.txt +9 -0
  72. data/ext/sources/examples/lsp/lsp.cpp +469 -0
  73. data/ext/sources/examples/lsp/whisper.vim +362 -0
  74. data/ext/sources/examples/miniaudio.h +93468 -0
  75. data/ext/sources/examples/python/test_whisper_processor.py +7 -0
  76. data/ext/sources/examples/python/whisper_processor.py +54 -0
  77. data/ext/sources/examples/quantize/CMakeLists.txt +6 -0
  78. data/ext/sources/examples/quantize/quantize.cpp +226 -0
  79. data/ext/sources/examples/server/CMakeLists.txt +15 -0
  80. data/ext/sources/examples/server/bench.js +29 -0
  81. data/ext/sources/examples/server/httplib.h +10497 -0
  82. data/ext/sources/examples/server/server.cpp +1238 -0
  83. data/ext/sources/examples/server.py +115 -0
  84. data/ext/sources/examples/stb_vorbis.c +5584 -0
  85. data/ext/sources/examples/stream/CMakeLists.txt +10 -0
  86. data/ext/sources/examples/stream/stream.cpp +435 -0
  87. data/ext/sources/examples/stream.wasm/CMakeLists.txt +49 -0
  88. data/ext/sources/examples/stream.wasm/emscripten.cpp +216 -0
  89. data/ext/sources/examples/stream.wasm/index-tmpl.html +414 -0
  90. data/ext/sources/examples/sycl/CMakeLists.txt +9 -0
  91. data/ext/sources/examples/sycl/build.sh +22 -0
  92. data/ext/sources/examples/sycl/ls-sycl-device.cpp +11 -0
  93. data/ext/sources/examples/sycl/run-whisper.sh +17 -0
  94. data/ext/sources/examples/talk-llama/CMakeLists.txt +43 -0
  95. data/ext/sources/examples/talk-llama/eleven-labs.py +80 -0
  96. data/ext/sources/examples/talk-llama/llama-adapter.cpp +388 -0
  97. data/ext/sources/examples/talk-llama/llama-adapter.h +76 -0
  98. data/ext/sources/examples/talk-llama/llama-arch.cpp +1914 -0
  99. data/ext/sources/examples/talk-llama/llama-arch.h +464 -0
  100. data/ext/sources/examples/talk-llama/llama-batch.cpp +843 -0
  101. data/ext/sources/examples/talk-llama/llama-batch.h +147 -0
  102. data/ext/sources/examples/talk-llama/llama-chat.cpp +685 -0
  103. data/ext/sources/examples/talk-llama/llama-chat.h +59 -0
  104. data/ext/sources/examples/talk-llama/llama-context.cpp +2845 -0
  105. data/ext/sources/examples/talk-llama/llama-context.h +297 -0
  106. data/ext/sources/examples/talk-llama/llama-cparams.cpp +5 -0
  107. data/ext/sources/examples/talk-llama/llama-cparams.h +41 -0
  108. data/ext/sources/examples/talk-llama/llama-grammar.cpp +1229 -0
  109. data/ext/sources/examples/talk-llama/llama-grammar.h +173 -0
  110. data/ext/sources/examples/talk-llama/llama-graph.cpp +1693 -0
  111. data/ext/sources/examples/talk-llama/llama-graph.h +710 -0
  112. data/ext/sources/examples/talk-llama/llama-hparams.cpp +103 -0
  113. data/ext/sources/examples/talk-llama/llama-hparams.h +207 -0
  114. data/ext/sources/examples/talk-llama/llama-impl.cpp +167 -0
  115. data/ext/sources/examples/talk-llama/llama-impl.h +61 -0
  116. data/ext/sources/examples/talk-llama/llama-io.cpp +15 -0
  117. data/ext/sources/examples/talk-llama/llama-io.h +35 -0
  118. data/ext/sources/examples/talk-llama/llama-kv-cache-unified-iswa.cpp +279 -0
  119. data/ext/sources/examples/talk-llama/llama-kv-cache-unified-iswa.h +128 -0
  120. data/ext/sources/examples/talk-llama/llama-kv-cache-unified.cpp +1841 -0
  121. data/ext/sources/examples/talk-llama/llama-kv-cache-unified.h +303 -0
  122. data/ext/sources/examples/talk-llama/llama-kv-cache.h +44 -0
  123. data/ext/sources/examples/talk-llama/llama-kv-cells.h +439 -0
  124. data/ext/sources/examples/talk-llama/llama-memory-hybrid.cpp +246 -0
  125. data/ext/sources/examples/talk-llama/llama-memory-hybrid.h +138 -0
  126. data/ext/sources/examples/talk-llama/llama-memory-recurrent.cpp +1125 -0
  127. data/ext/sources/examples/talk-llama/llama-memory-recurrent.h +183 -0
  128. data/ext/sources/examples/talk-llama/llama-memory.cpp +59 -0
  129. data/ext/sources/examples/talk-llama/llama-memory.h +116 -0
  130. data/ext/sources/examples/talk-llama/llama-mmap.cpp +600 -0
  131. data/ext/sources/examples/talk-llama/llama-mmap.h +68 -0
  132. data/ext/sources/examples/talk-llama/llama-model-loader.cpp +1163 -0
  133. data/ext/sources/examples/talk-llama/llama-model-loader.h +169 -0
  134. data/ext/sources/examples/talk-llama/llama-model-saver.cpp +282 -0
  135. data/ext/sources/examples/talk-llama/llama-model-saver.h +37 -0
  136. data/ext/sources/examples/talk-llama/llama-model.cpp +15114 -0
  137. data/ext/sources/examples/talk-llama/llama-model.h +452 -0
  138. data/ext/sources/examples/talk-llama/llama-quant.cpp +1049 -0
  139. data/ext/sources/examples/talk-llama/llama-quant.h +1 -0
  140. data/ext/sources/examples/talk-llama/llama-sampling.cpp +2575 -0
  141. data/ext/sources/examples/talk-llama/llama-sampling.h +32 -0
  142. data/ext/sources/examples/talk-llama/llama-vocab.cpp +3377 -0
  143. data/ext/sources/examples/talk-llama/llama-vocab.h +132 -0
  144. data/ext/sources/examples/talk-llama/llama.cpp +358 -0
  145. data/ext/sources/examples/talk-llama/llama.h +1484 -0
  146. data/ext/sources/examples/talk-llama/prompts/talk-alpaca.txt +23 -0
  147. data/ext/sources/examples/talk-llama/speak +40 -0
  148. data/ext/sources/examples/talk-llama/speak.bat +1 -0
  149. data/ext/sources/examples/talk-llama/speak.ps1 +14 -0
  150. data/ext/sources/examples/talk-llama/talk-llama.cpp +810 -0
  151. data/ext/sources/examples/talk-llama/unicode-data.cpp +7034 -0
  152. data/ext/sources/examples/talk-llama/unicode-data.h +20 -0
  153. data/ext/sources/examples/talk-llama/unicode.cpp +854 -0
  154. data/ext/sources/examples/talk-llama/unicode.h +66 -0
  155. data/ext/sources/examples/vad-speech-segments/CMakeLists.txt +8 -0
  156. data/ext/sources/examples/vad-speech-segments/speech.cpp +149 -0
  157. data/ext/sources/examples/wchess/CMakeLists.txt +10 -0
  158. data/ext/sources/examples/wchess/libwchess/CMakeLists.txt +19 -0
  159. data/ext/sources/examples/wchess/libwchess/Chessboard.cpp +803 -0
  160. data/ext/sources/examples/wchess/libwchess/Chessboard.h +33 -0
  161. data/ext/sources/examples/wchess/libwchess/WChess.cpp +193 -0
  162. data/ext/sources/examples/wchess/libwchess/WChess.h +63 -0
  163. data/ext/sources/examples/wchess/libwchess/test-chessboard.cpp +117 -0
  164. data/ext/sources/examples/wchess/wchess.cmd/CMakeLists.txt +8 -0
  165. data/ext/sources/examples/wchess/wchess.cmd/wchess.cmd.cpp +251 -0
  166. data/ext/sources/examples/whisper.wasm/CMakeLists.txt +50 -0
  167. data/ext/sources/examples/whisper.wasm/emscripten.cpp +118 -0
  168. data/ext/sources/examples/whisper.wasm/index-tmpl.html +658 -0
  169. data/ext/sources/ggml/CMakeLists.txt +435 -0
  170. data/ext/sources/ggml/cmake/BuildTypes.cmake +54 -0
  171. data/ext/sources/ggml/cmake/GitVars.cmake +22 -0
  172. data/ext/sources/ggml/cmake/common.cmake +50 -0
  173. data/ext/sources/ggml/cmake/ggml-config.cmake.in +152 -0
  174. data/ext/{ggml → sources/ggml}/include/ggml-alloc.h +1 -1
  175. data/ext/{ggml → sources/ggml}/include/ggml-backend.h +10 -8
  176. data/ext/{ggml → sources/ggml}/include/ggml-cpp.h +2 -1
  177. data/ext/{ggml → sources/ggml}/include/ggml-cpu.h +11 -1
  178. data/ext/{ggml → sources/ggml}/include/ggml-metal.h +1 -1
  179. data/ext/{ggml → sources/ggml}/include/ggml-opt.h +49 -28
  180. data/ext/{ggml → sources/ggml}/include/ggml-rpc.h +6 -1
  181. data/ext/{ggml → sources/ggml}/include/ggml-vulkan.h +0 -2
  182. data/ext/{ggml → sources/ggml}/include/ggml.h +325 -269
  183. data/ext/sources/ggml/include/gguf.h +202 -0
  184. data/ext/sources/ggml/src/CMakeLists.txt +404 -0
  185. data/ext/{ggml → sources/ggml}/src/ggml-alloc.c +34 -29
  186. data/ext/sources/ggml/src/ggml-amx/CMakeLists.txt +107 -0
  187. data/ext/{ggml → sources/ggml}/src/ggml-backend-impl.h +1 -2
  188. data/ext/{ggml → sources/ggml}/src/ggml-backend-reg.cpp +92 -53
  189. data/ext/{ggml → sources/ggml}/src/ggml-backend.cpp +69 -34
  190. data/ext/sources/ggml/src/ggml-blas/CMakeLists.txt +87 -0
  191. data/ext/sources/ggml/src/ggml-cann/CMakeLists.txt +75 -0
  192. data/ext/sources/ggml/src/ggml-cann/Doxyfile +2579 -0
  193. data/ext/{ggml → sources/ggml}/src/ggml-cann/acl_tensor.cpp +10 -4
  194. data/ext/{ggml → sources/ggml}/src/ggml-cann/acl_tensor.h +5 -5
  195. data/ext/{ggml → sources/ggml}/src/ggml-cann/aclnn_ops.cpp +1272 -1506
  196. data/ext/sources/ggml/src/ggml-cann/aclnn_ops.h +1125 -0
  197. data/ext/{ggml → sources/ggml}/src/ggml-cann/common.h +140 -1
  198. data/ext/{ggml → sources/ggml}/src/ggml-cann/ggml-cann.cpp +588 -146
  199. data/ext/sources/ggml/src/ggml-cann/kernels/CMakeLists.txt +30 -0
  200. data/ext/{ggml → sources/ggml}/src/ggml-cann/kernels/dup.cpp +3 -5
  201. data/ext/{ggml → sources/ggml}/src/ggml-common.h +16 -8
  202. data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +597 -0
  203. data/ext/{ggml → sources/ggml}/src/ggml-cpu/amx/amx.cpp +3 -2
  204. data/ext/{ggml → sources/ggml}/src/ggml-cpu/amx/mmq.cpp +11 -10
  205. data/ext/sources/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
  206. data/ext/sources/ggml/src/ggml-cpu/arch/arm/quants.c +4114 -0
  207. data/ext/sources/ggml/src/ggml-cpu/arch/arm/repack.cpp +2163 -0
  208. data/ext/sources/ggml/src/ggml-cpu/arch/loongarch/quants.c +2639 -0
  209. data/ext/sources/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
  210. data/ext/sources/ggml/src/ggml-cpu/arch/powerpc/quants.c +2732 -0
  211. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/quants.c +2069 -0
  212. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/repack.cpp +397 -0
  213. data/ext/sources/ggml/src/ggml-cpu/arch/s390/quants.c +1300 -0
  214. data/ext/sources/ggml/src/ggml-cpu/arch/wasm/quants.c +1481 -0
  215. data/ext/{ggml/src/ggml-cpu/cpu-feats-x86.cpp → sources/ggml/src/ggml-cpu/arch/x86/cpu-feats.cpp} +5 -1
  216. data/ext/sources/ggml/src/ggml-cpu/arch/x86/quants.c +4311 -0
  217. data/ext/sources/ggml/src/ggml-cpu/arch/x86/repack.cpp +3285 -0
  218. data/ext/sources/ggml/src/ggml-cpu/arch-fallback.h +184 -0
  219. data/ext/sources/ggml/src/ggml-cpu/binary-ops.cpp +158 -0
  220. data/ext/sources/ggml/src/ggml-cpu/binary-ops.h +16 -0
  221. data/ext/sources/ggml/src/ggml-cpu/cmake/FindSIMD.cmake +100 -0
  222. data/ext/sources/ggml/src/ggml-cpu/common.h +73 -0
  223. data/ext/{ggml → sources/ggml}/src/ggml-cpu/ggml-cpu-impl.h +172 -41
  224. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +3551 -0
  225. data/ext/{ggml → sources/ggml}/src/ggml-cpu/ggml-cpu.cpp +78 -25
  226. data/ext/{ggml/src/ggml-cpu/ggml-cpu-hbm.cpp → sources/ggml/src/ggml-cpu/hbm.cpp} +1 -1
  227. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.cpp +337 -0
  228. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.h +95 -0
  229. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +482 -0
  230. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.h +17 -0
  231. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +3594 -0
  232. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.h +19 -0
  233. data/ext/sources/ggml/src/ggml-cpu/ops.cpp +9786 -0
  234. data/ext/sources/ggml/src/ggml-cpu/ops.h +118 -0
  235. data/ext/sources/ggml/src/ggml-cpu/quants.c +1158 -0
  236. data/ext/{ggml/src/ggml-cpu/ggml-cpu-quants.h → sources/ggml/src/ggml-cpu/quants.h} +26 -0
  237. data/ext/sources/ggml/src/ggml-cpu/repack.cpp +1571 -0
  238. data/ext/sources/ggml/src/ggml-cpu/repack.h +98 -0
  239. data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +1184 -0
  240. data/ext/{ggml/src/ggml-cpu/ggml-cpu-traits.cpp → sources/ggml/src/ggml-cpu/traits.cpp} +1 -1
  241. data/ext/sources/ggml/src/ggml-cpu/unary-ops.cpp +186 -0
  242. data/ext/sources/ggml/src/ggml-cpu/unary-ops.h +28 -0
  243. data/ext/sources/ggml/src/ggml-cpu/vec.cpp +345 -0
  244. data/ext/sources/ggml/src/ggml-cpu/vec.h +1027 -0
  245. data/ext/sources/ggml/src/ggml-cuda/CMakeLists.txt +184 -0
  246. data/ext/sources/ggml/src/ggml-cuda/acc.cu +61 -0
  247. data/ext/sources/ggml/src/ggml-cuda/acc.cuh +5 -0
  248. data/ext/sources/ggml/src/ggml-cuda/arange.cu +34 -0
  249. data/ext/sources/ggml/src/ggml-cuda/arange.cuh +5 -0
  250. data/ext/sources/ggml/src/ggml-cuda/argmax.cu +91 -0
  251. data/ext/sources/ggml/src/ggml-cuda/argmax.cuh +3 -0
  252. data/ext/sources/ggml/src/ggml-cuda/argsort.cu +104 -0
  253. data/ext/sources/ggml/src/ggml-cuda/argsort.cuh +3 -0
  254. data/ext/sources/ggml/src/ggml-cuda/binbcast.cu +363 -0
  255. data/ext/sources/ggml/src/ggml-cuda/binbcast.cuh +9 -0
  256. data/ext/sources/ggml/src/ggml-cuda/clamp.cu +45 -0
  257. data/ext/sources/ggml/src/ggml-cuda/clamp.cuh +5 -0
  258. data/ext/sources/ggml/src/ggml-cuda/common.cuh +851 -0
  259. data/ext/sources/ggml/src/ggml-cuda/concat.cu +221 -0
  260. data/ext/sources/ggml/src/ggml-cuda/concat.cuh +5 -0
  261. data/ext/sources/ggml/src/ggml-cuda/conv-transpose-1d.cu +89 -0
  262. data/ext/sources/ggml/src/ggml-cuda/conv-transpose-1d.cuh +5 -0
  263. data/ext/sources/ggml/src/ggml-cuda/conv2d-dw.cu +161 -0
  264. data/ext/sources/ggml/src/ggml-cuda/conv2d-dw.cuh +5 -0
  265. data/ext/sources/ggml/src/ggml-cuda/conv2d-transpose.cu +91 -0
  266. data/ext/sources/ggml/src/ggml-cuda/conv2d-transpose.cuh +4 -0
  267. data/ext/sources/ggml/src/ggml-cuda/convert.cu +752 -0
  268. data/ext/sources/ggml/src/ggml-cuda/convert.cuh +31 -0
  269. data/ext/sources/ggml/src/ggml-cuda/count-equal.cu +64 -0
  270. data/ext/sources/ggml/src/ggml-cuda/count-equal.cuh +5 -0
  271. data/ext/sources/ggml/src/ggml-cuda/cp-async.cuh +57 -0
  272. data/ext/sources/ggml/src/ggml-cuda/cpy.cu +705 -0
  273. data/ext/sources/ggml/src/ggml-cuda/cpy.cuh +11 -0
  274. data/ext/sources/ggml/src/ggml-cuda/cross-entropy-loss.cu +189 -0
  275. data/ext/sources/ggml/src/ggml-cuda/cross-entropy-loss.cuh +7 -0
  276. data/ext/sources/ggml/src/ggml-cuda/dequantize.cuh +103 -0
  277. data/ext/sources/ggml/src/ggml-cuda/diagmask.cu +40 -0
  278. data/ext/sources/ggml/src/ggml-cuda/diagmask.cuh +5 -0
  279. data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +881 -0
  280. data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +1474 -0
  281. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f16.cu +357 -0
  282. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f16.cuh +3 -0
  283. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f32.cu +365 -0
  284. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f32.cuh +3 -0
  285. data/ext/sources/ggml/src/ggml-cuda/fattn-vec-f16.cuh +482 -0
  286. data/ext/sources/ggml/src/ggml-cuda/fattn-vec-f32.cuh +472 -0
  287. data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +638 -0
  288. data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cuh +3 -0
  289. data/ext/sources/ggml/src/ggml-cuda/fattn.cu +346 -0
  290. data/ext/sources/ggml/src/ggml-cuda/fattn.cuh +3 -0
  291. data/ext/sources/ggml/src/ggml-cuda/getrows.cu +275 -0
  292. data/ext/sources/ggml/src/ggml-cuda/getrows.cuh +15 -0
  293. data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +3647 -0
  294. data/ext/sources/ggml/src/ggml-cuda/gla.cu +93 -0
  295. data/ext/sources/ggml/src/ggml-cuda/gla.cuh +3 -0
  296. data/ext/sources/ggml/src/ggml-cuda/im2col.cu +103 -0
  297. data/ext/sources/ggml/src/ggml-cuda/im2col.cuh +5 -0
  298. data/ext/sources/ggml/src/ggml-cuda/mean.cu +19 -0
  299. data/ext/sources/ggml/src/ggml-cuda/mean.cuh +3 -0
  300. data/ext/sources/ggml/src/ggml-cuda/mma.cuh +396 -0
  301. data/ext/sources/ggml/src/ggml-cuda/mmq.cu +324 -0
  302. data/ext/sources/ggml/src/ggml-cuda/mmq.cuh +3217 -0
  303. data/ext/sources/ggml/src/ggml-cuda/mmv.cu +506 -0
  304. data/ext/sources/ggml/src/ggml-cuda/mmv.cuh +11 -0
  305. data/ext/sources/ggml/src/ggml-cuda/mmvq.cu +595 -0
  306. data/ext/sources/ggml/src/ggml-cuda/mmvq.cuh +12 -0
  307. data/ext/sources/ggml/src/ggml-cuda/norm.cu +458 -0
  308. data/ext/sources/ggml/src/ggml-cuda/norm.cuh +11 -0
  309. data/ext/sources/ggml/src/ggml-cuda/opt-step-adamw.cu +78 -0
  310. data/ext/sources/ggml/src/ggml-cuda/opt-step-adamw.cuh +5 -0
  311. data/ext/sources/ggml/src/ggml-cuda/out-prod.cu +68 -0
  312. data/ext/sources/ggml/src/ggml-cuda/out-prod.cuh +3 -0
  313. data/ext/sources/ggml/src/ggml-cuda/pad.cu +49 -0
  314. data/ext/sources/ggml/src/ggml-cuda/pad.cuh +5 -0
  315. data/ext/sources/ggml/src/ggml-cuda/pool2d.cu +94 -0
  316. data/ext/sources/ggml/src/ggml-cuda/pool2d.cuh +5 -0
  317. data/ext/sources/ggml/src/ggml-cuda/quantize.cu +190 -0
  318. data/ext/sources/ggml/src/ggml-cuda/quantize.cuh +27 -0
  319. data/ext/sources/ggml/src/ggml-cuda/rope.cu +456 -0
  320. data/ext/sources/ggml/src/ggml-cuda/rope.cuh +7 -0
  321. data/ext/sources/ggml/src/ggml-cuda/scale.cu +31 -0
  322. data/ext/sources/ggml/src/ggml-cuda/scale.cuh +5 -0
  323. data/ext/sources/ggml/src/ggml-cuda/softmax.cu +283 -0
  324. data/ext/sources/ggml/src/ggml-cuda/softmax.cuh +7 -0
  325. data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cu +148 -0
  326. data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cuh +3 -0
  327. data/ext/sources/ggml/src/ggml-cuda/ssm-scan.cu +155 -0
  328. data/ext/sources/ggml/src/ggml-cuda/ssm-scan.cuh +3 -0
  329. data/ext/sources/ggml/src/ggml-cuda/sum.cu +45 -0
  330. data/ext/sources/ggml/src/ggml-cuda/sum.cuh +5 -0
  331. data/ext/sources/ggml/src/ggml-cuda/sumrows.cu +26 -0
  332. data/ext/sources/ggml/src/ggml-cuda/sumrows.cuh +4 -0
  333. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_16.cu +5 -0
  334. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_8.cu +10 -0
  335. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_1.cu +10 -0
  336. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_2.cu +10 -0
  337. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_4.cu +10 -0
  338. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_16.cu +5 -0
  339. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_4.cu +10 -0
  340. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_8.cu +10 -0
  341. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_32-ncols2_1.cu +10 -0
  342. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_32-ncols2_2.cu +10 -0
  343. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_16.cu +5 -0
  344. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_2.cu +10 -0
  345. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_4.cu +10 -0
  346. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_8.cu +10 -0
  347. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_64-ncols2_1.cu +10 -0
  348. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_1.cu +10 -0
  349. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_2.cu +10 -0
  350. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_4.cu +10 -0
  351. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_8.cu +10 -0
  352. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +5 -0
  353. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +5 -0
  354. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +5 -0
  355. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +5 -0
  356. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +5 -0
  357. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +5 -0
  358. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +5 -0
  359. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +5 -0
  360. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +5 -0
  361. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +5 -0
  362. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +5 -0
  363. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +5 -0
  364. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +5 -0
  365. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +5 -0
  366. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +5 -0
  367. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +5 -0
  368. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +5 -0
  369. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +5 -0
  370. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +5 -0
  371. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +5 -0
  372. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +5 -0
  373. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +5 -0
  374. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +5 -0
  375. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +5 -0
  376. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +5 -0
  377. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +5 -0
  378. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +5 -0
  379. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +5 -0
  380. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +5 -0
  381. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +5 -0
  382. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +5 -0
  383. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +5 -0
  384. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +5 -0
  385. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +5 -0
  386. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +5 -0
  387. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +5 -0
  388. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +5 -0
  389. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +5 -0
  390. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +5 -0
  391. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +5 -0
  392. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +5 -0
  393. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +5 -0
  394. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +5 -0
  395. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +5 -0
  396. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +5 -0
  397. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +5 -0
  398. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +5 -0
  399. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +5 -0
  400. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +5 -0
  401. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +5 -0
  402. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +5 -0
  403. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +5 -0
  404. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +5 -0
  405. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +5 -0
  406. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +5 -0
  407. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +5 -0
  408. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +5 -0
  409. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +5 -0
  410. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +5 -0
  411. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +5 -0
  412. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +5 -0
  413. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +5 -0
  414. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +5 -0
  415. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +5 -0
  416. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +5 -0
  417. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +5 -0
  418. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +5 -0
  419. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +5 -0
  420. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +5 -0
  421. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +5 -0
  422. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +5 -0
  423. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +5 -0
  424. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +5 -0
  425. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +5 -0
  426. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +5 -0
  427. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +5 -0
  428. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +5 -0
  429. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +5 -0
  430. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +5 -0
  431. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +5 -0
  432. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +5 -0
  433. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +5 -0
  434. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +5 -0
  435. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +5 -0
  436. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +5 -0
  437. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +5 -0
  438. data/ext/sources/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +78 -0
  439. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-iq1_s.cu +5 -0
  440. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_s.cu +5 -0
  441. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_xs.cu +5 -0
  442. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_xxs.cu +5 -0
  443. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-iq3_s.cu +5 -0
  444. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-iq3_xxs.cu +5 -0
  445. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-iq4_nl.cu +5 -0
  446. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-iq4_xs.cu +5 -0
  447. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q2_k.cu +5 -0
  448. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q3_k.cu +5 -0
  449. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q4_0.cu +5 -0
  450. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q4_1.cu +5 -0
  451. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q4_k.cu +5 -0
  452. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q5_0.cu +5 -0
  453. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q5_1.cu +5 -0
  454. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q5_k.cu +5 -0
  455. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q6_k.cu +5 -0
  456. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q8_0.cu +5 -0
  457. data/ext/sources/ggml/src/ggml-cuda/tsembd.cu +47 -0
  458. data/ext/sources/ggml/src/ggml-cuda/tsembd.cuh +5 -0
  459. data/ext/sources/ggml/src/ggml-cuda/unary.cu +378 -0
  460. data/ext/sources/ggml/src/ggml-cuda/unary.cuh +66 -0
  461. data/ext/sources/ggml/src/ggml-cuda/upscale.cu +51 -0
  462. data/ext/sources/ggml/src/ggml-cuda/upscale.cuh +5 -0
  463. data/ext/sources/ggml/src/ggml-cuda/vecdotq.cuh +1135 -0
  464. data/ext/{ggml → sources/ggml}/src/ggml-cuda/vendors/cuda.h +1 -0
  465. data/ext/{ggml → sources/ggml}/src/ggml-cuda/vendors/hip.h +57 -0
  466. data/ext/{ggml → sources/ggml}/src/ggml-cuda/vendors/musa.h +7 -1
  467. data/ext/sources/ggml/src/ggml-cuda/wkv.cu +199 -0
  468. data/ext/sources/ggml/src/ggml-cuda/wkv.cuh +7 -0
  469. data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +135 -0
  470. data/ext/{ggml → sources/ggml}/src/ggml-impl.h +147 -158
  471. data/ext/sources/ggml/src/ggml-kompute/CMakeLists.txt +166 -0
  472. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/common.comp +112 -0
  473. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_add.comp +58 -0
  474. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_addrow.comp +25 -0
  475. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f16.comp +52 -0
  476. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f32.comp +52 -0
  477. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f16.comp +52 -0
  478. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f32.comp +52 -0
  479. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_diagmask.comp +30 -0
  480. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_gelu.comp +22 -0
  481. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows.comp +17 -0
  482. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f16.comp +31 -0
  483. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f32.comp +31 -0
  484. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_0.comp +38 -0
  485. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_1.comp +39 -0
  486. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q6_k.comp +44 -0
  487. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul.comp +52 -0
  488. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_f16.comp +69 -0
  489. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_mat_f32.comp +51 -0
  490. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_0.comp +33 -0
  491. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_1.comp +35 -0
  492. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_k.comp +140 -0
  493. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q6_k.comp +106 -0
  494. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q8_0.comp +73 -0
  495. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n.comp +52 -0
  496. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n_pre.comp +28 -0
  497. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_norm.comp +84 -0
  498. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_relu.comp +21 -0
  499. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rmsnorm.comp +53 -0
  500. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f16.comp +52 -0
  501. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f32.comp +52 -0
  502. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f16.comp +52 -0
  503. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f32.comp +52 -0
  504. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_scale.comp +19 -0
  505. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_scale_8.comp +23 -0
  506. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_silu.comp +22 -0
  507. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_softmax.comp +72 -0
  508. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/rope_common.comp +71 -0
  509. data/ext/sources/ggml/src/ggml-metal/CMakeLists.txt +121 -0
  510. data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +649 -0
  511. data/ext/{ggml → sources/ggml}/src/ggml-metal/ggml-metal.m +2504 -1108
  512. data/ext/{ggml → sources/ggml}/src/ggml-metal/ggml-metal.metal +2102 -1463
  513. data/ext/sources/ggml/src/ggml-musa/CMakeLists.txt +113 -0
  514. data/ext/sources/ggml/src/ggml-musa/mudnn.cu +112 -0
  515. data/ext/sources/ggml/src/ggml-musa/mudnn.cuh +12 -0
  516. data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +110 -0
  517. data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +6494 -0
  518. data/ext/sources/ggml/src/ggml-opencl/kernels/add.cl +83 -0
  519. data/ext/sources/ggml/src/ggml-opencl/kernels/argsort.cl +86 -0
  520. data/ext/sources/ggml/src/ggml-opencl/kernels/clamp.cl +20 -0
  521. data/ext/sources/ggml/src/ggml-opencl/kernels/concat.cl +109 -0
  522. data/ext/sources/ggml/src/ggml-opencl/kernels/cpy.cl +184 -0
  523. data/ext/sources/ggml/src/ggml-opencl/kernels/cvt.cl +118 -0
  524. data/ext/sources/ggml/src/ggml-opencl/kernels/diag_mask_inf.cl +58 -0
  525. data/ext/sources/ggml/src/ggml-opencl/kernels/div.cl +72 -0
  526. data/ext/sources/ggml/src/ggml-opencl/kernels/embed_kernel.py +26 -0
  527. data/ext/sources/ggml/src/ggml-opencl/kernels/gelu.cl +62 -0
  528. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle.cl +268 -0
  529. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_general.cl +274 -0
  530. data/ext/sources/ggml/src/ggml-opencl/kernels/get_rows.cl +163 -0
  531. data/ext/sources/ggml/src/ggml-opencl/kernels/glu.cl +201 -0
  532. data/ext/sources/ggml/src/ggml-opencl/kernels/group_norm.cl +72 -0
  533. data/ext/sources/ggml/src/ggml-opencl/kernels/im2col_f16.cl +57 -0
  534. data/ext/sources/ggml/src/ggml-opencl/kernels/im2col_f32.cl +57 -0
  535. data/ext/sources/ggml/src/ggml-opencl/kernels/mul.cl +79 -0
  536. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mat_Ab_Bi_8x4.cl +139 -0
  537. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_f16_f16.cl +118 -0
  538. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32.cl +118 -0
  539. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32_1row.cl +94 -0
  540. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32_l4.cl +84 -0
  541. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_f32_f32.cl +118 -0
  542. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl +283 -0
  543. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32.cl +192 -0
  544. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_1d_16x_flat.cl +307 -0
  545. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_1d_8x_flat.cl +265 -0
  546. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_8x_flat.cl +272 -0
  547. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_v.cl +254 -0
  548. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q6_k.cl +190 -0
  549. data/ext/sources/ggml/src/ggml-opencl/kernels/norm.cl +81 -0
  550. data/ext/sources/ggml/src/ggml-opencl/kernels/pad.cl +30 -0
  551. data/ext/sources/ggml/src/ggml-opencl/kernels/relu.cl +16 -0
  552. data/ext/sources/ggml/src/ggml-opencl/kernels/repeat.cl +39 -0
  553. data/ext/sources/ggml/src/ggml-opencl/kernels/rms_norm.cl +96 -0
  554. data/ext/sources/ggml/src/ggml-opencl/kernels/rope.cl +721 -0
  555. data/ext/sources/ggml/src/ggml-opencl/kernels/scale.cl +16 -0
  556. data/ext/sources/ggml/src/ggml-opencl/kernels/sigmoid.cl +29 -0
  557. data/ext/sources/ggml/src/ggml-opencl/kernels/silu.cl +30 -0
  558. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_4_f16.cl +87 -0
  559. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_4_f32.cl +87 -0
  560. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_f16.cl +86 -0
  561. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_f32.cl +86 -0
  562. data/ext/sources/ggml/src/ggml-opencl/kernels/sub.cl +72 -0
  563. data/ext/sources/ggml/src/ggml-opencl/kernels/sum_rows.cl +39 -0
  564. data/ext/sources/ggml/src/ggml-opencl/kernels/tanh.cl +63 -0
  565. data/ext/sources/ggml/src/ggml-opencl/kernels/transpose.cl +84 -0
  566. data/ext/sources/ggml/src/ggml-opencl/kernels/tsembd.cl +48 -0
  567. data/ext/sources/ggml/src/ggml-opencl/kernels/upscale.cl +121 -0
  568. data/ext/{ggml → sources/ggml}/src/ggml-opt.cpp +373 -190
  569. data/ext/{ggml → sources/ggml}/src/ggml-quants.c +120 -128
  570. data/ext/sources/ggml/src/ggml-rpc/CMakeLists.txt +9 -0
  571. data/ext/{ggml → sources/ggml}/src/ggml-rpc/ggml-rpc.cpp +494 -84
  572. data/ext/sources/ggml/src/ggml-sycl/CMakeLists.txt +189 -0
  573. data/ext/sources/ggml/src/ggml-sycl/backend.hpp +37 -0
  574. data/ext/sources/ggml/src/ggml-sycl/binbcast.cpp +344 -0
  575. data/ext/sources/ggml/src/ggml-sycl/binbcast.hpp +39 -0
  576. data/ext/{ggml → sources/ggml}/src/ggml-sycl/common.cpp +20 -32
  577. data/ext/sources/ggml/src/ggml-sycl/common.hpp +561 -0
  578. data/ext/{ggml → sources/ggml}/src/ggml-sycl/concat.cpp +56 -70
  579. data/ext/sources/ggml/src/ggml-sycl/concat.hpp +20 -0
  580. data/ext/{ggml → sources/ggml}/src/ggml-sycl/conv.cpp +8 -12
  581. data/ext/sources/ggml/src/ggml-sycl/conv.hpp +20 -0
  582. data/ext/sources/ggml/src/ggml-sycl/convert.cpp +575 -0
  583. data/ext/sources/ggml/src/ggml-sycl/convert.hpp +34 -0
  584. data/ext/sources/ggml/src/ggml-sycl/cpy.cpp +839 -0
  585. data/ext/sources/ggml/src/ggml-sycl/cpy.hpp +11 -0
  586. data/ext/sources/ggml/src/ggml-sycl/dequantize.hpp +823 -0
  587. data/ext/{ggml → sources/ggml}/src/ggml-sycl/dmmv.cpp +188 -67
  588. data/ext/sources/ggml/src/ggml-sycl/dmmv.hpp +27 -0
  589. data/ext/sources/ggml/src/ggml-sycl/dpct/helper.hpp +2987 -0
  590. data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +1120 -0
  591. data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +84 -0
  592. data/ext/sources/ggml/src/ggml-sycl/gemm.hpp +102 -0
  593. data/ext/sources/ggml/src/ggml-sycl/getrows.cpp +212 -0
  594. data/ext/sources/ggml/src/ggml-sycl/getrows.hpp +20 -0
  595. data/ext/{ggml → sources/ggml}/src/ggml-sycl/ggml-sycl.cpp +1197 -1295
  596. data/ext/sources/ggml/src/ggml-sycl/gla.cpp +106 -0
  597. data/ext/sources/ggml/src/ggml-sycl/gla.hpp +8 -0
  598. data/ext/sources/ggml/src/ggml-sycl/im2col.cpp +136 -0
  599. data/ext/sources/ggml/src/ggml-sycl/im2col.hpp +21 -0
  600. data/ext/{ggml → sources/ggml}/src/ggml-sycl/mmq.cpp +60 -81
  601. data/ext/sources/ggml/src/ggml-sycl/mmq.hpp +33 -0
  602. data/ext/sources/ggml/src/ggml-sycl/mmvq.cpp +1065 -0
  603. data/ext/sources/ggml/src/ggml-sycl/mmvq.hpp +27 -0
  604. data/ext/sources/ggml/src/ggml-sycl/norm.cpp +482 -0
  605. data/ext/sources/ggml/src/ggml-sycl/norm.hpp +26 -0
  606. data/ext/{ggml → sources/ggml}/src/ggml-sycl/outprod.cpp +8 -17
  607. data/ext/sources/ggml/src/ggml-sycl/outprod.hpp +10 -0
  608. data/ext/sources/ggml/src/ggml-sycl/presets.hpp +74 -0
  609. data/ext/sources/ggml/src/ggml-sycl/quants.hpp +111 -0
  610. data/ext/sources/ggml/src/ggml-sycl/rope.cpp +472 -0
  611. data/ext/sources/ggml/src/ggml-sycl/rope.hpp +20 -0
  612. data/ext/{ggml → sources/ggml}/src/ggml-sycl/softmax.cpp +38 -28
  613. data/ext/sources/ggml/src/ggml-sycl/softmax.hpp +20 -0
  614. data/ext/sources/ggml/src/ggml-sycl/sycl_hw.cpp +15 -0
  615. data/ext/sources/ggml/src/ggml-sycl/sycl_hw.hpp +26 -0
  616. data/ext/{ggml → sources/ggml}/src/ggml-sycl/tsembd.cpp +6 -11
  617. data/ext/sources/ggml/src/ggml-sycl/tsembd.hpp +20 -0
  618. data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +1307 -0
  619. data/ext/sources/ggml/src/ggml-sycl/wkv.cpp +289 -0
  620. data/ext/sources/ggml/src/ggml-sycl/wkv.hpp +10 -0
  621. data/ext/sources/ggml/src/ggml-vulkan/CMakeLists.txt +200 -0
  622. data/ext/sources/ggml/src/ggml-vulkan/cmake/host-toolchain.cmake.in +15 -0
  623. data/ext/{ggml → sources/ggml}/src/ggml-vulkan/ggml-vulkan.cpp +3822 -1335
  624. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +31 -0
  625. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/acc.comp +29 -0
  626. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add.comp +29 -0
  627. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argmax.comp +51 -0
  628. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp +69 -0
  629. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/clamp.comp +17 -0
  630. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/concat.comp +41 -0
  631. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/contig_copy.comp +49 -0
  632. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_dw.comp +105 -0
  633. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +98 -0
  634. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy.comp +23 -0
  635. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +51 -0
  636. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +242 -0
  637. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/cos.comp +17 -0
  638. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/count_equal.comp +31 -0
  639. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_f32.comp +20 -0
  640. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.comp +462 -0
  641. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.comp +699 -0
  642. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_head.comp +13 -0
  643. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_m.comp +42 -0
  644. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_s.comp +35 -0
  645. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_s.comp +44 -0
  646. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xs.comp +43 -0
  647. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xxs.comp +48 -0
  648. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_s.comp +39 -0
  649. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_xxs.comp +49 -0
  650. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_nl.comp +32 -0
  651. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_xs.comp +34 -0
  652. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q2_k.comp +34 -0
  653. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q3_k.comp +42 -0
  654. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_0.comp +30 -0
  655. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_1.comp +32 -0
  656. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_k.comp +68 -0
  657. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_0.comp +34 -0
  658. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_1.comp +35 -0
  659. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_k.comp +70 -0
  660. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q6_k.comp +33 -0
  661. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q8_0.comp +31 -0
  662. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/diag_mask_inf.comp +34 -0
  663. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/div.comp +27 -0
  664. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +337 -0
  665. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp +162 -0
  666. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +360 -0
  667. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +267 -0
  668. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +59 -0
  669. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu.comp +13 -0
  670. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gelu.comp +25 -0
  671. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gelu_quick.comp +23 -0
  672. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/generic_binary_head.comp +64 -0
  673. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/generic_head.comp +9 -0
  674. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/generic_unary_head.comp +76 -0
  675. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/get_rows.comp +33 -0
  676. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/get_rows_quant.comp +41 -0
  677. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.comp +15 -0
  678. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_main.comp +29 -0
  679. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/group_norm.comp +66 -0
  680. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +100 -0
  681. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/l2_norm.comp +41 -0
  682. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/leaky_relu.comp +22 -0
  683. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul.comp +27 -0
  684. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_split_k_reduce.comp +48 -0
  685. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp +169 -0
  686. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.comp +118 -0
  687. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_m.comp +82 -0
  688. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_s.comp +79 -0
  689. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_s.comp +90 -0
  690. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xs.comp +87 -0
  691. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xxs.comp +87 -0
  692. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_s.comp +90 -0
  693. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_xxs.comp +88 -0
  694. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_nc.comp +118 -0
  695. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_p021.comp +154 -0
  696. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp +130 -0
  697. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q3_k.comp +132 -0
  698. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp +136 -0
  699. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp +167 -0
  700. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp +130 -0
  701. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +868 -0
  702. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +441 -0
  703. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +442 -0
  704. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.comp +99 -0
  705. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/norm.comp +44 -0
  706. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_adamw.comp +42 -0
  707. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/pad.comp +28 -0
  708. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/pool2d.comp +74 -0
  709. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/quantize_q8_1.comp +77 -0
  710. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/reglu.comp +9 -0
  711. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/relu.comp +21 -0
  712. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/repeat.comp +26 -0
  713. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/repeat_back.comp +37 -0
  714. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +61 -0
  715. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_back.comp +55 -0
  716. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.comp +58 -0
  717. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +60 -0
  718. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +43 -0
  719. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +43 -0
  720. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_vision.comp +47 -0
  721. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp +24 -0
  722. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sigmoid.comp +20 -0
  723. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/silu.comp +22 -0
  724. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/silu_back.comp +26 -0
  725. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sin.comp +17 -0
  726. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +173 -0
  727. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_back.comp +50 -0
  728. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/square.comp +17 -0
  729. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sub.comp +29 -0
  730. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.comp +37 -0
  731. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/swiglu.comp +9 -0
  732. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/tanh.comp +20 -0
  733. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/test_bfloat16_support.comp +7 -0
  734. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/test_coopmat2_support.comp +7 -0
  735. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/test_coopmat_support.comp +7 -0
  736. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/test_integer_dot_support.comp +7 -0
  737. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/timestep_embedding.comp +41 -0
  738. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/types.comp +1373 -0
  739. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +36 -0
  740. data/ext/{ggml → sources/ggml}/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +203 -36
  741. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/wkv6.comp +87 -0
  742. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/wkv7.comp +91 -0
  743. data/ext/{ggml → sources/ggml}/src/ggml.c +918 -1782
  744. data/ext/sources/ggml/src/ggml.cpp +26 -0
  745. data/ext/sources/ggml/src/gguf.cpp +1351 -0
  746. data/ext/{include → sources/include}/whisper.h +70 -2
  747. data/ext/sources/src/CMakeLists.txt +145 -0
  748. data/ext/sources/src/coreml/whisper-compat.h +10 -0
  749. data/ext/sources/src/coreml/whisper-compat.m +35 -0
  750. data/ext/{src → sources/src}/coreml/whisper-decoder-impl.h +27 -15
  751. data/ext/{src → sources/src}/coreml/whisper-decoder-impl.m +36 -10
  752. data/ext/{src → sources/src}/coreml/whisper-encoder-impl.h +21 -9
  753. data/ext/{src → sources/src}/coreml/whisper-encoder-impl.m +29 -3
  754. data/ext/sources/src/coreml/whisper-encoder.mm +73 -0
  755. data/ext/sources/src/whisper-arch.h +197 -0
  756. data/ext/{src → sources/src}/whisper.cpp +1966 -386
  757. data/ext/sources/tests/CMakeLists.txt +105 -0
  758. data/ext/sources/tests/earnings21/eval.mk +58 -0
  759. data/ext/sources/tests/earnings21/eval.py +68 -0
  760. data/ext/sources/tests/earnings21/normalizers/__init__.py +2 -0
  761. data/ext/sources/tests/earnings21/normalizers/basic.py +80 -0
  762. data/ext/sources/tests/earnings21/normalizers/english.json +1741 -0
  763. data/ext/sources/tests/earnings21/normalizers/english.py +550 -0
  764. data/ext/sources/tests/earnings21/requirements.txt +6 -0
  765. data/ext/sources/tests/en-0-ref.txt +1 -0
  766. data/ext/sources/tests/en-1-ref.txt +1 -0
  767. data/ext/sources/tests/en-2-ref.txt +1 -0
  768. data/ext/sources/tests/es-0-ref.txt +1 -0
  769. data/ext/sources/tests/librispeech/eval.mk +39 -0
  770. data/ext/sources/tests/librispeech/eval.py +47 -0
  771. data/ext/sources/tests/librispeech/normalizers/__init__.py +2 -0
  772. data/ext/sources/tests/librispeech/normalizers/basic.py +80 -0
  773. data/ext/sources/tests/librispeech/normalizers/english.json +1741 -0
  774. data/ext/sources/tests/librispeech/normalizers/english.py +550 -0
  775. data/ext/sources/tests/librispeech/requirements.txt +6 -0
  776. data/ext/sources/tests/run-tests.sh +130 -0
  777. data/ext/sources/tests/test-c.c +3 -0
  778. data/ext/sources/tests/test-vad-full.cpp +54 -0
  779. data/ext/sources/tests/test-vad.cpp +83 -0
  780. data/ext/sources/tests/test-whisper.js +58 -0
  781. data/extsources.rb +39 -5
  782. data/lib/whisper/context.rb +15 -0
  783. data/lib/whisper/model/uri.rb +202 -126
  784. data/lib/whisper/segment.rb +58 -0
  785. data/sig/whisper.rbs +510 -0
  786. data/test/helper.rb +24 -0
  787. data/{tests → test}/test_callback.rb +45 -3
  788. data/{tests → test}/test_error.rb +2 -2
  789. data/{tests → test}/test_model.rb +47 -0
  790. data/test/test_package.rb +51 -0
  791. data/test/test_params.rb +297 -0
  792. data/test/test_segment.rb +146 -0
  793. data/test/test_vad.rb +19 -0
  794. data/test/test_vad_params.rb +103 -0
  795. data/{tests → test}/test_whisper.rb +106 -36
  796. data/whispercpp.gemspec +5 -5
  797. metadata +837 -134
  798. data/ext/cpu.mk +0 -9
  799. data/ext/examples/dr_wav.h +0 -8815
  800. data/ext/ggml/src/ggml-cann/aclnn_ops.h +0 -592
  801. data/ext/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +0 -4262
  802. data/ext/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +0 -8
  803. data/ext/ggml/src/ggml-cpu/ggml-cpu-quants.c +0 -10835
  804. data/ext/ggml/src/ggml-cpu/ggml-cpu.c +0 -14123
  805. data/ext/ggml/src/ggml-cpu/llamafile/sgemm.cpp +0 -1884
  806. data/ext/ggml/src/ggml-cpu/llamafile/sgemm.h +0 -14
  807. data/ext/ggml/src/ggml-metal/ggml-metal-impl.h +0 -288
  808. data/ext/ggml/src/ggml-sycl/convert.cpp +0 -547
  809. data/ext/ggml/src/ggml-sycl/element_wise.cpp +0 -1030
  810. data/ext/ggml/src/ggml-sycl/im2col.cpp +0 -126
  811. data/ext/ggml/src/ggml-sycl/mmvq.cpp +0 -1015
  812. data/ext/ggml/src/ggml-sycl/norm.cpp +0 -378
  813. data/ext/ggml/src/ggml-sycl/rope.cpp +0 -276
  814. data/ext/ggml/src/ggml-sycl/wkv6.cpp +0 -141
  815. data/ext/metal-embed.mk +0 -17
  816. data/ext/metal.mk +0 -6
  817. data/ext/ruby_whisper.cpp +0 -1909
  818. data/ext/scripts/get-flags.mk +0 -38
  819. data/lib/whisper.rb +0 -2
  820. data/tests/helper.rb +0 -7
  821. data/tests/test_package.rb +0 -31
  822. data/tests/test_params.rb +0 -160
  823. data/tests/test_segment.rb +0 -83
  824. /data/ext/{ggml → sources/ggml}/include/ggml-blas.h +0 -0
  825. /data/ext/{ggml → sources/ggml}/include/ggml-cann.h +0 -0
  826. /data/ext/{ggml → sources/ggml}/include/ggml-cuda.h +0 -0
  827. /data/ext/{ggml → sources/ggml}/include/ggml-kompute.h +0 -0
  828. /data/ext/{ggml → sources/ggml}/include/ggml-opencl.h +0 -0
  829. /data/ext/{ggml → sources/ggml}/include/ggml-sycl.h +0 -0
  830. /data/ext/{ggml → sources/ggml}/src/ggml-amx/common.h +0 -0
  831. /data/ext/{ggml → sources/ggml}/src/ggml-amx/ggml-amx.cpp +0 -0
  832. /data/ext/{ggml → sources/ggml}/src/ggml-amx/mmq.cpp +0 -0
  833. /data/ext/{ggml → sources/ggml}/src/ggml-amx/mmq.h +0 -0
  834. /data/ext/{ggml → sources/ggml}/src/ggml-blas/ggml-blas.cpp +0 -0
  835. /data/ext/{ggml → sources/ggml}/src/ggml-cann/kernels/ascendc_kernels.h +0 -0
  836. /data/ext/{ggml → sources/ggml}/src/ggml-cann/kernels/get_row_f16.cpp +0 -0
  837. /data/ext/{ggml → sources/ggml}/src/ggml-cann/kernels/get_row_f32.cpp +0 -0
  838. /data/ext/{ggml → sources/ggml}/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -0
  839. /data/ext/{ggml → sources/ggml}/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -0
  840. /data/ext/{ggml → sources/ggml}/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -0
  841. /data/ext/{ggml → sources/ggml}/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -0
  842. /data/ext/{ggml → sources/ggml}/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -0
  843. /data/ext/{ggml → sources/ggml}/src/ggml-cpu/amx/amx.h +0 -0
  844. /data/ext/{ggml → sources/ggml}/src/ggml-cpu/amx/common.h +0 -0
  845. /data/ext/{ggml → sources/ggml}/src/ggml-cpu/amx/mmq.h +0 -0
  846. /data/ext/{ggml/src/ggml-cpu/ggml-cpu-hbm.h → sources/ggml/src/ggml-cpu/hbm.h} +0 -0
  847. /data/ext/{ggml/src/ggml-cpu/ggml-cpu-traits.h → sources/ggml/src/ggml-cpu/traits.h} +0 -0
  848. /data/ext/{ggml → sources/ggml}/src/ggml-kompute/ggml-kompute.cpp +0 -0
  849. /data/ext/{ggml → sources/ggml}/src/ggml-quants.h +0 -0
  850. /data/ext/{ggml → sources/ggml}/src/ggml-threading.cpp +0 -0
  851. /data/ext/{ggml → sources/ggml}/src/ggml-threading.h +0 -0
  852. /data/ext/{src → sources/src}/coreml/whisper-encoder.h +0 -0
  853. /data/ext/{src → sources/src}/openvino/whisper-openvino-encoder.cpp +0 -0
  854. /data/ext/{src → sources/src}/openvino/whisper-openvino-encoder.h +0 -0
  855. /data/{tests → test}/jfk_reader/.gitignore +0 -0
  856. /data/{tests → test}/jfk_reader/extconf.rb +0 -0
  857. /data/{tests → test}/jfk_reader/jfk_reader.c +0 -0
@@ -29,11 +29,16 @@
29
29
  #include <cstdio>
30
30
  #include <cstring>
31
31
  #include <mutex>
32
+ #include <queue>
33
+ #include <chrono>
34
+ #include <unordered_set>
35
+ #include <optional>
32
36
 
33
37
  #include "ggml-impl.h"
34
38
  #include "ggml-backend-impl.h"
35
39
  #include "ggml-cann/aclnn_ops.h"
36
40
  #include "ggml-cann/common.h"
41
+ #include "ggml.h"
37
42
 
38
43
  #define GGML_COMMON_DECL_C
39
44
 
@@ -90,6 +95,26 @@ int32_t ggml_cann_get_device() {
90
95
  return id;
91
96
  }
92
97
 
98
+ /**
99
+ * @brief Get the value of the specified environment variable (name).
100
+ * If the variable is set, return its value converted to lowercase as a std::string; otherwise return std::nullopt.
101
+ */
102
+ std::optional<std::string> get_env(const std::string& name) {
103
+ const char* val = std::getenv(name.c_str());
104
+ if (!val) return std::nullopt;
105
+ std::string res = std::string(val);
106
+ std::transform(res.begin(), res.end(), res.begin(), ::tolower);
107
+ return res;
108
+ }
109
+
110
+ /**
111
+ * @brief Check whether the given (lowercased) value is one of the accepted truthy strings: "on", "1", "yes", "y", "enable", "true".
112
+ */
113
+ bool parse_bool(const std::string& value) {
114
+ std::unordered_set<std::string> valid_values = {"on", "1", "yes", "y", "enable", "true"};
115
+ return valid_values.find(value) != valid_values.end();
116
+ }
117
+
93
118
  /**
94
119
  * @brief Initialize the CANN device information.
95
120
  *
@@ -119,9 +144,10 @@ static ggml_cann_device_info ggml_cann_init() {
119
144
  prop.location.type = ACL_MEM_LOCATION_TYPE_DEVICE;
120
145
  prop.location.id = id;
121
146
  prop.reserve = 0;
122
- ACL_CHECK(aclrtMemGetAllocationGranularity(
147
+ err = aclrtMemGetAllocationGranularity(
123
148
  &prop, ACL_RT_MEM_ALLOC_GRANULARITY_RECOMMENDED,
124
- &info.devices[id].vmm_granularity));
149
+ &info.devices[id].vmm_granularity);
150
+ info.devices[id].vmm = err == ACL_SUCCESS;
125
151
 
126
152
  size_t free, total;
127
153
  ggml_backend_cann_get_device_memory(id, &free, &total);
@@ -148,11 +174,223 @@ const ggml_cann_device_info& ggml_cann_info() {
148
174
 
149
175
  //#define DEBUG_CANN_MALLOC
150
176
  /**
151
- * @brief A pool of CANN buffers(legacy).
177
+ * @brief A pool of CANN buffers(priority segment buffer).
152
178
  *
153
179
  * This class manages a pool of CANN buffers for a specific device.
154
180
  */
155
- struct ggml_cann_pool_leg : public ggml_cann_pool {
181
+ struct ggml_cann_pool_buf_prio : public ggml_cann_pool {
182
+ /**
183
+ * @brief The maximum reuse margin for a buffer.
184
+ */
185
+ static const size_t max_reuse_margin = 1ull << 22; // 4MB
186
+
187
+ /**
188
+ * @brief The minimum free margin for a buffer.
189
+ */
190
+ static const size_t min_free_margin = 1ull << 20; // 1MB
191
+
192
+ /**
193
+ * @brief The alignment for buffer allocation.
194
+ */
195
+ static const size_t alignment = 128;
196
+
197
+ /**
198
+ * @brief The device ID associated with this buffer pool.
199
+ */
200
+ int device;
201
+
202
+ /**
203
+ * @brief Whether to disable clean during buffer allocation.
204
+ */
205
+ bool disable_clean = false;
206
+
207
+ /**
208
+ * @brief Structure representing a CANN buffer.
209
+ */
210
+ struct ggml_cann_buffer {
211
+ void* ptr = nullptr; ///< Pointer to the buffer.
212
+ size_t size = 0; ///< Size of the buffer.
213
+ std::chrono::steady_clock::time_point last_used; ///< Last used time.
214
+
215
+ bool operator>(const ggml_cann_buffer& other) const {
216
+ return size > other.size;
217
+ }
218
+ };
219
+
220
+ /**
221
+ * @brief Map of every buffer owned by the pool (pointer -> size), plus a priority queue of free buffers ordered so the smallest is on top.
222
+ */
223
+ std::unordered_map<void*, size_t> buffer_pool;
224
+ std::priority_queue<ggml_cann_buffer,
225
+ std::vector<ggml_cann_buffer>,
226
+ std::greater<>> free_buffers ;
227
+
228
+ /**
229
+ * @brief Total size of all buffers in the pool.
230
+ */
231
+ size_t pool_size = 0;
232
+
233
+ /**
234
+ * @brief Constructor to initialize the buffer pool for a specific device.
235
+ *
236
+ * @param device The device ID to associate with this buffer pool.
237
+ */
238
+ explicit ggml_cann_pool_buf_prio(int device) : device(device) {
239
+ disable_clean = parse_bool(get_env("GGML_CANN_DISABLE_BUF_POOL_CLEAN").value_or(""));
240
+ }
241
+
242
+ /**
243
+ * @brief Destructor to free all buffers in the pool.
244
+ */
245
+ ~ggml_cann_pool_buf_prio() {
246
+ ggml_cann_set_device(device);
247
+ for (auto& [b_ptr, b_size] : buffer_pool) {
248
+ aclrtFree(b_ptr);
249
+ pool_size -= b_size;
250
+ }
251
+ buffer_pool.clear();
252
+ GGML_ASSERT(pool_size == 0);
253
+ }
254
+
255
+ /**
256
+ * @brief Allocate a buffer of the given size.
257
+ *
258
+ * @param size The size of the buffer to allocate.
259
+ * @param actual_size A pointer to a variable to receive the actual size of
260
+ * the allocated buffer.
261
+ * @return A pointer to the allocated buffer.
262
+ */
263
+ void* alloc(size_t size, size_t* actual_size) override {
264
+ size = GGML_PAD(size, alignment);
265
+ if (size == 0) {
266
+ size = alignment;
267
+ }
268
+
269
+ void* ptr = nullptr;
270
+ auto now = std::chrono::steady_clock::now();
271
+
272
+ std::vector<ggml_cann_buffer> free_buffers_rest;
273
+ free_buffers_rest.reserve(free_buffers.size());
274
+ while (!free_buffers.empty()) {
275
+ auto b = free_buffers.top();
276
+ free_buffers.pop();
277
+
278
+ if (b.size >= size) {
279
+ // reuse the buffer if the size is enough
280
+ const size_t margin = b.size - size;
281
+ if (margin <= max_reuse_margin) {
282
+ *actual_size = b.size;
283
+ ptr = b.ptr;
284
+ #ifdef DEBUG_CANN_MALLOC
285
+ GGML_LOG_INFO(
286
+ "cann pool[%d]: reused %p, "
287
+ "pool_size = %5u MB, "
288
+ "size = %5u MB, "
289
+ "margin = %5u MB\n",
290
+ device, b.ptr,
291
+ (uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576),
292
+ (uint32_t)(GGML_PAD(size, 1048576) / 1048576),
293
+ (uint32_t)(GGML_PAD(margin, 1048576) / 1048576));
294
+ #endif
295
+ break;
296
+ }
297
+ }
298
+
299
+ bool should_clean = !disable_clean &&
300
+ b.size > min_free_margin &&
301
+ std::chrono::duration_cast<std::chrono::milliseconds>(now - b.last_used).count() > 100;
302
+ if (should_clean) {
303
+ // release the idle buffer: it is larger than min_free_margin and has been unused for more than 100 ms
304
+ ACL_CHECK(aclrtFree(b.ptr));
305
+ pool_size -= b.size;
306
+ buffer_pool.erase(b.ptr);
307
+ #ifdef DEBUG_CANN_MALLOC
308
+ GGML_LOG_INFO(
309
+ "cann pool[%d]: clean %p, "
310
+ "pool_size = %5u MB, "
311
+ "size = %5u MB\n",
312
+ device, b.ptr,
313
+ (uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576),
314
+ (uint32_t)(GGML_PAD(b.size, 1048576) / 1048576));
315
+ #endif
316
+ continue;
317
+ }
318
+ free_buffers_rest.push_back(b);
319
+ }
320
+ for (ggml_cann_buffer &b : free_buffers_rest) {
321
+ free_buffers.push(std::move(b));
322
+ }
323
+
324
+ #ifdef DEBUG_CANN_MALLOC
325
+ GGML_LOG_INFO("cann pool[%d] free pool_size = %5u MB\n\n", device, (uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576));
326
+ #endif
327
+ if (ptr != nullptr) {
328
+ return ptr;
329
+ }
330
+
331
+ // allocate a new buffer if no buffer can be reused
332
+ ggml_cann_set_device(device);
333
+ ACL_CHECK(aclrtMalloc(&ptr, size, ACL_MEM_MALLOC_HUGE_FIRST));
334
+ *actual_size = size;
335
+ pool_size += size;
336
+ #ifdef DEBUG_CANN_MALLOC
337
+ GGML_LOG_INFO(
338
+ "cann pool[%d]: allocate %p, "
339
+ "pool_size = %5u MB, "
340
+ "size = %5u MB\n",
341
+ device, ptr, (uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576),
342
+ (uint32_t)(GGML_PAD(size, 1048576) / 1048576));
343
+ #endif
344
+ buffer_pool.emplace(ptr, size);
345
+ return ptr;
346
+ }
347
+
348
+ /**
349
+ * @brief Free a buffer and return it to the pool.
350
+ *
351
+ * @param ptr Pointer to the buffer to free.
352
+ * @param size Size of the buffer to free.
353
+ */
354
+ void free(void* ptr, size_t size) override {
355
+ GGML_UNUSED(size);
356
+ auto it = buffer_pool.find(ptr);
357
+ if (it == buffer_pool.end()) {
358
+ GGML_ABORT("cann pool[%d]: buffer %p not found in pool\n", device, ptr);
359
+ }
360
+
361
+ auto now = std::chrono::steady_clock::now();
362
+ free_buffers.emplace(ggml_cann_buffer{ptr, it->second, now});
363
+ #ifdef DEBUG_CANN_MALLOC
364
+ GGML_LOG_INFO(
365
+ "cann pool[%d]: return %p, "
366
+ "pool_size = %5u MB\n",
367
+ device, ptr,
368
+ (uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576));
369
+ #endif
370
+ }
371
+ };
372
+
373
+ /**
374
+ * @brief A pool of CANN buffers(segment buffer).
375
+ *
376
+ * This class manages a pool of CANN buffers for a specific device.
377
+ */
378
+ struct ggml_cann_pool_buf : public ggml_cann_pool {
379
+ /**
380
+ * @brief The maximum reuse margin for a buffer.
381
+ */
382
+ static const size_t max_reuse_margin = 1ull << 22; // 4MB
383
+
384
+ /**
385
+ * @brief The minimum free margin for a buffer.
386
+ */
387
+ static const size_t min_free_margin = 1ull << 20; // 1MB
388
+
389
+ /**
390
+ * @brief The alignment for buffer allocation.
391
+ */
392
+ static const size_t alignment = 128;
393
+
156
394
  /**
157
395
  * @brief The maximum number of buffers in the pool.
158
396
  */
@@ -163,12 +401,19 @@ struct ggml_cann_pool_leg : public ggml_cann_pool {
163
401
  */
164
402
  int device;
165
403
 
404
+ /**
405
+ * @brief Whether to disable clean during buffer allocation.
406
+ */
407
+ bool disable_clean = false;
408
+
166
409
  /**
167
410
  * @brief Structure representing a CANN buffer.
168
411
  */
169
412
  struct ggml_cann_buffer {
170
413
  void* ptr = nullptr; ///< Pointer to the buffer memory.
171
414
  size_t size = 0; ///< Size of the buffer.
415
+ bool used = false; ///< Whether the buffer is currently in use.
416
+ std::chrono::steady_clock::time_point last_used; ///< Last used time.
172
417
  };
173
418
 
174
419
  /**
@@ -186,17 +431,19 @@ struct ggml_cann_pool_leg : public ggml_cann_pool {
186
431
  *
187
432
  * @param device The device ID to associate with this buffer pool.
188
433
  */
189
- explicit ggml_cann_pool_leg(int device) : device(device) {}
434
+ explicit ggml_cann_pool_buf(int device) : device(device) {
435
+ disable_clean = parse_bool(get_env("GGML_CANN_DISABLE_BUF_POOL_CLEAN").value_or(""));
436
+ }
190
437
 
191
438
  /**
192
439
  * @brief Destructor to free all buffers in the pool.
193
440
  */
194
- ~ggml_cann_pool_leg() {
441
+ ~ggml_cann_pool_buf() {
195
442
  ggml_cann_set_device(device);
196
443
  for (int i = 0; i < MAX_BUFFERS; ++i) {
197
444
  ggml_cann_buffer& b = buffer_pool[i];
198
445
  if (b.ptr != nullptr) {
199
- ACL_CHECK(aclrtFree(b.ptr));
446
+ aclrtFree(b.ptr);
200
447
  pool_size -= b.size;
201
448
  }
202
449
  }
@@ -212,63 +459,93 @@ struct ggml_cann_pool_leg : public ggml_cann_pool {
212
459
  * @return A pointer to the allocated buffer.
213
460
  */
214
461
  void* alloc(size_t size, size_t* actual_size) override {
215
- const size_t alignment = 128;
216
462
  size = GGML_PAD(size, alignment);
217
463
  if (size == 0) {
218
464
  size = alignment;
219
465
  }
220
- #ifdef DEBUG_CANN_MALLOC
221
- int nnz = 0;
222
- size_t max_size = 0;
223
- #endif
224
- size_t best_diff = 1ull << 36;
225
- int ibest = -1;
226
- for (int i = 0; i < MAX_BUFFERS; ++i) {
466
+
467
+ void* ptr = nullptr;
468
+ auto now = std::chrono::steady_clock::now();
469
+
470
+ int i = 0;
471
+ for (; i < MAX_BUFFERS; ++i) {
227
472
  ggml_cann_buffer& b = buffer_pool[i];
228
- if (b.ptr != nullptr) {
473
+ if (b.ptr == nullptr) {
474
+ break;
475
+ }
476
+ if (b.used) {
477
+ continue;
478
+ }
479
+ if (b.size >= size) {
480
+ // reuse the buffer if the size is enough
481
+ const size_t margin = b.size - size;
482
+ if (margin <= max_reuse_margin) {
483
+ *actual_size = b.size;
484
+ b.used = true;
485
+ ptr = b.ptr;
229
486
  #ifdef DEBUG_CANN_MALLOC
230
- ++nnz;
231
- if (b.size > max_size) max_size = b.size;
487
+ GGML_LOG_INFO(
488
+ "cann pool[%d]: reused %p, "
489
+ "pool_size = %5u MB, "
490
+ "size = %5u MB, "
491
+ "margin = %5u MB\n",
492
+ device, b.ptr,
493
+ (uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576),
494
+ (uint32_t)(GGML_PAD(size, 1048576) / 1048576),
495
+ (uint32_t)(GGML_PAD(margin, 1048576) / 1048576));
232
496
  #endif
233
- if (b.size >= size) {
234
- size_t diff = b.size - size;
235
- if (diff < best_diff) {
236
- best_diff = diff;
237
- ibest = i;
238
- if (!best_diff) {
239
- void* ptr = b.ptr;
240
- *actual_size = b.size;
241
- b.ptr = nullptr;
242
- b.size = 0;
243
- return ptr;
244
- }
245
- }
497
+ break;
246
498
  }
247
499
  }
500
+
501
+ bool should_clean = !disable_clean &&
502
+ b.size > min_free_margin &&
503
+ std::chrono::duration_cast<std::chrono::milliseconds>(now - b.last_used).count() > 100;
504
+ if (should_clean) {
505
+ // release the idle buffer: it is larger than min_free_margin and has been unused for more than 100 ms
506
+ ACL_CHECK(aclrtFree(b.ptr));
507
+ pool_size -= b.size;
508
+ #ifdef DEBUG_CANN_MALLOC
509
+ GGML_LOG_INFO(
510
+ "cann pool[%d]: clean %p, "
511
+ "pool_size = %5u MB, "
512
+ "size = %5u MB\n",
513
+ device, b.ptr,
514
+ (uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576),
515
+ (uint32_t)(GGML_PAD(b.size, 1048576) / 1048576));
516
+ #endif
517
+ b.ptr = nullptr;
518
+ }
248
519
  }
249
- if (ibest >= 0) {
250
- ggml_cann_buffer& b = buffer_pool[ibest];
251
- void* ptr = b.ptr;
252
- *actual_size = b.size;
253
- b.ptr = nullptr;
254
- b.size = 0;
520
+ if (ptr != nullptr) {
255
521
  return ptr;
256
522
  }
257
- void* ptr;
258
- ggml_cann_set_device(device);
259
- ACL_CHECK(
260
- aclrtMalloc(&ptr, size, ACL_MEM_MALLOC_HUGE_FIRST));
261
- *actual_size = size;
262
- pool_size += size;
523
+
524
+ if (i < MAX_BUFFERS) {
525
+ // allocate a new buffer if no buffer can be reused
526
+ ggml_cann_buffer& b = buffer_pool[i];
527
+ ggml_cann_set_device(device);
528
+ ACL_CHECK(aclrtMalloc(&b.ptr, size, ACL_MEM_MALLOC_HUGE_FIRST));
529
+ pool_size += size;
530
+ *actual_size = size;
531
+ b.size = size;
532
+ b.used = true;
533
+ if (i >= MAX_BUFFERS - 8) {
534
+ GGML_LOG_WARN("cann pool[%d]: slots almost full\n", device);
535
+ }
263
536
  #ifdef DEBUG_CANN_MALLOC
264
- GGML_LOG_INFO(
265
- "%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, "
266
- "requested %u MB\n",
267
- __func__, device, nnz, (uint32_t)(max_size / 1024 / 1024),
268
- (uint32_t)(pool_size / 1024 / 1024),
269
- (uint32_t)(size / 1024 / 1024));
537
+ GGML_LOG_INFO(
538
+ "cann pool[%d]: allocate %p, "
539
+ "pool_size = %5u MB, "
540
+ "size = %5u MB\n",
541
+ device, b.ptr,
542
+ (uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576),
543
+ (uint32_t)(GGML_PAD(b.size, 1048576) / 1048576));
270
544
  #endif
271
- return ptr;
545
+ return b.ptr;
546
+ }
547
+
548
+ GGML_ABORT("cann pool[%d]: slots full\n", device);
272
549
  }
273
550
 
274
551
  /**
@@ -278,18 +555,24 @@ struct ggml_cann_pool_leg : public ggml_cann_pool {
278
555
  * @param size Size of the buffer to free.
279
556
  */
280
557
  void free(void* ptr, size_t size) override {
558
+ GGML_UNUSED(size);
281
559
  for (int i = 0; i < MAX_BUFFERS; ++i) {
282
560
  ggml_cann_buffer& b = buffer_pool[i];
283
- if (b.ptr == nullptr) {
284
- b.ptr = ptr;
285
- b.size = size;
286
- return;
561
+ if (b.ptr != ptr) {
562
+ continue;
287
563
  }
564
+ b.used = false;
565
+ b.last_used = std::chrono::steady_clock::now();
566
+ #ifdef DEBUG_CANN_MALLOC
567
+ GGML_LOG_INFO(
568
+ "cann pool[%d]: return %p, "
569
+ "pool_size = %5u MB\n",
570
+ device, b.ptr,
571
+ (uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576));
572
+ #endif
573
+ return;
288
574
  }
289
- // memory should always buffered. these memory may still needed by
290
- // tasks in stream.
291
- // TODO, fix me.
292
- GGML_ABORT("Cann buffer pool full, increase MAX_CANN_BUFFERS\n");
575
+ GGML_ABORT("cann pool[%d]: slots full\n", device);
293
576
  }
294
577
  };
295
578
 
@@ -347,8 +630,7 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
347
630
  * @param device The device ID to associate with this buffer pool.
348
631
  */
349
632
  explicit ggml_cann_pool_vmm(int device)
350
- : device(device),
351
- granularity(ggml_cann_info().devices[device].vmm_granularity) {
633
+ : device(device) {
352
634
  auto dev = ggml_cann_info().devices[device];
353
635
  granularity = dev.vmm_granularity;
354
636
  max_size = dev.total_vram;
@@ -471,7 +753,20 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
471
753
  */
472
754
  std::unique_ptr<ggml_cann_pool> ggml_backend_cann_context::new_pool_for_device(
473
755
  int device) {
474
- return std::unique_ptr<ggml_cann_pool>(new ggml_cann_pool_vmm(device));
756
+ std::string mem_pool_type = get_env("GGML_CANN_MEM_POOL").value_or("");
757
+
758
+ if (mem_pool_type == "prio") {
759
+ GGML_LOG_INFO("%s: device %d use buffer pool with priority queue\n", __func__, device);
760
+ return std::unique_ptr<ggml_cann_pool>(new ggml_cann_pool_buf_prio(device));
761
+ }
762
+
763
+ if (ggml_cann_info().devices[device].vmm && mem_pool_type != "leg") {
764
+ GGML_LOG_INFO("%s: device %d use vmm pool\n", __func__, device);
765
+ return std::unique_ptr<ggml_cann_pool>(new ggml_cann_pool_vmm(device));
766
+ }
767
+
768
+ GGML_LOG_INFO("%s: device %d use buffer pool\n", __func__, device);
769
+ return std::unique_ptr<ggml_cann_pool>(new ggml_cann_pool_buf(device));
475
770
  }
476
771
 
477
772
  // cann buffer
@@ -796,14 +1091,14 @@ static bool need_transform(ggml_type type) {
796
1091
  * @param buffer The CANN buffer from which to initialize the tensor.
797
1092
  * @param tensor Pointer to the tensor to be initialized.
798
1093
  */
799
- static void ggml_backend_cann_buffer_init_tensor(
1094
+ static enum ggml_status ggml_backend_cann_buffer_init_tensor(
800
1095
  ggml_backend_buffer_t buffer, ggml_tensor* tensor) {
801
1096
  if (tensor->view_src != NULL && tensor->view_offs == 0) {
802
1097
  GGML_ASSERT(tensor->view_src->buffer->buft == buffer->buft);
803
- return;
1098
+ return GGML_STATUS_SUCCESS;
804
1099
  }
805
1100
 
806
- // TODO: can backend doesn't support quantized yet. Just leave the code
1101
+ // TODO: cann backend doesn't support quantized yet. Just leave the code
807
1102
  // here.
808
1103
  if (ggml_is_quantized(tensor->type)) {
809
1104
  // Initialize padding to 0 to avoid possible NaN values
@@ -817,6 +1112,7 @@ static void ggml_backend_cann_buffer_init_tensor(
817
1112
  memset_size, 0, memset_size));
818
1113
  }
819
1114
  }
1115
+ return GGML_STATUS_SUCCESS;
820
1116
  }
821
1117
 
822
1118
  // TODO: need handle tensor which has paddings.
@@ -1019,8 +1315,11 @@ ggml_backend_cann_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
1019
1315
 
1020
1316
  ggml_cann_set_device(buft_ctx->device);
1021
1317
 
1022
- size = std::max(size, (size_t)1);
1023
-
1318
+ const size_t alignment = 128;
1319
+ size = GGML_PAD(size, alignment);
1320
+ if (size == 0) {
1321
+ size = alignment;
1322
+ }
1024
1323
  void* dev_ptr;
1025
1324
  aclError err = aclrtMalloc(&dev_ptr, size, ACL_MEM_MALLOC_HUGE_FIRST);
1026
1325
  if (err != ACL_SUCCESS) {
@@ -1299,47 +1598,69 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
1299
1598
  ggml_cann_dup(ctx, dst);
1300
1599
  break;
1301
1600
  case GGML_OP_ADD:
1302
- ggml_cann_add(ctx, dst);
1601
+ case GGML_OP_ADD1:
1602
+ ggml_cann_binary_op<aclnn_add>(ctx, dst);
1603
+ break;
1604
+ case GGML_OP_SUB:
1605
+ ggml_cann_binary_op<aclnn_sub>(ctx, dst);
1303
1606
  break;
1304
1607
  case GGML_OP_ACC:
1305
1608
  ggml_cann_acc(ctx, dst);
1306
1609
  break;
1307
1610
  case GGML_OP_MUL:
1308
- ggml_cann_mul_div<aclnnMulGetWorkspaceSize, aclnnMul>(ctx, dst);
1611
+ ggml_cann_binary_op<aclnn_mul>(ctx, dst);
1309
1612
  break;
1310
1613
  case GGML_OP_DIV:
1311
- ggml_cann_mul_div<aclnnDivGetWorkspaceSize, aclnnDiv>(ctx, dst);
1614
+ ggml_cann_binary_op<aclnn_div>(ctx, dst);
1312
1615
  break;
1313
1616
  case GGML_OP_UNARY:
1314
1617
  switch (ggml_get_unary_op(dst)) {
1618
+ case GGML_UNARY_OP_ABS:
1619
+ GGML_CANN_CALL_UNARY_OP(Abs);
1620
+ break;
1621
+ case GGML_UNARY_OP_NEG:
1622
+ GGML_CANN_CALL_UNARY_OP(Neg);
1623
+ break;
1315
1624
  case GGML_UNARY_OP_GELU:
1316
- ggml_cann_activation<aclnnGeluGetWorkspaceSize, aclnnGelu>(
1317
- ctx, dst);
1625
+ GGML_CANN_CALL_UNARY_OP(Gelu);
1318
1626
  break;
1319
1627
  case GGML_UNARY_OP_SILU:
1320
- ggml_cann_activation<aclnnSiluGetWorkspaceSize, aclnnSilu>(
1321
- ctx, dst);
1322
- break;
1323
- // TODO: Use faster gelu??
1324
- case GGML_UNARY_OP_GELU_QUICK:
1325
- ggml_cann_activation<aclnnGeluGetWorkspaceSize, aclnnGelu>(
1326
- ctx, dst);
1628
+ GGML_CANN_CALL_UNARY_OP(Silu);
1327
1629
  break;
1630
+ case GGML_UNARY_OP_GELU_QUICK: {
1631
+ auto lambda = [](ggml_backend_cann_context& ctx,
1632
+ aclTensor* acl_src,
1633
+ aclTensor* acl_dst) {
1634
+ GGML_CANN_CALL_ACLNN_OP(ctx, GeluV2, acl_src, 0, acl_dst);
1635
+ };
1636
+ ggml_cann_unary_op(lambda, ctx, dst);
1637
+ } break;
1328
1638
  case GGML_UNARY_OP_TANH:
1329
- ggml_cann_activation<aclnnTanhGetWorkspaceSize, aclnnTanh>(
1330
- ctx, dst);
1639
+ GGML_CANN_CALL_UNARY_OP(Tanh);
1331
1640
  break;
1332
1641
  case GGML_UNARY_OP_RELU:
1333
- ggml_cann_activation<aclnnReluGetWorkspaceSize, aclnnRelu>(
1334
- ctx, dst);
1642
+ GGML_CANN_CALL_UNARY_OP(Relu);
1643
+ break;
1644
+ case GGML_UNARY_OP_SIGMOID:
1645
+ GGML_CANN_CALL_UNARY_OP(Sigmoid);
1335
1646
  break;
1336
1647
  case GGML_UNARY_OP_HARDSIGMOID:
1337
- ggml_cann_activation<aclnnHardsigmoidGetWorkspaceSize,
1338
- aclnnHardsigmoid>(ctx, dst);
1648
+ GGML_CANN_CALL_UNARY_OP(Hardsigmoid);
1339
1649
  break;
1340
1650
  case GGML_UNARY_OP_HARDSWISH:
1341
- ggml_cann_activation<aclnnHardswishGetWorkspaceSize,
1342
- aclnnHardswish>(ctx, dst);
1651
+ GGML_CANN_CALL_UNARY_OP(Hardswish);
1652
+ break;
1653
+ case GGML_UNARY_OP_EXP:
1654
+ GGML_CANN_CALL_UNARY_OP(Exp);
1655
+ break;
1656
+ case GGML_UNARY_OP_ELU:
1657
+ ggml_cann_elu(ctx, dst);
1658
+ break;
1659
+ case GGML_UNARY_OP_SGN:
1660
+ GGML_CANN_CALL_UNARY_OP(Sign);
1661
+ break;
1662
+ case GGML_UNARY_OP_STEP:
1663
+ ggml_cann_step(ctx, dst);
1343
1664
  break;
1344
1665
  default:
1345
1666
  return false;
@@ -1376,12 +1697,18 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
1376
1697
  ggml_cann_mul_mat(ctx, dst);
1377
1698
  break;
1378
1699
  case GGML_OP_MUL_MAT_ID:
1379
- return false;
1700
+ ggml_cann_mul_mat_id(ctx, dst);
1701
+ break;
1380
1702
  case GGML_OP_SCALE:
1381
1703
  ggml_cann_scale(ctx, dst);
1382
1704
  break;
1383
1705
  case GGML_OP_SQR:
1384
- ggml_cann_sqr(ctx, dst);
1706
+ GGML_ASSERT(dst->src[1] == nullptr);
1707
+ dst->src[1] = dst->src[0];
1708
+ ggml_cann_binary_op<aclnn_mul>(ctx, dst);
1709
+ break;
1710
+ case GGML_OP_SQRT:
1711
+ GGML_CANN_CALL_UNARY_OP(Sqrt);
1385
1712
  break;
1386
1713
  case GGML_OP_CLAMP:
1387
1714
  ggml_cann_clamp(ctx, dst);
@@ -1413,12 +1740,42 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
1413
1740
  case GGML_OP_POOL_2D:
1414
1741
  ggml_cann_pool2d(ctx, dst);
1415
1742
  break;
1743
+ case GGML_OP_SUM:
1744
+ ggml_cann_sum(ctx, dst);
1745
+ break;
1416
1746
  case GGML_OP_SUM_ROWS:
1417
1747
  ggml_cann_sum_rows(ctx, dst);
1418
1748
  break;
1419
1749
  case GGML_OP_ARGSORT:
1420
1750
  ggml_cann_argsort(ctx, dst);
1421
1751
  break;
1752
+ case GGML_OP_ARGMAX:
1753
+ ggml_cann_argmax(ctx, dst);
1754
+ break;
1755
+ case GGML_OP_COS:
1756
+ ggml_cann_unary_op<aclnn_cos>(ctx, dst);
1757
+ break;
1758
+ case GGML_OP_SIN:
1759
+ ggml_cann_unary_op<aclnn_sin>(ctx, dst);
1760
+ break;
1761
+ case GGML_OP_CONV_TRANSPOSE_1D:
1762
+ ggml_cann_conv_transpose_1d(ctx, dst);
1763
+ break;
1764
+ case GGML_OP_LOG:
1765
+ GGML_CANN_CALL_UNARY_OP(Log);
1766
+ break;
1767
+ case GGML_OP_MEAN:
1768
+ ggml_cann_mean(ctx, dst);
1769
+ break;
1770
+ case GGML_OP_PAD_REFLECT_1D:
1771
+ ggml_cann_pad_reflect_1d(ctx, dst);
1772
+ break;
1773
+ case GGML_OP_COUNT_EQUAL:
1774
+ ggml_cann_count_equal(ctx, dst);
1775
+ break;
1776
+ case GGML_OP_FLASH_ATTN_EXT:
1777
+ ggml_cann_flash_attn_ext(ctx, dst);
1778
+ break;
1422
1779
  default:
1423
1780
  return false;
1424
1781
  }
@@ -1457,21 +1814,15 @@ static void ggml_backend_cann_free(ggml_backend_t backend) {
1457
1814
  ACL_CHECK(aclrtSynchronizeDevice());
1458
1815
  ACL_CHECK(aclrtResetDevice(cann_ctx->device));
1459
1816
 
1460
- // finalize when last backend freed.
1461
- if (cann_ctx->device == ggml_backend_cann_get_device_count() - 1) {
1462
- ACL_CHECK(aclFinalize());
1463
- }
1464
-
1465
1817
  delete cann_ctx;
1466
1818
  delete backend;
1467
1819
  }
1468
1820
 
1821
+
1469
1822
  /**
1470
1823
  * @brief Sets tensor data asynchronously in the CANN backend.
1471
1824
  *
1472
- * This function asynchronously sets tensor data in the CANN backend. Depending
1473
- * on the tensor type, it may perform data transformations before copying data
1474
- * to the device.
1825
+ * This function asynchronously sets tensor data in the CANN backend.
1475
1826
  *
1476
1827
  * @param backend Pointer to the CANN backend structure.
1477
1828
  * @param tensor Pointer to the tensor structure to set data for.
@@ -1486,23 +1837,28 @@ static void ggml_backend_cann_set_tensor_async(ggml_backend_t backend,
1486
1837
  size_t size) {
1487
1838
  ggml_backend_cann_context *cann_ctx =
1488
1839
  (ggml_backend_cann_context *)backend->context;
1840
+ ggml_backend_buffer_t buf =
1841
+ tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
1489
1842
 
1490
- if (!need_transform(tensor->type)) {
1491
- ACL_CHECK(aclrtMemcpyAsync((char *)tensor->data + offset, size, data,
1492
- size, ACL_MEMCPY_HOST_TO_DEVICE,
1493
- cann_ctx->stream()));
1494
- } else {
1495
- void *transform_buffer = malloc(size);
1496
- ggml_backend_cann_transform(tensor, data, transform_buffer);
1843
+ GGML_ASSERT(buf->buft == ggml_backend_cann_buffer_type(cann_ctx->device) &&
1844
+ "unsupported buffer type");
1845
+ GGML_ASSERT(!ggml_is_quantized(tensor->type));
1497
1846
 
1498
- ACL_CHECK(aclrtMemcpyAsync(
1499
- (char *)tensor->data + offset, size, transform_buffer, size,
1500
- ACL_MEMCPY_HOST_TO_DEVICE, cann_ctx->stream()));
1501
- ACL_CHECK(aclrtSynchronizeStream(cann_ctx->stream()));
1502
- free(transform_buffer);
1503
- }
1847
+ ggml_cann_async_memcpy(cann_ctx, (char *)tensor->data + offset, data, size,
1848
+ ACL_MEMCPY_HOST_TO_DEVICE);
1504
1849
  }
1505
1850
 
1851
+ /**
1852
+ * @brief Gets tensor data asynchronously in the CANN backend.
1853
+ *
1854
+ * This function asynchronously gets tensor data in the CANN backend.
1855
+ *
1856
+ * @param backend Pointer to the CANN backend structure.
1857
+ * @param tensor Pointer to the tensor structure to get data from.
1858
+ * @param data Pointer to the host buffer that receives the tensor data.
1859
+ * @param offset Offset in bytes within the tensor data.
1860
+ * @param size Size of the data to copy in bytes.
1861
+ */
1506
1862
  static void ggml_backend_cann_get_tensor_async(
1507
1863
  ggml_backend_t backend, const ggml_tensor *tensor, void *data,
1508
1864
  size_t offset, size_t size) {
@@ -1513,20 +1869,11 @@ static void ggml_backend_cann_get_tensor_async(
1513
1869
 
1514
1870
  GGML_ASSERT(buf->buft == ggml_backend_cann_buffer_type(cann_ctx->device) &&
1515
1871
  "unsupported buffer type");
1872
+ GGML_ASSERT(!ggml_is_quantized(tensor->type));
1873
+
1874
+ ggml_cann_async_memcpy(cann_ctx, data, (char *)tensor->data + offset, size,
1875
+ ACL_MEMCPY_DEVICE_TO_HOST);
1516
1876
 
1517
- if (!need_transform(tensor->type)) {
1518
- ACL_CHECK(aclrtMemcpyAsync(data, size, (char *)tensor->data + offset,
1519
- size, ACL_MEMCPY_DEVICE_TO_HOST,
1520
- cann_ctx->stream()));
1521
- } else {
1522
- void *transform_buffer = malloc(size);
1523
- ACL_CHECK(aclrtMemcpyAsync(
1524
- transform_buffer, size, (char *)tensor->data + offset, size,
1525
- ACL_MEMCPY_DEVICE_TO_HOST, cann_ctx->stream()));
1526
- ACL_CHECK(aclrtSynchronizeStream(cann_ctx->stream()));
1527
- ggml_backend_cann_transform_back(tensor, transform_buffer, data);
1528
- free(transform_buffer);
1529
- }
1530
1877
  }
1531
1878
 
1532
1879
  /**
@@ -1586,6 +1933,8 @@ static bool ggml_backend_cann_cpy_tensor_async(
1586
1933
  ggml_cann_set_device(cann_ctx_src->device);
1587
1934
  ACL_CHECK(aclrtDeviceEnablePeerAccess(cann_ctx_dst->device, 0));
1588
1935
 
1936
+ // wait for task_queue empty to keep task order.
1937
+ cann_ctx_src->task_queue.wait();
1589
1938
  ACL_CHECK(aclrtMemcpyAsync(dst->data, copy_size, src->data, copy_size,
1590
1939
  ACL_MEMCPY_DEVICE_TO_DEVICE,
1591
1940
  cann_ctx_src->stream()));
@@ -1613,9 +1962,8 @@ static bool ggml_backend_cann_cpy_tensor_async(
1613
1962
  static void ggml_backend_cann_synchronize(ggml_backend_t backend) {
1614
1963
  ggml_backend_cann_context* cann_ctx =
1615
1964
  (ggml_backend_cann_context*)backend->context;
1616
-
1965
+ cann_ctx->task_queue.wait();
1617
1966
  ggml_cann_set_device(cann_ctx->device);
1618
-
1619
1967
  ACL_CHECK(aclrtSynchronizeStream(cann_ctx->stream()));
1620
1968
  }
1621
1969
 
@@ -1674,58 +2022,86 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
1674
2022
  switch (op->op) {
1675
2023
  case GGML_OP_UNARY:
1676
2024
  switch (ggml_get_unary_op(op)) {
2025
+ case GGML_UNARY_OP_ABS:
2026
+ case GGML_UNARY_OP_NEG:
1677
2027
  case GGML_UNARY_OP_GELU:
1678
2028
  case GGML_UNARY_OP_SILU:
1679
2029
  case GGML_UNARY_OP_RELU:
2030
+ case GGML_UNARY_OP_SIGMOID:
1680
2031
  case GGML_UNARY_OP_HARDSIGMOID:
1681
2032
  case GGML_UNARY_OP_HARDSWISH:
1682
2033
  case GGML_UNARY_OP_GELU_QUICK:
1683
2034
  case GGML_UNARY_OP_TANH:
2035
+ case GGML_UNARY_OP_EXP:
2036
+ case GGML_UNARY_OP_ELU:
2037
+ case GGML_UNARY_OP_SGN:
2038
+ case GGML_UNARY_OP_STEP:
1684
2039
  return true;
1685
2040
  default:
1686
2041
  return false;
1687
2042
  }
1688
2043
  case GGML_OP_MUL_MAT: {
1689
2044
  switch (op->src[0]->type) {
1690
- case GGML_TYPE_Q8_0:
1691
- // Current groupsize should not be greater than k-1 in
1692
- // aclnnWeightQuantBatchMatmulV2GetWorkspaceSize
1693
- if (op->src[0]->ne[0] <= QK8_0) {
1694
- return false;
1695
- }
1696
2045
  case GGML_TYPE_F16:
1697
2046
  case GGML_TYPE_F32:
1698
- case GGML_TYPE_Q4_0:
1699
2047
  return true;
2048
+ case GGML_TYPE_Q8_0:
2049
+ case GGML_TYPE_Q4_0:
2050
+ #ifdef ASCEND_310P
2051
+ // Q4 && Q8 per group is not supported on 310p device
2052
+ return false;
2053
+ #endif
2054
+ // only support contiguous for quantized types.
2055
+ return ggml_is_contiguous(op->src[0]) &&
2056
+ ggml_is_contiguous(op->src[1]);
1700
2057
  default:
1701
2058
  return false;
1702
2059
  }
1703
2060
  }
1704
2061
  case GGML_OP_MUL_MAT_ID:
1705
- return false;
1706
- // embedding
1707
- case GGML_OP_GET_ROWS: {
1708
2062
  switch (op->src[0]->type) {
1709
- case GGML_TYPE_F32:
1710
2063
  case GGML_TYPE_F16:
1711
- case GGML_TYPE_Q4_0:
1712
- case GGML_TYPE_Q8_0:
2064
+ case GGML_TYPE_F32:
1713
2065
  return true;
2066
+ case GGML_TYPE_Q8_0:
2067
+ case GGML_TYPE_Q4_0:
2068
+ #ifdef ASCEND_310P
2069
+ // Q4 && Q8 per group is not supported on 310p device
2070
+ return false;
2071
+ #endif
2072
+ // only support contiguous for quantized types.
2073
+ return ggml_is_contiguous(op->src[0]) &&
2074
+ ggml_is_contiguous(op->src[1]);
1714
2075
  default:
1715
2076
  return false;
1716
2077
  }
1717
- } break;
1718
- case GGML_OP_CPY: {
1719
- switch (op->type) {
2078
+ // embedding
2079
+ case GGML_OP_GET_ROWS: {
2080
+ switch (op->src[0]->type) {
1720
2081
  case GGML_TYPE_F32:
1721
2082
  case GGML_TYPE_F16:
1722
2083
  case GGML_TYPE_Q8_0:
1723
- case GGML_TYPE_Q4_0:
1724
2084
  return true;
1725
2085
  default:
1726
2086
  return false;
1727
2087
  }
1728
- }
2088
+ } break;
2089
+ case GGML_OP_CPY: {
2090
+ ggml_tensor *src = op->src[0];
2091
+ if ((op->type != GGML_TYPE_F32 && op->type != GGML_TYPE_F16) ||
2092
+ (src->type != GGML_TYPE_F32 &&
2093
+ src->type != GGML_TYPE_F16)) {
2094
+ // only support F32 and F16.
2095
+ return false;
2096
+ }
2097
+
2098
+ if (!ggml_are_same_shape(op, src) && !ggml_is_contiguous(op)) {
2099
+ // a non-contiguous dst is not supported when the shapes differ.
2100
+ return false;
2101
+ }
2102
+
2103
+ return true;
2104
+ } break;
1729
2105
  case GGML_OP_CONT: {
1730
2106
  // TODO: support GGML_TYPE_BF16
1731
2107
  switch (op->src[0]->type) {
@@ -1738,13 +2114,14 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
1738
2114
  }
1739
2115
  case GGML_OP_ROPE: {
1740
2116
  // TODO: with ops-test v == 1
1741
- float * ext_factor = (float*)((int32_t*)op->op_params + 7);
2117
+ float ext_factor = 0.0f;
2118
+ memcpy(&ext_factor, (const float *) op->op_params + 7, sizeof(float));
1742
2119
  // TODO: n_dims <= ne0
1743
2120
  if (op->src[0]->ne[0] != op->op_params[1]) {
1744
2121
  return false;
1745
2122
  }
1746
2123
  // TODO: ext_factor != 0
1747
- if (*ext_factor != 0) {
2124
+ if (ext_factor != 0) {
1748
2125
  return false;
1749
2126
  }
1750
2127
 
@@ -1756,6 +2133,9 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
1756
2133
  return false;
1757
2134
  }
1758
2135
 
2136
+ if(!ggml_is_contiguous(op->src[0])){
2137
+ return false;
2138
+ }
1759
2139
  return true;
1760
2140
  }
1761
2141
  case GGML_OP_UPSCALE: {
@@ -1764,11 +2144,31 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
1764
2144
  if (op->src[0]->ne[2] * op->ne[3] != op->src[0]->ne[3] * op->ne[2]) {
1765
2145
  return false;
1766
2146
  }
2147
+ if (op->op_params[0] != GGML_SCALE_MODE_NEAREST) {
2148
+ return false;
2149
+ }
1767
2150
  return true;
1768
2151
  }
2152
+ case GGML_OP_POOL_2D: {
2153
+ const int32_t * opts = (const int32_t *) op->op_params;
2154
+ #ifdef ASCEND_310P
2155
+ enum ggml_op_pool opt = static_cast<ggml_op_pool>(opts[0]);
2156
+ if(opt == GGML_OP_POOL_MAX){
2157
+ return false;
2158
+ }
2159
+ #endif
2160
+ const int k0 = opts[1];
2161
+ const int k1 = opts[2];
2162
+ const int p0 = opts[5];
2163
+ const int p1 = opts[6];
2164
+ // value of paddingH should be at most half of kernelH
2165
+ // value of paddingW should be at most half of kernelW
2166
+ return (p0 <= (k0 / 2)) && (p1 <= (k1 / 2));
2167
+ }
2168
+ case GGML_OP_SUM:
2169
+ case GGML_OP_DUP:
1769
2170
  case GGML_OP_IM2COL:
1770
2171
  case GGML_OP_CONCAT:
1771
- case GGML_OP_DUP:
1772
2172
  case GGML_OP_REPEAT:
1773
2173
  case GGML_OP_NONE:
1774
2174
  case GGML_OP_RESHAPE:
@@ -1777,15 +2177,17 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
1777
2177
  case GGML_OP_TRANSPOSE:
1778
2178
  case GGML_OP_NORM:
1779
2179
  case GGML_OP_ADD:
2180
+ case GGML_OP_ADD1:
2181
+ case GGML_OP_SUB:
1780
2182
  case GGML_OP_MUL:
1781
2183
  case GGML_OP_DIV:
1782
2184
  case GGML_OP_RMS_NORM:
1783
2185
  case GGML_OP_SCALE:
1784
2186
  case GGML_OP_SQR:
2187
+ case GGML_OP_SQRT:
1785
2188
  case GGML_OP_CLAMP:
1786
2189
  case GGML_OP_DIAG_MASK_INF:
1787
2190
  case GGML_OP_SOFT_MAX:
1788
- case GGML_OP_POOL_2D:
1789
2191
  case GGML_OP_SUM_ROWS:
1790
2192
  case GGML_OP_ARGSORT:
1791
2193
  case GGML_OP_ACC:
@@ -1794,7 +2196,47 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
1794
2196
  case GGML_OP_ARANGE:
1795
2197
  case GGML_OP_TIMESTEP_EMBEDDING:
1796
2198
  case GGML_OP_LEAKY_RELU:
2199
+ case GGML_OP_ARGMAX:
2200
+ case GGML_OP_COS:
2201
+ case GGML_OP_SIN:
2202
+ case GGML_OP_CONV_TRANSPOSE_1D:
2203
+ case GGML_OP_LOG:
2204
+ case GGML_OP_MEAN:
2205
+ case GGML_OP_PAD_REFLECT_1D:
2206
+ case GGML_OP_COUNT_EQUAL:
1797
2207
  return true;
2208
+ case GGML_OP_FLASH_ATTN_EXT:{
2209
+ // derived from [ggml-cuda.cu]
2210
+ if(op->src[1]->type != GGML_TYPE_F16 || op->src[2]->type != GGML_TYPE_F16){
2211
+ return false;
2212
+ }
2213
+ if(op->src[1]->type != GGML_TYPE_F16 && op->src[1]->type != GGML_TYPE_F32 && op->src[1]->type != GGML_TYPE_BF16){
2214
+ return false;
2215
+ }
2216
+ if(op->type != GGML_TYPE_F16 && op->type != GGML_TYPE_F32 && op->type != GGML_TYPE_BF16){
2217
+ return false;
2218
+ }
2219
+ if (op->src[1]->ne[0] != op->src[2]->ne[0]) {
2220
+ // different head sizes of K and V are not supported yet
2221
+ return false;
2222
+ }
2223
+ if (op->src[0]->ne[0] == 192) {
2224
+ return false;
2225
+ }
2226
+ if (op->src[0]->ne[0] == 576) {
2227
+ // DeepSeek MLA
2228
+ return false;
2229
+ }
2230
+ if (op->src[0]->ne[3] != 1) {
2231
+ return false;
2232
+ }
2233
+ float logitSoftcap = 0.0f;
2234
+ memcpy(&logitSoftcap, (float*)op->op_params + 2, sizeof(float));
2235
+ if(logitSoftcap != 0.0f) {
2236
+ return false;
2237
+ }
2238
+ return true;
2239
+ }
1798
2240
  default:
1799
2241
  return false;
1800
2242
  }