whispercpp 1.3.1 → 1.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (797)
  1. checksums.yaml +4 -4
  2. data/.gitignore +4 -3
  3. data/README.md +92 -31
  4. data/Rakefile +26 -7
  5. data/ext/.gitignore +5 -7
  6. data/ext/dependencies.rb +61 -0
  7. data/ext/extconf.rb +21 -198
  8. data/ext/options.rb +221 -0
  9. data/ext/ruby_whisper.c +159 -0
  10. data/ext/ruby_whisper.h +17 -2
  11. data/ext/ruby_whisper_context.c +641 -0
  12. data/ext/ruby_whisper_error.c +52 -0
  13. data/ext/ruby_whisper_model.c +232 -0
  14. data/ext/ruby_whisper_params.c +1301 -0
  15. data/ext/ruby_whisper_segment.c +143 -0
  16. data/ext/ruby_whisper_transcribe.cpp +87 -0
  17. data/ext/ruby_whisper_vad_params.c +288 -0
  18. data/ext/sources/.dockerignore +3 -0
  19. data/ext/sources/.github/workflows/bindings-ruby.yml +21 -0
  20. data/ext/sources/CMakeGraphVizOptions.cmake +8 -0
  21. data/ext/sources/CMakeLists.txt +251 -0
  22. data/ext/sources/bindings/javascript/CMakeLists.txt +41 -0
  23. data/ext/sources/bindings/javascript/emscripten.cpp +93 -0
  24. data/ext/sources/bindings/javascript/libwhisper.worker.js +1 -0
  25. data/ext/sources/bindings/javascript/package-tmpl.json +26 -0
  26. data/ext/sources/bindings/javascript/package.json +26 -0
  27. data/ext/sources/bindings/javascript/whisper.js +19 -0
  28. data/ext/sources/build-xcframework.sh +547 -0
  29. data/ext/sources/ci/run.sh +336 -0
  30. data/ext/sources/close-issue.yml +28 -0
  31. data/ext/sources/cmake/DefaultTargetOptions.cmake +16 -0
  32. data/ext/sources/cmake/FindFFmpeg.cmake +163 -0
  33. data/ext/sources/cmake/build-info.cmake +60 -0
  34. data/ext/sources/cmake/git-vars.cmake +22 -0
  35. data/ext/sources/cmake/whisper-config.cmake.in +65 -0
  36. data/ext/sources/cmake/whisper.pc.in +10 -0
  37. data/ext/sources/examples/CMakeLists.txt +124 -0
  38. data/ext/sources/examples/addon.node/CMakeLists.txt +31 -0
  39. data/ext/sources/examples/addon.node/__test__/whisper.spec.js +37 -0
  40. data/ext/sources/examples/addon.node/addon.cpp +438 -0
  41. data/ext/sources/examples/addon.node/index.js +54 -0
  42. data/ext/sources/examples/addon.node/package.json +16 -0
  43. data/ext/sources/examples/bench/CMakeLists.txt +8 -0
  44. data/ext/sources/examples/bench/bench.cpp +175 -0
  45. data/ext/sources/examples/bench.wasm/CMakeLists.txt +49 -0
  46. data/ext/sources/examples/bench.wasm/emscripten.cpp +87 -0
  47. data/ext/sources/examples/bench.wasm/index-tmpl.html +284 -0
  48. data/ext/sources/examples/cli/CMakeLists.txt +8 -0
  49. data/ext/sources/examples/cli/cli.cpp +1294 -0
  50. data/ext/sources/examples/coi-serviceworker.js +146 -0
  51. data/ext/sources/examples/command/CMakeLists.txt +10 -0
  52. data/ext/sources/examples/command/command.cpp +776 -0
  53. data/ext/sources/examples/command/commands.txt +9 -0
  54. data/ext/sources/examples/command.wasm/CMakeLists.txt +50 -0
  55. data/ext/sources/examples/command.wasm/emscripten.cpp +327 -0
  56. data/ext/sources/examples/command.wasm/index-tmpl.html +414 -0
  57. data/ext/sources/examples/common-ggml.cpp +238 -0
  58. data/ext/sources/examples/common-ggml.h +18 -0
  59. data/ext/sources/examples/common-sdl.cpp +227 -0
  60. data/ext/sources/examples/common-sdl.h +49 -0
  61. data/ext/sources/examples/common-whisper.cpp +168 -0
  62. data/ext/sources/examples/common-whisper.h +24 -0
  63. data/ext/sources/examples/common.cpp +675 -0
  64. data/ext/sources/examples/common.h +322 -0
  65. data/ext/sources/examples/deprecation-warning/CMakeLists.txt +6 -0
  66. data/ext/sources/examples/deprecation-warning/deprecation-warning.cpp +38 -0
  67. data/ext/sources/examples/ffmpeg-transcode.cpp +368 -0
  68. data/ext/sources/examples/generate-karaoke.sh +57 -0
  69. data/ext/sources/examples/grammar-parser.cpp +423 -0
  70. data/ext/sources/examples/grammar-parser.h +29 -0
  71. data/ext/sources/examples/helpers.js +191 -0
  72. data/ext/sources/examples/json.hpp +24596 -0
  73. data/ext/sources/examples/livestream.sh +112 -0
  74. data/ext/sources/examples/lsp/CMakeLists.txt +9 -0
  75. data/ext/sources/examples/lsp/lsp.cpp +467 -0
  76. data/ext/sources/examples/lsp/whisper.vim +362 -0
  77. data/ext/sources/examples/miniaudio.h +93468 -0
  78. data/ext/sources/examples/python/test_whisper_processor.py +7 -0
  79. data/ext/sources/examples/python/whisper_processor.py +54 -0
  80. data/ext/sources/examples/quantize/CMakeLists.txt +6 -0
  81. data/ext/sources/examples/quantize/quantize.cpp +223 -0
  82. data/ext/sources/examples/server/CMakeLists.txt +12 -0
  83. data/ext/sources/examples/server/bench.js +29 -0
  84. data/ext/sources/examples/server/httplib.h +10497 -0
  85. data/ext/sources/examples/server/server.cpp +1091 -0
  86. data/ext/sources/examples/server.py +115 -0
  87. data/ext/sources/examples/stb_vorbis.c +5584 -0
  88. data/ext/sources/examples/stream/CMakeLists.txt +10 -0
  89. data/ext/sources/examples/stream/stream.cpp +429 -0
  90. data/ext/sources/examples/stream.wasm/CMakeLists.txt +49 -0
  91. data/ext/sources/examples/stream.wasm/emscripten.cpp +216 -0
  92. data/ext/sources/examples/stream.wasm/index-tmpl.html +414 -0
  93. data/ext/sources/examples/sycl/CMakeLists.txt +9 -0
  94. data/ext/sources/examples/sycl/build.sh +22 -0
  95. data/ext/sources/examples/sycl/ls-sycl-device.cpp +11 -0
  96. data/ext/sources/examples/sycl/run-whisper.sh +17 -0
  97. data/ext/sources/examples/talk-llama/CMakeLists.txt +40 -0
  98. data/ext/sources/examples/talk-llama/eleven-labs.py +80 -0
  99. data/ext/sources/examples/talk-llama/llama-adapter.cpp +388 -0
  100. data/ext/sources/examples/talk-llama/llama-adapter.h +76 -0
  101. data/ext/sources/examples/talk-llama/llama-arch.cpp +1746 -0
  102. data/ext/sources/examples/talk-llama/llama-arch.h +437 -0
  103. data/ext/sources/examples/talk-llama/llama-batch.cpp +374 -0
  104. data/ext/sources/examples/talk-llama/llama-batch.h +89 -0
  105. data/ext/sources/examples/talk-llama/llama-chat.cpp +663 -0
  106. data/ext/sources/examples/talk-llama/llama-chat.h +58 -0
  107. data/ext/sources/examples/talk-llama/llama-context.cpp +2676 -0
  108. data/ext/sources/examples/talk-llama/llama-context.h +276 -0
  109. data/ext/sources/examples/talk-llama/llama-cparams.cpp +5 -0
  110. data/ext/sources/examples/talk-llama/llama-cparams.h +41 -0
  111. data/ext/sources/examples/talk-llama/llama-grammar.cpp +1229 -0
  112. data/ext/sources/examples/talk-llama/llama-grammar.h +173 -0
  113. data/ext/sources/examples/talk-llama/llama-graph.cpp +1618 -0
  114. data/ext/sources/examples/talk-llama/llama-graph.h +640 -0
  115. data/ext/sources/examples/talk-llama/llama-hparams.cpp +95 -0
  116. data/ext/sources/examples/talk-llama/llama-hparams.h +190 -0
  117. data/ext/sources/examples/talk-llama/llama-impl.cpp +167 -0
  118. data/ext/sources/examples/talk-llama/llama-impl.h +61 -0
  119. data/ext/sources/examples/talk-llama/llama-io.cpp +15 -0
  120. data/ext/sources/examples/talk-llama/llama-io.h +35 -0
  121. data/ext/sources/examples/talk-llama/llama-kv-cache.cpp +2739 -0
  122. data/ext/sources/examples/talk-llama/llama-kv-cache.h +502 -0
  123. data/ext/sources/examples/talk-llama/llama-kv-cells.h +379 -0
  124. data/ext/sources/examples/talk-llama/llama-memory.cpp +1 -0
  125. data/ext/sources/examples/talk-llama/llama-memory.h +32 -0
  126. data/ext/sources/examples/talk-llama/llama-mmap.cpp +600 -0
  127. data/ext/sources/examples/talk-llama/llama-mmap.h +68 -0
  128. data/ext/sources/examples/talk-llama/llama-model-loader.cpp +1138 -0
  129. data/ext/sources/examples/talk-llama/llama-model-loader.h +169 -0
  130. data/ext/sources/examples/talk-llama/llama-model-saver.cpp +281 -0
  131. data/ext/sources/examples/talk-llama/llama-model-saver.h +37 -0
  132. data/ext/sources/examples/talk-llama/llama-model.cpp +13814 -0
  133. data/ext/sources/examples/talk-llama/llama-model.h +425 -0
  134. data/ext/sources/examples/talk-llama/llama-quant.cpp +966 -0
  135. data/ext/sources/examples/talk-llama/llama-quant.h +1 -0
  136. data/ext/sources/examples/talk-llama/llama-sampling.cpp +2575 -0
  137. data/ext/sources/examples/talk-llama/llama-sampling.h +32 -0
  138. data/ext/sources/examples/talk-llama/llama-vocab.cpp +3340 -0
  139. data/ext/sources/examples/talk-llama/llama-vocab.h +131 -0
  140. data/ext/sources/examples/talk-llama/llama.cpp +354 -0
  141. data/ext/sources/examples/talk-llama/llama.h +1377 -0
  142. data/ext/sources/examples/talk-llama/prompts/talk-alpaca.txt +23 -0
  143. data/ext/sources/examples/talk-llama/speak +40 -0
  144. data/ext/sources/examples/talk-llama/speak.bat +1 -0
  145. data/ext/sources/examples/talk-llama/speak.ps1 +14 -0
  146. data/ext/sources/examples/talk-llama/talk-llama.cpp +808 -0
  147. data/ext/sources/examples/talk-llama/unicode-data.cpp +7034 -0
  148. data/ext/sources/examples/talk-llama/unicode-data.h +20 -0
  149. data/ext/sources/examples/talk-llama/unicode.cpp +849 -0
  150. data/ext/sources/examples/talk-llama/unicode.h +66 -0
  151. data/ext/sources/examples/vad-speech-segments/CMakeLists.txt +8 -0
  152. data/ext/sources/examples/vad-speech-segments/speech.cpp +143 -0
  153. data/ext/sources/examples/wchess/CMakeLists.txt +10 -0
  154. data/ext/sources/examples/wchess/libwchess/CMakeLists.txt +19 -0
  155. data/ext/sources/examples/wchess/libwchess/Chessboard.cpp +803 -0
  156. data/ext/sources/examples/wchess/libwchess/Chessboard.h +33 -0
  157. data/ext/sources/examples/wchess/libwchess/WChess.cpp +193 -0
  158. data/ext/sources/examples/wchess/libwchess/WChess.h +63 -0
  159. data/ext/sources/examples/wchess/libwchess/test-chessboard.cpp +117 -0
  160. data/ext/sources/examples/wchess/wchess.cmd/CMakeLists.txt +8 -0
  161. data/ext/sources/examples/wchess/wchess.cmd/wchess.cmd.cpp +249 -0
  162. data/ext/sources/examples/whisper.wasm/CMakeLists.txt +50 -0
  163. data/ext/sources/examples/whisper.wasm/emscripten.cpp +118 -0
  164. data/ext/sources/examples/whisper.wasm/index-tmpl.html +658 -0
  165. data/ext/sources/ggml/CMakeLists.txt +390 -0
  166. data/ext/sources/ggml/cmake/BuildTypes.cmake +54 -0
  167. data/ext/sources/ggml/cmake/GitVars.cmake +22 -0
  168. data/ext/sources/ggml/cmake/common.cmake +26 -0
  169. data/ext/sources/ggml/cmake/ggml-config.cmake.in +152 -0
  170. data/ext/{ggml → sources/ggml}/include/ggml-alloc.h +1 -1
  171. data/ext/{ggml → sources/ggml}/include/ggml-backend.h +9 -7
  172. data/ext/{ggml → sources/ggml}/include/ggml-cpp.h +2 -1
  173. data/ext/{ggml → sources/ggml}/include/ggml-cpu.h +9 -1
  174. data/ext/{ggml → sources/ggml}/include/ggml-metal.h +1 -1
  175. data/ext/{ggml → sources/ggml}/include/ggml-opt.h +49 -28
  176. data/ext/{ggml → sources/ggml}/include/ggml-rpc.h +6 -1
  177. data/ext/{ggml → sources/ggml}/include/ggml-vulkan.h +0 -2
  178. data/ext/{ggml → sources/ggml}/include/ggml.h +182 -265
  179. data/ext/sources/ggml/include/gguf.h +202 -0
  180. data/ext/sources/ggml/src/CMakeLists.txt +346 -0
  181. data/ext/{ggml → sources/ggml}/src/ggml-alloc.c +34 -29
  182. data/ext/sources/ggml/src/ggml-amx/CMakeLists.txt +107 -0
  183. data/ext/{ggml → sources/ggml}/src/ggml-backend-impl.h +1 -2
  184. data/ext/{ggml → sources/ggml}/src/ggml-backend-reg.cpp +87 -53
  185. data/ext/{ggml → sources/ggml}/src/ggml-backend.cpp +26 -14
  186. data/ext/sources/ggml/src/ggml-blas/CMakeLists.txt +87 -0
  187. data/ext/sources/ggml/src/ggml-cann/CMakeLists.txt +74 -0
  188. data/ext/sources/ggml/src/ggml-cann/Doxyfile +2579 -0
  189. data/ext/{ggml → sources/ggml}/src/ggml-cann/acl_tensor.cpp +10 -4
  190. data/ext/{ggml → sources/ggml}/src/ggml-cann/acl_tensor.h +5 -5
  191. data/ext/{ggml → sources/ggml}/src/ggml-cann/aclnn_ops.cpp +1272 -1506
  192. data/ext/sources/ggml/src/ggml-cann/aclnn_ops.h +1125 -0
  193. data/ext/{ggml → sources/ggml}/src/ggml-cann/common.h +135 -1
  194. data/ext/{ggml → sources/ggml}/src/ggml-cann/ggml-cann.cpp +564 -146
  195. data/ext/sources/ggml/src/ggml-cann/kernels/CMakeLists.txt +30 -0
  196. data/ext/{ggml → sources/ggml}/src/ggml-cann/kernels/dup.cpp +3 -5
  197. data/ext/{ggml → sources/ggml}/src/ggml-common.h +12 -8
  198. data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +504 -0
  199. data/ext/{ggml → sources/ggml}/src/ggml-cpu/amx/amx.cpp +2 -1
  200. data/ext/sources/ggml/src/ggml-cpu/binary-ops.cpp +158 -0
  201. data/ext/sources/ggml/src/ggml-cpu/binary-ops.h +16 -0
  202. data/ext/sources/ggml/src/ggml-cpu/cmake/FindSIMD.cmake +100 -0
  203. data/ext/sources/ggml/src/ggml-cpu/common.h +72 -0
  204. data/ext/{ggml → sources/ggml}/src/ggml-cpu/cpu-feats-x86.cpp +5 -1
  205. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +6431 -0
  206. data/ext/{ggml → sources/ggml}/src/ggml-cpu/ggml-cpu-impl.h +163 -41
  207. data/ext/{ggml → sources/ggml}/src/ggml-cpu/ggml-cpu-quants.c +4029 -1117
  208. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +3510 -0
  209. data/ext/{ggml → sources/ggml}/src/ggml-cpu/ggml-cpu.cpp +67 -18
  210. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.cpp +337 -0
  211. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.h +95 -0
  212. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +482 -0
  213. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.h +17 -0
  214. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +3544 -0
  215. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.h +14 -0
  216. data/ext/sources/ggml/src/ggml-cpu/ops.cpp +8903 -0
  217. data/ext/sources/ggml/src/ggml-cpu/ops.h +110 -0
  218. data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +892 -0
  219. data/ext/sources/ggml/src/ggml-cpu/unary-ops.cpp +186 -0
  220. data/ext/sources/ggml/src/ggml-cpu/unary-ops.h +28 -0
  221. data/ext/sources/ggml/src/ggml-cpu/vec.cpp +252 -0
  222. data/ext/sources/ggml/src/ggml-cpu/vec.h +818 -0
  223. data/ext/sources/ggml/src/ggml-cuda/CMakeLists.txt +184 -0
  224. data/ext/sources/ggml/src/ggml-cuda/acc.cu +61 -0
  225. data/ext/sources/ggml/src/ggml-cuda/acc.cuh +5 -0
  226. data/ext/sources/ggml/src/ggml-cuda/arange.cu +34 -0
  227. data/ext/sources/ggml/src/ggml-cuda/arange.cuh +5 -0
  228. data/ext/sources/ggml/src/ggml-cuda/argmax.cu +91 -0
  229. data/ext/sources/ggml/src/ggml-cuda/argmax.cuh +3 -0
  230. data/ext/sources/ggml/src/ggml-cuda/argsort.cu +104 -0
  231. data/ext/sources/ggml/src/ggml-cuda/argsort.cuh +3 -0
  232. data/ext/sources/ggml/src/ggml-cuda/binbcast.cu +363 -0
  233. data/ext/sources/ggml/src/ggml-cuda/binbcast.cuh +9 -0
  234. data/ext/sources/ggml/src/ggml-cuda/clamp.cu +45 -0
  235. data/ext/sources/ggml/src/ggml-cuda/clamp.cuh +5 -0
  236. data/ext/sources/ggml/src/ggml-cuda/common.cuh +828 -0
  237. data/ext/sources/ggml/src/ggml-cuda/concat.cu +221 -0
  238. data/ext/sources/ggml/src/ggml-cuda/concat.cuh +5 -0
  239. data/ext/sources/ggml/src/ggml-cuda/conv-transpose-1d.cu +89 -0
  240. data/ext/sources/ggml/src/ggml-cuda/conv-transpose-1d.cuh +5 -0
  241. data/ext/sources/ggml/src/ggml-cuda/convert.cu +730 -0
  242. data/ext/sources/ggml/src/ggml-cuda/convert.cuh +26 -0
  243. data/ext/sources/ggml/src/ggml-cuda/count-equal.cu +64 -0
  244. data/ext/sources/ggml/src/ggml-cuda/count-equal.cuh +5 -0
  245. data/ext/sources/ggml/src/ggml-cuda/cp-async.cuh +57 -0
  246. data/ext/sources/ggml/src/ggml-cuda/cpy.cu +705 -0
  247. data/ext/sources/ggml/src/ggml-cuda/cpy.cuh +11 -0
  248. data/ext/sources/ggml/src/ggml-cuda/cross-entropy-loss.cu +189 -0
  249. data/ext/sources/ggml/src/ggml-cuda/cross-entropy-loss.cuh +7 -0
  250. data/ext/sources/ggml/src/ggml-cuda/dequantize.cuh +103 -0
  251. data/ext/sources/ggml/src/ggml-cuda/diagmask.cu +40 -0
  252. data/ext/sources/ggml/src/ggml-cuda/diagmask.cuh +5 -0
  253. data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +881 -0
  254. data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +1471 -0
  255. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f16.cu +357 -0
  256. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f16.cuh +3 -0
  257. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f32.cu +365 -0
  258. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f32.cuh +3 -0
  259. data/ext/sources/ggml/src/ggml-cuda/fattn-vec-f16.cuh +482 -0
  260. data/ext/sources/ggml/src/ggml-cuda/fattn-vec-f32.cuh +472 -0
  261. data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +634 -0
  262. data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cuh +3 -0
  263. data/ext/sources/ggml/src/ggml-cuda/fattn.cu +346 -0
  264. data/ext/sources/ggml/src/ggml-cuda/fattn.cuh +3 -0
  265. data/ext/sources/ggml/src/ggml-cuda/getrows.cu +275 -0
  266. data/ext/sources/ggml/src/ggml-cuda/getrows.cuh +15 -0
  267. data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +3505 -0
  268. data/ext/sources/ggml/src/ggml-cuda/gla.cu +93 -0
  269. data/ext/sources/ggml/src/ggml-cuda/gla.cuh +3 -0
  270. data/ext/sources/ggml/src/ggml-cuda/im2col.cu +103 -0
  271. data/ext/sources/ggml/src/ggml-cuda/im2col.cuh +5 -0
  272. data/ext/sources/ggml/src/ggml-cuda/mma.cuh +396 -0
  273. data/ext/sources/ggml/src/ggml-cuda/mmq.cu +324 -0
  274. data/ext/sources/ggml/src/ggml-cuda/mmq.cuh +3217 -0
  275. data/ext/sources/ggml/src/ggml-cuda/mmv.cu +336 -0
  276. data/ext/sources/ggml/src/ggml-cuda/mmv.cuh +12 -0
  277. data/ext/sources/ggml/src/ggml-cuda/mmvq.cu +595 -0
  278. data/ext/sources/ggml/src/ggml-cuda/mmvq.cuh +12 -0
  279. data/ext/sources/ggml/src/ggml-cuda/norm.cu +458 -0
  280. data/ext/sources/ggml/src/ggml-cuda/norm.cuh +11 -0
  281. data/ext/sources/ggml/src/ggml-cuda/opt-step-adamw.cu +78 -0
  282. data/ext/sources/ggml/src/ggml-cuda/opt-step-adamw.cuh +5 -0
  283. data/ext/sources/ggml/src/ggml-cuda/out-prod.cu +68 -0
  284. data/ext/sources/ggml/src/ggml-cuda/out-prod.cuh +3 -0
  285. data/ext/sources/ggml/src/ggml-cuda/pad.cu +49 -0
  286. data/ext/sources/ggml/src/ggml-cuda/pad.cuh +5 -0
  287. data/ext/sources/ggml/src/ggml-cuda/pool2d.cu +94 -0
  288. data/ext/sources/ggml/src/ggml-cuda/pool2d.cuh +5 -0
  289. data/ext/sources/ggml/src/ggml-cuda/quantize.cu +190 -0
  290. data/ext/sources/ggml/src/ggml-cuda/quantize.cuh +27 -0
  291. data/ext/sources/ggml/src/ggml-cuda/rope.cu +456 -0
  292. data/ext/sources/ggml/src/ggml-cuda/rope.cuh +7 -0
  293. data/ext/sources/ggml/src/ggml-cuda/scale.cu +31 -0
  294. data/ext/sources/ggml/src/ggml-cuda/scale.cuh +5 -0
  295. data/ext/sources/ggml/src/ggml-cuda/softmax.cu +283 -0
  296. data/ext/sources/ggml/src/ggml-cuda/softmax.cuh +7 -0
  297. data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cu +148 -0
  298. data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cuh +3 -0
  299. data/ext/sources/ggml/src/ggml-cuda/ssm-scan.cu +153 -0
  300. data/ext/sources/ggml/src/ggml-cuda/ssm-scan.cuh +3 -0
  301. data/ext/sources/ggml/src/ggml-cuda/sum.cu +45 -0
  302. data/ext/sources/ggml/src/ggml-cuda/sum.cuh +5 -0
  303. data/ext/sources/ggml/src/ggml-cuda/sumrows.cu +39 -0
  304. data/ext/sources/ggml/src/ggml-cuda/sumrows.cuh +5 -0
  305. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_16.cu +5 -0
  306. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_8.cu +10 -0
  307. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_1.cu +10 -0
  308. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_2.cu +10 -0
  309. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_4.cu +10 -0
  310. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_16.cu +5 -0
  311. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_4.cu +10 -0
  312. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_8.cu +10 -0
  313. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_32-ncols2_1.cu +10 -0
  314. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_32-ncols2_2.cu +10 -0
  315. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_16.cu +5 -0
  316. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_2.cu +10 -0
  317. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_4.cu +10 -0
  318. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_8.cu +10 -0
  319. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_64-ncols2_1.cu +10 -0
  320. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_1.cu +10 -0
  321. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_2.cu +10 -0
  322. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_4.cu +10 -0
  323. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_8.cu +10 -0
  324. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +5 -0
  325. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +5 -0
  326. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +5 -0
  327. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +5 -0
  328. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +5 -0
  329. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +5 -0
  330. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +5 -0
  331. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +5 -0
  332. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +5 -0
  333. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +5 -0
  334. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +5 -0
  335. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +5 -0
  336. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +5 -0
  337. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +5 -0
  338. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +5 -0
  339. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +5 -0
  340. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +5 -0
  341. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +5 -0
  342. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +5 -0
  343. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +5 -0
  344. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +5 -0
  345. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +5 -0
  346. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +5 -0
  347. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +5 -0
  348. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +5 -0
  349. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +5 -0
  350. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +5 -0
  351. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +5 -0
  352. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +5 -0
  353. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +5 -0
  354. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +5 -0
  355. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +5 -0
  356. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +5 -0
  357. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +5 -0
  358. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +5 -0
  359. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +5 -0
  360. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +5 -0
  361. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +5 -0
  362. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +5 -0
  363. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +5 -0
  364. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +5 -0
  365. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +5 -0
  366. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +5 -0
  367. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +5 -0
  368. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +5 -0
  369. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +5 -0
  370. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +5 -0
  371. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +5 -0
  372. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +5 -0
  373. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +5 -0
  374. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +5 -0
  375. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +5 -0
  376. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +5 -0
  377. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +5 -0
  378. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +5 -0
  379. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +5 -0
  380. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +5 -0
  381. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +5 -0
  382. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +5 -0
  383. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +5 -0
  384. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +5 -0
  385. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +5 -0
  386. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +5 -0
  387. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +5 -0
  388. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +5 -0
  389. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +5 -0
  390. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +5 -0
  391. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +5 -0
  392. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +5 -0
  393. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +5 -0
  394. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +5 -0
  395. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +5 -0
  396. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +5 -0
  397. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +5 -0
  398. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +5 -0
  399. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +5 -0
  400. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +5 -0
  401. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +5 -0
  402. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +5 -0
  403. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +5 -0
  404. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +5 -0
  405. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +5 -0
  406. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +5 -0
  407. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +5 -0
  408. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +5 -0
  409. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +5 -0
  410. data/ext/sources/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +78 -0
  411. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-iq1_s.cu +5 -0
  412. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_s.cu +5 -0
  413. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_xs.cu +5 -0
  414. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_xxs.cu +5 -0
  415. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-iq3_s.cu +5 -0
  416. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-iq3_xxs.cu +5 -0
  417. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-iq4_nl.cu +5 -0
  418. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-iq4_xs.cu +5 -0
  419. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q2_k.cu +5 -0
  420. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q3_k.cu +5 -0
  421. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q4_0.cu +5 -0
  422. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q4_1.cu +5 -0
  423. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q4_k.cu +5 -0
  424. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q5_0.cu +5 -0
  425. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q5_1.cu +5 -0
  426. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q5_k.cu +5 -0
  427. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q6_k.cu +5 -0
  428. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q8_0.cu +5 -0
  429. data/ext/sources/ggml/src/ggml-cuda/tsembd.cu +47 -0
  430. data/ext/sources/ggml/src/ggml-cuda/tsembd.cuh +5 -0
  431. data/ext/sources/ggml/src/ggml-cuda/unary.cu +289 -0
  432. data/ext/sources/ggml/src/ggml-cuda/unary.cuh +59 -0
  433. data/ext/sources/ggml/src/ggml-cuda/upscale.cu +51 -0
  434. data/ext/sources/ggml/src/ggml-cuda/upscale.cuh +5 -0
  435. data/ext/sources/ggml/src/ggml-cuda/vecdotq.cuh +1135 -0
  436. data/ext/{ggml → sources/ggml}/src/ggml-cuda/vendors/cuda.h +1 -0
  437. data/ext/{ggml → sources/ggml}/src/ggml-cuda/vendors/hip.h +57 -0
  438. data/ext/{ggml → sources/ggml}/src/ggml-cuda/vendors/musa.h +7 -1
  439. data/ext/sources/ggml/src/ggml-cuda/wkv.cu +199 -0
  440. data/ext/sources/ggml/src/ggml-cuda/wkv.cuh +7 -0
  441. data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +131 -0
  442. data/ext/{ggml → sources/ggml}/src/ggml-impl.h +64 -19
  443. data/ext/sources/ggml/src/ggml-kompute/CMakeLists.txt +166 -0
  444. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/common.comp +112 -0
  445. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_add.comp +58 -0
  446. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_addrow.comp +25 -0
  447. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f16.comp +52 -0
  448. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f32.comp +52 -0
  449. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f16.comp +52 -0
  450. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f32.comp +52 -0
  451. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_diagmask.comp +30 -0
  452. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_gelu.comp +22 -0
  453. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows.comp +17 -0
  454. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f16.comp +31 -0
  455. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f32.comp +31 -0
  456. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_0.comp +38 -0
  457. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_1.comp +39 -0
  458. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q6_k.comp +44 -0
  459. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul.comp +52 -0
  460. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_f16.comp +69 -0
  461. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_mat_f32.comp +51 -0
  462. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_0.comp +33 -0
  463. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_1.comp +35 -0
  464. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_k.comp +140 -0
  465. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q6_k.comp +106 -0
  466. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q8_0.comp +73 -0
  467. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n.comp +52 -0
  468. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n_pre.comp +28 -0
  469. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_norm.comp +84 -0
  470. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_relu.comp +21 -0
  471. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rmsnorm.comp +53 -0
  472. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f16.comp +52 -0
  473. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f32.comp +52 -0
  474. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f16.comp +52 -0
  475. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f32.comp +52 -0
  476. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_scale.comp +19 -0
  477. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_scale_8.comp +23 -0
  478. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_silu.comp +22 -0
  479. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_softmax.comp +72 -0
  480. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/rope_common.comp +71 -0
  481. data/ext/sources/ggml/src/ggml-metal/CMakeLists.txt +120 -0
  482. data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +622 -0
  483. data/ext/{ggml → sources/ggml}/src/ggml-metal/ggml-metal.m +2178 -1064
  484. data/ext/{ggml → sources/ggml}/src/ggml-metal/ggml-metal.metal +1575 -1218
  485. data/ext/sources/ggml/src/ggml-musa/CMakeLists.txt +113 -0
  486. data/ext/sources/ggml/src/ggml-musa/mudnn.cu +112 -0
  487. data/ext/sources/ggml/src/ggml-musa/mudnn.cuh +12 -0
  488. data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +96 -0
  489. data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +5124 -0
  490. data/ext/sources/ggml/src/ggml-opencl/kernels/add.cl +83 -0
  491. data/ext/sources/ggml/src/ggml-opencl/kernels/clamp.cl +20 -0
  492. data/ext/sources/ggml/src/ggml-opencl/kernels/cpy.cl +184 -0
  493. data/ext/sources/ggml/src/ggml-opencl/kernels/cvt.cl +118 -0
  494. data/ext/sources/ggml/src/ggml-opencl/kernels/diag_mask_inf.cl +58 -0
  495. data/ext/sources/ggml/src/ggml-opencl/kernels/embed_kernel.py +26 -0
  496. data/ext/sources/ggml/src/ggml-opencl/kernels/gelu.cl +62 -0
  497. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle.cl +268 -0
  498. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_general.cl +274 -0
  499. data/ext/sources/ggml/src/ggml-opencl/kernels/get_rows.cl +163 -0
  500. data/ext/sources/ggml/src/ggml-opencl/kernels/im2col_f16.cl +57 -0
  501. data/ext/sources/ggml/src/ggml-opencl/kernels/im2col_f32.cl +57 -0
  502. data/ext/sources/ggml/src/ggml-opencl/kernels/mul.cl +79 -0
  503. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mat_Ab_Bi_8x4.cl +139 -0
  504. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_f16_f16.cl +118 -0
  505. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32.cl +118 -0
  506. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32_1row.cl +94 -0
  507. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32_l4.cl +84 -0
  508. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_f32_f32.cl +118 -0
  509. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32.cl +192 -0
  510. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_1d_16x_flat.cl +307 -0
  511. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_1d_8x_flat.cl +265 -0
  512. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_8x_flat.cl +272 -0
  513. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_v.cl +254 -0
  514. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q6_k.cl +190 -0
  515. data/ext/sources/ggml/src/ggml-opencl/kernels/norm.cl +81 -0
  516. data/ext/sources/ggml/src/ggml-opencl/kernels/relu.cl +16 -0
  517. data/ext/sources/ggml/src/ggml-opencl/kernels/rms_norm.cl +96 -0
  518. data/ext/sources/ggml/src/ggml-opencl/kernels/rope.cl +721 -0
  519. data/ext/sources/ggml/src/ggml-opencl/kernels/scale.cl +16 -0
  520. data/ext/sources/ggml/src/ggml-opencl/kernels/silu.cl +30 -0
  521. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_4_f16.cl +87 -0
  522. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_4_f32.cl +87 -0
  523. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_f16.cl +86 -0
  524. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_f32.cl +86 -0
  525. data/ext/sources/ggml/src/ggml-opencl/kernels/transpose.cl +84 -0
  526. data/ext/{ggml → sources/ggml}/src/ggml-opt.cpp +373 -190
  527. data/ext/{ggml → sources/ggml}/src/ggml-quants.c +114 -120
  528. data/ext/sources/ggml/src/ggml-rpc/CMakeLists.txt +9 -0
  529. data/ext/{ggml → sources/ggml}/src/ggml-rpc/ggml-rpc.cpp +480 -73
  530. data/ext/sources/ggml/src/ggml-sycl/CMakeLists.txt +189 -0
  531. data/ext/sources/ggml/src/ggml-sycl/backend.hpp +37 -0
  532. data/ext/sources/ggml/src/ggml-sycl/binbcast.cpp +345 -0
  533. data/ext/sources/ggml/src/ggml-sycl/binbcast.hpp +39 -0
  534. data/ext/{ggml → sources/ggml}/src/ggml-sycl/common.cpp +20 -32
  535. data/ext/sources/ggml/src/ggml-sycl/common.hpp +589 -0
  536. data/ext/{ggml → sources/ggml}/src/ggml-sycl/concat.cpp +32 -33
  537. data/ext/sources/ggml/src/ggml-sycl/concat.hpp +20 -0
  538. data/ext/{ggml → sources/ggml}/src/ggml-sycl/conv.cpp +4 -2
  539. data/ext/sources/ggml/src/ggml-sycl/conv.hpp +20 -0
  540. data/ext/{ggml → sources/ggml}/src/ggml-sycl/convert.cpp +104 -28
  541. data/ext/sources/ggml/src/ggml-sycl/convert.hpp +34 -0
  542. data/ext/sources/ggml/src/ggml-sycl/cpy.cpp +700 -0
  543. data/ext/sources/ggml/src/ggml-sycl/cpy.hpp +11 -0
  544. data/ext/sources/ggml/src/ggml-sycl/dequantize.hpp +791 -0
  545. data/ext/{ggml → sources/ggml}/src/ggml-sycl/dmmv.cpp +156 -17
  546. data/ext/sources/ggml/src/ggml-sycl/dmmv.hpp +27 -0
  547. data/ext/sources/ggml/src/ggml-sycl/dpct/helper.hpp +2957 -0
  548. data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +1511 -0
  549. data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +75 -0
  550. data/ext/sources/ggml/src/ggml-sycl/gemm.hpp +99 -0
  551. data/ext/sources/ggml/src/ggml-sycl/getrows.cpp +309 -0
  552. data/ext/sources/ggml/src/ggml-sycl/getrows.hpp +20 -0
  553. data/ext/{ggml → sources/ggml}/src/ggml-sycl/ggml-sycl.cpp +1004 -1240
  554. data/ext/sources/ggml/src/ggml-sycl/gla.cpp +106 -0
  555. data/ext/sources/ggml/src/ggml-sycl/gla.hpp +8 -0
  556. data/ext/sources/ggml/src/ggml-sycl/im2col.cpp +136 -0
  557. data/ext/sources/ggml/src/ggml-sycl/im2col.hpp +21 -0
  558. data/ext/{ggml → sources/ggml}/src/ggml-sycl/mmq.cpp +0 -1
  559. data/ext/sources/ggml/src/ggml-sycl/mmq.hpp +33 -0
  560. data/ext/{ggml → sources/ggml}/src/ggml-sycl/mmvq.cpp +261 -166
  561. data/ext/sources/ggml/src/ggml-sycl/mmvq.hpp +27 -0
  562. data/ext/{ggml → sources/ggml}/src/ggml-sycl/norm.cpp +204 -81
  563. data/ext/sources/ggml/src/ggml-sycl/norm.hpp +26 -0
  564. data/ext/{ggml → sources/ggml}/src/ggml-sycl/outprod.cpp +8 -17
  565. data/ext/sources/ggml/src/ggml-sycl/outprod.hpp +10 -0
  566. data/ext/sources/ggml/src/ggml-sycl/presets.hpp +74 -0
  567. data/ext/sources/ggml/src/ggml-sycl/quants.hpp +83 -0
  568. data/ext/sources/ggml/src/ggml-sycl/rope.cpp +361 -0
  569. data/ext/sources/ggml/src/ggml-sycl/rope.hpp +20 -0
  570. data/ext/{ggml → sources/ggml}/src/ggml-sycl/softmax.cpp +35 -25
  571. data/ext/sources/ggml/src/ggml-sycl/softmax.hpp +20 -0
  572. data/ext/sources/ggml/src/ggml-sycl/sycl_hw.cpp +13 -0
  573. data/ext/sources/ggml/src/ggml-sycl/sycl_hw.hpp +23 -0
  574. data/ext/{ggml → sources/ggml}/src/ggml-sycl/tsembd.cpp +3 -3
  575. data/ext/sources/ggml/src/ggml-sycl/tsembd.hpp +20 -0
  576. data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +1215 -0
  577. data/ext/sources/ggml/src/ggml-sycl/wkv.cpp +293 -0
  578. data/ext/sources/ggml/src/ggml-sycl/wkv.hpp +10 -0
  579. data/ext/sources/ggml/src/ggml-vulkan/CMakeLists.txt +196 -0
  580. data/ext/sources/ggml/src/ggml-vulkan/cmake/host-toolchain.cmake.in +15 -0
  581. data/ext/{ggml → sources/ggml}/src/ggml-vulkan/ggml-vulkan.cpp +3130 -1087
  582. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +39 -0
  583. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/acc.comp +29 -0
  584. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add.comp +29 -0
  585. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argmax.comp +51 -0
  586. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp +69 -0
  587. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/clamp.comp +17 -0
  588. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/concat.comp +41 -0
  589. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/contig_copy.comp +49 -0
  590. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_dw.comp +105 -0
  591. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy.comp +23 -0
  592. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +51 -0
  593. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +242 -0
  594. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/cos.comp +17 -0
  595. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/count_equal.comp +31 -0
  596. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_f32.comp +20 -0
  597. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.comp +462 -0
  598. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.comp +699 -0
  599. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_head.comp +13 -0
  600. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_m.comp +42 -0
  601. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_s.comp +35 -0
  602. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_s.comp +44 -0
  603. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xs.comp +43 -0
  604. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xxs.comp +48 -0
  605. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_s.comp +39 -0
  606. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_xxs.comp +49 -0
  607. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_nl.comp +32 -0
  608. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_xs.comp +34 -0
  609. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q2_k.comp +34 -0
  610. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q3_k.comp +42 -0
  611. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_0.comp +30 -0
  612. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_1.comp +32 -0
  613. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_k.comp +68 -0
  614. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_0.comp +34 -0
  615. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_1.comp +35 -0
  616. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_k.comp +70 -0
  617. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q6_k.comp +33 -0
  618. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q8_0.comp +31 -0
  619. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/diag_mask_inf.comp +34 -0
  620. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/div.comp +27 -0
  621. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +337 -0
  622. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp +162 -0
  623. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +360 -0
  624. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +267 -0
  625. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +59 -0
  626. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gelu.comp +25 -0
  627. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gelu_quick.comp +23 -0
  628. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/generic_binary_head.comp +64 -0
  629. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/generic_head.comp +9 -0
  630. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/generic_unary_head.comp +76 -0
  631. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/get_rows.comp +33 -0
  632. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/get_rows_quant.comp +41 -0
  633. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/group_norm.comp +66 -0
  634. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +100 -0
  635. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/l2_norm.comp +41 -0
  636. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/leaky_relu.comp +22 -0
  637. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul.comp +27 -0
  638. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_split_k_reduce.comp +48 -0
  639. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp +169 -0
  640. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.comp +118 -0
  641. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_m.comp +82 -0
  642. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_s.comp +79 -0
  643. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_s.comp +90 -0
  644. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xs.comp +87 -0
  645. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xxs.comp +87 -0
  646. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_s.comp +90 -0
  647. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_xxs.comp +88 -0
  648. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_nc.comp +118 -0
  649. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_p021.comp +154 -0
  650. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp +130 -0
  651. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q3_k.comp +132 -0
  652. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp +136 -0
  653. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp +167 -0
  654. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp +130 -0
  655. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +868 -0
  656. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +441 -0
  657. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +442 -0
  658. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.comp +99 -0
  659. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/norm.comp +44 -0
  660. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_adamw.comp +42 -0
  661. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/pad.comp +28 -0
  662. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/pool2d.comp +74 -0
  663. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/quantize_q8_1.comp +77 -0
  664. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/relu.comp +21 -0
  665. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/repeat.comp +26 -0
  666. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/repeat_back.comp +37 -0
  667. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +52 -0
  668. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_back.comp +55 -0
  669. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.comp +58 -0
  670. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +60 -0
  671. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +43 -0
  672. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +43 -0
  673. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_vision.comp +47 -0
  674. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp +24 -0
  675. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sigmoid.comp +20 -0
  676. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/silu.comp +22 -0
  677. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/silu_back.comp +26 -0
  678. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sin.comp +17 -0
  679. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +173 -0
  680. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_back.comp +50 -0
  681. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/square.comp +17 -0
  682. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sub.comp +29 -0
  683. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.comp +37 -0
  684. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/tanh.comp +20 -0
  685. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/test_bfloat16_support.comp +7 -0
  686. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/test_coopmat2_support.comp +7 -0
  687. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/test_coopmat_support.comp +7 -0
  688. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/test_integer_dot_support.comp +7 -0
  689. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/timestep_embedding.comp +41 -0
  690. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/types.comp +1373 -0
  691. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +36 -0
  692. data/ext/{ggml → sources/ggml}/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +193 -35
  693. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/wkv6.comp +87 -0
  694. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/wkv7.comp +91 -0
  695. data/ext/{ggml → sources/ggml}/src/ggml.c +676 -1820
  696. data/ext/sources/ggml/src/gguf.cpp +1330 -0
  697. data/ext/{include → sources/include}/whisper.h +68 -2
  698. data/ext/sources/src/CMakeLists.txt +143 -0
  699. data/ext/{src → sources/src}/coreml/whisper-decoder-impl.h +27 -15
  700. data/ext/{src → sources/src}/coreml/whisper-decoder-impl.m +35 -10
  701. data/ext/{src → sources/src}/coreml/whisper-encoder-impl.h +21 -9
  702. data/ext/{src → sources/src}/coreml/whisper-encoder-impl.m +28 -3
  703. data/ext/sources/src/coreml/whisper-encoder.mm +73 -0
  704. data/ext/sources/src/whisper-arch.h +197 -0
  705. data/ext/{src → sources/src}/whisper.cpp +1905 -374
  706. data/ext/sources/tests/CMakeLists.txt +105 -0
  707. data/ext/sources/tests/earnings21/eval.mk +58 -0
  708. data/ext/sources/tests/earnings21/eval.py +68 -0
  709. data/ext/sources/tests/earnings21/normalizers/__init__.py +2 -0
  710. data/ext/sources/tests/earnings21/normalizers/basic.py +80 -0
  711. data/ext/sources/tests/earnings21/normalizers/english.json +1741 -0
  712. data/ext/sources/tests/earnings21/normalizers/english.py +550 -0
  713. data/ext/sources/tests/earnings21/requirements.txt +6 -0
  714. data/ext/sources/tests/en-0-ref.txt +1 -0
  715. data/ext/sources/tests/en-1-ref.txt +1 -0
  716. data/ext/sources/tests/en-2-ref.txt +1 -0
  717. data/ext/sources/tests/es-0-ref.txt +1 -0
  718. data/ext/sources/tests/librispeech/eval.mk +39 -0
  719. data/ext/sources/tests/librispeech/eval.py +47 -0
  720. data/ext/sources/tests/librispeech/normalizers/__init__.py +2 -0
  721. data/ext/sources/tests/librispeech/normalizers/basic.py +80 -0
  722. data/ext/sources/tests/librispeech/normalizers/english.json +1741 -0
  723. data/ext/sources/tests/librispeech/normalizers/english.py +550 -0
  724. data/ext/sources/tests/librispeech/requirements.txt +6 -0
  725. data/ext/sources/tests/run-tests.sh +130 -0
  726. data/ext/sources/tests/test-c.c +3 -0
  727. data/ext/sources/tests/test-vad-full.cpp +54 -0
  728. data/ext/sources/tests/test-vad.cpp +83 -0
  729. data/ext/sources/tests/test-whisper.js +58 -0
  730. data/extsources.rb +33 -5
  731. data/lib/whisper/model/uri.rb +149 -128
  732. data/sig/whisper.rbs +480 -0
  733. data/tests/helper.rb +28 -0
  734. data/tests/test_callback.rb +45 -3
  735. data/tests/test_error.rb +2 -2
  736. data/tests/test_model.rb +38 -0
  737. data/tests/test_package.rb +18 -3
  738. data/tests/test_params.rb +145 -8
  739. data/tests/test_segment.rb +10 -19
  740. data/tests/test_vad.rb +19 -0
  741. data/tests/test_vad_params.rb +103 -0
  742. data/tests/test_whisper.rb +37 -37
  743. data/whispercpp.gemspec +5 -4
  744. metadata +766 -111
  745. data/ext/cpu.mk +0 -9
  746. data/ext/examples/dr_wav.h +0 -8815
  747. data/ext/ggml/src/ggml-cann/aclnn_ops.h +0 -592
  748. data/ext/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +0 -4262
  749. data/ext/ggml/src/ggml-cpu/ggml-cpu.c +0 -14123
  750. data/ext/ggml/src/ggml-cpu/llamafile/sgemm.cpp +0 -1884
  751. data/ext/ggml/src/ggml-cpu/llamafile/sgemm.h +0 -14
  752. data/ext/ggml/src/ggml-metal/ggml-metal-impl.h +0 -288
  753. data/ext/ggml/src/ggml-sycl/element_wise.cpp +0 -1030
  754. data/ext/ggml/src/ggml-sycl/im2col.cpp +0 -126
  755. data/ext/ggml/src/ggml-sycl/rope.cpp +0 -276
  756. data/ext/ggml/src/ggml-sycl/wkv6.cpp +0 -141
  757. data/ext/metal-embed.mk +0 -17
  758. data/ext/metal.mk +0 -6
  759. data/ext/ruby_whisper.cpp +0 -1909
  760. data/ext/scripts/get-flags.mk +0 -38
  761. data/lib/whisper.rb +0 -2
  762. /data/ext/{ggml → sources/ggml}/include/ggml-blas.h +0 -0
  763. /data/ext/{ggml → sources/ggml}/include/ggml-cann.h +0 -0
  764. /data/ext/{ggml → sources/ggml}/include/ggml-cuda.h +0 -0
  765. /data/ext/{ggml → sources/ggml}/include/ggml-kompute.h +0 -0
  766. /data/ext/{ggml → sources/ggml}/include/ggml-opencl.h +0 -0
  767. /data/ext/{ggml → sources/ggml}/include/ggml-sycl.h +0 -0
  768. /data/ext/{ggml → sources/ggml}/src/ggml-amx/common.h +0 -0
  769. /data/ext/{ggml → sources/ggml}/src/ggml-amx/ggml-amx.cpp +0 -0
  770. /data/ext/{ggml → sources/ggml}/src/ggml-amx/mmq.cpp +0 -0
  771. /data/ext/{ggml → sources/ggml}/src/ggml-amx/mmq.h +0 -0
  772. /data/ext/{ggml → sources/ggml}/src/ggml-blas/ggml-blas.cpp +0 -0
  773. /data/ext/{ggml → sources/ggml}/src/ggml-cann/kernels/ascendc_kernels.h +0 -0
  774. /data/ext/{ggml → sources/ggml}/src/ggml-cann/kernels/get_row_f16.cpp +0 -0
  775. /data/ext/{ggml → sources/ggml}/src/ggml-cann/kernels/get_row_f32.cpp +0 -0
  776. /data/ext/{ggml → sources/ggml}/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -0
  777. /data/ext/{ggml → sources/ggml}/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -0
  778. /data/ext/{ggml → sources/ggml}/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -0
  779. /data/ext/{ggml → sources/ggml}/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -0
  780. /data/ext/{ggml → sources/ggml}/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -0
  781. /data/ext/{ggml → sources/ggml}/src/ggml-cpu/amx/amx.h +0 -0
  782. /data/ext/{ggml → sources/ggml}/src/ggml-cpu/amx/common.h +0 -0
  783. /data/ext/{ggml → sources/ggml}/src/ggml-cpu/amx/mmq.cpp +0 -0
  784. /data/ext/{ggml → sources/ggml}/src/ggml-cpu/amx/mmq.h +0 -0
  785. /data/ext/{ggml → sources/ggml}/src/ggml-cpu/ggml-cpu-aarch64.h +0 -0
  786. /data/ext/{ggml → sources/ggml}/src/ggml-cpu/ggml-cpu-hbm.cpp +0 -0
  787. /data/ext/{ggml → sources/ggml}/src/ggml-cpu/ggml-cpu-hbm.h +0 -0
  788. /data/ext/{ggml → sources/ggml}/src/ggml-cpu/ggml-cpu-quants.h +0 -0
  789. /data/ext/{ggml → sources/ggml}/src/ggml-cpu/ggml-cpu-traits.cpp +0 -0
  790. /data/ext/{ggml → sources/ggml}/src/ggml-cpu/ggml-cpu-traits.h +0 -0
  791. /data/ext/{ggml → sources/ggml}/src/ggml-kompute/ggml-kompute.cpp +0 -0
  792. /data/ext/{ggml → sources/ggml}/src/ggml-quants.h +0 -0
  793. /data/ext/{ggml → sources/ggml}/src/ggml-threading.cpp +0 -0
  794. /data/ext/{ggml → sources/ggml}/src/ggml-threading.h +0 -0
  795. /data/ext/{src → sources/src}/coreml/whisper-encoder.h +0 -0
  796. /data/ext/{src → sources/src}/openvino/whisper-openvino-encoder.cpp +0 -0
  797. /data/ext/{src → sources/src}/openvino/whisper-openvino-encoder.h +0 -0
@@ -29,11 +29,14 @@
29
29
  #include <cstdio>
30
30
  #include <cstring>
31
31
  #include <mutex>
32
+ #include <queue>
33
+ #include <chrono>
32
34
 
33
35
  #include "ggml-impl.h"
34
36
  #include "ggml-backend-impl.h"
35
37
  #include "ggml-cann/aclnn_ops.h"
36
38
  #include "ggml-cann/common.h"
39
+ #include "ggml.h"
37
40
 
38
41
  #define GGML_COMMON_DECL_C
39
42
 
@@ -119,9 +122,10 @@ static ggml_cann_device_info ggml_cann_init() {
119
122
  prop.location.type = ACL_MEM_LOCATION_TYPE_DEVICE;
120
123
  prop.location.id = id;
121
124
  prop.reserve = 0;
122
- ACL_CHECK(aclrtMemGetAllocationGranularity(
125
+ err = aclrtMemGetAllocationGranularity(
123
126
  &prop, ACL_RT_MEM_ALLOC_GRANULARITY_RECOMMENDED,
124
- &info.devices[id].vmm_granularity));
127
+ &info.devices[id].vmm_granularity);
128
+ info.devices[id].vmm = err == ACL_SUCCESS;
125
129
 
126
130
  size_t free, total;
127
131
  ggml_backend_cann_get_device_memory(id, &free, &total);
@@ -148,11 +152,223 @@ const ggml_cann_device_info& ggml_cann_info() {
148
152
 
149
153
  //#define DEBUG_CANN_MALLOC
150
154
  /**
151
- * @brief A pool of CANN buffers(legacy).
155
+ * @brief A pool of CANN buffers(priority segment buffer).
152
156
  *
153
157
  * This class manages a pool of CANN buffers for a specific device.
154
158
  */
155
- struct ggml_cann_pool_leg : public ggml_cann_pool {
159
+ struct ggml_cann_pool_buf_prio : public ggml_cann_pool {
160
+ /**
161
+ * @brief The maximum reuse margin for a buffer.
162
+ */
163
+ static const size_t max_reuse_margin = 1ull << 22; // 4MB
164
+
165
+ /**
166
+ * @brief The minimum free margin for a buffer.
167
+ */
168
+ static const size_t min_free_margin = 1ull << 20; // 1MB
169
+
170
+ /**
171
+ * @brief The alignment for buffer allocation.
172
+ */
173
+ static const size_t alignment = 128;
174
+
175
+ /**
176
+ * @brief The device ID associated with this buffer pool.
177
+ */
178
+ int device;
179
+
180
+ /**
181
+ * @brief Whether to disable clean during buffer allocation.
182
+ */
183
+ bool disable_clean = false;
184
+
185
+ /**
186
+ * @brief Structure representing a CANN buffer.
187
+ */
188
+ struct ggml_cann_buffer {
189
+ void* ptr = nullptr; ///< Pointer to the buffer.
190
+ size_t size = 0; ///< Size of the buffer.
191
+ std::chrono::steady_clock::time_point last_used; ///< Last used time.
192
+
193
+ bool operator>(const ggml_cann_buffer& other) const {
194
+ return size > other.size;
195
+ }
196
+ };
197
+
198
+ /**
199
+ * @brief Array of CANN buffers in the pool.
200
+ */
201
+ std::unordered_map<void*, size_t> buffer_pool;
202
+ std::priority_queue<ggml_cann_buffer,
203
+ std::vector<ggml_cann_buffer>,
204
+ std::greater<>> free_buffers ;
205
+
206
+ /**
207
+ * @brief Total size of all buffers in the pool.
208
+ */
209
+ size_t pool_size = 0;
210
+
211
+ /**
212
+ * @brief Constructor to initialize the buffer pool for a specific device.
213
+ *
214
+ * @param device The device ID to associate with this buffer pool.
215
+ */
216
+ explicit ggml_cann_pool_buf_prio(int device) : device(device) {
217
+ disable_clean = getenv("GGML_CANN_DISABLE_BUF_POOL_CLEAN") != nullptr;
218
+ }
219
+
220
+ /**
221
+ * @brief Destructor to free all buffers in the pool.
222
+ */
223
+ ~ggml_cann_pool_buf_prio() {
224
+ ggml_cann_set_device(device);
225
+ for (auto& [b_ptr, b_size] : buffer_pool) {
226
+ aclrtFree(b_ptr);
227
+ pool_size -= b_size;
228
+ }
229
+ buffer_pool.clear();
230
+ GGML_ASSERT(pool_size == 0);
231
+ }
232
+
233
+ /**
234
+ * @brief Allocate a buffer of the given size.
235
+ *
236
+ * @param size The size of the buffer to allocate.
237
+ * @param actual_size A pointer to a variable to receive the actual size of
238
+ * the allocated buffer.
239
+ * @return A pointer to the allocated buffer.
240
+ */
241
+ void* alloc(size_t size, size_t* actual_size) override {
242
+ size = GGML_PAD(size, alignment);
243
+ if (size == 0) {
244
+ size = alignment;
245
+ }
246
+
247
+ void* ptr = nullptr;
248
+ auto now = std::chrono::steady_clock::now();
249
+
250
+ std::vector<ggml_cann_buffer> free_buffers_rest;
251
+ free_buffers_rest.reserve(free_buffers.size());
252
+ while (!free_buffers.empty()) {
253
+ auto b = free_buffers.top();
254
+ free_buffers.pop();
255
+
256
+ if (b.size >= size) {
257
+ // reuse the buffer if the size is enough
258
+ const size_t margin = b.size - size;
259
+ if (margin <= max_reuse_margin) {
260
+ *actual_size = b.size;
261
+ ptr = b.ptr;
262
+ #ifdef DEBUG_CANN_MALLOC
263
+ GGML_LOG_INFO(
264
+ "cann pool[%d]: reused %p, "
265
+ "pool_size = %5u MB, "
266
+ "size = %5u MB, "
267
+ "margin = %5u MB\n",
268
+ device, b.ptr,
269
+ (uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576),
270
+ (uint32_t)(GGML_PAD(size, 1048576) / 1048576),
271
+ (uint32_t)(GGML_PAD(margin, 1048576) / 1048576));
272
+ #endif
273
+ break;
274
+ }
275
+ }
276
+
277
+ bool should_clean = !disable_clean &&
278
+ b.size > min_free_margin &&
279
+ std::chrono::duration_cast<std::chrono::milliseconds>(now - b.last_used).count() > 100;
280
+ if (should_clean) {
281
+ // free the buffer if the size is needed to be freed
282
+ ACL_CHECK(aclrtFree(b.ptr));
283
+ pool_size -= b.size;
284
+ buffer_pool.erase(b.ptr);
285
+ #ifdef DEBUG_CANN_MALLOC
286
+ GGML_LOG_INFO(
287
+ "cann pool[%d]: clean %p, "
288
+ "pool_size = %5u MB, "
289
+ "size = %5u MB\n",
290
+ device, b.ptr,
291
+ (uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576),
292
+ (uint32_t)(GGML_PAD(b.size, 1048576) / 1048576));
293
+ #endif
294
+ continue;
295
+ }
296
+ free_buffers_rest.push_back(b);
297
+ }
298
+ for (ggml_cann_buffer &b : free_buffers_rest) {
299
+ free_buffers.push(std::move(b));
300
+ }
301
+
302
+ #ifdef DEBUG_CANN_MALLOC
303
+ GGML_LOG_INFO("cann pool[%d] free pool_size = %5u MB\n\n", device, (uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576));
304
+ #endif
305
+ if (ptr != nullptr) {
306
+ return ptr;
307
+ }
308
+
309
+ // allocate a new buffer if no buffer can be reused
310
+ ggml_cann_set_device(device);
311
+ ACL_CHECK(aclrtMalloc(&ptr, size, ACL_MEM_MALLOC_HUGE_FIRST));
312
+ *actual_size = size;
313
+ pool_size += size;
314
+ #ifdef DEBUG_CANN_MALLOC
315
+ GGML_LOG_INFO(
316
+ "cann pool[%d]: allocate %p, "
317
+ "pool_size = %5u MB, "
318
+ "size = %5u MB\n",
319
+ device, ptr, (uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576),
320
+ (uint32_t)(GGML_PAD(size, 1048576) / 1048576));
321
+ #endif
322
+ buffer_pool.emplace(ptr, size);
323
+ return ptr;
324
+ }
325
+
326
+ /**
327
+ * @brief Free a buffer and return it to the pool.
328
+ *
329
+ * @param ptr Pointer to the buffer to free.
330
+ * @param size Size of the buffer to free.
331
+ */
332
+ void free(void* ptr, size_t size) override {
333
+ GGML_UNUSED(size);
334
+ auto it = buffer_pool.find(ptr);
335
+ if (it == buffer_pool.end()) {
336
+ GGML_ABORT("cann pool[%d]: buffer %p not found in pool\n", device, ptr);
337
+ }
338
+
339
+ auto now = std::chrono::steady_clock::now();
340
+ free_buffers.emplace(ggml_cann_buffer{ptr, it->second, now});
341
+ #ifdef DEBUG_CANN_MALLOC
342
+ GGML_LOG_INFO(
343
+ "cann pool[%d]: return %p, "
344
+ "pool_size = %5u MB\n",
345
+ device, ptr,
346
+ (uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576));
347
+ #endif
348
+ }
349
+ };
350
+
351
+ /**
352
+ * @brief A pool of CANN buffers(segment buffer).
353
+ *
354
+ * This class manages a pool of CANN buffers for a specific device.
355
+ */
356
+ struct ggml_cann_pool_buf : public ggml_cann_pool {
357
+ /**
358
+ * @brief The maximum reuse margin for a buffer.
359
+ */
360
+ static const size_t max_reuse_margin = 1ull << 22; // 4MB
361
+
362
+ /**
363
+ * @brief The minimum free margin for a buffer.
364
+ */
365
+ static const size_t min_free_margin = 1ull << 20; // 1MB
366
+
367
+ /**
368
+ * @brief The alignment for buffer allocation.
369
+ */
370
+ static const size_t alignment = 128;
371
+
156
372
  /**
157
373
  * @brief The maximum number of buffers in the pool.
158
374
  */
@@ -163,12 +379,19 @@ struct ggml_cann_pool_leg : public ggml_cann_pool {
163
379
  */
164
380
  int device;
165
381
 
382
+ /**
383
+ * @brief Whether to disable clean during buffer allocation.
384
+ */
385
+ bool disable_clean = false;
386
+
166
387
  /**
167
388
  * @brief Structure representing a CANN buffer.
168
389
  */
169
390
  struct ggml_cann_buffer {
170
391
  void* ptr = nullptr; ///< Pointer to the buffer memory.
171
392
  size_t size = 0; ///< Size of the buffer.
393
+ bool used = false; ///< Whether the buffer is currently in use.
394
+ std::chrono::steady_clock::time_point last_used; ///< Last used time.
172
395
  };
173
396
 
174
397
  /**
@@ -186,17 +409,19 @@ struct ggml_cann_pool_leg : public ggml_cann_pool {
186
409
  *
187
410
  * @param device The device ID to associate with this buffer pool.
188
411
  */
189
- explicit ggml_cann_pool_leg(int device) : device(device) {}
412
+ explicit ggml_cann_pool_buf(int device) : device(device) {
413
+ disable_clean = getenv("GGML_CANN_DISABLE_BUF_POOL_CLEAN") != nullptr;
414
+ }
190
415
 
191
416
  /**
192
417
  * @brief Destructor to free all buffers in the pool.
193
418
  */
194
- ~ggml_cann_pool_leg() {
419
+ ~ggml_cann_pool_buf() {
195
420
  ggml_cann_set_device(device);
196
421
  for (int i = 0; i < MAX_BUFFERS; ++i) {
197
422
  ggml_cann_buffer& b = buffer_pool[i];
198
423
  if (b.ptr != nullptr) {
199
- ACL_CHECK(aclrtFree(b.ptr));
424
+ aclrtFree(b.ptr);
200
425
  pool_size -= b.size;
201
426
  }
202
427
  }
@@ -212,63 +437,93 @@ struct ggml_cann_pool_leg : public ggml_cann_pool {
212
437
  * @return A pointer to the allocated buffer.
213
438
  */
214
439
  void* alloc(size_t size, size_t* actual_size) override {
215
- const size_t alignment = 128;
216
440
  size = GGML_PAD(size, alignment);
217
441
  if (size == 0) {
218
442
  size = alignment;
219
443
  }
220
- #ifdef DEBUG_CANN_MALLOC
221
- int nnz = 0;
222
- size_t max_size = 0;
223
- #endif
224
- size_t best_diff = 1ull << 36;
225
- int ibest = -1;
226
- for (int i = 0; i < MAX_BUFFERS; ++i) {
444
+
445
+ void* ptr = nullptr;
446
+ auto now = std::chrono::steady_clock::now();
447
+
448
+ int i = 0;
449
+ for (; i < MAX_BUFFERS; ++i) {
227
450
  ggml_cann_buffer& b = buffer_pool[i];
228
- if (b.ptr != nullptr) {
451
+ if (b.ptr == nullptr) {
452
+ break;
453
+ }
454
+ if (b.used) {
455
+ continue;
456
+ }
457
+ if (b.size >= size) {
458
+ // reuse the buffer if the size is enough
459
+ const size_t margin = b.size - size;
460
+ if (margin <= max_reuse_margin) {
461
+ *actual_size = b.size;
462
+ b.used = true;
463
+ ptr = b.ptr;
229
464
  #ifdef DEBUG_CANN_MALLOC
230
- ++nnz;
231
- if (b.size > max_size) max_size = b.size;
465
+ GGML_LOG_INFO(
466
+ "cann pool[%d]: reused %p, "
467
+ "pool_size = %5u MB, "
468
+ "size = %5u MB, "
469
+ "margin = %5u MB\n",
470
+ device, b.ptr,
471
+ (uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576),
472
+ (uint32_t)(GGML_PAD(size, 1048576) / 1048576),
473
+ (uint32_t)(GGML_PAD(margin, 1048576) / 1048576));
232
474
  #endif
233
- if (b.size >= size) {
234
- size_t diff = b.size - size;
235
- if (diff < best_diff) {
236
- best_diff = diff;
237
- ibest = i;
238
- if (!best_diff) {
239
- void* ptr = b.ptr;
240
- *actual_size = b.size;
241
- b.ptr = nullptr;
242
- b.size = 0;
243
- return ptr;
244
- }
245
- }
475
+ break;
246
476
  }
247
477
  }
478
+
479
+ bool should_clean = !disable_clean &&
480
+ b.size > min_free_margin &&
481
+ std::chrono::duration_cast<std::chrono::milliseconds>(now - b.last_used).count() > 100;
482
+ if (should_clean) {
483
+ // free the buffer if the size is needed to be freed
484
+ ACL_CHECK(aclrtFree(b.ptr));
485
+ pool_size -= b.size;
486
+ #ifdef DEBUG_CANN_MALLOC
487
+ GGML_LOG_INFO(
488
+ "cann pool[%d]: clean %p, "
489
+ "pool_size = %5u MB, "
490
+ "size = %5u MB\n",
491
+ device, b.ptr,
492
+ (uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576),
493
+ (uint32_t)(GGML_PAD(b.size, 1048576) / 1048576));
494
+ #endif
495
+ b.ptr = nullptr;
496
+ }
248
497
  }
249
- if (ibest >= 0) {
250
- ggml_cann_buffer& b = buffer_pool[ibest];
251
- void* ptr = b.ptr;
252
- *actual_size = b.size;
253
- b.ptr = nullptr;
254
- b.size = 0;
498
+ if (ptr != nullptr) {
255
499
  return ptr;
256
500
  }
257
- void* ptr;
258
- ggml_cann_set_device(device);
259
- ACL_CHECK(
260
- aclrtMalloc(&ptr, size, ACL_MEM_MALLOC_HUGE_FIRST));
261
- *actual_size = size;
262
- pool_size += size;
501
+
502
+ if (i < MAX_BUFFERS) {
503
+ // allocate a new buffer if no buffer can be reused
504
+ ggml_cann_buffer& b = buffer_pool[i];
505
+ ggml_cann_set_device(device);
506
+ ACL_CHECK(aclrtMalloc(&b.ptr, size, ACL_MEM_MALLOC_HUGE_FIRST));
507
+ pool_size += size;
508
+ *actual_size = size;
509
+ b.size = size;
510
+ b.used = true;
511
+ if (i >= MAX_BUFFERS - 8) {
512
+ GGML_LOG_WARN("cann pool[%d]: slots almost full\n", device);
513
+ }
263
514
  #ifdef DEBUG_CANN_MALLOC
264
- GGML_LOG_INFO(
265
- "%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, "
266
- "requested %u MB\n",
267
- __func__, device, nnz, (uint32_t)(max_size / 1024 / 1024),
268
- (uint32_t)(pool_size / 1024 / 1024),
269
- (uint32_t)(size / 1024 / 1024));
515
+ GGML_LOG_INFO(
516
+ "cann pool[%d]: allocate %p, "
517
+ "pool_size = %5u MB, "
518
+ "size = %5u MB\n",
519
+ device, b.ptr,
520
+ (uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576),
521
+ (uint32_t)(GGML_PAD(b.size, 1048576) / 1048576));
270
522
  #endif
271
- return ptr;
523
+ return b.ptr;
524
+ }
525
+
526
+ GGML_ABORT("cann pool[%d]: slots full\n", device);
272
527
  }
273
528
 
274
529
  /**
@@ -278,18 +533,24 @@ struct ggml_cann_pool_leg : public ggml_cann_pool {
278
533
  * @param size Size of the buffer to free.
279
534
  */
280
535
  void free(void* ptr, size_t size) override {
536
+ GGML_UNUSED(size);
281
537
  for (int i = 0; i < MAX_BUFFERS; ++i) {
282
538
  ggml_cann_buffer& b = buffer_pool[i];
283
- if (b.ptr == nullptr) {
284
- b.ptr = ptr;
285
- b.size = size;
286
- return;
539
+ if (b.ptr != ptr) {
540
+ continue;
287
541
  }
542
+ b.used = false;
543
+ b.last_used = std::chrono::steady_clock::now();
544
+ #ifdef DEBUG_CANN_MALLOC
545
+ GGML_LOG_INFO(
546
+ "cann pool[%d]: return %p, "
547
+ "pool_size = %5u MB\n",
548
+ device, b.ptr,
549
+ (uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576));
550
+ #endif
551
+ return;
288
552
  }
289
- // memory should always buffered. these memory may still needed by
290
- // tasks in stream.
291
- // TODO, fix me.
292
- GGML_ABORT("Cann buffer pool full, increase MAX_CANN_BUFFERS\n");
553
+ GGML_ABORT("cann pool[%d]: slots full\n", device);
293
554
  }
294
555
  };
295
556
 
@@ -347,8 +608,7 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
347
608
  * @param device The device ID to associate with this buffer pool.
348
609
  */
349
610
  explicit ggml_cann_pool_vmm(int device)
350
- : device(device),
351
- granularity(ggml_cann_info().devices[device].vmm_granularity) {
611
+ : device(device) {
352
612
  auto dev = ggml_cann_info().devices[device];
353
613
  granularity = dev.vmm_granularity;
354
614
  max_size = dev.total_vram;
@@ -471,7 +731,18 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
471
731
  */
472
732
  std::unique_ptr<ggml_cann_pool> ggml_backend_cann_context::new_pool_for_device(
473
733
  int device) {
474
- return std::unique_ptr<ggml_cann_pool>(new ggml_cann_pool_vmm(device));
734
+ bool disable_vmm = (getenv("GGML_CANN_DISABLE_VMM_POOL") != nullptr);
735
+ if (!disable_vmm && ggml_cann_info().devices[device].vmm) {
736
+ GGML_LOG_INFO("%s: device %d use vmm pool\n", __func__, device);
737
+ return std::unique_ptr<ggml_cann_pool>(new ggml_cann_pool_vmm(device));
738
+ }
739
+ bool enable_buf_prio = (getenv("GGML_CANN_ENABLE_BUF_PRIO_POOL") != nullptr);
740
+ if (enable_buf_prio) {
741
+ GGML_LOG_INFO("%s: device %d use buffer pool with priority queue\n", __func__, device);
742
+ return std::unique_ptr<ggml_cann_pool>(new ggml_cann_pool_buf_prio(device));
743
+ }
744
+ GGML_LOG_INFO("%s: device %d use buffer pool\n", __func__, device);
745
+ return std::unique_ptr<ggml_cann_pool>(new ggml_cann_pool_buf(device));
475
746
  }
476
747
 
477
748
  // cann buffer
@@ -796,14 +1067,14 @@ static bool need_transform(ggml_type type) {
796
1067
  * @param buffer The CANN buffer from which to initialize the tensor.
797
1068
  * @param tensor Pointer to the tensor to be initialized.
798
1069
  */
799
- static void ggml_backend_cann_buffer_init_tensor(
1070
+ static enum ggml_status ggml_backend_cann_buffer_init_tensor(
800
1071
  ggml_backend_buffer_t buffer, ggml_tensor* tensor) {
801
1072
  if (tensor->view_src != NULL && tensor->view_offs == 0) {
802
1073
  GGML_ASSERT(tensor->view_src->buffer->buft == buffer->buft);
803
- return;
1074
+ return GGML_STATUS_SUCCESS;
804
1075
  }
805
1076
 
806
- // TODO: can backend doesn't support quantized yet. Just leave the code
1077
+ // TODO: cann backend doesn't support quantized yet. Just leave the code
807
1078
  // here.
808
1079
  if (ggml_is_quantized(tensor->type)) {
809
1080
  // Initialize padding to 0 to avoid possible NaN values
@@ -817,6 +1088,7 @@ static void ggml_backend_cann_buffer_init_tensor(
817
1088
  memset_size, 0, memset_size));
818
1089
  }
819
1090
  }
1091
+ return GGML_STATUS_SUCCESS;
820
1092
  }
821
1093
 
822
1094
  // TODO: need handle tensor which has paddings.
@@ -1019,8 +1291,11 @@ ggml_backend_cann_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
1019
1291
 
1020
1292
  ggml_cann_set_device(buft_ctx->device);
1021
1293
 
1022
- size = std::max(size, (size_t)1);
1023
-
1294
+ const size_t alignment = 128;
1295
+ size = GGML_PAD(size, alignment);
1296
+ if (size == 0) {
1297
+ size = alignment;
1298
+ }
1024
1299
  void* dev_ptr;
1025
1300
  aclError err = aclrtMalloc(&dev_ptr, size, ACL_MEM_MALLOC_HUGE_FIRST);
1026
1301
  if (err != ACL_SUCCESS) {
@@ -1299,47 +1574,69 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
1299
1574
  ggml_cann_dup(ctx, dst);
1300
1575
  break;
1301
1576
  case GGML_OP_ADD:
1302
- ggml_cann_add(ctx, dst);
1577
+ case GGML_OP_ADD1:
1578
+ ggml_cann_binary_op<aclnn_add>(ctx, dst);
1579
+ break;
1580
+ case GGML_OP_SUB:
1581
+ ggml_cann_binary_op<aclnn_sub>(ctx, dst);
1303
1582
  break;
1304
1583
  case GGML_OP_ACC:
1305
1584
  ggml_cann_acc(ctx, dst);
1306
1585
  break;
1307
1586
  case GGML_OP_MUL:
1308
- ggml_cann_mul_div<aclnnMulGetWorkspaceSize, aclnnMul>(ctx, dst);
1587
+ ggml_cann_binary_op<aclnn_mul>(ctx, dst);
1309
1588
  break;
1310
1589
  case GGML_OP_DIV:
1311
- ggml_cann_mul_div<aclnnDivGetWorkspaceSize, aclnnDiv>(ctx, dst);
1590
+ ggml_cann_binary_op<aclnn_div>(ctx, dst);
1312
1591
  break;
1313
1592
  case GGML_OP_UNARY:
1314
1593
  switch (ggml_get_unary_op(dst)) {
1594
+ case GGML_UNARY_OP_ABS:
1595
+ GGML_CANN_CALL_UNARY_OP(Abs);
1596
+ break;
1597
+ case GGML_UNARY_OP_NEG:
1598
+ GGML_CANN_CALL_UNARY_OP(Neg);
1599
+ break;
1315
1600
  case GGML_UNARY_OP_GELU:
1316
- ggml_cann_activation<aclnnGeluGetWorkspaceSize, aclnnGelu>(
1317
- ctx, dst);
1601
+ GGML_CANN_CALL_UNARY_OP(Gelu);
1318
1602
  break;
1319
1603
  case GGML_UNARY_OP_SILU:
1320
- ggml_cann_activation<aclnnSiluGetWorkspaceSize, aclnnSilu>(
1321
- ctx, dst);
1322
- break;
1323
- // TODO: Use faster gelu??
1324
- case GGML_UNARY_OP_GELU_QUICK:
1325
- ggml_cann_activation<aclnnGeluGetWorkspaceSize, aclnnGelu>(
1326
- ctx, dst);
1604
+ GGML_CANN_CALL_UNARY_OP(Silu);
1327
1605
  break;
1606
+ case GGML_UNARY_OP_GELU_QUICK: {
1607
+ auto lambda = [](ggml_backend_cann_context& ctx,
1608
+ aclTensor* acl_src,
1609
+ aclTensor* acl_dst) {
1610
+ GGML_CANN_CALL_ACLNN_OP(ctx, GeluV2, acl_src, 0, acl_dst);
1611
+ };
1612
+ ggml_cann_unary_op(lambda, ctx, dst);
1613
+ } break;
1328
1614
  case GGML_UNARY_OP_TANH:
1329
- ggml_cann_activation<aclnnTanhGetWorkspaceSize, aclnnTanh>(
1330
- ctx, dst);
1615
+ GGML_CANN_CALL_UNARY_OP(Tanh);
1331
1616
  break;
1332
1617
  case GGML_UNARY_OP_RELU:
1333
- ggml_cann_activation<aclnnReluGetWorkspaceSize, aclnnRelu>(
1334
- ctx, dst);
1618
+ GGML_CANN_CALL_UNARY_OP(Relu);
1619
+ break;
1620
+ case GGML_UNARY_OP_SIGMOID:
1621
+ GGML_CANN_CALL_UNARY_OP(Sigmoid);
1335
1622
  break;
1336
1623
  case GGML_UNARY_OP_HARDSIGMOID:
1337
- ggml_cann_activation<aclnnHardsigmoidGetWorkspaceSize,
1338
- aclnnHardsigmoid>(ctx, dst);
1624
+ GGML_CANN_CALL_UNARY_OP(Hardsigmoid);
1339
1625
  break;
1340
1626
  case GGML_UNARY_OP_HARDSWISH:
1341
- ggml_cann_activation<aclnnHardswishGetWorkspaceSize,
1342
- aclnnHardswish>(ctx, dst);
1627
+ GGML_CANN_CALL_UNARY_OP(Hardswish);
1628
+ break;
1629
+ case GGML_UNARY_OP_EXP:
1630
+ GGML_CANN_CALL_UNARY_OP(Exp);
1631
+ break;
1632
+ case GGML_UNARY_OP_ELU:
1633
+ ggml_cann_elu(ctx, dst);
1634
+ break;
1635
+ case GGML_UNARY_OP_SGN:
1636
+ GGML_CANN_CALL_UNARY_OP(Sign);
1637
+ break;
1638
+ case GGML_UNARY_OP_STEP:
1639
+ ggml_cann_step(ctx, dst);
1343
1640
  break;
1344
1641
  default:
1345
1642
  return false;
@@ -1376,12 +1673,18 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
1376
1673
  ggml_cann_mul_mat(ctx, dst);
1377
1674
  break;
1378
1675
  case GGML_OP_MUL_MAT_ID:
1379
- return false;
1676
+ ggml_cann_mul_mat_id(ctx, dst);
1677
+ break;
1380
1678
  case GGML_OP_SCALE:
1381
1679
  ggml_cann_scale(ctx, dst);
1382
1680
  break;
1383
1681
  case GGML_OP_SQR:
1384
- ggml_cann_sqr(ctx, dst);
1682
+ GGML_ASSERT(dst->src[1] == nullptr);
1683
+ dst->src[1] = dst->src[0];
1684
+ ggml_cann_binary_op<aclnn_mul>(ctx, dst);
1685
+ break;
1686
+ case GGML_OP_SQRT:
1687
+ GGML_CANN_CALL_UNARY_OP(Sqrt);
1385
1688
  break;
1386
1689
  case GGML_OP_CLAMP:
1387
1690
  ggml_cann_clamp(ctx, dst);
@@ -1413,12 +1716,42 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
1413
1716
  case GGML_OP_POOL_2D:
1414
1717
  ggml_cann_pool2d(ctx, dst);
1415
1718
  break;
1719
+ case GGML_OP_SUM:
1720
+ ggml_cann_sum(ctx, dst);
1721
+ break;
1416
1722
  case GGML_OP_SUM_ROWS:
1417
1723
  ggml_cann_sum_rows(ctx, dst);
1418
1724
  break;
1419
1725
  case GGML_OP_ARGSORT:
1420
1726
  ggml_cann_argsort(ctx, dst);
1421
1727
  break;
1728
+ case GGML_OP_ARGMAX:
1729
+ ggml_cann_argmax(ctx, dst);
1730
+ break;
1731
+ case GGML_OP_COS:
1732
+ ggml_cann_unary_op<aclnn_cos>(ctx, dst);
1733
+ break;
1734
+ case GGML_OP_SIN:
1735
+ ggml_cann_unary_op<aclnn_sin>(ctx, dst);
1736
+ break;
1737
+ case GGML_OP_CONV_TRANSPOSE_1D:
1738
+ ggml_cann_conv_transpose_1d(ctx, dst);
1739
+ break;
1740
+ case GGML_OP_LOG:
1741
+ GGML_CANN_CALL_UNARY_OP(Log);
1742
+ break;
1743
+ case GGML_OP_MEAN:
1744
+ ggml_cann_mean(ctx, dst);
1745
+ break;
1746
+ case GGML_OP_PAD_REFLECT_1D:
1747
+ ggml_cann_pad_reflect_1d(ctx, dst);
1748
+ break;
1749
+ case GGML_OP_COUNT_EQUAL:
1750
+ ggml_cann_count_equal(ctx, dst);
1751
+ break;
1752
+ case GGML_OP_FLASH_ATTN_EXT:
1753
+ ggml_cann_flash_attn_ext(ctx, dst);
1754
+ break;
1422
1755
  default:
1423
1756
  return false;
1424
1757
  }
@@ -1457,21 +1790,15 @@ static void ggml_backend_cann_free(ggml_backend_t backend) {
1457
1790
  ACL_CHECK(aclrtSynchronizeDevice());
1458
1791
  ACL_CHECK(aclrtResetDevice(cann_ctx->device));
1459
1792
 
1460
- // finalize when last backend freed.
1461
- if (cann_ctx->device == ggml_backend_cann_get_device_count() - 1) {
1462
- ACL_CHECK(aclFinalize());
1463
- }
1464
-
1465
1793
  delete cann_ctx;
1466
1794
  delete backend;
1467
1795
  }
1468
1796
 
1797
+
1469
1798
  /**
1470
1799
  * @brief Sets tensor data asynchronously in the CANN backend.
1471
1800
  *
1472
- * This function asynchronously sets tensor data in the CANN backend. Depending
1473
- * on the tensor type, it may perform data transformations before copying data
1474
- * to the device.
1801
+ * This function asynchronously sets tensor data in the CANN backend.
1475
1802
  *
1476
1803
  * @param backend Pointer to the CANN backend structure.
1477
1804
  * @param tensor Pointer to the tensor structure to set data for.
@@ -1486,23 +1813,28 @@ static void ggml_backend_cann_set_tensor_async(ggml_backend_t backend,
1486
1813
  size_t size) {
1487
1814
  ggml_backend_cann_context *cann_ctx =
1488
1815
  (ggml_backend_cann_context *)backend->context;
1816
+ ggml_backend_buffer_t buf =
1817
+ tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
1489
1818
 
1490
- if (!need_transform(tensor->type)) {
1491
- ACL_CHECK(aclrtMemcpyAsync((char *)tensor->data + offset, size, data,
1492
- size, ACL_MEMCPY_HOST_TO_DEVICE,
1493
- cann_ctx->stream()));
1494
- } else {
1495
- void *transform_buffer = malloc(size);
1496
- ggml_backend_cann_transform(tensor, data, transform_buffer);
1819
+ GGML_ASSERT(buf->buft == ggml_backend_cann_buffer_type(cann_ctx->device) &&
1820
+ "unsupported buffer type");
1821
+ GGML_ASSERT(!ggml_is_quantized(tensor->type));
1497
1822
 
1498
- ACL_CHECK(aclrtMemcpyAsync(
1499
- (char *)tensor->data + offset, size, transform_buffer, size,
1500
- ACL_MEMCPY_HOST_TO_DEVICE, cann_ctx->stream()));
1501
- ACL_CHECK(aclrtSynchronizeStream(cann_ctx->stream()));
1502
- free(transform_buffer);
1503
- }
1823
+ ggml_cann_async_memcpy(cann_ctx, (char *)tensor->data + offset, data, size,
1824
+ ACL_MEMCPY_HOST_TO_DEVICE);
1504
1825
  }
1505
1826
 
1827
+ /**
1828
+ * @brief Gets tensor data asynchronously in the CANN backend.
1829
+ *
1830
+ * This function asynchronously gets tensor data in the CANN backend.
1831
+ *
1832
+ * @param backend Pointer to the CANN backend structure.
1833
+ * @param tensor Pointer to the tensor structure to get data from.
1834
+ * @param data Pointer to the host data to copy from the tensor.
1835
+ * @param offset Offset in bytes within the host data.
1836
+ * @param size Size of the data to copy in bytes.
1837
+ */
1506
1838
  static void ggml_backend_cann_get_tensor_async(
1507
1839
  ggml_backend_t backend, const ggml_tensor *tensor, void *data,
1508
1840
  size_t offset, size_t size) {
@@ -1513,20 +1845,11 @@ static void ggml_backend_cann_get_tensor_async(
1513
1845
 
1514
1846
  GGML_ASSERT(buf->buft == ggml_backend_cann_buffer_type(cann_ctx->device) &&
1515
1847
  "unsupported buffer type");
1848
+ GGML_ASSERT(!ggml_is_quantized(tensor->type));
1849
+
1850
+ ggml_cann_async_memcpy(cann_ctx, data, (char *)tensor->data + offset, size,
1851
+ ACL_MEMCPY_DEVICE_TO_HOST);
1516
1852
 
1517
- if (!need_transform(tensor->type)) {
1518
- ACL_CHECK(aclrtMemcpyAsync(data, size, (char *)tensor->data + offset,
1519
- size, ACL_MEMCPY_DEVICE_TO_HOST,
1520
- cann_ctx->stream()));
1521
- } else {
1522
- void *transform_buffer = malloc(size);
1523
- ACL_CHECK(aclrtMemcpyAsync(
1524
- transform_buffer, size, (char *)tensor->data + offset, size,
1525
- ACL_MEMCPY_DEVICE_TO_HOST, cann_ctx->stream()));
1526
- ACL_CHECK(aclrtSynchronizeStream(cann_ctx->stream()));
1527
- ggml_backend_cann_transform_back(tensor, transform_buffer, data);
1528
- free(transform_buffer);
1529
- }
1530
1853
  }
1531
1854
 
1532
1855
  /**
@@ -1586,6 +1909,8 @@ static bool ggml_backend_cann_cpy_tensor_async(
1586
1909
  ggml_cann_set_device(cann_ctx_src->device);
1587
1910
  ACL_CHECK(aclrtDeviceEnablePeerAccess(cann_ctx_dst->device, 0));
1588
1911
 
1912
+ // wait for task_queue empty to keep task order.
1913
+ cann_ctx_src->task_queue.wait();
1589
1914
  ACL_CHECK(aclrtMemcpyAsync(dst->data, copy_size, src->data, copy_size,
1590
1915
  ACL_MEMCPY_DEVICE_TO_DEVICE,
1591
1916
  cann_ctx_src->stream()));
@@ -1613,9 +1938,8 @@ static bool ggml_backend_cann_cpy_tensor_async(
1613
1938
  static void ggml_backend_cann_synchronize(ggml_backend_t backend) {
1614
1939
  ggml_backend_cann_context* cann_ctx =
1615
1940
  (ggml_backend_cann_context*)backend->context;
1616
-
1941
+ cann_ctx->task_queue.wait();
1617
1942
  ggml_cann_set_device(cann_ctx->device);
1618
-
1619
1943
  ACL_CHECK(aclrtSynchronizeStream(cann_ctx->stream()));
1620
1944
  }
1621
1945
 
@@ -1674,58 +1998,86 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
1674
1998
  switch (op->op) {
1675
1999
  case GGML_OP_UNARY:
1676
2000
  switch (ggml_get_unary_op(op)) {
2001
+ case GGML_UNARY_OP_ABS:
2002
+ case GGML_UNARY_OP_NEG:
1677
2003
  case GGML_UNARY_OP_GELU:
1678
2004
  case GGML_UNARY_OP_SILU:
1679
2005
  case GGML_UNARY_OP_RELU:
2006
+ case GGML_UNARY_OP_SIGMOID:
1680
2007
  case GGML_UNARY_OP_HARDSIGMOID:
1681
2008
  case GGML_UNARY_OP_HARDSWISH:
1682
2009
  case GGML_UNARY_OP_GELU_QUICK:
1683
2010
  case GGML_UNARY_OP_TANH:
2011
+ case GGML_UNARY_OP_EXP:
2012
+ case GGML_UNARY_OP_ELU:
2013
+ case GGML_UNARY_OP_SGN:
2014
+ case GGML_UNARY_OP_STEP:
1684
2015
  return true;
1685
2016
  default:
1686
2017
  return false;
1687
2018
  }
1688
2019
  case GGML_OP_MUL_MAT: {
1689
2020
  switch (op->src[0]->type) {
1690
- case GGML_TYPE_Q8_0:
1691
- // Current groupsize should not be greater than k-1 in
1692
- // aclnnWeightQuantBatchMatmulV2GetWorkspaceSize
1693
- if (op->src[0]->ne[0] <= QK8_0) {
1694
- return false;
1695
- }
1696
2021
  case GGML_TYPE_F16:
1697
2022
  case GGML_TYPE_F32:
1698
- case GGML_TYPE_Q4_0:
1699
2023
  return true;
2024
+ case GGML_TYPE_Q8_0:
2025
+ case GGML_TYPE_Q4_0:
2026
+ #ifdef ASCEND_310P
2027
+ // Q4 && Q8 per group is not suppor on 310p device
2028
+ return false;
2029
+ #endif
2030
+ // only support contiguous for quantized types.
2031
+ return ggml_is_contiguous(op->src[0]) &&
2032
+ ggml_is_contiguous(op->src[1]);
1700
2033
  default:
1701
2034
  return false;
1702
2035
  }
1703
2036
  }
1704
2037
  case GGML_OP_MUL_MAT_ID:
1705
- return false;
1706
- // embedding
1707
- case GGML_OP_GET_ROWS: {
1708
2038
  switch (op->src[0]->type) {
1709
- case GGML_TYPE_F32:
1710
2039
  case GGML_TYPE_F16:
1711
- case GGML_TYPE_Q4_0:
1712
- case GGML_TYPE_Q8_0:
2040
+ case GGML_TYPE_F32:
1713
2041
  return true;
2042
+ case GGML_TYPE_Q8_0:
2043
+ case GGML_TYPE_Q4_0:
2044
+ #ifdef ASCEND_310P
2045
+ // Q4 && Q8 per group is not suppor on 310p device
2046
+ return false;
2047
+ #endif
2048
+ // only support contiguous for quantized types.
2049
+ return ggml_is_contiguous(op->src[0]) &&
2050
+ ggml_is_contiguous(op->src[1]);
1714
2051
  default:
1715
2052
  return false;
1716
2053
  }
1717
- } break;
1718
- case GGML_OP_CPY: {
1719
- switch (op->type) {
2054
+ // embedding
2055
+ case GGML_OP_GET_ROWS: {
2056
+ switch (op->src[0]->type) {
1720
2057
  case GGML_TYPE_F32:
1721
2058
  case GGML_TYPE_F16:
1722
2059
  case GGML_TYPE_Q8_0:
1723
- case GGML_TYPE_Q4_0:
1724
2060
  return true;
1725
2061
  default:
1726
2062
  return false;
1727
2063
  }
1728
- }
2064
+ } break;
2065
+ case GGML_OP_CPY: {
2066
+ ggml_tensor *src = op->src[0];
2067
+ if ((op->type != GGML_TYPE_F32 && op->type != GGML_TYPE_F16) ||
2068
+ (src->type != GGML_TYPE_F32 &&
2069
+ src->type != GGML_TYPE_F16)) {
2070
+ // only support F32 and F16.
2071
+ return false;
2072
+ }
2073
+
2074
+ if (!ggml_are_same_shape(op, src) && !ggml_is_contiguous(op)) {
2075
+ // unsupport dst is not contiguous.
2076
+ return false;
2077
+ }
2078
+
2079
+ return true;
2080
+ } break;
1729
2081
  case GGML_OP_CONT: {
1730
2082
  // TODO: support GGML_TYPE_BF16
1731
2083
  switch (op->src[0]->type) {
@@ -1738,13 +2090,14 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
1738
2090
  }
1739
2091
  case GGML_OP_ROPE: {
1740
2092
  // TODO: with ops-test v == 1
1741
- float * ext_factor = (float*)((int32_t*)op->op_params + 7);
2093
+ float ext_factor = 0.0f;
2094
+ memcpy(&ext_factor, (const float *) op->op_params + 7, sizeof(float));
1742
2095
  // TODO: n_dims <= ne0
1743
2096
  if (op->src[0]->ne[0] != op->op_params[1]) {
1744
2097
  return false;
1745
2098
  }
1746
2099
  // TODO: ext_factor != 0
1747
- if (*ext_factor != 0) {
2100
+ if (ext_factor != 0) {
1748
2101
  return false;
1749
2102
  }
1750
2103
 
@@ -1756,6 +2109,9 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
1756
2109
  return false;
1757
2110
  }
1758
2111
 
2112
+ if(!ggml_is_contiguous(op->src[0])){
2113
+ return false;
2114
+ }
1759
2115
  return true;
1760
2116
  }
1761
2117
  case GGML_OP_UPSCALE: {
@@ -1764,11 +2120,31 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
1764
2120
  if (op->src[0]->ne[2] * op->ne[3] != op->src[0]->ne[3] * op->ne[2]) {
1765
2121
  return false;
1766
2122
  }
2123
+ if (op->op_params[0] != GGML_SCALE_MODE_NEAREST) {
2124
+ return false;
2125
+ }
1767
2126
  return true;
1768
2127
  }
2128
+ case GGML_OP_POOL_2D: {
2129
+ const int32_t * opts = (const int32_t *) op->op_params;
2130
+ #ifdef ASCEND_310P
2131
+ enum ggml_op_pool opt = static_cast<ggml_op_pool>(opts[0]);
2132
+ if(opt == GGML_OP_POOL_MAX){
2133
+ return false;
2134
+ }
2135
+ #endif
2136
+ const int k0 = opts[1];
2137
+ const int k1 = opts[2];
2138
+ const int p0 = opts[5];
2139
+ const int p1 = opts[6];
2140
+ // value of paddingH should be at most half of kernelH
2141
+ // value of paddingW should be at most half of kernelW
2142
+ return (p0 <= (k0 / 2)) && (p1 <= (k1 / 2));
2143
+ }
2144
+ case GGML_OP_SUM:
2145
+ case GGML_OP_DUP:
1769
2146
  case GGML_OP_IM2COL:
1770
2147
  case GGML_OP_CONCAT:
1771
- case GGML_OP_DUP:
1772
2148
  case GGML_OP_REPEAT:
1773
2149
  case GGML_OP_NONE:
1774
2150
  case GGML_OP_RESHAPE:
@@ -1777,15 +2153,17 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
1777
2153
  case GGML_OP_TRANSPOSE:
1778
2154
  case GGML_OP_NORM:
1779
2155
  case GGML_OP_ADD:
2156
+ case GGML_OP_ADD1:
2157
+ case GGML_OP_SUB:
1780
2158
  case GGML_OP_MUL:
1781
2159
  case GGML_OP_DIV:
1782
2160
  case GGML_OP_RMS_NORM:
1783
2161
  case GGML_OP_SCALE:
1784
2162
  case GGML_OP_SQR:
2163
+ case GGML_OP_SQRT:
1785
2164
  case GGML_OP_CLAMP:
1786
2165
  case GGML_OP_DIAG_MASK_INF:
1787
2166
  case GGML_OP_SOFT_MAX:
1788
- case GGML_OP_POOL_2D:
1789
2167
  case GGML_OP_SUM_ROWS:
1790
2168
  case GGML_OP_ARGSORT:
1791
2169
  case GGML_OP_ACC:
@@ -1794,7 +2172,47 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
1794
2172
  case GGML_OP_ARANGE:
1795
2173
  case GGML_OP_TIMESTEP_EMBEDDING:
1796
2174
  case GGML_OP_LEAKY_RELU:
2175
+ case GGML_OP_ARGMAX:
2176
+ case GGML_OP_COS:
2177
+ case GGML_OP_SIN:
2178
+ case GGML_OP_CONV_TRANSPOSE_1D:
2179
+ case GGML_OP_LOG:
2180
+ case GGML_OP_MEAN:
2181
+ case GGML_OP_PAD_REFLECT_1D:
2182
+ case GGML_OP_COUNT_EQUAL:
2183
+ return true;
2184
+ case GGML_OP_FLASH_ATTN_EXT:{
2185
+ // derived from [ggml-cuda.cu]
2186
+ if(op->src[1]->type != GGML_TYPE_F16 || op->src[2]->type != GGML_TYPE_F16){
2187
+ return false;
2188
+ }
2189
+ if(op->src[1]->type != GGML_TYPE_F16 && op->src[1]->type != GGML_TYPE_F32 && op->src[1]->type != GGML_TYPE_BF16){
2190
+ return false;
2191
+ }
2192
+ if(op->type != GGML_TYPE_F16 && op->type != GGML_TYPE_F32 && op->type != GGML_TYPE_BF16){
2193
+ return false;
2194
+ }
2195
+ if (op->src[1]->ne[0] != op->src[2]->ne[0]) {
2196
+ // different head sizes of K and V are not supported yet
2197
+ return false;
2198
+ }
2199
+ if (op->src[0]->ne[0] == 192) {
2200
+ return false;
2201
+ }
2202
+ if (op->src[0]->ne[0] == 576) {
2203
+ // DeepSeek MLA
2204
+ return false;
2205
+ }
2206
+ if (op->src[0]->ne[3] != 1) {
2207
+ return false;
2208
+ }
2209
+ float logitSoftcap = 0.0f;
2210
+ memcpy(&logitSoftcap, (float*)op->op_params + 2, sizeof(float));
2211
+ if(logitSoftcap != 0.0f) {
2212
+ return false;
2213
+ }
1797
2214
  return true;
2215
+ }
1798
2216
  default:
1799
2217
  return false;
1800
2218
  }