whispercpp 1.3.1 → 1.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (857):
  1. checksums.yaml +4 -4
  2. data/.gitignore +7 -3
  3. data/README.md +161 -43
  4. data/Rakefile +45 -13
  5. data/ext/.gitignore +4 -8
  6. data/ext/dependencies.rb +73 -0
  7. data/ext/extconf.rb +21 -198
  8. data/ext/options.rb +85 -0
  9. data/ext/ruby_whisper.c +177 -0
  10. data/ext/ruby_whisper.h +17 -2
  11. data/ext/ruby_whisper_context.c +672 -0
  12. data/ext/ruby_whisper_error.c +52 -0
  13. data/ext/ruby_whisper_model.c +232 -0
  14. data/ext/ruby_whisper_params.c +1303 -0
  15. data/ext/ruby_whisper_segment.c +220 -0
  16. data/ext/ruby_whisper_transcribe.cpp +93 -0
  17. data/ext/ruby_whisper_vad_params.c +288 -0
  18. data/ext/sources/CMakeGraphVizOptions.cmake +8 -0
  19. data/ext/sources/CMakeLists.txt +255 -0
  20. data/ext/sources/bindings/javascript/CMakeLists.txt +41 -0
  21. data/ext/sources/bindings/javascript/emscripten.cpp +93 -0
  22. data/ext/sources/bindings/javascript/libwhisper.worker.js +1 -0
  23. data/ext/sources/bindings/javascript/package-tmpl.json +26 -0
  24. data/ext/sources/bindings/javascript/package.json +26 -0
  25. data/ext/sources/bindings/javascript/whisper.js +19 -0
  26. data/ext/sources/build-xcframework.sh +547 -0
  27. data/ext/sources/cmake/DefaultTargetOptions.cmake +16 -0
  28. data/ext/sources/cmake/FindFFmpeg.cmake +163 -0
  29. data/ext/sources/cmake/build-info.cmake +60 -0
  30. data/ext/sources/cmake/git-vars.cmake +22 -0
  31. data/ext/sources/cmake/whisper-config.cmake.in +65 -0
  32. data/ext/sources/cmake/whisper.pc.in +10 -0
  33. data/ext/sources/examples/CMakeLists.txt +124 -0
  34. data/ext/sources/examples/addon.node/CMakeLists.txt +31 -0
  35. data/ext/sources/examples/addon.node/__test__/whisper.spec.js +133 -0
  36. data/ext/sources/examples/addon.node/addon.cpp +557 -0
  37. data/ext/sources/examples/addon.node/index.js +57 -0
  38. data/ext/sources/examples/addon.node/package.json +16 -0
  39. data/ext/sources/examples/addon.node/vad-example.js +132 -0
  40. data/ext/sources/examples/bench/CMakeLists.txt +8 -0
  41. data/ext/sources/examples/bench/bench.cpp +176 -0
  42. data/ext/sources/examples/bench.wasm/CMakeLists.txt +49 -0
  43. data/ext/sources/examples/bench.wasm/emscripten.cpp +87 -0
  44. data/ext/sources/examples/bench.wasm/index-tmpl.html +284 -0
  45. data/ext/sources/examples/cli/CMakeLists.txt +8 -0
  46. data/ext/sources/examples/cli/cli.cpp +1295 -0
  47. data/ext/sources/examples/coi-serviceworker.js +146 -0
  48. data/ext/sources/examples/command/CMakeLists.txt +10 -0
  49. data/ext/sources/examples/command/command.cpp +800 -0
  50. data/ext/sources/examples/command/commands.txt +9 -0
  51. data/ext/sources/examples/command.wasm/CMakeLists.txt +50 -0
  52. data/ext/sources/examples/command.wasm/emscripten.cpp +327 -0
  53. data/ext/sources/examples/command.wasm/index-tmpl.html +414 -0
  54. data/ext/sources/examples/common-ggml.cpp +238 -0
  55. data/ext/sources/examples/common-ggml.h +18 -0
  56. data/ext/sources/examples/common-sdl.cpp +227 -0
  57. data/ext/sources/examples/common-sdl.h +49 -0
  58. data/ext/sources/examples/common-whisper.cpp +175 -0
  59. data/ext/sources/examples/common-whisper.h +24 -0
  60. data/ext/sources/examples/common.cpp +675 -0
  61. data/ext/sources/examples/common.h +322 -0
  62. data/ext/sources/examples/deprecation-warning/CMakeLists.txt +6 -0
  63. data/ext/sources/examples/deprecation-warning/deprecation-warning.cpp +38 -0
  64. data/ext/sources/examples/ffmpeg-transcode.cpp +368 -0
  65. data/ext/sources/examples/generate-karaoke.sh +57 -0
  66. data/ext/sources/examples/grammar-parser.cpp +423 -0
  67. data/ext/sources/examples/grammar-parser.h +29 -0
  68. data/ext/sources/examples/helpers.js +191 -0
  69. data/ext/sources/examples/json.hpp +24596 -0
  70. data/ext/sources/examples/livestream.sh +112 -0
  71. data/ext/sources/examples/lsp/CMakeLists.txt +9 -0
  72. data/ext/sources/examples/lsp/lsp.cpp +469 -0
  73. data/ext/sources/examples/lsp/whisper.vim +362 -0
  74. data/ext/sources/examples/miniaudio.h +93468 -0
  75. data/ext/sources/examples/python/test_whisper_processor.py +7 -0
  76. data/ext/sources/examples/python/whisper_processor.py +54 -0
  77. data/ext/sources/examples/quantize/CMakeLists.txt +6 -0
  78. data/ext/sources/examples/quantize/quantize.cpp +226 -0
  79. data/ext/sources/examples/server/CMakeLists.txt +15 -0
  80. data/ext/sources/examples/server/bench.js +29 -0
  81. data/ext/sources/examples/server/httplib.h +10497 -0
  82. data/ext/sources/examples/server/server.cpp +1238 -0
  83. data/ext/sources/examples/server.py +115 -0
  84. data/ext/sources/examples/stb_vorbis.c +5584 -0
  85. data/ext/sources/examples/stream/CMakeLists.txt +10 -0
  86. data/ext/sources/examples/stream/stream.cpp +435 -0
  87. data/ext/sources/examples/stream.wasm/CMakeLists.txt +49 -0
  88. data/ext/sources/examples/stream.wasm/emscripten.cpp +216 -0
  89. data/ext/sources/examples/stream.wasm/index-tmpl.html +414 -0
  90. data/ext/sources/examples/sycl/CMakeLists.txt +9 -0
  91. data/ext/sources/examples/sycl/build.sh +22 -0
  92. data/ext/sources/examples/sycl/ls-sycl-device.cpp +11 -0
  93. data/ext/sources/examples/sycl/run-whisper.sh +17 -0
  94. data/ext/sources/examples/talk-llama/CMakeLists.txt +43 -0
  95. data/ext/sources/examples/talk-llama/eleven-labs.py +80 -0
  96. data/ext/sources/examples/talk-llama/llama-adapter.cpp +388 -0
  97. data/ext/sources/examples/talk-llama/llama-adapter.h +76 -0
  98. data/ext/sources/examples/talk-llama/llama-arch.cpp +1914 -0
  99. data/ext/sources/examples/talk-llama/llama-arch.h +464 -0
  100. data/ext/sources/examples/talk-llama/llama-batch.cpp +843 -0
  101. data/ext/sources/examples/talk-llama/llama-batch.h +147 -0
  102. data/ext/sources/examples/talk-llama/llama-chat.cpp +685 -0
  103. data/ext/sources/examples/talk-llama/llama-chat.h +59 -0
  104. data/ext/sources/examples/talk-llama/llama-context.cpp +2845 -0
  105. data/ext/sources/examples/talk-llama/llama-context.h +297 -0
  106. data/ext/sources/examples/talk-llama/llama-cparams.cpp +5 -0
  107. data/ext/sources/examples/talk-llama/llama-cparams.h +41 -0
  108. data/ext/sources/examples/talk-llama/llama-grammar.cpp +1229 -0
  109. data/ext/sources/examples/talk-llama/llama-grammar.h +173 -0
  110. data/ext/sources/examples/talk-llama/llama-graph.cpp +1693 -0
  111. data/ext/sources/examples/talk-llama/llama-graph.h +710 -0
  112. data/ext/sources/examples/talk-llama/llama-hparams.cpp +103 -0
  113. data/ext/sources/examples/talk-llama/llama-hparams.h +207 -0
  114. data/ext/sources/examples/talk-llama/llama-impl.cpp +167 -0
  115. data/ext/sources/examples/talk-llama/llama-impl.h +61 -0
  116. data/ext/sources/examples/talk-llama/llama-io.cpp +15 -0
  117. data/ext/sources/examples/talk-llama/llama-io.h +35 -0
  118. data/ext/sources/examples/talk-llama/llama-kv-cache-unified-iswa.cpp +279 -0
  119. data/ext/sources/examples/talk-llama/llama-kv-cache-unified-iswa.h +128 -0
  120. data/ext/sources/examples/talk-llama/llama-kv-cache-unified.cpp +1841 -0
  121. data/ext/sources/examples/talk-llama/llama-kv-cache-unified.h +303 -0
  122. data/ext/sources/examples/talk-llama/llama-kv-cache.h +44 -0
  123. data/ext/sources/examples/talk-llama/llama-kv-cells.h +439 -0
  124. data/ext/sources/examples/talk-llama/llama-memory-hybrid.cpp +246 -0
  125. data/ext/sources/examples/talk-llama/llama-memory-hybrid.h +138 -0
  126. data/ext/sources/examples/talk-llama/llama-memory-recurrent.cpp +1125 -0
  127. data/ext/sources/examples/talk-llama/llama-memory-recurrent.h +183 -0
  128. data/ext/sources/examples/talk-llama/llama-memory.cpp +59 -0
  129. data/ext/sources/examples/talk-llama/llama-memory.h +116 -0
  130. data/ext/sources/examples/talk-llama/llama-mmap.cpp +600 -0
  131. data/ext/sources/examples/talk-llama/llama-mmap.h +68 -0
  132. data/ext/sources/examples/talk-llama/llama-model-loader.cpp +1163 -0
  133. data/ext/sources/examples/talk-llama/llama-model-loader.h +169 -0
  134. data/ext/sources/examples/talk-llama/llama-model-saver.cpp +282 -0
  135. data/ext/sources/examples/talk-llama/llama-model-saver.h +37 -0
  136. data/ext/sources/examples/talk-llama/llama-model.cpp +15114 -0
  137. data/ext/sources/examples/talk-llama/llama-model.h +452 -0
  138. data/ext/sources/examples/talk-llama/llama-quant.cpp +1049 -0
  139. data/ext/sources/examples/talk-llama/llama-quant.h +1 -0
  140. data/ext/sources/examples/talk-llama/llama-sampling.cpp +2575 -0
  141. data/ext/sources/examples/talk-llama/llama-sampling.h +32 -0
  142. data/ext/sources/examples/talk-llama/llama-vocab.cpp +3377 -0
  143. data/ext/sources/examples/talk-llama/llama-vocab.h +132 -0
  144. data/ext/sources/examples/talk-llama/llama.cpp +358 -0
  145. data/ext/sources/examples/talk-llama/llama.h +1484 -0
  146. data/ext/sources/examples/talk-llama/prompts/talk-alpaca.txt +23 -0
  147. data/ext/sources/examples/talk-llama/speak +40 -0
  148. data/ext/sources/examples/talk-llama/speak.bat +1 -0
  149. data/ext/sources/examples/talk-llama/speak.ps1 +14 -0
  150. data/ext/sources/examples/talk-llama/talk-llama.cpp +810 -0
  151. data/ext/sources/examples/talk-llama/unicode-data.cpp +7034 -0
  152. data/ext/sources/examples/talk-llama/unicode-data.h +20 -0
  153. data/ext/sources/examples/talk-llama/unicode.cpp +854 -0
  154. data/ext/sources/examples/talk-llama/unicode.h +66 -0
  155. data/ext/sources/examples/vad-speech-segments/CMakeLists.txt +8 -0
  156. data/ext/sources/examples/vad-speech-segments/speech.cpp +149 -0
  157. data/ext/sources/examples/wchess/CMakeLists.txt +10 -0
  158. data/ext/sources/examples/wchess/libwchess/CMakeLists.txt +19 -0
  159. data/ext/sources/examples/wchess/libwchess/Chessboard.cpp +803 -0
  160. data/ext/sources/examples/wchess/libwchess/Chessboard.h +33 -0
  161. data/ext/sources/examples/wchess/libwchess/WChess.cpp +193 -0
  162. data/ext/sources/examples/wchess/libwchess/WChess.h +63 -0
  163. data/ext/sources/examples/wchess/libwchess/test-chessboard.cpp +117 -0
  164. data/ext/sources/examples/wchess/wchess.cmd/CMakeLists.txt +8 -0
  165. data/ext/sources/examples/wchess/wchess.cmd/wchess.cmd.cpp +251 -0
  166. data/ext/sources/examples/whisper.wasm/CMakeLists.txt +50 -0
  167. data/ext/sources/examples/whisper.wasm/emscripten.cpp +118 -0
  168. data/ext/sources/examples/whisper.wasm/index-tmpl.html +658 -0
  169. data/ext/sources/ggml/CMakeLists.txt +435 -0
  170. data/ext/sources/ggml/cmake/BuildTypes.cmake +54 -0
  171. data/ext/sources/ggml/cmake/GitVars.cmake +22 -0
  172. data/ext/sources/ggml/cmake/common.cmake +50 -0
  173. data/ext/sources/ggml/cmake/ggml-config.cmake.in +152 -0
  174. data/ext/{ggml → sources/ggml}/include/ggml-alloc.h +1 -1
  175. data/ext/{ggml → sources/ggml}/include/ggml-backend.h +10 -8
  176. data/ext/{ggml → sources/ggml}/include/ggml-cpp.h +2 -1
  177. data/ext/{ggml → sources/ggml}/include/ggml-cpu.h +11 -1
  178. data/ext/{ggml → sources/ggml}/include/ggml-metal.h +1 -1
  179. data/ext/{ggml → sources/ggml}/include/ggml-opt.h +49 -28
  180. data/ext/{ggml → sources/ggml}/include/ggml-rpc.h +6 -1
  181. data/ext/{ggml → sources/ggml}/include/ggml-vulkan.h +0 -2
  182. data/ext/{ggml → sources/ggml}/include/ggml.h +325 -269
  183. data/ext/sources/ggml/include/gguf.h +202 -0
  184. data/ext/sources/ggml/src/CMakeLists.txt +404 -0
  185. data/ext/{ggml → sources/ggml}/src/ggml-alloc.c +34 -29
  186. data/ext/sources/ggml/src/ggml-amx/CMakeLists.txt +107 -0
  187. data/ext/{ggml → sources/ggml}/src/ggml-backend-impl.h +1 -2
  188. data/ext/{ggml → sources/ggml}/src/ggml-backend-reg.cpp +92 -53
  189. data/ext/{ggml → sources/ggml}/src/ggml-backend.cpp +69 -34
  190. data/ext/sources/ggml/src/ggml-blas/CMakeLists.txt +87 -0
  191. data/ext/sources/ggml/src/ggml-cann/CMakeLists.txt +75 -0
  192. data/ext/sources/ggml/src/ggml-cann/Doxyfile +2579 -0
  193. data/ext/{ggml → sources/ggml}/src/ggml-cann/acl_tensor.cpp +10 -4
  194. data/ext/{ggml → sources/ggml}/src/ggml-cann/acl_tensor.h +5 -5
  195. data/ext/{ggml → sources/ggml}/src/ggml-cann/aclnn_ops.cpp +1272 -1506
  196. data/ext/sources/ggml/src/ggml-cann/aclnn_ops.h +1125 -0
  197. data/ext/{ggml → sources/ggml}/src/ggml-cann/common.h +140 -1
  198. data/ext/{ggml → sources/ggml}/src/ggml-cann/ggml-cann.cpp +588 -146
  199. data/ext/sources/ggml/src/ggml-cann/kernels/CMakeLists.txt +30 -0
  200. data/ext/{ggml → sources/ggml}/src/ggml-cann/kernels/dup.cpp +3 -5
  201. data/ext/{ggml → sources/ggml}/src/ggml-common.h +16 -8
  202. data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +597 -0
  203. data/ext/{ggml → sources/ggml}/src/ggml-cpu/amx/amx.cpp +3 -2
  204. data/ext/{ggml → sources/ggml}/src/ggml-cpu/amx/mmq.cpp +11 -10
  205. data/ext/sources/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
  206. data/ext/sources/ggml/src/ggml-cpu/arch/arm/quants.c +4114 -0
  207. data/ext/sources/ggml/src/ggml-cpu/arch/arm/repack.cpp +2163 -0
  208. data/ext/sources/ggml/src/ggml-cpu/arch/loongarch/quants.c +2639 -0
  209. data/ext/sources/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
  210. data/ext/sources/ggml/src/ggml-cpu/arch/powerpc/quants.c +2732 -0
  211. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/quants.c +2069 -0
  212. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/repack.cpp +397 -0
  213. data/ext/sources/ggml/src/ggml-cpu/arch/s390/quants.c +1300 -0
  214. data/ext/sources/ggml/src/ggml-cpu/arch/wasm/quants.c +1481 -0
  215. data/ext/{ggml/src/ggml-cpu/cpu-feats-x86.cpp → sources/ggml/src/ggml-cpu/arch/x86/cpu-feats.cpp} +5 -1
  216. data/ext/sources/ggml/src/ggml-cpu/arch/x86/quants.c +4311 -0
  217. data/ext/sources/ggml/src/ggml-cpu/arch/x86/repack.cpp +3285 -0
  218. data/ext/sources/ggml/src/ggml-cpu/arch-fallback.h +184 -0
  219. data/ext/sources/ggml/src/ggml-cpu/binary-ops.cpp +158 -0
  220. data/ext/sources/ggml/src/ggml-cpu/binary-ops.h +16 -0
  221. data/ext/sources/ggml/src/ggml-cpu/cmake/FindSIMD.cmake +100 -0
  222. data/ext/sources/ggml/src/ggml-cpu/common.h +73 -0
  223. data/ext/{ggml → sources/ggml}/src/ggml-cpu/ggml-cpu-impl.h +172 -41
  224. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +3551 -0
  225. data/ext/{ggml → sources/ggml}/src/ggml-cpu/ggml-cpu.cpp +78 -25
  226. data/ext/{ggml/src/ggml-cpu/ggml-cpu-hbm.cpp → sources/ggml/src/ggml-cpu/hbm.cpp} +1 -1
  227. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.cpp +337 -0
  228. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.h +95 -0
  229. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +482 -0
  230. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.h +17 -0
  231. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +3594 -0
  232. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.h +19 -0
  233. data/ext/sources/ggml/src/ggml-cpu/ops.cpp +9786 -0
  234. data/ext/sources/ggml/src/ggml-cpu/ops.h +118 -0
  235. data/ext/sources/ggml/src/ggml-cpu/quants.c +1158 -0
  236. data/ext/{ggml/src/ggml-cpu/ggml-cpu-quants.h → sources/ggml/src/ggml-cpu/quants.h} +26 -0
  237. data/ext/sources/ggml/src/ggml-cpu/repack.cpp +1571 -0
  238. data/ext/sources/ggml/src/ggml-cpu/repack.h +98 -0
  239. data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +1184 -0
  240. data/ext/{ggml/src/ggml-cpu/ggml-cpu-traits.cpp → sources/ggml/src/ggml-cpu/traits.cpp} +1 -1
  241. data/ext/sources/ggml/src/ggml-cpu/unary-ops.cpp +186 -0
  242. data/ext/sources/ggml/src/ggml-cpu/unary-ops.h +28 -0
  243. data/ext/sources/ggml/src/ggml-cpu/vec.cpp +345 -0
  244. data/ext/sources/ggml/src/ggml-cpu/vec.h +1027 -0
  245. data/ext/sources/ggml/src/ggml-cuda/CMakeLists.txt +184 -0
  246. data/ext/sources/ggml/src/ggml-cuda/acc.cu +61 -0
  247. data/ext/sources/ggml/src/ggml-cuda/acc.cuh +5 -0
  248. data/ext/sources/ggml/src/ggml-cuda/arange.cu +34 -0
  249. data/ext/sources/ggml/src/ggml-cuda/arange.cuh +5 -0
  250. data/ext/sources/ggml/src/ggml-cuda/argmax.cu +91 -0
  251. data/ext/sources/ggml/src/ggml-cuda/argmax.cuh +3 -0
  252. data/ext/sources/ggml/src/ggml-cuda/argsort.cu +104 -0
  253. data/ext/sources/ggml/src/ggml-cuda/argsort.cuh +3 -0
  254. data/ext/sources/ggml/src/ggml-cuda/binbcast.cu +363 -0
  255. data/ext/sources/ggml/src/ggml-cuda/binbcast.cuh +9 -0
  256. data/ext/sources/ggml/src/ggml-cuda/clamp.cu +45 -0
  257. data/ext/sources/ggml/src/ggml-cuda/clamp.cuh +5 -0
  258. data/ext/sources/ggml/src/ggml-cuda/common.cuh +851 -0
  259. data/ext/sources/ggml/src/ggml-cuda/concat.cu +221 -0
  260. data/ext/sources/ggml/src/ggml-cuda/concat.cuh +5 -0
  261. data/ext/sources/ggml/src/ggml-cuda/conv-transpose-1d.cu +89 -0
  262. data/ext/sources/ggml/src/ggml-cuda/conv-transpose-1d.cuh +5 -0
  263. data/ext/sources/ggml/src/ggml-cuda/conv2d-dw.cu +161 -0
  264. data/ext/sources/ggml/src/ggml-cuda/conv2d-dw.cuh +5 -0
  265. data/ext/sources/ggml/src/ggml-cuda/conv2d-transpose.cu +91 -0
  266. data/ext/sources/ggml/src/ggml-cuda/conv2d-transpose.cuh +4 -0
  267. data/ext/sources/ggml/src/ggml-cuda/convert.cu +752 -0
  268. data/ext/sources/ggml/src/ggml-cuda/convert.cuh +31 -0
  269. data/ext/sources/ggml/src/ggml-cuda/count-equal.cu +64 -0
  270. data/ext/sources/ggml/src/ggml-cuda/count-equal.cuh +5 -0
  271. data/ext/sources/ggml/src/ggml-cuda/cp-async.cuh +57 -0
  272. data/ext/sources/ggml/src/ggml-cuda/cpy.cu +705 -0
  273. data/ext/sources/ggml/src/ggml-cuda/cpy.cuh +11 -0
  274. data/ext/sources/ggml/src/ggml-cuda/cross-entropy-loss.cu +189 -0
  275. data/ext/sources/ggml/src/ggml-cuda/cross-entropy-loss.cuh +7 -0
  276. data/ext/sources/ggml/src/ggml-cuda/dequantize.cuh +103 -0
  277. data/ext/sources/ggml/src/ggml-cuda/diagmask.cu +40 -0
  278. data/ext/sources/ggml/src/ggml-cuda/diagmask.cuh +5 -0
  279. data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +881 -0
  280. data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +1474 -0
  281. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f16.cu +357 -0
  282. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f16.cuh +3 -0
  283. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f32.cu +365 -0
  284. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f32.cuh +3 -0
  285. data/ext/sources/ggml/src/ggml-cuda/fattn-vec-f16.cuh +482 -0
  286. data/ext/sources/ggml/src/ggml-cuda/fattn-vec-f32.cuh +472 -0
  287. data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +638 -0
  288. data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cuh +3 -0
  289. data/ext/sources/ggml/src/ggml-cuda/fattn.cu +346 -0
  290. data/ext/sources/ggml/src/ggml-cuda/fattn.cuh +3 -0
  291. data/ext/sources/ggml/src/ggml-cuda/getrows.cu +275 -0
  292. data/ext/sources/ggml/src/ggml-cuda/getrows.cuh +15 -0
  293. data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +3647 -0
  294. data/ext/sources/ggml/src/ggml-cuda/gla.cu +93 -0
  295. data/ext/sources/ggml/src/ggml-cuda/gla.cuh +3 -0
  296. data/ext/sources/ggml/src/ggml-cuda/im2col.cu +103 -0
  297. data/ext/sources/ggml/src/ggml-cuda/im2col.cuh +5 -0
  298. data/ext/sources/ggml/src/ggml-cuda/mean.cu +19 -0
  299. data/ext/sources/ggml/src/ggml-cuda/mean.cuh +3 -0
  300. data/ext/sources/ggml/src/ggml-cuda/mma.cuh +396 -0
  301. data/ext/sources/ggml/src/ggml-cuda/mmq.cu +324 -0
  302. data/ext/sources/ggml/src/ggml-cuda/mmq.cuh +3217 -0
  303. data/ext/sources/ggml/src/ggml-cuda/mmv.cu +506 -0
  304. data/ext/sources/ggml/src/ggml-cuda/mmv.cuh +11 -0
  305. data/ext/sources/ggml/src/ggml-cuda/mmvq.cu +595 -0
  306. data/ext/sources/ggml/src/ggml-cuda/mmvq.cuh +12 -0
  307. data/ext/sources/ggml/src/ggml-cuda/norm.cu +458 -0
  308. data/ext/sources/ggml/src/ggml-cuda/norm.cuh +11 -0
  309. data/ext/sources/ggml/src/ggml-cuda/opt-step-adamw.cu +78 -0
  310. data/ext/sources/ggml/src/ggml-cuda/opt-step-adamw.cuh +5 -0
  311. data/ext/sources/ggml/src/ggml-cuda/out-prod.cu +68 -0
  312. data/ext/sources/ggml/src/ggml-cuda/out-prod.cuh +3 -0
  313. data/ext/sources/ggml/src/ggml-cuda/pad.cu +49 -0
  314. data/ext/sources/ggml/src/ggml-cuda/pad.cuh +5 -0
  315. data/ext/sources/ggml/src/ggml-cuda/pool2d.cu +94 -0
  316. data/ext/sources/ggml/src/ggml-cuda/pool2d.cuh +5 -0
  317. data/ext/sources/ggml/src/ggml-cuda/quantize.cu +190 -0
  318. data/ext/sources/ggml/src/ggml-cuda/quantize.cuh +27 -0
  319. data/ext/sources/ggml/src/ggml-cuda/rope.cu +456 -0
  320. data/ext/sources/ggml/src/ggml-cuda/rope.cuh +7 -0
  321. data/ext/sources/ggml/src/ggml-cuda/scale.cu +31 -0
  322. data/ext/sources/ggml/src/ggml-cuda/scale.cuh +5 -0
  323. data/ext/sources/ggml/src/ggml-cuda/softmax.cu +283 -0
  324. data/ext/sources/ggml/src/ggml-cuda/softmax.cuh +7 -0
  325. data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cu +148 -0
  326. data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cuh +3 -0
  327. data/ext/sources/ggml/src/ggml-cuda/ssm-scan.cu +155 -0
  328. data/ext/sources/ggml/src/ggml-cuda/ssm-scan.cuh +3 -0
  329. data/ext/sources/ggml/src/ggml-cuda/sum.cu +45 -0
  330. data/ext/sources/ggml/src/ggml-cuda/sum.cuh +5 -0
  331. data/ext/sources/ggml/src/ggml-cuda/sumrows.cu +26 -0
  332. data/ext/sources/ggml/src/ggml-cuda/sumrows.cuh +4 -0
  333. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_16.cu +5 -0
  334. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_8.cu +10 -0
  335. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_1.cu +10 -0
  336. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_2.cu +10 -0
  337. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_4.cu +10 -0
  338. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_16.cu +5 -0
  339. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_4.cu +10 -0
  340. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_8.cu +10 -0
  341. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_32-ncols2_1.cu +10 -0
  342. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_32-ncols2_2.cu +10 -0
  343. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_16.cu +5 -0
  344. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_2.cu +10 -0
  345. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_4.cu +10 -0
  346. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_8.cu +10 -0
  347. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_64-ncols2_1.cu +10 -0
  348. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_1.cu +10 -0
  349. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_2.cu +10 -0
  350. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_4.cu +10 -0
  351. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_8.cu +10 -0
  352. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +5 -0
  353. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +5 -0
  354. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +5 -0
  355. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +5 -0
  356. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +5 -0
  357. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +5 -0
  358. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +5 -0
  359. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +5 -0
  360. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +5 -0
  361. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +5 -0
  362. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +5 -0
  363. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +5 -0
  364. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +5 -0
  365. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +5 -0
  366. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +5 -0
  367. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +5 -0
  368. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +5 -0
  369. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +5 -0
  370. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +5 -0
  371. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +5 -0
  372. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +5 -0
  373. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +5 -0
  374. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +5 -0
  375. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +5 -0
  376. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +5 -0
  377. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +5 -0
  378. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +5 -0
  379. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +5 -0
  380. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +5 -0
  381. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +5 -0
  382. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +5 -0
  383. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +5 -0
  384. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +5 -0
  385. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +5 -0
  386. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +5 -0
  387. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +5 -0
  388. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +5 -0
  389. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +5 -0
  390. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +5 -0
  391. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +5 -0
  392. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +5 -0
  393. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +5 -0
  394. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +5 -0
  395. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +5 -0
  396. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +5 -0
  397. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +5 -0
  398. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +5 -0
  399. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +5 -0
  400. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +5 -0
  401. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +5 -0
  402. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +5 -0
  403. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +5 -0
  404. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +5 -0
  405. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +5 -0
  406. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +5 -0
  407. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +5 -0
  408. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +5 -0
  409. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +5 -0
  410. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +5 -0
  411. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +5 -0
  412. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +5 -0
  413. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +5 -0
  414. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +5 -0
  415. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +5 -0
  416. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +5 -0
  417. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +5 -0
  418. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +5 -0
  419. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +5 -0
  420. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +5 -0
  421. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +5 -0
  422. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +5 -0
  423. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +5 -0
  424. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +5 -0
  425. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +5 -0
  426. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +5 -0
  427. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +5 -0
  428. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +5 -0
  429. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +5 -0
  430. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +5 -0
  431. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +5 -0
  432. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +5 -0
  433. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +5 -0
  434. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +5 -0
  435. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +5 -0
  436. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +5 -0
  437. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +5 -0
  438. data/ext/sources/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +78 -0
  439. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-iq1_s.cu +5 -0
  440. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_s.cu +5 -0
  441. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_xs.cu +5 -0
  442. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_xxs.cu +5 -0
  443. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-iq3_s.cu +5 -0
  444. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-iq3_xxs.cu +5 -0
  445. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-iq4_nl.cu +5 -0
  446. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-iq4_xs.cu +5 -0
  447. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q2_k.cu +5 -0
  448. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q3_k.cu +5 -0
  449. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q4_0.cu +5 -0
  450. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q4_1.cu +5 -0
  451. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q4_k.cu +5 -0
  452. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q5_0.cu +5 -0
  453. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q5_1.cu +5 -0
  454. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q5_k.cu +5 -0
  455. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q6_k.cu +5 -0
  456. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q8_0.cu +5 -0
  457. data/ext/sources/ggml/src/ggml-cuda/tsembd.cu +47 -0
  458. data/ext/sources/ggml/src/ggml-cuda/tsembd.cuh +5 -0
  459. data/ext/sources/ggml/src/ggml-cuda/unary.cu +378 -0
  460. data/ext/sources/ggml/src/ggml-cuda/unary.cuh +66 -0
  461. data/ext/sources/ggml/src/ggml-cuda/upscale.cu +51 -0
  462. data/ext/sources/ggml/src/ggml-cuda/upscale.cuh +5 -0
  463. data/ext/sources/ggml/src/ggml-cuda/vecdotq.cuh +1135 -0
  464. data/ext/{ggml → sources/ggml}/src/ggml-cuda/vendors/cuda.h +1 -0
  465. data/ext/{ggml → sources/ggml}/src/ggml-cuda/vendors/hip.h +57 -0
  466. data/ext/{ggml → sources/ggml}/src/ggml-cuda/vendors/musa.h +7 -1
  467. data/ext/sources/ggml/src/ggml-cuda/wkv.cu +199 -0
  468. data/ext/sources/ggml/src/ggml-cuda/wkv.cuh +7 -0
  469. data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +135 -0
  470. data/ext/{ggml → sources/ggml}/src/ggml-impl.h +147 -158
  471. data/ext/sources/ggml/src/ggml-kompute/CMakeLists.txt +166 -0
  472. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/common.comp +112 -0
  473. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_add.comp +58 -0
  474. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_addrow.comp +25 -0
  475. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f16.comp +52 -0
  476. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f32.comp +52 -0
  477. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f16.comp +52 -0
  478. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f32.comp +52 -0
  479. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_diagmask.comp +30 -0
  480. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_gelu.comp +22 -0
  481. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows.comp +17 -0
  482. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f16.comp +31 -0
  483. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f32.comp +31 -0
  484. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_0.comp +38 -0
  485. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_1.comp +39 -0
  486. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q6_k.comp +44 -0
  487. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul.comp +52 -0
  488. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_f16.comp +69 -0
  489. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_mat_f32.comp +51 -0
  490. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_0.comp +33 -0
  491. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_1.comp +35 -0
  492. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_k.comp +140 -0
  493. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q6_k.comp +106 -0
  494. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q8_0.comp +73 -0
  495. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n.comp +52 -0
  496. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n_pre.comp +28 -0
  497. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_norm.comp +84 -0
  498. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_relu.comp +21 -0
  499. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rmsnorm.comp +53 -0
  500. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f16.comp +52 -0
  501. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f32.comp +52 -0
  502. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f16.comp +52 -0
  503. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f32.comp +52 -0
  504. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_scale.comp +19 -0
  505. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_scale_8.comp +23 -0
  506. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_silu.comp +22 -0
  507. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_softmax.comp +72 -0
  508. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/rope_common.comp +71 -0
  509. data/ext/sources/ggml/src/ggml-metal/CMakeLists.txt +121 -0
  510. data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +649 -0
  511. data/ext/{ggml → sources/ggml}/src/ggml-metal/ggml-metal.m +2504 -1108
  512. data/ext/{ggml → sources/ggml}/src/ggml-metal/ggml-metal.metal +2102 -1463
  513. data/ext/sources/ggml/src/ggml-musa/CMakeLists.txt +113 -0
  514. data/ext/sources/ggml/src/ggml-musa/mudnn.cu +112 -0
  515. data/ext/sources/ggml/src/ggml-musa/mudnn.cuh +12 -0
  516. data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +110 -0
  517. data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +6494 -0
  518. data/ext/sources/ggml/src/ggml-opencl/kernels/add.cl +83 -0
  519. data/ext/sources/ggml/src/ggml-opencl/kernels/argsort.cl +86 -0
  520. data/ext/sources/ggml/src/ggml-opencl/kernels/clamp.cl +20 -0
  521. data/ext/sources/ggml/src/ggml-opencl/kernels/concat.cl +109 -0
  522. data/ext/sources/ggml/src/ggml-opencl/kernels/cpy.cl +184 -0
  523. data/ext/sources/ggml/src/ggml-opencl/kernels/cvt.cl +118 -0
  524. data/ext/sources/ggml/src/ggml-opencl/kernels/diag_mask_inf.cl +58 -0
  525. data/ext/sources/ggml/src/ggml-opencl/kernels/div.cl +72 -0
  526. data/ext/sources/ggml/src/ggml-opencl/kernels/embed_kernel.py +26 -0
  527. data/ext/sources/ggml/src/ggml-opencl/kernels/gelu.cl +62 -0
  528. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle.cl +268 -0
  529. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_general.cl +274 -0
  530. data/ext/sources/ggml/src/ggml-opencl/kernels/get_rows.cl +163 -0
  531. data/ext/sources/ggml/src/ggml-opencl/kernels/glu.cl +201 -0
  532. data/ext/sources/ggml/src/ggml-opencl/kernels/group_norm.cl +72 -0
  533. data/ext/sources/ggml/src/ggml-opencl/kernels/im2col_f16.cl +57 -0
  534. data/ext/sources/ggml/src/ggml-opencl/kernels/im2col_f32.cl +57 -0
  535. data/ext/sources/ggml/src/ggml-opencl/kernels/mul.cl +79 -0
  536. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mat_Ab_Bi_8x4.cl +139 -0
  537. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_f16_f16.cl +118 -0
  538. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32.cl +118 -0
  539. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32_1row.cl +94 -0
  540. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32_l4.cl +84 -0
  541. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_f32_f32.cl +118 -0
  542. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl +283 -0
  543. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32.cl +192 -0
  544. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_1d_16x_flat.cl +307 -0
  545. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_1d_8x_flat.cl +265 -0
  546. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_8x_flat.cl +272 -0
  547. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_v.cl +254 -0
  548. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q6_k.cl +190 -0
  549. data/ext/sources/ggml/src/ggml-opencl/kernels/norm.cl +81 -0
  550. data/ext/sources/ggml/src/ggml-opencl/kernels/pad.cl +30 -0
  551. data/ext/sources/ggml/src/ggml-opencl/kernels/relu.cl +16 -0
  552. data/ext/sources/ggml/src/ggml-opencl/kernels/repeat.cl +39 -0
  553. data/ext/sources/ggml/src/ggml-opencl/kernels/rms_norm.cl +96 -0
  554. data/ext/sources/ggml/src/ggml-opencl/kernels/rope.cl +721 -0
  555. data/ext/sources/ggml/src/ggml-opencl/kernels/scale.cl +16 -0
  556. data/ext/sources/ggml/src/ggml-opencl/kernels/sigmoid.cl +29 -0
  557. data/ext/sources/ggml/src/ggml-opencl/kernels/silu.cl +30 -0
  558. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_4_f16.cl +87 -0
  559. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_4_f32.cl +87 -0
  560. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_f16.cl +86 -0
  561. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_f32.cl +86 -0
  562. data/ext/sources/ggml/src/ggml-opencl/kernels/sub.cl +72 -0
  563. data/ext/sources/ggml/src/ggml-opencl/kernels/sum_rows.cl +39 -0
  564. data/ext/sources/ggml/src/ggml-opencl/kernels/tanh.cl +63 -0
  565. data/ext/sources/ggml/src/ggml-opencl/kernels/transpose.cl +84 -0
  566. data/ext/sources/ggml/src/ggml-opencl/kernels/tsembd.cl +48 -0
  567. data/ext/sources/ggml/src/ggml-opencl/kernels/upscale.cl +121 -0
  568. data/ext/{ggml → sources/ggml}/src/ggml-opt.cpp +373 -190
  569. data/ext/{ggml → sources/ggml}/src/ggml-quants.c +120 -128
  570. data/ext/sources/ggml/src/ggml-rpc/CMakeLists.txt +9 -0
  571. data/ext/{ggml → sources/ggml}/src/ggml-rpc/ggml-rpc.cpp +494 -84
  572. data/ext/sources/ggml/src/ggml-sycl/CMakeLists.txt +189 -0
  573. data/ext/sources/ggml/src/ggml-sycl/backend.hpp +37 -0
  574. data/ext/sources/ggml/src/ggml-sycl/binbcast.cpp +344 -0
  575. data/ext/sources/ggml/src/ggml-sycl/binbcast.hpp +39 -0
  576. data/ext/{ggml → sources/ggml}/src/ggml-sycl/common.cpp +20 -32
  577. data/ext/sources/ggml/src/ggml-sycl/common.hpp +561 -0
  578. data/ext/{ggml → sources/ggml}/src/ggml-sycl/concat.cpp +56 -70
  579. data/ext/sources/ggml/src/ggml-sycl/concat.hpp +20 -0
  580. data/ext/{ggml → sources/ggml}/src/ggml-sycl/conv.cpp +8 -12
  581. data/ext/sources/ggml/src/ggml-sycl/conv.hpp +20 -0
  582. data/ext/sources/ggml/src/ggml-sycl/convert.cpp +575 -0
  583. data/ext/sources/ggml/src/ggml-sycl/convert.hpp +34 -0
  584. data/ext/sources/ggml/src/ggml-sycl/cpy.cpp +839 -0
  585. data/ext/sources/ggml/src/ggml-sycl/cpy.hpp +11 -0
  586. data/ext/sources/ggml/src/ggml-sycl/dequantize.hpp +823 -0
  587. data/ext/{ggml → sources/ggml}/src/ggml-sycl/dmmv.cpp +188 -67
  588. data/ext/sources/ggml/src/ggml-sycl/dmmv.hpp +27 -0
  589. data/ext/sources/ggml/src/ggml-sycl/dpct/helper.hpp +2987 -0
  590. data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +1120 -0
  591. data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +84 -0
  592. data/ext/sources/ggml/src/ggml-sycl/gemm.hpp +102 -0
  593. data/ext/sources/ggml/src/ggml-sycl/getrows.cpp +212 -0
  594. data/ext/sources/ggml/src/ggml-sycl/getrows.hpp +20 -0
  595. data/ext/{ggml → sources/ggml}/src/ggml-sycl/ggml-sycl.cpp +1197 -1295
  596. data/ext/sources/ggml/src/ggml-sycl/gla.cpp +106 -0
  597. data/ext/sources/ggml/src/ggml-sycl/gla.hpp +8 -0
  598. data/ext/sources/ggml/src/ggml-sycl/im2col.cpp +136 -0
  599. data/ext/sources/ggml/src/ggml-sycl/im2col.hpp +21 -0
  600. data/ext/{ggml → sources/ggml}/src/ggml-sycl/mmq.cpp +60 -81
  601. data/ext/sources/ggml/src/ggml-sycl/mmq.hpp +33 -0
  602. data/ext/sources/ggml/src/ggml-sycl/mmvq.cpp +1065 -0
  603. data/ext/sources/ggml/src/ggml-sycl/mmvq.hpp +27 -0
  604. data/ext/sources/ggml/src/ggml-sycl/norm.cpp +482 -0
  605. data/ext/sources/ggml/src/ggml-sycl/norm.hpp +26 -0
  606. data/ext/{ggml → sources/ggml}/src/ggml-sycl/outprod.cpp +8 -17
  607. data/ext/sources/ggml/src/ggml-sycl/outprod.hpp +10 -0
  608. data/ext/sources/ggml/src/ggml-sycl/presets.hpp +74 -0
  609. data/ext/sources/ggml/src/ggml-sycl/quants.hpp +111 -0
  610. data/ext/sources/ggml/src/ggml-sycl/rope.cpp +472 -0
  611. data/ext/sources/ggml/src/ggml-sycl/rope.hpp +20 -0
  612. data/ext/{ggml → sources/ggml}/src/ggml-sycl/softmax.cpp +38 -28
  613. data/ext/sources/ggml/src/ggml-sycl/softmax.hpp +20 -0
  614. data/ext/sources/ggml/src/ggml-sycl/sycl_hw.cpp +15 -0
  615. data/ext/sources/ggml/src/ggml-sycl/sycl_hw.hpp +26 -0
  616. data/ext/{ggml → sources/ggml}/src/ggml-sycl/tsembd.cpp +6 -11
  617. data/ext/sources/ggml/src/ggml-sycl/tsembd.hpp +20 -0
  618. data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +1307 -0
  619. data/ext/sources/ggml/src/ggml-sycl/wkv.cpp +289 -0
  620. data/ext/sources/ggml/src/ggml-sycl/wkv.hpp +10 -0
  621. data/ext/sources/ggml/src/ggml-vulkan/CMakeLists.txt +200 -0
  622. data/ext/sources/ggml/src/ggml-vulkan/cmake/host-toolchain.cmake.in +15 -0
  623. data/ext/{ggml → sources/ggml}/src/ggml-vulkan/ggml-vulkan.cpp +3822 -1335
  624. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +31 -0
  625. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/acc.comp +29 -0
  626. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add.comp +29 -0
  627. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argmax.comp +51 -0
  628. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp +69 -0
  629. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/clamp.comp +17 -0
  630. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/concat.comp +41 -0
  631. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/contig_copy.comp +49 -0
  632. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_dw.comp +105 -0
  633. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +98 -0
  634. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy.comp +23 -0
  635. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +51 -0
  636. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +242 -0
  637. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/cos.comp +17 -0
  638. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/count_equal.comp +31 -0
  639. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_f32.comp +20 -0
  640. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.comp +462 -0
  641. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.comp +699 -0
  642. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_head.comp +13 -0
  643. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_m.comp +42 -0
  644. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_s.comp +35 -0
  645. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_s.comp +44 -0
  646. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xs.comp +43 -0
  647. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xxs.comp +48 -0
  648. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_s.comp +39 -0
  649. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_xxs.comp +49 -0
  650. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_nl.comp +32 -0
  651. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_xs.comp +34 -0
  652. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q2_k.comp +34 -0
  653. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q3_k.comp +42 -0
  654. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_0.comp +30 -0
  655. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_1.comp +32 -0
  656. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_k.comp +68 -0
  657. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_0.comp +34 -0
  658. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_1.comp +35 -0
  659. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_k.comp +70 -0
  660. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q6_k.comp +33 -0
  661. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q8_0.comp +31 -0
  662. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/diag_mask_inf.comp +34 -0
  663. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/div.comp +27 -0
  664. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +337 -0
  665. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp +162 -0
  666. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +360 -0
  667. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +267 -0
  668. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +59 -0
  669. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu.comp +13 -0
  670. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gelu.comp +25 -0
  671. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gelu_quick.comp +23 -0
  672. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/generic_binary_head.comp +64 -0
  673. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/generic_head.comp +9 -0
  674. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/generic_unary_head.comp +76 -0
  675. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/get_rows.comp +33 -0
  676. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/get_rows_quant.comp +41 -0
  677. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.comp +15 -0
  678. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_main.comp +29 -0
  679. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/group_norm.comp +66 -0
  680. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +100 -0
  681. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/l2_norm.comp +41 -0
  682. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/leaky_relu.comp +22 -0
  683. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul.comp +27 -0
  684. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_split_k_reduce.comp +48 -0
  685. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp +169 -0
  686. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.comp +118 -0
  687. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_m.comp +82 -0
  688. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_s.comp +79 -0
  689. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_s.comp +90 -0
  690. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xs.comp +87 -0
  691. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xxs.comp +87 -0
  692. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_s.comp +90 -0
  693. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_xxs.comp +88 -0
  694. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_nc.comp +118 -0
  695. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_p021.comp +154 -0
  696. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp +130 -0
  697. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q3_k.comp +132 -0
  698. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp +136 -0
  699. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp +167 -0
  700. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp +130 -0
  701. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +868 -0
  702. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +441 -0
  703. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +442 -0
  704. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.comp +99 -0
  705. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/norm.comp +44 -0
  706. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_adamw.comp +42 -0
  707. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/pad.comp +28 -0
  708. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/pool2d.comp +74 -0
  709. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/quantize_q8_1.comp +77 -0
  710. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/reglu.comp +9 -0
  711. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/relu.comp +21 -0
  712. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/repeat.comp +26 -0
  713. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/repeat_back.comp +37 -0
  714. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +61 -0
  715. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_back.comp +55 -0
  716. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.comp +58 -0
  717. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +60 -0
  718. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +43 -0
  719. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +43 -0
  720. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_vision.comp +47 -0
  721. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp +24 -0
  722. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sigmoid.comp +20 -0
  723. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/silu.comp +22 -0
  724. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/silu_back.comp +26 -0
  725. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sin.comp +17 -0
  726. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +173 -0
  727. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_back.comp +50 -0
  728. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/square.comp +17 -0
  729. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sub.comp +29 -0
  730. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.comp +37 -0
  731. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/swiglu.comp +9 -0
  732. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/tanh.comp +20 -0
  733. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/test_bfloat16_support.comp +7 -0
  734. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/test_coopmat2_support.comp +7 -0
  735. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/test_coopmat_support.comp +7 -0
  736. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/test_integer_dot_support.comp +7 -0
  737. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/timestep_embedding.comp +41 -0
  738. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/types.comp +1373 -0
  739. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +36 -0
  740. data/ext/{ggml → sources/ggml}/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +203 -36
  741. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/wkv6.comp +87 -0
  742. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/wkv7.comp +91 -0
  743. data/ext/{ggml → sources/ggml}/src/ggml.c +918 -1782
  744. data/ext/sources/ggml/src/ggml.cpp +26 -0
  745. data/ext/sources/ggml/src/gguf.cpp +1351 -0
  746. data/ext/{include → sources/include}/whisper.h +70 -2
  747. data/ext/sources/src/CMakeLists.txt +145 -0
  748. data/ext/sources/src/coreml/whisper-compat.h +10 -0
  749. data/ext/sources/src/coreml/whisper-compat.m +35 -0
  750. data/ext/{src → sources/src}/coreml/whisper-decoder-impl.h +27 -15
  751. data/ext/{src → sources/src}/coreml/whisper-decoder-impl.m +36 -10
  752. data/ext/{src → sources/src}/coreml/whisper-encoder-impl.h +21 -9
  753. data/ext/{src → sources/src}/coreml/whisper-encoder-impl.m +29 -3
  754. data/ext/sources/src/coreml/whisper-encoder.mm +73 -0
  755. data/ext/sources/src/whisper-arch.h +197 -0
  756. data/ext/{src → sources/src}/whisper.cpp +1966 -386
  757. data/ext/sources/tests/CMakeLists.txt +105 -0
  758. data/ext/sources/tests/earnings21/eval.mk +58 -0
  759. data/ext/sources/tests/earnings21/eval.py +68 -0
  760. data/ext/sources/tests/earnings21/normalizers/__init__.py +2 -0
  761. data/ext/sources/tests/earnings21/normalizers/basic.py +80 -0
  762. data/ext/sources/tests/earnings21/normalizers/english.json +1741 -0
  763. data/ext/sources/tests/earnings21/normalizers/english.py +550 -0
  764. data/ext/sources/tests/earnings21/requirements.txt +6 -0
  765. data/ext/sources/tests/en-0-ref.txt +1 -0
  766. data/ext/sources/tests/en-1-ref.txt +1 -0
  767. data/ext/sources/tests/en-2-ref.txt +1 -0
  768. data/ext/sources/tests/es-0-ref.txt +1 -0
  769. data/ext/sources/tests/librispeech/eval.mk +39 -0
  770. data/ext/sources/tests/librispeech/eval.py +47 -0
  771. data/ext/sources/tests/librispeech/normalizers/__init__.py +2 -0
  772. data/ext/sources/tests/librispeech/normalizers/basic.py +80 -0
  773. data/ext/sources/tests/librispeech/normalizers/english.json +1741 -0
  774. data/ext/sources/tests/librispeech/normalizers/english.py +550 -0
  775. data/ext/sources/tests/librispeech/requirements.txt +6 -0
  776. data/ext/sources/tests/run-tests.sh +130 -0
  777. data/ext/sources/tests/test-c.c +3 -0
  778. data/ext/sources/tests/test-vad-full.cpp +54 -0
  779. data/ext/sources/tests/test-vad.cpp +83 -0
  780. data/ext/sources/tests/test-whisper.js +58 -0
  781. data/extsources.rb +39 -5
  782. data/lib/whisper/context.rb +15 -0
  783. data/lib/whisper/model/uri.rb +202 -126
  784. data/lib/whisper/segment.rb +58 -0
  785. data/sig/whisper.rbs +510 -0
  786. data/test/helper.rb +24 -0
  787. data/{tests → test}/test_callback.rb +45 -3
  788. data/{tests → test}/test_error.rb +2 -2
  789. data/{tests → test}/test_model.rb +47 -0
  790. data/test/test_package.rb +51 -0
  791. data/test/test_params.rb +297 -0
  792. data/test/test_segment.rb +146 -0
  793. data/test/test_vad.rb +19 -0
  794. data/test/test_vad_params.rb +103 -0
  795. data/{tests → test}/test_whisper.rb +106 -36
  796. data/whispercpp.gemspec +5 -5
  797. metadata +837 -134
  798. data/ext/cpu.mk +0 -9
  799. data/ext/examples/dr_wav.h +0 -8815
  800. data/ext/ggml/src/ggml-cann/aclnn_ops.h +0 -592
  801. data/ext/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +0 -4262
  802. data/ext/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +0 -8
  803. data/ext/ggml/src/ggml-cpu/ggml-cpu-quants.c +0 -10835
  804. data/ext/ggml/src/ggml-cpu/ggml-cpu.c +0 -14123
  805. data/ext/ggml/src/ggml-cpu/llamafile/sgemm.cpp +0 -1884
  806. data/ext/ggml/src/ggml-cpu/llamafile/sgemm.h +0 -14
  807. data/ext/ggml/src/ggml-metal/ggml-metal-impl.h +0 -288
  808. data/ext/ggml/src/ggml-sycl/convert.cpp +0 -547
  809. data/ext/ggml/src/ggml-sycl/element_wise.cpp +0 -1030
  810. data/ext/ggml/src/ggml-sycl/im2col.cpp +0 -126
  811. data/ext/ggml/src/ggml-sycl/mmvq.cpp +0 -1015
  812. data/ext/ggml/src/ggml-sycl/norm.cpp +0 -378
  813. data/ext/ggml/src/ggml-sycl/rope.cpp +0 -276
  814. data/ext/ggml/src/ggml-sycl/wkv6.cpp +0 -141
  815. data/ext/metal-embed.mk +0 -17
  816. data/ext/metal.mk +0 -6
  817. data/ext/ruby_whisper.cpp +0 -1909
  818. data/ext/scripts/get-flags.mk +0 -38
  819. data/lib/whisper.rb +0 -2
  820. data/tests/helper.rb +0 -7
  821. data/tests/test_package.rb +0 -31
  822. data/tests/test_params.rb +0 -160
  823. data/tests/test_segment.rb +0 -83
  824. /data/ext/{ggml → sources/ggml}/include/ggml-blas.h +0 -0
  825. /data/ext/{ggml → sources/ggml}/include/ggml-cann.h +0 -0
  826. /data/ext/{ggml → sources/ggml}/include/ggml-cuda.h +0 -0
  827. /data/ext/{ggml → sources/ggml}/include/ggml-kompute.h +0 -0
  828. /data/ext/{ggml → sources/ggml}/include/ggml-opencl.h +0 -0
  829. /data/ext/{ggml → sources/ggml}/include/ggml-sycl.h +0 -0
  830. /data/ext/{ggml → sources/ggml}/src/ggml-amx/common.h +0 -0
  831. /data/ext/{ggml → sources/ggml}/src/ggml-amx/ggml-amx.cpp +0 -0
  832. /data/ext/{ggml → sources/ggml}/src/ggml-amx/mmq.cpp +0 -0
  833. /data/ext/{ggml → sources/ggml}/src/ggml-amx/mmq.h +0 -0
  834. /data/ext/{ggml → sources/ggml}/src/ggml-blas/ggml-blas.cpp +0 -0
  835. /data/ext/{ggml → sources/ggml}/src/ggml-cann/kernels/ascendc_kernels.h +0 -0
  836. /data/ext/{ggml → sources/ggml}/src/ggml-cann/kernels/get_row_f16.cpp +0 -0
  837. /data/ext/{ggml → sources/ggml}/src/ggml-cann/kernels/get_row_f32.cpp +0 -0
  838. /data/ext/{ggml → sources/ggml}/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -0
  839. /data/ext/{ggml → sources/ggml}/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -0
  840. /data/ext/{ggml → sources/ggml}/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -0
  841. /data/ext/{ggml → sources/ggml}/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -0
  842. /data/ext/{ggml → sources/ggml}/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -0
  843. /data/ext/{ggml → sources/ggml}/src/ggml-cpu/amx/amx.h +0 -0
  844. /data/ext/{ggml → sources/ggml}/src/ggml-cpu/amx/common.h +0 -0
  845. /data/ext/{ggml → sources/ggml}/src/ggml-cpu/amx/mmq.h +0 -0
  846. /data/ext/{ggml/src/ggml-cpu/ggml-cpu-hbm.h → sources/ggml/src/ggml-cpu/hbm.h} +0 -0
  847. /data/ext/{ggml/src/ggml-cpu/ggml-cpu-traits.h → sources/ggml/src/ggml-cpu/traits.h} +0 -0
  848. /data/ext/{ggml → sources/ggml}/src/ggml-kompute/ggml-kompute.cpp +0 -0
  849. /data/ext/{ggml → sources/ggml}/src/ggml-quants.h +0 -0
  850. /data/ext/{ggml → sources/ggml}/src/ggml-threading.cpp +0 -0
  851. /data/ext/{ggml → sources/ggml}/src/ggml-threading.h +0 -0
  852. /data/ext/{src → sources/src}/coreml/whisper-encoder.h +0 -0
  853. /data/ext/{src → sources/src}/openvino/whisper-openvino-encoder.cpp +0 -0
  854. /data/ext/{src → sources/src}/openvino/whisper-openvino-encoder.h +0 -0
  855. /data/{tests → test}/jfk_reader/.gitignore +0 -0
  856. /data/{tests → test}/jfk_reader/extconf.rb +0 -0
  857. /data/{tests → test}/jfk_reader/jfk_reader.c +0 -0
@@ -0,0 +1,3647 @@
1
+ #include "ggml-cuda.h"
2
+ #include "ggml-impl.h"
3
+ #include "ggml-backend-impl.h"
4
+
5
+ #include "ggml-cuda/common.cuh"
6
+ #include "ggml-cuda/acc.cuh"
7
+ #include "ggml-cuda/arange.cuh"
8
+ #include "ggml-cuda/argmax.cuh"
9
+ #include "ggml-cuda/argsort.cuh"
10
+ #include "ggml-cuda/binbcast.cuh"
11
+ #include "ggml-cuda/clamp.cuh"
12
+ #include "ggml-cuda/concat.cuh"
13
+ #include "ggml-cuda/conv-transpose-1d.cuh"
14
+ #include "ggml-cuda/conv2d-dw.cuh"
15
+ #include "ggml-cuda/conv2d-transpose.cuh"
16
+ #include "ggml-cuda/convert.cuh"
17
+ #include "ggml-cuda/count-equal.cuh"
18
+ #include "ggml-cuda/cpy.cuh"
19
+ #include "ggml-cuda/cross-entropy-loss.cuh"
20
+ #include "ggml-cuda/diagmask.cuh"
21
+ #include "ggml-cuda/fattn.cuh"
22
+ #include "ggml-cuda/getrows.cuh"
23
+ #include "ggml-cuda/im2col.cuh"
24
+ #include "ggml-cuda/mmq.cuh"
25
+ #include "ggml-cuda/mmv.cuh"
26
+ #include "ggml-cuda/mmvq.cuh"
27
+ #include "ggml-cuda/norm.cuh"
28
+ #include "ggml-cuda/opt-step-adamw.cuh"
29
+ #include "ggml-cuda/out-prod.cuh"
30
+ #include "ggml-cuda/pad.cuh"
31
+ #include "ggml-cuda/pool2d.cuh"
32
+ #include "ggml-cuda/quantize.cuh"
33
+ #include "ggml-cuda/rope.cuh"
34
+ #include "ggml-cuda/scale.cuh"
35
+ #include "ggml-cuda/softmax.cuh"
36
+ #include "ggml-cuda/ssm-conv.cuh"
37
+ #include "ggml-cuda/ssm-scan.cuh"
38
+ #include "ggml-cuda/sum.cuh"
39
+ #include "ggml-cuda/sumrows.cuh"
40
+ #include "ggml-cuda/mean.cuh"
41
+ #include "ggml-cuda/tsembd.cuh"
42
+ #include "ggml-cuda/unary.cuh"
43
+ #include "ggml-cuda/upscale.cuh"
44
+ #include "ggml-cuda/wkv.cuh"
45
+ #include "ggml-cuda/gla.cuh"
46
+ #include "ggml.h"
47
+
48
+ #include <algorithm>
49
+ #include <array>
50
+ #include <atomic>
51
+ #include <charconv>
52
+ #include <cinttypes>
53
+ #include <condition_variable>
54
+ #include <cstddef>
55
+ #include <cstdint>
56
+ #include <float.h>
57
+ #include <limits>
58
+ #include <map>
59
+ #include <memory>
60
+ #include <mutex>
61
+ #include <stdarg.h>
62
+ #include <stdio.h>
63
+ #include <stdlib.h>
64
+ #include <string>
65
+ #include <vector>
66
+
67
+ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
68
+
69
+ [[noreturn]]
70
+ void ggml_cuda_error(const char * stmt, const char * func, const char * file, int line, const char * msg) {
71
+ int id = -1; // in case cudaGetDevice fails
72
+ (void)cudaGetDevice(&id);
73
+
74
+ GGML_LOG_ERROR(GGML_CUDA_NAME " error: %s\n", msg);
75
+ GGML_LOG_ERROR(" current device: %d, in function %s at %s:%d\n", id, func, file, line);
76
+ GGML_LOG_ERROR(" %s\n", stmt);
77
+ // abort with GGML_ABORT to get a stack trace
78
+ GGML_ABORT(GGML_CUDA_NAME " error");
79
+ }
80
+
81
+ // this is faster on Windows
82
+ // probably because the Windows CUDA libraries forget to make this check before invoking the drivers
83
+ void ggml_cuda_set_device(int device) {
84
+ int current_device;
85
+ CUDA_CHECK(cudaGetDevice(&current_device));
86
+
87
+ if (device == current_device) {
88
+ return;
89
+ }
90
+
91
+ CUDA_CHECK(cudaSetDevice(device));
92
+ }
93
+
94
+ int ggml_cuda_get_device() {
95
+ int id;
96
+ CUDA_CHECK(cudaGetDevice(&id));
97
+ return id;
98
+ }
99
+
100
+ static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device) {
101
+ ggml_cuda_set_device(device);
102
+ cudaError_t err;
103
+ if (getenv("GGML_CUDA_ENABLE_UNIFIED_MEMORY") != nullptr) {
104
+ err = cudaMallocManaged(ptr, size);
105
+ #if defined(GGML_USE_HIP)
106
+ if (err == hipSuccess) {
107
+ CUDA_CHECK(cudaMemAdvise(*ptr, size, hipMemAdviseSetCoarseGrain, device));
108
+ }
109
+
110
+ // fall back to cudaMalloc if not supported (e.g. on Windows)
111
+ if (err == hipErrorNotSupported) {
112
+ static bool warned_unsupported = false;
113
+ if (!warned_unsupported) {
114
+ GGML_LOG_WARN("hipMallocManaged unsupported, falling back to hipMalloc.\n");
115
+ warned_unsupported = true;
116
+ }
117
+
118
+ err = cudaMalloc(ptr, size);
119
+ }
120
+ #endif // defined(GGML_USE_HIP)
121
+ } else {
122
+ err = cudaMalloc(ptr, size);
123
+ }
124
+ return err;
125
+ }
126
+
127
+ #if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
128
+ static int ggml_cuda_parse_id(char devName[]) {
129
+ // A list of possible Target IDs can be found under the rocclr/clr repo in device.cpp
130
+ // these values are not stable so this is susceptible to breakage
131
+ // https://github.com/ROCm/clr/blob/amd-staging/rocclr/device/device.cpp
132
+ int archMajor = 0x0;
133
+ int archMinor = 0x0;
134
+ int archNum = GGML_CUDA_CC_OFFSET_AMD;
135
+ int archLen = strlen(devName);
136
+ char archName[archLen + 1];
137
+
138
+ // strip leading 'gfx' while copying into our buffer
139
+ if (archLen > 3) {
140
+ strcpy(archName, &devName[3]);
141
+ archLen -= 3;
142
+ }
143
+
144
+ // trim trailing :xnack- or :sramecc- statuses
145
+ archLen = strcspn(archName, ":");
146
+ archName[archLen] = '\0';
147
+
148
+ // tease out the version information
149
+ if (archLen > 8) {
150
+ // versions labeled generic use '-' as delimiter
151
+ // strip the trailing "-generic" then iterate through what remains
152
+ if ((strstr(archName, "-generic"))) {
153
+ archName[archLen - 8] = '\0';
154
+ char * pch;
155
+ if ((pch = strtok(archName, "-"))) {
156
+ archMajor = (int)strtoul(pch, 0, 16);
157
+ if ((pch = strtok(NULL, "-"))) {
158
+ archMinor = 0x10 * (int)strtoul(pch, 0, 16);
159
+ }
160
+ }
161
+ }
162
+ } else if (archLen >= 3) {
163
+ // last two digits should be the minor * 0x10 + stepping
164
+ archMinor = (int)strtoul(&archName[archLen - 2], 0, 16);
165
+ archName[archLen - 2] = '\0';
166
+
167
+ // only the major version remains
168
+ archMajor = (int)strtoul(archName, 0, 16);
169
+ }
170
+ archNum += archMajor * 0x100;
171
+ archNum += archMinor;
172
+ return archNum;
173
+ }
174
+ #endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
175
+
176
+ static ggml_cuda_device_info ggml_cuda_init() {
177
+ #ifdef __HIP_PLATFORM_AMD__
178
+ // Workaround for a rocBLAS bug when using multiple graphics cards:
179
+ // https://github.com/ROCmSoftwarePlatform/rocBLAS/issues/1346
180
+ {
181
+ int major_version = 0;
182
+ size_t version_length = 0;
183
+ if (rocblas_get_version_string_size(&version_length) == rocblas_status_success) {
184
+ std::vector<char> version(version_length+1, '\0');
185
+ if (rocblas_get_version_string(version.data(), version.size()) == rocblas_status_success) {
186
+ version.resize(::strlen(version.data()));
187
+ int parsed_value = 0;
188
+ if (std::from_chars(version.data(), version.data() + version.size(), parsed_value).ec == std::errc()) {
189
+ major_version = parsed_value;
190
+ }
191
+ }
192
+ }
193
+ if (major_version < 4) {
194
+ GGML_LOG_DEBUG(GGML_CUDA_NAME " calling rocblas_initialize as a workaround for a rocBLAS bug\n");
195
+ rocblas_initialize();
196
+ CUDA_CHECK(cudaDeviceSynchronize());
197
+ }
198
+ }
199
+ #endif
200
+
201
+ ggml_cuda_device_info info = {};
202
+
203
+ cudaError_t err = cudaGetDeviceCount(&info.device_count);
204
+ if (err != cudaSuccess) {
205
+ GGML_LOG_ERROR("%s: failed to initialize " GGML_CUDA_NAME ": %s\n", __func__, cudaGetErrorString(err));
206
+ return info;
207
+ }
208
+
209
+ GGML_ASSERT(info.device_count <= GGML_CUDA_MAX_DEVICES);
210
+
211
+ int64_t total_vram = 0;
212
+ #ifdef GGML_CUDA_FORCE_MMQ
213
+ GGML_LOG_INFO("%s: GGML_CUDA_FORCE_MMQ: yes\n", __func__);
214
+ #else
215
+ GGML_LOG_INFO("%s: GGML_CUDA_FORCE_MMQ: no\n", __func__);
216
+ #endif // GGML_CUDA_FORCE_MMQ
217
+ #ifdef GGML_CUDA_FORCE_CUBLAS
218
+ GGML_LOG_INFO("%s: GGML_CUDA_FORCE_CUBLAS: yes\n", __func__);
219
+ #else
220
+ GGML_LOG_INFO("%s: GGML_CUDA_FORCE_CUBLAS: no\n", __func__);
221
+ #endif // GGML_CUDA_FORCE_CUBLAS
222
+ GGML_LOG_INFO("%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, info.device_count);
223
+ for (int id = 0; id < info.device_count; ++id) {
224
+ int device_vmm = 0;
225
+
226
+ #if defined(GGML_USE_VMM)
227
+ CUdevice device;
228
+ CU_CHECK(cuDeviceGet(&device, id));
229
+ CU_CHECK(cuDeviceGetAttribute(&device_vmm, CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED, device));
230
+
231
+ if (device_vmm) {
232
+ CUmemAllocationProp alloc_prop = {};
233
+ alloc_prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
234
+ alloc_prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
235
+ alloc_prop.location.id = id;
236
+ CU_CHECK(cuMemGetAllocationGranularity(&info.devices[id].vmm_granularity, &alloc_prop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED));
237
+ }
238
+ #endif // defined(GGML_USE_VMM)
239
+ info.devices[id].vmm = !!device_vmm;
240
+
241
+ cudaDeviceProp prop;
242
+ CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
243
+
244
+ info.default_tensor_split[id] = total_vram;
245
+ total_vram += prop.totalGlobalMem;
246
+ info.devices[id].integrated = prop.integrated;
247
+ info.devices[id].nsm = prop.multiProcessorCount;
248
+ info.devices[id].smpb = prop.sharedMemPerBlock;
249
+ info.devices[id].warp_size = prop.warpSize;
250
+ #if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
251
+ info.devices[id].smpbo = prop.sharedMemPerBlock;
252
+
253
+ info.devices[id].cc = ggml_cuda_parse_id(prop.gcnArchName);
254
+ if ((info.devices[id].cc & 0xff00) == 0x0) {
255
+ GGML_LOG_WARN("invalid architecture ID received for device %d %s: %s cc %d.%d\n",
256
+ id, prop.name, prop.gcnArchName, prop.major, prop.minor);
257
+
258
+ // Fallback to prop.major and prop.minor
259
+ if (prop.major > 0) {
260
+ info.devices[id].cc = GGML_CUDA_CC_OFFSET_AMD + prop.major * 0x100;
261
+ info.devices[id].cc += prop.minor * 0x10;
262
+ }
263
+ }
264
+ GGML_LOG_INFO(" Device %d: %s, %s (0x%x), VMM: %s, Wave Size: %d\n",
265
+ id, prop.name, prop.gcnArchName, info.devices[id].cc & 0xffff,
266
+ device_vmm ? "yes" : "no", prop.warpSize);
267
+ #elif defined(GGML_USE_MUSA)
268
+ // FIXME: Ensure compatibility with varying warp sizes across different MUSA archs.
269
+ info.devices[id].warp_size = 32;
270
+ info.devices[id].smpbo = prop.sharedMemPerBlockOptin;
271
+ info.devices[id].cc = GGML_CUDA_CC_OFFSET_MTHREADS + prop.major * 0x100;
272
+ info.devices[id].cc += prop.minor * 0x10;
273
+ GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s\n",
274
+ id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");
275
+ #else
276
+ info.devices[id].smpbo = prop.sharedMemPerBlockOptin;
277
+ info.devices[id].cc = 100*prop.major + 10*prop.minor;
278
+ GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s\n",
279
+ id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");
280
+ #endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
281
+ }
282
+
283
+ for (int id = 0; id < info.device_count; ++id) {
284
+ info.default_tensor_split[id] /= total_vram;
285
+ }
286
+
287
+ // configure logging to stdout
288
+ // CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, nullptr));
289
+
290
+ return info;
291
+ }
292
+
293
+ const ggml_cuda_device_info & ggml_cuda_info() {
294
+ static ggml_cuda_device_info info = ggml_cuda_init();
295
+ return info;
296
+ }
297
+
298
+ // #define DEBUG_CUDA_MALLOC
299
+
300
+ // buffer pool for cuda (legacy)
301
+ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
302
+ static const int MAX_BUFFERS = 256;
303
+
304
+ int device;
305
+ struct ggml_cuda_buffer {
306
+ void * ptr = nullptr;
307
+ size_t size = 0;
308
+ };
309
+
310
+ ggml_cuda_buffer buffer_pool[MAX_BUFFERS] = {};
311
+ size_t pool_size = 0;
312
+
313
+ explicit ggml_cuda_pool_leg(int device) :
314
+ device(device) {
315
+ }
316
+
317
+ ~ggml_cuda_pool_leg() {
318
+ ggml_cuda_set_device(device);
319
+ for (int i = 0; i < MAX_BUFFERS; ++i) {
320
+ ggml_cuda_buffer & b = buffer_pool[i];
321
+ if (b.ptr != nullptr) {
322
+ CUDA_CHECK(cudaFree(b.ptr));
323
+ pool_size -= b.size;
324
+ }
325
+ }
326
+ GGML_ASSERT(pool_size == 0);
327
+ }
328
+
329
+ void * alloc(size_t size, size_t * actual_size) override {
330
+ #ifdef DEBUG_CUDA_MALLOC
331
+ int nnz = 0;
332
+ size_t max_size = 0;
333
+ #endif
334
+ size_t best_diff = 1ull << 36;
335
+ int ibest = -1;
336
+ for (int i = 0; i < MAX_BUFFERS; ++i) {
337
+ ggml_cuda_buffer& b = buffer_pool[i];
338
+ if (b.ptr != nullptr) {
339
+ #ifdef DEBUG_CUDA_MALLOC
340
+ ++nnz;
341
+ if (b.size > max_size) max_size = b.size;
342
+ #endif
343
+ if (b.size >= size) {
344
+ size_t diff = b.size - size;
345
+ if (diff < best_diff) {
346
+ best_diff = diff;
347
+ ibest = i;
348
+ if (!best_diff) {
349
+ void * ptr = b.ptr;
350
+ *actual_size = b.size;
351
+ b.ptr = nullptr;
352
+ b.size = 0;
353
+ return ptr;
354
+ }
355
+ }
356
+ }
357
+ }
358
+ }
359
+ if (ibest >= 0) {
360
+ ggml_cuda_buffer& b = buffer_pool[ibest];
361
+ void * ptr = b.ptr;
362
+ *actual_size = b.size;
363
+ b.ptr = nullptr;
364
+ b.size = 0;
365
+ return ptr;
366
+ }
367
+ void * ptr;
368
+ size_t look_ahead_size = (size_t) (1.05 * size);
369
+ look_ahead_size = 256 * ((look_ahead_size + 255)/256);
370
+ ggml_cuda_set_device(device);
371
+ CUDA_CHECK(ggml_cuda_device_malloc(&ptr, look_ahead_size, device));
372
+ *actual_size = look_ahead_size;
373
+ pool_size += look_ahead_size;
374
+ #ifdef DEBUG_CUDA_MALLOC
375
+ GGML_LOG_INFO("%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, requested %u MB\n", __func__, device, nnz,
376
+ (uint32_t)(max_size / 1024 / 1024), (uint32_t)(pool_size / 1024 / 1024), (uint32_t)(size / 1024 / 1024));
377
+ #endif
378
+ return ptr;
379
+ }
380
+
381
+ void free(void * ptr, size_t size) override {
382
+ for (int i = 0; i < MAX_BUFFERS; ++i) {
383
+ ggml_cuda_buffer& b = buffer_pool[i];
384
+ if (b.ptr == nullptr) {
385
+ b.ptr = ptr;
386
+ b.size = size;
387
+ return;
388
+ }
389
+ }
390
+ GGML_LOG_DEBUG(GGML_CUDA_NAME " buffer pool full, increase MAX_CUDA_BUFFERS\n");
391
+ ggml_cuda_set_device(device);
392
+ CUDA_CHECK(cudaFree(ptr));
393
+ pool_size -= size;
394
+ }
395
+ };
396
+
397
// pool with virtual memory
//
// Reserves one virtual address range of CUDA_POOL_VMM_MAX_SIZE bytes and maps physical memory
// into it on demand. Allocations are carved off the end of the used region, so they must be
// released in reverse order of allocation (checked in free()).
#if defined(GGML_USE_VMM)
struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
    static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 35; // 32 GB

    int device;
    CUdeviceptr pool_addr = 0; // start of the reserved address range, 0 until first use
    size_t pool_used = 0;      // bytes currently handed out
    size_t pool_size = 0;      // bytes currently backed by physical memory
    size_t granularity;
#if defined(GGML_USE_HIP)
    std::vector<std::pair<CUdeviceptr, size_t>> mappings;
#endif

    explicit ggml_cuda_pool_vmm(int device) :
        device(device),
        granularity(ggml_cuda_info().devices[device].vmm_granularity) {
    }

    // the pool owns the address reservation and its mappings: copying it would release them twice
    ggml_cuda_pool_vmm(const ggml_cuda_pool_vmm &) = delete;
    ggml_cuda_pool_vmm & operator=(const ggml_cuda_pool_vmm &) = delete;

    ~ggml_cuda_pool_vmm() {
        if (pool_addr != 0) {
#if defined(GGML_USE_HIP)
            // Workaround for https://github.com/ROCm/ROCR-Runtime/issues/285
            for (std::pair<CUdeviceptr, size_t> & mapping : mappings) {
                CU_CHECK(cuMemUnmap(mapping.first, mapping.second));
            }
#else
            CU_CHECK(cuMemUnmap(pool_addr, pool_size));
#endif
            CU_CHECK(cuMemAddressFree(pool_addr, CUDA_POOL_VMM_MAX_SIZE));
        }
    }

    // Returns `size` bytes (rounded up to a multiple of 128) from the end of the used region,
    // growing the mapped region first if needed. The rounded size is written to `*actual_size`.
    void * alloc(size_t size, size_t * actual_size) override {
        // round up the allocation size to the alignment to ensure that all allocations are aligned for all data types
        const size_t alignment = 128;
        size = alignment * ((size + alignment - 1) / alignment);

        const size_t avail = pool_size - pool_used;

        if (size > avail) {
            // round up to the next multiple of the granularity
            size_t reserve_size = size - avail;
            reserve_size = granularity * ((reserve_size + granularity - 1) / granularity);

            GGML_ASSERT(pool_size + reserve_size <= CUDA_POOL_VMM_MAX_SIZE);

            // allocate more physical memory
            CUmemAllocationProp prop = {};
            prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
            prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
            prop.location.id = device;
            CUmemGenericAllocationHandle handle;
            CU_CHECK(cuMemCreate(&handle, reserve_size, &prop, 0));

            // reserve virtual address space (if not already reserved)
            if (pool_addr == 0) {
                CU_CHECK(cuMemAddressReserve(&pool_addr, CUDA_POOL_VMM_MAX_SIZE, 0, 0, 0));
            }

            // map at the end of the pool
            const CUdeviceptr start_ptr = (CUdeviceptr)((char *)(pool_addr) + pool_size);
            CU_CHECK(cuMemMap(start_ptr, reserve_size, 0, handle, 0));
#if defined(GGML_USE_HIP)
            mappings.push_back({start_ptr, reserve_size});
#endif

            // the memory allocation handle is no longer needed after mapping
            CU_CHECK(cuMemRelease(handle));

            // set access (for the range that was just mapped)
            CUmemAccessDesc access = {};
            access.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
            access.location.id = device;
            access.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
            CU_CHECK(cuMemSetAccess(start_ptr, reserve_size, &access, 1));

            // add to the pool
            pool_size += reserve_size;

            //printf("cuda pool[%d]: size increased to %llu MB (reserved %llu MB)\n",
            //       device, (unsigned long long) (pool_size/1024/1024),
            //       (unsigned long long) (reserve_size/1024/1024));
        }

        GGML_ASSERT(pool_addr != 0);

        void * ptr = (void *) ((CUdeviceptr)((char *)(pool_addr) + pool_used));
        *actual_size = size;
        pool_used += size;

#ifdef DEBUG_CUDA_MALLOC
        // the pointer is converted explicitly: passing a void * for %llx is a format/argument mismatch
        printf("cuda pool[%d]: allocated %llu bytes at %llx\n", device, (unsigned long long) size, (unsigned long long) (uintptr_t) ptr);
#endif

        return ptr;
    }

    // Releases the most recent allocation; `size` must be the actual_size reported by alloc().
    void free(void * ptr, size_t size) override {
#ifdef DEBUG_CUDA_MALLOC
        printf("cuda pool[%d]: freed %llu bytes at %llx\n", device, (unsigned long long) size, (unsigned long long) (uintptr_t) ptr);
#endif

        pool_used -= size;

        // all deallocations must be in reverse order of the allocations
        GGML_ASSERT(ptr == (void *) ((char *)(pool_addr) + pool_used));
    }
};
#endif // defined(GGML_USE_VMM)
507
+
508
+ std::unique_ptr<ggml_cuda_pool> ggml_backend_cuda_context::new_pool_for_device(int device) {
509
+ #if defined(GGML_USE_VMM)
510
+ if (ggml_cuda_info().devices[device].vmm) {
511
+ return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_vmm(device));
512
+ }
513
+ #endif // defined(GGML_USE_VMM)
514
+ return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_leg(device));
515
+ }
516
+
517
// Destroying a cuBLAS handle while another thread is capturing a graph can result in a CUDA error.
// The mutex / condition variable / counter below are used to make sure that no cuBLAS handle is
// destroyed while a graph capture is in progress: the context destructor waits until the counter is 0.

static std::mutex              ggml_cuda_lock;
static std::condition_variable ggml_cuda_lock_cv;
static std::atomic<int>        ggml_cuda_lock_counter;
523
+
524
+ ggml_backend_cuda_context::~ggml_backend_cuda_context() {
525
+ std::unique_lock<std::mutex> lock(ggml_cuda_lock);
526
+ ggml_cuda_lock_cv.wait(lock, []{ return ggml_cuda_lock_counter.load(std::memory_order_relaxed) == 0; });
527
+
528
+ if (copy_event != nullptr) {
529
+ CUDA_CHECK(cudaEventDestroy(copy_event));
530
+ }
531
+ for (int i = 0; i < GGML_CUDA_MAX_DEVICES; ++i) {
532
+ for (int j = 0; j < GGML_CUDA_MAX_STREAMS; ++j) {
533
+ if (streams[i][j] != nullptr) {
534
+ CUDA_CHECK(cudaStreamDestroy(streams[i][j]));
535
+ }
536
+ }
537
+ if (cublas_handles[i] != nullptr) {
538
+ CUBLAS_CHECK(cublasDestroy(cublas_handles[i]));
539
+ }
540
+ }
541
+ }
542
+
543
+
544
+ // cuda buffer
545
+
546
+ struct ggml_backend_cuda_buffer_context {
547
+ int device;
548
+ void * dev_ptr = nullptr;
549
+ std::string name;
550
+
551
+ ggml_backend_cuda_buffer_context(int device, void * dev_ptr) :
552
+ device(device), dev_ptr(dev_ptr),
553
+ name(GGML_CUDA_NAME + std::to_string(device)) {
554
+ }
555
+
556
+ ~ggml_backend_cuda_buffer_context() {
557
+ CUDA_CHECK(cudaFree(dev_ptr));
558
+ }
559
+ };
560
+
561
+ static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
562
+ ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
563
+ delete ctx;
564
+ }
565
+
566
+ static bool ggml_backend_buffer_is_cuda(ggml_backend_buffer_t buffer) {
567
+ return buffer->iface.free_buffer == ggml_backend_cuda_buffer_free_buffer;
568
+ }
569
+
570
+ static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) {
571
+ ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
572
+ return ctx->dev_ptr;
573
+ }
574
+
575
+ static enum ggml_status ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
576
+ ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
577
+
578
+ if (tensor->view_src != NULL) {
579
+ assert(tensor->view_src->buffer->buft == buffer->buft);
580
+ return GGML_STATUS_SUCCESS;
581
+ }
582
+
583
+ if (ggml_is_quantized(tensor->type) && tensor->view_src == nullptr && ggml_backend_buffer_get_usage(buffer) != GGML_BACKEND_BUFFER_USAGE_COMPUTE) {
584
+ // initialize padding to 0 to avoid possible NaN values
585
+ const size_t original_size = ggml_nbytes(tensor);
586
+ const size_t padded_size = ggml_backend_buft_get_alloc_size(buffer->buft, tensor);
587
+
588
+ if (padded_size > original_size) {
589
+ ggml_cuda_set_device(ctx->device);
590
+ CUDA_CHECK(cudaMemset((char *)tensor->data + original_size, 0, padded_size - original_size));
591
+ }
592
+ }
593
+ return GGML_STATUS_SUCCESS;
594
+ }
595
+
596
+ static void ggml_backend_cuda_buffer_memset_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
597
+ ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
598
+
599
+ ggml_cuda_set_device(ctx->device);
600
+ CUDA_CHECK(cudaMemsetAsync((char *)tensor->data + offset, value, size, cudaStreamPerThread));
601
+ CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
602
+ }
603
+
604
+ static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
605
+ ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
606
+
607
+ ggml_cuda_set_device(ctx->device);
608
+ CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, cudaStreamPerThread));
609
+ CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
610
+ }
611
+
612
+ static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
613
+ ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
614
+
615
+ ggml_cuda_set_device(ctx->device);
616
+ CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, cudaStreamPerThread));
617
+ CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
618
+ }
619
+
620
+ static bool ggml_backend_cuda_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) {
621
+ if (ggml_backend_buffer_is_cuda(src->buffer)) {
622
+ ggml_backend_cuda_buffer_context * src_ctx = (ggml_backend_cuda_buffer_context *)src->buffer->context;
623
+ ggml_backend_cuda_buffer_context * dst_ctx = (ggml_backend_cuda_buffer_context *)dst->buffer->context;
624
+ if (src_ctx->device == dst_ctx->device) {
625
+ CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(src), cudaMemcpyDeviceToDevice, cudaStreamPerThread));
626
+ } else {
627
+ #ifdef GGML_CUDA_NO_PEER_COPY
628
+ return false;
629
+ #else
630
+ CUDA_CHECK(cudaMemcpyPeerAsync(dst->data, dst_ctx->device, src->data, src_ctx->device, ggml_nbytes(src), cudaStreamPerThread));
631
+ #endif
632
+ }
633
+ CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
634
+ return true;
635
+ }
636
+ return false;
637
+
638
+ GGML_UNUSED(buffer);
639
+ }
640
+
641
+ static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
642
+ ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
643
+
644
+ ggml_cuda_set_device(ctx->device);
645
+ CUDA_CHECK(cudaMemsetAsync(ctx->dev_ptr, value, buffer->size, cudaStreamPerThread));
646
+ CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
647
+ }
648
+
649
+ static const ggml_backend_buffer_i ggml_backend_cuda_buffer_interface = {
650
+ /* .free_buffer = */ ggml_backend_cuda_buffer_free_buffer,
651
+ /* .get_base = */ ggml_backend_cuda_buffer_get_base,
652
+ /* .init_tensor = */ ggml_backend_cuda_buffer_init_tensor,
653
+ /* .memset_tensor = */ ggml_backend_cuda_buffer_memset_tensor,
654
+ /* .set_tensor = */ ggml_backend_cuda_buffer_set_tensor,
655
+ /* .get_tensor = */ ggml_backend_cuda_buffer_get_tensor,
656
+ /* .cpy_tensor = */ ggml_backend_cuda_buffer_cpy_tensor,
657
+ /* .clear = */ ggml_backend_cuda_buffer_clear,
658
+ /* .reset = */ NULL,
659
+ };
660
+
661
// cuda buffer type

// Per-buffer-type state: the device index and the name reported by get_name.
struct ggml_backend_cuda_buffer_type_context {
    int         device;
    std::string name;
};
666
+
667
+ static const char * ggml_backend_cuda_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
668
+ ggml_backend_cuda_buffer_type_context * ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
669
+
670
+ return ctx->name.c_str();
671
+ }
672
+
673
+ static bool ggml_backend_buft_is_cuda(ggml_backend_buffer_type_t buft) {
674
+ return buft->iface.get_name == ggml_backend_cuda_buffer_type_get_name;
675
+ }
676
+
677
+ static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
678
+ ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
679
+
680
+ ggml_cuda_set_device(buft_ctx->device);
681
+
682
+ void * dev_ptr;
683
+ cudaError_t err = ggml_cuda_device_malloc(&dev_ptr, size, buft_ctx->device);
684
+ if (err != cudaSuccess) {
685
+ // clear the error
686
+ (void)cudaGetLastError();
687
+ GGML_LOG_ERROR("%s: allocating %.2f MiB on device %d: cudaMalloc failed: %s\n", __func__, size / 1024.0 / 1024.0, buft_ctx->device, cudaGetErrorString(err));
688
+ return nullptr;
689
+ }
690
+
691
+ ggml_backend_cuda_buffer_context * ctx = new ggml_backend_cuda_buffer_context(buft_ctx->device, dev_ptr);
692
+
693
+ return ggml_backend_buffer_init(buft, ggml_backend_cuda_buffer_interface, ctx, size);
694
+ }
695
+
696
+ static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
697
+ return 128;
698
+
699
+ GGML_UNUSED(buft);
700
+ }
701
+
702
+ static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
703
+ size_t size = ggml_nbytes(tensor);
704
+ int64_t ne0 = tensor->ne[0];
705
+
706
+ if (ggml_is_quantized(tensor->type)) {
707
+ if (ne0 % MATRIX_ROW_PADDING != 0) {
708
+ GGML_ASSERT(tensor->nb[0] == ggml_element_size(tensor));
709
+ size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
710
+ }
711
+ }
712
+
713
+ return size;
714
+
715
+ GGML_UNUSED(buft);
716
+ }
717
+
718
+ static const ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = {
719
+ /* .get_name = */ ggml_backend_cuda_buffer_type_get_name,
720
+ /* .alloc_buffer = */ ggml_backend_cuda_buffer_type_alloc_buffer,
721
+ /* .get_alignment = */ ggml_backend_cuda_buffer_type_get_alignment,
722
+ /* .get_max_size = */ NULL, // defaults to SIZE_MAX
723
+ /* .get_alloc_size = */ ggml_backend_cuda_buffer_type_get_alloc_size,
724
+ /* .is_host = */ NULL,
725
+ };
726
+
727
+ ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
728
+ static std::mutex mutex;
729
+ std::lock_guard<std::mutex> lock(mutex);
730
+
731
+ if (device >= ggml_backend_cuda_get_device_count()) {
732
+ return nullptr;
733
+ }
734
+
735
+ static ggml_backend_buffer_type ggml_backend_cuda_buffer_types[GGML_CUDA_MAX_DEVICES];
736
+
737
+ static bool ggml_backend_cuda_buffer_type_initialized = false;
738
+
739
+ if (!ggml_backend_cuda_buffer_type_initialized) {
740
+ for (int i = 0; i < ggml_backend_cuda_get_device_count(); i++) {
741
+ ggml_backend_cuda_buffer_types[i] = {
742
+ /* .iface = */ ggml_backend_cuda_buffer_type_interface,
743
+ /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cuda_reg(), i),
744
+ /* .context = */ new ggml_backend_cuda_buffer_type_context{i, GGML_CUDA_NAME + std::to_string(i)},
745
+ };
746
+ }
747
+ ggml_backend_cuda_buffer_type_initialized = true;
748
+ }
749
+
750
+ return &ggml_backend_cuda_buffer_types[device];
751
+ }
752
+
753
+ // cuda split buffer
754
+
755
+ static int64_t get_row_rounding(const std::array<float, GGML_CUDA_MAX_DEVICES> & tensor_split) {
756
+ int64_t row_rounding = 0;
757
+ for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
758
+ if (tensor_split[id] >= (id + 1 < ggml_backend_cuda_get_device_count() ? tensor_split[id + 1] : 1.0f)) {
759
+ continue;
760
+ }
761
+
762
+ const int cc = ggml_cuda_info().devices[id].cc;
763
+ row_rounding = std::max(row_rounding, (int64_t)get_mmq_y_host(cc));
764
+ }
765
+ return row_rounding;
766
+ }
767
+
768
+ static void get_row_split(int64_t * row_low, int64_t * row_high, const ggml_tensor * tensor, const std::array<float, GGML_CUDA_MAX_DEVICES> & tensor_split, int id) {
769
+ const int64_t nrows = ggml_nrows(tensor);
770
+ const int64_t rounding = get_row_rounding(tensor_split);
771
+
772
+ *row_low = id == 0 ? 0 : nrows*tensor_split[id];
773
+ *row_low -= *row_low % rounding;
774
+
775
+ if (id == ggml_backend_cuda_get_device_count() - 1) {
776
+ *row_high = nrows;
777
+ } else {
778
+ *row_high = nrows*tensor_split[id + 1];
779
+ *row_high -= *row_high % rounding;
780
+ }
781
+ }
782
+
783
+ static size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split) {
784
+ static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
785
+
786
+ return nrows_split*ggml_row_size(tensor->type, tensor->ne[0]);
787
+ }
788
+
789
+ struct ggml_backend_cuda_split_buffer_type_context {
790
+ int main_device;
791
+ std::array<float, GGML_CUDA_MAX_DEVICES> tensor_split;
792
+ std::string name;
793
+ };
794
+
795
+ struct ggml_backend_cuda_split_buffer_context {
796
+ ~ggml_backend_cuda_split_buffer_context() {
797
+ for (ggml_tensor_extra_gpu * extra : tensor_extras) {
798
+ for (int id = 0; id < GGML_CUDA_MAX_DEVICES; ++id) {
799
+ for (int64_t is = 0; is < GGML_CUDA_MAX_STREAMS; ++is) {
800
+ if (extra->events[id][is] != nullptr) {
801
+ CUDA_CHECK(cudaEventDestroy(extra->events[id][is]));
802
+ }
803
+ }
804
+ if (extra->data_device[id] != nullptr) {
805
+ CUDA_CHECK(cudaFree(extra->data_device[id]));
806
+ }
807
+ }
808
+ delete extra;
809
+ }
810
+ }
811
+
812
+ std::vector<ggml_tensor_extra_gpu *> tensor_extras;
813
+ };
814
+
815
+
816
+ static void ggml_backend_cuda_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
817
+ ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
818
+ delete ctx;
819
+ }
820
+
821
+ static void * ggml_backend_cuda_split_buffer_get_base(ggml_backend_buffer_t buffer) {
822
+ // the pointers are stored in the tensor extras, this is just a dummy address and never dereferenced
823
+ return (void *)0x1000;
824
+
825
+ GGML_UNUSED(buffer);
826
+ }
827
+
828
+ static enum ggml_status ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
829
+ GGML_ASSERT(tensor->view_src == nullptr); // views of split tensors are not supported
830
+ GGML_ASSERT(ggml_is_contiguous(tensor) && "split buffers only supported for contiguous tensors");
831
+
832
+ ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
833
+ ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context;
834
+
835
+ const int64_t ne0 = tensor->ne[0];
836
+
837
+ ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu{};
838
+ ctx->tensor_extras.push_back(extra);
839
+
840
+ for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
841
+ int64_t row_low, row_high;
842
+ get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id);
843
+
844
+ int64_t nrows_split = row_high - row_low;
845
+ if (nrows_split == 0) {
846
+ continue;
847
+ }
848
+
849
+ size_t size = ggml_nbytes_split(tensor, nrows_split);
850
+ const size_t original_size = size;
851
+
852
+ // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
853
+ if (ne0 % MATRIX_ROW_PADDING != 0) {
854
+ size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
855
+ }
856
+
857
+ // FIXME: do not crash if cudaMalloc fails
858
+ // currently, init_tensor cannot fail, it needs to be fixed in ggml-backend first
859
+ ggml_cuda_set_device(id);
860
+ char * buf;
861
+ CUDA_CHECK(ggml_cuda_device_malloc((void**)&buf, size, id));
862
+
863
+ // set padding to 0 to avoid possible NaN values
864
+ if (size > original_size) {
865
+ CUDA_CHECK(cudaMemset(buf + original_size, 0, size - original_size));
866
+ }
867
+
868
+ extra->data_device[id] = buf;
869
+
870
+ for (int64_t is = 0; is < GGML_CUDA_MAX_STREAMS; ++is) {
871
+ CUDA_CHECK(cudaEventCreateWithFlags(&extra->events[id][is], cudaEventDisableTiming));
872
+ }
873
+ }
874
+ tensor->extra = extra;
875
+ return GGML_STATUS_SUCCESS;
876
+ }
877
+
878
+ static void ggml_backend_cuda_split_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
879
+ // split tensors must always be set in their entirety at once
880
+ GGML_ASSERT(offset == 0);
881
+ GGML_ASSERT(size == ggml_nbytes(tensor));
882
+ GGML_ASSERT(ggml_is_contiguous(tensor) && "split buffers only supported for contiguous tensors");
883
+
884
+ ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context;
885
+
886
+ const int64_t ne0 = tensor->ne[0];
887
+ const size_t nb1 = tensor->nb[1];
888
+ ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *)tensor->extra;
889
+
890
+ for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
891
+ int64_t row_low, row_high;
892
+ get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id);
893
+
894
+ int64_t nrows_split = row_high - row_low;
895
+ if (nrows_split == 0) {
896
+ continue;
897
+ }
898
+
899
+ const size_t offset_split = row_low*nb1;
900
+ size_t size = ggml_nbytes_split(tensor, nrows_split);
901
+ const size_t original_size = size;
902
+
903
+ // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
904
+ if (ne0 % MATRIX_ROW_PADDING != 0) {
905
+ size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
906
+ }
907
+
908
+ const char * buf_host = (const char *)data + offset_split;
909
+ CUDA_CHECK(cudaMemcpyAsync(extra->data_device[id], buf_host, original_size, cudaMemcpyHostToDevice, cudaStreamPerThread));
910
+ }
911
+
912
+ for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
913
+ CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
914
+ }
915
+ }
916
+
917
+ static void ggml_backend_cuda_split_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
918
+ // split tensors must always be set in their entirety at once
919
+ GGML_ASSERT(offset == 0);
920
+ GGML_ASSERT(size == ggml_nbytes(tensor));
921
+ GGML_ASSERT(ggml_is_contiguous(tensor) && "split buffers only supported for contiguous tensors");
922
+
923
+ ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context;
924
+
925
+ const int64_t ne0 = tensor->ne[0];
926
+ const size_t nb1 = tensor->nb[1];
927
+ ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *)tensor->extra;
928
+
929
+ for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
930
+ int64_t row_low, row_high;
931
+ get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id);
932
+
933
+ int64_t nrows_split = row_high - row_low;
934
+ if (nrows_split == 0) {
935
+ continue;
936
+ }
937
+
938
+ const size_t offset_split = row_low*nb1;
939
+ size_t size = ggml_nbytes_split(tensor, nrows_split);
940
+ const size_t original_size = size;
941
+
942
+ // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
943
+ if (ne0 % MATRIX_ROW_PADDING != 0) {
944
+ size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
945
+ }
946
+
947
+ char * buf_host = (char *)data + offset_split;
948
+ CUDA_CHECK(cudaMemcpyAsync(buf_host, extra->data_device[id], original_size, cudaMemcpyDeviceToHost, cudaStreamPerThread));
949
+ }
950
+
951
+ for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
952
+ CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
953
+ }
954
+ }
955
+
956
+ static void ggml_backend_cuda_split_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
957
+ GGML_UNUSED(buffer);
958
+ GGML_UNUSED(value);
959
+ }
960
+
961
+ static const ggml_backend_buffer_i ggml_backend_cuda_split_buffer_interface = {
962
+ /* .free_buffer = */ ggml_backend_cuda_split_buffer_free_buffer,
963
+ /* .get_base = */ ggml_backend_cuda_split_buffer_get_base,
964
+ /* .init_tensor = */ ggml_backend_cuda_split_buffer_init_tensor,
965
+ /* .memset_tensor = */ NULL,
966
+ /* .set_tensor = */ ggml_backend_cuda_split_buffer_set_tensor,
967
+ /* .get_tensor = */ ggml_backend_cuda_split_buffer_get_tensor,
968
+ /* .cpy_tensor = */ NULL,
969
+ /* .clear = */ ggml_backend_cuda_split_buffer_clear,
970
+ /* .reset = */ NULL,
971
+ };
972
+
973
+ // cuda split buffer type
974
+
975
+ static const char * ggml_backend_cuda_split_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
976
+ ggml_backend_cuda_split_buffer_type_context * ctx = (ggml_backend_cuda_split_buffer_type_context *)buft->context;
977
+
978
+ return ctx->name.c_str();
979
+ }
980
+
981
+ static bool ggml_backend_buft_is_cuda_split(ggml_backend_buffer_type_t buft) {
982
+ return buft->iface.get_name == ggml_backend_cuda_split_buffer_type_get_name;
983
+ }
984
+
985
+ static ggml_backend_buffer_t ggml_backend_cuda_split_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
986
+ // since we don't know the exact split after rounding, we cannot allocate the device buffers at this point
987
+ // instead, we allocate them for each tensor separately in init_tensor
988
+ // however, the size still represents the maximum cumulative size of all the device buffers after the tensors are allocated,
989
+ // as returned by get_alloc_size. this limit is enforced during tensor allocation by ggml-alloc, so it must be correct.
990
+ ggml_backend_cuda_split_buffer_context * ctx = new ggml_backend_cuda_split_buffer_context();
991
+
992
+ return ggml_backend_buffer_init(buft, ggml_backend_cuda_split_buffer_interface, ctx, size);
993
+ }
994
+
995
+ static size_t ggml_backend_cuda_split_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
996
+ return 128;
997
+
998
+ GGML_UNUSED(buft);
999
+ }
1000
+
1001
+ static size_t ggml_backend_cuda_split_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
1002
+ ggml_backend_cuda_split_buffer_type_context * ctx = (ggml_backend_cuda_split_buffer_type_context *)buft->context;
1003
+ GGML_ASSERT(ggml_is_contiguous(tensor) && "split buffers only supported for contiguous tensors");
1004
+
1005
+ size_t total_size = 0;
1006
+
1007
+ const int64_t ne0 = tensor->ne[0];
1008
+
1009
+ for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
1010
+ int64_t row_low, row_high;
1011
+ get_row_split(&row_low, &row_high, tensor, ctx->tensor_split, id);
1012
+
1013
+ int64_t nrows_split = row_high - row_low;
1014
+ if (nrows_split == 0) {
1015
+ continue;
1016
+ }
1017
+
1018
+ total_size += ggml_nbytes_split(tensor, nrows_split);
1019
+
1020
+ // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
1021
+ if (ne0 % MATRIX_ROW_PADDING != 0) {
1022
+ total_size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
1023
+ }
1024
+ }
1025
+
1026
+ return total_size;
1027
+ }
1028
+
1029
+ static bool ggml_backend_cuda_split_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
1030
+ return false;
1031
+
1032
+ GGML_UNUSED(buft);
1033
+ }
1034
+
1035
+ static const ggml_backend_buffer_type_i ggml_backend_cuda_split_buffer_type_interface = {
1036
+ /* .get_name = */ ggml_backend_cuda_split_buffer_type_get_name,
1037
+ /* .alloc_buffer = */ ggml_backend_cuda_split_buffer_type_alloc_buffer,
1038
+ /* .get_alignment = */ ggml_backend_cuda_split_buffer_type_get_alignment,
1039
+ /* .get_max_size = */ NULL, // defaults to SIZE_MAX
1040
+ /* .get_alloc_size = */ ggml_backend_cuda_split_buffer_type_get_alloc_size,
1041
+ /* .is_host = */ ggml_backend_cuda_split_buffer_type_is_host,
1042
+ };
1043
+
1044
+ ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(int main_device, const float * tensor_split) {
1045
+ static std::mutex mutex;
1046
+ std::lock_guard<std::mutex> lock(mutex);
1047
+
1048
+ static std::map<std::pair<int, std::array<float, GGML_CUDA_MAX_DEVICES>>, struct ggml_backend_buffer_type> buft_map;
1049
+
1050
+ std::array<float, GGML_CUDA_MAX_DEVICES> tensor_split_arr = {};
1051
+
1052
+ bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + GGML_CUDA_MAX_DEVICES, [](float x) { return x == 0.0f; });
1053
+ if (all_zero) {
1054
+ tensor_split_arr = ggml_cuda_info().default_tensor_split;
1055
+ } else {
1056
+ float split_sum = 0.0f;
1057
+ for (int i = 0; i < ggml_backend_cuda_get_device_count(); ++i) {
1058
+ tensor_split_arr[i] = split_sum;
1059
+ split_sum += tensor_split[i];
1060
+ }
1061
+ for (int i = 0; i < ggml_backend_cuda_get_device_count(); ++i) {
1062
+ tensor_split_arr[i] /= split_sum;
1063
+ }
1064
+ }
1065
+
1066
+ auto it = buft_map.find({main_device, tensor_split_arr});
1067
+ if (it != buft_map.end()) {
1068
+ return &it->second;
1069
+ }
1070
+ auto * ctx = new ggml_backend_cuda_split_buffer_type_context{
1071
+ main_device,
1072
+ tensor_split_arr,
1073
+ GGML_CUDA_NAME + std::to_string(main_device) + "_Split",
1074
+ };
1075
+
1076
+ struct ggml_backend_buffer_type buft {
1077
+ /* .iface = */ ggml_backend_cuda_split_buffer_type_interface,
1078
+ /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cuda_reg(), main_device),
1079
+ /* .context = */ ctx,
1080
+ };
1081
+
1082
+ auto result = buft_map.emplace(std::make_pair(main_device, tensor_split_arr), buft);
1083
+ return &result.first->second;
1084
+ }
1085
+
1086
+ // host buffer type
1087
+
1088
+ static const char * ggml_backend_cuda_host_buffer_type_name(ggml_backend_buffer_type_t buft) {
1089
+ return GGML_CUDA_NAME "_Host";
1090
+
1091
+ GGML_UNUSED(buft);
1092
+ }
1093
+
1094
+ static bool ggml_backend_buft_is_cuda_host(ggml_backend_buffer_type_t buft) {
1095
+ return buft->iface.get_name == ggml_backend_cuda_host_buffer_type_name;
1096
+ }
1097
+
1098
+ static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
1099
+ CUDA_CHECK(cudaFreeHost(buffer->context));
1100
+ }
1101
+
1102
+ static void * ggml_cuda_host_malloc(size_t size) {
1103
+ if (getenv("GGML_CUDA_NO_PINNED") != nullptr) {
1104
+ return nullptr;
1105
+ }
1106
+
1107
+ void * ptr = nullptr;
1108
+ cudaError_t err = cudaMallocHost((void **) &ptr, size);
1109
+ if (err != cudaSuccess) {
1110
+ // clear the error
1111
+ (void)cudaGetLastError();
1112
+ GGML_LOG_DEBUG("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__,
1113
+ size / 1024.0 / 1024.0, cudaGetErrorString(err));
1114
+ return nullptr;
1115
+ }
1116
+
1117
+ return ptr;
1118
+ }
1119
+
1120
+ static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
1121
+ void * ptr = ggml_cuda_host_malloc(size);
1122
+
1123
+ if (ptr == nullptr) {
1124
+ // fallback to cpu buffer
1125
+ return ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size);
1126
+ }
1127
+
1128
+ ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
1129
+ buffer->buft = buft;
1130
+ buffer->iface.free_buffer = ggml_backend_cuda_host_buffer_free_buffer;
1131
+
1132
+ return buffer;
1133
+ }
1134
+
1135
+ ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
1136
+ static struct ggml_backend_buffer_type ggml_backend_cuda_buffer_type_host = {
1137
+ /* .iface = */ {
1138
+ /* .get_name = */ ggml_backend_cuda_host_buffer_type_name,
1139
+ /* .alloc_buffer = */ ggml_backend_cuda_host_buffer_type_alloc_buffer,
1140
+ /* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
1141
+ /* .get_max_size = */ NULL, // defaults to SIZE_MAX
1142
+ /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
1143
+ /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
1144
+ },
1145
+ /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cuda_reg(), 0),
1146
+ /* .context = */ nullptr,
1147
+ };
1148
+
1149
+ return &ggml_backend_cuda_buffer_type_host;
1150
+ }
1151
+
1152
+ //static bool ggml_backend_buffer_is_cuda_host(ggml_backend_buffer_t buffer) {
1153
+ // return buffer->buft->iface.get_name == ggml_backend_cuda_host_buffer_type_name;
1154
+ //}
1155
+
1156
+ /// kernels
1157
+
1158
+ typedef void (*ggml_cuda_op_mul_mat_t)(
1159
+ ggml_backend_cuda_context & ctx,
1160
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
1161
+ const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
1162
+ const int64_t src1_padded_row_size, cudaStream_t stream);
1163
+
1164
+ #ifndef GGML_CUDA_PEER_MAX_BATCH_SIZE
1165
+ #define GGML_CUDA_PEER_MAX_BATCH_SIZE 128
1166
+ #endif // GGML_CUDA_PEER_MAX_BATCH_SIZE
1167
+
1168
+ #define MUL_MAT_SRC1_COL_STRIDE 128
1169
+
1170
+ static cudaError_t ggml_cuda_cpy_tensor_2d(
1171
+ void * dst, const struct ggml_tensor * src, int64_t i3, int64_t i2, int64_t i1_low, int64_t i1_high, cudaStream_t stream) {
1172
+
1173
+ const char * src_ptr = (const char *) src->data;
1174
+ char * dst_ptr = (char *) dst;
1175
+
1176
+ const int64_t ne0 = src->ne[0];
1177
+ const int64_t nb0 = src->nb[0];
1178
+ const int64_t nb1 = src->nb[1];
1179
+ const int64_t nb2 = src->nb[2];
1180
+ const int64_t nb3 = src->nb[3];
1181
+ const enum ggml_type type = src->type;
1182
+ const int64_t ts = ggml_type_size(type);
1183
+ const int64_t bs = ggml_blck_size(type);
1184
+ const int64_t i1_diff = i1_high - i1_low;
1185
+
1186
+ const char * x = src_ptr + i1_low*nb1 + i2*nb2 + i3*nb3;
1187
+ if (nb0 == ts && nb1 == ts*ne0/bs) {
1188
+ return cudaMemcpyAsync(dst_ptr, x, i1_diff*nb1, cudaMemcpyDeviceToDevice, stream);
1189
+ } else if (nb0 == ts) {
1190
+ return cudaMemcpy2DAsync(dst_ptr, ts*ne0/bs, x, nb1, ts*ne0/bs, i1_diff, cudaMemcpyDeviceToDevice, stream);
1191
+ } else {
1192
+ for (int64_t i1 = 0; i1 < i1_diff; i1++) {
1193
+ const void * rx = (const void *) ((const char *) x + i1*nb1);
1194
+ void * rd = (void *) (dst_ptr + i1*ts*ne0/bs);
1195
+ // pretend the row is a matrix with cols=1
1196
+ cudaError_t r = cudaMemcpy2DAsync(rd, ts/bs, rx, nb0, ts/bs, ne0, cudaMemcpyDeviceToDevice, stream);
1197
+ if (r != cudaSuccess) {
1198
+ return r;
1199
+ }
1200
+ }
1201
+ return cudaSuccess;
1202
+ }
1203
+ }
1204
+
1205
+ static void ggml_cuda_op_mul_mat_cublas(
1206
+ ggml_backend_cuda_context & ctx,
1207
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
1208
+ const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
1209
+ const int64_t src1_padded_row_size, cudaStream_t stream) {
1210
+
1211
+ GGML_ASSERT(src0_dd_i != nullptr);
1212
+ GGML_ASSERT(src1_ddf_i != nullptr);
1213
+ GGML_ASSERT(dst_dd_i != nullptr);
1214
+
1215
+ const int64_t ne00 = src0->ne[0];
1216
+ const int64_t ne10 = src1->ne[0];
1217
+
1218
+ const int64_t ne0 = dst->ne[0];
1219
+
1220
+ const int64_t row_diff = row_high - row_low;
1221
+
1222
+ int id = ggml_cuda_get_device();
1223
+
1224
+ // the main device has a larger memory buffer to hold the results from all GPUs
1225
+ // ldc == nrows of the matrix that cuBLAS writes into
1226
+ int64_t ldc = id == ctx.device ? ne0 : row_diff;
1227
+
1228
+ const int cc = ggml_cuda_info().devices[id].cc;
1229
+
1230
+ const bool supports_bf16 = GGML_CUDA_CC_IS_NVIDIA(cc) || GGML_CUDA_CC_IS_AMD(cc) ||
1231
+ (GGML_CUDA_CC_IS_MTHREADS(cc) && cc >= GGML_CUDA_CC_QY2);
1232
+
1233
+ const bool use_fp16 = (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT;
1234
+
1235
+ if (supports_bf16 && src0->type == GGML_TYPE_BF16 && ggml_is_contiguous(src0) && row_diff == src0->ne[1]) {
1236
+ ggml_cuda_pool_alloc<nv_bfloat16> src1_as_bf16(ctx.pool(id));
1237
+ if (src1->type != GGML_TYPE_BF16) {
1238
+ const to_bf16_cuda_t to_bf16_cuda = ggml_get_to_bf16_cuda(src1->type);
1239
+ GGML_ASSERT(to_bf16_cuda != nullptr);
1240
+ size_t ne = src1_ncols*ne10;
1241
+ src1_as_bf16.alloc(ne);
1242
+ to_bf16_cuda(src1_ddf_i, src1_as_bf16.get(), ne, stream);
1243
+ }
1244
+ const nv_bfloat16 * src1_ptr = src1->type == GGML_TYPE_BF16 ? (const nv_bfloat16 *) src1_ddf_i : src1_as_bf16.get();
1245
+ const nv_bfloat16 * src0_ptr = (const nv_bfloat16 *)src0_dd_i;
1246
+ ggml_cuda_pool_alloc<nv_bfloat16> dst_bf16(ctx.pool(id), row_diff*src1_ncols);
1247
+
1248
+ const float alpha_f32 = 1.0f;
1249
+ const float beta_f32 = 0.0f;
1250
+
1251
+ CUBLAS_CHECK(cublasSetStream(ctx.cublas_handle(id), stream));
1252
+ CUBLAS_CHECK(
1253
+ cublasGemmEx(ctx.cublas_handle(id), CUBLAS_OP_T, CUBLAS_OP_N,
1254
+ row_diff, src1_ncols, ne10,
1255
+ &alpha_f32, src0_ptr, CUDA_R_16BF, ne00,
1256
+ src1_ptr, CUDA_R_16BF, ne10,
1257
+ &beta_f32, dst_bf16.get(), CUDA_R_16BF, ldc,
1258
+ CUBLAS_COMPUTE_32F,
1259
+ CUBLAS_GEMM_DEFAULT_TENSOR_OP));
1260
+
1261
+ const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_BF16);
1262
+ to_fp32_cuda(dst_bf16.get(), dst_dd_i, row_diff*src1_ncols, stream);
1263
+ } else if (fast_fp16_hardware_available(cc) && use_fp16) {
1264
+ // convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32
1265
+ ggml_cuda_pool_alloc<half> src0_as_f16(ctx.pool(id));
1266
+ if (src0->type != GGML_TYPE_F16) {
1267
+ const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src0->type);
1268
+ GGML_ASSERT(to_fp16_cuda != nullptr);
1269
+ size_t ne = row_diff*ne00;
1270
+ src0_as_f16.alloc(ne);
1271
+ to_fp16_cuda(src0_dd_i, src0_as_f16.get(), ne, stream);
1272
+ }
1273
+ const half * src0_ptr = src0->type == GGML_TYPE_F16 ? (const half *) src0_dd_i : src0_as_f16.get();
1274
+
1275
+ ggml_cuda_pool_alloc<half> src1_as_f16(ctx.pool(id));
1276
+ if (src1->type != GGML_TYPE_F16) {
1277
+ const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
1278
+ GGML_ASSERT(to_fp16_cuda != nullptr);
1279
+ size_t ne = src1_ncols*ne10;
1280
+ src1_as_f16.alloc(ne);
1281
+ to_fp16_cuda(src1_ddf_i, src1_as_f16.get(), ne, stream);
1282
+ }
1283
+ const half * src1_ptr = src1->type == GGML_TYPE_F16 ? (const half *) src1_ddf_i : src1_as_f16.get();
1284
+
1285
+ CUBLAS_CHECK(cublasSetStream(ctx.cublas_handle(id), stream));
1286
+
1287
+ if (GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA4(cc)) {
1288
+ const float alpha = 1.0f;
1289
+ const float beta = 0.0f;
1290
+ CUBLAS_CHECK(
1291
+ cublasGemmEx(ctx.cublas_handle(id), CUBLAS_OP_T, CUBLAS_OP_N,
1292
+ row_diff, src1_ncols, ne10,
1293
+ &alpha, src0_ptr, CUDA_R_16F, ne00,
1294
+ src1_ptr, CUDA_R_16F, ne10,
1295
+ &beta, dst_dd_i, CUDA_R_32F, ldc,
1296
+ CUBLAS_COMPUTE_32F,
1297
+ CUBLAS_GEMM_DEFAULT_TENSOR_OP));
1298
+ } else {
1299
+ ggml_cuda_pool_alloc<half> dst_f16(ctx.pool(id), row_diff*src1_ncols);
1300
+
1301
+ const half alpha_f16 = 1.0f;
1302
+ const half beta_f16 = 0.0f;
1303
+
1304
+ CUBLAS_CHECK(
1305
+ cublasGemmEx(ctx.cublas_handle(id), CUBLAS_OP_T, CUBLAS_OP_N,
1306
+ row_diff, src1_ncols, ne10,
1307
+ &alpha_f16, src0_ptr, CUDA_R_16F, ne00,
1308
+ src1_ptr, CUDA_R_16F, ne10,
1309
+ &beta_f16, dst_f16.get(), CUDA_R_16F, ldc,
1310
+ CUBLAS_COMPUTE_16F,
1311
+ CUBLAS_GEMM_DEFAULT_TENSOR_OP));
1312
+
1313
+ const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
1314
+ to_fp32_cuda(dst_f16.get(), dst_dd_i, row_diff*src1_ncols, stream);
1315
+ }
1316
+ } else {
1317
+ ggml_cuda_pool_alloc<float> src0_ddq_as_f32(ctx.pool(id));
1318
+ ggml_cuda_pool_alloc<float> src1_ddq_as_f32(ctx.pool(id));
1319
+
1320
+ if (src0->type != GGML_TYPE_F32) {
1321
+ const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type);
1322
+ GGML_ASSERT(to_fp32_cuda != nullptr);
1323
+ src0_ddq_as_f32.alloc(row_diff*ne00);
1324
+ to_fp32_cuda(src0_dd_i, src0_ddq_as_f32.get(), row_diff*ne00, stream);
1325
+ }
1326
+ if (src1->type != GGML_TYPE_F32) {
1327
+ const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src1->type);
1328
+ GGML_ASSERT(to_fp32_cuda != nullptr);
1329
+ src1_ddq_as_f32.alloc(src1_ncols*ne10);
1330
+ to_fp32_cuda(src1_ddf_i, src1_ddq_as_f32.get(), src1_ncols*ne10, stream);
1331
+ }
1332
+
1333
+ const float * src0_ddf_i = src0->type == GGML_TYPE_F32 ? (const float *) src0_dd_i : src0_ddq_as_f32.get();
1334
+ const float * src1_ddf1_i = src1->type == GGML_TYPE_F32 ? (const float *) src1_ddf_i : src1_ddq_as_f32.get();
1335
+
1336
+ const float alpha = 1.0f;
1337
+ const float beta = 0.0f;
1338
+
1339
+ CUBLAS_CHECK(cublasSetStream(ctx.cublas_handle(id), stream));
1340
+ CUBLAS_CHECK(
1341
+ cublasSgemm(ctx.cublas_handle(id), CUBLAS_OP_T, CUBLAS_OP_N,
1342
+ row_diff, src1_ncols, ne10,
1343
+ &alpha, src0_ddf_i, ne00,
1344
+ src1_ddf1_i, ne10,
1345
+ &beta, dst_dd_i, ldc));
1346
+ }
1347
+
1348
+ GGML_UNUSED(dst);
1349
+ GGML_UNUSED(src1_ddq_i);
1350
+ GGML_UNUSED(src1_padded_row_size);
1351
+ }
1352
+
1353
+ static void ggml_cuda_set_peer_access(const int n_tokens, int main_device) {
1354
+ static bool peer_access_enabled = false;
1355
+
1356
+ const bool enable_peer_access = n_tokens <= GGML_CUDA_PEER_MAX_BATCH_SIZE;
1357
+
1358
+ if (peer_access_enabled == enable_peer_access) {
1359
+ return;
1360
+ }
1361
+
1362
+ #ifdef NDEBUG
1363
+ for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
1364
+ ggml_cuda_set_device(id);
1365
+ CUDA_CHECK(cudaDeviceSynchronize());
1366
+ }
1367
+
1368
+ for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
1369
+ ggml_cuda_set_device(id);
1370
+
1371
+ for (int id_other = 0; id_other < ggml_backend_cuda_get_device_count(); ++id_other) {
1372
+ if (id == id_other) {
1373
+ continue;
1374
+ }
1375
+ if (id != main_device && id_other != main_device) {
1376
+ continue;
1377
+ }
1378
+
1379
+ int can_access_peer;
1380
+ CUDA_CHECK(cudaDeviceCanAccessPeer(&can_access_peer, id, id_other));
1381
+ if (can_access_peer) {
1382
+ if (enable_peer_access) {
1383
+ cudaError_t err = cudaDeviceEnablePeerAccess(id_other, 0);
1384
+ if (err != cudaErrorPeerAccessAlreadyEnabled) {
1385
+ CUDA_CHECK(err);
1386
+ } else {
1387
+ // reset the error
1388
+ (void)cudaGetLastError();
1389
+ }
1390
+ } else {
1391
+ cudaError_t err = cudaDeviceDisablePeerAccess(id_other);
1392
+ if (err != cudaErrorPeerAccessNotEnabled) {
1393
+ CUDA_CHECK(err);
1394
+ } else {
1395
+ // reset the error
1396
+ (void)cudaGetLastError();
1397
+ }
1398
+ }
1399
+ }
1400
+ }
1401
+ }
1402
+
1403
+ ggml_cuda_set_device(main_device);
1404
+ #endif // NDEBUG
1405
+
1406
+ peer_access_enabled = enable_peer_access;
1407
+
1408
+ GGML_UNUSED(main_device);
1409
+ }
1410
+
1411
+ static cudaError_t ggml_cuda_Memcpy2DPeerAsync(
1412
+ void * dst, int dstDevice, size_t dpitch, void * src, int srcDevice, size_t spitch, size_t width, size_t height, cudaStream_t stream) {
1413
+
1414
+ #if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
1415
+ // cudaMemcpy2DAsync may fail with copies between vmm pools of different devices
1416
+ cudaMemcpy3DPeerParms p = {};
1417
+ p.dstDevice = dstDevice;
1418
+ p.dstPtr = make_cudaPitchedPtr(dst, dpitch, dpitch, height);
1419
+ p.srcDevice = srcDevice;
1420
+ p.srcPtr = make_cudaPitchedPtr(src, spitch, spitch, height);
1421
+ p.extent = make_cudaExtent(width, height, 1);
1422
+ return cudaMemcpy3DPeerAsync(&p, stream);
1423
+ #else
1424
+ // HIP does not support cudaMemcpy3DPeerAsync or vmm pools
1425
+ GGML_UNUSED(dstDevice);
1426
+ GGML_UNUSED(srcDevice);
1427
+ return cudaMemcpy2DAsync(dst, dpitch, src, spitch, width, height, cudaMemcpyDeviceToDevice, stream);
1428
+ #endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
1429
+ }
1430
+
1431
+ static void ggml_cuda_op_mul_mat(
1432
+ ggml_backend_cuda_context & ctx,
1433
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, ggml_cuda_op_mul_mat_t op,
1434
+ quantize_cuda_t quantize_src1) {
1435
+
1436
+ const int64_t ne00 = src0->ne[0];
1437
+ const int64_t ne01 = src0->ne[1];
1438
+ const int64_t ne02 = src0->ne[2];
1439
+ const int64_t ne03 = src0->ne[3];
1440
+
1441
+ const int64_t ne10 = src1->ne[0];
1442
+ const int64_t ne11 = src1->ne[1];
1443
+ const int64_t ne12 = src1->ne[2];
1444
+ const int64_t ne13 = src1->ne[3];
1445
+ const int64_t nrows1 = ggml_nrows(src1);
1446
+
1447
+ const int64_t ne0 = dst->ne[0];
1448
+ const int64_t ne1 = dst->ne[1];
1449
+
1450
+ // const int64_t nb10 = src1->nb[0];
1451
+ const int64_t nb11 = src1->nb[1];
1452
+ const int64_t nb12 = src1->nb[2];
1453
+ const int64_t nb13 = src1->nb[3];
1454
+
1455
+ const int64_t nb2 = dst->nb[2];
1456
+ const int64_t nb3 = dst->nb[3];
1457
+
1458
+ ggml_backend_cuda_buffer_context * src1_ctx = (ggml_backend_cuda_buffer_context *) src1->buffer->context;
1459
+ ggml_backend_cuda_buffer_context * dst_ctx = (ggml_backend_cuda_buffer_context *) dst->buffer->context;
1460
+
1461
+ GGML_ASSERT(src1->type == GGML_TYPE_F32 || (src1->ne[2] == 1 && src1->ne[3] == 1));
1462
+
1463
+ GGML_ASSERT(ne12 % ne02 == 0);
1464
+ GGML_ASSERT(ne13 % ne03 == 0);
1465
+
1466
+ const int64_t i02_divisor = ne12 / ne02;
1467
+ const int64_t i03_divisor = ne13 / ne03;
1468
+
1469
+ const size_t src0_ts = ggml_type_size(src0->type);
1470
+ const size_t src0_bs = ggml_blck_size(src0->type);
1471
+ const size_t q8_1_ts = sizeof(block_q8_1);
1472
+ const size_t q8_1_bs = QK8_1;
1473
+
1474
+ const bool src0_is_contiguous = ggml_is_contiguous(src0);
1475
+ const bool src1_is_contiguous = ggml_is_contiguous(src1);
1476
+
1477
+ const int64_t src1_padded_col_size = GGML_PAD(ne10, MATRIX_ROW_PADDING);
1478
+
1479
+ const bool split = ggml_backend_buft_is_cuda_split(src0->buffer->buft);
1480
+ GGML_ASSERT(!(split && ne02 > 1));
1481
+ GGML_ASSERT(!(split && ne03 > 1));
1482
+ GGML_ASSERT(!(split && ne02 < ne12));
1483
+ GGML_ASSERT(!(split && ne03 < ne13));
1484
+
1485
+ ggml_tensor_extra_gpu * src0_extra = split ? (ggml_tensor_extra_gpu *) src0->extra : nullptr;
1486
+
1487
+
1488
+ std::array<float, GGML_CUDA_MAX_DEVICES> tensor_split;
1489
+ if (split) {
1490
+ ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) src0->buffer->buft->context;
1491
+ tensor_split = buft_ctx->tensor_split;
1492
+ }
1493
+
1494
+ struct dev_data {
1495
+ int cc;
1496
+
1497
+ ggml_cuda_pool_alloc<char> src0_dd_alloc;
1498
+ ggml_cuda_pool_alloc<float> src1_ddf_alloc;
1499
+ ggml_cuda_pool_alloc<char> src1_ddq_alloc;
1500
+ ggml_cuda_pool_alloc<float> dst_dd_alloc;
1501
+
1502
+ char * src0_dd = nullptr;
1503
+ float * src1_ddf = nullptr; // float
1504
+ char * src1_ddq = nullptr; // q8_1
1505
+ float * dst_dd = nullptr;
1506
+
1507
+ int64_t row_low;
1508
+ int64_t row_high;
1509
+ };
1510
+
1511
+ dev_data dev[GGML_CUDA_MAX_DEVICES];
1512
+
1513
+ int used_devices = 0;
1514
+
1515
+ for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
1516
+ dev[id].cc = ggml_cuda_info().devices[id].cc;
1517
+
1518
+ // by default, use all rows
1519
+ dev[id].row_low = 0;
1520
+ dev[id].row_high = ne01;
1521
+
1522
+ // for multi GPU, get the row boundaries from tensor split
1523
+ // and round to mul_mat_q tile sizes
1524
+ if (split) {
1525
+ const int64_t rounding = get_row_rounding(tensor_split);
1526
+
1527
+ if (id != 0) {
1528
+ dev[id].row_low = ne01*tensor_split[id];
1529
+ if (dev[id].row_low < ne01) {
1530
+ dev[id].row_low -= dev[id].row_low % rounding;
1531
+ }
1532
+ }
1533
+
1534
+ if (id != ggml_backend_cuda_get_device_count() - 1) {
1535
+ dev[id].row_high = ne01*tensor_split[id + 1];
1536
+ if (dev[id].row_high < ne01) {
1537
+ dev[id].row_high -= dev[id].row_high % rounding;
1538
+ }
1539
+ }
1540
+ }
1541
+ }
1542
+
1543
+ for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
1544
+ if ((!split && id != ctx.device) || dev[id].row_low == dev[id].row_high) {
1545
+ continue;
1546
+ }
1547
+
1548
+ used_devices++;
1549
+
1550
+ const bool src1_on_device = id == src1_ctx->device;
1551
+ const bool dst_on_device = id == dst_ctx->device;
1552
+
1553
+ ggml_cuda_set_device(id);
1554
+ cudaStream_t stream = ctx.stream(id, 0);
1555
+
1556
+ if (src0_is_contiguous) {
1557
+ dev[id].src0_dd = split ? (char *) src0_extra->data_device[id] : (char *) src0->data;
1558
+ } else {
1559
+ // If src0 is not contiguous it will be copied to a temporary buffer.
1560
+ // This buffer needs to be cleared entirely because multiple regions will function as padding.
1561
+ const size_t nbytes_data = ggml_nbytes(src0);
1562
+ const size_t nbytes_padding = ggml_row_size(src0->type, MATRIX_ROW_PADDING - ne00 % MATRIX_ROW_PADDING);
1563
+ dev[id].src0_dd = dev[id].src0_dd_alloc.alloc(ctx.pool(id), nbytes_data + nbytes_padding);
1564
+ CUDA_CHECK(cudaMemsetAsync(dev[id].src0_dd, 0, nbytes_data + nbytes_padding, stream));
1565
+ }
1566
+
1567
+ // If src0 is on a temporary compute buffer (partial offloading) there may be some padding that needs to be cleared:
1568
+ if (ne00 % MATRIX_ROW_PADDING != 0 && ggml_is_quantized(src0->type) && ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE && src0->view_src == nullptr) {
1569
+ GGML_ASSERT(ggml_is_contiguously_allocated(src0));
1570
+ GGML_ASSERT(!src0->view_src);
1571
+ const size_t nbytes_data = ggml_row_size(src0->type, (dev[id].row_high - dev[id].row_low)*ne00);
1572
+ const size_t nbytes_padding = ggml_row_size(src0->type, MATRIX_ROW_PADDING - ne00 % MATRIX_ROW_PADDING);
1573
+ CUDA_CHECK(cudaMemsetAsync(dev[id].src0_dd + nbytes_data, 0, nbytes_padding, stream));
1574
+ }
1575
+
1576
+ if (src1_on_device && src1_is_contiguous) {
1577
+ dev[id].src1_ddf = (float *) src1->data;
1578
+ } else {
1579
+ dev[id].src1_ddf = dev[id].src1_ddf_alloc.alloc(ctx.pool(id), ggml_nelements(src1));
1580
+ }
1581
+
1582
+ if (quantize_src1) {
1583
+ size_t src_1_ddq_size = nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs;
1584
+ if (quantize_src1 == quantize_mmq_q8_1_cuda) {
1585
+ src_1_ddq_size += get_mmq_x_max_host(dev[id].cc)*sizeof(block_q8_1_mmq);
1586
+ }
1587
+ dev[id].src1_ddq = dev[id].src1_ddq_alloc.alloc(ctx.pool(id), src_1_ddq_size);
1588
+
1589
+ if (src1_on_device && src1_is_contiguous) {
1590
+ quantize_src1(
1591
+ dev[id].src1_ddf, nullptr, dev[id].src1_ddq, src0->type, ne10,
1592
+ nb11/sizeof(float), nb12/sizeof(float), nb13/sizeof(float),
1593
+ src1_padded_col_size, ne11, ne12, ne13, stream);
1594
+ CUDA_CHECK(cudaGetLastError());
1595
+ }
1596
+ }
1597
+
1598
+ if (dst_on_device) {
1599
+ dev[id].dst_dd = (float *) dst->data;
1600
+ } else {
1601
+ const size_t size_dst_ddf = split ? (dev[id].row_high - dev[id].row_low)*ne1 : ggml_nelements(dst);
1602
+ dev[id].dst_dd = dev[id].dst_dd_alloc.alloc(ctx.pool(id), size_dst_ddf);
1603
+ }
1604
+ }
1605
+
1606
+ // if multiple devices are used they need to wait for the main device
1607
+ // here an event is recorded that signals that the main device has finished calculating the input data
1608
+ if (split && used_devices > 1) {
1609
+ ggml_cuda_set_device(ctx.device);
1610
+ CUDA_CHECK(cudaEventRecord(src0_extra->events[ctx.device][0], ctx.stream()));
1611
+ }
1612
+
1613
+ const int64_t src1_col_stride = split && used_devices > 1 ? MUL_MAT_SRC1_COL_STRIDE : ne11;
1614
+ for (int64_t src1_col_0 = 0; src1_col_0 < ne11; src1_col_0 += src1_col_stride) {
1615
+ const int64_t is = split ? (src1_col_0/src1_col_stride) % GGML_CUDA_MAX_STREAMS : 0;
1616
+ const int64_t src1_ncols = src1_col_0 + src1_col_stride > ne11 ? ne11 - src1_col_0 : src1_col_stride;
1617
+
1618
+ for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
1619
+ if ((!split && id != ctx.device) || dev[id].row_low == dev[id].row_high) {
1620
+ continue;
1621
+ }
1622
+
1623
+ const bool src1_on_device = id == src1_ctx->device;
1624
+ const bool dst_on_device = id == dst_ctx->device;
1625
+ const int64_t row_diff = dev[id].row_high - dev[id].row_low;
1626
+
1627
+ ggml_cuda_set_device(id);
1628
+ cudaStream_t stream = ctx.stream(id, is);
1629
+
1630
+ // wait for main GPU data if necessary
1631
+ if (split && (id != ctx.device || is != 0)) {
1632
+ CUDA_CHECK(cudaStreamWaitEvent(stream, src0_extra->events[ctx.device][0], 0));
1633
+ }
1634
+
1635
+ for (int64_t i0 = 0; i0 < ne13*ne12; ++i0) {
1636
+ const int64_t i03 = i0 / ne12;
1637
+ const int64_t i02 = i0 % ne12;
1638
+
1639
+ size_t src1_ddq_i_offset = i0*ne11 * src1_padded_col_size*q8_1_ts/q8_1_bs;
1640
+ if (quantize_src1 == quantize_mmq_q8_1_cuda) {
1641
+ src1_ddq_i_offset += src1_col_0 * sizeof(block_q8_1_mmq);
1642
+ } else {
1643
+ src1_ddq_i_offset += src1_col_0 * src1_padded_col_size*q8_1_ts/q8_1_bs;
1644
+ }
1645
+
1646
+ // for split tensors the data begins at i0 == i0_offset_low
1647
+ const size_t nbytes_src0_matrix = ne01*ne00*src0_ts / src0_bs;
1648
+ char * src0_dd_i = dev[id].src0_dd + ((i03/i03_divisor)*ne02 + (i02/i02_divisor)) * nbytes_src0_matrix;
1649
+ float * src1_ddf_i = dev[id].src1_ddf + (i0*ne11 + src1_col_0) * ne10;
1650
+ char * src1_ddq_i = dev[id].src1_ddq + src1_ddq_i_offset;
1651
+ float * dst_dd_i = dev[id].dst_dd + (i0*ne1 + src1_col_0) * (dst_on_device ? ne0 : row_diff);
1652
+
1653
+ // the main device memory buffer can be on VRAM scratch, with space for all partial results
1654
+ // in that case an offset on dst_dd_i is needed
1655
+ if (id == ctx.device) {
1656
+ dst_dd_i += dev[id].row_low; // offset is 0 if no tensor split
1657
+ }
1658
+
1659
+ // copy src0, src1 to device if necessary
1660
+ if (src1_is_contiguous) {
1661
+ if (id != ctx.device) {
1662
+ if (quantize_src1) {
1663
+ char * src1_ddq_i_source = dev[ctx.device].src1_ddq + src1_ddq_i_offset;
1664
+ if (quantize_src1 == quantize_mmq_q8_1_cuda) {
1665
+ const size_t pitch = ne11*sizeof(block_q8_1_mmq);
1666
+ const size_t width = src1_ncols*sizeof(block_q8_1_mmq);
1667
+ const size_t height = src1_padded_col_size/(4*QK8_1);
1668
+ CUDA_CHECK(ggml_cuda_Memcpy2DPeerAsync(src1_ddq_i, id, pitch, src1_ddq_i_source, ctx.device, pitch, width, height, stream));
1669
+ } else {
1670
+ CUDA_CHECK(cudaMemcpyPeerAsync(
1671
+ src1_ddq_i, id, src1_ddq_i_source, ctx.device, src1_ncols*src1_padded_col_size*q8_1_ts/q8_1_bs, stream));
1672
+ }
1673
+ } else {
1674
+ float * src1_ddf_i_source = (float *) src1->data;
1675
+ src1_ddf_i_source += (i0*ne11 + src1_col_0) * ne10;
1676
+ CUDA_CHECK(cudaMemcpyPeerAsync(src1_ddf_i, id, src1_ddf_i_source, ctx.device,
1677
+ src1_ncols*ne10*sizeof(float), stream));
1678
+ }
1679
+ }
1680
+ } else if (src1_on_device && !src1_is_contiguous) {
1681
+ CUDA_CHECK(ggml_cuda_cpy_tensor_2d(
1682
+ src1_ddf_i, src1, i03, i02, src1_col_0, src1_col_0+src1_ncols, stream));
1683
+ } else {
1684
+ GGML_ABORT("fatal error");
1685
+ }
1686
+
1687
+ if (quantize_src1 && !src1_is_contiguous) {
1688
+ quantize_src1(
1689
+ src1_ddf_i, nullptr, src1_ddq_i, src0->type, ne10, ne10, ne11*ne10, ne12*ne11*ne10,
1690
+ src1_padded_col_size, src1_ncols, 1, 1, stream);
1691
+ CUDA_CHECK(cudaGetLastError());
1692
+ }
1693
+
1694
+ if (src1_col_0 == 0 && !src0_is_contiguous && i03 % i03_divisor == 0 && i02 % i02_divisor == 0) {
1695
+ CUDA_CHECK(ggml_cuda_cpy_tensor_2d(
1696
+ src0_dd_i, src0, i03/i03_divisor, i02/i02_divisor, dev[id].row_low, dev[id].row_high, stream));
1697
+ }
1698
+
1699
+ // do the computation
1700
+ op(ctx, src0, src1, dst, src0_dd_i, src1_ddf_i, src1_ddq_i, dst_dd_i,
1701
+ dev[id].row_low, dev[id].row_high, src1_ncols, src1_padded_col_size, stream);
1702
+ CUDA_CHECK(cudaGetLastError());
1703
+
1704
+ // copy dst to host or other device if necessary
1705
+ if (!dst_on_device) {
1706
+ void * dst_off_device = dst->data;
1707
+ if (split) {
1708
+ // src0 = weight matrix is saved as a transposed matrix for better memory layout.
1709
+ // dst is NOT transposed.
1710
+ // The outputs of matrix matrix multiplications can therefore NOT simply be concatenated for >1 GPU.
1711
+ // Instead they need to be copied to the correct slice in ne0 = dst row index.
1712
+ // If dst is a vector with ne0 == 1 then you don't have to do this but it still produces correct results.
1713
+ float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
1714
+ GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
1715
+ dhf_dst_i += src1_col_0*ne0 + dev[id].row_low;
1716
+ CUDA_CHECK(ggml_cuda_Memcpy2DPeerAsync(
1717
+ dhf_dst_i, ctx.device, ne0*sizeof(float), dst_dd_i, id, row_diff*sizeof(float), row_diff*sizeof(float), src1_ncols, stream));
1718
+ } else {
1719
+ float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
1720
+ GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
1721
+ dhf_dst_i += src1_col_0*ne0;
1722
+ CUDA_CHECK(cudaMemcpyAsync(dhf_dst_i, dst_dd_i, src1_ncols*ne0*sizeof(float), cudaMemcpyDeviceToDevice, stream));
1723
+ }
1724
+ }
1725
+
1726
+ // add event for the main device to wait on until other device is done
1727
+ if (split && (id != ctx.device || is != 0)) {
1728
+ CUDA_CHECK(cudaEventRecord(src0_extra->events[id][is], stream));
1729
+ }
1730
+ }
1731
+ }
1732
+ }
1733
+
1734
+ // main device waits for all other devices to be finished
1735
+ if (split && ggml_backend_cuda_get_device_count() > 1) {
1736
+ int64_t is_max = (ne11 + MUL_MAT_SRC1_COL_STRIDE - 1) / MUL_MAT_SRC1_COL_STRIDE;
1737
+ is_max = is_max <= GGML_CUDA_MAX_STREAMS ? is_max : GGML_CUDA_MAX_STREAMS;
1738
+
1739
+ ggml_cuda_set_device(ctx.device);
1740
+ for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
1741
+ if (dev[id].row_low == dev[id].row_high) {
1742
+ continue;
1743
+ }
1744
+ for (int64_t is = 0; is < is_max; ++is) {
1745
+ CUDA_CHECK(cudaStreamWaitEvent(ctx.stream(), src0_extra->events[id][is], 0));
1746
+ }
1747
+ }
1748
+ }
1749
+ }
1750
+
1751
+ static __global__ void k_compute_batched_ptrs( // kernel: each thread handles one (i12, i13) batch index and stores the addresses of that batch's src0, src1 and dst matrices
1752
+ const void * src0_as_f16, const void * src1_as_f16, char * dst, // base addresses of the three tensors; only used for byte-offset arithmetic here
1753
+ const void ** ptrs_src, void ** ptrs_dst, // output tables: ptrs_src is written at [0, ne23) for src0 and [ne23, 2*ne23) for src1, ptrs_dst at [0, ne23)
1754
+ int64_t ne12, int64_t ne13, // number of batches along dims 2 and 3
1755
+ int64_t ne23, // offset of the src1 table inside ptrs_src
1756
+ size_t nb02, size_t nb03, // byte strides of src0 along dims 2 and 3
1757
+ size_t nb12, size_t nb13, // byte strides of src1 along dims 2 and 3
1758
+ size_t nbd2, size_t nbd3, // byte strides of dst along dims 2 and 3
1759
+ int64_t r2, int64_t r3) { // broadcast ratios used to map a src1 batch index to a src0 batch index
1760
+ const int64_t i13 = blockIdx.x * blockDim.x + threadIdx.x; // the x dimension of the launch indexes dim 3
1761
+ const int64_t i12 = blockIdx.y * blockDim.y + threadIdx.y; // the y dimension of the launch indexes dim 2
1762
+
1763
+ if (i13 >= ne13 || i12 >= ne12) { // threads outside the batch range write nothing
1764
+ return;
1765
+ }
1766
+
1767
+ const int64_t i03 = i13 / r3; // r3 consecutive src1 batches share one src0 batch along dim 3
1768
+ const int64_t i02 = i12 / r2; // r2 consecutive src1 batches share one src0 batch along dim 2
1769
+
1770
+ ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_as_f16 + i02*nb02 + i03*nb03; // table entries are ordered with i12 varying fastest
1771
+ ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_as_f16 + i12*nb12 + i13*nb13;
1772
+ ptrs_dst[0*ne23 + i12 + i13*ne12] = ( char *) dst + i12*nbd2 + i13*nbd3;
1773
+ }
1774
+
1775
+ // Type traits for mapping ggml types to CUDA/cuBLAS types: element type, cuBLAS compute/data type enums, alpha/beta scalars and the converter for non-contiguous sources
1776
+ template<ggml_type T>
1777
+ struct batched_mul_mat_traits; // primary template is declared only; each supported ggml_type needs an explicit specialization
1778
+
1779
+ template<>
1780
+ struct batched_mul_mat_traits<GGML_TYPE_F32> { // traits for F32 sources: float elements, F32 compute and data types
1781
+ using cuda_type = float; // element type used for source pointers and temporary buffers
1782
+ static inline const cublasComputeType_t compute_type = CUBLAS_COMPUTE_32F; // default cuBLAS compute type
1783
+ static inline const cudaDataType_t data_type = CUDA_R_32F; // cuBLAS data type enum matching cuda_type
1784
+ static inline const ggml_type ggml_type_val = GGML_TYPE_F32; // the ggml type this specialization stands for
1785
+ static inline const float alpha = 1.0f; // scale applied to the matrix product
1786
+ static inline const float beta = 0.0f; // scale applied to the previous contents of the output
1787
+ static inline const void* get_alpha() { static const float val = alpha; return &val; } // address of a function-local static copy, usable as an untyped scalar pointer
1788
+ static inline const void* get_beta() { static const float val = beta; return &val; } // same as get_alpha, for beta
1789
+ static inline auto get_nc_converter(ggml_type src_type) { return ggml_get_to_fp32_nc_cuda(src_type); } // forwards to the F32 converter lookup for src_type
1790
+ };
1791
+
1792
+ template<>
1793
+ struct batched_mul_mat_traits<GGML_TYPE_BF16> { // traits for BF16 sources: nv_bfloat16 elements with F32 compute type
1794
+ using cuda_type = nv_bfloat16; // element type used for source pointers and temporary buffers
1795
+ static inline const cublasComputeType_t compute_type = CUBLAS_COMPUTE_32F; // BF16 data is combined with the F32 compute type
1796
+ static inline const cudaDataType_t data_type = CUDA_R_16BF; // cuBLAS data type enum matching cuda_type
1797
+ static inline const ggml_type ggml_type_val = GGML_TYPE_BF16; // the ggml type this specialization stands for
1798
+ static inline const float alpha = 1.0f; // scalars are float, in line with the F32 compute type
1799
+ static inline const float beta = 0.0f;
1800
+ static inline const void* get_alpha() { static const float val = alpha; return &val; } // address of a function-local static copy, usable as an untyped scalar pointer
1801
+ static inline const void* get_beta() { static const float val = beta; return &val; } // same as get_alpha, for beta
1802
+ static inline auto get_nc_converter(ggml_type src_type) { return ggml_get_to_bf16_nc_cuda(src_type); } // forwards to the BF16 converter lookup for src_type
1803
+ };
1804
+
1805
+ template<>
1806
+ struct batched_mul_mat_traits<GGML_TYPE_F16> { // traits for F16 sources: half elements with F16 compute type
1807
+ using cuda_type = half; // element type used for source pointers and temporary buffers
1808
+ static inline const cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; // unlike the F32 and BF16 specializations, the default compute type is F16
1809
+ static inline const cudaDataType_t data_type = CUDA_R_16F; // cuBLAS data type enum matching cuda_type
1810
+ static inline const ggml_type ggml_type_val = GGML_TYPE_F16; // the ggml type this specialization stands for
1811
+ static inline const half alpha = 1.0; // scalars are half, in line with the F16 compute type
1812
+ static inline const half beta = 0.0;
1813
+ static inline const void* get_alpha() { static const half val = alpha; return &val; } // address of a function-local static copy, usable as an untyped scalar pointer
1814
+ static inline const void* get_beta() { static const half val = beta; return &val; } // same as get_alpha, for beta
1815
+ static inline auto get_nc_converter(ggml_type src_type) { return ggml_get_to_fp16_nc_cuda(src_type); } // forwards to the F16 converter lookup for src_type
1816
+ };
1817
+
1818
+ template<ggml_type src0_type> // src0_type selects the batched_mul_mat_traits specialization
1819
+ static void ggml_cuda_mul_mat_batched_cublas_impl(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { // batched matrix multiplication of src0 and src1 over dims 2 and 3 through cuBLAS; the result ends up in dst as F32
1820
+ using traits = batched_mul_mat_traits<src0_type>;
1821
+ using cuda_t = typename traits::cuda_type; // element type of src0 and of the (possibly converted) src1
1822
+
1823
+ GGML_ASSERT(!ggml_is_transposed(src0));
1824
+ GGML_ASSERT(!ggml_is_transposed(src1));
1825
+ GGML_ASSERT(!ggml_backend_buft_is_cuda_split(src0->buffer->buft)); // split buffers are not handled by this path
1826
+ GGML_ASSERT(src0->type == src0_type); // the runtime type must agree with the template argument
1827
+ GGML_ASSERT(ggml_is_contiguous(dst));
1828
+
1829
+ // Byte offsets and tensor dimensions are currently used in an inconsistent way for dst.
1830
+ // As long as dst is contiguous this does not matter though.
1831
+
1832
+ GGML_TENSOR_BINARY_OP_LOCALS // presumably declares the ne*/nb* locals used below from src0, src1 and dst; the macro is defined outside this chunk
1833
+
1834
+ const int64_t ne_dst = ggml_nelements(dst);
1835
+ cudaStream_t main_stream = ctx.stream();
1836
+ CUBLAS_CHECK(cublasSetStream(ctx.cublas_handle(), main_stream)); // cuBLAS calls below are issued on the same stream as the kernels
1837
+
1838
+ float * dst_ddf = (float *) dst->data; // final F32 destination
1839
+ const size_t ts_src1 = ggml_type_size(src1->type);
1840
+ GGML_ASSERT(nb10 == ts_src1); // src1 elements must be densely packed along dim 0
1841
+ int64_t s11 = nb11 / ts_src1; // src1 strides expressed in elements instead of bytes
1842
+ int64_t s12 = nb12 / ts_src1;
1843
+ int64_t s13 = nb13 / ts_src1;
1844
+
1845
+ const cuda_t * src0_ptr = nullptr;
1846
+ const cuda_t * src1_ptr = nullptr;
1847
+
1848
+ ggml_cuda_pool_alloc<cuda_t> src0_alloc(ctx.pool()); // NOTE(review): src0_alloc is never used in this function
1849
+ ggml_cuda_pool_alloc<cuda_t> src1_alloc(ctx.pool()); // holds the converted copy of src1 when a conversion is needed
1850
+
1851
+ // Handle src0
1852
+ src0_ptr = (const cuda_t *) src0->data; // src0 is used in place; its type was asserted above
1853
+
1854
+ // Handle src1 - convert if necessary
1855
+ if (src1->type == src0_type) {
1856
+ src1_ptr = (const cuda_t *) src1->data; // same type: use src1 in place with its original strides
1857
+ } else {
1858
+ // Convert src1 to target type using traits conversion functions
1859
+ const int64_t ne_src1 = ggml_nelements(src1);
1860
+ src1_alloc.alloc(ne_src1);
1861
+
1862
+ const auto convert_func = traits::get_nc_converter(src1->type);
1863
+ GGML_ASSERT(convert_func != nullptr); // no converter available for this src1 type
1864
+ convert_func(src1->data, src1_alloc.get(), ne10, ne11, ne12, ne13, s11, s12, s13, main_stream); // s11/s12/s13 still describe the original src1 layout at this point
1865
+ src1_ptr = src1_alloc.get();
1866
+ s11 = ne10; // from here on the strides are those of a densely packed tensor, presumably matching the converter's output layout
1867
+ s12 = ne11*s11;
1868
+ s13 = ne12*s12;
1869
+ }
1870
+
1871
+ // Setup destination buffer
1872
+ ggml_cuda_pool_alloc<cuda_t> dst_temp(ctx.pool()); // only allocated when the GEMM output is not written directly as F32
1873
+ char * dst_t; // address the GEMM writes to: either dst itself or dst_temp
1874
+ size_t nbd2 = dst->nb[2];
1875
+ size_t nbd3 = dst->nb[3];
1876
+
1877
+ cublasComputeType_t cu_compute_type = traits::compute_type;
1878
+ cudaDataType_t cu_data_type = traits::data_type; // data type of the GEMM output
1879
+ cudaDataType_t cu_data_type_a = traits::data_type; // data type of src0
1880
+ cudaDataType_t cu_data_type_b = traits::data_type; // data type of src1
1881
+ const void * alpha = traits::get_alpha();
1882
+ const void * beta = traits::get_beta();
1883
+ const float alpha_f32 = 1.0f; // F32 scalars used whenever the compute type is switched to F32 below
1884
+ const float beta_f32 = 0.0f;
1885
+
1886
+ if (dst->op_params[0] == GGML_PREC_DEFAULT) {
1887
+ if constexpr (src0_type == GGML_TYPE_F32) {
1888
+ dst_t = (char *) dst_ddf; // Direct F32 output
1889
+ } else {
1890
+ dst_t = (char *) dst_temp.alloc(ne_dst); // GEMM writes cuda_t values into a temporary that is converted to F32 at the end
1891
+ nbd2 /= sizeof(float) / sizeof(cuda_t); // rescale the byte strides of dst from float-sized to cuda_t-sized elements
1892
+ nbd3 /= sizeof(float) / sizeof(cuda_t);
1893
+ }
1894
+ } else { // any other precision setting: compute in F32 and write F32 results straight into dst
1895
+ dst_t = (char *) dst_ddf;
1896
+ cu_compute_type = CUBLAS_COMPUTE_32F;
1897
+ cu_data_type = CUDA_R_32F;
1898
+ alpha = &alpha_f32;
1899
+ beta = &beta_f32;
1900
+ }
1901
+
1902
+ int id = ggml_cuda_get_device();
1903
+ const int cc = ggml_cuda_info().devices[id].cc;
1904
+ if (GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA4(cc)) { // on these architectures the compute type is forced to F32; the output data type is left unchanged
1905
+ cu_compute_type = CUBLAS_COMPUTE_32F;
1906
+ alpha = &alpha_f32;
1907
+ beta = &beta_f32;
1908
+ }
1909
+
1910
+ GGML_ASSERT(ne12 % ne02 == 0); // src1 batch counts must be whole multiples of the src0 batch counts
1911
+ GGML_ASSERT(ne13 % ne03 == 0);
1912
+
1913
+ // broadcast factors
1914
+ const int64_t r2 = ne12/ne02;
1915
+ const int64_t r3 = ne13/ne03;
1916
+
1917
+ if (r2 == 1 && r3 == 1 && ggml_is_contiguous_2(src0) && ggml_is_contiguous_2(src1)) {
1918
+ // there is no broadcast and src0, src1 are contiguous across dims 2, 3
1919
+ // use cublasGemmStridedBatchedEx
1920
+ CUBLAS_CHECK(
1921
+ cublasGemmStridedBatchedEx(ctx.cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_N, // src0 is passed transposed, src1 as is
1922
+ ne01, ne11, ne10, // m, n, k
1923
+ alpha, src0_ptr, cu_data_type_a, nb01/nb00, nb02/nb00, // strideA
1924
+ src1_ptr, cu_data_type_b, s11, s12, // strideB
1925
+ beta, dst_t, cu_data_type, ne0, ne1*ne0, // strideC
1926
+ ne12*ne13, // batch count
1927
+ cu_compute_type,
1928
+ CUBLAS_GEMM_DEFAULT_TENSOR_OP));
1929
+ } else {
1930
+ // use cublasGemmBatchedEx
1931
+ const int64_t ne23 = ne12*ne13; // total number of batches
1932
+
1933
+ ggml_cuda_pool_alloc<const void *> ptrs_src(ctx.pool(), 2*ne23); // src0 pointer table followed by the src1 pointer table
1934
+ ggml_cuda_pool_alloc< void *> ptrs_dst(ctx.pool(), 1*ne23);
1935
+
1936
+ size_t src1_stride_size = sizeof(cuda_t); // element size used to turn the converted src1 strides back into bytes
1937
+
1938
+ dim3 block_dims(ne13, ne12); // NOTE(review): one block of ne13 x ne12 threads; presumably the launch fails once ne12*ne13 exceeds the per-block thread limit - confirm; the kernel's index math already accounts for blockIdx
1939
+ k_compute_batched_ptrs<<<1, block_dims, 0, main_stream>>>(
1940
+ src0_ptr, src1_ptr, dst_t,
1941
+ ptrs_src.get(), ptrs_dst.get(),
1942
+ ne12, ne13,
1943
+ ne23,
1944
+ nb02, nb03,
1945
+ (src1->type == src0_type) ? nb12 : s12*src1_stride_size, // byte stride of src1 along dim 2: original, or that of the converted copy
1946
+ (src1->type == src0_type) ? nb13 : s13*src1_stride_size, // byte stride of src1 along dim 3, same rule
1947
+ nbd2, nbd3,
1948
+ r2, r3);
1949
+
1950
+ CUDA_CHECK(cudaGetLastError()); // reports launch errors from the kernel above
1951
+
1952
+ CUBLAS_CHECK(
1953
+ cublasGemmBatchedEx(ctx.cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_N, // same operand layout as the strided call, with per-batch pointers instead of strides
1954
+ ne01, ne11, ne10,
1955
+ alpha, (const void **) (ptrs_src.get() + 0*ne23), cu_data_type_a, nb01/nb00,
1956
+ (const void **) (ptrs_src.get() + 1*ne23), cu_data_type_b, s11,
1957
+ beta, ( void **) (ptrs_dst.get() + 0*ne23), cu_data_type, ne0,
1958
+ ne23,
1959
+ cu_compute_type,
1960
+ CUBLAS_GEMM_DEFAULT_TENSOR_OP));
1961
+ }
1962
+
1963
+ // Convert output back to F32 if needed
1964
+ if (dst->op_params[0] == GGML_PREC_DEFAULT && cu_data_type != CUDA_R_32F) { // true exactly when the GEMM wrote into dst_temp
1965
+ const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(traits::ggml_type_val);
1966
+ to_fp32_cuda(dst_temp.get(), dst_ddf, ne_dst, main_stream);
1967
+ }
1968
+ }
1969
+
1970
+ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { // runtime-to-template dispatch: forwards to the implementation instantiated for src0->type
1971
+ GGML_ASSERT(src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16 || src0->type == GGML_TYPE_F32); // only these three source types are handled
1972
+
1973
+ switch (src0->type) {
1974
+ case GGML_TYPE_F32:
1975
+ ggml_cuda_mul_mat_batched_cublas_impl<GGML_TYPE_F32>(ctx, src0, src1, dst);
1976
+ break;
1977
+ case GGML_TYPE_BF16:
1978
+ ggml_cuda_mul_mat_batched_cublas_impl<GGML_TYPE_BF16>(ctx, src0, src1, dst);
1979
+ break;
1980
+ case GGML_TYPE_F16:
1981
+ ggml_cuda_mul_mat_batched_cublas_impl<GGML_TYPE_F16>(ctx, src0, src1, dst);
1982
+ break;
1983
+ default: // excluded by the assertion above; kept as a second guard
1984
+ GGML_ABORT("Unsupported type");
1985
+ }
1986
+ }
1987
+
1988
+ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { // chooses the matrix multiplication implementation from tensor types, shapes, buffer kind and device capabilities, then runs it
1989
+ const bool split = ggml_backend_buft_is_cuda_split(src0->buffer->buft); // src0 is stored in a CUDA split buffer type
1990
+
1991
+ // If src0 is a temporary compute buffer it may have some padding that needs to be cleared for mul_mat_vec_q or mul_mat_q.
1992
+ // But if src0 is also a view of another tensor then this cannot be done safely because it may overwrite valid tensor data.
1993
+ // Therefore, in such cases use cuBLAS.
1994
+ const bool bad_padding_clear = ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE
1995
+ && ggml_nbytes(src0) != ggml_backend_buffer_get_alloc_size(src0->buffer, src0) && src0->view_src; // compute buffer, allocation size differs from the data size, and src0 is a view
1996
+
1997
+ bool use_mul_mat_vec = (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16)
1998
+ && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32; // candidate: float-type src0 with F32 src1 and dst
1999
+ bool use_mul_mat_vec_q = ggml_is_quantized(src0->type) && !bad_padding_clear
2000
+ && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
2001
+ && src1->ne[1] <= MMVQ_MAX_BATCH_SIZE; // candidate: quantized src0 and src1->ne[1] within the vector kernel's batch limit
2002
+ bool use_mul_mat_q = ggml_is_quantized(src0->type) && !bad_padding_clear
2003
+ && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32; // candidate: quantized src0 with no limit on src1->ne[1]
2004
+
2005
+ bool any_gpus_with_slow_fp16 = false;
2006
+
2007
+ if (split) {
2008
+ ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) src0->buffer->buft->context;
2009
+ auto & tensor_split = buft_ctx->tensor_split;
2010
+ for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
2011
+ // skip devices that are not going to do any work:
2012
+ if (tensor_split[id] >= (id + 1 < ggml_backend_cuda_get_device_count() ? tensor_split[id + 1] : 1.0f)) { // the device's range is empty when its start is not below the next device's start (1.0 for the last device)
2013
+ continue;
2014
+ }
2015
+
2016
+ const int cc = ggml_cuda_info().devices[id].cc;
2017
+ use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]); // a kernel stays eligible only if every participating device accepts it
2018
+ use_mul_mat_vec = use_mul_mat_vec && ggml_cuda_should_use_mmv(src0->type, cc, src0->ne, src1->ne[1]);
2019
+ any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_hardware_available(cc);
2020
+ }
2021
+ } else { // not split: only the context's own device is consulted
2022
+ const int cc = ggml_cuda_info().devices[ctx.device].cc;
2023
+ use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
2024
+ use_mul_mat_vec = use_mul_mat_vec && ggml_cuda_should_use_mmv(src0->type, cc, src0->ne, src1->ne[1]);
2025
+ any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_hardware_available(cc);
2026
+ }
2027
+
2028
+ // debug helpers
2029
+ //printf("src0: %8d %8d %8d %8d\n", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]);
2030
+ //printf(" %8d %8d %8d %8d\n", src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3]);
2031
+ //printf("src1: %8d %8d %8d %8d\n", src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3]);
2032
+ //printf(" %8d %8d %8d %8d\n", src1->nb[0], src1->nb[1], src1->nb[2], src1->nb[3]);
2033
+ //printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
2034
+ //printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);
2035
+
2036
+ //TODO update for generic tensor parallelism
2037
+ const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc; // capability of the currently active device, not of every device inspected above
2038
+ bool use_batched_cublas_f16 = src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || !any_gpus_with_slow_fp16); // F16 src0 with F32 src1 requires that no inspected device has slow FP16
2039
+ bool use_batched_cublas_bf16 = src0->type == GGML_TYPE_BF16 && bf16_mma_hardware_available(cc);
2040
+ bool use_batched_cublas_f32 = src0->type == GGML_TYPE_F32;
2041
+
2042
+ if (!split && use_mul_mat_vec) { // the first four branches are direct single-device paths, tried in priority order
2043
+ // the custom F16 vector kernel can be used over batched cuBLAS GEMM
2044
+ // but this is only faster for GPUs without tensor cores or with a thin src0 matrix (particularly KQV in attention)
2045
+ ggml_cuda_mul_mat_vec(ctx, src0, src1, nullptr, dst);
2046
+ } else if (!split && use_mul_mat_vec_q) {
2047
+ ggml_cuda_mul_mat_vec_q(ctx, src0, src1, nullptr, dst);
2048
+ } else if (!split && use_mul_mat_q) {
2049
+ ggml_cuda_mul_mat_q(ctx, src0, src1, nullptr, dst);
2050
+ } else if (!split && (use_batched_cublas_f16 || use_batched_cublas_bf16 || use_batched_cublas_f32)
2051
+ && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) { // batched cuBLAS needs more than one batch and non-transposed inputs
2052
+ // general KQ + KQV multi-batch without FlashAttention
2053
+ ggml_cuda_mul_mat_batched_cublas(ctx, src0, src1, dst);
2054
+ } else if (use_mul_mat_vec) { // the remaining branches go through ggml_cuda_op_mul_mat with the matching per-device op
2055
+ ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_vec, nullptr);
2056
+ } else if (use_mul_mat_vec_q) {
2057
+ ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, quantize_row_q8_1_cuda);
2058
+ } else if (use_mul_mat_q) {
2059
+ ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_q, quantize_mmq_q8_1_cuda);
2060
+ } else { // fallback when no custom kernel is eligible
2061
+ ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_cublas, nullptr);
2062
+ }
2063
+ }
2064
+
2065
+ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { // matrix multiplication where dst->src[2] selects which src0 matrix each src1 row uses; falls back to a host-side grouping by matrix index when no dedicated kernel applies
2066
+ const ggml_tensor * src0 = dst->src[0]; // stack of matrices indexed along dim 2
2067
+ const ggml_tensor * src1 = dst->src[1]; // input rows
2068
+ const ggml_tensor * ids = dst->src[2]; // int32 matrix indices, read below through ids->nb[0] and ids->nb[1]
2069
+
2070
+ GGML_ASSERT(src1->type == GGML_TYPE_F32);
2071
+ GGML_ASSERT(dst->type == GGML_TYPE_F32);
2072
+ GGML_ASSERT(!ggml_backend_buft_is_cuda_split(src0->buffer->buft) && "mul_mat_id does not support split buffers");
2073
+
2074
+ GGML_TENSOR_BINARY_OP_LOCALS // presumably declares the ne*/nb* locals used below from src0, src1 and dst; the macro is defined outside this chunk
2075
+
2076
+ const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
2077
+
2078
+ if (src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { // NOTE(review): both conditions are already asserted above, so this check looks redundant
2079
+ if (ne2 == 1) { // dst has a single slice along dim 2: the vector kernels take the ids directly
2080
+ if (ggml_is_quantized(src0->type)) {
2081
+ ggml_cuda_mul_mat_vec_q(ctx, src0, src1, ids, dst);
2082
+ } else {
2083
+ ggml_cuda_mul_mat_vec(ctx, src0, src1, ids, dst);
2084
+ }
2085
+ return;
2086
+ }
2087
+
2088
+ if (ggml_cuda_should_use_mmq(src0->type, cc, ne12)) { // the quantized matrix kernel also takes the ids directly
2089
+ ggml_cuda_mul_mat_q(ctx, src0, src1, ids, dst);
2090
+ return;
2091
+ }
2092
+ }
2093
+
2094
+ cudaStream_t stream = ctx.stream(); // generic path from here on
2095
+
2096
+ GGML_ASSERT(nb12 % nb11 == 0);
2097
+ GGML_ASSERT(nb2 % nb1 == 0);
2098
+
2099
+ const ggml_type type_src1_sorted = (src0->type == GGML_TYPE_F16 && !fast_fp16_hardware_available(cc))
2100
+ || ggml_is_quantized(src0->type) ? GGML_TYPE_F32 : src0->type; // gathered src1 rows are F32 for quantized src0 or F16 src0 without fast FP16, otherwise they take src0's type
2101
+ const ggml_type type_dst_sorted = GGML_TYPE_F32;
2102
+ const size_t ts_src1_sorted = ggml_type_size(type_src1_sorted);
2103
+ const size_t ts_dst_sorted = ggml_type_size(type_dst_sorted);
2104
+
2105
+ const int64_t n_expert_used = ids->ne[0]; // number of matrix indices per token
2106
+ const int64_t ne_get_rows = ne12 * n_expert_used; // total number of (token, slot) pairs
2107
+
2108
+ std::vector<int32_t> ids_to_sorted_host; // for each position in sorted order, the src1 row to gather
2109
+ ids_to_sorted_host.reserve(2*ne_get_rows); // twice the size because ids_from_sorted_host is appended to it further down
2110
+ std::vector<int32_t> ids_from_sorted_host(ne_get_rows); // for each (token, slot) pair, its position in sorted order
2111
+
2112
+ ggml_cuda_pool_alloc<int32_t> ids_buf_dev(ctx.pool(), 2*ne_get_rows); // device copy of both index tables, back to back
2113
+
2114
+ std::vector<int32_t> tokens_per_expert(ne02); // value-initialized to zero, counted in the loop below
2115
+
2116
+ ggml_cuda_pool_alloc<char> src1_sorted(ctx.pool(), ne12*n_expert_used*ne10*ts_src1_sorted); // gathered src1 rows, grouped by matrix index
2117
+ ggml_cuda_pool_alloc<char> dst_sorted(ctx.pool(), ne2 *n_expert_used* ne0*ts_dst_sorted); // results in the same grouped order
2118
+
2119
+ std::vector<char> ids_host(ggml_nbytes(ids));
2120
+ CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids->data, ggml_nbytes(ids), cudaMemcpyDeviceToHost, stream));
2121
+ CUDA_CHECK(cudaStreamSynchronize(stream)); // the host loop below reads ids_host
2122
+
2123
+ for (int64_t i02 = 0; i02 < ne02; ++i02) { // expert matrices
2124
+ for (int64_t i12 = 0; i12 < ne12; ++i12) { // tokens
2125
+ for (int64_t iex = 0; iex < n_expert_used; ++iex) {
2126
+ const int32_t expert_to_use = *(const int32_t *)(ids_host.data() + i12*ids->nb[1] + iex*ids->nb[0]);
2127
+ assert(expert_to_use >= 0 && expert_to_use < ne02); // plain assert: not evaluated when NDEBUG is defined
2128
+ if (expert_to_use == i02) {
2129
+ ids_from_sorted_host[i12*n_expert_used + iex] = ids_to_sorted_host.size(); // position this pair will occupy in sorted order
2130
+ ids_to_sorted_host.push_back(i12*ne11 + iex % ne11); // src1 row index; the slot index wraps modulo ne11
2131
+ tokens_per_expert[i02]++;
2132
+ break; // at most one slot per token is taken for a given matrix index
2133
+ }
2134
+ }
2135
+ }
2136
+ }
2137
+ GGML_ASSERT(ids_to_sorted_host.size() == size_t(ne_get_rows)); // every (token, slot) pair must have been placed
2138
+
2139
+ ids_to_sorted_host.insert(ids_to_sorted_host.end(), ids_from_sorted_host.begin(), ids_from_sorted_host.end()); // one host buffer, so a single upload covers both tables
2140
+
2141
+ CUDA_CHECK(cudaMemcpyAsync(ids_buf_dev.ptr, ids_to_sorted_host.data(), 2*ne_get_rows*sizeof(int32_t), cudaMemcpyHostToDevice, stream));
2142
+ CUDA_CHECK(cudaStreamSynchronize(stream)); // wait for the upload to finish before continuing
2143
+
2144
+ const int32_t * ids_to_sorted = ids_buf_dev.ptr + 0*ne_get_rows; // first half of the device buffer
2145
+ const int32_t * ids_from_sorted = ids_buf_dev.ptr + 1*ne_get_rows; // second half of the device buffer
2146
+
2147
+ get_rows_cuda(src1->data, src1->type, ids_to_sorted, src1_sorted.ptr, type_src1_sorted, // gather src1 rows into grouped order, converting to type_src1_sorted
2148
+ ne10, nb11, nb12, nb13,
2149
+ ne_get_rows, 1, 1, sizeof(int32_t), ne_get_rows*sizeof(int32_t), ne_get_rows*sizeof(int32_t),
2150
+ ne10*ts_src1_sorted, ne_get_rows*ne10*ts_src1_sorted, ne_get_rows*ne10*ts_src1_sorted, stream);
2151
+ CUDA_CHECK(cudaGetLastError());
2152
+
2153
+ char * src1_data_cur = (char *) src1_sorted.ptr; // cursors that advance through the grouped buffers, one group at a time
2154
+ char * dst_data_cur = (char *) dst_sorted.ptr;
2155
+ for (int64_t i02 = 0; i02 < ne02; ++i02) {
2156
+ if (tokens_per_expert[i02] == 0) { // nothing was routed to this matrix
2157
+ continue;
2158
+ }
2159
+
2160
+ ggml_tensor src0_slice = *src0; // copy of src0's descriptor, narrowed below to the single matrix i02
2161
+ src0_slice.ne[2] = 1;
2162
+ src0_slice.nb[3] = src0_slice.nb[2];
2163
+ src0_slice.op = GGML_OP_VIEW;
2164
+ src0_slice.view_src = dst->src[0]; // non-const pointer to src0
2165
+ src0_slice.data = (char *) src0->data + i02*nb02;
2166
+
2167
+ ggml_tensor src1_slice; // temporary descriptor for this group's rows inside src1_sorted
2168
+ memset(&src1_slice, 0, sizeof(src1_slice));
2169
+ src1_slice.buffer = src1->buffer;
2170
+ src1_slice.type = type_src1_sorted;
2171
+ src1_slice.ne[0] = ne10;
2172
+ src1_slice.ne[1] = tokens_per_expert[i02];
2173
+ src1_slice.ne[2] = 1;
2174
+ src1_slice.ne[3] = 1;
2175
+ src1_slice.nb[0] = ts_src1_sorted;
2176
+ src1_slice.nb[1] = src1_slice.ne[0] * src1_slice.nb[0]; // densely packed strides
2177
+ src1_slice.nb[2] = src1_slice.ne[1] * src1_slice.nb[1];
2178
+ src1_slice.nb[3] = src1_slice.ne[2] * src1_slice.nb[2];
2179
+ src1_slice.data = src1_data_cur;
2180
+
2181
+ ggml_tensor dst_slice; // temporary descriptor for this group's results inside dst_sorted
2182
+ memset(&dst_slice, 0, sizeof(dst_slice));
2183
+ dst_slice.buffer = dst->buffer;
2184
+ dst_slice.type = type_dst_sorted;
2185
+ dst_slice.ne[0] = ne0;
2186
+ dst_slice.ne[1] = tokens_per_expert[i02];
2187
+ dst_slice.ne[2] = 1;
2188
+ dst_slice.ne[3] = 1;
2189
+ dst_slice.nb[0] = ts_dst_sorted;
2190
+ dst_slice.nb[1] = dst_slice.ne[0] * dst_slice.nb[0]; // densely packed strides
2191
+ dst_slice.nb[2] = dst_slice.ne[1] * dst_slice.nb[1];
2192
+ dst_slice.nb[3] = dst_slice.ne[2] * dst_slice.nb[2];
2193
+ dst_slice.data = dst_data_cur;
2194
+
2195
+ ggml_cuda_mul_mat(ctx, &src0_slice, &src1_slice, &dst_slice); // ordinary matrix multiplication on this group
2196
+ CUDA_CHECK(cudaGetLastError());
2197
+
2198
+ src1_data_cur += src1_slice.nb[2]; // nb[2] equals the byte size of the slice just processed
2199
+ dst_data_cur += dst_slice.nb[2];
2200
+ }
2201
+
2202
+ get_rows_cuda(dst_sorted.ptr, type_dst_sorted, ids_from_sorted, dst->data, dst->type, // move the results from grouped order back into dst's layout
2203
+ ne0, ne0*ts_dst_sorted, ne_get_rows*ne0*ts_dst_sorted, ne_get_rows*ne0*ts_dst_sorted,
2204
+ ne_get_rows, 1, 1, sizeof(int32_t), ne_get_rows*sizeof(int32_t), ne_get_rows*sizeof(int32_t),
2205
+ nb1, nb2, nb3, stream);
2206
+ }
2207
+
2208
+ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct ggml_tensor * dst) {
2209
+ // why is this here instead of mul_mat?
2210
+ if (dst->src[0] != nullptr && ggml_backend_buft_is_cuda_split(dst->src[0]->buffer->buft)) {
2211
+ ggml_cuda_set_peer_access(dst->src[1]->ne[1], ctx.device);
2212
+ }
2213
+
2214
+ switch (dst->op) {
2215
+ case GGML_OP_ARGMAX:
2216
+ ggml_cuda_argmax(ctx, dst);
2217
+ break;
2218
+ case GGML_OP_COUNT_EQUAL:
2219
+ ggml_cuda_count_equal(ctx, dst);
2220
+ break;
2221
+ case GGML_OP_REPEAT:
2222
+ ggml_cuda_op_repeat(ctx, dst);
2223
+ break;
2224
+ case GGML_OP_REPEAT_BACK:
2225
+ ggml_cuda_op_repeat_back(ctx, dst);
2226
+ break;
2227
+ case GGML_OP_GET_ROWS:
2228
+ ggml_cuda_op_get_rows(ctx, dst);
2229
+ break;
2230
+ case GGML_OP_GET_ROWS_BACK:
2231
+ ggml_cuda_op_get_rows_back(ctx, dst);
2232
+ break;
2233
+ case GGML_OP_DUP:
2234
+ ggml_cuda_dup(ctx, dst);
2235
+ break;
2236
+ case GGML_OP_CPY:
2237
+ ggml_cuda_cpy(ctx, dst->src[0], dst->src[1]);
2238
+ break;
2239
+ case GGML_OP_CONT:
2240
+ ggml_cuda_dup(ctx, dst);
2241
+ break;
2242
+ case GGML_OP_ADD:
2243
+ case GGML_OP_ADD1: // TODO: more efficient implementation
2244
+ ggml_cuda_op_add(ctx, dst);
2245
+ break;
2246
+ case GGML_OP_SUB:
2247
+ ggml_cuda_op_sub(ctx, dst);
2248
+ break;
2249
+ case GGML_OP_ACC:
2250
+ ggml_cuda_op_acc(ctx, dst);
2251
+ break;
2252
+ case GGML_OP_MUL:
2253
+ ggml_cuda_op_mul(ctx, dst);
2254
+ break;
2255
+ case GGML_OP_DIV:
2256
+ ggml_cuda_op_div(ctx, dst);
2257
+ break;
2258
+ case GGML_OP_UNARY:
2259
+ switch (ggml_get_unary_op(dst)) {
2260
+ case GGML_UNARY_OP_ABS:
2261
+ ggml_cuda_op_abs(ctx, dst);
2262
+ break;
2263
+ case GGML_UNARY_OP_SGN:
2264
+ ggml_cuda_op_sgn(ctx, dst);
2265
+ break;
2266
+ case GGML_UNARY_OP_NEG:
2267
+ ggml_cuda_op_neg(ctx, dst);
2268
+ break;
2269
+ case GGML_UNARY_OP_STEP:
2270
+ ggml_cuda_op_step(ctx, dst);
2271
+ break;
2272
+ case GGML_UNARY_OP_GELU:
2273
+ ggml_cuda_op_gelu(ctx, dst);
2274
+ break;
2275
+ case GGML_UNARY_OP_SILU:
2276
+ ggml_cuda_op_silu(ctx, dst);
2277
+ break;
2278
+ case GGML_UNARY_OP_GELU_ERF:
2279
+ ggml_cuda_op_gelu_erf(ctx, dst);
2280
+ break;
2281
+ case GGML_UNARY_OP_GELU_QUICK:
2282
+ ggml_cuda_op_gelu_quick(ctx, dst);
2283
+ break;
2284
+ case GGML_UNARY_OP_TANH:
2285
+ ggml_cuda_op_tanh(ctx, dst);
2286
+ break;
2287
+ case GGML_UNARY_OP_RELU:
2288
+ ggml_cuda_op_relu(ctx, dst);
2289
+ break;
2290
+ case GGML_UNARY_OP_SIGMOID:
2291
+ ggml_cuda_op_sigmoid(ctx, dst);
2292
+ break;
2293
+ case GGML_UNARY_OP_HARDSIGMOID:
2294
+ ggml_cuda_op_hardsigmoid(ctx, dst);
2295
+ break;
2296
+ case GGML_UNARY_OP_HARDSWISH:
2297
+ ggml_cuda_op_hardswish(ctx, dst);
2298
+ break;
2299
+ case GGML_UNARY_OP_EXP:
2300
+ ggml_cuda_op_exp(ctx, dst);
2301
+ break;
2302
+ default:
2303
+ return false;
2304
+ }
2305
+ break;
2306
+ case GGML_OP_GLU:
2307
+ switch (ggml_get_glu_op(dst)) {
2308
+ case GGML_GLU_OP_REGLU:
2309
+ ggml_cuda_op_reglu(ctx, dst);
2310
+ break;
2311
+ case GGML_GLU_OP_GEGLU:
2312
+ ggml_cuda_op_geglu(ctx, dst);
2313
+ break;
2314
+ case GGML_GLU_OP_SWIGLU:
2315
+ ggml_cuda_op_swiglu(ctx, dst);
2316
+ break;
2317
+ default:
2318
+ return false;
2319
+ }
2320
+ break;
2321
+ case GGML_OP_NORM:
2322
+ ggml_cuda_op_norm(ctx, dst);
2323
+ break;
2324
+ case GGML_OP_GROUP_NORM:
2325
+ ggml_cuda_op_group_norm(ctx, dst);
2326
+ break;
2327
+ case GGML_OP_L2_NORM:
2328
+ ggml_cuda_op_l2_norm(ctx, dst);
2329
+ break;
2330
+ case GGML_OP_CONCAT:
2331
+ ggml_cuda_op_concat(ctx, dst);
2332
+ break;
2333
+ case GGML_OP_UPSCALE:
2334
+ ggml_cuda_op_upscale(ctx, dst);
2335
+ break;
2336
+ case GGML_OP_PAD:
2337
+ ggml_cuda_op_pad(ctx, dst);
2338
+ break;
2339
+ case GGML_OP_ARANGE:
2340
+ ggml_cuda_op_arange(ctx, dst);
2341
+ break;
2342
+ case GGML_OP_TIMESTEP_EMBEDDING:
2343
+ ggml_cuda_op_timestep_embedding(ctx, dst);
2344
+ break;
2345
+ case GGML_OP_LEAKY_RELU:
2346
+ ggml_cuda_op_leaky_relu(ctx, dst);
2347
+ break;
2348
+ case GGML_OP_SILU_BACK:
2349
+ ggml_cuda_op_silu_back(ctx, dst);
2350
+ break;
2351
+ case GGML_OP_RMS_NORM:
2352
+ ggml_cuda_op_rms_norm(ctx, dst);
2353
+ break;
2354
+ case GGML_OP_RMS_NORM_BACK:
2355
+ ggml_cuda_op_rms_norm_back(ctx, dst);
2356
+ break;
2357
+ case GGML_OP_MUL_MAT:
2358
+ ggml_cuda_mul_mat(ctx, dst->src[0], dst->src[1], dst);
2359
+ break;
2360
+ case GGML_OP_MUL_MAT_ID:
2361
+ ggml_cuda_mul_mat_id(ctx, dst);
2362
+ break;
2363
+ case GGML_OP_OUT_PROD:
2364
+ ggml_cuda_out_prod(ctx, dst);
2365
+ break;
2366
+ case GGML_OP_SCALE:
2367
+ ggml_cuda_op_scale(ctx, dst);
2368
+ break;
2369
+ case GGML_OP_SQR:
2370
+ ggml_cuda_op_sqr(ctx, dst);
2371
+ break;
2372
+ case GGML_OP_SQRT:
2373
+ ggml_cuda_op_sqrt(ctx, dst);
2374
+ break;
2375
+ case GGML_OP_SIN:
2376
+ ggml_cuda_op_sin(ctx, dst);
2377
+ break;
2378
+ case GGML_OP_COS:
2379
+ ggml_cuda_op_cos(ctx, dst);
2380
+ break;
2381
+ case GGML_OP_CLAMP:
2382
+ ggml_cuda_op_clamp(ctx, dst);
2383
+ break;
2384
+ case GGML_OP_LOG:
2385
+ ggml_cuda_op_log(ctx, dst);
2386
+ break;
2387
+ case GGML_OP_NONE:
2388
+ case GGML_OP_RESHAPE:
2389
+ case GGML_OP_VIEW:
2390
+ case GGML_OP_PERMUTE:
2391
+ case GGML_OP_TRANSPOSE:
2392
+ break;
2393
+ case GGML_OP_DIAG_MASK_INF:
2394
+ ggml_cuda_op_diag_mask_inf(ctx, dst);
2395
+ break;
2396
+ case GGML_OP_SOFT_MAX:
2397
+ ggml_cuda_op_soft_max(ctx, dst);
2398
+ break;
2399
+ case GGML_OP_SOFT_MAX_BACK:
2400
+ ggml_cuda_op_soft_max_back(ctx, dst);
2401
+ break;
2402
+ case GGML_OP_ROPE:
2403
+ ggml_cuda_op_rope(ctx, dst);
2404
+ break;
2405
+ case GGML_OP_ROPE_BACK:
2406
+ ggml_cuda_op_rope_back(ctx, dst);
2407
+ break;
2408
+ case GGML_OP_IM2COL:
2409
+ ggml_cuda_op_im2col(ctx, dst);
2410
+ break;
2411
+ case GGML_OP_CONV_2D_DW:
2412
+ ggml_cuda_op_conv2d_dw(ctx, dst);
2413
+ break;
2414
+ case GGML_OP_CONV_TRANSPOSE_2D:
2415
+ ggml_cuda_conv_2d_transpose_p0(ctx, dst);
2416
+ break;
2417
+ case GGML_OP_CONV_TRANSPOSE_1D:
2418
+ ggml_cuda_op_conv_transpose_1d(ctx,dst);
2419
+ break;
2420
+ case GGML_OP_POOL_2D:
2421
+ ggml_cuda_op_pool2d(ctx, dst);
2422
+ break;
2423
+ case GGML_OP_SUM:
2424
+ ggml_cuda_op_sum(ctx, dst);
2425
+ break;
2426
+ case GGML_OP_SUM_ROWS:
2427
+ ggml_cuda_op_sum_rows(ctx, dst);
2428
+ break;
2429
+ case GGML_OP_MEAN:
2430
+ ggml_cuda_op_mean(ctx, dst);
2431
+ break;
2432
+ case GGML_OP_SSM_CONV:
2433
+ ggml_cuda_op_ssm_conv(ctx, dst);
2434
+ break;
2435
+ case GGML_OP_SSM_SCAN:
2436
+ ggml_cuda_op_ssm_scan(ctx, dst);
2437
+ break;
2438
+ case GGML_OP_ARGSORT:
2439
+ ggml_cuda_op_argsort(ctx, dst);
2440
+ break;
2441
+ case GGML_OP_FLASH_ATTN_EXT:
2442
+ ggml_cuda_flash_attn_ext(ctx, dst);
2443
+ break;
2444
+ case GGML_OP_CROSS_ENTROPY_LOSS:
2445
+ ggml_cuda_cross_entropy_loss(ctx, dst);
2446
+ break;
2447
+ case GGML_OP_RWKV_WKV6:
2448
+ ggml_cuda_op_rwkv_wkv6(ctx, dst);
2449
+ break;
2450
+ case GGML_OP_GATED_LINEAR_ATTN:
2451
+ ggml_cuda_op_gated_linear_attn(ctx, dst);
2452
+ break;
2453
+ case GGML_OP_RWKV_WKV7:
2454
+ ggml_cuda_op_rwkv_wkv7(ctx, dst);
2455
+ break;
2456
+ case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
2457
+ ggml_cuda_cross_entropy_loss_back(ctx, dst);
2458
+ break;
2459
+ case GGML_OP_OPT_STEP_ADAMW:
2460
+ ggml_cuda_opt_step_adamw(ctx, dst);
2461
+ break;
2462
+ default:
2463
+ return false;
2464
+ }
2465
+
2466
+ cudaError_t err = cudaGetLastError();
2467
+ if (err != cudaSuccess) {
2468
+ GGML_LOG_ERROR("%s: %s failed\n", __func__, ggml_op_desc(dst));
2469
+ CUDA_CHECK(err);
2470
+ }
2471
+
2472
+ return true;
2473
+ }
2474
+
2475
+ ////////////////////////////////////////////////////////////////////////////////
2476
+
2477
+ // backend
2478
+
2479
+ static const char * ggml_backend_cuda_get_name(ggml_backend_t backend) {
2480
+ ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
2481
+
2482
+ return cuda_ctx->name.c_str();
2483
+ }
2484
+
2485
+ static void ggml_backend_cuda_free(ggml_backend_t backend) {
2486
+ ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
2487
+
2488
+ delete cuda_ctx;
2489
+ delete backend;
2490
+ }
2491
+
2492
+ static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
2493
+ ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
2494
+ ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
2495
+
2496
+ GGML_ASSERT(buf->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
2497
+
2498
+ CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, cuda_ctx->stream()));
2499
+ }
2500
+
2501
+ static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
2502
+ ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
2503
+ ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
2504
+
2505
+ GGML_ASSERT(buf->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
2506
+
2507
+ CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, cuda_ctx->stream()));
2508
+ }
2509
+
2510
+ static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, const ggml_tensor * src, ggml_tensor * dst) {
2511
+ ggml_backend_buffer_t buf_src = src->view_src ? src->view_src->buffer : src->buffer;
2512
+ ggml_backend_buffer_t buf_dst = dst->view_src ? dst->view_src->buffer : dst->buffer;
2513
+
2514
+ if (!ggml_backend_is_cuda(backend_src) || !ggml_backend_is_cuda(backend_dst)) {
2515
+ return false;
2516
+ }
2517
+
2518
+ if (!ggml_backend_buffer_is_cuda(src->buffer) || !ggml_backend_buffer_is_cuda(dst->buffer)) {
2519
+ return false;
2520
+ }
2521
+
2522
+ // device -> device copy
2523
+ ggml_backend_cuda_context * cuda_ctx_src = (ggml_backend_cuda_context *)backend_src->context;
2524
+ ggml_backend_cuda_context * cuda_ctx_dst = (ggml_backend_cuda_context *)backend_dst->context;
2525
+
2526
+ ggml_backend_cuda_buffer_context * buf_ctx_src = (ggml_backend_cuda_buffer_context *)buf_src->context;
2527
+ ggml_backend_cuda_buffer_context * buf_ctx_dst = (ggml_backend_cuda_buffer_context *)buf_dst->context;
2528
+
2529
+ if (cuda_ctx_src->device != buf_ctx_src->device || cuda_ctx_dst->device != buf_ctx_dst->device) {
2530
+ #ifndef NDEBUG
2531
+ GGML_LOG_DEBUG("%s: backend and buffer devices do not match\n", __func__);
2532
+ #endif
2533
+ return false;
2534
+ }
2535
+
2536
+ if (backend_src != backend_dst) {
2537
+ // copy on src stream
2538
+ if (cuda_ctx_src->device == cuda_ctx_dst->device) {
2539
+ CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_src->stream()));
2540
+ } else {
2541
+ #ifdef GGML_CUDA_NO_PEER_COPY
2542
+ return false;
2543
+ #else
2544
+ CUDA_CHECK(cudaMemcpyPeerAsync(dst->data, cuda_ctx_dst->device, src->data, cuda_ctx_src->device, ggml_nbytes(dst), cuda_ctx_src->stream()));
2545
+ #endif
2546
+ }
2547
+
2548
+ // record event on src stream after the copy
2549
+ if (!cuda_ctx_src->copy_event) {
2550
+ ggml_cuda_set_device(cuda_ctx_src->device);
2551
+ CUDA_CHECK(cudaEventCreateWithFlags(&cuda_ctx_src->copy_event, cudaEventDisableTiming));
2552
+ }
2553
+
2554
+ CUDA_CHECK(cudaEventRecord(cuda_ctx_src->copy_event, cuda_ctx_src->stream()));
2555
+
2556
+ // wait on dst stream for the copy to complete
2557
+ CUDA_CHECK(cudaStreamWaitEvent(cuda_ctx_dst->stream(), cuda_ctx_src->copy_event, 0));
2558
+ } else {
2559
+ // src and dst are on the same backend
2560
+ CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_src->stream()));
2561
+ }
2562
+ return true;
2563
+ }
2564
+
2565
+ static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
2566
+ ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
2567
+
2568
+ CUDA_CHECK(cudaStreamSynchronize(cuda_ctx->stream()));
2569
+
2570
+ GGML_UNUSED(backend);
2571
+ }
2572
+
2573
+ #ifdef USE_CUDA_GRAPH
2574
+ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph,
2575
+ bool use_cuda_graph) {
2576
+
2577
+ // Loop over nodes in GGML graph to obtain info needed for CUDA graph
2578
+ cuda_ctx->cuda_graph->cpy_dest_ptrs.clear();
2579
+
2580
+ for (int i = 0; i < cgraph->n_nodes; i++) {
2581
+ ggml_tensor * node = cgraph->nodes[i];
2582
+
2583
+ if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
2584
+ continue;
2585
+ }
2586
+
2587
+ if (node->src[0] && node->src[0]->buffer && ggml_backend_buft_is_cuda_split(node->src[0]->buffer->buft)) {
2588
+ use_cuda_graph = false; // Split buffers are not supported by CUDA graph capture
2589
+ #ifndef NDEBUG
2590
+ GGML_LOG_DEBUG("%s: disabling CUDA graphs due to split buffer\n", __func__);
2591
+ #endif
2592
+ }
2593
+
2594
+ if (node->op == GGML_OP_MUL_MAT_ID && node->ne[2] != 1) {
2595
+ use_cuda_graph = false; // This node type is not supported by CUDA graph capture
2596
+ #ifndef NDEBUG
2597
+ GGML_LOG_DEBUG("%s: disabling CUDA graphs due to unsupported node type\n", __func__);
2598
+ #endif
2599
+ }
2600
+
2601
+ if (node->op == GGML_OP_ADD && node->src[1] && node->src[1]->ne[1] > 1) {
2602
+ // disable CUDA graphs for batch size > 1 for now.
2603
+ // Changes in batch size or context size can cause changes to the grid size of some kernels.
2604
+ use_cuda_graph = false;
2605
+ #ifndef NDEBUG
2606
+ GGML_LOG_DEBUG("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]);
2607
+ #endif
2608
+ }
2609
+
2610
+ if (node->op == GGML_OP_CPY) {
2611
+
2612
+ // Store the pointers which are updated for each token, such that these can be sent
2613
+ // to the device and accessed using indirection from CUDA graph
2614
+ cuda_ctx->cuda_graph->cpy_dest_ptrs.push_back((char *) node->src[1]->data);
2615
+
2616
+ // store a pointer to each copy op CUDA kernel to identify it later
2617
+ void * ptr = ggml_cuda_cpy_fn(node->src[0], node->src[1]);
2618
+ if (!ptr) {
2619
+ use_cuda_graph = false;
2620
+ #ifndef NDEBUG
2621
+ GGML_LOG_DEBUG("%s: disabling CUDA graphs due to unsupported copy op\n", __func__);
2622
+ #endif
2623
+ }
2624
+ }
2625
+
2626
+ if (!use_cuda_graph) {
2627
+ break;
2628
+ }
2629
+ }
2630
+
2631
+ if (use_cuda_graph) {
2632
+ cuda_ctx->cuda_graph->use_cpy_indirection = true;
2633
+ // copy pointers to GPU so they can be accessed via indirection within CUDA graph
2634
+ ggml_cuda_cpy_dest_ptrs_copy(cuda_ctx->cuda_graph.get(), cuda_ctx->cuda_graph->cpy_dest_ptrs.data(), cuda_ctx->cuda_graph->cpy_dest_ptrs.size(), cuda_ctx->stream());
2635
+ }
2636
+
2637
+ return use_cuda_graph;
2638
+ }
2639
+
2640
+ static void set_ggml_graph_node_properties(ggml_tensor * node, ggml_graph_node_properties * graph_node_properties) {
2641
+ graph_node_properties->node_address = node->data;
2642
+ graph_node_properties->node_op = node->op;
2643
+ for (int i = 0; i < GGML_MAX_DIMS; i++) {
2644
+ graph_node_properties->ne[i] = node->ne[i];
2645
+ graph_node_properties->nb[i] = node->nb[i];
2646
+ }
2647
+ for (int i = 0; i < GGML_MAX_SRC; i++) {
2648
+ graph_node_properties->src_address[i] = node->src[i] ? node->src[i]->data : nullptr;
2649
+ }
2650
+ memcpy(graph_node_properties->op_params, node->op_params, GGML_MAX_OP_PARAMS);
2651
+ }
2652
+
2653
+ static bool ggml_graph_node_has_matching_properties(ggml_tensor * node, ggml_graph_node_properties * graph_node_properties) {
2654
+ if (node->data != graph_node_properties->node_address &&
2655
+ node->op != GGML_OP_CPY &&
2656
+ node->op != GGML_OP_VIEW) {
2657
+ return false;
2658
+ }
2659
+
2660
+ if (node->op != graph_node_properties->node_op) {
2661
+ return false;
2662
+ }
2663
+
2664
+ for (int i = 0; i < GGML_MAX_DIMS; i++) {
2665
+ if (node->ne[i] != graph_node_properties->ne[i]) {
2666
+ return false;
2667
+ }
2668
+ if (node->nb[i] != graph_node_properties->nb[i]) {
2669
+ return false;
2670
+ }
2671
+ }
2672
+
2673
+ for (int i = 0; i < GGML_MAX_SRC; i++) {
2674
+ if (node->src[i] &&
2675
+ node->src[i]->data != graph_node_properties->src_address[i] &&
2676
+ node->op != GGML_OP_CPY &&
2677
+ node->op != GGML_OP_VIEW
2678
+ ) {
2679
+ return false;
2680
+ }
2681
+ }
2682
+
2683
+ if (node->op == GGML_OP_SCALE &&
2684
+ memcmp(graph_node_properties->op_params, node->op_params, GGML_MAX_OP_PARAMS) != 0) {
2685
+ return false;
2686
+ }
2687
+
2688
+ return true;
2689
+ }
2690
+
2691
+ static bool is_cuda_graph_update_required(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph) {
2692
+
2693
+ bool cuda_graph_update_required = false;
2694
+
2695
+ if (cuda_ctx->cuda_graph->instance == nullptr) {
2696
+ cuda_graph_update_required = true;
2697
+ }
2698
+
2699
+ // Check if the graph size has changed
2700
+ if (cuda_ctx->cuda_graph->ggml_graph_properties.size() != (size_t)cgraph->n_nodes) {
2701
+ cuda_graph_update_required = true;
2702
+ cuda_ctx->cuda_graph->ggml_graph_properties.resize(cgraph->n_nodes);
2703
+ }
2704
+
2705
+ // Loop over nodes in GGML graph to determine if CUDA graph update is required
2706
+ // and store properties to allow this comparison for the next token
2707
+ for (int i = 0; i < cgraph->n_nodes; i++) {
2708
+ bool has_matching_properties = true;
2709
+ if (!cuda_graph_update_required) {
2710
+ has_matching_properties = ggml_graph_node_has_matching_properties(cgraph->nodes[i], &cuda_ctx->cuda_graph->ggml_graph_properties[i]);
2711
+ }
2712
+ if (!has_matching_properties) {
2713
+ cuda_graph_update_required = true;
2714
+ }
2715
+ set_ggml_graph_node_properties(cgraph->nodes[i], &cuda_ctx->cuda_graph->ggml_graph_properties[i]);
2716
+ }
2717
+
2718
+ return cuda_graph_update_required;
2719
+ }
2720
+
2721
+ static void update_cuda_graph_executable(ggml_backend_cuda_context * cuda_ctx) {
2722
+
2723
+ #if CUDART_VERSION >= 12000
2724
+ cudaGraphExecUpdateResultInfo result_info;
2725
+ cudaError_t stat = cudaGraphExecUpdate(cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, &result_info);
2726
+ #else
2727
+ cudaGraphNode_t errorNode;
2728
+ cudaGraphExecUpdateResult result_info;
2729
+ cudaError_t stat = cudaGraphExecUpdate(cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, &errorNode, &result_info);
2730
+ #endif // CUDART_VERSION >= 12000
2731
+
2732
+ if (stat == cudaErrorGraphExecUpdateFailure) {
2733
+ #ifndef NDEBUG
2734
+ GGML_LOG_DEBUG("%s: CUDA graph update failed\n", __func__);
2735
+ #endif
2736
+
2737
+ // The pre-existing graph exec cannot be updated due to violated constraints
2738
+ // so instead clear error and re-instantiate
2739
+ (void)cudaGetLastError();
2740
+ CUDA_CHECK(cudaGraphExecDestroy(cuda_ctx->cuda_graph->instance));
2741
+ cuda_ctx->cuda_graph->instance = nullptr;
2742
+ CUDA_CHECK(cudaGraphInstantiate(&cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, NULL, NULL, 0));
2743
+ } else {
2744
+ GGML_ASSERT(stat == cudaSuccess);
2745
+ }
2746
+ }
2747
+ #endif
2748
+
2749
+ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph,
2750
+ bool & graph_evaluated_or_captured, bool & use_cuda_graph, bool & cuda_graph_update_required) {
2751
+ // flag used to determine whether it is an integrated_gpu
2752
+ const bool integrated = ggml_cuda_info().devices[cuda_ctx->device].integrated;
2753
+
2754
+ while (!graph_evaluated_or_captured) {
2755
+ // Only perform the graph execution if CUDA graphs are not enabled, or we are capturing the graph.
2756
+ // With the use of CUDA graphs, the execution will be performed by the graph launch.
2757
+ if (!use_cuda_graph || cuda_graph_update_required) {
2758
+ for (int i = 0; i < cgraph->n_nodes; i++) {
2759
+ ggml_tensor * node = cgraph->nodes[i];
2760
+
2761
+ if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
2762
+ continue;
2763
+ }
2764
+
2765
+ #ifndef NDEBUG
2766
+ assert(node->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
2767
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
2768
+ if (node->src[j] != nullptr) {
2769
+ assert(node->src[j]->buffer);
2770
+ assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) ||
2771
+ ggml_backend_buft_is_cuda_split(node->src[j]->buffer->buft) || (integrated && ggml_backend_buft_is_cuda_host(node->src[j]->buffer->buft)));
2772
+ }
2773
+ }
2774
+ #else
2775
+ GGML_UNUSED(integrated);
2776
+ #endif // NDEBUG
2777
+
2778
+ bool ok = ggml_cuda_compute_forward(*cuda_ctx, node);
2779
+ if (!ok) {
2780
+ GGML_LOG_ERROR("%s: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
2781
+ }
2782
+ GGML_ASSERT(ok);
2783
+ }
2784
+ }
2785
+
2786
+ #ifdef USE_CUDA_GRAPH
2787
+ if (use_cuda_graph && cuda_graph_update_required) { // End CUDA graph capture
2788
+ if (cuda_ctx->cuda_graph->graph != nullptr) {
2789
+ CUDA_CHECK(cudaGraphDestroy(cuda_ctx->cuda_graph->graph));
2790
+ cuda_ctx->cuda_graph->graph = nullptr;
2791
+ }
2792
+
2793
+ CUDA_CHECK(cudaStreamEndCapture(cuda_ctx->stream(), &cuda_ctx->cuda_graph->graph));
2794
+ graph_evaluated_or_captured = true; // CUDA graph has been captured
2795
+
2796
+ std::lock_guard<std::mutex> lock(ggml_cuda_lock);
2797
+ if (ggml_cuda_lock_counter.fetch_sub(1, std::memory_order_relaxed) == 1) {
2798
+ ggml_cuda_lock_cv.notify_all();
2799
+ }
2800
+ } else {
2801
+ graph_evaluated_or_captured = true; // ggml graph has been directly evaluated
2802
+ }
2803
+ }
2804
+
2805
+ if (use_cuda_graph) {
2806
+ if (cuda_ctx->cuda_graph->instance == nullptr) { // Create executable graph from captured graph.
2807
+ CUDA_CHECK(cudaGraphInstantiate(&cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, NULL, NULL, 0));
2808
+ }
2809
+ if (cuda_graph_update_required) { // Update graph executable
2810
+ update_cuda_graph_executable(cuda_ctx);
2811
+ }
2812
+ // Launch graph
2813
+ CUDA_CHECK(cudaGraphLaunch(cuda_ctx->cuda_graph->instance, cuda_ctx->stream()));
2814
+ #else
2815
+ graph_evaluated_or_captured = true;
2816
+ #endif // USE_CUDA_GRAPH
2817
+ }
2818
+ }
2819
+
2820
+ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
2821
+ ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
2822
+
2823
+ ggml_cuda_set_device(cuda_ctx->device);
2824
+
2825
+ #ifdef USE_CUDA_GRAPH
2826
+ static const bool disable_cuda_graphs_due_to_env = (getenv("GGML_CUDA_DISABLE_GRAPHS") != nullptr);
2827
+
2828
+ // Objects required for CUDA Graph
2829
+ if (cuda_ctx->cuda_graph == nullptr) {
2830
+ cuda_ctx->cuda_graph.reset(new ggml_cuda_graph());
2831
+ }
2832
+
2833
+ bool use_cuda_graph = true;
2834
+ bool cuda_graph_update_required = false;
2835
+
2836
+ if (cuda_ctx->cuda_graph->graph == nullptr) {
2837
+ if (ggml_cuda_info().devices[cuda_ctx->device].cc < GGML_CUDA_CC_AMPERE) {
2838
+ cuda_ctx->cuda_graph->disable_due_to_gpu_arch = true;
2839
+ #ifndef NDEBUG
2840
+ GGML_LOG_DEBUG("%s: disabling CUDA graphs due to GPU architecture\n", __func__);
2841
+ #endif
2842
+ }
2843
+ }
2844
+
2845
+ // Disable CUDA graphs in presence of env var, old GPU, use-case which is changing too rapidly,
2846
+ // or previous graph capture failure.
2847
+ // Also disable for multi-gpu for now. TO DO investigate
2848
+ if (disable_cuda_graphs_due_to_env
2849
+ || cuda_ctx->cuda_graph->disable_due_to_gpu_arch
2850
+ || cuda_ctx->cuda_graph->disable_due_to_too_many_updates
2851
+ || cuda_ctx->cuda_graph->disable_due_to_failed_graph_capture) {
2852
+ use_cuda_graph = false;
2853
+ }
2854
+
2855
+ if (use_cuda_graph) {
2856
+ cuda_graph_update_required = is_cuda_graph_update_required(cuda_ctx, cgraph);
2857
+
2858
+ use_cuda_graph = check_node_graph_compatibility_and_refresh_copy_ops(cuda_ctx, cgraph, use_cuda_graph);
2859
+
2860
+ // Disable CUDA graphs (from the next token) if the use-case is demanding too many consecutive graph updates.
2861
+ if (use_cuda_graph && cuda_graph_update_required) {
2862
+ cuda_ctx->cuda_graph->number_consecutive_updates++;
2863
+ } else {
2864
+ cuda_ctx->cuda_graph->number_consecutive_updates = 0;
2865
+ }
2866
+
2867
+ if (cuda_ctx->cuda_graph->number_consecutive_updates >= 4) {
2868
+ cuda_ctx->cuda_graph->disable_due_to_too_many_updates = true;
2869
+ #ifndef NDEBUG
2870
+ GGML_LOG_DEBUG("%s: disabling CUDA graphs due to too many consecutive updates\n", __func__);
2871
+ #endif
2872
+ }
2873
+ }
2874
+
2875
+ if (use_cuda_graph && cuda_graph_update_required) {
2876
+ // Start CUDA graph capture
2877
+ {
2878
+ std::lock_guard<std::mutex> lock(ggml_cuda_lock);
2879
+ ggml_cuda_lock_counter.fetch_add(1, std::memory_order_relaxed);
2880
+ }
2881
+
2882
+ CUDA_CHECK(cudaStreamBeginCapture(cuda_ctx->stream(), cudaStreamCaptureModeRelaxed));
2883
+ }
2884
+
2885
+ if (!use_cuda_graph) {
2886
+ cuda_ctx->cuda_graph->use_cpy_indirection = false;
2887
+ }
2888
+
2889
+ #else
2890
+ bool use_cuda_graph = false;
2891
+ bool cuda_graph_update_required = false;
2892
+ #endif // USE_CUDA_GRAPH
2893
+
2894
+ bool graph_evaluated_or_captured = false;
2895
+
2896
+ evaluate_and_capture_cuda_graph(cuda_ctx, cgraph, graph_evaluated_or_captured, use_cuda_graph, cuda_graph_update_required);
2897
+
2898
+ return GGML_STATUS_SUCCESS;
2899
+ }
2900
+
2901
+ static void ggml_backend_cuda_event_record(ggml_backend_t backend, ggml_backend_event_t event) {
2902
+ ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
2903
+
2904
+ CUDA_CHECK(cudaEventRecord((cudaEvent_t)event->context, cuda_ctx->stream()));
2905
+ }
2906
+
2907
+ static void ggml_backend_cuda_event_wait(ggml_backend_t backend, ggml_backend_event_t event) {
2908
+ ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
2909
+
2910
+ if (ggml_backend_is_cuda(backend)) {
2911
+ CUDA_CHECK(cudaStreamWaitEvent(cuda_ctx->stream(), (cudaEvent_t)event->context, 0));
2912
+ } else {
2913
+ #if 0
2914
+ // untested
2915
+ auto wait_fn = [](void * user_data) {
2916
+ ggml_backend_event_t event = (ggml_backend_event_t)user_data;
2917
+ ggml_backend_event_synchronize(event);
2918
+ };
2919
+
2920
+ CUDA_CHECK(cudaLaunchHostFunc(cuda_ctx->stream(), wait_fn, event));
2921
+ #endif
2922
+ GGML_ABORT("fatal error");
2923
+ }
2924
+ }
2925
+
2926
+ static const ggml_backend_i ggml_backend_cuda_interface = {
2927
+ /* .get_name = */ ggml_backend_cuda_get_name,
2928
+ /* .free = */ ggml_backend_cuda_free,
2929
+ /* .set_tensor_async = */ ggml_backend_cuda_set_tensor_async,
2930
+ /* .get_tensor_async = */ ggml_backend_cuda_get_tensor_async,
2931
+ /* .cpy_tensor_async = */ ggml_backend_cuda_cpy_tensor_async,
2932
+ /* .synchronize = */ ggml_backend_cuda_synchronize,
2933
+ /* .graph_plan_create = */ NULL,
2934
+ /* .graph_plan_free = */ NULL,
2935
+ /* .graph_plan_update = */ NULL,
2936
+ /* .graph_plan_compute = */ NULL,
2937
+ /* .graph_compute = */ ggml_backend_cuda_graph_compute,
2938
+ /* .event_record = */ ggml_backend_cuda_event_record,
2939
+ /* .event_wait = */ ggml_backend_cuda_event_wait,
2940
+ };
2941
+
2942
+ static ggml_guid_t ggml_backend_cuda_guid() {
2943
+ static ggml_guid guid = { 0x2c, 0xdd, 0xe8, 0x1c, 0x65, 0xb3, 0x65, 0x73, 0x6a, 0x12, 0x88, 0x61, 0x1c, 0xc9, 0xdc, 0x25 };
2944
+ return &guid;
2945
+ }
2946
+
2947
+ bool ggml_backend_is_cuda(ggml_backend_t backend) {
2948
+ return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_cuda_guid());
2949
+ }
2950
+
2951
+ int ggml_backend_cuda_get_device_count() {
2952
+ return ggml_cuda_info().device_count;
2953
+ }
2954
+
2955
+ void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size) {
2956
+ cudaDeviceProp prop;
2957
+ CUDA_CHECK(cudaGetDeviceProperties(&prop, device));
2958
+ snprintf(description, description_size, "%s", prop.name);
2959
+ }
2960
+
2961
+ void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total) {
2962
+ ggml_cuda_set_device(device);
2963
+
2964
+ CUDA_CHECK(cudaMemGetInfo(free, total));
2965
+ }
2966
+
2967
+ bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size) {
2968
+ if (getenv("GGML_CUDA_REGISTER_HOST") == nullptr) {
2969
+ return false;
2970
+ }
2971
+
2972
+ #if CUDART_VERSION >= 11010 || defined(GGML_USE_MUSA)
2973
+ cudaError_t err = cudaHostRegister(buffer, size, cudaHostRegisterPortable | cudaHostRegisterReadOnly);
2974
+ if (err != cudaSuccess) {
2975
+ // clear the error
2976
+ (void)cudaGetLastError();
2977
+
2978
+ GGML_LOG_DEBUG("%s: failed to register %.2f MiB of pinned memory: %s\n", __func__,
2979
+ size / 1024.0 / 1024.0, cudaGetErrorString(err));
2980
+ return false;
2981
+ }
2982
+ return true;
2983
+ #else
2984
+ GGML_UNUSED(buffer);
2985
+ GGML_UNUSED(size);
2986
+ return false;
2987
+ #endif // CUDART_VERSION >= 11010 || defined(GGML_USE_MUSA)
2988
+ }
2989
+
2990
+ void ggml_backend_cuda_unregister_host_buffer(void * buffer) {
2991
+ if (getenv("GGML_CUDA_REGISTER_HOST") == nullptr) {
2992
+ return;
2993
+ }
2994
+
2995
+ cudaError_t err = cudaHostUnregister(buffer);
2996
+ if (err != cudaSuccess) {
2997
+ // clear the error
2998
+ (void)cudaGetLastError();
2999
+ }
3000
+ }
3001
+
3002
+
3003
+ // backend device
3004
+
3005
// Per-device state stored in the `context` pointer of a CUDA ggml_backend_dev_t.
struct ggml_backend_cuda_device_context {
    int         device;       // device index passed to ggml_cuda_set_device / ggml_backend_cuda_init
    std::string name;         // returned by ggml_backend_cuda_device_get_name
    std::string description;  // returned by ggml_backend_cuda_device_get_description
};
3010
+
3011
+ static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) {
3012
+ ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
3013
+ return ctx->name.c_str();
3014
+ }
3015
+
3016
+ static const char * ggml_backend_cuda_device_get_description(ggml_backend_dev_t dev) {
3017
+ ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
3018
+ return ctx->description.c_str();
3019
+ }
3020
+
3021
+ static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
3022
+ ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
3023
+ ggml_cuda_set_device(ctx->device);
3024
+ CUDA_CHECK(cudaMemGetInfo(free, total));
3025
+ }
3026
+
3027
+ static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend_dev_t dev) {
3028
+ GGML_UNUSED(dev);
3029
+ return GGML_BACKEND_DEVICE_TYPE_GPU;
3030
+ }
3031
+
3032
+ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
3033
+ props->name = ggml_backend_cuda_device_get_name(dev);
3034
+ props->description = ggml_backend_cuda_device_get_description(dev);
3035
+ props->type = ggml_backend_cuda_device_get_type(dev);
3036
+ ggml_backend_cuda_device_get_memory(dev, &props->memory_free, &props->memory_total);
3037
+
3038
+ bool host_buffer = getenv("GGML_CUDA_NO_PINNED") == nullptr;
3039
+ #ifdef GGML_CUDA_NO_PEER_COPY
3040
+ bool events = false;
3041
+ #else
3042
+ bool events = true;
3043
+ #endif
3044
+
3045
+ props->caps = {
3046
+ /* .async = */ true,
3047
+ /* .host_buffer = */ host_buffer,
3048
+ /* .buffer_from_host_ptr = */ false,
3049
+ /* .events = */ events,
3050
+ };
3051
+ }
3052
+
3053
+ static ggml_backend_t ggml_backend_cuda_device_init_backend(ggml_backend_dev_t dev, const char * params) {
3054
+ GGML_UNUSED(params);
3055
+ ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
3056
+ return ggml_backend_cuda_init(ctx->device);
3057
+ }
3058
+
3059
+ static ggml_backend_buffer_type_t ggml_backend_cuda_device_get_buffer_type(ggml_backend_dev_t dev) {
3060
+ ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
3061
+ return ggml_backend_cuda_buffer_type(ctx->device);
3062
+ }
3063
+
3064
+ static ggml_backend_buffer_type_t ggml_backend_cuda_device_get_host_buffer_type(ggml_backend_dev_t dev) {
3065
+ GGML_UNUSED(dev);
3066
+ return ggml_backend_cuda_host_buffer_type();
3067
+ }
3068
+
3069
+ // TODO: move these functions here
3070
+ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
3071
+ ggml_backend_cuda_device_context * dev_ctx = (ggml_backend_cuda_device_context *) dev->context;
3072
+
3073
+ // split buffers can only be used with GGML_OP_MUL_MAT
3074
+ if (op->op != GGML_OP_MUL_MAT) {
3075
+ for (int i = 0; i < GGML_MAX_SRC; i++) {
3076
+ if (op->src[i] && op->src[i]->buffer && ggml_backend_buft_is_cuda_split(op->src[i]->buffer->buft)) {
3077
+ return false;
3078
+ }
3079
+ }
3080
+ }
3081
+
3082
+ // check if all the sources are allocated on this device
3083
+ for (int i = 0; i < GGML_MAX_SRC; i++) {
3084
+ if (op->src[i] && op->src[i]->buffer && ggml_backend_buft_is_cuda(op->src[i]->buffer->buft)) {
3085
+ ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)op->src[i]->buffer->buft->context;
3086
+ if (buft_ctx->device != dev_ctx->device) {
3087
+ return false;
3088
+ }
3089
+ }
3090
+ }
3091
+
3092
+ switch (op->op) {
3093
+ case GGML_OP_UNARY:
3094
+ switch (ggml_get_unary_op(op)) {
3095
+ case GGML_UNARY_OP_ABS:
3096
+ case GGML_UNARY_OP_SGN:
3097
+ case GGML_UNARY_OP_NEG:
3098
+ case GGML_UNARY_OP_STEP:
3099
+ case GGML_UNARY_OP_GELU:
3100
+ case GGML_UNARY_OP_SILU:
3101
+ case GGML_UNARY_OP_RELU:
3102
+ case GGML_UNARY_OP_SIGMOID:
3103
+ case GGML_UNARY_OP_HARDSIGMOID:
3104
+ case GGML_UNARY_OP_HARDSWISH:
3105
+ case GGML_UNARY_OP_GELU_ERF:
3106
+ case GGML_UNARY_OP_GELU_QUICK:
3107
+ case GGML_UNARY_OP_TANH:
3108
+ case GGML_UNARY_OP_EXP:
3109
+ return ggml_is_contiguous(op->src[0]);
3110
+ default:
3111
+ return false;
3112
+ }
3113
+ break;
3114
+ case GGML_OP_GLU:
3115
+ switch (ggml_get_glu_op(op)) {
3116
+ case GGML_GLU_OP_REGLU:
3117
+ case GGML_GLU_OP_GEGLU:
3118
+ case GGML_GLU_OP_SWIGLU:
3119
+ return ggml_is_contiguous_1(op->src[0]);
3120
+ default:
3121
+ return false;
3122
+ }
3123
+ break;
3124
+ case GGML_OP_MUL_MAT:
3125
+ case GGML_OP_MUL_MAT_ID:
3126
+ {
3127
+ struct ggml_tensor * a = op->src[0];
3128
+ struct ggml_tensor * b = op->src[1];
3129
+ if (a->buffer && ggml_backend_buft_is_cuda_split(a->buffer->buft)) {
3130
+ if (a->ne[2] > 1 || a->ne[3] > 1) {
3131
+ return false;
3132
+ }
3133
+ // for small weight matrices the active device can end up without any rows, don't use row split in those cases
3134
+ // this avoids some edge cases (and the performance would not be good anyways)
3135
+ ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) a->buffer->buft->context;
3136
+ int64_t row_low;
3137
+ int64_t row_high;
3138
+ get_row_split(&row_low, &row_high, a, buft_ctx->tensor_split, dev_ctx->device);
3139
+ if (row_low == row_high) {
3140
+ return false;
3141
+ }
3142
+ }
3143
+ if (b->type == GGML_TYPE_F16 && a->type != GGML_TYPE_F16) {
3144
+ return false;
3145
+ }
3146
+ #ifdef GGML_USE_MUSA
3147
+ const int cc = ggml_cuda_info().devices[dev_ctx->device].cc;
3148
+ if (b->ne[2]*b->ne[3] > 1 && !ggml_is_transposed(a) && !ggml_is_transposed(b)) {
3149
+ if (GGML_CUDA_CC_IS_QY1(cc) && op->op == GGML_OP_MUL_MAT &&
3150
+ a->type == GGML_TYPE_F16 && b->type == GGML_TYPE_F16) {
3151
+ return false;
3152
+ }
3153
+ if (GGML_CUDA_CC_IS_QY2(cc) && op->op == GGML_OP_MUL_MAT_ID &&
3154
+ a->type == GGML_TYPE_Q2_K && b->type == GGML_TYPE_F32) {
3155
+ return false;
3156
+ }
3157
+ }
3158
+ #endif // GGML_USE_MUSA
3159
+ switch (a->type) {
3160
+ case GGML_TYPE_F32:
3161
+ case GGML_TYPE_F16:
3162
+ case GGML_TYPE_Q4_0:
3163
+ case GGML_TYPE_Q4_1:
3164
+ case GGML_TYPE_Q5_0:
3165
+ case GGML_TYPE_Q5_1:
3166
+ case GGML_TYPE_Q8_0:
3167
+ case GGML_TYPE_Q2_K:
3168
+ case GGML_TYPE_Q3_K:
3169
+ case GGML_TYPE_Q4_K:
3170
+ case GGML_TYPE_Q5_K:
3171
+ case GGML_TYPE_Q6_K:
3172
+ case GGML_TYPE_Q8_K:
3173
+ case GGML_TYPE_IQ1_M:
3174
+ case GGML_TYPE_IQ1_S:
3175
+ case GGML_TYPE_IQ2_S:
3176
+ case GGML_TYPE_IQ2_XS:
3177
+ case GGML_TYPE_IQ2_XXS:
3178
+ case GGML_TYPE_IQ3_S:
3179
+ case GGML_TYPE_IQ3_XXS:
3180
+ case GGML_TYPE_IQ4_NL:
3181
+ case GGML_TYPE_IQ4_XS:
3182
+ case GGML_TYPE_BF16:
3183
+ return true;
3184
+ default:
3185
+ return false;
3186
+ }
3187
+ } break;
3188
+ case GGML_OP_OUT_PROD:
3189
+ return op->type == GGML_TYPE_F32 && op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32;
3190
+ case GGML_OP_GET_ROWS:
3191
+ {
3192
+ switch (op->src[0]->type) {
3193
+ case GGML_TYPE_F16:
3194
+ case GGML_TYPE_F32:
3195
+ case GGML_TYPE_Q4_0:
3196
+ case GGML_TYPE_Q4_1:
3197
+ case GGML_TYPE_Q5_0:
3198
+ case GGML_TYPE_Q5_1:
3199
+ case GGML_TYPE_Q8_0:
3200
+ return true;
3201
+ default:
3202
+ return false;
3203
+ }
3204
+ } break;
3205
+ case GGML_OP_GET_ROWS_BACK:
3206
+ {
3207
+ return op->type == GGML_TYPE_F32 && op->src[0]->type == GGML_TYPE_F32 && op->ne[2] == 1 && op->ne[3] == 1;
3208
+ } break;
3209
+ case GGML_OP_CPY:
3210
+ {
3211
+ ggml_type src0_type = op->src[0]->type;
3212
+ ggml_type src1_type = op->src[1]->type;
3213
+ if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) {
3214
+ return true;
3215
+ }
3216
+ if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_BF16) {
3217
+ return true;
3218
+ }
3219
+ if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F16) {
3220
+ return true;
3221
+ }
3222
+ if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q8_0) {
3223
+ return true;
3224
+ }
3225
+ if (src0_type == GGML_TYPE_Q8_0 && src1_type == GGML_TYPE_F32) {
3226
+ return true;
3227
+ }
3228
+ if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q4_0) {
3229
+ return true;
3230
+ }
3231
+ if (src0_type == GGML_TYPE_Q4_0 && src1_type == GGML_TYPE_F32) {
3232
+ return true;
3233
+ }
3234
+ if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q4_1) {
3235
+ return true;
3236
+ }
3237
+ if (src0_type == GGML_TYPE_Q4_1 && src1_type == GGML_TYPE_F32) {
3238
+ return true;
3239
+ }
3240
+ if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q5_0) {
3241
+ return true;
3242
+ }
3243
+ if (src0_type == GGML_TYPE_Q5_0 && src1_type == GGML_TYPE_F32) {
3244
+ return true;
3245
+ }
3246
+ if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q5_1) {
3247
+ return true;
3248
+ }
3249
+ if (src0_type == GGML_TYPE_Q5_1 && src1_type == GGML_TYPE_F32) {
3250
+ return true;
3251
+ }
3252
+ if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_IQ4_NL) {
3253
+ return true;
3254
+ }
3255
+ if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F16) {
3256
+ return true;
3257
+ }
3258
+ if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F32) {
3259
+ return true;
3260
+ }
3261
+ if (src0_type == src1_type && ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1])) {
3262
+ return true;
3263
+ }
3264
+ return false;
3265
+ } break;
3266
+ case GGML_OP_DUP:
3267
+ {
3268
+ ggml_type src0_type = op->src[0]->type;
3269
+ return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16;
3270
+ } break;
3271
+ case GGML_OP_ARGMAX:
3272
+ case GGML_OP_COUNT_EQUAL:
3273
+ {
3274
+ return true;
3275
+ } break;
3276
+ case GGML_OP_REPEAT:
3277
+ {
3278
+ ggml_type src0_type = op->src[0]->type;
3279
+ return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16;
3280
+ } break;
3281
+ case GGML_OP_REPEAT_BACK:
3282
+ return op->type == GGML_TYPE_F32 && (op->src[0]->ne[2]*op->src[0]->ne[3]) <= (1 << 15);
3283
+ case GGML_OP_CONCAT:
3284
+ {
3285
+ ggml_type src0_type = op->src[0]->type;
3286
+ return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16;
3287
+ } break;
3288
+ case GGML_OP_CONV_TRANSPOSE_1D:
3289
+ {
3290
+ ggml_type src0_type = op->src[0]->type;
3291
+ ggml_type src1_type = op->src[1]->type;
3292
+ if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) {
3293
+ return true;
3294
+ }
3295
+ return false;
3296
+ } break;
3297
+ case GGML_OP_SILU_BACK:
3298
+ return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
3299
+ break;
3300
+ case GGML_OP_NORM:
3301
+ case GGML_OP_RMS_NORM:
3302
+ case GGML_OP_L2_NORM:
3303
+ return true;
3304
+ case GGML_OP_RMS_NORM_BACK:
3305
+ return ggml_is_contiguous(op->src[0]) && op->ne[0] % WARP_SIZE == 0;
3306
+ break;
3307
+ case GGML_OP_NONE:
3308
+ case GGML_OP_RESHAPE:
3309
+ case GGML_OP_VIEW:
3310
+ case GGML_OP_PERMUTE:
3311
+ case GGML_OP_TRANSPOSE:
3312
+ case GGML_OP_ADD:
3313
+ case GGML_OP_ADD1:
3314
+ case GGML_OP_SUB:
3315
+ case GGML_OP_MUL:
3316
+ case GGML_OP_DIV:
3317
+ case GGML_OP_SCALE:
3318
+ case GGML_OP_SQR:
3319
+ case GGML_OP_SQRT:
3320
+ case GGML_OP_SIN:
3321
+ case GGML_OP_COS:
3322
+ case GGML_OP_CLAMP:
3323
+ case GGML_OP_LOG:
3324
+ case GGML_OP_SSM_SCAN:
3325
+ case GGML_OP_SSM_CONV:
3326
+ return true;
3327
+ case GGML_OP_CONT:
3328
+ return op->src[0]->type != GGML_TYPE_BF16;
3329
+ case GGML_OP_DIAG_MASK_INF:
3330
+ case GGML_OP_SOFT_MAX:
3331
+ return true;
3332
+ case GGML_OP_SOFT_MAX_BACK: {
3333
+ float max_bias = 0.0f;
3334
+ memcpy(&max_bias, (const float *) op->op_params + 1, sizeof(float));
3335
+ return max_bias == 0.0f;
3336
+ }
3337
+ case GGML_OP_ROPE:
3338
+ case GGML_OP_ROPE_BACK: {
3339
+ return op->src[0]->nb[0] == ggml_type_size(op->src[0]->type) && ggml_is_contiguous_2(op->src[0]);
3340
+ }
3341
+ case GGML_OP_IM2COL:
3342
+ case GGML_OP_CONV_2D_DW:
3343
+ case GGML_OP_CONV_TRANSPOSE_2D:
3344
+ case GGML_OP_POOL_2D:
3345
+ case GGML_OP_SUM:
3346
+ case GGML_OP_SUM_ROWS:
3347
+ case GGML_OP_MEAN:
3348
+ case GGML_OP_ARGSORT:
3349
+ case GGML_OP_ACC:
3350
+ return true;
3351
+ case GGML_OP_GROUP_NORM:
3352
+ return ggml_is_contiguous(op->src[0]);
3353
+ case GGML_OP_UPSCALE:
3354
+ return op->src[0]->type == GGML_TYPE_F32 && op->op_params[0] == GGML_SCALE_MODE_NEAREST;
3355
+ case GGML_OP_PAD:
3356
+ case GGML_OP_ARANGE:
3357
+ case GGML_OP_TIMESTEP_EMBEDDING:
3358
+ case GGML_OP_LEAKY_RELU:
3359
+ case GGML_OP_RWKV_WKV6:
3360
+ case GGML_OP_GATED_LINEAR_ATTN:
3361
+ case GGML_OP_RWKV_WKV7:
3362
+ return true;
3363
+ case GGML_OP_FLASH_ATTN_EXT: {
3364
+ #ifndef FLASH_ATTN_AVAILABLE
3365
+ return false;
3366
+ #endif // FLASH_ATTN_AVAILABLE
3367
+ if (op->src[1]->ne[0] != op->src[2]->ne[0]) {
3368
+ const int cc = ggml_cuda_info().devices[dev_ctx->device].cc;
3369
+ if (!new_mma_available(cc)) {
3370
+ return false;
3371
+ }
3372
+ const int gqa_ratio = op->src[0]->ne[2] / op->src[1]->ne[2];
3373
+ return op->src[1]->ne[0] == 576 && op->src[2]->ne[0] == 512 && op->src[3] && gqa_ratio % 16 == 0;
3374
+ }
3375
+ if (op->src[0]->ne[0] == 192) {
3376
+ return false;
3377
+ }
3378
+ if (op->src[0]->ne[3] != 1) {
3379
+ return false;
3380
+ }
3381
+ if (op->src[1]->type == GGML_TYPE_BF16 || op->src[2]->type == GGML_TYPE_BF16) {
3382
+ return false;
3383
+ }
3384
+ if (op->src[0]->ne[0] == 64 && op->src[1]->type == GGML_TYPE_F16) {
3385
+ return true;
3386
+ }
3387
+ if (op->src[0]->ne[0] == 128) {
3388
+ return true;
3389
+ }
3390
+ if (op->src[0]->ne[0] == 256 && op->src[1]->type == GGML_TYPE_F16 && op->src[2]->type == GGML_TYPE_F16) {
3391
+ return true;
3392
+ }
3393
+ return fp16_mma_available(ggml_cuda_info().devices[dev_ctx->device].cc) &&
3394
+ op->src[1]->type == GGML_TYPE_F16 && op->src[2]->type == GGML_TYPE_F16;
3395
+ }
3396
+ case GGML_OP_CROSS_ENTROPY_LOSS:
3397
+ case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
3398
+ case GGML_OP_OPT_STEP_ADAMW:
3399
+ return true;
3400
+ default:
3401
+ return false;
3402
+ }
3403
+ }
3404
+
3405
+ static bool ggml_backend_cuda_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
3406
+ ggml_backend_cuda_device_context * dev_ctx = (ggml_backend_cuda_device_context *) dev->context;
3407
+ const bool integrated = ggml_cuda_info().devices[dev_ctx->device].integrated;
3408
+ return (((ggml_backend_buft_is_cuda(buft) || ggml_backend_buft_is_cuda_split(buft)) && buft->device == dev) || (integrated && ggml_backend_buft_is_cuda_host(buft)));
3409
+ }
3410
+
3411
+ static int64_t get_op_batch_size(const ggml_tensor * op) {
3412
+ switch (op->op) {
3413
+ case GGML_OP_GET_ROWS:
3414
+ return 0;
3415
+ case GGML_OP_MUL_MAT:
3416
+ return op->ne[1];
3417
+ case GGML_OP_MUL_MAT_ID:
3418
+ case GGML_OP_ROPE:
3419
+ case GGML_OP_ROPE_BACK:
3420
+ return op->ne[2];
3421
+ default:
3422
+ return ggml_nrows(op);
3423
+ }
3424
+ }
3425
+
3426
+ static bool ggml_backend_cuda_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
3427
+ const int min_batch_size = 32;
3428
+
3429
+ return get_op_batch_size(op) >= min_batch_size;
3430
+
3431
+ GGML_UNUSED(dev);
3432
+ }
3433
+
3434
+ static ggml_backend_event_t ggml_backend_cuda_device_event_new(ggml_backend_dev_t dev) {
3435
+ #ifdef GGML_CUDA_NO_PEER_COPY
3436
+ return nullptr;
3437
+ #else
3438
+ ggml_backend_cuda_device_context * dev_ctx = (ggml_backend_cuda_device_context *)dev->context;
3439
+
3440
+ ggml_cuda_set_device(dev_ctx->device);
3441
+
3442
+ cudaEvent_t event;
3443
+ CUDA_CHECK(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
3444
+
3445
+ return new ggml_backend_event {
3446
+ /* .device = */ dev,
3447
+ /* .context = */ event,
3448
+ };
3449
+ #endif
3450
+ }
3451
+
3452
+ static void ggml_backend_cuda_device_event_free(ggml_backend_dev_t dev, ggml_backend_event_t event) {
3453
+ GGML_UNUSED(dev);
3454
+
3455
+ CUDA_CHECK(cudaEventDestroy((cudaEvent_t)event->context));
3456
+ delete event;
3457
+ }
3458
+
3459
+ static void ggml_backend_cuda_device_event_synchronize(ggml_backend_dev_t dev, ggml_backend_event_t event) {
3460
+ GGML_UNUSED(dev);
3461
+ CUDA_CHECK(cudaEventSynchronize((cudaEvent_t)event->context));
3462
+ }
3463
+
3464
+ static const ggml_backend_device_i ggml_backend_cuda_device_interface = {
3465
+ /* .get_name = */ ggml_backend_cuda_device_get_name,
3466
+ /* .get_description = */ ggml_backend_cuda_device_get_description,
3467
+ /* .get_memory = */ ggml_backend_cuda_device_get_memory,
3468
+ /* .get_type = */ ggml_backend_cuda_device_get_type,
3469
+ /* .get_props = */ ggml_backend_cuda_device_get_props,
3470
+ /* .init_backend = */ ggml_backend_cuda_device_init_backend,
3471
+ /* .get_buffer_type = */ ggml_backend_cuda_device_get_buffer_type,
3472
+ /* .get_host_buffer_type = */ ggml_backend_cuda_device_get_host_buffer_type,
3473
+ /* .buffer_from_host_ptr = */ NULL,
3474
+ /* .supports_op = */ ggml_backend_cuda_device_supports_op,
3475
+ /* .supports_buft = */ ggml_backend_cuda_device_supports_buft,
3476
+ /* .offload_op = */ ggml_backend_cuda_device_offload_op,
3477
+ /* .event_new = */ ggml_backend_cuda_device_event_new,
3478
+ /* .event_free = */ ggml_backend_cuda_device_event_free,
3479
+ /* .event_synchronize = */ ggml_backend_cuda_device_event_synchronize,
3480
+ };
3481
+
3482
+ // backend reg
3483
+
3484
+ struct ggml_backend_cuda_reg_context {
3485
+ std::vector<ggml_backend_dev_t> devices;
3486
+ };
3487
+
3488
+ static const char * ggml_backend_cuda_reg_get_name(ggml_backend_reg_t reg) {
3489
+ GGML_UNUSED(reg);
3490
+ return GGML_CUDA_NAME;
3491
+ }
3492
+
3493
+ static size_t ggml_backend_cuda_reg_get_device_count(ggml_backend_reg_t reg) {
3494
+ ggml_backend_cuda_reg_context * ctx = (ggml_backend_cuda_reg_context *)reg->context;
3495
+ return ctx->devices.size();
3496
+ }
3497
+
3498
+ static ggml_backend_dev_t ggml_backend_cuda_reg_get_device(ggml_backend_reg_t reg, size_t index) {
3499
+ ggml_backend_cuda_reg_context * ctx = (ggml_backend_cuda_reg_context *)reg->context;
3500
+ GGML_ASSERT(index < ctx->devices.size());
3501
+ return ctx->devices[index];
3502
+ }
3503
+
3504
+ static ggml_backend_feature * ggml_backend_cuda_get_features(ggml_backend_reg_t reg) {
3505
+ static std::vector<ggml_backend_feature> features = []() {
3506
+ std::vector<ggml_backend_feature> features;
3507
+ #define _STRINGIFY(...) #__VA_ARGS__
3508
+ #define STRINGIFY(...) _STRINGIFY(__VA_ARGS__)
3509
+
3510
+ #ifdef __CUDA_ARCH_LIST__
3511
+ features.push_back({ "ARCHS", STRINGIFY(__CUDA_ARCH_LIST__) });
3512
+ #endif
3513
+
3514
+ #ifdef GGML_CUDA_FORCE_MMQ
3515
+ features.push_back({ "FORCE_MMQ", "1" });
3516
+ #endif
3517
+
3518
+ #ifdef GGML_CUDA_FORCE_CUBLAS
3519
+ features.push_back({ "FORCE_CUBLAS", "1" });
3520
+ #endif
3521
+
3522
+ #ifndef GGML_USE_VMM
3523
+ features.push_back({ "NO_VMM", "1" });
3524
+ #endif
3525
+
3526
+ #ifdef GGML_CUDA_NO_PEER_COPY
3527
+ features.push_back({ "NO_PEER_COPY", "1" });
3528
+ #endif
3529
+
3530
+ #ifdef GGML_CUDA_F16
3531
+ features.push_back({ "F16", "1" });
3532
+ #endif
3533
+
3534
+ #ifdef GGML_CUDA_USE_GRAPHS
3535
+ features.push_back({ "USE_GRAPHS", "1" });
3536
+ #endif
3537
+
3538
+ #ifdef GGML_CUDA_PEER_MAX_BATCH_SIZE
3539
+ features.push_back({ "PEER_MAX_BATCH_SIZE", STRINGIFY(GGML_CUDA_PEER_MAX_BATCH_SIZE) });
3540
+ #endif
3541
+
3542
+ #ifdef GGML_CUDA_FA_ALL_QUANTS
3543
+ features.push_back({ "FA_ALL_QUANTS", "1" });
3544
+ #endif
3545
+
3546
+ #undef _STRINGIFY
3547
+ #undef STRINGIFY
3548
+
3549
+ features.push_back({ nullptr, nullptr });
3550
+
3551
+ return features;
3552
+ }();
3553
+
3554
+ return features.data();
3555
+
3556
+ GGML_UNUSED(reg);
3557
+ }
3558
+
3559
+ static void * ggml_backend_cuda_reg_get_proc_address(ggml_backend_reg_t reg, const char * name) {
3560
+ GGML_UNUSED(reg);
3561
+ if (strcmp(name, "ggml_backend_split_buffer_type") == 0) {
3562
+ return (void *)ggml_backend_cuda_split_buffer_type;
3563
+ }
3564
+ if (strcmp(name, "ggml_backend_register_host_buffer") == 0) {
3565
+ return (void *)ggml_backend_cuda_register_host_buffer;
3566
+ }
3567
+ if (strcmp(name, "ggml_backend_unregister_host_buffer") == 0) {
3568
+ return (void *)ggml_backend_cuda_unregister_host_buffer;
3569
+ }
3570
+ if (strcmp(name, "ggml_backend_get_features") == 0) {
3571
+ return (void *)ggml_backend_cuda_get_features;
3572
+ }
3573
+ return nullptr;
3574
+ }
3575
+
3576
+ static const ggml_backend_reg_i ggml_backend_cuda_reg_interface = {
3577
+ /* .get_name = */ ggml_backend_cuda_reg_get_name,
3578
+ /* .get_device_count = */ ggml_backend_cuda_reg_get_device_count,
3579
+ /* .get_device = */ ggml_backend_cuda_reg_get_device,
3580
+ /* .get_proc_address = */ ggml_backend_cuda_reg_get_proc_address,
3581
+ };
3582
+
3583
+ // backend registry
3584
+ ggml_backend_reg_t ggml_backend_cuda_reg() {
3585
+ static ggml_backend_reg reg;
3586
+ static bool initialized = false;
3587
+
3588
+ {
3589
+ static std::mutex mutex;
3590
+ std::lock_guard<std::mutex> lock(mutex);
3591
+ if (!initialized) {
3592
+ ggml_backend_cuda_reg_context * ctx = new ggml_backend_cuda_reg_context;
3593
+
3594
+ for (int i = 0; i < ggml_cuda_info().device_count; i++) {
3595
+ ggml_backend_cuda_device_context * dev_ctx = new ggml_backend_cuda_device_context;
3596
+ dev_ctx->device = i;
3597
+ dev_ctx->name = GGML_CUDA_NAME + std::to_string(i);
3598
+
3599
+ ggml_cuda_set_device(i);
3600
+ cudaDeviceProp prop;
3601
+ CUDA_CHECK(cudaGetDeviceProperties(&prop, i));
3602
+ dev_ctx->description = prop.name;
3603
+
3604
+ ggml_backend_dev_t dev = new ggml_backend_device {
3605
+ /* .iface = */ ggml_backend_cuda_device_interface,
3606
+ /* .reg = */ &reg,
3607
+ /* .context = */ dev_ctx
3608
+ };
3609
+ ctx->devices.push_back(dev);
3610
+ }
3611
+
3612
+ reg = ggml_backend_reg {
3613
+ /* .api_version = */ GGML_BACKEND_API_VERSION,
3614
+ /* .iface = */ ggml_backend_cuda_reg_interface,
3615
+ /* .context = */ ctx
3616
+ };
3617
+ }
3618
+
3619
+ initialized = true;
3620
+ }
3621
+
3622
+ return &reg;
3623
+ }
3624
+
3625
+ ggml_backend_t ggml_backend_cuda_init(int device) {
3626
+ if (device < 0 || device >= ggml_backend_cuda_get_device_count()) {
3627
+ GGML_LOG_ERROR("%s: invalid device %d\n", __func__, device);
3628
+ return nullptr;
3629
+ }
3630
+
3631
+ ggml_backend_cuda_context * ctx = new ggml_backend_cuda_context(device);
3632
+ if (ctx == nullptr) {
3633
+ GGML_LOG_ERROR("%s: failed to allocate context\n", __func__);
3634
+ return nullptr;
3635
+ }
3636
+
3637
+ ggml_backend_t cuda_backend = new ggml_backend {
3638
+ /* .guid = */ ggml_backend_cuda_guid(),
3639
+ /* .interface = */ ggml_backend_cuda_interface,
3640
+ /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cuda_reg(), device),
3641
+ /* .context = */ ctx,
3642
+ };
3643
+
3644
+ return cuda_backend;
3645
+ }
3646
+
3647
// Passes the CUDA registration function to GGML_BACKEND_DL_IMPL.
// NOTE(review): the macro is defined outside this chunk; presumably it emits the entry
// points used when this backend is built as a dynamically loaded library — confirm
// against the macro's definition.
+ GGML_BACKEND_DL_IMPL(ggml_backend_cuda_reg)