whispercpp 1.3.1 → 1.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (797) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +4 -3
  3. data/README.md +92 -31
  4. data/Rakefile +26 -7
  5. data/ext/.gitignore +5 -7
  6. data/ext/dependencies.rb +61 -0
  7. data/ext/extconf.rb +21 -198
  8. data/ext/options.rb +221 -0
  9. data/ext/ruby_whisper.c +159 -0
  10. data/ext/ruby_whisper.h +17 -2
  11. data/ext/ruby_whisper_context.c +641 -0
  12. data/ext/ruby_whisper_error.c +52 -0
  13. data/ext/ruby_whisper_model.c +232 -0
  14. data/ext/ruby_whisper_params.c +1301 -0
  15. data/ext/ruby_whisper_segment.c +143 -0
  16. data/ext/ruby_whisper_transcribe.cpp +87 -0
  17. data/ext/ruby_whisper_vad_params.c +288 -0
  18. data/ext/sources/.dockerignore +3 -0
  19. data/ext/sources/.github/workflows/bindings-ruby.yml +21 -0
  20. data/ext/sources/CMakeGraphVizOptions.cmake +8 -0
  21. data/ext/sources/CMakeLists.txt +251 -0
  22. data/ext/sources/bindings/javascript/CMakeLists.txt +41 -0
  23. data/ext/sources/bindings/javascript/emscripten.cpp +93 -0
  24. data/ext/sources/bindings/javascript/libwhisper.worker.js +1 -0
  25. data/ext/sources/bindings/javascript/package-tmpl.json +26 -0
  26. data/ext/sources/bindings/javascript/package.json +26 -0
  27. data/ext/sources/bindings/javascript/whisper.js +19 -0
  28. data/ext/sources/build-xcframework.sh +547 -0
  29. data/ext/sources/ci/run.sh +336 -0
  30. data/ext/sources/close-issue.yml +28 -0
  31. data/ext/sources/cmake/DefaultTargetOptions.cmake +16 -0
  32. data/ext/sources/cmake/FindFFmpeg.cmake +163 -0
  33. data/ext/sources/cmake/build-info.cmake +60 -0
  34. data/ext/sources/cmake/git-vars.cmake +22 -0
  35. data/ext/sources/cmake/whisper-config.cmake.in +65 -0
  36. data/ext/sources/cmake/whisper.pc.in +10 -0
  37. data/ext/sources/examples/CMakeLists.txt +124 -0
  38. data/ext/sources/examples/addon.node/CMakeLists.txt +31 -0
  39. data/ext/sources/examples/addon.node/__test__/whisper.spec.js +37 -0
  40. data/ext/sources/examples/addon.node/addon.cpp +438 -0
  41. data/ext/sources/examples/addon.node/index.js +54 -0
  42. data/ext/sources/examples/addon.node/package.json +16 -0
  43. data/ext/sources/examples/bench/CMakeLists.txt +8 -0
  44. data/ext/sources/examples/bench/bench.cpp +175 -0
  45. data/ext/sources/examples/bench.wasm/CMakeLists.txt +49 -0
  46. data/ext/sources/examples/bench.wasm/emscripten.cpp +87 -0
  47. data/ext/sources/examples/bench.wasm/index-tmpl.html +284 -0
  48. data/ext/sources/examples/cli/CMakeLists.txt +8 -0
  49. data/ext/sources/examples/cli/cli.cpp +1294 -0
  50. data/ext/sources/examples/coi-serviceworker.js +146 -0
  51. data/ext/sources/examples/command/CMakeLists.txt +10 -0
  52. data/ext/sources/examples/command/command.cpp +776 -0
  53. data/ext/sources/examples/command/commands.txt +9 -0
  54. data/ext/sources/examples/command.wasm/CMakeLists.txt +50 -0
  55. data/ext/sources/examples/command.wasm/emscripten.cpp +327 -0
  56. data/ext/sources/examples/command.wasm/index-tmpl.html +414 -0
  57. data/ext/sources/examples/common-ggml.cpp +238 -0
  58. data/ext/sources/examples/common-ggml.h +18 -0
  59. data/ext/sources/examples/common-sdl.cpp +227 -0
  60. data/ext/sources/examples/common-sdl.h +49 -0
  61. data/ext/sources/examples/common-whisper.cpp +168 -0
  62. data/ext/sources/examples/common-whisper.h +24 -0
  63. data/ext/sources/examples/common.cpp +675 -0
  64. data/ext/sources/examples/common.h +322 -0
  65. data/ext/sources/examples/deprecation-warning/CMakeLists.txt +6 -0
  66. data/ext/sources/examples/deprecation-warning/deprecation-warning.cpp +38 -0
  67. data/ext/sources/examples/ffmpeg-transcode.cpp +368 -0
  68. data/ext/sources/examples/generate-karaoke.sh +57 -0
  69. data/ext/sources/examples/grammar-parser.cpp +423 -0
  70. data/ext/sources/examples/grammar-parser.h +29 -0
  71. data/ext/sources/examples/helpers.js +191 -0
  72. data/ext/sources/examples/json.hpp +24596 -0
  73. data/ext/sources/examples/livestream.sh +112 -0
  74. data/ext/sources/examples/lsp/CMakeLists.txt +9 -0
  75. data/ext/sources/examples/lsp/lsp.cpp +467 -0
  76. data/ext/sources/examples/lsp/whisper.vim +362 -0
  77. data/ext/sources/examples/miniaudio.h +93468 -0
  78. data/ext/sources/examples/python/test_whisper_processor.py +7 -0
  79. data/ext/sources/examples/python/whisper_processor.py +54 -0
  80. data/ext/sources/examples/quantize/CMakeLists.txt +6 -0
  81. data/ext/sources/examples/quantize/quantize.cpp +223 -0
  82. data/ext/sources/examples/server/CMakeLists.txt +12 -0
  83. data/ext/sources/examples/server/bench.js +29 -0
  84. data/ext/sources/examples/server/httplib.h +10497 -0
  85. data/ext/sources/examples/server/server.cpp +1091 -0
  86. data/ext/sources/examples/server.py +115 -0
  87. data/ext/sources/examples/stb_vorbis.c +5584 -0
  88. data/ext/sources/examples/stream/CMakeLists.txt +10 -0
  89. data/ext/sources/examples/stream/stream.cpp +429 -0
  90. data/ext/sources/examples/stream.wasm/CMakeLists.txt +49 -0
  91. data/ext/sources/examples/stream.wasm/emscripten.cpp +216 -0
  92. data/ext/sources/examples/stream.wasm/index-tmpl.html +414 -0
  93. data/ext/sources/examples/sycl/CMakeLists.txt +9 -0
  94. data/ext/sources/examples/sycl/build.sh +22 -0
  95. data/ext/sources/examples/sycl/ls-sycl-device.cpp +11 -0
  96. data/ext/sources/examples/sycl/run-whisper.sh +17 -0
  97. data/ext/sources/examples/talk-llama/CMakeLists.txt +40 -0
  98. data/ext/sources/examples/talk-llama/eleven-labs.py +80 -0
  99. data/ext/sources/examples/talk-llama/llama-adapter.cpp +388 -0
  100. data/ext/sources/examples/talk-llama/llama-adapter.h +76 -0
  101. data/ext/sources/examples/talk-llama/llama-arch.cpp +1746 -0
  102. data/ext/sources/examples/talk-llama/llama-arch.h +437 -0
  103. data/ext/sources/examples/talk-llama/llama-batch.cpp +374 -0
  104. data/ext/sources/examples/talk-llama/llama-batch.h +89 -0
  105. data/ext/sources/examples/talk-llama/llama-chat.cpp +663 -0
  106. data/ext/sources/examples/talk-llama/llama-chat.h +58 -0
  107. data/ext/sources/examples/talk-llama/llama-context.cpp +2676 -0
  108. data/ext/sources/examples/talk-llama/llama-context.h +276 -0
  109. data/ext/sources/examples/talk-llama/llama-cparams.cpp +5 -0
  110. data/ext/sources/examples/talk-llama/llama-cparams.h +41 -0
  111. data/ext/sources/examples/talk-llama/llama-grammar.cpp +1229 -0
  112. data/ext/sources/examples/talk-llama/llama-grammar.h +173 -0
  113. data/ext/sources/examples/talk-llama/llama-graph.cpp +1618 -0
  114. data/ext/sources/examples/talk-llama/llama-graph.h +640 -0
  115. data/ext/sources/examples/talk-llama/llama-hparams.cpp +95 -0
  116. data/ext/sources/examples/talk-llama/llama-hparams.h +190 -0
  117. data/ext/sources/examples/talk-llama/llama-impl.cpp +167 -0
  118. data/ext/sources/examples/talk-llama/llama-impl.h +61 -0
  119. data/ext/sources/examples/talk-llama/llama-io.cpp +15 -0
  120. data/ext/sources/examples/talk-llama/llama-io.h +35 -0
  121. data/ext/sources/examples/talk-llama/llama-kv-cache.cpp +2739 -0
  122. data/ext/sources/examples/talk-llama/llama-kv-cache.h +502 -0
  123. data/ext/sources/examples/talk-llama/llama-kv-cells.h +379 -0
  124. data/ext/sources/examples/talk-llama/llama-memory.cpp +1 -0
  125. data/ext/sources/examples/talk-llama/llama-memory.h +32 -0
  126. data/ext/sources/examples/talk-llama/llama-mmap.cpp +600 -0
  127. data/ext/sources/examples/talk-llama/llama-mmap.h +68 -0
  128. data/ext/sources/examples/talk-llama/llama-model-loader.cpp +1138 -0
  129. data/ext/sources/examples/talk-llama/llama-model-loader.h +169 -0
  130. data/ext/sources/examples/talk-llama/llama-model-saver.cpp +281 -0
  131. data/ext/sources/examples/talk-llama/llama-model-saver.h +37 -0
  132. data/ext/sources/examples/talk-llama/llama-model.cpp +13814 -0
  133. data/ext/sources/examples/talk-llama/llama-model.h +425 -0
  134. data/ext/sources/examples/talk-llama/llama-quant.cpp +966 -0
  135. data/ext/sources/examples/talk-llama/llama-quant.h +1 -0
  136. data/ext/sources/examples/talk-llama/llama-sampling.cpp +2575 -0
  137. data/ext/sources/examples/talk-llama/llama-sampling.h +32 -0
  138. data/ext/sources/examples/talk-llama/llama-vocab.cpp +3340 -0
  139. data/ext/sources/examples/talk-llama/llama-vocab.h +131 -0
  140. data/ext/sources/examples/talk-llama/llama.cpp +354 -0
  141. data/ext/sources/examples/talk-llama/llama.h +1377 -0
  142. data/ext/sources/examples/talk-llama/prompts/talk-alpaca.txt +23 -0
  143. data/ext/sources/examples/talk-llama/speak +40 -0
  144. data/ext/sources/examples/talk-llama/speak.bat +1 -0
  145. data/ext/sources/examples/talk-llama/speak.ps1 +14 -0
  146. data/ext/sources/examples/talk-llama/talk-llama.cpp +808 -0
  147. data/ext/sources/examples/talk-llama/unicode-data.cpp +7034 -0
  148. data/ext/sources/examples/talk-llama/unicode-data.h +20 -0
  149. data/ext/sources/examples/talk-llama/unicode.cpp +849 -0
  150. data/ext/sources/examples/talk-llama/unicode.h +66 -0
  151. data/ext/sources/examples/vad-speech-segments/CMakeLists.txt +8 -0
  152. data/ext/sources/examples/vad-speech-segments/speech.cpp +143 -0
  153. data/ext/sources/examples/wchess/CMakeLists.txt +10 -0
  154. data/ext/sources/examples/wchess/libwchess/CMakeLists.txt +19 -0
  155. data/ext/sources/examples/wchess/libwchess/Chessboard.cpp +803 -0
  156. data/ext/sources/examples/wchess/libwchess/Chessboard.h +33 -0
  157. data/ext/sources/examples/wchess/libwchess/WChess.cpp +193 -0
  158. data/ext/sources/examples/wchess/libwchess/WChess.h +63 -0
  159. data/ext/sources/examples/wchess/libwchess/test-chessboard.cpp +117 -0
  160. data/ext/sources/examples/wchess/wchess.cmd/CMakeLists.txt +8 -0
  161. data/ext/sources/examples/wchess/wchess.cmd/wchess.cmd.cpp +249 -0
  162. data/ext/sources/examples/whisper.wasm/CMakeLists.txt +50 -0
  163. data/ext/sources/examples/whisper.wasm/emscripten.cpp +118 -0
  164. data/ext/sources/examples/whisper.wasm/index-tmpl.html +658 -0
  165. data/ext/sources/ggml/CMakeLists.txt +390 -0
  166. data/ext/sources/ggml/cmake/BuildTypes.cmake +54 -0
  167. data/ext/sources/ggml/cmake/GitVars.cmake +22 -0
  168. data/ext/sources/ggml/cmake/common.cmake +26 -0
  169. data/ext/sources/ggml/cmake/ggml-config.cmake.in +152 -0
  170. data/ext/{ggml → sources/ggml}/include/ggml-alloc.h +1 -1
  171. data/ext/{ggml → sources/ggml}/include/ggml-backend.h +9 -7
  172. data/ext/{ggml → sources/ggml}/include/ggml-cpp.h +2 -1
  173. data/ext/{ggml → sources/ggml}/include/ggml-cpu.h +9 -1
  174. data/ext/{ggml → sources/ggml}/include/ggml-metal.h +1 -1
  175. data/ext/{ggml → sources/ggml}/include/ggml-opt.h +49 -28
  176. data/ext/{ggml → sources/ggml}/include/ggml-rpc.h +6 -1
  177. data/ext/{ggml → sources/ggml}/include/ggml-vulkan.h +0 -2
  178. data/ext/{ggml → sources/ggml}/include/ggml.h +182 -265
  179. data/ext/sources/ggml/include/gguf.h +202 -0
  180. data/ext/sources/ggml/src/CMakeLists.txt +346 -0
  181. data/ext/{ggml → sources/ggml}/src/ggml-alloc.c +34 -29
  182. data/ext/sources/ggml/src/ggml-amx/CMakeLists.txt +107 -0
  183. data/ext/{ggml → sources/ggml}/src/ggml-backend-impl.h +1 -2
  184. data/ext/{ggml → sources/ggml}/src/ggml-backend-reg.cpp +87 -53
  185. data/ext/{ggml → sources/ggml}/src/ggml-backend.cpp +26 -14
  186. data/ext/sources/ggml/src/ggml-blas/CMakeLists.txt +87 -0
  187. data/ext/sources/ggml/src/ggml-cann/CMakeLists.txt +74 -0
  188. data/ext/sources/ggml/src/ggml-cann/Doxyfile +2579 -0
  189. data/ext/{ggml → sources/ggml}/src/ggml-cann/acl_tensor.cpp +10 -4
  190. data/ext/{ggml → sources/ggml}/src/ggml-cann/acl_tensor.h +5 -5
  191. data/ext/{ggml → sources/ggml}/src/ggml-cann/aclnn_ops.cpp +1272 -1506
  192. data/ext/sources/ggml/src/ggml-cann/aclnn_ops.h +1125 -0
  193. data/ext/{ggml → sources/ggml}/src/ggml-cann/common.h +135 -1
  194. data/ext/{ggml → sources/ggml}/src/ggml-cann/ggml-cann.cpp +564 -146
  195. data/ext/sources/ggml/src/ggml-cann/kernels/CMakeLists.txt +30 -0
  196. data/ext/{ggml → sources/ggml}/src/ggml-cann/kernels/dup.cpp +3 -5
  197. data/ext/{ggml → sources/ggml}/src/ggml-common.h +12 -8
  198. data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +504 -0
  199. data/ext/{ggml → sources/ggml}/src/ggml-cpu/amx/amx.cpp +2 -1
  200. data/ext/sources/ggml/src/ggml-cpu/binary-ops.cpp +158 -0
  201. data/ext/sources/ggml/src/ggml-cpu/binary-ops.h +16 -0
  202. data/ext/sources/ggml/src/ggml-cpu/cmake/FindSIMD.cmake +100 -0
  203. data/ext/sources/ggml/src/ggml-cpu/common.h +72 -0
  204. data/ext/{ggml → sources/ggml}/src/ggml-cpu/cpu-feats-x86.cpp +5 -1
  205. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +6431 -0
  206. data/ext/{ggml → sources/ggml}/src/ggml-cpu/ggml-cpu-impl.h +163 -41
  207. data/ext/{ggml → sources/ggml}/src/ggml-cpu/ggml-cpu-quants.c +4029 -1117
  208. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +3510 -0
  209. data/ext/{ggml → sources/ggml}/src/ggml-cpu/ggml-cpu.cpp +67 -18
  210. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.cpp +337 -0
  211. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.h +95 -0
  212. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +482 -0
  213. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.h +17 -0
  214. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +3544 -0
  215. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.h +14 -0
  216. data/ext/sources/ggml/src/ggml-cpu/ops.cpp +8903 -0
  217. data/ext/sources/ggml/src/ggml-cpu/ops.h +110 -0
  218. data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +892 -0
  219. data/ext/sources/ggml/src/ggml-cpu/unary-ops.cpp +186 -0
  220. data/ext/sources/ggml/src/ggml-cpu/unary-ops.h +28 -0
  221. data/ext/sources/ggml/src/ggml-cpu/vec.cpp +252 -0
  222. data/ext/sources/ggml/src/ggml-cpu/vec.h +818 -0
  223. data/ext/sources/ggml/src/ggml-cuda/CMakeLists.txt +184 -0
  224. data/ext/sources/ggml/src/ggml-cuda/acc.cu +61 -0
  225. data/ext/sources/ggml/src/ggml-cuda/acc.cuh +5 -0
  226. data/ext/sources/ggml/src/ggml-cuda/arange.cu +34 -0
  227. data/ext/sources/ggml/src/ggml-cuda/arange.cuh +5 -0
  228. data/ext/sources/ggml/src/ggml-cuda/argmax.cu +91 -0
  229. data/ext/sources/ggml/src/ggml-cuda/argmax.cuh +3 -0
  230. data/ext/sources/ggml/src/ggml-cuda/argsort.cu +104 -0
  231. data/ext/sources/ggml/src/ggml-cuda/argsort.cuh +3 -0
  232. data/ext/sources/ggml/src/ggml-cuda/binbcast.cu +363 -0
  233. data/ext/sources/ggml/src/ggml-cuda/binbcast.cuh +9 -0
  234. data/ext/sources/ggml/src/ggml-cuda/clamp.cu +45 -0
  235. data/ext/sources/ggml/src/ggml-cuda/clamp.cuh +5 -0
  236. data/ext/sources/ggml/src/ggml-cuda/common.cuh +828 -0
  237. data/ext/sources/ggml/src/ggml-cuda/concat.cu +221 -0
  238. data/ext/sources/ggml/src/ggml-cuda/concat.cuh +5 -0
  239. data/ext/sources/ggml/src/ggml-cuda/conv-transpose-1d.cu +89 -0
  240. data/ext/sources/ggml/src/ggml-cuda/conv-transpose-1d.cuh +5 -0
  241. data/ext/sources/ggml/src/ggml-cuda/convert.cu +730 -0
  242. data/ext/sources/ggml/src/ggml-cuda/convert.cuh +26 -0
  243. data/ext/sources/ggml/src/ggml-cuda/count-equal.cu +64 -0
  244. data/ext/sources/ggml/src/ggml-cuda/count-equal.cuh +5 -0
  245. data/ext/sources/ggml/src/ggml-cuda/cp-async.cuh +57 -0
  246. data/ext/sources/ggml/src/ggml-cuda/cpy.cu +705 -0
  247. data/ext/sources/ggml/src/ggml-cuda/cpy.cuh +11 -0
  248. data/ext/sources/ggml/src/ggml-cuda/cross-entropy-loss.cu +189 -0
  249. data/ext/sources/ggml/src/ggml-cuda/cross-entropy-loss.cuh +7 -0
  250. data/ext/sources/ggml/src/ggml-cuda/dequantize.cuh +103 -0
  251. data/ext/sources/ggml/src/ggml-cuda/diagmask.cu +40 -0
  252. data/ext/sources/ggml/src/ggml-cuda/diagmask.cuh +5 -0
  253. data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +881 -0
  254. data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +1471 -0
  255. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f16.cu +357 -0
  256. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f16.cuh +3 -0
  257. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f32.cu +365 -0
  258. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f32.cuh +3 -0
  259. data/ext/sources/ggml/src/ggml-cuda/fattn-vec-f16.cuh +482 -0
  260. data/ext/sources/ggml/src/ggml-cuda/fattn-vec-f32.cuh +472 -0
  261. data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +634 -0
  262. data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cuh +3 -0
  263. data/ext/sources/ggml/src/ggml-cuda/fattn.cu +346 -0
  264. data/ext/sources/ggml/src/ggml-cuda/fattn.cuh +3 -0
  265. data/ext/sources/ggml/src/ggml-cuda/getrows.cu +275 -0
  266. data/ext/sources/ggml/src/ggml-cuda/getrows.cuh +15 -0
  267. data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +3505 -0
  268. data/ext/sources/ggml/src/ggml-cuda/gla.cu +93 -0
  269. data/ext/sources/ggml/src/ggml-cuda/gla.cuh +3 -0
  270. data/ext/sources/ggml/src/ggml-cuda/im2col.cu +103 -0
  271. data/ext/sources/ggml/src/ggml-cuda/im2col.cuh +5 -0
  272. data/ext/sources/ggml/src/ggml-cuda/mma.cuh +396 -0
  273. data/ext/sources/ggml/src/ggml-cuda/mmq.cu +324 -0
  274. data/ext/sources/ggml/src/ggml-cuda/mmq.cuh +3217 -0
  275. data/ext/sources/ggml/src/ggml-cuda/mmv.cu +336 -0
  276. data/ext/sources/ggml/src/ggml-cuda/mmv.cuh +12 -0
  277. data/ext/sources/ggml/src/ggml-cuda/mmvq.cu +595 -0
  278. data/ext/sources/ggml/src/ggml-cuda/mmvq.cuh +12 -0
  279. data/ext/sources/ggml/src/ggml-cuda/norm.cu +458 -0
  280. data/ext/sources/ggml/src/ggml-cuda/norm.cuh +11 -0
  281. data/ext/sources/ggml/src/ggml-cuda/opt-step-adamw.cu +78 -0
  282. data/ext/sources/ggml/src/ggml-cuda/opt-step-adamw.cuh +5 -0
  283. data/ext/sources/ggml/src/ggml-cuda/out-prod.cu +68 -0
  284. data/ext/sources/ggml/src/ggml-cuda/out-prod.cuh +3 -0
  285. data/ext/sources/ggml/src/ggml-cuda/pad.cu +49 -0
  286. data/ext/sources/ggml/src/ggml-cuda/pad.cuh +5 -0
  287. data/ext/sources/ggml/src/ggml-cuda/pool2d.cu +94 -0
  288. data/ext/sources/ggml/src/ggml-cuda/pool2d.cuh +5 -0
  289. data/ext/sources/ggml/src/ggml-cuda/quantize.cu +190 -0
  290. data/ext/sources/ggml/src/ggml-cuda/quantize.cuh +27 -0
  291. data/ext/sources/ggml/src/ggml-cuda/rope.cu +456 -0
  292. data/ext/sources/ggml/src/ggml-cuda/rope.cuh +7 -0
  293. data/ext/sources/ggml/src/ggml-cuda/scale.cu +31 -0
  294. data/ext/sources/ggml/src/ggml-cuda/scale.cuh +5 -0
  295. data/ext/sources/ggml/src/ggml-cuda/softmax.cu +283 -0
  296. data/ext/sources/ggml/src/ggml-cuda/softmax.cuh +7 -0
  297. data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cu +148 -0
  298. data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cuh +3 -0
  299. data/ext/sources/ggml/src/ggml-cuda/ssm-scan.cu +153 -0
  300. data/ext/sources/ggml/src/ggml-cuda/ssm-scan.cuh +3 -0
  301. data/ext/sources/ggml/src/ggml-cuda/sum.cu +45 -0
  302. data/ext/sources/ggml/src/ggml-cuda/sum.cuh +5 -0
  303. data/ext/sources/ggml/src/ggml-cuda/sumrows.cu +39 -0
  304. data/ext/sources/ggml/src/ggml-cuda/sumrows.cuh +5 -0
  305. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_16.cu +5 -0
  306. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_8.cu +10 -0
  307. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_1.cu +10 -0
  308. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_2.cu +10 -0
  309. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_4.cu +10 -0
  310. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_16.cu +5 -0
  311. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_4.cu +10 -0
  312. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_8.cu +10 -0
  313. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_32-ncols2_1.cu +10 -0
  314. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_32-ncols2_2.cu +10 -0
  315. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_16.cu +5 -0
  316. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_2.cu +10 -0
  317. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_4.cu +10 -0
  318. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_8.cu +10 -0
  319. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_64-ncols2_1.cu +10 -0
  320. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_1.cu +10 -0
  321. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_2.cu +10 -0
  322. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_4.cu +10 -0
  323. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_8.cu +10 -0
  324. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +5 -0
  325. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +5 -0
  326. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +5 -0
  327. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +5 -0
  328. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +5 -0
  329. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +5 -0
  330. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +5 -0
  331. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +5 -0
  332. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +5 -0
  333. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +5 -0
  334. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +5 -0
  335. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +5 -0
  336. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +5 -0
  337. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +5 -0
  338. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +5 -0
  339. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +5 -0
  340. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +5 -0
  341. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +5 -0
  342. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +5 -0
  343. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +5 -0
  344. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +5 -0
  345. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +5 -0
  346. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +5 -0
  347. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +5 -0
  348. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +5 -0
  349. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +5 -0
  350. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +5 -0
  351. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +5 -0
  352. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +5 -0
  353. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +5 -0
  354. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +5 -0
  355. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +5 -0
  356. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +5 -0
  357. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +5 -0
  358. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +5 -0
  359. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +5 -0
  360. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +5 -0
  361. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +5 -0
  362. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +5 -0
  363. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +5 -0
  364. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +5 -0
  365. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +5 -0
  366. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +5 -0
  367. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +5 -0
  368. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +5 -0
  369. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +5 -0
  370. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +5 -0
  371. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +5 -0
  372. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +5 -0
  373. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +5 -0
  374. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +5 -0
  375. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +5 -0
  376. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +5 -0
  377. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +5 -0
  378. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +5 -0
  379. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +5 -0
  380. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +5 -0
  381. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +5 -0
  382. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +5 -0
  383. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +5 -0
  384. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +5 -0
  385. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +5 -0
  386. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +5 -0
  387. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +5 -0
  388. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +5 -0
  389. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +5 -0
  390. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +5 -0
  391. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +5 -0
  392. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +5 -0
  393. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +5 -0
  394. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +5 -0
  395. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +5 -0
  396. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +5 -0
  397. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +5 -0
  398. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +5 -0
  399. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +5 -0
  400. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +5 -0
  401. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +5 -0
  402. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +5 -0
  403. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +5 -0
  404. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +5 -0
  405. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +5 -0
  406. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +5 -0
  407. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +5 -0
  408. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +5 -0
  409. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +5 -0
  410. data/ext/sources/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +78 -0
  411. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-iq1_s.cu +5 -0
  412. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_s.cu +5 -0
  413. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_xs.cu +5 -0
  414. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_xxs.cu +5 -0
  415. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-iq3_s.cu +5 -0
  416. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-iq3_xxs.cu +5 -0
  417. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-iq4_nl.cu +5 -0
  418. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-iq4_xs.cu +5 -0
  419. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q2_k.cu +5 -0
  420. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q3_k.cu +5 -0
  421. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q4_0.cu +5 -0
  422. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q4_1.cu +5 -0
  423. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q4_k.cu +5 -0
  424. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q5_0.cu +5 -0
  425. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q5_1.cu +5 -0
  426. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q5_k.cu +5 -0
  427. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q6_k.cu +5 -0
  428. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q8_0.cu +5 -0
  429. data/ext/sources/ggml/src/ggml-cuda/tsembd.cu +47 -0
  430. data/ext/sources/ggml/src/ggml-cuda/tsembd.cuh +5 -0
  431. data/ext/sources/ggml/src/ggml-cuda/unary.cu +289 -0
  432. data/ext/sources/ggml/src/ggml-cuda/unary.cuh +59 -0
  433. data/ext/sources/ggml/src/ggml-cuda/upscale.cu +51 -0
  434. data/ext/sources/ggml/src/ggml-cuda/upscale.cuh +5 -0
  435. data/ext/sources/ggml/src/ggml-cuda/vecdotq.cuh +1135 -0
  436. data/ext/{ggml → sources/ggml}/src/ggml-cuda/vendors/cuda.h +1 -0
  437. data/ext/{ggml → sources/ggml}/src/ggml-cuda/vendors/hip.h +57 -0
  438. data/ext/{ggml → sources/ggml}/src/ggml-cuda/vendors/musa.h +7 -1
  439. data/ext/sources/ggml/src/ggml-cuda/wkv.cu +199 -0
  440. data/ext/sources/ggml/src/ggml-cuda/wkv.cuh +7 -0
  441. data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +131 -0
  442. data/ext/{ggml → sources/ggml}/src/ggml-impl.h +64 -19
  443. data/ext/sources/ggml/src/ggml-kompute/CMakeLists.txt +166 -0
  444. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/common.comp +112 -0
  445. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_add.comp +58 -0
  446. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_addrow.comp +25 -0
  447. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f16.comp +52 -0
  448. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f32.comp +52 -0
  449. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f16.comp +52 -0
  450. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f32.comp +52 -0
  451. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_diagmask.comp +30 -0
  452. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_gelu.comp +22 -0
  453. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows.comp +17 -0
  454. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f16.comp +31 -0
  455. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f32.comp +31 -0
  456. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_0.comp +38 -0
  457. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_1.comp +39 -0
  458. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q6_k.comp +44 -0
  459. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul.comp +52 -0
  460. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_f16.comp +69 -0
  461. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_mat_f32.comp +51 -0
  462. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_0.comp +33 -0
  463. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_1.comp +35 -0
  464. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_k.comp +140 -0
  465. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q6_k.comp +106 -0
  466. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q8_0.comp +73 -0
  467. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n.comp +52 -0
  468. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n_pre.comp +28 -0
  469. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_norm.comp +84 -0
  470. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_relu.comp +21 -0
  471. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rmsnorm.comp +53 -0
  472. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f16.comp +52 -0
  473. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f32.comp +52 -0
  474. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f16.comp +52 -0
  475. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f32.comp +52 -0
  476. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_scale.comp +19 -0
  477. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_scale_8.comp +23 -0
  478. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_silu.comp +22 -0
  479. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_softmax.comp +72 -0
  480. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/rope_common.comp +71 -0
  481. data/ext/sources/ggml/src/ggml-metal/CMakeLists.txt +120 -0
  482. data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +622 -0
  483. data/ext/{ggml → sources/ggml}/src/ggml-metal/ggml-metal.m +2178 -1064
  484. data/ext/{ggml → sources/ggml}/src/ggml-metal/ggml-metal.metal +1575 -1218
  485. data/ext/sources/ggml/src/ggml-musa/CMakeLists.txt +113 -0
  486. data/ext/sources/ggml/src/ggml-musa/mudnn.cu +112 -0
  487. data/ext/sources/ggml/src/ggml-musa/mudnn.cuh +12 -0
  488. data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +96 -0
  489. data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +5124 -0
  490. data/ext/sources/ggml/src/ggml-opencl/kernels/add.cl +83 -0
  491. data/ext/sources/ggml/src/ggml-opencl/kernels/clamp.cl +20 -0
  492. data/ext/sources/ggml/src/ggml-opencl/kernels/cpy.cl +184 -0
  493. data/ext/sources/ggml/src/ggml-opencl/kernels/cvt.cl +118 -0
  494. data/ext/sources/ggml/src/ggml-opencl/kernels/diag_mask_inf.cl +58 -0
  495. data/ext/sources/ggml/src/ggml-opencl/kernels/embed_kernel.py +26 -0
  496. data/ext/sources/ggml/src/ggml-opencl/kernels/gelu.cl +62 -0
  497. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle.cl +268 -0
  498. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_general.cl +274 -0
  499. data/ext/sources/ggml/src/ggml-opencl/kernels/get_rows.cl +163 -0
  500. data/ext/sources/ggml/src/ggml-opencl/kernels/im2col_f16.cl +57 -0
  501. data/ext/sources/ggml/src/ggml-opencl/kernels/im2col_f32.cl +57 -0
  502. data/ext/sources/ggml/src/ggml-opencl/kernels/mul.cl +79 -0
  503. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mat_Ab_Bi_8x4.cl +139 -0
  504. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_f16_f16.cl +118 -0
  505. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32.cl +118 -0
  506. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32_1row.cl +94 -0
  507. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32_l4.cl +84 -0
  508. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_f32_f32.cl +118 -0
  509. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32.cl +192 -0
  510. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_1d_16x_flat.cl +307 -0
  511. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_1d_8x_flat.cl +265 -0
  512. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_8x_flat.cl +272 -0
  513. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_v.cl +254 -0
  514. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q6_k.cl +190 -0
  515. data/ext/sources/ggml/src/ggml-opencl/kernels/norm.cl +81 -0
  516. data/ext/sources/ggml/src/ggml-opencl/kernels/relu.cl +16 -0
  517. data/ext/sources/ggml/src/ggml-opencl/kernels/rms_norm.cl +96 -0
  518. data/ext/sources/ggml/src/ggml-opencl/kernels/rope.cl +721 -0
  519. data/ext/sources/ggml/src/ggml-opencl/kernels/scale.cl +16 -0
  520. data/ext/sources/ggml/src/ggml-opencl/kernels/silu.cl +30 -0
  521. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_4_f16.cl +87 -0
  522. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_4_f32.cl +87 -0
  523. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_f16.cl +86 -0
  524. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_f32.cl +86 -0
  525. data/ext/sources/ggml/src/ggml-opencl/kernels/transpose.cl +84 -0
  526. data/ext/{ggml → sources/ggml}/src/ggml-opt.cpp +373 -190
  527. data/ext/{ggml → sources/ggml}/src/ggml-quants.c +114 -120
  528. data/ext/sources/ggml/src/ggml-rpc/CMakeLists.txt +9 -0
  529. data/ext/{ggml → sources/ggml}/src/ggml-rpc/ggml-rpc.cpp +480 -73
  530. data/ext/sources/ggml/src/ggml-sycl/CMakeLists.txt +189 -0
  531. data/ext/sources/ggml/src/ggml-sycl/backend.hpp +37 -0
  532. data/ext/sources/ggml/src/ggml-sycl/binbcast.cpp +345 -0
  533. data/ext/sources/ggml/src/ggml-sycl/binbcast.hpp +39 -0
  534. data/ext/{ggml → sources/ggml}/src/ggml-sycl/common.cpp +20 -32
  535. data/ext/sources/ggml/src/ggml-sycl/common.hpp +589 -0
  536. data/ext/{ggml → sources/ggml}/src/ggml-sycl/concat.cpp +32 -33
  537. data/ext/sources/ggml/src/ggml-sycl/concat.hpp +20 -0
  538. data/ext/{ggml → sources/ggml}/src/ggml-sycl/conv.cpp +4 -2
  539. data/ext/sources/ggml/src/ggml-sycl/conv.hpp +20 -0
  540. data/ext/{ggml → sources/ggml}/src/ggml-sycl/convert.cpp +104 -28
  541. data/ext/sources/ggml/src/ggml-sycl/convert.hpp +34 -0
  542. data/ext/sources/ggml/src/ggml-sycl/cpy.cpp +700 -0
  543. data/ext/sources/ggml/src/ggml-sycl/cpy.hpp +11 -0
  544. data/ext/sources/ggml/src/ggml-sycl/dequantize.hpp +791 -0
  545. data/ext/{ggml → sources/ggml}/src/ggml-sycl/dmmv.cpp +156 -17
  546. data/ext/sources/ggml/src/ggml-sycl/dmmv.hpp +27 -0
  547. data/ext/sources/ggml/src/ggml-sycl/dpct/helper.hpp +2957 -0
  548. data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +1511 -0
  549. data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +75 -0
  550. data/ext/sources/ggml/src/ggml-sycl/gemm.hpp +99 -0
  551. data/ext/sources/ggml/src/ggml-sycl/getrows.cpp +309 -0
  552. data/ext/sources/ggml/src/ggml-sycl/getrows.hpp +20 -0
  553. data/ext/{ggml → sources/ggml}/src/ggml-sycl/ggml-sycl.cpp +1004 -1240
  554. data/ext/sources/ggml/src/ggml-sycl/gla.cpp +106 -0
  555. data/ext/sources/ggml/src/ggml-sycl/gla.hpp +8 -0
  556. data/ext/sources/ggml/src/ggml-sycl/im2col.cpp +136 -0
  557. data/ext/sources/ggml/src/ggml-sycl/im2col.hpp +21 -0
  558. data/ext/{ggml → sources/ggml}/src/ggml-sycl/mmq.cpp +0 -1
  559. data/ext/sources/ggml/src/ggml-sycl/mmq.hpp +33 -0
  560. data/ext/{ggml → sources/ggml}/src/ggml-sycl/mmvq.cpp +261 -166
  561. data/ext/sources/ggml/src/ggml-sycl/mmvq.hpp +27 -0
  562. data/ext/{ggml → sources/ggml}/src/ggml-sycl/norm.cpp +204 -81
  563. data/ext/sources/ggml/src/ggml-sycl/norm.hpp +26 -0
  564. data/ext/{ggml → sources/ggml}/src/ggml-sycl/outprod.cpp +8 -17
  565. data/ext/sources/ggml/src/ggml-sycl/outprod.hpp +10 -0
  566. data/ext/sources/ggml/src/ggml-sycl/presets.hpp +74 -0
  567. data/ext/sources/ggml/src/ggml-sycl/quants.hpp +83 -0
  568. data/ext/sources/ggml/src/ggml-sycl/rope.cpp +361 -0
  569. data/ext/sources/ggml/src/ggml-sycl/rope.hpp +20 -0
  570. data/ext/{ggml → sources/ggml}/src/ggml-sycl/softmax.cpp +35 -25
  571. data/ext/sources/ggml/src/ggml-sycl/softmax.hpp +20 -0
  572. data/ext/sources/ggml/src/ggml-sycl/sycl_hw.cpp +13 -0
  573. data/ext/sources/ggml/src/ggml-sycl/sycl_hw.hpp +23 -0
  574. data/ext/{ggml → sources/ggml}/src/ggml-sycl/tsembd.cpp +3 -3
  575. data/ext/sources/ggml/src/ggml-sycl/tsembd.hpp +20 -0
  576. data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +1215 -0
  577. data/ext/sources/ggml/src/ggml-sycl/wkv.cpp +293 -0
  578. data/ext/sources/ggml/src/ggml-sycl/wkv.hpp +10 -0
  579. data/ext/sources/ggml/src/ggml-vulkan/CMakeLists.txt +196 -0
  580. data/ext/sources/ggml/src/ggml-vulkan/cmake/host-toolchain.cmake.in +15 -0
  581. data/ext/{ggml → sources/ggml}/src/ggml-vulkan/ggml-vulkan.cpp +3130 -1087
  582. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +39 -0
  583. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/acc.comp +29 -0
  584. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add.comp +29 -0
  585. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argmax.comp +51 -0
  586. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp +69 -0
  587. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/clamp.comp +17 -0
  588. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/concat.comp +41 -0
  589. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/contig_copy.comp +49 -0
  590. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_dw.comp +105 -0
  591. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy.comp +23 -0
  592. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +51 -0
  593. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +242 -0
  594. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/cos.comp +17 -0
  595. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/count_equal.comp +31 -0
  596. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_f32.comp +20 -0
  597. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.comp +462 -0
  598. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.comp +699 -0
  599. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_head.comp +13 -0
  600. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_m.comp +42 -0
  601. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_s.comp +35 -0
  602. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_s.comp +44 -0
  603. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xs.comp +43 -0
  604. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xxs.comp +48 -0
  605. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_s.comp +39 -0
  606. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_xxs.comp +49 -0
  607. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_nl.comp +32 -0
  608. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_xs.comp +34 -0
  609. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q2_k.comp +34 -0
  610. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q3_k.comp +42 -0
  611. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_0.comp +30 -0
  612. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_1.comp +32 -0
  613. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_k.comp +68 -0
  614. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_0.comp +34 -0
  615. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_1.comp +35 -0
  616. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_k.comp +70 -0
  617. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q6_k.comp +33 -0
  618. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q8_0.comp +31 -0
  619. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/diag_mask_inf.comp +34 -0
  620. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/div.comp +27 -0
  621. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +337 -0
  622. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp +162 -0
  623. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +360 -0
  624. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +267 -0
  625. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +59 -0
  626. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gelu.comp +25 -0
  627. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gelu_quick.comp +23 -0
  628. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/generic_binary_head.comp +64 -0
  629. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/generic_head.comp +9 -0
  630. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/generic_unary_head.comp +76 -0
  631. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/get_rows.comp +33 -0
  632. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/get_rows_quant.comp +41 -0
  633. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/group_norm.comp +66 -0
  634. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +100 -0
  635. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/l2_norm.comp +41 -0
  636. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/leaky_relu.comp +22 -0
  637. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul.comp +27 -0
  638. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_split_k_reduce.comp +48 -0
  639. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp +169 -0
  640. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.comp +118 -0
  641. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_m.comp +82 -0
  642. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_s.comp +79 -0
  643. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_s.comp +90 -0
  644. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xs.comp +87 -0
  645. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xxs.comp +87 -0
  646. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_s.comp +90 -0
  647. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_xxs.comp +88 -0
  648. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_nc.comp +118 -0
  649. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_p021.comp +154 -0
  650. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp +130 -0
  651. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q3_k.comp +132 -0
  652. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp +136 -0
  653. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp +167 -0
  654. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp +130 -0
  655. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +868 -0
  656. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +441 -0
  657. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +442 -0
  658. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.comp +99 -0
  659. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/norm.comp +44 -0
  660. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_adamw.comp +42 -0
  661. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/pad.comp +28 -0
  662. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/pool2d.comp +74 -0
  663. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/quantize_q8_1.comp +77 -0
  664. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/relu.comp +21 -0
  665. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/repeat.comp +26 -0
  666. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/repeat_back.comp +37 -0
  667. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +52 -0
  668. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_back.comp +55 -0
  669. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.comp +58 -0
  670. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +60 -0
  671. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +43 -0
  672. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +43 -0
  673. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_vision.comp +47 -0
  674. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp +24 -0
  675. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sigmoid.comp +20 -0
  676. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/silu.comp +22 -0
  677. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/silu_back.comp +26 -0
  678. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sin.comp +17 -0
  679. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +173 -0
  680. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_back.comp +50 -0
  681. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/square.comp +17 -0
  682. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sub.comp +29 -0
  683. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.comp +37 -0
  684. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/tanh.comp +20 -0
  685. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/test_bfloat16_support.comp +7 -0
  686. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/test_coopmat2_support.comp +7 -0
  687. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/test_coopmat_support.comp +7 -0
  688. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/test_integer_dot_support.comp +7 -0
  689. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/timestep_embedding.comp +41 -0
  690. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/types.comp +1373 -0
  691. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +36 -0
  692. data/ext/{ggml → sources/ggml}/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +193 -35
  693. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/wkv6.comp +87 -0
  694. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/wkv7.comp +91 -0
  695. data/ext/{ggml → sources/ggml}/src/ggml.c +676 -1820
  696. data/ext/sources/ggml/src/gguf.cpp +1330 -0
  697. data/ext/{include → sources/include}/whisper.h +68 -2
  698. data/ext/sources/src/CMakeLists.txt +143 -0
  699. data/ext/{src → sources/src}/coreml/whisper-decoder-impl.h +27 -15
  700. data/ext/{src → sources/src}/coreml/whisper-decoder-impl.m +35 -10
  701. data/ext/{src → sources/src}/coreml/whisper-encoder-impl.h +21 -9
  702. data/ext/{src → sources/src}/coreml/whisper-encoder-impl.m +28 -3
  703. data/ext/sources/src/coreml/whisper-encoder.mm +73 -0
  704. data/ext/sources/src/whisper-arch.h +197 -0
  705. data/ext/{src → sources/src}/whisper.cpp +1905 -374
  706. data/ext/sources/tests/CMakeLists.txt +105 -0
  707. data/ext/sources/tests/earnings21/eval.mk +58 -0
  708. data/ext/sources/tests/earnings21/eval.py +68 -0
  709. data/ext/sources/tests/earnings21/normalizers/__init__.py +2 -0
  710. data/ext/sources/tests/earnings21/normalizers/basic.py +80 -0
  711. data/ext/sources/tests/earnings21/normalizers/english.json +1741 -0
  712. data/ext/sources/tests/earnings21/normalizers/english.py +550 -0
  713. data/ext/sources/tests/earnings21/requirements.txt +6 -0
  714. data/ext/sources/tests/en-0-ref.txt +1 -0
  715. data/ext/sources/tests/en-1-ref.txt +1 -0
  716. data/ext/sources/tests/en-2-ref.txt +1 -0
  717. data/ext/sources/tests/es-0-ref.txt +1 -0
  718. data/ext/sources/tests/librispeech/eval.mk +39 -0
  719. data/ext/sources/tests/librispeech/eval.py +47 -0
  720. data/ext/sources/tests/librispeech/normalizers/__init__.py +2 -0
  721. data/ext/sources/tests/librispeech/normalizers/basic.py +80 -0
  722. data/ext/sources/tests/librispeech/normalizers/english.json +1741 -0
  723. data/ext/sources/tests/librispeech/normalizers/english.py +550 -0
  724. data/ext/sources/tests/librispeech/requirements.txt +6 -0
  725. data/ext/sources/tests/run-tests.sh +130 -0
  726. data/ext/sources/tests/test-c.c +3 -0
  727. data/ext/sources/tests/test-vad-full.cpp +54 -0
  728. data/ext/sources/tests/test-vad.cpp +83 -0
  729. data/ext/sources/tests/test-whisper.js +58 -0
  730. data/extsources.rb +33 -5
  731. data/lib/whisper/model/uri.rb +149 -128
  732. data/sig/whisper.rbs +480 -0
  733. data/tests/helper.rb +28 -0
  734. data/tests/test_callback.rb +45 -3
  735. data/tests/test_error.rb +2 -2
  736. data/tests/test_model.rb +38 -0
  737. data/tests/test_package.rb +18 -3
  738. data/tests/test_params.rb +145 -8
  739. data/tests/test_segment.rb +10 -19
  740. data/tests/test_vad.rb +19 -0
  741. data/tests/test_vad_params.rb +103 -0
  742. data/tests/test_whisper.rb +37 -37
  743. data/whispercpp.gemspec +5 -4
  744. metadata +766 -111
  745. data/ext/cpu.mk +0 -9
  746. data/ext/examples/dr_wav.h +0 -8815
  747. data/ext/ggml/src/ggml-cann/aclnn_ops.h +0 -592
  748. data/ext/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +0 -4262
  749. data/ext/ggml/src/ggml-cpu/ggml-cpu.c +0 -14123
  750. data/ext/ggml/src/ggml-cpu/llamafile/sgemm.cpp +0 -1884
  751. data/ext/ggml/src/ggml-cpu/llamafile/sgemm.h +0 -14
  752. data/ext/ggml/src/ggml-metal/ggml-metal-impl.h +0 -288
  753. data/ext/ggml/src/ggml-sycl/element_wise.cpp +0 -1030
  754. data/ext/ggml/src/ggml-sycl/im2col.cpp +0 -126
  755. data/ext/ggml/src/ggml-sycl/rope.cpp +0 -276
  756. data/ext/ggml/src/ggml-sycl/wkv6.cpp +0 -141
  757. data/ext/metal-embed.mk +0 -17
  758. data/ext/metal.mk +0 -6
  759. data/ext/ruby_whisper.cpp +0 -1909
  760. data/ext/scripts/get-flags.mk +0 -38
  761. data/lib/whisper.rb +0 -2
  762. /data/ext/{ggml → sources/ggml}/include/ggml-blas.h +0 -0
  763. /data/ext/{ggml → sources/ggml}/include/ggml-cann.h +0 -0
  764. /data/ext/{ggml → sources/ggml}/include/ggml-cuda.h +0 -0
  765. /data/ext/{ggml → sources/ggml}/include/ggml-kompute.h +0 -0
  766. /data/ext/{ggml → sources/ggml}/include/ggml-opencl.h +0 -0
  767. /data/ext/{ggml → sources/ggml}/include/ggml-sycl.h +0 -0
  768. /data/ext/{ggml → sources/ggml}/src/ggml-amx/common.h +0 -0
  769. /data/ext/{ggml → sources/ggml}/src/ggml-amx/ggml-amx.cpp +0 -0
  770. /data/ext/{ggml → sources/ggml}/src/ggml-amx/mmq.cpp +0 -0
  771. /data/ext/{ggml → sources/ggml}/src/ggml-amx/mmq.h +0 -0
  772. /data/ext/{ggml → sources/ggml}/src/ggml-blas/ggml-blas.cpp +0 -0
  773. /data/ext/{ggml → sources/ggml}/src/ggml-cann/kernels/ascendc_kernels.h +0 -0
  774. /data/ext/{ggml → sources/ggml}/src/ggml-cann/kernels/get_row_f16.cpp +0 -0
  775. /data/ext/{ggml → sources/ggml}/src/ggml-cann/kernels/get_row_f32.cpp +0 -0
  776. /data/ext/{ggml → sources/ggml}/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -0
  777. /data/ext/{ggml → sources/ggml}/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -0
  778. /data/ext/{ggml → sources/ggml}/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -0
  779. /data/ext/{ggml → sources/ggml}/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -0
  780. /data/ext/{ggml → sources/ggml}/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -0
  781. /data/ext/{ggml → sources/ggml}/src/ggml-cpu/amx/amx.h +0 -0
  782. /data/ext/{ggml → sources/ggml}/src/ggml-cpu/amx/common.h +0 -0
  783. /data/ext/{ggml → sources/ggml}/src/ggml-cpu/amx/mmq.cpp +0 -0
  784. /data/ext/{ggml → sources/ggml}/src/ggml-cpu/amx/mmq.h +0 -0
  785. /data/ext/{ggml → sources/ggml}/src/ggml-cpu/ggml-cpu-aarch64.h +0 -0
  786. /data/ext/{ggml → sources/ggml}/src/ggml-cpu/ggml-cpu-hbm.cpp +0 -0
  787. /data/ext/{ggml → sources/ggml}/src/ggml-cpu/ggml-cpu-hbm.h +0 -0
  788. /data/ext/{ggml → sources/ggml}/src/ggml-cpu/ggml-cpu-quants.h +0 -0
  789. /data/ext/{ggml → sources/ggml}/src/ggml-cpu/ggml-cpu-traits.cpp +0 -0
  790. /data/ext/{ggml → sources/ggml}/src/ggml-cpu/ggml-cpu-traits.h +0 -0
  791. /data/ext/{ggml → sources/ggml}/src/ggml-kompute/ggml-kompute.cpp +0 -0
  792. /data/ext/{ggml → sources/ggml}/src/ggml-quants.h +0 -0
  793. /data/ext/{ggml → sources/ggml}/src/ggml-threading.cpp +0 -0
  794. /data/ext/{ggml → sources/ggml}/src/ggml-threading.h +0 -0
  795. /data/ext/{src → sources/src}/coreml/whisper-encoder.h +0 -0
  796. /data/ext/{src → sources/src}/openvino/whisper-openvino-encoder.cpp +0 -0
  797. /data/ext/{src → sources/src}/openvino/whisper-openvino-encoder.h +0 -0
@@ -0,0 +1,3505 @@
1
+ #include "ggml-cuda.h"
2
+ #include "ggml-impl.h"
3
+ #include "ggml-backend-impl.h"
4
+
5
+ #include "ggml-cuda/common.cuh"
6
+ #include "ggml-cuda/acc.cuh"
7
+ #include "ggml-cuda/arange.cuh"
8
+ #include "ggml-cuda/argmax.cuh"
9
+ #include "ggml-cuda/argsort.cuh"
10
+ #include "ggml-cuda/binbcast.cuh"
11
+ #include "ggml-cuda/clamp.cuh"
12
+ #include "ggml-cuda/concat.cuh"
13
+ #include "ggml-cuda/conv-transpose-1d.cuh"
14
+ #include "ggml-cuda/convert.cuh"
15
+ #include "ggml-cuda/count-equal.cuh"
16
+ #include "ggml-cuda/cpy.cuh"
17
+ #include "ggml-cuda/cross-entropy-loss.cuh"
18
+ #include "ggml-cuda/diagmask.cuh"
19
+ #include "ggml-cuda/fattn.cuh"
20
+ #include "ggml-cuda/getrows.cuh"
21
+ #include "ggml-cuda/im2col.cuh"
22
+ #include "ggml-cuda/mmq.cuh"
23
+ #include "ggml-cuda/mmv.cuh"
24
+ #include "ggml-cuda/mmvq.cuh"
25
+ #include "ggml-cuda/norm.cuh"
26
+ #include "ggml-cuda/opt-step-adamw.cuh"
27
+ #include "ggml-cuda/out-prod.cuh"
28
+ #include "ggml-cuda/pad.cuh"
29
+ #include "ggml-cuda/pool2d.cuh"
30
+ #include "ggml-cuda/quantize.cuh"
31
+ #include "ggml-cuda/rope.cuh"
32
+ #include "ggml-cuda/scale.cuh"
33
+ #include "ggml-cuda/softmax.cuh"
34
+ #include "ggml-cuda/ssm-conv.cuh"
35
+ #include "ggml-cuda/ssm-scan.cuh"
36
+ #include "ggml-cuda/sum.cuh"
37
+ #include "ggml-cuda/sumrows.cuh"
38
+ #include "ggml-cuda/tsembd.cuh"
39
+ #include "ggml-cuda/unary.cuh"
40
+ #include "ggml-cuda/upscale.cuh"
41
+ #include "ggml-cuda/wkv.cuh"
42
+ #include "ggml-cuda/gla.cuh"
43
+ #include "ggml.h"
44
+
45
+ #include <algorithm>
46
+ #include <array>
47
+ #include <atomic>
48
+ #include <charconv>
49
+ #include <cinttypes>
50
+ #include <cstddef>
51
+ #include <cstdint>
52
+ #include <float.h>
53
+ #include <limits>
54
+ #include <map>
55
+ #include <memory>
56
+ #include <mutex>
57
+ #include <stdint.h>
58
+ #include <stdio.h>
59
+ #include <stdarg.h>
60
+ #include <stdlib.h>
61
+ #include <string>
62
+ #include <vector>
63
+
64
+ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
65
+
66
+ [[noreturn]]
67
+ void ggml_cuda_error(const char * stmt, const char * func, const char * file, int line, const char * msg) {
68
+ int id = -1; // in case cudaGetDevice fails
69
+ (void)cudaGetDevice(&id);
70
+
71
+ GGML_LOG_ERROR(GGML_CUDA_NAME " error: %s\n", msg);
72
+ GGML_LOG_ERROR(" current device: %d, in function %s at %s:%d\n", id, func, file, line);
73
+ GGML_LOG_ERROR(" %s\n", stmt);
74
+ // abort with GGML_ABORT to get a stack trace
75
+ GGML_ABORT(GGML_CUDA_NAME " error");
76
+ }
77
+
78
+ // this is faster on Windows
79
+ // probably because the Windows CUDA libraries forget to make this check before invoking the drivers
80
+ void ggml_cuda_set_device(int device) {
81
+ int current_device;
82
+ CUDA_CHECK(cudaGetDevice(&current_device));
83
+
84
+ if (device == current_device) {
85
+ return;
86
+ }
87
+
88
+ CUDA_CHECK(cudaSetDevice(device));
89
+ }
90
+
91
+ int ggml_cuda_get_device() {
92
+ int id;
93
+ CUDA_CHECK(cudaGetDevice(&id));
94
+ return id;
95
+ }
96
+
97
+ static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device) {
98
+ ggml_cuda_set_device(device);
99
+ cudaError_t err;
100
+ if (getenv("GGML_CUDA_ENABLE_UNIFIED_MEMORY") != nullptr)
101
+ {
102
+ err = cudaMallocManaged(ptr, size);
103
+ #if defined(GGML_USE_HIP)
104
+ if (err == hipSuccess) {
105
+ CUDA_CHECK(cudaMemAdvise(*ptr, size, hipMemAdviseSetCoarseGrain, device));
106
+ }
107
+
108
+ // fall back to cudaMalloc if not supported (e.g. on Windows)
109
+ if (err == hipErrorNotSupported) {
110
+ static bool warned_unsupported = false;
111
+ if (!warned_unsupported) {
112
+ GGML_LOG_WARN("hipMallocManaged unsupported, falling back to hipMalloc.\n");
113
+ warned_unsupported = true;
114
+ }
115
+
116
+ err = cudaMalloc(ptr, size);
117
+ }
118
+ #endif // defined(GGML_USE_HIP)
119
+ }
120
+ else
121
+ {
122
+ err = cudaMalloc(ptr, size);
123
+ }
124
+ return err;
125
+ }
126
+
127
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
// Converts an AMD target ID string (for example "gfx1030",
// "gfx90a:sramecc+:xnack-" or "gfx9-4-generic") into a numeric architecture code:
//
//     GGML_CUDA_CC_OFFSET_AMD + major * 0x100 + minor
//
// where major and minor are read from the name as hexadecimal digits.
//
// A list of possible Target IDs can be found under the rocclr/clr repo in device.cpp
// these values are not stable so this is susceptible to breakage
// https://github.com/ROCm/clr/blob/amd-staging/rocclr/device/device.cpp
//
// devName is only read, never modified. A null pointer or a name from which no
// version can be extracted yields GGML_CUDA_CC_OFFSET_AMD unchanged.
static int ggml_cuda_parse_id(char devName[]) {
    int archMajor = 0x0;
    int archMinor = 0x0;
    int archNum   = GGML_CUDA_CC_OFFSET_AMD;

    if (devName == nullptr) {
        return archNum;
    }

    // Owned copy of the name: always initialized (also for names too short to
    // carry the 'gfx' prefix) and avoids a non-standard variable-length array.
    std::string archName(devName);

    // strip leading 'gfx'
    if (archName.size() > 3) {
        archName.erase(0, 3);
    }

    // trim trailing :xnack- or :sramecc- statuses
    const size_t statusPos = archName.find(':');
    if (statusPos != std::string::npos) {
        archName.resize(statusPos);
    }

    // tease out the version information
    if (archName.size() > 8) {
        // versions labeled generic use '-' as delimiter
        // strip the trailing "-generic" then iterate through what remains
        if (archName.find("-generic") != std::string::npos) {
            archName.resize(archName.size() - 8);

            // first '-'-separated token is the major version
            const size_t majorBegin = archName.find_first_not_of('-');
            if (majorBegin != std::string::npos) {
                const size_t majorEnd = archName.find('-', majorBegin);
                archMajor = static_cast<int>(
                    strtoul(archName.substr(majorBegin, majorEnd - majorBegin).c_str(), nullptr, 16));

                // optional second token is the minor version (stepping is not encoded)
                const size_t minorBegin = majorEnd == std::string::npos
                    ? std::string::npos
                    : archName.find_first_not_of('-', majorEnd);
                if (minorBegin != std::string::npos) {
                    const size_t minorEnd = archName.find('-', minorBegin);
                    archMinor = 0x10 * static_cast<int>(
                        strtoul(archName.substr(minorBegin, minorEnd - minorBegin).c_str(), nullptr, 16));
                }
            }
        }
    } else if (archName.size() >= 3) {
        // last two digits should be the minor * 0x10 + stepping
        const size_t minorPos = archName.size() - 2;
        archMinor = static_cast<int>(strtoul(archName.c_str() + minorPos, nullptr, 16));

        // only the major version remains
        archMajor = static_cast<int>(strtoul(archName.substr(0, minorPos).c_str(), nullptr, 16));
    }

    archNum += archMajor * 0x100;
    archNum += archMinor;
    return archNum;
}
#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
175
+
176
+ static ggml_cuda_device_info ggml_cuda_init() {
177
+ #ifdef __HIP_PLATFORM_AMD__
178
+ // Workaround for a rocBLAS bug when using multiple graphics cards:
179
+ // https://github.com/ROCmSoftwarePlatform/rocBLAS/issues/1346
180
+ {
181
+ int major_version = 0;
182
+ size_t version_length = 0;
183
+ if (rocblas_get_version_string_size(&version_length) == rocblas_status_success) {
184
+ std::vector<char> version(version_length+1, '\0');
185
+ if (rocblas_get_version_string(version.data(), version.size()) == rocblas_status_success) {
186
+ version.resize(::strlen(version.data()));
187
+ int parsed_value = 0;
188
+ if (std::from_chars(version.data(), version.data() + version.size(), parsed_value).ec == std::errc()) {
189
+ major_version = parsed_value;
190
+ }
191
+ }
192
+ }
193
+ if (major_version < 4) {
194
+ GGML_LOG_DEBUG(GGML_CUDA_NAME " calling rocblas_initialize as a workaround for a rocBLAS bug\n");
195
+ rocblas_initialize();
196
+ CUDA_CHECK(cudaDeviceSynchronize());
197
+ }
198
+ }
199
+ #endif
200
+
201
+ ggml_cuda_device_info info = {};
202
+
203
+ cudaError_t err = cudaGetDeviceCount(&info.device_count);
204
+ if (err != cudaSuccess) {
205
+ GGML_LOG_ERROR("%s: failed to initialize " GGML_CUDA_NAME ": %s\n", __func__, cudaGetErrorString(err));
206
+ return info;
207
+ }
208
+
209
+ GGML_ASSERT(info.device_count <= GGML_CUDA_MAX_DEVICES);
210
+
211
+ int64_t total_vram = 0;
212
+ #ifdef GGML_CUDA_FORCE_MMQ
213
+ GGML_LOG_INFO("%s: GGML_CUDA_FORCE_MMQ: yes\n", __func__);
214
+ #else
215
+ GGML_LOG_INFO("%s: GGML_CUDA_FORCE_MMQ: no\n", __func__);
216
+ #endif // GGML_CUDA_FORCE_MMQ
217
+ #ifdef GGML_CUDA_FORCE_CUBLAS
218
+ GGML_LOG_INFO("%s: GGML_CUDA_FORCE_CUBLAS: yes\n", __func__);
219
+ #else
220
+ GGML_LOG_INFO("%s: GGML_CUDA_FORCE_CUBLAS: no\n", __func__);
221
+ #endif // GGML_CUDA_FORCE_CUBLAS
222
+ GGML_LOG_INFO("%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, info.device_count);
223
+ for (int id = 0; id < info.device_count; ++id) {
224
+ int device_vmm = 0;
225
+
226
+ #if defined(GGML_USE_VMM)
227
+ CUdevice device;
228
+ CU_CHECK(cuDeviceGet(&device, id));
229
+ CU_CHECK(cuDeviceGetAttribute(&device_vmm, CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED, device));
230
+
231
+ if (device_vmm) {
232
+ CUmemAllocationProp alloc_prop = {};
233
+ alloc_prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
234
+ alloc_prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
235
+ alloc_prop.location.id = id;
236
+ CU_CHECK(cuMemGetAllocationGranularity(&info.devices[id].vmm_granularity, &alloc_prop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED));
237
+ }
238
+ #endif // defined(GGML_USE_VMM)
239
+ info.devices[id].vmm = !!device_vmm;
240
+
241
+ cudaDeviceProp prop;
242
+ CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
243
+
244
+ info.default_tensor_split[id] = total_vram;
245
+ total_vram += prop.totalGlobalMem;
246
+
247
+ info.devices[id].nsm = prop.multiProcessorCount;
248
+ info.devices[id].smpb = prop.sharedMemPerBlock;
249
+ info.devices[id].warp_size = prop.warpSize;
250
+ #if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
251
+ info.devices[id].smpbo = prop.sharedMemPerBlock;
252
+
253
+ info.devices[id].cc = ggml_cuda_parse_id(prop.gcnArchName);
254
+ if ((info.devices[id].cc & 0xff00) == 0x0) {
255
+ GGML_LOG_WARN("invalid architecture ID received for device %d %s: %s cc %d.%d\n",
256
+ id, prop.name, prop.gcnArchName, prop.major, prop.minor);
257
+
258
+ // Fallback to prop.major and prop.minor
259
+ if (prop.major > 0) {
260
+ info.devices[id].cc = GGML_CUDA_CC_OFFSET_AMD + prop.major * 0x100;
261
+ info.devices[id].cc += prop.minor * 0x10;
262
+ }
263
+ }
264
+ GGML_LOG_INFO(" Device %d: %s, %s (0x%x), VMM: %s, Wave Size: %d\n",
265
+ id, prop.name, prop.gcnArchName, info.devices[id].cc & 0xffff,
266
+ device_vmm ? "yes" : "no", prop.warpSize);
267
+ #elif defined(GGML_USE_MUSA)
268
+ // FIXME: Ensure compatibility with varying warp sizes across different MUSA archs.
269
+ info.devices[id].warp_size = 32;
270
+ info.devices[id].smpbo = prop.sharedMemPerBlockOptin;
271
+ info.devices[id].cc = GGML_CUDA_CC_OFFSET_MTHREADS + prop.major * 0x100;
272
+ info.devices[id].cc += prop.minor * 0x10;
273
+ GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s\n",
274
+ id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");
275
+ #else
276
+ info.devices[id].smpbo = prop.sharedMemPerBlockOptin;
277
+ info.devices[id].cc = 100*prop.major + 10*prop.minor;
278
+ GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s\n",
279
+ id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");
280
+ #endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
281
+ }
282
+
283
+ for (int id = 0; id < info.device_count; ++id) {
284
+ info.default_tensor_split[id] /= total_vram;
285
+ }
286
+
287
+ // configure logging to stdout
288
+ // CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, nullptr));
289
+
290
+ return info;
291
+ }
292
+
293
+ const ggml_cuda_device_info & ggml_cuda_info() {
294
+ static ggml_cuda_device_info info = ggml_cuda_init();
295
+ return info;
296
+ }
297
+
298
+ // #define DEBUG_CUDA_MALLOC
299
+
300
+ // buffer pool for cuda (legacy)
301
+ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
302
+ static const int MAX_BUFFERS = 256;
303
+
304
+ int device;
305
+ struct ggml_cuda_buffer {
306
+ void * ptr = nullptr;
307
+ size_t size = 0;
308
+ };
309
+
310
+ ggml_cuda_buffer buffer_pool[MAX_BUFFERS] = {};
311
+ size_t pool_size = 0;
312
+
313
+ explicit ggml_cuda_pool_leg(int device) :
314
+ device(device) {
315
+ }
316
+
317
+ ~ggml_cuda_pool_leg() {
318
+ ggml_cuda_set_device(device);
319
+ for (int i = 0; i < MAX_BUFFERS; ++i) {
320
+ ggml_cuda_buffer & b = buffer_pool[i];
321
+ if (b.ptr != nullptr) {
322
+ CUDA_CHECK(cudaFree(b.ptr));
323
+ pool_size -= b.size;
324
+ }
325
+ }
326
+ GGML_ASSERT(pool_size == 0);
327
+ }
328
+
329
+ void * alloc(size_t size, size_t * actual_size) override {
330
+ #ifdef DEBUG_CUDA_MALLOC
331
+ int nnz = 0;
332
+ size_t max_size = 0;
333
+ #endif
334
+ size_t best_diff = 1ull << 36;
335
+ int ibest = -1;
336
+ for (int i = 0; i < MAX_BUFFERS; ++i) {
337
+ ggml_cuda_buffer& b = buffer_pool[i];
338
+ if (b.ptr != nullptr) {
339
+ #ifdef DEBUG_CUDA_MALLOC
340
+ ++nnz;
341
+ if (b.size > max_size) max_size = b.size;
342
+ #endif
343
+ if (b.size >= size) {
344
+ size_t diff = b.size - size;
345
+ if (diff < best_diff) {
346
+ best_diff = diff;
347
+ ibest = i;
348
+ if (!best_diff) {
349
+ void * ptr = b.ptr;
350
+ *actual_size = b.size;
351
+ b.ptr = nullptr;
352
+ b.size = 0;
353
+ return ptr;
354
+ }
355
+ }
356
+ }
357
+ }
358
+ }
359
+ if (ibest >= 0) {
360
+ ggml_cuda_buffer& b = buffer_pool[ibest];
361
+ void * ptr = b.ptr;
362
+ *actual_size = b.size;
363
+ b.ptr = nullptr;
364
+ b.size = 0;
365
+ return ptr;
366
+ }
367
+ void * ptr;
368
+ size_t look_ahead_size = (size_t) (1.05 * size);
369
+ look_ahead_size = 256 * ((look_ahead_size + 255)/256);
370
+ ggml_cuda_set_device(device);
371
+ CUDA_CHECK(ggml_cuda_device_malloc(&ptr, look_ahead_size, device));
372
+ *actual_size = look_ahead_size;
373
+ pool_size += look_ahead_size;
374
+ #ifdef DEBUG_CUDA_MALLOC
375
+ GGML_LOG_INFO("%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, requested %u MB\n", __func__, device, nnz,
376
+ (uint32_t)(max_size / 1024 / 1024), (uint32_t)(pool_size / 1024 / 1024), (uint32_t)(size / 1024 / 1024));
377
+ #endif
378
+ return ptr;
379
+ }
380
+
381
+ void free(void * ptr, size_t size) override {
382
+ for (int i = 0; i < MAX_BUFFERS; ++i) {
383
+ ggml_cuda_buffer& b = buffer_pool[i];
384
+ if (b.ptr == nullptr) {
385
+ b.ptr = ptr;
386
+ b.size = size;
387
+ return;
388
+ }
389
+ }
390
+ GGML_LOG_DEBUG(GGML_CUDA_NAME " buffer pool full, increase MAX_CUDA_BUFFERS\n");
391
+ ggml_cuda_set_device(device);
392
+ CUDA_CHECK(cudaFree(ptr));
393
+ pool_size -= size;
394
+ }
395
+ };
396
+
397
+ // pool with virtual memory
398
+ #if defined(GGML_USE_VMM)
399
+ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
400
+ static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 35; // 32 GB
401
+
402
+ int device;
403
+ CUdeviceptr pool_addr = 0;
404
+ size_t pool_used = 0;
405
+ size_t pool_size = 0;
406
+ size_t granularity;
407
+ #if defined(GGML_USE_HIP)
408
+ std::vector<std::pair<CUdeviceptr, size_t>> mappings;
409
+ #endif
410
+
411
+ explicit ggml_cuda_pool_vmm(int device) :
412
+ device(device),
413
+ granularity(ggml_cuda_info().devices[device].vmm_granularity) {
414
+ }
415
+
416
+ ~ggml_cuda_pool_vmm() {
417
+ if (pool_addr != 0) {
418
+ #if defined(GGML_USE_HIP)
419
+ // Workaround for https://github.com/ROCm/ROCR-Runtime/issues/285
420
+ for (std::pair<CUdeviceptr, size_t> & mapping : mappings) {
421
+ CU_CHECK(cuMemUnmap(mapping.first, mapping.second));
422
+ }
423
+ #else
424
+ CU_CHECK(cuMemUnmap(pool_addr, pool_size));
425
+ #endif
426
+ CU_CHECK(cuMemAddressFree(pool_addr, CUDA_POOL_VMM_MAX_SIZE));
427
+ }
428
+ }
429
+
430
+ void * alloc(size_t size, size_t * actual_size) override {
431
+ // round up the allocation size to the alignment to ensure that all allocations are aligned for all data types
432
+ const size_t alignment = 128;
433
+ size = alignment * ((size + alignment - 1) / alignment);
434
+
435
+ size_t avail = pool_size - pool_used;
436
+
437
+ if (size > avail) {
438
+ // round up to the next multiple of the granularity
439
+ size_t reserve_size = size - avail;
440
+ reserve_size = granularity * ((reserve_size + granularity - 1) / granularity);
441
+
442
+ GGML_ASSERT(pool_size + reserve_size <= CUDA_POOL_VMM_MAX_SIZE);
443
+
444
+ // allocate more physical memory
445
+ CUmemAllocationProp prop = {};
446
+ prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
447
+ prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
448
+ prop.location.id = device;
449
+ CUmemGenericAllocationHandle handle;
450
+ CU_CHECK(cuMemCreate(&handle, reserve_size, &prop, 0));
451
+
452
+ // reserve virtual address space (if not already reserved)
453
+ if (pool_addr == 0) {
454
+ CU_CHECK(cuMemAddressReserve(&pool_addr, CUDA_POOL_VMM_MAX_SIZE, 0, 0, 0));
455
+ }
456
+
457
+ // map at the end of the pool
458
+ CUdeviceptr start_ptr = (CUdeviceptr)((char *)(pool_addr) + pool_size);
459
+ CU_CHECK(cuMemMap(start_ptr, reserve_size, 0, handle, 0));
460
+ #if defined(GGML_USE_HIP)
461
+ mappings.push_back({start_ptr, reserve_size});
462
+ #endif
463
+
464
+ // the memory allocation handle is no longer needed after mapping
465
+ CU_CHECK(cuMemRelease(handle));
466
+
467
+ // set access
468
+ CUmemAccessDesc access = {};
469
+ access.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
470
+ access.location.id = device;
471
+ access.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
472
+ CU_CHECK(cuMemSetAccess((CUdeviceptr)((char *)(pool_addr) + pool_size), reserve_size, &access, 1));
473
+
474
+ // add to the pool
475
+ pool_size += reserve_size;
476
+
477
+ //printf("cuda pool[%d]: size increased to %llu MB (reserved %llu MB)\n",
478
+ // device, (unsigned long long) (pool_size/1024/1024),
479
+ // (unsigned long long) (reserve_size/1024/1024));
480
+ }
481
+
482
+ GGML_ASSERT(pool_addr != 0);
483
+
484
+ void * ptr = (void *) ((CUdeviceptr)((char *)(pool_addr) + pool_used));
485
+ *actual_size = size;
486
+ pool_used += size;
487
+
488
+ #ifdef DEBUG_CUDA_MALLOC
489
+ printf("cuda pool[%d]: allocated %llu bytes at %llx\n", device, (unsigned long long) size, ptr);
490
+ #endif
491
+
492
+ return ptr;
493
+ }
494
+
495
+ void free(void * ptr, size_t size) override {
496
+ #ifdef DEBUG_CUDA_MALLOC
497
+ printf("cuda pool[%d]: freed %llu bytes at %llx\n", device, (unsigned long long) size, ptr);
498
+ #endif
499
+
500
+ pool_used -= size;
501
+
502
+ // all deallocations must be in reverse order of the allocations
503
+ GGML_ASSERT(ptr == (void *) ((char *)(pool_addr) + pool_used));
504
+ }
505
+ };
506
+ #endif // defined(GGML_USE_VMM)
507
+
508
+ std::unique_ptr<ggml_cuda_pool> ggml_backend_cuda_context::new_pool_for_device(int device) {
509
+ #if defined(GGML_USE_VMM)
510
+ if (ggml_cuda_info().devices[device].vmm) {
511
+ return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_vmm(device));
512
+ }
513
+ #endif // defined(GGML_USE_VMM)
514
+ return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_leg(device));
515
+ }
516
+
517
+ // cuda buffer
518
+
519
+ struct ggml_backend_cuda_buffer_context {
520
+ int device;
521
+ void * dev_ptr = nullptr;
522
+ std::string name;
523
+
524
+ ggml_backend_cuda_buffer_context(int device, void * dev_ptr) :
525
+ device(device), dev_ptr(dev_ptr),
526
+ name(GGML_CUDA_NAME + std::to_string(device)) {
527
+ }
528
+
529
+ ~ggml_backend_cuda_buffer_context() {
530
+ CUDA_CHECK(cudaFree(dev_ptr));
531
+ }
532
+ };
533
+
534
+ static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
535
+ ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
536
+ delete ctx;
537
+ }
538
+
539
+ static bool ggml_backend_buffer_is_cuda(ggml_backend_buffer_t buffer) {
540
+ return buffer->iface.free_buffer == ggml_backend_cuda_buffer_free_buffer;
541
+ }
542
+
543
+ static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) {
544
+ ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
545
+ return ctx->dev_ptr;
546
+ }
547
+
548
+ static enum ggml_status ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
549
+ ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
550
+
551
+ if (tensor->view_src != NULL) {
552
+ assert(tensor->view_src->buffer->buft == buffer->buft);
553
+ return GGML_STATUS_SUCCESS;
554
+ }
555
+
556
+ if (ggml_is_quantized(tensor->type) && tensor->view_src == nullptr && ggml_backend_buffer_get_usage(buffer) != GGML_BACKEND_BUFFER_USAGE_COMPUTE) {
557
+ // initialize padding to 0 to avoid possible NaN values
558
+ const size_t original_size = ggml_nbytes(tensor);
559
+ const size_t padded_size = ggml_backend_buft_get_alloc_size(buffer->buft, tensor);
560
+
561
+ if (padded_size > original_size) {
562
+ ggml_cuda_set_device(ctx->device);
563
+ CUDA_CHECK(cudaMemset((char *)tensor->data + original_size, 0, padded_size - original_size));
564
+ }
565
+ }
566
+ return GGML_STATUS_SUCCESS;
567
+ }
568
+
569
+ static void ggml_backend_cuda_buffer_memset_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
570
+ ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
571
+
572
+ ggml_cuda_set_device(ctx->device);
573
+ CUDA_CHECK(cudaMemsetAsync((char *)tensor->data + offset, value, size, cudaStreamPerThread));
574
+ CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
575
+ }
576
+
577
+ static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
578
+ ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
579
+
580
+ ggml_cuda_set_device(ctx->device);
581
+ CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, cudaStreamPerThread));
582
+ CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
583
+ }
584
+
585
+ static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
586
+ ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
587
+
588
+ ggml_cuda_set_device(ctx->device);
589
+ CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, cudaStreamPerThread));
590
+ CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
591
+ }
592
+
593
+ static bool ggml_backend_cuda_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) {
594
+ if (ggml_backend_buffer_is_cuda(src->buffer)) {
595
+ ggml_backend_cuda_buffer_context * src_ctx = (ggml_backend_cuda_buffer_context *)src->buffer->context;
596
+ ggml_backend_cuda_buffer_context * dst_ctx = (ggml_backend_cuda_buffer_context *)dst->buffer->context;
597
+ if (src_ctx->device == dst_ctx->device) {
598
+ CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(src), cudaMemcpyDeviceToDevice, cudaStreamPerThread));
599
+ } else {
600
+ #ifdef GGML_CUDA_NO_PEER_COPY
601
+ return false;
602
+ #else
603
+ CUDA_CHECK(cudaMemcpyPeerAsync(dst->data, dst_ctx->device, src->data, src_ctx->device, ggml_nbytes(src), cudaStreamPerThread));
604
+ #endif
605
+ }
606
+ CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
607
+ return true;
608
+ }
609
+ return false;
610
+
611
+ GGML_UNUSED(buffer);
612
+ }
613
+
614
+ static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
615
+ ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
616
+
617
+ ggml_cuda_set_device(ctx->device);
618
+ CUDA_CHECK(cudaDeviceSynchronize());
619
+ CUDA_CHECK(cudaMemset(ctx->dev_ptr, value, buffer->size));
620
+ CUDA_CHECK(cudaDeviceSynchronize());
621
+ }
622
+
623
+ static const ggml_backend_buffer_i ggml_backend_cuda_buffer_interface = {
624
+ /* .free_buffer = */ ggml_backend_cuda_buffer_free_buffer,
625
+ /* .get_base = */ ggml_backend_cuda_buffer_get_base,
626
+ /* .init_tensor = */ ggml_backend_cuda_buffer_init_tensor,
627
+ /* .memset_tensor = */ ggml_backend_cuda_buffer_memset_tensor,
628
+ /* .set_tensor = */ ggml_backend_cuda_buffer_set_tensor,
629
+ /* .get_tensor = */ ggml_backend_cuda_buffer_get_tensor,
630
+ /* .cpy_tensor = */ ggml_backend_cuda_buffer_cpy_tensor,
631
+ /* .clear = */ ggml_backend_cuda_buffer_clear,
632
+ /* .reset = */ NULL,
633
+ };
634
+
635
+ // cuda buffer type
636
// State attached to a CUDA buffer type.
struct ggml_backend_cuda_buffer_type_context {
    int device;       // device index the buffer type allocates on
    std::string name; // string returned by the buffer type's get_name callback
};
640
+
641
+ static const char * ggml_backend_cuda_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
642
+ ggml_backend_cuda_buffer_type_context * ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
643
+
644
+ return ctx->name.c_str();
645
+ }
646
+
647
+ static bool ggml_backend_buft_is_cuda(ggml_backend_buffer_type_t buft) {
648
+ return buft->iface.get_name == ggml_backend_cuda_buffer_type_get_name;
649
+ }
650
+
651
+ static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
652
+ ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
653
+
654
+ ggml_cuda_set_device(buft_ctx->device);
655
+
656
+ void * dev_ptr;
657
+ cudaError_t err = ggml_cuda_device_malloc(&dev_ptr, size, buft_ctx->device);
658
+ if (err != cudaSuccess) {
659
+ // clear the error
660
+ (void)cudaGetLastError();
661
+ GGML_LOG_ERROR("%s: allocating %.2f MiB on device %d: cudaMalloc failed: %s\n", __func__, size / 1024.0 / 1024.0, buft_ctx->device, cudaGetErrorString(err));
662
+ return nullptr;
663
+ }
664
+
665
+ ggml_backend_cuda_buffer_context * ctx = new ggml_backend_cuda_buffer_context(buft_ctx->device, dev_ptr);
666
+
667
+ return ggml_backend_buffer_init(buft, ggml_backend_cuda_buffer_interface, ctx, size);
668
+ }
669
+
670
+ static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
671
+ return 128;
672
+
673
+ GGML_UNUSED(buft);
674
+ }
675
+
676
+ static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
677
+ size_t size = ggml_nbytes(tensor);
678
+ int64_t ne0 = tensor->ne[0];
679
+
680
+ if (ggml_is_quantized(tensor->type)) {
681
+ if (ne0 % MATRIX_ROW_PADDING != 0) {
682
+ GGML_ASSERT(tensor->nb[0] == ggml_element_size(tensor));
683
+ size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
684
+ }
685
+ }
686
+
687
+ return size;
688
+
689
+ GGML_UNUSED(buft);
690
+ }
691
+
692
+ static const ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = {
693
+ /* .get_name = */ ggml_backend_cuda_buffer_type_get_name,
694
+ /* .alloc_buffer = */ ggml_backend_cuda_buffer_type_alloc_buffer,
695
+ /* .get_alignment = */ ggml_backend_cuda_buffer_type_get_alignment,
696
+ /* .get_max_size = */ NULL, // defaults to SIZE_MAX
697
+ /* .get_alloc_size = */ ggml_backend_cuda_buffer_type_get_alloc_size,
698
+ /* .is_host = */ NULL,
699
+ };
700
+
701
+ ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
702
+ static std::mutex mutex;
703
+ std::lock_guard<std::mutex> lock(mutex);
704
+
705
+ if (device >= ggml_backend_cuda_get_device_count()) {
706
+ return nullptr;
707
+ }
708
+
709
+ static ggml_backend_buffer_type ggml_backend_cuda_buffer_types[GGML_CUDA_MAX_DEVICES];
710
+
711
+ static bool ggml_backend_cuda_buffer_type_initialized = false;
712
+
713
+ if (!ggml_backend_cuda_buffer_type_initialized) {
714
+ for (int i = 0; i < ggml_backend_cuda_get_device_count(); i++) {
715
+ ggml_backend_cuda_buffer_types[i] = {
716
+ /* .iface = */ ggml_backend_cuda_buffer_type_interface,
717
+ /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cuda_reg(), i),
718
+ /* .context = */ new ggml_backend_cuda_buffer_type_context{i, GGML_CUDA_NAME + std::to_string(i)},
719
+ };
720
+ }
721
+ ggml_backend_cuda_buffer_type_initialized = true;
722
+ }
723
+
724
+ return &ggml_backend_cuda_buffer_types[device];
725
+ }
726
+
727
+ // cuda split buffer
728
+
729
+ static int64_t get_row_rounding(const std::array<float, GGML_CUDA_MAX_DEVICES> & tensor_split) {
730
+ int64_t row_rounding = 0;
731
+ for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
732
+ if (tensor_split[id] >= (id + 1 < ggml_backend_cuda_get_device_count() ? tensor_split[id + 1] : 1.0f)) {
733
+ continue;
734
+ }
735
+
736
+ const int cc = ggml_cuda_info().devices[id].cc;
737
+ row_rounding = std::max(row_rounding, (int64_t)get_mmq_y_host(cc));
738
+ }
739
+ return row_rounding;
740
+ }
741
+
742
+ static void get_row_split(int64_t * row_low, int64_t * row_high, const ggml_tensor * tensor, const std::array<float, GGML_CUDA_MAX_DEVICES> & tensor_split, int id) {
743
+ const int64_t nrows = ggml_nrows(tensor);
744
+ const int64_t rounding = get_row_rounding(tensor_split);
745
+
746
+ *row_low = id == 0 ? 0 : nrows*tensor_split[id];
747
+ *row_low -= *row_low % rounding;
748
+
749
+ if (id == ggml_backend_cuda_get_device_count() - 1) {
750
+ *row_high = nrows;
751
+ } else {
752
+ *row_high = nrows*tensor_split[id + 1];
753
+ *row_high -= *row_high % rounding;
754
+ }
755
+ }
756
+
757
+ static size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split) {
758
+ static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
759
+
760
+ return nrows_split*ggml_row_size(tensor->type, tensor->ne[0]);
761
+ }
762
+
763
+ struct ggml_backend_cuda_split_buffer_type_context {
764
+ int main_device;
765
+ std::array<float, GGML_CUDA_MAX_DEVICES> tensor_split;
766
+ std::string name;
767
+ };
768
+
769
+ struct ggml_backend_cuda_split_buffer_context {
770
+ ~ggml_backend_cuda_split_buffer_context() {
771
+ for (ggml_tensor_extra_gpu * extra : tensor_extras) {
772
+ for (int id = 0; id < GGML_CUDA_MAX_DEVICES; ++id) {
773
+ for (int64_t is = 0; is < GGML_CUDA_MAX_STREAMS; ++is) {
774
+ if (extra->events[id][is] != nullptr) {
775
+ CUDA_CHECK(cudaEventDestroy(extra->events[id][is]));
776
+ }
777
+ }
778
+ if (extra->data_device[id] != nullptr) {
779
+ CUDA_CHECK(cudaFree(extra->data_device[id]));
780
+ }
781
+ }
782
+ delete extra;
783
+ }
784
+ }
785
+
786
+ std::vector<ggml_tensor_extra_gpu *> tensor_extras;
787
+ };
788
+
789
+
790
+ static void ggml_backend_cuda_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
791
+ ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
792
+ delete ctx;
793
+ }
794
+
795
+ static void * ggml_backend_cuda_split_buffer_get_base(ggml_backend_buffer_t buffer) {
796
+ // the pointers are stored in the tensor extras, this is just a dummy address and never dereferenced
797
+ return (void *)0x1000;
798
+
799
+ GGML_UNUSED(buffer);
800
+ }
801
+
802
+ static enum ggml_status ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
803
+ GGML_ASSERT(tensor->view_src == nullptr); // views of split tensors are not supported
804
+ GGML_ASSERT(ggml_is_contiguous(tensor) && "split buffers only supported for contiguous tensors");
805
+
806
+ ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
807
+ ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context;
808
+
809
+ const int64_t ne0 = tensor->ne[0];
810
+
811
+ ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu{};
812
+ ctx->tensor_extras.push_back(extra);
813
+
814
+ for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
815
+ int64_t row_low, row_high;
816
+ get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id);
817
+
818
+ int64_t nrows_split = row_high - row_low;
819
+ if (nrows_split == 0) {
820
+ continue;
821
+ }
822
+
823
+ size_t size = ggml_nbytes_split(tensor, nrows_split);
824
+ const size_t original_size = size;
825
+
826
+ // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
827
+ if (ne0 % MATRIX_ROW_PADDING != 0) {
828
+ size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
829
+ }
830
+
831
+ // FIXME: do not crash if cudaMalloc fails
832
+ // currently, init_tensor cannot fail, it needs to be fixed in ggml-backend first
833
+ ggml_cuda_set_device(id);
834
+ char * buf;
835
+ CUDA_CHECK(ggml_cuda_device_malloc((void**)&buf, size, id));
836
+
837
+ // set padding to 0 to avoid possible NaN values
838
+ if (size > original_size) {
839
+ CUDA_CHECK(cudaMemset(buf + original_size, 0, size - original_size));
840
+ }
841
+
842
+ extra->data_device[id] = buf;
843
+
844
+ for (int64_t is = 0; is < GGML_CUDA_MAX_STREAMS; ++is) {
845
+ CUDA_CHECK(cudaEventCreateWithFlags(&extra->events[id][is], cudaEventDisableTiming));
846
+ }
847
+ }
848
+ tensor->extra = extra;
849
+ return GGML_STATUS_SUCCESS;
850
+ }
851
+
852
+ static void ggml_backend_cuda_split_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
853
+ // split tensors must always be set in their entirety at once
854
+ GGML_ASSERT(offset == 0);
855
+ GGML_ASSERT(size == ggml_nbytes(tensor));
856
+ GGML_ASSERT(ggml_is_contiguous(tensor) && "split buffers only supported for contiguous tensors");
857
+
858
+ ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context;
859
+
860
+ const int64_t ne0 = tensor->ne[0];
861
+ const size_t nb1 = tensor->nb[1];
862
+ ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *)tensor->extra;
863
+
864
+ for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
865
+ int64_t row_low, row_high;
866
+ get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id);
867
+
868
+ int64_t nrows_split = row_high - row_low;
869
+ if (nrows_split == 0) {
870
+ continue;
871
+ }
872
+
873
+ const size_t offset_split = row_low*nb1;
874
+ size_t size = ggml_nbytes_split(tensor, nrows_split);
875
+ const size_t original_size = size;
876
+
877
+ // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
878
+ if (ne0 % MATRIX_ROW_PADDING != 0) {
879
+ size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
880
+ }
881
+
882
+ const char * buf_host = (const char *)data + offset_split;
883
+ CUDA_CHECK(cudaMemcpyAsync(extra->data_device[id], buf_host, original_size, cudaMemcpyHostToDevice, cudaStreamPerThread));
884
+ }
885
+
886
+ for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
887
+ CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
888
+ }
889
+ }
890
+
891
+ static void ggml_backend_cuda_split_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
892
+ // split tensors must always be set in their entirety at once
893
+ GGML_ASSERT(offset == 0);
894
+ GGML_ASSERT(size == ggml_nbytes(tensor));
895
+ GGML_ASSERT(ggml_is_contiguous(tensor) && "split buffers only supported for contiguous tensors");
896
+
897
+ ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context;
898
+
899
+ const int64_t ne0 = tensor->ne[0];
900
+ const size_t nb1 = tensor->nb[1];
901
+ ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *)tensor->extra;
902
+
903
+ for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
904
+ int64_t row_low, row_high;
905
+ get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id);
906
+
907
+ int64_t nrows_split = row_high - row_low;
908
+ if (nrows_split == 0) {
909
+ continue;
910
+ }
911
+
912
+ const size_t offset_split = row_low*nb1;
913
+ size_t size = ggml_nbytes_split(tensor, nrows_split);
914
+ const size_t original_size = size;
915
+
916
+ // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
917
+ if (ne0 % MATRIX_ROW_PADDING != 0) {
918
+ size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
919
+ }
920
+
921
+ char * buf_host = (char *)data + offset_split;
922
+ CUDA_CHECK(cudaMemcpyAsync(buf_host, extra->data_device[id], original_size, cudaMemcpyDeviceToHost, cudaStreamPerThread));
923
+ }
924
+
925
+ for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
926
+ CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
927
+ }
928
+ }
929
+
930
+ static void ggml_backend_cuda_split_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
931
+ GGML_UNUSED(buffer);
932
+ GGML_UNUSED(value);
933
+ }
934
+
935
+ static const ggml_backend_buffer_i ggml_backend_cuda_split_buffer_interface = {
936
+ /* .free_buffer = */ ggml_backend_cuda_split_buffer_free_buffer,
937
+ /* .get_base = */ ggml_backend_cuda_split_buffer_get_base,
938
+ /* .init_tensor = */ ggml_backend_cuda_split_buffer_init_tensor,
939
+ /* .memset_tensor = */ NULL,
940
+ /* .set_tensor = */ ggml_backend_cuda_split_buffer_set_tensor,
941
+ /* .get_tensor = */ ggml_backend_cuda_split_buffer_get_tensor,
942
+ /* .cpy_tensor = */ NULL,
943
+ /* .clear = */ ggml_backend_cuda_split_buffer_clear,
944
+ /* .reset = */ NULL,
945
+ };
946
+
947
+ // cuda split buffer type
948
+
949
+ static const char * ggml_backend_cuda_split_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
950
+ ggml_backend_cuda_split_buffer_type_context * ctx = (ggml_backend_cuda_split_buffer_type_context *)buft->context;
951
+
952
+ return ctx->name.c_str();
953
+ }
954
+
955
+ static bool ggml_backend_buft_is_cuda_split(ggml_backend_buffer_type_t buft) {
956
+ return buft->iface.get_name == ggml_backend_cuda_split_buffer_type_get_name;
957
+ }
958
+
959
+ static ggml_backend_buffer_t ggml_backend_cuda_split_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
960
+ // since we don't know the exact split after rounding, we cannot allocate the device buffers at this point
961
+ // instead, we allocate them for each tensor separately in init_tensor
962
+ // however, the size still represents the maximum cumulative size of all the device buffers after the tensors are allocated,
963
+ // as returned by get_alloc_size. this limit is enforced during tensor allocation by ggml-alloc, so it must be correct.
964
+ ggml_backend_cuda_split_buffer_context * ctx = new ggml_backend_cuda_split_buffer_context();
965
+
966
+ return ggml_backend_buffer_init(buft, ggml_backend_cuda_split_buffer_interface, ctx, size);
967
+ }
968
+
969
+ static size_t ggml_backend_cuda_split_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
970
+ return 128;
971
+
972
+ GGML_UNUSED(buft);
973
+ }
974
+
975
+ static size_t ggml_backend_cuda_split_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
976
+ ggml_backend_cuda_split_buffer_type_context * ctx = (ggml_backend_cuda_split_buffer_type_context *)buft->context;
977
+ GGML_ASSERT(ggml_is_contiguous(tensor) && "split buffers only supported for contiguous tensors");
978
+
979
+ size_t total_size = 0;
980
+
981
+ const int64_t ne0 = tensor->ne[0];
982
+
983
+ for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
984
+ int64_t row_low, row_high;
985
+ get_row_split(&row_low, &row_high, tensor, ctx->tensor_split, id);
986
+
987
+ int64_t nrows_split = row_high - row_low;
988
+ if (nrows_split == 0) {
989
+ continue;
990
+ }
991
+
992
+ total_size += ggml_nbytes_split(tensor, nrows_split);
993
+
994
+ // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
995
+ if (ne0 % MATRIX_ROW_PADDING != 0) {
996
+ total_size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
997
+ }
998
+ }
999
+
1000
+ return total_size;
1001
+ }
1002
+
1003
+ static bool ggml_backend_cuda_split_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
1004
+ return false;
1005
+
1006
+ GGML_UNUSED(buft);
1007
+ }
1008
+
1009
// Function table (buffer type interface) shared by all CUDA split buffer types.
// get_name doubles as the identity check used by ggml_backend_buft_is_cuda_split.
+ static const ggml_backend_buffer_type_i ggml_backend_cuda_split_buffer_type_interface = {
1010
+ /* .get_name = */ ggml_backend_cuda_split_buffer_type_get_name,
1011
+ /* .alloc_buffer = */ ggml_backend_cuda_split_buffer_type_alloc_buffer,
1012
+ /* .get_alignment = */ ggml_backend_cuda_split_buffer_type_get_alignment,
1013
+ /* .get_max_size = */ NULL, // defaults to SIZE_MAX
1014
+ /* .get_alloc_size = */ ggml_backend_cuda_split_buffer_type_get_alloc_size,
1015
+ /* .is_host = */ ggml_backend_cuda_split_buffer_type_is_host,
1016
+ };
1017
+
1018
+ ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(int main_device, const float * tensor_split) {
1019
+ static std::mutex mutex;
1020
+ std::lock_guard<std::mutex> lock(mutex);
1021
+
1022
+ static std::map<std::pair<int, std::array<float, GGML_CUDA_MAX_DEVICES>>, struct ggml_backend_buffer_type> buft_map;
1023
+
1024
+ std::array<float, GGML_CUDA_MAX_DEVICES> tensor_split_arr = {};
1025
+
1026
+ bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + GGML_CUDA_MAX_DEVICES, [](float x) { return x == 0.0f; });
1027
+ if (all_zero) {
1028
+ tensor_split_arr = ggml_cuda_info().default_tensor_split;
1029
+ } else {
1030
+ float split_sum = 0.0f;
1031
+ for (int i = 0; i < ggml_backend_cuda_get_device_count(); ++i) {
1032
+ tensor_split_arr[i] = split_sum;
1033
+ split_sum += tensor_split[i];
1034
+ }
1035
+ for (int i = 0; i < ggml_backend_cuda_get_device_count(); ++i) {
1036
+ tensor_split_arr[i] /= split_sum;
1037
+ }
1038
+ }
1039
+
1040
+ auto it = buft_map.find({main_device, tensor_split_arr});
1041
+ if (it != buft_map.end()) {
1042
+ return &it->second;
1043
+ }
1044
+ auto * ctx = new ggml_backend_cuda_split_buffer_type_context{
1045
+ main_device,
1046
+ tensor_split_arr,
1047
+ GGML_CUDA_NAME + std::to_string(main_device) + "_Split",
1048
+ };
1049
+
1050
+ struct ggml_backend_buffer_type buft {
1051
+ /* .iface = */ ggml_backend_cuda_split_buffer_type_interface,
1052
+ /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cuda_reg(), main_device),
1053
+ /* .context = */ ctx,
1054
+ };
1055
+
1056
+ auto result = buft_map.emplace(std::make_pair(main_device, tensor_split_arr), buft);
1057
+ return &result.first->second;
1058
+ }
1059
+
1060
+ // host buffer type
1061
+
1062
+ static const char * ggml_backend_cuda_host_buffer_type_name(ggml_backend_buffer_type_t buft) {
1063
+ return GGML_CUDA_NAME "_Host";
1064
+
1065
+ GGML_UNUSED(buft);
1066
+ }
1067
+
1068
// free_buffer callback for host buffers created by ggml_backend_cuda_host_buffer_type_alloc_buffer.
+ static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
1069
+ CUDA_CHECK(cudaFreeHost(buffer->context)); // buffer->context is passed to cudaFreeHost as the pointer to release
1070
+ }
1071
+
1072
+ static void * ggml_cuda_host_malloc(size_t size) {
1073
+ if (getenv("GGML_CUDA_NO_PINNED") != nullptr) {
1074
+ return nullptr;
1075
+ }
1076
+
1077
+ void * ptr = nullptr;
1078
+ cudaError_t err = cudaMallocHost((void **) &ptr, size);
1079
+ if (err != cudaSuccess) {
1080
+ // clear the error
1081
+ (void)cudaGetLastError();
1082
+ GGML_LOG_DEBUG("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__,
1083
+ size / 1024.0 / 1024.0, cudaGetErrorString(err));
1084
+ return nullptr;
1085
+ }
1086
+
1087
+ return ptr;
1088
+ }
1089
+
1090
+ static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
1091
+ void * ptr = ggml_cuda_host_malloc(size);
1092
+
1093
+ if (ptr == nullptr) {
1094
+ // fallback to cpu buffer
1095
+ return ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size);
1096
+ }
1097
+
1098
+ ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
1099
+ buffer->buft = buft;
1100
+ buffer->iface.free_buffer = ggml_backend_cuda_host_buffer_free_buffer;
1101
+
1102
+ return buffer;
1103
+ }
1104
+
1105
+ ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
1106
+ static struct ggml_backend_buffer_type ggml_backend_cuda_buffer_type_host = {
1107
+ /* .iface = */ {
1108
+ /* .get_name = */ ggml_backend_cuda_host_buffer_type_name,
1109
+ /* .alloc_buffer = */ ggml_backend_cuda_host_buffer_type_alloc_buffer,
1110
+ /* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
1111
+ /* .get_max_size = */ NULL, // defaults to SIZE_MAX
1112
+ /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
1113
+ /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
1114
+ },
1115
+ /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cuda_reg(), 0),
1116
+ /* .context = */ nullptr,
1117
+ };
1118
+
1119
+ return &ggml_backend_cuda_buffer_type_host;
1120
+ }
1121
+
1122
+ //static bool ggml_backend_buffer_is_cuda_host(ggml_backend_buffer_t buffer) {
1123
+ // return buffer->buft->iface.get_name == ggml_backend_cuda_host_buffer_type_name;
1124
+ //}
1125
+
1126
+ /// kernels
1127
+
1128
// Signature of the per-device matrix multiplication routine that ggml_cuda_op_mul_mat invokes
// for every device / src1 column slice / src1 matrix.
// src0_dd_i, src1_ddf_i, src1_ddq_i and dst_dd_i are the data pointers for the current slice;
// row_low/row_high delimit the src0 rows handled and src1_ncols the number of src1 columns.
+ typedef void (*ggml_cuda_op_mul_mat_t)(
1129
+ ggml_backend_cuda_context & ctx,
1130
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
1131
+ const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
1132
+ const int64_t src1_padded_row_size, cudaStream_t stream);
1133
+
1134
// Largest n_tokens for which ggml_cuda_set_peer_access enables peer access; can be overridden at build time.
+ #ifndef GGML_CUDA_PEER_MAX_BATCH_SIZE
1135
+ #define GGML_CUDA_PEER_MAX_BATCH_SIZE 128
1136
+ #endif // GGML_CUDA_PEER_MAX_BATCH_SIZE
1137
+
1138
// Number of src1 columns processed per step by ggml_cuda_op_mul_mat when a split src0 is spread over more than one device.
+ #define MUL_MAT_SRC1_COL_STRIDE 128
1139
+
1140
+ static cudaError_t ggml_cuda_cpy_tensor_2d(
1141
+ void * dst, const struct ggml_tensor * src, int64_t i3, int64_t i2, int64_t i1_low, int64_t i1_high, cudaStream_t stream) {
1142
+
1143
+ GGML_ASSERT(ggml_backend_buffer_is_cuda(src->buffer));
1144
+ const char * src_ptr = (const char *) src->data;
1145
+ char * dst_ptr = (char *) dst;
1146
+
1147
+ const int64_t ne0 = src->ne[0];
1148
+ const int64_t nb0 = src->nb[0];
1149
+ const int64_t nb1 = src->nb[1];
1150
+ const int64_t nb2 = src->nb[2];
1151
+ const int64_t nb3 = src->nb[3];
1152
+ const enum ggml_type type = src->type;
1153
+ const int64_t ts = ggml_type_size(type);
1154
+ const int64_t bs = ggml_blck_size(type);
1155
+ const int64_t i1_diff = i1_high - i1_low;
1156
+
1157
+ const char * x = src_ptr + i1_low*nb1 + i2*nb2 + i3*nb3;
1158
+ if (nb0 == ts && nb1 == ts*ne0/bs) {
1159
+ return cudaMemcpyAsync(dst_ptr, x, i1_diff*nb1, cudaMemcpyDeviceToDevice, stream);
1160
+ } else if (nb0 == ts) {
1161
+ return cudaMemcpy2DAsync(dst_ptr, ts*ne0/bs, x, nb1, ts*ne0/bs, i1_diff, cudaMemcpyDeviceToDevice, stream);
1162
+ } else {
1163
+ for (int64_t i1 = 0; i1 < i1_diff; i1++) {
1164
+ const void * rx = (const void *) ((const char *) x + i1*nb1);
1165
+ void * rd = (void *) (dst_ptr + i1*ts*ne0/bs);
1166
+ // pretend the row is a matrix with cols=1
1167
+ cudaError_t r = cudaMemcpy2DAsync(rd, ts/bs, rx, nb0, ts/bs, ne0, cudaMemcpyDeviceToDevice, stream);
1168
+ if (r != cudaSuccess) {
1169
+ return r;
1170
+ }
1171
+ }
1172
+ return cudaSuccess;
1173
+ }
1174
+ }
1175
+
1176
// cuBLAS-based implementation of the ggml_cuda_op_mul_mat_t callback.
//
// Multiplies row_high - row_low rows of src0 (data at src0_dd_i) with src1_ncols columns of src1
// (data at src1_ddf_i) and writes FP32 results to dst_dd_i, using the cuBLAS handle and memory pool
// of the currently active device.
//
// One of three paths is taken:
//   - BF16: src0 is BF16, contiguous and the call covers all rows of src0
//   - FP16: device is NVIDIA with cc >= Volta or AMD, and use_fp16 holds (see below)
//   - FP32: everything else; inputs are converted to FP32 where needed and cublasSgemm is used
//
// src1_ddq_i and src1_padded_row_size are not used by this implementation.
+ static void ggml_cuda_op_mul_mat_cublas(
1177
+ ggml_backend_cuda_context & ctx,
1178
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
1179
+ const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
1180
+ const int64_t src1_padded_row_size, cudaStream_t stream) {
1181
+
1182
+ GGML_ASSERT(src0_dd_i != nullptr);
1183
+ GGML_ASSERT(src1_ddf_i != nullptr);
1184
+ GGML_ASSERT(dst_dd_i != nullptr);
1185
+
1186
+ const int64_t ne00 = src0->ne[0];
1187
+ const int64_t ne10 = src1->ne[0];
1188
+
1189
+ const int64_t ne0 = dst->ne[0];
1190
+
1191
+ const int64_t row_diff = row_high - row_low; // number of src0 rows handled by this call
1192
+
1193
+ int id = ggml_cuda_get_device();
1194
+
1195
+ // the main device has a larger memory buffer to hold the results from all GPUs
1196
+ // ldc == nrows of the matrix that cuBLAS writes into
1197
+ int64_t ldc = id == ctx.device ? ne0 : row_diff;
1198
+
1199
+ const int cc = ggml_cuda_info().devices[id].cc;
1200
+
1201
+ const bool use_fp16 = (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT; // F16/quantized, contiguous src0, all rows in this call, default precision requested
1202
+
1203
+ if (src0->type == GGML_TYPE_BF16 && ggml_is_contiguous(src0) && row_diff == src0->ne[1]) { // BF16 path
1204
+ ggml_cuda_pool_alloc<nv_bfloat16> src1_as_bf16(ctx.pool(id));
1205
+ if (src1->type != GGML_TYPE_BF16) {
1206
+ const to_bf16_cuda_t to_bf16_cuda = ggml_get_to_bf16_cuda(src1->type);
1207
+ GGML_ASSERT(to_bf16_cuda != nullptr);
1208
+ size_t ne = src1_ncols*ne10;
1209
+ src1_as_bf16.alloc(ne);
1210
+ to_bf16_cuda(src1_ddf_i, src1_as_bf16.get(), ne, stream);
1211
+ }
1212
+ const nv_bfloat16 * src1_ptr = src1->type == GGML_TYPE_BF16 ? (const nv_bfloat16 *) src1_ddf_i : src1_as_bf16.get();
1213
+ const nv_bfloat16 * src0_ptr = (const nv_bfloat16 *)src0_dd_i;
1214
+ ggml_cuda_pool_alloc<nv_bfloat16> dst_bf16(ctx.pool(id), row_diff*src1_ncols); // temporary BF16 result, converted to FP32 below
1215
+
1216
+ const float alpha_f32 = 1.0f;
1217
+ const float beta_f32 = 0.0f;
1218
+
1219
+ CUBLAS_CHECK(cublasSetStream(ctx.cublas_handle(id), stream));
1220
+ CUBLAS_CHECK(
1221
+ cublasGemmEx(ctx.cublas_handle(id), CUBLAS_OP_T, CUBLAS_OP_N,
1222
+ row_diff, src1_ncols, ne10,
1223
+ &alpha_f32, src0_ptr, CUDA_R_16BF, ne00,
1224
+ src1_ptr, CUDA_R_16BF, ne10,
1225
+ &beta_f32, dst_bf16.get(), CUDA_R_16BF, ldc,
1226
+ CUBLAS_COMPUTE_32F,
1227
+ CUBLAS_GEMM_DEFAULT_TENSOR_OP));
1228
+
1229
+ const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_BF16);
1230
+ to_fp32_cuda(dst_bf16.get(), dst_dd_i, row_diff*src1_ncols, stream);
1231
+ } else if (((GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_VOLTA) || GGML_CUDA_CC_IS_AMD(cc)) && use_fp16) { // FP16 path
1232
+ // convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32
1233
+ ggml_cuda_pool_alloc<half> src0_as_f16(ctx.pool(id));
1234
+ if (src0->type != GGML_TYPE_F16) {
1235
+ const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src0->type);
1236
+ GGML_ASSERT(to_fp16_cuda != nullptr);
1237
+ size_t ne = row_diff*ne00;
1238
+ src0_as_f16.alloc(ne);
1239
+ to_fp16_cuda(src0_dd_i, src0_as_f16.get(), ne, stream);
1240
+ }
1241
+ const half * src0_ptr = src0->type == GGML_TYPE_F16 ? (const half *) src0_dd_i : src0_as_f16.get();
1242
+
1243
+ ggml_cuda_pool_alloc<half> src1_as_f16(ctx.pool(id));
1244
+ if (src1->type != GGML_TYPE_F16) {
1245
+ const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
1246
+ GGML_ASSERT(to_fp16_cuda != nullptr);
1247
+ size_t ne = src1_ncols*ne10;
1248
+ src1_as_f16.alloc(ne);
1249
+ to_fp16_cuda(src1_ddf_i, src1_as_f16.get(), ne, stream);
1250
+ }
1251
+ const half * src1_ptr = src1->type == GGML_TYPE_F16 ? (const half *) src1_ddf_i : src1_as_f16.get();
1252
+
1253
+ CUBLAS_CHECK(cublasSetStream(ctx.cublas_handle(id), stream));
1254
+
1255
+ if (GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA4(cc)) { // CDNA/RDNA4: FP16 inputs, FP32 compute, result written straight to dst_dd_i
1256
+ const float alpha = 1.0f;
1257
+ const float beta = 0.0f;
1258
+ CUBLAS_CHECK(
1259
+ cublasGemmEx(ctx.cublas_handle(id), CUBLAS_OP_T, CUBLAS_OP_N,
1260
+ row_diff, src1_ncols, ne10,
1261
+ &alpha, src0_ptr, CUDA_R_16F, ne00,
1262
+ src1_ptr, CUDA_R_16F, ne10,
1263
+ &beta, dst_dd_i, CUDA_R_32F, ldc,
1264
+ CUBLAS_COMPUTE_32F,
1265
+ CUBLAS_GEMM_DEFAULT_TENSOR_OP));
1266
+ } else { // otherwise: FP16 compute into a temporary FP16 result that is converted to FP32 afterwards
1267
+ ggml_cuda_pool_alloc<half> dst_f16(ctx.pool(id), row_diff*src1_ncols);
1268
+
1269
+ const half alpha_f16 = 1.0f;
1270
+ const half beta_f16 = 0.0f;
1271
+
1272
+ CUBLAS_CHECK(
1273
+ cublasGemmEx(ctx.cublas_handle(id), CUBLAS_OP_T, CUBLAS_OP_N,
1274
+ row_diff, src1_ncols, ne10,
1275
+ &alpha_f16, src0_ptr, CUDA_R_16F, ne00,
1276
+ src1_ptr, CUDA_R_16F, ne10,
1277
+ &beta_f16, dst_f16.get(), CUDA_R_16F, ldc,
1278
+ CUBLAS_COMPUTE_16F,
1279
+ CUBLAS_GEMM_DEFAULT_TENSOR_OP));
1280
+
1281
+ const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
1282
+ to_fp32_cuda(dst_f16.get(), dst_dd_i, row_diff*src1_ncols, stream);
1283
+ }
1284
+ } else { // FP32 path: convert whichever input is not already FP32, then cublasSgemm
1285
+ ggml_cuda_pool_alloc<float> src0_ddq_as_f32(ctx.pool(id));
1286
+ ggml_cuda_pool_alloc<float> src1_ddq_as_f32(ctx.pool(id));
1287
+
1288
+ if (src0->type != GGML_TYPE_F32) {
1289
+ const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type);
1290
+ GGML_ASSERT(to_fp32_cuda != nullptr);
1291
+ src0_ddq_as_f32.alloc(row_diff*ne00);
1292
+ to_fp32_cuda(src0_dd_i, src0_ddq_as_f32.get(), row_diff*ne00, stream);
1293
+ }
1294
+ if (src1->type != GGML_TYPE_F32) {
1295
+ const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src1->type);
1296
+ GGML_ASSERT(to_fp32_cuda != nullptr);
1297
+ src1_ddq_as_f32.alloc(src1_ncols*ne10);
1298
+ to_fp32_cuda(src1_ddf_i, src1_ddq_as_f32.get(), src1_ncols*ne10, stream);
1299
+ }
1300
+
1301
+ const float * src0_ddf_i = src0->type == GGML_TYPE_F32 ? (const float *) src0_dd_i : src0_ddq_as_f32.get();
1302
+ const float * src1_ddf1_i = src1->type == GGML_TYPE_F32 ? (const float *) src1_ddf_i : src1_ddq_as_f32.get();
1303
+
1304
+ const float alpha = 1.0f;
1305
+ const float beta = 0.0f;
1306
+
1307
+ CUBLAS_CHECK(cublasSetStream(ctx.cublas_handle(id), stream));
1308
+ CUBLAS_CHECK(
1309
+ cublasSgemm(ctx.cublas_handle(id), CUBLAS_OP_T, CUBLAS_OP_N,
1310
+ row_diff, src1_ncols, ne10,
1311
+ &alpha, src0_ddf_i, ne00,
1312
+ src1_ddf1_i, ne10,
1313
+ &beta, dst_dd_i, ldc));
1314
+ }
1315
+
1316
+ GGML_UNUSED(dst);
1317
+ GGML_UNUSED(src1_ddq_i);
1318
+ GGML_UNUSED(src1_padded_row_size);
1319
+ }
1320
+
1321
+ static void ggml_cuda_set_peer_access(const int n_tokens, int main_device) {
1322
+ static bool peer_access_enabled = false;
1323
+
1324
+ const bool enable_peer_access = n_tokens <= GGML_CUDA_PEER_MAX_BATCH_SIZE;
1325
+
1326
+ if (peer_access_enabled == enable_peer_access) {
1327
+ return;
1328
+ }
1329
+
1330
+ #ifdef NDEBUG
1331
+ for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
1332
+ ggml_cuda_set_device(id);
1333
+ CUDA_CHECK(cudaDeviceSynchronize());
1334
+ }
1335
+
1336
+ for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
1337
+ ggml_cuda_set_device(id);
1338
+
1339
+ for (int id_other = 0; id_other < ggml_backend_cuda_get_device_count(); ++id_other) {
1340
+ if (id == id_other) {
1341
+ continue;
1342
+ }
1343
+ if (id != main_device && id_other != main_device) {
1344
+ continue;
1345
+ }
1346
+
1347
+ int can_access_peer;
1348
+ CUDA_CHECK(cudaDeviceCanAccessPeer(&can_access_peer, id, id_other));
1349
+ if (can_access_peer) {
1350
+ if (enable_peer_access) {
1351
+ cudaError_t err = cudaDeviceEnablePeerAccess(id_other, 0);
1352
+ if (err != cudaErrorPeerAccessAlreadyEnabled) {
1353
+ CUDA_CHECK(err);
1354
+ } else {
1355
+ // reset the error
1356
+ (void)cudaGetLastError();
1357
+ }
1358
+ } else {
1359
+ cudaError_t err = cudaDeviceDisablePeerAccess(id_other);
1360
+ if (err != cudaErrorPeerAccessNotEnabled) {
1361
+ CUDA_CHECK(err);
1362
+ } else {
1363
+ // reset the error
1364
+ (void)cudaGetLastError();
1365
+ }
1366
+ }
1367
+ }
1368
+ }
1369
+ }
1370
+
1371
+ ggml_cuda_set_device(main_device);
1372
+ #endif // NDEBUG
1373
+
1374
+ peer_access_enabled = enable_peer_access;
1375
+
1376
+ GGML_UNUSED(main_device);
1377
+ }
1378
+
1379
+ static cudaError_t ggml_cuda_Memcpy2DPeerAsync(
1380
+ void * dst, int dstDevice, size_t dpitch, void * src, int srcDevice, size_t spitch, size_t width, size_t height, cudaStream_t stream) {
1381
+
1382
+ #if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
1383
+ // cudaMemcpy2DAsync may fail with copies between vmm pools of different devices
1384
+ cudaMemcpy3DPeerParms p = {};
1385
+ p.dstDevice = dstDevice;
1386
+ p.dstPtr = make_cudaPitchedPtr(dst, dpitch, dpitch, height);
1387
+ p.srcDevice = srcDevice;
1388
+ p.srcPtr = make_cudaPitchedPtr(src, spitch, spitch, height);
1389
+ p.extent = make_cudaExtent(width, height, 1);
1390
+ return cudaMemcpy3DPeerAsync(&p, stream);
1391
+ #else
1392
+ // HIP does not support cudaMemcpy3DPeerAsync or vmm pools
1393
+ GGML_UNUSED(dstDevice);
1394
+ GGML_UNUSED(srcDevice);
1395
+ return cudaMemcpy2DAsync(dst, dpitch, src, spitch, width, height, cudaMemcpyDeviceToDevice, stream);
1396
+ #endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
1397
+ }
1398
+
1399
// Generic driver that runs the matrix multiplication routine `op` for dst = src0 x src1,
// on ctx.device only or, when src0 lives in a CUDA split buffer, on every device that owns rows of src0.
//
// ctx           - backend context; ctx.device is treated as the main device
// src0          - first operand; for split buffers its rows are distributed according to the
//                 tensor_split stored in the buffer type context
// src1          - second operand, must be in a CUDA buffer; must be F32 unless ne[2] == ne[3] == 1
// dst           - result tensor, must be in a CUDA buffer
// op            - routine invoked per device, per src1 column slice and per src1 matrix
// quantize_src1 - optional; when set, src1 is additionally converted into a q8_1 buffer that is passed to op
//
// Structure of the function:
//   1. determine the src0 row range [row_low, row_high) of every device
//   2. set up per-device pointers / temporary pool allocations for src0, src1 and dst
//   3. loop over src1 column slices, devices and src1 matrices: stage inputs, call op, copy results into dst
//   4. make the main device's stream wait on the events recorded by the other devices/streams
+ static void ggml_cuda_op_mul_mat(
1400
+ ggml_backend_cuda_context & ctx,
1401
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, ggml_cuda_op_mul_mat_t op,
1402
+ quantize_cuda_t quantize_src1) {
1403
+
1404
+ const int64_t ne00 = src0->ne[0];
1405
+ const int64_t ne01 = src0->ne[1];
1406
+ const int64_t ne02 = src0->ne[2];
1407
+ const int64_t ne03 = src0->ne[3];
1408
+
1409
+ const int64_t ne10 = src1->ne[0];
1410
+ const int64_t ne11 = src1->ne[1];
1411
+ const int64_t ne12 = src1->ne[2];
1412
+ const int64_t ne13 = src1->ne[3];
1413
+ const int64_t nrows1 = ggml_nrows(src1);
1414
+
1415
+ const int64_t ne0 = dst->ne[0];
1416
+ const int64_t ne1 = dst->ne[1];
1417
+
1418
+ // const int64_t nb10 = src1->nb[0];
1419
+ const int64_t nb11 = src1->nb[1];
1420
+ const int64_t nb12 = src1->nb[2];
1421
+ const int64_t nb13 = src1->nb[3];
1422
+
1423
+ const int64_t nb2 = dst->nb[2];
1424
+ const int64_t nb3 = dst->nb[3];
1425
+
1426
+ GGML_ASSERT(ggml_backend_buffer_is_cuda(dst->buffer));
1427
+ GGML_ASSERT(ggml_backend_buffer_is_cuda(src1->buffer));
1428
+ ggml_backend_cuda_buffer_context * src1_ctx = (ggml_backend_cuda_buffer_context *) src1->buffer->context;
1429
+ ggml_backend_cuda_buffer_context * dst_ctx = (ggml_backend_cuda_buffer_context *) dst->buffer->context;
1430
+
1431
+ GGML_ASSERT(src1->type == GGML_TYPE_F32 || (src1->ne[2] == 1 && src1->ne[3] == 1));
1432
+
1433
+ GGML_ASSERT(ne12 % ne02 == 0);
1434
+ GGML_ASSERT(ne13 % ne03 == 0);
1435
+
1436
+ const int64_t i02_divisor = ne12 / ne02; // number of src1 matrices in dim 2 that share one src0 matrix
1437
+ const int64_t i03_divisor = ne13 / ne03; // same for dim 3
1438
+
1439
+ const size_t src0_ts = ggml_type_size(src0->type);
1440
+ const size_t src0_bs = ggml_blck_size(src0->type);
1441
+ const size_t q8_1_ts = sizeof(block_q8_1);
1442
+ const size_t q8_1_bs = QK8_1;
1443
+
1444
+ const bool src0_is_contiguous = ggml_is_contiguous(src0);
1445
+ const bool src1_is_contiguous = ggml_is_contiguous(src1);
1446
+
1447
+ const int64_t src1_padded_col_size = GGML_PAD(ne10, MATRIX_ROW_PADDING);
1448
+
1449
+ const bool split = ggml_backend_buft_is_cuda_split(src0->buffer->buft);
1450
+ GGML_ASSERT(!(split && ne02 > 1));
1451
+ GGML_ASSERT(!(split && ne03 > 1));
1452
+ GGML_ASSERT(!(split && ne02 < ne12));
1453
+ GGML_ASSERT(!(split && ne03 < ne13));
1454
+
1455
+ ggml_tensor_extra_gpu * src0_extra = split ? (ggml_tensor_extra_gpu *) src0->extra : nullptr;
1456
+
1457
+
1458
+ std::array<float, GGML_CUDA_MAX_DEVICES> tensor_split; // only assigned and only read when split is true
1459
+ if (split) {
1460
+ ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) src0->buffer->buft->context;
1461
+ tensor_split = buft_ctx->tensor_split;
1462
+ }
1463
+
1464
+ struct dev_data { // per-device state for this call
1465
+ int cc;
1466
+
1467
+ ggml_cuda_pool_alloc<char> src0_dd_alloc;
1468
+ ggml_cuda_pool_alloc<float> src1_ddf_alloc;
1469
+ ggml_cuda_pool_alloc<char> src1_ddq_alloc;
1470
+ ggml_cuda_pool_alloc<float> dst_dd_alloc;
1471
+
1472
+ char * src0_dd = nullptr;
1473
+ float * src1_ddf = nullptr; // float
1474
+ char * src1_ddq = nullptr; // q8_1
1475
+ float * dst_dd = nullptr;
1476
+
1477
+ int64_t row_low;
1478
+ int64_t row_high;
1479
+ };
1480
+
1481
+ dev_data dev[GGML_CUDA_MAX_DEVICES];
1482
+
1483
+ int used_devices = 0;
1484
+
1485
+ for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
1486
+ dev[id].cc = ggml_cuda_info().devices[id].cc;
1487
+
1488
+ // by default, use all rows
1489
+ dev[id].row_low = 0;
1490
+ dev[id].row_high = ne01;
1491
+
1492
+ // for multi GPU, get the row boundaries from tensor split
1493
+ // and round to mul_mat_q tile sizes
1494
+ if (split) {
1495
+ const int64_t rounding = get_row_rounding(tensor_split);
1496
+
1497
+ if (id != 0) {
1498
+ dev[id].row_low = ne01*tensor_split[id];
1499
+ if (dev[id].row_low < ne01) {
1500
+ dev[id].row_low -= dev[id].row_low % rounding;
1501
+ }
1502
+ }
1503
+
1504
+ if (id != ggml_backend_cuda_get_device_count() - 1) {
1505
+ dev[id].row_high = ne01*tensor_split[id + 1];
1506
+ if (dev[id].row_high < ne01) {
1507
+ dev[id].row_high -= dev[id].row_high % rounding;
1508
+ }
1509
+ }
1510
+ }
1511
+ }
1512
+
1513
+ for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
1514
+ if ((!split && id != ctx.device) || dev[id].row_low == dev[id].row_high) { // skip devices that do not take part or own no rows
1515
+ continue;
1516
+ }
1517
+
1518
+ used_devices++;
1519
+
1520
+ const bool src1_on_device = id == src1_ctx->device;
1521
+ const bool dst_on_device = id == dst_ctx->device;
1522
+
1523
+ ggml_cuda_set_device(id);
1524
+ cudaStream_t stream = ctx.stream(id, 0);
1525
+
1526
+ if (src0_is_contiguous) {
1527
+ dev[id].src0_dd = split ? (char *) src0_extra->data_device[id] : (char *) src0->data;
1528
+ } else {
1529
+ // If src0 is not contiguous it will be copied to a temporary buffer.
1530
+ // This buffer needs to be cleared entirely because multiple regions will function as padding.
1531
+ const size_t nbytes_data = ggml_nbytes(src0);
1532
+ const size_t nbytes_padding = ggml_row_size(src0->type, MATRIX_ROW_PADDING - ne00 % MATRIX_ROW_PADDING);
1533
+ dev[id].src0_dd = dev[id].src0_dd_alloc.alloc(ctx.pool(id), nbytes_data + nbytes_padding);
1534
+ CUDA_CHECK(cudaMemsetAsync(dev[id].src0_dd, 0, nbytes_data + nbytes_padding, stream));
1535
+ }
1536
+
1537
+ // If src0 is on a temporary compute buffer (partial offloading) there may be some padding that needs to be cleared:
1538
+ if (ne00 % MATRIX_ROW_PADDING != 0 && ggml_is_quantized(src0->type) && ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE && src0->view_src == nullptr) {
1539
+ GGML_ASSERT(ggml_is_contiguously_allocated(src0));
1540
+ GGML_ASSERT(!src0->view_src);
1541
+ const size_t nbytes_data = ggml_row_size(src0->type, (dev[id].row_high - dev[id].row_low)*ne00);
1542
+ const size_t nbytes_padding = ggml_row_size(src0->type, MATRIX_ROW_PADDING - ne00 % MATRIX_ROW_PADDING);
1543
+ CUDA_CHECK(cudaMemsetAsync(dev[id].src0_dd + nbytes_data, 0, nbytes_padding, stream));
1544
+ }
1545
+
1546
+ if (src1_on_device && src1_is_contiguous) {
1547
+ dev[id].src1_ddf = (float *) src1->data;
1548
+ } else {
1549
+ dev[id].src1_ddf = dev[id].src1_ddf_alloc.alloc(ctx.pool(id), ggml_nelements(src1));
1550
+ }
1551
+
1552
+ if (quantize_src1) {
1553
+ size_t src_1_ddq_size = nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs;
1554
+ if (quantize_src1 == quantize_mmq_q8_1_cuda) {
1555
+ src_1_ddq_size += get_mmq_x_max_host(dev[id].cc)*sizeof(block_q8_1_mmq);
1556
+ }
1557
+ dev[id].src1_ddq = dev[id].src1_ddq_alloc.alloc(ctx.pool(id), src_1_ddq_size);
1558
+
1559
+ if (src1_on_device && src1_is_contiguous) { // contiguous src1 on this device is quantized once, up front
1560
+ quantize_src1(
1561
+ dev[id].src1_ddf, nullptr, dev[id].src1_ddq, src0->type, ne10,
1562
+ nb11/sizeof(float), nb12/sizeof(float), nb13/sizeof(float),
1563
+ src1_padded_col_size, ne11, ne12, ne13, stream);
1564
+ CUDA_CHECK(cudaGetLastError());
1565
+ }
1566
+ }
1567
+
1568
+ if (dst_on_device) {
1569
+ dev[id].dst_dd = (float *) dst->data;
1570
+ } else {
1571
+ const size_t size_dst_ddf = split ? (dev[id].row_high - dev[id].row_low)*ne1 : ggml_nelements(dst);
1572
+ dev[id].dst_dd = dev[id].dst_dd_alloc.alloc(ctx.pool(id), size_dst_ddf);
1573
+ }
1574
+ }
1575
+
1576
+ // if multiple devices are used they need to wait for the main device
1577
+ // here an event is recorded that signals that the main device has finished calculating the input data
1578
+ if (split && used_devices > 1) {
1579
+ ggml_cuda_set_device(ctx.device);
1580
+ CUDA_CHECK(cudaEventRecord(src0_extra->events[ctx.device][0], ctx.stream()));
1581
+ }
1582
+
1583
+ const int64_t src1_col_stride = split && used_devices > 1 ? MUL_MAT_SRC1_COL_STRIDE : ne11; // slice src1 columns only when several devices are used
1584
+ for (int64_t src1_col_0 = 0; src1_col_0 < ne11; src1_col_0 += src1_col_stride) {
1585
+ const int64_t is = split ? (src1_col_0/src1_col_stride) % GGML_CUDA_MAX_STREAMS : 0; // stream index for this column slice
1586
+ const int64_t src1_ncols = src1_col_0 + src1_col_stride > ne11 ? ne11 - src1_col_0 : src1_col_stride;
1587
+
1588
+ for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
1589
+ if ((!split && id != ctx.device) || dev[id].row_low == dev[id].row_high) {
1590
+ continue;
1591
+ }
1592
+
1593
+ const bool src1_on_device = id == src1_ctx->device;
1594
+ const bool dst_on_device = id == dst_ctx->device;
1595
+ const int64_t row_diff = dev[id].row_high - dev[id].row_low;
1596
+
1597
+ ggml_cuda_set_device(id);
1598
+ cudaStream_t stream = ctx.stream(id, is);
1599
+
1600
+ // wait for main GPU data if necessary
1601
+ if (split && (id != ctx.device || is != 0)) {
1602
+ CUDA_CHECK(cudaStreamWaitEvent(stream, src0_extra->events[ctx.device][0], 0));
1603
+ }
1604
+
1605
+ for (int64_t i0 = 0; i0 < ne13*ne12; ++i0) { // i0 enumerates the src1 matrices (dims 2 and 3 flattened)
1606
+ const int64_t i03 = i0 / ne12;
1607
+ const int64_t i02 = i0 % ne12;
1608
+
1609
+ size_t src1_ddq_i_offset = i0*ne11 * src1_padded_col_size*q8_1_ts/q8_1_bs;
1610
+ if (quantize_src1 == quantize_mmq_q8_1_cuda) {
1611
+ src1_ddq_i_offset += src1_col_0 * sizeof(block_q8_1_mmq);
1612
+ } else {
1613
+ src1_ddq_i_offset += src1_col_0 * src1_padded_col_size*q8_1_ts/q8_1_bs;
1614
+ }
1615
+
1616
+ // for split tensors the data begins at i0 == i0_offset_low
1617
+ const size_t nbytes_src0_matrix = ne01*ne00*src0_ts / src0_bs;
1618
+ char * src0_dd_i = dev[id].src0_dd + ((i03/i03_divisor)*ne02 + (i02/i02_divisor)) * nbytes_src0_matrix;
1619
+ float * src1_ddf_i = dev[id].src1_ddf + (i0*ne11 + src1_col_0) * ne10;
1620
+ char * src1_ddq_i = dev[id].src1_ddq + src1_ddq_i_offset;
1621
+ float * dst_dd_i = dev[id].dst_dd + (i0*ne1 + src1_col_0) * (dst_on_device ? ne0 : row_diff);
1622
+
1623
+ // the main device memory buffer can be on VRAM scratch, with space for all partial results
1624
+ // in that case an offset on dst_ddf_i is needed
1625
+ if (id == ctx.device) {
1626
+ dst_dd_i += dev[id].row_low; // offset is 0 if no tensor split
1627
+ }
1628
+
1629
+ // copy src0, src1 to device if necessary
1630
+ if (src1_is_contiguous) {
1631
+ if (id != ctx.device) {
1632
+ if (quantize_src1) {
1633
+ char * src1_ddq_i_source = dev[ctx.device].src1_ddq + src1_ddq_i_offset;
1634
+ if (quantize_src1 == quantize_mmq_q8_1_cuda) {
1635
+ const size_t pitch = ne11*sizeof(block_q8_1_mmq);
1636
+ const size_t width = src1_ncols*sizeof(block_q8_1_mmq);
1637
+ const size_t height = src1_padded_col_size/(4*QK8_1);
1638
+ CUDA_CHECK(ggml_cuda_Memcpy2DPeerAsync(src1_ddq_i, id, pitch, src1_ddq_i_source, ctx.device, pitch, width, height, stream));
1639
+ } else {
1640
+ CUDA_CHECK(cudaMemcpyPeerAsync(
1641
+ src1_ddq_i, id, src1_ddq_i_source, ctx.device, src1_ncols*src1_padded_col_size*q8_1_ts/q8_1_bs, stream));
1642
+ }
1643
+ } else {
1644
+ float * src1_ddf_i_source = (float *) src1->data;
1645
+ src1_ddf_i_source += (i0*ne11 + src1_col_0) * ne10;
1646
+ CUDA_CHECK(cudaMemcpyPeerAsync(src1_ddf_i, id, src1_ddf_i_source, ctx.device,
1647
+ src1_ncols*ne10*sizeof(float), stream));
1648
+ }
1649
+ }
1650
+ } else if (src1_on_device && !src1_is_contiguous) {
1651
+ CUDA_CHECK(ggml_cuda_cpy_tensor_2d(
1652
+ src1_ddf_i, src1, i03, i02, src1_col_0, src1_col_0+src1_ncols, stream));
1653
+ } else {
1654
+ GGML_ABORT("fatal error");
1655
+ }
1656
+
1657
+ if (quantize_src1 && !src1_is_contiguous) { // non-contiguous src1 is quantized per slice, after it was packed above
1658
+ quantize_src1(
1659
+ src1_ddf_i, nullptr, src1_ddq_i, src0->type, ne10, ne10, ne11*ne10, ne12*ne11*ne10,
1660
+ src1_padded_col_size, src1_ncols, 1, 1, stream);
1661
+ CUDA_CHECK(cudaGetLastError());
1662
+ }
1663
+
1664
+ if (src1_col_0 == 0 && !src0_is_contiguous && i03 % i03_divisor == 0 && i02 % i02_divisor == 0) { // pack each non-contiguous src0 matrix once, on the first column slice
1665
+ CUDA_CHECK(ggml_cuda_cpy_tensor_2d(
1666
+ src0_dd_i, src0, i03/i03_divisor, i02/i02_divisor, dev[id].row_low, dev[id].row_high, stream));
1667
+ }
1668
+
1669
+ // do the computation
1670
+ op(ctx, src0, src1, dst, src0_dd_i, src1_ddf_i, src1_ddq_i, dst_dd_i,
1671
+ dev[id].row_low, dev[id].row_high, src1_ncols, src1_padded_col_size, stream);
1672
+ CUDA_CHECK(cudaGetLastError());
1673
+
1674
+ // copy dst to host or other device if necessary
1675
+ if (!dst_on_device) {
1676
+ void * dst_off_device = dst->data;
1677
+ if (split) {
1678
+ // src0 = weight matrix is saved as a transposed matrix for better memory layout.
1679
+ // dst is NOT transposed.
1680
+ // The outputs of matrix matrix multiplications can therefore NOT simply be concatenated for >1 GPU.
1681
+ // Instead they need to be copied to the correct slice in ne0 = dst row index.
1682
+ // If dst is a vector with ne0 == 1 then you don't have to do this but it still produces correct results.
1683
+ float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
1684
+ GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
1685
+ dhf_dst_i += src1_col_0*ne0 + dev[id].row_low;
1686
+ CUDA_CHECK(ggml_cuda_Memcpy2DPeerAsync(
1687
+ dhf_dst_i, ctx.device, ne0*sizeof(float), dst_dd_i, id, row_diff*sizeof(float), row_diff*sizeof(float), src1_ncols, stream));
1688
+ } else {
1689
+ float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
1690
+ GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
1691
+ dhf_dst_i += src1_col_0*ne0;
1692
+ CUDA_CHECK(cudaMemcpyAsync(dhf_dst_i, dst_dd_i, src1_ncols*ne0*sizeof(float), cudaMemcpyDeviceToDevice, stream));
1693
+ }
1694
+ }
1695
+
1696
+ // add event for the main device to wait on until other device is done
1697
+ if (split && (id != ctx.device || is != 0)) {
1698
+ CUDA_CHECK(cudaEventRecord(src0_extra->events[id][is], stream));
1699
+ }
1700
+ }
1701
+ }
1702
+ }
1703
+
1704
+ // main device waits for all other devices to be finished
1705
+ if (split && ggml_backend_cuda_get_device_count() > 1) {
1706
+ int64_t is_max = (ne11 + MUL_MAT_SRC1_COL_STRIDE - 1) / MUL_MAT_SRC1_COL_STRIDE;
1707
+ is_max = is_max <= GGML_CUDA_MAX_STREAMS ? is_max : GGML_CUDA_MAX_STREAMS;
1708
+
1709
+ ggml_cuda_set_device(ctx.device);
1710
+ for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
1711
+ if (dev[id].row_low == dev[id].row_high) {
1712
+ continue;
1713
+ }
1714
+ for (int64_t is = 0; is < is_max; ++is) {
1715
+ CUDA_CHECK(cudaStreamWaitEvent(ctx.stream(), src0_extra->events[id][is], 0));
1716
+ }
1717
+ }
1718
+ }
1719
+ }
1720
+
1721
+ static __global__ void k_compute_batched_ptrs(
1722
+ const half * src0_as_f16, const half * src1_as_f16, char * dst,
1723
+ const void ** ptrs_src, void ** ptrs_dst,
1724
+ int64_t ne12, int64_t ne13,
1725
+ int64_t ne23,
1726
+ size_t nb02, size_t nb03,
1727
+ size_t nb12, size_t nb13,
1728
+ size_t nbd2, size_t nbd3,
1729
+ int64_t r2, int64_t r3) {
1730
+ const int64_t i13 = blockIdx.x * blockDim.x + threadIdx.x;
1731
+ const int64_t i12 = blockIdx.y * blockDim.y + threadIdx.y;
1732
+
1733
+ if (i13 >= ne13 || i12 >= ne12) {
1734
+ return;
1735
+ }
1736
+
1737
+ const int64_t i03 = i13 / r3;
1738
+ const int64_t i02 = i12 / r2;
1739
+
1740
+ ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_as_f16 + i02*nb02 + i03*nb03;
1741
+ ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_as_f16 + i12*nb12 + i13*nb13;
1742
+ ptrs_dst[0*ne23 + i12 + i13*ne12] = ( char *) dst + i12*nbd2 + i13*nbd3;
1743
+ }
1744
+
1745
+ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
1746
+ GGML_ASSERT(!ggml_is_transposed(src0));
1747
+ GGML_ASSERT(!ggml_is_transposed(src1));
1748
+
1749
+ GGML_ASSERT(ggml_backend_buffer_is_cuda(src0->buffer));
1750
+ GGML_ASSERT(src0->type == GGML_TYPE_F16);
1751
+
1752
+ // Byte offsets and tensor dimensions are currently used in an inconsistent way for dst.
1753
+ // As long as dst is contiguous this does not matter though.
1754
+ GGML_ASSERT(ggml_is_contiguous(dst));
1755
+
1756
+ GGML_TENSOR_BINARY_OP_LOCALS
1757
+
1758
+ const int64_t ne_dst = ggml_nelements(dst);
1759
+
1760
+ cudaStream_t main_stream = ctx.stream();
1761
+
1762
+ CUBLAS_CHECK(cublasSetStream(ctx.cublas_handle(), main_stream));
1763
+
1764
+ const half * src0_f16 = (const half *) src0->data;
1765
+ float * dst_ddf = (float *) dst->data;
1766
+
1767
+ const half * src1_f16 = (const half *) src1->data;
1768
+ const size_t ts_src1 = ggml_type_size(src1->type);
1769
+ GGML_ASSERT(nb10 == ts_src1);
1770
+ int64_t s11 = nb11 / ts_src1;
1771
+ int64_t s12 = nb12 / ts_src1;
1772
+ int64_t s13 = nb13 / ts_src1;
1773
+ ggml_cuda_pool_alloc<half> src1_f16_alloc(ctx.pool());
1774
+
1775
+ // convert src1 to fp16
1776
+ if (src1->type != GGML_TYPE_F16) {
1777
+ const to_fp16_nc_cuda_t to_fp16_cuda = ggml_get_to_fp16_nc_cuda(src1->type);
1778
+ const int64_t ne_src1 = ggml_nelements(src1);
1779
+ src1_f16_alloc.alloc(ne_src1);
1780
+ GGML_ASSERT(to_fp16_cuda != nullptr);
1781
+
1782
+ to_fp16_cuda(src1_f16, src1_f16_alloc.get(), ne10, ne11, ne12, ne13, s11, s12, s13, main_stream);
1783
+
1784
+ src1_f16 = src1_f16_alloc.get();
1785
+ s11 = ne10;
1786
+ s12 = ne11*s11;
1787
+ s13 = ne12*s12;
1788
+ }
1789
+
1790
+ ggml_cuda_pool_alloc<half> dst_f16(ctx.pool());
1791
+ char * dst_t;
1792
+
1793
+ cublasComputeType_t cu_compute_type = CUBLAS_COMPUTE_16F;
1794
+ cudaDataType_t cu_data_type = CUDA_R_16F;
1795
+
1796
+ // dst strides
1797
+ size_t nbd2 = dst->nb[2];
1798
+ size_t nbd3 = dst->nb[3];
1799
+
1800
+ const half alpha_f16 = 1.0f;
1801
+ const half beta_f16 = 0.0f;
1802
+
1803
+ const float alpha_f32 = 1.0f;
1804
+ const float beta_f32 = 0.0f;
1805
+
1806
+ const void * alpha = &alpha_f16;
1807
+ const void * beta = &beta_f16;
1808
+
1809
+ if (dst->op_params[0] == GGML_PREC_DEFAULT) {
1810
+ dst_t = (char *) dst_f16.alloc(ne_dst);
1811
+
1812
+ nbd2 /= sizeof(float) / sizeof(half);
1813
+ nbd3 /= sizeof(float) / sizeof(half);
1814
+ } else {
1815
+ dst_t = (char *) dst_ddf;
1816
+
1817
+ cu_compute_type = CUBLAS_COMPUTE_32F;
1818
+ cu_data_type = CUDA_R_32F;
1819
+
1820
+ alpha = &alpha_f32;
1821
+ beta = &beta_f32;
1822
+ }
1823
+
1824
+ int id = ggml_cuda_get_device();
1825
+ const int cc = ggml_cuda_info().devices[id].cc;
1826
+ if (GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA4(cc)) {
1827
+ cu_compute_type = CUBLAS_COMPUTE_32F;
1828
+ alpha = &alpha_f32;
1829
+ beta = &beta_f32;
1830
+ }
1831
+
1832
+ GGML_ASSERT(ne12 % ne02 == 0);
1833
+ GGML_ASSERT(ne13 % ne03 == 0);
1834
+
1835
+ // broadcast factors
1836
+ const int64_t r2 = ne12/ne02;
1837
+ const int64_t r3 = ne13/ne03;
1838
+
1839
+ #if 0
1840
+ // use cublasGemmEx
1841
+ {
1842
+ for (int i13 = 0; i13 < ne13; ++i13) {
1843
+ for (int i12 = 0; i12 < ne12; ++i12) {
1844
+ int i03 = i13 / r3;
1845
+ int i02 = i12 / r2;
1846
+
1847
+ CUBLAS_CHECK(
1848
+ cublasGemmEx(ctx.cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_N,
1849
+ ne01, ne11, ne10,
1850
+ alpha, (const char *) src0_f16 + i03*nb03 + i02*nb02, CUDA_R_16F, nb01/sizeof(half),
1851
+ src1_f16 + i13*s13 + i12*s12, CUDA_R_16F, s11,
1852
+ beta, ( char *) dst_t + i13*nbd3 + i12*nbd2, cu_data_type, ne0,
1853
+ cu_compute_type,
1854
+ CUBLAS_GEMM_DEFAULT_TENSOR_OP));
1855
+ }
1856
+ }
1857
+ }
1858
+ #else
1859
+ if (r2 == 1 && r3 == 1 && ggml_is_contiguous_2(src0) && ggml_is_contiguous_2(src1)) {
1860
+ // there is no broadcast and src0, src1 are contiguous across dims 2, 3
1861
+ // use cublasGemmStridedBatchedEx
1862
+ CUBLAS_CHECK(
1863
+ cublasGemmStridedBatchedEx(ctx.cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_N,
1864
+ ne01, ne11, ne10,
1865
+ alpha, src0_f16, CUDA_R_16F, nb01/nb00, nb02/nb00, // strideA
1866
+ src1_f16, CUDA_R_16F, s11, s12, // strideB
1867
+ beta, dst_t, cu_data_type, ne0, ne1*ne0, // strideC
1868
+ ne12*ne13,
1869
+ cu_compute_type,
1870
+ CUBLAS_GEMM_DEFAULT_TENSOR_OP));
1871
+ } else {
1872
+ // use cublasGemmBatchedEx
1873
+ const int64_t ne23 = ne12*ne13;
1874
+
1875
+ ggml_cuda_pool_alloc<const void *> ptrs_src(ctx.pool(), 2*ne23);
1876
+ ggml_cuda_pool_alloc< void *> ptrs_dst(ctx.pool(), 1*ne23);
1877
+
1878
+ dim3 block_dims(ne13, ne12);
1879
+ k_compute_batched_ptrs<<<1, block_dims, 0, main_stream>>>(
1880
+ src0_f16, src1_f16, dst_t,
1881
+ ptrs_src.get(), ptrs_dst.get(),
1882
+ ne12, ne13,
1883
+ ne23,
1884
+ nb02, nb03,
1885
+ src1->type == GGML_TYPE_F16 ? nb12 : s12*sizeof(half),
1886
+ src1->type == GGML_TYPE_F16 ? nb13 : s13*sizeof(half),
1887
+ nbd2, nbd3,
1888
+ r2, r3);
1889
+ CUDA_CHECK(cudaGetLastError());
1890
+
1891
+ CUBLAS_CHECK(
1892
+ cublasGemmBatchedEx(ctx.cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_N,
1893
+ ne01, ne11, ne10,
1894
+ alpha, (const void **) (ptrs_src.get() + 0*ne23), CUDA_R_16F, nb01/nb00,
1895
+ (const void **) (ptrs_src.get() + 1*ne23), CUDA_R_16F, s11,
1896
+ beta, ( void **) (ptrs_dst.get() + 0*ne23), cu_data_type, ne0,
1897
+ ne23,
1898
+ cu_compute_type,
1899
+ CUBLAS_GEMM_DEFAULT_TENSOR_OP));
1900
+ }
1901
+ #endif
1902
+
1903
+ if (dst->op_params[0] == GGML_PREC_DEFAULT && cu_data_type == CUDA_R_16F) {
1904
+ const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
1905
+ to_fp32_cuda(dst_f16.get(), dst_ddf, ne_dst, main_stream);
1906
+ }
1907
+ }
1908
+
1909
+ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
1910
+ const bool split = ggml_backend_buft_is_cuda_split(src0->buffer->buft);
1911
+
1912
+ // If src0 is a temporary compute buffer it may have some padding that needs to be cleared for mul_mat_vec_q or mul_mat_q.
1913
+ // But if src0 is also a view of another tensor then this cannot be done safely because it may overwrite valid tensor data.
1914
+ // Therefore, in such cases use cuBLAS.
1915
+ const bool bad_padding_clear = ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE
1916
+ && ggml_nbytes(src0) != ggml_backend_buffer_get_alloc_size(src0->buffer, src0) && src0->view_src;
1917
+
1918
+ bool use_mul_mat_vec = (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16)
1919
+ && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
1920
+ && src0->ne[0] % 2 == 0 && src1->ne[1] == 1;
1921
+ bool use_mul_mat_vec_q = ggml_is_quantized(src0->type) && !bad_padding_clear
1922
+ && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
1923
+ && src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;
1924
+ bool use_mul_mat_q = ggml_is_quantized(src0->type) && !bad_padding_clear
1925
+ && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
1926
+
1927
+ bool any_gpus_with_slow_fp16 = false;
1928
+ bool any_gpus_without_fp16_mma = false;
1929
+
1930
+ if (split) {
1931
+ ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) src0->buffer->buft->context;
1932
+ auto & tensor_split = buft_ctx->tensor_split;
1933
+ for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
1934
+ // skip devices that are not going to do any work:
1935
+ if (tensor_split[id] >= (id + 1 < ggml_backend_cuda_get_device_count() ? tensor_split[id + 1] : 1.0f)) {
1936
+ continue;
1937
+ }
1938
+
1939
+ const int cc = ggml_cuda_info().devices[id].cc;
1940
+ use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
1941
+ any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_hardware_available(cc);
1942
+ any_gpus_without_fp16_mma = any_gpus_without_fp16_mma || !fp16_mma_hardware_available(cc);
1943
+ }
1944
+ } else {
1945
+ const int cc = ggml_cuda_info().devices[ctx.device].cc;
1946
+ use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
1947
+ any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_hardware_available(cc);
1948
+ any_gpus_without_fp16_mma = any_gpus_without_fp16_mma || !fp16_mma_hardware_available(cc);
1949
+ }
1950
+
1951
+ // debug helpers
1952
+ //printf("src0: %8d %8d %8d %8d\n", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]);
1953
+ //printf(" %8d %8d %8d %8d\n", src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3]);
1954
+ //printf("src1: %8d %8d %8d %8d\n", src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3]);
1955
+ //printf(" %8d %8d %8d %8d\n", src1->nb[0], src1->nb[1], src1->nb[2], src1->nb[3]);
1956
+ //printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
1957
+ //printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);
1958
+
1959
+ if (!split && use_mul_mat_vec && (src0->ne[1] <= MMV_MAX_ROWS || any_gpus_without_fp16_mma)) {
1960
+ // the custom F16 vector kernel can be used over batched cuBLAS GEMM
1961
+ // but this is only faster for GPUs without tensor cores or with a thin src0 matrix (particularly KQV in attention)
1962
+ ggml_cuda_mul_mat_vec(ctx, src0, src1, nullptr, dst);
1963
+ } else if (!split && use_mul_mat_vec_q) {
1964
+ ggml_cuda_mul_mat_vec_q(ctx, src0, src1, nullptr, dst);
1965
+ } else if (!split && use_mul_mat_q) {
1966
+ ggml_cuda_mul_mat_q(ctx, src0, src1, nullptr, dst);
1967
+ } else if (!split && src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || !any_gpus_with_slow_fp16) &&
1968
+ !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
1969
+ // general KQ + KQV multi-batch without FlashAttention
1970
+ ggml_cuda_mul_mat_batched_cublas(ctx, src0, src1, dst);
1971
+ } else if (use_mul_mat_vec) {
1972
+ ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_vec, nullptr);
1973
+ } else if (use_mul_mat_vec_q) {
1974
+ ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, quantize_row_q8_1_cuda);
1975
+ } else if (use_mul_mat_q) {
1976
+ ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_q, quantize_mmq_q8_1_cuda);
1977
+ } else {
1978
+ ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_cublas, nullptr);
1979
+ }
1980
+ }
1981
+
1982
+ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
1983
+ const ggml_tensor * src0 = dst->src[0];
1984
+ const ggml_tensor * src1 = dst->src[1];
1985
+ const ggml_tensor * ids = dst->src[2];
1986
+
1987
+ GGML_ASSERT(src1->type == GGML_TYPE_F32);
1988
+ GGML_ASSERT(dst->type == GGML_TYPE_F32);
1989
+ GGML_ASSERT(!ggml_backend_buft_is_cuda_split(src0->buffer->buft) && "mul_mat_id does not support split buffers");
1990
+
1991
+ GGML_TENSOR_BINARY_OP_LOCALS
1992
+
1993
+ const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
1994
+
1995
+ if (src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
1996
+ if (ne2 == 1) {
1997
+ if (ggml_is_quantized(src0->type)) {
1998
+ ggml_cuda_mul_mat_vec_q(ctx, src0, src1, ids, dst);
1999
+ } else {
2000
+ ggml_cuda_mul_mat_vec(ctx, src0, src1, ids, dst);
2001
+ }
2002
+ return;
2003
+ }
2004
+
2005
+ if (ggml_cuda_should_use_mmq(src0->type, cc, ne12)) {
2006
+ ggml_cuda_mul_mat_q(ctx, src0, src1, ids, dst);
2007
+ return;
2008
+ }
2009
+ }
2010
+
2011
+ cudaStream_t stream = ctx.stream();
2012
+
2013
+ GGML_ASSERT(nb12 % nb11 == 0);
2014
+ GGML_ASSERT(nb2 % nb1 == 0);
2015
+
2016
+ const ggml_type type_src1_sorted = (src0->type == GGML_TYPE_F16 && !fast_fp16_hardware_available(cc))
2017
+ || ggml_is_quantized(src0->type) ? GGML_TYPE_F32 : src0->type;
2018
+ const ggml_type type_dst_sorted = GGML_TYPE_F32;
2019
+ const size_t ts_src1_sorted = ggml_type_size(type_src1_sorted);
2020
+ const size_t ts_dst_sorted = ggml_type_size(type_dst_sorted);
2021
+
2022
+ const int64_t n_expert_used = ids->ne[0];
2023
+ const int64_t ne_get_rows = ne12 * n_expert_used;
2024
+
2025
+ std::vector<int32_t> ids_to_sorted_host;
2026
+ ids_to_sorted_host.reserve(2*ne_get_rows);
2027
+ std::vector<int32_t> ids_from_sorted_host(ne_get_rows);
2028
+
2029
+ ggml_cuda_pool_alloc<int32_t> ids_buf_dev(ctx.pool(), 2*ne_get_rows);
2030
+
2031
+ std::vector<int32_t> tokens_per_expert(ne02);
2032
+
2033
+ ggml_cuda_pool_alloc<char> src1_sorted(ctx.pool(), ne12*n_expert_used*ne10*ts_src1_sorted);
2034
+ ggml_cuda_pool_alloc<char> dst_sorted(ctx.pool(), ne2 *n_expert_used* ne0*ts_dst_sorted);
2035
+
2036
+ std::vector<char> ids_host(ggml_nbytes(ids));
2037
+ CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids->data, ggml_nbytes(ids), cudaMemcpyDeviceToHost, stream));
2038
+ CUDA_CHECK(cudaStreamSynchronize(stream));
2039
+
2040
+ for (int64_t i02 = 0; i02 < ne02; ++i02) { // expert matrices
2041
+ for (int64_t i12 = 0; i12 < ne12; ++i12) { // tokens
2042
+ for (int64_t iex = 0; iex < n_expert_used; ++iex) {
2043
+ const int32_t expert_to_use = *(const int32_t *)(ids_host.data() + i12*ids->nb[1] + iex*ids->nb[0]);
2044
+ assert(expert_to_use >= 0 && expert_to_use < ne02);
2045
+ if (expert_to_use == i02) {
2046
+ ids_from_sorted_host[i12*n_expert_used + iex] = ids_to_sorted_host.size();
2047
+ ids_to_sorted_host.push_back(i12*ne11 + iex % ne11);
2048
+ tokens_per_expert[i02]++;
2049
+ break;
2050
+ }
2051
+ }
2052
+ }
2053
+ }
2054
+ GGML_ASSERT(ids_to_sorted_host.size() == size_t(ne_get_rows));
2055
+
2056
+ ids_to_sorted_host.insert(ids_to_sorted_host.end(), ids_from_sorted_host.begin(), ids_from_sorted_host.end());
2057
+
2058
+ CUDA_CHECK(cudaMemcpyAsync(ids_buf_dev.ptr, ids_to_sorted_host.data(), 2*ne_get_rows*sizeof(int32_t), cudaMemcpyHostToDevice, stream));
2059
+ CUDA_CHECK(cudaStreamSynchronize(stream));
2060
+
2061
+ const int32_t * ids_to_sorted = ids_buf_dev.ptr + 0*ne_get_rows;
2062
+ const int32_t * ids_from_sorted = ids_buf_dev.ptr + 1*ne_get_rows;
2063
+
2064
+ get_rows_cuda(src1->data, src1->type, ids_to_sorted, src1_sorted.ptr, type_src1_sorted,
2065
+ ne10, nb11, nb12, nb13,
2066
+ ne_get_rows, 1, 1, sizeof(int32_t), ne_get_rows*sizeof(int32_t), ne_get_rows*sizeof(int32_t),
2067
+ ne10*ts_src1_sorted, ne_get_rows*ne10*ts_src1_sorted, ne_get_rows*ne10*ts_src1_sorted, stream);
2068
+ CUDA_CHECK(cudaGetLastError());
2069
+
2070
+ char * src1_data_cur = (char *) src1_sorted.ptr;
2071
+ char * dst_data_cur = (char *) dst_sorted.ptr;
2072
+ for (int64_t i02 = 0; i02 < ne02; ++i02) {
2073
+ if (tokens_per_expert[i02] == 0) {
2074
+ continue;
2075
+ }
2076
+
2077
+ ggml_tensor src0_slice = *src0;
2078
+ src0_slice.ne[2] = 1;
2079
+ src0_slice.nb[3] = src0_slice.nb[2];
2080
+ src0_slice.op = GGML_OP_VIEW;
2081
+ src0_slice.view_src = dst->src[0]; // non-const pointer to src0
2082
+ src0_slice.data = (char *) src0->data + i02*nb02;
2083
+
2084
+ ggml_tensor src1_slice;
2085
+ memset(&src1_slice, 0, sizeof(src1_slice));
2086
+ src1_slice.buffer = src1->buffer;
2087
+ src1_slice.type = type_src1_sorted;
2088
+ src1_slice.ne[0] = ne10;
2089
+ src1_slice.ne[1] = tokens_per_expert[i02];
2090
+ src1_slice.ne[2] = 1;
2091
+ src1_slice.ne[3] = 1;
2092
+ src1_slice.nb[0] = ts_src1_sorted;
2093
+ src1_slice.nb[1] = src1_slice.ne[0] * src1_slice.nb[0];
2094
+ src1_slice.nb[2] = src1_slice.ne[1] * src1_slice.nb[1];
2095
+ src1_slice.nb[3] = src1_slice.ne[2] * src1_slice.nb[2];
2096
+ src1_slice.data = src1_data_cur;
2097
+
2098
+ ggml_tensor dst_slice;
2099
+ memset(&dst_slice, 0, sizeof(dst_slice));
2100
+ dst_slice.buffer = dst->buffer;
2101
+ dst_slice.type = type_dst_sorted;
2102
+ dst_slice.ne[0] = ne0;
2103
+ dst_slice.ne[1] = tokens_per_expert[i02];
2104
+ dst_slice.ne[2] = 1;
2105
+ dst_slice.ne[3] = 1;
2106
+ dst_slice.nb[0] = ts_dst_sorted;
2107
+ dst_slice.nb[1] = dst_slice.ne[0] * dst_slice.nb[0];
2108
+ dst_slice.nb[2] = dst_slice.ne[1] * dst_slice.nb[1];
2109
+ dst_slice.nb[3] = dst_slice.ne[2] * dst_slice.nb[2];
2110
+ dst_slice.data = dst_data_cur;
2111
+
2112
+ ggml_cuda_mul_mat(ctx, &src0_slice, &src1_slice, &dst_slice);
2113
+ CUDA_CHECK(cudaGetLastError());
2114
+
2115
+ src1_data_cur += src1_slice.nb[2];
2116
+ dst_data_cur += dst_slice.nb[2];
2117
+ }
2118
+
2119
+ get_rows_cuda(dst_sorted.ptr, type_dst_sorted, ids_from_sorted, dst->data, dst->type,
2120
+ ne0, ne0*ts_dst_sorted, ne_get_rows*ne0*ts_dst_sorted, ne_get_rows*ne0*ts_dst_sorted,
2121
+ ne_get_rows, 1, 1, sizeof(int32_t), ne_get_rows*sizeof(int32_t), ne_get_rows*sizeof(int32_t),
2122
+ nb1, nb2, nb3, stream);
2123
+ }
2124
+
2125
+ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct ggml_tensor * dst) {
2126
+ // why is this here instead of mul_mat?
2127
+ if (dst->src[0] != nullptr && ggml_backend_buft_is_cuda_split(dst->src[0]->buffer->buft)) {
2128
+ ggml_cuda_set_peer_access(dst->src[1]->ne[1], ctx.device);
2129
+ }
2130
+
2131
+ switch (dst->op) {
2132
+ case GGML_OP_ARGMAX:
2133
+ ggml_cuda_argmax(ctx, dst);
2134
+ break;
2135
+ case GGML_OP_COUNT_EQUAL:
2136
+ ggml_cuda_count_equal(ctx, dst);
2137
+ break;
2138
+ case GGML_OP_REPEAT:
2139
+ ggml_cuda_op_repeat(ctx, dst);
2140
+ break;
2141
+ case GGML_OP_REPEAT_BACK:
2142
+ ggml_cuda_op_repeat_back(ctx, dst);
2143
+ break;
2144
+ case GGML_OP_GET_ROWS:
2145
+ ggml_cuda_op_get_rows(ctx, dst);
2146
+ break;
2147
+ case GGML_OP_GET_ROWS_BACK:
2148
+ ggml_cuda_op_get_rows_back(ctx, dst);
2149
+ break;
2150
+ case GGML_OP_DUP:
2151
+ ggml_cuda_dup(ctx, dst);
2152
+ break;
2153
+ case GGML_OP_CPY:
2154
+ ggml_cuda_cpy(ctx, dst->src[0], dst->src[1]);
2155
+ break;
2156
+ case GGML_OP_CONT:
2157
+ ggml_cuda_dup(ctx, dst);
2158
+ break;
2159
+ case GGML_OP_ADD:
2160
+ case GGML_OP_ADD1: // TODO: more efficient implementation
2161
+ ggml_cuda_op_add(ctx, dst);
2162
+ break;
2163
+ case GGML_OP_SUB:
2164
+ ggml_cuda_op_sub(ctx, dst);
2165
+ break;
2166
+ case GGML_OP_ACC:
2167
+ ggml_cuda_op_acc(ctx, dst);
2168
+ break;
2169
+ case GGML_OP_MUL:
2170
+ ggml_cuda_op_mul(ctx, dst);
2171
+ break;
2172
+ case GGML_OP_DIV:
2173
+ ggml_cuda_op_div(ctx, dst);
2174
+ break;
2175
+ case GGML_OP_UNARY:
2176
+ switch (ggml_get_unary_op(dst)) {
2177
+ case GGML_UNARY_OP_ABS:
2178
+ ggml_cuda_op_abs(ctx, dst);
2179
+ break;
2180
+ case GGML_UNARY_OP_SGN:
2181
+ ggml_cuda_op_sgn(ctx, dst);
2182
+ break;
2183
+ case GGML_UNARY_OP_NEG:
2184
+ ggml_cuda_op_neg(ctx, dst);
2185
+ break;
2186
+ case GGML_UNARY_OP_STEP:
2187
+ ggml_cuda_op_step(ctx, dst);
2188
+ break;
2189
+ case GGML_UNARY_OP_GELU:
2190
+ ggml_cuda_op_gelu(ctx, dst);
2191
+ break;
2192
+ case GGML_UNARY_OP_SILU:
2193
+ ggml_cuda_op_silu(ctx, dst);
2194
+ break;
2195
+ case GGML_UNARY_OP_GELU_ERF:
2196
+ ggml_cuda_op_gelu_erf(ctx, dst);
2197
+ break;
2198
+ case GGML_UNARY_OP_GELU_QUICK:
2199
+ ggml_cuda_op_gelu_quick(ctx, dst);
2200
+ break;
2201
+ case GGML_UNARY_OP_TANH:
2202
+ ggml_cuda_op_tanh(ctx, dst);
2203
+ break;
2204
+ case GGML_UNARY_OP_RELU:
2205
+ ggml_cuda_op_relu(ctx, dst);
2206
+ break;
2207
+ case GGML_UNARY_OP_SIGMOID:
2208
+ ggml_cuda_op_sigmoid(ctx, dst);
2209
+ break;
2210
+ case GGML_UNARY_OP_HARDSIGMOID:
2211
+ ggml_cuda_op_hardsigmoid(ctx, dst);
2212
+ break;
2213
+ case GGML_UNARY_OP_HARDSWISH:
2214
+ ggml_cuda_op_hardswish(ctx, dst);
2215
+ break;
2216
+ case GGML_UNARY_OP_EXP:
2217
+ ggml_cuda_op_exp(ctx, dst);
2218
+ break;
2219
+ default:
2220
+ return false;
2221
+ }
2222
+ break;
2223
+ case GGML_OP_NORM:
2224
+ ggml_cuda_op_norm(ctx, dst);
2225
+ break;
2226
+ case GGML_OP_GROUP_NORM:
2227
+ ggml_cuda_op_group_norm(ctx, dst);
2228
+ break;
2229
+ case GGML_OP_L2_NORM:
2230
+ ggml_cuda_op_l2_norm(ctx, dst);
2231
+ break;
2232
+ case GGML_OP_CONCAT:
2233
+ ggml_cuda_op_concat(ctx, dst);
2234
+ break;
2235
+ case GGML_OP_UPSCALE:
2236
+ ggml_cuda_op_upscale(ctx, dst);
2237
+ break;
2238
+ case GGML_OP_PAD:
2239
+ ggml_cuda_op_pad(ctx, dst);
2240
+ break;
2241
+ case GGML_OP_ARANGE:
2242
+ ggml_cuda_op_arange(ctx, dst);
2243
+ break;
2244
+ case GGML_OP_TIMESTEP_EMBEDDING:
2245
+ ggml_cuda_op_timestep_embedding(ctx, dst);
2246
+ break;
2247
+ case GGML_OP_LEAKY_RELU:
2248
+ ggml_cuda_op_leaky_relu(ctx, dst);
2249
+ break;
2250
+ case GGML_OP_SILU_BACK:
2251
+ ggml_cuda_op_silu_back(ctx, dst);
2252
+ break;
2253
+ case GGML_OP_RMS_NORM:
2254
+ ggml_cuda_op_rms_norm(ctx, dst);
2255
+ break;
2256
+ case GGML_OP_RMS_NORM_BACK:
2257
+ ggml_cuda_op_rms_norm_back(ctx, dst);
2258
+ break;
2259
+ case GGML_OP_MUL_MAT:
2260
+ ggml_cuda_mul_mat(ctx, dst->src[0], dst->src[1], dst);
2261
+ break;
2262
+ case GGML_OP_MUL_MAT_ID:
2263
+ ggml_cuda_mul_mat_id(ctx, dst);
2264
+ break;
2265
+ case GGML_OP_OUT_PROD:
2266
+ ggml_cuda_out_prod(ctx, dst);
2267
+ break;
2268
+ case GGML_OP_SCALE:
2269
+ ggml_cuda_op_scale(ctx, dst);
2270
+ break;
2271
+ case GGML_OP_SQR:
2272
+ ggml_cuda_op_sqr(ctx, dst);
2273
+ break;
2274
+ case GGML_OP_SQRT:
2275
+ ggml_cuda_op_sqrt(ctx, dst);
2276
+ break;
2277
+ case GGML_OP_SIN:
2278
+ ggml_cuda_op_sin(ctx, dst);
2279
+ break;
2280
+ case GGML_OP_COS:
2281
+ ggml_cuda_op_cos(ctx, dst);
2282
+ break;
2283
+ case GGML_OP_CLAMP:
2284
+ ggml_cuda_op_clamp(ctx, dst);
2285
+ break;
2286
+ case GGML_OP_LOG:
2287
+ ggml_cuda_op_log(ctx, dst);
2288
+ break;
2289
+ case GGML_OP_NONE:
2290
+ case GGML_OP_RESHAPE:
2291
+ case GGML_OP_VIEW:
2292
+ case GGML_OP_PERMUTE:
2293
+ case GGML_OP_TRANSPOSE:
2294
+ break;
2295
+ case GGML_OP_DIAG_MASK_INF:
2296
+ ggml_cuda_op_diag_mask_inf(ctx, dst);
2297
+ break;
2298
+ case GGML_OP_SOFT_MAX:
2299
+ ggml_cuda_op_soft_max(ctx, dst);
2300
+ break;
2301
+ case GGML_OP_SOFT_MAX_BACK:
2302
+ ggml_cuda_op_soft_max_back(ctx, dst);
2303
+ break;
2304
+ case GGML_OP_ROPE:
2305
+ ggml_cuda_op_rope(ctx, dst);
2306
+ break;
2307
+ case GGML_OP_ROPE_BACK:
2308
+ ggml_cuda_op_rope_back(ctx, dst);
2309
+ break;
2310
+ case GGML_OP_IM2COL:
2311
+ ggml_cuda_op_im2col(ctx, dst);
2312
+ break;
2313
+ case GGML_OP_CONV_TRANSPOSE_1D:
2314
+ ggml_cuda_op_conv_transpose_1d(ctx,dst);
2315
+ break;
2316
+ case GGML_OP_POOL_2D:
2317
+ ggml_cuda_op_pool2d(ctx, dst);
2318
+ break;
2319
+ case GGML_OP_SUM:
2320
+ ggml_cuda_op_sum(ctx, dst);
2321
+ break;
2322
+ case GGML_OP_SUM_ROWS:
2323
+ ggml_cuda_op_sum_rows(ctx, dst);
2324
+ break;
2325
+ case GGML_OP_SSM_CONV:
2326
+ ggml_cuda_op_ssm_conv(ctx, dst);
2327
+ break;
2328
+ case GGML_OP_SSM_SCAN:
2329
+ ggml_cuda_op_ssm_scan(ctx, dst);
2330
+ break;
2331
+ case GGML_OP_ARGSORT:
2332
+ ggml_cuda_op_argsort(ctx, dst);
2333
+ break;
2334
+ case GGML_OP_FLASH_ATTN_EXT:
2335
+ ggml_cuda_flash_attn_ext(ctx, dst);
2336
+ break;
2337
+ case GGML_OP_CROSS_ENTROPY_LOSS:
2338
+ ggml_cuda_cross_entropy_loss(ctx, dst);
2339
+ break;
2340
+ case GGML_OP_RWKV_WKV6:
2341
+ ggml_cuda_op_rwkv_wkv6(ctx, dst);
2342
+ break;
2343
+ case GGML_OP_GATED_LINEAR_ATTN:
2344
+ ggml_cuda_op_gated_linear_attn(ctx, dst);
2345
+ break;
2346
+ case GGML_OP_RWKV_WKV7:
2347
+ ggml_cuda_op_rwkv_wkv7(ctx, dst);
2348
+ break;
2349
+ case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
2350
+ ggml_cuda_cross_entropy_loss_back(ctx, dst);
2351
+ break;
2352
+ case GGML_OP_OPT_STEP_ADAMW:
2353
+ ggml_cuda_opt_step_adamw(ctx, dst);
2354
+ break;
2355
+ default:
2356
+ return false;
2357
+ }
2358
+
2359
+ cudaError_t err = cudaGetLastError();
2360
+ if (err != cudaSuccess) {
2361
+ GGML_LOG_ERROR("%s: %s failed\n", __func__, ggml_op_desc(dst));
2362
+ CUDA_CHECK(err);
2363
+ }
2364
+
2365
+ return true;
2366
+ }
2367
+
2368
+ ////////////////////////////////////////////////////////////////////////////////
2369
+
2370
+ // backend
2371
+
2372
+ static const char * ggml_backend_cuda_get_name(ggml_backend_t backend) {
2373
+ ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
2374
+
2375
+ return cuda_ctx->name.c_str();
2376
+ }
2377
+
2378
+ static void ggml_backend_cuda_free(ggml_backend_t backend) {
2379
+ ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
2380
+
2381
+ delete cuda_ctx;
2382
+ delete backend;
2383
+ }
2384
+
2385
+ static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
2386
+ ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
2387
+ ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
2388
+
2389
+ GGML_ASSERT(buf->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
2390
+
2391
+ CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, cuda_ctx->stream()));
2392
+ }
2393
+
2394
+ static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
2395
+ ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
2396
+ ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
2397
+
2398
+ GGML_ASSERT(buf->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
2399
+
2400
+ CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, cuda_ctx->stream()));
2401
+ }
2402
+
2403
+ static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, const ggml_tensor * src, ggml_tensor * dst) {
2404
+ ggml_backend_buffer_t buf_src = src->view_src ? src->view_src->buffer : src->buffer;
2405
+ ggml_backend_buffer_t buf_dst = dst->view_src ? dst->view_src->buffer : dst->buffer;
2406
+
2407
+ if (!ggml_backend_is_cuda(backend_src) || !ggml_backend_is_cuda(backend_dst)) {
2408
+ return false;
2409
+ }
2410
+
2411
+ if (!ggml_backend_buffer_is_cuda(src->buffer) || !ggml_backend_buffer_is_cuda(dst->buffer)) {
2412
+ return false;
2413
+ }
2414
+
2415
+ // device -> device copy
2416
+ ggml_backend_cuda_context * cuda_ctx_src = (ggml_backend_cuda_context *)backend_src->context;
2417
+ ggml_backend_cuda_context * cuda_ctx_dst = (ggml_backend_cuda_context *)backend_dst->context;
2418
+
2419
+ ggml_backend_cuda_buffer_context * buf_ctx_src = (ggml_backend_cuda_buffer_context *)buf_src->context;
2420
+ ggml_backend_cuda_buffer_context * buf_ctx_dst = (ggml_backend_cuda_buffer_context *)buf_dst->context;
2421
+
2422
+ if (cuda_ctx_src->device != buf_ctx_src->device || cuda_ctx_dst->device != buf_ctx_dst->device) {
2423
+ #ifndef NDEBUG
2424
+ GGML_LOG_DEBUG("%s: backend and buffer devices do not match\n", __func__);
2425
+ #endif
2426
+ return false;
2427
+ }
2428
+
2429
+ if (backend_src != backend_dst) {
2430
+ // copy on src stream
2431
+ if (cuda_ctx_src->device == cuda_ctx_dst->device) {
2432
+ CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_src->stream()));
2433
+ } else {
2434
+ #ifdef GGML_CUDA_NO_PEER_COPY
2435
+ return false;
2436
+ #else
2437
+ CUDA_CHECK(cudaMemcpyPeerAsync(dst->data, cuda_ctx_dst->device, src->data, cuda_ctx_src->device, ggml_nbytes(dst), cuda_ctx_src->stream()));
2438
+ #endif
2439
+ }
2440
+
2441
+ // record event on src stream after the copy
2442
+ if (!cuda_ctx_src->copy_event) {
2443
+ ggml_cuda_set_device(cuda_ctx_src->device);
2444
+ CUDA_CHECK(cudaEventCreateWithFlags(&cuda_ctx_src->copy_event, cudaEventDisableTiming));
2445
+ }
2446
+
2447
+ CUDA_CHECK(cudaEventRecord(cuda_ctx_src->copy_event, cuda_ctx_src->stream()));
2448
+
2449
+ // wait on dst stream for the copy to complete
2450
+ CUDA_CHECK(cudaStreamWaitEvent(cuda_ctx_dst->stream(), cuda_ctx_src->copy_event, 0));
2451
+ } else {
2452
+ // src and dst are on the same backend
2453
+ CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_src->stream()));
2454
+ }
2455
+ return true;
2456
+ }
2457
+
2458
+ static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
2459
+ ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
2460
+
2461
+ CUDA_CHECK(cudaStreamSynchronize(cuda_ctx->stream()));
2462
+
2463
+ GGML_UNUSED(backend);
2464
+ }
2465
+
2466
+ #ifdef USE_CUDA_GRAPH
2467
+ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph,
2468
+ bool use_cuda_graph) {
2469
+
2470
+ // Loop over nodes in GGML graph to obtain info needed for CUDA graph
2471
+ cuda_ctx->cuda_graph->cpy_dest_ptrs.clear();
2472
+
2473
+ for (int i = 0; i < cgraph->n_nodes; i++) {
2474
+ ggml_tensor * node = cgraph->nodes[i];
2475
+
2476
+ if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
2477
+ continue;
2478
+ }
2479
+
2480
+ if (node->src[0] && node->src[0]->buffer && ggml_backend_buft_is_cuda_split(node->src[0]->buffer->buft)) {
2481
+ use_cuda_graph = false; // Split buffers are not supported by CUDA graph capture
2482
+ #ifndef NDEBUG
2483
+ GGML_LOG_DEBUG("%s: disabling CUDA graphs due to split buffer\n", __func__);
2484
+ #endif
2485
+ }
2486
+
2487
+ if (node->op == GGML_OP_MUL_MAT_ID && node->ne[2] != 1) {
2488
+ use_cuda_graph = false; // This node type is not supported by CUDA graph capture
2489
+ #ifndef NDEBUG
2490
+ GGML_LOG_DEBUG("%s: disabling CUDA graphs due to unsupported node type\n", __func__);
2491
+ #endif
2492
+ }
2493
+
2494
+ if (node->op == GGML_OP_ADD && node->src[1] && node->src[1]->ne[1] > 1) {
2495
+ // disable CUDA graphs for batch size > 1 for now.
2496
+ // Changes in batch size or context size can cause changes to the grid size of some kernels.
2497
+ use_cuda_graph = false;
2498
+ #ifndef NDEBUG
2499
+ GGML_LOG_DEBUG("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]);
2500
+ #endif
2501
+ }
2502
+
2503
+ if (node->op == GGML_OP_CPY) {
2504
+
2505
+ // Store the pointers which are updated for each token, such that these can be sent
2506
+ // to the device and accessed using indirection from CUDA graph
2507
+ cuda_ctx->cuda_graph->cpy_dest_ptrs.push_back((char *) node->src[1]->data);
2508
+
2509
+ // store a pointer to each copy op CUDA kernel to identify it later
2510
+ void * ptr = ggml_cuda_cpy_fn(node->src[0], node->src[1]);
2511
+ if (!ptr) {
2512
+ use_cuda_graph = false;
2513
+ #ifndef NDEBUG
2514
+ GGML_LOG_DEBUG("%s: disabling CUDA graphs due to unsupported copy op\n", __func__);
2515
+ #endif
2516
+ }
2517
+ }
2518
+
2519
+ if (!use_cuda_graph) {
2520
+ break;
2521
+ }
2522
+ }
2523
+
2524
+ if (use_cuda_graph) {
2525
+ cuda_ctx->cuda_graph->use_cpy_indirection = true;
2526
+ // copy pointers to GPU so they can be accessed via indirection within CUDA graph
2527
+ ggml_cuda_cpy_dest_ptrs_copy(cuda_ctx->cuda_graph.get(), cuda_ctx->cuda_graph->cpy_dest_ptrs.data(), cuda_ctx->cuda_graph->cpy_dest_ptrs.size(), cuda_ctx->stream());
2528
+ }
2529
+
2530
+ return use_cuda_graph;
2531
+ }
2532
+
2533
+ static void set_ggml_graph_node_properties(ggml_tensor * node, ggml_graph_node_properties * graph_node_properties) {
2534
+ graph_node_properties->node_address = node->data;
2535
+ graph_node_properties->node_op = node->op;
2536
+ for (int i = 0; i < GGML_MAX_DIMS; i++) {
2537
+ graph_node_properties->ne[i] = node->ne[i];
2538
+ graph_node_properties->nb[i] = node->nb[i];
2539
+ }
2540
+ for (int i = 0; i < GGML_MAX_SRC; i++) {
2541
+ graph_node_properties->src_address[i] = node->src[i] ? node->src[i]->data : nullptr;
2542
+ }
2543
+ memcpy(graph_node_properties->op_params, node->op_params, GGML_MAX_OP_PARAMS);
2544
+ }
2545
+
2546
+ static bool ggml_graph_node_has_matching_properties(ggml_tensor * node, ggml_graph_node_properties * graph_node_properties) {
2547
+ if (node->data != graph_node_properties->node_address &&
2548
+ node->op != GGML_OP_CPY &&
2549
+ node->op != GGML_OP_VIEW) {
2550
+ return false;
2551
+ }
2552
+
2553
+ if (node->op != graph_node_properties->node_op) {
2554
+ return false;
2555
+ }
2556
+
2557
+ for (int i = 0; i < GGML_MAX_DIMS; i++) {
2558
+ if (node->ne[i] != graph_node_properties->ne[i]) {
2559
+ return false;
2560
+ }
2561
+ if (node->nb[i] != graph_node_properties->nb[i]) {
2562
+ return false;
2563
+ }
2564
+ }
2565
+
2566
+ for (int i = 0; i < GGML_MAX_SRC; i++) {
2567
+ if (node->src[i] &&
2568
+ node->src[i]->data != graph_node_properties->src_address[i] &&
2569
+ node->op != GGML_OP_CPY &&
2570
+ node->op != GGML_OP_VIEW
2571
+ ) {
2572
+ return false;
2573
+ }
2574
+ }
2575
+
2576
+ if (node->op == GGML_OP_SCALE &&
2577
+ memcmp(graph_node_properties->op_params, node->op_params, GGML_MAX_OP_PARAMS) != 0) {
2578
+ return false;
2579
+ }
2580
+
2581
+ return true;
2582
+ }
2583
+
2584
+ static bool is_cuda_graph_update_required(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph) {
2585
+
2586
+ bool cuda_graph_update_required = false;
2587
+
2588
+ if (cuda_ctx->cuda_graph->instance == nullptr) {
2589
+ cuda_graph_update_required = true;
2590
+ }
2591
+
2592
+ // Check if the graph size has changed
2593
+ if (cuda_ctx->cuda_graph->ggml_graph_properties.size() != (size_t)cgraph->n_nodes) {
2594
+ cuda_graph_update_required = true;
2595
+ cuda_ctx->cuda_graph->ggml_graph_properties.resize(cgraph->n_nodes);
2596
+ }
2597
+
2598
+ // Loop over nodes in GGML graph to determine if CUDA graph update is required
2599
+ // and store properties to allow this comparison for the next token
2600
+ for (int i = 0; i < cgraph->n_nodes; i++) {
2601
+ bool has_matching_properties = true;
2602
+ if (!cuda_graph_update_required) {
2603
+ has_matching_properties = ggml_graph_node_has_matching_properties(cgraph->nodes[i], &cuda_ctx->cuda_graph->ggml_graph_properties[i]);
2604
+ }
2605
+ if (!has_matching_properties) {
2606
+ cuda_graph_update_required = true;
2607
+ }
2608
+ set_ggml_graph_node_properties(cgraph->nodes[i], &cuda_ctx->cuda_graph->ggml_graph_properties[i]);
2609
+ }
2610
+
2611
+ return cuda_graph_update_required;
2612
+ }
2613
+
2614
+ static void update_cuda_graph_executable(ggml_backend_cuda_context * cuda_ctx) {
2615
+
2616
+ #if CUDART_VERSION >= 12000
2617
+ cudaGraphExecUpdateResultInfo result_info;
2618
+ cudaError_t stat = cudaGraphExecUpdate(cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, &result_info);
2619
+ #else
2620
+ cudaGraphNode_t errorNode;
2621
+ cudaGraphExecUpdateResult result_info;
2622
+ cudaError_t stat = cudaGraphExecUpdate(cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, &errorNode, &result_info);
2623
+ #endif // CUDART_VERSION >= 12000
2624
+
2625
+ if (stat == cudaErrorGraphExecUpdateFailure) {
2626
+ #ifndef NDEBUG
2627
+ GGML_LOG_DEBUG("%s: CUDA graph update failed\n", __func__);
2628
+ #endif
2629
+
2630
+ // The pre-existing graph exec cannot be updated due to violated constraints
2631
+ // so instead clear error and re-instantiate
2632
+ (void)cudaGetLastError();
2633
+ CUDA_CHECK(cudaGraphExecDestroy(cuda_ctx->cuda_graph->instance));
2634
+ cuda_ctx->cuda_graph->instance = nullptr;
2635
+ CUDA_CHECK(cudaGraphInstantiate(&cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, NULL, NULL, 0));
2636
+ } else {
2637
+ GGML_ASSERT(stat == cudaSuccess);
2638
+ }
2639
+ }
2640
+ #endif
2641
+
2642
+ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph,
2643
+ bool & graph_evaluated_or_captured, bool & use_cuda_graph, bool & cuda_graph_update_required) {
2644
+
2645
+ while (!graph_evaluated_or_captured) {
2646
+ // Only perform the graph execution if CUDA graphs are not enabled, or we are capturing the graph.
2647
+ // With the use of CUDA graphs, the execution will be performed by the graph launch.
2648
+ if (!use_cuda_graph || cuda_graph_update_required) {
2649
+ for (int i = 0; i < cgraph->n_nodes; i++) {
2650
+ ggml_tensor * node = cgraph->nodes[i];
2651
+
2652
+ if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
2653
+ continue;
2654
+ }
2655
+
2656
+ #ifndef NDEBUG
2657
+ assert(node->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
2658
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
2659
+ if (node->src[j] != nullptr) {
2660
+ assert(node->src[j]->buffer);
2661
+ assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) ||
2662
+ ggml_backend_buft_is_cuda_split(node->src[j]->buffer->buft));
2663
+ }
2664
+ }
2665
+ #endif
2666
+
2667
+ bool ok = ggml_cuda_compute_forward(*cuda_ctx, node);
2668
+ if (!ok) {
2669
+ GGML_LOG_ERROR("%s: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
2670
+ }
2671
+ GGML_ASSERT(ok);
2672
+ }
2673
+ }
2674
+
2675
+ #ifdef USE_CUDA_GRAPH
2676
+ if (use_cuda_graph && cuda_graph_update_required) { // End CUDA graph capture
2677
+ if (cuda_ctx->cuda_graph->graph != nullptr) {
2678
+ CUDA_CHECK(cudaGraphDestroy(cuda_ctx->cuda_graph->graph));
2679
+ cuda_ctx->cuda_graph->graph = nullptr;
2680
+ }
2681
+
2682
+ CUDA_CHECK(cudaStreamEndCapture(cuda_ctx->stream(), &cuda_ctx->cuda_graph->graph));
2683
+ graph_evaluated_or_captured = true; // CUDA graph has been captured
2684
+ } else {
2685
+ graph_evaluated_or_captured = true; // ggml graph has been directly evaluated
2686
+ }
2687
+ }
2688
+
2689
+ if (use_cuda_graph) {
2690
+ if (cuda_ctx->cuda_graph->instance == nullptr) { // Create executable graph from captured graph.
2691
+ CUDA_CHECK(cudaGraphInstantiate(&cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, NULL, NULL, 0));
2692
+ }
2693
+ if (cuda_graph_update_required) { // Update graph executable
2694
+ update_cuda_graph_executable(cuda_ctx);
2695
+ }
2696
+ // Launch graph
2697
+ CUDA_CHECK(cudaGraphLaunch(cuda_ctx->cuda_graph->instance, cuda_ctx->stream()));
2698
+ #else
2699
+ graph_evaluated_or_captured = true;
2700
+ #endif // USE_CUDA_GRAPH
2701
+ }
2702
+ }
2703
+
2704
+ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
2705
+ ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
2706
+
2707
+ ggml_cuda_set_device(cuda_ctx->device);
2708
+
2709
+ #ifdef USE_CUDA_GRAPH
2710
+ static const bool disable_cuda_graphs_due_to_env = (getenv("GGML_CUDA_DISABLE_GRAPHS") != nullptr);
2711
+
2712
+ // Objects required for CUDA Graph
2713
+ if (cuda_ctx->cuda_graph == nullptr) {
2714
+ cuda_ctx->cuda_graph.reset(new ggml_cuda_graph());
2715
+ }
2716
+
2717
+ bool use_cuda_graph = true;
2718
+ bool cuda_graph_update_required = false;
2719
+
2720
+ if (cuda_ctx->cuda_graph->graph == nullptr) {
2721
+ if (ggml_cuda_info().devices[cuda_ctx->device].cc < GGML_CUDA_CC_AMPERE) {
2722
+ cuda_ctx->cuda_graph->disable_due_to_gpu_arch = true;
2723
+ #ifndef NDEBUG
2724
+ GGML_LOG_DEBUG("%s: disabling CUDA graphs due to GPU architecture\n", __func__);
2725
+ #endif
2726
+ }
2727
+ }
2728
+
2729
+ // Disable CUDA graphs in presence of env var, old GPU, use-case which is changing too rapidly,
2730
+ // or previous graph capture failure.
2731
+ // Also disable for multi-gpu for now. TO DO investigate
2732
+ if (disable_cuda_graphs_due_to_env
2733
+ || cuda_ctx->cuda_graph->disable_due_to_gpu_arch
2734
+ || cuda_ctx->cuda_graph->disable_due_to_too_many_updates
2735
+ || cuda_ctx->cuda_graph->disable_due_to_failed_graph_capture) {
2736
+ use_cuda_graph = false;
2737
+ }
2738
+
2739
+ if (use_cuda_graph) {
2740
+ cuda_graph_update_required = is_cuda_graph_update_required(cuda_ctx, cgraph);
2741
+
2742
+ use_cuda_graph = check_node_graph_compatibility_and_refresh_copy_ops(cuda_ctx, cgraph, use_cuda_graph);
2743
+
2744
+ // Disable CUDA graphs (from the next token) if the use-case is demanding too many consecutive graph updates.
2745
+ if (use_cuda_graph && cuda_graph_update_required) {
2746
+ cuda_ctx->cuda_graph->number_consecutive_updates++;
2747
+ } else {
2748
+ cuda_ctx->cuda_graph->number_consecutive_updates = 0;
2749
+ }
2750
+
2751
+ if (cuda_ctx->cuda_graph->number_consecutive_updates >= 4) {
2752
+ cuda_ctx->cuda_graph->disable_due_to_too_many_updates = true;
2753
+ #ifndef NDEBUG
2754
+ GGML_LOG_DEBUG("%s: disabling CUDA graphs due to too many consecutive updates\n", __func__);
2755
+ #endif
2756
+ }
2757
+ }
2758
+
2759
+ if (use_cuda_graph && cuda_graph_update_required) { // Start CUDA graph capture
2760
+ CUDA_CHECK(cudaStreamBeginCapture(cuda_ctx->stream(), cudaStreamCaptureModeRelaxed));
2761
+ }
2762
+
2763
+ if (!use_cuda_graph) {
2764
+ cuda_ctx->cuda_graph->use_cpy_indirection = false;
2765
+ }
2766
+
2767
+ #else
2768
+ bool use_cuda_graph = false;
2769
+ bool cuda_graph_update_required = false;
2770
+ #endif // USE_CUDA_GRAPH
2771
+
2772
+ bool graph_evaluated_or_captured = false;
2773
+
2774
+ evaluate_and_capture_cuda_graph(cuda_ctx, cgraph, graph_evaluated_or_captured, use_cuda_graph, cuda_graph_update_required);
2775
+
2776
+ return GGML_STATUS_SUCCESS;
2777
+ }
2778
+
2779
+ static void ggml_backend_cuda_event_record(ggml_backend_t backend, ggml_backend_event_t event) {
2780
+ ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
2781
+
2782
+ CUDA_CHECK(cudaEventRecord((cudaEvent_t)event->context, cuda_ctx->stream()));
2783
+ }
2784
+
2785
+ static void ggml_backend_cuda_event_wait(ggml_backend_t backend, ggml_backend_event_t event) {
2786
+ ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
2787
+
2788
+ if (ggml_backend_is_cuda(backend)) {
2789
+ CUDA_CHECK(cudaStreamWaitEvent(cuda_ctx->stream(), (cudaEvent_t)event->context, 0));
2790
+ } else {
2791
+ #if 0
2792
+ // untested
2793
+ auto wait_fn = [](void * user_data) {
2794
+ ggml_backend_event_t event = (ggml_backend_event_t)user_data;
2795
+ ggml_backend_event_synchronize(event);
2796
+ };
2797
+
2798
+ CUDA_CHECK(cudaLaunchHostFunc(cuda_ctx->stream(), wait_fn, event));
2799
+ #endif
2800
+ GGML_ABORT("fatal error");
2801
+ }
2802
+ }
2803
+
2804
+ static const ggml_backend_i ggml_backend_cuda_interface = {
2805
+ /* .get_name = */ ggml_backend_cuda_get_name,
2806
+ /* .free = */ ggml_backend_cuda_free,
2807
+ /* .set_tensor_async = */ ggml_backend_cuda_set_tensor_async,
2808
+ /* .get_tensor_async = */ ggml_backend_cuda_get_tensor_async,
2809
+ /* .cpy_tensor_async = */ ggml_backend_cuda_cpy_tensor_async,
2810
+ /* .synchronize = */ ggml_backend_cuda_synchronize,
2811
+ /* .graph_plan_create = */ NULL,
2812
+ /* .graph_plan_free = */ NULL,
2813
+ /* .graph_plan_update = */ NULL,
2814
+ /* .graph_plan_compute = */ NULL,
2815
+ /* .graph_compute = */ ggml_backend_cuda_graph_compute,
2816
+ /* .event_record = */ ggml_backend_cuda_event_record,
2817
+ /* .event_wait = */ ggml_backend_cuda_event_wait,
2818
+ };
2819
+
2820
+ static ggml_guid_t ggml_backend_cuda_guid() {
2821
+ static ggml_guid guid = { 0x2c, 0xdd, 0xe8, 0x1c, 0x65, 0xb3, 0x65, 0x73, 0x6a, 0x12, 0x88, 0x61, 0x1c, 0xc9, 0xdc, 0x25 };
2822
+ return &guid;
2823
+ }
2824
+
2825
+ bool ggml_backend_is_cuda(ggml_backend_t backend) {
2826
+ return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_cuda_guid());
2827
+ }
2828
+
2829
+ int ggml_backend_cuda_get_device_count() {
2830
+ return ggml_cuda_info().device_count;
2831
+ }
2832
+
2833
+ void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size) {
2834
+ cudaDeviceProp prop;
2835
+ CUDA_CHECK(cudaGetDeviceProperties(&prop, device));
2836
+ snprintf(description, description_size, "%s", prop.name);
2837
+ }
2838
+
2839
+ void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total) {
2840
+ ggml_cuda_set_device(device);
2841
+
2842
+ CUDA_CHECK(cudaMemGetInfo(free, total));
2843
+ }
2844
+
2845
+ bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size) {
2846
+ if (getenv("GGML_CUDA_REGISTER_HOST") == nullptr) {
2847
+ return false;
2848
+ }
2849
+
2850
+ #if CUDART_VERSION >= 11010 || defined(GGML_USE_MUSA)
2851
+ cudaError_t err = cudaHostRegister(buffer, size, cudaHostRegisterPortable | cudaHostRegisterReadOnly);
2852
+ if (err != cudaSuccess) {
2853
+ // clear the error
2854
+ (void)cudaGetLastError();
2855
+
2856
+ GGML_LOG_DEBUG("%s: failed to register %.2f MiB of pinned memory: %s\n", __func__,
2857
+ size / 1024.0 / 1024.0, cudaGetErrorString(err));
2858
+ return false;
2859
+ }
2860
+ return true;
2861
+ #else
2862
+ GGML_UNUSED(buffer);
2863
+ GGML_UNUSED(size);
2864
+ return false;
2865
+ #endif // CUDART_VERSION >= 11010 || defined(GGML_USE_MUSA)
2866
+ }
2867
+
2868
+ void ggml_backend_cuda_unregister_host_buffer(void * buffer) {
2869
+ if (getenv("GGML_CUDA_REGISTER_HOST") == nullptr) {
2870
+ return;
2871
+ }
2872
+
2873
+ cudaError_t err = cudaHostUnregister(buffer);
2874
+ if (err != cudaSuccess) {
2875
+ // clear the error
2876
+ (void)cudaGetLastError();
2877
+ }
2878
+ }
2879
+
2880
+
2881
+ // backend device
2882
+
2883
// Per-device state attached to a ggml backend device (stored behind dev->context).
struct ggml_backend_cuda_device_context {
    int         device;       // device index passed to ggml_cuda_set_device / ggml_backend_cuda_init
    std::string name;         // returned by the get_name callback
    std::string description;  // returned by the get_description callback
};
2888
+
2889
+ static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) {
2890
+ ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
2891
+ return ctx->name.c_str();
2892
+ }
2893
+
2894
+ static const char * ggml_backend_cuda_device_get_description(ggml_backend_dev_t dev) {
2895
+ ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
2896
+ return ctx->description.c_str();
2897
+ }
2898
+
2899
+ static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
2900
+ ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
2901
+ ggml_cuda_set_device(ctx->device);
2902
+ CUDA_CHECK(cudaMemGetInfo(free, total));
2903
+ }
2904
+
2905
+ static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend_dev_t dev) {
2906
+ GGML_UNUSED(dev);
2907
+ return GGML_BACKEND_DEVICE_TYPE_GPU;
2908
+ }
2909
+
2910
+ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
2911
+ props->name = ggml_backend_cuda_device_get_name(dev);
2912
+ props->description = ggml_backend_cuda_device_get_description(dev);
2913
+ props->type = ggml_backend_cuda_device_get_type(dev);
2914
+ ggml_backend_cuda_device_get_memory(dev, &props->memory_free, &props->memory_total);
2915
+
2916
+ bool host_buffer = getenv("GGML_CUDA_NO_PINNED") == nullptr;
2917
+ #ifdef GGML_CUDA_NO_PEER_COPY
2918
+ bool events = false;
2919
+ #else
2920
+ bool events = true;
2921
+ #endif
2922
+
2923
+ props->caps = {
2924
+ /* .async = */ true,
2925
+ /* .host_buffer = */ host_buffer,
2926
+ /* .buffer_from_host_ptr = */ false,
2927
+ /* .events = */ events,
2928
+ };
2929
+ }
2930
+
2931
+ static ggml_backend_t ggml_backend_cuda_device_init_backend(ggml_backend_dev_t dev, const char * params) {
2932
+ GGML_UNUSED(params);
2933
+ ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
2934
+ return ggml_backend_cuda_init(ctx->device);
2935
+ }
2936
+
2937
+ static ggml_backend_buffer_type_t ggml_backend_cuda_device_get_buffer_type(ggml_backend_dev_t dev) {
2938
+ ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
2939
+ return ggml_backend_cuda_buffer_type(ctx->device);
2940
+ }
2941
+
2942
+ static ggml_backend_buffer_type_t ggml_backend_cuda_device_get_host_buffer_type(ggml_backend_dev_t dev) {
2943
+ GGML_UNUSED(dev);
2944
+ return ggml_backend_cuda_host_buffer_type();
2945
+ }
2946
+
2947
+ // TODO: move these functions here
2948
+ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
2949
+ ggml_backend_cuda_device_context * dev_ctx = (ggml_backend_cuda_device_context *) dev->context;
2950
+
2951
+ // split buffers can only be used with GGML_OP_MUL_MAT
2952
+ if (op->op != GGML_OP_MUL_MAT) {
2953
+ for (int i = 0; i < GGML_MAX_SRC; i++) {
2954
+ if (op->src[i] && op->src[i]->buffer && ggml_backend_buft_is_cuda_split(op->src[i]->buffer->buft)) {
2955
+ return false;
2956
+ }
2957
+ }
2958
+ }
2959
+
2960
+ // check if all the sources are allocated on this device
2961
+ for (int i = 0; i < GGML_MAX_SRC; i++) {
2962
+ if (op->src[i] && op->src[i]->buffer && ggml_backend_buft_is_cuda(op->src[i]->buffer->buft)) {
2963
+ ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)op->src[i]->buffer->buft->context;
2964
+ if (buft_ctx->device != dev_ctx->device) {
2965
+ return false;
2966
+ }
2967
+ }
2968
+ }
2969
+
2970
+ switch (op->op) {
2971
+ case GGML_OP_UNARY:
2972
+ switch (ggml_get_unary_op(op)) {
2973
+ case GGML_UNARY_OP_ABS:
2974
+ case GGML_UNARY_OP_SGN:
2975
+ case GGML_UNARY_OP_NEG:
2976
+ case GGML_UNARY_OP_STEP:
2977
+ case GGML_UNARY_OP_GELU:
2978
+ case GGML_UNARY_OP_SILU:
2979
+ case GGML_UNARY_OP_RELU:
2980
+ case GGML_UNARY_OP_SIGMOID:
2981
+ case GGML_UNARY_OP_HARDSIGMOID:
2982
+ case GGML_UNARY_OP_HARDSWISH:
2983
+ case GGML_UNARY_OP_GELU_ERF:
2984
+ case GGML_UNARY_OP_GELU_QUICK:
2985
+ case GGML_UNARY_OP_TANH:
2986
+ case GGML_UNARY_OP_EXP:
2987
+ return ggml_is_contiguous(op->src[0]);
2988
+ default:
2989
+ return false;
2990
+ }
2991
+ break;
2992
+ case GGML_OP_MUL_MAT:
2993
+ case GGML_OP_MUL_MAT_ID:
2994
+ {
2995
+ struct ggml_tensor * a = op->src[0];
2996
+ struct ggml_tensor * b = op->src[1];
2997
+ // for small weight matrices the active device can end up without any rows, don't use row split in those cases
2998
+ // this avoids some edge cases (and the performance would not be good anyways)
2999
+ if (a->buffer && ggml_backend_buft_is_cuda_split(a->buffer->buft)) {
3000
+ ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) a->buffer->buft->context;
3001
+ int64_t row_low;
3002
+ int64_t row_high;
3003
+ get_row_split(&row_low, &row_high, a, buft_ctx->tensor_split, dev_ctx->device);
3004
+ if (row_low == row_high) {
3005
+ return false;
3006
+ }
3007
+ }
3008
+ if (b->type == GGML_TYPE_F16 && a->type != GGML_TYPE_F16) {
3009
+ return false;
3010
+ }
3011
+ #ifdef GGML_USE_MUSA
3012
+ if (b->type == GGML_TYPE_F16 && b->ne[2]*b->ne[3] > 1 &&
3013
+ !ggml_is_transposed(a) && !ggml_is_transposed(b)) {
3014
+ return false;
3015
+ }
3016
+ #endif // GGML_USE_MUSA
3017
+ switch (a->type) {
3018
+ case GGML_TYPE_F32:
3019
+ case GGML_TYPE_F16:
3020
+ case GGML_TYPE_Q4_0:
3021
+ case GGML_TYPE_Q4_1:
3022
+ case GGML_TYPE_Q5_0:
3023
+ case GGML_TYPE_Q5_1:
3024
+ case GGML_TYPE_Q8_0:
3025
+ case GGML_TYPE_Q2_K:
3026
+ case GGML_TYPE_Q3_K:
3027
+ case GGML_TYPE_Q4_K:
3028
+ case GGML_TYPE_Q5_K:
3029
+ case GGML_TYPE_Q6_K:
3030
+ case GGML_TYPE_Q8_K:
3031
+ case GGML_TYPE_IQ1_M:
3032
+ case GGML_TYPE_IQ1_S:
3033
+ case GGML_TYPE_IQ2_S:
3034
+ case GGML_TYPE_IQ2_XS:
3035
+ case GGML_TYPE_IQ2_XXS:
3036
+ case GGML_TYPE_IQ3_S:
3037
+ case GGML_TYPE_IQ3_XXS:
3038
+ case GGML_TYPE_IQ4_NL:
3039
+ case GGML_TYPE_IQ4_XS:
3040
+ case GGML_TYPE_BF16:
3041
+ #ifdef GGML_USE_MUSA
3042
+ if (a->type == GGML_TYPE_Q3_K) {
3043
+ return false;
3044
+ }
3045
+ #endif // GGML_USE_MUSA
3046
+ return true;
3047
+ default:
3048
+ return false;
3049
+ }
3050
+ } break;
3051
+ case GGML_OP_OUT_PROD:
3052
+ return op->type == GGML_TYPE_F32 && op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32;
3053
+ case GGML_OP_GET_ROWS:
3054
+ {
3055
+ switch (op->src[0]->type) {
3056
+ case GGML_TYPE_F16:
3057
+ case GGML_TYPE_F32:
3058
+ case GGML_TYPE_Q4_0:
3059
+ case GGML_TYPE_Q4_1:
3060
+ case GGML_TYPE_Q5_0:
3061
+ case GGML_TYPE_Q5_1:
3062
+ case GGML_TYPE_Q8_0:
3063
+ return true;
3064
+ default:
3065
+ return false;
3066
+ }
3067
+ } break;
3068
+ case GGML_OP_GET_ROWS_BACK:
3069
+ {
3070
+ return op->type == GGML_TYPE_F32 && op->src[0]->type == GGML_TYPE_F32 && op->ne[2] == 1 && op->ne[3] == 1;
3071
+ } break;
3072
+ case GGML_OP_CPY:
3073
+ {
3074
+ ggml_type src0_type = op->src[0]->type;
3075
+ ggml_type src1_type = op->src[1]->type;
3076
+ if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) {
3077
+ return true;
3078
+ }
3079
+ if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_BF16) {
3080
+ return true;
3081
+ }
3082
+ if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F16) {
3083
+ return true;
3084
+ }
3085
+ if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q8_0) {
3086
+ return true;
3087
+ }
3088
+ if (src0_type == GGML_TYPE_Q8_0 && src1_type == GGML_TYPE_F32) {
3089
+ return true;
3090
+ }
3091
+ if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q4_0) {
3092
+ return true;
3093
+ }
3094
+ if (src0_type == GGML_TYPE_Q4_0 && src1_type == GGML_TYPE_F32) {
3095
+ return true;
3096
+ }
3097
+ if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q4_1) {
3098
+ return true;
3099
+ }
3100
+ if (src0_type == GGML_TYPE_Q4_1 && src1_type == GGML_TYPE_F32) {
3101
+ return true;
3102
+ }
3103
+ if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q5_0) {
3104
+ return true;
3105
+ }
3106
+ if (src0_type == GGML_TYPE_Q5_0 && src1_type == GGML_TYPE_F32) {
3107
+ return true;
3108
+ }
3109
+ if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q5_1) {
3110
+ return true;
3111
+ }
3112
+ if (src0_type == GGML_TYPE_Q5_1 && src1_type == GGML_TYPE_F32) {
3113
+ return true;
3114
+ }
3115
+ if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_IQ4_NL) {
3116
+ return true;
3117
+ }
3118
+ if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F16) {
3119
+ return true;
3120
+ }
3121
+ if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F32) {
3122
+ return true;
3123
+ }
3124
+ if (src0_type == src1_type && ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1])) {
3125
+ return true;
3126
+ }
3127
+ return false;
3128
+ } break;
3129
+ case GGML_OP_DUP:
3130
+ {
3131
+ ggml_type src0_type = op->src[0]->type;
3132
+ return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16;
3133
+ } break;
3134
+ case GGML_OP_ARGMAX:
3135
+ case GGML_OP_COUNT_EQUAL:
3136
+ {
3137
+ return true;
3138
+ } break;
3139
+ case GGML_OP_REPEAT:
3140
+ {
3141
+ ggml_type src0_type = op->src[0]->type;
3142
+ return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16;
3143
+ } break;
3144
+ case GGML_OP_REPEAT_BACK:
3145
+ return op->type == GGML_TYPE_F32 && (op->src[0]->ne[2]*op->src[0]->ne[3]) <= (1 << 15);
3146
+ case GGML_OP_CONCAT:
3147
+ {
3148
+ ggml_type src0_type = op->src[0]->type;
3149
+ return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16;
3150
+ } break;
3151
+ case GGML_OP_CONV_TRANSPOSE_1D:
3152
+ {
3153
+ ggml_type src0_type = op->src[0]->type;
3154
+ ggml_type src1_type = op->src[1]->type;
3155
+ if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) {
3156
+ return true;
3157
+ }
3158
+ return false;
3159
+ } break;
3160
+ case GGML_OP_SILU_BACK:
3161
+ return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
3162
+ break;
3163
+ case GGML_OP_NORM:
3164
+ case GGML_OP_RMS_NORM:
3165
+ case GGML_OP_L2_NORM:
3166
+ return true;
3167
+ case GGML_OP_RMS_NORM_BACK:
3168
+ return ggml_is_contiguous(op->src[0]) && op->ne[0] % WARP_SIZE == 0;
3169
+ break;
3170
+ case GGML_OP_NONE:
3171
+ case GGML_OP_RESHAPE:
3172
+ case GGML_OP_VIEW:
3173
+ case GGML_OP_PERMUTE:
3174
+ case GGML_OP_TRANSPOSE:
3175
+ case GGML_OP_ADD:
3176
+ case GGML_OP_ADD1:
3177
+ case GGML_OP_SUB:
3178
+ case GGML_OP_MUL:
3179
+ case GGML_OP_DIV:
3180
+ case GGML_OP_SCALE:
3181
+ case GGML_OP_SQR:
3182
+ case GGML_OP_SQRT:
3183
+ case GGML_OP_SIN:
3184
+ case GGML_OP_COS:
3185
+ case GGML_OP_CLAMP:
3186
+ case GGML_OP_LOG:
3187
+ case GGML_OP_SSM_SCAN:
3188
+ case GGML_OP_SSM_CONV:
3189
+ return true;
3190
+ case GGML_OP_CONT:
3191
+ return op->src[0]->type != GGML_TYPE_BF16;
3192
+ case GGML_OP_DIAG_MASK_INF:
3193
+ case GGML_OP_SOFT_MAX:
3194
+ return true;
3195
+ case GGML_OP_SOFT_MAX_BACK: {
3196
+ float max_bias = 0.0f;
3197
+ memcpy(&max_bias, (const float *) op->op_params + 1, sizeof(float));
3198
+ return max_bias == 0.0f;
3199
+ }
3200
+ case GGML_OP_ROPE:
3201
+ case GGML_OP_ROPE_BACK: {
3202
+ return op->src[0]->nb[0] == ggml_type_size(op->src[0]->type) && ggml_is_contiguous_2(op->src[0]);
3203
+ }
3204
+ case GGML_OP_IM2COL:
3205
+ case GGML_OP_POOL_2D:
3206
+ case GGML_OP_SUM:
3207
+ case GGML_OP_SUM_ROWS:
3208
+ case GGML_OP_ARGSORT:
3209
+ case GGML_OP_ACC:
3210
+ return true;
3211
+ case GGML_OP_GROUP_NORM:
3212
+ return ggml_is_contiguous(op->src[0]);
3213
+ case GGML_OP_UPSCALE:
3214
+ return op->src[0]->type == GGML_TYPE_F32 && op->op_params[0] == GGML_SCALE_MODE_NEAREST;
3215
+ case GGML_OP_PAD:
3216
+ case GGML_OP_ARANGE:
3217
+ case GGML_OP_TIMESTEP_EMBEDDING:
3218
+ case GGML_OP_LEAKY_RELU:
3219
+ case GGML_OP_RWKV_WKV6:
3220
+ case GGML_OP_GATED_LINEAR_ATTN:
3221
+ case GGML_OP_RWKV_WKV7:
3222
+ return true;
3223
+ case GGML_OP_FLASH_ATTN_EXT: {
3224
+ #ifndef FLASH_ATTN_AVAILABLE
3225
+ return false;
3226
+ #endif // FLASH_ATTN_AVAILABLE
3227
+ if (op->src[1]->ne[0] != op->src[2]->ne[0]) {
3228
+ const int cc = ggml_cuda_info().devices[dev_ctx->device].cc;
3229
+ if (!new_mma_available(cc)) {
3230
+ return false;
3231
+ }
3232
+ const int gqa_ratio = op->src[0]->ne[2] / op->src[1]->ne[2];
3233
+ return op->src[1]->ne[0] == 576 && op->src[2]->ne[0] == 512 && op->src[3] && gqa_ratio % 16 == 0;
3234
+ }
3235
+ if (op->src[0]->ne[0] == 192) {
3236
+ return false;
3237
+ }
3238
+ if (op->src[0]->ne[3] != 1) {
3239
+ return false;
3240
+ }
3241
+ if (op->src[1]->type == GGML_TYPE_BF16 || op->src[2]->type == GGML_TYPE_BF16) {
3242
+ return false;
3243
+ }
3244
+ if (op->src[0]->ne[0] == 64 && op->src[1]->type == GGML_TYPE_F16) {
3245
+ return true;
3246
+ }
3247
+ if (op->src[0]->ne[0] == 128) {
3248
+ return true;
3249
+ }
3250
+ if (op->src[0]->ne[0] == 256 && op->src[1]->type == GGML_TYPE_F16 && op->src[2]->type == GGML_TYPE_F16) {
3251
+ return true;
3252
+ }
3253
+ return fp16_mma_available(ggml_cuda_info().devices[dev_ctx->device].cc) &&
3254
+ op->src[1]->type == GGML_TYPE_F16 && op->src[2]->type == GGML_TYPE_F16;
3255
+ }
3256
+ case GGML_OP_CROSS_ENTROPY_LOSS:
3257
+ case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
3258
+ case GGML_OP_OPT_STEP_ADAMW:
3259
+ return true;
3260
+ default:
3261
+ return false;
3262
+ }
3263
+ }
3264
+
3265
+ static bool ggml_backend_cuda_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
3266
+ return (ggml_backend_buft_is_cuda(buft) || ggml_backend_buft_is_cuda_split(buft)) && buft->device == dev;
3267
+ }
3268
+
3269
+ static int64_t get_op_batch_size(const ggml_tensor * op) {
3270
+ switch (op->op) {
3271
+ case GGML_OP_GET_ROWS:
3272
+ return 0;
3273
+ case GGML_OP_MUL_MAT:
3274
+ return op->ne[1];
3275
+ case GGML_OP_MUL_MAT_ID:
3276
+ case GGML_OP_ROPE:
3277
+ case GGML_OP_ROPE_BACK:
3278
+ return op->ne[2];
3279
+ default:
3280
+ return ggml_nrows(op);
3281
+ }
3282
+ }
3283
+
3284
+ static bool ggml_backend_cuda_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
3285
+ const int min_batch_size = 32;
3286
+
3287
+ return get_op_batch_size(op) >= min_batch_size;
3288
+
3289
+ GGML_UNUSED(dev);
3290
+ }
3291
+
3292
+ static ggml_backend_event_t ggml_backend_cuda_device_event_new(ggml_backend_dev_t dev) {
3293
+ #ifdef GGML_CUDA_NO_PEER_COPY
3294
+ return nullptr;
3295
+ #else
3296
+ ggml_backend_cuda_device_context * dev_ctx = (ggml_backend_cuda_device_context *)dev->context;
3297
+
3298
+ ggml_cuda_set_device(dev_ctx->device);
3299
+
3300
+ cudaEvent_t event;
3301
+ CUDA_CHECK(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
3302
+
3303
+ return new ggml_backend_event {
3304
+ /* .device = */ dev,
3305
+ /* .context = */ event,
3306
+ };
3307
+ #endif
3308
+ }
3309
+
3310
+ static void ggml_backend_cuda_device_event_free(ggml_backend_dev_t dev, ggml_backend_event_t event) {
3311
+ GGML_UNUSED(dev);
3312
+
3313
+ CUDA_CHECK(cudaEventDestroy((cudaEvent_t)event->context));
3314
+ delete event;
3315
+ }
3316
+
3317
+ static void ggml_backend_cuda_device_event_synchronize(ggml_backend_dev_t dev, ggml_backend_event_t event) {
3318
+ GGML_UNUSED(dev);
3319
+ CUDA_CHECK(cudaEventSynchronize((cudaEvent_t)event->context));
3320
+ }
3321
+
3322
+ static const ggml_backend_device_i ggml_backend_cuda_device_interface = {
3323
+ /* .get_name = */ ggml_backend_cuda_device_get_name,
3324
+ /* .get_description = */ ggml_backend_cuda_device_get_description,
3325
+ /* .get_memory = */ ggml_backend_cuda_device_get_memory,
3326
+ /* .get_type = */ ggml_backend_cuda_device_get_type,
3327
+ /* .get_props = */ ggml_backend_cuda_device_get_props,
3328
+ /* .init_backend = */ ggml_backend_cuda_device_init_backend,
3329
+ /* .get_buffer_type = */ ggml_backend_cuda_device_get_buffer_type,
3330
+ /* .get_host_buffer_type = */ ggml_backend_cuda_device_get_host_buffer_type,
3331
+ /* .buffer_from_host_ptr = */ NULL,
3332
+ /* .supports_op = */ ggml_backend_cuda_device_supports_op,
3333
+ /* .supports_buft = */ ggml_backend_cuda_device_supports_buft,
3334
+ /* .offload_op = */ ggml_backend_cuda_device_offload_op,
3335
+ /* .event_new = */ ggml_backend_cuda_device_event_new,
3336
+ /* .event_free = */ ggml_backend_cuda_device_event_free,
3337
+ /* .event_synchronize = */ ggml_backend_cuda_device_event_synchronize,
3338
+ };
3339
+
3340
+ // backend reg
3341
+
3342
+ struct ggml_backend_cuda_reg_context {
3343
+ std::vector<ggml_backend_dev_t> devices;
3344
+ };
3345
+
3346
+ static const char * ggml_backend_cuda_reg_get_name(ggml_backend_reg_t reg) {
3347
+ GGML_UNUSED(reg);
3348
+ return GGML_CUDA_NAME;
3349
+ }
3350
+
3351
+ static size_t ggml_backend_cuda_reg_get_device_count(ggml_backend_reg_t reg) {
3352
+ ggml_backend_cuda_reg_context * ctx = (ggml_backend_cuda_reg_context *)reg->context;
3353
+ return ctx->devices.size();
3354
+ }
3355
+
3356
+ static ggml_backend_dev_t ggml_backend_cuda_reg_get_device(ggml_backend_reg_t reg, size_t index) {
3357
+ ggml_backend_cuda_reg_context * ctx = (ggml_backend_cuda_reg_context *)reg->context;
3358
+ GGML_ASSERT(index < ctx->devices.size());
3359
+ return ctx->devices[index];
3360
+ }
3361
+
3362
+ static ggml_backend_feature * ggml_backend_cuda_get_features(ggml_backend_reg_t reg) {
3363
+ static std::vector<ggml_backend_feature> features = []() {
3364
+ std::vector<ggml_backend_feature> features;
3365
+ #define _STRINGIFY(...) #__VA_ARGS__
3366
+ #define STRINGIFY(...) _STRINGIFY(__VA_ARGS__)
3367
+
3368
+ #ifdef __CUDA_ARCH_LIST__
3369
+ features.push_back({ "ARCHS", STRINGIFY(__CUDA_ARCH_LIST__) });
3370
+ #endif
3371
+
3372
+ #ifdef GGML_CUDA_FORCE_MMQ
3373
+ features.push_back({ "FORCE_MMQ", "1" });
3374
+ #endif
3375
+
3376
+ #ifdef GGML_CUDA_FORCE_CUBLAS
3377
+ features.push_back({ "FORCE_CUBLAS", "1" });
3378
+ #endif
3379
+
3380
+ #ifndef GGML_USE_VMM
3381
+ features.push_back({ "NO_VMM", "1" });
3382
+ #endif
3383
+
3384
+ #ifdef GGML_CUDA_NO_PEER_COPY
3385
+ features.push_back({ "NO_PEER_COPY", "1" });
3386
+ #endif
3387
+
3388
+ #ifdef GGML_CUDA_F16
3389
+ features.push_back({ "F16", "1" });
3390
+ #endif
3391
+
3392
+ #ifdef GGML_CUDA_USE_GRAPHS
3393
+ features.push_back({ "USE_GRAPHS", "1" });
3394
+ #endif
3395
+
3396
+ #ifdef GGML_CUDA_PEER_MAX_BATCH_SIZE
3397
+ features.push_back({ "PEER_MAX_BATCH_SIZE", STRINGIFY(GGML_CUDA_PEER_MAX_BATCH_SIZE) });
3398
+ #endif
3399
+
3400
+ #ifdef GGML_CUDA_FA_ALL_QUANTS
3401
+ features.push_back({ "FA_ALL_QUANTS", "1" });
3402
+ #endif
3403
+
3404
+ #undef _STRINGIFY
3405
+ #undef STRINGIFY
3406
+
3407
+ features.push_back({ nullptr, nullptr });
3408
+
3409
+ return features;
3410
+ }();
3411
+
3412
+ return features.data();
3413
+
3414
+ GGML_UNUSED(reg);
3415
+ }
3416
+
3417
+ static void * ggml_backend_cuda_reg_get_proc_address(ggml_backend_reg_t reg, const char * name) {
3418
+ GGML_UNUSED(reg);
3419
+ if (strcmp(name, "ggml_backend_split_buffer_type") == 0) {
3420
+ return (void *)ggml_backend_cuda_split_buffer_type;
3421
+ }
3422
+ if (strcmp(name, "ggml_backend_register_host_buffer") == 0) {
3423
+ return (void *)ggml_backend_cuda_register_host_buffer;
3424
+ }
3425
+ if (strcmp(name, "ggml_backend_unregister_host_buffer") == 0) {
3426
+ return (void *)ggml_backend_cuda_unregister_host_buffer;
3427
+ }
3428
+ if (strcmp(name, "ggml_backend_get_features") == 0) {
3429
+ return (void *)ggml_backend_cuda_get_features;
3430
+ }
3431
+ return nullptr;
3432
+ }
3433
+
3434
+ static const ggml_backend_reg_i ggml_backend_cuda_reg_interface = {
3435
+ /* .get_name = */ ggml_backend_cuda_reg_get_name,
3436
+ /* .get_device_count = */ ggml_backend_cuda_reg_get_device_count,
3437
+ /* .get_device = */ ggml_backend_cuda_reg_get_device,
3438
+ /* .get_proc_address = */ ggml_backend_cuda_reg_get_proc_address,
3439
+ };
3440
+
3441
+ // backend registry
3442
+ ggml_backend_reg_t ggml_backend_cuda_reg() {
3443
+ static ggml_backend_reg reg;
3444
+ static bool initialized = false;
3445
+
3446
+ {
3447
+ static std::mutex mutex;
3448
+ std::lock_guard<std::mutex> lock(mutex);
3449
+ if (!initialized) {
3450
+ ggml_backend_cuda_reg_context * ctx = new ggml_backend_cuda_reg_context;
3451
+
3452
+ for (int i = 0; i < ggml_cuda_info().device_count; i++) {
3453
+ ggml_backend_cuda_device_context * dev_ctx = new ggml_backend_cuda_device_context;
3454
+ dev_ctx->device = i;
3455
+ dev_ctx->name = GGML_CUDA_NAME + std::to_string(i);
3456
+
3457
+ ggml_cuda_set_device(i);
3458
+ cudaDeviceProp prop;
3459
+ CUDA_CHECK(cudaGetDeviceProperties(&prop, i));
3460
+ dev_ctx->description = prop.name;
3461
+
3462
+ ggml_backend_dev_t dev = new ggml_backend_device {
3463
+ /* .iface = */ ggml_backend_cuda_device_interface,
3464
+ /* .reg = */ &reg,
3465
+ /* .context = */ dev_ctx
3466
+ };
3467
+ ctx->devices.push_back(dev);
3468
+ }
3469
+
3470
+ reg = ggml_backend_reg {
3471
+ /* .api_version = */ GGML_BACKEND_API_VERSION,
3472
+ /* .iface = */ ggml_backend_cuda_reg_interface,
3473
+ /* .context = */ ctx
3474
+ };
3475
+ }
3476
+
3477
+ initialized = true;
3478
+ }
3479
+
3480
+ return &reg;
3481
+ }
3482
+
3483
+ ggml_backend_t ggml_backend_cuda_init(int device) {
3484
+ if (device < 0 || device >= ggml_backend_cuda_get_device_count()) {
3485
+ GGML_LOG_ERROR("%s: invalid device %d\n", __func__, device);
3486
+ return nullptr;
3487
+ }
3488
+
3489
+ ggml_backend_cuda_context * ctx = new ggml_backend_cuda_context(device);
3490
+ if (ctx == nullptr) {
3491
+ GGML_LOG_ERROR("%s: failed to allocate context\n", __func__);
3492
+ return nullptr;
3493
+ }
3494
+
3495
+ ggml_backend_t cuda_backend = new ggml_backend {
3496
+ /* .guid = */ ggml_backend_cuda_guid(),
3497
+ /* .interface = */ ggml_backend_cuda_interface,
3498
+ /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cuda_reg(), device),
3499
+ /* .context = */ ctx,
3500
+ };
3501
+
3502
+ return cuda_backend;
3503
+ }
3504
+
3505
// NOTE(review): presumably emits the entry point used when this backend is
// built as a dynamically loadable module, wired to ggml_backend_cuda_reg;
// confirm against the definition of GGML_BACKEND_DL_IMPL.
+ GGML_BACKEND_DL_IMPL(ggml_backend_cuda_reg)