llama_cpp 0.16.2 → 0.17.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (177) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +18 -0
  3. data/README.md +7 -12
  4. data/ext/llama_cpp/extconf.rb +2 -43
  5. data/ext/llama_cpp/llama_cpp.cpp +8 -0
  6. data/lib/llama_cpp/version.rb +3 -3
  7. data/sig/llama_cpp.rbs +3 -0
  8. metadata +2 -171
  9. data/vendor/include/.gitkeep +0 -0
  10. data/vendor/lib/.gitkeep +0 -0
  11. data/vendor/tmp/llama.cpp/LICENSE +0 -21
  12. data/vendor/tmp/llama.cpp/Makefile +0 -1124
  13. data/vendor/tmp/llama.cpp/ggml-alloc.c +0 -1041
  14. data/vendor/tmp/llama.cpp/ggml-alloc.h +0 -76
  15. data/vendor/tmp/llama.cpp/ggml-backend-impl.h +0 -153
  16. data/vendor/tmp/llama.cpp/ggml-backend.c +0 -2225
  17. data/vendor/tmp/llama.cpp/ggml-backend.h +0 -236
  18. data/vendor/tmp/llama.cpp/ggml-blas.cpp +0 -363
  19. data/vendor/tmp/llama.cpp/ggml-blas.h +0 -23
  20. data/vendor/tmp/llama.cpp/ggml-common.h +0 -1805
  21. data/vendor/tmp/llama.cpp/ggml-cuda/acc.cu +0 -47
  22. data/vendor/tmp/llama.cpp/ggml-cuda/arange.cu +0 -34
  23. data/vendor/tmp/llama.cpp/ggml-cuda/argsort.cu +0 -104
  24. data/vendor/tmp/llama.cpp/ggml-cuda/binbcast.cu +0 -280
  25. data/vendor/tmp/llama.cpp/ggml-cuda/clamp.cu +0 -34
  26. data/vendor/tmp/llama.cpp/ggml-cuda/concat.cu +0 -196
  27. data/vendor/tmp/llama.cpp/ggml-cuda/convert.cu +0 -686
  28. data/vendor/tmp/llama.cpp/ggml-cuda/cpy.cu +0 -490
  29. data/vendor/tmp/llama.cpp/ggml-cuda/diagmask.cu +0 -40
  30. data/vendor/tmp/llama.cpp/ggml-cuda/dmmv.cu +0 -674
  31. data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f16.cu +0 -319
  32. data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f32.cu +0 -312
  33. data/vendor/tmp/llama.cpp/ggml-cuda/fattn.cu +0 -345
  34. data/vendor/tmp/llama.cpp/ggml-cuda/getrows.cu +0 -178
  35. data/vendor/tmp/llama.cpp/ggml-cuda/im2col.cu +0 -104
  36. data/vendor/tmp/llama.cpp/ggml-cuda/mmq.cu +0 -88
  37. data/vendor/tmp/llama.cpp/ggml-cuda/mmvq.cu +0 -419
  38. data/vendor/tmp/llama.cpp/ggml-cuda/norm.cu +0 -221
  39. data/vendor/tmp/llama.cpp/ggml-cuda/pad.cu +0 -49
  40. data/vendor/tmp/llama.cpp/ggml-cuda/pool2d.cu +0 -94
  41. data/vendor/tmp/llama.cpp/ggml-cuda/quantize.cu +0 -112
  42. data/vendor/tmp/llama.cpp/ggml-cuda/rope.cu +0 -271
  43. data/vendor/tmp/llama.cpp/ggml-cuda/scale.cu +0 -31
  44. data/vendor/tmp/llama.cpp/ggml-cuda/softmax.cu +0 -206
  45. data/vendor/tmp/llama.cpp/ggml-cuda/sumrows.cu +0 -40
  46. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +0 -5
  47. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +0 -5
  48. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +0 -5
  49. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +0 -5
  50. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +0 -5
  51. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +0 -5
  52. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +0 -5
  53. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +0 -5
  54. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +0 -5
  55. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +0 -5
  56. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +0 -5
  57. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +0 -5
  58. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +0 -5
  59. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +0 -5
  60. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +0 -5
  61. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +0 -5
  62. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +0 -5
  63. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +0 -5
  64. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +0 -5
  65. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +0 -5
  66. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +0 -5
  67. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +0 -5
  68. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +0 -5
  69. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +0 -5
  70. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +0 -5
  71. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +0 -5
  72. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +0 -5
  73. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +0 -5
  74. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +0 -5
  75. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +0 -5
  76. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +0 -5
  77. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +0 -5
  78. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +0 -5
  79. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +0 -5
  80. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +0 -5
  81. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +0 -5
  82. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +0 -5
  83. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +0 -5
  84. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +0 -5
  85. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +0 -5
  86. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +0 -5
  87. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +0 -5
  88. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +0 -5
  89. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +0 -5
  90. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +0 -5
  91. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +0 -5
  92. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +0 -5
  93. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +0 -5
  94. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +0 -5
  95. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +0 -5
  96. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +0 -5
  97. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +0 -5
  98. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +0 -5
  99. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +0 -5
  100. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +0 -5
  101. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +0 -5
  102. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +0 -5
  103. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +0 -5
  104. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +0 -5
  105. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +0 -5
  106. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +0 -5
  107. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +0 -5
  108. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +0 -5
  109. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +0 -5
  110. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +0 -5
  111. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +0 -5
  112. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +0 -5
  113. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +0 -5
  114. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +0 -5
  115. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +0 -5
  116. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +0 -5
  117. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +0 -5
  118. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +0 -5
  119. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +0 -5
  120. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +0 -5
  121. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +0 -5
  122. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +0 -5
  123. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +0 -5
  124. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +0 -5
  125. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +0 -5
  126. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +0 -5
  127. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +0 -5
  128. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +0 -5
  129. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +0 -5
  130. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +0 -5
  131. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +0 -5
  132. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu +0 -10
  133. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu +0 -9
  134. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu +0 -10
  135. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu +0 -10
  136. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu +0 -8
  137. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q2_k.cu +0 -5
  138. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q3_k.cu +0 -5
  139. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_0.cu +0 -5
  140. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_1.cu +0 -5
  141. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_k.cu +0 -5
  142. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_0.cu +0 -5
  143. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_1.cu +0 -5
  144. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_k.cu +0 -5
  145. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q6_k.cu +0 -5
  146. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q8_0.cu +0 -5
  147. data/vendor/tmp/llama.cpp/ggml-cuda/tsembd.cu +0 -47
  148. data/vendor/tmp/llama.cpp/ggml-cuda/unary.cu +0 -314
  149. data/vendor/tmp/llama.cpp/ggml-cuda/upscale.cu +0 -51
  150. data/vendor/tmp/llama.cpp/ggml-cuda.cu +0 -3069
  151. data/vendor/tmp/llama.cpp/ggml-cuda.h +0 -44
  152. data/vendor/tmp/llama.cpp/ggml-impl.h +0 -651
  153. data/vendor/tmp/llama.cpp/ggml-kompute.cpp +0 -2038
  154. data/vendor/tmp/llama.cpp/ggml-kompute.h +0 -46
  155. data/vendor/tmp/llama.cpp/ggml-metal.h +0 -66
  156. data/vendor/tmp/llama.cpp/ggml-metal.m +0 -3273
  157. data/vendor/tmp/llama.cpp/ggml-metal.metal +0 -6540
  158. data/vendor/tmp/llama.cpp/ggml-quants.c +0 -14994
  159. data/vendor/tmp/llama.cpp/ggml-quants.h +0 -133
  160. data/vendor/tmp/llama.cpp/ggml-rpc.cpp +0 -1178
  161. data/vendor/tmp/llama.cpp/ggml-rpc.h +0 -24
  162. data/vendor/tmp/llama.cpp/ggml-sycl.cpp +0 -6351
  163. data/vendor/tmp/llama.cpp/ggml-sycl.h +0 -40
  164. data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +0 -144508
  165. data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +0 -7183
  166. data/vendor/tmp/llama.cpp/ggml-vulkan.h +0 -29
  167. data/vendor/tmp/llama.cpp/ggml.c +0 -22506
  168. data/vendor/tmp/llama.cpp/ggml.h +0 -2458
  169. data/vendor/tmp/llama.cpp/llama.cpp +0 -18985
  170. data/vendor/tmp/llama.cpp/llama.h +0 -1147
  171. data/vendor/tmp/llama.cpp/scripts/get-flags.mk +0 -38
  172. data/vendor/tmp/llama.cpp/sgemm.cpp +0 -1032
  173. data/vendor/tmp/llama.cpp/sgemm.h +0 -14
  174. data/vendor/tmp/llama.cpp/unicode-data.cpp +0 -7033
  175. data/vendor/tmp/llama.cpp/unicode-data.h +0 -20
  176. data/vendor/tmp/llama.cpp/unicode.cpp +0 -810
  177. data/vendor/tmp/llama.cpp/unicode.h +0 -63
@@ -1,3069 +0,0 @@
1
- #include "ggml-cuda.h"
2
- #include "ggml.h"
3
- #include "ggml-backend-impl.h"
4
-
5
- #include "ggml-cuda/common.cuh"
6
- #include "ggml-cuda/acc.cuh"
7
- #include "ggml-cuda/arange.cuh"
8
- #include "ggml-cuda/argsort.cuh"
9
- #include "ggml-cuda/binbcast.cuh"
10
- #include "ggml-cuda/clamp.cuh"
11
- #include "ggml-cuda/concat.cuh"
12
- #include "ggml-cuda/convert.cuh"
13
- #include "ggml-cuda/cpy.cuh"
14
- #include "ggml-cuda/diagmask.cuh"
15
- #include "ggml-cuda/dmmv.cuh"
16
- #include "ggml-cuda/fattn.cuh"
17
- #include "ggml-cuda/getrows.cuh"
18
- #include "ggml-cuda/im2col.cuh"
19
- #include "ggml-cuda/mmq.cuh"
20
- #include "ggml-cuda/mmvq.cuh"
21
- #include "ggml-cuda/norm.cuh"
22
- #include "ggml-cuda/pad.cuh"
23
- #include "ggml-cuda/pool2d.cuh"
24
- #include "ggml-cuda/quantize.cuh"
25
- #include "ggml-cuda/rope.cuh"
26
- #include "ggml-cuda/scale.cuh"
27
- #include "ggml-cuda/softmax.cuh"
28
- #include "ggml-cuda/sumrows.cuh"
29
- #include "ggml-cuda/tsembd.cuh"
30
- #include "ggml-cuda/unary.cuh"
31
- #include "ggml-cuda/upscale.cuh"
32
-
33
- #include <algorithm>
34
- #include <array>
35
- #include <atomic>
36
- #include <cinttypes>
37
- #include <cstddef>
38
- #include <cstdint>
39
- #include <float.h>
40
- #include <limits>
41
- #include <map>
42
- #include <memory>
43
- #include <mutex>
44
- #include <stdint.h>
45
- #include <stdio.h>
46
- #include <stdarg.h>
47
- #include <stdlib.h>
48
- #include <string>
49
- #include <vector>
50
-
51
- static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
52
-
53
- static void ggml_cuda_default_log_callback(enum ggml_log_level level, const char * msg, void * user_data) {
54
- GGML_UNUSED(level);
55
- GGML_UNUSED(user_data);
56
- fprintf(stderr, "%s", msg);
57
- }
58
-
59
- ggml_log_callback ggml_cuda_log_callback = ggml_cuda_default_log_callback;
60
- void * ggml_cuda_log_user_data = NULL;
61
-
62
- GGML_API void ggml_backend_cuda_log_set_callback(ggml_log_callback log_callback, void * user_data) {
63
- ggml_cuda_log_callback = log_callback;
64
- ggml_cuda_log_user_data = user_data;
65
- }
66
-
67
- #define GGML_CUDA_LOG_INFO(...) ggml_cuda_log(GGML_LOG_LEVEL_INFO, __VA_ARGS__)
68
- #define GGML_CUDA_LOG_WARN(...) ggml_cuda_log(GGML_LOG_LEVEL_WARN, __VA_ARGS__)
69
- #define GGML_CUDA_LOG_ERROR(...) ggml_cuda_log(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
70
-
71
- GGML_ATTRIBUTE_FORMAT(2, 3)
72
- static void ggml_cuda_log(enum ggml_log_level level, const char * format, ...) {
73
- if (ggml_cuda_log_callback != NULL) {
74
- va_list args;
75
- va_start(args, format);
76
- char buffer[128];
77
- int len = vsnprintf(buffer, 128, format, args);
78
- if (len < 128) {
79
- ggml_cuda_log_callback(level, buffer, ggml_cuda_log_user_data);
80
- } else {
81
- std::vector<char> buffer2(len + 1); // vsnprintf adds a null terminator
82
- va_end(args);
83
- va_start(args, format);
84
- vsnprintf(&buffer2[0], buffer2.size(), format, args);
85
- ggml_cuda_log_callback(level, buffer2.data(), ggml_cuda_log_user_data);
86
- }
87
- va_end(args);
88
- }
89
- }
90
-
91
- [[noreturn]]
92
- void ggml_cuda_error(const char * stmt, const char * func, const char * file, int line, const char * msg) {
93
- int id = -1; // in case cudaGetDevice fails
94
- cudaGetDevice(&id);
95
-
96
- GGML_CUDA_LOG_ERROR("CUDA error: %s\n", msg);
97
- GGML_CUDA_LOG_ERROR(" current device: %d, in function %s at %s:%d\n", id, func, file, line);
98
- GGML_CUDA_LOG_ERROR(" %s\n", stmt);
99
- // abort with GGML_ASSERT to get a stack trace
100
- GGML_ASSERT(!"CUDA error");
101
- }
102
-
103
- // this is faster on Windows
104
- // probably because the Windows CUDA libraries forget to make this check before invoking the drivers
105
- void ggml_cuda_set_device(int device) {
106
- int current_device;
107
- CUDA_CHECK(cudaGetDevice(&current_device));
108
-
109
- if (device == current_device) {
110
- return;
111
- }
112
-
113
- CUDA_CHECK(cudaSetDevice(device));
114
- }
115
-
116
- int ggml_cuda_get_device() {
117
- int id;
118
- CUDA_CHECK(cudaGetDevice(&id));
119
- return id;
120
- }
121
-
122
- static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device) {
123
- ggml_cuda_set_device(device);
124
- #if defined(GGML_USE_HIPBLAS) && defined(GGML_HIP_UMA)
125
- auto res = hipMallocManaged(ptr, size);
126
- if (res == hipSuccess) {
127
- // if error we "need" to know why...
128
- CUDA_CHECK(hipMemAdvise(*ptr, size, hipMemAdviseSetCoarseGrain, device));
129
- }
130
- return res;
131
- #else
132
- return cudaMalloc(ptr, size);
133
- #endif
134
- }
135
-
136
- static ggml_cuda_device_info ggml_cuda_init() {
137
- #ifdef __HIP_PLATFORM_AMD__
138
- // Workaround for a rocBLAS bug when using multiple graphics cards:
139
- // https://github.com/ROCmSoftwarePlatform/rocBLAS/issues/1346
140
- rocblas_initialize();
141
- CUDA_CHECK(cudaDeviceSynchronize());
142
- #endif
143
-
144
- ggml_cuda_device_info info = {};
145
-
146
- cudaError_t err = cudaGetDeviceCount(&info.device_count);
147
- if (err != cudaSuccess) {
148
- GGML_CUDA_LOG_ERROR("%s: failed to initialize " GGML_CUDA_NAME ": %s\n", __func__, cudaGetErrorString(err));
149
- return info;
150
- }
151
-
152
- GGML_ASSERT(info.device_count <= GGML_CUDA_MAX_DEVICES);
153
-
154
- int64_t total_vram = 0;
155
- #if defined(GGML_CUDA_FORCE_MMQ)
156
- GGML_CUDA_LOG_INFO("%s: GGML_CUDA_FORCE_MMQ: yes\n", __func__);
157
- #else
158
- GGML_CUDA_LOG_INFO("%s: GGML_CUDA_FORCE_MMQ: no\n", __func__);
159
- #endif
160
- #if defined(CUDA_USE_TENSOR_CORES)
161
- GGML_CUDA_LOG_INFO("%s: CUDA_USE_TENSOR_CORES: yes\n", __func__);
162
- #else
163
- GGML_CUDA_LOG_INFO("%s: CUDA_USE_TENSOR_CORES: no\n", __func__);
164
- #endif
165
- GGML_CUDA_LOG_INFO("%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, info.device_count);
166
- for (int id = 0; id < info.device_count; ++id) {
167
- int device_vmm = 0;
168
-
169
- #if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM)
170
- CUdevice device;
171
- CU_CHECK(cuDeviceGet(&device, id));
172
- CU_CHECK(cuDeviceGetAttribute(&device_vmm, CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED, device));
173
-
174
- if (device_vmm) {
175
- CUmemAllocationProp alloc_prop = {};
176
- alloc_prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
177
- alloc_prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
178
- alloc_prop.location.id = id;
179
- CU_CHECK(cuMemGetAllocationGranularity(&info.devices[id].vmm_granularity, &alloc_prop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED));
180
- }
181
- #endif // !defined(GGML_USE_HIPBLAS)
182
- info.devices[id].vmm = !!device_vmm;
183
-
184
- cudaDeviceProp prop;
185
- CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
186
- GGML_CUDA_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s\n", id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");
187
-
188
- info.default_tensor_split[id] = total_vram;
189
- total_vram += prop.totalGlobalMem;
190
-
191
- info.devices[id].nsm = prop.multiProcessorCount;
192
- info.devices[id].smpb = prop.sharedMemPerBlock;
193
- #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
194
- info.devices[id].smpbo = prop.sharedMemPerBlock;
195
- info.devices[id].cc = 100*prop.major + 10*prop.minor + CC_OFFSET_AMD;
196
- #else
197
- info.devices[id].smpbo = prop.sharedMemPerBlockOptin;
198
- info.devices[id].cc = 100*prop.major + 10*prop.minor;
199
- #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
200
- }
201
-
202
- for (int id = 0; id < info.device_count; ++id) {
203
- info.default_tensor_split[id] /= total_vram;
204
- }
205
-
206
- // configure logging to stdout
207
- // CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, nullptr));
208
-
209
- return info;
210
- }
211
-
212
- const ggml_cuda_device_info & ggml_cuda_info() {
213
- static ggml_cuda_device_info info = ggml_cuda_init();
214
- return info;
215
- }
216
-
217
- // #define DEBUG_CUDA_MALLOC
218
-
219
- // buffer pool for cuda (legacy)
220
- struct ggml_cuda_pool_leg : public ggml_cuda_pool {
221
- static const int MAX_BUFFERS = 256;
222
-
223
- int device;
224
- struct ggml_cuda_buffer {
225
- void * ptr = nullptr;
226
- size_t size = 0;
227
- };
228
-
229
- ggml_cuda_buffer buffer_pool[MAX_BUFFERS] = {};
230
- size_t pool_size = 0;
231
-
232
- explicit ggml_cuda_pool_leg(int device) :
233
- device(device) {
234
- }
235
-
236
- ~ggml_cuda_pool_leg() {
237
- ggml_cuda_set_device(device);
238
- for (int i = 0; i < MAX_BUFFERS; ++i) {
239
- ggml_cuda_buffer & b = buffer_pool[i];
240
- if (b.ptr != nullptr) {
241
- CUDA_CHECK(cudaFree(b.ptr));
242
- pool_size -= b.size;
243
- }
244
- }
245
- GGML_ASSERT(pool_size == 0);
246
- }
247
-
248
- void * alloc(size_t size, size_t * actual_size) override {
249
- #ifdef DEBUG_CUDA_MALLOC
250
- int nnz = 0;
251
- size_t max_size = 0;
252
- #endif
253
- size_t best_diff = 1ull << 36;
254
- int ibest = -1;
255
- for (int i = 0; i < MAX_BUFFERS; ++i) {
256
- ggml_cuda_buffer& b = buffer_pool[i];
257
- if (b.ptr != nullptr) {
258
- #ifdef DEBUG_CUDA_MALLOC
259
- ++nnz;
260
- if (b.size > max_size) max_size = b.size;
261
- #endif
262
- if (b.size >= size) {
263
- size_t diff = b.size - size;
264
- if (diff < best_diff) {
265
- best_diff = diff;
266
- ibest = i;
267
- if (!best_diff) {
268
- void * ptr = b.ptr;
269
- *actual_size = b.size;
270
- b.ptr = nullptr;
271
- b.size = 0;
272
- return ptr;
273
- }
274
- }
275
- }
276
- }
277
- }
278
- if (ibest >= 0) {
279
- ggml_cuda_buffer& b = buffer_pool[ibest];
280
- void * ptr = b.ptr;
281
- *actual_size = b.size;
282
- b.ptr = nullptr;
283
- b.size = 0;
284
- return ptr;
285
- }
286
- void * ptr;
287
- size_t look_ahead_size = (size_t) (1.05 * size);
288
- look_ahead_size = 256 * ((look_ahead_size + 255)/256);
289
- ggml_cuda_set_device(device);
290
- CUDA_CHECK(ggml_cuda_device_malloc(&ptr, look_ahead_size, device));
291
- *actual_size = look_ahead_size;
292
- pool_size += look_ahead_size;
293
- #ifdef DEBUG_CUDA_MALLOC
294
- GGML_CUDA_LOG_INFO("%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, requested %u MB\n", __func__, device, nnz,
295
- (uint32_t)(max_size / 1024 / 1024), (uint32_t)(pool_size / 1024 / 1024), (uint32_t)(size / 1024 / 1024));
296
- #endif
297
- return ptr;
298
- }
299
-
300
- void free(void * ptr, size_t size) override {
301
- for (int i = 0; i < MAX_BUFFERS; ++i) {
302
- ggml_cuda_buffer& b = buffer_pool[i];
303
- if (b.ptr == nullptr) {
304
- b.ptr = ptr;
305
- b.size = size;
306
- return;
307
- }
308
- }
309
- GGML_CUDA_LOG_WARN("Cuda buffer pool full, increase MAX_CUDA_BUFFERS\n");
310
- ggml_cuda_set_device(device);
311
- CUDA_CHECK(cudaFree(ptr));
312
- pool_size -= size;
313
- }
314
- };
315
-
316
- // pool with virtual memory
317
- #if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM)
318
- struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
319
- static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 35; // 32 GB
320
-
321
- int device;
322
- CUdeviceptr pool_addr = 0;
323
- size_t pool_used = 0;
324
- size_t pool_size = 0;
325
- size_t granularity;
326
-
327
- explicit ggml_cuda_pool_vmm(int device) :
328
- device(device),
329
- granularity(ggml_cuda_info().devices[device].vmm_granularity) {
330
- }
331
-
332
- ~ggml_cuda_pool_vmm() {
333
- if (pool_addr != 0) {
334
- CU_CHECK(cuMemUnmap(pool_addr, pool_size));
335
- CU_CHECK(cuMemAddressFree(pool_addr, CUDA_POOL_VMM_MAX_SIZE));
336
- }
337
- }
338
-
339
- void * alloc(size_t size, size_t * actual_size) override {
340
- // round up the allocation size to the alignment to ensure that all allocations are aligned for all data types
341
- const size_t alignment = 128;
342
- size = alignment * ((size + alignment - 1) / alignment);
343
-
344
- size_t avail = pool_size - pool_used;
345
-
346
- if (size > avail) {
347
- // round up to the next multiple of the granularity
348
- size_t reserve_size = size - avail;
349
- reserve_size = granularity * ((reserve_size + granularity - 1) / granularity);
350
-
351
- GGML_ASSERT(pool_size + reserve_size <= CUDA_POOL_VMM_MAX_SIZE);
352
-
353
- // allocate more physical memory
354
- CUmemAllocationProp prop = {};
355
- prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
356
- prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
357
- prop.location.id = device;
358
- CUmemGenericAllocationHandle handle;
359
- CU_CHECK(cuMemCreate(&handle, reserve_size, &prop, 0));
360
-
361
- // reserve virtual address space (if not already reserved)
362
- if (pool_addr == 0) {
363
- CU_CHECK(cuMemAddressReserve(&pool_addr, CUDA_POOL_VMM_MAX_SIZE, 0, 0, 0));
364
- }
365
-
366
- // map at the end of the pool
367
- CU_CHECK(cuMemMap(pool_addr + pool_size, reserve_size, 0, handle, 0));
368
-
369
- // the memory allocation handle is no longer needed after mapping
370
- CU_CHECK(cuMemRelease(handle));
371
-
372
- // set access
373
- CUmemAccessDesc access = {};
374
- access.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
375
- access.location.id = device;
376
- access.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
377
- CU_CHECK(cuMemSetAccess(pool_addr + pool_size, reserve_size, &access, 1));
378
-
379
- // add to the pool
380
- pool_size += reserve_size;
381
-
382
- //printf("cuda pool[%d]: size increased to %llu MB (reserved %llu MB)\n",
383
- // device, (unsigned long long) (pool_size/1024/1024),
384
- // (unsigned long long) (reserve_size/1024/1024));
385
- }
386
-
387
- GGML_ASSERT(pool_addr != 0);
388
-
389
- void * ptr = (void *) (pool_addr + pool_used);
390
- *actual_size = size;
391
- pool_used += size;
392
-
393
- #ifdef DEBUG_CUDA_MALLOC
394
- printf("cuda pool[%d]: allocated %llu bytes at %llx\n", device, (unsigned long long) size, ptr);
395
- #endif
396
-
397
- return ptr;
398
- }
399
-
400
- void free(void * ptr, size_t size) override {
401
- #ifdef DEBUG_CUDA_MALLOC
402
- printf("cuda pool[%d]: freed %llu bytes at %llx\n", device, (unsigned long long) size, ptr);
403
- #endif
404
-
405
- pool_used -= size;
406
-
407
- // all deallocations must be in reverse order of the allocations
408
- GGML_ASSERT(ptr == (void *) (pool_addr + pool_used));
409
- }
410
- };
411
- #endif // !defined(GGML_USE_HIPBLAS)
412
-
413
- std::unique_ptr<ggml_cuda_pool> ggml_backend_cuda_context::new_pool_for_device(int device) {
414
- #if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM)
415
- if (ggml_cuda_info().devices[device].vmm) {
416
- return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_vmm(device));
417
- }
418
- #endif
419
- return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_leg(device));
420
- }
421
-
422
- // cuda buffer
423
-
424
- struct ggml_backend_cuda_buffer_context {
425
- int device;
426
- void * dev_ptr = nullptr;
427
- std::string name;
428
-
429
- ggml_backend_cuda_buffer_context(int device, void * dev_ptr) :
430
- device(device), dev_ptr(dev_ptr),
431
- name(GGML_CUDA_NAME + std::to_string(device)) {
432
- }
433
-
434
- ~ggml_backend_cuda_buffer_context() {
435
- CUDA_CHECK(cudaFree(dev_ptr));
436
- }
437
- };
438
-
439
- GGML_CALL static const char * ggml_backend_cuda_buffer_get_name(ggml_backend_buffer_t buffer) {
440
- ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
441
- return ctx->name.c_str();
442
- }
443
-
444
- GGML_CALL static bool ggml_backend_buffer_is_cuda(ggml_backend_buffer_t buffer) {
445
- return buffer->iface.get_name == ggml_backend_cuda_buffer_get_name;
446
- }
447
-
448
- GGML_CALL static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
449
- ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
450
- delete ctx;
451
- }
452
-
453
- GGML_CALL static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) {
454
- ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
455
- return ctx->dev_ptr;
456
- }
457
-
458
- GGML_CALL static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
459
- ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
460
-
461
- if (tensor->view_src != NULL) {
462
- assert(tensor->view_src->buffer->buft == buffer->buft);
463
- return;
464
- }
465
-
466
- if (ggml_is_quantized(tensor->type)) {
467
- // initialize padding to 0 to avoid possible NaN values
468
- size_t original_size = ggml_nbytes(tensor);
469
- size_t padded_size = ggml_backend_buft_get_alloc_size(buffer->buft, tensor);
470
-
471
- if (padded_size > original_size && tensor->view_src == nullptr) {
472
- ggml_cuda_set_device(ctx->device);
473
- CUDA_CHECK(cudaMemset((char *)tensor->data + original_size, 0, padded_size - original_size));
474
- }
475
- }
476
- }
477
-
478
- GGML_CALL static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
479
- ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
480
-
481
- ggml_cuda_set_device(ctx->device);
482
- CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, cudaStreamPerThread));
483
- CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
484
- }
485
-
486
- GGML_CALL static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
487
- ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
488
-
489
- ggml_cuda_set_device(ctx->device);
490
- CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, cudaStreamPerThread));
491
- CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
492
- }
493
-
494
- GGML_CALL static bool ggml_backend_cuda_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) {
495
- if (ggml_backend_buffer_is_cuda(src->buffer)) {
496
- ggml_backend_cuda_buffer_context * src_ctx = (ggml_backend_cuda_buffer_context *)src->buffer->context;
497
- ggml_backend_cuda_buffer_context * dst_ctx = (ggml_backend_cuda_buffer_context *)dst->buffer->context;
498
- if (src_ctx->device == dst_ctx->device) {
499
- CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(src), cudaMemcpyDeviceToDevice, cudaStreamPerThread));
500
- } else {
501
- #ifdef GGML_CUDA_NO_PEER_COPY
502
- return false;
503
- #else
504
- CUDA_CHECK(cudaMemcpyPeerAsync(dst->data, dst_ctx->device, src->data, src_ctx->device, ggml_nbytes(src), cudaStreamPerThread));
505
- #endif
506
- }
507
- CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
508
- return true;
509
- }
510
- return false;
511
-
512
- GGML_UNUSED(buffer);
513
- }
514
-
515
- GGML_CALL static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
516
- ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
517
-
518
- ggml_cuda_set_device(ctx->device);
519
- CUDA_CHECK(cudaDeviceSynchronize());
520
- CUDA_CHECK(cudaMemset(ctx->dev_ptr, value, buffer->size));
521
- CUDA_CHECK(cudaDeviceSynchronize());
522
- }
523
-
524
- static ggml_backend_buffer_i ggml_backend_cuda_buffer_interface = {
525
- /* .get_name = */ ggml_backend_cuda_buffer_get_name,
526
- /* .free_buffer = */ ggml_backend_cuda_buffer_free_buffer,
527
- /* .get_base = */ ggml_backend_cuda_buffer_get_base,
528
- /* .init_tensor = */ ggml_backend_cuda_buffer_init_tensor,
529
- /* .set_tensor = */ ggml_backend_cuda_buffer_set_tensor,
530
- /* .get_tensor = */ ggml_backend_cuda_buffer_get_tensor,
531
- /* .cpy_tensor = */ ggml_backend_cuda_buffer_cpy_tensor,
532
- /* .clear = */ ggml_backend_cuda_buffer_clear,
533
- /* .reset = */ NULL,
534
- };
535
-
536
- // cuda buffer type
537
- struct ggml_backend_cuda_buffer_type_context {
538
- int device;
539
- std::string name;
540
- };
541
-
542
- GGML_CALL static const char * ggml_backend_cuda_buffer_type_name(ggml_backend_buffer_type_t buft) {
543
- ggml_backend_cuda_buffer_type_context * ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
544
-
545
- return ctx->name.c_str();
546
- }
547
-
548
- static bool ggml_backend_buft_is_cuda(ggml_backend_buffer_type_t buft) {
549
- return buft->iface.get_name == ggml_backend_cuda_buffer_type_name;
550
- }
551
-
552
- GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
553
- ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
554
-
555
- ggml_cuda_set_device(buft_ctx->device);
556
-
557
- size = std::max(size, (size_t)1); // cudaMalloc returns null for size 0
558
-
559
- void * dev_ptr;
560
- cudaError_t err = ggml_cuda_device_malloc(&dev_ptr, size, buft_ctx->device);
561
- if (err != cudaSuccess) {
562
- // clear the error
563
- cudaGetLastError();
564
- GGML_CUDA_LOG_ERROR("%s: allocating %.2f MiB on device %d: cudaMalloc failed: %s\n", __func__, size / 1024.0 / 1024.0, buft_ctx->device, cudaGetErrorString(err));
565
- return nullptr;
566
- }
567
-
568
- ggml_backend_cuda_buffer_context * ctx = new ggml_backend_cuda_buffer_context(buft_ctx->device, dev_ptr);
569
-
570
- return ggml_backend_buffer_init(buft, ggml_backend_cuda_buffer_interface, ctx, size);
571
- }
572
-
573
- GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
574
- return 128;
575
-
576
- GGML_UNUSED(buft);
577
- }
578
-
579
- GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
580
- size_t size = ggml_nbytes(tensor);
581
- int64_t ne0 = tensor->ne[0];
582
-
583
- if (ggml_is_quantized(tensor->type)) {
584
- if (ne0 % MATRIX_ROW_PADDING != 0) {
585
- size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
586
- }
587
- }
588
-
589
- return size;
590
-
591
- GGML_UNUSED(buft);
592
- }
593
-
594
- static ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = {
595
- /* .get_name = */ ggml_backend_cuda_buffer_type_name,
596
- /* .alloc_buffer = */ ggml_backend_cuda_buffer_type_alloc_buffer,
597
- /* .get_alignment = */ ggml_backend_cuda_buffer_type_get_alignment,
598
- /* .get_max_size = */ NULL, // defaults to SIZE_MAX
599
- /* .get_alloc_size = */ ggml_backend_cuda_buffer_type_get_alloc_size,
600
- /* .is_host = */ NULL,
601
- };
602
-
603
- GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
604
- static std::mutex mutex;
605
- std::lock_guard<std::mutex> lock(mutex);
606
-
607
- if (device >= ggml_backend_cuda_get_device_count()) {
608
- return nullptr;
609
- }
610
-
611
- static ggml_backend_buffer_type ggml_backend_cuda_buffer_types[GGML_CUDA_MAX_DEVICES];
612
-
613
- static bool ggml_backend_cuda_buffer_type_initialized = false;
614
-
615
- if (!ggml_backend_cuda_buffer_type_initialized) {
616
- for (int i = 0; i < GGML_CUDA_MAX_DEVICES; i++) {
617
- ggml_backend_cuda_buffer_types[i] = {
618
- /* .iface = */ ggml_backend_cuda_buffer_type_interface,
619
- /* .context = */ new ggml_backend_cuda_buffer_type_context{i, GGML_CUDA_NAME + std::to_string(i)},
620
- };
621
- }
622
- ggml_backend_cuda_buffer_type_initialized = true;
623
- }
624
-
625
- return &ggml_backend_cuda_buffer_types[device];
626
- }
627
-
628
- // cuda split buffer
629
-
630
- static int64_t get_row_rounding(const std::array<float, GGML_CUDA_MAX_DEVICES> & tensor_split) {
631
- int64_t row_rounding = 0;
632
- for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
633
- if (tensor_split[id] >= (id + 1 < ggml_backend_cuda_get_device_count() ? tensor_split[id + 1] : 1.0f)) {
634
- continue;
635
- }
636
-
637
- const int cc = ggml_cuda_info().devices[id].cc;
638
- row_rounding = std::max(row_rounding, (int64_t)get_mmq_y_host(cc, get_mmq_x_max_host(cc)));
639
- }
640
- return row_rounding;
641
- }
642
-
643
- static void get_row_split(int64_t * row_low, int64_t * row_high, const ggml_tensor * tensor, const std::array<float, GGML_CUDA_MAX_DEVICES> & tensor_split, int id) {
644
- const int64_t nrows = ggml_nrows(tensor);
645
- const int64_t rounding = get_row_rounding(tensor_split);
646
-
647
- *row_low = id == 0 ? 0 : nrows*tensor_split[id];
648
- *row_low -= *row_low % rounding;
649
-
650
- if (id == ggml_backend_cuda_get_device_count() - 1) {
651
- *row_high = nrows;
652
- } else {
653
- *row_high = nrows*tensor_split[id + 1];
654
- *row_high -= *row_high % rounding;
655
- }
656
- }
657
-
658
- static size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split) {
659
- static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
660
-
661
- return nrows_split*ggml_row_size(tensor->type, tensor->ne[0]);
662
- }
663
-
664
- struct ggml_backend_cuda_split_buffer_type_context {
665
- std::array<float, GGML_CUDA_MAX_DEVICES> tensor_split;
666
- };
667
-
668
- struct ggml_backend_cuda_split_buffer_context {
669
- ~ggml_backend_cuda_split_buffer_context() {
670
- for (ggml_tensor_extra_gpu * extra : tensor_extras) {
671
- for (int id = 0; id < GGML_CUDA_MAX_DEVICES; ++id) {
672
- for (int64_t is = 0; is < GGML_CUDA_MAX_STREAMS; ++is) {
673
- if (extra->events[id][is] != nullptr) {
674
- CUDA_CHECK(cudaEventDestroy(extra->events[id][is]));
675
- }
676
- }
677
- if (extra->data_device[id] != nullptr) {
678
- CUDA_CHECK(cudaFree(extra->data_device[id]));
679
- }
680
- }
681
- delete extra;
682
- }
683
- }
684
-
685
- std::vector<ggml_tensor_extra_gpu *> tensor_extras;
686
- };
687
-
688
- GGML_CALL static const char * ggml_backend_cuda_split_buffer_get_name(ggml_backend_buffer_t buffer) {
689
- return GGML_CUDA_NAME "_Split";
690
-
691
- GGML_UNUSED(buffer);
692
- }
693
-
694
- static bool ggml_backend_buffer_is_cuda_split(ggml_backend_buffer_t buffer) {
695
- return buffer->iface.get_name == ggml_backend_cuda_split_buffer_get_name;
696
- GGML_UNUSED(ggml_backend_buffer_is_cuda_split); // only used in debug builds currently, avoid unused function warning in release builds
697
- }
698
-
699
- GGML_CALL static void ggml_backend_cuda_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
700
- ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
701
- delete ctx;
702
- }
703
-
704
- GGML_CALL static void * ggml_backend_cuda_split_buffer_get_base(ggml_backend_buffer_t buffer) {
705
- // the pointers are stored in the tensor extras, this is just a dummy address and never dereferenced
706
- return (void *)0x1000;
707
-
708
- GGML_UNUSED(buffer);
709
- }
710
-
711
- GGML_CALL static void ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
712
- GGML_ASSERT(tensor->view_src == nullptr); // views of split tensors are not supported
713
-
714
- ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
715
- ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context;
716
-
717
- const int64_t ne0 = tensor->ne[0];
718
-
719
- ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu{};
720
- ctx->tensor_extras.push_back(extra);
721
-
722
- for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
723
- int64_t row_low, row_high;
724
- get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id);
725
-
726
- int64_t nrows_split = row_high - row_low;
727
- if (nrows_split == 0) {
728
- continue;
729
- }
730
-
731
- size_t size = ggml_nbytes_split(tensor, nrows_split);
732
- const size_t original_size = size;
733
-
734
- // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
735
- if (ne0 % MATRIX_ROW_PADDING != 0) {
736
- size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
737
- }
738
-
739
- // FIXME: do not crash if cudaMalloc fails
740
- // currently, init_tensor cannot fail, it needs to be fixed in ggml-backend first
741
- ggml_cuda_set_device(id);
742
- char * buf;
743
- CUDA_CHECK(ggml_cuda_device_malloc((void**)&buf, size, id));
744
-
745
- // set padding to 0 to avoid possible NaN values
746
- if (size > original_size) {
747
- CUDA_CHECK(cudaMemset(buf + original_size, 0, size - original_size));
748
- }
749
-
750
- extra->data_device[id] = buf;
751
-
752
- for (int64_t is = 0; is < GGML_CUDA_MAX_STREAMS; ++is) {
753
- CUDA_CHECK(cudaEventCreateWithFlags(&extra->events[id][is], cudaEventDisableTiming));
754
- }
755
- }
756
- tensor->extra = extra;
757
- }
758
-
759
- GGML_CALL static void ggml_backend_cuda_split_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
760
- // split tensors must always be set in their entirety at once
761
- GGML_ASSERT(offset == 0);
762
- GGML_ASSERT(size == ggml_nbytes(tensor));
763
-
764
- ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context;
765
-
766
- const int64_t ne0 = tensor->ne[0];
767
- const size_t nb1 = tensor->nb[1];
768
- ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *)tensor->extra;
769
-
770
- for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
771
- int64_t row_low, row_high;
772
- get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id);
773
-
774
- int64_t nrows_split = row_high - row_low;
775
- if (nrows_split == 0) {
776
- continue;
777
- }
778
-
779
- const size_t offset_split = row_low*nb1;
780
- size_t size = ggml_nbytes_split(tensor, nrows_split);
781
- const size_t original_size = size;
782
-
783
- // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
784
- if (ne0 % MATRIX_ROW_PADDING != 0) {
785
- size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
786
- }
787
-
788
- const char * buf_host = (const char *)data + offset_split;
789
- CUDA_CHECK(cudaMemcpyAsync(extra->data_device[id], buf_host, original_size, cudaMemcpyHostToDevice, cudaStreamPerThread));
790
- }
791
-
792
- for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
793
- CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
794
- }
795
- }
796
-
797
- GGML_CALL static void ggml_backend_cuda_split_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
798
- // split tensors must always be set in their entirety at once
799
- GGML_ASSERT(offset == 0);
800
- GGML_ASSERT(size == ggml_nbytes(tensor));
801
-
802
- ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context;
803
-
804
- const int64_t ne0 = tensor->ne[0];
805
- const size_t nb1 = tensor->nb[1];
806
- ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *)tensor->extra;
807
-
808
- for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
809
- int64_t row_low, row_high;
810
- get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id);
811
-
812
- int64_t nrows_split = row_high - row_low;
813
- if (nrows_split == 0) {
814
- continue;
815
- }
816
-
817
- const size_t offset_split = row_low*nb1;
818
- size_t size = ggml_nbytes_split(tensor, nrows_split);
819
- const size_t original_size = size;
820
-
821
- // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
822
- if (ne0 % MATRIX_ROW_PADDING != 0) {
823
- size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
824
- }
825
-
826
- char * buf_host = (char *)data + offset_split;
827
- CUDA_CHECK(cudaMemcpyAsync(buf_host, extra->data_device[id], original_size, cudaMemcpyDeviceToHost, cudaStreamPerThread));
828
- }
829
-
830
- for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
831
- CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
832
- }
833
- }
834
-
835
- GGML_CALL static void ggml_backend_cuda_split_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
836
- GGML_UNUSED(buffer);
837
- GGML_UNUSED(value);
838
- }
839
-
840
- static struct ggml_backend_buffer_i ggml_backend_cuda_split_buffer_interface = {
841
- /* .get_name = */ ggml_backend_cuda_split_buffer_get_name,
842
- /* .free_buffer = */ ggml_backend_cuda_split_buffer_free_buffer,
843
- /* .get_base = */ ggml_backend_cuda_split_buffer_get_base,
844
- /* .init_tensor = */ ggml_backend_cuda_split_buffer_init_tensor,
845
- /* .set_tensor = */ ggml_backend_cuda_split_buffer_set_tensor,
846
- /* .get_tensor = */ ggml_backend_cuda_split_buffer_get_tensor,
847
- /* .cpy_tensor = */ NULL,
848
- /* .clear = */ ggml_backend_cuda_split_buffer_clear,
849
- /* .reset = */ NULL,
850
- };
851
-
852
- // cuda split buffer type
853
-
854
- GGML_CALL static const char * ggml_backend_cuda_split_buffer_type_name(ggml_backend_buffer_type_t buft) {
855
- return GGML_CUDA_NAME "_Split";
856
-
857
- GGML_UNUSED(buft);
858
- }
859
-
860
- static bool ggml_backend_buft_is_cuda_split(ggml_backend_buffer_type_t buft) {
861
- return buft->iface.get_name == ggml_backend_cuda_split_buffer_type_name;
862
- }
863
-
864
- GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_split_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
865
- // since we don't know the exact split after rounding, we cannot allocate the device buffers at this point
866
- // instead, we allocate them for each tensor separately in init_tensor
867
- // however, the size still represents the maximum cumulative size of all the device buffers after the tensors are allocated,
868
- // as returned by get_alloc_size. this limit is enforced during tensor allocation by ggml-alloc, so it must be correct.
869
- ggml_backend_cuda_split_buffer_context * ctx = new ggml_backend_cuda_split_buffer_context();
870
-
871
- return ggml_backend_buffer_init(buft, ggml_backend_cuda_split_buffer_interface, ctx, size);
872
- }
873
-
874
- GGML_CALL static size_t ggml_backend_cuda_split_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
875
- return 128;
876
-
877
- GGML_UNUSED(buft);
878
- }
879
-
880
- GGML_CALL static size_t ggml_backend_cuda_split_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
881
- ggml_backend_cuda_split_buffer_type_context * ctx = (ggml_backend_cuda_split_buffer_type_context *)buft->context;
882
-
883
- size_t total_size = 0;
884
-
885
- const int64_t ne0 = tensor->ne[0];
886
-
887
- for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
888
- int64_t row_low, row_high;
889
- get_row_split(&row_low, &row_high, tensor, ctx->tensor_split, id);
890
-
891
- int64_t nrows_split = row_high - row_low;
892
- if (nrows_split == 0) {
893
- continue;
894
- }
895
-
896
- total_size += ggml_nbytes_split(tensor, nrows_split);
897
-
898
- // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
899
- if (ne0 % MATRIX_ROW_PADDING != 0) {
900
- total_size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
901
- }
902
- }
903
-
904
- return total_size;
905
- }
906
-
907
- GGML_CALL static bool ggml_backend_cuda_split_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
908
- return false;
909
-
910
- GGML_UNUSED(buft);
911
- }
912
-
913
- static ggml_backend_buffer_type_i ggml_backend_cuda_split_buffer_type_interface = {
914
- /* .get_name = */ ggml_backend_cuda_split_buffer_type_name,
915
- /* .alloc_buffer = */ ggml_backend_cuda_split_buffer_type_alloc_buffer,
916
- /* .get_alignment = */ ggml_backend_cuda_split_buffer_type_get_alignment,
917
- /* .get_max_size = */ NULL, // defaults to SIZE_MAX
918
- /* .get_alloc_size = */ ggml_backend_cuda_split_buffer_type_get_alloc_size,
919
- /* .is_host = */ ggml_backend_cuda_split_buffer_type_is_host,
920
- };
921
-
922
- GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split) {
923
- static std::mutex mutex;
924
- std::lock_guard<std::mutex> lock(mutex);
925
-
926
- static std::map<std::array<float, GGML_CUDA_MAX_DEVICES>, struct ggml_backend_buffer_type> buft_map;
927
-
928
- std::array<float, GGML_CUDA_MAX_DEVICES> tensor_split_arr = {};
929
-
930
- bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + GGML_CUDA_MAX_DEVICES, [](float x) { return x == 0.0f; });
931
- if (all_zero) {
932
- tensor_split_arr = ggml_cuda_info().default_tensor_split;
933
- } else {
934
- float split_sum = 0.0f;
935
- for (int i = 0; i < ggml_backend_cuda_get_device_count(); ++i) {
936
- tensor_split_arr[i] = split_sum;
937
- split_sum += tensor_split[i];
938
- }
939
- for (int i = 0; i < ggml_backend_cuda_get_device_count(); ++i) {
940
- tensor_split_arr[i] /= split_sum;
941
- }
942
- }
943
-
944
- auto it = buft_map.find(tensor_split_arr);
945
- if (it != buft_map.end()) {
946
- return &it->second;
947
- }
948
-
949
- struct ggml_backend_buffer_type buft {
950
- /* .iface = */ ggml_backend_cuda_split_buffer_type_interface,
951
- /* .context = */ new ggml_backend_cuda_split_buffer_type_context{tensor_split_arr},
952
- };
953
-
954
- auto result = buft_map.emplace(tensor_split_arr, buft);
955
- return &result.first->second;
956
- }
957
-
958
- // host buffer type
959
-
960
- GGML_CALL static const char * ggml_backend_cuda_host_buffer_type_name(ggml_backend_buffer_type_t buft) {
961
- return GGML_CUDA_NAME "_Host";
962
-
963
- GGML_UNUSED(buft);
964
- }
965
-
966
- GGML_CALL static const char * ggml_backend_cuda_host_buffer_name(ggml_backend_buffer_t buffer) {
967
- return GGML_CUDA_NAME "_Host";
968
-
969
- GGML_UNUSED(buffer);
970
- }
971
-
972
- GGML_CALL static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
973
- CUDA_CHECK(cudaFreeHost(buffer->context));
974
- }
975
-
976
- static void * ggml_cuda_host_malloc(size_t size) {
977
- if (getenv("GGML_CUDA_NO_PINNED") != nullptr) {
978
- return nullptr;
979
- }
980
-
981
- void * ptr = nullptr;
982
- cudaError_t err = cudaMallocHost((void **) &ptr, size);
983
- if (err != cudaSuccess) {
984
- // clear the error
985
- cudaGetLastError();
986
- GGML_CUDA_LOG_WARN("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__,
987
- size / 1024.0 / 1024.0, cudaGetErrorString(err));
988
- return nullptr;
989
- }
990
-
991
- return ptr;
992
- }
993
-
994
- GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
995
- void * ptr = ggml_cuda_host_malloc(size);
996
-
997
- if (ptr == nullptr) {
998
- // fallback to cpu buffer
999
- return ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size);
1000
- }
1001
-
1002
- ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
1003
- buffer->buft = buft;
1004
- buffer->iface.get_name = ggml_backend_cuda_host_buffer_name;
1005
- buffer->iface.free_buffer = ggml_backend_cuda_host_buffer_free_buffer;
1006
-
1007
- return buffer;
1008
- }
1009
-
1010
- GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
1011
- static struct ggml_backend_buffer_type ggml_backend_cuda_buffer_type_host = {
1012
- /* .iface = */ {
1013
- /* .get_name = */ ggml_backend_cuda_host_buffer_type_name,
1014
- /* .alloc_buffer = */ ggml_backend_cuda_host_buffer_type_alloc_buffer,
1015
- /* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
1016
- /* .get_max_size = */ NULL, // defaults to SIZE_MAX
1017
- /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
1018
- /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
1019
- },
1020
- /* .context = */ nullptr,
1021
- };
1022
-
1023
- return &ggml_backend_cuda_buffer_type_host;
1024
- }
1025
-
1026
- //static bool ggml_backend_buffer_is_cuda_host(ggml_backend_buffer_t buffer) {
1027
- // return buffer->buft->iface.get_name == ggml_backend_cuda_host_buffer_type_name;
1028
- //}
1029
-
1030
- /// kernels
1031
-
1032
- typedef void (*ggml_cuda_op_mul_mat_t)(
1033
- ggml_backend_cuda_context & ctx,
1034
- const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
1035
- const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
1036
- const int64_t src1_padded_row_size, cudaStream_t stream);
1037
-
1038
- #ifndef GGML_CUDA_PEER_MAX_BATCH_SIZE
1039
- #define GGML_CUDA_PEER_MAX_BATCH_SIZE 128
1040
- #endif // GGML_CUDA_PEER_MAX_BATCH_SIZE
1041
-
1042
- #define MUL_MAT_SRC1_COL_STRIDE 128
1043
-
1044
- static __global__ void mul_mat_p021_f16_f32(
1045
- const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst,
1046
- const int ncols_x, const int nrows_x, const int nchannels_x, const int nchannels_y) {
1047
-
1048
- const half * x = (const half *) vx;
1049
-
1050
- const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
1051
- const int channel = blockDim.z*blockIdx.z + threadIdx.z;
1052
- const int channel_x = channel / (nchannels_y / nchannels_x);
1053
-
1054
- const int nrows_y = ncols_x;
1055
- const int nrows_dst = nrows_x;
1056
- const int row_dst = row_x;
1057
-
1058
- float tmp = 0.0f;
1059
-
1060
- for (int col_x0 = 0; col_x0 < ncols_x; col_x0 += blockDim.x) {
1061
- const int col_x = col_x0 + threadIdx.x;
1062
-
1063
- if (col_x >= ncols_x) {
1064
- break;
1065
- }
1066
-
1067
- // x is transposed and permuted
1068
- const int ix = row_x*nchannels_x*ncols_x + channel_x*ncols_x + col_x;
1069
- const float xi = __half2float(x[ix]);
1070
-
1071
- const int row_y = col_x;
1072
-
1073
- // y is not transposed but permuted
1074
- const int iy = channel*nrows_y + row_y;
1075
-
1076
- tmp += xi * y[iy];
1077
- }
1078
-
1079
- // dst is not transposed and not permuted
1080
- const int idst = channel*nrows_dst + row_dst;
1081
-
1082
- // sum up partial sums and write back result
1083
- tmp = warp_reduce_sum(tmp);
1084
-
1085
- if (threadIdx.x == 0) {
1086
- dst[idst] = tmp;
1087
- }
1088
- }
1089
-
1090
- static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
1091
- const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst, const int ncols_x, const int nrows_x,
1092
- const int row_stride_x, const int channel_stride_x, const int channel_x_divisor) {
1093
-
1094
- const half * x = (const half *) vx;
1095
-
1096
- const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
1097
- const int channel = blockDim.z*blockIdx.z + threadIdx.z;
1098
- const int channel_x = channel / channel_x_divisor;
1099
-
1100
- const int nrows_y = ncols_x;
1101
- const int nrows_dst = nrows_x;
1102
- const int row_dst = row_x;
1103
-
1104
- const int idst = channel*nrows_dst + row_dst;
1105
-
1106
- float tmp = 0.0f;
1107
-
1108
- for (int col_x0 = 0; col_x0 < ncols_x; col_x0 += blockDim.x) {
1109
- const int col_x = col_x0 + threadIdx.x;
1110
-
1111
- if (col_x >= ncols_x) {
1112
- break;
1113
- }
1114
-
1115
- const int row_y = col_x;
1116
-
1117
- const int ix = channel_x*channel_stride_x + row_x*row_stride_x + col_x;
1118
- const int iy = channel*nrows_y + row_y;
1119
-
1120
- const float xi = __half2float(x[ix]);
1121
-
1122
- tmp += xi * y[iy];
1123
- }
1124
-
1125
- // sum up partial sums and write back result
1126
- tmp = warp_reduce_sum(tmp);
1127
-
1128
- if (threadIdx.x == 0) {
1129
- dst[idst] = tmp;
1130
- }
1131
- }
1132
-
1133
- static void ggml_mul_mat_p021_f16_f32_cuda(
1134
- const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x,
1135
- const int nchannels_x, const int nchannels_y, cudaStream_t stream) {
1136
-
1137
- const dim3 block_nums(1, nrows_x, nchannels_y);
1138
- const dim3 block_dims(WARP_SIZE, 1, 1);
1139
- mul_mat_p021_f16_f32<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols_x, nrows_x, nchannels_x, nchannels_y);
1140
- }
1141
-
1142
- static void ggml_mul_mat_vec_nc_f16_f32_cuda(
1143
- const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x, const int row_stride_x,
1144
- const int nchannels_x, const int nchannels_y, const int channel_stride_x, cudaStream_t stream) {
1145
-
1146
- const dim3 block_nums(1, nrows_x, nchannels_y);
1147
- const dim3 block_dims(WARP_SIZE, 1, 1);
1148
- mul_mat_vec_nc_f16_f32<<<block_nums, block_dims, 0, stream>>>
1149
- (vx, y, dst, ncols_x, nrows_x, row_stride_x, channel_stride_x, nchannels_y/nchannels_x);
1150
- }
1151
-
1152
- static cudaError_t ggml_cuda_cpy_tensor_2d(
1153
- void * dst, const struct ggml_tensor * src, int64_t i3, int64_t i2, int64_t i1_low, int64_t i1_high, cudaStream_t stream) {
1154
-
1155
- GGML_ASSERT(ggml_backend_buffer_is_cuda(src->buffer));
1156
- char * src_ptr = (char *) src->data;
1157
- char * dst_ptr = (char *) dst;
1158
-
1159
- const int64_t ne0 = src->ne[0];
1160
- const int64_t nb0 = src->nb[0];
1161
- const int64_t nb1 = src->nb[1];
1162
- const int64_t nb2 = src->nb[2];
1163
- const int64_t nb3 = src->nb[3];
1164
- const enum ggml_type type = src->type;
1165
- const int64_t ts = ggml_type_size(type);
1166
- const int64_t bs = ggml_blck_size(type);
1167
- int64_t i1_diff = i1_high - i1_low;
1168
-
1169
- const char * x = src_ptr + i1_low*nb1 + i2*nb2 + i3*nb3;
1170
- if (nb0 == ts && nb1 == ts*ne0/bs) {
1171
- return cudaMemcpyAsync(dst_ptr, x, i1_diff*nb1, cudaMemcpyDeviceToDevice, stream);
1172
- } else if (nb0 == ts) {
1173
- return cudaMemcpy2DAsync(dst_ptr, ts*ne0/bs, x, nb1, ts*ne0/bs, i1_diff, cudaMemcpyDeviceToDevice, stream);
1174
- } else {
1175
- for (int64_t i1 = 0; i1 < i1_diff; i1++) {
1176
- const void * rx = (const void *) ((const char *) x + i1*nb1);
1177
- void * rd = (void *) (dst_ptr + i1*ts*ne0/bs);
1178
- // pretend the row is a matrix with cols=1
1179
- cudaError_t r = cudaMemcpy2DAsync(rd, ts/bs, rx, nb0, ts/bs, ne0, cudaMemcpyDeviceToDevice, stream);
1180
- if (r != cudaSuccess) {
1181
- return r;
1182
- }
1183
- }
1184
- return cudaSuccess;
1185
- }
1186
- }
1187
-
1188
- static void ggml_cuda_op_mul_mat_cublas(
1189
- ggml_backend_cuda_context & ctx,
1190
- const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
1191
- const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
1192
- const int64_t src1_padded_row_size, cudaStream_t stream) {
1193
-
1194
- GGML_ASSERT(src0_dd_i != nullptr);
1195
- GGML_ASSERT(src1_ddf_i != nullptr);
1196
- GGML_ASSERT(dst_dd_i != nullptr);
1197
-
1198
- const int64_t ne00 = src0->ne[0];
1199
- const int64_t ne10 = src1->ne[0];
1200
-
1201
- const int64_t ne0 = dst->ne[0];
1202
-
1203
- const int64_t row_diff = row_high - row_low;
1204
-
1205
- int id = ggml_cuda_get_device();
1206
-
1207
- // the main device has a larger memory buffer to hold the results from all GPUs
1208
- // ldc == nrows of the matrix that cuBLAS writes into
1209
- int64_t ldc = id == ctx.device ? ne0 : row_diff;
1210
-
1211
- const int compute_capability = ggml_cuda_info().devices[id].cc;
1212
-
1213
- if (compute_capability >= CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT) {
1214
- // convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32
1215
- ggml_cuda_pool_alloc<half> src0_as_f16(ctx.pool(id));
1216
- if (src0->type != GGML_TYPE_F16) {
1217
- const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src0->type);
1218
- GGML_ASSERT(to_fp16_cuda != nullptr);
1219
- size_t ne = row_diff*ne00;
1220
- src0_as_f16.alloc(ne);
1221
- to_fp16_cuda(src0_dd_i, src0_as_f16.get(), ne, stream);
1222
- }
1223
- const half * src0_ptr = src0->type == GGML_TYPE_F16 ? (const half *) src0_dd_i : src0_as_f16.get();
1224
-
1225
- ggml_cuda_pool_alloc<half> src1_as_f16(ctx.pool(id));
1226
- if (src1->type != GGML_TYPE_F16) {
1227
- const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
1228
- GGML_ASSERT(to_fp16_cuda != nullptr);
1229
- size_t ne = src1_ncols*ne10;
1230
- src1_as_f16.alloc(ne);
1231
- to_fp16_cuda(src1_ddf_i, src1_as_f16.get(), ne, stream);
1232
- }
1233
- const half * src1_ptr = src1->type == GGML_TYPE_F16 ? (const half *) src1_ddf_i : src1_as_f16.get();
1234
- ggml_cuda_pool_alloc<half> dst_f16(ctx.pool(id), row_diff*src1_ncols);
1235
-
1236
- const half alpha_f16 = 1.0f;
1237
- const half beta_f16 = 0.0f;
1238
-
1239
- CUBLAS_CHECK(cublasSetStream(ctx.cublas_handle(id), stream));
1240
- CUBLAS_CHECK(
1241
- cublasGemmEx(ctx.cublas_handle(id), CUBLAS_OP_T, CUBLAS_OP_N,
1242
- row_diff, src1_ncols, ne10,
1243
- &alpha_f16, src0_ptr, CUDA_R_16F, ne00,
1244
- src1_ptr, CUDA_R_16F, ne10,
1245
- &beta_f16, dst_f16.get(), CUDA_R_16F, ldc,
1246
- CUBLAS_COMPUTE_16F,
1247
- CUBLAS_GEMM_DEFAULT_TENSOR_OP));
1248
-
1249
- const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
1250
- to_fp32_cuda(dst_f16.get(), dst_dd_i, row_diff*src1_ncols, stream);
1251
- } else {
1252
- ggml_cuda_pool_alloc<float> src0_ddq_as_f32(ctx.pool(id));
1253
- ggml_cuda_pool_alloc<float> src1_ddq_as_f32(ctx.pool(id));
1254
-
1255
- if (src0->type != GGML_TYPE_F32) {
1256
- const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type);
1257
- GGML_ASSERT(to_fp32_cuda != nullptr);
1258
- src0_ddq_as_f32.alloc(row_diff*ne00);
1259
- to_fp32_cuda(src0_dd_i, src0_ddq_as_f32.get(), row_diff*ne00, stream);
1260
- }
1261
- if (src1->type != GGML_TYPE_F32) {
1262
- const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src1->type);
1263
- GGML_ASSERT(to_fp32_cuda != nullptr);
1264
- src1_ddq_as_f32.alloc(src1_ncols*ne10);
1265
- to_fp32_cuda(src1_ddf_i, src1_ddq_as_f32.get(), src1_ncols*ne10, stream);
1266
- }
1267
-
1268
- const float * src0_ddf_i = src0->type == GGML_TYPE_F32 ? (const float *) src0_dd_i : src0_ddq_as_f32.get();
1269
- const float * src1_ddf1_i = src1->type == GGML_TYPE_F32 ? (const float *) src1_ddf_i : src1_ddq_as_f32.get();
1270
-
1271
- const float alpha = 1.0f;
1272
- const float beta = 0.0f;
1273
-
1274
- CUBLAS_CHECK(cublasSetStream(ctx.cublas_handle(id), stream));
1275
- CUBLAS_CHECK(
1276
- cublasSgemm(ctx.cublas_handle(id), CUBLAS_OP_T, CUBLAS_OP_N,
1277
- row_diff, src1_ncols, ne10,
1278
- &alpha, src0_ddf_i, ne00,
1279
- src1_ddf1_i, ne10,
1280
- &beta, dst_dd_i, ldc));
1281
- }
1282
-
1283
- GGML_UNUSED(dst);
1284
- GGML_UNUSED(src1_ddq_i);
1285
- GGML_UNUSED(src1_padded_row_size);
1286
- }
1287
-
1288
- static void ggml_cuda_set_peer_access(const int n_tokens, int main_device) {
1289
- static bool peer_access_enabled = false;
1290
-
1291
- const bool enable_peer_access = n_tokens <= GGML_CUDA_PEER_MAX_BATCH_SIZE;
1292
-
1293
- if (peer_access_enabled == enable_peer_access) {
1294
- return;
1295
- }
1296
-
1297
- #ifdef NDEBUG
1298
- for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
1299
- ggml_cuda_set_device(id);
1300
- CUDA_CHECK(cudaDeviceSynchronize());
1301
- }
1302
-
1303
- for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
1304
- ggml_cuda_set_device(id);
1305
-
1306
- for (int id_other = 0; id_other < ggml_backend_cuda_get_device_count(); ++id_other) {
1307
- if (id == id_other) {
1308
- continue;
1309
- }
1310
- if (id != main_device && id_other != main_device) {
1311
- continue;
1312
- }
1313
-
1314
- int can_access_peer;
1315
- CUDA_CHECK(cudaDeviceCanAccessPeer(&can_access_peer, id, id_other));
1316
- if (can_access_peer) {
1317
- if (enable_peer_access) {
1318
- cudaError_t err = cudaDeviceEnablePeerAccess(id_other, 0);
1319
- if (err != cudaErrorPeerAccessAlreadyEnabled) {
1320
- CUDA_CHECK(err);
1321
- }
1322
- } else {
1323
- cudaError_t err = cudaDeviceDisablePeerAccess(id_other);
1324
- if (err != cudaErrorPeerAccessNotEnabled) {
1325
- CUDA_CHECK(err);
1326
- }
1327
- }
1328
- }
1329
- }
1330
- }
1331
-
1332
- ggml_cuda_set_device(main_device);
1333
- #endif // NDEBUG
1334
-
1335
- peer_access_enabled = enable_peer_access;
1336
-
1337
- GGML_UNUSED(main_device);
1338
- }
1339
-
1340
- static cudaError_t ggml_cuda_Memcpy2DPeerAsync(
1341
- void * dst, int dstDevice, size_t dpitch, void * src, int srcDevice, size_t spitch, size_t width, size_t height, cudaStream_t stream) {
1342
-
1343
- #if !defined(GGML_USE_HIPBLAS)
1344
- // cudaMemcpy2DAsync may fail with copies between vmm pools of different devices
1345
- cudaMemcpy3DPeerParms p = {};
1346
- p.dstDevice = dstDevice;
1347
- p.dstPtr = make_cudaPitchedPtr(dst, dpitch, dpitch, height);
1348
- p.srcDevice = srcDevice;
1349
- p.srcPtr = make_cudaPitchedPtr(src, spitch, spitch, height);
1350
- p.extent = make_cudaExtent(width, height, 1);
1351
- return cudaMemcpy3DPeerAsync(&p, stream);
1352
- #else
1353
- // HIP does not support cudaMemcpy3DPeerAsync or vmm pools
1354
- GGML_UNUSED(dstDevice);
1355
- GGML_UNUSED(srcDevice);
1356
- return cudaMemcpy2DAsync(dst, dpitch, src, spitch, width, height, cudaMemcpyDeviceToDevice, stream);
1357
- #endif // !defined(GGML_USE_HIPBLAS)
1358
- }
1359
-
1360
- static void ggml_cuda_op_mul_mat(
1361
- ggml_backend_cuda_context & ctx,
1362
- const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, ggml_cuda_op_mul_mat_t op,
1363
- quantize_cuda_t quantize_src1) {
1364
-
1365
- const int64_t ne00 = src0->ne[0];
1366
- const int64_t ne01 = src0->ne[1];
1367
- const int64_t ne02 = src0->ne[2];
1368
- const int64_t ne03 = src0->ne[3];
1369
-
1370
- const int64_t ne10 = src1->ne[0];
1371
- const int64_t ne11 = src1->ne[1];
1372
- const int64_t ne12 = src1->ne[2];
1373
- const int64_t ne13 = src1->ne[3];
1374
- const int64_t nrows1 = ggml_nrows(src1);
1375
-
1376
- GGML_ASSERT(ne03 == ne13);
1377
-
1378
- const int64_t ne0 = dst->ne[0];
1379
- const int64_t ne1 = dst->ne[1];
1380
-
1381
- const int64_t nb2 = dst->nb[2];
1382
- const int64_t nb3 = dst->nb[3];
1383
-
1384
- GGML_ASSERT(ggml_backend_buffer_is_cuda(dst->buffer));
1385
- GGML_ASSERT(ggml_backend_buffer_is_cuda(src1->buffer));
1386
- ggml_backend_cuda_buffer_context * src1_ctx = (ggml_backend_cuda_buffer_context *) src1->buffer->context;
1387
- ggml_backend_cuda_buffer_context * dst_ctx = (ggml_backend_cuda_buffer_context *) dst->buffer->context;
1388
-
1389
- GGML_ASSERT(src1->type == GGML_TYPE_F32 || (src1->ne[2] == 1 && src1->ne[3] == 1));
1390
-
1391
- GGML_ASSERT(ne12 >= ne02 && ne12 % ne02 == 0);
1392
-
1393
- const int64_t i02_divisor = ne12 / ne02;
1394
-
1395
- const size_t src0_ts = ggml_type_size(src0->type);
1396
- const size_t src0_bs = ggml_blck_size(src0->type);
1397
- const size_t q8_1_ts = sizeof(block_q8_1);
1398
- const size_t q8_1_bs = QK8_1;
1399
-
1400
- const bool src0_is_contiguous = ggml_is_contiguous(src0);
1401
- const bool src1_is_contiguous = ggml_is_contiguous(src1);
1402
-
1403
- const int64_t src1_padded_col_size = GGML_PAD(ne10, MATRIX_ROW_PADDING);
1404
-
1405
- const bool split = ggml_backend_buffer_is_cuda_split(src0->buffer);
1406
- GGML_ASSERT(!(split && ne02 > 1));
1407
- GGML_ASSERT(!(split && ne03 > 1));
1408
- GGML_ASSERT(!(split && ne02 < ne12));
1409
-
1410
- ggml_tensor_extra_gpu * src0_extra = split ? (ggml_tensor_extra_gpu *) src0->extra : nullptr;
1411
-
1412
-
1413
- std::array<float, GGML_CUDA_MAX_DEVICES> tensor_split;
1414
- if (split) {
1415
- ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) src0->buffer->buft->context;
1416
- tensor_split = buft_ctx->tensor_split;
1417
- }
1418
-
1419
- struct dev_data {
1420
- int cc;
1421
-
1422
- ggml_cuda_pool_alloc<char> src0_dd_alloc;
1423
- ggml_cuda_pool_alloc<float> src1_ddf_alloc;
1424
- ggml_cuda_pool_alloc<char> src1_ddq_alloc;
1425
- ggml_cuda_pool_alloc<float> dst_dd_alloc;
1426
-
1427
- char * src0_dd = nullptr;
1428
- float * src1_ddf = nullptr; // float
1429
- char * src1_ddq = nullptr; // q8_1
1430
- float * dst_dd = nullptr;
1431
-
1432
- int64_t row_low;
1433
- int64_t row_high;
1434
- };
1435
-
1436
- dev_data dev[GGML_CUDA_MAX_DEVICES];
1437
-
1438
- int used_devices = 0;
1439
-
1440
- for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
1441
- dev[id].cc = ggml_cuda_info().devices[id].cc;
1442
-
1443
- // by default, use all rows
1444
- dev[id].row_low = 0;
1445
- dev[id].row_high = ne01;
1446
-
1447
- // for multi GPU, get the row boundaries from tensor split
1448
- // and round to mul_mat_q tile sizes
1449
- if (split) {
1450
- const int64_t rounding = get_row_rounding(tensor_split);
1451
-
1452
- if (id != 0) {
1453
- dev[id].row_low = ne01*tensor_split[id];
1454
- if (dev[id].row_low < ne01) {
1455
- dev[id].row_low -= dev[id].row_low % rounding;
1456
- }
1457
- }
1458
-
1459
- if (id != ggml_backend_cuda_get_device_count() - 1) {
1460
- dev[id].row_high = ne01*tensor_split[id + 1];
1461
- if (dev[id].row_high < ne01) {
1462
- dev[id].row_high -= dev[id].row_high % rounding;
1463
- }
1464
- }
1465
- }
1466
- }
1467
-
1468
- for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
1469
- if ((!split && id != ctx.device) || dev[id].row_low == dev[id].row_high) {
1470
- continue;
1471
- }
1472
-
1473
- used_devices++;
1474
-
1475
- const bool src1_on_device = id == src1_ctx->device;
1476
- const bool dst_on_device = id == dst_ctx->device;
1477
-
1478
- ggml_cuda_set_device(id);
1479
- cudaStream_t stream = ctx.stream(id, 0);
1480
-
1481
- if (src0_is_contiguous) {
1482
- dev[id].src0_dd = split ? (char *) src0_extra->data_device[id] : (char *) src0->data;
1483
- } else {
1484
- dev[id].src0_dd = dev[id].src0_dd_alloc.alloc(ctx.pool(id), ggml_nbytes(src0));
1485
- }
1486
-
1487
- if (src1_on_device && src1_is_contiguous) {
1488
- dev[id].src1_ddf = (float *) src1->data;
1489
- } else {
1490
- dev[id].src1_ddf = dev[id].src1_ddf_alloc.alloc(ctx.pool(id), ggml_nelements(src1));
1491
- }
1492
-
1493
- if (quantize_src1) {
1494
- size_t src_1_ddq_size = nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs;
1495
- if (quantize_src1 == quantize_mmq_q8_1_cuda) {
1496
- src_1_ddq_size += get_mmq_x_max_host(dev[id].cc)*sizeof(block_q8_1_mmq);
1497
- }
1498
- dev[id].src1_ddq = dev[id].src1_ddq_alloc.alloc(ctx.pool(id), src_1_ddq_size);
1499
-
1500
- if (src1_on_device && src1_is_contiguous) {
1501
- quantize_src1(dev[id].src1_ddf, dev[id].src1_ddq, ne10, ne11, ne12*ne13, src1_padded_col_size, src0->type, stream);
1502
- CUDA_CHECK(cudaGetLastError());
1503
- }
1504
- }
1505
-
1506
- if (dst_on_device) {
1507
- dev[id].dst_dd = (float *) dst->data;
1508
- } else {
1509
- const size_t size_dst_ddf = split ? (dev[id].row_high - dev[id].row_low)*ne1 : ggml_nelements(dst);
1510
- dev[id].dst_dd = dev[id].dst_dd_alloc.alloc(ctx.pool(id), size_dst_ddf);
1511
- }
1512
- }
1513
-
1514
- // if multiple devices are used they need to wait for the main device
1515
- // here an event is recorded that signals that the main device has finished calculating the input data
1516
- if (split && used_devices > 1) {
1517
- ggml_cuda_set_device(ctx.device);
1518
- CUDA_CHECK(cudaEventRecord(src0_extra->events[ctx.device][0], ctx.stream()));
1519
- }
1520
-
1521
- const int64_t src1_col_stride = split && used_devices > 1 ? MUL_MAT_SRC1_COL_STRIDE : ne11;
1522
- for (int64_t src1_col_0 = 0; src1_col_0 < ne11; src1_col_0 += src1_col_stride) {
1523
- const int64_t is = split ? (src1_col_0/src1_col_stride) % GGML_CUDA_MAX_STREAMS : 0;
1524
- const int64_t src1_ncols = src1_col_0 + src1_col_stride > ne11 ? ne11 - src1_col_0 : src1_col_stride;
1525
-
1526
- for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
1527
- if ((!split && id != ctx.device) || dev[id].row_low == dev[id].row_high) {
1528
- continue;
1529
- }
1530
-
1531
- const bool src1_on_device = id == src1_ctx->device;
1532
- const bool dst_on_device = id == dst_ctx->device;
1533
- const int64_t row_diff = dev[id].row_high - dev[id].row_low;
1534
-
1535
- ggml_cuda_set_device(id);
1536
- cudaStream_t stream = ctx.stream(id, is);
1537
-
1538
- // wait for main GPU data if necessary
1539
- if (split && (id != ctx.device || is != 0)) {
1540
- CUDA_CHECK(cudaStreamWaitEvent(stream, src0_extra->events[ctx.device][0], 0));
1541
- }
1542
-
1543
- for (int64_t i0 = 0; i0 < ne13*ne12; ++i0) {
1544
- const int64_t i03 = i0 / ne12;
1545
- const int64_t i02 = i0 % ne12;
1546
-
1547
- size_t src1_ddq_i_offset = i0*ne11 * src1_padded_col_size*q8_1_ts/q8_1_bs;
1548
- if (quantize_src1 == quantize_mmq_q8_1_cuda) {
1549
- src1_ddq_i_offset += src1_col_0 * sizeof(block_q8_1_mmq);
1550
- } else {
1551
- src1_ddq_i_offset += src1_col_0 * src1_padded_col_size*q8_1_ts/q8_1_bs;
1552
- }
1553
-
1554
- // for split tensors the data begins at i0 == i0_offset_low
1555
- char * src0_dd_i = dev[id].src0_dd + (i0/i02_divisor) * (ne01*ne00*src0_ts)/src0_bs;
1556
- float * src1_ddf_i = dev[id].src1_ddf + (i0*ne11 + src1_col_0) * ne10;
1557
- char * src1_ddq_i = dev[id].src1_ddq + src1_ddq_i_offset;
1558
- float * dst_dd_i = dev[id].dst_dd + (i0*ne1 + src1_col_0) * (dst_on_device ? ne0 : row_diff);
1559
-
1560
- // the main device memory buffer can be on VRAM scratch, with space for all partial results
1561
- // in that case an offset on dst_ddf_i is needed
1562
- if (id == ctx.device) {
1563
- dst_dd_i += dev[id].row_low; // offset is 0 if no tensor split
1564
- }
1565
-
1566
- // copy src0, src1 to device if necessary
1567
- if (src1_is_contiguous) {
1568
- if (id != ctx.device) {
1569
- if (quantize_src1) {
1570
- char * src1_ddq_i_source = dev[ctx.device].src1_ddq + src1_ddq_i_offset;
1571
- if (quantize_src1 == quantize_mmq_q8_1_cuda) {
1572
- const size_t pitch = ne11*sizeof(block_q8_1_mmq);
1573
- const size_t width = src1_ncols*sizeof(block_q8_1_mmq);
1574
- const size_t height = src1_padded_col_size/(4*QK8_1);
1575
- CUDA_CHECK(ggml_cuda_Memcpy2DPeerAsync(src1_ddq_i, id, pitch, src1_ddq_i_source, ctx.device, pitch, width, height, stream));
1576
- } else {
1577
- CUDA_CHECK(cudaMemcpyPeerAsync(
1578
- src1_ddq_i, id, src1_ddq_i_source, ctx.device, src1_ncols*src1_padded_col_size*q8_1_ts/q8_1_bs, stream));
1579
- }
1580
- } else {
1581
- float * src1_ddf_i_source = (float *) src1->data;
1582
- src1_ddf_i_source += (i0*ne11 + src1_col_0) * ne10;
1583
- CUDA_CHECK(cudaMemcpyPeerAsync(src1_ddf_i, id, src1_ddf_i_source, ctx.device,
1584
- src1_ncols*ne10*sizeof(float), stream));
1585
- }
1586
- }
1587
- } else if (src1_on_device && !src1_is_contiguous) {
1588
- CUDA_CHECK(ggml_cuda_cpy_tensor_2d(
1589
- src1_ddf_i, src1, i03, i02, src1_col_0, src1_col_0+src1_ncols, stream));
1590
- } else {
1591
- GGML_ASSERT(false);
1592
- }
1593
-
1594
- if (quantize_src1 && !src1_is_contiguous) {
1595
- quantize_src1(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, 1, src1_padded_col_size, src0->type, stream);
1596
- CUDA_CHECK(cudaGetLastError());
1597
- }
1598
-
1599
- if (src1_col_0 == 0 && !src0_is_contiguous && i02 % i02_divisor == 0) {
1600
- CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_dd_i, src0, i03, i02/i02_divisor, dev[id].row_low, dev[id].row_high, stream));
1601
- }
1602
-
1603
- // do the computation
1604
- op(ctx, src0, src1, dst, src0_dd_i, src1_ddf_i, src1_ddq_i, dst_dd_i,
1605
- dev[id].row_low, dev[id].row_high, src1_ncols, src1_padded_col_size, stream);
1606
- CUDA_CHECK(cudaGetLastError());
1607
-
1608
- // copy dst to host or other device if necessary
1609
- if (!dst_on_device) {
1610
- void * dst_off_device = dst->data;
1611
- if (split) {
1612
- // src0 = weight matrix is saved as a transposed matrix for better memory layout.
1613
- // dst is NOT transposed.
1614
- // The outputs of matrix matrix multiplications can therefore NOT simply be concatenated for >1 GPU.
1615
- // Instead they need to be copied to the correct slice in ne0 = dst row index.
1616
- // If dst is a vector with ne0 == 1 then you don't have to do this but it still produces correct results.
1617
- float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
1618
- GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
1619
- dhf_dst_i += src1_col_0*ne0 + dev[id].row_low;
1620
- CUDA_CHECK(ggml_cuda_Memcpy2DPeerAsync(
1621
- dhf_dst_i, ctx.device, ne0*sizeof(float), dst_dd_i, id, row_diff*sizeof(float), row_diff*sizeof(float), src1_ncols, stream));
1622
- } else {
1623
- float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
1624
- GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
1625
- dhf_dst_i += src1_col_0*ne0;
1626
- CUDA_CHECK(cudaMemcpyAsync(dhf_dst_i, dst_dd_i, src1_ncols*ne0*sizeof(float), cudaMemcpyDeviceToDevice, stream));
1627
- }
1628
- }
1629
-
1630
- // add event for the main device to wait on until other device is done
1631
- if (split && (id != ctx.device || is != 0)) {
1632
- CUDA_CHECK(cudaEventRecord(src0_extra->events[id][is], stream));
1633
- }
1634
- }
1635
- }
1636
- }
1637
-
1638
- // main device waits for all other devices to be finished
1639
- if (split && ggml_backend_cuda_get_device_count() > 1) {
1640
- int64_t is_max = (ne11 + MUL_MAT_SRC1_COL_STRIDE - 1) / MUL_MAT_SRC1_COL_STRIDE;
1641
- is_max = is_max <= GGML_CUDA_MAX_STREAMS ? is_max : GGML_CUDA_MAX_STREAMS;
1642
-
1643
- ggml_cuda_set_device(ctx.device);
1644
- for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
1645
- if (dev[id].row_low == dev[id].row_high) {
1646
- continue;
1647
- }
1648
- for (int64_t is = 0; is < is_max; ++is) {
1649
- CUDA_CHECK(cudaStreamWaitEvent(ctx.stream(), src0_extra->events[id][is], 0));
1650
- }
1651
- }
1652
- }
1653
- }
1654
-
1655
- static void ggml_cuda_mul_mat_vec_p021(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
1656
- GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1));
1657
- GGML_ASSERT(ggml_backend_buffer_is_cuda(src0->buffer));
1658
- GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // 0213 permutation
1659
- GGML_ASSERT(src1->nb[0] <= src1->nb[1] && src1->nb[2] <= src1->nb[3]); // 0213 permutation
1660
- GGML_ASSERT(src0->type == GGML_TYPE_F16);
1661
- GGML_ASSERT(src1->type == GGML_TYPE_F32);
1662
-
1663
- const int64_t ne00 = src0->ne[0];
1664
- const int64_t ne01 = src0->ne[1];
1665
- const int64_t ne02 = src0->ne[2];
1666
-
1667
- const int64_t ne12 = src1->ne[2];
1668
-
1669
- cudaStream_t main_stream = ctx.stream();
1670
-
1671
- void * src0_ddq = src0->data;
1672
- float * src1_ddf = (float *) src1->data;
1673
- float * dst_ddf = (float *) dst->data;
1674
-
1675
- ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12, main_stream);
1676
- }
1677
-
1678
- static void ggml_cuda_mul_mat_vec_nc(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
1679
- GGML_ASSERT(!ggml_is_transposed(src0));
1680
- GGML_ASSERT(!ggml_is_transposed(src1));
1681
- GGML_ASSERT(!ggml_is_permuted(src0));
1682
- GGML_ASSERT(ggml_backend_buffer_is_cuda(src0->buffer));
1683
- GGML_ASSERT(src0->type == GGML_TYPE_F16);
1684
- GGML_ASSERT(src1->type == GGML_TYPE_F32);
1685
-
1686
- const int64_t ne00 = src0->ne[0];
1687
- const int64_t ne01 = src0->ne[1];
1688
- const int64_t ne02 = src0->ne[2];
1689
-
1690
- const int64_t nb01 = src0->nb[1];
1691
- const int64_t nb02 = src0->nb[2];
1692
-
1693
- const int64_t ne12 = src1->ne[2];
1694
-
1695
- cudaStream_t main_stream = ctx.stream();
1696
-
1697
- void * src0_ddq = src0->data;
1698
- float * src1_ddf = (float *) src1->data;
1699
- float * dst_ddf = (float *) dst->data;
1700
-
1701
- const int64_t row_stride_x = nb01 / sizeof(half);
1702
- const int64_t channel_stride_x = nb02 / sizeof(half);
1703
-
1704
- ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, main_stream);
1705
- }
1706
-
1707
- static __global__ void k_compute_batched_ptrs(
1708
- const half * src0_as_f16, const half * src1_as_f16, char * dst,
1709
- const void ** ptrs_src, void ** ptrs_dst,
1710
- int64_t ne12, int64_t ne13,
1711
- int64_t ne23,
1712
- size_t nb02, size_t nb03,
1713
- size_t nb12, size_t nb13,
1714
- size_t nbd2, size_t nbd3,
1715
- int64_t r2, int64_t r3) {
1716
- int64_t i13 = blockIdx.x * blockDim.x + threadIdx.x;
1717
- int64_t i12 = blockIdx.y * blockDim.y + threadIdx.y;
1718
-
1719
- if (i13 >= ne13 || i12 >= ne12) {
1720
- return;
1721
- }
1722
-
1723
- int64_t i03 = i13 / r3;
1724
- int64_t i02 = i12 / r2;
1725
-
1726
- ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_as_f16 + i02*nb02 + i03*nb03;
1727
- ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_as_f16 + i12*nb12 + i13*nb13;
1728
- ptrs_dst[0*ne23 + i12 + i13*ne12] = ( char *) dst + i12*nbd2 + i13*nbd3;
1729
- }
1730
-
1731
- static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
1732
- GGML_ASSERT(!ggml_is_transposed(src0));
1733
- GGML_ASSERT(!ggml_is_transposed(src1));
1734
-
1735
- GGML_ASSERT(ggml_backend_buffer_is_cuda(src0->buffer));
1736
- GGML_ASSERT(src0->type == GGML_TYPE_F16);
1737
-
1738
- GGML_TENSOR_BINARY_OP_LOCALS
1739
-
1740
- const int64_t ne_dst = ggml_nelements(dst);
1741
-
1742
- cudaStream_t main_stream = ctx.stream();
1743
-
1744
- CUBLAS_CHECK(cublasSetStream(ctx.cublas_handle(), main_stream));
1745
-
1746
- void * src0_ddq = src0->data;
1747
- half * src0_f16 = (half *) src0_ddq;
1748
- float * src1_ddf = (float *) src1->data;
1749
- float * dst_ddf = (float *) dst->data;
1750
-
1751
- // convert src1 to fp16
1752
- ggml_cuda_pool_alloc<half> src1_f16_alloc(ctx.pool());
1753
- if (src1->type != GGML_TYPE_F16) {
1754
- const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
1755
- const int64_t ne_src1 = ggml_nelements(src1);
1756
- src1_f16_alloc.alloc(ne_src1);
1757
- GGML_ASSERT(to_fp16_cuda != nullptr);
1758
- to_fp16_cuda(src1_ddf, src1_f16_alloc.get(), ne_src1, main_stream);
1759
- }
1760
- half * src1_f16 = src1->type == GGML_TYPE_F16 ? (half *) src1_ddf : src1_f16_alloc.get();
1761
-
1762
- ggml_cuda_pool_alloc<half> dst_f16(ctx.pool());
1763
- char * dst_t;
1764
-
1765
- cublasComputeType_t cu_compute_type = CUBLAS_COMPUTE_16F;
1766
- cudaDataType_t cu_data_type = CUDA_R_16F;
1767
-
1768
- // dst strides
1769
- size_t nbd2 = dst->nb[2];
1770
- size_t nbd3 = dst->nb[3];
1771
-
1772
- const half alpha_f16 = 1.0f;
1773
- const half beta_f16 = 0.0f;
1774
-
1775
- const float alpha_f32 = 1.0f;
1776
- const float beta_f32 = 0.0f;
1777
-
1778
- const void * alpha = &alpha_f16;
1779
- const void * beta = &beta_f16;
1780
-
1781
- if (dst->op_params[0] == GGML_PREC_DEFAULT) {
1782
- dst_t = (char *) dst_f16.alloc(ne_dst);
1783
-
1784
- nbd2 /= sizeof(float) / sizeof(half);
1785
- nbd3 /= sizeof(float) / sizeof(half);
1786
- } else {
1787
- dst_t = (char *) dst_ddf;
1788
-
1789
- cu_compute_type = CUBLAS_COMPUTE_32F;
1790
- cu_data_type = CUDA_R_32F;
1791
-
1792
- alpha = &alpha_f32;
1793
- beta = &beta_f32;
1794
- }
1795
-
1796
- GGML_ASSERT(ne12 % ne02 == 0);
1797
- GGML_ASSERT(ne13 % ne03 == 0);
1798
-
1799
- // broadcast factors
1800
- const int64_t r2 = ne12/ne02;
1801
- const int64_t r3 = ne13/ne03;
1802
-
1803
- #if 0
1804
- // use cublasGemmEx
1805
- {
1806
- for (int i13 = 0; i13 < ne13; ++i13) {
1807
- for (int i12 = 0; i12 < ne12; ++i12) {
1808
- int i03 = i13 / r3;
1809
- int i02 = i12 / r2;
1810
-
1811
- CUBLAS_CHECK(
1812
- cublasGemmEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
1813
- ne01, ne11, ne10,
1814
- alpha, (const char *) src0_as_f16 + i02*src0->nb[2] + i03*src0->nb[3] , CUDA_R_16F, nb01/sizeof(half),
1815
- (const char *) src1_as_f16 + i12*src1->nb[2]/2 + i13*src1->nb[3]/2, CUDA_R_16F, nb11/sizeof(float),
1816
- beta, ( char *) dst_t + i12*nbd2 + i13*nbd3, cu_data_type, ne01,
1817
- cu_compute_type,
1818
- CUBLAS_GEMM_DEFAULT_TENSOR_OP));
1819
- }
1820
- }
1821
- }
1822
- #else
1823
- if (r2 == 1 && r3 == 1 && ggml_is_contiguous_2(src0) && ggml_is_contiguous_2(src1)) {
1824
- // there is no broadcast and src0, src1 are contiguous across dims 2, 3
1825
- // use cublasGemmStridedBatchedEx
1826
- CUBLAS_CHECK(
1827
- cublasGemmStridedBatchedEx(ctx.cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_N,
1828
- ne01, ne11, ne10,
1829
- alpha, (const char *) src0_f16, CUDA_R_16F, nb01/nb00, nb02/nb00, // strideA
1830
- (const char *) src1_f16, CUDA_R_16F, nb11/nb10, nb12/nb10, // strideB
1831
- beta, ( char *) dst_t, cu_data_type, ne01, nb2/nb0, // strideC
1832
- ne12*ne13,
1833
- cu_compute_type,
1834
- CUBLAS_GEMM_DEFAULT_TENSOR_OP));
1835
- } else {
1836
- // use cublasGemmBatchedEx
1837
- const int ne23 = ne12*ne13;
1838
-
1839
- ggml_cuda_pool_alloc<const void *> ptrs_src(ctx.pool(), 2*ne23);
1840
- ggml_cuda_pool_alloc< void *> ptrs_dst(ctx.pool(), 1*ne23);
1841
-
1842
- dim3 block_dims(ne13, ne12);
1843
- k_compute_batched_ptrs<<<1, block_dims, 0, main_stream>>>(
1844
- src0_f16, src1_f16, dst_t,
1845
- ptrs_src.get(), ptrs_dst.get(),
1846
- ne12, ne13,
1847
- ne23,
1848
- nb02, nb03,
1849
- src1->type == GGML_TYPE_F16 ? nb12 : nb12/2,
1850
- src1->type == GGML_TYPE_F16 ? nb13 : nb13/2,
1851
- nbd2, nbd3,
1852
- r2, r3);
1853
- CUDA_CHECK(cudaGetLastError());
1854
-
1855
- CUBLAS_CHECK(
1856
- cublasGemmBatchedEx(ctx.cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_N,
1857
- ne01, ne11, ne10,
1858
- alpha, (const void **) (ptrs_src.get() + 0*ne23), CUDA_R_16F, nb01/nb00,
1859
- (const void **) (ptrs_src.get() + 1*ne23), CUDA_R_16F, nb11/nb10,
1860
- beta, ( void **) (ptrs_dst.get() + 0*ne23), cu_data_type, ne01,
1861
- ne23,
1862
- cu_compute_type,
1863
- CUBLAS_GEMM_DEFAULT_TENSOR_OP));
1864
- }
1865
- #endif
1866
-
1867
- if (dst->op_params[0] == GGML_PREC_DEFAULT) {
1868
- const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
1869
- to_fp32_cuda(dst_f16.get(), dst_ddf, ne_dst, main_stream);
1870
- }
1871
- }
1872
-
1873
- static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
1874
- const bool split = ggml_backend_buffer_is_cuda_split(src0->buffer);
1875
-
1876
- int64_t min_compute_capability = INT_MAX;
1877
-
1878
- bool any_pascal_with_slow_fp16 = false;
1879
- if (split) {
1880
- ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) src0->buffer->buft->context;
1881
- auto & tensor_split = buft_ctx->tensor_split;
1882
- for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
1883
- // skip devices that are not going to do any work:
1884
- if (tensor_split[id] >= (id + 1 < ggml_backend_cuda_get_device_count() ? tensor_split[id + 1] : 1.0f)) {
1885
- continue;
1886
- }
1887
-
1888
- if (min_compute_capability > ggml_cuda_info().devices[id].cc) {
1889
- min_compute_capability = ggml_cuda_info().devices[id].cc;
1890
- }
1891
- if (ggml_cuda_info().devices[id].cc == 610) {
1892
- any_pascal_with_slow_fp16 = true;
1893
- }
1894
- }
1895
- } else {
1896
- min_compute_capability = ggml_cuda_info().devices[ctx.device].cc;
1897
- any_pascal_with_slow_fp16 = ggml_cuda_info().devices[ctx.device].cc == 610;
1898
- }
1899
-
1900
- // check data types and tensor shapes for custom matrix multiplication kernels:
1901
- bool use_dequantize_mul_mat_vec = (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16)
1902
- && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
1903
- && src0->ne[0] % GGML_CUDA_DMMV_X == 0 && src1->ne[1] == 1;
1904
-
1905
- bool use_mul_mat_vec_q = ggml_is_quantized(src0->type)
1906
- && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
1907
- && src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;
1908
-
1909
- bool use_mul_mat_q = ggml_cuda_supports_mmq(src0->type)
1910
- && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
1911
-
1912
- #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
1913
-
1914
- const bool fp16_performance_good = min_compute_capability >= CC_RDNA1;
1915
-
1916
- #ifdef CUDA_USE_TENSOR_CORES
1917
- use_mul_mat_q = use_mul_mat_q && min_compute_capability < CC_RDNA3;
1918
- #endif // CUDA_USE_TENSOR_CORES
1919
-
1920
- #else
1921
-
1922
- // fp16 performance is good on Volta or newer and on P100 (compute capability 6.0)
1923
- const bool fp16_performance_good = min_compute_capability >= CC_PASCAL && !any_pascal_with_slow_fp16;
1924
-
1925
- // mmvq and mmq need the __dp4a instruction which on NVIDIA is only available for CC >= 6.1
1926
- use_mul_mat_vec_q = use_mul_mat_vec_q && min_compute_capability >= MIN_CC_DP4A;
1927
- use_mul_mat_q = use_mul_mat_q && min_compute_capability >= MIN_CC_DP4A;
1928
-
1929
- #ifdef CUDA_USE_TENSOR_CORES
1930
- // when tensor cores are available, use them for large batch size
1931
- // ref: https://github.com/ggerganov/llama.cpp/pull/3776
1932
- use_mul_mat_q = use_mul_mat_q && (!fp16_performance_good || src1->ne[1] <= MMQ_MAX_BATCH_SIZE);
1933
- #endif // CUDA_USE_TENSOR_CORES
1934
-
1935
- #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
1936
-
1937
- // if mmvq is available it's a better choice than dmmv:
1938
- #ifndef GGML_CUDA_FORCE_DMMV
1939
- use_dequantize_mul_mat_vec = use_dequantize_mul_mat_vec && !use_mul_mat_vec_q;
1940
- #endif // GGML_CUDA_FORCE_DMMV
1941
-
1942
- // debug helpers
1943
- //printf("src0: %8d %8d %8d %8d\n", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]);
1944
- //printf(" %8d %8d %8d %8d\n", src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3]);
1945
- //printf("src1: %8d %8d %8d %8d\n", src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3]);
1946
- //printf(" %8d %8d %8d %8d\n", src1->nb[0], src1->nb[1], src1->nb[2], src1->nb[3]);
1947
- //printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
1948
- //printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);
1949
-
1950
- if (!split && !fp16_performance_good && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
1951
- // KQ single-batch
1952
- ggml_cuda_mul_mat_vec_p021(ctx, src0, src1, dst);
1953
- } else if (!split && !fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
1954
- // KQV single-batch
1955
- ggml_cuda_mul_mat_vec_nc(ctx, src0, src1, dst);
1956
- } else if (!split && src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || fp16_performance_good) && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
1957
- // KQ + KQV multi-batch
1958
- ggml_cuda_mul_mat_batched_cublas(ctx, src0, src1, dst);
1959
- } else if (use_dequantize_mul_mat_vec) {
1960
- ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, nullptr);
1961
- } else if (use_mul_mat_vec_q) {
1962
- ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, quantize_row_q8_1_cuda);
1963
- } else if (use_mul_mat_q) {
1964
- ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_q, quantize_mmq_q8_1_cuda);
1965
- } else {
1966
- ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_cublas, nullptr);
1967
- }
1968
- }
1969
-
1970
- struct mmid_row_mapping {
1971
- int32_t i1;
1972
- int32_t i2;
1973
- };
1974
-
1975
- static __global__ void k_copy_src1_to_contiguous(const char * __restrict__ src1_original, char * __restrict__ src1_contiguous,
1976
- int * __restrict__ cur_src1_row, mmid_row_mapping * __restrict__ row_mapping,
1977
- const char * __restrict ids, int64_t i02, size_t ids_nb1, size_t ids_nb0,
1978
- int64_t ne11, int64_t ne10,
1979
- size_t nb11, size_t nb12) {
1980
- int32_t iid1 = blockIdx.x;
1981
- int32_t id = blockIdx.y;
1982
-
1983
- const int32_t row_id_i = *(const int32_t *) (ids + iid1*ids_nb1 + id*ids_nb0);
1984
-
1985
- if (row_id_i != i02) {
1986
- return;
1987
- }
1988
-
1989
- const int64_t i11 = id % ne11;
1990
- const int64_t i12 = iid1;
1991
-
1992
- __shared__ int src1_row;
1993
- if (threadIdx.x == 0) {
1994
- src1_row = atomicAdd(cur_src1_row, 1);
1995
- row_mapping[src1_row] = {id, iid1};
1996
- }
1997
- __syncthreads();
1998
-
1999
- const float * src1_row_original = (const float *)(src1_original + i11*nb11 + i12*nb12);
2000
- float * src1_row_contiguous = (float *)(src1_contiguous + src1_row*nb11);
2001
-
2002
- for (int i = threadIdx.x; i < ne10; i += blockDim.x) {
2003
- src1_row_contiguous[i] = src1_row_original[i];
2004
- }
2005
- }
2006
-
2007
- static __global__ void k_copy_dst_from_contiguous(char * __restrict__ dst_original, const char * __restrict__ dst_contiguous,
2008
- const mmid_row_mapping * __restrict__ row_mapping,
2009
- int64_t ne0,
2010
- size_t nb1, size_t nb2) {
2011
- int32_t i = blockIdx.x;
2012
-
2013
- const int32_t i1 = row_mapping[i].i1;
2014
- const int32_t i2 = row_mapping[i].i2;
2015
-
2016
- const float * dst_row_contiguous = (const float *)(dst_contiguous + i*nb1);
2017
- float * dst_row_original = (float *)(dst_original + i1*nb1 + i2*nb2);
2018
-
2019
- for (int j = threadIdx.x; j < ne0; j += blockDim.x) {
2020
- dst_row_original[j] = dst_row_contiguous[j];
2021
- }
2022
- }
2023
-
2024
- static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
2025
- const ggml_tensor * src0 = dst->src[0];
2026
- const ggml_tensor * src1 = dst->src[1];
2027
- const ggml_tensor * ids = dst->src[2];
2028
-
2029
- GGML_TENSOR_BINARY_OP_LOCALS
2030
-
2031
- GGML_ASSERT(!ggml_backend_buffer_is_cuda_split(src0->buffer) && "mul_mat_id does not support split buffers");
2032
-
2033
- cudaStream_t stream = ctx.stream();
2034
-
2035
- const int64_t n_as = ne02;
2036
- const int64_t n_ids = ids->ne[0];
2037
-
2038
- std::vector<char> ids_host(ggml_nbytes(ids));
2039
- const char * ids_dev = (const char *) ids->data;
2040
- CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids_dev, ggml_nbytes(ids), cudaMemcpyDeviceToHost, stream));
2041
- CUDA_CHECK(cudaStreamSynchronize(stream));
2042
-
2043
- ggml_tensor src0_row = *src0;
2044
- ggml_tensor src1_row = *src1;
2045
- ggml_tensor dst_row = *dst;
2046
-
2047
- char * src0_original = (char *) src0->data;
2048
- char * src1_original = (char *) src1->data;
2049
- char * dst_original = (char *) dst->data;
2050
-
2051
- src0_row.ne[2] = 1;
2052
- src0_row.ne[3] = 1;
2053
- src0_row.nb[3] = nb02;
2054
-
2055
- src1_row.ne[1] = 1;
2056
- src1_row.ne[2] = 1;
2057
- src1_row.ne[3] = 1;
2058
- src1_row.nb[2] = nb11;
2059
- src1_row.nb[3] = nb11;
2060
-
2061
- dst_row.ne[1] = 1;
2062
- dst_row.ne[2] = 1;
2063
- dst_row.ne[3] = 1;
2064
- dst_row.nb[2] = nb1;
2065
- dst_row.nb[3] = nb1;
2066
-
2067
- if (ne12 == 1) {
2068
- for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) {
2069
- for (int64_t id = 0; id < n_ids; id++) {
2070
- const int32_t i02 = *(const int32_t *) (ids_host.data() + iid1*ids->nb[1] + id*ids->nb[0]);
2071
-
2072
- GGML_ASSERT(i02 >= 0 && i02 < n_as);
2073
-
2074
- const int64_t i11 = id % ne11;
2075
- const int64_t i12 = iid1;
2076
-
2077
- const int64_t i1 = id;
2078
- const int64_t i2 = i12;
2079
-
2080
- src0_row.data = src0_original + i02*nb02;
2081
- src1_row.data = src1_original + i11*nb11 + i12*nb12;
2082
- dst_row.data = dst_original + i1*nb1 + i2*nb2;
2083
-
2084
- ggml_cuda_mul_mat(ctx, &src0_row, &src1_row, &dst_row);
2085
- }
2086
- }
2087
- } else {
2088
- ggml_cuda_pool_alloc<char> src1_contiguous(ctx.pool(), sizeof(float)*ggml_nelements(src1));
2089
- ggml_cuda_pool_alloc<char> dst_contiguous(ctx.pool(), sizeof(float)*ggml_nelements(dst));
2090
-
2091
- src1_row.data = src1_contiguous.get();
2092
- dst_row.data = dst_contiguous.get();
2093
-
2094
- for (int64_t i02 = 0; i02 < n_as; i02++) {
2095
- int64_t num_src1_rows = 0;
2096
-
2097
- for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) {
2098
- for (int64_t id = 0; id < n_ids; id++) {
2099
- const int32_t row_id_i = *(const int32_t *) (ids_host.data() + iid1*ids->nb[1] + id*ids->nb[0]);
2100
-
2101
- GGML_ASSERT(row_id_i >= 0 && row_id_i < n_as);
2102
-
2103
- if (row_id_i != i02) {
2104
- continue;
2105
- }
2106
-
2107
- num_src1_rows++;
2108
- }
2109
- }
2110
-
2111
- if (num_src1_rows == 0) {
2112
- continue;
2113
- }
2114
-
2115
- ggml_cuda_pool_alloc<int> dev_cur_src1_row(ctx.pool(), 1);
2116
- ggml_cuda_pool_alloc<mmid_row_mapping> dev_row_mapping(ctx.pool(), num_src1_rows);
2117
- CUDA_CHECK(cudaMemsetAsync(dev_cur_src1_row.get(), 0, sizeof(int), stream));
2118
-
2119
- {
2120
- dim3 block_dims(std::min((unsigned int)ne10, 768u));
2121
- dim3 grid_dims(ids->ne[1], n_ids);
2122
- k_copy_src1_to_contiguous<<<grid_dims, block_dims, 0, stream>>>(
2123
- src1_original, src1_contiguous.get(),
2124
- dev_cur_src1_row.get(), dev_row_mapping.get(),
2125
- ids_dev, i02, ids->nb[1], ids->nb[0],
2126
- ne11, ne10,
2127
- nb11, nb12);
2128
- CUDA_CHECK(cudaGetLastError());
2129
- }
2130
-
2131
- src0_row.data = src0_original + i02*nb02;
2132
-
2133
- GGML_ASSERT(nb11 == sizeof(float)*ne10);
2134
- GGML_ASSERT(nb1 == sizeof(float)*ne0);
2135
-
2136
- src1_row.ne[1] = num_src1_rows;
2137
- src1_row.nb[1] = nb11;
2138
- src1_row.nb[2] = num_src1_rows*nb11;
2139
- src1_row.nb[3] = num_src1_rows*nb11;
2140
-
2141
- dst_row.ne[1] = num_src1_rows;
2142
- dst_row.nb[1] = nb1;
2143
- dst_row.nb[2] = num_src1_rows*nb1;
2144
- dst_row.nb[3] = num_src1_rows*nb1;
2145
-
2146
- ggml_cuda_mul_mat(ctx, &src0_row, &src1_row, &dst_row);
2147
-
2148
- {
2149
- dim3 block_dims(std::min((unsigned int)ne0, 768u));
2150
- dim3 grid_dims(num_src1_rows);
2151
- k_copy_dst_from_contiguous<<<grid_dims, block_dims, 0, stream>>>(
2152
- dst_original, dst_contiguous.get(),
2153
- dev_row_mapping.get(),
2154
- ne0,
2155
- nb1, nb2);
2156
- CUDA_CHECK(cudaGetLastError());
2157
- }
2158
- }
2159
- }
2160
- }
2161
-
2162
- static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct ggml_tensor * dst) {
2163
- // why is this here instead of mul_mat?
2164
- if (dst->src[0] != nullptr && ggml_backend_buffer_is_cuda_split(dst->src[0]->buffer)) {
2165
- ggml_cuda_set_peer_access(dst->src[1]->ne[1], ctx.device);
2166
- }
2167
-
2168
- switch (dst->op) {
2169
- case GGML_OP_REPEAT:
2170
- ggml_cuda_op_repeat(ctx, dst);
2171
- break;
2172
- case GGML_OP_GET_ROWS:
2173
- ggml_cuda_op_get_rows(ctx, dst);
2174
- break;
2175
- case GGML_OP_DUP:
2176
- ggml_cuda_dup(ctx, dst);
2177
- break;
2178
- case GGML_OP_CPY:
2179
- ggml_cuda_cpy(ctx, dst->src[0], dst->src[1]);
2180
- break;
2181
- case GGML_OP_CONT:
2182
- ggml_cuda_dup(ctx, dst);
2183
- break;
2184
- case GGML_OP_ADD:
2185
- ggml_cuda_op_add(ctx, dst);
2186
- break;
2187
- case GGML_OP_ACC:
2188
- ggml_cuda_op_acc(ctx, dst);
2189
- break;
2190
- case GGML_OP_MUL:
2191
- ggml_cuda_op_mul(ctx, dst);
2192
- break;
2193
- case GGML_OP_DIV:
2194
- ggml_cuda_op_div(ctx, dst);
2195
- break;
2196
- case GGML_OP_UNARY:
2197
- switch (ggml_get_unary_op(dst)) {
2198
- case GGML_UNARY_OP_GELU:
2199
- ggml_cuda_op_gelu(ctx, dst);
2200
- break;
2201
- case GGML_UNARY_OP_SILU:
2202
- ggml_cuda_op_silu(ctx, dst);
2203
- break;
2204
- case GGML_UNARY_OP_GELU_QUICK:
2205
- ggml_cuda_op_gelu_quick(ctx, dst);
2206
- break;
2207
- case GGML_UNARY_OP_TANH:
2208
- ggml_cuda_op_tanh(ctx, dst);
2209
- break;
2210
- case GGML_UNARY_OP_RELU:
2211
- ggml_cuda_op_relu(ctx, dst);
2212
- break;
2213
- case GGML_UNARY_OP_SIGMOID:
2214
- ggml_cuda_op_sigmoid(ctx, dst);
2215
- break;
2216
- case GGML_UNARY_OP_HARDSIGMOID:
2217
- ggml_cuda_op_hardsigmoid(ctx, dst);
2218
- break;
2219
- case GGML_UNARY_OP_HARDSWISH:
2220
- ggml_cuda_op_hardswish(ctx, dst);
2221
- break;
2222
- default:
2223
- return false;
2224
- }
2225
- break;
2226
- case GGML_OP_NORM:
2227
- ggml_cuda_op_norm(ctx, dst);
2228
- break;
2229
- case GGML_OP_GROUP_NORM:
2230
- ggml_cuda_op_group_norm(ctx, dst);
2231
- break;
2232
- case GGML_OP_CONCAT:
2233
- ggml_cuda_op_concat(ctx, dst);
2234
- break;
2235
- case GGML_OP_UPSCALE:
2236
- ggml_cuda_op_upscale(ctx, dst);
2237
- break;
2238
- case GGML_OP_PAD:
2239
- ggml_cuda_op_pad(ctx, dst);
2240
- break;
2241
- case GGML_OP_ARANGE:
2242
- ggml_cuda_op_arange(ctx, dst);
2243
- break;
2244
- case GGML_OP_TIMESTEP_EMBEDDING:
2245
- ggml_cuda_op_timestep_embedding(ctx, dst);
2246
- break;
2247
- case GGML_OP_LEAKY_RELU:
2248
- ggml_cuda_op_leaky_relu(ctx, dst);
2249
- break;
2250
- case GGML_OP_RMS_NORM:
2251
- ggml_cuda_op_rms_norm(ctx, dst);
2252
- break;
2253
- case GGML_OP_MUL_MAT:
2254
- if (dst->src[0]->ne[3] != dst->src[1]->ne[3]) {
2255
- GGML_CUDA_LOG_ERROR("%s: cannot compute %s: src0->ne[3] = %" PRId64 ", src1->ne[3] = %" PRId64 " - fallback to CPU\n", __func__, dst->name, dst->src[0]->ne[3], dst->src[1]->ne[3]);
2256
- return false;
2257
- } else {
2258
- ggml_cuda_mul_mat(ctx, dst->src[0], dst->src[1], dst);
2259
- }
2260
- break;
2261
- case GGML_OP_MUL_MAT_ID:
2262
- ggml_cuda_mul_mat_id(ctx, dst);
2263
- break;
2264
- case GGML_OP_SCALE:
2265
- ggml_cuda_op_scale(ctx, dst);
2266
- break;
2267
- case GGML_OP_SQR:
2268
- ggml_cuda_op_sqr(ctx, dst);
2269
- break;
2270
- case GGML_OP_CLAMP:
2271
- ggml_cuda_op_clamp(ctx, dst);
2272
- break;
2273
- case GGML_OP_NONE:
2274
- case GGML_OP_RESHAPE:
2275
- case GGML_OP_VIEW:
2276
- case GGML_OP_PERMUTE:
2277
- case GGML_OP_TRANSPOSE:
2278
- break;
2279
- case GGML_OP_DIAG_MASK_INF:
2280
- ggml_cuda_op_diag_mask_inf(ctx, dst);
2281
- break;
2282
- case GGML_OP_SOFT_MAX:
2283
- ggml_cuda_op_soft_max(ctx, dst);
2284
- break;
2285
- case GGML_OP_ROPE:
2286
- ggml_cuda_op_rope(ctx, dst);
2287
- break;
2288
- case GGML_OP_IM2COL:
2289
- ggml_cuda_op_im2col(ctx, dst);
2290
- break;
2291
- case GGML_OP_POOL_2D:
2292
- ggml_cuda_op_pool2d(ctx, dst);
2293
- break;
2294
- case GGML_OP_SUM_ROWS:
2295
- ggml_cuda_op_sum_rows(ctx, dst);
2296
- break;
2297
- case GGML_OP_ARGSORT:
2298
- ggml_cuda_op_argsort(ctx, dst);
2299
- break;
2300
- case GGML_OP_FLASH_ATTN_EXT:
2301
- ggml_cuda_flash_attn_ext(ctx, dst);
2302
- break;
2303
- default:
2304
- return false;
2305
- }
2306
-
2307
- cudaError_t err = cudaGetLastError();
2308
- if (err != cudaSuccess) {
2309
- GGML_CUDA_LOG_ERROR("%s: %s failed\n", __func__, ggml_op_desc(dst));
2310
- CUDA_CHECK(err);
2311
- }
2312
-
2313
- return true;
2314
- }
2315
-
2316
- ////////////////////////////////////////////////////////////////////////////////
2317
-
2318
- // backend
2319
-
2320
- GGML_CALL static const char * ggml_backend_cuda_name(ggml_backend_t backend) {
2321
- ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
2322
-
2323
- return cuda_ctx->name.c_str();
2324
- }
2325
-
2326
- GGML_CALL static void ggml_backend_cuda_free(ggml_backend_t backend) {
2327
- ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
2328
-
2329
- delete cuda_ctx;
2330
- delete backend;
2331
- }
2332
-
2333
- GGML_CALL static ggml_backend_buffer_type_t ggml_backend_cuda_get_default_buffer_type(ggml_backend_t backend) {
2334
- ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
2335
-
2336
- return ggml_backend_cuda_buffer_type(cuda_ctx->device);
2337
- }
2338
-
2339
- GGML_CALL static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
2340
- ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
2341
- ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
2342
-
2343
- GGML_ASSERT(buf->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
2344
-
2345
- CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, cuda_ctx->stream()));
2346
- }
2347
-
2348
- GGML_CALL static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
2349
- ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
2350
- ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
2351
-
2352
- GGML_ASSERT(buf->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
2353
-
2354
- CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, cuda_ctx->stream()));
2355
- }
2356
-
2357
- GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, const ggml_tensor * src, ggml_tensor * dst) {
2358
- GGML_ASSERT(ggml_backend_is_cuda(backend_src) || ggml_backend_is_cuda(backend_dst));
2359
-
2360
- ggml_backend_buffer_t buf_src = src->view_src ? src->view_src->buffer : src->buffer;
2361
- ggml_backend_buffer_t buf_dst = dst->view_src ? dst->view_src->buffer : dst->buffer;
2362
-
2363
- if (!ggml_backend_buffer_is_cuda(src->buffer)) {
2364
- return false;
2365
- }
2366
-
2367
- if (!ggml_backend_buffer_is_cuda(dst->buffer)) {
2368
- return false;
2369
- }
2370
-
2371
- // device -> device
2372
- ggml_backend_cuda_context * cuda_ctx_src = (ggml_backend_cuda_context *)backend_src->context;
2373
- ggml_backend_cuda_context * cuda_ctx_dst = (ggml_backend_cuda_context *)backend_dst->context;
2374
-
2375
- if (backend_src != backend_dst) {
2376
- ggml_backend_cuda_buffer_context * buf_ctx_src = (ggml_backend_cuda_buffer_context *)buf_src->context;
2377
- ggml_backend_cuda_buffer_context * buf_ctx_dst = (ggml_backend_cuda_buffer_context *)buf_dst->context;
2378
-
2379
- GGML_ASSERT(cuda_ctx_src->device == buf_ctx_src->device);
2380
- GGML_ASSERT(cuda_ctx_dst->device == buf_ctx_dst->device);
2381
-
2382
- // copy on src stream
2383
- if (cuda_ctx_src->device == cuda_ctx_dst->device) {
2384
- CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_dst->stream()));
2385
- } else {
2386
- #ifdef GGML_CUDA_NO_PEER_COPY
2387
- return false;
2388
- #else
2389
- CUDA_CHECK(cudaMemcpyPeerAsync(dst->data, cuda_ctx_dst->device, src->data, cuda_ctx_src->device, ggml_nbytes(dst), cuda_ctx_src->stream()));
2390
- #endif
2391
- }
2392
-
2393
- // record event on src stream
2394
- if (!cuda_ctx_src->copy_event) {
2395
- ggml_cuda_set_device(cuda_ctx_src->device);
2396
- CUDA_CHECK(cudaEventCreateWithFlags(&cuda_ctx_src->copy_event, cudaEventDisableTiming));
2397
- }
2398
-
2399
- CUDA_CHECK(cudaEventRecord(cuda_ctx_src->copy_event, cuda_ctx_src->stream()));
2400
-
2401
- // wait on dst stream for the copy to complete
2402
- CUDA_CHECK(cudaStreamWaitEvent(cuda_ctx_dst->stream(), cuda_ctx_src->copy_event, 0));
2403
- } else {
2404
- // src and dst are on the same backend
2405
- CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_dst->stream()));
2406
- }
2407
- return true;
2408
- }
2409
-
2410
- GGML_CALL static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
2411
- ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
2412
-
2413
- CUDA_CHECK(cudaStreamSynchronize(cuda_ctx->stream()));
2414
-
2415
- GGML_UNUSED(backend);
2416
- }
2417
-
2418
- static void set_ggml_graph_node_properties(ggml_tensor * node, ggml_graph_node_properties * graph_node_properties) {
2419
- graph_node_properties->node_address = node->data;
2420
- graph_node_properties->node_op = node->op;
2421
- for (int i = 0; i < GGML_MAX_DIMS; i++) {
2422
- graph_node_properties->ne[i] = node->ne[i];
2423
- graph_node_properties->nb[i] = node->nb[i];
2424
- }
2425
- for (int i = 0; i < GGML_MAX_SRC; i++) {
2426
- graph_node_properties->src_address[i] = node->src[i] ? node->src[i]->data : nullptr;
2427
- }
2428
- }
2429
-
2430
- static bool ggml_graph_node_has_matching_properties(ggml_tensor * node, ggml_graph_node_properties * graph_node_properties) {
2431
- if (node->data != graph_node_properties->node_address &&
2432
- node->op != GGML_OP_CPY &&
2433
- node->op != GGML_OP_VIEW) {
2434
- return false;
2435
- }
2436
-
2437
- if (node->op != graph_node_properties->node_op) {
2438
- return false;
2439
- }
2440
-
2441
- for (int i = 0; i < GGML_MAX_DIMS; i++) {
2442
- if (node->ne[i] != graph_node_properties->ne[i]) {
2443
- return false;
2444
- }
2445
- if (node->nb[i] != graph_node_properties->nb[i]) {
2446
- return false;
2447
- }
2448
- }
2449
-
2450
- for (int i = 0; i < GGML_MAX_SRC; i++) {
2451
- if (node->src[i] &&
2452
- node->src[i]->data != graph_node_properties->src_address[i] &&
2453
- node->op != GGML_OP_CPY &&
2454
- node->op != GGML_OP_VIEW
2455
- ) {
2456
- return false;
2457
- }
2458
- }
2459
- return true;
2460
- }
2461
-
2462
- GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
2463
- ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
2464
-
2465
- ggml_cuda_set_device(cuda_ctx->device);
2466
-
2467
- #ifdef USE_CUDA_GRAPH
2468
- static const bool disable_cuda_graphs_due_to_env = (getenv("GGML_CUDA_DISABLE_GRAPHS") != nullptr);
2469
-
2470
- // Objects required for CUDA Graph
2471
- if (cuda_ctx->cuda_graph == nullptr) {
2472
- cuda_ctx->cuda_graph.reset(new ggml_cuda_graph());
2473
- }
2474
-
2475
- bool use_cuda_graph = true;
2476
- bool cuda_graph_update_required = false;
2477
- // vector of pointers to CUDA cpy kernels, which are required to identify
2478
- // kernel parameters which need updated in the graph for each token
2479
- std::vector<void *> ggml_cuda_cpy_fn_ptrs;
2480
-
2481
- if (cuda_ctx->cuda_graph->graph == nullptr) {
2482
- if (ggml_cuda_info().devices[cuda_ctx->device].cc < CC_AMPERE) {
2483
- cuda_ctx->cuda_graph->disable_due_to_gpu_arch = true;
2484
- #ifndef NDEBUG
2485
- GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to GPU architecture\n", __func__);
2486
- #endif
2487
- }
2488
- }
2489
-
2490
- // Disable CUDA graphs in presence of env var, old GPU, use-case which is changing too rapidly,
2491
- // or previous graph capture failure.
2492
- // Also disable for multi-gpu for now. TO DO investigate
2493
- if (disable_cuda_graphs_due_to_env
2494
- || cuda_ctx->cuda_graph->disable_due_to_gpu_arch
2495
- || cuda_ctx->cuda_graph->disable_due_to_too_many_updates
2496
- || cuda_ctx->cuda_graph->disable_due_to_failed_graph_capture) {
2497
- use_cuda_graph = false;
2498
- }
2499
-
2500
- if (use_cuda_graph) {
2501
- if (cuda_ctx->cuda_graph->instance == nullptr) {
2502
- cuda_graph_update_required = true;
2503
- }
2504
-
2505
- // Check if the graph size has changed
2506
- if (cuda_ctx->cuda_graph->ggml_graph_properties.size() != (size_t)cgraph->n_nodes) {
2507
- cuda_graph_update_required = true;
2508
- cuda_ctx->cuda_graph->ggml_graph_properties.resize(cgraph->n_nodes);
2509
- }
2510
-
2511
- // Loop over nodes in GGML graph to determine if CUDA graph update is required
2512
- // and store properties to allow this comparison for the next token
2513
- for (int i = 0; i < cgraph->n_nodes; i++) {
2514
- bool has_matching_properties = true;
2515
- if (!cuda_graph_update_required) {
2516
- has_matching_properties = ggml_graph_node_has_matching_properties(cgraph->nodes[i], &cuda_ctx->cuda_graph->ggml_graph_properties[i]);
2517
- }
2518
- if (!has_matching_properties) {
2519
- cuda_graph_update_required = true;
2520
- }
2521
- set_ggml_graph_node_properties(cgraph->nodes[i], &cuda_ctx->cuda_graph->ggml_graph_properties[i]);
2522
- }
2523
-
2524
- // Loop over nodes in GGML graph to obtain info needed for CUDA graph
2525
- cuda_ctx->cuda_graph->updated_kernel_arg.clear();
2526
- for (int i = 0; i < cgraph->n_nodes; i++) {
2527
- ggml_tensor * node = cgraph->nodes[i];
2528
-
2529
- if (node->src[0] && ggml_backend_buffer_is_cuda_split(node->src[0]->buffer)) {
2530
- use_cuda_graph = false; // Split buffers are not supported by CUDA graph capture
2531
- #ifndef NDEBUG
2532
- GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to split buffer\n", __func__);
2533
- #endif
2534
- }
2535
-
2536
- if (node->op == GGML_OP_MUL_MAT_ID) {
2537
- use_cuda_graph = false; // This node type is not supported by CUDA graph capture
2538
- #ifndef NDEBUG
2539
- GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to mul_mat_id\n", __func__);
2540
- #endif
2541
- }
2542
-
2543
- if (node->op == GGML_OP_ADD && node->src[1] && node->src[1]->ne[1] > 1) {
2544
- // disable CUDA graphs for batch size > 1 for now.
2545
- // Changes in batch size or context size can cause changes to the grid size of some kernels.
2546
- use_cuda_graph = false;
2547
- #ifndef NDEBUG
2548
- GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]);
2549
- #endif
2550
- }
2551
-
2552
- if (node->op == GGML_OP_CPY) {
2553
- // store the copy op parameter which changes with each token.
2554
- cuda_ctx->cuda_graph->updated_kernel_arg.push_back((char **) &(node->src[1]->data));
2555
- // store a pointer to each copy op CUDA kernel to identify it later
2556
- void * ptr = ggml_cuda_cpy_fn(node->src[0], node->src[1]);
2557
- if (std::find(ggml_cuda_cpy_fn_ptrs.begin(), ggml_cuda_cpy_fn_ptrs.end(), ptr) == ggml_cuda_cpy_fn_ptrs.end()) {
2558
- ggml_cuda_cpy_fn_ptrs.push_back(ptr);
2559
- }
2560
- }
2561
-
2562
- if (!use_cuda_graph) {
2563
- break;
2564
- }
2565
- }
2566
-
2567
- // Disable CUDA graphs (from the next token) if the use-case is demanding too many consecutive graph updates.
2568
- if (use_cuda_graph && cuda_graph_update_required) {
2569
- cuda_ctx->cuda_graph->number_consecutive_updates++;
2570
- } else {
2571
- cuda_ctx->cuda_graph->number_consecutive_updates = 0;
2572
- }
2573
-
2574
- if (cuda_ctx->cuda_graph->number_consecutive_updates >= 4) {
2575
- cuda_ctx->cuda_graph->disable_due_to_too_many_updates = true;
2576
- #ifndef NDEBUG
2577
- GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to too many consecutive updates\n", __func__);
2578
- #endif
2579
- }
2580
- }
2581
-
2582
- if (use_cuda_graph && cuda_graph_update_required) { // Start CUDA graph capture
2583
- CUDA_CHECK(cudaStreamBeginCapture(cuda_ctx->stream(), cudaStreamCaptureModeRelaxed));
2584
- }
2585
-
2586
- #else
2587
- bool use_cuda_graph = false;
2588
- bool cuda_graph_update_required = false;
2589
- #endif // USE_CUDA_GRAPH
2590
-
2591
- bool graph_evaluated_or_captured = false;
2592
-
2593
- while (!graph_evaluated_or_captured) {
2594
- // Only perform the graph execution if CUDA graphs are not enabled, or we are capturing the graph.
2595
- // With the use of CUDA graphs, the execution will be performed by the graph launch.
2596
- if (!use_cuda_graph || cuda_graph_update_required) {
2597
- for (int i = 0; i < cgraph->n_nodes; i++) {
2598
- ggml_tensor * node = cgraph->nodes[i];
2599
-
2600
- if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
2601
- continue;
2602
- }
2603
-
2604
- #ifndef NDEBUG
2605
- assert(node->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
2606
- for (int j = 0; j < GGML_MAX_SRC; j++) {
2607
- if (node->src[j] != nullptr) {
2608
- assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) || ggml_backend_buffer_is_cuda_split(node->src[j]->buffer));
2609
- }
2610
- }
2611
- #endif
2612
-
2613
- bool ok = ggml_cuda_compute_forward(*cuda_ctx, node);
2614
- if (!ok) {
2615
- GGML_CUDA_LOG_ERROR("%s: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
2616
- }
2617
- GGML_ASSERT(ok);
2618
- }
2619
- }
2620
-
2621
- #ifdef USE_CUDA_GRAPH
2622
- if (use_cuda_graph && cuda_graph_update_required) { // End CUDA graph capture
2623
- if (cuda_ctx->cuda_graph->graph != nullptr) {
2624
- CUDA_CHECK(cudaGraphDestroy(cuda_ctx->cuda_graph->graph));
2625
- cuda_ctx->cuda_graph->graph = nullptr;
2626
- }
2627
- CUDA_CHECK(cudaStreamEndCapture(cuda_ctx->stream(), &cuda_ctx->cuda_graph->graph));
2628
-
2629
- #if 0
2630
- if (disable_cuda_graphs_due_to_failed_capture) {
2631
- use_cuda_graph = false;
2632
- cuda_ctx->cuda_graph->disable_due_to_failed_graph_capture = true;
2633
- #ifndef NDEBUG
2634
- GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to failed graph capture\n", __func__);
2635
- #endif
2636
- } else {
2637
- graph_evaluated_or_captured = true; // CUDA graph has been captured
2638
- }
2639
- #endif
2640
- graph_evaluated_or_captured = true; // CUDA graph has been captured
2641
- } else {
2642
- graph_evaluated_or_captured = true; // ggml graph has been directly evaluated
2643
- }
2644
- }
2645
-
2646
- if (use_cuda_graph) {
2647
- if (cuda_ctx->cuda_graph->instance == nullptr) { // Create executable graph from captured graph.
2648
- CUDA_CHECK(cudaGraphInstantiate(&cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, NULL, NULL, 0));
2649
- }
2650
-
2651
- // Perform update to graph (if required for this token), and change copy parameter (required for every token)
2652
-
2653
- if (cuda_graph_update_required) {
2654
- // Extract nodes from graph
2655
- // First call with null argument gets number of nodes in graph
2656
- CUDA_CHECK(cudaGraphGetNodes(cuda_ctx->cuda_graph->graph, nullptr, &cuda_ctx->cuda_graph->num_nodes));
2657
- // Subsequent call with non-null argument gets nodes
2658
- cuda_ctx->cuda_graph->nodes.resize(cuda_ctx->cuda_graph->num_nodes);
2659
- cuda_ctx->cuda_graph->params.resize(cuda_ctx->cuda_graph->num_nodes);
2660
- if (cuda_ctx->cuda_graph->num_nodes > 0) {
2661
- CUDA_CHECK(cudaGraphGetNodes(cuda_ctx->cuda_graph->graph, cuda_ctx->cuda_graph->nodes.data(), &cuda_ctx->cuda_graph->num_nodes));
2662
-
2663
- // Loop over nodes, and extract kernel parameters from each node
2664
- for (size_t i = 0; i < cuda_ctx->cuda_graph->num_nodes; i++) {
2665
- cudaGraphNodeType node_type;
2666
- CUDA_CHECK(cudaGraphNodeGetType(cuda_ctx->cuda_graph->nodes[i], &node_type));
2667
- if (node_type == cudaGraphNodeTypeKernel) {
2668
- cudaError_t stat = cudaGraphKernelNodeGetParams(cuda_ctx->cuda_graph->nodes[i], &cuda_ctx->cuda_graph->params[i]); // Get params using runtime
2669
- if (stat == cudaErrorInvalidDeviceFunction) {
2670
- // Fails due to incorrect handling by CUDA runtime of CUDA BLAS node.
2671
- // We don't need to update blas nodes, so clear error and move on.
2672
- cudaGetLastError();
2673
- } else {
2674
- GGML_ASSERT(stat == cudaSuccess);
2675
- }
2676
- }
2677
- }
2678
- }
2679
- }
2680
-
2681
- // One of the arguments to the copy kernel is updated for each token, hence we need to
2682
- // replace that argument with the updated value in the CUDA graph
2683
- if (!cuda_graph_update_required) { // on update steps, the live parameters will already be captured
2684
- int k = 0;
2685
- for (size_t i = 0; i < cuda_ctx->cuda_graph->num_nodes; i++) {
2686
- if(count(ggml_cuda_cpy_fn_ptrs.begin(), ggml_cuda_cpy_fn_ptrs.end(), cuda_ctx->cuda_graph->params[i].func) > 0) {
2687
- char ** updated_kernel_arg_ptr = cuda_ctx->cuda_graph->updated_kernel_arg.at(k++);
2688
- cuda_ctx->cuda_graph->params[i].kernelParams[1] = updated_kernel_arg_ptr;
2689
- CUDA_CHECK(cudaGraphKernelNodeSetParams(cuda_ctx->cuda_graph->nodes[i], &cuda_ctx->cuda_graph->params[i]));
2690
- }
2691
- }
2692
- }
2693
-
2694
- // Update graph executable
2695
- cudaGraphExecUpdateResultInfo result_info;
2696
- cudaError_t stat = cudaGraphExecUpdate(cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, &result_info);
2697
- if (stat == cudaErrorGraphExecUpdateFailure) {
2698
- #ifndef NDEBUG
2699
- GGML_CUDA_LOG_ERROR("%s: CUDA graph update failed\n", __func__);
2700
- #endif
2701
- // The pre-existing graph exec cannot be updated due to violated constraints
2702
- // so instead clear error and re-instantiate
2703
- cudaGetLastError();
2704
- CUDA_CHECK(cudaGraphExecDestroy(cuda_ctx->cuda_graph->instance));
2705
- cuda_ctx->cuda_graph->instance = nullptr;
2706
- CUDA_CHECK(cudaGraphInstantiate(&cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, NULL, NULL, 0));
2707
- } else {
2708
- GGML_ASSERT(stat == cudaSuccess);
2709
- }
2710
- // Launch graph
2711
- CUDA_CHECK(cudaGraphLaunch(cuda_ctx->cuda_graph->instance, cuda_ctx->stream()));
2712
- #else
2713
- graph_evaluated_or_captured = true;
2714
- #endif // USE_CUDA_GRAPH
2715
- }
2716
-
2717
- return GGML_STATUS_SUCCESS;
2718
- }
2719
-
2720
- GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
2721
- ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) backend->context;
2722
- switch (op->op) {
2723
- case GGML_OP_UNARY:
2724
- switch (ggml_get_unary_op(op)) {
2725
- case GGML_UNARY_OP_GELU:
2726
- case GGML_UNARY_OP_SILU:
2727
- case GGML_UNARY_OP_RELU:
2728
- case GGML_UNARY_OP_SIGMOID:
2729
- case GGML_UNARY_OP_HARDSIGMOID:
2730
- case GGML_UNARY_OP_HARDSWISH:
2731
- case GGML_UNARY_OP_GELU_QUICK:
2732
- case GGML_UNARY_OP_TANH:
2733
- return ggml_is_contiguous(op->src[0]);
2734
- default:
2735
- return false;
2736
- }
2737
- break;
2738
- case GGML_OP_MUL_MAT:
2739
- case GGML_OP_MUL_MAT_ID:
2740
- {
2741
- struct ggml_tensor * a;
2742
- struct ggml_tensor * b;
2743
- if (op->op == GGML_OP_MUL_MAT) {
2744
- a = op->src[0];
2745
- b = op->src[1];
2746
- } else {
2747
- a = op->src[2];
2748
- b = op->src[1];
2749
- }
2750
- if (a->ne[3] != b->ne[3]) {
2751
- return false;
2752
- }
2753
- ggml_type a_type = a->type;
2754
- if (a_type == GGML_TYPE_IQ2_XXS || a_type == GGML_TYPE_IQ2_XS || a_type == GGML_TYPE_IQ3_XXS ||
2755
- a_type == GGML_TYPE_IQ1_S || a_type == GGML_TYPE_IQ4_NL || a_type == GGML_TYPE_IQ3_S ||
2756
- a_type == GGML_TYPE_IQ1_M || a_type == GGML_TYPE_IQ2_S || a_type == GGML_TYPE_IQ4_XS) {
2757
- if (b->ne[1] == 1 && ggml_nrows(b) > 1) {
2758
- return false;
2759
- }
2760
- }
2761
- return true;
2762
- } break;
2763
- case GGML_OP_GET_ROWS:
2764
- {
2765
- switch (op->src[0]->type) {
2766
- case GGML_TYPE_F16:
2767
- case GGML_TYPE_F32:
2768
- case GGML_TYPE_Q4_0:
2769
- case GGML_TYPE_Q4_1:
2770
- case GGML_TYPE_Q5_0:
2771
- case GGML_TYPE_Q5_1:
2772
- case GGML_TYPE_Q8_0:
2773
- return true;
2774
- default:
2775
- return false;
2776
- }
2777
- } break;
2778
- case GGML_OP_CPY:
2779
- {
2780
- ggml_type src0_type = op->src[0]->type;
2781
- ggml_type src1_type = op->src[1]->type;
2782
- if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) {
2783
- return true;
2784
- }
2785
- if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F16) {
2786
- return true;
2787
- }
2788
- if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q8_0) {
2789
- return true;
2790
- }
2791
- if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q4_0) {
2792
- return true;
2793
- }
2794
- if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q4_1) {
2795
- return true;
2796
- }
2797
- if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q5_0) {
2798
- return true;
2799
- }
2800
- if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q5_1) {
2801
- return true;
2802
- }
2803
- if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_IQ4_NL) {
2804
- return true;
2805
- }
2806
- if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F16) {
2807
- return true;
2808
- }
2809
- if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F32) {
2810
- return true;
2811
- }
2812
- return false;
2813
- } break;
2814
- case GGML_OP_DUP:
2815
- case GGML_OP_REPEAT:
2816
- case GGML_OP_CONCAT:
2817
- {
2818
- ggml_type src0_type = op->src[0]->type;
2819
- return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16;
2820
- } break;
2821
- case GGML_OP_NONE:
2822
- case GGML_OP_RESHAPE:
2823
- case GGML_OP_VIEW:
2824
- case GGML_OP_PERMUTE:
2825
- case GGML_OP_TRANSPOSE:
2826
- case GGML_OP_NORM:
2827
- case GGML_OP_ADD:
2828
- case GGML_OP_MUL:
2829
- case GGML_OP_DIV:
2830
- case GGML_OP_RMS_NORM:
2831
- case GGML_OP_SCALE:
2832
- case GGML_OP_SQR:
2833
- case GGML_OP_CLAMP:
2834
- case GGML_OP_CONT:
2835
- case GGML_OP_DIAG_MASK_INF:
2836
- case GGML_OP_SOFT_MAX:
2837
- return true;
2838
- case GGML_OP_ROPE:
2839
- return ggml_is_contiguous(op->src[0]);
2840
- case GGML_OP_IM2COL:
2841
- case GGML_OP_POOL_2D:
2842
- case GGML_OP_SUM_ROWS:
2843
- case GGML_OP_ARGSORT:
2844
- case GGML_OP_ACC:
2845
- case GGML_OP_GROUP_NORM:
2846
- case GGML_OP_UPSCALE:
2847
- case GGML_OP_PAD:
2848
- case GGML_OP_ARANGE:
2849
- case GGML_OP_TIMESTEP_EMBEDDING:
2850
- case GGML_OP_LEAKY_RELU:
2851
- return true;
2852
- case GGML_OP_FLASH_ATTN_EXT:
2853
- #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
2854
- return op->src[0]->ne[0] == 64 || op->src[0]->ne[0] == 128;
2855
- #else
2856
- if (op->src[0]->ne[0] == 128) {
2857
- return true;
2858
- }
2859
- if (op->src[0]->ne[0] == 64 && op->src[1]->type == GGML_TYPE_F16) {
2860
- return true;
2861
- }
2862
- return ggml_cuda_info().devices[cuda_ctx->device].cc >= CC_VOLTA &&
2863
- op->src[1]->type == GGML_TYPE_F16 && op->src[2]->type == GGML_TYPE_F16;
2864
- #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
2865
- default:
2866
- return false;
2867
- }
2868
-
2869
- GGML_UNUSED(backend);
2870
- }
2871
-
2872
- GGML_CALL static bool ggml_backend_cuda_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
2873
- if (ggml_backend_buft_is_cuda_split(buft)) {
2874
- return true;
2875
- }
2876
-
2877
- if (ggml_backend_buft_is_cuda(buft)) {
2878
- ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
2879
- ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
2880
- return buft_ctx->device == cuda_ctx->device;
2881
- }
2882
-
2883
- return false;
2884
- }
2885
-
2886
- GGML_CALL static bool ggml_backend_cuda_offload_op(ggml_backend_t backend, const ggml_tensor * op) {
2887
- const int min_batch_size = 32;
2888
-
2889
- return (op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS) ||
2890
- (op->ne[2] >= min_batch_size && op->op == GGML_OP_MUL_MAT_ID);
2891
-
2892
- GGML_UNUSED(backend);
2893
- }
2894
-
2895
- static ggml_backend_event_t ggml_backend_cuda_event_new(ggml_backend_t backend) {
2896
- #ifdef GGML_CUDA_NO_PEER_COPY
2897
- return nullptr;
2898
- #else
2899
- ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
2900
-
2901
- ggml_cuda_set_device(cuda_ctx->device);
2902
-
2903
- cudaEvent_t event;
2904
- CUDA_CHECK(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
2905
-
2906
- return new ggml_backend_event {
2907
- /* .backend = */ backend,
2908
- /* .context = */ event,
2909
- };
2910
- #endif
2911
- }
2912
-
2913
- static void ggml_backend_cuda_event_free(ggml_backend_event_t event) {
2914
- CUDA_CHECK(cudaEventDestroy((cudaEvent_t)event->context));
2915
-
2916
- delete event;
2917
- }
2918
-
2919
- static void ggml_backend_cuda_event_record(ggml_backend_event_t event) {
2920
- ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)event->backend->context;
2921
-
2922
- CUDA_CHECK(cudaEventRecord((cudaEvent_t)event->context, cuda_ctx->stream()));
2923
- }
2924
-
2925
- static void ggml_backend_cuda_event_wait(ggml_backend_t backend, ggml_backend_event_t event) {
2926
- ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
2927
-
2928
- if (ggml_backend_is_cuda(event->backend)) {
2929
- CUDA_CHECK(cudaStreamWaitEvent(cuda_ctx->stream(), (cudaEvent_t)event->context, 0));
2930
- } else {
2931
- #if 0
2932
- // untested
2933
- auto wait_fn = [](void * user_data) {
2934
- ggml_backend_event_t event = (ggml_backend_event_t)user_data;
2935
- ggml_backend_event_synchronize(event);
2936
- };
2937
-
2938
- CUDA_CHECK(cudaLaunchHostFunc(cuda_ctx->stream(), wait_fn, event));
2939
- #endif
2940
- GGML_ASSERT(false);
2941
- }
2942
- }
2943
-
2944
- static void ggml_backend_cuda_event_synchronize(ggml_backend_event_t event) {
2945
- CUDA_CHECK(cudaEventSynchronize((cudaEvent_t)event->context));
2946
- }
2947
-
2948
- static ggml_backend_i ggml_backend_cuda_interface = {
2949
- /* .get_name = */ ggml_backend_cuda_name,
2950
- /* .free = */ ggml_backend_cuda_free,
2951
- /* .get_default_buffer_type = */ ggml_backend_cuda_get_default_buffer_type,
2952
- /* .set_tensor_async = */ ggml_backend_cuda_set_tensor_async,
2953
- /* .get_tensor_async = */ ggml_backend_cuda_get_tensor_async,
2954
- /* .cpy_tensor_async = */ ggml_backend_cuda_cpy_tensor_async,
2955
- /* .synchronize = */ ggml_backend_cuda_synchronize,
2956
- /* .graph_plan_create = */ NULL,
2957
- /* .graph_plan_free = */ NULL,
2958
- /* .graph_plan_update = */ NULL,
2959
- /* .graph_plan_compute = */ NULL,
2960
- /* .graph_compute = */ ggml_backend_cuda_graph_compute,
2961
- /* .supports_op = */ ggml_backend_cuda_supports_op,
2962
- /* .supports_buft = */ ggml_backend_cuda_supports_buft,
2963
- /* .offload_op = */ ggml_backend_cuda_offload_op,
2964
- /* .event_new = */ ggml_backend_cuda_event_new,
2965
- /* .event_free = */ ggml_backend_cuda_event_free,
2966
- /* .event_record = */ ggml_backend_cuda_event_record,
2967
- /* .event_wait = */ ggml_backend_cuda_event_wait,
2968
- /* .event_synchronize = */ ggml_backend_cuda_event_synchronize,
2969
- };
2970
-
2971
- static ggml_guid_t ggml_backend_cuda_guid() {
2972
- static ggml_guid guid = { 0x2c, 0xdd, 0xe8, 0x1c, 0x65, 0xb3, 0x65, 0x73, 0x6a, 0x12, 0x88, 0x61, 0x1c, 0xc9, 0xdc, 0x25 };
2973
- return &guid;
2974
- }
2975
-
2976
- GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device) {
2977
- if (device < 0 || device >= ggml_backend_cuda_get_device_count()) {
2978
- GGML_CUDA_LOG_ERROR("%s: invalid device %d\n", __func__, device);
2979
- return nullptr;
2980
- }
2981
-
2982
- ggml_backend_cuda_context * ctx = new ggml_backend_cuda_context(device);
2983
- if (ctx == nullptr) {
2984
- GGML_CUDA_LOG_ERROR("%s: failed to allocate context\n", __func__);
2985
- return nullptr;
2986
- }
2987
-
2988
- ggml_backend_t cuda_backend = new ggml_backend {
2989
- /* .guid = */ ggml_backend_cuda_guid(),
2990
- /* .interface = */ ggml_backend_cuda_interface,
2991
- /* .context = */ ctx
2992
- };
2993
-
2994
- return cuda_backend;
2995
- }
2996
-
2997
- GGML_CALL bool ggml_backend_is_cuda(ggml_backend_t backend) {
2998
- return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_cuda_guid());
2999
- }
3000
-
3001
- GGML_CALL int ggml_backend_cuda_get_device_count() {
3002
- return ggml_cuda_info().device_count;
3003
- }
3004
-
3005
- GGML_CALL void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size) {
3006
- cudaDeviceProp prop;
3007
- CUDA_CHECK(cudaGetDeviceProperties(&prop, device));
3008
- snprintf(description, description_size, "%s", prop.name);
3009
- }
3010
-
3011
- GGML_CALL void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total) {
3012
- ggml_cuda_set_device(device);
3013
-
3014
- CUDA_CHECK(cudaMemGetInfo(free, total));
3015
- }
3016
-
3017
- GGML_CALL bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size) {
3018
- if (getenv("GGML_CUDA_REGISTER_HOST") == nullptr) {
3019
- return false;
3020
- }
3021
-
3022
- #if CUDART_VERSION >= 11100
3023
- cudaError_t err = cudaHostRegister(buffer, size, cudaHostRegisterPortable | cudaHostRegisterReadOnly);
3024
- if (err != cudaSuccess) {
3025
- // clear the error
3026
- cudaGetLastError();
3027
-
3028
- GGML_CUDA_LOG_WARN("%s: failed to register %.2f MiB of pinned memory: %s\n", __func__,
3029
- size / 1024.0 / 1024.0, cudaGetErrorString(err));
3030
- return false;
3031
- }
3032
- return true;
3033
- #else
3034
- return false;
3035
- #endif
3036
- }
3037
-
3038
- GGML_CALL void ggml_backend_cuda_unregister_host_buffer(void * buffer) {
3039
- if (getenv("GGML_CUDA_REGISTER_HOST") == nullptr) {
3040
- return;
3041
- }
3042
-
3043
- cudaError_t err = cudaHostUnregister(buffer);
3044
- if (err != cudaSuccess) {
3045
- // clear the error
3046
- cudaGetLastError();
3047
- }
3048
- }
3049
-
3050
- // backend registry
3051
- GGML_CALL static ggml_backend_t ggml_backend_reg_cuda_init(const char * params, void * user_data) {
3052
- ggml_backend_t cuda_backend = ggml_backend_cuda_init((int) (intptr_t) user_data);
3053
- return cuda_backend;
3054
-
3055
- GGML_UNUSED(params);
3056
- }
3057
-
3058
- extern "C" GGML_CALL int ggml_backend_cuda_reg_devices();
3059
-
3060
- GGML_CALL int ggml_backend_cuda_reg_devices() {
3061
- int device_count = ggml_backend_cuda_get_device_count();
3062
- //int device_count = 1; // DEBUG: some tools require delaying CUDA initialization
3063
- for (int i = 0; i < device_count; i++) {
3064
- char name[128];
3065
- snprintf(name, sizeof(name), "%s%d", GGML_CUDA_NAME, i);
3066
- ggml_backend_register(name, ggml_backend_reg_cuda_init, ggml_backend_cuda_buffer_type(i), (void *) (intptr_t) i);
3067
- }
3068
- return device_count;
3069
- }