llama_cpp 0.16.2 → 0.17.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (177) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +18 -0
  3. data/README.md +7 -12
  4. data/ext/llama_cpp/extconf.rb +2 -43
  5. data/ext/llama_cpp/llama_cpp.cpp +8 -0
  6. data/lib/llama_cpp/version.rb +3 -3
  7. data/sig/llama_cpp.rbs +3 -0
  8. metadata +2 -171
  9. data/vendor/include/.gitkeep +0 -0
  10. data/vendor/lib/.gitkeep +0 -0
  11. data/vendor/tmp/llama.cpp/LICENSE +0 -21
  12. data/vendor/tmp/llama.cpp/Makefile +0 -1124
  13. data/vendor/tmp/llama.cpp/ggml-alloc.c +0 -1041
  14. data/vendor/tmp/llama.cpp/ggml-alloc.h +0 -76
  15. data/vendor/tmp/llama.cpp/ggml-backend-impl.h +0 -153
  16. data/vendor/tmp/llama.cpp/ggml-backend.c +0 -2225
  17. data/vendor/tmp/llama.cpp/ggml-backend.h +0 -236
  18. data/vendor/tmp/llama.cpp/ggml-blas.cpp +0 -363
  19. data/vendor/tmp/llama.cpp/ggml-blas.h +0 -23
  20. data/vendor/tmp/llama.cpp/ggml-common.h +0 -1805
  21. data/vendor/tmp/llama.cpp/ggml-cuda/acc.cu +0 -47
  22. data/vendor/tmp/llama.cpp/ggml-cuda/arange.cu +0 -34
  23. data/vendor/tmp/llama.cpp/ggml-cuda/argsort.cu +0 -104
  24. data/vendor/tmp/llama.cpp/ggml-cuda/binbcast.cu +0 -280
  25. data/vendor/tmp/llama.cpp/ggml-cuda/clamp.cu +0 -34
  26. data/vendor/tmp/llama.cpp/ggml-cuda/concat.cu +0 -196
  27. data/vendor/tmp/llama.cpp/ggml-cuda/convert.cu +0 -686
  28. data/vendor/tmp/llama.cpp/ggml-cuda/cpy.cu +0 -490
  29. data/vendor/tmp/llama.cpp/ggml-cuda/diagmask.cu +0 -40
  30. data/vendor/tmp/llama.cpp/ggml-cuda/dmmv.cu +0 -674
  31. data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f16.cu +0 -319
  32. data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f32.cu +0 -312
  33. data/vendor/tmp/llama.cpp/ggml-cuda/fattn.cu +0 -345
  34. data/vendor/tmp/llama.cpp/ggml-cuda/getrows.cu +0 -178
  35. data/vendor/tmp/llama.cpp/ggml-cuda/im2col.cu +0 -104
  36. data/vendor/tmp/llama.cpp/ggml-cuda/mmq.cu +0 -88
  37. data/vendor/tmp/llama.cpp/ggml-cuda/mmvq.cu +0 -419
  38. data/vendor/tmp/llama.cpp/ggml-cuda/norm.cu +0 -221
  39. data/vendor/tmp/llama.cpp/ggml-cuda/pad.cu +0 -49
  40. data/vendor/tmp/llama.cpp/ggml-cuda/pool2d.cu +0 -94
  41. data/vendor/tmp/llama.cpp/ggml-cuda/quantize.cu +0 -112
  42. data/vendor/tmp/llama.cpp/ggml-cuda/rope.cu +0 -271
  43. data/vendor/tmp/llama.cpp/ggml-cuda/scale.cu +0 -31
  44. data/vendor/tmp/llama.cpp/ggml-cuda/softmax.cu +0 -206
  45. data/vendor/tmp/llama.cpp/ggml-cuda/sumrows.cu +0 -40
  46. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +0 -5
  47. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +0 -5
  48. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +0 -5
  49. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +0 -5
  50. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +0 -5
  51. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +0 -5
  52. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +0 -5
  53. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +0 -5
  54. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +0 -5
  55. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +0 -5
  56. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +0 -5
  57. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +0 -5
  58. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +0 -5
  59. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +0 -5
  60. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +0 -5
  61. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +0 -5
  62. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +0 -5
  63. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +0 -5
  64. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +0 -5
  65. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +0 -5
  66. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +0 -5
  67. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +0 -5
  68. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +0 -5
  69. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +0 -5
  70. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +0 -5
  71. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +0 -5
  72. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +0 -5
  73. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +0 -5
  74. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +0 -5
  75. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +0 -5
  76. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +0 -5
  77. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +0 -5
  78. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +0 -5
  79. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +0 -5
  80. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +0 -5
  81. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +0 -5
  82. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +0 -5
  83. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +0 -5
  84. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +0 -5
  85. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +0 -5
  86. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +0 -5
  87. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +0 -5
  88. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +0 -5
  89. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +0 -5
  90. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +0 -5
  91. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +0 -5
  92. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +0 -5
  93. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +0 -5
  94. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +0 -5
  95. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +0 -5
  96. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +0 -5
  97. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +0 -5
  98. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +0 -5
  99. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +0 -5
  100. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +0 -5
  101. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +0 -5
  102. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +0 -5
  103. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +0 -5
  104. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +0 -5
  105. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +0 -5
  106. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +0 -5
  107. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +0 -5
  108. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +0 -5
  109. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +0 -5
  110. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +0 -5
  111. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +0 -5
  112. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +0 -5
  113. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +0 -5
  114. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +0 -5
  115. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +0 -5
  116. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +0 -5
  117. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +0 -5
  118. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +0 -5
  119. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +0 -5
  120. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +0 -5
  121. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +0 -5
  122. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +0 -5
  123. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +0 -5
  124. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +0 -5
  125. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +0 -5
  126. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +0 -5
  127. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +0 -5
  128. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +0 -5
  129. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +0 -5
  130. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +0 -5
  131. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +0 -5
  132. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu +0 -10
  133. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu +0 -9
  134. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu +0 -10
  135. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu +0 -10
  136. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu +0 -8
  137. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q2_k.cu +0 -5
  138. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q3_k.cu +0 -5
  139. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_0.cu +0 -5
  140. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_1.cu +0 -5
  141. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_k.cu +0 -5
  142. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_0.cu +0 -5
  143. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_1.cu +0 -5
  144. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_k.cu +0 -5
  145. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q6_k.cu +0 -5
  146. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q8_0.cu +0 -5
  147. data/vendor/tmp/llama.cpp/ggml-cuda/tsembd.cu +0 -47
  148. data/vendor/tmp/llama.cpp/ggml-cuda/unary.cu +0 -314
  149. data/vendor/tmp/llama.cpp/ggml-cuda/upscale.cu +0 -51
  150. data/vendor/tmp/llama.cpp/ggml-cuda.cu +0 -3069
  151. data/vendor/tmp/llama.cpp/ggml-cuda.h +0 -44
  152. data/vendor/tmp/llama.cpp/ggml-impl.h +0 -651
  153. data/vendor/tmp/llama.cpp/ggml-kompute.cpp +0 -2038
  154. data/vendor/tmp/llama.cpp/ggml-kompute.h +0 -46
  155. data/vendor/tmp/llama.cpp/ggml-metal.h +0 -66
  156. data/vendor/tmp/llama.cpp/ggml-metal.m +0 -3273
  157. data/vendor/tmp/llama.cpp/ggml-metal.metal +0 -6540
  158. data/vendor/tmp/llama.cpp/ggml-quants.c +0 -14994
  159. data/vendor/tmp/llama.cpp/ggml-quants.h +0 -133
  160. data/vendor/tmp/llama.cpp/ggml-rpc.cpp +0 -1178
  161. data/vendor/tmp/llama.cpp/ggml-rpc.h +0 -24
  162. data/vendor/tmp/llama.cpp/ggml-sycl.cpp +0 -6351
  163. data/vendor/tmp/llama.cpp/ggml-sycl.h +0 -40
  164. data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +0 -144508
  165. data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +0 -7183
  166. data/vendor/tmp/llama.cpp/ggml-vulkan.h +0 -29
  167. data/vendor/tmp/llama.cpp/ggml.c +0 -22506
  168. data/vendor/tmp/llama.cpp/ggml.h +0 -2458
  169. data/vendor/tmp/llama.cpp/llama.cpp +0 -18985
  170. data/vendor/tmp/llama.cpp/llama.h +0 -1147
  171. data/vendor/tmp/llama.cpp/scripts/get-flags.mk +0 -38
  172. data/vendor/tmp/llama.cpp/sgemm.cpp +0 -1032
  173. data/vendor/tmp/llama.cpp/sgemm.h +0 -14
  174. data/vendor/tmp/llama.cpp/unicode-data.cpp +0 -7033
  175. data/vendor/tmp/llama.cpp/unicode-data.h +0 -20
  176. data/vendor/tmp/llama.cpp/unicode.cpp +0 -810
  177. data/vendor/tmp/llama.cpp/unicode.h +0 -63
@@ -1,3069 +0,0 @@
1
- #include "ggml-cuda.h"
2
- #include "ggml.h"
3
- #include "ggml-backend-impl.h"
4
-
5
- #include "ggml-cuda/common.cuh"
6
- #include "ggml-cuda/acc.cuh"
7
- #include "ggml-cuda/arange.cuh"
8
- #include "ggml-cuda/argsort.cuh"
9
- #include "ggml-cuda/binbcast.cuh"
10
- #include "ggml-cuda/clamp.cuh"
11
- #include "ggml-cuda/concat.cuh"
12
- #include "ggml-cuda/convert.cuh"
13
- #include "ggml-cuda/cpy.cuh"
14
- #include "ggml-cuda/diagmask.cuh"
15
- #include "ggml-cuda/dmmv.cuh"
16
- #include "ggml-cuda/fattn.cuh"
17
- #include "ggml-cuda/getrows.cuh"
18
- #include "ggml-cuda/im2col.cuh"
19
- #include "ggml-cuda/mmq.cuh"
20
- #include "ggml-cuda/mmvq.cuh"
21
- #include "ggml-cuda/norm.cuh"
22
- #include "ggml-cuda/pad.cuh"
23
- #include "ggml-cuda/pool2d.cuh"
24
- #include "ggml-cuda/quantize.cuh"
25
- #include "ggml-cuda/rope.cuh"
26
- #include "ggml-cuda/scale.cuh"
27
- #include "ggml-cuda/softmax.cuh"
28
- #include "ggml-cuda/sumrows.cuh"
29
- #include "ggml-cuda/tsembd.cuh"
30
- #include "ggml-cuda/unary.cuh"
31
- #include "ggml-cuda/upscale.cuh"
32
-
33
- #include <algorithm>
34
- #include <array>
35
- #include <atomic>
36
- #include <cinttypes>
37
- #include <cstddef>
38
- #include <cstdint>
39
- #include <float.h>
40
- #include <limits>
41
- #include <map>
42
- #include <memory>
43
- #include <mutex>
44
- #include <stdint.h>
45
- #include <stdio.h>
46
- #include <stdarg.h>
47
- #include <stdlib.h>
48
- #include <string>
49
- #include <vector>
50
-
51
- static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
52
-
53
- static void ggml_cuda_default_log_callback(enum ggml_log_level level, const char * msg, void * user_data) {
54
- GGML_UNUSED(level);
55
- GGML_UNUSED(user_data);
56
- fprintf(stderr, "%s", msg);
57
- }
58
-
59
- ggml_log_callback ggml_cuda_log_callback = ggml_cuda_default_log_callback;
60
- void * ggml_cuda_log_user_data = NULL;
61
-
62
- GGML_API void ggml_backend_cuda_log_set_callback(ggml_log_callback log_callback, void * user_data) {
63
- ggml_cuda_log_callback = log_callback;
64
- ggml_cuda_log_user_data = user_data;
65
- }
66
-
67
- #define GGML_CUDA_LOG_INFO(...) ggml_cuda_log(GGML_LOG_LEVEL_INFO, __VA_ARGS__)
68
- #define GGML_CUDA_LOG_WARN(...) ggml_cuda_log(GGML_LOG_LEVEL_WARN, __VA_ARGS__)
69
- #define GGML_CUDA_LOG_ERROR(...) ggml_cuda_log(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
70
-
71
- GGML_ATTRIBUTE_FORMAT(2, 3)
72
- static void ggml_cuda_log(enum ggml_log_level level, const char * format, ...) {
73
- if (ggml_cuda_log_callback != NULL) {
74
- va_list args;
75
- va_start(args, format);
76
- char buffer[128];
77
- int len = vsnprintf(buffer, 128, format, args);
78
- if (len < 128) {
79
- ggml_cuda_log_callback(level, buffer, ggml_cuda_log_user_data);
80
- } else {
81
- std::vector<char> buffer2(len + 1); // vsnprintf adds a null terminator
82
- va_end(args);
83
- va_start(args, format);
84
- vsnprintf(&buffer2[0], buffer2.size(), format, args);
85
- ggml_cuda_log_callback(level, buffer2.data(), ggml_cuda_log_user_data);
86
- }
87
- va_end(args);
88
- }
89
- }
90
-
91
- [[noreturn]]
92
- void ggml_cuda_error(const char * stmt, const char * func, const char * file, int line, const char * msg) {
93
- int id = -1; // in case cudaGetDevice fails
94
- cudaGetDevice(&id);
95
-
96
- GGML_CUDA_LOG_ERROR("CUDA error: %s\n", msg);
97
- GGML_CUDA_LOG_ERROR(" current device: %d, in function %s at %s:%d\n", id, func, file, line);
98
- GGML_CUDA_LOG_ERROR(" %s\n", stmt);
99
- // abort with GGML_ASSERT to get a stack trace
100
- GGML_ASSERT(!"CUDA error");
101
- }
102
-
103
- // this is faster on Windows
104
- // probably because the Windows CUDA libraries forget to make this check before invoking the drivers
105
- void ggml_cuda_set_device(int device) {
106
- int current_device;
107
- CUDA_CHECK(cudaGetDevice(&current_device));
108
-
109
- if (device == current_device) {
110
- return;
111
- }
112
-
113
- CUDA_CHECK(cudaSetDevice(device));
114
- }
115
-
116
- int ggml_cuda_get_device() {
117
- int id;
118
- CUDA_CHECK(cudaGetDevice(&id));
119
- return id;
120
- }
121
-
122
- static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device) {
123
- ggml_cuda_set_device(device);
124
- #if defined(GGML_USE_HIPBLAS) && defined(GGML_HIP_UMA)
125
- auto res = hipMallocManaged(ptr, size);
126
- if (res == hipSuccess) {
127
- // if error we "need" to know why...
128
- CUDA_CHECK(hipMemAdvise(*ptr, size, hipMemAdviseSetCoarseGrain, device));
129
- }
130
- return res;
131
- #else
132
- return cudaMalloc(ptr, size);
133
- #endif
134
- }
135
-
136
- static ggml_cuda_device_info ggml_cuda_init() {
137
- #ifdef __HIP_PLATFORM_AMD__
138
- // Workaround for a rocBLAS bug when using multiple graphics cards:
139
- // https://github.com/ROCmSoftwarePlatform/rocBLAS/issues/1346
140
- rocblas_initialize();
141
- CUDA_CHECK(cudaDeviceSynchronize());
142
- #endif
143
-
144
- ggml_cuda_device_info info = {};
145
-
146
- cudaError_t err = cudaGetDeviceCount(&info.device_count);
147
- if (err != cudaSuccess) {
148
- GGML_CUDA_LOG_ERROR("%s: failed to initialize " GGML_CUDA_NAME ": %s\n", __func__, cudaGetErrorString(err));
149
- return info;
150
- }
151
-
152
- GGML_ASSERT(info.device_count <= GGML_CUDA_MAX_DEVICES);
153
-
154
- int64_t total_vram = 0;
155
- #if defined(GGML_CUDA_FORCE_MMQ)
156
- GGML_CUDA_LOG_INFO("%s: GGML_CUDA_FORCE_MMQ: yes\n", __func__);
157
- #else
158
- GGML_CUDA_LOG_INFO("%s: GGML_CUDA_FORCE_MMQ: no\n", __func__);
159
- #endif
160
- #if defined(CUDA_USE_TENSOR_CORES)
161
- GGML_CUDA_LOG_INFO("%s: CUDA_USE_TENSOR_CORES: yes\n", __func__);
162
- #else
163
- GGML_CUDA_LOG_INFO("%s: CUDA_USE_TENSOR_CORES: no\n", __func__);
164
- #endif
165
- GGML_CUDA_LOG_INFO("%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, info.device_count);
166
- for (int id = 0; id < info.device_count; ++id) {
167
- int device_vmm = 0;
168
-
169
- #if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM)
170
- CUdevice device;
171
- CU_CHECK(cuDeviceGet(&device, id));
172
- CU_CHECK(cuDeviceGetAttribute(&device_vmm, CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED, device));
173
-
174
- if (device_vmm) {
175
- CUmemAllocationProp alloc_prop = {};
176
- alloc_prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
177
- alloc_prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
178
- alloc_prop.location.id = id;
179
- CU_CHECK(cuMemGetAllocationGranularity(&info.devices[id].vmm_granularity, &alloc_prop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED));
180
- }
181
- #endif // !defined(GGML_USE_HIPBLAS)
182
- info.devices[id].vmm = !!device_vmm;
183
-
184
- cudaDeviceProp prop;
185
- CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
186
- GGML_CUDA_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s\n", id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");
187
-
188
- info.default_tensor_split[id] = total_vram;
189
- total_vram += prop.totalGlobalMem;
190
-
191
- info.devices[id].nsm = prop.multiProcessorCount;
192
- info.devices[id].smpb = prop.sharedMemPerBlock;
193
- #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
194
- info.devices[id].smpbo = prop.sharedMemPerBlock;
195
- info.devices[id].cc = 100*prop.major + 10*prop.minor + CC_OFFSET_AMD;
196
- #else
197
- info.devices[id].smpbo = prop.sharedMemPerBlockOptin;
198
- info.devices[id].cc = 100*prop.major + 10*prop.minor;
199
- #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
200
- }
201
-
202
- for (int id = 0; id < info.device_count; ++id) {
203
- info.default_tensor_split[id] /= total_vram;
204
- }
205
-
206
- // configure logging to stdout
207
- // CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, nullptr));
208
-
209
- return info;
210
- }
211
-
212
- const ggml_cuda_device_info & ggml_cuda_info() {
213
- static ggml_cuda_device_info info = ggml_cuda_init();
214
- return info;
215
- }
216
-
217
- // #define DEBUG_CUDA_MALLOC
218
-
219
- // buffer pool for cuda (legacy)
220
- struct ggml_cuda_pool_leg : public ggml_cuda_pool {
221
- static const int MAX_BUFFERS = 256;
222
-
223
- int device;
224
- struct ggml_cuda_buffer {
225
- void * ptr = nullptr;
226
- size_t size = 0;
227
- };
228
-
229
- ggml_cuda_buffer buffer_pool[MAX_BUFFERS] = {};
230
- size_t pool_size = 0;
231
-
232
- explicit ggml_cuda_pool_leg(int device) :
233
- device(device) {
234
- }
235
-
236
- ~ggml_cuda_pool_leg() {
237
- ggml_cuda_set_device(device);
238
- for (int i = 0; i < MAX_BUFFERS; ++i) {
239
- ggml_cuda_buffer & b = buffer_pool[i];
240
- if (b.ptr != nullptr) {
241
- CUDA_CHECK(cudaFree(b.ptr));
242
- pool_size -= b.size;
243
- }
244
- }
245
- GGML_ASSERT(pool_size == 0);
246
- }
247
-
248
- void * alloc(size_t size, size_t * actual_size) override {
249
- #ifdef DEBUG_CUDA_MALLOC
250
- int nnz = 0;
251
- size_t max_size = 0;
252
- #endif
253
- size_t best_diff = 1ull << 36;
254
- int ibest = -1;
255
- for (int i = 0; i < MAX_BUFFERS; ++i) {
256
- ggml_cuda_buffer& b = buffer_pool[i];
257
- if (b.ptr != nullptr) {
258
- #ifdef DEBUG_CUDA_MALLOC
259
- ++nnz;
260
- if (b.size > max_size) max_size = b.size;
261
- #endif
262
- if (b.size >= size) {
263
- size_t diff = b.size - size;
264
- if (diff < best_diff) {
265
- best_diff = diff;
266
- ibest = i;
267
- if (!best_diff) {
268
- void * ptr = b.ptr;
269
- *actual_size = b.size;
270
- b.ptr = nullptr;
271
- b.size = 0;
272
- return ptr;
273
- }
274
- }
275
- }
276
- }
277
- }
278
- if (ibest >= 0) {
279
- ggml_cuda_buffer& b = buffer_pool[ibest];
280
- void * ptr = b.ptr;
281
- *actual_size = b.size;
282
- b.ptr = nullptr;
283
- b.size = 0;
284
- return ptr;
285
- }
286
- void * ptr;
287
- size_t look_ahead_size = (size_t) (1.05 * size);
288
- look_ahead_size = 256 * ((look_ahead_size + 255)/256);
289
- ggml_cuda_set_device(device);
290
- CUDA_CHECK(ggml_cuda_device_malloc(&ptr, look_ahead_size, device));
291
- *actual_size = look_ahead_size;
292
- pool_size += look_ahead_size;
293
- #ifdef DEBUG_CUDA_MALLOC
294
- GGML_CUDA_LOG_INFO("%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, requested %u MB\n", __func__, device, nnz,
295
- (uint32_t)(max_size / 1024 / 1024), (uint32_t)(pool_size / 1024 / 1024), (uint32_t)(size / 1024 / 1024));
296
- #endif
297
- return ptr;
298
- }
299
-
300
- void free(void * ptr, size_t size) override {
301
- for (int i = 0; i < MAX_BUFFERS; ++i) {
302
- ggml_cuda_buffer& b = buffer_pool[i];
303
- if (b.ptr == nullptr) {
304
- b.ptr = ptr;
305
- b.size = size;
306
- return;
307
- }
308
- }
309
- GGML_CUDA_LOG_WARN("Cuda buffer pool full, increase MAX_CUDA_BUFFERS\n");
310
- ggml_cuda_set_device(device);
311
- CUDA_CHECK(cudaFree(ptr));
312
- pool_size -= size;
313
- }
314
- };
315
-
316
- // pool with virtual memory
317
- #if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM)
318
- struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
319
- static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 35; // 32 GB
320
-
321
- int device;
322
- CUdeviceptr pool_addr = 0;
323
- size_t pool_used = 0;
324
- size_t pool_size = 0;
325
- size_t granularity;
326
-
327
- explicit ggml_cuda_pool_vmm(int device) :
328
- device(device),
329
- granularity(ggml_cuda_info().devices[device].vmm_granularity) {
330
- }
331
-
332
- ~ggml_cuda_pool_vmm() {
333
- if (pool_addr != 0) {
334
- CU_CHECK(cuMemUnmap(pool_addr, pool_size));
335
- CU_CHECK(cuMemAddressFree(pool_addr, CUDA_POOL_VMM_MAX_SIZE));
336
- }
337
- }
338
-
339
- void * alloc(size_t size, size_t * actual_size) override {
340
- // round up the allocation size to the alignment to ensure that all allocations are aligned for all data types
341
- const size_t alignment = 128;
342
- size = alignment * ((size + alignment - 1) / alignment);
343
-
344
- size_t avail = pool_size - pool_used;
345
-
346
- if (size > avail) {
347
- // round up to the next multiple of the granularity
348
- size_t reserve_size = size - avail;
349
- reserve_size = granularity * ((reserve_size + granularity - 1) / granularity);
350
-
351
- GGML_ASSERT(pool_size + reserve_size <= CUDA_POOL_VMM_MAX_SIZE);
352
-
353
- // allocate more physical memory
354
- CUmemAllocationProp prop = {};
355
- prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
356
- prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
357
- prop.location.id = device;
358
- CUmemGenericAllocationHandle handle;
359
- CU_CHECK(cuMemCreate(&handle, reserve_size, &prop, 0));
360
-
361
- // reserve virtual address space (if not already reserved)
362
- if (pool_addr == 0) {
363
- CU_CHECK(cuMemAddressReserve(&pool_addr, CUDA_POOL_VMM_MAX_SIZE, 0, 0, 0));
364
- }
365
-
366
- // map at the end of the pool
367
- CU_CHECK(cuMemMap(pool_addr + pool_size, reserve_size, 0, handle, 0));
368
-
369
- // the memory allocation handle is no longer needed after mapping
370
- CU_CHECK(cuMemRelease(handle));
371
-
372
- // set access
373
- CUmemAccessDesc access = {};
374
- access.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
375
- access.location.id = device;
376
- access.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
377
- CU_CHECK(cuMemSetAccess(pool_addr + pool_size, reserve_size, &access, 1));
378
-
379
- // add to the pool
380
- pool_size += reserve_size;
381
-
382
- //printf("cuda pool[%d]: size increased to %llu MB (reserved %llu MB)\n",
383
- // device, (unsigned long long) (pool_size/1024/1024),
384
- // (unsigned long long) (reserve_size/1024/1024));
385
- }
386
-
387
- GGML_ASSERT(pool_addr != 0);
388
-
389
- void * ptr = (void *) (pool_addr + pool_used);
390
- *actual_size = size;
391
- pool_used += size;
392
-
393
- #ifdef DEBUG_CUDA_MALLOC
394
- printf("cuda pool[%d]: allocated %llu bytes at %llx\n", device, (unsigned long long) size, ptr);
395
- #endif
396
-
397
- return ptr;
398
- }
399
-
400
- void free(void * ptr, size_t size) override {
401
- #ifdef DEBUG_CUDA_MALLOC
402
- printf("cuda pool[%d]: freed %llu bytes at %llx\n", device, (unsigned long long) size, ptr);
403
- #endif
404
-
405
- pool_used -= size;
406
-
407
- // all deallocations must be in reverse order of the allocations
408
- GGML_ASSERT(ptr == (void *) (pool_addr + pool_used));
409
- }
410
- };
411
- #endif // !defined(GGML_USE_HIPBLAS)
412
-
413
- std::unique_ptr<ggml_cuda_pool> ggml_backend_cuda_context::new_pool_for_device(int device) {
414
- #if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM)
415
- if (ggml_cuda_info().devices[device].vmm) {
416
- return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_vmm(device));
417
- }
418
- #endif
419
- return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_leg(device));
420
- }
421
-
422
- // cuda buffer
423
-
424
- struct ggml_backend_cuda_buffer_context {
425
- int device;
426
- void * dev_ptr = nullptr;
427
- std::string name;
428
-
429
- ggml_backend_cuda_buffer_context(int device, void * dev_ptr) :
430
- device(device), dev_ptr(dev_ptr),
431
- name(GGML_CUDA_NAME + std::to_string(device)) {
432
- }
433
-
434
- ~ggml_backend_cuda_buffer_context() {
435
- CUDA_CHECK(cudaFree(dev_ptr));
436
- }
437
- };
438
-
439
- GGML_CALL static const char * ggml_backend_cuda_buffer_get_name(ggml_backend_buffer_t buffer) {
440
- ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
441
- return ctx->name.c_str();
442
- }
443
-
444
- GGML_CALL static bool ggml_backend_buffer_is_cuda(ggml_backend_buffer_t buffer) {
445
- return buffer->iface.get_name == ggml_backend_cuda_buffer_get_name;
446
- }
447
-
448
- GGML_CALL static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
449
- ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
450
- delete ctx;
451
- }
452
-
453
- GGML_CALL static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) {
454
- ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
455
- return ctx->dev_ptr;
456
- }
457
-
458
- GGML_CALL static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
459
- ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
460
-
461
- if (tensor->view_src != NULL) {
462
- assert(tensor->view_src->buffer->buft == buffer->buft);
463
- return;
464
- }
465
-
466
- if (ggml_is_quantized(tensor->type)) {
467
- // initialize padding to 0 to avoid possible NaN values
468
- size_t original_size = ggml_nbytes(tensor);
469
- size_t padded_size = ggml_backend_buft_get_alloc_size(buffer->buft, tensor);
470
-
471
- if (padded_size > original_size && tensor->view_src == nullptr) {
472
- ggml_cuda_set_device(ctx->device);
473
- CUDA_CHECK(cudaMemset((char *)tensor->data + original_size, 0, padded_size - original_size));
474
- }
475
- }
476
- }
477
-
478
- GGML_CALL static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
479
- ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
480
-
481
- ggml_cuda_set_device(ctx->device);
482
- CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, cudaStreamPerThread));
483
- CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
484
- }
485
-
486
- GGML_CALL static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
487
- ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
488
-
489
- ggml_cuda_set_device(ctx->device);
490
- CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, cudaStreamPerThread));
491
- CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
492
- }
493
-
494
- GGML_CALL static bool ggml_backend_cuda_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) {
495
- if (ggml_backend_buffer_is_cuda(src->buffer)) {
496
- ggml_backend_cuda_buffer_context * src_ctx = (ggml_backend_cuda_buffer_context *)src->buffer->context;
497
- ggml_backend_cuda_buffer_context * dst_ctx = (ggml_backend_cuda_buffer_context *)dst->buffer->context;
498
- if (src_ctx->device == dst_ctx->device) {
499
- CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(src), cudaMemcpyDeviceToDevice, cudaStreamPerThread));
500
- } else {
501
- #ifdef GGML_CUDA_NO_PEER_COPY
502
- return false;
503
- #else
504
- CUDA_CHECK(cudaMemcpyPeerAsync(dst->data, dst_ctx->device, src->data, src_ctx->device, ggml_nbytes(src), cudaStreamPerThread));
505
- #endif
506
- }
507
- CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
508
- return true;
509
- }
510
- return false;
511
-
512
- GGML_UNUSED(buffer);
513
- }
514
-
515
- GGML_CALL static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
516
- ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
517
-
518
- ggml_cuda_set_device(ctx->device);
519
- CUDA_CHECK(cudaDeviceSynchronize());
520
- CUDA_CHECK(cudaMemset(ctx->dev_ptr, value, buffer->size));
521
- CUDA_CHECK(cudaDeviceSynchronize());
522
- }
523
-
524
- static ggml_backend_buffer_i ggml_backend_cuda_buffer_interface = {
525
- /* .get_name = */ ggml_backend_cuda_buffer_get_name,
526
- /* .free_buffer = */ ggml_backend_cuda_buffer_free_buffer,
527
- /* .get_base = */ ggml_backend_cuda_buffer_get_base,
528
- /* .init_tensor = */ ggml_backend_cuda_buffer_init_tensor,
529
- /* .set_tensor = */ ggml_backend_cuda_buffer_set_tensor,
530
- /* .get_tensor = */ ggml_backend_cuda_buffer_get_tensor,
531
- /* .cpy_tensor = */ ggml_backend_cuda_buffer_cpy_tensor,
532
- /* .clear = */ ggml_backend_cuda_buffer_clear,
533
- /* .reset = */ NULL,
534
- };
535
-
536
- // cuda buffer type
537
// Per-device state attached to a CUDA buffer type.
// Member order matters: instances are created with positional aggregate
// initialization ({device, name}).
struct ggml_backend_cuda_buffer_type_context {
    int         device; // CUDA device index this buffer type allocates on
    std::string name;   // string returned by the buffer type's get_name callback
};
541
-
542
- GGML_CALL static const char * ggml_backend_cuda_buffer_type_name(ggml_backend_buffer_type_t buft) {
543
- ggml_backend_cuda_buffer_type_context * ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
544
-
545
- return ctx->name.c_str();
546
- }
547
-
548
- static bool ggml_backend_buft_is_cuda(ggml_backend_buffer_type_t buft) {
549
- return buft->iface.get_name == ggml_backend_cuda_buffer_type_name;
550
- }
551
-
552
- GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
553
- ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
554
-
555
- ggml_cuda_set_device(buft_ctx->device);
556
-
557
- size = std::max(size, (size_t)1); // cudaMalloc returns null for size 0
558
-
559
- void * dev_ptr;
560
- cudaError_t err = ggml_cuda_device_malloc(&dev_ptr, size, buft_ctx->device);
561
- if (err != cudaSuccess) {
562
- // clear the error
563
- cudaGetLastError();
564
- GGML_CUDA_LOG_ERROR("%s: allocating %.2f MiB on device %d: cudaMalloc failed: %s\n", __func__, size / 1024.0 / 1024.0, buft_ctx->device, cudaGetErrorString(err));
565
- return nullptr;
566
- }
567
-
568
- ggml_backend_cuda_buffer_context * ctx = new ggml_backend_cuda_buffer_context(buft_ctx->device, dev_ptr);
569
-
570
- return ggml_backend_buffer_init(buft, ggml_backend_cuda_buffer_interface, ctx, size);
571
- }
572
-
573
- GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
574
- return 128;
575
-
576
- GGML_UNUSED(buft);
577
- }
578
-
579
- GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
580
- size_t size = ggml_nbytes(tensor);
581
- int64_t ne0 = tensor->ne[0];
582
-
583
- if (ggml_is_quantized(tensor->type)) {
584
- if (ne0 % MATRIX_ROW_PADDING != 0) {
585
- size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
586
- }
587
- }
588
-
589
- return size;
590
-
591
- GGML_UNUSED(buft);
592
- }
593
-
594
- static ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = {
595
- /* .get_name = */ ggml_backend_cuda_buffer_type_name,
596
- /* .alloc_buffer = */ ggml_backend_cuda_buffer_type_alloc_buffer,
597
- /* .get_alignment = */ ggml_backend_cuda_buffer_type_get_alignment,
598
- /* .get_max_size = */ NULL, // defaults to SIZE_MAX
599
- /* .get_alloc_size = */ ggml_backend_cuda_buffer_type_get_alloc_size,
600
- /* .is_host = */ NULL,
601
- };
602
-
603
- GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
604
- static std::mutex mutex;
605
- std::lock_guard<std::mutex> lock(mutex);
606
-
607
- if (device >= ggml_backend_cuda_get_device_count()) {
608
- return nullptr;
609
- }
610
-
611
- static ggml_backend_buffer_type ggml_backend_cuda_buffer_types[GGML_CUDA_MAX_DEVICES];
612
-
613
- static bool ggml_backend_cuda_buffer_type_initialized = false;
614
-
615
- if (!ggml_backend_cuda_buffer_type_initialized) {
616
- for (int i = 0; i < GGML_CUDA_MAX_DEVICES; i++) {
617
- ggml_backend_cuda_buffer_types[i] = {
618
- /* .iface = */ ggml_backend_cuda_buffer_type_interface,
619
- /* .context = */ new ggml_backend_cuda_buffer_type_context{i, GGML_CUDA_NAME + std::to_string(i)},
620
- };
621
- }
622
- ggml_backend_cuda_buffer_type_initialized = true;
623
- }
624
-
625
- return &ggml_backend_cuda_buffer_types[device];
626
- }
627
-
628
- // cuda split buffer
629
-
630
- static int64_t get_row_rounding(const std::array<float, GGML_CUDA_MAX_DEVICES> & tensor_split) {
631
- int64_t row_rounding = 0;
632
- for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
633
- if (tensor_split[id] >= (id + 1 < ggml_backend_cuda_get_device_count() ? tensor_split[id + 1] : 1.0f)) {
634
- continue;
635
- }
636
-
637
- const int cc = ggml_cuda_info().devices[id].cc;
638
- row_rounding = std::max(row_rounding, (int64_t)get_mmq_y_host(cc, get_mmq_x_max_host(cc)));
639
- }
640
- return row_rounding;
641
- }
642
-
643
- static void get_row_split(int64_t * row_low, int64_t * row_high, const ggml_tensor * tensor, const std::array<float, GGML_CUDA_MAX_DEVICES> & tensor_split, int id) {
644
- const int64_t nrows = ggml_nrows(tensor);
645
- const int64_t rounding = get_row_rounding(tensor_split);
646
-
647
- *row_low = id == 0 ? 0 : nrows*tensor_split[id];
648
- *row_low -= *row_low % rounding;
649
-
650
- if (id == ggml_backend_cuda_get_device_count() - 1) {
651
- *row_high = nrows;
652
- } else {
653
- *row_high = nrows*tensor_split[id + 1];
654
- *row_high -= *row_high % rounding;
655
- }
656
- }
657
-
658
- static size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split) {
659
- static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
660
-
661
- return nrows_split*ggml_row_size(tensor->type, tensor->ne[0]);
662
- }
663
-
664
- struct ggml_backend_cuda_split_buffer_type_context {
665
- std::array<float, GGML_CUDA_MAX_DEVICES> tensor_split;
666
- };
667
-
668
- struct ggml_backend_cuda_split_buffer_context {
669
- ~ggml_backend_cuda_split_buffer_context() {
670
- for (ggml_tensor_extra_gpu * extra : tensor_extras) {
671
- for (int id = 0; id < GGML_CUDA_MAX_DEVICES; ++id) {
672
- for (int64_t is = 0; is < GGML_CUDA_MAX_STREAMS; ++is) {
673
- if (extra->events[id][is] != nullptr) {
674
- CUDA_CHECK(cudaEventDestroy(extra->events[id][is]));
675
- }
676
- }
677
- if (extra->data_device[id] != nullptr) {
678
- CUDA_CHECK(cudaFree(extra->data_device[id]));
679
- }
680
- }
681
- delete extra;
682
- }
683
- }
684
-
685
- std::vector<ggml_tensor_extra_gpu *> tensor_extras;
686
- };
687
-
688
- GGML_CALL static const char * ggml_backend_cuda_split_buffer_get_name(ggml_backend_buffer_t buffer) {
689
- return GGML_CUDA_NAME "_Split";
690
-
691
- GGML_UNUSED(buffer);
692
- }
693
-
694
- static bool ggml_backend_buffer_is_cuda_split(ggml_backend_buffer_t buffer) {
695
- return buffer->iface.get_name == ggml_backend_cuda_split_buffer_get_name;
696
- GGML_UNUSED(ggml_backend_buffer_is_cuda_split); // only used in debug builds currently, avoid unused function warning in release builds
697
- }
698
-
699
- GGML_CALL static void ggml_backend_cuda_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
700
- ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
701
- delete ctx;
702
- }
703
-
704
- GGML_CALL static void * ggml_backend_cuda_split_buffer_get_base(ggml_backend_buffer_t buffer) {
705
- // the pointers are stored in the tensor extras, this is just a dummy address and never dereferenced
706
- return (void *)0x1000;
707
-
708
- GGML_UNUSED(buffer);
709
- }
710
-
711
- GGML_CALL static void ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
712
- GGML_ASSERT(tensor->view_src == nullptr); // views of split tensors are not supported
713
-
714
- ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
715
- ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context;
716
-
717
- const int64_t ne0 = tensor->ne[0];
718
-
719
- ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu{};
720
- ctx->tensor_extras.push_back(extra);
721
-
722
- for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
723
- int64_t row_low, row_high;
724
- get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id);
725
-
726
- int64_t nrows_split = row_high - row_low;
727
- if (nrows_split == 0) {
728
- continue;
729
- }
730
-
731
- size_t size = ggml_nbytes_split(tensor, nrows_split);
732
- const size_t original_size = size;
733
-
734
- // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
735
- if (ne0 % MATRIX_ROW_PADDING != 0) {
736
- size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
737
- }
738
-
739
- // FIXME: do not crash if cudaMalloc fails
740
- // currently, init_tensor cannot fail, it needs to be fixed in ggml-backend first
741
- ggml_cuda_set_device(id);
742
- char * buf;
743
- CUDA_CHECK(ggml_cuda_device_malloc((void**)&buf, size, id));
744
-
745
- // set padding to 0 to avoid possible NaN values
746
- if (size > original_size) {
747
- CUDA_CHECK(cudaMemset(buf + original_size, 0, size - original_size));
748
- }
749
-
750
- extra->data_device[id] = buf;
751
-
752
- for (int64_t is = 0; is < GGML_CUDA_MAX_STREAMS; ++is) {
753
- CUDA_CHECK(cudaEventCreateWithFlags(&extra->events[id][is], cudaEventDisableTiming));
754
- }
755
- }
756
- tensor->extra = extra;
757
- }
758
-
759
- GGML_CALL static void ggml_backend_cuda_split_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
760
- // split tensors must always be set in their entirety at once
761
- GGML_ASSERT(offset == 0);
762
- GGML_ASSERT(size == ggml_nbytes(tensor));
763
-
764
- ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context;
765
-
766
- const int64_t ne0 = tensor->ne[0];
767
- const size_t nb1 = tensor->nb[1];
768
- ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *)tensor->extra;
769
-
770
- for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
771
- int64_t row_low, row_high;
772
- get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id);
773
-
774
- int64_t nrows_split = row_high - row_low;
775
- if (nrows_split == 0) {
776
- continue;
777
- }
778
-
779
- const size_t offset_split = row_low*nb1;
780
- size_t size = ggml_nbytes_split(tensor, nrows_split);
781
- const size_t original_size = size;
782
-
783
- // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
784
- if (ne0 % MATRIX_ROW_PADDING != 0) {
785
- size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
786
- }
787
-
788
- const char * buf_host = (const char *)data + offset_split;
789
- CUDA_CHECK(cudaMemcpyAsync(extra->data_device[id], buf_host, original_size, cudaMemcpyHostToDevice, cudaStreamPerThread));
790
- }
791
-
792
- for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
793
- CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
794
- }
795
- }
796
-
797
- GGML_CALL static void ggml_backend_cuda_split_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
798
- // split tensors must always be set in their entirety at once
799
- GGML_ASSERT(offset == 0);
800
- GGML_ASSERT(size == ggml_nbytes(tensor));
801
-
802
- ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context;
803
-
804
- const int64_t ne0 = tensor->ne[0];
805
- const size_t nb1 = tensor->nb[1];
806
- ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *)tensor->extra;
807
-
808
- for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
809
- int64_t row_low, row_high;
810
- get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id);
811
-
812
- int64_t nrows_split = row_high - row_low;
813
- if (nrows_split == 0) {
814
- continue;
815
- }
816
-
817
- const size_t offset_split = row_low*nb1;
818
- size_t size = ggml_nbytes_split(tensor, nrows_split);
819
- const size_t original_size = size;
820
-
821
- // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
822
- if (ne0 % MATRIX_ROW_PADDING != 0) {
823
- size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
824
- }
825
-
826
- char * buf_host = (char *)data + offset_split;
827
- CUDA_CHECK(cudaMemcpyAsync(buf_host, extra->data_device[id], original_size, cudaMemcpyDeviceToHost, cudaStreamPerThread));
828
- }
829
-
830
- for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
831
- CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
832
- }
833
- }
834
-
835
- GGML_CALL static void ggml_backend_cuda_split_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
836
- GGML_UNUSED(buffer);
837
- GGML_UNUSED(value);
838
- }
839
-
840
- static struct ggml_backend_buffer_i ggml_backend_cuda_split_buffer_interface = {
841
- /* .get_name = */ ggml_backend_cuda_split_buffer_get_name,
842
- /* .free_buffer = */ ggml_backend_cuda_split_buffer_free_buffer,
843
- /* .get_base = */ ggml_backend_cuda_split_buffer_get_base,
844
- /* .init_tensor = */ ggml_backend_cuda_split_buffer_init_tensor,
845
- /* .set_tensor = */ ggml_backend_cuda_split_buffer_set_tensor,
846
- /* .get_tensor = */ ggml_backend_cuda_split_buffer_get_tensor,
847
- /* .cpy_tensor = */ NULL,
848
- /* .clear = */ ggml_backend_cuda_split_buffer_clear,
849
- /* .reset = */ NULL,
850
- };
851
-
852
- // cuda split buffer type
853
-
854
- GGML_CALL static const char * ggml_backend_cuda_split_buffer_type_name(ggml_backend_buffer_type_t buft) {
855
- return GGML_CUDA_NAME "_Split";
856
-
857
- GGML_UNUSED(buft);
858
- }
859
-
860
- static bool ggml_backend_buft_is_cuda_split(ggml_backend_buffer_type_t buft) {
861
- return buft->iface.get_name == ggml_backend_cuda_split_buffer_type_name;
862
- }
863
-
864
- GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_split_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
865
- // since we don't know the exact split after rounding, we cannot allocate the device buffers at this point
866
- // instead, we allocate them for each tensor separately in init_tensor
867
- // however, the size still represents the maximum cumulative size of all the device buffers after the tensors are allocated,
868
- // as returned by get_alloc_size. this limit is enforced during tensor allocation by ggml-alloc, so it must be correct.
869
- ggml_backend_cuda_split_buffer_context * ctx = new ggml_backend_cuda_split_buffer_context();
870
-
871
- return ggml_backend_buffer_init(buft, ggml_backend_cuda_split_buffer_interface, ctx, size);
872
- }
873
-
874
- GGML_CALL static size_t ggml_backend_cuda_split_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
875
- return 128;
876
-
877
- GGML_UNUSED(buft);
878
- }
879
-
880
- GGML_CALL static size_t ggml_backend_cuda_split_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
881
- ggml_backend_cuda_split_buffer_type_context * ctx = (ggml_backend_cuda_split_buffer_type_context *)buft->context;
882
-
883
- size_t total_size = 0;
884
-
885
- const int64_t ne0 = tensor->ne[0];
886
-
887
- for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
888
- int64_t row_low, row_high;
889
- get_row_split(&row_low, &row_high, tensor, ctx->tensor_split, id);
890
-
891
- int64_t nrows_split = row_high - row_low;
892
- if (nrows_split == 0) {
893
- continue;
894
- }
895
-
896
- total_size += ggml_nbytes_split(tensor, nrows_split);
897
-
898
- // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
899
- if (ne0 % MATRIX_ROW_PADDING != 0) {
900
- total_size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
901
- }
902
- }
903
-
904
- return total_size;
905
- }
906
-
907
- GGML_CALL static bool ggml_backend_cuda_split_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
908
- return false;
909
-
910
- GGML_UNUSED(buft);
911
- }
912
-
913
- static ggml_backend_buffer_type_i ggml_backend_cuda_split_buffer_type_interface = {
914
- /* .get_name = */ ggml_backend_cuda_split_buffer_type_name,
915
- /* .alloc_buffer = */ ggml_backend_cuda_split_buffer_type_alloc_buffer,
916
- /* .get_alignment = */ ggml_backend_cuda_split_buffer_type_get_alignment,
917
- /* .get_max_size = */ NULL, // defaults to SIZE_MAX
918
- /* .get_alloc_size = */ ggml_backend_cuda_split_buffer_type_get_alloc_size,
919
- /* .is_host = */ ggml_backend_cuda_split_buffer_type_is_host,
920
- };
921
-
922
- GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split) {
923
- static std::mutex mutex;
924
- std::lock_guard<std::mutex> lock(mutex);
925
-
926
- static std::map<std::array<float, GGML_CUDA_MAX_DEVICES>, struct ggml_backend_buffer_type> buft_map;
927
-
928
- std::array<float, GGML_CUDA_MAX_DEVICES> tensor_split_arr = {};
929
-
930
- bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + GGML_CUDA_MAX_DEVICES, [](float x) { return x == 0.0f; });
931
- if (all_zero) {
932
- tensor_split_arr = ggml_cuda_info().default_tensor_split;
933
- } else {
934
- float split_sum = 0.0f;
935
- for (int i = 0; i < ggml_backend_cuda_get_device_count(); ++i) {
936
- tensor_split_arr[i] = split_sum;
937
- split_sum += tensor_split[i];
938
- }
939
- for (int i = 0; i < ggml_backend_cuda_get_device_count(); ++i) {
940
- tensor_split_arr[i] /= split_sum;
941
- }
942
- }
943
-
944
- auto it = buft_map.find(tensor_split_arr);
945
- if (it != buft_map.end()) {
946
- return &it->second;
947
- }
948
-
949
- struct ggml_backend_buffer_type buft {
950
- /* .iface = */ ggml_backend_cuda_split_buffer_type_interface,
951
- /* .context = */ new ggml_backend_cuda_split_buffer_type_context{tensor_split_arr},
952
- };
953
-
954
- auto result = buft_map.emplace(tensor_split_arr, buft);
955
- return &result.first->second;
956
- }
957
-
958
- // host buffer type
959
-
960
- GGML_CALL static const char * ggml_backend_cuda_host_buffer_type_name(ggml_backend_buffer_type_t buft) {
961
- return GGML_CUDA_NAME "_Host";
962
-
963
- GGML_UNUSED(buft);
964
- }
965
-
966
- GGML_CALL static const char * ggml_backend_cuda_host_buffer_name(ggml_backend_buffer_t buffer) {
967
- return GGML_CUDA_NAME "_Host";
968
-
969
- GGML_UNUSED(buffer);
970
- }
971
-
972
- GGML_CALL static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
973
- CUDA_CHECK(cudaFreeHost(buffer->context));
974
- }
975
-
976
- static void * ggml_cuda_host_malloc(size_t size) {
977
- if (getenv("GGML_CUDA_NO_PINNED") != nullptr) {
978
- return nullptr;
979
- }
980
-
981
- void * ptr = nullptr;
982
- cudaError_t err = cudaMallocHost((void **) &ptr, size);
983
- if (err != cudaSuccess) {
984
- // clear the error
985
- cudaGetLastError();
986
- GGML_CUDA_LOG_WARN("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__,
987
- size / 1024.0 / 1024.0, cudaGetErrorString(err));
988
- return nullptr;
989
- }
990
-
991
- return ptr;
992
- }
993
-
994
- GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
995
- void * ptr = ggml_cuda_host_malloc(size);
996
-
997
- if (ptr == nullptr) {
998
- // fallback to cpu buffer
999
- return ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size);
1000
- }
1001
-
1002
- ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
1003
- buffer->buft = buft;
1004
- buffer->iface.get_name = ggml_backend_cuda_host_buffer_name;
1005
- buffer->iface.free_buffer = ggml_backend_cuda_host_buffer_free_buffer;
1006
-
1007
- return buffer;
1008
- }
1009
-
1010
- GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
1011
- static struct ggml_backend_buffer_type ggml_backend_cuda_buffer_type_host = {
1012
- /* .iface = */ {
1013
- /* .get_name = */ ggml_backend_cuda_host_buffer_type_name,
1014
- /* .alloc_buffer = */ ggml_backend_cuda_host_buffer_type_alloc_buffer,
1015
- /* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
1016
- /* .get_max_size = */ NULL, // defaults to SIZE_MAX
1017
- /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
1018
- /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
1019
- },
1020
- /* .context = */ nullptr,
1021
- };
1022
-
1023
- return &ggml_backend_cuda_buffer_type_host;
1024
- }
1025
-
1026
- //static bool ggml_backend_buffer_is_cuda_host(ggml_backend_buffer_t buffer) {
1027
- // return buffer->buft->iface.get_name == ggml_backend_cuda_host_buffer_type_name;
1028
- //}
1029
-
1030
- /// kernels
1031
-
1032
- typedef void (*ggml_cuda_op_mul_mat_t)(
1033
- ggml_backend_cuda_context & ctx,
1034
- const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
1035
- const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
1036
- const int64_t src1_padded_row_size, cudaStream_t stream);
1037
-
1038
// Upper bound on n_tokens for which ggml_cuda_set_peer_access() enables peer
// access. Defaults to 128 unless already defined (e.g. by the build).
- #ifndef GGML_CUDA_PEER_MAX_BATCH_SIZE
1039
- #define GGML_CUDA_PEER_MAX_BATCH_SIZE 128
1040
- #endif // GGML_CUDA_PEER_MAX_BATCH_SIZE
1041
-
1042
// NOTE(review): presumably the number of src1 columns processed per step by
// the mul_mat code further down the file - confirm against its users.
- #define MUL_MAT_SRC1_COL_STRIDE 128
1043
-
1044
- static __global__ void mul_mat_p021_f16_f32(
1045
- const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst,
1046
- const int ncols_x, const int nrows_x, const int nchannels_x, const int nchannels_y) {
1047
-
1048
- const half * x = (const half *) vx;
1049
-
1050
- const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
1051
- const int channel = blockDim.z*blockIdx.z + threadIdx.z;
1052
- const int channel_x = channel / (nchannels_y / nchannels_x);
1053
-
1054
- const int nrows_y = ncols_x;
1055
- const int nrows_dst = nrows_x;
1056
- const int row_dst = row_x;
1057
-
1058
- float tmp = 0.0f;
1059
-
1060
- for (int col_x0 = 0; col_x0 < ncols_x; col_x0 += blockDim.x) {
1061
- const int col_x = col_x0 + threadIdx.x;
1062
-
1063
- if (col_x >= ncols_x) {
1064
- break;
1065
- }
1066
-
1067
- // x is transposed and permuted
1068
- const int ix = row_x*nchannels_x*ncols_x + channel_x*ncols_x + col_x;
1069
- const float xi = __half2float(x[ix]);
1070
-
1071
- const int row_y = col_x;
1072
-
1073
- // y is not transposed but permuted
1074
- const int iy = channel*nrows_y + row_y;
1075
-
1076
- tmp += xi * y[iy];
1077
- }
1078
-
1079
- // dst is not transposed and not permuted
1080
- const int idst = channel*nrows_dst + row_dst;
1081
-
1082
- // sum up partial sums and write back result
1083
- tmp = warp_reduce_sum(tmp);
1084
-
1085
- if (threadIdx.x == 0) {
1086
- dst[idst] = tmp;
1087
- }
1088
- }
1089
-
1090
- static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
1091
- const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst, const int ncols_x, const int nrows_x,
1092
- const int row_stride_x, const int channel_stride_x, const int channel_x_divisor) {
1093
-
1094
- const half * x = (const half *) vx;
1095
-
1096
- const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
1097
- const int channel = blockDim.z*blockIdx.z + threadIdx.z;
1098
- const int channel_x = channel / channel_x_divisor;
1099
-
1100
- const int nrows_y = ncols_x;
1101
- const int nrows_dst = nrows_x;
1102
- const int row_dst = row_x;
1103
-
1104
- const int idst = channel*nrows_dst + row_dst;
1105
-
1106
- float tmp = 0.0f;
1107
-
1108
- for (int col_x0 = 0; col_x0 < ncols_x; col_x0 += blockDim.x) {
1109
- const int col_x = col_x0 + threadIdx.x;
1110
-
1111
- if (col_x >= ncols_x) {
1112
- break;
1113
- }
1114
-
1115
- const int row_y = col_x;
1116
-
1117
- const int ix = channel_x*channel_stride_x + row_x*row_stride_x + col_x;
1118
- const int iy = channel*nrows_y + row_y;
1119
-
1120
- const float xi = __half2float(x[ix]);
1121
-
1122
- tmp += xi * y[iy];
1123
- }
1124
-
1125
- // sum up partial sums and write back result
1126
- tmp = warp_reduce_sum(tmp);
1127
-
1128
- if (threadIdx.x == 0) {
1129
- dst[idst] = tmp;
1130
- }
1131
- }
1132
-
1133
- static void ggml_mul_mat_p021_f16_f32_cuda(
1134
- const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x,
1135
- const int nchannels_x, const int nchannels_y, cudaStream_t stream) {
1136
-
1137
- const dim3 block_nums(1, nrows_x, nchannels_y);
1138
- const dim3 block_dims(WARP_SIZE, 1, 1);
1139
- mul_mat_p021_f16_f32<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols_x, nrows_x, nchannels_x, nchannels_y);
1140
- }
1141
-
1142
- static void ggml_mul_mat_vec_nc_f16_f32_cuda(
1143
- const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x, const int row_stride_x,
1144
- const int nchannels_x, const int nchannels_y, const int channel_stride_x, cudaStream_t stream) {
1145
-
1146
- const dim3 block_nums(1, nrows_x, nchannels_y);
1147
- const dim3 block_dims(WARP_SIZE, 1, 1);
1148
- mul_mat_vec_nc_f16_f32<<<block_nums, block_dims, 0, stream>>>
1149
- (vx, y, dst, ncols_x, nrows_x, row_stride_x, channel_stride_x, nchannels_y/nchannels_x);
1150
- }
1151
-
1152
- static cudaError_t ggml_cuda_cpy_tensor_2d(
1153
- void * dst, const struct ggml_tensor * src, int64_t i3, int64_t i2, int64_t i1_low, int64_t i1_high, cudaStream_t stream) {
1154
-
1155
- GGML_ASSERT(ggml_backend_buffer_is_cuda(src->buffer));
1156
- char * src_ptr = (char *) src->data;
1157
- char * dst_ptr = (char *) dst;
1158
-
1159
- const int64_t ne0 = src->ne[0];
1160
- const int64_t nb0 = src->nb[0];
1161
- const int64_t nb1 = src->nb[1];
1162
- const int64_t nb2 = src->nb[2];
1163
- const int64_t nb3 = src->nb[3];
1164
- const enum ggml_type type = src->type;
1165
- const int64_t ts = ggml_type_size(type);
1166
- const int64_t bs = ggml_blck_size(type);
1167
- int64_t i1_diff = i1_high - i1_low;
1168
-
1169
- const char * x = src_ptr + i1_low*nb1 + i2*nb2 + i3*nb3;
1170
- if (nb0 == ts && nb1 == ts*ne0/bs) {
1171
- return cudaMemcpyAsync(dst_ptr, x, i1_diff*nb1, cudaMemcpyDeviceToDevice, stream);
1172
- } else if (nb0 == ts) {
1173
- return cudaMemcpy2DAsync(dst_ptr, ts*ne0/bs, x, nb1, ts*ne0/bs, i1_diff, cudaMemcpyDeviceToDevice, stream);
1174
- } else {
1175
- for (int64_t i1 = 0; i1 < i1_diff; i1++) {
1176
- const void * rx = (const void *) ((const char *) x + i1*nb1);
1177
- void * rd = (void *) (dst_ptr + i1*ts*ne0/bs);
1178
- // pretend the row is a matrix with cols=1
1179
- cudaError_t r = cudaMemcpy2DAsync(rd, ts/bs, rx, nb0, ts/bs, ne0, cudaMemcpyDeviceToDevice, stream);
1180
- if (r != cudaSuccess) {
1181
- return r;
1182
- }
1183
- }
1184
- return cudaSuccess;
1185
- }
1186
- }
1187
-
1188
// Multiplies a slice of src0 (row_high - row_low rows) with src1_ncols columns
// of src1 using cuBLAS on the current device and writes the float result to
// dst_dd_i.
//
// Two paths:
//   - FP16: taken when the device's compute capability is >= CC_VOLTA, src0 is
//     F16 or quantized and contiguous, the slice covers all of src0's rows,
//     and dst->op_params[0] == GGML_PREC_DEFAULT. Inputs are converted to half
//     where needed, multiplied with cublasGemmEx into a temporary half buffer,
//     and the result is converted to float.
//   - FP32: otherwise. Inputs are converted to float where needed and
//     multiplied with cublasSgemm directly into dst_dd_i.
//
// src1_ddq_i and src1_padded_row_size are not used by this implementation.
// The temporaries are pool allocations released when they go out of scope, so
// the declaration order below should be left as is.
- static void ggml_cuda_op_mul_mat_cublas(
1189
- ggml_backend_cuda_context & ctx,
1190
- const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
1191
- const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
1192
- const int64_t src1_padded_row_size, cudaStream_t stream) {
1193
-
1194
- GGML_ASSERT(src0_dd_i != nullptr);
1195
- GGML_ASSERT(src1_ddf_i != nullptr);
1196
- GGML_ASSERT(dst_dd_i != nullptr);
1197
-
1198
- const int64_t ne00 = src0->ne[0];
1199
- const int64_t ne10 = src1->ne[0];
1200
-
1201
- const int64_t ne0 = dst->ne[0];
1202
-
1203
- const int64_t row_diff = row_high - row_low;
1204
-
1205
- int id = ggml_cuda_get_device();
1206
-
1207
- // the main device has a larger memory buffer to hold the results from all GPUs
1208
- // ldc == nrows of the matrix that cuBLAS writes into
1209
- int64_t ldc = id == ctx.device ? ne0 : row_diff;
1210
-
1211
- const int compute_capability = ggml_cuda_info().devices[id].cc;
1212
-
1213
- if (compute_capability >= CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT) {
1214
- // convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32
1215
- ggml_cuda_pool_alloc<half> src0_as_f16(ctx.pool(id));
1216
- if (src0->type != GGML_TYPE_F16) {
1217
- const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src0->type);
1218
- GGML_ASSERT(to_fp16_cuda != nullptr);
1219
- size_t ne = row_diff*ne00;
1220
- src0_as_f16.alloc(ne);
1221
- to_fp16_cuda(src0_dd_i, src0_as_f16.get(), ne, stream);
1222
- }
1223
- const half * src0_ptr = src0->type == GGML_TYPE_F16 ? (const half *) src0_dd_i : src0_as_f16.get();
1224
-
1225
- ggml_cuda_pool_alloc<half> src1_as_f16(ctx.pool(id));
1226
- if (src1->type != GGML_TYPE_F16) {
1227
- const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
1228
- GGML_ASSERT(to_fp16_cuda != nullptr);
1229
- size_t ne = src1_ncols*ne10;
1230
- src1_as_f16.alloc(ne);
1231
- to_fp16_cuda(src1_ddf_i, src1_as_f16.get(), ne, stream);
1232
- }
1233
- const half * src1_ptr = src1->type == GGML_TYPE_F16 ? (const half *) src1_ddf_i : src1_as_f16.get();
// temporary half-precision result, row_diff x src1_ncols elements
// NOTE(review): the GEMM below writes it with leading dimension ldc, which is
// ne0 on the main device; this branch requires row_diff == src0->ne[1], so
// presumably ne0 == row_diff here - confirm
1234
- ggml_cuda_pool_alloc<half> dst_f16(ctx.pool(id), row_diff*src1_ncols);
1235
-
1236
- const half alpha_f16 = 1.0f;
1237
- const half beta_f16 = 0.0f;
1238
-
1239
- CUBLAS_CHECK(cublasSetStream(ctx.cublas_handle(id), stream));
1240
- CUBLAS_CHECK(
1241
- cublasGemmEx(ctx.cublas_handle(id), CUBLAS_OP_T, CUBLAS_OP_N,
1242
- row_diff, src1_ncols, ne10,
1243
- &alpha_f16, src0_ptr, CUDA_R_16F, ne00,
1244
- src1_ptr, CUDA_R_16F, ne10,
1245
- &beta_f16, dst_f16.get(), CUDA_R_16F, ldc,
1246
- CUBLAS_COMPUTE_16F,
1247
- CUBLAS_GEMM_DEFAULT_TENSOR_OP));
1248
-
1249
- const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
1250
- to_fp32_cuda(dst_f16.get(), dst_dd_i, row_diff*src1_ncols, stream);
1251
- } else {
// FP32 path: convert whichever inputs are not already F32, then SGEMM
1252
- ggml_cuda_pool_alloc<float> src0_ddq_as_f32(ctx.pool(id));
1253
- ggml_cuda_pool_alloc<float> src1_ddq_as_f32(ctx.pool(id));
1254
-
1255
- if (src0->type != GGML_TYPE_F32) {
1256
- const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type);
1257
- GGML_ASSERT(to_fp32_cuda != nullptr);
1258
- src0_ddq_as_f32.alloc(row_diff*ne00);
1259
- to_fp32_cuda(src0_dd_i, src0_ddq_as_f32.get(), row_diff*ne00, stream);
1260
- }
1261
- if (src1->type != GGML_TYPE_F32) {
1262
- const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src1->type);
1263
- GGML_ASSERT(to_fp32_cuda != nullptr);
1264
- src1_ddq_as_f32.alloc(src1_ncols*ne10);
1265
- to_fp32_cuda(src1_ddf_i, src1_ddq_as_f32.get(), src1_ncols*ne10, stream);
1266
- }
1267
-
1268
- const float * src0_ddf_i = src0->type == GGML_TYPE_F32 ? (const float *) src0_dd_i : src0_ddq_as_f32.get();
1269
- const float * src1_ddf1_i = src1->type == GGML_TYPE_F32 ? (const float *) src1_ddf_i : src1_ddq_as_f32.get();
1270
-
1271
- const float alpha = 1.0f;
1272
- const float beta = 0.0f;
1273
-
1274
- CUBLAS_CHECK(cublasSetStream(ctx.cublas_handle(id), stream));
1275
- CUBLAS_CHECK(
1276
- cublasSgemm(ctx.cublas_handle(id), CUBLAS_OP_T, CUBLAS_OP_N,
1277
- row_diff, src1_ncols, ne10,
1278
- &alpha, src0_ddf_i, ne00,
1279
- src1_ddf1_i, ne10,
1280
- &beta, dst_dd_i, ldc));
1281
- }
1282
-
1283
- GGML_UNUSED(dst);
1284
- GGML_UNUSED(src1_ddq_i);
1285
- GGML_UNUSED(src1_padded_row_size);
1286
- }
1287
-
1288
- static void ggml_cuda_set_peer_access(const int n_tokens, int main_device) {
1289
- static bool peer_access_enabled = false;
1290
-
1291
- const bool enable_peer_access = n_tokens <= GGML_CUDA_PEER_MAX_BATCH_SIZE;
1292
-
1293
- if (peer_access_enabled == enable_peer_access) {
1294
- return;
1295
- }
1296
-
1297
- #ifdef NDEBUG
1298
- for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
1299
- ggml_cuda_set_device(id);
1300
- CUDA_CHECK(cudaDeviceSynchronize());
1301
- }
1302
-
1303
- for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
1304
- ggml_cuda_set_device(id);
1305
-
1306
- for (int id_other = 0; id_other < ggml_backend_cuda_get_device_count(); ++id_other) {
1307
- if (id == id_other) {
1308
- continue;
1309
- }
1310
- if (id != main_device && id_other != main_device) {
1311
- continue;
1312
- }
1313
-
1314
- int can_access_peer;
1315
- CUDA_CHECK(cudaDeviceCanAccessPeer(&can_access_peer, id, id_other));
1316
- if (can_access_peer) {
1317
- if (enable_peer_access) {
1318
- cudaError_t err = cudaDeviceEnablePeerAccess(id_other, 0);
1319
- if (err != cudaErrorPeerAccessAlreadyEnabled) {
1320
- CUDA_CHECK(err);
1321
- }
1322
- } else {
1323
- cudaError_t err = cudaDeviceDisablePeerAccess(id_other);
1324
- if (err != cudaErrorPeerAccessNotEnabled) {
1325
- CUDA_CHECK(err);
1326
- }
1327
- }
1328
- }
1329
- }
1330
- }
1331
-
1332
- ggml_cuda_set_device(main_device);
1333
- #endif // NDEBUG
1334
-
1335
- peer_access_enabled = enable_peer_access;
1336
-
1337
- GGML_UNUSED(main_device);
1338
- }
1339
-
1340
- static cudaError_t ggml_cuda_Memcpy2DPeerAsync(
1341
- void * dst, int dstDevice, size_t dpitch, void * src, int srcDevice, size_t spitch, size_t width, size_t height, cudaStream_t stream) {
1342
-
1343
- #if !defined(GGML_USE_HIPBLAS)
1344
- // cudaMemcpy2DAsync may fail with copies between vmm pools of different devices
1345
- cudaMemcpy3DPeerParms p = {};
1346
- p.dstDevice = dstDevice;
1347
- p.dstPtr = make_cudaPitchedPtr(dst, dpitch, dpitch, height);
1348
- p.srcDevice = srcDevice;
1349
- p.srcPtr = make_cudaPitchedPtr(src, spitch, spitch, height);
1350
- p.extent = make_cudaExtent(width, height, 1);
1351
- return cudaMemcpy3DPeerAsync(&p, stream);
1352
- #else
1353
- // HIP does not support cudaMemcpy3DPeerAsync or vmm pools
1354
- GGML_UNUSED(dstDevice);
1355
- GGML_UNUSED(srcDevice);
1356
- return cudaMemcpy2DAsync(dst, dpitch, src, spitch, width, height, cudaMemcpyDeviceToDevice, stream);
1357
- #endif // !defined(GGML_USE_HIPBLAS)
1358
- }
1359
-
1360
- static void ggml_cuda_op_mul_mat(
1361
- ggml_backend_cuda_context & ctx,
1362
- const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, ggml_cuda_op_mul_mat_t op,
1363
- quantize_cuda_t quantize_src1) {
1364
-
1365
- const int64_t ne00 = src0->ne[0];
1366
- const int64_t ne01 = src0->ne[1];
1367
- const int64_t ne02 = src0->ne[2];
1368
- const int64_t ne03 = src0->ne[3];
1369
-
1370
- const int64_t ne10 = src1->ne[0];
1371
- const int64_t ne11 = src1->ne[1];
1372
- const int64_t ne12 = src1->ne[2];
1373
- const int64_t ne13 = src1->ne[3];
1374
- const int64_t nrows1 = ggml_nrows(src1);
1375
-
1376
- GGML_ASSERT(ne03 == ne13);
1377
-
1378
- const int64_t ne0 = dst->ne[0];
1379
- const int64_t ne1 = dst->ne[1];
1380
-
1381
- const int64_t nb2 = dst->nb[2];
1382
- const int64_t nb3 = dst->nb[3];
1383
-
1384
- GGML_ASSERT(ggml_backend_buffer_is_cuda(dst->buffer));
1385
- GGML_ASSERT(ggml_backend_buffer_is_cuda(src1->buffer));
1386
- ggml_backend_cuda_buffer_context * src1_ctx = (ggml_backend_cuda_buffer_context *) src1->buffer->context;
1387
- ggml_backend_cuda_buffer_context * dst_ctx = (ggml_backend_cuda_buffer_context *) dst->buffer->context;
1388
-
1389
- GGML_ASSERT(src1->type == GGML_TYPE_F32 || (src1->ne[2] == 1 && src1->ne[3] == 1));
1390
-
1391
- GGML_ASSERT(ne12 >= ne02 && ne12 % ne02 == 0);
1392
-
1393
- const int64_t i02_divisor = ne12 / ne02;
1394
-
1395
- const size_t src0_ts = ggml_type_size(src0->type);
1396
- const size_t src0_bs = ggml_blck_size(src0->type);
1397
- const size_t q8_1_ts = sizeof(block_q8_1);
1398
- const size_t q8_1_bs = QK8_1;
1399
-
1400
- const bool src0_is_contiguous = ggml_is_contiguous(src0);
1401
- const bool src1_is_contiguous = ggml_is_contiguous(src1);
1402
-
1403
- const int64_t src1_padded_col_size = GGML_PAD(ne10, MATRIX_ROW_PADDING);
1404
-
1405
- const bool split = ggml_backend_buffer_is_cuda_split(src0->buffer);
1406
- GGML_ASSERT(!(split && ne02 > 1));
1407
- GGML_ASSERT(!(split && ne03 > 1));
1408
- GGML_ASSERT(!(split && ne02 < ne12));
1409
-
1410
- ggml_tensor_extra_gpu * src0_extra = split ? (ggml_tensor_extra_gpu *) src0->extra : nullptr;
1411
-
1412
-
1413
- std::array<float, GGML_CUDA_MAX_DEVICES> tensor_split;
1414
- if (split) {
1415
- ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) src0->buffer->buft->context;
1416
- tensor_split = buft_ctx->tensor_split;
1417
- }
1418
-
1419
- struct dev_data {
1420
- int cc;
1421
-
1422
- ggml_cuda_pool_alloc<char> src0_dd_alloc;
1423
- ggml_cuda_pool_alloc<float> src1_ddf_alloc;
1424
- ggml_cuda_pool_alloc<char> src1_ddq_alloc;
1425
- ggml_cuda_pool_alloc<float> dst_dd_alloc;
1426
-
1427
- char * src0_dd = nullptr;
1428
- float * src1_ddf = nullptr; // float
1429
- char * src1_ddq = nullptr; // q8_1
1430
- float * dst_dd = nullptr;
1431
-
1432
- int64_t row_low;
1433
- int64_t row_high;
1434
- };
1435
-
1436
- dev_data dev[GGML_CUDA_MAX_DEVICES];
1437
-
1438
- int used_devices = 0;
1439
-
1440
- for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
1441
- dev[id].cc = ggml_cuda_info().devices[id].cc;
1442
-
1443
- // by default, use all rows
1444
- dev[id].row_low = 0;
1445
- dev[id].row_high = ne01;
1446
-
1447
- // for multi GPU, get the row boundaries from tensor split
1448
- // and round to mul_mat_q tile sizes
1449
- if (split) {
1450
- const int64_t rounding = get_row_rounding(tensor_split);
1451
-
1452
- if (id != 0) {
1453
- dev[id].row_low = ne01*tensor_split[id];
1454
- if (dev[id].row_low < ne01) {
1455
- dev[id].row_low -= dev[id].row_low % rounding;
1456
- }
1457
- }
1458
-
1459
- if (id != ggml_backend_cuda_get_device_count() - 1) {
1460
- dev[id].row_high = ne01*tensor_split[id + 1];
1461
- if (dev[id].row_high < ne01) {
1462
- dev[id].row_high -= dev[id].row_high % rounding;
1463
- }
1464
- }
1465
- }
1466
- }
1467
-
1468
- for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
1469
- if ((!split && id != ctx.device) || dev[id].row_low == dev[id].row_high) {
1470
- continue;
1471
- }
1472
-
1473
- used_devices++;
1474
-
1475
- const bool src1_on_device = id == src1_ctx->device;
1476
- const bool dst_on_device = id == dst_ctx->device;
1477
-
1478
- ggml_cuda_set_device(id);
1479
- cudaStream_t stream = ctx.stream(id, 0);
1480
-
1481
- if (src0_is_contiguous) {
1482
- dev[id].src0_dd = split ? (char *) src0_extra->data_device[id] : (char *) src0->data;
1483
- } else {
1484
- dev[id].src0_dd = dev[id].src0_dd_alloc.alloc(ctx.pool(id), ggml_nbytes(src0));
1485
- }
1486
-
1487
- if (src1_on_device && src1_is_contiguous) {
1488
- dev[id].src1_ddf = (float *) src1->data;
1489
- } else {
1490
- dev[id].src1_ddf = dev[id].src1_ddf_alloc.alloc(ctx.pool(id), ggml_nelements(src1));
1491
- }
1492
-
1493
- if (quantize_src1) {
1494
- size_t src_1_ddq_size = nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs;
1495
- if (quantize_src1 == quantize_mmq_q8_1_cuda) {
1496
- src_1_ddq_size += get_mmq_x_max_host(dev[id].cc)*sizeof(block_q8_1_mmq);
1497
- }
1498
- dev[id].src1_ddq = dev[id].src1_ddq_alloc.alloc(ctx.pool(id), src_1_ddq_size);
1499
-
1500
- if (src1_on_device && src1_is_contiguous) {
1501
- quantize_src1(dev[id].src1_ddf, dev[id].src1_ddq, ne10, ne11, ne12*ne13, src1_padded_col_size, src0->type, stream);
1502
- CUDA_CHECK(cudaGetLastError());
1503
- }
1504
- }
1505
-
1506
- if (dst_on_device) {
1507
- dev[id].dst_dd = (float *) dst->data;
1508
- } else {
1509
- const size_t size_dst_ddf = split ? (dev[id].row_high - dev[id].row_low)*ne1 : ggml_nelements(dst);
1510
- dev[id].dst_dd = dev[id].dst_dd_alloc.alloc(ctx.pool(id), size_dst_ddf);
1511
- }
1512
- }
1513
-
1514
- // if multiple devices are used they need to wait for the main device
1515
- // here an event is recorded that signals that the main device has finished calculating the input data
1516
- if (split && used_devices > 1) {
1517
- ggml_cuda_set_device(ctx.device);
1518
- CUDA_CHECK(cudaEventRecord(src0_extra->events[ctx.device][0], ctx.stream()));
1519
- }
1520
-
1521
- const int64_t src1_col_stride = split && used_devices > 1 ? MUL_MAT_SRC1_COL_STRIDE : ne11;
1522
- for (int64_t src1_col_0 = 0; src1_col_0 < ne11; src1_col_0 += src1_col_stride) {
1523
- const int64_t is = split ? (src1_col_0/src1_col_stride) % GGML_CUDA_MAX_STREAMS : 0;
1524
- const int64_t src1_ncols = src1_col_0 + src1_col_stride > ne11 ? ne11 - src1_col_0 : src1_col_stride;
1525
-
1526
- for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
1527
- if ((!split && id != ctx.device) || dev[id].row_low == dev[id].row_high) {
1528
- continue;
1529
- }
1530
-
1531
- const bool src1_on_device = id == src1_ctx->device;
1532
- const bool dst_on_device = id == dst_ctx->device;
1533
- const int64_t row_diff = dev[id].row_high - dev[id].row_low;
1534
-
1535
- ggml_cuda_set_device(id);
1536
- cudaStream_t stream = ctx.stream(id, is);
1537
-
1538
- // wait for main GPU data if necessary
1539
- if (split && (id != ctx.device || is != 0)) {
1540
- CUDA_CHECK(cudaStreamWaitEvent(stream, src0_extra->events[ctx.device][0], 0));
1541
- }
1542
-
1543
- for (int64_t i0 = 0; i0 < ne13*ne12; ++i0) {
1544
- const int64_t i03 = i0 / ne12;
1545
- const int64_t i02 = i0 % ne12;
1546
-
1547
- size_t src1_ddq_i_offset = i0*ne11 * src1_padded_col_size*q8_1_ts/q8_1_bs;
1548
- if (quantize_src1 == quantize_mmq_q8_1_cuda) {
1549
- src1_ddq_i_offset += src1_col_0 * sizeof(block_q8_1_mmq);
1550
- } else {
1551
- src1_ddq_i_offset += src1_col_0 * src1_padded_col_size*q8_1_ts/q8_1_bs;
1552
- }
1553
-
1554
- // for split tensors the data begins at i0 == i0_offset_low
1555
- char * src0_dd_i = dev[id].src0_dd + (i0/i02_divisor) * (ne01*ne00*src0_ts)/src0_bs;
1556
- float * src1_ddf_i = dev[id].src1_ddf + (i0*ne11 + src1_col_0) * ne10;
1557
- char * src1_ddq_i = dev[id].src1_ddq + src1_ddq_i_offset;
1558
- float * dst_dd_i = dev[id].dst_dd + (i0*ne1 + src1_col_0) * (dst_on_device ? ne0 : row_diff);
1559
-
1560
- // the main device memory buffer can be on VRAM scratch, with space for all partial results
1561
- // in that case an offset on dst_ddf_i is needed
1562
- if (id == ctx.device) {
1563
- dst_dd_i += dev[id].row_low; // offset is 0 if no tensor split
1564
- }
1565
-
1566
- // copy src0, src1 to device if necessary
1567
- if (src1_is_contiguous) {
1568
- if (id != ctx.device) {
1569
- if (quantize_src1) {
1570
- char * src1_ddq_i_source = dev[ctx.device].src1_ddq + src1_ddq_i_offset;
1571
- if (quantize_src1 == quantize_mmq_q8_1_cuda) {
1572
- const size_t pitch = ne11*sizeof(block_q8_1_mmq);
1573
- const size_t width = src1_ncols*sizeof(block_q8_1_mmq);
1574
- const size_t height = src1_padded_col_size/(4*QK8_1);
1575
- CUDA_CHECK(ggml_cuda_Memcpy2DPeerAsync(src1_ddq_i, id, pitch, src1_ddq_i_source, ctx.device, pitch, width, height, stream));
1576
- } else {
1577
- CUDA_CHECK(cudaMemcpyPeerAsync(
1578
- src1_ddq_i, id, src1_ddq_i_source, ctx.device, src1_ncols*src1_padded_col_size*q8_1_ts/q8_1_bs, stream));
1579
- }
1580
- } else {
1581
- float * src1_ddf_i_source = (float *) src1->data;
1582
- src1_ddf_i_source += (i0*ne11 + src1_col_0) * ne10;
1583
- CUDA_CHECK(cudaMemcpyPeerAsync(src1_ddf_i, id, src1_ddf_i_source, ctx.device,
1584
- src1_ncols*ne10*sizeof(float), stream));
1585
- }
1586
- }
1587
- } else if (src1_on_device && !src1_is_contiguous) {
1588
- CUDA_CHECK(ggml_cuda_cpy_tensor_2d(
1589
- src1_ddf_i, src1, i03, i02, src1_col_0, src1_col_0+src1_ncols, stream));
1590
- } else {
1591
- GGML_ASSERT(false);
1592
- }
1593
-
1594
- if (quantize_src1 && !src1_is_contiguous) {
1595
- quantize_src1(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, 1, src1_padded_col_size, src0->type, stream);
1596
- CUDA_CHECK(cudaGetLastError());
1597
- }
1598
-
1599
- if (src1_col_0 == 0 && !src0_is_contiguous && i02 % i02_divisor == 0) {
1600
- CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_dd_i, src0, i03, i02/i02_divisor, dev[id].row_low, dev[id].row_high, stream));
1601
- }
1602
-
1603
- // do the computation
1604
- op(ctx, src0, src1, dst, src0_dd_i, src1_ddf_i, src1_ddq_i, dst_dd_i,
1605
- dev[id].row_low, dev[id].row_high, src1_ncols, src1_padded_col_size, stream);
1606
- CUDA_CHECK(cudaGetLastError());
1607
-
1608
- // copy dst to host or other device if necessary
1609
- if (!dst_on_device) {
1610
- void * dst_off_device = dst->data;
1611
- if (split) {
1612
- // src0 = weight matrix is saved as a transposed matrix for better memory layout.
1613
- // dst is NOT transposed.
1614
- // The outputs of matrix matrix multiplications can therefore NOT simply be concatenated for >1 GPU.
1615
- // Instead they need to be copied to the correct slice in ne0 = dst row index.
1616
- // If dst is a vector with ne0 == 1 then you don't have to do this but it still produces correct results.
1617
- float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
1618
- GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
1619
- dhf_dst_i += src1_col_0*ne0 + dev[id].row_low;
1620
- CUDA_CHECK(ggml_cuda_Memcpy2DPeerAsync(
1621
- dhf_dst_i, ctx.device, ne0*sizeof(float), dst_dd_i, id, row_diff*sizeof(float), row_diff*sizeof(float), src1_ncols, stream));
1622
- } else {
1623
- float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
1624
- GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
1625
- dhf_dst_i += src1_col_0*ne0;
1626
- CUDA_CHECK(cudaMemcpyAsync(dhf_dst_i, dst_dd_i, src1_ncols*ne0*sizeof(float), cudaMemcpyDeviceToDevice, stream));
1627
- }
1628
- }
1629
-
1630
- // add event for the main device to wait on until other device is done
1631
- if (split && (id != ctx.device || is != 0)) {
1632
- CUDA_CHECK(cudaEventRecord(src0_extra->events[id][is], stream));
1633
- }
1634
- }
1635
- }
1636
- }
1637
-
1638
- // main device waits for all other devices to be finished
1639
- if (split && ggml_backend_cuda_get_device_count() > 1) {
1640
- int64_t is_max = (ne11 + MUL_MAT_SRC1_COL_STRIDE - 1) / MUL_MAT_SRC1_COL_STRIDE;
1641
- is_max = is_max <= GGML_CUDA_MAX_STREAMS ? is_max : GGML_CUDA_MAX_STREAMS;
1642
-
1643
- ggml_cuda_set_device(ctx.device);
1644
- for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
1645
- if (dev[id].row_low == dev[id].row_high) {
1646
- continue;
1647
- }
1648
- for (int64_t is = 0; is < is_max; ++is) {
1649
- CUDA_CHECK(cudaStreamWaitEvent(ctx.stream(), src0_extra->events[id][is], 0));
1650
- }
1651
- }
1652
- }
1653
- }
1654
-
1655
- static void ggml_cuda_mul_mat_vec_p021(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
1656
- GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1));
1657
- GGML_ASSERT(ggml_backend_buffer_is_cuda(src0->buffer));
1658
- GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // 0213 permutation
1659
- GGML_ASSERT(src1->nb[0] <= src1->nb[1] && src1->nb[2] <= src1->nb[3]); // 0213 permutation
1660
- GGML_ASSERT(src0->type == GGML_TYPE_F16);
1661
- GGML_ASSERT(src1->type == GGML_TYPE_F32);
1662
-
1663
- const int64_t ne00 = src0->ne[0];
1664
- const int64_t ne01 = src0->ne[1];
1665
- const int64_t ne02 = src0->ne[2];
1666
-
1667
- const int64_t ne12 = src1->ne[2];
1668
-
1669
- cudaStream_t main_stream = ctx.stream();
1670
-
1671
- void * src0_ddq = src0->data;
1672
- float * src1_ddf = (float *) src1->data;
1673
- float * dst_ddf = (float *) dst->data;
1674
-
1675
- ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12, main_stream);
1676
- }
1677
-
1678
- static void ggml_cuda_mul_mat_vec_nc(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
1679
- GGML_ASSERT(!ggml_is_transposed(src0));
1680
- GGML_ASSERT(!ggml_is_transposed(src1));
1681
- GGML_ASSERT(!ggml_is_permuted(src0));
1682
- GGML_ASSERT(ggml_backend_buffer_is_cuda(src0->buffer));
1683
- GGML_ASSERT(src0->type == GGML_TYPE_F16);
1684
- GGML_ASSERT(src1->type == GGML_TYPE_F32);
1685
-
1686
- const int64_t ne00 = src0->ne[0];
1687
- const int64_t ne01 = src0->ne[1];
1688
- const int64_t ne02 = src0->ne[2];
1689
-
1690
- const int64_t nb01 = src0->nb[1];
1691
- const int64_t nb02 = src0->nb[2];
1692
-
1693
- const int64_t ne12 = src1->ne[2];
1694
-
1695
- cudaStream_t main_stream = ctx.stream();
1696
-
1697
- void * src0_ddq = src0->data;
1698
- float * src1_ddf = (float *) src1->data;
1699
- float * dst_ddf = (float *) dst->data;
1700
-
1701
- const int64_t row_stride_x = nb01 / sizeof(half);
1702
- const int64_t channel_stride_x = nb02 / sizeof(half);
1703
-
1704
- ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, main_stream);
1705
- }
1706
-
1707
- static __global__ void k_compute_batched_ptrs(
1708
- const half * src0_as_f16, const half * src1_as_f16, char * dst,
1709
- const void ** ptrs_src, void ** ptrs_dst,
1710
- int64_t ne12, int64_t ne13,
1711
- int64_t ne23,
1712
- size_t nb02, size_t nb03,
1713
- size_t nb12, size_t nb13,
1714
- size_t nbd2, size_t nbd3,
1715
- int64_t r2, int64_t r3) {
1716
- int64_t i13 = blockIdx.x * blockDim.x + threadIdx.x;
1717
- int64_t i12 = blockIdx.y * blockDim.y + threadIdx.y;
1718
-
1719
- if (i13 >= ne13 || i12 >= ne12) {
1720
- return;
1721
- }
1722
-
1723
- int64_t i03 = i13 / r3;
1724
- int64_t i02 = i12 / r2;
1725
-
1726
- ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_as_f16 + i02*nb02 + i03*nb03;
1727
- ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_as_f16 + i12*nb12 + i13*nb13;
1728
- ptrs_dst[0*ne23 + i12 + i13*ne12] = ( char *) dst + i12*nbd2 + i13*nbd3;
1729
- }
1730
-
1731
- static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
1732
- GGML_ASSERT(!ggml_is_transposed(src0));
1733
- GGML_ASSERT(!ggml_is_transposed(src1));
1734
-
1735
- GGML_ASSERT(ggml_backend_buffer_is_cuda(src0->buffer));
1736
- GGML_ASSERT(src0->type == GGML_TYPE_F16);
1737
-
1738
- GGML_TENSOR_BINARY_OP_LOCALS
1739
-
1740
- const int64_t ne_dst = ggml_nelements(dst);
1741
-
1742
- cudaStream_t main_stream = ctx.stream();
1743
-
1744
- CUBLAS_CHECK(cublasSetStream(ctx.cublas_handle(), main_stream));
1745
-
1746
- void * src0_ddq = src0->data;
1747
- half * src0_f16 = (half *) src0_ddq;
1748
- float * src1_ddf = (float *) src1->data;
1749
- float * dst_ddf = (float *) dst->data;
1750
-
1751
- // convert src1 to fp16
1752
- ggml_cuda_pool_alloc<half> src1_f16_alloc(ctx.pool());
1753
- if (src1->type != GGML_TYPE_F16) {
1754
- const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
1755
- const int64_t ne_src1 = ggml_nelements(src1);
1756
- src1_f16_alloc.alloc(ne_src1);
1757
- GGML_ASSERT(to_fp16_cuda != nullptr);
1758
- to_fp16_cuda(src1_ddf, src1_f16_alloc.get(), ne_src1, main_stream);
1759
- }
1760
- half * src1_f16 = src1->type == GGML_TYPE_F16 ? (half *) src1_ddf : src1_f16_alloc.get();
1761
-
1762
- ggml_cuda_pool_alloc<half> dst_f16(ctx.pool());
1763
- char * dst_t;
1764
-
1765
- cublasComputeType_t cu_compute_type = CUBLAS_COMPUTE_16F;
1766
- cudaDataType_t cu_data_type = CUDA_R_16F;
1767
-
1768
- // dst strides
1769
- size_t nbd2 = dst->nb[2];
1770
- size_t nbd3 = dst->nb[3];
1771
-
1772
- const half alpha_f16 = 1.0f;
1773
- const half beta_f16 = 0.0f;
1774
-
1775
- const float alpha_f32 = 1.0f;
1776
- const float beta_f32 = 0.0f;
1777
-
1778
- const void * alpha = &alpha_f16;
1779
- const void * beta = &beta_f16;
1780
-
1781
- if (dst->op_params[0] == GGML_PREC_DEFAULT) {
1782
- dst_t = (char *) dst_f16.alloc(ne_dst);
1783
-
1784
- nbd2 /= sizeof(float) / sizeof(half);
1785
- nbd3 /= sizeof(float) / sizeof(half);
1786
- } else {
1787
- dst_t = (char *) dst_ddf;
1788
-
1789
- cu_compute_type = CUBLAS_COMPUTE_32F;
1790
- cu_data_type = CUDA_R_32F;
1791
-
1792
- alpha = &alpha_f32;
1793
- beta = &beta_f32;
1794
- }
1795
-
1796
- GGML_ASSERT(ne12 % ne02 == 0);
1797
- GGML_ASSERT(ne13 % ne03 == 0);
1798
-
1799
- // broadcast factors
1800
- const int64_t r2 = ne12/ne02;
1801
- const int64_t r3 = ne13/ne03;
1802
-
1803
- #if 0
1804
- // use cublasGemmEx
1805
- {
1806
- for (int i13 = 0; i13 < ne13; ++i13) {
1807
- for (int i12 = 0; i12 < ne12; ++i12) {
1808
- int i03 = i13 / r3;
1809
- int i02 = i12 / r2;
1810
-
1811
- CUBLAS_CHECK(
1812
- cublasGemmEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
1813
- ne01, ne11, ne10,
1814
- alpha, (const char *) src0_as_f16 + i02*src0->nb[2] + i03*src0->nb[3] , CUDA_R_16F, nb01/sizeof(half),
1815
- (const char *) src1_as_f16 + i12*src1->nb[2]/2 + i13*src1->nb[3]/2, CUDA_R_16F, nb11/sizeof(float),
1816
- beta, ( char *) dst_t + i12*nbd2 + i13*nbd3, cu_data_type, ne01,
1817
- cu_compute_type,
1818
- CUBLAS_GEMM_DEFAULT_TENSOR_OP));
1819
- }
1820
- }
1821
- }
1822
- #else
1823
- if (r2 == 1 && r3 == 1 && ggml_is_contiguous_2(src0) && ggml_is_contiguous_2(src1)) {
1824
- // there is no broadcast and src0, src1 are contiguous across dims 2, 3
1825
- // use cublasGemmStridedBatchedEx
1826
- CUBLAS_CHECK(
1827
- cublasGemmStridedBatchedEx(ctx.cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_N,
1828
- ne01, ne11, ne10,
1829
- alpha, (const char *) src0_f16, CUDA_R_16F, nb01/nb00, nb02/nb00, // strideA
1830
- (const char *) src1_f16, CUDA_R_16F, nb11/nb10, nb12/nb10, // strideB
1831
- beta, ( char *) dst_t, cu_data_type, ne01, nb2/nb0, // strideC
1832
- ne12*ne13,
1833
- cu_compute_type,
1834
- CUBLAS_GEMM_DEFAULT_TENSOR_OP));
1835
- } else {
1836
- // use cublasGemmBatchedEx
1837
- const int ne23 = ne12*ne13;
1838
-
1839
- ggml_cuda_pool_alloc<const void *> ptrs_src(ctx.pool(), 2*ne23);
1840
- ggml_cuda_pool_alloc< void *> ptrs_dst(ctx.pool(), 1*ne23);
1841
-
1842
- dim3 block_dims(ne13, ne12);
1843
- k_compute_batched_ptrs<<<1, block_dims, 0, main_stream>>>(
1844
- src0_f16, src1_f16, dst_t,
1845
- ptrs_src.get(), ptrs_dst.get(),
1846
- ne12, ne13,
1847
- ne23,
1848
- nb02, nb03,
1849
- src1->type == GGML_TYPE_F16 ? nb12 : nb12/2,
1850
- src1->type == GGML_TYPE_F16 ? nb13 : nb13/2,
1851
- nbd2, nbd3,
1852
- r2, r3);
1853
- CUDA_CHECK(cudaGetLastError());
1854
-
1855
- CUBLAS_CHECK(
1856
- cublasGemmBatchedEx(ctx.cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_N,
1857
- ne01, ne11, ne10,
1858
- alpha, (const void **) (ptrs_src.get() + 0*ne23), CUDA_R_16F, nb01/nb00,
1859
- (const void **) (ptrs_src.get() + 1*ne23), CUDA_R_16F, nb11/nb10,
1860
- beta, ( void **) (ptrs_dst.get() + 0*ne23), cu_data_type, ne01,
1861
- ne23,
1862
- cu_compute_type,
1863
- CUBLAS_GEMM_DEFAULT_TENSOR_OP));
1864
- }
1865
- #endif
1866
-
1867
- if (dst->op_params[0] == GGML_PREC_DEFAULT) {
1868
- const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
1869
- to_fp32_cuda(dst_f16.get(), dst_ddf, ne_dst, main_stream);
1870
- }
1871
- }
1872
-
1873
- static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
1874
- const bool split = ggml_backend_buffer_is_cuda_split(src0->buffer);
1875
-
1876
- int64_t min_compute_capability = INT_MAX;
1877
-
1878
- bool any_pascal_with_slow_fp16 = false;
1879
- if (split) {
1880
- ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) src0->buffer->buft->context;
1881
- auto & tensor_split = buft_ctx->tensor_split;
1882
- for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
1883
- // skip devices that are not going to do any work:
1884
- if (tensor_split[id] >= (id + 1 < ggml_backend_cuda_get_device_count() ? tensor_split[id + 1] : 1.0f)) {
1885
- continue;
1886
- }
1887
-
1888
- if (min_compute_capability > ggml_cuda_info().devices[id].cc) {
1889
- min_compute_capability = ggml_cuda_info().devices[id].cc;
1890
- }
1891
- if (ggml_cuda_info().devices[id].cc == 610) {
1892
- any_pascal_with_slow_fp16 = true;
1893
- }
1894
- }
1895
- } else {
1896
- min_compute_capability = ggml_cuda_info().devices[ctx.device].cc;
1897
- any_pascal_with_slow_fp16 = ggml_cuda_info().devices[ctx.device].cc == 610;
1898
- }
1899
-
1900
- // check data types and tensor shapes for custom matrix multiplication kernels:
1901
- bool use_dequantize_mul_mat_vec = (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16)
1902
- && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
1903
- && src0->ne[0] % GGML_CUDA_DMMV_X == 0 && src1->ne[1] == 1;
1904
-
1905
- bool use_mul_mat_vec_q = ggml_is_quantized(src0->type)
1906
- && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
1907
- && src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;
1908
-
1909
- bool use_mul_mat_q = ggml_cuda_supports_mmq(src0->type)
1910
- && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
1911
-
1912
- #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
1913
-
1914
- const bool fp16_performance_good = min_compute_capability >= CC_RDNA1;
1915
-
1916
- #ifdef CUDA_USE_TENSOR_CORES
1917
- use_mul_mat_q = use_mul_mat_q && min_compute_capability < CC_RDNA3;
1918
- #endif // CUDA_USE_TENSOR_CORES
1919
-
1920
- #else
1921
-
1922
- // fp16 performance is good on Volta or newer and on P100 (compute capability 6.0)
1923
- const bool fp16_performance_good = min_compute_capability >= CC_PASCAL && !any_pascal_with_slow_fp16;
1924
-
1925
- // mmvq and mmq need the __dp4a instruction which on NVIDIA is only available for CC >= 6.1
1926
- use_mul_mat_vec_q = use_mul_mat_vec_q && min_compute_capability >= MIN_CC_DP4A;
1927
- use_mul_mat_q = use_mul_mat_q && min_compute_capability >= MIN_CC_DP4A;
1928
-
1929
- #ifdef CUDA_USE_TENSOR_CORES
1930
- // when tensor cores are available, use them for large batch size
1931
- // ref: https://github.com/ggerganov/llama.cpp/pull/3776
1932
- use_mul_mat_q = use_mul_mat_q && (!fp16_performance_good || src1->ne[1] <= MMQ_MAX_BATCH_SIZE);
1933
- #endif // CUDA_USE_TENSOR_CORES
1934
-
1935
- #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
1936
-
1937
- // if mmvq is available it's a better choice than dmmv:
1938
- #ifndef GGML_CUDA_FORCE_DMMV
1939
- use_dequantize_mul_mat_vec = use_dequantize_mul_mat_vec && !use_mul_mat_vec_q;
1940
- #endif // GGML_CUDA_FORCE_DMMV
1941
-
1942
- // debug helpers
1943
- //printf("src0: %8d %8d %8d %8d\n", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]);
1944
- //printf(" %8d %8d %8d %8d\n", src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3]);
1945
- //printf("src1: %8d %8d %8d %8d\n", src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3]);
1946
- //printf(" %8d %8d %8d %8d\n", src1->nb[0], src1->nb[1], src1->nb[2], src1->nb[3]);
1947
- //printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
1948
- //printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);
1949
-
1950
- if (!split && !fp16_performance_good && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
1951
- // KQ single-batch
1952
- ggml_cuda_mul_mat_vec_p021(ctx, src0, src1, dst);
1953
- } else if (!split && !fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
1954
- // KQV single-batch
1955
- ggml_cuda_mul_mat_vec_nc(ctx, src0, src1, dst);
1956
- } else if (!split && src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || fp16_performance_good) && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
1957
- // KQ + KQV multi-batch
1958
- ggml_cuda_mul_mat_batched_cublas(ctx, src0, src1, dst);
1959
- } else if (use_dequantize_mul_mat_vec) {
1960
- ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, nullptr);
1961
- } else if (use_mul_mat_vec_q) {
1962
- ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, quantize_row_q8_1_cuda);
1963
- } else if (use_mul_mat_q) {
1964
- ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_q, quantize_mmq_q8_1_cuda);
1965
- } else {
1966
- ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_cublas, nullptr);
1967
- }
1968
- }
1969
-
1970
- struct mmid_row_mapping {
1971
- int32_t i1;
1972
- int32_t i2;
1973
- };
1974
-
1975
- static __global__ void k_copy_src1_to_contiguous(const char * __restrict__ src1_original, char * __restrict__ src1_contiguous,
1976
- int * __restrict__ cur_src1_row, mmid_row_mapping * __restrict__ row_mapping,
1977
- const char * __restrict ids, int64_t i02, size_t ids_nb1, size_t ids_nb0,
1978
- int64_t ne11, int64_t ne10,
1979
- size_t nb11, size_t nb12) {
1980
- int32_t iid1 = blockIdx.x;
1981
- int32_t id = blockIdx.y;
1982
-
1983
- const int32_t row_id_i = *(const int32_t *) (ids + iid1*ids_nb1 + id*ids_nb0);
1984
-
1985
- if (row_id_i != i02) {
1986
- return;
1987
- }
1988
-
1989
- const int64_t i11 = id % ne11;
1990
- const int64_t i12 = iid1;
1991
-
1992
- __shared__ int src1_row;
1993
- if (threadIdx.x == 0) {
1994
- src1_row = atomicAdd(cur_src1_row, 1);
1995
- row_mapping[src1_row] = {id, iid1};
1996
- }
1997
- __syncthreads();
1998
-
1999
- const float * src1_row_original = (const float *)(src1_original + i11*nb11 + i12*nb12);
2000
- float * src1_row_contiguous = (float *)(src1_contiguous + src1_row*nb11);
2001
-
2002
- for (int i = threadIdx.x; i < ne10; i += blockDim.x) {
2003
- src1_row_contiguous[i] = src1_row_original[i];
2004
- }
2005
- }
2006
-
2007
- static __global__ void k_copy_dst_from_contiguous(char * __restrict__ dst_original, const char * __restrict__ dst_contiguous,
2008
- const mmid_row_mapping * __restrict__ row_mapping,
2009
- int64_t ne0,
2010
- size_t nb1, size_t nb2) {
2011
- int32_t i = blockIdx.x;
2012
-
2013
- const int32_t i1 = row_mapping[i].i1;
2014
- const int32_t i2 = row_mapping[i].i2;
2015
-
2016
- const float * dst_row_contiguous = (const float *)(dst_contiguous + i*nb1);
2017
- float * dst_row_original = (float *)(dst_original + i1*nb1 + i2*nb2);
2018
-
2019
- for (int j = threadIdx.x; j < ne0; j += blockDim.x) {
2020
- dst_row_original[j] = dst_row_contiguous[j];
2021
- }
2022
- }
2023
-
2024
- static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
2025
- const ggml_tensor * src0 = dst->src[0];
2026
- const ggml_tensor * src1 = dst->src[1];
2027
- const ggml_tensor * ids = dst->src[2];
2028
-
2029
- GGML_TENSOR_BINARY_OP_LOCALS
2030
-
2031
- GGML_ASSERT(!ggml_backend_buffer_is_cuda_split(src0->buffer) && "mul_mat_id does not support split buffers");
2032
-
2033
- cudaStream_t stream = ctx.stream();
2034
-
2035
- const int64_t n_as = ne02;
2036
- const int64_t n_ids = ids->ne[0];
2037
-
2038
- std::vector<char> ids_host(ggml_nbytes(ids));
2039
- const char * ids_dev = (const char *) ids->data;
2040
- CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids_dev, ggml_nbytes(ids), cudaMemcpyDeviceToHost, stream));
2041
- CUDA_CHECK(cudaStreamSynchronize(stream));
2042
-
2043
- ggml_tensor src0_row = *src0;
2044
- ggml_tensor src1_row = *src1;
2045
- ggml_tensor dst_row = *dst;
2046
-
2047
- char * src0_original = (char *) src0->data;
2048
- char * src1_original = (char *) src1->data;
2049
- char * dst_original = (char *) dst->data;
2050
-
2051
- src0_row.ne[2] = 1;
2052
- src0_row.ne[3] = 1;
2053
- src0_row.nb[3] = nb02;
2054
-
2055
- src1_row.ne[1] = 1;
2056
- src1_row.ne[2] = 1;
2057
- src1_row.ne[3] = 1;
2058
- src1_row.nb[2] = nb11;
2059
- src1_row.nb[3] = nb11;
2060
-
2061
- dst_row.ne[1] = 1;
2062
- dst_row.ne[2] = 1;
2063
- dst_row.ne[3] = 1;
2064
- dst_row.nb[2] = nb1;
2065
- dst_row.nb[3] = nb1;
2066
-
2067
- if (ne12 == 1) {
2068
- for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) {
2069
- for (int64_t id = 0; id < n_ids; id++) {
2070
- const int32_t i02 = *(const int32_t *) (ids_host.data() + iid1*ids->nb[1] + id*ids->nb[0]);
2071
-
2072
- GGML_ASSERT(i02 >= 0 && i02 < n_as);
2073
-
2074
- const int64_t i11 = id % ne11;
2075
- const int64_t i12 = iid1;
2076
-
2077
- const int64_t i1 = id;
2078
- const int64_t i2 = i12;
2079
-
2080
- src0_row.data = src0_original + i02*nb02;
2081
- src1_row.data = src1_original + i11*nb11 + i12*nb12;
2082
- dst_row.data = dst_original + i1*nb1 + i2*nb2;
2083
-
2084
- ggml_cuda_mul_mat(ctx, &src0_row, &src1_row, &dst_row);
2085
- }
2086
- }
2087
- } else {
2088
- ggml_cuda_pool_alloc<char> src1_contiguous(ctx.pool(), sizeof(float)*ggml_nelements(src1));
2089
- ggml_cuda_pool_alloc<char> dst_contiguous(ctx.pool(), sizeof(float)*ggml_nelements(dst));
2090
-
2091
- src1_row.data = src1_contiguous.get();
2092
- dst_row.data = dst_contiguous.get();
2093
-
2094
- for (int64_t i02 = 0; i02 < n_as; i02++) {
2095
- int64_t num_src1_rows = 0;
2096
-
2097
- for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) {
2098
- for (int64_t id = 0; id < n_ids; id++) {
2099
- const int32_t row_id_i = *(const int32_t *) (ids_host.data() + iid1*ids->nb[1] + id*ids->nb[0]);
2100
-
2101
- GGML_ASSERT(row_id_i >= 0 && row_id_i < n_as);
2102
-
2103
- if (row_id_i != i02) {
2104
- continue;
2105
- }
2106
-
2107
- num_src1_rows++;
2108
- }
2109
- }
2110
-
2111
- if (num_src1_rows == 0) {
2112
- continue;
2113
- }
2114
-
2115
- ggml_cuda_pool_alloc<int> dev_cur_src1_row(ctx.pool(), 1);
2116
- ggml_cuda_pool_alloc<mmid_row_mapping> dev_row_mapping(ctx.pool(), num_src1_rows);
2117
- CUDA_CHECK(cudaMemsetAsync(dev_cur_src1_row.get(), 0, sizeof(int), stream));
2118
-
2119
- {
2120
- dim3 block_dims(std::min((unsigned int)ne10, 768u));
2121
- dim3 grid_dims(ids->ne[1], n_ids);
2122
- k_copy_src1_to_contiguous<<<grid_dims, block_dims, 0, stream>>>(
2123
- src1_original, src1_contiguous.get(),
2124
- dev_cur_src1_row.get(), dev_row_mapping.get(),
2125
- ids_dev, i02, ids->nb[1], ids->nb[0],
2126
- ne11, ne10,
2127
- nb11, nb12);
2128
- CUDA_CHECK(cudaGetLastError());
2129
- }
2130
-
2131
- src0_row.data = src0_original + i02*nb02;
2132
-
2133
- GGML_ASSERT(nb11 == sizeof(float)*ne10);
2134
- GGML_ASSERT(nb1 == sizeof(float)*ne0);
2135
-
2136
- src1_row.ne[1] = num_src1_rows;
2137
- src1_row.nb[1] = nb11;
2138
- src1_row.nb[2] = num_src1_rows*nb11;
2139
- src1_row.nb[3] = num_src1_rows*nb11;
2140
-
2141
- dst_row.ne[1] = num_src1_rows;
2142
- dst_row.nb[1] = nb1;
2143
- dst_row.nb[2] = num_src1_rows*nb1;
2144
- dst_row.nb[3] = num_src1_rows*nb1;
2145
-
2146
- ggml_cuda_mul_mat(ctx, &src0_row, &src1_row, &dst_row);
2147
-
2148
- {
2149
- dim3 block_dims(std::min((unsigned int)ne0, 768u));
2150
- dim3 grid_dims(num_src1_rows);
2151
- k_copy_dst_from_contiguous<<<grid_dims, block_dims, 0, stream>>>(
2152
- dst_original, dst_contiguous.get(),
2153
- dev_row_mapping.get(),
2154
- ne0,
2155
- nb1, nb2);
2156
- CUDA_CHECK(cudaGetLastError());
2157
- }
2158
- }
2159
- }
2160
- }
2161
-
2162
- static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct ggml_tensor * dst) {
2163
- // why is this here instead of mul_mat?
2164
- if (dst->src[0] != nullptr && ggml_backend_buffer_is_cuda_split(dst->src[0]->buffer)) {
2165
- ggml_cuda_set_peer_access(dst->src[1]->ne[1], ctx.device);
2166
- }
2167
-
2168
- switch (dst->op) {
2169
- case GGML_OP_REPEAT:
2170
- ggml_cuda_op_repeat(ctx, dst);
2171
- break;
2172
- case GGML_OP_GET_ROWS:
2173
- ggml_cuda_op_get_rows(ctx, dst);
2174
- break;
2175
- case GGML_OP_DUP:
2176
- ggml_cuda_dup(ctx, dst);
2177
- break;
2178
- case GGML_OP_CPY:
2179
- ggml_cuda_cpy(ctx, dst->src[0], dst->src[1]);
2180
- break;
2181
- case GGML_OP_CONT:
2182
- ggml_cuda_dup(ctx, dst);
2183
- break;
2184
- case GGML_OP_ADD:
2185
- ggml_cuda_op_add(ctx, dst);
2186
- break;
2187
- case GGML_OP_ACC:
2188
- ggml_cuda_op_acc(ctx, dst);
2189
- break;
2190
- case GGML_OP_MUL:
2191
- ggml_cuda_op_mul(ctx, dst);
2192
- break;
2193
- case GGML_OP_DIV:
2194
- ggml_cuda_op_div(ctx, dst);
2195
- break;
2196
- case GGML_OP_UNARY:
2197
- switch (ggml_get_unary_op(dst)) {
2198
- case GGML_UNARY_OP_GELU:
2199
- ggml_cuda_op_gelu(ctx, dst);
2200
- break;
2201
- case GGML_UNARY_OP_SILU:
2202
- ggml_cuda_op_silu(ctx, dst);
2203
- break;
2204
- case GGML_UNARY_OP_GELU_QUICK:
2205
- ggml_cuda_op_gelu_quick(ctx, dst);
2206
- break;
2207
- case GGML_UNARY_OP_TANH:
2208
- ggml_cuda_op_tanh(ctx, dst);
2209
- break;
2210
- case GGML_UNARY_OP_RELU:
2211
- ggml_cuda_op_relu(ctx, dst);
2212
- break;
2213
- case GGML_UNARY_OP_SIGMOID:
2214
- ggml_cuda_op_sigmoid(ctx, dst);
2215
- break;
2216
- case GGML_UNARY_OP_HARDSIGMOID:
2217
- ggml_cuda_op_hardsigmoid(ctx, dst);
2218
- break;
2219
- case GGML_UNARY_OP_HARDSWISH:
2220
- ggml_cuda_op_hardswish(ctx, dst);
2221
- break;
2222
- default:
2223
- return false;
2224
- }
2225
- break;
2226
- case GGML_OP_NORM:
2227
- ggml_cuda_op_norm(ctx, dst);
2228
- break;
2229
- case GGML_OP_GROUP_NORM:
2230
- ggml_cuda_op_group_norm(ctx, dst);
2231
- break;
2232
- case GGML_OP_CONCAT:
2233
- ggml_cuda_op_concat(ctx, dst);
2234
- break;
2235
- case GGML_OP_UPSCALE:
2236
- ggml_cuda_op_upscale(ctx, dst);
2237
- break;
2238
- case GGML_OP_PAD:
2239
- ggml_cuda_op_pad(ctx, dst);
2240
- break;
2241
- case GGML_OP_ARANGE:
2242
- ggml_cuda_op_arange(ctx, dst);
2243
- break;
2244
- case GGML_OP_TIMESTEP_EMBEDDING:
2245
- ggml_cuda_op_timestep_embedding(ctx, dst);
2246
- break;
2247
- case GGML_OP_LEAKY_RELU:
2248
- ggml_cuda_op_leaky_relu(ctx, dst);
2249
- break;
2250
- case GGML_OP_RMS_NORM:
2251
- ggml_cuda_op_rms_norm(ctx, dst);
2252
- break;
2253
- case GGML_OP_MUL_MAT:
2254
- if (dst->src[0]->ne[3] != dst->src[1]->ne[3]) {
2255
- GGML_CUDA_LOG_ERROR("%s: cannot compute %s: src0->ne[3] = %" PRId64 ", src1->ne[3] = %" PRId64 " - fallback to CPU\n", __func__, dst->name, dst->src[0]->ne[3], dst->src[1]->ne[3]);
2256
- return false;
2257
- } else {
2258
- ggml_cuda_mul_mat(ctx, dst->src[0], dst->src[1], dst);
2259
- }
2260
- break;
2261
- case GGML_OP_MUL_MAT_ID:
2262
- ggml_cuda_mul_mat_id(ctx, dst);
2263
- break;
2264
- case GGML_OP_SCALE:
2265
- ggml_cuda_op_scale(ctx, dst);
2266
- break;
2267
- case GGML_OP_SQR:
2268
- ggml_cuda_op_sqr(ctx, dst);
2269
- break;
2270
- case GGML_OP_CLAMP:
2271
- ggml_cuda_op_clamp(ctx, dst);
2272
- break;
2273
- case GGML_OP_NONE:
2274
- case GGML_OP_RESHAPE:
2275
- case GGML_OP_VIEW:
2276
- case GGML_OP_PERMUTE:
2277
- case GGML_OP_TRANSPOSE:
2278
- break;
2279
- case GGML_OP_DIAG_MASK_INF:
2280
- ggml_cuda_op_diag_mask_inf(ctx, dst);
2281
- break;
2282
- case GGML_OP_SOFT_MAX:
2283
- ggml_cuda_op_soft_max(ctx, dst);
2284
- break;
2285
- case GGML_OP_ROPE:
2286
- ggml_cuda_op_rope(ctx, dst);
2287
- break;
2288
- case GGML_OP_IM2COL:
2289
- ggml_cuda_op_im2col(ctx, dst);
2290
- break;
2291
- case GGML_OP_POOL_2D:
2292
- ggml_cuda_op_pool2d(ctx, dst);
2293
- break;
2294
- case GGML_OP_SUM_ROWS:
2295
- ggml_cuda_op_sum_rows(ctx, dst);
2296
- break;
2297
- case GGML_OP_ARGSORT:
2298
- ggml_cuda_op_argsort(ctx, dst);
2299
- break;
2300
- case GGML_OP_FLASH_ATTN_EXT:
2301
- ggml_cuda_flash_attn_ext(ctx, dst);
2302
- break;
2303
- default:
2304
- return false;
2305
- }
2306
-
2307
- cudaError_t err = cudaGetLastError();
2308
- if (err != cudaSuccess) {
2309
- GGML_CUDA_LOG_ERROR("%s: %s failed\n", __func__, ggml_op_desc(dst));
2310
- CUDA_CHECK(err);
2311
- }
2312
-
2313
- return true;
2314
- }
2315
-
2316
- ////////////////////////////////////////////////////////////////////////////////
2317
-
2318
- // backend
2319
-
2320
- GGML_CALL static const char * ggml_backend_cuda_name(ggml_backend_t backend) {
2321
- ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
2322
-
2323
- return cuda_ctx->name.c_str();
2324
- }
2325
-
2326
- GGML_CALL static void ggml_backend_cuda_free(ggml_backend_t backend) {
2327
- ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
2328
-
2329
- delete cuda_ctx;
2330
- delete backend;
2331
- }
2332
-
2333
- GGML_CALL static ggml_backend_buffer_type_t ggml_backend_cuda_get_default_buffer_type(ggml_backend_t backend) {
2334
- ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
2335
-
2336
- return ggml_backend_cuda_buffer_type(cuda_ctx->device);
2337
- }
2338
-
2339
- GGML_CALL static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
2340
- ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
2341
- ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
2342
-
2343
- GGML_ASSERT(buf->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
2344
-
2345
- CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, cuda_ctx->stream()));
2346
- }
2347
-
2348
- GGML_CALL static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
2349
- ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
2350
- ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
2351
-
2352
- GGML_ASSERT(buf->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
2353
-
2354
- CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, cuda_ctx->stream()));
2355
- }
2356
-
2357
- GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, const ggml_tensor * src, ggml_tensor * dst) {
2358
- GGML_ASSERT(ggml_backend_is_cuda(backend_src) || ggml_backend_is_cuda(backend_dst));
2359
-
2360
- ggml_backend_buffer_t buf_src = src->view_src ? src->view_src->buffer : src->buffer;
2361
- ggml_backend_buffer_t buf_dst = dst->view_src ? dst->view_src->buffer : dst->buffer;
2362
-
2363
- if (!ggml_backend_buffer_is_cuda(src->buffer)) {
2364
- return false;
2365
- }
2366
-
2367
- if (!ggml_backend_buffer_is_cuda(dst->buffer)) {
2368
- return false;
2369
- }
2370
-
2371
- // device -> device
2372
- ggml_backend_cuda_context * cuda_ctx_src = (ggml_backend_cuda_context *)backend_src->context;
2373
- ggml_backend_cuda_context * cuda_ctx_dst = (ggml_backend_cuda_context *)backend_dst->context;
2374
-
2375
- if (backend_src != backend_dst) {
2376
- ggml_backend_cuda_buffer_context * buf_ctx_src = (ggml_backend_cuda_buffer_context *)buf_src->context;
2377
- ggml_backend_cuda_buffer_context * buf_ctx_dst = (ggml_backend_cuda_buffer_context *)buf_dst->context;
2378
-
2379
- GGML_ASSERT(cuda_ctx_src->device == buf_ctx_src->device);
2380
- GGML_ASSERT(cuda_ctx_dst->device == buf_ctx_dst->device);
2381
-
2382
- // copy on src stream
2383
- if (cuda_ctx_src->device == cuda_ctx_dst->device) {
2384
- CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_dst->stream()));
2385
- } else {
2386
- #ifdef GGML_CUDA_NO_PEER_COPY
2387
- return false;
2388
- #else
2389
- CUDA_CHECK(cudaMemcpyPeerAsync(dst->data, cuda_ctx_dst->device, src->data, cuda_ctx_src->device, ggml_nbytes(dst), cuda_ctx_src->stream()));
2390
- #endif
2391
- }
2392
-
2393
- // record event on src stream
2394
- if (!cuda_ctx_src->copy_event) {
2395
- ggml_cuda_set_device(cuda_ctx_src->device);
2396
- CUDA_CHECK(cudaEventCreateWithFlags(&cuda_ctx_src->copy_event, cudaEventDisableTiming));
2397
- }
2398
-
2399
- CUDA_CHECK(cudaEventRecord(cuda_ctx_src->copy_event, cuda_ctx_src->stream()));
2400
-
2401
- // wait on dst stream for the copy to complete
2402
- CUDA_CHECK(cudaStreamWaitEvent(cuda_ctx_dst->stream(), cuda_ctx_src->copy_event, 0));
2403
- } else {
2404
- // src and dst are on the same backend
2405
- CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_dst->stream()));
2406
- }
2407
- return true;
2408
- }
2409
-
2410
- GGML_CALL static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
2411
- ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
2412
-
2413
- CUDA_CHECK(cudaStreamSynchronize(cuda_ctx->stream()));
2414
-
2415
- GGML_UNUSED(backend);
2416
- }
2417
-
2418
- static void set_ggml_graph_node_properties(ggml_tensor * node, ggml_graph_node_properties * graph_node_properties) {
2419
- graph_node_properties->node_address = node->data;
2420
- graph_node_properties->node_op = node->op;
2421
- for (int i = 0; i < GGML_MAX_DIMS; i++) {
2422
- graph_node_properties->ne[i] = node->ne[i];
2423
- graph_node_properties->nb[i] = node->nb[i];
2424
- }
2425
- for (int i = 0; i < GGML_MAX_SRC; i++) {
2426
- graph_node_properties->src_address[i] = node->src[i] ? node->src[i]->data : nullptr;
2427
- }
2428
- }
2429
-
2430
- static bool ggml_graph_node_has_matching_properties(ggml_tensor * node, ggml_graph_node_properties * graph_node_properties) {
2431
- if (node->data != graph_node_properties->node_address &&
2432
- node->op != GGML_OP_CPY &&
2433
- node->op != GGML_OP_VIEW) {
2434
- return false;
2435
- }
2436
-
2437
- if (node->op != graph_node_properties->node_op) {
2438
- return false;
2439
- }
2440
-
2441
- for (int i = 0; i < GGML_MAX_DIMS; i++) {
2442
- if (node->ne[i] != graph_node_properties->ne[i]) {
2443
- return false;
2444
- }
2445
- if (node->nb[i] != graph_node_properties->nb[i]) {
2446
- return false;
2447
- }
2448
- }
2449
-
2450
- for (int i = 0; i < GGML_MAX_SRC; i++) {
2451
- if (node->src[i] &&
2452
- node->src[i]->data != graph_node_properties->src_address[i] &&
2453
- node->op != GGML_OP_CPY &&
2454
- node->op != GGML_OP_VIEW
2455
- ) {
2456
- return false;
2457
- }
2458
- }
2459
- return true;
2460
- }
2461
-
2462
- GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
2463
- ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
2464
-
2465
- ggml_cuda_set_device(cuda_ctx->device);
2466
-
2467
- #ifdef USE_CUDA_GRAPH
2468
- static const bool disable_cuda_graphs_due_to_env = (getenv("GGML_CUDA_DISABLE_GRAPHS") != nullptr);
2469
-
2470
- // Objects required for CUDA Graph
2471
- if (cuda_ctx->cuda_graph == nullptr) {
2472
- cuda_ctx->cuda_graph.reset(new ggml_cuda_graph());
2473
- }
2474
-
2475
- bool use_cuda_graph = true;
2476
- bool cuda_graph_update_required = false;
2477
- // vector of pointers to CUDA cpy kernels, which are required to identify
2478
- // kernel parameters which need updated in the graph for each token
2479
- std::vector<void *> ggml_cuda_cpy_fn_ptrs;
2480
-
2481
- if (cuda_ctx->cuda_graph->graph == nullptr) {
2482
- if (ggml_cuda_info().devices[cuda_ctx->device].cc < CC_AMPERE) {
2483
- cuda_ctx->cuda_graph->disable_due_to_gpu_arch = true;
2484
- #ifndef NDEBUG
2485
- GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to GPU architecture\n", __func__);
2486
- #endif
2487
- }
2488
- }
2489
-
2490
- // Disable CUDA graphs in presence of env var, old GPU, use-case which is changing too rapidly,
2491
- // or previous graph capture failure.
2492
- // Also disable for multi-gpu for now. TO DO investigate
2493
- if (disable_cuda_graphs_due_to_env
2494
- || cuda_ctx->cuda_graph->disable_due_to_gpu_arch
2495
- || cuda_ctx->cuda_graph->disable_due_to_too_many_updates
2496
- || cuda_ctx->cuda_graph->disable_due_to_failed_graph_capture) {
2497
- use_cuda_graph = false;
2498
- }
2499
-
2500
- if (use_cuda_graph) {
2501
- if (cuda_ctx->cuda_graph->instance == nullptr) {
2502
- cuda_graph_update_required = true;
2503
- }
2504
-
2505
- // Check if the graph size has changed
2506
- if (cuda_ctx->cuda_graph->ggml_graph_properties.size() != (size_t)cgraph->n_nodes) {
2507
- cuda_graph_update_required = true;
2508
- cuda_ctx->cuda_graph->ggml_graph_properties.resize(cgraph->n_nodes);
2509
- }
2510
-
2511
- // Loop over nodes in GGML graph to determine if CUDA graph update is required
2512
- // and store properties to allow this comparison for the next token
2513
- for (int i = 0; i < cgraph->n_nodes; i++) {
2514
- bool has_matching_properties = true;
2515
- if (!cuda_graph_update_required) {
2516
- has_matching_properties = ggml_graph_node_has_matching_properties(cgraph->nodes[i], &cuda_ctx->cuda_graph->ggml_graph_properties[i]);
2517
- }
2518
- if (!has_matching_properties) {
2519
- cuda_graph_update_required = true;
2520
- }
2521
- set_ggml_graph_node_properties(cgraph->nodes[i], &cuda_ctx->cuda_graph->ggml_graph_properties[i]);
2522
- }
2523
-
2524
- // Loop over nodes in GGML graph to obtain info needed for CUDA graph
2525
- cuda_ctx->cuda_graph->updated_kernel_arg.clear();
2526
- for (int i = 0; i < cgraph->n_nodes; i++) {
2527
- ggml_tensor * node = cgraph->nodes[i];
2528
-
2529
- if (node->src[0] && ggml_backend_buffer_is_cuda_split(node->src[0]->buffer)) {
2530
- use_cuda_graph = false; // Split buffers are not supported by CUDA graph capture
2531
- #ifndef NDEBUG
2532
- GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to split buffer\n", __func__);
2533
- #endif
2534
- }
2535
-
2536
- if (node->op == GGML_OP_MUL_MAT_ID) {
2537
- use_cuda_graph = false; // This node type is not supported by CUDA graph capture
2538
- #ifndef NDEBUG
2539
- GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to mul_mat_id\n", __func__);
2540
- #endif
2541
- }
2542
-
2543
- if (node->op == GGML_OP_ADD && node->src[1] && node->src[1]->ne[1] > 1) {
2544
- // disable CUDA graphs for batch size > 1 for now.
2545
- // Changes in batch size or context size can cause changes to the grid size of some kernels.
2546
- use_cuda_graph = false;
2547
- #ifndef NDEBUG
2548
- GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]);
2549
- #endif
2550
- }
2551
-
2552
- if (node->op == GGML_OP_CPY) {
2553
- // store the copy op parameter which changes with each token.
2554
- cuda_ctx->cuda_graph->updated_kernel_arg.push_back((char **) &(node->src[1]->data));
2555
- // store a pointer to each copy op CUDA kernel to identify it later
2556
- void * ptr = ggml_cuda_cpy_fn(node->src[0], node->src[1]);
2557
- if (std::find(ggml_cuda_cpy_fn_ptrs.begin(), ggml_cuda_cpy_fn_ptrs.end(), ptr) == ggml_cuda_cpy_fn_ptrs.end()) {
2558
- ggml_cuda_cpy_fn_ptrs.push_back(ptr);
2559
- }
2560
- }
2561
-
2562
- if (!use_cuda_graph) {
2563
- break;
2564
- }
2565
- }
2566
-
2567
- // Disable CUDA graphs (from the next token) if the use-case is demanding too many consecutive graph updates.
2568
- if (use_cuda_graph && cuda_graph_update_required) {
2569
- cuda_ctx->cuda_graph->number_consecutive_updates++;
2570
- } else {
2571
- cuda_ctx->cuda_graph->number_consecutive_updates = 0;
2572
- }
2573
-
2574
- if (cuda_ctx->cuda_graph->number_consecutive_updates >= 4) {
2575
- cuda_ctx->cuda_graph->disable_due_to_too_many_updates = true;
2576
- #ifndef NDEBUG
2577
- GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to too many consecutive updates\n", __func__);
2578
- #endif
2579
- }
2580
- }
2581
-
2582
- if (use_cuda_graph && cuda_graph_update_required) { // Start CUDA graph capture
2583
- CUDA_CHECK(cudaStreamBeginCapture(cuda_ctx->stream(), cudaStreamCaptureModeRelaxed));
2584
- }
2585
-
2586
- #else
2587
- bool use_cuda_graph = false;
2588
- bool cuda_graph_update_required = false;
2589
- #endif // USE_CUDA_GRAPH
2590
-
2591
- bool graph_evaluated_or_captured = false;
2592
-
2593
- while (!graph_evaluated_or_captured) {
2594
- // Only perform the graph execution if CUDA graphs are not enabled, or we are capturing the graph.
2595
- // With the use of CUDA graphs, the execution will be performed by the graph launch.
2596
- if (!use_cuda_graph || cuda_graph_update_required) {
2597
- for (int i = 0; i < cgraph->n_nodes; i++) {
2598
- ggml_tensor * node = cgraph->nodes[i];
2599
-
2600
- if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
2601
- continue;
2602
- }
2603
-
2604
- #ifndef NDEBUG
2605
- assert(node->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
2606
- for (int j = 0; j < GGML_MAX_SRC; j++) {
2607
- if (node->src[j] != nullptr) {
2608
- assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) || ggml_backend_buffer_is_cuda_split(node->src[j]->buffer));
2609
- }
2610
- }
2611
- #endif
2612
-
2613
- bool ok = ggml_cuda_compute_forward(*cuda_ctx, node);
2614
- if (!ok) {
2615
- GGML_CUDA_LOG_ERROR("%s: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
2616
- }
2617
- GGML_ASSERT(ok);
2618
- }
2619
- }
2620
-
2621
- #ifdef USE_CUDA_GRAPH
2622
- if (use_cuda_graph && cuda_graph_update_required) { // End CUDA graph capture
2623
- if (cuda_ctx->cuda_graph->graph != nullptr) {
2624
- CUDA_CHECK(cudaGraphDestroy(cuda_ctx->cuda_graph->graph));
2625
- cuda_ctx->cuda_graph->graph = nullptr;
2626
- }
2627
- CUDA_CHECK(cudaStreamEndCapture(cuda_ctx->stream(), &cuda_ctx->cuda_graph->graph));
2628
-
2629
- #if 0
2630
- if (disable_cuda_graphs_due_to_failed_capture) {
2631
- use_cuda_graph = false;
2632
- cuda_ctx->cuda_graph->disable_due_to_failed_graph_capture = true;
2633
- #ifndef NDEBUG
2634
- GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to failed graph capture\n", __func__);
2635
- #endif
2636
- } else {
2637
- graph_evaluated_or_captured = true; // CUDA graph has been captured
2638
- }
2639
- #endif
2640
- graph_evaluated_or_captured = true; // CUDA graph has been captured
2641
- } else {
2642
- graph_evaluated_or_captured = true; // ggml graph has been directly evaluated
2643
- }
2644
- }
2645
-
2646
- if (use_cuda_graph) {
2647
- if (cuda_ctx->cuda_graph->instance == nullptr) { // Create executable graph from captured graph.
2648
- CUDA_CHECK(cudaGraphInstantiate(&cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, NULL, NULL, 0));
2649
- }
2650
-
2651
- // Perform update to graph (if required for this token), and change copy parameter (required for every token)
2652
-
2653
- if (cuda_graph_update_required) {
2654
- // Extract nodes from graph
2655
- // First call with null argument gets number of nodes in graph
2656
- CUDA_CHECK(cudaGraphGetNodes(cuda_ctx->cuda_graph->graph, nullptr, &cuda_ctx->cuda_graph->num_nodes));
2657
- // Subsequent call with non-null argument gets nodes
2658
- cuda_ctx->cuda_graph->nodes.resize(cuda_ctx->cuda_graph->num_nodes);
2659
- cuda_ctx->cuda_graph->params.resize(cuda_ctx->cuda_graph->num_nodes);
2660
- if (cuda_ctx->cuda_graph->num_nodes > 0) {
2661
- CUDA_CHECK(cudaGraphGetNodes(cuda_ctx->cuda_graph->graph, cuda_ctx->cuda_graph->nodes.data(), &cuda_ctx->cuda_graph->num_nodes));
2662
-
2663
- // Loop over nodes, and extract kernel parameters from each node
2664
- for (size_t i = 0; i < cuda_ctx->cuda_graph->num_nodes; i++) {
2665
- cudaGraphNodeType node_type;
2666
- CUDA_CHECK(cudaGraphNodeGetType(cuda_ctx->cuda_graph->nodes[i], &node_type));
2667
- if (node_type == cudaGraphNodeTypeKernel) {
2668
- cudaError_t stat = cudaGraphKernelNodeGetParams(cuda_ctx->cuda_graph->nodes[i], &cuda_ctx->cuda_graph->params[i]); // Get params using runtime
2669
- if (stat == cudaErrorInvalidDeviceFunction) {
2670
- // Fails due to incorrect handling by CUDA runtime of CUDA BLAS node.
2671
- // We don't need to update blas nodes, so clear error and move on.
2672
- cudaGetLastError();
2673
- } else {
2674
- GGML_ASSERT(stat == cudaSuccess);
2675
- }
2676
- }
2677
- }
2678
- }
2679
- }
2680
-
2681
- // One of the arguments to the copy kernel is updated for each token, hence we need to
2682
- // replace that argument with the updated value in the CUDA graph
2683
- if (!cuda_graph_update_required) { // on update steps, the live parameters will already be captured
2684
- int k = 0;
2685
- for (size_t i = 0; i < cuda_ctx->cuda_graph->num_nodes; i++) {
2686
- if(count(ggml_cuda_cpy_fn_ptrs.begin(), ggml_cuda_cpy_fn_ptrs.end(), cuda_ctx->cuda_graph->params[i].func) > 0) {
2687
- char ** updated_kernel_arg_ptr = cuda_ctx->cuda_graph->updated_kernel_arg.at(k++);
2688
- cuda_ctx->cuda_graph->params[i].kernelParams[1] = updated_kernel_arg_ptr;
2689
- CUDA_CHECK(cudaGraphKernelNodeSetParams(cuda_ctx->cuda_graph->nodes[i], &cuda_ctx->cuda_graph->params[i]));
2690
- }
2691
- }
2692
- }
2693
-
2694
- // Update graph executable
2695
- cudaGraphExecUpdateResultInfo result_info;
2696
- cudaError_t stat = cudaGraphExecUpdate(cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, &result_info);
2697
- if (stat == cudaErrorGraphExecUpdateFailure) {
2698
- #ifndef NDEBUG
2699
- GGML_CUDA_LOG_ERROR("%s: CUDA graph update failed\n", __func__);
2700
- #endif
2701
- // The pre-existing graph exec cannot be updated due to violated constraints
2702
- // so instead clear error and re-instantiate
2703
- cudaGetLastError();
2704
- CUDA_CHECK(cudaGraphExecDestroy(cuda_ctx->cuda_graph->instance));
2705
- cuda_ctx->cuda_graph->instance = nullptr;
2706
- CUDA_CHECK(cudaGraphInstantiate(&cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, NULL, NULL, 0));
2707
- } else {
2708
- GGML_ASSERT(stat == cudaSuccess);
2709
- }
2710
- // Launch graph
2711
- CUDA_CHECK(cudaGraphLaunch(cuda_ctx->cuda_graph->instance, cuda_ctx->stream()));
2712
- #else
2713
- graph_evaluated_or_captured = true;
2714
- #endif // USE_CUDA_GRAPH
2715
- }
2716
-
2717
- return GGML_STATUS_SUCCESS;
2718
- }
2719
-
2720
- GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
2721
- ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) backend->context;
2722
- switch (op->op) {
2723
- case GGML_OP_UNARY:
2724
- switch (ggml_get_unary_op(op)) {
2725
- case GGML_UNARY_OP_GELU:
2726
- case GGML_UNARY_OP_SILU:
2727
- case GGML_UNARY_OP_RELU:
2728
- case GGML_UNARY_OP_SIGMOID:
2729
- case GGML_UNARY_OP_HARDSIGMOID:
2730
- case GGML_UNARY_OP_HARDSWISH:
2731
- case GGML_UNARY_OP_GELU_QUICK:
2732
- case GGML_UNARY_OP_TANH:
2733
- return ggml_is_contiguous(op->src[0]);
2734
- default:
2735
- return false;
2736
- }
2737
- break;
2738
- case GGML_OP_MUL_MAT:
2739
- case GGML_OP_MUL_MAT_ID:
2740
- {
2741
- struct ggml_tensor * a;
2742
- struct ggml_tensor * b;
2743
- if (op->op == GGML_OP_MUL_MAT) {
2744
- a = op->src[0];
2745
- b = op->src[1];
2746
- } else {
2747
- a = op->src[2];
2748
- b = op->src[1];
2749
- }
2750
- if (a->ne[3] != b->ne[3]) {
2751
- return false;
2752
- }
2753
- ggml_type a_type = a->type;
2754
- if (a_type == GGML_TYPE_IQ2_XXS || a_type == GGML_TYPE_IQ2_XS || a_type == GGML_TYPE_IQ3_XXS ||
2755
- a_type == GGML_TYPE_IQ1_S || a_type == GGML_TYPE_IQ4_NL || a_type == GGML_TYPE_IQ3_S ||
2756
- a_type == GGML_TYPE_IQ1_M || a_type == GGML_TYPE_IQ2_S || a_type == GGML_TYPE_IQ4_XS) {
2757
- if (b->ne[1] == 1 && ggml_nrows(b) > 1) {
2758
- return false;
2759
- }
2760
- }
2761
- return true;
2762
- } break;
2763
- case GGML_OP_GET_ROWS:
2764
- {
2765
- switch (op->src[0]->type) {
2766
- case GGML_TYPE_F16:
2767
- case GGML_TYPE_F32:
2768
- case GGML_TYPE_Q4_0:
2769
- case GGML_TYPE_Q4_1:
2770
- case GGML_TYPE_Q5_0:
2771
- case GGML_TYPE_Q5_1:
2772
- case GGML_TYPE_Q8_0:
2773
- return true;
2774
- default:
2775
- return false;
2776
- }
2777
- } break;
2778
- case GGML_OP_CPY:
2779
- {
2780
- ggml_type src0_type = op->src[0]->type;
2781
- ggml_type src1_type = op->src[1]->type;
2782
- if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) {
2783
- return true;
2784
- }
2785
- if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F16) {
2786
- return true;
2787
- }
2788
- if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q8_0) {
2789
- return true;
2790
- }
2791
- if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q4_0) {
2792
- return true;
2793
- }
2794
- if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q4_1) {
2795
- return true;
2796
- }
2797
- if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q5_0) {
2798
- return true;
2799
- }
2800
- if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q5_1) {
2801
- return true;
2802
- }
2803
- if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_IQ4_NL) {
2804
- return true;
2805
- }
2806
- if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F16) {
2807
- return true;
2808
- }
2809
- if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F32) {
2810
- return true;
2811
- }
2812
- return false;
2813
- } break;
2814
- case GGML_OP_DUP:
2815
- case GGML_OP_REPEAT:
2816
- case GGML_OP_CONCAT:
2817
- {
2818
- ggml_type src0_type = op->src[0]->type;
2819
- return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16;
2820
- } break;
2821
- case GGML_OP_NONE:
2822
- case GGML_OP_RESHAPE:
2823
- case GGML_OP_VIEW:
2824
- case GGML_OP_PERMUTE:
2825
- case GGML_OP_TRANSPOSE:
2826
- case GGML_OP_NORM:
2827
- case GGML_OP_ADD:
2828
- case GGML_OP_MUL:
2829
- case GGML_OP_DIV:
2830
- case GGML_OP_RMS_NORM:
2831
- case GGML_OP_SCALE:
2832
- case GGML_OP_SQR:
2833
- case GGML_OP_CLAMP:
2834
- case GGML_OP_CONT:
2835
- case GGML_OP_DIAG_MASK_INF:
2836
- case GGML_OP_SOFT_MAX:
2837
- return true;
2838
- case GGML_OP_ROPE:
2839
- return ggml_is_contiguous(op->src[0]);
2840
- case GGML_OP_IM2COL:
2841
- case GGML_OP_POOL_2D:
2842
- case GGML_OP_SUM_ROWS:
2843
- case GGML_OP_ARGSORT:
2844
- case GGML_OP_ACC:
2845
- case GGML_OP_GROUP_NORM:
2846
- case GGML_OP_UPSCALE:
2847
- case GGML_OP_PAD:
2848
- case GGML_OP_ARANGE:
2849
- case GGML_OP_TIMESTEP_EMBEDDING:
2850
- case GGML_OP_LEAKY_RELU:
2851
- return true;
2852
- case GGML_OP_FLASH_ATTN_EXT:
2853
- #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
2854
- return op->src[0]->ne[0] == 64 || op->src[0]->ne[0] == 128;
2855
- #else
2856
- if (op->src[0]->ne[0] == 128) {
2857
- return true;
2858
- }
2859
- if (op->src[0]->ne[0] == 64 && op->src[1]->type == GGML_TYPE_F16) {
2860
- return true;
2861
- }
2862
- return ggml_cuda_info().devices[cuda_ctx->device].cc >= CC_VOLTA &&
2863
- op->src[1]->type == GGML_TYPE_F16 && op->src[2]->type == GGML_TYPE_F16;
2864
- #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
2865
- default:
2866
- return false;
2867
- }
2868
-
2869
- GGML_UNUSED(backend);
2870
- }
2871
-
2872
- GGML_CALL static bool ggml_backend_cuda_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
2873
- if (ggml_backend_buft_is_cuda_split(buft)) {
2874
- return true;
2875
- }
2876
-
2877
- if (ggml_backend_buft_is_cuda(buft)) {
2878
- ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
2879
- ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
2880
- return buft_ctx->device == cuda_ctx->device;
2881
- }
2882
-
2883
- return false;
2884
- }
2885
-
2886
- GGML_CALL static bool ggml_backend_cuda_offload_op(ggml_backend_t backend, const ggml_tensor * op) {
2887
- const int min_batch_size = 32;
2888
-
2889
- return (op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS) ||
2890
- (op->ne[2] >= min_batch_size && op->op == GGML_OP_MUL_MAT_ID);
2891
-
2892
- GGML_UNUSED(backend);
2893
- }
2894
-
2895
- static ggml_backend_event_t ggml_backend_cuda_event_new(ggml_backend_t backend) {
2896
- #ifdef GGML_CUDA_NO_PEER_COPY
2897
- return nullptr;
2898
- #else
2899
- ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
2900
-
2901
- ggml_cuda_set_device(cuda_ctx->device);
2902
-
2903
- cudaEvent_t event;
2904
- CUDA_CHECK(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
2905
-
2906
- return new ggml_backend_event {
2907
- /* .backend = */ backend,
2908
- /* .context = */ event,
2909
- };
2910
- #endif
2911
- }
2912
-
2913
- static void ggml_backend_cuda_event_free(ggml_backend_event_t event) {
2914
- CUDA_CHECK(cudaEventDestroy((cudaEvent_t)event->context));
2915
-
2916
- delete event;
2917
- }
2918
-
2919
- static void ggml_backend_cuda_event_record(ggml_backend_event_t event) {
2920
- ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)event->backend->context;
2921
-
2922
- CUDA_CHECK(cudaEventRecord((cudaEvent_t)event->context, cuda_ctx->stream()));
2923
- }
2924
-
2925
- static void ggml_backend_cuda_event_wait(ggml_backend_t backend, ggml_backend_event_t event) {
2926
- ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
2927
-
2928
- if (ggml_backend_is_cuda(event->backend)) {
2929
- CUDA_CHECK(cudaStreamWaitEvent(cuda_ctx->stream(), (cudaEvent_t)event->context, 0));
2930
- } else {
2931
- #if 0
2932
- // untested
2933
- auto wait_fn = [](void * user_data) {
2934
- ggml_backend_event_t event = (ggml_backend_event_t)user_data;
2935
- ggml_backend_event_synchronize(event);
2936
- };
2937
-
2938
- CUDA_CHECK(cudaLaunchHostFunc(cuda_ctx->stream(), wait_fn, event));
2939
- #endif
2940
- GGML_ASSERT(false);
2941
- }
2942
- }
2943
-
2944
- static void ggml_backend_cuda_event_synchronize(ggml_backend_event_t event) {
2945
- CUDA_CHECK(cudaEventSynchronize((cudaEvent_t)event->context));
2946
- }
2947
-
2948
- static ggml_backend_i ggml_backend_cuda_interface = {
2949
- /* .get_name = */ ggml_backend_cuda_name,
2950
- /* .free = */ ggml_backend_cuda_free,
2951
- /* .get_default_buffer_type = */ ggml_backend_cuda_get_default_buffer_type,
2952
- /* .set_tensor_async = */ ggml_backend_cuda_set_tensor_async,
2953
- /* .get_tensor_async = */ ggml_backend_cuda_get_tensor_async,
2954
- /* .cpy_tensor_async = */ ggml_backend_cuda_cpy_tensor_async,
2955
- /* .synchronize = */ ggml_backend_cuda_synchronize,
2956
- /* .graph_plan_create = */ NULL,
2957
- /* .graph_plan_free = */ NULL,
2958
- /* .graph_plan_update = */ NULL,
2959
- /* .graph_plan_compute = */ NULL,
2960
- /* .graph_compute = */ ggml_backend_cuda_graph_compute,
2961
- /* .supports_op = */ ggml_backend_cuda_supports_op,
2962
- /* .supports_buft = */ ggml_backend_cuda_supports_buft,
2963
- /* .offload_op = */ ggml_backend_cuda_offload_op,
2964
- /* .event_new = */ ggml_backend_cuda_event_new,
2965
- /* .event_free = */ ggml_backend_cuda_event_free,
2966
- /* .event_record = */ ggml_backend_cuda_event_record,
2967
- /* .event_wait = */ ggml_backend_cuda_event_wait,
2968
- /* .event_synchronize = */ ggml_backend_cuda_event_synchronize,
2969
- };
2970
-
2971
- static ggml_guid_t ggml_backend_cuda_guid() {
2972
- static ggml_guid guid = { 0x2c, 0xdd, 0xe8, 0x1c, 0x65, 0xb3, 0x65, 0x73, 0x6a, 0x12, 0x88, 0x61, 0x1c, 0xc9, 0xdc, 0x25 };
2973
- return &guid;
2974
- }
2975
-
2976
- GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device) {
2977
- if (device < 0 || device >= ggml_backend_cuda_get_device_count()) {
2978
- GGML_CUDA_LOG_ERROR("%s: invalid device %d\n", __func__, device);
2979
- return nullptr;
2980
- }
2981
-
2982
- ggml_backend_cuda_context * ctx = new ggml_backend_cuda_context(device);
2983
- if (ctx == nullptr) {
2984
- GGML_CUDA_LOG_ERROR("%s: failed to allocate context\n", __func__);
2985
- return nullptr;
2986
- }
2987
-
2988
- ggml_backend_t cuda_backend = new ggml_backend {
2989
- /* .guid = */ ggml_backend_cuda_guid(),
2990
- /* .interface = */ ggml_backend_cuda_interface,
2991
- /* .context = */ ctx
2992
- };
2993
-
2994
- return cuda_backend;
2995
- }
2996
-
2997
- GGML_CALL bool ggml_backend_is_cuda(ggml_backend_t backend) {
2998
- return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_cuda_guid());
2999
- }
3000
-
3001
- GGML_CALL int ggml_backend_cuda_get_device_count() {
3002
- return ggml_cuda_info().device_count;
3003
- }
3004
-
3005
- GGML_CALL void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size) {
3006
- cudaDeviceProp prop;
3007
- CUDA_CHECK(cudaGetDeviceProperties(&prop, device));
3008
- snprintf(description, description_size, "%s", prop.name);
3009
- }
3010
-
3011
- GGML_CALL void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total) {
3012
- ggml_cuda_set_device(device);
3013
-
3014
- CUDA_CHECK(cudaMemGetInfo(free, total));
3015
- }
3016
-
3017
- GGML_CALL bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size) {
3018
- if (getenv("GGML_CUDA_REGISTER_HOST") == nullptr) {
3019
- return false;
3020
- }
3021
-
3022
- #if CUDART_VERSION >= 11100
3023
- cudaError_t err = cudaHostRegister(buffer, size, cudaHostRegisterPortable | cudaHostRegisterReadOnly);
3024
- if (err != cudaSuccess) {
3025
- // clear the error
3026
- cudaGetLastError();
3027
-
3028
- GGML_CUDA_LOG_WARN("%s: failed to register %.2f MiB of pinned memory: %s\n", __func__,
3029
- size / 1024.0 / 1024.0, cudaGetErrorString(err));
3030
- return false;
3031
- }
3032
- return true;
3033
- #else
3034
- return false;
3035
- #endif
3036
- }
3037
-
3038
- GGML_CALL void ggml_backend_cuda_unregister_host_buffer(void * buffer) {
3039
- if (getenv("GGML_CUDA_REGISTER_HOST") == nullptr) {
3040
- return;
3041
- }
3042
-
3043
- cudaError_t err = cudaHostUnregister(buffer);
3044
- if (err != cudaSuccess) {
3045
- // clear the error
3046
- cudaGetLastError();
3047
- }
3048
- }
3049
-
3050
- // backend registry
3051
- GGML_CALL static ggml_backend_t ggml_backend_reg_cuda_init(const char * params, void * user_data) {
3052
- ggml_backend_t cuda_backend = ggml_backend_cuda_init((int) (intptr_t) user_data);
3053
- return cuda_backend;
3054
-
3055
- GGML_UNUSED(params);
3056
- }
3057
-
3058
- extern "C" GGML_CALL int ggml_backend_cuda_reg_devices();
3059
-
3060
- GGML_CALL int ggml_backend_cuda_reg_devices() {
3061
- int device_count = ggml_backend_cuda_get_device_count();
3062
- //int device_count = 1; // DEBUG: some tools require delaying CUDA initialization
3063
- for (int i = 0; i < device_count; i++) {
3064
- char name[128];
3065
- snprintf(name, sizeof(name), "%s%d", GGML_CUDA_NAME, i);
3066
- ggml_backend_register(name, ggml_backend_reg_cuda_init, ggml_backend_cuda_buffer_type(i), (void *) (intptr_t) i);
3067
- }
3068
- return device_count;
3069
- }