ksgpu 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54) hide show
  1. ksgpu-1.0.0/.gitignore +2 -0
  2. ksgpu-1.0.0/PKG-INFO +7 -0
  3. ksgpu-1.0.0/README.md +5 -0
  4. ksgpu-1.0.0/generate_device_mma_hpp.py +257 -0
  5. ksgpu-1.0.0/include/ksgpu/Array.hpp +624 -0
  6. ksgpu-1.0.0/include/ksgpu/Barrier.hpp +44 -0
  7. ksgpu-1.0.0/include/ksgpu/CpuThreadPool.hpp +96 -0
  8. ksgpu-1.0.0/include/ksgpu/CudaStreamPool.hpp +119 -0
  9. ksgpu-1.0.0/include/ksgpu/ThreadSafeRingBuffer.hpp +133 -0
  10. ksgpu-1.0.0/include/ksgpu/complex_type_traits.hpp +48 -0
  11. ksgpu-1.0.0/include/ksgpu/constexpr_functions.hpp +56 -0
  12. ksgpu-1.0.0/include/ksgpu/cuda_utils.hpp +199 -0
  13. ksgpu-1.0.0/include/ksgpu/device_mma.hpp +226 -0
  14. ksgpu-1.0.0/include/ksgpu/dlpack.h +332 -0
  15. ksgpu-1.0.0/include/ksgpu/mem_utils.hpp +149 -0
  16. ksgpu-1.0.0/include/ksgpu/memcpy_kernels.hpp +45 -0
  17. ksgpu-1.0.0/include/ksgpu/pybind11.hpp +89 -0
  18. ksgpu-1.0.0/include/ksgpu/pybind11_utils.hpp +109 -0
  19. ksgpu-1.0.0/include/ksgpu/rand_utils.hpp +167 -0
  20. ksgpu-1.0.0/include/ksgpu/string_utils.hpp +145 -0
  21. ksgpu-1.0.0/include/ksgpu/test_utils.hpp +94 -0
  22. ksgpu-1.0.0/include/ksgpu/time_utils.hpp +39 -0
  23. ksgpu-1.0.0/include/ksgpu/xassert.hpp +84 -0
  24. ksgpu-1.0.0/include/ksgpu.hpp +58 -0
  25. ksgpu-1.0.0/loose_ends/bit-mapping.cu +292 -0
  26. ksgpu-1.0.0/meson.build +166 -0
  27. ksgpu-1.0.0/pyproject.toml +17 -0
  28. ksgpu-1.0.0/src_bin/reverse-engineer-mma.cu +807 -0
  29. ksgpu-1.0.0/src_bin/scratch.cu +17 -0
  30. ksgpu-1.0.0/src_bin/show-devices.cu +41 -0
  31. ksgpu-1.0.0/src_bin/test-array.cu +474 -0
  32. ksgpu-1.0.0/src_bin/test-memcpy-kernels.cu +99 -0
  33. ksgpu-1.0.0/src_bin/test-sparse-mma.cu +221 -0
  34. ksgpu-1.0.0/src_bin/time-atomic-add.cu +214 -0
  35. ksgpu-1.0.0/src_bin/time-fma.cu +124 -0
  36. ksgpu-1.0.0/src_bin/time-l2-cache.cu +73 -0
  37. ksgpu-1.0.0/src_bin/time-local-transpose.cu +135 -0
  38. ksgpu-1.0.0/src_bin/time-memcpy-kernels.cu +81 -0
  39. ksgpu-1.0.0/src_bin/time-shared-memory.cu +87 -0
  40. ksgpu-1.0.0/src_bin/time-tensor-cores.cu +442 -0
  41. ksgpu-1.0.0/src_bin/time-warp-shuffle.cu +105 -0
  42. ksgpu-1.0.0/src_lib/Array.cu +408 -0
  43. ksgpu-1.0.0/src_lib/Barrier.cu +72 -0
  44. ksgpu-1.0.0/src_lib/CpuThreadPool.cu +157 -0
  45. ksgpu-1.0.0/src_lib/CudaStreamPool.cu +196 -0
  46. ksgpu-1.0.0/src_lib/cuda_utils.cu +171 -0
  47. ksgpu-1.0.0/src_lib/mem_utils.cu +391 -0
  48. ksgpu-1.0.0/src_lib/memcpy_kernels.cu +145 -0
  49. ksgpu-1.0.0/src_lib/rand_utils.cu +73 -0
  50. ksgpu-1.0.0/src_lib/string_utils.cu +113 -0
  51. ksgpu-1.0.0/src_lib/test_utils.cu +409 -0
  52. ksgpu-1.0.0/src_pybind11/ksgpu_pybind11.cu +197 -0
  53. ksgpu-1.0.0/src_pybind11/pybind11_utils.cu +430 -0
  54. ksgpu-1.0.0/src_python/ksgpu/__init__.py +20 -0
ksgpu-1.0.0/.gitignore ADDED
@@ -0,0 +1,2 @@
1
+ *~
2
+ *.o
ksgpu-1.0.0/PKG-INFO ADDED
@@ -0,0 +1,7 @@
1
+ Metadata-Version: 2.1
2
+ Name: ksgpu
3
+ Version: 1.0.0
4
+ Summary: A library of low-level utilities for cuda/cupy
5
+ Requires-Dist: pybind11
6
+ Requires-Dist: numpy
7
+ Requires-Dist: cupy
ksgpu-1.0.0/README.md ADDED
@@ -0,0 +1,5 @@
1
+ ### GPU C++/CUDA core utils
2
+
3
+ Some day there will be useful documentation here.
4
+
5
+ Contact: Kendrick Smith <kmsmith@perimeterinstitute.ca>
ksgpu-1.0.0/generate_device_mma_hpp.py ADDED
@@ -0,0 +1,257 @@
1
+ #!/usr/bin/env python3
2
+ #
3
+ # Reference:
4
+ # https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-instructions-mma
5
+ #
6
+ # Note: I'm no longer running this as part of the build process (in an automated way).
7
+ #
8
+ # Instead, I'm keeping the output file (include/ksgpu/device_mma.hpp) in git, and occasionally
9
+ # updating by hand:
10
+ #
11
+ # ./generate_device_mma_hpp.py > include/ksgpu/device_mma.hpp
12
+
13
+
14
class Argument:
    """One operand of a generated inline-asm MMA wrapper function.

    An Argument knows how to render itself in four places of the generated
    C++ source: the template parameter list, the function parameter list,
    the PTX operand placeholders ("%0", "%1", ...), and the asm constraint
    list.

    Parameters:
        name: identifier used for the operand in the generated code.
        cuda_type: C++ type used in the generated signature.
        num_registers: number of asm operands ("%N" slots) this argument uses.
        is_const: True for an input operand ("r" constraint), False for an
            output operand ("=r" constraint).
        pack: if True, the operand is passed as one C array of length
            num_registers; if False, as num_registers separate parameters
            named name0, name1, ...
        ptx_type: if given, each register is reinterpreted through a pointer
            cast to this type inside the constraint list.
        is_scalar: operand is a single bare value (no braces in the PTX
            operand string). Requires pack=False and num_registers == 1.
        is_immediate: operand is a compile-time constant, emitted as a
            template parameter with an "n" constraint. Requires is_const,
            is_scalar and no ptx_type.
    """

    def __init__(self, name, cuda_type, num_registers, is_const=False, pack=True, ptx_type=None, is_scalar=False, is_immediate=False):
        self.name = name
        self.cuda_type = cuda_type
        self.num_registers = num_registers
        self.is_const = is_const
        self.pack = pack
        self.ptx_type = ptx_type
        self.is_scalar = is_scalar
        self.is_immediate = is_immediate

        # Sanity checks on flag combinations (kept as asserts, as in the
        # original, so the failure type seen by callers does not change).
        if is_immediate:
            assert is_const and is_scalar and (ptx_type is None)

        if is_scalar:
            assert (not pack) and (num_registers == 1)

    def make_template_arglist(self):
        """Return the template parameters contributed by this argument.

        Only immediates appear as template parameters; everything else
        returns an empty list.
        """
        return [f'{self.cuda_type} {self.name}'] if self.is_immediate else []

    def make_cuda_arglist(self):
        """Return the function parameters contributed by this argument.

        Immediates contribute nothing (they are template parameters).
        Non-const scalars and non-const unpacked registers are passed by
        reference so that the value written by the asm is visible to the
        caller.
        """
        amp = '' if self.is_const else '&'
        cv = 'const ' if self.is_const else ''

        if self.is_immediate:
            return []
        elif self.is_scalar:
            return [f'{self.cuda_type} {amp}{self.name}']
        elif self.pack:
            return [f'{cv}{self.cuda_type} {self.name}[{self.num_registers}]']
        else:
            # Fix: the original omitted '&' here, so a non-const (output)
            # unpacked operand was declared by value and the asm result was
            # lost. The const case renders exactly as before.
            return [f'{self.cuda_type} {cv}{amp}{self.name}{i}' for i in range(self.num_registers)]

    def make_ptx_argstr(self, base):
        """Return the PTX operand text for this argument.

        Placeholders are numbered base, base+1, ..., base+num_registers-1.
        Non-scalar operands are wrapped in braces.
        """
        t = ', '.join(f'%{i}' for i in range(base, base + self.num_registers))
        if not self.is_scalar:
            t = '{' + t + '}'
        return t

    def make_constraint_str(self):
        """Return the asm constraint entries for this argument, comma-separated."""
        cv = 'const ' if self.is_const else ''
        constraint = '"r"' if self.is_const else '"=r"'

        if self.is_immediate:
            return f'"n" ({self.name})'

        if self.is_scalar:
            return f'{constraint} ({self.name})'

        ret = []
        for i in range(self.num_registers):
            v = f'{self.name}[{i}]' if self.pack else f'{self.name}{i}'
            if self.ptx_type is not None:
                # Reinterpret the C++ value as the requested PTX register type.
                v = f'*({cv}{self.ptx_type} *) &{v}'
            ret.append(f'{constraint} ({v})')

        return ', '.join(ret)
76
+
77
+
78
+ ####################################################################################################
79
+
80
+
81
def emit_kernel(cuda_name, ptx_name, *args):
    """Print one __device__ wrapper function around a single PTX instruction.

    cuda_name is the name of the generated C++ function, ptx_name is the
    PTX opcode placed at the start of the asm string, and each element of
    args supplies the operand text via its make_*() methods and its
    num_registers attribute.

    The constraint for args[0] is followed by ' :' and the remaining ones are
    comma-separated, so args[0] fills the asm output section and all later
    arguments fill the input section.
    """
    # 'ordered_metadata' was introduced in nvcc 12.5 (per the original author's
    # recollection). When the opcode contains it, both spellings are emitted
    # behind a CUDART_VERSION check.
    # FIXME: drop this workaround once nvcc 12.4 no longer needs support.
    marker = '::ordered_metadata'
    marker_pos = ptx_name.find(marker)

    template_params = []
    cuda_params = []
    for operand in args:
        template_params.extend(operand.make_template_arglist())
        cuda_params.extend(operand.make_cuda_arglist())

    template_text = ', '.join(template_params)
    cuda_text = ', '.join(cuda_params)

    print()
    print('// D = A*B + C')

    if template_params:
        print(f'template<{template_text}>')

    print('__device__ __forceinline__')
    print(f'void {cuda_name}({cuda_text})')
    print('{')

    if marker_pos >= 0:
        # Same opcode with the '::ordered_metadata' qualifier cut out.
        legacy_name = ptx_name[:marker_pos] + ptx_name[marker_pos + len(marker):]
        print(' asm(')
        print('#if CUDART_VERSION >= 12050')
        print(f' "{ptx_name} "')
        print('#else')
        print(f' "{legacy_name} "')
        print('#endif')
    else:
        print(f' asm("{ptx_name} "')

    last = len(args) - 1

    # Operand placeholders: one string fragment per argument, numbered
    # consecutively across arguments.
    reg_base = 0
    for idx, operand in enumerate(args):
        opening = f'"{operand.make_ptx_argstr(reg_base)}'
        closing = ';" :' if idx >= last else ', "'
        reg_base += operand.num_registers
        print(f' {opening}{closing}')

    # Constraint list: ' :' after the first entry, ',' between the others,
    # nothing after the final one.
    for idx, operand in enumerate(args):
        if idx == 0:
            separator = ' :'
        elif idx < last:
            separator = ','
        else:
            separator = ''

        entry = operand.make_constraint_str()
        print(f' {entry}{separator}')

    print(' );')
    print('}')
    print()
138
+
139
+
140
+ ####################################################################################################
141
+
142
+
143
def emit_dense_mma(cuda_name, ptx_name, cuda_type, dbits, sbits, m, n, k, ptx_type=None, s=1):
    """Emit a wrapper for a dense MMA with operands d (output) and a, b, c (inputs).

    sbits is the bit width used for the a/b operands and dbits the bit width
    used for the c/d operands; m, n, k are the matrix shape and s is an extra
    multiplier applied to every register count.
    """

    def register_count(rows, cols, bits):
        # Total bits divided by 1024 -- presumably 32 threads x 32-bit
        # registers per warp; TODO confirm.
        return (rows * cols * s * bits) // 1024

    a_regs = register_count(m, k, sbits)
    b_regs = register_count(k, n, sbits)
    acc_regs = register_count(m, n, dbits)

    operands = [
        Argument('d', cuda_type, acc_regs, ptx_type=ptx_type),
        Argument('a', cuda_type, a_regs, ptx_type=ptx_type, is_const=True),
        Argument('b', cuda_type, b_regs, ptx_type=ptx_type, is_const=True),
        Argument('c', cuda_type, acc_regs, ptx_type=ptx_type, is_const=True),
    ]
    emit_kernel(cuda_name, ptx_name, *operands)
157
+
158
+
159
def emit_dense_f16_mma(m, n, k, s=1, layout=None):
    """Emit a dense float16 MMA wrapper for shape (m, n, k).

    'layout' is only used for m8n8k4. When given, it is a pair of strings
    such as ('row','col'); the first letter of each is appended to the
    generated function name and the strings are used as the PTX layout
    qualifiers. When omitted, the layouts are 'row' and 'col' and the name
    gets no suffix.
    """
    if layout is None:
        a_layout, b_layout = 'row', 'col'
        name_suffix = ''
    else:
        a_layout, b_layout = layout[0], layout[1]
        name_suffix = f'_{a_layout[0]}{b_layout[0]}'

    cuda_name = f'mma_f16_m{m}_n{n}_k{k}' + name_suffix
    ptx_name = f'mma.sync.aligned.m{m}n{n}k{k}.{a_layout}.{b_layout}.f16.f16.f16.f16'

    emit_dense_mma(cuda_name, ptx_name, '__half2', 16, 16, m, n, k, ptx_type='uint', s=s)
171
+
172
+
173
def emit_dense_int_mma(sbits, m, n, k):
    """Emit a dense integer MMA wrapper with sbits-wide inputs and 32-bit accumulators.

    For sbits > 1 the inputs are signed ('s<sbits>') and the opcode carries
    '.satfinite'. For sbits == 1 the input type is 'b1', '.satfinite' is
    dropped and '.and.popc' is appended instead.
    """
    if sbits > 1:
        typename, satfinite, suffix = f's{sbits}', '.satfinite', ''
    else:
        typename, satfinite, suffix = 'b1', '', '.and.popc'

    shape = f'm{m}n{n}k{k}'
    emit_dense_mma(
        f'mma_{typename}_m{m}_n{n}_k{k}',
        f'mma.sync.aligned.{shape}.row.col{satfinite}.s32.{typename}.{typename}.s32{suffix}',
        'int',
        32,
        sbits,
        m,
        n,
        k,
    )
180
+
181
+
182
def emit_sparse_f16_mma(m, n, k):
    """Emit a sparse float16 MMA wrapper for shape (m, n, k).

    Besides d, a, b, c the generated function takes a scalar 'e' and an
    immediate 'F' (a template parameter). The opcode includes the
    '::ordered_metadata' qualifier, which emit_kernel handles specially.
    """
    # Register counts. 'a' uses half as many registers per element as 'b'
    # (divisor 128 vs 64) -- presumably because A is stored compressed;
    # TODO confirm against the PTX documentation.
    a_regs = (m * k) // 128
    b_regs = (k * n) // 64
    acc_regs = (m * n) // 64

    operands = (
        Argument('d', '__half2', acc_regs, ptx_type='uint'),
        Argument('a', '__half2', a_regs, ptx_type='uint', is_const=True),
        Argument('b', '__half2', b_regs, ptx_type='uint', is_const=True),
        Argument('c', '__half2', acc_regs, ptx_type='uint', is_const=True),
        Argument('e', 'uint', 1, pack=False, is_scalar=True, is_const=True),
        Argument('F', 'uint', 1, pack=False, is_scalar=True, is_const=True, is_immediate=True),
    )

    emit_kernel(
        f'mma_sp_f16_m{m}_n{n}_k{k}',
        f'mma.sp::ordered_metadata.sync.aligned.m{m}n{n}k{k}.row.col.f16.f16.f16.f16',
        *operands,
    )
201
+
202
+
203
+
204
+ ####################################################################################################
205
+
206
+
207
if __name__ == '__main__':
    # Everything printed here forms the generated header
    # (include/ksgpu/device_mma.hpp when stdout is redirected there).
    preamble = (
        '#ifndef _KSGPU_DEVICE_MMA_HPP',
        '#define _KSGPU_DEVICE_MMA_HPP',
        '',
        '// Autogenerated by generate_device_mma_hpp.py',
        '//',
        '// Reference for matrix shapes:',
        '// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-shape',
        '//',
        '// Reference for PTX instruction syntax:',
        '// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-instructions-mma',
        '',
        '#include <cuda_fp16.h>',
        '',
        'namespace ksgpu {',
        '',
    )
    for text in preamble:
        print(text)

    # float16: (m, n, k)
    for shape in ((16, 8, 8), (16, 8, 16)):
        emit_dense_f16_mma(*shape)

    # Integer MMAs: (sbits, m, n, k), in the order int4, int8, int1.
    int_specs = (
        (4, 8, 8, 32),
        (4, 16, 8, 32),
        (4, 16, 8, 64),
        (8, 8, 8, 16),
        (8, 16, 8, 16),
        (8, 16, 8, 32),
        (1, 8, 8, 128),
    )
    for spec in int_specs:
        emit_dense_int_mma(*spec)

    # sparse float16: (m, n, k)
    for shape in ((16, 8, 16), (16, 8, 32)):
        emit_sparse_f16_mma(*shape)

    # The PTX ISA also has f16 m8n8k4 MMAs. Wrappers for them were tried, but
    # timing showed they were extremely slow; the author assumes they are
    # legacy instructions emulated on Ampere. The disabled code is kept here
    # (the original note also mentions ../generate_device_mma_hpp.py) in case
    # this is ever revisited:
    #
    # for a in ['row','col']:
    #     for b in ['row','col']:
    #         emit_dense_f16_mma(8, 8, 4, s=4, layout=(a,b))

    for text in ('', '} // namespace ksgpu', '', '#endif // _KSGPU_DEVICE_MMA_HPP'):
        print(text)