ksgpu 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ksgpu-1.0.0/.gitignore +2 -0
- ksgpu-1.0.0/PKG-INFO +7 -0
- ksgpu-1.0.0/README.md +5 -0
- ksgpu-1.0.0/generate_device_mma_hpp.py +257 -0
- ksgpu-1.0.0/include/ksgpu/Array.hpp +624 -0
- ksgpu-1.0.0/include/ksgpu/Barrier.hpp +44 -0
- ksgpu-1.0.0/include/ksgpu/CpuThreadPool.hpp +96 -0
- ksgpu-1.0.0/include/ksgpu/CudaStreamPool.hpp +119 -0
- ksgpu-1.0.0/include/ksgpu/ThreadSafeRingBuffer.hpp +133 -0
- ksgpu-1.0.0/include/ksgpu/complex_type_traits.hpp +48 -0
- ksgpu-1.0.0/include/ksgpu/constexpr_functions.hpp +56 -0
- ksgpu-1.0.0/include/ksgpu/cuda_utils.hpp +199 -0
- ksgpu-1.0.0/include/ksgpu/device_mma.hpp +226 -0
- ksgpu-1.0.0/include/ksgpu/dlpack.h +332 -0
- ksgpu-1.0.0/include/ksgpu/mem_utils.hpp +149 -0
- ksgpu-1.0.0/include/ksgpu/memcpy_kernels.hpp +45 -0
- ksgpu-1.0.0/include/ksgpu/pybind11.hpp +89 -0
- ksgpu-1.0.0/include/ksgpu/pybind11_utils.hpp +109 -0
- ksgpu-1.0.0/include/ksgpu/rand_utils.hpp +167 -0
- ksgpu-1.0.0/include/ksgpu/string_utils.hpp +145 -0
- ksgpu-1.0.0/include/ksgpu/test_utils.hpp +94 -0
- ksgpu-1.0.0/include/ksgpu/time_utils.hpp +39 -0
- ksgpu-1.0.0/include/ksgpu/xassert.hpp +84 -0
- ksgpu-1.0.0/include/ksgpu.hpp +58 -0
- ksgpu-1.0.0/loose_ends/bit-mapping.cu +292 -0
- ksgpu-1.0.0/meson.build +166 -0
- ksgpu-1.0.0/pyproject.toml +17 -0
- ksgpu-1.0.0/src_bin/reverse-engineer-mma.cu +807 -0
- ksgpu-1.0.0/src_bin/scratch.cu +17 -0
- ksgpu-1.0.0/src_bin/show-devices.cu +41 -0
- ksgpu-1.0.0/src_bin/test-array.cu +474 -0
- ksgpu-1.0.0/src_bin/test-memcpy-kernels.cu +99 -0
- ksgpu-1.0.0/src_bin/test-sparse-mma.cu +221 -0
- ksgpu-1.0.0/src_bin/time-atomic-add.cu +214 -0
- ksgpu-1.0.0/src_bin/time-fma.cu +124 -0
- ksgpu-1.0.0/src_bin/time-l2-cache.cu +73 -0
- ksgpu-1.0.0/src_bin/time-local-transpose.cu +135 -0
- ksgpu-1.0.0/src_bin/time-memcpy-kernels.cu +81 -0
- ksgpu-1.0.0/src_bin/time-shared-memory.cu +87 -0
- ksgpu-1.0.0/src_bin/time-tensor-cores.cu +442 -0
- ksgpu-1.0.0/src_bin/time-warp-shuffle.cu +105 -0
- ksgpu-1.0.0/src_lib/Array.cu +408 -0
- ksgpu-1.0.0/src_lib/Barrier.cu +72 -0
- ksgpu-1.0.0/src_lib/CpuThreadPool.cu +157 -0
- ksgpu-1.0.0/src_lib/CudaStreamPool.cu +196 -0
- ksgpu-1.0.0/src_lib/cuda_utils.cu +171 -0
- ksgpu-1.0.0/src_lib/mem_utils.cu +391 -0
- ksgpu-1.0.0/src_lib/memcpy_kernels.cu +145 -0
- ksgpu-1.0.0/src_lib/rand_utils.cu +73 -0
- ksgpu-1.0.0/src_lib/string_utils.cu +113 -0
- ksgpu-1.0.0/src_lib/test_utils.cu +409 -0
- ksgpu-1.0.0/src_pybind11/ksgpu_pybind11.cu +197 -0
- ksgpu-1.0.0/src_pybind11/pybind11_utils.cu +430 -0
- ksgpu-1.0.0/src_python/ksgpu/__init__.py +20 -0
ksgpu-1.0.0/.gitignore
ADDED
ksgpu-1.0.0/PKG-INFO
ADDED
ksgpu-1.0.0/README.md
ADDED
|
@@ -0,0 +1,257 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
#
|
|
3
|
+
# Reference:
|
|
4
|
+
# https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-instructions-mma
|
|
5
|
+
#
|
|
6
|
+
# Note: I'm no longer running this as part of the build process (in an automated way).
|
|
7
|
+
#
|
|
8
|
+
# Instead, I'm keeping the output file (include/ksgpu/device_mma.hpp) in git, and occasionally
|
|
9
|
+
# updating by hand:
|
|
10
|
+
#
|
|
11
|
+
# ./generate_device_mma_hpp.py > include/ksgpu/device_mma.hpp
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class Argument:
    """One operand of a generated inline-PTX wrapper function.

    An Argument knows how to render itself in the four places an operand
    appears in the generated C++ source:

      - the C++ template parameter list  (make_template_arglist)
      - the C++ function parameter list  (make_cuda_arglist)
      - the PTX operand list, e.g. '{%0, %1}'  (make_ptx_argstr)
      - the asm() constraint list, e.g. '"r" (a[0])'  (make_constraint_str)

    Constructor arguments:

      name           identifier used in the generated C++ code
      cuda_type      C++ type of the operand (or of each of its registers)
      num_registers  number of '%N' asm placeholders the operand occupies
      is_const       True for an input ("r" constraint), False for an
                     output ("=r" constraint)
      pack           True: one array parameter 'name[num_registers]'.
                     False: separate parameters name0, name1, ...
      ptx_type       if not None, each register is reinterpreted as this
                     type inside the constraint list
      is_scalar      True: a single value rendered without surrounding
                     braces in the PTX operand list
      is_immediate   True: the operand is a template parameter passed with
                     an "n" constraint instead of a function parameter
    """

    def __init__(self, name, cuda_type, num_registers, is_const=False, pack=True, ptx_type=None, is_scalar=False, is_immediate=False):
        self.name = name
        self.cuda_type = cuda_type
        self.num_registers = num_registers
        self.is_const = is_const
        self.pack = pack
        self.ptx_type = ptx_type
        self.is_scalar = is_scalar
        self.is_immediate = is_immediate

        # Sanity checks on flag combinations (kept as asserts so the exception type is unchanged).
        if is_immediate:
            assert is_const and is_scalar and (ptx_type is None), \
                'immediate arguments must be const scalars with no ptx_type'

        if is_scalar:
            assert (not pack) and (num_registers == 1), \
                'scalar arguments must have pack=False and num_registers=1'


    def make_template_arglist(self):
        """Return a list of C++ template parameter declarations (empty unless immediate)."""
        return [ f'{self.cuda_type} {self.name}' ] if self.is_immediate else [ ]


    def make_cuda_arglist(self):
        """Return a list of C++ function parameter declarations for this operand."""

        if self.is_immediate:
            # Immediates are template parameters, not function parameters.
            return [ ]

        amp = '' if self.is_const else '&'
        cv = 'const ' if self.is_const else ''

        if self.is_scalar:
            return [ f'{self.cuda_type} {amp}{self.name}' ]

        if self.pack:
            return [ f'{cv}{self.cuda_type} {self.name}[{self.num_registers}]' ]

        # Unpacked registers. An output register must be taken by reference, otherwise
        # the "=r" asm output would be written to a by-value copy and silently dropped.
        return [ f'{self.cuda_type} {cv}{amp}{self.name}{i}' for i in range(self.num_registers) ]


    def make_ptx_argstr(self, base):
        """Return the PTX operand text, using asm placeholders %base, %(base+1), ...

        Non-scalar operands are wrapped in braces, e.g. '{%4, %5}'.
        """
        t = ', '.join(f'%{i}' for i in range(base, base + self.num_registers))
        return t if self.is_scalar else ('{' + t + '}')


    def make_constraint_str(self):
        """Return the comma-separated asm() constraint entries for this operand."""

        if self.is_immediate:
            return f'"n" ({self.name})'

        constraint = '"r"' if self.is_const else '"=r"'

        if self.is_scalar:
            return f'{constraint} ({self.name})'

        cv = 'const ' if self.is_const else ''
        ret = [ ]

        for i in range(self.num_registers):
            v = f'{self.name}[{i}]' if self.pack else f'{self.name}{i}'
            if self.ptx_type is not None:
                # Reinterpret the register as 'ptx_type' so it can be bound with an "r" constraint.
                v = f'*({cv}{self.ptx_type} *) &{v}'
            ret.append(f'{constraint} ({v})')

        return ', '.join(ret)
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
####################################################################################################
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def emit_kernel(cuda_name, ptx_name, *args):
    """Print one __device__ wrapper function around a single PTX instruction.

    cuda_name is the name of the generated C++ function, ptx_name is the PTX
    opcode placed at the start of the asm() string, and each element of args
    is an operand object providing make_template_arglist(), make_cuda_arglist(),
    make_ptx_argstr(base), make_constraint_str() and a num_registers attribute.

    The first operand is emitted as the asm output constraint; all remaining
    operands are emitted as inputs.
    """

    # nvcc 12.5 (I think) introduced the '::ordered_metadata' qualifier. If the opcode
    # contains it, both spellings are emitted and selected with an #if on CUDART_VERSION.
    # FIXME some day, when nvcc 12.4 is ancient history, this hack can be removed.
    om_tag = '::ordered_metadata'
    om_pos = ptx_name.find(om_tag)

    template_params = [ ]
    cuda_params = [ ]

    for operand in args:
        template_params.extend(operand.make_template_arglist())
        cuda_params.extend(operand.make_cuda_arglist())

    template_text = ', '.join(template_params)
    cuda_text = ', '.join(cuda_params)

    print()
    print('// D = A*B + C')

    if template_params:
        print('template<' + template_text + '>')

    print('__device__ __forceinline__')
    print('void ' + cuda_name + '(' + cuda_text + ')')
    print('{')

    if om_pos >= 0:
        # Same opcode with the qualifier cut out, for older toolkits.
        legacy_name = ptx_name[:om_pos] + ptx_name[om_pos + len(om_tag):]
        print(' asm(')
        print('#if CUDART_VERSION >= 12050')
        print(f' "{ptx_name} "')
        print('#else')
        print(f' "{legacy_name} "')
        print('#endif')
    else:
        print(f' asm("{ptx_name} "')

    last = len(args) - 1

    # PTX operand list: one string literal per operand, numbered with a running register offset.
    offset = 0
    for pos, operand in enumerate(args):
        operand_text = operand.make_ptx_argstr(offset)
        tail = ', "' if pos < last else ';" :'
        offset += operand.num_registers
        print(f' "{operand_text}{tail}')

    # Constraint list: ' :' after the first (output) operand, ',' between inputs, nothing after the last.
    for pos, operand in enumerate(args):
        separator = ' :' if pos == 0 else (',' if pos < last else '')
        print(f' {operand.make_constraint_str()}{separator}')

    print(' );')
    print('}')
    print()
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
####################################################################################################
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
def emit_dense_mma(cuda_name, ptx_name, cuda_type, dbits, sbits, m, n, k, ptx_type=None, s=1):
    """Emit a dense 'D = A*B + C' wrapper with operands d, a, b, c.

    dbits is the bit width used for the d/c register counts and sbits the bit
    width used for the a/b register counts. (m, n, k) is the matrix shape and
    's' is an extra multiplier applied to every register count. cuda_type and
    ptx_type are forwarded unchanged to each Argument.
    """

    def register_count(rows, cols, bits):
        # Total bits divided by 1024.
        # NOTE(review): presumably 1024 = 32 threads x 32-bit registers -- confirm.
        return (rows * cols * s * bits) // 1024

    num_a = register_count(m, k, sbits)
    num_b = register_count(k, n, sbits)
    num_c = register_count(m, n, dbits)

    operands = [
        Argument('d', cuda_type, num_c, ptx_type=ptx_type),
        Argument('a', cuda_type, num_a, ptx_type=ptx_type, is_const=True),
        Argument('b', cuda_type, num_b, ptx_type=ptx_type, is_const=True),
        Argument('c', cuda_type, num_c, ptx_type=ptx_type, is_const=True),
    ]

    emit_kernel(cuda_name, ptx_name, *operands)
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
def emit_dense_f16_mma(m, n, k, s=1, layout=None):
    """Emit a dense float16 MMA wrapper named mma_f16_m{m}_n{n}_k{k}.

    The 'layout' parameter is only used for m8n8k4, and is a string pair such
    as ('row','col'). When given, the first letters of both entries are
    appended to the wrapper name (e.g. '_rc') and the entries replace the
    default 'row'/'col' qualifiers in the PTX opcode.
    """

    if layout is None:
        a_layout, b_layout = 'row', 'col'
        name_suffix = ''
    else:
        a_layout, b_layout = layout[0], layout[1]
        name_suffix = f'_{a_layout[0]}{b_layout[0]}'

    cuda_name = f'mma_f16_m{m}_n{n}_k{k}' + name_suffix
    ptx_name = f'mma.sync.aligned.m{m}n{n}k{k}.{a_layout}.{b_layout}.f16.f16.f16.f16'

    # Operands are declared as __half2 and bound as 'uint' registers; both widths are 16 bits.
    emit_dense_mma(cuda_name, ptx_name, '__half2', 16, 16, m, n, k, ptx_type='uint', s=s)
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
def emit_dense_int_mma(sbits, m, n, k):
    """Emit a dense integer MMA wrapper with sbits-wide sources and 32-bit 'int' registers.

    For sbits > 1 the source type is 's{sbits}' and the opcode carries
    '.satfinite'. For sbits == 1 the source type is 'b1' and the opcode
    ends in '.and.popc' instead.
    """

    if sbits > 1:
        typename = f's{sbits}'
        satfinite = '.satfinite'
        suffix = ''
    else:
        typename = 'b1'
        satfinite = ''
        suffix = '.and.popc'

    shape = f'm{m}n{n}k{k}'
    cuda_name = f'mma_{typename}_m{m}_n{n}_k{k}'
    ptx_name = f'mma.sync.aligned.{shape}.row.col{satfinite}.s32.{typename}.{typename}.s32{suffix}'

    emit_dense_mma(cuda_name, ptx_name, 'int', 32, sbits, m, n, k)
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
def emit_sparse_f16_mma(m, n, k):
    """Emit a sparse float16 MMA wrapper named mma_sp_f16_m{m}_n{n}_k{k}.

    Besides the usual d, a, b, c operands (declared as __half2, bound as
    'uint'), the wrapper takes a scalar 'uint e' function parameter and an
    immediate 'uint F' template parameter.
    """

    cuda_name = f'mma_sp_f16_m{m}_n{n}_k{k}'
    ptx_name = f'mma.sp::ordered_metadata.sync.aligned.m{m}n{n}k{k}.row.col.f16.f16.f16.f16'

    # Register counts. Note the 'a' operand uses a divisor of 128 where 'b' and 'c' use 64.
    num_a = (m * k) // 128
    num_b = (k * n) // 64
    num_c = (m * n) // 64

    half2_regs = dict(ptx_type='uint')
    scalar_in = dict(pack=False, is_scalar=True, is_const=True)

    operands = [
        Argument('d', '__half2', num_c, **half2_regs),
        Argument('a', '__half2', num_a, is_const=True, **half2_regs),
        Argument('b', '__half2', num_b, is_const=True, **half2_regs),
        Argument('c', '__half2', num_c, is_const=True, **half2_regs),
        Argument('e', 'uint', 1, **scalar_in),
        Argument('F', 'uint', 1, is_immediate=True, **scalar_in),
    ]

    emit_kernel(cuda_name, ptx_name, *operands)
|
|
201
|
+
|
|
202
|
+
|
|
203
|
+
|
|
204
|
+
####################################################################################################
|
|
205
|
+
|
|
206
|
+
|
|
207
|
+
if __name__ == '__main__':
    # Include guard, provenance comment, required header and namespace opening.
    preamble = (
        '#ifndef _KSGPU_DEVICE_MMA_HPP',
        '#define _KSGPU_DEVICE_MMA_HPP',
        '',
        '// Autogenerated by generate_device_mma_hpp.py',
        '//',
        '// Reference for matrix shapes:',
        '// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-shape',
        '//',
        '// Reference for PTX instruction syntax:',
        '// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-instructions-mma',
        '',
        '#include <cuda_fp16.h>',
        '',
        'namespace ksgpu {',
        '',
    )

    for line in preamble:
        print(line)

    # float16
    for shape in [(16, 8, 8), (16, 8, 16)]:
        emit_dense_f16_mma(*shape)

    # int4, then int8, then int1 (each tuple is: sbits, m, n, k)
    int_variants = [
        (4, 8, 8, 32),
        (4, 16, 8, 32),
        (4, 16, 8, 64),
        (8, 8, 8, 16),
        (8, 16, 8, 16),
        (8, 16, 8, 32),
        (1, 8, 8, 128),
    ]

    for variant in int_variants:
        emit_dense_int_mma(*variant)

    # sparse float16
    for shape in [(16, 8, 16), (16, 8, 32)]:
        emit_sparse_f16_mma(*shape)

    # The PTX ISA also includes f16 m8n8k4 MMAs.
    # Wrappers for these were tried, but timing showed that they were extremely slow.
    # The assumption is that they are legacy instructions which are emulated on Ampere.
    # The code is left commented out in case this is ever revisited.
    # NOTE(review): the original note also points at '../generate_device_mma_hpp.py',
    # which is this script itself -- it probably meant a sibling source file; confirm.
    #
    # for a in ['row','col']:
    #     for b in ['row','col']:
    #         emit_dense_f16_mma(8, 8, 4, s=4, layout=(a,b))

    # Close the namespace and the include guard.
    for line in ('', '} // namespace ksgpu', '', '#endif // _KSGPU_DEVICE_MMA_HPP'):
        print(line)
|